xref: /freebsd/contrib/llvm-project/openmp/runtime/src/kmp_affinity.cpp (revision 0fca6ea1d4eea4c934cfff25ac9ee8ad6fe95583)
1 /*
2  * kmp_affinity.cpp -- affinity management
3  */
4 
5 //===----------------------------------------------------------------------===//
6 //
7 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8 // See https://llvm.org/LICENSE.txt for license information.
9 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "kmp.h"
14 #include "kmp_affinity.h"
15 #include "kmp_i18n.h"
16 #include "kmp_io.h"
17 #include "kmp_str.h"
18 #include "kmp_wrapper_getpid.h"
19 #if KMP_USE_HIER_SCHED
20 #include "kmp_dispatch_hier.h"
21 #endif
22 #if KMP_USE_HWLOC
23 // Copied from hwloc
24 #define HWLOC_GROUP_KIND_INTEL_MODULE 102
25 #define HWLOC_GROUP_KIND_INTEL_TILE 103
26 #define HWLOC_GROUP_KIND_INTEL_DIE 104
27 #define HWLOC_GROUP_KIND_WINDOWS_PROCESSOR_GROUP 220
28 #endif
29 #include <ctype.h>
30 
31 // The machine topology
32 kmp_topology_t *__kmp_topology = nullptr;
33 // KMP_HW_SUBSET environment variable
34 kmp_hw_subset_t *__kmp_hw_subset = nullptr;
35 
36 // Store the real or imagined machine hierarchy here
37 static hierarchy_info machine_hierarchy;
38 
39 void __kmp_cleanup_hierarchy() { machine_hierarchy.fini(); }
40 
41 #if KMP_AFFINITY_SUPPORTED
42 // Helper class to see if place lists further restrict the fullMask
43 class kmp_full_mask_modifier_t {
44   kmp_affin_mask_t *mask;
45 
46 public:
47   kmp_full_mask_modifier_t() {
48     KMP_CPU_ALLOC(mask);
49     KMP_CPU_ZERO(mask);
50   }
51   ~kmp_full_mask_modifier_t() {
52     KMP_CPU_FREE(mask);
53     mask = nullptr;
54   }
55   void include(const kmp_affin_mask_t *other) { KMP_CPU_UNION(mask, other); }
56   // If the new full mask is different from the current full mask,
57   // then switch them. Returns true if full mask was affected, false otherwise.
58   bool restrict_to_mask() {
59     // See if the new mask further restricts or changes the full mask
60     if (KMP_CPU_EQUAL(__kmp_affin_fullMask, mask) || KMP_CPU_ISEMPTY(mask))
61       return false;
62     return __kmp_topology->restrict_to_mask(mask);
63   }
64 };
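// Intended use (a sketch, inferred from the class above): a caller accumulates
// the union of the masks it wants to keep via include(), then calls
// restrict_to_mask(), which only narrows __kmp_affin_fullMask and the topology
// when the accumulated mask is non-empty and differs from the current full
// mask.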
65 
66 static inline const char *
67 __kmp_get_affinity_env_var(const kmp_affinity_t &affinity,
68                            bool for_binding = false) {
69   if (affinity.flags.omp_places) {
70     if (for_binding)
71       return "OMP_PROC_BIND";
72     return "OMP_PLACES";
73   }
74   return affinity.env_var;
75 }
76 #endif // KMP_AFFINITY_SUPPORTED
77 
78 void __kmp_get_hierarchy(kmp_uint32 nproc, kmp_bstate_t *thr_bar) {
79   kmp_uint32 depth;
80   // The test below is true if affinity is available, but set to "none". Need to
81   // init on first use of hierarchical barrier.
82   if (TCR_1(machine_hierarchy.uninitialized))
83     machine_hierarchy.init(nproc);
84 
85   // Adjust the hierarchy in case the number of threads exceeds the original
86   if (nproc > machine_hierarchy.base_num_threads)
87     machine_hierarchy.resize(nproc);
88 
89   depth = machine_hierarchy.depth;
90   KMP_DEBUG_ASSERT(depth > 0);
91 
92   thr_bar->depth = depth;
93   __kmp_type_convert(machine_hierarchy.numPerLevel[0] - 1,
94                      &(thr_bar->base_leaf_kids));
95   thr_bar->skip_per_level = machine_hierarchy.skipPerLevel;
96 }
97 
98 static int nCoresPerPkg, nPackages;
99 static int __kmp_nThreadsPerCore;
100 #ifndef KMP_DFLT_NTH_CORES
101 static int __kmp_ncores;
102 #endif
103 
104 const char *__kmp_hw_get_catalog_string(kmp_hw_t type, bool plural) {
105   switch (type) {
106   case KMP_HW_SOCKET:
107     return ((plural) ? KMP_I18N_STR(Sockets) : KMP_I18N_STR(Socket));
108   case KMP_HW_DIE:
109     return ((plural) ? KMP_I18N_STR(Dice) : KMP_I18N_STR(Die));
110   case KMP_HW_MODULE:
111     return ((plural) ? KMP_I18N_STR(Modules) : KMP_I18N_STR(Module));
112   case KMP_HW_TILE:
113     return ((plural) ? KMP_I18N_STR(Tiles) : KMP_I18N_STR(Tile));
114   case KMP_HW_NUMA:
115     return ((plural) ? KMP_I18N_STR(NumaDomains) : KMP_I18N_STR(NumaDomain));
116   case KMP_HW_L3:
117     return ((plural) ? KMP_I18N_STR(L3Caches) : KMP_I18N_STR(L3Cache));
118   case KMP_HW_L2:
119     return ((plural) ? KMP_I18N_STR(L2Caches) : KMP_I18N_STR(L2Cache));
120   case KMP_HW_L1:
121     return ((plural) ? KMP_I18N_STR(L1Caches) : KMP_I18N_STR(L1Cache));
122   case KMP_HW_LLC:
123     return ((plural) ? KMP_I18N_STR(LLCaches) : KMP_I18N_STR(LLCache));
124   case KMP_HW_CORE:
125     return ((plural) ? KMP_I18N_STR(Cores) : KMP_I18N_STR(Core));
126   case KMP_HW_THREAD:
127     return ((plural) ? KMP_I18N_STR(Threads) : KMP_I18N_STR(Thread));
128   case KMP_HW_PROC_GROUP:
129     return ((plural) ? KMP_I18N_STR(ProcGroups) : KMP_I18N_STR(ProcGroup));
130   case KMP_HW_UNKNOWN:
131   case KMP_HW_LAST:
132     return KMP_I18N_STR(Unknown);
133   }
134   KMP_ASSERT2(false, "Unhandled kmp_hw_t enumeration");
135   KMP_BUILTIN_UNREACHABLE;
136 }
137 
138 const char *__kmp_hw_get_keyword(kmp_hw_t type, bool plural) {
139   switch (type) {
140   case KMP_HW_SOCKET:
141     return ((plural) ? "sockets" : "socket");
142   case KMP_HW_DIE:
143     return ((plural) ? "dice" : "die");
144   case KMP_HW_MODULE:
145     return ((plural) ? "modules" : "module");
146   case KMP_HW_TILE:
147     return ((plural) ? "tiles" : "tile");
148   case KMP_HW_NUMA:
149     return ((plural) ? "numa_domains" : "numa_domain");
150   case KMP_HW_L3:
151     return ((plural) ? "l3_caches" : "l3_cache");
152   case KMP_HW_L2:
153     return ((plural) ? "l2_caches" : "l2_cache");
154   case KMP_HW_L1:
155     return ((plural) ? "l1_caches" : "l1_cache");
156   case KMP_HW_LLC:
157     return ((plural) ? "ll_caches" : "ll_cache");
158   case KMP_HW_CORE:
159     return ((plural) ? "cores" : "core");
160   case KMP_HW_THREAD:
161     return ((plural) ? "threads" : "thread");
162   case KMP_HW_PROC_GROUP:
163     return ((plural) ? "proc_groups" : "proc_group");
164   case KMP_HW_UNKNOWN:
165   case KMP_HW_LAST:
166     return ((plural) ? "unknowns" : "unknown");
167   }
168   KMP_ASSERT2(false, "Unhandled kmp_hw_t enumeration");
169   KMP_BUILTIN_UNREACHABLE;
170 }
171 
172 const char *__kmp_hw_get_core_type_string(kmp_hw_core_type_t type) {
173   switch (type) {
174   case KMP_HW_CORE_TYPE_UNKNOWN:
175   case KMP_HW_MAX_NUM_CORE_TYPES:
176     return "unknown";
177 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
178   case KMP_HW_CORE_TYPE_ATOM:
179     return "Intel Atom(R) processor";
180   case KMP_HW_CORE_TYPE_CORE:
181     return "Intel(R) Core(TM) processor";
182 #endif
183   }
184   KMP_ASSERT2(false, "Unhandled kmp_hw_core_type_t enumeration");
185   KMP_BUILTIN_UNREACHABLE;
186 }
187 
188 #if KMP_AFFINITY_SUPPORTED
189 // If affinity is supported, check the affinity
190 // verbose and warning flags before printing a warning
191 #define KMP_AFF_WARNING(s, ...)                                                \
192   if (s.flags.verbose || (s.flags.warnings && (s.type != affinity_none))) {    \
193     KMP_WARNING(__VA_ARGS__);                                                  \
194   }
195 #else
196 #define KMP_AFF_WARNING(s, ...) KMP_WARNING(__VA_ARGS__)
197 #endif
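// Example of a call site later in this file:
//   KMP_AFF_WARNING(__kmp_affinity, AffHWSubsetAllFiltered);
// When affinity is supported, the warning is printed only if the settings have
// verbose enabled, or warnings enabled with an affinity type other than
// affinity_none; otherwise the macro falls back to an unconditional
// KMP_WARNING.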
198 
199 ////////////////////////////////////////////////////////////////////////////////
200 // kmp_hw_thread_t methods
201 int kmp_hw_thread_t::compare_ids(const void *a, const void *b) {
202   const kmp_hw_thread_t *ahwthread = (const kmp_hw_thread_t *)a;
203   const kmp_hw_thread_t *bhwthread = (const kmp_hw_thread_t *)b;
204   int depth = __kmp_topology->get_depth();
205   for (int level = 0; level < depth; ++level) {
206     if (ahwthread->ids[level] < bhwthread->ids[level])
207       return -1;
208     else if (ahwthread->ids[level] > bhwthread->ids[level])
209       return 1;
210   }
211   if (ahwthread->os_id < bhwthread->os_id)
212     return -1;
213   else if (ahwthread->os_id > bhwthread->os_id)
214     return 1;
215   return 0;
216 }
217 
218 #if KMP_AFFINITY_SUPPORTED
219 int kmp_hw_thread_t::compare_compact(const void *a, const void *b) {
220   int i;
221   const kmp_hw_thread_t *aa = (const kmp_hw_thread_t *)a;
222   const kmp_hw_thread_t *bb = (const kmp_hw_thread_t *)b;
223   int depth = __kmp_topology->get_depth();
224   int compact = __kmp_topology->compact;
225   KMP_DEBUG_ASSERT(compact >= 0);
226   KMP_DEBUG_ASSERT(compact <= depth);
227   for (i = 0; i < compact; i++) {
228     int j = depth - i - 1;
229     if (aa->sub_ids[j] < bb->sub_ids[j])
230       return -1;
231     if (aa->sub_ids[j] > bb->sub_ids[j])
232       return 1;
233   }
234   for (; i < depth; i++) {
235     int j = i - compact;
236     if (aa->sub_ids[j] < bb->sub_ids[j])
237       return -1;
238     if (aa->sub_ids[j] > bb->sub_ids[j])
239       return 1;
240   }
241   return 0;
242 }
243 #endif
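// Worked example (hypothetical socket/core/thread topology, depth == 3): with
// __kmp_topology->compact == 1, the comparison keys are, in order, sub_ids[2]
// (thread), then sub_ids[0] (socket), then sub_ids[1] (core), i.e., the
// innermost `compact` levels are compared first, followed by the remaining
// levels from the outermost level inward.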
244 
245 void kmp_hw_thread_t::print() const {
246   int depth = __kmp_topology->get_depth();
247   printf("%4d ", os_id);
248   for (int i = 0; i < depth; ++i) {
249     printf("%4d ", ids[i]);
250   }
251   if (attrs) {
252     if (attrs.is_core_type_valid())
253       printf(" (%s)", __kmp_hw_get_core_type_string(attrs.get_core_type()));
254     if (attrs.is_core_eff_valid())
255       printf(" (eff=%d)", attrs.get_core_eff());
256   }
257   if (leader)
258     printf(" (leader)");
259   printf("\n");
260 }
261 
262 ////////////////////////////////////////////////////////////////////////////////
263 // kmp_topology_t methods
264 
265 // Add a layer to the topology based on the ids. Assume the topology
266 // is perfectly nested (i.e., no object has more than one parent)
267 void kmp_topology_t::_insert_layer(kmp_hw_t type, const int *ids) {
268   // Figure out where the layer should go by comparing the ids of the current
269   // layers with the new ids
270   int target_layer;
271   int previous_id = kmp_hw_thread_t::UNKNOWN_ID;
272   int previous_new_id = kmp_hw_thread_t::UNKNOWN_ID;
273 
274   // Start from the highest layer and work down to find target layer
275   // If new layer is equal to another layer then put the new layer above
276   for (target_layer = 0; target_layer < depth; ++target_layer) {
277     bool layers_equal = true;
278     bool strictly_above_target_layer = false;
279     for (int i = 0; i < num_hw_threads; ++i) {
280       int id = hw_threads[i].ids[target_layer];
281       int new_id = ids[i];
282       if (id != previous_id && new_id == previous_new_id) {
283         // Found the layer we are strictly above
284         strictly_above_target_layer = true;
285         layers_equal = false;
286         break;
287       } else if (id == previous_id && new_id != previous_new_id) {
288         // Found a layer we are below. Move to next layer and check.
289         layers_equal = false;
290         break;
291       }
292       previous_id = id;
293       previous_new_id = new_id;
294     }
295     if (strictly_above_target_layer || layers_equal)
296       break;
297   }
298 
299   // Found the layer we are above. Now move everything to accommodate the new
300   // layer. And put the new ids and type into the topology.
301   for (int i = depth - 1, j = depth; i >= target_layer; --i, --j)
302     types[j] = types[i];
303   types[target_layer] = type;
304   for (int k = 0; k < num_hw_threads; ++k) {
305     for (int i = depth - 1, j = depth; i >= target_layer; --i, --j)
306       hw_threads[k].ids[j] = hw_threads[k].ids[i];
307     hw_threads[k].ids[target_layer] = ids[k];
308   }
309   equivalent[type] = type;
310   depth++;
311 }
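// Illustration (hypothetical ids, not from a real machine): with existing
// layers {SOCKET, CORE, THREAD} and new ids that group cores within each
// socket (e.g., an L3 id), the search above stops at target_layer == 1, the
// CORE and THREAD entries shift down one slot, the new type lands in types[1],
// and depth becomes 4.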
312 
313 #if KMP_GROUP_AFFINITY
314 // Insert the Windows Processor Group structure into the topology
315 void kmp_topology_t::_insert_windows_proc_groups() {
316   // Do not insert the processor group structure for a single group
317   if (__kmp_num_proc_groups == 1)
318     return;
319   kmp_affin_mask_t *mask;
320   int *ids = (int *)__kmp_allocate(sizeof(int) * num_hw_threads);
321   KMP_CPU_ALLOC(mask);
322   for (int i = 0; i < num_hw_threads; ++i) {
323     KMP_CPU_ZERO(mask);
324     KMP_CPU_SET(hw_threads[i].os_id, mask);
325     ids[i] = __kmp_get_proc_group(mask);
326   }
327   KMP_CPU_FREE(mask);
328   _insert_layer(KMP_HW_PROC_GROUP, ids);
329   __kmp_free(ids);
330 
331   // sort topology after adding proc groups
332   __kmp_topology->sort_ids();
333 }
334 #endif
335 
336 // Remove layers that don't add information to the topology.
337 // A removed layer is recorded as equivalent to the layer that is kept, and its ids are dropped from every hardware thread
338 void kmp_topology_t::_remove_radix1_layers() {
339   int preference[KMP_HW_LAST];
340   int top_index1, top_index2;
341   // Set up preference associative array
342   preference[KMP_HW_SOCKET] = 110;
343   preference[KMP_HW_PROC_GROUP] = 100;
344   preference[KMP_HW_CORE] = 95;
345   preference[KMP_HW_THREAD] = 90;
346   preference[KMP_HW_NUMA] = 85;
347   preference[KMP_HW_DIE] = 80;
348   preference[KMP_HW_TILE] = 75;
349   preference[KMP_HW_MODULE] = 73;
350   preference[KMP_HW_L3] = 70;
351   preference[KMP_HW_L2] = 65;
352   preference[KMP_HW_L1] = 60;
353   preference[KMP_HW_LLC] = 5;
354   top_index1 = 0;
355   top_index2 = 1;
356   while (top_index1 < depth - 1 && top_index2 < depth) {
357     kmp_hw_t type1 = types[top_index1];
358     kmp_hw_t type2 = types[top_index2];
359     KMP_ASSERT_VALID_HW_TYPE(type1);
360     KMP_ASSERT_VALID_HW_TYPE(type2);
361     // Do not allow the three main topology levels (sockets, cores, threads) to
362     // be compacted down
363     if ((type1 == KMP_HW_THREAD || type1 == KMP_HW_CORE ||
364          type1 == KMP_HW_SOCKET) &&
365         (type2 == KMP_HW_THREAD || type2 == KMP_HW_CORE ||
366          type2 == KMP_HW_SOCKET)) {
367       top_index1 = top_index2++;
368       continue;
369     }
370     bool radix1 = true;
371     bool all_same = true;
372     int id1 = hw_threads[0].ids[top_index1];
373     int id2 = hw_threads[0].ids[top_index2];
374     int pref1 = preference[type1];
375     int pref2 = preference[type2];
376     for (int hwidx = 1; hwidx < num_hw_threads; ++hwidx) {
377       if (hw_threads[hwidx].ids[top_index1] == id1 &&
378           hw_threads[hwidx].ids[top_index2] != id2) {
379         radix1 = false;
380         break;
381       }
382       if (hw_threads[hwidx].ids[top_index2] != id2)
383         all_same = false;
384       id1 = hw_threads[hwidx].ids[top_index1];
385       id2 = hw_threads[hwidx].ids[top_index2];
386     }
387     if (radix1) {
388       // Select the layer to remove based on preference
389       kmp_hw_t remove_type, keep_type;
390       int remove_layer, remove_layer_ids;
391       if (pref1 > pref2) {
392         remove_type = type2;
393         remove_layer = remove_layer_ids = top_index2;
394         keep_type = type1;
395       } else {
396         remove_type = type1;
397         remove_layer = remove_layer_ids = top_index1;
398         keep_type = type2;
399       }
400       // If all the indexes for the second (deeper) layer are the same
401       // (e.g., all are zero), then make sure to keep the first layer's ids
402       if (all_same)
403         remove_layer_ids = top_index2;
404       // Remove radix one type by setting the equivalence, removing the id from
405       // the hw threads and removing the layer from types and depth
406       set_equivalent_type(remove_type, keep_type);
407       for (int idx = 0; idx < num_hw_threads; ++idx) {
408         kmp_hw_thread_t &hw_thread = hw_threads[idx];
409         for (int d = remove_layer_ids; d < depth - 1; ++d)
410           hw_thread.ids[d] = hw_thread.ids[d + 1];
411       }
412       for (int idx = remove_layer; idx < depth - 1; ++idx)
413         types[idx] = types[idx + 1];
414       depth--;
415     } else {
416       top_index1 = top_index2++;
417     }
418   }
419   KMP_ASSERT(depth > 0);
420 }
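// Example: on a hypothetical machine with exactly one NUMA domain per socket,
// the socket and numa_domain layers are radix-1 with respect to each other;
// since preference[KMP_HW_SOCKET] (110) > preference[KMP_HW_NUMA] (85), the
// NUMA layer is removed, recorded as equivalent to the socket layer, and the
// depth shrinks by one.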
421 
422 void kmp_topology_t::_set_last_level_cache() {
423   if (get_equivalent_type(KMP_HW_L3) != KMP_HW_UNKNOWN)
424     set_equivalent_type(KMP_HW_LLC, KMP_HW_L3);
425   else if (get_equivalent_type(KMP_HW_L2) != KMP_HW_UNKNOWN)
426     set_equivalent_type(KMP_HW_LLC, KMP_HW_L2);
427 #if KMP_MIC_SUPPORTED
428   else if (__kmp_mic_type == mic3) {
429     if (get_equivalent_type(KMP_HW_L2) != KMP_HW_UNKNOWN)
430       set_equivalent_type(KMP_HW_LLC, KMP_HW_L2);
431     else if (get_equivalent_type(KMP_HW_TILE) != KMP_HW_UNKNOWN)
432       set_equivalent_type(KMP_HW_LLC, KMP_HW_TILE);
433     // L2/Tile wasn't detected so just say L1
434     else
435       set_equivalent_type(KMP_HW_LLC, KMP_HW_L1);
436   }
437 #endif
438   else if (get_equivalent_type(KMP_HW_L1) != KMP_HW_UNKNOWN)
439     set_equivalent_type(KMP_HW_LLC, KMP_HW_L1);
440   // Fallback is to set last level cache to socket or core
441   if (get_equivalent_type(KMP_HW_LLC) == KMP_HW_UNKNOWN) {
442     if (get_equivalent_type(KMP_HW_SOCKET) != KMP_HW_UNKNOWN)
443       set_equivalent_type(KMP_HW_LLC, KMP_HW_SOCKET);
444     else if (get_equivalent_type(KMP_HW_CORE) != KMP_HW_UNKNOWN)
445       set_equivalent_type(KMP_HW_LLC, KMP_HW_CORE);
446   }
447   KMP_ASSERT(get_equivalent_type(KMP_HW_LLC) != KMP_HW_UNKNOWN);
448 }
449 
450 // Gather the count of each topology layer and the ratio
451 void kmp_topology_t::_gather_enumeration_information() {
452   int previous_id[KMP_HW_LAST];
453   int max[KMP_HW_LAST];
454 
455   for (int i = 0; i < depth; ++i) {
456     previous_id[i] = kmp_hw_thread_t::UNKNOWN_ID;
457     max[i] = 0;
458     count[i] = 0;
459     ratio[i] = 0;
460   }
461   int core_level = get_level(KMP_HW_CORE);
462   for (int i = 0; i < num_hw_threads; ++i) {
463     kmp_hw_thread_t &hw_thread = hw_threads[i];
464     for (int layer = 0; layer < depth; ++layer) {
465       int id = hw_thread.ids[layer];
466       if (id != previous_id[layer]) {
467         // Add an additional increment to each count
468         for (int l = layer; l < depth; ++l)
469           count[l]++;
470         // Keep track of topology layer ratio statistics
471         max[layer]++;
472         for (int l = layer + 1; l < depth; ++l) {
473           if (max[l] > ratio[l])
474             ratio[l] = max[l];
475           max[l] = 1;
476         }
477         // Figure out the number of different core types
478         // and efficiencies for hybrid CPUs
479         if (__kmp_is_hybrid_cpu() && core_level >= 0 && layer <= core_level) {
480           if (hw_thread.attrs.is_core_eff_valid() &&
481               hw_thread.attrs.core_eff >= num_core_efficiencies) {
482             // Because efficiencies can range from 0 to max efficiency - 1,
483             // the number of efficiencies is max efficiency + 1
484             num_core_efficiencies = hw_thread.attrs.core_eff + 1;
485           }
486           if (hw_thread.attrs.is_core_type_valid()) {
487             bool found = false;
488             for (int j = 0; j < num_core_types; ++j) {
489               if (hw_thread.attrs.get_core_type() == core_types[j]) {
490                 found = true;
491                 break;
492               }
493             }
494             if (!found) {
495               KMP_ASSERT(num_core_types < KMP_HW_MAX_NUM_CORE_TYPES);
496               core_types[num_core_types++] = hw_thread.attrs.get_core_type();
497             }
498           }
499         }
500         break;
501       }
502     }
503     for (int layer = 0; layer < depth; ++layer) {
504       previous_id[layer] = hw_thread.ids[layer];
505     }
506   }
507   for (int layer = 0; layer < depth; ++layer) {
508     if (max[layer] > ratio[layer])
509       ratio[layer] = max[layer];
510   }
511 }
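// Example: for a hypothetical uniform machine with 2 sockets, 4 cores per
// socket, and 2 threads per core, this produces
//   ratio[] = {2, 4, 2}  (max objects per parent; total objects for level 0)
//   count[] = {2, 8, 16} (total objects at each level)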
512 
513 int kmp_topology_t::_get_ncores_with_attr(const kmp_hw_attr_t &attr,
514                                           int above_level,
515                                           bool find_all) const {
516   int current, current_max;
517   int previous_id[KMP_HW_LAST];
518   for (int i = 0; i < depth; ++i)
519     previous_id[i] = kmp_hw_thread_t::UNKNOWN_ID;
520   int core_level = get_level(KMP_HW_CORE);
521   if (find_all)
522     above_level = -1;
523   KMP_ASSERT(above_level < core_level);
524   current_max = 0;
525   current = 0;
526   for (int i = 0; i < num_hw_threads; ++i) {
527     kmp_hw_thread_t &hw_thread = hw_threads[i];
528     if (!find_all && hw_thread.ids[above_level] != previous_id[above_level]) {
529       if (current > current_max)
530         current_max = current;
531       current = hw_thread.attrs.contains(attr);
532     } else {
533       for (int level = above_level + 1; level <= core_level; ++level) {
534         if (hw_thread.ids[level] != previous_id[level]) {
535           if (hw_thread.attrs.contains(attr))
536             current++;
537           break;
538         }
539       }
540     }
541     for (int level = 0; level < depth; ++level)
542       previous_id[level] = hw_thread.ids[level];
543   }
544   if (current > current_max)
545     current_max = current;
546   return current_max;
547 }
548 
549 // Find out if the topology is uniform
550 void kmp_topology_t::_discover_uniformity() {
551   int num = 1;
552   for (int level = 0; level < depth; ++level)
553     num *= ratio[level];
554   flags.uniform = (num == count[depth - 1]);
555 }
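// Example: with ratio[] = {2, 4, 2} and count[] = {2, 8, 16}, the product
// 2 * 4 * 2 == 16 matches count[depth - 1], so the topology is uniform. If one
// socket held 6 cores and the other 4, ratio[] would become {2, 6, 2} and the
// product 24 would not match the 20 hardware threads, marking it non-uniform.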
556 
557 // Set all the sub_ids for each hardware thread
558 void kmp_topology_t::_set_sub_ids() {
559   int previous_id[KMP_HW_LAST];
560   int sub_id[KMP_HW_LAST];
561 
562   for (int i = 0; i < depth; ++i) {
563     previous_id[i] = -1;
564     sub_id[i] = -1;
565   }
566   for (int i = 0; i < num_hw_threads; ++i) {
567     kmp_hw_thread_t &hw_thread = hw_threads[i];
568     // Setup the sub_id
569     for (int j = 0; j < depth; ++j) {
570       if (hw_thread.ids[j] != previous_id[j]) {
571         sub_id[j]++;
572         for (int k = j + 1; k < depth; ++k) {
573           sub_id[k] = 0;
574         }
575         break;
576       }
577     }
578     // Set previous_id
579     for (int j = 0; j < depth; ++j) {
580       previous_id[j] = hw_thread.ids[j];
581     }
582     // Set the sub_ids field
583     for (int j = 0; j < depth; ++j) {
584       hw_thread.sub_ids[j] = sub_id[j];
585     }
586   }
587 }
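// Example: if the hardware reports core ids {8, 9} inside each of two sockets,
// the core-level sub_ids become {0, 1} within each socket; sub_ids are dense,
// per-parent indices independent of the raw ids supplied by the enumeration.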
588 
589 void kmp_topology_t::_set_globals() {
590   // Set nCoresPerPkg, nPackages, __kmp_nThreadsPerCore, __kmp_ncores
591   int core_level, thread_level, package_level;
592   package_level = get_level(KMP_HW_SOCKET);
593 #if KMP_GROUP_AFFINITY
594   if (package_level == -1)
595     package_level = get_level(KMP_HW_PROC_GROUP);
596 #endif
597   core_level = get_level(KMP_HW_CORE);
598   thread_level = get_level(KMP_HW_THREAD);
599 
600   KMP_ASSERT(core_level != -1);
601   KMP_ASSERT(thread_level != -1);
602 
603   __kmp_nThreadsPerCore = calculate_ratio(thread_level, core_level);
604   if (package_level != -1) {
605     nCoresPerPkg = calculate_ratio(core_level, package_level);
606     nPackages = get_count(package_level);
607   } else {
608     // assume one socket
609     nCoresPerPkg = get_count(core_level);
610     nPackages = 1;
611   }
612 #ifndef KMP_DFLT_NTH_CORES
613   __kmp_ncores = get_count(core_level);
614 #endif
615 }
616 
617 kmp_topology_t *kmp_topology_t::allocate(int nproc, int ndepth,
618                                          const kmp_hw_t *types) {
619   kmp_topology_t *retval;
620   // Allocate all data in one large allocation
621   size_t size = sizeof(kmp_topology_t) + sizeof(kmp_hw_thread_t) * nproc +
622                 sizeof(int) * (size_t)KMP_HW_LAST * 3;
623   char *bytes = (char *)__kmp_allocate(size);
624   retval = (kmp_topology_t *)bytes;
625   if (nproc > 0) {
626     retval->hw_threads = (kmp_hw_thread_t *)(bytes + sizeof(kmp_topology_t));
627   } else {
628     retval->hw_threads = nullptr;
629   }
630   retval->num_hw_threads = nproc;
631   retval->depth = ndepth;
632   int *arr =
633       (int *)(bytes + sizeof(kmp_topology_t) + sizeof(kmp_hw_thread_t) * nproc);
634   retval->types = (kmp_hw_t *)arr;
635   retval->ratio = arr + (size_t)KMP_HW_LAST;
636   retval->count = arr + 2 * (size_t)KMP_HW_LAST;
637   retval->num_core_efficiencies = 0;
638   retval->num_core_types = 0;
639   retval->compact = 0;
640   for (int i = 0; i < KMP_HW_MAX_NUM_CORE_TYPES; ++i)
641     retval->core_types[i] = KMP_HW_CORE_TYPE_UNKNOWN;
642   KMP_FOREACH_HW_TYPE(type) { retval->equivalent[type] = KMP_HW_UNKNOWN; }
643   for (int i = 0; i < ndepth; ++i) {
644     retval->types[i] = types[i];
645     retval->equivalent[types[i]] = types[i];
646   }
647   return retval;
648 }
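// Layout of the single allocation made above:
//   [kmp_topology_t][nproc x kmp_hw_thread_t][3 x KMP_HW_LAST ints]
// with the trailing integer block carved into the types, ratio, and count
// arrays.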
649 
650 void kmp_topology_t::deallocate(kmp_topology_t *topology) {
651   if (topology)
652     __kmp_free(topology);
653 }
654 
655 bool kmp_topology_t::check_ids() const {
656   // Assume ids have been sorted
657   if (num_hw_threads == 0)
658     return true;
659   for (int i = 1; i < num_hw_threads; ++i) {
660     kmp_hw_thread_t &current_thread = hw_threads[i];
661     kmp_hw_thread_t &previous_thread = hw_threads[i - 1];
662     bool unique = false;
663     for (int j = 0; j < depth; ++j) {
664       if (previous_thread.ids[j] != current_thread.ids[j]) {
665         unique = true;
666         break;
667       }
668     }
669     if (unique)
670       continue;
671     return false;
672   }
673   return true;
674 }
675 
676 void kmp_topology_t::dump() const {
677   printf("***********************\n");
678   printf("*** __kmp_topology: ***\n");
679   printf("***********************\n");
680   printf("* depth: %d\n", depth);
681 
682   printf("* types: ");
683   for (int i = 0; i < depth; ++i)
684     printf("%15s ", __kmp_hw_get_keyword(types[i]));
685   printf("\n");
686 
687   printf("* ratio: ");
688   for (int i = 0; i < depth; ++i) {
689     printf("%15d ", ratio[i]);
690   }
691   printf("\n");
692 
693   printf("* count: ");
694   for (int i = 0; i < depth; ++i) {
695     printf("%15d ", count[i]);
696   }
697   printf("\n");
698 
699   printf("* num_core_eff: %d\n", num_core_efficiencies);
700   printf("* num_core_types: %d\n", num_core_types);
701   printf("* core_types: ");
702   for (int i = 0; i < num_core_types; ++i)
703     printf("%3d ", core_types[i]);
704   printf("\n");
705 
706   printf("* equivalent map:\n");
707   KMP_FOREACH_HW_TYPE(i) {
708     const char *key = __kmp_hw_get_keyword(i);
709     const char *value = __kmp_hw_get_keyword(equivalent[i]);
710     printf("%-15s -> %-15s\n", key, value);
711   }
712 
713   printf("* uniform: %s\n", (is_uniform() ? "Yes" : "No"));
714 
715   printf("* num_hw_threads: %d\n", num_hw_threads);
716   printf("* hw_threads:\n");
717   for (int i = 0; i < num_hw_threads; ++i) {
718     hw_threads[i].print();
719   }
720   printf("***********************\n");
721 }
722 
723 void kmp_topology_t::print(const char *env_var) const {
724   kmp_str_buf_t buf;
725   int print_types_depth;
726   __kmp_str_buf_init(&buf);
727   kmp_hw_t print_types[KMP_HW_LAST + 2];
728 
729   // Num Available Threads
730   if (num_hw_threads) {
731     KMP_INFORM(AvailableOSProc, env_var, num_hw_threads);
732   } else {
733     KMP_INFORM(AvailableOSProc, env_var, __kmp_xproc);
734   }
735 
736   // Uniform or not
737   if (is_uniform()) {
738     KMP_INFORM(Uniform, env_var);
739   } else {
740     KMP_INFORM(NonUniform, env_var);
741   }
742 
743   // Equivalent types
744   KMP_FOREACH_HW_TYPE(type) {
745     kmp_hw_t eq_type = equivalent[type];
746     if (eq_type != KMP_HW_UNKNOWN && eq_type != type) {
747       KMP_INFORM(AffEqualTopologyTypes, env_var,
748                  __kmp_hw_get_catalog_string(type),
749                  __kmp_hw_get_catalog_string(eq_type));
750     }
751   }
752 
753   // Quick topology
754   KMP_ASSERT(depth > 0 && depth <= (int)KMP_HW_LAST);
755   // Create a print types array that always guarantees printing
756   // the core and thread level
757   print_types_depth = 0;
758   for (int level = 0; level < depth; ++level)
759     print_types[print_types_depth++] = types[level];
760   if (equivalent[KMP_HW_CORE] != KMP_HW_CORE) {
761     // Force in the core level for quick topology
762     if (print_types[print_types_depth - 1] == KMP_HW_THREAD) {
763       // Force core before thread e.g., 1 socket X 2 threads/socket
764       // becomes 1 socket X 1 core/socket X 2 threads/socket
765       print_types[print_types_depth - 1] = KMP_HW_CORE;
766       print_types[print_types_depth++] = KMP_HW_THREAD;
767     } else {
768       print_types[print_types_depth++] = KMP_HW_CORE;
769     }
770   }
771   // Always put threads at very end of quick topology
772   if (equivalent[KMP_HW_THREAD] != KMP_HW_THREAD)
773     print_types[print_types_depth++] = KMP_HW_THREAD;
774 
775   __kmp_str_buf_clear(&buf);
776   kmp_hw_t numerator_type;
777   kmp_hw_t denominator_type = KMP_HW_UNKNOWN;
778   int core_level = get_level(KMP_HW_CORE);
779   int ncores = get_count(core_level);
780 
781   for (int plevel = 0, level = 0; plevel < print_types_depth; ++plevel) {
782     int c;
783     bool plural;
784     numerator_type = print_types[plevel];
785     KMP_ASSERT_VALID_HW_TYPE(numerator_type);
786     if (equivalent[numerator_type] != numerator_type)
787       c = 1;
788     else
789       c = get_ratio(level++);
790     plural = (c > 1);
791     if (plevel == 0) {
792       __kmp_str_buf_print(&buf, "%d %s", c,
793                           __kmp_hw_get_catalog_string(numerator_type, plural));
794     } else {
795       __kmp_str_buf_print(&buf, " x %d %s/%s", c,
796                           __kmp_hw_get_catalog_string(numerator_type, plural),
797                           __kmp_hw_get_catalog_string(denominator_type));
798     }
799     denominator_type = numerator_type;
800   }
801   KMP_INFORM(TopologyGeneric, env_var, buf.str, ncores);
802 
803   // Hybrid topology information
804   if (__kmp_is_hybrid_cpu()) {
805     for (int i = 0; i < num_core_types; ++i) {
806       kmp_hw_core_type_t core_type = core_types[i];
807       kmp_hw_attr_t attr;
808       attr.clear();
809       attr.set_core_type(core_type);
810       int ncores = get_ncores_with_attr(attr);
811       if (ncores > 0) {
812         KMP_INFORM(TopologyHybrid, env_var, ncores,
813                    __kmp_hw_get_core_type_string(core_type));
814         KMP_ASSERT(num_core_efficiencies <= KMP_HW_MAX_NUM_CORE_EFFS)
815         for (int eff = 0; eff < num_core_efficiencies; ++eff) {
816           attr.set_core_eff(eff);
817           int ncores_with_eff = get_ncores_with_attr(attr);
818           if (ncores_with_eff > 0) {
819             KMP_INFORM(TopologyHybridCoreEff, env_var, ncores_with_eff, eff);
820           }
821         }
822       }
823     }
824   }
825 
826   if (num_hw_threads <= 0) {
827     __kmp_str_buf_free(&buf);
828     return;
829   }
830 
831   // Full OS proc to hardware thread map
832   KMP_INFORM(OSProcToPhysicalThreadMap, env_var);
833   for (int i = 0; i < num_hw_threads; i++) {
834     __kmp_str_buf_clear(&buf);
835     for (int level = 0; level < depth; ++level) {
836       kmp_hw_t type = types[level];
837       __kmp_str_buf_print(&buf, "%s ", __kmp_hw_get_catalog_string(type));
838       __kmp_str_buf_print(&buf, "%d ", hw_threads[i].ids[level]);
839     }
840     if (__kmp_is_hybrid_cpu())
841       __kmp_str_buf_print(
842           &buf, "(%s)",
843           __kmp_hw_get_core_type_string(hw_threads[i].attrs.get_core_type()));
844     KMP_INFORM(OSProcMapToPack, env_var, hw_threads[i].os_id, buf.str);
845   }
846 
847   __kmp_str_buf_free(&buf);
848 }
849 
850 #if KMP_AFFINITY_SUPPORTED
851 void kmp_topology_t::set_granularity(kmp_affinity_t &affinity) const {
852   const char *env_var = __kmp_get_affinity_env_var(affinity);
853   // If the user requested hybrid CPU attributes for granularity (either via
854   // OMP_PLACES or KMP_AFFINITY) but none exist, then reset the granularity and
855   // have the code below select a granularity and warn the user.
856   if (!__kmp_is_hybrid_cpu()) {
857     if (affinity.core_attr_gran.valid) {
858       // OMP_PLACES with cores:<attribute> but non-hybrid arch, use cores
859       // instead
860       KMP_AFF_WARNING(
861           affinity, AffIgnoringNonHybrid, env_var,
862           __kmp_hw_get_catalog_string(KMP_HW_CORE, /*plural=*/true));
863       affinity.gran = KMP_HW_CORE;
864       affinity.gran_levels = -1;
865       affinity.core_attr_gran = KMP_AFFINITY_ATTRS_UNKNOWN;
866       affinity.flags.core_types_gran = affinity.flags.core_effs_gran = 0;
867     } else if (affinity.flags.core_types_gran ||
868                affinity.flags.core_effs_gran) {
869       // OMP_PLACES=core_types|core_effs but non-hybrid, use cores instead
870       if (affinity.flags.omp_places) {
871         KMP_AFF_WARNING(
872             affinity, AffIgnoringNonHybrid, env_var,
873             __kmp_hw_get_catalog_string(KMP_HW_CORE, /*plural=*/true));
874       } else {
875         // KMP_AFFINITY=granularity=core_type|core_eff,...
876         KMP_AFF_WARNING(affinity, AffGranularityBad, env_var,
877                         "Intel(R) Hybrid Technology core attribute",
878                         __kmp_hw_get_catalog_string(KMP_HW_CORE));
879       }
880       affinity.gran = KMP_HW_CORE;
881       affinity.gran_levels = -1;
882       affinity.core_attr_gran = KMP_AFFINITY_ATTRS_UNKNOWN;
883       affinity.flags.core_types_gran = affinity.flags.core_effs_gran = 0;
884     }
885   }
886   // Set the number of affinity granularity levels
887   if (affinity.gran_levels < 0) {
888     kmp_hw_t gran_type = get_equivalent_type(affinity.gran);
889     // Check if user's granularity request is valid
890     if (gran_type == KMP_HW_UNKNOWN) {
891       // First try core, then thread, then package
892       kmp_hw_t gran_types[3] = {KMP_HW_CORE, KMP_HW_THREAD, KMP_HW_SOCKET};
893       for (auto g : gran_types) {
894         if (get_equivalent_type(g) != KMP_HW_UNKNOWN) {
895           gran_type = g;
896           break;
897         }
898       }
899       KMP_ASSERT(gran_type != KMP_HW_UNKNOWN);
900       // Warn user what granularity setting will be used instead
901       KMP_AFF_WARNING(affinity, AffGranularityBad, env_var,
902                       __kmp_hw_get_catalog_string(affinity.gran),
903                       __kmp_hw_get_catalog_string(gran_type));
904       affinity.gran = gran_type;
905     }
906 #if KMP_GROUP_AFFINITY
907     // If more than one processor group exists, and the level of
908     // granularity specified by the user is too coarse, then the
909     // granularity must be adjusted "down" to processor group affinity
910     // because threads can only exist within one processor group.
911     // For example, if a user sets granularity=socket and there are two
912     // processor groups that cover a socket, then the runtime must
913     // restrict the granularity down to the processor group level.
914     if (__kmp_num_proc_groups > 1) {
915       int gran_depth = get_level(gran_type);
916       int proc_group_depth = get_level(KMP_HW_PROC_GROUP);
917       if (gran_depth >= 0 && proc_group_depth >= 0 &&
918           gran_depth < proc_group_depth) {
919         KMP_AFF_WARNING(affinity, AffGranTooCoarseProcGroup, env_var,
920                         __kmp_hw_get_catalog_string(affinity.gran));
921         affinity.gran = gran_type = KMP_HW_PROC_GROUP;
922       }
923     }
924 #endif
925     affinity.gran_levels = 0;
926     for (int i = depth - 1; i >= 0 && get_type(i) != gran_type; --i)
927       affinity.gran_levels++;
928   }
929 }
930 #endif
931 
932 void kmp_topology_t::canonicalize() {
933 #if KMP_GROUP_AFFINITY
934   _insert_windows_proc_groups();
935 #endif
936   _remove_radix1_layers();
937   _gather_enumeration_information();
938   _discover_uniformity();
939   _set_sub_ids();
940   _set_globals();
941   _set_last_level_cache();
942 
943 #if KMP_MIC_SUPPORTED
944   // Manually Add L2 = Tile equivalence
945   if (__kmp_mic_type == mic3) {
946     if (get_level(KMP_HW_L2) != -1)
947       set_equivalent_type(KMP_HW_TILE, KMP_HW_L2);
948     else if (get_level(KMP_HW_TILE) != -1)
949       set_equivalent_type(KMP_HW_L2, KMP_HW_TILE);
950   }
951 #endif
952 
953   // Perform post canonicalization checking
954   KMP_ASSERT(depth > 0);
955   for (int level = 0; level < depth; ++level) {
956     // All counts, ratios, and types must be valid
957     KMP_ASSERT(count[level] > 0 && ratio[level] > 0);
958     KMP_ASSERT_VALID_HW_TYPE(types[level]);
959     // Detected types must point to themselves
960     KMP_ASSERT(equivalent[types[level]] == types[level]);
961   }
962 }
963 
964 // Canonicalize an explicit packages X cores/pkg X threads/core topology
965 void kmp_topology_t::canonicalize(int npackages, int ncores_per_pkg,
966                                   int nthreads_per_core, int ncores) {
967   int ndepth = 3;
968   depth = ndepth;
969   KMP_FOREACH_HW_TYPE(i) { equivalent[i] = KMP_HW_UNKNOWN; }
970   for (int level = 0; level < depth; ++level) {
971     count[level] = 0;
972     ratio[level] = 0;
973   }
974   count[0] = npackages;
975   count[1] = ncores;
976   count[2] = __kmp_xproc;
977   ratio[0] = npackages;
978   ratio[1] = ncores_per_pkg;
979   ratio[2] = nthreads_per_core;
980   equivalent[KMP_HW_SOCKET] = KMP_HW_SOCKET;
981   equivalent[KMP_HW_CORE] = KMP_HW_CORE;
982   equivalent[KMP_HW_THREAD] = KMP_HW_THREAD;
983   types[0] = KMP_HW_SOCKET;
984   types[1] = KMP_HW_CORE;
985   types[2] = KMP_HW_THREAD;
986   //__kmp_avail_proc = __kmp_xproc;
987   _discover_uniformity();
988 }
989 
990 #if KMP_AFFINITY_SUPPORTED
991 static kmp_str_buf_t *
992 __kmp_hw_get_catalog_core_string(const kmp_hw_attr_t &attr, kmp_str_buf_t *buf,
993                                  bool plural) {
994   __kmp_str_buf_init(buf);
995   if (attr.is_core_type_valid())
996     __kmp_str_buf_print(buf, "%s %s",
997                         __kmp_hw_get_core_type_string(attr.get_core_type()),
998                         __kmp_hw_get_catalog_string(KMP_HW_CORE, plural));
999   else
1000     __kmp_str_buf_print(buf, "%s eff=%d",
1001                         __kmp_hw_get_catalog_string(KMP_HW_CORE, plural),
1002                         attr.get_core_eff());
1003   return buf;
1004 }
1005 
1006 bool kmp_topology_t::restrict_to_mask(const kmp_affin_mask_t *mask) {
1007   // Apply the filter
1008   bool affected;
1009   int new_index = 0;
1010   for (int i = 0; i < num_hw_threads; ++i) {
1011     int os_id = hw_threads[i].os_id;
1012     if (KMP_CPU_ISSET(os_id, mask)) {
1013       if (i != new_index)
1014         hw_threads[new_index] = hw_threads[i];
1015       new_index++;
1016     } else {
1017       KMP_CPU_CLR(os_id, __kmp_affin_fullMask);
1018       __kmp_avail_proc--;
1019     }
1020   }
1021 
1022   KMP_DEBUG_ASSERT(new_index <= num_hw_threads);
1023   affected = (num_hw_threads != new_index);
1024   num_hw_threads = new_index;
1025 
1026   // Post hardware subset canonicalization
1027   if (affected) {
1028     _gather_enumeration_information();
1029     _discover_uniformity();
1030     _set_globals();
1031     _set_last_level_cache();
1032 #if KMP_OS_WINDOWS
1033     // Copy filtered full mask if topology has single processor group
1034     if (__kmp_num_proc_groups <= 1)
1035 #endif
1036       __kmp_affin_origMask->copy(__kmp_affin_fullMask);
1037   }
1038   return affected;
1039 }
1040 
1041 // Apply the KMP_HW_SUBSET environment variable to the topology
1042 // Returns true if KMP_HW_SUBSET filtered any processors
1043 // otherwise, returns false
1044 bool kmp_topology_t::filter_hw_subset() {
1045   // If KMP_HW_SUBSET wasn't requested, then do nothing.
1046   if (!__kmp_hw_subset)
1047     return false;
1048 
1049   // First, sort the KMP_HW_SUBSET items by the machine topology
1050   __kmp_hw_subset->sort();
1051 
1052   __kmp_hw_subset->canonicalize(__kmp_topology);
1053 
1054   // Check to see if KMP_HW_SUBSET is a valid subset of the detected topology
1055   bool using_core_types = false;
1056   bool using_core_effs = false;
1057   bool is_absolute = __kmp_hw_subset->is_absolute();
1058   int hw_subset_depth = __kmp_hw_subset->get_depth();
1059   kmp_hw_t specified[KMP_HW_LAST];
1060   int *topology_levels = (int *)KMP_ALLOCA(sizeof(int) * hw_subset_depth);
1061   KMP_ASSERT(hw_subset_depth > 0);
1062   KMP_FOREACH_HW_TYPE(i) { specified[i] = KMP_HW_UNKNOWN; }
1063   int core_level = get_level(KMP_HW_CORE);
1064   for (int i = 0; i < hw_subset_depth; ++i) {
1065     int max_count;
1066     const kmp_hw_subset_t::item_t &item = __kmp_hw_subset->at(i);
1067     int num = item.num[0];
1068     int offset = item.offset[0];
1069     kmp_hw_t type = item.type;
1070     kmp_hw_t equivalent_type = equivalent[type];
1071     int level = get_level(type);
1072     topology_levels[i] = level;
1073 
1074     // Check to see if current layer is in detected machine topology
1075     if (equivalent_type != KMP_HW_UNKNOWN) {
1076       __kmp_hw_subset->at(i).type = equivalent_type;
1077     } else {
1078       KMP_AFF_WARNING(__kmp_affinity, AffHWSubsetNotExistGeneric,
1079                       __kmp_hw_get_catalog_string(type));
1080       return false;
1081     }
1082 
1083     // Check to see if current layer has already been
1084     // specified either directly or through an equivalent type
1085     if (specified[equivalent_type] != KMP_HW_UNKNOWN) {
1086       KMP_AFF_WARNING(__kmp_affinity, AffHWSubsetEqvLayers,
1087                       __kmp_hw_get_catalog_string(type),
1088                       __kmp_hw_get_catalog_string(specified[equivalent_type]));
1089       return false;
1090     }
1091     specified[equivalent_type] = type;
1092 
1093     // Check to see if each layer's num & offset parameters are valid
1094     max_count = get_ratio(level);
1095     if (!is_absolute) {
1096       if (max_count < 0 ||
1097           (num != kmp_hw_subset_t::USE_ALL && num + offset > max_count)) {
1098         bool plural = (num > 1);
1099         KMP_AFF_WARNING(__kmp_affinity, AffHWSubsetManyGeneric,
1100                         __kmp_hw_get_catalog_string(type, plural));
1101         return false;
1102       }
1103     }
1104 
1105     // Check to see if core attributes are consistent
1106     if (core_level == level) {
1107       // Determine which core attributes are specified
1108       for (int j = 0; j < item.num_attrs; ++j) {
1109         if (item.attr[j].is_core_type_valid())
1110           using_core_types = true;
1111         if (item.attr[j].is_core_eff_valid())
1112           using_core_effs = true;
1113       }
1114 
1115       // Check if using a single core attribute on non-hybrid arch.
1116       // Do not ignore all of KMP_HW_SUBSET, just ignore the attribute.
1117       //
1118       // Check if using multiple core attributes on non-hybrid arch.
1119       // Ignore all of KMP_HW_SUBSET if this is the case.
1120       if ((using_core_effs || using_core_types) && !__kmp_is_hybrid_cpu()) {
1121         if (item.num_attrs == 1) {
1122           if (using_core_effs) {
1123             KMP_AFF_WARNING(__kmp_affinity, AffHWSubsetIgnoringAttr,
1124                             "efficiency");
1125           } else {
1126             KMP_AFF_WARNING(__kmp_affinity, AffHWSubsetIgnoringAttr,
1127                             "core_type");
1128           }
1129           using_core_effs = false;
1130           using_core_types = false;
1131         } else {
1132           KMP_AFF_WARNING(__kmp_affinity, AffHWSubsetAttrsNonHybrid);
1133           return false;
1134         }
1135       }
1136 
1137       // Check if using both core types and core efficiencies together
1138       if (using_core_types && using_core_effs) {
1139         KMP_AFF_WARNING(__kmp_affinity, AffHWSubsetIncompat, "core_type",
1140                         "efficiency");
1141         return false;
1142       }
1143 
1144       // Check that core efficiency values are valid
1145       if (using_core_effs) {
1146         for (int j = 0; j < item.num_attrs; ++j) {
1147           if (item.attr[j].is_core_eff_valid()) {
1148             int core_eff = item.attr[j].get_core_eff();
1149             if (core_eff < 0 || core_eff >= num_core_efficiencies) {
1150               kmp_str_buf_t buf;
1151               __kmp_str_buf_init(&buf);
1152               __kmp_str_buf_print(&buf, "%d", item.attr[j].get_core_eff());
1153               __kmp_msg(kmp_ms_warning,
1154                         KMP_MSG(AffHWSubsetAttrInvalid, "efficiency", buf.str),
1155                         KMP_HNT(ValidValuesRange, 0, num_core_efficiencies - 1),
1156                         __kmp_msg_null);
1157               __kmp_str_buf_free(&buf);
1158               return false;
1159             }
1160           }
1161         }
1162       }
1163 
1164       // Check that the number of requested cores with attributes is valid
1165       if ((using_core_types || using_core_effs) && !is_absolute) {
1166         for (int j = 0; j < item.num_attrs; ++j) {
1167           int num = item.num[j];
1168           int offset = item.offset[j];
1169           int level_above = core_level - 1;
1170           if (level_above >= 0) {
1171             max_count = get_ncores_with_attr_per(item.attr[j], level_above);
1172             if (max_count <= 0 ||
1173                 (num != kmp_hw_subset_t::USE_ALL && num + offset > max_count)) {
1174               kmp_str_buf_t buf;
1175               __kmp_hw_get_catalog_core_string(item.attr[j], &buf, num > 0);
1176               KMP_AFF_WARNING(__kmp_affinity, AffHWSubsetManyGeneric, buf.str);
1177               __kmp_str_buf_free(&buf);
1178               return false;
1179             }
1180           }
1181         }
1182       }
1183 
1184       if ((using_core_types || using_core_effs) && item.num_attrs > 1) {
1185         for (int j = 0; j < item.num_attrs; ++j) {
1186           // Ambiguous use of specific core attribute + generic core
1187           // e.g., 4c & 3c:intel_core or 4c & 3c:eff1
1188           if (!item.attr[j]) {
1189             kmp_hw_attr_t other_attr;
1190             for (int k = 0; k < item.num_attrs; ++k) {
1191               if (item.attr[k] != item.attr[j]) {
1192                 other_attr = item.attr[k];
1193                 break;
1194               }
1195             }
1196             kmp_str_buf_t buf;
1197             __kmp_hw_get_catalog_core_string(other_attr, &buf, item.num[j] > 0);
1198             KMP_AFF_WARNING(__kmp_affinity, AffHWSubsetIncompat,
1199                             __kmp_hw_get_catalog_string(KMP_HW_CORE), buf.str);
1200             __kmp_str_buf_free(&buf);
1201             return false;
1202           }
1203           // Allow specifying a specific core type or core eff exactly once
1204           for (int k = 0; k < j; ++k) {
1205             if (!item.attr[j] || !item.attr[k])
1206               continue;
1207             if (item.attr[k] == item.attr[j]) {
1208               kmp_str_buf_t buf;
1209               __kmp_hw_get_catalog_core_string(item.attr[j], &buf,
1210                                                item.num[j] > 0);
1211               KMP_AFF_WARNING(__kmp_affinity, AffHWSubsetAttrRepeat, buf.str);
1212               __kmp_str_buf_free(&buf);
1213               return false;
1214             }
1215           }
1216         }
1217       }
1218     }
1219   }
1220 
1221   // For keeping track of sub_ids for an absolute KMP_HW_SUBSET
1222   // or core attributes (core type or efficiency)
1223   int prev_sub_ids[KMP_HW_LAST];
1224   int abs_sub_ids[KMP_HW_LAST];
1225   int core_eff_sub_ids[KMP_HW_MAX_NUM_CORE_EFFS];
1226   int core_type_sub_ids[KMP_HW_MAX_NUM_CORE_TYPES];
1227   for (size_t i = 0; i < KMP_HW_LAST; ++i) {
1228     abs_sub_ids[i] = -1;
1229     prev_sub_ids[i] = -1;
1230   }
1231   for (size_t i = 0; i < KMP_HW_MAX_NUM_CORE_EFFS; ++i)
1232     core_eff_sub_ids[i] = -1;
1233   for (size_t i = 0; i < KMP_HW_MAX_NUM_CORE_TYPES; ++i)
1234     core_type_sub_ids[i] = -1;
1235 
1236   // Determine which hardware threads should be filtered.
1237 
1238   // Helpful to determine if a topology layer is targeted by an absolute subset
1239   auto is_targeted = [&](int level) {
1240     if (is_absolute) {
1241       for (int i = 0; i < hw_subset_depth; ++i)
1242         if (topology_levels[i] == level)
1243           return true;
1244       return false;
1245     }
1246     // If not absolute KMP_HW_SUBSET, then every layer is seen as targeted
1247     return true;
1248   };
1249 
1250   // Helpful to index into core type sub Ids array
1251   auto get_core_type_index = [](const kmp_hw_thread_t &t) {
1252     switch (t.attrs.get_core_type()) {
1253     case KMP_HW_CORE_TYPE_UNKNOWN:
1254     case KMP_HW_MAX_NUM_CORE_TYPES:
1255       return 0;
1256 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
1257     case KMP_HW_CORE_TYPE_ATOM:
1258       return 1;
1259     case KMP_HW_CORE_TYPE_CORE:
1260       return 2;
1261 #endif
1262     }
1263     KMP_ASSERT2(false, "Unhandled kmp_hw_thread_t enumeration");
1264     KMP_BUILTIN_UNREACHABLE;
1265   };
1266 
1267   // Helpful to index into core efficiencies sub Ids array
1268   auto get_core_eff_index = [](const kmp_hw_thread_t &t) {
1269     return t.attrs.get_core_eff();
1270   };
1271 
1272   int num_filtered = 0;
1273   kmp_affin_mask_t *filtered_mask;
1274   KMP_CPU_ALLOC(filtered_mask);
1275   KMP_CPU_COPY(filtered_mask, __kmp_affin_fullMask);
1276   for (int i = 0; i < num_hw_threads; ++i) {
1277     kmp_hw_thread_t &hw_thread = hw_threads[i];
1278 
1279     // Figure out the absolute sub ids and core eff/type sub ids
1280     if (is_absolute || using_core_effs || using_core_types) {
1281       for (int level = 0; level < get_depth(); ++level) {
1282         if (hw_thread.sub_ids[level] != prev_sub_ids[level]) {
1283           bool found_targeted = false;
1284           for (int j = level; j < get_depth(); ++j) {
1285             bool targeted = is_targeted(j);
1286             if (!found_targeted && targeted) {
1287               found_targeted = true;
1288               abs_sub_ids[j]++;
1289               if (j == core_level && using_core_effs)
1290                 core_eff_sub_ids[get_core_eff_index(hw_thread)]++;
1291               if (j == core_level && using_core_types)
1292                 core_type_sub_ids[get_core_type_index(hw_thread)]++;
1293             } else if (targeted) {
1294               abs_sub_ids[j] = 0;
1295               if (j == core_level && using_core_effs)
1296                 core_eff_sub_ids[get_core_eff_index(hw_thread)] = 0;
1297               if (j == core_level && using_core_types)
1298                 core_type_sub_ids[get_core_type_index(hw_thread)] = 0;
1299             }
1300           }
1301           break;
1302         }
1303       }
1304       for (int level = 0; level < get_depth(); ++level)
1305         prev_sub_ids[level] = hw_thread.sub_ids[level];
1306     }
1307 
1308     // Check to see if this hardware thread should be filtered
1309     bool should_be_filtered = false;
1310     for (int hw_subset_index = 0; hw_subset_index < hw_subset_depth;
1311          ++hw_subset_index) {
1312       const auto &hw_subset_item = __kmp_hw_subset->at(hw_subset_index);
1313       int level = topology_levels[hw_subset_index];
1314       if (level == -1)
1315         continue;
1316       if ((using_core_effs || using_core_types) && level == core_level) {
1317         // Look for the core attribute in KMP_HW_SUBSET which corresponds
1318         // to this hardware thread's core attribute. Use this num,offset plus
1319         // the running sub_id for the particular core attribute of this hardware
1320         // thread to determine if the hardware thread should be filtered or not.
1321         int attr_idx;
1322         kmp_hw_core_type_t core_type = hw_thread.attrs.get_core_type();
1323         int core_eff = hw_thread.attrs.get_core_eff();
1324         for (attr_idx = 0; attr_idx < hw_subset_item.num_attrs; ++attr_idx) {
1325           if (using_core_types &&
1326               hw_subset_item.attr[attr_idx].get_core_type() == core_type)
1327             break;
1328           if (using_core_effs &&
1329               hw_subset_item.attr[attr_idx].get_core_eff() == core_eff)
1330             break;
1331         }
1332         // This core attribute isn't in the KMP_HW_SUBSET so always filter it.
1333         if (attr_idx == hw_subset_item.num_attrs) {
1334           should_be_filtered = true;
1335           break;
1336         }
1337         int sub_id;
1338         int num = hw_subset_item.num[attr_idx];
1339         int offset = hw_subset_item.offset[attr_idx];
1340         if (using_core_types)
1341           sub_id = core_type_sub_ids[get_core_type_index(hw_thread)];
1342         else
1343           sub_id = core_eff_sub_ids[get_core_eff_index(hw_thread)];
1344         if (sub_id < offset ||
1345             (num != kmp_hw_subset_t::USE_ALL && sub_id >= offset + num)) {
1346           should_be_filtered = true;
1347           break;
1348         }
1349       } else {
1350         int sub_id;
1351         int num = hw_subset_item.num[0];
1352         int offset = hw_subset_item.offset[0];
1353         if (is_absolute)
1354           sub_id = abs_sub_ids[level];
1355         else
1356           sub_id = hw_thread.sub_ids[level];
1357         if (sub_id < offset ||
1358             (num != kmp_hw_subset_t::USE_ALL && sub_id >= offset + num)) {
1359           should_be_filtered = true;
1360           break;
1361         }
1362       }
1363     }
1364     // Collect filtering information
1365     if (should_be_filtered) {
1366       KMP_CPU_CLR(hw_thread.os_id, filtered_mask);
1367       num_filtered++;
1368     }
1369   }
1370 
1371   // One last check that we shouldn't allow filtering the entire machine
1372   if (num_filtered == num_hw_threads) {
1373     KMP_AFF_WARNING(__kmp_affinity, AffHWSubsetAllFiltered);
1374     return false;
1375   }
1376 
1377   // Apply the filter
1378   restrict_to_mask(filtered_mask);
1379   return true;
1380 }
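// Example: KMP_HW_SUBSET=2s,2c,1t on a hypothetical 2-socket, 4-core-per-
// socket, 2-thread-per-core machine keeps both sockets, the first two cores of
// each socket, and one hardware thread per core, so 4 of the original 16
// hardware threads survive the filter.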
1381 
1382 bool kmp_topology_t::is_close(int hwt1, int hwt2,
1383                               const kmp_affinity_t &stgs) const {
1384   int hw_level = stgs.gran_levels;
1385   if (hw_level >= depth)
1386     return true;
1387   bool retval = true;
1388   const kmp_hw_thread_t &t1 = hw_threads[hwt1];
1389   const kmp_hw_thread_t &t2 = hw_threads[hwt2];
1390   if (stgs.flags.core_types_gran)
1391     return t1.attrs.get_core_type() == t2.attrs.get_core_type();
1392   if (stgs.flags.core_effs_gran)
1393     return t1.attrs.get_core_eff() == t2.attrs.get_core_eff();
1394   for (int i = 0; i < (depth - hw_level); ++i) {
1395     if (t1.ids[i] != t2.ids[i])
1396       return false;
1397   }
1398   return retval;
1399 }
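// Example: with a socket/core/thread topology and granularity=core,
// stgs.gran_levels == 1, so the loop compares ids on every level except the
// innermost (thread) level and two hardware threads are "close" exactly when
// they share a core.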
1400 
1401 ////////////////////////////////////////////////////////////////////////////////
1402 
1403 bool KMPAffinity::picked_api = false;
1404 
1405 void *KMPAffinity::Mask::operator new(size_t n) { return __kmp_allocate(n); }
1406 void *KMPAffinity::Mask::operator new[](size_t n) { return __kmp_allocate(n); }
1407 void KMPAffinity::Mask::operator delete(void *p) { __kmp_free(p); }
1408 void KMPAffinity::Mask::operator delete[](void *p) { __kmp_free(p); }
1409 void *KMPAffinity::operator new(size_t n) { return __kmp_allocate(n); }
1410 void KMPAffinity::operator delete(void *p) { __kmp_free(p); }
1411 
1412 void KMPAffinity::pick_api() {
1413   KMPAffinity *affinity_dispatch;
1414   if (picked_api)
1415     return;
1416 #if KMP_USE_HWLOC
1417   // Only use Hwloc if affinity isn't explicitly disabled and
1418   // user requests Hwloc topology method
1419   if (__kmp_affinity_top_method == affinity_top_method_hwloc &&
1420       __kmp_affinity.type != affinity_disabled) {
1421     affinity_dispatch = new KMPHwlocAffinity();
1422   } else
1423 #endif
1424   {
1425     affinity_dispatch = new KMPNativeAffinity();
1426   }
1427   __kmp_affinity_dispatch = affinity_dispatch;
1428   picked_api = true;
1429 }
1430 
destroy_api()1431 void KMPAffinity::destroy_api() {
1432   if (__kmp_affinity_dispatch != NULL) {
1433     delete __kmp_affinity_dispatch;
1434     __kmp_affinity_dispatch = NULL;
1435     picked_api = false;
1436   }
1437 }
1438 
1439 #define KMP_ADVANCE_SCAN(scan)                                                 \
1440   while (*scan != '\0') {                                                      \
1441     scan++;                                                                    \
1442   }
1443 
1444 // Print the affinity mask to the character array in a pretty format.
1445 // The format is a comma separated list of non-negative integers or integer
1446 // ranges: e.g., 1,2,3-5,7,9-15
1447 // The format can also be the string "{<empty>}" if no bits are set in mask
__kmp_affinity_print_mask(char * buf,int buf_len,kmp_affin_mask_t * mask)1448 char *__kmp_affinity_print_mask(char *buf, int buf_len,
1449                                 kmp_affin_mask_t *mask) {
1450   int start = 0, finish = 0, previous = 0;
1451   bool first_range;
1452   KMP_ASSERT(buf);
1453   KMP_ASSERT(buf_len >= 40);
1454   KMP_ASSERT(mask);
1455   char *scan = buf;
1456   char *end = buf + buf_len - 1;
1457 
1458   // Check for empty set.
1459   if (mask->begin() == mask->end()) {
1460     KMP_SNPRINTF(scan, end - scan + 1, "{<empty>}");
1461     KMP_ADVANCE_SCAN(scan);
1462     KMP_ASSERT(scan <= end);
1463     return buf;
1464   }
1465 
1466   first_range = true;
1467   start = mask->begin();
1468   while (1) {
1469     // Find next range
1470     // [start, previous] is inclusive range of contiguous bits in mask
1471     for (finish = mask->next(start), previous = start;
1472          finish == previous + 1 && finish != mask->end();
1473          finish = mask->next(finish)) {
1474       previous = finish;
1475     }
1476 
1477     // The first range does not need a comma printed before it, but the rest
1478     // of the ranges do need a comma beforehand
1479     if (!first_range) {
1480       KMP_SNPRINTF(scan, end - scan + 1, "%s", ",");
1481       KMP_ADVANCE_SCAN(scan);
1482     } else {
1483       first_range = false;
1484     }
1485     // Range with three or more contiguous bits in the affinity mask
1486     if (previous - start > 1) {
1487       KMP_SNPRINTF(scan, end - scan + 1, "%u-%u", start, previous);
1488     } else {
1489       // Range with one or two contiguous bits in the affinity mask
1490       KMP_SNPRINTF(scan, end - scan + 1, "%u", start);
1491       KMP_ADVANCE_SCAN(scan);
1492       if (previous - start > 0) {
1493         KMP_SNPRINTF(scan, end - scan + 1, ",%u", previous);
1494       }
1495     }
1496     KMP_ADVANCE_SCAN(scan);
1497     // Start over with new start point
1498     start = finish;
1499     if (start == mask->end())
1500       break;
1501     // Check for overflow
1502     if (end - scan < 2)
1503       break;
1504   }
1505 
1506   // Check for overflow
1507   KMP_ASSERT(scan <= end);
1508   return buf;
1509 }
1510 #undef KMP_ADVANCE_SCAN
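// Usage sketch (illustrative only, not part of the runtime); the buffer size
// below is an assumption, not taken from the surrounding code.
#if 0
  char buf[1024];
  __kmp_affinity_print_mask(buf, sizeof(buf), __kmp_affin_fullMask);
  // For a mask with bits {0,1,2,3,8,10,11} set, buf now holds "0-3,8,10,11";
  // for an empty mask it holds "{<empty>}".
#endif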
1511 
1512 // Print the affinity mask to the string buffer object in a pretty format
1513 // The format is a comma separated list of non-negative integers or integer
1514 // ranges: e.g., 1,2,3-5,7,9-15
1515 // The format can also be the string "{<empty>}" if no bits are set in mask
__kmp_affinity_str_buf_mask(kmp_str_buf_t * buf,kmp_affin_mask_t * mask)1516 kmp_str_buf_t *__kmp_affinity_str_buf_mask(kmp_str_buf_t *buf,
1517                                            kmp_affin_mask_t *mask) {
1518   int start = 0, finish = 0, previous = 0;
1519   bool first_range;
1520   KMP_ASSERT(buf);
1521   KMP_ASSERT(mask);
1522 
1523   __kmp_str_buf_clear(buf);
1524 
1525   // Check for empty set.
1526   if (mask->begin() == mask->end()) {
1527     __kmp_str_buf_print(buf, "%s", "{<empty>}");
1528     return buf;
1529   }
1530 
1531   first_range = true;
1532   start = mask->begin();
1533   while (1) {
1534     // Find next range
1535     // [start, previous] is inclusive range of contiguous bits in mask
1536     for (finish = mask->next(start), previous = start;
1537          finish == previous + 1 && finish != mask->end();
1538          finish = mask->next(finish)) {
1539       previous = finish;
1540     }
1541 
1542     // The first range does not need a comma printed before it, but the rest
1543     // of the ranges do need a comma beforehand
1544     if (!first_range) {
1545       __kmp_str_buf_print(buf, "%s", ",");
1546     } else {
1547       first_range = false;
1548     }
1549     // Range with three or more contiguous bits in the affinity mask
1550     if (previous - start > 1) {
1551       __kmp_str_buf_print(buf, "%u-%u", start, previous);
1552     } else {
1553       // Range with one or two contiguous bits in the affinity mask
1554       __kmp_str_buf_print(buf, "%u", start);
1555       if (previous - start > 0) {
1556         __kmp_str_buf_print(buf, ",%u", previous);
1557       }
1558     }
1559     // Start over with new start point
1560     start = finish;
1561     if (start == mask->end())
1562       break;
1563   }
1564   return buf;
1565 }
1566 
1567 // Return (possibly empty) affinity mask representing the offline CPUs
1568 // Caller must free the mask
__kmp_affinity_get_offline_cpus()1569 kmp_affin_mask_t *__kmp_affinity_get_offline_cpus() {
1570   kmp_affin_mask_t *offline;
1571   KMP_CPU_ALLOC(offline);
1572   KMP_CPU_ZERO(offline);
1573 #if KMP_OS_LINUX
1574   int n, begin_cpu, end_cpu;
1575   kmp_safe_raii_file_t offline_file;
1576   auto skip_ws = [](FILE *f) {
1577     int c;
1578     do {
1579       c = fgetc(f);
1580     } while (isspace(c));
1581     if (c != EOF)
1582       ungetc(c, f);
1583   };
1584   // File contains CSV of integer ranges representing the offline CPUs
1585   // e.g., 1,2,4-7,9,11-15
1586   int status = offline_file.try_open("/sys/devices/system/cpu/offline", "r");
1587   if (status != 0)
1588     return offline;
1589   while (!feof(offline_file)) {
1590     skip_ws(offline_file);
1591     n = fscanf(offline_file, "%d", &begin_cpu);
1592     if (n != 1)
1593       break;
1594     skip_ws(offline_file);
1595     int c = fgetc(offline_file);
1596     if (c == EOF || c == ',') {
1597       // Just single CPU
1598       end_cpu = begin_cpu;
1599     } else if (c == '-') {
1600       // Range of CPUs
1601       skip_ws(offline_file);
1602       n = fscanf(offline_file, "%d", &end_cpu);
1603       if (n != 1)
1604         break;
1605       skip_ws(offline_file);
1606       c = fgetc(offline_file); // skip ','
1607     } else {
1608       // Syntax problem
1609       break;
1610     }
1611     // Ensure a valid range of CPUs
1612     if (begin_cpu < 0 || begin_cpu >= __kmp_xproc || end_cpu < 0 ||
1613         end_cpu >= __kmp_xproc || begin_cpu > end_cpu) {
1614       continue;
1615     }
1616     // Insert [begin_cpu, end_cpu] into offline mask
1617     for (int cpu = begin_cpu; cpu <= end_cpu; ++cpu) {
1618       KMP_CPU_SET(cpu, offline);
1619     }
1620   }
1621 #endif
1622   return offline;
1623 }
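// Worked example (illustrative): if /sys/devices/system/cpu/offline contains
// "2-3,6" and __kmp_xproc is at least 7, the returned mask has bits 2, 3 and 6
// set; if the file is absent or empty, the mask is returned empty.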
1624 
1625 // Return the number of available procs
__kmp_affinity_entire_machine_mask(kmp_affin_mask_t * mask)1626 int __kmp_affinity_entire_machine_mask(kmp_affin_mask_t *mask) {
1627   int avail_proc = 0;
1628   KMP_CPU_ZERO(mask);
1629 
1630 #if KMP_GROUP_AFFINITY
1631 
1632   if (__kmp_num_proc_groups > 1) {
1633     int group;
1634     KMP_DEBUG_ASSERT(__kmp_GetActiveProcessorCount != NULL);
1635     for (group = 0; group < __kmp_num_proc_groups; group++) {
1636       int i;
1637       int num = __kmp_GetActiveProcessorCount(group);
1638       for (i = 0; i < num; i++) {
1639         KMP_CPU_SET(i + group * (CHAR_BIT * sizeof(DWORD_PTR)), mask);
1640         avail_proc++;
1641       }
1642     }
1643   } else
1644 
1645 #endif /* KMP_GROUP_AFFINITY */
1646 
1647   {
1648     int proc;
1649     kmp_affin_mask_t *offline_cpus = __kmp_affinity_get_offline_cpus();
1650     for (proc = 0; proc < __kmp_xproc; proc++) {
1651       // Skip offline CPUs
1652       if (KMP_CPU_ISSET(proc, offline_cpus))
1653         continue;
1654       KMP_CPU_SET(proc, mask);
1655       avail_proc++;
1656     }
1657     KMP_CPU_FREE(offline_cpus);
1658   }
1659 
1660   return avail_proc;
1661 }
1662 
1663 // All of the __kmp_affinity_create_*_map() routines should allocate the
1664 // internal topology object and set the layer ids for it.  Each routine
1665 // returns a boolean on whether it was successful at doing so.
1666 kmp_affin_mask_t *__kmp_affin_fullMask = NULL;
1667 // Original mask is a subset of full mask in multiple processor groups topology
1668 kmp_affin_mask_t *__kmp_affin_origMask = NULL;
1669 
1670 #if KMP_USE_HWLOC
__kmp_hwloc_is_cache_type(hwloc_obj_t obj)1671 static inline bool __kmp_hwloc_is_cache_type(hwloc_obj_t obj) {
1672 #if HWLOC_API_VERSION >= 0x00020000
1673   return hwloc_obj_type_is_cache(obj->type);
1674 #else
1675   return obj->type == HWLOC_OBJ_CACHE;
1676 #endif
1677 }
1678 
1679 // Returns KMP_HW_* type derived from HWLOC_* type
__kmp_hwloc_type_2_topology_type(hwloc_obj_t obj)1680 static inline kmp_hw_t __kmp_hwloc_type_2_topology_type(hwloc_obj_t obj) {
1681 
1682   if (__kmp_hwloc_is_cache_type(obj)) {
1683     if (obj->attr->cache.type == HWLOC_OBJ_CACHE_INSTRUCTION)
1684       return KMP_HW_UNKNOWN;
1685     switch (obj->attr->cache.depth) {
1686     case 1:
1687       return KMP_HW_L1;
1688     case 2:
1689 #if KMP_MIC_SUPPORTED
1690       if (__kmp_mic_type == mic3) {
1691         return KMP_HW_TILE;
1692       }
1693 #endif
1694       return KMP_HW_L2;
1695     case 3:
1696       return KMP_HW_L3;
1697     }
1698     return KMP_HW_UNKNOWN;
1699   }
1700 
1701   switch (obj->type) {
1702   case HWLOC_OBJ_PACKAGE:
1703     return KMP_HW_SOCKET;
1704   case HWLOC_OBJ_NUMANODE:
1705     return KMP_HW_NUMA;
1706   case HWLOC_OBJ_CORE:
1707     return KMP_HW_CORE;
1708   case HWLOC_OBJ_PU:
1709     return KMP_HW_THREAD;
1710   case HWLOC_OBJ_GROUP:
1711 #if HWLOC_API_VERSION >= 0x00020000
1712     if (obj->attr->group.kind == HWLOC_GROUP_KIND_INTEL_DIE)
1713       return KMP_HW_DIE;
1714     else if (obj->attr->group.kind == HWLOC_GROUP_KIND_INTEL_TILE)
1715       return KMP_HW_TILE;
1716     else if (obj->attr->group.kind == HWLOC_GROUP_KIND_INTEL_MODULE)
1717       return KMP_HW_MODULE;
1718     else if (obj->attr->group.kind == HWLOC_GROUP_KIND_WINDOWS_PROCESSOR_GROUP)
1719       return KMP_HW_PROC_GROUP;
1720 #endif
1721     return KMP_HW_UNKNOWN;
1722 #if HWLOC_API_VERSION >= 0x00020100
1723   case HWLOC_OBJ_DIE:
1724     return KMP_HW_DIE;
1725 #endif
1726   }
1727   return KMP_HW_UNKNOWN;
1728 }
1729 
1730 // Returns the number of objects of type 'type' below 'obj' within the topology
1731 // tree structure. e.g., if obj is a HWLOC_OBJ_PACKAGE object, and type is
1732 // HWLOC_OBJ_PU, then this will return the number of PU's under the package
1733 // (socket) object.
__kmp_hwloc_get_nobjs_under_obj(hwloc_obj_t obj,hwloc_obj_type_t type)1734 static int __kmp_hwloc_get_nobjs_under_obj(hwloc_obj_t obj,
1735                                            hwloc_obj_type_t type) {
1736   int retval = 0;
1737   hwloc_obj_t first;
1738   for (first = hwloc_get_obj_below_by_type(__kmp_hwloc_topology, obj->type,
1739                                            obj->logical_index, type, 0);
1740        first != NULL && hwloc_get_ancestor_obj_by_type(__kmp_hwloc_topology,
1741                                                        obj->type, first) == obj;
1742        first = hwloc_get_next_obj_by_type(__kmp_hwloc_topology, first->type,
1743                                           first)) {
1744     ++retval;
1745   }
1746   return retval;
1747 }
1748 
1749 // This gets the sub_id for a lower object under a higher object in the
1750 // topology tree
__kmp_hwloc_get_sub_id(hwloc_topology_t t,hwloc_obj_t higher,hwloc_obj_t lower)1751 static int __kmp_hwloc_get_sub_id(hwloc_topology_t t, hwloc_obj_t higher,
1752                                   hwloc_obj_t lower) {
1753   hwloc_obj_t obj;
1754   hwloc_obj_type_t ltype = lower->type;
1755   int lindex = lower->logical_index - 1;
1756   int sub_id = 0;
1757   // Get the previous lower object
1758   obj = hwloc_get_obj_by_type(t, ltype, lindex);
1759   while (obj && lindex >= 0 &&
1760          hwloc_bitmap_isincluded(obj->cpuset, higher->cpuset)) {
1761     if (obj->userdata) {
1762       sub_id = (int)(RCAST(kmp_intptr_t, obj->userdata));
1763       break;
1764     }
1765     sub_id++;
1766     lindex--;
1767     obj = hwloc_get_obj_by_type(t, ltype, lindex);
1768   }
1769   // store sub_id + 1 so that 0 is differed from NULL
1770   lower->userdata = RCAST(void *, sub_id + 1);
1771   return sub_id;
1772 }
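// Illustrative walk-through (not from the original source): if 'lower' is the
// third such object (logical indices 0..2) whose cpuset is included in
// 'higher', the loop either finds a cached sub_id in a previous sibling's
// userdata or counts the preceding siblings, yielding sub_id == 2; userdata
// then caches 3 (sub_id + 1).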
1773 
__kmp_affinity_create_hwloc_map(kmp_i18n_id_t * const msg_id)1774 static bool __kmp_affinity_create_hwloc_map(kmp_i18n_id_t *const msg_id) {
1775   kmp_hw_t type;
1776   int hw_thread_index, sub_id;
1777   int depth;
1778   hwloc_obj_t pu, obj, root, prev;
1779   kmp_hw_t types[KMP_HW_LAST];
1780   hwloc_obj_type_t hwloc_types[KMP_HW_LAST];
1781 
1782   hwloc_topology_t tp = __kmp_hwloc_topology;
1783   *msg_id = kmp_i18n_null;
1784   if (__kmp_affinity.flags.verbose) {
1785     KMP_INFORM(AffUsingHwloc, "KMP_AFFINITY");
1786   }
1787 
1788   if (!KMP_AFFINITY_CAPABLE()) {
1789     // Hack to try and infer the machine topology using only the data
1790     // available from hwloc on the current thread, and __kmp_xproc.
1791     KMP_ASSERT(__kmp_affinity.type == affinity_none);
1792     // hwloc only guarantees existance of PU object, so check PACKAGE and CORE
1793     // hwloc only guarantees existence of PU object, so check PACKAGE and CORE
1794     if (o != NULL)
1795       nCoresPerPkg = __kmp_hwloc_get_nobjs_under_obj(o, HWLOC_OBJ_CORE);
1796     else
1797       nCoresPerPkg = 1; // no PACKAGE found
1798     o = hwloc_get_obj_by_type(tp, HWLOC_OBJ_CORE, 0);
1799     if (o != NULL)
1800       __kmp_nThreadsPerCore = __kmp_hwloc_get_nobjs_under_obj(o, HWLOC_OBJ_PU);
1801     else
1802       __kmp_nThreadsPerCore = 1; // no CORE found
1803     if (__kmp_nThreadsPerCore == 0)
1804       __kmp_nThreadsPerCore = 1;
1805     __kmp_ncores = __kmp_xproc / __kmp_nThreadsPerCore;
1806     if (nCoresPerPkg == 0)
1807       nCoresPerPkg = 1; // to prevent possible division by 0
1808     nPackages = (__kmp_xproc + nCoresPerPkg - 1) / nCoresPerPkg;
1809     return true;
1810   }
1811 
1812 #if HWLOC_API_VERSION >= 0x00020400
1813   // Handle multiple types of cores if they exist on the system
1814   int nr_cpu_kinds = hwloc_cpukinds_get_nr(tp, 0);
1815 
1816   typedef struct kmp_hwloc_cpukinds_info_t {
1817     int efficiency;
1818     kmp_hw_core_type_t core_type;
1819     hwloc_bitmap_t mask;
1820   } kmp_hwloc_cpukinds_info_t;
1821   kmp_hwloc_cpukinds_info_t *cpukinds = nullptr;
1822 
1823   if (nr_cpu_kinds > 0) {
1824     unsigned nr_infos;
1825     struct hwloc_info_s *infos;
1826     cpukinds = (kmp_hwloc_cpukinds_info_t *)__kmp_allocate(
1827         sizeof(kmp_hwloc_cpukinds_info_t) * nr_cpu_kinds);
1828     for (unsigned idx = 0; idx < (unsigned)nr_cpu_kinds; ++idx) {
1829       cpukinds[idx].efficiency = -1;
1830       cpukinds[idx].core_type = KMP_HW_CORE_TYPE_UNKNOWN;
1831       cpukinds[idx].mask = hwloc_bitmap_alloc();
1832       if (hwloc_cpukinds_get_info(tp, idx, cpukinds[idx].mask,
1833                                   &cpukinds[idx].efficiency, &nr_infos, &infos,
1834                                   0) == 0) {
1835         for (unsigned i = 0; i < nr_infos; ++i) {
1836           if (__kmp_str_match("CoreType", 8, infos[i].name)) {
1837 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
1838             if (__kmp_str_match("IntelAtom", 9, infos[i].value)) {
1839               cpukinds[idx].core_type = KMP_HW_CORE_TYPE_ATOM;
1840               break;
1841             } else if (__kmp_str_match("IntelCore", 9, infos[i].value)) {
1842               cpukinds[idx].core_type = KMP_HW_CORE_TYPE_CORE;
1843               break;
1844             }
1845 #endif
1846           }
1847         }
1848       }
1849     }
1850   }
1851 #endif
1852 
1853   root = hwloc_get_root_obj(tp);
1854 
1855   // Figure out the depth and types in the topology
1856   depth = 0;
1857   obj = hwloc_get_pu_obj_by_os_index(tp, __kmp_affin_fullMask->begin());
1858   while (obj && obj != root) {
1859 #if HWLOC_API_VERSION >= 0x00020000
1860     if (obj->memory_arity) {
1861       hwloc_obj_t memory;
1862       for (memory = obj->memory_first_child; memory;
1863            memory = hwloc_get_next_child(tp, obj, memory)) {
1864         if (memory->type == HWLOC_OBJ_NUMANODE)
1865           break;
1866       }
1867       if (memory && memory->type == HWLOC_OBJ_NUMANODE) {
1868         types[depth] = KMP_HW_NUMA;
1869         hwloc_types[depth] = memory->type;
1870         depth++;
1871       }
1872     }
1873 #endif
1874     type = __kmp_hwloc_type_2_topology_type(obj);
1875     if (type != KMP_HW_UNKNOWN) {
1876       types[depth] = type;
1877       hwloc_types[depth] = obj->type;
1878       depth++;
1879     }
1880     obj = obj->parent;
1881   }
1882   KMP_ASSERT(depth > 0);
1883 
1884   // Get the order for the types correct
1885   for (int i = 0, j = depth - 1; i < j; ++i, --j) {
1886     hwloc_obj_type_t hwloc_temp = hwloc_types[i];
1887     kmp_hw_t temp = types[i];
1888     types[i] = types[j];
1889     types[j] = temp;
1890     hwloc_types[i] = hwloc_types[j];
1891     hwloc_types[j] = hwloc_temp;
1892   }
1893 
1894   // Allocate the data structure to be returned.
1895   __kmp_topology = kmp_topology_t::allocate(__kmp_avail_proc, depth, types);
1896 
1897   hw_thread_index = 0;
1898   pu = NULL;
1899   while ((pu = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, pu))) {
1900     int index = depth - 1;
1901     bool included = KMP_CPU_ISSET(pu->os_index, __kmp_affin_fullMask);
1902     kmp_hw_thread_t &hw_thread = __kmp_topology->at(hw_thread_index);
1903     if (included) {
1904       hw_thread.clear();
1905       hw_thread.ids[index] = pu->logical_index;
1906       hw_thread.os_id = pu->os_index;
1907       // If multiple core types, then set that attribute for the hardware thread
1908 #if HWLOC_API_VERSION >= 0x00020400
1909       if (cpukinds) {
1910         int cpukind_index = -1;
1911         for (int i = 0; i < nr_cpu_kinds; ++i) {
1912           if (hwloc_bitmap_isset(cpukinds[i].mask, hw_thread.os_id)) {
1913             cpukind_index = i;
1914             break;
1915           }
1916         }
1917         if (cpukind_index >= 0) {
1918           hw_thread.attrs.set_core_type(cpukinds[cpukind_index].core_type);
1919           hw_thread.attrs.set_core_eff(cpukinds[cpukind_index].efficiency);
1920         }
1921       }
1922 #endif
1923       index--;
1924     }
1925     obj = pu;
1926     prev = obj;
1927     while (obj != root && obj != NULL) {
1928       obj = obj->parent;
1929 #if HWLOC_API_VERSION >= 0x00020000
1930       // NUMA Nodes are handled differently since they are not within the
1931       // parent/child structure anymore.  They are separate children
1932       // of obj (memory_first_child points to first memory child)
1933       if (obj->memory_arity) {
1934         hwloc_obj_t memory;
1935         for (memory = obj->memory_first_child; memory;
1936              memory = hwloc_get_next_child(tp, obj, memory)) {
1937           if (memory->type == HWLOC_OBJ_NUMANODE)
1938             break;
1939         }
1940         if (memory && memory->type == HWLOC_OBJ_NUMANODE) {
1941           sub_id = __kmp_hwloc_get_sub_id(tp, memory, prev);
1942           if (included) {
1943             hw_thread.ids[index] = memory->logical_index;
1944             hw_thread.ids[index + 1] = sub_id;
1945             index--;
1946           }
1947           prev = memory;
1948         }
1949         prev = obj;
1950       }
1951 #endif
1952       type = __kmp_hwloc_type_2_topology_type(obj);
1953       if (type != KMP_HW_UNKNOWN) {
1954         sub_id = __kmp_hwloc_get_sub_id(tp, obj, prev);
1955         if (included) {
1956           hw_thread.ids[index] = obj->logical_index;
1957           hw_thread.ids[index + 1] = sub_id;
1958           index--;
1959         }
1960         prev = obj;
1961       }
1962     }
1963     if (included)
1964       hw_thread_index++;
1965   }
1966 
1967 #if HWLOC_API_VERSION >= 0x00020400
1968   // Free the core types information
1969   if (cpukinds) {
1970     for (int idx = 0; idx < nr_cpu_kinds; ++idx)
1971       hwloc_bitmap_free(cpukinds[idx].mask);
1972     __kmp_free(cpukinds);
1973   }
1974 #endif
1975   __kmp_topology->sort_ids();
1976   return true;
1977 }
1978 #endif // KMP_USE_HWLOC
1979 
1980 // If we don't know how to retrieve the machine's processor topology, or
1981 // encounter an error in doing so, this routine is called to form a "flat"
1982 // mapping of os thread id's <-> processor id's.
__kmp_affinity_create_flat_map(kmp_i18n_id_t * const msg_id)1983 static bool __kmp_affinity_create_flat_map(kmp_i18n_id_t *const msg_id) {
1984   *msg_id = kmp_i18n_null;
1985   int depth = 3;
1986   kmp_hw_t types[] = {KMP_HW_SOCKET, KMP_HW_CORE, KMP_HW_THREAD};
1987 
1988   if (__kmp_affinity.flags.verbose) {
1989     KMP_INFORM(UsingFlatOS, "KMP_AFFINITY");
1990   }
1991 
1992   // Even if __kmp_affinity.type == affinity_none, this routine might still
1993   // be called to set __kmp_ncores, as well as
1994   // __kmp_nThreadsPerCore, nCoresPerPkg, & nPackages.
1995   if (!KMP_AFFINITY_CAPABLE()) {
1996     KMP_ASSERT(__kmp_affinity.type == affinity_none);
1997     __kmp_ncores = nPackages = __kmp_xproc;
1998     __kmp_nThreadsPerCore = nCoresPerPkg = 1;
1999     return true;
2000   }
2001 
2002   // When affinity is off, this routine will still be called to set
2003   // __kmp_ncores, as well as __kmp_nThreadsPerCore, nCoresPerPkg, & nPackages.
2004   // Make sure all these vars are set correctly, and return now if affinity is
2005   // not enabled.
2006   __kmp_ncores = nPackages = __kmp_avail_proc;
2007   __kmp_nThreadsPerCore = nCoresPerPkg = 1;
2008 
2009   // Construct the data structure to be returned.
2010   __kmp_topology = kmp_topology_t::allocate(__kmp_avail_proc, depth, types);
2011   int avail_ct = 0;
2012   int i;
2013   KMP_CPU_SET_ITERATE(i, __kmp_affin_fullMask) {
2014     // Skip this proc if it is not included in the machine model.
2015     if (!KMP_CPU_ISSET(i, __kmp_affin_fullMask)) {
2016       continue;
2017     }
2018     kmp_hw_thread_t &hw_thread = __kmp_topology->at(avail_ct);
2019     hw_thread.clear();
2020     hw_thread.os_id = i;
2021     hw_thread.ids[0] = i;
2022     hw_thread.ids[1] = 0;
2023     hw_thread.ids[2] = 0;
2024     avail_ct++;
2025   }
2026   if (__kmp_affinity.flags.verbose) {
2027     KMP_INFORM(OSProcToPackage, "KMP_AFFINITY");
2028   }
2029   return true;
2030 }
2031 
2032 #if KMP_GROUP_AFFINITY
2033 // If multiple Windows* OS processor groups exist, we can create a 2-level
2034 // topology map with the groups at level 0 and the individual procs at level 1.
2035 // This facilitates letting the threads float among all procs in a group,
2036 // if granularity=group (the default when there are multiple groups).
__kmp_affinity_create_proc_group_map(kmp_i18n_id_t * const msg_id)2037 static bool __kmp_affinity_create_proc_group_map(kmp_i18n_id_t *const msg_id) {
2038   *msg_id = kmp_i18n_null;
2039   int depth = 3;
2040   kmp_hw_t types[] = {KMP_HW_PROC_GROUP, KMP_HW_CORE, KMP_HW_THREAD};
2041   const static size_t BITS_PER_GROUP = CHAR_BIT * sizeof(DWORD_PTR);
2042 
2043   if (__kmp_affinity.flags.verbose) {
2044     KMP_INFORM(AffWindowsProcGroupMap, "KMP_AFFINITY");
2045   }
2046 
2047   // If we aren't affinity capable, then use flat topology
2048   if (!KMP_AFFINITY_CAPABLE()) {
2049     KMP_ASSERT(__kmp_affinity.type == affinity_none);
2050     nPackages = __kmp_num_proc_groups;
2051     __kmp_nThreadsPerCore = 1;
2052     __kmp_ncores = __kmp_xproc;
2053     nCoresPerPkg = __kmp_ncores / nPackages;
2054     return true;
2055   }
2056 
2057   // Construct the data structure to be returned.
2058   __kmp_topology = kmp_topology_t::allocate(__kmp_avail_proc, depth, types);
2059   int avail_ct = 0;
2060   int i;
2061   KMP_CPU_SET_ITERATE(i, __kmp_affin_fullMask) {
2062     // Skip this proc if it is not included in the machine model.
2063     if (!KMP_CPU_ISSET(i, __kmp_affin_fullMask)) {
2064       continue;
2065     }
2066     kmp_hw_thread_t &hw_thread = __kmp_topology->at(avail_ct++);
2067     hw_thread.clear();
2068     hw_thread.os_id = i;
2069     hw_thread.ids[0] = i / BITS_PER_GROUP;
2070     hw_thread.ids[1] = hw_thread.ids[2] = i % BITS_PER_GROUP;
2071   }
2072   return true;
2073 }
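// Worked example (illustrative): on a 64-bit Windows build BITS_PER_GROUP is
// 64, so OS proc 70 gets ids[0] = 1 (70 / 64) and ids[1] = ids[2] = 6
// (70 % 64).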
2074 #endif /* KMP_GROUP_AFFINITY */
2075 
2076 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
2077 
2078 template <kmp_uint32 LSB, kmp_uint32 MSB>
__kmp_extract_bits(kmp_uint32 v)2079 static inline unsigned __kmp_extract_bits(kmp_uint32 v) {
2080   const kmp_uint32 SHIFT_LEFT = sizeof(kmp_uint32) * 8 - 1 - MSB;
2081   const kmp_uint32 SHIFT_RIGHT = LSB;
2082   kmp_uint32 retval = v;
2083   retval <<= SHIFT_LEFT;
2084   retval >>= (SHIFT_LEFT + SHIFT_RIGHT);
2085   return retval;
2086 }
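// Worked example (illustrative): __kmp_extract_bits<24, 31>(0xAB123456) shifts
// left by 0 and right by 24, yielding 0xAB; __kmp_extract_bits<0, 4>(0x25)
// shifts left by 27 and then right by 27, yielding 0x5.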
2087 
__kmp_cpuid_mask_width(int count)2088 static int __kmp_cpuid_mask_width(int count) {
2089   int r = 0;
2090 
2091   while ((1 << r) < count)
2092     ++r;
2093   return r;
2094 }
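// Worked example (illustrative): __kmp_cpuid_mask_width(6) == 3 (2^3 = 8 is
// the first power of two >= 6) and __kmp_cpuid_mask_width(4) == 2.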
2095 
2096 class apicThreadInfo {
2097 public:
2098   unsigned osId; // param to __kmp_affinity_bind_thread
2099   unsigned apicId; // from cpuid after binding
2100   unsigned maxCoresPerPkg; //      ""
2101   unsigned maxThreadsPerPkg; //      ""
2102   unsigned pkgId; // inferred from above values
2103   unsigned coreId; //      ""
2104   unsigned threadId; //      ""
2105 };
2106 
__kmp_affinity_cmp_apicThreadInfo_phys_id(const void * a,const void * b)2107 static int __kmp_affinity_cmp_apicThreadInfo_phys_id(const void *a,
2108                                                      const void *b) {
2109   const apicThreadInfo *aa = (const apicThreadInfo *)a;
2110   const apicThreadInfo *bb = (const apicThreadInfo *)b;
2111   if (aa->pkgId < bb->pkgId)
2112     return -1;
2113   if (aa->pkgId > bb->pkgId)
2114     return 1;
2115   if (aa->coreId < bb->coreId)
2116     return -1;
2117   if (aa->coreId > bb->coreId)
2118     return 1;
2119   if (aa->threadId < bb->threadId)
2120     return -1;
2121   if (aa->threadId > bb->threadId)
2122     return 1;
2123   return 0;
2124 }
2125 
2126 class kmp_cache_info_t {
2127 public:
2128   struct info_t {
2129     unsigned level, mask;
2130   };
kmp_cache_info_t()2131   kmp_cache_info_t() : depth(0) { get_leaf4_levels(); }
get_depth() const2132   size_t get_depth() const { return depth; }
operator [](size_t index)2133   info_t &operator[](size_t index) { return table[index]; }
operator [](size_t index) const2134   const info_t &operator[](size_t index) const { return table[index]; }
2135 
get_topology_type(unsigned level)2136   static kmp_hw_t get_topology_type(unsigned level) {
2137     KMP_DEBUG_ASSERT(level >= 1 && level <= MAX_CACHE_LEVEL);
2138     switch (level) {
2139     case 1:
2140       return KMP_HW_L1;
2141     case 2:
2142       return KMP_HW_L2;
2143     case 3:
2144       return KMP_HW_L3;
2145     }
2146     return KMP_HW_UNKNOWN;
2147   }
2148 
2149 private:
2150   static const int MAX_CACHE_LEVEL = 3;
2151 
2152   size_t depth;
2153   info_t table[MAX_CACHE_LEVEL];
2154 
get_leaf4_levels()2155   void get_leaf4_levels() {
2156     unsigned level = 0;
2157     while (depth < MAX_CACHE_LEVEL) {
2158       unsigned cache_type, max_threads_sharing;
2159       unsigned cache_level, cache_mask_width;
2160       kmp_cpuid buf2;
2161       __kmp_x86_cpuid(4, level, &buf2);
2162       cache_type = __kmp_extract_bits<0, 4>(buf2.eax);
2163       if (!cache_type)
2164         break;
2165       // Skip instruction caches
2166       if (cache_type == 2) {
2167         level++;
2168         continue;
2169       }
2170       max_threads_sharing = __kmp_extract_bits<14, 25>(buf2.eax) + 1;
2171       cache_mask_width = __kmp_cpuid_mask_width(max_threads_sharing);
2172       cache_level = __kmp_extract_bits<5, 7>(buf2.eax);
2173       table[depth].level = cache_level;
2174       table[depth].mask = ((-1) << cache_mask_width);
2175       depth++;
2176       level++;
2177     }
2178   }
2179 };
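// Illustrative example (values assumed, not from the original source): if
// leaf 4 reports an L2 shared by up to 2 logical processors, then
// max_threads_sharing == 2, cache_mask_width == 1 and the stored mask is
// 0xFFFFFFFE; two hardware threads whose APIC ids are equal after AND-ing
// with this mask share that L2.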
2180 
2181 // On IA-32 architecture and Intel(R) 64 architecture, we attempt to use
2182 // an algorithm which cycles through the available os threads, setting
2183 // the current thread's affinity mask to that thread, and then retrieving
2184 // the Apic Id for each thread context using the cpuid instruction.
__kmp_affinity_create_apicid_map(kmp_i18n_id_t * const msg_id)2185 static bool __kmp_affinity_create_apicid_map(kmp_i18n_id_t *const msg_id) {
2186   kmp_cpuid buf;
2187   *msg_id = kmp_i18n_null;
2188 
2189   if (__kmp_affinity.flags.verbose) {
2190     KMP_INFORM(AffInfoStr, "KMP_AFFINITY", KMP_I18N_STR(DecodingLegacyAPIC));
2191   }
2192 
2193   // Check if cpuid leaf 4 is supported.
2194   __kmp_x86_cpuid(0, 0, &buf);
2195   if (buf.eax < 4) {
2196     *msg_id = kmp_i18n_str_NoLeaf4Support;
2197     return false;
2198   }
2199 
2200   // The algorithm used starts by setting the affinity to each available thread
2201   // and retrieving info from the cpuid instruction, so if we are not capable of
2202   // calling __kmp_get_system_affinity() and __kmp_set_system_affinity(), then we
2203   // need to do something else - use the defaults that we calculated from
2204   // issuing cpuid without binding to each proc.
2205   if (!KMP_AFFINITY_CAPABLE()) {
2206     // Hack to try and infer the machine topology using only the data
2207     // available from cpuid on the current thread, and __kmp_xproc.
2208     KMP_ASSERT(__kmp_affinity.type == affinity_none);
2209 
2210     // Get an upper bound on the number of threads per package using cpuid(1).
2211     // On some OS/chip combinations where HT is supported by the chip but is
2212     // disabled, this value will be 2 on a single core chip. Usually, it will be
2213     // 2 if HT is enabled and 1 if HT is disabled.
2214     __kmp_x86_cpuid(1, 0, &buf);
2215     int maxThreadsPerPkg = (buf.ebx >> 16) & 0xff;
2216     if (maxThreadsPerPkg == 0) {
2217       maxThreadsPerPkg = 1;
2218     }
2219 
2220     // The num cores per pkg comes from cpuid(4). 1 must be added to the encoded
2221     // value.
2222     //
2223     // The author of cpu_count.cpp treated this as only an upper bound on the
2224     // number of cores, but I haven't seen any cases where it was greater than
2225     // the actual number of cores, so we will treat it as exact in this block of
2226     // code.
2227     //
2228     // First, we need to check if cpuid(4) is supported on this chip. To see if
2229     // cpuid(n) is supported, issue cpuid(0) and check if eax has the value n or
2230     // greater.
2231     __kmp_x86_cpuid(0, 0, &buf);
2232     if (buf.eax >= 4) {
2233       __kmp_x86_cpuid(4, 0, &buf);
2234       nCoresPerPkg = ((buf.eax >> 26) & 0x3f) + 1;
2235     } else {
2236       nCoresPerPkg = 1;
2237     }
2238 
2239     // There is no way to reliably tell if HT is enabled without issuing the
2240     // cpuid instruction from every thread and correlating the cpuid info, so
2241     // if the machine is not affinity capable, we assume that HT is off. We have
2242     // seen quite a few machines where maxThreadsPerPkg is 2, yet the machine
2243     // does not support HT.
2244     //
2245     // - Older OSes are usually found on machines with older chips, which do not
2246     //   support HT.
2247     // - The performance penalty for mistakenly identifying a machine as HT when
2248     //   it isn't (which results in blocktime being incorrectly set to 0) is
2249     //   greater than the penalty for mistakenly identifying a machine as
2250     //   being 1 thread/core when it is really HT enabled (which results in
2251     //   blocktime being incorrectly set to a positive value).
2252     __kmp_ncores = __kmp_xproc;
2253     nPackages = (__kmp_xproc + nCoresPerPkg - 1) / nCoresPerPkg;
2254     __kmp_nThreadsPerCore = 1;
2255     return true;
2256   }
2257 
2258   // From here on, we can assume that it is safe to call
2259   // __kmp_get_system_affinity() and __kmp_set_system_affinity(), even if
2260   // __kmp_affinity.type = affinity_none.
2261 
2262   // Save the affinity mask for the current thread.
2263   kmp_affinity_raii_t previous_affinity;
2264 
2265   // Run through each of the available contexts, binding the current thread
2266   // to it, and obtaining the pertinent information using the cpuid instr.
2267   //
2268   // The relevant information is:
2269   // - Apic Id: Bits 24:31 of ebx after issuing cpuid(1) - each thread context
2270   //     has a unique Apic Id, which is of the form pkg# : core# : thread#.
2271   // - Max Threads Per Pkg: Bits 16:23 of ebx after issuing cpuid(1). The value
2272   //     of this field determines the width of the core# + thread# fields in the
2273   //     Apic Id. It is also an upper bound on the number of threads per
2274   //     package, but it has been verified that situations happen where it is not
2275   //     exact. In particular, on certain OS/chip combinations where Intel(R)
2276   //     Hyper-Threading Technology is supported by the chip but has been
2277   //     disabled, the value of this field will be 2 (for a single core chip).
2278   //     On other OS/chip combinations supporting Intel(R) Hyper-Threading
2279   //     Technology, the value of this field will be 1 when Intel(R)
2280   //     Hyper-Threading Technology is disabled and 2 when it is enabled.
2281   // - Max Cores Per Pkg:  Bits 26:31 of eax after issuing cpuid(4). The value
2282   //     of this field (+1) determines the width of the core# field in the Apic
2283   //     Id. The comments in "cpucount.cpp" say that this value is an upper
2284   //     bound, but the IA-32 architecture manual says that it is exactly the
2285   //     number of cores per package, and I haven't seen any case where it
2286   //     wasn't.
2287   //
2288   // From this information, deduce the package Id, core Id, and thread Id,
2289   // and set the corresponding fields in the apicThreadInfo struct.
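  // Worked example (illustrative, values assumed): with maxThreadsPerPkg = 8
  // (widthCT = 3) and maxCoresPerPkg = 4 (widthC = 2, so widthT = 1), an
  // Apic Id of 0b1101 decodes to pkgId = 1 (0b1101 >> 3), coreId = 2
  // ((0b1101 >> 1) & 0x3) and threadId = 1 (0b1101 & 0x1).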
2290   unsigned i;
2291   apicThreadInfo *threadInfo = (apicThreadInfo *)__kmp_allocate(
2292       __kmp_avail_proc * sizeof(apicThreadInfo));
2293   unsigned nApics = 0;
2294   KMP_CPU_SET_ITERATE(i, __kmp_affin_fullMask) {
2295     // Skip this proc if it is not included in the machine model.
2296     if (!KMP_CPU_ISSET(i, __kmp_affin_fullMask)) {
2297       continue;
2298     }
2299     KMP_DEBUG_ASSERT((int)nApics < __kmp_avail_proc);
2300 
2301     __kmp_affinity_dispatch->bind_thread(i);
2302     threadInfo[nApics].osId = i;
2303 
2304     // The apic id and max threads per pkg come from cpuid(1).
2305     __kmp_x86_cpuid(1, 0, &buf);
2306     if (((buf.edx >> 9) & 1) == 0) {
2307       __kmp_free(threadInfo);
2308       *msg_id = kmp_i18n_str_ApicNotPresent;
2309       return false;
2310     }
2311     threadInfo[nApics].apicId = (buf.ebx >> 24) & 0xff;
2312     threadInfo[nApics].maxThreadsPerPkg = (buf.ebx >> 16) & 0xff;
2313     if (threadInfo[nApics].maxThreadsPerPkg == 0) {
2314       threadInfo[nApics].maxThreadsPerPkg = 1;
2315     }
2316 
2317     // Max cores per pkg comes from cpuid(4). 1 must be added to the encoded
2318     // value.
2319     //
2320     // First, we need to check if cpuid(4) is supported on this chip. To see if
2321     // cpuid(n) is supported, issue cpuid(0) and check if eax has the value n
2322     // or greater.
2323     __kmp_x86_cpuid(0, 0, &buf);
2324     if (buf.eax >= 4) {
2325       __kmp_x86_cpuid(4, 0, &buf);
2326       threadInfo[nApics].maxCoresPerPkg = ((buf.eax >> 26) & 0x3f) + 1;
2327     } else {
2328       threadInfo[nApics].maxCoresPerPkg = 1;
2329     }
2330 
2331     // Infer the pkgId / coreId / threadId using only the info obtained locally.
2332     int widthCT = __kmp_cpuid_mask_width(threadInfo[nApics].maxThreadsPerPkg);
2333     threadInfo[nApics].pkgId = threadInfo[nApics].apicId >> widthCT;
2334 
2335     int widthC = __kmp_cpuid_mask_width(threadInfo[nApics].maxCoresPerPkg);
2336     int widthT = widthCT - widthC;
2337     if (widthT < 0) {
2338       // I've never seen this one happen, but I suppose it could, if the cpuid
2339       // instruction on a chip was really screwed up. Make sure to restore the
2340       // affinity mask before the tail call.
2341       __kmp_free(threadInfo);
2342       *msg_id = kmp_i18n_str_InvalidCpuidInfo;
2343       return false;
2344     }
2345 
2346     int maskC = (1 << widthC) - 1;
2347     threadInfo[nApics].coreId = (threadInfo[nApics].apicId >> widthT) & maskC;
2348 
2349     int maskT = (1 << widthT) - 1;
2350     threadInfo[nApics].threadId = threadInfo[nApics].apicId & maskT;
2351 
2352     nApics++;
2353   }
2354 
2355   // We've collected all the info we need.
2356   // Restore the old affinity mask for this thread.
2357   previous_affinity.restore();
2358 
2359   // Sort the threadInfo table by physical Id.
2360   qsort(threadInfo, nApics, sizeof(*threadInfo),
2361         __kmp_affinity_cmp_apicThreadInfo_phys_id);
2362 
2363   // The table is now sorted by pkgId / coreId / threadId, but we really don't
2364   // know the radix of any of the fields. pkgId's may be sparsely assigned among
2365   // the chips on a system. Although coreId's are usually assigned
2366   // [0 .. coresPerPkg-1] and threadId's are usually assigned
2367   // [0..threadsPerCore-1], we don't want to make any such assumptions.
2368   //
2369   // For that matter, we don't know what coresPerPkg and threadsPerCore (or the
2370   // total # packages) are at this point - we want to determine that now. We
2371   // only have an upper bound on the first two figures.
2372   //
2373   // We also perform a consistency check at this point: the values returned by
2374   // the cpuid instruction for any thread bound to a given package had better
2375   // return the same info for maxThreadsPerPkg and maxCoresPerPkg.
2376   nPackages = 1;
2377   nCoresPerPkg = 1;
2378   __kmp_nThreadsPerCore = 1;
2379   unsigned nCores = 1;
2380 
2381   unsigned pkgCt = 1; // to determine radii
2382   unsigned lastPkgId = threadInfo[0].pkgId;
2383   unsigned coreCt = 1;
2384   unsigned lastCoreId = threadInfo[0].coreId;
2385   unsigned threadCt = 1;
2386   unsigned lastThreadId = threadInfo[0].threadId;
2387 
2388   // intra-pkg consistency checks
2389   unsigned prevMaxCoresPerPkg = threadInfo[0].maxCoresPerPkg;
2390   unsigned prevMaxThreadsPerPkg = threadInfo[0].maxThreadsPerPkg;
2391 
2392   for (i = 1; i < nApics; i++) {
2393     if (threadInfo[i].pkgId != lastPkgId) {
2394       nCores++;
2395       pkgCt++;
2396       lastPkgId = threadInfo[i].pkgId;
2397       if ((int)coreCt > nCoresPerPkg)
2398         nCoresPerPkg = coreCt;
2399       coreCt = 1;
2400       lastCoreId = threadInfo[i].coreId;
2401       if ((int)threadCt > __kmp_nThreadsPerCore)
2402         __kmp_nThreadsPerCore = threadCt;
2403       threadCt = 1;
2404       lastThreadId = threadInfo[i].threadId;
2405 
2406       // This is a different package, so go on to the next iteration without
2407       // doing any consistency checks. Reset the consistency check vars, though.
2408       prevMaxCoresPerPkg = threadInfo[i].maxCoresPerPkg;
2409       prevMaxThreadsPerPkg = threadInfo[i].maxThreadsPerPkg;
2410       continue;
2411     }
2412 
2413     if (threadInfo[i].coreId != lastCoreId) {
2414       nCores++;
2415       coreCt++;
2416       lastCoreId = threadInfo[i].coreId;
2417       if ((int)threadCt > __kmp_nThreadsPerCore)
2418         __kmp_nThreadsPerCore = threadCt;
2419       threadCt = 1;
2420       lastThreadId = threadInfo[i].threadId;
2421     } else if (threadInfo[i].threadId != lastThreadId) {
2422       threadCt++;
2423       lastThreadId = threadInfo[i].threadId;
2424     } else {
2425       __kmp_free(threadInfo);
2426       *msg_id = kmp_i18n_str_LegacyApicIDsNotUnique;
2427       return false;
2428     }
2429 
2430     // Check to make certain that the maxCoresPerPkg and maxThreadsPerPkg
2431   // fields agree between all the threads bound to a given package.
2432     if ((prevMaxCoresPerPkg != threadInfo[i].maxCoresPerPkg) ||
2433         (prevMaxThreadsPerPkg != threadInfo[i].maxThreadsPerPkg)) {
2434       __kmp_free(threadInfo);
2435       *msg_id = kmp_i18n_str_InconsistentCpuidInfo;
2436       return false;
2437     }
2438   }
2439   // When affinity is off, this routine will still be called to set
2440   // __kmp_ncores, as well as __kmp_nThreadsPerCore, nCoresPerPkg, & nPackages.
2441   // Make sure all these vars are set correctly
2442   nPackages = pkgCt;
2443   if ((int)coreCt > nCoresPerPkg)
2444     nCoresPerPkg = coreCt;
2445   if ((int)threadCt > __kmp_nThreadsPerCore)
2446     __kmp_nThreadsPerCore = threadCt;
2447   __kmp_ncores = nCores;
2448   KMP_DEBUG_ASSERT(nApics == (unsigned)__kmp_avail_proc);
2449 
2450   // Now that we've determined the number of packages, the number of cores per
2451   // package, and the number of threads per core, we can construct the data
2452   // structure that is to be returned.
2453   int idx = 0;
2454   int pkgLevel = 0;
2455   int coreLevel = 1;
2456   int threadLevel = 2;
2457   //(__kmp_nThreadsPerCore <= 1) ? -1 : ((coreLevel >= 0) ? 2 : 1);
2458   int depth = (pkgLevel >= 0) + (coreLevel >= 0) + (threadLevel >= 0);
2459   kmp_hw_t types[3];
2460   if (pkgLevel >= 0)
2461     types[idx++] = KMP_HW_SOCKET;
2462   if (coreLevel >= 0)
2463     types[idx++] = KMP_HW_CORE;
2464   if (threadLevel >= 0)
2465     types[idx++] = KMP_HW_THREAD;
2466 
2467   KMP_ASSERT(depth > 0);
2468   __kmp_topology = kmp_topology_t::allocate(nApics, depth, types);
2469 
2470   for (i = 0; i < nApics; ++i) {
2471     idx = 0;
2472     unsigned os = threadInfo[i].osId;
2473     kmp_hw_thread_t &hw_thread = __kmp_topology->at(i);
2474     hw_thread.clear();
2475 
2476     if (pkgLevel >= 0) {
2477       hw_thread.ids[idx++] = threadInfo[i].pkgId;
2478     }
2479     if (coreLevel >= 0) {
2480       hw_thread.ids[idx++] = threadInfo[i].coreId;
2481     }
2482     if (threadLevel >= 0) {
2483       hw_thread.ids[idx++] = threadInfo[i].threadId;
2484     }
2485     hw_thread.os_id = os;
2486   }
2487 
2488   __kmp_free(threadInfo);
2489   __kmp_topology->sort_ids();
2490   if (!__kmp_topology->check_ids()) {
2491     kmp_topology_t::deallocate(__kmp_topology);
2492     __kmp_topology = nullptr;
2493     *msg_id = kmp_i18n_str_LegacyApicIDsNotUnique;
2494     return false;
2495   }
2496   return true;
2497 }
2498 
2499 // Hybrid cpu detection using CPUID.1A
2500 // Thread should be pinned to processor already
__kmp_get_hybrid_info(kmp_hw_core_type_t * type,int * efficiency,unsigned * native_model_id)2501 static void __kmp_get_hybrid_info(kmp_hw_core_type_t *type, int *efficiency,
2502                                   unsigned *native_model_id) {
2503   kmp_cpuid buf;
2504   __kmp_x86_cpuid(0x1a, 0, &buf);
2505   *type = (kmp_hw_core_type_t)__kmp_extract_bits<24, 31>(buf.eax);
2506   switch (*type) {
2507   case KMP_HW_CORE_TYPE_ATOM:
2508     *efficiency = 0;
2509     break;
2510   case KMP_HW_CORE_TYPE_CORE:
2511     *efficiency = 1;
2512     break;
2513   default:
2514     *efficiency = 0;
2515   }
2516   *native_model_id = __kmp_extract_bits<0, 23>(buf.eax);
2517 }
2518 
2519 // Intel(R) microarchitecture code name Nehalem, Dunnington and later
2520 // architectures support a newer interface for specifying the x2APIC Ids,
2521 // based on CPUID.B or CPUID.1F
2522 /*
2523  * CPUID.B or 1F, Input ECX (sub leaf # aka level number)
2524     Bits            Bits            Bits           Bits
2525     31-16           15-8            7-5            4-0
2526 ---+-----------+--------------+-------------+-----------------+
2527 EAX| reserved  |   reserved   |   reserved  |  Bits to Shift  |
2528 ---+-----------|--------------+-------------+-----------------|
2529 EBX| reserved  | Num logical processors at level (16 bits)    |
2530 ---+-----------|--------------+-------------------------------|
2531 ECX| reserved  |   Level Type |      Level Number (8 bits)    |
2532 ---+-----------+--------------+-------------------------------|
2533 EDX|                    X2APIC ID (32 bits)                   |
2534 ---+----------------------------------------------------------+
2535 */
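// Illustrative decode (values assumed, not from the original source): on a
// 2-way SMT part, sub-leaf 0 of CPUID.B typically reports level type SMT with
// "bits to shift" = 1, so (x2APIC id & 0x1) is the thread id within a core;
// sub-leaf 1 reports level type CORE with a shift that strips both the thread
// and core bits, leaving the package id.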
2536 
2537 enum {
2538   INTEL_LEVEL_TYPE_INVALID = 0, // Package level
2539   INTEL_LEVEL_TYPE_SMT = 1,
2540   INTEL_LEVEL_TYPE_CORE = 2,
2541   INTEL_LEVEL_TYPE_MODULE = 3,
2542   INTEL_LEVEL_TYPE_TILE = 4,
2543   INTEL_LEVEL_TYPE_DIE = 5,
2544   INTEL_LEVEL_TYPE_LAST = 6,
2545 };
2546 
2547 struct cpuid_level_info_t {
2548   unsigned level_type, mask, mask_width, nitems, cache_mask;
2549 };
2550 
__kmp_intel_type_2_topology_type(int intel_type)2551 static kmp_hw_t __kmp_intel_type_2_topology_type(int intel_type) {
2552   switch (intel_type) {
2553   case INTEL_LEVEL_TYPE_INVALID:
2554     return KMP_HW_SOCKET;
2555   case INTEL_LEVEL_TYPE_SMT:
2556     return KMP_HW_THREAD;
2557   case INTEL_LEVEL_TYPE_CORE:
2558     return KMP_HW_CORE;
2559   case INTEL_LEVEL_TYPE_TILE:
2560     return KMP_HW_TILE;
2561   case INTEL_LEVEL_TYPE_MODULE:
2562     return KMP_HW_MODULE;
2563   case INTEL_LEVEL_TYPE_DIE:
2564     return KMP_HW_DIE;
2565   }
2566   return KMP_HW_UNKNOWN;
2567 }
2568 
2569 // This function takes the topology leaf, a levels array to store the levels
2570 // detected and a bitmap of the known levels.
2571 // Returns the number of levels in the topology
2572 static unsigned
__kmp_x2apicid_get_levels(int leaf,cpuid_level_info_t levels[INTEL_LEVEL_TYPE_LAST],kmp_uint64 known_levels)2573 __kmp_x2apicid_get_levels(int leaf,
2574                           cpuid_level_info_t levels[INTEL_LEVEL_TYPE_LAST],
2575                           kmp_uint64 known_levels) {
2576   unsigned level, levels_index;
2577   unsigned level_type, mask_width, nitems;
2578   kmp_cpuid buf;
2579 
2580   // The new algorithm has each known topology layer absorb the characteristics
2581   // of the unknown topology layers directly above it, when such layers exist.
2582   // e.g., Suppose layers were SMT <X> CORE <Y> <Z> PACKAGE, where <X> <Y> <Z>
2583   // are unknown topology layers, Then SMT will take the characteristics of
2584   // (SMT x <X>) and CORE will take the characteristics of (CORE x <Y> x <Z>).
2585   // This eliminates unknown portions of the topology while still keeping the
2586   // correct structure.
2587   level = levels_index = 0;
2588   do {
2589     __kmp_x86_cpuid(leaf, level, &buf);
2590     level_type = __kmp_extract_bits<8, 15>(buf.ecx);
2591     mask_width = __kmp_extract_bits<0, 4>(buf.eax);
2592     nitems = __kmp_extract_bits<0, 15>(buf.ebx);
2593     if (level_type != INTEL_LEVEL_TYPE_INVALID && nitems == 0)
2594       return 0;
2595 
2596     if (known_levels & (1ull << level_type)) {
2597       // Add a new level to the topology
2598       KMP_ASSERT(levels_index < INTEL_LEVEL_TYPE_LAST);
2599       levels[levels_index].level_type = level_type;
2600       levels[levels_index].mask_width = mask_width;
2601       levels[levels_index].nitems = nitems;
2602       levels_index++;
2603     } else {
2604       // If it is an unknown level, then logically move the previous layer up
2605       if (levels_index > 0) {
2606         levels[levels_index - 1].mask_width = mask_width;
2607         levels[levels_index - 1].nitems = nitems;
2608       }
2609     }
2610     level++;
2611   } while (level_type != INTEL_LEVEL_TYPE_INVALID);
2612 
2613   // Ensure the INTEL_LEVEL_TYPE_INVALID (Socket) layer isn't first
2614   if (levels_index == 0 || levels[0].level_type == INTEL_LEVEL_TYPE_INVALID)
2615     return 0;
2616 
2617   // Set the masks to & with apicid
2618   for (unsigned i = 0; i < levels_index; ++i) {
2619     if (levels[i].level_type != INTEL_LEVEL_TYPE_INVALID) {
2620       levels[i].mask = ~((-1) << levels[i].mask_width);
2621       levels[i].cache_mask = (-1) << levels[i].mask_width;
2622       for (unsigned j = 0; j < i; ++j)
2623         levels[i].mask ^= levels[j].mask;
2624     } else {
2625       KMP_DEBUG_ASSERT(i > 0);
2626       levels[i].mask = (-1) << levels[i - 1].mask_width;
2627       levels[i].cache_mask = 0;
2628     }
2629   }
2630   return levels_index;
2631 }
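// Worked example (illustrative, widths assumed): with an SMT level of
// mask_width 1 and a CORE level of mask_width 4, the SMT mask becomes 0x1,
// the CORE mask becomes 0xE (0xF with the SMT bit removed), and the package
// (INTEL_LEVEL_TYPE_INVALID) mask becomes ~0xF; the cache_mask values are
// 0xFFFFFFFE and 0xFFFFFFF0 for SMT and CORE, and 0 for the package.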
2632 
__kmp_affinity_create_x2apicid_map(kmp_i18n_id_t * const msg_id)2633 static bool __kmp_affinity_create_x2apicid_map(kmp_i18n_id_t *const msg_id) {
2634 
2635   cpuid_level_info_t levels[INTEL_LEVEL_TYPE_LAST];
2636   kmp_hw_t types[INTEL_LEVEL_TYPE_LAST];
2637   unsigned levels_index;
2638   kmp_cpuid buf;
2639   kmp_uint64 known_levels;
2640   int topology_leaf, highest_leaf, apic_id;
2641   int num_leaves;
2642   static int leaves[] = {0, 0};
2643 
2644   kmp_i18n_id_t leaf_message_id;
2645 
2646   KMP_BUILD_ASSERT(sizeof(known_levels) * CHAR_BIT > KMP_HW_LAST);
2647 
2648   *msg_id = kmp_i18n_null;
2649   if (__kmp_affinity.flags.verbose) {
2650     KMP_INFORM(AffInfoStr, "KMP_AFFINITY", KMP_I18N_STR(Decodingx2APIC));
2651   }
2652 
2653   // Figure out the known topology levels
2654   known_levels = 0ull;
2655   for (int i = 0; i < INTEL_LEVEL_TYPE_LAST; ++i) {
2656     if (__kmp_intel_type_2_topology_type(i) != KMP_HW_UNKNOWN) {
2657       known_levels |= (1ull << i);
2658     }
2659   }
2660 
2661   // Get the highest cpuid leaf supported
2662   __kmp_x86_cpuid(0, 0, &buf);
2663   highest_leaf = buf.eax;
2664 
2665   // If a specific topology method was requested, only allow that specific leaf;
2666   // otherwise, try both leaves 31 and 11 in that order.
2667   num_leaves = 0;
2668   if (__kmp_affinity_top_method == affinity_top_method_x2apicid) {
2669     num_leaves = 1;
2670     leaves[0] = 11;
2671     leaf_message_id = kmp_i18n_str_NoLeaf11Support;
2672   } else if (__kmp_affinity_top_method == affinity_top_method_x2apicid_1f) {
2673     num_leaves = 1;
2674     leaves[0] = 31;
2675     leaf_message_id = kmp_i18n_str_NoLeaf31Support;
2676   } else {
2677     num_leaves = 2;
2678     leaves[0] = 31;
2679     leaves[1] = 11;
2680     leaf_message_id = kmp_i18n_str_NoLeaf11Support;
2681   }
2682 
2683   // Check to see if cpuid leaf 31 or 11 is supported.
2684   __kmp_nThreadsPerCore = nCoresPerPkg = nPackages = 1;
2685   topology_leaf = -1;
2686   for (int i = 0; i < num_leaves; ++i) {
2687     int leaf = leaves[i];
2688     if (highest_leaf < leaf)
2689       continue;
2690     __kmp_x86_cpuid(leaf, 0, &buf);
2691     if (buf.ebx == 0)
2692       continue;
2693     topology_leaf = leaf;
2694     levels_index = __kmp_x2apicid_get_levels(leaf, levels, known_levels);
2695     if (levels_index == 0)
2696       continue;
2697     break;
2698   }
2699   if (topology_leaf == -1 || levels_index == 0) {
2700     *msg_id = leaf_message_id;
2701     return false;
2702   }
2703   KMP_ASSERT(levels_index <= INTEL_LEVEL_TYPE_LAST);
2704 
2705   // The algorithm used starts by setting the affinity to each available thread
2706   // and retrieving info from the cpuid instruction, so if we are not capable of
2707   // calling __kmp_get_system_affinity() and __kmp_set_system_affinity(), then
2708   // we need to do something else - use the defaults that we calculated from
2709   // issuing cpuid without binding to each proc.
2710   if (!KMP_AFFINITY_CAPABLE()) {
2711     // Hack to try and infer the machine topology using only the data
2712     // available from cpuid on the current thread, and __kmp_xproc.
2713     KMP_ASSERT(__kmp_affinity.type == affinity_none);
2714     for (unsigned i = 0; i < levels_index; ++i) {
2715       if (levels[i].level_type == INTEL_LEVEL_TYPE_SMT) {
2716         __kmp_nThreadsPerCore = levels[i].nitems;
2717       } else if (levels[i].level_type == INTEL_LEVEL_TYPE_CORE) {
2718         nCoresPerPkg = levels[i].nitems;
2719       }
2720     }
2721     __kmp_ncores = __kmp_xproc / __kmp_nThreadsPerCore;
2722     nPackages = (__kmp_xproc + nCoresPerPkg - 1) / nCoresPerPkg;
2723     return true;
2724   }
2725 
2726   // Allocate the data structure to be returned.
2727   int depth = levels_index;
2728   for (int i = depth - 1, j = 0; i >= 0; --i, ++j)
2729     types[j] = __kmp_intel_type_2_topology_type(levels[i].level_type);
2730   __kmp_topology =
2731       kmp_topology_t::allocate(__kmp_avail_proc, levels_index, types);
2732 
2733   // Insert equivalent cache types if they exist
2734   kmp_cache_info_t cache_info;
2735   for (size_t i = 0; i < cache_info.get_depth(); ++i) {
2736     const kmp_cache_info_t::info_t &info = cache_info[i];
2737     unsigned cache_mask = info.mask;
2738     unsigned cache_level = info.level;
2739     for (unsigned j = 0; j < levels_index; ++j) {
2740       unsigned hw_cache_mask = levels[j].cache_mask;
2741       kmp_hw_t cache_type = kmp_cache_info_t::get_topology_type(cache_level);
2742       if (hw_cache_mask == cache_mask && j < levels_index - 1) {
2743         kmp_hw_t type =
2744             __kmp_intel_type_2_topology_type(levels[j + 1].level_type);
2745         __kmp_topology->set_equivalent_type(cache_type, type);
2746       }
2747     }
2748   }
2749 
2750   // From here on, we can assume that it is safe to call
2751   // __kmp_get_system_affinity() and __kmp_set_system_affinity(), even if
2752   // __kmp_affinity.type = affinity_none.
2753 
2754   // Save the affinity mask for the current thread.
2755   kmp_affinity_raii_t previous_affinity;
2756 
2757   // Run through each of the available contexts, binding the current thread
2758   // to it, and obtaining the pertinent information using the cpuid instr.
2759   unsigned int proc;
2760   int hw_thread_index = 0;
2761   KMP_CPU_SET_ITERATE(proc, __kmp_affin_fullMask) {
2762     cpuid_level_info_t my_levels[INTEL_LEVEL_TYPE_LAST];
2763     unsigned my_levels_index;
2764 
2765     // Skip this proc if it is not included in the machine model.
2766     if (!KMP_CPU_ISSET(proc, __kmp_affin_fullMask)) {
2767       continue;
2768     }
2769     KMP_DEBUG_ASSERT(hw_thread_index < __kmp_avail_proc);
2770 
2771     __kmp_affinity_dispatch->bind_thread(proc);
2772 
2773     // New algorithm
2774     __kmp_x86_cpuid(topology_leaf, 0, &buf);
2775     apic_id = buf.edx;
2776     kmp_hw_thread_t &hw_thread = __kmp_topology->at(hw_thread_index);
2777     my_levels_index =
2778         __kmp_x2apicid_get_levels(topology_leaf, my_levels, known_levels);
2779     if (my_levels_index == 0 || my_levels_index != levels_index) {
2780       *msg_id = kmp_i18n_str_InvalidCpuidInfo;
2781       return false;
2782     }
2783     hw_thread.clear();
2784     hw_thread.os_id = proc;
2785     // Put in topology information
2786     for (unsigned j = 0, idx = depth - 1; j < my_levels_index; ++j, --idx) {
2787       hw_thread.ids[idx] = apic_id & my_levels[j].mask;
2788       if (j > 0) {
2789         hw_thread.ids[idx] >>= my_levels[j - 1].mask_width;
2790       }
2791     }
2792     // Hybrid information
2793     if (__kmp_is_hybrid_cpu() && highest_leaf >= 0x1a) {
2794       kmp_hw_core_type_t type;
2795       unsigned native_model_id;
2796       int efficiency;
2797       __kmp_get_hybrid_info(&type, &efficiency, &native_model_id);
2798       hw_thread.attrs.set_core_type(type);
2799       hw_thread.attrs.set_core_eff(efficiency);
2800     }
2801     hw_thread_index++;
2802   }
2803   KMP_ASSERT(hw_thread_index > 0);
2804   __kmp_topology->sort_ids();
2805   if (!__kmp_topology->check_ids()) {
2806     kmp_topology_t::deallocate(__kmp_topology);
2807     __kmp_topology = nullptr;
2808     *msg_id = kmp_i18n_str_x2ApicIDsNotUnique;
2809     return false;
2810   }
2811   return true;
2812 }
2813 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
2814 
2815 #define osIdIndex 0
2816 #define threadIdIndex 1
2817 #define coreIdIndex 2
2818 #define pkgIdIndex 3
2819 #define nodeIdIndex 4
2820 
2821 typedef unsigned *ProcCpuInfo;
2822 static unsigned maxIndex = pkgIdIndex;
2823 
__kmp_affinity_cmp_ProcCpuInfo_phys_id(const void * a,const void * b)2824 static int __kmp_affinity_cmp_ProcCpuInfo_phys_id(const void *a,
2825                                                   const void *b) {
2826   unsigned i;
2827   const unsigned *aa = *(unsigned *const *)a;
2828   const unsigned *bb = *(unsigned *const *)b;
2829   for (i = maxIndex;; i--) {
2830     if (aa[i] < bb[i])
2831       return -1;
2832     if (aa[i] > bb[i])
2833       return 1;
2834     if (i == osIdIndex)
2835       break;
2836   }
2837   return 0;
2838 }
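// Illustrative example (hypothetical rows): qsort'ing the threadInfo table
// with this comparator orders records by the most significant field first,
// i.e. node/package id, then core id, then thread id, and finally OS id.
// With maxIndex == pkgIdIndex, the rows
//
//   { osId=5, threadId=1, coreId=0, pkgId=1 }
//   { osId=2, threadId=0, coreId=3, pkgId=0 }
//
// sort with the pkgId=0 row first, even though its osId is not the smallest.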
2839 
2840 #if KMP_USE_HIER_SCHED
2841 // Set the array sizes for the hierarchy layers
__kmp_dispatch_set_hierarchy_values()2842 static void __kmp_dispatch_set_hierarchy_values() {
2843   // Set the maximum number of L1's to the number of cores.
2844   // Set the maximum number of L2's to either the number of cores / 2 for the
2845   // Intel(R) Xeon Phi(TM) coprocessor formerly codenamed Knights Landing,
2846   // or the number of cores for Intel(R) Xeon(R) processors.
2847   // Set the maximum number of NUMA nodes and L3's to the number of packages.
2848   __kmp_hier_max_units[kmp_hier_layer_e::LAYER_THREAD + 1] =
2849       nPackages * nCoresPerPkg * __kmp_nThreadsPerCore;
2850   __kmp_hier_max_units[kmp_hier_layer_e::LAYER_L1 + 1] = __kmp_ncores;
2851 #if KMP_ARCH_X86_64 &&                                                         \
2852     (KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_NETBSD || KMP_OS_DRAGONFLY ||    \
2853      KMP_OS_WINDOWS) &&                                                        \
2854     KMP_MIC_SUPPORTED
2855   if (__kmp_mic_type >= mic3)
2856     __kmp_hier_max_units[kmp_hier_layer_e::LAYER_L2 + 1] = __kmp_ncores / 2;
2857   else
2858 #endif // KMP_ARCH_X86_64 && (KMP_OS_LINUX || ...) && KMP_MIC_SUPPORTED
2859     __kmp_hier_max_units[kmp_hier_layer_e::LAYER_L2 + 1] = __kmp_ncores;
2860   __kmp_hier_max_units[kmp_hier_layer_e::LAYER_L3 + 1] = nPackages;
2861   __kmp_hier_max_units[kmp_hier_layer_e::LAYER_NUMA + 1] = nPackages;
2862   __kmp_hier_max_units[kmp_hier_layer_e::LAYER_LOOP + 1] = 1;
2863   // Set the number of threads per unit
2864   // Number of hardware threads per L1/L2/L3/NUMA/LOOP
2865   __kmp_hier_threads_per[kmp_hier_layer_e::LAYER_THREAD + 1] = 1;
2866   __kmp_hier_threads_per[kmp_hier_layer_e::LAYER_L1 + 1] =
2867       __kmp_nThreadsPerCore;
2868 #if KMP_ARCH_X86_64 &&                                                         \
2869     (KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_NETBSD || KMP_OS_DRAGONFLY ||    \
2870      KMP_OS_WINDOWS) &&                                                        \
2871     KMP_MIC_SUPPORTED
2872   if (__kmp_mic_type >= mic3)
2873     __kmp_hier_threads_per[kmp_hier_layer_e::LAYER_L2 + 1] =
2874         2 * __kmp_nThreadsPerCore;
2875   else
2876 #endif // KMP_ARCH_X86_64 && (KMP_OS_LINUX || ...) && KMP_MIC_SUPPORTED
2877     __kmp_hier_threads_per[kmp_hier_layer_e::LAYER_L2 + 1] =
2878         __kmp_nThreadsPerCore;
2879   __kmp_hier_threads_per[kmp_hier_layer_e::LAYER_L3 + 1] =
2880       nCoresPerPkg * __kmp_nThreadsPerCore;
2881   __kmp_hier_threads_per[kmp_hier_layer_e::LAYER_NUMA + 1] =
2882       nCoresPerPkg * __kmp_nThreadsPerCore;
2883   __kmp_hier_threads_per[kmp_hier_layer_e::LAYER_LOOP + 1] =
2884       nPackages * nCoresPerPkg * __kmp_nThreadsPerCore;
2885 }
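// Illustrative example (hypothetical non-MIC machine): with nPackages = 2,
// nCoresPerPkg = 8 and __kmp_nThreadsPerCore = 2 (so __kmp_ncores = 16), the
// tables set above become
//
//   __kmp_hier_max_units:   THREAD=32  L1=16  L2=16  L3=2  NUMA=2  LOOP=1
//   __kmp_hier_threads_per: THREAD=1   L1=2   L2=2   L3=16 NUMA=16 LOOP=32
//
// i.e. each L1/L2 covers one core's worth of hardware threads, and each
// L3/NUMA unit covers a whole package.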
2886 
2887 // Return the index into the hierarchy for this tid and layer type (L1, L2, etc)
2888 // i.e., this thread's L1 or this thread's L2, etc.
__kmp_dispatch_get_index(int tid,kmp_hier_layer_e type)2889 int __kmp_dispatch_get_index(int tid, kmp_hier_layer_e type) {
2890   int index = type + 1;
2891   int num_hw_threads = __kmp_hier_max_units[kmp_hier_layer_e::LAYER_THREAD + 1];
2892   KMP_DEBUG_ASSERT(type != kmp_hier_layer_e::LAYER_LAST);
2893   if (type == kmp_hier_layer_e::LAYER_THREAD)
2894     return tid;
2895   else if (type == kmp_hier_layer_e::LAYER_LOOP)
2896     return 0;
2897   KMP_DEBUG_ASSERT(__kmp_hier_max_units[index] != 0);
2898   if (tid >= num_hw_threads)
2899     tid = tid % num_hw_threads;
2900   return (tid / __kmp_hier_threads_per[index]) % __kmp_hier_max_units[index];
2901 }
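// Illustrative example, continuing the hypothetical 2 x 8 x 2 machine above:
// for tid = 13, __kmp_dispatch_get_index(13, kmp_hier_layer_e::LAYER_L1)
// returns (13 / 2) % 16 = 6 (its core/L1), while LAYER_L3 returns
// (13 / 16) % 2 = 0 (its package/L3).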
2902 
2903 // Return the number of t1's per t2
__kmp_dispatch_get_t1_per_t2(kmp_hier_layer_e t1,kmp_hier_layer_e t2)2904 int __kmp_dispatch_get_t1_per_t2(kmp_hier_layer_e t1, kmp_hier_layer_e t2) {
2905   int i1 = t1 + 1;
2906   int i2 = t2 + 1;
2907   KMP_DEBUG_ASSERT(i1 <= i2);
2908   KMP_DEBUG_ASSERT(t1 != kmp_hier_layer_e::LAYER_LAST);
2909   KMP_DEBUG_ASSERT(t2 != kmp_hier_layer_e::LAYER_LAST);
2910   KMP_DEBUG_ASSERT(__kmp_hier_threads_per[i1] != 0);
2911   // (nthreads/t2) / (nthreads/t1) = t1 / t2
2912   return __kmp_hier_threads_per[i2] / __kmp_hier_threads_per[i1];
2913 }
2914 #endif // KMP_USE_HIER_SCHED
2915 
__kmp_cpuinfo_get_filename()2916 static inline const char *__kmp_cpuinfo_get_filename() {
2917   const char *filename;
2918   if (__kmp_cpuinfo_file != nullptr)
2919     filename = __kmp_cpuinfo_file;
2920   else
2921     filename = "/proc/cpuinfo";
2922   return filename;
2923 }
2924 
__kmp_cpuinfo_get_envvar()2925 static inline const char *__kmp_cpuinfo_get_envvar() {
2926   const char *envvar = nullptr;
2927   if (__kmp_cpuinfo_file != nullptr)
2928     envvar = "KMP_CPUINFO_FILE";
2929   return envvar;
2930 }
2931 
2932 // Parse /proc/cpuinfo (or an alternate file in the same format) to obtain the
2933 // affinity map. On AIX, the map is obtained through system SRAD (Scheduler
2934 // Resource Allocation Domain).
__kmp_affinity_create_cpuinfo_map(int * line,kmp_i18n_id_t * const msg_id)2935 static bool __kmp_affinity_create_cpuinfo_map(int *line,
2936                                               kmp_i18n_id_t *const msg_id) {
2937   *msg_id = kmp_i18n_null;
2938 
2939 #if KMP_OS_AIX
2940   unsigned num_records = __kmp_xproc;
2941 #else
2942   const char *filename = __kmp_cpuinfo_get_filename();
2943   const char *envvar = __kmp_cpuinfo_get_envvar();
2944 
2945   if (__kmp_affinity.flags.verbose) {
2946     KMP_INFORM(AffParseFilename, "KMP_AFFINITY", filename);
2947   }
2948 
2949   kmp_safe_raii_file_t f(filename, "r", envvar);
2950 
2951   // First scan of the file: count the number of "processor" (osId) fields,
2952   // and find the highest value of <n> for any node_<n> field.
2953   char buf[256];
2954   unsigned num_records = 0;
2955   while (!feof(f)) {
2956     buf[sizeof(buf) - 1] = 1;
2957     if (!fgets(buf, sizeof(buf), f)) {
2958       // A read error here presumably means we hit EOF.
2959       break;
2960     }
2961 
2962     char s1[] = "processor";
2963     if (strncmp(buf, s1, sizeof(s1) - 1) == 0) {
2964       num_records++;
2965       continue;
2966     }
2967 
2968     // FIXME - this will match "node_<n> <garbage>"
2969     unsigned level;
2970     if (KMP_SSCANF(buf, "node_%u id", &level) == 1) {
2971       // validate the input first:
2972       if (level > (unsigned)__kmp_xproc) { // level is too big
2973         level = __kmp_xproc;
2974       }
2975       if (nodeIdIndex + level >= maxIndex) {
2976         maxIndex = nodeIdIndex + level;
2977       }
2978       continue;
2979     }
2980   }
2981 
2982   // Check for empty file / no valid processor records, or too many. The number
2983   // of records can't exceed the number of valid bits in the affinity mask.
2984   if (num_records == 0) {
2985     *msg_id = kmp_i18n_str_NoProcRecords;
2986     return false;
2987   }
2988   if (num_records > (unsigned)__kmp_xproc) {
2989     *msg_id = kmp_i18n_str_TooManyProcRecords;
2990     return false;
2991   }
2992 
2993   // Set the file pointer back to the beginning, so that we can scan the file
2994   // again, this time performing a full parse of the data. Allocate a vector of
2995   // ProcCpuInfo object, where we will place the data. Adding an extra element
2996   // at the end allows us to remove a lot of extra checks for termination
2997   // conditions.
2998   if (fseek(f, 0, SEEK_SET) != 0) {
2999     *msg_id = kmp_i18n_str_CantRewindCpuinfo;
3000     return false;
3001   }
3002 #endif // KMP_OS_AIX
3003 
3004   // Allocate the array of records to store the proc info in.  The dummy
3005   // element at the end makes the logic in filling them out easier to code.
3006   unsigned **threadInfo =
3007       (unsigned **)__kmp_allocate((num_records + 1) * sizeof(unsigned *));
3008   unsigned i;
3009   for (i = 0; i <= num_records; i++) {
3010     threadInfo[i] =
3011         (unsigned *)__kmp_allocate((maxIndex + 1) * sizeof(unsigned));
3012   }
3013 
3014 #define CLEANUP_THREAD_INFO                                                    \
3015   for (i = 0; i <= num_records; i++) {                                         \
3016     __kmp_free(threadInfo[i]);                                                 \
3017   }                                                                            \
3018   __kmp_free(threadInfo);
3019 
3020   // A value of UINT_MAX means that we didn't find the field
3021   unsigned __index;
3022 
3023 #define INIT_PROC_INFO(p)                                                      \
3024   for (__index = 0; __index <= maxIndex; __index++) {                          \
3025     (p)[__index] = UINT_MAX;                                                   \
3026   }
3027 
3028   for (i = 0; i <= num_records; i++) {
3029     INIT_PROC_INFO(threadInfo[i]);
3030   }
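  // Illustrative layout (hypothetical values): each threadInfo[i] row holds
  // one field per index, so after parsing a /proc/cpuinfo record such as
  //
  //   processor : 5, physical id : 1, core id : 2, thread id : 0
  //
  // the row reads threadInfo[i][osIdIndex] == 5, [threadIdIndex] == 0,
  // [coreIdIndex] == 2 and [pkgIdIndex] == 1; a "node_<n> id" field lands at
  // [nodeIdIndex + n], and fields never seen stay at UINT_MAX.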
3031 
3032 #if KMP_OS_AIX
3033   int smt_threads;
3034   lpar_info_format1_t cpuinfo;
3035   unsigned num_avail = __kmp_xproc;
3036 
3037   if (__kmp_affinity.flags.verbose)
3038     KMP_INFORM(AffParseFilename, "KMP_AFFINITY", "system info for topology");
3039 
3040   // Get the number of SMT threads per core.
3041   smt_threads = syssmt(GET_NUMBER_SMT_SETS, 0, 0, NULL);
3042 
3043   // Allocate a resource set containing the available system resources.
3044   rsethandle_t sys_rset = rs_alloc(RS_SYSTEM);
3045   if (sys_rset == NULL) {
3046     CLEANUP_THREAD_INFO;
3047     *msg_id = kmp_i18n_str_UnknownTopology;
3048     return false;
3049   }
3050   // Allocate a resource set for the SRAD info.
3051   rsethandle_t srad = rs_alloc(RS_EMPTY);
3052   if (srad == NULL) {
3053     rs_free(sys_rset);
3054     CLEANUP_THREAD_INFO;
3055     *msg_id = kmp_i18n_str_UnknownTopology;
3056     return false;
3057   }
3058 
3059   // Get the SRAD system detail level.
3060   int sradsdl = rs_getinfo(NULL, R_SRADSDL, 0);
3061   if (sradsdl < 0) {
3062     rs_free(sys_rset);
3063     rs_free(srad);
3064     CLEANUP_THREAD_INFO;
3065     *msg_id = kmp_i18n_str_UnknownTopology;
3066     return false;
3067   }
3068   // Get the number of RADs at that SRAD SDL.
3069   int num_rads = rs_numrads(sys_rset, sradsdl, 0);
3070   if (num_rads < 0) {
3071     rs_free(sys_rset);
3072     rs_free(srad);
3073     CLEANUP_THREAD_INFO;
3074     *msg_id = kmp_i18n_str_UnknownTopology;
3075     return false;
3076   }
3077 
3078   // Get the maximum number of procs that may be contained in a resource set.
3079   int max_procs = rs_getinfo(NULL, R_MAXPROCS, 0);
3080   if (max_procs < 0) {
3081     rs_free(sys_rset);
3082     rs_free(srad);
3083     CLEANUP_THREAD_INFO;
3084     *msg_id = kmp_i18n_str_UnknownTopology;
3085     return false;
3086   }
3087 
3088   int cur_rad = 0;
3089   int num_set = 0;
3090   for (int srad_idx = 0; cur_rad < num_rads && srad_idx < VMI_MAXRADS;
3091        ++srad_idx) {
3092     // Check if the SRAD is available in the RSET.
3093     if (rs_getrad(sys_rset, srad, sradsdl, srad_idx, 0) < 0)
3094       continue;
3095 
3096     for (int cpu = 0; cpu < max_procs; cpu++) {
3097       // Set the info for the cpu if it is in the SRAD.
3098       if (rs_op(RS_TESTRESOURCE, srad, NULL, R_PROCS, cpu)) {
3099         threadInfo[cpu][osIdIndex] = cpu;
3100         threadInfo[cpu][pkgIdIndex] = cur_rad;
3101         threadInfo[cpu][coreIdIndex] = cpu / smt_threads;
3102         ++num_set;
3103         if (num_set >= num_avail) {
3104           // Done if all available CPUs have been set.
3105           break;
3106         }
3107       }
3108     }
3109     ++cur_rad;
3110   }
3111   rs_free(sys_rset);
3112   rs_free(srad);
3113 
3114   // The topology is already sorted.
3115 
3116 #else // !KMP_OS_AIX
3117   unsigned num_avail = 0;
3118   *line = 0;
3119 #if KMP_ARCH_S390X
3120   bool reading_s390x_sys_info = true;
3121 #endif
3122   while (!feof(f)) {
3123     // Create an inner scoping level, so that all the goto targets at the end of
3124     // the loop appear in an outer scoping level. This avoids warnings about
3125     // jumping past an initialization to a target in the same block.
3126     {
3127       buf[sizeof(buf) - 1] = 1;
3128       bool long_line = false;
3129       if (!fgets(buf, sizeof(buf), f)) {
3130         // A read error here presumably means we hit EOF.
3131         // If there is valid data in threadInfo[num_avail], then fake
3132         // a blank line to ensure that the last entry gets parsed.
3133         bool valid = false;
3134         for (i = 0; i <= maxIndex; i++) {
3135           if (threadInfo[num_avail][i] != UINT_MAX) {
3136             valid = true;
3137           }
3138         }
3139         if (!valid) {
3140           break;
3141         }
3142         buf[0] = 0;
3143       } else if (!buf[sizeof(buf) - 1]) {
3144         // The line is longer than the buffer.  Set a flag, but don't
3145         // emit the error yet in case we were going to ignore the line anyway.
3146         long_line = true;
3147 
3148 #define CHECK_LINE                                                             \
3149   if (long_line) {                                                             \
3150     CLEANUP_THREAD_INFO;                                                       \
3151     *msg_id = kmp_i18n_str_LongLineCpuinfo;                                    \
3152     return false;                                                              \
3153   }
3154       }
3155       (*line)++;
3156 
3157 #if KMP_ARCH_LOONGARCH64
3158       // The parsing logic of /proc/cpuinfo in this function highly depends on
3159       // the blank lines between each processor info block. But on LoongArch a
3160       // blank line exists before the first processor info block (i.e. after the
3161       // "system type" line). This blank line was added because the "system
3162       // type" line is unrelated to any of the CPUs. We must skip this line so
3163       // that the original logic works on LoongArch.
3164       if (*buf == '\n' && *line == 2)
3165         continue;
3166 #endif
3167 #if KMP_ARCH_S390X
3168       // s390x /proc/cpuinfo starts with a variable number of lines containing
3169       // the overall system information. Skip them.
3170       if (reading_s390x_sys_info) {
3171         if (*buf == '\n')
3172           reading_s390x_sys_info = false;
3173         continue;
3174       }
3175 #endif
3176 
3177 #if KMP_ARCH_S390X
3178       char s1[] = "cpu number";
3179 #else
3180       char s1[] = "processor";
3181 #endif
3182       if (strncmp(buf, s1, sizeof(s1) - 1) == 0) {
3183         CHECK_LINE;
3184         char *p = strchr(buf + sizeof(s1) - 1, ':');
3185         unsigned val;
3186         if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1))
3187           goto no_val;
3188         if (threadInfo[num_avail][osIdIndex] != UINT_MAX)
3189 #if KMP_ARCH_AARCH64
3190           // Handle the old AArch64 /proc/cpuinfo layout differently:
3191           // it lists all of the 'processor' entries in a single
3192           // 'Processor' section, so the normal duplicate-field check
3193           // would always trigger; treat each entry as a new record.
3194           num_avail++;
3195 #else
3196           goto dup_field;
3197 #endif
3198         threadInfo[num_avail][osIdIndex] = val;
3199 #if KMP_OS_LINUX && !(KMP_ARCH_X86 || KMP_ARCH_X86_64)
3200         char path[256];
3201         KMP_SNPRINTF(
3202             path, sizeof(path),
3203             "/sys/devices/system/cpu/cpu%u/topology/physical_package_id",
3204             threadInfo[num_avail][osIdIndex]);
3205         __kmp_read_from_file(path, "%u", &threadInfo[num_avail][pkgIdIndex]);
3206 
3207 #if KMP_ARCH_S390X
3208         // Disambiguate physical_package_id.
3209         unsigned book_id;
3210         KMP_SNPRINTF(path, sizeof(path),
3211                      "/sys/devices/system/cpu/cpu%u/topology/book_id",
3212                      threadInfo[num_avail][osIdIndex]);
3213         __kmp_read_from_file(path, "%u", &book_id);
3214         threadInfo[num_avail][pkgIdIndex] |= (book_id << 8);
3215 
3216         unsigned drawer_id;
3217         KMP_SNPRINTF(path, sizeof(path),
3218                      "/sys/devices/system/cpu/cpu%u/topology/drawer_id",
3219                      threadInfo[num_avail][osIdIndex]);
3220         __kmp_read_from_file(path, "%u", &drawer_id);
3221         threadInfo[num_avail][pkgIdIndex] |= (drawer_id << 16);
3222 #endif
3223 
3224         KMP_SNPRINTF(path, sizeof(path),
3225                      "/sys/devices/system/cpu/cpu%u/topology/core_id",
3226                      threadInfo[num_avail][osIdIndex]);
3227         __kmp_read_from_file(path, "%u", &threadInfo[num_avail][coreIdIndex]);
3228         continue;
3229 #else
3230       }
3231       char s2[] = "physical id";
3232       if (strncmp(buf, s2, sizeof(s2) - 1) == 0) {
3233         CHECK_LINE;
3234         char *p = strchr(buf + sizeof(s2) - 1, ':');
3235         unsigned val;
3236         if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1))
3237           goto no_val;
3238         if (threadInfo[num_avail][pkgIdIndex] != UINT_MAX)
3239           goto dup_field;
3240         threadInfo[num_avail][pkgIdIndex] = val;
3241         continue;
3242       }
3243       char s3[] = "core id";
3244       if (strncmp(buf, s3, sizeof(s3) - 1) == 0) {
3245         CHECK_LINE;
3246         char *p = strchr(buf + sizeof(s3) - 1, ':');
3247         unsigned val;
3248         if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1))
3249           goto no_val;
3250         if (threadInfo[num_avail][coreIdIndex] != UINT_MAX)
3251           goto dup_field;
3252         threadInfo[num_avail][coreIdIndex] = val;
3253         continue;
3254 #endif // KMP_OS_LINUX && !(KMP_ARCH_X86 || KMP_ARCH_X86_64)
3255       }
3256       char s4[] = "thread id";
3257       if (strncmp(buf, s4, sizeof(s4) - 1) == 0) {
3258         CHECK_LINE;
3259         char *p = strchr(buf + sizeof(s4) - 1, ':');
3260         unsigned val;
3261         if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1))
3262           goto no_val;
3263         if (threadInfo[num_avail][threadIdIndex] != UINT_MAX)
3264           goto dup_field;
3265         threadInfo[num_avail][threadIdIndex] = val;
3266         continue;
3267       }
3268       unsigned level;
3269       if (KMP_SSCANF(buf, "node_%u id", &level) == 1) {
3270         CHECK_LINE;
3271         char *p = strchr(buf + sizeof(s4) - 1, ':');
3272         unsigned val;
3273         if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1))
3274           goto no_val;
3275         // validate the input before using level:
3276         if (level > (unsigned)__kmp_xproc) { // level is too big
3277           level = __kmp_xproc;
3278         }
3279         if (threadInfo[num_avail][nodeIdIndex + level] != UINT_MAX)
3280           goto dup_field;
3281         threadInfo[num_avail][nodeIdIndex + level] = val;
3282         continue;
3283       }
3284 
3285       // We didn't recognize the leading token on the line. There are lots of
3286       // leading tokens that we don't recognize - if the line isn't empty, go on
3287       // to the next line.
3288       if ((*buf != 0) && (*buf != '\n')) {
3289         // If the line is longer than the buffer, read characters
3290         // until we find a newline.
3291         if (long_line) {
3292           int ch;
3293           while (((ch = fgetc(f)) != EOF) && (ch != '\n'))
3294             ;
3295         }
3296         continue;
3297       }
3298 
3299       // A newline has signalled the end of the processor record.
3300       // Check that there aren't too many procs specified.
3301       if ((int)num_avail == __kmp_xproc) {
3302         CLEANUP_THREAD_INFO;
3303         *msg_id = kmp_i18n_str_TooManyEntries;
3304         return false;
3305       }
3306 
3307       // Check for missing fields.  The osId field must be there, and we
3308       // currently require that the physical id field is also specified.
3309       if (threadInfo[num_avail][osIdIndex] == UINT_MAX) {
3310         CLEANUP_THREAD_INFO;
3311         *msg_id = kmp_i18n_str_MissingProcField;
3312         return false;
3313       }
3314       if (threadInfo[0][pkgIdIndex] == UINT_MAX) {
3315         CLEANUP_THREAD_INFO;
3316         *msg_id = kmp_i18n_str_MissingPhysicalIDField;
3317         return false;
3318       }
3319 
3320       // Skip this proc if it is not included in the machine model.
3321       if (KMP_AFFINITY_CAPABLE() &&
3322           !KMP_CPU_ISSET(threadInfo[num_avail][osIdIndex],
3323                          __kmp_affin_fullMask)) {
3324         INIT_PROC_INFO(threadInfo[num_avail]);
3325         continue;
3326       }
3327 
3328       // We have a successful parse of this proc's info.
3329       // Increment the counter, and prepare for the next proc.
3330       num_avail++;
3331       KMP_ASSERT(num_avail <= num_records);
3332       INIT_PROC_INFO(threadInfo[num_avail]);
3333     }
3334     continue;
3335 
3336   no_val:
3337     CLEANUP_THREAD_INFO;
3338     *msg_id = kmp_i18n_str_MissingValCpuinfo;
3339     return false;
3340 
3341   dup_field:
3342     CLEANUP_THREAD_INFO;
3343     *msg_id = kmp_i18n_str_DuplicateFieldCpuinfo;
3344     return false;
3345   }
3346   *line = 0;
3347 
3348 #if KMP_MIC && REDUCE_TEAM_SIZE
3349   unsigned teamSize = 0;
3350 #endif // KMP_MIC && REDUCE_TEAM_SIZE
3351 
3352   // check for num_records == __kmp_xproc ???
3353 
3354   // If the runtime is configured to omit the package level when there is only
3355   // a single package, the logic at the end of this routine won't work if there
3356   // is only a single thread.
3357   KMP_ASSERT(num_avail > 0);
3358   KMP_ASSERT(num_avail <= num_records);
3359 
3360   // Sort the threadInfo table by physical Id.
3361   qsort(threadInfo, num_avail, sizeof(*threadInfo),
3362         __kmp_affinity_cmp_ProcCpuInfo_phys_id);
3363 
3364 #endif // KMP_OS_AIX
3365 
3366   // The table is now sorted by pkgId / coreId / threadId, but we really don't
3367   // know the radix of any of the fields. pkgId's may be sparsely assigned among
3368   // the chips on a system. Although coreId's are usually assigned
3369   // [0 .. coresPerPkg-1] and threadId's are usually assigned
3370   // [0..threadsPerCore-1], we don't want to make any such assumptions.
3371   //
3372   // For that matter, we don't know what coresPerPkg and threadsPerCore (or the
3373   // total # packages) are at this point - we want to determine that now. We
3374   // only have an upper bound on the first two figures.
3375   unsigned *counts =
3376       (unsigned *)__kmp_allocate((maxIndex + 1) * sizeof(unsigned));
3377   unsigned *maxCt =
3378       (unsigned *)__kmp_allocate((maxIndex + 1) * sizeof(unsigned));
3379   unsigned *totals =
3380       (unsigned *)__kmp_allocate((maxIndex + 1) * sizeof(unsigned));
3381   unsigned *lastId =
3382       (unsigned *)__kmp_allocate((maxIndex + 1) * sizeof(unsigned));
3383 
3384   bool assign_thread_ids = false;
3385   unsigned threadIdCt;
3386   unsigned index;
3387 
3388 restart_radix_check:
3389   threadIdCt = 0;
3390 
3391   // Initialize the counter arrays with data from threadInfo[0].
3392   if (assign_thread_ids) {
3393     if (threadInfo[0][threadIdIndex] == UINT_MAX) {
3394       threadInfo[0][threadIdIndex] = threadIdCt++;
3395     } else if (threadIdCt <= threadInfo[0][threadIdIndex]) {
3396       threadIdCt = threadInfo[0][threadIdIndex] + 1;
3397     }
3398   }
3399   for (index = 0; index <= maxIndex; index++) {
3400     counts[index] = 1;
3401     maxCt[index] = 1;
3402     totals[index] = 1;
3403     lastId[index] = threadInfo[0][index];
3405   }
3406 
3407   // Run through the rest of the OS procs.
3408   for (i = 1; i < num_avail; i++) {
3409     // Find the most significant index whose id differs from the id for the
3410     // previous OS proc.
3411     for (index = maxIndex; index >= threadIdIndex; index--) {
3412       if (assign_thread_ids && (index == threadIdIndex)) {
3413         // Auto-assign the thread id field if it wasn't specified.
3414         if (threadInfo[i][threadIdIndex] == UINT_MAX) {
3415           threadInfo[i][threadIdIndex] = threadIdCt++;
3416         }
3417         // Apparently the thread id field was specified for some entries and not
3418         // others. Start the thread id counter off at the next higher thread id.
3419         else if (threadIdCt <= threadInfo[i][threadIdIndex]) {
3420           threadIdCt = threadInfo[i][threadIdIndex] + 1;
3421         }
3422       }
3423       if (threadInfo[i][index] != lastId[index]) {
3424         // Run through all indices which are less significant, and reset the
3425         // counts to 1. At all levels up to and including index, we need to
3426         // increment the totals and record the last id.
3427         unsigned index2;
3428         for (index2 = threadIdIndex; index2 < index; index2++) {
3429           totals[index2]++;
3430           if (counts[index2] > maxCt[index2]) {
3431             maxCt[index2] = counts[index2];
3432           }
3433           counts[index2] = 1;
3434           lastId[index2] = threadInfo[i][index2];
3435         }
3436         counts[index]++;
3437         totals[index]++;
3438         lastId[index] = threadInfo[i][index];
3439 
3440         if (assign_thread_ids && (index > threadIdIndex)) {
3441 
3442 #if KMP_MIC && REDUCE_TEAM_SIZE
3443           // The default team size is the total #threads in the machine
3444           // minus 1 thread for every core that has 3 or more threads.
3445           teamSize += (threadIdCt <= 2) ? (threadIdCt) : (threadIdCt - 1);
3446 #endif // KMP_MIC && REDUCE_TEAM_SIZE
3447 
3448           // Restart the thread counter, as we are on a new core.
3449           threadIdCt = 0;
3450 
3451           // Auto-assign the thread id field if it wasn't specified.
3452           if (threadInfo[i][threadIdIndex] == UINT_MAX) {
3453             threadInfo[i][threadIdIndex] = threadIdCt++;
3454           }
3455 
3456           // Apparently the thread id field was specified for some entries and
3457           // not others. Start the thread id counter off at the next higher
3458           // thread id.
3459           else if (threadIdCt <= threadInfo[i][threadIdIndex]) {
3460             threadIdCt = threadInfo[i][threadIdIndex] + 1;
3461           }
3462         }
3463         break;
3464       }
3465     }
3466     if (index < threadIdIndex) {
3467       // If thread ids were specified, it is an error if they are not unique.
3468       // Also, check that we haven't already restarted the loop (to be safe -
3469       // shouldn't need to).
3470       if ((threadInfo[i][threadIdIndex] != UINT_MAX) || assign_thread_ids) {
3471         __kmp_free(lastId);
3472         __kmp_free(totals);
3473         __kmp_free(maxCt);
3474         __kmp_free(counts);
3475         CLEANUP_THREAD_INFO;
3476         *msg_id = kmp_i18n_str_PhysicalIDsNotUnique;
3477         return false;
3478       }
3479 
3480       // If the thread ids were not specified and we see entries that
3481       // are duplicates, start the loop over and assign the thread ids manually.
3482       assign_thread_ids = true;
3483       goto restart_radix_check;
3484     }
3485   }
3486 
3487 #if KMP_MIC && REDUCE_TEAM_SIZE
3488   // The default team size is the total #threads in the machine
3489   // minus 1 thread for every core that has 3 or more threads.
3490   teamSize += (threadIdCt <= 2) ? (threadIdCt) : (threadIdCt - 1);
3491 #endif // KMP_MIC && REDUCE_TEAM_SIZE
3492 
3493   for (index = threadIdIndex; index <= maxIndex; index++) {
3494     if (counts[index] > maxCt[index]) {
3495       maxCt[index] = counts[index];
3496     }
3497   }
3498 
3499   __kmp_nThreadsPerCore = maxCt[threadIdIndex];
3500   nCoresPerPkg = maxCt[coreIdIndex];
3501   nPackages = totals[pkgIdIndex];
3502 
3503   // When affinity is off, this routine will still be called to set
3504   // __kmp_ncores, as well as __kmp_nThreadsPerCore, nCoresPerPkg, & nPackages.
3505   // Make sure all these vars are set correctly, and return now if affinity is
3506   // not enabled.
3507   __kmp_ncores = totals[coreIdIndex];
3508   if (!KMP_AFFINITY_CAPABLE()) {
3509     KMP_ASSERT(__kmp_affinity.type == affinity_none);
3510     return true;
3511   }
3512 
3513 #if KMP_MIC && REDUCE_TEAM_SIZE
3514   // Set the default team size.
3515   if ((__kmp_dflt_team_nth == 0) && (teamSize > 0)) {
3516     __kmp_dflt_team_nth = teamSize;
3517     KA_TRACE(20, ("__kmp_affinity_create_cpuinfo_map: setting "
3518                   "__kmp_dflt_team_nth = %d\n",
3519                   __kmp_dflt_team_nth));
3520   }
3521 #endif // KMP_MIC && REDUCE_TEAM_SIZE
3522 
3523   KMP_DEBUG_ASSERT(num_avail == (unsigned)__kmp_avail_proc);
3524 
3525   // Count the number of levels which have more nodes at that level than at the
3526   // parent's level (with there being an implicit root node of the top level).
3527   // This is equivalent to saying that there is at least one node at this level
3528   // which has a sibling. These levels are in the map, and the package level is
3529   // always in the map.
3530   bool *inMap = (bool *)__kmp_allocate((maxIndex + 1) * sizeof(bool));
3531   for (index = threadIdIndex; index < maxIndex; index++) {
3532     KMP_ASSERT(totals[index] >= totals[index + 1]);
3533     inMap[index] = (totals[index] > totals[index + 1]);
3534   }
3535   inMap[maxIndex] = (totals[maxIndex] > 1);
3536   inMap[pkgIdIndex] = true;
3537   inMap[coreIdIndex] = true;
3538   inMap[threadIdIndex] = true;
3539 
3540   int depth = 0;
3541   int idx = 0;
3542   kmp_hw_t types[KMP_HW_LAST];
3543   int pkgLevel = -1;
3544   int coreLevel = -1;
3545   int threadLevel = -1;
3546   for (index = threadIdIndex; index <= maxIndex; index++) {
3547     if (inMap[index]) {
3548       depth++;
3549     }
3550   }
3551   if (inMap[pkgIdIndex]) {
3552     pkgLevel = idx;
3553     types[idx++] = KMP_HW_SOCKET;
3554   }
3555   if (inMap[coreIdIndex]) {
3556     coreLevel = idx;
3557     types[idx++] = KMP_HW_CORE;
3558   }
3559   if (inMap[threadIdIndex]) {
3560     threadLevel = idx;
3561     types[idx++] = KMP_HW_THREAD;
3562   }
3563   KMP_ASSERT(depth > 0);
3564 
3565   // Construct the data structure that is to be returned.
3566   __kmp_topology = kmp_topology_t::allocate(num_avail, depth, types);
3567 
3568   for (i = 0; i < num_avail; ++i) {
3569     unsigned os = threadInfo[i][osIdIndex];
3570     int src_index;
3571     kmp_hw_thread_t &hw_thread = __kmp_topology->at(i);
3572     hw_thread.clear();
3573     hw_thread.os_id = os;
3574 
3575     idx = 0;
3576     for (src_index = maxIndex; src_index >= threadIdIndex; src_index--) {
3577       if (!inMap[src_index]) {
3578         continue;
3579       }
3580       if (src_index == pkgIdIndex) {
3581         hw_thread.ids[pkgLevel] = threadInfo[i][src_index];
3582       } else if (src_index == coreIdIndex) {
3583         hw_thread.ids[coreLevel] = threadInfo[i][src_index];
3584       } else if (src_index == threadIdIndex) {
3585         hw_thread.ids[threadLevel] = threadInfo[i][src_index];
3586       }
3587     }
3588   }
3589 
3590   __kmp_free(inMap);
3591   __kmp_free(lastId);
3592   __kmp_free(totals);
3593   __kmp_free(maxCt);
3594   __kmp_free(counts);
3595   CLEANUP_THREAD_INFO;
3596   __kmp_topology->sort_ids();
3597   if (!__kmp_topology->check_ids()) {
3598     kmp_topology_t::deallocate(__kmp_topology);
3599     __kmp_topology = nullptr;
3600     *msg_id = kmp_i18n_str_PhysicalIDsNotUnique;
3601     return false;
3602   }
3603   return true;
3604 }
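// Worked example for the routine above (hypothetical /proc/cpuinfo describing
// 2 packages x 2 cores x 2 threads, 8 records total): the radix count ends up
// with maxCt[threadIdIndex] = 2, maxCt[coreIdIndex] = 2, totals[coreIdIndex]
// = 4 and totals[pkgIdIndex] = 2, so the routine reports
// __kmp_nThreadsPerCore = 2, nCoresPerPkg = 2, nPackages = 2, __kmp_ncores = 4
// and builds a depth-3 topology (KMP_HW_SOCKET, KMP_HW_CORE, KMP_HW_THREAD).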
3605 
3606 // Create a table of affinity masks in affinity.os_id_masks, indexed by OS
3607 // thread ID.  This routine handles OR'ing together the affinity masks of
3608 // threads that are sufficiently close, if granularity is coarser than fine.
3609 template <typename FindNextFunctionType>
__kmp_create_os_id_masks(unsigned * numUnique,kmp_affinity_t & affinity,FindNextFunctionType find_next)3610 static void __kmp_create_os_id_masks(unsigned *numUnique,
3611                                      kmp_affinity_t &affinity,
3612                                      FindNextFunctionType find_next) {
3613   // First form a table of affinity masks in order of OS thread id.
3614   int maxOsId;
3615   int i;
3616   int numAddrs = __kmp_topology->get_num_hw_threads();
3617   int depth = __kmp_topology->get_depth();
3618   const char *env_var = __kmp_get_affinity_env_var(affinity);
3619   KMP_ASSERT(numAddrs);
3620   KMP_ASSERT(depth);
3621 
3622   i = find_next(-1);
3623   // If no HW thread location with the requested attributes can be found,
3624   // return; the caller then retries with a plain increment and no core attributes.
3625   if (i >= numAddrs)
3626     return;
3627 
3628   maxOsId = 0;
3629   for (i = numAddrs - 1;; --i) {
3630     int osId = __kmp_topology->at(i).os_id;
3631     if (osId > maxOsId) {
3632       maxOsId = osId;
3633     }
3634     if (i == 0)
3635       break;
3636   }
3637   affinity.num_os_id_masks = maxOsId + 1;
3638   KMP_CPU_ALLOC_ARRAY(affinity.os_id_masks, affinity.num_os_id_masks);
3639   KMP_ASSERT(affinity.gran_levels >= 0);
3640   if (affinity.flags.verbose && (affinity.gran_levels > 0)) {
3641     KMP_INFORM(ThreadsMigrate, env_var, affinity.gran_levels);
3642   }
3643   if (affinity.gran_levels >= (int)depth) {
3644     KMP_AFF_WARNING(affinity, AffThreadsMayMigrate);
3645   }
3646 
3647   // Run through the table, forming the masks for all threads on each core.
3648   // Threads on the same core will have identical kmp_hw_thread_t objects, not
3649   // considering the last level, which must be the thread id. All threads on a
3650   // core will appear consecutively.
3651   int unique = 0;
3652   int j = 0; // index of 1st thread on core
3653   int leader = 0;
3654   kmp_affin_mask_t *sum;
3655   KMP_CPU_ALLOC_ON_STACK(sum);
3656   KMP_CPU_ZERO(sum);
3657 
3658   i = j = leader = find_next(-1);
3659   KMP_CPU_SET(__kmp_topology->at(i).os_id, sum);
3660   kmp_full_mask_modifier_t full_mask;
3661   for (i = find_next(i); i < numAddrs; i = find_next(i)) {
3662     // If this thread is sufficiently close to the leader (within the
3663     // granularity setting), then set the bit for this os thread in the
3664     // affinity mask for this group, and go on to the next thread.
3665     if (__kmp_topology->is_close(leader, i, affinity)) {
3666       KMP_CPU_SET(__kmp_topology->at(i).os_id, sum);
3667       continue;
3668     }
3669 
3670     // For every thread in this group, copy the mask to the thread's entry in
3671     // the OS Id mask table. Mark the first address as a leader.
3672     for (; j < i; j = find_next(j)) {
3673       int osId = __kmp_topology->at(j).os_id;
3674       KMP_DEBUG_ASSERT(osId <= maxOsId);
3675       kmp_affin_mask_t *mask = KMP_CPU_INDEX(affinity.os_id_masks, osId);
3676       KMP_CPU_COPY(mask, sum);
3677       __kmp_topology->at(j).leader = (j == leader);
3678     }
3679     unique++;
3680 
3681     // Start a new mask.
3682     leader = i;
3683     full_mask.include(sum);
3684     KMP_CPU_ZERO(sum);
3685     KMP_CPU_SET(__kmp_topology->at(i).os_id, sum);
3686   }
3687 
3688   // For every thread in last group, copy the mask to the thread's
3689   // entry in the OS Id mask table.
3690   for (; j < i; j = find_next(j)) {
3691     int osId = __kmp_topology->at(j).os_id;
3692     KMP_DEBUG_ASSERT(osId <= maxOsId);
3693     kmp_affin_mask_t *mask = KMP_CPU_INDEX(affinity.os_id_masks, osId);
3694     KMP_CPU_COPY(mask, sum);
3695     __kmp_topology->at(j).leader = (j == leader);
3696   }
3697   full_mask.include(sum);
3698   unique++;
3699   KMP_CPU_FREE_FROM_STACK(sum);
3700 
3701   // See if the OS Id mask table further restricts or changes the full mask
3702   if (full_mask.restrict_to_mask() && affinity.flags.verbose) {
3703     __kmp_topology->print(env_var);
3704   }
3705 
3706   *numUnique = unique;
3707 }
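// Illustrative example (hypothetical machine, granularity=core): if core 0
// holds OS procs {0, 1}, the loop above accumulates one "sum" mask {0, 1} and
// stores a copy of it at both affinity.os_id_masks[0] and
// affinity.os_id_masks[1]; *numUnique then counts one unique mask per core
// rather than one per hardware thread.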
3708 
3709 // Stuff for the affinity proclist parsers.  It's easier to declare these vars
3710 // as file-static than to try to pass them through the calling sequence of
3711 // the recursive-descent OMP_PLACES parser.
3712 static kmp_affin_mask_t *newMasks;
3713 static int numNewMasks;
3714 static int nextNewMask;
3715 
3716 #define ADD_MASK(_mask)                                                        \
3717   {                                                                            \
3718     if (nextNewMask >= numNewMasks) {                                          \
3719       int i;                                                                   \
3720       numNewMasks *= 2;                                                        \
3721       kmp_affin_mask_t *temp;                                                  \
3722       KMP_CPU_INTERNAL_ALLOC_ARRAY(temp, numNewMasks);                         \
3723       for (i = 0; i < numNewMasks / 2; i++) {                                  \
3724         kmp_affin_mask_t *src = KMP_CPU_INDEX(newMasks, i);                    \
3725         kmp_affin_mask_t *dest = KMP_CPU_INDEX(temp, i);                       \
3726         KMP_CPU_COPY(dest, src);                                               \
3727       }                                                                        \
3728       KMP_CPU_INTERNAL_FREE_ARRAY(newMasks, numNewMasks / 2);                  \
3729       newMasks = temp;                                                         \
3730     }                                                                          \
3731     KMP_CPU_COPY(KMP_CPU_INDEX(newMasks, nextNewMask), (_mask));               \
3732     nextNewMask++;                                                             \
3733   }
3734 
3735 #define ADD_MASK_OSID(_osId, _osId2Mask, _maxOsId)                             \
3736   {                                                                            \
3737     if (((_osId) > _maxOsId) ||                                                \
3738         (!KMP_CPU_ISSET((_osId), KMP_CPU_INDEX((_osId2Mask), (_osId))))) {     \
3739       KMP_AFF_WARNING(affinity, AffIgnoreInvalidProcID, _osId);                \
3740     } else {                                                                   \
3741       ADD_MASK(KMP_CPU_INDEX(_osId2Mask, (_osId)));                            \
3742     }                                                                          \
3743   }
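// Note on ADD_MASK (behavioral sketch, not additional code): newMasks starts
// with room for numNewMasks entries (2 in the parsers below); when the next
// slot would overflow, the macro doubles numNewMasks, allocates a new array,
// copies the existing masks over and frees the old array, so e.g. appending
// five masks grows the capacity 2 -> 4 -> 8.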
3744 
3745 // Re-parse the proclist (for the explicit affinity type), and form the list
3746 // of affinity newMasks indexed by gtid.
__kmp_affinity_process_proclist(kmp_affinity_t & affinity)3747 static void __kmp_affinity_process_proclist(kmp_affinity_t &affinity) {
3748   int i;
3749   kmp_affin_mask_t **out_masks = &affinity.masks;
3750   unsigned *out_numMasks = &affinity.num_masks;
3751   const char *proclist = affinity.proclist;
3752   kmp_affin_mask_t *osId2Mask = affinity.os_id_masks;
3753   int maxOsId = affinity.num_os_id_masks - 1;
3754   const char *scan = proclist;
3755   const char *next = proclist;
3756 
3757   // The temporary mask vector newMasks starts small and is grown by doubling
3758   // (see ADD_MASK) whenever it fills up.
3759   numNewMasks = 2;
3760   KMP_CPU_INTERNAL_ALLOC_ARRAY(newMasks, numNewMasks);
3761   nextNewMask = 0;
3762   kmp_affin_mask_t *sumMask;
3763   KMP_CPU_ALLOC(sumMask);
3764   int setSize = 0;
3765 
3766   for (;;) {
3767     int start, end, stride;
3768 
3769     SKIP_WS(scan);
3770     next = scan;
3771     if (*next == '\0') {
3772       break;
3773     }
3774 
3775     if (*next == '{') {
3776       int num;
3777       setSize = 0;
3778       next++; // skip '{'
3779       SKIP_WS(next);
3780       scan = next;
3781 
3782       // Read the first integer in the set.
3783       KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad proclist");
3784       SKIP_DIGITS(next);
3785       num = __kmp_str_to_int(scan, *next);
3786       KMP_ASSERT2(num >= 0, "bad explicit proc list");
3787 
3788       // Copy the mask for that osId to the sum (union) mask.
3789       if ((num > maxOsId) ||
3790           (!KMP_CPU_ISSET(num, KMP_CPU_INDEX(osId2Mask, num)))) {
3791         KMP_AFF_WARNING(affinity, AffIgnoreInvalidProcID, num);
3792         KMP_CPU_ZERO(sumMask);
3793       } else {
3794         KMP_CPU_COPY(sumMask, KMP_CPU_INDEX(osId2Mask, num));
3795         setSize = 1;
3796       }
3797 
3798       for (;;) {
3799         // Check for end of set.
3800         SKIP_WS(next);
3801         if (*next == '}') {
3802           next++; // skip '}'
3803           break;
3804         }
3805 
3806         // Skip optional comma.
3807         if (*next == ',') {
3808           next++;
3809         }
3810         SKIP_WS(next);
3811 
3812         // Read the next integer in the set.
3813         scan = next;
3814         KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad explicit proc list");
3815 
3816         SKIP_DIGITS(next);
3817         num = __kmp_str_to_int(scan, *next);
3818         KMP_ASSERT2(num >= 0, "bad explicit proc list");
3819 
3820         // Add the mask for that osId to the sum mask.
3821         if ((num > maxOsId) ||
3822             (!KMP_CPU_ISSET(num, KMP_CPU_INDEX(osId2Mask, num)))) {
3823           KMP_AFF_WARNING(affinity, AffIgnoreInvalidProcID, num);
3824         } else {
3825           KMP_CPU_UNION(sumMask, KMP_CPU_INDEX(osId2Mask, num));
3826           setSize++;
3827         }
3828       }
3829       if (setSize > 0) {
3830         ADD_MASK(sumMask);
3831       }
3832 
3833       SKIP_WS(next);
3834       if (*next == ',') {
3835         next++;
3836       }
3837       scan = next;
3838       continue;
3839     }
3840 
3841     // Read the first integer.
3842     KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad explicit proc list");
3843     SKIP_DIGITS(next);
3844     start = __kmp_str_to_int(scan, *next);
3845     KMP_ASSERT2(start >= 0, "bad explicit proc list");
3846     SKIP_WS(next);
3847 
3848     // If this isn't a range, then add a mask to the list and go on.
3849     if (*next != '-') {
3850       ADD_MASK_OSID(start, osId2Mask, maxOsId);
3851 
3852       // Skip optional comma.
3853       if (*next == ',') {
3854         next++;
3855       }
3856       scan = next;
3857       continue;
3858     }
3859 
3860     // This is a range.  Skip over the '-' and read in the 2nd int.
3861     next++; // skip '-'
3862     SKIP_WS(next);
3863     scan = next;
3864     KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad explicit proc list");
3865     SKIP_DIGITS(next);
3866     end = __kmp_str_to_int(scan, *next);
3867     KMP_ASSERT2(end >= 0, "bad explicit proc list");
3868 
3869     // Check for a stride parameter
3870     stride = 1;
3871     SKIP_WS(next);
3872     if (*next == ':') {
3873       // A stride is specified.  Skip over the ':' and read the 3rd int.
3874       int sign = +1;
3875       next++; // skip ':'
3876       SKIP_WS(next);
3877       scan = next;
3878       if (*next == '-') {
3879         sign = -1;
3880         next++;
3881         SKIP_WS(next);
3882         scan = next;
3883       }
3884       KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad explicit proc list");
3885       SKIP_DIGITS(next);
3886       stride = __kmp_str_to_int(scan, *next);
3887       KMP_ASSERT2(stride >= 0, "bad explicit proc list");
3888       stride *= sign;
3889     }
3890 
3891     // Do some range checks.
3892     KMP_ASSERT2(stride != 0, "bad explicit proc list");
3893     if (stride > 0) {
3894       KMP_ASSERT2(start <= end, "bad explicit proc list");
3895     } else {
3896       KMP_ASSERT2(start >= end, "bad explicit proc list");
3897     }
3898     KMP_ASSERT2((end - start) / stride <= 65536, "bad explicit proc list");
3899 
3900     // Add the mask for each OS proc # to the list.
3901     if (stride > 0) {
3902       do {
3903         ADD_MASK_OSID(start, osId2Mask, maxOsId);
3904         start += stride;
3905       } while (start <= end);
3906     } else {
3907       do {
3908         ADD_MASK_OSID(start, osId2Mask, maxOsId);
3909         start += stride;
3910       } while (start >= end);
3911     }
3912 
3913     // Skip optional comma.
3914     SKIP_WS(next);
3915     if (*next == ',') {
3916       next++;
3917     }
3918     scan = next;
3919   }
3920 
3921   *out_numMasks = nextNewMask;
3922   if (nextNewMask == 0) {
3923     *out_masks = NULL;
3924     KMP_CPU_INTERNAL_FREE_ARRAY(newMasks, numNewMasks);
3925     return;
3926   }
3927   KMP_CPU_ALLOC_ARRAY((*out_masks), nextNewMask);
3928   for (i = 0; i < nextNewMask; i++) {
3929     kmp_affin_mask_t *src = KMP_CPU_INDEX(newMasks, i);
3930     kmp_affin_mask_t *dest = KMP_CPU_INDEX((*out_masks), i);
3931     KMP_CPU_COPY(dest, src);
3932   }
3933   KMP_CPU_INTERNAL_FREE_ARRAY(newMasks, numNewMasks);
3934   KMP_CPU_FREE(sumMask);
3935 }
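// Illustrative example (hypothetical OS ids, assuming granularity=fine so each
// OS id maps to a singleton mask): parsing the proclist "3,0-2,{6,7}" with the
// routine above yields five masks, in order {3}, {0}, {1}, {2} and {6,7} --
// single ids and range members each become their own place, while a {...} set
// is OR'ed into one mask.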
3936 
3937 /*-----------------------------------------------------------------------------
3938 Re-parse the OMP_PLACES proc id list, forming the newMasks for the different
3939 places.  Again, here is the grammar:
3940 
3941 place_list := place
3942 place_list := place , place_list
3943 place := num
3944 place := place : num
3945 place := place : num : signed
3946 place := { subplacelist }
3947 place := ! place                  // (lowest priority)
3948 subplace_list := subplace
3949 subplace_list := subplace , subplace_list
3950 subplace := num
3951 subplace := num : num
3952 subplace := num : num : signed
3953 signed := num
3954 signed := + signed
3955 signed := - signed
3956 -----------------------------------------------------------------------------*/
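// Illustrative example of the grammar above (hypothetical OS ids): the place
// string "{0:4}:2:4" describes an initial place {0,1,2,3} (subplace 0:4),
// replicated count=2 times with stride 4, i.e. the two places {0,1,2,3} and
// {4,5,6,7}.  A leading '!' complements a place against OS ids 0..maxOsId.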
__kmp_process_subplace_list(const char ** scan,kmp_affinity_t & affinity,int maxOsId,kmp_affin_mask_t * tempMask,int * setSize)3957 static void __kmp_process_subplace_list(const char **scan,
3958                                         kmp_affinity_t &affinity, int maxOsId,
3959                                         kmp_affin_mask_t *tempMask,
3960                                         int *setSize) {
3961   const char *next;
3962   kmp_affin_mask_t *osId2Mask = affinity.os_id_masks;
3963 
3964   for (;;) {
3965     int start, count, stride, i;
3966 
3967     // Read in the starting proc id
3968     SKIP_WS(*scan);
3969     KMP_ASSERT2((**scan >= '0') && (**scan <= '9'), "bad explicit places list");
3970     next = *scan;
3971     SKIP_DIGITS(next);
3972     start = __kmp_str_to_int(*scan, *next);
3973     KMP_ASSERT(start >= 0);
3974     *scan = next;
3975 
3976     // valid follow sets are ',' ':' and '}'
3977     SKIP_WS(*scan);
3978     if (**scan == '}' || **scan == ',') {
3979       if ((start > maxOsId) ||
3980           (!KMP_CPU_ISSET(start, KMP_CPU_INDEX(osId2Mask, start)))) {
3981         KMP_AFF_WARNING(affinity, AffIgnoreInvalidProcID, start);
3982       } else {
3983         KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, start));
3984         (*setSize)++;
3985       }
3986       if (**scan == '}') {
3987         break;
3988       }
3989       (*scan)++; // skip ','
3990       continue;
3991     }
3992     KMP_ASSERT2(**scan == ':', "bad explicit places list");
3993     (*scan)++; // skip ':'
3994 
3995     // Read count parameter
3996     SKIP_WS(*scan);
3997     KMP_ASSERT2((**scan >= '0') && (**scan <= '9'), "bad explicit places list");
3998     next = *scan;
3999     SKIP_DIGITS(next);
4000     count = __kmp_str_to_int(*scan, *next);
4001     KMP_ASSERT(count >= 0);
4002     *scan = next;
4003 
4004     // valid follow sets are ',' ':' and '}'
4005     SKIP_WS(*scan);
4006     if (**scan == '}' || **scan == ',') {
4007       for (i = 0; i < count; i++) {
4008         if ((start > maxOsId) ||
4009             (!KMP_CPU_ISSET(start, KMP_CPU_INDEX(osId2Mask, start)))) {
4010           KMP_AFF_WARNING(affinity, AffIgnoreInvalidProcID, start);
4011           break; // don't proliferate warnings for large count
4012         } else {
4013           KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, start));
4014           start++;
4015           (*setSize)++;
4016         }
4017       }
4018       if (**scan == '}') {
4019         break;
4020       }
4021       (*scan)++; // skip ','
4022       continue;
4023     }
4024     KMP_ASSERT2(**scan == ':', "bad explicit places list");
4025     (*scan)++; // skip ':'
4026 
4027     // Read stride parameter
4028     int sign = +1;
4029     for (;;) {
4030       SKIP_WS(*scan);
4031       if (**scan == '+') {
4032         (*scan)++; // skip '+'
4033         continue;
4034       }
4035       if (**scan == '-') {
4036         sign *= -1;
4037         (*scan)++; // skip '-'
4038         continue;
4039       }
4040       break;
4041     }
4042     SKIP_WS(*scan);
4043     KMP_ASSERT2((**scan >= '0') && (**scan <= '9'), "bad explicit places list");
4044     next = *scan;
4045     SKIP_DIGITS(next);
4046     stride = __kmp_str_to_int(*scan, *next);
4047     KMP_ASSERT(stride >= 0);
4048     *scan = next;
4049     stride *= sign;
4050 
4051     // valid follow sets are ',' and '}'
4052     SKIP_WS(*scan);
4053     if (**scan == '}' || **scan == ',') {
4054       for (i = 0; i < count; i++) {
4055         if ((start > maxOsId) ||
4056             (!KMP_CPU_ISSET(start, KMP_CPU_INDEX(osId2Mask, start)))) {
4057           KMP_AFF_WARNING(affinity, AffIgnoreInvalidProcID, start);
4058           break; // don't proliferate warnings for large count
4059         } else {
4060           KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, start));
4061           start += stride;
4062           (*setSize)++;
4063         }
4064       }
4065       if (**scan == '}') {
4066         break;
4067       }
4068       (*scan)++; // skip ','
4069       continue;
4070     }
4071 
4072     KMP_ASSERT2(0, "bad explicit places list");
4073   }
4074 }
4075 
__kmp_process_place(const char ** scan,kmp_affinity_t & affinity,int maxOsId,kmp_affin_mask_t * tempMask,int * setSize)4076 static void __kmp_process_place(const char **scan, kmp_affinity_t &affinity,
4077                                 int maxOsId, kmp_affin_mask_t *tempMask,
4078                                 int *setSize) {
4079   const char *next;
4080   kmp_affin_mask_t *osId2Mask = affinity.os_id_masks;
4081 
4082   // valid follow sets are '{' '!' and num
4083   SKIP_WS(*scan);
4084   if (**scan == '{') {
4085     (*scan)++; // skip '{'
4086     __kmp_process_subplace_list(scan, affinity, maxOsId, tempMask, setSize);
4087     KMP_ASSERT2(**scan == '}', "bad explicit places list");
4088     (*scan)++; // skip '}'
4089   } else if (**scan == '!') {
4090     (*scan)++; // skip '!'
4091     __kmp_process_place(scan, affinity, maxOsId, tempMask, setSize);
4092     KMP_CPU_COMPLEMENT(maxOsId, tempMask);
4093   } else if ((**scan >= '0') && (**scan <= '9')) {
4094     next = *scan;
4095     SKIP_DIGITS(next);
4096     int num = __kmp_str_to_int(*scan, *next);
4097     KMP_ASSERT(num >= 0);
4098     if ((num > maxOsId) ||
4099         (!KMP_CPU_ISSET(num, KMP_CPU_INDEX(osId2Mask, num)))) {
4100       KMP_AFF_WARNING(affinity, AffIgnoreInvalidProcID, num);
4101     } else {
4102       KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, num));
4103       (*setSize)++;
4104     }
4105     *scan = next; // skip num
4106   } else {
4107     KMP_ASSERT2(0, "bad explicit places list");
4108   }
4109 }
4110 
4111 // static void
__kmp_affinity_process_placelist(kmp_affinity_t & affinity)4112 void __kmp_affinity_process_placelist(kmp_affinity_t &affinity) {
4113   int i, j, count, stride, sign;
4114   kmp_affin_mask_t **out_masks = &affinity.masks;
4115   unsigned *out_numMasks = &affinity.num_masks;
4116   const char *placelist = affinity.proclist;
4117   kmp_affin_mask_t *osId2Mask = affinity.os_id_masks;
4118   int maxOsId = affinity.num_os_id_masks - 1;
4119   const char *scan = placelist;
4120   const char *next = placelist;
4121 
4122   numNewMasks = 2;
4123   KMP_CPU_INTERNAL_ALLOC_ARRAY(newMasks, numNewMasks);
4124   nextNewMask = 0;
4125 
4126   // tempMask is modified based on the previous or initial
4127   //   place to form the current place
4128   // previousMask contains the previous place
4129   kmp_affin_mask_t *tempMask;
4130   kmp_affin_mask_t *previousMask;
4131   KMP_CPU_ALLOC(tempMask);
4132   KMP_CPU_ZERO(tempMask);
4133   KMP_CPU_ALLOC(previousMask);
4134   KMP_CPU_ZERO(previousMask);
4135   int setSize = 0;
4136 
4137   for (;;) {
4138     __kmp_process_place(&scan, affinity, maxOsId, tempMask, &setSize);
4139 
4140     // valid follow sets are ',' ':' and EOL
4141     SKIP_WS(scan);
4142     if (*scan == '\0' || *scan == ',') {
4143       if (setSize > 0) {
4144         ADD_MASK(tempMask);
4145       }
4146       KMP_CPU_ZERO(tempMask);
4147       setSize = 0;
4148       if (*scan == '\0') {
4149         break;
4150       }
4151       scan++; // skip ','
4152       continue;
4153     }
4154 
4155     KMP_ASSERT2(*scan == ':', "bad explicit places list");
4156     scan++; // skip ':'
4157 
4158     // Read count parameter
4159     SKIP_WS(scan);
4160     KMP_ASSERT2((*scan >= '0') && (*scan <= '9'), "bad explicit places list");
4161     next = scan;
4162     SKIP_DIGITS(next);
4163     count = __kmp_str_to_int(scan, *next);
4164     KMP_ASSERT(count >= 0);
4165     scan = next;
4166 
4167     // valid follow sets are ',' ':' and EOL
4168     SKIP_WS(scan);
4169     if (*scan == '\0' || *scan == ',') {
4170       stride = +1;
4171     } else {
4172       KMP_ASSERT2(*scan == ':', "bad explicit places list");
4173       scan++; // skip ':'
4174 
4175       // Read stride parameter
4176       sign = +1;
4177       for (;;) {
4178         SKIP_WS(scan);
4179         if (*scan == '+') {
4180           scan++; // skip '+'
4181           continue;
4182         }
4183         if (*scan == '-') {
4184           sign *= -1;
4185           scan++; // skip '-'
4186           continue;
4187         }
4188         break;
4189       }
4190       SKIP_WS(scan);
4191       KMP_ASSERT2((*scan >= '0') && (*scan <= '9'), "bad explicit places list");
4192       next = scan;
4193       SKIP_DIGITS(next);
4194       stride = __kmp_str_to_int(scan, *next);
4195       KMP_DEBUG_ASSERT(stride >= 0);
4196       scan = next;
4197       stride *= sign;
4198     }
4199 
4200     // Add places determined by initial_place : count : stride
4201     for (i = 0; i < count; i++) {
4202       if (setSize == 0) {
4203         break;
4204       }
4205       // Add the current place, then build the next place (tempMask) from that
4206       KMP_CPU_COPY(previousMask, tempMask);
4207       ADD_MASK(previousMask);
4208       KMP_CPU_ZERO(tempMask);
4209       setSize = 0;
4210       KMP_CPU_SET_ITERATE(j, previousMask) {
4211         if (!KMP_CPU_ISSET(j, previousMask)) {
4212           continue;
4213         }
4214         if ((j + stride > maxOsId) || (j + stride < 0) ||
4215             (!KMP_CPU_ISSET(j, __kmp_affin_fullMask)) ||
4216             (!KMP_CPU_ISSET(j + stride,
4217                             KMP_CPU_INDEX(osId2Mask, j + stride)))) {
4218           if (i < count - 1) {
4219             KMP_AFF_WARNING(affinity, AffIgnoreInvalidProcID, j + stride);
4220           }
4221           continue;
4222         }
4223         KMP_CPU_SET(j + stride, tempMask);
4224         setSize++;
4225       }
4226     }
4227     KMP_CPU_ZERO(tempMask);
4228     setSize = 0;
4229 
4230     // valid follow sets are ',' and EOL
4231     SKIP_WS(scan);
4232     if (*scan == '\0') {
4233       break;
4234     }
4235     if (*scan == ',') {
4236       scan++; // skip ','
4237       continue;
4238     }
4239 
4240     KMP_ASSERT2(0, "bad explicit places list");
4241   }
4242 
4243   *out_numMasks = nextNewMask;
4244   if (nextNewMask == 0) {
4245     *out_masks = NULL;
4246     KMP_CPU_INTERNAL_FREE_ARRAY(newMasks, numNewMasks);
4247     return;
4248   }
4249   KMP_CPU_ALLOC_ARRAY((*out_masks), nextNewMask);
4250   KMP_CPU_FREE(tempMask);
4251   KMP_CPU_FREE(previousMask);
4252   for (i = 0; i < nextNewMask; i++) {
4253     kmp_affin_mask_t *src = KMP_CPU_INDEX(newMasks, i);
4254     kmp_affin_mask_t *dest = KMP_CPU_INDEX((*out_masks), i);
4255     KMP_CPU_COPY(dest, src);
4256   }
4257   KMP_CPU_INTERNAL_FREE_ARRAY(newMasks, numNewMasks);
4258 }
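// Illustrative example (hypothetical OS ids, granularity=fine): the place list
// "{0,1}:3" uses the default stride of +1, so the loop above emits the three
// places {0,1}, {1,2} and {2,3}, shifting every bit of the previous place by
// the stride to form the next one.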
4259 
4260 #undef ADD_MASK
4261 #undef ADD_MASK_OSID
4262 
4263 // This function figures out the deepest level at which there is at least one
4264 // cluster/core with more than one processing unit bound to it.
__kmp_affinity_find_core_level(int nprocs,int bottom_level)4265 static int __kmp_affinity_find_core_level(int nprocs, int bottom_level) {
4266   int core_level = 0;
4267 
4268   for (int i = 0; i < nprocs; i++) {
4269     const kmp_hw_thread_t &hw_thread = __kmp_topology->at(i);
4270     for (int j = bottom_level; j > 0; j--) {
4271       if (hw_thread.ids[j] > 0) {
4272         if (core_level < (j - 1)) {
4273           core_level = j - 1;
4274         }
4275       }
4276     }
4277   }
4278   return core_level;
4279 }
4280 
4281 // This function counts the number of clusters/cores at a given level.
__kmp_affinity_compute_ncores(int nprocs,int bottom_level,int core_level)4282 static int __kmp_affinity_compute_ncores(int nprocs, int bottom_level,
4283                                          int core_level) {
4284   return __kmp_topology->get_count(core_level);
4285 }
4286 // This function finds the cluster/core to which a given processing unit is bound.
4287 static int __kmp_affinity_find_core(int proc, int bottom_level,
4288                                     int core_level) {
4289   int core = 0;
4290   KMP_DEBUG_ASSERT(proc >= 0 && proc < __kmp_topology->get_num_hw_threads());
4291   for (int i = 0; i <= proc; ++i) {
4292     if (i + 1 <= proc) {
4293       for (int j = 0; j <= core_level; ++j) {
4294         if (__kmp_topology->at(i + 1).sub_ids[j] !=
4295             __kmp_topology->at(i).sub_ids[j]) {
4296           core++;
4297           break;
4298         }
4299       }
4300     }
4301   }
4302   return core;
4303 }
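// Worked example (a uniform machine is assumed purely for illustration): with
// two hardware threads per core and core_level == 1, the sub_ids[0..1] pair
// changes every second entry of the sorted hardware-thread table, so
// proc == 5 crosses two such boundaries and the function returns core 2
// (entries 0-1 -> core 0, 2-3 -> core 1, 4-5 -> core 2).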
4304 
4305 // This function finds the maximal number of processing units bound to a
4306 // cluster/core at a given level.
4307 static int __kmp_affinity_max_proc_per_core(int nprocs, int bottom_level,
4308                                             int core_level) {
4309   if (core_level >= bottom_level)
4310     return 1;
4311   int thread_level = __kmp_topology->get_level(KMP_HW_THREAD);
4312   return __kmp_topology->calculate_ratio(thread_level, core_level);
4313 }
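// For instance (counts are assumed): on a uniform machine whose thread level
// holds 128 hardware threads spread over 64 core-level units,
// calculate_ratio(thread_level, core_level) yields 2; when core_level is
// already the bottom level, each "core" is a single processing unit and the
// early return above reports 1.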
4314 
4315 static int *procarr = NULL;
4316 static int __kmp_aff_depth = 0;
4317 static int *__kmp_osid_to_hwthread_map = NULL;
4318 
4319 static void __kmp_affinity_get_mask_topology_info(const kmp_affin_mask_t *mask,
4320                                                   kmp_affinity_ids_t &ids,
4321                                                   kmp_affinity_attrs_t &attrs) {
4322   if (!KMP_AFFINITY_CAPABLE())
4323     return;
4324 
4325   // Initialize ids and attrs thread data
4326   for (int i = 0; i < KMP_HW_LAST; ++i)
4327     ids.ids[i] = kmp_hw_thread_t::UNKNOWN_ID;
4328   attrs = KMP_AFFINITY_ATTRS_UNKNOWN;
4329 
4330   // Iterate through each os id within the mask and determine
4331   // the topology id and attribute information
4332   int cpu;
4333   int depth = __kmp_topology->get_depth();
4334   KMP_CPU_SET_ITERATE(cpu, mask) {
4335     int osid_idx = __kmp_osid_to_hwthread_map[cpu];
4336     ids.os_id = cpu;
4337     const kmp_hw_thread_t &hw_thread = __kmp_topology->at(osid_idx);
4338     for (int level = 0; level < depth; ++level) {
4339       kmp_hw_t type = __kmp_topology->get_type(level);
4340       int id = hw_thread.sub_ids[level];
4341       if (ids.ids[type] == kmp_hw_thread_t::UNKNOWN_ID || ids.ids[type] == id) {
4342         ids.ids[type] = id;
4343       } else {
4344         // This mask spans across multiple topology units, set it as such
4345         // and mark every level below as such as well.
4346         ids.ids[type] = kmp_hw_thread_t::MULTIPLE_ID;
4347         for (; level < depth; ++level) {
4348           kmp_hw_t type = __kmp_topology->get_type(level);
4349           ids.ids[type] = kmp_hw_thread_t::MULTIPLE_ID;
4350         }
4351       }
4352     }
4353     if (!attrs.valid) {
4354       attrs.core_type = hw_thread.attrs.get_core_type();
4355       attrs.core_eff = hw_thread.attrs.get_core_eff();
4356       attrs.valid = 1;
4357     } else {
4358       // This mask spans across multiple attributes, set it as such
4359       if (attrs.core_type != hw_thread.attrs.get_core_type())
4360         attrs.core_type = KMP_HW_CORE_TYPE_UNKNOWN;
4361       if (attrs.core_eff != hw_thread.attrs.get_core_eff())
4362         attrs.core_eff = kmp_hw_attr_t::UNKNOWN_CORE_EFF;
4363     }
4364   }
4365 }
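// Illustrative outcomes (the masks are hypothetical): a mask covering the two
// hardware threads of one core keeps its socket and core ids but records
// MULTIPLE_ID at the thread level; a mask spanning two cores of the same
// socket collapses both the core and thread entries to MULTIPLE_ID while
// keeping the socket id, and attrs retains a concrete core_type/core_eff only
// while every covered hardware thread agrees on them.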
4366 
4367 static void __kmp_affinity_get_thread_topology_info(kmp_info_t *th) {
4368   if (!KMP_AFFINITY_CAPABLE())
4369     return;
4370   const kmp_affin_mask_t *mask = th->th.th_affin_mask;
4371   kmp_affinity_ids_t &ids = th->th.th_topology_ids;
4372   kmp_affinity_attrs_t &attrs = th->th.th_topology_attrs;
4373   __kmp_affinity_get_mask_topology_info(mask, ids, attrs);
4374 }
4375 
4376 // Assign the topology information to each place in the place list
4377 // A thread can then grab not only its affinity mask, but the topology
4378 // information associated with that mask. e.g., Which socket is a thread on
4379 static void __kmp_affinity_get_topology_info(kmp_affinity_t &affinity) {
4380   if (!KMP_AFFINITY_CAPABLE())
4381     return;
4382   if (affinity.type != affinity_none) {
4383     KMP_ASSERT(affinity.num_os_id_masks);
4384     KMP_ASSERT(affinity.os_id_masks);
4385   }
4386   KMP_ASSERT(affinity.num_masks);
4387   KMP_ASSERT(affinity.masks);
4388   KMP_ASSERT(__kmp_affin_fullMask);
4389 
4390   int max_cpu = __kmp_affin_fullMask->get_max_cpu();
4391   int num_hw_threads = __kmp_topology->get_num_hw_threads();
4392 
4393   // Allocate thread topology information
4394   if (!affinity.ids) {
4395     affinity.ids = (kmp_affinity_ids_t *)__kmp_allocate(
4396         sizeof(kmp_affinity_ids_t) * affinity.num_masks);
4397   }
4398   if (!affinity.attrs) {
4399     affinity.attrs = (kmp_affinity_attrs_t *)__kmp_allocate(
4400         sizeof(kmp_affinity_attrs_t) * affinity.num_masks);
4401   }
4402   if (!__kmp_osid_to_hwthread_map) {
4403     // Want the +1 because max_cpu should be a valid index into the map
4404     __kmp_osid_to_hwthread_map =
4405         (int *)__kmp_allocate(sizeof(int) * (max_cpu + 1));
4406   }
4407 
4408   // Create the OS proc to hardware thread map
4409   for (int hw_thread = 0; hw_thread < num_hw_threads; ++hw_thread) {
4410     int os_id = __kmp_topology->at(hw_thread).os_id;
4411     if (KMP_CPU_ISSET(os_id, __kmp_affin_fullMask))
4412       __kmp_osid_to_hwthread_map[os_id] = hw_thread;
4413   }
4414 
4415   for (unsigned i = 0; i < affinity.num_masks; ++i) {
4416     kmp_affinity_ids_t &ids = affinity.ids[i];
4417     kmp_affinity_attrs_t &attrs = affinity.attrs[i];
4418     kmp_affin_mask_t *mask = KMP_CPU_INDEX(affinity.masks, i);
4419     __kmp_affinity_get_mask_topology_info(mask, ids, attrs);
4420   }
4421 }
4422 
4423 // Called when __kmp_topology is ready
4424 static void __kmp_aux_affinity_initialize_other_data(kmp_affinity_t &affinity) {
4425   // Initialize other data structures which depend on the topology
4426   if (__kmp_topology && __kmp_topology->get_num_hw_threads()) {
4427     machine_hierarchy.init(__kmp_topology->get_num_hw_threads());
4428     __kmp_affinity_get_topology_info(affinity);
4429 #if KMP_WEIGHTED_ITERATIONS_SUPPORTED
4430     __kmp_first_osid_with_ecore = __kmp_get_first_osid_with_ecore();
4431 #endif
4432   }
4433 }
4434 
4435 // Create a one element mask array (set of places) which only contains the
4436 // initial process's affinity mask
4437 static void __kmp_create_affinity_none_places(kmp_affinity_t &affinity) {
4438   KMP_ASSERT(__kmp_affin_fullMask != NULL);
4439   KMP_ASSERT(affinity.type == affinity_none);
4440   KMP_ASSERT(__kmp_avail_proc == __kmp_topology->get_num_hw_threads());
4441   affinity.num_masks = 1;
4442   KMP_CPU_ALLOC_ARRAY(affinity.masks, affinity.num_masks);
4443   kmp_affin_mask_t *dest = KMP_CPU_INDEX(affinity.masks, 0);
4444   KMP_CPU_COPY(dest, __kmp_affin_fullMask);
4445   __kmp_aux_affinity_initialize_other_data(affinity);
4446 }
4447 
4448 static void __kmp_aux_affinity_initialize_masks(kmp_affinity_t &affinity) {
4449   // Create the "full" mask - this defines all of the processors that we
4450   // consider to be in the machine model. If respect is set, then it is the
4451   // initialization thread's affinity mask. Otherwise, it is all processors that
4452   // we know about on the machine.
4453   int verbose = affinity.flags.verbose;
4454   const char *env_var = affinity.env_var;
4455 
4456   // Already initialized
4457   if (__kmp_affin_fullMask && __kmp_affin_origMask)
4458     return;
4459 
4460   if (__kmp_affin_fullMask == NULL) {
4461     KMP_CPU_ALLOC(__kmp_affin_fullMask);
4462   }
4463   if (__kmp_affin_origMask == NULL) {
4464     KMP_CPU_ALLOC(__kmp_affin_origMask);
4465   }
4466   if (KMP_AFFINITY_CAPABLE()) {
4467     __kmp_get_system_affinity(__kmp_affin_fullMask, TRUE);
4468     // Make a copy before possibly expanding to the entire machine mask
4469     __kmp_affin_origMask->copy(__kmp_affin_fullMask);
4470     if (affinity.flags.respect) {
4471       // Count the number of available processors.
4472       unsigned i;
4473       __kmp_avail_proc = 0;
4474       KMP_CPU_SET_ITERATE(i, __kmp_affin_fullMask) {
4475         if (!KMP_CPU_ISSET(i, __kmp_affin_fullMask)) {
4476           continue;
4477         }
4478         __kmp_avail_proc++;
4479       }
4480       if (__kmp_avail_proc > __kmp_xproc) {
4481         KMP_AFF_WARNING(affinity, ErrorInitializeAffinity);
4482         affinity.type = affinity_none;
4483         KMP_AFFINITY_DISABLE();
4484         return;
4485       }
4486 
4487       if (verbose) {
4488         char buf[KMP_AFFIN_MASK_PRINT_LEN];
4489         __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
4490                                   __kmp_affin_fullMask);
4491         KMP_INFORM(InitOSProcSetRespect, env_var, buf);
4492       }
4493     } else {
4494       if (verbose) {
4495         char buf[KMP_AFFIN_MASK_PRINT_LEN];
4496         __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
4497                                   __kmp_affin_fullMask);
4498         KMP_INFORM(InitOSProcSetNotRespect, env_var, buf);
4499       }
4500       __kmp_avail_proc =
4501           __kmp_affinity_entire_machine_mask(__kmp_affin_fullMask);
4502 #if KMP_OS_WINDOWS
4503       if (__kmp_num_proc_groups <= 1) {
4504         // Copy expanded full mask if topology has single processor group
4505         __kmp_affin_origMask->copy(__kmp_affin_fullMask);
4506       }
4507       // Set the process affinity mask since threads' affinity
4508       // masks must be a subset of the process mask in Windows* OS
4509       __kmp_affin_fullMask->set_process_affinity(true);
4510 #endif
4511     }
4512   }
4513 }
4514 
4515 static bool __kmp_aux_affinity_initialize_topology(kmp_affinity_t &affinity) {
4516   bool success = false;
4517   const char *env_var = affinity.env_var;
4518   kmp_i18n_id_t msg_id = kmp_i18n_null;
4519   int verbose = affinity.flags.verbose;
4520 
4521   // For backward compatibility, setting KMP_CPUINFO_FILE =>
4522   // KMP_TOPOLOGY_METHOD=cpuinfo
4523   if ((__kmp_cpuinfo_file != NULL) &&
4524       (__kmp_affinity_top_method == affinity_top_method_all)) {
4525     __kmp_affinity_top_method = affinity_top_method_cpuinfo;
4526   }
4527 
4528   if (__kmp_affinity_top_method == affinity_top_method_all) {
4529 // In the default code path, errors are not fatal - we just try using
4530 // another method. We only emit a warning message if affinity is on, or the
4531 // verbose flag is set, and the nowarnings flag was not set.
4532 #if KMP_USE_HWLOC
4533     if (!success &&
4534         __kmp_affinity_dispatch->get_api_type() == KMPAffinity::HWLOC) {
4535       if (!__kmp_hwloc_error) {
4536         success = __kmp_affinity_create_hwloc_map(&msg_id);
4537         if (!success && verbose) {
4538           KMP_INFORM(AffIgnoringHwloc, env_var);
4539         }
4540       } else if (verbose) {
4541         KMP_INFORM(AffIgnoringHwloc, env_var);
4542       }
4543     }
4544 #endif
4545 
4546 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
4547     if (!success) {
4548       success = __kmp_affinity_create_x2apicid_map(&msg_id);
4549       if (!success && verbose && msg_id != kmp_i18n_null) {
4550         KMP_INFORM(AffInfoStr, env_var, __kmp_i18n_catgets(msg_id));
4551       }
4552     }
4553     if (!success) {
4554       success = __kmp_affinity_create_apicid_map(&msg_id);
4555       if (!success && verbose && msg_id != kmp_i18n_null) {
4556         KMP_INFORM(AffInfoStr, env_var, __kmp_i18n_catgets(msg_id));
4557       }
4558     }
4559 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
4560 
4561 #if KMP_OS_LINUX || KMP_OS_AIX
4562     if (!success) {
4563       int line = 0;
4564       success = __kmp_affinity_create_cpuinfo_map(&line, &msg_id);
4565       if (!success && verbose && msg_id != kmp_i18n_null) {
4566         KMP_INFORM(AffInfoStr, env_var, __kmp_i18n_catgets(msg_id));
4567       }
4568     }
4569 #endif /* KMP_OS_LINUX || KMP_OS_AIX */
4570 
4571 #if KMP_GROUP_AFFINITY
4572     if (!success && (__kmp_num_proc_groups > 1)) {
4573       success = __kmp_affinity_create_proc_group_map(&msg_id);
4574       if (!success && verbose && msg_id != kmp_i18n_null) {
4575         KMP_INFORM(AffInfoStr, env_var, __kmp_i18n_catgets(msg_id));
4576       }
4577     }
4578 #endif /* KMP_GROUP_AFFINITY */
4579 
4580     if (!success) {
4581       success = __kmp_affinity_create_flat_map(&msg_id);
4582       if (!success && verbose && msg_id != kmp_i18n_null) {
4583         KMP_INFORM(AffInfoStr, env_var, __kmp_i18n_catgets(msg_id));
4584       }
4585       KMP_ASSERT(success);
4586     }
4587   }
4588 
4589 // If the user has specified that a particular topology discovery method is to be
4590 // used, then we abort if that method fails. The exception is group affinity,
4591 // which might have been implicitly set.
4592 #if KMP_USE_HWLOC
4593   else if (__kmp_affinity_top_method == affinity_top_method_hwloc) {
4594     KMP_ASSERT(__kmp_affinity_dispatch->get_api_type() == KMPAffinity::HWLOC);
4595     success = __kmp_affinity_create_hwloc_map(&msg_id);
4596     if (!success) {
4597       KMP_ASSERT(msg_id != kmp_i18n_null);
4598       KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id));
4599     }
4600   }
4601 #endif // KMP_USE_HWLOC
4602 
4603 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
4604   else if (__kmp_affinity_top_method == affinity_top_method_x2apicid ||
4605            __kmp_affinity_top_method == affinity_top_method_x2apicid_1f) {
4606     success = __kmp_affinity_create_x2apicid_map(&msg_id);
4607     if (!success) {
4608       KMP_ASSERT(msg_id != kmp_i18n_null);
4609       KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id));
4610     }
4611   } else if (__kmp_affinity_top_method == affinity_top_method_apicid) {
4612     success = __kmp_affinity_create_apicid_map(&msg_id);
4613     if (!success) {
4614       KMP_ASSERT(msg_id != kmp_i18n_null);
4615       KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id));
4616     }
4617   }
4618 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
4619 
4620   else if (__kmp_affinity_top_method == affinity_top_method_cpuinfo) {
4621     int line = 0;
4622     success = __kmp_affinity_create_cpuinfo_map(&line, &msg_id);
4623     if (!success) {
4624       KMP_ASSERT(msg_id != kmp_i18n_null);
4625       const char *filename = __kmp_cpuinfo_get_filename();
4626       if (line > 0) {
4627         KMP_FATAL(FileLineMsgExiting, filename, line,
4628                   __kmp_i18n_catgets(msg_id));
4629       } else {
4630         KMP_FATAL(FileMsgExiting, filename, __kmp_i18n_catgets(msg_id));
4631       }
4632     }
4633   }
4634 
4635 #if KMP_GROUP_AFFINITY
4636   else if (__kmp_affinity_top_method == affinity_top_method_group) {
4637     success = __kmp_affinity_create_proc_group_map(&msg_id);
4638     KMP_ASSERT(success);
4639     if (!success) {
4640       KMP_ASSERT(msg_id != kmp_i18n_null);
4641       KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id));
4642     }
4643   }
4644 #endif /* KMP_GROUP_AFFINITY */
4645 
4646   else if (__kmp_affinity_top_method == affinity_top_method_flat) {
4647     success = __kmp_affinity_create_flat_map(&msg_id);
4648     // should not fail
4649     KMP_ASSERT(success);
4650   }
4651 
4652   // Early exit if topology could not be created
4653   if (!__kmp_topology) {
4654     if (KMP_AFFINITY_CAPABLE()) {
4655       KMP_AFF_WARNING(affinity, ErrorInitializeAffinity);
4656     }
4657     if (nPackages > 0 && nCoresPerPkg > 0 && __kmp_nThreadsPerCore > 0 &&
4658         __kmp_ncores > 0) {
4659       __kmp_topology = kmp_topology_t::allocate(0, 0, NULL);
4660       __kmp_topology->canonicalize(nPackages, nCoresPerPkg,
4661                                    __kmp_nThreadsPerCore, __kmp_ncores);
4662       if (verbose) {
4663         __kmp_topology->print(env_var);
4664       }
4665     }
4666     return false;
4667   }
4668 
4669   // Canonicalize, print (if requested), apply KMP_HW_SUBSET
4670   __kmp_topology->canonicalize();
4671   if (verbose)
4672     __kmp_topology->print(env_var);
4673   bool filtered = __kmp_topology->filter_hw_subset();
4674   if (filtered && verbose)
4675     __kmp_topology->print("KMP_HW_SUBSET");
4676   return success;
4677 }
4678 
4679 static void __kmp_aux_affinity_initialize(kmp_affinity_t &affinity) {
4680   bool is_regular_affinity = (&affinity == &__kmp_affinity);
4681   bool is_hidden_helper_affinity = (&affinity == &__kmp_hh_affinity);
4682   const char *env_var = __kmp_get_affinity_env_var(affinity);
4683 
4684   if (affinity.flags.initialized) {
4685     KMP_ASSERT(__kmp_affin_fullMask != NULL);
4686     return;
4687   }
4688 
4689   if (is_regular_affinity && (!__kmp_affin_fullMask || !__kmp_affin_origMask))
4690     __kmp_aux_affinity_initialize_masks(affinity);
4691 
4692   if (is_regular_affinity && !__kmp_topology) {
4693     bool success = __kmp_aux_affinity_initialize_topology(affinity);
4694     if (success) {
4695       KMP_ASSERT(__kmp_avail_proc == __kmp_topology->get_num_hw_threads());
4696     } else {
4697       affinity.type = affinity_none;
4698       KMP_AFFINITY_DISABLE();
4699     }
4700   }
4701 
4702   // If KMP_AFFINITY=none, then only create the single "none" place
4703   // which is the process's initial affinity mask or the number of
4704   // hardware threads, depending on the respect/norespect flag
4705   if (affinity.type == affinity_none) {
4706     __kmp_create_affinity_none_places(affinity);
4707 #if KMP_USE_HIER_SCHED
4708     __kmp_dispatch_set_hierarchy_values();
4709 #endif
4710     affinity.flags.initialized = TRUE;
4711     return;
4712   }
4713 
4714   __kmp_topology->set_granularity(affinity);
4715   int depth = __kmp_topology->get_depth();
4716 
4717   // Create the table of masks, indexed by thread Id.
4718   unsigned numUnique;
4719   int numAddrs = __kmp_topology->get_num_hw_threads();
4720   // If OMP_PLACES=cores:<attribute> is specified, then attempt
4721   // to make OS Id mask table using those attributes
4722   if (affinity.core_attr_gran.valid) {
4723     __kmp_create_os_id_masks(&numUnique, affinity, [&](int idx) {
4724       KMP_ASSERT(idx >= -1);
4725       for (int i = idx + 1; i < numAddrs; ++i)
4726         if (__kmp_topology->at(i).attrs.contains(affinity.core_attr_gran))
4727           return i;
4728       return numAddrs;
4729     });
4730     if (!affinity.os_id_masks) {
4731       const char *core_attribute;
4732       if (affinity.core_attr_gran.core_eff != kmp_hw_attr_t::UNKNOWN_CORE_EFF)
4733         core_attribute = "core_efficiency";
4734       else
4735         core_attribute = "core_type";
4736       KMP_AFF_WARNING(affinity, AffIgnoringNotAvailable, env_var,
4737                       core_attribute,
4738                       __kmp_hw_get_catalog_string(KMP_HW_CORE, /*plural=*/true))
4739     }
4740   }
4741   // If core attributes did not work, or none were specified,
4742   // then make the OS Id mask table using the typical incremental way.
4743   if (!affinity.os_id_masks) {
4744     __kmp_create_os_id_masks(&numUnique, affinity, [](int idx) {
4745       KMP_ASSERT(idx >= -1);
4746       return idx + 1;
4747     });
4748   }
4749   if (affinity.gran_levels == 0) {
4750     KMP_DEBUG_ASSERT((int)numUnique == __kmp_avail_proc);
4751   }
4752 
4753   switch (affinity.type) {
4754 
4755   case affinity_explicit:
4756     KMP_DEBUG_ASSERT(affinity.proclist != NULL);
4757     if (is_hidden_helper_affinity ||
4758         __kmp_nested_proc_bind.bind_types[0] == proc_bind_intel) {
4759       __kmp_affinity_process_proclist(affinity);
4760     } else {
4761       __kmp_affinity_process_placelist(affinity);
4762     }
4763     if (affinity.num_masks == 0) {
4764       KMP_AFF_WARNING(affinity, AffNoValidProcID);
4765       affinity.type = affinity_none;
4766       __kmp_create_affinity_none_places(affinity);
4767       affinity.flags.initialized = TRUE;
4768       return;
4769     }
4770     break;
4771 
4772   // The other affinity types rely on sorting the hardware threads according to
4773   // some permutation of the machine topology tree. Set affinity.compact
4774   // and affinity.offset appropriately, then jump to a common code
4775   // fragment to do the sort and create the array of affinity masks.
4776   case affinity_logical:
4777     affinity.compact = 0;
4778     if (affinity.offset) {
4779       affinity.offset =
4780           __kmp_nThreadsPerCore * affinity.offset % __kmp_avail_proc;
4781     }
4782     goto sortTopology;
4783 
4784   case affinity_physical:
4785     if (__kmp_nThreadsPerCore > 1) {
4786       affinity.compact = 1;
4787       if (affinity.compact >= depth) {
4788         affinity.compact = 0;
4789       }
4790     } else {
4791       affinity.compact = 0;
4792     }
4793     if (affinity.offset) {
4794       affinity.offset =
4795           __kmp_nThreadsPerCore * affinity.offset % __kmp_avail_proc;
4796     }
4797     goto sortTopology;
4798 
4799   case affinity_scatter:
4800     if (affinity.compact >= depth) {
4801       affinity.compact = 0;
4802     } else {
4803       affinity.compact = depth - 1 - affinity.compact;
4804     }
4805     goto sortTopology;
4806 
4807   case affinity_compact:
4808     if (affinity.compact >= depth) {
4809       affinity.compact = depth - 1;
4810     }
4811     goto sortTopology;
4812 
4813   case affinity_balanced:
4814     if (depth <= 1 || is_hidden_helper_affinity) {
4815       KMP_AFF_WARNING(affinity, AffBalancedNotAvail, env_var);
4816       affinity.type = affinity_none;
4817       __kmp_create_affinity_none_places(affinity);
4818       affinity.flags.initialized = TRUE;
4819       return;
4820     } else if (!__kmp_topology->is_uniform()) {
4821       // Save the depth for further usage
4822       __kmp_aff_depth = depth;
4823 
4824       int core_level =
4825           __kmp_affinity_find_core_level(__kmp_avail_proc, depth - 1);
4826       int ncores = __kmp_affinity_compute_ncores(__kmp_avail_proc, depth - 1,
4827                                                  core_level);
4828       int maxprocpercore = __kmp_affinity_max_proc_per_core(
4829           __kmp_avail_proc, depth - 1, core_level);
4830 
4831       int nproc = ncores * maxprocpercore;
4832       if ((nproc < 2) || (nproc < __kmp_avail_proc)) {
4833         KMP_AFF_WARNING(affinity, AffBalancedNotAvail, env_var);
4834         affinity.type = affinity_none;
4835         __kmp_create_affinity_none_places(affinity);
4836         affinity.flags.initialized = TRUE;
4837         return;
4838       }
4839 
4840       procarr = (int *)__kmp_allocate(sizeof(int) * nproc);
4841       for (int i = 0; i < nproc; i++) {
4842         procarr[i] = -1;
4843       }
4844 
4845       int lastcore = -1;
4846       int inlastcore = 0;
4847       for (int i = 0; i < __kmp_avail_proc; i++) {
4848         int proc = __kmp_topology->at(i).os_id;
4849         int core = __kmp_affinity_find_core(i, depth - 1, core_level);
4850 
4851         if (core == lastcore) {
4852           inlastcore++;
4853         } else {
4854           inlastcore = 0;
4855         }
4856         lastcore = core;
4857 
4858         procarr[core * maxprocpercore + inlastcore] = proc;
4859       }
4860     }
4861     if (affinity.compact >= depth) {
4862       affinity.compact = depth - 1;
4863     }
4864 
4865   sortTopology:
4866     // Allocate the gtid->affinity mask table.
4867     if (affinity.flags.dups) {
4868       affinity.num_masks = __kmp_avail_proc;
4869     } else {
4870       affinity.num_masks = numUnique;
4871     }
4872 
4873     if ((__kmp_nested_proc_bind.bind_types[0] != proc_bind_intel) &&
4874         (__kmp_affinity_num_places > 0) &&
4875         ((unsigned)__kmp_affinity_num_places < affinity.num_masks) &&
4876         !is_hidden_helper_affinity) {
4877       affinity.num_masks = __kmp_affinity_num_places;
4878     }
4879 
4880     KMP_CPU_ALLOC_ARRAY(affinity.masks, affinity.num_masks);
4881 
4882     // Sort the topology table according to the current setting of
4883     // affinity.compact, then fill out affinity.masks.
4884     __kmp_topology->sort_compact(affinity);
4885     {
4886       int i;
4887       unsigned j;
4888       int num_hw_threads = __kmp_topology->get_num_hw_threads();
4889       kmp_full_mask_modifier_t full_mask;
4890       for (i = 0, j = 0; i < num_hw_threads; i++) {
4891         if ((!affinity.flags.dups) && (!__kmp_topology->at(i).leader)) {
4892           continue;
4893         }
4894         int osId = __kmp_topology->at(i).os_id;
4895 
4896         kmp_affin_mask_t *src = KMP_CPU_INDEX(affinity.os_id_masks, osId);
4897         kmp_affin_mask_t *dest = KMP_CPU_INDEX(affinity.masks, j);
4898         KMP_ASSERT(KMP_CPU_ISSET(osId, src));
4899         KMP_CPU_COPY(dest, src);
4900         full_mask.include(src);
4901         if (++j >= affinity.num_masks) {
4902           break;
4903         }
4904       }
4905       KMP_DEBUG_ASSERT(j == affinity.num_masks);
4906       // See if the places list further restricts or changes the full mask
4907       if (full_mask.restrict_to_mask() && affinity.flags.verbose) {
4908         __kmp_topology->print(env_var);
4909       }
4910     }
4911     // Sort the topology back using ids
4912     __kmp_topology->sort_ids();
4913     break;
4914 
4915   default:
4916     KMP_ASSERT2(0, "Unexpected affinity setting");
4917   }
4918   __kmp_aux_affinity_initialize_other_data(affinity);
4919   affinity.flags.initialized = TRUE;
4920 }
4921 
4922 void __kmp_affinity_initialize(kmp_affinity_t &affinity) {
4923   // Much of the code above was written assuming that if a machine was not
4924   // affinity capable, then affinity type == affinity_none.
4925   // We now explicitly represent this as affinity type == affinity_disabled.
4926   // There are too many checks for affinity type == affinity_none in this code.
4927   // Instead of trying to change them all, check if
4928   // affinity type == affinity_disabled, and if so, slam it with affinity_none,
4929   // call the real initialization routine, then restore affinity type to
4930   // affinity_disabled.
4931   int disabled = (affinity.type == affinity_disabled);
4932   if (!KMP_AFFINITY_CAPABLE())
4933     KMP_ASSERT(disabled);
4934   if (disabled)
4935     affinity.type = affinity_none;
4936   __kmp_aux_affinity_initialize(affinity);
4937   if (disabled)
4938     affinity.type = affinity_disabled;
4939 }
4940 
4941 void __kmp_affinity_uninitialize(void) {
4942   for (kmp_affinity_t *affinity : __kmp_affinities) {
4943     if (affinity->masks != NULL)
4944       KMP_CPU_FREE_ARRAY(affinity->masks, affinity->num_masks);
4945     if (affinity->os_id_masks != NULL)
4946       KMP_CPU_FREE_ARRAY(affinity->os_id_masks, affinity->num_os_id_masks);
4947     if (affinity->proclist != NULL)
4948       __kmp_free(affinity->proclist);
4949     if (affinity->ids != NULL)
4950       __kmp_free(affinity->ids);
4951     if (affinity->attrs != NULL)
4952       __kmp_free(affinity->attrs);
4953     *affinity = KMP_AFFINITY_INIT(affinity->env_var);
4954   }
4955   if (__kmp_affin_origMask != NULL) {
4956     if (KMP_AFFINITY_CAPABLE()) {
4957 #if KMP_OS_AIX
4958       // Uninitialize by unbinding the thread.
4959       bindprocessor(BINDTHREAD, thread_self(), PROCESSOR_CLASS_ANY);
4960 #else
4961       __kmp_set_system_affinity(__kmp_affin_origMask, FALSE);
4962 #endif
4963     }
4964     KMP_CPU_FREE(__kmp_affin_origMask);
4965     __kmp_affin_origMask = NULL;
4966   }
4967   __kmp_affinity_num_places = 0;
4968   if (procarr != NULL) {
4969     __kmp_free(procarr);
4970     procarr = NULL;
4971   }
4972   if (__kmp_osid_to_hwthread_map) {
4973     __kmp_free(__kmp_osid_to_hwthread_map);
4974     __kmp_osid_to_hwthread_map = NULL;
4975   }
4976 #if KMP_USE_HWLOC
4977   if (__kmp_hwloc_topology != NULL) {
4978     hwloc_topology_destroy(__kmp_hwloc_topology);
4979     __kmp_hwloc_topology = NULL;
4980   }
4981 #endif
4982   if (__kmp_hw_subset) {
4983     kmp_hw_subset_t::deallocate(__kmp_hw_subset);
4984     __kmp_hw_subset = nullptr;
4985   }
4986   if (__kmp_topology) {
4987     kmp_topology_t::deallocate(__kmp_topology);
4988     __kmp_topology = nullptr;
4989   }
4990   KMPAffinity::destroy_api();
4991 }
4992 
4993 static void __kmp_select_mask_by_gtid(int gtid, const kmp_affinity_t *affinity,
4994                                       int *place, kmp_affin_mask_t **mask) {
4995   int mask_idx;
4996   bool is_hidden_helper = KMP_HIDDEN_HELPER_THREAD(gtid);
4997   if (is_hidden_helper)
4998     // The first gtid is the regular primary thread, the second gtid is the main
4999     // thread of the hidden team, which does not participate in task execution.
5000     mask_idx = gtid - 2;
5001   else
5002     mask_idx = __kmp_adjust_gtid_for_hidden_helpers(gtid);
5003   KMP_DEBUG_ASSERT(affinity->num_masks > 0);
5004   *place = (mask_idx + affinity->offset) % affinity->num_masks;
5005   *mask = KMP_CPU_INDEX(affinity->masks, *place);
5006 }
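// Example of the wrap-around (offset and place count are assumed values):
// with affinity->offset == 1 and affinity->num_masks == 4, a hidden helper
// thread with gtid 3 gets mask_idx 3 - 2 == 1 and is assigned place
// (1 + 1) % 4 == 2; any mask_idx beyond the last place simply wraps back to
// the start of the place list.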
5007 
5008 // This function initializes the per-thread data concerning affinity including
5009 // the mask and topology information
5010 void __kmp_affinity_set_init_mask(int gtid, int isa_root) {
5011 
5012   kmp_info_t *th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[gtid]);
5013 
5014   // Set the thread topology information to default of unknown
5015   for (int id = 0; id < KMP_HW_LAST; ++id)
5016     th->th.th_topology_ids.ids[id] = kmp_hw_thread_t::UNKNOWN_ID;
5017   th->th.th_topology_attrs = KMP_AFFINITY_ATTRS_UNKNOWN;
5018 
5019   if (!KMP_AFFINITY_CAPABLE()) {
5020     return;
5021   }
5022 
5023   if (th->th.th_affin_mask == NULL) {
5024     KMP_CPU_ALLOC(th->th.th_affin_mask);
5025   } else {
5026     KMP_CPU_ZERO(th->th.th_affin_mask);
5027   }
5028 
5029   // Copy the thread mask to the kmp_info_t structure. If
5030   // __kmp_affinity.type == affinity_none, copy the "full" mask, i.e.
5031   // one that has all of the OS proc ids set, or if
5032   // __kmp_affinity.flags.respect is set, then the full mask is the
5033   // same as the mask of the initialization thread.
5034   kmp_affin_mask_t *mask;
5035   int i;
5036   const kmp_affinity_t *affinity;
5037   bool is_hidden_helper = KMP_HIDDEN_HELPER_THREAD(gtid);
5038 
5039   if (is_hidden_helper)
5040     affinity = &__kmp_hh_affinity;
5041   else
5042     affinity = &__kmp_affinity;
5043 
5044   if (KMP_AFFINITY_NON_PROC_BIND || is_hidden_helper) {
5045     if ((affinity->type == affinity_none) ||
5046         (affinity->type == affinity_balanced) ||
5047         KMP_HIDDEN_HELPER_MAIN_THREAD(gtid)) {
5048 #if KMP_GROUP_AFFINITY
5049       if (__kmp_num_proc_groups > 1) {
5050         return;
5051       }
5052 #endif
5053       KMP_ASSERT(__kmp_affin_fullMask != NULL);
5054       i = 0;
5055       mask = __kmp_affin_fullMask;
5056     } else {
5057       __kmp_select_mask_by_gtid(gtid, affinity, &i, &mask);
5058     }
5059   } else {
5060     if (!isa_root || __kmp_nested_proc_bind.bind_types[0] == proc_bind_false) {
5061 #if KMP_GROUP_AFFINITY
5062       if (__kmp_num_proc_groups > 1) {
5063         return;
5064       }
5065 #endif
5066       KMP_ASSERT(__kmp_affin_fullMask != NULL);
5067       i = KMP_PLACE_ALL;
5068       mask = __kmp_affin_fullMask;
5069     } else {
5070       __kmp_select_mask_by_gtid(gtid, affinity, &i, &mask);
5071     }
5072   }
5073 
5074   th->th.th_current_place = i;
5075   if (isa_root && !is_hidden_helper) {
5076     th->th.th_new_place = i;
5077     th->th.th_first_place = 0;
5078     th->th.th_last_place = affinity->num_masks - 1;
5079   } else if (KMP_AFFINITY_NON_PROC_BIND) {
5080     // When using a Non-OMP_PROC_BIND affinity method,
5081     // set all threads' place-partition-var to the entire place list
5082     th->th.th_first_place = 0;
5083     th->th.th_last_place = affinity->num_masks - 1;
5084   }
5085   // Copy topology information associated with the place
5086   if (i >= 0) {
5087     th->th.th_topology_ids = __kmp_affinity.ids[i];
5088     th->th.th_topology_attrs = __kmp_affinity.attrs[i];
5089   }
5090 
5091   if (i == KMP_PLACE_ALL) {
5092     KA_TRACE(100, ("__kmp_affinity_set_init_mask: setting T#%d to all places\n",
5093                    gtid));
5094   } else {
5095     KA_TRACE(100, ("__kmp_affinity_set_init_mask: setting T#%d to place %d\n",
5096                    gtid, i));
5097   }
5098 
5099   KMP_CPU_COPY(th->th.th_affin_mask, mask);
5100 }
5101 
5102 void __kmp_affinity_bind_init_mask(int gtid) {
5103   if (!KMP_AFFINITY_CAPABLE()) {
5104     return;
5105   }
5106   kmp_info_t *th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[gtid]);
5107   const kmp_affinity_t *affinity;
5108   const char *env_var;
5109   bool is_hidden_helper = KMP_HIDDEN_HELPER_THREAD(gtid);
5110 
5111   if (is_hidden_helper)
5112     affinity = &__kmp_hh_affinity;
5113   else
5114     affinity = &__kmp_affinity;
5115   env_var = __kmp_get_affinity_env_var(*affinity, /*for_binding=*/true);
5116   /* to avoid duplicate printing (will be correctly printed on barrier) */
5117   if (affinity->flags.verbose && (affinity->type == affinity_none ||
5118                                   (th->th.th_current_place != KMP_PLACE_ALL &&
5119                                    affinity->type != affinity_balanced)) &&
5120       !KMP_HIDDEN_HELPER_MAIN_THREAD(gtid)) {
5121     char buf[KMP_AFFIN_MASK_PRINT_LEN];
5122     __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
5123                               th->th.th_affin_mask);
5124     KMP_INFORM(BoundToOSProcSet, env_var, (kmp_int32)getpid(), __kmp_gettid(),
5125                gtid, buf);
5126   }
5127 
5128 #if KMP_OS_WINDOWS
5129   // On Windows* OS, the process affinity mask might have changed. If the user
5130   // didn't request affinity and this call fails, just continue silently.
5131   // See CQ171393.
5132   if (affinity->type == affinity_none) {
5133     __kmp_set_system_affinity(th->th.th_affin_mask, FALSE);
5134   } else
5135 #endif
5136 #ifndef KMP_OS_AIX
5137     // Do not set the full mask as the init mask on AIX.
5138     __kmp_set_system_affinity(th->th.th_affin_mask, TRUE);
5139 #endif
5140 }
5141 
5142 void __kmp_affinity_bind_place(int gtid) {
5143   // Hidden helper threads should not be affected by OMP_PLACES/OMP_PROC_BIND
5144   if (!KMP_AFFINITY_CAPABLE() || KMP_HIDDEN_HELPER_THREAD(gtid)) {
5145     return;
5146   }
5147 
5148   kmp_info_t *th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[gtid]);
5149 
5150   KA_TRACE(100, ("__kmp_affinity_bind_place: binding T#%d to place %d (current "
5151                  "place = %d)\n",
5152                  gtid, th->th.th_new_place, th->th.th_current_place));
5153 
5154   // Check that the new place is within this thread's partition.
5155   KMP_DEBUG_ASSERT(th->th.th_affin_mask != NULL);
5156   KMP_ASSERT(th->th.th_new_place >= 0);
5157   KMP_ASSERT((unsigned)th->th.th_new_place <= __kmp_affinity.num_masks);
5158   if (th->th.th_first_place <= th->th.th_last_place) {
5159     KMP_ASSERT((th->th.th_new_place >= th->th.th_first_place) &&
5160                (th->th.th_new_place <= th->th.th_last_place));
5161   } else {
5162     KMP_ASSERT((th->th.th_new_place <= th->th.th_first_place) ||
5163                (th->th.th_new_place >= th->th.th_last_place));
5164   }
5165 
5166   // Copy the thread mask to the kmp_info_t structure,
5167   // and set this thread's affinity.
5168   kmp_affin_mask_t *mask =
5169       KMP_CPU_INDEX(__kmp_affinity.masks, th->th.th_new_place);
5170   KMP_CPU_COPY(th->th.th_affin_mask, mask);
5171   th->th.th_current_place = th->th.th_new_place;
5172 
5173   if (__kmp_affinity.flags.verbose) {
5174     char buf[KMP_AFFIN_MASK_PRINT_LEN];
5175     __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
5176                               th->th.th_affin_mask);
5177     KMP_INFORM(BoundToOSProcSet, "OMP_PROC_BIND", (kmp_int32)getpid(),
5178                __kmp_gettid(), gtid, buf);
5179   }
5180   __kmp_set_system_affinity(th->th.th_affin_mask, TRUE);
5181 }
5182 
5183 int __kmp_aux_set_affinity(void **mask) {
5184   int gtid;
5185   kmp_info_t *th;
5186   int retval;
5187 
5188   if (!KMP_AFFINITY_CAPABLE()) {
5189     return -1;
5190   }
5191 
5192   gtid = __kmp_entry_gtid();
5193   KA_TRACE(
5194       1000, (""); {
5195         char buf[KMP_AFFIN_MASK_PRINT_LEN];
5196         __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
5197                                   (kmp_affin_mask_t *)(*mask));
5198         __kmp_debug_printf(
5199             "kmp_set_affinity: setting affinity mask for thread %d = %s\n",
5200             gtid, buf);
5201       });
5202 
5203   if (__kmp_env_consistency_check) {
5204     if ((mask == NULL) || (*mask == NULL)) {
5205       KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
5206     } else {
5207       unsigned proc;
5208       int num_procs = 0;
5209 
5210       KMP_CPU_SET_ITERATE(proc, ((kmp_affin_mask_t *)(*mask))) {
5211         if (!KMP_CPU_ISSET(proc, __kmp_affin_fullMask)) {
5212           KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
5213         }
5214         if (!KMP_CPU_ISSET(proc, (kmp_affin_mask_t *)(*mask))) {
5215           continue;
5216         }
5217         num_procs++;
5218       }
5219       if (num_procs == 0) {
5220         KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
5221       }
5222 
5223 #if KMP_GROUP_AFFINITY
5224       if (__kmp_get_proc_group((kmp_affin_mask_t *)(*mask)) < 0) {
5225         KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
5226       }
5227 #endif /* KMP_GROUP_AFFINITY */
5228     }
5229   }
5230 
5231   th = __kmp_threads[gtid];
5232   KMP_DEBUG_ASSERT(th->th.th_affin_mask != NULL);
5233   retval = __kmp_set_system_affinity((kmp_affin_mask_t *)(*mask), FALSE);
5234   if (retval == 0) {
5235     KMP_CPU_COPY(th->th.th_affin_mask, (kmp_affin_mask_t *)(*mask));
5236   }
5237 
5238   th->th.th_current_place = KMP_PLACE_UNDEFINED;
5239   th->th.th_new_place = KMP_PLACE_UNDEFINED;
5240   th->th.th_first_place = 0;
5241   th->th.th_last_place = __kmp_affinity.num_masks - 1;
5242 
5243   // Turn off 4.0 affinity for the current thread at this parallel level.
5244   th->th.th_current_task->td_icvs.proc_bind = proc_bind_false;
5245 
5246   return retval;
5247 }
5248 
5249 int __kmp_aux_get_affinity(void **mask) {
5250   int gtid;
5251   int retval;
5252 #if KMP_OS_WINDOWS || KMP_OS_AIX || KMP_DEBUG
5253   kmp_info_t *th;
5254 #endif
5255   if (!KMP_AFFINITY_CAPABLE()) {
5256     return -1;
5257   }
5258 
5259   gtid = __kmp_entry_gtid();
5260 #if KMP_OS_WINDOWS || KMP_OS_AIX || KMP_DEBUG
5261   th = __kmp_threads[gtid];
5262 #else
5263   (void)gtid; // unused variable
5264 #endif
5265   KMP_DEBUG_ASSERT(th->th.th_affin_mask != NULL);
5266 
5267   KA_TRACE(
5268       1000, (""); {
5269         char buf[KMP_AFFIN_MASK_PRINT_LEN];
5270         __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
5271                                   th->th.th_affin_mask);
5272         __kmp_printf(
5273             "kmp_get_affinity: stored affinity mask for thread %d = %s\n", gtid,
5274             buf);
5275       });
5276 
5277   if (__kmp_env_consistency_check) {
5278     if ((mask == NULL) || (*mask == NULL)) {
5279       KMP_FATAL(AffinityInvalidMask, "kmp_get_affinity");
5280     }
5281   }
5282 
5283 #if !KMP_OS_WINDOWS && !KMP_OS_AIX
5284 
5285   retval = __kmp_get_system_affinity((kmp_affin_mask_t *)(*mask), FALSE);
5286   KA_TRACE(
5287       1000, (""); {
5288         char buf[KMP_AFFIN_MASK_PRINT_LEN];
5289         __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
5290                                   (kmp_affin_mask_t *)(*mask));
5291         __kmp_printf(
5292             "kmp_get_affinity: system affinity mask for thread %d = %s\n", gtid,
5293             buf);
5294       });
5295   return retval;
5296 
5297 #else
5298   (void)retval;
5299 
5300   KMP_CPU_COPY((kmp_affin_mask_t *)(*mask), th->th.th_affin_mask);
5301   return 0;
5302 
5303 #endif /* !KMP_OS_WINDOWS && !KMP_OS_AIX */
5304 }
5305 
5306 int __kmp_aux_get_affinity_max_proc() {
5307   if (!KMP_AFFINITY_CAPABLE()) {
5308     return 0;
5309   }
5310 #if KMP_GROUP_AFFINITY
5311   if (__kmp_num_proc_groups > 1) {
5312     return (int)(__kmp_num_proc_groups * sizeof(DWORD_PTR) * CHAR_BIT);
5313   }
5314 #endif
5315   return __kmp_xproc;
5316 }
5317 
5318 int __kmp_aux_set_affinity_mask_proc(int proc, void **mask) {
5319   if (!KMP_AFFINITY_CAPABLE()) {
5320     return -1;
5321   }
5322 
5323   KA_TRACE(
5324       1000, (""); {
5325         int gtid = __kmp_entry_gtid();
5326         char buf[KMP_AFFIN_MASK_PRINT_LEN];
5327         __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
5328                                   (kmp_affin_mask_t *)(*mask));
5329         __kmp_debug_printf("kmp_set_affinity_mask_proc: setting proc %d in "
5330                            "affinity mask for thread %d = %s\n",
5331                            proc, gtid, buf);
5332       });
5333 
5334   if (__kmp_env_consistency_check) {
5335     if ((mask == NULL) || (*mask == NULL)) {
5336       KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity_mask_proc");
5337     }
5338   }
5339 
5340   if ((proc < 0) || (proc >= __kmp_aux_get_affinity_max_proc())) {
5341     return -1;
5342   }
5343   if (!KMP_CPU_ISSET(proc, __kmp_affin_fullMask)) {
5344     return -2;
5345   }
5346 
5347   KMP_CPU_SET(proc, (kmp_affin_mask_t *)(*mask));
5348   return 0;
5349 }
5350 
5351 int __kmp_aux_unset_affinity_mask_proc(int proc, void **mask) {
5352   if (!KMP_AFFINITY_CAPABLE()) {
5353     return -1;
5354   }
5355 
5356   KA_TRACE(
5357       1000, (""); {
5358         int gtid = __kmp_entry_gtid();
5359         char buf[KMP_AFFIN_MASK_PRINT_LEN];
5360         __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
5361                                   (kmp_affin_mask_t *)(*mask));
5362         __kmp_debug_printf("kmp_unset_affinity_mask_proc: unsetting proc %d in "
5363                            "affinity mask for thread %d = %s\n",
5364                            proc, gtid, buf);
5365       });
5366 
5367   if (__kmp_env_consistency_check) {
5368     if ((mask == NULL) || (*mask == NULL)) {
5369       KMP_FATAL(AffinityInvalidMask, "kmp_unset_affinity_mask_proc");
5370     }
5371   }
5372 
5373   if ((proc < 0) || (proc >= __kmp_aux_get_affinity_max_proc())) {
5374     return -1;
5375   }
5376   if (!KMP_CPU_ISSET(proc, __kmp_affin_fullMask)) {
5377     return -2;
5378   }
5379 
5380   KMP_CPU_CLR(proc, (kmp_affin_mask_t *)(*mask));
5381   return 0;
5382 }
5383 
5384 int __kmp_aux_get_affinity_mask_proc(int proc, void **mask) {
5385   if (!KMP_AFFINITY_CAPABLE()) {
5386     return -1;
5387   }
5388 
5389   KA_TRACE(
5390       1000, (""); {
5391         int gtid = __kmp_entry_gtid();
5392         char buf[KMP_AFFIN_MASK_PRINT_LEN];
5393         __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
5394                                   (kmp_affin_mask_t *)(*mask));
5395         __kmp_debug_printf("kmp_get_affinity_mask_proc: getting proc %d in "
5396                            "affinity mask for thread %d = %s\n",
5397                            proc, gtid, buf);
5398       });
5399 
5400   if (__kmp_env_consistency_check) {
5401     if ((mask == NULL) || (*mask == NULL)) {
5402       KMP_FATAL(AffinityInvalidMask, "kmp_get_affinity_mask_proc");
5403     }
5404   }
5405 
5406   if ((proc < 0) || (proc >= __kmp_aux_get_affinity_max_proc())) {
5407     return -1;
5408   }
5409   if (!KMP_CPU_ISSET(proc, __kmp_affin_fullMask)) {
5410     return 0;
5411   }
5412 
5413   return KMP_CPU_ISSET(proc, (kmp_affin_mask_t *)(*mask));
5414 }
5415 
5416 #if KMP_WEIGHTED_ITERATIONS_SUPPORTED
5417 // Returns first os proc id with ATOM core
5418 int __kmp_get_first_osid_with_ecore(void) {
5419   int low = 0;
5420   int high = __kmp_topology->get_num_hw_threads() - 1;
5421   int mid = 0;
5422   while (high - low > 1) {
5423     mid = (high + low) / 2;
5424     if (__kmp_topology->at(mid).attrs.get_core_type() ==
5425         KMP_HW_CORE_TYPE_CORE) {
5426       low = mid + 1;
5427     } else {
5428       high = mid;
5429     }
5430   }
5431   if (__kmp_topology->at(mid).attrs.get_core_type() == KMP_HW_CORE_TYPE_ATOM) {
5432     return mid;
5433   }
5434   return -1;
5435 }
5436 #endif
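// Note on the lookup above (describing its assumed precondition, not adding
// behavior): the bisection is only meaningful when the hardware-thread table
// groups every KMP_HW_CORE_TYPE_CORE entry ahead of the
// KMP_HW_CORE_TYPE_ATOM entries; the trailing check catches the case where
// the search lands on a P-core entry and reports -1 instead of returning a
// non-E-core id.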
5437 
5438 // Dynamic affinity settings - Affinity balanced
5439 void __kmp_balanced_affinity(kmp_info_t *th, int nthreads) {
5440   KMP_DEBUG_ASSERT(th);
5441   bool fine_gran = true;
5442   int tid = th->th.th_info.ds.ds_tid;
5443   const char *env_var = "KMP_AFFINITY";
5444 
5445   // Do not perform balanced affinity for the hidden helper threads
5446   if (KMP_HIDDEN_HELPER_THREAD(__kmp_gtid_from_thread(th)))
5447     return;
5448 
5449   switch (__kmp_affinity.gran) {
5450   case KMP_HW_THREAD:
5451     break;
5452   case KMP_HW_CORE:
5453     if (__kmp_nThreadsPerCore > 1) {
5454       fine_gran = false;
5455     }
5456     break;
5457   case KMP_HW_SOCKET:
5458     if (nCoresPerPkg > 1) {
5459       fine_gran = false;
5460     }
5461     break;
5462   default:
5463     fine_gran = false;
5464   }
5465 
5466   if (__kmp_topology->is_uniform()) {
5467     int coreID;
5468     int threadID;
5469     // Number of hyper threads per core in HT machine
5470     int __kmp_nth_per_core = __kmp_avail_proc / __kmp_ncores;
5471     // Number of cores
5472     int ncores = __kmp_ncores;
5473     if ((nPackages > 1) && (__kmp_nth_per_core <= 1)) {
5474       __kmp_nth_per_core = __kmp_avail_proc / nPackages;
5475       ncores = nPackages;
5476     }
5477     // How many threads will be bound to each core
5478     int chunk = nthreads / ncores;
5479     // How many cores will have an additional thread bound to it - "big cores"
5480     int big_cores = nthreads % ncores;
5481     // Number of threads on the big cores
5482     int big_nth = (chunk + 1) * big_cores;
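    // Worked example of the split above (thread and core counts are assumed):
    // with nthreads == 10 and ncores == 4, chunk == 2 and big_cores == 2, so
    // big_nth == 6; tids 0-2 land on core 0 and tids 3-5 on core 1 (three per
    // "big" core), while tids 6-9 take the else branch below and are placed
    // two per core on cores 2 and 3.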
5483     if (tid < big_nth) {
5484       coreID = tid / (chunk + 1);
5485       threadID = (tid % (chunk + 1)) % __kmp_nth_per_core;
5486     } else { // tid >= big_nth
5487       coreID = (tid - big_cores) / chunk;
5488       threadID = ((tid - big_cores) % chunk) % __kmp_nth_per_core;
5489     }
5490     KMP_DEBUG_ASSERT2(KMP_AFFINITY_CAPABLE(),
5491                       "Illegal set affinity operation when not capable");
5492 
5493     kmp_affin_mask_t *mask = th->th.th_affin_mask;
5494     KMP_CPU_ZERO(mask);
5495 
5496     if (fine_gran) {
5497       int osID =
5498           __kmp_topology->at(coreID * __kmp_nth_per_core + threadID).os_id;
5499       KMP_CPU_SET(osID, mask);
5500     } else {
5501       for (int i = 0; i < __kmp_nth_per_core; i++) {
5502         int osID;
5503         osID = __kmp_topology->at(coreID * __kmp_nth_per_core + i).os_id;
5504         KMP_CPU_SET(osID, mask);
5505       }
5506     }
5507     if (__kmp_affinity.flags.verbose) {
5508       char buf[KMP_AFFIN_MASK_PRINT_LEN];
5509       __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, mask);
5510       KMP_INFORM(BoundToOSProcSet, env_var, (kmp_int32)getpid(), __kmp_gettid(),
5511                  tid, buf);
5512     }
5513     __kmp_affinity_get_thread_topology_info(th);
5514     __kmp_set_system_affinity(mask, TRUE);
5515   } else { // Non-uniform topology
5516 
5517     kmp_affin_mask_t *mask = th->th.th_affin_mask;
5518     KMP_CPU_ZERO(mask);
5519 
5520     int core_level =
5521         __kmp_affinity_find_core_level(__kmp_avail_proc, __kmp_aff_depth - 1);
5522     int ncores = __kmp_affinity_compute_ncores(__kmp_avail_proc,
5523                                                __kmp_aff_depth - 1, core_level);
5524     int nth_per_core = __kmp_affinity_max_proc_per_core(
5525         __kmp_avail_proc, __kmp_aff_depth - 1, core_level);
5526 
5527     // For performance gain consider the special case nthreads ==
5528     // __kmp_avail_proc
5529     if (nthreads == __kmp_avail_proc) {
5530       if (fine_gran) {
5531         int osID = __kmp_topology->at(tid).os_id;
5532         KMP_CPU_SET(osID, mask);
5533       } else {
5534         int core =
5535             __kmp_affinity_find_core(tid, __kmp_aff_depth - 1, core_level);
5536         for (int i = 0; i < __kmp_avail_proc; i++) {
5537           int osID = __kmp_topology->at(i).os_id;
5538           if (__kmp_affinity_find_core(i, __kmp_aff_depth - 1, core_level) ==
5539               core) {
5540             KMP_CPU_SET(osID, mask);
5541           }
5542         }
5543       }
5544     } else if (nthreads <= ncores) {
5545 
5546       int core = 0;
5547       for (int i = 0; i < ncores; i++) {
5548         // Check if this core from procarr[] is in the mask
5549         int in_mask = 0;
5550         for (int j = 0; j < nth_per_core; j++) {
5551           if (procarr[i * nth_per_core + j] != -1) {
5552             in_mask = 1;
5553             break;
5554           }
5555         }
5556         if (in_mask) {
5557           if (tid == core) {
5558             for (int j = 0; j < nth_per_core; j++) {
5559               int osID = procarr[i * nth_per_core + j];
5560               if (osID != -1) {
5561                 KMP_CPU_SET(osID, mask);
5562                 // For fine granularity it is enough to set the first available
5563                 // osID for this core
5564                 if (fine_gran) {
5565                   break;
5566                 }
5567               }
5568             }
5569             break;
5570           } else {
5571             core++;
5572           }
5573         }
5574       }
5575     } else { // nthreads > ncores
5576       // Array to save the number of processors at each core
5577       int *nproc_at_core = (int *)KMP_ALLOCA(sizeof(int) * ncores);
5578       // Array to save the number of cores with "x" available processors;
5579       int *ncores_with_x_procs =
5580           (int *)KMP_ALLOCA(sizeof(int) * (nth_per_core + 1));
5581       // Array to save the number of cores with # procs from x to nth_per_core
5582       int *ncores_with_x_to_max_procs =
5583           (int *)KMP_ALLOCA(sizeof(int) * (nth_per_core + 1));
5584 
5585       for (int i = 0; i <= nth_per_core; i++) {
5586         ncores_with_x_procs[i] = 0;
5587         ncores_with_x_to_max_procs[i] = 0;
5588       }
5589 
5590       for (int i = 0; i < ncores; i++) {
5591         int cnt = 0;
5592         for (int j = 0; j < nth_per_core; j++) {
5593           if (procarr[i * nth_per_core + j] != -1) {
5594             cnt++;
5595           }
5596         }
5597         nproc_at_core[i] = cnt;
5598         ncores_with_x_procs[cnt]++;
5599       }
5600 
5601       for (int i = 0; i <= nth_per_core; i++) {
5602         for (int j = i; j <= nth_per_core; j++) {
5603           ncores_with_x_to_max_procs[i] += ncores_with_x_procs[j];
5604         }
5605       }
5606 
5607       // Max number of processors
5608       int nproc = nth_per_core * ncores;
5609       // An array to keep the number of threads for each context
5610       int *newarr = (int *)__kmp_allocate(sizeof(int) * nproc);
5611       for (int i = 0; i < nproc; i++) {
5612         newarr[i] = 0;
5613       }
5614 
5615       int nth = nthreads;
5616       int flag = 0;
5617       while (nth > 0) {
5618         for (int j = 1; j <= nth_per_core; j++) {
5619           int cnt = ncores_with_x_to_max_procs[j];
5620           for (int i = 0; i < ncores; i++) {
5621             // Skip the core with 0 processors
5622             if (nproc_at_core[i] == 0) {
5623               continue;
5624             }
5625             for (int k = 0; k < nth_per_core; k++) {
5626               if (procarr[i * nth_per_core + k] != -1) {
5627                 if (newarr[i * nth_per_core + k] == 0) {
5628                   newarr[i * nth_per_core + k] = 1;
5629                   cnt--;
5630                   nth--;
5631                   break;
5632                 } else {
5633                   if (flag != 0) {
5634                     newarr[i * nth_per_core + k]++;
5635                     cnt--;
5636                     nth--;
5637                     break;
5638                   }
5639                 }
5640               }
5641             }
5642             if (cnt == 0 || nth == 0) {
5643               break;
5644             }
5645           }
5646           if (nth == 0) {
5647             break;
5648           }
5649         }
5650         flag = 1;
5651       }
5652       int sum = 0;
5653       for (int i = 0; i < nproc; i++) {
5654         sum += newarr[i];
5655         if (sum > tid) {
5656           if (fine_gran) {
5657             int osID = procarr[i];
5658             KMP_CPU_SET(osID, mask);
5659           } else {
5660             int coreID = i / nth_per_core;
5661             for (int ii = 0; ii < nth_per_core; ii++) {
5662               int osID = procarr[coreID * nth_per_core + ii];
5663               if (osID != -1) {
5664                 KMP_CPU_SET(osID, mask);
5665               }
5666             }
5667           }
5668           break;
5669         }
5670       }
5671       __kmp_free(newarr);
5672     }
5673 
5674     if (__kmp_affinity.flags.verbose) {
5675       char buf[KMP_AFFIN_MASK_PRINT_LEN];
5676       __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, mask);
5677       KMP_INFORM(BoundToOSProcSet, env_var, (kmp_int32)getpid(), __kmp_gettid(),
5678                  tid, buf);
5679     }
5680     __kmp_affinity_get_thread_topology_info(th);
5681     __kmp_set_system_affinity(mask, TRUE);
5682   }
5683 }
5684 
5685 #if KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_NETBSD || KMP_OS_DRAGONFLY ||     \
5686     KMP_OS_AIX
5687 // We don't need this entry for Windows because
5688 // there is the GetProcessAffinityMask() API
5689 //
5690 // The intended usage is indicated by these steps:
5691 // 1) The user gets the current affinity mask
5692 // 2) Then sets the affinity by calling this function
5693 // 3) Error check the return value
5694 // 4) Use non-OpenMP parallelization
5695 // 5) Reset the affinity to what was stored in step 1)
5696 #ifdef __cplusplus
5697 extern "C"
5698 #endif
5699     int
5700     kmp_set_thread_affinity_mask_initial()
5701 // the function returns 0 on success,
5702 //   -1 if we cannot bind thread
5703 //   >0 (errno) if an error happened during binding
5704 {
5705   int gtid = __kmp_get_gtid();
5706   if (gtid < 0) {
5707     // Do not touch non-omp threads
5708     KA_TRACE(30, ("kmp_set_thread_affinity_mask_initial: "
5709                   "non-omp thread, returning\n"));
5710     return -1;
5711   }
5712   if (!KMP_AFFINITY_CAPABLE() || !__kmp_init_middle) {
5713     KA_TRACE(30, ("kmp_set_thread_affinity_mask_initial: "
5714                   "affinity not initialized, returning\n"));
5715     return -1;
5716   }
5717   KA_TRACE(30, ("kmp_set_thread_affinity_mask_initial: "
5718                 "set full mask for thread %d\n",
5719                 gtid));
5720   KMP_DEBUG_ASSERT(__kmp_affin_fullMask != NULL);
5721 #if KMP_OS_AIX
5722   return bindprocessor(BINDTHREAD, thread_self(), PROCESSOR_CLASS_ANY);
5723 #else
5724   return __kmp_set_system_affinity(__kmp_affin_fullMask, FALSE);
5725 #endif
5726 }
5727 #endif
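// Hedged usage sketch for the entry point above. The caller-side code is
// hypothetical (run_non_openmp_parallel_work is a placeholder) and simply
// mirrors steps 1-5 from the comment preceding the definition, assuming a
// glibc-style pthread affinity API:
//
//   cpu_set_t saved;
//   pthread_getaffinity_np(pthread_self(), sizeof(saved), &saved); // step 1
//   if (kmp_set_thread_affinity_mask_initial() == 0) {             // steps 2, 3
//     run_non_openmp_parallel_work();                              // step 4
//   }
//   pthread_setaffinity_np(pthread_self(), sizeof(saved), &saved); // step 5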
5728 
5729 #endif // KMP_AFFINITY_SUPPORTED
5730