xref: /freebsd/contrib/llvm-project/openmp/runtime/src/kmp_affinity.cpp (revision 0fca6ea1d4eea4c934cfff25ac9ee8ad6fe95583)
1  /*
2   * kmp_affinity.cpp -- affinity management
3   */
4  
5  //===----------------------------------------------------------------------===//
6  //
7  // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8  // See https://llvm.org/LICENSE.txt for license information.
9  // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10  //
11  //===----------------------------------------------------------------------===//
12  
13  #include "kmp.h"
14  #include "kmp_affinity.h"
15  #include "kmp_i18n.h"
16  #include "kmp_io.h"
17  #include "kmp_str.h"
18  #include "kmp_wrapper_getpid.h"
19  #if KMP_USE_HIER_SCHED
20  #include "kmp_dispatch_hier.h"
21  #endif
22  #if KMP_USE_HWLOC
23  // Copied from hwloc
24  #define HWLOC_GROUP_KIND_INTEL_MODULE 102
25  #define HWLOC_GROUP_KIND_INTEL_TILE 103
26  #define HWLOC_GROUP_KIND_INTEL_DIE 104
27  #define HWLOC_GROUP_KIND_WINDOWS_PROCESSOR_GROUP 220
28  #endif
29  #include <ctype.h>
30  
31  // The machine topology
32  kmp_topology_t *__kmp_topology = nullptr;
33  // KMP_HW_SUBSET environment variable
34  kmp_hw_subset_t *__kmp_hw_subset = nullptr;
35  
36  // Store the real or imagined machine hierarchy here
37  static hierarchy_info machine_hierarchy;
38  
39  void __kmp_cleanup_hierarchy() { machine_hierarchy.fini(); }
40  
41  #if KMP_AFFINITY_SUPPORTED
42  // Helper class to see if place lists further restrict the fullMask
43  class kmp_full_mask_modifier_t {
44    kmp_affin_mask_t *mask;
45  
46  public:
47    kmp_full_mask_modifier_t() {
48      KMP_CPU_ALLOC(mask);
49      KMP_CPU_ZERO(mask);
50    }
51    ~kmp_full_mask_modifier_t() {
52      KMP_CPU_FREE(mask);
53      mask = nullptr;
54    }
55    void include(const kmp_affin_mask_t *other) { KMP_CPU_UNION(mask, other); }
56    // If the new full mask is different from the current full mask,
57    // then switch them. Returns true if full mask was affected, false otherwise.
58    bool restrict_to_mask() {
59      // See if the new mask further restricts or changes the full mask
60      if (KMP_CPU_EQUAL(__kmp_affin_fullMask, mask) || KMP_CPU_ISEMPTY(mask))
61        return false;
62      return __kmp_topology->restrict_to_mask(mask);
63    }
64  };
65  
66  static inline const char *
67  __kmp_get_affinity_env_var(const kmp_affinity_t &affinity,
68                             bool for_binding = false) {
69    if (affinity.flags.omp_places) {
70      if (for_binding)
71        return "OMP_PROC_BIND";
72      return "OMP_PLACES";
73    }
74    return affinity.env_var;
75  }
76  #endif // KMP_AFFINITY_SUPPORTED
77  
78  void __kmp_get_hierarchy(kmp_uint32 nproc, kmp_bstate_t *thr_bar) {
79    kmp_uint32 depth;
80    // The test below is true if affinity is available, but set to "none". Need to
81    // init on first use of hierarchical barrier.
82    if (TCR_1(machine_hierarchy.uninitialized))
83      machine_hierarchy.init(nproc);
84  
85  // Adjust the hierarchy in case the number of threads exceeds the original
86    if (nproc > machine_hierarchy.base_num_threads)
87      machine_hierarchy.resize(nproc);
88  
89    depth = machine_hierarchy.depth;
90    KMP_DEBUG_ASSERT(depth > 0);
91  
92    thr_bar->depth = depth;
93    __kmp_type_convert(machine_hierarchy.numPerLevel[0] - 1,
94                       &(thr_bar->base_leaf_kids));
95    thr_bar->skip_per_level = machine_hierarchy.skipPerLevel;
96  }
97  
98  static int nCoresPerPkg, nPackages;
99  static int __kmp_nThreadsPerCore;
100  #ifndef KMP_DFLT_NTH_CORES
101  static int __kmp_ncores;
102  #endif
103  
104  const char *__kmp_hw_get_catalog_string(kmp_hw_t type, bool plural) {
105    switch (type) {
106    case KMP_HW_SOCKET:
107      return ((plural) ? KMP_I18N_STR(Sockets) : KMP_I18N_STR(Socket));
108    case KMP_HW_DIE:
109      return ((plural) ? KMP_I18N_STR(Dice) : KMP_I18N_STR(Die));
110    case KMP_HW_MODULE:
111      return ((plural) ? KMP_I18N_STR(Modules) : KMP_I18N_STR(Module));
112    case KMP_HW_TILE:
113      return ((plural) ? KMP_I18N_STR(Tiles) : KMP_I18N_STR(Tile));
114    case KMP_HW_NUMA:
115      return ((plural) ? KMP_I18N_STR(NumaDomains) : KMP_I18N_STR(NumaDomain));
116    case KMP_HW_L3:
117      return ((plural) ? KMP_I18N_STR(L3Caches) : KMP_I18N_STR(L3Cache));
118    case KMP_HW_L2:
119      return ((plural) ? KMP_I18N_STR(L2Caches) : KMP_I18N_STR(L2Cache));
120    case KMP_HW_L1:
121      return ((plural) ? KMP_I18N_STR(L1Caches) : KMP_I18N_STR(L1Cache));
122    case KMP_HW_LLC:
123      return ((plural) ? KMP_I18N_STR(LLCaches) : KMP_I18N_STR(LLCache));
124    case KMP_HW_CORE:
125      return ((plural) ? KMP_I18N_STR(Cores) : KMP_I18N_STR(Core));
126    case KMP_HW_THREAD:
127      return ((plural) ? KMP_I18N_STR(Threads) : KMP_I18N_STR(Thread));
128    case KMP_HW_PROC_GROUP:
129      return ((plural) ? KMP_I18N_STR(ProcGroups) : KMP_I18N_STR(ProcGroup));
130    case KMP_HW_UNKNOWN:
131    case KMP_HW_LAST:
132      return KMP_I18N_STR(Unknown);
133    }
134    KMP_ASSERT2(false, "Unhandled kmp_hw_t enumeration");
135    KMP_BUILTIN_UNREACHABLE;
136  }
137  
138  const char *__kmp_hw_get_keyword(kmp_hw_t type, bool plural) {
139    switch (type) {
140    case KMP_HW_SOCKET:
141      return ((plural) ? "sockets" : "socket");
142    case KMP_HW_DIE:
143      return ((plural) ? "dice" : "die");
144    case KMP_HW_MODULE:
145      return ((plural) ? "modules" : "module");
146    case KMP_HW_TILE:
147      return ((plural) ? "tiles" : "tile");
148    case KMP_HW_NUMA:
149      return ((plural) ? "numa_domains" : "numa_domain");
150    case KMP_HW_L3:
151      return ((plural) ? "l3_caches" : "l3_cache");
152    case KMP_HW_L2:
153      return ((plural) ? "l2_caches" : "l2_cache");
154    case KMP_HW_L1:
155      return ((plural) ? "l1_caches" : "l1_cache");
156    case KMP_HW_LLC:
157      return ((plural) ? "ll_caches" : "ll_cache");
158    case KMP_HW_CORE:
159      return ((plural) ? "cores" : "core");
160    case KMP_HW_THREAD:
161      return ((plural) ? "threads" : "thread");
162    case KMP_HW_PROC_GROUP:
163      return ((plural) ? "proc_groups" : "proc_group");
164    case KMP_HW_UNKNOWN:
165    case KMP_HW_LAST:
166      return ((plural) ? "unknowns" : "unknown");
167    }
168    KMP_ASSERT2(false, "Unhandled kmp_hw_t enumeration");
169    KMP_BUILTIN_UNREACHABLE;
170  }
171  
172  const char *__kmp_hw_get_core_type_string(kmp_hw_core_type_t type) {
173    switch (type) {
174    case KMP_HW_CORE_TYPE_UNKNOWN:
175    case KMP_HW_MAX_NUM_CORE_TYPES:
176      return "unknown";
177  #if KMP_ARCH_X86 || KMP_ARCH_X86_64
178    case KMP_HW_CORE_TYPE_ATOM:
179      return "Intel Atom(R) processor";
180    case KMP_HW_CORE_TYPE_CORE:
181      return "Intel(R) Core(TM) processor";
182  #endif
183    }
184    KMP_ASSERT2(false, "Unhandled kmp_hw_core_type_t enumeration");
185    KMP_BUILTIN_UNREACHABLE;
186  }
187  
188  #if KMP_AFFINITY_SUPPORTED
189  // If affinity is supported, check the affinity
190  // verbose and warning flags before printing a warning
191  #define KMP_AFF_WARNING(s, ...)                                                \
192    if (s.flags.verbose || (s.flags.warnings && (s.type != affinity_none))) {    \
193      KMP_WARNING(__VA_ARGS__);                                                  \
194    }
195  #else
196  #define KMP_AFF_WARNING(s, ...) KMP_WARNING(__VA_ARGS__)
197  #endif
198  
199  ////////////////////////////////////////////////////////////////////////////////
200  // kmp_hw_thread_t methods
201  int kmp_hw_thread_t::compare_ids(const void *a, const void *b) {
202    const kmp_hw_thread_t *ahwthread = (const kmp_hw_thread_t *)a;
203    const kmp_hw_thread_t *bhwthread = (const kmp_hw_thread_t *)b;
204    int depth = __kmp_topology->get_depth();
205    for (int level = 0; level < depth; ++level) {
206      if (ahwthread->ids[level] < bhwthread->ids[level])
207        return -1;
208      else if (ahwthread->ids[level] > bhwthread->ids[level])
209        return 1;
210    }
211    if (ahwthread->os_id < bhwthread->os_id)
212      return -1;
213    else if (ahwthread->os_id > bhwthread->os_id)
214      return 1;
215    return 0;
216  }
217  
218  #if KMP_AFFINITY_SUPPORTED
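// Sort hardware threads for KMP_AFFINITY=compact: compare the deepest
// `compact` levels of sub_ids first (innermost level outward), then the
// remaining levels starting from the top of the topology.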
219  int kmp_hw_thread_t::compare_compact(const void *a, const void *b) {
220    int i;
221    const kmp_hw_thread_t *aa = (const kmp_hw_thread_t *)a;
222    const kmp_hw_thread_t *bb = (const kmp_hw_thread_t *)b;
223    int depth = __kmp_topology->get_depth();
224    int compact = __kmp_topology->compact;
225    KMP_DEBUG_ASSERT(compact >= 0);
226    KMP_DEBUG_ASSERT(compact <= depth);
227    for (i = 0; i < compact; i++) {
228      int j = depth - i - 1;
229      if (aa->sub_ids[j] < bb->sub_ids[j])
230        return -1;
231      if (aa->sub_ids[j] > bb->sub_ids[j])
232        return 1;
233    }
234    for (; i < depth; i++) {
235      int j = i - compact;
236      if (aa->sub_ids[j] < bb->sub_ids[j])
237        return -1;
238      if (aa->sub_ids[j] > bb->sub_ids[j])
239        return 1;
240    }
241    return 0;
242  }
243  #endif
244  
245  void kmp_hw_thread_t::print() const {
246    int depth = __kmp_topology->get_depth();
247    printf("%4d ", os_id);
248    for (int i = 0; i < depth; ++i) {
249      printf("%4d ", ids[i]);
250    }
251    if (attrs) {
252      if (attrs.is_core_type_valid())
253        printf(" (%s)", __kmp_hw_get_core_type_string(attrs.get_core_type()));
254      if (attrs.is_core_eff_valid())
255        printf(" (eff=%d)", attrs.get_core_eff());
256    }
257    if (leader)
258      printf(" (leader)");
259    printf("\n");
260  }
261  
262  ////////////////////////////////////////////////////////////////////////////////
263  // kmp_topology_t methods
264  
265  // Add a layer to the topology based on the ids. Assume the topology
266  // is perfectly nested (i.e., no object has more than one parent)
267  void kmp_topology_t::_insert_layer(kmp_hw_t type, const int *ids) {
268    // Figure out where the layer should go by comparing the ids of the current
269    // layers with the new ids
270    int target_layer;
271    int previous_id = kmp_hw_thread_t::UNKNOWN_ID;
272    int previous_new_id = kmp_hw_thread_t::UNKNOWN_ID;
273  
274    // Start from the highest layer and work down to find target layer
275    // If new layer is equal to another layer then put the new layer above
276    for (target_layer = 0; target_layer < depth; ++target_layer) {
277      bool layers_equal = true;
278      bool strictly_above_target_layer = false;
279      for (int i = 0; i < num_hw_threads; ++i) {
280        int id = hw_threads[i].ids[target_layer];
281        int new_id = ids[i];
282        if (id != previous_id && new_id == previous_new_id) {
283          // Found the layer we are strictly above
284          strictly_above_target_layer = true;
285          layers_equal = false;
286          break;
287        } else if (id == previous_id && new_id != previous_new_id) {
288          // Found a layer we are below. Move to next layer and check.
289          layers_equal = false;
290          break;
291        }
292        previous_id = id;
293        previous_new_id = new_id;
294      }
295      if (strictly_above_target_layer || layers_equal)
296        break;
297    }
298  
299    // Found the layer we are above. Now move everything to accommodate the new
300    // layer. And put the new ids and type into the topology.
301    for (int i = depth - 1, j = depth; i >= target_layer; --i, --j)
302      types[j] = types[i];
303    types[target_layer] = type;
304    for (int k = 0; k < num_hw_threads; ++k) {
305      for (int i = depth - 1, j = depth; i >= target_layer; --i, --j)
306        hw_threads[k].ids[j] = hw_threads[k].ids[i];
307      hw_threads[k].ids[target_layer] = ids[k];
308    }
309    equivalent[type] = type;
310    depth++;
311  }
312  
313  #if KMP_GROUP_AFFINITY
314  // Insert the Windows Processor Group structure into the topology
315  void kmp_topology_t::_insert_windows_proc_groups() {
316    // Do not insert the processor group structure for a single group
317    if (__kmp_num_proc_groups == 1)
318      return;
319    kmp_affin_mask_t *mask;
320    int *ids = (int *)__kmp_allocate(sizeof(int) * num_hw_threads);
321    KMP_CPU_ALLOC(mask);
322    for (int i = 0; i < num_hw_threads; ++i) {
323      KMP_CPU_ZERO(mask);
324      KMP_CPU_SET(hw_threads[i].os_id, mask);
325      ids[i] = __kmp_get_proc_group(mask);
326    }
327    KMP_CPU_FREE(mask);
328    _insert_layer(KMP_HW_PROC_GROUP, ids);
329    __kmp_free(ids);
330  
331    // sort topology after adding proc groups
332    __kmp_topology->sort_ids();
333  }
334  #endif
335  
336  // Remove radix-1 layers, i.e., layers that add no information because every
337  // entity in the layer above contains exactly one entity of that layer.
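// For example, if every socket contains exactly one NUMA domain, the NUMA
// layer is radix 1: the less-preferred type is dropped and recorded as
// equivalent to the type that is kept (here NUMA would map to socket).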
338  void kmp_topology_t::_remove_radix1_layers() {
339    int preference[KMP_HW_LAST];
340    int top_index1, top_index2;
341    // Set up preference associative array
342    preference[KMP_HW_SOCKET] = 110;
343    preference[KMP_HW_PROC_GROUP] = 100;
344    preference[KMP_HW_CORE] = 95;
345    preference[KMP_HW_THREAD] = 90;
346    preference[KMP_HW_NUMA] = 85;
347    preference[KMP_HW_DIE] = 80;
348    preference[KMP_HW_TILE] = 75;
349    preference[KMP_HW_MODULE] = 73;
350    preference[KMP_HW_L3] = 70;
351    preference[KMP_HW_L2] = 65;
352    preference[KMP_HW_L1] = 60;
353    preference[KMP_HW_LLC] = 5;
354    top_index1 = 0;
355    top_index2 = 1;
356    while (top_index1 < depth - 1 && top_index2 < depth) {
357      kmp_hw_t type1 = types[top_index1];
358      kmp_hw_t type2 = types[top_index2];
359      KMP_ASSERT_VALID_HW_TYPE(type1);
360      KMP_ASSERT_VALID_HW_TYPE(type2);
361      // Do not allow the three main topology levels (sockets, cores, threads) to
362      // be compacted down
363      if ((type1 == KMP_HW_THREAD || type1 == KMP_HW_CORE ||
364           type1 == KMP_HW_SOCKET) &&
365          (type2 == KMP_HW_THREAD || type2 == KMP_HW_CORE ||
366           type2 == KMP_HW_SOCKET)) {
367        top_index1 = top_index2++;
368        continue;
369      }
370      bool radix1 = true;
371      bool all_same = true;
372      int id1 = hw_threads[0].ids[top_index1];
373      int id2 = hw_threads[0].ids[top_index2];
374      int pref1 = preference[type1];
375      int pref2 = preference[type2];
376      for (int hwidx = 1; hwidx < num_hw_threads; ++hwidx) {
377        if (hw_threads[hwidx].ids[top_index1] == id1 &&
378            hw_threads[hwidx].ids[top_index2] != id2) {
379          radix1 = false;
380          break;
381        }
382        if (hw_threads[hwidx].ids[top_index2] != id2)
383          all_same = false;
384        id1 = hw_threads[hwidx].ids[top_index1];
385        id2 = hw_threads[hwidx].ids[top_index2];
386      }
387      if (radix1) {
388        // Select the layer to remove based on preference
389        kmp_hw_t remove_type, keep_type;
390        int remove_layer, remove_layer_ids;
391        if (pref1 > pref2) {
392          remove_type = type2;
393          remove_layer = remove_layer_ids = top_index2;
394          keep_type = type1;
395        } else {
396          remove_type = type1;
397          remove_layer = remove_layer_ids = top_index1;
398          keep_type = type2;
399        }
400        // If all the indexes for the second (deeper) layer are the same,
401        // e.g., all are zero, then make sure to keep the first layer's ids
402        if (all_same)
403          remove_layer_ids = top_index2;
404        // Remove radix one type by setting the equivalence, removing the id from
405        // the hw threads and removing the layer from types and depth
406        set_equivalent_type(remove_type, keep_type);
407        for (int idx = 0; idx < num_hw_threads; ++idx) {
408          kmp_hw_thread_t &hw_thread = hw_threads[idx];
409          for (int d = remove_layer_ids; d < depth - 1; ++d)
410            hw_thread.ids[d] = hw_thread.ids[d + 1];
411        }
412        for (int idx = remove_layer; idx < depth - 1; ++idx)
413          types[idx] = types[idx + 1];
414        depth--;
415      } else {
416        top_index1 = top_index2++;
417      }
418    }
419    KMP_ASSERT(depth > 0);
420  }
421  
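// Decide which detected level stands in for the last-level cache (LLC):
// prefer L3, then L2 (or the tile level on supported MIC parts), then L1,
// finally falling back to the socket or core level if no cache was detected.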
422  void kmp_topology_t::_set_last_level_cache() {
423    if (get_equivalent_type(KMP_HW_L3) != KMP_HW_UNKNOWN)
424      set_equivalent_type(KMP_HW_LLC, KMP_HW_L3);
425    else if (get_equivalent_type(KMP_HW_L2) != KMP_HW_UNKNOWN)
426      set_equivalent_type(KMP_HW_LLC, KMP_HW_L2);
427  #if KMP_MIC_SUPPORTED
428    else if (__kmp_mic_type == mic3) {
429      if (get_equivalent_type(KMP_HW_L2) != KMP_HW_UNKNOWN)
430        set_equivalent_type(KMP_HW_LLC, KMP_HW_L2);
431      else if (get_equivalent_type(KMP_HW_TILE) != KMP_HW_UNKNOWN)
432        set_equivalent_type(KMP_HW_LLC, KMP_HW_TILE);
433      // L2/Tile wasn't detected so just say L1
434      else
435        set_equivalent_type(KMP_HW_LLC, KMP_HW_L1);
436    }
437  #endif
438    else if (get_equivalent_type(KMP_HW_L1) != KMP_HW_UNKNOWN)
439      set_equivalent_type(KMP_HW_LLC, KMP_HW_L1);
440    // Fallback is to set last level cache to socket or core
441    if (get_equivalent_type(KMP_HW_LLC) == KMP_HW_UNKNOWN) {
442      if (get_equivalent_type(KMP_HW_SOCKET) != KMP_HW_UNKNOWN)
443        set_equivalent_type(KMP_HW_LLC, KMP_HW_SOCKET);
444      else if (get_equivalent_type(KMP_HW_CORE) != KMP_HW_UNKNOWN)
445        set_equivalent_type(KMP_HW_LLC, KMP_HW_CORE);
446    }
447    KMP_ASSERT(get_equivalent_type(KMP_HW_LLC) != KMP_HW_UNKNOWN);
448  }
449  
450  // Gather the count of each topology layer and the ratio
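// count[l] is the total number of layer-l entities detected on the machine,
// while ratio[l] is the maximum number of layer-l entities observed under a
// single entity of the layer above. For example, 2 sockets with 4 cores each
// give count[core] = 8 and ratio[core] = 4.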
451  void kmp_topology_t::_gather_enumeration_information() {
452    int previous_id[KMP_HW_LAST];
453    int max[KMP_HW_LAST];
454  
455    for (int i = 0; i < depth; ++i) {
456      previous_id[i] = kmp_hw_thread_t::UNKNOWN_ID;
457      max[i] = 0;
458      count[i] = 0;
459      ratio[i] = 0;
460    }
461    int core_level = get_level(KMP_HW_CORE);
462    for (int i = 0; i < num_hw_threads; ++i) {
463      kmp_hw_thread_t &hw_thread = hw_threads[i];
464      for (int layer = 0; layer < depth; ++layer) {
465        int id = hw_thread.ids[layer];
466        if (id != previous_id[layer]) {
467          // Add an additional increment to each count
468          for (int l = layer; l < depth; ++l)
469            count[l]++;
470          // Keep track of topology layer ratio statistics
471          max[layer]++;
472          for (int l = layer + 1; l < depth; ++l) {
473            if (max[l] > ratio[l])
474              ratio[l] = max[l];
475            max[l] = 1;
476          }
477          // Figure out the number of different core types
478          // and efficiencies for hybrid CPUs
479          if (__kmp_is_hybrid_cpu() && core_level >= 0 && layer <= core_level) {
480            if (hw_thread.attrs.is_core_eff_valid() &&
481                hw_thread.attrs.core_eff >= num_core_efficiencies) {
482              // Because efficiencies can range from 0 to max efficiency - 1,
483              // the number of efficiencies is max efficiency + 1
484              num_core_efficiencies = hw_thread.attrs.core_eff + 1;
485            }
486            if (hw_thread.attrs.is_core_type_valid()) {
487              bool found = false;
488              for (int j = 0; j < num_core_types; ++j) {
489                if (hw_thread.attrs.get_core_type() == core_types[j]) {
490                  found = true;
491                  break;
492                }
493              }
494              if (!found) {
495                KMP_ASSERT(num_core_types < KMP_HW_MAX_NUM_CORE_TYPES);
496                core_types[num_core_types++] = hw_thread.attrs.get_core_type();
497              }
498            }
499          }
500          break;
501        }
502      }
503      for (int layer = 0; layer < depth; ++layer) {
504        previous_id[layer] = hw_thread.ids[layer];
505      }
506    }
507    for (int layer = 0; layer < depth; ++layer) {
508      if (max[layer] > ratio[layer])
509        ratio[layer] = max[layer];
510    }
511  }
512  
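// Count the cores matching attr: if find_all is true, return the total for the
// whole machine; otherwise return the maximum number of matching cores found
// within any single entity at level above_level.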
513  int kmp_topology_t::_get_ncores_with_attr(const kmp_hw_attr_t &attr,
514                                            int above_level,
515                                            bool find_all) const {
516    int current, current_max;
517    int previous_id[KMP_HW_LAST];
518    for (int i = 0; i < depth; ++i)
519      previous_id[i] = kmp_hw_thread_t::UNKNOWN_ID;
520    int core_level = get_level(KMP_HW_CORE);
521    if (find_all)
522      above_level = -1;
523    KMP_ASSERT(above_level < core_level);
524    current_max = 0;
525    current = 0;
526    for (int i = 0; i < num_hw_threads; ++i) {
527      kmp_hw_thread_t &hw_thread = hw_threads[i];
528      if (!find_all && hw_thread.ids[above_level] != previous_id[above_level]) {
529        if (current > current_max)
530          current_max = current;
531        current = hw_thread.attrs.contains(attr);
532      } else {
533        for (int level = above_level + 1; level <= core_level; ++level) {
534          if (hw_thread.ids[level] != previous_id[level]) {
535            if (hw_thread.attrs.contains(attr))
536              current++;
537            break;
538          }
539        }
540      }
541      for (int level = 0; level < depth; ++level)
542        previous_id[level] = hw_thread.ids[level];
543    }
544    if (current > current_max)
545      current_max = current;
546    return current_max;
547  }
548  
549  // Find out if the topology is uniform
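// The topology is uniform when the product of the per-level ratios equals the
// total number of hardware threads, i.e., every entity at a given level has
// the same number of children.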
550  void kmp_topology_t::_discover_uniformity() {
551    int num = 1;
552    for (int level = 0; level < depth; ++level)
553      num *= ratio[level];
554    flags.uniform = (num == count[depth - 1]);
555  }
556  
557  // Set all the sub_ids for each hardware thread
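// A sub_id is the 0-based index of an entity within its parent, derived from
// the sorted ids. For example, ids of (socket, core, thread) =
// (0,8,0) (0,8,1) (0,9,0) (0,9,1) yield sub_ids (0,0,0) (0,0,1) (0,1,0) (0,1,1).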
558  void kmp_topology_t::_set_sub_ids() {
559    int previous_id[KMP_HW_LAST];
560    int sub_id[KMP_HW_LAST];
561  
562    for (int i = 0; i < depth; ++i) {
563      previous_id[i] = -1;
564      sub_id[i] = -1;
565    }
566    for (int i = 0; i < num_hw_threads; ++i) {
567      kmp_hw_thread_t &hw_thread = hw_threads[i];
568      // Setup the sub_id
569      for (int j = 0; j < depth; ++j) {
570        if (hw_thread.ids[j] != previous_id[j]) {
571          sub_id[j]++;
572          for (int k = j + 1; k < depth; ++k) {
573            sub_id[k] = 0;
574          }
575          break;
576        }
577      }
578      // Set previous_id
579      for (int j = 0; j < depth; ++j) {
580        previous_id[j] = hw_thread.ids[j];
581      }
582      // Set the sub_ids field
583      for (int j = 0; j < depth; ++j) {
584        hw_thread.sub_ids[j] = sub_id[j];
585      }
586    }
587  }
588  
589  void kmp_topology_t::_set_globals() {
590    // Set nCoresPerPkg, nPackages, __kmp_nThreadsPerCore, __kmp_ncores
591    int core_level, thread_level, package_level;
592    package_level = get_level(KMP_HW_SOCKET);
593  #if KMP_GROUP_AFFINITY
594    if (package_level == -1)
595      package_level = get_level(KMP_HW_PROC_GROUP);
596  #endif
597    core_level = get_level(KMP_HW_CORE);
598    thread_level = get_level(KMP_HW_THREAD);
599  
600    KMP_ASSERT(core_level != -1);
601    KMP_ASSERT(thread_level != -1);
602  
603    __kmp_nThreadsPerCore = calculate_ratio(thread_level, core_level);
604    if (package_level != -1) {
605      nCoresPerPkg = calculate_ratio(core_level, package_level);
606      nPackages = get_count(package_level);
607    } else {
608      // assume one socket
609      nCoresPerPkg = get_count(core_level);
610      nPackages = 1;
611    }
612  #ifndef KMP_DFLT_NTH_CORES
613    __kmp_ncores = get_count(core_level);
614  #endif
615  }
616  
617  kmp_topology_t *kmp_topology_t::allocate(int nproc, int ndepth,
618                                           const kmp_hw_t *types) {
619    kmp_topology_t *retval;
620    // Allocate all data in one large allocation
621    size_t size = sizeof(kmp_topology_t) + sizeof(kmp_hw_thread_t) * nproc +
622                  sizeof(int) * (size_t)KMP_HW_LAST * 3;
623    char *bytes = (char *)__kmp_allocate(size);
624    retval = (kmp_topology_t *)bytes;
625    if (nproc > 0) {
626      retval->hw_threads = (kmp_hw_thread_t *)(bytes + sizeof(kmp_topology_t));
627    } else {
628      retval->hw_threads = nullptr;
629    }
630    retval->num_hw_threads = nproc;
631    retval->depth = ndepth;
632    int *arr =
633        (int *)(bytes + sizeof(kmp_topology_t) + sizeof(kmp_hw_thread_t) * nproc);
634    retval->types = (kmp_hw_t *)arr;
635    retval->ratio = arr + (size_t)KMP_HW_LAST;
636    retval->count = arr + 2 * (size_t)KMP_HW_LAST;
637    retval->num_core_efficiencies = 0;
638    retval->num_core_types = 0;
639    retval->compact = 0;
640    for (int i = 0; i < KMP_HW_MAX_NUM_CORE_TYPES; ++i)
641      retval->core_types[i] = KMP_HW_CORE_TYPE_UNKNOWN;
642    KMP_FOREACH_HW_TYPE(type) { retval->equivalent[type] = KMP_HW_UNKNOWN; }
643    for (int i = 0; i < ndepth; ++i) {
644      retval->types[i] = types[i];
645      retval->equivalent[types[i]] = types[i];
646    }
647    return retval;
648  }
649  
650  void kmp_topology_t::deallocate(kmp_topology_t *topology) {
651    if (topology)
652      __kmp_free(topology);
653  }
654  
655  bool kmp_topology_t::check_ids() const {
656    // Assume ids have been sorted
657    if (num_hw_threads == 0)
658      return true;
659    for (int i = 1; i < num_hw_threads; ++i) {
660      kmp_hw_thread_t &current_thread = hw_threads[i];
661      kmp_hw_thread_t &previous_thread = hw_threads[i - 1];
662      bool unique = false;
663      for (int j = 0; j < depth; ++j) {
664        if (previous_thread.ids[j] != current_thread.ids[j]) {
665          unique = true;
666          break;
667        }
668      }
669      if (unique)
670        continue;
671      return false;
672    }
673    return true;
674  }
675  
676  void kmp_topology_t::dump() const {
677    printf("***********************\n");
678    printf("*** __kmp_topology: ***\n");
679    printf("***********************\n");
680    printf("* depth: %d\n", depth);
681  
682    printf("* types: ");
683    for (int i = 0; i < depth; ++i)
684      printf("%15s ", __kmp_hw_get_keyword(types[i]));
685    printf("\n");
686  
687    printf("* ratio: ");
688    for (int i = 0; i < depth; ++i) {
689      printf("%15d ", ratio[i]);
690    }
691    printf("\n");
692  
693    printf("* count: ");
694    for (int i = 0; i < depth; ++i) {
695      printf("%15d ", count[i]);
696    }
697    printf("\n");
698  
699    printf("* num_core_eff: %d\n", num_core_efficiencies);
700    printf("* num_core_types: %d\n", num_core_types);
701    printf("* core_types: ");
702    for (int i = 0; i < num_core_types; ++i)
703      printf("%3d ", core_types[i]);
704    printf("\n");
705  
706    printf("* equivalent map:\n");
707    KMP_FOREACH_HW_TYPE(i) {
708      const char *key = __kmp_hw_get_keyword(i);
709      const char *value = __kmp_hw_get_keyword(equivalent[i]);
710      printf("%-15s -> %-15s\n", key, value);
711    }
712  
713    printf("* uniform: %s\n", (is_uniform() ? "Yes" : "No"));
714  
715    printf("* num_hw_threads: %d\n", num_hw_threads);
716    printf("* hw_threads:\n");
717    for (int i = 0; i < num_hw_threads; ++i) {
718      hw_threads[i].print();
719    }
720    printf("***********************\n");
721  }
722  
723  void kmp_topology_t::print(const char *env_var) const {
724    kmp_str_buf_t buf;
725    int print_types_depth;
726    __kmp_str_buf_init(&buf);
727    kmp_hw_t print_types[KMP_HW_LAST + 2];
728  
729    // Num Available Threads
730    if (num_hw_threads) {
731      KMP_INFORM(AvailableOSProc, env_var, num_hw_threads);
732    } else {
733      KMP_INFORM(AvailableOSProc, env_var, __kmp_xproc);
734    }
735  
736    // Uniform or not
737    if (is_uniform()) {
738      KMP_INFORM(Uniform, env_var);
739    } else {
740      KMP_INFORM(NonUniform, env_var);
741    }
742  
743    // Equivalent types
744    KMP_FOREACH_HW_TYPE(type) {
745      kmp_hw_t eq_type = equivalent[type];
746      if (eq_type != KMP_HW_UNKNOWN && eq_type != type) {
747        KMP_INFORM(AffEqualTopologyTypes, env_var,
748                   __kmp_hw_get_catalog_string(type),
749                   __kmp_hw_get_catalog_string(eq_type));
750      }
751    }
752  
753    // Quick topology
754    KMP_ASSERT(depth > 0 && depth <= (int)KMP_HW_LAST);
755    // Create a print types array that always guarantees printing
756    // the core and thread level
757    print_types_depth = 0;
758    for (int level = 0; level < depth; ++level)
759      print_types[print_types_depth++] = types[level];
760    if (equivalent[KMP_HW_CORE] != KMP_HW_CORE) {
761      // Force in the core level for quick topology
762      if (print_types[print_types_depth - 1] == KMP_HW_THREAD) {
763        // Force core before thread e.g., 1 socket X 2 threads/socket
764        // becomes 1 socket X 1 core/socket X 2 threads/socket
765        print_types[print_types_depth - 1] = KMP_HW_CORE;
766        print_types[print_types_depth++] = KMP_HW_THREAD;
767      } else {
768        print_types[print_types_depth++] = KMP_HW_CORE;
769      }
770    }
771    // Always put threads at very end of quick topology
772    if (equivalent[KMP_HW_THREAD] != KMP_HW_THREAD)
773      print_types[print_types_depth++] = KMP_HW_THREAD;
774  
775    __kmp_str_buf_clear(&buf);
776    kmp_hw_t numerator_type;
777    kmp_hw_t denominator_type = KMP_HW_UNKNOWN;
778    int core_level = get_level(KMP_HW_CORE);
779    int ncores = get_count(core_level);
780  
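  // Build the quick-topology summary, e.g., a machine with 2 sockets, 4 cores
  // per socket, and 2 threads per core prints roughly as
  // "2 sockets x 4 cores/socket x 2 threads/core".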
781    for (int plevel = 0, level = 0; plevel < print_types_depth; ++plevel) {
782      int c;
783      bool plural;
784      numerator_type = print_types[plevel];
785      KMP_ASSERT_VALID_HW_TYPE(numerator_type);
786      if (equivalent[numerator_type] != numerator_type)
787        c = 1;
788      else
789        c = get_ratio(level++);
790      plural = (c > 1);
791      if (plevel == 0) {
792        __kmp_str_buf_print(&buf, "%d %s", c,
793                            __kmp_hw_get_catalog_string(numerator_type, plural));
794      } else {
795        __kmp_str_buf_print(&buf, " x %d %s/%s", c,
796                            __kmp_hw_get_catalog_string(numerator_type, plural),
797                            __kmp_hw_get_catalog_string(denominator_type));
798      }
799      denominator_type = numerator_type;
800    }
801    KMP_INFORM(TopologyGeneric, env_var, buf.str, ncores);
802  
803    // Hybrid topology information
804    if (__kmp_is_hybrid_cpu()) {
805      for (int i = 0; i < num_core_types; ++i) {
806        kmp_hw_core_type_t core_type = core_types[i];
807        kmp_hw_attr_t attr;
808        attr.clear();
809        attr.set_core_type(core_type);
810        int ncores = get_ncores_with_attr(attr);
811        if (ncores > 0) {
812          KMP_INFORM(TopologyHybrid, env_var, ncores,
813                     __kmp_hw_get_core_type_string(core_type));
814          KMP_ASSERT(num_core_efficiencies <= KMP_HW_MAX_NUM_CORE_EFFS)
815          for (int eff = 0; eff < num_core_efficiencies; ++eff) {
816            attr.set_core_eff(eff);
817            int ncores_with_eff = get_ncores_with_attr(attr);
818            if (ncores_with_eff > 0) {
819              KMP_INFORM(TopologyHybridCoreEff, env_var, ncores_with_eff, eff);
820            }
821          }
822        }
823      }
824    }
825  
826    if (num_hw_threads <= 0) {
827      __kmp_str_buf_free(&buf);
828      return;
829    }
830  
831    // Full OS proc to hardware thread map
832    KMP_INFORM(OSProcToPhysicalThreadMap, env_var);
833    for (int i = 0; i < num_hw_threads; i++) {
834      __kmp_str_buf_clear(&buf);
835      for (int level = 0; level < depth; ++level) {
836        kmp_hw_t type = types[level];
837        __kmp_str_buf_print(&buf, "%s ", __kmp_hw_get_catalog_string(type));
838        __kmp_str_buf_print(&buf, "%d ", hw_threads[i].ids[level]);
839      }
840      if (__kmp_is_hybrid_cpu())
841        __kmp_str_buf_print(
842            &buf, "(%s)",
843            __kmp_hw_get_core_type_string(hw_threads[i].attrs.get_core_type()));
844      KMP_INFORM(OSProcMapToPack, env_var, hw_threads[i].os_id, buf.str);
845    }
846  
847    __kmp_str_buf_free(&buf);
848  }
849  
850  #if KMP_AFFINITY_SUPPORTED
851  void kmp_topology_t::set_granularity(kmp_affinity_t &affinity) const {
852    const char *env_var = __kmp_get_affinity_env_var(affinity);
853  // If hybrid CPU attributes were requested for granularity (via either
854  // OMP_PLACES or KMP_AFFINITY) but none exist, then reset the granularity and
855  // let the code below select a granularity and warn the user.
856    if (!__kmp_is_hybrid_cpu()) {
857      if (affinity.core_attr_gran.valid) {
858        // OMP_PLACES with cores:<attribute> but non-hybrid arch, use cores
859        // instead
860        KMP_AFF_WARNING(
861            affinity, AffIgnoringNonHybrid, env_var,
862            __kmp_hw_get_catalog_string(KMP_HW_CORE, /*plural=*/true));
863        affinity.gran = KMP_HW_CORE;
864        affinity.gran_levels = -1;
865        affinity.core_attr_gran = KMP_AFFINITY_ATTRS_UNKNOWN;
866        affinity.flags.core_types_gran = affinity.flags.core_effs_gran = 0;
867      } else if (affinity.flags.core_types_gran ||
868                 affinity.flags.core_effs_gran) {
869        // OMP_PLACES=core_types|core_effs but non-hybrid, use cores instead
870        if (affinity.flags.omp_places) {
871          KMP_AFF_WARNING(
872              affinity, AffIgnoringNonHybrid, env_var,
873              __kmp_hw_get_catalog_string(KMP_HW_CORE, /*plural=*/true));
874        } else {
875          // KMP_AFFINITY=granularity=core_type|core_eff,...
876          KMP_AFF_WARNING(affinity, AffGranularityBad, env_var,
877                          "Intel(R) Hybrid Technology core attribute",
878                          __kmp_hw_get_catalog_string(KMP_HW_CORE));
879        }
880        affinity.gran = KMP_HW_CORE;
881        affinity.gran_levels = -1;
882        affinity.core_attr_gran = KMP_AFFINITY_ATTRS_UNKNOWN;
883        affinity.flags.core_types_gran = affinity.flags.core_effs_gran = 0;
884      }
885    }
886    // Set the number of affinity granularity levels
887    if (affinity.gran_levels < 0) {
888      kmp_hw_t gran_type = get_equivalent_type(affinity.gran);
889      // Check if user's granularity request is valid
890      if (gran_type == KMP_HW_UNKNOWN) {
891        // First try core, then thread, then package
892        kmp_hw_t gran_types[3] = {KMP_HW_CORE, KMP_HW_THREAD, KMP_HW_SOCKET};
893        for (auto g : gran_types) {
894          if (get_equivalent_type(g) != KMP_HW_UNKNOWN) {
895            gran_type = g;
896            break;
897          }
898        }
899        KMP_ASSERT(gran_type != KMP_HW_UNKNOWN);
900        // Warn user what granularity setting will be used instead
901        KMP_AFF_WARNING(affinity, AffGranularityBad, env_var,
902                        __kmp_hw_get_catalog_string(affinity.gran),
903                        __kmp_hw_get_catalog_string(gran_type));
904        affinity.gran = gran_type;
905      }
906  #if KMP_GROUP_AFFINITY
907      // If more than one processor group exists, and the level of
908      // granularity specified by the user is too coarse, then the
909      // granularity must be adjusted "down" to processor group affinity
910      // because threads can only exist within one processor group.
911      // For example, if a user sets granularity=socket and there are two
912      // processor groups that cover a socket, then the runtime must
913      // restrict the granularity down to the processor group level.
914      if (__kmp_num_proc_groups > 1) {
915        int gran_depth = get_level(gran_type);
916        int proc_group_depth = get_level(KMP_HW_PROC_GROUP);
917        if (gran_depth >= 0 && proc_group_depth >= 0 &&
918            gran_depth < proc_group_depth) {
919          KMP_AFF_WARNING(affinity, AffGranTooCoarseProcGroup, env_var,
920                          __kmp_hw_get_catalog_string(affinity.gran));
921          affinity.gran = gran_type = KMP_HW_PROC_GROUP;
922        }
923      }
924  #endif
925      affinity.gran_levels = 0;
926      for (int i = depth - 1; i >= 0 && get_type(i) != gran_type; --i)
927        affinity.gran_levels++;
928    }
929  }
930  #endif
931  
932  void kmp_topology_t::canonicalize() {
933  #if KMP_GROUP_AFFINITY
934    _insert_windows_proc_groups();
935  #endif
936    _remove_radix1_layers();
937    _gather_enumeration_information();
938    _discover_uniformity();
939    _set_sub_ids();
940    _set_globals();
941    _set_last_level_cache();
942  
943  #if KMP_MIC_SUPPORTED
944  // Manually add L2 = Tile equivalence
945    if (__kmp_mic_type == mic3) {
946      if (get_level(KMP_HW_L2) != -1)
947        set_equivalent_type(KMP_HW_TILE, KMP_HW_L2);
948      else if (get_level(KMP_HW_TILE) != -1)
949        set_equivalent_type(KMP_HW_L2, KMP_HW_TILE);
950    }
951  #endif
952  
953    // Perform post canonicalization checking
954    KMP_ASSERT(depth > 0);
955    for (int level = 0; level < depth; ++level) {
956      // All counts, ratios, and types must be valid
957      KMP_ASSERT(count[level] > 0 && ratio[level] > 0);
958      KMP_ASSERT_VALID_HW_TYPE(types[level]);
959      // Detected types must point to themselves
960      KMP_ASSERT(equivalent[types[level]] == types[level]);
961    }
962  }
963  
964  // Canonicalize an explicit packages X cores/pkg X threads/core topology
965  void kmp_topology_t::canonicalize(int npackages, int ncores_per_pkg,
966                                    int nthreads_per_core, int ncores) {
967    int ndepth = 3;
968    depth = ndepth;
969    KMP_FOREACH_HW_TYPE(i) { equivalent[i] = KMP_HW_UNKNOWN; }
970    for (int level = 0; level < depth; ++level) {
971      count[level] = 0;
972      ratio[level] = 0;
973    }
974    count[0] = npackages;
975    count[1] = ncores;
976    count[2] = __kmp_xproc;
977    ratio[0] = npackages;
978    ratio[1] = ncores_per_pkg;
979    ratio[2] = nthreads_per_core;
980    equivalent[KMP_HW_SOCKET] = KMP_HW_SOCKET;
981    equivalent[KMP_HW_CORE] = KMP_HW_CORE;
982    equivalent[KMP_HW_THREAD] = KMP_HW_THREAD;
983    types[0] = KMP_HW_SOCKET;
984    types[1] = KMP_HW_CORE;
985    types[2] = KMP_HW_THREAD;
986    //__kmp_avail_proc = __kmp_xproc;
987    _discover_uniformity();
988  }
989  
990  #if KMP_AFFINITY_SUPPORTED
991  static kmp_str_buf_t *
992  __kmp_hw_get_catalog_core_string(const kmp_hw_attr_t &attr, kmp_str_buf_t *buf,
993                                   bool plural) {
994    __kmp_str_buf_init(buf);
995    if (attr.is_core_type_valid())
996      __kmp_str_buf_print(buf, "%s %s",
997                          __kmp_hw_get_core_type_string(attr.get_core_type()),
998                          __kmp_hw_get_catalog_string(KMP_HW_CORE, plural));
999    else
1000      __kmp_str_buf_print(buf, "%s eff=%d",
1001                          __kmp_hw_get_catalog_string(KMP_HW_CORE, plural),
1002                          attr.get_core_eff());
1003    return buf;
1004  }
1005  
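// Restrict the topology to the hardware threads present in mask: filtered
// threads are removed from hw_threads, __kmp_affin_fullMask and
// __kmp_avail_proc are trimmed to match, and the per-layer statistics are
// recomputed. Returns true if any hardware thread was removed.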
1006  bool kmp_topology_t::restrict_to_mask(const kmp_affin_mask_t *mask) {
1007    // Apply the filter
1008    bool affected;
1009    int new_index = 0;
1010    for (int i = 0; i < num_hw_threads; ++i) {
1011      int os_id = hw_threads[i].os_id;
1012      if (KMP_CPU_ISSET(os_id, mask)) {
1013        if (i != new_index)
1014          hw_threads[new_index] = hw_threads[i];
1015        new_index++;
1016      } else {
1017        KMP_CPU_CLR(os_id, __kmp_affin_fullMask);
1018        __kmp_avail_proc--;
1019      }
1020    }
1021  
1022    KMP_DEBUG_ASSERT(new_index <= num_hw_threads);
1023    affected = (num_hw_threads != new_index);
1024    num_hw_threads = new_index;
1025  
1026    // Post hardware subset canonicalization
1027    if (affected) {
1028      _gather_enumeration_information();
1029      _discover_uniformity();
1030      _set_globals();
1031      _set_last_level_cache();
1032  #if KMP_OS_WINDOWS
1033      // Copy filtered full mask if topology has single processor group
1034      if (__kmp_num_proc_groups <= 1)
1035  #endif
1036        __kmp_affin_origMask->copy(__kmp_affin_fullMask);
1037    }
1038    return affected;
1039  }
1040  
1041  // Apply the KMP_HW_SUBSET environment variable to the topology.
1042  // Returns true if KMP_HW_SUBSET filtered any processors;
1043  // otherwise, returns false.
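// For reference, a KMP_HW_SUBSET value such as "2s,4c,2t" requests 2 sockets,
// 4 cores per socket, and 2 threads per core (illustrative example; offsets
// and core attributes extend this basic syntax).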
1044  bool kmp_topology_t::filter_hw_subset() {
1045    // If KMP_HW_SUBSET wasn't requested, then do nothing.
1046    if (!__kmp_hw_subset)
1047      return false;
1048  
1049    // First, sort the KMP_HW_SUBSET items by the machine topology
1050    __kmp_hw_subset->sort();
1051  
1052    __kmp_hw_subset->canonicalize(__kmp_topology);
1053  
1054    // Check to see if KMP_HW_SUBSET is a valid subset of the detected topology
1055    bool using_core_types = false;
1056    bool using_core_effs = false;
1057    bool is_absolute = __kmp_hw_subset->is_absolute();
1058    int hw_subset_depth = __kmp_hw_subset->get_depth();
1059    kmp_hw_t specified[KMP_HW_LAST];
1060    int *topology_levels = (int *)KMP_ALLOCA(sizeof(int) * hw_subset_depth);
1061    KMP_ASSERT(hw_subset_depth > 0);
1062    KMP_FOREACH_HW_TYPE(i) { specified[i] = KMP_HW_UNKNOWN; }
1063    int core_level = get_level(KMP_HW_CORE);
1064    for (int i = 0; i < hw_subset_depth; ++i) {
1065      int max_count;
1066      const kmp_hw_subset_t::item_t &item = __kmp_hw_subset->at(i);
1067      int num = item.num[0];
1068      int offset = item.offset[0];
1069      kmp_hw_t type = item.type;
1070      kmp_hw_t equivalent_type = equivalent[type];
1071      int level = get_level(type);
1072      topology_levels[i] = level;
1073  
1074      // Check to see if current layer is in detected machine topology
1075      if (equivalent_type != KMP_HW_UNKNOWN) {
1076        __kmp_hw_subset->at(i).type = equivalent_type;
1077      } else {
1078        KMP_AFF_WARNING(__kmp_affinity, AffHWSubsetNotExistGeneric,
1079                        __kmp_hw_get_catalog_string(type));
1080        return false;
1081      }
1082  
1083      // Check to see if current layer has already been
1084      // specified either directly or through an equivalent type
1085      if (specified[equivalent_type] != KMP_HW_UNKNOWN) {
1086        KMP_AFF_WARNING(__kmp_affinity, AffHWSubsetEqvLayers,
1087                        __kmp_hw_get_catalog_string(type),
1088                        __kmp_hw_get_catalog_string(specified[equivalent_type]));
1089        return false;
1090      }
1091      specified[equivalent_type] = type;
1092  
1093      // Check to see if each layer's num & offset parameters are valid
1094      max_count = get_ratio(level);
1095      if (!is_absolute) {
1096        if (max_count < 0 ||
1097            (num != kmp_hw_subset_t::USE_ALL && num + offset > max_count)) {
1098          bool plural = (num > 1);
1099          KMP_AFF_WARNING(__kmp_affinity, AffHWSubsetManyGeneric,
1100                          __kmp_hw_get_catalog_string(type, plural));
1101          return false;
1102        }
1103      }
1104  
1105      // Check to see if core attributes are consistent
1106      if (core_level == level) {
1107        // Determine which core attributes are specified
1108        for (int j = 0; j < item.num_attrs; ++j) {
1109          if (item.attr[j].is_core_type_valid())
1110            using_core_types = true;
1111          if (item.attr[j].is_core_eff_valid())
1112            using_core_effs = true;
1113        }
1114  
1115        // Check if using a single core attribute on non-hybrid arch.
1116        // Do not ignore all of KMP_HW_SUBSET, just ignore the attribute.
1117        //
1118  // Check if using multiple core attributes on non-hybrid arch.
1119        // Ignore all of KMP_HW_SUBSET if this is the case.
1120        if ((using_core_effs || using_core_types) && !__kmp_is_hybrid_cpu()) {
1121          if (item.num_attrs == 1) {
1122            if (using_core_effs) {
1123              KMP_AFF_WARNING(__kmp_affinity, AffHWSubsetIgnoringAttr,
1124                              "efficiency");
1125            } else {
1126              KMP_AFF_WARNING(__kmp_affinity, AffHWSubsetIgnoringAttr,
1127                              "core_type");
1128            }
1129            using_core_effs = false;
1130            using_core_types = false;
1131          } else {
1132            KMP_AFF_WARNING(__kmp_affinity, AffHWSubsetAttrsNonHybrid);
1133            return false;
1134          }
1135        }
1136  
1137        // Check if using both core types and core efficiencies together
1138        if (using_core_types && using_core_effs) {
1139          KMP_AFF_WARNING(__kmp_affinity, AffHWSubsetIncompat, "core_type",
1140                          "efficiency");
1141          return false;
1142        }
1143  
1144        // Check that core efficiency values are valid
1145        if (using_core_effs) {
1146          for (int j = 0; j < item.num_attrs; ++j) {
1147            if (item.attr[j].is_core_eff_valid()) {
1148              int core_eff = item.attr[j].get_core_eff();
1149              if (core_eff < 0 || core_eff >= num_core_efficiencies) {
1150                kmp_str_buf_t buf;
1151                __kmp_str_buf_init(&buf);
1152                __kmp_str_buf_print(&buf, "%d", item.attr[j].get_core_eff());
1153                __kmp_msg(kmp_ms_warning,
1154                          KMP_MSG(AffHWSubsetAttrInvalid, "efficiency", buf.str),
1155                          KMP_HNT(ValidValuesRange, 0, num_core_efficiencies - 1),
1156                          __kmp_msg_null);
1157                __kmp_str_buf_free(&buf);
1158                return false;
1159              }
1160            }
1161          }
1162        }
1163  
1164        // Check that the number of requested cores with attributes is valid
1165        if ((using_core_types || using_core_effs) && !is_absolute) {
1166          for (int j = 0; j < item.num_attrs; ++j) {
1167            int num = item.num[j];
1168            int offset = item.offset[j];
1169            int level_above = core_level - 1;
1170            if (level_above >= 0) {
1171              max_count = get_ncores_with_attr_per(item.attr[j], level_above);
1172              if (max_count <= 0 ||
1173                  (num != kmp_hw_subset_t::USE_ALL && num + offset > max_count)) {
1174                kmp_str_buf_t buf;
1175                __kmp_hw_get_catalog_core_string(item.attr[j], &buf, num > 0);
1176                KMP_AFF_WARNING(__kmp_affinity, AffHWSubsetManyGeneric, buf.str);
1177                __kmp_str_buf_free(&buf);
1178                return false;
1179              }
1180            }
1181          }
1182        }
1183  
1184        if ((using_core_types || using_core_effs) && item.num_attrs > 1) {
1185          for (int j = 0; j < item.num_attrs; ++j) {
1186            // Ambiguous use of specific core attribute + generic core
1187            // e.g., 4c & 3c:intel_core or 4c & 3c:eff1
1188            if (!item.attr[j]) {
1189              kmp_hw_attr_t other_attr;
1190              for (int k = 0; k < item.num_attrs; ++k) {
1191                if (item.attr[k] != item.attr[j]) {
1192                  other_attr = item.attr[k];
1193                  break;
1194                }
1195              }
1196              kmp_str_buf_t buf;
1197              __kmp_hw_get_catalog_core_string(other_attr, &buf, item.num[j] > 0);
1198              KMP_AFF_WARNING(__kmp_affinity, AffHWSubsetIncompat,
1199                              __kmp_hw_get_catalog_string(KMP_HW_CORE), buf.str);
1200              __kmp_str_buf_free(&buf);
1201              return false;
1202            }
1203            // Allow specifying a specific core type or core eff exactly once
1204            for (int k = 0; k < j; ++k) {
1205              if (!item.attr[j] || !item.attr[k])
1206                continue;
1207              if (item.attr[k] == item.attr[j]) {
1208                kmp_str_buf_t buf;
1209                __kmp_hw_get_catalog_core_string(item.attr[j], &buf,
1210                                                 item.num[j] > 0);
1211                KMP_AFF_WARNING(__kmp_affinity, AffHWSubsetAttrRepeat, buf.str);
1212                __kmp_str_buf_free(&buf);
1213                return false;
1214              }
1215            }
1216          }
1217        }
1218      }
1219    }
1220  
1221    // For keeping track of sub_ids for an absolute KMP_HW_SUBSET
1222    // or core attributes (core type or efficiency)
1223    int prev_sub_ids[KMP_HW_LAST];
1224    int abs_sub_ids[KMP_HW_LAST];
1225    int core_eff_sub_ids[KMP_HW_MAX_NUM_CORE_EFFS];
1226    int core_type_sub_ids[KMP_HW_MAX_NUM_CORE_TYPES];
1227    for (size_t i = 0; i < KMP_HW_LAST; ++i) {
1228      abs_sub_ids[i] = -1;
1229      prev_sub_ids[i] = -1;
1230    }
1231    for (size_t i = 0; i < KMP_HW_MAX_NUM_CORE_EFFS; ++i)
1232      core_eff_sub_ids[i] = -1;
1233    for (size_t i = 0; i < KMP_HW_MAX_NUM_CORE_TYPES; ++i)
1234      core_type_sub_ids[i] = -1;
1235  
1236    // Determine which hardware threads should be filtered.
1237  
1238    // Helpful to determine if a topology layer is targeted by an absolute subset
1239    auto is_targeted = [&](int level) {
1240      if (is_absolute) {
1241        for (int i = 0; i < hw_subset_depth; ++i)
1242          if (topology_levels[i] == level)
1243            return true;
1244        return false;
1245      }
1246      // If not absolute KMP_HW_SUBSET, then every layer is seen as targeted
1247      return true;
1248    };
1249  
1250    // Helpful to index into core type sub Ids array
1251    auto get_core_type_index = [](const kmp_hw_thread_t &t) {
1252      switch (t.attrs.get_core_type()) {
1253      case KMP_HW_CORE_TYPE_UNKNOWN:
1254      case KMP_HW_MAX_NUM_CORE_TYPES:
1255        return 0;
1256  #if KMP_ARCH_X86 || KMP_ARCH_X86_64
1257      case KMP_HW_CORE_TYPE_ATOM:
1258        return 1;
1259      case KMP_HW_CORE_TYPE_CORE:
1260        return 2;
1261  #endif
1262      }
1263    KMP_ASSERT2(false, "Unhandled kmp_hw_core_type_t enumeration");
1264      KMP_BUILTIN_UNREACHABLE;
1265    };
1266  
1267    // Helpful to index into core efficiencies sub Ids array
1268    auto get_core_eff_index = [](const kmp_hw_thread_t &t) {
1269      return t.attrs.get_core_eff();
1270    };
1271  
1272    int num_filtered = 0;
1273    kmp_affin_mask_t *filtered_mask;
1274    KMP_CPU_ALLOC(filtered_mask);
1275    KMP_CPU_COPY(filtered_mask, __kmp_affin_fullMask);
1276    for (int i = 0; i < num_hw_threads; ++i) {
1277      kmp_hw_thread_t &hw_thread = hw_threads[i];
1278  
1279      // Figure out the absolute sub ids and core eff/type sub ids
1280      if (is_absolute || using_core_effs || using_core_types) {
1281        for (int level = 0; level < get_depth(); ++level) {
1282          if (hw_thread.sub_ids[level] != prev_sub_ids[level]) {
1283            bool found_targeted = false;
1284            for (int j = level; j < get_depth(); ++j) {
1285              bool targeted = is_targeted(j);
1286              if (!found_targeted && targeted) {
1287                found_targeted = true;
1288                abs_sub_ids[j]++;
1289                if (j == core_level && using_core_effs)
1290                  core_eff_sub_ids[get_core_eff_index(hw_thread)]++;
1291                if (j == core_level && using_core_types)
1292                  core_type_sub_ids[get_core_type_index(hw_thread)]++;
1293              } else if (targeted) {
1294                abs_sub_ids[j] = 0;
1295                if (j == core_level && using_core_effs)
1296                  core_eff_sub_ids[get_core_eff_index(hw_thread)] = 0;
1297                if (j == core_level && using_core_types)
1298                  core_type_sub_ids[get_core_type_index(hw_thread)] = 0;
1299              }
1300            }
1301            break;
1302          }
1303        }
1304        for (int level = 0; level < get_depth(); ++level)
1305          prev_sub_ids[level] = hw_thread.sub_ids[level];
1306      }
1307  
1308      // Check to see if this hardware thread should be filtered
1309      bool should_be_filtered = false;
1310      for (int hw_subset_index = 0; hw_subset_index < hw_subset_depth;
1311           ++hw_subset_index) {
1312        const auto &hw_subset_item = __kmp_hw_subset->at(hw_subset_index);
1313        int level = topology_levels[hw_subset_index];
1314        if (level == -1)
1315          continue;
1316        if ((using_core_effs || using_core_types) && level == core_level) {
1317          // Look for the core attribute in KMP_HW_SUBSET which corresponds
1318          // to this hardware thread's core attribute. Use this num,offset plus
1319          // the running sub_id for the particular core attribute of this hardware
1320          // thread to determine if the hardware thread should be filtered or not.
1321          int attr_idx;
1322          kmp_hw_core_type_t core_type = hw_thread.attrs.get_core_type();
1323          int core_eff = hw_thread.attrs.get_core_eff();
1324          for (attr_idx = 0; attr_idx < hw_subset_item.num_attrs; ++attr_idx) {
1325            if (using_core_types &&
1326                hw_subset_item.attr[attr_idx].get_core_type() == core_type)
1327              break;
1328            if (using_core_effs &&
1329                hw_subset_item.attr[attr_idx].get_core_eff() == core_eff)
1330              break;
1331          }
1332          // This core attribute isn't in the KMP_HW_SUBSET so always filter it.
1333          if (attr_idx == hw_subset_item.num_attrs) {
1334            should_be_filtered = true;
1335            break;
1336          }
1337          int sub_id;
1338          int num = hw_subset_item.num[attr_idx];
1339          int offset = hw_subset_item.offset[attr_idx];
1340          if (using_core_types)
1341            sub_id = core_type_sub_ids[get_core_type_index(hw_thread)];
1342          else
1343            sub_id = core_eff_sub_ids[get_core_eff_index(hw_thread)];
1344          if (sub_id < offset ||
1345              (num != kmp_hw_subset_t::USE_ALL && sub_id >= offset + num)) {
1346            should_be_filtered = true;
1347            break;
1348          }
1349        } else {
1350          int sub_id;
1351          int num = hw_subset_item.num[0];
1352          int offset = hw_subset_item.offset[0];
1353          if (is_absolute)
1354            sub_id = abs_sub_ids[level];
1355          else
1356            sub_id = hw_thread.sub_ids[level];
1357          if (sub_id < offset ||
1358              (num != kmp_hw_subset_t::USE_ALL && sub_id >= offset + num)) {
1359            should_be_filtered = true;
1360            break;
1361          }
1362        }
1363      }
1364      // Collect filtering information
1365      if (should_be_filtered) {
1366        KMP_CPU_CLR(hw_thread.os_id, filtered_mask);
1367        num_filtered++;
1368      }
1369    }
1370  
1371  // One last check: do not allow filtering out the entire machine
1372    if (num_filtered == num_hw_threads) {
1373      KMP_AFF_WARNING(__kmp_affinity, AffHWSubsetAllFiltered);
1374      return false;
1375    }
1376  
1377    // Apply the filter
1378    restrict_to_mask(filtered_mask);
1379    return true;
1380  }
1381  
1382  bool kmp_topology_t::is_close(int hwt1, int hwt2,
1383                                const kmp_affinity_t &stgs) const {
1384    int hw_level = stgs.gran_levels;
1385    if (hw_level >= depth)
1386      return true;
1387    bool retval = true;
1388    const kmp_hw_thread_t &t1 = hw_threads[hwt1];
1389    const kmp_hw_thread_t &t2 = hw_threads[hwt2];
1390    if (stgs.flags.core_types_gran)
1391      return t1.attrs.get_core_type() == t2.attrs.get_core_type();
1392    if (stgs.flags.core_effs_gran)
1393      return t1.attrs.get_core_eff() == t2.attrs.get_core_eff();
1394    for (int i = 0; i < (depth - hw_level); ++i) {
1395      if (t1.ids[i] != t2.ids[i])
1396        return false;
1397    }
1398    return retval;
1399  }
1400  
1401  ////////////////////////////////////////////////////////////////////////////////
1402  
1403  bool KMPAffinity::picked_api = false;
1404  
operator new(size_t n)1405  void *KMPAffinity::Mask::operator new(size_t n) { return __kmp_allocate(n); }
operator new[](size_t n)1406  void *KMPAffinity::Mask::operator new[](size_t n) { return __kmp_allocate(n); }
operator delete(void * p)1407  void KMPAffinity::Mask::operator delete(void *p) { __kmp_free(p); }
operator delete[](void * p)1408  void KMPAffinity::Mask::operator delete[](void *p) { __kmp_free(p); }
operator new(size_t n)1409  void *KMPAffinity::operator new(size_t n) { return __kmp_allocate(n); }
operator delete(void * p)1410  void KMPAffinity::operator delete(void *p) { __kmp_free(p); }
1411  
pick_api()1412  void KMPAffinity::pick_api() {
1413    KMPAffinity *affinity_dispatch;
1414    if (picked_api)
1415      return;
1416  #if KMP_USE_HWLOC
1417    // Only use Hwloc if affinity isn't explicitly disabled and
1418    // user requests Hwloc topology method
1419    if (__kmp_affinity_top_method == affinity_top_method_hwloc &&
1420        __kmp_affinity.type != affinity_disabled) {
1421      affinity_dispatch = new KMPHwlocAffinity();
1422    } else
1423  #endif
1424    {
1425      affinity_dispatch = new KMPNativeAffinity();
1426    }
1427    __kmp_affinity_dispatch = affinity_dispatch;
1428    picked_api = true;
1429  }
1430  
destroy_api()1431  void KMPAffinity::destroy_api() {
1432    if (__kmp_affinity_dispatch != NULL) {
1433      delete __kmp_affinity_dispatch;
1434      __kmp_affinity_dispatch = NULL;
1435      picked_api = false;
1436    }
1437  }
1438  
1439  #define KMP_ADVANCE_SCAN(scan)                                                 \
1440    while (*scan != '\0') {                                                      \
1441      scan++;                                                                    \
1442    }
1443  
1444  // Print the affinity mask to the character array in a pretty format.
1445  // The format is a comma separated list of non-negative integers or integer
1446  // ranges: e.g., 1,2,3-5,7,9-15
1447  // The format can also be the string "{<empty>}" if no bits are set in mask
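// For example (illustrative values): a mask with bits {0,1,2,5,9,10} set
// prints as "0-2,5,9,10".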
__kmp_affinity_print_mask(char * buf,int buf_len,kmp_affin_mask_t * mask)1448  char *__kmp_affinity_print_mask(char *buf, int buf_len,
1449                                  kmp_affin_mask_t *mask) {
1450    int start = 0, finish = 0, previous = 0;
1451    bool first_range;
1452    KMP_ASSERT(buf);
1453    KMP_ASSERT(buf_len >= 40);
1454    KMP_ASSERT(mask);
1455    char *scan = buf;
1456    char *end = buf + buf_len - 1;
1457  
1458    // Check for empty set.
1459    if (mask->begin() == mask->end()) {
1460      KMP_SNPRINTF(scan, end - scan + 1, "{<empty>}");
1461      KMP_ADVANCE_SCAN(scan);
1462      KMP_ASSERT(scan <= end);
1463      return buf;
1464    }
1465  
1466    first_range = true;
1467    start = mask->begin();
1468    while (1) {
1469      // Find next range
1470      // [start, previous] is inclusive range of contiguous bits in mask
1471      for (finish = mask->next(start), previous = start;
1472           finish == previous + 1 && finish != mask->end();
1473           finish = mask->next(finish)) {
1474        previous = finish;
1475      }
1476  
1477      // The first range does not need a comma printed before it, but the rest
1478      // of the ranges do need a comma beforehand
1479      if (!first_range) {
1480        KMP_SNPRINTF(scan, end - scan + 1, "%s", ",");
1481        KMP_ADVANCE_SCAN(scan);
1482      } else {
1483        first_range = false;
1484      }
1485      // Range with three or more contiguous bits in the affinity mask
1486      if (previous - start > 1) {
1487        KMP_SNPRINTF(scan, end - scan + 1, "%u-%u", start, previous);
1488      } else {
1489        // Range with one or two contiguous bits in the affinity mask
1490        KMP_SNPRINTF(scan, end - scan + 1, "%u", start);
1491        KMP_ADVANCE_SCAN(scan);
1492        if (previous - start > 0) {
1493          KMP_SNPRINTF(scan, end - scan + 1, ",%u", previous);
1494        }
1495      }
1496      KMP_ADVANCE_SCAN(scan);
1497      // Start over with new start point
1498      start = finish;
1499      if (start == mask->end())
1500        break;
1501      // Check for overflow
1502      if (end - scan < 2)
1503        break;
1504    }
1505  
1506    // Check for overflow
1507    KMP_ASSERT(scan <= end);
1508    return buf;
1509  }
1510  #undef KMP_ADVANCE_SCAN
1511  
1512  // Print the affinity mask to the string buffer object in a pretty format
1513  // The format is a comma separated list of non-negative integers or integer
1514  // ranges: e.g., 1,2,3-5,7,9-15
1515  // The format can also be the string "{<empty>}" if no bits are set in mask
__kmp_affinity_str_buf_mask(kmp_str_buf_t * buf,kmp_affin_mask_t * mask)1516  kmp_str_buf_t *__kmp_affinity_str_buf_mask(kmp_str_buf_t *buf,
1517                                             kmp_affin_mask_t *mask) {
1518    int start = 0, finish = 0, previous = 0;
1519    bool first_range;
1520    KMP_ASSERT(buf);
1521    KMP_ASSERT(mask);
1522  
1523    __kmp_str_buf_clear(buf);
1524  
1525    // Check for empty set.
1526    if (mask->begin() == mask->end()) {
1527      __kmp_str_buf_print(buf, "%s", "{<empty>}");
1528      return buf;
1529    }
1530  
1531    first_range = true;
1532    start = mask->begin();
1533    while (1) {
1534      // Find next range
1535      // [start, previous] is inclusive range of contiguous bits in mask
1536      for (finish = mask->next(start), previous = start;
1537           finish == previous + 1 && finish != mask->end();
1538           finish = mask->next(finish)) {
1539        previous = finish;
1540      }
1541  
1542      // The first range does not need a comma printed before it, but the rest
1543      // of the ranges do need a comma beforehand
1544      if (!first_range) {
1545        __kmp_str_buf_print(buf, "%s", ",");
1546      } else {
1547        first_range = false;
1548      }
1549      // Range with three or more contiguous bits in the affinity mask
1550      if (previous - start > 1) {
1551        __kmp_str_buf_print(buf, "%u-%u", start, previous);
1552      } else {
1553        // Range with one or two contiguous bits in the affinity mask
1554        __kmp_str_buf_print(buf, "%u", start);
1555        if (previous - start > 0) {
1556          __kmp_str_buf_print(buf, ",%u", previous);
1557        }
1558      }
1559      // Start over with new start point
1560      start = finish;
1561      if (start == mask->end())
1562        break;
1563    }
1564    return buf;
1565  }
1566  
1567  // Return (possibly empty) affinity mask representing the offline CPUs
1568  // Caller must free the mask
__kmp_affinity_get_offline_cpus()1569  kmp_affin_mask_t *__kmp_affinity_get_offline_cpus() {
1570    kmp_affin_mask_t *offline;
1571    KMP_CPU_ALLOC(offline);
1572    KMP_CPU_ZERO(offline);
1573  #if KMP_OS_LINUX
1574    int n, begin_cpu, end_cpu;
1575    kmp_safe_raii_file_t offline_file;
1576    auto skip_ws = [](FILE *f) {
1577      int c;
1578      do {
1579        c = fgetc(f);
1580      } while (isspace(c));
1581      if (c != EOF)
1582        ungetc(c, f);
1583    };
1584    // File contains CSV of integer ranges representing the offline CPUs
1585    // e.g., 1,2,4-7,9,11-15
1586    int status = offline_file.try_open("/sys/devices/system/cpu/offline", "r");
1587    if (status != 0)
1588      return offline;
1589    while (!feof(offline_file)) {
1590      skip_ws(offline_file);
1591      n = fscanf(offline_file, "%d", &begin_cpu);
1592      if (n != 1)
1593        break;
1594      skip_ws(offline_file);
1595      int c = fgetc(offline_file);
1596      if (c == EOF || c == ',') {
1597        // Just single CPU
1598        end_cpu = begin_cpu;
1599      } else if (c == '-') {
1600        // Range of CPUs
1601        skip_ws(offline_file);
1602        n = fscanf(offline_file, "%d", &end_cpu);
1603        if (n != 1)
1604          break;
1605        skip_ws(offline_file);
1606        c = fgetc(offline_file); // skip ','
1607      } else {
1608        // Syntax problem
1609        break;
1610      }
1611      // Ensure a valid range of CPUs
1612      if (begin_cpu < 0 || begin_cpu >= __kmp_xproc || end_cpu < 0 ||
1613          end_cpu >= __kmp_xproc || begin_cpu > end_cpu) {
1614        continue;
1615      }
1616      // Insert [begin_cpu, end_cpu] into offline mask
1617      for (int cpu = begin_cpu; cpu <= end_cpu; ++cpu) {
1618        KMP_CPU_SET(cpu, offline);
1619      }
1620    }
1621  #endif
1622    return offline;
1623  }
1624  
1625  // Return the number of available procs
__kmp_affinity_entire_machine_mask(kmp_affin_mask_t * mask)1626  int __kmp_affinity_entire_machine_mask(kmp_affin_mask_t *mask) {
1627    int avail_proc = 0;
1628    KMP_CPU_ZERO(mask);
1629  
1630  #if KMP_GROUP_AFFINITY
1631  
1632    if (__kmp_num_proc_groups > 1) {
1633      int group;
1634      KMP_DEBUG_ASSERT(__kmp_GetActiveProcessorCount != NULL);
1635      for (group = 0; group < __kmp_num_proc_groups; group++) {
1636        int i;
1637        int num = __kmp_GetActiveProcessorCount(group);
1638        for (i = 0; i < num; i++) {
1639          KMP_CPU_SET(i + group * (CHAR_BIT * sizeof(DWORD_PTR)), mask);
1640          avail_proc++;
1641        }
1642      }
1643    } else
1644  
1645  #endif /* KMP_GROUP_AFFINITY */
1646  
1647    {
1648      int proc;
1649      kmp_affin_mask_t *offline_cpus = __kmp_affinity_get_offline_cpus();
1650      for (proc = 0; proc < __kmp_xproc; proc++) {
1651        // Skip offline CPUs
1652        if (KMP_CPU_ISSET(proc, offline_cpus))
1653          continue;
1654        KMP_CPU_SET(proc, mask);
1655        avail_proc++;
1656      }
1657      KMP_CPU_FREE(offline_cpus);
1658    }
1659  
1660    return avail_proc;
1661  }
1662  
1663  // All of the __kmp_affinity_create_*_map() routines should allocate the
1664  // internal topology object and set the layer ids for it.  Each routine
1665  // returns a boolean indicating whether it was successful in doing so.
1666  kmp_affin_mask_t *__kmp_affin_fullMask = NULL;
1667  // Original mask is a subset of full mask in multiple processor groups topology
1668  kmp_affin_mask_t *__kmp_affin_origMask = NULL;
1669  
1670  #if KMP_USE_HWLOC
__kmp_hwloc_is_cache_type(hwloc_obj_t obj)1671  static inline bool __kmp_hwloc_is_cache_type(hwloc_obj_t obj) {
1672  #if HWLOC_API_VERSION >= 0x00020000
1673    return hwloc_obj_type_is_cache(obj->type);
1674  #else
1675    return obj->type == HWLOC_OBJ_CACHE;
1676  #endif
1677  }
1678  
1679  // Returns KMP_HW_* type derived from HWLOC_* type
__kmp_hwloc_type_2_topology_type(hwloc_obj_t obj)1680  static inline kmp_hw_t __kmp_hwloc_type_2_topology_type(hwloc_obj_t obj) {
1681  
1682    if (__kmp_hwloc_is_cache_type(obj)) {
1683      if (obj->attr->cache.type == HWLOC_OBJ_CACHE_INSTRUCTION)
1684        return KMP_HW_UNKNOWN;
1685      switch (obj->attr->cache.depth) {
1686      case 1:
1687        return KMP_HW_L1;
1688      case 2:
1689  #if KMP_MIC_SUPPORTED
1690        if (__kmp_mic_type == mic3) {
1691          return KMP_HW_TILE;
1692        }
1693  #endif
1694        return KMP_HW_L2;
1695      case 3:
1696        return KMP_HW_L3;
1697      }
1698      return KMP_HW_UNKNOWN;
1699    }
1700  
1701    switch (obj->type) {
1702    case HWLOC_OBJ_PACKAGE:
1703      return KMP_HW_SOCKET;
1704    case HWLOC_OBJ_NUMANODE:
1705      return KMP_HW_NUMA;
1706    case HWLOC_OBJ_CORE:
1707      return KMP_HW_CORE;
1708    case HWLOC_OBJ_PU:
1709      return KMP_HW_THREAD;
1710    case HWLOC_OBJ_GROUP:
1711  #if HWLOC_API_VERSION >= 0x00020000
1712      if (obj->attr->group.kind == HWLOC_GROUP_KIND_INTEL_DIE)
1713        return KMP_HW_DIE;
1714      else if (obj->attr->group.kind == HWLOC_GROUP_KIND_INTEL_TILE)
1715        return KMP_HW_TILE;
1716      else if (obj->attr->group.kind == HWLOC_GROUP_KIND_INTEL_MODULE)
1717        return KMP_HW_MODULE;
1718      else if (obj->attr->group.kind == HWLOC_GROUP_KIND_WINDOWS_PROCESSOR_GROUP)
1719        return KMP_HW_PROC_GROUP;
1720  #endif
1721      return KMP_HW_UNKNOWN;
1722  #if HWLOC_API_VERSION >= 0x00020100
1723    case HWLOC_OBJ_DIE:
1724      return KMP_HW_DIE;
1725  #endif
1726    }
1727    return KMP_HW_UNKNOWN;
1728  }
1729  
1730  // Returns the number of objects of type 'type' below 'obj' within the topology
1731  // tree structure. e.g., if obj is a HWLOC_OBJ_PACKAGE object, and type is
1732  // HWLOC_OBJ_PU, then this will return the number of PU's under the SOCKET
1733  // object.
__kmp_hwloc_get_nobjs_under_obj(hwloc_obj_t obj,hwloc_obj_type_t type)1734  static int __kmp_hwloc_get_nobjs_under_obj(hwloc_obj_t obj,
1735                                             hwloc_obj_type_t type) {
1736    int retval = 0;
1737    hwloc_obj_t first;
1738    for (first = hwloc_get_obj_below_by_type(__kmp_hwloc_topology, obj->type,
1739                                             obj->logical_index, type, 0);
1740         first != NULL && hwloc_get_ancestor_obj_by_type(__kmp_hwloc_topology,
1741                                                         obj->type, first) == obj;
1742         first = hwloc_get_next_obj_by_type(__kmp_hwloc_topology, first->type,
1743                                            first)) {
1744      ++retval;
1745    }
1746    return retval;
1747  }
1748  
1749  // This gets the sub_id for a lower object under a higher object in the
1750  // topology tree
__kmp_hwloc_get_sub_id(hwloc_topology_t t,hwloc_obj_t higher,hwloc_obj_t lower)1751  static int __kmp_hwloc_get_sub_id(hwloc_topology_t t, hwloc_obj_t higher,
1752                                    hwloc_obj_t lower) {
1753    hwloc_obj_t obj;
1754    hwloc_obj_type_t ltype = lower->type;
1755    int lindex = lower->logical_index - 1;
1756    int sub_id = 0;
1757    // Get the previous lower object
1758    obj = hwloc_get_obj_by_type(t, ltype, lindex);
1759    while (obj && lindex >= 0 &&
1760           hwloc_bitmap_isincluded(obj->cpuset, higher->cpuset)) {
1761      if (obj->userdata) {
1762        sub_id = (int)(RCAST(kmp_intptr_t, obj->userdata));
1763        break;
1764      }
1765      sub_id++;
1766      lindex--;
1767      obj = hwloc_get_obj_by_type(t, ltype, lindex);
1768    }
1769    // store sub_id + 1 so that 0 is distinguished from NULL
1770    lower->userdata = RCAST(void *, sub_id + 1);
1771    return sub_id;
1772  }
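// For example, the third PU processed under a given core receives sub_id 2;
// the cached userdata lets later calls pick up the count from the nearest
// already-numbered sibling instead of walking all the way back to the first.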
1773  
__kmp_affinity_create_hwloc_map(kmp_i18n_id_t * const msg_id)1774  static bool __kmp_affinity_create_hwloc_map(kmp_i18n_id_t *const msg_id) {
1775    kmp_hw_t type;
1776    int hw_thread_index, sub_id;
1777    int depth;
1778    hwloc_obj_t pu, obj, root, prev;
1779    kmp_hw_t types[KMP_HW_LAST];
1780    hwloc_obj_type_t hwloc_types[KMP_HW_LAST];
1781  
1782    hwloc_topology_t tp = __kmp_hwloc_topology;
1783    *msg_id = kmp_i18n_null;
1784    if (__kmp_affinity.flags.verbose) {
1785      KMP_INFORM(AffUsingHwloc, "KMP_AFFINITY");
1786    }
1787  
1788    if (!KMP_AFFINITY_CAPABLE()) {
1789      // Hack to try and infer the machine topology using only the data
1790      // available from hwloc on the current thread, and __kmp_xproc.
1791      KMP_ASSERT(__kmp_affinity.type == affinity_none);
1792    // hwloc only guarantees existence of the PU object, so check PACKAGE and CORE
1793      hwloc_obj_t o = hwloc_get_obj_by_type(tp, HWLOC_OBJ_PACKAGE, 0);
1794      if (o != NULL)
1795        nCoresPerPkg = __kmp_hwloc_get_nobjs_under_obj(o, HWLOC_OBJ_CORE);
1796      else
1797        nCoresPerPkg = 1; // no PACKAGE found
1798      o = hwloc_get_obj_by_type(tp, HWLOC_OBJ_CORE, 0);
1799      if (o != NULL)
1800        __kmp_nThreadsPerCore = __kmp_hwloc_get_nobjs_under_obj(o, HWLOC_OBJ_PU);
1801      else
1802        __kmp_nThreadsPerCore = 1; // no CORE found
1803      if (__kmp_nThreadsPerCore == 0)
1804        __kmp_nThreadsPerCore = 1;
1805      __kmp_ncores = __kmp_xproc / __kmp_nThreadsPerCore;
1806      if (nCoresPerPkg == 0)
1807        nCoresPerPkg = 1; // to prevent possible division by 0
1808      nPackages = (__kmp_xproc + nCoresPerPkg - 1) / nCoresPerPkg;
1809      return true;
1810    }
1811  
1812  #if HWLOC_API_VERSION >= 0x00020400
1813    // Handle multiple types of cores if they exist on the system
1814    int nr_cpu_kinds = hwloc_cpukinds_get_nr(tp, 0);
1815  
1816    typedef struct kmp_hwloc_cpukinds_info_t {
1817      int efficiency;
1818      kmp_hw_core_type_t core_type;
1819      hwloc_bitmap_t mask;
1820    } kmp_hwloc_cpukinds_info_t;
1821    kmp_hwloc_cpukinds_info_t *cpukinds = nullptr;
1822  
1823    if (nr_cpu_kinds > 0) {
1824      unsigned nr_infos;
1825      struct hwloc_info_s *infos;
1826      cpukinds = (kmp_hwloc_cpukinds_info_t *)__kmp_allocate(
1827          sizeof(kmp_hwloc_cpukinds_info_t) * nr_cpu_kinds);
1828      for (unsigned idx = 0; idx < (unsigned)nr_cpu_kinds; ++idx) {
1829        cpukinds[idx].efficiency = -1;
1830        cpukinds[idx].core_type = KMP_HW_CORE_TYPE_UNKNOWN;
1831        cpukinds[idx].mask = hwloc_bitmap_alloc();
1832        if (hwloc_cpukinds_get_info(tp, idx, cpukinds[idx].mask,
1833                                    &cpukinds[idx].efficiency, &nr_infos, &infos,
1834                                    0) == 0) {
1835          for (unsigned i = 0; i < nr_infos; ++i) {
1836            if (__kmp_str_match("CoreType", 8, infos[i].name)) {
1837  #if KMP_ARCH_X86 || KMP_ARCH_X86_64
1838              if (__kmp_str_match("IntelAtom", 9, infos[i].value)) {
1839                cpukinds[idx].core_type = KMP_HW_CORE_TYPE_ATOM;
1840                break;
1841              } else if (__kmp_str_match("IntelCore", 9, infos[i].value)) {
1842                cpukinds[idx].core_type = KMP_HW_CORE_TYPE_CORE;
1843                break;
1844              }
1845  #endif
1846            }
1847          }
1848        }
1849      }
1850    }
1851  #endif
1852  
1853    root = hwloc_get_root_obj(tp);
1854  
1855    // Figure out the depth and types in the topology
1856    depth = 0;
1857    obj = hwloc_get_pu_obj_by_os_index(tp, __kmp_affin_fullMask->begin());
1858    while (obj && obj != root) {
1859  #if HWLOC_API_VERSION >= 0x00020000
1860      if (obj->memory_arity) {
1861        hwloc_obj_t memory;
1862        for (memory = obj->memory_first_child; memory;
1863             memory = hwloc_get_next_child(tp, obj, memory)) {
1864          if (memory->type == HWLOC_OBJ_NUMANODE)
1865            break;
1866        }
1867        if (memory && memory->type == HWLOC_OBJ_NUMANODE) {
1868          types[depth] = KMP_HW_NUMA;
1869          hwloc_types[depth] = memory->type;
1870          depth++;
1871        }
1872      }
1873  #endif
1874      type = __kmp_hwloc_type_2_topology_type(obj);
1875      if (type != KMP_HW_UNKNOWN) {
1876        types[depth] = type;
1877        hwloc_types[depth] = obj->type;
1878        depth++;
1879      }
1880      obj = obj->parent;
1881    }
1882    KMP_ASSERT(depth > 0);
1883  
1884    // Get the order for the types correct
1885    for (int i = 0, j = depth - 1; i < j; ++i, --j) {
1886      hwloc_obj_type_t hwloc_temp = hwloc_types[i];
1887      kmp_hw_t temp = types[i];
1888      types[i] = types[j];
1889      types[j] = temp;
1890      hwloc_types[i] = hwloc_types[j];
1891      hwloc_types[j] = hwloc_temp;
1892    }
1893  
1894    // Allocate the data structure to be returned.
1895    __kmp_topology = kmp_topology_t::allocate(__kmp_avail_proc, depth, types);
1896  
1897    hw_thread_index = 0;
1898    pu = NULL;
1899    while ((pu = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, pu))) {
1900      int index = depth - 1;
1901      bool included = KMP_CPU_ISSET(pu->os_index, __kmp_affin_fullMask);
1902      kmp_hw_thread_t &hw_thread = __kmp_topology->at(hw_thread_index);
1903      if (included) {
1904        hw_thread.clear();
1905        hw_thread.ids[index] = pu->logical_index;
1906        hw_thread.os_id = pu->os_index;
1907        // If multiple core types, then set that attribute for the hardware thread
1908  #if HWLOC_API_VERSION >= 0x00020400
1909        if (cpukinds) {
1910          int cpukind_index = -1;
1911          for (int i = 0; i < nr_cpu_kinds; ++i) {
1912            if (hwloc_bitmap_isset(cpukinds[i].mask, hw_thread.os_id)) {
1913              cpukind_index = i;
1914              break;
1915            }
1916          }
1917          if (cpukind_index >= 0) {
1918            hw_thread.attrs.set_core_type(cpukinds[cpukind_index].core_type);
1919            hw_thread.attrs.set_core_eff(cpukinds[cpukind_index].efficiency);
1920          }
1921        }
1922  #endif
1923        index--;
1924      }
1925      obj = pu;
1926      prev = obj;
1927      while (obj != root && obj != NULL) {
1928        obj = obj->parent;
1929  #if HWLOC_API_VERSION >= 0x00020000
1930        // NUMA Nodes are handled differently since they are not within the
1931        // parent/child structure anymore.  They are separate children
1932        // of obj (memory_first_child points to first memory child)
1933        if (obj->memory_arity) {
1934          hwloc_obj_t memory;
1935          for (memory = obj->memory_first_child; memory;
1936               memory = hwloc_get_next_child(tp, obj, memory)) {
1937            if (memory->type == HWLOC_OBJ_NUMANODE)
1938              break;
1939          }
1940          if (memory && memory->type == HWLOC_OBJ_NUMANODE) {
1941            sub_id = __kmp_hwloc_get_sub_id(tp, memory, prev);
1942            if (included) {
1943              hw_thread.ids[index] = memory->logical_index;
1944              hw_thread.ids[index + 1] = sub_id;
1945              index--;
1946            }
1947            prev = memory;
1948          }
1949          prev = obj;
1950        }
1951  #endif
1952        type = __kmp_hwloc_type_2_topology_type(obj);
1953        if (type != KMP_HW_UNKNOWN) {
1954          sub_id = __kmp_hwloc_get_sub_id(tp, obj, prev);
1955          if (included) {
1956            hw_thread.ids[index] = obj->logical_index;
1957            hw_thread.ids[index + 1] = sub_id;
1958            index--;
1959          }
1960          prev = obj;
1961        }
1962      }
1963      if (included)
1964        hw_thread_index++;
1965    }
1966  
1967  #if HWLOC_API_VERSION >= 0x00020400
1968    // Free the core types information
1969    if (cpukinds) {
1970      for (int idx = 0; idx < nr_cpu_kinds; ++idx)
1971        hwloc_bitmap_free(cpukinds[idx].mask);
1972      __kmp_free(cpukinds);
1973    }
1974  #endif
1975    __kmp_topology->sort_ids();
1976    return true;
1977  }
1978  #endif // KMP_USE_HWLOC
1979  
1980  // If we don't know how to retrieve the machine's processor topology, or
1981  // encounter an error in doing so, this routine is called to form a "flat"
1982  // mapping of os thread id's <-> processor id's.
__kmp_affinity_create_flat_map(kmp_i18n_id_t * const msg_id)1983  static bool __kmp_affinity_create_flat_map(kmp_i18n_id_t *const msg_id) {
1984    *msg_id = kmp_i18n_null;
1985    int depth = 3;
1986    kmp_hw_t types[] = {KMP_HW_SOCKET, KMP_HW_CORE, KMP_HW_THREAD};
1987  
1988    if (__kmp_affinity.flags.verbose) {
1989      KMP_INFORM(UsingFlatOS, "KMP_AFFINITY");
1990    }
1991  
1992    // Even if __kmp_affinity.type == affinity_none, this routine might still
1993    // be called to set __kmp_ncores, as well as
1994    // __kmp_nThreadsPerCore, nCoresPerPkg, & nPackages.
1995    if (!KMP_AFFINITY_CAPABLE()) {
1996      KMP_ASSERT(__kmp_affinity.type == affinity_none);
1997      __kmp_ncores = nPackages = __kmp_xproc;
1998      __kmp_nThreadsPerCore = nCoresPerPkg = 1;
1999      return true;
2000    }
2001  
2002    // When affinity is off, this routine will still be called to set
2003    // __kmp_ncores, as well as __kmp_nThreadsPerCore, nCoresPerPkg, & nPackages.
2004    // Make sure all these vars are set correctly, and return now if affinity is
2005    // not enabled.
2006    __kmp_ncores = nPackages = __kmp_avail_proc;
2007    __kmp_nThreadsPerCore = nCoresPerPkg = 1;
2008  
2009    // Construct the data structure to be returned.
2010    __kmp_topology = kmp_topology_t::allocate(__kmp_avail_proc, depth, types);
2011    int avail_ct = 0;
2012    int i;
2013    KMP_CPU_SET_ITERATE(i, __kmp_affin_fullMask) {
2014      // Skip this proc if it is not included in the machine model.
2015      if (!KMP_CPU_ISSET(i, __kmp_affin_fullMask)) {
2016        continue;
2017      }
2018      kmp_hw_thread_t &hw_thread = __kmp_topology->at(avail_ct);
2019      hw_thread.clear();
2020      hw_thread.os_id = i;
2021      hw_thread.ids[0] = i;
2022      hw_thread.ids[1] = 0;
2023      hw_thread.ids[2] = 0;
2024      avail_ct++;
2025    }
2026    if (__kmp_affinity.flags.verbose) {
2027      KMP_INFORM(OSProcToPackage, "KMP_AFFINITY");
2028    }
2029    return true;
2030  }
2031  
2032  #if KMP_GROUP_AFFINITY
2033  // If multiple Windows* OS processor groups exist, we can create a 2-level
2034  // topology map with the groups at level 0 and the individual procs at level 1.
2035  // This facilitates letting the threads float among all procs in a group,
2036  // if granularity=group (the default when there are multiple groups).
__kmp_affinity_create_proc_group_map(kmp_i18n_id_t * const msg_id)2037  static bool __kmp_affinity_create_proc_group_map(kmp_i18n_id_t *const msg_id) {
2038    *msg_id = kmp_i18n_null;
2039    int depth = 3;
2040    kmp_hw_t types[] = {KMP_HW_PROC_GROUP, KMP_HW_CORE, KMP_HW_THREAD};
2041    const static size_t BITS_PER_GROUP = CHAR_BIT * sizeof(DWORD_PTR);
2042  
2043    if (__kmp_affinity.flags.verbose) {
2044      KMP_INFORM(AffWindowsProcGroupMap, "KMP_AFFINITY");
2045    }
2046  
2047    // If we aren't affinity capable, then use flat topology
2048    if (!KMP_AFFINITY_CAPABLE()) {
2049      KMP_ASSERT(__kmp_affinity.type == affinity_none);
2050      nPackages = __kmp_num_proc_groups;
2051      __kmp_nThreadsPerCore = 1;
2052      __kmp_ncores = __kmp_xproc;
2053      nCoresPerPkg = nPackages / __kmp_ncores;
2054      return true;
2055    }
2056  
2057    // Construct the data structure to be returned.
2058    __kmp_topology = kmp_topology_t::allocate(__kmp_avail_proc, depth, types);
2059    int avail_ct = 0;
2060    int i;
2061    KMP_CPU_SET_ITERATE(i, __kmp_affin_fullMask) {
2062      // Skip this proc if it is not included in the machine model.
2063      if (!KMP_CPU_ISSET(i, __kmp_affin_fullMask)) {
2064        continue;
2065      }
2066      kmp_hw_thread_t &hw_thread = __kmp_topology->at(avail_ct++);
2067      hw_thread.clear();
2068      hw_thread.os_id = i;
2069      hw_thread.ids[0] = i / BITS_PER_GROUP;
2070      hw_thread.ids[1] = hw_thread.ids[2] = i % BITS_PER_GROUP;
2071    }
2072    return true;
2073  }
2074  #endif /* KMP_GROUP_AFFINITY */
2075  
2076  #if KMP_ARCH_X86 || KMP_ARCH_X86_64
2077  
2078  template <kmp_uint32 LSB, kmp_uint32 MSB>
__kmp_extract_bits(kmp_uint32 v)2079  static inline unsigned __kmp_extract_bits(kmp_uint32 v) {
2080    const kmp_uint32 SHIFT_LEFT = sizeof(kmp_uint32) * 8 - 1 - MSB;
2081    const kmp_uint32 SHIFT_RIGHT = LSB;
2082    kmp_uint32 retval = v;
2083    retval <<= SHIFT_LEFT;
2084    retval >>= (SHIFT_LEFT + SHIFT_RIGHT);
2085    return retval;
2086  }
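// For example (illustrative value): __kmp_extract_bits<24, 31>(0xAB123456)
// shifts left by 0 and right by 24, returning 0xAB (the value of bits 31..24).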
2087  
__kmp_cpuid_mask_width(int count)2088  static int __kmp_cpuid_mask_width(int count) {
2089    int r = 0;
2090  
2091    while ((1 << r) < count)
2092      ++r;
2093    return r;
2094  }
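// This returns the smallest r with (1 << r) >= count, i.e., the number of bits
// needed to encode 'count' distinct values. For example, a count of 6 gives a
// width of 3, and a count of 1 gives a width of 0.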
2095  
2096  class apicThreadInfo {
2097  public:
2098    unsigned osId; // param to __kmp_affinity_bind_thread
2099    unsigned apicId; // from cpuid after binding
2100    unsigned maxCoresPerPkg; //      ""
2101    unsigned maxThreadsPerPkg; //      ""
2102    unsigned pkgId; // inferred from above values
2103    unsigned coreId; //      ""
2104    unsigned threadId; //      ""
2105  };
2106  
__kmp_affinity_cmp_apicThreadInfo_phys_id(const void * a,const void * b)2107  static int __kmp_affinity_cmp_apicThreadInfo_phys_id(const void *a,
2108                                                       const void *b) {
2109    const apicThreadInfo *aa = (const apicThreadInfo *)a;
2110    const apicThreadInfo *bb = (const apicThreadInfo *)b;
2111    if (aa->pkgId < bb->pkgId)
2112      return -1;
2113    if (aa->pkgId > bb->pkgId)
2114      return 1;
2115    if (aa->coreId < bb->coreId)
2116      return -1;
2117    if (aa->coreId > bb->coreId)
2118      return 1;
2119    if (aa->threadId < bb->threadId)
2120      return -1;
2121    if (aa->threadId > bb->threadId)
2122      return 1;
2123    return 0;
2124  }
2125  
2126  class kmp_cache_info_t {
2127  public:
2128    struct info_t {
2129      unsigned level, mask;
2130    };
kmp_cache_info_t()2131    kmp_cache_info_t() : depth(0) { get_leaf4_levels(); }
get_depth() const2132    size_t get_depth() const { return depth; }
operator [](size_t index)2133    info_t &operator[](size_t index) { return table[index]; }
operator [](size_t index) const2134    const info_t &operator[](size_t index) const { return table[index]; }
2135  
get_topology_type(unsigned level)2136    static kmp_hw_t get_topology_type(unsigned level) {
2137      KMP_DEBUG_ASSERT(level >= 1 && level <= MAX_CACHE_LEVEL);
2138      switch (level) {
2139      case 1:
2140        return KMP_HW_L1;
2141      case 2:
2142        return KMP_HW_L2;
2143      case 3:
2144        return KMP_HW_L3;
2145      }
2146      return KMP_HW_UNKNOWN;
2147    }
2148  
2149  private:
2150    static const int MAX_CACHE_LEVEL = 3;
2151  
2152    size_t depth;
2153    info_t table[MAX_CACHE_LEVEL];
2154  
get_leaf4_levels()2155    void get_leaf4_levels() {
2156      unsigned level = 0;
2157      while (depth < MAX_CACHE_LEVEL) {
2158        unsigned cache_type, max_threads_sharing;
2159        unsigned cache_level, cache_mask_width;
2160        kmp_cpuid buf2;
2161        __kmp_x86_cpuid(4, level, &buf2);
2162        cache_type = __kmp_extract_bits<0, 4>(buf2.eax);
2163        if (!cache_type)
2164          break;
2165        // Skip instruction caches
2166        if (cache_type == 2) {
2167          level++;
2168          continue;
2169        }
2170        max_threads_sharing = __kmp_extract_bits<14, 25>(buf2.eax) + 1;
2171        cache_mask_width = __kmp_cpuid_mask_width(max_threads_sharing);
2172        cache_level = __kmp_extract_bits<5, 7>(buf2.eax);
2173        table[depth].level = cache_level;
2174        table[depth].mask = ((-1) << cache_mask_width);
2175        depth++;
2176        level++;
2177      }
2178    }
2179  };
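// For example (illustrative value): if cpuid(4) reports max_threads_sharing of
// 8 for a data cache level, then cache_mask_width is 3 and the stored mask is
// ~0x7; hardware threads whose APIC ids are equal under that mask share the
// cache.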
2180  
2181  // On IA-32 architecture and Intel(R) 64 architecture, we attempt to use
2182  // an algorithm which cycles through the available os threads, setting
2183  // the current thread's affinity mask to that thread, and then retrieves
2184  // the Apic Id for each thread context using the cpuid instruction.
__kmp_affinity_create_apicid_map(kmp_i18n_id_t * const msg_id)2185  static bool __kmp_affinity_create_apicid_map(kmp_i18n_id_t *const msg_id) {
2186    kmp_cpuid buf;
2187    *msg_id = kmp_i18n_null;
2188  
2189    if (__kmp_affinity.flags.verbose) {
2190      KMP_INFORM(AffInfoStr, "KMP_AFFINITY", KMP_I18N_STR(DecodingLegacyAPIC));
2191    }
2192  
2193    // Check if cpuid leaf 4 is supported.
2194    __kmp_x86_cpuid(0, 0, &buf);
2195    if (buf.eax < 4) {
2196      *msg_id = kmp_i18n_str_NoLeaf4Support;
2197      return false;
2198    }
2199  
2200    // The algorithm used starts by setting the affinity to each available thread
2201    // and retrieving info from the cpuid instruction, so if we are not capable of
2202    // calling __kmp_get_system_affinity() and __kmp_set_system_affinity(), then we
2203    // need to do something else - use the defaults that we calculated from
2204    // issuing cpuid without binding to each proc.
2205    if (!KMP_AFFINITY_CAPABLE()) {
2206      // Hack to try and infer the machine topology using only the data
2207      // available from cpuid on the current thread, and __kmp_xproc.
2208      KMP_ASSERT(__kmp_affinity.type == affinity_none);
2209  
2210      // Get an upper bound on the number of threads per package using cpuid(1).
2211    // On some OS/chip combinations where HT is supported by the chip but is
2212      // disabled, this value will be 2 on a single core chip. Usually, it will be
2213      // 2 if HT is enabled and 1 if HT is disabled.
2214      __kmp_x86_cpuid(1, 0, &buf);
2215      int maxThreadsPerPkg = (buf.ebx >> 16) & 0xff;
2216      if (maxThreadsPerPkg == 0) {
2217        maxThreadsPerPkg = 1;
2218      }
2219  
2220      // The num cores per pkg comes from cpuid(4). 1 must be added to the encoded
2221      // value.
2222      //
2223    // The author of cpu_count.cpp treated this as only an upper bound on the
2224      // number of cores, but I haven't seen any cases where it was greater than
2225      // the actual number of cores, so we will treat it as exact in this block of
2226      // code.
2227      //
2228      // First, we need to check if cpuid(4) is supported on this chip. To see if
2229      // cpuid(n) is supported, issue cpuid(0) and check if eax has the value n or
2230      // greater.
2231      __kmp_x86_cpuid(0, 0, &buf);
2232      if (buf.eax >= 4) {
2233        __kmp_x86_cpuid(4, 0, &buf);
2234        nCoresPerPkg = ((buf.eax >> 26) & 0x3f) + 1;
2235      } else {
2236        nCoresPerPkg = 1;
2237      }
2238  
2239      // There is no way to reliably tell if HT is enabled without issuing the
2240    // cpuid instruction from every thread and correlating the cpuid info, so
2241      // if the machine is not affinity capable, we assume that HT is off. We have
2242      // seen quite a few machines where maxThreadsPerPkg is 2, yet the machine
2243      // does not support HT.
2244      //
2245      // - Older OSes are usually found on machines with older chips, which do not
2246      //   support HT.
2247      // - The performance penalty for mistakenly identifying a machine as HT when
2248      //   it isn't (which results in blocktime being incorrectly set to 0) is
2249    //   greater than the penalty for mistakenly identifying a machine as
2250      //   being 1 thread/core when it is really HT enabled (which results in
2251      //   blocktime being incorrectly set to a positive value).
2252      __kmp_ncores = __kmp_xproc;
2253      nPackages = (__kmp_xproc + nCoresPerPkg - 1) / nCoresPerPkg;
2254      __kmp_nThreadsPerCore = 1;
2255      return true;
2256    }
2257  
2258    // From here on, we can assume that it is safe to call
2259    // __kmp_get_system_affinity() and __kmp_set_system_affinity(), even if
2260    // __kmp_affinity.type = affinity_none.
2261  
2262    // Save the affinity mask for the current thread.
2263    kmp_affinity_raii_t previous_affinity;
2264  
2265    // Run through each of the available contexts, binding the current thread
2266    // to it, and obtaining the pertinent information using the cpuid instr.
2267    //
2268    // The relevant information is:
2269    // - Apic Id: Bits 24:31 of ebx after issuing cpuid(1) - each thread context
2270    //     has a unique Apic Id, which is of the form pkg# : core# : thread#.
2271    // - Max Threads Per Pkg: Bits 16:23 of ebx after issuing cpuid(1). The value
2272    //     of this field determines the width of the core# + thread# fields in the
2273    //     Apic Id. It is also an upper bound on the number of threads per
2274    //     package, but it has been verified that situations happen where it is not
2275    //     exact. In particular, on certain OS/chip combinations where Intel(R)
2276    //     Hyper-Threading Technology is supported by the chip but has been
2277    //     disabled, the value of this field will be 2 (for a single core chip).
2278    //     On other OS/chip combinations supporting Intel(R) Hyper-Threading
2279    //     Technology, the value of this field will be 1 when Intel(R)
2280    //     Hyper-Threading Technology is disabled and 2 when it is enabled.
2281    // - Max Cores Per Pkg:  Bits 26:31 of eax after issuing cpuid(4). The value
2282    //     of this field (+1) determines the width of the core# field in the Apic
2283    //     Id. The comments in "cpucount.cpp" say that this value is an upper
2284    //     bound, but the IA-32 architecture manual says that it is exactly the
2285    //     number of cores per package, and I haven't seen any case where it
2286    //     wasn't.
2287    //
2288    // From this information, deduce the package Id, core Id, and thread Id,
2289    // and set the corresponding fields in the apicThreadInfo struct.
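  // As an illustrative decomposition (hypothetical values): with
  // maxThreadsPerPkg == 4 and maxCoresPerPkg == 2, widthCT == 2, widthC == 1,
  // and widthT == 1, so an Apic Id of 0b1011 yields pkgId == 2, coreId == 1,
  // and threadId == 1.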
2290    unsigned i;
2291    apicThreadInfo *threadInfo = (apicThreadInfo *)__kmp_allocate(
2292        __kmp_avail_proc * sizeof(apicThreadInfo));
2293    unsigned nApics = 0;
2294    KMP_CPU_SET_ITERATE(i, __kmp_affin_fullMask) {
2295      // Skip this proc if it is not included in the machine model.
2296      if (!KMP_CPU_ISSET(i, __kmp_affin_fullMask)) {
2297        continue;
2298      }
2299      KMP_DEBUG_ASSERT((int)nApics < __kmp_avail_proc);
2300  
2301      __kmp_affinity_dispatch->bind_thread(i);
2302      threadInfo[nApics].osId = i;
2303  
2304      // The apic id and max threads per pkg come from cpuid(1).
2305      __kmp_x86_cpuid(1, 0, &buf);
2306      if (((buf.edx >> 9) & 1) == 0) {
2307        __kmp_free(threadInfo);
2308        *msg_id = kmp_i18n_str_ApicNotPresent;
2309        return false;
2310      }
2311      threadInfo[nApics].apicId = (buf.ebx >> 24) & 0xff;
2312      threadInfo[nApics].maxThreadsPerPkg = (buf.ebx >> 16) & 0xff;
2313      if (threadInfo[nApics].maxThreadsPerPkg == 0) {
2314        threadInfo[nApics].maxThreadsPerPkg = 1;
2315      }
2316  
2317      // Max cores per pkg comes from cpuid(4). 1 must be added to the encoded
2318      // value.
2319      //
2320      // First, we need to check if cpuid(4) is supported on this chip. To see if
2321      // cpuid(n) is supported, issue cpuid(0) and check if eax has the value n
2322      // or greater.
2323      __kmp_x86_cpuid(0, 0, &buf);
2324      if (buf.eax >= 4) {
2325        __kmp_x86_cpuid(4, 0, &buf);
2326        threadInfo[nApics].maxCoresPerPkg = ((buf.eax >> 26) & 0x3f) + 1;
2327      } else {
2328        threadInfo[nApics].maxCoresPerPkg = 1;
2329      }
2330  
2331      // Infer the pkgId / coreId / threadId using only the info obtained locally.
2332      int widthCT = __kmp_cpuid_mask_width(threadInfo[nApics].maxThreadsPerPkg);
2333      threadInfo[nApics].pkgId = threadInfo[nApics].apicId >> widthCT;
2334  
2335      int widthC = __kmp_cpuid_mask_width(threadInfo[nApics].maxCoresPerPkg);
2336      int widthT = widthCT - widthC;
2337      if (widthT < 0) {
2338        // I've never seen this one happen, but I suppose it could, if the cpuid
2339        // instruction on a chip was really screwed up. Make sure to restore the
2340        // affinity mask before the tail call.
2341        __kmp_free(threadInfo);
2342        *msg_id = kmp_i18n_str_InvalidCpuidInfo;
2343        return false;
2344      }
2345  
2346      int maskC = (1 << widthC) - 1;
2347      threadInfo[nApics].coreId = (threadInfo[nApics].apicId >> widthT) & maskC;
2348  
2349      int maskT = (1 << widthT) - 1;
2350      threadInfo[nApics].threadId = threadInfo[nApics].apicId & maskT;
2351  
2352      nApics++;
2353    }
2354  
2355    // We've collected all the info we need.
2356    // Restore the old affinity mask for this thread.
2357    previous_affinity.restore();
2358  
2359    // Sort the threadInfo table by physical Id.
2360    qsort(threadInfo, nApics, sizeof(*threadInfo),
2361          __kmp_affinity_cmp_apicThreadInfo_phys_id);
2362  
2363    // The table is now sorted by pkgId / coreId / threadId, but we really don't
2364    // know the radix of any of the fields. pkgId's may be sparsely assigned among
2365    // the chips on a system. Although coreId's are usually assigned
2366    // [0 .. coresPerPkg-1] and threadId's are usually assigned
2367    // [0..threadsPerCore-1], we don't want to make any such assumptions.
2368    //
2369    // For that matter, we don't know what coresPerPkg and threadsPerCore (or the
2370    // total # packages) are at this point - we want to determine that now. We
2371    // only have an upper bound on the first two figures.
2372    //
2373    // We also perform a consistency check at this point: the values returned by
2374    // the cpuid instruction for any thread bound to a given package had better
2375    // return the same info for maxThreadsPerPkg and maxCoresPerPkg.
2376    nPackages = 1;
2377    nCoresPerPkg = 1;
2378    __kmp_nThreadsPerCore = 1;
2379    unsigned nCores = 1;
2380  
2381    unsigned pkgCt = 1; // to determine radii
2382    unsigned lastPkgId = threadInfo[0].pkgId;
2383    unsigned coreCt = 1;
2384    unsigned lastCoreId = threadInfo[0].coreId;
2385    unsigned threadCt = 1;
2386    unsigned lastThreadId = threadInfo[0].threadId;
2387  
2388    // intra-pkg consist checks
2389    unsigned prevMaxCoresPerPkg = threadInfo[0].maxCoresPerPkg;
2390    unsigned prevMaxThreadsPerPkg = threadInfo[0].maxThreadsPerPkg;
2391  
2392    for (i = 1; i < nApics; i++) {
2393      if (threadInfo[i].pkgId != lastPkgId) {
2394        nCores++;
2395        pkgCt++;
2396        lastPkgId = threadInfo[i].pkgId;
2397        if ((int)coreCt > nCoresPerPkg)
2398          nCoresPerPkg = coreCt;
2399        coreCt = 1;
2400        lastCoreId = threadInfo[i].coreId;
2401        if ((int)threadCt > __kmp_nThreadsPerCore)
2402          __kmp_nThreadsPerCore = threadCt;
2403        threadCt = 1;
2404        lastThreadId = threadInfo[i].threadId;
2405  
2406        // This is a different package, so go on to the next iteration without
2407        // doing any consistency checks. Reset the consistency check vars, though.
2408        prevMaxCoresPerPkg = threadInfo[i].maxCoresPerPkg;
2409        prevMaxThreadsPerPkg = threadInfo[i].maxThreadsPerPkg;
2410        continue;
2411      }
2412  
2413      if (threadInfo[i].coreId != lastCoreId) {
2414        nCores++;
2415        coreCt++;
2416        lastCoreId = threadInfo[i].coreId;
2417        if ((int)threadCt > __kmp_nThreadsPerCore)
2418          __kmp_nThreadsPerCore = threadCt;
2419        threadCt = 1;
2420        lastThreadId = threadInfo[i].threadId;
2421      } else if (threadInfo[i].threadId != lastThreadId) {
2422        threadCt++;
2423        lastThreadId = threadInfo[i].threadId;
2424      } else {
2425        __kmp_free(threadInfo);
2426        *msg_id = kmp_i18n_str_LegacyApicIDsNotUnique;
2427        return false;
2428      }
2429  
2430      // Check to make certain that the maxCoresPerPkg and maxThreadsPerPkg
2431    // fields agree between all the threads bound to a given package.
2432      if ((prevMaxCoresPerPkg != threadInfo[i].maxCoresPerPkg) ||
2433          (prevMaxThreadsPerPkg != threadInfo[i].maxThreadsPerPkg)) {
2434        __kmp_free(threadInfo);
2435        *msg_id = kmp_i18n_str_InconsistentCpuidInfo;
2436        return false;
2437      }
2438    }
2439    // When affinity is off, this routine will still be called to set
2440    // __kmp_ncores, as well as __kmp_nThreadsPerCore, nCoresPerPkg, & nPackages.
2441    // Make sure all these vars are set correctly
2442    nPackages = pkgCt;
2443    if ((int)coreCt > nCoresPerPkg)
2444      nCoresPerPkg = coreCt;
2445    if ((int)threadCt > __kmp_nThreadsPerCore)
2446      __kmp_nThreadsPerCore = threadCt;
2447    __kmp_ncores = nCores;
2448    KMP_DEBUG_ASSERT(nApics == (unsigned)__kmp_avail_proc);
2449  
2450    // Now that we've determined the number of packages, the number of cores per
2451    // package, and the number of threads per core, we can construct the data
2452    // structure that is to be returned.
2453    int idx = 0;
2454    int pkgLevel = 0;
2455    int coreLevel = 1;
2456    int threadLevel = 2;
2457    //(__kmp_nThreadsPerCore <= 1) ? -1 : ((coreLevel >= 0) ? 2 : 1);
2458    int depth = (pkgLevel >= 0) + (coreLevel >= 0) + (threadLevel >= 0);
2459    kmp_hw_t types[3];
2460    if (pkgLevel >= 0)
2461      types[idx++] = KMP_HW_SOCKET;
2462    if (coreLevel >= 0)
2463      types[idx++] = KMP_HW_CORE;
2464    if (threadLevel >= 0)
2465      types[idx++] = KMP_HW_THREAD;
2466  
2467    KMP_ASSERT(depth > 0);
2468    __kmp_topology = kmp_topology_t::allocate(nApics, depth, types);
2469  
2470    for (i = 0; i < nApics; ++i) {
2471      idx = 0;
2472      unsigned os = threadInfo[i].osId;
2473      kmp_hw_thread_t &hw_thread = __kmp_topology->at(i);
2474      hw_thread.clear();
2475  
2476      if (pkgLevel >= 0) {
2477        hw_thread.ids[idx++] = threadInfo[i].pkgId;
2478      }
2479      if (coreLevel >= 0) {
2480        hw_thread.ids[idx++] = threadInfo[i].coreId;
2481      }
2482      if (threadLevel >= 0) {
2483        hw_thread.ids[idx++] = threadInfo[i].threadId;
2484      }
2485      hw_thread.os_id = os;
2486    }
2487  
2488    __kmp_free(threadInfo);
2489    __kmp_topology->sort_ids();
2490    if (!__kmp_topology->check_ids()) {
2491      kmp_topology_t::deallocate(__kmp_topology);
2492      __kmp_topology = nullptr;
2493      *msg_id = kmp_i18n_str_LegacyApicIDsNotUnique;
2494      return false;
2495    }
2496    return true;
2497  }
2498  
2499  // Hybrid cpu detection using CPUID.1A
2500  // Thread should be pinned to processor already
__kmp_get_hybrid_info(kmp_hw_core_type_t * type,int * efficiency,unsigned * native_model_id)2501  static void __kmp_get_hybrid_info(kmp_hw_core_type_t *type, int *efficiency,
2502                                    unsigned *native_model_id) {
2503    kmp_cpuid buf;
2504    __kmp_x86_cpuid(0x1a, 0, &buf);
2505    *type = (kmp_hw_core_type_t)__kmp_extract_bits<24, 31>(buf.eax);
2506    switch (*type) {
2507    case KMP_HW_CORE_TYPE_ATOM:
2508      *efficiency = 0;
2509      break;
2510    case KMP_HW_CORE_TYPE_CORE:
2511      *efficiency = 1;
2512      break;
2513    default:
2514      *efficiency = 0;
2515    }
2516    *native_model_id = __kmp_extract_bits<0, 23>(buf.eax);
2517  }
2518  
2519  // Intel(R) microarchitecture code name Nehalem, Dunnington and later
2520  // architectures support a newer interface for specifying the x2APIC Ids,
2521  // based on CPUID.B or CPUID.1F
2522  /*
2523   * CPUID.B or 1F, Input ECX (sub leaf # aka level number)
2524      Bits            Bits            Bits           Bits
2525      31-16           15-8            7-4            4-0
2526  ---+-----------+--------------+-------------+-----------------+
2527  EAX| reserved  |   reserved   |   reserved  |  Bits to Shift  |
2528  ---+-----------|--------------+-------------+-----------------|
2529  EBX| reserved  | Num logical processors at level (16 bits)    |
2530  ---+-----------|--------------+-------------------------------|
2531  ECX| reserved  |   Level Type |      Level Number (8 bits)    |
2532  ---+-----------+--------------+-------------------------------|
2533  EDX|                    X2APIC ID (32 bits)                   |
2534  ---+----------------------------------------------------------+
2535  */
2536  
2537  enum {
2538    INTEL_LEVEL_TYPE_INVALID = 0, // Package level
2539    INTEL_LEVEL_TYPE_SMT = 1,
2540    INTEL_LEVEL_TYPE_CORE = 2,
2541    INTEL_LEVEL_TYPE_MODULE = 3,
2542    INTEL_LEVEL_TYPE_TILE = 4,
2543    INTEL_LEVEL_TYPE_DIE = 5,
2544    INTEL_LEVEL_TYPE_LAST = 6,
2545  };
2546  
2547  struct cpuid_level_info_t {
2548    unsigned level_type, mask, mask_width, nitems, cache_mask;
2549  };
2550  
__kmp_intel_type_2_topology_type(int intel_type)2551  static kmp_hw_t __kmp_intel_type_2_topology_type(int intel_type) {
2552    switch (intel_type) {
2553    case INTEL_LEVEL_TYPE_INVALID:
2554      return KMP_HW_SOCKET;
2555    case INTEL_LEVEL_TYPE_SMT:
2556      return KMP_HW_THREAD;
2557    case INTEL_LEVEL_TYPE_CORE:
2558      return KMP_HW_CORE;
2559    case INTEL_LEVEL_TYPE_TILE:
2560      return KMP_HW_TILE;
2561    case INTEL_LEVEL_TYPE_MODULE:
2562      return KMP_HW_MODULE;
2563    case INTEL_LEVEL_TYPE_DIE:
2564      return KMP_HW_DIE;
2565    }
2566    return KMP_HW_UNKNOWN;
2567  }
2568  
2569  // This function takes the topology leaf, a levels array to store the levels
2570  // detected and a bitmap of the known levels.
2571  // Returns the number of levels in the topology
2572  static unsigned
__kmp_x2apicid_get_levels(int leaf,cpuid_level_info_t levels[INTEL_LEVEL_TYPE_LAST],kmp_uint64 known_levels)2573  __kmp_x2apicid_get_levels(int leaf,
2574                            cpuid_level_info_t levels[INTEL_LEVEL_TYPE_LAST],
2575                            kmp_uint64 known_levels) {
2576    unsigned level, levels_index;
2577    unsigned level_type, mask_width, nitems;
2578    kmp_cpuid buf;
2579  
2580    // The new algorithm has each known topology layer act as the highest of any
2581    // unknown topology layers directly above it.
2582    // e.g., Suppose the layers were SMT <X> CORE <Y> <Z> PACKAGE, where <X> <Y> <Z>
2583    // are unknown topology layers. Then SMT will take the characteristics of
2584    // (SMT x <X>) and CORE will take the characteristics of (CORE x <Y> x <Z>).
2585    // This eliminates unknown portions of the topology while still keeping the
2586    // correct structure.
2587    level = levels_index = 0;
2588    do {
2589      __kmp_x86_cpuid(leaf, level, &buf);
2590      level_type = __kmp_extract_bits<8, 15>(buf.ecx);
2591      mask_width = __kmp_extract_bits<0, 4>(buf.eax);
2592      nitems = __kmp_extract_bits<0, 15>(buf.ebx);
2593      if (level_type != INTEL_LEVEL_TYPE_INVALID && nitems == 0)
2594        return 0;
2595  
2596      if (known_levels & (1ull << level_type)) {
2597        // Add a new level to the topology
2598        KMP_ASSERT(levels_index < INTEL_LEVEL_TYPE_LAST);
2599        levels[levels_index].level_type = level_type;
2600        levels[levels_index].mask_width = mask_width;
2601        levels[levels_index].nitems = nitems;
2602        levels_index++;
2603      } else {
2604        // If it is an unknown level, then logically move the previous layer up
2605        if (levels_index > 0) {
2606          levels[levels_index - 1].mask_width = mask_width;
2607          levels[levels_index - 1].nitems = nitems;
2608        }
2609      }
2610      level++;
2611    } while (level_type != INTEL_LEVEL_TYPE_INVALID);
2612  
2613    // Ensure the INTEL_LEVEL_TYPE_INVALID (Socket) layer isn't first
2614    if (levels_index == 0 || levels[0].level_type == INTEL_LEVEL_TYPE_INVALID)
2615      return 0;
2616  
2617    // Set the masks to & with apicid
2618    for (unsigned i = 0; i < levels_index; ++i) {
2619      if (levels[i].level_type != INTEL_LEVEL_TYPE_INVALID) {
2620        levels[i].mask = ~((-1) << levels[i].mask_width);
2621        levels[i].cache_mask = (-1) << levels[i].mask_width;
2622        for (unsigned j = 0; j < i; ++j)
2623          levels[i].mask ^= levels[j].mask;
2624      } else {
2625        KMP_DEBUG_ASSERT(i > 0);
2626        levels[i].mask = (-1) << levels[i - 1].mask_width;
2627        levels[i].cache_mask = 0;
2628      }
2629    }
2630    return levels_index;
2631  }
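// As an illustrative example (hypothetical mask widths): with an SMT level of
// mask_width 1 and a CORE level of mask_width 4, the SMT mask is 0x1, the CORE
// mask is 0xE (0xF with the SMT bit removed), and the package-level mask is
// ~0xF, i.e., all bits above the core field of the APIC id.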
2632  
__kmp_affinity_create_x2apicid_map(kmp_i18n_id_t * const msg_id)2633  static bool __kmp_affinity_create_x2apicid_map(kmp_i18n_id_t *const msg_id) {
2634  
2635    cpuid_level_info_t levels[INTEL_LEVEL_TYPE_LAST];
2636    kmp_hw_t types[INTEL_LEVEL_TYPE_LAST];
2637    unsigned levels_index;
2638    kmp_cpuid buf;
2639    kmp_uint64 known_levels;
2640    int topology_leaf, highest_leaf, apic_id;
2641    int num_leaves;
2642    static int leaves[] = {0, 0};
2643  
2644    kmp_i18n_id_t leaf_message_id;
2645  
2646    KMP_BUILD_ASSERT(sizeof(known_levels) * CHAR_BIT > KMP_HW_LAST);
2647  
2648    *msg_id = kmp_i18n_null;
2649    if (__kmp_affinity.flags.verbose) {
2650      KMP_INFORM(AffInfoStr, "KMP_AFFINITY", KMP_I18N_STR(Decodingx2APIC));
2651    }
2652  
2653    // Figure out the known topology levels
2654    known_levels = 0ull;
2655    for (int i = 0; i < INTEL_LEVEL_TYPE_LAST; ++i) {
2656      if (__kmp_intel_type_2_topology_type(i) != KMP_HW_UNKNOWN) {
2657        known_levels |= (1ull << i);
2658      }
2659    }
2660  
2661    // Get the highest cpuid leaf supported
2662    __kmp_x86_cpuid(0, 0, &buf);
2663    highest_leaf = buf.eax;
2664  
2665    // If a specific topology method was requested, only allow that specific leaf
2666    // otherwise, try both leaves 31 and 11 in that order
2667    num_leaves = 0;
2668    if (__kmp_affinity_top_method == affinity_top_method_x2apicid) {
2669      num_leaves = 1;
2670      leaves[0] = 11;
2671      leaf_message_id = kmp_i18n_str_NoLeaf11Support;
2672    } else if (__kmp_affinity_top_method == affinity_top_method_x2apicid_1f) {
2673      num_leaves = 1;
2674      leaves[0] = 31;
2675      leaf_message_id = kmp_i18n_str_NoLeaf31Support;
2676    } else {
2677      num_leaves = 2;
2678      leaves[0] = 31;
2679      leaves[1] = 11;
2680      leaf_message_id = kmp_i18n_str_NoLeaf11Support;
2681    }
2682  
2683    // Check to see if cpuid leaf 31 or 11 is supported.
2684    __kmp_nThreadsPerCore = nCoresPerPkg = nPackages = 1;
2685    topology_leaf = -1;
2686    for (int i = 0; i < num_leaves; ++i) {
2687      int leaf = leaves[i];
2688      if (highest_leaf < leaf)
2689        continue;
2690      __kmp_x86_cpuid(leaf, 0, &buf);
2691      if (buf.ebx == 0)
2692        continue;
2693      topology_leaf = leaf;
2694      levels_index = __kmp_x2apicid_get_levels(leaf, levels, known_levels);
2695      if (levels_index == 0)
2696        continue;
2697      break;
2698    }
2699    if (topology_leaf == -1 || levels_index == 0) {
2700      *msg_id = leaf_message_id;
2701      return false;
2702    }
2703    KMP_ASSERT(levels_index <= INTEL_LEVEL_TYPE_LAST);
2704  
2705    // The algorithm used starts by setting the affinity to each available thread
2706    // and retrieving info from the cpuid instruction, so if we are not capable of
2707    // calling __kmp_get_system_affinity() and __kmp_set_system_affinity(), then
2708    // we need to do something else - use the defaults that we calculated from
2709    // issuing cpuid without binding to each proc.
2710    if (!KMP_AFFINITY_CAPABLE()) {
2711      // Hack to try and infer the machine topology using only the data
2712      // available from cpuid on the current thread, and __kmp_xproc.
2713      KMP_ASSERT(__kmp_affinity.type == affinity_none);
2714      for (unsigned i = 0; i < levels_index; ++i) {
2715        if (levels[i].level_type == INTEL_LEVEL_TYPE_SMT) {
2716          __kmp_nThreadsPerCore = levels[i].nitems;
2717        } else if (levels[i].level_type == INTEL_LEVEL_TYPE_CORE) {
2718          nCoresPerPkg = levels[i].nitems;
2719        }
2720      }
2721      __kmp_ncores = __kmp_xproc / __kmp_nThreadsPerCore;
2722      nPackages = (__kmp_xproc + nCoresPerPkg - 1) / nCoresPerPkg;
2723      return true;
2724    }
2725  
2726    // Allocate the data structure to be returned.
2727    int depth = levels_index;
2728    for (int i = depth - 1, j = 0; i >= 0; --i, ++j)
2729      types[j] = __kmp_intel_type_2_topology_type(levels[i].level_type);
2730    __kmp_topology =
2731        kmp_topology_t::allocate(__kmp_avail_proc, levels_index, types);
2732  
2733    // Insert equivalent cache types if they exist
2734    kmp_cache_info_t cache_info;
2735    for (size_t i = 0; i < cache_info.get_depth(); ++i) {
2736      const kmp_cache_info_t::info_t &info = cache_info[i];
2737      unsigned cache_mask = info.mask;
2738      unsigned cache_level = info.level;
2739      for (unsigned j = 0; j < levels_index; ++j) {
2740        unsigned hw_cache_mask = levels[j].cache_mask;
2741        kmp_hw_t cache_type = kmp_cache_info_t::get_topology_type(cache_level);
2742        if (hw_cache_mask == cache_mask && j < levels_index - 1) {
2743          kmp_hw_t type =
2744              __kmp_intel_type_2_topology_type(levels[j + 1].level_type);
2745          __kmp_topology->set_equivalent_type(cache_type, type);
2746        }
2747      }
2748    }
2749  
2750    // From here on, we can assume that it is safe to call
2751    // __kmp_get_system_affinity() and __kmp_set_system_affinity(), even if
2752    // __kmp_affinity.type = affinity_none.
2753  
2754    // Save the affinity mask for the current thread.
2755    kmp_affinity_raii_t previous_affinity;
2756  
2757    // Run through each of the available contexts, binding the current thread
2758    // to it, and obtaining the pertinent information using the cpuid instr.
2759    unsigned int proc;
2760    int hw_thread_index = 0;
2761    KMP_CPU_SET_ITERATE(proc, __kmp_affin_fullMask) {
2762      cpuid_level_info_t my_levels[INTEL_LEVEL_TYPE_LAST];
2763      unsigned my_levels_index;
2764  
2765      // Skip this proc if it is not included in the machine model.
2766      if (!KMP_CPU_ISSET(proc, __kmp_affin_fullMask)) {
2767        continue;
2768      }
2769      KMP_DEBUG_ASSERT(hw_thread_index < __kmp_avail_proc);
2770  
2771      __kmp_affinity_dispatch->bind_thread(proc);
2772  
2773      // New algorithm
2774      __kmp_x86_cpuid(topology_leaf, 0, &buf);
2775      apic_id = buf.edx;
2776      kmp_hw_thread_t &hw_thread = __kmp_topology->at(hw_thread_index);
2777      my_levels_index =
2778          __kmp_x2apicid_get_levels(topology_leaf, my_levels, known_levels);
2779      if (my_levels_index == 0 || my_levels_index != levels_index) {
2780        *msg_id = kmp_i18n_str_InvalidCpuidInfo;
2781        return false;
2782      }
2783      hw_thread.clear();
2784      hw_thread.os_id = proc;
2785      // Put in topology information
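      // Each level's mask selects that level's bits of the x2APIC id, and the
      // shift by the previous level's mask_width turns the raw bit-field into
      // a level-relative id. e.g., with a hypothetical layout of 1 SMT bit and
      // 3 core bits, apic_id 0b101101 gives thread id 1 (bit 0) and core id
      // (0b101101 & 0b1110) >> 1 = 6.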
2786      for (unsigned j = 0, idx = depth - 1; j < my_levels_index; ++j, --idx) {
2787        hw_thread.ids[idx] = apic_id & my_levels[j].mask;
2788        if (j > 0) {
2789          hw_thread.ids[idx] >>= my_levels[j - 1].mask_width;
2790        }
2791      }
2792      // Hybrid information
2793      if (__kmp_is_hybrid_cpu() && highest_leaf >= 0x1a) {
2794        kmp_hw_core_type_t type;
2795        unsigned native_model_id;
2796        int efficiency;
2797        __kmp_get_hybrid_info(&type, &efficiency, &native_model_id);
2798        hw_thread.attrs.set_core_type(type);
2799        hw_thread.attrs.set_core_eff(efficiency);
2800      }
2801      hw_thread_index++;
2802    }
2803    KMP_ASSERT(hw_thread_index > 0);
2804    __kmp_topology->sort_ids();
2805    if (!__kmp_topology->check_ids()) {
2806      kmp_topology_t::deallocate(__kmp_topology);
2807      __kmp_topology = nullptr;
2808      *msg_id = kmp_i18n_str_x2ApicIDsNotUnique;
2809      return false;
2810    }
2811    return true;
2812  }
2813  #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
2814  
2815  #define osIdIndex 0
2816  #define threadIdIndex 1
2817  #define coreIdIndex 2
2818  #define pkgIdIndex 3
2819  #define nodeIdIndex 4
2820  
2821  typedef unsigned *ProcCpuInfo;
2822  static unsigned maxIndex = pkgIdIndex;
2823  
2824  static int __kmp_affinity_cmp_ProcCpuInfo_phys_id(const void *a,
2825                                                    const void *b) {
2826    unsigned i;
2827    const unsigned *aa = *(unsigned *const *)a;
2828    const unsigned *bb = *(unsigned *const *)b;
2829    for (i = maxIndex;; i--) {
2830      if (aa[i] < bb[i])
2831        return -1;
2832      if (aa[i] > bb[i])
2833        return 1;
2834      if (i == osIdIndex)
2835        break;
2836    }
2837    return 0;
2838  }
2839  
2840  #if KMP_USE_HIER_SCHED
2841  // Set the array sizes for the hierarchy layers
2842  static void __kmp_dispatch_set_hierarchy_values() {
2843    // Set the maximum number of L1's to the number of cores.
2844    // Set the maximum number of L2's to either the number of cores / 2 for the
2845    // Intel(R) Xeon Phi(TM) coprocessor formerly codenamed Knights Landing,
2846    // or to the number of cores for Intel(R) Xeon(R) processors.
2847    // Set the maximum number of NUMA nodes and L3's to the number of packages.
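  // e.g., on a hypothetical 2-package machine with 8 cores/package and
  // 2 threads/core (non-KNL): THREAD -> 32 units, L1 -> 16, L2 -> 16,
  // L3 -> 2, NUMA -> 2, LOOP -> 1.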
2848    __kmp_hier_max_units[kmp_hier_layer_e::LAYER_THREAD + 1] =
2849        nPackages * nCoresPerPkg * __kmp_nThreadsPerCore;
2850    __kmp_hier_max_units[kmp_hier_layer_e::LAYER_L1 + 1] = __kmp_ncores;
2851  #if KMP_ARCH_X86_64 &&                                                         \
2852      (KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_NETBSD || KMP_OS_DRAGONFLY ||    \
2853       KMP_OS_WINDOWS) &&                                                        \
2854      KMP_MIC_SUPPORTED
2855    if (__kmp_mic_type >= mic3)
2856      __kmp_hier_max_units[kmp_hier_layer_e::LAYER_L2 + 1] = __kmp_ncores / 2;
2857    else
2858  #endif // KMP_ARCH_X86_64 && (KMP_OS_LINUX || ...) && KMP_MIC_SUPPORTED
2859      __kmp_hier_max_units[kmp_hier_layer_e::LAYER_L2 + 1] = __kmp_ncores;
2860    __kmp_hier_max_units[kmp_hier_layer_e::LAYER_L3 + 1] = nPackages;
2861    __kmp_hier_max_units[kmp_hier_layer_e::LAYER_NUMA + 1] = nPackages;
2862    __kmp_hier_max_units[kmp_hier_layer_e::LAYER_LOOP + 1] = 1;
2863    // Set the number of threads per unit
2864    // Number of hardware threads per L1/L2/L3/NUMA/LOOP
2865    __kmp_hier_threads_per[kmp_hier_layer_e::LAYER_THREAD + 1] = 1;
2866    __kmp_hier_threads_per[kmp_hier_layer_e::LAYER_L1 + 1] =
2867        __kmp_nThreadsPerCore;
2868  #if KMP_ARCH_X86_64 &&                                                         \
2869      (KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_NETBSD || KMP_OS_DRAGONFLY ||    \
2870       KMP_OS_WINDOWS) &&                                                        \
2871      KMP_MIC_SUPPORTED
2872    if (__kmp_mic_type >= mic3)
2873      __kmp_hier_threads_per[kmp_hier_layer_e::LAYER_L2 + 1] =
2874          2 * __kmp_nThreadsPerCore;
2875    else
2876  #endif // KMP_ARCH_X86_64 && (KMP_OS_LINUX || ...) && KMP_MIC_SUPPORTED
2877      __kmp_hier_threads_per[kmp_hier_layer_e::LAYER_L2 + 1] =
2878          __kmp_nThreadsPerCore;
2879    __kmp_hier_threads_per[kmp_hier_layer_e::LAYER_L3 + 1] =
2880        nCoresPerPkg * __kmp_nThreadsPerCore;
2881    __kmp_hier_threads_per[kmp_hier_layer_e::LAYER_NUMA + 1] =
2882        nCoresPerPkg * __kmp_nThreadsPerCore;
2883    __kmp_hier_threads_per[kmp_hier_layer_e::LAYER_LOOP + 1] =
2884        nPackages * nCoresPerPkg * __kmp_nThreadsPerCore;
2885  }
2886  
2887  // Return the index into the hierarchy for this tid and layer type (L1, L2, etc)
2888  // i.e., this thread's L1 or this thread's L2, etc.
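// e.g., with 2 threads per L1 (one core) and 16 L1 units, tid 5 maps to
// L1 index (5 / 2) % 16 = 2.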
2889  int __kmp_dispatch_get_index(int tid, kmp_hier_layer_e type) {
2890    int index = type + 1;
2891    int num_hw_threads = __kmp_hier_max_units[kmp_hier_layer_e::LAYER_THREAD + 1];
2892    KMP_DEBUG_ASSERT(type != kmp_hier_layer_e::LAYER_LAST);
2893    if (type == kmp_hier_layer_e::LAYER_THREAD)
2894      return tid;
2895    else if (type == kmp_hier_layer_e::LAYER_LOOP)
2896      return 0;
2897    KMP_DEBUG_ASSERT(__kmp_hier_max_units[index] != 0);
2898    if (tid >= num_hw_threads)
2899      tid = tid % num_hw_threads;
2900    return (tid / __kmp_hier_threads_per[index]) % __kmp_hier_max_units[index];
2901  }
2902  
2903  // Return the number of t1's per t2
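// e.g., the number of L1's per L3 is
// __kmp_hier_threads_per[L3+1] / __kmp_hier_threads_per[L1+1]
// = (nCoresPerPkg * __kmp_nThreadsPerCore) / __kmp_nThreadsPerCore
// = nCoresPerPkg.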
2904  int __kmp_dispatch_get_t1_per_t2(kmp_hier_layer_e t1, kmp_hier_layer_e t2) {
2905    int i1 = t1 + 1;
2906    int i2 = t2 + 1;
2907    KMP_DEBUG_ASSERT(i1 <= i2);
2908    KMP_DEBUG_ASSERT(t1 != kmp_hier_layer_e::LAYER_LAST);
2909    KMP_DEBUG_ASSERT(t2 != kmp_hier_layer_e::LAYER_LAST);
2910    KMP_DEBUG_ASSERT(__kmp_hier_threads_per[i1] != 0);
2911    // (nthreads/t2) / (nthreads/t1) = t1 / t2
2912    return __kmp_hier_threads_per[i2] / __kmp_hier_threads_per[i1];
2913  }
2914  #endif // KMP_USE_HIER_SCHED
2915  
2916  static inline const char *__kmp_cpuinfo_get_filename() {
2917    const char *filename;
2918    if (__kmp_cpuinfo_file != nullptr)
2919      filename = __kmp_cpuinfo_file;
2920    else
2921      filename = "/proc/cpuinfo";
2922    return filename;
2923  }
2924  
2925  static inline const char *__kmp_cpuinfo_get_envvar() {
2926    const char *envvar = nullptr;
2927    if (__kmp_cpuinfo_file != nullptr)
2928      envvar = "KMP_CPUINFO_FILE";
2929    return envvar;
2930  }
2931  
2932  // Parse /proc/cpuinfo (or an alternate file in the same format) to obtain the
2933  // affinity map. On AIX, the map is obtained through system SRAD (Scheduler
2934  // Resource Allocation Domain).
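// A typical x86 Linux record looks roughly like
//   processor       : 0
//   physical id     : 0
//   core id         : 0
// with records separated by blank lines; only the fields recognized below
// are used.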
2935  static bool __kmp_affinity_create_cpuinfo_map(int *line,
2936                                                kmp_i18n_id_t *const msg_id) {
2937    *msg_id = kmp_i18n_null;
2938  
2939  #if KMP_OS_AIX
2940    unsigned num_records = __kmp_xproc;
2941  #else
2942    const char *filename = __kmp_cpuinfo_get_filename();
2943    const char *envvar = __kmp_cpuinfo_get_envvar();
2944  
2945    if (__kmp_affinity.flags.verbose) {
2946      KMP_INFORM(AffParseFilename, "KMP_AFFINITY", filename);
2947    }
2948  
2949    kmp_safe_raii_file_t f(filename, "r", envvar);
2950  
2951    // First scan of the file: count the number of "processor" (osId) fields,
2952    // and find the highest value of <n> for a node_<n> field.
2953    char buf[256];
2954    unsigned num_records = 0;
2955    while (!feof(f)) {
2956      buf[sizeof(buf) - 1] = 1;
2957      if (!fgets(buf, sizeof(buf), f)) {
2958        // Read errors presumably because of EOF
2959        break;
2960      }
2961  
2962      char s1[] = "processor";
2963      if (strncmp(buf, s1, sizeof(s1) - 1) == 0) {
2964        num_records++;
2965        continue;
2966      }
2967  
2968      // FIXME - this will match "node_<n> <garbage>"
2969      unsigned level;
2970      if (KMP_SSCANF(buf, "node_%u id", &level) == 1) {
2971      // validate the input first:
2972        if (level > (unsigned)__kmp_xproc) { // level is too big
2973          level = __kmp_xproc;
2974        }
2975        if (nodeIdIndex + level >= maxIndex) {
2976          maxIndex = nodeIdIndex + level;
2977        }
2978        continue;
2979      }
2980    }
2981  
2982    // Check for empty file / no valid processor records, or too many. The number
2983    // of records can't exceed the number of valid bits in the affinity mask.
2984    if (num_records == 0) {
2985      *msg_id = kmp_i18n_str_NoProcRecords;
2986      return false;
2987    }
2988    if (num_records > (unsigned)__kmp_xproc) {
2989      *msg_id = kmp_i18n_str_TooManyProcRecords;
2990      return false;
2991    }
2992  
2993    // Set the file pointer back to the beginning, so that we can scan the file
2994    // again, this time performing a full parse of the data. Allocate a vector of
2995    // ProcCpuInfo object, where we will place the data. Adding an extra element
2996    // ProcCpuInfo objects, where we will place the data. Adding an extra element
2997    // conditions.
2998    if (fseek(f, 0, SEEK_SET) != 0) {
2999      *msg_id = kmp_i18n_str_CantRewindCpuinfo;
3000      return false;
3001    }
3002  #endif // KMP_OS_AIX
3003  
3004    // Allocate the array of records to store the proc info in.  The dummy
3005    // element at the end makes the logic in filling them out easier to code.
3006    unsigned **threadInfo =
3007        (unsigned **)__kmp_allocate((num_records + 1) * sizeof(unsigned *));
3008    unsigned i;
3009    for (i = 0; i <= num_records; i++) {
3010      threadInfo[i] =
3011          (unsigned *)__kmp_allocate((maxIndex + 1) * sizeof(unsigned));
3012    }
3013  
3014  #define CLEANUP_THREAD_INFO                                                    \
3015    for (i = 0; i <= num_records; i++) {                                         \
3016      __kmp_free(threadInfo[i]);                                                 \
3017    }                                                                            \
3018    __kmp_free(threadInfo);
3019  
3020    // A value of UINT_MAX means that we didn't find the field
3021    unsigned __index;
3022  
3023  #define INIT_PROC_INFO(p)                                                      \
3024    for (__index = 0; __index <= maxIndex; __index++) {                          \
3025      (p)[__index] = UINT_MAX;                                                   \
3026    }
3027  
3028    for (i = 0; i <= num_records; i++) {
3029      INIT_PROC_INFO(threadInfo[i]);
3030    }
3031  
3032  #if KMP_OS_AIX
3033    int smt_threads;
3034    lpar_info_format1_t cpuinfo;
3035    unsigned num_avail = __kmp_xproc;
3036  
3037    if (__kmp_affinity.flags.verbose)
3038      KMP_INFORM(AffParseFilename, "KMP_AFFINITY", "system info for topology");
3039  
3040    // Get the number of SMT threads per core.
3041    smt_threads = syssmt(GET_NUMBER_SMT_SETS, 0, 0, NULL);
3042  
3043    // Allocate a resource set containing available system resources.
3044    rsethandle_t sys_rset = rs_alloc(RS_SYSTEM);
3045    if (sys_rset == NULL) {
3046      CLEANUP_THREAD_INFO;
3047      *msg_id = kmp_i18n_str_UnknownTopology;
3048      return false;
3049    }
3050    // Allocate a resource set for the SRAD info.
3051    rsethandle_t srad = rs_alloc(RS_EMPTY);
3052    if (srad == NULL) {
3053      rs_free(sys_rset);
3054      CLEANUP_THREAD_INFO;
3055      *msg_id = kmp_i18n_str_UnknownTopology;
3056      return false;
3057    }
3058  
3059    // Get the SRAD system detail level.
3060    int sradsdl = rs_getinfo(NULL, R_SRADSDL, 0);
3061    if (sradsdl < 0) {
3062      rs_free(sys_rset);
3063      rs_free(srad);
3064      CLEANUP_THREAD_INFO;
3065      *msg_id = kmp_i18n_str_UnknownTopology;
3066      return false;
3067    }
3068    // Get the number of RADs at that SRAD SDL.
3069    int num_rads = rs_numrads(sys_rset, sradsdl, 0);
3070    if (num_rads < 0) {
3071      rs_free(sys_rset);
3072      rs_free(srad);
3073      CLEANUP_THREAD_INFO;
3074      *msg_id = kmp_i18n_str_UnknownTopology;
3075      return false;
3076    }
3077  
3078    // Get the maximum number of procs that may be contained in a resource set.
3079    int max_procs = rs_getinfo(NULL, R_MAXPROCS, 0);
3080    if (max_procs < 0) {
3081      rs_free(sys_rset);
3082      rs_free(srad);
3083      CLEANUP_THREAD_INFO;
3084      *msg_id = kmp_i18n_str_UnknownTopology;
3085      return false;
3086    }
3087  
3088    int cur_rad = 0;
3089    int num_set = 0;
3090    for (int srad_idx = 0; cur_rad < num_rads && srad_idx < VMI_MAXRADS;
3091         ++srad_idx) {
3092      // Check if the SRAD is available in the RSET.
3093      if (rs_getrad(sys_rset, srad, sradsdl, srad_idx, 0) < 0)
3094        continue;
3095  
3096      for (int cpu = 0; cpu < max_procs; cpu++) {
3097        // Set the info for the cpu if it is in the SRAD.
3098        if (rs_op(RS_TESTRESOURCE, srad, NULL, R_PROCS, cpu)) {
3099          threadInfo[cpu][osIdIndex] = cpu;
3100          threadInfo[cpu][pkgIdIndex] = cur_rad;
3101          threadInfo[cpu][coreIdIndex] = cpu / smt_threads;
3102          ++num_set;
3103          if (num_set >= num_avail) {
3104            // Done if all available CPUs have been set.
3105            break;
3106          }
3107        }
3108      }
3109      ++cur_rad;
3110    }
3111    rs_free(sys_rset);
3112    rs_free(srad);
3113  
3114    // The topology is already sorted.
3115  
3116  #else // !KMP_OS_AIX
3117    unsigned num_avail = 0;
3118    *line = 0;
3119  #if KMP_ARCH_S390X
3120    bool reading_s390x_sys_info = true;
3121  #endif
3122    while (!feof(f)) {
3123      // Create an inner scoping level, so that all the goto targets at the end of
3124      // the loop appear in an outer scoping level. This avoids warnings about
3125      // jumping past an initialization to a target in the same block.
3126      {
3127        buf[sizeof(buf) - 1] = 1;
3128        bool long_line = false;
3129        if (!fgets(buf, sizeof(buf), f)) {
3130          // Read errors presumably because of EOF
3131          // If there is valid data in threadInfo[num_avail], then fake
3132          // a blank line to ensure that the last address gets parsed.
3133          bool valid = false;
3134          for (i = 0; i <= maxIndex; i++) {
3135            if (threadInfo[num_avail][i] != UINT_MAX) {
3136              valid = true;
3137            }
3138          }
3139          if (!valid) {
3140            break;
3141          }
3142          buf[0] = 0;
3143        } else if (!buf[sizeof(buf) - 1]) {
3144          // The line is longer than the buffer.  Set a flag and don't
3145          // emit an error if we were going to ignore the line, anyway.
3146          long_line = true;
3147  
3148  #define CHECK_LINE                                                             \
3149    if (long_line) {                                                             \
3150      CLEANUP_THREAD_INFO;                                                       \
3151      *msg_id = kmp_i18n_str_LongLineCpuinfo;                                    \
3152      return false;                                                              \
3153    }
3154        }
3155        (*line)++;
3156  
3157  #if KMP_ARCH_LOONGARCH64
3158        // The parsing logic of /proc/cpuinfo in this function highly depends on
3159        // the blank lines between each processor info block. But on LoongArch a
3160        // blank line exists before the first processor info block (i.e. after the
3161        // "system type" line). This blank line was added because the "system
3162        // type" line is unrelated to any of the CPUs. We must skip this line so
3163        // that the original logic works on LoongArch.
3164        if (*buf == '\n' && *line == 2)
3165          continue;
3166  #endif
3167  #if KMP_ARCH_S390X
3168        // s390x /proc/cpuinfo starts with a variable number of lines containing
3169        // the overall system information. Skip them.
3170        if (reading_s390x_sys_info) {
3171          if (*buf == '\n')
3172            reading_s390x_sys_info = false;
3173          continue;
3174        }
3175  #endif
3176  
3177  #if KMP_ARCH_S390X
3178        char s1[] = "cpu number";
3179  #else
3180        char s1[] = "processor";
3181  #endif
3182        if (strncmp(buf, s1, sizeof(s1) - 1) == 0) {
3183          CHECK_LINE;
3184          char *p = strchr(buf + sizeof(s1) - 1, ':');
3185          unsigned val;
3186          if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1))
3187            goto no_val;
3188          if (threadInfo[num_avail][osIdIndex] != UINT_MAX)
3189  #if KMP_ARCH_AARCH64
3190            // Handle the old AArch64 /proc/cpuinfo layout differently,
3191            // it contains all of the 'processor' entries listed in a
3192          // single 'Processor' section, therefore the normal check for
3193          // duplicates in that section will always fail.
3194            num_avail++;
3195  #else
3196            goto dup_field;
3197  #endif
3198          threadInfo[num_avail][osIdIndex] = val;
3199  #if KMP_OS_LINUX && !(KMP_ARCH_X86 || KMP_ARCH_X86_64)
3200          char path[256];
3201          KMP_SNPRINTF(
3202              path, sizeof(path),
3203              "/sys/devices/system/cpu/cpu%u/topology/physical_package_id",
3204              threadInfo[num_avail][osIdIndex]);
3205          __kmp_read_from_file(path, "%u", &threadInfo[num_avail][pkgIdIndex]);
3206  
3207  #if KMP_ARCH_S390X
3208          // Disambiguate physical_package_id.
3209          unsigned book_id;
3210          KMP_SNPRINTF(path, sizeof(path),
3211                       "/sys/devices/system/cpu/cpu%u/topology/book_id",
3212                       threadInfo[num_avail][osIdIndex]);
3213          __kmp_read_from_file(path, "%u", &book_id);
3214          threadInfo[num_avail][pkgIdIndex] |= (book_id << 8);
3215  
3216          unsigned drawer_id;
3217          KMP_SNPRINTF(path, sizeof(path),
3218                       "/sys/devices/system/cpu/cpu%u/topology/drawer_id",
3219                       threadInfo[num_avail][osIdIndex]);
3220          __kmp_read_from_file(path, "%u", &drawer_id);
3221          threadInfo[num_avail][pkgIdIndex] |= (drawer_id << 16);
3222  #endif
3223  
3224          KMP_SNPRINTF(path, sizeof(path),
3225                       "/sys/devices/system/cpu/cpu%u/topology/core_id",
3226                       threadInfo[num_avail][osIdIndex]);
3227          __kmp_read_from_file(path, "%u", &threadInfo[num_avail][coreIdIndex]);
3228          continue;
3229  #else
3230        }
3231        char s2[] = "physical id";
3232        if (strncmp(buf, s2, sizeof(s2) - 1) == 0) {
3233          CHECK_LINE;
3234          char *p = strchr(buf + sizeof(s2) - 1, ':');
3235          unsigned val;
3236          if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1))
3237            goto no_val;
3238          if (threadInfo[num_avail][pkgIdIndex] != UINT_MAX)
3239            goto dup_field;
3240          threadInfo[num_avail][pkgIdIndex] = val;
3241          continue;
3242        }
3243        char s3[] = "core id";
3244        if (strncmp(buf, s3, sizeof(s3) - 1) == 0) {
3245          CHECK_LINE;
3246          char *p = strchr(buf + sizeof(s3) - 1, ':');
3247          unsigned val;
3248          if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1))
3249            goto no_val;
3250          if (threadInfo[num_avail][coreIdIndex] != UINT_MAX)
3251            goto dup_field;
3252          threadInfo[num_avail][coreIdIndex] = val;
3253          continue;
3254  #endif // KMP_OS_LINUX && !(KMP_ARCH_X86 || KMP_ARCH_X86_64)
3255        }
3256        char s4[] = "thread id";
3257        if (strncmp(buf, s4, sizeof(s4) - 1) == 0) {
3258          CHECK_LINE;
3259          char *p = strchr(buf + sizeof(s4) - 1, ':');
3260          unsigned val;
3261          if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1))
3262            goto no_val;
3263          if (threadInfo[num_avail][threadIdIndex] != UINT_MAX)
3264            goto dup_field;
3265          threadInfo[num_avail][threadIdIndex] = val;
3266          continue;
3267        }
3268        unsigned level;
3269        if (KMP_SSCANF(buf, "node_%u id", &level) == 1) {
3270          CHECK_LINE;
3271          char *p = strchr(buf + sizeof(s4) - 1, ':');
3272          unsigned val;
3273          if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1))
3274            goto no_val;
3275          // validate the input before using level:
3276          if (level > (unsigned)__kmp_xproc) { // level is too big
3277            level = __kmp_xproc;
3278          }
3279          if (threadInfo[num_avail][nodeIdIndex + level] != UINT_MAX)
3280            goto dup_field;
3281          threadInfo[num_avail][nodeIdIndex + level] = val;
3282          continue;
3283        }
3284  
3285        // We didn't recognize the leading token on the line. There are lots of
3286        // leading tokens that we don't recognize - if the line isn't empty, go on
3287        // to the next line.
3288        if ((*buf != 0) && (*buf != '\n')) {
3289          // If the line is longer than the buffer, read characters
3290          // until we find a newline.
3291          if (long_line) {
3292            int ch;
3293            while (((ch = fgetc(f)) != EOF) && (ch != '\n'))
3294              ;
3295          }
3296          continue;
3297        }
3298  
3299        // A newline has signalled the end of the processor record.
3300        // Check that there aren't too many procs specified.
3301        if ((int)num_avail == __kmp_xproc) {
3302          CLEANUP_THREAD_INFO;
3303          *msg_id = kmp_i18n_str_TooManyEntries;
3304          return false;
3305        }
3306  
3307        // Check for missing fields.  The osId field must be there, and we
3308        // currently require that the physical id field is specified, also.
3309        if (threadInfo[num_avail][osIdIndex] == UINT_MAX) {
3310          CLEANUP_THREAD_INFO;
3311          *msg_id = kmp_i18n_str_MissingProcField;
3312          return false;
3313        }
3314        if (threadInfo[0][pkgIdIndex] == UINT_MAX) {
3315          CLEANUP_THREAD_INFO;
3316          *msg_id = kmp_i18n_str_MissingPhysicalIDField;
3317          return false;
3318        }
3319  
3320        // Skip this proc if it is not included in the machine model.
3321        if (KMP_AFFINITY_CAPABLE() &&
3322            !KMP_CPU_ISSET(threadInfo[num_avail][osIdIndex],
3323                           __kmp_affin_fullMask)) {
3324          INIT_PROC_INFO(threadInfo[num_avail]);
3325          continue;
3326        }
3327  
3328        // We have a successful parse of this proc's info.
3329        // Increment the counter, and prepare for the next proc.
3330        num_avail++;
3331        KMP_ASSERT(num_avail <= num_records);
3332        INIT_PROC_INFO(threadInfo[num_avail]);
3333      }
3334      continue;
3335  
3336    no_val:
3337      CLEANUP_THREAD_INFO;
3338      *msg_id = kmp_i18n_str_MissingValCpuinfo;
3339      return false;
3340  
3341    dup_field:
3342      CLEANUP_THREAD_INFO;
3343      *msg_id = kmp_i18n_str_DuplicateFieldCpuinfo;
3344      return false;
3345    }
3346    *line = 0;
3347  
3348  #if KMP_MIC && REDUCE_TEAM_SIZE
3349    unsigned teamSize = 0;
3350  #endif // KMP_MIC && REDUCE_TEAM_SIZE
3351  
3352    // check for num_records == __kmp_xproc ???
3353  
3354    // If the package level is configured to be omitted when there is only a
3355    // single package, the logic at the end of this routine won't work if there
3356    // is only a single thread.
3357    KMP_ASSERT(num_avail > 0);
3358    KMP_ASSERT(num_avail <= num_records);
3359  
3360    // Sort the threadInfo table by physical Id.
3361    qsort(threadInfo, num_avail, sizeof(*threadInfo),
3362          __kmp_affinity_cmp_ProcCpuInfo_phys_id);
3363  
3364  #endif // KMP_OS_AIX
3365  
3366    // The table is now sorted by pkgId / coreId / threadId, but we really don't
3367    // know the radix of any of the fields. pkgId's may be sparsely assigned among
3368    // the chips on a system. Although coreId's are usually assigned
3369    // [0 .. coresPerPkg-1] and threadId's are usually assigned
3370    // [0..threadsPerCore-1], we don't want to make any such assumptions.
3371    //
3372    // For that matter, we don't know what coresPerPkg and threadsPerCore (or the
3373    // total # packages) are at this point - we want to determine that now. We
3374    // only have an upper bound on the first two figures.
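  // For example (hypothetical), a two-socket box might report physical ids
  // 0 and 3, so the counts are derived by walking the sorted table and
  // counting id transitions rather than by taking max id + 1.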
3375    unsigned *counts =
3376        (unsigned *)__kmp_allocate((maxIndex + 1) * sizeof(unsigned));
3377    unsigned *maxCt =
3378        (unsigned *)__kmp_allocate((maxIndex + 1) * sizeof(unsigned));
3379    unsigned *totals =
3380        (unsigned *)__kmp_allocate((maxIndex + 1) * sizeof(unsigned));
3381    unsigned *lastId =
3382        (unsigned *)__kmp_allocate((maxIndex + 1) * sizeof(unsigned));
3383  
3384    bool assign_thread_ids = false;
3385    unsigned threadIdCt;
3386    unsigned index;
3387  
3388  restart_radix_check:
3389    threadIdCt = 0;
3390  
3391    // Initialize the counter arrays with data from threadInfo[0].
3392    if (assign_thread_ids) {
3393      if (threadInfo[0][threadIdIndex] == UINT_MAX) {
3394        threadInfo[0][threadIdIndex] = threadIdCt++;
3395      } else if (threadIdCt <= threadInfo[0][threadIdIndex]) {
3396        threadIdCt = threadInfo[0][threadIdIndex] + 1;
3397      }
3398    }
3399    for (index = 0; index <= maxIndex; index++) {
3400      counts[index] = 1;
3401      maxCt[index] = 1;
3402      totals[index] = 1;
3403      lastId[index] = threadInfo[0][index];
3405    }
3406  
3407    // Run through the rest of the OS procs.
3408    for (i = 1; i < num_avail; i++) {
3409      // Find the most significant index whose id differs from the id for the
3410      // previous OS proc.
3411      for (index = maxIndex; index >= threadIdIndex; index--) {
3412        if (assign_thread_ids && (index == threadIdIndex)) {
3413          // Auto-assign the thread id field if it wasn't specified.
3414          if (threadInfo[i][threadIdIndex] == UINT_MAX) {
3415            threadInfo[i][threadIdIndex] = threadIdCt++;
3416          }
3417          // Apparently the thread id field was specified for some entries and not
3418          // others. Start the thread id counter off at the next higher thread id.
3419          else if (threadIdCt <= threadInfo[i][threadIdIndex]) {
3420            threadIdCt = threadInfo[i][threadIdIndex] + 1;
3421          }
3422        }
3423        if (threadInfo[i][index] != lastId[index]) {
3424          // Run through all indices which are less significant, and reset the
3425          // counts to 1. At all levels up to and including index, we need to
3426          // increment the totals and record the last id.
3427          unsigned index2;
3428          for (index2 = threadIdIndex; index2 < index; index2++) {
3429            totals[index2]++;
3430            if (counts[index2] > maxCt[index2]) {
3431              maxCt[index2] = counts[index2];
3432            }
3433            counts[index2] = 1;
3434            lastId[index2] = threadInfo[i][index2];
3435          }
3436          counts[index]++;
3437          totals[index]++;
3438          lastId[index] = threadInfo[i][index];
3439  
3440          if (assign_thread_ids && (index > threadIdIndex)) {
3441  
3442  #if KMP_MIC && REDUCE_TEAM_SIZE
3443            // The default team size is the total #threads in the machine
3444            // minus 1 thread for every core that has 3 or more threads.
3445            teamSize += (threadIdCt <= 2) ? (threadIdCt) : (threadIdCt - 1);
3446  #endif // KMP_MIC && REDUCE_TEAM_SIZE
3447  
3448            // Restart the thread counter, as we are on a new core.
3449            threadIdCt = 0;
3450  
3451            // Auto-assign the thread id field if it wasn't specified.
3452            if (threadInfo[i][threadIdIndex] == UINT_MAX) {
3453              threadInfo[i][threadIdIndex] = threadIdCt++;
3454            }
3455  
3456            // Apparently the thread id field was specified for some entries and
3457            // not others. Start the thread id counter off at the next higher
3458            // thread id.
3459            else if (threadIdCt <= threadInfo[i][threadIdIndex]) {
3460              threadIdCt = threadInfo[i][threadIdIndex] + 1;
3461            }
3462          }
3463          break;
3464        }
3465      }
3466      if (index < threadIdIndex) {
3467        // If thread ids were specified, it is an error if they are not unique.
3468      // Also, check that we haven't already restarted the loop (to be safe -
3469        // shouldn't need to).
3470        if ((threadInfo[i][threadIdIndex] != UINT_MAX) || assign_thread_ids) {
3471          __kmp_free(lastId);
3472          __kmp_free(totals);
3473          __kmp_free(maxCt);
3474          __kmp_free(counts);
3475          CLEANUP_THREAD_INFO;
3476          *msg_id = kmp_i18n_str_PhysicalIDsNotUnique;
3477          return false;
3478        }
3479  
3480        // If the thread ids were not specified and we see entries that
3481        // are duplicates, start the loop over and assign the thread ids manually.
3482        assign_thread_ids = true;
3483        goto restart_radix_check;
3484      }
3485    }
3486  
3487  #if KMP_MIC && REDUCE_TEAM_SIZE
3488    // The default team size is the total #threads in the machine
3489    // minus 1 thread for every core that has 3 or more threads.
3490    teamSize += (threadIdCt <= 2) ? (threadIdCt) : (threadIdCt - 1);
3491  #endif // KMP_MIC && REDUCE_TEAM_SIZE
3492  
3493    for (index = threadIdIndex; index <= maxIndex; index++) {
3494      if (counts[index] > maxCt[index]) {
3495        maxCt[index] = counts[index];
3496      }
3497    }
3498  
3499    __kmp_nThreadsPerCore = maxCt[threadIdIndex];
3500    nCoresPerPkg = maxCt[coreIdIndex];
3501    nPackages = totals[pkgIdIndex];
3502  
3503    // When affinity is off, this routine will still be called to set
3504    // __kmp_ncores, as well as __kmp_nThreadsPerCore, nCoresPerPkg, & nPackages.
3505    // Make sure all these vars are set correctly, and return now if affinity is
3506    // not enabled.
3507    __kmp_ncores = totals[coreIdIndex];
3508    if (!KMP_AFFINITY_CAPABLE()) {
3509      KMP_ASSERT(__kmp_affinity.type == affinity_none);
3510      return true;
3511    }
3512  
3513  #if KMP_MIC && REDUCE_TEAM_SIZE
3514    // Set the default team size.
3515    if ((__kmp_dflt_team_nth == 0) && (teamSize > 0)) {
3516      __kmp_dflt_team_nth = teamSize;
3517      KA_TRACE(20, ("__kmp_affinity_create_cpuinfo_map: setting "
3518                    "__kmp_dflt_team_nth = %d\n",
3519                    __kmp_dflt_team_nth));
3520    }
3521  #endif // KMP_MIC && REDUCE_TEAM_SIZE
3522  
3523    KMP_DEBUG_ASSERT(num_avail == (unsigned)__kmp_avail_proc);
3524  
3525    // Count the number of levels which have more nodes at that level than at the
3526    // parent's level (with there being an implicit root node of the top level).
3527    // This is equivalent to saying that there is at least one node at this level
3528    // which has a sibling. These levels are in the map, and the package level is
3529    // always in the map.
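  // e.g., with totals of 16 threads, 8 cores, and 2 packages (and no node
  // level), every level has a sibling somewhere, so all three levels are in
  // the map and depth becomes 3.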
3530    bool *inMap = (bool *)__kmp_allocate((maxIndex + 1) * sizeof(bool));
3531    for (index = threadIdIndex; index < maxIndex; index++) {
3532      KMP_ASSERT(totals[index] >= totals[index + 1]);
3533      inMap[index] = (totals[index] > totals[index + 1]);
3534    }
3535    inMap[maxIndex] = (totals[maxIndex] > 1);
3536    inMap[pkgIdIndex] = true;
3537    inMap[coreIdIndex] = true;
3538    inMap[threadIdIndex] = true;
3539  
3540    int depth = 0;
3541    int idx = 0;
3542    kmp_hw_t types[KMP_HW_LAST];
3543    int pkgLevel = -1;
3544    int coreLevel = -1;
3545    int threadLevel = -1;
3546    for (index = threadIdIndex; index <= maxIndex; index++) {
3547      if (inMap[index]) {
3548        depth++;
3549      }
3550    }
3551    if (inMap[pkgIdIndex]) {
3552      pkgLevel = idx;
3553      types[idx++] = KMP_HW_SOCKET;
3554    }
3555    if (inMap[coreIdIndex]) {
3556      coreLevel = idx;
3557      types[idx++] = KMP_HW_CORE;
3558    }
3559    if (inMap[threadIdIndex]) {
3560      threadLevel = idx;
3561      types[idx++] = KMP_HW_THREAD;
3562    }
3563    KMP_ASSERT(depth > 0);
3564  
3565    // Construct the data structure that is to be returned.
3566    __kmp_topology = kmp_topology_t::allocate(num_avail, depth, types);
3567  
3568    for (i = 0; i < num_avail; ++i) {
3569      unsigned os = threadInfo[i][osIdIndex];
3570      int src_index;
3571      kmp_hw_thread_t &hw_thread = __kmp_topology->at(i);
3572      hw_thread.clear();
3573      hw_thread.os_id = os;
3574  
3575      idx = 0;
3576      for (src_index = maxIndex; src_index >= threadIdIndex; src_index--) {
3577        if (!inMap[src_index]) {
3578          continue;
3579        }
3580        if (src_index == pkgIdIndex) {
3581          hw_thread.ids[pkgLevel] = threadInfo[i][src_index];
3582        } else if (src_index == coreIdIndex) {
3583          hw_thread.ids[coreLevel] = threadInfo[i][src_index];
3584        } else if (src_index == threadIdIndex) {
3585          hw_thread.ids[threadLevel] = threadInfo[i][src_index];
3586        }
3587      }
3588    }
3589  
3590    __kmp_free(inMap);
3591    __kmp_free(lastId);
3592    __kmp_free(totals);
3593    __kmp_free(maxCt);
3594    __kmp_free(counts);
3595    CLEANUP_THREAD_INFO;
3596    __kmp_topology->sort_ids();
3597    if (!__kmp_topology->check_ids()) {
3598      kmp_topology_t::deallocate(__kmp_topology);
3599      __kmp_topology = nullptr;
3600      *msg_id = kmp_i18n_str_PhysicalIDsNotUnique;
3601      return false;
3602    }
3603    return true;
3604  }
3605  
3606  // Create a table of affinity masks indexed by OS thread ID (affinity.os_id_masks).
3607  // This routine handles OR'ing together all the affinity masks of threads
3608  // that are sufficiently close, if granularity > fine.
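// e.g., with granularity=core on a machine that has 2 hardware threads per
// core, the two sibling OS procs of each core end up with identical masks
// containing both of their OS ids.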
3609  template <typename FindNextFunctionType>
3610  static void __kmp_create_os_id_masks(unsigned *numUnique,
3611                                       kmp_affinity_t &affinity,
3612                                       FindNextFunctionType find_next) {
3613    // First form a table of affinity masks in order of OS thread id.
3614    int maxOsId;
3615    int i;
3616    int numAddrs = __kmp_topology->get_num_hw_threads();
3617    int depth = __kmp_topology->get_depth();
3618    const char *env_var = __kmp_get_affinity_env_var(affinity);
3619    KMP_ASSERT(numAddrs);
3620    KMP_ASSERT(depth);
3621  
3622    i = find_next(-1);
3623    // If we could not find a HW thread location with the requested attributes,
3624    // return and fall back to an incrementing find_next that disregards core attributes.
3625    if (i >= numAddrs)
3626      return;
3627  
3628    maxOsId = 0;
3629    for (i = numAddrs - 1;; --i) {
3630      int osId = __kmp_topology->at(i).os_id;
3631      if (osId > maxOsId) {
3632        maxOsId = osId;
3633      }
3634      if (i == 0)
3635        break;
3636    }
3637    affinity.num_os_id_masks = maxOsId + 1;
3638    KMP_CPU_ALLOC_ARRAY(affinity.os_id_masks, affinity.num_os_id_masks);
3639    KMP_ASSERT(affinity.gran_levels >= 0);
3640    if (affinity.flags.verbose && (affinity.gran_levels > 0)) {
3641      KMP_INFORM(ThreadsMigrate, env_var, affinity.gran_levels);
3642    }
3643    if (affinity.gran_levels >= (int)depth) {
3644      KMP_AFF_WARNING(affinity, AffThreadsMayMigrate);
3645    }
3646  
3647    // Run through the table, forming the masks for all threads on each core.
3648    // Threads on the same core will have identical kmp_hw_thread_t objects, not
3649    // considering the last level, which must be the thread id. All threads on a
3650    // core will appear consecutively.
3651    int unique = 0;
3652    int j = 0; // index of 1st thread on core
3653    int leader = 0;
3654    kmp_affin_mask_t *sum;
3655    KMP_CPU_ALLOC_ON_STACK(sum);
3656    KMP_CPU_ZERO(sum);
3657  
3658    i = j = leader = find_next(-1);
3659    KMP_CPU_SET(__kmp_topology->at(i).os_id, sum);
3660    kmp_full_mask_modifier_t full_mask;
3661    for (i = find_next(i); i < numAddrs; i = find_next(i)) {
3662      // If this thread is sufficiently close to the leader (within the
3663      // granularity setting), then set the bit for this os thread in the
3664      // affinity mask for this group, and go on to the next thread.
3665      if (__kmp_topology->is_close(leader, i, affinity)) {
3666        KMP_CPU_SET(__kmp_topology->at(i).os_id, sum);
3667        continue;
3668      }
3669  
3670      // For every thread in this group, copy the mask to the thread's entry in
3671      // the OS Id mask table. Mark the first address as a leader.
3672      for (; j < i; j = find_next(j)) {
3673        int osId = __kmp_topology->at(j).os_id;
3674        KMP_DEBUG_ASSERT(osId <= maxOsId);
3675        kmp_affin_mask_t *mask = KMP_CPU_INDEX(affinity.os_id_masks, osId);
3676        KMP_CPU_COPY(mask, sum);
3677        __kmp_topology->at(j).leader = (j == leader);
3678      }
3679      unique++;
3680  
3681      // Start a new mask.
3682      leader = i;
3683      full_mask.include(sum);
3684      KMP_CPU_ZERO(sum);
3685      KMP_CPU_SET(__kmp_topology->at(i).os_id, sum);
3686    }
3687  
3688    // For every thread in last group, copy the mask to the thread's
3689    // entry in the OS Id mask table.
3690    for (; j < i; j = find_next(j)) {
3691      int osId = __kmp_topology->at(j).os_id;
3692      KMP_DEBUG_ASSERT(osId <= maxOsId);
3693      kmp_affin_mask_t *mask = KMP_CPU_INDEX(affinity.os_id_masks, osId);
3694      KMP_CPU_COPY(mask, sum);
3695      __kmp_topology->at(j).leader = (j == leader);
3696    }
3697    full_mask.include(sum);
3698    unique++;
3699    KMP_CPU_FREE_FROM_STACK(sum);
3700  
3701    // See if the OS Id mask table further restricts or changes the full mask
3702    if (full_mask.restrict_to_mask() && affinity.flags.verbose) {
3703      __kmp_topology->print(env_var);
3704    }
3705  
3706    *numUnique = unique;
3707  }
3708  
3709  // Stuff for the affinity proclist parsers.  It's easier to declare these vars
3710  // as file-static than to try and pass them through the calling sequence of
3711  // the recursive-descent OMP_PLACES parser.
3712  static kmp_affin_mask_t *newMasks;
3713  static int numNewMasks;
3714  static int nextNewMask;
3715  
3716  #define ADD_MASK(_mask)                                                        \
3717    {                                                                            \
3718      if (nextNewMask >= numNewMasks) {                                          \
3719        int i;                                                                   \
3720        numNewMasks *= 2;                                                        \
3721        kmp_affin_mask_t *temp;                                                  \
3722        KMP_CPU_INTERNAL_ALLOC_ARRAY(temp, numNewMasks);                         \
3723        for (i = 0; i < numNewMasks / 2; i++) {                                  \
3724          kmp_affin_mask_t *src = KMP_CPU_INDEX(newMasks, i);                    \
3725          kmp_affin_mask_t *dest = KMP_CPU_INDEX(temp, i);                       \
3726          KMP_CPU_COPY(dest, src);                                               \
3727        }                                                                        \
3728        KMP_CPU_INTERNAL_FREE_ARRAY(newMasks, numNewMasks / 2);                  \
3729        newMasks = temp;                                                         \
3730      }                                                                          \
3731      KMP_CPU_COPY(KMP_CPU_INDEX(newMasks, nextNewMask), (_mask));               \
3732      nextNewMask++;                                                             \
3733    }
3734  
3735  #define ADD_MASK_OSID(_osId, _osId2Mask, _maxOsId)                             \
3736    {                                                                            \
3737      if (((_osId) > _maxOsId) ||                                                \
3738          (!KMP_CPU_ISSET((_osId), KMP_CPU_INDEX((_osId2Mask), (_osId))))) {     \
3739        KMP_AFF_WARNING(affinity, AffIgnoreInvalidProcID, _osId);                \
3740      } else {                                                                   \
3741        ADD_MASK(KMP_CPU_INDEX(_osId2Mask, (_osId)));                            \
3742      }                                                                          \
3743    }
3744  
3745  // Re-parse the proclist (for the explicit affinity type), and form the list
3746  // of affinity newMasks indexed by gtid.
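// e.g., KMP_AFFINITY="explicit,proclist=[0,2,{4,5},8-11:2]" produces
// single-proc masks for 0 and 2, one mask containing procs 4 and 5, and
// masks for 8 and 10 from the strided range.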
3747  static void __kmp_affinity_process_proclist(kmp_affinity_t &affinity) {
3748    int i;
3749    kmp_affin_mask_t **out_masks = &affinity.masks;
3750    unsigned *out_numMasks = &affinity.num_masks;
3751    const char *proclist = affinity.proclist;
3752    kmp_affin_mask_t *osId2Mask = affinity.os_id_masks;
3753    int maxOsId = affinity.num_os_id_masks - 1;
3754    const char *scan = proclist;
3755    const char *next = proclist;
3756  
3757    // The temporary mask vector starts small; the ADD_MASK macro doubles its
3758    // size and copies the existing masks whenever it fills up.
3759    numNewMasks = 2;
3760    KMP_CPU_INTERNAL_ALLOC_ARRAY(newMasks, numNewMasks);
3761    nextNewMask = 0;
3762    kmp_affin_mask_t *sumMask;
3763    KMP_CPU_ALLOC(sumMask);
3764    int setSize = 0;
3765  
3766    for (;;) {
3767      int start, end, stride;
3768  
3769      SKIP_WS(scan);
3770      next = scan;
3771      if (*next == '\0') {
3772        break;
3773      }
3774  
3775      if (*next == '{') {
3776        int num;
3777        setSize = 0;
3778        next++; // skip '{'
3779        SKIP_WS(next);
3780        scan = next;
3781  
3782        // Read the first integer in the set.
3783        KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad proclist");
3784        SKIP_DIGITS(next);
3785        num = __kmp_str_to_int(scan, *next);
3786        KMP_ASSERT2(num >= 0, "bad explicit proc list");
3787  
3788        // Copy the mask for that osId to the sum (union) mask.
3789        if ((num > maxOsId) ||
3790            (!KMP_CPU_ISSET(num, KMP_CPU_INDEX(osId2Mask, num)))) {
3791          KMP_AFF_WARNING(affinity, AffIgnoreInvalidProcID, num);
3792          KMP_CPU_ZERO(sumMask);
3793        } else {
3794          KMP_CPU_COPY(sumMask, KMP_CPU_INDEX(osId2Mask, num));
3795          setSize = 1;
3796        }
3797  
3798        for (;;) {
3799          // Check for end of set.
3800          SKIP_WS(next);
3801          if (*next == '}') {
3802            next++; // skip '}'
3803            break;
3804          }
3805  
3806          // Skip optional comma.
3807          if (*next == ',') {
3808            next++;
3809          }
3810          SKIP_WS(next);
3811  
3812          // Read the next integer in the set.
3813          scan = next;
3814          KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad explicit proc list");
3815  
3816          SKIP_DIGITS(next);
3817          num = __kmp_str_to_int(scan, *next);
3818          KMP_ASSERT2(num >= 0, "bad explicit proc list");
3819  
3820          // Add the mask for that osId to the sum mask.
3821          if ((num > maxOsId) ||
3822              (!KMP_CPU_ISSET(num, KMP_CPU_INDEX(osId2Mask, num)))) {
3823            KMP_AFF_WARNING(affinity, AffIgnoreInvalidProcID, num);
3824          } else {
3825            KMP_CPU_UNION(sumMask, KMP_CPU_INDEX(osId2Mask, num));
3826            setSize++;
3827          }
3828        }
3829        if (setSize > 0) {
3830          ADD_MASK(sumMask);
3831        }
3832  
3833        SKIP_WS(next);
3834        if (*next == ',') {
3835          next++;
3836        }
3837        scan = next;
3838        continue;
3839      }
3840  
3841      // Read the first integer.
3842      KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad explicit proc list");
3843      SKIP_DIGITS(next);
3844      start = __kmp_str_to_int(scan, *next);
3845      KMP_ASSERT2(start >= 0, "bad explicit proc list");
3846      SKIP_WS(next);
3847  
3848      // If this isn't a range, then add a mask to the list and go on.
3849      if (*next != '-') {
3850        ADD_MASK_OSID(start, osId2Mask, maxOsId);
3851  
3852        // Skip optional comma.
3853        if (*next == ',') {
3854          next++;
3855        }
3856        scan = next;
3857        continue;
3858      }
3859  
3860      // This is a range.  Skip over the '-' and read in the 2nd int.
3861      next++; // skip '-'
3862      SKIP_WS(next);
3863      scan = next;
3864      KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad explicit proc list");
3865      SKIP_DIGITS(next);
3866      end = __kmp_str_to_int(scan, *next);
3867      KMP_ASSERT2(end >= 0, "bad explicit proc list");
3868  
3869      // Check for a stride parameter
3870      stride = 1;
3871      SKIP_WS(next);
3872      if (*next == ':') {
3873      // A stride is specified.  Skip over the ':' and read the 3rd int.
3874        int sign = +1;
3875        next++; // skip ':'
3876        SKIP_WS(next);
3877        scan = next;
3878        if (*next == '-') {
3879          sign = -1;
3880          next++;
3881          SKIP_WS(next);
3882          scan = next;
3883        }
3884        KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad explicit proc list");
3885        SKIP_DIGITS(next);
3886        stride = __kmp_str_to_int(scan, *next);
3887        KMP_ASSERT2(stride >= 0, "bad explicit proc list");
3888        stride *= sign;
3889      }
3890  
3891      // Do some range checks.
3892      KMP_ASSERT2(stride != 0, "bad explicit proc list");
3893      if (stride > 0) {
3894        KMP_ASSERT2(start <= end, "bad explicit proc list");
3895      } else {
3896        KMP_ASSERT2(start >= end, "bad explicit proc list");
3897      }
3898      KMP_ASSERT2((end - start) / stride <= 65536, "bad explicit proc list");
3899  
3900      // Add the mask for each OS proc # to the list.
3901      if (stride > 0) {
3902        do {
3903          ADD_MASK_OSID(start, osId2Mask, maxOsId);
3904          start += stride;
3905        } while (start <= end);
3906      } else {
3907        do {
3908          ADD_MASK_OSID(start, osId2Mask, maxOsId);
3909          start += stride;
3910        } while (start >= end);
3911      }
3912  
3913      // Skip optional comma.
3914      SKIP_WS(next);
3915      if (*next == ',') {
3916        next++;
3917      }
3918      scan = next;
3919    }
3920  
3921    *out_numMasks = nextNewMask;
3922    if (nextNewMask == 0) {
3923      *out_masks = NULL;
3924      KMP_CPU_INTERNAL_FREE_ARRAY(newMasks, numNewMasks);
3925      return;
3926    }
3927    KMP_CPU_ALLOC_ARRAY((*out_masks), nextNewMask);
3928    for (i = 0; i < nextNewMask; i++) {
3929      kmp_affin_mask_t *src = KMP_CPU_INDEX(newMasks, i);
3930      kmp_affin_mask_t *dest = KMP_CPU_INDEX((*out_masks), i);
3931      KMP_CPU_COPY(dest, src);
3932    }
3933    KMP_CPU_INTERNAL_FREE_ARRAY(newMasks, numNewMasks);
3934    KMP_CPU_FREE(sumMask);
3935  }
3936  
3937  /*-----------------------------------------------------------------------------
3938  Re-parse the OMP_PLACES proc id list, forming the newMasks for the different
3939  places.  Again, here is the grammar:
3940  
3941  place_list := place
3942  place_list := place , place_list
3943  place := num
3944  place := place : num
3945  place := place : num : signed
3946  place := { subplace_list }
3947  place := ! place                  // (lowest priority)
3948  subplace_list := subplace
3949  subplace_list := subplace , subplace_list
3950  subplace := num
3951  subplace := num : num
3952  subplace := num : num : signed
3953  signed := num
3954  signed := + signed
3955  signed := - signed
3956  -----------------------------------------------------------------------------*/
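// For example, OMP_PLACES="{0,1},{2,3},{4,5}" names three explicit places,
// and OMP_PLACES="{0:2}:4:2" expands the place {0,1} four times with a
// stride of 2, giving {0,1},{2,3},{4,5},{6,7}.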
3957  static void __kmp_process_subplace_list(const char **scan,
3958                                          kmp_affinity_t &affinity, int maxOsId,
3959                                          kmp_affin_mask_t *tempMask,
3960                                          int *setSize) {
3961    const char *next;
3962    kmp_affin_mask_t *osId2Mask = affinity.os_id_masks;
3963  
3964    for (;;) {
3965      int start, count, stride, i;
3966  
3967      // Read in the starting proc id
3968      SKIP_WS(*scan);
3969      KMP_ASSERT2((**scan >= '0') && (**scan <= '9'), "bad explicit places list");
3970      next = *scan;
3971      SKIP_DIGITS(next);
3972      start = __kmp_str_to_int(*scan, *next);
3973      KMP_ASSERT(start >= 0);
3974      *scan = next;
3975  
3976      // valid follow sets are ',' ':' and '}'
3977      SKIP_WS(*scan);
3978      if (**scan == '}' || **scan == ',') {
3979        if ((start > maxOsId) ||
3980            (!KMP_CPU_ISSET(start, KMP_CPU_INDEX(osId2Mask, start)))) {
3981          KMP_AFF_WARNING(affinity, AffIgnoreInvalidProcID, start);
3982        } else {
3983          KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, start));
3984          (*setSize)++;
3985        }
3986        if (**scan == '}') {
3987          break;
3988        }
3989        (*scan)++; // skip ','
3990        continue;
3991      }
3992      KMP_ASSERT2(**scan == ':', "bad explicit places list");
3993      (*scan)++; // skip ':'
3994  
3995      // Read count parameter
3996      SKIP_WS(*scan);
3997      KMP_ASSERT2((**scan >= '0') && (**scan <= '9'), "bad explicit places list");
3998      next = *scan;
3999      SKIP_DIGITS(next);
4000      count = __kmp_str_to_int(*scan, *next);
4001      KMP_ASSERT(count >= 0);
4002      *scan = next;
4003  
4004      // valid follow sets are ',' ':' and '}'
4005      SKIP_WS(*scan);
4006      if (**scan == '}' || **scan == ',') {
4007        for (i = 0; i < count; i++) {
4008          if ((start > maxOsId) ||
4009              (!KMP_CPU_ISSET(start, KMP_CPU_INDEX(osId2Mask, start)))) {
4010            KMP_AFF_WARNING(affinity, AffIgnoreInvalidProcID, start);
4011            break; // don't proliferate warnings for large count
4012          } else {
4013            KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, start));
4014            start++;
4015            (*setSize)++;
4016          }
4017        }
4018        if (**scan == '}') {
4019          break;
4020        }
4021        (*scan)++; // skip ','
4022        continue;
4023      }
4024      KMP_ASSERT2(**scan == ':', "bad explicit places list");
4025      (*scan)++; // skip ':'
4026  
4027      // Read stride parameter
4028      int sign = +1;
4029      for (;;) {
4030        SKIP_WS(*scan);
4031        if (**scan == '+') {
4032          (*scan)++; // skip '+'
4033          continue;
4034        }
4035        if (**scan == '-') {
4036          sign *= -1;
4037          (*scan)++; // skip '-'
4038          continue;
4039        }
4040        break;
4041      }
4042      SKIP_WS(*scan);
4043      KMP_ASSERT2((**scan >= '0') && (**scan <= '9'), "bad explicit places list");
4044      next = *scan;
4045      SKIP_DIGITS(next);
4046      stride = __kmp_str_to_int(*scan, *next);
4047      KMP_ASSERT(stride >= 0);
4048      *scan = next;
4049      stride *= sign;
4050  
4051      // valid follow sets are ',' and '}'
4052      SKIP_WS(*scan);
4053      if (**scan == '}' || **scan == ',') {
4054        for (i = 0; i < count; i++) {
4055          if ((start > maxOsId) ||
4056              (!KMP_CPU_ISSET(start, KMP_CPU_INDEX(osId2Mask, start)))) {
4057            KMP_AFF_WARNING(affinity, AffIgnoreInvalidProcID, start);
4058            break; // don't proliferate warnings for large count
4059          } else {
4060            KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, start));
4061            start += stride;
4062            (*setSize)++;
4063          }
4064        }
4065        if (**scan == '}') {
4066          break;
4067        }
4068        (*scan)++; // skip ','
4069        continue;
4070      }
4071  
4072      KMP_ASSERT2(0, "bad explicit places list");
4073    }
4074  }
4075  
4076  static void __kmp_process_place(const char **scan, kmp_affinity_t &affinity,
4077                                  int maxOsId, kmp_affin_mask_t *tempMask,
4078                                  int *setSize) {
4079    const char *next;
4080    kmp_affin_mask_t *osId2Mask = affinity.os_id_masks;
4081  
4082    // valid follow sets are '{' '!' and num
4083    SKIP_WS(*scan);
4084    if (**scan == '{') {
4085      (*scan)++; // skip '{'
4086      __kmp_process_subplace_list(scan, affinity, maxOsId, tempMask, setSize);
4087      KMP_ASSERT2(**scan == '}', "bad explicit places list");
4088      (*scan)++; // skip '}'
4089    } else if (**scan == '!') {
4090      (*scan)++; // skip '!'
4091      __kmp_process_place(scan, affinity, maxOsId, tempMask, setSize);
4092      KMP_CPU_COMPLEMENT(maxOsId, tempMask);
4093    } else if ((**scan >= '0') && (**scan <= '9')) {
4094      next = *scan;
4095      SKIP_DIGITS(next);
4096      int num = __kmp_str_to_int(*scan, *next);
4097      KMP_ASSERT(num >= 0);
4098      if ((num > maxOsId) ||
4099          (!KMP_CPU_ISSET(num, KMP_CPU_INDEX(osId2Mask, num)))) {
4100        KMP_AFF_WARNING(affinity, AffIgnoreInvalidProcID, num);
4101      } else {
4102        KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, num));
4103        (*setSize)++;
4104      }
4105      *scan = next; // skip num
4106    } else {
4107      KMP_ASSERT2(0, "bad explicit places list");
4108    }
4109  }
4110  
4111  // static void
4112  void __kmp_affinity_process_placelist(kmp_affinity_t &affinity) {
4113    int i, j, count, stride, sign;
4114    kmp_affin_mask_t **out_masks = &affinity.masks;
4115    unsigned *out_numMasks = &affinity.num_masks;
4116    const char *placelist = affinity.proclist;
4117    kmp_affin_mask_t *osId2Mask = affinity.os_id_masks;
4118    int maxOsId = affinity.num_os_id_masks - 1;
4119    const char *scan = placelist;
4120    const char *next = placelist;
4121  
4122    numNewMasks = 2;
4123    KMP_CPU_INTERNAL_ALLOC_ARRAY(newMasks, numNewMasks);
4124    nextNewMask = 0;
4125  
4126    // tempMask is modified based on the previous or initial
4127    //   place to form the current place
4128    // previousMask contains the previous place
4129    kmp_affin_mask_t *tempMask;
4130    kmp_affin_mask_t *previousMask;
4131    KMP_CPU_ALLOC(tempMask);
4132    KMP_CPU_ZERO(tempMask);
4133    KMP_CPU_ALLOC(previousMask);
4134    KMP_CPU_ZERO(previousMask);
4135    int setSize = 0;
4136  
4137    for (;;) {
4138      __kmp_process_place(&scan, affinity, maxOsId, tempMask, &setSize);
4139  
4140      // valid follow sets are ',' ':' and EOL
4141      SKIP_WS(scan);
4142      if (*scan == '\0' || *scan == ',') {
4143        if (setSize > 0) {
4144          ADD_MASK(tempMask);
4145        }
4146        KMP_CPU_ZERO(tempMask);
4147        setSize = 0;
4148        if (*scan == '\0') {
4149          break;
4150        }
4151        scan++; // skip ','
4152        continue;
4153      }
4154  
4155      KMP_ASSERT2(*scan == ':', "bad explicit places list");
4156      scan++; // skip ':'
4157  
4158      // Read count parameter
4159      SKIP_WS(scan);
4160      KMP_ASSERT2((*scan >= '0') && (*scan <= '9'), "bad explicit places list");
4161      next = scan;
4162      SKIP_DIGITS(next);
4163      count = __kmp_str_to_int(scan, *next);
4164      KMP_ASSERT(count >= 0);
4165      scan = next;
4166  
4167      // valid follow sets are ',' ':' and EOL
4168      SKIP_WS(scan);
4169      if (*scan == '\0' || *scan == ',') {
4170        stride = +1;
4171      } else {
4172        KMP_ASSERT2(*scan == ':', "bad explicit places list");
4173        scan++; // skip ':'
4174  
4175        // Read stride parameter
4176        sign = +1;
4177        for (;;) {
4178          SKIP_WS(scan);
4179          if (*scan == '+') {
4180            scan++; // skip '+'
4181            continue;
4182          }
4183          if (*scan == '-') {
4184            sign *= -1;
4185            scan++; // skip '-'
4186            continue;
4187          }
4188          break;
4189        }
4190        SKIP_WS(scan);
4191        KMP_ASSERT2((*scan >= '0') && (*scan <= '9'), "bad explicit places list");
4192        next = scan;
4193        SKIP_DIGITS(next);
4194        stride = __kmp_str_to_int(scan, *next);
4195        KMP_DEBUG_ASSERT(stride >= 0);
4196        scan = next;
4197        stride *= sign;
4198      }
4199  
4200      // Add places determined by initial_place : count : stride
4201      for (i = 0; i < count; i++) {
4202        if (setSize == 0) {
4203          break;
4204        }
4205        // Add the current place, then build the next place (tempMask) from that
4206        KMP_CPU_COPY(previousMask, tempMask);
4207        ADD_MASK(previousMask);
4208        KMP_CPU_ZERO(tempMask);
4209        setSize = 0;
4210        KMP_CPU_SET_ITERATE(j, previousMask) {
4211          if (!KMP_CPU_ISSET(j, previousMask)) {
4212            continue;
4213          }
4214          if ((j + stride > maxOsId) || (j + stride < 0) ||
4215              (!KMP_CPU_ISSET(j, __kmp_affin_fullMask)) ||
4216              (!KMP_CPU_ISSET(j + stride,
4217                              KMP_CPU_INDEX(osId2Mask, j + stride)))) {
4218            if (i < count - 1) {
4219              KMP_AFF_WARNING(affinity, AffIgnoreInvalidProcID, j + stride);
4220            }
4221            continue;
4222          }
4223          KMP_CPU_SET(j + stride, tempMask);
4224          setSize++;
4225        }
4226      }
4227      KMP_CPU_ZERO(tempMask);
4228      setSize = 0;
4229  
4230      // valid follow sets are ',' and EOL
4231      SKIP_WS(scan);
4232      if (*scan == '\0') {
4233        break;
4234      }
4235      if (*scan == ',') {
4236        scan++; // skip ','
4237        continue;
4238      }
4239  
4240      KMP_ASSERT2(0, "bad explicit places list");
4241    }
4242  
4243    *out_numMasks = nextNewMask;
4244    if (nextNewMask == 0) {
4245      *out_masks = NULL;
4246      KMP_CPU_INTERNAL_FREE_ARRAY(newMasks, numNewMasks);
4247      return;
4248    }
4249    KMP_CPU_ALLOC_ARRAY((*out_masks), nextNewMask);
4250    KMP_CPU_FREE(tempMask);
4251    KMP_CPU_FREE(previousMask);
4252    for (i = 0; i < nextNewMask; i++) {
4253      kmp_affin_mask_t *src = KMP_CPU_INDEX(newMasks, i);
4254      kmp_affin_mask_t *dest = KMP_CPU_INDEX((*out_masks), i);
4255      KMP_CPU_COPY(dest, src);
4256    }
4257    KMP_CPU_INTERNAL_FREE_ARRAY(newMasks, numNewMasks);
4258  }
4259  
4260  #undef ADD_MASK
4261  #undef ADD_MASK_OSID
4262  
4263  // This function figures out the deepest level at which there is at least one
4264  // cluster/core with more than one processing unit bound to it.
4265  static int __kmp_affinity_find_core_level(int nprocs, int bottom_level) {
4266    int core_level = 0;
4267  
4268    for (int i = 0; i < nprocs; i++) {
4269      const kmp_hw_thread_t &hw_thread = __kmp_topology->at(i);
4270      for (int j = bottom_level; j > 0; j--) {
4271        if (hw_thread.ids[j] > 0) {
4272          if (core_level < (j - 1)) {
4273            core_level = j - 1;
4274          }
4275        }
4276      }
4277    }
4278    return core_level;
4279  }
4280  
4281  // This function counts the number of clusters/cores at a given level.
4282  static int __kmp_affinity_compute_ncores(int nprocs, int bottom_level,
4283                                           int core_level) {
4284    return __kmp_topology->get_count(core_level);
4285  }
4286  // This function finds the cluster/core to which a given processing unit is bound.
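// It does so by walking consecutive hardware threads and counting how many
// times any sub id at levels 0..core_level changes between adjacent entries.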
4287  static int __kmp_affinity_find_core(int proc, int bottom_level,
4288                                      int core_level) {
4289    int core = 0;
4290    KMP_DEBUG_ASSERT(proc >= 0 && proc < __kmp_topology->get_num_hw_threads());
4291    for (int i = 0; i <= proc; ++i) {
4292      if (i + 1 <= proc) {
4293        for (int j = 0; j <= core_level; ++j) {
4294          if (__kmp_topology->at(i + 1).sub_ids[j] !=
4295              __kmp_topology->at(i).sub_ids[j]) {
4296            core++;
4297            break;
4298          }
4299        }
4300      }
4301    }
4302    return core;
4303  }
4304  
4305  // This function finds the maximal number of processing units bound to a
4306  // cluster/core at a given level.
4307  static int __kmp_affinity_max_proc_per_core(int nprocs, int bottom_level,
4308                                              int core_level) {
4309    if (core_level >= bottom_level)
4310      return 1;
4311    int thread_level = __kmp_topology->get_level(KMP_HW_THREAD);
4312    return __kmp_topology->calculate_ratio(thread_level, core_level);
4313  }
4314  
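// procarr: for balanced affinity on non-uniform topologies, maps
//   (core * maxprocpercore + slot) to an OS proc id, or -1 for unused slots.
// __kmp_aff_depth: cached topology depth used by the balanced affinity code.
// __kmp_osid_to_hwthread_map: maps an OS proc id to its index in __kmp_topology.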
4315  static int *procarr = NULL;
4316  static int __kmp_aff_depth = 0;
4317  static int *__kmp_osid_to_hwthread_map = NULL;
4318  
4319  static void __kmp_affinity_get_mask_topology_info(const kmp_affin_mask_t *mask,
4320                                                    kmp_affinity_ids_t &ids,
4321                                                    kmp_affinity_attrs_t &attrs) {
4322    if (!KMP_AFFINITY_CAPABLE())
4323      return;
4324  
4325    // Initialize ids and attrs thread data
4326    for (int i = 0; i < KMP_HW_LAST; ++i)
4327      ids.ids[i] = kmp_hw_thread_t::UNKNOWN_ID;
4328    attrs = KMP_AFFINITY_ATTRS_UNKNOWN;
4329  
4330    // Iterate through each os id within the mask and determine
4331    // the topology id and attribute information
4332    int cpu;
4333    int depth = __kmp_topology->get_depth();
4334    KMP_CPU_SET_ITERATE(cpu, mask) {
4335      int osid_idx = __kmp_osid_to_hwthread_map[cpu];
4336      ids.os_id = cpu;
4337      const kmp_hw_thread_t &hw_thread = __kmp_topology->at(osid_idx);
4338      for (int level = 0; level < depth; ++level) {
4339        kmp_hw_t type = __kmp_topology->get_type(level);
4340        int id = hw_thread.sub_ids[level];
4341        if (ids.ids[type] == kmp_hw_thread_t::UNKNOWN_ID || ids.ids[type] == id) {
4342          ids.ids[type] = id;
4343        } else {
4344          // This mask spans across multiple topology units, set it as such
4345          // and mark every level below as such as well.
4346          ids.ids[type] = kmp_hw_thread_t::MULTIPLE_ID;
4347          for (; level < depth; ++level) {
4348            kmp_hw_t type = __kmp_topology->get_type(level);
4349            ids.ids[type] = kmp_hw_thread_t::MULTIPLE_ID;
4350          }
4351        }
4352      }
4353      if (!attrs.valid) {
4354        attrs.core_type = hw_thread.attrs.get_core_type();
4355        attrs.core_eff = hw_thread.attrs.get_core_eff();
4356        attrs.valid = 1;
4357      } else {
4358        // This mask spans across multiple attributes, set it as such
4359        if (attrs.core_type != hw_thread.attrs.get_core_type())
4360          attrs.core_type = KMP_HW_CORE_TYPE_UNKNOWN;
4361        if (attrs.core_eff != hw_thread.attrs.get_core_eff())
4362          attrs.core_eff = kmp_hw_attr_t::UNKNOWN_CORE_EFF;
4363      }
4364    }
4365  }
4366  
4367  static void __kmp_affinity_get_thread_topology_info(kmp_info_t *th) {
4368    if (!KMP_AFFINITY_CAPABLE())
4369      return;
4370    const kmp_affin_mask_t *mask = th->th.th_affin_mask;
4371    kmp_affinity_ids_t &ids = th->th.th_topology_ids;
4372    kmp_affinity_attrs_t &attrs = th->th.th_topology_attrs;
4373    __kmp_affinity_get_mask_topology_info(mask, ids, attrs);
4374  }
4375  
4376  // Assign the topology information to each place in the place list
4377  // A thread can then grab not only its affinity mask, but the topology
4378  // information associated with that mask, e.g., which socket a thread is on.
4379  static void __kmp_affinity_get_topology_info(kmp_affinity_t &affinity) {
4380    if (!KMP_AFFINITY_CAPABLE())
4381      return;
4382    if (affinity.type != affinity_none) {
4383      KMP_ASSERT(affinity.num_os_id_masks);
4384      KMP_ASSERT(affinity.os_id_masks);
4385    }
4386    KMP_ASSERT(affinity.num_masks);
4387    KMP_ASSERT(affinity.masks);
4388    KMP_ASSERT(__kmp_affin_fullMask);
4389  
4390    int max_cpu = __kmp_affin_fullMask->get_max_cpu();
4391    int num_hw_threads = __kmp_topology->get_num_hw_threads();
4392  
4393    // Allocate thread topology information
4394    if (!affinity.ids) {
4395      affinity.ids = (kmp_affinity_ids_t *)__kmp_allocate(
4396          sizeof(kmp_affinity_ids_t) * affinity.num_masks);
4397    }
4398    if (!affinity.attrs) {
4399      affinity.attrs = (kmp_affinity_attrs_t *)__kmp_allocate(
4400          sizeof(kmp_affinity_attrs_t) * affinity.num_masks);
4401    }
4402    if (!__kmp_osid_to_hwthread_map) {
4403      // Want the +1 because max_cpu should be a valid index into the map
4404      __kmp_osid_to_hwthread_map =
4405          (int *)__kmp_allocate(sizeof(int) * (max_cpu + 1));
4406    }
4407  
4408    // Create the OS proc to hardware thread map
4409    for (int hw_thread = 0; hw_thread < num_hw_threads; ++hw_thread) {
4410      int os_id = __kmp_topology->at(hw_thread).os_id;
4411      if (KMP_CPU_ISSET(os_id, __kmp_affin_fullMask))
4412        __kmp_osid_to_hwthread_map[os_id] = hw_thread;
4413    }
4414  
4415    for (unsigned i = 0; i < affinity.num_masks; ++i) {
4416      kmp_affinity_ids_t &ids = affinity.ids[i];
4417      kmp_affinity_attrs_t &attrs = affinity.attrs[i];
4418      kmp_affin_mask_t *mask = KMP_CPU_INDEX(affinity.masks, i);
4419      __kmp_affinity_get_mask_topology_info(mask, ids, attrs);
4420    }
4421  }
4422  
4423  // Called when __kmp_topology is ready
4424  static void __kmp_aux_affinity_initialize_other_data(kmp_affinity_t &affinity) {
4425    // Initialize other data structures which depend on the topology
4426    if (__kmp_topology && __kmp_topology->get_num_hw_threads()) {
4427      machine_hierarchy.init(__kmp_topology->get_num_hw_threads());
4428      __kmp_affinity_get_topology_info(affinity);
4429  #if KMP_WEIGHTED_ITERATIONS_SUPPORTED
4430      __kmp_first_osid_with_ecore = __kmp_get_first_osid_with_ecore();
4431  #endif
4432    }
4433  }
4434  
4435  // Create a one-element mask array (set of places) which only contains the
4436  // initial process's affinity mask
4437  static void __kmp_create_affinity_none_places(kmp_affinity_t &affinity) {
4438    KMP_ASSERT(__kmp_affin_fullMask != NULL);
4439    KMP_ASSERT(affinity.type == affinity_none);
4440    KMP_ASSERT(__kmp_avail_proc == __kmp_topology->get_num_hw_threads());
4441    affinity.num_masks = 1;
4442    KMP_CPU_ALLOC_ARRAY(affinity.masks, affinity.num_masks);
4443    kmp_affin_mask_t *dest = KMP_CPU_INDEX(affinity.masks, 0);
4444    KMP_CPU_COPY(dest, __kmp_affin_fullMask);
4445    __kmp_aux_affinity_initialize_other_data(affinity);
4446  }
4447  
4448  static void __kmp_aux_affinity_initialize_masks(kmp_affinity_t &affinity) {
4449    // Create the "full" mask - this defines all of the processors that we
4450    // consider to be in the machine model. If respect is set, then it is the
4451    // initialization thread's affinity mask. Otherwise, it is all processors that
4452    // we know about on the machine.
4453    int verbose = affinity.flags.verbose;
4454    const char *env_var = affinity.env_var;
4455  
4456    // Already initialized
4457    if (__kmp_affin_fullMask && __kmp_affin_origMask)
4458      return;
4459  
4460    if (__kmp_affin_fullMask == NULL) {
4461      KMP_CPU_ALLOC(__kmp_affin_fullMask);
4462    }
4463    if (__kmp_affin_origMask == NULL) {
4464      KMP_CPU_ALLOC(__kmp_affin_origMask);
4465    }
4466    if (KMP_AFFINITY_CAPABLE()) {
4467      __kmp_get_system_affinity(__kmp_affin_fullMask, TRUE);
4468      // Make a copy before possible expanding to the entire machine mask
4469      __kmp_affin_origMask->copy(__kmp_affin_fullMask);
4470      if (affinity.flags.respect) {
4471        // Count the number of available processors.
4472        unsigned i;
4473        __kmp_avail_proc = 0;
4474        KMP_CPU_SET_ITERATE(i, __kmp_affin_fullMask) {
4475          if (!KMP_CPU_ISSET(i, __kmp_affin_fullMask)) {
4476            continue;
4477          }
4478          __kmp_avail_proc++;
4479        }
4480        if (__kmp_avail_proc > __kmp_xproc) {
4481          KMP_AFF_WARNING(affinity, ErrorInitializeAffinity);
4482          affinity.type = affinity_none;
4483          KMP_AFFINITY_DISABLE();
4484          return;
4485        }
4486  
4487        if (verbose) {
4488          char buf[KMP_AFFIN_MASK_PRINT_LEN];
4489          __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
4490                                    __kmp_affin_fullMask);
4491          KMP_INFORM(InitOSProcSetRespect, env_var, buf);
4492        }
4493      } else {
4494        if (verbose) {
4495          char buf[KMP_AFFIN_MASK_PRINT_LEN];
4496          __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
4497                                    __kmp_affin_fullMask);
4498          KMP_INFORM(InitOSProcSetNotRespect, env_var, buf);
4499        }
4500        __kmp_avail_proc =
4501            __kmp_affinity_entire_machine_mask(__kmp_affin_fullMask);
4502  #if KMP_OS_WINDOWS
4503        if (__kmp_num_proc_groups <= 1) {
4504          // Copy expanded full mask if topology has single processor group
4505          __kmp_affin_origMask->copy(__kmp_affin_fullMask);
4506        }
4507        // Set the process affinity mask since threads' affinity
4508        // masks must be subset of process mask in Windows* OS
4509        __kmp_affin_fullMask->set_process_affinity(true);
4510  #endif
4511      }
4512    }
4513  }
4514  
4515  static bool __kmp_aux_affinity_initialize_topology(kmp_affinity_t &affinity) {
4516    bool success = false;
4517    const char *env_var = affinity.env_var;
4518    kmp_i18n_id_t msg_id = kmp_i18n_null;
4519    int verbose = affinity.flags.verbose;
4520  
4521    // For backward compatibility, setting KMP_CPUINFO_FILE =>
4522    // KMP_TOPOLOGY_METHOD=cpuinfo
4523    if ((__kmp_cpuinfo_file != NULL) &&
4524        (__kmp_affinity_top_method == affinity_top_method_all)) {
4525      __kmp_affinity_top_method = affinity_top_method_cpuinfo;
4526    }
4527  
4528    if (__kmp_affinity_top_method == affinity_top_method_all) {
4529  // In the default code path, errors are not fatal - we just try using
4530  // another method. We only emit a warning message if affinity is on, or the
4531  // verbose flag is set, and the nowarnings flag was not set.
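// Discovery methods are tried in order: hwloc (if enabled), x2apic ids,
// legacy apic ids, cpuinfo, processor groups (Windows), and finally a flat
// map, which is not expected to fail.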
4532  #if KMP_USE_HWLOC
4533      if (!success &&
4534          __kmp_affinity_dispatch->get_api_type() == KMPAffinity::HWLOC) {
4535        if (!__kmp_hwloc_error) {
4536          success = __kmp_affinity_create_hwloc_map(&msg_id);
4537          if (!success && verbose) {
4538            KMP_INFORM(AffIgnoringHwloc, env_var);
4539          }
4540        } else if (verbose) {
4541          KMP_INFORM(AffIgnoringHwloc, env_var);
4542        }
4543      }
4544  #endif
4545  
4546  #if KMP_ARCH_X86 || KMP_ARCH_X86_64
4547      if (!success) {
4548        success = __kmp_affinity_create_x2apicid_map(&msg_id);
4549        if (!success && verbose && msg_id != kmp_i18n_null) {
4550          KMP_INFORM(AffInfoStr, env_var, __kmp_i18n_catgets(msg_id));
4551        }
4552      }
4553      if (!success) {
4554        success = __kmp_affinity_create_apicid_map(&msg_id);
4555        if (!success && verbose && msg_id != kmp_i18n_null) {
4556          KMP_INFORM(AffInfoStr, env_var, __kmp_i18n_catgets(msg_id));
4557        }
4558      }
4559  #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
4560  
4561  #if KMP_OS_LINUX || KMP_OS_AIX
4562      if (!success) {
4563        int line = 0;
4564        success = __kmp_affinity_create_cpuinfo_map(&line, &msg_id);
4565        if (!success && verbose && msg_id != kmp_i18n_null) {
4566          KMP_INFORM(AffInfoStr, env_var, __kmp_i18n_catgets(msg_id));
4567        }
4568      }
4569  #endif /* KMP_OS_LINUX */
4570  
4571  #if KMP_GROUP_AFFINITY
4572      if (!success && (__kmp_num_proc_groups > 1)) {
4573        success = __kmp_affinity_create_proc_group_map(&msg_id);
4574        if (!success && verbose && msg_id != kmp_i18n_null) {
4575          KMP_INFORM(AffInfoStr, env_var, __kmp_i18n_catgets(msg_id));
4576        }
4577      }
4578  #endif /* KMP_GROUP_AFFINITY */
4579  
4580      if (!success) {
4581        success = __kmp_affinity_create_flat_map(&msg_id);
4582        if (!success && verbose && msg_id != kmp_i18n_null) {
4583          KMP_INFORM(AffInfoStr, env_var, __kmp_i18n_catgets(msg_id));
4584        }
4585        KMP_ASSERT(success);
4586      }
4587    }
4588  
4589  // If the user has specified that a particular topology discovery method is to be
4590  // used, then we abort if that method fails. The exception is group affinity,
4591  // which might have been implicitly set.
4592  #if KMP_USE_HWLOC
4593    else if (__kmp_affinity_top_method == affinity_top_method_hwloc) {
4594      KMP_ASSERT(__kmp_affinity_dispatch->get_api_type() == KMPAffinity::HWLOC);
4595      success = __kmp_affinity_create_hwloc_map(&msg_id);
4596      if (!success) {
4597        KMP_ASSERT(msg_id != kmp_i18n_null);
4598        KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id));
4599      }
4600    }
4601  #endif // KMP_USE_HWLOC
4602  
4603  #if KMP_ARCH_X86 || KMP_ARCH_X86_64
4604    else if (__kmp_affinity_top_method == affinity_top_method_x2apicid ||
4605             __kmp_affinity_top_method == affinity_top_method_x2apicid_1f) {
4606      success = __kmp_affinity_create_x2apicid_map(&msg_id);
4607      if (!success) {
4608        KMP_ASSERT(msg_id != kmp_i18n_null);
4609        KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id));
4610      }
4611    } else if (__kmp_affinity_top_method == affinity_top_method_apicid) {
4612      success = __kmp_affinity_create_apicid_map(&msg_id);
4613      if (!success) {
4614        KMP_ASSERT(msg_id != kmp_i18n_null);
4615        KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id));
4616      }
4617    }
4618  #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
4619  
4620    else if (__kmp_affinity_top_method == affinity_top_method_cpuinfo) {
4621      int line = 0;
4622      success = __kmp_affinity_create_cpuinfo_map(&line, &msg_id);
4623      if (!success) {
4624        KMP_ASSERT(msg_id != kmp_i18n_null);
4625        const char *filename = __kmp_cpuinfo_get_filename();
4626        if (line > 0) {
4627          KMP_FATAL(FileLineMsgExiting, filename, line,
4628                    __kmp_i18n_catgets(msg_id));
4629        } else {
4630          KMP_FATAL(FileMsgExiting, filename, __kmp_i18n_catgets(msg_id));
4631        }
4632      }
4633    }
4634  
4635  #if KMP_GROUP_AFFINITY
4636    else if (__kmp_affinity_top_method == affinity_top_method_group) {
4637      success = __kmp_affinity_create_proc_group_map(&msg_id);
4638      KMP_ASSERT(success);
4639      if (!success) {
4640        KMP_ASSERT(msg_id != kmp_i18n_null);
4641        KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id));
4642      }
4643    }
4644  #endif /* KMP_GROUP_AFFINITY */
4645  
4646    else if (__kmp_affinity_top_method == affinity_top_method_flat) {
4647      success = __kmp_affinity_create_flat_map(&msg_id);
4648      // should not fail
4649      KMP_ASSERT(success);
4650    }
4651  
4652    // Early exit if topology could not be created
4653    if (!__kmp_topology) {
4654      if (KMP_AFFINITY_CAPABLE()) {
4655        KMP_AFF_WARNING(affinity, ErrorInitializeAffinity);
4656      }
4657      if (nPackages > 0 && nCoresPerPkg > 0 && __kmp_nThreadsPerCore > 0 &&
4658          __kmp_ncores > 0) {
4659        __kmp_topology = kmp_topology_t::allocate(0, 0, NULL);
4660        __kmp_topology->canonicalize(nPackages, nCoresPerPkg,
4661                                     __kmp_nThreadsPerCore, __kmp_ncores);
4662        if (verbose) {
4663          __kmp_topology->print(env_var);
4664        }
4665      }
4666      return false;
4667    }
4668  
4669    // Canonicalize, print (if requested), apply KMP_HW_SUBSET
4670    __kmp_topology->canonicalize();
4671    if (verbose)
4672      __kmp_topology->print(env_var);
4673    bool filtered = __kmp_topology->filter_hw_subset();
4674    if (filtered && verbose)
4675      __kmp_topology->print("KMP_HW_SUBSET");
4676    return success;
4677  }
4678  
4679  static void __kmp_aux_affinity_initialize(kmp_affinity_t &affinity) {
4680    bool is_regular_affinity = (&affinity == &__kmp_affinity);
4681    bool is_hidden_helper_affinity = (&affinity == &__kmp_hh_affinity);
4682    const char *env_var = __kmp_get_affinity_env_var(affinity);
4683  
4684    if (affinity.flags.initialized) {
4685      KMP_ASSERT(__kmp_affin_fullMask != NULL);
4686      return;
4687    }
4688  
4689    if (is_regular_affinity && (!__kmp_affin_fullMask || !__kmp_affin_origMask))
4690      __kmp_aux_affinity_initialize_masks(affinity);
4691  
4692    if (is_regular_affinity && !__kmp_topology) {
4693      bool success = __kmp_aux_affinity_initialize_topology(affinity);
4694      if (success) {
4695        KMP_ASSERT(__kmp_avail_proc == __kmp_topology->get_num_hw_threads());
4696      } else {
4697        affinity.type = affinity_none;
4698        KMP_AFFINITY_DISABLE();
4699      }
4700    }
4701  
4702    // If KMP_AFFINITY=none, then only create the single "none" place
4703    // which is the process's initial affinity mask or the number of
4704    // hardware threads, depending on the respect,norespect setting
4705    if (affinity.type == affinity_none) {
4706      __kmp_create_affinity_none_places(affinity);
4707  #if KMP_USE_HIER_SCHED
4708      __kmp_dispatch_set_hierarchy_values();
4709  #endif
4710      affinity.flags.initialized = TRUE;
4711      return;
4712    }
4713  
4714    __kmp_topology->set_granularity(affinity);
4715    int depth = __kmp_topology->get_depth();
4716  
4717    // Create the table of masks, indexed by thread Id.
4718    unsigned numUnique;
4719    int numAddrs = __kmp_topology->get_num_hw_threads();
4720    // If OMP_PLACES=cores:<attribute> is specified, then attempt
4721    // to make OS Id mask table using those attributes
4722    if (affinity.core_attr_gran.valid) {
4723      __kmp_create_os_id_masks(&numUnique, affinity, [&](int idx) {
4724        KMP_ASSERT(idx >= -1);
4725        for (int i = idx + 1; i < numAddrs; ++i)
4726          if (__kmp_topology->at(i).attrs.contains(affinity.core_attr_gran))
4727            return i;
4728        return numAddrs;
4729      });
4730      if (!affinity.os_id_masks) {
4731        const char *core_attribute;
4732        if (affinity.core_attr_gran.core_eff != kmp_hw_attr_t::UNKNOWN_CORE_EFF)
4733          core_attribute = "core_efficiency";
4734        else
4735          core_attribute = "core_type";
4736        KMP_AFF_WARNING(affinity, AffIgnoringNotAvailable, env_var,
4737                        core_attribute,
4738                        __kmp_hw_get_catalog_string(KMP_HW_CORE, /*plural=*/true))
4739      }
4740    }
4741    // If core attributes did not work, or none were specified,
4742    // then make the OS Id mask table in the typical incremental way.
4743    if (!affinity.os_id_masks) {
4744      __kmp_create_os_id_masks(&numUnique, affinity, [](int idx) {
4745        KMP_ASSERT(idx >= -1);
4746        return idx + 1;
4747      });
4748    }
4749    if (affinity.gran_levels == 0) {
4750      KMP_DEBUG_ASSERT((int)numUnique == __kmp_avail_proc);
4751    }
4752  
4753    switch (affinity.type) {
4754  
4755    case affinity_explicit:
4756      KMP_DEBUG_ASSERT(affinity.proclist != NULL);
4757      if (is_hidden_helper_affinity ||
4758          __kmp_nested_proc_bind.bind_types[0] == proc_bind_intel) {
4759        __kmp_affinity_process_proclist(affinity);
4760      } else {
4761        __kmp_affinity_process_placelist(affinity);
4762      }
4763      if (affinity.num_masks == 0) {
4764        KMP_AFF_WARNING(affinity, AffNoValidProcID);
4765        affinity.type = affinity_none;
4766        __kmp_create_affinity_none_places(affinity);
4767        affinity.flags.initialized = TRUE;
4768        return;
4769      }
4770      break;
4771  
4772    // The other affinity types rely on sorting the hardware threads according to
4773    // some permutation of the machine topology tree. Set affinity.compact
4774    // and affinity.offset appropriately, then jump to a common code
4775    // fragment to do the sort and create the array of affinity masks.
4776    case affinity_logical:
4777      affinity.compact = 0;
4778      if (affinity.offset) {
4779        affinity.offset =
4780            __kmp_nThreadsPerCore * affinity.offset % __kmp_avail_proc;
4781      }
4782      goto sortTopology;
4783  
4784    case affinity_physical:
4785      if (__kmp_nThreadsPerCore > 1) {
4786        affinity.compact = 1;
4787        if (affinity.compact >= depth) {
4788          affinity.compact = 0;
4789        }
4790      } else {
4791        affinity.compact = 0;
4792      }
4793      if (affinity.offset) {
4794        affinity.offset =
4795            __kmp_nThreadsPerCore * affinity.offset % __kmp_avail_proc;
4796      }
4797      goto sortTopology;
4798  
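  // scatter is the mirror image of compact: invert the compaction level so
  // that sorting spreads consecutive places across the outermost topology
  // levels first.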
4799    case affinity_scatter:
4800      if (affinity.compact >= depth) {
4801        affinity.compact = 0;
4802      } else {
4803        affinity.compact = depth - 1 - affinity.compact;
4804      }
4805      goto sortTopology;
4806  
4807    case affinity_compact:
4808      if (affinity.compact >= depth) {
4809        affinity.compact = depth - 1;
4810      }
4811      goto sortTopology;
4812  
4813    case affinity_balanced:
4814      if (depth <= 1 || is_hidden_helper_affinity) {
4815        KMP_AFF_WARNING(affinity, AffBalancedNotAvail, env_var);
4816        affinity.type = affinity_none;
4817        __kmp_create_affinity_none_places(affinity);
4818        affinity.flags.initialized = TRUE;
4819        return;
4820      } else if (!__kmp_topology->is_uniform()) {
4821        // Save the depth for further usage
4822        __kmp_aff_depth = depth;
4823  
4824        int core_level =
4825            __kmp_affinity_find_core_level(__kmp_avail_proc, depth - 1);
4826        int ncores = __kmp_affinity_compute_ncores(__kmp_avail_proc, depth - 1,
4827                                                   core_level);
4828        int maxprocpercore = __kmp_affinity_max_proc_per_core(
4829            __kmp_avail_proc, depth - 1, core_level);
4830  
4831        int nproc = ncores * maxprocpercore;
4832        if ((nproc < 2) || (nproc < __kmp_avail_proc)) {
4833          KMP_AFF_WARNING(affinity, AffBalancedNotAvail, env_var);
4834          affinity.type = affinity_none;
4835          __kmp_create_affinity_none_places(affinity);
4836          affinity.flags.initialized = TRUE;
4837          return;
4838        }
4839  
4840        procarr = (int *)__kmp_allocate(sizeof(int) * nproc);
4841        for (int i = 0; i < nproc; i++) {
4842          procarr[i] = -1;
4843        }
4844  
4845        int lastcore = -1;
4846        int inlastcore = 0;
4847        for (int i = 0; i < __kmp_avail_proc; i++) {
4848          int proc = __kmp_topology->at(i).os_id;
4849          int core = __kmp_affinity_find_core(i, depth - 1, core_level);
4850  
4851          if (core == lastcore) {
4852            inlastcore++;
4853          } else {
4854            inlastcore = 0;
4855          }
4856          lastcore = core;
4857  
4858          procarr[core * maxprocpercore + inlastcore] = proc;
4859        }
4860      }
4861      if (affinity.compact >= depth) {
4862        affinity.compact = depth - 1;
4863      }
4864  
4865    sortTopology:
4866      // Allocate the gtid->affinity mask table.
4867      if (affinity.flags.dups) {
4868        affinity.num_masks = __kmp_avail_proc;
4869      } else {
4870        affinity.num_masks = numUnique;
4871      }
4872  
4873      if ((__kmp_nested_proc_bind.bind_types[0] != proc_bind_intel) &&
4874          (__kmp_affinity_num_places > 0) &&
4875          ((unsigned)__kmp_affinity_num_places < affinity.num_masks) &&
4876          !is_hidden_helper_affinity) {
4877        affinity.num_masks = __kmp_affinity_num_places;
4878      }
4879  
4880      KMP_CPU_ALLOC_ARRAY(affinity.masks, affinity.num_masks);
4881  
4882      // Sort the topology table according to the current setting of
4883      // affinity.compact, then fill out affinity.masks.
4884      __kmp_topology->sort_compact(affinity);
4885      {
4886        int i;
4887        unsigned j;
4888        int num_hw_threads = __kmp_topology->get_num_hw_threads();
4889        kmp_full_mask_modifier_t full_mask;
4890        for (i = 0, j = 0; i < num_hw_threads; i++) {
4891          if ((!affinity.flags.dups) && (!__kmp_topology->at(i).leader)) {
4892            continue;
4893          }
4894          int osId = __kmp_topology->at(i).os_id;
4895  
4896          kmp_affin_mask_t *src = KMP_CPU_INDEX(affinity.os_id_masks, osId);
4897          kmp_affin_mask_t *dest = KMP_CPU_INDEX(affinity.masks, j);
4898          KMP_ASSERT(KMP_CPU_ISSET(osId, src));
4899          KMP_CPU_COPY(dest, src);
4900          full_mask.include(src);
4901          if (++j >= affinity.num_masks) {
4902            break;
4903          }
4904        }
4905        KMP_DEBUG_ASSERT(j == affinity.num_masks);
4906        // See if the places list further restricts or changes the full mask
4907        if (full_mask.restrict_to_mask() && affinity.flags.verbose) {
4908          __kmp_topology->print(env_var);
4909        }
4910      }
4911      // Sort the topology back using ids
4912      __kmp_topology->sort_ids();
4913      break;
4914  
4915    default:
4916      KMP_ASSERT2(0, "Unexpected affinity setting");
4917    }
4918    __kmp_aux_affinity_initialize_other_data(affinity);
4919    affinity.flags.initialized = TRUE;
4920  }
4921  
4922  void __kmp_affinity_initialize(kmp_affinity_t &affinity) {
4923    // Much of the code above was written assuming that if a machine was not
4924    // affinity capable, then affinity type == affinity_none.
4925    // We now explicitly represent this as affinity type == affinity_disabled.
4926    // There are too many checks for affinity type == affinity_none in this code.
4927    // Instead of trying to change them all, check if
4928    // affinity type == affinity_disabled, and if so, slam it with affinity_none,
4929    // call the real initialization routine, then restore affinity type to
4930    // affinity_disabled.
4931    int disabled = (affinity.type == affinity_disabled);
4932    if (!KMP_AFFINITY_CAPABLE())
4933      KMP_ASSERT(disabled);
4934    if (disabled)
4935      affinity.type = affinity_none;
4936    __kmp_aux_affinity_initialize(affinity);
4937    if (disabled)
4938      affinity.type = affinity_disabled;
4939  }
4940  
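// Tear down all affinity state: per-kind mask arrays and topology info, the
// saved original affinity mask (restoring it, or unbinding on AIX), the
// balanced-affinity helper arrays, the hwloc topology, KMP_HW_SUBSET data,
// the machine topology, and the affinity dispatch API.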
4941  void __kmp_affinity_uninitialize(void) {
4942    for (kmp_affinity_t *affinity : __kmp_affinities) {
4943      if (affinity->masks != NULL)
4944        KMP_CPU_FREE_ARRAY(affinity->masks, affinity->num_masks);
4945      if (affinity->os_id_masks != NULL)
4946        KMP_CPU_FREE_ARRAY(affinity->os_id_masks, affinity->num_os_id_masks);
4947      if (affinity->proclist != NULL)
4948        __kmp_free(affinity->proclist);
4949      if (affinity->ids != NULL)
4950        __kmp_free(affinity->ids);
4951      if (affinity->attrs != NULL)
4952        __kmp_free(affinity->attrs);
4953      *affinity = KMP_AFFINITY_INIT(affinity->env_var);
4954    }
4955    if (__kmp_affin_origMask != NULL) {
4956      if (KMP_AFFINITY_CAPABLE()) {
4957  #if KMP_OS_AIX
4958        // Uninitialize by unbinding the thread.
4959        bindprocessor(BINDTHREAD, thread_self(), PROCESSOR_CLASS_ANY);
4960  #else
4961        __kmp_set_system_affinity(__kmp_affin_origMask, FALSE);
4962  #endif
4963      }
4964      KMP_CPU_FREE(__kmp_affin_origMask);
4965      __kmp_affin_origMask = NULL;
4966    }
4967    __kmp_affinity_num_places = 0;
4968    if (procarr != NULL) {
4969      __kmp_free(procarr);
4970      procarr = NULL;
4971    }
4972    if (__kmp_osid_to_hwthread_map) {
4973      __kmp_free(__kmp_osid_to_hwthread_map);
4974      __kmp_osid_to_hwthread_map = NULL;
4975    }
4976  #if KMP_USE_HWLOC
4977    if (__kmp_hwloc_topology != NULL) {
4978      hwloc_topology_destroy(__kmp_hwloc_topology);
4979      __kmp_hwloc_topology = NULL;
4980    }
4981  #endif
4982    if (__kmp_hw_subset) {
4983      kmp_hw_subset_t::deallocate(__kmp_hw_subset);
4984      __kmp_hw_subset = nullptr;
4985    }
4986    if (__kmp_topology) {
4987      kmp_topology_t::deallocate(__kmp_topology);
4988      __kmp_topology = nullptr;
4989    }
4990    KMPAffinity::destroy_api();
4991  }
4992  
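// Map a global thread id to a place: compute a mask index (hidden helper
// threads are numbered separately), apply the place offset, wrap modulo the
// number of places, and return both the place index and its affinity mask.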
4993  static void __kmp_select_mask_by_gtid(int gtid, const kmp_affinity_t *affinity,
4994                                        int *place, kmp_affin_mask_t **mask) {
4995    int mask_idx;
4996    bool is_hidden_helper = KMP_HIDDEN_HELPER_THREAD(gtid);
4997    if (is_hidden_helper)
4998      // The first gtid is the regular primary thread, the second gtid is the main
4999      // thread of the hidden team, which does not participate in task execution.
5000      mask_idx = gtid - 2;
5001    else
5002      mask_idx = __kmp_adjust_gtid_for_hidden_helpers(gtid);
5003    KMP_DEBUG_ASSERT(affinity->num_masks > 0);
5004    *place = (mask_idx + affinity->offset) % affinity->num_masks;
5005    *mask = KMP_CPU_INDEX(affinity->masks, *place);
5006  }
5007  
5008  // This function initializes the per-thread data concerning affinity including
5009  // the mask and topology information
5010  void __kmp_affinity_set_init_mask(int gtid, int isa_root) {
5011  
5012    kmp_info_t *th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[gtid]);
5013  
5014    // Set the thread topology information to default of unknown
5015    for (int id = 0; id < KMP_HW_LAST; ++id)
5016      th->th.th_topology_ids.ids[id] = kmp_hw_thread_t::UNKNOWN_ID;
5017    th->th.th_topology_attrs = KMP_AFFINITY_ATTRS_UNKNOWN;
5018  
5019    if (!KMP_AFFINITY_CAPABLE()) {
5020      return;
5021    }
5022  
5023    if (th->th.th_affin_mask == NULL) {
5024      KMP_CPU_ALLOC(th->th.th_affin_mask);
5025    } else {
5026      KMP_CPU_ZERO(th->th.th_affin_mask);
5027    }
5028  
5029    // Copy the thread mask to the kmp_info_t structure. If
5030    // __kmp_affinity.type == affinity_none, copy the "full" mask, i.e.
5031    // one that has all of the OS proc ids set, or if
5032    // __kmp_affinity.flags.respect is set, then the full mask is the
5033    // same as the mask of the initialization thread.
5034    kmp_affin_mask_t *mask;
5035    int i;
5036    const kmp_affinity_t *affinity;
5037    bool is_hidden_helper = KMP_HIDDEN_HELPER_THREAD(gtid);
5038  
5039    if (is_hidden_helper)
5040      affinity = &__kmp_hh_affinity;
5041    else
5042      affinity = &__kmp_affinity;
5043  
5044    if (KMP_AFFINITY_NON_PROC_BIND || is_hidden_helper) {
5045      if ((affinity->type == affinity_none) ||
5046          (affinity->type == affinity_balanced) ||
5047          KMP_HIDDEN_HELPER_MAIN_THREAD(gtid)) {
5048  #if KMP_GROUP_AFFINITY
5049        if (__kmp_num_proc_groups > 1) {
5050          return;
5051        }
5052  #endif
5053        KMP_ASSERT(__kmp_affin_fullMask != NULL);
5054        i = 0;
5055        mask = __kmp_affin_fullMask;
5056      } else {
5057        __kmp_select_mask_by_gtid(gtid, affinity, &i, &mask);
5058      }
5059    } else {
5060      if (!isa_root || __kmp_nested_proc_bind.bind_types[0] == proc_bind_false) {
5061  #if KMP_GROUP_AFFINITY
5062        if (__kmp_num_proc_groups > 1) {
5063          return;
5064        }
5065  #endif
5066        KMP_ASSERT(__kmp_affin_fullMask != NULL);
5067        i = KMP_PLACE_ALL;
5068        mask = __kmp_affin_fullMask;
5069      } else {
5070        __kmp_select_mask_by_gtid(gtid, affinity, &i, &mask);
5071      }
5072    }
5073  
5074    th->th.th_current_place = i;
5075    if (isa_root && !is_hidden_helper) {
5076      th->th.th_new_place = i;
5077      th->th.th_first_place = 0;
5078      th->th.th_last_place = affinity->num_masks - 1;
5079    } else if (KMP_AFFINITY_NON_PROC_BIND) {
5080      // When using a Non-OMP_PROC_BIND affinity method,
5081      // set all threads' place-partition-var to the entire place list
5082      th->th.th_first_place = 0;
5083      th->th.th_last_place = affinity->num_masks - 1;
5084    }
5085    // Copy topology information associated with the place
5086    if (i >= 0) {
5087      th->th.th_topology_ids = __kmp_affinity.ids[i];
5088      th->th.th_topology_attrs = __kmp_affinity.attrs[i];
5089    }
5090  
5091    if (i == KMP_PLACE_ALL) {
5092      KA_TRACE(100, ("__kmp_affinity_set_init_mask: setting T#%d to all places\n",
5093                     gtid));
5094    } else {
5095      KA_TRACE(100, ("__kmp_affinity_set_init_mask: setting T#%d to place %d\n",
5096                     gtid, i));
5097    }
5098  
5099    KMP_CPU_COPY(th->th.th_affin_mask, mask);
5100  }
5101  
5102  void __kmp_affinity_bind_init_mask(int gtid) {
5103    if (!KMP_AFFINITY_CAPABLE()) {
5104      return;
5105    }
5106    kmp_info_t *th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[gtid]);
5107    const kmp_affinity_t *affinity;
5108    const char *env_var;
5109    bool is_hidden_helper = KMP_HIDDEN_HELPER_THREAD(gtid);
5110  
5111    if (is_hidden_helper)
5112      affinity = &__kmp_hh_affinity;
5113    else
5114      affinity = &__kmp_affinity;
5115    env_var = __kmp_get_affinity_env_var(*affinity, /*for_binding=*/true);
5116    /* to avoid duplicate printing (will be correctly printed on barrier) */
5117    if (affinity->flags.verbose && (affinity->type == affinity_none ||
5118                                    (th->th.th_current_place != KMP_PLACE_ALL &&
5119                                     affinity->type != affinity_balanced)) &&
5120        !KMP_HIDDEN_HELPER_MAIN_THREAD(gtid)) {
5121      char buf[KMP_AFFIN_MASK_PRINT_LEN];
5122      __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
5123                                th->th.th_affin_mask);
5124      KMP_INFORM(BoundToOSProcSet, env_var, (kmp_int32)getpid(), __kmp_gettid(),
5125                 gtid, buf);
5126    }
5127  
5128  #if KMP_OS_WINDOWS
5129    // On Windows* OS, the process affinity mask might have changed. If the user
5130    // didn't request affinity and this call fails, just continue silently.
5131    // See CQ171393.
5132    if (affinity->type == affinity_none) {
5133      __kmp_set_system_affinity(th->th.th_affin_mask, FALSE);
5134    } else
5135  #endif
5136  #ifndef KMP_OS_AIX
5137      // Do not set the full mask as the init mask on AIX.
5138      __kmp_set_system_affinity(th->th.th_affin_mask, TRUE);
5139  #endif
5140  }
5141  
5142  void __kmp_affinity_bind_place(int gtid) {
5143    // Hidden helper threads should not be affected by OMP_PLACES/OMP_PROC_BIND
5144    if (!KMP_AFFINITY_CAPABLE() || KMP_HIDDEN_HELPER_THREAD(gtid)) {
5145      return;
5146    }
5147  
5148    kmp_info_t *th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[gtid]);
5149  
5150    KA_TRACE(100, ("__kmp_affinity_bind_place: binding T#%d to place %d (current "
5151                   "place = %d)\n",
5152                   gtid, th->th.th_new_place, th->th.th_current_place));
5153  
5154    // Check that the new place is within this thread's partition.
5155    KMP_DEBUG_ASSERT(th->th.th_affin_mask != NULL);
5156    KMP_ASSERT(th->th.th_new_place >= 0);
5157    KMP_ASSERT((unsigned)th->th.th_new_place <= __kmp_affinity.num_masks);
5158    if (th->th.th_first_place <= th->th.th_last_place) {
5159      KMP_ASSERT((th->th.th_new_place >= th->th.th_first_place) &&
5160                 (th->th.th_new_place <= th->th.th_last_place));
5161    } else {
5162      KMP_ASSERT((th->th.th_new_place <= th->th.th_first_place) ||
5163                 (th->th.th_new_place >= th->th.th_last_place));
5164    }
5165  
5166    // Copy the thread mask to the kmp_info_t structure,
5167    // and set this thread's affinity.
5168    kmp_affin_mask_t *mask =
5169        KMP_CPU_INDEX(__kmp_affinity.masks, th->th.th_new_place);
5170    KMP_CPU_COPY(th->th.th_affin_mask, mask);
5171    th->th.th_current_place = th->th.th_new_place;
5172  
5173    if (__kmp_affinity.flags.verbose) {
5174      char buf[KMP_AFFIN_MASK_PRINT_LEN];
5175      __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
5176                                th->th.th_affin_mask);
5177      KMP_INFORM(BoundToOSProcSet, "OMP_PROC_BIND", (kmp_int32)getpid(),
5178                 __kmp_gettid(), gtid, buf);
5179    }
5180    __kmp_set_system_affinity(th->th.th_affin_mask, TRUE);
5181  }
5182  
5183  int __kmp_aux_set_affinity(void **mask) {
5184    int gtid;
5185    kmp_info_t *th;
5186    int retval;
5187  
5188    if (!KMP_AFFINITY_CAPABLE()) {
5189      return -1;
5190    }
5191  
5192    gtid = __kmp_entry_gtid();
5193    KA_TRACE(
5194        1000, (""); {
5195          char buf[KMP_AFFIN_MASK_PRINT_LEN];
5196          __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
5197                                    (kmp_affin_mask_t *)(*mask));
5198          __kmp_debug_printf(
5199              "kmp_set_affinity: setting affinity mask for thread %d = %s\n",
5200              gtid, buf);
5201        });
5202  
5203    if (__kmp_env_consistency_check) {
5204      if ((mask == NULL) || (*mask == NULL)) {
5205        KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
5206      } else {
5207        unsigned proc;
5208        int num_procs = 0;
5209  
5210        KMP_CPU_SET_ITERATE(proc, ((kmp_affin_mask_t *)(*mask))) {
5211          if (!KMP_CPU_ISSET(proc, __kmp_affin_fullMask)) {
5212            KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
5213          }
5214          if (!KMP_CPU_ISSET(proc, (kmp_affin_mask_t *)(*mask))) {
5215            continue;
5216          }
5217          num_procs++;
5218        }
5219        if (num_procs == 0) {
5220          KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
5221        }
5222  
5223  #if KMP_GROUP_AFFINITY
5224        if (__kmp_get_proc_group((kmp_affin_mask_t *)(*mask)) < 0) {
5225          KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
5226        }
5227  #endif /* KMP_GROUP_AFFINITY */
5228      }
5229    }
5230  
5231    th = __kmp_threads[gtid];
5232    KMP_DEBUG_ASSERT(th->th.th_affin_mask != NULL);
5233    retval = __kmp_set_system_affinity((kmp_affin_mask_t *)(*mask), FALSE);
5234    if (retval == 0) {
5235      KMP_CPU_COPY(th->th.th_affin_mask, (kmp_affin_mask_t *)(*mask));
5236    }
5237  
5238    th->th.th_current_place = KMP_PLACE_UNDEFINED;
5239    th->th.th_new_place = KMP_PLACE_UNDEFINED;
5240    th->th.th_first_place = 0;
5241    th->th.th_last_place = __kmp_affinity.num_masks - 1;
5242  
5243    // Turn off 4.0 affinity for the current thread at this parallel level.
5244    th->th.th_current_task->td_icvs.proc_bind = proc_bind_false;
5245  
5246    return retval;
5247  }
5248  
5249  int __kmp_aux_get_affinity(void **mask) {
5250    int gtid;
5251    int retval;
5252  #if KMP_OS_WINDOWS || KMP_OS_AIX || KMP_DEBUG
5253    kmp_info_t *th;
5254  #endif
5255    if (!KMP_AFFINITY_CAPABLE()) {
5256      return -1;
5257    }
5258  
5259    gtid = __kmp_entry_gtid();
5260  #if KMP_OS_WINDOWS || KMP_OS_AIX || KMP_DEBUG
5261    th = __kmp_threads[gtid];
5262  #else
5263    (void)gtid; // unused variable
5264  #endif
5265    KMP_DEBUG_ASSERT(th->th.th_affin_mask != NULL);
5266  
5267    KA_TRACE(
5268        1000, (""); {
5269          char buf[KMP_AFFIN_MASK_PRINT_LEN];
5270          __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
5271                                    th->th.th_affin_mask);
5272          __kmp_printf(
5273              "kmp_get_affinity: stored affinity mask for thread %d = %s\n", gtid,
5274              buf);
5275        });
5276  
5277    if (__kmp_env_consistency_check) {
5278      if ((mask == NULL) || (*mask == NULL)) {
5279        KMP_FATAL(AffinityInvalidMask, "kmp_get_affinity");
5280      }
5281    }
5282  
5283  #if !KMP_OS_WINDOWS && !KMP_OS_AIX
5284  
5285    retval = __kmp_get_system_affinity((kmp_affin_mask_t *)(*mask), FALSE);
5286    KA_TRACE(
5287        1000, (""); {
5288          char buf[KMP_AFFIN_MASK_PRINT_LEN];
5289          __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
5290                                    (kmp_affin_mask_t *)(*mask));
5291          __kmp_printf(
5292              "kmp_get_affinity: system affinity mask for thread %d = %s\n", gtid,
5293              buf);
5294        });
5295    return retval;
5296  
5297  #else
5298    (void)retval;
5299  
5300    KMP_CPU_COPY((kmp_affin_mask_t *)(*mask), th->th.th_affin_mask);
5301    return 0;
5302  
5303  #endif /* !KMP_OS_WINDOWS && !KMP_OS_AIX */
5304  }
5305  
5306  int __kmp_aux_get_affinity_max_proc() {
5307    if (!KMP_AFFINITY_CAPABLE()) {
5308      return 0;
5309    }
5310  #if KMP_GROUP_AFFINITY
5311    if (__kmp_num_proc_groups > 1) {
5312      return (int)(__kmp_num_proc_groups * sizeof(DWORD_PTR) * CHAR_BIT);
5313    }
5314  #endif
5315    return __kmp_xproc;
5316  }
5317  
5318  int __kmp_aux_set_affinity_mask_proc(int proc, void **mask) {
5319    if (!KMP_AFFINITY_CAPABLE()) {
5320      return -1;
5321    }
5322  
5323    KA_TRACE(
5324        1000, (""); {
5325          int gtid = __kmp_entry_gtid();
5326          char buf[KMP_AFFIN_MASK_PRINT_LEN];
5327          __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
5328                                    (kmp_affin_mask_t *)(*mask));
5329          __kmp_debug_printf("kmp_set_affinity_mask_proc: setting proc %d in "
5330                             "affinity mask for thread %d = %s\n",
5331                             proc, gtid, buf);
5332        });
5333  
5334    if (__kmp_env_consistency_check) {
5335      if ((mask == NULL) || (*mask == NULL)) {
5336        KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity_mask_proc");
5337      }
5338    }
5339  
5340    if ((proc < 0) || (proc >= __kmp_aux_get_affinity_max_proc())) {
5341      return -1;
5342    }
5343    if (!KMP_CPU_ISSET(proc, __kmp_affin_fullMask)) {
5344      return -2;
5345    }
5346  
5347    KMP_CPU_SET(proc, (kmp_affin_mask_t *)(*mask));
5348    return 0;
5349  }
5350  
5351  int __kmp_aux_unset_affinity_mask_proc(int proc, void **mask) {
5352    if (!KMP_AFFINITY_CAPABLE()) {
5353      return -1;
5354    }
5355  
5356    KA_TRACE(
5357        1000, (""); {
5358          int gtid = __kmp_entry_gtid();
5359          char buf[KMP_AFFIN_MASK_PRINT_LEN];
5360          __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
5361                                    (kmp_affin_mask_t *)(*mask));
5362          __kmp_debug_printf("kmp_unset_affinity_mask_proc: unsetting proc %d in "
5363                             "affinity mask for thread %d = %s\n",
5364                             proc, gtid, buf);
5365        });
5366  
5367    if (__kmp_env_consistency_check) {
5368      if ((mask == NULL) || (*mask == NULL)) {
5369        KMP_FATAL(AffinityInvalidMask, "kmp_unset_affinity_mask_proc");
5370      }
5371    }
5372  
5373    if ((proc < 0) || (proc >= __kmp_aux_get_affinity_max_proc())) {
5374      return -1;
5375    }
5376    if (!KMP_CPU_ISSET(proc, __kmp_affin_fullMask)) {
5377      return -2;
5378    }
5379  
5380    KMP_CPU_CLR(proc, (kmp_affin_mask_t *)(*mask));
5381    return 0;
5382  }
5383  
5384  int __kmp_aux_get_affinity_mask_proc(int proc, void **mask) {
5385    if (!KMP_AFFINITY_CAPABLE()) {
5386      return -1;
5387    }
5388  
5389    KA_TRACE(
5390        1000, (""); {
5391          int gtid = __kmp_entry_gtid();
5392          char buf[KMP_AFFIN_MASK_PRINT_LEN];
5393          __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
5394                                    (kmp_affin_mask_t *)(*mask));
5395          __kmp_debug_printf("kmp_get_affinity_mask_proc: getting proc %d in "
5396                             "affinity mask for thread %d = %s\n",
5397                             proc, gtid, buf);
5398        });
5399  
5400    if (__kmp_env_consistency_check) {
5401      if ((mask == NULL) || (*mask == NULL)) {
5402        KMP_FATAL(AffinityInvalidMask, "kmp_get_affinity_mask_proc");
5403      }
5404    }
5405  
5406    if ((proc < 0) || (proc >= __kmp_aux_get_affinity_max_proc())) {
5407      return -1;
5408    }
5409    if (!KMP_CPU_ISSET(proc, __kmp_affin_fullMask)) {
5410      return 0;
5411    }
5412  
5413    return KMP_CPU_ISSET(proc, (kmp_affin_mask_t *)(*mask));
5414  }
5415  
5416  #if KMP_WEIGHTED_ITERATIONS_SUPPORTED
5417  // Returns the first OS proc id with an ATOM core
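// Assumes the hardware threads are ordered so that all KMP_HW_CORE_TYPE_CORE
// entries precede the KMP_HW_CORE_TYPE_ATOM entries, and binary searches for
// the first ATOM entry; returns -1 if there is none.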
5418  int __kmp_get_first_osid_with_ecore(void) {
5419    int low = 0;
5420    int high = __kmp_topology->get_num_hw_threads() - 1;
5421    int mid = 0;
5422    while (high - low > 1) {
5423      mid = (high + low) / 2;
5424      if (__kmp_topology->at(mid).attrs.get_core_type() ==
5425          KMP_HW_CORE_TYPE_CORE) {
5426        low = mid + 1;
5427      } else {
5428        high = mid;
5429      }
5430    }
5431    if (__kmp_topology->at(mid).attrs.get_core_type() == KMP_HW_CORE_TYPE_ATOM) {
5432      return mid;
5433    }
5434    return -1;
5435  }
5436  #endif
5437  
5438  // Dynamic affinity settings - Affinity balanced
5439  void __kmp_balanced_affinity(kmp_info_t *th, int nthreads) {
5440    KMP_DEBUG_ASSERT(th);
5441    bool fine_gran = true;
5442    int tid = th->th.th_info.ds.ds_tid;
5443    const char *env_var = "KMP_AFFINITY";
5444  
5445    // Do not perform balanced affinity for the hidden helper threads
5446    if (KMP_HIDDEN_HELPER_THREAD(__kmp_gtid_from_thread(th)))
5447      return;
5448  
5449    switch (__kmp_affinity.gran) {
5450    case KMP_HW_THREAD:
5451      break;
5452    case KMP_HW_CORE:
5453      if (__kmp_nThreadsPerCore > 1) {
5454        fine_gran = false;
5455      }
5456      break;
5457    case KMP_HW_SOCKET:
5458      if (nCoresPerPkg > 1) {
5459        fine_gran = false;
5460      }
5461      break;
5462    default:
5463      fine_gran = false;
5464    }
5465  
5466    if (__kmp_topology->is_uniform()) {
5467      int coreID;
5468      int threadID;
5469      // Number of hyper threads per core in HT machine
5470      int __kmp_nth_per_core = __kmp_avail_proc / __kmp_ncores;
5471      // Number of cores
5472      int ncores = __kmp_ncores;
5473      if ((nPackages > 1) && (__kmp_nth_per_core <= 1)) {
5474        __kmp_nth_per_core = __kmp_avail_proc / nPackages;
5475        ncores = nPackages;
5476      }
5477      // How many threads will be bound to each core
5478      int chunk = nthreads / ncores;
5479      // How many cores will have an additional thread bound to them - "big cores"
5480      int big_cores = nthreads % ncores;
5481      // Number of threads on the big cores
5482      int big_nth = (chunk + 1) * big_cores;
5483      if (tid < big_nth) {
5484        coreID = tid / (chunk + 1);
5485        threadID = (tid % (chunk + 1)) % __kmp_nth_per_core;
5486      } else { // tid >= big_nth
5487        coreID = (tid - big_cores) / chunk;
5488        threadID = ((tid - big_cores) % chunk) % __kmp_nth_per_core;
5489      }
5490      KMP_DEBUG_ASSERT2(KMP_AFFINITY_CAPABLE(),
5491                        "Illegal set affinity operation when not capable");
5492  
5493      kmp_affin_mask_t *mask = th->th.th_affin_mask;
5494      KMP_CPU_ZERO(mask);
5495  
5496      if (fine_gran) {
5497        int osID =
5498            __kmp_topology->at(coreID * __kmp_nth_per_core + threadID).os_id;
5499        KMP_CPU_SET(osID, mask);
5500      } else {
5501        for (int i = 0; i < __kmp_nth_per_core; i++) {
5502          int osID;
5503          osID = __kmp_topology->at(coreID * __kmp_nth_per_core + i).os_id;
5504          KMP_CPU_SET(osID, mask);
5505        }
5506      }
5507      if (__kmp_affinity.flags.verbose) {
5508        char buf[KMP_AFFIN_MASK_PRINT_LEN];
5509        __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, mask);
5510        KMP_INFORM(BoundToOSProcSet, env_var, (kmp_int32)getpid(), __kmp_gettid(),
5511                   tid, buf);
5512      }
5513      __kmp_affinity_get_thread_topology_info(th);
5514      __kmp_set_system_affinity(mask, TRUE);
5515    } else { // Non-uniform topology
5516  
5517      kmp_affin_mask_t *mask = th->th.th_affin_mask;
5518      KMP_CPU_ZERO(mask);
5519  
5520      int core_level =
5521          __kmp_affinity_find_core_level(__kmp_avail_proc, __kmp_aff_depth - 1);
5522      int ncores = __kmp_affinity_compute_ncores(__kmp_avail_proc,
5523                                                 __kmp_aff_depth - 1, core_level);
5524      int nth_per_core = __kmp_affinity_max_proc_per_core(
5525          __kmp_avail_proc, __kmp_aff_depth - 1, core_level);
5526  
5527      // For performance gain consider the special case nthreads ==
5528      // __kmp_avail_proc
5529      if (nthreads == __kmp_avail_proc) {
5530        if (fine_gran) {
5531          int osID = __kmp_topology->at(tid).os_id;
5532          KMP_CPU_SET(osID, mask);
5533        } else {
5534          int core =
5535              __kmp_affinity_find_core(tid, __kmp_aff_depth - 1, core_level);
5536          for (int i = 0; i < __kmp_avail_proc; i++) {
5537            int osID = __kmp_topology->at(i).os_id;
5538            if (__kmp_affinity_find_core(i, __kmp_aff_depth - 1, core_level) ==
5539                core) {
5540              KMP_CPU_SET(osID, mask);
5541            }
5542          }
5543        }
5544      } else if (nthreads <= ncores) {
5545  
5546        int core = 0;
5547        for (int i = 0; i < ncores; i++) {
5548          // Check if this core from procarr[] is in the mask
5549          int in_mask = 0;
5550          for (int j = 0; j < nth_per_core; j++) {
5551            if (procarr[i * nth_per_core + j] != -1) {
5552              in_mask = 1;
5553              break;
5554            }
5555          }
5556          if (in_mask) {
5557            if (tid == core) {
5558              for (int j = 0; j < nth_per_core; j++) {
5559                int osID = procarr[i * nth_per_core + j];
5560                if (osID != -1) {
5561                  KMP_CPU_SET(osID, mask);
5562                  // For fine granularity it is enough to set the first available
5563                  // osID for this core
5564                  if (fine_gran) {
5565                    break;
5566                  }
5567                }
5568              }
5569              break;
5570            } else {
5571              core++;
5572            }
5573          }
5574        }
5575      } else { // nthreads > ncores
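      // Spread the threads over the available hardware contexts one thread
      // at a time; newarr[i] accumulates how many threads are assigned to
      // context i, and this thread binds to the context where the running
      // sum of newarr first exceeds its tid.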
5576        // Array to save the number of processors at each core
5577        int *nproc_at_core = (int *)KMP_ALLOCA(sizeof(int) * ncores);
5578        // Array to save the number of cores with "x" available processors;
5579        int *ncores_with_x_procs =
5580            (int *)KMP_ALLOCA(sizeof(int) * (nth_per_core + 1));
5581        // Array holding the number of cores with between x and nth_per_core procs
5582        int *ncores_with_x_to_max_procs =
5583            (int *)KMP_ALLOCA(sizeof(int) * (nth_per_core + 1));
5584  
5585        for (int i = 0; i <= nth_per_core; i++) {
5586          ncores_with_x_procs[i] = 0;
5587          ncores_with_x_to_max_procs[i] = 0;
5588        }
5589  
5590        for (int i = 0; i < ncores; i++) {
5591          int cnt = 0;
5592          for (int j = 0; j < nth_per_core; j++) {
5593            if (procarr[i * nth_per_core + j] != -1) {
5594              cnt++;
5595            }
5596          }
5597          nproc_at_core[i] = cnt;
5598          ncores_with_x_procs[cnt]++;
5599        }
5600  
5601        for (int i = 0; i <= nth_per_core; i++) {
5602          for (int j = i; j <= nth_per_core; j++) {
5603            ncores_with_x_to_max_procs[i] += ncores_with_x_procs[j];
5604          }
5605        }
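      // For illustration (hypothetical numbers): with ncores = 3 and
      // nth_per_core = 2, if the cores expose 2, 1 and 2 available procs,
      // then ncores_with_x_procs = {0, 1, 2} and
      // ncores_with_x_to_max_procs = {3, 3, 2}.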
5606  
5607        // Maximum possible number of hardware contexts across all cores
5608        int nproc = nth_per_core * ncores;
5609        // Array tracking how many threads are assigned to each context
5610        int *newarr = (int *)__kmp_allocate(sizeof(int) * nproc);
5611        for (int i = 0; i < nproc; i++) {
5612          newarr[i] = 0;
5613        }
5614  
5615        int nth = nthreads;
5616        int flag = 0;
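      // Spread the threads over the contexts: the first pass of the while loop
      // (flag == 0) places at most one thread on each free context, visiting
      // the cores breadth-first; once every available context holds a thread,
      // later passes (flag == 1) stack the remaining threads onto contexts
      // that are already in use.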
5617        while (nth > 0) {
5618          for (int j = 1; j <= nth_per_core; j++) {
5619            int cnt = ncores_with_x_to_max_procs[j];
5620            for (int i = 0; i < ncores; i++) {
5621            // Skip cores with no available processors
5622              if (nproc_at_core[i] == 0) {
5623                continue;
5624              }
5625              for (int k = 0; k < nth_per_core; k++) {
5626                if (procarr[i * nth_per_core + k] != -1) {
5627                  if (newarr[i * nth_per_core + k] == 0) {
5628                    newarr[i * nth_per_core + k] = 1;
5629                    cnt--;
5630                    nth--;
5631                    break;
5632                  } else {
5633                    if (flag != 0) {
5634                      newarr[i * nth_per_core + k]++;
5635                      cnt--;
5636                      nth--;
5637                      break;
5638                    }
5639                  }
5640                }
5641              }
5642              if (cnt == 0 || nth == 0) {
5643                break;
5644              }
5645            }
5646            if (nth == 0) {
5647              break;
5648            }
5649          }
5650          flag = 1;
5651        }
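      // Locate the context assigned to this thread: tid indexes into the
      // running prefix sum of newarr[].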
5652        int sum = 0;
5653        for (int i = 0; i < nproc; i++) {
5654          sum += newarr[i];
5655          if (sum > tid) {
5656            if (fine_gran) {
5657              int osID = procarr[i];
5658              KMP_CPU_SET(osID, mask);
5659            } else {
5660              int coreID = i / nth_per_core;
5661              for (int ii = 0; ii < nth_per_core; ii++) {
5662                int osID = procarr[coreID * nth_per_core + ii];
5663                if (osID != -1) {
5664                  KMP_CPU_SET(osID, mask);
5665                }
5666              }
5667            }
5668            break;
5669          }
5670        }
5671        __kmp_free(newarr);
5672      }
5673  
5674      if (__kmp_affinity.flags.verbose) {
5675        char buf[KMP_AFFIN_MASK_PRINT_LEN];
5676        __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, mask);
5677        KMP_INFORM(BoundToOSProcSet, env_var, (kmp_int32)getpid(), __kmp_gettid(),
5678                   tid, buf);
5679      }
5680      __kmp_affinity_get_thread_topology_info(th);
5681      __kmp_set_system_affinity(mask, TRUE);
5682    }
5683  }
5684  
5685  #if KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_NETBSD || KMP_OS_DRAGONFLY ||     \
5686      KMP_OS_AIX
5687  // This entry is not needed on Windows because the
5688  // GetProcessAffinityMask() API is available there.
5689  //
5690  // The intended usage (see the illustrative sketch after this function) is:
5691  // 1) The user saves the current affinity mask
5692  // 2) Then sets the affinity to the full initial mask by calling this function
5693  // 3) Error-checks the return value
5694  // 4) Runs the non-OpenMP parallel work
5695  // 5) Restores the affinity saved in step 1)
5696  #ifdef __cplusplus
5697  extern "C"
5698  #endif
5699      int
5700      kmp_set_thread_affinity_mask_initial()
5701  // The function returns 0 on success,
5702  //   -1 if the thread cannot be bound,
5703  //   >0 (errno) if an error happened during binding.
5704  {
5705    int gtid = __kmp_get_gtid();
5706    if (gtid < 0) {
5707      // Do not touch non-omp threads
5708      KA_TRACE(30, ("kmp_set_thread_affinity_mask_initial: "
5709                    "non-omp thread, returning\n"));
5710      return -1;
5711    }
5712    if (!KMP_AFFINITY_CAPABLE() || !__kmp_init_middle) {
5713      KA_TRACE(30, ("kmp_set_thread_affinity_mask_initial: "
5714                    "affinity not initialized, returning\n"));
5715      return -1;
5716    }
5717    KA_TRACE(30, ("kmp_set_thread_affinity_mask_initial: "
5718                  "set full mask for thread %d\n",
5719                  gtid));
5720    KMP_DEBUG_ASSERT(__kmp_affin_fullMask != NULL);
5721  #if KMP_OS_AIX
5722    return bindprocessor(BINDTHREAD, thread_self(), PROCESSOR_CLASS_ANY);
5723  #else
5724    return __kmp_set_system_affinity(__kmp_affin_fullMask, FALSE);
5725  #endif
5726  }
5727  #endif
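
// Illustrative sketch of the intended usage documented above for
// kmp_set_thread_affinity_mask_initial(). It is not compiled into the runtime
// (note the #if 0); the sched_getaffinity()/sched_setaffinity() calls and the
// run_non_openmp_work() helper are Linux-specific assumptions used only for
// the example.
#if 0
#include <sched.h>

// Hypothetical placeholder for the user's non-OpenMP parallel work.
void run_non_openmp_work(void);

void example_use_of_initial_mask() {
  // 1) Save the calling thread's current affinity mask.
  cpu_set_t saved;
  CPU_ZERO(&saved);
  if (sched_getaffinity(0, sizeof(saved), &saved) != 0)
    return;
  // 2) Widen this thread's affinity to the full initial mask and
  // 3) error-check the return value (0 on success).
  if (kmp_set_thread_affinity_mask_initial() != 0)
    return;
  // 4) Run the non-OpenMP parallel region.
  run_non_openmp_work();
  // 5) Restore the affinity saved in step 1).
  sched_setaffinity(0, sizeof(saved), &saved);
}
#endif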
5728  
5729  #endif // KMP_AFFINITY_SUPPORTED
5730