xref: /freebsd/contrib/llvm-project/openmp/runtime/src/kmp_dispatch_hier.h (revision 6966ac055c3b7a39266fb982493330df7a097997)
1 /*
2  * kmp_dispatch_hier.h -- hierarchical scheduling methods and data structures
3  */
4 
5 //===----------------------------------------------------------------------===//
6 //
7 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8 // See https://llvm.org/LICENSE.txt for license information.
9 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #ifndef KMP_DISPATCH_HIER_H
14 #define KMP_DISPATCH_HIER_H
15 #include "kmp.h"
16 #include "kmp_dispatch.h"
17 
// Layer type for scheduling hierarchy.
// The numeric values matter: LAYER_THREAD is -1 so that the remaining layers
// start at 0, and LAYER_LAST doubles as the count of non-thread layers.
enum kmp_hier_layer_e {
  LAYER_THREAD = -1,
  LAYER_L1 = 0,
  LAYER_L2 = 1,
  LAYER_L3 = 2,
  LAYER_NUMA = 3,
  LAYER_LOOP = 4,
  LAYER_LAST = 5
};
28 
29 // Convert hierarchy type (LAYER_L1, LAYER_L2, etc.) to C-style string
30 static inline const char *__kmp_get_hier_str(kmp_hier_layer_e type) {
31   switch (type) {
32   case kmp_hier_layer_e::LAYER_THREAD:
33     return "THREAD";
34   case kmp_hier_layer_e::LAYER_L1:
35     return "L1";
36   case kmp_hier_layer_e::LAYER_L2:
37     return "L2";
38   case kmp_hier_layer_e::LAYER_L3:
39     return "L3";
40   case kmp_hier_layer_e::LAYER_NUMA:
41     return "NUMA";
42   case kmp_hier_layer_e::LAYER_LOOP:
43     return "WHOLE_LOOP";
44   case kmp_hier_layer_e::LAYER_LAST:
45     return "LAST";
46   }
47   KMP_ASSERT(0);
48   // Appease compilers, should never get here
49   return "ERROR";
50 }
51 
52 // Structure to store values parsed from OMP_SCHEDULE for scheduling hierarchy
53 typedef struct kmp_hier_sched_env_t {
54   int size;
55   int capacity;
56   enum sched_type *scheds;
57   kmp_int32 *small_chunks;
58   kmp_int64 *large_chunks;
59   kmp_hier_layer_e *layers;
60   // Append a level of the hierarchy
61   void append(enum sched_type sched, kmp_int32 chunk, kmp_hier_layer_e layer) {
62     if (capacity == 0) {
63       scheds = (enum sched_type *)__kmp_allocate(sizeof(enum sched_type) *
64                                                  kmp_hier_layer_e::LAYER_LAST);
65       small_chunks = (kmp_int32 *)__kmp_allocate(sizeof(kmp_int32) *
66                                                  kmp_hier_layer_e::LAYER_LAST);
67       large_chunks = (kmp_int64 *)__kmp_allocate(sizeof(kmp_int64) *
68                                                  kmp_hier_layer_e::LAYER_LAST);
69       layers = (kmp_hier_layer_e *)__kmp_allocate(sizeof(kmp_hier_layer_e) *
70                                                   kmp_hier_layer_e::LAYER_LAST);
71       capacity = kmp_hier_layer_e::LAYER_LAST;
72     }
73     int current_size = size;
74     KMP_DEBUG_ASSERT(current_size < kmp_hier_layer_e::LAYER_LAST);
75     scheds[current_size] = sched;
76     layers[current_size] = layer;
77     small_chunks[current_size] = chunk;
78     large_chunks[current_size] = (kmp_int64)chunk;
79     size++;
80   }
81   // Sort the hierarchy using selection sort, size will always be small
82   // (less than LAYER_LAST) so it is not necessary to use an nlog(n) algorithm
83   void sort() {
84     if (size <= 1)
85       return;
86     for (int i = 0; i < size; ++i) {
87       int switch_index = i;
88       for (int j = i + 1; j < size; ++j) {
89         if (layers[j] < layers[switch_index])
90           switch_index = j;
91       }
92       if (switch_index != i) {
93         kmp_hier_layer_e temp1 = layers[i];
94         enum sched_type temp2 = scheds[i];
95         kmp_int32 temp3 = small_chunks[i];
96         kmp_int64 temp4 = large_chunks[i];
97         layers[i] = layers[switch_index];
98         scheds[i] = scheds[switch_index];
99         small_chunks[i] = small_chunks[switch_index];
100         large_chunks[i] = large_chunks[switch_index];
101         layers[switch_index] = temp1;
102         scheds[switch_index] = temp2;
103         small_chunks[switch_index] = temp3;
104         large_chunks[switch_index] = temp4;
105       }
106     }
107   }
108   // Free all memory
109   void deallocate() {
110     if (capacity > 0) {
111       __kmp_free(scheds);
112       __kmp_free(layers);
113       __kmp_free(small_chunks);
114       __kmp_free(large_chunks);
115       scheds = NULL;
116       layers = NULL;
117       small_chunks = NULL;
118       large_chunks = NULL;
119     }
120     size = 0;
121     capacity = 0;
122   }
123 } kmp_hier_sched_env_t;
124 
125 extern int __kmp_dispatch_hand_threading;
126 extern kmp_hier_sched_env_t __kmp_hier_scheds;
127 
128 // Sizes of layer arrays bounded by max number of detected L1s, L2s, etc.
129 extern int __kmp_hier_max_units[kmp_hier_layer_e::LAYER_LAST + 1];
130 extern int __kmp_hier_threads_per[kmp_hier_layer_e::LAYER_LAST + 1];
131 
132 extern int __kmp_dispatch_get_index(int tid, kmp_hier_layer_e type);
133 extern int __kmp_dispatch_get_id(int gtid, kmp_hier_layer_e type);
134 extern int __kmp_dispatch_get_t1_per_t2(kmp_hier_layer_e t1,
135                                         kmp_hier_layer_e t2);
136 extern void __kmp_dispatch_free_hierarchies(kmp_team_t *team);
137 
138 template <typename T> struct kmp_hier_shared_bdata_t {
139   typedef typename traits_t<T>::signed_t ST;
140   volatile kmp_uint64 val[2];
141   kmp_int32 status[2];
142   T lb[2];
143   T ub[2];
144   ST st[2];
145   dispatch_shared_info_template<T> sh[2];
146   void zero() {
147     val[0] = val[1] = 0;
148     status[0] = status[1] = 0;
149     lb[0] = lb[1] = 0;
150     ub[0] = ub[1] = 0;
151     st[0] = st[1] = 0;
152     sh[0].u.s.iteration = sh[1].u.s.iteration = 0;
153   }
154   void set_next_hand_thread(T nlb, T nub, ST nst, kmp_int32 nstatus,
155                             kmp_uint64 index) {
156     lb[1 - index] = nlb;
157     ub[1 - index] = nub;
158     st[1 - index] = nst;
159     status[1 - index] = nstatus;
160   }
161   void set_next(T nlb, T nub, ST nst, kmp_int32 nstatus, kmp_uint64 index) {
162     lb[1 - index] = nlb;
163     ub[1 - index] = nub;
164     st[1 - index] = nst;
165     status[1 - index] = nstatus;
166     sh[1 - index].u.s.iteration = 0;
167   }
168 
169   kmp_int32 get_next_status(kmp_uint64 index) const {
170     return status[1 - index];
171   }
172   T get_next_lb(kmp_uint64 index) const { return lb[1 - index]; }
173   T get_next_ub(kmp_uint64 index) const { return ub[1 - index]; }
174   ST get_next_st(kmp_uint64 index) const { return st[1 - index]; }
175   dispatch_shared_info_template<T> volatile *get_next_sh(kmp_uint64 index) {
176     return &(sh[1 - index]);
177   }
178 
179   kmp_int32 get_curr_status(kmp_uint64 index) const { return status[index]; }
180   T get_curr_lb(kmp_uint64 index) const { return lb[index]; }
181   T get_curr_ub(kmp_uint64 index) const { return ub[index]; }
182   ST get_curr_st(kmp_uint64 index) const { return st[index]; }
183   dispatch_shared_info_template<T> volatile *get_curr_sh(kmp_uint64 index) {
184     return &(sh[index]);
185   }
186 };
187 
188 /*
189  * In the barrier implementations, num_active is the number of threads that are
190  * attached to the kmp_hier_top_unit_t structure in the scheduling hierarchy.
191  * bdata is the shared barrier data that resides on the kmp_hier_top_unit_t
192  * structure. tdata is the thread private data that resides on the thread
193  * data structure.
194  *
195  * The reset_shared() method is used to initialize the barrier data on the
196  * kmp_hier_top_unit_t hierarchy structure
197  *
198  * The reset_private() method is used to initialize the barrier data on the
199  * thread's private dispatch buffer structure
200  *
201  * The barrier() method takes an id, which is that thread's id for the
202  * kmp_hier_top_unit_t structure, and implements the barrier.  All threads wait
203  * inside barrier() until all fellow threads who are attached to that
204  * kmp_hier_top_unit_t structure have arrived.
205  */
206 
207 // Core barrier implementation
208 // Can be used in a unit with between 2 to 8 threads
209 template <typename T> class core_barrier_impl {
210   static inline kmp_uint64 get_wait_val(int num_active) {
211     kmp_uint64 wait_val = 0LL;
212     switch (num_active) {
213     case 2:
214       wait_val = 0x0101LL;
215       break;
216     case 3:
217       wait_val = 0x010101LL;
218       break;
219     case 4:
220       wait_val = 0x01010101LL;
221       break;
222     case 5:
223       wait_val = 0x0101010101LL;
224       break;
225     case 6:
226       wait_val = 0x010101010101LL;
227       break;
228     case 7:
229       wait_val = 0x01010101010101LL;
230       break;
231     case 8:
232       wait_val = 0x0101010101010101LL;
233       break;
234     default:
235       // don't use the core_barrier_impl for more than 8 threads
236       KMP_ASSERT(0);
237     }
238     return wait_val;
239   }
240 
241 public:
242   static void reset_private(kmp_int32 num_active,
243                             kmp_hier_private_bdata_t *tdata);
244   static void reset_shared(kmp_int32 num_active,
245                            kmp_hier_shared_bdata_t<T> *bdata);
246   static void barrier(kmp_int32 id, kmp_hier_shared_bdata_t<T> *bdata,
247                       kmp_hier_private_bdata_t *tdata);
248 };
249 
250 template <typename T>
251 void core_barrier_impl<T>::reset_private(kmp_int32 num_active,
252                                          kmp_hier_private_bdata_t *tdata) {
253   tdata->num_active = num_active;
254   tdata->index = 0;
255   tdata->wait_val[0] = tdata->wait_val[1] = get_wait_val(num_active);
256 }
257 template <typename T>
258 void core_barrier_impl<T>::reset_shared(kmp_int32 num_active,
259                                         kmp_hier_shared_bdata_t<T> *bdata) {
260   bdata->val[0] = bdata->val[1] = 0LL;
261   bdata->status[0] = bdata->status[1] = 0LL;
262 }
263 template <typename T>
264 void core_barrier_impl<T>::barrier(kmp_int32 id,
265                                    kmp_hier_shared_bdata_t<T> *bdata,
266                                    kmp_hier_private_bdata_t *tdata) {
267   kmp_uint64 current_index = tdata->index;
268   kmp_uint64 next_index = 1 - current_index;
269   kmp_uint64 current_wait_value = tdata->wait_val[current_index];
270   kmp_uint64 next_wait_value =
271       (current_wait_value ? 0 : get_wait_val(tdata->num_active));
272   KD_TRACE(10, ("core_barrier_impl::barrier(): T#%d current_index:%llu "
273                 "next_index:%llu curr_wait:%llu next_wait:%llu\n",
274                 __kmp_get_gtid(), current_index, next_index, current_wait_value,
275                 next_wait_value));
276   char v = (current_wait_value ? 0x1 : 0x0);
277   (RCAST(volatile char *, &(bdata->val[current_index])))[id] = v;
278   __kmp_wait<kmp_uint64>(&(bdata->val[current_index]), current_wait_value,
279                          __kmp_eq<kmp_uint64> USE_ITT_BUILD_ARG(NULL));
280   tdata->wait_val[current_index] = next_wait_value;
281   tdata->index = next_index;
282 }
283 
284 // Counter barrier implementation
285 // Can be used in a unit with arbitrary number of active threads
286 template <typename T> class counter_barrier_impl {
287 public:
288   static void reset_private(kmp_int32 num_active,
289                             kmp_hier_private_bdata_t *tdata);
290   static void reset_shared(kmp_int32 num_active,
291                            kmp_hier_shared_bdata_t<T> *bdata);
292   static void barrier(kmp_int32 id, kmp_hier_shared_bdata_t<T> *bdata,
293                       kmp_hier_private_bdata_t *tdata);
294 };
295 
296 template <typename T>
297 void counter_barrier_impl<T>::reset_private(kmp_int32 num_active,
298                                             kmp_hier_private_bdata_t *tdata) {
299   tdata->num_active = num_active;
300   tdata->index = 0;
301   tdata->wait_val[0] = tdata->wait_val[1] = (kmp_uint64)num_active;
302 }
303 template <typename T>
304 void counter_barrier_impl<T>::reset_shared(kmp_int32 num_active,
305                                            kmp_hier_shared_bdata_t<T> *bdata) {
306   bdata->val[0] = bdata->val[1] = 0LL;
307   bdata->status[0] = bdata->status[1] = 0LL;
308 }
309 template <typename T>
310 void counter_barrier_impl<T>::barrier(kmp_int32 id,
311                                       kmp_hier_shared_bdata_t<T> *bdata,
312                                       kmp_hier_private_bdata_t *tdata) {
313   volatile kmp_int64 *val;
314   kmp_uint64 current_index = tdata->index;
315   kmp_uint64 next_index = 1 - current_index;
316   kmp_uint64 current_wait_value = tdata->wait_val[current_index];
317   kmp_uint64 next_wait_value = current_wait_value + tdata->num_active;
318 
319   KD_TRACE(10, ("counter_barrier_impl::barrier(): T#%d current_index:%llu "
320                 "next_index:%llu curr_wait:%llu next_wait:%llu\n",
321                 __kmp_get_gtid(), current_index, next_index, current_wait_value,
322                 next_wait_value));
323   val = RCAST(volatile kmp_int64 *, &(bdata->val[current_index]));
324   KMP_TEST_THEN_INC64(val);
325   __kmp_wait<kmp_uint64>(&(bdata->val[current_index]), current_wait_value,
326                          __kmp_ge<kmp_uint64> USE_ITT_BUILD_ARG(NULL));
327   tdata->wait_val[current_index] = next_wait_value;
328   tdata->index = next_index;
329 }
330 
331 // Data associated with topology unit within a layer
332 // For example, one kmp_hier_top_unit_t corresponds to one L1 cache
333 template <typename T> struct kmp_hier_top_unit_t {
334   typedef typename traits_t<T>::signed_t ST;
335   typedef typename traits_t<T>::unsigned_t UT;
336   kmp_int32 active; // number of topology units that communicate with this unit
337   // chunk information (lower/upper bound, stride, etc.)
338   dispatch_private_info_template<T> hier_pr;
339   kmp_hier_top_unit_t<T> *hier_parent; // pointer to parent unit
340   kmp_hier_shared_bdata_t<T> hier_barrier; // shared barrier data for this unit
341 
342   kmp_int32 get_hier_id() const { return hier_pr.hier_id; }
343   void reset_shared_barrier() {
344     KMP_DEBUG_ASSERT(active > 0);
345     if (active == 1)
346       return;
347     hier_barrier.zero();
348     if (active >= 2 && active <= 8) {
349       core_barrier_impl<T>::reset_shared(active, &hier_barrier);
350     } else {
351       counter_barrier_impl<T>::reset_shared(active, &hier_barrier);
352     }
353   }
354   void reset_private_barrier(kmp_hier_private_bdata_t *tdata) {
355     KMP_DEBUG_ASSERT(tdata);
356     KMP_DEBUG_ASSERT(active > 0);
357     if (active == 1)
358       return;
359     if (active >= 2 && active <= 8) {
360       core_barrier_impl<T>::reset_private(active, tdata);
361     } else {
362       counter_barrier_impl<T>::reset_private(active, tdata);
363     }
364   }
365   void barrier(kmp_int32 id, kmp_hier_private_bdata_t *tdata) {
366     KMP_DEBUG_ASSERT(tdata);
367     KMP_DEBUG_ASSERT(active > 0);
368     KMP_DEBUG_ASSERT(id >= 0 && id < active);
369     if (active == 1) {
370       tdata->index = 1 - tdata->index;
371       return;
372     }
373     if (active >= 2 && active <= 8) {
374       core_barrier_impl<T>::barrier(id, &hier_barrier, tdata);
375     } else {
376       counter_barrier_impl<T>::barrier(id, &hier_barrier, tdata);
377     }
378   }
379 
380   kmp_int32 get_next_status(kmp_uint64 index) const {
381     return hier_barrier.get_next_status(index);
382   }
383   T get_next_lb(kmp_uint64 index) const {
384     return hier_barrier.get_next_lb(index);
385   }
386   T get_next_ub(kmp_uint64 index) const {
387     return hier_barrier.get_next_ub(index);
388   }
389   ST get_next_st(kmp_uint64 index) const {
390     return hier_barrier.get_next_st(index);
391   }
392   dispatch_shared_info_template<T> volatile *get_next_sh(kmp_uint64 index) {
393     return hier_barrier.get_next_sh(index);
394   }
395 
396   kmp_int32 get_curr_status(kmp_uint64 index) const {
397     return hier_barrier.get_curr_status(index);
398   }
399   T get_curr_lb(kmp_uint64 index) const {
400     return hier_barrier.get_curr_lb(index);
401   }
402   T get_curr_ub(kmp_uint64 index) const {
403     return hier_barrier.get_curr_ub(index);
404   }
405   ST get_curr_st(kmp_uint64 index) const {
406     return hier_barrier.get_curr_st(index);
407   }
408   dispatch_shared_info_template<T> volatile *get_curr_sh(kmp_uint64 index) {
409     return hier_barrier.get_curr_sh(index);
410   }
411 
412   void set_next_hand_thread(T lb, T ub, ST st, kmp_int32 status,
413                             kmp_uint64 index) {
414     hier_barrier.set_next_hand_thread(lb, ub, st, status, index);
415   }
416   void set_next(T lb, T ub, ST st, kmp_int32 status, kmp_uint64 index) {
417     hier_barrier.set_next(lb, ub, st, status, index);
418   }
419   dispatch_private_info_template<T> *get_my_pr() { return &hier_pr; }
420   kmp_hier_top_unit_t<T> *get_parent() { return hier_parent; }
421   dispatch_private_info_template<T> *get_parent_pr() {
422     return &(hier_parent->hier_pr);
423   }
424 
425   kmp_int32 is_active() const { return active; }
426   kmp_int32 get_num_active() const { return active; }
427 #ifdef KMP_DEBUG
428   void print() {
429     KD_TRACE(
430         10,
431         ("    kmp_hier_top_unit_t: active:%d pr:%p lb:%d ub:%d st:%d tc:%d\n",
432          active, &hier_pr, hier_pr.u.p.lb, hier_pr.u.p.ub, hier_pr.u.p.st,
433          hier_pr.u.p.tc));
434   }
435 #endif
436 };
437 
438 // Information regarding a single layer within the scheduling hierarchy
439 template <typename T> struct kmp_hier_layer_info_t {
440   int num_active; // number of threads active in this level
441   kmp_hier_layer_e type; // LAYER_L1, LAYER_L2, etc.
442   enum sched_type sched; // static, dynamic, guided, etc.
443   typename traits_t<T>::signed_t chunk; // chunk size associated with schedule
444   int length; // length of the kmp_hier_top_unit_t array
445 
446 #ifdef KMP_DEBUG
447   // Print this layer's information
448   void print() {
449     const char *t = __kmp_get_hier_str(type);
450     KD_TRACE(
451         10,
452         ("    kmp_hier_layer_info_t: num_active:%d type:%s sched:%d chunk:%d "
453          "length:%d\n",
454          num_active, t, sched, chunk, length));
455   }
456 #endif
457 };
458 
459 /*
460  * Structure to implement entire hierarchy
461  *
462  * The hierarchy is kept as an array of arrays to represent the different
463  * layers.  Layer 0 is the lowest layer to layer num_layers - 1 which is the
464  * highest layer.
465  * Example:
466  * [ 2 ] -> [ L3 | L3 ]
467  * [ 1 ] -> [ L2 | L2 | L2 | L2 ]
468  * [ 0 ] -> [ L1 | L1 | L1 | L1 | L1 | L1 | L1 | L1 ]
469  * There is also an array of layer_info_t which has information regarding
470  * each layer
471  */
472 template <typename T> struct kmp_hier_t {
473 public:
474   typedef typename traits_t<T>::unsigned_t UT;
475   typedef typename traits_t<T>::signed_t ST;
476 
477 private:
478   int next_recurse(ident_t *loc, int gtid, kmp_hier_top_unit_t<T> *current,
479                    kmp_int32 *p_last, T *p_lb, T *p_ub, ST *p_st,
480                    kmp_int32 previous_id, int hier_level) {
481     int status;
482     kmp_info_t *th = __kmp_threads[gtid];
483     auto parent = current->get_parent();
484     bool last_layer = (hier_level == get_num_layers() - 1);
485     KMP_DEBUG_ASSERT(th);
486     kmp_hier_private_bdata_t *tdata = &(th->th.th_hier_bar_data[hier_level]);
487     KMP_DEBUG_ASSERT(current);
488     KMP_DEBUG_ASSERT(hier_level >= 0);
489     KMP_DEBUG_ASSERT(hier_level < get_num_layers());
490     KMP_DEBUG_ASSERT(tdata);
491     KMP_DEBUG_ASSERT(parent || last_layer);
492 
493     KD_TRACE(
494         1, ("kmp_hier_t.next_recurse(): T#%d (%d) called\n", gtid, hier_level));
495 
496     T hier_id = (T)current->get_hier_id();
497     // Attempt to grab next iteration range for this level
498     if (previous_id == 0) {
499       KD_TRACE(1, ("kmp_hier_t.next_recurse(): T#%d (%d) is master of unit\n",
500                    gtid, hier_level));
501       kmp_int32 contains_last;
502       T my_lb, my_ub;
503       ST my_st;
504       T nproc;
505       dispatch_shared_info_template<T> volatile *my_sh;
506       dispatch_private_info_template<T> *my_pr;
507       if (last_layer) {
508         // last layer below the very top uses the single shared buffer
509         // from the team struct.
510         KD_TRACE(10,
511                  ("kmp_hier_t.next_recurse(): T#%d (%d) using top level sh\n",
512                   gtid, hier_level));
513         my_sh = reinterpret_cast<dispatch_shared_info_template<T> volatile *>(
514             th->th.th_dispatch->th_dispatch_sh_current);
515         nproc = (T)get_top_level_nproc();
516       } else {
517         // middle layers use the shared buffer inside the kmp_hier_top_unit_t
518         // structure
519         KD_TRACE(10, ("kmp_hier_t.next_recurse(): T#%d (%d) using hier sh\n",
520                       gtid, hier_level));
521         my_sh =
522             parent->get_curr_sh(th->th.th_hier_bar_data[hier_level + 1].index);
523         nproc = (T)parent->get_num_active();
524       }
525       my_pr = current->get_my_pr();
526       KMP_DEBUG_ASSERT(my_sh);
527       KMP_DEBUG_ASSERT(my_pr);
528       enum sched_type schedule = get_sched(hier_level);
529       ST chunk = (ST)get_chunk(hier_level);
530       status = __kmp_dispatch_next_algorithm<T>(gtid, my_pr, my_sh,
531                                                 &contains_last, &my_lb, &my_ub,
532                                                 &my_st, nproc, hier_id);
533       KD_TRACE(
534           10,
535           ("kmp_hier_t.next_recurse(): T#%d (%d) next_pr_sh() returned %d\n",
536            gtid, hier_level, status));
537       // When no iterations are found (status == 0) and this is not the last
538       // layer, attempt to go up the hierarchy for more iterations
539       if (status == 0 && !last_layer) {
540         status = next_recurse(loc, gtid, parent, &contains_last, &my_lb, &my_ub,
541                               &my_st, hier_id, hier_level + 1);
542         KD_TRACE(
543             10,
544             ("kmp_hier_t.next_recurse(): T#%d (%d) hier_next() returned %d\n",
545              gtid, hier_level, status));
546         if (status == 1) {
547           kmp_hier_private_bdata_t *upper_tdata =
548               &(th->th.th_hier_bar_data[hier_level + 1]);
549           my_sh = parent->get_curr_sh(upper_tdata->index);
550           KD_TRACE(10, ("kmp_hier_t.next_recurse(): T#%d (%d) about to init\n",
551                         gtid, hier_level));
552           __kmp_dispatch_init_algorithm(loc, gtid, my_pr, schedule,
553                                         parent->get_curr_lb(upper_tdata->index),
554                                         parent->get_curr_ub(upper_tdata->index),
555                                         parent->get_curr_st(upper_tdata->index),
556 #if USE_ITT_BUILD
557                                         NULL,
558 #endif
559                                         chunk, nproc, hier_id);
560           status = __kmp_dispatch_next_algorithm<T>(
561               gtid, my_pr, my_sh, &contains_last, &my_lb, &my_ub, &my_st, nproc,
562               hier_id);
563           if (!status) {
564             KD_TRACE(10, ("kmp_hier_t.next_recurse(): T#%d (%d) status not 1 "
565                           "setting to 2!\n",
566                           gtid, hier_level));
567             status = 2;
568           }
569         }
570       }
571       current->set_next(my_lb, my_ub, my_st, status, tdata->index);
572       // Propagate whether a unit holds the actual global last iteration
573       // The contains_last attribute is sent downwards from the top to the
574       // bottom of the hierarchy via the contains_last flag inside the
575       // private dispatch buffers in the hierarchy's middle layers
576       if (contains_last) {
577         // If the next_algorithm() method returns 1 for p_last and it is the
578         // last layer or our parent contains the last serial chunk, then the
579         // chunk must contain the last serial iteration.
580         if (last_layer || parent->hier_pr.flags.contains_last) {
581           KD_TRACE(10, ("kmp_hier_t.next_recurse(): T#%d (%d) Setting this pr "
582                         "to contain last.\n",
583                         gtid, hier_level));
584           current->hier_pr.flags.contains_last = contains_last;
585         }
586         if (!current->hier_pr.flags.contains_last)
587           contains_last = FALSE;
588       }
589       if (p_last)
590         *p_last = contains_last;
591     } // if master thread of this unit
592     if (hier_level > 0 || !__kmp_dispatch_hand_threading) {
593       KD_TRACE(10,
594                ("kmp_hier_t.next_recurse(): T#%d (%d) going into barrier.\n",
595                 gtid, hier_level));
596       current->barrier(previous_id, tdata);
597       KD_TRACE(10,
598                ("kmp_hier_t.next_recurse(): T#%d (%d) released and exit %d\n",
599                 gtid, hier_level, current->get_curr_status(tdata->index)));
600     } else {
601       KMP_DEBUG_ASSERT(previous_id == 0);
602       return status;
603     }
604     return current->get_curr_status(tdata->index);
605   }
606 
607 public:
608   int top_level_nproc;
609   int num_layers;
610   bool valid;
611   int type_size;
612   kmp_hier_layer_info_t<T> *info;
613   kmp_hier_top_unit_t<T> **layers;
614   // Deallocate all memory from this hierarchy
615   void deallocate() {
616     for (int i = 0; i < num_layers; ++i)
617       if (layers[i] != NULL) {
618         __kmp_free(layers[i]);
619       }
620     if (layers != NULL) {
621       __kmp_free(layers);
622       layers = NULL;
623     }
624     if (info != NULL) {
625       __kmp_free(info);
626       info = NULL;
627     }
628     num_layers = 0;
629     valid = false;
630   }
631   // Returns true if reallocation is needed else false
632   bool need_to_reallocate(int n, const kmp_hier_layer_e *new_layers,
633                           const enum sched_type *new_scheds,
634                           const ST *new_chunks) const {
635     if (!valid || layers == NULL || info == NULL ||
636         traits_t<T>::type_size != type_size || n != num_layers)
637       return true;
638     for (int i = 0; i < n; ++i) {
639       if (info[i].type != new_layers[i])
640         return true;
641       if (info[i].sched != new_scheds[i])
642         return true;
643       if (info[i].chunk != new_chunks[i])
644         return true;
645     }
646     return false;
647   }
648   // A single thread should call this function while the other threads wait
649   // create a new scheduling hierarchy consisting of new_layers, new_scheds
650   // and new_chunks.  These should come pre-sorted according to
651   // kmp_hier_layer_e value.  This function will try to avoid reallocation
652   // if it can
653   void allocate_hier(int n, const kmp_hier_layer_e *new_layers,
654                      const enum sched_type *new_scheds, const ST *new_chunks) {
655     top_level_nproc = 0;
656     if (!need_to_reallocate(n, new_layers, new_scheds, new_chunks)) {
657       KD_TRACE(
658           10,
659           ("kmp_hier_t<T>::allocate_hier: T#0 do not need to reallocate\n"));
660       for (int i = 0; i < n; ++i) {
661         info[i].num_active = 0;
662         for (int j = 0; j < get_length(i); ++j)
663           layers[i][j].active = 0;
664       }
665       return;
666     }
667     KD_TRACE(10, ("kmp_hier_t<T>::allocate_hier: T#0 full alloc\n"));
668     deallocate();
669     type_size = traits_t<T>::type_size;
670     num_layers = n;
671     info = (kmp_hier_layer_info_t<T> *)__kmp_allocate(
672         sizeof(kmp_hier_layer_info_t<T>) * n);
673     layers = (kmp_hier_top_unit_t<T> **)__kmp_allocate(
674         sizeof(kmp_hier_top_unit_t<T> *) * n);
675     for (int i = 0; i < n; ++i) {
676       int max = 0;
677       kmp_hier_layer_e layer = new_layers[i];
678       info[i].num_active = 0;
679       info[i].type = layer;
680       info[i].sched = new_scheds[i];
681       info[i].chunk = new_chunks[i];
682       max = __kmp_hier_max_units[layer + 1];
683       if (max == 0) {
684         valid = false;
685         KMP_WARNING(HierSchedInvalid, __kmp_get_hier_str(layer));
686         deallocate();
687         return;
688       }
689       info[i].length = max;
690       layers[i] = (kmp_hier_top_unit_t<T> *)__kmp_allocate(
691           sizeof(kmp_hier_top_unit_t<T>) * max);
692       for (int j = 0; j < max; ++j) {
693         layers[i][j].active = 0;
694         layers[i][j].hier_pr.flags.use_hier = TRUE;
695       }
696     }
697     valid = true;
698   }
699   // loc - source file location
700   // gtid - global thread identifier
701   // pr - this thread's private dispatch buffer (corresponding with gtid)
702   // p_last (return value) - pointer to flag indicating this set of iterations
703   // contains last
704   //          iteration
705   // p_lb (return value) - lower bound for this chunk of iterations
706   // p_ub (return value) - upper bound for this chunk of iterations
707   // p_st (return value) - stride for this chunk of iterations
708   //
709   // Returns 1 if there are more iterations to perform, 0 otherwise
710   int next(ident_t *loc, int gtid, dispatch_private_info_template<T> *pr,
711            kmp_int32 *p_last, T *p_lb, T *p_ub, ST *p_st) {
712     int status;
713     kmp_int32 contains_last = 0;
714     kmp_info_t *th = __kmp_threads[gtid];
715     kmp_hier_private_bdata_t *tdata = &(th->th.th_hier_bar_data[0]);
716     auto parent = pr->get_parent();
717     KMP_DEBUG_ASSERT(parent);
718     KMP_DEBUG_ASSERT(th);
719     KMP_DEBUG_ASSERT(tdata);
720     KMP_DEBUG_ASSERT(parent);
721     T nproc = (T)parent->get_num_active();
722     T unit_id = (T)pr->get_hier_id();
723     KD_TRACE(
724         10,
725         ("kmp_hier_t.next(): T#%d THREAD LEVEL nproc:%d unit_id:%d called\n",
726          gtid, nproc, unit_id));
727     // Handthreading implementation
728     // Each iteration is performed by all threads on last unit (typically
729     // cores/tiles)
730     // e.g., threads 0,1,2,3 all execute iteration 0
731     //       threads 0,1,2,3 all execute iteration 1
732     //       threads 4,5,6,7 all execute iteration 2
733     //       threads 4,5,6,7 all execute iteration 3
734     //       ... etc.
735     if (__kmp_dispatch_hand_threading) {
736       KD_TRACE(10,
737                ("kmp_hier_t.next(): T#%d THREAD LEVEL using hand threading\n",
738                 gtid));
739       if (unit_id == 0) {
740         // For hand threading, the sh buffer on the lowest level is only ever
741         // modified and read by the master thread on that level.  Because of
742         // this, we can always use the first sh buffer.
743         auto sh = &(parent->hier_barrier.sh[0]);
744         KMP_DEBUG_ASSERT(sh);
745         status = __kmp_dispatch_next_algorithm<T>(
746             gtid, pr, sh, &contains_last, p_lb, p_ub, p_st, nproc, unit_id);
747         if (!status) {
748           bool done = false;
749           while (!done) {
750             done = true;
751             status = next_recurse(loc, gtid, parent, &contains_last, p_lb, p_ub,
752                                   p_st, unit_id, 0);
753             if (status == 1) {
754               __kmp_dispatch_init_algorithm(loc, gtid, pr, pr->schedule,
755                                             parent->get_next_lb(tdata->index),
756                                             parent->get_next_ub(tdata->index),
757                                             parent->get_next_st(tdata->index),
758 #if USE_ITT_BUILD
759                                             NULL,
760 #endif
761                                             pr->u.p.parm1, nproc, unit_id);
762               sh->u.s.iteration = 0;
763               status = __kmp_dispatch_next_algorithm<T>(
764                   gtid, pr, sh, &contains_last, p_lb, p_ub, p_st, nproc,
765                   unit_id);
766               if (!status) {
767                 KD_TRACE(10,
768                          ("kmp_hier_t.next(): T#%d THREAD LEVEL status == 0 "
769                           "after next_pr_sh()"
770                           "trying again.\n",
771                           gtid));
772                 done = false;
773               }
774             } else if (status == 2) {
775               KD_TRACE(10, ("kmp_hier_t.next(): T#%d THREAD LEVEL status == 2 "
776                             "trying again.\n",
777                             gtid));
778               done = false;
779             }
780           }
781         }
782         parent->set_next_hand_thread(*p_lb, *p_ub, *p_st, status, tdata->index);
783       } // if master thread of lowest unit level
784       parent->barrier(pr->get_hier_id(), tdata);
785       if (unit_id != 0) {
786         *p_lb = parent->get_curr_lb(tdata->index);
787         *p_ub = parent->get_curr_ub(tdata->index);
788         *p_st = parent->get_curr_st(tdata->index);
789         status = parent->get_curr_status(tdata->index);
790       }
791     } else {
792       // Normal implementation
793       // Each thread grabs an iteration chunk and executes it (no cooperation)
794       auto sh = parent->get_curr_sh(tdata->index);
795       KMP_DEBUG_ASSERT(sh);
796       status = __kmp_dispatch_next_algorithm<T>(
797           gtid, pr, sh, &contains_last, p_lb, p_ub, p_st, nproc, unit_id);
798       KD_TRACE(10,
799                ("kmp_hier_t.next(): T#%d THREAD LEVEL next_algorithm status:%d "
800                 "contains_last:%d p_lb:%d p_ub:%d p_st:%d\n",
801                 gtid, status, contains_last, *p_lb, *p_ub, *p_st));
802       if (!status) {
803         bool done = false;
804         while (!done) {
805           done = true;
806           status = next_recurse(loc, gtid, parent, &contains_last, p_lb, p_ub,
807                                 p_st, unit_id, 0);
808           if (status == 1) {
809             sh = parent->get_curr_sh(tdata->index);
810             __kmp_dispatch_init_algorithm(loc, gtid, pr, pr->schedule,
811                                           parent->get_curr_lb(tdata->index),
812                                           parent->get_curr_ub(tdata->index),
813                                           parent->get_curr_st(tdata->index),
814 #if USE_ITT_BUILD
815                                           NULL,
816 #endif
817                                           pr->u.p.parm1, nproc, unit_id);
818             status = __kmp_dispatch_next_algorithm<T>(
819                 gtid, pr, sh, &contains_last, p_lb, p_ub, p_st, nproc, unit_id);
820             if (!status) {
821               KD_TRACE(10, ("kmp_hier_t.next(): T#%d THREAD LEVEL status == 0 "
822                             "after next_pr_sh()"
823                             "trying again.\n",
824                             gtid));
825               done = false;
826             }
827           } else if (status == 2) {
828             KD_TRACE(10, ("kmp_hier_t.next(): T#%d THREAD LEVEL status == 2 "
829                           "trying again.\n",
830                           gtid));
831             done = false;
832           }
833         }
834       }
835     }
836     if (contains_last && !parent->hier_pr.flags.contains_last) {
837       KD_TRACE(10, ("kmp_hier_t.next(): T#%d THREAD LEVEL resetting "
838                     "contains_last to FALSE\n",
839                     gtid));
840       contains_last = FALSE;
841     }
842     if (p_last)
843       *p_last = contains_last;
844     KD_TRACE(10, ("kmp_hier_t.next(): T#%d THREAD LEVEL exit status %d\n", gtid,
845                   status));
846     return status;
847   }
848   // These functions probe the layer info structure
849   // Returns the type of topology unit given level
850   kmp_hier_layer_e get_type(int level) const {
851     KMP_DEBUG_ASSERT(level >= 0);
852     KMP_DEBUG_ASSERT(level < num_layers);
853     return info[level].type;
854   }
855   // Returns the schedule type at given level
856   enum sched_type get_sched(int level) const {
857     KMP_DEBUG_ASSERT(level >= 0);
858     KMP_DEBUG_ASSERT(level < num_layers);
859     return info[level].sched;
860   }
861   // Returns the chunk size at given level
862   ST get_chunk(int level) const {
863     KMP_DEBUG_ASSERT(level >= 0);
864     KMP_DEBUG_ASSERT(level < num_layers);
865     return info[level].chunk;
866   }
867   // Returns the number of active threads at given level
868   int get_num_active(int level) const {
869     KMP_DEBUG_ASSERT(level >= 0);
870     KMP_DEBUG_ASSERT(level < num_layers);
871     return info[level].num_active;
872   }
873   // Returns the length of topology unit array at given level
874   int get_length(int level) const {
875     KMP_DEBUG_ASSERT(level >= 0);
876     KMP_DEBUG_ASSERT(level < num_layers);
877     return info[level].length;
878   }
879   // Returns the topology unit given the level and index
880   kmp_hier_top_unit_t<T> *get_unit(int level, int index) {
881     KMP_DEBUG_ASSERT(level >= 0);
882     KMP_DEBUG_ASSERT(level < num_layers);
883     KMP_DEBUG_ASSERT(index >= 0);
884     KMP_DEBUG_ASSERT(index < get_length(level));
885     return &(layers[level][index]);
886   }
887   // Returns the number of layers in the hierarchy
888   int get_num_layers() const { return num_layers; }
889   // Returns the number of threads in the top layer
890   // This is necessary because we don't store a topology unit as
891   // the very top level and the scheduling algorithms need this information
892   int get_top_level_nproc() const { return top_level_nproc; }
893   // Return whether this hierarchy is valid or not
894   bool is_valid() const { return valid; }
895 #ifdef KMP_DEBUG
896   // Print the hierarchy
897   void print() {
898     KD_TRACE(10, ("kmp_hier_t:\n"));
899     for (int i = num_layers - 1; i >= 0; --i) {
900       KD_TRACE(10, ("Info[%d] = ", i));
901       info[i].print();
902     }
903     for (int i = num_layers - 1; i >= 0; --i) {
904       KD_TRACE(10, ("Layer[%d] =\n", i));
905       for (int j = 0; j < info[i].length; ++j) {
906         layers[i][j].print();
907       }
908     }
909   }
910 #endif
911 };
912 
913 template <typename T>
914 void __kmp_dispatch_init_hierarchy(ident_t *loc, int n,
915                                    kmp_hier_layer_e *new_layers,
916                                    enum sched_type *new_scheds,
917                                    typename traits_t<T>::signed_t *new_chunks,
918                                    T lb, T ub,
919                                    typename traits_t<T>::signed_t st) {
920   int tid, gtid, num_hw_threads, num_threads_per_layer1, active;
921   int my_buffer_index;
922   kmp_info_t *th;
923   kmp_team_t *team;
924   dispatch_private_info_template<T> *pr;
925   dispatch_shared_info_template<T> volatile *sh;
926   gtid = __kmp_entry_gtid();
927   tid = __kmp_tid_from_gtid(gtid);
928 #ifdef KMP_DEBUG
929   KD_TRACE(10, ("__kmp_dispatch_init_hierarchy: T#%d called: %d layer(s)\n",
930                 gtid, n));
931   for (int i = 0; i < n; ++i) {
932     const char *layer = __kmp_get_hier_str(new_layers[i]);
933     KD_TRACE(10, ("__kmp_dispatch_init_hierarchy: T#%d: new_layers[%d] = %s, "
934                   "new_scheds[%d] = %d, new_chunks[%d] = %u\n",
935                   gtid, i, layer, i, (int)new_scheds[i], i, new_chunks[i]));
936   }
937 #endif // KMP_DEBUG
938   KMP_DEBUG_ASSERT(n > 0);
939   KMP_DEBUG_ASSERT(new_layers);
940   KMP_DEBUG_ASSERT(new_scheds);
941   KMP_DEBUG_ASSERT(new_chunks);
942   if (!TCR_4(__kmp_init_parallel))
943     __kmp_parallel_initialize();
944   __kmp_resume_if_soft_paused();
945 
946   th = __kmp_threads[gtid];
947   team = th->th.th_team;
948   active = !team->t.t_serialized;
949   th->th.th_ident = loc;
950   num_hw_threads = __kmp_hier_max_units[kmp_hier_layer_e::LAYER_THREAD + 1];
951   KMP_DEBUG_ASSERT(th->th.th_dispatch ==
952                    &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
953   my_buffer_index = th->th.th_dispatch->th_disp_index;
954   pr = reinterpret_cast<dispatch_private_info_template<T> *>(
955       &th->th.th_dispatch
956            ->th_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
957   sh = reinterpret_cast<dispatch_shared_info_template<T> volatile *>(
958       &team->t.t_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
959   if (!active) {
960     KD_TRACE(10, ("__kmp_dispatch_init_hierarchy: T#%d not active parallel. "
961                   "Using normal dispatch functions.\n",
962                   gtid));
963     KMP_DEBUG_ASSERT(pr);
964     pr->flags.use_hier = FALSE;
965     pr->flags.contains_last = FALSE;
966     return;
967   }
968   KMP_DEBUG_ASSERT(pr);
969   KMP_DEBUG_ASSERT(sh);
970   pr->flags.use_hier = TRUE;
971   pr->u.p.tc = 0;
972   // Have master allocate the hierarchy
973   if (__kmp_tid_from_gtid(gtid) == 0) {
974     KD_TRACE(10, ("__kmp_dispatch_init_hierarchy: T#%d pr:%p sh:%p allocating "
975                   "hierarchy\n",
976                   gtid, pr, sh));
977     if (sh->hier == NULL) {
978       sh->hier = (kmp_hier_t<T> *)__kmp_allocate(sizeof(kmp_hier_t<T>));
979     }
980     sh->hier->allocate_hier(n, new_layers, new_scheds, new_chunks);
981     sh->u.s.iteration = 0;
982   }
983   __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);
984   // Check to make sure the hierarchy is valid
985   kmp_hier_t<T> *hier = sh->hier;
986   if (!sh->hier->is_valid()) {
987     pr->flags.use_hier = FALSE;
988     return;
989   }
990   // Have threads allocate their thread-private barrier data if it hasn't
991   // already been allocated
992   if (th->th.th_hier_bar_data == NULL) {
993     th->th.th_hier_bar_data = (kmp_hier_private_bdata_t *)__kmp_allocate(
994         sizeof(kmp_hier_private_bdata_t) * kmp_hier_layer_e::LAYER_LAST);
995   }
996   // Have threads "register" themselves by modifiying the active count for each
997   // level they are involved in. The active count will act as nthreads for that
998   // level regarding the scheduling algorithms
999   for (int i = 0; i < n; ++i) {
1000     int index = __kmp_dispatch_get_index(tid, hier->get_type(i));
1001     kmp_hier_top_unit_t<T> *my_unit = hier->get_unit(i, index);
1002     // Setup the thread's private dispatch buffer's hierarchy pointers
1003     if (i == 0)
1004       pr->hier_parent = my_unit;
1005     // If this unit is already active, then increment active count and wait
1006     if (my_unit->is_active()) {
1007       KD_TRACE(10, ("__kmp_dispatch_init_hierarchy: T#%d my_unit (%p) "
1008                     "is already active (%d)\n",
1009                     gtid, my_unit, my_unit->active));
1010       KMP_TEST_THEN_INC32(&(my_unit->active));
1011       break;
1012     }
1013     // Flag that this unit is active
1014     if (KMP_COMPARE_AND_STORE_ACQ32(&(my_unit->active), 0, 1)) {
1015       // Do not setup parent pointer for top level unit since it has no parent
1016       if (i < n - 1) {
1017         // Setup middle layer pointers to parents
1018         my_unit->get_my_pr()->hier_id =
1019             index % __kmp_dispatch_get_t1_per_t2(hier->get_type(i),
1020                                                  hier->get_type(i + 1));
1021         int parent_index = __kmp_dispatch_get_index(tid, hier->get_type(i + 1));
1022         my_unit->hier_parent = hier->get_unit(i + 1, parent_index);
1023       } else {
1024         // Setup top layer information (no parent pointers are set)
1025         my_unit->get_my_pr()->hier_id =
1026             index % __kmp_dispatch_get_t1_per_t2(hier->get_type(i),
1027                                                  kmp_hier_layer_e::LAYER_LOOP);
1028         KMP_TEST_THEN_INC32(&(hier->top_level_nproc));
1029         my_unit->hier_parent = nullptr;
1030       }
1031       // Set trip count to 0 so that next() operation will initially climb up
1032       // the hierarchy to get more iterations (early exit in next() for tc == 0)
1033       my_unit->get_my_pr()->u.p.tc = 0;
1034       // Increment this layer's number of active units
1035       KMP_TEST_THEN_INC32(&(hier->info[i].num_active));
1036       KD_TRACE(10, ("__kmp_dispatch_init_hierarchy: T#%d my_unit (%p) "
1037                     "incrementing num_active\n",
1038                     gtid, my_unit));
1039     } else {
1040       KMP_TEST_THEN_INC32(&(my_unit->active));
1041       break;
1042     }
1043   }
1044   // Set this thread's id
1045   num_threads_per_layer1 = __kmp_dispatch_get_t1_per_t2(
1046       kmp_hier_layer_e::LAYER_THREAD, hier->get_type(0));
1047   pr->hier_id = tid % num_threads_per_layer1;
1048   // For oversubscribed threads, increment their index within the lowest unit
1049   // This is done to prevent having two or more threads with id 0, id 1, etc.
1050   if (tid >= num_hw_threads)
1051     pr->hier_id += ((tid / num_hw_threads) * num_threads_per_layer1);
1052   KD_TRACE(
1053       10, ("__kmp_dispatch_init_hierarchy: T#%d setting lowest hier_id to %d\n",
1054            gtid, pr->hier_id));
1055 
1056   pr->flags.contains_last = FALSE;
1057   __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);
1058 
1059   // Now that the number of active threads at each level is determined,
1060   // the barrier data for each unit can be initialized and the last layer's
1061   // loop information can be initialized.
1062   int prev_id = pr->get_hier_id();
1063   for (int i = 0; i < n; ++i) {
1064     if (prev_id != 0)
1065       break;
1066     int index = __kmp_dispatch_get_index(tid, hier->get_type(i));
1067     kmp_hier_top_unit_t<T> *my_unit = hier->get_unit(i, index);
1068     // Only master threads of this unit within the hierarchy do initialization
1069     KD_TRACE(10, ("__kmp_dispatch_init_hierarchy: T#%d (%d) prev_id is 0\n",
1070                   gtid, i));
1071     my_unit->reset_shared_barrier();
1072     my_unit->hier_pr.flags.contains_last = FALSE;
1073     // Last layer, initialize the private buffers with entire loop information
1074     // Now the next next_algorithim() call will get the first chunk of
1075     // iterations properly
1076     if (i == n - 1) {
1077       __kmp_dispatch_init_algorithm<T>(
1078           loc, gtid, my_unit->get_my_pr(), hier->get_sched(i), lb, ub, st,
1079 #if USE_ITT_BUILD
1080           NULL,
1081 #endif
1082           hier->get_chunk(i), hier->get_num_active(i), my_unit->get_hier_id());
1083     }
1084     prev_id = my_unit->get_hier_id();
1085   }
1086   // Initialize each layer of the thread's private barrier data
1087   kmp_hier_top_unit_t<T> *unit = pr->hier_parent;
1088   for (int i = 0; i < n && unit; ++i, unit = unit->get_parent()) {
1089     kmp_hier_private_bdata_t *tdata = &(th->th.th_hier_bar_data[i]);
1090     unit->reset_private_barrier(tdata);
1091   }
1092   __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);
1093 
1094 #ifdef KMP_DEBUG
1095   if (__kmp_tid_from_gtid(gtid) == 0) {
1096     for (int i = 0; i < n; ++i) {
1097       KD_TRACE(10,
1098                ("__kmp_dispatch_init_hierarchy: T#%d active count[%d] = %d\n",
1099                 gtid, i, hier->get_num_active(i)));
1100     }
1101     hier->print();
1102   }
1103   __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);
1104 #endif // KMP_DEBUG
1105 }
1106 #endif
1107