xref: /freebsd/contrib/llvm-project/openmp/runtime/src/kmp_barrier.cpp (revision b59017c5cad90d0f09a59e68c00457b7faf93e7c)
1 /*
2  * kmp_barrier.cpp
3  */
4 
5 //===----------------------------------------------------------------------===//
6 //
7 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8 // See https://llvm.org/LICENSE.txt for license information.
9 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "kmp_wait_release.h"
14 #include "kmp_barrier.h"
15 #include "kmp_itt.h"
16 #include "kmp_os.h"
17 #include "kmp_stats.h"
18 #include "ompt-specific.h"
19 // for distributed barrier
20 #include "kmp_affinity.h"
21 
22 #if KMP_MIC
23 #include <immintrin.h>
24 #define USE_NGO_STORES 1
25 #endif // KMP_MIC
26 
27 #if KMP_MIC && USE_NGO_STORES
28 // ICV copying
29 #define ngo_load(src) __m512d Vt = _mm512_load_pd((void *)(src))
30 #define ngo_store_icvs(dst, src) _mm512_storenrngo_pd((void *)(dst), Vt)
31 #define ngo_store_go(dst, src) _mm512_storenrngo_pd((void *)(dst), Vt)
32 #define ngo_sync() __asm__ volatile("lock; addl $0,0(%%rsp)" ::: "memory")
33 #else
34 #define ngo_load(src) ((void)0)
35 #define ngo_store_icvs(dst, src) copy_icvs((dst), (src))
36 #define ngo_store_go(dst, src) KMP_MEMCPY((dst), (src), CACHE_LINE)
37 #define ngo_sync() ((void)0)
38 #endif /* KMP_MIC && USE_NGO_STORES */
39 
40 void __kmp_print_structure(void); // Forward declaration
41 
42 // ---------------------------- Barrier Algorithms ----------------------------
43 // Distributed barrier
44 
45 // Compute how many threads to have polling each cache-line.
46 // We want to limit the number of writes to IDEAL_GO_RESOLUTION.
// Derive num_gos, num_groups, gos_per_group, and threads_per_group for a
// team of n threads, using machine topology when available so that polling
// groups roughly align with sockets.
void distributedBarrier::computeVarsForN(size_t n) {
  int nsockets = 1;
  if (__kmp_topology) {
    // Topology-aware sizing: base threads_per_go on cores per socket.
    int socket_level = __kmp_topology->get_level(KMP_HW_SOCKET);
    int core_level = __kmp_topology->get_level(KMP_HW_CORE);
    int ncores_per_socket =
        __kmp_topology->calculate_ratio(core_level, socket_level);
    nsockets = __kmp_topology->get_count(socket_level);

    // Guard against a topology that reports zero/negative counts.
    if (nsockets <= 0)
      nsockets = 1;
    if (ncores_per_socket <= 0)
      ncores_per_socket = 1;

    // Start with half the cores of a socket polling each go flag.
    threads_per_go = ncores_per_socket >> 1;
    if (!fix_threads_per_go) {
      // Minimize num_gos
      if (threads_per_go > 4) {
        if (KMP_OPTIMIZE_FOR_REDUCTIONS) {
          threads_per_go = threads_per_go >> 1;
        }
        if (threads_per_go > 4 && nsockets == 1)
          threads_per_go = threads_per_go >> 1;
      }
    }
    if (threads_per_go == 0)
      threads_per_go = 1;
    // Pin threads_per_go so subsequent re-inits keep a consistent layout.
    fix_threads_per_go = true;
    // num_gos = ceil(n / threads_per_go)
    num_gos = n / threads_per_go;
    if (n % threads_per_go)
      num_gos++;
    // Roughly one group per socket (rounded up), unless one group suffices.
    if (nsockets == 1 || num_gos == 1)
      num_groups = 1;
    else {
      num_groups = num_gos / nsockets;
      if (num_gos % nsockets)
        num_groups++;
    }
    if (num_groups <= 0)
      num_groups = 1;
    // gos_per_group = ceil(num_gos / num_groups)
    gos_per_group = num_gos / num_groups;
    if (num_gos % num_groups)
      gos_per_group++;
    threads_per_group = threads_per_go * gos_per_group;
  } else {
    // No topology info: keep the current threads_per_go and split the go
    // flags into (at most) two groups.
    num_gos = n / threads_per_go;
    if (n % threads_per_go)
      num_gos++;
    if (num_gos == 1)
      num_groups = 1;
    else {
      num_groups = num_gos / 2;
      if (num_gos % 2)
        num_groups++;
    }
    gos_per_group = num_gos / num_groups;
    if (num_gos % num_groups)
      gos_per_group++;
    threads_per_group = threads_per_go * gos_per_group;
  }
}
108 
109 void distributedBarrier::computeGo(size_t n) {
110   // Minimize num_gos
111   for (num_gos = 1;; num_gos++)
112     if (IDEAL_CONTENTION * num_gos >= n)
113       break;
114   threads_per_go = n / num_gos;
115   if (n % num_gos)
116     threads_per_go++;
117   while (num_gos > MAX_GOS) {
118     threads_per_go++;
119     num_gos = n / threads_per_go;
120     if (n % threads_per_go)
121       num_gos++;
122   }
123   computeVarsForN(n);
124 }
125 
126 // This function is to resize the barrier arrays when the new number of threads
127 // exceeds max_threads, which is the current size of all the arrays
128 void distributedBarrier::resize(size_t nthr) {
129   KMP_DEBUG_ASSERT(nthr > max_threads);
130 
131   // expand to requested size * 2
132   max_threads = nthr * 2;
133 
134   // allocate arrays to new max threads
135   for (int i = 0; i < MAX_ITERS; ++i) {
136     if (flags[i])
137       flags[i] = (flags_s *)KMP_INTERNAL_REALLOC(flags[i],
138                                                  max_threads * sizeof(flags_s));
139     else
140       flags[i] = (flags_s *)KMP_INTERNAL_MALLOC(max_threads * sizeof(flags_s));
141   }
142 
143   if (go)
144     go = (go_s *)KMP_INTERNAL_REALLOC(go, max_threads * sizeof(go_s));
145   else
146     go = (go_s *)KMP_INTERNAL_MALLOC(max_threads * sizeof(go_s));
147 
148   if (iter)
149     iter = (iter_s *)KMP_INTERNAL_REALLOC(iter, max_threads * sizeof(iter_s));
150   else
151     iter = (iter_s *)KMP_INTERNAL_MALLOC(max_threads * sizeof(iter_s));
152 
153   if (sleep)
154     sleep =
155         (sleep_s *)KMP_INTERNAL_REALLOC(sleep, max_threads * sizeof(sleep_s));
156   else
157     sleep = (sleep_s *)KMP_INTERNAL_MALLOC(max_threads * sizeof(sleep_s));
158 }
159 
160 // This function is to set all the go flags that threads might be waiting
161 // on, and when blocktime is not infinite, it should be followed by a wake-up
162 // call to each thread
163 kmp_uint64 distributedBarrier::go_release() {
164   kmp_uint64 next_go = iter[0].iter + distributedBarrier::MAX_ITERS;
165   for (size_t j = 0; j < num_gos; j++) {
166     go[j].go.store(next_go);
167   }
168   return next_go;
169 }
170 
171 void distributedBarrier::go_reset() {
172   for (size_t j = 0; j < max_threads; ++j) {
173     for (size_t i = 0; i < distributedBarrier::MAX_ITERS; ++i) {
174       flags[i][j].stillNeed = 1;
175     }
176     go[j].go.store(0);
177     iter[j].iter = 0;
178   }
179 }
180 
181 // This function inits/re-inits the distributed barrier for a particular number
182 // of threads. If a resize of arrays is needed, it calls the resize function.
// Init/re-init the distributed barrier for nthr threads, resizing the
// arrays first if nthr exceeds the current capacity.
void distributedBarrier::init(size_t nthr) {
  size_t old_max = max_threads;
  if (nthr > max_threads) { // need more space in arrays
    resize(nthr);
  }

  // Reset per-slot state across the (possibly just-grown) arrays.
  for (size_t i = 0; i < max_threads; i++) {
    for (size_t j = 0; j < distributedBarrier::MAX_ITERS; j++) {
      flags[j][i].stillNeed = 1;
    }
    go[i].go.store(0);
    iter[i].iter = 0;
    // Only brand-new slots get their sleep flag cleared; pre-existing slots
    // are left alone (a thread could be using them).
    if (i >= old_max)
      sleep[i].sleep = false;
  }

  // Recalculate num_gos, etc. based on new nthr
  computeVarsForN(nthr);

  num_threads = nthr;

  // Lazily allocate the team ICV storage on first init.
  if (team_icvs == NULL)
    team_icvs = __kmp_allocate(sizeof(kmp_internal_control_t));
}
207 
208 // This function is used only when KMP_BLOCKTIME is not infinite.
209 // static
210 void __kmp_dist_barrier_wakeup(enum barrier_type bt, kmp_team_t *team,
211                                size_t start, size_t stop, size_t inc,
212                                size_t tid) {
213   KMP_DEBUG_ASSERT(__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME);
214   if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
215     return;
216 
217   kmp_info_t **other_threads = team->t.t_threads;
218   for (size_t thr = start; thr < stop; thr += inc) {
219     KMP_DEBUG_ASSERT(other_threads[thr]);
220     int gtid = other_threads[thr]->th.th_info.ds.ds_gtid;
221     // Wake up worker regardless of if it appears to be sleeping or not
222     __kmp_atomic_resume_64(gtid, (kmp_atomic_flag_64<> *)NULL);
223   }
224 }
225 
// Gather phase of the distributed barrier. Each worker marks arrival by
// clearing its stillNeed flag for the current iteration; group leaders spin
// on their group's flags (executing tasks while they wait), then spin on the
// other group leaders' flags. Optional two-level reduction: leader over its
// group, then the primary thread over the leaders.
static void __kmp_dist_barrier_gather(
    enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
    void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_dist_gather);
  kmp_team_t *team;
  distributedBarrier *b;
  kmp_info_t **other_threads;
  kmp_uint64 my_current_iter, my_next_iter;
  kmp_uint32 nproc;
  bool group_leader;

  team = this_thr->th.th_team;
  nproc = this_thr->th.th_team_nproc;
  other_threads = team->t.t_threads;
  b = team->t.b;
  // Barrier iterations rotate through MAX_ITERS sets of flags.
  my_current_iter = b->iter[tid].iter;
  my_next_iter = (my_current_iter + 1) % distributedBarrier::MAX_ITERS;
  group_leader = ((tid % b->threads_per_group) == 0);

  KA_TRACE(20,
           ("__kmp_dist_barrier_gather: T#%d(%d:%d) enter; barrier type %d\n",
            gtid, team->t.t_id, tid, bt));

#if USE_ITT_BUILD && USE_ITT_NOTIFY
  // Barrier imbalance - save arrive time to the thread
  if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
    this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time =
        __itt_get_timestamp();
  }
#endif

  if (group_leader) {
    // Start from the thread after the group leader
    size_t group_start = tid + 1;
    size_t group_end = tid + b->threads_per_group;
    size_t threads_pending = 0;

    // The last group may be partial.
    if (group_end > nproc)
      group_end = nproc;
    do { // wait for threads in my group
      threads_pending = 0;
      // Check all the flags every time to avoid branch mispredict
      for (size_t thr = group_start; thr < group_end; thr++) {
        // Each thread uses a different cache line; a nonzero stillNeed means
        // that thread has not arrived at this iteration yet.
        threads_pending += b->flags[my_current_iter][thr].stillNeed;
      }
      // Execute tasks here
      if (__kmp_tasking_mode != tskm_immediate_exec) {
        kmp_task_team_t *task_team = this_thr->th.th_task_team;
        if (task_team != NULL) {
          if (TCR_SYNC_4(task_team->tt.tt_active)) {
            if (KMP_TASKING_ENABLED(task_team)) {
              int tasks_completed = FALSE;
              __kmp_atomic_execute_tasks_64(
                  this_thr, gtid, (kmp_atomic_flag_64<> *)NULL, FALSE,
                  &tasks_completed USE_ITT_BUILD_ARG(itt_sync_obj), 0);
            } else
              this_thr->th.th_reap_state = KMP_SAFE_TO_REAP;
          }
        } else {
          this_thr->th.th_reap_state = KMP_SAFE_TO_REAP;
        } // if
      }
      // Bail out on runtime shutdown/abort.
      if (TCR_4(__kmp_global.g.g_done)) {
        if (__kmp_global.g.g_abort)
          __kmp_abort_thread();
        break;
      } else if (__kmp_tasking_mode != tskm_immediate_exec &&
                 this_thr->th.th_reap_state == KMP_SAFE_TO_REAP) {
        this_thr->th.th_reap_state = KMP_NOT_SAFE_TO_REAP;
      }
    } while (threads_pending > 0);

    if (reduce) { // Perform reduction if needed
      OMPT_REDUCTION_DECL(this_thr, gtid);
      OMPT_REDUCTION_BEGIN;
      // Group leader reduces all threads in group
      for (size_t thr = group_start; thr < group_end; thr++) {
        (*reduce)(this_thr->th.th_local.reduce_data,
                  other_threads[thr]->th.th_local.reduce_data);
      }
      OMPT_REDUCTION_END;
    }

    // Set flag for next iteration
    b->flags[my_next_iter][tid].stillNeed = 1;
    // Each thread uses a different cache line; resets stillNeed to 0 to
    // indicate it has reached the barrier
    b->flags[my_current_iter][tid].stillNeed = 0;

    do { // wait for all group leaders
      threads_pending = 0;
      // Leaders sit at tids that are multiples of threads_per_group.
      for (size_t thr = 0; thr < nproc; thr += b->threads_per_group) {
        threads_pending += b->flags[my_current_iter][thr].stillNeed;
      }
      // Execute tasks here
      if (__kmp_tasking_mode != tskm_immediate_exec) {
        kmp_task_team_t *task_team = this_thr->th.th_task_team;
        if (task_team != NULL) {
          if (TCR_SYNC_4(task_team->tt.tt_active)) {
            if (KMP_TASKING_ENABLED(task_team)) {
              int tasks_completed = FALSE;
              __kmp_atomic_execute_tasks_64(
                  this_thr, gtid, (kmp_atomic_flag_64<> *)NULL, FALSE,
                  &tasks_completed USE_ITT_BUILD_ARG(itt_sync_obj), 0);
            } else
              this_thr->th.th_reap_state = KMP_SAFE_TO_REAP;
          }
        } else {
          this_thr->th.th_reap_state = KMP_SAFE_TO_REAP;
        } // if
      }
      if (TCR_4(__kmp_global.g.g_done)) {
        if (__kmp_global.g.g_abort)
          __kmp_abort_thread();
        break;
      } else if (__kmp_tasking_mode != tskm_immediate_exec &&
                 this_thr->th.th_reap_state == KMP_SAFE_TO_REAP) {
        this_thr->th.th_reap_state = KMP_NOT_SAFE_TO_REAP;
      }
    } while (threads_pending > 0);

    if (reduce) { // Perform reduction if needed
      if (KMP_MASTER_TID(tid)) { // Master reduces over group leaders
        OMPT_REDUCTION_DECL(this_thr, gtid);
        OMPT_REDUCTION_BEGIN;
        for (size_t thr = b->threads_per_group; thr < nproc;
             thr += b->threads_per_group) {
          (*reduce)(this_thr->th.th_local.reduce_data,
                    other_threads[thr]->th.th_local.reduce_data);
        }
        OMPT_REDUCTION_END;
      }
    }
  } else {
    // Non-leader worker: just publish arrival; its group leader does the
    // waiting and any reduction on its behalf.
    // Set flag for next iteration
    b->flags[my_next_iter][tid].stillNeed = 1;
    // Each thread uses a different cache line; resets stillNeed to 0 to
    // indicate it has reached the barrier
    b->flags[my_current_iter][tid].stillNeed = 0;
  }

  // Full fence so flag updates are visible before anyone proceeds.
  KMP_MFENCE();

  KA_TRACE(20,
           ("__kmp_dist_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
            gtid, team->t.t_id, tid, bt));
}
374 
// Release phase of the distributed barrier. The primary thread bumps the go
// flags to next_go (group leaders first, then its own group); group leaders
// fan the release out to their group's go flags. Workers first validate
// their membership in the team via th_used_in_team (states: 0 = unused,
// 1 = in team, 2 = being removed, 3 = transitioning into the team — per the
// comments below), then wait on their shared go flag.
static void __kmp_dist_barrier_release(
    enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
    int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_dist_release);
  kmp_team_t *team;
  distributedBarrier *b;
  kmp_bstate_t *thr_bar;
  kmp_uint64 my_current_iter, next_go;
  size_t my_go_index;
  bool group_leader;

  KA_TRACE(20, ("__kmp_dist_barrier_release: T#%d(%d) enter; barrier type %d\n",
                gtid, tid, bt));

  thr_bar = &this_thr->th.th_bar[bt].bb;

  if (!KMP_MASTER_TID(tid)) {
    // workers and non-master group leaders need to check their presence in team
    do {
      if (this_thr->th.th_used_in_team.load() != 1 &&
          this_thr->th.th_used_in_team.load() != 3) {
        // Thread is not in use in a team. Wait on location in tid's thread
        // struct. The 0 value tells anyone looking that this thread is spinning
        // or sleeping until this location becomes 3 again; 3 is the transition
        // state to get to 1 which is waiting on go and being in the team
        kmp_flag_32<false, false> my_flag(&(this_thr->th.th_used_in_team), 3);
        if (KMP_COMPARE_AND_STORE_ACQ32(&(this_thr->th.th_used_in_team), 2,
                                        0) ||
            this_thr->th.th_used_in_team.load() == 0) {
          my_flag.wait(this_thr, true USE_ITT_BUILD_ARG(itt_sync_obj));
        }
#if USE_ITT_BUILD && USE_ITT_NOTIFY
        if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
          // In fork barrier where we could not get the object reliably
          itt_sync_obj =
              __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
          // Cancel wait on previous parallel region...
          __kmp_itt_task_starting(itt_sync_obj);

          if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
            return;

          itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
          if (itt_sync_obj != NULL)
            // Call prepare as early as possible for "new" barrier
            __kmp_itt_task_finished(itt_sync_obj);
        } else
#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
            // Early exit for reaping threads at shutdown.
            if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
          return;
      }
      if (this_thr->th.th_used_in_team.load() != 1 &&
          this_thr->th.th_used_in_team.load() != 3) // spurious wake-up?
        continue;
      if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
        return;

      // At this point, the thread thinks it is in use in a team, or in
      // transition to be used in a team, but it might have reached this barrier
      // before it was marked unused by the team. Unused threads are awoken and
      // shifted to wait on local thread struct elsewhere. It also might reach
      // this point by being picked up for use by a different team. Either way,
      // we need to update the tid.
      tid = __kmp_tid_from_gtid(gtid);
      team = this_thr->th.th_team;
      KMP_DEBUG_ASSERT(tid >= 0);
      KMP_DEBUG_ASSERT(team);
      b = team->t.b;
      my_current_iter = b->iter[tid].iter;
      next_go = my_current_iter + distributedBarrier::MAX_ITERS;
      my_go_index = tid / b->threads_per_go;
      // Complete transition 3 -> 1 (now fully in the team, waiting on go).
      if (this_thr->th.th_used_in_team.load() == 3) {
        KMP_COMPARE_AND_STORE_ACQ32(&(this_thr->th.th_used_in_team), 3, 1);
      }
      // Check if go flag is set
      if (b->go[my_go_index].go.load() != next_go) {
        // Wait on go flag on team
        kmp_atomic_flag_64<false, true> my_flag(
            &(b->go[my_go_index].go), next_go, &(b->sleep[tid].sleep));
        my_flag.wait(this_thr, true USE_ITT_BUILD_ARG(itt_sync_obj));
        KMP_DEBUG_ASSERT(my_current_iter == b->iter[tid].iter ||
                         b->iter[tid].iter == 0);
        KMP_DEBUG_ASSERT(b->sleep[tid].sleep == false);
      }

      if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
        return;
      // At this point, the thread's go location was set. This means the primary
      // thread is safely in the barrier, and so this thread's data is
      // up-to-date, but we should check again that this thread is really in
      // use in the team, as it could have been woken up for the purpose of
      // changing team size, or reaping threads at shutdown.
      if (this_thr->th.th_used_in_team.load() == 1)
        break;
    } while (1);

    if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
      return;

    group_leader = ((tid % b->threads_per_group) == 0);
    if (group_leader) {
      // Tell all the threads in my group they can go!
      for (size_t go_idx = my_go_index + 1;
           go_idx < my_go_index + b->gos_per_group; go_idx++) {
        b->go[go_idx].go.store(next_go);
      }
      // Fence added so that workers can see changes to go. sfence inadequate.
      KMP_MFENCE();
    }

#if KMP_BARRIER_ICV_PUSH
    if (propagate_icvs) { // copy ICVs to final dest
      __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team,
                               tid, FALSE);
      copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
                (kmp_internal_control_t *)team->t.b->team_icvs);
      copy_icvs(&thr_bar->th_fixed_icvs,
                &team->t.t_implicit_task_taskdata[tid].td_icvs);
    }
#endif
    if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME && group_leader) {
      // This thread is now awake and participating in the barrier;
      // wake up the other threads in the group
      size_t nproc = this_thr->th.th_team_nproc;
      size_t group_end = tid + b->threads_per_group;
      if (nproc < group_end)
        group_end = nproc;
      __kmp_dist_barrier_wakeup(bt, team, tid + 1, group_end, 1, tid);
    }
  } else { //  Primary thread
    team = this_thr->th.th_team;
    b = team->t.b;
    my_current_iter = b->iter[tid].iter;
    next_go = my_current_iter + distributedBarrier::MAX_ITERS;
#if KMP_BARRIER_ICV_PUSH
    if (propagate_icvs) {
      // primary thread has ICVs in final destination; copy
      copy_icvs(&thr_bar->th_fixed_icvs,
                &team->t.t_implicit_task_taskdata[tid].td_icvs);
    }
#endif
    // Tell all the group leaders they can go!
    for (size_t go_idx = 0; go_idx < b->num_gos; go_idx += b->gos_per_group) {
      b->go[go_idx].go.store(next_go);
    }

    if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
      // Wake-up the group leaders
      size_t nproc = this_thr->th.th_team_nproc;
      __kmp_dist_barrier_wakeup(bt, team, tid + b->threads_per_group, nproc,
                                b->threads_per_group, tid);
    }

    // Tell all the threads in my group they can go!
    for (size_t go_idx = 1; go_idx < b->gos_per_group; go_idx++) {
      b->go[go_idx].go.store(next_go);
    }

    // Fence added so that workers can see changes to go. sfence inadequate.
    KMP_MFENCE();

    if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
      // Wake-up the other threads in my group
      size_t nproc = this_thr->th.th_team_nproc;
      size_t group_end = tid + b->threads_per_group;
      if (nproc < group_end)
        group_end = nproc;
      __kmp_dist_barrier_wakeup(bt, team, tid + 1, group_end, 1, tid);
    }
  }
  // Update to next iteration
  KMP_ASSERT(my_current_iter == b->iter[tid].iter);
  b->iter[tid].iter = (b->iter[tid].iter + 1) % distributedBarrier::MAX_ITERS;

  KA_TRACE(
      20, ("__kmp_dist_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
           gtid, team->t.t_id, tid, bt));
}
553 
554 // Linear Barrier
// Gather phase of the linear barrier: each worker signals arrival on its own
// b_arrived flag; the primary thread waits for every worker in turn,
// optionally applying the reduction. In the cancellable instantiation,
// returns true if the wait was cancelled; otherwise always returns false.
template <bool cancellable = false>
static bool __kmp_linear_barrier_gather_template(
    enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
    void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_linear_gather);
  kmp_team_t *team = this_thr->th.th_team;
  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
  kmp_info_t **other_threads = team->t.t_threads;

  KA_TRACE(
      20,
      ("__kmp_linear_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
       gtid, team->t.t_id, tid, bt));
  KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);

#if USE_ITT_BUILD && USE_ITT_NOTIFY
  // Barrier imbalance - save arrive time to the thread
  if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
    this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time =
        __itt_get_timestamp();
  }
#endif
  // We now perform a linear reduction to signal that all of the threads have
  // arrived.
  if (!KMP_MASTER_TID(tid)) {
    KA_TRACE(20,
             ("__kmp_linear_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d)"
              "arrived(%p): %llu => %llu\n",
              gtid, team->t.t_id, tid, __kmp_gtid_from_tid(0, team),
              team->t.t_id, 0, &thr_bar->b_arrived, thr_bar->b_arrived,
              thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
    // Mark arrival to primary thread
    /* After performing this write, a worker thread may not assume that the team
       is valid any more - it could be deallocated by the primary thread at any
       time. */
    kmp_flag_64<> flag(&thr_bar->b_arrived, other_threads[0]);
    flag.release();
  } else {
    kmp_balign_team_t *team_bar = &team->t.t_bar[bt];
    int nproc = this_thr->th.th_team_nproc;
    int i;
    // Don't have to worry about sleep bit here or atomic since team setting
    kmp_uint64 new_state = team_bar->b_arrived + KMP_BARRIER_STATE_BUMP;

    // Collect all the worker team member threads.
    for (i = 1; i < nproc; ++i) {
#if KMP_CACHE_MANAGE
      // Prefetch next thread's arrived count
      if (i + 1 < nproc)
        KMP_CACHE_PREFETCH(&other_threads[i + 1]->th.th_bar[bt].bb.b_arrived);
#endif /* KMP_CACHE_MANAGE */
      KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%d) "
                    "arrived(%p) == %llu\n",
                    gtid, team->t.t_id, tid, __kmp_gtid_from_tid(i, team),
                    team->t.t_id, i,
                    &other_threads[i]->th.th_bar[bt].bb.b_arrived, new_state));

      // Wait for worker thread to arrive
      if (cancellable) {
        // Cancellable flag wait: returns true if the barrier was cancelled.
        kmp_flag_64<true, false> flag(
            &other_threads[i]->th.th_bar[bt].bb.b_arrived, new_state);
        if (flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj)))
          return true;
      } else {
        kmp_flag_64<> flag(&other_threads[i]->th.th_bar[bt].bb.b_arrived,
                           new_state);
        flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
      }
#if USE_ITT_BUILD && USE_ITT_NOTIFY
      // Barrier imbalance - write min of the thread time and the other thread
      // time to the thread.
      if (__kmp_forkjoin_frames_mode == 2) {
        this_thr->th.th_bar_min_time = KMP_MIN(
            this_thr->th.th_bar_min_time, other_threads[i]->th.th_bar_min_time);
      }
#endif
      if (reduce) {
        KA_TRACE(100,
                 ("__kmp_linear_barrier_gather: T#%d(%d:%d) += T#%d(%d:%d)\n",
                  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(i, team),
                  team->t.t_id, i));
        OMPT_REDUCTION_DECL(this_thr, gtid);
        OMPT_REDUCTION_BEGIN;
        (*reduce)(this_thr->th.th_local.reduce_data,
                  other_threads[i]->th.th_local.reduce_data);
        OMPT_REDUCTION_END;
      }
    }
    // Don't have to worry about sleep bit here or atomic since team setting
    team_bar->b_arrived = new_state;
    KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) set team %d "
                  "arrived(%p) = %llu\n",
                  gtid, team->t.t_id, tid, team->t.t_id, &team_bar->b_arrived,
                  new_state));
  }
  KA_TRACE(
      20,
      ("__kmp_linear_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
       gtid, team->t.t_id, tid, bt));
  return false;
}
656 
// Release phase of the linear barrier: the primary thread (optionally
// pushing ICVs first) releases each worker's b_go flag; a worker waits on
// its own b_go and then resets it to KMP_INIT_BARRIER_STATE. In the
// cancellable instantiation, returns true if the wait was cancelled;
// otherwise always returns false.
template <bool cancellable = false>
static bool __kmp_linear_barrier_release_template(
    enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
    int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_linear_release);
  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
  kmp_team_t *team;

  if (KMP_MASTER_TID(tid)) {
    unsigned int i;
    kmp_uint32 nproc = this_thr->th.th_team_nproc;
    kmp_info_t **other_threads;

    team = __kmp_threads[gtid]->th.th_team;
    KMP_DEBUG_ASSERT(team != NULL);
    other_threads = team->t.t_threads;

    KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d(%d:%d) primary enter for "
                  "barrier type %d\n",
                  gtid, team->t.t_id, tid, bt));

    if (nproc > 1) {
#if KMP_BARRIER_ICV_PUSH
      {
        KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(USER_icv_copy);
        if (propagate_icvs) {
          // Push the primary thread's ICVs into each worker's implicit task
          // before releasing anyone (ngo_* are MIC non-globally-ordered
          // stores, or plain copies elsewhere).
          ngo_load(&team->t.t_implicit_task_taskdata[0].td_icvs);
          for (i = 1; i < nproc; ++i) {
            __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[i],
                                     team, i, FALSE);
            ngo_store_icvs(&team->t.t_implicit_task_taskdata[i].td_icvs,
                           &team->t.t_implicit_task_taskdata[0].td_icvs);
          }
          ngo_sync();
        }
      }
#endif // KMP_BARRIER_ICV_PUSH

      // Now, release all of the worker threads
      for (i = 1; i < nproc; ++i) {
#if KMP_CACHE_MANAGE
        // Prefetch next thread's go flag
        if (i + 1 < nproc)
          KMP_CACHE_PREFETCH(&other_threads[i + 1]->th.th_bar[bt].bb.b_go);
#endif /* KMP_CACHE_MANAGE */
        KA_TRACE(
            20,
            ("__kmp_linear_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%d) "
             "go(%p): %u => %u\n",
             gtid, team->t.t_id, tid, other_threads[i]->th.th_info.ds.ds_gtid,
             team->t.t_id, i, &other_threads[i]->th.th_bar[bt].bb.b_go,
             other_threads[i]->th.th_bar[bt].bb.b_go,
             other_threads[i]->th.th_bar[bt].bb.b_go + KMP_BARRIER_STATE_BUMP));
        kmp_flag_64<> flag(&other_threads[i]->th.th_bar[bt].bb.b_go,
                           other_threads[i]);
        flag.release();
      }
    }
  } else { // Wait for the PRIMARY thread to release us
    KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d wait go(%p) == %u\n",
                  gtid, &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
    if (cancellable) {
      // Cancellable wait: returns true if the barrier was cancelled.
      kmp_flag_64<true, false> flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
      if (flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj)))
        return true;
    } else {
      kmp_flag_64<> flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
      flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
    }
#if USE_ITT_BUILD && USE_ITT_NOTIFY
    if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
      // In a fork barrier; cannot get the object reliably (or ITTNOTIFY is
      // disabled)
      itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
      // Cancel wait on previous parallel region...
      __kmp_itt_task_starting(itt_sync_obj);

      if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
        return false;

      itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
      if (itt_sync_obj != NULL)
        // Call prepare as early as possible for "new" barrier
        __kmp_itt_task_finished(itt_sync_obj);
    } else
#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
        // Early exit for reaping threads releasing forkjoin barrier
        if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
      return false;
// The worker thread may now assume that the team is valid.
#ifdef KMP_DEBUG
    tid = __kmp_tid_from_gtid(gtid);
    team = __kmp_threads[gtid]->th.th_team;
#endif
    KMP_DEBUG_ASSERT(team != NULL);
    // Reset our go flag for the next barrier.
    TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
    KA_TRACE(20,
             ("__kmp_linear_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
              gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
    KMP_MB(); // Flush all pending memory write invalidates.
  }
  KA_TRACE(
      20,
      ("__kmp_linear_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
       gtid, team->t.t_id, tid, bt));
  return false;
}
764 
// Non-cancellable linear gather: thin wrapper instantiating the template.
static void __kmp_linear_barrier_gather(
    enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
    void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
  __kmp_linear_barrier_gather_template<false>(
      bt, this_thr, gtid, tid, reduce USE_ITT_BUILD_ARG(itt_sync_obj));
}
771 
// Cancellable linear-barrier gather: forwards to the shared template
// implementation with cancellable=true and propagates its result
// (true indicates the barrier was cancelled).
static bool __kmp_linear_barrier_gather_cancellable(
    enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
    void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
  return __kmp_linear_barrier_gather_template<true>(
      bt, this_thr, gtid, tid, reduce USE_ITT_BUILD_ARG(itt_sync_obj));
}
778 
// Non-cancellable linear-barrier release: forwards to the shared template
// implementation with cancellable=false (return value discarded).
static void __kmp_linear_barrier_release(
    enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
    int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
  __kmp_linear_barrier_release_template<false>(
      bt, this_thr, gtid, tid, propagate_icvs USE_ITT_BUILD_ARG(itt_sync_obj));
}
785 
// Cancellable linear-barrier release: forwards to the shared template
// implementation with cancellable=true and propagates its result
// (true indicates the barrier was cancelled).
static bool __kmp_linear_barrier_release_cancellable(
    enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
    int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
  return __kmp_linear_barrier_release_template<true>(
      bt, this_thr, gtid, tid, propagate_icvs USE_ITT_BUILD_ARG(itt_sync_obj));
}
792 
793 // Tree barrier
// Gather phase of the tree barrier.  Threads form an implicit tree with
// fan-in (1 << branch_bits); each thread first waits for all of its children
// to arrive (combining reduction data via `reduce` if supplied), then signals
// its own arrival to its parent.  The primary thread (tid 0) instead
// publishes the updated team-wide b_arrived value.
static void __kmp_tree_barrier_gather(
    enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
    void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_tree_gather);
  kmp_team_t *team = this_thr->th.th_team;
  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
  kmp_info_t **other_threads = team->t.t_threads;
  kmp_uint32 nproc = this_thr->th.th_team_nproc;
  kmp_uint32 branch_bits = __kmp_barrier_gather_branch_bits[bt];
  kmp_uint32 branch_factor = 1 << branch_bits;
  kmp_uint32 child;
  kmp_uint32 child_tid;
  kmp_uint64 new_state = 0; // computed below only when this thread has children

  KA_TRACE(
      20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
           gtid, team->t.t_id, tid, bt));
  KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);

#if USE_ITT_BUILD && USE_ITT_NOTIFY
  // Barrier imbalance - save arrive time to the thread
  if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
    this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time =
        __itt_get_timestamp();
  }
#endif
  // Perform tree gather to wait until all threads have arrived; reduce any
  // required data as we go
  // First child of tid in the implicit tree; no children => skip the wait loop.
  child_tid = (tid << branch_bits) + 1;
  if (child_tid < nproc) {
    // Parent threads wait for all their children to arrive
    new_state = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
    child = 1;
    do {
      kmp_info_t *child_thr = other_threads[child_tid];
      kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
#if KMP_CACHE_MANAGE
      // Prefetch next thread's arrived count
      if (child + 1 <= branch_factor && child_tid + 1 < nproc)
        KMP_CACHE_PREFETCH(
            &other_threads[child_tid + 1]->th.th_bar[bt].bb.b_arrived);
#endif /* KMP_CACHE_MANAGE */
      KA_TRACE(20,
               ("__kmp_tree_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%u) "
                "arrived(%p) == %llu\n",
                gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
                team->t.t_id, child_tid, &child_bar->b_arrived, new_state));
      // Wait for child to arrive
      kmp_flag_64<> flag(&child_bar->b_arrived, new_state);
      flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
#if USE_ITT_BUILD && USE_ITT_NOTIFY
      // Barrier imbalance - write min of the thread time and a child time to
      // the thread.
      if (__kmp_forkjoin_frames_mode == 2) {
        this_thr->th.th_bar_min_time = KMP_MIN(this_thr->th.th_bar_min_time,
                                               child_thr->th.th_bar_min_time);
      }
#endif
      if (reduce) {
        // Fold the child's reduction data into ours (parent accumulates).
        KA_TRACE(100,
                 ("__kmp_tree_barrier_gather: T#%d(%d:%d) += T#%d(%d:%u)\n",
                  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
                  team->t.t_id, child_tid));
        OMPT_REDUCTION_DECL(this_thr, gtid);
        OMPT_REDUCTION_BEGIN;
        (*reduce)(this_thr->th.th_local.reduce_data,
                  child_thr->th.th_local.reduce_data);
        OMPT_REDUCTION_END;
      }
      child++;
      child_tid++;
    } while (child <= branch_factor && child_tid < nproc);
  }

  if (!KMP_MASTER_TID(tid)) { // Worker threads
    kmp_int32 parent_tid = (tid - 1) >> branch_bits;

    KA_TRACE(20,
             ("__kmp_tree_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) "
              "arrived(%p): %llu => %llu\n",
              gtid, team->t.t_id, tid, __kmp_gtid_from_tid(parent_tid, team),
              team->t.t_id, parent_tid, &thr_bar->b_arrived, thr_bar->b_arrived,
              thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));

    // Mark arrival to parent thread
    /* After performing this write, a worker thread may not assume that the team
       is valid any more - it could be deallocated by the primary thread at any
       time.  */
    kmp_flag_64<> flag(&thr_bar->b_arrived, other_threads[parent_tid]);
    flag.release();
  } else {
    // Need to update the team arrived pointer if we are the primary thread
    if (nproc > 1) // New value was already computed above
      team->t.t_bar[bt].b_arrived = new_state;
    else
      team->t.t_bar[bt].b_arrived += KMP_BARRIER_STATE_BUMP;
    KA_TRACE(20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) set team %d "
                  "arrived(%p) = %llu\n",
                  gtid, team->t.t_id, tid, team->t.t_id,
                  &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
  }
  KA_TRACE(20,
           ("__kmp_tree_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
            gtid, team->t.t_id, tid, bt));
}
899 
// Release phase of the tree barrier.  Workers first wait on their own b_go
// flag for their parent's signal (with early-exit paths when the runtime is
// shutting down during a fork/join barrier); then every thread releases its
// own children in tid order, optionally pushing ICVs to them first.
static void __kmp_tree_barrier_release(
    enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
    int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_tree_release);
  kmp_team_t *team;
  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
  kmp_uint32 nproc;
  kmp_uint32 branch_bits = __kmp_barrier_release_branch_bits[bt];
  kmp_uint32 branch_factor = 1 << branch_bits;
  kmp_uint32 child;
  kmp_uint32 child_tid;

  // Perform a tree release for all of the threads that have been gathered
  if (!KMP_MASTER_TID(
          tid)) { // Handle fork barrier workers who aren't part of a team yet
    KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d wait go(%p) == %u\n", gtid,
                  &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
    // Wait for parent thread to release us
    kmp_flag_64<> flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
    flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
#if USE_ITT_BUILD && USE_ITT_NOTIFY
    if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
      // In fork barrier where we could not get the object reliably (or
      // ITTNOTIFY is disabled)
      itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
      // Cancel wait on previous parallel region...
      __kmp_itt_task_starting(itt_sync_obj);

      if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
        return;

      itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
      if (itt_sync_obj != NULL)
        // Call prepare as early as possible for "new" barrier
        __kmp_itt_task_finished(itt_sync_obj);
    } else
#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
        // Early exit for reaping threads releasing forkjoin barrier
        if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
      return;

    // The worker thread may now assume that the team is valid.
    team = __kmp_threads[gtid]->th.th_team;
    KMP_DEBUG_ASSERT(team != NULL);
    tid = __kmp_tid_from_gtid(gtid);

    // Reset our go flag for the next barrier before touching children.
    TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
    KA_TRACE(20,
             ("__kmp_tree_barrier_release: T#%d(%d:%d) set go(%p) = %u\n", gtid,
              team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
    KMP_MB(); // Flush all pending memory write invalidates.
  } else {
    team = __kmp_threads[gtid]->th.th_team;
    KMP_DEBUG_ASSERT(team != NULL);
    KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d(%d:%d) primary enter for "
                  "barrier type %d\n",
                  gtid, team->t.t_id, tid, bt));
  }
  nproc = this_thr->th.th_team_nproc;
  // First child of tid in the implicit tree; no children => nothing to release.
  child_tid = (tid << branch_bits) + 1;

  if (child_tid < nproc) {
    kmp_info_t **other_threads = team->t.t_threads;
    child = 1;
    // Parent threads release all their children
    do {
      kmp_info_t *child_thr = other_threads[child_tid];
      kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
#if KMP_CACHE_MANAGE
      // Prefetch next thread's go count
      if (child + 1 <= branch_factor && child_tid + 1 < nproc)
        KMP_CACHE_PREFETCH(
            &other_threads[child_tid + 1]->th.th_bar[bt].bb.b_go);
#endif /* KMP_CACHE_MANAGE */

#if KMP_BARRIER_ICV_PUSH
      {
        KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(USER_icv_copy);
        if (propagate_icvs) {
          // Copy the primary thread's ICVs into the child's implicit task
          // before waking it, so the child sees them on release.
          __kmp_init_implicit_task(team->t.t_ident,
                                   team->t.t_threads[child_tid], team,
                                   child_tid, FALSE);
          copy_icvs(&team->t.t_implicit_task_taskdata[child_tid].td_icvs,
                    &team->t.t_implicit_task_taskdata[0].td_icvs);
        }
      }
#endif // KMP_BARRIER_ICV_PUSH
      KA_TRACE(20,
               ("__kmp_tree_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%u)"
                "go(%p): %u => %u\n",
                gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
                team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
                child_bar->b_go + KMP_BARRIER_STATE_BUMP));
      // Release child from barrier
      kmp_flag_64<> flag(&child_bar->b_go, child_thr);
      flag.release();
      child++;
      child_tid++;
    } while (child <= branch_factor && child_tid < nproc);
  }
  KA_TRACE(
      20, ("__kmp_tree_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
           gtid, team->t.t_id, tid, bt));
}
1004 
1005 // Hyper Barrier
// Gather phase of the hypercube-embedded tree barrier.  Each level of the
// loop doubles (by branch_bits) the stride between communicating threads:
// a thread that is not a subtree root at the current level signals its
// parent and stops; subtree roots wait for their children at that level,
// folding in reduction data, then move up to the next level.
static void __kmp_hyper_barrier_gather(
    enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
    void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hyper_gather);
  kmp_team_t *team = this_thr->th.th_team;
  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
  kmp_info_t **other_threads = team->t.t_threads;
  // Sentinel: stays KMP_BARRIER_UNUSED_STATE until this thread first waits
  // for a child, which lets the primary thread know below whether the bumped
  // value was already computed.
  kmp_uint64 new_state = KMP_BARRIER_UNUSED_STATE;
  kmp_uint32 num_threads = this_thr->th.th_team_nproc;
  kmp_uint32 branch_bits = __kmp_barrier_gather_branch_bits[bt];
  kmp_uint32 branch_factor = 1 << branch_bits;
  kmp_uint32 offset;
  kmp_uint32 level;

  KA_TRACE(
      20,
      ("__kmp_hyper_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
       gtid, team->t.t_id, tid, bt));
  KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);

#if USE_ITT_BUILD && USE_ITT_NOTIFY
  // Barrier imbalance - save arrive time to the thread
  if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
    this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time =
        __itt_get_timestamp();
  }
#endif
  /* Perform a hypercube-embedded tree gather to wait until all of the threads
     have arrived, and reduce any required data as we go.  */
  kmp_flag_64<> p_flag(&thr_bar->b_arrived);
  for (level = 0, offset = 1; offset < num_threads;
       level += branch_bits, offset <<= branch_bits) {
    kmp_uint32 child;
    kmp_uint32 child_tid;

    if (((tid >> level) & (branch_factor - 1)) != 0) {
      // Not a subtree root at this level: signal parent and we are done.
      kmp_int32 parent_tid = tid & ~((1 << (level + branch_bits)) - 1);

      KMP_MB(); // Synchronize parent and child threads.
      KA_TRACE(20,
               ("__kmp_hyper_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) "
                "arrived(%p): %llu => %llu\n",
                gtid, team->t.t_id, tid, __kmp_gtid_from_tid(parent_tid, team),
                team->t.t_id, parent_tid, &thr_bar->b_arrived,
                thr_bar->b_arrived,
                thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
      // Mark arrival to parent thread
      /* After performing this write (in the last iteration of the enclosing for
         loop), a worker thread may not assume that the team is valid any more
         - it could be deallocated by the primary thread at any time.  */
      p_flag.set_waiter(other_threads[parent_tid]);
      p_flag.release();
      break;
    }

    // Parent threads wait for children to arrive
    if (new_state == KMP_BARRIER_UNUSED_STATE)
      new_state = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
    for (child = 1, child_tid = tid + (1 << level);
         child < branch_factor && child_tid < num_threads;
         child++, child_tid += (1 << level)) {
      kmp_info_t *child_thr = other_threads[child_tid];
      kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
#if KMP_CACHE_MANAGE
      kmp_uint32 next_child_tid = child_tid + (1 << level);
      // Prefetch next thread's arrived count
      if (child + 1 < branch_factor && next_child_tid < num_threads)
        KMP_CACHE_PREFETCH(
            &other_threads[next_child_tid]->th.th_bar[bt].bb.b_arrived);
#endif /* KMP_CACHE_MANAGE */
      KA_TRACE(20,
               ("__kmp_hyper_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%u) "
                "arrived(%p) == %llu\n",
                gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
                team->t.t_id, child_tid, &child_bar->b_arrived, new_state));
      // Wait for child to arrive
      kmp_flag_64<> c_flag(&child_bar->b_arrived, new_state);
      c_flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
      KMP_MB(); // Synchronize parent and child threads.
#if USE_ITT_BUILD && USE_ITT_NOTIFY
      // Barrier imbalance - write min of the thread time and a child time to
      // the thread.
      if (__kmp_forkjoin_frames_mode == 2) {
        this_thr->th.th_bar_min_time = KMP_MIN(this_thr->th.th_bar_min_time,
                                               child_thr->th.th_bar_min_time);
      }
#endif
      if (reduce) {
        // Fold the child's reduction data into ours (parent accumulates).
        KA_TRACE(100,
                 ("__kmp_hyper_barrier_gather: T#%d(%d:%d) += T#%d(%d:%u)\n",
                  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
                  team->t.t_id, child_tid));
        OMPT_REDUCTION_DECL(this_thr, gtid);
        OMPT_REDUCTION_BEGIN;
        (*reduce)(this_thr->th.th_local.reduce_data,
                  child_thr->th.th_local.reduce_data);
        OMPT_REDUCTION_END;
      }
    }
  }

  if (KMP_MASTER_TID(tid)) {
    // Need to update the team arrived pointer if we are the primary thread
    if (new_state == KMP_BARRIER_UNUSED_STATE)
      team->t.t_bar[bt].b_arrived += KMP_BARRIER_STATE_BUMP;
    else
      team->t.t_bar[bt].b_arrived = new_state;
    KA_TRACE(20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) set team %d "
                  "arrived(%p) = %llu\n",
                  gtid, team->t.t_id, tid, team->t.t_id,
                  &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
  }
  KA_TRACE(
      20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
           gtid, team->t.t_id, tid, bt));
}
1122 
1123 // The reverse versions seem to beat the forward versions overall
1124 #define KMP_REVERSE_HYPER_BAR
// Release phase of the hypercube-embedded tree barrier.  The primary thread
// stages its ICVs in thr_bar->th_fixed_icvs (when propagate_icvs) and starts
// releasing; workers first wait on their b_go flag, then release their own
// children.  With KMP_REVERSE_HYPER_BAR (the default here) children are
// released in the reverse order of the gather; otherwise in the same order.
static void __kmp_hyper_barrier_release(
    enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
    int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hyper_release);
  kmp_team_t *team;
  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
  kmp_info_t **other_threads;
  kmp_uint32 num_threads;
  kmp_uint32 branch_bits = __kmp_barrier_release_branch_bits[bt];
  kmp_uint32 branch_factor = 1 << branch_bits;
  kmp_uint32 child;
  kmp_uint32 child_tid;
  kmp_uint32 offset;
  kmp_uint32 level;

  /* Perform a hypercube-embedded tree release for all of the threads that have
     been gathered. If KMP_REVERSE_HYPER_BAR is defined (default) the threads
     are released in the reverse order of the corresponding gather, otherwise
     threads are released in the same order. */
  if (KMP_MASTER_TID(tid)) { // primary thread
    team = __kmp_threads[gtid]->th.th_team;
    KMP_DEBUG_ASSERT(team != NULL);
    KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d(%d:%d) primary enter for "
                  "barrier type %d\n",
                  gtid, team->t.t_id, tid, bt));
#if KMP_BARRIER_ICV_PUSH
    if (propagate_icvs) { // primary already has ICVs in final destination; copy
      copy_icvs(&thr_bar->th_fixed_icvs,
                &team->t.t_implicit_task_taskdata[tid].td_icvs);
    }
#endif
  } else { // Handle fork barrier workers who aren't part of a team yet
    KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d wait go(%p) == %u\n", gtid,
                  &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
    // Wait for parent thread to release us
    kmp_flag_64<> flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
    flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
#if USE_ITT_BUILD && USE_ITT_NOTIFY
    if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
      // In fork barrier where we could not get the object reliably
      itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
      // Cancel wait on previous parallel region...
      __kmp_itt_task_starting(itt_sync_obj);

      if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
        return;

      itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
      if (itt_sync_obj != NULL)
        // Call prepare as early as possible for "new" barrier
        __kmp_itt_task_finished(itt_sync_obj);
    } else
#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
        // Early exit for reaping threads releasing forkjoin barrier
        if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
      return;

    // The worker thread may now assume that the team is valid.
    team = __kmp_threads[gtid]->th.th_team;
    KMP_DEBUG_ASSERT(team != NULL);
    tid = __kmp_tid_from_gtid(gtid);

    // Reset our go flag for the next barrier before releasing children.
    TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
    KA_TRACE(20,
             ("__kmp_hyper_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
              gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
    KMP_MB(); // Flush all pending memory write invalidates.
  }
  num_threads = this_thr->th.th_team_nproc;
  other_threads = team->t.t_threads;

#ifdef KMP_REVERSE_HYPER_BAR
  // Count up to correct level for parent
  for (level = 0, offset = 1;
       offset < num_threads && (((tid >> level) & (branch_factor - 1)) == 0);
       level += branch_bits, offset <<= branch_bits)
    ;

  // Now go down from there
  for (level -= branch_bits, offset >>= branch_bits; offset != 0;
       level -= branch_bits, offset >>= branch_bits)
#else
  // Go down the tree, level by level
  for (level = 0, offset = 1; offset < num_threads;
       level += branch_bits, offset <<= branch_bits)
#endif // KMP_REVERSE_HYPER_BAR
  {
#ifdef KMP_REVERSE_HYPER_BAR
    /* Now go in reverse order through the children, highest to lowest.
       Initial setting of child is conservative here. */
    child = num_threads >> ((level == 0) ? level : level - 1);
    for (child = (child < branch_factor - 1) ? child : branch_factor - 1,
        child_tid = tid + (child << level);
         child >= 1; child--, child_tid -= (1 << level))
#else
    if (((tid >> level) & (branch_factor - 1)) != 0)
      // No need to go lower than this, since this is the level parent would be
      // notified
      break;
    // Iterate through children on this level of the tree
    for (child = 1, child_tid = tid + (1 << level);
         child < branch_factor && child_tid < num_threads;
         child++, child_tid += (1 << level))
#endif // KMP_REVERSE_HYPER_BAR
    {
      if (child_tid >= num_threads)
        continue; // Child doesn't exist so keep going
      else {
        kmp_info_t *child_thr = other_threads[child_tid];
        kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
#if KMP_CACHE_MANAGE
        kmp_uint32 next_child_tid = child_tid - (1 << level);
// Prefetch next thread's go count
#ifdef KMP_REVERSE_HYPER_BAR
        if (child - 1 >= 1 && next_child_tid < num_threads)
#else
        if (child + 1 < branch_factor && next_child_tid < num_threads)
#endif // KMP_REVERSE_HYPER_BAR
          KMP_CACHE_PREFETCH(
              &other_threads[next_child_tid]->th.th_bar[bt].bb.b_go);
#endif /* KMP_CACHE_MANAGE */

#if KMP_BARRIER_ICV_PUSH
        if (propagate_icvs) // push my fixed ICVs to my child
          copy_icvs(&child_bar->th_fixed_icvs, &thr_bar->th_fixed_icvs);
#endif // KMP_BARRIER_ICV_PUSH

        KA_TRACE(
            20,
            ("__kmp_hyper_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%u)"
             "go(%p): %u => %u\n",
             gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
             team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
             child_bar->b_go + KMP_BARRIER_STATE_BUMP));
        // Release child from barrier
        kmp_flag_64<> flag(&child_bar->b_go, child_thr);
        flag.release();
      }
    }
  }
#if KMP_BARRIER_ICV_PUSH
  if (propagate_icvs &&
      !KMP_MASTER_TID(tid)) { // copy ICVs locally to final dest
    // Workers received ICVs in th_fixed_icvs from their parent; install them
    // into this thread's implicit task.
    __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team, tid,
                             FALSE);
    copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
              &thr_bar->th_fixed_icvs);
  }
#endif
  KA_TRACE(
      20,
      ("__kmp_hyper_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
       gtid, team->t.t_id, tid, bt));
}
1279 
1280 // Hierarchical Barrier
1281 
1282 // Initialize thread barrier data
1283 /* Initializes/re-initializes the hierarchical barrier data stored on a thread.
1284    Performs the minimum amount of initialization required based on how the team
1285    has changed. Returns true if leaf children will require both on-core and
1286    traditional wake-up mechanisms. For example, if the team size increases,
1287    threads already in the team will respond to on-core wakeup on their parent
1288    thread, but threads newly added to the team will only be listening on the
1289    their local b_go. */
static bool __kmp_init_hierarchical_barrier_thread(enum barrier_type bt,
                                                   kmp_bstate_t *thr_bar,
                                                   kmp_uint32 nproc, int gtid,
                                                   int tid, kmp_team_t *team) {
  // Checks to determine if (re-)initialization is needed
  bool uninitialized = thr_bar->team == NULL;
  bool team_changed = team != thr_bar->team;
  bool team_sz_changed = nproc != thr_bar->nproc;
  bool tid_changed = tid != thr_bar->old_tid;
  bool retval = false;

  if (uninitialized || team_sz_changed) {
    // (Re)build the machine-hierarchy levels (depth, skip_per_level, etc.).
    __kmp_get_hierarchy(nproc, thr_bar);
  }

  if (uninitialized || team_sz_changed || tid_changed) {
    thr_bar->my_level = thr_bar->depth - 1; // default for primary thread
    thr_bar->parent_tid = -1; // default for primary thread
    if (!KMP_MASTER_TID(tid)) {
      // if not primary thread, find parent thread in hierarchy
      kmp_uint32 d = 0;
      while (d < thr_bar->depth) { // find parent based on level of thread in
        // hierarchy, and note level
        kmp_uint32 rem;
        if (d == thr_bar->depth - 2) { // reached level right below the primary
          thr_bar->parent_tid = 0;
          thr_bar->my_level = d;
          break;
        } else if ((rem = tid % thr_bar->skip_per_level[d + 1]) != 0) {
          // TODO: can we make the above op faster?
          // thread is not a subtree root at next level, so this is max
          thr_bar->parent_tid = tid - rem;
          thr_bar->my_level = d;
          break;
        }
        ++d;
      }
    }
    // Byte offset of this thread within its parent's 64-bit leaf_state flag;
    // 7 appears to index from the flag's top byte — see the leaf_state byte
    // loop below which also fills from byte 7 downward.
    __kmp_type_convert(7 - ((tid - thr_bar->parent_tid) /
                            (thr_bar->skip_per_level[thr_bar->my_level])),
                       &(thr_bar->offset));
    thr_bar->old_tid = tid;
    thr_bar->wait_flag = KMP_BARRIER_NOT_WAITING;
    thr_bar->team = team;
    thr_bar->parent_bar =
        &team->t.t_threads[thr_bar->parent_tid]->th.th_bar[bt].bb;
  }
  if (uninitialized || team_changed || tid_changed) {
    thr_bar->team = team;
    thr_bar->parent_bar =
        &team->t.t_threads[thr_bar->parent_tid]->th.th_bar[bt].bb;
    // Caller must use both on-core and traditional wake-up mechanisms.
    retval = true;
  }
  if (uninitialized || team_sz_changed || tid_changed) {
    thr_bar->nproc = nproc;
    thr_bar->leaf_kids = thr_bar->base_leaf_kids;
    if (thr_bar->my_level == 0)
      thr_bar->leaf_kids = 0;
    // Clip leaf_kids so tid + leaf_kids stays within the team.
    if (thr_bar->leaf_kids && (kmp_uint32)tid + thr_bar->leaf_kids + 1 > nproc)
      __kmp_type_convert(nproc - tid - 1, &(thr_bar->leaf_kids));
    // leaf_state marks one byte per leaf child, filled from byte 7 downward.
    thr_bar->leaf_state = 0;
    for (int i = 0; i < thr_bar->leaf_kids; ++i)
      ((char *)&(thr_bar->leaf_state))[7 - i] = 1;
  }
  return retval;
}
1356 
1357 static void __kmp_hierarchical_barrier_gather(
1358     enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
1359     void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
1360   KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hier_gather);
1361   kmp_team_t *team = this_thr->th.th_team;
1362   kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
1363   kmp_uint32 nproc = this_thr->th.th_team_nproc;
1364   kmp_info_t **other_threads = team->t.t_threads;
1365   kmp_uint64 new_state = 0;
1366 
1367   int level = team->t.t_level;
1368   if (other_threads[0]
1369           ->th.th_teams_microtask) // are we inside the teams construct?
1370     if (this_thr->th.th_teams_size.nteams > 1)
1371       ++level; // level was not increased in teams construct for team_of_masters
1372   if (level == 1)
1373     thr_bar->use_oncore_barrier = 1;
1374   else
1375     thr_bar->use_oncore_barrier = 0; // Do not use oncore barrier when nested
1376 
1377   KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) enter for "
1378                 "barrier type %d\n",
1379                 gtid, team->t.t_id, tid, bt));
1380   KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
1381 
1382 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1383   // Barrier imbalance - save arrive time to the thread
1384   if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
1385     this_thr->th.th_bar_arrive_time = __itt_get_timestamp();
1386   }
1387 #endif
1388 
1389   (void)__kmp_init_hierarchical_barrier_thread(bt, thr_bar, nproc, gtid, tid,
1390                                                team);
1391 
1392   if (thr_bar->my_level) { // not a leaf (my_level==0 means leaf)
1393     kmp_int32 child_tid;
1394     new_state =
1395         (kmp_uint64)team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
1396     if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME &&
1397         thr_bar->use_oncore_barrier) {
1398       if (thr_bar->leaf_kids) {
1399         // First, wait for leaf children to check-in on my b_arrived flag
1400         kmp_uint64 leaf_state =
1401             KMP_MASTER_TID(tid)
1402                 ? thr_bar->b_arrived | thr_bar->leaf_state
1403                 : team->t.t_bar[bt].b_arrived | thr_bar->leaf_state;
1404         KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) waiting "
1405                       "for leaf kids\n",
1406                       gtid, team->t.t_id, tid));
1407         kmp_flag_64<> flag(&thr_bar->b_arrived, leaf_state);
1408         flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
1409         if (reduce) {
1410           OMPT_REDUCTION_DECL(this_thr, gtid);
1411           OMPT_REDUCTION_BEGIN;
1412           for (child_tid = tid + 1; child_tid <= tid + thr_bar->leaf_kids;
1413                ++child_tid) {
1414             KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += "
1415                            "T#%d(%d:%d)\n",
1416                            gtid, team->t.t_id, tid,
1417                            __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
1418                            child_tid));
1419             (*reduce)(this_thr->th.th_local.reduce_data,
1420                       other_threads[child_tid]->th.th_local.reduce_data);
1421           }
1422           OMPT_REDUCTION_END;
1423         }
1424         // clear leaf_state bits
1425         KMP_TEST_THEN_AND64(&thr_bar->b_arrived, ~(thr_bar->leaf_state));
1426       }
1427       // Next, wait for higher level children on each child's b_arrived flag
1428       for (kmp_uint32 d = 1; d < thr_bar->my_level;
1429            ++d) { // gather lowest level threads first, but skip 0
1430         kmp_uint32 last = tid + thr_bar->skip_per_level[d + 1],
1431                    skip = thr_bar->skip_per_level[d];
1432         if (last > nproc)
1433           last = nproc;
1434         for (child_tid = tid + skip; child_tid < (int)last; child_tid += skip) {
1435           kmp_info_t *child_thr = other_threads[child_tid];
1436           kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
1437           KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) wait "
1438                         "T#%d(%d:%d) "
1439                         "arrived(%p) == %llu\n",
1440                         gtid, team->t.t_id, tid,
1441                         __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
1442                         child_tid, &child_bar->b_arrived, new_state));
1443           kmp_flag_64<> flag(&child_bar->b_arrived, new_state);
1444           flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
1445           if (reduce) {
1446             KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += "
1447                            "T#%d(%d:%d)\n",
1448                            gtid, team->t.t_id, tid,
1449                            __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
1450                            child_tid));
1451             (*reduce)(this_thr->th.th_local.reduce_data,
1452                       child_thr->th.th_local.reduce_data);
1453           }
1454         }
1455       }
1456     } else { // Blocktime is not infinite
1457       for (kmp_uint32 d = 0; d < thr_bar->my_level;
1458            ++d) { // Gather lowest level threads first
1459         kmp_uint32 last = tid + thr_bar->skip_per_level[d + 1],
1460                    skip = thr_bar->skip_per_level[d];
1461         if (last > nproc)
1462           last = nproc;
1463         for (child_tid = tid + skip; child_tid < (int)last; child_tid += skip) {
1464           kmp_info_t *child_thr = other_threads[child_tid];
1465           kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
1466           KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) wait "
1467                         "T#%d(%d:%d) "
1468                         "arrived(%p) == %llu\n",
1469                         gtid, team->t.t_id, tid,
1470                         __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
1471                         child_tid, &child_bar->b_arrived, new_state));
1472           kmp_flag_64<> flag(&child_bar->b_arrived, new_state);
1473           flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
1474           if (reduce) {
1475             KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += "
1476                            "T#%d(%d:%d)\n",
1477                            gtid, team->t.t_id, tid,
1478                            __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
1479                            child_tid));
1480             (*reduce)(this_thr->th.th_local.reduce_data,
1481                       child_thr->th.th_local.reduce_data);
1482           }
1483         }
1484       }
1485     }
1486   }
1487   // All subordinates are gathered; now release parent if not primary thread
1488 
1489   if (!KMP_MASTER_TID(tid)) { // worker threads release parent in hierarchy
1490     KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) releasing"
1491                   " T#%d(%d:%d) arrived(%p): %llu => %llu\n",
1492                   gtid, team->t.t_id, tid,
1493                   __kmp_gtid_from_tid(thr_bar->parent_tid, team), team->t.t_id,
1494                   thr_bar->parent_tid, &thr_bar->b_arrived, thr_bar->b_arrived,
1495                   thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
1496     /* Mark arrival to parent: After performing this write, a worker thread may
1497        not assume that the team is valid any more - it could be deallocated by
1498        the primary thread at any time. */
1499     if (thr_bar->my_level || __kmp_dflt_blocktime != KMP_MAX_BLOCKTIME ||
1500         !thr_bar->use_oncore_barrier) { // Parent is waiting on my b_arrived
1501       // flag; release it
1502       kmp_flag_64<> flag(&thr_bar->b_arrived,
1503                          other_threads[thr_bar->parent_tid]);
1504       flag.release();
1505     } else {
1506       // Leaf does special release on "offset" bits of parent's b_arrived flag
1507       thr_bar->b_arrived = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
1508       kmp_flag_oncore flag(&thr_bar->parent_bar->b_arrived,
1509                            thr_bar->offset + 1);
1510       flag.set_waiter(other_threads[thr_bar->parent_tid]);
1511       flag.release();
1512     }
1513   } else { // Primary thread needs to update the team's b_arrived value
1514     team->t.t_bar[bt].b_arrived = new_state;
1515     KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) set team %d "
1516                   "arrived(%p) = %llu\n",
1517                   gtid, team->t.t_id, tid, team->t.t_id,
1518                   &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
1519   }
1520   // Is the team access below unsafe or just technically invalid?
1521   KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) exit for "
1522                 "barrier type %d\n",
1523                 gtid, team->t.t_id, tid, bt));
1524 }
1525 
// Release phase of the hierarchical (topology-aware) barrier.
// The primary thread begins releasing its children; each worker first waits
// to be released by its parent -- either on its own b_go flag, or (leaf with
// infinite blocktime, on-core case) on its "offset" byte of the parent's b_go
// flag -- and then releases its own children in turn.
//   bt             - which barrier (plain / forkjoin / reduction) is in use
//   this_thr, gtid, tid - identity of the thread executing the release
//   propagate_icvs - nonzero if ICVs must be pushed down the hierarchy
//   itt_sync_obj   - ITT notification handle (only with USE_ITT_BUILD)
static void __kmp_hierarchical_barrier_release(
    enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
    int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hier_release);
  kmp_team_t *team;
  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
  kmp_uint32 nproc;
  bool team_change = false; // indicates on-core barrier shouldn't be used

  if (KMP_MASTER_TID(tid)) {
    team = __kmp_threads[gtid]->th.th_team;
    KMP_DEBUG_ASSERT(team != NULL);
    KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) primary "
                  "entered barrier type %d\n",
                  gtid, team->t.t_id, tid, bt));
  } else { // Worker threads
    // Wait for parent thread to release me
    if (!thr_bar->use_oncore_barrier ||
        __kmp_dflt_blocktime != KMP_MAX_BLOCKTIME || thr_bar->my_level != 0 ||
        thr_bar->team == NULL) {
      // Use traditional method of waiting on my own b_go flag
      thr_bar->wait_flag = KMP_BARRIER_OWN_FLAG;
      kmp_flag_64<> flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
      flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
      TCW_8(thr_bar->b_go,
            KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
    } else { // Thread barrier data is initialized, this is a leaf, blocktime is
      // infinite, not nested
      // Wait on my "offset" bits on parent's b_go flag
      thr_bar->wait_flag = KMP_BARRIER_PARENT_FLAG;
      kmp_flag_oncore flag(&thr_bar->parent_bar->b_go, KMP_BARRIER_STATE_BUMP,
                           thr_bar->offset + 1, bt,
                           this_thr USE_ITT_BUILD_ARG(itt_sync_obj));
      flag.wait(this_thr, TRUE);
      if (thr_bar->wait_flag ==
          KMP_BARRIER_SWITCHING) { // Thread was switched to own b_go
        TCW_8(thr_bar->b_go,
              KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
      } else { // Reset my bits on parent's b_go flag
        (RCAST(volatile char *,
               &(thr_bar->parent_bar->b_go)))[thr_bar->offset + 1] = 0;
      }
    }
    thr_bar->wait_flag = KMP_BARRIER_NOT_WAITING;
    // Early exit for reaping threads releasing forkjoin barrier
    if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
      return;
    // The worker thread may now assume that the team is valid.
    team = __kmp_threads[gtid]->th.th_team;
    KMP_DEBUG_ASSERT(team != NULL);
    tid = __kmp_tid_from_gtid(gtid);

    KA_TRACE(
        20,
        ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
         gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
    KMP_MB(); // Flush all pending memory write invalidates.
  }

  // Determine the effective nesting level; only a non-nested team (level 1)
  // may use the on-core barrier optimization below.
  nproc = this_thr->th.th_team_nproc;
  int level = team->t.t_level;
  if (team->t.t_threads[0]
          ->th.th_teams_microtask) { // are we inside the teams construct?
    if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
        this_thr->th.th_teams_level == level)
      ++level; // level was not increased in teams construct for team_of_workers
    if (this_thr->th.th_teams_size.nteams > 1)
      ++level; // level was not increased in teams construct for team_of_masters
  }
  if (level == 1)
    thr_bar->use_oncore_barrier = 1;
  else
    thr_bar->use_oncore_barrier = 0; // Do not use oncore barrier when nested

  // If the team size has increased, we still communicate with old leaves via
  // oncore barrier.
  unsigned short int old_leaf_kids = thr_bar->leaf_kids;
  kmp_uint64 old_leaf_state = thr_bar->leaf_state;
  team_change = __kmp_init_hierarchical_barrier_thread(bt, thr_bar, nproc, gtid,
                                                       tid, team);
  // But if the entire team changes, we won't use oncore barrier at all
  if (team_change)
    old_leaf_kids = 0;

#if KMP_BARRIER_ICV_PUSH
  if (propagate_icvs) {
    __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team, tid,
                             FALSE);
    if (KMP_MASTER_TID(
            tid)) { // primary already has copy in final destination; copy
      copy_icvs(&thr_bar->th_fixed_icvs,
                &team->t.t_implicit_task_taskdata[tid].td_icvs);
    } else if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME &&
               thr_bar->use_oncore_barrier) { // optimization for inf blocktime
      if (!thr_bar->my_level) // I'm a leaf in the hierarchy (my_level==0)
        // leaves (on-core children) pull parent's fixed ICVs directly to local
        // ICV store
        copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
                  &thr_bar->parent_bar->th_fixed_icvs);
      // non-leaves will get ICVs piggybacked with b_go via NGO store
    } else { // blocktime is not infinite; pull ICVs from parent's fixed ICVs
      if (thr_bar->my_level) // not a leaf; copy ICVs to my fixed ICVs child can
        // access
        copy_icvs(&thr_bar->th_fixed_icvs, &thr_bar->parent_bar->th_fixed_icvs);
      else // leaves copy parent's fixed ICVs directly to local ICV store
        copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
                  &thr_bar->parent_bar->th_fixed_icvs);
    }
  }
#endif // KMP_BARRIER_ICV_PUSH

  // Now, release my children
  if (thr_bar->my_level) { // not a leaf
    kmp_int32 child_tid;
    kmp_uint32 last;
    if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME &&
        thr_bar->use_oncore_barrier) {
      if (KMP_MASTER_TID(tid)) { // do a flat release
        // Set local b_go to bump children via NGO store of the cache line
        // containing ICVs and b_go.
        thr_bar->b_go = KMP_BARRIER_STATE_BUMP;
        // Use ngo stores if available; b_go piggybacks in the last 8 bytes of
        // the cache line
        ngo_load(&thr_bar->th_fixed_icvs);
        // This loops over all the threads skipping only the leaf nodes in the
        // hierarchy
        for (child_tid = thr_bar->skip_per_level[1]; child_tid < (int)nproc;
             child_tid += thr_bar->skip_per_level[1]) {
          kmp_bstate_t *child_bar =
              &team->t.t_threads[child_tid]->th.th_bar[bt].bb;
          KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) "
                        "releasing T#%d(%d:%d)"
                        " go(%p): %u => %u\n",
                        gtid, team->t.t_id, tid,
                        __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
                        child_tid, &child_bar->b_go, child_bar->b_go,
                        child_bar->b_go + KMP_BARRIER_STATE_BUMP));
          // Use ngo store (if available) to both store ICVs and release child
          // via child's b_go
          ngo_store_go(&child_bar->th_fixed_icvs, &thr_bar->th_fixed_icvs);
        }
        ngo_sync();
      }
      TCW_8(thr_bar->b_go,
            KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
      // Now, release leaf children
      if (thr_bar->leaf_kids) { // if there are any
        // We test team_change on the off-chance that the level 1 team changed.
        if (team_change ||
            old_leaf_kids < thr_bar->leaf_kids) { // some old, some new
          if (old_leaf_kids) { // release old leaf kids
            thr_bar->b_go |= old_leaf_state;
          }
          // Release new leaf kids
          last = tid + thr_bar->skip_per_level[1];
          if (last > nproc)
            last = nproc;
          for (child_tid = tid + 1 + old_leaf_kids; child_tid < (int)last;
               ++child_tid) { // skip_per_level[0]=1
            kmp_info_t *child_thr = team->t.t_threads[child_tid];
            kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
            KA_TRACE(
                20,
                ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) releasing"
                 " T#%d(%d:%d) go(%p): %u => %u\n",
                 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
                 team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
                 child_bar->b_go + KMP_BARRIER_STATE_BUMP));
            // Release child using child's b_go flag
            kmp_flag_64<> flag(&child_bar->b_go, child_thr);
            flag.release();
          }
        } else { // Release all children at once with leaf_state bits on my own
          // b_go flag
          thr_bar->b_go |= thr_bar->leaf_state;
        }
      }
    } else { // Blocktime is not infinite; do a simple hierarchical release
      for (int d = thr_bar->my_level - 1; d >= 0;
           --d) { // Release highest level threads first
        last = tid + thr_bar->skip_per_level[d + 1];
        kmp_uint32 skip = thr_bar->skip_per_level[d];
        if (last > nproc)
          last = nproc;
        for (child_tid = tid + skip; child_tid < (int)last; child_tid += skip) {
          kmp_info_t *child_thr = team->t.t_threads[child_tid];
          kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
          KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) "
                        "releasing T#%d(%d:%d) go(%p): %u => %u\n",
                        gtid, team->t.t_id, tid,
                        __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
                        child_tid, &child_bar->b_go, child_bar->b_go,
                        child_bar->b_go + KMP_BARRIER_STATE_BUMP));
          // Release child using child's b_go flag
          kmp_flag_64<> flag(&child_bar->b_go, child_thr);
          flag.release();
        }
      }
    }
#if KMP_BARRIER_ICV_PUSH
    if (propagate_icvs && !KMP_MASTER_TID(tid))
      // non-leaves copy ICVs from fixed ICVs to local dest
      copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
                &thr_bar->th_fixed_icvs);
#endif // KMP_BARRIER_ICV_PUSH
  }
  KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) exit for "
                "barrier type %d\n",
                gtid, team->t.t_id, tid, bt));
}
1736 
1737 // End of Barrier Algorithms
1738 
// Helper type carrying the "was this barrier cancelled?" result of
// __kmp_barrier_template. When cancellable == true it behaves like an
// ordinary boolean; when cancellable == false it is a compile-time
// constant false, so every cancellation check folds away at no cost.
template <bool cancellable> struct is_cancellable {};
template <> struct is_cancellable<true> {
  bool value = false; // runtime cancellation state
  is_cancellable() = default;
  is_cancellable(bool b) : value(b) {}
  is_cancellable &operator=(bool b) {
    value = b;
    return *this;
  }
  operator bool() const { return value; }
};
template <> struct is_cancellable<false> {
  // Assignment is accepted but discarded; the value is always false.
  is_cancellable &operator=(bool) { return *this; }
  constexpr operator bool() const { return false; }
};
1757 
// Internal function to do a barrier.
/* If is_split is true, do a split barrier, otherwise, do a plain barrier
   If reduce is non-NULL, do a split reduction barrier, otherwise, do a split
   barrier
   When cancellable = false,
     Returns 0 if primary thread, 1 if worker thread.
   When cancellable = true
     Returns 0 if not cancelled, 1 if cancelled.
   The gather/release algorithm is selected per barrier type via
   __kmp_barrier_gather_pattern / __kmp_barrier_release_pattern, except that
   cancellable barriers always use the linear algorithm. */
template <bool cancellable = false>
static int __kmp_barrier_template(enum barrier_type bt, int gtid, int is_split,
                                  size_t reduce_size, void *reduce_data,
                                  void (*reduce)(void *, void *)) {
  KMP_TIME_PARTITIONED_BLOCK(OMP_plain_barrier);
  KMP_SET_THREAD_STATE_BLOCK(PLAIN_BARRIER);
  int tid = __kmp_tid_from_gtid(gtid);
  kmp_info_t *this_thr = __kmp_threads[gtid];
  kmp_team_t *team = this_thr->th.th_team;
  int status = 0;
  // Compile-time constant false when !cancellable, so the cancellation
  // checks below fold away entirely in the non-cancellable instantiation.
  is_cancellable<cancellable> cancelled;
#if OMPT_SUPPORT && OMPT_OPTIONAL
  ompt_data_t *my_task_data;
  ompt_data_t *my_parallel_data;
  void *return_address;
  ompt_sync_region_t barrier_kind;
#endif

  KA_TRACE(15, ("__kmp_barrier: T#%d(%d:%d) has arrived\n", gtid,
                __kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid)));

#if OMPT_SUPPORT
  if (ompt_enabled.enabled) {
#if OMPT_OPTIONAL
    // Report sync_region begin (and wait begin) to the tool before blocking.
    my_task_data = OMPT_CUR_TASK_DATA(this_thr);
    my_parallel_data = OMPT_CUR_TEAM_DATA(this_thr);
    return_address = OMPT_LOAD_RETURN_ADDRESS(gtid);
    barrier_kind = __ompt_get_barrier_kind(bt, this_thr);
    if (ompt_enabled.ompt_callback_sync_region) {
      ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
          barrier_kind, ompt_scope_begin, my_parallel_data, my_task_data,
          return_address);
    }
    if (ompt_enabled.ompt_callback_sync_region_wait) {
      ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
          barrier_kind, ompt_scope_begin, my_parallel_data, my_task_data,
          return_address);
    }
#endif
    // It is OK to report the barrier state after the barrier begin callback.
    // According to the OMPT specification, a compliant implementation may
    // even delay reporting this state until the barrier begins to wait.
    auto *ompt_thr_info = &this_thr->th.ompt_thread_info;
    switch (barrier_kind) {
    case ompt_sync_region_barrier_explicit:
      ompt_thr_info->state = ompt_state_wait_barrier_explicit;
      break;
    case ompt_sync_region_barrier_implicit_workshare:
      ompt_thr_info->state = ompt_state_wait_barrier_implicit_workshare;
      break;
    case ompt_sync_region_barrier_implicit_parallel:
      ompt_thr_info->state = ompt_state_wait_barrier_implicit_parallel;
      break;
    case ompt_sync_region_barrier_teams:
      ompt_thr_info->state = ompt_state_wait_barrier_teams;
      break;
    case ompt_sync_region_barrier_implementation:
      [[fallthrough]];
    default:
      ompt_thr_info->state = ompt_state_wait_barrier_implementation;
    }
  }
#endif

  // A serialized team needs no actual synchronization; see the else branch.
  if (!team->t.t_serialized) {
#if USE_ITT_BUILD
    // This value will be used in itt notify events below.
    void *itt_sync_obj = NULL;
#if USE_ITT_NOTIFY
    if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
      itt_sync_obj = __kmp_itt_barrier_object(gtid, bt, 1);
#endif
#endif /* USE_ITT_BUILD */
    if (__kmp_tasking_mode == tskm_extra_barrier) {
      __kmp_tasking_barrier(team, this_thr, gtid);
      KA_TRACE(15,
               ("__kmp_barrier: T#%d(%d:%d) past tasking barrier\n", gtid,
                __kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid)));
    }

    /* Copy the blocktime info to the thread, where __kmp_wait_template() can
       access it when the team struct is not guaranteed to exist. */
    // See note about the corresponding code in __kmp_join_barrier() being
    // performance-critical.
    if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
#if KMP_USE_MONITOR
      this_thr->th.th_team_bt_intervals =
          team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
      this_thr->th.th_team_bt_set =
          team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
#else
      this_thr->th.th_team_bt_intervals = KMP_BLOCKTIME_INTERVAL(team, tid);
#endif
    }

#if USE_ITT_BUILD
    if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
      __kmp_itt_barrier_starting(gtid, itt_sync_obj);
#endif /* USE_ITT_BUILD */
#if USE_DEBUGGER
    // Let the debugger know: the thread arrived to the barrier and waiting.
    if (KMP_MASTER_TID(tid)) { // Primary thread counter stored in team struct
      team->t.t_bar[bt].b_master_arrived += 1;
    } else {
      this_thr->th.th_bar[bt].bb.b_worker_arrived += 1;
    } // if
#endif /* USE_DEBUGGER */
    if (reduce != NULL) {
      // KMP_DEBUG_ASSERT( is_split == TRUE );  // #C69956
      this_thr->th.th_local.reduce_data = reduce_data;
    }

    if (KMP_MASTER_TID(tid) && __kmp_tasking_mode != tskm_immediate_exec)
      __kmp_task_team_setup(this_thr, team);

    // Gather phase: cancellable barriers always use the linear algorithm;
    // otherwise dispatch on the configured gather pattern.
    if (cancellable) {
      cancelled = __kmp_linear_barrier_gather_cancellable(
          bt, this_thr, gtid, tid, reduce USE_ITT_BUILD_ARG(itt_sync_obj));
    } else {
      switch (__kmp_barrier_gather_pattern[bt]) {
      case bp_dist_bar: {
        __kmp_dist_barrier_gather(bt, this_thr, gtid, tid,
                                  reduce USE_ITT_BUILD_ARG(itt_sync_obj));
        break;
      }
      case bp_hyper_bar: {
        // don't set branch bits to 0; use linear
        KMP_ASSERT(__kmp_barrier_gather_branch_bits[bt]);
        __kmp_hyper_barrier_gather(bt, this_thr, gtid, tid,
                                   reduce USE_ITT_BUILD_ARG(itt_sync_obj));
        break;
      }
      case bp_hierarchical_bar: {
        __kmp_hierarchical_barrier_gather(
            bt, this_thr, gtid, tid, reduce USE_ITT_BUILD_ARG(itt_sync_obj));
        break;
      }
      case bp_tree_bar: {
        // don't set branch bits to 0; use linear
        KMP_ASSERT(__kmp_barrier_gather_branch_bits[bt]);
        __kmp_tree_barrier_gather(bt, this_thr, gtid, tid,
                                  reduce USE_ITT_BUILD_ARG(itt_sync_obj));
        break;
      }
      default: {
        __kmp_linear_barrier_gather(bt, this_thr, gtid, tid,
                                    reduce USE_ITT_BUILD_ARG(itt_sync_obj));
      }
      }
    }

    KMP_MB();

    if (KMP_MASTER_TID(tid)) {
      status = 0;
      if (__kmp_tasking_mode != tskm_immediate_exec && !cancelled) {
        __kmp_task_team_wait(this_thr, team USE_ITT_BUILD_ARG(itt_sync_obj));
      }
#if USE_DEBUGGER
      // Let the debugger know: All threads are arrived and starting leaving the
      // barrier.
      team->t.t_bar[bt].b_team_arrived += 1;
#endif

      if (__kmp_omp_cancellation) {
        kmp_int32 cancel_request = KMP_ATOMIC_LD_RLX(&team->t.t_cancel_request);
        // Reset cancellation flag for worksharing constructs
        if (cancel_request == cancel_loop ||
            cancel_request == cancel_sections) {
          KMP_ATOMIC_ST_RLX(&team->t.t_cancel_request, cancel_noreq);
        }
      }
#if USE_ITT_BUILD
      /* TODO: In case of split reduction barrier, primary thread may send
         acquired event early, before the final summation into the shared
         variable is done (final summation can be a long operation for array
         reductions).  */
      if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
        __kmp_itt_barrier_middle(gtid, itt_sync_obj);
#endif /* USE_ITT_BUILD */
#if USE_ITT_BUILD && USE_ITT_NOTIFY
      // Barrier - report frame end (only if active_level == 1)
      if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
          __kmp_forkjoin_frames_mode &&
          (this_thr->th.th_teams_microtask == NULL || // either not in teams
           this_thr->th.th_teams_size.nteams == 1) && // or inside single team
          team->t.t_active_level == 1) {
        ident_t *loc = __kmp_threads[gtid]->th.th_ident;
        kmp_uint64 cur_time = __itt_get_timestamp();
        kmp_info_t **other_threads = team->t.t_threads;
        int nproc = this_thr->th.th_team_nproc;
        int i;
        switch (__kmp_forkjoin_frames_mode) {
        case 1:
          __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0,
                                 loc, nproc);
          this_thr->th.th_frame_time = cur_time;
          break;
        case 2: // AC 2015-01-19: currently does not work for hierarchical (to
          // be fixed)
          __kmp_itt_frame_submit(gtid, this_thr->th.th_bar_min_time, cur_time,
                                 1, loc, nproc);
          break;
        case 3:
          if (__itt_metadata_add_ptr) {
            // Initialize with primary thread's wait time
            kmp_uint64 delta = cur_time - this_thr->th.th_bar_arrive_time;
            // Set arrive time to zero to be able to check it in
            // __kmp_invoke_task(); the same is done inside the loop below
            this_thr->th.th_bar_arrive_time = 0;
            for (i = 1; i < nproc; ++i) {
              delta += (cur_time - other_threads[i]->th.th_bar_arrive_time);
              other_threads[i]->th.th_bar_arrive_time = 0;
            }
            __kmp_itt_metadata_imbalance(gtid, this_thr->th.th_frame_time,
                                         cur_time, delta,
                                         (kmp_uint64)(reduce != NULL));
          }
          __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0,
                                 loc, nproc);
          this_thr->th.th_frame_time = cur_time;
          break;
        }
      }
#endif /* USE_ITT_BUILD */
    } else {
      status = 1;
#if USE_ITT_BUILD
      if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
        __kmp_itt_barrier_middle(gtid, itt_sync_obj);
#endif /* USE_ITT_BUILD */
    }
    // Release phase: workers (status == 1) and non-split barriers go through
    // release; a split barrier's primary thread returns without releasing.
    if ((status == 1 || !is_split) && !cancelled) {
      if (cancellable) {
        cancelled = __kmp_linear_barrier_release_cancellable(
            bt, this_thr, gtid, tid, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
      } else {
        switch (__kmp_barrier_release_pattern[bt]) {
        case bp_dist_bar: {
          KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
          __kmp_dist_barrier_release(bt, this_thr, gtid, tid,
                                     FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
          break;
        }
        case bp_hyper_bar: {
          KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
          __kmp_hyper_barrier_release(bt, this_thr, gtid, tid,
                                      FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
          break;
        }
        case bp_hierarchical_bar: {
          __kmp_hierarchical_barrier_release(
              bt, this_thr, gtid, tid, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
          break;
        }
        case bp_tree_bar: {
          KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
          __kmp_tree_barrier_release(bt, this_thr, gtid, tid,
                                     FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
          break;
        }
        default: {
          __kmp_linear_barrier_release(bt, this_thr, gtid, tid,
                                       FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
        }
        }
      }
      if (__kmp_tasking_mode != tskm_immediate_exec && !cancelled) {
        __kmp_task_team_sync(this_thr, team);
      }
    }

#if USE_ITT_BUILD
    /* GEH: TODO: Move this under if-condition above and also include in
       __kmp_end_split_barrier(). This will more accurately represent the actual
       release time of the threads for split barriers.  */
    if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
      __kmp_itt_barrier_finished(gtid, itt_sync_obj);
#endif /* USE_ITT_BUILD */
  } else { // Team is serialized.
    status = 0;
    if (__kmp_tasking_mode != tskm_immediate_exec) {
      if (this_thr->th.th_task_team != NULL) {
        // A task team exists even though the team is serialized; the assert
        // below shows this only happens for proxy or hidden-helper tasks,
        // which must be completed before the barrier can finish.
#if USE_ITT_NOTIFY
        void *itt_sync_obj = NULL;
        if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
          itt_sync_obj = __kmp_itt_barrier_object(gtid, bt, 1);
          __kmp_itt_barrier_starting(gtid, itt_sync_obj);
        }
#endif

        KMP_DEBUG_ASSERT(
            this_thr->th.th_task_team->tt.tt_found_proxy_tasks == TRUE ||
            this_thr->th.th_task_team->tt.tt_hidden_helper_task_encountered ==
                TRUE);
        __kmp_task_team_wait(this_thr, team USE_ITT_BUILD_ARG(itt_sync_obj));
        __kmp_task_team_setup(this_thr, team);

#if USE_ITT_BUILD
        if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
          __kmp_itt_barrier_finished(gtid, itt_sync_obj);
#endif /* USE_ITT_BUILD */
      }
    }
  }
  KA_TRACE(15, ("__kmp_barrier: T#%d(%d:%d) is leaving with return value %d\n",
                gtid, __kmp_team_from_gtid(gtid)->t.t_id,
                __kmp_tid_from_gtid(gtid), status));

#if OMPT_SUPPORT
  if (ompt_enabled.enabled) {
#if OMPT_OPTIONAL
    if (ompt_enabled.ompt_callback_sync_region_wait) {
      ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
          barrier_kind, ompt_scope_end, my_parallel_data, my_task_data,
          return_address);
    }
    if (ompt_enabled.ompt_callback_sync_region) {
      ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
          barrier_kind, ompt_scope_end, my_parallel_data, my_task_data,
          return_address);
    }
#endif
    this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
  }
#endif

  if (cancellable)
    return (int)cancelled;
  return status;
}
2097 
2098 // Returns 0 if primary thread, 1 if worker thread.
2099 int __kmp_barrier(enum barrier_type bt, int gtid, int is_split,
2100                   size_t reduce_size, void *reduce_data,
2101                   void (*reduce)(void *, void *)) {
2102   return __kmp_barrier_template<>(bt, gtid, is_split, reduce_size, reduce_data,
2103                                   reduce);
2104 }
2105 
#if defined(KMP_GOMP_COMPAT)
// GOMP-compatibility entry: plain barrier that honors OpenMP cancellation.
// Returns 1 if cancelled, 0 otherwise
int __kmp_barrier_gomp_cancel(int gtid) {
  if (!__kmp_omp_cancellation) {
    // Cancellation disabled: behave as an ordinary plain barrier.
    __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);
    return FALSE;
  }
  int cancelled = __kmp_barrier_template<true>(bs_plain_barrier, gtid, FALSE, 0,
                                               NULL, NULL);
  if (cancelled && !KMP_MASTER_TID(__kmp_tid_from_gtid(gtid))) {
    // A cancelled barrier never completed the gather, so workers must undo
    // the bump they applied to their private b_arrived flag on arrival.
    // The primary thread has nothing to revert.
    kmp_info_t *this_thr = __kmp_threads[gtid];
    this_thr->th.th_bar[bs_plain_barrier].bb.b_arrived -=
        KMP_BARRIER_STATE_BUMP;
  }
  return cancelled;
}
#endif
2129 
2130 void __kmp_end_split_barrier(enum barrier_type bt, int gtid) {
2131   KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_end_split_barrier);
2132   KMP_SET_THREAD_STATE_BLOCK(PLAIN_BARRIER);
2133   KMP_DEBUG_ASSERT(bt < bs_last_barrier);
2134   int tid = __kmp_tid_from_gtid(gtid);
2135   kmp_info_t *this_thr = __kmp_threads[gtid];
2136   kmp_team_t *team = this_thr->th.th_team;
2137 
2138   if (!team->t.t_serialized) {
2139     if (KMP_MASTER_GTID(gtid)) {
2140       switch (__kmp_barrier_release_pattern[bt]) {
2141       case bp_dist_bar: {
2142         __kmp_dist_barrier_release(bt, this_thr, gtid, tid,
2143                                    FALSE USE_ITT_BUILD_ARG(NULL));
2144         break;
2145       }
2146       case bp_hyper_bar: {
2147         KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
2148         __kmp_hyper_barrier_release(bt, this_thr, gtid, tid,
2149                                     FALSE USE_ITT_BUILD_ARG(NULL));
2150         break;
2151       }
2152       case bp_hierarchical_bar: {
2153         __kmp_hierarchical_barrier_release(bt, this_thr, gtid, tid,
2154                                            FALSE USE_ITT_BUILD_ARG(NULL));
2155         break;
2156       }
2157       case bp_tree_bar: {
2158         KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
2159         __kmp_tree_barrier_release(bt, this_thr, gtid, tid,
2160                                    FALSE USE_ITT_BUILD_ARG(NULL));
2161         break;
2162       }
2163       default: {
2164         __kmp_linear_barrier_release(bt, this_thr, gtid, tid,
2165                                      FALSE USE_ITT_BUILD_ARG(NULL));
2166       }
2167       }
2168       if (__kmp_tasking_mode != tskm_immediate_exec) {
2169         __kmp_task_team_sync(this_thr, team);
2170       } // if
2171     }
2172   }
2173 }
2174 
// Join barrier at the end of a parallel region. All team threads gather here;
// once the gather completes, the primary thread may deallocate the team, so
// workers must not touch team data after their gather call returns.
// There is no release half here: workers are released later at the fork
// barrier of the next parallel region.
void __kmp_join_barrier(int gtid) {
  KMP_TIME_PARTITIONED_BLOCK(OMP_join_barrier);
  KMP_SET_THREAD_STATE_BLOCK(FORK_JOIN_BARRIER);

  KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);

  kmp_info_t *this_thr = __kmp_threads[gtid];
  kmp_team_t *team;
  int tid;
#ifdef KMP_DEBUG
  int team_id;
#endif /* KMP_DEBUG */
#if USE_ITT_BUILD
  void *itt_sync_obj = NULL;
#if USE_ITT_NOTIFY
  if (__itt_sync_create_ptr || KMP_ITT_DEBUG) // Don't call routine without need
    // Get object created at fork_barrier
    itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
#endif
#endif /* USE_ITT_BUILD */
#if ((USE_ITT_BUILD && USE_ITT_NOTIFY) || defined KMP_DEBUG)
  int nproc = this_thr->th.th_team_nproc;
#endif
  KMP_MB();

  // Get current info
  team = this_thr->th.th_team;
  // NOTE(review): nproc is only declared under (ITT || KMP_DEBUG); this
  // relies on KMP_DEBUG_ASSERT compiling to nothing in other configurations.
  KMP_DEBUG_ASSERT(nproc == team->t.t_nproc);
  tid = __kmp_tid_from_gtid(gtid);
#ifdef KMP_DEBUG
  team_id = team->t.t_id;
  kmp_info_t *master_thread = this_thr->th.th_team_master;
  // Dump the full runtime structure if the cached team-master pointer is
  // inconsistent, to aid debugging before the assert below fires.
  if (master_thread != team->t.t_threads[0]) {
    __kmp_print_structure();
  }
#endif /* KMP_DEBUG */
  // NOTE(review): master_thread is declared only under KMP_DEBUG; this line
  // compiles in release builds only because KMP_DEBUG_ASSERT expands to
  // nothing there.
  KMP_DEBUG_ASSERT(master_thread == team->t.t_threads[0]);
  KMP_MB();

  // Verify state
  KMP_DEBUG_ASSERT(TCR_PTR(this_thr->th.th_team));
  KMP_DEBUG_ASSERT(TCR_PTR(this_thr->th.th_root));
  KMP_DEBUG_ASSERT(this_thr == team->t.t_threads[tid]);
  KA_TRACE(10, ("__kmp_join_barrier: T#%d(%d:%d) arrived at join barrier\n",
                gtid, team_id, tid));

#if OMPT_SUPPORT
  // Report scope-begin of the implicit (or teams) barrier to the tool and
  // move this thread into the corresponding wait state.
  if (ompt_enabled.enabled) {
#if OMPT_OPTIONAL
    ompt_data_t *my_task_data;
    ompt_data_t *my_parallel_data;
    void *codeptr = NULL;
    int ds_tid = this_thr->th.th_info.ds.ds_tid;
    if (KMP_MASTER_TID(ds_tid) &&
        (ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) ||
         ompt_callbacks.ompt_callback(ompt_callback_sync_region)))
      codeptr = team->t.ompt_team_info.master_return_address;
    my_task_data = OMPT_CUR_TASK_DATA(this_thr);
    my_parallel_data = OMPT_CUR_TEAM_DATA(this_thr);
    // A league (teams construct) barrier is reported with teams-specific
    // kind/state instead of the implicit-parallel ones.
    ompt_sync_region_t sync_kind = ompt_sync_region_barrier_implicit_parallel;
    ompt_state_t ompt_state = ompt_state_wait_barrier_implicit_parallel;
    if (this_thr->th.ompt_thread_info.parallel_flags & ompt_parallel_league) {
      sync_kind = ompt_sync_region_barrier_teams;
      ompt_state = ompt_state_wait_barrier_teams;
    }
    if (ompt_enabled.ompt_callback_sync_region) {
      ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
          sync_kind, ompt_scope_begin, my_parallel_data, my_task_data, codeptr);
    }
    if (ompt_enabled.ompt_callback_sync_region_wait) {
      ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
          sync_kind, ompt_scope_begin, my_parallel_data, my_task_data, codeptr);
    }
    // Workers stash their task data: the team (and its task data) may be
    // freed by the primary thread before workers resume.
    if (!KMP_MASTER_TID(ds_tid))
      this_thr->th.ompt_thread_info.task_data = *OMPT_CUR_TASK_DATA(this_thr);
#endif
    // NOTE(review): ompt_state is declared under OMPT_OPTIONAL but used here;
    // this assumes OMPT_OPTIONAL is enabled whenever OMPT_SUPPORT is —
    // confirm against the build configuration.
    this_thr->th.ompt_thread_info.state = ompt_state;
  }
#endif

  if (__kmp_tasking_mode == tskm_extra_barrier) {
    __kmp_tasking_barrier(team, this_thr, gtid);
    KA_TRACE(10, ("__kmp_join_barrier: T#%d(%d:%d) past tasking barrier\n",
                  gtid, team_id, tid));
  }
#ifdef KMP_DEBUG
  if (__kmp_tasking_mode != tskm_immediate_exec) {
    KA_TRACE(20, ("__kmp_join_barrier: T#%d, old team = %d, old task_team = "
                  "%p, th_task_team = %p\n",
                  __kmp_gtid_from_thread(this_thr), team_id,
                  team->t.t_task_team[this_thr->th.th_task_state],
                  this_thr->th.th_task_team));
    KMP_DEBUG_ASSERT_TASKTEAM_INVARIANT(team, this_thr);
  }
#endif /* KMP_DEBUG */

  /* Copy the blocktime info to the thread, where __kmp_wait_template() can
     access it when the team struct is not guaranteed to exist. Doing these
     loads causes a cache miss slows down EPCC parallel by 2x. As a workaround,
     we do not perform the copy if blocktime=infinite, since the values are not
     used by __kmp_wait_template() in that case. */
  if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
#if KMP_USE_MONITOR
    this_thr->th.th_team_bt_intervals =
        team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
    this_thr->th.th_team_bt_set =
        team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
#else
    this_thr->th.th_team_bt_intervals = KMP_BLOCKTIME_INTERVAL(team, tid);
#endif
  }

#if USE_ITT_BUILD
  if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
    __kmp_itt_barrier_starting(gtid, itt_sync_obj);
#endif /* USE_ITT_BUILD */

  // Gather using the configured pattern for the fork/join barrier. No
  // reduction callback is passed (NULL) at the join barrier.
  switch (__kmp_barrier_gather_pattern[bs_forkjoin_barrier]) {
  case bp_dist_bar: {
    __kmp_dist_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
                              NULL USE_ITT_BUILD_ARG(itt_sync_obj));
    break;
  }
  case bp_hyper_bar: {
    KMP_ASSERT(__kmp_barrier_gather_branch_bits[bs_forkjoin_barrier]);
    __kmp_hyper_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
                               NULL USE_ITT_BUILD_ARG(itt_sync_obj));
    break;
  }
  case bp_hierarchical_bar: {
    __kmp_hierarchical_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
                                      NULL USE_ITT_BUILD_ARG(itt_sync_obj));
    break;
  }
  case bp_tree_bar: {
    KMP_ASSERT(__kmp_barrier_gather_branch_bits[bs_forkjoin_barrier]);
    __kmp_tree_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
                              NULL USE_ITT_BUILD_ARG(itt_sync_obj));
    break;
  }
  default: {
    __kmp_linear_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
                                NULL USE_ITT_BUILD_ARG(itt_sync_obj));
  }
  }

  /* From this point on, the team data structure may be deallocated at any time
     by the primary thread - it is unsafe to reference it in any of the worker
     threads. Any per-team data items that need to be referenced before the
     end of the barrier should be moved to the kmp_task_team_t structs.  */
  if (KMP_MASTER_TID(tid)) {
    if (__kmp_tasking_mode != tskm_immediate_exec) {
      // Drain remaining tasks before the team can be torn down.
      __kmp_task_team_wait(this_thr, team USE_ITT_BUILD_ARG(itt_sync_obj));
    }
    if (__kmp_display_affinity) {
      KMP_CHECK_UPDATE(team->t.t_display_affinity, 0);
    }
#if KMP_STATS_ENABLED
    // Have primary thread flag the workers to indicate they are now waiting for
    // next parallel region, Also wake them up so they switch their timers to
    // idle.
    for (int i = 0; i < team->t.t_nproc; ++i) {
      kmp_info_t *team_thread = team->t.t_threads[i];
      if (team_thread == this_thr)
        continue;
      team_thread->th.th_stats->setIdleFlag();
      if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME &&
          team_thread->th.th_sleep_loc != NULL)
        __kmp_null_resume_wrapper(team_thread);
    }
#endif
#if USE_ITT_BUILD
    if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
      __kmp_itt_barrier_middle(gtid, itt_sync_obj);
#endif /* USE_ITT_BUILD */

#if USE_ITT_BUILD && USE_ITT_NOTIFY
    // Join barrier - report frame end
    if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
        __kmp_forkjoin_frames_mode &&
        (this_thr->th.th_teams_microtask == NULL || // either not in teams
         this_thr->th.th_teams_size.nteams == 1) && // or inside single team
        team->t.t_active_level == 1) {
      kmp_uint64 cur_time = __itt_get_timestamp();
      ident_t *loc = team->t.t_ident;
      kmp_info_t **other_threads = team->t.t_threads;
      switch (__kmp_forkjoin_frames_mode) {
      case 1:
        __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0,
                               loc, nproc);
        break;
      case 2:
        __kmp_itt_frame_submit(gtid, this_thr->th.th_bar_min_time, cur_time, 1,
                               loc, nproc);
        break;
      case 3:
        if (__itt_metadata_add_ptr) {
          // Initialize with primary thread's wait time
          kmp_uint64 delta = cur_time - this_thr->th.th_bar_arrive_time;
          // Set arrive time to zero to be able to check it in
          // __kmp_invoke_task(); the same is done inside the loop below
          this_thr->th.th_bar_arrive_time = 0;
          for (int i = 1; i < nproc; ++i) {
            delta += (cur_time - other_threads[i]->th.th_bar_arrive_time);
            other_threads[i]->th.th_bar_arrive_time = 0;
          }
          __kmp_itt_metadata_imbalance(gtid, this_thr->th.th_frame_time,
                                       cur_time, delta, 0);
        }
        __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0,
                               loc, nproc);
        this_thr->th.th_frame_time = cur_time;
        break;
      }
    }
#endif /* USE_ITT_BUILD */
  }
#if USE_ITT_BUILD
  else {
    if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
      __kmp_itt_barrier_middle(gtid, itt_sync_obj);
  }
#endif /* USE_ITT_BUILD */

#if KMP_DEBUG
  if (KMP_MASTER_TID(tid)) {
    KA_TRACE(
        15,
        ("__kmp_join_barrier: T#%d(%d:%d) says all %d team threads arrived\n",
         gtid, team_id, tid, nproc));
  }
#endif /* KMP_DEBUG */

  // TODO now, mark worker threads as done so they may be disbanded
  KMP_MB(); // Flush all pending memory write invalidates.
  KA_TRACE(10,
           ("__kmp_join_barrier: T#%d(%d:%d) leaving\n", gtid, team_id, tid));

}
2414 
2415 // TODO release worker threads' fork barriers as we are ready instead of all at
2416 // once
// Fork barrier at the start of a parallel region (and where workers park
// between regions). The primary thread sets up the task team and then
// releases the workers; workers wait here until released. On entry, only the
// primary thread (tid == 0) has a valid th_team pointer; workers re-read it
// after the release.
void __kmp_fork_barrier(int gtid, int tid) {
  KMP_TIME_PARTITIONED_BLOCK(OMP_fork_barrier);
  KMP_SET_THREAD_STATE_BLOCK(FORK_JOIN_BARRIER);
  kmp_info_t *this_thr = __kmp_threads[gtid];
  // Workers deliberately get NULL here; their team pointer is refreshed below.
  kmp_team_t *team = (tid == 0) ? this_thr->th.th_team : NULL;
#if USE_ITT_BUILD
  void *itt_sync_obj = NULL;
#endif /* USE_ITT_BUILD */
#ifdef KMP_DEBUG
  if (team)
    KA_TRACE(10, ("__kmp_fork_barrier: T#%d(%d:%d) has arrived\n", gtid,
                  (team != NULL) ? team->t.t_id : -1, tid));
#endif
  // th_team pointer only valid for primary thread here
  if (KMP_MASTER_TID(tid)) {
#if USE_ITT_BUILD && USE_ITT_NOTIFY
    if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
      // Create itt barrier object
      itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 1);
      __kmp_itt_barrier_middle(gtid, itt_sync_obj); // Call acquired/releasing
    }
#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */

#ifdef KMP_DEBUG
    KMP_DEBUG_ASSERT(team);
    kmp_info_t **other_threads = team->t.t_threads;
    int i;

    // Verify state
    KMP_MB();

    // Check that every worker's fork-barrier go flag is back at its initial
    // value (ignoring the sleep bit) and that it still belongs to this team.
    for (i = 1; i < team->t.t_nproc; ++i) {
      KA_TRACE(500,
               ("__kmp_fork_barrier: T#%d(%d:0) checking T#%d(%d:%d) fork go "
                "== %u.\n",
                gtid, team->t.t_id, other_threads[i]->th.th_info.ds.ds_gtid,
                team->t.t_id, other_threads[i]->th.th_info.ds.ds_tid,
                other_threads[i]->th.th_bar[bs_forkjoin_barrier].bb.b_go));
      KMP_DEBUG_ASSERT(
          (TCR_4(other_threads[i]->th.th_bar[bs_forkjoin_barrier].bb.b_go) &
           ~(KMP_BARRIER_SLEEP_STATE)) == KMP_INIT_BARRIER_STATE);
      KMP_DEBUG_ASSERT(other_threads[i]->th.th_team == team);
    }
#endif

    if (__kmp_tasking_mode != tskm_immediate_exec)
      __kmp_task_team_setup(this_thr, team);

    /* The primary thread may have changed its blocktime between join barrier
       and fork barrier. Copy the blocktime info to the thread, where
       __kmp_wait_template() can access it when the team struct is not
       guaranteed to exist. */
    // See note about the corresponding code in __kmp_join_barrier() being
    // performance-critical
    if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
#if KMP_USE_MONITOR
      this_thr->th.th_team_bt_intervals =
          team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
      this_thr->th.th_team_bt_set =
          team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
#else
      this_thr->th.th_team_bt_intervals = KMP_BLOCKTIME_INTERVAL(team, tid);
#endif
    }
  } // primary thread

  // Release phase: primary wakes the workers; workers block here until
  // released. The TRUE argument propagates ICVs where the pattern supports it.
  switch (__kmp_barrier_release_pattern[bs_forkjoin_barrier]) {
  case bp_dist_bar: {
    __kmp_dist_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
                               TRUE USE_ITT_BUILD_ARG(NULL));
    break;
  }
  case bp_hyper_bar: {
    KMP_ASSERT(__kmp_barrier_release_branch_bits[bs_forkjoin_barrier]);
    __kmp_hyper_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
                                TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
    break;
  }
  case bp_hierarchical_bar: {
    __kmp_hierarchical_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
                                       TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
    break;
  }
  case bp_tree_bar: {
    KMP_ASSERT(__kmp_barrier_release_branch_bits[bs_forkjoin_barrier]);
    __kmp_tree_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
                               TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
    break;
  }
  default: {
    __kmp_linear_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
                                 TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
  }
  }

#if OMPT_SUPPORT
  // If this thread was waiting in an implicit/teams barrier, report the end
  // of the sync region and of the worker's implicit task to the tool.
  ompt_state_t ompt_state = this_thr->th.ompt_thread_info.state;
  if (ompt_enabled.enabled &&
      (ompt_state == ompt_state_wait_barrier_teams ||
       ompt_state == ompt_state_wait_barrier_implicit_parallel)) {
    int ds_tid = this_thr->th.th_info.ds.ds_tid;
    // Workers use the task data stashed at the join barrier, since the old
    // team may already be gone.
    ompt_data_t *task_data = (team)
                                 ? OMPT_CUR_TASK_DATA(this_thr)
                                 : &(this_thr->th.ompt_thread_info.task_data);
    this_thr->th.ompt_thread_info.state = ompt_state_overhead;
#if OMPT_OPTIONAL
    void *codeptr = NULL;
    if (KMP_MASTER_TID(ds_tid) &&
        (ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) ||
         ompt_callbacks.ompt_callback(ompt_callback_sync_region)))
      codeptr = team ? team->t.ompt_team_info.master_return_address : NULL;
    ompt_sync_region_t sync_kind = ompt_sync_region_barrier_implicit_parallel;
    if (this_thr->th.ompt_thread_info.parallel_flags & ompt_parallel_league)
      sync_kind = ompt_sync_region_barrier_teams;
    if (ompt_enabled.ompt_callback_sync_region_wait) {
      ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
          sync_kind, ompt_scope_end, NULL, task_data, codeptr);
    }
    if (ompt_enabled.ompt_callback_sync_region) {
      ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
          sync_kind, ompt_scope_end, NULL, task_data, codeptr);
    }
#endif
    if (!KMP_MASTER_TID(ds_tid) && ompt_enabled.ompt_callback_implicit_task) {
      ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
          ompt_scope_end, NULL, task_data, 0, ds_tid,
          ompt_task_implicit); // TODO: Can this be ompt_task_initial?
    }
  }
#endif

  // Early exit for reaping threads releasing forkjoin barrier
  if (TCR_4(__kmp_global.g.g_done)) {
    // Runtime is shutting down: drop the task team and bail out without
    // touching (possibly freed) team state.
    this_thr->th.th_task_team = NULL;

#if USE_ITT_BUILD && USE_ITT_NOTIFY
    if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
      if (!KMP_MASTER_TID(tid)) {
        itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
        if (itt_sync_obj)
          __kmp_itt_barrier_finished(gtid, itt_sync_obj);
      }
    }
#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
    KA_TRACE(10, ("__kmp_fork_barrier: T#%d is leaving early\n", gtid));
    return;
  }

  /* We can now assume that a valid team structure has been allocated by the
     primary thread and propagated to all worker threads. The current thread,
     however, may not be part of the team, so we can't blindly assume that the
     team pointer is non-null.  */
  team = (kmp_team_t *)TCR_PTR(this_thr->th.th_team);
  KMP_DEBUG_ASSERT(team != NULL);
  tid = __kmp_tid_from_gtid(gtid);

#if KMP_BARRIER_ICV_PULL
  /* Primary thread's copy of the ICVs was set up on the implicit taskdata in
     __kmp_reinitialize_team. __kmp_fork_call() assumes the primary thread's
     implicit task has this data before this function is called. We cannot
     modify __kmp_fork_call() to look at the fixed ICVs in the primary thread's
     thread struct, because it is not always the case that the threads arrays
     have been allocated when __kmp_fork_call() is executed. */
  {
    KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(USER_icv_copy);
    if (!KMP_MASTER_TID(tid)) { // primary thread already has ICVs
      // Copy the initial ICVs from the primary thread's thread struct to the
      // implicit task for this tid.
      KA_TRACE(10,
               ("__kmp_fork_barrier: T#%d(%d) is PULLing ICVs\n", gtid, tid));
      __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team,
                               tid, FALSE);
      copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
                &team->t.t_threads[0]
                     ->th.th_bar[bs_forkjoin_barrier]
                     .bb.th_fixed_icvs);
    }
  }
#endif // KMP_BARRIER_ICV_PULL

  if (__kmp_tasking_mode != tskm_immediate_exec) {
    __kmp_task_team_sync(this_thr, team);
  }

#if KMP_AFFINITY_SUPPORTED
  // Apply any affinity changes required for the new region.
  kmp_proc_bind_t proc_bind = team->t.t_proc_bind;
  if (proc_bind == proc_bind_intel) {
    // Call dynamic affinity settings
    if (__kmp_affinity.type == affinity_balanced && team->t.t_size_changed) {
      __kmp_balanced_affinity(this_thr, team->t.t_nproc);
    }
  } else if (proc_bind != proc_bind_false) {
    if (this_thr->th.th_new_place == this_thr->th.th_current_place) {
      KA_TRACE(100, ("__kmp_fork_barrier: T#%d already in correct place %d\n",
                     __kmp_gtid_from_thread(this_thr),
                     this_thr->th.th_current_place));
    } else {
      __kmp_affinity_bind_place(gtid);
    }
  }
#endif // KMP_AFFINITY_SUPPORTED
  // Perform the display affinity functionality
  if (__kmp_display_affinity) {
    if (team->t.t_display_affinity
#if KMP_AFFINITY_SUPPORTED
        || (__kmp_affinity.type == affinity_balanced && team->t.t_size_changed)
#endif
    ) {
      // NULL means use the affinity-format-var ICV
      __kmp_aux_display_affinity(gtid, NULL);
      this_thr->th.th_prev_num_threads = team->t.t_nproc;
      this_thr->th.th_prev_level = team->t.t_level;
    }
  }
  if (!KMP_MASTER_TID(tid))
    KMP_CHECK_UPDATE(this_thr->th.th_def_allocator, team->t.t_def_allocator);

#if USE_ITT_BUILD && USE_ITT_NOTIFY
  if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
    if (!KMP_MASTER_TID(tid)) {
      // Get correct barrier object
      itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
      __kmp_itt_barrier_finished(gtid, itt_sync_obj); // Workers call acquired
    } // (prepare called inside barrier_release)
  }
#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
  KA_TRACE(10, ("__kmp_fork_barrier: T#%d(%d:%d) is leaving\n", gtid,
                team->t.t_id, tid));
}
2646 
// Stage the new ICVs (internal control variables) so the team's threads pick
// them up, using whichever propagation scheme this build was compiled with:
//   - KMP_BARRIER_ICV_PULL: publish once in the primary thread's thread
//     struct; workers copy them during the fork barrier.
//   - KMP_BARRIER_ICV_PUSH: nothing here; the fork barrier pushes them.
//   - otherwise: copy directly into every worker's implicit task (O(nthreads)).
//
// team      - team whose threads receive the ICVs (threads[0] must exist)
// new_nproc - number of threads to cover in the linear-copy scheme
// new_icvs  - source ICV values
// loc       - source location used when (re)initializing implicit tasks
void __kmp_setup_icv_copy(kmp_team_t *team, int new_nproc,
                          kmp_internal_control_t *new_icvs, ident_t *loc) {
  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_setup_icv_copy);

  KMP_DEBUG_ASSERT(team && new_nproc && new_icvs);
  KMP_DEBUG_ASSERT((!TCR_4(__kmp_init_parallel)) || new_icvs->nproc);

/* Primary thread's copy of the ICVs was set up on the implicit taskdata in
   __kmp_reinitialize_team. __kmp_fork_call() assumes the primary thread's
   implicit task has this data before this function is called. */
#if KMP_BARRIER_ICV_PULL
  /* Copy ICVs to primary thread's thread structure into th_fixed_icvs (which
     remains untouched), where all of the worker threads can access them and
     make their own copies after the barrier. */
  KMP_DEBUG_ASSERT(team->t.t_threads[0]); // The threads arrays should be
  // allocated at this point
  copy_icvs(
      &team->t.t_threads[0]->th.th_bar[bs_forkjoin_barrier].bb.th_fixed_icvs,
      new_icvs);
  KF_TRACE(10, ("__kmp_setup_icv_copy: PULL: T#%d this_thread=%p team=%p\n", 0,
                team->t.t_threads[0], team));
#elif KMP_BARRIER_ICV_PUSH
  // The ICVs will be propagated in the fork barrier, so nothing needs to be
  // done here.
  KF_TRACE(10, ("__kmp_setup_icv_copy: PUSH: T#%d this_thread=%p team=%p\n", 0,
                team->t.t_threads[0], team));
#else
  // Copy the ICVs to each of the non-primary threads.  This takes O(nthreads)
  // time.
  // ngo_load/ngo_store_icvs use non-globally-ordered MIC stores when
  // available (see macros at the top of the file); otherwise plain copies.
  ngo_load(new_icvs);
  KMP_DEBUG_ASSERT(team->t.t_threads[0]); // The threads arrays should be
  // allocated at this point
  for (int f = 1; f < new_nproc; ++f) { // Skip the primary thread
    // TODO: GEH - pass in better source location info since usually NULL here
    KF_TRACE(10, ("__kmp_setup_icv_copy: LINEAR: T#%d this_thread=%p team=%p\n",
                  f, team->t.t_threads[f], team));
    __kmp_init_implicit_task(loc, team->t.t_threads[f], team, f, FALSE);
    ngo_store_icvs(&team->t.t_implicit_task_taskdata[f].td_icvs, new_icvs);
    KF_TRACE(10, ("__kmp_setup_icv_copy: LINEAR: T#%d this_thread=%p team=%p\n",
                  f, team->t.t_threads[f], team));
  }
  ngo_sync();
#endif // KMP_BARRIER_ICV_PULL
}
2691