xref: /freebsd/contrib/llvm-project/openmp/runtime/src/kmp_barrier.cpp (revision c1d255d3ffdbe447de3ab875bf4e7d7accc5bfc5)
1 /*
2  * kmp_barrier.cpp
3  */
4 
5 //===----------------------------------------------------------------------===//
6 //
7 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8 // See https://llvm.org/LICENSE.txt for license information.
9 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "kmp.h"
14 #include "kmp_wait_release.h"
15 #include "kmp_itt.h"
16 #include "kmp_os.h"
17 #include "kmp_stats.h"
18 #include "ompt-specific.h"
19 
20 #if KMP_MIC
21 #include <immintrin.h>
22 #define USE_NGO_STORES 1
23 #endif // KMP_MIC
24 
25 #include "tsan_annotations.h"
26 
#if KMP_MIC && USE_NGO_STORES
// ICV copying
// KMP_MIC variant: ngo_load() declares a local __m512d named Vt in the
// expanding scope and fills it from src. Both store macros write that same Vt
// to dst with _mm512_storenrngo_pd and do NOT use their own src argument, so an
// ngo_load() must appear earlier in the same scope.
#define ngo_load(src) __m512d Vt = _mm512_load_pd((void *)(src))
#define ngo_store_icvs(dst, src) _mm512_storenrngo_pd((void *)(dst), Vt)
#define ngo_store_go(dst, src) _mm512_storenrngo_pd((void *)(dst), Vt)
// Locked add of 0 to the word at the stack pointer, with a "memory" clobber.
#define ngo_sync() __asm__ volatile("lock; addl $0,0(%%rsp)" ::: "memory")
#else
// Generic variant: ngo_load() and ngo_sync() expand to no-ops; the stores copy
// directly from src (copy_icvs for ICVs, a CACHE_LINE-sized KMP_MEMCPY for the
// go flag's line).
#define ngo_load(src) ((void)0)
#define ngo_store_icvs(dst, src) copy_icvs((dst), (src))
#define ngo_store_go(dst, src) KMP_MEMCPY((dst), (src), CACHE_LINE)
#define ngo_sync() ((void)0)
#endif /* KMP_MIC && USE_NGO_STORES */
39 
40 void __kmp_print_structure(void); // Forward declaration
41 
42 // ---------------------------- Barrier Algorithms ----------------------------
43 
44 // Linear Barrier
45 template <bool cancellable = false>
46 static bool __kmp_linear_barrier_gather_template(
47     enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
48     void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
49   KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_linear_gather);
50   kmp_team_t *team = this_thr->th.th_team;
51   kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
52   kmp_info_t **other_threads = team->t.t_threads;
53 
54   KA_TRACE(
55       20,
56       ("__kmp_linear_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
57        gtid, team->t.t_id, tid, bt));
58   KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
59 
60 #if USE_ITT_BUILD && USE_ITT_NOTIFY
61   // Barrier imbalance - save arrive time to the thread
62   if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
63     this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time =
64         __itt_get_timestamp();
65   }
66 #endif
67   // We now perform a linear reduction to signal that all of the threads have
68   // arrived.
69   if (!KMP_MASTER_TID(tid)) {
70     KA_TRACE(20,
71              ("__kmp_linear_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d)"
72               "arrived(%p): %llu => %llu\n",
73               gtid, team->t.t_id, tid, __kmp_gtid_from_tid(0, team),
74               team->t.t_id, 0, &thr_bar->b_arrived, thr_bar->b_arrived,
75               thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
76     // Mark arrival to master thread
77     /* After performing this write, a worker thread may not assume that the team
78        is valid any more - it could be deallocated by the master thread at any
79        time. */
80     ANNOTATE_BARRIER_BEGIN(this_thr);
81     kmp_flag_64<> flag(&thr_bar->b_arrived, other_threads[0]);
82     flag.release();
83   } else {
84     kmp_balign_team_t *team_bar = &team->t.t_bar[bt];
85     int nproc = this_thr->th.th_team_nproc;
86     int i;
87     // Don't have to worry about sleep bit here or atomic since team setting
88     kmp_uint64 new_state = team_bar->b_arrived + KMP_BARRIER_STATE_BUMP;
89 
90     // Collect all the worker team member threads.
91     for (i = 1; i < nproc; ++i) {
92 #if KMP_CACHE_MANAGE
93       // Prefetch next thread's arrived count
94       if (i + 1 < nproc)
95         KMP_CACHE_PREFETCH(&other_threads[i + 1]->th.th_bar[bt].bb.b_arrived);
96 #endif /* KMP_CACHE_MANAGE */
97       KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%d) "
98                     "arrived(%p) == %llu\n",
99                     gtid, team->t.t_id, tid, __kmp_gtid_from_tid(i, team),
100                     team->t.t_id, i,
101                     &other_threads[i]->th.th_bar[bt].bb.b_arrived, new_state));
102 
103       // Wait for worker thread to arrive
104       if (cancellable) {
105         kmp_flag_64<true, false> flag(
106             &other_threads[i]->th.th_bar[bt].bb.b_arrived, new_state);
107         if (flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj)))
108           return true;
109       } else {
110         kmp_flag_64<> flag(&other_threads[i]->th.th_bar[bt].bb.b_arrived,
111                            new_state);
112         flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
113       }
114       ANNOTATE_BARRIER_END(other_threads[i]);
115 #if USE_ITT_BUILD && USE_ITT_NOTIFY
116       // Barrier imbalance - write min of the thread time and the other thread
117       // time to the thread.
118       if (__kmp_forkjoin_frames_mode == 2) {
119         this_thr->th.th_bar_min_time = KMP_MIN(
120             this_thr->th.th_bar_min_time, other_threads[i]->th.th_bar_min_time);
121       }
122 #endif
123       if (reduce) {
124         KA_TRACE(100,
125                  ("__kmp_linear_barrier_gather: T#%d(%d:%d) += T#%d(%d:%d)\n",
126                   gtid, team->t.t_id, tid, __kmp_gtid_from_tid(i, team),
127                   team->t.t_id, i));
128         ANNOTATE_REDUCE_AFTER(reduce);
129         OMPT_REDUCTION_DECL(this_thr, gtid);
130         OMPT_REDUCTION_BEGIN;
131         (*reduce)(this_thr->th.th_local.reduce_data,
132                   other_threads[i]->th.th_local.reduce_data);
133         OMPT_REDUCTION_END;
134         ANNOTATE_REDUCE_BEFORE(reduce);
135         ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
136       }
137     }
138     // Don't have to worry about sleep bit here or atomic since team setting
139     team_bar->b_arrived = new_state;
140     KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) set team %d "
141                   "arrived(%p) = %llu\n",
142                   gtid, team->t.t_id, tid, team->t.t_id, &team_bar->b_arrived,
143                   new_state));
144   }
145   KA_TRACE(
146       20,
147       ("__kmp_linear_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
148        gtid, team->t.t_id, tid, bt));
149   return false;
150 }
151 
// Release phase of the linear barrier.
//
// Master path (KMP_MASTER_TID(tid)): when the team has more than one thread,
// optionally copies the ICVs of implicit task 0 to every other implicit task
// (KMP_BARRIER_ICV_PUSH builds with propagate_icvs non-zero) and then releases
// a flag on each worker's b_go, in tid order.
// Worker path: waits on its own b_go for KMP_BARRIER_STATE_BUMP, performs the
// ITT bookkeeping, exits early if this is the fork/join barrier and
// __kmp_global.g.g_done is set, and otherwise resets b_go to
// KMP_INIT_BARRIER_STATE.
//
// Template parameter:
//   cancellable    - when true the worker waits with kmp_flag_64<true, false>
//                    and returns true as soon as that wait returns true.
// Parameters:
//   bt             - barrier type; indexes th_bar[].
//   this_thr       - descriptor of the calling thread.
//   gtid           - global thread id of the caller.
//   tid            - team-local id of the caller.
//   propagate_icvs - non-zero to push ICVs from the master to the workers.
//   itt_sync_obj   - ITT sync object (USE_ITT_BUILD only).
// Returns true only for the cancellable early exit; false otherwise.
template <bool cancellable = false>
static bool __kmp_linear_barrier_release_template(
    enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
    int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_linear_release);
  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
  // Deliberately not initialized here: it is assigned on the master path, and
  // on the worker path only inside the KMP_DEBUG section further down.
  kmp_team_t *team;

  if (KMP_MASTER_TID(tid)) {
    unsigned int i;
    kmp_uint32 nproc = this_thr->th.th_team_nproc;
    kmp_info_t **other_threads;

    team = __kmp_threads[gtid]->th.th_team;
    KMP_DEBUG_ASSERT(team != NULL);
    other_threads = team->t.t_threads;

    KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d(%d:%d) master enter for "
                  "barrier type %d\n",
                  gtid, team->t.t_id, tid, bt));

    if (nproc > 1) {
#if KMP_BARRIER_ICV_PUSH
      {
        KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(USER_icv_copy);
        if (propagate_icvs) {
          // Load the master's ICVs once (a real load only on the KMP_MIC/NGO
          // path), then initialize each worker's implicit task and overwrite
          // its ICVs with the master's.
          ngo_load(&team->t.t_implicit_task_taskdata[0].td_icvs);
          for (i = 1; i < nproc; ++i) {
            __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[i],
                                     team, i, FALSE);
            ngo_store_icvs(&team->t.t_implicit_task_taskdata[i].td_icvs,
                           &team->t.t_implicit_task_taskdata[0].td_icvs);
          }
          ngo_sync();
        }
      }
#endif // KMP_BARRIER_ICV_PUSH

      // Now, release all of the worker threads
      for (i = 1; i < nproc; ++i) {
#if KMP_CACHE_MANAGE
        // Prefetch next thread's go flag
        if (i + 1 < nproc)
          KMP_CACHE_PREFETCH(&other_threads[i + 1]->th.th_bar[bt].bb.b_go);
#endif /* KMP_CACHE_MANAGE */
        // NOTE(review): b_go is handed to kmp_flag_64 below but printed with
        // %u here - confirm the format matches the field's width.
        KA_TRACE(
            20,
            ("__kmp_linear_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%d) "
             "go(%p): %u => %u\n",
             gtid, team->t.t_id, tid, other_threads[i]->th.th_info.ds.ds_gtid,
             team->t.t_id, i, &other_threads[i]->th.th_bar[bt].bb.b_go,
             other_threads[i]->th.th_bar[bt].bb.b_go,
             other_threads[i]->th.th_bar[bt].bb.b_go + KMP_BARRIER_STATE_BUMP));
        ANNOTATE_BARRIER_BEGIN(other_threads[i]);
        kmp_flag_64<> flag(&other_threads[i]->th.th_bar[bt].bb.b_go,
                         other_threads[i]);
        flag.release();
      }
    }
  } else { // Wait for the MASTER thread to release us
    KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d wait go(%p) == %u\n",
                  gtid, &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
    if (cancellable) {
      kmp_flag_64<true, false> flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
      // A true result leaves immediately; b_go is not reset on this path.
      if (flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj)))
        return true;
    } else {
      kmp_flag_64<> flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
      flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
    }
    ANNOTATE_BARRIER_END(this_thr);
    // In ITT builds the `else` below attaches to the shutdown check that
    // follows the #endif, so that check runs exactly once on either branch.
#if USE_ITT_BUILD && USE_ITT_NOTIFY
    if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
      // In a fork barrier; cannot get the object reliably (or ITTNOTIFY is
      // disabled)
      itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
      // Cancel wait on previous parallel region...
      __kmp_itt_task_starting(itt_sync_obj);

      if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
        return false;

      itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
      if (itt_sync_obj != NULL)
        // Call prepare as early as possible for "new" barrier
        __kmp_itt_task_finished(itt_sync_obj);
    } else
#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
        // Early exit for reaping threads releasing forkjoin barrier
        if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
      return false;
// The worker thread may now assume that the team is valid.
// tid and team are refreshed only in debug builds. NOTE(review): the remaining
// worker-path uses of `team` are KMP_DEBUG_ASSERT and KA_TRACE, which
// presumably compile away without KMP_DEBUG - confirm, otherwise `team` would
// be read uninitialized.
#ifdef KMP_DEBUG
    tid = __kmp_tid_from_gtid(gtid);
    team = __kmp_threads[gtid]->th.th_team;
#endif
    KMP_DEBUG_ASSERT(team != NULL);
    // Re-arm the go flag for the next barrier instance.
    TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
    KA_TRACE(20,
             ("__kmp_linear_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
              gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
    KMP_MB(); // Flush all pending memory write invalidates.
  }
  KA_TRACE(
      20,
      ("__kmp_linear_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
       gtid, team->t.t_id, tid, bt));
  return false;
}
261 
262 static void __kmp_linear_barrier_gather(
263     enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
264     void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
265   __kmp_linear_barrier_gather_template<false>(
266       bt, this_thr, gtid, tid, reduce USE_ITT_BUILD_ARG(itt_sync_obj));
267 }
268 
269 static bool __kmp_linear_barrier_gather_cancellable(
270     enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
271     void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
272   return __kmp_linear_barrier_gather_template<true>(
273       bt, this_thr, gtid, tid, reduce USE_ITT_BUILD_ARG(itt_sync_obj));
274 }
275 
276 static void __kmp_linear_barrier_release(
277     enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
278     int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
279   __kmp_linear_barrier_release_template<false>(
280       bt, this_thr, gtid, tid, propagate_icvs USE_ITT_BUILD_ARG(itt_sync_obj));
281 }
282 
283 static bool __kmp_linear_barrier_release_cancellable(
284     enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
285     int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
286   return __kmp_linear_barrier_release_template<true>(
287       bt, this_thr, gtid, tid, propagate_icvs USE_ITT_BUILD_ARG(itt_sync_obj));
288 }
289 
290 // Tree barrier
// Gather phase of the tree barrier.
//
// Threads form a tree with branching factor 2^branch_bits, where branch_bits
// comes from __kmp_barrier_gather_branch_bits[bt]. The children of tid are
// (tid << branch_bits) + 1 .. + branch_factor, limited to tids below nproc,
// and the parent of a non-master tid is (tid - 1) >> branch_bits.
// Each thread first waits for all of its children (folding in their reduction
// data when reduce is non-NULL) and then either signals its parent (workers)
// or advances the team's b_arrived (master).
//
// Parameters:
//   bt           - barrier type; indexes th_bar[], t_bar[] and the branch-bits
//                  table.
//   this_thr     - descriptor of the calling thread.
//   gtid         - global thread id of the caller.
//   tid          - team-local id of the caller.
//   reduce       - optional reduction callback, called as
//                  reduce(own_reduce_data, child_reduce_data); may be NULL.
//   itt_sync_obj - ITT sync object forwarded to wait() (USE_ITT_BUILD only).
static void
__kmp_tree_barrier_gather(enum barrier_type bt, kmp_info_t *this_thr, int gtid,
                          int tid, void (*reduce)(void *, void *)
                                       USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_tree_gather);
  kmp_team_t *team = this_thr->th.th_team;
  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
  kmp_info_t **other_threads = team->t.t_threads;
  kmp_uint32 nproc = this_thr->th.th_team_nproc;
  kmp_uint32 branch_bits = __kmp_barrier_gather_branch_bits[bt];
  kmp_uint32 branch_factor = 1 << branch_bits;
  kmp_uint32 child;
  kmp_uint32 child_tid;
  // Assigned only when this thread has at least one child (see below).
  kmp_uint64 new_state;

  KA_TRACE(
      20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
           gtid, team->t.t_id, tid, bt));
  KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);

#if USE_ITT_BUILD && USE_ITT_NOTIFY
  // Barrier imbalance - save arrive time to the thread
  if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
    this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time =
        __itt_get_timestamp();
  }
#endif
  // Perform tree gather to wait until all threads have arrived; reduce any
  // required data as we go
  // Team-local id of this thread's first child.
  child_tid = (tid << branch_bits) + 1;
  if (child_tid < nproc) {
    // Parent threads wait for all their children to arrive
    new_state = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
    child = 1;
    do {
      kmp_info_t *child_thr = other_threads[child_tid];
      kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
#if KMP_CACHE_MANAGE
      // Prefetch next thread's arrived count
      if (child + 1 <= branch_factor && child_tid + 1 < nproc)
        KMP_CACHE_PREFETCH(
            &other_threads[child_tid + 1]->th.th_bar[bt].bb.b_arrived);
#endif /* KMP_CACHE_MANAGE */
      KA_TRACE(20,
               ("__kmp_tree_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%u) "
                "arrived(%p) == %llu\n",
                gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
                team->t.t_id, child_tid, &child_bar->b_arrived, new_state));
      // Wait for child to arrive
      kmp_flag_64<> flag(&child_bar->b_arrived, new_state);
      flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
      ANNOTATE_BARRIER_END(child_thr);
#if USE_ITT_BUILD && USE_ITT_NOTIFY
      // Barrier imbalance - write min of the thread time and a child time to
      // the thread.
      if (__kmp_forkjoin_frames_mode == 2) {
        this_thr->th.th_bar_min_time = KMP_MIN(this_thr->th.th_bar_min_time,
                                               child_thr->th.th_bar_min_time);
      }
#endif
      if (reduce) {
        KA_TRACE(100,
                 ("__kmp_tree_barrier_gather: T#%d(%d:%d) += T#%d(%d:%u)\n",
                  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
                  team->t.t_id, child_tid));
        ANNOTATE_REDUCE_AFTER(reduce);
        OMPT_REDUCTION_DECL(this_thr, gtid);
        OMPT_REDUCTION_BEGIN;
        // Fold the child's partial result into this thread's reduce_data.
        (*reduce)(this_thr->th.th_local.reduce_data,
                  child_thr->th.th_local.reduce_data);
        OMPT_REDUCTION_END;
        ANNOTATE_REDUCE_BEFORE(reduce);
        ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
      }
      child++;
      child_tid++;
    } while (child <= branch_factor && child_tid < nproc);
  }

  if (!KMP_MASTER_TID(tid)) { // Worker threads
    kmp_int32 parent_tid = (tid - 1) >> branch_bits;

    KA_TRACE(20,
             ("__kmp_tree_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) "
              "arrived(%p): %llu => %llu\n",
              gtid, team->t.t_id, tid, __kmp_gtid_from_tid(parent_tid, team),
              team->t.t_id, parent_tid, &thr_bar->b_arrived, thr_bar->b_arrived,
              thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));

    // Mark arrival to parent thread
    /* After performing this write, a worker thread may not assume that the team
       is valid any more - it could be deallocated by the master thread at any
       time.  */
    ANNOTATE_BARRIER_BEGIN(this_thr);
    kmp_flag_64<> flag(&thr_bar->b_arrived, other_threads[parent_tid]);
    flag.release();
  } else {
    // Need to update the team arrived pointer if we are the master thread
    // With nproc > 1 and tid 0, child_tid was 1 < nproc, so the parent branch
    // above ran and new_state is valid (assumes the master is tid 0).
    if (nproc > 1) // New value was already computed above
      team->t.t_bar[bt].b_arrived = new_state;
    else
      team->t.t_bar[bt].b_arrived += KMP_BARRIER_STATE_BUMP;
    KA_TRACE(20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) set team %d "
                  "arrived(%p) = %llu\n",
                  gtid, team->t.t_id, tid, team->t.t_id,
                  &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
  }
  KA_TRACE(20,
           ("__kmp_tree_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
            gtid, team->t.t_id, tid, bt));
}
402 
403 static void __kmp_tree_barrier_release(
404     enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
405     int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
406   KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_tree_release);
407   kmp_team_t *team;
408   kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
409   kmp_uint32 nproc;
410   kmp_uint32 branch_bits = __kmp_barrier_release_branch_bits[bt];
411   kmp_uint32 branch_factor = 1 << branch_bits;
412   kmp_uint32 child;
413   kmp_uint32 child_tid;
414 
415   // Perform a tree release for all of the threads that have been gathered
416   if (!KMP_MASTER_TID(
417           tid)) { // Handle fork barrier workers who aren't part of a team yet
418     KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d wait go(%p) == %u\n", gtid,
419                   &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
420     // Wait for parent thread to release us
421     kmp_flag_64<> flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
422     flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
423     ANNOTATE_BARRIER_END(this_thr);
424 #if USE_ITT_BUILD && USE_ITT_NOTIFY
425     if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
426       // In fork barrier where we could not get the object reliably (or
427       // ITTNOTIFY is disabled)
428       itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
429       // Cancel wait on previous parallel region...
430       __kmp_itt_task_starting(itt_sync_obj);
431 
432       if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
433         return;
434 
435       itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
436       if (itt_sync_obj != NULL)
437         // Call prepare as early as possible for "new" barrier
438         __kmp_itt_task_finished(itt_sync_obj);
439     } else
440 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
441         // Early exit for reaping threads releasing forkjoin barrier
442         if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
443       return;
444 
445     // The worker thread may now assume that the team is valid.
446     team = __kmp_threads[gtid]->th.th_team;
447     KMP_DEBUG_ASSERT(team != NULL);
448     tid = __kmp_tid_from_gtid(gtid);
449 
450     TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
451     KA_TRACE(20,
452              ("__kmp_tree_barrier_release: T#%d(%d:%d) set go(%p) = %u\n", gtid,
453               team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
454     KMP_MB(); // Flush all pending memory write invalidates.
455   } else {
456     team = __kmp_threads[gtid]->th.th_team;
457     KMP_DEBUG_ASSERT(team != NULL);
458     KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d(%d:%d) master enter for "
459                   "barrier type %d\n",
460                   gtid, team->t.t_id, tid, bt));
461   }
462   nproc = this_thr->th.th_team_nproc;
463   child_tid = (tid << branch_bits) + 1;
464 
465   if (child_tid < nproc) {
466     kmp_info_t **other_threads = team->t.t_threads;
467     child = 1;
468     // Parent threads release all their children
469     do {
470       kmp_info_t *child_thr = other_threads[child_tid];
471       kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
472 #if KMP_CACHE_MANAGE
473       // Prefetch next thread's go count
474       if (child + 1 <= branch_factor && child_tid + 1 < nproc)
475         KMP_CACHE_PREFETCH(
476             &other_threads[child_tid + 1]->th.th_bar[bt].bb.b_go);
477 #endif /* KMP_CACHE_MANAGE */
478 
479 #if KMP_BARRIER_ICV_PUSH
480       {
481         KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(USER_icv_copy);
482         if (propagate_icvs) {
483           __kmp_init_implicit_task(team->t.t_ident,
484                                    team->t.t_threads[child_tid], team,
485                                    child_tid, FALSE);
486           copy_icvs(&team->t.t_implicit_task_taskdata[child_tid].td_icvs,
487                     &team->t.t_implicit_task_taskdata[0].td_icvs);
488         }
489       }
490 #endif // KMP_BARRIER_ICV_PUSH
491       KA_TRACE(20,
492                ("__kmp_tree_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%u)"
493                 "go(%p): %u => %u\n",
494                 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
495                 team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
496                 child_bar->b_go + KMP_BARRIER_STATE_BUMP));
497       // Release child from barrier
498       ANNOTATE_BARRIER_BEGIN(child_thr);
499       kmp_flag_64<> flag(&child_bar->b_go, child_thr);
500       flag.release();
501       child++;
502       child_tid++;
503     } while (child <= branch_factor && child_tid < nproc);
504   }
505   KA_TRACE(
506       20, ("__kmp_tree_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
507            gtid, team->t.t_id, tid, bt));
508 }
509 
510 // Hyper Barrier
// Gather phase of the hypercube-embedded tree ("hyper") barrier.
//
// The loop walks levels 0, branch_bits, 2*branch_bits, ... while the matching
// offset (1, branch_factor, branch_factor^2, ...) is below num_threads.
// At each level a thread looks at the branch_bits-wide digit of its tid at
// that level:
//   - non-zero digit: the thread is a child at this level. It releases its
//     b_arrived flag towards parent_tid (its tid with the low
//     level + branch_bits bits cleared) and leaves the loop.
//   - zero digit: the thread is a parent at this level. It waits for children
//     tid + k * (1 << level), k = 1 .. branch_factor - 1, that are below
//     num_threads, folding in their reduction data when reduce is non-NULL.
// After the loop the master stores the new state in the team's b_arrived.
//
// Parameters:
//   bt           - barrier type; indexes th_bar[], t_bar[] and the branch-bits
//                  table.
//   this_thr     - descriptor of the calling thread.
//   gtid         - global thread id of the caller.
//   tid          - team-local id of the caller.
//   reduce       - optional reduction callback, called as
//                  reduce(own_reduce_data, child_reduce_data); may be NULL.
//   itt_sync_obj - ITT sync object forwarded to wait() (USE_ITT_BUILD only).
static void
__kmp_hyper_barrier_gather(enum barrier_type bt, kmp_info_t *this_thr, int gtid,
                           int tid, void (*reduce)(void *, void *)
                                        USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hyper_gather);
  kmp_team_t *team = this_thr->th.th_team;
  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
  kmp_info_t **other_threads = team->t.t_threads;
  // Sentinel meaning "not computed yet"; replaced the first time this thread
  // acts as a parent.
  kmp_uint64 new_state = KMP_BARRIER_UNUSED_STATE;
  kmp_uint32 num_threads = this_thr->th.th_team_nproc;
  kmp_uint32 branch_bits = __kmp_barrier_gather_branch_bits[bt];
  kmp_uint32 branch_factor = 1 << branch_bits;
  kmp_uint32 offset;
  kmp_uint32 level;

  KA_TRACE(
      20,
      ("__kmp_hyper_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
       gtid, team->t.t_id, tid, bt));
  KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);

#if USE_ITT_BUILD && USE_ITT_NOTIFY
  // Barrier imbalance - save arrive time to the thread
  if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
    this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time =
        __itt_get_timestamp();
  }
#endif
  /* Perform a hypercube-embedded tree gather to wait until all of the threads
     have arrived, and reduce any required data as we go.  */
  // Built once up front; the waiter is filled in via set_waiter() once the
  // parent is known.
  kmp_flag_64<> p_flag(&thr_bar->b_arrived);
  for (level = 0, offset = 1; offset < num_threads;
       level += branch_bits, offset <<= branch_bits) {
    kmp_uint32 child;
    kmp_uint32 child_tid;

    // Non-zero digit at this level: this thread is a child here.
    if (((tid >> level) & (branch_factor - 1)) != 0) {
      kmp_int32 parent_tid = tid & ~((1 << (level + branch_bits)) - 1);

      KMP_MB(); // Synchronize parent and child threads.
      KA_TRACE(20,
               ("__kmp_hyper_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) "
                "arrived(%p): %llu => %llu\n",
                gtid, team->t.t_id, tid, __kmp_gtid_from_tid(parent_tid, team),
                team->t.t_id, parent_tid, &thr_bar->b_arrived,
                thr_bar->b_arrived,
                thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
      // Mark arrival to parent thread
      /* After performing this write (in the last iteration of the enclosing for
         loop), a worker thread may not assume that the team is valid any more
         - it could be deallocated by the master thread at any time.  */
      ANNOTATE_BARRIER_BEGIN(this_thr);
      p_flag.set_waiter(other_threads[parent_tid]);
      p_flag.release();
      break;
    }

    // Parent threads wait for children to arrive
    if (new_state == KMP_BARRIER_UNUSED_STATE)
      new_state = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
    // Children at this level are spaced (1 << level) apart, starting one
    // stride above this thread.
    for (child = 1, child_tid = tid + (1 << level);
         child < branch_factor && child_tid < num_threads;
         child++, child_tid += (1 << level)) {
      kmp_info_t *child_thr = other_threads[child_tid];
      kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
#if KMP_CACHE_MANAGE
      kmp_uint32 next_child_tid = child_tid + (1 << level);
      // Prefetch next thread's arrived count
      if (child + 1 < branch_factor && next_child_tid < num_threads)
        KMP_CACHE_PREFETCH(
            &other_threads[next_child_tid]->th.th_bar[bt].bb.b_arrived);
#endif /* KMP_CACHE_MANAGE */
      KA_TRACE(20,
               ("__kmp_hyper_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%u) "
                "arrived(%p) == %llu\n",
                gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
                team->t.t_id, child_tid, &child_bar->b_arrived, new_state));
      // Wait for child to arrive
      kmp_flag_64<> c_flag(&child_bar->b_arrived, new_state);
      c_flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
      ANNOTATE_BARRIER_END(child_thr);
      KMP_MB(); // Synchronize parent and child threads.
#if USE_ITT_BUILD && USE_ITT_NOTIFY
      // Barrier imbalance - write min of the thread time and a child time to
      // the thread.
      if (__kmp_forkjoin_frames_mode == 2) {
        this_thr->th.th_bar_min_time = KMP_MIN(this_thr->th.th_bar_min_time,
                                               child_thr->th.th_bar_min_time);
      }
#endif
      if (reduce) {
        KA_TRACE(100,
                 ("__kmp_hyper_barrier_gather: T#%d(%d:%d) += T#%d(%d:%u)\n",
                  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
                  team->t.t_id, child_tid));
        ANNOTATE_REDUCE_AFTER(reduce);
        OMPT_REDUCTION_DECL(this_thr, gtid);
        OMPT_REDUCTION_BEGIN;
        // Fold the child's partial result into this thread's reduce_data.
        (*reduce)(this_thr->th.th_local.reduce_data,
                  child_thr->th.th_local.reduce_data);
        OMPT_REDUCTION_END;
        ANNOTATE_REDUCE_BEFORE(reduce);
        ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
      }
    }
  }

  if (KMP_MASTER_TID(tid)) {
    // Need to update the team arrived pointer if we are the master thread
    // new_state is still the sentinel only if the master never acted as a
    // parent, i.e. the level loop body did not run for it.
    if (new_state == KMP_BARRIER_UNUSED_STATE)
      team->t.t_bar[bt].b_arrived += KMP_BARRIER_STATE_BUMP;
    else
      team->t.t_bar[bt].b_arrived = new_state;
    KA_TRACE(20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) set team %d "
                  "arrived(%p) = %llu\n",
                  gtid, team->t.t_id, tid, team->t.t_id,
                  &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
  }
  KA_TRACE(
      20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
           gtid, team->t.t_id, tid, bt));
}
633 
634 // The reverse versions seem to beat the forward versions overall
635 #define KMP_REVERSE_HYPER_BAR
636 static void __kmp_hyper_barrier_release(
637     enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
638     int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
639   KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hyper_release);
640   kmp_team_t *team;
641   kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
642   kmp_info_t **other_threads;
643   kmp_uint32 num_threads;
644   kmp_uint32 branch_bits = __kmp_barrier_release_branch_bits[bt];
645   kmp_uint32 branch_factor = 1 << branch_bits;
646   kmp_uint32 child;
647   kmp_uint32 child_tid;
648   kmp_uint32 offset;
649   kmp_uint32 level;
650 
651   /* Perform a hypercube-embedded tree release for all of the threads that have
652      been gathered. If KMP_REVERSE_HYPER_BAR is defined (default) the threads
653      are released in the reverse order of the corresponding gather, otherwise
654      threads are released in the same order. */
655   if (KMP_MASTER_TID(tid)) { // master
656     team = __kmp_threads[gtid]->th.th_team;
657     KMP_DEBUG_ASSERT(team != NULL);
658     KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d(%d:%d) master enter for "
659                   "barrier type %d\n",
660                   gtid, team->t.t_id, tid, bt));
661 #if KMP_BARRIER_ICV_PUSH
662     if (propagate_icvs) { // master already has ICVs in final destination; copy
663       copy_icvs(&thr_bar->th_fixed_icvs,
664                 &team->t.t_implicit_task_taskdata[tid].td_icvs);
665     }
666 #endif
667   } else { // Handle fork barrier workers who aren't part of a team yet
668     KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d wait go(%p) == %u\n", gtid,
669                   &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
670     // Wait for parent thread to release us
671     kmp_flag_64<> flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
672     flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
673     ANNOTATE_BARRIER_END(this_thr);
674 #if USE_ITT_BUILD && USE_ITT_NOTIFY
675     if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
676       // In fork barrier where we could not get the object reliably
677       itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
678       // Cancel wait on previous parallel region...
679       __kmp_itt_task_starting(itt_sync_obj);
680 
681       if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
682         return;
683 
684       itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
685       if (itt_sync_obj != NULL)
686         // Call prepare as early as possible for "new" barrier
687         __kmp_itt_task_finished(itt_sync_obj);
688     } else
689 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
690         // Early exit for reaping threads releasing forkjoin barrier
691         if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
692       return;
693 
694     // The worker thread may now assume that the team is valid.
695     team = __kmp_threads[gtid]->th.th_team;
696     KMP_DEBUG_ASSERT(team != NULL);
697     tid = __kmp_tid_from_gtid(gtid);
698 
699     TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
700     KA_TRACE(20,
701              ("__kmp_hyper_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
702               gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
703     KMP_MB(); // Flush all pending memory write invalidates.
704   }
705   num_threads = this_thr->th.th_team_nproc;
706   other_threads = team->t.t_threads;
707 
708 #ifdef KMP_REVERSE_HYPER_BAR
709   // Count up to correct level for parent
710   for (level = 0, offset = 1;
711        offset < num_threads && (((tid >> level) & (branch_factor - 1)) == 0);
712        level += branch_bits, offset <<= branch_bits)
713     ;
714 
715   // Now go down from there
716   for (level -= branch_bits, offset >>= branch_bits; offset != 0;
717        level -= branch_bits, offset >>= branch_bits)
718 #else
719   // Go down the tree, level by level
720   for (level = 0, offset = 1; offset < num_threads;
721        level += branch_bits, offset <<= branch_bits)
722 #endif // KMP_REVERSE_HYPER_BAR
723   {
724 #ifdef KMP_REVERSE_HYPER_BAR
725     /* Now go in reverse order through the children, highest to lowest.
726        Initial setting of child is conservative here. */
727     child = num_threads >> ((level == 0) ? level : level - 1);
728     for (child = (child < branch_factor - 1) ? child : branch_factor - 1,
729         child_tid = tid + (child << level);
730          child >= 1; child--, child_tid -= (1 << level))
731 #else
732     if (((tid >> level) & (branch_factor - 1)) != 0)
733       // No need to go lower than this, since this is the level parent would be
734       // notified
735       break;
736     // Iterate through children on this level of the tree
737     for (child = 1, child_tid = tid + (1 << level);
738          child < branch_factor && child_tid < num_threads;
739          child++, child_tid += (1 << level))
740 #endif // KMP_REVERSE_HYPER_BAR
741     {
742       if (child_tid >= num_threads)
743         continue; // Child doesn't exist so keep going
744       else {
745         kmp_info_t *child_thr = other_threads[child_tid];
746         kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
747 #if KMP_CACHE_MANAGE
748         kmp_uint32 next_child_tid = child_tid - (1 << level);
749 // Prefetch next thread's go count
750 #ifdef KMP_REVERSE_HYPER_BAR
751         if (child - 1 >= 1 && next_child_tid < num_threads)
752 #else
753         if (child + 1 < branch_factor && next_child_tid < num_threads)
754 #endif // KMP_REVERSE_HYPER_BAR
755           KMP_CACHE_PREFETCH(
756               &other_threads[next_child_tid]->th.th_bar[bt].bb.b_go);
757 #endif /* KMP_CACHE_MANAGE */
758 
759 #if KMP_BARRIER_ICV_PUSH
760         if (propagate_icvs) // push my fixed ICVs to my child
761           copy_icvs(&child_bar->th_fixed_icvs, &thr_bar->th_fixed_icvs);
762 #endif // KMP_BARRIER_ICV_PUSH
763 
764         KA_TRACE(
765             20,
766             ("__kmp_hyper_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%u)"
767              "go(%p): %u => %u\n",
768              gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
769              team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
770              child_bar->b_go + KMP_BARRIER_STATE_BUMP));
771         // Release child from barrier
772         ANNOTATE_BARRIER_BEGIN(child_thr);
773         kmp_flag_64<> flag(&child_bar->b_go, child_thr);
774         flag.release();
775       }
776     }
777   }
778 #if KMP_BARRIER_ICV_PUSH
779   if (propagate_icvs &&
780       !KMP_MASTER_TID(tid)) { // copy ICVs locally to final dest
781     __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team, tid,
782                              FALSE);
783     copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
784               &thr_bar->th_fixed_icvs);
785   }
786 #endif
787   KA_TRACE(
788       20,
789       ("__kmp_hyper_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
790        gtid, team->t.t_id, tid, bt));
791 }
792 
793 // Hierarchical Barrier
794 
795 // Initialize thread barrier data
796 /* Initializes/re-initializes the hierarchical barrier data stored on a thread.
797    Performs the minimum amount of initialization required based on how the team
798    has changed. Returns true if leaf children will require both on-core and
799    traditional wake-up mechanisms. For example, if the team size increases,
800    threads already in the team will respond to on-core wakeup on their parent
801    thread, but threads newly added to the team will only be listening on the
802    their local b_go. */
803 static bool __kmp_init_hierarchical_barrier_thread(enum barrier_type bt,
804                                                    kmp_bstate_t *thr_bar,
805                                                    kmp_uint32 nproc, int gtid,
806                                                    int tid, kmp_team_t *team) {
807   // Checks to determine if (re-)initialization is needed
808   bool uninitialized = thr_bar->team == NULL;
809   bool team_changed = team != thr_bar->team;
810   bool team_sz_changed = nproc != thr_bar->nproc;
811   bool tid_changed = tid != thr_bar->old_tid;
812   bool retval = false;
813 
814   if (uninitialized || team_sz_changed) {
815     __kmp_get_hierarchy(nproc, thr_bar);
816   }
817 
818   if (uninitialized || team_sz_changed || tid_changed) {
819     thr_bar->my_level = thr_bar->depth - 1; // default for master
820     thr_bar->parent_tid = -1; // default for master
821     if (!KMP_MASTER_TID(
822             tid)) { // if not master, find parent thread in hierarchy
823       kmp_uint32 d = 0;
824       while (d < thr_bar->depth) { // find parent based on level of thread in
825         // hierarchy, and note level
826         kmp_uint32 rem;
827         if (d == thr_bar->depth - 2) { // reached level right below the master
828           thr_bar->parent_tid = 0;
829           thr_bar->my_level = d;
830           break;
831         } else if ((rem = tid % thr_bar->skip_per_level[d + 1]) != 0) {
832           // TODO: can we make the above op faster?
833           // thread is not a subtree root at next level, so this is max
834           thr_bar->parent_tid = tid - rem;
835           thr_bar->my_level = d;
836           break;
837         }
838         ++d;
839       }
840     }
841     __kmp_type_convert(7 - ((tid - thr_bar->parent_tid) /
842                             (thr_bar->skip_per_level[thr_bar->my_level])),
843                        &(thr_bar->offset));
844     thr_bar->old_tid = tid;
845     thr_bar->wait_flag = KMP_BARRIER_NOT_WAITING;
846     thr_bar->team = team;
847     thr_bar->parent_bar =
848         &team->t.t_threads[thr_bar->parent_tid]->th.th_bar[bt].bb;
849   }
850   if (uninitialized || team_changed || tid_changed) {
851     thr_bar->team = team;
852     thr_bar->parent_bar =
853         &team->t.t_threads[thr_bar->parent_tid]->th.th_bar[bt].bb;
854     retval = true;
855   }
856   if (uninitialized || team_sz_changed || tid_changed) {
857     thr_bar->nproc = nproc;
858     thr_bar->leaf_kids = thr_bar->base_leaf_kids;
859     if (thr_bar->my_level == 0)
860       thr_bar->leaf_kids = 0;
861     if (thr_bar->leaf_kids && (kmp_uint32)tid + thr_bar->leaf_kids + 1 > nproc)
862       __kmp_type_convert(nproc - tid - 1, &(thr_bar->leaf_kids));
863     thr_bar->leaf_state = 0;
864     for (int i = 0; i < thr_bar->leaf_kids; ++i)
865       ((char *)&(thr_bar->leaf_state))[7 - i] = 1;
866   }
867   return retval;
868 }
869 
/* Gather (arrival) phase of the hierarchical barrier for barrier type bt.

   The routine first decides whether the on-core barrier may be used (only when
   the effective nesting level is 1) and (re)initializes this thread's
   hierarchy data. A non-leaf thread (my_level != 0) then waits for its
   children; when reduce is non-NULL it is called to fold each child's
   th_local.reduce_data into this thread's. With blocktime equal to
   KMP_MAX_BLOCKTIME and the on-core barrier enabled, leaf children check in on
   this thread's own b_arrived flag and only higher-level children are waited
   on through their own b_arrived flags; otherwise every child is waited on
   through its own b_arrived flag. Finally a worker signals its parent, while
   the master stores the new state in team->t.t_bar[bt].b_arrived.

   bt            barrier type, used to index th_bar[] / t_bar[]
   this_thr      calling thread
   gtid          global thread id (used for tracing and initialization)
   tid           id of the calling thread within its team
   reduce        optional reduction callback (may be NULL)
   itt_sync_obj  only present under USE_ITT_BUILD; forwarded to the flag waits */
870 static void __kmp_hierarchical_barrier_gather(
871     enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
872     void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
873   KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hier_gather);
874   kmp_team_t *team = this_thr->th.th_team;
875   kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
876   kmp_uint32 nproc = this_thr->th.th_team_nproc;
877   kmp_info_t **other_threads = team->t.t_threads;
878   kmp_uint64 new_state;
879
      // Effective nesting level decides whether the on-core barrier is used
880   int level = team->t.t_level;
881   if (other_threads[0]
882           ->th.th_teams_microtask) // are we inside the teams construct?
883     if (this_thr->th.th_teams_size.nteams > 1)
884       ++level; // level was not increased in teams construct for team_of_masters
885   if (level == 1)
886     thr_bar->use_oncore_barrier = 1;
887   else
888     thr_bar->use_oncore_barrier = 0; // Do not use oncore barrier when nested
889
890   KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) enter for "
891                 "barrier type %d\n",
892                 gtid, team->t.t_id, tid, bt));
893   KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
894
895 #if USE_ITT_BUILD && USE_ITT_NOTIFY
896   // Barrier imbalance - save arrive time to the thread
897   if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
898     this_thr->th.th_bar_arrive_time = __itt_get_timestamp();
899   }
900 #endif
901
      // Return value is deliberately ignored in the gather phase
902   (void)__kmp_init_hierarchical_barrier_thread(bt, thr_bar, nproc, gtid, tid,
903                                                team);
904
905   if (thr_bar->my_level) { // not a leaf (my_level==0 means leaf)
906     kmp_int32 child_tid;
        // Value a child's b_arrived must reach: the team's current arrived
        // state plus one bump
907     new_state =
908         (kmp_uint64)team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
909     if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME &&
910         thr_bar->use_oncore_barrier) {
911       if (thr_bar->leaf_kids) {
912         // First, wait for leaf children to check-in on my b_arrived flag
913         kmp_uint64 leaf_state =
914             KMP_MASTER_TID(tid)
915                 ? thr_bar->b_arrived | thr_bar->leaf_state
916                 : team->t.t_bar[bt].b_arrived | thr_bar->leaf_state;
917         KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) waiting "
918                       "for leaf kids\n",
919                       gtid, team->t.t_id, tid));
920         kmp_flag_64<> flag(&thr_bar->b_arrived, leaf_state);
921         flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
922         if (reduce) {
923           ANNOTATE_REDUCE_AFTER(reduce);
924           OMPT_REDUCTION_DECL(this_thr, gtid);
925           OMPT_REDUCTION_BEGIN;
              // Leaf children occupy the tids directly after mine
926           for (child_tid = tid + 1; child_tid <= tid + thr_bar->leaf_kids;
927                ++child_tid) {
928             KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += "
929                            "T#%d(%d:%d)\n",
930                            gtid, team->t.t_id, tid,
931                            __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
932                            child_tid));
933             ANNOTATE_BARRIER_END(other_threads[child_tid]);
934             (*reduce)(this_thr->th.th_local.reduce_data,
935                       other_threads[child_tid]->th.th_local.reduce_data);
936           }
937           OMPT_REDUCTION_END;
938           ANNOTATE_REDUCE_BEFORE(reduce);
939           ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
940         }
941         // clear leaf_state bits
942         KMP_TEST_THEN_AND64(&thr_bar->b_arrived, ~(thr_bar->leaf_state));
943       }
944       // Next, wait for higher level children on each child's b_arrived flag
945       for (kmp_uint32 d = 1; d < thr_bar->my_level;
946            ++d) { // gather lowest level threads first, but skip 0
947         kmp_uint32 last = tid + thr_bar->skip_per_level[d + 1],
948                    skip = thr_bar->skip_per_level[d];
949         if (last > nproc)
950           last = nproc;
951         for (child_tid = tid + skip; child_tid < (int)last; child_tid += skip) {
952           kmp_info_t *child_thr = other_threads[child_tid];
953           kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
954           KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) wait "
955                         "T#%d(%d:%d) "
956                         "arrived(%p) == %llu\n",
957                         gtid, team->t.t_id, tid,
958                         __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
959                         child_tid, &child_bar->b_arrived, new_state));
960           kmp_flag_64<> flag(&child_bar->b_arrived, new_state);
961           flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
962           ANNOTATE_BARRIER_END(child_thr);
963           if (reduce) {
964             KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += "
965                            "T#%d(%d:%d)\n",
966                            gtid, team->t.t_id, tid,
967                            __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
968                            child_tid));
969             ANNOTATE_REDUCE_AFTER(reduce);
970             (*reduce)(this_thr->th.th_local.reduce_data,
971                       child_thr->th.th_local.reduce_data);
972             ANNOTATE_REDUCE_BEFORE(reduce);
973             ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
974           }
975         }
976       }
977     } else { // Blocktime is not infinite
978       for (kmp_uint32 d = 0; d < thr_bar->my_level;
979            ++d) { // Gather lowest level threads first
980         kmp_uint32 last = tid + thr_bar->skip_per_level[d + 1],
981                    skip = thr_bar->skip_per_level[d];
982         if (last > nproc)
983           last = nproc;
984         for (child_tid = tid + skip; child_tid < (int)last; child_tid += skip) {
985           kmp_info_t *child_thr = other_threads[child_tid];
986           kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
987           KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) wait "
988                         "T#%d(%d:%d) "
989                         "arrived(%p) == %llu\n",
990                         gtid, team->t.t_id, tid,
991                         __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
992                         child_tid, &child_bar->b_arrived, new_state));
993           kmp_flag_64<> flag(&child_bar->b_arrived, new_state);
994           flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
995           ANNOTATE_BARRIER_END(child_thr);
996           if (reduce) {
997             KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += "
998                            "T#%d(%d:%d)\n",
999                            gtid, team->t.t_id, tid,
1000                            __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
1001                            child_tid));
1002             ANNOTATE_REDUCE_AFTER(reduce);
1003             (*reduce)(this_thr->th.th_local.reduce_data,
1004                       child_thr->th.th_local.reduce_data);
1005             ANNOTATE_REDUCE_BEFORE(reduce);
1006             ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
1007           }
1008         }
1009       }
1010     }
1011   }
1012   // All subordinates are gathered; now release parent if not master thread
1013
1014   if (!KMP_MASTER_TID(tid)) { // worker threads release parent in hierarchy
1015     KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) releasing"
1016                   " T#%d(%d:%d) arrived(%p): %llu => %llu\n",
1017                   gtid, team->t.t_id, tid,
1018                   __kmp_gtid_from_tid(thr_bar->parent_tid, team), team->t.t_id,
1019                   thr_bar->parent_tid, &thr_bar->b_arrived, thr_bar->b_arrived,
1020                   thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
1021     /* Mark arrival to parent: After performing this write, a worker thread may
1022        not assume that the team is valid any more - it could be deallocated by
1023        the master thread at any time. */
1024     if (thr_bar->my_level || __kmp_dflt_blocktime != KMP_MAX_BLOCKTIME ||
1025         !thr_bar->use_oncore_barrier) { // Parent is waiting on my b_arrived
1026       // flag; release it
1027       ANNOTATE_BARRIER_BEGIN(this_thr);
1028       kmp_flag_64<> flag(&thr_bar->b_arrived,
1029                          other_threads[thr_bar->parent_tid]);
1030       flag.release();
1031     } else {
1032       // Leaf does special release on "offset" bits of parent's b_arrived flag
1033       thr_bar->b_arrived = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
1034       kmp_flag_oncore flag(&thr_bar->parent_bar->b_arrived,
1035                            thr_bar->offset + 1);
1036       flag.set_waiter(other_threads[thr_bar->parent_tid]);
1037       flag.release();
1038     }
1039   } else { // Master thread needs to update the team's b_arrived value
1040     team->t.t_bar[bt].b_arrived = new_state;
1041     KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) set team %d "
1042                   "arrived(%p) = %llu\n",
1043                   gtid, team->t.t_id, tid, team->t.t_id,
1044                   &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
1045   }
1046   // Is the team access below unsafe or just technically invalid?
       // NOTE(review): the trace below reads team->t.t_id after a worker has
       // signalled arrival, i.e. after the point where the comment above says
       // the team may no longer be assumed valid.
1047   KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) exit for "
1048                 "barrier type %d\n",
1049                 gtid, team->t.t_id, tid, bt));
1050 }
1051 
/* Release phase of the hierarchical barrier for barrier type bt.

   A worker first waits to be released: on its own b_go flag, or - for a leaf
   whose barrier data is initialized, with blocktime equal to
   KMP_MAX_BLOCKTIME and the on-core barrier in use - on its "offset" byte of
   the parent's b_go flag. It resets the flag it was released through and
   returns early when this is the fork/join barrier and g_done is set. Only
   after being released does a worker re-read its team and tid.

   Every thread then recomputes whether the on-core barrier applies,
   re-initializes its hierarchy data, optionally moves ICVs (when
   propagate_icvs is set and KMP_BARRIER_ICV_PUSH is compiled in) and, if it is
   not a leaf, releases its children.

   bt              barrier type, used to index th_bar[]
   this_thr        calling thread
   gtid            global thread id, used to look up the team
   tid             id of the calling thread within its team; recomputed from
                   gtid for workers once they have been released
   propagate_icvs  non-zero to copy ICVs down the hierarchy
   itt_sync_obj    only present under USE_ITT_BUILD; forwarded to the flag
                   waits */
1052 static void __kmp_hierarchical_barrier_release(
1053     enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
1054     int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
1055   KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hier_release);
1056   kmp_team_t *team;
1057   kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
1058   kmp_uint32 nproc;
1059   bool team_change = false; // indicates on-core barrier shouldn't be used
1060
1061   if (KMP_MASTER_TID(tid)) {
1062     team = __kmp_threads[gtid]->th.th_team;
1063     KMP_DEBUG_ASSERT(team != NULL);
1064     KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) master "
1065                   "entered barrier type %d\n",
1066                   gtid, team->t.t_id, tid, bt));
1067   } else { // Worker threads
1068     // Wait for parent thread to release me
1069     if (!thr_bar->use_oncore_barrier ||
1070         __kmp_dflt_blocktime != KMP_MAX_BLOCKTIME || thr_bar->my_level != 0 ||
1071         thr_bar->team == NULL) {
1072       // Use traditional method of waiting on my own b_go flag
1073       thr_bar->wait_flag = KMP_BARRIER_OWN_FLAG;
1074       kmp_flag_64<> flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
1075       flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
1076       ANNOTATE_BARRIER_END(this_thr);
1077       TCW_8(thr_bar->b_go,
1078             KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
1079     } else { // Thread barrier data is initialized, this is a leaf, blocktime is
1080       // infinite, not nested
1081       // Wait on my "offset" bits on parent's b_go flag
1082       thr_bar->wait_flag = KMP_BARRIER_PARENT_FLAG;
1083       kmp_flag_oncore flag(&thr_bar->parent_bar->b_go, KMP_BARRIER_STATE_BUMP,
1084                            thr_bar->offset + 1, bt,
1085                            this_thr USE_ITT_BUILD_ARG(itt_sync_obj));
1086       flag.wait(this_thr, TRUE);
1087       if (thr_bar->wait_flag ==
1088           KMP_BARRIER_SWITCHING) { // Thread was switched to own b_go
1089         TCW_8(thr_bar->b_go,
1090               KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
1091       } else { // Reset my bits on parent's b_go flag
1092         (RCAST(volatile char *,
1093                &(thr_bar->parent_bar->b_go)))[thr_bar->offset + 1] = 0;
1094       }
1095     }
1096     thr_bar->wait_flag = KMP_BARRIER_NOT_WAITING;
1097     // Early exit for reaping threads releasing forkjoin barrier
1098     if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
1099       return;
1100     // The worker thread may now assume that the team is valid.
1101     team = __kmp_threads[gtid]->th.th_team;
1102     KMP_DEBUG_ASSERT(team != NULL);
1103     tid = __kmp_tid_from_gtid(gtid);
1104
1105     KA_TRACE(
1106         20,
1107         ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
1108          gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
1109     KMP_MB(); // Flush all pending memory write invalidates.
1110   }
1111
1112   nproc = this_thr->th.th_team_nproc;
        // Effective nesting level decides whether the on-core barrier is used
1113   int level = team->t.t_level;
1114   if (team->t.t_threads[0]
1115           ->th.th_teams_microtask) { // are we inside the teams construct?
1116     if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
1117         this_thr->th.th_teams_level == level)
1118       ++level; // level was not increased in teams construct for team_of_workers
1119     if (this_thr->th.th_teams_size.nteams > 1)
1120       ++level; // level was not increased in teams construct for team_of_masters
1121   }
1122   if (level == 1)
1123     thr_bar->use_oncore_barrier = 1;
1124   else
1125     thr_bar->use_oncore_barrier = 0; // Do not use oncore barrier when nested
1126
1127   // If the team size has increased, we still communicate with old leaves via
1128   // oncore barrier.
        // Snapshot the leaf data before re-initialization overwrites it
1129   unsigned short int old_leaf_kids = thr_bar->leaf_kids;
1130   kmp_uint64 old_leaf_state = thr_bar->leaf_state;
1131   team_change = __kmp_init_hierarchical_barrier_thread(bt, thr_bar, nproc, gtid,
1132                                                        tid, team);
1133   // But if the entire team changes, we won't use oncore barrier at all
1134   if (team_change)
1135     old_leaf_kids = 0;
1136
1137 #if KMP_BARRIER_ICV_PUSH
1138   if (propagate_icvs) {
1139     __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team, tid,
1140                              FALSE);
1141     if (KMP_MASTER_TID(
1142             tid)) { // master already has copy in final destination; copy
1143       copy_icvs(&thr_bar->th_fixed_icvs,
1144                 &team->t.t_implicit_task_taskdata[tid].td_icvs);
1145     } else if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME &&
1146                thr_bar->use_oncore_barrier) { // optimization for inf blocktime
1147       if (!thr_bar->my_level) // I'm a leaf in the hierarchy (my_level==0)
1148         // leaves (on-core children) pull parent's fixed ICVs directly to local
1149         // ICV store
1150         copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
1151                   &thr_bar->parent_bar->th_fixed_icvs);
1152       // non-leaves will get ICVs piggybacked with b_go via NGO store
1153     } else { // blocktime is not infinite; pull ICVs from parent's fixed ICVs
1154       if (thr_bar->my_level) // not a leaf; copy ICVs to my fixed ICVs child can
1155         // access
1156         copy_icvs(&thr_bar->th_fixed_icvs, &thr_bar->parent_bar->th_fixed_icvs);
1157       else // leaves copy parent's fixed ICVs directly to local ICV store
1158         copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
1159                   &thr_bar->parent_bar->th_fixed_icvs);
1160     }
1161   }
1162 #endif // KMP_BARRIER_ICV_PUSH
1163
1164   // Now, release my children
1165   if (thr_bar->my_level) { // not a leaf
1166     kmp_int32 child_tid;
1167     kmp_uint32 last;
1168     if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME &&
1169         thr_bar->use_oncore_barrier) {
1170       if (KMP_MASTER_TID(tid)) { // do a flat release
1171         // Set local b_go to bump children via NGO store of the cache line
1172         // containing ICVs and b_go.
1173         thr_bar->b_go = KMP_BARRIER_STATE_BUMP;
1174         // Use ngo stores if available; b_go piggybacks in the last 8 bytes of
1175         // the cache line
1176         ngo_load(&thr_bar->th_fixed_icvs);
1177         // This loops over all the threads skipping only the leaf nodes in the
1178         // hierarchy
1179         for (child_tid = thr_bar->skip_per_level[1]; child_tid < (int)nproc;
1180              child_tid += thr_bar->skip_per_level[1]) {
1181           kmp_bstate_t *child_bar =
1182               &team->t.t_threads[child_tid]->th.th_bar[bt].bb;
1183           KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) "
1184                         "releasing T#%d(%d:%d)"
1185                         " go(%p): %u => %u\n",
1186                         gtid, team->t.t_id, tid,
1187                         __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
1188                         child_tid, &child_bar->b_go, child_bar->b_go,
1189                         child_bar->b_go + KMP_BARRIER_STATE_BUMP));
1190           // Use ngo store (if available) to both store ICVs and release child
1191           // via child's b_go
1192           ngo_store_go(&child_bar->th_fixed_icvs, &thr_bar->th_fixed_icvs);
1193         }
1194         ngo_sync();
1195       }
1196       TCW_8(thr_bar->b_go,
1197             KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
1198       // Now, release leaf children
1199       if (thr_bar->leaf_kids) { // if there are any
1200         // We test team_change on the off-chance that the level 1 team changed.
1201         if (team_change ||
1202             old_leaf_kids < thr_bar->leaf_kids) { // some old, some new
1203           if (old_leaf_kids) { // release old leaf kids
1204             thr_bar->b_go |= old_leaf_state;
1205           }
1206           // Release new leaf kids
1207           last = tid + thr_bar->skip_per_level[1];
1208           if (last > nproc)
1209             last = nproc;
1210           for (child_tid = tid + 1 + old_leaf_kids; child_tid < (int)last;
1211                ++child_tid) { // skip_per_level[0]=1
1212             kmp_info_t *child_thr = team->t.t_threads[child_tid];
1213             kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
1214             KA_TRACE(
1215                 20,
1216                 ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) releasing"
1217                  " T#%d(%d:%d) go(%p): %u => %u\n",
1218                  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
1219                  team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
1220                  child_bar->b_go + KMP_BARRIER_STATE_BUMP));
1221             // Release child using child's b_go flag
1222             ANNOTATE_BARRIER_BEGIN(child_thr);
1223             kmp_flag_64<> flag(&child_bar->b_go, child_thr);
1224             flag.release();
1225           }
1226         } else { // Release all children at once with leaf_state bits on my own
1227           // b_go flag
1228           thr_bar->b_go |= thr_bar->leaf_state;
1229         }
1230       }
1231     } else { // Blocktime is not infinite; do a simple hierarchical release
1232       for (int d = thr_bar->my_level - 1; d >= 0;
1233            --d) { // Release highest level threads first
1234         last = tid + thr_bar->skip_per_level[d + 1];
1235         kmp_uint32 skip = thr_bar->skip_per_level[d];
1236         if (last > nproc)
1237           last = nproc;
1238         for (child_tid = tid + skip; child_tid < (int)last; child_tid += skip) {
1239           kmp_info_t *child_thr = team->t.t_threads[child_tid];
1240           kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
1241           KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) "
1242                         "releasing T#%d(%d:%d) go(%p): %u => %u\n",
1243                         gtid, team->t.t_id, tid,
1244                         __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
1245                         child_tid, &child_bar->b_go, child_bar->b_go,
1246                         child_bar->b_go + KMP_BARRIER_STATE_BUMP));
1247           // Release child using child's b_go flag
1248           ANNOTATE_BARRIER_BEGIN(child_thr);
1249           kmp_flag_64<> flag(&child_bar->b_go, child_thr);
1250           flag.release();
1251         }
1252       }
1253     }
1254 #if KMP_BARRIER_ICV_PUSH
1255     if (propagate_icvs && !KMP_MASTER_TID(tid))
1256       // non-leaves copy ICVs from fixed ICVs to local dest
1257       copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
1258                 &thr_bar->th_fixed_icvs);
1259 #endif // KMP_BARRIER_ICV_PUSH
1260   }
1261   KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) exit for "
1262                 "barrier type %d\n",
1263                 gtid, team->t.t_id, tid, bt));
1264 }
1265 
1266 // End of Barrier Algorithms
1267 
// Type trait carrying the "cancelled" result of a barrier.
// - is_cancellable<true> wraps an ordinary, mutable bool (initially false).
// - is_cancellable<false> stores nothing: anything assigned to it is dropped
//   and it always converts to the compile-time constant false.
template <bool cancellable> struct is_cancellable {};

template <> struct is_cancellable<true> {
  bool value;

  is_cancellable() : value(false) {}
  is_cancellable(bool initial) : value(initial) {}

  operator bool() const { return value; }

  is_cancellable &operator=(bool updated) {
    value = updated;
    return *this;
  }
};

template <> struct is_cancellable<false> {
  constexpr operator bool() const { return false; }

  // Assignment is accepted so callers compile unchanged; it has no effect.
  is_cancellable &operator=(bool) { return *this; }
};
1286 
1287 // Internal function to do a barrier.
1288 /* If is_split is true, do a split barrier, otherwise, do a plain barrier
1289    If reduce is non-NULL, do a split reduction barrier, otherwise, do a split
1290    barrier
1291    When cancellable = false,
1292      Returns 0 if master thread, 1 if worker thread.
1293    When cancellable = true
1294      Returns 0 if not cancelled, 1 if cancelled.  */
1295 template <bool cancellable = false>
1296 static int __kmp_barrier_template(enum barrier_type bt, int gtid, int is_split,
1297                                   size_t reduce_size, void *reduce_data,
1298                                   void (*reduce)(void *, void *)) {
1299   KMP_TIME_PARTITIONED_BLOCK(OMP_plain_barrier);
1300   KMP_SET_THREAD_STATE_BLOCK(PLAIN_BARRIER);
1301   int tid = __kmp_tid_from_gtid(gtid);
1302   kmp_info_t *this_thr = __kmp_threads[gtid];
1303   kmp_team_t *team = this_thr->th.th_team;
1304   int status = 0;
1305   is_cancellable<cancellable> cancelled;
1306 #if OMPT_SUPPORT && OMPT_OPTIONAL
1307   ompt_data_t *my_task_data;
1308   ompt_data_t *my_parallel_data;
1309   void *return_address;
1310   ompt_sync_region_t barrier_kind;
1311 #endif
1312 
1313   KA_TRACE(15, ("__kmp_barrier: T#%d(%d:%d) has arrived\n", gtid,
1314                 __kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid)));
1315 
1316   ANNOTATE_BARRIER_BEGIN(&team->t.t_bar);
1317 #if OMPT_SUPPORT
1318   if (ompt_enabled.enabled) {
1319 #if OMPT_OPTIONAL
1320     my_task_data = OMPT_CUR_TASK_DATA(this_thr);
1321     my_parallel_data = OMPT_CUR_TEAM_DATA(this_thr);
1322     return_address = OMPT_LOAD_RETURN_ADDRESS(gtid);
1323     barrier_kind = __ompt_get_barrier_kind(bt, this_thr);
1324     if (ompt_enabled.ompt_callback_sync_region) {
1325       ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
1326           barrier_kind, ompt_scope_begin, my_parallel_data, my_task_data,
1327           return_address);
1328     }
1329     if (ompt_enabled.ompt_callback_sync_region_wait) {
1330       ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
1331           barrier_kind, ompt_scope_begin, my_parallel_data, my_task_data,
1332           return_address);
1333     }
1334 #endif
1335     // It is OK to report the barrier state after the barrier begin callback.
1336     // According to the OMPT specification, a compliant implementation may
1337     // even delay reporting this state until the barrier begins to wait.
1338     this_thr->th.ompt_thread_info.state = ompt_state_wait_barrier;
1339   }
1340 #endif
1341 
1342   if (!team->t.t_serialized) {
1343 #if USE_ITT_BUILD
1344     // This value will be used in itt notify events below.
1345     void *itt_sync_obj = NULL;
1346 #if USE_ITT_NOTIFY
1347     if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1348       itt_sync_obj = __kmp_itt_barrier_object(gtid, bt, 1);
1349 #endif
1350 #endif /* USE_ITT_BUILD */
1351     if (__kmp_tasking_mode == tskm_extra_barrier) {
1352       __kmp_tasking_barrier(team, this_thr, gtid);
1353       KA_TRACE(15,
1354                ("__kmp_barrier: T#%d(%d:%d) past tasking barrier\n", gtid,
1355                 __kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid)));
1356     }
1357 
1358     /* Copy the blocktime info to the thread, where __kmp_wait_template() can
1359        access it when the team struct is not guaranteed to exist. */
1360     // See note about the corresponding code in __kmp_join_barrier() being
1361     // performance-critical.
1362     if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
1363 #if KMP_USE_MONITOR
1364       this_thr->th.th_team_bt_intervals =
1365           team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
1366       this_thr->th.th_team_bt_set =
1367           team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
1368 #else
1369       this_thr->th.th_team_bt_intervals = KMP_BLOCKTIME_INTERVAL(team, tid);
1370 #endif
1371     }
1372 
1373 #if USE_ITT_BUILD
1374     if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1375       __kmp_itt_barrier_starting(gtid, itt_sync_obj);
1376 #endif /* USE_ITT_BUILD */
1377 #if USE_DEBUGGER
1378     // Let the debugger know: the thread arrived to the barrier and waiting.
1379     if (KMP_MASTER_TID(tid)) { // Master counter is stored in team structure.
1380       team->t.t_bar[bt].b_master_arrived += 1;
1381     } else {
1382       this_thr->th.th_bar[bt].bb.b_worker_arrived += 1;
1383     } // if
1384 #endif /* USE_DEBUGGER */
1385     if (reduce != NULL) {
1386       // KMP_DEBUG_ASSERT( is_split == TRUE );  // #C69956
1387       this_thr->th.th_local.reduce_data = reduce_data;
1388     }
1389 
1390     if (KMP_MASTER_TID(tid) && __kmp_tasking_mode != tskm_immediate_exec)
1391       // use 0 to only setup the current team if nthreads > 1
1392       __kmp_task_team_setup(this_thr, team, 0);
1393 
1394     if (cancellable) {
1395       cancelled = __kmp_linear_barrier_gather_cancellable(
1396           bt, this_thr, gtid, tid, reduce USE_ITT_BUILD_ARG(itt_sync_obj));
1397     } else {
1398       switch (__kmp_barrier_gather_pattern[bt]) {
1399       case bp_hyper_bar: {
1400         // don't set branch bits to 0; use linear
1401         KMP_ASSERT(__kmp_barrier_gather_branch_bits[bt]);
1402         __kmp_hyper_barrier_gather(bt, this_thr, gtid, tid,
1403                                    reduce USE_ITT_BUILD_ARG(itt_sync_obj));
1404         break;
1405       }
1406       case bp_hierarchical_bar: {
1407         __kmp_hierarchical_barrier_gather(
1408             bt, this_thr, gtid, tid, reduce USE_ITT_BUILD_ARG(itt_sync_obj));
1409         break;
1410       }
1411       case bp_tree_bar: {
1412         // don't set branch bits to 0; use linear
1413         KMP_ASSERT(__kmp_barrier_gather_branch_bits[bt]);
1414         __kmp_tree_barrier_gather(bt, this_thr, gtid, tid,
1415                                   reduce USE_ITT_BUILD_ARG(itt_sync_obj));
1416         break;
1417       }
1418       default: {
1419         __kmp_linear_barrier_gather(bt, this_thr, gtid, tid,
1420                                     reduce USE_ITT_BUILD_ARG(itt_sync_obj));
1421       }
1422       }
1423     }
1424 
1425     KMP_MB();
1426 
1427     if (KMP_MASTER_TID(tid)) {
1428       status = 0;
1429       if (__kmp_tasking_mode != tskm_immediate_exec && !cancelled) {
1430         __kmp_task_team_wait(this_thr, team USE_ITT_BUILD_ARG(itt_sync_obj));
1431       }
1432 #if USE_DEBUGGER
1433       // Let the debugger know: All threads are arrived and starting leaving the
1434       // barrier.
1435       team->t.t_bar[bt].b_team_arrived += 1;
1436 #endif
1437 
1438       if (__kmp_omp_cancellation) {
1439         kmp_int32 cancel_request = KMP_ATOMIC_LD_RLX(&team->t.t_cancel_request);
1440         // Reset cancellation flag for worksharing constructs
1441         if (cancel_request == cancel_loop ||
1442             cancel_request == cancel_sections) {
1443           KMP_ATOMIC_ST_RLX(&team->t.t_cancel_request, cancel_noreq);
1444         }
1445       }
1446 #if USE_ITT_BUILD
1447       /* TODO: In case of split reduction barrier, master thread may send
1448          acquired event early, before the final summation into the shared
1449          variable is done (final summation can be a long operation for array
1450          reductions).  */
1451       if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1452         __kmp_itt_barrier_middle(gtid, itt_sync_obj);
1453 #endif /* USE_ITT_BUILD */
1454 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1455       // Barrier - report frame end (only if active_level == 1)
1456       if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
1457           __kmp_forkjoin_frames_mode &&
1458           (this_thr->th.th_teams_microtask == NULL || // either not in teams
1459            this_thr->th.th_teams_size.nteams == 1) && // or inside single team
1460           team->t.t_active_level == 1) {
1461         ident_t *loc = __kmp_threads[gtid]->th.th_ident;
1462         kmp_uint64 cur_time = __itt_get_timestamp();
1463         kmp_info_t **other_threads = team->t.t_threads;
1464         int nproc = this_thr->th.th_team_nproc;
1465         int i;
1466         switch (__kmp_forkjoin_frames_mode) {
1467         case 1:
1468           __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0,
1469                                  loc, nproc);
1470           this_thr->th.th_frame_time = cur_time;
1471           break;
1472         case 2: // AC 2015-01-19: currently does not work for hierarchical (to
1473           // be fixed)
1474           __kmp_itt_frame_submit(gtid, this_thr->th.th_bar_min_time, cur_time,
1475                                  1, loc, nproc);
1476           break;
1477         case 3:
1478           if (__itt_metadata_add_ptr) {
1479             // Initialize with master's wait time
1480             kmp_uint64 delta = cur_time - this_thr->th.th_bar_arrive_time;
1481             // Set arrive time to zero to be able to check it in
1482             // __kmp_invoke_task(); the same is done inside the loop below
1483             this_thr->th.th_bar_arrive_time = 0;
1484             for (i = 1; i < nproc; ++i) {
1485               delta += (cur_time - other_threads[i]->th.th_bar_arrive_time);
1486               other_threads[i]->th.th_bar_arrive_time = 0;
1487             }
1488             __kmp_itt_metadata_imbalance(gtid, this_thr->th.th_frame_time,
1489                                          cur_time, delta,
1490                                          (kmp_uint64)(reduce != NULL));
1491           }
1492           __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0,
1493                                  loc, nproc);
1494           this_thr->th.th_frame_time = cur_time;
1495           break;
1496         }
1497       }
1498 #endif /* USE_ITT_BUILD */
1499     } else {
1500       status = 1;
1501 #if USE_ITT_BUILD
1502       if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1503         __kmp_itt_barrier_middle(gtid, itt_sync_obj);
1504 #endif /* USE_ITT_BUILD */
1505     }
1506     if ((status == 1 || !is_split) && !cancelled) {
1507       if (cancellable) {
1508         cancelled = __kmp_linear_barrier_release_cancellable(
1509             bt, this_thr, gtid, tid, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
1510       } else {
1511         switch (__kmp_barrier_release_pattern[bt]) {
1512         case bp_hyper_bar: {
1513           KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
1514           __kmp_hyper_barrier_release(bt, this_thr, gtid, tid,
1515                                       FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
1516           break;
1517         }
1518         case bp_hierarchical_bar: {
1519           __kmp_hierarchical_barrier_release(
1520               bt, this_thr, gtid, tid, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
1521           break;
1522         }
1523         case bp_tree_bar: {
1524           KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
1525           __kmp_tree_barrier_release(bt, this_thr, gtid, tid,
1526                                      FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
1527           break;
1528         }
1529         default: {
1530           __kmp_linear_barrier_release(bt, this_thr, gtid, tid,
1531                                        FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
1532         }
1533         }
1534       }
1535       if (__kmp_tasking_mode != tskm_immediate_exec && !cancelled) {
1536         __kmp_task_team_sync(this_thr, team);
1537       }
1538     }
1539 
1540 #if USE_ITT_BUILD
1541     /* GEH: TODO: Move this under if-condition above and also include in
1542        __kmp_end_split_barrier(). This will more accurately represent the actual
1543        release time of the threads for split barriers.  */
1544     if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1545       __kmp_itt_barrier_finished(gtid, itt_sync_obj);
1546 #endif /* USE_ITT_BUILD */
1547   } else { // Team is serialized.
1548     status = 0;
1549     if (__kmp_tasking_mode != tskm_immediate_exec) {
1550       if (this_thr->th.th_task_team != NULL) {
1551 #if USE_ITT_NOTIFY
1552         void *itt_sync_obj = NULL;
1553         if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
1554           itt_sync_obj = __kmp_itt_barrier_object(gtid, bt, 1);
1555           __kmp_itt_barrier_starting(gtid, itt_sync_obj);
1556         }
1557 #endif
1558 
1559         KMP_DEBUG_ASSERT(this_thr->th.th_task_team->tt.tt_found_proxy_tasks ==
1560                          TRUE);
1561         __kmp_task_team_wait(this_thr, team USE_ITT_BUILD_ARG(itt_sync_obj));
1562         __kmp_task_team_setup(this_thr, team, 0);
1563 
1564 #if USE_ITT_BUILD
1565         if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1566           __kmp_itt_barrier_finished(gtid, itt_sync_obj);
1567 #endif /* USE_ITT_BUILD */
1568       }
1569     }
1570   }
1571   KA_TRACE(15, ("__kmp_barrier: T#%d(%d:%d) is leaving with return value %d\n",
1572                 gtid, __kmp_team_from_gtid(gtid)->t.t_id,
1573                 __kmp_tid_from_gtid(gtid), status));
1574 
1575 #if OMPT_SUPPORT
1576   if (ompt_enabled.enabled) {
1577 #if OMPT_OPTIONAL
1578     if (ompt_enabled.ompt_callback_sync_region_wait) {
1579       ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
1580           barrier_kind, ompt_scope_end, my_parallel_data, my_task_data,
1581           return_address);
1582     }
1583     if (ompt_enabled.ompt_callback_sync_region) {
1584       ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
1585           barrier_kind, ompt_scope_end, my_parallel_data, my_task_data,
1586           return_address);
1587     }
1588 #endif
1589     this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
1590   }
1591 #endif
1592   ANNOTATE_BARRIER_END(&team->t.t_bar);
1593 
1594   if (cancellable)
1595     return (int)cancelled;
1596   return status;
1597 }
1598 
1599 // Returns 0 if master thread, 1 if worker thread.
1600 int __kmp_barrier(enum barrier_type bt, int gtid, int is_split,
1601                   size_t reduce_size, void *reduce_data,
1602                   void (*reduce)(void *, void *)) {
1603   return __kmp_barrier_template<>(bt, gtid, is_split, reduce_size, reduce_data,
1604                                   reduce);
1605 }
1606 
#if defined(KMP_GOMP_COMPAT)
// Cancellable plain barrier, compiled only when KMP_GOMP_COMPAT is defined.
// Returns 1 if cancelled, 0 otherwise.
int __kmp_barrier_gomp_cancel(int gtid) {
  // With cancellation switched off, run an ordinary plain barrier and report
  // "not cancelled".
  if (!__kmp_omp_cancellation) {
    __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);
    return FALSE;
  }

  const int was_cancelled = __kmp_barrier_template<true>(
      bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);
  if (was_cancelled) {
    const int tid = __kmp_tid_from_gtid(gtid);
    kmp_info_t *thr = __kmp_threads[gtid];
    // The master has nothing to revert; a worker takes one
    // KMP_BARRIER_STATE_BUMP back off its private b_arrived flag.
    if (!KMP_MASTER_TID(tid))
      thr->th.th_bar[bs_plain_barrier].bb.b_arrived -= KMP_BARRIER_STATE_BUMP;
  }
  return was_cancelled;
}
#endif
1630 
1631 void __kmp_end_split_barrier(enum barrier_type bt, int gtid) {
1632   KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_end_split_barrier);
1633   KMP_SET_THREAD_STATE_BLOCK(PLAIN_BARRIER);
1634   int tid = __kmp_tid_from_gtid(gtid);
1635   kmp_info_t *this_thr = __kmp_threads[gtid];
1636   kmp_team_t *team = this_thr->th.th_team;
1637 
1638   ANNOTATE_BARRIER_BEGIN(&team->t.t_bar);
1639   if (!team->t.t_serialized) {
1640     if (KMP_MASTER_GTID(gtid)) {
1641       switch (__kmp_barrier_release_pattern[bt]) {
1642       case bp_hyper_bar: {
1643         KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
1644         __kmp_hyper_barrier_release(bt, this_thr, gtid, tid,
1645                                     FALSE USE_ITT_BUILD_ARG(NULL));
1646         break;
1647       }
1648       case bp_hierarchical_bar: {
1649         __kmp_hierarchical_barrier_release(bt, this_thr, gtid, tid,
1650                                            FALSE USE_ITT_BUILD_ARG(NULL));
1651         break;
1652       }
1653       case bp_tree_bar: {
1654         KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
1655         __kmp_tree_barrier_release(bt, this_thr, gtid, tid,
1656                                    FALSE USE_ITT_BUILD_ARG(NULL));
1657         break;
1658       }
1659       default: {
1660         __kmp_linear_barrier_release(bt, this_thr, gtid, tid,
1661                                      FALSE USE_ITT_BUILD_ARG(NULL));
1662       }
1663       }
1664       if (__kmp_tasking_mode != tskm_immediate_exec) {
1665         __kmp_task_team_sync(this_thr, team);
1666       } // if
1667     }
1668   }
1669   ANNOTATE_BARRIER_END(&team->t.t_bar);
1670 }
1671 
1672 void __kmp_join_barrier(int gtid) {
1673   KMP_TIME_PARTITIONED_BLOCK(OMP_join_barrier);
1674   KMP_SET_THREAD_STATE_BLOCK(FORK_JOIN_BARRIER);
1675   kmp_info_t *this_thr = __kmp_threads[gtid];
1676   kmp_team_t *team;
1677   kmp_uint nproc;
1678   kmp_info_t *master_thread;
1679   int tid;
1680 #ifdef KMP_DEBUG
1681   int team_id;
1682 #endif /* KMP_DEBUG */
1683 #if USE_ITT_BUILD
1684   void *itt_sync_obj = NULL;
1685 #if USE_ITT_NOTIFY
1686   if (__itt_sync_create_ptr || KMP_ITT_DEBUG) // Don't call routine without need
1687     // Get object created at fork_barrier
1688     itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
1689 #endif
1690 #endif /* USE_ITT_BUILD */
1691   KMP_MB();
1692 
1693   // Get current info
1694   team = this_thr->th.th_team;
1695   nproc = this_thr->th.th_team_nproc;
1696   KMP_DEBUG_ASSERT((int)nproc == team->t.t_nproc);
1697   tid = __kmp_tid_from_gtid(gtid);
1698 #ifdef KMP_DEBUG
1699   team_id = team->t.t_id;
1700 #endif /* KMP_DEBUG */
1701   master_thread = this_thr->th.th_team_master;
1702 #ifdef KMP_DEBUG
1703   if (master_thread != team->t.t_threads[0]) {
1704     __kmp_print_structure();
1705   }
1706 #endif /* KMP_DEBUG */
1707   KMP_DEBUG_ASSERT(master_thread == team->t.t_threads[0]);
1708   KMP_MB();
1709 
1710   // Verify state
1711   KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
1712   KMP_DEBUG_ASSERT(TCR_PTR(this_thr->th.th_team));
1713   KMP_DEBUG_ASSERT(TCR_PTR(this_thr->th.th_root));
1714   KMP_DEBUG_ASSERT(this_thr == team->t.t_threads[tid]);
1715   KA_TRACE(10, ("__kmp_join_barrier: T#%d(%d:%d) arrived at join barrier\n",
1716                 gtid, team_id, tid));
1717 
1718   ANNOTATE_BARRIER_BEGIN(&team->t.t_bar);
1719 #if OMPT_SUPPORT
1720   if (ompt_enabled.enabled) {
1721 #if OMPT_OPTIONAL
1722     ompt_data_t *my_task_data;
1723     ompt_data_t *my_parallel_data;
1724     void *codeptr = NULL;
1725     int ds_tid = this_thr->th.th_info.ds.ds_tid;
1726     if (KMP_MASTER_TID(ds_tid) &&
1727         (ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) ||
1728          ompt_callbacks.ompt_callback(ompt_callback_sync_region)))
1729       codeptr = team->t.ompt_team_info.master_return_address;
1730     my_task_data = OMPT_CUR_TASK_DATA(this_thr);
1731     my_parallel_data = OMPT_CUR_TEAM_DATA(this_thr);
1732     if (ompt_enabled.ompt_callback_sync_region) {
1733       ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
1734           ompt_sync_region_barrier_implicit, ompt_scope_begin, my_parallel_data,
1735           my_task_data, codeptr);
1736     }
1737     if (ompt_enabled.ompt_callback_sync_region_wait) {
1738       ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
1739           ompt_sync_region_barrier_implicit, ompt_scope_begin, my_parallel_data,
1740           my_task_data, codeptr);
1741     }
1742     if (!KMP_MASTER_TID(ds_tid))
1743       this_thr->th.ompt_thread_info.task_data = *OMPT_CUR_TASK_DATA(this_thr);
1744 #endif
1745     this_thr->th.ompt_thread_info.state = ompt_state_wait_barrier_implicit;
1746   }
1747 #endif
1748 
1749   if (__kmp_tasking_mode == tskm_extra_barrier) {
1750     __kmp_tasking_barrier(team, this_thr, gtid);
1751     KA_TRACE(10, ("__kmp_join_barrier: T#%d(%d:%d) past taking barrier\n", gtid,
1752                   team_id, tid));
1753   }
1754 #ifdef KMP_DEBUG
1755   if (__kmp_tasking_mode != tskm_immediate_exec) {
1756     KA_TRACE(20, ("__kmp_join_barrier: T#%d, old team = %d, old task_team = "
1757                   "%p, th_task_team = %p\n",
1758                   __kmp_gtid_from_thread(this_thr), team_id,
1759                   team->t.t_task_team[this_thr->th.th_task_state],
1760                   this_thr->th.th_task_team));
1761     KMP_DEBUG_ASSERT(this_thr->th.th_task_team ==
1762                      team->t.t_task_team[this_thr->th.th_task_state]);
1763   }
1764 #endif /* KMP_DEBUG */
1765 
1766   /* Copy the blocktime info to the thread, where __kmp_wait_template() can
1767      access it when the team struct is not guaranteed to exist. Doing these
1768      loads causes a cache miss slows down EPCC parallel by 2x. As a workaround,
1769      we do not perform the copy if blocktime=infinite, since the values are not
1770      used by __kmp_wait_template() in that case. */
1771   if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
1772 #if KMP_USE_MONITOR
1773     this_thr->th.th_team_bt_intervals =
1774         team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
1775     this_thr->th.th_team_bt_set =
1776         team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
1777 #else
1778     this_thr->th.th_team_bt_intervals = KMP_BLOCKTIME_INTERVAL(team, tid);
1779 #endif
1780   }
1781 
1782 #if USE_ITT_BUILD
1783   if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1784     __kmp_itt_barrier_starting(gtid, itt_sync_obj);
1785 #endif /* USE_ITT_BUILD */
1786 
1787   switch (__kmp_barrier_gather_pattern[bs_forkjoin_barrier]) {
1788   case bp_hyper_bar: {
1789     KMP_ASSERT(__kmp_barrier_gather_branch_bits[bs_forkjoin_barrier]);
1790     __kmp_hyper_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
1791                                NULL USE_ITT_BUILD_ARG(itt_sync_obj));
1792     break;
1793   }
1794   case bp_hierarchical_bar: {
1795     __kmp_hierarchical_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
1796                                       NULL USE_ITT_BUILD_ARG(itt_sync_obj));
1797     break;
1798   }
1799   case bp_tree_bar: {
1800     KMP_ASSERT(__kmp_barrier_gather_branch_bits[bs_forkjoin_barrier]);
1801     __kmp_tree_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
1802                               NULL USE_ITT_BUILD_ARG(itt_sync_obj));
1803     break;
1804   }
1805   default: {
1806     __kmp_linear_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
1807                                 NULL USE_ITT_BUILD_ARG(itt_sync_obj));
1808   }
1809   }
1810 
1811   /* From this point on, the team data structure may be deallocated at any time
1812      by the master thread - it is unsafe to reference it in any of the worker
1813      threads. Any per-team data items that need to be referenced before the
1814      end of the barrier should be moved to the kmp_task_team_t structs.  */
1815   if (KMP_MASTER_TID(tid)) {
1816     if (__kmp_tasking_mode != tskm_immediate_exec) {
1817       __kmp_task_team_wait(this_thr, team USE_ITT_BUILD_ARG(itt_sync_obj));
1818     }
1819     if (__kmp_display_affinity) {
1820       KMP_CHECK_UPDATE(team->t.t_display_affinity, 0);
1821     }
1822 #if KMP_STATS_ENABLED
1823     // Have master thread flag the workers to indicate they are now waiting for
1824     // next parallel region, Also wake them up so they switch their timers to
1825     // idle.
1826     for (int i = 0; i < team->t.t_nproc; ++i) {
1827       kmp_info_t *team_thread = team->t.t_threads[i];
1828       if (team_thread == this_thr)
1829         continue;
1830       team_thread->th.th_stats->setIdleFlag();
1831       if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME &&
1832           team_thread->th.th_sleep_loc != NULL)
1833         __kmp_null_resume_wrapper(__kmp_gtid_from_thread(team_thread),
1834                                   team_thread->th.th_sleep_loc);
1835     }
1836 #endif
1837 #if USE_ITT_BUILD
1838     if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1839       __kmp_itt_barrier_middle(gtid, itt_sync_obj);
1840 #endif /* USE_ITT_BUILD */
1841 
1842 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1843     // Join barrier - report frame end
1844     if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
1845         __kmp_forkjoin_frames_mode &&
1846         (this_thr->th.th_teams_microtask == NULL || // either not in teams
1847          this_thr->th.th_teams_size.nteams == 1) && // or inside single team
1848         team->t.t_active_level == 1) {
1849       kmp_uint64 cur_time = __itt_get_timestamp();
1850       ident_t *loc = team->t.t_ident;
1851       kmp_info_t **other_threads = team->t.t_threads;
1852       int nproc = this_thr->th.th_team_nproc;
1853       int i;
1854       switch (__kmp_forkjoin_frames_mode) {
1855       case 1:
1856         __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0,
1857                                loc, nproc);
1858         break;
1859       case 2:
1860         __kmp_itt_frame_submit(gtid, this_thr->th.th_bar_min_time, cur_time, 1,
1861                                loc, nproc);
1862         break;
1863       case 3:
1864         if (__itt_metadata_add_ptr) {
1865           // Initialize with master's wait time
1866           kmp_uint64 delta = cur_time - this_thr->th.th_bar_arrive_time;
1867           // Set arrive time to zero to be able to check it in
1868           // __kmp_invoke_task(); the same is done inside the loop below
1869           this_thr->th.th_bar_arrive_time = 0;
1870           for (i = 1; i < nproc; ++i) {
1871             delta += (cur_time - other_threads[i]->th.th_bar_arrive_time);
1872             other_threads[i]->th.th_bar_arrive_time = 0;
1873           }
1874           __kmp_itt_metadata_imbalance(gtid, this_thr->th.th_frame_time,
1875                                        cur_time, delta, 0);
1876         }
1877         __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0,
1878                                loc, nproc);
1879         this_thr->th.th_frame_time = cur_time;
1880         break;
1881       }
1882     }
1883 #endif /* USE_ITT_BUILD */
1884   }
1885 #if USE_ITT_BUILD
1886   else {
1887     if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1888       __kmp_itt_barrier_middle(gtid, itt_sync_obj);
1889   }
1890 #endif /* USE_ITT_BUILD */
1891 
1892 #if KMP_DEBUG
1893   if (KMP_MASTER_TID(tid)) {
1894     KA_TRACE(
1895         15,
1896         ("__kmp_join_barrier: T#%d(%d:%d) says all %d team threads arrived\n",
1897          gtid, team_id, tid, nproc));
1898   }
1899 #endif /* KMP_DEBUG */
1900 
1901   // TODO now, mark worker threads as done so they may be disbanded
1902   KMP_MB(); // Flush all pending memory write invalidates.
1903   KA_TRACE(10,
1904            ("__kmp_join_barrier: T#%d(%d:%d) leaving\n", gtid, team_id, tid));
1905 
1906   ANNOTATE_BARRIER_END(&team->t.t_bar);
1907 }
1908 
1909 // TODO release worker threads' fork barriers as we are ready instead of all at
1910 // once
1911 void __kmp_fork_barrier(int gtid, int tid) {
1912   KMP_TIME_PARTITIONED_BLOCK(OMP_fork_barrier);
1913   KMP_SET_THREAD_STATE_BLOCK(FORK_JOIN_BARRIER);
1914   kmp_info_t *this_thr = __kmp_threads[gtid];
1915   kmp_team_t *team = (tid == 0) ? this_thr->th.th_team : NULL;
1916 #if USE_ITT_BUILD
1917   void *itt_sync_obj = NULL;
1918 #endif /* USE_ITT_BUILD */
1919   if (team)
1920     ANNOTATE_BARRIER_END(&team->t.t_bar);
1921 
1922   KA_TRACE(10, ("__kmp_fork_barrier: T#%d(%d:%d) has arrived\n", gtid,
1923                 (team != NULL) ? team->t.t_id : -1, tid));
1924 
1925   // th_team pointer only valid for master thread here
1926   if (KMP_MASTER_TID(tid)) {
1927 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1928     if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
1929       // Create itt barrier object
1930       itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 1);
1931       __kmp_itt_barrier_middle(gtid, itt_sync_obj); // Call acquired/releasing
1932     }
1933 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
1934 
1935 #ifdef KMP_DEBUG
1936     kmp_info_t **other_threads = team->t.t_threads;
1937     int i;
1938 
1939     // Verify state
1940     KMP_MB();
1941 
1942     for (i = 1; i < team->t.t_nproc; ++i) {
1943       KA_TRACE(500,
1944                ("__kmp_fork_barrier: T#%d(%d:0) checking T#%d(%d:%d) fork go "
1945                 "== %u.\n",
1946                 gtid, team->t.t_id, other_threads[i]->th.th_info.ds.ds_gtid,
1947                 team->t.t_id, other_threads[i]->th.th_info.ds.ds_tid,
1948                 other_threads[i]->th.th_bar[bs_forkjoin_barrier].bb.b_go));
1949       KMP_DEBUG_ASSERT(
1950           (TCR_4(other_threads[i]->th.th_bar[bs_forkjoin_barrier].bb.b_go) &
1951            ~(KMP_BARRIER_SLEEP_STATE)) == KMP_INIT_BARRIER_STATE);
1952       KMP_DEBUG_ASSERT(other_threads[i]->th.th_team == team);
1953     }
1954 #endif
1955 
1956     if (__kmp_tasking_mode != tskm_immediate_exec) {
1957       // 0 indicates setup current task team if nthreads > 1
1958       __kmp_task_team_setup(this_thr, team, 0);
1959     }
1960 
1961     /* The master thread may have changed its blocktime between the join barrier
1962        and the fork barrier. Copy the blocktime info to the thread, where
1963        __kmp_wait_template() can access it when the team struct is not
1964        guaranteed to exist. */
1965     // See note about the corresponding code in __kmp_join_barrier() being
1966     // performance-critical
1967     if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
1968 #if KMP_USE_MONITOR
1969       this_thr->th.th_team_bt_intervals =
1970           team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
1971       this_thr->th.th_team_bt_set =
1972           team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
1973 #else
1974       this_thr->th.th_team_bt_intervals = KMP_BLOCKTIME_INTERVAL(team, tid);
1975 #endif
1976     }
1977   } // master
1978 
1979   switch (__kmp_barrier_release_pattern[bs_forkjoin_barrier]) {
1980   case bp_hyper_bar: {
1981     KMP_ASSERT(__kmp_barrier_release_branch_bits[bs_forkjoin_barrier]);
1982     __kmp_hyper_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
1983                                 TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
1984     break;
1985   }
1986   case bp_hierarchical_bar: {
1987     __kmp_hierarchical_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
1988                                        TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
1989     break;
1990   }
1991   case bp_tree_bar: {
1992     KMP_ASSERT(__kmp_barrier_release_branch_bits[bs_forkjoin_barrier]);
1993     __kmp_tree_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
1994                                TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
1995     break;
1996   }
1997   default: {
1998     __kmp_linear_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
1999                                  TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
2000   }
2001   }
2002 
2003 #if OMPT_SUPPORT
2004   if (ompt_enabled.enabled &&
2005       this_thr->th.ompt_thread_info.state == ompt_state_wait_barrier_implicit) {
2006     int ds_tid = this_thr->th.th_info.ds.ds_tid;
2007     ompt_data_t *task_data = (team)
2008                                  ? OMPT_CUR_TASK_DATA(this_thr)
2009                                  : &(this_thr->th.ompt_thread_info.task_data);
2010     this_thr->th.ompt_thread_info.state = ompt_state_overhead;
2011 #if OMPT_OPTIONAL
2012     void *codeptr = NULL;
2013     if (KMP_MASTER_TID(ds_tid) &&
2014         (ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) ||
2015          ompt_callbacks.ompt_callback(ompt_callback_sync_region)))
2016       codeptr = team->t.ompt_team_info.master_return_address;
2017     if (ompt_enabled.ompt_callback_sync_region_wait) {
2018       ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
2019           ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data,
2020           codeptr);
2021     }
2022     if (ompt_enabled.ompt_callback_sync_region) {
2023       ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
2024           ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data,
2025           codeptr);
2026     }
2027 #endif
2028     if (!KMP_MASTER_TID(ds_tid) && ompt_enabled.ompt_callback_implicit_task) {
2029       ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
2030           ompt_scope_end, NULL, task_data, 0, ds_tid, ompt_task_implicit); // TODO: Can this be ompt_task_initial?
2031     }
2032   }
2033 #endif
2034 
2035   // Early exit for reaping threads releasing forkjoin barrier
2036   if (TCR_4(__kmp_global.g.g_done)) {
2037     this_thr->th.th_task_team = NULL;
2038 
2039 #if USE_ITT_BUILD && USE_ITT_NOTIFY
2040     if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
2041       if (!KMP_MASTER_TID(tid)) {
2042         itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
2043         if (itt_sync_obj)
2044           __kmp_itt_barrier_finished(gtid, itt_sync_obj);
2045       }
2046     }
2047 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
2048     KA_TRACE(10, ("__kmp_fork_barrier: T#%d is leaving early\n", gtid));
2049     return;
2050   }
2051 
2052   /* We can now assume that a valid team structure has been allocated by the
2053      master and propagated to all worker threads. The current thread, however,
2054      may not be part of the team, so we can't blindly assume that the team
2055      pointer is non-null.  */
2056   team = (kmp_team_t *)TCR_PTR(this_thr->th.th_team);
2057   KMP_DEBUG_ASSERT(team != NULL);
2058   tid = __kmp_tid_from_gtid(gtid);
2059 
2060 #if KMP_BARRIER_ICV_PULL
2061   /* Master thread's copy of the ICVs was set up on the implicit taskdata in
2062      __kmp_reinitialize_team. __kmp_fork_call() assumes the master thread's
2063      implicit task has this data before this function is called. We cannot
2064      modify __kmp_fork_call() to look at the fixed ICVs in the master's thread
2065      struct, because it is not always the case that the threads arrays have
2066      been allocated when __kmp_fork_call() is executed. */
2067   {
2068     KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(USER_icv_copy);
2069     if (!KMP_MASTER_TID(tid)) { // master thread already has ICVs
2070       // Copy the initial ICVs from the master's thread struct to the implicit
2071       // task for this tid.
2072       KA_TRACE(10,
2073                ("__kmp_fork_barrier: T#%d(%d) is PULLing ICVs\n", gtid, tid));
2074       __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team,
2075                                tid, FALSE);
2076       copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
2077                 &team->t.t_threads[0]
2078                      ->th.th_bar[bs_forkjoin_barrier]
2079                      .bb.th_fixed_icvs);
2080     }
2081   }
2082 #endif // KMP_BARRIER_ICV_PULL
2083 
2084   if (__kmp_tasking_mode != tskm_immediate_exec) {
2085     __kmp_task_team_sync(this_thr, team);
2086   }
2087 
2088 #if KMP_AFFINITY_SUPPORTED
2089   kmp_proc_bind_t proc_bind = team->t.t_proc_bind;
2090   if (proc_bind == proc_bind_intel) {
2091     // Call dynamic affinity settings
2092     if (__kmp_affinity_type == affinity_balanced && team->t.t_size_changed) {
2093       __kmp_balanced_affinity(this_thr, team->t.t_nproc);
2094     }
2095   } else if (proc_bind != proc_bind_false) {
2096     if (this_thr->th.th_new_place == this_thr->th.th_current_place) {
2097       KA_TRACE(100, ("__kmp_fork_barrier: T#%d already in correct place %d\n",
2098                      __kmp_gtid_from_thread(this_thr),
2099                      this_thr->th.th_current_place));
2100     } else {
2101       __kmp_affinity_set_place(gtid);
2102     }
2103   }
2104 #endif // KMP_AFFINITY_SUPPORTED
2105   // Perform the display affinity functionality
2106   if (__kmp_display_affinity) {
2107     if (team->t.t_display_affinity
2108 #if KMP_AFFINITY_SUPPORTED
2109         || (__kmp_affinity_type == affinity_balanced && team->t.t_size_changed)
2110 #endif
2111             ) {
2112       // NULL means use the affinity-format-var ICV
2113       __kmp_aux_display_affinity(gtid, NULL);
2114       this_thr->th.th_prev_num_threads = team->t.t_nproc;
2115       this_thr->th.th_prev_level = team->t.t_level;
2116     }
2117   }
2118   if (!KMP_MASTER_TID(tid))
2119     KMP_CHECK_UPDATE(this_thr->th.th_def_allocator, team->t.t_def_allocator);
2120 
2121 #if USE_ITT_BUILD && USE_ITT_NOTIFY
2122   if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
2123     if (!KMP_MASTER_TID(tid)) {
2124       // Get correct barrier object
2125       itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
2126       __kmp_itt_barrier_finished(gtid, itt_sync_obj); // Workers call acquired
2127     } // (prepare called inside barrier_release)
2128   }
2129 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
2130   ANNOTATE_BARRIER_END(&team->t.t_bar);
2131   KA_TRACE(10, ("__kmp_fork_barrier: T#%d(%d:%d) is leaving\n", gtid,
2132                 team->t.t_id, tid));
2133 }
2134 
2135 void __kmp_setup_icv_copy(kmp_team_t *team, int new_nproc,
2136                           kmp_internal_control_t *new_icvs, ident_t *loc) {
2137   KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_setup_icv_copy);
2138 
2139   KMP_DEBUG_ASSERT(team && new_nproc && new_icvs);
2140   KMP_DEBUG_ASSERT((!TCR_4(__kmp_init_parallel)) || new_icvs->nproc);
2141 
2142 /* Master thread's copy of the ICVs was set up on the implicit taskdata in
2143    __kmp_reinitialize_team. __kmp_fork_call() assumes the master thread's
2144    implicit task has this data before this function is called. */
2145 #if KMP_BARRIER_ICV_PULL
2146   /* Copy ICVs to master's thread structure into th_fixed_icvs (which remains
2147      untouched), where all of the worker threads can access them and make their
2148      own copies after the barrier. */
2149   KMP_DEBUG_ASSERT(team->t.t_threads[0]); // The threads arrays should be
2150   // allocated at this point
2151   copy_icvs(
2152       &team->t.t_threads[0]->th.th_bar[bs_forkjoin_barrier].bb.th_fixed_icvs,
2153       new_icvs);
2154   KF_TRACE(10, ("__kmp_setup_icv_copy: PULL: T#%d this_thread=%p team=%p\n", 0,
2155                 team->t.t_threads[0], team));
2156 #elif KMP_BARRIER_ICV_PUSH
2157   // The ICVs will be propagated in the fork barrier, so nothing needs to be
2158   // done here.
2159   KF_TRACE(10, ("__kmp_setup_icv_copy: PUSH: T#%d this_thread=%p team=%p\n", 0,
2160                 team->t.t_threads[0], team));
2161 #else
2162   // Copy the ICVs to each of the non-master threads.  This takes O(nthreads)
2163   // time.
2164   ngo_load(new_icvs);
2165   KMP_DEBUG_ASSERT(team->t.t_threads[0]); // The threads arrays should be
2166   // allocated at this point
2167   for (int f = 1; f < new_nproc; ++f) { // Skip the master thread
2168     // TODO: GEH - pass in better source location info since usually NULL here
2169     KF_TRACE(10, ("__kmp_setup_icv_copy: LINEAR: T#%d this_thread=%p team=%p\n",
2170                   f, team->t.t_threads[f], team));
2171     __kmp_init_implicit_task(loc, team->t.t_threads[f], team, f, FALSE);
2172     ngo_store_icvs(&team->t.t_implicit_task_taskdata[f].td_icvs, new_icvs);
2173     KF_TRACE(10, ("__kmp_setup_icv_copy: LINEAR: T#%d this_thread=%p team=%p\n",
2174                   f, team->t.t_threads[f], team));
2175   }
2176   ngo_sync();
2177 #endif // KMP_BARRIER_ICV_PULL
2178 }
2179