xref: /freebsd/contrib/llvm-project/openmp/runtime/src/kmp_barrier.cpp (revision d13def78ccef6dbc25c2e197089ee5fc4d7b82c3)
1 /*
2  * kmp_barrier.cpp
3  */
4 
5 //===----------------------------------------------------------------------===//
6 //
7 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8 // See https://llvm.org/LICENSE.txt for license information.
9 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "kmp.h"
14 #include "kmp_wait_release.h"
15 #include "kmp_itt.h"
16 #include "kmp_os.h"
17 #include "kmp_stats.h"
18 #include "ompt-specific.h"
19 
20 #if KMP_MIC
21 #include <immintrin.h>
22 #define USE_NGO_STORES 1
23 #endif // KMP_MIC
24 
25 #include "tsan_annotations.h"
26 
// Helper macros used by the release paths to copy ICVs / go-flags.
//
// Two interchangeable variants are provided; call sites use the same four
// macro names in either configuration:
//   ngo_load(src)            - prepare the source data for subsequent stores
//   ngo_store_icvs(dst, src) - copy one ICV block to dst
//   ngo_store_go(dst, src)   - copy one cache line (go flag + ICVs) to dst
//   ngo_sync()               - order the preceding stores
#if KMP_MIC && USE_NGO_STORES
// ICV copying
// KMP_MIC variant: ngo_load declares a local __m512d named Vt in the calling
// scope; both store macros ignore their `src` argument and write Vt, so
// ngo_load must have been expanded earlier in the same scope.
#define ngo_load(src) __m512d Vt = _mm512_load_pd((void *)(src))
#define ngo_store_icvs(dst, src) _mm512_storenrngo_pd((void *)(dst), Vt)
#define ngo_store_go(dst, src) _mm512_storenrngo_pd((void *)(dst), Vt)
// A locked no-op add on the stack, with a compiler memory clobber.
#define ngo_sync() __asm__ volatile("lock; addl $0,0(%%rsp)" ::: "memory")
#else
// Generic variant: load and sync are no-ops, stores are plain copies.
#define ngo_load(src) ((void)0)
#define ngo_store_icvs(dst, src) copy_icvs((dst), (src))
#define ngo_store_go(dst, src) KMP_MEMCPY((dst), (src), CACHE_LINE)
#define ngo_sync() ((void)0)
#endif /* KMP_MIC && USE_NGO_STORES */
39 
40 void __kmp_print_structure(void); // Forward declaration
41 
42 // ---------------------------- Barrier Algorithms ----------------------------
43 
44 // Linear Barrier
45 template <bool cancellable = false>
46 static bool __kmp_linear_barrier_gather_template(
47     enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
48     void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
49   KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_linear_gather);
50   kmp_team_t *team = this_thr->th.th_team;
51   kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
52   kmp_info_t **other_threads = team->t.t_threads;
53 
54   KA_TRACE(
55       20,
56       ("__kmp_linear_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
57        gtid, team->t.t_id, tid, bt));
58   KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
59 
60 #if USE_ITT_BUILD && USE_ITT_NOTIFY
61   // Barrier imbalance - save arrive time to the thread
62   if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
63     this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time =
64         __itt_get_timestamp();
65   }
66 #endif
67   // We now perform a linear reduction to signal that all of the threads have
68   // arrived.
69   if (!KMP_MASTER_TID(tid)) {
70     KA_TRACE(20,
71              ("__kmp_linear_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d)"
72               "arrived(%p): %llu => %llu\n",
73               gtid, team->t.t_id, tid, __kmp_gtid_from_tid(0, team),
74               team->t.t_id, 0, &thr_bar->b_arrived, thr_bar->b_arrived,
75               thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
76     // Mark arrival to master thread
77     /* After performing this write, a worker thread may not assume that the team
78        is valid any more - it could be deallocated by the master thread at any
79        time. */
80     ANNOTATE_BARRIER_BEGIN(this_thr);
81     kmp_flag_64 flag(&thr_bar->b_arrived, other_threads[0]);
82     flag.release();
83   } else {
84     kmp_balign_team_t *team_bar = &team->t.t_bar[bt];
85     int nproc = this_thr->th.th_team_nproc;
86     int i;
87     // Don't have to worry about sleep bit here or atomic since team setting
88     kmp_uint64 new_state = team_bar->b_arrived + KMP_BARRIER_STATE_BUMP;
89 
90     // Collect all the worker team member threads.
91     for (i = 1; i < nproc; ++i) {
92 #if KMP_CACHE_MANAGE
93       // Prefetch next thread's arrived count
94       if (i + 1 < nproc)
95         KMP_CACHE_PREFETCH(&other_threads[i + 1]->th.th_bar[bt].bb.b_arrived);
96 #endif /* KMP_CACHE_MANAGE */
97       KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%d) "
98                     "arrived(%p) == %llu\n",
99                     gtid, team->t.t_id, tid, __kmp_gtid_from_tid(i, team),
100                     team->t.t_id, i,
101                     &other_threads[i]->th.th_bar[bt].bb.b_arrived, new_state));
102 
103       // Wait for worker thread to arrive
104       kmp_flag_64 flag(&other_threads[i]->th.th_bar[bt].bb.b_arrived,
105                        new_state);
106       if (cancellable) {
107         bool cancelled = flag.wait_cancellable_nosleep(
108             this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
109         if (cancelled)
110           return true;
111       } else {
112         flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
113       }
114       ANNOTATE_BARRIER_END(other_threads[i]);
115 #if USE_ITT_BUILD && USE_ITT_NOTIFY
116       // Barrier imbalance - write min of the thread time and the other thread
117       // time to the thread.
118       if (__kmp_forkjoin_frames_mode == 2) {
119         this_thr->th.th_bar_min_time = KMP_MIN(
120             this_thr->th.th_bar_min_time, other_threads[i]->th.th_bar_min_time);
121       }
122 #endif
123       if (reduce) {
124         KA_TRACE(100,
125                  ("__kmp_linear_barrier_gather: T#%d(%d:%d) += T#%d(%d:%d)\n",
126                   gtid, team->t.t_id, tid, __kmp_gtid_from_tid(i, team),
127                   team->t.t_id, i));
128         ANNOTATE_REDUCE_AFTER(reduce);
129         OMPT_REDUCTION_DECL(this_thr, gtid);
130         OMPT_REDUCTION_BEGIN;
131         (*reduce)(this_thr->th.th_local.reduce_data,
132                   other_threads[i]->th.th_local.reduce_data);
133         OMPT_REDUCTION_END;
134         ANNOTATE_REDUCE_BEFORE(reduce);
135         ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
136       }
137     }
138     // Don't have to worry about sleep bit here or atomic since team setting
139     team_bar->b_arrived = new_state;
140     KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) set team %d "
141                   "arrived(%p) = %llu\n",
142                   gtid, team->t.t_id, tid, team->t.t_id, &team_bar->b_arrived,
143                   new_state));
144   }
145   KA_TRACE(
146       20,
147       ("__kmp_linear_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
148        gtid, team->t.t_id, tid, bt));
149   return false;
150 }
151 
152 template <bool cancellable = false>
153 static bool __kmp_linear_barrier_release_template(
154     enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
155     int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
156   KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_linear_release);
157   kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
158   kmp_team_t *team;
159 
160   if (KMP_MASTER_TID(tid)) {
161     unsigned int i;
162     kmp_uint32 nproc = this_thr->th.th_team_nproc;
163     kmp_info_t **other_threads;
164 
165     team = __kmp_threads[gtid]->th.th_team;
166     KMP_DEBUG_ASSERT(team != NULL);
167     other_threads = team->t.t_threads;
168 
169     KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d(%d:%d) master enter for "
170                   "barrier type %d\n",
171                   gtid, team->t.t_id, tid, bt));
172 
173     if (nproc > 1) {
174 #if KMP_BARRIER_ICV_PUSH
175       {
176         KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(USER_icv_copy);
177         if (propagate_icvs) {
178           ngo_load(&team->t.t_implicit_task_taskdata[0].td_icvs);
179           for (i = 1; i < nproc; ++i) {
180             __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[i],
181                                      team, i, FALSE);
182             ngo_store_icvs(&team->t.t_implicit_task_taskdata[i].td_icvs,
183                            &team->t.t_implicit_task_taskdata[0].td_icvs);
184           }
185           ngo_sync();
186         }
187       }
188 #endif // KMP_BARRIER_ICV_PUSH
189 
190       // Now, release all of the worker threads
191       for (i = 1; i < nproc; ++i) {
192 #if KMP_CACHE_MANAGE
193         // Prefetch next thread's go flag
194         if (i + 1 < nproc)
195           KMP_CACHE_PREFETCH(&other_threads[i + 1]->th.th_bar[bt].bb.b_go);
196 #endif /* KMP_CACHE_MANAGE */
197         KA_TRACE(
198             20,
199             ("__kmp_linear_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%d) "
200              "go(%p): %u => %u\n",
201              gtid, team->t.t_id, tid, other_threads[i]->th.th_info.ds.ds_gtid,
202              team->t.t_id, i, &other_threads[i]->th.th_bar[bt].bb.b_go,
203              other_threads[i]->th.th_bar[bt].bb.b_go,
204              other_threads[i]->th.th_bar[bt].bb.b_go + KMP_BARRIER_STATE_BUMP));
205         ANNOTATE_BARRIER_BEGIN(other_threads[i]);
206         kmp_flag_64 flag(&other_threads[i]->th.th_bar[bt].bb.b_go,
207                          other_threads[i]);
208         flag.release();
209       }
210     }
211   } else { // Wait for the MASTER thread to release us
212     KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d wait go(%p) == %u\n",
213                   gtid, &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
214     kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
215     if (cancellable) {
216       bool cancelled = flag.wait_cancellable_nosleep(
217           this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
218       if (cancelled) {
219         return true;
220       }
221     } else {
222       flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
223     }
224     ANNOTATE_BARRIER_END(this_thr);
225 #if USE_ITT_BUILD && USE_ITT_NOTIFY
226     if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
227       // In a fork barrier; cannot get the object reliably (or ITTNOTIFY is
228       // disabled)
229       itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
230       // Cancel wait on previous parallel region...
231       __kmp_itt_task_starting(itt_sync_obj);
232 
233       if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
234         return false;
235 
236       itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
237       if (itt_sync_obj != NULL)
238         // Call prepare as early as possible for "new" barrier
239         __kmp_itt_task_finished(itt_sync_obj);
240     } else
241 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
242         // Early exit for reaping threads releasing forkjoin barrier
243         if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
244       return false;
245 // The worker thread may now assume that the team is valid.
246 #ifdef KMP_DEBUG
247     tid = __kmp_tid_from_gtid(gtid);
248     team = __kmp_threads[gtid]->th.th_team;
249 #endif
250     KMP_DEBUG_ASSERT(team != NULL);
251     TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
252     KA_TRACE(20,
253              ("__kmp_linear_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
254               gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
255     KMP_MB(); // Flush all pending memory write invalidates.
256   }
257   KA_TRACE(
258       20,
259       ("__kmp_linear_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
260        gtid, team->t.t_id, tid, bt));
261   return false;
262 }
263 
264 static void __kmp_linear_barrier_gather(
265     enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
266     void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
267   __kmp_linear_barrier_gather_template<false>(
268       bt, this_thr, gtid, tid, reduce USE_ITT_BUILD_ARG(itt_sync_obj));
269 }
270 
271 static bool __kmp_linear_barrier_gather_cancellable(
272     enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
273     void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
274   return __kmp_linear_barrier_gather_template<true>(
275       bt, this_thr, gtid, tid, reduce USE_ITT_BUILD_ARG(itt_sync_obj));
276 }
277 
278 static void __kmp_linear_barrier_release(
279     enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
280     int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
281   __kmp_linear_barrier_release_template<false>(
282       bt, this_thr, gtid, tid, propagate_icvs USE_ITT_BUILD_ARG(itt_sync_obj));
283 }
284 
285 static bool __kmp_linear_barrier_release_cancellable(
286     enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
287     int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
288   return __kmp_linear_barrier_release_template<true>(
289       bt, this_thr, gtid, tid, propagate_icvs USE_ITT_BUILD_ARG(itt_sync_obj));
290 }
291 
292 // Tree barrier
293 static void
294 __kmp_tree_barrier_gather(enum barrier_type bt, kmp_info_t *this_thr, int gtid,
295                           int tid, void (*reduce)(void *, void *)
296                                        USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
297   KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_tree_gather);
298   kmp_team_t *team = this_thr->th.th_team;
299   kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
300   kmp_info_t **other_threads = team->t.t_threads;
301   kmp_uint32 nproc = this_thr->th.th_team_nproc;
302   kmp_uint32 branch_bits = __kmp_barrier_gather_branch_bits[bt];
303   kmp_uint32 branch_factor = 1 << branch_bits;
304   kmp_uint32 child;
305   kmp_uint32 child_tid;
306   kmp_uint64 new_state;
307 
308   KA_TRACE(
309       20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
310            gtid, team->t.t_id, tid, bt));
311   KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
312 
313 #if USE_ITT_BUILD && USE_ITT_NOTIFY
314   // Barrier imbalance - save arrive time to the thread
315   if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
316     this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time =
317         __itt_get_timestamp();
318   }
319 #endif
320   // Perform tree gather to wait until all threads have arrived; reduce any
321   // required data as we go
322   child_tid = (tid << branch_bits) + 1;
323   if (child_tid < nproc) {
324     // Parent threads wait for all their children to arrive
325     new_state = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
326     child = 1;
327     do {
328       kmp_info_t *child_thr = other_threads[child_tid];
329       kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
330 #if KMP_CACHE_MANAGE
331       // Prefetch next thread's arrived count
332       if (child + 1 <= branch_factor && child_tid + 1 < nproc)
333         KMP_CACHE_PREFETCH(
334             &other_threads[child_tid + 1]->th.th_bar[bt].bb.b_arrived);
335 #endif /* KMP_CACHE_MANAGE */
336       KA_TRACE(20,
337                ("__kmp_tree_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%u) "
338                 "arrived(%p) == %llu\n",
339                 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
340                 team->t.t_id, child_tid, &child_bar->b_arrived, new_state));
341       // Wait for child to arrive
342       kmp_flag_64 flag(&child_bar->b_arrived, new_state);
343       flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
344       ANNOTATE_BARRIER_END(child_thr);
345 #if USE_ITT_BUILD && USE_ITT_NOTIFY
346       // Barrier imbalance - write min of the thread time and a child time to
347       // the thread.
348       if (__kmp_forkjoin_frames_mode == 2) {
349         this_thr->th.th_bar_min_time = KMP_MIN(this_thr->th.th_bar_min_time,
350                                                child_thr->th.th_bar_min_time);
351       }
352 #endif
353       if (reduce) {
354         KA_TRACE(100,
355                  ("__kmp_tree_barrier_gather: T#%d(%d:%d) += T#%d(%d:%u)\n",
356                   gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
357                   team->t.t_id, child_tid));
358         ANNOTATE_REDUCE_AFTER(reduce);
359         OMPT_REDUCTION_DECL(this_thr, gtid);
360         OMPT_REDUCTION_BEGIN;
361         (*reduce)(this_thr->th.th_local.reduce_data,
362                   child_thr->th.th_local.reduce_data);
363         OMPT_REDUCTION_END;
364         ANNOTATE_REDUCE_BEFORE(reduce);
365         ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
366       }
367       child++;
368       child_tid++;
369     } while (child <= branch_factor && child_tid < nproc);
370   }
371 
372   if (!KMP_MASTER_TID(tid)) { // Worker threads
373     kmp_int32 parent_tid = (tid - 1) >> branch_bits;
374 
375     KA_TRACE(20,
376              ("__kmp_tree_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) "
377               "arrived(%p): %llu => %llu\n",
378               gtid, team->t.t_id, tid, __kmp_gtid_from_tid(parent_tid, team),
379               team->t.t_id, parent_tid, &thr_bar->b_arrived, thr_bar->b_arrived,
380               thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
381 
382     // Mark arrival to parent thread
383     /* After performing this write, a worker thread may not assume that the team
384        is valid any more - it could be deallocated by the master thread at any
385        time.  */
386     ANNOTATE_BARRIER_BEGIN(this_thr);
387     kmp_flag_64 flag(&thr_bar->b_arrived, other_threads[parent_tid]);
388     flag.release();
389   } else {
390     // Need to update the team arrived pointer if we are the master thread
391     if (nproc > 1) // New value was already computed above
392       team->t.t_bar[bt].b_arrived = new_state;
393     else
394       team->t.t_bar[bt].b_arrived += KMP_BARRIER_STATE_BUMP;
395     KA_TRACE(20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) set team %d "
396                   "arrived(%p) = %llu\n",
397                   gtid, team->t.t_id, tid, team->t.t_id,
398                   &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
399   }
400   KA_TRACE(20,
401            ("__kmp_tree_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
402             gtid, team->t.t_id, tid, bt));
403 }
404 
405 static void __kmp_tree_barrier_release(
406     enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
407     int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
408   KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_tree_release);
409   kmp_team_t *team;
410   kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
411   kmp_uint32 nproc;
412   kmp_uint32 branch_bits = __kmp_barrier_release_branch_bits[bt];
413   kmp_uint32 branch_factor = 1 << branch_bits;
414   kmp_uint32 child;
415   kmp_uint32 child_tid;
416 
417   // Perform a tree release for all of the threads that have been gathered
418   if (!KMP_MASTER_TID(
419           tid)) { // Handle fork barrier workers who aren't part of a team yet
420     KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d wait go(%p) == %u\n", gtid,
421                   &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
422     // Wait for parent thread to release us
423     kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
424     flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
425     ANNOTATE_BARRIER_END(this_thr);
426 #if USE_ITT_BUILD && USE_ITT_NOTIFY
427     if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
428       // In fork barrier where we could not get the object reliably (or
429       // ITTNOTIFY is disabled)
430       itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
431       // Cancel wait on previous parallel region...
432       __kmp_itt_task_starting(itt_sync_obj);
433 
434       if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
435         return;
436 
437       itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
438       if (itt_sync_obj != NULL)
439         // Call prepare as early as possible for "new" barrier
440         __kmp_itt_task_finished(itt_sync_obj);
441     } else
442 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
443         // Early exit for reaping threads releasing forkjoin barrier
444         if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
445       return;
446 
447     // The worker thread may now assume that the team is valid.
448     team = __kmp_threads[gtid]->th.th_team;
449     KMP_DEBUG_ASSERT(team != NULL);
450     tid = __kmp_tid_from_gtid(gtid);
451 
452     TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
453     KA_TRACE(20,
454              ("__kmp_tree_barrier_release: T#%d(%d:%d) set go(%p) = %u\n", gtid,
455               team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
456     KMP_MB(); // Flush all pending memory write invalidates.
457   } else {
458     team = __kmp_threads[gtid]->th.th_team;
459     KMP_DEBUG_ASSERT(team != NULL);
460     KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d(%d:%d) master enter for "
461                   "barrier type %d\n",
462                   gtid, team->t.t_id, tid, bt));
463   }
464   nproc = this_thr->th.th_team_nproc;
465   child_tid = (tid << branch_bits) + 1;
466 
467   if (child_tid < nproc) {
468     kmp_info_t **other_threads = team->t.t_threads;
469     child = 1;
470     // Parent threads release all their children
471     do {
472       kmp_info_t *child_thr = other_threads[child_tid];
473       kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
474 #if KMP_CACHE_MANAGE
475       // Prefetch next thread's go count
476       if (child + 1 <= branch_factor && child_tid + 1 < nproc)
477         KMP_CACHE_PREFETCH(
478             &other_threads[child_tid + 1]->th.th_bar[bt].bb.b_go);
479 #endif /* KMP_CACHE_MANAGE */
480 
481 #if KMP_BARRIER_ICV_PUSH
482       {
483         KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(USER_icv_copy);
484         if (propagate_icvs) {
485           __kmp_init_implicit_task(team->t.t_ident,
486                                    team->t.t_threads[child_tid], team,
487                                    child_tid, FALSE);
488           copy_icvs(&team->t.t_implicit_task_taskdata[child_tid].td_icvs,
489                     &team->t.t_implicit_task_taskdata[0].td_icvs);
490         }
491       }
492 #endif // KMP_BARRIER_ICV_PUSH
493       KA_TRACE(20,
494                ("__kmp_tree_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%u)"
495                 "go(%p): %u => %u\n",
496                 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
497                 team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
498                 child_bar->b_go + KMP_BARRIER_STATE_BUMP));
499       // Release child from barrier
500       ANNOTATE_BARRIER_BEGIN(child_thr);
501       kmp_flag_64 flag(&child_bar->b_go, child_thr);
502       flag.release();
503       child++;
504       child_tid++;
505     } while (child <= branch_factor && child_tid < nproc);
506   }
507   KA_TRACE(
508       20, ("__kmp_tree_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
509            gtid, team->t.t_id, tid, bt));
510 }
511 
512 // Hyper Barrier
513 static void
514 __kmp_hyper_barrier_gather(enum barrier_type bt, kmp_info_t *this_thr, int gtid,
515                            int tid, void (*reduce)(void *, void *)
516                                         USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
517   KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hyper_gather);
518   kmp_team_t *team = this_thr->th.th_team;
519   kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
520   kmp_info_t **other_threads = team->t.t_threads;
521   kmp_uint64 new_state = KMP_BARRIER_UNUSED_STATE;
522   kmp_uint32 num_threads = this_thr->th.th_team_nproc;
523   kmp_uint32 branch_bits = __kmp_barrier_gather_branch_bits[bt];
524   kmp_uint32 branch_factor = 1 << branch_bits;
525   kmp_uint32 offset;
526   kmp_uint32 level;
527 
528   KA_TRACE(
529       20,
530       ("__kmp_hyper_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
531        gtid, team->t.t_id, tid, bt));
532   KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
533 
534 #if USE_ITT_BUILD && USE_ITT_NOTIFY
535   // Barrier imbalance - save arrive time to the thread
536   if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
537     this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time =
538         __itt_get_timestamp();
539   }
540 #endif
541   /* Perform a hypercube-embedded tree gather to wait until all of the threads
542      have arrived, and reduce any required data as we go.  */
543   kmp_flag_64 p_flag(&thr_bar->b_arrived);
544   for (level = 0, offset = 1; offset < num_threads;
545        level += branch_bits, offset <<= branch_bits) {
546     kmp_uint32 child;
547     kmp_uint32 child_tid;
548 
549     if (((tid >> level) & (branch_factor - 1)) != 0) {
550       kmp_int32 parent_tid = tid & ~((1 << (level + branch_bits)) - 1);
551 
552       KA_TRACE(20,
553                ("__kmp_hyper_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) "
554                 "arrived(%p): %llu => %llu\n",
555                 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(parent_tid, team),
556                 team->t.t_id, parent_tid, &thr_bar->b_arrived,
557                 thr_bar->b_arrived,
558                 thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
559       // Mark arrival to parent thread
560       /* After performing this write (in the last iteration of the enclosing for
561          loop), a worker thread may not assume that the team is valid any more
562          - it could be deallocated by the master thread at any time.  */
563       ANNOTATE_BARRIER_BEGIN(this_thr);
564       p_flag.set_waiter(other_threads[parent_tid]);
565       p_flag.release();
566       break;
567     }
568 
569     // Parent threads wait for children to arrive
570     if (new_state == KMP_BARRIER_UNUSED_STATE)
571       new_state = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
572     for (child = 1, child_tid = tid + (1 << level);
573          child < branch_factor && child_tid < num_threads;
574          child++, child_tid += (1 << level)) {
575       kmp_info_t *child_thr = other_threads[child_tid];
576       kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
577 #if KMP_CACHE_MANAGE
578       kmp_uint32 next_child_tid = child_tid + (1 << level);
579       // Prefetch next thread's arrived count
580       if (child + 1 < branch_factor && next_child_tid < num_threads)
581         KMP_CACHE_PREFETCH(
582             &other_threads[next_child_tid]->th.th_bar[bt].bb.b_arrived);
583 #endif /* KMP_CACHE_MANAGE */
584       KA_TRACE(20,
585                ("__kmp_hyper_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%u) "
586                 "arrived(%p) == %llu\n",
587                 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
588                 team->t.t_id, child_tid, &child_bar->b_arrived, new_state));
589       // Wait for child to arrive
590       kmp_flag_64 c_flag(&child_bar->b_arrived, new_state);
591       c_flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
592       ANNOTATE_BARRIER_END(child_thr);
593 #if USE_ITT_BUILD && USE_ITT_NOTIFY
594       // Barrier imbalance - write min of the thread time and a child time to
595       // the thread.
596       if (__kmp_forkjoin_frames_mode == 2) {
597         this_thr->th.th_bar_min_time = KMP_MIN(this_thr->th.th_bar_min_time,
598                                                child_thr->th.th_bar_min_time);
599       }
600 #endif
601       if (reduce) {
602         KA_TRACE(100,
603                  ("__kmp_hyper_barrier_gather: T#%d(%d:%d) += T#%d(%d:%u)\n",
604                   gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
605                   team->t.t_id, child_tid));
606         ANNOTATE_REDUCE_AFTER(reduce);
607         OMPT_REDUCTION_DECL(this_thr, gtid);
608         OMPT_REDUCTION_BEGIN;
609         (*reduce)(this_thr->th.th_local.reduce_data,
610                   child_thr->th.th_local.reduce_data);
611         OMPT_REDUCTION_END;
612         ANNOTATE_REDUCE_BEFORE(reduce);
613         ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
614       }
615     }
616   }
617 
618   if (KMP_MASTER_TID(tid)) {
619     // Need to update the team arrived pointer if we are the master thread
620     if (new_state == KMP_BARRIER_UNUSED_STATE)
621       team->t.t_bar[bt].b_arrived += KMP_BARRIER_STATE_BUMP;
622     else
623       team->t.t_bar[bt].b_arrived = new_state;
624     KA_TRACE(20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) set team %d "
625                   "arrived(%p) = %llu\n",
626                   gtid, team->t.t_id, tid, team->t.t_id,
627                   &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
628   }
629   KA_TRACE(
630       20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
631            gtid, team->t.t_id, tid, bt));
632 }
633 
634 // The reverse versions seem to beat the forward versions overall
635 #define KMP_REVERSE_HYPER_BAR
636 static void __kmp_hyper_barrier_release(
637     enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
638     int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
639   KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hyper_release);
640   kmp_team_t *team;
641   kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
642   kmp_info_t **other_threads;
643   kmp_uint32 num_threads;
644   kmp_uint32 branch_bits = __kmp_barrier_release_branch_bits[bt];
645   kmp_uint32 branch_factor = 1 << branch_bits;
646   kmp_uint32 child;
647   kmp_uint32 child_tid;
648   kmp_uint32 offset;
649   kmp_uint32 level;
650 
651   /* Perform a hypercube-embedded tree release for all of the threads that have
652      been gathered. If KMP_REVERSE_HYPER_BAR is defined (default) the threads
653      are released in the reverse order of the corresponding gather, otherwise
654      threads are released in the same order. */
655   if (KMP_MASTER_TID(tid)) { // master
656     team = __kmp_threads[gtid]->th.th_team;
657     KMP_DEBUG_ASSERT(team != NULL);
658     KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d(%d:%d) master enter for "
659                   "barrier type %d\n",
660                   gtid, team->t.t_id, tid, bt));
661 #if KMP_BARRIER_ICV_PUSH
662     if (propagate_icvs) { // master already has ICVs in final destination; copy
663       copy_icvs(&thr_bar->th_fixed_icvs,
664                 &team->t.t_implicit_task_taskdata[tid].td_icvs);
665     }
666 #endif
667   } else { // Handle fork barrier workers who aren't part of a team yet
668     KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d wait go(%p) == %u\n", gtid,
669                   &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
670     // Wait for parent thread to release us
671     kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
672     flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
673     ANNOTATE_BARRIER_END(this_thr);
674 #if USE_ITT_BUILD && USE_ITT_NOTIFY
675     if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
676       // In fork barrier where we could not get the object reliably
677       itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
678       // Cancel wait on previous parallel region...
679       __kmp_itt_task_starting(itt_sync_obj);
680 
681       if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
682         return;
683 
684       itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
685       if (itt_sync_obj != NULL)
686         // Call prepare as early as possible for "new" barrier
687         __kmp_itt_task_finished(itt_sync_obj);
688     } else
689 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
690         // Early exit for reaping threads releasing forkjoin barrier
691         if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
692       return;
693 
694     // The worker thread may now assume that the team is valid.
695     team = __kmp_threads[gtid]->th.th_team;
696     KMP_DEBUG_ASSERT(team != NULL);
697     tid = __kmp_tid_from_gtid(gtid);
698 
699     TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
700     KA_TRACE(20,
701              ("__kmp_hyper_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
702               gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
703     KMP_MB(); // Flush all pending memory write invalidates.
704   }
705   num_threads = this_thr->th.th_team_nproc;
706   other_threads = team->t.t_threads;
707 
708 #ifdef KMP_REVERSE_HYPER_BAR
709   // Count up to correct level for parent
710   for (level = 0, offset = 1;
711        offset < num_threads && (((tid >> level) & (branch_factor - 1)) == 0);
712        level += branch_bits, offset <<= branch_bits)
713     ;
714 
715   // Now go down from there
716   for (level -= branch_bits, offset >>= branch_bits; offset != 0;
717        level -= branch_bits, offset >>= branch_bits)
718 #else
719   // Go down the tree, level by level
720   for (level = 0, offset = 1; offset < num_threads;
721        level += branch_bits, offset <<= branch_bits)
722 #endif // KMP_REVERSE_HYPER_BAR
723   {
724 #ifdef KMP_REVERSE_HYPER_BAR
725     /* Now go in reverse order through the children, highest to lowest.
726        Initial setting of child is conservative here. */
727     child = num_threads >> ((level == 0) ? level : level - 1);
728     for (child = (child < branch_factor - 1) ? child : branch_factor - 1,
729         child_tid = tid + (child << level);
730          child >= 1; child--, child_tid -= (1 << level))
731 #else
732     if (((tid >> level) & (branch_factor - 1)) != 0)
733       // No need to go lower than this, since this is the level parent would be
734       // notified
735       break;
736     // Iterate through children on this level of the tree
737     for (child = 1, child_tid = tid + (1 << level);
738          child < branch_factor && child_tid < num_threads;
739          child++, child_tid += (1 << level))
740 #endif // KMP_REVERSE_HYPER_BAR
741     {
742       if (child_tid >= num_threads)
743         continue; // Child doesn't exist so keep going
744       else {
745         kmp_info_t *child_thr = other_threads[child_tid];
746         kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
747 #if KMP_CACHE_MANAGE
748         kmp_uint32 next_child_tid = child_tid - (1 << level);
749 // Prefetch next thread's go count
750 #ifdef KMP_REVERSE_HYPER_BAR
751         if (child - 1 >= 1 && next_child_tid < num_threads)
752 #else
753         if (child + 1 < branch_factor && next_child_tid < num_threads)
754 #endif // KMP_REVERSE_HYPER_BAR
755           KMP_CACHE_PREFETCH(
756               &other_threads[next_child_tid]->th.th_bar[bt].bb.b_go);
757 #endif /* KMP_CACHE_MANAGE */
758 
759 #if KMP_BARRIER_ICV_PUSH
760         if (propagate_icvs) // push my fixed ICVs to my child
761           copy_icvs(&child_bar->th_fixed_icvs, &thr_bar->th_fixed_icvs);
762 #endif // KMP_BARRIER_ICV_PUSH
763 
764         KA_TRACE(
765             20,
766             ("__kmp_hyper_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%u)"
767              "go(%p): %u => %u\n",
768              gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
769              team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
770              child_bar->b_go + KMP_BARRIER_STATE_BUMP));
771         // Release child from barrier
772         ANNOTATE_BARRIER_BEGIN(child_thr);
773         kmp_flag_64 flag(&child_bar->b_go, child_thr);
774         flag.release();
775       }
776     }
777   }
778 #if KMP_BARRIER_ICV_PUSH
779   if (propagate_icvs &&
780       !KMP_MASTER_TID(tid)) { // copy ICVs locally to final dest
781     __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team, tid,
782                              FALSE);
783     copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
784               &thr_bar->th_fixed_icvs);
785   }
786 #endif
787   KA_TRACE(
788       20,
789       ("__kmp_hyper_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
790        gtid, team->t.t_id, tid, bt));
791 }
792 
793 // Hierarchical Barrier
794 
795 // Initialize thread barrier data
796 /* Initializes/re-initializes the hierarchical barrier data stored on a thread.
797    Performs the minimum amount of initialization required based on how the team
798    has changed. Returns true if leaf children will require both on-core and
799    traditional wake-up mechanisms. For example, if the team size increases,
800    threads already in the team will respond to on-core wakeup on their parent
801    thread, but threads newly added to the team will only be listening on the
802    their local b_go. */
803 static bool __kmp_init_hierarchical_barrier_thread(enum barrier_type bt,
804                                                    kmp_bstate_t *thr_bar,
805                                                    kmp_uint32 nproc, int gtid,
806                                                    int tid, kmp_team_t *team) {
807   // Checks to determine if (re-)initialization is needed
808   bool uninitialized = thr_bar->team == NULL;
809   bool team_changed = team != thr_bar->team;
810   bool team_sz_changed = nproc != thr_bar->nproc;
811   bool tid_changed = tid != thr_bar->old_tid;
812   bool retval = false;
813 
814   if (uninitialized || team_sz_changed) {
815     __kmp_get_hierarchy(nproc, thr_bar);
816   }
817 
818   if (uninitialized || team_sz_changed || tid_changed) {
819     thr_bar->my_level = thr_bar->depth - 1; // default for master
820     thr_bar->parent_tid = -1; // default for master
821     if (!KMP_MASTER_TID(
822             tid)) { // if not master, find parent thread in hierarchy
823       kmp_uint32 d = 0;
824       while (d < thr_bar->depth) { // find parent based on level of thread in
825         // hierarchy, and note level
826         kmp_uint32 rem;
827         if (d == thr_bar->depth - 2) { // reached level right below the master
828           thr_bar->parent_tid = 0;
829           thr_bar->my_level = d;
830           break;
831         } else if ((rem = tid % thr_bar->skip_per_level[d + 1]) !=
832                    0) { // TODO: can we make this op faster?
833           // thread is not a subtree root at next level, so this is max
834           thr_bar->parent_tid = tid - rem;
835           thr_bar->my_level = d;
836           break;
837         }
838         ++d;
839       }
840     }
841     thr_bar->offset = 7 - (tid - thr_bar->parent_tid - 1);
842     thr_bar->old_tid = tid;
843     thr_bar->wait_flag = KMP_BARRIER_NOT_WAITING;
844     thr_bar->team = team;
845     thr_bar->parent_bar =
846         &team->t.t_threads[thr_bar->parent_tid]->th.th_bar[bt].bb;
847   }
848   if (uninitialized || team_changed || tid_changed) {
849     thr_bar->team = team;
850     thr_bar->parent_bar =
851         &team->t.t_threads[thr_bar->parent_tid]->th.th_bar[bt].bb;
852     retval = true;
853   }
854   if (uninitialized || team_sz_changed || tid_changed) {
855     thr_bar->nproc = nproc;
856     thr_bar->leaf_kids = thr_bar->base_leaf_kids;
857     if (thr_bar->my_level == 0)
858       thr_bar->leaf_kids = 0;
859     if (thr_bar->leaf_kids && (kmp_uint32)tid + thr_bar->leaf_kids + 1 > nproc)
860       thr_bar->leaf_kids = nproc - tid - 1;
861     thr_bar->leaf_state = 0;
862     for (int i = 0; i < thr_bar->leaf_kids; ++i)
863       ((char *)&(thr_bar->leaf_state))[7 - i] = 1;
864   }
865   return retval;
866 }
867 
868 static void __kmp_hierarchical_barrier_gather(
869     enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
870     void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
871   KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hier_gather);
872   kmp_team_t *team = this_thr->th.th_team;
873   kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
874   kmp_uint32 nproc = this_thr->th.th_team_nproc;
875   kmp_info_t **other_threads = team->t.t_threads;
876   kmp_uint64 new_state;
877 
878   int level = team->t.t_level;
879   if (other_threads[0]
880           ->th.th_teams_microtask) // are we inside the teams construct?
881     if (this_thr->th.th_teams_size.nteams > 1)
882       ++level; // level was not increased in teams construct for team_of_masters
883   if (level == 1)
884     thr_bar->use_oncore_barrier = 1;
885   else
886     thr_bar->use_oncore_barrier = 0; // Do not use oncore barrier when nested
887 
888   KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) enter for "
889                 "barrier type %d\n",
890                 gtid, team->t.t_id, tid, bt));
891   KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
892 
893 #if USE_ITT_BUILD && USE_ITT_NOTIFY
894   // Barrier imbalance - save arrive time to the thread
895   if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
896     this_thr->th.th_bar_arrive_time = __itt_get_timestamp();
897   }
898 #endif
899 
900   (void)__kmp_init_hierarchical_barrier_thread(bt, thr_bar, nproc, gtid, tid,
901                                                team);
902 
903   if (thr_bar->my_level) { // not a leaf (my_level==0 means leaf)
904     kmp_int32 child_tid;
905     new_state =
906         (kmp_uint64)team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
907     if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME &&
908         thr_bar->use_oncore_barrier) {
909       if (thr_bar->leaf_kids) {
910         // First, wait for leaf children to check-in on my b_arrived flag
911         kmp_uint64 leaf_state =
912             KMP_MASTER_TID(tid)
913                 ? thr_bar->b_arrived | thr_bar->leaf_state
914                 : team->t.t_bar[bt].b_arrived | thr_bar->leaf_state;
915         KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) waiting "
916                       "for leaf kids\n",
917                       gtid, team->t.t_id, tid));
918         kmp_flag_64 flag(&thr_bar->b_arrived, leaf_state);
919         flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
920         if (reduce) {
921           ANNOTATE_REDUCE_AFTER(reduce);
922           OMPT_REDUCTION_DECL(this_thr, gtid);
923           OMPT_REDUCTION_BEGIN;
924           for (child_tid = tid + 1; child_tid <= tid + thr_bar->leaf_kids;
925                ++child_tid) {
926             KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += "
927                            "T#%d(%d:%d)\n",
928                            gtid, team->t.t_id, tid,
929                            __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
930                            child_tid));
931             ANNOTATE_BARRIER_END(other_threads[child_tid]);
932             (*reduce)(this_thr->th.th_local.reduce_data,
933                       other_threads[child_tid]->th.th_local.reduce_data);
934           }
935           OMPT_REDUCTION_END;
936           ANNOTATE_REDUCE_BEFORE(reduce);
937           ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
938         }
939         // clear leaf_state bits
940         KMP_TEST_THEN_AND64(&thr_bar->b_arrived, ~(thr_bar->leaf_state));
941       }
942       // Next, wait for higher level children on each child's b_arrived flag
943       for (kmp_uint32 d = 1; d < thr_bar->my_level;
944            ++d) { // gather lowest level threads first, but skip 0
945         kmp_uint32 last = tid + thr_bar->skip_per_level[d + 1],
946                    skip = thr_bar->skip_per_level[d];
947         if (last > nproc)
948           last = nproc;
949         for (child_tid = tid + skip; child_tid < (int)last; child_tid += skip) {
950           kmp_info_t *child_thr = other_threads[child_tid];
951           kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
952           KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) wait "
953                         "T#%d(%d:%d) "
954                         "arrived(%p) == %llu\n",
955                         gtid, team->t.t_id, tid,
956                         __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
957                         child_tid, &child_bar->b_arrived, new_state));
958           kmp_flag_64 flag(&child_bar->b_arrived, new_state);
959           flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
960           ANNOTATE_BARRIER_END(child_thr);
961           if (reduce) {
962             KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += "
963                            "T#%d(%d:%d)\n",
964                            gtid, team->t.t_id, tid,
965                            __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
966                            child_tid));
967             ANNOTATE_REDUCE_AFTER(reduce);
968             (*reduce)(this_thr->th.th_local.reduce_data,
969                       child_thr->th.th_local.reduce_data);
970             ANNOTATE_REDUCE_BEFORE(reduce);
971             ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
972           }
973         }
974       }
975     } else { // Blocktime is not infinite
976       for (kmp_uint32 d = 0; d < thr_bar->my_level;
977            ++d) { // Gather lowest level threads first
978         kmp_uint32 last = tid + thr_bar->skip_per_level[d + 1],
979                    skip = thr_bar->skip_per_level[d];
980         if (last > nproc)
981           last = nproc;
982         for (child_tid = tid + skip; child_tid < (int)last; child_tid += skip) {
983           kmp_info_t *child_thr = other_threads[child_tid];
984           kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
985           KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) wait "
986                         "T#%d(%d:%d) "
987                         "arrived(%p) == %llu\n",
988                         gtid, team->t.t_id, tid,
989                         __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
990                         child_tid, &child_bar->b_arrived, new_state));
991           kmp_flag_64 flag(&child_bar->b_arrived, new_state);
992           flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
993           ANNOTATE_BARRIER_END(child_thr);
994           if (reduce) {
995             KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += "
996                            "T#%d(%d:%d)\n",
997                            gtid, team->t.t_id, tid,
998                            __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
999                            child_tid));
1000             ANNOTATE_REDUCE_AFTER(reduce);
1001             (*reduce)(this_thr->th.th_local.reduce_data,
1002                       child_thr->th.th_local.reduce_data);
1003             ANNOTATE_REDUCE_BEFORE(reduce);
1004             ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
1005           }
1006         }
1007       }
1008     }
1009   }
1010   // All subordinates are gathered; now release parent if not master thread
1011 
1012   if (!KMP_MASTER_TID(tid)) { // worker threads release parent in hierarchy
1013     KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) releasing"
1014                   " T#%d(%d:%d) arrived(%p): %llu => %llu\n",
1015                   gtid, team->t.t_id, tid,
1016                   __kmp_gtid_from_tid(thr_bar->parent_tid, team), team->t.t_id,
1017                   thr_bar->parent_tid, &thr_bar->b_arrived, thr_bar->b_arrived,
1018                   thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
1019     /* Mark arrival to parent: After performing this write, a worker thread may
1020        not assume that the team is valid any more - it could be deallocated by
1021        the master thread at any time. */
1022     if (thr_bar->my_level || __kmp_dflt_blocktime != KMP_MAX_BLOCKTIME ||
1023         !thr_bar->use_oncore_barrier) { // Parent is waiting on my b_arrived
1024       // flag; release it
1025       ANNOTATE_BARRIER_BEGIN(this_thr);
1026       kmp_flag_64 flag(&thr_bar->b_arrived, other_threads[thr_bar->parent_tid]);
1027       flag.release();
1028     } else {
1029       // Leaf does special release on "offset" bits of parent's b_arrived flag
1030       thr_bar->b_arrived = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
1031       kmp_flag_oncore flag(&thr_bar->parent_bar->b_arrived, thr_bar->offset);
1032       flag.set_waiter(other_threads[thr_bar->parent_tid]);
1033       flag.release();
1034     }
1035   } else { // Master thread needs to update the team's b_arrived value
1036     team->t.t_bar[bt].b_arrived = new_state;
1037     KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) set team %d "
1038                   "arrived(%p) = %llu\n",
1039                   gtid, team->t.t_id, tid, team->t.t_id,
1040                   &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
1041   }
1042   // Is the team access below unsafe or just technically invalid?
1043   KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) exit for "
1044                 "barrier type %d\n",
1045                 gtid, team->t.t_id, tid, bt));
1046 }
1047 
1048 static void __kmp_hierarchical_barrier_release(
1049     enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
1050     int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
1051   KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hier_release);
1052   kmp_team_t *team;
1053   kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
1054   kmp_uint32 nproc;
1055   bool team_change = false; // indicates on-core barrier shouldn't be used
1056 
1057   if (KMP_MASTER_TID(tid)) {
1058     team = __kmp_threads[gtid]->th.th_team;
1059     KMP_DEBUG_ASSERT(team != NULL);
1060     KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) master "
1061                   "entered barrier type %d\n",
1062                   gtid, team->t.t_id, tid, bt));
1063   } else { // Worker threads
1064     // Wait for parent thread to release me
1065     if (!thr_bar->use_oncore_barrier ||
1066         __kmp_dflt_blocktime != KMP_MAX_BLOCKTIME || thr_bar->my_level != 0 ||
1067         thr_bar->team == NULL) {
1068       // Use traditional method of waiting on my own b_go flag
1069       thr_bar->wait_flag = KMP_BARRIER_OWN_FLAG;
1070       kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
1071       flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
1072       ANNOTATE_BARRIER_END(this_thr);
1073       TCW_8(thr_bar->b_go,
1074             KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
1075     } else { // Thread barrier data is initialized, this is a leaf, blocktime is
1076       // infinite, not nested
1077       // Wait on my "offset" bits on parent's b_go flag
1078       thr_bar->wait_flag = KMP_BARRIER_PARENT_FLAG;
1079       kmp_flag_oncore flag(&thr_bar->parent_bar->b_go, KMP_BARRIER_STATE_BUMP,
1080                            thr_bar->offset, bt,
1081                            this_thr USE_ITT_BUILD_ARG(itt_sync_obj));
1082       flag.wait(this_thr, TRUE);
1083       if (thr_bar->wait_flag ==
1084           KMP_BARRIER_SWITCHING) { // Thread was switched to own b_go
1085         TCW_8(thr_bar->b_go,
1086               KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
1087       } else { // Reset my bits on parent's b_go flag
1088         (RCAST(volatile char *,
1089                &(thr_bar->parent_bar->b_go)))[thr_bar->offset] = 0;
1090       }
1091     }
1092     thr_bar->wait_flag = KMP_BARRIER_NOT_WAITING;
1093     // Early exit for reaping threads releasing forkjoin barrier
1094     if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
1095       return;
1096     // The worker thread may now assume that the team is valid.
1097     team = __kmp_threads[gtid]->th.th_team;
1098     KMP_DEBUG_ASSERT(team != NULL);
1099     tid = __kmp_tid_from_gtid(gtid);
1100 
1101     KA_TRACE(
1102         20,
1103         ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
1104          gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
1105     KMP_MB(); // Flush all pending memory write invalidates.
1106   }
1107 
1108   nproc = this_thr->th.th_team_nproc;
1109   int level = team->t.t_level;
1110   if (team->t.t_threads[0]
1111           ->th.th_teams_microtask) { // are we inside the teams construct?
1112     if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
1113         this_thr->th.th_teams_level == level)
1114       ++level; // level was not increased in teams construct for team_of_workers
1115     if (this_thr->th.th_teams_size.nteams > 1)
1116       ++level; // level was not increased in teams construct for team_of_masters
1117   }
1118   if (level == 1)
1119     thr_bar->use_oncore_barrier = 1;
1120   else
1121     thr_bar->use_oncore_barrier = 0; // Do not use oncore barrier when nested
1122 
1123   // If the team size has increased, we still communicate with old leaves via
1124   // oncore barrier.
1125   unsigned short int old_leaf_kids = thr_bar->leaf_kids;
1126   kmp_uint64 old_leaf_state = thr_bar->leaf_state;
1127   team_change = __kmp_init_hierarchical_barrier_thread(bt, thr_bar, nproc, gtid,
1128                                                        tid, team);
1129   // But if the entire team changes, we won't use oncore barrier at all
1130   if (team_change)
1131     old_leaf_kids = 0;
1132 
1133 #if KMP_BARRIER_ICV_PUSH
1134   if (propagate_icvs) {
1135     __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team, tid,
1136                              FALSE);
1137     if (KMP_MASTER_TID(
1138             tid)) { // master already has copy in final destination; copy
1139       copy_icvs(&thr_bar->th_fixed_icvs,
1140                 &team->t.t_implicit_task_taskdata[tid].td_icvs);
1141     } else if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME &&
1142                thr_bar->use_oncore_barrier) { // optimization for inf blocktime
1143       if (!thr_bar->my_level) // I'm a leaf in the hierarchy (my_level==0)
1144         // leaves (on-core children) pull parent's fixed ICVs directly to local
1145         // ICV store
1146         copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
1147                   &thr_bar->parent_bar->th_fixed_icvs);
1148       // non-leaves will get ICVs piggybacked with b_go via NGO store
1149     } else { // blocktime is not infinite; pull ICVs from parent's fixed ICVs
1150       if (thr_bar->my_level) // not a leaf; copy ICVs to my fixed ICVs child can
1151         // access
1152         copy_icvs(&thr_bar->th_fixed_icvs, &thr_bar->parent_bar->th_fixed_icvs);
1153       else // leaves copy parent's fixed ICVs directly to local ICV store
1154         copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
1155                   &thr_bar->parent_bar->th_fixed_icvs);
1156     }
1157   }
1158 #endif // KMP_BARRIER_ICV_PUSH
1159 
1160   // Now, release my children
1161   if (thr_bar->my_level) { // not a leaf
1162     kmp_int32 child_tid;
1163     kmp_uint32 last;
1164     if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME &&
1165         thr_bar->use_oncore_barrier) {
1166       if (KMP_MASTER_TID(tid)) { // do a flat release
1167         // Set local b_go to bump children via NGO store of the cache line
1168         // containing IVCs and b_go.
1169         thr_bar->b_go = KMP_BARRIER_STATE_BUMP;
1170         // Use ngo stores if available; b_go piggybacks in the last 8 bytes of
1171         // the cache line
1172         ngo_load(&thr_bar->th_fixed_icvs);
1173         // This loops over all the threads skipping only the leaf nodes in the
1174         // hierarchy
1175         for (child_tid = thr_bar->skip_per_level[1]; child_tid < (int)nproc;
1176              child_tid += thr_bar->skip_per_level[1]) {
1177           kmp_bstate_t *child_bar =
1178               &team->t.t_threads[child_tid]->th.th_bar[bt].bb;
1179           KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) "
1180                         "releasing T#%d(%d:%d)"
1181                         " go(%p): %u => %u\n",
1182                         gtid, team->t.t_id, tid,
1183                         __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
1184                         child_tid, &child_bar->b_go, child_bar->b_go,
1185                         child_bar->b_go + KMP_BARRIER_STATE_BUMP));
1186           // Use ngo store (if available) to both store ICVs and release child
1187           // via child's b_go
1188           ngo_store_go(&child_bar->th_fixed_icvs, &thr_bar->th_fixed_icvs);
1189         }
1190         ngo_sync();
1191       }
1192       TCW_8(thr_bar->b_go,
1193             KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
1194       // Now, release leaf children
1195       if (thr_bar->leaf_kids) { // if there are any
1196         // We test team_change on the off-chance that the level 1 team changed.
1197         if (team_change ||
1198             old_leaf_kids < thr_bar->leaf_kids) { // some old, some new
1199           if (old_leaf_kids) { // release old leaf kids
1200             thr_bar->b_go |= old_leaf_state;
1201           }
1202           // Release new leaf kids
1203           last = tid + thr_bar->skip_per_level[1];
1204           if (last > nproc)
1205             last = nproc;
1206           for (child_tid = tid + 1 + old_leaf_kids; child_tid < (int)last;
1207                ++child_tid) { // skip_per_level[0]=1
1208             kmp_info_t *child_thr = team->t.t_threads[child_tid];
1209             kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
1210             KA_TRACE(
1211                 20,
1212                 ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) releasing"
1213                  " T#%d(%d:%d) go(%p): %u => %u\n",
1214                  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
1215                  team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
1216                  child_bar->b_go + KMP_BARRIER_STATE_BUMP));
1217             // Release child using child's b_go flag
1218             ANNOTATE_BARRIER_BEGIN(child_thr);
1219             kmp_flag_64 flag(&child_bar->b_go, child_thr);
1220             flag.release();
1221           }
1222         } else { // Release all children at once with leaf_state bits on my own
1223           // b_go flag
1224           thr_bar->b_go |= thr_bar->leaf_state;
1225         }
1226       }
1227     } else { // Blocktime is not infinite; do a simple hierarchical release
1228       for (int d = thr_bar->my_level - 1; d >= 0;
1229            --d) { // Release highest level threads first
1230         last = tid + thr_bar->skip_per_level[d + 1];
1231         kmp_uint32 skip = thr_bar->skip_per_level[d];
1232         if (last > nproc)
1233           last = nproc;
1234         for (child_tid = tid + skip; child_tid < (int)last; child_tid += skip) {
1235           kmp_info_t *child_thr = team->t.t_threads[child_tid];
1236           kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
1237           KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) "
1238                         "releasing T#%d(%d:%d) go(%p): %u => %u\n",
1239                         gtid, team->t.t_id, tid,
1240                         __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
1241                         child_tid, &child_bar->b_go, child_bar->b_go,
1242                         child_bar->b_go + KMP_BARRIER_STATE_BUMP));
1243           // Release child using child's b_go flag
1244           ANNOTATE_BARRIER_BEGIN(child_thr);
1245           kmp_flag_64 flag(&child_bar->b_go, child_thr);
1246           flag.release();
1247         }
1248       }
1249     }
1250 #if KMP_BARRIER_ICV_PUSH
1251     if (propagate_icvs && !KMP_MASTER_TID(tid))
1252       // non-leaves copy ICVs from fixed ICVs to local dest
1253       copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
1254                 &thr_bar->th_fixed_icvs);
1255 #endif // KMP_BARRIER_ICV_PUSH
1256   }
1257   KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) exit for "
1258                 "barrier type %d\n",
1259                 gtid, team->t.t_id, tid, bt));
1260 }
1261 
1262 // End of Barrier Algorithms
1263 
// Type trait holding the "cancelled" result of a barrier.
// is_cancellable<true> wraps an ordinary, assignable bool.
// is_cancellable<false> stores nothing: assignments are ignored and it always
// converts to the compile-time constant false.
template <bool cancellable> struct is_cancellable {};

template <> struct is_cancellable<true> {
  bool value;
  is_cancellable(bool b) : value(b) {}
  is_cancellable() : is_cancellable(false) {} // starts out "not cancelled"
  operator bool() const { return value; }
  is_cancellable &operator=(bool b) {
    value = b;
    return *this;
  }
};

template <> struct is_cancellable<false> {
  constexpr operator bool() const { return false; }
  // Accepts and discards the assigned value.
  is_cancellable &operator=(bool) { return *this; }
};
1282 
1283 // Internal function to do a barrier.
1284 /* If is_split is true, do a split barrier, otherwise, do a plain barrier
1285    If reduce is non-NULL, do a split reduction barrier, otherwise, do a split
1286    barrier
1287    When cancellable = false,
1288      Returns 0 if master thread, 1 if worker thread.
1289    When cancellable = true
1290      Returns 0 if not cancelled, 1 if cancelled.  */
1291 template <bool cancellable = false>
1292 static int __kmp_barrier_template(enum barrier_type bt, int gtid, int is_split,
1293                                   size_t reduce_size, void *reduce_data,
1294                                   void (*reduce)(void *, void *)) {
1295   KMP_TIME_PARTITIONED_BLOCK(OMP_plain_barrier);
1296   KMP_SET_THREAD_STATE_BLOCK(PLAIN_BARRIER);
1297   int tid = __kmp_tid_from_gtid(gtid);
1298   kmp_info_t *this_thr = __kmp_threads[gtid];
1299   kmp_team_t *team = this_thr->th.th_team;
1300   int status = 0;
1301   is_cancellable<cancellable> cancelled;
1302 #if OMPT_SUPPORT && OMPT_OPTIONAL
1303   ompt_data_t *my_task_data;
1304   ompt_data_t *my_parallel_data;
1305   void *return_address;
1306   ompt_sync_region_t barrier_kind;
1307 #endif
1308 
1309   KA_TRACE(15, ("__kmp_barrier: T#%d(%d:%d) has arrived\n", gtid,
1310                 __kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid)));
1311 
1312   ANNOTATE_BARRIER_BEGIN(&team->t.t_bar);
1313 #if OMPT_SUPPORT
1314   if (ompt_enabled.enabled) {
1315 #if OMPT_OPTIONAL
1316     my_task_data = OMPT_CUR_TASK_DATA(this_thr);
1317     my_parallel_data = OMPT_CUR_TEAM_DATA(this_thr);
1318     return_address = OMPT_LOAD_RETURN_ADDRESS(gtid);
1319     barrier_kind = __ompt_get_barrier_kind(bt, this_thr);
1320     if (ompt_enabled.ompt_callback_sync_region) {
1321       ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
1322           barrier_kind, ompt_scope_begin, my_parallel_data, my_task_data,
1323           return_address);
1324     }
1325     if (ompt_enabled.ompt_callback_sync_region_wait) {
1326       ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
1327           barrier_kind, ompt_scope_begin, my_parallel_data, my_task_data,
1328           return_address);
1329     }
1330 #endif
1331     // It is OK to report the barrier state after the barrier begin callback.
1332     // According to the OMPT specification, a compliant implementation may
1333     // even delay reporting this state until the barrier begins to wait.
1334     this_thr->th.ompt_thread_info.state = ompt_state_wait_barrier;
1335   }
1336 #endif
1337 
1338   if (!team->t.t_serialized) {
1339 #if USE_ITT_BUILD
1340     // This value will be used in itt notify events below.
1341     void *itt_sync_obj = NULL;
1342 #if USE_ITT_NOTIFY
1343     if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1344       itt_sync_obj = __kmp_itt_barrier_object(gtid, bt, 1);
1345 #endif
1346 #endif /* USE_ITT_BUILD */
1347     if (__kmp_tasking_mode == tskm_extra_barrier) {
1348       __kmp_tasking_barrier(team, this_thr, gtid);
1349       KA_TRACE(15,
1350                ("__kmp_barrier: T#%d(%d:%d) past tasking barrier\n", gtid,
1351                 __kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid)));
1352     }
1353 
1354     /* Copy the blocktime info to the thread, where __kmp_wait_template() can
1355        access it when the team struct is not guaranteed to exist. */
1356     // See note about the corresponding code in __kmp_join_barrier() being
1357     // performance-critical.
1358     if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
1359 #if KMP_USE_MONITOR
1360       this_thr->th.th_team_bt_intervals =
1361           team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
1362       this_thr->th.th_team_bt_set =
1363           team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
1364 #else
1365       this_thr->th.th_team_bt_intervals = KMP_BLOCKTIME_INTERVAL(team, tid);
1366 #endif
1367     }
1368 
1369 #if USE_ITT_BUILD
1370     if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1371       __kmp_itt_barrier_starting(gtid, itt_sync_obj);
1372 #endif /* USE_ITT_BUILD */
1373 #if USE_DEBUGGER
1374     // Let the debugger know: the thread arrived to the barrier and waiting.
1375     if (KMP_MASTER_TID(tid)) { // Master counter is stored in team structure.
1376       team->t.t_bar[bt].b_master_arrived += 1;
1377     } else {
1378       this_thr->th.th_bar[bt].bb.b_worker_arrived += 1;
1379     } // if
1380 #endif /* USE_DEBUGGER */
1381     if (reduce != NULL) {
1382       // KMP_DEBUG_ASSERT( is_split == TRUE );  // #C69956
1383       this_thr->th.th_local.reduce_data = reduce_data;
1384     }
1385 
1386     if (KMP_MASTER_TID(tid) && __kmp_tasking_mode != tskm_immediate_exec)
1387       // use 0 to only setup the current team if nthreads > 1
1388       __kmp_task_team_setup(this_thr, team, 0);
1389 
1390     if (cancellable) {
1391       cancelled = __kmp_linear_barrier_gather_cancellable(
1392           bt, this_thr, gtid, tid, reduce USE_ITT_BUILD_ARG(itt_sync_obj));
1393     } else {
1394       switch (__kmp_barrier_gather_pattern[bt]) {
1395       case bp_hyper_bar: {
1396         // don't set branch bits to 0; use linear
1397         KMP_ASSERT(__kmp_barrier_gather_branch_bits[bt]);
1398         __kmp_hyper_barrier_gather(bt, this_thr, gtid, tid,
1399                                    reduce USE_ITT_BUILD_ARG(itt_sync_obj));
1400         break;
1401       }
1402       case bp_hierarchical_bar: {
1403         __kmp_hierarchical_barrier_gather(
1404             bt, this_thr, gtid, tid, reduce USE_ITT_BUILD_ARG(itt_sync_obj));
1405         break;
1406       }
1407       case bp_tree_bar: {
1408         // don't set branch bits to 0; use linear
1409         KMP_ASSERT(__kmp_barrier_gather_branch_bits[bt]);
1410         __kmp_tree_barrier_gather(bt, this_thr, gtid, tid,
1411                                   reduce USE_ITT_BUILD_ARG(itt_sync_obj));
1412         break;
1413       }
1414       default: {
1415         __kmp_linear_barrier_gather(bt, this_thr, gtid, tid,
1416                                     reduce USE_ITT_BUILD_ARG(itt_sync_obj));
1417       }
1418       }
1419     }
1420 
1421     KMP_MB();
1422 
1423     if (KMP_MASTER_TID(tid)) {
1424       status = 0;
1425       if (__kmp_tasking_mode != tskm_immediate_exec && !cancelled) {
1426         __kmp_task_team_wait(this_thr, team USE_ITT_BUILD_ARG(itt_sync_obj));
1427       }
1428 #if USE_DEBUGGER
1429       // Let the debugger know: All threads are arrived and starting leaving the
1430       // barrier.
1431       team->t.t_bar[bt].b_team_arrived += 1;
1432 #endif
1433 
1434       if (__kmp_omp_cancellation) {
1435         kmp_int32 cancel_request = KMP_ATOMIC_LD_RLX(&team->t.t_cancel_request);
1436         // Reset cancellation flag for worksharing constructs
1437         if (cancel_request == cancel_loop ||
1438             cancel_request == cancel_sections) {
1439           KMP_ATOMIC_ST_RLX(&team->t.t_cancel_request, cancel_noreq);
1440         }
1441       }
1442 #if USE_ITT_BUILD
1443       /* TODO: In case of split reduction barrier, master thread may send
1444          acquired event early, before the final summation into the shared
1445          variable is done (final summation can be a long operation for array
1446          reductions).  */
1447       if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1448         __kmp_itt_barrier_middle(gtid, itt_sync_obj);
1449 #endif /* USE_ITT_BUILD */
1450 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1451       // Barrier - report frame end (only if active_level == 1)
1452       if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
1453           __kmp_forkjoin_frames_mode &&
1454           this_thr->th.th_teams_microtask == NULL &&
1455           team->t.t_active_level == 1) {
1456         ident_t *loc = __kmp_threads[gtid]->th.th_ident;
1457         kmp_uint64 cur_time = __itt_get_timestamp();
1458         kmp_info_t **other_threads = team->t.t_threads;
1459         int nproc = this_thr->th.th_team_nproc;
1460         int i;
1461         switch (__kmp_forkjoin_frames_mode) {
1462         case 1:
1463           __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0,
1464                                  loc, nproc);
1465           this_thr->th.th_frame_time = cur_time;
1466           break;
1467         case 2: // AC 2015-01-19: currently does not work for hierarchical (to
1468           // be fixed)
1469           __kmp_itt_frame_submit(gtid, this_thr->th.th_bar_min_time, cur_time,
1470                                  1, loc, nproc);
1471           break;
1472         case 3:
1473           if (__itt_metadata_add_ptr) {
1474             // Initialize with master's wait time
1475             kmp_uint64 delta = cur_time - this_thr->th.th_bar_arrive_time;
1476             // Set arrive time to zero to be able to check it in
1477             // __kmp_invoke_task(); the same is done inside the loop below
1478             this_thr->th.th_bar_arrive_time = 0;
1479             for (i = 1; i < nproc; ++i) {
1480               delta += (cur_time - other_threads[i]->th.th_bar_arrive_time);
1481               other_threads[i]->th.th_bar_arrive_time = 0;
1482             }
1483             __kmp_itt_metadata_imbalance(gtid, this_thr->th.th_frame_time,
1484                                          cur_time, delta,
1485                                          (kmp_uint64)(reduce != NULL));
1486           }
1487           __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0,
1488                                  loc, nproc);
1489           this_thr->th.th_frame_time = cur_time;
1490           break;
1491         }
1492       }
1493 #endif /* USE_ITT_BUILD */
1494     } else {
1495       status = 1;
1496 #if USE_ITT_BUILD
1497       if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1498         __kmp_itt_barrier_middle(gtid, itt_sync_obj);
1499 #endif /* USE_ITT_BUILD */
1500     }
1501     if ((status == 1 || !is_split) && !cancelled) {
1502       if (cancellable) {
1503         cancelled = __kmp_linear_barrier_release_cancellable(
1504             bt, this_thr, gtid, tid, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
1505       } else {
1506         switch (__kmp_barrier_release_pattern[bt]) {
1507         case bp_hyper_bar: {
1508           KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
1509           __kmp_hyper_barrier_release(bt, this_thr, gtid, tid,
1510                                       FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
1511           break;
1512         }
1513         case bp_hierarchical_bar: {
1514           __kmp_hierarchical_barrier_release(
1515               bt, this_thr, gtid, tid, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
1516           break;
1517         }
1518         case bp_tree_bar: {
1519           KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
1520           __kmp_tree_barrier_release(bt, this_thr, gtid, tid,
1521                                      FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
1522           break;
1523         }
1524         default: {
1525           __kmp_linear_barrier_release(bt, this_thr, gtid, tid,
1526                                        FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
1527         }
1528         }
1529       }
1530       if (__kmp_tasking_mode != tskm_immediate_exec && !cancelled) {
1531         __kmp_task_team_sync(this_thr, team);
1532       }
1533     }
1534 
1535 #if USE_ITT_BUILD
1536     /* GEH: TODO: Move this under if-condition above and also include in
1537        __kmp_end_split_barrier(). This will more accurately represent the actual
1538        release time of the threads for split barriers.  */
1539     if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1540       __kmp_itt_barrier_finished(gtid, itt_sync_obj);
1541 #endif /* USE_ITT_BUILD */
1542   } else { // Team is serialized.
1543     status = 0;
1544     if (__kmp_tasking_mode != tskm_immediate_exec) {
1545       if (this_thr->th.th_task_team != NULL) {
1546 #if USE_ITT_NOTIFY
1547         void *itt_sync_obj = NULL;
1548         if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
1549           itt_sync_obj = __kmp_itt_barrier_object(gtid, bt, 1);
1550           __kmp_itt_barrier_starting(gtid, itt_sync_obj);
1551         }
1552 #endif
1553 
1554         KMP_DEBUG_ASSERT(this_thr->th.th_task_team->tt.tt_found_proxy_tasks ==
1555                          TRUE);
1556         __kmp_task_team_wait(this_thr, team USE_ITT_BUILD_ARG(itt_sync_obj));
1557         __kmp_task_team_setup(this_thr, team, 0);
1558 
1559 #if USE_ITT_BUILD
1560         if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1561           __kmp_itt_barrier_finished(gtid, itt_sync_obj);
1562 #endif /* USE_ITT_BUILD */
1563       }
1564     }
1565   }
1566   KA_TRACE(15, ("__kmp_barrier: T#%d(%d:%d) is leaving with return value %d\n",
1567                 gtid, __kmp_team_from_gtid(gtid)->t.t_id,
1568                 __kmp_tid_from_gtid(gtid), status));
1569 
1570 #if OMPT_SUPPORT
1571   if (ompt_enabled.enabled) {
1572 #if OMPT_OPTIONAL
1573     if (ompt_enabled.ompt_callback_sync_region_wait) {
1574       ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
1575           barrier_kind, ompt_scope_end, my_parallel_data, my_task_data,
1576           return_address);
1577     }
1578     if (ompt_enabled.ompt_callback_sync_region) {
1579       ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
1580           barrier_kind, ompt_scope_end, my_parallel_data, my_task_data,
1581           return_address);
1582     }
1583 #endif
1584     this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
1585   }
1586 #endif
1587   ANNOTATE_BARRIER_END(&team->t.t_bar);
1588 
1589   if (cancellable)
1590     return (int)cancelled;
1591   return status;
1592 }
1593 
1594 // Returns 0 if master thread, 1 if worker thread.
1595 int __kmp_barrier(enum barrier_type bt, int gtid, int is_split,
1596                   size_t reduce_size, void *reduce_data,
1597                   void (*reduce)(void *, void *)) {
1598   return __kmp_barrier_template<>(bt, gtid, is_split, reduce_size, reduce_data,
1599                                   reduce);
1600 }
1601 
#if defined(KMP_GOMP_COMPAT)
// Plain barrier used by the GOMP compatibility layer.
// When __kmp_omp_cancellation is off this is an ordinary plain barrier and
// the result is always FALSE. Otherwise the cancellable instantiation of
// __kmp_barrier_template is run and its result is returned:
// 1 if cancelled, 0 otherwise.
int __kmp_barrier_gomp_cancel(int gtid) {
  if (!__kmp_omp_cancellation) {
    __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);
    return FALSE;
  }

  int was_cancelled = __kmp_barrier_template<true>(bs_plain_barrier, gtid,
                                                   FALSE, 0, NULL, NULL);
  if (!was_cancelled)
    return was_cancelled;

  int tid = __kmp_tid_from_gtid(gtid);
  kmp_info_t *thr = __kmp_threads[gtid];
  // The master has nothing to undo; a worker must roll back the bump that
  // was applied to its private b_arrived flag.
  if (!KMP_MASTER_TID(tid)) {
    thr->th.th_bar[bs_plain_barrier].bb.b_arrived -= KMP_BARRIER_STATE_BUMP;
  }
  return was_cancelled;
}
#endif
1625 
1626 void __kmp_end_split_barrier(enum barrier_type bt, int gtid) {
1627   KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_end_split_barrier);
1628   KMP_SET_THREAD_STATE_BLOCK(PLAIN_BARRIER);
1629   int tid = __kmp_tid_from_gtid(gtid);
1630   kmp_info_t *this_thr = __kmp_threads[gtid];
1631   kmp_team_t *team = this_thr->th.th_team;
1632 
1633   ANNOTATE_BARRIER_BEGIN(&team->t.t_bar);
1634   if (!team->t.t_serialized) {
1635     if (KMP_MASTER_GTID(gtid)) {
1636       switch (__kmp_barrier_release_pattern[bt]) {
1637       case bp_hyper_bar: {
1638         KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
1639         __kmp_hyper_barrier_release(bt, this_thr, gtid, tid,
1640                                     FALSE USE_ITT_BUILD_ARG(NULL));
1641         break;
1642       }
1643       case bp_hierarchical_bar: {
1644         __kmp_hierarchical_barrier_release(bt, this_thr, gtid, tid,
1645                                            FALSE USE_ITT_BUILD_ARG(NULL));
1646         break;
1647       }
1648       case bp_tree_bar: {
1649         KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
1650         __kmp_tree_barrier_release(bt, this_thr, gtid, tid,
1651                                    FALSE USE_ITT_BUILD_ARG(NULL));
1652         break;
1653       }
1654       default: {
1655         __kmp_linear_barrier_release(bt, this_thr, gtid, tid,
1656                                      FALSE USE_ITT_BUILD_ARG(NULL));
1657       }
1658       }
1659       if (__kmp_tasking_mode != tskm_immediate_exec) {
1660         __kmp_task_team_sync(this_thr, team);
1661       } // if
1662     }
1663   }
1664   ANNOTATE_BARRIER_END(&team->t.t_bar);
1665 }
1666 
1667 void __kmp_join_barrier(int gtid) {
1668   KMP_TIME_PARTITIONED_BLOCK(OMP_join_barrier);
1669   KMP_SET_THREAD_STATE_BLOCK(FORK_JOIN_BARRIER);
1670   kmp_info_t *this_thr = __kmp_threads[gtid];
1671   kmp_team_t *team;
1672   kmp_uint nproc;
1673   kmp_info_t *master_thread;
1674   int tid;
1675 #ifdef KMP_DEBUG
1676   int team_id;
1677 #endif /* KMP_DEBUG */
1678 #if USE_ITT_BUILD
1679   void *itt_sync_obj = NULL;
1680 #if USE_ITT_NOTIFY
1681   if (__itt_sync_create_ptr || KMP_ITT_DEBUG) // Don't call routine without need
1682     // Get object created at fork_barrier
1683     itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
1684 #endif
1685 #endif /* USE_ITT_BUILD */
1686   KMP_MB();
1687 
1688   // Get current info
1689   team = this_thr->th.th_team;
1690   nproc = this_thr->th.th_team_nproc;
1691   KMP_DEBUG_ASSERT((int)nproc == team->t.t_nproc);
1692   tid = __kmp_tid_from_gtid(gtid);
1693 #ifdef KMP_DEBUG
1694   team_id = team->t.t_id;
1695 #endif /* KMP_DEBUG */
1696   master_thread = this_thr->th.th_team_master;
1697 #ifdef KMP_DEBUG
1698   if (master_thread != team->t.t_threads[0]) {
1699     __kmp_print_structure();
1700   }
1701 #endif /* KMP_DEBUG */
1702   KMP_DEBUG_ASSERT(master_thread == team->t.t_threads[0]);
1703   KMP_MB();
1704 
1705   // Verify state
1706   KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
1707   KMP_DEBUG_ASSERT(TCR_PTR(this_thr->th.th_team));
1708   KMP_DEBUG_ASSERT(TCR_PTR(this_thr->th.th_root));
1709   KMP_DEBUG_ASSERT(this_thr == team->t.t_threads[tid]);
1710   KA_TRACE(10, ("__kmp_join_barrier: T#%d(%d:%d) arrived at join barrier\n",
1711                 gtid, team_id, tid));
1712 
1713   ANNOTATE_BARRIER_BEGIN(&team->t.t_bar);
1714 #if OMPT_SUPPORT
1715   if (ompt_enabled.enabled) {
1716 #if OMPT_OPTIONAL
1717     ompt_data_t *my_task_data;
1718     ompt_data_t *my_parallel_data;
1719     void *codeptr = NULL;
1720     int ds_tid = this_thr->th.th_info.ds.ds_tid;
1721     if (KMP_MASTER_TID(ds_tid) &&
1722         (ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) ||
1723          ompt_callbacks.ompt_callback(ompt_callback_sync_region)))
1724       codeptr = team->t.ompt_team_info.master_return_address;
1725     my_task_data = OMPT_CUR_TASK_DATA(this_thr);
1726     my_parallel_data = OMPT_CUR_TEAM_DATA(this_thr);
1727     if (ompt_enabled.ompt_callback_sync_region) {
1728       ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
1729           ompt_sync_region_barrier_implicit, ompt_scope_begin, my_parallel_data,
1730           my_task_data, codeptr);
1731     }
1732     if (ompt_enabled.ompt_callback_sync_region_wait) {
1733       ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
1734           ompt_sync_region_barrier_implicit, ompt_scope_begin, my_parallel_data,
1735           my_task_data, codeptr);
1736     }
1737     if (!KMP_MASTER_TID(ds_tid))
1738       this_thr->th.ompt_thread_info.task_data = *OMPT_CUR_TASK_DATA(this_thr);
1739 #endif
1740     this_thr->th.ompt_thread_info.state = ompt_state_wait_barrier_implicit;
1741   }
1742 #endif
1743 
1744   if (__kmp_tasking_mode == tskm_extra_barrier) {
1745     __kmp_tasking_barrier(team, this_thr, gtid);
1746     KA_TRACE(10, ("__kmp_join_barrier: T#%d(%d:%d) past taking barrier\n", gtid,
1747                   team_id, tid));
1748   }
1749 #ifdef KMP_DEBUG
1750   if (__kmp_tasking_mode != tskm_immediate_exec) {
1751     KA_TRACE(20, ("__kmp_join_barrier: T#%d, old team = %d, old task_team = "
1752                   "%p, th_task_team = %p\n",
1753                   __kmp_gtid_from_thread(this_thr), team_id,
1754                   team->t.t_task_team[this_thr->th.th_task_state],
1755                   this_thr->th.th_task_team));
1756     KMP_DEBUG_ASSERT(this_thr->th.th_task_team ==
1757                      team->t.t_task_team[this_thr->th.th_task_state]);
1758   }
1759 #endif /* KMP_DEBUG */
1760 
1761   /* Copy the blocktime info to the thread, where __kmp_wait_template() can
1762      access it when the team struct is not guaranteed to exist. Doing these
1763      loads causes a cache miss slows down EPCC parallel by 2x. As a workaround,
1764      we do not perform the copy if blocktime=infinite, since the values are not
1765      used by __kmp_wait_template() in that case. */
1766   if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
1767 #if KMP_USE_MONITOR
1768     this_thr->th.th_team_bt_intervals =
1769         team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
1770     this_thr->th.th_team_bt_set =
1771         team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
1772 #else
1773     this_thr->th.th_team_bt_intervals = KMP_BLOCKTIME_INTERVAL(team, tid);
1774 #endif
1775   }
1776 
1777 #if USE_ITT_BUILD
1778   if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1779     __kmp_itt_barrier_starting(gtid, itt_sync_obj);
1780 #endif /* USE_ITT_BUILD */
1781 
1782   switch (__kmp_barrier_gather_pattern[bs_forkjoin_barrier]) {
1783   case bp_hyper_bar: {
1784     KMP_ASSERT(__kmp_barrier_gather_branch_bits[bs_forkjoin_barrier]);
1785     __kmp_hyper_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
1786                                NULL USE_ITT_BUILD_ARG(itt_sync_obj));
1787     break;
1788   }
1789   case bp_hierarchical_bar: {
1790     __kmp_hierarchical_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
1791                                       NULL USE_ITT_BUILD_ARG(itt_sync_obj));
1792     break;
1793   }
1794   case bp_tree_bar: {
1795     KMP_ASSERT(__kmp_barrier_gather_branch_bits[bs_forkjoin_barrier]);
1796     __kmp_tree_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
1797                               NULL USE_ITT_BUILD_ARG(itt_sync_obj));
1798     break;
1799   }
1800   default: {
1801     __kmp_linear_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
1802                                 NULL USE_ITT_BUILD_ARG(itt_sync_obj));
1803   }
1804   }
1805 
1806   /* From this point on, the team data structure may be deallocated at any time
1807      by the master thread - it is unsafe to reference it in any of the worker
1808      threads. Any per-team data items that need to be referenced before the
1809      end of the barrier should be moved to the kmp_task_team_t structs.  */
1810   if (KMP_MASTER_TID(tid)) {
1811     if (__kmp_tasking_mode != tskm_immediate_exec) {
1812       __kmp_task_team_wait(this_thr, team USE_ITT_BUILD_ARG(itt_sync_obj));
1813     }
1814     if (__kmp_display_affinity) {
1815       KMP_CHECK_UPDATE(team->t.t_display_affinity, 0);
1816     }
1817 #if KMP_STATS_ENABLED
1818     // Have master thread flag the workers to indicate they are now waiting for
1819     // next parallel region, Also wake them up so they switch their timers to
1820     // idle.
1821     for (int i = 0; i < team->t.t_nproc; ++i) {
1822       kmp_info_t *team_thread = team->t.t_threads[i];
1823       if (team_thread == this_thr)
1824         continue;
1825       team_thread->th.th_stats->setIdleFlag();
1826       if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME &&
1827           team_thread->th.th_sleep_loc != NULL)
1828         __kmp_null_resume_wrapper(__kmp_gtid_from_thread(team_thread),
1829                                   team_thread->th.th_sleep_loc);
1830     }
1831 #endif
1832 #if USE_ITT_BUILD
1833     if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1834       __kmp_itt_barrier_middle(gtid, itt_sync_obj);
1835 #endif /* USE_ITT_BUILD */
1836 
1837 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1838     // Join barrier - report frame end
1839     if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
1840         __kmp_forkjoin_frames_mode && this_thr->th.th_teams_microtask == NULL &&
1841         team->t.t_active_level == 1) {
1842       kmp_uint64 cur_time = __itt_get_timestamp();
1843       ident_t *loc = team->t.t_ident;
1844       kmp_info_t **other_threads = team->t.t_threads;
1845       int nproc = this_thr->th.th_team_nproc;
1846       int i;
1847       switch (__kmp_forkjoin_frames_mode) {
1848       case 1:
1849         __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0,
1850                                loc, nproc);
1851         break;
1852       case 2:
1853         __kmp_itt_frame_submit(gtid, this_thr->th.th_bar_min_time, cur_time, 1,
1854                                loc, nproc);
1855         break;
1856       case 3:
1857         if (__itt_metadata_add_ptr) {
1858           // Initialize with master's wait time
1859           kmp_uint64 delta = cur_time - this_thr->th.th_bar_arrive_time;
1860           // Set arrive time to zero to be able to check it in
1861           // __kmp_invoke_task(); the same is done inside the loop below
1862           this_thr->th.th_bar_arrive_time = 0;
1863           for (i = 1; i < nproc; ++i) {
1864             delta += (cur_time - other_threads[i]->th.th_bar_arrive_time);
1865             other_threads[i]->th.th_bar_arrive_time = 0;
1866           }
1867           __kmp_itt_metadata_imbalance(gtid, this_thr->th.th_frame_time,
1868                                        cur_time, delta, 0);
1869         }
1870         __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0,
1871                                loc, nproc);
1872         this_thr->th.th_frame_time = cur_time;
1873         break;
1874       }
1875     }
1876 #endif /* USE_ITT_BUILD */
1877   }
1878 #if USE_ITT_BUILD
1879   else {
1880     if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1881       __kmp_itt_barrier_middle(gtid, itt_sync_obj);
1882   }
1883 #endif /* USE_ITT_BUILD */
1884 
1885 #if KMP_DEBUG
1886   if (KMP_MASTER_TID(tid)) {
1887     KA_TRACE(
1888         15,
1889         ("__kmp_join_barrier: T#%d(%d:%d) says all %d team threads arrived\n",
1890          gtid, team_id, tid, nproc));
1891   }
1892 #endif /* KMP_DEBUG */
1893 
1894   // TODO now, mark worker threads as done so they may be disbanded
1895   KMP_MB(); // Flush all pending memory write invalidates.
1896   KA_TRACE(10,
1897            ("__kmp_join_barrier: T#%d(%d:%d) leaving\n", gtid, team_id, tid));
1898 
1899   ANNOTATE_BARRIER_END(&team->t.t_bar);
1900 }
1901 
1902 // TODO release worker threads' fork barriers as we are ready instead of all at
1903 // once
1904 void __kmp_fork_barrier(int gtid, int tid) {
1905   KMP_TIME_PARTITIONED_BLOCK(OMP_fork_barrier);
1906   KMP_SET_THREAD_STATE_BLOCK(FORK_JOIN_BARRIER);
1907   kmp_info_t *this_thr = __kmp_threads[gtid];
1908   kmp_team_t *team = (tid == 0) ? this_thr->th.th_team : NULL;
1909 #if USE_ITT_BUILD
1910   void *itt_sync_obj = NULL;
1911 #endif /* USE_ITT_BUILD */
1912   if (team)
1913     ANNOTATE_BARRIER_END(&team->t.t_bar);
1914 
1915   KA_TRACE(10, ("__kmp_fork_barrier: T#%d(%d:%d) has arrived\n", gtid,
1916                 (team != NULL) ? team->t.t_id : -1, tid));
1917 
1918   // th_team pointer only valid for master thread here
1919   if (KMP_MASTER_TID(tid)) {
1920 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1921     if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
1922       // Create itt barrier object
1923       itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 1);
1924       __kmp_itt_barrier_middle(gtid, itt_sync_obj); // Call acquired/releasing
1925     }
1926 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
1927 
1928 #ifdef KMP_DEBUG
1929     kmp_info_t **other_threads = team->t.t_threads;
1930     int i;
1931 
1932     // Verify state
1933     KMP_MB();
1934 
1935     for (i = 1; i < team->t.t_nproc; ++i) {
1936       KA_TRACE(500,
1937                ("__kmp_fork_barrier: T#%d(%d:0) checking T#%d(%d:%d) fork go "
1938                 "== %u.\n",
1939                 gtid, team->t.t_id, other_threads[i]->th.th_info.ds.ds_gtid,
1940                 team->t.t_id, other_threads[i]->th.th_info.ds.ds_tid,
1941                 other_threads[i]->th.th_bar[bs_forkjoin_barrier].bb.b_go));
1942       KMP_DEBUG_ASSERT(
1943           (TCR_4(other_threads[i]->th.th_bar[bs_forkjoin_barrier].bb.b_go) &
1944            ~(KMP_BARRIER_SLEEP_STATE)) == KMP_INIT_BARRIER_STATE);
1945       KMP_DEBUG_ASSERT(other_threads[i]->th.th_team == team);
1946     }
1947 #endif
1948 
1949     if (__kmp_tasking_mode != tskm_immediate_exec) {
1950       // 0 indicates setup current task team if nthreads > 1
1951       __kmp_task_team_setup(this_thr, team, 0);
1952     }
1953 
1954     /* The master thread may have changed its blocktime between the join barrier
1955        and the fork barrier. Copy the blocktime info to the thread, where
1956        __kmp_wait_template() can access it when the team struct is not
1957        guaranteed to exist. */
1958     // See note about the corresponding code in __kmp_join_barrier() being
1959     // performance-critical
1960     if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
1961 #if KMP_USE_MONITOR
1962       this_thr->th.th_team_bt_intervals =
1963           team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
1964       this_thr->th.th_team_bt_set =
1965           team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
1966 #else
1967       this_thr->th.th_team_bt_intervals = KMP_BLOCKTIME_INTERVAL(team, tid);
1968 #endif
1969     }
1970   } // master
1971 
1972   switch (__kmp_barrier_release_pattern[bs_forkjoin_barrier]) {
1973   case bp_hyper_bar: {
1974     KMP_ASSERT(__kmp_barrier_release_branch_bits[bs_forkjoin_barrier]);
1975     __kmp_hyper_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
1976                                 TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
1977     break;
1978   }
1979   case bp_hierarchical_bar: {
1980     __kmp_hierarchical_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
1981                                        TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
1982     break;
1983   }
1984   case bp_tree_bar: {
1985     KMP_ASSERT(__kmp_barrier_release_branch_bits[bs_forkjoin_barrier]);
1986     __kmp_tree_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
1987                                TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
1988     break;
1989   }
1990   default: {
1991     __kmp_linear_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
1992                                  TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
1993   }
1994   }
1995 
1996 #if OMPT_SUPPORT
1997   if (ompt_enabled.enabled &&
1998       this_thr->th.ompt_thread_info.state == ompt_state_wait_barrier_implicit) {
1999     int ds_tid = this_thr->th.th_info.ds.ds_tid;
2000     ompt_data_t *task_data = (team)
2001                                  ? OMPT_CUR_TASK_DATA(this_thr)
2002                                  : &(this_thr->th.ompt_thread_info.task_data);
2003     this_thr->th.ompt_thread_info.state = ompt_state_overhead;
2004 #if OMPT_OPTIONAL
2005     void *codeptr = NULL;
2006     if (KMP_MASTER_TID(ds_tid) &&
2007         (ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) ||
2008          ompt_callbacks.ompt_callback(ompt_callback_sync_region)))
2009       codeptr = team->t.ompt_team_info.master_return_address;
2010     if (ompt_enabled.ompt_callback_sync_region_wait) {
2011       ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
2012           ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data,
2013           codeptr);
2014     }
2015     if (ompt_enabled.ompt_callback_sync_region) {
2016       ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
2017           ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data,
2018           codeptr);
2019     }
2020 #endif
2021     if (!KMP_MASTER_TID(ds_tid) && ompt_enabled.ompt_callback_implicit_task) {
2022       ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
2023           ompt_scope_end, NULL, task_data, 0, ds_tid, ompt_task_implicit); // TODO: Can this be ompt_task_initial?
2024     }
2025   }
2026 #endif
2027 
2028   // Early exit for reaping threads releasing forkjoin barrier
2029   if (TCR_4(__kmp_global.g.g_done)) {
2030     this_thr->th.th_task_team = NULL;
2031 
2032 #if USE_ITT_BUILD && USE_ITT_NOTIFY
2033     if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
2034       if (!KMP_MASTER_TID(tid)) {
2035         itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
2036         if (itt_sync_obj)
2037           __kmp_itt_barrier_finished(gtid, itt_sync_obj);
2038       }
2039     }
2040 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
2041     KA_TRACE(10, ("__kmp_fork_barrier: T#%d is leaving early\n", gtid));
2042     return;
2043   }
2044 
2045   /* We can now assume that a valid team structure has been allocated by the
2046      master and propagated to all worker threads. The current thread, however,
2047      may not be part of the team, so we can't blindly assume that the team
2048      pointer is non-null.  */
2049   team = (kmp_team_t *)TCR_PTR(this_thr->th.th_team);
2050   KMP_DEBUG_ASSERT(team != NULL);
2051   tid = __kmp_tid_from_gtid(gtid);
2052 
2053 #if KMP_BARRIER_ICV_PULL
2054   /* Master thread's copy of the ICVs was set up on the implicit taskdata in
2055      __kmp_reinitialize_team. __kmp_fork_call() assumes the master thread's
2056      implicit task has this data before this function is called. We cannot
2057      modify __kmp_fork_call() to look at the fixed ICVs in the master's thread
2058      struct, because it is not always the case that the threads arrays have
2059      been allocated when __kmp_fork_call() is executed. */
2060   {
2061     KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(USER_icv_copy);
2062     if (!KMP_MASTER_TID(tid)) { // master thread already has ICVs
2063       // Copy the initial ICVs from the master's thread struct to the implicit
2064       // task for this tid.
2065       KA_TRACE(10,
2066                ("__kmp_fork_barrier: T#%d(%d) is PULLing ICVs\n", gtid, tid));
2067       __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team,
2068                                tid, FALSE);
2069       copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
2070                 &team->t.t_threads[0]
2071                      ->th.th_bar[bs_forkjoin_barrier]
2072                      .bb.th_fixed_icvs);
2073     }
2074   }
2075 #endif // KMP_BARRIER_ICV_PULL
2076 
2077   if (__kmp_tasking_mode != tskm_immediate_exec) {
2078     __kmp_task_team_sync(this_thr, team);
2079   }
2080 
2081 #if KMP_AFFINITY_SUPPORTED
2082   kmp_proc_bind_t proc_bind = team->t.t_proc_bind;
2083   if (proc_bind == proc_bind_intel) {
2084     // Call dynamic affinity settings
2085     if (__kmp_affinity_type == affinity_balanced && team->t.t_size_changed) {
2086       __kmp_balanced_affinity(this_thr, team->t.t_nproc);
2087     }
2088   } else if (proc_bind != proc_bind_false) {
2089     if (this_thr->th.th_new_place == this_thr->th.th_current_place) {
2090       KA_TRACE(100, ("__kmp_fork_barrier: T#%d already in correct place %d\n",
2091                      __kmp_gtid_from_thread(this_thr),
2092                      this_thr->th.th_current_place));
2093     } else {
2094       __kmp_affinity_set_place(gtid);
2095     }
2096   }
2097 #endif // KMP_AFFINITY_SUPPORTED
2098   // Perform the display affinity functionality
2099   if (__kmp_display_affinity) {
2100     if (team->t.t_display_affinity
2101 #if KMP_AFFINITY_SUPPORTED
2102         || (__kmp_affinity_type == affinity_balanced && team->t.t_size_changed)
2103 #endif
2104             ) {
2105       // NULL means use the affinity-format-var ICV
2106       __kmp_aux_display_affinity(gtid, NULL);
2107       this_thr->th.th_prev_num_threads = team->t.t_nproc;
2108       this_thr->th.th_prev_level = team->t.t_level;
2109     }
2110   }
2111   if (!KMP_MASTER_TID(tid))
2112     KMP_CHECK_UPDATE(this_thr->th.th_def_allocator, team->t.t_def_allocator);
2113 
2114 #if USE_ITT_BUILD && USE_ITT_NOTIFY
2115   if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
2116     if (!KMP_MASTER_TID(tid)) {
2117       // Get correct barrier object
2118       itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
2119       __kmp_itt_barrier_finished(gtid, itt_sync_obj); // Workers call acquired
2120     } // (prepare called inside barrier_release)
2121   }
2122 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
2123   ANNOTATE_BARRIER_END(&team->t.t_bar);
2124   KA_TRACE(10, ("__kmp_fork_barrier: T#%d(%d:%d) is leaving\n", gtid,
2125                 team->t.t_id, tid));
2126 }
2127 
2128 void __kmp_setup_icv_copy(kmp_team_t *team, int new_nproc,
2129                           kmp_internal_control_t *new_icvs, ident_t *loc) {
2130   KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_setup_icv_copy);
2131 
2132   KMP_DEBUG_ASSERT(team && new_nproc && new_icvs);
2133   KMP_DEBUG_ASSERT((!TCR_4(__kmp_init_parallel)) || new_icvs->nproc);
2134 
2135 /* Master thread's copy of the ICVs was set up on the implicit taskdata in
2136    __kmp_reinitialize_team. __kmp_fork_call() assumes the master thread's
2137    implicit task has this data before this function is called. */
2138 #if KMP_BARRIER_ICV_PULL
2139   /* Copy ICVs to master's thread structure into th_fixed_icvs (which remains
2140      untouched), where all of the worker threads can access them and make their
2141      own copies after the barrier. */
2142   KMP_DEBUG_ASSERT(team->t.t_threads[0]); // The threads arrays should be
2143   // allocated at this point
2144   copy_icvs(
2145       &team->t.t_threads[0]->th.th_bar[bs_forkjoin_barrier].bb.th_fixed_icvs,
2146       new_icvs);
2147   KF_TRACE(10, ("__kmp_setup_icv_copy: PULL: T#%d this_thread=%p team=%p\n", 0,
2148                 team->t.t_threads[0], team));
2149 #elif KMP_BARRIER_ICV_PUSH
2150   // The ICVs will be propagated in the fork barrier, so nothing needs to be
2151   // done here.
2152   KF_TRACE(10, ("__kmp_setup_icv_copy: PUSH: T#%d this_thread=%p team=%p\n", 0,
2153                 team->t.t_threads[0], team));
2154 #else
2155   // Copy the ICVs to each of the non-master threads.  This takes O(nthreads)
2156   // time.
2157   ngo_load(new_icvs);
2158   KMP_DEBUG_ASSERT(team->t.t_threads[0]); // The threads arrays should be
2159   // allocated at this point
2160   for (int f = 1; f < new_nproc; ++f) { // Skip the master thread
2161     // TODO: GEH - pass in better source location info since usually NULL here
2162     KF_TRACE(10, ("__kmp_setup_icv_copy: LINEAR: T#%d this_thread=%p team=%p\n",
2163                   f, team->t.t_threads[f], team));
2164     __kmp_init_implicit_task(loc, team->t.t_threads[f], team, f, FALSE);
2165     ngo_store_icvs(&team->t.t_implicit_task_taskdata[f].td_icvs, new_icvs);
2166     KF_TRACE(10, ("__kmp_setup_icv_copy: LINEAR: T#%d this_thread=%p team=%p\n",
2167                   f, team->t.t_threads[f], team));
2168   }
2169   ngo_sync();
2170 #endif // KMP_BARRIER_ICV_PULL
2171 }
2172