xref: /freebsd/contrib/llvm-project/openmp/runtime/src/kmp_barrier.cpp (revision 9e5787d2284e187abb5b654d924394a65772e004)
1 /*
2  * kmp_barrier.cpp
3  */
4 
5 //===----------------------------------------------------------------------===//
6 //
7 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8 // See https://llvm.org/LICENSE.txt for license information.
9 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "kmp.h"
14 #include "kmp_wait_release.h"
15 #include "kmp_itt.h"
16 #include "kmp_os.h"
17 #include "kmp_stats.h"
18 #include "ompt-specific.h"
19 
20 #if KMP_MIC
21 #include <immintrin.h>
22 #define USE_NGO_STORES 1
23 #endif // KMP_MIC
24 
25 #include "tsan_annotations.h"
26 
// Helper macros for copying ICVs / "go" cache lines (ngo_load, ngo_store_icvs
// and ngo_sync are used by the linear barrier release below).
//  - KMP_MIC with USE_NGO_STORES: ngo_load(src) declares a local __m512d named
//    Vt loaded from src. Both store macros write Vt to dst with
//    _mm512_storenrngo_pd and ignore their src argument, so ngo_load() must
//    have been expanded earlier in the same scope. ngo_sync() issues a locked
//    add on the stack pointer with a "memory" clobber.
//  - Otherwise: ngo_load() and ngo_sync() expand to no-ops, ngo_store_icvs()
//    forwards to copy_icvs() and ngo_store_go() copies CACHE_LINE bytes with
//    KMP_MEMCPY.
27 #if KMP_MIC && USE_NGO_STORES
28 // ICV copying
29 #define ngo_load(src) __m512d Vt = _mm512_load_pd((void *)(src))
30 #define ngo_store_icvs(dst, src) _mm512_storenrngo_pd((void *)(dst), Vt)
31 #define ngo_store_go(dst, src) _mm512_storenrngo_pd((void *)(dst), Vt)
32 #define ngo_sync() __asm__ volatile("lock; addl $0,0(%%rsp)" ::: "memory")
33 #else
34 #define ngo_load(src) ((void)0)
35 #define ngo_store_icvs(dst, src) copy_icvs((dst), (src))
36 #define ngo_store_go(dst, src) KMP_MEMCPY((dst), (src), CACHE_LINE)
37 #define ngo_sync() ((void)0)
38 #endif /* KMP_MIC && USE_NGO_STORES */
39 
40 void __kmp_print_structure(void); // Forward declaration
41 
42 // ---------------------------- Barrier Algorithms ----------------------------
43 
44 // Linear Barrier
// Linear barrier, gather phase.
//
// A non-master thread signals its arrival by releasing a 64-bit flag built on
// its own b_arrived, with the master (other_threads[0]) as the waiter. The
// master waits, in tid order, for every other thread's b_arrived to reach
// team b_arrived + KMP_BARRIER_STATE_BUMP, optionally calls `reduce` to
// combine that thread's reduce_data into its own, and finally stores the new
// value into the team's b_arrived.
//
// Template parameter:
//   cancellable  - when true the master waits with wait_cancellable_nosleep()
//                  and returns true as soon as one wait reports cancellation.
// Parameters:
//   bt           - barrier type; indexes th_bar[] and t_bar[].
//   this_thr     - descriptor of the calling thread.
//   gtid         - global thread id (used here for tracing and by the OMPT
//                  reduction macros).
//   tid          - thread id within the team; KMP_MASTER_TID(tid) selects the
//                  master path.
//   reduce       - optional reduction callback; skipped when NULL.
//   itt_sync_obj - forwarded to the wait calls through USE_ITT_BUILD_ARG.
// Returns: true only if a cancellable wait was cancelled, false otherwise.
45 template <bool cancellable = false>
46 static bool __kmp_linear_barrier_gather_template(
47     enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
48     void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
49   KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_linear_gather);
50   kmp_team_t *team = this_thr->th.th_team;
51   kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
52   kmp_info_t **other_threads = team->t.t_threads;
53
54   KA_TRACE(
55       20,
56       ("__kmp_linear_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
57        gtid, team->t.t_id, tid, bt));
58   KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
59
60 #if USE_ITT_BUILD && USE_ITT_NOTIFY
61   // Barrier imbalance - save arrive time to the thread
62   if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
63     this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time =
64         __itt_get_timestamp();
65   }
66 #endif
67   // We now perform a linear reduction to signal that all of the threads have
68   // arrived.
69   if (!KMP_MASTER_TID(tid)) {
70     KA_TRACE(20,
71              ("__kmp_linear_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d)"
72               "arrived(%p): %llu => %llu\n",
73               gtid, team->t.t_id, tid, __kmp_gtid_from_tid(0, team),
74               team->t.t_id, 0, &thr_bar->b_arrived, thr_bar->b_arrived,
75               thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
76     // Mark arrival to master thread
77     /* After performing this write, a worker thread may not assume that the team
78        is valid any more - it could be deallocated by the master thread at any
79        time. */
80     ANNOTATE_BARRIER_BEGIN(this_thr);
       // The flag is built on this thread's own b_arrived; the waiter handed to
       // it is the master. NOTE(review): release() presumably bumps b_arrived
       // by KMP_BARRIER_STATE_BUMP (as the trace above prints) and wakes the
       // waiter - confirm in kmp_wait_release.h.
81     kmp_flag_64 flag(&thr_bar->b_arrived, other_threads[0]);
82     flag.release();
83   } else {
84     kmp_balign_team_t *team_bar = &team->t.t_bar[bt];
85     int nproc = this_thr->th.th_team_nproc;
86     int i;
87     // Don't have to worry about sleep bit here or atomic since team setting
88     kmp_uint64 new_state = team_bar->b_arrived + KMP_BARRIER_STATE_BUMP;
89
90     // Collect all the worker team member threads.
91     for (i = 1; i < nproc; ++i) {
92 #if KMP_CACHE_MANAGE
93       // Prefetch next thread's arrived count
94       if (i + 1 < nproc)
95         KMP_CACHE_PREFETCH(&other_threads[i + 1]->th.th_bar[bt].bb.b_arrived);
96 #endif /* KMP_CACHE_MANAGE */
97       KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%d) "
98                     "arrived(%p) == %llu\n",
99                     gtid, team->t.t_id, tid, __kmp_gtid_from_tid(i, team),
100                     team->t.t_id, i,
101                     &other_threads[i]->th.th_bar[bt].bb.b_arrived, new_state));
102
103       // Wait for worker thread to arrive
104       kmp_flag_64 flag(&other_threads[i]->th.th_bar[bt].bb.b_arrived,
105                        new_state);
          // `cancellable` is a template constant, so only one of the two wait
          // variants is ever taken in a given instantiation. On cancellation
          // we return before team_bar->b_arrived is updated.
106       if (cancellable) {
107         bool cancelled = flag.wait_cancellable_nosleep(
108             this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
109         if (cancelled)
110           return true;
111       } else {
112         flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
113       }
114       ANNOTATE_BARRIER_END(other_threads[i]);
115 #if USE_ITT_BUILD && USE_ITT_NOTIFY
116       // Barrier imbalance - write min of the thread time and the other thread
117       // time to the thread.
118       if (__kmp_forkjoin_frames_mode == 2) {
119         this_thr->th.th_bar_min_time = KMP_MIN(
120             this_thr->th.th_bar_min_time, other_threads[i]->th.th_bar_min_time);
121       }
122 #endif
          // Combine thread i's partial result with the master's: the callback
          // receives the master's reduce_data first and thread i's second.
123       if (reduce) {
124         KA_TRACE(100,
125                  ("__kmp_linear_barrier_gather: T#%d(%d:%d) += T#%d(%d:%d)\n",
126                   gtid, team->t.t_id, tid, __kmp_gtid_from_tid(i, team),
127                   team->t.t_id, i));
128         ANNOTATE_REDUCE_AFTER(reduce);
129         OMPT_REDUCTION_DECL(this_thr, gtid);
130         OMPT_REDUCTION_BEGIN;
131         (*reduce)(this_thr->th.th_local.reduce_data,
132                   other_threads[i]->th.th_local.reduce_data);
133         OMPT_REDUCTION_END;
134         ANNOTATE_REDUCE_BEFORE(reduce);
135         ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
136       }
137     }
138     // Don't have to worry about sleep bit here or atomic since team setting
139     team_bar->b_arrived = new_state;
140     KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) set team %d "
141                   "arrived(%p) = %llu\n",
142                   gtid, team->t.t_id, tid, team->t.t_id, &team_bar->b_arrived,
143                   new_state));
144   }
145   KA_TRACE(
146       20,
147       ("__kmp_linear_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
148        gtid, team->t.t_id, tid, bt));
149   return false;
150 }
151 
// Linear barrier, release phase.
//
// Master path (KMP_MASTER_TID(tid)): when the team has more than one thread,
// optionally push ICVs (only if KMP_BARRIER_ICV_PUSH is enabled and
// propagate_icvs is non-zero) by re-initializing each other thread's implicit
// task and copying the ICVs of implicit task 0 into it, then release every
// other thread's b_go flag in tid order.
//
// Other threads: wait on their own b_go for KMP_BARRIER_STATE_BUMP, return
// early (false) if this is the fork/join barrier and __kmp_global.g.g_done is
// set, and otherwise reset b_go to KMP_INIT_BARRIER_STATE.
//
// Template parameter:
//   cancellable    - when true the waiting thread uses
//                    wait_cancellable_nosleep() and returns true if the wait
//                    reports cancellation (b_go is then left untouched).
// Parameters:
//   bt             - barrier type; indexes th_bar[].
//   this_thr       - descriptor of the calling thread.
//   gtid           - global thread id; used to look the thread up in
//                    __kmp_threads and for tracing/ITT.
//   tid            - thread id within the team.
//   propagate_icvs - non-zero to push ICVs from the master (see above).
//   itt_sync_obj   - ITT sync object; may be replaced locally after the wait.
// Returns: true only if a cancellable wait was cancelled, false otherwise.
152 template <bool cancellable = false>
153 static bool __kmp_linear_barrier_release_template(
154     enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
155     int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
156   KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_linear_release);
157   kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
158   kmp_team_t *team;
159
160   if (KMP_MASTER_TID(tid)) {
161     unsigned int i;
162     kmp_uint32 nproc = this_thr->th.th_team_nproc;
163     kmp_info_t **other_threads;
164
165     team = __kmp_threads[gtid]->th.th_team;
166     KMP_DEBUG_ASSERT(team != NULL);
167     other_threads = team->t.t_threads;
168
169     KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d(%d:%d) master enter for "
170                   "barrier type %d\n",
171                   gtid, team->t.t_id, tid, bt));
172
173     if (nproc > 1) {
174 #if KMP_BARRIER_ICV_PUSH
175       {
176         KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(USER_icv_copy);
177         if (propagate_icvs) {
              // ngo_load() may declare the local that ngo_store_icvs() reads,
              // so it has to stay in the same scope as the loop below.
178           ngo_load(&team->t.t_implicit_task_taskdata[0].td_icvs);
179           for (i = 1; i < nproc; ++i) {
180             __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[i],
181                                      team, i, FALSE);
182             ngo_store_icvs(&team->t.t_implicit_task_taskdata[i].td_icvs,
183                            &team->t.t_implicit_task_taskdata[0].td_icvs);
184           }
185           ngo_sync();
186         }
187       }
188 #endif // KMP_BARRIER_ICV_PUSH
189
190       // Now, release all of the worker threads
191       for (i = 1; i < nproc; ++i) {
192 #if KMP_CACHE_MANAGE
193         // Prefetch next thread's go flag
194         if (i + 1 < nproc)
195           KMP_CACHE_PREFETCH(&other_threads[i + 1]->th.th_bar[bt].bb.b_go);
196 #endif /* KMP_CACHE_MANAGE */
197         KA_TRACE(
198             20,
199             ("__kmp_linear_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%d) "
200              "go(%p): %u => %u\n",
201              gtid, team->t.t_id, tid, other_threads[i]->th.th_info.ds.ds_gtid,
202              team->t.t_id, i, &other_threads[i]->th.th_bar[bt].bb.b_go,
203              other_threads[i]->th.th_bar[bt].bb.b_go,
204              other_threads[i]->th.th_bar[bt].bb.b_go + KMP_BARRIER_STATE_BUMP));
205         ANNOTATE_BARRIER_BEGIN(other_threads[i]);
            // The flag is built on thread i's b_go with thread i as the waiter.
206         kmp_flag_64 flag(&other_threads[i]->th.th_bar[bt].bb.b_go,
207                          other_threads[i]);
208         flag.release();
209       }
210     }
211   } else { // Wait for the MASTER thread to release us
212     KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d wait go(%p) == %u\n",
213                   gtid, &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
214     kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
215     if (cancellable) {
216       bool cancelled = flag.wait_cancellable_nosleep(
217           this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
218       if (cancelled) {
219         return true;
220       }
221     } else {
222       flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
223     }
224     ANNOTATE_BARRIER_END(this_thr);
        // The `else` that closes the ITT block below is intentionally left
        // dangling across the #endif so that it binds to the following `if`:
        // the g_done early-exit check is therefore performed whether or not
        // the ITT code is compiled in (the ITT branch has its own copy).
225 #if USE_ITT_BUILD && USE_ITT_NOTIFY
226     if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
227       // In a fork barrier; cannot get the object reliably (or ITTNOTIFY is
228       // disabled)
229       itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
230       // Cancel wait on previous parallel region...
231       __kmp_itt_task_starting(itt_sync_obj);
232
233       if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
234         return false;
235
236       itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
237       if (itt_sync_obj != NULL)
238         // Call prepare as early as possible for "new" barrier
239         __kmp_itt_task_finished(itt_sync_obj);
240     } else
241 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
242         // Early exit for reaping threads releasing forkjoin barrier
243         if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
244       return false;
245 // The worker thread may now assume that the team is valid.
    // On this path `team` (and the refreshed `tid`) are assigned only when
    // KMP_DEBUG is defined; the remaining uses of `team` here are inside
    // KMP_DEBUG_ASSERT / KA_TRACE. NOTE(review): presumably those macros
    // compile to nothing without KMP_DEBUG - confirm, otherwise `team` would
    // be read uninitialized.
246 #ifdef KMP_DEBUG
247     tid = __kmp_tid_from_gtid(gtid);
248     team = __kmp_threads[gtid]->th.th_team;
249 #endif
250     KMP_DEBUG_ASSERT(team != NULL);
        // Reset our go flag so it can be waited on again at the next barrier.
251     TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
252     KA_TRACE(20,
253              ("__kmp_linear_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
254               gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
255     KMP_MB(); // Flush all pending memory write invalidates.
256   }
257   KA_TRACE(
258       20,
259       ("__kmp_linear_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
260        gtid, team->t.t_id, tid, bt));
261   return false;
262 }
263 
// Non-cancellable linear gather. Forwards all arguments to the
// cancellable=false instantiation of __kmp_linear_barrier_gather_template and
// discards its result (that instantiation only ever returns false).
264 static void __kmp_linear_barrier_gather(
265     enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
266     void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
267   __kmp_linear_barrier_gather_template<false>(
268       bt, this_thr, gtid, tid, reduce USE_ITT_BUILD_ARG(itt_sync_obj));
269 }
270 
// Cancellable linear gather. Forwards all arguments to the cancellable=true
// instantiation of __kmp_linear_barrier_gather_template.
// Returns: true if the master's wait on some thread was cancelled, false if
// the gather completed.
271 static bool __kmp_linear_barrier_gather_cancellable(
272     enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
273     void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
274   return __kmp_linear_barrier_gather_template<true>(
275       bt, this_thr, gtid, tid, reduce USE_ITT_BUILD_ARG(itt_sync_obj));
276 }
277 
// Non-cancellable linear release. Forwards all arguments to the
// cancellable=false instantiation of __kmp_linear_barrier_release_template and
// discards its result (that instantiation only ever returns false).
278 static void __kmp_linear_barrier_release(
279     enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
280     int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
281   __kmp_linear_barrier_release_template<false>(
282       bt, this_thr, gtid, tid, propagate_icvs USE_ITT_BUILD_ARG(itt_sync_obj));
283 }
284 
// Cancellable linear release. Forwards all arguments to the cancellable=true
// instantiation of __kmp_linear_barrier_release_template.
// Returns: true if a non-master thread's wait for its go flag was cancelled,
// false otherwise.
285 static bool __kmp_linear_barrier_release_cancellable(
286     enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
287     int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
288   return __kmp_linear_barrier_release_template<true>(
289       bt, this_thr, gtid, tid, propagate_icvs USE_ITT_BUILD_ARG(itt_sync_obj));
290 }
291 
292 // Tree barrier
// Tree barrier, gather phase.
//
// With branch_factor = 1 << __kmp_barrier_gather_branch_bits[bt], thread `tid`
// is the parent of the (at most branch_factor) threads whose ids start at
// (tid << branch_bits) + 1 and are below nproc. A parent first waits for each
// child's b_arrived to reach team b_arrived + KMP_BARRIER_STATE_BUMP, calling
// `reduce` (if non-NULL) with its own reduce_data and the child's. Afterwards
// a non-master thread releases its own b_arrived towards its parent
// ((tid - 1) >> branch_bits), while the master advances the team's b_arrived.
//
// Parameters:
//   bt           - barrier type; indexes th_bar[], t_bar[] and the branch-bits
//                  table.
//   this_thr     - descriptor of the calling thread.
//   gtid         - global thread id (tracing and OMPT reduction macros).
//   tid          - thread id within the team.
//   reduce       - optional reduction callback; skipped when NULL.
//   itt_sync_obj - forwarded to the wait calls through USE_ITT_BUILD_ARG.
293 static void
294 __kmp_tree_barrier_gather(enum barrier_type bt, kmp_info_t *this_thr, int gtid,
295                           int tid, void (*reduce)(void *, void *)
296                                        USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
297   KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_tree_gather);
298   kmp_team_t *team = this_thr->th.th_team;
299   kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
300   kmp_info_t **other_threads = team->t.t_threads;
301   kmp_uint32 nproc = this_thr->th.th_team_nproc;
302   kmp_uint32 branch_bits = __kmp_barrier_gather_branch_bits[bt];
303   kmp_uint32 branch_factor = 1 << branch_bits;
304   kmp_uint32 child;
305   kmp_uint32 child_tid;
      // Assigned only when this thread has at least one child (see below).
306   kmp_uint64 new_state;
307
308   KA_TRACE(
309       20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
310            gtid, team->t.t_id, tid, bt));
311   KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
312
313 #if USE_ITT_BUILD && USE_ITT_NOTIFY
314   // Barrier imbalance - save arrive time to the thread
315   if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
316     this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time =
317         __itt_get_timestamp();
318   }
319 #endif
320   // Perform tree gather to wait until all threads have arrived; reduce any
321   // required data as we go
      // First (lowest-numbered) child of this thread in the tree.
322   child_tid = (tid << branch_bits) + 1;
323   if (child_tid < nproc) {
324     // Parent threads wait for all their children to arrive
325     new_state = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
326     child = 1;
327     do {
328       kmp_info_t *child_thr = other_threads[child_tid];
329       kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
330 #if KMP_CACHE_MANAGE
331       // Prefetch next thread's arrived count
332       if (child + 1 <= branch_factor && child_tid + 1 < nproc)
333         KMP_CACHE_PREFETCH(
334             &other_threads[child_tid + 1]->th.th_bar[bt].bb.b_arrived);
335 #endif /* KMP_CACHE_MANAGE */
336       KA_TRACE(20,
337                ("__kmp_tree_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%u) "
338                 "arrived(%p) == %llu\n",
339                 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
340                 team->t.t_id, child_tid, &child_bar->b_arrived, new_state));
341       // Wait for child to arrive
342       kmp_flag_64 flag(&child_bar->b_arrived, new_state);
343       flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
344       ANNOTATE_BARRIER_END(child_thr);
345 #if USE_ITT_BUILD && USE_ITT_NOTIFY
346       // Barrier imbalance - write min of the thread time and a child time to
347       // the thread.
348       if (__kmp_forkjoin_frames_mode == 2) {
349         this_thr->th.th_bar_min_time = KMP_MIN(this_thr->th.th_bar_min_time,
350                                                child_thr->th.th_bar_min_time);
351       }
352 #endif
          // Combine the child's partial result with ours: the callback receives
          // this thread's reduce_data first and the child's second.
353       if (reduce) {
354         KA_TRACE(100,
355                  ("__kmp_tree_barrier_gather: T#%d(%d:%d) += T#%d(%d:%u)\n",
356                   gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
357                   team->t.t_id, child_tid));
358         ANNOTATE_REDUCE_AFTER(reduce);
359         OMPT_REDUCTION_DECL(this_thr, gtid);
360         OMPT_REDUCTION_BEGIN;
361         (*reduce)(this_thr->th.th_local.reduce_data,
362                   child_thr->th.th_local.reduce_data);
363         OMPT_REDUCTION_END;
364         ANNOTATE_REDUCE_BEFORE(reduce);
365         ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
366       }
367       child++;
368       child_tid++;
369     } while (child <= branch_factor && child_tid < nproc);
370   }
371
372   if (!KMP_MASTER_TID(tid)) { // Worker threads
373     kmp_int32 parent_tid = (tid - 1) >> branch_bits;
374
375     KA_TRACE(20,
376              ("__kmp_tree_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) "
377               "arrived(%p): %llu => %llu\n",
378               gtid, team->t.t_id, tid, __kmp_gtid_from_tid(parent_tid, team),
379               team->t.t_id, parent_tid, &thr_bar->b_arrived, thr_bar->b_arrived,
380               thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
381
382     // Mark arrival to parent thread
383     /* After performing this write, a worker thread may not assume that the team
384        is valid any more - it could be deallocated by the master thread at any
385        time.  */
386     ANNOTATE_BARRIER_BEGIN(this_thr);
387     kmp_flag_64 flag(&thr_bar->b_arrived, other_threads[parent_tid]);
388     flag.release();
389   } else {
390     // Need to update the team arrived pointer if we are the master thread
        // new_state was assigned above only if child_tid < nproc. NOTE(review):
        // this relies on the master having tid 0 (so child_tid == 1 and the
        // two conditions coincide) - confirm against KMP_MASTER_TID.
391     if (nproc > 1) // New value was already computed above
392       team->t.t_bar[bt].b_arrived = new_state;
393     else
394       team->t.t_bar[bt].b_arrived += KMP_BARRIER_STATE_BUMP;
395     KA_TRACE(20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) set team %d "
396                   "arrived(%p) = %llu\n",
397                   gtid, team->t.t_id, tid, team->t.t_id,
398                   &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
399   }
400   KA_TRACE(20,
401            ("__kmp_tree_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
402             gtid, team->t.t_id, tid, bt));
403 }
404 
// Tree barrier, release phase.
//
// A non-master thread first waits on its own b_go for KMP_BARRIER_STATE_BUMP,
// returns early if this is the fork/join barrier and __kmp_global.g.g_done is
// set, then re-reads its team and tid and resets b_go to
// KMP_INIT_BARRIER_STATE. The master skips the wait. Every thread that gets
// past this point then releases its children: up to
// branch_factor = 1 << __kmp_barrier_release_branch_bits[bt] threads starting
// at id (tid << branch_bits) + 1 and below nproc. When KMP_BARRIER_ICV_PUSH is
// enabled and propagate_icvs is non-zero, each child's implicit task is
// re-initialized and given a copy of implicit task 0's ICVs right before that
// child is released.
//
// Parameters:
//   bt             - barrier type; indexes th_bar[] and the branch-bits table.
//   this_thr       - descriptor of the calling thread.
//   gtid           - global thread id; used to look the thread up in
//                    __kmp_threads and for tracing/ITT.
//   tid            - thread id within the team (recomputed from gtid on the
//                    non-master path after the wait).
//   propagate_icvs - non-zero to push ICVs to the children (see above).
//   itt_sync_obj   - ITT sync object; may be replaced locally after the wait.
405 static void __kmp_tree_barrier_release(
406     enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
407     int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
408   KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_tree_release);
409   kmp_team_t *team;
410   kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
411   kmp_uint32 nproc;
412   kmp_uint32 branch_bits = __kmp_barrier_release_branch_bits[bt];
413   kmp_uint32 branch_factor = 1 << branch_bits;
414   kmp_uint32 child;
415   kmp_uint32 child_tid;
416
417   // Perform a tree release for all of the threads that have been gathered
418   if (!KMP_MASTER_TID(
419           tid)) { // Handle fork barrier workers who aren't part of a team yet
420     KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d wait go(%p) == %u\n", gtid,
421                   &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
422     // Wait for parent thread to release us
423     kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
424     flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
425     ANNOTATE_BARRIER_END(this_thr);
        // The `else` that closes the ITT block below is intentionally left
        // dangling across the #endif so that it binds to the following `if`:
        // the g_done early-exit check runs whether or not the ITT code is
        // compiled in (the ITT branch has its own copy).
426 #if USE_ITT_BUILD && USE_ITT_NOTIFY
427     if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
428       // In fork barrier where we could not get the object reliably (or
429       // ITTNOTIFY is disabled)
430       itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
431       // Cancel wait on previous parallel region...
432       __kmp_itt_task_starting(itt_sync_obj);
433
434       if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
435         return;
436
437       itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
438       if (itt_sync_obj != NULL)
439         // Call prepare as early as possible for "new" barrier
440         __kmp_itt_task_finished(itt_sync_obj);
441     } else
442 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
443         // Early exit for reaping threads releasing forkjoin barrier
444         if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
445       return;
446
447     // The worker thread may now assume that the team is valid.
448     team = __kmp_threads[gtid]->th.th_team;
449     KMP_DEBUG_ASSERT(team != NULL);
        // Refresh tid from gtid; the value passed in is not trusted after the
        // wait on this path.
450     tid = __kmp_tid_from_gtid(gtid);
451
        // Reset our go flag so it can be waited on again at the next barrier.
452     TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
453     KA_TRACE(20,
454              ("__kmp_tree_barrier_release: T#%d(%d:%d) set go(%p) = %u\n", gtid,
455               team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
456     KMP_MB(); // Flush all pending memory write invalidates.
457   } else {
458     team = __kmp_threads[gtid]->th.th_team;
459     KMP_DEBUG_ASSERT(team != NULL);
460     KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d(%d:%d) master enter for "
461                   "barrier type %d\n",
462                   gtid, team->t.t_id, tid, bt));
463   }
      // nproc and the first child id are computed only now, after a non-master
      // thread has been released and has refreshed tid.
464   nproc = this_thr->th.th_team_nproc;
465   child_tid = (tid << branch_bits) + 1;
466
467   if (child_tid < nproc) {
468     kmp_info_t **other_threads = team->t.t_threads;
469     child = 1;
470     // Parent threads release all their children
471     do {
472       kmp_info_t *child_thr = other_threads[child_tid];
473       kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
474 #if KMP_CACHE_MANAGE
475       // Prefetch next thread's go count
476       if (child + 1 <= branch_factor && child_tid + 1 < nproc)
477         KMP_CACHE_PREFETCH(
478             &other_threads[child_tid + 1]->th.th_bar[bt].bb.b_go);
479 #endif /* KMP_CACHE_MANAGE */
480
481 #if KMP_BARRIER_ICV_PUSH
482       {
483         KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(USER_icv_copy);
            // The ICV source is always implicit task 0, not this thread's own
            // implicit task.
484         if (propagate_icvs) {
485           __kmp_init_implicit_task(team->t.t_ident,
486                                    team->t.t_threads[child_tid], team,
487                                    child_tid, FALSE);
488           copy_icvs(&team->t.t_implicit_task_taskdata[child_tid].td_icvs,
489                     &team->t.t_implicit_task_taskdata[0].td_icvs);
490         }
491       }
492 #endif // KMP_BARRIER_ICV_PUSH
493       KA_TRACE(20,
494                ("__kmp_tree_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%u)"
495                 "go(%p): %u => %u\n",
496                 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
497                 team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
498                 child_bar->b_go + KMP_BARRIER_STATE_BUMP));
499       // Release child from barrier
500       ANNOTATE_BARRIER_BEGIN(child_thr);
501       kmp_flag_64 flag(&child_bar->b_go, child_thr);
502       flag.release();
503       child++;
504       child_tid++;
505     } while (child <= branch_factor && child_tid < nproc);
506   }
507   KA_TRACE(
508       20, ("__kmp_tree_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
509            gtid, team->t.t_id, tid, bt));
510 }
511 
512 // Hyper Barrier
// Hypercube-embedded tree barrier, gather phase.
//
// The outer loop walks the levels of the tree: `level` grows by branch_bits
// and `offset` is shifted left by branch_bits until it reaches num_threads.
// At a given level:
//   - a thread whose tid has a non-zero value in the branch_bits-wide field
//     starting at bit `level` is a child at this level: it releases its own
//     b_arrived towards its parent (tid with the low level + branch_bits bits
//     cleared) and leaves the loop;
//   - otherwise the thread is a parent: it waits for the children at
//     tid + k * (1 << level), k = 1 .. branch_factor - 1 (bounded by
//     num_threads), to reach team b_arrived + KMP_BARRIER_STATE_BUMP, calling
//     `reduce` (if non-NULL) with its own reduce_data and each child's.
// Finally the master advances the team's b_arrived.
//
// Parameters:
//   bt           - barrier type; indexes th_bar[], t_bar[] and the branch-bits
//                  table.
//   this_thr     - descriptor of the calling thread.
//   gtid         - global thread id (tracing and OMPT reduction macros).
//   tid          - thread id within the team.
//   reduce       - optional reduction callback; skipped when NULL.
//   itt_sync_obj - forwarded to the wait calls through USE_ITT_BUILD_ARG.
513 static void
514 __kmp_hyper_barrier_gather(enum barrier_type bt, kmp_info_t *this_thr, int gtid,
515                            int tid, void (*reduce)(void *, void *)
516                                         USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
517   KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hyper_gather);
518   kmp_team_t *team = this_thr->th.th_team;
519   kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
520   kmp_info_t **other_threads = team->t.t_threads;
      // KMP_BARRIER_UNUSED_STATE serves as a "not computed yet" marker; the
      // real target value is filled in the first time this thread acts as a
      // parent.
521   kmp_uint64 new_state = KMP_BARRIER_UNUSED_STATE;
522   kmp_uint32 num_threads = this_thr->th.th_team_nproc;
523   kmp_uint32 branch_bits = __kmp_barrier_gather_branch_bits[bt];
524   kmp_uint32 branch_factor = 1 << branch_bits;
525   kmp_uint32 offset;
526   kmp_uint32 level;
527
528   KA_TRACE(
529       20,
530       ("__kmp_hyper_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
531        gtid, team->t.t_id, tid, bt));
532   KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
533
534 #if USE_ITT_BUILD && USE_ITT_NOTIFY
535   // Barrier imbalance - save arrive time to the thread
536   if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
537     this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time =
538         __itt_get_timestamp();
539   }
540 #endif
541   /* Perform a hypercube-embedded tree gather to wait until all of the threads
542      have arrived, and reduce any required data as we go.  */
      // The arrival flag is created without a waiter; the waiter is set once
      // the parent is known, just before the release.
543   kmp_flag_64 p_flag(&thr_bar->b_arrived);
544   for (level = 0, offset = 1; offset < num_threads;
545        level += branch_bits, offset <<= branch_bits) {
546     kmp_uint32 child;
547     kmp_uint32 child_tid;
548
        // Non-zero field at this level: this thread is a child here.
549     if (((tid >> level) & (branch_factor - 1)) != 0) {
          // Parent id = tid with the low (level + branch_bits) bits cleared.
550       kmp_int32 parent_tid = tid & ~((1 << (level + branch_bits)) - 1);
551
552       KMP_MB(); // Synchronize parent and child threads.
553       KA_TRACE(20,
554                ("__kmp_hyper_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) "
555                 "arrived(%p): %llu => %llu\n",
556                 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(parent_tid, team),
557                 team->t.t_id, parent_tid, &thr_bar->b_arrived,
558                 thr_bar->b_arrived,
559                 thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
560       // Mark arrival to parent thread
561       /* After performing this write (in the last iteration of the enclosing for
562          loop), a worker thread may not assume that the team is valid any more
563          - it could be deallocated by the master thread at any time.  */
564       ANNOTATE_BARRIER_BEGIN(this_thr);
565       p_flag.set_waiter(other_threads[parent_tid]);
566       p_flag.release();
567       break;
568     }
569
570     // Parent threads wait for children to arrive
571     if (new_state == KMP_BARRIER_UNUSED_STATE)
572       new_state = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
573     for (child = 1, child_tid = tid + (1 << level);
574          child < branch_factor && child_tid < num_threads;
575          child++, child_tid += (1 << level)) {
576       kmp_info_t *child_thr = other_threads[child_tid];
577       kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
578 #if KMP_CACHE_MANAGE
579       kmp_uint32 next_child_tid = child_tid + (1 << level);
580       // Prefetch next thread's arrived count
581       if (child + 1 < branch_factor && next_child_tid < num_threads)
582         KMP_CACHE_PREFETCH(
583             &other_threads[next_child_tid]->th.th_bar[bt].bb.b_arrived);
584 #endif /* KMP_CACHE_MANAGE */
585       KA_TRACE(20,
586                ("__kmp_hyper_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%u) "
587                 "arrived(%p) == %llu\n",
588                 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
589                 team->t.t_id, child_tid, &child_bar->b_arrived, new_state));
590       // Wait for child to arrive
591       kmp_flag_64 c_flag(&child_bar->b_arrived, new_state);
592       c_flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
593       ANNOTATE_BARRIER_END(child_thr);
594       KMP_MB(); // Synchronize parent and child threads.
595 #if USE_ITT_BUILD && USE_ITT_NOTIFY
596       // Barrier imbalance - write min of the thread time and a child time to
597       // the thread.
598       if (__kmp_forkjoin_frames_mode == 2) {
599         this_thr->th.th_bar_min_time = KMP_MIN(this_thr->th.th_bar_min_time,
600                                                child_thr->th.th_bar_min_time);
601       }
602 #endif
          // Combine the child's partial result with ours: the callback receives
          // this thread's reduce_data first and the child's second.
603       if (reduce) {
604         KA_TRACE(100,
605                  ("__kmp_hyper_barrier_gather: T#%d(%d:%d) += T#%d(%d:%u)\n",
606                   gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
607                   team->t.t_id, child_tid));
608         ANNOTATE_REDUCE_AFTER(reduce);
609         OMPT_REDUCTION_DECL(this_thr, gtid);
610         OMPT_REDUCTION_BEGIN;
611         (*reduce)(this_thr->th.th_local.reduce_data,
612                   child_thr->th.th_local.reduce_data);
613         OMPT_REDUCTION_END;
614         ANNOTATE_REDUCE_BEFORE(reduce);
615         ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
616       }
617     }
618   }
619
620   if (KMP_MASTER_TID(tid)) {
621     // Need to update the team arrived pointer if we are the master thread
        // new_state is still the marker value when the level loop never reached
        // the parent branch (e.g. it did not run at all because
        // num_threads <= 1); bump the team counter directly in that case.
622     if (new_state == KMP_BARRIER_UNUSED_STATE)
623       team->t.t_bar[bt].b_arrived += KMP_BARRIER_STATE_BUMP;
624     else
625       team->t.t_bar[bt].b_arrived = new_state;
626     KA_TRACE(20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) set team %d "
627                   "arrived(%p) = %llu\n",
628                   gtid, team->t.t_id, tid, team->t.t_id,
629                   &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
630   }
631   KA_TRACE(
632       20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
633            gtid, team->t.t_id, tid, bt));
634 }
635 
636 // The reverse versions seem to beat the forward versions overall
637 #define KMP_REVERSE_HYPER_BAR
638 static void __kmp_hyper_barrier_release(
639     enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
640     int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
641   KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hyper_release);
642   kmp_team_t *team;
643   kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
644   kmp_info_t **other_threads;
645   kmp_uint32 num_threads;
646   kmp_uint32 branch_bits = __kmp_barrier_release_branch_bits[bt];
647   kmp_uint32 branch_factor = 1 << branch_bits;
648   kmp_uint32 child;
649   kmp_uint32 child_tid;
650   kmp_uint32 offset;
651   kmp_uint32 level;
652 
653   /* Perform a hypercube-embedded tree release for all of the threads that have
654      been gathered. If KMP_REVERSE_HYPER_BAR is defined (default) the threads
655      are released in the reverse order of the corresponding gather, otherwise
656      threads are released in the same order. */
657   if (KMP_MASTER_TID(tid)) { // master
658     team = __kmp_threads[gtid]->th.th_team;
659     KMP_DEBUG_ASSERT(team != NULL);
660     KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d(%d:%d) master enter for "
661                   "barrier type %d\n",
662                   gtid, team->t.t_id, tid, bt));
663 #if KMP_BARRIER_ICV_PUSH
664     if (propagate_icvs) { // master already has ICVs in final destination; copy
665       copy_icvs(&thr_bar->th_fixed_icvs,
666                 &team->t.t_implicit_task_taskdata[tid].td_icvs);
667     }
668 #endif
669   } else { // Handle fork barrier workers who aren't part of a team yet
670     KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d wait go(%p) == %u\n", gtid,
671                   &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
672     // Wait for parent thread to release us
673     kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
674     flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
675     ANNOTATE_BARRIER_END(this_thr);
676 #if USE_ITT_BUILD && USE_ITT_NOTIFY
677     if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
678       // In fork barrier where we could not get the object reliably
679       itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
680       // Cancel wait on previous parallel region...
681       __kmp_itt_task_starting(itt_sync_obj);
682 
683       if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
684         return;
685 
686       itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
687       if (itt_sync_obj != NULL)
688         // Call prepare as early as possible for "new" barrier
689         __kmp_itt_task_finished(itt_sync_obj);
690     } else
691 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
692         // Early exit for reaping threads releasing forkjoin barrier
693         if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
694       return;
695 
696     // The worker thread may now assume that the team is valid.
697     team = __kmp_threads[gtid]->th.th_team;
698     KMP_DEBUG_ASSERT(team != NULL);
699     tid = __kmp_tid_from_gtid(gtid);
700 
701     TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
702     KA_TRACE(20,
703              ("__kmp_hyper_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
704               gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
705     KMP_MB(); // Flush all pending memory write invalidates.
706   }
707   num_threads = this_thr->th.th_team_nproc;
708   other_threads = team->t.t_threads;
709 
710 #ifdef KMP_REVERSE_HYPER_BAR
711   // Count up to correct level for parent
712   for (level = 0, offset = 1;
713        offset < num_threads && (((tid >> level) & (branch_factor - 1)) == 0);
714        level += branch_bits, offset <<= branch_bits)
715     ;
716 
717   // Now go down from there
718   for (level -= branch_bits, offset >>= branch_bits; offset != 0;
719        level -= branch_bits, offset >>= branch_bits)
720 #else
721   // Go down the tree, level by level
722   for (level = 0, offset = 1; offset < num_threads;
723        level += branch_bits, offset <<= branch_bits)
724 #endif // KMP_REVERSE_HYPER_BAR
725   {
726 #ifdef KMP_REVERSE_HYPER_BAR
727     /* Now go in reverse order through the children, highest to lowest.
728        Initial setting of child is conservative here. */
729     child = num_threads >> ((level == 0) ? level : level - 1);
730     for (child = (child < branch_factor - 1) ? child : branch_factor - 1,
731         child_tid = tid + (child << level);
732          child >= 1; child--, child_tid -= (1 << level))
733 #else
734     if (((tid >> level) & (branch_factor - 1)) != 0)
735       // No need to go lower than this, since this is the level parent would be
736       // notified
737       break;
738     // Iterate through children on this level of the tree
739     for (child = 1, child_tid = tid + (1 << level);
740          child < branch_factor && child_tid < num_threads;
741          child++, child_tid += (1 << level))
742 #endif // KMP_REVERSE_HYPER_BAR
743     {
744       if (child_tid >= num_threads)
745         continue; // Child doesn't exist so keep going
746       else {
747         kmp_info_t *child_thr = other_threads[child_tid];
748         kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
749 #if KMP_CACHE_MANAGE
750         kmp_uint32 next_child_tid = child_tid - (1 << level);
751 // Prefetch next thread's go count
752 #ifdef KMP_REVERSE_HYPER_BAR
753         if (child - 1 >= 1 && next_child_tid < num_threads)
754 #else
755         if (child + 1 < branch_factor && next_child_tid < num_threads)
756 #endif // KMP_REVERSE_HYPER_BAR
757           KMP_CACHE_PREFETCH(
758               &other_threads[next_child_tid]->th.th_bar[bt].bb.b_go);
759 #endif /* KMP_CACHE_MANAGE */
760 
761 #if KMP_BARRIER_ICV_PUSH
762         if (propagate_icvs) // push my fixed ICVs to my child
763           copy_icvs(&child_bar->th_fixed_icvs, &thr_bar->th_fixed_icvs);
764 #endif // KMP_BARRIER_ICV_PUSH
765 
766         KA_TRACE(
767             20,
768             ("__kmp_hyper_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%u)"
769              "go(%p): %u => %u\n",
770              gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
771              team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
772              child_bar->b_go + KMP_BARRIER_STATE_BUMP));
773         // Release child from barrier
774         ANNOTATE_BARRIER_BEGIN(child_thr);
775         kmp_flag_64 flag(&child_bar->b_go, child_thr);
776         flag.release();
777       }
778     }
779   }
780 #if KMP_BARRIER_ICV_PUSH
781   if (propagate_icvs &&
782       !KMP_MASTER_TID(tid)) { // copy ICVs locally to final dest
783     __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team, tid,
784                              FALSE);
785     copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
786               &thr_bar->th_fixed_icvs);
787   }
788 #endif
789   KA_TRACE(
790       20,
791       ("__kmp_hyper_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
792        gtid, team->t.t_id, tid, bt));
793 }
794 
795 // Hierarchical Barrier
796 
797 // Initialize thread barrier data
798 /* Initializes/re-initializes the hierarchical barrier data stored on a thread.
799    Performs the minimum amount of initialization required based on how the team
800    has changed. Returns true if leaf children will require both on-core and
801    traditional wake-up mechanisms. For example, if the team size increases,
802    threads already in the team will respond to on-core wakeup on their parent
803    thread, but threads newly added to the team will only be listening on the
804    their local b_go. */
805 static bool __kmp_init_hierarchical_barrier_thread(enum barrier_type bt,
806                                                    kmp_bstate_t *thr_bar,
807                                                    kmp_uint32 nproc, int gtid,
808                                                    int tid, kmp_team_t *team) {
809   // Checks to determine if (re-)initialization is needed
810   bool uninitialized = thr_bar->team == NULL;
811   bool team_changed = team != thr_bar->team;
812   bool team_sz_changed = nproc != thr_bar->nproc;
813   bool tid_changed = tid != thr_bar->old_tid;
814   bool retval = false;
815 
816   if (uninitialized || team_sz_changed) {
817     __kmp_get_hierarchy(nproc, thr_bar);
818   }
819 
820   if (uninitialized || team_sz_changed || tid_changed) {
821     thr_bar->my_level = thr_bar->depth - 1; // default for master
822     thr_bar->parent_tid = -1; // default for master
823     if (!KMP_MASTER_TID(
824             tid)) { // if not master, find parent thread in hierarchy
825       kmp_uint32 d = 0;
826       while (d < thr_bar->depth) { // find parent based on level of thread in
827         // hierarchy, and note level
828         kmp_uint32 rem;
829         if (d == thr_bar->depth - 2) { // reached level right below the master
830           thr_bar->parent_tid = 0;
831           thr_bar->my_level = d;
832           break;
833         } else if ((rem = tid % thr_bar->skip_per_level[d + 1]) !=
834                    0) { // TODO: can we make this op faster?
835           // thread is not a subtree root at next level, so this is max
836           thr_bar->parent_tid = tid - rem;
837           thr_bar->my_level = d;
838           break;
839         }
840         ++d;
841       }
842     }
843     thr_bar->offset = 7 - (tid - thr_bar->parent_tid - 1);
844     thr_bar->old_tid = tid;
845     thr_bar->wait_flag = KMP_BARRIER_NOT_WAITING;
846     thr_bar->team = team;
847     thr_bar->parent_bar =
848         &team->t.t_threads[thr_bar->parent_tid]->th.th_bar[bt].bb;
849   }
850   if (uninitialized || team_changed || tid_changed) {
851     thr_bar->team = team;
852     thr_bar->parent_bar =
853         &team->t.t_threads[thr_bar->parent_tid]->th.th_bar[bt].bb;
854     retval = true;
855   }
856   if (uninitialized || team_sz_changed || tid_changed) {
857     thr_bar->nproc = nproc;
858     thr_bar->leaf_kids = thr_bar->base_leaf_kids;
859     if (thr_bar->my_level == 0)
860       thr_bar->leaf_kids = 0;
861     if (thr_bar->leaf_kids && (kmp_uint32)tid + thr_bar->leaf_kids + 1 > nproc)
862       thr_bar->leaf_kids = nproc - tid - 1;
863     thr_bar->leaf_state = 0;
864     for (int i = 0; i < thr_bar->leaf_kids; ++i)
865       ((char *)&(thr_bar->leaf_state))[7 - i] = 1;
866   }
867   return retval;
868 }
869 
870 static void __kmp_hierarchical_barrier_gather(
871     enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
872     void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
873   KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hier_gather);
874   kmp_team_t *team = this_thr->th.th_team;
875   kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
876   kmp_uint32 nproc = this_thr->th.th_team_nproc;
877   kmp_info_t **other_threads = team->t.t_threads;
878   kmp_uint64 new_state;
879 
880   int level = team->t.t_level;
881   if (other_threads[0]
882           ->th.th_teams_microtask) // are we inside the teams construct?
883     if (this_thr->th.th_teams_size.nteams > 1)
884       ++level; // level was not increased in teams construct for team_of_masters
885   if (level == 1)
886     thr_bar->use_oncore_barrier = 1;
887   else
888     thr_bar->use_oncore_barrier = 0; // Do not use oncore barrier when nested
889 
890   KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) enter for "
891                 "barrier type %d\n",
892                 gtid, team->t.t_id, tid, bt));
893   KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
894 
895 #if USE_ITT_BUILD && USE_ITT_NOTIFY
896   // Barrier imbalance - save arrive time to the thread
897   if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
898     this_thr->th.th_bar_arrive_time = __itt_get_timestamp();
899   }
900 #endif
901 
902   (void)__kmp_init_hierarchical_barrier_thread(bt, thr_bar, nproc, gtid, tid,
903                                                team);
904 
905   if (thr_bar->my_level) { // not a leaf (my_level==0 means leaf)
906     kmp_int32 child_tid;
907     new_state =
908         (kmp_uint64)team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
909     if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME &&
910         thr_bar->use_oncore_barrier) {
911       if (thr_bar->leaf_kids) {
912         // First, wait for leaf children to check-in on my b_arrived flag
913         kmp_uint64 leaf_state =
914             KMP_MASTER_TID(tid)
915                 ? thr_bar->b_arrived | thr_bar->leaf_state
916                 : team->t.t_bar[bt].b_arrived | thr_bar->leaf_state;
917         KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) waiting "
918                       "for leaf kids\n",
919                       gtid, team->t.t_id, tid));
920         kmp_flag_64 flag(&thr_bar->b_arrived, leaf_state);
921         flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
922         if (reduce) {
923           ANNOTATE_REDUCE_AFTER(reduce);
924           OMPT_REDUCTION_DECL(this_thr, gtid);
925           OMPT_REDUCTION_BEGIN;
926           for (child_tid = tid + 1; child_tid <= tid + thr_bar->leaf_kids;
927                ++child_tid) {
928             KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += "
929                            "T#%d(%d:%d)\n",
930                            gtid, team->t.t_id, tid,
931                            __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
932                            child_tid));
933             ANNOTATE_BARRIER_END(other_threads[child_tid]);
934             (*reduce)(this_thr->th.th_local.reduce_data,
935                       other_threads[child_tid]->th.th_local.reduce_data);
936           }
937           OMPT_REDUCTION_END;
938           ANNOTATE_REDUCE_BEFORE(reduce);
939           ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
940         }
941         // clear leaf_state bits
942         KMP_TEST_THEN_AND64(&thr_bar->b_arrived, ~(thr_bar->leaf_state));
943       }
944       // Next, wait for higher level children on each child's b_arrived flag
945       for (kmp_uint32 d = 1; d < thr_bar->my_level;
946            ++d) { // gather lowest level threads first, but skip 0
947         kmp_uint32 last = tid + thr_bar->skip_per_level[d + 1],
948                    skip = thr_bar->skip_per_level[d];
949         if (last > nproc)
950           last = nproc;
951         for (child_tid = tid + skip; child_tid < (int)last; child_tid += skip) {
952           kmp_info_t *child_thr = other_threads[child_tid];
953           kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
954           KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) wait "
955                         "T#%d(%d:%d) "
956                         "arrived(%p) == %llu\n",
957                         gtid, team->t.t_id, tid,
958                         __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
959                         child_tid, &child_bar->b_arrived, new_state));
960           kmp_flag_64 flag(&child_bar->b_arrived, new_state);
961           flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
962           ANNOTATE_BARRIER_END(child_thr);
963           if (reduce) {
964             KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += "
965                            "T#%d(%d:%d)\n",
966                            gtid, team->t.t_id, tid,
967                            __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
968                            child_tid));
969             ANNOTATE_REDUCE_AFTER(reduce);
970             (*reduce)(this_thr->th.th_local.reduce_data,
971                       child_thr->th.th_local.reduce_data);
972             ANNOTATE_REDUCE_BEFORE(reduce);
973             ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
974           }
975         }
976       }
977     } else { // Blocktime is not infinite
978       for (kmp_uint32 d = 0; d < thr_bar->my_level;
979            ++d) { // Gather lowest level threads first
980         kmp_uint32 last = tid + thr_bar->skip_per_level[d + 1],
981                    skip = thr_bar->skip_per_level[d];
982         if (last > nproc)
983           last = nproc;
984         for (child_tid = tid + skip; child_tid < (int)last; child_tid += skip) {
985           kmp_info_t *child_thr = other_threads[child_tid];
986           kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
987           KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) wait "
988                         "T#%d(%d:%d) "
989                         "arrived(%p) == %llu\n",
990                         gtid, team->t.t_id, tid,
991                         __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
992                         child_tid, &child_bar->b_arrived, new_state));
993           kmp_flag_64 flag(&child_bar->b_arrived, new_state);
994           flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
995           ANNOTATE_BARRIER_END(child_thr);
996           if (reduce) {
997             KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += "
998                            "T#%d(%d:%d)\n",
999                            gtid, team->t.t_id, tid,
1000                            __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
1001                            child_tid));
1002             ANNOTATE_REDUCE_AFTER(reduce);
1003             (*reduce)(this_thr->th.th_local.reduce_data,
1004                       child_thr->th.th_local.reduce_data);
1005             ANNOTATE_REDUCE_BEFORE(reduce);
1006             ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
1007           }
1008         }
1009       }
1010     }
1011   }
1012   // All subordinates are gathered; now release parent if not master thread
1013 
1014   if (!KMP_MASTER_TID(tid)) { // worker threads release parent in hierarchy
1015     KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) releasing"
1016                   " T#%d(%d:%d) arrived(%p): %llu => %llu\n",
1017                   gtid, team->t.t_id, tid,
1018                   __kmp_gtid_from_tid(thr_bar->parent_tid, team), team->t.t_id,
1019                   thr_bar->parent_tid, &thr_bar->b_arrived, thr_bar->b_arrived,
1020                   thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
1021     /* Mark arrival to parent: After performing this write, a worker thread may
1022        not assume that the team is valid any more - it could be deallocated by
1023        the master thread at any time. */
1024     if (thr_bar->my_level || __kmp_dflt_blocktime != KMP_MAX_BLOCKTIME ||
1025         !thr_bar->use_oncore_barrier) { // Parent is waiting on my b_arrived
1026       // flag; release it
1027       ANNOTATE_BARRIER_BEGIN(this_thr);
1028       kmp_flag_64 flag(&thr_bar->b_arrived, other_threads[thr_bar->parent_tid]);
1029       flag.release();
1030     } else {
1031       // Leaf does special release on "offset" bits of parent's b_arrived flag
1032       thr_bar->b_arrived = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
1033       kmp_flag_oncore flag(&thr_bar->parent_bar->b_arrived, thr_bar->offset);
1034       flag.set_waiter(other_threads[thr_bar->parent_tid]);
1035       flag.release();
1036     }
1037   } else { // Master thread needs to update the team's b_arrived value
1038     team->t.t_bar[bt].b_arrived = new_state;
1039     KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) set team %d "
1040                   "arrived(%p) = %llu\n",
1041                   gtid, team->t.t_id, tid, team->t.t_id,
1042                   &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
1043   }
1044   // Is the team access below unsafe or just technically invalid?
1045   KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) exit for "
1046                 "barrier type %d\n",
1047                 gtid, team->t.t_id, tid, bt));
1048 }
1049 
1050 static void __kmp_hierarchical_barrier_release(
1051     enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
1052     int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
1053   KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hier_release);
1054   kmp_team_t *team;
1055   kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
1056   kmp_uint32 nproc;
1057   bool team_change = false; // indicates on-core barrier shouldn't be used
1058 
1059   if (KMP_MASTER_TID(tid)) {
1060     team = __kmp_threads[gtid]->th.th_team;
1061     KMP_DEBUG_ASSERT(team != NULL);
1062     KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) master "
1063                   "entered barrier type %d\n",
1064                   gtid, team->t.t_id, tid, bt));
1065   } else { // Worker threads
1066     // Wait for parent thread to release me
1067     if (!thr_bar->use_oncore_barrier ||
1068         __kmp_dflt_blocktime != KMP_MAX_BLOCKTIME || thr_bar->my_level != 0 ||
1069         thr_bar->team == NULL) {
1070       // Use traditional method of waiting on my own b_go flag
1071       thr_bar->wait_flag = KMP_BARRIER_OWN_FLAG;
1072       kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
1073       flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
1074       ANNOTATE_BARRIER_END(this_thr);
1075       TCW_8(thr_bar->b_go,
1076             KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
1077     } else { // Thread barrier data is initialized, this is a leaf, blocktime is
1078       // infinite, not nested
1079       // Wait on my "offset" bits on parent's b_go flag
1080       thr_bar->wait_flag = KMP_BARRIER_PARENT_FLAG;
1081       kmp_flag_oncore flag(&thr_bar->parent_bar->b_go, KMP_BARRIER_STATE_BUMP,
1082                            thr_bar->offset, bt,
1083                            this_thr USE_ITT_BUILD_ARG(itt_sync_obj));
1084       flag.wait(this_thr, TRUE);
1085       if (thr_bar->wait_flag ==
1086           KMP_BARRIER_SWITCHING) { // Thread was switched to own b_go
1087         TCW_8(thr_bar->b_go,
1088               KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
1089       } else { // Reset my bits on parent's b_go flag
1090         (RCAST(volatile char *,
1091                &(thr_bar->parent_bar->b_go)))[thr_bar->offset] = 0;
1092       }
1093     }
1094     thr_bar->wait_flag = KMP_BARRIER_NOT_WAITING;
1095     // Early exit for reaping threads releasing forkjoin barrier
1096     if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
1097       return;
1098     // The worker thread may now assume that the team is valid.
1099     team = __kmp_threads[gtid]->th.th_team;
1100     KMP_DEBUG_ASSERT(team != NULL);
1101     tid = __kmp_tid_from_gtid(gtid);
1102 
1103     KA_TRACE(
1104         20,
1105         ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
1106          gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
1107     KMP_MB(); // Flush all pending memory write invalidates.
1108   }
1109 
1110   nproc = this_thr->th.th_team_nproc;
1111   int level = team->t.t_level;
1112   if (team->t.t_threads[0]
1113           ->th.th_teams_microtask) { // are we inside the teams construct?
1114     if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
1115         this_thr->th.th_teams_level == level)
1116       ++level; // level was not increased in teams construct for team_of_workers
1117     if (this_thr->th.th_teams_size.nteams > 1)
1118       ++level; // level was not increased in teams construct for team_of_masters
1119   }
1120   if (level == 1)
1121     thr_bar->use_oncore_barrier = 1;
1122   else
1123     thr_bar->use_oncore_barrier = 0; // Do not use oncore barrier when nested
1124 
1125   // If the team size has increased, we still communicate with old leaves via
1126   // oncore barrier.
1127   unsigned short int old_leaf_kids = thr_bar->leaf_kids;
1128   kmp_uint64 old_leaf_state = thr_bar->leaf_state;
1129   team_change = __kmp_init_hierarchical_barrier_thread(bt, thr_bar, nproc, gtid,
1130                                                        tid, team);
1131   // But if the entire team changes, we won't use oncore barrier at all
1132   if (team_change)
1133     old_leaf_kids = 0;
1134 
1135 #if KMP_BARRIER_ICV_PUSH
1136   if (propagate_icvs) {
1137     __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team, tid,
1138                              FALSE);
1139     if (KMP_MASTER_TID(
1140             tid)) { // master already has copy in final destination; copy
1141       copy_icvs(&thr_bar->th_fixed_icvs,
1142                 &team->t.t_implicit_task_taskdata[tid].td_icvs);
1143     } else if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME &&
1144                thr_bar->use_oncore_barrier) { // optimization for inf blocktime
1145       if (!thr_bar->my_level) // I'm a leaf in the hierarchy (my_level==0)
1146         // leaves (on-core children) pull parent's fixed ICVs directly to local
1147         // ICV store
1148         copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
1149                   &thr_bar->parent_bar->th_fixed_icvs);
1150       // non-leaves will get ICVs piggybacked with b_go via NGO store
1151     } else { // blocktime is not infinite; pull ICVs from parent's fixed ICVs
1152       if (thr_bar->my_level) // not a leaf; copy ICVs to my fixed ICVs child can
1153         // access
1154         copy_icvs(&thr_bar->th_fixed_icvs, &thr_bar->parent_bar->th_fixed_icvs);
1155       else // leaves copy parent's fixed ICVs directly to local ICV store
1156         copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
1157                   &thr_bar->parent_bar->th_fixed_icvs);
1158     }
1159   }
1160 #endif // KMP_BARRIER_ICV_PUSH
1161 
1162   // Now, release my children
1163   if (thr_bar->my_level) { // not a leaf
1164     kmp_int32 child_tid;
1165     kmp_uint32 last;
1166     if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME &&
1167         thr_bar->use_oncore_barrier) {
1168       if (KMP_MASTER_TID(tid)) { // do a flat release
1169         // Set local b_go to bump children via NGO store of the cache line
1170         // containing IVCs and b_go.
1171         thr_bar->b_go = KMP_BARRIER_STATE_BUMP;
1172         // Use ngo stores if available; b_go piggybacks in the last 8 bytes of
1173         // the cache line
1174         ngo_load(&thr_bar->th_fixed_icvs);
1175         // This loops over all the threads skipping only the leaf nodes in the
1176         // hierarchy
1177         for (child_tid = thr_bar->skip_per_level[1]; child_tid < (int)nproc;
1178              child_tid += thr_bar->skip_per_level[1]) {
1179           kmp_bstate_t *child_bar =
1180               &team->t.t_threads[child_tid]->th.th_bar[bt].bb;
1181           KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) "
1182                         "releasing T#%d(%d:%d)"
1183                         " go(%p): %u => %u\n",
1184                         gtid, team->t.t_id, tid,
1185                         __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
1186                         child_tid, &child_bar->b_go, child_bar->b_go,
1187                         child_bar->b_go + KMP_BARRIER_STATE_BUMP));
1188           // Use ngo store (if available) to both store ICVs and release child
1189           // via child's b_go
1190           ngo_store_go(&child_bar->th_fixed_icvs, &thr_bar->th_fixed_icvs);
1191         }
1192         ngo_sync();
1193       }
1194       TCW_8(thr_bar->b_go,
1195             KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
1196       // Now, release leaf children
1197       if (thr_bar->leaf_kids) { // if there are any
1198         // We test team_change on the off-chance that the level 1 team changed.
1199         if (team_change ||
1200             old_leaf_kids < thr_bar->leaf_kids) { // some old, some new
1201           if (old_leaf_kids) { // release old leaf kids
1202             thr_bar->b_go |= old_leaf_state;
1203           }
1204           // Release new leaf kids
1205           last = tid + thr_bar->skip_per_level[1];
1206           if (last > nproc)
1207             last = nproc;
1208           for (child_tid = tid + 1 + old_leaf_kids; child_tid < (int)last;
1209                ++child_tid) { // skip_per_level[0]=1
1210             kmp_info_t *child_thr = team->t.t_threads[child_tid];
1211             kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
1212             KA_TRACE(
1213                 20,
1214                 ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) releasing"
1215                  " T#%d(%d:%d) go(%p): %u => %u\n",
1216                  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
1217                  team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
1218                  child_bar->b_go + KMP_BARRIER_STATE_BUMP));
1219             // Release child using child's b_go flag
1220             ANNOTATE_BARRIER_BEGIN(child_thr);
1221             kmp_flag_64 flag(&child_bar->b_go, child_thr);
1222             flag.release();
1223           }
1224         } else { // Release all children at once with leaf_state bits on my own
1225           // b_go flag
1226           thr_bar->b_go |= thr_bar->leaf_state;
1227         }
1228       }
1229     } else { // Blocktime is not infinite; do a simple hierarchical release
1230       for (int d = thr_bar->my_level - 1; d >= 0;
1231            --d) { // Release highest level threads first
1232         last = tid + thr_bar->skip_per_level[d + 1];
1233         kmp_uint32 skip = thr_bar->skip_per_level[d];
1234         if (last > nproc)
1235           last = nproc;
1236         for (child_tid = tid + skip; child_tid < (int)last; child_tid += skip) {
1237           kmp_info_t *child_thr = team->t.t_threads[child_tid];
1238           kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
1239           KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) "
1240                         "releasing T#%d(%d:%d) go(%p): %u => %u\n",
1241                         gtid, team->t.t_id, tid,
1242                         __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
1243                         child_tid, &child_bar->b_go, child_bar->b_go,
1244                         child_bar->b_go + KMP_BARRIER_STATE_BUMP));
1245           // Release child using child's b_go flag
1246           ANNOTATE_BARRIER_BEGIN(child_thr);
1247           kmp_flag_64 flag(&child_bar->b_go, child_thr);
1248           flag.release();
1249         }
1250       }
1251     }
1252 #if KMP_BARRIER_ICV_PUSH
1253     if (propagate_icvs && !KMP_MASTER_TID(tid))
1254       // non-leaves copy ICVs from fixed ICVs to local dest
1255       copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
1256                 &thr_bar->th_fixed_icvs);
1257 #endif // KMP_BARRIER_ICV_PUSH
1258   }
1259   KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) exit for "
1260                 "barrier type %d\n",
1261                 gtid, team->t.t_id, tid, bt));
1262 }
1263 
1264 // End of Barrier Algorithms
1265 
// Holder for the "cancelled" result of a barrier, selected by the template
// argument:
//   is_cancellable<true>  wraps an ordinary bool that can be assigned and read.
//   is_cancellable<false> stores nothing; assignments are discarded and the
//                         object always converts to the constant false.
template <bool cancellable> struct is_cancellable {};

// Non-cancellable flavour: the value is a compile time constant.
template <> struct is_cancellable<false> {
  // The assigned value is intentionally dropped.
  is_cancellable &operator=(bool) { return *this; }
  constexpr operator bool() const { return false; }
};

// Cancellable flavour: a plain run-time boolean, initially false.
template <> struct is_cancellable<true> {
  bool value;

  is_cancellable() { value = false; }
  is_cancellable(bool initial) { value = initial; }

  is_cancellable &operator=(bool updated) {
    this->value = updated;
    return *this;
  }

  operator bool() const { return this->value; }
};
1284 
1285 // Internal function to do a barrier.
1286 /* If is_split is true, do a split barrier, otherwise, do a plain barrier
1287    If reduce is non-NULL, do a split reduction barrier, otherwise, do a split
1288    barrier
1289    When cancellable = false,
1290      Returns 0 if master thread, 1 if worker thread.
1291    When cancellable = true
1292      Returns 0 if not cancelled, 1 if cancelled.  */
1293 template <bool cancellable = false>
1294 static int __kmp_barrier_template(enum barrier_type bt, int gtid, int is_split,
1295                                   size_t reduce_size, void *reduce_data,
1296                                   void (*reduce)(void *, void *)) {
1297   KMP_TIME_PARTITIONED_BLOCK(OMP_plain_barrier);
1298   KMP_SET_THREAD_STATE_BLOCK(PLAIN_BARRIER);
1299   int tid = __kmp_tid_from_gtid(gtid);
1300   kmp_info_t *this_thr = __kmp_threads[gtid];
1301   kmp_team_t *team = this_thr->th.th_team;
1302   int status = 0;
1303   is_cancellable<cancellable> cancelled;
1304 #if OMPT_SUPPORT && OMPT_OPTIONAL
1305   ompt_data_t *my_task_data;
1306   ompt_data_t *my_parallel_data;
1307   void *return_address;
1308   ompt_sync_region_t barrier_kind;
1309 #endif
1310 
1311   KA_TRACE(15, ("__kmp_barrier: T#%d(%d:%d) has arrived\n", gtid,
1312                 __kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid)));
1313 
1314   ANNOTATE_BARRIER_BEGIN(&team->t.t_bar);
1315 #if OMPT_SUPPORT
1316   if (ompt_enabled.enabled) {
1317 #if OMPT_OPTIONAL
1318     my_task_data = OMPT_CUR_TASK_DATA(this_thr);
1319     my_parallel_data = OMPT_CUR_TEAM_DATA(this_thr);
1320     return_address = OMPT_LOAD_RETURN_ADDRESS(gtid);
1321     barrier_kind = __ompt_get_barrier_kind(bt, this_thr);
1322     if (ompt_enabled.ompt_callback_sync_region) {
1323       ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
1324           barrier_kind, ompt_scope_begin, my_parallel_data, my_task_data,
1325           return_address);
1326     }
1327     if (ompt_enabled.ompt_callback_sync_region_wait) {
1328       ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
1329           barrier_kind, ompt_scope_begin, my_parallel_data, my_task_data,
1330           return_address);
1331     }
1332 #endif
1333     // It is OK to report the barrier state after the barrier begin callback.
1334     // According to the OMPT specification, a compliant implementation may
1335     // even delay reporting this state until the barrier begins to wait.
1336     this_thr->th.ompt_thread_info.state = ompt_state_wait_barrier;
1337   }
1338 #endif
1339 
1340   if (!team->t.t_serialized) {
1341 #if USE_ITT_BUILD
1342     // This value will be used in itt notify events below.
1343     void *itt_sync_obj = NULL;
1344 #if USE_ITT_NOTIFY
1345     if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1346       itt_sync_obj = __kmp_itt_barrier_object(gtid, bt, 1);
1347 #endif
1348 #endif /* USE_ITT_BUILD */
1349     if (__kmp_tasking_mode == tskm_extra_barrier) {
1350       __kmp_tasking_barrier(team, this_thr, gtid);
1351       KA_TRACE(15,
1352                ("__kmp_barrier: T#%d(%d:%d) past tasking barrier\n", gtid,
1353                 __kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid)));
1354     }
1355 
1356     /* Copy the blocktime info to the thread, where __kmp_wait_template() can
1357        access it when the team struct is not guaranteed to exist. */
1358     // See note about the corresponding code in __kmp_join_barrier() being
1359     // performance-critical.
1360     if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
1361 #if KMP_USE_MONITOR
1362       this_thr->th.th_team_bt_intervals =
1363           team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
1364       this_thr->th.th_team_bt_set =
1365           team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
1366 #else
1367       this_thr->th.th_team_bt_intervals = KMP_BLOCKTIME_INTERVAL(team, tid);
1368 #endif
1369     }
1370 
1371 #if USE_ITT_BUILD
1372     if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1373       __kmp_itt_barrier_starting(gtid, itt_sync_obj);
1374 #endif /* USE_ITT_BUILD */
1375 #if USE_DEBUGGER
1376     // Let the debugger know: the thread arrived to the barrier and waiting.
1377     if (KMP_MASTER_TID(tid)) { // Master counter is stored in team structure.
1378       team->t.t_bar[bt].b_master_arrived += 1;
1379     } else {
1380       this_thr->th.th_bar[bt].bb.b_worker_arrived += 1;
1381     } // if
1382 #endif /* USE_DEBUGGER */
1383     if (reduce != NULL) {
1384       // KMP_DEBUG_ASSERT( is_split == TRUE );  // #C69956
1385       this_thr->th.th_local.reduce_data = reduce_data;
1386     }
1387 
1388     if (KMP_MASTER_TID(tid) && __kmp_tasking_mode != tskm_immediate_exec)
1389       // use 0 to only setup the current team if nthreads > 1
1390       __kmp_task_team_setup(this_thr, team, 0);
1391 
1392     if (cancellable) {
1393       cancelled = __kmp_linear_barrier_gather_cancellable(
1394           bt, this_thr, gtid, tid, reduce USE_ITT_BUILD_ARG(itt_sync_obj));
1395     } else {
1396       switch (__kmp_barrier_gather_pattern[bt]) {
1397       case bp_hyper_bar: {
1398         // don't set branch bits to 0; use linear
1399         KMP_ASSERT(__kmp_barrier_gather_branch_bits[bt]);
1400         __kmp_hyper_barrier_gather(bt, this_thr, gtid, tid,
1401                                    reduce USE_ITT_BUILD_ARG(itt_sync_obj));
1402         break;
1403       }
1404       case bp_hierarchical_bar: {
1405         __kmp_hierarchical_barrier_gather(
1406             bt, this_thr, gtid, tid, reduce USE_ITT_BUILD_ARG(itt_sync_obj));
1407         break;
1408       }
1409       case bp_tree_bar: {
1410         // don't set branch bits to 0; use linear
1411         KMP_ASSERT(__kmp_barrier_gather_branch_bits[bt]);
1412         __kmp_tree_barrier_gather(bt, this_thr, gtid, tid,
1413                                   reduce USE_ITT_BUILD_ARG(itt_sync_obj));
1414         break;
1415       }
1416       default: {
1417         __kmp_linear_barrier_gather(bt, this_thr, gtid, tid,
1418                                     reduce USE_ITT_BUILD_ARG(itt_sync_obj));
1419       }
1420       }
1421     }
1422 
1423     KMP_MB();
1424 
1425     if (KMP_MASTER_TID(tid)) {
1426       status = 0;
1427       if (__kmp_tasking_mode != tskm_immediate_exec && !cancelled) {
1428         __kmp_task_team_wait(this_thr, team USE_ITT_BUILD_ARG(itt_sync_obj));
1429       }
1430 #if USE_DEBUGGER
1431       // Let the debugger know: All threads are arrived and starting leaving the
1432       // barrier.
1433       team->t.t_bar[bt].b_team_arrived += 1;
1434 #endif
1435 
1436       if (__kmp_omp_cancellation) {
1437         kmp_int32 cancel_request = KMP_ATOMIC_LD_RLX(&team->t.t_cancel_request);
1438         // Reset cancellation flag for worksharing constructs
1439         if (cancel_request == cancel_loop ||
1440             cancel_request == cancel_sections) {
1441           KMP_ATOMIC_ST_RLX(&team->t.t_cancel_request, cancel_noreq);
1442         }
1443       }
1444 #if USE_ITT_BUILD
1445       /* TODO: In case of split reduction barrier, master thread may send
1446          acquired event early, before the final summation into the shared
1447          variable is done (final summation can be a long operation for array
1448          reductions).  */
1449       if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1450         __kmp_itt_barrier_middle(gtid, itt_sync_obj);
1451 #endif /* USE_ITT_BUILD */
1452 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1453       // Barrier - report frame end (only if active_level == 1)
1454       if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
1455           __kmp_forkjoin_frames_mode &&
1456           this_thr->th.th_teams_microtask == NULL &&
1457           team->t.t_active_level == 1) {
1458         ident_t *loc = __kmp_threads[gtid]->th.th_ident;
1459         kmp_uint64 cur_time = __itt_get_timestamp();
1460         kmp_info_t **other_threads = team->t.t_threads;
1461         int nproc = this_thr->th.th_team_nproc;
1462         int i;
1463         switch (__kmp_forkjoin_frames_mode) {
1464         case 1:
1465           __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0,
1466                                  loc, nproc);
1467           this_thr->th.th_frame_time = cur_time;
1468           break;
1469         case 2: // AC 2015-01-19: currently does not work for hierarchical (to
1470           // be fixed)
1471           __kmp_itt_frame_submit(gtid, this_thr->th.th_bar_min_time, cur_time,
1472                                  1, loc, nproc);
1473           break;
1474         case 3:
1475           if (__itt_metadata_add_ptr) {
1476             // Initialize with master's wait time
1477             kmp_uint64 delta = cur_time - this_thr->th.th_bar_arrive_time;
1478             // Set arrive time to zero to be able to check it in
1479             // __kmp_invoke_task(); the same is done inside the loop below
1480             this_thr->th.th_bar_arrive_time = 0;
1481             for (i = 1; i < nproc; ++i) {
1482               delta += (cur_time - other_threads[i]->th.th_bar_arrive_time);
1483               other_threads[i]->th.th_bar_arrive_time = 0;
1484             }
1485             __kmp_itt_metadata_imbalance(gtid, this_thr->th.th_frame_time,
1486                                          cur_time, delta,
1487                                          (kmp_uint64)(reduce != NULL));
1488           }
1489           __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0,
1490                                  loc, nproc);
1491           this_thr->th.th_frame_time = cur_time;
1492           break;
1493         }
1494       }
1495 #endif /* USE_ITT_BUILD */
1496     } else {
1497       status = 1;
1498 #if USE_ITT_BUILD
1499       if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1500         __kmp_itt_barrier_middle(gtid, itt_sync_obj);
1501 #endif /* USE_ITT_BUILD */
1502     }
1503     if ((status == 1 || !is_split) && !cancelled) {
1504       if (cancellable) {
1505         cancelled = __kmp_linear_barrier_release_cancellable(
1506             bt, this_thr, gtid, tid, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
1507       } else {
1508         switch (__kmp_barrier_release_pattern[bt]) {
1509         case bp_hyper_bar: {
1510           KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
1511           __kmp_hyper_barrier_release(bt, this_thr, gtid, tid,
1512                                       FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
1513           break;
1514         }
1515         case bp_hierarchical_bar: {
1516           __kmp_hierarchical_barrier_release(
1517               bt, this_thr, gtid, tid, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
1518           break;
1519         }
1520         case bp_tree_bar: {
1521           KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
1522           __kmp_tree_barrier_release(bt, this_thr, gtid, tid,
1523                                      FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
1524           break;
1525         }
1526         default: {
1527           __kmp_linear_barrier_release(bt, this_thr, gtid, tid,
1528                                        FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
1529         }
1530         }
1531       }
1532       if (__kmp_tasking_mode != tskm_immediate_exec && !cancelled) {
1533         __kmp_task_team_sync(this_thr, team);
1534       }
1535     }
1536 
1537 #if USE_ITT_BUILD
1538     /* GEH: TODO: Move this under if-condition above and also include in
1539        __kmp_end_split_barrier(). This will more accurately represent the actual
1540        release time of the threads for split barriers.  */
1541     if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1542       __kmp_itt_barrier_finished(gtid, itt_sync_obj);
1543 #endif /* USE_ITT_BUILD */
1544   } else { // Team is serialized.
1545     status = 0;
1546     if (__kmp_tasking_mode != tskm_immediate_exec) {
1547       if (this_thr->th.th_task_team != NULL) {
1548 #if USE_ITT_NOTIFY
1549         void *itt_sync_obj = NULL;
1550         if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
1551           itt_sync_obj = __kmp_itt_barrier_object(gtid, bt, 1);
1552           __kmp_itt_barrier_starting(gtid, itt_sync_obj);
1553         }
1554 #endif
1555 
1556         KMP_DEBUG_ASSERT(this_thr->th.th_task_team->tt.tt_found_proxy_tasks ==
1557                          TRUE);
1558         __kmp_task_team_wait(this_thr, team USE_ITT_BUILD_ARG(itt_sync_obj));
1559         __kmp_task_team_setup(this_thr, team, 0);
1560 
1561 #if USE_ITT_BUILD
1562         if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1563           __kmp_itt_barrier_finished(gtid, itt_sync_obj);
1564 #endif /* USE_ITT_BUILD */
1565       }
1566     }
1567   }
1568   KA_TRACE(15, ("__kmp_barrier: T#%d(%d:%d) is leaving with return value %d\n",
1569                 gtid, __kmp_team_from_gtid(gtid)->t.t_id,
1570                 __kmp_tid_from_gtid(gtid), status));
1571 
1572 #if OMPT_SUPPORT
1573   if (ompt_enabled.enabled) {
1574 #if OMPT_OPTIONAL
1575     if (ompt_enabled.ompt_callback_sync_region_wait) {
1576       ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
1577           barrier_kind, ompt_scope_end, my_parallel_data, my_task_data,
1578           return_address);
1579     }
1580     if (ompt_enabled.ompt_callback_sync_region) {
1581       ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
1582           barrier_kind, ompt_scope_end, my_parallel_data, my_task_data,
1583           return_address);
1584     }
1585 #endif
1586     this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
1587   }
1588 #endif
1589   ANNOTATE_BARRIER_END(&team->t.t_bar);
1590 
1591   if (cancellable)
1592     return (int)cancelled;
1593   return status;
1594 }
1595 
1596 // Returns 0 if master thread, 1 if worker thread.
1597 int __kmp_barrier(enum barrier_type bt, int gtid, int is_split,
1598                   size_t reduce_size, void *reduce_data,
1599                   void (*reduce)(void *, void *)) {
1600   return __kmp_barrier_template<>(bt, gtid, is_split, reduce_size, reduce_data,
1601                                   reduce);
1602 }
1603 
#if defined(KMP_GOMP_COMPAT)
// Plain barrier that reports cancellation to its caller; only compiled when
// KMP_GOMP_COMPAT is defined.
// Returns 1 if cancelled, 0 otherwise
int __kmp_barrier_gomp_cancel(int gtid) {
  // Cancellation switched off: run a regular plain barrier and report that
  // nothing was cancelled.
  if (!__kmp_omp_cancellation) {
    __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);
    return FALSE;
  }

  const int was_cancelled = __kmp_barrier_template<true>(
      bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);
  if (!was_cancelled)
    return was_cancelled;

  // Master does not need to revert anything; workers need to revert their
  // private b_arrived flag.
  if (!KMP_MASTER_TID(__kmp_tid_from_gtid(gtid))) {
    kmp_info_t *worker = __kmp_threads[gtid];
    worker->th.th_bar[bs_plain_barrier].bb.b_arrived -= KMP_BARRIER_STATE_BUMP;
  }
  return was_cancelled;
}
#endif
1627 
1628 void __kmp_end_split_barrier(enum barrier_type bt, int gtid) {
1629   KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_end_split_barrier);
1630   KMP_SET_THREAD_STATE_BLOCK(PLAIN_BARRIER);
1631   int tid = __kmp_tid_from_gtid(gtid);
1632   kmp_info_t *this_thr = __kmp_threads[gtid];
1633   kmp_team_t *team = this_thr->th.th_team;
1634 
1635   ANNOTATE_BARRIER_BEGIN(&team->t.t_bar);
1636   if (!team->t.t_serialized) {
1637     if (KMP_MASTER_GTID(gtid)) {
1638       switch (__kmp_barrier_release_pattern[bt]) {
1639       case bp_hyper_bar: {
1640         KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
1641         __kmp_hyper_barrier_release(bt, this_thr, gtid, tid,
1642                                     FALSE USE_ITT_BUILD_ARG(NULL));
1643         break;
1644       }
1645       case bp_hierarchical_bar: {
1646         __kmp_hierarchical_barrier_release(bt, this_thr, gtid, tid,
1647                                            FALSE USE_ITT_BUILD_ARG(NULL));
1648         break;
1649       }
1650       case bp_tree_bar: {
1651         KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
1652         __kmp_tree_barrier_release(bt, this_thr, gtid, tid,
1653                                    FALSE USE_ITT_BUILD_ARG(NULL));
1654         break;
1655       }
1656       default: {
1657         __kmp_linear_barrier_release(bt, this_thr, gtid, tid,
1658                                      FALSE USE_ITT_BUILD_ARG(NULL));
1659       }
1660       }
1661       if (__kmp_tasking_mode != tskm_immediate_exec) {
1662         __kmp_task_team_sync(this_thr, team);
1663       } // if
1664     }
1665   }
1666   ANNOTATE_BARRIER_END(&team->t.t_bar);
1667 }
1668 
1669 void __kmp_join_barrier(int gtid) {
1670   KMP_TIME_PARTITIONED_BLOCK(OMP_join_barrier);
1671   KMP_SET_THREAD_STATE_BLOCK(FORK_JOIN_BARRIER);
1672   kmp_info_t *this_thr = __kmp_threads[gtid];
1673   kmp_team_t *team;
1674   kmp_uint nproc;
1675   kmp_info_t *master_thread;
1676   int tid;
1677 #ifdef KMP_DEBUG
1678   int team_id;
1679 #endif /* KMP_DEBUG */
1680 #if USE_ITT_BUILD
1681   void *itt_sync_obj = NULL;
1682 #if USE_ITT_NOTIFY
1683   if (__itt_sync_create_ptr || KMP_ITT_DEBUG) // Don't call routine without need
1684     // Get object created at fork_barrier
1685     itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
1686 #endif
1687 #endif /* USE_ITT_BUILD */
1688   KMP_MB();
1689 
1690   // Get current info
1691   team = this_thr->th.th_team;
1692   nproc = this_thr->th.th_team_nproc;
1693   KMP_DEBUG_ASSERT((int)nproc == team->t.t_nproc);
1694   tid = __kmp_tid_from_gtid(gtid);
1695 #ifdef KMP_DEBUG
1696   team_id = team->t.t_id;
1697 #endif /* KMP_DEBUG */
1698   master_thread = this_thr->th.th_team_master;
1699 #ifdef KMP_DEBUG
1700   if (master_thread != team->t.t_threads[0]) {
1701     __kmp_print_structure();
1702   }
1703 #endif /* KMP_DEBUG */
1704   KMP_DEBUG_ASSERT(master_thread == team->t.t_threads[0]);
1705   KMP_MB();
1706 
1707   // Verify state
1708   KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
1709   KMP_DEBUG_ASSERT(TCR_PTR(this_thr->th.th_team));
1710   KMP_DEBUG_ASSERT(TCR_PTR(this_thr->th.th_root));
1711   KMP_DEBUG_ASSERT(this_thr == team->t.t_threads[tid]);
1712   KA_TRACE(10, ("__kmp_join_barrier: T#%d(%d:%d) arrived at join barrier\n",
1713                 gtid, team_id, tid));
1714 
1715   ANNOTATE_BARRIER_BEGIN(&team->t.t_bar);
1716 #if OMPT_SUPPORT
1717   if (ompt_enabled.enabled) {
1718 #if OMPT_OPTIONAL
1719     ompt_data_t *my_task_data;
1720     ompt_data_t *my_parallel_data;
1721     void *codeptr = NULL;
1722     int ds_tid = this_thr->th.th_info.ds.ds_tid;
1723     if (KMP_MASTER_TID(ds_tid) &&
1724         (ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) ||
1725          ompt_callbacks.ompt_callback(ompt_callback_sync_region)))
1726       codeptr = team->t.ompt_team_info.master_return_address;
1727     my_task_data = OMPT_CUR_TASK_DATA(this_thr);
1728     my_parallel_data = OMPT_CUR_TEAM_DATA(this_thr);
1729     if (ompt_enabled.ompt_callback_sync_region) {
1730       ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
1731           ompt_sync_region_barrier_implicit, ompt_scope_begin, my_parallel_data,
1732           my_task_data, codeptr);
1733     }
1734     if (ompt_enabled.ompt_callback_sync_region_wait) {
1735       ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
1736           ompt_sync_region_barrier_implicit, ompt_scope_begin, my_parallel_data,
1737           my_task_data, codeptr);
1738     }
1739     if (!KMP_MASTER_TID(ds_tid))
1740       this_thr->th.ompt_thread_info.task_data = *OMPT_CUR_TASK_DATA(this_thr);
1741 #endif
1742     this_thr->th.ompt_thread_info.state = ompt_state_wait_barrier_implicit;
1743   }
1744 #endif
1745 
1746   if (__kmp_tasking_mode == tskm_extra_barrier) {
1747     __kmp_tasking_barrier(team, this_thr, gtid);
1748     KA_TRACE(10, ("__kmp_join_barrier: T#%d(%d:%d) past taking barrier\n", gtid,
1749                   team_id, tid));
1750   }
1751 #ifdef KMP_DEBUG
1752   if (__kmp_tasking_mode != tskm_immediate_exec) {
1753     KA_TRACE(20, ("__kmp_join_barrier: T#%d, old team = %d, old task_team = "
1754                   "%p, th_task_team = %p\n",
1755                   __kmp_gtid_from_thread(this_thr), team_id,
1756                   team->t.t_task_team[this_thr->th.th_task_state],
1757                   this_thr->th.th_task_team));
1758     KMP_DEBUG_ASSERT(this_thr->th.th_task_team ==
1759                      team->t.t_task_team[this_thr->th.th_task_state]);
1760   }
1761 #endif /* KMP_DEBUG */
1762 
1763   /* Copy the blocktime info to the thread, where __kmp_wait_template() can
1764      access it when the team struct is not guaranteed to exist. Doing these
1765      loads causes a cache miss slows down EPCC parallel by 2x. As a workaround,
1766      we do not perform the copy if blocktime=infinite, since the values are not
1767      used by __kmp_wait_template() in that case. */
1768   if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
1769 #if KMP_USE_MONITOR
1770     this_thr->th.th_team_bt_intervals =
1771         team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
1772     this_thr->th.th_team_bt_set =
1773         team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
1774 #else
1775     this_thr->th.th_team_bt_intervals = KMP_BLOCKTIME_INTERVAL(team, tid);
1776 #endif
1777   }
1778 
1779 #if USE_ITT_BUILD
1780   if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1781     __kmp_itt_barrier_starting(gtid, itt_sync_obj);
1782 #endif /* USE_ITT_BUILD */
1783 
1784   switch (__kmp_barrier_gather_pattern[bs_forkjoin_barrier]) {
1785   case bp_hyper_bar: {
1786     KMP_ASSERT(__kmp_barrier_gather_branch_bits[bs_forkjoin_barrier]);
1787     __kmp_hyper_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
1788                                NULL USE_ITT_BUILD_ARG(itt_sync_obj));
1789     break;
1790   }
1791   case bp_hierarchical_bar: {
1792     __kmp_hierarchical_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
1793                                       NULL USE_ITT_BUILD_ARG(itt_sync_obj));
1794     break;
1795   }
1796   case bp_tree_bar: {
1797     KMP_ASSERT(__kmp_barrier_gather_branch_bits[bs_forkjoin_barrier]);
1798     __kmp_tree_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
1799                               NULL USE_ITT_BUILD_ARG(itt_sync_obj));
1800     break;
1801   }
1802   default: {
1803     __kmp_linear_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
1804                                 NULL USE_ITT_BUILD_ARG(itt_sync_obj));
1805   }
1806   }
1807 
1808   /* From this point on, the team data structure may be deallocated at any time
1809      by the master thread - it is unsafe to reference it in any of the worker
1810      threads. Any per-team data items that need to be referenced before the
1811      end of the barrier should be moved to the kmp_task_team_t structs.  */
1812   if (KMP_MASTER_TID(tid)) {
1813     if (__kmp_tasking_mode != tskm_immediate_exec) {
1814       __kmp_task_team_wait(this_thr, team USE_ITT_BUILD_ARG(itt_sync_obj));
1815     }
1816     if (__kmp_display_affinity) {
1817       KMP_CHECK_UPDATE(team->t.t_display_affinity, 0);
1818     }
1819 #if KMP_STATS_ENABLED
1820     // Have master thread flag the workers to indicate they are now waiting for
1821     // next parallel region, Also wake them up so they switch their timers to
1822     // idle.
1823     for (int i = 0; i < team->t.t_nproc; ++i) {
1824       kmp_info_t *team_thread = team->t.t_threads[i];
1825       if (team_thread == this_thr)
1826         continue;
1827       team_thread->th.th_stats->setIdleFlag();
1828       if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME &&
1829           team_thread->th.th_sleep_loc != NULL)
1830         __kmp_null_resume_wrapper(__kmp_gtid_from_thread(team_thread),
1831                                   team_thread->th.th_sleep_loc);
1832     }
1833 #endif
1834 #if USE_ITT_BUILD
1835     if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1836       __kmp_itt_barrier_middle(gtid, itt_sync_obj);
1837 #endif /* USE_ITT_BUILD */
1838 
1839 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1840     // Join barrier - report frame end
1841     if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
1842         __kmp_forkjoin_frames_mode && this_thr->th.th_teams_microtask == NULL &&
1843         team->t.t_active_level == 1) {
1844       kmp_uint64 cur_time = __itt_get_timestamp();
1845       ident_t *loc = team->t.t_ident;
1846       kmp_info_t **other_threads = team->t.t_threads;
1847       int nproc = this_thr->th.th_team_nproc;
1848       int i;
1849       switch (__kmp_forkjoin_frames_mode) {
1850       case 1:
1851         __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0,
1852                                loc, nproc);
1853         break;
1854       case 2:
1855         __kmp_itt_frame_submit(gtid, this_thr->th.th_bar_min_time, cur_time, 1,
1856                                loc, nproc);
1857         break;
1858       case 3:
1859         if (__itt_metadata_add_ptr) {
1860           // Initialize with master's wait time
1861           kmp_uint64 delta = cur_time - this_thr->th.th_bar_arrive_time;
1862           // Set arrive time to zero to be able to check it in
1863           // __kmp_invoke_task(); the same is done inside the loop below
1864           this_thr->th.th_bar_arrive_time = 0;
1865           for (i = 1; i < nproc; ++i) {
1866             delta += (cur_time - other_threads[i]->th.th_bar_arrive_time);
1867             other_threads[i]->th.th_bar_arrive_time = 0;
1868           }
1869           __kmp_itt_metadata_imbalance(gtid, this_thr->th.th_frame_time,
1870                                        cur_time, delta, 0);
1871         }
1872         __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0,
1873                                loc, nproc);
1874         this_thr->th.th_frame_time = cur_time;
1875         break;
1876       }
1877     }
1878 #endif /* USE_ITT_BUILD */
1879   }
1880 #if USE_ITT_BUILD
1881   else {
1882     if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1883       __kmp_itt_barrier_middle(gtid, itt_sync_obj);
1884   }
1885 #endif /* USE_ITT_BUILD */
1886 
1887 #if KMP_DEBUG
1888   if (KMP_MASTER_TID(tid)) {
1889     KA_TRACE(
1890         15,
1891         ("__kmp_join_barrier: T#%d(%d:%d) says all %d team threads arrived\n",
1892          gtid, team_id, tid, nproc));
1893   }
1894 #endif /* KMP_DEBUG */
1895 
1896   // TODO now, mark worker threads as done so they may be disbanded
1897   KMP_MB(); // Flush all pending memory write invalidates.
1898   KA_TRACE(10,
1899            ("__kmp_join_barrier: T#%d(%d:%d) leaving\n", gtid, team_id, tid));
1900 
1901   ANNOTATE_BARRIER_END(&team->t.t_bar);
1902 }
1903 
1904 // TODO release worker threads' fork barriers as we are ready instead of all at
1905 // once
1906 void __kmp_fork_barrier(int gtid, int tid) {
1907   KMP_TIME_PARTITIONED_BLOCK(OMP_fork_barrier);
1908   KMP_SET_THREAD_STATE_BLOCK(FORK_JOIN_BARRIER);
1909   kmp_info_t *this_thr = __kmp_threads[gtid];
1910   kmp_team_t *team = (tid == 0) ? this_thr->th.th_team : NULL;
1911 #if USE_ITT_BUILD
1912   void *itt_sync_obj = NULL;
1913 #endif /* USE_ITT_BUILD */
1914   if (team)
1915     ANNOTATE_BARRIER_END(&team->t.t_bar);
1916 
1917   KA_TRACE(10, ("__kmp_fork_barrier: T#%d(%d:%d) has arrived\n", gtid,
1918                 (team != NULL) ? team->t.t_id : -1, tid));
1919 
1920   // th_team pointer only valid for master thread here
1921   if (KMP_MASTER_TID(tid)) {
1922 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1923     if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
1924       // Create itt barrier object
1925       itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 1);
1926       __kmp_itt_barrier_middle(gtid, itt_sync_obj); // Call acquired/releasing
1927     }
1928 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
1929 
1930 #ifdef KMP_DEBUG
1931     kmp_info_t **other_threads = team->t.t_threads;
1932     int i;
1933 
1934     // Verify state
1935     KMP_MB();
1936 
1937     for (i = 1; i < team->t.t_nproc; ++i) {
1938       KA_TRACE(500,
1939                ("__kmp_fork_barrier: T#%d(%d:0) checking T#%d(%d:%d) fork go "
1940                 "== %u.\n",
1941                 gtid, team->t.t_id, other_threads[i]->th.th_info.ds.ds_gtid,
1942                 team->t.t_id, other_threads[i]->th.th_info.ds.ds_tid,
1943                 other_threads[i]->th.th_bar[bs_forkjoin_barrier].bb.b_go));
1944       KMP_DEBUG_ASSERT(
1945           (TCR_4(other_threads[i]->th.th_bar[bs_forkjoin_barrier].bb.b_go) &
1946            ~(KMP_BARRIER_SLEEP_STATE)) == KMP_INIT_BARRIER_STATE);
1947       KMP_DEBUG_ASSERT(other_threads[i]->th.th_team == team);
1948     }
1949 #endif
1950 
1951     if (__kmp_tasking_mode != tskm_immediate_exec) {
1952       // 0 indicates setup current task team if nthreads > 1
1953       __kmp_task_team_setup(this_thr, team, 0);
1954     }
1955 
1956     /* The master thread may have changed its blocktime between the join barrier
1957        and the fork barrier. Copy the blocktime info to the thread, where
1958        __kmp_wait_template() can access it when the team struct is not
1959        guaranteed to exist. */
1960     // See note about the corresponding code in __kmp_join_barrier() being
1961     // performance-critical
1962     if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
1963 #if KMP_USE_MONITOR
1964       this_thr->th.th_team_bt_intervals =
1965           team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
1966       this_thr->th.th_team_bt_set =
1967           team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
1968 #else
1969       this_thr->th.th_team_bt_intervals = KMP_BLOCKTIME_INTERVAL(team, tid);
1970 #endif
1971     }
1972   } // master
1973 
1974   switch (__kmp_barrier_release_pattern[bs_forkjoin_barrier]) {
1975   case bp_hyper_bar: {
1976     KMP_ASSERT(__kmp_barrier_release_branch_bits[bs_forkjoin_barrier]);
1977     __kmp_hyper_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
1978                                 TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
1979     break;
1980   }
1981   case bp_hierarchical_bar: {
1982     __kmp_hierarchical_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
1983                                        TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
1984     break;
1985   }
1986   case bp_tree_bar: {
1987     KMP_ASSERT(__kmp_barrier_release_branch_bits[bs_forkjoin_barrier]);
1988     __kmp_tree_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
1989                                TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
1990     break;
1991   }
1992   default: {
1993     __kmp_linear_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
1994                                  TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
1995   }
1996   }
1997 
1998 #if OMPT_SUPPORT
1999   if (ompt_enabled.enabled &&
2000       this_thr->th.ompt_thread_info.state == ompt_state_wait_barrier_implicit) {
2001     int ds_tid = this_thr->th.th_info.ds.ds_tid;
2002     ompt_data_t *task_data = (team)
2003                                  ? OMPT_CUR_TASK_DATA(this_thr)
2004                                  : &(this_thr->th.ompt_thread_info.task_data);
2005     this_thr->th.ompt_thread_info.state = ompt_state_overhead;
2006 #if OMPT_OPTIONAL
2007     void *codeptr = NULL;
2008     if (KMP_MASTER_TID(ds_tid) &&
2009         (ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) ||
2010          ompt_callbacks.ompt_callback(ompt_callback_sync_region)))
2011       codeptr = team->t.ompt_team_info.master_return_address;
2012     if (ompt_enabled.ompt_callback_sync_region_wait) {
2013       ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
2014           ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data,
2015           codeptr);
2016     }
2017     if (ompt_enabled.ompt_callback_sync_region) {
2018       ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
2019           ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data,
2020           codeptr);
2021     }
2022 #endif
2023     if (!KMP_MASTER_TID(ds_tid) && ompt_enabled.ompt_callback_implicit_task) {
2024       ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
2025           ompt_scope_end, NULL, task_data, 0, ds_tid, ompt_task_implicit); // TODO: Can this be ompt_task_initial?
2026     }
2027   }
2028 #endif
2029 
2030   // Early exit for reaping threads releasing forkjoin barrier
2031   if (TCR_4(__kmp_global.g.g_done)) {
2032     this_thr->th.th_task_team = NULL;
2033 
2034 #if USE_ITT_BUILD && USE_ITT_NOTIFY
2035     if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
2036       if (!KMP_MASTER_TID(tid)) {
2037         itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
2038         if (itt_sync_obj)
2039           __kmp_itt_barrier_finished(gtid, itt_sync_obj);
2040       }
2041     }
2042 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
2043     KA_TRACE(10, ("__kmp_fork_barrier: T#%d is leaving early\n", gtid));
2044     return;
2045   }
2046 
2047   /* We can now assume that a valid team structure has been allocated by the
2048      master and propagated to all worker threads. The current thread, however,
2049      may not be part of the team, so we can't blindly assume that the team
2050      pointer is non-null.  */
2051   team = (kmp_team_t *)TCR_PTR(this_thr->th.th_team);
2052   KMP_DEBUG_ASSERT(team != NULL);
2053   tid = __kmp_tid_from_gtid(gtid);
2054 
2055 #if KMP_BARRIER_ICV_PULL
2056   /* Master thread's copy of the ICVs was set up on the implicit taskdata in
2057      __kmp_reinitialize_team. __kmp_fork_call() assumes the master thread's
2058      implicit task has this data before this function is called. We cannot
2059      modify __kmp_fork_call() to look at the fixed ICVs in the master's thread
2060      struct, because it is not always the case that the threads arrays have
2061      been allocated when __kmp_fork_call() is executed. */
2062   {
2063     KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(USER_icv_copy);
2064     if (!KMP_MASTER_TID(tid)) { // master thread already has ICVs
2065       // Copy the initial ICVs from the master's thread struct to the implicit
2066       // task for this tid.
2067       KA_TRACE(10,
2068                ("__kmp_fork_barrier: T#%d(%d) is PULLing ICVs\n", gtid, tid));
2069       __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team,
2070                                tid, FALSE);
2071       copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
2072                 &team->t.t_threads[0]
2073                      ->th.th_bar[bs_forkjoin_barrier]
2074                      .bb.th_fixed_icvs);
2075     }
2076   }
2077 #endif // KMP_BARRIER_ICV_PULL
2078 
2079   if (__kmp_tasking_mode != tskm_immediate_exec) {
2080     __kmp_task_team_sync(this_thr, team);
2081   }
2082 
2083 #if KMP_AFFINITY_SUPPORTED
2084   kmp_proc_bind_t proc_bind = team->t.t_proc_bind;
2085   if (proc_bind == proc_bind_intel) {
2086     // Call dynamic affinity settings
2087     if (__kmp_affinity_type == affinity_balanced && team->t.t_size_changed) {
2088       __kmp_balanced_affinity(this_thr, team->t.t_nproc);
2089     }
2090   } else if (proc_bind != proc_bind_false) {
2091     if (this_thr->th.th_new_place == this_thr->th.th_current_place) {
2092       KA_TRACE(100, ("__kmp_fork_barrier: T#%d already in correct place %d\n",
2093                      __kmp_gtid_from_thread(this_thr),
2094                      this_thr->th.th_current_place));
2095     } else {
2096       __kmp_affinity_set_place(gtid);
2097     }
2098   }
2099 #endif // KMP_AFFINITY_SUPPORTED
2100   // Perform the display affinity functionality
2101   if (__kmp_display_affinity) {
2102     if (team->t.t_display_affinity
2103 #if KMP_AFFINITY_SUPPORTED
2104         || (__kmp_affinity_type == affinity_balanced && team->t.t_size_changed)
2105 #endif
2106             ) {
2107       // NULL means use the affinity-format-var ICV
2108       __kmp_aux_display_affinity(gtid, NULL);
2109       this_thr->th.th_prev_num_threads = team->t.t_nproc;
2110       this_thr->th.th_prev_level = team->t.t_level;
2111     }
2112   }
2113   if (!KMP_MASTER_TID(tid))
2114     KMP_CHECK_UPDATE(this_thr->th.th_def_allocator, team->t.t_def_allocator);
2115 
2116 #if USE_ITT_BUILD && USE_ITT_NOTIFY
2117   if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
2118     if (!KMP_MASTER_TID(tid)) {
2119       // Get correct barrier object
2120       itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
2121       __kmp_itt_barrier_finished(gtid, itt_sync_obj); // Workers call acquired
2122     } // (prepare called inside barrier_release)
2123   }
2124 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
2125   ANNOTATE_BARRIER_END(&team->t.t_bar);
2126   KA_TRACE(10, ("__kmp_fork_barrier: T#%d(%d:%d) is leaving\n", gtid,
2127                 team->t.t_id, tid));
2128 }
2129 
2130 void __kmp_setup_icv_copy(kmp_team_t *team, int new_nproc,
2131                           kmp_internal_control_t *new_icvs, ident_t *loc) {
2132   KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_setup_icv_copy);
2133 
2134   KMP_DEBUG_ASSERT(team && new_nproc && new_icvs);
2135   KMP_DEBUG_ASSERT((!TCR_4(__kmp_init_parallel)) || new_icvs->nproc);
2136 
2137 /* Master thread's copy of the ICVs was set up on the implicit taskdata in
2138    __kmp_reinitialize_team. __kmp_fork_call() assumes the master thread's
2139    implicit task has this data before this function is called. */
2140 #if KMP_BARRIER_ICV_PULL
2141   /* Copy ICVs to master's thread structure into th_fixed_icvs (which remains
2142      untouched), where all of the worker threads can access them and make their
2143      own copies after the barrier. */
2144   KMP_DEBUG_ASSERT(team->t.t_threads[0]); // The threads arrays should be
2145   // allocated at this point
2146   copy_icvs(
2147       &team->t.t_threads[0]->th.th_bar[bs_forkjoin_barrier].bb.th_fixed_icvs,
2148       new_icvs);
2149   KF_TRACE(10, ("__kmp_setup_icv_copy: PULL: T#%d this_thread=%p team=%p\n", 0,
2150                 team->t.t_threads[0], team));
2151 #elif KMP_BARRIER_ICV_PUSH
2152   // The ICVs will be propagated in the fork barrier, so nothing needs to be
2153   // done here.
2154   KF_TRACE(10, ("__kmp_setup_icv_copy: PUSH: T#%d this_thread=%p team=%p\n", 0,
2155                 team->t.t_threads[0], team));
2156 #else
2157   // Copy the ICVs to each of the non-master threads.  This takes O(nthreads)
2158   // time.
2159   ngo_load(new_icvs);
2160   KMP_DEBUG_ASSERT(team->t.t_threads[0]); // The threads arrays should be
2161   // allocated at this point
2162   for (int f = 1; f < new_nproc; ++f) { // Skip the master thread
2163     // TODO: GEH - pass in better source location info since usually NULL here
2164     KF_TRACE(10, ("__kmp_setup_icv_copy: LINEAR: T#%d this_thread=%p team=%p\n",
2165                   f, team->t.t_threads[f], team));
2166     __kmp_init_implicit_task(loc, team->t.t_threads[f], team, f, FALSE);
2167     ngo_store_icvs(&team->t.t_implicit_task_taskdata[f].td_icvs, new_icvs);
2168     KF_TRACE(10, ("__kmp_setup_icv_copy: LINEAR: T#%d this_thread=%p team=%p\n",
2169                   f, team->t.t_threads[f], team));
2170   }
2171   ngo_sync();
2172 #endif // KMP_BARRIER_ICV_PULL
2173 }
2174