1 /*
2  * kmp_wait_release.h -- Wait/Release implementation
3  */
4 
5 //===----------------------------------------------------------------------===//
6 //
7 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8 // See https://llvm.org/LICENSE.txt for license information.
9 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #ifndef KMP_WAIT_RELEASE_H
14 #define KMP_WAIT_RELEASE_H
15 
16 #include "kmp.h"
17 #include "kmp_itt.h"
18 #include "kmp_stats.h"
19 #if OMPT_SUPPORT
20 #include "ompt-specific.h"
21 #endif
22 
23 /*!
24 @defgroup WAIT_RELEASE Wait/Release operations
25 
26 The definitions and functions here implement the lowest level thread
27 synchronizations of suspending a thread and waking it. They are used to build
28 higher level operations such as barriers and fork/join.
29 */
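
/* Illustrative usage sketch only (not part of the runtime): the concrete flag
   classes appear later in this file; "other_thr", "this_thr" and
   "itt_sync_obj" are hypothetical stand-ins for values the barrier code
   normally supplies.

@code
// Releasing side (e.g. a thread waking one of its workers):
kmp_flag_64<> go_flag(&other_thr->th.th_bar[bs_forkjoin_barrier].bb.b_go,
                      other_thr);
go_flag.release(); // wraps __kmp_release_template()

// Waiting side (the worker): spin, optionally run tasks, then suspend.
kmp_flag_64<> wait_flag(&this_thr->th.th_bar[bs_forkjoin_barrier].bb.b_go,
                        (kmp_uint64)KMP_BARRIER_STATE_BUMP);
wait_flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
@endcode
*/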
30 
31 /*!
32 @ingroup WAIT_RELEASE
33 @{
34 */
35 
36 /*!
37  * The flag_type describes the storage used for the flag.
38  */
39 enum flag_type {
40   flag32, /**< 32 bit flags */
41   flag64, /**< 64 bit flags */
42   flag_oncore /**< special 64-bit flag for on-core barrier (hierarchical) */
43 };
44 
45 struct flag_properties {
46   unsigned int type : 16;
47   unsigned int reserved : 16;
48 };
49 
50 /*!
51  * Base class for wait/release volatile flag
52  */
53 template <typename P> class kmp_flag_native {
54   volatile P *loc;
55   flag_properties t;
56 
57 public:
58   typedef P flag_t;
59   kmp_flag_native(volatile P *p, flag_type ft)
60       : loc(p), t({(short unsigned int)ft, 0U}) {}
61   volatile P *get() { return loc; }
62   void *get_void_p() { return RCAST(void *, CCAST(P *, loc)); }
63   void set(volatile P *new_loc) { loc = new_loc; }
64   flag_type get_type() { return (flag_type)(t.type); }
65   P load() { return *loc; }
66   void store(P val) { *loc = val; }
67 };
68 
69 /*!
70  * Base class for wait/release atomic flag
71  */
72 template <typename P> class kmp_flag {
73   std::atomic<P>
74       *loc; /**< Pointer to the flag storage that is modified by another thread
75              */
76   flag_properties t; /**< "Type" of the flag in loc */
77 public:
78   typedef P flag_t;
79   kmp_flag(std::atomic<P> *p, flag_type ft)
80       : loc(p), t({(short unsigned int)ft, 0U}) {}
81   /*!
82    * @result the pointer to the actual flag
83    */
84   std::atomic<P> *get() { return loc; }
85   /*!
86    * @result void* pointer to the actual flag
87    */
88   void *get_void_p() { return RCAST(void *, loc); }
89   /*!
90    * @param new_loc in   set loc to point at new_loc
91    */
92   void set(std::atomic<P> *new_loc) { loc = new_loc; }
93   /*!
94    * @result the flag_type
95    */
96   flag_type get_type() { return (flag_type)(t.type); }
97   /*!
98    * @result flag value
99    */
100   P load() { return loc->load(std::memory_order_acquire); }
101   /*!
102    * @param val the new flag value to be stored
103    */
104   void store(P val) { loc->store(val, std::memory_order_release); }
105   // Derived classes must provide the following:
106   /*
107   kmp_info_t * get_waiter(kmp_uint32 i);
108   kmp_uint32 get_num_waiters();
109   bool done_check();
110   bool done_check_val(P old_loc);
111   bool notdone_check();
112   P internal_release();
113   void suspend(int th_gtid);
114   void mwait(int th_gtid);
115   void resume(int th_gtid);
116   P set_sleeping();
117   P unset_sleeping();
118   bool is_sleeping();
119   bool is_any_sleeping();
120   bool is_sleeping_val(P old_loc);
121   int execute_tasks(kmp_info_t *this_thr, kmp_int32 gtid, int final_spin,
122                     int *thread_finished
123                     USE_ITT_BUILD_ARG(void * itt_sync_obj), kmp_int32
124                     is_constrained);
125   */
126 };
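
/* For illustration only: a minimal (hypothetical) derived flag, assuming a
   plain equality check against a "checker" value. The kmp_basic_flag template
   further below is the real counterpart; "my_flag" is not part of the runtime.

@code
class my_flag : public kmp_flag<kmp_uint32> {
  kmp_uint32 checker; // value that signals "released"
public:
  my_flag(std::atomic<kmp_uint32> *p, kmp_uint32 c)
      : kmp_flag<kmp_uint32>(p, flag32), checker(c) {}
  bool done_check() { return this->load() == checker; }
  bool notdone_check() { return this->load() != checker; }
  void internal_release() { KMP_ATOMIC_ADD(this->get(), 4); }
  // ...plus the waiter bookkeeping, sleep-bit helpers, suspend/mwait/resume
  // and execute_tasks hooks listed in the comment above.
};
@endcode
*/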
127 
128 #if OMPT_SUPPORT
129 OMPT_NOINLINE
130 static void __ompt_implicit_task_end(kmp_info_t *this_thr,
131                                      ompt_state_t ompt_state,
132                                      ompt_data_t *tId) {
133   int ds_tid = this_thr->th.th_info.ds.ds_tid;
134   if (ompt_state == ompt_state_wait_barrier_implicit) {
135     this_thr->th.ompt_thread_info.state = ompt_state_overhead;
136 #if OMPT_OPTIONAL
137     void *codeptr = NULL;
138     if (ompt_enabled.ompt_callback_sync_region_wait) {
139       ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
140           ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, tId,
141           codeptr);
142     }
143     if (ompt_enabled.ompt_callback_sync_region) {
144       ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
145           ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, tId,
146           codeptr);
147     }
148 #endif
149     if (!KMP_MASTER_TID(ds_tid)) {
150       if (ompt_enabled.ompt_callback_implicit_task) {
151         int flags = this_thr->th.ompt_thread_info.parallel_flags;
152         flags = (flags & ompt_parallel_league) ? ompt_task_initial
153                                                : ompt_task_implicit;
154         ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
155             ompt_scope_end, NULL, tId, 0, ds_tid, flags);
156       }
157       // return to idle state
158       this_thr->th.ompt_thread_info.state = ompt_state_idle;
159     } else {
160       this_thr->th.ompt_thread_info.state = ompt_state_overhead;
161     }
162   }
163 }
164 #endif
165 
166 /* Spin wait loop that first does pause/yield, then sleeps. A thread that calls
167    __kmp_wait_* must make certain that another thread calls __kmp_release
168    to wake it back up; otherwise a deadlock results!
169 
170    NOTE: We may not belong to a team at this point.  */
171 template <class C, bool final_spin, bool Cancellable = false,
172           bool Sleepable = true>
173 static inline bool
174 __kmp_wait_template(kmp_info_t *this_thr,
175                     C *flag USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
176 #if USE_ITT_BUILD && USE_ITT_NOTIFY
177   volatile void *spin = flag->get();
178 #endif
179   kmp_uint32 spins;
180   int th_gtid;
181   int tasks_completed = FALSE;
182   int oversubscribed;
183 #if !KMP_USE_MONITOR
184   kmp_uint64 poll_count;
185   kmp_uint64 hibernate_goal;
186 #else
187   kmp_uint32 hibernate;
188 #endif
189 
190   KMP_FSYNC_SPIN_INIT(spin, NULL);
191   if (flag->done_check()) {
192     KMP_FSYNC_SPIN_ACQUIRED(CCAST(void *, spin));
193     return false;
194   }
195   th_gtid = this_thr->th.th_info.ds.ds_gtid;
196   if (Cancellable) {
197     kmp_team_t *team = this_thr->th.th_team;
198     if (team && team->t.t_cancel_request == cancel_parallel)
199       return true;
200   }
201 #if KMP_OS_UNIX
202   if (final_spin)
203     KMP_ATOMIC_ST_REL(&this_thr->th.th_blocking, true);
204 #endif
205   KA_TRACE(20,
206            ("__kmp_wait_sleep: T#%d waiting for flag(%p)\n", th_gtid, flag));
207 #if KMP_STATS_ENABLED
208   stats_state_e thread_state = KMP_GET_THREAD_STATE();
209 #endif
210 
211 /* OMPT Behavior:
212 THIS function is called from
213   __kmp_barrier (2 times)  (implicit or explicit barrier in parallel regions)
214             these have join / fork behavior
215 
216        In these cases, we don't change the state or trigger events in THIS
217 function.
218        Events are triggered in the calling code (__kmp_barrier):
219 
220                 state := ompt_state_overhead
221             barrier-begin
222             barrier-wait-begin
223                 state := ompt_state_wait_barrier
224           call join-barrier-implementation (finally arrive here)
225           {}
226           call fork-barrier-implementation (finally arrive here)
227           {}
228                 state := ompt_state_overhead
229             barrier-wait-end
230             barrier-end
231                 state := ompt_state_work_parallel
232 
233 
234   __kmp_fork_barrier  (after thread creation, before executing implicit task)
235           call fork-barrier-implementation (finally arrive here)
236           {} // worker arrive here with state = ompt_state_idle
237 
238 
239   __kmp_join_barrier  (implicit barrier at end of parallel region)
240                 state := ompt_state_barrier_implicit
241             barrier-begin
242             barrier-wait-begin
243           call join-barrier-implementation (finally arrive here
244 final_spin=FALSE)
245           {
246           }
247   __kmp_fork_barrier  (implicit barrier at end of parallel region)
248           call fork-barrier-implementation (finally arrive here final_spin=TRUE)
249 
250        Worker after task-team is finished:
251             barrier-wait-end
252             barrier-end
253             implicit-task-end
254             idle-begin
255                 state := ompt_state_idle
256 
257        Before leaving, if state = ompt_state_idle
258             idle-end
259                 state := ompt_state_overhead
260 */
261 #if OMPT_SUPPORT
262   ompt_state_t ompt_entry_state;
263   ompt_data_t *tId;
264   if (ompt_enabled.enabled) {
265     ompt_entry_state = this_thr->th.ompt_thread_info.state;
266     if (!final_spin || ompt_entry_state != ompt_state_wait_barrier_implicit ||
267         KMP_MASTER_TID(this_thr->th.th_info.ds.ds_tid)) {
268       ompt_lw_taskteam_t *team =
269           this_thr->th.th_team->t.ompt_serialized_team_info;
270       if (team) {
271         tId = &(team->ompt_task_info.task_data);
272       } else {
273         tId = OMPT_CUR_TASK_DATA(this_thr);
274       }
275     } else {
276       tId = &(this_thr->th.ompt_thread_info.task_data);
277     }
278     if (final_spin && (__kmp_tasking_mode == tskm_immediate_exec ||
279                        this_thr->th.th_task_team == NULL)) {
280       // implicit task is done. Either no taskqueue, or task-team finished
281       __ompt_implicit_task_end(this_thr, ompt_entry_state, tId);
282     }
283   }
284 #endif
285 
286   KMP_INIT_YIELD(spins); // Setup for waiting
287 
288   if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME ||
289       __kmp_pause_status == kmp_soft_paused) {
290 #if KMP_USE_MONITOR
291 // The worker threads cannot rely on the team struct existing at this point.
292 // Use the bt values cached in the thread struct instead.
293 #ifdef KMP_ADJUST_BLOCKTIME
294     if (__kmp_pause_status == kmp_soft_paused ||
295         (__kmp_zero_bt && !this_thr->th.th_team_bt_set))
296       // Force immediate suspend if not set by user and more threads than
297       // available procs
298       hibernate = 0;
299     else
300       hibernate = this_thr->th.th_team_bt_intervals;
301 #else
302     hibernate = this_thr->th.th_team_bt_intervals;
303 #endif /* KMP_ADJUST_BLOCKTIME */
304 
305     /* If the blocktime is nonzero, we want to make sure that we spin wait for
306        the entirety of the specified #intervals, plus up to one interval more.
307        This increment makes certain that this thread doesn't go to sleep too
308        soon.  */
309     if (hibernate != 0)
310       hibernate++;
311 
312     // Add in the current time value.
313     hibernate += TCR_4(__kmp_global.g.g_time.dt.t_value);
314     KF_TRACE(20, ("__kmp_wait_sleep: T#%d now=%d, hibernate=%d, intervals=%d\n",
315                   th_gtid, __kmp_global.g.g_time.dt.t_value, hibernate,
316                   hibernate - __kmp_global.g.g_time.dt.t_value));
317 #else
318     if (__kmp_pause_status == kmp_soft_paused) {
319       // Force immediate suspend
320       hibernate_goal = KMP_NOW();
321     } else
322       hibernate_goal = KMP_NOW() + this_thr->th.th_team_bt_intervals;
323     poll_count = 0;
324 #endif // KMP_USE_MONITOR
325   }
326 
327   oversubscribed = (TCR_4(__kmp_nth) > __kmp_avail_proc);
328   KMP_MB();
329 
330   // Main wait spin loop
331   while (flag->notdone_check()) {
332     kmp_task_team_t *task_team = NULL;
333     if (__kmp_tasking_mode != tskm_immediate_exec) {
334       task_team = this_thr->th.th_task_team;
335       /* If the thread's task team pointer is NULL, it means one of 3 things:
336          1) A newly-created thread is first being released by
337          __kmp_fork_barrier(), and its task team has not been set up yet.
338          2) All tasks have been executed to completion.
339          3) Tasking is off for this region.  This could be because we are in a
340          serialized region (perhaps the outer one), or else tasking was manually
341          disabled (KMP_TASKING=0).  */
342       if (task_team != NULL) {
343         if (TCR_SYNC_4(task_team->tt.tt_active)) {
344           if (KMP_TASKING_ENABLED(task_team))
345             flag->execute_tasks(
346                 this_thr, th_gtid, final_spin,
347                 &tasks_completed USE_ITT_BUILD_ARG(itt_sync_obj), 0);
348           else
349             this_thr->th.th_reap_state = KMP_SAFE_TO_REAP;
350         } else {
351           KMP_DEBUG_ASSERT(!KMP_MASTER_TID(this_thr->th.th_info.ds.ds_tid));
352 #if OMPT_SUPPORT
353           // task-team is done now, other cases should be caught above
354           if (final_spin && ompt_enabled.enabled)
355             __ompt_implicit_task_end(this_thr, ompt_entry_state, tId);
356 #endif
357           this_thr->th.th_task_team = NULL;
358           this_thr->th.th_reap_state = KMP_SAFE_TO_REAP;
359         }
360       } else {
361         this_thr->th.th_reap_state = KMP_SAFE_TO_REAP;
362       } // if
363     } // if
364 
365     KMP_FSYNC_SPIN_PREPARE(CCAST(void *, spin));
366     if (TCR_4(__kmp_global.g.g_done)) {
367       if (__kmp_global.g.g_abort)
368         __kmp_abort_thread();
369       break;
370     }
371 
372     // If we are oversubscribed, or have waited a bit (and
373     // KMP_LIBRARY=throughput), then yield
374     KMP_YIELD_OVERSUB_ELSE_SPIN(spins);
375 
376 #if KMP_STATS_ENABLED
377     // Check if thread has been signalled to idle state
378     // This indicates that the logical "join-barrier" has finished
379     if (this_thr->th.th_stats->isIdle() &&
380         KMP_GET_THREAD_STATE() == FORK_JOIN_BARRIER) {
381       KMP_SET_THREAD_STATE(IDLE);
382       KMP_PUSH_PARTITIONED_TIMER(OMP_idle);
383     }
384 #endif
385     // Check if the barrier surrounding this wait loop has been cancelled
386     if (Cancellable) {
387       kmp_team_t *team = this_thr->th.th_team;
388       if (team && team->t.t_cancel_request == cancel_parallel)
389         break;
390     }
391 
392     // For a hidden helper thread, a NULL task_team means the main thread has
393     // not yet released the barrier. We must not start waiting here: if this
394     // thread goes to sleep now, it can still be asleep when the main thread
395     // releases the barriers of all its children, so the follow-up setup (such
396     // as task team synchronization) is never performed for it and it ends up
397     // without a task team. Usually that is harmless, but there is a corner
398     // case: when the first task encountered is an untied task, the check in
399     // __kmp_task_alloc crashes because it dereferences the task team pointer
400     // without first checking it for nullptr (it apparently assumes the pointer
401     // is always valid).
402     if (task_team && KMP_HIDDEN_HELPER_WORKER_THREAD(th_gtid) &&
403         !TCR_4(__kmp_hidden_helper_team_done)) {
404       // If there are still hidden helper tasks to be executed, the hidden helper
405       // thread will not enter a waiting status.
406       if (KMP_ATOMIC_LD_ACQ(&__kmp_unexecuted_hidden_helper_tasks) == 0) {
407         __kmp_hidden_helper_worker_thread_wait();
408       }
409       continue;
410     }
411 
412     // Don't suspend if KMP_BLOCKTIME is set to "infinite"
413     if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME &&
414         __kmp_pause_status != kmp_soft_paused)
415       continue;
416 
417     // Don't suspend if there is a likelihood of new tasks being spawned.
418     if ((task_team != NULL) && TCR_4(task_team->tt.tt_found_tasks))
419       continue;
420 
421 #if KMP_USE_MONITOR
422     // If we have waited a bit more, fall asleep
423     if (TCR_4(__kmp_global.g.g_time.dt.t_value) < hibernate)
424       continue;
425 #else
426     if (KMP_BLOCKING(hibernate_goal, poll_count++))
427       continue;
428 #endif
429     // Don't suspend if wait loop designated non-sleepable
430     // in template parameters
431     if (!Sleepable)
432       continue;
433 
434     if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME &&
435         __kmp_pause_status != kmp_soft_paused)
436       continue;
437 
438 #if KMP_HAVE_MWAIT || KMP_HAVE_UMWAIT
439     if (__kmp_mwait_enabled || __kmp_umwait_enabled) {
440       KF_TRACE(50, ("__kmp_wait_sleep: T#%d using monitor/mwait\n", th_gtid));
441       flag->mwait(th_gtid);
442     } else {
443 #endif
444       KF_TRACE(50, ("__kmp_wait_sleep: T#%d suspend time reached\n", th_gtid));
445 #if KMP_OS_UNIX
446       if (final_spin)
447         KMP_ATOMIC_ST_REL(&this_thr->th.th_blocking, false);
448 #endif
449       flag->suspend(th_gtid);
450 #if KMP_OS_UNIX
451       if (final_spin)
452         KMP_ATOMIC_ST_REL(&this_thr->th.th_blocking, true);
453 #endif
454 #if KMP_HAVE_MWAIT || KMP_HAVE_UMWAIT
455     }
456 #endif
457 
458     if (TCR_4(__kmp_global.g.g_done)) {
459       if (__kmp_global.g.g_abort)
460         __kmp_abort_thread();
461       break;
462     } else if (__kmp_tasking_mode != tskm_immediate_exec &&
463                this_thr->th.th_reap_state == KMP_SAFE_TO_REAP) {
464       this_thr->th.th_reap_state = KMP_NOT_SAFE_TO_REAP;
465     }
466     // TODO: If thread is done with work and times out, disband/free
467   }
468 
469 #if OMPT_SUPPORT
470   ompt_state_t ompt_exit_state = this_thr->th.ompt_thread_info.state;
471   if (ompt_enabled.enabled && ompt_exit_state != ompt_state_undefined) {
472 #if OMPT_OPTIONAL
473     if (final_spin) {
474       __ompt_implicit_task_end(this_thr, ompt_exit_state, tId);
475       ompt_exit_state = this_thr->th.ompt_thread_info.state;
476     }
477 #endif
478     if (ompt_exit_state == ompt_state_idle) {
479       this_thr->th.ompt_thread_info.state = ompt_state_overhead;
480     }
481   }
482 #endif
483 #if KMP_STATS_ENABLED
484   // If we were put into idle state, pop that off the state stack
485   if (KMP_GET_THREAD_STATE() == IDLE) {
486     KMP_POP_PARTITIONED_TIMER();
487     KMP_SET_THREAD_STATE(thread_state);
488     this_thr->th.th_stats->resetIdleFlag();
489   }
490 #endif
491 
492 #if KMP_OS_UNIX
493   if (final_spin)
494     KMP_ATOMIC_ST_REL(&this_thr->th.th_blocking, false);
495 #endif
496   KMP_FSYNC_SPIN_ACQUIRED(CCAST(void *, spin));
497   if (Cancellable) {
498     kmp_team_t *team = this_thr->th.th_team;
499     if (team && team->t.t_cancel_request == cancel_parallel) {
500       if (tasks_completed) {
501         // undo the previous decrement of unfinished_threads so that the
502         // thread can decrement at the join barrier with no problem
503         kmp_task_team_t *task_team = this_thr->th.th_task_team;
504         std::atomic<kmp_int32> *unfinished_threads =
505             &(task_team->tt.tt_unfinished_threads);
506         KMP_ATOMIC_INC(unfinished_threads);
507       }
508       return true;
509     }
510   }
511   return false;
512 }
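
/* Sketch of how a caller might consume the return value (only meaningful when
   the Cancellable template parameter is true); "sync_word", "this_thr" and
   "itt_sync_obj" are hypothetical here.

@code
std::atomic<kmp_uint32> sync_word(0);          // hypothetical flag location
kmp_flag_32<true, false> flag(&sync_word, 1U); // cancellable, non-sleepable
if (flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj))) {
  // true: the enclosing parallel region requested cancellation while we were
  // waiting, so the caller unwinds instead of completing the barrier.
}
@endcode
*/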
513 
514 #if KMP_HAVE_MWAIT || KMP_HAVE_UMWAIT
515 // Set up a monitor on the flag variable, causing the calling thread to wait in
516 // a less active state until the flag variable is modified.
517 template <class C>
518 static inline void __kmp_mwait_template(int th_gtid, C *flag) {
519   KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(USER_mwait);
520   kmp_info_t *th = __kmp_threads[th_gtid];
521 
522   KF_TRACE(30, ("__kmp_mwait_template: T#%d enter for flag = %p\n", th_gtid,
523                 flag->get()));
524 
525   // User-level mwait is available
526   KMP_DEBUG_ASSERT(__kmp_mwait_enabled || __kmp_umwait_enabled);
527 
528   __kmp_suspend_initialize_thread(th);
529   __kmp_lock_suspend_mx(th);
530 
531   volatile void *spin = flag->get();
532   void *cacheline = (void *)(kmp_uintptr_t(spin) & ~(CACHE_LINE - 1));
533 
534   if (!flag->done_check()) {
535     // Mark thread as no longer active
536     th->th.th_active = FALSE;
537     if (th->th.th_active_in_pool) {
538       th->th.th_active_in_pool = FALSE;
539       KMP_ATOMIC_DEC(&__kmp_thread_pool_active_nth);
540       KMP_DEBUG_ASSERT(TCR_4(__kmp_thread_pool_active_nth) >= 0);
541     }
542     flag->set_sleeping();
543     KF_TRACE(50, ("__kmp_mwait_template: T#%d calling monitor\n", th_gtid));
544 #if KMP_HAVE_UMWAIT
545     if (__kmp_umwait_enabled) {
546       __kmp_umonitor(cacheline);
547     }
548 #elif KMP_HAVE_MWAIT
549     if (__kmp_mwait_enabled) {
550       __kmp_mm_monitor(cacheline, 0, 0);
551     }
552 #endif
553     // To avoid a race, check flag between 'monitor' and 'mwait'. A write to
554     // the address could happen after the last time we checked and before
555     // monitoring started, in which case monitor can't detect the change.
556     if (flag->done_check())
557       flag->unset_sleeping();
558     else {
559       // if flag changes here, wake-up happens immediately
560       TCW_PTR(th->th.th_sleep_loc, (void *)flag);
561       __kmp_unlock_suspend_mx(th);
562       KF_TRACE(50, ("__kmp_mwait_template: T#%d calling mwait\n", th_gtid));
563 #if KMP_HAVE_UMWAIT
564       if (__kmp_umwait_enabled) {
565         __kmp_umwait(1, 100); // TODO: enable ctrl via hints, backoff counter
566       }
567 #elif KMP_HAVE_MWAIT
568       if (__kmp_mwait_enabled) {
569         __kmp_mm_mwait(0, __kmp_mwait_hints);
570       }
571 #endif
572       KF_TRACE(50, ("__kmp_mwait_template: T#%d mwait done\n", th_gtid));
573       __kmp_lock_suspend_mx(th);
574       // Clean up sleep info; doesn't matter how/why this thread stopped waiting
575       if (flag->is_sleeping())
576         flag->unset_sleeping();
577       TCW_PTR(th->th.th_sleep_loc, NULL);
578     }
579     // Mark thread as active again
580     th->th.th_active = TRUE;
581     if (TCR_4(th->th.th_in_pool)) {
582       KMP_ATOMIC_INC(&__kmp_thread_pool_active_nth);
583       th->th.th_active_in_pool = TRUE;
584     }
585   } // Drop out to main wait loop to check flag, handle tasks, etc.
586   __kmp_unlock_suspend_mx(th);
587   KF_TRACE(30, ("__kmp_mwait_template: T#%d exit\n", th_gtid));
588 }
589 #endif // KMP_HAVE_MWAIT || KMP_HAVE_UMWAIT
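
/* The monitor/mwait handshake above, reduced to a sketch (the real function
   also handles the sleep bit, the suspend mutex and thread-pool bookkeeping):

@code
__kmp_umonitor(cacheline);  // arm the monitor on the flag's cache line
if (!flag->done_check())    // re-check: a release may have raced with us
  __kmp_umwait(1, 100);     // park; any write to the cache line wakes us
@endcode
*/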
590 
591 /* Release any threads specified as waiting on the flag by releasing the flag
592    and resuming the waiting thread if indicated by the sleep bit(s). A thread that
593    calls __kmp_wait_template must call this function to wake up the potentially
594    sleeping thread and prevent deadlocks!  */
595 template <class C> static inline void __kmp_release_template(C *flag) {
596 #ifdef KMP_DEBUG
597   int gtid = TCR_4(__kmp_init_gtid) ? __kmp_get_gtid() : -1;
598 #endif
599   KF_TRACE(20, ("__kmp_release: T#%d releasing flag(%x)\n", gtid, flag->get()));
600   KMP_DEBUG_ASSERT(flag->get());
601   KMP_FSYNC_RELEASING(flag->get_void_p());
602 
603   flag->internal_release();
604 
605   KF_TRACE(100, ("__kmp_release: T#%d set new spin(%p)=%d\n", gtid, flag->get(),
606                  flag->load()));
607 
608   if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
609     // Only need to check sleep stuff if infinite block time not set.
610     // Are *any* threads waiting on flag sleeping?
611     if (flag->is_any_sleeping()) {
612       for (unsigned int i = 0; i < flag->get_num_waiters(); ++i) {
613         // if sleeping waiter exists at i, sets current_waiter to i inside flag
614         kmp_info_t *waiter = flag->get_waiter(i);
615         if (waiter) {
616           int wait_gtid = waiter->th.th_info.ds.ds_gtid;
617           // Wake up thread if needed
618           KF_TRACE(50, ("__kmp_release: T#%d waking up thread T#%d since sleep "
619                         "flag(%p) set\n",
620                         gtid, wait_gtid, flag->get()));
621           flag->resume(wait_gtid); // unsets flag's current_waiter when done
622         }
623       }
624     }
625   }
626 }
627 
628 template <typename FlagType> struct flag_traits {};
629 
630 template <> struct flag_traits<kmp_uint32> {
631   typedef kmp_uint32 flag_t;
632   static const flag_type t = flag32;
633   static inline flag_t tcr(flag_t f) { return TCR_4(f); }
634   static inline flag_t test_then_add4(volatile flag_t *f) {
635     return KMP_TEST_THEN_ADD4_32(RCAST(volatile kmp_int32 *, f));
636   }
637   static inline flag_t test_then_or(volatile flag_t *f, flag_t v) {
638     return KMP_TEST_THEN_OR32(f, v);
639   }
640   static inline flag_t test_then_and(volatile flag_t *f, flag_t v) {
641     return KMP_TEST_THEN_AND32(f, v);
642   }
643 };
644 
645 template <> struct flag_traits<kmp_uint64> {
646   typedef kmp_uint64 flag_t;
647   static const flag_type t = flag64;
648   static inline flag_t tcr(flag_t f) { return TCR_8(f); }
649   static inline flag_t test_then_add4(volatile flag_t *f) {
650     return KMP_TEST_THEN_ADD4_64(RCAST(volatile kmp_int64 *, f));
651   }
652   static inline flag_t test_then_or(volatile flag_t *f, flag_t v) {
653     return KMP_TEST_THEN_OR64(f, v);
654   }
655   static inline flag_t test_then_and(volatile flag_t *f, flag_t v) {
656     return KMP_TEST_THEN_AND64(f, v);
657   }
658 };
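
/* Illustrative use of the traits ("loc64" is hypothetical): the flag templates
   below pick the right-width primitive through them.

@code
volatile kmp_uint64 loc64 = 0;
typedef flag_traits<kmp_uint64> traits64;
kmp_uint64 prev = traits64::test_then_or(&loc64, KMP_BARRIER_SLEEP_STATE);
(void)prev; // flag value before the sleep bit was OR-ed in
@endcode
*/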
659 
660 // Basic flag that does not use C11 Atomics
661 template <typename FlagType, bool Sleepable>
662 class kmp_basic_flag_native : public kmp_flag_native<FlagType> {
663   typedef flag_traits<FlagType> traits_type;
664   FlagType checker; /**< Value the flag is compared against to check whether
665                        the flag has been released. */
666   kmp_info_t
667       *waiting_threads[1]; /**< Array of threads sleeping on this thread. */
668   kmp_uint32
669       num_waiting_threads; /**< Number of threads sleeping on this thread. */
670 public:
671   kmp_basic_flag_native(volatile FlagType *p)
672       : kmp_flag_native<FlagType>(p, traits_type::t), num_waiting_threads(0) {}
673   kmp_basic_flag_native(volatile FlagType *p, kmp_info_t *thr)
674       : kmp_flag_native<FlagType>(p, traits_type::t), num_waiting_threads(1) {
675     waiting_threads[0] = thr;
676   }
677   kmp_basic_flag_native(volatile FlagType *p, FlagType c)
678       : kmp_flag_native<FlagType>(p, traits_type::t), checker(c),
679         num_waiting_threads(0) {}
680   /*!
681    * @param i in   index into waiting_threads
682    * @result the thread that is waiting at index i
683    */
684   kmp_info_t *get_waiter(kmp_uint32 i) {
685     KMP_DEBUG_ASSERT(i < num_waiting_threads);
686     return waiting_threads[i];
687   }
688   /*!
689    * @result num_waiting_threads
690    */
691   kmp_uint32 get_num_waiters() { return num_waiting_threads; }
692   /*!
693    * @param thr in   the thread which is now waiting
694    *
695    * Insert a waiting thread at index 0.
696    */
697   void set_waiter(kmp_info_t *thr) {
698     waiting_threads[0] = thr;
699     num_waiting_threads = 1;
700   }
701   /*!
702    * @result true if the flag object has been released.
703    */
704   bool done_check() {
705     if (Sleepable)
706       return (traits_type::tcr(*(this->get())) & ~KMP_BARRIER_SLEEP_STATE) ==
707              checker;
708     else
709       return traits_type::tcr(*(this->get())) == checker;
710   }
711   /*!
712    * @param old_loc in   old value of flag
713    * @result true if the flag's old value indicates it was released.
714    */
715   bool done_check_val(FlagType old_loc) { return old_loc == checker; }
716   /*!
717    * @result true if the flag object is not yet released.
718    * Used in __kmp_wait_template like:
719    * @code
720    * while (flag.notdone_check()) { pause(); }
721    * @endcode
722    */
723   bool notdone_check() { return traits_type::tcr(*(this->get())) != checker; }
724   /*!
725    * @result Actual flag value before release was applied.
726    * Trigger all waiting threads to run by modifying flag to release state.
727    */
728   void internal_release() {
729     (void)traits_type::test_then_add4((volatile FlagType *)this->get());
730   }
731   /*!
732    * @result Actual flag value before sleep bit(s) set.
733    * Notes that there is at least one thread sleeping on the flag by setting
734    * sleep bit(s).
735    */
736   FlagType set_sleeping() {
737     return traits_type::test_then_or((volatile FlagType *)this->get(),
738                                      KMP_BARRIER_SLEEP_STATE);
739   }
740   /*!
741    * @result Actual flag value before sleep bit(s) cleared.
742    * Notes that there are no longer threads sleeping on the flag by clearing
743    * sleep bit(s).
744    */
745   FlagType unset_sleeping() {
746     return traits_type::test_then_and((volatile FlagType *)this->get(),
747                                       ~KMP_BARRIER_SLEEP_STATE);
748   }
749   /*!
750    * @param old_loc in   old value of flag
751    * Test whether there are threads sleeping on the flag's old value in old_loc.
752    */
753   bool is_sleeping_val(FlagType old_loc) {
754     return old_loc & KMP_BARRIER_SLEEP_STATE;
755   }
756   /*!
757    * Test whether there are threads sleeping on the flag.
758    */
759   bool is_sleeping() { return is_sleeping_val(*(this->get())); }
760   bool is_any_sleeping() { return is_sleeping_val(*(this->get())); }
761   kmp_uint8 *get_stolen() { return NULL; }
762   enum barrier_type get_bt() { return bs_last_barrier; }
763 };
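
/* Hedged sketch of how the sleep-bit helpers above are typically combined by a
   suspend path (the real suspend/resume code lives in the platform-specific
   runtime sources; "flag64" stands for any 64-bit instance, e.g. a
   kmp_flag_64<>):

@code
kmp_uint64 prev = flag64.set_sleeping(); // value before the sleep bit was set
if (flag64.done_check_val(prev)) {
  // Released before we advertised the sleep: clear the bit and skip the
  // actual suspension.
  flag64.unset_sleeping();
}
@endcode
*/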
764 
765 template <typename FlagType, bool Sleepable>
766 class kmp_basic_flag : public kmp_flag<FlagType> {
767   typedef flag_traits<FlagType> traits_type;
768   FlagType checker; /**< Value the flag is compared against to check whether
769                        the flag has been released. */
770   kmp_info_t
771       *waiting_threads[1]; /**< Array of threads sleeping on this thread. */
772   kmp_uint32
773       num_waiting_threads; /**< Number of threads sleeping on this thread. */
774 public:
775   kmp_basic_flag(std::atomic<FlagType> *p)
776       : kmp_flag<FlagType>(p, traits_type::t), num_waiting_threads(0) {}
777   kmp_basic_flag(std::atomic<FlagType> *p, kmp_info_t *thr)
778       : kmp_flag<FlagType>(p, traits_type::t), num_waiting_threads(1) {
779     waiting_threads[0] = thr;
780   }
781   kmp_basic_flag(std::atomic<FlagType> *p, FlagType c)
782       : kmp_flag<FlagType>(p, traits_type::t), checker(c),
783         num_waiting_threads(0) {}
784   /*!
785    * @param i in   index into waiting_threads
786    * @result the thread that is waiting at index i
787    */
788   kmp_info_t *get_waiter(kmp_uint32 i) {
789     KMP_DEBUG_ASSERT(i < num_waiting_threads);
790     return waiting_threads[i];
791   }
792   /*!
793    * @result num_waiting_threads
794    */
795   kmp_uint32 get_num_waiters() { return num_waiting_threads; }
796   /*!
797    * @param thr in   the thread which is now waiting
798    *
799    * Insert a waiting thread at index 0.
800    */
801   void set_waiter(kmp_info_t *thr) {
802     waiting_threads[0] = thr;
803     num_waiting_threads = 1;
804   }
805   /*!
806    * @result true if the flag object has been released.
807    */
808   bool done_check() {
809     if (Sleepable)
810       return (this->load() & ~KMP_BARRIER_SLEEP_STATE) == checker;
811     else
812       return this->load() == checker;
813   }
814   /*!
815    * @param old_loc in   old value of flag
816    * @result true if the flag's old value indicates it was released.
817    */
818   bool done_check_val(FlagType old_loc) { return old_loc == checker; }
819   /*!
820    * @result true if the flag object is not yet released.
821    * Used in __kmp_wait_template like:
822    * @code
823    * while (flag.notdone_check()) { pause(); }
824    * @endcode
825    */
826   bool notdone_check() { return this->load() != checker; }
827   /*!
828    * @result Actual flag value before release was applied.
829    * Trigger all waiting threads to run by modifying flag to release state.
830    */
831   void internal_release() { KMP_ATOMIC_ADD(this->get(), 4); }
832   /*!
833    * @result Actual flag value before sleep bit(s) set.
834    * Notes that there is at least one thread sleeping on the flag by setting
835    * sleep bit(s).
836    */
837   FlagType set_sleeping() {
838     return KMP_ATOMIC_OR(this->get(), KMP_BARRIER_SLEEP_STATE);
839   }
840   /*!
841    * @result Actual flag value before sleep bit(s) cleared.
842    * Notes that there are no longer threads sleeping on the flag by clearing
843    * sleep bit(s).
844    */
845   FlagType unset_sleeping() {
846     return KMP_ATOMIC_AND(this->get(), ~KMP_BARRIER_SLEEP_STATE);
847   }
848   /*!
849    * @param old_loc in   old value of flag
850    * Test whether there are threads sleeping on the flag's old value in old_loc.
851    */
852   bool is_sleeping_val(FlagType old_loc) {
853     return old_loc & KMP_BARRIER_SLEEP_STATE;
854   }
855   /*!
856    * Test whether there are threads sleeping on the flag.
857    */
858   bool is_sleeping() { return is_sleeping_val(this->load()); }
859   bool is_any_sleeping() { return is_sleeping_val(this->load()); }
860   kmp_uint8 *get_stolen() { return NULL; }
861   enum barrier_type get_bt() { return bs_last_barrier; }
862 };
863 
864 template <bool Cancellable, bool Sleepable>
865 class kmp_flag_32 : public kmp_basic_flag<kmp_uint32, Sleepable> {
866 public:
867   kmp_flag_32(std::atomic<kmp_uint32> *p)
868       : kmp_basic_flag<kmp_uint32, Sleepable>(p) {}
869   kmp_flag_32(std::atomic<kmp_uint32> *p, kmp_info_t *thr)
870       : kmp_basic_flag<kmp_uint32, Sleepable>(p, thr) {}
871   kmp_flag_32(std::atomic<kmp_uint32> *p, kmp_uint32 c)
872       : kmp_basic_flag<kmp_uint32, Sleepable>(p, c) {}
873   void suspend(int th_gtid) { __kmp_suspend_32(th_gtid, this); }
874 #if KMP_HAVE_MWAIT || KMP_HAVE_UMWAIT
875   void mwait(int th_gtid) { __kmp_mwait_32(th_gtid, this); }
876 #endif
877   void resume(int th_gtid) { __kmp_resume_32(th_gtid, this); }
878   int execute_tasks(kmp_info_t *this_thr, kmp_int32 gtid, int final_spin,
879                     int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj),
880                     kmp_int32 is_constrained) {
881     return __kmp_execute_tasks_32(
882         this_thr, gtid, this, final_spin,
883         thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), is_constrained);
884   }
885   bool wait(kmp_info_t *this_thr,
886             int final_spin USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
887     if (final_spin)
888       return __kmp_wait_template<kmp_flag_32, TRUE, Cancellable, Sleepable>(
889           this_thr, this USE_ITT_BUILD_ARG(itt_sync_obj));
890     else
891       return __kmp_wait_template<kmp_flag_32, FALSE, Cancellable, Sleepable>(
892           this_thr, this USE_ITT_BUILD_ARG(itt_sync_obj));
893   }
894   void release() { __kmp_release_template(this); }
895   flag_type get_ptr_type() { return flag32; }
896 };
897 
898 template <bool Cancellable, bool Sleepable>
899 class kmp_flag_64 : public kmp_basic_flag_native<kmp_uint64, Sleepable> {
900 public:
901   kmp_flag_64(volatile kmp_uint64 *p)
902       : kmp_basic_flag_native<kmp_uint64, Sleepable>(p) {}
903   kmp_flag_64(volatile kmp_uint64 *p, kmp_info_t *thr)
904       : kmp_basic_flag_native<kmp_uint64, Sleepable>(p, thr) {}
905   kmp_flag_64(volatile kmp_uint64 *p, kmp_uint64 c)
906       : kmp_basic_flag_native<kmp_uint64, Sleepable>(p, c) {}
907   void suspend(int th_gtid) { __kmp_suspend_64(th_gtid, this); }
908 #if KMP_HAVE_MWAIT || KMP_HAVE_UMWAIT
909   void mwait(int th_gtid) { __kmp_mwait_64(th_gtid, this); }
910 #endif
911   void resume(int th_gtid) { __kmp_resume_64(th_gtid, this); }
912   int execute_tasks(kmp_info_t *this_thr, kmp_int32 gtid, int final_spin,
913                     int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj),
914                     kmp_int32 is_constrained) {
915     return __kmp_execute_tasks_64(
916         this_thr, gtid, this, final_spin,
917         thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), is_constrained);
918   }
919   bool wait(kmp_info_t *this_thr,
920             int final_spin USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
921     if (final_spin)
922       return __kmp_wait_template<kmp_flag_64, TRUE, Cancellable, Sleepable>(
923           this_thr, this USE_ITT_BUILD_ARG(itt_sync_obj));
924     else
925       return __kmp_wait_template<kmp_flag_64, FALSE, Cancellable, Sleepable>(
926           this_thr, this USE_ITT_BUILD_ARG(itt_sync_obj));
927   }
928   void release() { __kmp_release_template(this); }
929   flag_type get_ptr_type() { return flag64; }
930 };
931 
932 // Hierarchical 64-bit on-core barrier instantiation
933 class kmp_flag_oncore : public kmp_flag_native<kmp_uint64> {
934   kmp_uint64 checker;
935   kmp_info_t *waiting_threads[1];
936   kmp_uint32 num_waiting_threads;
937   kmp_uint32
938       offset; /**< Portion of flag that is of interest for an operation. */
939   bool flag_switch; /**< Indicates a switch in flag location. */
940   enum barrier_type bt; /**< Barrier type. */
941   kmp_info_t *this_thr; /**< Thread that may be redirected to different flag
942                            location. */
943 #if USE_ITT_BUILD
944   void *
945       itt_sync_obj; /**< ITT object that must be passed to new flag location. */
946 #endif
947   unsigned char &byteref(volatile kmp_uint64 *loc, size_t offset) {
948     return (RCAST(unsigned char *, CCAST(kmp_uint64 *, loc)))[offset];
949   }
950 
951 public:
952   kmp_flag_oncore(volatile kmp_uint64 *p)
953       : kmp_flag_native<kmp_uint64>(p, flag_oncore), num_waiting_threads(0),
954         flag_switch(false) {}
955   kmp_flag_oncore(volatile kmp_uint64 *p, kmp_uint32 idx)
956       : kmp_flag_native<kmp_uint64>(p, flag_oncore), num_waiting_threads(0),
957         offset(idx), flag_switch(false) {}
958   kmp_flag_oncore(volatile kmp_uint64 *p, kmp_uint64 c, kmp_uint32 idx,
959                   enum barrier_type bar_t,
960                   kmp_info_t *thr USE_ITT_BUILD_ARG(void *itt))
961       : kmp_flag_native<kmp_uint64>(p, flag_oncore), checker(c),
962         num_waiting_threads(0), offset(idx), flag_switch(false), bt(bar_t),
963         this_thr(thr) USE_ITT_BUILD_ARG(itt_sync_obj(itt)) {}
964   kmp_info_t *get_waiter(kmp_uint32 i) {
965     KMP_DEBUG_ASSERT(i < num_waiting_threads);
966     return waiting_threads[i];
967   }
968   kmp_uint32 get_num_waiters() { return num_waiting_threads; }
969   void set_waiter(kmp_info_t *thr) {
970     waiting_threads[0] = thr;
971     num_waiting_threads = 1;
972   }
973   bool done_check_val(kmp_uint64 old_loc) {
974     return byteref(&old_loc, offset) == checker;
975   }
976   bool done_check() { return done_check_val(*get()); }
977   bool notdone_check() {
978     // Calculate flag_switch
979     if (this_thr->th.th_bar[bt].bb.wait_flag == KMP_BARRIER_SWITCH_TO_OWN_FLAG)
980       flag_switch = true;
981     if (byteref(get(), offset) != 1 && !flag_switch)
982       return true;
983     else if (flag_switch) {
984       this_thr->th.th_bar[bt].bb.wait_flag = KMP_BARRIER_SWITCHING;
985       kmp_flag_64<> flag(&this_thr->th.th_bar[bt].bb.b_go,
986                        (kmp_uint64)KMP_BARRIER_STATE_BUMP);
987       __kmp_wait_64(this_thr, &flag, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
988     }
989     return false;
990   }
991   void internal_release() {
992     // Other threads can write their own bytes simultaneously.
993     if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME) {
994       byteref(get(), offset) = 1;
995     } else {
996       kmp_uint64 mask = 0;
997       byteref(&mask, offset) = 1;
998       KMP_TEST_THEN_OR64(get(), mask);
999     }
1000   }
1001   kmp_uint64 set_sleeping() {
1002     return KMP_TEST_THEN_OR64(get(), KMP_BARRIER_SLEEP_STATE);
1003   }
1004   kmp_uint64 unset_sleeping() {
1005     return KMP_TEST_THEN_AND64(get(), ~KMP_BARRIER_SLEEP_STATE);
1006   }
1007   bool is_sleeping_val(kmp_uint64 old_loc) {
1008     return old_loc & KMP_BARRIER_SLEEP_STATE;
1009   }
1010   bool is_sleeping() { return is_sleeping_val(*get()); }
1011   bool is_any_sleeping() { return is_sleeping_val(*get()); }
1012   void wait(kmp_info_t *this_thr, int final_spin) {
1013     if (final_spin)
1014       __kmp_wait_template<kmp_flag_oncore, TRUE>(
1015           this_thr, this USE_ITT_BUILD_ARG(itt_sync_obj));
1016     else
1017       __kmp_wait_template<kmp_flag_oncore, FALSE>(
1018           this_thr, this USE_ITT_BUILD_ARG(itt_sync_obj));
1019   }
1020   void release() { __kmp_release_template(this); }
1021   void suspend(int th_gtid) { __kmp_suspend_oncore(th_gtid, this); }
1022 #if KMP_HAVE_MWAIT || KMP_HAVE_UMWAIT
1023   void mwait(int th_gtid) { __kmp_mwait_oncore(th_gtid, this); }
1024 #endif
1025   void resume(int th_gtid) { __kmp_resume_oncore(th_gtid, this); }
1026   int execute_tasks(kmp_info_t *this_thr, kmp_int32 gtid, int final_spin,
1027                     int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj),
1028                     kmp_int32 is_constrained) {
1029     return __kmp_execute_tasks_oncore(
1030         this_thr, gtid, this, final_spin,
1031         thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), is_constrained);
1032   }
1033   kmp_uint8 *get_stolen() { return NULL; }
1034   enum barrier_type get_bt() { return bt; }
1035   flag_type get_ptr_type() { return flag_oncore; }
1036 };
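
/* Hedged illustration of the per-byte encoding used by the on-core flag: each
   thread on the core owns one byte (selected by "offset") of a shared 64-bit
   word, so a release touches only that byte. The values and "this_thr" below
   are hypothetical.

@code
volatile kmp_uint64 go = 0;
kmp_flag_oncore f(&go, 1ULL, 2, bs_forkjoin_barrier,
                  this_thr USE_ITT_BUILD_ARG(NULL));
f.internal_release();        // byte 2 of 'go' becomes 1
bool done = f.done_check();  // true once byte 2 equals the checker (1)
@endcode
*/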
1037 
1038 // Used to wake up threads; the volatile void *flag is usually the th_sleep_loc
1039 // associated with the given gtid.
1040 static inline void __kmp_null_resume_wrapper(int gtid, volatile void *flag) {
1041   if (!flag)
1042     return;
1043 
1044   switch (RCAST(kmp_flag_64<> *, CCAST(void *, flag))->get_type()) {
1045   case flag32:
1046     __kmp_resume_32(gtid, (kmp_flag_32<> *)NULL);
1047     break;
1048   case flag64:
1049     __kmp_resume_64(gtid, (kmp_flag_64<> *)NULL);
1050     break;
1051   case flag_oncore:
1052     __kmp_resume_oncore(gtid, (kmp_flag_oncore *)NULL);
1053     break;
1054   }
1055 }
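
/* Hedged usage sketch: wake whatever flag the target thread recorded in its
   th_sleep_loc field ("other_thr" and "other_gtid" are hypothetical):

@code
volatile void *loc = TCR_PTR(other_thr->th.th_sleep_loc);
__kmp_null_resume_wrapper(other_gtid, loc);
@endcode
*/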
1056 
1057 /*!
1058 @}
1059 */
1060 
1061 #endif // KMP_WAIT_RELEASE_H
1062