xref: /freebsd/contrib/llvm-project/openmp/runtime/src/kmp_wait_release.h (revision e92ffd9b626833ebdbf2742c8ffddc6cd94b963e)
1 /*
2  * kmp_wait_release.h -- Wait/Release implementation
3  */
4 
5 //===----------------------------------------------------------------------===//
6 //
7 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8 // See https://llvm.org/LICENSE.txt for license information.
9 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #ifndef KMP_WAIT_RELEASE_H
14 #define KMP_WAIT_RELEASE_H
15 
16 #include "kmp.h"
17 #include "kmp_itt.h"
18 #include "kmp_stats.h"
19 #if OMPT_SUPPORT
20 #include "ompt-specific.h"
21 #endif
22 
23 /*!
24 @defgroup WAIT_RELEASE Wait/Release operations
25 
26 The definitions and functions here implement the lowest level thread
27 synchronizations of suspending a thread and awaking it. They are used to build
28 higher level operations such as barriers and fork/join.
29 */
30 
31 /*!
32 @ingroup WAIT_RELEASE
33 @{
34 */
35 
/*!
 * Identifies the kind of storage a wait/release flag object refers to.
 * The numeric values are spelled out because the enumerator is stored in the
 * 16-bit flag_properties::type bit-field and cast back on retrieval.
 */
enum flag_type {
  flag32 = 0, /**< 32 bit flags */
  flag64 = 1, /**< 64 bit flags */
  flag_oncore = 2 /**< special 64-bit flag for on-core barrier (hierarchical) */
};
44 
/*!
 * Small descriptor kept next to a flag pointer. One 16-bit field records the
 * flag_type of the flag; the other 16-bit field is reserved.
 */
struct flag_properties {
  unsigned type : 16; /**< flag_type value of the flag */
  unsigned reserved : 16; /**< reserved, written as 0 by the flag classes */
};
49 
50 /*!
51  * Base class for wait/release volatile flag
52  */
53 template <typename P> class kmp_flag_native {
54   volatile P *loc;
55   flag_properties t;
56 
57 public:
58   typedef P flag_t;
59   kmp_flag_native(volatile P *p, flag_type ft)
60       : loc(p), t({(short unsigned int)ft, 0U}) {}
61   volatile P *get() { return loc; }
62   void *get_void_p() { return RCAST(void *, CCAST(P *, loc)); }
63   void set(volatile P *new_loc) { loc = new_loc; }
64   flag_type get_type() { return (flag_type)(t.type); }
65   P load() { return *loc; }
66   void store(P val) { *loc = val; }
67 };
68 
69 /*!
70  * Base class for wait/release atomic flag
71  */
72 template <typename P> class kmp_flag {
73   std::atomic<P>
74       *loc; /**< Pointer to the flag storage that is modified by another thread
75              */
76   flag_properties t; /**< "Type" of the flag in loc */
77 public:
78   typedef P flag_t;
79   kmp_flag(std::atomic<P> *p, flag_type ft)
80       : loc(p), t({(short unsigned int)ft, 0U}) {}
81   /*!
82    * @result the pointer to the actual flag
83    */
84   std::atomic<P> *get() { return loc; }
85   /*!
86    * @result void* pointer to the actual flag
87    */
88   void *get_void_p() { return RCAST(void *, loc); }
89   /*!
90    * @param new_loc in   set loc to point at new_loc
91    */
92   void set(std::atomic<P> *new_loc) { loc = new_loc; }
93   /*!
94    * @result the flag_type
95    */
96   flag_type get_type() { return (flag_type)(t.type); }
97   /*!
98    * @result flag value
99    */
100   P load() { return loc->load(std::memory_order_acquire); }
101   /*!
102    * @param val the new flag value to be stored
103    */
104   void store(P val) { loc->store(val, std::memory_order_release); }
105   // Derived classes must provide the following:
106   /*
107   kmp_info_t * get_waiter(kmp_uint32 i);
108   kmp_uint32 get_num_waiters();
109   bool done_check();
110   bool done_check_val(P old_loc);
111   bool notdone_check();
112   P internal_release();
113   void suspend(int th_gtid);
114   void mwait(int th_gtid);
115   void resume(int th_gtid);
116   P set_sleeping();
117   P unset_sleeping();
118   bool is_sleeping();
119   bool is_any_sleeping();
120   bool is_sleeping_val(P old_loc);
121   int execute_tasks(kmp_info_t *this_thr, kmp_int32 gtid, int final_spin,
122                     int *thread_finished
123                     USE_ITT_BUILD_ARG(void * itt_sync_obj), kmp_int32
124                     is_constrained);
125   */
126 };
127 
128 #if OMPT_SUPPORT
129 OMPT_NOINLINE
130 static void __ompt_implicit_task_end(kmp_info_t *this_thr,
131                                      ompt_state_t ompt_state,
132                                      ompt_data_t *tId) {
133   int ds_tid = this_thr->th.th_info.ds.ds_tid;
134   if (ompt_state == ompt_state_wait_barrier_implicit) {
135     this_thr->th.ompt_thread_info.state = ompt_state_overhead;
136 #if OMPT_OPTIONAL
137     void *codeptr = NULL;
138     if (ompt_enabled.ompt_callback_sync_region_wait) {
139       ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
140           ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, tId,
141           codeptr);
142     }
143     if (ompt_enabled.ompt_callback_sync_region) {
144       ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
145           ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, tId,
146           codeptr);
147     }
148 #endif
149     if (!KMP_MASTER_TID(ds_tid)) {
150       if (ompt_enabled.ompt_callback_implicit_task) {
151         int flags = this_thr->th.ompt_thread_info.parallel_flags;
152         flags = (flags & ompt_parallel_league) ? ompt_task_initial
153                                                : ompt_task_implicit;
154         ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
155             ompt_scope_end, NULL, tId, 0, ds_tid, flags);
156       }
157       // return to idle state
158       this_thr->th.ompt_thread_info.state = ompt_state_idle;
159     } else {
160       this_thr->th.ompt_thread_info.state = ompt_state_overhead;
161     }
162   }
163 }
164 #endif
165 
166 /* Spin wait loop that first does pause/yield, then sleep. A thread that calls
167    __kmp_wait_*  must make certain that another thread calls __kmp_release
168    to wake it back up to prevent deadlocks!
169 
170    NOTE: We may not belong to a team at this point.

       Template parameters:
         C           - flag class; this function calls done_check(),
                       notdone_check(), execute_tasks(), suspend() and (when
                       mwait support is compiled in) mwait() on it.
         final_spin  - forwarded to execute_tasks(); when true it also enables
                       the OMPT implicit-task-end reporting and, on
                       KMP_OS_UNIX, th_blocking is set for the duration of the
                       wait (cleared around suspend() and on exit).
         Cancellable - when true, a cancel_parallel request on the thread's
                       team ends the wait.
         Sleepable   - when false, the loop never reaches mwait()/suspend().

       Returns true only when Cancellable is set and the thread's team has
       t_cancel_request == cancel_parallel (checked at entry and after the
       loop); returns false in every other case, including when the flag was
       already released on entry.  */
171 template <class C, bool final_spin, bool Cancellable = false,
172           bool Sleepable = true>
173 static inline bool
174 __kmp_wait_template(kmp_info_t *this_thr,
175                     C *flag USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
176 #if USE_ITT_BUILD && USE_ITT_NOTIFY
177   volatile void *spin = flag->get();
178 #endif
179   kmp_uint32 spins;
180   int th_gtid;
181   int tasks_completed = FALSE;
182 #if !KMP_USE_MONITOR
183   kmp_uint64 poll_count;
184   kmp_uint64 hibernate_goal;
185 #else
186   kmp_uint32 hibernate;
187 #endif
188 
189   KMP_FSYNC_SPIN_INIT(spin, NULL);
      // Fast path: the flag has already been released.
190   if (flag->done_check()) {
191     KMP_FSYNC_SPIN_ACQUIRED(CCAST(void *, spin));
192     return false;
193   }
194   th_gtid = this_thr->th.th_info.ds.ds_gtid;
195   if (Cancellable) {
196     kmp_team_t *team = this_thr->th.th_team;
197     if (team && team->t.t_cancel_request == cancel_parallel)
198       return true;
199   }
200 #if KMP_OS_UNIX
201   if (final_spin)
202     KMP_ATOMIC_ST_REL(&this_thr->th.th_blocking, true);
203 #endif
204   KA_TRACE(20,
205            ("__kmp_wait_sleep: T#%d waiting for flag(%p)\n", th_gtid, flag));
206 #if KMP_STATS_ENABLED
207   stats_state_e thread_state = KMP_GET_THREAD_STATE();
208 #endif
209 
210 /* OMPT Behavior:
211 THIS function is called from
212   __kmp_barrier (2 times)  (implicit or explicit barrier in parallel regions)
213             these have join / fork behavior
214 
215        In these cases, we don't change the state or trigger events in THIS
216 function.
217        Events are triggered in the calling code (__kmp_barrier):
218 
219                 state := ompt_state_overhead
220             barrier-begin
221             barrier-wait-begin
222                 state := ompt_state_wait_barrier
223           call join-barrier-implementation (finally arrive here)
224           {}
225           call fork-barrier-implementation (finally arrive here)
226           {}
227                 state := ompt_state_overhead
228             barrier-wait-end
229             barrier-end
230                 state := ompt_state_work_parallel
231 
232 
233   __kmp_fork_barrier  (after thread creation, before executing implicit task)
234           call fork-barrier-implementation (finally arrive here)
235           {} // worker arrive here with state = ompt_state_idle
236 
237 
238   __kmp_join_barrier  (implicit barrier at end of parallel region)
239                 state := ompt_state_barrier_implicit
240             barrier-begin
241             barrier-wait-begin
242           call join-barrier-implementation (finally arrive here
243 final_spin=FALSE)
244           {
245           }
246   __kmp_fork_barrier  (implicit barrier at end of parallel region)
247           call fork-barrier-implementation (finally arrive here final_spin=TRUE)
248 
249        Worker after task-team is finished:
250             barrier-wait-end
251             barrier-end
252             implicit-task-end
253             idle-begin
254                 state := ompt_state_idle
255 
256        Before leaving, if state = ompt_state_idle
257             idle-end
258                 state := ompt_state_overhead
259 */
260 #if OMPT_SUPPORT
      // NOTE(review): ompt_entry_state and tId are assigned only when
      // ompt_enabled.enabled is set here; the later uses re-test
      // ompt_enabled.enabled, so this presumably relies on that setting not
      // changing during the wait -- confirm.
261   ompt_state_t ompt_entry_state;
262   ompt_data_t *tId;
263   if (ompt_enabled.enabled) {
264     ompt_entry_state = this_thr->th.ompt_thread_info.state;
265     if (!final_spin || ompt_entry_state != ompt_state_wait_barrier_implicit ||
266         KMP_MASTER_TID(this_thr->th.th_info.ds.ds_tid)) {
          // NOTE(review): th_team is dereferenced without a NULL check here,
          // while the Cancellable paths test it for NULL -- confirm th_team is
          // always valid when this branch is taken.
267       ompt_lw_taskteam_t *team =
268           this_thr->th.th_team->t.ompt_serialized_team_info;
269       if (team) {
270         tId = &(team->ompt_task_info.task_data);
271       } else {
272         tId = OMPT_CUR_TASK_DATA(this_thr);
273       }
274     } else {
275       tId = &(this_thr->th.ompt_thread_info.task_data);
276     }
277     if (final_spin && (__kmp_tasking_mode == tskm_immediate_exec ||
278                        this_thr->th.th_task_team == NULL)) {
279       // implicit task is done. Either no taskqueue, or task-team finished
280       __ompt_implicit_task_end(this_thr, ompt_entry_state, tId);
281     }
282   }
283 #endif
284 
285   KMP_INIT_YIELD(spins); // Setup for waiting
286 
      // Compute the point at which spinning should give way to sleeping. This
      // is only needed when suspension is possible at all (finite blocktime or
      // a soft pause in effect).
287   if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME ||
288       __kmp_pause_status == kmp_soft_paused) {
289 #if KMP_USE_MONITOR
290 // The worker threads cannot rely on the team struct existing at this point.
291 // Use the bt values cached in the thread struct instead.
292 #ifdef KMP_ADJUST_BLOCKTIME
293     if (__kmp_pause_status == kmp_soft_paused ||
294         (__kmp_zero_bt && !this_thr->th.th_team_bt_set))
295       // Force immediate suspend if not set by user and more threads than
296       // available procs
297       hibernate = 0;
298     else
299       hibernate = this_thr->th.th_team_bt_intervals;
300 #else
301     hibernate = this_thr->th.th_team_bt_intervals;
302 #endif /* KMP_ADJUST_BLOCKTIME */
303 
304     /* If the blocktime is nonzero, we want to make sure that we spin wait for
305        the entirety of the specified #intervals, plus up to one interval more.
306        This increment make certain that this thread doesn't go to sleep too
307        soon.  */
308     if (hibernate != 0)
309       hibernate++;
310 
311     // Add in the current time value.
312     hibernate += TCR_4(__kmp_global.g.g_time.dt.t_value);
313     KF_TRACE(20, ("__kmp_wait_sleep: T#%d now=%d, hibernate=%d, intervals=%d\n",
314                   th_gtid, __kmp_global.g.g_time.dt.t_value, hibernate,
315                   hibernate - __kmp_global.g.g_time.dt.t_value));
316 #else
317     if (__kmp_pause_status == kmp_soft_paused) {
318       // Force immediate suspend
319       hibernate_goal = KMP_NOW();
320     } else
321       hibernate_goal = KMP_NOW() + this_thr->th.th_team_bt_intervals;
322     poll_count = 0;
        // presumably silences an unused-variable warning for configurations
        // in which KMP_BLOCKING() ignores its second argument -- TODO confirm
323     (void)poll_count;
324 #endif // KMP_USE_MONITOR
325   }
326 
327   KMP_MB();
328 
329   // Main wait spin loop
330   while (flag->notdone_check()) {
331     kmp_task_team_t *task_team = NULL;
332     if (__kmp_tasking_mode != tskm_immediate_exec) {
333       task_team = this_thr->th.th_task_team;
334       /* If the thread's task team pointer is NULL, it means one of 3 things:
335          1) A newly-created thread is first being released by
336          __kmp_fork_barrier(), and its task team has not been set up yet.
337          2) All tasks have been executed to completion.
338          3) Tasking is off for this region.  This could be because we are in a
339          serialized region (perhaps the outer one), or else tasking was manually
340          disabled (KMP_TASKING=0).  */
341       if (task_team != NULL) {
342         if (TCR_SYNC_4(task_team->tt.tt_active)) {
343           if (KMP_TASKING_ENABLED(task_team))
344             flag->execute_tasks(
345                 this_thr, th_gtid, final_spin,
346                 &tasks_completed USE_ITT_BUILD_ARG(itt_sync_obj), 0);
347           else
348             this_thr->th.th_reap_state = KMP_SAFE_TO_REAP;
349         } else {
350           KMP_DEBUG_ASSERT(!KMP_MASTER_TID(this_thr->th.th_info.ds.ds_tid));
351 #if OMPT_SUPPORT
352           // task-team is done now, other cases should be caught above
353           if (final_spin && ompt_enabled.enabled)
354             __ompt_implicit_task_end(this_thr, ompt_entry_state, tId);
355 #endif
356           this_thr->th.th_task_team = NULL;
357           this_thr->th.th_reap_state = KMP_SAFE_TO_REAP;
358         }
359       } else {
360         this_thr->th.th_reap_state = KMP_SAFE_TO_REAP;
361       } // if
362     } // if
363 
364     KMP_FSYNC_SPIN_PREPARE(CCAST(void *, spin));
        // Stop waiting as soon as the runtime is shutting down.
365     if (TCR_4(__kmp_global.g.g_done)) {
366       if (__kmp_global.g.g_abort)
367         __kmp_abort_thread();
368       break;
369     }
370 
371     // If we are oversubscribed, or have waited a bit (and
372     // KMP_LIBRARY=throughput), then yield
373     KMP_YIELD_OVERSUB_ELSE_SPIN(spins);
374 
375 #if KMP_STATS_ENABLED
376     // Check if thread has been signalled to idle state
377     // This indicates that the logical "join-barrier" has finished
378     if (this_thr->th.th_stats->isIdle() &&
379         KMP_GET_THREAD_STATE() == FORK_JOIN_BARRIER) {
380       KMP_SET_THREAD_STATE(IDLE);
381       KMP_PUSH_PARTITIONED_TIMER(OMP_idle);
382     }
383 #endif
384     // Check if the barrier surrounding this wait loop has been cancelled
385     if (Cancellable) {
386       kmp_team_t *team = this_thr->th.th_team;
387       if (team && team->t.t_cancel_request == cancel_parallel)
388         break;
389     }
390 
391     // For hidden helper thread, if task_team is nullptr, it means the main
392     // thread has not released the barrier. We cannot wait here because once the
393     // main thread releases all children barriers, all hidden helper threads are
394     // still sleeping. This leads to a problem that following configuration,
395     // such as task team sync, will not be performed such that this thread does
396     // not have task team. Usually it is not bad. However, a corner case is,
397     // when the first task encountered is an untied task, the check in
398     // __kmp_task_alloc will crash because it uses the task team pointer without
399     // checking whether it is nullptr. It is probably under some kind of
400     // assumption.
401     if (task_team && KMP_HIDDEN_HELPER_WORKER_THREAD(th_gtid) &&
402         !TCR_4(__kmp_hidden_helper_team_done)) {
403       // If there is still hidden helper tasks to be executed, the hidden helper
404       // thread will not enter a waiting status.
405       if (KMP_ATOMIC_LD_ACQ(&__kmp_unexecuted_hidden_helper_tasks) == 0) {
406         __kmp_hidden_helper_worker_thread_wait();
407       }
408       continue;
409     }
410 
411     // Don't suspend if KMP_BLOCKTIME is set to "infinite"
412     if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME &&
413         __kmp_pause_status != kmp_soft_paused)
414       continue;
415 
416     // Don't suspend if there is a likelihood of new tasks being spawned.
417     if ((task_team != NULL) && TCR_4(task_team->tt.tt_found_tasks))
418       continue;
419 
420 #if KMP_USE_MONITOR
421     // If we have waited a bit more, fall asleep
422     if (TCR_4(__kmp_global.g.g_time.dt.t_value) < hibernate)
423       continue;
424 #else
425     if (KMP_BLOCKING(hibernate_goal, poll_count++))
426       continue;
427 #endif
428     // Don't suspend if wait loop designated non-sleepable
429     // in template parameters
430     if (!Sleepable)
431       continue;
432 
        // NOTE(review): this repeats the "infinite blocktime" test made
        // earlier in the same iteration; it looks redundant unless the two
        // globals can change between the reads -- confirm before removing.
433     if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME &&
434         __kmp_pause_status != kmp_soft_paused)
435       continue;
436 
437 #if KMP_HAVE_MWAIT || KMP_HAVE_UMWAIT
438     if (__kmp_mwait_enabled || __kmp_umwait_enabled) {
439       KF_TRACE(50, ("__kmp_wait_sleep: T#%d using monitor/mwait\n", th_gtid));
440       flag->mwait(th_gtid);
441     } else {
442 #endif
443       KF_TRACE(50, ("__kmp_wait_sleep: T#%d suspend time reached\n", th_gtid));
          // th_blocking is cleared for the duration of suspend() and set again
          // once the thread is back in the spin loop.
444 #if KMP_OS_UNIX
445       if (final_spin)
446         KMP_ATOMIC_ST_REL(&this_thr->th.th_blocking, false);
447 #endif
448       flag->suspend(th_gtid);
449 #if KMP_OS_UNIX
450       if (final_spin)
451         KMP_ATOMIC_ST_REL(&this_thr->th.th_blocking, true);
452 #endif
453 #if KMP_HAVE_MWAIT || KMP_HAVE_UMWAIT
454     }
455 #endif
456 
457     if (TCR_4(__kmp_global.g.g_done)) {
458       if (__kmp_global.g.g_abort)
459         __kmp_abort_thread();
460       break;
461     } else if (__kmp_tasking_mode != tskm_immediate_exec &&
462                this_thr->th.th_reap_state == KMP_SAFE_TO_REAP) {
463       this_thr->th.th_reap_state = KMP_NOT_SAFE_TO_REAP;
464     }
465     // TODO: If thread is done with work and times out, disband/free
466   }
467 
468 #if OMPT_SUPPORT
469   ompt_state_t ompt_exit_state = this_thr->th.ompt_thread_info.state;
470   if (ompt_enabled.enabled && ompt_exit_state != ompt_state_undefined) {
471 #if OMPT_OPTIONAL
472     if (final_spin) {
473       __ompt_implicit_task_end(this_thr, ompt_exit_state, tId);
474       ompt_exit_state = this_thr->th.ompt_thread_info.state;
475     }
476 #endif
        // idle-end: a thread leaving the wait in the idle state goes back to
        // overhead.
477     if (ompt_exit_state == ompt_state_idle) {
478       this_thr->th.ompt_thread_info.state = ompt_state_overhead;
479     }
480   }
481 #endif
482 #if KMP_STATS_ENABLED
483   // If we were put into idle state, pop that off the state stack
484   if (KMP_GET_THREAD_STATE() == IDLE) {
485     KMP_POP_PARTITIONED_TIMER();
486     KMP_SET_THREAD_STATE(thread_state);
487     this_thr->th.th_stats->resetIdleFlag();
488   }
489 #endif
490 
491 #if KMP_OS_UNIX
492   if (final_spin)
493     KMP_ATOMIC_ST_REL(&this_thr->th.th_blocking, false);
494 #endif
495   KMP_FSYNC_SPIN_ACQUIRED(CCAST(void *, spin));
496   if (Cancellable) {
497     kmp_team_t *team = this_thr->th.th_team;
498     if (team && team->t.t_cancel_request == cancel_parallel) {
499       if (tasks_completed) {
500         // undo the previous decrement of unfinished_threads so that the
501         // thread can decrement at the join barrier with no problem
502         kmp_task_team_t *task_team = this_thr->th.th_task_team;
503         std::atomic<kmp_int32> *unfinished_threads =
504             &(task_team->tt.tt_unfinished_threads);
505         KMP_ATOMIC_INC(unfinished_threads);
506       }
507       return true;
508     }
509   }
510   return false;
511 }
512 
513 #if KMP_HAVE_MWAIT || KMP_HAVE_UMWAIT
514 // Set up a monitor on the flag variable causing the calling thread to wait in
515 // a less active state until the flag variable is modified.
    //
    // th_gtid - global thread id of the calling thread; used to look the thread
    //           up in __kmp_threads
    // flag    - flag object being waited on; done_check(), set_sleeping(),
    //           unset_sleeping() and is_sleeping() are called on it
    //
    // The thread's suspend mutex is held for the whole function except while
    // the mwait/umwait instruction itself is executing. The function returns
    // after a single wait attempt; the caller's wait loop re-checks the flag.
516 template <class C>
517 static inline void __kmp_mwait_template(int th_gtid, C *flag) {
518   KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(USER_mwait);
519   kmp_info_t *th = __kmp_threads[th_gtid];
520 
521   KF_TRACE(30, ("__kmp_mwait_template: T#%d enter for flag = %p\n", th_gtid,
522                 flag->get()));
523 
524   // User-level mwait is available
525   KMP_DEBUG_ASSERT(__kmp_mwait_enabled || __kmp_umwait_enabled);
526 
527   __kmp_suspend_initialize_thread(th);
528   __kmp_lock_suspend_mx(th);
529 
530   volatile void *spin = flag->get();
      // Address of the flag rounded down to a CACHE_LINE boundary (assumes
      // CACHE_LINE is a power of two); this is the address handed to monitor.
531   void *cacheline = (void *)(kmp_uintptr_t(spin) & ~(CACHE_LINE - 1));
532 
533   if (!flag->done_check()) {
534     // Mark thread as no longer active
535     th->th.th_active = FALSE;
536     if (th->th.th_active_in_pool) {
537       th->th.th_active_in_pool = FALSE;
538       KMP_ATOMIC_DEC(&__kmp_thread_pool_active_nth);
539       KMP_DEBUG_ASSERT(TCR_4(__kmp_thread_pool_active_nth) >= 0);
540     }
541     flag->set_sleeping();
542     KF_TRACE(50, ("__kmp_mwait_template: T#%d calling monitor\n", th_gtid));
        // NOTE(review): because of the #if/#elif, a build with KMP_HAVE_UMWAIT
        // compiles out the mwait branch; if only __kmp_mwait_enabled is set at
        // run time in such a build, no monitor is armed -- confirm intended.
543 #if KMP_HAVE_UMWAIT
544     if (__kmp_umwait_enabled) {
545       __kmp_umonitor(cacheline);
546     }
547 #elif KMP_HAVE_MWAIT
548     if (__kmp_mwait_enabled) {
549       __kmp_mm_monitor(cacheline, 0, 0);
550     }
551 #endif
552     // To avoid a race, check flag between 'monitor' and 'mwait'. A write to
553     // the address could happen after the last time we checked and before
554     // monitoring started, in which case monitor can't detect the change.
555     if (flag->done_check())
556       flag->unset_sleeping();
557     else {
558       // if flag changes here, wake-up happens immediately
559       TCW_PTR(th->th.th_sleep_loc, (void *)flag);
          // Drop the suspend mutex only for the duration of the wait itself.
560       __kmp_unlock_suspend_mx(th);
561       KF_TRACE(50, ("__kmp_mwait_template: T#%d calling mwait\n", th_gtid));
562 #if KMP_HAVE_UMWAIT
563       if (__kmp_umwait_enabled) {
564         __kmp_umwait(1, 100); // to do: enable ctrl via hints, backoff counter
565       }
566 #elif KMP_HAVE_MWAIT
567       if (__kmp_mwait_enabled) {
568         __kmp_mm_mwait(0, __kmp_mwait_hints);
569       }
570 #endif
571       KF_TRACE(50, ("__kmp_mwait_template: T#%d mwait done\n", th_gtid));
572       __kmp_lock_suspend_mx(th);
573       // Clean up sleep info; doesn't matter how/why this thread stopped waiting
574       if (flag->is_sleeping())
575         flag->unset_sleeping();
576       TCW_PTR(th->th.th_sleep_loc, NULL);
577     }
578     // Mark thread as active again
579     th->th.th_active = TRUE;
580     if (TCR_4(th->th.th_in_pool)) {
581       KMP_ATOMIC_INC(&__kmp_thread_pool_active_nth);
582       th->th.th_active_in_pool = TRUE;
583     }
584   } // Drop out to main wait loop to check flag, handle tasks, etc.
585   __kmp_unlock_suspend_mx(th);
586   KF_TRACE(30, ("__kmp_mwait_template: T#%d exit\n", th_gtid));
587 }
588 #endif // KMP_HAVE_MWAIT || KMP_HAVE_UMWAIT
589 
590 /* Release any threads specified as waiting on the flag by releasing the flag
591    and resume the waiting thread if indicated by the sleep bit(s). A thread that
592    calls __kmp_wait_template must call this function to wake up the potentially
593    sleeping thread and prevent deadlocks!  */
594 template <class C> static inline void __kmp_release_template(C *flag) {
595 #ifdef KMP_DEBUG
596   int gtid = TCR_4(__kmp_init_gtid) ? __kmp_get_gtid() : -1;
597 #endif
598   KF_TRACE(20, ("__kmp_release: T#%d releasing flag(%x)\n", gtid, flag->get()));
599   KMP_DEBUG_ASSERT(flag->get());
600   KMP_FSYNC_RELEASING(flag->get_void_p());
601 
602   flag->internal_release();
603 
604   KF_TRACE(100, ("__kmp_release: T#%d set new spin=%d\n", gtid, flag->get(),
605                  flag->load()));
606 
607   if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
608     // Only need to check sleep stuff if infinite block time not set.
609     // Are *any* threads waiting on flag sleeping?
610     if (flag->is_any_sleeping()) {
611       for (unsigned int i = 0; i < flag->get_num_waiters(); ++i) {
612         // if sleeping waiter exists at i, sets current_waiter to i inside flag
613         kmp_info_t *waiter = flag->get_waiter(i);
614         if (waiter) {
615           int wait_gtid = waiter->th.th_info.ds.ds_gtid;
616           // Wake up thread if needed
617           KF_TRACE(50, ("__kmp_release: T#%d waking up thread T#%d since sleep "
618                         "flag(%p) set\n",
619                         gtid, wait_gtid, flag->get()));
620           flag->resume(wait_gtid); // unsets flag's current_waiter when done
621         }
622       }
623     }
624   }
625 }
626 
/*! Primary template, deliberately empty: the flag operations are supplied by
    the kmp_uint32 and kmp_uint64 specializations. */
template <typename T> struct flag_traits {};
628 
629 template <> struct flag_traits<kmp_uint32> {
630   typedef kmp_uint32 flag_t;
631   static const flag_type t = flag32;
632   static inline flag_t tcr(flag_t f) { return TCR_4(f); }
633   static inline flag_t test_then_add4(volatile flag_t *f) {
634     return KMP_TEST_THEN_ADD4_32(RCAST(volatile kmp_int32 *, f));
635   }
636   static inline flag_t test_then_or(volatile flag_t *f, flag_t v) {
637     return KMP_TEST_THEN_OR32(f, v);
638   }
639   static inline flag_t test_then_and(volatile flag_t *f, flag_t v) {
640     return KMP_TEST_THEN_AND32(f, v);
641   }
642 };
643 
644 template <> struct flag_traits<kmp_uint64> {
645   typedef kmp_uint64 flag_t;
646   static const flag_type t = flag64;
647   static inline flag_t tcr(flag_t f) { return TCR_8(f); }
648   static inline flag_t test_then_add4(volatile flag_t *f) {
649     return KMP_TEST_THEN_ADD4_64(RCAST(volatile kmp_int64 *, f));
650   }
651   static inline flag_t test_then_or(volatile flag_t *f, flag_t v) {
652     return KMP_TEST_THEN_OR64(f, v);
653   }
654   static inline flag_t test_then_and(volatile flag_t *f, flag_t v) {
655     return KMP_TEST_THEN_AND64(f, v);
656   }
657 };
658 
659 // Basic flag that does not use C11 Atomics
660 template <typename FlagType, bool Sleepable>
661 class kmp_basic_flag_native : public kmp_flag_native<FlagType> {
662   typedef flag_traits<FlagType> traits_type;
663   FlagType checker; /**< Value to compare flag to to check if flag has been
664                        released. */
665   kmp_info_t
666       *waiting_threads[1]; /**< Array of threads sleeping on this thread. */
667   kmp_uint32
668       num_waiting_threads; /**< Number of threads sleeping on this thread. */
669 public:
670   kmp_basic_flag_native(volatile FlagType *p)
671       : kmp_flag_native<FlagType>(p, traits_type::t), num_waiting_threads(0) {}
672   kmp_basic_flag_native(volatile FlagType *p, kmp_info_t *thr)
673       : kmp_flag_native<FlagType>(p, traits_type::t), num_waiting_threads(1) {
674     waiting_threads[0] = thr;
675   }
676   kmp_basic_flag_native(volatile FlagType *p, FlagType c)
677       : kmp_flag_native<FlagType>(p, traits_type::t), checker(c),
678         num_waiting_threads(0) {}
679   /*!
680    * param i in   index into waiting_threads
681    * @result the thread that is waiting at index i
682    */
683   kmp_info_t *get_waiter(kmp_uint32 i) {
684     KMP_DEBUG_ASSERT(i < num_waiting_threads);
685     return waiting_threads[i];
686   }
687   /*!
688    * @result num_waiting_threads
689    */
690   kmp_uint32 get_num_waiters() { return num_waiting_threads; }
691   /*!
692    * @param thr in   the thread which is now waiting
693    *
694    * Insert a waiting thread at index 0.
695    */
696   void set_waiter(kmp_info_t *thr) {
697     waiting_threads[0] = thr;
698     num_waiting_threads = 1;
699   }
700   /*!
701    * @result true if the flag object has been released.
702    */
703   bool done_check() {
704     if (Sleepable)
705       return (traits_type::tcr(*(this->get())) & ~KMP_BARRIER_SLEEP_STATE) ==
706              checker;
707     else
708       return traits_type::tcr(*(this->get())) == checker;
709   }
710   /*!
711    * @param old_loc in   old value of flag
712    * @result true if the flag's old value indicates it was released.
713    */
714   bool done_check_val(FlagType old_loc) { return old_loc == checker; }
715   /*!
716    * @result true if the flag object is not yet released.
717    * Used in __kmp_wait_template like:
718    * @code
719    * while (flag.notdone_check()) { pause(); }
720    * @endcode
721    */
722   bool notdone_check() { return traits_type::tcr(*(this->get())) != checker; }
723   /*!
724    * @result Actual flag value before release was applied.
725    * Trigger all waiting threads to run by modifying flag to release state.
726    */
727   void internal_release() {
728     (void)traits_type::test_then_add4((volatile FlagType *)this->get());
729   }
730   /*!
731    * @result Actual flag value before sleep bit(s) set.
732    * Notes that there is at least one thread sleeping on the flag by setting
733    * sleep bit(s).
734    */
735   FlagType set_sleeping() {
736     return traits_type::test_then_or((volatile FlagType *)this->get(),
737                                      KMP_BARRIER_SLEEP_STATE);
738   }
739   /*!
740    * @result Actual flag value before sleep bit(s) cleared.
741    * Notes that there are no longer threads sleeping on the flag by clearing
742    * sleep bit(s).
743    */
744   FlagType unset_sleeping() {
745     return traits_type::test_then_and((volatile FlagType *)this->get(),
746                                       ~KMP_BARRIER_SLEEP_STATE);
747   }
748   /*!
749    * @param old_loc in   old value of flag
750    * Test whether there are threads sleeping on the flag's old value in old_loc.
751    */
752   bool is_sleeping_val(FlagType old_loc) {
753     return old_loc & KMP_BARRIER_SLEEP_STATE;
754   }
755   /*!
756    * Test whether there are threads sleeping on the flag.
757    */
758   bool is_sleeping() { return is_sleeping_val(*(this->get())); }
759   bool is_any_sleeping() { return is_sleeping_val(*(this->get())); }
760   kmp_uint8 *get_stolen() { return NULL; }
761   enum barrier_type get_bt() { return bs_last_barrier; }
762 };
763 
764 template <typename FlagType, bool Sleepable>
765 class kmp_basic_flag : public kmp_flag<FlagType> {
766   typedef flag_traits<FlagType> traits_type;
767   FlagType checker; /**< Value to compare flag to to check if flag has been
768                        released. */
769   kmp_info_t
770       *waiting_threads[1]; /**< Array of threads sleeping on this thread. */
771   kmp_uint32
772       num_waiting_threads; /**< Number of threads sleeping on this thread. */
773 public:
774   kmp_basic_flag(std::atomic<FlagType> *p)
775       : kmp_flag<FlagType>(p, traits_type::t), num_waiting_threads(0) {}
776   kmp_basic_flag(std::atomic<FlagType> *p, kmp_info_t *thr)
777       : kmp_flag<FlagType>(p, traits_type::t), num_waiting_threads(1) {
778     waiting_threads[0] = thr;
779   }
780   kmp_basic_flag(std::atomic<FlagType> *p, FlagType c)
781       : kmp_flag<FlagType>(p, traits_type::t), checker(c),
782         num_waiting_threads(0) {}
783   /*!
784    * param i in   index into waiting_threads
785    * @result the thread that is waiting at index i
786    */
787   kmp_info_t *get_waiter(kmp_uint32 i) {
788     KMP_DEBUG_ASSERT(i < num_waiting_threads);
789     return waiting_threads[i];
790   }
791   /*!
792    * @result num_waiting_threads
793    */
794   kmp_uint32 get_num_waiters() { return num_waiting_threads; }
795   /*!
796    * @param thr in   the thread which is now waiting
797    *
798    * Insert a waiting thread at index 0.
799    */
800   void set_waiter(kmp_info_t *thr) {
801     waiting_threads[0] = thr;
802     num_waiting_threads = 1;
803   }
804   /*!
805    * @result true if the flag object has been released.
806    */
807   bool done_check() {
808     if (Sleepable)
809       return (this->load() & ~KMP_BARRIER_SLEEP_STATE) == checker;
810     else
811       return this->load() == checker;
812   }
813   /*!
814    * @param old_loc in   old value of flag
815    * @result true if the flag's old value indicates it was released.
816    */
817   bool done_check_val(FlagType old_loc) { return old_loc == checker; }
818   /*!
819    * @result true if the flag object is not yet released.
820    * Used in __kmp_wait_template like:
821    * @code
822    * while (flag.notdone_check()) { pause(); }
823    * @endcode
824    */
825   bool notdone_check() { return this->load() != checker; }
826   /*!
827    * @result Actual flag value before release was applied.
828    * Trigger all waiting threads to run by modifying flag to release state.
829    */
830   void internal_release() { KMP_ATOMIC_ADD(this->get(), 4); }
831   /*!
832    * @result Actual flag value before sleep bit(s) set.
833    * Notes that there is at least one thread sleeping on the flag by setting
834    * sleep bit(s).
835    */
836   FlagType set_sleeping() {
837     return KMP_ATOMIC_OR(this->get(), KMP_BARRIER_SLEEP_STATE);
838   }
839   /*!
840    * @result Actual flag value before sleep bit(s) cleared.
841    * Notes that there are no longer threads sleeping on the flag by clearing
842    * sleep bit(s).
843    */
844   FlagType unset_sleeping() {
845     return KMP_ATOMIC_AND(this->get(), ~KMP_BARRIER_SLEEP_STATE);
846   }
847   /*!
848    * @param old_loc in   old value of flag
849    * Test whether there are threads sleeping on the flag's old value in old_loc.
850    */
851   bool is_sleeping_val(FlagType old_loc) {
852     return old_loc & KMP_BARRIER_SLEEP_STATE;
853   }
854   /*!
855    * Test whether there are threads sleeping on the flag.
856    */
857   bool is_sleeping() { return is_sleeping_val(this->load()); }
858   bool is_any_sleeping() { return is_sleeping_val(this->load()); }
859   kmp_uint8 *get_stolen() { return NULL; }
860   enum barrier_type get_bt() { return bs_last_barrier; }
861 };
862 
863 template <bool Cancellable, bool Sleepable>
864 class kmp_flag_32 : public kmp_basic_flag<kmp_uint32, Sleepable> {
865 public:
866   kmp_flag_32(std::atomic<kmp_uint32> *p)
867       : kmp_basic_flag<kmp_uint32, Sleepable>(p) {}
868   kmp_flag_32(std::atomic<kmp_uint32> *p, kmp_info_t *thr)
869       : kmp_basic_flag<kmp_uint32, Sleepable>(p, thr) {}
870   kmp_flag_32(std::atomic<kmp_uint32> *p, kmp_uint32 c)
871       : kmp_basic_flag<kmp_uint32, Sleepable>(p, c) {}
872   void suspend(int th_gtid) { __kmp_suspend_32(th_gtid, this); }
873 #if KMP_HAVE_MWAIT || KMP_HAVE_UMWAIT
874   void mwait(int th_gtid) { __kmp_mwait_32(th_gtid, this); }
875 #endif
876   void resume(int th_gtid) { __kmp_resume_32(th_gtid, this); }
877   int execute_tasks(kmp_info_t *this_thr, kmp_int32 gtid, int final_spin,
878                     int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj),
879                     kmp_int32 is_constrained) {
880     return __kmp_execute_tasks_32(
881         this_thr, gtid, this, final_spin,
882         thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), is_constrained);
883   }
884   bool wait(kmp_info_t *this_thr,
885             int final_spin USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
886     if (final_spin)
887       return __kmp_wait_template<kmp_flag_32, TRUE, Cancellable, Sleepable>(
888           this_thr, this USE_ITT_BUILD_ARG(itt_sync_obj));
889     else
890       return __kmp_wait_template<kmp_flag_32, FALSE, Cancellable, Sleepable>(
891           this_thr, this USE_ITT_BUILD_ARG(itt_sync_obj));
892   }
893   void release() { __kmp_release_template(this); }
894   flag_type get_ptr_type() { return flag32; }
895 };
896 
897 template <bool Cancellable, bool Sleepable>
898 class kmp_flag_64 : public kmp_basic_flag_native<kmp_uint64, Sleepable> {
899 public:
900   kmp_flag_64(volatile kmp_uint64 *p)
901       : kmp_basic_flag_native<kmp_uint64, Sleepable>(p) {}
902   kmp_flag_64(volatile kmp_uint64 *p, kmp_info_t *thr)
903       : kmp_basic_flag_native<kmp_uint64, Sleepable>(p, thr) {}
904   kmp_flag_64(volatile kmp_uint64 *p, kmp_uint64 c)
905       : kmp_basic_flag_native<kmp_uint64, Sleepable>(p, c) {}
906   void suspend(int th_gtid) { __kmp_suspend_64(th_gtid, this); }
907 #if KMP_HAVE_MWAIT || KMP_HAVE_UMWAIT
908   void mwait(int th_gtid) { __kmp_mwait_64(th_gtid, this); }
909 #endif
910   void resume(int th_gtid) { __kmp_resume_64(th_gtid, this); }
911   int execute_tasks(kmp_info_t *this_thr, kmp_int32 gtid, int final_spin,
912                     int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj),
913                     kmp_int32 is_constrained) {
914     return __kmp_execute_tasks_64(
915         this_thr, gtid, this, final_spin,
916         thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), is_constrained);
917   }
918   bool wait(kmp_info_t *this_thr,
919             int final_spin USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
920     if (final_spin)
921       return __kmp_wait_template<kmp_flag_64, TRUE, Cancellable, Sleepable>(
922           this_thr, this USE_ITT_BUILD_ARG(itt_sync_obj));
923     else
924       return __kmp_wait_template<kmp_flag_64, FALSE, Cancellable, Sleepable>(
925           this_thr, this USE_ITT_BUILD_ARG(itt_sync_obj));
926   }
927   void release() { __kmp_release_template(this); }
928   flag_type get_ptr_type() { return flag64; }
929 };
930 
931 // Hierarchical 64-bit on-core barrier instantiation
932 class kmp_flag_oncore : public kmp_flag_native<kmp_uint64> {
933   kmp_uint64 checker;
934   kmp_info_t *waiting_threads[1];
935   kmp_uint32 num_waiting_threads;
936   kmp_uint32
937       offset; /**< Portion of flag that is of interest for an operation. */
938   bool flag_switch; /**< Indicates a switch in flag location. */
939   enum barrier_type bt; /**< Barrier type. */
940   kmp_info_t *this_thr; /**< Thread that may be redirected to different flag
941                            location. */
942 #if USE_ITT_BUILD
943   void *
944       itt_sync_obj; /**< ITT object that must be passed to new flag location. */
945 #endif
946   unsigned char &byteref(volatile kmp_uint64 *loc, size_t offset) {
947     return (RCAST(unsigned char *, CCAST(kmp_uint64 *, loc)))[offset];
948   }
949 
950 public:
951   kmp_flag_oncore(volatile kmp_uint64 *p)
952       : kmp_flag_native<kmp_uint64>(p, flag_oncore), num_waiting_threads(0),
953         flag_switch(false) {}
954   kmp_flag_oncore(volatile kmp_uint64 *p, kmp_uint32 idx)
955       : kmp_flag_native<kmp_uint64>(p, flag_oncore), num_waiting_threads(0),
956         offset(idx), flag_switch(false) {}
957   kmp_flag_oncore(volatile kmp_uint64 *p, kmp_uint64 c, kmp_uint32 idx,
958                   enum barrier_type bar_t,
959                   kmp_info_t *thr USE_ITT_BUILD_ARG(void *itt))
960       : kmp_flag_native<kmp_uint64>(p, flag_oncore), checker(c),
961         num_waiting_threads(0), offset(idx), flag_switch(false), bt(bar_t),
962         this_thr(thr) USE_ITT_BUILD_ARG(itt_sync_obj(itt)) {}
963   kmp_info_t *get_waiter(kmp_uint32 i) {
964     KMP_DEBUG_ASSERT(i < num_waiting_threads);
965     return waiting_threads[i];
966   }
967   kmp_uint32 get_num_waiters() { return num_waiting_threads; }
968   void set_waiter(kmp_info_t *thr) {
969     waiting_threads[0] = thr;
970     num_waiting_threads = 1;
971   }
972   bool done_check_val(kmp_uint64 old_loc) {
973     return byteref(&old_loc, offset) == checker;
974   }
975   bool done_check() { return done_check_val(*get()); }
976   bool notdone_check() {
977     // Calculate flag_switch
978     if (this_thr->th.th_bar[bt].bb.wait_flag == KMP_BARRIER_SWITCH_TO_OWN_FLAG)
979       flag_switch = true;
980     if (byteref(get(), offset) != 1 && !flag_switch)
981       return true;
982     else if (flag_switch) {
983       this_thr->th.th_bar[bt].bb.wait_flag = KMP_BARRIER_SWITCHING;
984       kmp_flag_64<> flag(&this_thr->th.th_bar[bt].bb.b_go,
985                          (kmp_uint64)KMP_BARRIER_STATE_BUMP);
986       __kmp_wait_64(this_thr, &flag, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
987     }
988     return false;
989   }
990   void internal_release() {
991     // Other threads can write their own bytes simultaneously.
992     if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME) {
993       byteref(get(), offset) = 1;
994     } else {
995       kmp_uint64 mask = 0;
996       byteref(&mask, offset) = 1;
997       KMP_TEST_THEN_OR64(get(), mask);
998     }
999   }
1000   kmp_uint64 set_sleeping() {
1001     return KMP_TEST_THEN_OR64(get(), KMP_BARRIER_SLEEP_STATE);
1002   }
1003   kmp_uint64 unset_sleeping() {
1004     return KMP_TEST_THEN_AND64(get(), ~KMP_BARRIER_SLEEP_STATE);
1005   }
1006   bool is_sleeping_val(kmp_uint64 old_loc) {
1007     return old_loc & KMP_BARRIER_SLEEP_STATE;
1008   }
1009   bool is_sleeping() { return is_sleeping_val(*get()); }
1010   bool is_any_sleeping() { return is_sleeping_val(*get()); }
1011   void wait(kmp_info_t *this_thr, int final_spin) {
1012     if (final_spin)
1013       __kmp_wait_template<kmp_flag_oncore, TRUE>(
1014           this_thr, this USE_ITT_BUILD_ARG(itt_sync_obj));
1015     else
1016       __kmp_wait_template<kmp_flag_oncore, FALSE>(
1017           this_thr, this USE_ITT_BUILD_ARG(itt_sync_obj));
1018   }
1019   void release() { __kmp_release_template(this); }
1020   void suspend(int th_gtid) { __kmp_suspend_oncore(th_gtid, this); }
1021 #if KMP_HAVE_MWAIT || KMP_HAVE_UMWAIT
1022   void mwait(int th_gtid) { __kmp_mwait_oncore(th_gtid, this); }
1023 #endif
1024   void resume(int th_gtid) { __kmp_resume_oncore(th_gtid, this); }
1025   int execute_tasks(kmp_info_t *this_thr, kmp_int32 gtid, int final_spin,
1026                     int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj),
1027                     kmp_int32 is_constrained) {
1028 #if OMPD_SUPPORT
1029     int ret = __kmp_execute_tasks_oncore(
1030         this_thr, gtid, this, final_spin,
1031         thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), is_constrained);
1032     if (ompd_state & OMPD_ENABLE_BP)
1033       ompd_bp_task_end();
1034     return ret;
1035 #else
1036     return __kmp_execute_tasks_oncore(
1037         this_thr, gtid, this, final_spin,
1038         thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), is_constrained);
1039 #endif
1040   }
1041   kmp_uint8 *get_stolen() { return NULL; }
1042   enum barrier_type get_bt() { return bt; }
1043   flag_type get_ptr_type() { return flag_oncore; }
1044 };
1045 
1046 // Used to wake up threads, volatile void* flag is usually the th_sleep_loc
1047 // associated with int gtid.
1048 static inline void __kmp_null_resume_wrapper(int gtid, volatile void *flag) {
1049   if (!flag)
1050     return;
1051 
1052   switch (RCAST(kmp_flag_64<> *, CCAST(void *, flag))->get_type()) {
1053   case flag32:
1054     __kmp_resume_32(gtid, (kmp_flag_32<> *)NULL);
1055     break;
1056   case flag64:
1057     __kmp_resume_64(gtid, (kmp_flag_64<> *)NULL);
1058     break;
1059   case flag_oncore:
1060     __kmp_resume_oncore(gtid, (kmp_flag_oncore *)NULL);
1061     break;
1062   }
1063 }
1064 
1065 /*!
1066 @}
1067 */
1068 
1069 #endif // KMP_WAIT_RELEASE_H
1070