xref: /freebsd/contrib/llvm-project/openmp/runtime/src/kmp_tasking.cpp (revision 0fca6ea1d4eea4c934cfff25ac9ee8ad6fe95583)
1 /*
2  * kmp_tasking.cpp -- OpenMP 3.0 tasking support.
3  */
4 
5 //===----------------------------------------------------------------------===//
6 //
7 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8 // See https://llvm.org/LICENSE.txt for license information.
9 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "kmp.h"
14 #include "kmp_i18n.h"
15 #include "kmp_itt.h"
16 #include "kmp_stats.h"
17 #include "kmp_wait_release.h"
18 #include "kmp_taskdeps.h"
19 
20 #if OMPT_SUPPORT
21 #include "ompt-specific.h"
22 #endif
23 
24 #if ENABLE_LIBOMPTARGET
25 static void (*tgt_target_nowait_query)(void **);
26 
__kmp_init_target_task()27 void __kmp_init_target_task() {
28   *(void **)(&tgt_target_nowait_query) = KMP_DLSYM("__tgt_target_nowait_query");
29 }
30 #endif
31 
32 /* forward declaration */
33 static void __kmp_enable_tasking(kmp_task_team_t *task_team,
34                                  kmp_info_t *this_thr);
35 static void __kmp_alloc_task_deque(kmp_info_t *thread,
36                                    kmp_thread_data_t *thread_data);
37 static int __kmp_realloc_task_threads_data(kmp_info_t *thread,
38                                            kmp_task_team_t *task_team);
39 static void __kmp_bottom_half_finish_proxy(kmp_int32 gtid, kmp_task_t *ptask);
40 #if OMPX_TASKGRAPH
41 static kmp_tdg_info_t *__kmp_find_tdg(kmp_int32 tdg_id);
42 int __kmp_taskloop_task(int gtid, void *ptask);
43 #endif
44 
45 #ifdef BUILD_TIED_TASK_STACK
46 
47 //  __kmp_trace_task_stack: print the tied tasks from the task stack in order
48 //  from top do bottom
49 //
50 //  gtid: global thread identifier for thread containing stack
51 //  thread_data: thread data for task team thread containing stack
52 //  threshold: value above which the trace statement triggers
53 //  location: string identifying call site of this function (for trace)
__kmp_trace_task_stack(kmp_int32 gtid,kmp_thread_data_t * thread_data,int threshold,char * location)54 static void __kmp_trace_task_stack(kmp_int32 gtid,
55                                    kmp_thread_data_t *thread_data,
56                                    int threshold, char *location) {
57   kmp_task_stack_t *task_stack = &thread_data->td.td_susp_tied_tasks;
58   kmp_taskdata_t **stack_top = task_stack->ts_top;
59   kmp_int32 entries = task_stack->ts_entries;
60   kmp_taskdata_t *tied_task;
61 
62   KA_TRACE(
63       threshold,
64       ("__kmp_trace_task_stack(start): location = %s, gtid = %d, entries = %d, "
65        "first_block = %p, stack_top = %p \n",
66        location, gtid, entries, task_stack->ts_first_block, stack_top));
67 
68   KMP_DEBUG_ASSERT(stack_top != NULL);
69   KMP_DEBUG_ASSERT(entries > 0);
70 
71   while (entries != 0) {
72     KMP_DEBUG_ASSERT(stack_top != &task_stack->ts_first_block.sb_block[0]);
73     // fix up ts_top if we need to pop from previous block
74     if (entries & TASK_STACK_INDEX_MASK == 0) {
75       kmp_stack_block_t *stack_block = (kmp_stack_block_t *)(stack_top);
76 
77       stack_block = stack_block->sb_prev;
78       stack_top = &stack_block->sb_block[TASK_STACK_BLOCK_SIZE];
79     }
80 
81     // finish bookkeeping
82     stack_top--;
83     entries--;
84 
85     tied_task = *stack_top;
86 
87     KMP_DEBUG_ASSERT(tied_task != NULL);
88     KMP_DEBUG_ASSERT(tied_task->td_flags.tasktype == TASK_TIED);
89 
90     KA_TRACE(threshold,
91              ("__kmp_trace_task_stack(%s):             gtid=%d, entry=%d, "
92               "stack_top=%p, tied_task=%p\n",
93               location, gtid, entries, stack_top, tied_task));
94   }
95   KMP_DEBUG_ASSERT(stack_top == &task_stack->ts_first_block.sb_block[0]);
96 
97   KA_TRACE(threshold,
98            ("__kmp_trace_task_stack(exit): location = %s, gtid = %d\n",
99             location, gtid));
100 }
101 
102 //  __kmp_init_task_stack: initialize the task stack for the first time
103 //  after a thread_data structure is created.
104 //  It should not be necessary to do this again (assuming the stack works).
105 //
106 //  gtid: global thread identifier of calling thread
107 //  thread_data: thread data for task team thread containing stack
__kmp_init_task_stack(kmp_int32 gtid,kmp_thread_data_t * thread_data)108 static void __kmp_init_task_stack(kmp_int32 gtid,
109                                   kmp_thread_data_t *thread_data) {
110   kmp_task_stack_t *task_stack = &thread_data->td.td_susp_tied_tasks;
111   kmp_stack_block_t *first_block;
112 
113   // set up the first block of the stack
114   first_block = &task_stack->ts_first_block;
115   task_stack->ts_top = (kmp_taskdata_t **)first_block;
116   memset((void *)first_block, '\0',
117          TASK_STACK_BLOCK_SIZE * sizeof(kmp_taskdata_t *));
118 
119   // initialize the stack to be empty
120   task_stack->ts_entries = TASK_STACK_EMPTY;
121   first_block->sb_next = NULL;
122   first_block->sb_prev = NULL;
123 }
124 
125 //  __kmp_free_task_stack: free the task stack when thread_data is destroyed.
126 //
127 //  gtid: global thread identifier for calling thread
128 //  thread_data: thread info for thread containing stack
__kmp_free_task_stack(kmp_int32 gtid,kmp_thread_data_t * thread_data)129 static void __kmp_free_task_stack(kmp_int32 gtid,
130                                   kmp_thread_data_t *thread_data) {
131   kmp_task_stack_t *task_stack = &thread_data->td.td_susp_tied_tasks;
132   kmp_stack_block_t *stack_block = &task_stack->ts_first_block;
133 
134   KMP_DEBUG_ASSERT(task_stack->ts_entries == TASK_STACK_EMPTY);
135   // free from the second block of the stack
136   while (stack_block != NULL) {
137     kmp_stack_block_t *next_block = (stack_block) ? stack_block->sb_next : NULL;
138 
139     stack_block->sb_next = NULL;
140     stack_block->sb_prev = NULL;
141     if (stack_block != &task_stack->ts_first_block) {
142       __kmp_thread_free(thread,
143                         stack_block); // free the block, if not the first
144     }
145     stack_block = next_block;
146   }
147   // initialize the stack to be empty
148   task_stack->ts_entries = 0;
149   task_stack->ts_top = NULL;
150 }
151 
152 //  __kmp_push_task_stack: Push the tied task onto the task stack.
153 //     Grow the stack if necessary by allocating another block.
154 //
155 //  gtid: global thread identifier for calling thread
156 //  thread: thread info for thread containing stack
157 //  tied_task: the task to push on the stack
__kmp_push_task_stack(kmp_int32 gtid,kmp_info_t * thread,kmp_taskdata_t * tied_task)158 static void __kmp_push_task_stack(kmp_int32 gtid, kmp_info_t *thread,
159                                   kmp_taskdata_t *tied_task) {
160   // GEH - need to consider what to do if tt_threads_data not allocated yet
161   kmp_thread_data_t *thread_data =
162       &thread->th.th_task_team->tt.tt_threads_data[__kmp_tid_from_gtid(gtid)];
163   kmp_task_stack_t *task_stack = &thread_data->td.td_susp_tied_tasks;
164 
165   if (tied_task->td_flags.team_serial || tied_task->td_flags.tasking_ser) {
166     return; // Don't push anything on stack if team or team tasks are serialized
167   }
168 
169   KMP_DEBUG_ASSERT(tied_task->td_flags.tasktype == TASK_TIED);
170   KMP_DEBUG_ASSERT(task_stack->ts_top != NULL);
171 
172   KA_TRACE(20,
173            ("__kmp_push_task_stack(enter): GTID: %d; THREAD: %p; TASK: %p\n",
174             gtid, thread, tied_task));
175   // Store entry
176   *(task_stack->ts_top) = tied_task;
177 
178   // Do bookkeeping for next push
179   task_stack->ts_top++;
180   task_stack->ts_entries++;
181 
182   if (task_stack->ts_entries & TASK_STACK_INDEX_MASK == 0) {
183     // Find beginning of this task block
184     kmp_stack_block_t *stack_block =
185         (kmp_stack_block_t *)(task_stack->ts_top - TASK_STACK_BLOCK_SIZE);
186 
187     // Check if we already have a block
188     if (stack_block->sb_next !=
189         NULL) { // reset ts_top to beginning of next block
190       task_stack->ts_top = &stack_block->sb_next->sb_block[0];
191     } else { // Alloc new block and link it up
192       kmp_stack_block_t *new_block = (kmp_stack_block_t *)__kmp_thread_calloc(
193           thread, sizeof(kmp_stack_block_t));
194 
195       task_stack->ts_top = &new_block->sb_block[0];
196       stack_block->sb_next = new_block;
197       new_block->sb_prev = stack_block;
198       new_block->sb_next = NULL;
199 
200       KA_TRACE(
201           30,
202           ("__kmp_push_task_stack(): GTID: %d; TASK: %p; Alloc new block: %p\n",
203            gtid, tied_task, new_block));
204     }
205   }
206   KA_TRACE(20, ("__kmp_push_task_stack(exit): GTID: %d; TASK: %p\n", gtid,
207                 tied_task));
208 }
209 
210 //  __kmp_pop_task_stack: Pop the tied task from the task stack.  Don't return
211 //  the task, just check to make sure it matches the ending task passed in.
212 //
213 //  gtid: global thread identifier for the calling thread
214 //  thread: thread info structure containing stack
215 //  tied_task: the task popped off the stack
216 //  ending_task: the task that is ending (should match popped task)
__kmp_pop_task_stack(kmp_int32 gtid,kmp_info_t * thread,kmp_taskdata_t * ending_task)217 static void __kmp_pop_task_stack(kmp_int32 gtid, kmp_info_t *thread,
218                                  kmp_taskdata_t *ending_task) {
219   // GEH - need to consider what to do if tt_threads_data not allocated yet
220   kmp_thread_data_t *thread_data =
221       &thread->th.th_task_team->tt_threads_data[__kmp_tid_from_gtid(gtid)];
222   kmp_task_stack_t *task_stack = &thread_data->td.td_susp_tied_tasks;
223   kmp_taskdata_t *tied_task;
224 
225   if (ending_task->td_flags.team_serial || ending_task->td_flags.tasking_ser) {
226     // Don't pop anything from stack if team or team tasks are serialized
227     return;
228   }
229 
230   KMP_DEBUG_ASSERT(task_stack->ts_top != NULL);
231   KMP_DEBUG_ASSERT(task_stack->ts_entries > 0);
232 
233   KA_TRACE(20, ("__kmp_pop_task_stack(enter): GTID: %d; THREAD: %p\n", gtid,
234                 thread));
235 
236   // fix up ts_top if we need to pop from previous block
237   if (task_stack->ts_entries & TASK_STACK_INDEX_MASK == 0) {
238     kmp_stack_block_t *stack_block = (kmp_stack_block_t *)(task_stack->ts_top);
239 
240     stack_block = stack_block->sb_prev;
241     task_stack->ts_top = &stack_block->sb_block[TASK_STACK_BLOCK_SIZE];
242   }
243 
244   // finish bookkeeping
245   task_stack->ts_top--;
246   task_stack->ts_entries--;
247 
248   tied_task = *(task_stack->ts_top);
249 
250   KMP_DEBUG_ASSERT(tied_task != NULL);
251   KMP_DEBUG_ASSERT(tied_task->td_flags.tasktype == TASK_TIED);
252   KMP_DEBUG_ASSERT(tied_task == ending_task); // If we built the stack correctly
253 
254   KA_TRACE(20, ("__kmp_pop_task_stack(exit): GTID: %d; TASK: %p\n", gtid,
255                 tied_task));
256   return;
257 }
258 #endif /* BUILD_TIED_TASK_STACK */
259 
260 // returns 1 if new task is allowed to execute, 0 otherwise
261 // checks Task Scheduling constraint (if requested) and
262 // mutexinoutset dependencies if any
__kmp_task_is_allowed(int gtid,const kmp_int32 is_constrained,const kmp_taskdata_t * tasknew,const kmp_taskdata_t * taskcurr)263 static bool __kmp_task_is_allowed(int gtid, const kmp_int32 is_constrained,
264                                   const kmp_taskdata_t *tasknew,
265                                   const kmp_taskdata_t *taskcurr) {
266   if (is_constrained && (tasknew->td_flags.tiedness == TASK_TIED)) {
267     // Check if the candidate obeys the Task Scheduling Constraints (TSC)
268     // only descendant of all deferred tied tasks can be scheduled, checking
269     // the last one is enough, as it in turn is the descendant of all others
270     kmp_taskdata_t *current = taskcurr->td_last_tied;
271     KMP_DEBUG_ASSERT(current != NULL);
272     // check if the task is not suspended on barrier
273     if (current->td_flags.tasktype == TASK_EXPLICIT ||
274         current->td_taskwait_thread > 0) { // <= 0 on barrier
275       kmp_int32 level = current->td_level;
276       kmp_taskdata_t *parent = tasknew->td_parent;
277       while (parent != current && parent->td_level > level) {
278         // check generation up to the level of the current task
279         parent = parent->td_parent;
280         KMP_DEBUG_ASSERT(parent != NULL);
281       }
282       if (parent != current)
283         return false;
284     }
285   }
286   // Check mutexinoutset dependencies, acquire locks
287   kmp_depnode_t *node = tasknew->td_depnode;
288 #if OMPX_TASKGRAPH
289   if (!tasknew->is_taskgraph && UNLIKELY(node && (node->dn.mtx_num_locks > 0))) {
290 #else
291   if (UNLIKELY(node && (node->dn.mtx_num_locks > 0))) {
292 #endif
293     for (int i = 0; i < node->dn.mtx_num_locks; ++i) {
294       KMP_DEBUG_ASSERT(node->dn.mtx_locks[i] != NULL);
295       if (__kmp_test_lock(node->dn.mtx_locks[i], gtid))
296         continue;
297       // could not get the lock, release previous locks
298       for (int j = i - 1; j >= 0; --j)
299         __kmp_release_lock(node->dn.mtx_locks[j], gtid);
300       return false;
301     }
302     // negative num_locks means all locks acquired successfully
303     node->dn.mtx_num_locks = -node->dn.mtx_num_locks;
304   }
305   return true;
306 }
307 
308 // __kmp_realloc_task_deque:
309 // Re-allocates a task deque for a particular thread, copies the content from
310 // the old deque and adjusts the necessary data structures relating to the
311 // deque. This operation must be done with the deque_lock being held
312 static void __kmp_realloc_task_deque(kmp_info_t *thread,
313                                      kmp_thread_data_t *thread_data) {
314   kmp_int32 size = TASK_DEQUE_SIZE(thread_data->td);
315   KMP_DEBUG_ASSERT(TCR_4(thread_data->td.td_deque_ntasks) == size);
316   kmp_int32 new_size = 2 * size;
317 
318   KE_TRACE(10, ("__kmp_realloc_task_deque: T#%d reallocating deque[from %d to "
319                 "%d] for thread_data %p\n",
320                 __kmp_gtid_from_thread(thread), size, new_size, thread_data));
321 
322   kmp_taskdata_t **new_deque =
323       (kmp_taskdata_t **)__kmp_allocate(new_size * sizeof(kmp_taskdata_t *));
324 
325   int i, j;
326   for (i = thread_data->td.td_deque_head, j = 0; j < size;
327        i = (i + 1) & TASK_DEQUE_MASK(thread_data->td), j++)
328     new_deque[j] = thread_data->td.td_deque[i];
329 
330   __kmp_free(thread_data->td.td_deque);
331 
332   thread_data->td.td_deque_head = 0;
333   thread_data->td.td_deque_tail = size;
334   thread_data->td.td_deque = new_deque;
335   thread_data->td.td_deque_size = new_size;
336 }
337 
338 static kmp_task_pri_t *__kmp_alloc_task_pri_list() {
339   kmp_task_pri_t *l = (kmp_task_pri_t *)__kmp_allocate(sizeof(kmp_task_pri_t));
340   kmp_thread_data_t *thread_data = &l->td;
341   __kmp_init_bootstrap_lock(&thread_data->td.td_deque_lock);
342   thread_data->td.td_deque_last_stolen = -1;
343   KE_TRACE(20, ("__kmp_alloc_task_pri_list: T#%d allocating deque[%d] "
344                 "for thread_data %p\n",
345                 __kmp_get_gtid(), INITIAL_TASK_DEQUE_SIZE, thread_data));
346   thread_data->td.td_deque = (kmp_taskdata_t **)__kmp_allocate(
347       INITIAL_TASK_DEQUE_SIZE * sizeof(kmp_taskdata_t *));
348   thread_data->td.td_deque_size = INITIAL_TASK_DEQUE_SIZE;
349   return l;
350 }
351 
352 // The function finds the deque of priority tasks with given priority, or
353 // allocates a new deque and put it into sorted (high -> low) list of deques.
354 // Deques of non-default priority tasks are shared between all threads in team,
355 // as opposed to per-thread deques of tasks with default priority.
356 // The function is called under the lock task_team->tt.tt_task_pri_lock.
357 static kmp_thread_data_t *
358 __kmp_get_priority_deque_data(kmp_task_team_t *task_team, kmp_int32 pri) {
359   kmp_thread_data_t *thread_data;
360   kmp_task_pri_t *lst = task_team->tt.tt_task_pri_list;
361   if (lst->priority == pri) {
362     // Found queue of tasks with given priority.
363     thread_data = &lst->td;
364   } else if (lst->priority < pri) {
365     // All current priority queues contain tasks with lower priority.
366     // Allocate new one for given priority tasks.
367     kmp_task_pri_t *list = __kmp_alloc_task_pri_list();
368     thread_data = &list->td;
369     list->priority = pri;
370     list->next = lst;
371     task_team->tt.tt_task_pri_list = list;
372   } else { // task_team->tt.tt_task_pri_list->priority > pri
373     kmp_task_pri_t *next_queue = lst->next;
374     while (next_queue && next_queue->priority > pri) {
375       lst = next_queue;
376       next_queue = lst->next;
377     }
378     // lst->priority > pri && (next == NULL || pri >= next->priority)
379     if (next_queue == NULL) {
380       // No queue with pri priority, need to allocate new one.
381       kmp_task_pri_t *list = __kmp_alloc_task_pri_list();
382       thread_data = &list->td;
383       list->priority = pri;
384       list->next = NULL;
385       lst->next = list;
386     } else if (next_queue->priority == pri) {
387       // Found queue of tasks with given priority.
388       thread_data = &next_queue->td;
389     } else { // lst->priority > pri > next->priority
390       // insert newly allocated between existed queues
391       kmp_task_pri_t *list = __kmp_alloc_task_pri_list();
392       thread_data = &list->td;
393       list->priority = pri;
394       list->next = next_queue;
395       lst->next = list;
396     }
397   }
398   return thread_data;
399 }
400 
401 //  __kmp_push_priority_task: Add a task to the team's priority task deque
402 static kmp_int32 __kmp_push_priority_task(kmp_int32 gtid, kmp_info_t *thread,
403                                           kmp_taskdata_t *taskdata,
404                                           kmp_task_team_t *task_team,
405                                           kmp_int32 pri) {
406   kmp_thread_data_t *thread_data = NULL;
407   KA_TRACE(20,
408            ("__kmp_push_priority_task: T#%d trying to push task %p, pri %d.\n",
409             gtid, taskdata, pri));
410 
411   // Find task queue specific to priority value
412   kmp_task_pri_t *lst = task_team->tt.tt_task_pri_list;
413   if (UNLIKELY(lst == NULL)) {
414     __kmp_acquire_bootstrap_lock(&task_team->tt.tt_task_pri_lock);
415     if (task_team->tt.tt_task_pri_list == NULL) {
416       // List of queues is still empty, allocate one.
417       kmp_task_pri_t *list = __kmp_alloc_task_pri_list();
418       thread_data = &list->td;
419       list->priority = pri;
420       list->next = NULL;
421       task_team->tt.tt_task_pri_list = list;
422     } else {
423       // Other thread initialized a queue. Check if it fits and get thread_data.
424       thread_data = __kmp_get_priority_deque_data(task_team, pri);
425     }
426     __kmp_release_bootstrap_lock(&task_team->tt.tt_task_pri_lock);
427   } else {
428     if (lst->priority == pri) {
429       // Found queue of tasks with given priority.
430       thread_data = &lst->td;
431     } else {
432       __kmp_acquire_bootstrap_lock(&task_team->tt.tt_task_pri_lock);
433       thread_data = __kmp_get_priority_deque_data(task_team, pri);
434       __kmp_release_bootstrap_lock(&task_team->tt.tt_task_pri_lock);
435     }
436   }
437   KMP_DEBUG_ASSERT(thread_data);
438 
439   __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);
440   // Check if deque is full
441   if (TCR_4(thread_data->td.td_deque_ntasks) >=
442       TASK_DEQUE_SIZE(thread_data->td)) {
443     if (__kmp_enable_task_throttling &&
444         __kmp_task_is_allowed(gtid, __kmp_task_stealing_constraint, taskdata,
445                               thread->th.th_current_task)) {
446       __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
447       KA_TRACE(20, ("__kmp_push_priority_task: T#%d deque is full; returning "
448                     "TASK_NOT_PUSHED for task %p\n",
449                     gtid, taskdata));
450       return TASK_NOT_PUSHED;
451     } else {
452       // expand deque to push the task which is not allowed to execute
453       __kmp_realloc_task_deque(thread, thread_data);
454     }
455   }
456   KMP_DEBUG_ASSERT(TCR_4(thread_data->td.td_deque_ntasks) <
457                    TASK_DEQUE_SIZE(thread_data->td));
458   // Push taskdata.
459   thread_data->td.td_deque[thread_data->td.td_deque_tail] = taskdata;
460   // Wrap index.
461   thread_data->td.td_deque_tail =
462       (thread_data->td.td_deque_tail + 1) & TASK_DEQUE_MASK(thread_data->td);
463   TCW_4(thread_data->td.td_deque_ntasks,
464         TCR_4(thread_data->td.td_deque_ntasks) + 1); // Adjust task count
465   KMP_FSYNC_RELEASING(thread->th.th_current_task); // releasing self
466   KMP_FSYNC_RELEASING(taskdata); // releasing child
467   KA_TRACE(20, ("__kmp_push_priority_task: T#%d returning "
468                 "TASK_SUCCESSFULLY_PUSHED: task=%p ntasks=%d head=%u tail=%u\n",
469                 gtid, taskdata, thread_data->td.td_deque_ntasks,
470                 thread_data->td.td_deque_head, thread_data->td.td_deque_tail));
471   __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
472   task_team->tt.tt_num_task_pri++; // atomic inc
473   return TASK_SUCCESSFULLY_PUSHED;
474 }
475 
476 //  __kmp_push_task: Add a task to the thread's deque
477 static kmp_int32 __kmp_push_task(kmp_int32 gtid, kmp_task_t *task) {
478   kmp_info_t *thread = __kmp_threads[gtid];
479   kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
480 
481   // If we encounter a hidden helper task, and the current thread is not a
482   // hidden helper thread, we have to give the task to any hidden helper thread
483   // starting from its shadow one.
484   if (UNLIKELY(taskdata->td_flags.hidden_helper &&
485                !KMP_HIDDEN_HELPER_THREAD(gtid))) {
486     kmp_int32 shadow_gtid = KMP_GTID_TO_SHADOW_GTID(gtid);
487     __kmpc_give_task(task, __kmp_tid_from_gtid(shadow_gtid));
488     // Signal the hidden helper threads.
489     __kmp_hidden_helper_worker_thread_signal();
490     return TASK_SUCCESSFULLY_PUSHED;
491   }
492 
493   kmp_task_team_t *task_team = thread->th.th_task_team;
494   kmp_int32 tid = __kmp_tid_from_gtid(gtid);
495   kmp_thread_data_t *thread_data;
496 
497   KA_TRACE(20,
498            ("__kmp_push_task: T#%d trying to push task %p.\n", gtid, taskdata));
499 
500   if (UNLIKELY(taskdata->td_flags.tiedness == TASK_UNTIED)) {
501     // untied task needs to increment counter so that the task structure is not
502     // freed prematurely
503     kmp_int32 counter = 1 + KMP_ATOMIC_INC(&taskdata->td_untied_count);
504     KMP_DEBUG_USE_VAR(counter);
505     KA_TRACE(
506         20,
507         ("__kmp_push_task: T#%d untied_count (%d) incremented for task %p\n",
508          gtid, counter, taskdata));
509   }
510 
511   // The first check avoids building task_team thread data if serialized
512   if (UNLIKELY(taskdata->td_flags.task_serial)) {
513     KA_TRACE(20, ("__kmp_push_task: T#%d team serialized; returning "
514                   "TASK_NOT_PUSHED for task %p\n",
515                   gtid, taskdata));
516     return TASK_NOT_PUSHED;
517   }
518 
519   // Now that serialized tasks have returned, we can assume that we are not in
520   // immediate exec mode
521   KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);
522   if (UNLIKELY(!KMP_TASKING_ENABLED(task_team))) {
523     __kmp_enable_tasking(task_team, thread);
524   }
525   KMP_DEBUG_ASSERT(TCR_4(task_team->tt.tt_found_tasks) == TRUE);
526   KMP_DEBUG_ASSERT(TCR_PTR(task_team->tt.tt_threads_data) != NULL);
527 
528   if (taskdata->td_flags.priority_specified && task->data2.priority > 0 &&
529       __kmp_max_task_priority > 0) {
530     int pri = KMP_MIN(task->data2.priority, __kmp_max_task_priority);
531     return __kmp_push_priority_task(gtid, thread, taskdata, task_team, pri);
532   }
533 
534   // Find tasking deque specific to encountering thread
535   thread_data = &task_team->tt.tt_threads_data[tid];
536 
537   // No lock needed since only owner can allocate. If the task is hidden_helper,
538   // we don't need it either because we have initialized the dequeue for hidden
539   // helper thread data.
540   if (UNLIKELY(thread_data->td.td_deque == NULL)) {
541     __kmp_alloc_task_deque(thread, thread_data);
542   }
543 
544   int locked = 0;
545   // Check if deque is full
546   if (TCR_4(thread_data->td.td_deque_ntasks) >=
547       TASK_DEQUE_SIZE(thread_data->td)) {
548     if (__kmp_enable_task_throttling &&
549         __kmp_task_is_allowed(gtid, __kmp_task_stealing_constraint, taskdata,
550                               thread->th.th_current_task)) {
551       KA_TRACE(20, ("__kmp_push_task: T#%d deque is full; returning "
552                     "TASK_NOT_PUSHED for task %p\n",
553                     gtid, taskdata));
554       return TASK_NOT_PUSHED;
555     } else {
556       __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);
557       locked = 1;
558       if (TCR_4(thread_data->td.td_deque_ntasks) >=
559           TASK_DEQUE_SIZE(thread_data->td)) {
560         // expand deque to push the task which is not allowed to execute
561         __kmp_realloc_task_deque(thread, thread_data);
562       }
563     }
564   }
565   // Lock the deque for the task push operation
566   if (!locked) {
567     __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);
568     // Need to recheck as we can get a proxy task from thread outside of OpenMP
569     if (TCR_4(thread_data->td.td_deque_ntasks) >=
570         TASK_DEQUE_SIZE(thread_data->td)) {
571       if (__kmp_enable_task_throttling &&
572           __kmp_task_is_allowed(gtid, __kmp_task_stealing_constraint, taskdata,
573                                 thread->th.th_current_task)) {
574         __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
575         KA_TRACE(20, ("__kmp_push_task: T#%d deque is full on 2nd check; "
576                       "returning TASK_NOT_PUSHED for task %p\n",
577                       gtid, taskdata));
578         return TASK_NOT_PUSHED;
579       } else {
580         // expand deque to push the task which is not allowed to execute
581         __kmp_realloc_task_deque(thread, thread_data);
582       }
583     }
584   }
585   // Must have room since no thread can add tasks but calling thread
586   KMP_DEBUG_ASSERT(TCR_4(thread_data->td.td_deque_ntasks) <
587                    TASK_DEQUE_SIZE(thread_data->td));
588 
589   thread_data->td.td_deque[thread_data->td.td_deque_tail] =
590       taskdata; // Push taskdata
591   // Wrap index.
592   thread_data->td.td_deque_tail =
593       (thread_data->td.td_deque_tail + 1) & TASK_DEQUE_MASK(thread_data->td);
594   TCW_4(thread_data->td.td_deque_ntasks,
595         TCR_4(thread_data->td.td_deque_ntasks) + 1); // Adjust task count
596   KMP_FSYNC_RELEASING(thread->th.th_current_task); // releasing self
597   KMP_FSYNC_RELEASING(taskdata); // releasing child
598   KA_TRACE(20, ("__kmp_push_task: T#%d returning TASK_SUCCESSFULLY_PUSHED: "
599                 "task=%p ntasks=%d head=%u tail=%u\n",
600                 gtid, taskdata, thread_data->td.td_deque_ntasks,
601                 thread_data->td.td_deque_head, thread_data->td.td_deque_tail));
602 
603   __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
604 
605   return TASK_SUCCESSFULLY_PUSHED;
606 }
607 
608 // __kmp_pop_current_task_from_thread: set up current task from called thread
609 // when team ends
610 //
611 // this_thr: thread structure to set current_task in.
612 void __kmp_pop_current_task_from_thread(kmp_info_t *this_thr) {
613   KF_TRACE(10, ("__kmp_pop_current_task_from_thread(enter): T#%d "
614                 "this_thread=%p, curtask=%p, "
615                 "curtask_parent=%p\n",
616                 0, this_thr, this_thr->th.th_current_task,
617                 this_thr->th.th_current_task->td_parent));
618 
619   this_thr->th.th_current_task = this_thr->th.th_current_task->td_parent;
620 
621   KF_TRACE(10, ("__kmp_pop_current_task_from_thread(exit): T#%d "
622                 "this_thread=%p, curtask=%p, "
623                 "curtask_parent=%p\n",
624                 0, this_thr, this_thr->th.th_current_task,
625                 this_thr->th.th_current_task->td_parent));
626 }
627 
628 // __kmp_push_current_task_to_thread: set up current task in called thread for a
629 // new team
630 //
631 // this_thr: thread structure to set up
632 // team: team for implicit task data
633 // tid: thread within team to set up
634 void __kmp_push_current_task_to_thread(kmp_info_t *this_thr, kmp_team_t *team,
635                                        int tid) {
636   // current task of the thread is a parent of the new just created implicit
637   // tasks of new team
638   KF_TRACE(10, ("__kmp_push_current_task_to_thread(enter): T#%d this_thread=%p "
639                 "curtask=%p "
640                 "parent_task=%p\n",
641                 tid, this_thr, this_thr->th.th_current_task,
642                 team->t.t_implicit_task_taskdata[tid].td_parent));
643 
644   KMP_DEBUG_ASSERT(this_thr != NULL);
645 
646   if (tid == 0) {
647     if (this_thr->th.th_current_task != &team->t.t_implicit_task_taskdata[0]) {
648       team->t.t_implicit_task_taskdata[0].td_parent =
649           this_thr->th.th_current_task;
650       this_thr->th.th_current_task = &team->t.t_implicit_task_taskdata[0];
651     }
652   } else {
653     team->t.t_implicit_task_taskdata[tid].td_parent =
654         team->t.t_implicit_task_taskdata[0].td_parent;
655     this_thr->th.th_current_task = &team->t.t_implicit_task_taskdata[tid];
656   }
657 
658   KF_TRACE(10, ("__kmp_push_current_task_to_thread(exit): T#%d this_thread=%p "
659                 "curtask=%p "
660                 "parent_task=%p\n",
661                 tid, this_thr, this_thr->th.th_current_task,
662                 team->t.t_implicit_task_taskdata[tid].td_parent));
663 }
664 
665 // __kmp_task_start: bookkeeping for a task starting execution
666 //
667 // GTID: global thread id of calling thread
668 // task: task starting execution
669 // current_task: task suspending
670 static void __kmp_task_start(kmp_int32 gtid, kmp_task_t *task,
671                              kmp_taskdata_t *current_task) {
672   kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
673   kmp_info_t *thread = __kmp_threads[gtid];
674 
675   KA_TRACE(10,
676            ("__kmp_task_start(enter): T#%d starting task %p: current_task=%p\n",
677             gtid, taskdata, current_task));
678 
679   KMP_DEBUG_ASSERT(taskdata->td_flags.tasktype == TASK_EXPLICIT);
680 
681   // mark currently executing task as suspended
682   // TODO: GEH - make sure root team implicit task is initialized properly.
683   // KMP_DEBUG_ASSERT( current_task -> td_flags.executing == 1 );
684   current_task->td_flags.executing = 0;
685 
686 // Add task to stack if tied
687 #ifdef BUILD_TIED_TASK_STACK
688   if (taskdata->td_flags.tiedness == TASK_TIED) {
689     __kmp_push_task_stack(gtid, thread, taskdata);
690   }
691 #endif /* BUILD_TIED_TASK_STACK */
692 
693   // mark starting task as executing and as current task
694   thread->th.th_current_task = taskdata;
695 
696   KMP_DEBUG_ASSERT(taskdata->td_flags.started == 0 ||
697                    taskdata->td_flags.tiedness == TASK_UNTIED);
698   KMP_DEBUG_ASSERT(taskdata->td_flags.executing == 0 ||
699                    taskdata->td_flags.tiedness == TASK_UNTIED);
700   taskdata->td_flags.started = 1;
701   taskdata->td_flags.executing = 1;
702   KMP_DEBUG_ASSERT(taskdata->td_flags.complete == 0);
703   KMP_DEBUG_ASSERT(taskdata->td_flags.freed == 0);
704 
705   // GEH TODO: shouldn't we pass some sort of location identifier here?
706   // APT: yes, we will pass location here.
707   // need to store current thread state (in a thread or taskdata structure)
708   // before setting work_state, otherwise wrong state is set after end of task
709 
710   KA_TRACE(10, ("__kmp_task_start(exit): T#%d task=%p\n", gtid, taskdata));
711 
712   return;
713 }
714 
715 #if OMPT_SUPPORT
716 //------------------------------------------------------------------------------
717 // __ompt_task_init:
718 //   Initialize OMPT fields maintained by a task. This will only be called after
719 //   ompt_start_tool, so we already know whether ompt is enabled or not.
720 
721 static inline void __ompt_task_init(kmp_taskdata_t *task, int tid) {
722   // The calls to __ompt_task_init already have the ompt_enabled condition.
723   task->ompt_task_info.task_data.value = 0;
724   task->ompt_task_info.frame.exit_frame = ompt_data_none;
725   task->ompt_task_info.frame.enter_frame = ompt_data_none;
726   task->ompt_task_info.frame.exit_frame_flags =
727       ompt_frame_runtime | ompt_frame_framepointer;
728   task->ompt_task_info.frame.enter_frame_flags =
729       ompt_frame_runtime | ompt_frame_framepointer;
730   task->ompt_task_info.dispatch_chunk.start = 0;
731   task->ompt_task_info.dispatch_chunk.iterations = 0;
732 }
733 
734 // __ompt_task_start:
735 //   Build and trigger task-begin event
736 static inline void __ompt_task_start(kmp_task_t *task,
737                                      kmp_taskdata_t *current_task,
738                                      kmp_int32 gtid) {
739   kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
740   ompt_task_status_t status = ompt_task_switch;
741   if (__kmp_threads[gtid]->th.ompt_thread_info.ompt_task_yielded) {
742     status = ompt_task_yield;
743     __kmp_threads[gtid]->th.ompt_thread_info.ompt_task_yielded = 0;
744   }
745   /* let OMPT know that we're about to run this task */
746   if (ompt_enabled.ompt_callback_task_schedule) {
747     ompt_callbacks.ompt_callback(ompt_callback_task_schedule)(
748         &(current_task->ompt_task_info.task_data), status,
749         &(taskdata->ompt_task_info.task_data));
750   }
751   taskdata->ompt_task_info.scheduling_parent = current_task;
752 }
753 
754 // __ompt_task_finish:
755 //   Build and trigger final task-schedule event
756 static inline void __ompt_task_finish(kmp_task_t *task,
757                                       kmp_taskdata_t *resumed_task,
758                                       ompt_task_status_t status) {
759   if (ompt_enabled.ompt_callback_task_schedule) {
760     kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
761     if (__kmp_omp_cancellation && taskdata->td_taskgroup &&
762         taskdata->td_taskgroup->cancel_request == cancel_taskgroup) {
763       status = ompt_task_cancel;
764     }
765 
766     /* let OMPT know that we're returning to the callee task */
767     ompt_callbacks.ompt_callback(ompt_callback_task_schedule)(
768         &(taskdata->ompt_task_info.task_data), status,
769         (resumed_task ? &(resumed_task->ompt_task_info.task_data) : NULL));
770   }
771 }
772 #endif
773 
774 template <bool ompt>
775 static void __kmpc_omp_task_begin_if0_template(ident_t *loc_ref, kmp_int32 gtid,
776                                                kmp_task_t *task,
777                                                void *frame_address,
778                                                void *return_address) {
779   kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
780   kmp_taskdata_t *current_task = __kmp_threads[gtid]->th.th_current_task;
781 
782   KA_TRACE(10, ("__kmpc_omp_task_begin_if0(enter): T#%d loc=%p task=%p "
783                 "current_task=%p\n",
784                 gtid, loc_ref, taskdata, current_task));
785 
786   if (UNLIKELY(taskdata->td_flags.tiedness == TASK_UNTIED)) {
787     // untied task needs to increment counter so that the task structure is not
788     // freed prematurely
789     kmp_int32 counter = 1 + KMP_ATOMIC_INC(&taskdata->td_untied_count);
790     KMP_DEBUG_USE_VAR(counter);
791     KA_TRACE(20, ("__kmpc_omp_task_begin_if0: T#%d untied_count (%d) "
792                   "incremented for task %p\n",
793                   gtid, counter, taskdata));
794   }
795 
796   taskdata->td_flags.task_serial =
797       1; // Execute this task immediately, not deferred.
798   __kmp_task_start(gtid, task, current_task);
799 
800 #if OMPT_SUPPORT
801   if (ompt) {
802     if (current_task->ompt_task_info.frame.enter_frame.ptr == NULL) {
803       current_task->ompt_task_info.frame.enter_frame.ptr =
804           taskdata->ompt_task_info.frame.exit_frame.ptr = frame_address;
805       current_task->ompt_task_info.frame.enter_frame_flags =
806           taskdata->ompt_task_info.frame.exit_frame_flags =
807               ompt_frame_application | ompt_frame_framepointer;
808     }
809     if (ompt_enabled.ompt_callback_task_create) {
810       ompt_task_info_t *parent_info = &(current_task->ompt_task_info);
811       ompt_callbacks.ompt_callback(ompt_callback_task_create)(
812           &(parent_info->task_data), &(parent_info->frame),
813           &(taskdata->ompt_task_info.task_data),
814           TASK_TYPE_DETAILS_FORMAT(taskdata), 0, return_address);
815     }
816     __ompt_task_start(task, current_task, gtid);
817   }
818 #endif // OMPT_SUPPORT
819 
820   KA_TRACE(10, ("__kmpc_omp_task_begin_if0(exit): T#%d loc=%p task=%p,\n", gtid,
821                 loc_ref, taskdata));
822 }
823 
824 #if OMPT_SUPPORT
825 OMPT_NOINLINE
826 static void __kmpc_omp_task_begin_if0_ompt(ident_t *loc_ref, kmp_int32 gtid,
827                                            kmp_task_t *task,
828                                            void *frame_address,
829                                            void *return_address) {
830   __kmpc_omp_task_begin_if0_template<true>(loc_ref, gtid, task, frame_address,
831                                            return_address);
832 }
833 #endif // OMPT_SUPPORT
834 
835 // __kmpc_omp_task_begin_if0: report that a given serialized task has started
836 // execution
837 //
838 // loc_ref: source location information; points to beginning of task block.
839 // gtid: global thread number.
840 // task: task thunk for the started task.
841 #ifdef __s390x__
842 // This is required for OMPT_GET_FRAME_ADDRESS(1) to compile on s390x.
843 // In order for it to work correctly, the caller also needs to be compiled with
844 // backchain. If a caller is compiled without backchain,
845 // OMPT_GET_FRAME_ADDRESS(1) will produce an incorrect value, but will not
846 // crash.
847 __attribute__((target("backchain")))
848 #endif
849 void __kmpc_omp_task_begin_if0(ident_t *loc_ref, kmp_int32 gtid,
850                                kmp_task_t *task) {
851 #if OMPT_SUPPORT
852   if (UNLIKELY(ompt_enabled.enabled)) {
853     OMPT_STORE_RETURN_ADDRESS(gtid);
854     __kmpc_omp_task_begin_if0_ompt(loc_ref, gtid, task,
855                                    OMPT_GET_FRAME_ADDRESS(1),
856                                    OMPT_LOAD_RETURN_ADDRESS(gtid));
857     return;
858   }
859 #endif
860   __kmpc_omp_task_begin_if0_template<false>(loc_ref, gtid, task, NULL, NULL);
861 }
862 
863 #ifdef TASK_UNUSED
864 // __kmpc_omp_task_begin: report that a given task has started execution
865 // NEVER GENERATED BY COMPILER, DEPRECATED!!!
866 void __kmpc_omp_task_begin(ident_t *loc_ref, kmp_int32 gtid, kmp_task_t *task) {
867   kmp_taskdata_t *current_task = __kmp_threads[gtid]->th.th_current_task;
868 
869   KA_TRACE(
870       10,
871       ("__kmpc_omp_task_begin(enter): T#%d loc=%p task=%p current_task=%p\n",
872        gtid, loc_ref, KMP_TASK_TO_TASKDATA(task), current_task));
873 
874   __kmp_task_start(gtid, task, current_task);
875 
876   KA_TRACE(10, ("__kmpc_omp_task_begin(exit): T#%d loc=%p task=%p,\n", gtid,
877                 loc_ref, KMP_TASK_TO_TASKDATA(task)));
878   return;
879 }
880 #endif // TASK_UNUSED
881 
882 // __kmp_free_task: free the current task space and the space for shareds
883 //
884 // gtid: Global thread ID of calling thread
885 // taskdata: task to free
886 // thread: thread data structure of caller
887 static void __kmp_free_task(kmp_int32 gtid, kmp_taskdata_t *taskdata,
888                             kmp_info_t *thread) {
889   KA_TRACE(30, ("__kmp_free_task: T#%d freeing data from task %p\n", gtid,
890                 taskdata));
891 
892   // Check to make sure all flags and counters have the correct values
893   KMP_DEBUG_ASSERT(taskdata->td_flags.tasktype == TASK_EXPLICIT);
894   KMP_DEBUG_ASSERT(taskdata->td_flags.executing == 0);
895   KMP_DEBUG_ASSERT(taskdata->td_flags.complete == 1);
896   KMP_DEBUG_ASSERT(taskdata->td_flags.freed == 0);
897   KMP_DEBUG_ASSERT(taskdata->td_allocated_child_tasks == 0 ||
898                    taskdata->td_flags.task_serial == 1);
899   KMP_DEBUG_ASSERT(taskdata->td_incomplete_child_tasks == 0);
900   kmp_task_t *task = KMP_TASKDATA_TO_TASK(taskdata);
901   // Clear data to not be re-used later by mistake.
902   task->data1.destructors = NULL;
903   task->data2.priority = 0;
904 
905   taskdata->td_flags.freed = 1;
906 #if OMPX_TASKGRAPH
907   // do not free tasks in taskgraph
908   if (!taskdata->is_taskgraph) {
909 #endif
910 // deallocate the taskdata and shared variable blocks associated with this task
911 #if USE_FAST_MEMORY
912   __kmp_fast_free(thread, taskdata);
913 #else /* ! USE_FAST_MEMORY */
914   __kmp_thread_free(thread, taskdata);
915 #endif
916 #if OMPX_TASKGRAPH
917   } else {
918     taskdata->td_flags.complete = 0;
919     taskdata->td_flags.started = 0;
920     taskdata->td_flags.freed = 0;
921     taskdata->td_flags.executing = 0;
922     taskdata->td_flags.task_serial =
923         (taskdata->td_parent->td_flags.final ||
924           taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser);
925 
926     // taskdata->td_allow_completion_event.pending_events_count = 1;
927     KMP_ATOMIC_ST_RLX(&taskdata->td_untied_count, 0);
928     KMP_ATOMIC_ST_RLX(&taskdata->td_incomplete_child_tasks, 0);
929     // start at one because counts current task and children
930     KMP_ATOMIC_ST_RLX(&taskdata->td_allocated_child_tasks, 1);
931   }
932 #endif
933 
934   KA_TRACE(20, ("__kmp_free_task: T#%d freed task %p\n", gtid, taskdata));
935 }
936 
937 // __kmp_free_task_and_ancestors: free the current task and ancestors without
938 // children
939 //
940 // gtid: Global thread ID of calling thread
941 // taskdata: task to free
942 // thread: thread data structure of caller
943 static void __kmp_free_task_and_ancestors(kmp_int32 gtid,
944                                           kmp_taskdata_t *taskdata,
945                                           kmp_info_t *thread) {
946   // Proxy tasks must always be allowed to free their parents
947   // because they can be run in background even in serial mode.
948   kmp_int32 team_serial =
949       (taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser) &&
950       !taskdata->td_flags.proxy;
951   KMP_DEBUG_ASSERT(taskdata->td_flags.tasktype == TASK_EXPLICIT);
952 
953   kmp_int32 children = KMP_ATOMIC_DEC(&taskdata->td_allocated_child_tasks) - 1;
954   KMP_DEBUG_ASSERT(children >= 0);
955 
956   // Now, go up the ancestor tree to see if any ancestors can now be freed.
957   while (children == 0) {
958     kmp_taskdata_t *parent_taskdata = taskdata->td_parent;
959 
960     KA_TRACE(20, ("__kmp_free_task_and_ancestors(enter): T#%d task %p complete "
961                   "and freeing itself\n",
962                   gtid, taskdata));
963 
964     // --- Deallocate my ancestor task ---
965     __kmp_free_task(gtid, taskdata, thread);
966 
967     taskdata = parent_taskdata;
968 
969     if (team_serial)
970       return;
971     // Stop checking ancestors at implicit task instead of walking up ancestor
972     // tree to avoid premature deallocation of ancestors.
973     if (taskdata->td_flags.tasktype == TASK_IMPLICIT) {
974       if (taskdata->td_dephash) { // do we need to cleanup dephash?
975         int children = KMP_ATOMIC_LD_ACQ(&taskdata->td_incomplete_child_tasks);
976         kmp_tasking_flags_t flags_old = taskdata->td_flags;
977         if (children == 0 && flags_old.complete == 1) {
978           kmp_tasking_flags_t flags_new = flags_old;
979           flags_new.complete = 0;
980           if (KMP_COMPARE_AND_STORE_ACQ32(
981                   RCAST(kmp_int32 *, &taskdata->td_flags),
982                   *RCAST(kmp_int32 *, &flags_old),
983                   *RCAST(kmp_int32 *, &flags_new))) {
984             KA_TRACE(100, ("__kmp_free_task_and_ancestors: T#%d cleans "
985                            "dephash of implicit task %p\n",
986                            gtid, taskdata));
987             // cleanup dephash of finished implicit task
988             __kmp_dephash_free_entries(thread, taskdata->td_dephash);
989           }
990         }
991       }
992       return;
993     }
994     // Predecrement simulated by "- 1" calculation
995     children = KMP_ATOMIC_DEC(&taskdata->td_allocated_child_tasks) - 1;
996     KMP_DEBUG_ASSERT(children >= 0);
997   }
998 
999   KA_TRACE(
1000       20, ("__kmp_free_task_and_ancestors(exit): T#%d task %p has %d children; "
1001            "not freeing it yet\n",
1002            gtid, taskdata, children));
1003 }
1004 
1005 // Only need to keep track of child task counts if any of the following:
1006 // 1. team parallel and tasking not serialized;
1007 // 2. it is a proxy or detachable or hidden helper task
1008 // 3. the children counter of its parent task is greater than 0.
1009 // The reason for the 3rd one is for serialized team that found detached task,
1010 // hidden helper task, T. In this case, the execution of T is still deferred,
1011 // and it is also possible that a regular task depends on T. In this case, if we
1012 // don't track the children, task synchronization will be broken.
1013 static bool __kmp_track_children_task(kmp_taskdata_t *taskdata) {
1014   kmp_tasking_flags_t flags = taskdata->td_flags;
1015   bool ret = !(flags.team_serial || flags.tasking_ser);
1016   ret = ret || flags.proxy == TASK_PROXY ||
1017         flags.detachable == TASK_DETACHABLE || flags.hidden_helper;
1018   ret = ret ||
1019         KMP_ATOMIC_LD_ACQ(&taskdata->td_parent->td_incomplete_child_tasks) > 0;
1020 #if OMPX_TASKGRAPH
1021   if (taskdata->td_taskgroup && taskdata->is_taskgraph)
1022     ret = ret || KMP_ATOMIC_LD_ACQ(&taskdata->td_taskgroup->count) > 0;
1023 #endif
1024   return ret;
1025 }
1026 
1027 // __kmp_task_finish: bookkeeping to do when a task finishes execution
1028 //
1029 // gtid: global thread ID for calling thread
1030 // task: task to be finished
1031 // resumed_task: task to be resumed.  (may be NULL if task is serialized)
1032 //
1033 // template<ompt>: effectively ompt_enabled.enabled!=0
1034 // the version with ompt=false is inlined, allowing to optimize away all ompt
1035 // code in this case
1036 template <bool ompt>
1037 static void __kmp_task_finish(kmp_int32 gtid, kmp_task_t *task,
1038                               kmp_taskdata_t *resumed_task) {
1039   kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
1040   kmp_info_t *thread = __kmp_threads[gtid];
1041   kmp_task_team_t *task_team =
1042       thread->th.th_task_team; // might be NULL for serial teams...
1043 #if OMPX_TASKGRAPH
1044   // to avoid seg fault when we need to access taskdata->td_flags after free when using vanilla taskloop
1045   bool is_taskgraph;
1046 #endif
1047 #if KMP_DEBUG
1048   kmp_int32 children = 0;
1049 #endif
1050   KA_TRACE(10, ("__kmp_task_finish(enter): T#%d finishing task %p and resuming "
1051                 "task %p\n",
1052                 gtid, taskdata, resumed_task));
1053 
1054   KMP_DEBUG_ASSERT(taskdata->td_flags.tasktype == TASK_EXPLICIT);
1055 
1056 #if OMPX_TASKGRAPH
1057   is_taskgraph = taskdata->is_taskgraph;
1058 #endif
1059 
1060 // Pop task from stack if tied
1061 #ifdef BUILD_TIED_TASK_STACK
1062   if (taskdata->td_flags.tiedness == TASK_TIED) {
1063     __kmp_pop_task_stack(gtid, thread, taskdata);
1064   }
1065 #endif /* BUILD_TIED_TASK_STACK */
1066 
1067   if (UNLIKELY(taskdata->td_flags.tiedness == TASK_UNTIED)) {
1068     // untied task needs to check the counter so that the task structure is not
1069     // freed prematurely
1070     kmp_int32 counter = KMP_ATOMIC_DEC(&taskdata->td_untied_count) - 1;
1071     KA_TRACE(
1072         20,
1073         ("__kmp_task_finish: T#%d untied_count (%d) decremented for task %p\n",
1074          gtid, counter, taskdata));
1075     if (counter > 0) {
1076       // untied task is not done, to be continued possibly by other thread, do
1077       // not free it now
1078       if (resumed_task == NULL) {
1079         KMP_DEBUG_ASSERT(taskdata->td_flags.task_serial);
1080         resumed_task = taskdata->td_parent; // In a serialized task, the resumed
1081         // task is the parent
1082       }
1083       thread->th.th_current_task = resumed_task; // restore current_task
1084       resumed_task->td_flags.executing = 1; // resume previous task
1085       KA_TRACE(10, ("__kmp_task_finish(exit): T#%d partially done task %p, "
1086                     "resuming task %p\n",
1087                     gtid, taskdata, resumed_task));
1088       return;
1089     }
1090   }
1091 
1092   // bookkeeping for resuming task:
1093   // GEH - note tasking_ser => task_serial
1094   KMP_DEBUG_ASSERT(
1095       (taskdata->td_flags.tasking_ser || taskdata->td_flags.task_serial) ==
1096       taskdata->td_flags.task_serial);
1097   if (taskdata->td_flags.task_serial) {
1098     if (resumed_task == NULL) {
1099       resumed_task = taskdata->td_parent; // In a serialized task, the resumed
1100       // task is the parent
1101     }
1102   } else {
1103     KMP_DEBUG_ASSERT(resumed_task !=
1104                      NULL); // verify that resumed task is passed as argument
1105   }
1106 
1107   /* If the tasks' destructor thunk flag has been set, we need to invoke the
1108      destructor thunk that has been generated by the compiler. The code is
1109      placed here, since at this point other tasks might have been released
1110      hence overlapping the destructor invocations with some other work in the
1111      released tasks.  The OpenMP spec is not specific on when the destructors
1112      are invoked, so we should be free to choose. */
1113   if (UNLIKELY(taskdata->td_flags.destructors_thunk)) {
1114     kmp_routine_entry_t destr_thunk = task->data1.destructors;
1115     KMP_ASSERT(destr_thunk);
1116     destr_thunk(gtid, task);
1117   }
1118 
1119   KMP_DEBUG_ASSERT(taskdata->td_flags.complete == 0);
1120   KMP_DEBUG_ASSERT(taskdata->td_flags.started == 1);
1121   KMP_DEBUG_ASSERT(taskdata->td_flags.freed == 0);
1122 
1123   bool completed = true;
1124   if (UNLIKELY(taskdata->td_flags.detachable == TASK_DETACHABLE)) {
1125     if (taskdata->td_allow_completion_event.type ==
1126         KMP_EVENT_ALLOW_COMPLETION) {
1127       // event hasn't been fulfilled yet. Try to detach task.
1128       __kmp_acquire_tas_lock(&taskdata->td_allow_completion_event.lock, gtid);
1129       if (taskdata->td_allow_completion_event.type ==
1130           KMP_EVENT_ALLOW_COMPLETION) {
1131         // task finished execution
1132         KMP_DEBUG_ASSERT(taskdata->td_flags.executing == 1);
1133         taskdata->td_flags.executing = 0; // suspend the finishing task
1134 
1135 #if OMPT_SUPPORT
1136         // For a detached task, which is not completed, we switch back
1137         // the omp_fulfill_event signals completion
1138         // locking is necessary to avoid a race with ompt_task_late_fulfill
1139         if (ompt)
1140           __ompt_task_finish(task, resumed_task, ompt_task_detach);
1141 #endif
1142 
1143         // no access to taskdata after this point!
1144         // __kmp_fulfill_event might free taskdata at any time from now
1145 
1146         taskdata->td_flags.proxy = TASK_PROXY; // proxify!
1147         completed = false;
1148       }
1149       __kmp_release_tas_lock(&taskdata->td_allow_completion_event.lock, gtid);
1150     }
1151   }
1152 
1153   // Tasks with valid target async handles must be re-enqueued.
1154   if (taskdata->td_target_data.async_handle != NULL) {
1155     // Note: no need to translate gtid to its shadow. If the current thread is a
1156     // hidden helper one, then the gtid is already correct. Otherwise, hidden
1157     // helper threads are disabled, and gtid refers to a OpenMP thread.
1158 #if OMPT_SUPPORT
1159     if (ompt) {
1160       __ompt_task_finish(task, resumed_task, ompt_task_switch);
1161     }
1162 #endif
1163     __kmpc_give_task(task, __kmp_tid_from_gtid(gtid));
1164     if (KMP_HIDDEN_HELPER_THREAD(gtid))
1165       __kmp_hidden_helper_worker_thread_signal();
1166     completed = false;
1167   }
1168 
1169   if (completed) {
1170     taskdata->td_flags.complete = 1; // mark the task as completed
1171 #if OMPX_TASKGRAPH
1172     taskdata->td_flags.onced = 1; // mark the task as ran once already
1173 #endif
1174 
1175 #if OMPT_SUPPORT
1176     // This is not a detached task, we are done here
1177     if (ompt)
1178       __ompt_task_finish(task, resumed_task, ompt_task_complete);
1179 #endif
1180     // TODO: What would be the balance between the conditions in the function
1181     // and an atomic operation?
1182     if (__kmp_track_children_task(taskdata)) {
1183       __kmp_release_deps(gtid, taskdata);
1184       // Predecrement simulated by "- 1" calculation
1185 #if KMP_DEBUG
1186       children = -1 +
1187 #endif
1188           KMP_ATOMIC_DEC(&taskdata->td_parent->td_incomplete_child_tasks);
1189       KMP_DEBUG_ASSERT(children >= 0);
1190 #if OMPX_TASKGRAPH
1191       if (taskdata->td_taskgroup && !taskdata->is_taskgraph)
1192 #else
1193       if (taskdata->td_taskgroup)
1194 #endif
1195         KMP_ATOMIC_DEC(&taskdata->td_taskgroup->count);
1196     } else if (task_team && (task_team->tt.tt_found_proxy_tasks ||
1197                              task_team->tt.tt_hidden_helper_task_encountered)) {
1198       // if we found proxy or hidden helper tasks there could exist a dependency
1199       // chain with the proxy task as origin
1200       __kmp_release_deps(gtid, taskdata);
1201     }
1202     // td_flags.executing must be marked as 0 after __kmp_release_deps has been
1203     // called. Othertwise, if a task is executed immediately from the
1204     // release_deps code, the flag will be reset to 1 again by this same
1205     // function
1206     KMP_DEBUG_ASSERT(taskdata->td_flags.executing == 1);
1207     taskdata->td_flags.executing = 0; // suspend the finishing task
1208 
1209     // Decrement the counter of hidden helper tasks to be executed.
1210     if (taskdata->td_flags.hidden_helper) {
1211       // Hidden helper tasks can only be executed by hidden helper threads.
1212       KMP_ASSERT(KMP_HIDDEN_HELPER_THREAD(gtid));
1213       KMP_ATOMIC_DEC(&__kmp_unexecuted_hidden_helper_tasks);
1214     }
1215   }
1216 
1217   KA_TRACE(
1218       20, ("__kmp_task_finish: T#%d finished task %p, %d incomplete children\n",
1219            gtid, taskdata, children));
1220 
1221   // Free this task and then ancestor tasks if they have no children.
1222   // Restore th_current_task first as suggested by John:
1223   // johnmc: if an asynchronous inquiry peers into the runtime system
1224   // it doesn't see the freed task as the current task.
1225   thread->th.th_current_task = resumed_task;
1226   if (completed)
1227     __kmp_free_task_and_ancestors(gtid, taskdata, thread);
1228 
1229   // TODO: GEH - make sure root team implicit task is initialized properly.
1230   // KMP_DEBUG_ASSERT( resumed_task->td_flags.executing == 0 );
1231   resumed_task->td_flags.executing = 1; // resume previous task
1232 
1233 #if OMPX_TASKGRAPH
1234   if (is_taskgraph && __kmp_track_children_task(taskdata) &&
1235       taskdata->td_taskgroup) {
1236     // TDG: we only release taskgroup barrier here because
1237     // free_task_and_ancestors will call
1238     // __kmp_free_task, which resets all task parameters such as
1239     // taskdata->started, etc. If we release the barrier earlier, these
1240     // parameters could be read before being reset. This is not an issue for
1241     // non-TDG implementation because we never reuse a task(data) structure
1242     KMP_ATOMIC_DEC(&taskdata->td_taskgroup->count);
1243   }
1244 #endif
1245 
1246   KA_TRACE(
1247       10, ("__kmp_task_finish(exit): T#%d finished task %p, resuming task %p\n",
1248            gtid, taskdata, resumed_task));
1249 
1250   return;
1251 }
1252 
1253 template <bool ompt>
1254 static void __kmpc_omp_task_complete_if0_template(ident_t *loc_ref,
1255                                                   kmp_int32 gtid,
1256                                                   kmp_task_t *task) {
1257   KA_TRACE(10, ("__kmpc_omp_task_complete_if0(enter): T#%d loc=%p task=%p\n",
1258                 gtid, loc_ref, KMP_TASK_TO_TASKDATA(task)));
1259   KMP_DEBUG_ASSERT(gtid >= 0);
1260   // this routine will provide task to resume
1261   __kmp_task_finish<ompt>(gtid, task, NULL);
1262 
1263   KA_TRACE(10, ("__kmpc_omp_task_complete_if0(exit): T#%d loc=%p task=%p\n",
1264                 gtid, loc_ref, KMP_TASK_TO_TASKDATA(task)));
1265 
1266 #if OMPT_SUPPORT
1267   if (ompt) {
1268     ompt_frame_t *ompt_frame;
1269     __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
1270     ompt_frame->enter_frame = ompt_data_none;
1271     ompt_frame->enter_frame_flags =
1272         ompt_frame_runtime | ompt_frame_framepointer;
1273   }
1274 #endif
1275 
1276   return;
1277 }
1278 
1279 #if OMPT_SUPPORT
1280 OMPT_NOINLINE
1281 void __kmpc_omp_task_complete_if0_ompt(ident_t *loc_ref, kmp_int32 gtid,
1282                                        kmp_task_t *task) {
1283   __kmpc_omp_task_complete_if0_template<true>(loc_ref, gtid, task);
1284 }
1285 #endif // OMPT_SUPPORT
1286 
1287 // __kmpc_omp_task_complete_if0: report that a task has completed execution
1288 //
1289 // loc_ref: source location information; points to end of task block.
1290 // gtid: global thread number.
1291 // task: task thunk for the completed task.
1292 void __kmpc_omp_task_complete_if0(ident_t *loc_ref, kmp_int32 gtid,
1293                                   kmp_task_t *task) {
1294 #if OMPT_SUPPORT
1295   if (UNLIKELY(ompt_enabled.enabled)) {
1296     __kmpc_omp_task_complete_if0_ompt(loc_ref, gtid, task);
1297     return;
1298   }
1299 #endif
1300   __kmpc_omp_task_complete_if0_template<false>(loc_ref, gtid, task);
1301 }
1302 
1303 #ifdef TASK_UNUSED
1304 // __kmpc_omp_task_complete: report that a task has completed execution
1305 // NEVER GENERATED BY COMPILER, DEPRECATED!!!
1306 void __kmpc_omp_task_complete(ident_t *loc_ref, kmp_int32 gtid,
1307                               kmp_task_t *task) {
1308   KA_TRACE(10, ("__kmpc_omp_task_complete(enter): T#%d loc=%p task=%p\n", gtid,
1309                 loc_ref, KMP_TASK_TO_TASKDATA(task)));
1310 
1311   __kmp_task_finish<false>(gtid, task,
1312                            NULL); // Not sure how to find task to resume
1313 
1314   KA_TRACE(10, ("__kmpc_omp_task_complete(exit): T#%d loc=%p task=%p\n", gtid,
1315                 loc_ref, KMP_TASK_TO_TASKDATA(task)));
1316   return;
1317 }
1318 #endif // TASK_UNUSED
1319 
1320 // __kmp_init_implicit_task: Initialize the appropriate fields in the implicit
1321 // task for a given thread
1322 //
1323 // loc_ref:  reference to source location of parallel region
1324 // this_thr:  thread data structure corresponding to implicit task
1325 // team: team for this_thr
1326 // tid: thread id of given thread within team
1327 // set_curr_task: TRUE if need to push current task to thread
1328 // NOTE: Routine does not set up the implicit task ICVS.  This is assumed to
1329 // have already been done elsewhere.
1330 // TODO: Get better loc_ref.  Value passed in may be NULL
1331 void __kmp_init_implicit_task(ident_t *loc_ref, kmp_info_t *this_thr,
1332                               kmp_team_t *team, int tid, int set_curr_task) {
1333   kmp_taskdata_t *task = &team->t.t_implicit_task_taskdata[tid];
1334 
1335   KF_TRACE(
1336       10,
1337       ("__kmp_init_implicit_task(enter): T#:%d team=%p task=%p, reinit=%s\n",
1338        tid, team, task, set_curr_task ? "TRUE" : "FALSE"));
1339 
1340   task->td_task_id = KMP_GEN_TASK_ID();
1341   task->td_team = team;
1342   //    task->td_parent   = NULL;  // fix for CQ230101 (broken parent task info
1343   //    in debugger)
1344   task->td_ident = loc_ref;
1345   task->td_taskwait_ident = NULL;
1346   task->td_taskwait_counter = 0;
1347   task->td_taskwait_thread = 0;
1348 
1349   task->td_flags.tiedness = TASK_TIED;
1350   task->td_flags.tasktype = TASK_IMPLICIT;
1351   task->td_flags.proxy = TASK_FULL;
1352 
1353   // All implicit tasks are executed immediately, not deferred
1354   task->td_flags.task_serial = 1;
1355   task->td_flags.tasking_ser = (__kmp_tasking_mode == tskm_immediate_exec);
1356   task->td_flags.team_serial = (team->t.t_serialized) ? 1 : 0;
1357 
1358   task->td_flags.started = 1;
1359   task->td_flags.executing = 1;
1360   task->td_flags.complete = 0;
1361   task->td_flags.freed = 0;
1362 #if OMPX_TASKGRAPH
1363   task->td_flags.onced = 0;
1364 #endif
1365 
1366   task->td_depnode = NULL;
1367   task->td_last_tied = task;
1368   task->td_allow_completion_event.type = KMP_EVENT_UNINITIALIZED;
1369 
1370   if (set_curr_task) { // only do this init first time thread is created
1371     KMP_ATOMIC_ST_REL(&task->td_incomplete_child_tasks, 0);
1372     // Not used: don't need to deallocate implicit task
1373     KMP_ATOMIC_ST_REL(&task->td_allocated_child_tasks, 0);
1374     task->td_taskgroup = NULL; // An implicit task does not have taskgroup
1375     task->td_dephash = NULL;
1376     __kmp_push_current_task_to_thread(this_thr, team, tid);
1377   } else {
1378     KMP_DEBUG_ASSERT(task->td_incomplete_child_tasks == 0);
1379     KMP_DEBUG_ASSERT(task->td_allocated_child_tasks == 0);
1380   }
1381 
1382 #if OMPT_SUPPORT
1383   if (UNLIKELY(ompt_enabled.enabled))
1384     __ompt_task_init(task, tid);
1385 #endif
1386 
1387   KF_TRACE(10, ("__kmp_init_implicit_task(exit): T#:%d team=%p task=%p\n", tid,
1388                 team, task));
1389 }
1390 
1391 // __kmp_finish_implicit_task: Release resources associated to implicit tasks
1392 // at the end of parallel regions. Some resources are kept for reuse in the next
1393 // parallel region.
1394 //
1395 // thread:  thread data structure corresponding to implicit task
1396 void __kmp_finish_implicit_task(kmp_info_t *thread) {
1397   kmp_taskdata_t *task = thread->th.th_current_task;
1398   if (task->td_dephash) {
1399     int children;
1400     task->td_flags.complete = 1;
1401 #if OMPX_TASKGRAPH
1402     task->td_flags.onced = 1;
1403 #endif
1404     children = KMP_ATOMIC_LD_ACQ(&task->td_incomplete_child_tasks);
1405     kmp_tasking_flags_t flags_old = task->td_flags;
1406     if (children == 0 && flags_old.complete == 1) {
1407       kmp_tasking_flags_t flags_new = flags_old;
1408       flags_new.complete = 0;
1409       if (KMP_COMPARE_AND_STORE_ACQ32(RCAST(kmp_int32 *, &task->td_flags),
1410                                       *RCAST(kmp_int32 *, &flags_old),
1411                                       *RCAST(kmp_int32 *, &flags_new))) {
1412         KA_TRACE(100, ("__kmp_finish_implicit_task: T#%d cleans "
1413                        "dephash of implicit task %p\n",
1414                        thread->th.th_info.ds.ds_gtid, task));
1415         __kmp_dephash_free_entries(thread, task->td_dephash);
1416       }
1417     }
1418   }
1419 }
1420 
1421 // __kmp_free_implicit_task: Release resources associated to implicit tasks
1422 // when these are destroyed regions
1423 //
1424 // thread:  thread data structure corresponding to implicit task
1425 void __kmp_free_implicit_task(kmp_info_t *thread) {
1426   kmp_taskdata_t *task = thread->th.th_current_task;
1427   if (task && task->td_dephash) {
1428     __kmp_dephash_free(thread, task->td_dephash);
1429     task->td_dephash = NULL;
1430   }
1431 }
1432 
1433 // Round up a size to a power of two specified by val: Used to insert padding
1434 // between structures co-allocated using a single malloc() call
1435 static size_t __kmp_round_up_to_val(size_t size, size_t val) {
1436   if (size & (val - 1)) {
1437     size &= ~(val - 1);
1438     if (size <= KMP_SIZE_T_MAX - val) {
1439       size += val; // Round up if there is no overflow.
1440     }
1441   }
1442   return size;
1443 } // __kmp_round_up_to_va
1444 
1445 // __kmp_task_alloc: Allocate the taskdata and task data structures for a task
1446 //
1447 // loc_ref: source location information
1448 // gtid: global thread number.
1449 // flags: include tiedness & task type (explicit vs. implicit) of the ''new''
1450 // task encountered. Converted from kmp_int32 to kmp_tasking_flags_t in routine.
1451 // sizeof_kmp_task_t:  Size in bytes of kmp_task_t data structure including
1452 // private vars accessed in task.
1453 // sizeof_shareds:  Size in bytes of array of pointers to shared vars accessed
1454 // in task.
1455 // task_entry: Pointer to task code entry point generated by compiler.
1456 // returns: a pointer to the allocated kmp_task_t structure (task).
1457 kmp_task_t *__kmp_task_alloc(ident_t *loc_ref, kmp_int32 gtid,
1458                              kmp_tasking_flags_t *flags,
1459                              size_t sizeof_kmp_task_t, size_t sizeof_shareds,
1460                              kmp_routine_entry_t task_entry) {
1461   kmp_task_t *task;
1462   kmp_taskdata_t *taskdata;
1463   kmp_info_t *thread = __kmp_threads[gtid];
1464   kmp_team_t *team = thread->th.th_team;
1465   kmp_taskdata_t *parent_task = thread->th.th_current_task;
1466   size_t shareds_offset;
1467 
1468   if (UNLIKELY(!TCR_4(__kmp_init_middle)))
1469     __kmp_middle_initialize();
1470 
1471   if (flags->hidden_helper) {
1472     if (__kmp_enable_hidden_helper) {
1473       if (!TCR_4(__kmp_init_hidden_helper))
1474         __kmp_hidden_helper_initialize();
1475     } else {
1476       // If the hidden helper task is not enabled, reset the flag to FALSE.
1477       flags->hidden_helper = FALSE;
1478     }
1479   }
1480 
1481   KA_TRACE(10, ("__kmp_task_alloc(enter): T#%d loc=%p, flags=(0x%x) "
1482                 "sizeof_task=%ld sizeof_shared=%ld entry=%p\n",
1483                 gtid, loc_ref, *((kmp_int32 *)flags), sizeof_kmp_task_t,
1484                 sizeof_shareds, task_entry));
1485 
1486   KMP_DEBUG_ASSERT(parent_task);
1487   if (parent_task->td_flags.final) {
1488     if (flags->merged_if0) {
1489     }
1490     flags->final = 1;
1491   }
1492 
1493   if (flags->tiedness == TASK_UNTIED && !team->t.t_serialized) {
1494     // Untied task encountered causes the TSC algorithm to check entire deque of
1495     // the victim thread. If no untied task encountered, then checking the head
1496     // of the deque should be enough.
1497     KMP_CHECK_UPDATE(thread->th.th_task_team->tt.tt_untied_task_encountered, 1);
1498   }
1499 
1500   // Detachable tasks are not proxy tasks yet but could be in the future. Doing
1501   // the tasking setup
1502   // when that happens is too late.
1503   if (UNLIKELY(flags->proxy == TASK_PROXY ||
1504                flags->detachable == TASK_DETACHABLE || flags->hidden_helper)) {
1505     if (flags->proxy == TASK_PROXY) {
1506       flags->tiedness = TASK_UNTIED;
1507       flags->merged_if0 = 1;
1508     }
1509     /* are we running in a sequential parallel or tskm_immediate_exec... we need
1510        tasking support enabled */
1511     if ((thread->th.th_task_team) == NULL) {
1512       /* This should only happen if the team is serialized
1513           setup a task team and propagate it to the thread */
1514       KMP_DEBUG_ASSERT(team->t.t_serialized);
1515       KA_TRACE(30,
1516                ("T#%d creating task team in __kmp_task_alloc for proxy task\n",
1517                 gtid));
1518       __kmp_task_team_setup(thread, team);
1519       thread->th.th_task_team = team->t.t_task_team[thread->th.th_task_state];
1520     }
1521     kmp_task_team_t *task_team = thread->th.th_task_team;
1522 
1523     /* tasking must be enabled now as the task might not be pushed */
1524     if (!KMP_TASKING_ENABLED(task_team)) {
1525       KA_TRACE(
1526           30,
1527           ("T#%d enabling tasking in __kmp_task_alloc for proxy task\n", gtid));
1528       __kmp_enable_tasking(task_team, thread);
1529       kmp_int32 tid = thread->th.th_info.ds.ds_tid;
1530       kmp_thread_data_t *thread_data = &task_team->tt.tt_threads_data[tid];
1531       // No lock needed since only owner can allocate
1532       if (thread_data->td.td_deque == NULL) {
1533         __kmp_alloc_task_deque(thread, thread_data);
1534       }
1535     }
1536 
1537     if ((flags->proxy == TASK_PROXY || flags->detachable == TASK_DETACHABLE) &&
1538         task_team->tt.tt_found_proxy_tasks == FALSE)
1539       TCW_4(task_team->tt.tt_found_proxy_tasks, TRUE);
1540     if (flags->hidden_helper &&
1541         task_team->tt.tt_hidden_helper_task_encountered == FALSE)
1542       TCW_4(task_team->tt.tt_hidden_helper_task_encountered, TRUE);
1543   }
1544 
1545   // Calculate shared structure offset including padding after kmp_task_t struct
1546   // to align pointers in shared struct
1547   shareds_offset = sizeof(kmp_taskdata_t) + sizeof_kmp_task_t;
1548   shareds_offset = __kmp_round_up_to_val(shareds_offset, sizeof(void *));
1549 
1550   // Allocate a kmp_taskdata_t block and a kmp_task_t block.
1551   KA_TRACE(30, ("__kmp_task_alloc: T#%d First malloc size: %ld\n", gtid,
1552                 shareds_offset));
1553   KA_TRACE(30, ("__kmp_task_alloc: T#%d Second malloc size: %ld\n", gtid,
1554                 sizeof_shareds));
1555 
1556   // Avoid double allocation here by combining shareds with taskdata
1557 #if USE_FAST_MEMORY
1558   taskdata = (kmp_taskdata_t *)__kmp_fast_allocate(thread, shareds_offset +
1559                                                                sizeof_shareds);
1560 #else /* ! USE_FAST_MEMORY */
1561   taskdata = (kmp_taskdata_t *)__kmp_thread_malloc(thread, shareds_offset +
1562                                                                sizeof_shareds);
1563 #endif /* USE_FAST_MEMORY */
1564 
1565   task = KMP_TASKDATA_TO_TASK(taskdata);
1566 
1567 // Make sure task & taskdata are aligned appropriately
1568 #if KMP_ARCH_X86 || KMP_ARCH_PPC64 || KMP_ARCH_S390X || !KMP_HAVE_QUAD
1569   KMP_DEBUG_ASSERT((((kmp_uintptr_t)taskdata) & (sizeof(double) - 1)) == 0);
1570   KMP_DEBUG_ASSERT((((kmp_uintptr_t)task) & (sizeof(double) - 1)) == 0);
1571 #else
1572   KMP_DEBUG_ASSERT((((kmp_uintptr_t)taskdata) & (sizeof(_Quad) - 1)) == 0);
1573   KMP_DEBUG_ASSERT((((kmp_uintptr_t)task) & (sizeof(_Quad) - 1)) == 0);
1574 #endif
1575   if (sizeof_shareds > 0) {
1576     // Avoid double allocation here by combining shareds with taskdata
1577     task->shareds = &((char *)taskdata)[shareds_offset];
1578     // Make sure shareds struct is aligned to pointer size
1579     KMP_DEBUG_ASSERT((((kmp_uintptr_t)task->shareds) & (sizeof(void *) - 1)) ==
1580                      0);
1581   } else {
1582     task->shareds = NULL;
1583   }
1584   task->routine = task_entry;
1585   task->part_id = 0; // AC: Always start with 0 part id
1586 
1587   taskdata->td_task_id = KMP_GEN_TASK_ID();
1588   taskdata->td_team = thread->th.th_team;
1589   taskdata->td_alloc_thread = thread;
1590   taskdata->td_parent = parent_task;
1591   taskdata->td_level = parent_task->td_level + 1; // increment nesting level
1592   KMP_ATOMIC_ST_RLX(&taskdata->td_untied_count, 0);
1593   taskdata->td_ident = loc_ref;
1594   taskdata->td_taskwait_ident = NULL;
1595   taskdata->td_taskwait_counter = 0;
1596   taskdata->td_taskwait_thread = 0;
1597   KMP_DEBUG_ASSERT(taskdata->td_parent != NULL);
1598   // avoid copying icvs for proxy tasks
1599   if (flags->proxy == TASK_FULL)
1600     copy_icvs(&taskdata->td_icvs, &taskdata->td_parent->td_icvs);
1601 
1602   taskdata->td_flags = *flags;
1603   taskdata->td_task_team = thread->th.th_task_team;
1604   taskdata->td_size_alloc = shareds_offset + sizeof_shareds;
1605   taskdata->td_flags.tasktype = TASK_EXPLICIT;
1606   // If it is hidden helper task, we need to set the team and task team
1607   // correspondingly.
1608   if (flags->hidden_helper) {
1609     kmp_info_t *shadow_thread = __kmp_threads[KMP_GTID_TO_SHADOW_GTID(gtid)];
1610     taskdata->td_team = shadow_thread->th.th_team;
1611     taskdata->td_task_team = shadow_thread->th.th_task_team;
1612   }
1613 
1614   // GEH - TODO: fix this to copy parent task's value of tasking_ser flag
1615   taskdata->td_flags.tasking_ser = (__kmp_tasking_mode == tskm_immediate_exec);
1616 
1617   // GEH - TODO: fix this to copy parent task's value of team_serial flag
1618   taskdata->td_flags.team_serial = (team->t.t_serialized) ? 1 : 0;
1619 
1620   // GEH - Note we serialize the task if the team is serialized to make sure
1621   // implicit parallel region tasks are not left until program termination to
1622   // execute. Also, it helps locality to execute immediately.
1623 
1624   taskdata->td_flags.task_serial =
1625       (parent_task->td_flags.final || taskdata->td_flags.team_serial ||
1626        taskdata->td_flags.tasking_ser || flags->merged_if0);
1627 
1628   taskdata->td_flags.started = 0;
1629   taskdata->td_flags.executing = 0;
1630   taskdata->td_flags.complete = 0;
1631   taskdata->td_flags.freed = 0;
1632 #if OMPX_TASKGRAPH
1633   taskdata->td_flags.onced = 0;
1634 #endif
1635   KMP_ATOMIC_ST_RLX(&taskdata->td_incomplete_child_tasks, 0);
1636   // start at one because counts current task and children
1637   KMP_ATOMIC_ST_RLX(&taskdata->td_allocated_child_tasks, 1);
1638   taskdata->td_taskgroup =
1639       parent_task->td_taskgroup; // task inherits taskgroup from the parent task
1640   taskdata->td_dephash = NULL;
1641   taskdata->td_depnode = NULL;
1642   taskdata->td_target_data.async_handle = NULL;
1643   if (flags->tiedness == TASK_UNTIED)
1644     taskdata->td_last_tied = NULL; // will be set when the task is scheduled
1645   else
1646     taskdata->td_last_tied = taskdata;
1647   taskdata->td_allow_completion_event.type = KMP_EVENT_UNINITIALIZED;
1648 #if OMPT_SUPPORT
1649   if (UNLIKELY(ompt_enabled.enabled))
1650     __ompt_task_init(taskdata, gtid);
1651 #endif
1652   // TODO: What would be the balance between the conditions in the function and
1653   // an atomic operation?
1654   if (__kmp_track_children_task(taskdata)) {
1655     KMP_ATOMIC_INC(&parent_task->td_incomplete_child_tasks);
1656     if (parent_task->td_taskgroup)
1657       KMP_ATOMIC_INC(&parent_task->td_taskgroup->count);
1658     // Only need to keep track of allocated child tasks for explicit tasks since
1659     // implicit not deallocated
1660     if (taskdata->td_parent->td_flags.tasktype == TASK_EXPLICIT) {
1661       KMP_ATOMIC_INC(&taskdata->td_parent->td_allocated_child_tasks);
1662     }
1663     if (flags->hidden_helper) {
1664       taskdata->td_flags.task_serial = FALSE;
1665       // Increment the number of hidden helper tasks to be executed
1666       KMP_ATOMIC_INC(&__kmp_unexecuted_hidden_helper_tasks);
1667     }
1668   }
1669 
1670 #if OMPX_TASKGRAPH
1671   kmp_tdg_info_t *tdg = __kmp_find_tdg(__kmp_curr_tdg_idx);
1672   if (tdg && __kmp_tdg_is_recording(tdg->tdg_status) &&
1673       (task_entry != (kmp_routine_entry_t)__kmp_taskloop_task)) {
1674     taskdata->is_taskgraph = 1;
1675     taskdata->tdg = __kmp_global_tdgs[__kmp_curr_tdg_idx];
1676     taskdata->td_task_id = KMP_ATOMIC_INC(&__kmp_tdg_task_id);
1677   }
1678 #endif
1679   KA_TRACE(20, ("__kmp_task_alloc(exit): T#%d created task %p parent=%p\n",
1680                 gtid, taskdata, taskdata->td_parent));
1681 
1682   return task;
1683 }
1684 
1685 kmp_task_t *__kmpc_omp_task_alloc(ident_t *loc_ref, kmp_int32 gtid,
1686                                   kmp_int32 flags, size_t sizeof_kmp_task_t,
1687                                   size_t sizeof_shareds,
1688                                   kmp_routine_entry_t task_entry) {
1689   kmp_task_t *retval;
1690   kmp_tasking_flags_t *input_flags = (kmp_tasking_flags_t *)&flags;
1691   __kmp_assert_valid_gtid(gtid);
1692   input_flags->native = FALSE;
1693   // __kmp_task_alloc() sets up all other runtime flags
1694   KA_TRACE(10, ("__kmpc_omp_task_alloc(enter): T#%d loc=%p, flags=(%s %s %s) "
1695                 "sizeof_task=%ld sizeof_shared=%ld entry=%p\n",
1696                 gtid, loc_ref, input_flags->tiedness ? "tied  " : "untied",
1697                 input_flags->proxy ? "proxy" : "",
1698                 input_flags->detachable ? "detachable" : "", sizeof_kmp_task_t,
1699                 sizeof_shareds, task_entry));
1700 
1701   retval = __kmp_task_alloc(loc_ref, gtid, input_flags, sizeof_kmp_task_t,
1702                             sizeof_shareds, task_entry);
1703 
1704   KA_TRACE(20, ("__kmpc_omp_task_alloc(exit): T#%d retval %p\n", gtid, retval));
1705 
1706   return retval;
1707 }
1708 
1709 kmp_task_t *__kmpc_omp_target_task_alloc(ident_t *loc_ref, kmp_int32 gtid,
1710                                          kmp_int32 flags,
1711                                          size_t sizeof_kmp_task_t,
1712                                          size_t sizeof_shareds,
1713                                          kmp_routine_entry_t task_entry,
1714                                          kmp_int64 device_id) {
1715   auto &input_flags = reinterpret_cast<kmp_tasking_flags_t &>(flags);
1716   // target task is untied defined in the specification
1717   input_flags.tiedness = TASK_UNTIED;
1718   input_flags.target = 1;
1719 
1720   if (__kmp_enable_hidden_helper)
1721     input_flags.hidden_helper = TRUE;
1722 
1723   return __kmpc_omp_task_alloc(loc_ref, gtid, flags, sizeof_kmp_task_t,
1724                                sizeof_shareds, task_entry);
1725 }
1726 
1727 /*!
1728 @ingroup TASKING
1729 @param loc_ref location of the original task directive
1730 @param gtid Global Thread ID of encountering thread
1731 @param new_task task thunk allocated by __kmpc_omp_task_alloc() for the ''new
1732 task''
1733 @param naffins Number of affinity items
1734 @param affin_list List of affinity items
1735 @return Returns non-zero if registering affinity information was not successful.
1736  Returns 0 if registration was successful
1737 This entry registers the affinity information attached to a task with the task
1738 thunk structure kmp_taskdata_t.
1739 */
1740 kmp_int32
1741 __kmpc_omp_reg_task_with_affinity(ident_t *loc_ref, kmp_int32 gtid,
1742                                   kmp_task_t *new_task, kmp_int32 naffins,
1743                                   kmp_task_affinity_info_t *affin_list) {
1744   return 0;
1745 }
1746 
1747 //  __kmp_invoke_task: invoke the specified task
1748 //
1749 // gtid: global thread ID of caller
1750 // task: the task to invoke
1751 // current_task: the task to resume after task invocation
1752 #ifdef __s390x__
1753 __attribute__((target("backchain")))
1754 #endif
1755 static void
1756 __kmp_invoke_task(kmp_int32 gtid, kmp_task_t *task,
1757                   kmp_taskdata_t *current_task) {
1758   kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
1759   kmp_info_t *thread;
1760   int discard = 0 /* false */;
1761   KA_TRACE(
1762       30, ("__kmp_invoke_task(enter): T#%d invoking task %p, current_task=%p\n",
1763            gtid, taskdata, current_task));
1764   KMP_DEBUG_ASSERT(task);
1765   if (UNLIKELY(taskdata->td_flags.proxy == TASK_PROXY &&
1766                taskdata->td_flags.complete == 1)) {
1767     // This is a proxy task that was already completed but it needs to run
1768     // its bottom-half finish
1769     KA_TRACE(
1770         30,
1771         ("__kmp_invoke_task: T#%d running bottom finish for proxy task %p\n",
1772          gtid, taskdata));
1773 
1774     __kmp_bottom_half_finish_proxy(gtid, task);
1775 
1776     KA_TRACE(30, ("__kmp_invoke_task(exit): T#%d completed bottom finish for "
1777                   "proxy task %p, resuming task %p\n",
1778                   gtid, taskdata, current_task));
1779 
1780     return;
1781   }
1782 
1783 #if OMPT_SUPPORT
1784   // For untied tasks, the first task executed only calls __kmpc_omp_task and
1785   // does not execute code.
1786   ompt_thread_info_t oldInfo;
1787   if (UNLIKELY(ompt_enabled.enabled)) {
1788     // Store the threads states and restore them after the task
1789     thread = __kmp_threads[gtid];
1790     oldInfo = thread->th.ompt_thread_info;
1791     thread->th.ompt_thread_info.wait_id = 0;
1792     thread->th.ompt_thread_info.state = (thread->th.th_team_serialized)
1793                                             ? ompt_state_work_serial
1794                                             : ompt_state_work_parallel;
1795     taskdata->ompt_task_info.frame.exit_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1796   }
1797 #endif
1798 
1799   // Proxy tasks are not handled by the runtime
1800   if (taskdata->td_flags.proxy != TASK_PROXY) {
1801     __kmp_task_start(gtid, task, current_task); // OMPT only if not discarded
1802   }
1803 
1804   // TODO: cancel tasks if the parallel region has also been cancelled
1805   // TODO: check if this sequence can be hoisted above __kmp_task_start
1806   // if cancellation has been enabled for this run ...
1807   if (UNLIKELY(__kmp_omp_cancellation)) {
1808     thread = __kmp_threads[gtid];
1809     kmp_team_t *this_team = thread->th.th_team;
1810     kmp_taskgroup_t *taskgroup = taskdata->td_taskgroup;
1811     if ((taskgroup && taskgroup->cancel_request) ||
1812         (this_team->t.t_cancel_request == cancel_parallel)) {
1813 #if OMPT_SUPPORT && OMPT_OPTIONAL
1814       ompt_data_t *task_data;
1815       if (UNLIKELY(ompt_enabled.ompt_callback_cancel)) {
1816         __ompt_get_task_info_internal(0, NULL, &task_data, NULL, NULL, NULL);
1817         ompt_callbacks.ompt_callback(ompt_callback_cancel)(
1818             task_data,
1819             ((taskgroup && taskgroup->cancel_request) ? ompt_cancel_taskgroup
1820                                                       : ompt_cancel_parallel) |
1821                 ompt_cancel_discarded_task,
1822             NULL);
1823       }
1824 #endif
1825       KMP_COUNT_BLOCK(TASK_cancelled);
1826       // this task belongs to a task group and we need to cancel it
1827       discard = 1 /* true */;
1828     }
1829   }
1830 
1831   // Invoke the task routine and pass in relevant data.
1832   // Thunks generated by gcc take a different argument list.
1833   if (!discard) {
1834     if (taskdata->td_flags.tiedness == TASK_UNTIED) {
1835       taskdata->td_last_tied = current_task->td_last_tied;
1836       KMP_DEBUG_ASSERT(taskdata->td_last_tied);
1837     }
1838 #if KMP_STATS_ENABLED
1839     KMP_COUNT_BLOCK(TASK_executed);
1840     switch (KMP_GET_THREAD_STATE()) {
1841     case FORK_JOIN_BARRIER:
1842       KMP_PUSH_PARTITIONED_TIMER(OMP_task_join_bar);
1843       break;
1844     case PLAIN_BARRIER:
1845       KMP_PUSH_PARTITIONED_TIMER(OMP_task_plain_bar);
1846       break;
1847     case TASKYIELD:
1848       KMP_PUSH_PARTITIONED_TIMER(OMP_task_taskyield);
1849       break;
1850     case TASKWAIT:
1851       KMP_PUSH_PARTITIONED_TIMER(OMP_task_taskwait);
1852       break;
1853     case TASKGROUP:
1854       KMP_PUSH_PARTITIONED_TIMER(OMP_task_taskgroup);
1855       break;
1856     default:
1857       KMP_PUSH_PARTITIONED_TIMER(OMP_task_immediate);
1858       break;
1859     }
1860 #endif // KMP_STATS_ENABLED
1861 
1862 // OMPT task begin
1863 #if OMPT_SUPPORT
1864     if (UNLIKELY(ompt_enabled.enabled))
1865       __ompt_task_start(task, current_task, gtid);
1866 #endif
1867 #if OMPT_SUPPORT && OMPT_OPTIONAL
1868     if (UNLIKELY(ompt_enabled.ompt_callback_dispatch &&
1869                  taskdata->ompt_task_info.dispatch_chunk.iterations > 0)) {
1870       ompt_data_t instance = ompt_data_none;
1871       instance.ptr = &(taskdata->ompt_task_info.dispatch_chunk);
1872       ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
1873       ompt_callbacks.ompt_callback(ompt_callback_dispatch)(
1874           &(team_info->parallel_data), &(taskdata->ompt_task_info.task_data),
1875           ompt_dispatch_taskloop_chunk, instance);
1876       taskdata->ompt_task_info.dispatch_chunk = {0, 0};
1877     }
1878 #endif // OMPT_SUPPORT && OMPT_OPTIONAL
1879 
1880 #if OMPD_SUPPORT
1881     if (ompd_state & OMPD_ENABLE_BP)
1882       ompd_bp_task_begin();
1883 #endif
1884 
1885 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1886     kmp_uint64 cur_time;
1887     kmp_int32 kmp_itt_count_task =
1888         __kmp_forkjoin_frames_mode == 3 && !taskdata->td_flags.task_serial &&
1889         current_task->td_flags.tasktype == TASK_IMPLICIT;
1890     if (kmp_itt_count_task) {
1891       thread = __kmp_threads[gtid];
1892       // Time outer level explicit task on barrier for adjusting imbalance time
1893       if (thread->th.th_bar_arrive_time)
1894         cur_time = __itt_get_timestamp();
1895       else
1896         kmp_itt_count_task = 0; // thread is not on a barrier - skip timing
1897     }
1898     KMP_FSYNC_ACQUIRED(taskdata); // acquired self (new task)
1899 #endif
1900 
1901 #if ENABLE_LIBOMPTARGET
1902     if (taskdata->td_target_data.async_handle != NULL) {
1903       // If we have a valid target async handle, that means that we have already
1904       // executed the task routine once. We must query for the handle completion
1905       // instead of re-executing the routine.
1906       KMP_ASSERT(tgt_target_nowait_query);
1907       tgt_target_nowait_query(&taskdata->td_target_data.async_handle);
1908     } else
1909 #endif
1910     if (task->routine != NULL) {
1911 #ifdef KMP_GOMP_COMPAT
1912       if (taskdata->td_flags.native) {
1913         ((void (*)(void *))(*(task->routine)))(task->shareds);
1914       } else
1915 #endif /* KMP_GOMP_COMPAT */
1916       {
1917         (*(task->routine))(gtid, task);
1918       }
1919     }
1920     KMP_POP_PARTITIONED_TIMER();
1921 
1922 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1923     if (kmp_itt_count_task) {
1924       // Barrier imbalance - adjust arrive time with the task duration
1925       thread->th.th_bar_arrive_time += (__itt_get_timestamp() - cur_time);
1926     }
1927     KMP_FSYNC_CANCEL(taskdata); // destroy self (just executed)
1928     KMP_FSYNC_RELEASING(taskdata->td_parent); // releasing parent
1929 #endif
1930   }
1931 
1932 #if OMPD_SUPPORT
1933   if (ompd_state & OMPD_ENABLE_BP)
1934     ompd_bp_task_end();
1935 #endif
1936 
1937   // Proxy tasks are not handled by the runtime
1938   if (taskdata->td_flags.proxy != TASK_PROXY) {
1939 #if OMPT_SUPPORT
1940     if (UNLIKELY(ompt_enabled.enabled)) {
1941       thread->th.ompt_thread_info = oldInfo;
1942       if (taskdata->td_flags.tiedness == TASK_TIED) {
1943         taskdata->ompt_task_info.frame.exit_frame = ompt_data_none;
1944       }
1945       __kmp_task_finish<true>(gtid, task, current_task);
1946     } else
1947 #endif
1948       __kmp_task_finish<false>(gtid, task, current_task);
1949   }
1950 #if OMPT_SUPPORT
1951   else if (UNLIKELY(ompt_enabled.enabled && taskdata->td_flags.target)) {
1952     __ompt_task_finish(task, current_task, ompt_task_switch);
1953   }
1954 #endif
1955 
1956   KA_TRACE(
1957       30,
1958       ("__kmp_invoke_task(exit): T#%d completed task %p, resuming task %p\n",
1959        gtid, taskdata, current_task));
1960   return;
1961 }
1962 
1963 // __kmpc_omp_task_parts: Schedule a thread-switchable task for execution
1964 //
1965 // loc_ref: location of original task pragma (ignored)
1966 // gtid: Global Thread ID of encountering thread
1967 // new_task: task thunk allocated by __kmp_omp_task_alloc() for the ''new task''
1968 // Returns:
1969 //    TASK_CURRENT_NOT_QUEUED (0) if did not suspend and queue current task to
1970 //    be resumed later.
1971 //    TASK_CURRENT_QUEUED (1) if suspended and queued the current task to be
1972 //    resumed later.
1973 kmp_int32 __kmpc_omp_task_parts(ident_t *loc_ref, kmp_int32 gtid,
1974                                 kmp_task_t *new_task) {
1975   kmp_taskdata_t *new_taskdata = KMP_TASK_TO_TASKDATA(new_task);
1976 
1977   KA_TRACE(10, ("__kmpc_omp_task_parts(enter): T#%d loc=%p task=%p\n", gtid,
1978                 loc_ref, new_taskdata));
1979 
1980 #if OMPT_SUPPORT
1981   kmp_taskdata_t *parent;
1982   if (UNLIKELY(ompt_enabled.enabled)) {
1983     parent = new_taskdata->td_parent;
1984     if (ompt_enabled.ompt_callback_task_create) {
1985       ompt_callbacks.ompt_callback(ompt_callback_task_create)(
1986           &(parent->ompt_task_info.task_data), &(parent->ompt_task_info.frame),
1987           &(new_taskdata->ompt_task_info.task_data),
1988           TASK_TYPE_DETAILS_FORMAT(new_taskdata), 0,
1989           OMPT_GET_RETURN_ADDRESS(0));
1990     }
1991   }
1992 #endif
1993 
1994   /* Should we execute the new task or queue it? For now, let's just always try
1995      to queue it.  If the queue fills up, then we'll execute it.  */
1996 
1997   if (__kmp_push_task(gtid, new_task) == TASK_NOT_PUSHED) // if cannot defer
1998   { // Execute this task immediately
1999     kmp_taskdata_t *current_task = __kmp_threads[gtid]->th.th_current_task;
2000     new_taskdata->td_flags.task_serial = 1;
2001     __kmp_invoke_task(gtid, new_task, current_task);
2002   }
2003 
2004   KA_TRACE(
2005       10,
2006       ("__kmpc_omp_task_parts(exit): T#%d returning TASK_CURRENT_NOT_QUEUED: "
2007        "loc=%p task=%p, return: TASK_CURRENT_NOT_QUEUED\n",
2008        gtid, loc_ref, new_taskdata));
2009 
2010 #if OMPT_SUPPORT
2011   if (UNLIKELY(ompt_enabled.enabled)) {
2012     parent->ompt_task_info.frame.enter_frame = ompt_data_none;
2013   }
2014 #endif
2015   return TASK_CURRENT_NOT_QUEUED;
2016 }
2017 
2018 // __kmp_omp_task: Schedule a non-thread-switchable task for execution
2019 //
2020 // gtid: Global Thread ID of encountering thread
2021 // new_task:non-thread-switchable task thunk allocated by __kmp_omp_task_alloc()
2022 // serialize_immediate: if TRUE then if the task is executed immediately its
2023 // execution will be serialized
2024 // Returns:
2025 //    TASK_CURRENT_NOT_QUEUED (0) if did not suspend and queue current task to
2026 //    be resumed later.
2027 //    TASK_CURRENT_QUEUED (1) if suspended and queued the current task to be
2028 //    resumed later.
2029 kmp_int32 __kmp_omp_task(kmp_int32 gtid, kmp_task_t *new_task,
2030                          bool serialize_immediate) {
2031   kmp_taskdata_t *new_taskdata = KMP_TASK_TO_TASKDATA(new_task);
2032 
2033 #if OMPX_TASKGRAPH
2034   if (new_taskdata->is_taskgraph &&
2035       __kmp_tdg_is_recording(new_taskdata->tdg->tdg_status)) {
2036     kmp_tdg_info_t *tdg = new_taskdata->tdg;
2037     // extend the record_map if needed
2038     if (new_taskdata->td_task_id >= new_taskdata->tdg->map_size) {
2039       __kmp_acquire_bootstrap_lock(&tdg->graph_lock);
2040       // map_size could have been updated by another thread if recursive
2041       // taskloop
2042       if (new_taskdata->td_task_id >= tdg->map_size) {
2043         kmp_uint old_size = tdg->map_size;
2044         kmp_uint new_size = old_size * 2;
2045         kmp_node_info_t *old_record = tdg->record_map;
2046         kmp_node_info_t *new_record = (kmp_node_info_t *)__kmp_allocate(
2047             new_size * sizeof(kmp_node_info_t));
2048 
2049         KMP_MEMCPY(new_record, old_record, old_size * sizeof(kmp_node_info_t));
2050         tdg->record_map = new_record;
2051 
2052         __kmp_free(old_record);
2053 
2054         for (kmp_int i = old_size; i < new_size; i++) {
2055           kmp_int32 *successorsList = (kmp_int32 *)__kmp_allocate(
2056               __kmp_successors_size * sizeof(kmp_int32));
2057           new_record[i].task = nullptr;
2058           new_record[i].successors = successorsList;
2059           new_record[i].nsuccessors = 0;
2060           new_record[i].npredecessors = 0;
2061           new_record[i].successors_size = __kmp_successors_size;
2062           KMP_ATOMIC_ST_REL(&new_record[i].npredecessors_counter, 0);
2063         }
2064         // update the size at the end, so that we avoid other
2065         // threads use old_record while map_size is already updated
2066         tdg->map_size = new_size;
2067       }
2068       __kmp_release_bootstrap_lock(&tdg->graph_lock);
2069     }
2070     // record a task
2071     if (tdg->record_map[new_taskdata->td_task_id].task == nullptr) {
2072       tdg->record_map[new_taskdata->td_task_id].task = new_task;
2073       tdg->record_map[new_taskdata->td_task_id].parent_task =
2074           new_taskdata->td_parent;
2075       KMP_ATOMIC_INC(&tdg->num_tasks);
2076     }
2077   }
2078 #endif
2079 
2080   /* Should we execute the new task or queue it? For now, let's just always try
2081      to queue it.  If the queue fills up, then we'll execute it.  */
2082   if (new_taskdata->td_flags.proxy == TASK_PROXY ||
2083       __kmp_push_task(gtid, new_task) == TASK_NOT_PUSHED) // if cannot defer
2084   { // Execute this task immediately
2085     kmp_taskdata_t *current_task = __kmp_threads[gtid]->th.th_current_task;
2086     if (serialize_immediate)
2087       new_taskdata->td_flags.task_serial = 1;
2088     __kmp_invoke_task(gtid, new_task, current_task);
2089   } else if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME &&
2090              __kmp_wpolicy_passive) {
2091     kmp_info_t *this_thr = __kmp_threads[gtid];
2092     kmp_team_t *team = this_thr->th.th_team;
2093     kmp_int32 nthreads = this_thr->th.th_team_nproc;
2094     for (int i = 0; i < nthreads; ++i) {
2095       kmp_info_t *thread = team->t.t_threads[i];
2096       if (thread == this_thr)
2097         continue;
2098       if (thread->th.th_sleep_loc != NULL) {
2099         __kmp_null_resume_wrapper(thread);
2100         break; // awake one thread at a time
2101       }
2102     }
2103   }
2104   return TASK_CURRENT_NOT_QUEUED;
2105 }
2106 
2107 // __kmpc_omp_task: Wrapper around __kmp_omp_task to schedule a
2108 // non-thread-switchable task from the parent thread only!
2109 //
2110 // loc_ref: location of original task pragma (ignored)
2111 // gtid: Global Thread ID of encountering thread
2112 // new_task: non-thread-switchable task thunk allocated by
2113 // __kmp_omp_task_alloc()
2114 // Returns:
2115 //    TASK_CURRENT_NOT_QUEUED (0) if did not suspend and queue current task to
2116 //    be resumed later.
2117 //    TASK_CURRENT_QUEUED (1) if suspended and queued the current task to be
2118 //    resumed later.
2119 kmp_int32 __kmpc_omp_task(ident_t *loc_ref, kmp_int32 gtid,
2120                           kmp_task_t *new_task) {
2121   kmp_int32 res;
2122   KMP_SET_THREAD_STATE_BLOCK(EXPLICIT_TASK);
2123 
2124 #if KMP_DEBUG || OMPT_SUPPORT
2125   kmp_taskdata_t *new_taskdata = KMP_TASK_TO_TASKDATA(new_task);
2126 #endif
2127   KA_TRACE(10, ("__kmpc_omp_task(enter): T#%d loc=%p task=%p\n", gtid, loc_ref,
2128                 new_taskdata));
2129   __kmp_assert_valid_gtid(gtid);
2130 
2131 #if OMPT_SUPPORT
2132   kmp_taskdata_t *parent = NULL;
2133   if (UNLIKELY(ompt_enabled.enabled)) {
2134     if (!new_taskdata->td_flags.started) {
2135       OMPT_STORE_RETURN_ADDRESS(gtid);
2136       parent = new_taskdata->td_parent;
2137       if (!parent->ompt_task_info.frame.enter_frame.ptr) {
2138         parent->ompt_task_info.frame.enter_frame.ptr =
2139             OMPT_GET_FRAME_ADDRESS(0);
2140       }
2141       if (ompt_enabled.ompt_callback_task_create) {
2142         ompt_callbacks.ompt_callback(ompt_callback_task_create)(
2143             &(parent->ompt_task_info.task_data),
2144             &(parent->ompt_task_info.frame),
2145             &(new_taskdata->ompt_task_info.task_data),
2146             TASK_TYPE_DETAILS_FORMAT(new_taskdata), 0,
2147             OMPT_LOAD_RETURN_ADDRESS(gtid));
2148       }
2149     } else {
2150       // We are scheduling the continuation of an UNTIED task.
2151       // Scheduling back to the parent task.
2152       __ompt_task_finish(new_task,
2153                          new_taskdata->ompt_task_info.scheduling_parent,
2154                          ompt_task_switch);
2155       new_taskdata->ompt_task_info.frame.exit_frame = ompt_data_none;
2156     }
2157   }
2158 #endif
2159 
2160   res = __kmp_omp_task(gtid, new_task, true);
2161 
2162   KA_TRACE(10, ("__kmpc_omp_task(exit): T#%d returning "
2163                 "TASK_CURRENT_NOT_QUEUED: loc=%p task=%p\n",
2164                 gtid, loc_ref, new_taskdata));
2165 #if OMPT_SUPPORT
2166   if (UNLIKELY(ompt_enabled.enabled && parent != NULL)) {
2167     parent->ompt_task_info.frame.enter_frame = ompt_data_none;
2168   }
2169 #endif
2170   return res;
2171 }
2172 
2173 // __kmp_omp_taskloop_task: Wrapper around __kmp_omp_task to schedule
2174 // a taskloop task with the correct OMPT return address
2175 //
2176 // loc_ref: location of original task pragma (ignored)
2177 // gtid: Global Thread ID of encountering thread
2178 // new_task: non-thread-switchable task thunk allocated by
2179 // __kmp_omp_task_alloc()
2180 // codeptr_ra: return address for OMPT callback
2181 // Returns:
2182 //    TASK_CURRENT_NOT_QUEUED (0) if did not suspend and queue current task to
2183 //    be resumed later.
2184 //    TASK_CURRENT_QUEUED (1) if suspended and queued the current task to be
2185 //    resumed later.
2186 kmp_int32 __kmp_omp_taskloop_task(ident_t *loc_ref, kmp_int32 gtid,
2187                                   kmp_task_t *new_task, void *codeptr_ra) {
2188   kmp_int32 res;
2189   KMP_SET_THREAD_STATE_BLOCK(EXPLICIT_TASK);
2190 
2191 #if KMP_DEBUG || OMPT_SUPPORT
2192   kmp_taskdata_t *new_taskdata = KMP_TASK_TO_TASKDATA(new_task);
2193 #endif
2194   KA_TRACE(10, ("__kmpc_omp_task(enter): T#%d loc=%p task=%p\n", gtid, loc_ref,
2195                 new_taskdata));
2196 
2197 #if OMPT_SUPPORT
2198   kmp_taskdata_t *parent = NULL;
2199   if (UNLIKELY(ompt_enabled.enabled && !new_taskdata->td_flags.started)) {
2200     parent = new_taskdata->td_parent;
2201     if (!parent->ompt_task_info.frame.enter_frame.ptr)
2202       parent->ompt_task_info.frame.enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
2203     if (ompt_enabled.ompt_callback_task_create) {
2204       ompt_callbacks.ompt_callback(ompt_callback_task_create)(
2205           &(parent->ompt_task_info.task_data), &(parent->ompt_task_info.frame),
2206           &(new_taskdata->ompt_task_info.task_data),
2207           TASK_TYPE_DETAILS_FORMAT(new_taskdata), 0, codeptr_ra);
2208     }
2209   }
2210 #endif
2211 
2212   res = __kmp_omp_task(gtid, new_task, true);
2213 
2214   KA_TRACE(10, ("__kmpc_omp_task(exit): T#%d returning "
2215                 "TASK_CURRENT_NOT_QUEUED: loc=%p task=%p\n",
2216                 gtid, loc_ref, new_taskdata));
2217 #if OMPT_SUPPORT
2218   if (UNLIKELY(ompt_enabled.enabled && parent != NULL)) {
2219     parent->ompt_task_info.frame.enter_frame = ompt_data_none;
2220   }
2221 #endif
2222   return res;
2223 }
2224 
2225 template <bool ompt>
2226 static kmp_int32 __kmpc_omp_taskwait_template(ident_t *loc_ref, kmp_int32 gtid,
2227                                               void *frame_address,
2228                                               void *return_address) {
2229   kmp_taskdata_t *taskdata = nullptr;
2230   kmp_info_t *thread;
2231   int thread_finished = FALSE;
2232   KMP_SET_THREAD_STATE_BLOCK(TASKWAIT);
2233 
2234   KA_TRACE(10, ("__kmpc_omp_taskwait(enter): T#%d loc=%p\n", gtid, loc_ref));
2235   KMP_DEBUG_ASSERT(gtid >= 0);
2236 
2237   if (__kmp_tasking_mode != tskm_immediate_exec) {
2238     thread = __kmp_threads[gtid];
2239     taskdata = thread->th.th_current_task;
2240 
2241 #if OMPT_SUPPORT && OMPT_OPTIONAL
2242     ompt_data_t *my_task_data;
2243     ompt_data_t *my_parallel_data;
2244 
2245     if (ompt) {
2246       my_task_data = &(taskdata->ompt_task_info.task_data);
2247       my_parallel_data = OMPT_CUR_TEAM_DATA(thread);
2248 
2249       taskdata->ompt_task_info.frame.enter_frame.ptr = frame_address;
2250 
2251       if (ompt_enabled.ompt_callback_sync_region) {
2252         ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
2253             ompt_sync_region_taskwait, ompt_scope_begin, my_parallel_data,
2254             my_task_data, return_address);
2255       }
2256 
2257       if (ompt_enabled.ompt_callback_sync_region_wait) {
2258         ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
2259             ompt_sync_region_taskwait, ompt_scope_begin, my_parallel_data,
2260             my_task_data, return_address);
2261       }
2262     }
2263 #endif // OMPT_SUPPORT && OMPT_OPTIONAL
2264 
2265 // Debugger: The taskwait is active. Store location and thread encountered the
2266 // taskwait.
2267 #if USE_ITT_BUILD
2268 // Note: These values are used by ITT events as well.
2269 #endif /* USE_ITT_BUILD */
2270     taskdata->td_taskwait_counter += 1;
2271     taskdata->td_taskwait_ident = loc_ref;
2272     taskdata->td_taskwait_thread = gtid + 1;
2273 
2274 #if USE_ITT_BUILD
2275     void *itt_sync_obj = NULL;
2276 #if USE_ITT_NOTIFY
2277     KMP_ITT_TASKWAIT_STARTING(itt_sync_obj);
2278 #endif /* USE_ITT_NOTIFY */
2279 #endif /* USE_ITT_BUILD */
2280 
2281     bool must_wait =
2282         !taskdata->td_flags.team_serial && !taskdata->td_flags.final;
2283 
2284     must_wait = must_wait || (thread->th.th_task_team != NULL &&
2285                               thread->th.th_task_team->tt.tt_found_proxy_tasks);
2286     // If hidden helper thread is encountered, we must enable wait here.
2287     must_wait =
2288         must_wait ||
2289         (__kmp_enable_hidden_helper && thread->th.th_task_team != NULL &&
2290          thread->th.th_task_team->tt.tt_hidden_helper_task_encountered);
2291 
2292     if (must_wait) {
2293       kmp_flag_32<false, false> flag(
2294           RCAST(std::atomic<kmp_uint32> *,
2295                 &(taskdata->td_incomplete_child_tasks)),
2296           0U);
2297       while (KMP_ATOMIC_LD_ACQ(&taskdata->td_incomplete_child_tasks) != 0) {
2298         flag.execute_tasks(thread, gtid, FALSE,
2299                            &thread_finished USE_ITT_BUILD_ARG(itt_sync_obj),
2300                            __kmp_task_stealing_constraint);
2301       }
2302     }
2303 #if USE_ITT_BUILD
2304     KMP_ITT_TASKWAIT_FINISHED(itt_sync_obj);
2305     KMP_FSYNC_ACQUIRED(taskdata); // acquire self - sync with children
2306 #endif /* USE_ITT_BUILD */
2307 
2308     // Debugger:  The taskwait is completed. Location remains, but thread is
2309     // negated.
2310     taskdata->td_taskwait_thread = -taskdata->td_taskwait_thread;
2311 
2312 #if OMPT_SUPPORT && OMPT_OPTIONAL
2313     if (ompt) {
2314       if (ompt_enabled.ompt_callback_sync_region_wait) {
2315         ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
2316             ompt_sync_region_taskwait, ompt_scope_end, my_parallel_data,
2317             my_task_data, return_address);
2318       }
2319       if (ompt_enabled.ompt_callback_sync_region) {
2320         ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
2321             ompt_sync_region_taskwait, ompt_scope_end, my_parallel_data,
2322             my_task_data, return_address);
2323       }
2324       taskdata->ompt_task_info.frame.enter_frame = ompt_data_none;
2325     }
2326 #endif // OMPT_SUPPORT && OMPT_OPTIONAL
2327   }
2328 
2329   KA_TRACE(10, ("__kmpc_omp_taskwait(exit): T#%d task %p finished waiting, "
2330                 "returning TASK_CURRENT_NOT_QUEUED\n",
2331                 gtid, taskdata));
2332 
2333   return TASK_CURRENT_NOT_QUEUED;
2334 }
2335 
2336 #if OMPT_SUPPORT && OMPT_OPTIONAL
2337 OMPT_NOINLINE
2338 static kmp_int32 __kmpc_omp_taskwait_ompt(ident_t *loc_ref, kmp_int32 gtid,
2339                                           void *frame_address,
2340                                           void *return_address) {
2341   return __kmpc_omp_taskwait_template<true>(loc_ref, gtid, frame_address,
2342                                             return_address);
2343 }
2344 #endif // OMPT_SUPPORT && OMPT_OPTIONAL
2345 
2346 // __kmpc_omp_taskwait: Wait until all tasks generated by the current task are
2347 // complete
2348 kmp_int32 __kmpc_omp_taskwait(ident_t *loc_ref, kmp_int32 gtid) {
2349 #if OMPT_SUPPORT && OMPT_OPTIONAL
2350   if (UNLIKELY(ompt_enabled.enabled)) {
2351     OMPT_STORE_RETURN_ADDRESS(gtid);
2352     return __kmpc_omp_taskwait_ompt(loc_ref, gtid, OMPT_GET_FRAME_ADDRESS(0),
2353                                     OMPT_LOAD_RETURN_ADDRESS(gtid));
2354   }
2355 #endif
2356   return __kmpc_omp_taskwait_template<false>(loc_ref, gtid, NULL, NULL);
2357 }
2358 
2359 // __kmpc_omp_taskyield: switch to a different task
2360 kmp_int32 __kmpc_omp_taskyield(ident_t *loc_ref, kmp_int32 gtid, int end_part) {
2361   kmp_taskdata_t *taskdata = NULL;
2362   kmp_info_t *thread;
2363   int thread_finished = FALSE;
2364 
2365   KMP_COUNT_BLOCK(OMP_TASKYIELD);
2366   KMP_SET_THREAD_STATE_BLOCK(TASKYIELD);
2367 
2368   KA_TRACE(10, ("__kmpc_omp_taskyield(enter): T#%d loc=%p end_part = %d\n",
2369                 gtid, loc_ref, end_part));
2370   __kmp_assert_valid_gtid(gtid);
2371 
2372   if (__kmp_tasking_mode != tskm_immediate_exec && __kmp_init_parallel) {
2373     thread = __kmp_threads[gtid];
2374     taskdata = thread->th.th_current_task;
2375 // Should we model this as a task wait or not?
2376 // Debugger: The taskwait is active. Store location and thread encountered the
2377 // taskwait.
2378 #if USE_ITT_BUILD
2379 // Note: These values are used by ITT events as well.
2380 #endif /* USE_ITT_BUILD */
2381     taskdata->td_taskwait_counter += 1;
2382     taskdata->td_taskwait_ident = loc_ref;
2383     taskdata->td_taskwait_thread = gtid + 1;
2384 
2385 #if USE_ITT_BUILD
2386     void *itt_sync_obj = NULL;
2387 #if USE_ITT_NOTIFY
2388     KMP_ITT_TASKWAIT_STARTING(itt_sync_obj);
2389 #endif /* USE_ITT_NOTIFY */
2390 #endif /* USE_ITT_BUILD */
2391     if (!taskdata->td_flags.team_serial) {
2392       kmp_task_team_t *task_team = thread->th.th_task_team;
2393       if (task_team != NULL) {
2394         if (KMP_TASKING_ENABLED(task_team)) {
2395 #if OMPT_SUPPORT
2396           if (UNLIKELY(ompt_enabled.enabled))
2397             thread->th.ompt_thread_info.ompt_task_yielded = 1;
2398 #endif
2399           __kmp_execute_tasks_32(
2400               thread, gtid, (kmp_flag_32<> *)NULL, FALSE,
2401               &thread_finished USE_ITT_BUILD_ARG(itt_sync_obj),
2402               __kmp_task_stealing_constraint);
2403 #if OMPT_SUPPORT
2404           if (UNLIKELY(ompt_enabled.enabled))
2405             thread->th.ompt_thread_info.ompt_task_yielded = 0;
2406 #endif
2407         }
2408       }
2409     }
2410 #if USE_ITT_BUILD
2411     KMP_ITT_TASKWAIT_FINISHED(itt_sync_obj);
2412 #endif /* USE_ITT_BUILD */
2413 
2414     // Debugger:  The taskwait is completed. Location remains, but thread is
2415     // negated.
2416     taskdata->td_taskwait_thread = -taskdata->td_taskwait_thread;
2417   }
2418 
2419   KA_TRACE(10, ("__kmpc_omp_taskyield(exit): T#%d task %p resuming, "
2420                 "returning TASK_CURRENT_NOT_QUEUED\n",
2421                 gtid, taskdata));
2422 
2423   return TASK_CURRENT_NOT_QUEUED;
2424 }
2425 
2426 // Task Reduction implementation
2427 //
2428 // Note: initial implementation didn't take into account the possibility
2429 // to specify omp_orig for initializer of the UDR (user defined reduction).
2430 // Corrected implementation takes into account the omp_orig object.
2431 // Compiler is free to use old implementation if omp_orig is not specified.
2432 
2433 /*!
2434 @ingroup BASIC_TYPES
2435 @{
2436 */
2437 
2438 /*!
2439 Flags for special info per task reduction item.
2440 */
2441 typedef struct kmp_taskred_flags {
2442   /*! 1 - use lazy alloc/init (e.g. big objects, num tasks < num threads) */
2443   unsigned lazy_priv : 1;
2444   unsigned reserved31 : 31;
2445 } kmp_taskred_flags_t;
2446 
2447 /*!
2448 Internal struct for reduction data item related info set up by compiler.
2449 */
2450 typedef struct kmp_task_red_input {
2451   void *reduce_shar; /**< shared between tasks item to reduce into */
2452   size_t reduce_size; /**< size of data item in bytes */
2453   // three compiler-generated routines (init, fini are optional):
2454   void *reduce_init; /**< data initialization routine (single parameter) */
2455   void *reduce_fini; /**< data finalization routine */
2456   void *reduce_comb; /**< data combiner routine */
2457   kmp_taskred_flags_t flags; /**< flags for additional info from compiler */
2458 } kmp_task_red_input_t;
2459 
2460 /*!
2461 Internal struct for reduction data item related info saved by the library.
2462 */
2463 typedef struct kmp_taskred_data {
2464   void *reduce_shar; /**< shared between tasks item to reduce into */
2465   size_t reduce_size; /**< size of data item */
2466   kmp_taskred_flags_t flags; /**< flags for additional info from compiler */
2467   void *reduce_priv; /**< array of thread specific items */
2468   void *reduce_pend; /**< end of private data for faster comparison op */
2469   // three compiler-generated routines (init, fini are optional):
2470   void *reduce_comb; /**< data combiner routine */
2471   void *reduce_init; /**< data initialization routine (two parameters) */
2472   void *reduce_fini; /**< data finalization routine */
2473   void *reduce_orig; /**< original item (can be used in UDR initializer) */
2474 } kmp_taskred_data_t;
2475 
2476 /*!
2477 Internal struct for reduction data item related info set up by compiler.
2478 
2479 New interface: added reduce_orig field to provide omp_orig for UDR initializer.
2480 */
2481 typedef struct kmp_taskred_input {
2482   void *reduce_shar; /**< shared between tasks item to reduce into */
2483   void *reduce_orig; /**< original reduction item used for initialization */
2484   size_t reduce_size; /**< size of data item */
2485   // three compiler-generated routines (init, fini are optional):
2486   void *reduce_init; /**< data initialization routine (two parameters) */
2487   void *reduce_fini; /**< data finalization routine */
2488   void *reduce_comb; /**< data combiner routine */
2489   kmp_taskred_flags_t flags; /**< flags for additional info from compiler */
2490 } kmp_taskred_input_t;
2491 /*!
2492 @}
2493 */
2494 
2495 template <typename T> void __kmp_assign_orig(kmp_taskred_data_t &item, T &src);
2496 template <>
2497 void __kmp_assign_orig<kmp_task_red_input_t>(kmp_taskred_data_t &item,
2498                                              kmp_task_red_input_t &src) {
2499   item.reduce_orig = NULL;
2500 }
2501 template <>
2502 void __kmp_assign_orig<kmp_taskred_input_t>(kmp_taskred_data_t &item,
2503                                             kmp_taskred_input_t &src) {
2504   if (src.reduce_orig != NULL) {
2505     item.reduce_orig = src.reduce_orig;
2506   } else {
2507     item.reduce_orig = src.reduce_shar;
2508   } // non-NULL reduce_orig means new interface used
2509 }
2510 
2511 template <typename T> void __kmp_call_init(kmp_taskred_data_t &item, size_t j);
2512 template <>
2513 void __kmp_call_init<kmp_task_red_input_t>(kmp_taskred_data_t &item,
2514                                            size_t offset) {
2515   ((void (*)(void *))item.reduce_init)((char *)(item.reduce_priv) + offset);
2516 }
2517 template <>
2518 void __kmp_call_init<kmp_taskred_input_t>(kmp_taskred_data_t &item,
2519                                           size_t offset) {
2520   ((void (*)(void *, void *))item.reduce_init)(
2521       (char *)(item.reduce_priv) + offset, item.reduce_orig);
2522 }
2523 
2524 template <typename T>
2525 void *__kmp_task_reduction_init(int gtid, int num, T *data) {
2526   __kmp_assert_valid_gtid(gtid);
2527   kmp_info_t *thread = __kmp_threads[gtid];
2528   kmp_taskgroup_t *tg = thread->th.th_current_task->td_taskgroup;
2529   kmp_uint32 nth = thread->th.th_team_nproc;
2530   kmp_taskred_data_t *arr;
2531 
2532   // check input data just in case
2533   KMP_ASSERT(tg != NULL);
2534   KMP_ASSERT(data != NULL);
2535   KMP_ASSERT(num > 0);
2536   if (nth == 1 && !__kmp_enable_hidden_helper) {
2537     KA_TRACE(10, ("__kmpc_task_reduction_init: T#%d, tg %p, exiting nth=1\n",
2538                   gtid, tg));
2539     return (void *)tg;
2540   }
2541   KA_TRACE(10, ("__kmpc_task_reduction_init: T#%d, taskgroup %p, #items %d\n",
2542                 gtid, tg, num));
2543   arr = (kmp_taskred_data_t *)__kmp_thread_malloc(
2544       thread, num * sizeof(kmp_taskred_data_t));
2545   for (int i = 0; i < num; ++i) {
2546     size_t size = data[i].reduce_size - 1;
2547     // round the size up to cache line per thread-specific item
2548     size += CACHE_LINE - size % CACHE_LINE;
2549     KMP_ASSERT(data[i].reduce_comb != NULL); // combiner is mandatory
2550     arr[i].reduce_shar = data[i].reduce_shar;
2551     arr[i].reduce_size = size;
2552     arr[i].flags = data[i].flags;
2553     arr[i].reduce_comb = data[i].reduce_comb;
2554     arr[i].reduce_init = data[i].reduce_init;
2555     arr[i].reduce_fini = data[i].reduce_fini;
2556     __kmp_assign_orig<T>(arr[i], data[i]);
2557     if (!arr[i].flags.lazy_priv) {
2558       // allocate cache-line aligned block and fill it with zeros
2559       arr[i].reduce_priv = __kmp_allocate(nth * size);
2560       arr[i].reduce_pend = (char *)(arr[i].reduce_priv) + nth * size;
2561       if (arr[i].reduce_init != NULL) {
2562         // initialize all thread-specific items
2563         for (size_t j = 0; j < nth; ++j) {
2564           __kmp_call_init<T>(arr[i], j * size);
2565         }
2566       }
2567     } else {
2568       // only allocate space for pointers now,
2569       // objects will be lazily allocated/initialized if/when requested
2570       // note that __kmp_allocate zeroes the allocated memory
2571       arr[i].reduce_priv = __kmp_allocate(nth * sizeof(void *));
2572     }
2573   }
2574   tg->reduce_data = (void *)arr;
2575   tg->reduce_num_data = num;
2576   return (void *)tg;
2577 }
2578 
2579 /*!
2580 @ingroup TASKING
2581 @param gtid      Global thread ID
2582 @param num       Number of data items to reduce
2583 @param data      Array of data for reduction
2584 @return The taskgroup identifier
2585 
2586 Initialize task reduction for the taskgroup.
2587 
2588 Note: this entry supposes the optional compiler-generated initializer routine
2589 has single parameter - pointer to object to be initialized. That means
2590 the reduction either does not use omp_orig object, or the omp_orig is accessible
2591 without help of the runtime library.
2592 */
2593 void *__kmpc_task_reduction_init(int gtid, int num, void *data) {
2594 #if OMPX_TASKGRAPH
2595   kmp_tdg_info_t *tdg = __kmp_find_tdg(__kmp_curr_tdg_idx);
2596   if (tdg && __kmp_tdg_is_recording(tdg->tdg_status)) {
2597     kmp_tdg_info_t *this_tdg = __kmp_global_tdgs[__kmp_curr_tdg_idx];
2598     this_tdg->rec_taskred_data =
2599         __kmp_allocate(sizeof(kmp_task_red_input_t) * num);
2600     this_tdg->rec_num_taskred = num;
2601     KMP_MEMCPY(this_tdg->rec_taskred_data, data,
2602                sizeof(kmp_task_red_input_t) * num);
2603   }
2604 #endif
2605   return __kmp_task_reduction_init(gtid, num, (kmp_task_red_input_t *)data);
2606 }
2607 
2608 /*!
2609 @ingroup TASKING
2610 @param gtid      Global thread ID
2611 @param num       Number of data items to reduce
2612 @param data      Array of data for reduction
2613 @return The taskgroup identifier
2614 
2615 Initialize task reduction for the taskgroup.
2616 
2617 Note: this entry supposes the optional compiler-generated initializer routine
2618 has two parameters, pointer to object to be initialized and pointer to omp_orig
2619 */
2620 void *__kmpc_taskred_init(int gtid, int num, void *data) {
2621 #if OMPX_TASKGRAPH
2622   kmp_tdg_info_t *tdg = __kmp_find_tdg(__kmp_curr_tdg_idx);
2623   if (tdg && __kmp_tdg_is_recording(tdg->tdg_status)) {
2624     kmp_tdg_info_t *this_tdg = __kmp_global_tdgs[__kmp_curr_tdg_idx];
2625     this_tdg->rec_taskred_data =
2626         __kmp_allocate(sizeof(kmp_task_red_input_t) * num);
2627     this_tdg->rec_num_taskred = num;
2628     KMP_MEMCPY(this_tdg->rec_taskred_data, data,
2629                sizeof(kmp_task_red_input_t) * num);
2630   }
2631 #endif
2632   return __kmp_task_reduction_init(gtid, num, (kmp_taskred_input_t *)data);
2633 }
2634 
2635 // Copy task reduction data (except for shared pointers).
2636 template <typename T>
2637 void __kmp_task_reduction_init_copy(kmp_info_t *thr, int num, T *data,
2638                                     kmp_taskgroup_t *tg, void *reduce_data) {
2639   kmp_taskred_data_t *arr;
2640   KA_TRACE(20, ("__kmp_task_reduction_init_copy: Th %p, init taskgroup %p,"
2641                 " from data %p\n",
2642                 thr, tg, reduce_data));
2643   arr = (kmp_taskred_data_t *)__kmp_thread_malloc(
2644       thr, num * sizeof(kmp_taskred_data_t));
2645   // threads will share private copies, thunk routines, sizes, flags, etc.:
2646   KMP_MEMCPY(arr, reduce_data, num * sizeof(kmp_taskred_data_t));
2647   for (int i = 0; i < num; ++i) {
2648     arr[i].reduce_shar = data[i].reduce_shar; // init unique shared pointers
2649   }
2650   tg->reduce_data = (void *)arr;
2651   tg->reduce_num_data = num;
2652 }
2653 
2654 /*!
2655 @ingroup TASKING
2656 @param gtid    Global thread ID
2657 @param tskgrp  The taskgroup ID (optional)
2658 @param data    Shared location of the item
2659 @return The pointer to per-thread data
2660 
2661 Get thread-specific location of data item
2662 */
2663 void *__kmpc_task_reduction_get_th_data(int gtid, void *tskgrp, void *data) {
2664   __kmp_assert_valid_gtid(gtid);
2665   kmp_info_t *thread = __kmp_threads[gtid];
2666   kmp_int32 nth = thread->th.th_team_nproc;
2667   if (nth == 1)
2668     return data; // nothing to do
2669 
2670   kmp_taskgroup_t *tg = (kmp_taskgroup_t *)tskgrp;
2671   if (tg == NULL)
2672     tg = thread->th.th_current_task->td_taskgroup;
2673   KMP_ASSERT(tg != NULL);
2674   kmp_taskred_data_t *arr;
2675   kmp_int32 num;
2676   kmp_int32 tid = thread->th.th_info.ds.ds_tid;
2677 
2678 #if OMPX_TASKGRAPH
2679   if ((thread->th.th_current_task->is_taskgraph) &&
2680       (!__kmp_tdg_is_recording(
2681           __kmp_global_tdgs[__kmp_curr_tdg_idx]->tdg_status))) {
2682     tg = thread->th.th_current_task->td_taskgroup;
2683     KMP_ASSERT(tg != NULL);
2684     KMP_ASSERT(tg->reduce_data != NULL);
2685     arr = (kmp_taskred_data_t *)(tg->reduce_data);
2686     num = tg->reduce_num_data;
2687   }
2688 #endif
2689 
2690   KMP_ASSERT(data != NULL);
2691   while (tg != NULL) {
2692     arr = (kmp_taskred_data_t *)(tg->reduce_data);
2693     num = tg->reduce_num_data;
2694     for (int i = 0; i < num; ++i) {
2695       if (!arr[i].flags.lazy_priv) {
2696         if (data == arr[i].reduce_shar ||
2697             (data >= arr[i].reduce_priv && data < arr[i].reduce_pend))
2698           return (char *)(arr[i].reduce_priv) + tid * arr[i].reduce_size;
2699       } else {
2700         // check shared location first
2701         void **p_priv = (void **)(arr[i].reduce_priv);
2702         if (data == arr[i].reduce_shar)
2703           goto found;
2704         // check if we get some thread specific location as parameter
2705         for (int j = 0; j < nth; ++j)
2706           if (data == p_priv[j])
2707             goto found;
2708         continue; // not found, continue search
2709       found:
2710         if (p_priv[tid] == NULL) {
2711           // allocate thread specific object lazily
2712           p_priv[tid] = __kmp_allocate(arr[i].reduce_size);
2713           if (arr[i].reduce_init != NULL) {
2714             if (arr[i].reduce_orig != NULL) { // new interface
2715               ((void (*)(void *, void *))arr[i].reduce_init)(
2716                   p_priv[tid], arr[i].reduce_orig);
2717             } else { // old interface (single parameter)
2718               ((void (*)(void *))arr[i].reduce_init)(p_priv[tid]);
2719             }
2720           }
2721         }
2722         return p_priv[tid];
2723       }
2724     }
2725     KMP_ASSERT(tg->parent);
2726     tg = tg->parent;
2727   }
2728   KMP_ASSERT2(0, "Unknown task reduction item");
2729   return NULL; // ERROR, this line never executed
2730 }
2731 
2732 // Finalize task reduction.
2733 // Called from __kmpc_end_taskgroup()
2734 static void __kmp_task_reduction_fini(kmp_info_t *th, kmp_taskgroup_t *tg) {
2735   kmp_int32 nth = th->th.th_team_nproc;
2736   KMP_DEBUG_ASSERT(
2737       nth > 1 ||
2738       __kmp_enable_hidden_helper); // should not be called if nth == 1 unless we
2739                                    // are using hidden helper threads
2740   kmp_taskred_data_t *arr = (kmp_taskred_data_t *)tg->reduce_data;
2741   kmp_int32 num = tg->reduce_num_data;
2742   for (int i = 0; i < num; ++i) {
2743     void *sh_data = arr[i].reduce_shar;
2744     void (*f_fini)(void *) = (void (*)(void *))(arr[i].reduce_fini);
2745     void (*f_comb)(void *, void *) =
2746         (void (*)(void *, void *))(arr[i].reduce_comb);
2747     if (!arr[i].flags.lazy_priv) {
2748       void *pr_data = arr[i].reduce_priv;
2749       size_t size = arr[i].reduce_size;
2750       for (int j = 0; j < nth; ++j) {
2751         void *priv_data = (char *)pr_data + j * size;
2752         f_comb(sh_data, priv_data); // combine results
2753         if (f_fini)
2754           f_fini(priv_data); // finalize if needed
2755       }
2756     } else {
2757       void **pr_data = (void **)(arr[i].reduce_priv);
2758       for (int j = 0; j < nth; ++j) {
2759         if (pr_data[j] != NULL) {
2760           f_comb(sh_data, pr_data[j]); // combine results
2761           if (f_fini)
2762             f_fini(pr_data[j]); // finalize if needed
2763           __kmp_free(pr_data[j]);
2764         }
2765       }
2766     }
2767     __kmp_free(arr[i].reduce_priv);
2768   }
2769   __kmp_thread_free(th, arr);
2770   tg->reduce_data = NULL;
2771   tg->reduce_num_data = 0;
2772 }
2773 
2774 // Cleanup task reduction data for parallel or worksharing,
2775 // do not touch task private data other threads still working with.
2776 // Called from __kmpc_end_taskgroup()
2777 static void __kmp_task_reduction_clean(kmp_info_t *th, kmp_taskgroup_t *tg) {
2778   __kmp_thread_free(th, tg->reduce_data);
2779   tg->reduce_data = NULL;
2780   tg->reduce_num_data = 0;
2781 }
2782 
2783 template <typename T>
2784 void *__kmp_task_reduction_modifier_init(ident_t *loc, int gtid, int is_ws,
2785                                          int num, T *data) {
2786   __kmp_assert_valid_gtid(gtid);
2787   kmp_info_t *thr = __kmp_threads[gtid];
2788   kmp_int32 nth = thr->th.th_team_nproc;
2789   __kmpc_taskgroup(loc, gtid); // form new taskgroup first
2790   if (nth == 1) {
2791     KA_TRACE(10,
2792              ("__kmpc_reduction_modifier_init: T#%d, tg %p, exiting nth=1\n",
2793               gtid, thr->th.th_current_task->td_taskgroup));
2794     return (void *)thr->th.th_current_task->td_taskgroup;
2795   }
2796   kmp_team_t *team = thr->th.th_team;
2797   void *reduce_data;
2798   kmp_taskgroup_t *tg;
2799   reduce_data = KMP_ATOMIC_LD_RLX(&team->t.t_tg_reduce_data[is_ws]);
2800   if (reduce_data == NULL &&
2801       __kmp_atomic_compare_store(&team->t.t_tg_reduce_data[is_ws], reduce_data,
2802                                  (void *)1)) {
2803     // single thread enters this block to initialize common reduction data
2804     KMP_DEBUG_ASSERT(reduce_data == NULL);
2805     // first initialize own data, then make a copy other threads can use
2806     tg = (kmp_taskgroup_t *)__kmp_task_reduction_init<T>(gtid, num, data);
2807     reduce_data = __kmp_thread_malloc(thr, num * sizeof(kmp_taskred_data_t));
2808     KMP_MEMCPY(reduce_data, tg->reduce_data, num * sizeof(kmp_taskred_data_t));
2809     // fini counters should be 0 at this point
2810     KMP_DEBUG_ASSERT(KMP_ATOMIC_LD_RLX(&team->t.t_tg_fini_counter[0]) == 0);
2811     KMP_DEBUG_ASSERT(KMP_ATOMIC_LD_RLX(&team->t.t_tg_fini_counter[1]) == 0);
2812     KMP_ATOMIC_ST_REL(&team->t.t_tg_reduce_data[is_ws], reduce_data);
2813   } else {
2814     while (
2815         (reduce_data = KMP_ATOMIC_LD_ACQ(&team->t.t_tg_reduce_data[is_ws])) ==
2816         (void *)1) { // wait for task reduction initialization
2817       KMP_CPU_PAUSE();
2818     }
2819     KMP_DEBUG_ASSERT(reduce_data > (void *)1); // should be valid pointer here
2820     tg = thr->th.th_current_task->td_taskgroup;
2821     __kmp_task_reduction_init_copy<T>(thr, num, data, tg, reduce_data);
2822   }
2823   return tg;
2824 }
2825 
2826 /*!
2827 @ingroup TASKING
2828 @param loc       Source location info
2829 @param gtid      Global thread ID
2830 @param is_ws     Is 1 if the reduction is for worksharing, 0 otherwise
2831 @param num       Number of data items to reduce
2832 @param data      Array of data for reduction
2833 @return The taskgroup identifier
2834 
2835 Initialize task reduction for a parallel or worksharing.
2836 
2837 Note: this entry supposes the optional compiler-generated initializer routine
2838 has single parameter - pointer to object to be initialized. That means
2839 the reduction either does not use omp_orig object, or the omp_orig is accessible
2840 without help of the runtime library.
2841 */
2842 void *__kmpc_task_reduction_modifier_init(ident_t *loc, int gtid, int is_ws,
2843                                           int num, void *data) {
2844   return __kmp_task_reduction_modifier_init(loc, gtid, is_ws, num,
2845                                             (kmp_task_red_input_t *)data);
2846 }
2847 
2848 /*!
2849 @ingroup TASKING
2850 @param loc       Source location info
2851 @param gtid      Global thread ID
2852 @param is_ws     Is 1 if the reduction is for worksharing, 0 otherwise
2853 @param num       Number of data items to reduce
2854 @param data      Array of data for reduction
2855 @return The taskgroup identifier
2856 
2857 Initialize task reduction for a parallel or worksharing.
2858 
2859 Note: this entry supposes the optional compiler-generated initializer routine
2860 has two parameters, pointer to object to be initialized and pointer to omp_orig
2861 */
2862 void *__kmpc_taskred_modifier_init(ident_t *loc, int gtid, int is_ws, int num,
2863                                    void *data) {
2864   return __kmp_task_reduction_modifier_init(loc, gtid, is_ws, num,
2865                                             (kmp_taskred_input_t *)data);
2866 }
2867 
2868 /*!
2869 @ingroup TASKING
2870 @param loc       Source location info
2871 @param gtid      Global thread ID
2872 @param is_ws     Is 1 if the reduction is for worksharing, 0 otherwise
2873 
2874 Finalize task reduction for a parallel or worksharing.
2875 */
2876 void __kmpc_task_reduction_modifier_fini(ident_t *loc, int gtid, int is_ws) {
2877   __kmpc_end_taskgroup(loc, gtid);
2878 }
2879 
2880 // __kmpc_taskgroup: Start a new taskgroup
2881 void __kmpc_taskgroup(ident_t *loc, int gtid) {
2882   __kmp_assert_valid_gtid(gtid);
2883   kmp_info_t *thread = __kmp_threads[gtid];
2884   kmp_taskdata_t *taskdata = thread->th.th_current_task;
2885   kmp_taskgroup_t *tg_new =
2886       (kmp_taskgroup_t *)__kmp_thread_malloc(thread, sizeof(kmp_taskgroup_t));
2887   KA_TRACE(10, ("__kmpc_taskgroup: T#%d loc=%p group=%p\n", gtid, loc, tg_new));
2888   KMP_ATOMIC_ST_RLX(&tg_new->count, 0);
2889   KMP_ATOMIC_ST_RLX(&tg_new->cancel_request, cancel_noreq);
2890   tg_new->parent = taskdata->td_taskgroup;
2891   tg_new->reduce_data = NULL;
2892   tg_new->reduce_num_data = 0;
2893   tg_new->gomp_data = NULL;
2894   taskdata->td_taskgroup = tg_new;
2895 
2896 #if OMPT_SUPPORT && OMPT_OPTIONAL
2897   if (UNLIKELY(ompt_enabled.ompt_callback_sync_region)) {
2898     void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2899     if (!codeptr)
2900       codeptr = OMPT_GET_RETURN_ADDRESS(0);
2901     kmp_team_t *team = thread->th.th_team;
2902     ompt_data_t my_task_data = taskdata->ompt_task_info.task_data;
2903     // FIXME: I think this is wrong for lwt!
2904     ompt_data_t my_parallel_data = team->t.ompt_team_info.parallel_data;
2905 
2906     ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
2907         ompt_sync_region_taskgroup, ompt_scope_begin, &(my_parallel_data),
2908         &(my_task_data), codeptr);
2909   }
2910 #endif
2911 }
2912 
2913 // __kmpc_end_taskgroup: Wait until all tasks generated by the current task
2914 //                       and its descendants are complete
2915 void __kmpc_end_taskgroup(ident_t *loc, int gtid) {
2916   __kmp_assert_valid_gtid(gtid);
2917   kmp_info_t *thread = __kmp_threads[gtid];
2918   kmp_taskdata_t *taskdata = thread->th.th_current_task;
2919   kmp_taskgroup_t *taskgroup = taskdata->td_taskgroup;
2920   int thread_finished = FALSE;
2921 
2922 #if OMPT_SUPPORT && OMPT_OPTIONAL
2923   kmp_team_t *team;
2924   ompt_data_t my_task_data;
2925   ompt_data_t my_parallel_data;
2926   void *codeptr = nullptr;
2927   if (UNLIKELY(ompt_enabled.enabled)) {
2928     team = thread->th.th_team;
2929     my_task_data = taskdata->ompt_task_info.task_data;
2930     // FIXME: I think this is wrong for lwt!
2931     my_parallel_data = team->t.ompt_team_info.parallel_data;
2932     codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2933     if (!codeptr)
2934       codeptr = OMPT_GET_RETURN_ADDRESS(0);
2935   }
2936 #endif
2937 
2938   KA_TRACE(10, ("__kmpc_end_taskgroup(enter): T#%d loc=%p\n", gtid, loc));
2939   KMP_DEBUG_ASSERT(taskgroup != NULL);
2940   KMP_SET_THREAD_STATE_BLOCK(TASKGROUP);
2941 
2942   if (__kmp_tasking_mode != tskm_immediate_exec) {
2943     // mark task as waiting not on a barrier
2944     taskdata->td_taskwait_counter += 1;
2945     taskdata->td_taskwait_ident = loc;
2946     taskdata->td_taskwait_thread = gtid + 1;
2947 #if USE_ITT_BUILD
2948     // For ITT the taskgroup wait is similar to taskwait until we need to
2949     // distinguish them
2950     void *itt_sync_obj = NULL;
2951 #if USE_ITT_NOTIFY
2952     KMP_ITT_TASKWAIT_STARTING(itt_sync_obj);
2953 #endif /* USE_ITT_NOTIFY */
2954 #endif /* USE_ITT_BUILD */
2955 
2956 #if OMPT_SUPPORT && OMPT_OPTIONAL
2957     if (UNLIKELY(ompt_enabled.ompt_callback_sync_region_wait)) {
2958       ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
2959           ompt_sync_region_taskgroup, ompt_scope_begin, &(my_parallel_data),
2960           &(my_task_data), codeptr);
2961     }
2962 #endif
2963 
2964     if (!taskdata->td_flags.team_serial ||
2965         (thread->th.th_task_team != NULL &&
2966          (thread->th.th_task_team->tt.tt_found_proxy_tasks ||
2967           thread->th.th_task_team->tt.tt_hidden_helper_task_encountered))) {
2968       kmp_flag_32<false, false> flag(
2969           RCAST(std::atomic<kmp_uint32> *, &(taskgroup->count)), 0U);
2970       while (KMP_ATOMIC_LD_ACQ(&taskgroup->count) != 0) {
2971         flag.execute_tasks(thread, gtid, FALSE,
2972                            &thread_finished USE_ITT_BUILD_ARG(itt_sync_obj),
2973                            __kmp_task_stealing_constraint);
2974       }
2975     }
2976     taskdata->td_taskwait_thread = -taskdata->td_taskwait_thread; // end waiting
2977 
2978 #if OMPT_SUPPORT && OMPT_OPTIONAL
2979     if (UNLIKELY(ompt_enabled.ompt_callback_sync_region_wait)) {
2980       ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
2981           ompt_sync_region_taskgroup, ompt_scope_end, &(my_parallel_data),
2982           &(my_task_data), codeptr);
2983     }
2984 #endif
2985 
2986 #if USE_ITT_BUILD
2987     KMP_ITT_TASKWAIT_FINISHED(itt_sync_obj);
2988     KMP_FSYNC_ACQUIRED(taskdata); // acquire self - sync with descendants
2989 #endif /* USE_ITT_BUILD */
2990   }
2991   KMP_DEBUG_ASSERT(taskgroup->count == 0);
2992 
2993   if (taskgroup->reduce_data != NULL &&
2994       !taskgroup->gomp_data) { // need to reduce?
2995     int cnt;
2996     void *reduce_data;
2997     kmp_team_t *t = thread->th.th_team;
2998     kmp_taskred_data_t *arr = (kmp_taskred_data_t *)taskgroup->reduce_data;
2999     // check if <priv> data of the first reduction variable shared for the team
3000     void *priv0 = arr[0].reduce_priv;
3001     if ((reduce_data = KMP_ATOMIC_LD_ACQ(&t->t.t_tg_reduce_data[0])) != NULL &&
3002         ((kmp_taskred_data_t *)reduce_data)[0].reduce_priv == priv0) {
3003       // finishing task reduction on parallel
3004       cnt = KMP_ATOMIC_INC(&t->t.t_tg_fini_counter[0]);
3005       if (cnt == thread->th.th_team_nproc - 1) {
3006         // we are the last thread passing __kmpc_reduction_modifier_fini()
3007         // finalize task reduction:
3008         __kmp_task_reduction_fini(thread, taskgroup);
3009         // cleanup fields in the team structure:
3010         // TODO: is relaxed store enough here (whole barrier should follow)?
3011         __kmp_thread_free(thread, reduce_data);
3012         KMP_ATOMIC_ST_REL(&t->t.t_tg_reduce_data[0], NULL);
3013         KMP_ATOMIC_ST_REL(&t->t.t_tg_fini_counter[0], 0);
3014       } else {
3015         // we are not the last thread passing __kmpc_reduction_modifier_fini(),
3016         // so do not finalize reduction, just clean own copy of the data
3017         __kmp_task_reduction_clean(thread, taskgroup);
3018       }
3019     } else if ((reduce_data = KMP_ATOMIC_LD_ACQ(&t->t.t_tg_reduce_data[1])) !=
3020                    NULL &&
3021                ((kmp_taskred_data_t *)reduce_data)[0].reduce_priv == priv0) {
3022       // finishing task reduction on worksharing
3023       cnt = KMP_ATOMIC_INC(&t->t.t_tg_fini_counter[1]);
3024       if (cnt == thread->th.th_team_nproc - 1) {
3025         // we are the last thread passing __kmpc_reduction_modifier_fini()
3026         __kmp_task_reduction_fini(thread, taskgroup);
3027         // cleanup fields in team structure:
3028         // TODO: is relaxed store enough here (whole barrier should follow)?
3029         __kmp_thread_free(thread, reduce_data);
3030         KMP_ATOMIC_ST_REL(&t->t.t_tg_reduce_data[1], NULL);
3031         KMP_ATOMIC_ST_REL(&t->t.t_tg_fini_counter[1], 0);
3032       } else {
3033         // we are not the last thread passing __kmpc_reduction_modifier_fini(),
3034         // so do not finalize reduction, just clean own copy of the data
3035         __kmp_task_reduction_clean(thread, taskgroup);
3036       }
3037     } else {
3038       // finishing task reduction on taskgroup
3039       __kmp_task_reduction_fini(thread, taskgroup);
3040     }
3041   }
3042   // Restore parent taskgroup for the current task
3043   taskdata->td_taskgroup = taskgroup->parent;
3044   __kmp_thread_free(thread, taskgroup);
3045 
3046   KA_TRACE(10, ("__kmpc_end_taskgroup(exit): T#%d task %p finished waiting\n",
3047                 gtid, taskdata));
3048 
3049 #if OMPT_SUPPORT && OMPT_OPTIONAL
3050   if (UNLIKELY(ompt_enabled.ompt_callback_sync_region)) {
3051     ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
3052         ompt_sync_region_taskgroup, ompt_scope_end, &(my_parallel_data),
3053         &(my_task_data), codeptr);
3054   }
3055 #endif
3056 }
3057 
3058 static kmp_task_t *__kmp_get_priority_task(kmp_int32 gtid,
3059                                            kmp_task_team_t *task_team,
3060                                            kmp_int32 is_constrained) {
3061   kmp_task_t *task = NULL;
3062   kmp_taskdata_t *taskdata;
3063   kmp_taskdata_t *current;
3064   kmp_thread_data_t *thread_data;
3065   int ntasks = task_team->tt.tt_num_task_pri;
3066   if (ntasks == 0) {
3067     KA_TRACE(
3068         20, ("__kmp_get_priority_task(exit #1): T#%d No tasks to get\n", gtid));
3069     return NULL;
3070   }
3071   do {
3072     // decrement num_tasks to "reserve" one task to get for execution
3073     if (__kmp_atomic_compare_store(&task_team->tt.tt_num_task_pri, ntasks,
3074                                    ntasks - 1))
3075       break;
3076     ntasks = task_team->tt.tt_num_task_pri;
3077   } while (ntasks > 0);
3078   if (ntasks == 0) {
3079     KA_TRACE(20, ("__kmp_get_priority_task(exit #2): T#%d No tasks to get\n",
3080                   __kmp_get_gtid()));
3081     return NULL;
3082   }
3083   // We got a "ticket" to get a "reserved" priority task
3084   int deque_ntasks;
3085   kmp_task_pri_t *list = task_team->tt.tt_task_pri_list;
3086   do {
3087     KMP_ASSERT(list != NULL);
3088     thread_data = &list->td;
3089     __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);
3090     deque_ntasks = thread_data->td.td_deque_ntasks;
3091     if (deque_ntasks == 0) {
3092       __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
3093       KA_TRACE(20, ("__kmp_get_priority_task: T#%d No tasks to get from %p\n",
3094                     __kmp_get_gtid(), thread_data));
3095       list = list->next;
3096     }
3097   } while (deque_ntasks == 0);
3098   KMP_DEBUG_ASSERT(deque_ntasks);
3099   int target = thread_data->td.td_deque_head;
3100   current = __kmp_threads[gtid]->th.th_current_task;
3101   taskdata = thread_data->td.td_deque[target];
3102   if (__kmp_task_is_allowed(gtid, is_constrained, taskdata, current)) {
3103     // Bump head pointer and Wrap.
3104     thread_data->td.td_deque_head =
3105         (target + 1) & TASK_DEQUE_MASK(thread_data->td);
3106   } else {
3107     if (!task_team->tt.tt_untied_task_encountered) {
3108       // The TSC does not allow to steal victim task
3109       __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
3110       KA_TRACE(20, ("__kmp_get_priority_task(exit #3): T#%d could not get task "
3111                     "from %p: task_team=%p ntasks=%d head=%u tail=%u\n",
3112                     gtid, thread_data, task_team, deque_ntasks, target,
3113                     thread_data->td.td_deque_tail));
3114       task_team->tt.tt_num_task_pri++; // atomic inc, restore value
3115       return NULL;
3116     }
3117     int i;
3118     // walk through the deque trying to steal any task
3119     taskdata = NULL;
3120     for (i = 1; i < deque_ntasks; ++i) {
3121       target = (target + 1) & TASK_DEQUE_MASK(thread_data->td);
3122       taskdata = thread_data->td.td_deque[target];
3123       if (__kmp_task_is_allowed(gtid, is_constrained, taskdata, current)) {
3124         break; // found task to execute
3125       } else {
3126         taskdata = NULL;
3127       }
3128     }
3129     if (taskdata == NULL) {
3130       // No appropriate candidate found to execute
3131       __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
3132       KA_TRACE(
3133           10, ("__kmp_get_priority_task(exit #4): T#%d could not get task from "
3134                "%p: task_team=%p ntasks=%d head=%u tail=%u\n",
3135                gtid, thread_data, task_team, deque_ntasks,
3136                thread_data->td.td_deque_head, thread_data->td.td_deque_tail));
3137       task_team->tt.tt_num_task_pri++; // atomic inc, restore value
3138       return NULL;
3139     }
3140     int prev = target;
3141     for (i = i + 1; i < deque_ntasks; ++i) {
3142       // shift remaining tasks in the deque left by 1
3143       target = (target + 1) & TASK_DEQUE_MASK(thread_data->td);
3144       thread_data->td.td_deque[prev] = thread_data->td.td_deque[target];
3145       prev = target;
3146     }
3147     KMP_DEBUG_ASSERT(
3148         thread_data->td.td_deque_tail ==
3149         (kmp_uint32)((target + 1) & TASK_DEQUE_MASK(thread_data->td)));
3150     thread_data->td.td_deque_tail = target; // tail -= 1 (wrapped))
3151   }
3152   thread_data->td.td_deque_ntasks = deque_ntasks - 1;
3153   __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
3154   task = KMP_TASKDATA_TO_TASK(taskdata);
3155   return task;
3156 }
3157 
3158 // __kmp_remove_my_task: remove a task from my own deque
3159 static kmp_task_t *__kmp_remove_my_task(kmp_info_t *thread, kmp_int32 gtid,
3160                                         kmp_task_team_t *task_team,
3161                                         kmp_int32 is_constrained) {
3162   kmp_task_t *task;
3163   kmp_taskdata_t *taskdata;
3164   kmp_thread_data_t *thread_data;
3165   kmp_uint32 tail;
3166 
3167   KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);
3168   KMP_DEBUG_ASSERT(task_team->tt.tt_threads_data !=
3169                    NULL); // Caller should check this condition
3170 
3171   thread_data = &task_team->tt.tt_threads_data[__kmp_tid_from_gtid(gtid)];
3172 
3173   KA_TRACE(10, ("__kmp_remove_my_task(enter): T#%d ntasks=%d head=%u tail=%u\n",
3174                 gtid, thread_data->td.td_deque_ntasks,
3175                 thread_data->td.td_deque_head, thread_data->td.td_deque_tail));
3176 
3177   if (TCR_4(thread_data->td.td_deque_ntasks) == 0) {
3178     KA_TRACE(10,
3179              ("__kmp_remove_my_task(exit #1): T#%d No tasks to remove: "
3180               "ntasks=%d head=%u tail=%u\n",
3181               gtid, thread_data->td.td_deque_ntasks,
3182               thread_data->td.td_deque_head, thread_data->td.td_deque_tail));
3183     return NULL;
3184   }
3185 
3186   __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);
3187 
3188   if (TCR_4(thread_data->td.td_deque_ntasks) == 0) {
3189     __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
3190     KA_TRACE(10,
3191              ("__kmp_remove_my_task(exit #2): T#%d No tasks to remove: "
3192               "ntasks=%d head=%u tail=%u\n",
3193               gtid, thread_data->td.td_deque_ntasks,
3194               thread_data->td.td_deque_head, thread_data->td.td_deque_tail));
3195     return NULL;
3196   }
3197 
3198   tail = (thread_data->td.td_deque_tail - 1) &
3199          TASK_DEQUE_MASK(thread_data->td); // Wrap index.
3200   taskdata = thread_data->td.td_deque[tail];
3201 
3202   if (!__kmp_task_is_allowed(gtid, is_constrained, taskdata,
3203                              thread->th.th_current_task)) {
3204     // The TSC does not allow to steal victim task
3205     __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
3206     KA_TRACE(10,
3207              ("__kmp_remove_my_task(exit #3): T#%d TSC blocks tail task: "
3208               "ntasks=%d head=%u tail=%u\n",
3209               gtid, thread_data->td.td_deque_ntasks,
3210               thread_data->td.td_deque_head, thread_data->td.td_deque_tail));
3211     return NULL;
3212   }
3213 
3214   thread_data->td.td_deque_tail = tail;
3215   TCW_4(thread_data->td.td_deque_ntasks, thread_data->td.td_deque_ntasks - 1);
3216 
3217   __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
3218 
3219   KA_TRACE(10, ("__kmp_remove_my_task(exit #4): T#%d task %p removed: "
3220                 "ntasks=%d head=%u tail=%u\n",
3221                 gtid, taskdata, thread_data->td.td_deque_ntasks,
3222                 thread_data->td.td_deque_head, thread_data->td.td_deque_tail));
3223 
3224   task = KMP_TASKDATA_TO_TASK(taskdata);
3225   return task;
3226 }
3227 
3228 // __kmp_steal_task: remove a task from another thread's deque
3229 // Assume that calling thread has already checked existence of
3230 // task_team thread_data before calling this routine.
3231 static kmp_task_t *__kmp_steal_task(kmp_int32 victim_tid, kmp_int32 gtid,
3232                                     kmp_task_team_t *task_team,
3233                                     std::atomic<kmp_int32> *unfinished_threads,
3234                                     int *thread_finished,
3235                                     kmp_int32 is_constrained) {
3236   kmp_task_t *task;
3237   kmp_taskdata_t *taskdata;
3238   kmp_taskdata_t *current;
3239   kmp_thread_data_t *victim_td, *threads_data;
3240   kmp_int32 target;
3241   kmp_info_t *victim_thr;
3242 
3243   KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);
3244 
3245   threads_data = task_team->tt.tt_threads_data;
3246   KMP_DEBUG_ASSERT(threads_data != NULL); // Caller should check this condition
3247   KMP_DEBUG_ASSERT(victim_tid >= 0);
3248   KMP_DEBUG_ASSERT(victim_tid < task_team->tt.tt_nproc);
3249 
3250   victim_td = &threads_data[victim_tid];
3251   victim_thr = victim_td->td.td_thr;
3252   (void)victim_thr; // Use in TRACE messages which aren't always enabled.
3253 
3254   KA_TRACE(10, ("__kmp_steal_task(enter): T#%d try to steal from T#%d: "
3255                 "task_team=%p ntasks=%d head=%u tail=%u\n",
3256                 gtid, __kmp_gtid_from_thread(victim_thr), task_team,
3257                 victim_td->td.td_deque_ntasks, victim_td->td.td_deque_head,
3258                 victim_td->td.td_deque_tail));
3259 
3260   if (TCR_4(victim_td->td.td_deque_ntasks) == 0) {
3261     KA_TRACE(10, ("__kmp_steal_task(exit #1): T#%d could not steal from T#%d: "
3262                   "task_team=%p ntasks=%d head=%u tail=%u\n",
3263                   gtid, __kmp_gtid_from_thread(victim_thr), task_team,
3264                   victim_td->td.td_deque_ntasks, victim_td->td.td_deque_head,
3265                   victim_td->td.td_deque_tail));
3266     return NULL;
3267   }
3268 
3269   __kmp_acquire_bootstrap_lock(&victim_td->td.td_deque_lock);
3270 
3271   int ntasks = TCR_4(victim_td->td.td_deque_ntasks);
3272   // Check again after we acquire the lock
3273   if (ntasks == 0) {
3274     __kmp_release_bootstrap_lock(&victim_td->td.td_deque_lock);
3275     KA_TRACE(10, ("__kmp_steal_task(exit #2): T#%d could not steal from T#%d: "
3276                   "task_team=%p ntasks=%d head=%u tail=%u\n",
3277                   gtid, __kmp_gtid_from_thread(victim_thr), task_team, ntasks,
3278                   victim_td->td.td_deque_head, victim_td->td.td_deque_tail));
3279     return NULL;
3280   }
3281 
3282   KMP_DEBUG_ASSERT(victim_td->td.td_deque != NULL);
3283   current = __kmp_threads[gtid]->th.th_current_task;
3284   taskdata = victim_td->td.td_deque[victim_td->td.td_deque_head];
3285   if (__kmp_task_is_allowed(gtid, is_constrained, taskdata, current)) {
3286     // Bump head pointer and Wrap.
3287     victim_td->td.td_deque_head =
3288         (victim_td->td.td_deque_head + 1) & TASK_DEQUE_MASK(victim_td->td);
3289   } else {
3290     if (!task_team->tt.tt_untied_task_encountered) {
3291       // The TSC does not allow to steal victim task
3292       __kmp_release_bootstrap_lock(&victim_td->td.td_deque_lock);
3293       KA_TRACE(10, ("__kmp_steal_task(exit #3): T#%d could not steal from "
3294                     "T#%d: task_team=%p ntasks=%d head=%u tail=%u\n",
3295                     gtid, __kmp_gtid_from_thread(victim_thr), task_team, ntasks,
3296                     victim_td->td.td_deque_head, victim_td->td.td_deque_tail));
3297       return NULL;
3298     }
3299     int i;
3300     // walk through victim's deque trying to steal any task
3301     target = victim_td->td.td_deque_head;
3302     taskdata = NULL;
3303     for (i = 1; i < ntasks; ++i) {
3304       target = (target + 1) & TASK_DEQUE_MASK(victim_td->td);
3305       taskdata = victim_td->td.td_deque[target];
3306       if (__kmp_task_is_allowed(gtid, is_constrained, taskdata, current)) {
3307         break; // found victim task
3308       } else {
3309         taskdata = NULL;
3310       }
3311     }
3312     if (taskdata == NULL) {
3313       // No appropriate candidate to steal found
3314       __kmp_release_bootstrap_lock(&victim_td->td.td_deque_lock);
3315       KA_TRACE(10, ("__kmp_steal_task(exit #4): T#%d could not steal from "
3316                     "T#%d: task_team=%p ntasks=%d head=%u tail=%u\n",
3317                     gtid, __kmp_gtid_from_thread(victim_thr), task_team, ntasks,
3318                     victim_td->td.td_deque_head, victim_td->td.td_deque_tail));
3319       return NULL;
3320     }
3321     int prev = target;
3322     for (i = i + 1; i < ntasks; ++i) {
3323       // shift remaining tasks in the deque left by 1
3324       target = (target + 1) & TASK_DEQUE_MASK(victim_td->td);
3325       victim_td->td.td_deque[prev] = victim_td->td.td_deque[target];
3326       prev = target;
3327     }
3328     KMP_DEBUG_ASSERT(
3329         victim_td->td.td_deque_tail ==
3330         (kmp_uint32)((target + 1) & TASK_DEQUE_MASK(victim_td->td)));
3331     victim_td->td.td_deque_tail = target; // tail -= 1 (wrapped))
3332   }
3333   if (*thread_finished) {
3334     // We need to un-mark this victim as a finished victim.  This must be done
3335     // before releasing the lock, or else other threads (starting with the
3336     // primary thread victim) might be prematurely released from the barrier!!!
3337 #if KMP_DEBUG
3338     kmp_int32 count =
3339 #endif
3340         KMP_ATOMIC_INC(unfinished_threads);
3341     KA_TRACE(
3342         20,
3343         ("__kmp_steal_task: T#%d inc unfinished_threads to %d: task_team=%p\n",
3344          gtid, count + 1, task_team));
3345     *thread_finished = FALSE;
3346   }
3347   TCW_4(victim_td->td.td_deque_ntasks, ntasks - 1);
3348 
3349   __kmp_release_bootstrap_lock(&victim_td->td.td_deque_lock);
3350 
3351   KMP_COUNT_BLOCK(TASK_stolen);
3352   KA_TRACE(10,
3353            ("__kmp_steal_task(exit #5): T#%d stole task %p from T#%d: "
3354             "task_team=%p ntasks=%d head=%u tail=%u\n",
3355             gtid, taskdata, __kmp_gtid_from_thread(victim_thr), task_team,
3356             ntasks, victim_td->td.td_deque_head, victim_td->td.td_deque_tail));
3357 
3358   task = KMP_TASKDATA_TO_TASK(taskdata);
3359   return task;
3360 }
3361 
3362 // __kmp_execute_tasks_template: Choose and execute tasks until either the
3363 // condition is statisfied (return true) or there are none left (return false).
3364 //
3365 // final_spin is TRUE if this is the spin at the release barrier.
3366 // thread_finished indicates whether the thread is finished executing all
3367 // the tasks it has on its deque, and is at the release barrier.
3368 // spinner is the location on which to spin.
3369 // spinner == NULL means only execute a single task and return.
3370 // checker is the value to check to terminate the spin.
3371 template <class C>
3372 static inline int __kmp_execute_tasks_template(
3373     kmp_info_t *thread, kmp_int32 gtid, C *flag, int final_spin,
3374     int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj),
3375     kmp_int32 is_constrained) {
3376   kmp_task_team_t *task_team = thread->th.th_task_team;
3377   kmp_thread_data_t *threads_data;
3378   kmp_task_t *task;
3379   kmp_info_t *other_thread;
3380   kmp_taskdata_t *current_task = thread->th.th_current_task;
3381   std::atomic<kmp_int32> *unfinished_threads;
3382   kmp_int32 nthreads, victim_tid = -2, use_own_tasks = 1, new_victim = 0,
3383                       tid = thread->th.th_info.ds.ds_tid;
3384 
3385   KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);
3386   KMP_DEBUG_ASSERT(thread == __kmp_threads[gtid]);
3387 
3388   if (task_team == NULL || current_task == NULL)
3389     return FALSE;
3390 
3391   KA_TRACE(15, ("__kmp_execute_tasks_template(enter): T#%d final_spin=%d "
3392                 "*thread_finished=%d\n",
3393                 gtid, final_spin, *thread_finished));
3394 
3395   thread->th.th_reap_state = KMP_NOT_SAFE_TO_REAP;
3396   threads_data = (kmp_thread_data_t *)TCR_PTR(task_team->tt.tt_threads_data);
3397 
3398   KMP_DEBUG_ASSERT(threads_data != NULL);
3399 
3400   nthreads = task_team->tt.tt_nproc;
3401   unfinished_threads = &(task_team->tt.tt_unfinished_threads);
3402   KMP_DEBUG_ASSERT(*unfinished_threads >= 0);
3403 
3404   while (1) { // Outer loop keeps trying to find tasks in case of single thread
3405     // getting tasks from target constructs
3406     while (1) { // Inner loop to find a task and execute it
3407       task = NULL;
3408       if (task_team->tt.tt_num_task_pri) { // get priority task first
3409         task = __kmp_get_priority_task(gtid, task_team, is_constrained);
3410       }
3411       if (task == NULL && use_own_tasks) { // check own queue next
3412         task = __kmp_remove_my_task(thread, gtid, task_team, is_constrained);
3413       }
3414       if ((task == NULL) && (nthreads > 1)) { // Steal a task finally
3415         int asleep = 1;
3416         use_own_tasks = 0;
3417         // Try to steal from the last place I stole from successfully.
3418         if (victim_tid == -2) { // haven't stolen anything yet
3419           victim_tid = threads_data[tid].td.td_deque_last_stolen;
3420           if (victim_tid !=
3421               -1) // if we have a last stolen from victim, get the thread
3422             other_thread = threads_data[victim_tid].td.td_thr;
3423         }
3424         if (victim_tid != -1) { // found last victim
3425           asleep = 0;
3426         } else if (!new_victim) { // no recent steals and we haven't already
3427           // used a new victim; select a random thread
3428           do { // Find a different thread to steal work from.
3429             // Pick a random thread. Initial plan was to cycle through all the
3430             // threads, and only return if we tried to steal from every thread,
3431             // and failed.  Arch says that's not such a great idea.
3432             victim_tid = __kmp_get_random(thread) % (nthreads - 1);
3433             if (victim_tid >= tid) {
3434               ++victim_tid; // Adjusts random distribution to exclude self
3435             }
3436             // Found a potential victim
3437             other_thread = threads_data[victim_tid].td.td_thr;
3438             // There is a slight chance that __kmp_enable_tasking() did not wake
3439             // up all threads waiting at the barrier.  If victim is sleeping,
3440             // then wake it up. Since we were going to pay the cache miss
3441             // penalty for referencing another thread's kmp_info_t struct
3442             // anyway,
3443             // the check shouldn't cost too much performance at this point. In
3444             // extra barrier mode, tasks do not sleep at the separate tasking
3445             // barrier, so this isn't a problem.
3446             asleep = 0;
3447             if ((__kmp_tasking_mode == tskm_task_teams) &&
3448                 (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) &&
3449                 (TCR_PTR(CCAST(void *, other_thread->th.th_sleep_loc)) !=
3450                  NULL)) {
3451               asleep = 1;
3452               __kmp_null_resume_wrapper(other_thread);
3453               // A sleeping thread should not have any tasks on it's queue.
3454               // There is a slight possibility that it resumes, steals a task
3455               // from another thread, which spawns more tasks, all in the time
3456               // that it takes this thread to check => don't write an assertion
3457               // that the victim's queue is empty.  Try stealing from a
3458               // different thread.
3459             }
3460           } while (asleep);
3461         }
3462 
3463         if (!asleep) {
3464           // We have a victim to try to steal from
3465           task =
3466               __kmp_steal_task(victim_tid, gtid, task_team, unfinished_threads,
3467                                thread_finished, is_constrained);
3468         }
3469         if (task != NULL) { // set last stolen to victim
3470           if (threads_data[tid].td.td_deque_last_stolen != victim_tid) {
3471             threads_data[tid].td.td_deque_last_stolen = victim_tid;
3472             // The pre-refactored code did not try more than 1 successful new
3473             // vicitm, unless the last one generated more local tasks;
3474             // new_victim keeps track of this
3475             new_victim = 1;
3476           }
3477         } else { // No tasks found; unset last_stolen
3478           KMP_CHECK_UPDATE(threads_data[tid].td.td_deque_last_stolen, -1);
3479           victim_tid = -2; // no successful victim found
3480         }
3481       }
3482 
3483       if (task == NULL)
3484         break; // break out of tasking loop
3485 
3486 // Found a task; execute it
3487 #if USE_ITT_BUILD && USE_ITT_NOTIFY
3488       if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
3489         if (itt_sync_obj == NULL) { // we are at fork barrier where we could not
3490           // get the object reliably
3491           itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
3492         }
3493         __kmp_itt_task_starting(itt_sync_obj);
3494       }
3495 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
3496       __kmp_invoke_task(gtid, task, current_task);
3497 #if USE_ITT_BUILD
3498       if (itt_sync_obj != NULL)
3499         __kmp_itt_task_finished(itt_sync_obj);
3500 #endif /* USE_ITT_BUILD */
3501       // If this thread is only partway through the barrier and the condition is
3502       // met, then return now, so that the barrier gather/release pattern can
3503       // proceed. If this thread is in the last spin loop in the barrier,
3504       // waiting to be released, we know that the termination condition will not
3505       // be satisfied, so don't waste any cycles checking it.
3506       if (flag == NULL || (!final_spin && flag->done_check())) {
3507         KA_TRACE(
3508             15,
3509             ("__kmp_execute_tasks_template: T#%d spin condition satisfied\n",
3510              gtid));
3511         return TRUE;
3512       }
3513       if (thread->th.th_task_team == NULL) {
3514         break;
3515       }
3516       KMP_YIELD(__kmp_library == library_throughput); // Yield before next task
3517       // If execution of a stolen task results in more tasks being placed on our
3518       // run queue, reset use_own_tasks
3519       if (!use_own_tasks && TCR_4(threads_data[tid].td.td_deque_ntasks) != 0) {
3520         KA_TRACE(20, ("__kmp_execute_tasks_template: T#%d stolen task spawned "
3521                       "other tasks, restart\n",
3522                       gtid));
3523         use_own_tasks = 1;
3524         new_victim = 0;
3525       }
3526     }
3527 
3528     // The task source has been exhausted. If in final spin loop of barrier,
3529     // check if termination condition is satisfied. The work queue may be empty
3530     // but there might be proxy tasks still executing.
3531     if (final_spin &&
3532         KMP_ATOMIC_LD_ACQ(&current_task->td_incomplete_child_tasks) == 0) {
3533       // First, decrement the #unfinished threads, if that has not already been
3534       // done.  This decrement might be to the spin location, and result in the
3535       // termination condition being satisfied.
3536       if (!*thread_finished) {
3537 #if KMP_DEBUG
3538         kmp_int32 count = -1 +
3539 #endif
3540             KMP_ATOMIC_DEC(unfinished_threads);
3541         KA_TRACE(20, ("__kmp_execute_tasks_template: T#%d dec "
3542                       "unfinished_threads to %d task_team=%p\n",
3543                       gtid, count, task_team));
3544         *thread_finished = TRUE;
3545       }
3546 
3547       // It is now unsafe to reference thread->th.th_team !!!
3548       // Decrementing task_team->tt.tt_unfinished_threads can allow the primary
3549       // thread to pass through the barrier, where it might reset each thread's
3550       // th.th_team field for the next parallel region. If we can steal more
3551       // work, we know that this has not happened yet.
3552       if (flag != NULL && flag->done_check()) {
3553         KA_TRACE(
3554             15,
3555             ("__kmp_execute_tasks_template: T#%d spin condition satisfied\n",
3556              gtid));
3557         return TRUE;
3558       }
3559     }
3560 
3561     // If this thread's task team is NULL, primary thread has recognized that
3562     // there are no more tasks; bail out
3563     if (thread->th.th_task_team == NULL) {
3564       KA_TRACE(15,
3565                ("__kmp_execute_tasks_template: T#%d no more tasks\n", gtid));
3566       return FALSE;
3567     }
3568 
3569     // Check the flag again to see if it has already done in case to be trapped
3570     // into infinite loop when a if0 task depends on a hidden helper task
3571     // outside any parallel region. Detached tasks are not impacted in this case
3572     // because the only thread executing this function has to execute the proxy
3573     // task so it is in another code path that has the same check.
3574     if (flag == NULL || (!final_spin && flag->done_check())) {
3575       KA_TRACE(15,
3576                ("__kmp_execute_tasks_template: T#%d spin condition satisfied\n",
3577                 gtid));
3578       return TRUE;
3579     }
3580 
3581     // We could be getting tasks from target constructs; if this is the only
3582     // thread, keep trying to execute tasks from own queue
3583     if (nthreads == 1 &&
3584         KMP_ATOMIC_LD_ACQ(&current_task->td_incomplete_child_tasks))
3585       use_own_tasks = 1;
3586     else {
3587       KA_TRACE(15,
3588                ("__kmp_execute_tasks_template: T#%d can't find work\n", gtid));
3589       return FALSE;
3590     }
3591   }
3592 }
3593 
3594 template <bool C, bool S>
3595 int __kmp_execute_tasks_32(
3596     kmp_info_t *thread, kmp_int32 gtid, kmp_flag_32<C, S> *flag, int final_spin,
3597     int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj),
3598     kmp_int32 is_constrained) {
3599   return __kmp_execute_tasks_template(
3600       thread, gtid, flag, final_spin,
3601       thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), is_constrained);
3602 }
3603 
3604 template <bool C, bool S>
3605 int __kmp_execute_tasks_64(
3606     kmp_info_t *thread, kmp_int32 gtid, kmp_flag_64<C, S> *flag, int final_spin,
3607     int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj),
3608     kmp_int32 is_constrained) {
3609   return __kmp_execute_tasks_template(
3610       thread, gtid, flag, final_spin,
3611       thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), is_constrained);
3612 }
3613 
3614 template <bool C, bool S>
3615 int __kmp_atomic_execute_tasks_64(
3616     kmp_info_t *thread, kmp_int32 gtid, kmp_atomic_flag_64<C, S> *flag,
3617     int final_spin, int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj),
3618     kmp_int32 is_constrained) {
3619   return __kmp_execute_tasks_template(
3620       thread, gtid, flag, final_spin,
3621       thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), is_constrained);
3622 }
3623 
3624 int __kmp_execute_tasks_oncore(
3625     kmp_info_t *thread, kmp_int32 gtid, kmp_flag_oncore *flag, int final_spin,
3626     int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj),
3627     kmp_int32 is_constrained) {
3628   return __kmp_execute_tasks_template(
3629       thread, gtid, flag, final_spin,
3630       thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), is_constrained);
3631 }
3632 
3633 template int
3634 __kmp_execute_tasks_32<false, false>(kmp_info_t *, kmp_int32,
3635                                      kmp_flag_32<false, false> *, int,
3636                                      int *USE_ITT_BUILD_ARG(void *), kmp_int32);
3637 
3638 template int __kmp_execute_tasks_64<false, true>(kmp_info_t *, kmp_int32,
3639                                                  kmp_flag_64<false, true> *,
3640                                                  int,
3641                                                  int *USE_ITT_BUILD_ARG(void *),
3642                                                  kmp_int32);
3643 
3644 template int __kmp_execute_tasks_64<true, false>(kmp_info_t *, kmp_int32,
3645                                                  kmp_flag_64<true, false> *,
3646                                                  int,
3647                                                  int *USE_ITT_BUILD_ARG(void *),
3648                                                  kmp_int32);
3649 
3650 template int __kmp_atomic_execute_tasks_64<false, true>(
3651     kmp_info_t *, kmp_int32, kmp_atomic_flag_64<false, true> *, int,
3652     int *USE_ITT_BUILD_ARG(void *), kmp_int32);
3653 
3654 template int __kmp_atomic_execute_tasks_64<true, false>(
3655     kmp_info_t *, kmp_int32, kmp_atomic_flag_64<true, false> *, int,
3656     int *USE_ITT_BUILD_ARG(void *), kmp_int32);
3657 
3658 // __kmp_enable_tasking: Allocate task team and resume threads sleeping at the
3659 // next barrier so they can assist in executing enqueued tasks.
3660 // First thread in allocates the task team atomically.
3661 static void __kmp_enable_tasking(kmp_task_team_t *task_team,
3662                                  kmp_info_t *this_thr) {
3663   kmp_thread_data_t *threads_data;
3664   int nthreads, i, is_init_thread;
3665 
3666   KA_TRACE(10, ("__kmp_enable_tasking(enter): T#%d\n",
3667                 __kmp_gtid_from_thread(this_thr)));
3668 
3669   KMP_DEBUG_ASSERT(task_team != NULL);
3670   KMP_DEBUG_ASSERT(this_thr->th.th_team != NULL);
3671 
3672   nthreads = task_team->tt.tt_nproc;
3673   KMP_DEBUG_ASSERT(nthreads > 0);
3674   KMP_DEBUG_ASSERT(nthreads == this_thr->th.th_team->t.t_nproc);
3675 
3676   // Allocate or increase the size of threads_data if necessary
3677   is_init_thread = __kmp_realloc_task_threads_data(this_thr, task_team);
3678 
3679   if (!is_init_thread) {
3680     // Some other thread already set up the array.
3681     KA_TRACE(
3682         20,
3683         ("__kmp_enable_tasking(exit): T#%d: threads array already set up.\n",
3684          __kmp_gtid_from_thread(this_thr)));
3685     return;
3686   }
3687   threads_data = (kmp_thread_data_t *)TCR_PTR(task_team->tt.tt_threads_data);
3688   KMP_DEBUG_ASSERT(threads_data != NULL);
3689 
3690   if (__kmp_tasking_mode == tskm_task_teams &&
3691       (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME)) {
3692     // Release any threads sleeping at the barrier, so that they can steal
3693     // tasks and execute them.  In extra barrier mode, tasks do not sleep
3694     // at the separate tasking barrier, so this isn't a problem.
3695     for (i = 0; i < nthreads; i++) {
3696       void *sleep_loc;
3697       kmp_info_t *thread = threads_data[i].td.td_thr;
3698 
3699       if (i == this_thr->th.th_info.ds.ds_tid) {
3700         continue;
3701       }
3702       // Since we haven't locked the thread's suspend mutex lock at this
3703       // point, there is a small window where a thread might be putting
3704       // itself to sleep, but hasn't set the th_sleep_loc field yet.
3705       // To work around this, __kmp_execute_tasks_template() periodically checks
3706       // see if other threads are sleeping (using the same random mechanism that
3707       // is used for task stealing) and awakens them if they are.
3708       if ((sleep_loc = TCR_PTR(CCAST(void *, thread->th.th_sleep_loc))) !=
3709           NULL) {
3710         KF_TRACE(50, ("__kmp_enable_tasking: T#%d waking up thread T#%d\n",
3711                       __kmp_gtid_from_thread(this_thr),
3712                       __kmp_gtid_from_thread(thread)));
3713         __kmp_null_resume_wrapper(thread);
3714       } else {
3715         KF_TRACE(50, ("__kmp_enable_tasking: T#%d don't wake up thread T#%d\n",
3716                       __kmp_gtid_from_thread(this_thr),
3717                       __kmp_gtid_from_thread(thread)));
3718       }
3719     }
3720   }
3721 
3722   KA_TRACE(10, ("__kmp_enable_tasking(exit): T#%d\n",
3723                 __kmp_gtid_from_thread(this_thr)));
3724 }
3725 
3726 /* // TODO: Check the comment consistency
3727  * Utility routines for "task teams".  A task team (kmp_task_t) is kind of
3728  * like a shadow of the kmp_team_t data struct, with a different lifetime.
3729  * After a child * thread checks into a barrier and calls __kmp_release() from
3730  * the particular variant of __kmp_<barrier_kind>_barrier_gather(), it can no
3731  * longer assume that the kmp_team_t structure is intact (at any moment, the
3732  * primary thread may exit the barrier code and free the team data structure,
3733  * and return the threads to the thread pool).
3734  *
3735  * This does not work with the tasking code, as the thread is still
3736  * expected to participate in the execution of any tasks that may have been
3737  * spawned my a member of the team, and the thread still needs access to all
3738  * to each thread in the team, so that it can steal work from it.
3739  *
3740  * Enter the existence of the kmp_task_team_t struct.  It employs a reference
3741  * counting mechanism, and is allocated by the primary thread before calling
3742  * __kmp_<barrier_kind>_release, and then is release by the last thread to
3743  * exit __kmp_<barrier_kind>_release at the next barrier.  I.e. the lifetimes
3744  * of the kmp_task_team_t structs for consecutive barriers can overlap
3745  * (and will, unless the primary thread is the last thread to exit the barrier
3746  * release phase, which is not typical). The existence of such a struct is
3747  * useful outside the context of tasking.
3748  *
3749  * We currently use the existence of the threads array as an indicator that
3750  * tasks were spawned since the last barrier.  If the structure is to be
3751  * useful outside the context of tasking, then this will have to change, but
3752  * not setting the field minimizes the performance impact of tasking on
3753  * barriers, when no explicit tasks were spawned (pushed, actually).
3754  */
3755 
3756 static kmp_task_team_t *__kmp_free_task_teams =
3757     NULL; // Free list for task_team data structures
3758 // Lock for task team data structures
3759 kmp_bootstrap_lock_t __kmp_task_team_lock =
3760     KMP_BOOTSTRAP_LOCK_INITIALIZER(__kmp_task_team_lock);
3761 
3762 // __kmp_alloc_task_deque:
3763 // Allocates a task deque for a particular thread, and initialize the necessary
3764 // data structures relating to the deque.  This only happens once per thread
3765 // per task team since task teams are recycled. No lock is needed during
3766 // allocation since each thread allocates its own deque.
3767 static void __kmp_alloc_task_deque(kmp_info_t *thread,
3768                                    kmp_thread_data_t *thread_data) {
3769   __kmp_init_bootstrap_lock(&thread_data->td.td_deque_lock);
3770   KMP_DEBUG_ASSERT(thread_data->td.td_deque == NULL);
3771 
3772   // Initialize last stolen task field to "none"
3773   thread_data->td.td_deque_last_stolen = -1;
3774 
3775   KMP_DEBUG_ASSERT(TCR_4(thread_data->td.td_deque_ntasks) == 0);
3776   KMP_DEBUG_ASSERT(thread_data->td.td_deque_head == 0);
3777   KMP_DEBUG_ASSERT(thread_data->td.td_deque_tail == 0);
3778 
3779   KE_TRACE(
3780       10,
3781       ("__kmp_alloc_task_deque: T#%d allocating deque[%d] for thread_data %p\n",
3782        __kmp_gtid_from_thread(thread), INITIAL_TASK_DEQUE_SIZE, thread_data));
3783   // Allocate space for task deque, and zero the deque
3784   // Cannot use __kmp_thread_calloc() because threads not around for
3785   // kmp_reap_task_team( ).
3786   thread_data->td.td_deque = (kmp_taskdata_t **)__kmp_allocate(
3787       INITIAL_TASK_DEQUE_SIZE * sizeof(kmp_taskdata_t *));
3788   thread_data->td.td_deque_size = INITIAL_TASK_DEQUE_SIZE;
3789 }
3790 
3791 // __kmp_free_task_deque:
3792 // Deallocates a task deque for a particular thread. Happens at library
3793 // deallocation so don't need to reset all thread data fields.
3794 static void __kmp_free_task_deque(kmp_thread_data_t *thread_data) {
3795   if (thread_data->td.td_deque != NULL) {
3796     __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);
3797     TCW_4(thread_data->td.td_deque_ntasks, 0);
3798     __kmp_free(thread_data->td.td_deque);
3799     thread_data->td.td_deque = NULL;
3800     __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
3801   }
3802 
3803 #ifdef BUILD_TIED_TASK_STACK
3804   // GEH: Figure out what to do here for td_susp_tied_tasks
3805   if (thread_data->td.td_susp_tied_tasks.ts_entries != TASK_STACK_EMPTY) {
3806     __kmp_free_task_stack(__kmp_thread_from_gtid(gtid), thread_data);
3807   }
3808 #endif // BUILD_TIED_TASK_STACK
3809 }
3810 
3811 // __kmp_realloc_task_threads_data:
3812 // Allocates a threads_data array for a task team, either by allocating an
3813 // initial array or enlarging an existing array.  Only the first thread to get
3814 // the lock allocs or enlarges the array and re-initializes the array elements.
3815 // That thread returns "TRUE", the rest return "FALSE".
3816 // Assumes that the new array size is given by task_team -> tt.tt_nproc.
3817 // The current size is given by task_team -> tt.tt_max_threads.
3818 static int __kmp_realloc_task_threads_data(kmp_info_t *thread,
3819                                            kmp_task_team_t *task_team) {
3820   kmp_thread_data_t **threads_data_p;
3821   kmp_int32 nthreads, maxthreads;
3822   int is_init_thread = FALSE;
3823 
3824   if (TCR_4(task_team->tt.tt_found_tasks)) {
3825     // Already reallocated and initialized.
3826     return FALSE;
3827   }
3828 
3829   threads_data_p = &task_team->tt.tt_threads_data;
3830   nthreads = task_team->tt.tt_nproc;
3831   maxthreads = task_team->tt.tt_max_threads;
3832 
3833   // All threads must lock when they encounter the first task of the implicit
3834   // task region to make sure threads_data fields are (re)initialized before
3835   // used.
3836   __kmp_acquire_bootstrap_lock(&task_team->tt.tt_threads_lock);
3837 
3838   if (!TCR_4(task_team->tt.tt_found_tasks)) {
3839     // first thread to enable tasking
3840     kmp_team_t *team = thread->th.th_team;
3841     int i;
3842 
3843     is_init_thread = TRUE;
3844     if (maxthreads < nthreads) {
3845 
3846       if (*threads_data_p != NULL) {
3847         kmp_thread_data_t *old_data = *threads_data_p;
3848         kmp_thread_data_t *new_data = NULL;
3849 
3850         KE_TRACE(
3851             10,
3852             ("__kmp_realloc_task_threads_data: T#%d reallocating "
3853              "threads data for task_team %p, new_size = %d, old_size = %d\n",
3854              __kmp_gtid_from_thread(thread), task_team, nthreads, maxthreads));
3855         // Reallocate threads_data to have more elements than current array
3856         // Cannot use __kmp_thread_realloc() because threads not around for
3857         // kmp_reap_task_team( ).  Note all new array entries are initialized
3858         // to zero by __kmp_allocate().
3859         new_data = (kmp_thread_data_t *)__kmp_allocate(
3860             nthreads * sizeof(kmp_thread_data_t));
3861         // copy old data to new data
3862         KMP_MEMCPY_S((void *)new_data, nthreads * sizeof(kmp_thread_data_t),
3863                      (void *)old_data, maxthreads * sizeof(kmp_thread_data_t));
3864 
3865 #ifdef BUILD_TIED_TASK_STACK
3866         // GEH: Figure out if this is the right thing to do
3867         for (i = maxthreads; i < nthreads; i++) {
3868           kmp_thread_data_t *thread_data = &(*threads_data_p)[i];
3869           __kmp_init_task_stack(__kmp_gtid_from_thread(thread), thread_data);
3870         }
3871 #endif // BUILD_TIED_TASK_STACK
3872        // Install the new data and free the old data
3873         (*threads_data_p) = new_data;
3874         __kmp_free(old_data);
3875       } else {
3876         KE_TRACE(10, ("__kmp_realloc_task_threads_data: T#%d allocating "
3877                       "threads data for task_team %p, size = %d\n",
3878                       __kmp_gtid_from_thread(thread), task_team, nthreads));
3879         // Make the initial allocate for threads_data array, and zero entries
3880         // Cannot use __kmp_thread_calloc() because threads not around for
3881         // kmp_reap_task_team( ).
3882         *threads_data_p = (kmp_thread_data_t *)__kmp_allocate(
3883             nthreads * sizeof(kmp_thread_data_t));
3884 #ifdef BUILD_TIED_TASK_STACK
3885         // GEH: Figure out if this is the right thing to do
3886         for (i = 0; i < nthreads; i++) {
3887           kmp_thread_data_t *thread_data = &(*threads_data_p)[i];
3888           __kmp_init_task_stack(__kmp_gtid_from_thread(thread), thread_data);
3889         }
3890 #endif // BUILD_TIED_TASK_STACK
3891       }
3892       task_team->tt.tt_max_threads = nthreads;
3893     } else {
3894       // If array has (more than) enough elements, go ahead and use it
3895       KMP_DEBUG_ASSERT(*threads_data_p != NULL);
3896     }
3897 
3898     // initialize threads_data pointers back to thread_info structures
3899     for (i = 0; i < nthreads; i++) {
3900       kmp_thread_data_t *thread_data = &(*threads_data_p)[i];
3901       thread_data->td.td_thr = team->t.t_threads[i];
3902 
3903       if (thread_data->td.td_deque_last_stolen >= nthreads) {
3904         // The last stolen field survives across teams / barrier, and the number
3905         // of threads may have changed.  It's possible (likely?) that a new
3906         // parallel region will exhibit the same behavior as previous region.
3907         thread_data->td.td_deque_last_stolen = -1;
3908       }
3909     }
3910 
3911     KMP_MB();
3912     TCW_SYNC_4(task_team->tt.tt_found_tasks, TRUE);
3913   }
3914 
3915   __kmp_release_bootstrap_lock(&task_team->tt.tt_threads_lock);
3916   return is_init_thread;
3917 }
3918 
3919 // __kmp_free_task_threads_data:
3920 // Deallocates a threads_data array for a task team, including any attached
3921 // tasking deques.  Only occurs at library shutdown.
3922 static void __kmp_free_task_threads_data(kmp_task_team_t *task_team) {
3923   __kmp_acquire_bootstrap_lock(&task_team->tt.tt_threads_lock);
3924   if (task_team->tt.tt_threads_data != NULL) {
3925     int i;
3926     for (i = 0; i < task_team->tt.tt_max_threads; i++) {
3927       __kmp_free_task_deque(&task_team->tt.tt_threads_data[i]);
3928     }
3929     __kmp_free(task_team->tt.tt_threads_data);
3930     task_team->tt.tt_threads_data = NULL;
3931   }
3932   __kmp_release_bootstrap_lock(&task_team->tt.tt_threads_lock);
3933 }
3934 
3935 // __kmp_free_task_pri_list:
3936 // Deallocates tasking deques used for priority tasks.
3937 // Only occurs at library shutdown.
3938 static void __kmp_free_task_pri_list(kmp_task_team_t *task_team) {
3939   __kmp_acquire_bootstrap_lock(&task_team->tt.tt_task_pri_lock);
3940   if (task_team->tt.tt_task_pri_list != NULL) {
3941     kmp_task_pri_t *list = task_team->tt.tt_task_pri_list;
3942     while (list != NULL) {
3943       kmp_task_pri_t *next = list->next;
3944       __kmp_free_task_deque(&list->td);
3945       __kmp_free(list);
3946       list = next;
3947     }
3948     task_team->tt.tt_task_pri_list = NULL;
3949   }
3950   __kmp_release_bootstrap_lock(&task_team->tt.tt_task_pri_lock);
3951 }
3952 
3953 static inline void __kmp_task_team_init(kmp_task_team_t *task_team,
3954                                         kmp_team_t *team) {
3955   int team_nth = team->t.t_nproc;
3956   // Only need to init if task team is isn't active or team size changed
3957   if (!task_team->tt.tt_active || team_nth != task_team->tt.tt_nproc) {
3958     TCW_4(task_team->tt.tt_found_tasks, FALSE);
3959     TCW_4(task_team->tt.tt_found_proxy_tasks, FALSE);
3960     TCW_4(task_team->tt.tt_hidden_helper_task_encountered, FALSE);
3961     TCW_4(task_team->tt.tt_nproc, team_nth);
3962     KMP_ATOMIC_ST_REL(&task_team->tt.tt_unfinished_threads, team_nth);
3963     TCW_4(task_team->tt.tt_active, TRUE);
3964   }
3965 }
3966 
3967 // __kmp_allocate_task_team:
3968 // Allocates a task team associated with a specific team, taking it from
3969 // the global task team free list if possible.  Also initializes data
3970 // structures.
3971 static kmp_task_team_t *__kmp_allocate_task_team(kmp_info_t *thread,
3972                                                  kmp_team_t *team) {
3973   kmp_task_team_t *task_team = NULL;
3974 
3975   KA_TRACE(20, ("__kmp_allocate_task_team: T#%d entering; team = %p\n",
3976                 (thread ? __kmp_gtid_from_thread(thread) : -1), team));
3977 
3978   if (TCR_PTR(__kmp_free_task_teams) != NULL) {
3979     // Take a task team from the task team pool
3980     __kmp_acquire_bootstrap_lock(&__kmp_task_team_lock);
3981     if (__kmp_free_task_teams != NULL) {
3982       task_team = __kmp_free_task_teams;
3983       TCW_PTR(__kmp_free_task_teams, task_team->tt.tt_next);
3984       task_team->tt.tt_next = NULL;
3985     }
3986     __kmp_release_bootstrap_lock(&__kmp_task_team_lock);
3987   }
3988 
3989   if (task_team == NULL) {
3990     KE_TRACE(10, ("__kmp_allocate_task_team: T#%d allocating "
3991                   "task team for team %p\n",
3992                   __kmp_gtid_from_thread(thread), team));
3993     // Allocate a new task team if one is not available. Cannot use
3994     // __kmp_thread_malloc because threads not around for kmp_reap_task_team.
3995     task_team = (kmp_task_team_t *)__kmp_allocate(sizeof(kmp_task_team_t));
3996     __kmp_init_bootstrap_lock(&task_team->tt.tt_threads_lock);
3997     __kmp_init_bootstrap_lock(&task_team->tt.tt_task_pri_lock);
3998 #if USE_ITT_BUILD && USE_ITT_NOTIFY && KMP_DEBUG
3999     // suppress race conditions detection on synchronization flags in debug mode
4000     // this helps to analyze library internals eliminating false positives
4001     __itt_suppress_mark_range(
4002         __itt_suppress_range, __itt_suppress_threading_errors,
4003         &task_team->tt.tt_found_tasks, sizeof(task_team->tt.tt_found_tasks));
4004     __itt_suppress_mark_range(__itt_suppress_range,
4005                               __itt_suppress_threading_errors,
4006                               CCAST(kmp_uint32 *, &task_team->tt.tt_active),
4007                               sizeof(task_team->tt.tt_active));
4008 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY && KMP_DEBUG */
4009     // Note: __kmp_allocate zeroes returned memory, othewise we would need:
4010     // task_team->tt.tt_threads_data = NULL;
4011     // task_team->tt.tt_max_threads = 0;
4012     // task_team->tt.tt_next = NULL;
4013   }
4014 
4015   __kmp_task_team_init(task_team, team);
4016 
4017   KA_TRACE(20, ("__kmp_allocate_task_team: T#%d exiting; task_team = %p "
4018                 "unfinished_threads init'd to %d\n",
4019                 (thread ? __kmp_gtid_from_thread(thread) : -1), task_team,
4020                 KMP_ATOMIC_LD_RLX(&task_team->tt.tt_unfinished_threads)));
4021   return task_team;
4022 }
4023 
4024 // __kmp_free_task_team:
4025 // Frees the task team associated with a specific thread, and adds it
4026 // to the global task team free list.
4027 void __kmp_free_task_team(kmp_info_t *thread, kmp_task_team_t *task_team) {
4028   KA_TRACE(20, ("__kmp_free_task_team: T#%d task_team = %p\n",
4029                 thread ? __kmp_gtid_from_thread(thread) : -1, task_team));
4030 
4031   // Put task team back on free list
4032   __kmp_acquire_bootstrap_lock(&__kmp_task_team_lock);
4033 
4034   KMP_DEBUG_ASSERT(task_team->tt.tt_next == NULL);
4035   task_team->tt.tt_next = __kmp_free_task_teams;
4036   TCW_PTR(__kmp_free_task_teams, task_team);
4037 
4038   __kmp_release_bootstrap_lock(&__kmp_task_team_lock);
4039 }
4040 
4041 // __kmp_reap_task_teams:
4042 // Free all the task teams on the task team free list.
4043 // Should only be done during library shutdown.
4044 // Cannot do anything that needs a thread structure or gtid since they are
4045 // already gone.
4046 void __kmp_reap_task_teams(void) {
4047   kmp_task_team_t *task_team;
4048 
4049   if (TCR_PTR(__kmp_free_task_teams) != NULL) {
4050     // Free all task_teams on the free list
4051     __kmp_acquire_bootstrap_lock(&__kmp_task_team_lock);
4052     while ((task_team = __kmp_free_task_teams) != NULL) {
4053       __kmp_free_task_teams = task_team->tt.tt_next;
4054       task_team->tt.tt_next = NULL;
4055 
4056       // Free threads_data if necessary
4057       if (task_team->tt.tt_threads_data != NULL) {
4058         __kmp_free_task_threads_data(task_team);
4059       }
4060       if (task_team->tt.tt_task_pri_list != NULL) {
4061         __kmp_free_task_pri_list(task_team);
4062       }
4063       __kmp_free(task_team);
4064     }
4065     __kmp_release_bootstrap_lock(&__kmp_task_team_lock);
4066   }
4067 }
4068 
4069 // View the array of two task team pointers as a pair of pointers:
4070 //  1) a single task_team pointer
4071 //  2) next pointer for stack
4072 // Serial teams can create a stack of task teams for nested serial teams.
4073 void __kmp_push_task_team_node(kmp_info_t *thread, kmp_team_t *team) {
4074   KMP_DEBUG_ASSERT(team->t.t_nproc == 1);
4075   kmp_task_team_list_t *current =
4076       (kmp_task_team_list_t *)(&team->t.t_task_team[0]);
4077   kmp_task_team_list_t *node =
4078       (kmp_task_team_list_t *)__kmp_allocate(sizeof(kmp_task_team_list_t));
4079   node->task_team = current->task_team;
4080   node->next = current->next;
4081   thread->th.th_task_team = current->task_team = NULL;
4082   current->next = node;
4083 }
4084 
4085 // Serial team pops a task team off the stack
4086 void __kmp_pop_task_team_node(kmp_info_t *thread, kmp_team_t *team) {
4087   KMP_DEBUG_ASSERT(team->t.t_nproc == 1);
4088   kmp_task_team_list_t *current =
4089       (kmp_task_team_list_t *)(&team->t.t_task_team[0]);
4090   if (current->task_team) {
4091     __kmp_free_task_team(thread, current->task_team);
4092   }
4093   kmp_task_team_list_t *next = current->next;
4094   if (next) {
4095     current->task_team = next->task_team;
4096     current->next = next->next;
4097     KMP_DEBUG_ASSERT(next != current);
4098     __kmp_free(next);
4099     thread->th.th_task_team = current->task_team;
4100   }
4101 }
4102 
4103 // __kmp_wait_to_unref_task_teams:
4104 // Some threads could still be in the fork barrier release code, possibly
4105 // trying to steal tasks.  Wait for each thread to unreference its task team.
4106 void __kmp_wait_to_unref_task_teams(void) {
4107   kmp_info_t *thread;
4108   kmp_uint32 spins;
4109   kmp_uint64 time;
4110   int done;
4111 
4112   KMP_INIT_YIELD(spins);
4113   KMP_INIT_BACKOFF(time);
4114 
4115   for (;;) {
4116     done = TRUE;
4117 
4118     // TODO: GEH - this may be is wrong because some sync would be necessary
4119     // in case threads are added to the pool during the traversal. Need to
4120     // verify that lock for thread pool is held when calling this routine.
4121     for (thread = CCAST(kmp_info_t *, __kmp_thread_pool); thread != NULL;
4122          thread = thread->th.th_next_pool) {
4123 #if KMP_OS_WINDOWS
4124       DWORD exit_val;
4125 #endif
4126       if (TCR_PTR(thread->th.th_task_team) == NULL) {
4127         KA_TRACE(10, ("__kmp_wait_to_unref_task_team: T#%d task_team == NULL\n",
4128                       __kmp_gtid_from_thread(thread)));
4129         continue;
4130       }
4131 #if KMP_OS_WINDOWS
4132       // TODO: GEH - add this check for Linux* OS / OS X* as well?
4133       if (!__kmp_is_thread_alive(thread, &exit_val)) {
4134         thread->th.th_task_team = NULL;
4135         continue;
4136       }
4137 #endif
4138 
4139       done = FALSE; // Because th_task_team pointer is not NULL for this thread
4140 
4141       KA_TRACE(10, ("__kmp_wait_to_unref_task_team: Waiting for T#%d to "
4142                     "unreference task_team\n",
4143                     __kmp_gtid_from_thread(thread)));
4144 
4145       if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
4146         void *sleep_loc;
4147         // If the thread is sleeping, awaken it.
4148         if ((sleep_loc = TCR_PTR(CCAST(void *, thread->th.th_sleep_loc))) !=
4149             NULL) {
4150           KA_TRACE(
4151               10,
4152               ("__kmp_wait_to_unref_task_team: T#%d waking up thread T#%d\n",
4153                __kmp_gtid_from_thread(thread), __kmp_gtid_from_thread(thread)));
4154           __kmp_null_resume_wrapper(thread);
4155         }
4156       }
4157     }
4158     if (done) {
4159       break;
4160     }
4161 
4162     // If oversubscribed or have waited a bit, yield.
4163     KMP_YIELD_OVERSUB_ELSE_SPIN(spins, time);
4164   }
4165 }
4166 
4167 // __kmp_task_team_setup:  Create a task_team for the current team, but use
4168 // an already created, unused one if it already exists.
4169 void __kmp_task_team_setup(kmp_info_t *this_thr, kmp_team_t *team) {
4170   KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);
4171 
4172   // For the serial and root teams, setup the first task team pointer to point
4173   // to task team. The other pointer is a stack of task teams from previous
4174   // serial levels.
4175   if (team == this_thr->th.th_serial_team ||
4176       team == this_thr->th.th_root->r.r_root_team) {
4177     KMP_DEBUG_ASSERT(team->t.t_nproc == 1);
4178     if (team->t.t_task_team[0] == NULL) {
4179       team->t.t_task_team[0] = __kmp_allocate_task_team(this_thr, team);
4180       KA_TRACE(
4181           20, ("__kmp_task_team_setup: Primary T#%d created new task_team %p"
4182                " for serial/root team %p\n",
4183                __kmp_gtid_from_thread(this_thr), team->t.t_task_team[0], team));
4184 
4185     } else
4186       __kmp_task_team_init(team->t.t_task_team[0], team);
4187     return;
4188   }
4189 
4190   // If this task_team hasn't been created yet, allocate it. It will be used in
4191   // the region after the next.
4192   // If it exists, it is the current task team and shouldn't be touched yet as
4193   // it may still be in use.
4194   if (team->t.t_task_team[this_thr->th.th_task_state] == NULL) {
4195     team->t.t_task_team[this_thr->th.th_task_state] =
4196         __kmp_allocate_task_team(this_thr, team);
4197     KA_TRACE(20, ("__kmp_task_team_setup: Primary T#%d created new task_team %p"
4198                   " for team %d at parity=%d\n",
4199                   __kmp_gtid_from_thread(this_thr),
4200                   team->t.t_task_team[this_thr->th.th_task_state], team->t.t_id,
4201                   this_thr->th.th_task_state));
4202   }
4203 
4204   // After threads exit the release, they will call sync, and then point to this
4205   // other task_team; make sure it is allocated and properly initialized. As
4206   // threads spin in the barrier release phase, they will continue to use the
4207   // previous task_team struct(above), until they receive the signal to stop
4208   // checking for tasks (they can't safely reference the kmp_team_t struct,
4209   // which could be reallocated by the primary thread).
4210   int other_team = 1 - this_thr->th.th_task_state;
4211   KMP_DEBUG_ASSERT(other_team >= 0 && other_team < 2);
4212   if (team->t.t_task_team[other_team] == NULL) { // setup other team as well
4213     team->t.t_task_team[other_team] = __kmp_allocate_task_team(this_thr, team);
4214     KA_TRACE(20, ("__kmp_task_team_setup: Primary T#%d created second new "
4215                   "task_team %p for team %d at parity=%d\n",
4216                   __kmp_gtid_from_thread(this_thr),
4217                   team->t.t_task_team[other_team], team->t.t_id, other_team));
4218   } else { // Leave the old task team struct in place for the upcoming region;
4219     // adjust as needed
4220     kmp_task_team_t *task_team = team->t.t_task_team[other_team];
4221     __kmp_task_team_init(task_team, team);
4222     // if team size has changed, the first thread to enable tasking will
4223     // realloc threads_data if necessary
4224     KA_TRACE(20, ("__kmp_task_team_setup: Primary T#%d reset next task_team "
4225                   "%p for team %d at parity=%d\n",
4226                   __kmp_gtid_from_thread(this_thr),
4227                   team->t.t_task_team[other_team], team->t.t_id, other_team));
4228   }
4229 
4230   // For regular thread, task enabling should be called when the task is going
4231   // to be pushed to a dequeue. However, for the hidden helper thread, we need
4232   // it ahead of time so that some operations can be performed without race
4233   // condition.
4234   if (this_thr == __kmp_hidden_helper_main_thread) {
4235     for (int i = 0; i < 2; ++i) {
4236       kmp_task_team_t *task_team = team->t.t_task_team[i];
4237       if (KMP_TASKING_ENABLED(task_team)) {
4238         continue;
4239       }
4240       __kmp_enable_tasking(task_team, this_thr);
4241       for (int j = 0; j < task_team->tt.tt_nproc; ++j) {
4242         kmp_thread_data_t *thread_data = &task_team->tt.tt_threads_data[j];
4243         if (thread_data->td.td_deque == NULL) {
4244           __kmp_alloc_task_deque(__kmp_hidden_helper_threads[j], thread_data);
4245         }
4246       }
4247     }
4248   }
4249 }
4250 
4251 // __kmp_task_team_sync: Propagation of task team data from team to threads
4252 // which happens just after the release phase of a team barrier.  This may be
4253 // called by any thread. This is not called for serial or root teams.
4254 void __kmp_task_team_sync(kmp_info_t *this_thr, kmp_team_t *team) {
4255   KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);
4256   KMP_DEBUG_ASSERT(team != this_thr->th.th_serial_team);
4257   KMP_DEBUG_ASSERT(team != this_thr->th.th_root->r.r_root_team);
4258 
4259   // Toggle the th_task_state field, to switch which task_team this thread
4260   // refers to
4261   this_thr->th.th_task_state = (kmp_uint8)(1 - this_thr->th.th_task_state);
4262 
4263   // It is now safe to propagate the task team pointer from the team struct to
4264   // the current thread.
4265   TCW_PTR(this_thr->th.th_task_team,
4266           team->t.t_task_team[this_thr->th.th_task_state]);
4267   KA_TRACE(20,
4268            ("__kmp_task_team_sync: Thread T#%d task team switched to task_team "
4269             "%p from Team #%d (parity=%d)\n",
4270             __kmp_gtid_from_thread(this_thr), this_thr->th.th_task_team,
4271             team->t.t_id, this_thr->th.th_task_state));
4272 }
4273 
4274 // __kmp_task_team_wait: Primary thread waits for outstanding tasks after the
4275 // barrier gather phase. Only called by the primary thread.
4276 //
4277 // wait is a flag that defaults to 1 (see kmp.h), but waiting can be turned off
4278 // by passing in 0 optionally as the last argument. When wait is zero, primary
4279 // thread does not wait for unfinished_threads to reach 0.
4280 void __kmp_task_team_wait(
4281     kmp_info_t *this_thr,
4282     kmp_team_t *team USE_ITT_BUILD_ARG(void *itt_sync_obj), int wait) {
4283   kmp_task_team_t *task_team = team->t.t_task_team[this_thr->th.th_task_state];
4284 
4285   KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);
4286   KMP_DEBUG_ASSERT(task_team == this_thr->th.th_task_team);
4287 
4288   if ((task_team != NULL) && KMP_TASKING_ENABLED(task_team)) {
4289     if (wait) {
4290       KA_TRACE(20, ("__kmp_task_team_wait: Primary T#%d waiting for all tasks "
4291                     "(for unfinished_threads to reach 0) on task_team = %p\n",
4292                     __kmp_gtid_from_thread(this_thr), task_team));
4293       // Worker threads may have dropped through to release phase, but could
4294       // still be executing tasks. Wait here for tasks to complete. To avoid
4295       // memory contention, only primary thread checks termination condition.
4296       kmp_flag_32<false, false> flag(
4297           RCAST(std::atomic<kmp_uint32> *,
4298                 &task_team->tt.tt_unfinished_threads),
4299           0U);
4300       flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
4301     }
4302     // Deactivate the old task team, so that the worker threads will stop
4303     // referencing it while spinning.
4304     KA_TRACE(
4305         20,
4306         ("__kmp_task_team_wait: Primary T#%d deactivating task_team %p: "
4307          "setting active to false, setting local and team's pointer to NULL\n",
4308          __kmp_gtid_from_thread(this_thr), task_team));
4309     TCW_SYNC_4(task_team->tt.tt_found_proxy_tasks, FALSE);
4310     TCW_SYNC_4(task_team->tt.tt_hidden_helper_task_encountered, FALSE);
4311     KMP_CHECK_UPDATE(task_team->tt.tt_untied_task_encountered, 0);
4312     TCW_SYNC_4(task_team->tt.tt_active, FALSE);
4313     KMP_MB();
4314 
4315     TCW_PTR(this_thr->th.th_task_team, NULL);
4316   }
4317 }
4318 
4319 // __kmp_tasking_barrier:
4320 // This routine is called only when __kmp_tasking_mode == tskm_extra_barrier.
4321 // Internal function to execute all tasks prior to a regular barrier or a join
4322 // barrier. It is a full barrier itself, which unfortunately turns regular
4323 // barriers into double barriers and join barriers into 1 1/2 barriers.
4324 void __kmp_tasking_barrier(kmp_team_t *team, kmp_info_t *thread, int gtid) {
4325   std::atomic<kmp_uint32> *spin = RCAST(
4326       std::atomic<kmp_uint32> *,
4327       &team->t.t_task_team[thread->th.th_task_state]->tt.tt_unfinished_threads);
4328   int flag = FALSE;
4329   KMP_DEBUG_ASSERT(__kmp_tasking_mode == tskm_extra_barrier);
4330 
4331 #if USE_ITT_BUILD
4332   KMP_FSYNC_SPIN_INIT(spin, NULL);
4333 #endif /* USE_ITT_BUILD */
4334   kmp_flag_32<false, false> spin_flag(spin, 0U);
4335   while (!spin_flag.execute_tasks(thread, gtid, TRUE,
4336                                   &flag USE_ITT_BUILD_ARG(NULL), 0)) {
4337 #if USE_ITT_BUILD
4338     // TODO: What about itt_sync_obj??
4339     KMP_FSYNC_SPIN_PREPARE(RCAST(void *, spin));
4340 #endif /* USE_ITT_BUILD */
4341 
4342     if (TCR_4(__kmp_global.g.g_done)) {
4343       if (__kmp_global.g.g_abort)
4344         __kmp_abort_thread();
4345       break;
4346     }
4347     KMP_YIELD(TRUE);
4348   }
4349 #if USE_ITT_BUILD
4350   KMP_FSYNC_SPIN_ACQUIRED(RCAST(void *, spin));
4351 #endif /* USE_ITT_BUILD */
4352 }
4353 
4354 // __kmp_give_task puts a task into a given thread queue if:
4355 //  - the queue for that thread was created
4356 //  - there's space in that queue
4357 // Because of this, __kmp_push_task needs to check if there's space after
4358 // getting the lock
4359 static bool __kmp_give_task(kmp_info_t *thread, kmp_int32 tid, kmp_task_t *task,
4360                             kmp_int32 pass) {
4361   kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
4362   kmp_task_team_t *task_team = taskdata->td_task_team;
4363 
4364   KA_TRACE(20, ("__kmp_give_task: trying to give task %p to thread %d.\n",
4365                 taskdata, tid));
4366 
4367   // If task_team is NULL something went really bad...
4368   KMP_DEBUG_ASSERT(task_team != NULL);
4369 
4370   bool result = false;
4371   kmp_thread_data_t *thread_data = &task_team->tt.tt_threads_data[tid];
4372 
4373   if (thread_data->td.td_deque == NULL) {
4374     // There's no queue in this thread, go find another one
4375     // We're guaranteed that at least one thread has a queue
4376     KA_TRACE(30,
4377              ("__kmp_give_task: thread %d has no queue while giving task %p.\n",
4378               tid, taskdata));
4379     return result;
4380   }
4381 
4382   if (TCR_4(thread_data->td.td_deque_ntasks) >=
4383       TASK_DEQUE_SIZE(thread_data->td)) {
4384     KA_TRACE(
4385         30,
4386         ("__kmp_give_task: queue is full while giving task %p to thread %d.\n",
4387          taskdata, tid));
4388 
4389     // if this deque is bigger than the pass ratio give a chance to another
4390     // thread
4391     if (TASK_DEQUE_SIZE(thread_data->td) / INITIAL_TASK_DEQUE_SIZE >= pass)
4392       return result;
4393 
4394     __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);
4395     if (TCR_4(thread_data->td.td_deque_ntasks) >=
4396         TASK_DEQUE_SIZE(thread_data->td)) {
4397       // expand deque to push the task which is not allowed to execute
4398       __kmp_realloc_task_deque(thread, thread_data);
4399     }
4400 
4401   } else {
4402 
4403     __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);
4404 
4405     if (TCR_4(thread_data->td.td_deque_ntasks) >=
4406         TASK_DEQUE_SIZE(thread_data->td)) {
4407       KA_TRACE(30, ("__kmp_give_task: queue is full while giving task %p to "
4408                     "thread %d.\n",
4409                     taskdata, tid));
4410 
4411       // if this deque is bigger than the pass ratio give a chance to another
4412       // thread
4413       if (TASK_DEQUE_SIZE(thread_data->td) / INITIAL_TASK_DEQUE_SIZE >= pass)
4414         goto release_and_exit;
4415 
4416       __kmp_realloc_task_deque(thread, thread_data);
4417     }
4418   }
4419 
4420   // lock is held here, and there is space in the deque
4421 
4422   thread_data->td.td_deque[thread_data->td.td_deque_tail] = taskdata;
4423   // Wrap index.
4424   thread_data->td.td_deque_tail =
4425       (thread_data->td.td_deque_tail + 1) & TASK_DEQUE_MASK(thread_data->td);
4426   TCW_4(thread_data->td.td_deque_ntasks,
4427         TCR_4(thread_data->td.td_deque_ntasks) + 1);
4428 
4429   result = true;
4430   KA_TRACE(30, ("__kmp_give_task: successfully gave task %p to thread %d.\n",
4431                 taskdata, tid));
4432 
4433 release_and_exit:
4434   __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
4435 
4436   return result;
4437 }
4438 
4439 #define PROXY_TASK_FLAG 0x40000000
4440 /* The finish of the proxy tasks is divided in two pieces:
4441     - the top half is the one that can be done from a thread outside the team
4442     - the bottom half must be run from a thread within the team
4443 
4444    In order to run the bottom half the task gets queued back into one of the
4445    threads of the team. Once the td_incomplete_child_task counter of the parent
4446    is decremented the threads can leave the barriers. So, the bottom half needs
4447    to be queued before the counter is decremented. The top half is therefore
4448    divided in two parts:
4449     - things that can be run before queuing the bottom half
4450     - things that must be run after queuing the bottom half
4451 
4452    This creates a second race as the bottom half can free the task before the
4453    second top half is executed. To avoid this we use the
4454    td_incomplete_child_task of the proxy task to synchronize the top and bottom
4455    half. */
4456 static void __kmp_first_top_half_finish_proxy(kmp_taskdata_t *taskdata) {
4457   KMP_DEBUG_ASSERT(taskdata->td_flags.tasktype == TASK_EXPLICIT);
4458   KMP_DEBUG_ASSERT(taskdata->td_flags.proxy == TASK_PROXY);
4459   KMP_DEBUG_ASSERT(taskdata->td_flags.complete == 0);
4460   KMP_DEBUG_ASSERT(taskdata->td_flags.freed == 0);
4461 
4462   taskdata->td_flags.complete = 1; // mark the task as completed
4463 #if OMPX_TASKGRAPH
4464   taskdata->td_flags.onced = 1;
4465 #endif
4466 
4467   if (taskdata->td_taskgroup)
4468     KMP_ATOMIC_DEC(&taskdata->td_taskgroup->count);
4469 
4470   // Create an imaginary children for this task so the bottom half cannot
4471   // release the task before we have completed the second top half
4472   KMP_ATOMIC_OR(&taskdata->td_incomplete_child_tasks, PROXY_TASK_FLAG);
4473 }
4474 
4475 static void __kmp_second_top_half_finish_proxy(kmp_taskdata_t *taskdata) {
4476 #if KMP_DEBUG
4477   kmp_int32 children = 0;
4478   // Predecrement simulated by "- 1" calculation
4479   children = -1 +
4480 #endif
4481       KMP_ATOMIC_DEC(&taskdata->td_parent->td_incomplete_child_tasks);
4482   KMP_DEBUG_ASSERT(children >= 0);
4483 
4484   // Remove the imaginary children
4485   KMP_ATOMIC_AND(&taskdata->td_incomplete_child_tasks, ~PROXY_TASK_FLAG);
4486 }
4487 
4488 static void __kmp_bottom_half_finish_proxy(kmp_int32 gtid, kmp_task_t *ptask) {
4489   kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(ptask);
4490   kmp_info_t *thread = __kmp_threads[gtid];
4491 
4492   KMP_DEBUG_ASSERT(taskdata->td_flags.proxy == TASK_PROXY);
4493   KMP_DEBUG_ASSERT(taskdata->td_flags.complete ==
4494                    1); // top half must run before bottom half
4495 
4496   // We need to wait to make sure the top half is finished
4497   // Spinning here should be ok as this should happen quickly
4498   while ((KMP_ATOMIC_LD_ACQ(&taskdata->td_incomplete_child_tasks) &
4499           PROXY_TASK_FLAG) > 0)
4500     ;
4501 
4502   __kmp_release_deps(gtid, taskdata);
4503   __kmp_free_task_and_ancestors(gtid, taskdata, thread);
4504 }
4505 
4506 /*!
4507 @ingroup TASKING
4508 @param gtid Global Thread ID of encountering thread
4509 @param ptask Task which execution is completed
4510 
4511 Execute the completion of a proxy task from a thread of that is part of the
4512 team. Run first and bottom halves directly.
4513 */
4514 void __kmpc_proxy_task_completed(kmp_int32 gtid, kmp_task_t *ptask) {
4515   KMP_DEBUG_ASSERT(ptask != NULL);
4516   kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(ptask);
4517   KA_TRACE(
4518       10, ("__kmp_proxy_task_completed(enter): T#%d proxy task %p completing\n",
4519            gtid, taskdata));
4520   __kmp_assert_valid_gtid(gtid);
4521   KMP_DEBUG_ASSERT(taskdata->td_flags.proxy == TASK_PROXY);
4522 
4523   __kmp_first_top_half_finish_proxy(taskdata);
4524   __kmp_second_top_half_finish_proxy(taskdata);
4525   __kmp_bottom_half_finish_proxy(gtid, ptask);
4526 
4527   KA_TRACE(10,
4528            ("__kmp_proxy_task_completed(exit): T#%d proxy task %p completing\n",
4529             gtid, taskdata));
4530 }
4531 
4532 void __kmpc_give_task(kmp_task_t *ptask, kmp_int32 start = 0) {
4533   KMP_DEBUG_ASSERT(ptask != NULL);
4534   kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(ptask);
4535 
4536   // Enqueue task to complete bottom half completion from a thread within the
4537   // corresponding team
4538   kmp_team_t *team = taskdata->td_team;
4539   kmp_int32 nthreads = team->t.t_nproc;
4540   kmp_info_t *thread;
4541 
4542   // This should be similar to start_k = __kmp_get_random( thread ) % nthreads
4543   // but we cannot use __kmp_get_random here
4544   kmp_int32 start_k = start % nthreads;
4545   kmp_int32 pass = 1;
4546   kmp_int32 k = start_k;
4547 
4548   do {
4549     // For now we're just linearly trying to find a thread
4550     thread = team->t.t_threads[k];
4551     k = (k + 1) % nthreads;
4552 
4553     // we did a full pass through all the threads
4554     if (k == start_k)
4555       pass = pass << 1;
4556 
4557   } while (!__kmp_give_task(thread, k, ptask, pass));
4558 
4559   if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME && __kmp_wpolicy_passive) {
4560     // awake at least one thread to execute given task
4561     for (int i = 0; i < nthreads; ++i) {
4562       thread = team->t.t_threads[i];
4563       if (thread->th.th_sleep_loc != NULL) {
4564         __kmp_null_resume_wrapper(thread);
4565         break;
4566       }
4567     }
4568   }
4569 }
4570 
4571 /*!
4572 @ingroup TASKING
4573 @param ptask Task which execution is completed
4574 
4575 Execute the completion of a proxy task from a thread that could not belong to
4576 the team.
4577 */
4578 void __kmpc_proxy_task_completed_ooo(kmp_task_t *ptask) {
4579   KMP_DEBUG_ASSERT(ptask != NULL);
4580   kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(ptask);
4581 
4582   KA_TRACE(
4583       10,
4584       ("__kmp_proxy_task_completed_ooo(enter): proxy task completing ooo %p\n",
4585        taskdata));
4586 
4587   KMP_DEBUG_ASSERT(taskdata->td_flags.proxy == TASK_PROXY);
4588 
4589   __kmp_first_top_half_finish_proxy(taskdata);
4590 
4591   __kmpc_give_task(ptask);
4592 
4593   __kmp_second_top_half_finish_proxy(taskdata);
4594 
4595   KA_TRACE(
4596       10,
4597       ("__kmp_proxy_task_completed_ooo(exit): proxy task completing ooo %p\n",
4598        taskdata));
4599 }
4600 
4601 kmp_event_t *__kmpc_task_allow_completion_event(ident_t *loc_ref, int gtid,
4602                                                 kmp_task_t *task) {
4603   kmp_taskdata_t *td = KMP_TASK_TO_TASKDATA(task);
4604   if (td->td_allow_completion_event.type == KMP_EVENT_UNINITIALIZED) {
4605     td->td_allow_completion_event.type = KMP_EVENT_ALLOW_COMPLETION;
4606     td->td_allow_completion_event.ed.task = task;
4607     __kmp_init_tas_lock(&td->td_allow_completion_event.lock);
4608   }
4609   return &td->td_allow_completion_event;
4610 }
4611 
4612 void __kmp_fulfill_event(kmp_event_t *event) {
4613   if (event->type == KMP_EVENT_ALLOW_COMPLETION) {
4614     kmp_task_t *ptask = event->ed.task;
4615     kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(ptask);
4616     bool detached = false;
4617     int gtid = __kmp_get_gtid();
4618 
4619     // The associated task might have completed or could be completing at this
4620     // point.
4621     // We need to take the lock to avoid races
4622     __kmp_acquire_tas_lock(&event->lock, gtid);
4623     if (taskdata->td_flags.proxy == TASK_PROXY) {
4624       detached = true;
4625     } else {
4626 #if OMPT_SUPPORT
4627       // The OMPT event must occur under mutual exclusion,
4628       // otherwise the tool might access ptask after free
4629       if (UNLIKELY(ompt_enabled.enabled))
4630         __ompt_task_finish(ptask, NULL, ompt_task_early_fulfill);
4631 #endif
4632     }
4633     event->type = KMP_EVENT_UNINITIALIZED;
4634     __kmp_release_tas_lock(&event->lock, gtid);
4635 
4636     if (detached) {
4637 #if OMPT_SUPPORT
4638       // We free ptask afterwards and know the task is finished,
4639       // so locking is not necessary
4640       if (UNLIKELY(ompt_enabled.enabled))
4641         __ompt_task_finish(ptask, NULL, ompt_task_late_fulfill);
4642 #endif
4643       // If the task detached complete the proxy task
4644       if (gtid >= 0) {
4645         kmp_team_t *team = taskdata->td_team;
4646         kmp_info_t *thread = __kmp_get_thread();
4647         if (thread->th.th_team == team) {
4648           __kmpc_proxy_task_completed(gtid, ptask);
4649           return;
4650         }
4651       }
4652 
4653       // fallback
4654       __kmpc_proxy_task_completed_ooo(ptask);
4655     }
4656   }
4657 }
4658 
4659 // __kmp_task_dup_alloc: Allocate the taskdata and make a copy of source task
4660 // for taskloop
4661 //
4662 // thread:   allocating thread
4663 // task_src: pointer to source task to be duplicated
4664 // taskloop_recur: used only when dealing with taskgraph,
4665 //      indicating whether we need to update task->td_task_id
4666 // returns:  a pointer to the allocated kmp_task_t structure (task).
4667 kmp_task_t *__kmp_task_dup_alloc(kmp_info_t *thread, kmp_task_t *task_src
4668 #if OMPX_TASKGRAPH
4669                                  , int taskloop_recur
4670 #endif
4671 ) {
4672   kmp_task_t *task;
4673   kmp_taskdata_t *taskdata;
4674   kmp_taskdata_t *taskdata_src = KMP_TASK_TO_TASKDATA(task_src);
4675   kmp_taskdata_t *parent_task = taskdata_src->td_parent; // same parent task
4676   size_t shareds_offset;
4677   size_t task_size;
4678 
4679   KA_TRACE(10, ("__kmp_task_dup_alloc(enter): Th %p, source task %p\n", thread,
4680                 task_src));
4681   KMP_DEBUG_ASSERT(taskdata_src->td_flags.proxy ==
4682                    TASK_FULL); // it should not be proxy task
4683   KMP_DEBUG_ASSERT(taskdata_src->td_flags.tasktype == TASK_EXPLICIT);
4684   task_size = taskdata_src->td_size_alloc;
4685 
4686   // Allocate a kmp_taskdata_t block and a kmp_task_t block.
4687   KA_TRACE(30, ("__kmp_task_dup_alloc: Th %p, malloc size %ld\n", thread,
4688                 task_size));
4689 #if USE_FAST_MEMORY
4690   taskdata = (kmp_taskdata_t *)__kmp_fast_allocate(thread, task_size);
4691 #else
4692   taskdata = (kmp_taskdata_t *)__kmp_thread_malloc(thread, task_size);
4693 #endif /* USE_FAST_MEMORY */
4694   KMP_MEMCPY(taskdata, taskdata_src, task_size);
4695 
4696   task = KMP_TASKDATA_TO_TASK(taskdata);
4697 
4698   // Initialize new task (only specific fields not affected by memcpy)
4699 #if OMPX_TASKGRAPH
4700   if (!taskdata->is_taskgraph || taskloop_recur)
4701     taskdata->td_task_id = KMP_GEN_TASK_ID();
4702   else if (taskdata->is_taskgraph &&
4703            __kmp_tdg_is_recording(taskdata_src->tdg->tdg_status))
4704     taskdata->td_task_id = KMP_ATOMIC_INC(&__kmp_tdg_task_id);
4705 #else
4706   taskdata->td_task_id = KMP_GEN_TASK_ID();
4707 #endif
4708   if (task->shareds != NULL) { // need setup shareds pointer
4709     shareds_offset = (char *)task_src->shareds - (char *)taskdata_src;
4710     task->shareds = &((char *)taskdata)[shareds_offset];
4711     KMP_DEBUG_ASSERT((((kmp_uintptr_t)task->shareds) & (sizeof(void *) - 1)) ==
4712                      0);
4713   }
4714   taskdata->td_alloc_thread = thread;
4715   taskdata->td_parent = parent_task;
4716   // task inherits the taskgroup from the parent task
4717   taskdata->td_taskgroup = parent_task->td_taskgroup;
4718   // tied task needs to initialize the td_last_tied at creation,
4719   // untied one does this when it is scheduled for execution
4720   if (taskdata->td_flags.tiedness == TASK_TIED)
4721     taskdata->td_last_tied = taskdata;
4722 
4723   // Only need to keep track of child task counts if team parallel and tasking
4724   // not serialized
4725   if (!(taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser)) {
4726     KMP_ATOMIC_INC(&parent_task->td_incomplete_child_tasks);
4727     if (parent_task->td_taskgroup)
4728       KMP_ATOMIC_INC(&parent_task->td_taskgroup->count);
4729     // Only need to keep track of allocated child tasks for explicit tasks since
4730     // implicit not deallocated
4731     if (taskdata->td_parent->td_flags.tasktype == TASK_EXPLICIT)
4732       KMP_ATOMIC_INC(&taskdata->td_parent->td_allocated_child_tasks);
4733   }
4734 
4735   KA_TRACE(20,
4736            ("__kmp_task_dup_alloc(exit): Th %p, created task %p, parent=%p\n",
4737             thread, taskdata, taskdata->td_parent));
4738 #if OMPT_SUPPORT
4739   if (UNLIKELY(ompt_enabled.enabled))
4740     __ompt_task_init(taskdata, thread->th.th_info.ds.ds_gtid);
4741 #endif
4742   return task;
4743 }
4744 
4745 // Routine optionally generated by the compiler for setting the lastprivate flag
4746 // and calling needed constructors for private/firstprivate objects
4747 // (used to form taskloop tasks from pattern task)
4748 // Parameters: dest task, src task, lastprivate flag.
4749 typedef void (*p_task_dup_t)(kmp_task_t *, kmp_task_t *, kmp_int32);
4750 
4751 KMP_BUILD_ASSERT(sizeof(long) == 4 || sizeof(long) == 8);
4752 
4753 // class to encapsulate manipulating loop bounds in a taskloop task.
4754 // this abstracts away the Intel vs GOMP taskloop interface for setting/getting
4755 // the loop bound variables.
4756 class kmp_taskloop_bounds_t {
4757   kmp_task_t *task;
4758   const kmp_taskdata_t *taskdata;
4759   size_t lower_offset;
4760   size_t upper_offset;
4761 
4762 public:
4763   kmp_taskloop_bounds_t(kmp_task_t *_task, kmp_uint64 *lb, kmp_uint64 *ub)
4764       : task(_task), taskdata(KMP_TASK_TO_TASKDATA(task)),
4765         lower_offset((char *)lb - (char *)task),
4766         upper_offset((char *)ub - (char *)task) {
4767     KMP_DEBUG_ASSERT((char *)lb > (char *)_task);
4768     KMP_DEBUG_ASSERT((char *)ub > (char *)_task);
4769   }
4770   kmp_taskloop_bounds_t(kmp_task_t *_task, const kmp_taskloop_bounds_t &bounds)
4771       : task(_task), taskdata(KMP_TASK_TO_TASKDATA(_task)),
4772         lower_offset(bounds.lower_offset), upper_offset(bounds.upper_offset) {}
4773   size_t get_lower_offset() const { return lower_offset; }
4774   size_t get_upper_offset() const { return upper_offset; }
4775   kmp_uint64 get_lb() const {
4776     kmp_int64 retval;
4777 #if defined(KMP_GOMP_COMPAT)
4778     // Intel task just returns the lower bound normally
4779     if (!taskdata->td_flags.native) {
4780       retval = *(kmp_int64 *)((char *)task + lower_offset);
4781     } else {
4782       // GOMP task has to take into account the sizeof(long)
4783       if (taskdata->td_size_loop_bounds == 4) {
4784         kmp_int32 *lb = RCAST(kmp_int32 *, task->shareds);
4785         retval = (kmp_int64)*lb;
4786       } else {
4787         kmp_int64 *lb = RCAST(kmp_int64 *, task->shareds);
4788         retval = (kmp_int64)*lb;
4789       }
4790     }
4791 #else
4792     (void)taskdata;
4793     retval = *(kmp_int64 *)((char *)task + lower_offset);
4794 #endif // defined(KMP_GOMP_COMPAT)
4795     return retval;
4796   }
4797   kmp_uint64 get_ub() const {
4798     kmp_int64 retval;
4799 #if defined(KMP_GOMP_COMPAT)
4800     // Intel task just returns the upper bound normally
4801     if (!taskdata->td_flags.native) {
4802       retval = *(kmp_int64 *)((char *)task + upper_offset);
4803     } else {
4804       // GOMP task has to take into account the sizeof(long)
4805       if (taskdata->td_size_loop_bounds == 4) {
4806         kmp_int32 *ub = RCAST(kmp_int32 *, task->shareds) + 1;
4807         retval = (kmp_int64)*ub;
4808       } else {
4809         kmp_int64 *ub = RCAST(kmp_int64 *, task->shareds) + 1;
4810         retval = (kmp_int64)*ub;
4811       }
4812     }
4813 #else
4814     retval = *(kmp_int64 *)((char *)task + upper_offset);
4815 #endif // defined(KMP_GOMP_COMPAT)
4816     return retval;
4817   }
4818   void set_lb(kmp_uint64 lb) {
4819 #if defined(KMP_GOMP_COMPAT)
4820     // Intel task just sets the lower bound normally
4821     if (!taskdata->td_flags.native) {
4822       *(kmp_uint64 *)((char *)task + lower_offset) = lb;
4823     } else {
4824       // GOMP task has to take into account the sizeof(long)
4825       if (taskdata->td_size_loop_bounds == 4) {
4826         kmp_uint32 *lower = RCAST(kmp_uint32 *, task->shareds);
4827         *lower = (kmp_uint32)lb;
4828       } else {
4829         kmp_uint64 *lower = RCAST(kmp_uint64 *, task->shareds);
4830         *lower = (kmp_uint64)lb;
4831       }
4832     }
4833 #else
4834     *(kmp_uint64 *)((char *)task + lower_offset) = lb;
4835 #endif // defined(KMP_GOMP_COMPAT)
4836   }
4837   void set_ub(kmp_uint64 ub) {
4838 #if defined(KMP_GOMP_COMPAT)
4839     // Intel task just sets the upper bound normally
4840     if (!taskdata->td_flags.native) {
4841       *(kmp_uint64 *)((char *)task + upper_offset) = ub;
4842     } else {
4843       // GOMP task has to take into account the sizeof(long)
4844       if (taskdata->td_size_loop_bounds == 4) {
4845         kmp_uint32 *upper = RCAST(kmp_uint32 *, task->shareds) + 1;
4846         *upper = (kmp_uint32)ub;
4847       } else {
4848         kmp_uint64 *upper = RCAST(kmp_uint64 *, task->shareds) + 1;
4849         *upper = (kmp_uint64)ub;
4850       }
4851     }
4852 #else
4853     *(kmp_uint64 *)((char *)task + upper_offset) = ub;
4854 #endif // defined(KMP_GOMP_COMPAT)
4855   }
4856 };
4857 
4858 // __kmp_taskloop_linear: Start tasks of the taskloop linearly
4859 //
4860 // loc        Source location information
4861 // gtid       Global thread ID
4862 // task       Pattern task, exposes the loop iteration range
4863 // lb         Pointer to loop lower bound in task structure
4864 // ub         Pointer to loop upper bound in task structure
4865 // st         Loop stride
4866 // ub_glob    Global upper bound (used for lastprivate check)
4867 // num_tasks  Number of tasks to execute
4868 // grainsize  Number of loop iterations per task
4869 // extras     Number of chunks with grainsize+1 iterations
4870 // last_chunk Reduction of grainsize for last task
4871 // tc         Iterations count
4872 // task_dup   Tasks duplication routine
4873 // codeptr_ra Return address for OMPT events
4874 void __kmp_taskloop_linear(ident_t *loc, int gtid, kmp_task_t *task,
4875                            kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st,
4876                            kmp_uint64 ub_glob, kmp_uint64 num_tasks,
4877                            kmp_uint64 grainsize, kmp_uint64 extras,
4878                            kmp_int64 last_chunk, kmp_uint64 tc,
4879 #if OMPT_SUPPORT
4880                            void *codeptr_ra,
4881 #endif
4882                            void *task_dup) {
4883   KMP_COUNT_BLOCK(OMP_TASKLOOP);
4884   KMP_TIME_PARTITIONED_BLOCK(OMP_taskloop_scheduling);
4885   p_task_dup_t ptask_dup = (p_task_dup_t)task_dup;
4886   // compiler provides global bounds here
4887   kmp_taskloop_bounds_t task_bounds(task, lb, ub);
4888   kmp_uint64 lower = task_bounds.get_lb();
4889   kmp_uint64 upper = task_bounds.get_ub();
4890   kmp_uint64 i;
4891   kmp_info_t *thread = __kmp_threads[gtid];
4892   kmp_taskdata_t *current_task = thread->th.th_current_task;
4893   kmp_task_t *next_task;
4894   kmp_int32 lastpriv = 0;
4895 
4896   KMP_DEBUG_ASSERT(tc == num_tasks * grainsize +
4897                              (last_chunk < 0 ? last_chunk : extras));
4898   KMP_DEBUG_ASSERT(num_tasks > extras);
4899   KMP_DEBUG_ASSERT(num_tasks > 0);
4900   KA_TRACE(20, ("__kmp_taskloop_linear: T#%d: %lld tasks, grainsize %lld, "
4901                 "extras %lld, last_chunk %lld, i=%lld,%lld(%d)%lld, dup %p\n",
4902                 gtid, num_tasks, grainsize, extras, last_chunk, lower, upper,
4903                 ub_glob, st, task_dup));
4904 
4905   // Launch num_tasks tasks, assign grainsize iterations each task
4906   for (i = 0; i < num_tasks; ++i) {
4907     kmp_uint64 chunk_minus_1;
4908     if (extras == 0) {
4909       chunk_minus_1 = grainsize - 1;
4910     } else {
4911       chunk_minus_1 = grainsize;
4912       --extras; // first extras iterations get bigger chunk (grainsize+1)
4913     }
4914     upper = lower + st * chunk_minus_1;
4915     if (upper > *ub) {
4916       upper = *ub;
4917     }
4918     if (i == num_tasks - 1) {
4919       // schedule the last task, set lastprivate flag if needed
4920       if (st == 1) { // most common case
4921         KMP_DEBUG_ASSERT(upper == *ub);
4922         if (upper == ub_glob)
4923           lastpriv = 1;
4924       } else if (st > 0) { // positive loop stride
4925         KMP_DEBUG_ASSERT((kmp_uint64)st > *ub - upper);
4926         if ((kmp_uint64)st > ub_glob - upper)
4927           lastpriv = 1;
4928       } else { // negative loop stride
4929         KMP_DEBUG_ASSERT(upper + st < *ub);
4930         if (upper - ub_glob < (kmp_uint64)(-st))
4931           lastpriv = 1;
4932       }
4933     }
4934 
4935 #if OMPX_TASKGRAPH
4936     next_task = __kmp_task_dup_alloc(thread, task, /* taskloop_recur */ 0);
4937 #else
4938     next_task = __kmp_task_dup_alloc(thread, task); // allocate new task
4939 #endif
4940 
4941     kmp_taskdata_t *next_taskdata = KMP_TASK_TO_TASKDATA(next_task);
4942     kmp_taskloop_bounds_t next_task_bounds =
4943         kmp_taskloop_bounds_t(next_task, task_bounds);
4944 
4945     // adjust task-specific bounds
4946     next_task_bounds.set_lb(lower);
4947     if (next_taskdata->td_flags.native) {
4948       next_task_bounds.set_ub(upper + (st > 0 ? 1 : -1));
4949     } else {
4950       next_task_bounds.set_ub(upper);
4951     }
4952     if (ptask_dup != NULL) // set lastprivate flag, construct firstprivates,
4953                            // etc.
4954       ptask_dup(next_task, task, lastpriv);
4955     KA_TRACE(40,
4956              ("__kmp_taskloop_linear: T#%d; task #%llu: task %p: lower %lld, "
4957               "upper %lld stride %lld, (offsets %p %p)\n",
4958               gtid, i, next_task, lower, upper, st,
4959               next_task_bounds.get_lower_offset(),
4960               next_task_bounds.get_upper_offset()));
4961 #if OMPT_SUPPORT
4962     __kmp_omp_taskloop_task(NULL, gtid, next_task,
4963                             codeptr_ra); // schedule new task
4964 #if OMPT_OPTIONAL
4965     if (ompt_enabled.ompt_callback_dispatch) {
4966       OMPT_GET_DISPATCH_CHUNK(next_taskdata->ompt_task_info.dispatch_chunk,
4967                               lower, upper, st);
4968     }
4969 #endif // OMPT_OPTIONAL
4970 #else
4971     __kmp_omp_task(gtid, next_task, true); // schedule new task
4972 #endif
4973     lower = upper + st; // adjust lower bound for the next iteration
4974   }
4975   // free the pattern task and exit
4976   __kmp_task_start(gtid, task, current_task); // make internal bookkeeping
4977   // do not execute the pattern task, just do internal bookkeeping
4978   __kmp_task_finish<false>(gtid, task, current_task);
4979 }
4980 
4981 // Structure to keep taskloop parameters for auxiliary task
4982 // kept in the shareds of the task structure.
4983 typedef struct __taskloop_params {
4984   kmp_task_t *task;
4985   kmp_uint64 *lb;
4986   kmp_uint64 *ub;
4987   void *task_dup;
4988   kmp_int64 st;
4989   kmp_uint64 ub_glob;
4990   kmp_uint64 num_tasks;
4991   kmp_uint64 grainsize;
4992   kmp_uint64 extras;
4993   kmp_int64 last_chunk;
4994   kmp_uint64 tc;
4995   kmp_uint64 num_t_min;
4996 #if OMPT_SUPPORT
4997   void *codeptr_ra;
4998 #endif
4999 } __taskloop_params_t;
5000 
5001 void __kmp_taskloop_recur(ident_t *, int, kmp_task_t *, kmp_uint64 *,
5002                           kmp_uint64 *, kmp_int64, kmp_uint64, kmp_uint64,
5003                           kmp_uint64, kmp_uint64, kmp_int64, kmp_uint64,
5004                           kmp_uint64,
5005 #if OMPT_SUPPORT
5006                           void *,
5007 #endif
5008                           void *);
5009 
5010 // Execute part of the taskloop submitted as a task.
5011 int __kmp_taskloop_task(int gtid, void *ptask) {
5012   __taskloop_params_t *p =
5013       (__taskloop_params_t *)((kmp_task_t *)ptask)->shareds;
5014   kmp_task_t *task = p->task;
5015   kmp_uint64 *lb = p->lb;
5016   kmp_uint64 *ub = p->ub;
5017   void *task_dup = p->task_dup;
5018   //  p_task_dup_t ptask_dup = (p_task_dup_t)task_dup;
5019   kmp_int64 st = p->st;
5020   kmp_uint64 ub_glob = p->ub_glob;
5021   kmp_uint64 num_tasks = p->num_tasks;
5022   kmp_uint64 grainsize = p->grainsize;
5023   kmp_uint64 extras = p->extras;
5024   kmp_int64 last_chunk = p->last_chunk;
5025   kmp_uint64 tc = p->tc;
5026   kmp_uint64 num_t_min = p->num_t_min;
5027 #if OMPT_SUPPORT
5028   void *codeptr_ra = p->codeptr_ra;
5029 #endif
5030 #if KMP_DEBUG
5031   kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
5032   KMP_DEBUG_ASSERT(task != NULL);
5033   KA_TRACE(20,
5034            ("__kmp_taskloop_task: T#%d, task %p: %lld tasks, grainsize"
5035             " %lld, extras %lld, last_chunk %lld, i=%lld,%lld(%d), dup %p\n",
5036             gtid, taskdata, num_tasks, grainsize, extras, last_chunk, *lb, *ub,
5037             st, task_dup));
5038 #endif
5039   KMP_DEBUG_ASSERT(num_tasks * 2 + 1 > num_t_min);
5040   if (num_tasks > num_t_min)
5041     __kmp_taskloop_recur(NULL, gtid, task, lb, ub, st, ub_glob, num_tasks,
5042                          grainsize, extras, last_chunk, tc, num_t_min,
5043 #if OMPT_SUPPORT
5044                          codeptr_ra,
5045 #endif
5046                          task_dup);
5047   else
5048     __kmp_taskloop_linear(NULL, gtid, task, lb, ub, st, ub_glob, num_tasks,
5049                           grainsize, extras, last_chunk, tc,
5050 #if OMPT_SUPPORT
5051                           codeptr_ra,
5052 #endif
5053                           task_dup);
5054 
5055   KA_TRACE(40, ("__kmp_taskloop_task(exit): T#%d\n", gtid));
5056   return 0;
5057 }
5058 
5059 // Schedule part of the taskloop as a task,
5060 // execute the rest of the taskloop.
5061 //
5062 // loc        Source location information
5063 // gtid       Global thread ID
5064 // task       Pattern task, exposes the loop iteration range
5065 // lb         Pointer to loop lower bound in task structure
5066 // ub         Pointer to loop upper bound in task structure
5067 // st         Loop stride
5068 // ub_glob    Global upper bound (used for lastprivate check)
5069 // num_tasks  Number of tasks to execute
5070 // grainsize  Number of loop iterations per task
5071 // extras     Number of chunks with grainsize+1 iterations
5072 // last_chunk Reduction of grainsize for last task
5073 // tc         Iterations count
5074 // num_t_min  Threshold to launch tasks recursively
5075 // task_dup   Tasks duplication routine
5076 // codeptr_ra Return address for OMPT events
5077 void __kmp_taskloop_recur(ident_t *loc, int gtid, kmp_task_t *task,
5078                           kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st,
5079                           kmp_uint64 ub_glob, kmp_uint64 num_tasks,
5080                           kmp_uint64 grainsize, kmp_uint64 extras,
5081                           kmp_int64 last_chunk, kmp_uint64 tc,
5082                           kmp_uint64 num_t_min,
5083 #if OMPT_SUPPORT
5084                           void *codeptr_ra,
5085 #endif
5086                           void *task_dup) {
5087   kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
5088   KMP_DEBUG_ASSERT(task != NULL);
5089   KMP_DEBUG_ASSERT(num_tasks > num_t_min);
5090   KA_TRACE(20,
5091            ("__kmp_taskloop_recur: T#%d, task %p: %lld tasks, grainsize"
5092             " %lld, extras %lld, last_chunk %lld, i=%lld,%lld(%d), dup %p\n",
5093             gtid, taskdata, num_tasks, grainsize, extras, last_chunk, *lb, *ub,
5094             st, task_dup));
5095   p_task_dup_t ptask_dup = (p_task_dup_t)task_dup;
5096   kmp_uint64 lower = *lb;
5097   kmp_info_t *thread = __kmp_threads[gtid];
5098   //  kmp_taskdata_t *current_task = thread->th.th_current_task;
5099   kmp_task_t *next_task;
5100   size_t lower_offset =
5101       (char *)lb - (char *)task; // remember offset of lb in the task structure
5102   size_t upper_offset =
5103       (char *)ub - (char *)task; // remember offset of ub in the task structure
5104 
5105   KMP_DEBUG_ASSERT(tc == num_tasks * grainsize +
5106                              (last_chunk < 0 ? last_chunk : extras));
5107   KMP_DEBUG_ASSERT(num_tasks > extras);
5108   KMP_DEBUG_ASSERT(num_tasks > 0);
5109 
5110   // split the loop in two halves
5111   kmp_uint64 lb1, ub0, tc0, tc1, ext0, ext1;
5112   kmp_int64 last_chunk0 = 0, last_chunk1 = 0;
5113   kmp_uint64 gr_size0 = grainsize;
5114   kmp_uint64 n_tsk0 = num_tasks >> 1; // num_tasks/2 to execute
5115   kmp_uint64 n_tsk1 = num_tasks - n_tsk0; // to schedule as a task
5116   if (last_chunk < 0) {
5117     ext0 = ext1 = 0;
5118     last_chunk1 = last_chunk;
5119     tc0 = grainsize * n_tsk0;
5120     tc1 = tc - tc0;
5121   } else if (n_tsk0 <= extras) {
5122     gr_size0++; // integrate extras into grainsize
5123     ext0 = 0; // no extra iters in 1st half
5124     ext1 = extras - n_tsk0; // remaining extras
5125     tc0 = gr_size0 * n_tsk0;
5126     tc1 = tc - tc0;
5127   } else { // n_tsk0 > extras
5128     ext1 = 0; // no extra iters in 2nd half
5129     ext0 = extras;
5130     tc1 = grainsize * n_tsk1;
5131     tc0 = tc - tc1;
5132   }
5133   ub0 = lower + st * (tc0 - 1);
5134   lb1 = ub0 + st;
5135 
5136   // create pattern task for 2nd half of the loop
5137 #if OMPX_TASKGRAPH
5138   next_task = __kmp_task_dup_alloc(thread, task,
5139                                    /* taskloop_recur */ 1);
5140 #else
5141   next_task = __kmp_task_dup_alloc(thread, task); // duplicate the task
5142 #endif
5143   // adjust lower bound (upper bound is not changed) for the 2nd half
5144   *(kmp_uint64 *)((char *)next_task + lower_offset) = lb1;
5145   if (ptask_dup != NULL) // construct firstprivates, etc.
5146     ptask_dup(next_task, task, 0);
5147   *ub = ub0; // adjust upper bound for the 1st half
5148 
5149   // create auxiliary task for 2nd half of the loop
5150   // make sure new task has same parent task as the pattern task
5151   kmp_taskdata_t *current_task = thread->th.th_current_task;
5152   thread->th.th_current_task = taskdata->td_parent;
5153   kmp_task_t *new_task =
5154       __kmpc_omp_task_alloc(loc, gtid, 1, 3 * sizeof(void *),
5155                             sizeof(__taskloop_params_t), &__kmp_taskloop_task);
5156   // restore current task
5157   thread->th.th_current_task = current_task;
5158   __taskloop_params_t *p = (__taskloop_params_t *)new_task->shareds;
5159   p->task = next_task;
5160   p->lb = (kmp_uint64 *)((char *)next_task + lower_offset);
5161   p->ub = (kmp_uint64 *)((char *)next_task + upper_offset);
5162   p->task_dup = task_dup;
5163   p->st = st;
5164   p->ub_glob = ub_glob;
5165   p->num_tasks = n_tsk1;
5166   p->grainsize = grainsize;
5167   p->extras = ext1;
5168   p->last_chunk = last_chunk1;
5169   p->tc = tc1;
5170   p->num_t_min = num_t_min;
5171 #if OMPT_SUPPORT
5172   p->codeptr_ra = codeptr_ra;
5173 #endif
5174 
5175 #if OMPX_TASKGRAPH
5176   kmp_taskdata_t *new_task_data = KMP_TASK_TO_TASKDATA(new_task);
5177   new_task_data->tdg = taskdata->tdg;
5178   new_task_data->is_taskgraph = 0;
5179 #endif
5180 
5181 #if OMPT_SUPPORT
5182   // schedule new task with correct return address for OMPT events
5183   __kmp_omp_taskloop_task(NULL, gtid, new_task, codeptr_ra);
5184 #else
5185   __kmp_omp_task(gtid, new_task, true); // schedule new task
5186 #endif
5187 
5188   // execute the 1st half of current subrange
5189   if (n_tsk0 > num_t_min)
5190     __kmp_taskloop_recur(loc, gtid, task, lb, ub, st, ub_glob, n_tsk0, gr_size0,
5191                          ext0, last_chunk0, tc0, num_t_min,
5192 #if OMPT_SUPPORT
5193                          codeptr_ra,
5194 #endif
5195                          task_dup);
5196   else
5197     __kmp_taskloop_linear(loc, gtid, task, lb, ub, st, ub_glob, n_tsk0,
5198                           gr_size0, ext0, last_chunk0, tc0,
5199 #if OMPT_SUPPORT
5200                           codeptr_ra,
5201 #endif
5202                           task_dup);
5203 
5204   KA_TRACE(40, ("__kmp_taskloop_recur(exit): T#%d\n", gtid));
5205 }
5206 
5207 static void __kmp_taskloop(ident_t *loc, int gtid, kmp_task_t *task, int if_val,
5208                            kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st,
5209                            int nogroup, int sched, kmp_uint64 grainsize,
5210                            int modifier, void *task_dup) {
5211   kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
5212   KMP_DEBUG_ASSERT(task != NULL);
5213   if (nogroup == 0) {
5214 #if OMPT_SUPPORT && OMPT_OPTIONAL
5215     OMPT_STORE_RETURN_ADDRESS(gtid);
5216 #endif
5217     __kmpc_taskgroup(loc, gtid);
5218   }
5219 
5220 #if OMPX_TASKGRAPH
5221   KMP_ATOMIC_DEC(&__kmp_tdg_task_id);
5222 #endif
5223   // =========================================================================
5224   // calculate loop parameters
5225   kmp_taskloop_bounds_t task_bounds(task, lb, ub);
5226   kmp_uint64 tc;
5227   // compiler provides global bounds here
5228   kmp_uint64 lower = task_bounds.get_lb();
5229   kmp_uint64 upper = task_bounds.get_ub();
5230   kmp_uint64 ub_glob = upper; // global upper used to calc lastprivate flag
5231   kmp_uint64 num_tasks = 0, extras = 0;
5232   kmp_int64 last_chunk =
5233       0; // reduce grainsize of last task by last_chunk in strict mode
5234   kmp_uint64 num_tasks_min = __kmp_taskloop_min_tasks;
5235   kmp_info_t *thread = __kmp_threads[gtid];
5236   kmp_taskdata_t *current_task = thread->th.th_current_task;
5237 
5238   KA_TRACE(20, ("__kmp_taskloop: T#%d, task %p, lb %lld, ub %lld, st %lld, "
5239                 "grain %llu(%d, %d), dup %p\n",
5240                 gtid, taskdata, lower, upper, st, grainsize, sched, modifier,
5241                 task_dup));
5242 
5243   // compute trip count
5244   if (st == 1) { // most common case
5245     tc = upper - lower + 1;
5246   } else if (st < 0) {
5247     tc = (lower - upper) / (-st) + 1;
5248   } else { // st > 0
5249     tc = (upper - lower) / st + 1;
5250   }
5251   if (tc == 0) {
5252     KA_TRACE(20, ("__kmp_taskloop(exit): T#%d zero-trip loop\n", gtid));
5253     // free the pattern task and exit
5254     __kmp_task_start(gtid, task, current_task);
5255     // do not execute anything for zero-trip loop
5256     __kmp_task_finish<false>(gtid, task, current_task);
5257     return;
5258   }
5259 
5260 #if OMPT_SUPPORT && OMPT_OPTIONAL
5261   ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
5262   ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
5263   if (ompt_enabled.ompt_callback_work) {
5264     ompt_callbacks.ompt_callback(ompt_callback_work)(
5265         ompt_work_taskloop, ompt_scope_begin, &(team_info->parallel_data),
5266         &(task_info->task_data), tc, OMPT_GET_RETURN_ADDRESS(0));
5267   }
5268 #endif
5269 
5270   if (num_tasks_min == 0)
5271     // TODO: can we choose better default heuristic?
5272     num_tasks_min =
5273         KMP_MIN(thread->th.th_team_nproc * 10, INITIAL_TASK_DEQUE_SIZE);
5274 
5275   // compute num_tasks/grainsize based on the input provided
5276   switch (sched) {
5277   case 0: // no schedule clause specified, we can choose the default
5278     // let's try to schedule (team_size*10) tasks
5279     grainsize = thread->th.th_team_nproc * 10;
5280     KMP_FALLTHROUGH();
5281   case 2: // num_tasks provided
5282     if (grainsize > tc) {
5283       num_tasks = tc; // too big num_tasks requested, adjust values
5284       grainsize = 1;
5285       extras = 0;
5286     } else {
5287       num_tasks = grainsize;
5288       grainsize = tc / num_tasks;
5289       extras = tc % num_tasks;
5290     }
5291     break;
5292   case 1: // grainsize provided
5293     if (grainsize > tc) {
5294       num_tasks = 1;
5295       grainsize = tc; // too big grainsize requested, adjust values
5296       extras = 0;
5297     } else {
5298       if (modifier) {
5299         num_tasks = (tc + grainsize - 1) / grainsize;
5300         last_chunk = tc - (num_tasks * grainsize);
5301         extras = 0;
5302       } else {
5303         num_tasks = tc / grainsize;
5304         // adjust grainsize for balanced distribution of iterations
5305         grainsize = tc / num_tasks;
5306         extras = tc % num_tasks;
5307       }
5308     }
5309     break;
5310   default:
5311     KMP_ASSERT2(0, "unknown scheduling of taskloop");
5312   }
5313 
5314   KMP_DEBUG_ASSERT(tc == num_tasks * grainsize +
5315                              (last_chunk < 0 ? last_chunk : extras));
5316   KMP_DEBUG_ASSERT(num_tasks > extras);
5317   KMP_DEBUG_ASSERT(num_tasks > 0);
5318   // =========================================================================
5319 
5320   // check if clause value first
5321   // Also require GOMP_taskloop to reduce to linear (taskdata->td_flags.native)
5322   if (if_val == 0) { // if(0) specified, mark task as serial
5323     taskdata->td_flags.task_serial = 1;
5324     taskdata->td_flags.tiedness = TASK_TIED; // AC: serial task cannot be untied
5325     // always start serial tasks linearly
5326     __kmp_taskloop_linear(loc, gtid, task, lb, ub, st, ub_glob, num_tasks,
5327                           grainsize, extras, last_chunk, tc,
5328 #if OMPT_SUPPORT
5329                           OMPT_GET_RETURN_ADDRESS(0),
5330 #endif
5331                           task_dup);
5332     // !taskdata->td_flags.native => currently force linear spawning of tasks
5333     // for GOMP_taskloop
5334   } else if (num_tasks > num_tasks_min && !taskdata->td_flags.native) {
5335     KA_TRACE(20, ("__kmp_taskloop: T#%d, go recursive: tc %llu, #tasks %llu"
5336                   "(%lld), grain %llu, extras %llu, last_chunk %lld\n",
5337                   gtid, tc, num_tasks, num_tasks_min, grainsize, extras,
5338                   last_chunk));
5339     __kmp_taskloop_recur(loc, gtid, task, lb, ub, st, ub_glob, num_tasks,
5340                          grainsize, extras, last_chunk, tc, num_tasks_min,
5341 #if OMPT_SUPPORT
5342                          OMPT_GET_RETURN_ADDRESS(0),
5343 #endif
5344                          task_dup);
5345   } else {
5346     KA_TRACE(20, ("__kmp_taskloop: T#%d, go linear: tc %llu, #tasks %llu"
5347                   "(%lld), grain %llu, extras %llu, last_chunk %lld\n",
5348                   gtid, tc, num_tasks, num_tasks_min, grainsize, extras,
5349                   last_chunk));
5350     __kmp_taskloop_linear(loc, gtid, task, lb, ub, st, ub_glob, num_tasks,
5351                           grainsize, extras, last_chunk, tc,
5352 #if OMPT_SUPPORT
5353                           OMPT_GET_RETURN_ADDRESS(0),
5354 #endif
5355                           task_dup);
5356   }
5357 
5358 #if OMPT_SUPPORT && OMPT_OPTIONAL
5359   if (ompt_enabled.ompt_callback_work) {
5360     ompt_callbacks.ompt_callback(ompt_callback_work)(
5361         ompt_work_taskloop, ompt_scope_end, &(team_info->parallel_data),
5362         &(task_info->task_data), tc, OMPT_GET_RETURN_ADDRESS(0));
5363   }
5364 #endif
5365 
5366   if (nogroup == 0) {
5367 #if OMPT_SUPPORT && OMPT_OPTIONAL
5368     OMPT_STORE_RETURN_ADDRESS(gtid);
5369 #endif
5370     __kmpc_end_taskgroup(loc, gtid);
5371   }
5372   KA_TRACE(20, ("__kmp_taskloop(exit): T#%d\n", gtid));
5373 }
5374 
5375 /*!
5376 @ingroup TASKING
5377 @param loc       Source location information
5378 @param gtid      Global thread ID
5379 @param task      Task structure
5380 @param if_val    Value of the if clause
5381 @param lb        Pointer to loop lower bound in task structure
5382 @param ub        Pointer to loop upper bound in task structure
5383 @param st        Loop stride
5384 @param nogroup   Flag, 1 if nogroup clause specified, 0 otherwise
5385 @param sched     Schedule specified 0/1/2 for none/grainsize/num_tasks
5386 @param grainsize Schedule value if specified
5387 @param task_dup  Tasks duplication routine
5388 
5389 Execute the taskloop construct.
5390 */
5391 void __kmpc_taskloop(ident_t *loc, int gtid, kmp_task_t *task, int if_val,
5392                      kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st, int nogroup,
5393                      int sched, kmp_uint64 grainsize, void *task_dup) {
5394   __kmp_assert_valid_gtid(gtid);
5395   KA_TRACE(20, ("__kmpc_taskloop(enter): T#%d\n", gtid));
5396   __kmp_taskloop(loc, gtid, task, if_val, lb, ub, st, nogroup, sched, grainsize,
5397                  0, task_dup);
5398   KA_TRACE(20, ("__kmpc_taskloop(exit): T#%d\n", gtid));
5399 }
5400 
5401 /*!
5402 @ingroup TASKING
5403 @param loc       Source location information
5404 @param gtid      Global thread ID
5405 @param task      Task structure
5406 @param if_val    Value of the if clause
5407 @param lb        Pointer to loop lower bound in task structure
5408 @param ub        Pointer to loop upper bound in task structure
5409 @param st        Loop stride
5410 @param nogroup   Flag, 1 if nogroup clause specified, 0 otherwise
5411 @param sched     Schedule specified 0/1/2 for none/grainsize/num_tasks
5412 @param grainsize Schedule value if specified
5413 @param modifier  Modifier 'strict' for sched, 1 if present, 0 otherwise
5414 @param task_dup  Tasks duplication routine
5415 
5416 Execute the taskloop construct.
5417 */
5418 void __kmpc_taskloop_5(ident_t *loc, int gtid, kmp_task_t *task, int if_val,
5419                        kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st,
5420                        int nogroup, int sched, kmp_uint64 grainsize,
5421                        int modifier, void *task_dup) {
5422   __kmp_assert_valid_gtid(gtid);
5423   KA_TRACE(20, ("__kmpc_taskloop_5(enter): T#%d\n", gtid));
5424   __kmp_taskloop(loc, gtid, task, if_val, lb, ub, st, nogroup, sched, grainsize,
5425                  modifier, task_dup);
5426   KA_TRACE(20, ("__kmpc_taskloop_5(exit): T#%d\n", gtid));
5427 }
5428 
5429 /*!
5430 @ingroup TASKING
5431 @param gtid Global Thread ID of current thread
5432 @return Returns a pointer to the thread's current task async handle. If no task
5433 is present or gtid is invalid, returns NULL.
5434 
5435 Acqurires a pointer to the target async handle from the current task.
5436 */
5437 void **__kmpc_omp_get_target_async_handle_ptr(kmp_int32 gtid) {
5438   if (gtid == KMP_GTID_DNE)
5439     return NULL;
5440 
5441   kmp_info_t *thread = __kmp_thread_from_gtid(gtid);
5442   kmp_taskdata_t *taskdata = thread->th.th_current_task;
5443 
5444   if (!taskdata)
5445     return NULL;
5446 
5447   return &taskdata->td_target_data.async_handle;
5448 }
5449 
5450 /*!
5451 @ingroup TASKING
5452 @param gtid Global Thread ID of current thread
5453 @return Returns TRUE if the current task being executed of the given thread has
5454 a task team allocated to it. Otherwise, returns FALSE.
5455 
5456 Checks if the current thread has a task team.
5457 */
5458 bool __kmpc_omp_has_task_team(kmp_int32 gtid) {
5459   if (gtid == KMP_GTID_DNE)
5460     return FALSE;
5461 
5462   kmp_info_t *thread = __kmp_thread_from_gtid(gtid);
5463   kmp_taskdata_t *taskdata = thread->th.th_current_task;
5464 
5465   if (!taskdata)
5466     return FALSE;
5467 
5468   return taskdata->td_task_team != NULL;
5469 }
5470 
5471 #if OMPX_TASKGRAPH
5472 // __kmp_find_tdg: identify a TDG through its ID
5473 // gtid:   Global Thread ID
5474 // tdg_id: ID of the TDG
5475 // returns: If a TDG corresponding to this ID is found and not
5476 // its initial state, return the pointer to it, otherwise nullptr
5477 static kmp_tdg_info_t *__kmp_find_tdg(kmp_int32 tdg_id) {
5478   kmp_tdg_info_t *res = nullptr;
5479   if (__kmp_max_tdgs == 0)
5480     return res;
5481 
5482   if (__kmp_global_tdgs == NULL)
5483     __kmp_global_tdgs = (kmp_tdg_info_t **)__kmp_allocate(
5484         sizeof(kmp_tdg_info_t *) * __kmp_max_tdgs);
5485 
5486   if ((__kmp_global_tdgs[tdg_id]) &&
5487       (__kmp_global_tdgs[tdg_id]->tdg_status != KMP_TDG_NONE))
5488     res = __kmp_global_tdgs[tdg_id];
5489   return res;
5490 }
5491 
5492 // __kmp_print_tdg_dot: prints the TDG to a dot file
5493 // tdg:    ID of the TDG
5494 void __kmp_print_tdg_dot(kmp_tdg_info_t *tdg) {
5495   kmp_int32 tdg_id = tdg->tdg_id;
5496   KA_TRACE(10, ("__kmp_print_tdg_dot(enter): T#%d tdg_id=%d \n", gtid, tdg_id));
5497 
5498   char file_name[20];
5499   sprintf(file_name, "tdg_%d.dot", tdg_id);
5500   kmp_safe_raii_file_t tdg_file(file_name, "w");
5501 
5502   kmp_int32 num_tasks = KMP_ATOMIC_LD_RLX(&tdg->num_tasks);
5503   fprintf(tdg_file,
5504           "digraph TDG {\n"
5505           "   compound=true\n"
5506           "   subgraph cluster {\n"
5507           "      label=TDG_%d\n",
5508           tdg_id);
5509   for (kmp_int32 i = 0; i < num_tasks; i++) {
5510     fprintf(tdg_file, "      %d[style=bold]\n", i);
5511   }
5512   fprintf(tdg_file, "   }\n");
5513   for (kmp_int32 i = 0; i < num_tasks; i++) {
5514     kmp_int32 nsuccessors = tdg->record_map[i].nsuccessors;
5515     kmp_int32 *successors = tdg->record_map[i].successors;
5516     if (nsuccessors > 0) {
5517       for (kmp_int32 j = 0; j < nsuccessors; j++)
5518         fprintf(tdg_file, "   %d -> %d \n", i, successors[j]);
5519     }
5520   }
5521   fprintf(tdg_file, "}");
5522   KA_TRACE(10, ("__kmp_print_tdg_dot(exit): T#%d tdg_id=%d \n", gtid, tdg_id));
5523 }
5524 
5525 // __kmp_start_record: launch the execution of a previous
5526 // recorded TDG
5527 // gtid:   Global Thread ID
5528 // tdg:    ID of the TDG
5529 void __kmp_exec_tdg(kmp_int32 gtid, kmp_tdg_info_t *tdg) {
5530   KMP_DEBUG_ASSERT(tdg->tdg_status == KMP_TDG_READY);
5531   KA_TRACE(10, ("__kmp_exec_tdg(enter): T#%d tdg_id=%d num_roots=%d\n", gtid,
5532                 tdg->tdg_id, tdg->num_roots));
5533   kmp_node_info_t *this_record_map = tdg->record_map;
5534   kmp_int32 *this_root_tasks = tdg->root_tasks;
5535   kmp_int32 this_num_roots = tdg->num_roots;
5536   kmp_int32 this_num_tasks = KMP_ATOMIC_LD_RLX(&tdg->num_tasks);
5537 
5538   kmp_info_t *thread = __kmp_threads[gtid];
5539   kmp_taskdata_t *parent_task = thread->th.th_current_task;
5540 
5541   if (tdg->rec_taskred_data) {
5542     __kmpc_taskred_init(gtid, tdg->rec_num_taskred, tdg->rec_taskred_data);
5543   }
5544 
5545   for (kmp_int32 j = 0; j < this_num_tasks; j++) {
5546     kmp_taskdata_t *td = KMP_TASK_TO_TASKDATA(this_record_map[j].task);
5547 
5548     td->td_parent = parent_task;
5549     this_record_map[j].parent_task = parent_task;
5550 
5551     kmp_taskgroup_t *parent_taskgroup =
5552         this_record_map[j].parent_task->td_taskgroup;
5553 
5554     KMP_ATOMIC_ST_RLX(&this_record_map[j].npredecessors_counter,
5555                       this_record_map[j].npredecessors);
5556     KMP_ATOMIC_INC(&this_record_map[j].parent_task->td_incomplete_child_tasks);
5557 
5558     if (parent_taskgroup) {
5559       KMP_ATOMIC_INC(&parent_taskgroup->count);
5560       // The taskgroup is different so we must update it
5561       td->td_taskgroup = parent_taskgroup;
5562     } else if (td->td_taskgroup != nullptr) {
5563       // If the parent doesnt have a taskgroup, remove it from the task
5564       td->td_taskgroup = nullptr;
5565     }
5566     if (this_record_map[j].parent_task->td_flags.tasktype == TASK_EXPLICIT)
5567       KMP_ATOMIC_INC(&this_record_map[j].parent_task->td_allocated_child_tasks);
5568   }
5569 
5570   for (kmp_int32 j = 0; j < this_num_roots; ++j) {
5571     __kmp_omp_task(gtid, this_record_map[this_root_tasks[j]].task, true);
5572   }
5573   KA_TRACE(10, ("__kmp_exec_tdg(exit): T#%d tdg_id=%d num_roots=%d\n", gtid,
5574                 tdg->tdg_id, tdg->num_roots));
5575 }
5576 
5577 // __kmp_start_record: set up a TDG structure and turn the
5578 // recording flag to true
5579 // gtid:        Global Thread ID of the encountering thread
5580 // input_flags: Flags associated with the TDG
5581 // tdg_id:      ID of the TDG to record
5582 static inline void __kmp_start_record(kmp_int32 gtid,
5583                                       kmp_taskgraph_flags_t *flags,
5584                                       kmp_int32 tdg_id) {
5585   kmp_tdg_info_t *tdg =
5586       (kmp_tdg_info_t *)__kmp_allocate(sizeof(kmp_tdg_info_t));
5587   __kmp_global_tdgs[__kmp_curr_tdg_idx] = tdg;
5588   // Initializing the TDG structure
5589   tdg->tdg_id = tdg_id;
5590   tdg->map_size = INIT_MAPSIZE;
5591   tdg->num_roots = -1;
5592   tdg->root_tasks = nullptr;
5593   tdg->tdg_status = KMP_TDG_RECORDING;
5594   tdg->rec_num_taskred = 0;
5595   tdg->rec_taskred_data = nullptr;
5596   KMP_ATOMIC_ST_RLX(&tdg->num_tasks, 0);
5597 
5598   // Initializing the list of nodes in this TDG
5599   kmp_node_info_t *this_record_map =
5600       (kmp_node_info_t *)__kmp_allocate(INIT_MAPSIZE * sizeof(kmp_node_info_t));
5601   for (kmp_int32 i = 0; i < INIT_MAPSIZE; i++) {
5602     kmp_int32 *successorsList =
5603         (kmp_int32 *)__kmp_allocate(__kmp_successors_size * sizeof(kmp_int32));
5604     this_record_map[i].task = nullptr;
5605     this_record_map[i].successors = successorsList;
5606     this_record_map[i].nsuccessors = 0;
5607     this_record_map[i].npredecessors = 0;
5608     this_record_map[i].successors_size = __kmp_successors_size;
5609     KMP_ATOMIC_ST_RLX(&this_record_map[i].npredecessors_counter, 0);
5610   }
5611 
5612   __kmp_global_tdgs[__kmp_curr_tdg_idx]->record_map = this_record_map;
5613 }
5614 
5615 // __kmpc_start_record_task: Wrapper around __kmp_start_record to mark
5616 // the beginning of the record process of a task region
5617 // loc_ref:     Location of TDG, not used yet
5618 // gtid:        Global Thread ID of the encountering thread
5619 // input_flags: Flags associated with the TDG
5620 // tdg_id:      ID of the TDG to record, for now, incremental integer
5621 // returns:     1 if we record, otherwise, 0
5622 kmp_int32 __kmpc_start_record_task(ident_t *loc_ref, kmp_int32 gtid,
5623                                    kmp_int32 input_flags, kmp_int32 tdg_id) {
5624 
5625   kmp_int32 res;
5626   kmp_taskgraph_flags_t *flags = (kmp_taskgraph_flags_t *)&input_flags;
5627   KA_TRACE(10,
5628            ("__kmpc_start_record_task(enter): T#%d loc=%p flags=%d tdg_id=%d\n",
5629             gtid, loc_ref, input_flags, tdg_id));
5630 
5631   if (__kmp_max_tdgs == 0) {
5632     KA_TRACE(
5633         10,
5634         ("__kmpc_start_record_task(abandon): T#%d loc=%p flags=%d tdg_id = %d, "
5635          "__kmp_max_tdgs = 0\n",
5636          gtid, loc_ref, input_flags, tdg_id));
5637     return 1;
5638   }
5639 
5640   __kmpc_taskgroup(loc_ref, gtid);
5641   if (kmp_tdg_info_t *tdg = __kmp_find_tdg(tdg_id)) {
5642     // TODO: use re_record flag
5643     __kmp_exec_tdg(gtid, tdg);
5644     res = 0;
5645   } else {
5646     __kmp_curr_tdg_idx = tdg_id;
5647     KMP_DEBUG_ASSERT(__kmp_curr_tdg_idx < __kmp_max_tdgs);
5648     __kmp_start_record(gtid, flags, tdg_id);
5649     __kmp_num_tdg++;
5650     res = 1;
5651   }
5652   KA_TRACE(10, ("__kmpc_start_record_task(exit): T#%d TDG %d starts to %s\n",
5653                 gtid, tdg_id, res ? "record" : "execute"));
5654   return res;
5655 }
5656 
5657 // __kmp_end_record: set up a TDG after recording it
5658 // gtid:   Global thread ID
5659 // tdg:    Pointer to the TDG
5660 void __kmp_end_record(kmp_int32 gtid, kmp_tdg_info_t *tdg) {
5661   // Store roots
5662   kmp_node_info_t *this_record_map = tdg->record_map;
5663   kmp_int32 this_num_tasks = KMP_ATOMIC_LD_RLX(&tdg->num_tasks);
5664   kmp_int32 *this_root_tasks =
5665       (kmp_int32 *)__kmp_allocate(this_num_tasks * sizeof(kmp_int32));
5666   kmp_int32 this_map_size = tdg->map_size;
5667   kmp_int32 this_num_roots = 0;
5668   kmp_info_t *thread = __kmp_threads[gtid];
5669 
5670   for (kmp_int32 i = 0; i < this_num_tasks; i++) {
5671     if (this_record_map[i].npredecessors == 0) {
5672       this_root_tasks[this_num_roots++] = i;
5673     }
5674   }
5675 
5676   // Update with roots info and mapsize
5677   tdg->map_size = this_map_size;
5678   tdg->num_roots = this_num_roots;
5679   tdg->root_tasks = this_root_tasks;
5680   KMP_DEBUG_ASSERT(tdg->tdg_status == KMP_TDG_RECORDING);
5681   tdg->tdg_status = KMP_TDG_READY;
5682 
5683   if (thread->th.th_current_task->td_dephash) {
5684     __kmp_dephash_free(thread, thread->th.th_current_task->td_dephash);
5685     thread->th.th_current_task->td_dephash = NULL;
5686   }
5687 
5688   // Reset predecessor counter
5689   for (kmp_int32 i = 0; i < this_num_tasks; i++) {
5690     KMP_ATOMIC_ST_RLX(&this_record_map[i].npredecessors_counter,
5691                       this_record_map[i].npredecessors);
5692   }
5693   KMP_ATOMIC_ST_RLX(&__kmp_tdg_task_id, 0);
5694 
5695   if (__kmp_tdg_dot)
5696     __kmp_print_tdg_dot(tdg);
5697 }
5698 
5699 // __kmpc_end_record_task: wrapper around __kmp_end_record to mark
5700 // the end of recording phase
5701 //
5702 // loc_ref:      Source location information
5703 // gtid:         Global thread ID
5704 // input_flags:  Flags attached to the graph
5705 // tdg_id:       ID of the TDG just finished recording
5706 void __kmpc_end_record_task(ident_t *loc_ref, kmp_int32 gtid,
5707                             kmp_int32 input_flags, kmp_int32 tdg_id) {
5708   kmp_tdg_info_t *tdg = __kmp_find_tdg(tdg_id);
5709 
5710   KA_TRACE(10, ("__kmpc_end_record_task(enter): T#%d loc=%p finishes recording"
5711                 " tdg=%d with flags=%d\n",
5712                 gtid, loc_ref, tdg_id, input_flags));
5713   if (__kmp_max_tdgs) {
5714     // TODO: use input_flags->nowait
5715     __kmpc_end_taskgroup(loc_ref, gtid);
5716     if (__kmp_tdg_is_recording(tdg->tdg_status))
5717       __kmp_end_record(gtid, tdg);
5718   }
5719   KA_TRACE(10, ("__kmpc_end_record_task(exit): T#%d loc=%p finished recording"
5720                 " tdg=%d, its status is now READY\n",
5721                 gtid, loc_ref, tdg_id));
5722 }
5723 #endif
5724