xref: /freebsd/contrib/llvm-project/openmp/runtime/src/kmp_tasking.cpp (revision 700637cbb5e582861067a11aaca4d053546871d2)
1 /*
2  * kmp_tasking.cpp -- OpenMP 3.0 tasking support.
3  */
4 
5 //===----------------------------------------------------------------------===//
6 //
7 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8 // See https://llvm.org/LICENSE.txt for license information.
9 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "kmp.h"
14 #include "kmp_i18n.h"
15 #include "kmp_itt.h"
16 #include "kmp_stats.h"
17 #include "kmp_wait_release.h"
18 #include "kmp_taskdeps.h"
19 
20 #if OMPT_SUPPORT
21 #include "ompt-specific.h"
22 #endif
23 
24 #if ENABLE_LIBOMPTARGET
25 static void (*tgt_target_nowait_query)(void **);
26 
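// Resolve __tgt_target_nowait_query lazily through the dynamic loader so that
// libomp does not need a link-time dependency on the offload runtime; the
// function pointer simply stays NULL when libomptarget is not loaded.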
27 void __kmp_init_target_task() {
28   *(void **)(&tgt_target_nowait_query) = KMP_DLSYM("__tgt_target_nowait_query");
29 }
30 #endif
31 
32 /* forward declaration */
33 static void __kmp_enable_tasking(kmp_task_team_t *task_team,
34                                  kmp_info_t *this_thr);
35 static void __kmp_alloc_task_deque(kmp_info_t *thread,
36                                    kmp_thread_data_t *thread_data);
37 static int __kmp_realloc_task_threads_data(kmp_info_t *thread,
38                                            kmp_task_team_t *task_team);
39 static void __kmp_bottom_half_finish_proxy(kmp_int32 gtid, kmp_task_t *ptask);
40 #if OMPX_TASKGRAPH
41 static kmp_tdg_info_t *__kmp_find_tdg(kmp_int32 tdg_id);
42 int __kmp_taskloop_task(int gtid, void *ptask);
43 #endif
44 
45 #ifdef BUILD_TIED_TASK_STACK
46 
47 //  __kmp_trace_task_stack: print the tied tasks from the task stack in order
48 //  from top to bottom
49 //
50 //  gtid: global thread identifier for thread containing stack
51 //  thread_data: thread data for task team thread containing stack
52 //  threshold: value above which the trace statement triggers
53 //  location: string identifying call site of this function (for trace)
54 static void __kmp_trace_task_stack(kmp_int32 gtid,
55                                    kmp_thread_data_t *thread_data,
56                                    int threshold, char *location) {
57   kmp_task_stack_t *task_stack = &thread_data->td.td_susp_tied_tasks;
58   kmp_taskdata_t **stack_top = task_stack->ts_top;
59   kmp_int32 entries = task_stack->ts_entries;
60   kmp_taskdata_t *tied_task;
61 
62   KA_TRACE(
63       threshold,
64       ("__kmp_trace_task_stack(start): location = %s, gtid = %d, entries = %d, "
65        "first_block = %p, stack_top = %p \n",
66        location, gtid, entries, task_stack->ts_first_block, stack_top));
67 
68   KMP_DEBUG_ASSERT(stack_top != NULL);
69   KMP_DEBUG_ASSERT(entries > 0);
70 
71   while (entries != 0) {
72     KMP_DEBUG_ASSERT(stack_top != &task_stack->ts_first_block.sb_block[0]);
73     // fix up ts_top if we need to pop from previous block
74     if ((entries & TASK_STACK_INDEX_MASK) == 0) {
75       kmp_stack_block_t *stack_block = (kmp_stack_block_t *)(stack_top);
76 
77       stack_block = stack_block->sb_prev;
78       stack_top = &stack_block->sb_block[TASK_STACK_BLOCK_SIZE];
79     }
80 
81     // finish bookkeeping
82     stack_top--;
83     entries--;
84 
85     tied_task = *stack_top;
86 
87     KMP_DEBUG_ASSERT(tied_task != NULL);
88     KMP_DEBUG_ASSERT(tied_task->td_flags.tasktype == TASK_TIED);
89 
90     KA_TRACE(threshold,
91              ("__kmp_trace_task_stack(%s):             gtid=%d, entry=%d, "
92               "stack_top=%p, tied_task=%p\n",
93               location, gtid, entries, stack_top, tied_task));
94   }
95   KMP_DEBUG_ASSERT(stack_top == &task_stack->ts_first_block.sb_block[0]);
96 
97   KA_TRACE(threshold,
98            ("__kmp_trace_task_stack(exit): location = %s, gtid = %d\n",
99             location, gtid));
100 }
101 
102 //  __kmp_init_task_stack: initialize the task stack for the first time
103 //  after a thread_data structure is created.
104 //  It should not be necessary to do this again (assuming the stack works).
105 //
106 //  gtid: global thread identifier of calling thread
107 //  thread_data: thread data for task team thread containing stack
108 static void __kmp_init_task_stack(kmp_int32 gtid,
109                                   kmp_thread_data_t *thread_data) {
110   kmp_task_stack_t *task_stack = &thread_data->td.td_susp_tied_tasks;
111   kmp_stack_block_t *first_block;
112 
113   // set up the first block of the stack
114   first_block = &task_stack->ts_first_block;
115   task_stack->ts_top = (kmp_taskdata_t **)first_block;
116   memset((void *)first_block, '\0',
117          TASK_STACK_BLOCK_SIZE * sizeof(kmp_taskdata_t *));
118 
119   // initialize the stack to be empty
120   task_stack->ts_entries = TASK_STACK_EMPTY;
121   first_block->sb_next = NULL;
122   first_block->sb_prev = NULL;
123 }
124 
125 //  __kmp_free_task_stack: free the task stack when thread_data is destroyed.
126 //
127 //  gtid: global thread identifier for calling thread
128 //  thread_data: thread info for thread containing stack
129 static void __kmp_free_task_stack(kmp_int32 gtid,
130                                   kmp_thread_data_t *thread_data) {
131   kmp_task_stack_t *task_stack = &thread_data->td.td_susp_tied_tasks;
132   kmp_stack_block_t *stack_block = &task_stack->ts_first_block;
133 
134   KMP_DEBUG_ASSERT(task_stack->ts_entries == TASK_STACK_EMPTY);
135   // walk the block list; free every block except the first, which is embedded
136   while (stack_block != NULL) {
137     kmp_stack_block_t *next_block = (stack_block) ? stack_block->sb_next : NULL;
138 
139     stack_block->sb_next = NULL;
140     stack_block->sb_prev = NULL;
141     if (stack_block != &task_stack->ts_first_block) {
142       __kmp_thread_free(__kmp_threads[gtid],
143                         stack_block); // free the block, if not the first
144     }
145     stack_block = next_block;
146   }
147   // initialize the stack to be empty
148   task_stack->ts_entries = 0;
149   task_stack->ts_top = NULL;
150 }
151 
152 //  __kmp_push_task_stack: Push the tied task onto the task stack.
153 //     Grow the stack if necessary by allocating another block.
154 //
155 //  gtid: global thread identifier for calling thread
156 //  thread: thread info for thread containing stack
157 //  tied_task: the task to push on the stack
158 static void __kmp_push_task_stack(kmp_int32 gtid, kmp_info_t *thread,
159                                   kmp_taskdata_t *tied_task) {
160   // GEH - need to consider what to do if tt_threads_data not allocated yet
161   kmp_thread_data_t *thread_data =
162       &thread->th.th_task_team->tt.tt_threads_data[__kmp_tid_from_gtid(gtid)];
163   kmp_task_stack_t *task_stack = &thread_data->td.td_susp_tied_tasks;
164 
165   if (tied_task->td_flags.team_serial || tied_task->td_flags.tasking_ser) {
166     return; // Don't push anything on stack if team or team tasks are serialized
167   }
168 
169   KMP_DEBUG_ASSERT(tied_task->td_flags.tasktype == TASK_TIED);
170   KMP_DEBUG_ASSERT(task_stack->ts_top != NULL);
171 
172   KA_TRACE(20,
173            ("__kmp_push_task_stack(enter): GTID: %d; THREAD: %p; TASK: %p\n",
174             gtid, thread, tied_task));
175   // Store entry
176   *(task_stack->ts_top) = tied_task;
177 
178   // Do bookkeeping for next push
179   task_stack->ts_top++;
180   task_stack->ts_entries++;
181 
182   if ((task_stack->ts_entries & TASK_STACK_INDEX_MASK) == 0) {
183     // Find beginning of this task block
184     kmp_stack_block_t *stack_block =
185         (kmp_stack_block_t *)(task_stack->ts_top - TASK_STACK_BLOCK_SIZE);
186 
187     // Check if we already have a block
188     if (stack_block->sb_next !=
189         NULL) { // reset ts_top to beginning of next block
190       task_stack->ts_top = &stack_block->sb_next->sb_block[0];
191     } else { // Alloc new block and link it up
192       kmp_stack_block_t *new_block = (kmp_stack_block_t *)__kmp_thread_calloc(
193           thread, sizeof(kmp_stack_block_t));
194 
195       task_stack->ts_top = &new_block->sb_block[0];
196       stack_block->sb_next = new_block;
197       new_block->sb_prev = stack_block;
198       new_block->sb_next = NULL;
199 
200       KA_TRACE(
201           30,
202           ("__kmp_push_task_stack(): GTID: %d; TASK: %p; Alloc new block: %p\n",
203            gtid, tied_task, new_block));
204     }
205   }
206   KA_TRACE(20, ("__kmp_push_task_stack(exit): GTID: %d; TASK: %p\n", gtid,
207                 tied_task));
208 }
209 
210 //  __kmp_pop_task_stack: Pop the tied task from the task stack.  Don't return
211 //  the task, just check to make sure it matches the ending task passed in.
212 //
213 //  gtid: global thread identifier for the calling thread
214 //  thread: thread info structure containing stack
215 //  tied_task: the task popped off the stack
216 //  ending_task: the task that is ending (should match popped task)
217 static void __kmp_pop_task_stack(kmp_int32 gtid, kmp_info_t *thread,
218                                  kmp_taskdata_t *ending_task) {
219   // GEH - need to consider what to do if tt_threads_data not allocated yet
220   kmp_thread_data_t *thread_data =
221       &thread->th.th_task_team->tt.tt_threads_data[__kmp_tid_from_gtid(gtid)];
222   kmp_task_stack_t *task_stack = &thread_data->td.td_susp_tied_tasks;
223   kmp_taskdata_t *tied_task;
224 
225   if (ending_task->td_flags.team_serial || ending_task->td_flags.tasking_ser) {
226     // Don't pop anything from stack if team or team tasks are serialized
227     return;
228   }
229 
230   KMP_DEBUG_ASSERT(task_stack->ts_top != NULL);
231   KMP_DEBUG_ASSERT(task_stack->ts_entries > 0);
232 
233   KA_TRACE(20, ("__kmp_pop_task_stack(enter): GTID: %d; THREAD: %p\n", gtid,
234                 thread));
235 
236   // fix up ts_top if we need to pop from previous block
237   if ((task_stack->ts_entries & TASK_STACK_INDEX_MASK) == 0) {
238     kmp_stack_block_t *stack_block = (kmp_stack_block_t *)(task_stack->ts_top);
239 
240     stack_block = stack_block->sb_prev;
241     task_stack->ts_top = &stack_block->sb_block[TASK_STACK_BLOCK_SIZE];
242   }
243 
244   // finish bookkeeping
245   task_stack->ts_top--;
246   task_stack->ts_entries--;
247 
248   tied_task = *(task_stack->ts_top);
249 
250   KMP_DEBUG_ASSERT(tied_task != NULL);
251   KMP_DEBUG_ASSERT(tied_task->td_flags.tasktype == TASK_TIED);
252   KMP_DEBUG_ASSERT(tied_task == ending_task); // If we built the stack correctly
253 
254   KA_TRACE(20, ("__kmp_pop_task_stack(exit): GTID: %d; TASK: %p\n", gtid,
255                 tied_task));
256   return;
257 }
258 #endif /* BUILD_TIED_TASK_STACK */
259 
260 // returns 1 if new task is allowed to execute, 0 otherwise
261 // checks Task Scheduling constraint (if requested) and
262 // mutexinoutset dependencies if any
263 static bool __kmp_task_is_allowed(int gtid, const kmp_int32 is_constrained,
264                                   const kmp_taskdata_t *tasknew,
265                                   const kmp_taskdata_t *taskcurr) {
266   if (is_constrained && (tasknew->td_flags.tiedness == TASK_TIED)) {
267     // Check if the candidate obeys the Task Scheduling Constraints (TSC)
268     // only descendant of all deferred tied tasks can be scheduled, checking
269     // the last one is enough, as it in turn is the descendant of all others
270     kmp_taskdata_t *current = taskcurr->td_last_tied;
271     KMP_DEBUG_ASSERT(current != NULL);
272     // check if the task is not suspended on barrier
273     if (current->td_flags.tasktype == TASK_EXPLICIT ||
274         current->td_taskwait_thread > 0) { // <= 0 on barrier
275       kmp_int32 level = current->td_level;
276       kmp_taskdata_t *parent = tasknew->td_parent;
277       while (parent != current && parent->td_level > level) {
278         // check generation up to the level of the current task
279         parent = parent->td_parent;
280         KMP_DEBUG_ASSERT(parent != NULL);
281       }
282       if (parent != current)
283         return false;
284     }
285   }
286   // Check mutexinoutset dependencies, acquire locks
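  // Locks are tried in order; if any one of them cannot be taken, every lock
  // acquired so far is released and the task is not allowed to run yet
  // (all-or-nothing acquisition).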
287   kmp_depnode_t *node = tasknew->td_depnode;
288 #if OMPX_TASKGRAPH
289   if (!tasknew->is_taskgraph && UNLIKELY(node && (node->dn.mtx_num_locks > 0))) {
290 #else
291   if (UNLIKELY(node && (node->dn.mtx_num_locks > 0))) {
292 #endif
293     for (int i = 0; i < node->dn.mtx_num_locks; ++i) {
294       KMP_DEBUG_ASSERT(node->dn.mtx_locks[i] != NULL);
295       if (__kmp_test_lock(node->dn.mtx_locks[i], gtid))
296         continue;
297       // could not get the lock, release previous locks
298       for (int j = i - 1; j >= 0; --j)
299         __kmp_release_lock(node->dn.mtx_locks[j], gtid);
300       return false;
301     }
302     // negative num_locks means all locks acquired successfully
303     node->dn.mtx_num_locks = -node->dn.mtx_num_locks;
304   }
305   return true;
306 }
307 
308 // __kmp_realloc_task_deque:
309 // Re-allocates a task deque for a particular thread, copies the content from
310 // the old deque and adjusts the necessary data structures relating to the
311 // deque. This operation must be done with the deque_lock being held
312 static void __kmp_realloc_task_deque(kmp_info_t *thread,
313                                      kmp_thread_data_t *thread_data) {
314   kmp_int32 size = TASK_DEQUE_SIZE(thread_data->td);
315   KMP_DEBUG_ASSERT(TCR_4(thread_data->td.td_deque_ntasks) == size);
316   kmp_int32 new_size = 2 * size;
317 
318   KE_TRACE(10, ("__kmp_realloc_task_deque: T#%d reallocating deque[from %d to "
319                 "%d] for thread_data %p\n",
320                 __kmp_gtid_from_thread(thread), size, new_size, thread_data));
321 
322   kmp_taskdata_t **new_deque =
323       (kmp_taskdata_t **)__kmp_allocate(new_size * sizeof(kmp_taskdata_t *));
324 
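  // Copy the tasks starting from the old head so the circular buffer is
  // unwrapped into the new storage: afterwards head is 0 and tail equals the
  // old size.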
325   int i, j;
326   for (i = thread_data->td.td_deque_head, j = 0; j < size;
327        i = (i + 1) & TASK_DEQUE_MASK(thread_data->td), j++)
328     new_deque[j] = thread_data->td.td_deque[i];
329 
330   __kmp_free(thread_data->td.td_deque);
331 
332   thread_data->td.td_deque_head = 0;
333   thread_data->td.td_deque_tail = size;
334   thread_data->td.td_deque = new_deque;
335   thread_data->td.td_deque_size = new_size;
336 }
337 
338 static kmp_task_pri_t *__kmp_alloc_task_pri_list() {
339   kmp_task_pri_t *l = (kmp_task_pri_t *)__kmp_allocate(sizeof(kmp_task_pri_t));
340   kmp_thread_data_t *thread_data = &l->td;
341   __kmp_init_bootstrap_lock(&thread_data->td.td_deque_lock);
342   thread_data->td.td_deque_last_stolen = -1;
343   KE_TRACE(20, ("__kmp_alloc_task_pri_list: T#%d allocating deque[%d] "
344                 "for thread_data %p\n",
345                 __kmp_get_gtid(), INITIAL_TASK_DEQUE_SIZE, thread_data));
346   thread_data->td.td_deque = (kmp_taskdata_t **)__kmp_allocate(
347       INITIAL_TASK_DEQUE_SIZE * sizeof(kmp_taskdata_t *));
348   thread_data->td.td_deque_size = INITIAL_TASK_DEQUE_SIZE;
349   return l;
350 }
351 
352 // The function finds the deque of priority tasks with given priority, or
353 // allocates a new deque and puts it into a sorted (high -> low) list of deques.
354 // Deques of non-default priority tasks are shared between all threads in team,
355 // as opposed to per-thread deques of tasks with default priority.
356 // The function is called under the lock task_team->tt.tt_task_pri_lock.
357 static kmp_thread_data_t *
358 __kmp_get_priority_deque_data(kmp_task_team_t *task_team, kmp_int32 pri) {
359   kmp_thread_data_t *thread_data;
360   kmp_task_pri_t *lst = task_team->tt.tt_task_pri_list;
361   if (lst->priority == pri) {
362     // Found queue of tasks with given priority.
363     thread_data = &lst->td;
364   } else if (lst->priority < pri) {
365     // All current priority queues contain tasks with lower priority.
366     // Allocate new one for given priority tasks.
367     kmp_task_pri_t *list = __kmp_alloc_task_pri_list();
368     thread_data = &list->td;
369     list->priority = pri;
370     list->next = lst;
371     task_team->tt.tt_task_pri_list = list;
372   } else { // task_team->tt.tt_task_pri_list->priority > pri
373     kmp_task_pri_t *next_queue = lst->next;
374     while (next_queue && next_queue->priority > pri) {
375       lst = next_queue;
376       next_queue = lst->next;
377     }
378     // lst->priority > pri && (next == NULL || pri >= next->priority)
379     if (next_queue == NULL) {
380       // No queue with pri priority, need to allocate new one.
381       kmp_task_pri_t *list = __kmp_alloc_task_pri_list();
382       thread_data = &list->td;
383       list->priority = pri;
384       list->next = NULL;
385       lst->next = list;
386     } else if (next_queue->priority == pri) {
387       // Found queue of tasks with given priority.
388       thread_data = &next_queue->td;
389     } else { // lst->priority > pri > next->priority
390       // insert newly allocated between existed queues
391       kmp_task_pri_t *list = __kmp_alloc_task_pri_list();
392       thread_data = &list->td;
393       list->priority = pri;
394       list->next = next_queue;
395       lst->next = list;
396     }
397   }
398   return thread_data;
399 }
400 
401 //  __kmp_push_priority_task: Add a task to the team's priority task deque
402 static kmp_int32 __kmp_push_priority_task(kmp_int32 gtid, kmp_info_t *thread,
403                                           kmp_taskdata_t *taskdata,
404                                           kmp_task_team_t *task_team,
405                                           kmp_int32 pri) {
406   kmp_thread_data_t *thread_data = NULL;
407   KA_TRACE(20,
408            ("__kmp_push_priority_task: T#%d trying to push task %p, pri %d.\n",
409             gtid, taskdata, pri));
410 
411   // Find task queue specific to priority value
412   kmp_task_pri_t *lst = task_team->tt.tt_task_pri_list;
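  // The list head is read without the lock; allocating or searching the
  // per-priority deque is done under tt_task_pri_lock, with a re-check in case
  // another thread created the list first.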
413   if (UNLIKELY(lst == NULL)) {
414     __kmp_acquire_bootstrap_lock(&task_team->tt.tt_task_pri_lock);
415     if (task_team->tt.tt_task_pri_list == NULL) {
416       // List of queues is still empty, allocate one.
417       kmp_task_pri_t *list = __kmp_alloc_task_pri_list();
418       thread_data = &list->td;
419       list->priority = pri;
420       list->next = NULL;
421       task_team->tt.tt_task_pri_list = list;
422     } else {
423       // Another thread initialized a queue. Check if it fits and get thread_data.
424       thread_data = __kmp_get_priority_deque_data(task_team, pri);
425     }
426     __kmp_release_bootstrap_lock(&task_team->tt.tt_task_pri_lock);
427   } else {
428     if (lst->priority == pri) {
429       // Found queue of tasks with given priority.
430       thread_data = &lst->td;
431     } else {
432       __kmp_acquire_bootstrap_lock(&task_team->tt.tt_task_pri_lock);
433       thread_data = __kmp_get_priority_deque_data(task_team, pri);
434       __kmp_release_bootstrap_lock(&task_team->tt.tt_task_pri_lock);
435     }
436   }
437   KMP_DEBUG_ASSERT(thread_data);
438 
439   __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);
440   // Check if deque is full
441   if (TCR_4(thread_data->td.td_deque_ntasks) >=
442       TASK_DEQUE_SIZE(thread_data->td)) {
443     if (__kmp_enable_task_throttling &&
444         __kmp_task_is_allowed(gtid, __kmp_task_stealing_constraint, taskdata,
445                               thread->th.th_current_task)) {
446       __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
447       KA_TRACE(20, ("__kmp_push_priority_task: T#%d deque is full; returning "
448                     "TASK_NOT_PUSHED for task %p\n",
449                     gtid, taskdata));
450       return TASK_NOT_PUSHED;
451     } else {
452       // expand deque to push the task which is not allowed to execute
453       __kmp_realloc_task_deque(thread, thread_data);
454     }
455   }
456   KMP_DEBUG_ASSERT(TCR_4(thread_data->td.td_deque_ntasks) <
457                    TASK_DEQUE_SIZE(thread_data->td));
458   // Push taskdata.
459   thread_data->td.td_deque[thread_data->td.td_deque_tail] = taskdata;
460   // Wrap index.
461   thread_data->td.td_deque_tail =
462       (thread_data->td.td_deque_tail + 1) & TASK_DEQUE_MASK(thread_data->td);
463   TCW_4(thread_data->td.td_deque_ntasks,
464         TCR_4(thread_data->td.td_deque_ntasks) + 1); // Adjust task count
465   KMP_FSYNC_RELEASING(thread->th.th_current_task); // releasing self
466   KMP_FSYNC_RELEASING(taskdata); // releasing child
467   KA_TRACE(20, ("__kmp_push_priority_task: T#%d returning "
468                 "TASK_SUCCESSFULLY_PUSHED: task=%p ntasks=%d head=%u tail=%u\n",
469                 gtid, taskdata, thread_data->td.td_deque_ntasks,
470                 thread_data->td.td_deque_head, thread_data->td.td_deque_tail));
471   __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
472   task_team->tt.tt_num_task_pri++; // atomic inc
473   return TASK_SUCCESSFULLY_PUSHED;
474 }
475 
476 //  __kmp_push_task: Add a task to the thread's deque
477 static kmp_int32 __kmp_push_task(kmp_int32 gtid, kmp_task_t *task) {
478   kmp_info_t *thread = __kmp_threads[gtid];
479   kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
480 
481   // If we encounter a hidden helper task, and the current thread is not a
482   // hidden helper thread, we have to give the task to any hidden helper thread
483   // starting from its shadow one.
484   if (UNLIKELY(taskdata->td_flags.hidden_helper &&
485                !KMP_HIDDEN_HELPER_THREAD(gtid))) {
486     kmp_int32 shadow_gtid = KMP_GTID_TO_SHADOW_GTID(gtid);
487     __kmpc_give_task(task, __kmp_tid_from_gtid(shadow_gtid));
488     // Signal the hidden helper threads.
489     __kmp_hidden_helper_worker_thread_signal();
490     return TASK_SUCCESSFULLY_PUSHED;
491   }
492 
493   kmp_task_team_t *task_team = thread->th.th_task_team;
494   kmp_int32 tid = __kmp_tid_from_gtid(gtid);
495   kmp_thread_data_t *thread_data;
496 
497   KA_TRACE(20,
498            ("__kmp_push_task: T#%d trying to push task %p.\n", gtid, taskdata));
499 
500   if (UNLIKELY(taskdata->td_flags.tiedness == TASK_UNTIED)) {
501     // untied task needs to increment counter so that the task structure is not
502     // freed prematurely
503     kmp_int32 counter = 1 + KMP_ATOMIC_INC(&taskdata->td_untied_count);
504     KMP_DEBUG_USE_VAR(counter);
505     KA_TRACE(
506         20,
507         ("__kmp_push_task: T#%d untied_count (%d) incremented for task %p\n",
508          gtid, counter, taskdata));
509   }
510 
511   // The first check avoids building task_team thread data if serialized
512   if (UNLIKELY(taskdata->td_flags.task_serial)) {
513     KA_TRACE(20, ("__kmp_push_task: T#%d team serialized; returning "
514                   "TASK_NOT_PUSHED for task %p\n",
515                   gtid, taskdata));
516     return TASK_NOT_PUSHED;
517   }
518 
519   // Now that serialized tasks have returned, we can assume that we are not in
520   // immediate exec mode
521   KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);
522   if (UNLIKELY(!KMP_TASKING_ENABLED(task_team))) {
523     __kmp_enable_tasking(task_team, thread);
524   }
525   KMP_DEBUG_ASSERT(TCR_4(task_team->tt.tt_found_tasks) == TRUE);
526   KMP_DEBUG_ASSERT(TCR_PTR(task_team->tt.tt_threads_data) != NULL);
527 
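  // Tasks with an explicitly specified positive priority are routed to the
  // team-wide per-priority deques (the value is capped at the configured
  // maximum task priority); all other tasks use the encountering thread's own
  // deque below.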
528   if (taskdata->td_flags.priority_specified && task->data2.priority > 0 &&
529       __kmp_max_task_priority > 0) {
530     int pri = KMP_MIN(task->data2.priority, __kmp_max_task_priority);
531     return __kmp_push_priority_task(gtid, thread, taskdata, task_team, pri);
532   }
533 
534   // Find tasking deque specific to encountering thread
535   thread_data = &task_team->tt.tt_threads_data[tid];
536 
537   // No lock needed since only owner can allocate. If the task is hidden_helper,
538   // we don't need it either because we have initialized the deque for hidden
539   // helper thread data.
540   if (UNLIKELY(thread_data->td.td_deque == NULL)) {
541     __kmp_alloc_task_deque(thread, thread_data);
542   }
543 
544   int locked = 0;
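  // Fast path: check for a full deque without holding td_deque_lock. If it
  // looks full, either throttle (return TASK_NOT_PUSHED) or take the lock,
  // re-check and grow the deque; the count can change concurrently because
  // proxy tasks may be pushed from outside the team.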
545   // Check if deque is full
546   if (TCR_4(thread_data->td.td_deque_ntasks) >=
547       TASK_DEQUE_SIZE(thread_data->td)) {
548     if (__kmp_enable_task_throttling &&
549         __kmp_task_is_allowed(gtid, __kmp_task_stealing_constraint, taskdata,
550                               thread->th.th_current_task)) {
551       KA_TRACE(20, ("__kmp_push_task: T#%d deque is full; returning "
552                     "TASK_NOT_PUSHED for task %p\n",
553                     gtid, taskdata));
554       return TASK_NOT_PUSHED;
555     } else {
556       __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);
557       locked = 1;
558       if (TCR_4(thread_data->td.td_deque_ntasks) >=
559           TASK_DEQUE_SIZE(thread_data->td)) {
560         // expand deque to push the task which is not allowed to execute
561         __kmp_realloc_task_deque(thread, thread_data);
562       }
563     }
564   }
565   // Lock the deque for the task push operation
566   if (!locked) {
567     __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);
568     // Need to recheck as we can get a proxy task from a thread outside of OpenMP
569     if (TCR_4(thread_data->td.td_deque_ntasks) >=
570         TASK_DEQUE_SIZE(thread_data->td)) {
571       if (__kmp_enable_task_throttling &&
572           __kmp_task_is_allowed(gtid, __kmp_task_stealing_constraint, taskdata,
573                                 thread->th.th_current_task)) {
574         __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
575         KA_TRACE(20, ("__kmp_push_task: T#%d deque is full on 2nd check; "
576                       "returning TASK_NOT_PUSHED for task %p\n",
577                       gtid, taskdata));
578         return TASK_NOT_PUSHED;
579       } else {
580         // expand deque to push the task which is not allowed to execute
581         __kmp_realloc_task_deque(thread, thread_data);
582       }
583     }
584   }
585   // Must have room since no thread but the calling thread can add tasks
586   KMP_DEBUG_ASSERT(TCR_4(thread_data->td.td_deque_ntasks) <
587                    TASK_DEQUE_SIZE(thread_data->td));
588 
589   thread_data->td.td_deque[thread_data->td.td_deque_tail] =
590       taskdata; // Push taskdata
591   // Wrap index.
592   thread_data->td.td_deque_tail =
593       (thread_data->td.td_deque_tail + 1) & TASK_DEQUE_MASK(thread_data->td);
594   TCW_4(thread_data->td.td_deque_ntasks,
595         TCR_4(thread_data->td.td_deque_ntasks) + 1); // Adjust task count
596   KMP_FSYNC_RELEASING(thread->th.th_current_task); // releasing self
597   KMP_FSYNC_RELEASING(taskdata); // releasing child
598   KA_TRACE(20, ("__kmp_push_task: T#%d returning TASK_SUCCESSFULLY_PUSHED: "
599                 "task=%p ntasks=%d head=%u tail=%u\n",
600                 gtid, taskdata, thread_data->td.td_deque_ntasks,
601                 thread_data->td.td_deque_head, thread_data->td.td_deque_tail));
602 
603   __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
604 
605   return TASK_SUCCESSFULLY_PUSHED;
606 }
607 
608 // __kmp_pop_current_task_from_thread: set up current task from called thread
609 // when team ends
610 //
611 // this_thr: thread structure to set current_task in.
612 void __kmp_pop_current_task_from_thread(kmp_info_t *this_thr) {
613   KF_TRACE(10, ("__kmp_pop_current_task_from_thread(enter): T#%d "
614                 "this_thread=%p, curtask=%p, "
615                 "curtask_parent=%p\n",
616                 0, this_thr, this_thr->th.th_current_task,
617                 this_thr->th.th_current_task->td_parent));
618 
619   this_thr->th.th_current_task = this_thr->th.th_current_task->td_parent;
620 
621   KF_TRACE(10, ("__kmp_pop_current_task_from_thread(exit): T#%d "
622                 "this_thread=%p, curtask=%p, "
623                 "curtask_parent=%p\n",
624                 0, this_thr, this_thr->th.th_current_task,
625                 this_thr->th.th_current_task->td_parent));
626 }
627 
628 // __kmp_push_current_task_to_thread: set up current task in called thread for a
629 // new team
630 //
631 // this_thr: thread structure to set up
632 // team: team for implicit task data
633 // tid: thread within team to set up
634 void __kmp_push_current_task_to_thread(kmp_info_t *this_thr, kmp_team_t *team,
635                                        int tid) {
636   // the current task of the thread is the parent of the newly created
637   // implicit tasks of the new team
638   KF_TRACE(10, ("__kmp_push_current_task_to_thread(enter): T#%d this_thread=%p "
639                 "curtask=%p "
640                 "parent_task=%p\n",
641                 tid, this_thr, this_thr->th.th_current_task,
642                 team->t.t_implicit_task_taskdata[tid].td_parent));
643 
644   KMP_DEBUG_ASSERT(this_thr != NULL);
645 
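  // The primary thread (tid 0) links the team's implicit task 0 under its own
  // current task; all other threads take the parent recorded in implicit task
  // 0, so every implicit task of the team shares the same parent.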
646   if (tid == 0) {
647     if (this_thr->th.th_current_task != &team->t.t_implicit_task_taskdata[0]) {
648       team->t.t_implicit_task_taskdata[0].td_parent =
649           this_thr->th.th_current_task;
650       this_thr->th.th_current_task = &team->t.t_implicit_task_taskdata[0];
651     }
652   } else {
653     team->t.t_implicit_task_taskdata[tid].td_parent =
654         team->t.t_implicit_task_taskdata[0].td_parent;
655     this_thr->th.th_current_task = &team->t.t_implicit_task_taskdata[tid];
656   }
657 
658   KF_TRACE(10, ("__kmp_push_current_task_to_thread(exit): T#%d this_thread=%p "
659                 "curtask=%p "
660                 "parent_task=%p\n",
661                 tid, this_thr, this_thr->th.th_current_task,
662                 team->t.t_implicit_task_taskdata[tid].td_parent));
663 }
664 
665 // __kmp_task_start: bookkeeping for a task starting execution
666 //
667 // GTID: global thread id of calling thread
668 // task: task starting execution
669 // current_task: task suspending
670 static void __kmp_task_start(kmp_int32 gtid, kmp_task_t *task,
671                              kmp_taskdata_t *current_task) {
672   kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
673   kmp_info_t *thread = __kmp_threads[gtid];
674 
675   KA_TRACE(10,
676            ("__kmp_task_start(enter): T#%d starting task %p: current_task=%p\n",
677             gtid, taskdata, current_task));
678 
679   KMP_DEBUG_ASSERT(taskdata->td_flags.tasktype == TASK_EXPLICIT);
680 
681   // mark currently executing task as suspended
682   // TODO: GEH - make sure root team implicit task is initialized properly.
683   // KMP_DEBUG_ASSERT( current_task -> td_flags.executing == 1 );
684   current_task->td_flags.executing = 0;
685 
686 // Add task to stack if tied
687 #ifdef BUILD_TIED_TASK_STACK
688   if (taskdata->td_flags.tiedness == TASK_TIED) {
689     __kmp_push_task_stack(gtid, thread, taskdata);
690   }
691 #endif /* BUILD_TIED_TASK_STACK */
692 
693   // mark starting task as executing and as current task
694   thread->th.th_current_task = taskdata;
695 
696   KMP_DEBUG_ASSERT(taskdata->td_flags.started == 0 ||
697                    taskdata->td_flags.tiedness == TASK_UNTIED);
698   KMP_DEBUG_ASSERT(taskdata->td_flags.executing == 0 ||
699                    taskdata->td_flags.tiedness == TASK_UNTIED);
700   taskdata->td_flags.started = 1;
701   taskdata->td_flags.executing = 1;
702   KMP_DEBUG_ASSERT(taskdata->td_flags.complete == 0);
703   KMP_DEBUG_ASSERT(taskdata->td_flags.freed == 0);
704 
705   // GEH TODO: shouldn't we pass some sort of location identifier here?
706   // APT: yes, we will pass location here.
707   // need to store current thread state (in a thread or taskdata structure)
708   // before setting work_state, otherwise wrong state is set after end of task
709 
710   KA_TRACE(10, ("__kmp_task_start(exit): T#%d task=%p\n", gtid, taskdata));
711 
712   return;
713 }
714 
715 #if OMPT_SUPPORT
716 //------------------------------------------------------------------------------
717 
718 // __ompt_task_start:
719 //   Build and trigger task-begin event
720 static inline void __ompt_task_start(kmp_task_t *task,
721                                      kmp_taskdata_t *current_task,
722                                      kmp_int32 gtid) {
723   kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
724   ompt_task_status_t status = ompt_task_switch;
725   if (__kmp_threads[gtid]->th.ompt_thread_info.ompt_task_yielded) {
726     status = ompt_task_yield;
727     __kmp_threads[gtid]->th.ompt_thread_info.ompt_task_yielded = 0;
728   }
729   /* let OMPT know that we're about to run this task */
730   if (ompt_enabled.ompt_callback_task_schedule) {
731     ompt_callbacks.ompt_callback(ompt_callback_task_schedule)(
732         &(current_task->ompt_task_info.task_data), status,
733         &(taskdata->ompt_task_info.task_data));
734   }
735   taskdata->ompt_task_info.scheduling_parent = current_task;
736 }
737 
738 // __ompt_task_finish:
739 //   Build and trigger final task-schedule event
740 static inline void __ompt_task_finish(kmp_task_t *task,
741                                       kmp_taskdata_t *resumed_task,
742                                       ompt_task_status_t status) {
743   if (ompt_enabled.ompt_callback_task_schedule) {
744     kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
745     if (__kmp_omp_cancellation && taskdata->td_taskgroup &&
746         taskdata->td_taskgroup->cancel_request == cancel_taskgroup) {
747       status = ompt_task_cancel;
748     }
749 
750     /* let OMPT know that we're returning to the callee task */
751     ompt_callbacks.ompt_callback(ompt_callback_task_schedule)(
752         &(taskdata->ompt_task_info.task_data), status,
753         (resumed_task ? &(resumed_task->ompt_task_info.task_data) : NULL));
754   }
755 }
756 #endif
757 
758 template <bool ompt>
759 static void __kmpc_omp_task_begin_if0_template(ident_t *loc_ref, kmp_int32 gtid,
760                                                kmp_task_t *task,
761                                                void *frame_address,
762                                                void *return_address) {
763   kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
764   kmp_taskdata_t *current_task = __kmp_threads[gtid]->th.th_current_task;
765 
766   KA_TRACE(10, ("__kmpc_omp_task_begin_if0(enter): T#%d loc=%p task=%p "
767                 "current_task=%p\n",
768                 gtid, loc_ref, taskdata, current_task));
769 
770   if (UNLIKELY(taskdata->td_flags.tiedness == TASK_UNTIED)) {
771     // untied task needs to increment counter so that the task structure is not
772     // freed prematurely
773     kmp_int32 counter = 1 + KMP_ATOMIC_INC(&taskdata->td_untied_count);
774     KMP_DEBUG_USE_VAR(counter);
775     KA_TRACE(20, ("__kmpc_omp_task_begin_if0: T#%d untied_count (%d) "
776                   "incremented for task %p\n",
777                   gtid, counter, taskdata));
778   }
779 
780   taskdata->td_flags.task_serial =
781       1; // Execute this task immediately, not deferred.
782   __kmp_task_start(gtid, task, current_task);
783 
784 #if OMPT_SUPPORT
785   if (ompt) {
786     if (current_task->ompt_task_info.frame.enter_frame.ptr == NULL) {
787       current_task->ompt_task_info.frame.enter_frame.ptr =
788           taskdata->ompt_task_info.frame.exit_frame.ptr = frame_address;
789       current_task->ompt_task_info.frame.enter_frame_flags =
790           taskdata->ompt_task_info.frame.exit_frame_flags =
791               OMPT_FRAME_FLAGS_APP;
792     }
793     if (ompt_enabled.ompt_callback_task_create) {
794       ompt_task_info_t *parent_info = &(current_task->ompt_task_info);
795       ompt_callbacks.ompt_callback(ompt_callback_task_create)(
796           &(parent_info->task_data), &(parent_info->frame),
797           &(taskdata->ompt_task_info.task_data),
798           TASK_TYPE_DETAILS_FORMAT(taskdata), 0, return_address);
799     }
800     __ompt_task_start(task, current_task, gtid);
801   }
802 #endif // OMPT_SUPPORT
803 
804   KA_TRACE(10, ("__kmpc_omp_task_begin_if0(exit): T#%d loc=%p task=%p,\n", gtid,
805                 loc_ref, taskdata));
806 }
807 
808 #if OMPT_SUPPORT
809 OMPT_NOINLINE
810 static void __kmpc_omp_task_begin_if0_ompt(ident_t *loc_ref, kmp_int32 gtid,
811                                            kmp_task_t *task,
812                                            void *frame_address,
813                                            void *return_address) {
814   __kmpc_omp_task_begin_if0_template<true>(loc_ref, gtid, task, frame_address,
815                                            return_address);
816 }
817 #endif // OMPT_SUPPORT
818 
819 // __kmpc_omp_task_begin_if0: report that a given serialized task has started
820 // execution
821 //
822 // loc_ref: source location information; points to beginning of task block.
823 // gtid: global thread number.
824 // task: task thunk for the started task.
825 #ifdef __s390x__
826 // This is required for OMPT_GET_FRAME_ADDRESS(1) to compile on s390x.
827 // In order for it to work correctly, the caller also needs to be compiled with
828 // backchain. If a caller is compiled without backchain,
829 // OMPT_GET_FRAME_ADDRESS(1) will produce an incorrect value, but will not
830 // crash.
831 __attribute__((target("backchain")))
832 #endif
833 void __kmpc_omp_task_begin_if0(ident_t *loc_ref, kmp_int32 gtid,
834                                kmp_task_t *task) {
835 #if OMPT_SUPPORT
836   if (UNLIKELY(ompt_enabled.enabled)) {
837     OMPT_STORE_RETURN_ADDRESS(gtid);
838     __kmpc_omp_task_begin_if0_ompt(loc_ref, gtid, task,
839                                    OMPT_GET_FRAME_ADDRESS(1),
840                                    OMPT_LOAD_RETURN_ADDRESS(gtid));
841     return;
842   }
843 #endif
844   __kmpc_omp_task_begin_if0_template<false>(loc_ref, gtid, task, NULL, NULL);
845 }
846 
847 #ifdef TASK_UNUSED
848 // __kmpc_omp_task_begin: report that a given task has started execution
849 // NEVER GENERATED BY COMPILER, DEPRECATED!!!
850 void __kmpc_omp_task_begin(ident_t *loc_ref, kmp_int32 gtid, kmp_task_t *task) {
851   kmp_taskdata_t *current_task = __kmp_threads[gtid]->th.th_current_task;
852 
853   KA_TRACE(
854       10,
855       ("__kmpc_omp_task_begin(enter): T#%d loc=%p task=%p current_task=%p\n",
856        gtid, loc_ref, KMP_TASK_TO_TASKDATA(task), current_task));
857 
858   __kmp_task_start(gtid, task, current_task);
859 
860   KA_TRACE(10, ("__kmpc_omp_task_begin(exit): T#%d loc=%p task=%p,\n", gtid,
861                 loc_ref, KMP_TASK_TO_TASKDATA(task)));
862   return;
863 }
864 #endif // TASK_UNUSED
865 
866 // __kmp_free_task: free the current task space and the space for shareds
867 //
868 // gtid: Global thread ID of calling thread
869 // taskdata: task to free
870 // thread: thread data structure of caller
871 static void __kmp_free_task(kmp_int32 gtid, kmp_taskdata_t *taskdata,
872                             kmp_info_t *thread) {
873   KA_TRACE(30, ("__kmp_free_task: T#%d freeing data from task %p\n", gtid,
874                 taskdata));
875 
876   // Check to make sure all flags and counters have the correct values
877   KMP_DEBUG_ASSERT(taskdata->td_flags.tasktype == TASK_EXPLICIT);
878   KMP_DEBUG_ASSERT(taskdata->td_flags.executing == 0);
879   KMP_DEBUG_ASSERT(taskdata->td_flags.complete == 1);
880   KMP_DEBUG_ASSERT(taskdata->td_flags.freed == 0);
881   KMP_DEBUG_ASSERT(taskdata->td_allocated_child_tasks == 0 ||
882                    taskdata->td_flags.task_serial == 1);
883   KMP_DEBUG_ASSERT(taskdata->td_incomplete_child_tasks == 0);
884   kmp_task_t *task = KMP_TASKDATA_TO_TASK(taskdata);
885   // Clear data to not be re-used later by mistake.
886   task->data1.destructors = NULL;
887   task->data2.priority = 0;
888 
889   taskdata->td_flags.freed = 1;
890 #if OMPX_TASKGRAPH
891   // do not free tasks in taskgraph
892   if (!taskdata->is_taskgraph) {
893 #endif
894 // deallocate the taskdata and shared variable blocks associated with this task
895 #if USE_FAST_MEMORY
896   __kmp_fast_free(thread, taskdata);
897 #else /* ! USE_FAST_MEMORY */
898   __kmp_thread_free(thread, taskdata);
899 #endif
900 #if OMPX_TASKGRAPH
901   } else {
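    // Inside a recorded task graph the taskdata is recycled for replay instead
    // of being freed, so reset its flags and counters to their initial state.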
902     taskdata->td_flags.complete = 0;
903     taskdata->td_flags.started = 0;
904     taskdata->td_flags.freed = 0;
905     taskdata->td_flags.executing = 0;
906     taskdata->td_flags.task_serial =
907         (taskdata->td_parent->td_flags.final ||
908           taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser);
909 
910     // taskdata->td_allow_completion_event.pending_events_count = 1;
911     KMP_ATOMIC_ST_RLX(&taskdata->td_untied_count, 0);
912     KMP_ATOMIC_ST_RLX(&taskdata->td_incomplete_child_tasks, 0);
913     // start at one because counts current task and children
914     KMP_ATOMIC_ST_RLX(&taskdata->td_allocated_child_tasks, 1);
915   }
916 #endif
917 
918   KA_TRACE(20, ("__kmp_free_task: T#%d freed task %p\n", gtid, taskdata));
919 }
920 
921 // __kmp_free_task_and_ancestors: free the current task and ancestors without
922 // children
923 //
924 // gtid: Global thread ID of calling thread
925 // taskdata: task to free
926 // thread: thread data structure of caller
927 static void __kmp_free_task_and_ancestors(kmp_int32 gtid,
928                                           kmp_taskdata_t *taskdata,
929                                           kmp_info_t *thread) {
930   // Proxy tasks must always be allowed to free their parents
931   // because they can be run in background even in serial mode.
932   kmp_int32 team_serial =
933       (taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser) &&
934       !taskdata->td_flags.proxy;
935   KMP_DEBUG_ASSERT(taskdata->td_flags.tasktype == TASK_EXPLICIT);
936 
937   kmp_int32 children = KMP_ATOMIC_DEC(&taskdata->td_allocated_child_tasks) - 1;
938   KMP_DEBUG_ASSERT(children >= 0);
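  // td_allocated_child_tasks counts the task itself plus its still-allocated
  // children; once it reaches zero nothing references the task any more and it
  // (and possibly its ancestors) can be freed.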
939 
940   // Now, go up the ancestor tree to see if any ancestors can now be freed.
941   while (children == 0) {
942     kmp_taskdata_t *parent_taskdata = taskdata->td_parent;
943 
944     KA_TRACE(20, ("__kmp_free_task_and_ancestors(enter): T#%d task %p complete "
945                   "and freeing itself\n",
946                   gtid, taskdata));
947 
948     // --- Deallocate my ancestor task ---
949     __kmp_free_task(gtid, taskdata, thread);
950 
951     taskdata = parent_taskdata;
952 
953     if (team_serial)
954       return;
955     // Stop checking ancestors at implicit task instead of walking up ancestor
956     // tree to avoid premature deallocation of ancestors.
957     if (taskdata->td_flags.tasktype == TASK_IMPLICIT) {
958       if (taskdata->td_dephash) { // do we need to cleanup dephash?
959         int children = KMP_ATOMIC_LD_ACQ(&taskdata->td_incomplete_child_tasks);
960         kmp_tasking_flags_t flags_old = taskdata->td_flags;
961         if (children == 0 && flags_old.complete == 1) {
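          // Clear the 'complete' bit with a CAS on the whole flags word so that
          // only one thread wins the right to clean up the dephash.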
962           kmp_tasking_flags_t flags_new = flags_old;
963           flags_new.complete = 0;
964           if (KMP_COMPARE_AND_STORE_ACQ32(
965                   RCAST(kmp_int32 *, &taskdata->td_flags),
966                   *RCAST(kmp_int32 *, &flags_old),
967                   *RCAST(kmp_int32 *, &flags_new))) {
968             KA_TRACE(100, ("__kmp_free_task_and_ancestors: T#%d cleans "
969                            "dephash of implicit task %p\n",
970                            gtid, taskdata));
971             // cleanup dephash of finished implicit task
972             __kmp_dephash_free_entries(thread, taskdata->td_dephash);
973           }
974         }
975       }
976       return;
977     }
978     // Predecrement simulated by "- 1" calculation
979     children = KMP_ATOMIC_DEC(&taskdata->td_allocated_child_tasks) - 1;
980     KMP_DEBUG_ASSERT(children >= 0);
981   }
982 
983   KA_TRACE(
984       20, ("__kmp_free_task_and_ancestors(exit): T#%d task %p has %d children; "
985            "not freeing it yet\n",
986            gtid, taskdata, children));
987 }
988 
989 // Only need to keep track of child task counts if any of the following:
990 // 1. team parallel and tasking not serialized;
991 // 2. it is a proxy or detachable or hidden helper task
992 // 3. the children counter of its parent task is greater than 0.
993 // The reason for the 3rd one is a serialized team that has found a detached or
994 // hidden helper task T. In this case, the execution of T is still deferred, and
995 // it is also possible that a regular task depends on T; if we don't track the
996 // children, task synchronization will be broken.
997 static bool __kmp_track_children_task(kmp_taskdata_t *taskdata) {
998   kmp_tasking_flags_t flags = taskdata->td_flags;
999   bool ret = !(flags.team_serial || flags.tasking_ser);
1000   ret = ret || flags.proxy == TASK_PROXY ||
1001         flags.detachable == TASK_DETACHABLE || flags.hidden_helper;
1002   ret = ret ||
1003         KMP_ATOMIC_LD_ACQ(&taskdata->td_parent->td_incomplete_child_tasks) > 0;
1004 #if OMPX_TASKGRAPH
1005   if (taskdata->td_taskgroup && taskdata->is_taskgraph)
1006     ret = ret || KMP_ATOMIC_LD_ACQ(&taskdata->td_taskgroup->count) > 0;
1007 #endif
1008   return ret;
1009 }
1010 
1011 // __kmp_task_finish: bookkeeping to do when a task finishes execution
1012 //
1013 // gtid: global thread ID for calling thread
1014 // task: task to be finished
1015 // resumed_task: task to be resumed.  (may be NULL if task is serialized)
1016 //
1017 // template<ompt>: effectively ompt_enabled.enabled!=0
1018 // the version with ompt=false is inlined, allowing all OMPT code to be
1019 // optimized away in this case
1020 template <bool ompt>
1021 static void __kmp_task_finish(kmp_int32 gtid, kmp_task_t *task,
1022                               kmp_taskdata_t *resumed_task) {
1023   kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
1024   kmp_info_t *thread = __kmp_threads[gtid];
1025   kmp_task_team_t *task_team =
1026       thread->th.th_task_team; // might be NULL for serial teams...
1027 #if OMPX_TASKGRAPH
1028   // to avoid a segfault when we need to access taskdata->td_flags after it is freed when using vanilla taskloop
1029   bool is_taskgraph;
1030 #endif
1031 #if KMP_DEBUG
1032   kmp_int32 children = 0;
1033 #endif
1034   KA_TRACE(10, ("__kmp_task_finish(enter): T#%d finishing task %p and resuming "
1035                 "task %p\n",
1036                 gtid, taskdata, resumed_task));
1037 
1038   KMP_DEBUG_ASSERT(taskdata->td_flags.tasktype == TASK_EXPLICIT);
1039 
1040 #if OMPX_TASKGRAPH
1041   is_taskgraph = taskdata->is_taskgraph;
1042 #endif
1043 
1044 // Pop task from stack if tied
1045 #ifdef BUILD_TIED_TASK_STACK
1046   if (taskdata->td_flags.tiedness == TASK_TIED) {
1047     __kmp_pop_task_stack(gtid, thread, taskdata);
1048   }
1049 #endif /* BUILD_TIED_TASK_STACK */
1050 
1051   if (UNLIKELY(taskdata->td_flags.tiedness == TASK_UNTIED)) {
1052     // untied task needs to check the counter so that the task structure is not
1053     // freed prematurely
1054     kmp_int32 counter = KMP_ATOMIC_DEC(&taskdata->td_untied_count) - 1;
1055     KA_TRACE(
1056         20,
1057         ("__kmp_task_finish: T#%d untied_count (%d) decremented for task %p\n",
1058          gtid, counter, taskdata));
1059     if (counter > 0) {
1060       // untied task is not done, to be continued possibly by other thread, do
1061       // not free it now
1062       if (resumed_task == NULL) {
1063         KMP_DEBUG_ASSERT(taskdata->td_flags.task_serial);
1064         resumed_task = taskdata->td_parent; // In a serialized task, the resumed
1065         // task is the parent
1066       }
1067       thread->th.th_current_task = resumed_task; // restore current_task
1068       resumed_task->td_flags.executing = 1; // resume previous task
1069       KA_TRACE(10, ("__kmp_task_finish(exit): T#%d partially done task %p, "
1070                     "resuming task %p\n",
1071                     gtid, taskdata, resumed_task));
1072       return;
1073     }
1074   }
1075 
1076   // bookkeeping for resuming task:
1077   // GEH - note tasking_ser => task_serial
1078   KMP_DEBUG_ASSERT(
1079       (taskdata->td_flags.tasking_ser || taskdata->td_flags.task_serial) ==
1080       taskdata->td_flags.task_serial);
1081   if (taskdata->td_flags.task_serial) {
1082     if (resumed_task == NULL) {
1083       resumed_task = taskdata->td_parent; // In a serialized task, the resumed
1084       // task is the parent
1085     }
1086   } else {
1087     KMP_DEBUG_ASSERT(resumed_task !=
1088                      NULL); // verify that resumed task is passed as argument
1089   }
1090 
1091   /* If the tasks' destructor thunk flag has been set, we need to invoke the
1092      destructor thunk that has been generated by the compiler. The code is
1093      placed here, since at this point other tasks might have been released
1094      hence overlapping the destructor invocations with some other work in the
1095      released tasks.  The OpenMP spec is not specific on when the destructors
1096      are invoked, so we should be free to choose. */
1097   if (UNLIKELY(taskdata->td_flags.destructors_thunk)) {
1098     kmp_routine_entry_t destr_thunk = task->data1.destructors;
1099     KMP_ASSERT(destr_thunk);
1100     destr_thunk(gtid, task);
1101   }
1102 
1103   KMP_DEBUG_ASSERT(taskdata->td_flags.complete == 0);
1104   KMP_DEBUG_ASSERT(taskdata->td_flags.started == 1);
1105   KMP_DEBUG_ASSERT(taskdata->td_flags.freed == 0);
1106 
1107   bool completed = true;
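  // 'completed' is cleared below if the task detaches (waits on an
  // allow-completion event) or must be re-enqueued to finish a pending target
  // async operation.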
1108   if (UNLIKELY(taskdata->td_flags.detachable == TASK_DETACHABLE)) {
1109     if (taskdata->td_allow_completion_event.type ==
1110         KMP_EVENT_ALLOW_COMPLETION) {
1111       // event hasn't been fulfilled yet. Try to detach task.
1112       __kmp_acquire_tas_lock(&taskdata->td_allow_completion_event.lock, gtid);
1113       if (taskdata->td_allow_completion_event.type ==
1114           KMP_EVENT_ALLOW_COMPLETION) {
1115         // task finished execution
1116         KMP_DEBUG_ASSERT(taskdata->td_flags.executing == 1);
1117         taskdata->td_flags.executing = 0; // suspend the finishing task
1118 
1119 #if OMPT_SUPPORT
1120         // For a detached task, which is not yet completed, we switch back to the
1121         // resumed task; the later omp_fulfill_event signals completion.
1122         // Locking is necessary to avoid a race with ompt_task_late_fulfill.
1123         if (ompt)
1124           __ompt_task_finish(task, resumed_task, ompt_task_detach);
1125 #endif
1126 
1127         // no access to taskdata after this point!
1128         // __kmp_fulfill_event might free taskdata at any time from now
1129 
1130         taskdata->td_flags.proxy = TASK_PROXY; // proxify!
1131         completed = false;
1132       }
1133       __kmp_release_tas_lock(&taskdata->td_allow_completion_event.lock, gtid);
1134     }
1135   }
1136 
1137   // Tasks with valid target async handles must be re-enqueued.
1138   if (taskdata->td_target_data.async_handle != NULL) {
1139     // Note: no need to translate gtid to its shadow. If the current thread is a
1140     // hidden helper one, then the gtid is already correct. Otherwise, hidden
1141     // helper threads are disabled, and gtid refers to an OpenMP thread.
1142 #if OMPT_SUPPORT
1143     if (ompt) {
1144       __ompt_task_finish(task, resumed_task, ompt_task_switch);
1145     }
1146 #endif
1147     __kmpc_give_task(task, __kmp_tid_from_gtid(gtid));
1148     if (KMP_HIDDEN_HELPER_THREAD(gtid))
1149       __kmp_hidden_helper_worker_thread_signal();
1150     completed = false;
1151   }
1152 
1153   if (completed) {
1154     taskdata->td_flags.complete = 1; // mark the task as completed
1155 #if OMPX_TASKGRAPH
1156     taskdata->td_flags.onced = 1; // mark the task as ran once already
1157 #endif
1158 
1159 #if OMPT_SUPPORT
1160     // This is not a detached task; we are done here
1161     if (ompt)
1162       __ompt_task_finish(task, resumed_task, ompt_task_complete);
1163 #endif
1164     // TODO: What would be the balance between the conditions in the function
1165     // and an atomic operation?
1166     if (__kmp_track_children_task(taskdata)) {
1167       __kmp_release_deps(gtid, taskdata);
1168       // Predecrement simulated by "- 1" calculation
1169 #if KMP_DEBUG
1170       children = -1 +
1171 #endif
1172           KMP_ATOMIC_DEC(&taskdata->td_parent->td_incomplete_child_tasks);
1173       KMP_DEBUG_ASSERT(children >= 0);
1174 #if OMPX_TASKGRAPH
1175       if (taskdata->td_taskgroup && !taskdata->is_taskgraph)
1176 #else
1177       if (taskdata->td_taskgroup)
1178 #endif
1179         KMP_ATOMIC_DEC(&taskdata->td_taskgroup->count);
1180     } else if (task_team && (task_team->tt.tt_found_proxy_tasks ||
1181                              task_team->tt.tt_hidden_helper_task_encountered)) {
1182       // if we found proxy or hidden helper tasks there could exist a dependency
1183       // chain with the proxy task as origin
1184       __kmp_release_deps(gtid, taskdata);
1185     }
1186     // td_flags.executing must be marked as 0 after __kmp_release_deps has been
1187     // called. Othertwise, if a task is executed immediately from the
1188     // release_deps code, the flag will be reset to 1 again by this same
1189     // function
1190     KMP_DEBUG_ASSERT(taskdata->td_flags.executing == 1);
1191     taskdata->td_flags.executing = 0; // suspend the finishing task
1192 
1193     // Decrement the counter of hidden helper tasks to be executed.
1194     if (taskdata->td_flags.hidden_helper) {
1195       // Hidden helper tasks can only be executed by hidden helper threads.
1196       KMP_ASSERT(KMP_HIDDEN_HELPER_THREAD(gtid));
1197       KMP_ATOMIC_DEC(&__kmp_unexecuted_hidden_helper_tasks);
1198     }
1199   }
1200 
1201   KA_TRACE(
1202       20, ("__kmp_task_finish: T#%d finished task %p, %d incomplete children\n",
1203            gtid, taskdata, children));
1204 
1205   // Free this task and then ancestor tasks if they have no children.
1206   // Restore th_current_task first as suggested by John:
1207   // johnmc: if an asynchronous inquiry peers into the runtime system
1208   // it doesn't see the freed task as the current task.
1209   thread->th.th_current_task = resumed_task;
1210   if (completed)
1211     __kmp_free_task_and_ancestors(gtid, taskdata, thread);
1212 
1213   // TODO: GEH - make sure root team implicit task is initialized properly.
1214   // KMP_DEBUG_ASSERT( resumed_task->td_flags.executing == 0 );
1215   resumed_task->td_flags.executing = 1; // resume previous task
1216 
1217 #if OMPX_TASKGRAPH
1218   if (is_taskgraph && __kmp_track_children_task(taskdata) &&
1219       taskdata->td_taskgroup) {
1220     // TDG: we only release taskgroup barrier here because
1221     // free_task_and_ancestors will call
1222     // __kmp_free_task, which resets all task parameters such as
1223     // taskdata->started, etc. If we release the barrier earlier, these
1224     // parameters could be read before being reset. This is not an issue for
1225     // non-TDG implementation because we never reuse a task(data) structure
1226     KMP_ATOMIC_DEC(&taskdata->td_taskgroup->count);
1227   }
1228 #endif
1229 
1230   KA_TRACE(
1231       10, ("__kmp_task_finish(exit): T#%d finished task %p, resuming task %p\n",
1232            gtid, taskdata, resumed_task));
1233 
1234   return;
1235 }
1236 
1237 template <bool ompt>
1238 static void __kmpc_omp_task_complete_if0_template(ident_t *loc_ref,
1239                                                   kmp_int32 gtid,
1240                                                   kmp_task_t *task) {
1241   KA_TRACE(10, ("__kmpc_omp_task_complete_if0(enter): T#%d loc=%p task=%p\n",
1242                 gtid, loc_ref, KMP_TASK_TO_TASKDATA(task)));
1243   KMP_DEBUG_ASSERT(gtid >= 0);
1244   // this routine will provide task to resume
1245   __kmp_task_finish<ompt>(gtid, task, NULL);
1246 
1247   KA_TRACE(10, ("__kmpc_omp_task_complete_if0(exit): T#%d loc=%p task=%p\n",
1248                 gtid, loc_ref, KMP_TASK_TO_TASKDATA(task)));
1249 
1250 #if OMPT_SUPPORT
1251   if (ompt) {
1252     ompt_frame_t *ompt_frame;
1253     __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
1254     ompt_frame->enter_frame = ompt_data_none;
1255     ompt_frame->enter_frame_flags = OMPT_FRAME_FLAGS_RUNTIME;
1256   }
1257 #endif
1258 
1259   return;
1260 }
1261 
1262 #if OMPT_SUPPORT
1263 OMPT_NOINLINE
1264 void __kmpc_omp_task_complete_if0_ompt(ident_t *loc_ref, kmp_int32 gtid,
1265                                        kmp_task_t *task) {
1266   __kmpc_omp_task_complete_if0_template<true>(loc_ref, gtid, task);
1267 }
1268 #endif // OMPT_SUPPORT
1269 
1270 // __kmpc_omp_task_complete_if0: report that a task has completed execution
1271 //
1272 // loc_ref: source location information; points to end of task block.
1273 // gtid: global thread number.
1274 // task: task thunk for the completed task.
1275 void __kmpc_omp_task_complete_if0(ident_t *loc_ref, kmp_int32 gtid,
1276                                   kmp_task_t *task) {
1277 #if OMPT_SUPPORT
1278   if (UNLIKELY(ompt_enabled.enabled)) {
1279     __kmpc_omp_task_complete_if0_ompt(loc_ref, gtid, task);
1280     return;
1281   }
1282 #endif
1283   __kmpc_omp_task_complete_if0_template<false>(loc_ref, gtid, task);
1284 }
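// Illustrative sketch (assumption: typical clang lowering; the exact generated
// calls may differ): for "#pragma omp task if(0)" the compiler allocates the
// task, runs its body immediately on the encountering thread, and then calls
// __kmpc_omp_task_complete_if0 above, so the task is undeferred.
#if 0 // illustration only, not compiled
#include <cstdio>
int main() {
  int x = 0;
#pragma omp parallel num_threads(2)
#pragma omp single
  {
#pragma omp task if(0) shared(x) // undeferred: runs before the single continues
    x = 42;
    std::printf("x = %d\n", x); // prints 42: the task already completed
  }
  return 0;
}
#endif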
1285 
1286 #ifdef TASK_UNUSED
1287 // __kmpc_omp_task_complete: report that a task has completed execution
1288 // NEVER GENERATED BY COMPILER, DEPRECATED!!!
1289 void __kmpc_omp_task_complete(ident_t *loc_ref, kmp_int32 gtid,
1290                               kmp_task_t *task) {
1291   KA_TRACE(10, ("__kmpc_omp_task_complete(enter): T#%d loc=%p task=%p\n", gtid,
1292                 loc_ref, KMP_TASK_TO_TASKDATA(task)));
1293 
1294   __kmp_task_finish<false>(gtid, task,
1295                            NULL); // Not sure how to find task to resume
1296 
1297   KA_TRACE(10, ("__kmpc_omp_task_complete(exit): T#%d loc=%p task=%p\n", gtid,
1298                 loc_ref, KMP_TASK_TO_TASKDATA(task)));
1299   return;
1300 }
1301 #endif // TASK_UNUSED
1302 
1303 // __kmp_init_implicit_task: Initialize the appropriate fields in the implicit
1304 // task for a given thread
1305 //
1306 // loc_ref:  reference to source location of parallel region
1307 // this_thr:  thread data structure corresponding to implicit task
1308 // team: team for this_thr
1309 // tid: thread id of given thread within team
1310 // set_curr_task: TRUE if need to push current task to thread
1311 // NOTE: Routine does not set up the implicit task ICVS.  This is assumed to
1312 // have already been done elsewhere.
1313 // TODO: Get better loc_ref.  Value passed in may be NULL
1314 void __kmp_init_implicit_task(ident_t *loc_ref, kmp_info_t *this_thr,
1315                               kmp_team_t *team, int tid, int set_curr_task) {
1316   kmp_taskdata_t *task = &team->t.t_implicit_task_taskdata[tid];
1317 
1318   KF_TRACE(
1319       10,
1320       ("__kmp_init_implicit_task(enter): T#:%d team=%p task=%p, reinit=%s\n",
1321        tid, team, task, set_curr_task ? "TRUE" : "FALSE"));
1322 
1323   task->td_task_id = KMP_GEN_TASK_ID();
1324   task->td_team = team;
1325   //    task->td_parent   = NULL;  // fix for CQ230101 (broken parent task info
1326   //    in debugger)
1327   task->td_ident = loc_ref;
1328   task->td_taskwait_ident = NULL;
1329   task->td_taskwait_counter = 0;
1330   task->td_taskwait_thread = 0;
1331 
1332   task->td_flags.tiedness = TASK_TIED;
1333   task->td_flags.tasktype = TASK_IMPLICIT;
1334   task->td_flags.proxy = TASK_FULL;
1335 
1336   // All implicit tasks are executed immediately, not deferred
1337   task->td_flags.task_serial = 1;
1338   task->td_flags.tasking_ser = (__kmp_tasking_mode == tskm_immediate_exec);
1339   task->td_flags.team_serial = (team->t.t_serialized) ? 1 : 0;
1340 
1341   task->td_flags.started = 1;
1342   task->td_flags.executing = 1;
1343   task->td_flags.complete = 0;
1344   task->td_flags.freed = 0;
1345 #if OMPX_TASKGRAPH
1346   task->td_flags.onced = 0;
1347 #endif
1348 
1349   task->td_depnode = NULL;
1350   task->td_last_tied = task;
1351   task->td_allow_completion_event.type = KMP_EVENT_UNINITIALIZED;
1352 
1353   if (set_curr_task) { // only do this init first time thread is created
1354     KMP_ATOMIC_ST_REL(&task->td_incomplete_child_tasks, 0);
1355     // Not used: don't need to deallocate implicit task
1356     KMP_ATOMIC_ST_REL(&task->td_allocated_child_tasks, 0);
1357     task->td_taskgroup = NULL; // An implicit task does not have taskgroup
1358     task->td_dephash = NULL;
1359     __kmp_push_current_task_to_thread(this_thr, team, tid);
1360   } else {
1361     KMP_DEBUG_ASSERT(task->td_incomplete_child_tasks == 0);
1362     KMP_DEBUG_ASSERT(task->td_allocated_child_tasks == 0);
1363   }
1364 
1365 #if OMPT_SUPPORT
1366   if (UNLIKELY(ompt_enabled.enabled))
1367     __ompt_task_init(task, tid);
1368 #endif
1369 
1370   KF_TRACE(10, ("__kmp_init_implicit_task(exit): T#:%d team=%p task=%p\n", tid,
1371                 team, task));
1372 }
1373 
1374 // __kmp_finish_implicit_task: Release resources associated with implicit tasks
1375 // at the end of parallel regions. Some resources are kept for reuse in the next
1376 // parallel region.
1377 //
1378 // thread:  thread data structure corresponding to implicit task
1379 void __kmp_finish_implicit_task(kmp_info_t *thread) {
1380   kmp_taskdata_t *task = thread->th.th_current_task;
1381   if (task->td_dephash) {
1382     int children;
1383     task->td_flags.complete = 1;
1384 #if OMPX_TASKGRAPH
1385     task->td_flags.onced = 1;
1386 #endif
1387     children = KMP_ATOMIC_LD_ACQ(&task->td_incomplete_child_tasks);
1388     kmp_tasking_flags_t flags_old = task->td_flags;
1389     if (children == 0 && flags_old.complete == 1) {
1390       kmp_tasking_flags_t flags_new = flags_old;
1391       flags_new.complete = 0;
1392       if (KMP_COMPARE_AND_STORE_ACQ32(RCAST(kmp_int32 *, &task->td_flags),
1393                                       *RCAST(kmp_int32 *, &flags_old),
1394                                       *RCAST(kmp_int32 *, &flags_new))) {
1395         KA_TRACE(100, ("__kmp_finish_implicit_task: T#%d cleans "
1396                        "dephash of implicit task %p\n",
1397                        thread->th.th_info.ds.ds_gtid, task));
1398         __kmp_dephash_free_entries(thread, task->td_dephash);
1399       }
1400     }
1401   }
1402 }
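// Illustrative sketch of the claim-by-CAS idiom used above (assumption:
// simplified to a plain std::atomic word instead of the runtime's
// KMP_COMPARE_AND_STORE_ACQ32 on the flags bitfield): the thread that
// successfully installs the new value wins the right to do the one-time
// dephash cleanup; any concurrent update of the flags makes the CAS fail.
#if 0 // illustration only, not compiled
#include <atomic>
#include <cstdint>
static std::atomic<std::uint32_t> g_flags{1u}; // bit 0 plays the "complete" role
static bool try_claim_cleanup() {
  std::uint32_t expected = g_flags.load(std::memory_order_acquire);
  if ((expected & 1u) == 0)
    return false; // someone already cleared the flag
  std::uint32_t desired = expected & ~1u; // same flags with "complete" cleared
  // Exactly one caller sees the old word and swaps in the new one.
  return g_flags.compare_exchange_strong(expected, desired,
                                         std::memory_order_acq_rel);
}
#endif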
1403 
1404 // __kmp_free_implicit_task: Release resources associated to implicit tasks
1405 // when these are destroyed regions
1406 //
1407 // thread:  thread data structure corresponding to implicit task
1408 void __kmp_free_implicit_task(kmp_info_t *thread) {
1409   kmp_taskdata_t *task = thread->th.th_current_task;
1410   if (task && task->td_dephash) {
1411     __kmp_dephash_free(thread, task->td_dephash);
1412     task->td_dephash = NULL;
1413   }
1414 }
1415 
1416 // Round up a size to a power of two specified by val: Used to insert padding
1417 // between structures co-allocated using a single malloc() call
1418 static size_t __kmp_round_up_to_val(size_t size, size_t val) {
1419   if (size & (val - 1)) {
1420     size &= ~(val - 1);
1421     if (size <= KMP_SIZE_T_MAX - val) {
1422       size += val; // Round up if there is no overflow.
1423     }
1424   }
1425   return size;
1426 } // __kmp_round_up_to_val
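// Illustrative sketch of the rounding above (assumption: val is a power of
// two, as with sizeof(kmp_uint64)): already-aligned sizes are left untouched,
// everything else is bumped to the next multiple of val.
#if 0 // illustration only, not compiled
#include <cassert>
#include <cstddef>
static size_t round_up_to_val_demo(size_t size, size_t val) {
  if (size & (val - 1)) { // not already a multiple of val
    size &= ~(val - 1);   // drop the remainder
    size += val;          // and round up (overflow check omitted here)
  }
  return size;
}
int main() {
  assert(round_up_to_val_demo(40, 8) == 40); // already aligned
  assert(round_up_to_val_demo(41, 8) == 48); // rounded up
  assert(round_up_to_val_demo(47, 8) == 48);
  return 0;
}
#endif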
1427 
1428 // __kmp_task_alloc: Allocate the taskdata and task data structures for a task
1429 //
1430 // loc_ref: source location information
1431 // gtid: global thread number.
1432 // flags: include tiedness & task type (explicit vs. implicit) of the ''new''
1433 // task encountered. Converted from kmp_int32 to kmp_tasking_flags_t in routine.
1434 // sizeof_kmp_task_t:  Size in bytes of kmp_task_t data structure including
1435 // private vars accessed in task.
1436 // sizeof_shareds:  Size in bytes of array of pointers to shared vars accessed
1437 // in task.
1438 // task_entry: Pointer to task code entry point generated by compiler.
1439 // returns: a pointer to the allocated kmp_task_t structure (task).
1440 kmp_task_t *__kmp_task_alloc(ident_t *loc_ref, kmp_int32 gtid,
1441                              kmp_tasking_flags_t *flags,
1442                              size_t sizeof_kmp_task_t, size_t sizeof_shareds,
1443                              kmp_routine_entry_t task_entry) {
1444   kmp_task_t *task;
1445   kmp_taskdata_t *taskdata;
1446   kmp_info_t *thread = __kmp_threads[gtid];
1447   kmp_team_t *team = thread->th.th_team;
1448   kmp_taskdata_t *parent_task = thread->th.th_current_task;
1449   size_t shareds_offset;
1450 
1451   if (UNLIKELY(!TCR_4(__kmp_init_middle)))
1452     __kmp_middle_initialize();
1453 
1454   if (flags->hidden_helper) {
1455     if (__kmp_enable_hidden_helper) {
1456       if (!TCR_4(__kmp_init_hidden_helper))
1457         __kmp_hidden_helper_initialize();
1458     } else {
1459       // If hidden helper tasks are not enabled, reset the flag to FALSE.
1460       flags->hidden_helper = FALSE;
1461     }
1462   }
1463 
1464   KA_TRACE(10, ("__kmp_task_alloc(enter): T#%d loc=%p, flags=(0x%x) "
1465                 "sizeof_task=%ld sizeof_shared=%ld entry=%p\n",
1466                 gtid, loc_ref, *((kmp_int32 *)flags), sizeof_kmp_task_t,
1467                 sizeof_shareds, task_entry));
1468 
1469   KMP_DEBUG_ASSERT(parent_task);
1470   if (parent_task->td_flags.final) {
1471     if (flags->merged_if0) {
1472     }
1473     flags->final = 1;
1474   }
1475 
1476   if (flags->tiedness == TASK_UNTIED && !team->t.t_serialized) {
1477     // Untied task encountered causes the TSC algorithm to check entire deque of
1478     // the victim thread. If no untied task encountered, then checking the head
1479     // of the deque should be enough.
1480     KMP_CHECK_UPDATE(thread->th.th_task_team->tt.tt_untied_task_encountered, 1);
1481   }
1482 
1483   // Detachable tasks are not proxy tasks yet but could be in the future. Doing
1484   // the tasking setup
1485   // when that happens is too late.
1486   if (UNLIKELY(flags->proxy == TASK_PROXY ||
1487                flags->detachable == TASK_DETACHABLE || flags->hidden_helper)) {
1488     if (flags->proxy == TASK_PROXY) {
1489       flags->tiedness = TASK_UNTIED;
1490       flags->merged_if0 = 1;
1491     }
1492     /* are we running in a serialized parallel region or tskm_immediate_exec... we need
1493        tasking support enabled */
1494     if ((thread->th.th_task_team) == NULL) {
1495       /* This should only happen if the team is serialized
1496           setup a task team and propagate it to the thread */
1497       KMP_DEBUG_ASSERT(team->t.t_serialized);
1498       KA_TRACE(30,
1499                ("T#%d creating task team in __kmp_task_alloc for proxy task\n",
1500                 gtid));
1501       __kmp_task_team_setup(thread, team);
1502       thread->th.th_task_team = team->t.t_task_team[thread->th.th_task_state];
1503     }
1504     kmp_task_team_t *task_team = thread->th.th_task_team;
1505 
1506     /* tasking must be enabled now as the task might not be pushed */
1507     if (!KMP_TASKING_ENABLED(task_team)) {
1508       KA_TRACE(
1509           30,
1510           ("T#%d enabling tasking in __kmp_task_alloc for proxy task\n", gtid));
1511       __kmp_enable_tasking(task_team, thread);
1512       kmp_int32 tid = thread->th.th_info.ds.ds_tid;
1513       kmp_thread_data_t *thread_data = &task_team->tt.tt_threads_data[tid];
1514       // No lock needed since only owner can allocate
1515       if (thread_data->td.td_deque == NULL) {
1516         __kmp_alloc_task_deque(thread, thread_data);
1517       }
1518     }
1519 
1520     if ((flags->proxy == TASK_PROXY || flags->detachable == TASK_DETACHABLE) &&
1521         task_team->tt.tt_found_proxy_tasks == FALSE)
1522       TCW_4(task_team->tt.tt_found_proxy_tasks, TRUE);
1523     if (flags->hidden_helper &&
1524         task_team->tt.tt_hidden_helper_task_encountered == FALSE)
1525       TCW_4(task_team->tt.tt_hidden_helper_task_encountered, TRUE);
1526   }
1527 
1528   // Calculate shared structure offset including padding after kmp_task_t struct
1529   // to align pointers in shared struct
1530   shareds_offset = sizeof(kmp_taskdata_t) + sizeof_kmp_task_t;
1531   shareds_offset = __kmp_round_up_to_val(shareds_offset, sizeof(kmp_uint64));
1532 
1533   // Allocate a kmp_taskdata_t block and a kmp_task_t block.
1534   KA_TRACE(30, ("__kmp_task_alloc: T#%d First malloc size: %ld\n", gtid,
1535                 shareds_offset));
1536   KA_TRACE(30, ("__kmp_task_alloc: T#%d Second malloc size: %ld\n", gtid,
1537                 sizeof_shareds));
1538 
1539   // Avoid double allocation here by combining shareds with taskdata
1540 #if USE_FAST_MEMORY
1541   taskdata = (kmp_taskdata_t *)__kmp_fast_allocate(thread, shareds_offset +
1542                                                                sizeof_shareds);
1543 #else /* ! USE_FAST_MEMORY */
1544   taskdata = (kmp_taskdata_t *)__kmp_thread_malloc(thread, shareds_offset +
1545                                                                sizeof_shareds);
1546 #endif /* USE_FAST_MEMORY */
1547 
1548   task = KMP_TASKDATA_TO_TASK(taskdata);
1549 
1550 // Make sure task & taskdata are aligned appropriately
1551 #if KMP_ARCH_X86 || KMP_ARCH_PPC64 || KMP_ARCH_S390X || !KMP_HAVE_QUAD
1552   KMP_DEBUG_ASSERT((((kmp_uintptr_t)taskdata) & (sizeof(double) - 1)) == 0);
1553   KMP_DEBUG_ASSERT((((kmp_uintptr_t)task) & (sizeof(double) - 1)) == 0);
1554 #else
1555   KMP_DEBUG_ASSERT((((kmp_uintptr_t)taskdata) & (sizeof(_Quad) - 1)) == 0);
1556   KMP_DEBUG_ASSERT((((kmp_uintptr_t)task) & (sizeof(_Quad) - 1)) == 0);
1557 #endif
1558   if (sizeof_shareds > 0) {
1559     // Avoid double allocation here by combining shareds with taskdata
1560     task->shareds = &((char *)taskdata)[shareds_offset];
1561     // Make sure shareds struct is aligned to pointer size
1562     KMP_DEBUG_ASSERT((((kmp_uintptr_t)task->shareds) & (sizeof(void *) - 1)) ==
1563                      0);
1564   } else {
1565     task->shareds = NULL;
1566   }
1567   task->routine = task_entry;
1568   task->part_id = 0; // AC: Always start with 0 part id
1569 
1570   taskdata->td_task_id = KMP_GEN_TASK_ID();
1571   taskdata->td_team = thread->th.th_team;
1572   taskdata->td_alloc_thread = thread;
1573   taskdata->td_parent = parent_task;
1574   taskdata->td_level = parent_task->td_level + 1; // increment nesting level
1575   KMP_ATOMIC_ST_RLX(&taskdata->td_untied_count, 0);
1576   taskdata->td_ident = loc_ref;
1577   taskdata->td_taskwait_ident = NULL;
1578   taskdata->td_taskwait_counter = 0;
1579   taskdata->td_taskwait_thread = 0;
1580   KMP_DEBUG_ASSERT(taskdata->td_parent != NULL);
1581   // avoid copying icvs for proxy tasks
1582   if (flags->proxy == TASK_FULL)
1583     copy_icvs(&taskdata->td_icvs, &taskdata->td_parent->td_icvs);
1584 
1585   taskdata->td_flags = *flags;
1586   taskdata->td_task_team = thread->th.th_task_team;
1587   taskdata->td_size_alloc = shareds_offset + sizeof_shareds;
1588   taskdata->td_flags.tasktype = TASK_EXPLICIT;
1589   // If it is hidden helper task, we need to set the team and task team
1590   // correspondingly.
1591   if (flags->hidden_helper) {
1592     kmp_info_t *shadow_thread = __kmp_threads[KMP_GTID_TO_SHADOW_GTID(gtid)];
1593     taskdata->td_team = shadow_thread->th.th_team;
1594     taskdata->td_task_team = shadow_thread->th.th_task_team;
1595   }
1596 
1597   // GEH - TODO: fix this to copy parent task's value of tasking_ser flag
1598   taskdata->td_flags.tasking_ser = (__kmp_tasking_mode == tskm_immediate_exec);
1599 
1600   // GEH - TODO: fix this to copy parent task's value of team_serial flag
1601   taskdata->td_flags.team_serial = (team->t.t_serialized) ? 1 : 0;
1602 
1603   // GEH - Note we serialize the task if the team is serialized to make sure
1604   // implicit parallel region tasks are not left until program termination to
1605   // execute. Also, it helps locality to execute immediately.
1606 
1607   taskdata->td_flags.task_serial =
1608       (parent_task->td_flags.final || taskdata->td_flags.team_serial ||
1609        taskdata->td_flags.tasking_ser || flags->merged_if0);
1610 
1611   taskdata->td_flags.started = 0;
1612   taskdata->td_flags.executing = 0;
1613   taskdata->td_flags.complete = 0;
1614   taskdata->td_flags.freed = 0;
1615 #if OMPX_TASKGRAPH
1616   taskdata->td_flags.onced = 0;
1617   taskdata->is_taskgraph = 0;
1618   taskdata->tdg = nullptr;
1619 #endif
1620   KMP_ATOMIC_ST_RLX(&taskdata->td_incomplete_child_tasks, 0);
1621   // start at one because the count includes the current task and its children
1622   KMP_ATOMIC_ST_RLX(&taskdata->td_allocated_child_tasks, 1);
1623   taskdata->td_taskgroup =
1624       parent_task->td_taskgroup; // task inherits taskgroup from the parent task
1625   taskdata->td_dephash = NULL;
1626   taskdata->td_depnode = NULL;
1627   taskdata->td_target_data.async_handle = NULL;
1628   if (flags->tiedness == TASK_UNTIED)
1629     taskdata->td_last_tied = NULL; // will be set when the task is scheduled
1630   else
1631     taskdata->td_last_tied = taskdata;
1632   taskdata->td_allow_completion_event.type = KMP_EVENT_UNINITIALIZED;
1633 #if OMPT_SUPPORT
1634   if (UNLIKELY(ompt_enabled.enabled))
1635     __ompt_task_init(taskdata, gtid);
1636 #endif
1637   // TODO: What would be the balance between the conditions in the function and
1638   // an atomic operation?
1639   if (__kmp_track_children_task(taskdata)) {
1640     KMP_ATOMIC_INC(&parent_task->td_incomplete_child_tasks);
1641     if (parent_task->td_taskgroup)
1642       KMP_ATOMIC_INC(&parent_task->td_taskgroup->count);
1643     // Only need to keep track of allocated child tasks for explicit tasks since
1644     // implicit tasks are not deallocated
1645     if (taskdata->td_parent->td_flags.tasktype == TASK_EXPLICIT) {
1646       KMP_ATOMIC_INC(&taskdata->td_parent->td_allocated_child_tasks);
1647     }
1648     if (flags->hidden_helper) {
1649       taskdata->td_flags.task_serial = FALSE;
1650       // Increment the number of hidden helper tasks to be executed
1651       KMP_ATOMIC_INC(&__kmp_unexecuted_hidden_helper_tasks);
1652     }
1653   }
1654 
1655 #if OMPX_TASKGRAPH
1656   kmp_tdg_info_t *tdg = __kmp_find_tdg(__kmp_curr_tdg_idx);
1657   if (tdg && __kmp_tdg_is_recording(tdg->tdg_status) &&
1658       (task_entry != (kmp_routine_entry_t)__kmp_taskloop_task)) {
1659     taskdata->is_taskgraph = 1;
1660     taskdata->tdg = __kmp_global_tdgs[__kmp_curr_tdg_idx];
1661     taskdata->td_task_id = KMP_GEN_TASK_ID();
1662     taskdata->td_tdg_task_id = KMP_ATOMIC_INC(&__kmp_tdg_task_id);
1663   }
1664 #endif
1665   KA_TRACE(20, ("__kmp_task_alloc(exit): T#%d created task %p parent=%p\n",
1666                 gtid, taskdata, taskdata->td_parent));
1667 
1668   return task;
1669 }
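// Illustrative sketch of the single-allocation layout built above (the sizes
// below are placeholders; the real ones come from sizeof(kmp_taskdata_t) and
// the compiler-provided sizeof_kmp_task_t / sizeof_shareds):
//
//   | kmp_taskdata_t | kmp_task_t + private vars | pad | shareds |
//   0                ^taskdata+1                 ^shareds_offset
//
#if 0 // illustration only, not compiled
#include <cstddef>
#include <cstdio>
int main() {
  std::size_t sizeof_taskdata = 192;  // placeholder for sizeof(kmp_taskdata_t)
  std::size_t sizeof_kmp_task_t = 44; // kmp_task_t plus private variables
  std::size_t sizeof_shareds = 16;    // e.g. two shared-variable pointers
  std::size_t shareds_offset = sizeof_taskdata + sizeof_kmp_task_t;
  std::size_t align = 8; // sizeof(kmp_uint64); keeps the shareds pointers aligned
  if (shareds_offset & (align - 1))
    shareds_offset = (shareds_offset & ~(align - 1)) + align;
  std::printf("one block of %zu bytes, shareds at offset %zu\n",
              shareds_offset + sizeof_shareds, shareds_offset);
  return 0;
}
#endif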
1670 
1671 kmp_task_t *__kmpc_omp_task_alloc(ident_t *loc_ref, kmp_int32 gtid,
1672                                   kmp_int32 flags, size_t sizeof_kmp_task_t,
1673                                   size_t sizeof_shareds,
1674                                   kmp_routine_entry_t task_entry) {
1675   kmp_task_t *retval;
1676   kmp_tasking_flags_t *input_flags = (kmp_tasking_flags_t *)&flags;
1677   __kmp_assert_valid_gtid(gtid);
1678   input_flags->native = FALSE;
1679   // __kmp_task_alloc() sets up all other runtime flags
1680   KA_TRACE(10, ("__kmpc_omp_task_alloc(enter): T#%d loc=%p, flags=(%s %s %s) "
1681                 "sizeof_task=%ld sizeof_shared=%ld entry=%p\n",
1682                 gtid, loc_ref, input_flags->tiedness ? "tied  " : "untied",
1683                 input_flags->proxy ? "proxy" : "",
1684                 input_flags->detachable ? "detachable" : "", sizeof_kmp_task_t,
1685                 sizeof_shareds, task_entry));
1686 
1687   retval = __kmp_task_alloc(loc_ref, gtid, input_flags, sizeof_kmp_task_t,
1688                             sizeof_shareds, task_entry);
1689 
1690   KA_TRACE(20, ("__kmpc_omp_task_alloc(exit): T#%d retval %p\n", gtid, retval));
1691 
1692   return retval;
1693 }
1694 
1695 kmp_task_t *__kmpc_omp_target_task_alloc(ident_t *loc_ref, kmp_int32 gtid,
1696                                          kmp_int32 flags,
1697                                          size_t sizeof_kmp_task_t,
1698                                          size_t sizeof_shareds,
1699                                          kmp_routine_entry_t task_entry,
1700                                          kmp_int64 device_id) {
1701   auto &input_flags = reinterpret_cast<kmp_tasking_flags_t &>(flags);
1702   // target task is untied defined in the specification
1703   input_flags.tiedness = TASK_UNTIED;
1704   input_flags.target = 1;
1705 
1706   if (__kmp_enable_hidden_helper)
1707     input_flags.hidden_helper = TRUE;
1708 
1709   return __kmpc_omp_task_alloc(loc_ref, gtid, flags, sizeof_kmp_task_t,
1710                                sizeof_shareds, task_entry);
1711 }
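// Illustrative sketch (user code): a "target nowait" region is wrapped by the
// compiler in a deferrable task that reaches this entry; it is forced untied
// and, when __kmp_enable_hidden_helper is set, also marked hidden_helper so
// the hidden helper team can execute it.
#if 0 // illustration only, not compiled
#include <cstdio>
int main() {
  int x = 0;
#pragma omp target map(tofrom : x) nowait // becomes an untied target task
  { x = 1; }
#pragma omp taskwait // the deferred target task is a child; wait for it here
  std::printf("x = %d\n", x);
  return 0;
}
#endif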
1712 
1713 /*!
1714 @ingroup TASKING
1715 @param loc_ref location of the original task directive
1716 @param gtid Global Thread ID of encountering thread
1717 @param new_task task thunk allocated by __kmpc_omp_task_alloc() for the ''new
1718 task''
1719 @param naffins Number of affinity items
1720 @param affin_list List of affinity items
1721 @return Returns non-zero if registering affinity information was not successful.
1722  Returns 0 if registration was successful
1723 This entry registers the affinity information attached to a task with the task
1724 thunk structure kmp_taskdata_t.
1725 */
1726 kmp_int32
1727 __kmpc_omp_reg_task_with_affinity(ident_t *loc_ref, kmp_int32 gtid,
1728                                   kmp_task_t *new_task, kmp_int32 naffins,
1729                                   kmp_task_affinity_info_t *affin_list) {
1730   return 0;
1731 }
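// Illustrative sketch (user code, typical lowering assumed): an affinity
// clause on a task construct is what reaches this entry; the implementation
// above records nothing and simply reports success.
#if 0 // illustration only, not compiled
#include <cstddef>
#include <vector>
void touch_strided(std::vector<double> &a) {
#pragma omp parallel
#pragma omp single
  for (std::size_t i = 0; i < a.size(); i += 1024) {
#pragma omp task affinity(a[i]) firstprivate(i) shared(a)
    a[i] += 1.0;
  }
}
#endif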
1732 
1733 //  __kmp_invoke_task: invoke the specified task
1734 //
1735 // gtid: global thread ID of caller
1736 // task: the task to invoke
1737 // current_task: the task to resume after task invocation
1738 #ifdef __s390x__
1739 __attribute__((target("backchain")))
1740 #endif
1741 static void
1742 __kmp_invoke_task(kmp_int32 gtid, kmp_task_t *task,
1743                   kmp_taskdata_t *current_task) {
1744   kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
1745   kmp_info_t *thread;
1746   int discard = 0 /* false */;
1747   KA_TRACE(
1748       30, ("__kmp_invoke_task(enter): T#%d invoking task %p, current_task=%p\n",
1749            gtid, taskdata, current_task));
1750   KMP_DEBUG_ASSERT(task);
1751   if (UNLIKELY(taskdata->td_flags.proxy == TASK_PROXY &&
1752                taskdata->td_flags.complete == 1)) {
1753     // This is a proxy task that was already completed but it needs to run
1754     // its bottom-half finish
1755     KA_TRACE(
1756         30,
1757         ("__kmp_invoke_task: T#%d running bottom finish for proxy task %p\n",
1758          gtid, taskdata));
1759 
1760     __kmp_bottom_half_finish_proxy(gtid, task);
1761 
1762     KA_TRACE(30, ("__kmp_invoke_task(exit): T#%d completed bottom finish for "
1763                   "proxy task %p, resuming task %p\n",
1764                   gtid, taskdata, current_task));
1765 
1766     return;
1767   }
1768 
1769 #if OMPT_SUPPORT
1770   // For untied tasks, the first task executed only calls __kmpc_omp_task and
1771   // does not execute code.
1772   ompt_thread_info_t oldInfo;
1773   if (UNLIKELY(ompt_enabled.enabled)) {
1774     // Store the threads states and restore them after the task
1775     thread = __kmp_threads[gtid];
1776     oldInfo = thread->th.ompt_thread_info;
1777     thread->th.ompt_thread_info.wait_id = 0;
1778     thread->th.ompt_thread_info.state = (thread->th.th_team_serialized)
1779                                             ? ompt_state_work_serial
1780                                             : ompt_state_work_parallel;
1781     taskdata->ompt_task_info.frame.exit_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1782   }
1783 #endif
1784 
1785   // Proxy tasks are not handled by the runtime
1786   if (taskdata->td_flags.proxy != TASK_PROXY) {
1787     __kmp_task_start(gtid, task, current_task); // OMPT only if not discarded
1788   }
1789 
1790   // TODO: cancel tasks if the parallel region has also been cancelled
1791   // TODO: check if this sequence can be hoisted above __kmp_task_start
1792   // if cancellation has been enabled for this run ...
1793   if (UNLIKELY(__kmp_omp_cancellation)) {
1794     thread = __kmp_threads[gtid];
1795     kmp_team_t *this_team = thread->th.th_team;
1796     kmp_taskgroup_t *taskgroup = taskdata->td_taskgroup;
1797     if ((taskgroup && taskgroup->cancel_request) ||
1798         (this_team->t.t_cancel_request == cancel_parallel)) {
1799 #if OMPT_SUPPORT && OMPT_OPTIONAL
1800       ompt_data_t *task_data;
1801       if (UNLIKELY(ompt_enabled.ompt_callback_cancel)) {
1802         __ompt_get_task_info_internal(0, NULL, &task_data, NULL, NULL, NULL);
1803         ompt_callbacks.ompt_callback(ompt_callback_cancel)(
1804             task_data,
1805             ((taskgroup && taskgroup->cancel_request) ? ompt_cancel_taskgroup
1806                                                       : ompt_cancel_parallel) |
1807                 ompt_cancel_discarded_task,
1808             NULL);
1809       }
1810 #endif
1811       KMP_COUNT_BLOCK(TASK_cancelled);
1812       // this task belongs to a task group and we need to cancel it
1813       discard = 1 /* true */;
1814     }
1815   }
1816 
1817   // Invoke the task routine and pass in relevant data.
1818   // Thunks generated by gcc take a different argument list.
1819   if (!discard) {
1820     if (taskdata->td_flags.tiedness == TASK_UNTIED) {
1821       taskdata->td_last_tied = current_task->td_last_tied;
1822       KMP_DEBUG_ASSERT(taskdata->td_last_tied);
1823     }
1824 #if KMP_STATS_ENABLED
1825     KMP_COUNT_BLOCK(TASK_executed);
1826     switch (KMP_GET_THREAD_STATE()) {
1827     case FORK_JOIN_BARRIER:
1828       KMP_PUSH_PARTITIONED_TIMER(OMP_task_join_bar);
1829       break;
1830     case PLAIN_BARRIER:
1831       KMP_PUSH_PARTITIONED_TIMER(OMP_task_plain_bar);
1832       break;
1833     case TASKYIELD:
1834       KMP_PUSH_PARTITIONED_TIMER(OMP_task_taskyield);
1835       break;
1836     case TASKWAIT:
1837       KMP_PUSH_PARTITIONED_TIMER(OMP_task_taskwait);
1838       break;
1839     case TASKGROUP:
1840       KMP_PUSH_PARTITIONED_TIMER(OMP_task_taskgroup);
1841       break;
1842     default:
1843       KMP_PUSH_PARTITIONED_TIMER(OMP_task_immediate);
1844       break;
1845     }
1846 #endif // KMP_STATS_ENABLED
1847 
1848 // OMPT task begin
1849 #if OMPT_SUPPORT
1850     if (UNLIKELY(ompt_enabled.enabled))
1851       __ompt_task_start(task, current_task, gtid);
1852 #endif
1853 #if OMPT_SUPPORT && OMPT_OPTIONAL
1854     if (UNLIKELY(ompt_enabled.ompt_callback_dispatch &&
1855                  taskdata->ompt_task_info.dispatch_chunk.iterations > 0)) {
1856       ompt_data_t instance = ompt_data_none;
1857       instance.ptr = &(taskdata->ompt_task_info.dispatch_chunk);
1858       ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
1859       ompt_callbacks.ompt_callback(ompt_callback_dispatch)(
1860           &(team_info->parallel_data), &(taskdata->ompt_task_info.task_data),
1861           ompt_dispatch_taskloop_chunk, instance);
1862       taskdata->ompt_task_info.dispatch_chunk = {0, 0};
1863     }
1864 #endif // OMPT_SUPPORT && OMPT_OPTIONAL
1865 
1866 #if OMPD_SUPPORT
1867     if (ompd_state & OMPD_ENABLE_BP)
1868       ompd_bp_task_begin();
1869 #endif
1870 
1871 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1872     kmp_uint64 cur_time;
1873     kmp_int32 kmp_itt_count_task =
1874         __kmp_forkjoin_frames_mode == 3 && !taskdata->td_flags.task_serial &&
1875         current_task->td_flags.tasktype == TASK_IMPLICIT;
1876     if (kmp_itt_count_task) {
1877       thread = __kmp_threads[gtid];
1878       // Time outer level explicit task on barrier for adjusting imbalance time
1879       if (thread->th.th_bar_arrive_time)
1880         cur_time = __itt_get_timestamp();
1881       else
1882         kmp_itt_count_task = 0; // thread is not on a barrier - skip timing
1883     }
1884     KMP_FSYNC_ACQUIRED(taskdata); // acquired self (new task)
1885 #endif
1886 
1887 #if ENABLE_LIBOMPTARGET
1888     if (taskdata->td_target_data.async_handle != NULL) {
1889       // If we have a valid target async handle, that means that we have already
1890       // executed the task routine once. We must query for the handle completion
1891       // instead of re-executing the routine.
1892       KMP_ASSERT(tgt_target_nowait_query);
1893       tgt_target_nowait_query(&taskdata->td_target_data.async_handle);
1894     } else
1895 #endif
1896     if (task->routine != NULL) {
1897 #ifdef KMP_GOMP_COMPAT
1898       if (taskdata->td_flags.native) {
1899         ((void (*)(void *))(*(task->routine)))(task->shareds);
1900       } else
1901 #endif /* KMP_GOMP_COMPAT */
1902       {
1903         (*(task->routine))(gtid, task);
1904       }
1905     }
1906     KMP_POP_PARTITIONED_TIMER();
1907 
1908 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1909     if (kmp_itt_count_task) {
1910       // Barrier imbalance - adjust arrive time with the task duration
1911       thread->th.th_bar_arrive_time += (__itt_get_timestamp() - cur_time);
1912     }
1913     KMP_FSYNC_CANCEL(taskdata); // destroy self (just executed)
1914     KMP_FSYNC_RELEASING(taskdata->td_parent); // releasing parent
1915 #endif
1916   }
1917 
1918 #if OMPD_SUPPORT
1919   if (ompd_state & OMPD_ENABLE_BP)
1920     ompd_bp_task_end();
1921 #endif
1922 
1923   // Proxy tasks are not handled by the runtime
1924   if (taskdata->td_flags.proxy != TASK_PROXY) {
1925 #if OMPT_SUPPORT
1926     if (UNLIKELY(ompt_enabled.enabled)) {
1927       thread->th.ompt_thread_info = oldInfo;
1928       if (taskdata->td_flags.tiedness == TASK_TIED) {
1929         taskdata->ompt_task_info.frame.exit_frame = ompt_data_none;
1930       }
1931       __kmp_task_finish<true>(gtid, task, current_task);
1932     } else
1933 #endif
1934       __kmp_task_finish<false>(gtid, task, current_task);
1935   }
1936 #if OMPT_SUPPORT
1937   else if (UNLIKELY(ompt_enabled.enabled && taskdata->td_flags.target)) {
1938     __ompt_task_finish(task, current_task, ompt_task_switch);
1939   }
1940 #endif
1941 
1942   KA_TRACE(
1943       30,
1944       ("__kmp_invoke_task(exit): T#%d completed task %p, resuming task %p\n",
1945        gtid, taskdata, current_task));
1946   return;
1947 }
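// Illustrative sketch (user code, assuming OMP_CANCELLATION=true so that
// __kmp_omp_cancellation is set): once one task requests taskgroup
// cancellation, tasks of that taskgroup that have not started yet are
// discarded by the check above instead of invoking their routine.
#if 0 // illustration only, not compiled
#include <cstdio>
int main() {
#pragma omp parallel
#pragma omp single
#pragma omp taskgroup
  {
    for (int i = 0; i < 1000; ++i) {
#pragma omp task firstprivate(i)
      {
        std::printf("task %d started\n", i);
        if (i == 0) {
#pragma omp cancel taskgroup // request cancellation of the enclosing taskgroup
        }
      }
    }
  } // typically far fewer than 1000 lines are printed
  return 0;
}
#endif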
1948 
1949 // __kmpc_omp_task_parts: Schedule a thread-switchable task for execution
1950 //
1951 // loc_ref: location of original task pragma (ignored)
1952 // gtid: Global Thread ID of encountering thread
1953 // new_task: task thunk allocated by __kmp_omp_task_alloc() for the ''new task''
1954 // Returns:
1955 //    TASK_CURRENT_NOT_QUEUED (0) if did not suspend and queue current task to
1956 //    be resumed later.
1957 //    TASK_CURRENT_QUEUED (1) if suspended and queued the current task to be
1958 //    resumed later.
1959 kmp_int32 __kmpc_omp_task_parts(ident_t *loc_ref, kmp_int32 gtid,
1960                                 kmp_task_t *new_task) {
1961   kmp_taskdata_t *new_taskdata = KMP_TASK_TO_TASKDATA(new_task);
1962 
1963   KA_TRACE(10, ("__kmpc_omp_task_parts(enter): T#%d loc=%p task=%p\n", gtid,
1964                 loc_ref, new_taskdata));
1965 
1966 #if OMPT_SUPPORT
1967   kmp_taskdata_t *parent;
1968   if (UNLIKELY(ompt_enabled.enabled)) {
1969     parent = new_taskdata->td_parent;
1970     if (ompt_enabled.ompt_callback_task_create) {
1971       ompt_callbacks.ompt_callback(ompt_callback_task_create)(
1972           &(parent->ompt_task_info.task_data), &(parent->ompt_task_info.frame),
1973           &(new_taskdata->ompt_task_info.task_data),
1974           TASK_TYPE_DETAILS_FORMAT(new_taskdata), 0,
1975           OMPT_GET_RETURN_ADDRESS(0));
1976     }
1977   }
1978 #endif
1979 
1980   /* Should we execute the new task or queue it? For now, let's just always try
1981      to queue it.  If the queue fills up, then we'll execute it.  */
1982 
1983   if (__kmp_push_task(gtid, new_task) == TASK_NOT_PUSHED) // if cannot defer
1984   { // Execute this task immediately
1985     kmp_taskdata_t *current_task = __kmp_threads[gtid]->th.th_current_task;
1986     new_taskdata->td_flags.task_serial = 1;
1987     __kmp_invoke_task(gtid, new_task, current_task);
1988   }
1989 
1990   KA_TRACE(
1991       10,
1992       ("__kmpc_omp_task_parts(exit): T#%d returning TASK_CURRENT_NOT_QUEUED: "
1993        "loc=%p task=%p, return: TASK_CURRENT_NOT_QUEUED\n",
1994        gtid, loc_ref, new_taskdata));
1995 
1996 #if OMPT_SUPPORT
1997   if (UNLIKELY(ompt_enabled.enabled)) {
1998     parent->ompt_task_info.frame.enter_frame = ompt_data_none;
1999     parent->ompt_task_info.frame.enter_frame_flags = OMPT_FRAME_FLAGS_RUNTIME;
2000   }
2001 #endif
2002   return TASK_CURRENT_NOT_QUEUED;
2003 }
2004 
2005 // __kmp_omp_task: Schedule a non-thread-switchable task for execution
2006 //
2007 // gtid: Global Thread ID of encountering thread
2008 // new_task: non-thread-switchable task thunk allocated by __kmp_omp_task_alloc()
2009 // serialize_immediate: if TRUE then if the task is executed immediately its
2010 // execution will be serialized
2011 // Returns:
2012 //    TASK_CURRENT_NOT_QUEUED (0) if did not suspend and queue current task to
2013 //    be resumed later.
2014 //    TASK_CURRENT_QUEUED (1) if suspended and queued the current task to be
2015 //    resumed later.
2016 kmp_int32 __kmp_omp_task(kmp_int32 gtid, kmp_task_t *new_task,
2017                          bool serialize_immediate) {
2018   kmp_taskdata_t *new_taskdata = KMP_TASK_TO_TASKDATA(new_task);
2019 
2020 #if OMPX_TASKGRAPH
2021   if (new_taskdata->is_taskgraph &&
2022       __kmp_tdg_is_recording(new_taskdata->tdg->tdg_status)) {
2023     kmp_tdg_info_t *tdg = new_taskdata->tdg;
2024     // extend the record_map if needed
2025     if (new_taskdata->td_tdg_task_id >= new_taskdata->tdg->map_size) {
2026       __kmp_acquire_bootstrap_lock(&tdg->graph_lock);
2027       // map_size could have been updated by another thread if recursive
2028       // taskloop
2029       if (new_taskdata->td_tdg_task_id >= tdg->map_size) {
2030         kmp_uint old_size = tdg->map_size;
2031         kmp_uint new_size = old_size * 2;
2032         kmp_node_info_t *old_record = tdg->record_map;
2033         kmp_node_info_t *new_record = (kmp_node_info_t *)__kmp_allocate(
2034             new_size * sizeof(kmp_node_info_t));
2035 
2036         KMP_MEMCPY(new_record, old_record, old_size * sizeof(kmp_node_info_t));
2037         tdg->record_map = new_record;
2038 
2039         __kmp_free(old_record);
2040 
2041         for (kmp_int i = old_size; i < new_size; i++) {
2042           kmp_int32 *successorsList = (kmp_int32 *)__kmp_allocate(
2043               __kmp_successors_size * sizeof(kmp_int32));
2044           new_record[i].task = nullptr;
2045           new_record[i].successors = successorsList;
2046           new_record[i].nsuccessors = 0;
2047           new_record[i].npredecessors = 0;
2048           new_record[i].successors_size = __kmp_successors_size;
2049           KMP_ATOMIC_ST_REL(&new_record[i].npredecessors_counter, 0);
2050         }
2051         // update the size at the end, so that we avoid other
2052         // threads using old_record while map_size is already updated
2053         tdg->map_size = new_size;
2054       }
2055       __kmp_release_bootstrap_lock(&tdg->graph_lock);
2056     }
2057     // record a task
2058     if (tdg->record_map[new_taskdata->td_tdg_task_id].task == nullptr) {
2059       tdg->record_map[new_taskdata->td_tdg_task_id].task = new_task;
2060       tdg->record_map[new_taskdata->td_tdg_task_id].parent_task =
2061           new_taskdata->td_parent;
2062       KMP_ATOMIC_INC(&tdg->num_tasks);
2063     }
2064   }
2065 #endif
2066 
2067   /* Should we execute the new task or queue it? For now, let's just always try
2068      to queue it.  If the queue fills up, then we'll execute it.  */
2069   if (new_taskdata->td_flags.proxy == TASK_PROXY ||
2070       __kmp_push_task(gtid, new_task) == TASK_NOT_PUSHED) // if cannot defer
2071   { // Execute this task immediately
2072     kmp_taskdata_t *current_task = __kmp_threads[gtid]->th.th_current_task;
2073     if (serialize_immediate)
2074       new_taskdata->td_flags.task_serial = 1;
2075     __kmp_invoke_task(gtid, new_task, current_task);
2076   } else if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME &&
2077              __kmp_wpolicy_passive) {
2078     kmp_info_t *this_thr = __kmp_threads[gtid];
2079     kmp_team_t *team = this_thr->th.th_team;
2080     kmp_int32 nthreads = this_thr->th.th_team_nproc;
2081     for (int i = 0; i < nthreads; ++i) {
2082       kmp_info_t *thread = team->t.t_threads[i];
2083       if (thread == this_thr)
2084         continue;
2085       if (thread->th.th_sleep_loc != NULL) {
2086         __kmp_null_resume_wrapper(thread);
2087         break; // awake one thread at a time
2088       }
2089     }
2090   }
2091   return TASK_CURRENT_NOT_QUEUED;
2092 }
2093 
2094 // __kmpc_omp_task: Wrapper around __kmp_omp_task to schedule a
2095 // non-thread-switchable task from the parent thread only!
2096 //
2097 // loc_ref: location of original task pragma (ignored)
2098 // gtid: Global Thread ID of encountering thread
2099 // new_task: non-thread-switchable task thunk allocated by
2100 // __kmp_omp_task_alloc()
2101 // Returns:
2102 //    TASK_CURRENT_NOT_QUEUED (0) if did not suspend and queue current task to
2103 //    be resumed later.
2104 //    TASK_CURRENT_QUEUED (1) if suspended and queued the current task to be
2105 //    resumed later.
2106 kmp_int32 __kmpc_omp_task(ident_t *loc_ref, kmp_int32 gtid,
2107                           kmp_task_t *new_task) {
2108   kmp_int32 res;
2109   KMP_SET_THREAD_STATE_BLOCK(EXPLICIT_TASK);
2110 
2111 #if KMP_DEBUG || OMPT_SUPPORT
2112   kmp_taskdata_t *new_taskdata = KMP_TASK_TO_TASKDATA(new_task);
2113 #endif
2114   KA_TRACE(10, ("__kmpc_omp_task(enter): T#%d loc=%p task=%p\n", gtid, loc_ref,
2115                 new_taskdata));
2116   __kmp_assert_valid_gtid(gtid);
2117 
2118 #if OMPT_SUPPORT
2119   kmp_taskdata_t *parent = NULL;
2120   if (UNLIKELY(ompt_enabled.enabled)) {
2121     if (!new_taskdata->td_flags.started) {
2122       OMPT_STORE_RETURN_ADDRESS(gtid);
2123       parent = new_taskdata->td_parent;
2124       if (!parent->ompt_task_info.frame.enter_frame.ptr) {
2125         parent->ompt_task_info.frame.enter_frame.ptr =
2126             OMPT_GET_FRAME_ADDRESS(0);
2127       }
2128       if (ompt_enabled.ompt_callback_task_create) {
2129         ompt_callbacks.ompt_callback(ompt_callback_task_create)(
2130             &(parent->ompt_task_info.task_data),
2131             &(parent->ompt_task_info.frame),
2132             &(new_taskdata->ompt_task_info.task_data),
2133             TASK_TYPE_DETAILS_FORMAT(new_taskdata), 0,
2134             OMPT_LOAD_RETURN_ADDRESS(gtid));
2135       }
2136     } else {
2137       // We are scheduling the continuation of an UNTIED task.
2138       // Scheduling back to the parent task.
2139       __ompt_task_finish(new_task,
2140                          new_taskdata->ompt_task_info.scheduling_parent,
2141                          ompt_task_switch);
2142       new_taskdata->ompt_task_info.frame.exit_frame = ompt_data_none;
2143     }
2144   }
2145 #endif
2146 
2147   res = __kmp_omp_task(gtid, new_task, true);
2148 
2149   KA_TRACE(10, ("__kmpc_omp_task(exit): T#%d returning "
2150                 "TASK_CURRENT_NOT_QUEUED: loc=%p task=%p\n",
2151                 gtid, loc_ref, new_taskdata));
2152 #if OMPT_SUPPORT
2153   if (UNLIKELY(ompt_enabled.enabled && parent != NULL)) {
2154     parent->ompt_task_info.frame.enter_frame = ompt_data_none;
2155   }
2156 #endif
2157   return res;
2158 }
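// Illustrative sketch (user code, typical lowering assumed): each "omp task"
// below becomes a __kmpc_omp_task_alloc + __kmpc_omp_task pair; the runtime
// tries to push the task onto the encountering thread's deque and only runs
// it immediately if the push fails (see __kmp_omp_task above).
#if 0 // illustration only, not compiled
#include <cstdio>
static int fib(int n) {
  if (n < 2)
    return n;
  int a, b;
#pragma omp task shared(a) firstprivate(n)
  a = fib(n - 1);
#pragma omp task shared(b) firstprivate(n)
  b = fib(n - 2);
#pragma omp taskwait
  return a + b;
}
int main() {
  int r = 0;
#pragma omp parallel
#pragma omp single
  r = fib(20);
  std::printf("fib(20) = %d\n", r);
  return 0;
}
#endif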
2159 
2160 // __kmp_omp_taskloop_task: Wrapper around __kmp_omp_task to schedule
2161 // a taskloop task with the correct OMPT return address
2162 //
2163 // loc_ref: location of original task pragma (ignored)
2164 // gtid: Global Thread ID of encountering thread
2165 // new_task: non-thread-switchable task thunk allocated by
2166 // __kmp_omp_task_alloc()
2167 // codeptr_ra: return address for OMPT callback
2168 // Returns:
2169 //    TASK_CURRENT_NOT_QUEUED (0) if did not suspend and queue current task to
2170 //    be resumed later.
2171 //    TASK_CURRENT_QUEUED (1) if suspended and queued the current task to be
2172 //    resumed later.
2173 kmp_int32 __kmp_omp_taskloop_task(ident_t *loc_ref, kmp_int32 gtid,
2174                                   kmp_task_t *new_task, void *codeptr_ra) {
2175   kmp_int32 res;
2176   KMP_SET_THREAD_STATE_BLOCK(EXPLICIT_TASK);
2177 
2178 #if KMP_DEBUG || OMPT_SUPPORT
2179   kmp_taskdata_t *new_taskdata = KMP_TASK_TO_TASKDATA(new_task);
2180 #endif
2181   KA_TRACE(10, ("__kmp_omp_taskloop_task(enter): T#%d loc=%p task=%p\n", gtid,
2182                 loc_ref, new_taskdata));
2183 
2184 #if OMPT_SUPPORT
2185   kmp_taskdata_t *parent = NULL;
2186   if (UNLIKELY(ompt_enabled.enabled && !new_taskdata->td_flags.started)) {
2187     parent = new_taskdata->td_parent;
2188     if (!parent->ompt_task_info.frame.enter_frame.ptr)
2189       parent->ompt_task_info.frame.enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
2190     if (ompt_enabled.ompt_callback_task_create) {
2191       ompt_callbacks.ompt_callback(ompt_callback_task_create)(
2192           &(parent->ompt_task_info.task_data), &(parent->ompt_task_info.frame),
2193           &(new_taskdata->ompt_task_info.task_data),
2194           TASK_TYPE_DETAILS_FORMAT(new_taskdata), 0, codeptr_ra);
2195     }
2196   }
2197 #endif
2198 
2199   res = __kmp_omp_task(gtid, new_task, true);
2200 
2201   KA_TRACE(10, ("__kmp_omp_taskloop_task(exit): T#%d returning "
2202                 "TASK_CURRENT_NOT_QUEUED: loc=%p task=%p\n",
2203                 gtid, loc_ref, new_taskdata));
2204 #if OMPT_SUPPORT
2205   if (UNLIKELY(ompt_enabled.enabled && parent != NULL)) {
2206     parent->ompt_task_info.frame.enter_frame = ompt_data_none;
2207   }
2208 #endif
2209   return res;
2210 }
2211 
2212 template <bool ompt>
2213 static kmp_int32 __kmpc_omp_taskwait_template(ident_t *loc_ref, kmp_int32 gtid,
2214                                               void *frame_address,
2215                                               void *return_address) {
2216   kmp_taskdata_t *taskdata = nullptr;
2217   kmp_info_t *thread;
2218   int thread_finished = FALSE;
2219   KMP_SET_THREAD_STATE_BLOCK(TASKWAIT);
2220 
2221   KA_TRACE(10, ("__kmpc_omp_taskwait(enter): T#%d loc=%p\n", gtid, loc_ref));
2222   KMP_DEBUG_ASSERT(gtid >= 0);
2223 
2224   if (__kmp_tasking_mode != tskm_immediate_exec) {
2225     thread = __kmp_threads[gtid];
2226     taskdata = thread->th.th_current_task;
2227 
2228 #if OMPT_SUPPORT && OMPT_OPTIONAL
2229     ompt_data_t *my_task_data;
2230     ompt_data_t *my_parallel_data;
2231 
2232     if (ompt) {
2233       my_task_data = &(taskdata->ompt_task_info.task_data);
2234       my_parallel_data = OMPT_CUR_TEAM_DATA(thread);
2235 
2236       taskdata->ompt_task_info.frame.enter_frame.ptr = frame_address;
2237 
2238       if (ompt_enabled.ompt_callback_sync_region) {
2239         ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
2240             ompt_sync_region_taskwait, ompt_scope_begin, my_parallel_data,
2241             my_task_data, return_address);
2242       }
2243 
2244       if (ompt_enabled.ompt_callback_sync_region_wait) {
2245         ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
2246             ompt_sync_region_taskwait, ompt_scope_begin, my_parallel_data,
2247             my_task_data, return_address);
2248       }
2249     }
2250 #endif // OMPT_SUPPORT && OMPT_OPTIONAL
2251 
2252 // Debugger: The taskwait is active. Store the location and the thread that
2253 // encountered the taskwait.
2254 #if USE_ITT_BUILD
2255 // Note: These values are used by ITT events as well.
2256 #endif /* USE_ITT_BUILD */
2257     taskdata->td_taskwait_counter += 1;
2258     taskdata->td_taskwait_ident = loc_ref;
2259     taskdata->td_taskwait_thread = gtid + 1;
2260 
2261 #if USE_ITT_BUILD
2262     void *itt_sync_obj = NULL;
2263 #if USE_ITT_NOTIFY
2264     KMP_ITT_TASKWAIT_STARTING(itt_sync_obj);
2265 #endif /* USE_ITT_NOTIFY */
2266 #endif /* USE_ITT_BUILD */
2267 
2268     bool must_wait =
2269         !taskdata->td_flags.team_serial && !taskdata->td_flags.final;
2270 
2271     must_wait = must_wait || (thread->th.th_task_team != NULL &&
2272                               thread->th.th_task_team->tt.tt_found_proxy_tasks);
2273     // If a hidden helper task has been encountered, we must enable the wait here.
2274     must_wait =
2275         must_wait ||
2276         (__kmp_enable_hidden_helper && thread->th.th_task_team != NULL &&
2277          thread->th.th_task_team->tt.tt_hidden_helper_task_encountered);
2278 
2279     if (must_wait) {
2280       kmp_flag_32<false, false> flag(
2281           RCAST(std::atomic<kmp_uint32> *,
2282                 &(taskdata->td_incomplete_child_tasks)),
2283           0U);
2284       while (KMP_ATOMIC_LD_ACQ(&taskdata->td_incomplete_child_tasks) != 0) {
2285         flag.execute_tasks(thread, gtid, FALSE,
2286                            &thread_finished USE_ITT_BUILD_ARG(itt_sync_obj),
2287                            __kmp_task_stealing_constraint);
2288       }
2289     }
2290 #if USE_ITT_BUILD
2291     KMP_ITT_TASKWAIT_FINISHED(itt_sync_obj);
2292     KMP_FSYNC_ACQUIRED(taskdata); // acquire self - sync with children
2293 #endif /* USE_ITT_BUILD */
2294 
2295     // Debugger:  The taskwait is completed. Location remains, but thread is
2296     // negated.
2297     taskdata->td_taskwait_thread = -taskdata->td_taskwait_thread;
2298 
2299 #if OMPT_SUPPORT && OMPT_OPTIONAL
2300     if (ompt) {
2301       if (ompt_enabled.ompt_callback_sync_region_wait) {
2302         ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
2303             ompt_sync_region_taskwait, ompt_scope_end, my_parallel_data,
2304             my_task_data, return_address);
2305       }
2306       if (ompt_enabled.ompt_callback_sync_region) {
2307         ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
2308             ompt_sync_region_taskwait, ompt_scope_end, my_parallel_data,
2309             my_task_data, return_address);
2310       }
2311       taskdata->ompt_task_info.frame.enter_frame = ompt_data_none;
2312     }
2313 #endif // OMPT_SUPPORT && OMPT_OPTIONAL
2314   }
2315 
2316   KA_TRACE(10, ("__kmpc_omp_taskwait(exit): T#%d task %p finished waiting, "
2317                 "returning TASK_CURRENT_NOT_QUEUED\n",
2318                 gtid, taskdata));
2319 
2320   return TASK_CURRENT_NOT_QUEUED;
2321 }
2322 
2323 #if OMPT_SUPPORT && OMPT_OPTIONAL
2324 OMPT_NOINLINE
2325 static kmp_int32 __kmpc_omp_taskwait_ompt(ident_t *loc_ref, kmp_int32 gtid,
2326                                           void *frame_address,
2327                                           void *return_address) {
2328   return __kmpc_omp_taskwait_template<true>(loc_ref, gtid, frame_address,
2329                                             return_address);
2330 }
2331 #endif // OMPT_SUPPORT && OMPT_OPTIONAL
2332 
2333 // __kmpc_omp_taskwait: Wait until all tasks generated by the current task are
2334 // complete
2335 kmp_int32 __kmpc_omp_taskwait(ident_t *loc_ref, kmp_int32 gtid) {
2336 #if OMPT_SUPPORT && OMPT_OPTIONAL
2337   if (UNLIKELY(ompt_enabled.enabled)) {
2338     OMPT_STORE_RETURN_ADDRESS(gtid);
2339     return __kmpc_omp_taskwait_ompt(loc_ref, gtid, OMPT_GET_FRAME_ADDRESS(0),
2340                                     OMPT_LOAD_RETURN_ADDRESS(gtid));
2341   }
2342 #endif
2343   return __kmpc_omp_taskwait_template<false>(loc_ref, gtid, NULL, NULL);
2344 }
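// Illustrative sketch (user code): taskwait only waits for the encountering
// task's direct children -- the td_incomplete_child_tasks counter that the
// flag above spins on -- and the waiting thread may execute other queued
// tasks instead of idling.
#if 0 // illustration only, not compiled
int main() {
  int a = 0;
#pragma omp parallel
#pragma omp single
  {
#pragma omp task shared(a)
    a = 1; // direct child: guaranteed done once the taskwait returns
#pragma omp task
    {
#pragma omp task
      { /* grandchild: may still be pending after the taskwait */ }
    }
#pragma omp taskwait
    // here a == 1, but the grandchild above is not necessarily finished
  }
  return 0;
}
#endif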
2345 
2346 // __kmpc_omp_taskyield: switch to a different task
2347 kmp_int32 __kmpc_omp_taskyield(ident_t *loc_ref, kmp_int32 gtid, int end_part) {
2348   kmp_taskdata_t *taskdata = NULL;
2349   kmp_info_t *thread;
2350   int thread_finished = FALSE;
2351 
2352   KMP_COUNT_BLOCK(OMP_TASKYIELD);
2353   KMP_SET_THREAD_STATE_BLOCK(TASKYIELD);
2354 
2355   KA_TRACE(10, ("__kmpc_omp_taskyield(enter): T#%d loc=%p end_part = %d\n",
2356                 gtid, loc_ref, end_part));
2357   __kmp_assert_valid_gtid(gtid);
2358 
2359   if (__kmp_tasking_mode != tskm_immediate_exec && __kmp_init_parallel) {
2360     thread = __kmp_threads[gtid];
2361     taskdata = thread->th.th_current_task;
2362 // Should we model this as a task wait or not?
2363 // Debugger: The taskwait is active. Store the location and the thread that
2364 // encountered the taskwait.
2365 #if USE_ITT_BUILD
2366 // Note: These values are used by ITT events as well.
2367 #endif /* USE_ITT_BUILD */
2368     taskdata->td_taskwait_counter += 1;
2369     taskdata->td_taskwait_ident = loc_ref;
2370     taskdata->td_taskwait_thread = gtid + 1;
2371 
2372 #if USE_ITT_BUILD
2373     void *itt_sync_obj = NULL;
2374 #if USE_ITT_NOTIFY
2375     KMP_ITT_TASKWAIT_STARTING(itt_sync_obj);
2376 #endif /* USE_ITT_NOTIFY */
2377 #endif /* USE_ITT_BUILD */
2378     if (!taskdata->td_flags.team_serial) {
2379       kmp_task_team_t *task_team = thread->th.th_task_team;
2380       if (task_team != NULL) {
2381         if (KMP_TASKING_ENABLED(task_team)) {
2382 #if OMPT_SUPPORT
2383           if (UNLIKELY(ompt_enabled.enabled))
2384             thread->th.ompt_thread_info.ompt_task_yielded = 1;
2385 #endif
2386           __kmp_execute_tasks_32(
2387               thread, gtid, (kmp_flag_32<> *)NULL, FALSE,
2388               &thread_finished USE_ITT_BUILD_ARG(itt_sync_obj),
2389               __kmp_task_stealing_constraint);
2390 #if OMPT_SUPPORT
2391           if (UNLIKELY(ompt_enabled.enabled))
2392             thread->th.ompt_thread_info.ompt_task_yielded = 0;
2393 #endif
2394         }
2395       }
2396     }
2397 #if USE_ITT_BUILD
2398     KMP_ITT_TASKWAIT_FINISHED(itt_sync_obj);
2399 #endif /* USE_ITT_BUILD */
2400 
2401     // Debugger:  The taskwait is completed. Location remains, but thread is
2402     // negated.
2403     taskdata->td_taskwait_thread = -taskdata->td_taskwait_thread;
2404   }
2405 
2406   KA_TRACE(10, ("__kmpc_omp_taskyield(exit): T#%d task %p resuming, "
2407                 "returning TASK_CURRENT_NOT_QUEUED\n",
2408                 gtid, taskdata));
2409 
2410   return TASK_CURRENT_NOT_QUEUED;
2411 }
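// Illustrative sketch (user code, the common lock-polling pattern): taskyield
// is only a scheduling hint; the routine above switches to another queued task
// when tasking is active and is otherwise effectively a no-op.
#if 0 // illustration only, not compiled
#include <omp.h>
void wait_for_lock(omp_lock_t *lock) {
  while (!omp_test_lock(lock)) {
#pragma omp taskyield // let this thread run other ready tasks meanwhile
  }
  // ... work under the lock ...
  omp_unset_lock(lock);
}
#endif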
2412 
2413 // Task Reduction implementation
2414 //
2415 // Note: the initial implementation didn't take into account the possibility
2416 // of specifying omp_orig for the initializer of the UDR (user defined reduction).
2417 // The corrected implementation takes the omp_orig object into account.
2418 // The compiler is free to use the old implementation if omp_orig is not specified.
2419 
2420 /*!
2421 @ingroup BASIC_TYPES
2422 @{
2423 */
2424 
2425 /*!
2426 Flags for special info per task reduction item.
2427 */
2428 typedef struct kmp_taskred_flags {
2429   /*! 1 - use lazy alloc/init (e.g. big objects, num tasks < num threads) */
2430   unsigned lazy_priv : 1;
2431   unsigned reserved31 : 31;
2432 } kmp_taskred_flags_t;
2433 
2434 /*!
2435 Internal struct for reduction data item related info set up by compiler.
2436 */
2437 typedef struct kmp_task_red_input {
2438   void *reduce_shar; /**< shared between tasks item to reduce into */
2439   size_t reduce_size; /**< size of data item in bytes */
2440   // three compiler-generated routines (init, fini are optional):
2441   void *reduce_init; /**< data initialization routine (single parameter) */
2442   void *reduce_fini; /**< data finalization routine */
2443   void *reduce_comb; /**< data combiner routine */
2444   kmp_taskred_flags_t flags; /**< flags for additional info from compiler */
2445 } kmp_task_red_input_t;
2446 
2447 /*!
2448 Internal struct for reduction data item related info saved by the library.
2449 */
2450 typedef struct kmp_taskred_data {
2451   void *reduce_shar; /**< shared between tasks item to reduce into */
2452   size_t reduce_size; /**< size of data item */
2453   kmp_taskred_flags_t flags; /**< flags for additional info from compiler */
2454   void *reduce_priv; /**< array of thread specific items */
2455   void *reduce_pend; /**< end of private data for faster comparison op */
2456   // three compiler-generated routines (init, fini are optional):
2457   void *reduce_comb; /**< data combiner routine */
2458   void *reduce_init; /**< data initialization routine (two parameters) */
2459   void *reduce_fini; /**< data finalization routine */
2460   void *reduce_orig; /**< original item (can be used in UDR initializer) */
2461 } kmp_taskred_data_t;
2462 
2463 /*!
2464 Internal struct for reduction data item related info set up by compiler.
2465 
2466 New interface: added reduce_orig field to provide omp_orig for UDR initializer.
2467 */
2468 typedef struct kmp_taskred_input {
2469   void *reduce_shar; /**< shared between tasks item to reduce into */
2470   void *reduce_orig; /**< original reduction item used for initialization */
2471   size_t reduce_size; /**< size of data item */
2472   // three compiler-generated routines (init, fini are optional):
2473   void *reduce_init; /**< data initialization routine (two parameters) */
2474   void *reduce_fini; /**< data finalization routine */
2475   void *reduce_comb; /**< data combiner routine */
2476   kmp_taskred_flags_t flags; /**< flags for additional info from compiler */
2477 } kmp_taskred_input_t;
2478 /*!
2479 @}
2480 */
2481 
2482 template <typename T> void __kmp_assign_orig(kmp_taskred_data_t &item, T &src);
2483 template <>
2484 void __kmp_assign_orig<kmp_task_red_input_t>(kmp_taskred_data_t &item,
2485                                              kmp_task_red_input_t &src) {
2486   item.reduce_orig = NULL;
2487 }
2488 template <>
2489 void __kmp_assign_orig<kmp_taskred_input_t>(kmp_taskred_data_t &item,
2490                                             kmp_taskred_input_t &src) {
2491   if (src.reduce_orig != NULL) {
2492     item.reduce_orig = src.reduce_orig;
2493   } else {
2494     item.reduce_orig = src.reduce_shar;
2495   } // non-NULL reduce_orig means new interface used
2496 }
2497 
2498 template <typename T> void __kmp_call_init(kmp_taskred_data_t &item, size_t j);
2499 template <>
2500 void __kmp_call_init<kmp_task_red_input_t>(kmp_taskred_data_t &item,
2501                                            size_t offset) {
2502   ((void (*)(void *))item.reduce_init)((char *)(item.reduce_priv) + offset);
2503 }
2504 template <>
2505 void __kmp_call_init<kmp_taskred_input_t>(kmp_taskred_data_t &item,
2506                                           size_t offset) {
2507   ((void (*)(void *, void *))item.reduce_init)(
2508       (char *)(item.reduce_priv) + offset, item.reduce_orig);
2509 }
2510 
2511 template <typename T>
2512 void *__kmp_task_reduction_init(int gtid, int num, T *data) {
2513   __kmp_assert_valid_gtid(gtid);
2514   kmp_info_t *thread = __kmp_threads[gtid];
2515   kmp_taskgroup_t *tg = thread->th.th_current_task->td_taskgroup;
2516   kmp_uint32 nth = thread->th.th_team_nproc;
2517   kmp_taskred_data_t *arr;
2518 
2519   // check input data just in case
2520   KMP_ASSERT(tg != NULL);
2521   KMP_ASSERT(data != NULL);
2522   KMP_ASSERT(num > 0);
2523   if (nth == 1 && !__kmp_enable_hidden_helper) {
2524     KA_TRACE(10, ("__kmpc_task_reduction_init: T#%d, tg %p, exiting nth=1\n",
2525                   gtid, tg));
2526     return (void *)tg;
2527   }
2528   KA_TRACE(10, ("__kmpc_task_reduction_init: T#%d, taskgroup %p, #items %d\n",
2529                 gtid, tg, num));
2530   arr = (kmp_taskred_data_t *)__kmp_thread_malloc(
2531       thread, num * sizeof(kmp_taskred_data_t));
2532   for (int i = 0; i < num; ++i) {
2533     size_t size = data[i].reduce_size - 1;
2534     // round the size up to cache line per thread-specific item
2535     size += CACHE_LINE - size % CACHE_LINE;
2536     KMP_ASSERT(data[i].reduce_comb != NULL); // combiner is mandatory
2537     arr[i].reduce_shar = data[i].reduce_shar;
2538     arr[i].reduce_size = size;
2539     arr[i].flags = data[i].flags;
2540     arr[i].reduce_comb = data[i].reduce_comb;
2541     arr[i].reduce_init = data[i].reduce_init;
2542     arr[i].reduce_fini = data[i].reduce_fini;
2543     __kmp_assign_orig<T>(arr[i], data[i]);
2544     if (!arr[i].flags.lazy_priv) {
2545       // allocate cache-line aligned block and fill it with zeros
2546       arr[i].reduce_priv = __kmp_allocate(nth * size);
2547       arr[i].reduce_pend = (char *)(arr[i].reduce_priv) + nth * size;
2548       if (arr[i].reduce_init != NULL) {
2549         // initialize all thread-specific items
2550         for (size_t j = 0; j < nth; ++j) {
2551           __kmp_call_init<T>(arr[i], j * size);
2552         }
2553       }
2554     } else {
2555       // only allocate space for pointers now,
2556       // objects will be lazily allocated/initialized if/when requested
2557       // note that __kmp_allocate zeroes the allocated memory
2558       arr[i].reduce_priv = __kmp_allocate(nth * sizeof(void *));
2559     }
2560   }
2561   tg->reduce_data = (void *)arr;
2562   tg->reduce_num_data = num;
2563   return (void *)tg;
2564 }
2565 
2566 /*!
2567 @ingroup TASKING
2568 @param gtid      Global thread ID
2569 @param num       Number of data items to reduce
2570 @param data      Array of data for reduction
2571 @return The taskgroup identifier
2572 
2573 Initialize task reduction for the taskgroup.
2574 
2575 Note: this entry assumes the optional compiler-generated initializer routine
2576 takes a single parameter - a pointer to the object to be initialized. That means
2577 the reduction either does not use the omp_orig object, or omp_orig is accessible
2578 without the help of the runtime library.
2579 */
2580 void *__kmpc_task_reduction_init(int gtid, int num, void *data) {
2581 #if OMPX_TASKGRAPH
2582   kmp_tdg_info_t *tdg = __kmp_find_tdg(__kmp_curr_tdg_idx);
2583   if (tdg && __kmp_tdg_is_recording(tdg->tdg_status)) {
2584     kmp_tdg_info_t *this_tdg = __kmp_global_tdgs[__kmp_curr_tdg_idx];
2585     this_tdg->rec_taskred_data =
2586         __kmp_allocate(sizeof(kmp_task_red_input_t) * num);
2587     this_tdg->rec_num_taskred = num;
2588     KMP_MEMCPY(this_tdg->rec_taskred_data, data,
2589                sizeof(kmp_task_red_input_t) * num);
2590   }
2591 #endif
2592   return __kmp_task_reduction_init(gtid, num, (kmp_task_red_input_t *)data);
2593 }
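// Illustrative sketch (not part of this runtime; the helper names and the
// "sum"/"gtid" variables are hypothetical): roughly what a compiler could
// pass to __kmpc_task_reduction_init() for
//   #pragma omp taskgroup task_reduction(+ : sum)
// using the old single-parameter interface; field names follow
// kmp_task_red_input_t as consumed above.
//
//   static void red_init(void *priv) { *(int *)priv = 0; }
//   static void red_comb(void *shar, void *priv) { *(int *)shar += *(int *)priv; }
//
//   kmp_task_red_input_t in = {};
//   in.reduce_shar = &sum;             // shared item to reduce into
//   in.reduce_size = sizeof(int);      // size of one item
//   in.reduce_init = (void *)red_init; // initializer (single parameter)
//   in.reduce_fini = NULL;             // no finalizer needed for a plain int
//   in.reduce_comb = (void *)red_comb; // combiner (mandatory)
//   in.flags.lazy_priv = 0;            // eager allocation of private copies
//   void *tg = __kmpc_task_reduction_init(gtid, 1, &in);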
2594 
2595 /*!
2596 @ingroup TASKING
2597 @param gtid      Global thread ID
2598 @param num       Number of data items to reduce
2599 @param data      Array of data for reduction
2600 @return The taskgroup identifier
2601 
2602 Initialize task reduction for the taskgroup.
2603 
2604 Note: this entry assumes the optional compiler-generated initializer routine
2605 takes two parameters: a pointer to the object to be initialized and a pointer to omp_orig.
2606 */
2607 void *__kmpc_taskred_init(int gtid, int num, void *data) {
2608 #if OMPX_TASKGRAPH
2609   kmp_tdg_info_t *tdg = __kmp_find_tdg(__kmp_curr_tdg_idx);
2610   if (tdg && __kmp_tdg_is_recording(tdg->tdg_status)) {
2611     kmp_tdg_info_t *this_tdg = __kmp_global_tdgs[__kmp_curr_tdg_idx];
2612     this_tdg->rec_taskred_data =
2613         __kmp_allocate(sizeof(kmp_task_red_input_t) * num);
2614     this_tdg->rec_num_taskred = num;
2615     KMP_MEMCPY(this_tdg->rec_taskred_data, data,
2616                sizeof(kmp_task_red_input_t) * num);
2617   }
2618 #endif
2619   return __kmp_task_reduction_init(gtid, num, (kmp_taskred_input_t *)data);
2620 }
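// For the two-parameter interface the sketch above changes only in the input
// type and the initializer signature: a kmp_taskred_input_t additionally
// carries reduce_orig (the omp_orig item), and reduce_init receives both the
// private copy and that original item. Hypothetical helper:
//
//   static void red_init2(void *priv, void *orig) { /* init priv from orig */ }
//   ...
//   in.reduce_orig = &sum;              // original item used for initialization
//   in.reduce_init = (void *)red_init2; // initializer (two parameters)
//   void *tg = __kmpc_taskred_init(gtid, 1, &in);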
2621 
2622 // Copy task reduction data (except for shared pointers).
2623 template <typename T>
2624 void __kmp_task_reduction_init_copy(kmp_info_t *thr, int num, T *data,
2625                                     kmp_taskgroup_t *tg, void *reduce_data) {
2626   kmp_taskred_data_t *arr;
2627   KA_TRACE(20, ("__kmp_task_reduction_init_copy: Th %p, init taskgroup %p,"
2628                 " from data %p\n",
2629                 thr, tg, reduce_data));
2630   arr = (kmp_taskred_data_t *)__kmp_thread_malloc(
2631       thr, num * sizeof(kmp_taskred_data_t));
2632   // threads will share private copies, thunk routines, sizes, flags, etc.:
2633   KMP_MEMCPY(arr, reduce_data, num * sizeof(kmp_taskred_data_t));
2634   for (int i = 0; i < num; ++i) {
2635     arr[i].reduce_shar = data[i].reduce_shar; // init unique shared pointers
2636   }
2637   tg->reduce_data = (void *)arr;
2638   tg->reduce_num_data = num;
2639 }
2640 
2641 /*!
2642 @ingroup TASKING
2643 @param gtid    Global thread ID
2644 @param tskgrp  The taskgroup ID (optional)
2645 @param data    Shared location of the item
2646 @return The pointer to per-thread data
2647 
2648 Get thread-specific location of data item
2649 */
2650 void *__kmpc_task_reduction_get_th_data(int gtid, void *tskgrp, void *data) {
2651   __kmp_assert_valid_gtid(gtid);
2652   kmp_info_t *thread = __kmp_threads[gtid];
2653   kmp_int32 nth = thread->th.th_team_nproc;
2654   if (nth == 1)
2655     return data; // nothing to do
2656 
2657   kmp_taskgroup_t *tg = (kmp_taskgroup_t *)tskgrp;
2658   if (tg == NULL)
2659     tg = thread->th.th_current_task->td_taskgroup;
2660   KMP_ASSERT(tg != NULL);
2661   kmp_taskred_data_t *arr;
2662   kmp_int32 num;
2663   kmp_int32 tid = thread->th.th_info.ds.ds_tid;
2664 
2665 #if OMPX_TASKGRAPH
2666   if ((thread->th.th_current_task->is_taskgraph) &&
2667       (!__kmp_tdg_is_recording(
2668           __kmp_global_tdgs[__kmp_curr_tdg_idx]->tdg_status))) {
2669     tg = thread->th.th_current_task->td_taskgroup;
2670     KMP_ASSERT(tg != NULL);
2671     KMP_ASSERT(tg->reduce_data != NULL);
2672     arr = (kmp_taskred_data_t *)(tg->reduce_data);
2673     num = tg->reduce_num_data;
2674   }
2675 #endif
2676 
2677   KMP_ASSERT(data != NULL);
2678   while (tg != NULL) {
2679     arr = (kmp_taskred_data_t *)(tg->reduce_data);
2680     num = tg->reduce_num_data;
2681     for (int i = 0; i < num; ++i) {
2682       if (!arr[i].flags.lazy_priv) {
2683         if (data == arr[i].reduce_shar ||
2684             (data >= arr[i].reduce_priv && data < arr[i].reduce_pend))
2685           return (char *)(arr[i].reduce_priv) + tid * arr[i].reduce_size;
2686       } else {
2687         // check shared location first
2688         void **p_priv = (void **)(arr[i].reduce_priv);
2689         if (data == arr[i].reduce_shar)
2690           goto found;
2691         // check if we get some thread specific location as parameter
2692         for (int j = 0; j < nth; ++j)
2693           if (data == p_priv[j])
2694             goto found;
2695         continue; // not found, continue search
2696       found:
2697         if (p_priv[tid] == NULL) {
2698           // allocate thread specific object lazily
2699           p_priv[tid] = __kmp_allocate(arr[i].reduce_size);
2700           if (arr[i].reduce_init != NULL) {
2701             if (arr[i].reduce_orig != NULL) { // new interface
2702               ((void (*)(void *, void *))arr[i].reduce_init)(
2703                   p_priv[tid], arr[i].reduce_orig);
2704             } else { // old interface (single parameter)
2705               ((void (*)(void *))arr[i].reduce_init)(p_priv[tid]);
2706             }
2707           }
2708         }
2709         return p_priv[tid];
2710       }
2711     }
2712     KMP_ASSERT(tg->parent);
2713     tg = tg->parent;
2714   }
2715   KMP_ASSERT2(0, "Unknown task reduction item");
2716   return NULL; // ERROR, this line never executed
2717 }
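// Illustrative use from a task body (sketch; tg is the value returned by one
// of the *_init entries above, sum is the item registered in reduce_shar, and
// local_contribution is a hypothetical task-local value):
//
//   int *my_sum = (int *)__kmpc_task_reduction_get_th_data(gtid, tg, &sum);
//   *my_sum += local_contribution;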
2718 
2719 // Finalize task reduction.
2720 // Called from __kmpc_end_taskgroup()
2721 static void __kmp_task_reduction_fini(kmp_info_t *th, kmp_taskgroup_t *tg) {
2722   kmp_int32 nth = th->th.th_team_nproc;
2723   KMP_DEBUG_ASSERT(
2724       nth > 1 ||
2725       __kmp_enable_hidden_helper); // should not be called if nth == 1 unless we
2726                                    // are using hidden helper threads
2727   kmp_taskred_data_t *arr = (kmp_taskred_data_t *)tg->reduce_data;
2728   kmp_int32 num = tg->reduce_num_data;
2729   for (int i = 0; i < num; ++i) {
2730     void *sh_data = arr[i].reduce_shar;
2731     void (*f_fini)(void *) = (void (*)(void *))(arr[i].reduce_fini);
2732     void (*f_comb)(void *, void *) =
2733         (void (*)(void *, void *))(arr[i].reduce_comb);
2734     if (!arr[i].flags.lazy_priv) {
2735       void *pr_data = arr[i].reduce_priv;
2736       size_t size = arr[i].reduce_size;
2737       for (int j = 0; j < nth; ++j) {
2738         void *priv_data = (char *)pr_data + j * size;
2739         f_comb(sh_data, priv_data); // combine results
2740         if (f_fini)
2741           f_fini(priv_data); // finalize if needed
2742       }
2743     } else {
2744       void **pr_data = (void **)(arr[i].reduce_priv);
2745       for (int j = 0; j < nth; ++j) {
2746         if (pr_data[j] != NULL) {
2747           f_comb(sh_data, pr_data[j]); // combine results
2748           if (f_fini)
2749             f_fini(pr_data[j]); // finalize if needed
2750           __kmp_free(pr_data[j]);
2751         }
2752       }
2753     }
2754     __kmp_free(arr[i].reduce_priv);
2755   }
2756   __kmp_thread_free(th, arr);
2757   tg->reduce_data = NULL;
2758   tg->reduce_num_data = 0;
2759 }
2760 
2761 // Cleanup task reduction data for parallel or worksharing,
2762 // but do not touch task-private data that other threads are still working with.
2763 // Called from __kmpc_end_taskgroup()
2764 static void __kmp_task_reduction_clean(kmp_info_t *th, kmp_taskgroup_t *tg) {
2765   __kmp_thread_free(th, tg->reduce_data);
2766   tg->reduce_data = NULL;
2767   tg->reduce_num_data = 0;
2768 }
2769 
2770 template <typename T>
2771 void *__kmp_task_reduction_modifier_init(ident_t *loc, int gtid, int is_ws,
2772                                          int num, T *data) {
2773   __kmp_assert_valid_gtid(gtid);
2774   kmp_info_t *thr = __kmp_threads[gtid];
2775   kmp_int32 nth = thr->th.th_team_nproc;
2776   __kmpc_taskgroup(loc, gtid); // form new taskgroup first
2777   if (nth == 1) {
2778     KA_TRACE(10,
2779              ("__kmpc_reduction_modifier_init: T#%d, tg %p, exiting nth=1\n",
2780               gtid, thr->th.th_current_task->td_taskgroup));
2781     return (void *)thr->th.th_current_task->td_taskgroup;
2782   }
2783   kmp_team_t *team = thr->th.th_team;
2784   void *reduce_data;
2785   kmp_taskgroup_t *tg;
2786   reduce_data = KMP_ATOMIC_LD_RLX(&team->t.t_tg_reduce_data[is_ws]);
2787   if (reduce_data == NULL &&
2788       __kmp_atomic_compare_store(&team->t.t_tg_reduce_data[is_ws], reduce_data,
2789                                  (void *)1)) {
2790     // single thread enters this block to initialize common reduction data
2791     KMP_DEBUG_ASSERT(reduce_data == NULL);
2792     // first initialize own data, then make a copy other threads can use
2793     tg = (kmp_taskgroup_t *)__kmp_task_reduction_init<T>(gtid, num, data);
2794     reduce_data = __kmp_thread_malloc(thr, num * sizeof(kmp_taskred_data_t));
2795     KMP_MEMCPY(reduce_data, tg->reduce_data, num * sizeof(kmp_taskred_data_t));
2796     // fini counters should be 0 at this point
2797     KMP_DEBUG_ASSERT(KMP_ATOMIC_LD_RLX(&team->t.t_tg_fini_counter[0]) == 0);
2798     KMP_DEBUG_ASSERT(KMP_ATOMIC_LD_RLX(&team->t.t_tg_fini_counter[1]) == 0);
2799     KMP_ATOMIC_ST_REL(&team->t.t_tg_reduce_data[is_ws], reduce_data);
2800   } else {
2801     while (
2802         (reduce_data = KMP_ATOMIC_LD_ACQ(&team->t.t_tg_reduce_data[is_ws])) ==
2803         (void *)1) { // wait for task reduction initialization
2804       KMP_CPU_PAUSE();
2805     }
2806     KMP_DEBUG_ASSERT(reduce_data > (void *)1); // should be valid pointer here
2807     tg = thr->th.th_current_task->td_taskgroup;
2808     __kmp_task_reduction_init_copy<T>(thr, num, data, tg, reduce_data);
2809   }
2810   return tg;
2811 }
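// The t_tg_reduce_data[is_ws] slot above implements a small publication
// protocol: exactly one thread wins the compare-and-store of NULL -> (void *)1,
// builds its taskgroup's reduction data, and publishes a copy of it in the
// slot; the other threads spin while the slot still holds the (void *)1
// sentinel, then clone the published data into their own taskgroups via
// __kmp_task_reduction_init_copy().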
2812 
2813 /*!
2814 @ingroup TASKING
2815 @param loc       Source location info
2816 @param gtid      Global thread ID
2817 @param is_ws     Is 1 if the reduction is for worksharing, 0 otherwise
2818 @param num       Number of data items to reduce
2819 @param data      Array of data for reduction
2820 @return The taskgroup identifier
2821 
2822 Initialize task reduction for a parallel or worksharing construct.
2823 
2824 Note: this entry assumes the optional compiler-generated initializer routine
2825 takes a single parameter - a pointer to the object to be initialized. That means
2826 the reduction either does not use the omp_orig object, or omp_orig is accessible
2827 without the help of the runtime library.
2828 */
2829 void *__kmpc_task_reduction_modifier_init(ident_t *loc, int gtid, int is_ws,
2830                                           int num, void *data) {
2831   return __kmp_task_reduction_modifier_init(loc, gtid, is_ws, num,
2832                                             (kmp_task_red_input_t *)data);
2833 }
2834 
2835 /*!
2836 @ingroup TASKING
2837 @param loc       Source location info
2838 @param gtid      Global thread ID
2839 @param is_ws     Is 1 if the reduction is for worksharing, 0 otherwise
2840 @param num       Number of data items to reduce
2841 @param data      Array of data for reduction
2842 @return The taskgroup identifier
2843 
2844 Initialize task reduction for a parallel or worksharing construct.
2845 
2846 Note: this entry assumes the optional compiler-generated initializer routine
2847 takes two parameters: a pointer to the object to be initialized and a pointer to omp_orig.
2848 */
2849 void *__kmpc_taskred_modifier_init(ident_t *loc, int gtid, int is_ws, int num,
2850                                    void *data) {
2851   return __kmp_task_reduction_modifier_init(loc, gtid, is_ws, num,
2852                                             (kmp_taskred_input_t *)data);
2853 }
2854 
2855 /*!
2856 @ingroup TASKING
2857 @param loc       Source location info
2858 @param gtid      Global thread ID
2859 @param is_ws     Is 1 if the reduction is for worksharing, 0 otherwise
2860 
2861 Finalize task reduction for a parallel or worksharing construct.
2862 */
2863 void __kmpc_task_reduction_modifier_fini(ident_t *loc, int gtid, int is_ws) {
2864   __kmpc_end_taskgroup(loc, gtid);
2865 }
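// Illustrative pairing (sketch, not emitted by this file): for a construct
// carrying a task reduction modifier, e.g. reduction(task, + : sum) on a
// worksharing loop, a compiler is expected to bracket the region roughly as
//
//   void *tg = __kmpc_taskred_modifier_init(loc, gtid, /*is_ws=*/1, 1, &in);
//   ... loop body, spawning tasks that call
//       __kmpc_task_reduction_get_th_data(gtid, tg, &sum) ...
//   __kmpc_task_reduction_modifier_fini(loc, gtid, /*is_ws=*/1);
//
// where "in" is a kmp_taskred_input_t as sketched earlier.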
2866 
2867 // __kmpc_taskgroup: Start a new taskgroup
2868 void __kmpc_taskgroup(ident_t *loc, int gtid) {
2869   __kmp_assert_valid_gtid(gtid);
2870   kmp_info_t *thread = __kmp_threads[gtid];
2871   kmp_taskdata_t *taskdata = thread->th.th_current_task;
2872   kmp_taskgroup_t *tg_new =
2873       (kmp_taskgroup_t *)__kmp_thread_malloc(thread, sizeof(kmp_taskgroup_t));
2874   KA_TRACE(10, ("__kmpc_taskgroup: T#%d loc=%p group=%p\n", gtid, loc, tg_new));
2875   KMP_ATOMIC_ST_RLX(&tg_new->count, 0);
2876   KMP_ATOMIC_ST_RLX(&tg_new->cancel_request, cancel_noreq);
2877   tg_new->parent = taskdata->td_taskgroup;
2878   tg_new->reduce_data = NULL;
2879   tg_new->reduce_num_data = 0;
2880   tg_new->gomp_data = NULL;
2881   taskdata->td_taskgroup = tg_new;
2882 
2883 #if OMPT_SUPPORT && OMPT_OPTIONAL
2884   if (UNLIKELY(ompt_enabled.ompt_callback_sync_region)) {
2885     void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2886     if (!codeptr)
2887       codeptr = OMPT_GET_RETURN_ADDRESS(0);
2888     kmp_team_t *team = thread->th.th_team;
2889     ompt_data_t my_task_data = taskdata->ompt_task_info.task_data;
2890     // FIXME: I think this is wrong for lwt!
2891     ompt_data_t my_parallel_data = team->t.ompt_team_info.parallel_data;
2892 
2893     ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
2894         ompt_sync_region_taskgroup, ompt_scope_begin, &(my_parallel_data),
2895         &(my_task_data), codeptr);
2896   }
2897 #endif
2898 }
2899 
2900 // __kmpc_end_taskgroup: Wait until all tasks generated by the current task
2901 //                       and its descendants are complete
2902 void __kmpc_end_taskgroup(ident_t *loc, int gtid) {
2903   __kmp_assert_valid_gtid(gtid);
2904   kmp_info_t *thread = __kmp_threads[gtid];
2905   kmp_taskdata_t *taskdata = thread->th.th_current_task;
2906   kmp_taskgroup_t *taskgroup = taskdata->td_taskgroup;
2907   int thread_finished = FALSE;
2908 
2909 #if OMPT_SUPPORT && OMPT_OPTIONAL
2910   kmp_team_t *team;
2911   ompt_data_t my_task_data;
2912   ompt_data_t my_parallel_data;
2913   void *codeptr = nullptr;
2914   if (UNLIKELY(ompt_enabled.enabled)) {
2915     team = thread->th.th_team;
2916     my_task_data = taskdata->ompt_task_info.task_data;
2917     // FIXME: I think this is wrong for lwt!
2918     my_parallel_data = team->t.ompt_team_info.parallel_data;
2919     codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2920     if (!codeptr)
2921       codeptr = OMPT_GET_RETURN_ADDRESS(0);
2922   }
2923 #endif
2924 
2925   KA_TRACE(10, ("__kmpc_end_taskgroup(enter): T#%d loc=%p\n", gtid, loc));
2926   KMP_DEBUG_ASSERT(taskgroup != NULL);
2927   KMP_SET_THREAD_STATE_BLOCK(TASKGROUP);
2928 
2929   if (__kmp_tasking_mode != tskm_immediate_exec) {
2930     // mark task as waiting not on a barrier
2931     taskdata->td_taskwait_counter += 1;
2932     taskdata->td_taskwait_ident = loc;
2933     taskdata->td_taskwait_thread = gtid + 1;
2934 #if USE_ITT_BUILD
2935     // For ITT the taskgroup wait is similar to taskwait until we need to
2936     // distinguish them
2937     void *itt_sync_obj = NULL;
2938 #if USE_ITT_NOTIFY
2939     KMP_ITT_TASKWAIT_STARTING(itt_sync_obj);
2940 #endif /* USE_ITT_NOTIFY */
2941 #endif /* USE_ITT_BUILD */
2942 
2943 #if OMPT_SUPPORT && OMPT_OPTIONAL
2944     if (UNLIKELY(ompt_enabled.ompt_callback_sync_region_wait)) {
2945       ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
2946           ompt_sync_region_taskgroup, ompt_scope_begin, &(my_parallel_data),
2947           &(my_task_data), codeptr);
2948     }
2949 #endif
2950 
2951     if (!taskdata->td_flags.team_serial ||
2952         (thread->th.th_task_team != NULL &&
2953          (thread->th.th_task_team->tt.tt_found_proxy_tasks ||
2954           thread->th.th_task_team->tt.tt_hidden_helper_task_encountered))) {
2955       kmp_flag_32<false, false> flag(
2956           RCAST(std::atomic<kmp_uint32> *, &(taskgroup->count)), 0U);
2957       while (KMP_ATOMIC_LD_ACQ(&taskgroup->count) != 0) {
2958         flag.execute_tasks(thread, gtid, FALSE,
2959                            &thread_finished USE_ITT_BUILD_ARG(itt_sync_obj),
2960                            __kmp_task_stealing_constraint);
2961       }
2962     }
2963     taskdata->td_taskwait_thread = -taskdata->td_taskwait_thread; // end waiting
2964 
2965 #if OMPT_SUPPORT && OMPT_OPTIONAL
2966     if (UNLIKELY(ompt_enabled.ompt_callback_sync_region_wait)) {
2967       ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
2968           ompt_sync_region_taskgroup, ompt_scope_end, &(my_parallel_data),
2969           &(my_task_data), codeptr);
2970     }
2971 #endif
2972 
2973 #if USE_ITT_BUILD
2974     KMP_ITT_TASKWAIT_FINISHED(itt_sync_obj);
2975     KMP_FSYNC_ACQUIRED(taskdata); // acquire self - sync with descendants
2976 #endif /* USE_ITT_BUILD */
2977   }
2978   KMP_DEBUG_ASSERT(taskgroup->count == 0);
2979 
2980   if (taskgroup->reduce_data != NULL &&
2981       !taskgroup->gomp_data) { // need to reduce?
2982     int cnt;
2983     void *reduce_data;
2984     kmp_team_t *t = thread->th.th_team;
2985     kmp_taskred_data_t *arr = (kmp_taskred_data_t *)taskgroup->reduce_data;
2986     // check if <priv> data of the first reduction variable is shared for the team
2987     void *priv0 = arr[0].reduce_priv;
2988     if ((reduce_data = KMP_ATOMIC_LD_ACQ(&t->t.t_tg_reduce_data[0])) != NULL &&
2989         ((kmp_taskred_data_t *)reduce_data)[0].reduce_priv == priv0) {
2990       // finishing task reduction on parallel
2991       cnt = KMP_ATOMIC_INC(&t->t.t_tg_fini_counter[0]);
2992       if (cnt == thread->th.th_team_nproc - 1) {
2993         // we are the last thread passing __kmpc_reduction_modifier_fini()
2994         // finalize task reduction:
2995         __kmp_task_reduction_fini(thread, taskgroup);
2996         // cleanup fields in the team structure:
2997         // TODO: is relaxed store enough here (whole barrier should follow)?
2998         __kmp_thread_free(thread, reduce_data);
2999         KMP_ATOMIC_ST_REL(&t->t.t_tg_reduce_data[0], NULL);
3000         KMP_ATOMIC_ST_REL(&t->t.t_tg_fini_counter[0], 0);
3001       } else {
3002         // we are not the last thread passing __kmpc_reduction_modifier_fini(),
3003         // so do not finalize reduction, just clean own copy of the data
3004         __kmp_task_reduction_clean(thread, taskgroup);
3005       }
3006     } else if ((reduce_data = KMP_ATOMIC_LD_ACQ(&t->t.t_tg_reduce_data[1])) !=
3007                    NULL &&
3008                ((kmp_taskred_data_t *)reduce_data)[0].reduce_priv == priv0) {
3009       // finishing task reduction on worksharing
3010       cnt = KMP_ATOMIC_INC(&t->t.t_tg_fini_counter[1]);
3011       if (cnt == thread->th.th_team_nproc - 1) {
3012         // we are the last thread passing __kmpc_reduction_modifier_fini()
3013         __kmp_task_reduction_fini(thread, taskgroup);
3014         // cleanup fields in team structure:
3015         // TODO: is relaxed store enough here (whole barrier should follow)?
3016         __kmp_thread_free(thread, reduce_data);
3017         KMP_ATOMIC_ST_REL(&t->t.t_tg_reduce_data[1], NULL);
3018         KMP_ATOMIC_ST_REL(&t->t.t_tg_fini_counter[1], 0);
3019       } else {
3020         // we are not the last thread passing __kmpc_reduction_modifier_fini(),
3021         // so do not finalize reduction, just clean own copy of the data
3022         __kmp_task_reduction_clean(thread, taskgroup);
3023       }
3024     } else {
3025       // finishing task reduction on taskgroup
3026       __kmp_task_reduction_fini(thread, taskgroup);
3027     }
3028   }
3029   // Restore parent taskgroup for the current task
3030   taskdata->td_taskgroup = taskgroup->parent;
3031   __kmp_thread_free(thread, taskgroup);
3032 
3033   KA_TRACE(10, ("__kmpc_end_taskgroup(exit): T#%d task %p finished waiting\n",
3034                 gtid, taskdata));
3035 
3036 #if OMPT_SUPPORT && OMPT_OPTIONAL
3037   if (UNLIKELY(ompt_enabled.ompt_callback_sync_region)) {
3038     ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
3039         ompt_sync_region_taskgroup, ompt_scope_end, &(my_parallel_data),
3040         &(my_task_data), codeptr);
3041   }
3042 #endif
3043 }
3044 
3045 static kmp_task_t *__kmp_get_priority_task(kmp_int32 gtid,
3046                                            kmp_task_team_t *task_team,
3047                                            kmp_int32 is_constrained) {
3048   kmp_task_t *task = NULL;
3049   kmp_taskdata_t *taskdata;
3050   kmp_taskdata_t *current;
3051   kmp_thread_data_t *thread_data;
3052   int ntasks = task_team->tt.tt_num_task_pri;
3053   if (ntasks == 0) {
3054     KA_TRACE(
3055         20, ("__kmp_get_priority_task(exit #1): T#%d No tasks to get\n", gtid));
3056     return NULL;
3057   }
3058   do {
3059     // decrement num_tasks to "reserve" one task to get for execution
3060     if (__kmp_atomic_compare_store(&task_team->tt.tt_num_task_pri, ntasks,
3061                                    ntasks - 1))
3062       break;
3063     ntasks = task_team->tt.tt_num_task_pri;
3064   } while (ntasks > 0);
3065   if (ntasks == 0) {
3066     KA_TRACE(20, ("__kmp_get_priority_task(exit #2): T#%d No tasks to get\n",
3067                   __kmp_get_gtid()));
3068     return NULL;
3069   }
3070   // We got a "ticket" to get a "reserved" priority task
3071   int deque_ntasks;
3072   kmp_task_pri_t *list = task_team->tt.tt_task_pri_list;
3073   do {
3074     KMP_ASSERT(list != NULL);
3075     thread_data = &list->td;
3076     __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);
3077     deque_ntasks = thread_data->td.td_deque_ntasks;
3078     if (deque_ntasks == 0) {
3079       __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
3080       KA_TRACE(20, ("__kmp_get_priority_task: T#%d No tasks to get from %p\n",
3081                     __kmp_get_gtid(), thread_data));
3082       list = list->next;
3083     }
3084   } while (deque_ntasks == 0);
3085   KMP_DEBUG_ASSERT(deque_ntasks);
3086   int target = thread_data->td.td_deque_head;
3087   current = __kmp_threads[gtid]->th.th_current_task;
3088   taskdata = thread_data->td.td_deque[target];
3089   if (__kmp_task_is_allowed(gtid, is_constrained, taskdata, current)) {
3090     // Bump head pointer and Wrap.
3091     thread_data->td.td_deque_head =
3092         (target + 1) & TASK_DEQUE_MASK(thread_data->td);
3093   } else {
3094     if (!task_team->tt.tt_untied_task_encountered) {
3095       // The TSC does not allow stealing the victim task
3096       __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
3097       KA_TRACE(20, ("__kmp_get_priority_task(exit #3): T#%d could not get task "
3098                     "from %p: task_team=%p ntasks=%d head=%u tail=%u\n",
3099                     gtid, thread_data, task_team, deque_ntasks, target,
3100                     thread_data->td.td_deque_tail));
3101       task_team->tt.tt_num_task_pri++; // atomic inc, restore value
3102       return NULL;
3103     }
3104     int i;
3105     // walk through the deque trying to steal any task
3106     taskdata = NULL;
3107     for (i = 1; i < deque_ntasks; ++i) {
3108       target = (target + 1) & TASK_DEQUE_MASK(thread_data->td);
3109       taskdata = thread_data->td.td_deque[target];
3110       if (__kmp_task_is_allowed(gtid, is_constrained, taskdata, current)) {
3111         break; // found task to execute
3112       } else {
3113         taskdata = NULL;
3114       }
3115     }
3116     if (taskdata == NULL) {
3117       // No appropriate candidate found to execute
3118       __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
3119       KA_TRACE(
3120           10, ("__kmp_get_priority_task(exit #4): T#%d could not get task from "
3121                "%p: task_team=%p ntasks=%d head=%u tail=%u\n",
3122                gtid, thread_data, task_team, deque_ntasks,
3123                thread_data->td.td_deque_head, thread_data->td.td_deque_tail));
3124       task_team->tt.tt_num_task_pri++; // atomic inc, restore value
3125       return NULL;
3126     }
3127     int prev = target;
3128     for (i = i + 1; i < deque_ntasks; ++i) {
3129       // shift remaining tasks in the deque left by 1
3130       target = (target + 1) & TASK_DEQUE_MASK(thread_data->td);
3131       thread_data->td.td_deque[prev] = thread_data->td.td_deque[target];
3132       prev = target;
3133     }
3134     KMP_DEBUG_ASSERT(
3135         thread_data->td.td_deque_tail ==
3136         (kmp_uint32)((target + 1) & TASK_DEQUE_MASK(thread_data->td)));
3137     thread_data->td.td_deque_tail = target; // tail -= 1 (wrapped)
3138   }
3139   thread_data->td.td_deque_ntasks = deque_ntasks - 1;
3140   __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
3141   task = KMP_TASKDATA_TO_TASK(taskdata);
3142   return task;
3143 }
3144 
3145 // __kmp_remove_my_task: remove a task from my own deque
3146 static kmp_task_t *__kmp_remove_my_task(kmp_info_t *thread, kmp_int32 gtid,
3147                                         kmp_task_team_t *task_team,
3148                                         kmp_int32 is_constrained) {
3149   kmp_task_t *task;
3150   kmp_taskdata_t *taskdata;
3151   kmp_thread_data_t *thread_data;
3152   kmp_uint32 tail;
3153 
3154   KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);
3155   KMP_DEBUG_ASSERT(task_team->tt.tt_threads_data !=
3156                    NULL); // Caller should check this condition
3157 
3158   thread_data = &task_team->tt.tt_threads_data[__kmp_tid_from_gtid(gtid)];
3159 
3160   KA_TRACE(10, ("__kmp_remove_my_task(enter): T#%d ntasks=%d head=%u tail=%u\n",
3161                 gtid, thread_data->td.td_deque_ntasks,
3162                 thread_data->td.td_deque_head, thread_data->td.td_deque_tail));
3163 
3164   if (TCR_4(thread_data->td.td_deque_ntasks) == 0) {
3165     KA_TRACE(10,
3166              ("__kmp_remove_my_task(exit #1): T#%d No tasks to remove: "
3167               "ntasks=%d head=%u tail=%u\n",
3168               gtid, thread_data->td.td_deque_ntasks,
3169               thread_data->td.td_deque_head, thread_data->td.td_deque_tail));
3170     return NULL;
3171   }
3172 
3173   __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);
3174 
3175   if (TCR_4(thread_data->td.td_deque_ntasks) == 0) {
3176     __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
3177     KA_TRACE(10,
3178              ("__kmp_remove_my_task(exit #2): T#%d No tasks to remove: "
3179               "ntasks=%d head=%u tail=%u\n",
3180               gtid, thread_data->td.td_deque_ntasks,
3181               thread_data->td.td_deque_head, thread_data->td.td_deque_tail));
3182     return NULL;
3183   }
3184 
3185   tail = (thread_data->td.td_deque_tail - 1) &
3186          TASK_DEQUE_MASK(thread_data->td); // Wrap index.
3187   taskdata = thread_data->td.td_deque[tail];
3188 
3189   if (!__kmp_task_is_allowed(gtid, is_constrained, taskdata,
3190                              thread->th.th_current_task)) {
3191     // The TSC does not allow taking the tail task
3192     __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
3193     KA_TRACE(10,
3194              ("__kmp_remove_my_task(exit #3): T#%d TSC blocks tail task: "
3195               "ntasks=%d head=%u tail=%u\n",
3196               gtid, thread_data->td.td_deque_ntasks,
3197               thread_data->td.td_deque_head, thread_data->td.td_deque_tail));
3198     return NULL;
3199   }
3200 
3201   thread_data->td.td_deque_tail = tail;
3202   TCW_4(thread_data->td.td_deque_ntasks, thread_data->td.td_deque_ntasks - 1);
3203 
3204   __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
3205 
3206   KA_TRACE(10, ("__kmp_remove_my_task(exit #4): T#%d task %p removed: "
3207                 "ntasks=%d head=%u tail=%u\n",
3208                 gtid, taskdata, thread_data->td.td_deque_ntasks,
3209                 thread_data->td.td_deque_head, thread_data->td.td_deque_tail));
3210 
3211   task = KMP_TASKDATA_TO_TASK(taskdata);
3212   return task;
3213 }
3214 
3215 // __kmp_steal_task: remove a task from another thread's deque
3216 // Assumes that the calling thread has already checked for the existence of
3217 // the task_team thread_data before calling this routine.
3218 static kmp_task_t *__kmp_steal_task(kmp_int32 victim_tid, kmp_int32 gtid,
3219                                     kmp_task_team_t *task_team,
3220                                     std::atomic<kmp_int32> *unfinished_threads,
3221                                     int *thread_finished,
3222                                     kmp_int32 is_constrained) {
3223   kmp_task_t *task;
3224   kmp_taskdata_t *taskdata;
3225   kmp_taskdata_t *current;
3226   kmp_thread_data_t *victim_td, *threads_data;
3227   kmp_int32 target;
3228   kmp_info_t *victim_thr;
3229 
3230   KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);
3231 
3232   threads_data = task_team->tt.tt_threads_data;
3233   KMP_DEBUG_ASSERT(threads_data != NULL); // Caller should check this condition
3234   KMP_DEBUG_ASSERT(victim_tid >= 0);
3235   KMP_DEBUG_ASSERT(victim_tid < task_team->tt.tt_max_threads);
3236 
3237   victim_td = &threads_data[victim_tid];
3238   victim_thr = victim_td->td.td_thr;
3239   (void)victim_thr; // Use in TRACE messages which aren't always enabled.
3240 
3241   KA_TRACE(10, ("__kmp_steal_task(enter): T#%d try to steal from T#%d: "
3242                 "task_team=%p ntasks=%d head=%u tail=%u\n",
3243                 gtid, __kmp_gtid_from_thread(victim_thr), task_team,
3244                 victim_td->td.td_deque_ntasks, victim_td->td.td_deque_head,
3245                 victim_td->td.td_deque_tail));
3246 
3247   if (TCR_4(victim_td->td.td_deque_ntasks) == 0) {
3248     KA_TRACE(10, ("__kmp_steal_task(exit #1): T#%d could not steal from T#%d: "
3249                   "task_team=%p ntasks=%d head=%u tail=%u\n",
3250                   gtid, __kmp_gtid_from_thread(victim_thr), task_team,
3251                   victim_td->td.td_deque_ntasks, victim_td->td.td_deque_head,
3252                   victim_td->td.td_deque_tail));
3253     return NULL;
3254   }
3255 
3256   __kmp_acquire_bootstrap_lock(&victim_td->td.td_deque_lock);
3257 
3258   int ntasks = TCR_4(victim_td->td.td_deque_ntasks);
3259   // Check again after we acquire the lock
3260   if (ntasks == 0) {
3261     __kmp_release_bootstrap_lock(&victim_td->td.td_deque_lock);
3262     KA_TRACE(10, ("__kmp_steal_task(exit #2): T#%d could not steal from T#%d: "
3263                   "task_team=%p ntasks=%d head=%u tail=%u\n",
3264                   gtid, __kmp_gtid_from_thread(victim_thr), task_team, ntasks,
3265                   victim_td->td.td_deque_head, victim_td->td.td_deque_tail));
3266     return NULL;
3267   }
3268 
3269   KMP_DEBUG_ASSERT(victim_td->td.td_deque != NULL);
3270   current = __kmp_threads[gtid]->th.th_current_task;
3271   taskdata = victim_td->td.td_deque[victim_td->td.td_deque_head];
3272   if (__kmp_task_is_allowed(gtid, is_constrained, taskdata, current)) {
3273     // Bump head pointer and Wrap.
3274     victim_td->td.td_deque_head =
3275         (victim_td->td.td_deque_head + 1) & TASK_DEQUE_MASK(victim_td->td);
3276   } else {
3277     if (!task_team->tt.tt_untied_task_encountered) {
3278       // The TSC does not allow stealing the victim task
3279       __kmp_release_bootstrap_lock(&victim_td->td.td_deque_lock);
3280       KA_TRACE(10, ("__kmp_steal_task(exit #3): T#%d could not steal from "
3281                     "T#%d: task_team=%p ntasks=%d head=%u tail=%u\n",
3282                     gtid, __kmp_gtid_from_thread(victim_thr), task_team, ntasks,
3283                     victim_td->td.td_deque_head, victim_td->td.td_deque_tail));
3284       return NULL;
3285     }
3286     int i;
3287     // walk through victim's deque trying to steal any task
3288     target = victim_td->td.td_deque_head;
3289     taskdata = NULL;
3290     for (i = 1; i < ntasks; ++i) {
3291       target = (target + 1) & TASK_DEQUE_MASK(victim_td->td);
3292       taskdata = victim_td->td.td_deque[target];
3293       if (__kmp_task_is_allowed(gtid, is_constrained, taskdata, current)) {
3294         break; // found victim task
3295       } else {
3296         taskdata = NULL;
3297       }
3298     }
3299     if (taskdata == NULL) {
3300       // No appropriate candidate to steal found
3301       __kmp_release_bootstrap_lock(&victim_td->td.td_deque_lock);
3302       KA_TRACE(10, ("__kmp_steal_task(exit #4): T#%d could not steal from "
3303                     "T#%d: task_team=%p ntasks=%d head=%u tail=%u\n",
3304                     gtid, __kmp_gtid_from_thread(victim_thr), task_team, ntasks,
3305                     victim_td->td.td_deque_head, victim_td->td.td_deque_tail));
3306       return NULL;
3307     }
3308     int prev = target;
3309     for (i = i + 1; i < ntasks; ++i) {
3310       // shift remaining tasks in the deque left by 1
3311       target = (target + 1) & TASK_DEQUE_MASK(victim_td->td);
3312       victim_td->td.td_deque[prev] = victim_td->td.td_deque[target];
3313       prev = target;
3314     }
3315     KMP_DEBUG_ASSERT(
3316         victim_td->td.td_deque_tail ==
3317         (kmp_uint32)((target + 1) & TASK_DEQUE_MASK(victim_td->td)));
3318     victim_td->td.td_deque_tail = target; // tail -= 1 (wrapped)
3319   }
3320   if (*thread_finished) {
3321     // We need to un-mark this victim as a finished victim.  This must be done
3322     // before releasing the lock, or else other threads (starting with the
3323     // primary thread victim) might be prematurely released from the barrier!!!
3324 #if KMP_DEBUG
3325     kmp_int32 count =
3326 #endif
3327         KMP_ATOMIC_INC(unfinished_threads);
3328     KA_TRACE(
3329         20,
3330         ("__kmp_steal_task: T#%d inc unfinished_threads to %d: task_team=%p\n",
3331          gtid, count + 1, task_team));
3332     *thread_finished = FALSE;
3333   }
3334   TCW_4(victim_td->td.td_deque_ntasks, ntasks - 1);
3335 
3336   __kmp_release_bootstrap_lock(&victim_td->td.td_deque_lock);
3337 
3338   KMP_COUNT_BLOCK(TASK_stolen);
3339   KA_TRACE(10,
3340            ("__kmp_steal_task(exit #5): T#%d stole task %p from T#%d: "
3341             "task_team=%p ntasks=%d head=%u tail=%u\n",
3342             gtid, taskdata, __kmp_gtid_from_thread(victim_thr), task_team,
3343             ntasks, victim_td->td.td_deque_head, victim_td->td.td_deque_tail));
3344 
3345   task = KMP_TASKDATA_TO_TASK(taskdata);
3346   return task;
3347 }
3348 
3349 // __kmp_execute_tasks_template: Choose and execute tasks until either the
3350 // condition is satisfied (return true) or there are none left (return false).
3351 //
3352 // final_spin is TRUE if this is the spin at the release barrier.
3353 // thread_finished indicates whether the thread is finished executing all
3354 // the tasks it has on its deque, and is at the release barrier.
3355 // spinner is the location on which to spin.
3356 // spinner == NULL means only execute a single task and return.
3357 // checker is the value to check to terminate the spin.
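// In short, each pass of the inner loop below looks for work in this order:
// the shared priority-task list, then the thread's own deque, and finally
// other threads' deques via __kmp_steal_task().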
3358 template <class C>
3359 static inline int __kmp_execute_tasks_template(
3360     kmp_info_t *thread, kmp_int32 gtid, C *flag, int final_spin,
3361     int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj),
3362     kmp_int32 is_constrained) {
3363   kmp_task_team_t *task_team = thread->th.th_task_team;
3364   kmp_thread_data_t *threads_data;
3365   kmp_task_t *task;
3366   kmp_info_t *other_thread;
3367   kmp_taskdata_t *current_task = thread->th.th_current_task;
3368   std::atomic<kmp_int32> *unfinished_threads;
3369   kmp_int32 nthreads, victim_tid = -2, use_own_tasks = 1, new_victim = 0,
3370                       tid = thread->th.th_info.ds.ds_tid;
3371 
3372   KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);
3373   KMP_DEBUG_ASSERT(thread == __kmp_threads[gtid]);
3374 
3375   if (task_team == NULL || current_task == NULL)
3376     return FALSE;
3377 
3378   KA_TRACE(15, ("__kmp_execute_tasks_template(enter): T#%d final_spin=%d "
3379                 "*thread_finished=%d\n",
3380                 gtid, final_spin, *thread_finished));
3381 
3382   thread->th.th_reap_state = KMP_NOT_SAFE_TO_REAP;
3383   threads_data = (kmp_thread_data_t *)TCR_PTR(task_team->tt.tt_threads_data);
3384 
3385   KMP_DEBUG_ASSERT(threads_data != NULL);
3386 
3387   nthreads = task_team->tt.tt_nproc;
3388   unfinished_threads = &(task_team->tt.tt_unfinished_threads);
3389   KMP_DEBUG_ASSERT(*unfinished_threads >= 0);
3390 
3391   while (1) { // Outer loop keeps trying to find tasks in case of single thread
3392     // getting tasks from target constructs
3393     while (1) { // Inner loop to find a task and execute it
3394       task = NULL;
3395       if (task_team->tt.tt_num_task_pri) { // get priority task first
3396         task = __kmp_get_priority_task(gtid, task_team, is_constrained);
3397       }
3398       if (task == NULL && use_own_tasks) { // check own queue next
3399         task = __kmp_remove_my_task(thread, gtid, task_team, is_constrained);
3400       }
3401       if ((task == NULL) && (nthreads > 1)) { // Steal a task finally
3402         int asleep = 1;
3403         use_own_tasks = 0;
3404         // Try to steal from the last place I stole from successfully.
3405         if (victim_tid == -2) { // haven't stolen anything yet
3406           victim_tid = threads_data[tid].td.td_deque_last_stolen;
3407           if (victim_tid !=
3408               -1) // if we have a last stolen from victim, get the thread
3409             other_thread = threads_data[victim_tid].td.td_thr;
3410         }
3411         if (victim_tid != -1) { // found last victim
3412           asleep = 0;
3413         } else if (!new_victim) { // no recent steals and we haven't already
3414           // used a new victim; select a random thread
3415           do { // Find a different thread to steal work from.
3416             // Pick a random thread. Initial plan was to cycle through all the
3417             // threads, and only return if we tried to steal from every thread,
3418             // and failed.  Arch says that's not such a great idea.
3419             victim_tid = __kmp_get_random(thread) % (nthreads - 1);
3420             if (victim_tid >= tid) {
3421               ++victim_tid; // Adjusts random distribution to exclude self
3422             }
3423             // Found a potential victim
3424             other_thread = threads_data[victim_tid].td.td_thr;
3425             // There is a slight chance that __kmp_enable_tasking() did not wake
3426             // up all threads waiting at the barrier.  If victim is sleeping,
3427             // then wake it up. Since we were going to pay the cache miss
3428             // penalty for referencing another thread's kmp_info_t struct
3429             // anyway,
3430             // the check shouldn't cost too much performance at this point. In
3431             // extra barrier mode, tasks do not sleep at the separate tasking
3432             // barrier, so this isn't a problem.
3433             asleep = 0;
3434             if ((__kmp_tasking_mode == tskm_task_teams) &&
3435                 (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) &&
3436                 (TCR_PTR(CCAST(void *, other_thread->th.th_sleep_loc)) !=
3437                  NULL)) {
3438               asleep = 1;
3439               __kmp_null_resume_wrapper(other_thread);
3440               // A sleeping thread should not have any tasks on its queue.
3441               // There is a slight possibility that it resumes, steals a task
3442               // from another thread, which spawns more tasks, all in the time
3443               // that it takes this thread to check => don't write an assertion
3444               // that the victim's queue is empty.  Try stealing from a
3445               // different thread.
3446             }
3447           } while (asleep);
3448         }
3449 
3450         if (!asleep) {
3451           // We have a victim to try to steal from
3452           task =
3453               __kmp_steal_task(victim_tid, gtid, task_team, unfinished_threads,
3454                                thread_finished, is_constrained);
3455         }
3456         if (task != NULL) { // set last stolen to victim
3457           if (threads_data[tid].td.td_deque_last_stolen != victim_tid) {
3458             threads_data[tid].td.td_deque_last_stolen = victim_tid;
3459             // The pre-refactored code did not try more than 1 successful new
3460             // victim, unless the last one generated more local tasks;
3461             // new_victim keeps track of this
3462             new_victim = 1;
3463           }
3464         } else { // No tasks found; unset last_stolen
3465           KMP_CHECK_UPDATE(threads_data[tid].td.td_deque_last_stolen, -1);
3466           victim_tid = -2; // no successful victim found
3467         }
3468       }
3469 
3470       if (task == NULL)
3471         break; // break out of tasking loop
3472 
3473 // Found a task; execute it
3474 #if USE_ITT_BUILD && USE_ITT_NOTIFY
3475       if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
3476         if (itt_sync_obj == NULL) { // we are at fork barrier where we could not
3477           // get the object reliably
3478           itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
3479         }
3480         __kmp_itt_task_starting(itt_sync_obj);
3481       }
3482 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
3483       __kmp_invoke_task(gtid, task, current_task);
3484 #if USE_ITT_BUILD
3485       if (itt_sync_obj != NULL)
3486         __kmp_itt_task_finished(itt_sync_obj);
3487 #endif /* USE_ITT_BUILD */
3488       // If this thread is only partway through the barrier and the condition is
3489       // met, then return now, so that the barrier gather/release pattern can
3490       // proceed. If this thread is in the last spin loop in the barrier,
3491       // waiting to be released, we know that the termination condition will not
3492       // be satisfied, so don't waste any cycles checking it.
3493       if (flag == NULL || (!final_spin && flag->done_check())) {
3494         KA_TRACE(
3495             15,
3496             ("__kmp_execute_tasks_template: T#%d spin condition satisfied\n",
3497              gtid));
3498         return TRUE;
3499       }
3500       if (thread->th.th_task_team == NULL) {
3501         break;
3502       }
3503       KMP_YIELD(__kmp_library == library_throughput); // Yield before next task
3504       // If execution of a stolen task results in more tasks being placed on our
3505       // run queue, reset use_own_tasks
3506       if (!use_own_tasks && TCR_4(threads_data[tid].td.td_deque_ntasks) != 0) {
3507         KA_TRACE(20, ("__kmp_execute_tasks_template: T#%d stolen task spawned "
3508                       "other tasks, restart\n",
3509                       gtid));
3510         use_own_tasks = 1;
3511         new_victim = 0;
3512       }
3513     }
3514 
3515     // The task source has been exhausted. If in final spin loop of barrier,
3516     // check if termination condition is satisfied. The work queue may be empty
3517     // but there might be proxy tasks still executing.
3518     if (final_spin &&
3519         KMP_ATOMIC_LD_ACQ(&current_task->td_incomplete_child_tasks) == 0) {
3520       // First, decrement the #unfinished threads, if that has not already been
3521       // done.  This decrement might be to the spin location, and result in the
3522       // termination condition being satisfied.
3523       if (!*thread_finished) {
3524 #if KMP_DEBUG
3525         kmp_int32 count = -1 +
3526 #endif
3527             KMP_ATOMIC_DEC(unfinished_threads);
3528         KA_TRACE(20, ("__kmp_execute_tasks_template: T#%d dec "
3529                       "unfinished_threads to %d task_team=%p\n",
3530                       gtid, count, task_team));
3531         *thread_finished = TRUE;
3532       }
3533 
3534       // It is now unsafe to reference thread->th.th_team !!!
3535       // Decrementing task_team->tt.tt_unfinished_threads can allow the primary
3536       // thread to pass through the barrier, where it might reset each thread's
3537       // th.th_team field for the next parallel region. If we can steal more
3538       // work, we know that this has not happened yet.
3539       if (flag != NULL && flag->done_check()) {
3540         KA_TRACE(
3541             15,
3542             ("__kmp_execute_tasks_template: T#%d spin condition satisfied\n",
3543              gtid));
3544         return TRUE;
3545       }
3546     }
3547 
3548     // If this thread's task team is NULL, primary thread has recognized that
3549     // there are no more tasks; bail out
3550     if (thread->th.th_task_team == NULL) {
3551       KA_TRACE(15,
3552                ("__kmp_execute_tasks_template: T#%d no more tasks\n", gtid));
3553       return FALSE;
3554     }
3555 
3556     // Check the flag again to see if it is already done, to avoid being trapped
3557     // in an infinite loop when an if0 task depends on a hidden helper task
3558     // outside any parallel region. Detached tasks are not impacted in this case
3559     // because the only thread executing this function has to execute the proxy
3560     // task so it is in another code path that has the same check.
3561     if (flag == NULL || (!final_spin && flag->done_check())) {
3562       KA_TRACE(15,
3563                ("__kmp_execute_tasks_template: T#%d spin condition satisfied\n",
3564                 gtid));
3565       return TRUE;
3566     }
3567 
3568     // We could be getting tasks from target constructs; if this is the only
3569     // thread, keep trying to execute tasks from own queue
3570     if (nthreads == 1 &&
3571         KMP_ATOMIC_LD_ACQ(&current_task->td_incomplete_child_tasks))
3572       use_own_tasks = 1;
3573     else {
3574       KA_TRACE(15,
3575                ("__kmp_execute_tasks_template: T#%d can't find work\n", gtid));
3576       return FALSE;
3577     }
3578   }
3579 }
3580 
3581 template <bool C, bool S>
3582 int __kmp_execute_tasks_32(
3583     kmp_info_t *thread, kmp_int32 gtid, kmp_flag_32<C, S> *flag, int final_spin,
3584     int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj),
3585     kmp_int32 is_constrained) {
3586   return __kmp_execute_tasks_template(
3587       thread, gtid, flag, final_spin,
3588       thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), is_constrained);
3589 }
3590 
3591 template <bool C, bool S>
3592 int __kmp_execute_tasks_64(
3593     kmp_info_t *thread, kmp_int32 gtid, kmp_flag_64<C, S> *flag, int final_spin,
3594     int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj),
3595     kmp_int32 is_constrained) {
3596   return __kmp_execute_tasks_template(
3597       thread, gtid, flag, final_spin,
3598       thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), is_constrained);
3599 }
3600 
3601 template <bool C, bool S>
3602 int __kmp_atomic_execute_tasks_64(
3603     kmp_info_t *thread, kmp_int32 gtid, kmp_atomic_flag_64<C, S> *flag,
3604     int final_spin, int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj),
3605     kmp_int32 is_constrained) {
3606   return __kmp_execute_tasks_template(
3607       thread, gtid, flag, final_spin,
3608       thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), is_constrained);
3609 }
3610 
3611 int __kmp_execute_tasks_oncore(
3612     kmp_info_t *thread, kmp_int32 gtid, kmp_flag_oncore *flag, int final_spin,
3613     int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj),
3614     kmp_int32 is_constrained) {
3615   return __kmp_execute_tasks_template(
3616       thread, gtid, flag, final_spin,
3617       thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), is_constrained);
3618 }
3619 
3620 template int
3621 __kmp_execute_tasks_32<false, false>(kmp_info_t *, kmp_int32,
3622                                      kmp_flag_32<false, false> *, int,
3623                                      int *USE_ITT_BUILD_ARG(void *), kmp_int32);
3624 
3625 template int __kmp_execute_tasks_64<false, true>(kmp_info_t *, kmp_int32,
3626                                                  kmp_flag_64<false, true> *,
3627                                                  int,
3628                                                  int *USE_ITT_BUILD_ARG(void *),
3629                                                  kmp_int32);
3630 
3631 template int __kmp_execute_tasks_64<true, false>(kmp_info_t *, kmp_int32,
3632                                                  kmp_flag_64<true, false> *,
3633                                                  int,
3634                                                  int *USE_ITT_BUILD_ARG(void *),
3635                                                  kmp_int32);
3636 
3637 template int __kmp_atomic_execute_tasks_64<false, true>(
3638     kmp_info_t *, kmp_int32, kmp_atomic_flag_64<false, true> *, int,
3639     int *USE_ITT_BUILD_ARG(void *), kmp_int32);
3640 
3641 template int __kmp_atomic_execute_tasks_64<true, false>(
3642     kmp_info_t *, kmp_int32, kmp_atomic_flag_64<true, false> *, int,
3643     int *USE_ITT_BUILD_ARG(void *), kmp_int32);
3644 
3645 // __kmp_enable_tasking: Allocate task team and resume threads sleeping at the
3646 // next barrier so they can assist in executing enqueued tasks.
3647 // First thread in allocates the task team atomically.
3648 static void __kmp_enable_tasking(kmp_task_team_t *task_team,
3649                                  kmp_info_t *this_thr) {
3650   kmp_thread_data_t *threads_data;
3651   int nthreads, i, is_init_thread;
3652 
3653   KA_TRACE(10, ("__kmp_enable_tasking(enter): T#%d\n",
3654                 __kmp_gtid_from_thread(this_thr)));
3655 
3656   KMP_DEBUG_ASSERT(task_team != NULL);
3657   KMP_DEBUG_ASSERT(this_thr->th.th_team != NULL);
3658 
3659   nthreads = task_team->tt.tt_nproc;
3660   KMP_DEBUG_ASSERT(nthreads > 0);
3661   KMP_DEBUG_ASSERT(nthreads == this_thr->th.th_team->t.t_nproc);
3662 
3663   // Allocate or increase the size of threads_data if necessary
3664   is_init_thread = __kmp_realloc_task_threads_data(this_thr, task_team);
3665 
3666   if (!is_init_thread) {
3667     // Some other thread already set up the array.
3668     KA_TRACE(
3669         20,
3670         ("__kmp_enable_tasking(exit): T#%d: threads array already set up.\n",
3671          __kmp_gtid_from_thread(this_thr)));
3672     return;
3673   }
3674   threads_data = (kmp_thread_data_t *)TCR_PTR(task_team->tt.tt_threads_data);
3675   KMP_DEBUG_ASSERT(threads_data != NULL);
3676 
3677   if (__kmp_tasking_mode == tskm_task_teams &&
3678       (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME)) {
3679     // Release any threads sleeping at the barrier, so that they can steal
3680     // tasks and execute them.  In extra barrier mode, tasks do not sleep
3681     // at the separate tasking barrier, so this isn't a problem.
3682     for (i = 0; i < nthreads; i++) {
3683       void *sleep_loc;
3684       kmp_info_t *thread = threads_data[i].td.td_thr;
3685 
3686       if (i == this_thr->th.th_info.ds.ds_tid) {
3687         continue;
3688       }
3689       // Since we haven't locked the thread's suspend mutex lock at this
3690       // point, there is a small window where a thread might be putting
3691       // itself to sleep, but hasn't set the th_sleep_loc field yet.
3692       // To work around this, __kmp_execute_tasks_template() periodically checks
3693       // to see if other threads are sleeping (using the same random mechanism that
3694       // is used for task stealing) and awakens them if they are.
3695       if ((sleep_loc = TCR_PTR(CCAST(void *, thread->th.th_sleep_loc))) !=
3696           NULL) {
3697         KF_TRACE(50, ("__kmp_enable_tasking: T#%d waking up thread T#%d\n",
3698                       __kmp_gtid_from_thread(this_thr),
3699                       __kmp_gtid_from_thread(thread)));
3700         __kmp_null_resume_wrapper(thread);
3701       } else {
3702         KF_TRACE(50, ("__kmp_enable_tasking: T#%d don't wake up thread T#%d\n",
3703                       __kmp_gtid_from_thread(this_thr),
3704                       __kmp_gtid_from_thread(thread)));
3705       }
3706     }
3707   }
3708 
3709   KA_TRACE(10, ("__kmp_enable_tasking(exit): T#%d\n",
3710                 __kmp_gtid_from_thread(this_thr)));
3711 }
3712 
3713 /* // TODO: Check the comment consistency
3714  * Utility routines for "task teams".  A task team (kmp_task_team_t) is kind of
3715  * like a shadow of the kmp_team_t data struct, with a different lifetime.
3716  * After a child thread checks into a barrier and calls __kmp_release() from
3717  * the particular variant of __kmp_<barrier_kind>_barrier_gather(), it can no
3718  * longer assume that the kmp_team_t structure is intact (at any moment, the
3719  * primary thread may exit the barrier code and free the team data structure,
3720  * and return the threads to the thread pool).
3721  *
3722  * This does not work with the tasking code, as the thread is still
3723  * expected to participate in the execution of any tasks that may have been
3724  * spawned by a member of the team, and the thread still needs access to
3725  * each thread in the team, so that it can steal work from it.
3726  *
3727  * Enter the existence of the kmp_task_team_t struct.  It employs a reference
3728  * counting mechanism, and is allocated by the primary thread before calling
3729  * __kmp_<barrier_kind>_release, and then is release by the last thread to
3730  * exit __kmp_<barrier_kind>_release at the next barrier.  I.e. the lifetimes
3731  * of the kmp_task_team_t structs for consecutive barriers can overlap
3732  * (and will, unless the primary thread is the last thread to exit the barrier
3733  * release phase, which is not typical). The existence of such a struct is
3734  * useful outside the context of tasking.
3735  *
3736  * We currently use the existence of the threads array as an indicator that
3737  * tasks were spawned since the last barrier.  If the structure is to be
3738  * useful outside the context of tasking, then this will have to change, but
3739  * not setting the field minimizes the performance impact of tasking on
3740  * barriers, when no explicit tasks were spawned (pushed, actually).
3741  */
3742 
3743 static kmp_task_team_t *__kmp_free_task_teams =
3744     NULL; // Free list for task_team data structures
3745 // Lock for task team data structures
3746 kmp_bootstrap_lock_t __kmp_task_team_lock =
3747     KMP_BOOTSTRAP_LOCK_INITIALIZER(__kmp_task_team_lock);
3748 
3749 // __kmp_alloc_task_deque:
3750 // Allocates a task deque for a particular thread, and initializes the necessary
3751 // data structures relating to the deque.  This only happens once per thread
3752 // per task team since task teams are recycled. No lock is needed during
3753 // allocation since each thread allocates its own deque.
3754 static void __kmp_alloc_task_deque(kmp_info_t *thread,
3755                                    kmp_thread_data_t *thread_data) {
3756   __kmp_init_bootstrap_lock(&thread_data->td.td_deque_lock);
3757   KMP_DEBUG_ASSERT(thread_data->td.td_deque == NULL);
3758 
3759   // Initialize last stolen task field to "none"
3760   thread_data->td.td_deque_last_stolen = -1;
3761 
3762   KMP_DEBUG_ASSERT(TCR_4(thread_data->td.td_deque_ntasks) == 0);
3763   KMP_DEBUG_ASSERT(thread_data->td.td_deque_head == 0);
3764   KMP_DEBUG_ASSERT(thread_data->td.td_deque_tail == 0);
3765 
3766   KE_TRACE(
3767       10,
3768       ("__kmp_alloc_task_deque: T#%d allocating deque[%d] for thread_data %p\n",
3769        __kmp_gtid_from_thread(thread), INITIAL_TASK_DEQUE_SIZE, thread_data));
3770   // Allocate space for task deque, and zero the deque
3771   // Cannot use __kmp_thread_calloc() because threads not around for
3772   // kmp_reap_task_team( ).
3773   thread_data->td.td_deque = (kmp_taskdata_t **)__kmp_allocate(
3774       INITIAL_TASK_DEQUE_SIZE * sizeof(kmp_taskdata_t *));
3775   thread_data->td.td_deque_size = INITIAL_TASK_DEQUE_SIZE;
3776 }
3777 
3778 // __kmp_free_task_deque:
3779 // Deallocates a task deque for a particular thread. Happens at library
3780 // deallocation so don't need to reset all thread data fields.
3781 static void __kmp_free_task_deque(kmp_thread_data_t *thread_data) {
3782   if (thread_data->td.td_deque != NULL) {
3783     __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);
3784     TCW_4(thread_data->td.td_deque_ntasks, 0);
3785     __kmp_free(thread_data->td.td_deque);
3786     thread_data->td.td_deque = NULL;
3787     __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
3788   }
3789 
3790 #ifdef BUILD_TIED_TASK_STACK
3791   // GEH: Figure out what to do here for td_susp_tied_tasks
3792   if (thread_data->td.td_susp_tied_tasks.ts_entries != TASK_STACK_EMPTY) {
3793     __kmp_free_task_stack(__kmp_thread_from_gtid(gtid), thread_data);
3794   }
3795 #endif // BUILD_TIED_TASK_STACK
3796 }
3797 
3798 // __kmp_realloc_task_threads_data:
3799 // Allocates a threads_data array for a task team, either by allocating an
3800 // initial array or enlarging an existing array.  Only the first thread to get
3801 // the lock allocates or enlarges the array and re-initializes the array elements.
3802 // That thread returns "TRUE", the rest return "FALSE".
3803 // Assumes that the new array size is given by task_team -> tt.tt_nproc.
3804 // The current size is given by task_team -> tt.tt_max_threads.
3805 static int __kmp_realloc_task_threads_data(kmp_info_t *thread,
3806                                            kmp_task_team_t *task_team) {
3807   kmp_thread_data_t **threads_data_p;
3808   kmp_int32 nthreads, maxthreads;
3809   int is_init_thread = FALSE;
3810 
3811   if (TCR_4(task_team->tt.tt_found_tasks)) {
3812     // Already reallocated and initialized.
3813     return FALSE;
3814   }
3815 
3816   threads_data_p = &task_team->tt.tt_threads_data;
3817   nthreads = task_team->tt.tt_nproc;
3818   maxthreads = task_team->tt.tt_max_threads;
3819 
3820   // All threads must lock when they encounter the first task of the implicit
3821   // task region to make sure threads_data fields are (re)initialized before
3822   // used.
3823   __kmp_acquire_bootstrap_lock(&task_team->tt.tt_threads_lock);
3824 
3825   if (!TCR_4(task_team->tt.tt_found_tasks)) {
3826     // first thread to enable tasking
3827     kmp_team_t *team = thread->th.th_team;
3828     int i;
3829 
3830     is_init_thread = TRUE;
3831     if (maxthreads < nthreads) {
3832 
3833       if (*threads_data_p != NULL) {
3834         kmp_thread_data_t *old_data = *threads_data_p;
3835         kmp_thread_data_t *new_data = NULL;
3836 
3837         KE_TRACE(
3838             10,
3839             ("__kmp_realloc_task_threads_data: T#%d reallocating "
3840              "threads data for task_team %p, new_size = %d, old_size = %d\n",
3841              __kmp_gtid_from_thread(thread), task_team, nthreads, maxthreads));
3842         // Reallocate threads_data to have more elements than current array
3843         // Cannot use __kmp_thread_realloc() because threads not around for
3844         // kmp_reap_task_team( ).  Note all new array entries are initialized
3845         // to zero by __kmp_allocate().
3846         new_data = (kmp_thread_data_t *)__kmp_allocate(
3847             nthreads * sizeof(kmp_thread_data_t));
3848         // copy old data to new data
3849         KMP_MEMCPY_S((void *)new_data, nthreads * sizeof(kmp_thread_data_t),
3850                      (void *)old_data, maxthreads * sizeof(kmp_thread_data_t));
3851 
3852 #ifdef BUILD_TIED_TASK_STACK
3853         // GEH: Figure out if this is the right thing to do
3854         for (i = maxthreads; i < nthreads; i++) {
3855           kmp_thread_data_t *thread_data = &(*threads_data_p)[i];
3856           __kmp_init_task_stack(__kmp_gtid_from_thread(thread), thread_data);
3857         }
3858 #endif // BUILD_TIED_TASK_STACK
3859        // Install the new data and free the old data
3860         (*threads_data_p) = new_data;
3861         __kmp_free(old_data);
3862       } else {
3863         KE_TRACE(10, ("__kmp_realloc_task_threads_data: T#%d allocating "
3864                       "threads data for task_team %p, size = %d\n",
3865                       __kmp_gtid_from_thread(thread), task_team, nthreads));
3866         // Make the initial allocate for threads_data array, and zero entries
3867         // Cannot use __kmp_thread_calloc() because threads not around for
3868         // kmp_reap_task_team( ).
3869         *threads_data_p = (kmp_thread_data_t *)__kmp_allocate(
3870             nthreads * sizeof(kmp_thread_data_t));
3871 #ifdef BUILD_TIED_TASK_STACK
3872         // GEH: Figure out if this is the right thing to do
3873         for (i = 0; i < nthreads; i++) {
3874           kmp_thread_data_t *thread_data = &(*threads_data_p)[i];
3875           __kmp_init_task_stack(__kmp_gtid_from_thread(thread), thread_data);
3876         }
3877 #endif // BUILD_TIED_TASK_STACK
3878       }
3879       task_team->tt.tt_max_threads = nthreads;
3880     } else {
3881       // If array has (more than) enough elements, go ahead and use it
3882       KMP_DEBUG_ASSERT(*threads_data_p != NULL);
3883     }
3884 
3885     // initialize threads_data pointers back to thread_info structures
3886     for (i = 0; i < nthreads; i++) {
3887       kmp_thread_data_t *thread_data = &(*threads_data_p)[i];
3888       thread_data->td.td_thr = team->t.t_threads[i];
3889 
3890       if (thread_data->td.td_deque_last_stolen >= nthreads) {
3891         // The last stolen field survives across teams / barrier, and the number
3892         // of threads may have changed.  It's possible (likely?) that a new
3893         // parallel region will exhibit the same behavior as the previous region.
3894         thread_data->td.td_deque_last_stolen = -1;
3895       }
3896     }
3897 
3898     KMP_MB();
3899     TCW_SYNC_4(task_team->tt.tt_found_tasks, TRUE);
3900   }
3901 
3902   __kmp_release_bootstrap_lock(&task_team->tt.tt_threads_lock);
3903   return is_init_thread;
3904 }
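
// Illustrative sketch (kept out of the build): the function above follows a
// check / lock / re-check pattern so that only the first thread to encounter a
// task performs the (re)allocation and every other thread returns FALSE. A
// reduced analogue in standard C++; toy_init_once and its globals are
// illustrative names, not runtime state.
#if 0
#include <atomic>
#include <mutex>

static std::atomic<bool> toy_found_tasks{false};
static std::mutex toy_threads_lock;

// Returns true only for the single thread that performed the initialization.
static bool toy_init_once(void (*do_init)()) {
  if (toy_found_tasks.load(std::memory_order_acquire))
    return false; // already initialized, cheap fast path
  std::lock_guard<std::mutex> guard(toy_threads_lock);
  if (toy_found_tasks.load(std::memory_order_relaxed))
    return false; // somebody else won the race while we waited for the lock
  do_init();
  toy_found_tasks.store(true, std::memory_order_release);
  return true;
}
#endif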
3905 
3906 // __kmp_free_task_threads_data:
3907 // Deallocates a threads_data array for a task team, including any attached
3908 // tasking deques.  Only occurs at library shutdown.
3909 static void __kmp_free_task_threads_data(kmp_task_team_t *task_team) {
3910   __kmp_acquire_bootstrap_lock(&task_team->tt.tt_threads_lock);
3911   if (task_team->tt.tt_threads_data != NULL) {
3912     int i;
3913     for (i = 0; i < task_team->tt.tt_max_threads; i++) {
3914       __kmp_free_task_deque(&task_team->tt.tt_threads_data[i]);
3915     }
3916     __kmp_free(task_team->tt.tt_threads_data);
3917     task_team->tt.tt_threads_data = NULL;
3918   }
3919   __kmp_release_bootstrap_lock(&task_team->tt.tt_threads_lock);
3920 }
3921 
3922 // __kmp_free_task_pri_list:
3923 // Deallocates tasking deques used for priority tasks.
3924 // Only occurs at library shutdown.
3925 static void __kmp_free_task_pri_list(kmp_task_team_t *task_team) {
3926   __kmp_acquire_bootstrap_lock(&task_team->tt.tt_task_pri_lock);
3927   if (task_team->tt.tt_task_pri_list != NULL) {
3928     kmp_task_pri_t *list = task_team->tt.tt_task_pri_list;
3929     while (list != NULL) {
3930       kmp_task_pri_t *next = list->next;
3931       __kmp_free_task_deque(&list->td);
3932       __kmp_free(list);
3933       list = next;
3934     }
3935     task_team->tt.tt_task_pri_list = NULL;
3936   }
3937   __kmp_release_bootstrap_lock(&task_team->tt.tt_task_pri_lock);
3938 }
3939 
3940 static inline void __kmp_task_team_init(kmp_task_team_t *task_team,
3941                                         kmp_team_t *team) {
3942   int team_nth = team->t.t_nproc;
3943   // Only need to init if the task team isn't active or the team size changed
3944   if (!task_team->tt.tt_active || team_nth != task_team->tt.tt_nproc) {
3945     TCW_4(task_team->tt.tt_found_tasks, FALSE);
3946     TCW_4(task_team->tt.tt_found_proxy_tasks, FALSE);
3947     TCW_4(task_team->tt.tt_hidden_helper_task_encountered, FALSE);
3948     TCW_4(task_team->tt.tt_nproc, team_nth);
3949     KMP_ATOMIC_ST_REL(&task_team->tt.tt_unfinished_threads, team_nth);
3950     TCW_4(task_team->tt.tt_active, TRUE);
3951   }
3952 }
3953 
3954 // __kmp_allocate_task_team:
3955 // Allocates a task team associated with a specific team, taking it from
3956 // the global task team free list if possible.  Also initializes data
3957 // structures.
3958 static kmp_task_team_t *__kmp_allocate_task_team(kmp_info_t *thread,
3959                                                  kmp_team_t *team) {
3960   kmp_task_team_t *task_team = NULL;
3961 
3962   KA_TRACE(20, ("__kmp_allocate_task_team: T#%d entering; team = %p\n",
3963                 (thread ? __kmp_gtid_from_thread(thread) : -1), team));
3964 
3965   if (TCR_PTR(__kmp_free_task_teams) != NULL) {
3966     // Take a task team from the task team pool
3967     __kmp_acquire_bootstrap_lock(&__kmp_task_team_lock);
3968     if (__kmp_free_task_teams != NULL) {
3969       task_team = __kmp_free_task_teams;
3970       TCW_PTR(__kmp_free_task_teams, task_team->tt.tt_next);
3971       task_team->tt.tt_next = NULL;
3972     }
3973     __kmp_release_bootstrap_lock(&__kmp_task_team_lock);
3974   }
3975 
3976   if (task_team == NULL) {
3977     KE_TRACE(10, ("__kmp_allocate_task_team: T#%d allocating "
3978                   "task team for team %p\n",
3979                   __kmp_gtid_from_thread(thread), team));
3980     // Allocate a new task team if one is not available. Cannot use
3981     // __kmp_thread_malloc because threads not around for kmp_reap_task_team.
3982     task_team = (kmp_task_team_t *)__kmp_allocate(sizeof(kmp_task_team_t));
3983     __kmp_init_bootstrap_lock(&task_team->tt.tt_threads_lock);
3984     __kmp_init_bootstrap_lock(&task_team->tt.tt_task_pri_lock);
3985 #if USE_ITT_BUILD && USE_ITT_NOTIFY && KMP_DEBUG
3986     // suppress race condition detection on synchronization flags in debug mode;
3987     // this helps to analyze library internals by eliminating false positives
3988     __itt_suppress_mark_range(
3989         __itt_suppress_range, __itt_suppress_threading_errors,
3990         &task_team->tt.tt_found_tasks, sizeof(task_team->tt.tt_found_tasks));
3991     __itt_suppress_mark_range(__itt_suppress_range,
3992                               __itt_suppress_threading_errors,
3993                               CCAST(kmp_uint32 *, &task_team->tt.tt_active),
3994                               sizeof(task_team->tt.tt_active));
3995 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY && KMP_DEBUG */
3996     // Note: __kmp_allocate zeroes the returned memory, otherwise we would need:
3997     // task_team->tt.tt_threads_data = NULL;
3998     // task_team->tt.tt_max_threads = 0;
3999     // task_team->tt.tt_next = NULL;
4000   }
4001 
4002   __kmp_task_team_init(task_team, team);
4003 
4004   KA_TRACE(20, ("__kmp_allocate_task_team: T#%d exiting; task_team = %p "
4005                 "unfinished_threads init'd to %d\n",
4006                 (thread ? __kmp_gtid_from_thread(thread) : -1), task_team,
4007                 KMP_ATOMIC_LD_RLX(&task_team->tt.tt_unfinished_threads)));
4008   return task_team;
4009 }
4010 
4011 // __kmp_free_task_team:
4012 // Frees the task team associated with a specific thread, and adds it
4013 // to the global task team free list.
4014 void __kmp_free_task_team(kmp_info_t *thread, kmp_task_team_t *task_team) {
4015   KA_TRACE(20, ("__kmp_free_task_team: T#%d task_team = %p\n",
4016                 thread ? __kmp_gtid_from_thread(thread) : -1, task_team));
4017 
4018   // Put task team back on free list
4019   __kmp_acquire_bootstrap_lock(&__kmp_task_team_lock);
4020 
4021   KMP_DEBUG_ASSERT(task_team->tt.tt_next == NULL);
4022   task_team->tt.tt_next = __kmp_free_task_teams;
4023   TCW_PTR(__kmp_free_task_teams, task_team);
4024 
4025   __kmp_release_bootstrap_lock(&__kmp_task_team_lock);
4026 }
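
// Illustrative sketch (kept out of the build): the allocate/free pair above is
// a singly linked free list protected by a lock, so task team structs are
// recycled instead of going back to the heap at every barrier. Reduced
// standard C++ analogue; ToyNode and the toy_* functions are illustrative.
#if 0
#include <mutex>

struct ToyNode {
  ToyNode *next = nullptr;
};

static ToyNode *toy_free_list = nullptr;
static std::mutex toy_free_list_lock;

static ToyNode *toy_acquire() {
  std::lock_guard<std::mutex> guard(toy_free_list_lock);
  if (ToyNode *n = toy_free_list) {
    toy_free_list = n->next;
    n->next = nullptr;
    return n; // reuse a recycled node
  }
  return new ToyNode(); // otherwise fall back to a fresh allocation
}

static void toy_release(ToyNode *n) {
  std::lock_guard<std::mutex> guard(toy_free_list_lock);
  n->next = toy_free_list;
  toy_free_list = n;
}
#endif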
4027 
4028 // __kmp_reap_task_teams:
4029 // Free all the task teams on the task team free list.
4030 // Should only be done during library shutdown.
4031 // Cannot do anything that needs a thread structure or gtid since they are
4032 // already gone.
4033 void __kmp_reap_task_teams(void) {
4034   kmp_task_team_t *task_team;
4035 
4036   if (TCR_PTR(__kmp_free_task_teams) != NULL) {
4037     // Free all task_teams on the free list
4038     __kmp_acquire_bootstrap_lock(&__kmp_task_team_lock);
4039     while ((task_team = __kmp_free_task_teams) != NULL) {
4040       __kmp_free_task_teams = task_team->tt.tt_next;
4041       task_team->tt.tt_next = NULL;
4042 
4043       // Free threads_data if necessary
4044       if (task_team->tt.tt_threads_data != NULL) {
4045         __kmp_free_task_threads_data(task_team);
4046       }
4047       if (task_team->tt.tt_task_pri_list != NULL) {
4048         __kmp_free_task_pri_list(task_team);
4049       }
4050       __kmp_free(task_team);
4051     }
4052     __kmp_release_bootstrap_lock(&__kmp_task_team_lock);
4053   }
4054 }
4055 
4056 // View the array of two task team pointers as a pair of pointers:
4057 //  1) a single task_team pointer
4058 //  2) next pointer for stack
4059 // Serial teams can create a stack of task teams for nested serial teams.
4060 void __kmp_push_task_team_node(kmp_info_t *thread, kmp_team_t *team) {
4061   KMP_DEBUG_ASSERT(team->t.t_nproc == 1);
4062   kmp_task_team_list_t *current =
4063       (kmp_task_team_list_t *)(&team->t.t_task_team[0]);
4064   kmp_task_team_list_t *node =
4065       (kmp_task_team_list_t *)__kmp_allocate(sizeof(kmp_task_team_list_t));
4066   node->task_team = current->task_team;
4067   node->next = current->next;
4068   thread->th.th_task_team = current->task_team = NULL;
4069   current->next = node;
4070 }
4071 
4072 // Serial team pops a task team off the stack
4073 void __kmp_pop_task_team_node(kmp_info_t *thread, kmp_team_t *team) {
4074   KMP_DEBUG_ASSERT(team->t.t_nproc == 1);
4075   kmp_task_team_list_t *current =
4076       (kmp_task_team_list_t *)(&team->t.t_task_team[0]);
4077   if (current->task_team) {
4078     __kmp_free_task_team(thread, current->task_team);
4079   }
4080   kmp_task_team_list_t *next = current->next;
4081   if (next) {
4082     current->task_team = next->task_team;
4083     current->next = next->next;
4084     KMP_DEBUG_ASSERT(next != current);
4085     __kmp_free(next);
4086     thread->th.th_task_team = current->task_team;
4087   }
4088 }
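
// Illustrative sketch (kept out of the build): the push/pop pair above
// reinterprets the fixed two-pointer array in the team as the head of a linked
// stack, {current task team, next node}, so nested serial regions each get
// their own task team. Reduced standard C++ analogue; the Toy* names are
// illustrative.
#if 0
#include <cassert>

struct ToyStackNode {
  void *task_team = nullptr;
  ToyStackNode *next = nullptr;
};

// The team owns a fixed header node; pushing saves its current contents into a
// freshly allocated node and clears the header for the nested region.
static void toy_push(ToyStackNode &head) {
  ToyStackNode *node = new ToyStackNode(head); // copy {task_team, next}
  head.task_team = nullptr;
  head.next = node;
}

static void toy_pop(ToyStackNode &head) {
  ToyStackNode *node = head.next;
  assert(node && "pop without matching push");
  head.task_team = node->task_team;
  head.next = node->next;
  delete node;
}
#endif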
4089 
4090 // __kmp_wait_to_unref_task_teams:
4091 // Some threads could still be in the fork barrier release code, possibly
4092 // trying to steal tasks.  Wait for each thread to unreference its task team.
4093 void __kmp_wait_to_unref_task_teams(void) {
4094   kmp_info_t *thread;
4095   kmp_uint32 spins;
4096   kmp_uint64 time;
4097   int done;
4098 
4099   KMP_INIT_YIELD(spins);
4100   KMP_INIT_BACKOFF(time);
4101 
4102   for (;;) {
4103     done = TRUE;
4104 
4105     // TODO: GEH - this may be wrong because some sync would be necessary
4106     // in case threads are added to the pool during the traversal. Need to
4107     // verify that the lock for the thread pool is held when calling this routine.
4108     for (thread = CCAST(kmp_info_t *, __kmp_thread_pool); thread != NULL;
4109          thread = thread->th.th_next_pool) {
4110 #if KMP_OS_WINDOWS
4111       DWORD exit_val;
4112 #endif
4113       if (TCR_PTR(thread->th.th_task_team) == NULL) {
4114         KA_TRACE(10, ("__kmp_wait_to_unref_task_team: T#%d task_team == NULL\n",
4115                       __kmp_gtid_from_thread(thread)));
4116         continue;
4117       }
4118 #if KMP_OS_WINDOWS
4119       // TODO: GEH - add this check for Linux* OS / OS X* as well?
4120       if (!__kmp_is_thread_alive(thread, &exit_val)) {
4121         thread->th.th_task_team = NULL;
4122         continue;
4123       }
4124 #endif
4125 
4126       done = FALSE; // Because th_task_team pointer is not NULL for this thread
4127 
4128       KA_TRACE(10, ("__kmp_wait_to_unref_task_team: Waiting for T#%d to "
4129                     "unreference task_team\n",
4130                     __kmp_gtid_from_thread(thread)));
4131 
4132       if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
4133         void *sleep_loc;
4134         // If the thread is sleeping, awaken it.
4135         if ((sleep_loc = TCR_PTR(CCAST(void *, thread->th.th_sleep_loc))) !=
4136             NULL) {
4137           KA_TRACE(
4138               10,
4139               ("__kmp_wait_to_unref_task_team: T#%d waking up thread T#%d\n",
4140                __kmp_gtid_from_thread(thread), __kmp_gtid_from_thread(thread)));
4141           __kmp_null_resume_wrapper(thread);
4142         }
4143       }
4144     }
4145     if (done) {
4146       break;
4147     }
4148 
4149     // If oversubscribed or have waited a bit, yield.
4150     KMP_YIELD_OVERSUB_ELSE_SPIN(spins, time);
4151   }
4152 }
4153 
4154 // __kmp_task_team_setup:  Create a task_team for the current team, but use
4155 // an already created, unused one if it already exists.
4156 void __kmp_task_team_setup(kmp_info_t *this_thr, kmp_team_t *team) {
4157   KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);
4158 
4159   // For the serial and root teams, set up the first task team pointer to point
4160   // to the task team. The other pointer is a stack of task teams from previous
4161   // serial levels.
4162   if (team == this_thr->th.th_serial_team ||
4163       team == this_thr->th.th_root->r.r_root_team) {
4164     KMP_DEBUG_ASSERT(team->t.t_nproc == 1);
4165     if (team->t.t_task_team[0] == NULL) {
4166       team->t.t_task_team[0] = __kmp_allocate_task_team(this_thr, team);
4167       KA_TRACE(
4168           20, ("__kmp_task_team_setup: Primary T#%d created new task_team %p"
4169                " for serial/root team %p\n",
4170                __kmp_gtid_from_thread(this_thr), team->t.t_task_team[0], team));
4171 
4172     } else
4173       __kmp_task_team_init(team->t.t_task_team[0], team);
4174     return;
4175   }
4176 
4177   // If this task_team hasn't been created yet, allocate it. It will be used in
4178   // the region after the next.
4179   // If it exists, it is the current task team and shouldn't be touched yet as
4180   // it may still be in use.
4181   if (team->t.t_task_team[this_thr->th.th_task_state] == NULL) {
4182     team->t.t_task_team[this_thr->th.th_task_state] =
4183         __kmp_allocate_task_team(this_thr, team);
4184     KA_TRACE(20, ("__kmp_task_team_setup: Primary T#%d created new task_team %p"
4185                   " for team %d at parity=%d\n",
4186                   __kmp_gtid_from_thread(this_thr),
4187                   team->t.t_task_team[this_thr->th.th_task_state], team->t.t_id,
4188                   this_thr->th.th_task_state));
4189   }
4190 
4191   // After threads exit the release, they will call sync, and then point to this
4192   // other task_team; make sure it is allocated and properly initialized. As
4193   // threads spin in the barrier release phase, they will continue to use the
4194   // previous task_team struct(above), until they receive the signal to stop
4195   // checking for tasks (they can't safely reference the kmp_team_t struct,
4196   // which could be reallocated by the primary thread).
4197   int other_team = 1 - this_thr->th.th_task_state;
4198   KMP_DEBUG_ASSERT(other_team >= 0 && other_team < 2);
4199   if (team->t.t_task_team[other_team] == NULL) { // setup other team as well
4200     team->t.t_task_team[other_team] = __kmp_allocate_task_team(this_thr, team);
4201     KA_TRACE(20, ("__kmp_task_team_setup: Primary T#%d created second new "
4202                   "task_team %p for team %d at parity=%d\n",
4203                   __kmp_gtid_from_thread(this_thr),
4204                   team->t.t_task_team[other_team], team->t.t_id, other_team));
4205   } else { // Leave the old task team struct in place for the upcoming region;
4206     // adjust as needed
4207     kmp_task_team_t *task_team = team->t.t_task_team[other_team];
4208     __kmp_task_team_init(task_team, team);
4209     // if team size has changed, the first thread to enable tasking will
4210     // realloc threads_data if necessary
4211     KA_TRACE(20, ("__kmp_task_team_setup: Primary T#%d reset next task_team "
4212                   "%p for team %d at parity=%d\n",
4213                   __kmp_gtid_from_thread(this_thr),
4214                   team->t.t_task_team[other_team], team->t.t_id, other_team));
4215   }
4216 
4217   // For a regular thread, task enabling should be called when a task is going
4218   // to be pushed to a deque. However, for the hidden helper thread, we need
4219   // it ahead of time so that some operations can be performed without race
4220   // conditions.
4221   if (this_thr == __kmp_hidden_helper_main_thread) {
4222     for (int i = 0; i < 2; ++i) {
4223       kmp_task_team_t *task_team = team->t.t_task_team[i];
4224       if (KMP_TASKING_ENABLED(task_team)) {
4225         continue;
4226       }
4227       __kmp_enable_tasking(task_team, this_thr);
4228       for (int j = 0; j < task_team->tt.tt_nproc; ++j) {
4229         kmp_thread_data_t *thread_data = &task_team->tt.tt_threads_data[j];
4230         if (thread_data->td.td_deque == NULL) {
4231           __kmp_alloc_task_deque(__kmp_hidden_helper_threads[j], thread_data);
4232         }
4233       }
4234     }
4235   }
4236 }
4237 
4238 // __kmp_task_team_sync: Propagation of task team data from team to threads
4239 // which happens just after the release phase of a team barrier.  This may be
4240 // called by any thread. This is not called for serial or root teams.
4241 void __kmp_task_team_sync(kmp_info_t *this_thr, kmp_team_t *team) {
4242   KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);
4243   KMP_DEBUG_ASSERT(team != this_thr->th.th_serial_team);
4244   KMP_DEBUG_ASSERT(team != this_thr->th.th_root->r.r_root_team);
4245 
4246   // Toggle the th_task_state field, to switch which task_team this thread
4247   // refers to
4248   this_thr->th.th_task_state = (kmp_uint8)(1 - this_thr->th.th_task_state);
4249 
4250   // It is now safe to propagate the task team pointer from the team struct to
4251   // the current thread.
4252   TCW_PTR(this_thr->th.th_task_team,
4253           team->t.t_task_team[this_thr->th.th_task_state]);
4254   KA_TRACE(20,
4255            ("__kmp_task_team_sync: Thread T#%d task team switched to task_team "
4256             "%p from Team #%d (parity=%d)\n",
4257             __kmp_gtid_from_thread(this_thr), this_thr->th.th_task_team,
4258             team->t.t_id, this_thr->th.th_task_state));
4259 }
4260 
4261 // __kmp_task_team_wait: Primary thread waits for outstanding tasks after the
4262 // barrier gather phase. Only called by the primary thread.
4263 //
4264 // wait is a flag that defaults to 1 (see kmp.h), but waiting can be turned off
4265 // by optionally passing 0 as the last argument. When wait is zero, the primary
4266 // thread does not wait for unfinished_threads to reach 0.
4267 void __kmp_task_team_wait(
4268     kmp_info_t *this_thr,
4269     kmp_team_t *team USE_ITT_BUILD_ARG(void *itt_sync_obj), int wait) {
4270   kmp_task_team_t *task_team = team->t.t_task_team[this_thr->th.th_task_state];
4271 
4272   KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);
4273   KMP_DEBUG_ASSERT(task_team == this_thr->th.th_task_team);
4274 
4275   if ((task_team != NULL) && KMP_TASKING_ENABLED(task_team)) {
4276     if (wait) {
4277       KA_TRACE(20, ("__kmp_task_team_wait: Primary T#%d waiting for all tasks "
4278                     "(for unfinished_threads to reach 0) on task_team = %p\n",
4279                     __kmp_gtid_from_thread(this_thr), task_team));
4280       // Worker threads may have dropped through to release phase, but could
4281       // still be executing tasks. Wait here for tasks to complete. To avoid
4282       // memory contention, only the primary thread checks the termination condition.
4283       kmp_flag_32<false, false> flag(
4284           RCAST(std::atomic<kmp_uint32> *,
4285                 &task_team->tt.tt_unfinished_threads),
4286           0U);
4287       flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
4288     }
4289     // Deactivate the old task team, so that the worker threads will stop
4290     // referencing it while spinning.
4291     KA_TRACE(
4292         20,
4293         ("__kmp_task_team_wait: Primary T#%d deactivating task_team %p: "
4294          "setting active to false, setting local and team's pointer to NULL\n",
4295          __kmp_gtid_from_thread(this_thr), task_team));
4296     TCW_SYNC_4(task_team->tt.tt_found_proxy_tasks, FALSE);
4297     TCW_SYNC_4(task_team->tt.tt_hidden_helper_task_encountered, FALSE);
4298     KMP_CHECK_UPDATE(task_team->tt.tt_untied_task_encountered, 0);
4299     TCW_SYNC_4(task_team->tt.tt_active, FALSE);
4300     KMP_MB();
4301 
4302     TCW_PTR(this_thr->th.th_task_team, NULL);
4303   }
4304 }
4305 
4306 // __kmp_tasking_barrier:
4307 // This routine is called only when __kmp_tasking_mode == tskm_extra_barrier.
4308 // Internal function to execute all tasks prior to a regular barrier or a join
4309 // barrier. It is a full barrier itself, which unfortunately turns regular
4310 // barriers into double barriers and join barriers into 1 1/2 barriers.
4311 void __kmp_tasking_barrier(kmp_team_t *team, kmp_info_t *thread, int gtid) {
4312   std::atomic<kmp_uint32> *spin = RCAST(
4313       std::atomic<kmp_uint32> *,
4314       &team->t.t_task_team[thread->th.th_task_state]->tt.tt_unfinished_threads);
4315   int flag = FALSE;
4316   KMP_DEBUG_ASSERT(__kmp_tasking_mode == tskm_extra_barrier);
4317 
4318 #if USE_ITT_BUILD
4319   KMP_FSYNC_SPIN_INIT(spin, NULL);
4320 #endif /* USE_ITT_BUILD */
4321   kmp_flag_32<false, false> spin_flag(spin, 0U);
4322   while (!spin_flag.execute_tasks(thread, gtid, TRUE,
4323                                   &flag USE_ITT_BUILD_ARG(NULL), 0)) {
4324 #if USE_ITT_BUILD
4325     // TODO: What about itt_sync_obj??
4326     KMP_FSYNC_SPIN_PREPARE(RCAST(void *, spin));
4327 #endif /* USE_ITT_BUILD */
4328 
4329     if (TCR_4(__kmp_global.g.g_done)) {
4330       if (__kmp_global.g.g_abort)
4331         __kmp_abort_thread();
4332       break;
4333     }
4334     KMP_YIELD(TRUE);
4335   }
4336 #if USE_ITT_BUILD
4337   KMP_FSYNC_SPIN_ACQUIRED(RCAST(void *, spin));
4338 #endif /* USE_ITT_BUILD */
4339 }
4340 
4341 // __kmp_give_task puts a task into a given thread queue if:
4342 //  - the queue for that thread was created
4343 //  - there's space in that queue
4344 // Because of this, __kmp_push_task needs to check if there's space after
4345 // getting the lock
4346 static bool __kmp_give_task(kmp_info_t *thread, kmp_int32 tid, kmp_task_t *task,
4347                             kmp_int32 pass) {
4348   kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
4349   kmp_task_team_t *task_team = taskdata->td_task_team;
4350 
4351   KA_TRACE(20, ("__kmp_give_task: trying to give task %p to thread %d.\n",
4352                 taskdata, tid));
4353 
4354   // If task_team is NULL something went really bad...
4355   KMP_DEBUG_ASSERT(task_team != NULL);
4356 
4357   bool result = false;
4358   kmp_thread_data_t *thread_data = &task_team->tt.tt_threads_data[tid];
4359 
4360   if (thread_data->td.td_deque == NULL) {
4361     // There's no queue in this thread, go find another one
4362     // We're guaranteed that at least one thread has a queue
4363     KA_TRACE(30,
4364              ("__kmp_give_task: thread %d has no queue while giving task %p.\n",
4365               tid, taskdata));
4366     return result;
4367   }
4368 
4369   if (TCR_4(thread_data->td.td_deque_ntasks) >=
4370       TASK_DEQUE_SIZE(thread_data->td)) {
4371     KA_TRACE(
4372         30,
4373         ("__kmp_give_task: queue is full while giving task %p to thread %d.\n",
4374          taskdata, tid));
4375 
4376     // if this deque is bigger than the pass ratio give a chance to another
4377     // thread
4378     if (TASK_DEQUE_SIZE(thread_data->td) / INITIAL_TASK_DEQUE_SIZE >= pass)
4379       return result;
4380 
4381     __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);
4382     if (TCR_4(thread_data->td.td_deque_ntasks) >=
4383         TASK_DEQUE_SIZE(thread_data->td)) {
4384       // expand deque to push the task which is not allowed to execute
4385       __kmp_realloc_task_deque(thread, thread_data);
4386     }
4387 
4388   } else {
4389 
4390     __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);
4391 
4392     if (TCR_4(thread_data->td.td_deque_ntasks) >=
4393         TASK_DEQUE_SIZE(thread_data->td)) {
4394       KA_TRACE(30, ("__kmp_give_task: queue is full while giving task %p to "
4395                     "thread %d.\n",
4396                     taskdata, tid));
4397 
4398       // if this deque is bigger than the pass ratio give a chance to another
4399       // thread
4400       if (TASK_DEQUE_SIZE(thread_data->td) / INITIAL_TASK_DEQUE_SIZE >= pass)
4401         goto release_and_exit;
4402 
4403       __kmp_realloc_task_deque(thread, thread_data);
4404     }
4405   }
4406 
4407   // lock is held here, and there is space in the deque
4408 
4409   thread_data->td.td_deque[thread_data->td.td_deque_tail] = taskdata;
4410   // Wrap index.
4411   thread_data->td.td_deque_tail =
4412       (thread_data->td.td_deque_tail + 1) & TASK_DEQUE_MASK(thread_data->td);
4413   TCW_4(thread_data->td.td_deque_ntasks,
4414         TCR_4(thread_data->td.td_deque_ntasks) + 1);
4415 
4416   result = true;
4417   KA_TRACE(30, ("__kmp_give_task: successfully gave task %p to thread %d.\n",
4418                 taskdata, tid));
4419 
4420 release_and_exit:
4421   __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
4422 
4423   return result;
4424 }
4425 
4426 #define PROXY_TASK_FLAG 0x40000000
4427 /* The finish of a proxy task is divided into two pieces:
4428     - the top half is the one that can be done from a thread outside the team
4429     - the bottom half must be run from a thread within the team
4430 
4431    In order to run the bottom half the task gets queued back into one of the
4432    threads of the team. Once the td_incomplete_child_task counter of the parent
4433    is decremented the threads can leave the barriers. So, the bottom half needs
4434    to be queued before the counter is decremented. The top half is therefore
4435    divided into two parts:
4436     - things that can be run before queuing the bottom half
4437     - things that must be run after queuing the bottom half
4438 
4439    This creates a second race as the bottom half can free the task before the
4440    second top half is executed. To avoid this we use the
4441    td_incomplete_child_task of the proxy task to synchronize the top and bottom
4442    half. */
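
// Illustrative sketch (kept out of the build): the synchronization described
// above, reduced to a single atomic counter. The top half keeps a flag bit
// OR-ed into the child counter; the bottom half spins until that bit is
// cleared by the second top half, so the task cannot be freed while the second
// top half still needs it. The toy_* names are illustrative; the flag value
// mirrors PROXY_TASK_FLAG defined above.
#if 0
#include <atomic>

constexpr int kToyProxyFlag = 0x40000000;

static void toy_first_top_half(std::atomic<int> &incomplete_children) {
  // Keep the task alive for the second top half.
  incomplete_children.fetch_or(kToyProxyFlag, std::memory_order_acq_rel);
}

static void toy_second_top_half(std::atomic<int> &incomplete_children) {
  // Second top half is done; allow the bottom half to free the task.
  incomplete_children.fetch_and(~kToyProxyFlag, std::memory_order_acq_rel);
}

static void toy_bottom_half(std::atomic<int> &incomplete_children) {
  // Wait until the second top half has dropped its "imaginary child".
  while (incomplete_children.load(std::memory_order_acquire) & kToyProxyFlag) {
    // spin; in the runtime this is expected to be very short
  }
  // ... release dependences and free the task here ...
}
#endif
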
4443 static void __kmp_first_top_half_finish_proxy(kmp_taskdata_t *taskdata) {
4444   KMP_DEBUG_ASSERT(taskdata->td_flags.tasktype == TASK_EXPLICIT);
4445   KMP_DEBUG_ASSERT(taskdata->td_flags.proxy == TASK_PROXY);
4446   KMP_DEBUG_ASSERT(taskdata->td_flags.complete == 0);
4447   KMP_DEBUG_ASSERT(taskdata->td_flags.freed == 0);
4448 
4449   taskdata->td_flags.complete = 1; // mark the task as completed
4450 #if OMPX_TASKGRAPH
4451   taskdata->td_flags.onced = 1;
4452 #endif
4453 
4454   if (taskdata->td_taskgroup)
4455     KMP_ATOMIC_DEC(&taskdata->td_taskgroup->count);
4456 
4457   // Create an imaginary child for this task so the bottom half cannot
4458   // release the task before we have completed the second top half
4459   KMP_ATOMIC_OR(&taskdata->td_incomplete_child_tasks, PROXY_TASK_FLAG);
4460 }
4461 
4462 static void __kmp_second_top_half_finish_proxy(kmp_taskdata_t *taskdata) {
4463 #if KMP_DEBUG
4464   kmp_int32 children = 0;
4465   // Predecrement simulated by "- 1" calculation
4466   children = -1 +
4467 #endif
4468       KMP_ATOMIC_DEC(&taskdata->td_parent->td_incomplete_child_tasks);
4469   KMP_DEBUG_ASSERT(children >= 0);
4470 
4471   // Remove the imaginary child
4472   KMP_ATOMIC_AND(&taskdata->td_incomplete_child_tasks, ~PROXY_TASK_FLAG);
4473 }
4474 
4475 static void __kmp_bottom_half_finish_proxy(kmp_int32 gtid, kmp_task_t *ptask) {
4476   kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(ptask);
4477   kmp_info_t *thread = __kmp_threads[gtid];
4478 
4479   KMP_DEBUG_ASSERT(taskdata->td_flags.proxy == TASK_PROXY);
4480   KMP_DEBUG_ASSERT(taskdata->td_flags.complete ==
4481                    1); // top half must run before bottom half
4482 
4483   // We need to wait to make sure the top half is finished
4484   // Spinning here should be ok as this should happen quickly
4485   while ((KMP_ATOMIC_LD_ACQ(&taskdata->td_incomplete_child_tasks) &
4486           PROXY_TASK_FLAG) > 0)
4487     ;
4488 
4489   __kmp_release_deps(gtid, taskdata);
4490   __kmp_free_task_and_ancestors(gtid, taskdata, thread);
4491 }
4492 
4493 /*!
4494 @ingroup TASKING
4495 @param gtid Global Thread ID of encountering thread
4496 @param ptask Task whose execution is completed
4497 
4498 Execute the completion of a proxy task from a thread that is part of the
4499 team. Runs the top and bottom halves directly.
4500 */
4501 void __kmpc_proxy_task_completed(kmp_int32 gtid, kmp_task_t *ptask) {
4502   KMP_DEBUG_ASSERT(ptask != NULL);
4503   kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(ptask);
4504   KA_TRACE(
4505       10, ("__kmp_proxy_task_completed(enter): T#%d proxy task %p completing\n",
4506            gtid, taskdata));
4507   __kmp_assert_valid_gtid(gtid);
4508   KMP_DEBUG_ASSERT(taskdata->td_flags.proxy == TASK_PROXY);
4509 
4510   __kmp_first_top_half_finish_proxy(taskdata);
4511   __kmp_second_top_half_finish_proxy(taskdata);
4512   __kmp_bottom_half_finish_proxy(gtid, ptask);
4513 
4514   KA_TRACE(10,
4515            ("__kmp_proxy_task_completed(exit): T#%d proxy task %p completing\n",
4516             gtid, taskdata));
4517 }
4518 
4519 void __kmpc_give_task(kmp_task_t *ptask, kmp_int32 start = 0) {
4520   KMP_DEBUG_ASSERT(ptask != NULL);
4521   kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(ptask);
4522 
4523   // Enqueue the task so that the bottom half of its completion runs on a
4524   // thread within the corresponding team
4525   kmp_team_t *team = taskdata->td_team;
4526   kmp_int32 nthreads = team->t.t_nproc;
4527   kmp_info_t *thread;
4528 
4529   // This should be similar to start_k = __kmp_get_random( thread ) % nthreads
4530   // but we cannot use __kmp_get_random here
4531   kmp_int32 start_k = start % nthreads;
4532   kmp_int32 pass = 1;
4533   kmp_int32 k = start_k;
4534 
4535   do {
4536     // For now we're just linearly trying to find a thread
4537     thread = team->t.t_threads[k];
4538     k = (k + 1) % nthreads;
4539 
4540     // we did a full pass through all the threads
4541     if (k == start_k)
4542       pass = pass << 1;
4543 
4544   } while (!__kmp_give_task(thread, k, ptask, pass));
4545 
4546   if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME && __kmp_wpolicy_passive) {
4547     // wake at least one thread to execute the given task
4548     for (int i = 0; i < nthreads; ++i) {
4549       thread = team->t.t_threads[i];
4550       if (thread->th.th_sleep_loc != NULL) {
4551         __kmp_null_resume_wrapper(thread);
4552         break;
4553       }
4554     }
4555   }
4556 }
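
// Illustrative sketch (kept out of the build): the loop above is a round-robin
// search whose tolerance doubles after every full pass, so on the first pass a
// task is only accepted by a deque at its initial size, on the second pass by
// deques up to twice that size, and so on, until some thread finally takes it.
// toy_try_give stands in for __kmp_give_task and is an illustrative name.
#if 0
#include <functional>

static void toy_round_robin_give(
    int nthreads, int start,
    const std::function<bool(int tid, int pass)> &toy_try_give) {
  int pass = 1;
  int k = start % nthreads;
  const int start_k = k;
  for (;;) {
    int tid = k;
    k = (k + 1) % nthreads;
    if (k == start_k)
      pass <<= 1; // completed a full pass: be twice as tolerant next time
    if (toy_try_give(tid, pass))
      break;
  }
}
#endif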
4557 
4558 /*!
4559 @ingroup TASKING
4560 @param ptask Task whose execution is completed
4561 
4562 Execute the completion of a proxy task from a thread that need not belong to
4563 the team.
4564 */
4565 void __kmpc_proxy_task_completed_ooo(kmp_task_t *ptask) {
4566   KMP_DEBUG_ASSERT(ptask != NULL);
4567   kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(ptask);
4568 
4569   KA_TRACE(
4570       10,
4571       ("__kmp_proxy_task_completed_ooo(enter): proxy task completing ooo %p\n",
4572        taskdata));
4573 
4574   KMP_DEBUG_ASSERT(taskdata->td_flags.proxy == TASK_PROXY);
4575 
4576   __kmp_first_top_half_finish_proxy(taskdata);
4577 
4578   __kmpc_give_task(ptask);
4579 
4580   __kmp_second_top_half_finish_proxy(taskdata);
4581 
4582   KA_TRACE(
4583       10,
4584       ("__kmp_proxy_task_completed_ooo(exit): proxy task completing ooo %p\n",
4585        taskdata));
4586 }
4587 
4588 kmp_event_t *__kmpc_task_allow_completion_event(ident_t *loc_ref, int gtid,
4589                                                 kmp_task_t *task) {
4590   kmp_taskdata_t *td = KMP_TASK_TO_TASKDATA(task);
4591   if (td->td_allow_completion_event.type == KMP_EVENT_UNINITIALIZED) {
4592     td->td_allow_completion_event.type = KMP_EVENT_ALLOW_COMPLETION;
4593     td->td_allow_completion_event.ed.task = task;
4594     __kmp_init_tas_lock(&td->td_allow_completion_event.lock);
4595   }
4596   return &td->td_allow_completion_event;
4597 }
4598 
4599 void __kmp_fulfill_event(kmp_event_t *event) {
4600   if (event->type == KMP_EVENT_ALLOW_COMPLETION) {
4601     kmp_task_t *ptask = event->ed.task;
4602     kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(ptask);
4603     bool detached = false;
4604     int gtid = __kmp_get_gtid();
4605 
4606     // The associated task might have completed or could be completing at this
4607     // point.
4608     // We need to take the lock to avoid races
4609     __kmp_acquire_tas_lock(&event->lock, gtid);
4610     if (taskdata->td_flags.proxy == TASK_PROXY) {
4611       detached = true;
4612     } else {
4613 #if OMPT_SUPPORT
4614       // The OMPT event must occur under mutual exclusion,
4615       // otherwise the tool might access ptask after free
4616       if (UNLIKELY(ompt_enabled.enabled))
4617         __ompt_task_finish(ptask, NULL, ompt_task_early_fulfill);
4618 #endif
4619     }
4620     event->type = KMP_EVENT_UNINITIALIZED;
4621     __kmp_release_tas_lock(&event->lock, gtid);
4622 
4623     if (detached) {
4624 #if OMPT_SUPPORT
4625       // We free ptask afterwards and know the task is finished,
4626       // so locking is not necessary
4627       if (UNLIKELY(ompt_enabled.enabled))
4628         __ompt_task_finish(ptask, NULL, ompt_task_late_fulfill);
4629 #endif
4630       // If the task was detached, complete the proxy task
4631       if (gtid >= 0) {
4632         kmp_team_t *team = taskdata->td_team;
4633         kmp_info_t *thread = __kmp_get_thread();
4634         if (thread->th.th_team == team) {
4635           __kmpc_proxy_task_completed(gtid, ptask);
4636           return;
4637         }
4638       }
4639 
4640       // fallback
4641       __kmpc_proxy_task_completed_ooo(ptask);
4642     }
4643   }
4644 }
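
// Illustrative sketch (kept out of the build): the user-level OpenMP 5.0
// detach/fulfill feature served by the event machinery above. A task created
// with detach(evt) is not complete when its body returns; it completes once
// omp_fulfill_event(evt) is called, which is routed to __kmp_fulfill_event()
// above (here the "early fulfill" case, since the event may be fulfilled
// before the task body has even run). Hedged usage example, not runtime code.
#if 0
#include <omp.h>

void detached_task_example() {
  omp_event_handle_t evt;
#pragma omp parallel
#pragma omp single
  {
#pragma omp task detach(evt)
    {
      // Body finishes immediately, but the task stays incomplete.
    }
    // Completion is signalled separately, possibly before or after the body
    // has executed; a real program would typically do this from an
    // asynchronous agent once its work is done.
    omp_fulfill_event(evt);
#pragma omp taskwait // returns only after both the body and the fulfillment
  }
}
#endif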
4645 
4646 // __kmp_task_dup_alloc: Allocate the taskdata and make a copy of source task
4647 // for taskloop
4648 //
4649 // thread:   allocating thread
4650 // task_src: pointer to source task to be duplicated
4651 // taskloop_recur: used only when dealing with taskgraph,
4652 //      indicating whether we need to update task->td_task_id
4653 // returns:  a pointer to the allocated kmp_task_t structure (task).
4654 kmp_task_t *__kmp_task_dup_alloc(kmp_info_t *thread, kmp_task_t *task_src
4655 #if OMPX_TASKGRAPH
4656                                  , int taskloop_recur
4657 #endif
4658 ) {
4659   kmp_task_t *task;
4660   kmp_taskdata_t *taskdata;
4661   kmp_taskdata_t *taskdata_src = KMP_TASK_TO_TASKDATA(task_src);
4662   kmp_taskdata_t *parent_task = taskdata_src->td_parent; // same parent task
4663   size_t shareds_offset;
4664   size_t task_size;
4665 
4666   KA_TRACE(10, ("__kmp_task_dup_alloc(enter): Th %p, source task %p\n", thread,
4667                 task_src));
4668   KMP_DEBUG_ASSERT(taskdata_src->td_flags.proxy ==
4669                    TASK_FULL); // it should not be proxy task
4670   KMP_DEBUG_ASSERT(taskdata_src->td_flags.tasktype == TASK_EXPLICIT);
4671   task_size = taskdata_src->td_size_alloc;
4672 
4673   // Allocate a kmp_taskdata_t block and a kmp_task_t block.
4674   KA_TRACE(30, ("__kmp_task_dup_alloc: Th %p, malloc size %ld\n", thread,
4675                 task_size));
4676 #if USE_FAST_MEMORY
4677   taskdata = (kmp_taskdata_t *)__kmp_fast_allocate(thread, task_size);
4678 #else
4679   taskdata = (kmp_taskdata_t *)__kmp_thread_malloc(thread, task_size);
4680 #endif /* USE_FAST_MEMORY */
4681   KMP_MEMCPY(taskdata, taskdata_src, task_size);
4682 
4683   task = KMP_TASKDATA_TO_TASK(taskdata);
4684 
4685   // Initialize new task (only specific fields not affected by memcpy)
4686 #if OMPX_TASKGRAPH
4687   if (taskdata->is_taskgraph && !taskloop_recur &&
4688       __kmp_tdg_is_recording(taskdata_src->tdg->tdg_status))
4689     taskdata->td_tdg_task_id = KMP_ATOMIC_INC(&__kmp_tdg_task_id);
4690 #endif
4691   taskdata->td_task_id = KMP_GEN_TASK_ID();
4692   if (task->shareds != NULL) { // need to set up the shareds pointer
4693     shareds_offset = (char *)task_src->shareds - (char *)taskdata_src;
4694     task->shareds = &((char *)taskdata)[shareds_offset];
4695     KMP_DEBUG_ASSERT((((kmp_uintptr_t)task->shareds) & (sizeof(void *) - 1)) ==
4696                      0);
4697   }
4698   taskdata->td_alloc_thread = thread;
4699   taskdata->td_parent = parent_task;
4700   // task inherits the taskgroup from the parent task
4701   taskdata->td_taskgroup = parent_task->td_taskgroup;
4702   // tied task needs to initialize the td_last_tied at creation,
4703   // untied one does this when it is scheduled for execution
4704   if (taskdata->td_flags.tiedness == TASK_TIED)
4705     taskdata->td_last_tied = taskdata;
4706 
4707   // Only need to keep track of child task counts if team parallel and tasking
4708   // not serialized
4709   if (!(taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser)) {
4710     KMP_ATOMIC_INC(&parent_task->td_incomplete_child_tasks);
4711     if (parent_task->td_taskgroup)
4712       KMP_ATOMIC_INC(&parent_task->td_taskgroup->count);
4713     // Only need to keep track of allocated child tasks for explicit tasks since
4714     // implicit ones are not deallocated
4715     if (taskdata->td_parent->td_flags.tasktype == TASK_EXPLICIT)
4716       KMP_ATOMIC_INC(&taskdata->td_parent->td_allocated_child_tasks);
4717   }
4718 
4719   KA_TRACE(20,
4720            ("__kmp_task_dup_alloc(exit): Th %p, created task %p, parent=%p\n",
4721             thread, taskdata, taskdata->td_parent));
4722 #if OMPT_SUPPORT
4723   if (UNLIKELY(ompt_enabled.enabled))
4724     __ompt_task_init(taskdata, thread->th.th_info.ds.ds_gtid);
4725 #endif
4726   return task;
4727 }
4728 
4729 // Routine optionally generated by the compiler for setting the lastprivate flag
4730 // and calling needed constructors for private/firstprivate objects
4731 // (used to form taskloop tasks from pattern task)
4732 // Parameters: dest task, src task, lastprivate flag.
4733 typedef void (*p_task_dup_t)(kmp_task_t *, kmp_task_t *, kmp_int32);
4734 
4735 KMP_BUILD_ASSERT(sizeof(long) == 4 || sizeof(long) == 8);
4736 
4737 // class to encapsulate manipulating loop bounds in a taskloop task.
4738 // this abstracts away the Intel vs GOMP taskloop interface for setting/getting
4739 // the loop bound variables.
4740 class kmp_taskloop_bounds_t {
4741   kmp_task_t *task;
4742   const kmp_taskdata_t *taskdata;
4743   size_t lower_offset;
4744   size_t upper_offset;
4745 
4746 public:
4747   kmp_taskloop_bounds_t(kmp_task_t *_task, kmp_uint64 *lb, kmp_uint64 *ub)
4748       : task(_task), taskdata(KMP_TASK_TO_TASKDATA(task)),
4749         lower_offset((char *)lb - (char *)task),
4750         upper_offset((char *)ub - (char *)task) {
4751     KMP_DEBUG_ASSERT((char *)lb > (char *)_task);
4752     KMP_DEBUG_ASSERT((char *)ub > (char *)_task);
4753   }
4754   kmp_taskloop_bounds_t(kmp_task_t *_task, const kmp_taskloop_bounds_t &bounds)
4755       : task(_task), taskdata(KMP_TASK_TO_TASKDATA(_task)),
4756         lower_offset(bounds.lower_offset), upper_offset(bounds.upper_offset) {}
4757   size_t get_lower_offset() const { return lower_offset; }
4758   size_t get_upper_offset() const { return upper_offset; }
4759   kmp_uint64 get_lb() const {
4760     kmp_int64 retval;
4761 #if defined(KMP_GOMP_COMPAT)
4762     // Intel task just returns the lower bound normally
4763     if (!taskdata->td_flags.native) {
4764       retval = *(kmp_int64 *)((char *)task + lower_offset);
4765     } else {
4766       // GOMP task has to take into account the sizeof(long)
4767       if (taskdata->td_size_loop_bounds == 4) {
4768         kmp_int32 *lb = RCAST(kmp_int32 *, task->shareds);
4769         retval = (kmp_int64)*lb;
4770       } else {
4771         kmp_int64 *lb = RCAST(kmp_int64 *, task->shareds);
4772         retval = (kmp_int64)*lb;
4773       }
4774     }
4775 #else
4776     (void)taskdata;
4777     retval = *(kmp_int64 *)((char *)task + lower_offset);
4778 #endif // defined(KMP_GOMP_COMPAT)
4779     return retval;
4780   }
4781   kmp_uint64 get_ub() const {
4782     kmp_int64 retval;
4783 #if defined(KMP_GOMP_COMPAT)
4784     // Intel task just returns the upper bound normally
4785     if (!taskdata->td_flags.native) {
4786       retval = *(kmp_int64 *)((char *)task + upper_offset);
4787     } else {
4788       // GOMP task has to take into account the sizeof(long)
4789       if (taskdata->td_size_loop_bounds == 4) {
4790         kmp_int32 *ub = RCAST(kmp_int32 *, task->shareds) + 1;
4791         retval = (kmp_int64)*ub;
4792       } else {
4793         kmp_int64 *ub = RCAST(kmp_int64 *, task->shareds) + 1;
4794         retval = (kmp_int64)*ub;
4795       }
4796     }
4797 #else
4798     retval = *(kmp_int64 *)((char *)task + upper_offset);
4799 #endif // defined(KMP_GOMP_COMPAT)
4800     return retval;
4801   }
4802   void set_lb(kmp_uint64 lb) {
4803 #if defined(KMP_GOMP_COMPAT)
4804     // Intel task just sets the lower bound normally
4805     if (!taskdata->td_flags.native) {
4806       *(kmp_uint64 *)((char *)task + lower_offset) = lb;
4807     } else {
4808       // GOMP task has to take into account the sizeof(long)
4809       if (taskdata->td_size_loop_bounds == 4) {
4810         kmp_uint32 *lower = RCAST(kmp_uint32 *, task->shareds);
4811         *lower = (kmp_uint32)lb;
4812       } else {
4813         kmp_uint64 *lower = RCAST(kmp_uint64 *, task->shareds);
4814         *lower = (kmp_uint64)lb;
4815       }
4816     }
4817 #else
4818     *(kmp_uint64 *)((char *)task + lower_offset) = lb;
4819 #endif // defined(KMP_GOMP_COMPAT)
4820   }
4821   void set_ub(kmp_uint64 ub) {
4822 #if defined(KMP_GOMP_COMPAT)
4823     // Intel task just sets the upper bound normally
4824     if (!taskdata->td_flags.native) {
4825       *(kmp_uint64 *)((char *)task + upper_offset) = ub;
4826     } else {
4827       // GOMP task has to take into account the sizeof(long)
4828       if (taskdata->td_size_loop_bounds == 4) {
4829         kmp_uint32 *upper = RCAST(kmp_uint32 *, task->shareds) + 1;
4830         *upper = (kmp_uint32)ub;
4831       } else {
4832         kmp_uint64 *upper = RCAST(kmp_uint64 *, task->shareds) + 1;
4833         *upper = (kmp_uint64)ub;
4834       }
4835     }
4836 #else
4837     *(kmp_uint64 *)((char *)task + upper_offset) = ub;
4838 #endif // defined(KMP_GOMP_COMPAT)
4839   }
4840 };
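
// Illustrative sketch (kept out of the build): the GOMP branch in the class
// above reads the bounds from the first two `long`-sized slots of
// task->shareds, so the access width depends on sizeof(long) (4 on some 32-bit
// targets, 8 on 64-bit ones), while the Intel branch reads a 64-bit value at a
// byte offset recorded inside the task. Reduced analogue of the GOMP side;
// toy_gomp_bounds is an illustrative name.
#if 0
#include <cstdint>

static void toy_gomp_bounds(void *shareds, int size_of_long, std::int64_t *lb,
                            std::int64_t *ub) {
  if (size_of_long == 4) {
    const std::int32_t *b = static_cast<const std::int32_t *>(shareds);
    *lb = b[0];
    *ub = b[1];
  } else {
    const std::int64_t *b = static_cast<const std::int64_t *>(shareds);
    *lb = b[0];
    *ub = b[1];
  }
}
#endif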
4841 
4842 // __kmp_taskloop_linear: Start tasks of the taskloop linearly
4843 //
4844 // loc        Source location information
4845 // gtid       Global thread ID
4846 // task       Pattern task, exposes the loop iteration range
4847 // lb         Pointer to loop lower bound in task structure
4848 // ub         Pointer to loop upper bound in task structure
4849 // st         Loop stride
4850 // ub_glob    Global upper bound (used for lastprivate check)
4851 // num_tasks  Number of tasks to execute
4852 // grainsize  Number of loop iterations per task
4853 // extras     Number of chunks with grainsize+1 iterations
4854 // last_chunk Reduction of grainsize for last task
4855 // tc         Iterations count
4856 // task_dup   Tasks duplication routine
4857 // codeptr_ra Return address for OMPT events
4858 void __kmp_taskloop_linear(ident_t *loc, int gtid, kmp_task_t *task,
4859                            kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st,
4860                            kmp_uint64 ub_glob, kmp_uint64 num_tasks,
4861                            kmp_uint64 grainsize, kmp_uint64 extras,
4862                            kmp_int64 last_chunk, kmp_uint64 tc,
4863 #if OMPT_SUPPORT
4864                            void *codeptr_ra,
4865 #endif
4866                            void *task_dup) {
4867   KMP_COUNT_BLOCK(OMP_TASKLOOP);
4868   KMP_TIME_PARTITIONED_BLOCK(OMP_taskloop_scheduling);
4869   p_task_dup_t ptask_dup = (p_task_dup_t)task_dup;
4870   // compiler provides global bounds here
4871   kmp_taskloop_bounds_t task_bounds(task, lb, ub);
4872   kmp_uint64 lower = task_bounds.get_lb();
4873   kmp_uint64 upper = task_bounds.get_ub();
4874   kmp_uint64 i;
4875   kmp_info_t *thread = __kmp_threads[gtid];
4876   kmp_taskdata_t *current_task = thread->th.th_current_task;
4877   kmp_task_t *next_task;
4878   kmp_int32 lastpriv = 0;
4879 
4880   KMP_DEBUG_ASSERT(tc == num_tasks * grainsize +
4881                              (last_chunk < 0 ? last_chunk : extras));
4882   KMP_DEBUG_ASSERT(num_tasks > extras);
4883   KMP_DEBUG_ASSERT(num_tasks > 0);
4884   KA_TRACE(20, ("__kmp_taskloop_linear: T#%d: %lld tasks, grainsize %lld, "
4885                 "extras %lld, last_chunk %lld, i=%lld,%lld(%d)%lld, dup %p\n",
4886                 gtid, num_tasks, grainsize, extras, last_chunk, lower, upper,
4887                 ub_glob, st, task_dup));
4888 
4889   // Launch num_tasks tasks, assigning grainsize iterations to each task
4890   for (i = 0; i < num_tasks; ++i) {
4891     kmp_uint64 chunk_minus_1;
4892     if (extras == 0) {
4893       chunk_minus_1 = grainsize - 1;
4894     } else {
4895       chunk_minus_1 = grainsize;
4896       --extras; // first `extras` tasks get a bigger chunk (grainsize+1)
4897     }
4898     upper = lower + st * chunk_minus_1;
4899     if (upper > *ub) {
4900       upper = *ub;
4901     }
4902     if (i == num_tasks - 1) {
4903       // schedule the last task, set lastprivate flag if needed
4904       if (st == 1) { // most common case
4905         KMP_DEBUG_ASSERT(upper == *ub);
4906         if (upper == ub_glob)
4907           lastpriv = 1;
4908       } else if (st > 0) { // positive loop stride
4909         KMP_DEBUG_ASSERT((kmp_uint64)st > *ub - upper);
4910         if ((kmp_uint64)st > ub_glob - upper)
4911           lastpriv = 1;
4912       } else { // negative loop stride
4913         KMP_DEBUG_ASSERT(upper + st < *ub);
4914         if (upper - ub_glob < (kmp_uint64)(-st))
4915           lastpriv = 1;
4916       }
4917     }
4918 
4919 #if OMPX_TASKGRAPH
4920     next_task = __kmp_task_dup_alloc(thread, task, /* taskloop_recur */ 0);
4921 #else
4922     next_task = __kmp_task_dup_alloc(thread, task); // allocate new task
4923 #endif
4924 
4925     kmp_taskdata_t *next_taskdata = KMP_TASK_TO_TASKDATA(next_task);
4926     kmp_taskloop_bounds_t next_task_bounds =
4927         kmp_taskloop_bounds_t(next_task, task_bounds);
4928 
4929     // adjust task-specific bounds
4930     next_task_bounds.set_lb(lower);
4931     if (next_taskdata->td_flags.native) {
4932       next_task_bounds.set_ub(upper + (st > 0 ? 1 : -1));
4933     } else {
4934       next_task_bounds.set_ub(upper);
4935     }
4936     if (ptask_dup != NULL) // set lastprivate flag, construct firstprivates,
4937                            // etc.
4938       ptask_dup(next_task, task, lastpriv);
4939     KA_TRACE(40,
4940              ("__kmp_taskloop_linear: T#%d; task #%llu: task %p: lower %lld, "
4941               "upper %lld stride %lld, (offsets %p %p)\n",
4942               gtid, i, next_task, lower, upper, st,
4943               next_task_bounds.get_lower_offset(),
4944               next_task_bounds.get_upper_offset()));
4945 #if OMPT_SUPPORT
4946     __kmp_omp_taskloop_task(NULL, gtid, next_task,
4947                             codeptr_ra); // schedule new task
4948 #if OMPT_OPTIONAL
4949     if (ompt_enabled.ompt_callback_dispatch) {
4950       OMPT_GET_DISPATCH_CHUNK(next_taskdata->ompt_task_info.dispatch_chunk,
4951                               lower, upper, st);
4952     }
4953 #endif // OMPT_OPTIONAL
4954 #else
4955     __kmp_omp_task(gtid, next_task, true); // schedule new task
4956 #endif
4957     lower = upper + st; // adjust lower bound for the next iteration
4958   }
4959   // free the pattern task and exit
4960   __kmp_task_start(gtid, task, current_task); // make internal bookkeeping
4961   // do not execute the pattern task, just do internal bookkeeping
4962   __kmp_task_finish<false>(gtid, task, current_task);
4963 }
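
// Illustrative sketch (kept out of the build): the chunking rule used above
// for the `extras` case (last_chunk >= 0). With tc total iterations split into
// num_tasks chunks of grainsize, the first `extras` chunks get one extra
// iteration, so tc == num_tasks * grainsize + extras. E.g. tc = 10,
// num_tasks = 3 gives grainsize = 3, extras = 1 and chunks of 4, 3, 3.
// toy_chunk_sizes is an illustrative name, not a runtime routine.
#if 0
#include <cstdint>
#include <vector>

static std::vector<std::uint64_t> toy_chunk_sizes(std::uint64_t tc,
                                                  std::uint64_t num_tasks) {
  std::uint64_t grainsize = tc / num_tasks;
  std::uint64_t extras = tc % num_tasks;
  std::vector<std::uint64_t> chunks(num_tasks, grainsize);
  for (std::uint64_t i = 0; i < extras; ++i)
    ++chunks[i]; // first `extras` tasks take grainsize + 1 iterations
  return chunks;
}
#endif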
4964 
4965 // Structure to keep taskloop parameters for auxiliary task
4966 // kept in the shareds of the task structure.
4967 typedef struct __taskloop_params {
4968   kmp_task_t *task;
4969   kmp_uint64 *lb;
4970   kmp_uint64 *ub;
4971   void *task_dup;
4972   kmp_int64 st;
4973   kmp_uint64 ub_glob;
4974   kmp_uint64 num_tasks;
4975   kmp_uint64 grainsize;
4976   kmp_uint64 extras;
4977   kmp_int64 last_chunk;
4978   kmp_uint64 tc;
4979   kmp_uint64 num_t_min;
4980 #if OMPT_SUPPORT
4981   void *codeptr_ra;
4982 #endif
4983 } __taskloop_params_t;
4984 
4985 void __kmp_taskloop_recur(ident_t *, int, kmp_task_t *, kmp_uint64 *,
4986                           kmp_uint64 *, kmp_int64, kmp_uint64, kmp_uint64,
4987                           kmp_uint64, kmp_uint64, kmp_int64, kmp_uint64,
4988                           kmp_uint64,
4989 #if OMPT_SUPPORT
4990                           void *,
4991 #endif
4992                           void *);
4993 
4994 // Execute part of the taskloop submitted as a task.
4995 int __kmp_taskloop_task(int gtid, void *ptask) {
4996   __taskloop_params_t *p =
4997       (__taskloop_params_t *)((kmp_task_t *)ptask)->shareds;
4998   kmp_task_t *task = p->task;
4999   kmp_uint64 *lb = p->lb;
5000   kmp_uint64 *ub = p->ub;
5001   void *task_dup = p->task_dup;
5002   //  p_task_dup_t ptask_dup = (p_task_dup_t)task_dup;
5003   kmp_int64 st = p->st;
5004   kmp_uint64 ub_glob = p->ub_glob;
5005   kmp_uint64 num_tasks = p->num_tasks;
5006   kmp_uint64 grainsize = p->grainsize;
5007   kmp_uint64 extras = p->extras;
5008   kmp_int64 last_chunk = p->last_chunk;
5009   kmp_uint64 tc = p->tc;
5010   kmp_uint64 num_t_min = p->num_t_min;
5011 #if OMPT_SUPPORT
5012   void *codeptr_ra = p->codeptr_ra;
5013 #endif
5014 #if KMP_DEBUG
5015   kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
5016   KMP_DEBUG_ASSERT(task != NULL);
5017   KA_TRACE(20,
5018            ("__kmp_taskloop_task: T#%d, task %p: %lld tasks, grainsize"
5019             " %lld, extras %lld, last_chunk %lld, i=%lld,%lld(%d), dup %p\n",
5020             gtid, taskdata, num_tasks, grainsize, extras, last_chunk, *lb, *ub,
5021             st, task_dup));
5022 #endif
5023   KMP_DEBUG_ASSERT(num_tasks * 2 + 1 > num_t_min);
5024   if (num_tasks > num_t_min)
5025     __kmp_taskloop_recur(NULL, gtid, task, lb, ub, st, ub_glob, num_tasks,
5026                          grainsize, extras, last_chunk, tc, num_t_min,
5027 #if OMPT_SUPPORT
5028                          codeptr_ra,
5029 #endif
5030                          task_dup);
5031   else
5032     __kmp_taskloop_linear(NULL, gtid, task, lb, ub, st, ub_glob, num_tasks,
5033                           grainsize, extras, last_chunk, tc,
5034 #if OMPT_SUPPORT
5035                           codeptr_ra,
5036 #endif
5037                           task_dup);
5038 
5039   KA_TRACE(40, ("__kmp_taskloop_task(exit): T#%d\n", gtid));
5040   return 0;
5041 }
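
// Illustrative sketch (kept out of the build): the recursion pattern used by
// the taskloop machinery above, written iteratively. While the number of
// chunks is above the num_t_min threshold, roughly half of them are handed off
// as a separate task and the current thread keeps splitting its own half;
// below the threshold the chunks are created linearly. toy_taskloop and
// toy_spawn are illustrative names; the exact split in the runtime differs in
// detail.
#if 0
#include <cstdint>
#include <functional>

static void toy_taskloop(std::uint64_t num_tasks, std::uint64_t num_t_min,
                         const std::function<void(std::uint64_t)> &toy_spawn) {
  while (num_tasks > num_t_min) {
    std::uint64_t half = num_tasks / 2; // handed off as a separate task
    toy_spawn(half);
    num_tasks -= half; // this thread keeps splitting the remaining half
  }
  for (std::uint64_t i = 0; i < num_tasks; ++i)
    toy_spawn(1); // below the threshold: create the chunks one by one
}
#endif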
5042 
5043 // Schedule part of the taskloop as a task,
5044 // execute the rest of the taskloop.
5045 //
5046 // loc        Source location information
5047 // gtid       Global thread ID
5048 // task       Pattern task, exposes the loop iteration range
5049 // lb         Pointer to loop lower bound in task structure
5050 // ub         Pointer to loop upper bound in task structure
5051 // st         Loop stride
5052 // ub_glob    Global upper bound (used for lastprivate check)
5053 // num_tasks  Number of tasks to execute
5054 // grainsize  Number of loop iterations per task
5055 // extras     Number of chunks with grainsize+1 iterations
5056 // last_chunk Reduction of grainsize for last task
5057 // tc         Iterations count
5058 // num_t_min  Threshold to launch tasks recursively
5059 // task_dup   Tasks duplication routine
5060 // codeptr_ra Return address for OMPT events
5061 void __kmp_taskloop_recur(ident_t *loc, int gtid, kmp_task_t *task,
5062                           kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st,
5063                           kmp_uint64 ub_glob, kmp_uint64 num_tasks,
5064                           kmp_uint64 grainsize, kmp_uint64 extras,
5065                           kmp_int64 last_chunk, kmp_uint64 tc,
5066                           kmp_uint64 num_t_min,
5067 #if OMPT_SUPPORT
5068                           void *codeptr_ra,
5069 #endif
5070                           void *task_dup) {
5071   kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
5072   KMP_DEBUG_ASSERT(task != NULL);
5073   KMP_DEBUG_ASSERT(num_tasks > num_t_min);
5074   KA_TRACE(20,
5075            ("__kmp_taskloop_recur: T#%d, task %p: %lld tasks, grainsize"
5076             " %lld, extras %lld, last_chunk %lld, i=%lld,%lld(%d), dup %p\n",
5077             gtid, taskdata, num_tasks, grainsize, extras, last_chunk, *lb, *ub,
5078             st, task_dup));
5079   p_task_dup_t ptask_dup = (p_task_dup_t)task_dup;
5080   kmp_uint64 lower = *lb;
5081   kmp_info_t *thread = __kmp_threads[gtid];
5082   //  kmp_taskdata_t *current_task = thread->th.th_current_task;
5083   kmp_task_t *next_task;
5084   size_t lower_offset =
5085       (char *)lb - (char *)task; // remember offset of lb in the task structure
5086   size_t upper_offset =
5087       (char *)ub - (char *)task; // remember offset of ub in the task structure
5088 
5089   KMP_DEBUG_ASSERT(tc == num_tasks * grainsize +
5090                              (last_chunk < 0 ? last_chunk : extras));
5091   KMP_DEBUG_ASSERT(num_tasks > extras);
5092   KMP_DEBUG_ASSERT(num_tasks > 0);
5093 
5094   // split the loop in two halves
5095   kmp_uint64 lb1, ub0, tc0, tc1, ext0, ext1;
5096   kmp_int64 last_chunk0 = 0, last_chunk1 = 0;
5097   kmp_uint64 gr_size0 = grainsize;
5098   kmp_uint64 n_tsk0 = num_tasks >> 1; // num_tasks/2 to execute
5099   kmp_uint64 n_tsk1 = num_tasks - n_tsk0; // to schedule as a task
5100   if (last_chunk < 0) {
5101     ext0 = ext1 = 0;
5102     last_chunk1 = last_chunk;
5103     tc0 = grainsize * n_tsk0;
5104     tc1 = tc - tc0;
5105   } else if (n_tsk0 <= extras) {
5106     gr_size0++; // integrate extras into grainsize
5107     ext0 = 0; // no extra iters in 1st half
5108     ext1 = extras - n_tsk0; // remaining extras
5109     tc0 = gr_size0 * n_tsk0;
5110     tc1 = tc - tc0;
5111   } else { // n_tsk0 > extras
5112     ext1 = 0; // no extra iters in 2nd half
5113     ext0 = extras;
5114     tc1 = grainsize * n_tsk1;
5115     tc0 = tc - tc1;
5116   }
5117   ub0 = lower + st * (tc0 - 1);
5118   lb1 = ub0 + st;
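  // For example, with num_tasks=3, grainsize=3, extras=1 (so tc=10), st=1 and
  // last_chunk==0: n_tsk0=1 and n_tsk1=2; since n_tsk0 <= extras, gr_size0
  // becomes 4, giving tc0=4 and tc1=6, so the 1st half keeps iterations
  // [lower, lower+3] and the scheduled 2nd half starts at lb1 = lower+4.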
5119 
5120   // create pattern task for 2nd half of the loop
5121 #if OMPX_TASKGRAPH
5122   next_task = __kmp_task_dup_alloc(thread, task,
5123                                    /* taskloop_recur */ 1);
5124 #else
5125   next_task = __kmp_task_dup_alloc(thread, task); // duplicate the task
5126 #endif
5127   // adjust lower bound (upper bound is not changed) for the 2nd half
5128   *(kmp_uint64 *)((char *)next_task + lower_offset) = lb1;
5129   if (ptask_dup != NULL) // construct firstprivates, etc.
5130     ptask_dup(next_task, task, 0);
5131   *ub = ub0; // adjust upper bound for the 1st half
5132 
5133   // create auxiliary task for 2nd half of the loop
5134   // make sure new task has same parent task as the pattern task
5135   kmp_taskdata_t *current_task = thread->th.th_current_task;
5136   thread->th.th_current_task = taskdata->td_parent;
5137   kmp_task_t *new_task =
5138       __kmpc_omp_task_alloc(loc, gtid, 1, 3 * sizeof(void *),
5139                             sizeof(__taskloop_params_t), &__kmp_taskloop_task);
5140   // restore current task
5141   thread->th.th_current_task = current_task;
5142   __taskloop_params_t *p = (__taskloop_params_t *)new_task->shareds;
5143   p->task = next_task;
5144   p->lb = (kmp_uint64 *)((char *)next_task + lower_offset);
5145   p->ub = (kmp_uint64 *)((char *)next_task + upper_offset);
5146   p->task_dup = task_dup;
5147   p->st = st;
5148   p->ub_glob = ub_glob;
5149   p->num_tasks = n_tsk1;
5150   p->grainsize = grainsize;
5151   p->extras = ext1;
5152   p->last_chunk = last_chunk1;
5153   p->tc = tc1;
5154   p->num_t_min = num_t_min;
5155 #if OMPT_SUPPORT
5156   p->codeptr_ra = codeptr_ra;
5157 #endif
5158 
5159 #if OMPX_TASKGRAPH
5160   kmp_taskdata_t *new_task_data = KMP_TASK_TO_TASKDATA(new_task);
5161   new_task_data->tdg = taskdata->tdg;
5162   new_task_data->is_taskgraph = 0;
5163 #endif
5164 
5165 #if OMPT_SUPPORT
5166   // schedule new task with correct return address for OMPT events
5167   __kmp_omp_taskloop_task(NULL, gtid, new_task, codeptr_ra);
5168 #else
5169   __kmp_omp_task(gtid, new_task, true); // schedule new task
5170 #endif
5171 
5172   // execute the 1st half of current subrange
5173   if (n_tsk0 > num_t_min)
5174     __kmp_taskloop_recur(loc, gtid, task, lb, ub, st, ub_glob, n_tsk0, gr_size0,
5175                          ext0, last_chunk0, tc0, num_t_min,
5176 #if OMPT_SUPPORT
5177                          codeptr_ra,
5178 #endif
5179                          task_dup);
5180   else
5181     __kmp_taskloop_linear(loc, gtid, task, lb, ub, st, ub_glob, n_tsk0,
5182                           gr_size0, ext0, last_chunk0, tc0,
5183 #if OMPT_SUPPORT
5184                           codeptr_ra,
5185 #endif
5186                           task_dup);
5187 
5188   KA_TRACE(40, ("__kmp_taskloop_recur(exit): T#%d\n", gtid));
5189 }
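// Note: each recursion level offloads the upper half of the range as a
// __kmp_taskloop_task and keeps bisecting the lower half itself, so both
// halves continue to split in parallel until a half has at most num_t_min
// tasks, at which point __kmp_taskloop_linear takes over.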
5190 
5191 static void __kmp_taskloop(ident_t *loc, int gtid, kmp_task_t *task, int if_val,
5192                            kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st,
5193                            int nogroup, int sched, kmp_uint64 grainsize,
5194                            int modifier, void *task_dup) {
5195   kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
5196   KMP_DEBUG_ASSERT(task != NULL);
5197   if (nogroup == 0) {
5198 #if OMPT_SUPPORT && OMPT_OPTIONAL
5199     OMPT_STORE_RETURN_ADDRESS(gtid);
5200 #endif
5201     __kmpc_taskgroup(loc, gtid);
5202   }
5203 
5204 #if OMPX_TASKGRAPH
5205   KMP_ATOMIC_DEC(&__kmp_tdg_task_id);
5206 #endif
5207   // =========================================================================
5208   // calculate loop parameters
5209   kmp_taskloop_bounds_t task_bounds(task, lb, ub);
5210   kmp_uint64 tc;
5211   // compiler provides global bounds here
5212   kmp_uint64 lower = task_bounds.get_lb();
5213   kmp_uint64 upper = task_bounds.get_ub();
5214   kmp_uint64 ub_glob = upper; // global upper used to calc lastprivate flag
5215   kmp_uint64 num_tasks = 0, extras = 0;
5216   kmp_int64 last_chunk =
5217       0; // reduce grainsize of last task by last_chunk in strict mode
5218   kmp_uint64 num_tasks_min = __kmp_taskloop_min_tasks;
5219   kmp_info_t *thread = __kmp_threads[gtid];
5220   kmp_taskdata_t *current_task = thread->th.th_current_task;
5221 
5222   KA_TRACE(20, ("__kmp_taskloop: T#%d, task %p, lb %lld, ub %lld, st %lld, "
5223                 "grain %llu(%d, %d), dup %p\n",
5224                 gtid, taskdata, lower, upper, st, grainsize, sched, modifier,
5225                 task_dup));
5226 
5227   // compute trip count
5228   if (st == 1) { // most common case
5229     tc = upper - lower + 1;
5230   } else if (st < 0) {
5231     tc = (lower - upper) / (-st) + 1;
5232   } else { // st > 0
5233     tc = (upper - lower) / st + 1;
5234   }
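  // E.g., lb=0, ub=9, st=2 yields tc = (9 - 0) / 2 + 1 = 5 iterations
  // (0, 2, 4, 6, 8); a negative stride such as lb=9, ub=0, st=-3 yields
  // tc = (9 - 0) / 3 + 1 = 4 iterations (9, 6, 3, 0).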
5235   if (tc == 0) {
5236     KA_TRACE(20, ("__kmp_taskloop(exit): T#%d zero-trip loop\n", gtid));
5237     // free the pattern task and exit
5238     __kmp_task_start(gtid, task, current_task);
5239     // do not execute anything for zero-trip loop
5240     __kmp_task_finish<false>(gtid, task, current_task);
5241     return;
5242   }
5243 
5244 #if OMPT_SUPPORT && OMPT_OPTIONAL
5245   ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
5246   ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
5247   if (ompt_enabled.ompt_callback_work) {
5248     ompt_callbacks.ompt_callback(ompt_callback_work)(
5249         ompt_work_taskloop, ompt_scope_begin, &(team_info->parallel_data),
5250         &(task_info->task_data), tc, OMPT_GET_RETURN_ADDRESS(0));
5251   }
5252 #endif
5253 
5254   if (num_tasks_min == 0)
5255     // TODO: can we choose a better default heuristic?
5256     num_tasks_min =
5257         KMP_MIN(thread->th.th_team_nproc * 10, INITIAL_TASK_DEQUE_SIZE);
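  // E.g., when __kmp_taskloop_min_tasks is 0, a 4-thread team defaults to
  // num_tasks_min = min(40, INITIAL_TASK_DEQUE_SIZE); num_tasks_min is the
  // threshold below which the taskloop is spawned linearly rather than
  // recursively (see the if/else chain further down).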
5258 
5259   // compute num_tasks/grainsize based on the input provided
5260   switch (sched) {
5261   case 0: // no schedule clause specified, we can choose the default
5262     // let's try to schedule (team_size*10) tasks
5263     grainsize = thread->th.th_team_nproc * static_cast<kmp_uint64>(10);
5264     KMP_FALLTHROUGH();
5265   case 2: // num_tasks provided
5266     if (grainsize > tc) {
5267       num_tasks = tc; // too big num_tasks requested, adjust values
5268       grainsize = 1;
5269       extras = 0;
5270     } else {
5271       num_tasks = grainsize;
5272       grainsize = tc / num_tasks;
5273       extras = tc % num_tasks;
5274     }
5275     break;
5276   case 1: // grainsize provided
5277     if (grainsize > tc) {
5278       num_tasks = 1;
5279       grainsize = tc; // too big grainsize requested, adjust values
5280       extras = 0;
5281     } else {
5282       if (modifier) {
5283         num_tasks = (tc + grainsize - 1) / grainsize;
5284         last_chunk = tc - (num_tasks * grainsize);
5285         extras = 0;
5286       } else {
5287         num_tasks = tc / grainsize;
5288         // adjust grainsize for balanced distribution of iterations
5289         grainsize = tc / num_tasks;
5290         extras = tc % num_tasks;
5291       }
5292     }
5293     break;
5294   default:
5295     KMP_ASSERT2(0, "unknown scheduling of taskloop");
5296   }
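  // E.g., a grainsize(7) clause (sched==1, no strict modifier) with tc==100
  // gives num_tasks = 100/7 = 14, rebalanced grainsize = 100/14 = 7 and
  // extras = 100%14 = 2, so 2 tasks run 8 iterations and 12 tasks run 7
  // (2*8 + 12*7 == 100), which matches the assertion below.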
5297 
5298   KMP_DEBUG_ASSERT(tc == num_tasks * grainsize +
5299                              (last_chunk < 0 ? last_chunk : extras));
5300   KMP_DEBUG_ASSERT(num_tasks > extras);
5301   KMP_DEBUG_ASSERT(num_tasks > 0);
5302   // =========================================================================
5303 
5304   // check the if-clause value first
5305   // Also require GOMP_taskloop to reduce to linear (taskdata->td_flags.native)
5306   if (if_val == 0) { // if(0) specified, mark task as serial
5307     taskdata->td_flags.task_serial = 1;
5308     taskdata->td_flags.tiedness = TASK_TIED; // AC: serial task cannot be untied
5309     // always start serial tasks linearly
5310     __kmp_taskloop_linear(loc, gtid, task, lb, ub, st, ub_glob, num_tasks,
5311                           grainsize, extras, last_chunk, tc,
5312 #if OMPT_SUPPORT
5313                           OMPT_GET_RETURN_ADDRESS(0),
5314 #endif
5315                           task_dup);
5316     // !taskdata->td_flags.native => currently force linear spawning of tasks
5317     // for GOMP_taskloop
5318   } else if (num_tasks > num_tasks_min && !taskdata->td_flags.native) {
5319     KA_TRACE(20, ("__kmp_taskloop: T#%d, go recursive: tc %llu, #tasks %llu"
5320                   "(%lld), grain %llu, extras %llu, last_chunk %lld\n",
5321                   gtid, tc, num_tasks, num_tasks_min, grainsize, extras,
5322                   last_chunk));
5323     __kmp_taskloop_recur(loc, gtid, task, lb, ub, st, ub_glob, num_tasks,
5324                          grainsize, extras, last_chunk, tc, num_tasks_min,
5325 #if OMPT_SUPPORT
5326                          OMPT_GET_RETURN_ADDRESS(0),
5327 #endif
5328                          task_dup);
5329   } else {
5330     KA_TRACE(20, ("__kmp_taskloop: T#%d, go linear: tc %llu, #tasks %llu"
5331                   "(%lld), grain %llu, extras %llu, last_chunk %lld\n",
5332                   gtid, tc, num_tasks, num_tasks_min, grainsize, extras,
5333                   last_chunk));
5334     __kmp_taskloop_linear(loc, gtid, task, lb, ub, st, ub_glob, num_tasks,
5335                           grainsize, extras, last_chunk, tc,
5336 #if OMPT_SUPPORT
5337                           OMPT_GET_RETURN_ADDRESS(0),
5338 #endif
5339                           task_dup);
5340   }
5341 
5342 #if OMPT_SUPPORT && OMPT_OPTIONAL
5343   if (ompt_enabled.ompt_callback_work) {
5344     ompt_callbacks.ompt_callback(ompt_callback_work)(
5345         ompt_work_taskloop, ompt_scope_end, &(team_info->parallel_data),
5346         &(task_info->task_data), tc, OMPT_GET_RETURN_ADDRESS(0));
5347   }
5348 #endif
5349 
5350   if (nogroup == 0) {
5351 #if OMPT_SUPPORT && OMPT_OPTIONAL
5352     OMPT_STORE_RETURN_ADDRESS(gtid);
5353 #endif
5354     __kmpc_end_taskgroup(loc, gtid);
5355   }
5356   KA_TRACE(20, ("__kmp_taskloop(exit): T#%d\n", gtid));
5357 }
5358 
5359 /*!
5360 @ingroup TASKING
5361 @param loc       Source location information
5362 @param gtid      Global thread ID
5363 @param task      Task structure
5364 @param if_val    Value of the if clause
5365 @param lb        Pointer to loop lower bound in task structure
5366 @param ub        Pointer to loop upper bound in task structure
5367 @param st        Loop stride
5368 @param nogroup   Flag, 1 if nogroup clause specified, 0 otherwise
5369 @param sched     Schedule specified 0/1/2 for none/grainsize/num_tasks
5370 @param grainsize Schedule value if specified
5371 @param task_dup  Task duplication routine
5372 
5373 Execute the taskloop construct.
5374 */
5375 void __kmpc_taskloop(ident_t *loc, int gtid, kmp_task_t *task, int if_val,
5376                      kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st, int nogroup,
5377                      int sched, kmp_uint64 grainsize, void *task_dup) {
5378   __kmp_assert_valid_gtid(gtid);
5379   KA_TRACE(20, ("__kmpc_taskloop(enter): T#%d\n", gtid));
5380   __kmp_taskloop(loc, gtid, task, if_val, lb, ub, st, nogroup, sched, grainsize,
5381                  0, task_dup);
5382   KA_TRACE(20, ("__kmpc_taskloop(exit): T#%d\n", gtid));
5383 }
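// Usage sketch (assumed front-end lowering; exact codegen may differ): for
//   #pragma omp taskloop grainsize(4)
//   for (int i = 0; i < n; ++i) body(i);
// the compiler builds a pattern task holding the loop bounds and then calls
//   __kmpc_taskloop(loc, gtid, task, /*if_val=*/1, /*lb=*/&bounds.lb,
//                   /*ub=*/&bounds.ub, /*st=*/1, /*nogroup=*/0, /*sched=*/1,
//                   /*grainsize=*/4, task_dup);
// where lb/ub point at the bounds stored inside the pattern task.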
5384 
5385 /*!
5386 @ingroup TASKING
5387 @param loc       Source location information
5388 @param gtid      Global thread ID
5389 @param task      Task structure
5390 @param if_val    Value of the if clause
5391 @param lb        Pointer to loop lower bound in task structure
5392 @param ub        Pointer to loop upper bound in task structure
5393 @param st        Loop stride
5394 @param nogroup   Flag, 1 if nogroup clause specified, 0 otherwise
5395 @param sched     Schedule specified 0/1/2 for none/grainsize/num_tasks
5396 @param grainsize Schedule value if specified
5397 @param modifier  Modifier 'strict' for sched, 1 if present, 0 otherwise
5398 @param task_dup  Task duplication routine
5399 
5400 Execute the taskloop construct.
5401 */
5402 void __kmpc_taskloop_5(ident_t *loc, int gtid, kmp_task_t *task, int if_val,
5403                        kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st,
5404                        int nogroup, int sched, kmp_uint64 grainsize,
5405                        int modifier, void *task_dup) {
5406   __kmp_assert_valid_gtid(gtid);
5407   KA_TRACE(20, ("__kmpc_taskloop_5(enter): T#%d\n", gtid));
5408   __kmp_taskloop(loc, gtid, task, if_val, lb, ub, st, nogroup, sched, grainsize,
5409                  modifier, task_dup);
5410   KA_TRACE(20, ("__kmpc_taskloop_5(exit): T#%d\n", gtid));
5411 }
5412 
5413 /*!
5414 @ingroup TASKING
5415 @param gtid Global Thread ID of current thread
5416 @return Returns a pointer to the thread's current task async handle. If no task
5417 is present or gtid is invalid, returns NULL.
5418 
5419 Acquires a pointer to the target async handle from the current task.
5420 */
5421 void **__kmpc_omp_get_target_async_handle_ptr(kmp_int32 gtid) {
5422   if (gtid == KMP_GTID_DNE)
5423     return NULL;
5424 
5425   kmp_info_t *thread = __kmp_thread_from_gtid(gtid);
5426   kmp_taskdata_t *taskdata = thread->th.th_current_task;
5427 
5428   if (!taskdata)
5429     return NULL;
5430 
5431   return &taskdata->td_target_data.async_handle;
5432 }
5433 
5434 /*!
5435 @ingroup TASKING
5436 @param gtid Global Thread ID of current thread
5437 @return Returns TRUE if the current task being executed by the given thread has
5438 a task team allocated to it. Otherwise, returns FALSE.
5439 
5440 Checks if the current thread has a task team.
5441 */
5442 bool __kmpc_omp_has_task_team(kmp_int32 gtid) {
5443   if (gtid == KMP_GTID_DNE)
5444     return FALSE;
5445 
5446   kmp_info_t *thread = __kmp_thread_from_gtid(gtid);
5447   kmp_taskdata_t *taskdata = thread->th.th_current_task;
5448 
5449   if (!taskdata)
5450     return FALSE;
5451 
5452   return taskdata->td_task_team != NULL;
5453 }
5454 
5455 #if OMPX_TASKGRAPH
5456 // __kmp_find_tdg: identify a TDG through its ID
5457 // tdg_id: ID of the TDG
5458 // returns: If a TDG corresponding to this ID is found and it is not in
5459 // its initial state, return a pointer to it; otherwise return nullptr
5460 static kmp_tdg_info_t *__kmp_find_tdg(kmp_int32 tdg_id) {
5461   kmp_tdg_info_t *res = nullptr;
5462   if (__kmp_max_tdgs == 0)
5463     return res;
5464 
5465   if (__kmp_global_tdgs == NULL)
5466     __kmp_global_tdgs = (kmp_tdg_info_t **)__kmp_allocate(
5467         sizeof(kmp_tdg_info_t *) * __kmp_max_tdgs);
5468 
5469   if ((__kmp_global_tdgs[tdg_id]) &&
5470       (__kmp_global_tdgs[tdg_id]->tdg_status != KMP_TDG_NONE))
5471     res = __kmp_global_tdgs[tdg_id];
5472   return res;
5473 }
5474 
5475 // __kmp_print_tdg_dot: prints the TDG to a dot file
5476 // tdg:    ID of the TDG
5477 // gtid:   Global Thread ID
5478 void __kmp_print_tdg_dot(kmp_tdg_info_t *tdg, kmp_int32 gtid) {
5479   kmp_int32 tdg_id = tdg->tdg_id;
5480   KA_TRACE(10, ("__kmp_print_tdg_dot(enter): T#%d tdg_id=%d \n", gtid, tdg_id));
5481 
5482   char file_name[20];
5483   sprintf(file_name, "tdg_%d.dot", tdg_id);
5484   kmp_safe_raii_file_t tdg_file(file_name, "w");
5485 
5486   kmp_int32 num_tasks = KMP_ATOMIC_LD_RLX(&tdg->num_tasks);
5487   fprintf(tdg_file,
5488           "digraph TDG {\n"
5489           "   compound=true\n"
5490           "   subgraph cluster {\n"
5491           "      label=TDG_%d\n",
5492           tdg_id);
5493   for (kmp_int32 i = 0; i < num_tasks; i++) {
5494     fprintf(tdg_file, "      %d[style=bold]\n", i);
5495   }
5496   fprintf(tdg_file, "   }\n");
5497   for (kmp_int32 i = 0; i < num_tasks; i++) {
5498     kmp_int32 nsuccessors = tdg->record_map[i].nsuccessors;
5499     kmp_int32 *successors = tdg->record_map[i].successors;
5500     if (nsuccessors > 0) {
5501       for (kmp_int32 j = 0; j < nsuccessors; j++)
5502         fprintf(tdg_file, "   %d -> %d \n", i, successors[j]);
5503     }
5504   }
5505   fprintf(tdg_file, "}");
5506   KA_TRACE(10, ("__kmp_print_tdg_dot(exit): T#%d tdg_id=%d \n", gtid, tdg_id));
5507 }
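// For a 3-task TDG with id 0 where task 0 precedes tasks 1 and 2, the
// generated tdg_0.dot would read:
//   digraph TDG {
//      compound=true
//      subgraph cluster {
//         label=TDG_0
//         0[style=bold]
//         1[style=bold]
//         2[style=bold]
//      }
//      0 -> 1
//      0 -> 2
//   }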
5508 
5509 // __kmp_exec_tdg: launch the execution of a previously
5510 // recorded TDG
5511 // gtid:   Global Thread ID
5512 // tdg:    ID of the TDG
5513 void __kmp_exec_tdg(kmp_int32 gtid, kmp_tdg_info_t *tdg) {
5514   KMP_DEBUG_ASSERT(tdg->tdg_status == KMP_TDG_READY);
5515   KA_TRACE(10, ("__kmp_exec_tdg(enter): T#%d tdg_id=%d num_roots=%d\n", gtid,
5516                 tdg->tdg_id, tdg->num_roots));
5517   kmp_node_info_t *this_record_map = tdg->record_map;
5518   kmp_int32 *this_root_tasks = tdg->root_tasks;
5519   kmp_int32 this_num_roots = tdg->num_roots;
5520   kmp_int32 this_num_tasks = KMP_ATOMIC_LD_RLX(&tdg->num_tasks);
5521 
5522   kmp_info_t *thread = __kmp_threads[gtid];
5523   kmp_taskdata_t *parent_task = thread->th.th_current_task;
5524 
5525   if (tdg->rec_taskred_data) {
5526     __kmpc_taskred_init(gtid, tdg->rec_num_taskred, tdg->rec_taskred_data);
5527   }
5528 
5529   for (kmp_int32 j = 0; j < this_num_tasks; j++) {
5530     kmp_taskdata_t *td = KMP_TASK_TO_TASKDATA(this_record_map[j].task);
5531 
5532     td->td_parent = parent_task;
5533     this_record_map[j].parent_task = parent_task;
5534 
5535     kmp_taskgroup_t *parent_taskgroup =
5536         this_record_map[j].parent_task->td_taskgroup;
5537 
5538     KMP_ATOMIC_ST_RLX(&this_record_map[j].npredecessors_counter,
5539                       this_record_map[j].npredecessors);
5540     KMP_ATOMIC_INC(&this_record_map[j].parent_task->td_incomplete_child_tasks);
5541 
5542     if (parent_taskgroup) {
5543       KMP_ATOMIC_INC(&parent_taskgroup->count);
5544       // The taskgroup is different so we must update it
5545       td->td_taskgroup = parent_taskgroup;
5546     } else if (td->td_taskgroup != nullptr) {
5547       // If the parent doesn't have a taskgroup, remove it from the task
5548       td->td_taskgroup = nullptr;
5549     }
5550     if (this_record_map[j].parent_task->td_flags.tasktype == TASK_EXPLICIT)
5551       KMP_ATOMIC_INC(&this_record_map[j].parent_task->td_allocated_child_tasks);
5552   }
5553 
5554   for (kmp_int32 j = 0; j < this_num_roots; ++j) {
5555     __kmp_omp_task(gtid, this_record_map[this_root_tasks[j]].task, true);
5556   }
5557   KA_TRACE(10, ("__kmp_exec_tdg(exit): T#%d tdg_id=%d num_roots=%d\n", gtid,
5558                 tdg->tdg_id, tdg->num_roots));
5559 }
5560 
5561 // __kmp_start_record: set up a TDG structure and set the
5562 // recording flag to true
5563 // gtid:        Global Thread ID of the encountering thread
5564 // input_flags: Flags associated with the TDG
5565 // tdg_id:      ID of the TDG to record
5566 static inline void __kmp_start_record(kmp_int32 gtid,
5567                                       kmp_taskgraph_flags_t *flags,
5568                                       kmp_int32 tdg_id) {
5569   kmp_tdg_info_t *tdg =
5570       (kmp_tdg_info_t *)__kmp_allocate(sizeof(kmp_tdg_info_t));
5571   __kmp_global_tdgs[__kmp_curr_tdg_idx] = tdg;
5572   // Initializing the TDG structure
5573   tdg->tdg_id = tdg_id;
5574   tdg->map_size = INIT_MAPSIZE;
5575   tdg->num_roots = -1;
5576   tdg->root_tasks = nullptr;
5577   tdg->tdg_status = KMP_TDG_RECORDING;
5578   tdg->rec_num_taskred = 0;
5579   tdg->rec_taskred_data = nullptr;
5580   KMP_ATOMIC_ST_RLX(&tdg->num_tasks, 0);
5581 
5582   // Initializing the list of nodes in this TDG
5583   kmp_node_info_t *this_record_map =
5584       (kmp_node_info_t *)__kmp_allocate(INIT_MAPSIZE * sizeof(kmp_node_info_t));
5585   for (kmp_int32 i = 0; i < INIT_MAPSIZE; i++) {
5586     kmp_int32 *successorsList =
5587         (kmp_int32 *)__kmp_allocate(__kmp_successors_size * sizeof(kmp_int32));
5588     this_record_map[i].task = nullptr;
5589     this_record_map[i].successors = successorsList;
5590     this_record_map[i].nsuccessors = 0;
5591     this_record_map[i].npredecessors = 0;
5592     this_record_map[i].successors_size = __kmp_successors_size;
5593     KMP_ATOMIC_ST_RLX(&this_record_map[i].npredecessors_counter, 0);
5594   }
5595 
5596   __kmp_global_tdgs[__kmp_curr_tdg_idx]->record_map = this_record_map;
5597 }
5598 
5599 // __kmpc_start_record_task: Wrapper around __kmp_start_record to mark
5600 // the beginning of the record process of a task region
5601 // loc_ref:     Location of TDG, not used yet
5602 // gtid:        Global Thread ID of the encountering thread
5603 // input_flags: Flags associated with the TDG
5604 // tdg_id:      ID of the TDG to record; for now, an incremental integer
5605 // returns:     1 if we record, otherwise, 0
5606 kmp_int32 __kmpc_start_record_task(ident_t *loc_ref, kmp_int32 gtid,
5607                                    kmp_int32 input_flags, kmp_int32 tdg_id) {
5608 
5609   kmp_int32 res;
5610   kmp_taskgraph_flags_t *flags = (kmp_taskgraph_flags_t *)&input_flags;
5611   KA_TRACE(10,
5612            ("__kmpc_start_record_task(enter): T#%d loc=%p flags=%d tdg_id=%d\n",
5613             gtid, loc_ref, input_flags, tdg_id));
5614 
5615   if (__kmp_max_tdgs == 0) {
5616     KA_TRACE(
5617         10,
5618         ("__kmpc_start_record_task(abandon): T#%d loc=%p flags=%d tdg_id = %d, "
5619          "__kmp_max_tdgs = 0\n",
5620          gtid, loc_ref, input_flags, tdg_id));
5621     return 1;
5622   }
5623 
5624   __kmpc_taskgroup(loc_ref, gtid);
5625   if (kmp_tdg_info_t *tdg = __kmp_find_tdg(tdg_id)) {
5626     // TODO: use re_record flag
5627     __kmp_exec_tdg(gtid, tdg);
5628     res = 0;
5629   } else {
5630     __kmp_curr_tdg_idx = tdg_id;
5631     KMP_DEBUG_ASSERT(__kmp_curr_tdg_idx < __kmp_max_tdgs);
5632     __kmp_start_record(gtid, flags, tdg_id);
5633     __kmp_num_tdg++;
5634     res = 1;
5635   }
5636   KA_TRACE(10, ("__kmpc_start_record_task(exit): T#%d TDG %d starts to %s\n",
5637                 gtid, tdg_id, res ? "record" : "execute"));
5638   return res;
5639 }
5640 
5641 // __kmp_end_record: set up a TDG after recording it
5642 // gtid:   Global thread ID
5643 // tdg:    Pointer to the TDG
5644 void __kmp_end_record(kmp_int32 gtid, kmp_tdg_info_t *tdg) {
5645   // Store roots
5646   kmp_node_info_t *this_record_map = tdg->record_map;
5647   kmp_int32 this_num_tasks = KMP_ATOMIC_LD_RLX(&tdg->num_tasks);
5648   kmp_int32 *this_root_tasks =
5649       (kmp_int32 *)__kmp_allocate(this_num_tasks * sizeof(kmp_int32));
5650   kmp_int32 this_map_size = tdg->map_size;
5651   kmp_int32 this_num_roots = 0;
5652   kmp_info_t *thread = __kmp_threads[gtid];
5653 
5654   for (kmp_int32 i = 0; i < this_num_tasks; i++) {
5655     if (this_record_map[i].npredecessors == 0) {
5656       this_root_tasks[this_num_roots++] = i;
5657     }
5658   }
5659 
5660   // Update with roots info and mapsize
5661   tdg->map_size = this_map_size;
5662   tdg->num_roots = this_num_roots;
5663   tdg->root_tasks = this_root_tasks;
5664   KMP_DEBUG_ASSERT(tdg->tdg_status == KMP_TDG_RECORDING);
5665   tdg->tdg_status = KMP_TDG_READY;
5666 
5667   if (thread->th.th_current_task->td_dephash) {
5668     __kmp_dephash_free(thread, thread->th.th_current_task->td_dephash);
5669     thread->th.th_current_task->td_dephash = NULL;
5670   }
5671 
5672   // Reset predecessor counter
5673   for (kmp_int32 i = 0; i < this_num_tasks; i++) {
5674     KMP_ATOMIC_ST_RLX(&this_record_map[i].npredecessors_counter,
5675                       this_record_map[i].npredecessors);
5676   }
5677   KMP_ATOMIC_ST_RLX(&__kmp_tdg_task_id, 0);
5678 
5679   if (__kmp_tdg_dot)
5680     __kmp_print_tdg_dot(tdg, gtid);
5681 }
5682 
5683 // __kmpc_end_record_task: wrapper around __kmp_end_record to mark
5684 // the end of the recording phase
5685 //
5686 // loc_ref:      Source location information
5687 // gtid:         Global thread ID
5688 // input_flags:  Flags attached to the graph
5689 // tdg_id:       ID of the TDG just finished recording
5690 void __kmpc_end_record_task(ident_t *loc_ref, kmp_int32 gtid,
5691                             kmp_int32 input_flags, kmp_int32 tdg_id) {
5692   kmp_tdg_info_t *tdg = __kmp_find_tdg(tdg_id);
5693 
5694   KA_TRACE(10, ("__kmpc_end_record_task(enter): T#%d loc=%p finishes recording"
5695                 " tdg=%d with flags=%d\n",
5696                 gtid, loc_ref, tdg_id, input_flags));
5697   if (__kmp_max_tdgs) {
5698     // TODO: use input_flags->nowait
5699     __kmpc_end_taskgroup(loc_ref, gtid);
5700     if (__kmp_tdg_is_recording(tdg->tdg_status))
5701       __kmp_end_record(gtid, tdg);
5702   }
5703   KA_TRACE(10, ("__kmpc_end_record_task(exit): T#%d loc=%p finished recording"
5704                 " tdg=%d, its status is now READY\n",
5705                 gtid, loc_ref, tdg_id));
5706 }
5707 #endif
5708