1 /*
2  * kmp_taskdeps.cpp
3  */
4 
5 //===----------------------------------------------------------------------===//
6 //
7 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8 // See https://llvm.org/LICENSE.txt for license information.
9 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10 //
11 //===----------------------------------------------------------------------===//
12 
13 //#define KMP_SUPPORT_GRAPH_OUTPUT 1
14 
15 #include "kmp.h"
16 #include "kmp_io.h"
17 #include "kmp_wait_release.h"
18 #include "kmp_taskdeps.h"
19 #if OMPT_SUPPORT
20 #include "ompt-specific.h"
21 #endif
22 
23 // TODO: Improve memory allocation? keep a list of pre-allocated structures?
24 // allocate in blocks? re-use finished list entries?
25 // TODO: don't use atomic ref counters for stack-allocated nodes.
26 // TODO: find an alternative to atomic refs for heap-allocated nodes?
27 // TODO: Finish graph output support
28 // TODO: kmp_lock_t seems a tad too big (and heavyweight) for this. Check other
29 // runtime locks
30 // TODO: Any ITT support needed?
31 
32 #ifdef KMP_SUPPORT_GRAPH_OUTPUT
33 static std::atomic<kmp_int32> kmp_node_id_seed = 0;
34 #endif
35 
36 static void __kmp_init_node(kmp_depnode_t *node) {
37   node->dn.successors = NULL;
38   node->dn.task = NULL; // will point to the right task
39   // once dependences have been processed
40   for (int i = 0; i < MAX_MTX_DEPS; ++i)
41     node->dn.mtx_locks[i] = NULL;
42   node->dn.mtx_num_locks = 0;
43   __kmp_init_lock(&node->dn.lock);
44   KMP_ATOMIC_ST_RLX(&node->dn.nrefs, 1); // init creates the first reference
45 #ifdef KMP_SUPPORT_GRAPH_OUTPUT
46   node->dn.id = KMP_ATOMIC_INC(&kmp_node_id_seed);
47 #endif
48 #if USE_ITT_BUILD && USE_ITT_NOTIFY
49   __itt_sync_create(node, "OMP task dep node", NULL, 0);
50 #endif
51 }
52 
53 static inline kmp_depnode_t *__kmp_node_ref(kmp_depnode_t *node) {
54   KMP_ATOMIC_INC(&node->dn.nrefs);
55   return node;
56 }
57 
58 enum { KMP_DEPHASH_OTHER_SIZE = 97, KMP_DEPHASH_MASTER_SIZE = 997 };
59 
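// Bucket counts used when the dependence hash table is extended: each call to
// __kmp_dephash_extend moves to the next "generation" size, up to MAX_GEN.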
60 size_t sizes[] = {997, 2003, 4001, 8191, 16001, 32003, 64007, 131071, 270029};
61 const size_t MAX_GEN = 8;
62 
63 static inline size_t __kmp_dephash_hash(kmp_intptr_t addr, size_t hsize) {
64   // TODO alternate to try: set = (((Addr64)(addrUsefulBits * 9.618)) %
65   // m_num_sets );
66   return ((addr >> 6) ^ (addr >> 2)) % hsize;
67 }
68 
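// Grow the dependence hash table to the next generation size and rehash all
// existing entries into the new buckets; once MAX_GEN is reached the current
// table is returned unchanged.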
69 static kmp_dephash_t *__kmp_dephash_extend(kmp_info_t *thread,
70                                            kmp_dephash_t *current_dephash) {
71   kmp_dephash_t *h;
72 
73   size_t gen = current_dephash->generation + 1;
74   if (gen >= MAX_GEN)
75     return current_dephash;
76   size_t new_size = sizes[gen];
77 
78   size_t size_to_allocate =
79       new_size * sizeof(kmp_dephash_entry_t *) + sizeof(kmp_dephash_t);
80 
81 #if USE_FAST_MEMORY
82   h = (kmp_dephash_t *)__kmp_fast_allocate(thread, size_to_allocate);
83 #else
84   h = (kmp_dephash_t *)__kmp_thread_malloc(thread, size_to_allocate);
85 #endif
86 
87   h->size = new_size;
88   h->nelements = current_dephash->nelements;
89   h->buckets = (kmp_dephash_entry **)(h + 1);
90   h->generation = gen;
91   h->nconflicts = 0;
92   h->last_all = current_dephash->last_all;
93 
94   // make sure buckets are properly initialized
95   for (size_t i = 0; i < new_size; i++) {
96     h->buckets[i] = NULL;
97   }
98 
99   // insert existing elements in the new table
100   for (size_t i = 0; i < current_dephash->size; i++) {
101     kmp_dephash_entry_t *next, *entry;
102     for (entry = current_dephash->buckets[i]; entry; entry = next) {
103       next = entry->next_in_bucket;
104       // Compute the new hash using the new size, and insert the entry in
105       // the new bucket.
106       size_t new_bucket = __kmp_dephash_hash(entry->addr, h->size);
107       entry->next_in_bucket = h->buckets[new_bucket];
108       if (entry->next_in_bucket) {
109         h->nconflicts++;
110       }
111       h->buckets[new_bucket] = entry;
112     }
113   }
114 
115   // Free old hash table
116 #if USE_FAST_MEMORY
117   __kmp_fast_free(thread, current_dephash);
118 #else
119   __kmp_thread_free(thread, current_dephash);
120 #endif
121 
122   return h;
123 }
124 
125 static kmp_dephash_t *__kmp_dephash_create(kmp_info_t *thread,
126                                            kmp_taskdata_t *current_task) {
127   kmp_dephash_t *h;
128 
129   size_t h_size;
130 
131   if (current_task->td_flags.tasktype == TASK_IMPLICIT)
132     h_size = KMP_DEPHASH_MASTER_SIZE;
133   else
134     h_size = KMP_DEPHASH_OTHER_SIZE;
135 
136   size_t size = h_size * sizeof(kmp_dephash_entry_t *) + sizeof(kmp_dephash_t);
137 
138 #if USE_FAST_MEMORY
139   h = (kmp_dephash_t *)__kmp_fast_allocate(thread, size);
140 #else
141   h = (kmp_dephash_t *)__kmp_thread_malloc(thread, size);
142 #endif
143   h->size = h_size;
144 
145   h->generation = 0;
146   h->nelements = 0;
147   h->nconflicts = 0;
148   h->buckets = (kmp_dephash_entry **)(h + 1);
149   h->last_all = NULL;
150 
151   for (size_t i = 0; i < h_size; i++)
152     h->buckets[i] = 0;
153 
154   return h;
155 }
156 
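// Look up the dephash entry for 'addr', creating it if it does not exist yet.
// The table is extended first when the number of conflicts has reached the
// number of buckets.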
157 static kmp_dephash_entry *__kmp_dephash_find(kmp_info_t *thread,
158                                              kmp_dephash_t **hash,
159                                              kmp_intptr_t addr) {
160   kmp_dephash_t *h = *hash;
161   if (h->nelements != 0 && h->nconflicts / h->size >= 1) {
162     *hash = __kmp_dephash_extend(thread, h);
163     h = *hash;
164   }
165   size_t bucket = __kmp_dephash_hash(addr, h->size);
166 
167   kmp_dephash_entry_t *entry;
168   for (entry = h->buckets[bucket]; entry; entry = entry->next_in_bucket)
169     if (entry->addr == addr)
170       break;
171 
172   if (entry == NULL) {
173 // create entry. This is only done by one thread so no locking required
174 #if USE_FAST_MEMORY
175     entry = (kmp_dephash_entry_t *)__kmp_fast_allocate(
176         thread, sizeof(kmp_dephash_entry_t));
177 #else
178     entry = (kmp_dephash_entry_t *)__kmp_thread_malloc(
179         thread, sizeof(kmp_dephash_entry_t));
180 #endif
181     entry->addr = addr;
182     if (!h->last_all) // no predecessor task with omp_all_memory dependence
183       entry->last_out = NULL;
184     else // else link the omp_all_memory depnode to the new entry
185       entry->last_out = __kmp_node_ref(h->last_all);
186     entry->last_set = NULL;
187     entry->prev_set = NULL;
188     entry->last_flag = 0;
189     entry->mtx_lock = NULL;
190     entry->next_in_bucket = h->buckets[bucket];
191     h->buckets[bucket] = entry;
192     h->nelements++;
193     if (entry->next_in_bucket)
194       h->nconflicts++;
195   }
196   return entry;
197 }
198 
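// Prepend 'node' to 'list', taking an extra reference on the node, and return
// the new list head.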
199 static kmp_depnode_list_t *__kmp_add_node(kmp_info_t *thread,
200                                           kmp_depnode_list_t *list,
201                                           kmp_depnode_t *node) {
202   kmp_depnode_list_t *new_head;
203 
204 #if USE_FAST_MEMORY
205   new_head = (kmp_depnode_list_t *)__kmp_fast_allocate(
206       thread, sizeof(kmp_depnode_list_t));
207 #else
208   new_head = (kmp_depnode_list_t *)__kmp_thread_malloc(
209       thread, sizeof(kmp_depnode_list_t));
210 #endif
211 
212   new_head->node = __kmp_node_ref(node);
213   new_head->next = list;
214 
215   return new_head;
216 }
217 
218 static inline void __kmp_track_dependence(kmp_int32 gtid, kmp_depnode_t *source,
219                                           kmp_depnode_t *sink,
220                                           kmp_task_t *sink_task) {
221 #if OMPX_TASKGRAPH
222   kmp_taskdata_t *task_source = KMP_TASK_TO_TASKDATA(source->dn.task);
223   kmp_taskdata_t *task_sink = KMP_TASK_TO_TASKDATA(sink_task);
224   if (source->dn.task && sink_task) {
225     // Not supporting dependency between two tasks that one is within the TDG
226     // and the other is not
227     KMP_ASSERT(task_source->is_taskgraph == task_sink->is_taskgraph);
228   }
229   if (task_sink->is_taskgraph &&
230       __kmp_tdg_is_recording(task_sink->tdg->tdg_status)) {
231     kmp_node_info_t *source_info =
232         &task_sink->tdg->record_map[task_source->td_task_id];
233     bool exists = false;
234     for (int i = 0; i < source_info->nsuccessors; i++) {
235       if (source_info->successors[i] == task_sink->td_task_id) {
236         exists = true;
237         break;
238       }
239     }
240     if (!exists) {
241       if (source_info->nsuccessors >= source_info->successors_size) {
242         source_info->successors_size = 2 * source_info->successors_size;
243         kmp_int32 *old_succ_ids = source_info->successors;
244         kmp_int32 *new_succ_ids = (kmp_int32 *)__kmp_allocate(
245             source_info->successors_size * sizeof(kmp_int32));
246         source_info->successors = new_succ_ids;
247         __kmp_free(old_succ_ids);
248       }
249 
250       source_info->successors[source_info->nsuccessors] = task_sink->td_task_id;
251       source_info->nsuccessors++;
252 
253       kmp_node_info_t *sink_info =
254           &(task_sink->tdg->record_map[task_sink->td_task_id]);
255       sink_info->npredecessors++;
256     }
257   }
258 #endif
259 #ifdef KMP_SUPPORT_GRAPH_OUTPUT
260   kmp_taskdata_t *task_source = KMP_TASK_TO_TASKDATA(source->dn.task);
261   // do not use sink->dn.task as that is only filled after the dependences
262   // are already processed!
263   kmp_taskdata_t *task_sink = KMP_TASK_TO_TASKDATA(sink_task);
264 
265   __kmp_printf("%d(%s) -> %d(%s)\n", source->dn.id,
266                task_source->td_ident->psource, sink->dn.id,
267                task_sink->td_ident->psource);
268 #endif
269 #if OMPT_SUPPORT && OMPT_OPTIONAL
270   /* OMPT tracks dependences between tasks (a=source, b=sink) in which
271      task a blocks the execution of b through the
272      ompt_callback_task_dependence callback */
273   if (ompt_enabled.ompt_callback_task_dependence) {
274     kmp_taskdata_t *task_source = KMP_TASK_TO_TASKDATA(source->dn.task);
275     ompt_data_t *sink_data;
276     if (sink_task)
277       sink_data = &(KMP_TASK_TO_TASKDATA(sink_task)->ompt_task_info.task_data);
278     else
279       sink_data = &__kmp_threads[gtid]->th.ompt_thread_info.task_data;
280 
281     ompt_callbacks.ompt_callback(ompt_callback_task_dependence)(
282         &(task_source->ompt_task_info.task_data), sink_data);
283   }
284 #endif /* OMPT_SUPPORT && OMPT_OPTIONAL */
285 }
286 
287 kmp_base_depnode_t *__kmpc_task_get_depnode(kmp_task_t *task) {
288   kmp_taskdata_t *td = KMP_TASK_TO_TASKDATA(task);
289   return td->td_depnode ? &(td->td_depnode->dn) : NULL;
290 }
291 
292 kmp_depnode_list_t *__kmpc_task_get_successors(kmp_task_t *task) {
293   kmp_taskdata_t *td = KMP_TASK_TO_TASKDATA(task);
294   return td->td_depnode->dn.successors;
295 }
296 
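// Link 'node' as a successor of every depnode in 'plist'; returns the number
// of predecessor edges added for 'node'.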
297 static inline kmp_int32
298 __kmp_depnode_link_successor(kmp_int32 gtid, kmp_info_t *thread,
299                              kmp_task_t *task, kmp_depnode_t *node,
300                              kmp_depnode_list_t *plist) {
301   if (!plist)
302     return 0;
303   kmp_int32 npredecessors = 0;
304   // link node as successor of list elements
305   for (kmp_depnode_list_t *p = plist; p; p = p->next) {
306     kmp_depnode_t *dep = p->node;
307 #if OMPX_TASKGRAPH
308     kmp_tdg_status tdg_status = KMP_TDG_NONE;
309     if (task) {
310       kmp_taskdata_t *td = KMP_TASK_TO_TASKDATA(task);
311       if (td->is_taskgraph)
312         tdg_status = KMP_TASK_TO_TASKDATA(task)->tdg->tdg_status;
313       if (__kmp_tdg_is_recording(tdg_status))
314         __kmp_track_dependence(gtid, dep, node, task);
315     }
316 #endif
317     if (dep->dn.task) {
318       KMP_ACQUIRE_DEPNODE(gtid, dep);
319       if (dep->dn.task) {
320         if (!dep->dn.successors || dep->dn.successors->node != node) {
321 #if OMPX_TASKGRAPH
322           if (!(__kmp_tdg_is_recording(tdg_status)) && task)
323 #endif
324             __kmp_track_dependence(gtid, dep, node, task);
325           dep->dn.successors = __kmp_add_node(thread, dep->dn.successors, node);
326           KA_TRACE(40, ("__kmp_process_deps: T#%d adding dependence from %p to "
327                         "%p\n",
328                         gtid, KMP_TASK_TO_TASKDATA(dep->dn.task),
329                         KMP_TASK_TO_TASKDATA(task)));
330           npredecessors++;
331         }
332       }
333       KMP_RELEASE_DEPNODE(gtid, dep);
334     }
335   }
336   return npredecessors;
337 }
338 
339 // Add the edge 'sink' -> 'source' in the task dependency graph
340 static inline kmp_int32 __kmp_depnode_link_successor(kmp_int32 gtid,
341                                                      kmp_info_t *thread,
342                                                      kmp_task_t *task,
343                                                      kmp_depnode_t *source,
344                                                      kmp_depnode_t *sink) {
345   if (!sink)
346     return 0;
347   kmp_int32 npredecessors = 0;
348 #if OMPX_TASKGRAPH
349   kmp_tdg_status tdg_status = KMP_TDG_NONE;
350   kmp_taskdata_t *td = KMP_TASK_TO_TASKDATA(task);
351   if (task) {
352     if (td->is_taskgraph)
353       tdg_status = KMP_TASK_TO_TASKDATA(task)->tdg->tdg_status;
354     if (__kmp_tdg_is_recording(tdg_status) && sink->dn.task)
355       __kmp_track_dependence(gtid, sink, source, task);
356   }
357 #endif
358   if (sink->dn.task) {
359     // synchronously add source to sink's list of successors
360     KMP_ACQUIRE_DEPNODE(gtid, sink);
361     if (sink->dn.task) {
362       if (!sink->dn.successors || sink->dn.successors->node != source) {
363 #if OMPX_TASKGRAPH
364         if (!(__kmp_tdg_is_recording(tdg_status)) && task)
365 #endif
366           __kmp_track_dependence(gtid, sink, source, task);
367         sink->dn.successors = __kmp_add_node(thread, sink->dn.successors, source);
368         KA_TRACE(40, ("__kmp_process_deps: T#%d adding dependence from %p to "
369                     "%p\n",
370                     gtid, KMP_TASK_TO_TASKDATA(sink->dn.task),
371                     KMP_TASK_TO_TASKDATA(task)));
372 #if OMPX_TASKGRAPH
373         if (__kmp_tdg_is_recording(tdg_status)) {
374           kmp_taskdata_t *tdd = KMP_TASK_TO_TASKDATA(sink->dn.task);
375           if (tdd->is_taskgraph) {
376             if (tdd->td_flags.onced)
377               // decrement npredecessors if sink->dn.task belongs to a taskgraph
378               // and
379               //  1) the task is reset to its initial state (by kmp_free_task) or
380               //  2) the task is complete but not yet reset
381               npredecessors--;
382           }
383         }
384 #endif
385         npredecessors++;
386       }
387     }
388     KMP_RELEASE_DEPNODE(gtid, sink);
389   }
390   return npredecessors;
391 }
392 
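// Process an omp_all_memory dependence: 'node' becomes a successor of the
// previous omp_all_memory node (if any) and of every node recorded in the
// dephash; returns the number of predecessors found.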
393 static inline kmp_int32
394 __kmp_process_dep_all(kmp_int32 gtid, kmp_depnode_t *node, kmp_dephash_t *h,
395                       bool dep_barrier, kmp_task_t *task) {
396   KA_TRACE(30, ("__kmp_process_dep_all: T#%d processing dep_all, "
397                 "dep_barrier = %d\n",
398                 gtid, dep_barrier));
399   kmp_info_t *thread = __kmp_threads[gtid];
400   kmp_int32 npredecessors = 0;
401 
402   // process previous omp_all_memory node if any
403   npredecessors +=
404       __kmp_depnode_link_successor(gtid, thread, task, node, h->last_all);
405   __kmp_node_deref(thread, h->last_all);
406   if (!dep_barrier) {
407     h->last_all = __kmp_node_ref(node);
408   } else {
409     // if this is a sync point in the serial sequence, then the previous
410     // outputs are guaranteed to be completed after the execution of this
411     // task so the previous output nodes can be cleared.
412     h->last_all = NULL;
413   }
414 
415   // process all regular dependences
416   for (size_t i = 0; i < h->size; i++) {
417     kmp_dephash_entry_t *info = h->buckets[i];
418     if (!info) // skip empty slots in dephash
419       continue;
420     for (; info; info = info->next_in_bucket) {
421       // for each entry, omp_all_memory works as an OUT dependence
422       kmp_depnode_t *last_out = info->last_out;
423       kmp_depnode_list_t *last_set = info->last_set;
424       kmp_depnode_list_t *prev_set = info->prev_set;
425       if (last_set) {
426         npredecessors +=
427             __kmp_depnode_link_successor(gtid, thread, task, node, last_set);
428         __kmp_depnode_list_free(thread, last_set);
429         __kmp_depnode_list_free(thread, prev_set);
430         info->last_set = NULL;
431         info->prev_set = NULL;
432         info->last_flag = 0; // no sets in this dephash entry
433       } else {
434         npredecessors +=
435             __kmp_depnode_link_successor(gtid, thread, task, node, last_out);
436       }
437       __kmp_node_deref(thread, last_out);
438       if (!dep_barrier) {
439         info->last_out = __kmp_node_ref(node);
440       } else {
441         info->last_out = NULL;
442       }
443     }
444   }
445   KA_TRACE(30, ("__kmp_process_dep_all: T#%d found %d predecessors\n", gtid,
446                 npredecessors));
447   return npredecessors;
448 }
449 
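// Process 'ndeps' regular dependences for 'node': look up each address in the
// dephash, link 'node' behind the relevant predecessors and update the
// last_out/last_set/prev_set bookkeeping. When 'filter' is true, entries
// voided by __kmp_check_deps (base_addr == 0) are skipped.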
450 template <bool filter>
451 static inline kmp_int32
452 __kmp_process_deps(kmp_int32 gtid, kmp_depnode_t *node, kmp_dephash_t **hash,
453                    bool dep_barrier, kmp_int32 ndeps,
454                    kmp_depend_info_t *dep_list, kmp_task_t *task) {
455   KA_TRACE(30, ("__kmp_process_deps<%d>: T#%d processing %d dependences : "
456                 "dep_barrier = %d\n",
457                 filter, gtid, ndeps, dep_barrier));
458 
459   kmp_info_t *thread = __kmp_threads[gtid];
460   kmp_int32 npredecessors = 0;
461   for (kmp_int32 i = 0; i < ndeps; i++) {
462     const kmp_depend_info_t *dep = &dep_list[i];
463 
464     if (filter && dep->base_addr == 0)
465       continue; // skip filtered entries
466 
467     kmp_dephash_entry_t *info =
468         __kmp_dephash_find(thread, hash, dep->base_addr);
469     kmp_depnode_t *last_out = info->last_out;
470     kmp_depnode_list_t *last_set = info->last_set;
471     kmp_depnode_list_t *prev_set = info->prev_set;
472 
473     if (dep->flags.out) { // out or inout --> clean lists if any
474       if (last_set) {
475         npredecessors +=
476             __kmp_depnode_link_successor(gtid, thread, task, node, last_set);
477         __kmp_depnode_list_free(thread, last_set);
478         __kmp_depnode_list_free(thread, prev_set);
479         info->last_set = NULL;
480         info->prev_set = NULL;
481         info->last_flag = 0; // no sets in this dephash entry
482       } else {
483         npredecessors +=
484             __kmp_depnode_link_successor(gtid, thread, task, node, last_out);
485       }
486       __kmp_node_deref(thread, last_out);
487       if (!dep_barrier) {
488         info->last_out = __kmp_node_ref(node);
489       } else {
490         // if this is a sync point in the serial sequence, then the previous
491         // outputs are guaranteed to be completed after the execution of this
492         // task so the previous output nodes can be cleared.
493         info->last_out = NULL;
494       }
495     } else { // either IN or MTX or SET
496       if (info->last_flag == 0 || info->last_flag == dep->flag) {
497         // last_set either didn't exist or was of the same dep kind
498         // link node as successor of the last_out if any
499         npredecessors +=
500             __kmp_depnode_link_successor(gtid, thread, task, node, last_out);
501         // link node as successor of all nodes in the prev_set if any
502         npredecessors +=
503             __kmp_depnode_link_successor(gtid, thread, task, node, prev_set);
504         if (dep_barrier) {
505           // clean last_out and prev_set if any; don't touch last_set
506           __kmp_node_deref(thread, last_out);
507           info->last_out = NULL;
508           __kmp_depnode_list_free(thread, prev_set);
509           info->prev_set = NULL;
510         }
511       } else { // last_set is of different dep kind, make it prev_set
512         // link node as successor of all nodes in the last_set
513         npredecessors +=
514             __kmp_depnode_link_successor(gtid, thread, task, node, last_set);
515         // clean last_out if any
516         __kmp_node_deref(thread, last_out);
517         info->last_out = NULL;
518         // clean prev_set if any
519         __kmp_depnode_list_free(thread, prev_set);
520         if (!dep_barrier) {
521           // move last_set to prev_set, new last_set will be allocated
522           info->prev_set = last_set;
523         } else {
524           info->prev_set = NULL;
525           info->last_flag = 0;
526         }
527         info->last_set = NULL;
528       }
529       // for dep_barrier last_flag value should remain:
530       // 0 if last_set is empty, unchanged otherwise
531       if (!dep_barrier) {
532         info->last_flag = dep->flag; // store dep kind of the last_set
533         info->last_set = __kmp_add_node(thread, info->last_set, node);
534       }
535       // check if we are processing MTX dependency
536       if (dep->flag == KMP_DEP_MTX) {
537         if (info->mtx_lock == NULL) {
538           info->mtx_lock = (kmp_lock_t *)__kmp_allocate(sizeof(kmp_lock_t));
539           __kmp_init_lock(info->mtx_lock);
540         }
541         KMP_DEBUG_ASSERT(node->dn.mtx_num_locks < MAX_MTX_DEPS);
542         kmp_int32 m;
543         // Save lock in node's array
544         for (m = 0; m < MAX_MTX_DEPS; ++m) {
545           // sort pointers in decreasing order to avoid potential livelock
546           if (node->dn.mtx_locks[m] < info->mtx_lock) {
547             KMP_DEBUG_ASSERT(!node->dn.mtx_locks[node->dn.mtx_num_locks]);
548             for (int n = node->dn.mtx_num_locks; n > m; --n) {
549               // shift right all lesser non-NULL pointers
550               KMP_DEBUG_ASSERT(node->dn.mtx_locks[n - 1] != NULL);
551               node->dn.mtx_locks[n] = node->dn.mtx_locks[n - 1];
552             }
553             node->dn.mtx_locks[m] = info->mtx_lock;
554             break;
555           }
556         }
557         KMP_DEBUG_ASSERT(m < MAX_MTX_DEPS); // must break from loop
558         node->dn.mtx_num_locks++;
559       }
560     }
561   }
562   KA_TRACE(30, ("__kmp_process_deps<%d>: T#%d found %d predecessors\n", filter,
563                 gtid, npredecessors));
564   return npredecessors;
565 }
566 
567 #define NO_DEP_BARRIER (false)
568 #define DEP_BARRIER (true)
569 
570 // returns true if the task has any outstanding dependence
571 static bool __kmp_check_deps(kmp_int32 gtid, kmp_depnode_t *node,
572                              kmp_task_t *task, kmp_dephash_t **hash,
573                              bool dep_barrier, kmp_int32 ndeps,
574                              kmp_depend_info_t *dep_list,
575                              kmp_int32 ndeps_noalias,
576                              kmp_depend_info_t *noalias_dep_list) {
577   int i, n_mtxs = 0, dep_all = 0;
578 #if KMP_DEBUG
579   kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
580 #endif
581   KA_TRACE(20, ("__kmp_check_deps: T#%d checking dependences for task %p : %d "
582                 "possibly aliased dependences, %d non-aliased dependences : "
583                 "dep_barrier=%d .\n",
584                 gtid, taskdata, ndeps, ndeps_noalias, dep_barrier));
585 
586   // Filter deps in dep_list
587   // TODO: Different algorithm for large dep_list ( > 10 ? )
588   for (i = 0; i < ndeps; i++) {
589     if (dep_list[i].base_addr != 0 &&
590         dep_list[i].base_addr != (kmp_intptr_t)KMP_SIZE_T_MAX) {
591       KMP_DEBUG_ASSERT(
592           dep_list[i].flag == KMP_DEP_IN || dep_list[i].flag == KMP_DEP_OUT ||
593           dep_list[i].flag == KMP_DEP_INOUT ||
594           dep_list[i].flag == KMP_DEP_MTX || dep_list[i].flag == KMP_DEP_SET);
595       for (int j = i + 1; j < ndeps; j++) {
596         if (dep_list[i].base_addr == dep_list[j].base_addr) {
597           if (dep_list[i].flag != dep_list[j].flag) {
598             // two different dependences on same address work identical to OUT
599             dep_list[i].flag = KMP_DEP_OUT;
600           }
601           dep_list[j].base_addr = 0; // Mark j element as void
602         }
603       }
604       if (dep_list[i].flag == KMP_DEP_MTX) {
605         // limit number of mtx deps to MAX_MTX_DEPS per node
606         if (n_mtxs < MAX_MTX_DEPS && task != NULL) {
607           ++n_mtxs;
608         } else {
609           dep_list[i].flag = KMP_DEP_OUT; // downgrade mutexinoutset to inout
610         }
611       }
612     } else if (dep_list[i].flag == KMP_DEP_ALL ||
613                dep_list[i].base_addr == (kmp_intptr_t)KMP_SIZE_T_MAX) {
614       // an omp_all_memory dependence can be marked by the compiler either as
615       // (addr=0 && flag=0x80) (flag KMP_DEP_ALL) or as (addr=-1).
616       // omp_all_memory overrides all other dependences if any
617       dep_all = 1;
618       break;
619     }
620   }
621 
622   // doesn't need to be atomic as no other thread is going to be accessing this
623   // node just yet.
624   // npredecessors is set to -1 to ensure that none of the releasing tasks
625   // queues this task before we have finished processing all the dependences
626   node->dn.npredecessors = -1;
627 
628   // used to pack all npredecessors additions into a single atomic operation at
629   // the end
630   int npredecessors;
631 
632   if (!dep_all) { // regular dependences
633     npredecessors = __kmp_process_deps<true>(gtid, node, hash, dep_barrier,
634                                              ndeps, dep_list, task);
635     npredecessors += __kmp_process_deps<false>(
636         gtid, node, hash, dep_barrier, ndeps_noalias, noalias_dep_list, task);
637   } else { // omp_all_memory dependence
638     npredecessors = __kmp_process_dep_all(gtid, node, *hash, dep_barrier, task);
639   }
640 
641   node->dn.task = task;
642   KMP_MB();
643 
644   // Account for our initial fake value
645   npredecessors++;
646 
647   // Update predecessors and obtain current value to check if there are still
648   // any outstanding dependences (some tasks may have finished while we
649   // processed the dependences)
650   npredecessors =
651       node->dn.npredecessors.fetch_add(npredecessors) + npredecessors;
652 
653   KA_TRACE(20, ("__kmp_check_deps: T#%d found %d predecessors for task %p \n",
654                 gtid, npredecessors, taskdata));
655 
656   // beyond this point the task could be queued (and executed) by a releasing
657   // task...
658   return npredecessors > 0;
659 }
660 
661 /*!
662 @ingroup TASKING
663 @param loc_ref location of the original task directive
664 @param gtid Global Thread ID of encountering thread
665 @param new_task task thunk allocated by __kmp_omp_task_alloc() for the ''new
666 task''
667 @param ndeps Number of depend items with possible aliasing
668 @param dep_list List of depend items with possible aliasing
669 @param ndeps_noalias Number of depend items with no aliasing
670 @param noalias_dep_list List of depend items with no aliasing
671 
672 @return Returns either TASK_CURRENT_NOT_QUEUED if the current task was not
673 suspended and queued, or TASK_CURRENT_QUEUED if it was suspended and queued
674 
675 Schedule a non-thread-switchable task with dependences for execution
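
A minimal sketch of the call sequence a compiler might emit for
"#pragma omp task depend(inout: x)". The ident_t @c loc, the thread id
@c gtid, the entry routine @c task_entry, the variable @c x and the flag
values are illustrative assumptions, not taken from this file:
@code
  kmp_depend_info_t dep;
  dep.base_addr = (kmp_intptr_t)&x; // address identifying the dependence
  dep.len = sizeof(x);
  dep.flag = 0;
  dep.flags.in = 1; // inout == in + out
  dep.flags.out = 1;
  kmp_task_t *t = __kmpc_omp_task_alloc(&loc, gtid, /*flags=*/1,
                                        sizeof(kmp_task_t), 0, task_entry);
  __kmpc_omp_task_with_deps(&loc, gtid, t, /*ndeps=*/1, &dep,
                            /*ndeps_noalias=*/0, NULL);
@endcode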
676 */
677 kmp_int32 __kmpc_omp_task_with_deps(ident_t *loc_ref, kmp_int32 gtid,
678                                     kmp_task_t *new_task, kmp_int32 ndeps,
679                                     kmp_depend_info_t *dep_list,
680                                     kmp_int32 ndeps_noalias,
681                                     kmp_depend_info_t *noalias_dep_list) {
682 
683   kmp_taskdata_t *new_taskdata = KMP_TASK_TO_TASKDATA(new_task);
684   KA_TRACE(10, ("__kmpc_omp_task_with_deps(enter): T#%d loc=%p task=%p\n", gtid,
685                 loc_ref, new_taskdata));
686   __kmp_assert_valid_gtid(gtid);
687   kmp_info_t *thread = __kmp_threads[gtid];
688   kmp_taskdata_t *current_task = thread->th.th_current_task;
689 
690 #if OMPX_TASKGRAPH
691   // record TDG with deps
692   if (new_taskdata->is_taskgraph &&
693       __kmp_tdg_is_recording(new_taskdata->tdg->tdg_status)) {
694     kmp_tdg_info_t *tdg = new_taskdata->tdg;
695     // extend record_map if needed
696     if (new_taskdata->td_task_id >= tdg->map_size) {
697       __kmp_acquire_bootstrap_lock(&tdg->graph_lock);
698       if (new_taskdata->td_task_id >= tdg->map_size) {
699         kmp_uint old_size = tdg->map_size;
700         kmp_uint new_size = old_size * 2;
701         kmp_node_info_t *old_record = tdg->record_map;
702         kmp_node_info_t *new_record = (kmp_node_info_t *)__kmp_allocate(
703             new_size * sizeof(kmp_node_info_t));
704         KMP_MEMCPY(new_record, tdg->record_map,
705                    old_size * sizeof(kmp_node_info_t));
706         tdg->record_map = new_record;
707 
708         __kmp_free(old_record);
709 
710         for (kmp_int i = old_size; i < new_size; i++) {
711           kmp_int32 *successorsList = (kmp_int32 *)__kmp_allocate(
712               __kmp_successors_size * sizeof(kmp_int32));
713           new_record[i].task = nullptr;
714           new_record[i].successors = successorsList;
715           new_record[i].nsuccessors = 0;
716           new_record[i].npredecessors = 0;
717           new_record[i].successors_size = __kmp_successors_size;
718           KMP_ATOMIC_ST_REL(&new_record[i].npredecessors_counter, 0);
719         }
720         // update the size at the end, so that other threads do not use
721         // old_record while map_size is already updated
722         tdg->map_size = new_size;
723       }
724       __kmp_release_bootstrap_lock(&tdg->graph_lock);
725     }
726     tdg->record_map[new_taskdata->td_task_id].task = new_task;
727     tdg->record_map[new_taskdata->td_task_id].parent_task =
728         new_taskdata->td_parent;
729     KMP_ATOMIC_INC(&tdg->num_tasks);
730   }
731 #endif
732 #if OMPT_SUPPORT
733   if (ompt_enabled.enabled) {
734     if (!current_task->ompt_task_info.frame.enter_frame.ptr)
735       current_task->ompt_task_info.frame.enter_frame.ptr =
736           OMPT_GET_FRAME_ADDRESS(0);
737     if (ompt_enabled.ompt_callback_task_create) {
738       ompt_callbacks.ompt_callback(ompt_callback_task_create)(
739           &(current_task->ompt_task_info.task_data),
740           &(current_task->ompt_task_info.frame),
741           &(new_taskdata->ompt_task_info.task_data),
742           TASK_TYPE_DETAILS_FORMAT(new_taskdata), 1,
743           OMPT_LOAD_OR_GET_RETURN_ADDRESS(gtid));
744     }
745 
746     new_taskdata->ompt_task_info.frame.enter_frame.ptr =
747         OMPT_GET_FRAME_ADDRESS(0);
748   }
749 
750 #if OMPT_OPTIONAL
751   /* OMPT grab all dependences if requested by the tool */
752   if (ndeps + ndeps_noalias > 0 && ompt_enabled.ompt_callback_dependences) {
753     kmp_int32 i;
754 
755     int ompt_ndeps = ndeps + ndeps_noalias;
756     ompt_dependence_t *ompt_deps = (ompt_dependence_t *)KMP_OMPT_DEPS_ALLOC(
757         thread, (ndeps + ndeps_noalias) * sizeof(ompt_dependence_t));
758 
759     KMP_ASSERT(ompt_deps != NULL);
760 
761     for (i = 0; i < ndeps; i++) {
762       ompt_deps[i].variable.ptr = (void *)dep_list[i].base_addr;
763       if (dep_list[i].base_addr == KMP_SIZE_T_MAX)
764         ompt_deps[i].dependence_type = ompt_dependence_type_out_all_memory;
765       else if (dep_list[i].flags.in && dep_list[i].flags.out)
766         ompt_deps[i].dependence_type = ompt_dependence_type_inout;
767       else if (dep_list[i].flags.out)
768         ompt_deps[i].dependence_type = ompt_dependence_type_out;
769       else if (dep_list[i].flags.in)
770         ompt_deps[i].dependence_type = ompt_dependence_type_in;
771       else if (dep_list[i].flags.mtx)
772         ompt_deps[i].dependence_type = ompt_dependence_type_mutexinoutset;
773       else if (dep_list[i].flags.set)
774         ompt_deps[i].dependence_type = ompt_dependence_type_inoutset;
775       else if (dep_list[i].flags.all)
776         ompt_deps[i].dependence_type = ompt_dependence_type_out_all_memory;
777     }
778     for (i = 0; i < ndeps_noalias; i++) {
779       ompt_deps[ndeps + i].variable.ptr = (void *)noalias_dep_list[i].base_addr;
780       if (noalias_dep_list[i].base_addr == KMP_SIZE_T_MAX)
781         ompt_deps[ndeps + i].dependence_type =
782             ompt_dependence_type_out_all_memory;
783       else if (noalias_dep_list[i].flags.in && noalias_dep_list[i].flags.out)
784         ompt_deps[ndeps + i].dependence_type = ompt_dependence_type_inout;
785       else if (noalias_dep_list[i].flags.out)
786         ompt_deps[ndeps + i].dependence_type = ompt_dependence_type_out;
787       else if (noalias_dep_list[i].flags.in)
788         ompt_deps[ndeps + i].dependence_type = ompt_dependence_type_in;
789       else if (noalias_dep_list[i].flags.mtx)
790         ompt_deps[ndeps + i].dependence_type =
791             ompt_dependence_type_mutexinoutset;
792       else if (noalias_dep_list[i].flags.set)
793         ompt_deps[ndeps + i].dependence_type = ompt_dependence_type_inoutset;
794       else if (noalias_dep_list[i].flags.all)
795         ompt_deps[ndeps + i].dependence_type =
796             ompt_dependence_type_out_all_memory;
797     }
798     ompt_callbacks.ompt_callback(ompt_callback_dependences)(
799         &(new_taskdata->ompt_task_info.task_data), ompt_deps, ompt_ndeps);
800     /* We can now free the allocated memory for the dependences */
801     /* For OMPD we might want to delay the free until end of this function */
802     KMP_OMPT_DEPS_FREE(thread, ompt_deps);
803   }
804 #endif /* OMPT_OPTIONAL */
805 #endif /* OMPT_SUPPORT */
806 
807   bool serial = current_task->td_flags.team_serial ||
808                 current_task->td_flags.tasking_ser ||
809                 current_task->td_flags.final;
810   kmp_task_team_t *task_team = thread->th.th_task_team;
811   serial = serial &&
812            !(task_team && (task_team->tt.tt_found_proxy_tasks ||
813                            task_team->tt.tt_hidden_helper_task_encountered));
814 
815   if (!serial && (ndeps > 0 || ndeps_noalias > 0)) {
816     /* if no dependences have been tracked yet, create the dependence hash */
817     if (current_task->td_dephash == NULL)
818       current_task->td_dephash = __kmp_dephash_create(thread, current_task);
819 
820 #if USE_FAST_MEMORY
821     kmp_depnode_t *node =
822         (kmp_depnode_t *)__kmp_fast_allocate(thread, sizeof(kmp_depnode_t));
823 #else
824     kmp_depnode_t *node =
825         (kmp_depnode_t *)__kmp_thread_malloc(thread, sizeof(kmp_depnode_t));
826 #endif
827 
828     __kmp_init_node(node);
829     new_taskdata->td_depnode = node;
830 
831     if (__kmp_check_deps(gtid, node, new_task, &current_task->td_dephash,
832                          NO_DEP_BARRIER, ndeps, dep_list, ndeps_noalias,
833                          noalias_dep_list)) {
834       KA_TRACE(10, ("__kmpc_omp_task_with_deps(exit): T#%d task had blocking "
835                     "dependences: "
836                     "loc=%p task=%p, return: TASK_CURRENT_NOT_QUEUED\n",
837                     gtid, loc_ref, new_taskdata));
838 #if OMPT_SUPPORT
839       if (ompt_enabled.enabled) {
840         current_task->ompt_task_info.frame.enter_frame = ompt_data_none;
841       }
842 #endif
843       return TASK_CURRENT_NOT_QUEUED;
844     }
845   } else {
846     KA_TRACE(10, ("__kmpc_omp_task_with_deps(exit): T#%d ignored dependences "
847                   "for task (serialized) loc=%p task=%p\n",
848                   gtid, loc_ref, new_taskdata));
849   }
850 
851   KA_TRACE(10, ("__kmpc_omp_task_with_deps(exit): T#%d task had no blocking "
852                 "dependences : "
853                 "loc=%p task=%p, transferring to __kmp_omp_task\n",
854                 gtid, loc_ref, new_taskdata));
855 
856   kmp_int32 ret = __kmp_omp_task(gtid, new_task, true);
857 #if OMPT_SUPPORT
858   if (ompt_enabled.enabled) {
859     current_task->ompt_task_info.frame.enter_frame = ompt_data_none;
860   }
861 #endif
862   return ret;
863 }
864 
865 #if OMPT_SUPPORT
866 void __ompt_taskwait_dep_finish(kmp_taskdata_t *current_task,
867                                 ompt_data_t *taskwait_task_data) {
868   if (ompt_enabled.ompt_callback_task_schedule) {
869     ompt_callbacks.ompt_callback(ompt_callback_task_schedule)(
870         taskwait_task_data, ompt_taskwait_complete, NULL);
871   }
872   current_task->ompt_task_info.frame.enter_frame.ptr = NULL;
873   *taskwait_task_data = ompt_data_none;
874 }
875 #endif /* OMPT_SUPPORT */
876 
877 /*!
878 @ingroup TASKING
879 @param loc_ref location of the original task directive
880 @param gtid Global Thread ID of encountering thread
881 @param ndeps Number of depend items with possible aliasing
882 @param dep_list List of depend items with possible aliasing
883 @param ndeps_noalias Number of depend items with no aliasing
884 @param noalias_dep_list List of depend items with no aliasing
885 
886 Blocks the current task until all specified dependences have been fulfilled.
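
A minimal sketch of what a compiler might emit for
"#pragma omp taskwait depend(in: x)"; @c loc, @c gtid and @c x are
illustrative assumptions:
@code
  kmp_depend_info_t dep;
  dep.base_addr = (kmp_intptr_t)&x;
  dep.len = sizeof(x);
  dep.flag = 0;
  dep.flags.in = 1;
  __kmpc_omp_wait_deps(&loc, gtid, /*ndeps=*/1, &dep, /*ndeps_noalias=*/0,
                       NULL);
@endcode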
887 */
888 void __kmpc_omp_wait_deps(ident_t *loc_ref, kmp_int32 gtid, kmp_int32 ndeps,
889                           kmp_depend_info_t *dep_list, kmp_int32 ndeps_noalias,
890                           kmp_depend_info_t *noalias_dep_list) {
891   __kmpc_omp_taskwait_deps_51(loc_ref, gtid, ndeps, dep_list, ndeps_noalias,
892                               noalias_dep_list, false);
893 }
894 
895 /* __kmpc_omp_taskwait_deps_51 : Entry point for the OpenMP 5.1 taskwait
896                                  construct with a depend clause; the nowait
897                                  modifier is currently a placeholder. The
898                                  earlier body of __kmpc_omp_wait_deps() is
899                                  now in this function. */
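// A sketch of the corresponding 5.1 call for "taskwait depend(...) nowait";
// the arguments other than has_no_wait are as in __kmpc_omp_wait_deps above:
//   __kmpc_omp_taskwait_deps_51(&loc, gtid, ndeps, dep_list, ndeps_noalias,
//                               noalias_dep_list, /*has_no_wait=*/1);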
900 void __kmpc_omp_taskwait_deps_51(ident_t *loc_ref, kmp_int32 gtid,
901                                  kmp_int32 ndeps, kmp_depend_info_t *dep_list,
902                                  kmp_int32 ndeps_noalias,
903                                  kmp_depend_info_t *noalias_dep_list,
904                                  kmp_int32 has_no_wait) {
905   KA_TRACE(10, ("__kmpc_omp_taskwait_deps(enter): T#%d loc=%p nowait#%d\n",
906                 gtid, loc_ref, has_no_wait));
907   if (ndeps == 0 && ndeps_noalias == 0) {
908     KA_TRACE(10, ("__kmpc_omp_taskwait_deps(exit): T#%d has no dependences to "
909                   "wait upon : loc=%p\n",
910                   gtid, loc_ref));
911     return;
912   }
913   __kmp_assert_valid_gtid(gtid);
914   kmp_info_t *thread = __kmp_threads[gtid];
915   kmp_taskdata_t *current_task = thread->th.th_current_task;
916 
917 #if OMPT_SUPPORT
918   // this function represents a taskwait construct with depend clause
919   // We signal 4 events:
920   //  - creation of the taskwait task
921   //  - dependences of the taskwait task
922   //  - schedule and finish of the taskwait task
923   ompt_data_t *taskwait_task_data = &thread->th.ompt_thread_info.task_data;
924   KMP_ASSERT(taskwait_task_data->ptr == NULL);
925   if (ompt_enabled.enabled) {
926     if (!current_task->ompt_task_info.frame.enter_frame.ptr)
927       current_task->ompt_task_info.frame.enter_frame.ptr =
928           OMPT_GET_FRAME_ADDRESS(0);
929     if (ompt_enabled.ompt_callback_task_create) {
930       ompt_callbacks.ompt_callback(ompt_callback_task_create)(
931           &(current_task->ompt_task_info.task_data),
932           &(current_task->ompt_task_info.frame), taskwait_task_data,
933           ompt_task_taskwait | ompt_task_undeferred | ompt_task_mergeable, 1,
934           OMPT_LOAD_OR_GET_RETURN_ADDRESS(gtid));
935     }
936   }
937 
938 #if OMPT_OPTIONAL
939   /* OMPT grab all dependences if requested by the tool */
940   if (ndeps + ndeps_noalias > 0 && ompt_enabled.ompt_callback_dependences) {
941     kmp_int32 i;
942 
943     int ompt_ndeps = ndeps + ndeps_noalias;
944     ompt_dependence_t *ompt_deps = (ompt_dependence_t *)KMP_OMPT_DEPS_ALLOC(
945         thread, (ndeps + ndeps_noalias) * sizeof(ompt_dependence_t));
946 
947     KMP_ASSERT(ompt_deps != NULL);
948 
949     for (i = 0; i < ndeps; i++) {
950       ompt_deps[i].variable.ptr = (void *)dep_list[i].base_addr;
951       if (dep_list[i].flags.in && dep_list[i].flags.out)
952         ompt_deps[i].dependence_type = ompt_dependence_type_inout;
953       else if (dep_list[i].flags.out)
954         ompt_deps[i].dependence_type = ompt_dependence_type_out;
955       else if (dep_list[i].flags.in)
956         ompt_deps[i].dependence_type = ompt_dependence_type_in;
957       else if (dep_list[i].flags.mtx)
958         ompt_deps[i].dependence_type =
959             ompt_dependence_type_mutexinoutset;
960       else if (dep_list[i].flags.set)
961         ompt_deps[i].dependence_type = ompt_dependence_type_inoutset;
962     }
963     for (i = 0; i < ndeps_noalias; i++) {
964       ompt_deps[ndeps + i].variable.ptr = (void *)noalias_dep_list[i].base_addr;
965       if (noalias_dep_list[i].flags.in && noalias_dep_list[i].flags.out)
966         ompt_deps[ndeps + i].dependence_type = ompt_dependence_type_inout;
967       else if (noalias_dep_list[i].flags.out)
968         ompt_deps[ndeps + i].dependence_type = ompt_dependence_type_out;
969       else if (noalias_dep_list[i].flags.in)
970         ompt_deps[ndeps + i].dependence_type = ompt_dependence_type_in;
971       else if (noalias_dep_list[i].flags.mtx)
972         ompt_deps[ndeps + i].dependence_type =
973             ompt_dependence_type_mutexinoutset;
974       else if (noalias_dep_list[i].flags.set)
975         ompt_deps[ndeps + i].dependence_type = ompt_dependence_type_inoutset;
976     }
977     ompt_callbacks.ompt_callback(ompt_callback_dependences)(
978         taskwait_task_data, ompt_deps, ompt_ndeps);
979     /* We can now free the allocated memory for the dependences */
980     /* For OMPD we might want to delay the free until end of this function */
981     KMP_OMPT_DEPS_FREE(thread, ompt_deps);
982     ompt_deps = NULL;
983   }
984 #endif /* OMPT_OPTIONAL */
985 #endif /* OMPT_SUPPORT */
986 
987   // We can return immediately as:
988   // - dependences are not computed in serial teams (except with proxy tasks)
989   // - if the dephash is not yet created it means we have nothing to wait for
990   bool ignore = current_task->td_flags.team_serial ||
991                 current_task->td_flags.tasking_ser ||
992                 current_task->td_flags.final;
993   ignore =
994       ignore && thread->th.th_task_team != NULL &&
995       thread->th.th_task_team->tt.tt_found_proxy_tasks == FALSE &&
996       thread->th.th_task_team->tt.tt_hidden_helper_task_encountered == FALSE;
997   ignore = ignore || current_task->td_dephash == NULL;
998 
999   if (ignore) {
1000     KA_TRACE(10, ("__kmpc_omp_taskwait_deps(exit): T#%d has no blocking "
1001                   "dependences : loc=%p\n",
1002                   gtid, loc_ref));
1003 #if OMPT_SUPPORT
1004     __ompt_taskwait_dep_finish(current_task, taskwait_task_data);
1005 #endif /* OMPT_SUPPORT */
1006     return;
1007   }
1008 
1009   kmp_depnode_t node = {0};
1010   __kmp_init_node(&node);
1011 
1012   if (!__kmp_check_deps(gtid, &node, NULL, &current_task->td_dephash,
1013                         DEP_BARRIER, ndeps, dep_list, ndeps_noalias,
1014                         noalias_dep_list)) {
1015     KA_TRACE(10, ("__kmpc_omp_taskwait_deps(exit): T#%d has no blocking "
1016                   "dependences : loc=%p\n",
1017                   gtid, loc_ref));
1018 #if OMPT_SUPPORT
1019     __ompt_taskwait_dep_finish(current_task, taskwait_task_data);
1020 #endif /* OMPT_SUPPORT */
1021     return;
1022   }
1023 
1024   int thread_finished = FALSE;
1025   kmp_flag_32<false, false> flag(
1026       (std::atomic<kmp_uint32> *)&node.dn.npredecessors, 0U);
1027   while (node.dn.npredecessors > 0) {
1028     flag.execute_tasks(thread, gtid, FALSE,
1029                        &thread_finished USE_ITT_BUILD_ARG(NULL),
1030                        __kmp_task_stealing_constraint);
1031   }
1032 
1033   // Wait until the last __kmp_release_deps is finished before we free the
1034   // current stack frame holding the "node" variable; once its nrefs count
1035   // reaches 1, we're sure nobody else can try to reference it again.
1036   while (node.dn.nrefs > 1)
1037     KMP_YIELD(TRUE);
1038 
1039 #if OMPT_SUPPORT
1040   __ompt_taskwait_dep_finish(current_task, taskwait_task_data);
1041 #endif /* OMPT_SUPPORT */
1042   KA_TRACE(10, ("__kmpc_omp_taskwait_deps(exit): T#%d finished waiting : "
1043                 "loc=%p\n",
1044                 gtid, loc_ref));
1045 }
1046