xref: /freebsd/contrib/llvm-project/openmp/runtime/src/kmp_csupport.cpp (revision 3ceba58a7509418b47b8fca2d2b6bbf088714e26)
1 /*
2  * kmp_csupport.cpp -- kfront linkage support for OpenMP.
3  */
4 
5 //===----------------------------------------------------------------------===//
6 //
7 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8 // See https://llvm.org/LICENSE.txt for license information.
9 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #define __KMP_IMP
14 #include "omp.h" /* extern "C" declarations of user-visible routines */
15 #include "kmp.h"
16 #include "kmp_error.h"
17 #include "kmp_i18n.h"
18 #include "kmp_itt.h"
19 #include "kmp_lock.h"
20 #include "kmp_stats.h"
21 #include "kmp_utils.h"
22 #include "ompt-specific.h"
23 
24 #define MAX_MESSAGE 512
25 
26 // flags will be used in future, e.g. to implement openmp_strict library
27 // restrictions
28 
29 /*!
30  * @ingroup STARTUP_SHUTDOWN
31  * @param loc   in   source location information
32  * @param flags in   for future use (currently ignored)
33  *
34  * Initialize the runtime library. This call is optional; if it is not made then
35  * it will be implicitly called by attempts to use other library functions.
36  */
37 void __kmpc_begin(ident_t *loc, kmp_int32 flags) {
38   // By default __kmpc_begin() is no-op.
39   char *env;
40   if ((env = getenv("KMP_INITIAL_THREAD_BIND")) != NULL &&
41       __kmp_str_match_true(env)) {
42     __kmp_middle_initialize();
43     __kmp_assign_root_init_mask();
44     KC_TRACE(10, ("__kmpc_begin: middle initialization called\n"));
45   } else if (__kmp_ignore_mppbeg() == FALSE) {
46     // By default __kmp_ignore_mppbeg() returns TRUE.
47     __kmp_internal_begin();
48     KC_TRACE(10, ("__kmpc_begin: called\n"));
49   }
50 }
51 
52 /*!
53  * @ingroup STARTUP_SHUTDOWN
54  * @param loc source location information
55  *
56  * Shutdown the runtime library. This is also optional, and even if called will
57  * not do anything unless the `KMP_IGNORE_MPPEND` environment variable is set to
58  * zero.
59  */
60 void __kmpc_end(ident_t *loc) {
61   // By default, __kmp_ignore_mppend() returns TRUE which makes __kmpc_end()
62   // call no-op. However, this can be overridden with KMP_IGNORE_MPPEND
63   // environment variable. If KMP_IGNORE_MPPEND is 0, __kmp_ignore_mppend()
64   // returns FALSE and __kmpc_end() will unregister this root (it can cause
65   // library shut down).
66   if (__kmp_ignore_mppend() == FALSE) {
67     KC_TRACE(10, ("__kmpc_end: called\n"));
68     KA_TRACE(30, ("__kmpc_end\n"));
69 
70     __kmp_internal_end_thread(-1);
71   }
72 #if KMP_OS_WINDOWS && OMPT_SUPPORT
73   // Normal exit process on Windows does not allow worker threads of the final
74   // parallel region to finish reporting their events, so shutting down the
75   // library here fixes the issue at least for the cases where __kmpc_end() is
76   // placed properly.
77   if (ompt_enabled.enabled)
78     __kmp_internal_end_library(__kmp_gtid_get_specific());
79 #endif
80 }
81 
82 /*!
83 @ingroup THREAD_STATES
84 @param loc Source location information.
85 @return The global thread index of the active thread.
86 
87 This function can be called in any context.
88 
89 If the runtime has ony been entered at the outermost level from a
90 single (necessarily non-OpenMP<sup>*</sup>) thread, then the thread number is
91 that which would be returned by omp_get_thread_num() in the outermost
92 active parallel construct. (Or zero if there is no active parallel
93 construct, since the primary thread is necessarily thread zero).
94 
95 If multiple non-OpenMP threads all enter an OpenMP construct then this
96 will be a unique thread identifier among all the threads created by
97 the OpenMP runtime (but the value cannot be defined in terms of
98 OpenMP thread ids returned by omp_get_thread_num()).
99 */
100 kmp_int32 __kmpc_global_thread_num(ident_t *loc) {
101   kmp_int32 gtid = __kmp_entry_gtid();
102 
103   KC_TRACE(10, ("__kmpc_global_thread_num: T#%d\n", gtid));
104 
105   return gtid;
106 }
107 
108 /*!
109 @ingroup THREAD_STATES
110 @param loc Source location information.
111 @return The number of threads under control of the OpenMP<sup>*</sup> runtime
112 
113 This function can be called in any context.
114 It returns the total number of threads under the control of the OpenMP runtime.
115 That is not a number that can be determined by any OpenMP standard calls, since
116 the library may be called from more than one non-OpenMP thread, and this
117 reflects the total over all such calls. Similarly the runtime maintains
118 underlying threads even when they are not active (since the cost of creating
119 and destroying OS threads is high), this call counts all such threads even if
120 they are not waiting for work.
121 */
122 kmp_int32 __kmpc_global_num_threads(ident_t *loc) {
123   KC_TRACE(10,
124            ("__kmpc_global_num_threads: num_threads = %d\n", __kmp_all_nth));
125 
126   return TCR_4(__kmp_all_nth);
127 }
128 
129 /*!
130 @ingroup THREAD_STATES
131 @param loc Source location information.
132 @return The thread number of the calling thread in the innermost active parallel
133 construct.
134 */
135 kmp_int32 __kmpc_bound_thread_num(ident_t *loc) {
136   KC_TRACE(10, ("__kmpc_bound_thread_num: called\n"));
137   return __kmp_tid_from_gtid(__kmp_entry_gtid());
138 }
139 
140 /*!
141 @ingroup THREAD_STATES
142 @param loc Source location information.
143 @return The number of threads in the innermost active parallel construct.
144 */
145 kmp_int32 __kmpc_bound_num_threads(ident_t *loc) {
146   KC_TRACE(10, ("__kmpc_bound_num_threads: called\n"));
147 
148   return __kmp_entry_thread()->th.th_team->t.t_nproc;
149 }
150 
151 /*!
152  * @ingroup DEPRECATED
153  * @param loc location description
154  *
155  * This function need not be called. It always returns TRUE.
156  */
157 kmp_int32 __kmpc_ok_to_fork(ident_t *loc) {
158 #ifndef KMP_DEBUG
159 
160   return TRUE;
161 
162 #else
163 
164   const char *semi2;
165   const char *semi3;
166   int line_no;
167 
168   if (__kmp_par_range == 0) {
169     return TRUE;
170   }
171   semi2 = loc->psource;
172   if (semi2 == NULL) {
173     return TRUE;
174   }
175   semi2 = strchr(semi2, ';');
176   if (semi2 == NULL) {
177     return TRUE;
178   }
179   semi2 = strchr(semi2 + 1, ';');
180   if (semi2 == NULL) {
181     return TRUE;
182   }
183   if (__kmp_par_range_filename[0]) {
184     const char *name = semi2 - 1;
185     while ((name > loc->psource) && (*name != '/') && (*name != ';')) {
186       name--;
187     }
188     if ((*name == '/') || (*name == ';')) {
189       name++;
190     }
191     if (strncmp(__kmp_par_range_filename, name, semi2 - name)) {
192       return __kmp_par_range < 0;
193     }
194   }
195   semi3 = strchr(semi2 + 1, ';');
196   if (__kmp_par_range_routine[0]) {
197     if ((semi3 != NULL) && (semi3 > semi2) &&
198         (strncmp(__kmp_par_range_routine, semi2 + 1, semi3 - semi2 - 1))) {
199       return __kmp_par_range < 0;
200     }
201   }
202   if (KMP_SSCANF(semi3 + 1, "%d", &line_no) == 1) {
203     if ((line_no >= __kmp_par_range_lb) && (line_no <= __kmp_par_range_ub)) {
204       return __kmp_par_range > 0;
205     }
206     return __kmp_par_range < 0;
207   }
208   return TRUE;
209 
210 #endif /* KMP_DEBUG */
211 }
212 
213 /*!
214 @ingroup THREAD_STATES
215 @param loc Source location information.
216 @return 1 if this thread is executing inside an active parallel region, zero if
217 not.
218 */
219 kmp_int32 __kmpc_in_parallel(ident_t *loc) {
220   return __kmp_entry_thread()->th.th_root->r.r_active;
221 }
222 
223 /*!
224 @ingroup PARALLEL
225 @param loc source location information
226 @param global_tid global thread number
227 @param num_threads number of threads requested for this parallel construct
228 
229 Set the number of threads to be used by the next fork spawned by this thread.
230 This call is only required if the parallel construct has a `num_threads` clause.
231 */
232 void __kmpc_push_num_threads(ident_t *loc, kmp_int32 global_tid,
233                              kmp_int32 num_threads) {
234   KA_TRACE(20, ("__kmpc_push_num_threads: enter T#%d num_threads=%d\n",
235                 global_tid, num_threads));
236   __kmp_assert_valid_gtid(global_tid);
237   __kmp_push_num_threads(loc, global_tid, num_threads);
238 }
239 
240 void __kmpc_push_num_threads_strict(ident_t *loc, kmp_int32 global_tid,
241                                     kmp_int32 num_threads, int severity,
242                                     const char *message) {
243   __kmp_push_num_threads(loc, global_tid, num_threads);
244   __kmp_set_strict_num_threads(loc, global_tid, severity, message);
245 }
246 
247 /*!
248 @ingroup PARALLEL
249 @param loc source location information
250 @param global_tid global thread number
251 @param list_length number of entries in the num_threads_list array
252 @param num_threads_list array of numbers of threads requested for this parallel
253 construct and subsequent nested parallel constructs
254 
255 Set the number of threads to be used by the next fork spawned by this thread,
256 and some nested forks as well.
257 This call is only required if the parallel construct has a `num_threads` clause
258 that has a list of integers as the argument.
259 */
260 void __kmpc_push_num_threads_list(ident_t *loc, kmp_int32 global_tid,
261                                   kmp_uint32 list_length,
262                                   kmp_int32 *num_threads_list) {
263   KA_TRACE(20, ("__kmpc_push_num_threads_list: enter T#%d num_threads_list=",
264                 global_tid));
265   KA_TRACE(20, ("%d", num_threads_list[0]));
266 #ifdef KMP_DEBUG
267   for (kmp_uint32 i = 1; i < list_length; ++i)
268     KA_TRACE(20, (", %d", num_threads_list[i]));
269 #endif
270   KA_TRACE(20, ("/n"));
271 
272   __kmp_assert_valid_gtid(global_tid);
273   __kmp_push_num_threads_list(loc, global_tid, list_length, num_threads_list);
274 }
275 
276 void __kmpc_push_num_threads_list_strict(ident_t *loc, kmp_int32 global_tid,
277                                          kmp_uint32 list_length,
278                                          kmp_int32 *num_threads_list,
279                                          int severity, const char *message) {
280   __kmp_push_num_threads_list(loc, global_tid, list_length, num_threads_list);
281   __kmp_set_strict_num_threads(loc, global_tid, severity, message);
282 }
283 
284 void __kmpc_pop_num_threads(ident_t *loc, kmp_int32 global_tid) {
285   KA_TRACE(20, ("__kmpc_pop_num_threads: enter\n"));
286   /* the num_threads are automatically popped */
287 }
288 
289 void __kmpc_push_proc_bind(ident_t *loc, kmp_int32 global_tid,
290                            kmp_int32 proc_bind) {
291   KA_TRACE(20, ("__kmpc_push_proc_bind: enter T#%d proc_bind=%d\n", global_tid,
292                 proc_bind));
293   __kmp_assert_valid_gtid(global_tid);
294   __kmp_push_proc_bind(loc, global_tid, (kmp_proc_bind_t)proc_bind);
295 }
296 
297 /*!
298 @ingroup PARALLEL
299 @param loc  source location information
300 @param argc  total number of arguments in the ellipsis
301 @param microtask  pointer to callback routine consisting of outlined parallel
302 construct
303 @param ...  pointers to shared variables that aren't global
304 
305 Do the actual fork and call the microtask in the relevant number of threads.
306 */
307 void __kmpc_fork_call(ident_t *loc, kmp_int32 argc, kmpc_micro microtask, ...) {
308   int gtid = __kmp_entry_gtid();
309 
310 #if (KMP_STATS_ENABLED)
311   // If we were in a serial region, then stop the serial timer, record
312   // the event, and start parallel region timer
313   stats_state_e previous_state = KMP_GET_THREAD_STATE();
314   if (previous_state == stats_state_e::SERIAL_REGION) {
315     KMP_EXCHANGE_PARTITIONED_TIMER(OMP_parallel_overhead);
316   } else {
317     KMP_PUSH_PARTITIONED_TIMER(OMP_parallel_overhead);
318   }
319   int inParallel = __kmpc_in_parallel(loc);
320   if (inParallel) {
321     KMP_COUNT_BLOCK(OMP_NESTED_PARALLEL);
322   } else {
323     KMP_COUNT_BLOCK(OMP_PARALLEL);
324   }
325 #endif
326 
327   // maybe to save thr_state is enough here
328   {
329     va_list ap;
330     va_start(ap, microtask);
331 
332 #if OMPT_SUPPORT
333     ompt_frame_t *ompt_frame;
334     if (ompt_enabled.enabled) {
335       kmp_info_t *master_th = __kmp_threads[gtid];
336       ompt_frame = &master_th->th.th_current_task->ompt_task_info.frame;
337       ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
338     }
339     OMPT_STORE_RETURN_ADDRESS(gtid);
340 #endif
341 
342 #if INCLUDE_SSC_MARKS
343     SSC_MARK_FORKING();
344 #endif
345     __kmp_fork_call(loc, gtid, fork_context_intel, argc,
346                     VOLATILE_CAST(microtask_t) microtask, // "wrapped" task
347                     VOLATILE_CAST(launch_t) __kmp_invoke_task_func,
348                     kmp_va_addr_of(ap));
349 #if INCLUDE_SSC_MARKS
350     SSC_MARK_JOINING();
351 #endif
352     __kmp_join_call(loc, gtid
353 #if OMPT_SUPPORT
354                     ,
355                     fork_context_intel
356 #endif
357     );
358 
359     va_end(ap);
360 
361 #if OMPT_SUPPORT
362     if (ompt_enabled.enabled) {
363       ompt_frame->enter_frame = ompt_data_none;
364     }
365 #endif
366   }
367 
368 #if KMP_STATS_ENABLED
369   if (previous_state == stats_state_e::SERIAL_REGION) {
370     KMP_EXCHANGE_PARTITIONED_TIMER(OMP_serial);
371     KMP_SET_THREAD_STATE(previous_state);
372   } else {
373     KMP_POP_PARTITIONED_TIMER();
374   }
375 #endif // KMP_STATS_ENABLED
376 }
377 
378 /*!
379 @ingroup PARALLEL
380 @param loc  source location information
381 @param microtask  pointer to callback routine consisting of outlined parallel
382 construct
383 @param cond  condition for running in parallel
384 @param args  struct of pointers to shared variables that aren't global
385 
386 Perform a fork only if the condition is true.
387 */
388 void __kmpc_fork_call_if(ident_t *loc, kmp_int32 argc, kmpc_micro microtask,
389                          kmp_int32 cond, void *args) {
390   int gtid = __kmp_entry_gtid();
391   if (cond) {
392     if (args)
393       __kmpc_fork_call(loc, argc, microtask, args);
394     else
395       __kmpc_fork_call(loc, argc, microtask);
396   } else {
397     __kmpc_serialized_parallel(loc, gtid);
398 
399 #if OMPT_SUPPORT
400     void *exit_frame_ptr;
401 #endif
402 
403     if (args)
404       __kmp_invoke_microtask(VOLATILE_CAST(microtask_t) microtask, gtid,
405                              /*npr=*/0,
406                              /*argc=*/1, &args
407 #if OMPT_SUPPORT
408                              ,
409                              &exit_frame_ptr
410 #endif
411       );
412     else
413       __kmp_invoke_microtask(VOLATILE_CAST(microtask_t) microtask, gtid,
414                              /*npr=*/0,
415                              /*argc=*/0,
416                              /*args=*/nullptr
417 #if OMPT_SUPPORT
418                              ,
419                              &exit_frame_ptr
420 #endif
421       );
422 
423     __kmpc_end_serialized_parallel(loc, gtid);
424   }
425 }
426 
427 /*!
428 @ingroup PARALLEL
429 @param loc source location information
430 @param global_tid global thread number
431 @param num_teams number of teams requested for the teams construct
432 @param num_threads number of threads per team requested for the teams construct
433 
434 Set the number of teams to be used by the teams construct.
435 This call is only required if the teams construct has a `num_teams` clause
436 or a `thread_limit` clause (or both).
437 */
438 void __kmpc_push_num_teams(ident_t *loc, kmp_int32 global_tid,
439                            kmp_int32 num_teams, kmp_int32 num_threads) {
440   KA_TRACE(20,
441            ("__kmpc_push_num_teams: enter T#%d num_teams=%d num_threads=%d\n",
442             global_tid, num_teams, num_threads));
443   __kmp_assert_valid_gtid(global_tid);
444   __kmp_push_num_teams(loc, global_tid, num_teams, num_threads);
445 }
446 
447 /*!
448 @ingroup PARALLEL
449 @param loc source location information
450 @param global_tid global thread number
451 @param thread_limit limit on number of threads which can be created within the
452 current task
453 
454 Set the thread_limit for the current task
455 This call is there to support `thread_limit` clause on the `target` construct
456 */
457 void __kmpc_set_thread_limit(ident_t *loc, kmp_int32 global_tid,
458                              kmp_int32 thread_limit) {
459   __kmp_assert_valid_gtid(global_tid);
460   kmp_info_t *thread = __kmp_threads[global_tid];
461   if (thread_limit > 0)
462     thread->th.th_current_task->td_icvs.task_thread_limit = thread_limit;
463 }
464 
465 /*!
466 @ingroup PARALLEL
467 @param loc source location information
468 @param global_tid global thread number
469 @param num_teams_lb lower bound on number of teams requested for the teams
470 construct
471 @param num_teams_ub upper bound on number of teams requested for the teams
472 construct
473 @param num_threads number of threads per team requested for the teams construct
474 
475 Set the number of teams to be used by the teams construct. The number of initial
476 teams cretaed will be greater than or equal to the lower bound and less than or
477 equal to the upper bound.
478 This call is only required if the teams construct has a `num_teams` clause
479 or a `thread_limit` clause (or both).
480 */
481 void __kmpc_push_num_teams_51(ident_t *loc, kmp_int32 global_tid,
482                               kmp_int32 num_teams_lb, kmp_int32 num_teams_ub,
483                               kmp_int32 num_threads) {
484   KA_TRACE(20, ("__kmpc_push_num_teams_51: enter T#%d num_teams_lb=%d"
485                 " num_teams_ub=%d num_threads=%d\n",
486                 global_tid, num_teams_lb, num_teams_ub, num_threads));
487   __kmp_assert_valid_gtid(global_tid);
488   __kmp_push_num_teams_51(loc, global_tid, num_teams_lb, num_teams_ub,
489                           num_threads);
490 }
491 
492 /*!
493 @ingroup PARALLEL
494 @param loc  source location information
495 @param argc  total number of arguments in the ellipsis
496 @param microtask  pointer to callback routine consisting of outlined teams
497 construct
498 @param ...  pointers to shared variables that aren't global
499 
500 Do the actual fork and call the microtask in the relevant number of threads.
501 */
502 void __kmpc_fork_teams(ident_t *loc, kmp_int32 argc, kmpc_micro microtask,
503                        ...) {
504   int gtid = __kmp_entry_gtid();
505   kmp_info_t *this_thr = __kmp_threads[gtid];
506   va_list ap;
507   va_start(ap, microtask);
508 
509 #if KMP_STATS_ENABLED
510   KMP_COUNT_BLOCK(OMP_TEAMS);
511   stats_state_e previous_state = KMP_GET_THREAD_STATE();
512   if (previous_state == stats_state_e::SERIAL_REGION) {
513     KMP_EXCHANGE_PARTITIONED_TIMER(OMP_teams_overhead);
514   } else {
515     KMP_PUSH_PARTITIONED_TIMER(OMP_teams_overhead);
516   }
517 #endif
518 
519   // remember teams entry point and nesting level
520   this_thr->th.th_teams_microtask = microtask;
521   this_thr->th.th_teams_level =
522       this_thr->th.th_team->t.t_level; // AC: can be >0 on host
523 
524 #if OMPT_SUPPORT
525   kmp_team_t *parent_team = this_thr->th.th_team;
526   int tid = __kmp_tid_from_gtid(gtid);
527   if (ompt_enabled.enabled) {
528     parent_team->t.t_implicit_task_taskdata[tid]
529         .ompt_task_info.frame.enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
530   }
531   OMPT_STORE_RETURN_ADDRESS(gtid);
532 #endif
533 
534   // check if __kmpc_push_num_teams called, set default number of teams
535   // otherwise
536   if (this_thr->th.th_teams_size.nteams == 0) {
537     __kmp_push_num_teams(loc, gtid, 0, 0);
538   }
539   KMP_DEBUG_ASSERT(this_thr->th.th_set_nproc >= 1);
540   KMP_DEBUG_ASSERT(this_thr->th.th_teams_size.nteams >= 1);
541   KMP_DEBUG_ASSERT(this_thr->th.th_teams_size.nth >= 1);
542 
543   __kmp_fork_call(
544       loc, gtid, fork_context_intel, argc,
545       VOLATILE_CAST(microtask_t) __kmp_teams_master, // "wrapped" task
546       VOLATILE_CAST(launch_t) __kmp_invoke_teams_master, kmp_va_addr_of(ap));
547   __kmp_join_call(loc, gtid
548 #if OMPT_SUPPORT
549                   ,
550                   fork_context_intel
551 #endif
552   );
553 
554   // Pop current CG root off list
555   KMP_DEBUG_ASSERT(this_thr->th.th_cg_roots);
556   kmp_cg_root_t *tmp = this_thr->th.th_cg_roots;
557   this_thr->th.th_cg_roots = tmp->up;
558   KA_TRACE(100, ("__kmpc_fork_teams: Thread %p popping node %p and moving up"
559                  " to node %p. cg_nthreads was %d\n",
560                  this_thr, tmp, this_thr->th.th_cg_roots, tmp->cg_nthreads));
561   KMP_DEBUG_ASSERT(tmp->cg_nthreads);
562   int i = tmp->cg_nthreads--;
563   if (i == 1) { // check is we are the last thread in CG (not always the case)
564     __kmp_free(tmp);
565   }
566   // Restore current task's thread_limit from CG root
567   KMP_DEBUG_ASSERT(this_thr->th.th_cg_roots);
568   this_thr->th.th_current_task->td_icvs.thread_limit =
569       this_thr->th.th_cg_roots->cg_thread_limit;
570 
571   this_thr->th.th_teams_microtask = NULL;
572   this_thr->th.th_teams_level = 0;
573   *(kmp_int64 *)(&this_thr->th.th_teams_size) = 0L;
574   va_end(ap);
575 #if KMP_STATS_ENABLED
576   if (previous_state == stats_state_e::SERIAL_REGION) {
577     KMP_EXCHANGE_PARTITIONED_TIMER(OMP_serial);
578     KMP_SET_THREAD_STATE(previous_state);
579   } else {
580     KMP_POP_PARTITIONED_TIMER();
581   }
582 #endif // KMP_STATS_ENABLED
583 }
584 
585 // I don't think this function should ever have been exported.
586 // The __kmpc_ prefix was misapplied.  I'm fairly certain that no generated
587 // openmp code ever called it, but it's been exported from the RTL for so
588 // long that I'm afraid to remove the definition.
589 int __kmpc_invoke_task_func(int gtid) { return __kmp_invoke_task_func(gtid); }
590 
591 /*!
592 @ingroup PARALLEL
593 @param loc  source location information
594 @param global_tid  global thread number
595 
596 Enter a serialized parallel construct. This interface is used to handle a
597 conditional parallel region, like this,
598 @code
599 #pragma omp parallel if (condition)
600 @endcode
601 when the condition is false.
602 */
603 void __kmpc_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
604   // The implementation is now in kmp_runtime.cpp so that it can share static
605   // functions with kmp_fork_call since the tasks to be done are similar in
606   // each case.
607   __kmp_assert_valid_gtid(global_tid);
608 #if OMPT_SUPPORT
609   OMPT_STORE_RETURN_ADDRESS(global_tid);
610 #endif
611   __kmp_serialized_parallel(loc, global_tid);
612 }
613 
614 /*!
615 @ingroup PARALLEL
616 @param loc  source location information
617 @param global_tid  global thread number
618 
619 Leave a serialized parallel construct.
620 */
621 void __kmpc_end_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
622   kmp_internal_control_t *top;
623   kmp_info_t *this_thr;
624   kmp_team_t *serial_team;
625 
626   KC_TRACE(10,
627            ("__kmpc_end_serialized_parallel: called by T#%d\n", global_tid));
628 
629   /* skip all this code for autopar serialized loops since it results in
630      unacceptable overhead */
631   if (loc != NULL && (loc->flags & KMP_IDENT_AUTOPAR))
632     return;
633 
634   // Not autopar code
635   __kmp_assert_valid_gtid(global_tid);
636   if (!TCR_4(__kmp_init_parallel))
637     __kmp_parallel_initialize();
638 
639   __kmp_resume_if_soft_paused();
640 
641   this_thr = __kmp_threads[global_tid];
642   serial_team = this_thr->th.th_serial_team;
643 
644   kmp_task_team_t *task_team = this_thr->th.th_task_team;
645   // we need to wait for the proxy tasks before finishing the thread
646   if (task_team != NULL && (task_team->tt.tt_found_proxy_tasks ||
647                             task_team->tt.tt_hidden_helper_task_encountered))
648     __kmp_task_team_wait(this_thr, serial_team USE_ITT_BUILD_ARG(NULL));
649 
650   KMP_MB();
651   KMP_DEBUG_ASSERT(serial_team);
652   KMP_ASSERT(serial_team->t.t_serialized);
653   KMP_DEBUG_ASSERT(this_thr->th.th_team == serial_team);
654   KMP_DEBUG_ASSERT(serial_team != this_thr->th.th_root->r.r_root_team);
655   KMP_DEBUG_ASSERT(serial_team->t.t_threads);
656   KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
657 
658 #if OMPT_SUPPORT
659   if (ompt_enabled.enabled &&
660       this_thr->th.ompt_thread_info.state != ompt_state_overhead) {
661     OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame = ompt_data_none;
662     if (ompt_enabled.ompt_callback_implicit_task) {
663       ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
664           ompt_scope_end, NULL, OMPT_CUR_TASK_DATA(this_thr), 1,
665           OMPT_CUR_TASK_INFO(this_thr)->thread_num, ompt_task_implicit);
666     }
667 
668     // reset clear the task id only after unlinking the task
669     ompt_data_t *parent_task_data;
670     __ompt_get_task_info_internal(1, NULL, &parent_task_data, NULL, NULL, NULL);
671 
672     if (ompt_enabled.ompt_callback_parallel_end) {
673       ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
674           &(serial_team->t.ompt_team_info.parallel_data), parent_task_data,
675           ompt_parallel_invoker_program | ompt_parallel_team,
676           OMPT_LOAD_RETURN_ADDRESS(global_tid));
677     }
678     __ompt_lw_taskteam_unlink(this_thr);
679     this_thr->th.ompt_thread_info.state = ompt_state_overhead;
680   }
681 #endif
682 
683   /* If necessary, pop the internal control stack values and replace the team
684    * values */
685   top = serial_team->t.t_control_stack_top;
686   if (top && top->serial_nesting_level == serial_team->t.t_serialized) {
687     copy_icvs(&serial_team->t.t_threads[0]->th.th_current_task->td_icvs, top);
688     serial_team->t.t_control_stack_top = top->next;
689     __kmp_free(top);
690   }
691 
692   /* pop dispatch buffers stack */
693   KMP_DEBUG_ASSERT(serial_team->t.t_dispatch->th_disp_buffer);
694   {
695     dispatch_private_info_t *disp_buffer =
696         serial_team->t.t_dispatch->th_disp_buffer;
697     serial_team->t.t_dispatch->th_disp_buffer =
698         serial_team->t.t_dispatch->th_disp_buffer->next;
699     __kmp_free(disp_buffer);
700   }
701 
702   /* pop the task team stack */
703   if (serial_team->t.t_serialized > 1) {
704     __kmp_pop_task_team_node(this_thr, serial_team);
705   }
706 
707   this_thr->th.th_def_allocator = serial_team->t.t_def_allocator; // restore
708 
709   --serial_team->t.t_serialized;
710   if (serial_team->t.t_serialized == 0) {
711 
712     /* return to the parallel section */
713 
714 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
715     if (__kmp_inherit_fp_control && serial_team->t.t_fp_control_saved) {
716       __kmp_clear_x87_fpu_status_word();
717       __kmp_load_x87_fpu_control_word(&serial_team->t.t_x87_fpu_control_word);
718       __kmp_load_mxcsr(&serial_team->t.t_mxcsr);
719     }
720 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
721 
722     __kmp_pop_current_task_from_thread(this_thr);
723 #if OMPD_SUPPORT
724     if (ompd_state & OMPD_ENABLE_BP)
725       ompd_bp_parallel_end();
726 #endif
727 
728     this_thr->th.th_team = serial_team->t.t_parent;
729     this_thr->th.th_info.ds.ds_tid = serial_team->t.t_master_tid;
730 
731     /* restore values cached in the thread */
732     this_thr->th.th_team_nproc = serial_team->t.t_parent->t.t_nproc; /*  JPH */
733     this_thr->th.th_team_master =
734         serial_team->t.t_parent->t.t_threads[0]; /* JPH */
735     this_thr->th.th_team_serialized = this_thr->th.th_team->t.t_serialized;
736 
737     /* TODO the below shouldn't need to be adjusted for serialized teams */
738     this_thr->th.th_dispatch =
739         &this_thr->th.th_team->t.t_dispatch[serial_team->t.t_master_tid];
740 
741     KMP_ASSERT(this_thr->th.th_current_task->td_flags.executing == 0);
742     this_thr->th.th_current_task->td_flags.executing = 1;
743 
744     if (__kmp_tasking_mode != tskm_immediate_exec) {
745       // Restore task state from serial team structure
746       KMP_DEBUG_ASSERT(serial_team->t.t_primary_task_state == 0 ||
747                        serial_team->t.t_primary_task_state == 1);
748       this_thr->th.th_task_state =
749           (kmp_uint8)serial_team->t.t_primary_task_state;
750       // Copy the task team from the new child / old parent team to the thread.
751       this_thr->th.th_task_team =
752           this_thr->th.th_team->t.t_task_team[this_thr->th.th_task_state];
753       KA_TRACE(20,
754                ("__kmpc_end_serialized_parallel: T#%d restoring task_team %p / "
755                 "team %p\n",
756                 global_tid, this_thr->th.th_task_team, this_thr->th.th_team));
757     }
758 #if KMP_AFFINITY_SUPPORTED
759     if (this_thr->th.th_team->t.t_level == 0 && __kmp_affinity.flags.reset) {
760       __kmp_reset_root_init_mask(global_tid);
761     }
762 #endif
763   } else {
764     if (__kmp_tasking_mode != tskm_immediate_exec) {
765       KA_TRACE(20, ("__kmpc_end_serialized_parallel: T#%d decreasing nesting "
766                     "depth of serial team %p to %d\n",
767                     global_tid, serial_team, serial_team->t.t_serialized));
768     }
769   }
770 
771   serial_team->t.t_level--;
772   if (__kmp_env_consistency_check)
773     __kmp_pop_parallel(global_tid, NULL);
774 #if OMPT_SUPPORT
775   if (ompt_enabled.enabled)
776     this_thr->th.ompt_thread_info.state =
777         ((this_thr->th.th_team_serialized) ? ompt_state_work_serial
778                                            : ompt_state_work_parallel);
779 #endif
780 }
781 
782 /*!
783 @ingroup SYNCHRONIZATION
784 @param loc  source location information.
785 
786 Execute <tt>flush</tt>. This is implemented as a full memory fence. (Though
787 depending on the memory ordering convention obeyed by the compiler
788 even that may not be necessary).
789 */
790 void __kmpc_flush(ident_t *loc) {
791   KC_TRACE(10, ("__kmpc_flush: called\n"));
792 
793   /* need explicit __mf() here since use volatile instead in library */
794   KMP_MFENCE(); /* Flush all pending memory write invalidates.  */
795 
796 #if OMPT_SUPPORT && OMPT_OPTIONAL
797   if (ompt_enabled.ompt_callback_flush) {
798     ompt_callbacks.ompt_callback(ompt_callback_flush)(
799         __ompt_get_thread_data_internal(), OMPT_GET_RETURN_ADDRESS(0));
800   }
801 #endif
802 }
803 
804 /* -------------------------------------------------------------------------- */
805 /*!
806 @ingroup SYNCHRONIZATION
807 @param loc source location information
808 @param global_tid thread id.
809 
810 Execute a barrier.
811 */
812 void __kmpc_barrier(ident_t *loc, kmp_int32 global_tid) {
813   KMP_COUNT_BLOCK(OMP_BARRIER);
814   KC_TRACE(10, ("__kmpc_barrier: called T#%d\n", global_tid));
815   __kmp_assert_valid_gtid(global_tid);
816 
817   if (!TCR_4(__kmp_init_parallel))
818     __kmp_parallel_initialize();
819 
820   __kmp_resume_if_soft_paused();
821 
822   if (__kmp_env_consistency_check) {
823     if (loc == 0) {
824       KMP_WARNING(ConstructIdentInvalid); // ??? What does it mean for the user?
825     }
826     __kmp_check_barrier(global_tid, ct_barrier, loc);
827   }
828 
829 #if OMPT_SUPPORT
830   ompt_frame_t *ompt_frame;
831   if (ompt_enabled.enabled) {
832     __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
833     if (ompt_frame->enter_frame.ptr == NULL)
834       ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
835   }
836   OMPT_STORE_RETURN_ADDRESS(global_tid);
837 #endif
838   __kmp_threads[global_tid]->th.th_ident = loc;
839   // TODO: explicit barrier_wait_id:
840   //   this function is called when 'barrier' directive is present or
841   //   implicit barrier at the end of a worksharing construct.
842   // 1) better to add a per-thread barrier counter to a thread data structure
843   // 2) set to 0 when a new team is created
844   // 4) no sync is required
845 
846   __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
847 #if OMPT_SUPPORT && OMPT_OPTIONAL
848   if (ompt_enabled.enabled) {
849     ompt_frame->enter_frame = ompt_data_none;
850   }
851 #endif
852 }
853 
854 /* The BARRIER for a MASTER section is always explicit   */
855 /*!
856 @ingroup WORK_SHARING
857 @param loc  source location information.
858 @param global_tid  global thread number .
859 @return 1 if this thread should execute the <tt>master</tt> block, 0 otherwise.
860 */
861 kmp_int32 __kmpc_master(ident_t *loc, kmp_int32 global_tid) {
862   int status = 0;
863 
864   KC_TRACE(10, ("__kmpc_master: called T#%d\n", global_tid));
865   __kmp_assert_valid_gtid(global_tid);
866 
867   if (!TCR_4(__kmp_init_parallel))
868     __kmp_parallel_initialize();
869 
870   __kmp_resume_if_soft_paused();
871 
872   if (KMP_MASTER_GTID(global_tid)) {
873     KMP_COUNT_BLOCK(OMP_MASTER);
874     KMP_PUSH_PARTITIONED_TIMER(OMP_master);
875     status = 1;
876   }
877 
878 #if OMPT_SUPPORT && OMPT_OPTIONAL
879   if (status) {
880     if (ompt_enabled.ompt_callback_masked) {
881       kmp_info_t *this_thr = __kmp_threads[global_tid];
882       kmp_team_t *team = this_thr->th.th_team;
883 
884       int tid = __kmp_tid_from_gtid(global_tid);
885       ompt_callbacks.ompt_callback(ompt_callback_masked)(
886           ompt_scope_begin, &(team->t.ompt_team_info.parallel_data),
887           &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
888           OMPT_GET_RETURN_ADDRESS(0));
889     }
890   }
891 #endif
892 
893   if (__kmp_env_consistency_check) {
894 #if KMP_USE_DYNAMIC_LOCK
895     if (status)
896       __kmp_push_sync(global_tid, ct_master, loc, NULL, 0);
897     else
898       __kmp_check_sync(global_tid, ct_master, loc, NULL, 0);
899 #else
900     if (status)
901       __kmp_push_sync(global_tid, ct_master, loc, NULL);
902     else
903       __kmp_check_sync(global_tid, ct_master, loc, NULL);
904 #endif
905   }
906 
907   return status;
908 }
909 
910 /*!
911 @ingroup WORK_SHARING
912 @param loc  source location information.
913 @param global_tid  global thread number .
914 
915 Mark the end of a <tt>master</tt> region. This should only be called by the
916 thread that executes the <tt>master</tt> region.
917 */
918 void __kmpc_end_master(ident_t *loc, kmp_int32 global_tid) {
919   KC_TRACE(10, ("__kmpc_end_master: called T#%d\n", global_tid));
920   __kmp_assert_valid_gtid(global_tid);
921   KMP_DEBUG_ASSERT(KMP_MASTER_GTID(global_tid));
922   KMP_POP_PARTITIONED_TIMER();
923 
924 #if OMPT_SUPPORT && OMPT_OPTIONAL
925   kmp_info_t *this_thr = __kmp_threads[global_tid];
926   kmp_team_t *team = this_thr->th.th_team;
927   if (ompt_enabled.ompt_callback_masked) {
928     int tid = __kmp_tid_from_gtid(global_tid);
929     ompt_callbacks.ompt_callback(ompt_callback_masked)(
930         ompt_scope_end, &(team->t.ompt_team_info.parallel_data),
931         &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
932         OMPT_GET_RETURN_ADDRESS(0));
933   }
934 #endif
935 
936   if (__kmp_env_consistency_check) {
937     if (KMP_MASTER_GTID(global_tid))
938       __kmp_pop_sync(global_tid, ct_master, loc);
939   }
940 }
941 
942 /*!
943 @ingroup WORK_SHARING
944 @param loc  source location information.
945 @param global_tid  global thread number.
946 @param filter result of evaluating filter clause on thread global_tid, or zero
947 if no filter clause present
948 @return 1 if this thread should execute the <tt>masked</tt> block, 0 otherwise.
949 */
950 kmp_int32 __kmpc_masked(ident_t *loc, kmp_int32 global_tid, kmp_int32 filter) {
951   int status = 0;
952   int tid;
953   KC_TRACE(10, ("__kmpc_masked: called T#%d\n", global_tid));
954   __kmp_assert_valid_gtid(global_tid);
955 
956   if (!TCR_4(__kmp_init_parallel))
957     __kmp_parallel_initialize();
958 
959   __kmp_resume_if_soft_paused();
960 
961   tid = __kmp_tid_from_gtid(global_tid);
962   if (tid == filter) {
963     KMP_COUNT_BLOCK(OMP_MASKED);
964     KMP_PUSH_PARTITIONED_TIMER(OMP_masked);
965     status = 1;
966   }
967 
968 #if OMPT_SUPPORT && OMPT_OPTIONAL
969   if (status) {
970     if (ompt_enabled.ompt_callback_masked) {
971       kmp_info_t *this_thr = __kmp_threads[global_tid];
972       kmp_team_t *team = this_thr->th.th_team;
973       ompt_callbacks.ompt_callback(ompt_callback_masked)(
974           ompt_scope_begin, &(team->t.ompt_team_info.parallel_data),
975           &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
976           OMPT_GET_RETURN_ADDRESS(0));
977     }
978   }
979 #endif
980 
981   if (__kmp_env_consistency_check) {
982 #if KMP_USE_DYNAMIC_LOCK
983     if (status)
984       __kmp_push_sync(global_tid, ct_masked, loc, NULL, 0);
985     else
986       __kmp_check_sync(global_tid, ct_masked, loc, NULL, 0);
987 #else
988     if (status)
989       __kmp_push_sync(global_tid, ct_masked, loc, NULL);
990     else
991       __kmp_check_sync(global_tid, ct_masked, loc, NULL);
992 #endif
993   }
994 
995   return status;
996 }
997 
998 /*!
999 @ingroup WORK_SHARING
1000 @param loc  source location information.
1001 @param global_tid  global thread number .
1002 
1003 Mark the end of a <tt>masked</tt> region. This should only be called by the
1004 thread that executes the <tt>masked</tt> region.
1005 */
1006 void __kmpc_end_masked(ident_t *loc, kmp_int32 global_tid) {
1007   KC_TRACE(10, ("__kmpc_end_masked: called T#%d\n", global_tid));
1008   __kmp_assert_valid_gtid(global_tid);
1009   KMP_POP_PARTITIONED_TIMER();
1010 
1011 #if OMPT_SUPPORT && OMPT_OPTIONAL
1012   kmp_info_t *this_thr = __kmp_threads[global_tid];
1013   kmp_team_t *team = this_thr->th.th_team;
1014   if (ompt_enabled.ompt_callback_masked) {
1015     int tid = __kmp_tid_from_gtid(global_tid);
1016     ompt_callbacks.ompt_callback(ompt_callback_masked)(
1017         ompt_scope_end, &(team->t.ompt_team_info.parallel_data),
1018         &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
1019         OMPT_GET_RETURN_ADDRESS(0));
1020   }
1021 #endif
1022 
1023   if (__kmp_env_consistency_check) {
1024     __kmp_pop_sync(global_tid, ct_masked, loc);
1025   }
1026 }
1027 
1028 /*!
1029 @ingroup WORK_SHARING
1030 @param loc  source location information.
1031 @param gtid  global thread number.
1032 
1033 Start execution of an <tt>ordered</tt> construct.
1034 */
1035 void __kmpc_ordered(ident_t *loc, kmp_int32 gtid) {
1036   int cid = 0;
1037   kmp_info_t *th;
1038   KMP_DEBUG_ASSERT(__kmp_init_serial);
1039 
1040   KC_TRACE(10, ("__kmpc_ordered: called T#%d\n", gtid));
1041   __kmp_assert_valid_gtid(gtid);
1042 
1043   if (!TCR_4(__kmp_init_parallel))
1044     __kmp_parallel_initialize();
1045 
1046   __kmp_resume_if_soft_paused();
1047 
1048 #if USE_ITT_BUILD
1049   __kmp_itt_ordered_prep(gtid);
1050 // TODO: ordered_wait_id
1051 #endif /* USE_ITT_BUILD */
1052 
1053   th = __kmp_threads[gtid];
1054 
1055 #if OMPT_SUPPORT && OMPT_OPTIONAL
1056   kmp_team_t *team;
1057   ompt_wait_id_t lck;
1058   void *codeptr_ra;
1059   OMPT_STORE_RETURN_ADDRESS(gtid);
1060   if (ompt_enabled.enabled) {
1061     team = __kmp_team_from_gtid(gtid);
1062     lck = (ompt_wait_id_t)(uintptr_t)&team->t.t_ordered.dt.t_value;
1063     /* OMPT state update */
1064     th->th.ompt_thread_info.wait_id = lck;
1065     th->th.ompt_thread_info.state = ompt_state_wait_ordered;
1066 
1067     /* OMPT event callback */
1068     codeptr_ra = OMPT_LOAD_RETURN_ADDRESS(gtid);
1069     if (ompt_enabled.ompt_callback_mutex_acquire) {
1070       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
1071           ompt_mutex_ordered, omp_lock_hint_none, kmp_mutex_impl_spin, lck,
1072           codeptr_ra);
1073     }
1074   }
1075 #endif
1076 
1077   if (th->th.th_dispatch->th_deo_fcn != 0)
1078     (*th->th.th_dispatch->th_deo_fcn)(&gtid, &cid, loc);
1079   else
1080     __kmp_parallel_deo(&gtid, &cid, loc);
1081 
1082 #if OMPT_SUPPORT && OMPT_OPTIONAL
1083   if (ompt_enabled.enabled) {
1084     /* OMPT state update */
1085     th->th.ompt_thread_info.state = ompt_state_work_parallel;
1086     th->th.ompt_thread_info.wait_id = 0;
1087 
1088     /* OMPT event callback */
1089     if (ompt_enabled.ompt_callback_mutex_acquired) {
1090       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
1091           ompt_mutex_ordered, (ompt_wait_id_t)(uintptr_t)lck, codeptr_ra);
1092     }
1093   }
1094 #endif
1095 
1096 #if USE_ITT_BUILD
1097   __kmp_itt_ordered_start(gtid);
1098 #endif /* USE_ITT_BUILD */
1099 }
1100 
1101 /*!
1102 @ingroup WORK_SHARING
1103 @param loc  source location information.
1104 @param gtid  global thread number.
1105 
1106 End execution of an <tt>ordered</tt> construct.
1107 */
1108 void __kmpc_end_ordered(ident_t *loc, kmp_int32 gtid) {
1109   int cid = 0;
1110   kmp_info_t *th;
1111 
1112   KC_TRACE(10, ("__kmpc_end_ordered: called T#%d\n", gtid));
1113   __kmp_assert_valid_gtid(gtid);
1114 
1115 #if USE_ITT_BUILD
1116   __kmp_itt_ordered_end(gtid);
1117 // TODO: ordered_wait_id
1118 #endif /* USE_ITT_BUILD */
1119 
1120   th = __kmp_threads[gtid];
1121 
1122   if (th->th.th_dispatch->th_dxo_fcn != 0)
1123     (*th->th.th_dispatch->th_dxo_fcn)(&gtid, &cid, loc);
1124   else
1125     __kmp_parallel_dxo(&gtid, &cid, loc);
1126 
1127 #if OMPT_SUPPORT && OMPT_OPTIONAL
1128   OMPT_STORE_RETURN_ADDRESS(gtid);
1129   if (ompt_enabled.ompt_callback_mutex_released) {
1130     ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
1131         ompt_mutex_ordered,
1132         (ompt_wait_id_t)(uintptr_t)&__kmp_team_from_gtid(gtid)
1133             ->t.t_ordered.dt.t_value,
1134         OMPT_LOAD_RETURN_ADDRESS(gtid));
1135   }
1136 #endif
1137 }
1138 
1139 #if KMP_USE_DYNAMIC_LOCK
1140 
1141 static __forceinline void
1142 __kmp_init_indirect_csptr(kmp_critical_name *crit, ident_t const *loc,
1143                           kmp_int32 gtid, kmp_indirect_locktag_t tag) {
1144   // Pointer to the allocated indirect lock is written to crit, while indexing
1145   // is ignored.
1146   void *idx;
1147   kmp_indirect_lock_t **lck;
1148   lck = (kmp_indirect_lock_t **)crit;
1149   kmp_indirect_lock_t *ilk = __kmp_allocate_indirect_lock(&idx, gtid, tag);
1150   KMP_I_LOCK_FUNC(ilk, init)(ilk->lock);
1151   KMP_SET_I_LOCK_LOCATION(ilk, loc);
1152   KMP_SET_I_LOCK_FLAGS(ilk, kmp_lf_critical_section);
1153   KA_TRACE(20,
1154            ("__kmp_init_indirect_csptr: initialized indirect lock #%d\n", tag));
1155 #if USE_ITT_BUILD
1156   __kmp_itt_critical_creating(ilk->lock, loc);
1157 #endif
1158   int status = KMP_COMPARE_AND_STORE_PTR(lck, nullptr, ilk);
1159   if (status == 0) {
1160 #if USE_ITT_BUILD
1161     __kmp_itt_critical_destroyed(ilk->lock);
1162 #endif
1163     // We don't really need to destroy the unclaimed lock here since it will be
1164     // cleaned up at program exit.
1165     // KMP_D_LOCK_FUNC(&idx, destroy)((kmp_dyna_lock_t *)&idx);
1166   }
1167   KMP_DEBUG_ASSERT(*lck != NULL);
1168 }
1169 
1170 // Fast-path acquire tas lock
1171 #define KMP_ACQUIRE_TAS_LOCK(lock, gtid)                                       \
1172   {                                                                            \
1173     kmp_tas_lock_t *l = (kmp_tas_lock_t *)lock;                                \
1174     kmp_int32 tas_free = KMP_LOCK_FREE(tas);                                   \
1175     kmp_int32 tas_busy = KMP_LOCK_BUSY(gtid + 1, tas);                         \
1176     if (KMP_ATOMIC_LD_RLX(&l->lk.poll) != tas_free ||                          \
1177         !__kmp_atomic_compare_store_acq(&l->lk.poll, tas_free, tas_busy)) {    \
1178       kmp_uint32 spins;                                                        \
1179       KMP_FSYNC_PREPARE(l);                                                    \
1180       KMP_INIT_YIELD(spins);                                                   \
1181       kmp_backoff_t backoff = __kmp_spin_backoff_params;                       \
1182       do {                                                                     \
1183         if (TCR_4(__kmp_nth) >                                                 \
1184             (__kmp_avail_proc ? __kmp_avail_proc : __kmp_xproc)) {             \
1185           KMP_YIELD(TRUE);                                                     \
1186         } else {                                                               \
1187           KMP_YIELD_SPIN(spins);                                               \
1188         }                                                                      \
1189         __kmp_spin_backoff(&backoff);                                          \
1190       } while (                                                                \
1191           KMP_ATOMIC_LD_RLX(&l->lk.poll) != tas_free ||                        \
1192           !__kmp_atomic_compare_store_acq(&l->lk.poll, tas_free, tas_busy));   \
1193     }                                                                          \
1194     KMP_FSYNC_ACQUIRED(l);                                                     \
1195   }
1196 
1197 // Fast-path test tas lock
1198 #define KMP_TEST_TAS_LOCK(lock, gtid, rc)                                      \
1199   {                                                                            \
1200     kmp_tas_lock_t *l = (kmp_tas_lock_t *)lock;                                \
1201     kmp_int32 tas_free = KMP_LOCK_FREE(tas);                                   \
1202     kmp_int32 tas_busy = KMP_LOCK_BUSY(gtid + 1, tas);                         \
1203     rc = KMP_ATOMIC_LD_RLX(&l->lk.poll) == tas_free &&                         \
1204          __kmp_atomic_compare_store_acq(&l->lk.poll, tas_free, tas_busy);      \
1205   }
1206 
1207 // Fast-path release tas lock
1208 #define KMP_RELEASE_TAS_LOCK(lock, gtid)                                       \
1209   { KMP_ATOMIC_ST_REL(&((kmp_tas_lock_t *)lock)->lk.poll, KMP_LOCK_FREE(tas)); }
1210 
1211 #if KMP_USE_FUTEX
1212 
1213 #include <sys/syscall.h>
1214 #include <unistd.h>
1215 #ifndef FUTEX_WAIT
1216 #define FUTEX_WAIT 0
1217 #endif
1218 #ifndef FUTEX_WAKE
1219 #define FUTEX_WAKE 1
1220 #endif
1221 
1222 // Fast-path acquire futex lock
1223 #define KMP_ACQUIRE_FUTEX_LOCK(lock, gtid)                                     \
1224   {                                                                            \
1225     kmp_futex_lock_t *ftx = (kmp_futex_lock_t *)lock;                          \
1226     kmp_int32 gtid_code = (gtid + 1) << 1;                                     \
1227     KMP_MB();                                                                  \
1228     KMP_FSYNC_PREPARE(ftx);                                                    \
1229     kmp_int32 poll_val;                                                        \
1230     while ((poll_val = KMP_COMPARE_AND_STORE_RET32(                            \
1231                 &(ftx->lk.poll), KMP_LOCK_FREE(futex),                         \
1232                 KMP_LOCK_BUSY(gtid_code, futex))) != KMP_LOCK_FREE(futex)) {   \
1233       kmp_int32 cond = KMP_LOCK_STRIP(poll_val) & 1;                           \
1234       if (!cond) {                                                             \
1235         if (!KMP_COMPARE_AND_STORE_RET32(&(ftx->lk.poll), poll_val,            \
1236                                          poll_val |                            \
1237                                              KMP_LOCK_BUSY(1, futex))) {       \
1238           continue;                                                            \
1239         }                                                                      \
1240         poll_val |= KMP_LOCK_BUSY(1, futex);                                   \
1241       }                                                                        \
1242       kmp_int32 rc;                                                            \
1243       if ((rc = syscall(__NR_futex, &(ftx->lk.poll), FUTEX_WAIT, poll_val,     \
1244                         NULL, NULL, 0)) != 0) {                                \
1245         continue;                                                              \
1246       }                                                                        \
1247       gtid_code |= 1;                                                          \
1248     }                                                                          \
1249     KMP_FSYNC_ACQUIRED(ftx);                                                   \
1250   }
1251 
1252 // Fast-path test futex lock
1253 #define KMP_TEST_FUTEX_LOCK(lock, gtid, rc)                                    \
1254   {                                                                            \
1255     kmp_futex_lock_t *ftx = (kmp_futex_lock_t *)lock;                          \
1256     if (KMP_COMPARE_AND_STORE_ACQ32(&(ftx->lk.poll), KMP_LOCK_FREE(futex),     \
1257                                     KMP_LOCK_BUSY(gtid + 1 << 1, futex))) {    \
1258       KMP_FSYNC_ACQUIRED(ftx);                                                 \
1259       rc = TRUE;                                                               \
1260     } else {                                                                   \
1261       rc = FALSE;                                                              \
1262     }                                                                          \
1263   }
1264 
1265 // Fast-path release futex lock
1266 #define KMP_RELEASE_FUTEX_LOCK(lock, gtid)                                     \
1267   {                                                                            \
1268     kmp_futex_lock_t *ftx = (kmp_futex_lock_t *)lock;                          \
1269     KMP_MB();                                                                  \
1270     KMP_FSYNC_RELEASING(ftx);                                                  \
1271     kmp_int32 poll_val =                                                       \
1272         KMP_XCHG_FIXED32(&(ftx->lk.poll), KMP_LOCK_FREE(futex));               \
1273     if (KMP_LOCK_STRIP(poll_val) & 1) {                                        \
1274       syscall(__NR_futex, &(ftx->lk.poll), FUTEX_WAKE,                         \
1275               KMP_LOCK_BUSY(1, futex), NULL, NULL, 0);                         \
1276     }                                                                          \
1277     KMP_MB();                                                                  \
1278     KMP_YIELD_OVERSUB();                                                       \
1279   }
1280 
1281 #endif // KMP_USE_FUTEX
1282 
1283 #else // KMP_USE_DYNAMIC_LOCK
1284 
1285 static kmp_user_lock_p __kmp_get_critical_section_ptr(kmp_critical_name *crit,
1286                                                       ident_t const *loc,
1287                                                       kmp_int32 gtid) {
1288   kmp_user_lock_p *lck_pp = (kmp_user_lock_p *)crit;
1289 
1290   // Because of the double-check, the following load doesn't need to be volatile
1291   kmp_user_lock_p lck = (kmp_user_lock_p)TCR_PTR(*lck_pp);
1292 
1293   if (lck == NULL) {
1294     void *idx;
1295 
1296     // Allocate & initialize the lock.
1297     // Remember alloc'ed locks in table in order to free them in __kmp_cleanup()
1298     lck = __kmp_user_lock_allocate(&idx, gtid, kmp_lf_critical_section);
1299     __kmp_init_user_lock_with_checks(lck);
1300     __kmp_set_user_lock_location(lck, loc);
1301 #if USE_ITT_BUILD
1302     __kmp_itt_critical_creating(lck);
1303 // __kmp_itt_critical_creating() should be called *before* the first usage
1304 // of underlying lock. It is the only place where we can guarantee it. There
1305 // are chances the lock will destroyed with no usage, but it is not a
1306 // problem, because this is not real event seen by user but rather setting
1307 // name for object (lock). See more details in kmp_itt.h.
1308 #endif /* USE_ITT_BUILD */
1309 
1310     // Use a cmpxchg instruction to slam the start of the critical section with
1311     // the lock pointer.  If another thread beat us to it, deallocate the lock,
1312     // and use the lock that the other thread allocated.
1313     int status = KMP_COMPARE_AND_STORE_PTR(lck_pp, 0, lck);
1314 
1315     if (status == 0) {
1316 // Deallocate the lock and reload the value.
1317 #if USE_ITT_BUILD
1318       __kmp_itt_critical_destroyed(lck);
1319 // Let ITT know the lock is destroyed and the same memory location may be reused
1320 // for another purpose.
1321 #endif /* USE_ITT_BUILD */
1322       __kmp_destroy_user_lock_with_checks(lck);
1323       __kmp_user_lock_free(&idx, gtid, lck);
1324       lck = (kmp_user_lock_p)TCR_PTR(*lck_pp);
1325       KMP_DEBUG_ASSERT(lck != NULL);
1326     }
1327   }
1328   return lck;
1329 }
1330 
1331 #endif // KMP_USE_DYNAMIC_LOCK
1332 
1333 /*!
1334 @ingroup WORK_SHARING
1335 @param loc  source location information.
1336 @param global_tid  global thread number.
1337 @param crit identity of the critical section. This could be a pointer to a lock
1338 associated with the critical section, or some other suitably unique value.
1339 
1340 Enter code protected by a `critical` construct.
1341 This function blocks until the executing thread can enter the critical section.
1342 */
1343 void __kmpc_critical(ident_t *loc, kmp_int32 global_tid,
1344                      kmp_critical_name *crit) {
1345 #if KMP_USE_DYNAMIC_LOCK
1346 #if OMPT_SUPPORT && OMPT_OPTIONAL
1347   OMPT_STORE_RETURN_ADDRESS(global_tid);
1348 #endif // OMPT_SUPPORT
1349   __kmpc_critical_with_hint(loc, global_tid, crit, omp_lock_hint_none);
1350 #else
1351   KMP_COUNT_BLOCK(OMP_CRITICAL);
1352 #if OMPT_SUPPORT && OMPT_OPTIONAL
1353   ompt_state_t prev_state = ompt_state_undefined;
1354   ompt_thread_info_t ti;
1355 #endif
1356   kmp_user_lock_p lck;
1357 
1358   KC_TRACE(10, ("__kmpc_critical: called T#%d\n", global_tid));
1359   __kmp_assert_valid_gtid(global_tid);
1360 
1361   // TODO: add THR_OVHD_STATE
1362 
1363   KMP_PUSH_PARTITIONED_TIMER(OMP_critical_wait);
1364   KMP_CHECK_USER_LOCK_INIT();
1365 
1366   if ((__kmp_user_lock_kind == lk_tas) &&
1367       (sizeof(lck->tas.lk.poll) <= OMP_CRITICAL_SIZE)) {
1368     lck = (kmp_user_lock_p)crit;
1369   }
1370 #if KMP_USE_FUTEX
1371   else if ((__kmp_user_lock_kind == lk_futex) &&
1372            (sizeof(lck->futex.lk.poll) <= OMP_CRITICAL_SIZE)) {
1373     lck = (kmp_user_lock_p)crit;
1374   }
1375 #endif
1376   else { // ticket, queuing or drdpa
1377     lck = __kmp_get_critical_section_ptr(crit, loc, global_tid);
1378   }
1379 
1380   if (__kmp_env_consistency_check)
1381     __kmp_push_sync(global_tid, ct_critical, loc, lck);
1382 
1383     // since the critical directive binds to all threads, not just the current
1384     // team we have to check this even if we are in a serialized team.
1385     // also, even if we are the uber thread, we still have to conduct the lock,
1386     // as we have to contend with sibling threads.
1387 
1388 #if USE_ITT_BUILD
1389   __kmp_itt_critical_acquiring(lck);
1390 #endif /* USE_ITT_BUILD */
1391 #if OMPT_SUPPORT && OMPT_OPTIONAL
1392   OMPT_STORE_RETURN_ADDRESS(gtid);
1393   void *codeptr_ra = NULL;
1394   if (ompt_enabled.enabled) {
1395     ti = __kmp_threads[global_tid]->th.ompt_thread_info;
1396     /* OMPT state update */
1397     prev_state = ti.state;
1398     ti.wait_id = (ompt_wait_id_t)(uintptr_t)lck;
1399     ti.state = ompt_state_wait_critical;
1400 
1401     /* OMPT event callback */
1402     codeptr_ra = OMPT_LOAD_RETURN_ADDRESS(gtid);
1403     if (ompt_enabled.ompt_callback_mutex_acquire) {
1404       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
1405           ompt_mutex_critical, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
1406           (ompt_wait_id_t)(uintptr_t)lck, codeptr_ra);
1407     }
1408   }
1409 #endif
1410   // Value of 'crit' should be good for using as a critical_id of the critical
1411   // section directive.
1412   __kmp_acquire_user_lock_with_checks(lck, global_tid);
1413 
1414 #if USE_ITT_BUILD
1415   __kmp_itt_critical_acquired(lck);
1416 #endif /* USE_ITT_BUILD */
1417 #if OMPT_SUPPORT && OMPT_OPTIONAL
1418   if (ompt_enabled.enabled) {
1419     /* OMPT state update */
1420     ti.state = prev_state;
1421     ti.wait_id = 0;
1422 
1423     /* OMPT event callback */
1424     if (ompt_enabled.ompt_callback_mutex_acquired) {
1425       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
1426           ompt_mutex_critical, (ompt_wait_id_t)(uintptr_t)lck, codeptr_ra);
1427     }
1428   }
1429 #endif
1430   KMP_POP_PARTITIONED_TIMER();
1431 
1432   KMP_PUSH_PARTITIONED_TIMER(OMP_critical);
1433   KA_TRACE(15, ("__kmpc_critical: done T#%d\n", global_tid));
1434 #endif // KMP_USE_DYNAMIC_LOCK
1435 }
1436 
1437 #if KMP_USE_DYNAMIC_LOCK
1438 
1439 // Converts the given hint to an internal lock implementation
1440 static __forceinline kmp_dyna_lockseq_t __kmp_map_hint_to_lock(uintptr_t hint) {
1441 #if KMP_USE_TSX
1442 #define KMP_TSX_LOCK(seq) lockseq_##seq
1443 #else
1444 #define KMP_TSX_LOCK(seq) __kmp_user_lock_seq
1445 #endif
1446 
1447 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
1448 #define KMP_CPUINFO_RTM (__kmp_cpuinfo.flags.rtm)
1449 #else
1450 #define KMP_CPUINFO_RTM 0
1451 #endif
1452 
1453   // Hints that do not require further logic
1454   if (hint & kmp_lock_hint_hle)
1455     return KMP_TSX_LOCK(hle);
1456   if (hint & kmp_lock_hint_rtm)
1457     return KMP_CPUINFO_RTM ? KMP_TSX_LOCK(rtm_queuing) : __kmp_user_lock_seq;
1458   if (hint & kmp_lock_hint_adaptive)
1459     return KMP_CPUINFO_RTM ? KMP_TSX_LOCK(adaptive) : __kmp_user_lock_seq;
1460 
1461   // Rule out conflicting hints first by returning the default lock
1462   if ((hint & omp_lock_hint_contended) && (hint & omp_lock_hint_uncontended))
1463     return __kmp_user_lock_seq;
1464   if ((hint & omp_lock_hint_speculative) &&
1465       (hint & omp_lock_hint_nonspeculative))
1466     return __kmp_user_lock_seq;
1467 
1468   // Do not even consider speculation when it appears to be contended
1469   if (hint & omp_lock_hint_contended)
1470     return lockseq_queuing;
1471 
1472   // Uncontended lock without speculation
1473   if ((hint & omp_lock_hint_uncontended) && !(hint & omp_lock_hint_speculative))
1474     return lockseq_tas;
1475 
1476   // Use RTM lock for speculation
1477   if (hint & omp_lock_hint_speculative)
1478     return KMP_CPUINFO_RTM ? KMP_TSX_LOCK(rtm_spin) : __kmp_user_lock_seq;
1479 
1480   return __kmp_user_lock_seq;
1481 }
1482 
1483 #if OMPT_SUPPORT && OMPT_OPTIONAL
1484 #if KMP_USE_DYNAMIC_LOCK
1485 static kmp_mutex_impl_t
1486 __ompt_get_mutex_impl_type(void *user_lock, kmp_indirect_lock_t *ilock = 0) {
1487   if (user_lock) {
1488     switch (KMP_EXTRACT_D_TAG(user_lock)) {
1489     case 0:
1490       break;
1491 #if KMP_USE_FUTEX
1492     case locktag_futex:
1493       return kmp_mutex_impl_queuing;
1494 #endif
1495     case locktag_tas:
1496       return kmp_mutex_impl_spin;
1497 #if KMP_USE_TSX
1498     case locktag_hle:
1499     case locktag_rtm_spin:
1500       return kmp_mutex_impl_speculative;
1501 #endif
1502     default:
1503       return kmp_mutex_impl_none;
1504     }
1505     ilock = KMP_LOOKUP_I_LOCK(user_lock);
1506   }
1507   KMP_ASSERT(ilock);
1508   switch (ilock->type) {
1509 #if KMP_USE_TSX
1510   case locktag_adaptive:
1511   case locktag_rtm_queuing:
1512     return kmp_mutex_impl_speculative;
1513 #endif
1514   case locktag_nested_tas:
1515     return kmp_mutex_impl_spin;
1516 #if KMP_USE_FUTEX
1517   case locktag_nested_futex:
1518 #endif
1519   case locktag_ticket:
1520   case locktag_queuing:
1521   case locktag_drdpa:
1522   case locktag_nested_ticket:
1523   case locktag_nested_queuing:
1524   case locktag_nested_drdpa:
1525     return kmp_mutex_impl_queuing;
1526   default:
1527     return kmp_mutex_impl_none;
1528   }
1529 }
1530 #else
1531 // For locks without dynamic binding
1532 static kmp_mutex_impl_t __ompt_get_mutex_impl_type() {
1533   switch (__kmp_user_lock_kind) {
1534   case lk_tas:
1535     return kmp_mutex_impl_spin;
1536 #if KMP_USE_FUTEX
1537   case lk_futex:
1538 #endif
1539   case lk_ticket:
1540   case lk_queuing:
1541   case lk_drdpa:
1542     return kmp_mutex_impl_queuing;
1543 #if KMP_USE_TSX
1544   case lk_hle:
1545   case lk_rtm_queuing:
1546   case lk_rtm_spin:
1547   case lk_adaptive:
1548     return kmp_mutex_impl_speculative;
1549 #endif
1550   default:
1551     return kmp_mutex_impl_none;
1552   }
1553 }
1554 #endif // KMP_USE_DYNAMIC_LOCK
1555 #endif // OMPT_SUPPORT && OMPT_OPTIONAL
1556 
1557 /*!
1558 @ingroup WORK_SHARING
1559 @param loc  source location information.
1560 @param global_tid  global thread number.
1561 @param crit identity of the critical section. This could be a pointer to a lock
1562 associated with the critical section, or some other suitably unique value.
1563 @param hint the lock hint.
1564 
1565 Enter code protected by a `critical` construct with a hint. The hint value is
1566 used to suggest a lock implementation. This function blocks until the executing
1567 thread can enter the critical section unless the hint suggests use of
1568 speculative execution and the hardware supports it.
1569 */
1570 void __kmpc_critical_with_hint(ident_t *loc, kmp_int32 global_tid,
1571                                kmp_critical_name *crit, uint32_t hint) {
1572   KMP_COUNT_BLOCK(OMP_CRITICAL);
1573   kmp_user_lock_p lck;
1574 #if OMPT_SUPPORT && OMPT_OPTIONAL
1575   ompt_state_t prev_state = ompt_state_undefined;
1576   ompt_thread_info_t ti;
1577   // This is the case, if called from __kmpc_critical:
1578   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(global_tid);
1579   if (!codeptr)
1580     codeptr = OMPT_GET_RETURN_ADDRESS(0);
1581 #endif
1582 
1583   KC_TRACE(10, ("__kmpc_critical: called T#%d\n", global_tid));
1584   __kmp_assert_valid_gtid(global_tid);
1585 
1586   kmp_dyna_lock_t *lk = (kmp_dyna_lock_t *)crit;
1587   // Check if it is initialized.
1588   KMP_PUSH_PARTITIONED_TIMER(OMP_critical_wait);
1589   kmp_dyna_lockseq_t lockseq = __kmp_map_hint_to_lock(hint);
1590   if (*lk == 0) {
1591     if (KMP_IS_D_LOCK(lockseq)) {
1592       KMP_COMPARE_AND_STORE_ACQ32(
1593           (volatile kmp_int32 *)&((kmp_base_tas_lock_t *)crit)->poll, 0,
1594           KMP_GET_D_TAG(lockseq));
1595     } else {
1596       __kmp_init_indirect_csptr(crit, loc, global_tid, KMP_GET_I_TAG(lockseq));
1597     }
1598   }
1599   // Branch for accessing the actual lock object and set operation. This
1600   // branching is inevitable since this lock initialization does not follow the
1601   // normal dispatch path (lock table is not used).
1602   if (KMP_EXTRACT_D_TAG(lk) != 0) {
1603     lck = (kmp_user_lock_p)lk;
1604     if (__kmp_env_consistency_check) {
1605       __kmp_push_sync(global_tid, ct_critical, loc, lck,
1606                       __kmp_map_hint_to_lock(hint));
1607     }
1608 #if USE_ITT_BUILD
1609     __kmp_itt_critical_acquiring(lck);
1610 #endif
1611 #if OMPT_SUPPORT && OMPT_OPTIONAL
1612     if (ompt_enabled.enabled) {
1613       ti = __kmp_threads[global_tid]->th.ompt_thread_info;
1614       /* OMPT state update */
1615       prev_state = ti.state;
1616       ti.wait_id = (ompt_wait_id_t)(uintptr_t)lck;
1617       ti.state = ompt_state_wait_critical;
1618 
1619       /* OMPT event callback */
1620       if (ompt_enabled.ompt_callback_mutex_acquire) {
1621         ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
1622             ompt_mutex_critical, (unsigned int)hint,
1623             __ompt_get_mutex_impl_type(crit), (ompt_wait_id_t)(uintptr_t)lck,
1624             codeptr);
1625       }
1626     }
1627 #endif
1628 #if KMP_USE_INLINED_TAS
1629     if (lockseq == lockseq_tas && !__kmp_env_consistency_check) {
1630       KMP_ACQUIRE_TAS_LOCK(lck, global_tid);
1631     } else
1632 #elif KMP_USE_INLINED_FUTEX
1633     if (lockseq == lockseq_futex && !__kmp_env_consistency_check) {
1634       KMP_ACQUIRE_FUTEX_LOCK(lck, global_tid);
1635     } else
1636 #endif
1637     {
1638       KMP_D_LOCK_FUNC(lk, set)(lk, global_tid);
1639     }
1640   } else {
1641     kmp_indirect_lock_t *ilk = *((kmp_indirect_lock_t **)lk);
1642     lck = ilk->lock;
1643     if (__kmp_env_consistency_check) {
1644       __kmp_push_sync(global_tid, ct_critical, loc, lck,
1645                       __kmp_map_hint_to_lock(hint));
1646     }
1647 #if USE_ITT_BUILD
1648     __kmp_itt_critical_acquiring(lck);
1649 #endif
1650 #if OMPT_SUPPORT && OMPT_OPTIONAL
1651     if (ompt_enabled.enabled) {
1652       ti = __kmp_threads[global_tid]->th.ompt_thread_info;
1653       /* OMPT state update */
1654       prev_state = ti.state;
1655       ti.wait_id = (ompt_wait_id_t)(uintptr_t)lck;
1656       ti.state = ompt_state_wait_critical;
1657 
1658       /* OMPT event callback */
1659       if (ompt_enabled.ompt_callback_mutex_acquire) {
1660         ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
1661             ompt_mutex_critical, (unsigned int)hint,
1662             __ompt_get_mutex_impl_type(0, ilk), (ompt_wait_id_t)(uintptr_t)lck,
1663             codeptr);
1664       }
1665     }
1666 #endif
1667     KMP_I_LOCK_FUNC(ilk, set)(lck, global_tid);
1668   }
1669   KMP_POP_PARTITIONED_TIMER();
1670 
1671 #if USE_ITT_BUILD
1672   __kmp_itt_critical_acquired(lck);
1673 #endif /* USE_ITT_BUILD */
1674 #if OMPT_SUPPORT && OMPT_OPTIONAL
1675   if (ompt_enabled.enabled) {
1676     /* OMPT state update */
1677     ti.state = prev_state;
1678     ti.wait_id = 0;
1679 
1680     /* OMPT event callback */
1681     if (ompt_enabled.ompt_callback_mutex_acquired) {
1682       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
1683           ompt_mutex_critical, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
1684     }
1685   }
1686 #endif
1687 
1688   KMP_PUSH_PARTITIONED_TIMER(OMP_critical);
1689   KA_TRACE(15, ("__kmpc_critical: done T#%d\n", global_tid));
1690 } // __kmpc_critical_with_hint
1691 
1692 #endif // KMP_USE_DYNAMIC_LOCK
1693 
1694 /*!
1695 @ingroup WORK_SHARING
1696 @param loc  source location information.
1697 @param global_tid  global thread number .
1698 @param crit identity of the critical section. This could be a pointer to a lock
1699 associated with the critical section, or some other suitably unique value.
1700 
1701 Leave a critical section, releasing any lock that was held during its execution.
1702 */
1703 void __kmpc_end_critical(ident_t *loc, kmp_int32 global_tid,
1704                          kmp_critical_name *crit) {
1705   kmp_user_lock_p lck;
1706 
1707   KC_TRACE(10, ("__kmpc_end_critical: called T#%d\n", global_tid));
1708 
1709 #if KMP_USE_DYNAMIC_LOCK
1710   int locktag = KMP_EXTRACT_D_TAG(crit);
1711   if (locktag) {
1712     lck = (kmp_user_lock_p)crit;
1713     KMP_ASSERT(lck != NULL);
1714     if (__kmp_env_consistency_check) {
1715       __kmp_pop_sync(global_tid, ct_critical, loc);
1716     }
1717 #if USE_ITT_BUILD
1718     __kmp_itt_critical_releasing(lck);
1719 #endif
1720 #if KMP_USE_INLINED_TAS
1721     if (locktag == locktag_tas && !__kmp_env_consistency_check) {
1722       KMP_RELEASE_TAS_LOCK(lck, global_tid);
1723     } else
1724 #elif KMP_USE_INLINED_FUTEX
1725     if (locktag == locktag_futex && !__kmp_env_consistency_check) {
1726       KMP_RELEASE_FUTEX_LOCK(lck, global_tid);
1727     } else
1728 #endif
1729     {
1730       KMP_D_LOCK_FUNC(lck, unset)((kmp_dyna_lock_t *)lck, global_tid);
1731     }
1732   } else {
1733     kmp_indirect_lock_t *ilk =
1734         (kmp_indirect_lock_t *)TCR_PTR(*((kmp_indirect_lock_t **)crit));
1735     KMP_ASSERT(ilk != NULL);
1736     lck = ilk->lock;
1737     if (__kmp_env_consistency_check) {
1738       __kmp_pop_sync(global_tid, ct_critical, loc);
1739     }
1740 #if USE_ITT_BUILD
1741     __kmp_itt_critical_releasing(lck);
1742 #endif
1743     KMP_I_LOCK_FUNC(ilk, unset)(lck, global_tid);
1744   }
1745 
1746 #else // KMP_USE_DYNAMIC_LOCK
1747 
1748   if ((__kmp_user_lock_kind == lk_tas) &&
1749       (sizeof(lck->tas.lk.poll) <= OMP_CRITICAL_SIZE)) {
1750     lck = (kmp_user_lock_p)crit;
1751   }
1752 #if KMP_USE_FUTEX
1753   else if ((__kmp_user_lock_kind == lk_futex) &&
1754            (sizeof(lck->futex.lk.poll) <= OMP_CRITICAL_SIZE)) {
1755     lck = (kmp_user_lock_p)crit;
1756   }
1757 #endif
1758   else { // ticket, queuing or drdpa
1759     lck = (kmp_user_lock_p)TCR_PTR(*((kmp_user_lock_p *)crit));
1760   }
1761 
1762   KMP_ASSERT(lck != NULL);
1763 
1764   if (__kmp_env_consistency_check)
1765     __kmp_pop_sync(global_tid, ct_critical, loc);
1766 
1767 #if USE_ITT_BUILD
1768   __kmp_itt_critical_releasing(lck);
1769 #endif /* USE_ITT_BUILD */
1770   // Value of 'crit' should be good for using as a critical_id of the critical
1771   // section directive.
1772   __kmp_release_user_lock_with_checks(lck, global_tid);
1773 
1774 #endif // KMP_USE_DYNAMIC_LOCK
1775 
1776 #if OMPT_SUPPORT && OMPT_OPTIONAL
1777   /* OMPT release event triggers after lock is released; place here to trigger
1778    * for all #if branches */
1779   OMPT_STORE_RETURN_ADDRESS(global_tid);
1780   if (ompt_enabled.ompt_callback_mutex_released) {
1781     ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
1782         ompt_mutex_critical, (ompt_wait_id_t)(uintptr_t)lck,
1783         OMPT_LOAD_RETURN_ADDRESS(0));
1784   }
1785 #endif
1786 
1787   KMP_POP_PARTITIONED_TIMER();
1788   KA_TRACE(15, ("__kmpc_end_critical: done T#%d\n", global_tid));
1789 }
1790 
1791 /*!
1792 @ingroup SYNCHRONIZATION
1793 @param loc source location information
1794 @param global_tid thread id.
1795 @return one if the thread should execute the master block, zero otherwise
1796 
1797 Start execution of a combined barrier and master. The barrier is executed inside
1798 this function.
1799 */
1800 kmp_int32 __kmpc_barrier_master(ident_t *loc, kmp_int32 global_tid) {
1801   int status;
1802   KC_TRACE(10, ("__kmpc_barrier_master: called T#%d\n", global_tid));
1803   __kmp_assert_valid_gtid(global_tid);
1804 
1805   if (!TCR_4(__kmp_init_parallel))
1806     __kmp_parallel_initialize();
1807 
1808   __kmp_resume_if_soft_paused();
1809 
1810   if (__kmp_env_consistency_check)
1811     __kmp_check_barrier(global_tid, ct_barrier, loc);
1812 
1813 #if OMPT_SUPPORT
1814   ompt_frame_t *ompt_frame;
1815   if (ompt_enabled.enabled) {
1816     __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
1817     if (ompt_frame->enter_frame.ptr == NULL)
1818       ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1819   }
1820   OMPT_STORE_RETURN_ADDRESS(global_tid);
1821 #endif
1822 #if USE_ITT_NOTIFY
1823   __kmp_threads[global_tid]->th.th_ident = loc;
1824 #endif
1825   status = __kmp_barrier(bs_plain_barrier, global_tid, TRUE, 0, NULL, NULL);
1826 #if OMPT_SUPPORT && OMPT_OPTIONAL
1827   if (ompt_enabled.enabled) {
1828     ompt_frame->enter_frame = ompt_data_none;
1829   }
1830 #endif
1831 
1832   return (status != 0) ? 0 : 1;
1833 }
1834 
1835 /*!
1836 @ingroup SYNCHRONIZATION
1837 @param loc source location information
1838 @param global_tid thread id.
1839 
1840 Complete the execution of a combined barrier and master. This function should
1841 only be called at the completion of the <tt>master</tt> code. Other threads will
1842 still be waiting at the barrier and this call releases them.
1843 */
1844 void __kmpc_end_barrier_master(ident_t *loc, kmp_int32 global_tid) {
1845   KC_TRACE(10, ("__kmpc_end_barrier_master: called T#%d\n", global_tid));
1846   __kmp_assert_valid_gtid(global_tid);
1847   __kmp_end_split_barrier(bs_plain_barrier, global_tid);
1848 }
1849 
1850 /*!
1851 @ingroup SYNCHRONIZATION
1852 @param loc source location information
1853 @param global_tid thread id.
1854 @return one if the thread should execute the master block, zero otherwise
1855 
1856 Start execution of a combined barrier and master(nowait) construct.
1857 The barrier is executed inside this function.
1858 There is no equivalent "end" function, since the
1859 */
1860 kmp_int32 __kmpc_barrier_master_nowait(ident_t *loc, kmp_int32 global_tid) {
1861   kmp_int32 ret;
1862   KC_TRACE(10, ("__kmpc_barrier_master_nowait: called T#%d\n", global_tid));
1863   __kmp_assert_valid_gtid(global_tid);
1864 
1865   if (!TCR_4(__kmp_init_parallel))
1866     __kmp_parallel_initialize();
1867 
1868   __kmp_resume_if_soft_paused();
1869 
1870   if (__kmp_env_consistency_check) {
1871     if (loc == 0) {
1872       KMP_WARNING(ConstructIdentInvalid); // ??? What does it mean for the user?
1873     }
1874     __kmp_check_barrier(global_tid, ct_barrier, loc);
1875   }
1876 
1877 #if OMPT_SUPPORT
1878   ompt_frame_t *ompt_frame;
1879   if (ompt_enabled.enabled) {
1880     __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
1881     if (ompt_frame->enter_frame.ptr == NULL)
1882       ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1883   }
1884   OMPT_STORE_RETURN_ADDRESS(global_tid);
1885 #endif
1886 #if USE_ITT_NOTIFY
1887   __kmp_threads[global_tid]->th.th_ident = loc;
1888 #endif
1889   __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
1890 #if OMPT_SUPPORT && OMPT_OPTIONAL
1891   if (ompt_enabled.enabled) {
1892     ompt_frame->enter_frame = ompt_data_none;
1893   }
1894 #endif
1895 
1896   ret = __kmpc_master(loc, global_tid);
1897 
1898   if (__kmp_env_consistency_check) {
1899     /*  there's no __kmpc_end_master called; so the (stats) */
1900     /*  actions of __kmpc_end_master are done here          */
1901     if (ret) {
1902       /* only one thread should do the pop since only */
1903       /* one did the push (see __kmpc_master())       */
1904       __kmp_pop_sync(global_tid, ct_master, loc);
1905     }
1906   }
1907 
1908   return (ret);
1909 }
1910 
1911 /* The BARRIER for a SINGLE process section is always explicit   */
1912 /*!
1913 @ingroup WORK_SHARING
1914 @param loc  source location information
1915 @param global_tid  global thread number
1916 @return One if this thread should execute the single construct, zero otherwise.
1917 
1918 Test whether to execute a <tt>single</tt> construct.
1919 There are no implicit barriers in the two "single" calls, rather the compiler
1920 should introduce an explicit barrier if it is required.
1921 */
1922 
1923 kmp_int32 __kmpc_single(ident_t *loc, kmp_int32 global_tid) {
1924   __kmp_assert_valid_gtid(global_tid);
1925   kmp_int32 rc = __kmp_enter_single(global_tid, loc, TRUE);
1926 
1927   if (rc) {
1928     // We are going to execute the single statement, so we should count it.
1929     KMP_COUNT_BLOCK(OMP_SINGLE);
1930     KMP_PUSH_PARTITIONED_TIMER(OMP_single);
1931   }
1932 
1933 #if OMPT_SUPPORT && OMPT_OPTIONAL
1934   kmp_info_t *this_thr = __kmp_threads[global_tid];
1935   kmp_team_t *team = this_thr->th.th_team;
1936   int tid = __kmp_tid_from_gtid(global_tid);
1937 
1938   if (ompt_enabled.enabled) {
1939     if (rc) {
1940       if (ompt_enabled.ompt_callback_work) {
1941         ompt_callbacks.ompt_callback(ompt_callback_work)(
1942             ompt_work_single_executor, ompt_scope_begin,
1943             &(team->t.ompt_team_info.parallel_data),
1944             &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
1945             1, OMPT_GET_RETURN_ADDRESS(0));
1946       }
1947     } else {
1948       if (ompt_enabled.ompt_callback_work) {
1949         ompt_callbacks.ompt_callback(ompt_callback_work)(
1950             ompt_work_single_other, ompt_scope_begin,
1951             &(team->t.ompt_team_info.parallel_data),
1952             &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
1953             1, OMPT_GET_RETURN_ADDRESS(0));
1954         ompt_callbacks.ompt_callback(ompt_callback_work)(
1955             ompt_work_single_other, ompt_scope_end,
1956             &(team->t.ompt_team_info.parallel_data),
1957             &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
1958             1, OMPT_GET_RETURN_ADDRESS(0));
1959       }
1960     }
1961   }
1962 #endif
1963 
1964   return rc;
1965 }
1966 
1967 /*!
1968 @ingroup WORK_SHARING
1969 @param loc  source location information
1970 @param global_tid  global thread number
1971 
1972 Mark the end of a <tt>single</tt> construct.  This function should
1973 only be called by the thread that executed the block of code protected
1974 by the `single` construct.
1975 */
1976 void __kmpc_end_single(ident_t *loc, kmp_int32 global_tid) {
1977   __kmp_assert_valid_gtid(global_tid);
1978   __kmp_exit_single(global_tid);
1979   KMP_POP_PARTITIONED_TIMER();
1980 
1981 #if OMPT_SUPPORT && OMPT_OPTIONAL
1982   kmp_info_t *this_thr = __kmp_threads[global_tid];
1983   kmp_team_t *team = this_thr->th.th_team;
1984   int tid = __kmp_tid_from_gtid(global_tid);
1985 
1986   if (ompt_enabled.ompt_callback_work) {
1987     ompt_callbacks.ompt_callback(ompt_callback_work)(
1988         ompt_work_single_executor, ompt_scope_end,
1989         &(team->t.ompt_team_info.parallel_data),
1990         &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data), 1,
1991         OMPT_GET_RETURN_ADDRESS(0));
1992   }
1993 #endif
1994 }
1995 
1996 /*!
1997 @ingroup WORK_SHARING
1998 @param loc Source location
1999 @param global_tid Global thread id
2000 
2001 Mark the end of a statically scheduled loop.
2002 */
2003 void __kmpc_for_static_fini(ident_t *loc, kmp_int32 global_tid) {
2004   KMP_POP_PARTITIONED_TIMER();
2005   KE_TRACE(10, ("__kmpc_for_static_fini called T#%d\n", global_tid));
2006 
2007 #if OMPT_SUPPORT && OMPT_OPTIONAL
2008   if (ompt_enabled.ompt_callback_work) {
2009     ompt_work_t ompt_work_type = ompt_work_loop_static;
2010     ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
2011     ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2012     // Determine workshare type
2013     if (loc != NULL) {
2014       if ((loc->flags & KMP_IDENT_WORK_LOOP) != 0) {
2015         ompt_work_type = ompt_work_loop_static;
2016       } else if ((loc->flags & KMP_IDENT_WORK_SECTIONS) != 0) {
2017         ompt_work_type = ompt_work_sections;
2018       } else if ((loc->flags & KMP_IDENT_WORK_DISTRIBUTE) != 0) {
2019         ompt_work_type = ompt_work_distribute;
2020       } else {
2021         // use default set above.
2022         // a warning about this case is provided in __kmpc_for_static_init
2023       }
2024       KMP_DEBUG_ASSERT(ompt_work_type);
2025     }
2026     ompt_callbacks.ompt_callback(ompt_callback_work)(
2027         ompt_work_type, ompt_scope_end, &(team_info->parallel_data),
2028         &(task_info->task_data), 0, OMPT_GET_RETURN_ADDRESS(0));
2029   }
2030 #endif
2031   if (__kmp_env_consistency_check)
2032     __kmp_pop_workshare(global_tid, ct_pdo, loc);
2033 }
2034 
2035 // User routines which take C-style arguments (call by value)
2036 // different from the Fortran equivalent routines
2037 
2038 void ompc_set_num_threads(int arg) {
2039   // !!!!! TODO: check the per-task binding
2040   __kmp_set_num_threads(arg, __kmp_entry_gtid());
2041 }
2042 
2043 void ompc_set_dynamic(int flag) {
2044   kmp_info_t *thread;
2045 
2046   /* For the thread-private implementation of the internal controls */
2047   thread = __kmp_entry_thread();
2048 
2049   __kmp_save_internal_controls(thread);
2050 
2051   set__dynamic(thread, flag ? true : false);
2052 }
2053 
2054 void ompc_set_nested(int flag) {
2055   kmp_info_t *thread;
2056 
2057   /* For the thread-private internal controls implementation */
2058   thread = __kmp_entry_thread();
2059 
2060   __kmp_save_internal_controls(thread);
2061 
2062   set__max_active_levels(thread, flag ? __kmp_dflt_max_active_levels : 1);
2063 }
2064 
2065 void ompc_set_max_active_levels(int max_active_levels) {
2066   /* TO DO */
2067   /* we want per-task implementation of this internal control */
2068 
2069   /* For the per-thread internal controls implementation */
2070   __kmp_set_max_active_levels(__kmp_entry_gtid(), max_active_levels);
2071 }
2072 
2073 void ompc_set_schedule(omp_sched_t kind, int modifier) {
2074   // !!!!! TODO: check the per-task binding
2075   __kmp_set_schedule(__kmp_entry_gtid(), (kmp_sched_t)kind, modifier);
2076 }
2077 
2078 int ompc_get_ancestor_thread_num(int level) {
2079   return __kmp_get_ancestor_thread_num(__kmp_entry_gtid(), level);
2080 }
2081 
2082 int ompc_get_team_size(int level) {
2083   return __kmp_get_team_size(__kmp_entry_gtid(), level);
2084 }
2085 
2086 /* OpenMP 5.0 Affinity Format API */
2087 void KMP_EXPAND_NAME(ompc_set_affinity_format)(char const *format) {
2088   if (!__kmp_init_serial) {
2089     __kmp_serial_initialize();
2090   }
2091   __kmp_strncpy_truncate(__kmp_affinity_format, KMP_AFFINITY_FORMAT_SIZE,
2092                          format, KMP_STRLEN(format) + 1);
2093 }
2094 
2095 size_t KMP_EXPAND_NAME(ompc_get_affinity_format)(char *buffer, size_t size) {
2096   size_t format_size;
2097   if (!__kmp_init_serial) {
2098     __kmp_serial_initialize();
2099   }
2100   format_size = KMP_STRLEN(__kmp_affinity_format);
2101   if (buffer && size) {
2102     __kmp_strncpy_truncate(buffer, size, __kmp_affinity_format,
2103                            format_size + 1);
2104   }
2105   return format_size;
2106 }
2107 
2108 void KMP_EXPAND_NAME(ompc_display_affinity)(char const *format) {
2109   int gtid;
2110   if (!TCR_4(__kmp_init_middle)) {
2111     __kmp_middle_initialize();
2112   }
2113   __kmp_assign_root_init_mask();
2114   gtid = __kmp_get_gtid();
2115 #if KMP_AFFINITY_SUPPORTED
2116   if (__kmp_threads[gtid]->th.th_team->t.t_level == 0 &&
2117       __kmp_affinity.flags.reset) {
2118     __kmp_reset_root_init_mask(gtid);
2119   }
2120 #endif
2121   __kmp_aux_display_affinity(gtid, format);
2122 }
2123 
2124 size_t KMP_EXPAND_NAME(ompc_capture_affinity)(char *buffer, size_t buf_size,
2125                                               char const *format) {
2126   int gtid;
2127   size_t num_required;
2128   kmp_str_buf_t capture_buf;
2129   if (!TCR_4(__kmp_init_middle)) {
2130     __kmp_middle_initialize();
2131   }
2132   __kmp_assign_root_init_mask();
2133   gtid = __kmp_get_gtid();
2134 #if KMP_AFFINITY_SUPPORTED
2135   if (__kmp_threads[gtid]->th.th_team->t.t_level == 0 &&
2136       __kmp_affinity.flags.reset) {
2137     __kmp_reset_root_init_mask(gtid);
2138   }
2139 #endif
2140   __kmp_str_buf_init(&capture_buf);
2141   num_required = __kmp_aux_capture_affinity(gtid, format, &capture_buf);
2142   if (buffer && buf_size) {
2143     __kmp_strncpy_truncate(buffer, buf_size, capture_buf.str,
2144                            capture_buf.used + 1);
2145   }
2146   __kmp_str_buf_free(&capture_buf);
2147   return num_required;
2148 }
2149 
2150 void kmpc_set_stacksize(int arg) {
2151   // __kmp_aux_set_stacksize initializes the library if needed
2152   __kmp_aux_set_stacksize(arg);
2153 }
2154 
2155 void kmpc_set_stacksize_s(size_t arg) {
2156   // __kmp_aux_set_stacksize initializes the library if needed
2157   __kmp_aux_set_stacksize(arg);
2158 }
2159 
2160 void kmpc_set_blocktime(int arg) {
2161   int gtid, tid, bt = arg;
2162   kmp_info_t *thread;
2163 
2164   gtid = __kmp_entry_gtid();
2165   tid = __kmp_tid_from_gtid(gtid);
2166   thread = __kmp_thread_from_gtid(gtid);
2167 
2168   __kmp_aux_convert_blocktime(&bt);
2169   __kmp_aux_set_blocktime(bt, thread, tid);
2170 }
2171 
2172 void kmpc_set_library(int arg) {
2173   // __kmp_user_set_library initializes the library if needed
2174   __kmp_user_set_library((enum library_type)arg);
2175 }
2176 
2177 void kmpc_set_defaults(char const *str) {
2178   // __kmp_aux_set_defaults initializes the library if needed
2179   __kmp_aux_set_defaults(str, KMP_STRLEN(str));
2180 }
2181 
2182 void kmpc_set_disp_num_buffers(int arg) {
2183   // ignore after initialization because some teams have already
2184   // allocated dispatch buffers
2185   if (__kmp_init_serial == FALSE && arg >= KMP_MIN_DISP_NUM_BUFF &&
2186       arg <= KMP_MAX_DISP_NUM_BUFF) {
2187     __kmp_dispatch_num_buffers = arg;
2188   }
2189 }
2190 
2191 int kmpc_set_affinity_mask_proc(int proc, void **mask) {
2192 #if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED
2193   return -1;
2194 #else
2195   if (!TCR_4(__kmp_init_middle)) {
2196     __kmp_middle_initialize();
2197   }
2198   __kmp_assign_root_init_mask();
2199   return __kmp_aux_set_affinity_mask_proc(proc, mask);
2200 #endif
2201 }
2202 
2203 int kmpc_unset_affinity_mask_proc(int proc, void **mask) {
2204 #if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED
2205   return -1;
2206 #else
2207   if (!TCR_4(__kmp_init_middle)) {
2208     __kmp_middle_initialize();
2209   }
2210   __kmp_assign_root_init_mask();
2211   return __kmp_aux_unset_affinity_mask_proc(proc, mask);
2212 #endif
2213 }
2214 
2215 int kmpc_get_affinity_mask_proc(int proc, void **mask) {
2216 #if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED
2217   return -1;
2218 #else
2219   if (!TCR_4(__kmp_init_middle)) {
2220     __kmp_middle_initialize();
2221   }
2222   __kmp_assign_root_init_mask();
2223   return __kmp_aux_get_affinity_mask_proc(proc, mask);
2224 #endif
2225 }
2226 
2227 /* -------------------------------------------------------------------------- */
2228 /*!
2229 @ingroup THREADPRIVATE
2230 @param loc       source location information
2231 @param gtid      global thread number
2232 @param cpy_size  size of the cpy_data buffer
2233 @param cpy_data  pointer to data to be copied
2234 @param cpy_func  helper function to call for copying data
2235 @param didit     flag variable: 1=single thread; 0=not single thread
2236 
2237 __kmpc_copyprivate implements the interface for the private data broadcast
2238 needed for the copyprivate clause associated with a single region in an
2239 OpenMP<sup>*</sup> program (both C and Fortran).
2240 All threads participating in the parallel region call this routine.
2241 One of the threads (called the single thread) should have the <tt>didit</tt>
2242 variable set to 1 and all other threads should have that variable set to 0.
2243 All threads pass a pointer to a data buffer (cpy_data) that they have built.
2244 
2245 The OpenMP specification forbids the use of nowait on the single region when a
2246 copyprivate clause is present. However, @ref __kmpc_copyprivate implements a
2247 barrier internally to avoid race conditions, so the code generation for the
2248 single region should avoid generating a barrier after the call to @ref
2249 __kmpc_copyprivate.
2250 
2251 The <tt>gtid</tt> parameter is the global thread id for the current thread.
2252 The <tt>loc</tt> parameter is a pointer to source location information.
2253 
2254 Internal implementation: The single thread will first copy its descriptor
2255 address (cpy_data) to a team-private location, then the other threads will each
2256 call the function pointed to by the parameter cpy_func, which carries out the
2257 copy by copying the data using the cpy_data buffer.
2258 
2259 The cpy_func routine used for the copy and the contents of the data area defined
2260 by cpy_data and cpy_size may be built in any fashion that will allow the copy
2261 to be done. For instance, the cpy_data buffer can hold the actual data to be
2262 copied or it may hold a list of pointers to the data. The cpy_func routine must
2263 interpret the cpy_data buffer appropriately.
2264 
2265 The interface to cpy_func is as follows:
2266 @code
2267 void cpy_func( void *destination, void *source )
2268 @endcode
2269 where void *destination is the cpy_data pointer for the thread being copied to
2270 and void *source is the cpy_data pointer for the thread being copied from.
2271 */
2272 void __kmpc_copyprivate(ident_t *loc, kmp_int32 gtid, size_t cpy_size,
2273                         void *cpy_data, void (*cpy_func)(void *, void *),
2274                         kmp_int32 didit) {
2275   void **data_ptr;
2276   KC_TRACE(10, ("__kmpc_copyprivate: called T#%d\n", gtid));
2277   __kmp_assert_valid_gtid(gtid);
2278 
2279   KMP_MB();
2280 
2281   data_ptr = &__kmp_team_from_gtid(gtid)->t.t_copypriv_data;
2282 
2283   if (__kmp_env_consistency_check) {
2284     if (loc == 0) {
2285       KMP_WARNING(ConstructIdentInvalid);
2286     }
2287   }
2288 
2289   // ToDo: Optimize the following two barriers into some kind of split barrier
2290 
2291   if (didit)
2292     *data_ptr = cpy_data;
2293 
2294 #if OMPT_SUPPORT
2295   ompt_frame_t *ompt_frame;
2296   if (ompt_enabled.enabled) {
2297     __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
2298     if (ompt_frame->enter_frame.ptr == NULL)
2299       ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
2300   }
2301   OMPT_STORE_RETURN_ADDRESS(gtid);
2302 #endif
2303 /* This barrier is not a barrier region boundary */
2304 #if USE_ITT_NOTIFY
2305   __kmp_threads[gtid]->th.th_ident = loc;
2306 #endif
2307   __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);
2308 
2309   if (!didit)
2310     (*cpy_func)(cpy_data, *data_ptr);
2311 
2312   // Consider next barrier a user-visible barrier for barrier region boundaries
2313   // Nesting checks are already handled by the single construct checks
2314   {
2315 #if OMPT_SUPPORT
2316     OMPT_STORE_RETURN_ADDRESS(gtid);
2317 #endif
2318 #if USE_ITT_NOTIFY
2319     __kmp_threads[gtid]->th.th_ident = loc; // TODO: check if it is needed (e.g.
2320 // tasks can overwrite the location)
2321 #endif
2322     __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);
2323 #if OMPT_SUPPORT && OMPT_OPTIONAL
2324     if (ompt_enabled.enabled) {
2325       ompt_frame->enter_frame = ompt_data_none;
2326     }
2327 #endif
2328   }
2329 }
2330 
2331 /* --------------------------------------------------------------------------*/
2332 /*!
2333 @ingroup THREADPRIVATE
2334 @param loc       source location information
2335 @param gtid      global thread number
2336 @param cpy_data  pointer to the data to be saved/copied or 0
2337 @return          the saved pointer to the data
2338 
2339 __kmpc_copyprivate_light is a lighter version of __kmpc_copyprivate:
2340 __kmpc_copyprivate_light only saves the pointer it's given (if it's not 0, so
2341 coming from single), and returns that pointer in all calls (for single thread
2342 it's not needed). This version doesn't do any actual data copying. Data copying
2343 has to be done somewhere else, e.g. inline in the generated code. Due to this,
2344 this function doesn't have any barrier at the end of the function, like
2345 __kmpc_copyprivate does, so generated code needs barrier after copying of all
2346 data was done.
2347 */
2348 void *__kmpc_copyprivate_light(ident_t *loc, kmp_int32 gtid, void *cpy_data) {
2349   void **data_ptr;
2350 
2351   KC_TRACE(10, ("__kmpc_copyprivate_light: called T#%d\n", gtid));
2352 
2353   KMP_MB();
2354 
2355   data_ptr = &__kmp_team_from_gtid(gtid)->t.t_copypriv_data;
2356 
2357   if (__kmp_env_consistency_check) {
2358     if (loc == 0) {
2359       KMP_WARNING(ConstructIdentInvalid);
2360     }
2361   }
2362 
2363   // ToDo: Optimize the following barrier
2364 
2365   if (cpy_data)
2366     *data_ptr = cpy_data;
2367 
2368 #if OMPT_SUPPORT
2369   ompt_frame_t *ompt_frame;
2370   if (ompt_enabled.enabled) {
2371     __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
2372     if (ompt_frame->enter_frame.ptr == NULL)
2373       ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
2374     OMPT_STORE_RETURN_ADDRESS(gtid);
2375   }
2376 #endif
2377 /* This barrier is not a barrier region boundary */
2378 #if USE_ITT_NOTIFY
2379   __kmp_threads[gtid]->th.th_ident = loc;
2380 #endif
2381   __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);
2382 
2383   return *data_ptr;
2384 }
2385 
2386 /* -------------------------------------------------------------------------- */
2387 
2388 #define INIT_LOCK __kmp_init_user_lock_with_checks
2389 #define INIT_NESTED_LOCK __kmp_init_nested_user_lock_with_checks
2390 #define ACQUIRE_LOCK __kmp_acquire_user_lock_with_checks
2391 #define ACQUIRE_LOCK_TIMED __kmp_acquire_user_lock_with_checks_timed
2392 #define ACQUIRE_NESTED_LOCK __kmp_acquire_nested_user_lock_with_checks
2393 #define ACQUIRE_NESTED_LOCK_TIMED                                              \
2394   __kmp_acquire_nested_user_lock_with_checks_timed
2395 #define RELEASE_LOCK __kmp_release_user_lock_with_checks
2396 #define RELEASE_NESTED_LOCK __kmp_release_nested_user_lock_with_checks
2397 #define TEST_LOCK __kmp_test_user_lock_with_checks
2398 #define TEST_NESTED_LOCK __kmp_test_nested_user_lock_with_checks
2399 #define DESTROY_LOCK __kmp_destroy_user_lock_with_checks
2400 #define DESTROY_NESTED_LOCK __kmp_destroy_nested_user_lock_with_checks
2401 
2402 // TODO: Make check abort messages use location info & pass it into
2403 // with_checks routines
2404 
2405 #if KMP_USE_DYNAMIC_LOCK
2406 
2407 // internal lock initializer
2408 static __forceinline void __kmp_init_lock_with_hint(ident_t *loc, void **lock,
2409                                                     kmp_dyna_lockseq_t seq) {
2410   if (KMP_IS_D_LOCK(seq)) {
2411     KMP_INIT_D_LOCK(lock, seq);
2412 #if USE_ITT_BUILD
2413     __kmp_itt_lock_creating((kmp_user_lock_p)lock, NULL);
2414 #endif
2415   } else {
2416     KMP_INIT_I_LOCK(lock, seq);
2417 #if USE_ITT_BUILD
2418     kmp_indirect_lock_t *ilk = KMP_LOOKUP_I_LOCK(lock);
2419     __kmp_itt_lock_creating(ilk->lock, loc);
2420 #endif
2421   }
2422 }
2423 
2424 // internal nest lock initializer
2425 static __forceinline void
2426 __kmp_init_nest_lock_with_hint(ident_t *loc, void **lock,
2427                                kmp_dyna_lockseq_t seq) {
2428 #if KMP_USE_TSX
2429   // Don't have nested lock implementation for speculative locks
2430   if (seq == lockseq_hle || seq == lockseq_rtm_queuing ||
2431       seq == lockseq_rtm_spin || seq == lockseq_adaptive)
2432     seq = __kmp_user_lock_seq;
2433 #endif
2434   switch (seq) {
2435   case lockseq_tas:
2436     seq = lockseq_nested_tas;
2437     break;
2438 #if KMP_USE_FUTEX
2439   case lockseq_futex:
2440     seq = lockseq_nested_futex;
2441     break;
2442 #endif
2443   case lockseq_ticket:
2444     seq = lockseq_nested_ticket;
2445     break;
2446   case lockseq_queuing:
2447     seq = lockseq_nested_queuing;
2448     break;
2449   case lockseq_drdpa:
2450     seq = lockseq_nested_drdpa;
2451     break;
2452   default:
2453     seq = lockseq_nested_queuing;
2454   }
2455   KMP_INIT_I_LOCK(lock, seq);
2456 #if USE_ITT_BUILD
2457   kmp_indirect_lock_t *ilk = KMP_LOOKUP_I_LOCK(lock);
2458   __kmp_itt_lock_creating(ilk->lock, loc);
2459 #endif
2460 }
2461 
2462 /* initialize the lock with a hint */
2463 void __kmpc_init_lock_with_hint(ident_t *loc, kmp_int32 gtid, void **user_lock,
2464                                 uintptr_t hint) {
2465   KMP_DEBUG_ASSERT(__kmp_init_serial);
2466   if (__kmp_env_consistency_check && user_lock == NULL) {
2467     KMP_FATAL(LockIsUninitialized, "omp_init_lock_with_hint");
2468   }
2469 
2470   __kmp_init_lock_with_hint(loc, user_lock, __kmp_map_hint_to_lock(hint));
2471 
2472 #if OMPT_SUPPORT && OMPT_OPTIONAL
2473   // This is the case, if called from omp_init_lock_with_hint:
2474   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2475   if (!codeptr)
2476     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2477   if (ompt_enabled.ompt_callback_lock_init) {
2478     ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2479         ompt_mutex_lock, (omp_lock_hint_t)hint,
2480         __ompt_get_mutex_impl_type(user_lock),
2481         (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2482   }
2483 #endif
2484 }
2485 
2486 /* initialize the lock with a hint */
2487 void __kmpc_init_nest_lock_with_hint(ident_t *loc, kmp_int32 gtid,
2488                                      void **user_lock, uintptr_t hint) {
2489   KMP_DEBUG_ASSERT(__kmp_init_serial);
2490   if (__kmp_env_consistency_check && user_lock == NULL) {
2491     KMP_FATAL(LockIsUninitialized, "omp_init_nest_lock_with_hint");
2492   }
2493 
2494   __kmp_init_nest_lock_with_hint(loc, user_lock, __kmp_map_hint_to_lock(hint));
2495 
2496 #if OMPT_SUPPORT && OMPT_OPTIONAL
2497   // This is the case, if called from omp_init_lock_with_hint:
2498   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2499   if (!codeptr)
2500     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2501   if (ompt_enabled.ompt_callback_lock_init) {
2502     ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2503         ompt_mutex_nest_lock, (omp_lock_hint_t)hint,
2504         __ompt_get_mutex_impl_type(user_lock),
2505         (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2506   }
2507 #endif
2508 }
2509 
2510 #endif // KMP_USE_DYNAMIC_LOCK
2511 
2512 /* initialize the lock */
2513 void __kmpc_init_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2514 #if KMP_USE_DYNAMIC_LOCK
2515 
2516   KMP_DEBUG_ASSERT(__kmp_init_serial);
2517   if (__kmp_env_consistency_check && user_lock == NULL) {
2518     KMP_FATAL(LockIsUninitialized, "omp_init_lock");
2519   }
2520   __kmp_init_lock_with_hint(loc, user_lock, __kmp_user_lock_seq);
2521 
2522 #if OMPT_SUPPORT && OMPT_OPTIONAL
2523   // This is the case, if called from omp_init_lock_with_hint:
2524   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2525   if (!codeptr)
2526     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2527   if (ompt_enabled.ompt_callback_lock_init) {
2528     ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2529         ompt_mutex_lock, omp_lock_hint_none,
2530         __ompt_get_mutex_impl_type(user_lock),
2531         (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2532   }
2533 #endif
2534 
2535 #else // KMP_USE_DYNAMIC_LOCK
2536 
2537   static char const *const func = "omp_init_lock";
2538   kmp_user_lock_p lck;
2539   KMP_DEBUG_ASSERT(__kmp_init_serial);
2540 
2541   if (__kmp_env_consistency_check) {
2542     if (user_lock == NULL) {
2543       KMP_FATAL(LockIsUninitialized, func);
2544     }
2545   }
2546 
2547   KMP_CHECK_USER_LOCK_INIT();
2548 
2549   if ((__kmp_user_lock_kind == lk_tas) &&
2550       (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
2551     lck = (kmp_user_lock_p)user_lock;
2552   }
2553 #if KMP_USE_FUTEX
2554   else if ((__kmp_user_lock_kind == lk_futex) &&
2555            (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
2556     lck = (kmp_user_lock_p)user_lock;
2557   }
2558 #endif
2559   else {
2560     lck = __kmp_user_lock_allocate(user_lock, gtid, 0);
2561   }
2562   INIT_LOCK(lck);
2563   __kmp_set_user_lock_location(lck, loc);
2564 
2565 #if OMPT_SUPPORT && OMPT_OPTIONAL
2566   // This is the case, if called from omp_init_lock_with_hint:
2567   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2568   if (!codeptr)
2569     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2570   if (ompt_enabled.ompt_callback_lock_init) {
2571     ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2572         ompt_mutex_lock, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
2573         (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2574   }
2575 #endif
2576 
2577 #if USE_ITT_BUILD
2578   __kmp_itt_lock_creating(lck);
2579 #endif /* USE_ITT_BUILD */
2580 
2581 #endif // KMP_USE_DYNAMIC_LOCK
2582 } // __kmpc_init_lock
2583 
2584 /* initialize the lock */
2585 void __kmpc_init_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2586 #if KMP_USE_DYNAMIC_LOCK
2587 
2588   KMP_DEBUG_ASSERT(__kmp_init_serial);
2589   if (__kmp_env_consistency_check && user_lock == NULL) {
2590     KMP_FATAL(LockIsUninitialized, "omp_init_nest_lock");
2591   }
2592   __kmp_init_nest_lock_with_hint(loc, user_lock, __kmp_user_lock_seq);
2593 
2594 #if OMPT_SUPPORT && OMPT_OPTIONAL
2595   // This is the case, if called from omp_init_lock_with_hint:
2596   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2597   if (!codeptr)
2598     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2599   if (ompt_enabled.ompt_callback_lock_init) {
2600     ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2601         ompt_mutex_nest_lock, omp_lock_hint_none,
2602         __ompt_get_mutex_impl_type(user_lock),
2603         (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2604   }
2605 #endif
2606 
2607 #else // KMP_USE_DYNAMIC_LOCK
2608 
2609   static char const *const func = "omp_init_nest_lock";
2610   kmp_user_lock_p lck;
2611   KMP_DEBUG_ASSERT(__kmp_init_serial);
2612 
2613   if (__kmp_env_consistency_check) {
2614     if (user_lock == NULL) {
2615       KMP_FATAL(LockIsUninitialized, func);
2616     }
2617   }
2618 
2619   KMP_CHECK_USER_LOCK_INIT();
2620 
2621   if ((__kmp_user_lock_kind == lk_tas) &&
2622       (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
2623        OMP_NEST_LOCK_T_SIZE)) {
2624     lck = (kmp_user_lock_p)user_lock;
2625   }
2626 #if KMP_USE_FUTEX
2627   else if ((__kmp_user_lock_kind == lk_futex) &&
2628            (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
2629             OMP_NEST_LOCK_T_SIZE)) {
2630     lck = (kmp_user_lock_p)user_lock;
2631   }
2632 #endif
2633   else {
2634     lck = __kmp_user_lock_allocate(user_lock, gtid, 0);
2635   }
2636 
2637   INIT_NESTED_LOCK(lck);
2638   __kmp_set_user_lock_location(lck, loc);
2639 
2640 #if OMPT_SUPPORT && OMPT_OPTIONAL
2641   // This is the case, if called from omp_init_lock_with_hint:
2642   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2643   if (!codeptr)
2644     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2645   if (ompt_enabled.ompt_callback_lock_init) {
2646     ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2647         ompt_mutex_nest_lock, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
2648         (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2649   }
2650 #endif
2651 
2652 #if USE_ITT_BUILD
2653   __kmp_itt_lock_creating(lck);
2654 #endif /* USE_ITT_BUILD */
2655 
2656 #endif // KMP_USE_DYNAMIC_LOCK
2657 } // __kmpc_init_nest_lock
2658 
2659 void __kmpc_destroy_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2660 #if KMP_USE_DYNAMIC_LOCK
2661 
2662 #if USE_ITT_BUILD
2663   kmp_user_lock_p lck;
2664   if (KMP_EXTRACT_D_TAG(user_lock) == 0) {
2665     lck = ((kmp_indirect_lock_t *)KMP_LOOKUP_I_LOCK(user_lock))->lock;
2666   } else {
2667     lck = (kmp_user_lock_p)user_lock;
2668   }
2669   __kmp_itt_lock_destroyed(lck);
2670 #endif
2671 #if OMPT_SUPPORT && OMPT_OPTIONAL
2672   // This is the case, if called from omp_init_lock_with_hint:
2673   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2674   if (!codeptr)
2675     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2676   if (ompt_enabled.ompt_callback_lock_destroy) {
2677     ompt_callbacks.ompt_callback(ompt_callback_lock_destroy)(
2678         ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2679   }
2680 #endif
2681   KMP_D_LOCK_FUNC(user_lock, destroy)((kmp_dyna_lock_t *)user_lock);
2682 #else
2683   kmp_user_lock_p lck;
2684 
2685   if ((__kmp_user_lock_kind == lk_tas) &&
2686       (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
2687     lck = (kmp_user_lock_p)user_lock;
2688   }
2689 #if KMP_USE_FUTEX
2690   else if ((__kmp_user_lock_kind == lk_futex) &&
2691            (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
2692     lck = (kmp_user_lock_p)user_lock;
2693   }
2694 #endif
2695   else {
2696     lck = __kmp_lookup_user_lock(user_lock, "omp_destroy_lock");
2697   }
2698 
2699 #if OMPT_SUPPORT && OMPT_OPTIONAL
2700   // This is the case, if called from omp_init_lock_with_hint:
2701   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2702   if (!codeptr)
2703     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2704   if (ompt_enabled.ompt_callback_lock_destroy) {
2705     ompt_callbacks.ompt_callback(ompt_callback_lock_destroy)(
2706         ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2707   }
2708 #endif
2709 
2710 #if USE_ITT_BUILD
2711   __kmp_itt_lock_destroyed(lck);
2712 #endif /* USE_ITT_BUILD */
2713   DESTROY_LOCK(lck);
2714 
2715   if ((__kmp_user_lock_kind == lk_tas) &&
2716       (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
2717     ;
2718   }
2719 #if KMP_USE_FUTEX
2720   else if ((__kmp_user_lock_kind == lk_futex) &&
2721            (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
2722     ;
2723   }
2724 #endif
2725   else {
2726     __kmp_user_lock_free(user_lock, gtid, lck);
2727   }
2728 #endif // KMP_USE_DYNAMIC_LOCK
2729 } // __kmpc_destroy_lock
2730 
2731 /* destroy the lock */
2732 void __kmpc_destroy_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2733 #if KMP_USE_DYNAMIC_LOCK
2734 
2735 #if USE_ITT_BUILD
2736   kmp_indirect_lock_t *ilk = KMP_LOOKUP_I_LOCK(user_lock);
2737   __kmp_itt_lock_destroyed(ilk->lock);
2738 #endif
2739 #if OMPT_SUPPORT && OMPT_OPTIONAL
2740   // This is the case, if called from omp_init_lock_with_hint:
2741   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2742   if (!codeptr)
2743     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2744   if (ompt_enabled.ompt_callback_lock_destroy) {
2745     ompt_callbacks.ompt_callback(ompt_callback_lock_destroy)(
2746         ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2747   }
2748 #endif
2749   KMP_D_LOCK_FUNC(user_lock, destroy)((kmp_dyna_lock_t *)user_lock);
2750 
2751 #else // KMP_USE_DYNAMIC_LOCK
2752 
2753   kmp_user_lock_p lck;
2754 
2755   if ((__kmp_user_lock_kind == lk_tas) &&
2756       (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
2757        OMP_NEST_LOCK_T_SIZE)) {
2758     lck = (kmp_user_lock_p)user_lock;
2759   }
2760 #if KMP_USE_FUTEX
2761   else if ((__kmp_user_lock_kind == lk_futex) &&
2762            (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
2763             OMP_NEST_LOCK_T_SIZE)) {
2764     lck = (kmp_user_lock_p)user_lock;
2765   }
2766 #endif
2767   else {
2768     lck = __kmp_lookup_user_lock(user_lock, "omp_destroy_nest_lock");
2769   }
2770 
2771 #if OMPT_SUPPORT && OMPT_OPTIONAL
2772   // This is the case, if called from omp_init_lock_with_hint:
2773   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2774   if (!codeptr)
2775     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2776   if (ompt_enabled.ompt_callback_lock_destroy) {
2777     ompt_callbacks.ompt_callback(ompt_callback_lock_destroy)(
2778         ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2779   }
2780 #endif
2781 
2782 #if USE_ITT_BUILD
2783   __kmp_itt_lock_destroyed(lck);
2784 #endif /* USE_ITT_BUILD */
2785 
2786   DESTROY_NESTED_LOCK(lck);
2787 
2788   if ((__kmp_user_lock_kind == lk_tas) &&
2789       (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
2790        OMP_NEST_LOCK_T_SIZE)) {
2791     ;
2792   }
2793 #if KMP_USE_FUTEX
2794   else if ((__kmp_user_lock_kind == lk_futex) &&
2795            (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
2796             OMP_NEST_LOCK_T_SIZE)) {
2797     ;
2798   }
2799 #endif
2800   else {
2801     __kmp_user_lock_free(user_lock, gtid, lck);
2802   }
2803 #endif // KMP_USE_DYNAMIC_LOCK
2804 } // __kmpc_destroy_nest_lock
2805 
2806 void __kmpc_set_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2807   KMP_COUNT_BLOCK(OMP_set_lock);
2808 #if KMP_USE_DYNAMIC_LOCK
2809   int tag = KMP_EXTRACT_D_TAG(user_lock);
2810 #if USE_ITT_BUILD
2811   __kmp_itt_lock_acquiring(
2812       (kmp_user_lock_p)
2813           user_lock); // itt function will get to the right lock object.
2814 #endif
2815 #if OMPT_SUPPORT && OMPT_OPTIONAL
2816   // This is the case, if called from omp_init_lock_with_hint:
2817   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2818   if (!codeptr)
2819     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2820   if (ompt_enabled.ompt_callback_mutex_acquire) {
2821     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
2822         ompt_mutex_lock, omp_lock_hint_none,
2823         __ompt_get_mutex_impl_type(user_lock),
2824         (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2825   }
2826 #endif
2827 #if KMP_USE_INLINED_TAS
2828   if (tag == locktag_tas && !__kmp_env_consistency_check) {
2829     KMP_ACQUIRE_TAS_LOCK(user_lock, gtid);
2830   } else
2831 #elif KMP_USE_INLINED_FUTEX
2832   if (tag == locktag_futex && !__kmp_env_consistency_check) {
2833     KMP_ACQUIRE_FUTEX_LOCK(user_lock, gtid);
2834   } else
2835 #endif
2836   {
2837     __kmp_direct_set[tag]((kmp_dyna_lock_t *)user_lock, gtid);
2838   }
2839 #if USE_ITT_BUILD
2840   __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock);
2841 #endif
2842 #if OMPT_SUPPORT && OMPT_OPTIONAL
2843   if (ompt_enabled.ompt_callback_mutex_acquired) {
2844     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
2845         ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2846   }
2847 #endif
2848 
2849 #else // KMP_USE_DYNAMIC_LOCK
2850 
2851   kmp_user_lock_p lck;
2852 
2853   if ((__kmp_user_lock_kind == lk_tas) &&
2854       (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
2855     lck = (kmp_user_lock_p)user_lock;
2856   }
2857 #if KMP_USE_FUTEX
2858   else if ((__kmp_user_lock_kind == lk_futex) &&
2859            (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
2860     lck = (kmp_user_lock_p)user_lock;
2861   }
2862 #endif
2863   else {
2864     lck = __kmp_lookup_user_lock(user_lock, "omp_set_lock");
2865   }
2866 
2867 #if USE_ITT_BUILD
2868   __kmp_itt_lock_acquiring(lck);
2869 #endif /* USE_ITT_BUILD */
2870 #if OMPT_SUPPORT && OMPT_OPTIONAL
2871   // This is the case, if called from omp_init_lock_with_hint:
2872   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2873   if (!codeptr)
2874     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2875   if (ompt_enabled.ompt_callback_mutex_acquire) {
2876     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
2877         ompt_mutex_lock, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
2878         (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2879   }
2880 #endif
2881 
2882   ACQUIRE_LOCK(lck, gtid);
2883 
2884 #if USE_ITT_BUILD
2885   __kmp_itt_lock_acquired(lck);
2886 #endif /* USE_ITT_BUILD */
2887 
2888 #if OMPT_SUPPORT && OMPT_OPTIONAL
2889   if (ompt_enabled.ompt_callback_mutex_acquired) {
2890     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
2891         ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2892   }
2893 #endif
2894 
2895 #endif // KMP_USE_DYNAMIC_LOCK
2896 }
2897 
2898 void __kmpc_set_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2899 #if KMP_USE_DYNAMIC_LOCK
2900 
2901 #if USE_ITT_BUILD
2902   __kmp_itt_lock_acquiring((kmp_user_lock_p)user_lock);
2903 #endif
2904 #if OMPT_SUPPORT && OMPT_OPTIONAL
2905   // This is the case, if called from omp_init_lock_with_hint:
2906   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2907   if (!codeptr)
2908     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2909   if (ompt_enabled.enabled) {
2910     if (ompt_enabled.ompt_callback_mutex_acquire) {
2911       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
2912           ompt_mutex_nest_lock, omp_lock_hint_none,
2913           __ompt_get_mutex_impl_type(user_lock),
2914           (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2915     }
2916   }
2917 #endif
2918   int acquire_status =
2919       KMP_D_LOCK_FUNC(user_lock, set)((kmp_dyna_lock_t *)user_lock, gtid);
2920   (void)acquire_status;
2921 #if USE_ITT_BUILD
2922   __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock);
2923 #endif
2924 
2925 #if OMPT_SUPPORT && OMPT_OPTIONAL
2926   if (ompt_enabled.enabled) {
2927     if (acquire_status == KMP_LOCK_ACQUIRED_FIRST) {
2928       if (ompt_enabled.ompt_callback_mutex_acquired) {
2929         // lock_first
2930         ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
2931             ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)user_lock,
2932             codeptr);
2933       }
2934     } else {
2935       if (ompt_enabled.ompt_callback_nest_lock) {
2936         // lock_next
2937         ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
2938             ompt_scope_begin, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2939       }
2940     }
2941   }
2942 #endif
2943 
2944 #else // KMP_USE_DYNAMIC_LOCK
2945   int acquire_status;
2946   kmp_user_lock_p lck;
2947 
2948   if ((__kmp_user_lock_kind == lk_tas) &&
2949       (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
2950        OMP_NEST_LOCK_T_SIZE)) {
2951     lck = (kmp_user_lock_p)user_lock;
2952   }
2953 #if KMP_USE_FUTEX
2954   else if ((__kmp_user_lock_kind == lk_futex) &&
2955            (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
2956             OMP_NEST_LOCK_T_SIZE)) {
2957     lck = (kmp_user_lock_p)user_lock;
2958   }
2959 #endif
2960   else {
2961     lck = __kmp_lookup_user_lock(user_lock, "omp_set_nest_lock");
2962   }
2963 
2964 #if USE_ITT_BUILD
2965   __kmp_itt_lock_acquiring(lck);
2966 #endif /* USE_ITT_BUILD */
2967 #if OMPT_SUPPORT && OMPT_OPTIONAL
2968   // This is the case, if called from omp_init_lock_with_hint:
2969   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2970   if (!codeptr)
2971     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2972   if (ompt_enabled.enabled) {
2973     if (ompt_enabled.ompt_callback_mutex_acquire) {
2974       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
2975           ompt_mutex_nest_lock, omp_lock_hint_none,
2976           __ompt_get_mutex_impl_type(), (ompt_wait_id_t)(uintptr_t)lck,
2977           codeptr);
2978     }
2979   }
2980 #endif
2981 
2982   ACQUIRE_NESTED_LOCK(lck, gtid, &acquire_status);
2983 
2984 #if USE_ITT_BUILD
2985   __kmp_itt_lock_acquired(lck);
2986 #endif /* USE_ITT_BUILD */
2987 
2988 #if OMPT_SUPPORT && OMPT_OPTIONAL
2989   if (ompt_enabled.enabled) {
2990     if (acquire_status == KMP_LOCK_ACQUIRED_FIRST) {
2991       if (ompt_enabled.ompt_callback_mutex_acquired) {
2992         // lock_first
2993         ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
2994             ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2995       }
2996     } else {
2997       if (ompt_enabled.ompt_callback_nest_lock) {
2998         // lock_next
2999         ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
3000             ompt_scope_begin, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
3001       }
3002     }
3003   }
3004 #endif
3005 
3006 #endif // KMP_USE_DYNAMIC_LOCK
3007 }
3008 
3009 void __kmpc_unset_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
3010 #if KMP_USE_DYNAMIC_LOCK
3011 
3012   int tag = KMP_EXTRACT_D_TAG(user_lock);
3013 #if USE_ITT_BUILD
3014   __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock);
3015 #endif
3016 #if KMP_USE_INLINED_TAS
3017   if (tag == locktag_tas && !__kmp_env_consistency_check) {
3018     KMP_RELEASE_TAS_LOCK(user_lock, gtid);
3019   } else
3020 #elif KMP_USE_INLINED_FUTEX
3021   if (tag == locktag_futex && !__kmp_env_consistency_check) {
3022     KMP_RELEASE_FUTEX_LOCK(user_lock, gtid);
3023   } else
3024 #endif
3025   {
3026     __kmp_direct_unset[tag]((kmp_dyna_lock_t *)user_lock, gtid);
3027   }
3028 
3029 #if OMPT_SUPPORT && OMPT_OPTIONAL
3030   // This is the case, if called from omp_init_lock_with_hint:
3031   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
3032   if (!codeptr)
3033     codeptr = OMPT_GET_RETURN_ADDRESS(0);
3034   if (ompt_enabled.ompt_callback_mutex_released) {
3035     ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
3036         ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
3037   }
3038 #endif
3039 
3040 #else // KMP_USE_DYNAMIC_LOCK
3041 
3042   kmp_user_lock_p lck;
3043 
3044   /* Can't use serial interval since not block structured */
3045   /* release the lock */
3046 
3047   if ((__kmp_user_lock_kind == lk_tas) &&
3048       (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
3049 #if KMP_OS_LINUX &&                                                            \
3050     (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
3051 // "fast" path implemented to fix customer performance issue
3052 #if USE_ITT_BUILD
3053     __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock);
3054 #endif /* USE_ITT_BUILD */
3055     TCW_4(((kmp_user_lock_p)user_lock)->tas.lk.poll, 0);
3056     KMP_MB();
3057 
3058 #if OMPT_SUPPORT && OMPT_OPTIONAL
3059     // This is the case, if called from omp_init_lock_with_hint:
3060     void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
3061     if (!codeptr)
3062       codeptr = OMPT_GET_RETURN_ADDRESS(0);
3063     if (ompt_enabled.ompt_callback_mutex_released) {
3064       ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
3065           ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
3066     }
3067 #endif
3068 
3069     return;
3070 #else
3071     lck = (kmp_user_lock_p)user_lock;
3072 #endif
3073   }
3074 #if KMP_USE_FUTEX
3075   else if ((__kmp_user_lock_kind == lk_futex) &&
3076            (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
3077     lck = (kmp_user_lock_p)user_lock;
3078   }
3079 #endif
3080   else {
3081     lck = __kmp_lookup_user_lock(user_lock, "omp_unset_lock");
3082   }
3083 
3084 #if USE_ITT_BUILD
3085   __kmp_itt_lock_releasing(lck);
3086 #endif /* USE_ITT_BUILD */
3087 
3088   RELEASE_LOCK(lck, gtid);
3089 
3090 #if OMPT_SUPPORT && OMPT_OPTIONAL
3091   // This is the case, if called from omp_init_lock_with_hint:
3092   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
3093   if (!codeptr)
3094     codeptr = OMPT_GET_RETURN_ADDRESS(0);
3095   if (ompt_enabled.ompt_callback_mutex_released) {
3096     ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
3097         ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
3098   }
3099 #endif
3100 
3101 #endif // KMP_USE_DYNAMIC_LOCK
3102 }
3103 
3104 /* release the lock */
3105 void __kmpc_unset_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
3106 #if KMP_USE_DYNAMIC_LOCK
3107 
3108 #if USE_ITT_BUILD
3109   __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock);
3110 #endif
3111   int release_status =
3112       KMP_D_LOCK_FUNC(user_lock, unset)((kmp_dyna_lock_t *)user_lock, gtid);
3113   (void)release_status;
3114 
3115 #if OMPT_SUPPORT && OMPT_OPTIONAL
3116   // This is the case, if called from omp_init_lock_with_hint:
3117   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
3118   if (!codeptr)
3119     codeptr = OMPT_GET_RETURN_ADDRESS(0);
3120   if (ompt_enabled.enabled) {
3121     if (release_status == KMP_LOCK_RELEASED) {
3122       if (ompt_enabled.ompt_callback_mutex_released) {
3123         // release_lock_last
3124         ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
3125             ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)user_lock,
3126             codeptr);
3127       }
3128     } else if (ompt_enabled.ompt_callback_nest_lock) {
3129       // release_lock_prev
3130       ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
3131           ompt_scope_end, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
3132     }
3133   }
3134 #endif
3135 
3136 #else // KMP_USE_DYNAMIC_LOCK
3137 
3138   kmp_user_lock_p lck;
3139 
3140   /* Can't use serial interval since not block structured */
3141 
3142   if ((__kmp_user_lock_kind == lk_tas) &&
3143       (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
3144        OMP_NEST_LOCK_T_SIZE)) {
3145 #if KMP_OS_LINUX &&                                                            \
3146     (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
3147     // "fast" path implemented to fix customer performance issue
3148     kmp_tas_lock_t *tl = (kmp_tas_lock_t *)user_lock;
3149 #if USE_ITT_BUILD
3150     __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock);
3151 #endif /* USE_ITT_BUILD */
3152 
3153 #if OMPT_SUPPORT && OMPT_OPTIONAL
3154     int release_status = KMP_LOCK_STILL_HELD;
3155 #endif
3156 
3157     if (--(tl->lk.depth_locked) == 0) {
3158       TCW_4(tl->lk.poll, 0);
3159 #if OMPT_SUPPORT && OMPT_OPTIONAL
3160       release_status = KMP_LOCK_RELEASED;
3161 #endif
3162     }
3163     KMP_MB();
3164 
3165 #if OMPT_SUPPORT && OMPT_OPTIONAL
3166     // This is the case, if called from omp_init_lock_with_hint:
3167     void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
3168     if (!codeptr)
3169       codeptr = OMPT_GET_RETURN_ADDRESS(0);
3170     if (ompt_enabled.enabled) {
3171       if (release_status == KMP_LOCK_RELEASED) {
3172         if (ompt_enabled.ompt_callback_mutex_released) {
3173           // release_lock_last
3174           ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
3175               ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
3176         }
3177       } else if (ompt_enabled.ompt_callback_nest_lock) {
3178         // release_lock_previous
3179         ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
3180             ompt_mutex_scope_end, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
3181       }
3182     }
3183 #endif
3184 
3185     return;
3186 #else
3187     lck = (kmp_user_lock_p)user_lock;
3188 #endif
3189   }
3190 #if KMP_USE_FUTEX
3191   else if ((__kmp_user_lock_kind == lk_futex) &&
3192            (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
3193             OMP_NEST_LOCK_T_SIZE)) {
3194     lck = (kmp_user_lock_p)user_lock;
3195   }
3196 #endif
3197   else {
3198     lck = __kmp_lookup_user_lock(user_lock, "omp_unset_nest_lock");
3199   }
3200 
3201 #if USE_ITT_BUILD
3202   __kmp_itt_lock_releasing(lck);
3203 #endif /* USE_ITT_BUILD */
3204 
3205   int release_status;
3206   release_status = RELEASE_NESTED_LOCK(lck, gtid);
3207 #if OMPT_SUPPORT && OMPT_OPTIONAL
3208   // This is the case, if called from omp_init_lock_with_hint:
3209   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
3210   if (!codeptr)
3211     codeptr = OMPT_GET_RETURN_ADDRESS(0);
3212   if (ompt_enabled.enabled) {
3213     if (release_status == KMP_LOCK_RELEASED) {
3214       if (ompt_enabled.ompt_callback_mutex_released) {
3215         // release_lock_last
3216         ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
3217             ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
3218       }
3219     } else if (ompt_enabled.ompt_callback_nest_lock) {
3220       // release_lock_previous
3221       ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
3222           ompt_mutex_scope_end, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
3223     }
3224   }
3225 #endif
3226 
3227 #endif // KMP_USE_DYNAMIC_LOCK
3228 }
3229 
3230 /* try to acquire the lock */
3231 int __kmpc_test_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
3232   KMP_COUNT_BLOCK(OMP_test_lock);
3233 
3234 #if KMP_USE_DYNAMIC_LOCK
3235   int rc;
3236   int tag = KMP_EXTRACT_D_TAG(user_lock);
3237 #if USE_ITT_BUILD
3238   __kmp_itt_lock_acquiring((kmp_user_lock_p)user_lock);
3239 #endif
3240 #if OMPT_SUPPORT && OMPT_OPTIONAL
3241   // This is the case, if called from omp_init_lock_with_hint:
3242   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
3243   if (!codeptr)
3244     codeptr = OMPT_GET_RETURN_ADDRESS(0);
3245   if (ompt_enabled.ompt_callback_mutex_acquire) {
3246     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
3247         ompt_mutex_test_lock, omp_lock_hint_none,
3248         __ompt_get_mutex_impl_type(user_lock),
3249         (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
3250   }
3251 #endif
3252 #if KMP_USE_INLINED_TAS
3253   if (tag == locktag_tas && !__kmp_env_consistency_check) {
3254     KMP_TEST_TAS_LOCK(user_lock, gtid, rc);
3255   } else
3256 #elif KMP_USE_INLINED_FUTEX
3257   if (tag == locktag_futex && !__kmp_env_consistency_check) {
3258     KMP_TEST_FUTEX_LOCK(user_lock, gtid, rc);
3259   } else
3260 #endif
3261   {
3262     rc = __kmp_direct_test[tag]((kmp_dyna_lock_t *)user_lock, gtid);
3263   }
3264   if (rc) {
3265 #if USE_ITT_BUILD
3266     __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock);
3267 #endif
3268 #if OMPT_SUPPORT && OMPT_OPTIONAL
3269     if (ompt_enabled.ompt_callback_mutex_acquired) {
3270       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
3271           ompt_mutex_test_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
3272     }
3273 #endif
3274     return FTN_TRUE;
3275   } else {
3276 #if USE_ITT_BUILD
3277     __kmp_itt_lock_cancelled((kmp_user_lock_p)user_lock);
3278 #endif
3279     return FTN_FALSE;
3280   }
3281 
3282 #else // KMP_USE_DYNAMIC_LOCK
3283 
3284   kmp_user_lock_p lck;
3285   int rc;
3286 
3287   if ((__kmp_user_lock_kind == lk_tas) &&
3288       (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
3289     lck = (kmp_user_lock_p)user_lock;
3290   }
3291 #if KMP_USE_FUTEX
3292   else if ((__kmp_user_lock_kind == lk_futex) &&
3293            (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
3294     lck = (kmp_user_lock_p)user_lock;
3295   }
3296 #endif
3297   else {
3298     lck = __kmp_lookup_user_lock(user_lock, "omp_test_lock");
3299   }
3300 
3301 #if USE_ITT_BUILD
3302   __kmp_itt_lock_acquiring(lck);
3303 #endif /* USE_ITT_BUILD */
3304 #if OMPT_SUPPORT && OMPT_OPTIONAL
3305   // This is the case, if called from omp_init_lock_with_hint:
3306   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
3307   if (!codeptr)
3308     codeptr = OMPT_GET_RETURN_ADDRESS(0);
3309   if (ompt_enabled.ompt_callback_mutex_acquire) {
3310     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
3311         ompt_mutex_test_lock, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
3312         (ompt_wait_id_t)(uintptr_t)lck, codeptr);
3313   }
3314 #endif
3315 
3316   rc = TEST_LOCK(lck, gtid);
3317 #if USE_ITT_BUILD
3318   if (rc) {
3319     __kmp_itt_lock_acquired(lck);
3320   } else {
3321     __kmp_itt_lock_cancelled(lck);
3322   }
3323 #endif /* USE_ITT_BUILD */
3324 #if OMPT_SUPPORT && OMPT_OPTIONAL
3325   if (rc && ompt_enabled.ompt_callback_mutex_acquired) {
3326     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
3327         ompt_mutex_test_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
3328   }
3329 #endif
3330 
3331   return (rc ? FTN_TRUE : FTN_FALSE);
3332 
3333   /* Can't use serial interval since not block structured */
3334 
3335 #endif // KMP_USE_DYNAMIC_LOCK
3336 }
3337 
3338 /* try to acquire the lock */
3339 int __kmpc_test_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
3340 #if KMP_USE_DYNAMIC_LOCK
3341   int rc;
3342 #if USE_ITT_BUILD
3343   __kmp_itt_lock_acquiring((kmp_user_lock_p)user_lock);
3344 #endif
3345 #if OMPT_SUPPORT && OMPT_OPTIONAL
3346   // This is the case, if called from omp_init_lock_with_hint:
3347   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
3348   if (!codeptr)
3349     codeptr = OMPT_GET_RETURN_ADDRESS(0);
3350   if (ompt_enabled.ompt_callback_mutex_acquire) {
3351     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
3352         ompt_mutex_test_nest_lock, omp_lock_hint_none,
3353         __ompt_get_mutex_impl_type(user_lock),
3354         (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
3355   }
3356 #endif
3357   rc = KMP_D_LOCK_FUNC(user_lock, test)((kmp_dyna_lock_t *)user_lock, gtid);
3358 #if USE_ITT_BUILD
3359   if (rc) {
3360     __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock);
3361   } else {
3362     __kmp_itt_lock_cancelled((kmp_user_lock_p)user_lock);
3363   }
3364 #endif
3365 #if OMPT_SUPPORT && OMPT_OPTIONAL
3366   if (ompt_enabled.enabled && rc) {
3367     if (rc == 1) {
3368       if (ompt_enabled.ompt_callback_mutex_acquired) {
3369         // lock_first
3370         ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
3371             ompt_mutex_test_nest_lock, (ompt_wait_id_t)(uintptr_t)user_lock,
3372             codeptr);
3373       }
3374     } else {
3375       if (ompt_enabled.ompt_callback_nest_lock) {
3376         // lock_next
3377         ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
3378             ompt_scope_begin, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
3379       }
3380     }
3381   }
3382 #endif
3383   return rc;
3384 
3385 #else // KMP_USE_DYNAMIC_LOCK
3386 
3387   kmp_user_lock_p lck;
3388   int rc;
3389 
3390   if ((__kmp_user_lock_kind == lk_tas) &&
3391       (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
3392        OMP_NEST_LOCK_T_SIZE)) {
3393     lck = (kmp_user_lock_p)user_lock;
3394   }
3395 #if KMP_USE_FUTEX
3396   else if ((__kmp_user_lock_kind == lk_futex) &&
3397            (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
3398             OMP_NEST_LOCK_T_SIZE)) {
3399     lck = (kmp_user_lock_p)user_lock;
3400   }
3401 #endif
3402   else {
3403     lck = __kmp_lookup_user_lock(user_lock, "omp_test_nest_lock");
3404   }
3405 
3406 #if USE_ITT_BUILD
3407   __kmp_itt_lock_acquiring(lck);
3408 #endif /* USE_ITT_BUILD */
3409 
3410 #if OMPT_SUPPORT && OMPT_OPTIONAL
3411   // This is the case, if called from omp_init_lock_with_hint:
3412   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
3413   if (!codeptr)
3414     codeptr = OMPT_GET_RETURN_ADDRESS(0);
3415   if (ompt_enabled.enabled) &&
3416         ompt_enabled.ompt_callback_mutex_acquire) {
3417       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
3418           ompt_mutex_test_nest_lock, omp_lock_hint_none,
3419           __ompt_get_mutex_impl_type(), (ompt_wait_id_t)(uintptr_t)lck,
3420           codeptr);
3421     }
3422 #endif
3423 
3424   rc = TEST_NESTED_LOCK(lck, gtid);
3425 #if USE_ITT_BUILD
3426   if (rc) {
3427     __kmp_itt_lock_acquired(lck);
3428   } else {
3429     __kmp_itt_lock_cancelled(lck);
3430   }
3431 #endif /* USE_ITT_BUILD */
3432 #if OMPT_SUPPORT && OMPT_OPTIONAL
3433   if (ompt_enabled.enabled && rc) {
3434     if (rc == 1) {
3435       if (ompt_enabled.ompt_callback_mutex_acquired) {
3436         // lock_first
3437         ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
3438             ompt_mutex_test_nest_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
3439       }
3440     } else {
3441       if (ompt_enabled.ompt_callback_nest_lock) {
3442         // lock_next
3443         ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
3444             ompt_mutex_scope_begin, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
3445       }
3446     }
3447   }
3448 #endif
3449   return rc;
3450 
3451   /* Can't use serial interval since not block structured */
3452 
3453 #endif // KMP_USE_DYNAMIC_LOCK
3454 }
3455 
3456 // Interface to fast scalable reduce methods routines
3457 
3458 // keep the selected method in a thread local structure for cross-function
3459 // usage: will be used in __kmpc_end_reduce* functions;
3460 // another solution: to re-determine the method one more time in
3461 // __kmpc_end_reduce* functions (new prototype required then)
3462 // AT: which solution is better?
3463 #define __KMP_SET_REDUCTION_METHOD(gtid, rmethod)                              \
3464   ((__kmp_threads[(gtid)]->th.th_local.packed_reduction_method) = (rmethod))
3465 
3466 #define __KMP_GET_REDUCTION_METHOD(gtid)                                       \
3467   (__kmp_threads[(gtid)]->th.th_local.packed_reduction_method)
3468 
3469 // description of the packed_reduction_method variable: look at the macros in
3470 // kmp.h
3471 
3472 // used in a critical section reduce block
3473 static __forceinline void
3474 __kmp_enter_critical_section_reduce_block(ident_t *loc, kmp_int32 global_tid,
3475                                           kmp_critical_name *crit) {
3476 
3477   // this lock was visible to a customer and to the threading profile tool as a
3478   // serial overhead span (although it's used for an internal purpose only)
3479   //            why was it visible in previous implementation?
3480   //            should we keep it visible in new reduce block?
3481   kmp_user_lock_p lck;
3482 
3483 #if KMP_USE_DYNAMIC_LOCK
3484 
3485   kmp_dyna_lock_t *lk = (kmp_dyna_lock_t *)crit;
3486   // Check if it is initialized.
3487   if (*lk == 0) {
3488     if (KMP_IS_D_LOCK(__kmp_user_lock_seq)) {
3489       KMP_COMPARE_AND_STORE_ACQ32((volatile kmp_int32 *)crit, 0,
3490                                   KMP_GET_D_TAG(__kmp_user_lock_seq));
3491     } else {
3492       __kmp_init_indirect_csptr(crit, loc, global_tid,
3493                                 KMP_GET_I_TAG(__kmp_user_lock_seq));
3494     }
3495   }
3496   // Branch for accessing the actual lock object and set operation. This
3497   // branching is inevitable since this lock initialization does not follow the
3498   // normal dispatch path (lock table is not used).
3499   if (KMP_EXTRACT_D_TAG(lk) != 0) {
3500     lck = (kmp_user_lock_p)lk;
3501     KMP_DEBUG_ASSERT(lck != NULL);
3502     if (__kmp_env_consistency_check) {
3503       __kmp_push_sync(global_tid, ct_critical, loc, lck, __kmp_user_lock_seq);
3504     }
3505     KMP_D_LOCK_FUNC(lk, set)(lk, global_tid);
3506   } else {
3507     kmp_indirect_lock_t *ilk = *((kmp_indirect_lock_t **)lk);
3508     lck = ilk->lock;
3509     KMP_DEBUG_ASSERT(lck != NULL);
3510     if (__kmp_env_consistency_check) {
3511       __kmp_push_sync(global_tid, ct_critical, loc, lck, __kmp_user_lock_seq);
3512     }
3513     KMP_I_LOCK_FUNC(ilk, set)(lck, global_tid);
3514   }
3515 
3516 #else // KMP_USE_DYNAMIC_LOCK
3517 
3518   // We know that the fast reduction code is only emitted by Intel compilers
3519   // with 32 byte critical sections. If there isn't enough space, then we
3520   // have to use a pointer.
3521   if (__kmp_base_user_lock_size <= INTEL_CRITICAL_SIZE) {
3522     lck = (kmp_user_lock_p)crit;
3523   } else {
3524     lck = __kmp_get_critical_section_ptr(crit, loc, global_tid);
3525   }
3526   KMP_DEBUG_ASSERT(lck != NULL);
3527 
3528   if (__kmp_env_consistency_check)
3529     __kmp_push_sync(global_tid, ct_critical, loc, lck);
3530 
3531   __kmp_acquire_user_lock_with_checks(lck, global_tid);
3532 
3533 #endif // KMP_USE_DYNAMIC_LOCK
3534 }
3535 
3536 // used in a critical section reduce block
3537 static __forceinline void
3538 __kmp_end_critical_section_reduce_block(ident_t *loc, kmp_int32 global_tid,
3539                                         kmp_critical_name *crit) {
3540 
3541   kmp_user_lock_p lck;
3542 
3543 #if KMP_USE_DYNAMIC_LOCK
3544 
3545   if (KMP_IS_D_LOCK(__kmp_user_lock_seq)) {
3546     lck = (kmp_user_lock_p)crit;
3547     if (__kmp_env_consistency_check)
3548       __kmp_pop_sync(global_tid, ct_critical, loc);
3549     KMP_D_LOCK_FUNC(lck, unset)((kmp_dyna_lock_t *)lck, global_tid);
3550   } else {
3551     kmp_indirect_lock_t *ilk =
3552         (kmp_indirect_lock_t *)TCR_PTR(*((kmp_indirect_lock_t **)crit));
3553     if (__kmp_env_consistency_check)
3554       __kmp_pop_sync(global_tid, ct_critical, loc);
3555     KMP_I_LOCK_FUNC(ilk, unset)(ilk->lock, global_tid);
3556   }
3557 
3558 #else // KMP_USE_DYNAMIC_LOCK
3559 
3560   // We know that the fast reduction code is only emitted by Intel compilers
3561   // with 32 byte critical sections. If there isn't enough space, then we have
3562   // to use a pointer.
3563   if (__kmp_base_user_lock_size > 32) {
3564     lck = *((kmp_user_lock_p *)crit);
3565     KMP_ASSERT(lck != NULL);
3566   } else {
3567     lck = (kmp_user_lock_p)crit;
3568   }
3569 
3570   if (__kmp_env_consistency_check)
3571     __kmp_pop_sync(global_tid, ct_critical, loc);
3572 
3573   __kmp_release_user_lock_with_checks(lck, global_tid);
3574 
3575 #endif // KMP_USE_DYNAMIC_LOCK
3576 } // __kmp_end_critical_section_reduce_block
3577 
3578 static __forceinline int
3579 __kmp_swap_teams_for_teams_reduction(kmp_info_t *th, kmp_team_t **team_p,
3580                                      int *task_state) {
3581   kmp_team_t *team;
3582 
3583   // Check if we are inside the teams construct?
3584   if (th->th.th_teams_microtask) {
3585     *team_p = team = th->th.th_team;
3586     if (team->t.t_level == th->th.th_teams_level) {
3587       // This is reduction at teams construct.
3588       KMP_DEBUG_ASSERT(!th->th.th_info.ds.ds_tid); // AC: check that tid == 0
3589       // Let's swap teams temporarily for the reduction.
3590       th->th.th_info.ds.ds_tid = team->t.t_master_tid;
3591       th->th.th_team = team->t.t_parent;
3592       th->th.th_team_nproc = th->th.th_team->t.t_nproc;
3593       th->th.th_task_team = th->th.th_team->t.t_task_team[0];
3594       *task_state = th->th.th_task_state;
3595       th->th.th_task_state = 0;
3596 
3597       return 1;
3598     }
3599   }
3600   return 0;
3601 }
3602 
3603 static __forceinline void
3604 __kmp_restore_swapped_teams(kmp_info_t *th, kmp_team_t *team, int task_state) {
3605   // Restore thread structure swapped in __kmp_swap_teams_for_teams_reduction.
3606   th->th.th_info.ds.ds_tid = 0;
3607   th->th.th_team = team;
3608   th->th.th_team_nproc = team->t.t_nproc;
3609   th->th.th_task_team = team->t.t_task_team[task_state];
3610   __kmp_type_convert(task_state, &(th->th.th_task_state));
3611 }
3612 
3613 /* 2.a.i. Reduce Block without a terminating barrier */
3614 /*!
3615 @ingroup SYNCHRONIZATION
3616 @param loc source location information
3617 @param global_tid global thread number
3618 @param num_vars number of items (variables) to be reduced
3619 @param reduce_size size of data in bytes to be reduced
3620 @param reduce_data pointer to data to be reduced
3621 @param reduce_func callback function providing reduction operation on two
3622 operands and returning result of reduction in lhs_data
3623 @param lck pointer to the unique lock data structure
3624 @result 1 for the primary thread, 0 for all other team threads, 2 for all team
3625 threads if atomic reduction needed
3626 
3627 The nowait version is used for a reduce clause with the nowait argument.
3628 */
3629 kmp_int32
3630 __kmpc_reduce_nowait(ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars,
3631                      size_t reduce_size, void *reduce_data,
3632                      void (*reduce_func)(void *lhs_data, void *rhs_data),
3633                      kmp_critical_name *lck) {
3634 
3635   KMP_COUNT_BLOCK(REDUCE_nowait);
3636   int retval = 0;
3637   PACKED_REDUCTION_METHOD_T packed_reduction_method;
3638   kmp_info_t *th;
3639   kmp_team_t *team;
3640   int teams_swapped = 0, task_state;
3641   KA_TRACE(10, ("__kmpc_reduce_nowait() enter: called T#%d\n", global_tid));
3642   __kmp_assert_valid_gtid(global_tid);
3643 
3644   // why do we need this initialization here at all?
3645   // Reduction clause can not be used as a stand-alone directive.
3646 
3647   // do not call __kmp_serial_initialize(), it will be called by
3648   // __kmp_parallel_initialize() if needed
3649   // possible detection of false-positive race by the threadchecker ???
3650   if (!TCR_4(__kmp_init_parallel))
3651     __kmp_parallel_initialize();
3652 
3653   __kmp_resume_if_soft_paused();
3654 
3655 // check correctness of reduce block nesting
3656 #if KMP_USE_DYNAMIC_LOCK
3657   if (__kmp_env_consistency_check)
3658     __kmp_push_sync(global_tid, ct_reduce, loc, NULL, 0);
3659 #else
3660   if (__kmp_env_consistency_check)
3661     __kmp_push_sync(global_tid, ct_reduce, loc, NULL);
3662 #endif
3663 
3664   th = __kmp_thread_from_gtid(global_tid);
3665   teams_swapped = __kmp_swap_teams_for_teams_reduction(th, &team, &task_state);
3666 
3667   // packed_reduction_method value will be reused by __kmp_end_reduce* function,
3668   // the value should be kept in a variable
3669   // the variable should be either a construct-specific or thread-specific
3670   // property, not a team specific property
3671   //     (a thread can reach the next reduce block on the next construct, reduce
3672   //     method may differ on the next construct)
3673   // an ident_t "loc" parameter could be used as a construct-specific property
3674   // (what if loc == 0?)
3675   //     (if both construct-specific and team-specific variables were shared,
3676   //     then unness extra syncs should be needed)
3677   // a thread-specific variable is better regarding two issues above (next
3678   // construct and extra syncs)
3679   // a thread-specific "th_local.reduction_method" variable is used currently
3680   // each thread executes 'determine' and 'set' lines (no need to execute by one
3681   // thread, to avoid unness extra syncs)
3682 
3683   packed_reduction_method = __kmp_determine_reduction_method(
3684       loc, global_tid, num_vars, reduce_size, reduce_data, reduce_func, lck);
3685   __KMP_SET_REDUCTION_METHOD(global_tid, packed_reduction_method);
3686 
3687   OMPT_REDUCTION_DECL(th, global_tid);
3688   if (packed_reduction_method == critical_reduce_block) {
3689 
3690     OMPT_REDUCTION_BEGIN;
3691 
3692     __kmp_enter_critical_section_reduce_block(loc, global_tid, lck);
3693     retval = 1;
3694 
3695   } else if (packed_reduction_method == empty_reduce_block) {
3696 
3697     OMPT_REDUCTION_BEGIN;
3698 
3699     // usage: if team size == 1, no synchronization is required ( Intel
3700     // platforms only )
3701     retval = 1;
3702 
3703   } else if (packed_reduction_method == atomic_reduce_block) {
3704 
3705     retval = 2;
3706 
3707     // all threads should do this pop here (because __kmpc_end_reduce_nowait()
3708     // won't be called by the code gen)
3709     //     (it's not quite good, because the checking block has been closed by
3710     //     this 'pop',
3711     //      but atomic operation has not been executed yet, will be executed
3712     //      slightly later, literally on next instruction)
3713     if (__kmp_env_consistency_check)
3714       __kmp_pop_sync(global_tid, ct_reduce, loc);
3715 
3716   } else if (TEST_REDUCTION_METHOD(packed_reduction_method,
3717                                    tree_reduce_block)) {
3718 
3719 // AT: performance issue: a real barrier here
3720 // AT: (if primary thread is slow, other threads are blocked here waiting for
3721 //      the primary thread to come and release them)
3722 // AT: (it's not what a customer might expect specifying NOWAIT clause)
3723 // AT: (specifying NOWAIT won't result in improvement of performance, it'll
3724 //      be confusing to a customer)
3725 // AT: another implementation of *barrier_gather*nowait() (or some other design)
3726 // might go faster and be more in line with sense of NOWAIT
3727 // AT: TO DO: do epcc test and compare times
3728 
3729 // this barrier should be invisible to a customer and to the threading profile
3730 // tool (it's neither a terminating barrier nor customer's code, it's
3731 // used for an internal purpose)
3732 #if OMPT_SUPPORT
3733     // JP: can this barrier potentially leed to task scheduling?
3734     // JP: as long as there is a barrier in the implementation, OMPT should and
3735     // will provide the barrier events
3736     //         so we set-up the necessary frame/return addresses.
3737     ompt_frame_t *ompt_frame;
3738     if (ompt_enabled.enabled) {
3739       __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
3740       if (ompt_frame->enter_frame.ptr == NULL)
3741         ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
3742     }
3743     OMPT_STORE_RETURN_ADDRESS(global_tid);
3744 #endif
3745 #if USE_ITT_NOTIFY
3746     __kmp_threads[global_tid]->th.th_ident = loc;
3747 #endif
3748     retval =
3749         __kmp_barrier(UNPACK_REDUCTION_BARRIER(packed_reduction_method),
3750                       global_tid, FALSE, reduce_size, reduce_data, reduce_func);
3751     retval = (retval != 0) ? (0) : (1);
3752 #if OMPT_SUPPORT && OMPT_OPTIONAL
3753     if (ompt_enabled.enabled) {
3754       ompt_frame->enter_frame = ompt_data_none;
3755     }
3756 #endif
3757 
3758     // all other workers except primary thread should do this pop here
3759     //     ( none of other workers will get to __kmpc_end_reduce_nowait() )
3760     if (__kmp_env_consistency_check) {
3761       if (retval == 0) {
3762         __kmp_pop_sync(global_tid, ct_reduce, loc);
3763       }
3764     }
3765 
3766   } else {
3767 
3768     // should never reach this block
3769     KMP_ASSERT(0); // "unexpected method"
3770   }
3771   if (teams_swapped) {
3772     __kmp_restore_swapped_teams(th, team, task_state);
3773   }
3774   KA_TRACE(
3775       10,
3776       ("__kmpc_reduce_nowait() exit: called T#%d: method %08x, returns %08x\n",
3777        global_tid, packed_reduction_method, retval));
3778 
3779   return retval;
3780 }
3781 
3782 /*!
3783 @ingroup SYNCHRONIZATION
3784 @param loc source location information
3785 @param global_tid global thread id.
3786 @param lck pointer to the unique lock data structure
3787 
3788 Finish the execution of a reduce nowait.
3789 */
3790 void __kmpc_end_reduce_nowait(ident_t *loc, kmp_int32 global_tid,
3791                               kmp_critical_name *lck) {
3792 
3793   PACKED_REDUCTION_METHOD_T packed_reduction_method;
3794 
3795   KA_TRACE(10, ("__kmpc_end_reduce_nowait() enter: called T#%d\n", global_tid));
3796   __kmp_assert_valid_gtid(global_tid);
3797 
3798   packed_reduction_method = __KMP_GET_REDUCTION_METHOD(global_tid);
3799 
3800   OMPT_REDUCTION_DECL(__kmp_thread_from_gtid(global_tid), global_tid);
3801 
3802   if (packed_reduction_method == critical_reduce_block) {
3803 
3804     __kmp_end_critical_section_reduce_block(loc, global_tid, lck);
3805     OMPT_REDUCTION_END;
3806 
3807   } else if (packed_reduction_method == empty_reduce_block) {
3808 
3809     // usage: if team size == 1, no synchronization is required ( on Intel
3810     // platforms only )
3811 
3812     OMPT_REDUCTION_END;
3813 
3814   } else if (packed_reduction_method == atomic_reduce_block) {
3815 
3816     // neither primary thread nor other workers should get here
3817     //     (code gen does not generate this call in case 2: atomic reduce block)
3818     // actually it's better to remove this elseif at all;
3819     // after removal this value will checked by the 'else' and will assert
3820 
3821   } else if (TEST_REDUCTION_METHOD(packed_reduction_method,
3822                                    tree_reduce_block)) {
3823 
3824     // only primary thread gets here
3825     // OMPT: tree reduction is annotated in the barrier code
3826 
3827   } else {
3828 
3829     // should never reach this block
3830     KMP_ASSERT(0); // "unexpected method"
3831   }
3832 
3833   if (__kmp_env_consistency_check)
3834     __kmp_pop_sync(global_tid, ct_reduce, loc);
3835 
3836   KA_TRACE(10, ("__kmpc_end_reduce_nowait() exit: called T#%d: method %08x\n",
3837                 global_tid, packed_reduction_method));
3838 
3839   return;
3840 }
3841 
3842 /* 2.a.ii. Reduce Block with a terminating barrier */
3843 
3844 /*!
3845 @ingroup SYNCHRONIZATION
3846 @param loc source location information
3847 @param global_tid global thread number
3848 @param num_vars number of items (variables) to be reduced
3849 @param reduce_size size of data in bytes to be reduced
3850 @param reduce_data pointer to data to be reduced
3851 @param reduce_func callback function providing reduction operation on two
3852 operands and returning result of reduction in lhs_data
3853 @param lck pointer to the unique lock data structure
3854 @result 1 for the primary thread, 0 for all other team threads, 2 for all team
3855 threads if atomic reduction needed
3856 
3857 A blocking reduce that includes an implicit barrier.
3858 */
3859 kmp_int32 __kmpc_reduce(ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars,
3860                         size_t reduce_size, void *reduce_data,
3861                         void (*reduce_func)(void *lhs_data, void *rhs_data),
3862                         kmp_critical_name *lck) {
3863   KMP_COUNT_BLOCK(REDUCE_wait);
3864   int retval = 0;
3865   PACKED_REDUCTION_METHOD_T packed_reduction_method;
3866   kmp_info_t *th;
3867   kmp_team_t *team;
3868   int teams_swapped = 0, task_state;
3869 
3870   KA_TRACE(10, ("__kmpc_reduce() enter: called T#%d\n", global_tid));
3871   __kmp_assert_valid_gtid(global_tid);
3872 
3873   // why do we need this initialization here at all?
3874   // Reduction clause can not be a stand-alone directive.
3875 
3876   // do not call __kmp_serial_initialize(), it will be called by
3877   // __kmp_parallel_initialize() if needed
3878   // possible detection of false-positive race by the threadchecker ???
3879   if (!TCR_4(__kmp_init_parallel))
3880     __kmp_parallel_initialize();
3881 
3882   __kmp_resume_if_soft_paused();
3883 
3884 // check correctness of reduce block nesting
3885 #if KMP_USE_DYNAMIC_LOCK
3886   if (__kmp_env_consistency_check)
3887     __kmp_push_sync(global_tid, ct_reduce, loc, NULL, 0);
3888 #else
3889   if (__kmp_env_consistency_check)
3890     __kmp_push_sync(global_tid, ct_reduce, loc, NULL);
3891 #endif
3892 
3893   th = __kmp_thread_from_gtid(global_tid);
3894   teams_swapped = __kmp_swap_teams_for_teams_reduction(th, &team, &task_state);
3895 
3896   packed_reduction_method = __kmp_determine_reduction_method(
3897       loc, global_tid, num_vars, reduce_size, reduce_data, reduce_func, lck);
3898   __KMP_SET_REDUCTION_METHOD(global_tid, packed_reduction_method);
3899 
3900   OMPT_REDUCTION_DECL(th, global_tid);
3901 
3902   if (packed_reduction_method == critical_reduce_block) {
3903 
3904     OMPT_REDUCTION_BEGIN;
3905     __kmp_enter_critical_section_reduce_block(loc, global_tid, lck);
3906     retval = 1;
3907 
3908   } else if (packed_reduction_method == empty_reduce_block) {
3909 
3910     OMPT_REDUCTION_BEGIN;
3911     // usage: if team size == 1, no synchronization is required ( Intel
3912     // platforms only )
3913     retval = 1;
3914 
3915   } else if (packed_reduction_method == atomic_reduce_block) {
3916 
3917     retval = 2;
3918 
3919   } else if (TEST_REDUCTION_METHOD(packed_reduction_method,
3920                                    tree_reduce_block)) {
3921 
3922 // case tree_reduce_block:
3923 // this barrier should be visible to a customer and to the threading profile
3924 // tool (it's a terminating barrier on constructs if NOWAIT not specified)
3925 #if OMPT_SUPPORT
3926     ompt_frame_t *ompt_frame;
3927     if (ompt_enabled.enabled) {
3928       __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
3929       if (ompt_frame->enter_frame.ptr == NULL)
3930         ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
3931     }
3932     OMPT_STORE_RETURN_ADDRESS(global_tid);
3933 #endif
3934 #if USE_ITT_NOTIFY
3935     __kmp_threads[global_tid]->th.th_ident =
3936         loc; // needed for correct notification of frames
3937 #endif
3938     retval =
3939         __kmp_barrier(UNPACK_REDUCTION_BARRIER(packed_reduction_method),
3940                       global_tid, TRUE, reduce_size, reduce_data, reduce_func);
3941     retval = (retval != 0) ? (0) : (1);
3942 #if OMPT_SUPPORT && OMPT_OPTIONAL
3943     if (ompt_enabled.enabled) {
3944       ompt_frame->enter_frame = ompt_data_none;
3945     }
3946 #endif
3947 
3948     // all other workers except primary thread should do this pop here
3949     // (none of other workers except primary will enter __kmpc_end_reduce())
3950     if (__kmp_env_consistency_check) {
3951       if (retval == 0) { // 0: all other workers; 1: primary thread
3952         __kmp_pop_sync(global_tid, ct_reduce, loc);
3953       }
3954     }
3955 
3956   } else {
3957 
3958     // should never reach this block
3959     KMP_ASSERT(0); // "unexpected method"
3960   }
3961   if (teams_swapped) {
3962     __kmp_restore_swapped_teams(th, team, task_state);
3963   }
3964 
3965   KA_TRACE(10,
3966            ("__kmpc_reduce() exit: called T#%d: method %08x, returns %08x\n",
3967             global_tid, packed_reduction_method, retval));
3968   return retval;
3969 }
3970 
3971 /*!
3972 @ingroup SYNCHRONIZATION
3973 @param loc source location information
3974 @param global_tid global thread id.
3975 @param lck pointer to the unique lock data structure
3976 
3977 Finish the execution of a blocking reduce.
3978 The <tt>lck</tt> pointer must be the same as that used in the corresponding
3979 start function.
3980 */
3981 void __kmpc_end_reduce(ident_t *loc, kmp_int32 global_tid,
3982                        kmp_critical_name *lck) {
3983 
3984   PACKED_REDUCTION_METHOD_T packed_reduction_method;
3985   kmp_info_t *th;
3986   kmp_team_t *team;
3987   int teams_swapped = 0, task_state;
3988 
3989   KA_TRACE(10, ("__kmpc_end_reduce() enter: called T#%d\n", global_tid));
3990   __kmp_assert_valid_gtid(global_tid);
3991 
3992   th = __kmp_thread_from_gtid(global_tid);
3993   teams_swapped = __kmp_swap_teams_for_teams_reduction(th, &team, &task_state);
3994 
3995   packed_reduction_method = __KMP_GET_REDUCTION_METHOD(global_tid);
3996 
3997   // this barrier should be visible to a customer and to the threading profile
3998   // tool (it's a terminating barrier on constructs if NOWAIT not specified)
3999   OMPT_REDUCTION_DECL(th, global_tid);
4000 
4001   if (packed_reduction_method == critical_reduce_block) {
4002     __kmp_end_critical_section_reduce_block(loc, global_tid, lck);
4003 
4004     OMPT_REDUCTION_END;
4005 
4006 // TODO: implicit barrier: should be exposed
4007 #if OMPT_SUPPORT
4008     ompt_frame_t *ompt_frame;
4009     if (ompt_enabled.enabled) {
4010       __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
4011       if (ompt_frame->enter_frame.ptr == NULL)
4012         ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
4013     }
4014     OMPT_STORE_RETURN_ADDRESS(global_tid);
4015 #endif
4016 #if USE_ITT_NOTIFY
4017     __kmp_threads[global_tid]->th.th_ident = loc;
4018 #endif
4019     __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
4020 #if OMPT_SUPPORT && OMPT_OPTIONAL
4021     if (ompt_enabled.enabled) {
4022       ompt_frame->enter_frame = ompt_data_none;
4023     }
4024 #endif
4025 
4026   } else if (packed_reduction_method == empty_reduce_block) {
4027 
4028     OMPT_REDUCTION_END;
4029 
4030 // usage: if team size==1, no synchronization is required (Intel platforms only)
4031 
4032 // TODO: implicit barrier: should be exposed
4033 #if OMPT_SUPPORT
4034     ompt_frame_t *ompt_frame;
4035     if (ompt_enabled.enabled) {
4036       __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
4037       if (ompt_frame->enter_frame.ptr == NULL)
4038         ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
4039     }
4040     OMPT_STORE_RETURN_ADDRESS(global_tid);
4041 #endif
4042 #if USE_ITT_NOTIFY
4043     __kmp_threads[global_tid]->th.th_ident = loc;
4044 #endif
4045     __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
4046 #if OMPT_SUPPORT && OMPT_OPTIONAL
4047     if (ompt_enabled.enabled) {
4048       ompt_frame->enter_frame = ompt_data_none;
4049     }
4050 #endif
4051 
4052   } else if (packed_reduction_method == atomic_reduce_block) {
4053 
4054 #if OMPT_SUPPORT
4055     ompt_frame_t *ompt_frame;
4056     if (ompt_enabled.enabled) {
4057       __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
4058       if (ompt_frame->enter_frame.ptr == NULL)
4059         ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
4060     }
4061     OMPT_STORE_RETURN_ADDRESS(global_tid);
4062 #endif
4063 // TODO: implicit barrier: should be exposed
4064 #if USE_ITT_NOTIFY
4065     __kmp_threads[global_tid]->th.th_ident = loc;
4066 #endif
4067     __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
4068 #if OMPT_SUPPORT && OMPT_OPTIONAL
4069     if (ompt_enabled.enabled) {
4070       ompt_frame->enter_frame = ompt_data_none;
4071     }
4072 #endif
4073 
4074   } else if (TEST_REDUCTION_METHOD(packed_reduction_method,
4075                                    tree_reduce_block)) {
4076 
4077     // only primary thread executes here (primary releases all other workers)
4078     __kmp_end_split_barrier(UNPACK_REDUCTION_BARRIER(packed_reduction_method),
4079                             global_tid);
4080 
4081   } else {
4082 
4083     // should never reach this block
4084     KMP_ASSERT(0); // "unexpected method"
4085   }
4086   if (teams_swapped) {
4087     __kmp_restore_swapped_teams(th, team, task_state);
4088   }
4089 
4090   if (__kmp_env_consistency_check)
4091     __kmp_pop_sync(global_tid, ct_reduce, loc);
4092 
4093   KA_TRACE(10, ("__kmpc_end_reduce() exit: called T#%d: method %08x\n",
4094                 global_tid, packed_reduction_method));
4095 
4096   return;
4097 }
4098 
4099 #undef __KMP_GET_REDUCTION_METHOD
4100 #undef __KMP_SET_REDUCTION_METHOD
4101 
4102 /* end of interface to fast scalable reduce routines */
4103 
4104 kmp_uint64 __kmpc_get_taskid() {
4105 
4106   kmp_int32 gtid;
4107   kmp_info_t *thread;
4108 
4109   gtid = __kmp_get_gtid();
4110   if (gtid < 0) {
4111     return 0;
4112   }
4113   thread = __kmp_thread_from_gtid(gtid);
4114   return thread->th.th_current_task->td_task_id;
4115 
4116 } // __kmpc_get_taskid
4117 
4118 kmp_uint64 __kmpc_get_parent_taskid() {
4119 
4120   kmp_int32 gtid;
4121   kmp_info_t *thread;
4122   kmp_taskdata_t *parent_task;
4123 
4124   gtid = __kmp_get_gtid();
4125   if (gtid < 0) {
4126     return 0;
4127   }
4128   thread = __kmp_thread_from_gtid(gtid);
4129   parent_task = thread->th.th_current_task->td_parent;
4130   return (parent_task == NULL ? 0 : parent_task->td_task_id);
4131 
4132 } // __kmpc_get_parent_taskid
4133 
4134 /*!
4135 @ingroup WORK_SHARING
4136 @param loc  source location information.
4137 @param gtid  global thread number.
4138 @param num_dims  number of associated doacross loops.
4139 @param dims  info on loops bounds.
4140 
4141 Initialize doacross loop information.
4142 Expect compiler send us inclusive bounds,
4143 e.g. for(i=2;i<9;i+=2) lo=2, up=8, st=2.
4144 */
4145 void __kmpc_doacross_init(ident_t *loc, int gtid, int num_dims,
4146                           const struct kmp_dim *dims) {
4147   __kmp_assert_valid_gtid(gtid);
4148   int j, idx;
4149   kmp_int64 last, trace_count;
4150   kmp_info_t *th = __kmp_threads[gtid];
4151   kmp_team_t *team = th->th.th_team;
4152   kmp_uint32 *flags;
4153   kmp_disp_t *pr_buf = th->th.th_dispatch;
4154   dispatch_shared_info_t *sh_buf;
4155 
4156   KA_TRACE(
4157       20,
4158       ("__kmpc_doacross_init() enter: called T#%d, num dims %d, active %d\n",
4159        gtid, num_dims, !team->t.t_serialized));
4160   KMP_DEBUG_ASSERT(dims != NULL);
4161   KMP_DEBUG_ASSERT(num_dims > 0);
4162 
4163   if (team->t.t_serialized) {
4164     KA_TRACE(20, ("__kmpc_doacross_init() exit: serialized team\n"));
4165     return; // no dependencies if team is serialized
4166   }
4167   KMP_DEBUG_ASSERT(team->t.t_nproc > 1);
4168   idx = pr_buf->th_doacross_buf_idx++; // Increment index of shared buffer for
4169   // the next loop
4170   sh_buf = &team->t.t_disp_buffer[idx % __kmp_dispatch_num_buffers];
4171 
4172   // Save bounds info into allocated private buffer
4173   KMP_DEBUG_ASSERT(pr_buf->th_doacross_info == NULL);
4174   pr_buf->th_doacross_info = (kmp_int64 *)__kmp_thread_malloc(
4175       th, sizeof(kmp_int64) * (4 * num_dims + 1));
4176   KMP_DEBUG_ASSERT(pr_buf->th_doacross_info != NULL);
4177   pr_buf->th_doacross_info[0] =
4178       (kmp_int64)num_dims; // first element is number of dimensions
4179   // Save also address of num_done in order to access it later without knowing
4180   // the buffer index
4181   pr_buf->th_doacross_info[1] = (kmp_int64)&sh_buf->doacross_num_done;
4182   pr_buf->th_doacross_info[2] = dims[0].lo;
4183   pr_buf->th_doacross_info[3] = dims[0].up;
4184   pr_buf->th_doacross_info[4] = dims[0].st;
4185   last = 5;
4186   for (j = 1; j < num_dims; ++j) {
4187     kmp_int64
4188         range_length; // To keep ranges of all dimensions but the first dims[0]
4189     if (dims[j].st == 1) { // most common case
4190       // AC: should we care of ranges bigger than LLONG_MAX? (not for now)
4191       range_length = dims[j].up - dims[j].lo + 1;
4192     } else {
4193       if (dims[j].st > 0) {
4194         KMP_DEBUG_ASSERT(dims[j].up > dims[j].lo);
4195         range_length = (kmp_uint64)(dims[j].up - dims[j].lo) / dims[j].st + 1;
4196       } else { // negative increment
4197         KMP_DEBUG_ASSERT(dims[j].lo > dims[j].up);
4198         range_length =
4199             (kmp_uint64)(dims[j].lo - dims[j].up) / (-dims[j].st) + 1;
4200       }
4201     }
4202     pr_buf->th_doacross_info[last++] = range_length;
4203     pr_buf->th_doacross_info[last++] = dims[j].lo;
4204     pr_buf->th_doacross_info[last++] = dims[j].up;
4205     pr_buf->th_doacross_info[last++] = dims[j].st;
4206   }
4207 
4208   // Compute total trip count.
4209   // Start with range of dims[0] which we don't need to keep in the buffer.
4210   if (dims[0].st == 1) { // most common case
4211     trace_count = dims[0].up - dims[0].lo + 1;
4212   } else if (dims[0].st > 0) {
4213     KMP_DEBUG_ASSERT(dims[0].up > dims[0].lo);
4214     trace_count = (kmp_uint64)(dims[0].up - dims[0].lo) / dims[0].st + 1;
4215   } else { // negative increment
4216     KMP_DEBUG_ASSERT(dims[0].lo > dims[0].up);
4217     trace_count = (kmp_uint64)(dims[0].lo - dims[0].up) / (-dims[0].st) + 1;
4218   }
4219   for (j = 1; j < num_dims; ++j) {
4220     trace_count *= pr_buf->th_doacross_info[4 * j + 1]; // use kept ranges
4221   }
4222   KMP_DEBUG_ASSERT(trace_count > 0);
4223 
4224   // Check if shared buffer is not occupied by other loop (idx -
4225   // __kmp_dispatch_num_buffers)
4226   if (idx != sh_buf->doacross_buf_idx) {
4227     // Shared buffer is occupied, wait for it to be free
4228     __kmp_wait_4((volatile kmp_uint32 *)&sh_buf->doacross_buf_idx, idx,
4229                  __kmp_eq_4, NULL);
4230   }
4231 #if KMP_32_BIT_ARCH
4232   // Check if we are the first thread. After the CAS the first thread gets 0,
4233   // others get 1 if initialization is in progress, allocated pointer otherwise.
4234   // Treat pointer as volatile integer (value 0 or 1) until memory is allocated.
4235   flags = (kmp_uint32 *)KMP_COMPARE_AND_STORE_RET32(
4236       (volatile kmp_int32 *)&sh_buf->doacross_flags, NULL, 1);
4237 #else
4238   flags = (kmp_uint32 *)KMP_COMPARE_AND_STORE_RET64(
4239       (volatile kmp_int64 *)&sh_buf->doacross_flags, NULL, 1LL);
4240 #endif
4241   if (flags == NULL) {
4242     // we are the first thread, allocate the array of flags
4243     size_t size =
4244         (size_t)trace_count / 8 + 8; // in bytes, use single bit per iteration
4245     flags = (kmp_uint32 *)__kmp_thread_calloc(th, size, 1);
4246     KMP_MB();
4247     sh_buf->doacross_flags = flags;
4248   } else if (flags == (kmp_uint32 *)1) {
4249 #if KMP_32_BIT_ARCH
4250     // initialization is still in progress, need to wait
4251     while (*(volatile kmp_int32 *)&sh_buf->doacross_flags == 1)
4252 #else
4253     while (*(volatile kmp_int64 *)&sh_buf->doacross_flags == 1LL)
4254 #endif
4255       KMP_YIELD(TRUE);
4256     KMP_MB();
4257   } else {
4258     KMP_MB();
4259   }
4260   KMP_DEBUG_ASSERT(sh_buf->doacross_flags > (kmp_uint32 *)1); // check ptr value
4261   pr_buf->th_doacross_flags =
4262       sh_buf->doacross_flags; // save private copy in order to not
4263   // touch shared buffer on each iteration
4264   KA_TRACE(20, ("__kmpc_doacross_init() exit: T#%d\n", gtid));
4265 }
4266 
4267 void __kmpc_doacross_wait(ident_t *loc, int gtid, const kmp_int64 *vec) {
4268   __kmp_assert_valid_gtid(gtid);
4269   kmp_int64 shft;
4270   size_t num_dims, i;
4271   kmp_uint32 flag;
4272   kmp_int64 iter_number; // iteration number of "collapsed" loop nest
4273   kmp_info_t *th = __kmp_threads[gtid];
4274   kmp_team_t *team = th->th.th_team;
4275   kmp_disp_t *pr_buf;
4276   kmp_int64 lo, up, st;
4277 
4278   KA_TRACE(20, ("__kmpc_doacross_wait() enter: called T#%d\n", gtid));
4279   if (team->t.t_serialized) {
4280     KA_TRACE(20, ("__kmpc_doacross_wait() exit: serialized team\n"));
4281     return; // no dependencies if team is serialized
4282   }
4283 
4284   // calculate sequential iteration number and check out-of-bounds condition
4285   pr_buf = th->th.th_dispatch;
4286   KMP_DEBUG_ASSERT(pr_buf->th_doacross_info != NULL);
4287   num_dims = (size_t)pr_buf->th_doacross_info[0];
4288   lo = pr_buf->th_doacross_info[2];
4289   up = pr_buf->th_doacross_info[3];
4290   st = pr_buf->th_doacross_info[4];
4291 #if OMPT_SUPPORT && OMPT_OPTIONAL
4292   SimpleVLA<ompt_dependence_t> deps(num_dims);
4293 #endif
4294   if (st == 1) { // most common case
4295     if (vec[0] < lo || vec[0] > up) {
4296       KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
4297                     "bounds [%lld,%lld]\n",
4298                     gtid, vec[0], lo, up));
4299       return;
4300     }
4301     iter_number = vec[0] - lo;
4302   } else if (st > 0) {
4303     if (vec[0] < lo || vec[0] > up) {
4304       KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
4305                     "bounds [%lld,%lld]\n",
4306                     gtid, vec[0], lo, up));
4307       return;
4308     }
4309     iter_number = (kmp_uint64)(vec[0] - lo) / st;
4310   } else { // negative increment
4311     if (vec[0] > lo || vec[0] < up) {
4312       KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
4313                     "bounds [%lld,%lld]\n",
4314                     gtid, vec[0], lo, up));
4315       return;
4316     }
4317     iter_number = (kmp_uint64)(lo - vec[0]) / (-st);
4318   }
4319 #if OMPT_SUPPORT && OMPT_OPTIONAL
4320   deps[0].variable.value = iter_number;
4321   deps[0].dependence_type = ompt_dependence_type_sink;
4322 #endif
4323   for (i = 1; i < num_dims; ++i) {
4324     kmp_int64 iter, ln;
4325     size_t j = i * 4;
4326     ln = pr_buf->th_doacross_info[j + 1];
4327     lo = pr_buf->th_doacross_info[j + 2];
4328     up = pr_buf->th_doacross_info[j + 3];
4329     st = pr_buf->th_doacross_info[j + 4];
4330     if (st == 1) {
4331       if (vec[i] < lo || vec[i] > up) {
4332         KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
4333                       "bounds [%lld,%lld]\n",
4334                       gtid, vec[i], lo, up));
4335         return;
4336       }
4337       iter = vec[i] - lo;
4338     } else if (st > 0) {
4339       if (vec[i] < lo || vec[i] > up) {
4340         KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
4341                       "bounds [%lld,%lld]\n",
4342                       gtid, vec[i], lo, up));
4343         return;
4344       }
4345       iter = (kmp_uint64)(vec[i] - lo) / st;
4346     } else { // st < 0
4347       if (vec[i] > lo || vec[i] < up) {
4348         KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
4349                       "bounds [%lld,%lld]\n",
4350                       gtid, vec[i], lo, up));
4351         return;
4352       }
4353       iter = (kmp_uint64)(lo - vec[i]) / (-st);
4354     }
4355     iter_number = iter + ln * iter_number;
4356 #if OMPT_SUPPORT && OMPT_OPTIONAL
4357     deps[i].variable.value = iter;
4358     deps[i].dependence_type = ompt_dependence_type_sink;
4359 #endif
4360   }
4361   shft = iter_number % 32; // use 32-bit granularity
4362   iter_number >>= 5; // divided by 32
4363   flag = 1 << shft;
4364   while ((flag & pr_buf->th_doacross_flags[iter_number]) == 0) {
4365     KMP_YIELD(TRUE);
4366   }
4367   KMP_MB();
4368 #if OMPT_SUPPORT && OMPT_OPTIONAL
4369   if (ompt_enabled.ompt_callback_dependences) {
4370     ompt_callbacks.ompt_callback(ompt_callback_dependences)(
4371         &(OMPT_CUR_TASK_INFO(th)->task_data), deps, (kmp_uint32)num_dims);
4372   }
4373 #endif
4374   KA_TRACE(20,
4375            ("__kmpc_doacross_wait() exit: T#%d wait for iter %lld completed\n",
4376             gtid, (iter_number << 5) + shft));
4377 }
4378 
4379 void __kmpc_doacross_post(ident_t *loc, int gtid, const kmp_int64 *vec) {
4380   __kmp_assert_valid_gtid(gtid);
4381   kmp_int64 shft;
4382   size_t num_dims, i;
4383   kmp_uint32 flag;
4384   kmp_int64 iter_number; // iteration number of "collapsed" loop nest
4385   kmp_info_t *th = __kmp_threads[gtid];
4386   kmp_team_t *team = th->th.th_team;
4387   kmp_disp_t *pr_buf;
4388   kmp_int64 lo, st;
4389 
4390   KA_TRACE(20, ("__kmpc_doacross_post() enter: called T#%d\n", gtid));
4391   if (team->t.t_serialized) {
4392     KA_TRACE(20, ("__kmpc_doacross_post() exit: serialized team\n"));
4393     return; // no dependencies if team is serialized
4394   }
4395 
4396   // calculate sequential iteration number (same as in "wait" but no
4397   // out-of-bounds checks)
4398   pr_buf = th->th.th_dispatch;
4399   KMP_DEBUG_ASSERT(pr_buf->th_doacross_info != NULL);
4400   num_dims = (size_t)pr_buf->th_doacross_info[0];
4401   lo = pr_buf->th_doacross_info[2];
4402   st = pr_buf->th_doacross_info[4];
4403 #if OMPT_SUPPORT && OMPT_OPTIONAL
4404   SimpleVLA<ompt_dependence_t> deps(num_dims);
4405 #endif
4406   if (st == 1) { // most common case
4407     iter_number = vec[0] - lo;
4408   } else if (st > 0) {
4409     iter_number = (kmp_uint64)(vec[0] - lo) / st;
4410   } else { // negative increment
4411     iter_number = (kmp_uint64)(lo - vec[0]) / (-st);
4412   }
4413 #if OMPT_SUPPORT && OMPT_OPTIONAL
4414   deps[0].variable.value = iter_number;
4415   deps[0].dependence_type = ompt_dependence_type_source;
4416 #endif
4417   for (i = 1; i < num_dims; ++i) {
4418     kmp_int64 iter, ln;
4419     size_t j = i * 4;
4420     ln = pr_buf->th_doacross_info[j + 1];
4421     lo = pr_buf->th_doacross_info[j + 2];
4422     st = pr_buf->th_doacross_info[j + 4];
4423     if (st == 1) {
4424       iter = vec[i] - lo;
4425     } else if (st > 0) {
4426       iter = (kmp_uint64)(vec[i] - lo) / st;
4427     } else { // st < 0
4428       iter = (kmp_uint64)(lo - vec[i]) / (-st);
4429     }
4430     iter_number = iter + ln * iter_number;
4431 #if OMPT_SUPPORT && OMPT_OPTIONAL
4432     deps[i].variable.value = iter;
4433     deps[i].dependence_type = ompt_dependence_type_source;
4434 #endif
4435   }
4436 #if OMPT_SUPPORT && OMPT_OPTIONAL
4437   if (ompt_enabled.ompt_callback_dependences) {
4438     ompt_callbacks.ompt_callback(ompt_callback_dependences)(
4439         &(OMPT_CUR_TASK_INFO(th)->task_data), deps, (kmp_uint32)num_dims);
4440   }
4441 #endif
4442   shft = iter_number % 32; // use 32-bit granularity
4443   iter_number >>= 5; // divided by 32
4444   flag = 1 << shft;
4445   KMP_MB();
4446   if ((flag & pr_buf->th_doacross_flags[iter_number]) == 0)
4447     KMP_TEST_THEN_OR32(&pr_buf->th_doacross_flags[iter_number], flag);
4448   KA_TRACE(20, ("__kmpc_doacross_post() exit: T#%d iter %lld posted\n", gtid,
4449                 (iter_number << 5) + shft));
4450 }
4451 
4452 void __kmpc_doacross_fini(ident_t *loc, int gtid) {
4453   __kmp_assert_valid_gtid(gtid);
4454   kmp_int32 num_done;
4455   kmp_info_t *th = __kmp_threads[gtid];
4456   kmp_team_t *team = th->th.th_team;
4457   kmp_disp_t *pr_buf = th->th.th_dispatch;
4458 
4459   KA_TRACE(20, ("__kmpc_doacross_fini() enter: called T#%d\n", gtid));
4460   if (team->t.t_serialized) {
4461     KA_TRACE(20, ("__kmpc_doacross_fini() exit: serialized team %p\n", team));
4462     return; // nothing to do
4463   }
4464   num_done =
4465       KMP_TEST_THEN_INC32((kmp_uintptr_t)(pr_buf->th_doacross_info[1])) + 1;
4466   if (num_done == th->th.th_team_nproc) {
4467     // we are the last thread, need to free shared resources
4468     int idx = pr_buf->th_doacross_buf_idx - 1;
4469     dispatch_shared_info_t *sh_buf =
4470         &team->t.t_disp_buffer[idx % __kmp_dispatch_num_buffers];
4471     KMP_DEBUG_ASSERT(pr_buf->th_doacross_info[1] ==
4472                      (kmp_int64)&sh_buf->doacross_num_done);
4473     KMP_DEBUG_ASSERT(num_done == sh_buf->doacross_num_done);
4474     KMP_DEBUG_ASSERT(idx == sh_buf->doacross_buf_idx);
4475     __kmp_thread_free(th, CCAST(kmp_uint32 *, sh_buf->doacross_flags));
4476     sh_buf->doacross_flags = NULL;
4477     sh_buf->doacross_num_done = 0;
4478     sh_buf->doacross_buf_idx +=
4479         __kmp_dispatch_num_buffers; // free buffer for future re-use
4480   }
4481   // free private resources (need to keep buffer index forever)
4482   pr_buf->th_doacross_flags = NULL;
4483   __kmp_thread_free(th, (void *)pr_buf->th_doacross_info);
4484   pr_buf->th_doacross_info = NULL;
4485   KA_TRACE(20, ("__kmpc_doacross_fini() exit: T#%d\n", gtid));
4486 }
4487 
4488 /* OpenMP 5.1 Memory Management routines */
4489 void *omp_alloc(size_t size, omp_allocator_handle_t allocator) {
4490   return __kmp_alloc(__kmp_entry_gtid(), 0, size, allocator);
4491 }
4492 
4493 void *omp_aligned_alloc(size_t align, size_t size,
4494                         omp_allocator_handle_t allocator) {
4495   return __kmp_alloc(__kmp_entry_gtid(), align, size, allocator);
4496 }
4497 
4498 void *omp_calloc(size_t nmemb, size_t size, omp_allocator_handle_t allocator) {
4499   return __kmp_calloc(__kmp_entry_gtid(), 0, nmemb, size, allocator);
4500 }
4501 
4502 void *omp_aligned_calloc(size_t align, size_t nmemb, size_t size,
4503                          omp_allocator_handle_t allocator) {
4504   return __kmp_calloc(__kmp_entry_gtid(), align, nmemb, size, allocator);
4505 }
4506 
4507 void *omp_realloc(void *ptr, size_t size, omp_allocator_handle_t allocator,
4508                   omp_allocator_handle_t free_allocator) {
4509   return __kmp_realloc(__kmp_entry_gtid(), ptr, size, allocator,
4510                        free_allocator);
4511 }
4512 
4513 void omp_free(void *ptr, omp_allocator_handle_t allocator) {
4514   ___kmpc_free(__kmp_entry_gtid(), ptr, allocator);
4515 }
4516 /* end of OpenMP 5.1 Memory Management routines */
4517 
4518 int __kmpc_get_target_offload(void) {
4519   if (!__kmp_init_serial) {
4520     __kmp_serial_initialize();
4521   }
4522   return __kmp_target_offload;
4523 }
4524 
4525 int __kmpc_pause_resource(kmp_pause_status_t level) {
4526   if (!__kmp_init_serial) {
4527     return 1; // Can't pause if runtime is not initialized
4528   }
4529   return __kmp_pause_resource(level);
4530 }
4531 
4532 void __kmpc_error(ident_t *loc, int severity, const char *message) {
4533   if (!__kmp_init_serial)
4534     __kmp_serial_initialize();
4535 
4536   KMP_ASSERT(severity == severity_warning || severity == severity_fatal);
4537 
4538 #if OMPT_SUPPORT
4539   if (ompt_enabled.enabled && ompt_enabled.ompt_callback_error) {
4540     ompt_callbacks.ompt_callback(ompt_callback_error)(
4541         (ompt_severity_t)severity, message, KMP_STRLEN(message),
4542         OMPT_GET_RETURN_ADDRESS(0));
4543   }
4544 #endif // OMPT_SUPPORT
4545 
4546   char *src_loc;
4547   if (loc && loc->psource) {
4548     kmp_str_loc_t str_loc = __kmp_str_loc_init(loc->psource, false);
4549     src_loc =
4550         __kmp_str_format("%s:%d:%d", str_loc.file, str_loc.line, str_loc.col);
4551     __kmp_str_loc_free(&str_loc);
4552   } else {
4553     src_loc = __kmp_str_format("unknown");
4554   }
4555 
4556   if (severity == severity_warning)
4557     KMP_WARNING(UserDirectedWarning, src_loc, message);
4558   else
4559     KMP_FATAL(UserDirectedError, src_loc, message);
4560 
4561   __kmp_str_free(&src_loc);
4562 }
4563 
4564 // Mark begin of scope directive.
4565 void __kmpc_scope(ident_t *loc, kmp_int32 gtid, void *reserved) {
4566 // reserved is for extension of scope directive and not used.
4567 #if OMPT_SUPPORT && OMPT_OPTIONAL
4568   if (ompt_enabled.enabled && ompt_enabled.ompt_callback_work) {
4569     kmp_team_t *team = __kmp_threads[gtid]->th.th_team;
4570     int tid = __kmp_tid_from_gtid(gtid);
4571     ompt_callbacks.ompt_callback(ompt_callback_work)(
4572         ompt_work_scope, ompt_scope_begin,
4573         &(team->t.ompt_team_info.parallel_data),
4574         &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data), 1,
4575         OMPT_GET_RETURN_ADDRESS(0));
4576   }
4577 #endif // OMPT_SUPPORT && OMPT_OPTIONAL
4578 }
4579 
4580 // Mark end of scope directive
4581 void __kmpc_end_scope(ident_t *loc, kmp_int32 gtid, void *reserved) {
4582 // reserved is for extension of scope directive and not used.
4583 #if OMPT_SUPPORT && OMPT_OPTIONAL
4584   if (ompt_enabled.enabled && ompt_enabled.ompt_callback_work) {
4585     kmp_team_t *team = __kmp_threads[gtid]->th.th_team;
4586     int tid = __kmp_tid_from_gtid(gtid);
4587     ompt_callbacks.ompt_callback(ompt_callback_work)(
4588         ompt_work_scope, ompt_scope_end,
4589         &(team->t.ompt_team_info.parallel_data),
4590         &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data), 1,
4591         OMPT_GET_RETURN_ADDRESS(0));
4592   }
4593 #endif // OMPT_SUPPORT && OMPT_OPTIONAL
4594 }
4595 
4596 #ifdef KMP_USE_VERSION_SYMBOLS
4597 // For GOMP compatibility there are two versions of each omp_* API.
4598 // One is the plain C symbol and one is the Fortran symbol with an appended
4599 // underscore. When we implement a specific ompc_* version of an omp_*
4600 // function, we want the plain GOMP versioned symbol to alias the ompc_* version
4601 // instead of the Fortran versions in kmp_ftn_entry.h
4602 extern "C" {
4603 // Have to undef these from omp.h so they aren't translated into
4604 // their ompc counterparts in the KMP_VERSION_OMPC_SYMBOL macros below
4605 #ifdef omp_set_affinity_format
4606 #undef omp_set_affinity_format
4607 #endif
4608 #ifdef omp_get_affinity_format
4609 #undef omp_get_affinity_format
4610 #endif
4611 #ifdef omp_display_affinity
4612 #undef omp_display_affinity
4613 #endif
4614 #ifdef omp_capture_affinity
4615 #undef omp_capture_affinity
4616 #endif
4617 KMP_VERSION_OMPC_SYMBOL(ompc_set_affinity_format, omp_set_affinity_format, 50,
4618                         "OMP_5.0");
4619 KMP_VERSION_OMPC_SYMBOL(ompc_get_affinity_format, omp_get_affinity_format, 50,
4620                         "OMP_5.0");
4621 KMP_VERSION_OMPC_SYMBOL(ompc_display_affinity, omp_display_affinity, 50,
4622                         "OMP_5.0");
4623 KMP_VERSION_OMPC_SYMBOL(ompc_capture_affinity, omp_capture_affinity, 50,
4624                         "OMP_5.0");
4625 } // extern "C"
4626 #endif
4627