xref: /freebsd/contrib/llvm-project/openmp/runtime/src/kmp_csupport.cpp (revision a0ca4af9455b844c5e094fc1b09b1390ffa979fc)
1 /*
2  * kmp_csupport.cpp -- kfront linkage support for OpenMP.
3  */
4 
5 //===----------------------------------------------------------------------===//
6 //
7 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8 // See https://llvm.org/LICENSE.txt for license information.
9 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #define __KMP_IMP
14 #include "omp.h" /* extern "C" declarations of user-visible routines */
15 #include "kmp.h"
16 #include "kmp_error.h"
17 #include "kmp_i18n.h"
18 #include "kmp_itt.h"
19 #include "kmp_lock.h"
20 #include "kmp_stats.h"
21 #include "ompt-specific.h"
22 
23 #define MAX_MESSAGE 512
24 
25 // flags will be used in future, e.g. to implement openmp_strict library
26 // restrictions
27 
28 /*!
29  * @ingroup STARTUP_SHUTDOWN
30  * @param loc   in   source location information
31  * @param flags in   for future use (currently ignored)
32  *
33  * Initialize the runtime library. This call is optional; if it is not made then
34  * it will be implicitly called by attempts to use other library functions.
35  */
36 void __kmpc_begin(ident_t *loc, kmp_int32 flags) {
37   // By default __kmpc_begin() is no-op.
38   char *env;
39   if ((env = getenv("KMP_INITIAL_THREAD_BIND")) != NULL &&
40       __kmp_str_match_true(env)) {
41     __kmp_middle_initialize();
42     __kmp_assign_root_init_mask();
43     KC_TRACE(10, ("__kmpc_begin: middle initialization called\n"));
44   } else if (__kmp_ignore_mppbeg() == FALSE) {
45     // By default __kmp_ignore_mppbeg() returns TRUE.
46     __kmp_internal_begin();
47     KC_TRACE(10, ("__kmpc_begin: called\n"));
48   }
49 }
50 
51 /*!
52  * @ingroup STARTUP_SHUTDOWN
53  * @param loc source location information
54  *
55  * Shutdown the runtime library. This is also optional, and even if called will
56  * not do anything unless the `KMP_IGNORE_MPPEND` environment variable is set to
57  * zero.
58  */
59 void __kmpc_end(ident_t *loc) {
60   // By default, __kmp_ignore_mppend() returns TRUE which makes __kmpc_end()
61   // call no-op. However, this can be overridden with KMP_IGNORE_MPPEND
62   // environment variable. If KMP_IGNORE_MPPEND is 0, __kmp_ignore_mppend()
63   // returns FALSE and __kmpc_end() will unregister this root (it can cause
64   // library shut down).
65   if (__kmp_ignore_mppend() == FALSE) {
66     KC_TRACE(10, ("__kmpc_end: called\n"));
67     KA_TRACE(30, ("__kmpc_end\n"));
68 
69     __kmp_internal_end_thread(-1);
70   }
71 #if KMP_OS_WINDOWS && OMPT_SUPPORT
72   // Normal exit process on Windows does not allow worker threads of the final
73   // parallel region to finish reporting their events, so shutting down the
74   // library here fixes the issue at least for the cases where __kmpc_end() is
75   // placed properly.
76   if (ompt_enabled.enabled)
77     __kmp_internal_end_library(__kmp_gtid_get_specific());
78 #endif
79 }
80 
81 /*!
82 @ingroup THREAD_STATES
83 @param loc Source location information.
84 @return The global thread index of the active thread.
85 
86 This function can be called in any context.
87 
88 If the runtime has ony been entered at the outermost level from a
89 single (necessarily non-OpenMP<sup>*</sup>) thread, then the thread number is
90 that which would be returned by omp_get_thread_num() in the outermost
91 active parallel construct. (Or zero if there is no active parallel
92 construct, since the primary thread is necessarily thread zero).
93 
94 If multiple non-OpenMP threads all enter an OpenMP construct then this
95 will be a unique thread identifier among all the threads created by
96 the OpenMP runtime (but the value cannot be defined in terms of
97 OpenMP thread ids returned by omp_get_thread_num()).
98 */
99 kmp_int32 __kmpc_global_thread_num(ident_t *loc) {
100   kmp_int32 gtid = __kmp_entry_gtid();
101 
102   KC_TRACE(10, ("__kmpc_global_thread_num: T#%d\n", gtid));
103 
104   return gtid;
105 }
106 
107 /*!
108 @ingroup THREAD_STATES
109 @param loc Source location information.
110 @return The number of threads under control of the OpenMP<sup>*</sup> runtime
111 
112 This function can be called in any context.
113 It returns the total number of threads under the control of the OpenMP runtime.
114 That is not a number that can be determined by any OpenMP standard calls, since
115 the library may be called from more than one non-OpenMP thread, and this
116 reflects the total over all such calls. Similarly the runtime maintains
117 underlying threads even when they are not active (since the cost of creating
118 and destroying OS threads is high), this call counts all such threads even if
119 they are not waiting for work.
120 */
121 kmp_int32 __kmpc_global_num_threads(ident_t *loc) {
122   KC_TRACE(10,
123            ("__kmpc_global_num_threads: num_threads = %d\n", __kmp_all_nth));
124 
125   return TCR_4(__kmp_all_nth);
126 }
127 
128 /*!
129 @ingroup THREAD_STATES
130 @param loc Source location information.
131 @return The thread number of the calling thread in the innermost active parallel
132 construct.
133 */
134 kmp_int32 __kmpc_bound_thread_num(ident_t *loc) {
135   KC_TRACE(10, ("__kmpc_bound_thread_num: called\n"));
136   return __kmp_tid_from_gtid(__kmp_entry_gtid());
137 }
138 
139 /*!
140 @ingroup THREAD_STATES
141 @param loc Source location information.
142 @return The number of threads in the innermost active parallel construct.
143 */
144 kmp_int32 __kmpc_bound_num_threads(ident_t *loc) {
145   KC_TRACE(10, ("__kmpc_bound_num_threads: called\n"));
146 
147   return __kmp_entry_thread()->th.th_team->t.t_nproc;
148 }
149 
150 /*!
151  * @ingroup DEPRECATED
152  * @param loc location description
153  *
154  * This function need not be called. It always returns TRUE.
155  */
156 kmp_int32 __kmpc_ok_to_fork(ident_t *loc) {
157 #ifndef KMP_DEBUG
158 
159   return TRUE;
160 
161 #else
162 
163   const char *semi2;
164   const char *semi3;
165   int line_no;
166 
167   if (__kmp_par_range == 0) {
168     return TRUE;
169   }
170   semi2 = loc->psource;
171   if (semi2 == NULL) {
172     return TRUE;
173   }
174   semi2 = strchr(semi2, ';');
175   if (semi2 == NULL) {
176     return TRUE;
177   }
178   semi2 = strchr(semi2 + 1, ';');
179   if (semi2 == NULL) {
180     return TRUE;
181   }
182   if (__kmp_par_range_filename[0]) {
183     const char *name = semi2 - 1;
184     while ((name > loc->psource) && (*name != '/') && (*name != ';')) {
185       name--;
186     }
187     if ((*name == '/') || (*name == ';')) {
188       name++;
189     }
190     if (strncmp(__kmp_par_range_filename, name, semi2 - name)) {
191       return __kmp_par_range < 0;
192     }
193   }
194   semi3 = strchr(semi2 + 1, ';');
195   if (__kmp_par_range_routine[0]) {
196     if ((semi3 != NULL) && (semi3 > semi2) &&
197         (strncmp(__kmp_par_range_routine, semi2 + 1, semi3 - semi2 - 1))) {
198       return __kmp_par_range < 0;
199     }
200   }
201   if (KMP_SSCANF(semi3 + 1, "%d", &line_no) == 1) {
202     if ((line_no >= __kmp_par_range_lb) && (line_no <= __kmp_par_range_ub)) {
203       return __kmp_par_range > 0;
204     }
205     return __kmp_par_range < 0;
206   }
207   return TRUE;
208 
209 #endif /* KMP_DEBUG */
210 }
211 
212 /*!
213 @ingroup THREAD_STATES
214 @param loc Source location information.
215 @return 1 if this thread is executing inside an active parallel region, zero if
216 not.
217 */
218 kmp_int32 __kmpc_in_parallel(ident_t *loc) {
219   return __kmp_entry_thread()->th.th_root->r.r_active;
220 }
221 
222 /*!
223 @ingroup PARALLEL
224 @param loc source location information
225 @param global_tid global thread number
226 @param num_threads number of threads requested for this parallel construct
227 
228 Set the number of threads to be used by the next fork spawned by this thread.
229 This call is only required if the parallel construct has a `num_threads` clause.
230 */
231 void __kmpc_push_num_threads(ident_t *loc, kmp_int32 global_tid,
232                              kmp_int32 num_threads) {
233   KA_TRACE(20, ("__kmpc_push_num_threads: enter T#%d num_threads=%d\n",
234                 global_tid, num_threads));
235   __kmp_assert_valid_gtid(global_tid);
236   __kmp_push_num_threads(loc, global_tid, num_threads);
237 }
238 
239 void __kmpc_pop_num_threads(ident_t *loc, kmp_int32 global_tid) {
240   KA_TRACE(20, ("__kmpc_pop_num_threads: enter\n"));
241   /* the num_threads are automatically popped */
242 }
243 
244 void __kmpc_push_proc_bind(ident_t *loc, kmp_int32 global_tid,
245                            kmp_int32 proc_bind) {
246   KA_TRACE(20, ("__kmpc_push_proc_bind: enter T#%d proc_bind=%d\n", global_tid,
247                 proc_bind));
248   __kmp_assert_valid_gtid(global_tid);
249   __kmp_push_proc_bind(loc, global_tid, (kmp_proc_bind_t)proc_bind);
250 }
251 
252 /*!
253 @ingroup PARALLEL
254 @param loc  source location information
255 @param argc  total number of arguments in the ellipsis
256 @param microtask  pointer to callback routine consisting of outlined parallel
257 construct
258 @param ...  pointers to shared variables that aren't global
259 
260 Do the actual fork and call the microtask in the relevant number of threads.
261 */
262 void __kmpc_fork_call(ident_t *loc, kmp_int32 argc, kmpc_micro microtask, ...) {
263   int gtid = __kmp_entry_gtid();
264 
265 #if (KMP_STATS_ENABLED)
266   // If we were in a serial region, then stop the serial timer, record
267   // the event, and start parallel region timer
268   stats_state_e previous_state = KMP_GET_THREAD_STATE();
269   if (previous_state == stats_state_e::SERIAL_REGION) {
270     KMP_EXCHANGE_PARTITIONED_TIMER(OMP_parallel_overhead);
271   } else {
272     KMP_PUSH_PARTITIONED_TIMER(OMP_parallel_overhead);
273   }
274   int inParallel = __kmpc_in_parallel(loc);
275   if (inParallel) {
276     KMP_COUNT_BLOCK(OMP_NESTED_PARALLEL);
277   } else {
278     KMP_COUNT_BLOCK(OMP_PARALLEL);
279   }
280 #endif
281 
282   // maybe to save thr_state is enough here
283   {
284     va_list ap;
285     va_start(ap, microtask);
286 
287 #if OMPT_SUPPORT
288     ompt_frame_t *ompt_frame;
289     if (ompt_enabled.enabled) {
290       kmp_info_t *master_th = __kmp_threads[gtid];
291       ompt_frame = &master_th->th.th_current_task->ompt_task_info.frame;
292       ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
293     }
294     OMPT_STORE_RETURN_ADDRESS(gtid);
295 #endif
296 
297 #if INCLUDE_SSC_MARKS
298     SSC_MARK_FORKING();
299 #endif
300     __kmp_fork_call(loc, gtid, fork_context_intel, argc,
301                     VOLATILE_CAST(microtask_t) microtask, // "wrapped" task
302                     VOLATILE_CAST(launch_t) __kmp_invoke_task_func,
303                     kmp_va_addr_of(ap));
304 #if INCLUDE_SSC_MARKS
305     SSC_MARK_JOINING();
306 #endif
307     __kmp_join_call(loc, gtid
308 #if OMPT_SUPPORT
309                     ,
310                     fork_context_intel
311 #endif
312     );
313 
314     va_end(ap);
315 
316 #if OMPT_SUPPORT
317     if (ompt_enabled.enabled) {
318       ompt_frame->enter_frame = ompt_data_none;
319     }
320 #endif
321   }
322 
323 #if KMP_STATS_ENABLED
324   if (previous_state == stats_state_e::SERIAL_REGION) {
325     KMP_EXCHANGE_PARTITIONED_TIMER(OMP_serial);
326     KMP_SET_THREAD_STATE(previous_state);
327   } else {
328     KMP_POP_PARTITIONED_TIMER();
329   }
330 #endif // KMP_STATS_ENABLED
331 }
332 
333 /*!
334 @ingroup PARALLEL
335 @param loc  source location information
336 @param microtask  pointer to callback routine consisting of outlined parallel
337 construct
338 @param cond  condition for running in parallel
339 @param args  struct of pointers to shared variables that aren't global
340 
341 Perform a fork only if the condition is true.
342 */
343 void __kmpc_fork_call_if(ident_t *loc, kmp_int32 argc, kmpc_micro microtask,
344                          kmp_int32 cond, void *args) {
345   int gtid = __kmp_entry_gtid();
346   if (cond) {
347     if (args)
348       __kmpc_fork_call(loc, argc, microtask, args);
349     else
350       __kmpc_fork_call(loc, argc, microtask);
351   } else {
352     __kmpc_serialized_parallel(loc, gtid);
353 
354 #if OMPT_SUPPORT
355     void *exit_frame_ptr;
356 #endif
357 
358     if (args)
359       __kmp_invoke_microtask(VOLATILE_CAST(microtask_t) microtask, gtid,
360                              /*npr=*/0,
361                              /*argc=*/1, &args
362 #if OMPT_SUPPORT
363                              ,
364                              &exit_frame_ptr
365 #endif
366       );
367     else
368       __kmp_invoke_microtask(VOLATILE_CAST(microtask_t) microtask, gtid,
369                              /*npr=*/0,
370                              /*argc=*/0,
371                              /*args=*/nullptr
372 #if OMPT_SUPPORT
373                              ,
374                              &exit_frame_ptr
375 #endif
376       );
377 
378     __kmpc_end_serialized_parallel(loc, gtid);
379   }
380 }
381 
382 /*!
383 @ingroup PARALLEL
384 @param loc source location information
385 @param global_tid global thread number
386 @param num_teams number of teams requested for the teams construct
387 @param num_threads number of threads per team requested for the teams construct
388 
389 Set the number of teams to be used by the teams construct.
390 This call is only required if the teams construct has a `num_teams` clause
391 or a `thread_limit` clause (or both).
392 */
393 void __kmpc_push_num_teams(ident_t *loc, kmp_int32 global_tid,
394                            kmp_int32 num_teams, kmp_int32 num_threads) {
395   KA_TRACE(20,
396            ("__kmpc_push_num_teams: enter T#%d num_teams=%d num_threads=%d\n",
397             global_tid, num_teams, num_threads));
398   __kmp_assert_valid_gtid(global_tid);
399   __kmp_push_num_teams(loc, global_tid, num_teams, num_threads);
400 }
401 
402 /*!
403 @ingroup PARALLEL
404 @param loc source location information
405 @param global_tid global thread number
406 @param thread_limit limit on number of threads which can be created within the
407 current task
408 
409 Set the thread_limit for the current task
410 This call is there to support `thread_limit` clause on the `target` construct
411 */
412 void __kmpc_set_thread_limit(ident_t *loc, kmp_int32 global_tid,
413                              kmp_int32 thread_limit) {
414   __kmp_assert_valid_gtid(global_tid);
415   kmp_info_t *thread = __kmp_threads[global_tid];
416   if (thread_limit > 0)
417     thread->th.th_current_task->td_icvs.task_thread_limit = thread_limit;
418 }
419 
420 /*!
421 @ingroup PARALLEL
422 @param loc source location information
423 @param global_tid global thread number
424 @param num_teams_lb lower bound on number of teams requested for the teams
425 construct
426 @param num_teams_ub upper bound on number of teams requested for the teams
427 construct
428 @param num_threads number of threads per team requested for the teams construct
429 
430 Set the number of teams to be used by the teams construct. The number of initial
431 teams cretaed will be greater than or equal to the lower bound and less than or
432 equal to the upper bound.
433 This call is only required if the teams construct has a `num_teams` clause
434 or a `thread_limit` clause (or both).
435 */
436 void __kmpc_push_num_teams_51(ident_t *loc, kmp_int32 global_tid,
437                               kmp_int32 num_teams_lb, kmp_int32 num_teams_ub,
438                               kmp_int32 num_threads) {
439   KA_TRACE(20, ("__kmpc_push_num_teams_51: enter T#%d num_teams_lb=%d"
440                 " num_teams_ub=%d num_threads=%d\n",
441                 global_tid, num_teams_lb, num_teams_ub, num_threads));
442   __kmp_assert_valid_gtid(global_tid);
443   __kmp_push_num_teams_51(loc, global_tid, num_teams_lb, num_teams_ub,
444                           num_threads);
445 }
446 
447 /*!
448 @ingroup PARALLEL
449 @param loc  source location information
450 @param argc  total number of arguments in the ellipsis
451 @param microtask  pointer to callback routine consisting of outlined teams
452 construct
453 @param ...  pointers to shared variables that aren't global
454 
455 Do the actual fork and call the microtask in the relevant number of threads.
456 */
457 void __kmpc_fork_teams(ident_t *loc, kmp_int32 argc, kmpc_micro microtask,
458                        ...) {
459   int gtid = __kmp_entry_gtid();
460   kmp_info_t *this_thr = __kmp_threads[gtid];
461   va_list ap;
462   va_start(ap, microtask);
463 
464 #if KMP_STATS_ENABLED
465   KMP_COUNT_BLOCK(OMP_TEAMS);
466   stats_state_e previous_state = KMP_GET_THREAD_STATE();
467   if (previous_state == stats_state_e::SERIAL_REGION) {
468     KMP_EXCHANGE_PARTITIONED_TIMER(OMP_teams_overhead);
469   } else {
470     KMP_PUSH_PARTITIONED_TIMER(OMP_teams_overhead);
471   }
472 #endif
473 
474   // remember teams entry point and nesting level
475   this_thr->th.th_teams_microtask = microtask;
476   this_thr->th.th_teams_level =
477       this_thr->th.th_team->t.t_level; // AC: can be >0 on host
478 
479 #if OMPT_SUPPORT
480   kmp_team_t *parent_team = this_thr->th.th_team;
481   int tid = __kmp_tid_from_gtid(gtid);
482   if (ompt_enabled.enabled) {
483     parent_team->t.t_implicit_task_taskdata[tid]
484         .ompt_task_info.frame.enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
485   }
486   OMPT_STORE_RETURN_ADDRESS(gtid);
487 #endif
488 
489   // check if __kmpc_push_num_teams called, set default number of teams
490   // otherwise
491   if (this_thr->th.th_teams_size.nteams == 0) {
492     __kmp_push_num_teams(loc, gtid, 0, 0);
493   }
494   KMP_DEBUG_ASSERT(this_thr->th.th_set_nproc >= 1);
495   KMP_DEBUG_ASSERT(this_thr->th.th_teams_size.nteams >= 1);
496   KMP_DEBUG_ASSERT(this_thr->th.th_teams_size.nth >= 1);
497 
498   __kmp_fork_call(
499       loc, gtid, fork_context_intel, argc,
500       VOLATILE_CAST(microtask_t) __kmp_teams_master, // "wrapped" task
501       VOLATILE_CAST(launch_t) __kmp_invoke_teams_master, kmp_va_addr_of(ap));
502   __kmp_join_call(loc, gtid
503 #if OMPT_SUPPORT
504                   ,
505                   fork_context_intel
506 #endif
507   );
508 
509   // Pop current CG root off list
510   KMP_DEBUG_ASSERT(this_thr->th.th_cg_roots);
511   kmp_cg_root_t *tmp = this_thr->th.th_cg_roots;
512   this_thr->th.th_cg_roots = tmp->up;
513   KA_TRACE(100, ("__kmpc_fork_teams: Thread %p popping node %p and moving up"
514                  " to node %p. cg_nthreads was %d\n",
515                  this_thr, tmp, this_thr->th.th_cg_roots, tmp->cg_nthreads));
516   KMP_DEBUG_ASSERT(tmp->cg_nthreads);
517   int i = tmp->cg_nthreads--;
518   if (i == 1) { // check is we are the last thread in CG (not always the case)
519     __kmp_free(tmp);
520   }
521   // Restore current task's thread_limit from CG root
522   KMP_DEBUG_ASSERT(this_thr->th.th_cg_roots);
523   this_thr->th.th_current_task->td_icvs.thread_limit =
524       this_thr->th.th_cg_roots->cg_thread_limit;
525 
526   this_thr->th.th_teams_microtask = NULL;
527   this_thr->th.th_teams_level = 0;
528   *(kmp_int64 *)(&this_thr->th.th_teams_size) = 0L;
529   va_end(ap);
530 #if KMP_STATS_ENABLED
531   if (previous_state == stats_state_e::SERIAL_REGION) {
532     KMP_EXCHANGE_PARTITIONED_TIMER(OMP_serial);
533     KMP_SET_THREAD_STATE(previous_state);
534   } else {
535     KMP_POP_PARTITIONED_TIMER();
536   }
537 #endif // KMP_STATS_ENABLED
538 }
539 
540 // I don't think this function should ever have been exported.
541 // The __kmpc_ prefix was misapplied.  I'm fairly certain that no generated
542 // openmp code ever called it, but it's been exported from the RTL for so
543 // long that I'm afraid to remove the definition.
544 int __kmpc_invoke_task_func(int gtid) { return __kmp_invoke_task_func(gtid); }
545 
546 /*!
547 @ingroup PARALLEL
548 @param loc  source location information
549 @param global_tid  global thread number
550 
551 Enter a serialized parallel construct. This interface is used to handle a
552 conditional parallel region, like this,
553 @code
554 #pragma omp parallel if (condition)
555 @endcode
556 when the condition is false.
557 */
558 void __kmpc_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
559   // The implementation is now in kmp_runtime.cpp so that it can share static
560   // functions with kmp_fork_call since the tasks to be done are similar in
561   // each case.
562   __kmp_assert_valid_gtid(global_tid);
563 #if OMPT_SUPPORT
564   OMPT_STORE_RETURN_ADDRESS(global_tid);
565 #endif
566   __kmp_serialized_parallel(loc, global_tid);
567 }
568 
569 /*!
570 @ingroup PARALLEL
571 @param loc  source location information
572 @param global_tid  global thread number
573 
574 Leave a serialized parallel construct.
575 */
576 void __kmpc_end_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
577   kmp_internal_control_t *top;
578   kmp_info_t *this_thr;
579   kmp_team_t *serial_team;
580 
581   KC_TRACE(10,
582            ("__kmpc_end_serialized_parallel: called by T#%d\n", global_tid));
583 
584   /* skip all this code for autopar serialized loops since it results in
585      unacceptable overhead */
586   if (loc != NULL && (loc->flags & KMP_IDENT_AUTOPAR))
587     return;
588 
589   // Not autopar code
590   __kmp_assert_valid_gtid(global_tid);
591   if (!TCR_4(__kmp_init_parallel))
592     __kmp_parallel_initialize();
593 
594   __kmp_resume_if_soft_paused();
595 
596   this_thr = __kmp_threads[global_tid];
597   serial_team = this_thr->th.th_serial_team;
598 
599   kmp_task_team_t *task_team = this_thr->th.th_task_team;
600   // we need to wait for the proxy tasks before finishing the thread
601   if (task_team != NULL && (task_team->tt.tt_found_proxy_tasks ||
602                             task_team->tt.tt_hidden_helper_task_encountered))
603     __kmp_task_team_wait(this_thr, serial_team USE_ITT_BUILD_ARG(NULL));
604 
605   KMP_MB();
606   KMP_DEBUG_ASSERT(serial_team);
607   KMP_ASSERT(serial_team->t.t_serialized);
608   KMP_DEBUG_ASSERT(this_thr->th.th_team == serial_team);
609   KMP_DEBUG_ASSERT(serial_team != this_thr->th.th_root->r.r_root_team);
610   KMP_DEBUG_ASSERT(serial_team->t.t_threads);
611   KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
612 
613 #if OMPT_SUPPORT
614   if (ompt_enabled.enabled &&
615       this_thr->th.ompt_thread_info.state != ompt_state_overhead) {
616     OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame = ompt_data_none;
617     if (ompt_enabled.ompt_callback_implicit_task) {
618       ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
619           ompt_scope_end, NULL, OMPT_CUR_TASK_DATA(this_thr), 1,
620           OMPT_CUR_TASK_INFO(this_thr)->thread_num, ompt_task_implicit);
621     }
622 
623     // reset clear the task id only after unlinking the task
624     ompt_data_t *parent_task_data;
625     __ompt_get_task_info_internal(1, NULL, &parent_task_data, NULL, NULL, NULL);
626 
627     if (ompt_enabled.ompt_callback_parallel_end) {
628       ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
629           &(serial_team->t.ompt_team_info.parallel_data), parent_task_data,
630           ompt_parallel_invoker_program | ompt_parallel_team,
631           OMPT_LOAD_RETURN_ADDRESS(global_tid));
632     }
633     __ompt_lw_taskteam_unlink(this_thr);
634     this_thr->th.ompt_thread_info.state = ompt_state_overhead;
635   }
636 #endif
637 
638   /* If necessary, pop the internal control stack values and replace the team
639    * values */
640   top = serial_team->t.t_control_stack_top;
641   if (top && top->serial_nesting_level == serial_team->t.t_serialized) {
642     copy_icvs(&serial_team->t.t_threads[0]->th.th_current_task->td_icvs, top);
643     serial_team->t.t_control_stack_top = top->next;
644     __kmp_free(top);
645   }
646 
647   /* pop dispatch buffers stack */
648   KMP_DEBUG_ASSERT(serial_team->t.t_dispatch->th_disp_buffer);
649   {
650     dispatch_private_info_t *disp_buffer =
651         serial_team->t.t_dispatch->th_disp_buffer;
652     serial_team->t.t_dispatch->th_disp_buffer =
653         serial_team->t.t_dispatch->th_disp_buffer->next;
654     __kmp_free(disp_buffer);
655   }
656   this_thr->th.th_def_allocator = serial_team->t.t_def_allocator; // restore
657 
658   --serial_team->t.t_serialized;
659   if (serial_team->t.t_serialized == 0) {
660 
661     /* return to the parallel section */
662 
663 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
664     if (__kmp_inherit_fp_control && serial_team->t.t_fp_control_saved) {
665       __kmp_clear_x87_fpu_status_word();
666       __kmp_load_x87_fpu_control_word(&serial_team->t.t_x87_fpu_control_word);
667       __kmp_load_mxcsr(&serial_team->t.t_mxcsr);
668     }
669 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
670 
671     __kmp_pop_current_task_from_thread(this_thr);
672 #if OMPD_SUPPORT
673     if (ompd_state & OMPD_ENABLE_BP)
674       ompd_bp_parallel_end();
675 #endif
676 
677     this_thr->th.th_team = serial_team->t.t_parent;
678     this_thr->th.th_info.ds.ds_tid = serial_team->t.t_master_tid;
679 
680     /* restore values cached in the thread */
681     this_thr->th.th_team_nproc = serial_team->t.t_parent->t.t_nproc; /*  JPH */
682     this_thr->th.th_team_master =
683         serial_team->t.t_parent->t.t_threads[0]; /* JPH */
684     this_thr->th.th_team_serialized = this_thr->th.th_team->t.t_serialized;
685 
686     /* TODO the below shouldn't need to be adjusted for serialized teams */
687     this_thr->th.th_dispatch =
688         &this_thr->th.th_team->t.t_dispatch[serial_team->t.t_master_tid];
689 
690     KMP_ASSERT(this_thr->th.th_current_task->td_flags.executing == 0);
691     this_thr->th.th_current_task->td_flags.executing = 1;
692 
693     if (__kmp_tasking_mode != tskm_immediate_exec) {
694       // Copy the task team from the new child / old parent team to the thread.
695       this_thr->th.th_task_team =
696           this_thr->th.th_team->t.t_task_team[this_thr->th.th_task_state];
697       KA_TRACE(20,
698                ("__kmpc_end_serialized_parallel: T#%d restoring task_team %p / "
699                 "team %p\n",
700                 global_tid, this_thr->th.th_task_team, this_thr->th.th_team));
701     }
702 #if KMP_AFFINITY_SUPPORTED
703     if (this_thr->th.th_team->t.t_level == 0 && __kmp_affinity.flags.reset) {
704       __kmp_reset_root_init_mask(global_tid);
705     }
706 #endif
707   } else {
708     if (__kmp_tasking_mode != tskm_immediate_exec) {
709       KA_TRACE(20, ("__kmpc_end_serialized_parallel: T#%d decreasing nesting "
710                     "depth of serial team %p to %d\n",
711                     global_tid, serial_team, serial_team->t.t_serialized));
712     }
713   }
714 
715   serial_team->t.t_level--;
716   if (__kmp_env_consistency_check)
717     __kmp_pop_parallel(global_tid, NULL);
718 #if OMPT_SUPPORT
719   if (ompt_enabled.enabled)
720     this_thr->th.ompt_thread_info.state =
721         ((this_thr->th.th_team_serialized) ? ompt_state_work_serial
722                                            : ompt_state_work_parallel);
723 #endif
724 }
725 
726 /*!
727 @ingroup SYNCHRONIZATION
728 @param loc  source location information.
729 
730 Execute <tt>flush</tt>. This is implemented as a full memory fence. (Though
731 depending on the memory ordering convention obeyed by the compiler
732 even that may not be necessary).
733 */
734 void __kmpc_flush(ident_t *loc) {
735   KC_TRACE(10, ("__kmpc_flush: called\n"));
736 
737   /* need explicit __mf() here since use volatile instead in library */
738   KMP_MFENCE(); /* Flush all pending memory write invalidates.  */
739 
740 #if OMPT_SUPPORT && OMPT_OPTIONAL
741   if (ompt_enabled.ompt_callback_flush) {
742     ompt_callbacks.ompt_callback(ompt_callback_flush)(
743         __ompt_get_thread_data_internal(), OMPT_GET_RETURN_ADDRESS(0));
744   }
745 #endif
746 }
747 
748 /* -------------------------------------------------------------------------- */
749 /*!
750 @ingroup SYNCHRONIZATION
751 @param loc source location information
752 @param global_tid thread id.
753 
754 Execute a barrier.
755 */
756 void __kmpc_barrier(ident_t *loc, kmp_int32 global_tid) {
757   KMP_COUNT_BLOCK(OMP_BARRIER);
758   KC_TRACE(10, ("__kmpc_barrier: called T#%d\n", global_tid));
759   __kmp_assert_valid_gtid(global_tid);
760 
761   if (!TCR_4(__kmp_init_parallel))
762     __kmp_parallel_initialize();
763 
764   __kmp_resume_if_soft_paused();
765 
766   if (__kmp_env_consistency_check) {
767     if (loc == 0) {
768       KMP_WARNING(ConstructIdentInvalid); // ??? What does it mean for the user?
769     }
770     __kmp_check_barrier(global_tid, ct_barrier, loc);
771   }
772 
773 #if OMPT_SUPPORT
774   ompt_frame_t *ompt_frame;
775   if (ompt_enabled.enabled) {
776     __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
777     if (ompt_frame->enter_frame.ptr == NULL)
778       ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
779   }
780   OMPT_STORE_RETURN_ADDRESS(global_tid);
781 #endif
782   __kmp_threads[global_tid]->th.th_ident = loc;
783   // TODO: explicit barrier_wait_id:
784   //   this function is called when 'barrier' directive is present or
785   //   implicit barrier at the end of a worksharing construct.
786   // 1) better to add a per-thread barrier counter to a thread data structure
787   // 2) set to 0 when a new team is created
788   // 4) no sync is required
789 
790   __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
791 #if OMPT_SUPPORT && OMPT_OPTIONAL
792   if (ompt_enabled.enabled) {
793     ompt_frame->enter_frame = ompt_data_none;
794   }
795 #endif
796 }
797 
798 /* The BARRIER for a MASTER section is always explicit   */
799 /*!
800 @ingroup WORK_SHARING
801 @param loc  source location information.
802 @param global_tid  global thread number .
803 @return 1 if this thread should execute the <tt>master</tt> block, 0 otherwise.
804 */
805 kmp_int32 __kmpc_master(ident_t *loc, kmp_int32 global_tid) {
806   int status = 0;
807 
808   KC_TRACE(10, ("__kmpc_master: called T#%d\n", global_tid));
809   __kmp_assert_valid_gtid(global_tid);
810 
811   if (!TCR_4(__kmp_init_parallel))
812     __kmp_parallel_initialize();
813 
814   __kmp_resume_if_soft_paused();
815 
816   if (KMP_MASTER_GTID(global_tid)) {
817     KMP_COUNT_BLOCK(OMP_MASTER);
818     KMP_PUSH_PARTITIONED_TIMER(OMP_master);
819     status = 1;
820   }
821 
822 #if OMPT_SUPPORT && OMPT_OPTIONAL
823   if (status) {
824     if (ompt_enabled.ompt_callback_masked) {
825       kmp_info_t *this_thr = __kmp_threads[global_tid];
826       kmp_team_t *team = this_thr->th.th_team;
827 
828       int tid = __kmp_tid_from_gtid(global_tid);
829       ompt_callbacks.ompt_callback(ompt_callback_masked)(
830           ompt_scope_begin, &(team->t.ompt_team_info.parallel_data),
831           &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
832           OMPT_GET_RETURN_ADDRESS(0));
833     }
834   }
835 #endif
836 
837   if (__kmp_env_consistency_check) {
838 #if KMP_USE_DYNAMIC_LOCK
839     if (status)
840       __kmp_push_sync(global_tid, ct_master, loc, NULL, 0);
841     else
842       __kmp_check_sync(global_tid, ct_master, loc, NULL, 0);
843 #else
844     if (status)
845       __kmp_push_sync(global_tid, ct_master, loc, NULL);
846     else
847       __kmp_check_sync(global_tid, ct_master, loc, NULL);
848 #endif
849   }
850 
851   return status;
852 }
853 
854 /*!
855 @ingroup WORK_SHARING
856 @param loc  source location information.
857 @param global_tid  global thread number .
858 
859 Mark the end of a <tt>master</tt> region. This should only be called by the
860 thread that executes the <tt>master</tt> region.
861 */
862 void __kmpc_end_master(ident_t *loc, kmp_int32 global_tid) {
863   KC_TRACE(10, ("__kmpc_end_master: called T#%d\n", global_tid));
864   __kmp_assert_valid_gtid(global_tid);
865   KMP_DEBUG_ASSERT(KMP_MASTER_GTID(global_tid));
866   KMP_POP_PARTITIONED_TIMER();
867 
868 #if OMPT_SUPPORT && OMPT_OPTIONAL
869   kmp_info_t *this_thr = __kmp_threads[global_tid];
870   kmp_team_t *team = this_thr->th.th_team;
871   if (ompt_enabled.ompt_callback_masked) {
872     int tid = __kmp_tid_from_gtid(global_tid);
873     ompt_callbacks.ompt_callback(ompt_callback_masked)(
874         ompt_scope_end, &(team->t.ompt_team_info.parallel_data),
875         &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
876         OMPT_GET_RETURN_ADDRESS(0));
877   }
878 #endif
879 
880   if (__kmp_env_consistency_check) {
881     if (KMP_MASTER_GTID(global_tid))
882       __kmp_pop_sync(global_tid, ct_master, loc);
883   }
884 }
885 
886 /*!
887 @ingroup WORK_SHARING
888 @param loc  source location information.
889 @param global_tid  global thread number.
890 @param filter result of evaluating filter clause on thread global_tid, or zero
891 if no filter clause present
892 @return 1 if this thread should execute the <tt>masked</tt> block, 0 otherwise.
893 */
894 kmp_int32 __kmpc_masked(ident_t *loc, kmp_int32 global_tid, kmp_int32 filter) {
895   int status = 0;
896   int tid;
897   KC_TRACE(10, ("__kmpc_masked: called T#%d\n", global_tid));
898   __kmp_assert_valid_gtid(global_tid);
899 
900   if (!TCR_4(__kmp_init_parallel))
901     __kmp_parallel_initialize();
902 
903   __kmp_resume_if_soft_paused();
904 
905   tid = __kmp_tid_from_gtid(global_tid);
906   if (tid == filter) {
907     KMP_COUNT_BLOCK(OMP_MASKED);
908     KMP_PUSH_PARTITIONED_TIMER(OMP_masked);
909     status = 1;
910   }
911 
912 #if OMPT_SUPPORT && OMPT_OPTIONAL
913   if (status) {
914     if (ompt_enabled.ompt_callback_masked) {
915       kmp_info_t *this_thr = __kmp_threads[global_tid];
916       kmp_team_t *team = this_thr->th.th_team;
917       ompt_callbacks.ompt_callback(ompt_callback_masked)(
918           ompt_scope_begin, &(team->t.ompt_team_info.parallel_data),
919           &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
920           OMPT_GET_RETURN_ADDRESS(0));
921     }
922   }
923 #endif
924 
925   if (__kmp_env_consistency_check) {
926 #if KMP_USE_DYNAMIC_LOCK
927     if (status)
928       __kmp_push_sync(global_tid, ct_masked, loc, NULL, 0);
929     else
930       __kmp_check_sync(global_tid, ct_masked, loc, NULL, 0);
931 #else
932     if (status)
933       __kmp_push_sync(global_tid, ct_masked, loc, NULL);
934     else
935       __kmp_check_sync(global_tid, ct_masked, loc, NULL);
936 #endif
937   }
938 
939   return status;
940 }
941 
942 /*!
943 @ingroup WORK_SHARING
944 @param loc  source location information.
945 @param global_tid  global thread number .
946 
947 Mark the end of a <tt>masked</tt> region. This should only be called by the
948 thread that executes the <tt>masked</tt> region.
949 */
950 void __kmpc_end_masked(ident_t *loc, kmp_int32 global_tid) {
951   KC_TRACE(10, ("__kmpc_end_masked: called T#%d\n", global_tid));
952   __kmp_assert_valid_gtid(global_tid);
953   KMP_POP_PARTITIONED_TIMER();
954 
955 #if OMPT_SUPPORT && OMPT_OPTIONAL
956   kmp_info_t *this_thr = __kmp_threads[global_tid];
957   kmp_team_t *team = this_thr->th.th_team;
958   if (ompt_enabled.ompt_callback_masked) {
959     int tid = __kmp_tid_from_gtid(global_tid);
960     ompt_callbacks.ompt_callback(ompt_callback_masked)(
961         ompt_scope_end, &(team->t.ompt_team_info.parallel_data),
962         &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
963         OMPT_GET_RETURN_ADDRESS(0));
964   }
965 #endif
966 
967   if (__kmp_env_consistency_check) {
968     __kmp_pop_sync(global_tid, ct_masked, loc);
969   }
970 }
971 
972 /*!
973 @ingroup WORK_SHARING
974 @param loc  source location information.
975 @param gtid  global thread number.
976 
977 Start execution of an <tt>ordered</tt> construct.
978 */
979 void __kmpc_ordered(ident_t *loc, kmp_int32 gtid) {
980   int cid = 0;
981   kmp_info_t *th;
982   KMP_DEBUG_ASSERT(__kmp_init_serial);
983 
984   KC_TRACE(10, ("__kmpc_ordered: called T#%d\n", gtid));
985   __kmp_assert_valid_gtid(gtid);
986 
987   if (!TCR_4(__kmp_init_parallel))
988     __kmp_parallel_initialize();
989 
990   __kmp_resume_if_soft_paused();
991 
992 #if USE_ITT_BUILD
993   __kmp_itt_ordered_prep(gtid);
994 // TODO: ordered_wait_id
995 #endif /* USE_ITT_BUILD */
996 
997   th = __kmp_threads[gtid];
998 
999 #if OMPT_SUPPORT && OMPT_OPTIONAL
1000   kmp_team_t *team;
1001   ompt_wait_id_t lck;
1002   void *codeptr_ra;
1003   OMPT_STORE_RETURN_ADDRESS(gtid);
1004   if (ompt_enabled.enabled) {
1005     team = __kmp_team_from_gtid(gtid);
1006     lck = (ompt_wait_id_t)(uintptr_t)&team->t.t_ordered.dt.t_value;
1007     /* OMPT state update */
1008     th->th.ompt_thread_info.wait_id = lck;
1009     th->th.ompt_thread_info.state = ompt_state_wait_ordered;
1010 
1011     /* OMPT event callback */
1012     codeptr_ra = OMPT_LOAD_RETURN_ADDRESS(gtid);
1013     if (ompt_enabled.ompt_callback_mutex_acquire) {
1014       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
1015           ompt_mutex_ordered, omp_lock_hint_none, kmp_mutex_impl_spin, lck,
1016           codeptr_ra);
1017     }
1018   }
1019 #endif
1020 
1021   if (th->th.th_dispatch->th_deo_fcn != 0)
1022     (*th->th.th_dispatch->th_deo_fcn)(&gtid, &cid, loc);
1023   else
1024     __kmp_parallel_deo(&gtid, &cid, loc);
1025 
1026 #if OMPT_SUPPORT && OMPT_OPTIONAL
1027   if (ompt_enabled.enabled) {
1028     /* OMPT state update */
1029     th->th.ompt_thread_info.state = ompt_state_work_parallel;
1030     th->th.ompt_thread_info.wait_id = 0;
1031 
1032     /* OMPT event callback */
1033     if (ompt_enabled.ompt_callback_mutex_acquired) {
1034       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
1035           ompt_mutex_ordered, (ompt_wait_id_t)(uintptr_t)lck, codeptr_ra);
1036     }
1037   }
1038 #endif
1039 
1040 #if USE_ITT_BUILD
1041   __kmp_itt_ordered_start(gtid);
1042 #endif /* USE_ITT_BUILD */
1043 }
1044 
1045 /*!
1046 @ingroup WORK_SHARING
1047 @param loc  source location information.
1048 @param gtid  global thread number.
1049 
1050 End execution of an <tt>ordered</tt> construct.
1051 */
1052 void __kmpc_end_ordered(ident_t *loc, kmp_int32 gtid) {
1053   int cid = 0;
1054   kmp_info_t *th;
1055 
1056   KC_TRACE(10, ("__kmpc_end_ordered: called T#%d\n", gtid));
1057   __kmp_assert_valid_gtid(gtid);
1058 
1059 #if USE_ITT_BUILD
1060   __kmp_itt_ordered_end(gtid);
1061 // TODO: ordered_wait_id
1062 #endif /* USE_ITT_BUILD */
1063 
1064   th = __kmp_threads[gtid];
1065 
1066   if (th->th.th_dispatch->th_dxo_fcn != 0)
1067     (*th->th.th_dispatch->th_dxo_fcn)(&gtid, &cid, loc);
1068   else
1069     __kmp_parallel_dxo(&gtid, &cid, loc);
1070 
1071 #if OMPT_SUPPORT && OMPT_OPTIONAL
1072   OMPT_STORE_RETURN_ADDRESS(gtid);
1073   if (ompt_enabled.ompt_callback_mutex_released) {
1074     ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
1075         ompt_mutex_ordered,
1076         (ompt_wait_id_t)(uintptr_t)&__kmp_team_from_gtid(gtid)
1077             ->t.t_ordered.dt.t_value,
1078         OMPT_LOAD_RETURN_ADDRESS(gtid));
1079   }
1080 #endif
1081 }
1082 
1083 #if KMP_USE_DYNAMIC_LOCK
1084 
1085 static __forceinline void
1086 __kmp_init_indirect_csptr(kmp_critical_name *crit, ident_t const *loc,
1087                           kmp_int32 gtid, kmp_indirect_locktag_t tag) {
1088   // Pointer to the allocated indirect lock is written to crit, while indexing
1089   // is ignored.
1090   void *idx;
1091   kmp_indirect_lock_t **lck;
1092   lck = (kmp_indirect_lock_t **)crit;
1093   kmp_indirect_lock_t *ilk = __kmp_allocate_indirect_lock(&idx, gtid, tag);
1094   KMP_I_LOCK_FUNC(ilk, init)(ilk->lock);
1095   KMP_SET_I_LOCK_LOCATION(ilk, loc);
1096   KMP_SET_I_LOCK_FLAGS(ilk, kmp_lf_critical_section);
1097   KA_TRACE(20,
1098            ("__kmp_init_indirect_csptr: initialized indirect lock #%d\n", tag));
1099 #if USE_ITT_BUILD
1100   __kmp_itt_critical_creating(ilk->lock, loc);
1101 #endif
1102   int status = KMP_COMPARE_AND_STORE_PTR(lck, nullptr, ilk);
1103   if (status == 0) {
1104 #if USE_ITT_BUILD
1105     __kmp_itt_critical_destroyed(ilk->lock);
1106 #endif
1107     // We don't really need to destroy the unclaimed lock here since it will be
1108     // cleaned up at program exit.
1109     // KMP_D_LOCK_FUNC(&idx, destroy)((kmp_dyna_lock_t *)&idx);
1110   }
1111   KMP_DEBUG_ASSERT(*lck != NULL);
1112 }
1113 
1114 // Fast-path acquire tas lock
1115 #define KMP_ACQUIRE_TAS_LOCK(lock, gtid)                                       \
1116   {                                                                            \
1117     kmp_tas_lock_t *l = (kmp_tas_lock_t *)lock;                                \
1118     kmp_int32 tas_free = KMP_LOCK_FREE(tas);                                   \
1119     kmp_int32 tas_busy = KMP_LOCK_BUSY(gtid + 1, tas);                         \
1120     if (KMP_ATOMIC_LD_RLX(&l->lk.poll) != tas_free ||                          \
1121         !__kmp_atomic_compare_store_acq(&l->lk.poll, tas_free, tas_busy)) {    \
1122       kmp_uint32 spins;                                                        \
1123       KMP_FSYNC_PREPARE(l);                                                    \
1124       KMP_INIT_YIELD(spins);                                                   \
1125       kmp_backoff_t backoff = __kmp_spin_backoff_params;                       \
1126       do {                                                                     \
1127         if (TCR_4(__kmp_nth) >                                                 \
1128             (__kmp_avail_proc ? __kmp_avail_proc : __kmp_xproc)) {             \
1129           KMP_YIELD(TRUE);                                                     \
1130         } else {                                                               \
1131           KMP_YIELD_SPIN(spins);                                               \
1132         }                                                                      \
1133         __kmp_spin_backoff(&backoff);                                          \
1134       } while (                                                                \
1135           KMP_ATOMIC_LD_RLX(&l->lk.poll) != tas_free ||                        \
1136           !__kmp_atomic_compare_store_acq(&l->lk.poll, tas_free, tas_busy));   \
1137     }                                                                          \
1138     KMP_FSYNC_ACQUIRED(l);                                                     \
1139   }
1140 
1141 // Fast-path test tas lock
1142 #define KMP_TEST_TAS_LOCK(lock, gtid, rc)                                      \
1143   {                                                                            \
1144     kmp_tas_lock_t *l = (kmp_tas_lock_t *)lock;                                \
1145     kmp_int32 tas_free = KMP_LOCK_FREE(tas);                                   \
1146     kmp_int32 tas_busy = KMP_LOCK_BUSY(gtid + 1, tas);                         \
1147     rc = KMP_ATOMIC_LD_RLX(&l->lk.poll) == tas_free &&                         \
1148          __kmp_atomic_compare_store_acq(&l->lk.poll, tas_free, tas_busy);      \
1149   }
1150 
1151 // Fast-path release tas lock
1152 #define KMP_RELEASE_TAS_LOCK(lock, gtid)                                       \
1153   { KMP_ATOMIC_ST_REL(&((kmp_tas_lock_t *)lock)->lk.poll, KMP_LOCK_FREE(tas)); }
1154 
1155 #if KMP_USE_FUTEX
1156 
1157 #include <sys/syscall.h>
1158 #include <unistd.h>
1159 #ifndef FUTEX_WAIT
1160 #define FUTEX_WAIT 0
1161 #endif
1162 #ifndef FUTEX_WAKE
1163 #define FUTEX_WAKE 1
1164 #endif
1165 
1166 // Fast-path acquire futex lock
1167 #define KMP_ACQUIRE_FUTEX_LOCK(lock, gtid)                                     \
1168   {                                                                            \
1169     kmp_futex_lock_t *ftx = (kmp_futex_lock_t *)lock;                          \
1170     kmp_int32 gtid_code = (gtid + 1) << 1;                                     \
1171     KMP_MB();                                                                  \
1172     KMP_FSYNC_PREPARE(ftx);                                                    \
1173     kmp_int32 poll_val;                                                        \
1174     while ((poll_val = KMP_COMPARE_AND_STORE_RET32(                            \
1175                 &(ftx->lk.poll), KMP_LOCK_FREE(futex),                         \
1176                 KMP_LOCK_BUSY(gtid_code, futex))) != KMP_LOCK_FREE(futex)) {   \
1177       kmp_int32 cond = KMP_LOCK_STRIP(poll_val) & 1;                           \
1178       if (!cond) {                                                             \
1179         if (!KMP_COMPARE_AND_STORE_RET32(&(ftx->lk.poll), poll_val,            \
1180                                          poll_val |                            \
1181                                              KMP_LOCK_BUSY(1, futex))) {       \
1182           continue;                                                            \
1183         }                                                                      \
1184         poll_val |= KMP_LOCK_BUSY(1, futex);                                   \
1185       }                                                                        \
1186       kmp_int32 rc;                                                            \
1187       if ((rc = syscall(__NR_futex, &(ftx->lk.poll), FUTEX_WAIT, poll_val,     \
1188                         NULL, NULL, 0)) != 0) {                                \
1189         continue;                                                              \
1190       }                                                                        \
1191       gtid_code |= 1;                                                          \
1192     }                                                                          \
1193     KMP_FSYNC_ACQUIRED(ftx);                                                   \
1194   }
1195 
1196 // Fast-path test futex lock
1197 #define KMP_TEST_FUTEX_LOCK(lock, gtid, rc)                                    \
1198   {                                                                            \
1199     kmp_futex_lock_t *ftx = (kmp_futex_lock_t *)lock;                          \
1200     if (KMP_COMPARE_AND_STORE_ACQ32(&(ftx->lk.poll), KMP_LOCK_FREE(futex),     \
1201                                     KMP_LOCK_BUSY(gtid + 1 << 1, futex))) {    \
1202       KMP_FSYNC_ACQUIRED(ftx);                                                 \
1203       rc = TRUE;                                                               \
1204     } else {                                                                   \
1205       rc = FALSE;                                                              \
1206     }                                                                          \
1207   }
1208 
1209 // Fast-path release futex lock
1210 #define KMP_RELEASE_FUTEX_LOCK(lock, gtid)                                     \
1211   {                                                                            \
1212     kmp_futex_lock_t *ftx = (kmp_futex_lock_t *)lock;                          \
1213     KMP_MB();                                                                  \
1214     KMP_FSYNC_RELEASING(ftx);                                                  \
1215     kmp_int32 poll_val =                                                       \
1216         KMP_XCHG_FIXED32(&(ftx->lk.poll), KMP_LOCK_FREE(futex));               \
1217     if (KMP_LOCK_STRIP(poll_val) & 1) {                                        \
1218       syscall(__NR_futex, &(ftx->lk.poll), FUTEX_WAKE,                         \
1219               KMP_LOCK_BUSY(1, futex), NULL, NULL, 0);                         \
1220     }                                                                          \
1221     KMP_MB();                                                                  \
1222     KMP_YIELD_OVERSUB();                                                       \
1223   }
1224 
1225 #endif // KMP_USE_FUTEX
1226 
1227 #else // KMP_USE_DYNAMIC_LOCK
1228 
1229 static kmp_user_lock_p __kmp_get_critical_section_ptr(kmp_critical_name *crit,
1230                                                       ident_t const *loc,
1231                                                       kmp_int32 gtid) {
1232   kmp_user_lock_p *lck_pp = (kmp_user_lock_p *)crit;
1233 
1234   // Because of the double-check, the following load doesn't need to be volatile
1235   kmp_user_lock_p lck = (kmp_user_lock_p)TCR_PTR(*lck_pp);
1236 
1237   if (lck == NULL) {
1238     void *idx;
1239 
1240     // Allocate & initialize the lock.
1241     // Remember alloc'ed locks in table in order to free them in __kmp_cleanup()
1242     lck = __kmp_user_lock_allocate(&idx, gtid, kmp_lf_critical_section);
1243     __kmp_init_user_lock_with_checks(lck);
1244     __kmp_set_user_lock_location(lck, loc);
1245 #if USE_ITT_BUILD
1246     __kmp_itt_critical_creating(lck);
1247 // __kmp_itt_critical_creating() should be called *before* the first usage
1248 // of underlying lock. It is the only place where we can guarantee it. There
1249 // are chances the lock will destroyed with no usage, but it is not a
1250 // problem, because this is not real event seen by user but rather setting
1251 // name for object (lock). See more details in kmp_itt.h.
1252 #endif /* USE_ITT_BUILD */
1253 
1254     // Use a cmpxchg instruction to slam the start of the critical section with
1255     // the lock pointer.  If another thread beat us to it, deallocate the lock,
1256     // and use the lock that the other thread allocated.
1257     int status = KMP_COMPARE_AND_STORE_PTR(lck_pp, 0, lck);
1258 
1259     if (status == 0) {
1260 // Deallocate the lock and reload the value.
1261 #if USE_ITT_BUILD
1262       __kmp_itt_critical_destroyed(lck);
1263 // Let ITT know the lock is destroyed and the same memory location may be reused
1264 // for another purpose.
1265 #endif /* USE_ITT_BUILD */
1266       __kmp_destroy_user_lock_with_checks(lck);
1267       __kmp_user_lock_free(&idx, gtid, lck);
1268       lck = (kmp_user_lock_p)TCR_PTR(*lck_pp);
1269       KMP_DEBUG_ASSERT(lck != NULL);
1270     }
1271   }
1272   return lck;
1273 }
1274 
1275 #endif // KMP_USE_DYNAMIC_LOCK
1276 
1277 /*!
1278 @ingroup WORK_SHARING
1279 @param loc  source location information.
1280 @param global_tid  global thread number.
1281 @param crit identity of the critical section. This could be a pointer to a lock
1282 associated with the critical section, or some other suitably unique value.
1283 
1284 Enter code protected by a `critical` construct.
1285 This function blocks until the executing thread can enter the critical section.
1286 */
1287 void __kmpc_critical(ident_t *loc, kmp_int32 global_tid,
1288                      kmp_critical_name *crit) {
1289 #if KMP_USE_DYNAMIC_LOCK
1290 #if OMPT_SUPPORT && OMPT_OPTIONAL
1291   OMPT_STORE_RETURN_ADDRESS(global_tid);
1292 #endif // OMPT_SUPPORT
1293   __kmpc_critical_with_hint(loc, global_tid, crit, omp_lock_hint_none);
1294 #else
1295   KMP_COUNT_BLOCK(OMP_CRITICAL);
1296 #if OMPT_SUPPORT && OMPT_OPTIONAL
1297   ompt_state_t prev_state = ompt_state_undefined;
1298   ompt_thread_info_t ti;
1299 #endif
1300   kmp_user_lock_p lck;
1301 
1302   KC_TRACE(10, ("__kmpc_critical: called T#%d\n", global_tid));
1303   __kmp_assert_valid_gtid(global_tid);
1304 
1305   // TODO: add THR_OVHD_STATE
1306 
1307   KMP_PUSH_PARTITIONED_TIMER(OMP_critical_wait);
1308   KMP_CHECK_USER_LOCK_INIT();
1309 
1310   if ((__kmp_user_lock_kind == lk_tas) &&
1311       (sizeof(lck->tas.lk.poll) <= OMP_CRITICAL_SIZE)) {
1312     lck = (kmp_user_lock_p)crit;
1313   }
1314 #if KMP_USE_FUTEX
1315   else if ((__kmp_user_lock_kind == lk_futex) &&
1316            (sizeof(lck->futex.lk.poll) <= OMP_CRITICAL_SIZE)) {
1317     lck = (kmp_user_lock_p)crit;
1318   }
1319 #endif
1320   else { // ticket, queuing or drdpa
1321     lck = __kmp_get_critical_section_ptr(crit, loc, global_tid);
1322   }
1323 
1324   if (__kmp_env_consistency_check)
1325     __kmp_push_sync(global_tid, ct_critical, loc, lck);
1326 
1327     // since the critical directive binds to all threads, not just the current
1328     // team we have to check this even if we are in a serialized team.
1329     // also, even if we are the uber thread, we still have to conduct the lock,
1330     // as we have to contend with sibling threads.
1331 
1332 #if USE_ITT_BUILD
1333   __kmp_itt_critical_acquiring(lck);
1334 #endif /* USE_ITT_BUILD */
1335 #if OMPT_SUPPORT && OMPT_OPTIONAL
1336   OMPT_STORE_RETURN_ADDRESS(gtid);
1337   void *codeptr_ra = NULL;
1338   if (ompt_enabled.enabled) {
1339     ti = __kmp_threads[global_tid]->th.ompt_thread_info;
1340     /* OMPT state update */
1341     prev_state = ti.state;
1342     ti.wait_id = (ompt_wait_id_t)(uintptr_t)lck;
1343     ti.state = ompt_state_wait_critical;
1344 
1345     /* OMPT event callback */
1346     codeptr_ra = OMPT_LOAD_RETURN_ADDRESS(gtid);
1347     if (ompt_enabled.ompt_callback_mutex_acquire) {
1348       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
1349           ompt_mutex_critical, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
1350           (ompt_wait_id_t)(uintptr_t)lck, codeptr_ra);
1351     }
1352   }
1353 #endif
1354   // Value of 'crit' should be good for using as a critical_id of the critical
1355   // section directive.
1356   __kmp_acquire_user_lock_with_checks(lck, global_tid);
1357 
1358 #if USE_ITT_BUILD
1359   __kmp_itt_critical_acquired(lck);
1360 #endif /* USE_ITT_BUILD */
1361 #if OMPT_SUPPORT && OMPT_OPTIONAL
1362   if (ompt_enabled.enabled) {
1363     /* OMPT state update */
1364     ti.state = prev_state;
1365     ti.wait_id = 0;
1366 
1367     /* OMPT event callback */
1368     if (ompt_enabled.ompt_callback_mutex_acquired) {
1369       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
1370           ompt_mutex_critical, (ompt_wait_id_t)(uintptr_t)lck, codeptr_ra);
1371     }
1372   }
1373 #endif
1374   KMP_POP_PARTITIONED_TIMER();
1375 
1376   KMP_PUSH_PARTITIONED_TIMER(OMP_critical);
1377   KA_TRACE(15, ("__kmpc_critical: done T#%d\n", global_tid));
1378 #endif // KMP_USE_DYNAMIC_LOCK
1379 }
1380 
1381 #if KMP_USE_DYNAMIC_LOCK
1382 
1383 // Converts the given hint to an internal lock implementation
1384 static __forceinline kmp_dyna_lockseq_t __kmp_map_hint_to_lock(uintptr_t hint) {
1385 #if KMP_USE_TSX
1386 #define KMP_TSX_LOCK(seq) lockseq_##seq
1387 #else
1388 #define KMP_TSX_LOCK(seq) __kmp_user_lock_seq
1389 #endif
1390 
1391 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
1392 #define KMP_CPUINFO_RTM (__kmp_cpuinfo.flags.rtm)
1393 #else
1394 #define KMP_CPUINFO_RTM 0
1395 #endif
1396 
1397   // Hints that do not require further logic
1398   if (hint & kmp_lock_hint_hle)
1399     return KMP_TSX_LOCK(hle);
1400   if (hint & kmp_lock_hint_rtm)
1401     return KMP_CPUINFO_RTM ? KMP_TSX_LOCK(rtm_queuing) : __kmp_user_lock_seq;
1402   if (hint & kmp_lock_hint_adaptive)
1403     return KMP_CPUINFO_RTM ? KMP_TSX_LOCK(adaptive) : __kmp_user_lock_seq;
1404 
1405   // Rule out conflicting hints first by returning the default lock
1406   if ((hint & omp_lock_hint_contended) && (hint & omp_lock_hint_uncontended))
1407     return __kmp_user_lock_seq;
1408   if ((hint & omp_lock_hint_speculative) &&
1409       (hint & omp_lock_hint_nonspeculative))
1410     return __kmp_user_lock_seq;
1411 
1412   // Do not even consider speculation when it appears to be contended
1413   if (hint & omp_lock_hint_contended)
1414     return lockseq_queuing;
1415 
1416   // Uncontended lock without speculation
1417   if ((hint & omp_lock_hint_uncontended) && !(hint & omp_lock_hint_speculative))
1418     return lockseq_tas;
1419 
1420   // Use RTM lock for speculation
1421   if (hint & omp_lock_hint_speculative)
1422     return KMP_CPUINFO_RTM ? KMP_TSX_LOCK(rtm_spin) : __kmp_user_lock_seq;
1423 
1424   return __kmp_user_lock_seq;
1425 }
1426 
1427 #if OMPT_SUPPORT && OMPT_OPTIONAL
1428 #if KMP_USE_DYNAMIC_LOCK
1429 static kmp_mutex_impl_t
1430 __ompt_get_mutex_impl_type(void *user_lock, kmp_indirect_lock_t *ilock = 0) {
1431   if (user_lock) {
1432     switch (KMP_EXTRACT_D_TAG(user_lock)) {
1433     case 0:
1434       break;
1435 #if KMP_USE_FUTEX
1436     case locktag_futex:
1437       return kmp_mutex_impl_queuing;
1438 #endif
1439     case locktag_tas:
1440       return kmp_mutex_impl_spin;
1441 #if KMP_USE_TSX
1442     case locktag_hle:
1443     case locktag_rtm_spin:
1444       return kmp_mutex_impl_speculative;
1445 #endif
1446     default:
1447       return kmp_mutex_impl_none;
1448     }
1449     ilock = KMP_LOOKUP_I_LOCK(user_lock);
1450   }
1451   KMP_ASSERT(ilock);
1452   switch (ilock->type) {
1453 #if KMP_USE_TSX
1454   case locktag_adaptive:
1455   case locktag_rtm_queuing:
1456     return kmp_mutex_impl_speculative;
1457 #endif
1458   case locktag_nested_tas:
1459     return kmp_mutex_impl_spin;
1460 #if KMP_USE_FUTEX
1461   case locktag_nested_futex:
1462 #endif
1463   case locktag_ticket:
1464   case locktag_queuing:
1465   case locktag_drdpa:
1466   case locktag_nested_ticket:
1467   case locktag_nested_queuing:
1468   case locktag_nested_drdpa:
1469     return kmp_mutex_impl_queuing;
1470   default:
1471     return kmp_mutex_impl_none;
1472   }
1473 }
1474 #else
1475 // For locks without dynamic binding
1476 static kmp_mutex_impl_t __ompt_get_mutex_impl_type() {
1477   switch (__kmp_user_lock_kind) {
1478   case lk_tas:
1479     return kmp_mutex_impl_spin;
1480 #if KMP_USE_FUTEX
1481   case lk_futex:
1482 #endif
1483   case lk_ticket:
1484   case lk_queuing:
1485   case lk_drdpa:
1486     return kmp_mutex_impl_queuing;
1487 #if KMP_USE_TSX
1488   case lk_hle:
1489   case lk_rtm_queuing:
1490   case lk_rtm_spin:
1491   case lk_adaptive:
1492     return kmp_mutex_impl_speculative;
1493 #endif
1494   default:
1495     return kmp_mutex_impl_none;
1496   }
1497 }
1498 #endif // KMP_USE_DYNAMIC_LOCK
1499 #endif // OMPT_SUPPORT && OMPT_OPTIONAL
1500 
1501 /*!
1502 @ingroup WORK_SHARING
1503 @param loc  source location information.
1504 @param global_tid  global thread number.
1505 @param crit identity of the critical section. This could be a pointer to a lock
1506 associated with the critical section, or some other suitably unique value.
1507 @param hint the lock hint.
1508 
1509 Enter code protected by a `critical` construct with a hint. The hint value is
1510 used to suggest a lock implementation. This function blocks until the executing
1511 thread can enter the critical section unless the hint suggests use of
1512 speculative execution and the hardware supports it.
1513 */
1514 void __kmpc_critical_with_hint(ident_t *loc, kmp_int32 global_tid,
1515                                kmp_critical_name *crit, uint32_t hint) {
1516   KMP_COUNT_BLOCK(OMP_CRITICAL);
1517   kmp_user_lock_p lck;
1518 #if OMPT_SUPPORT && OMPT_OPTIONAL
1519   ompt_state_t prev_state = ompt_state_undefined;
1520   ompt_thread_info_t ti;
1521   // This is the case, if called from __kmpc_critical:
1522   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(global_tid);
1523   if (!codeptr)
1524     codeptr = OMPT_GET_RETURN_ADDRESS(0);
1525 #endif
1526 
1527   KC_TRACE(10, ("__kmpc_critical: called T#%d\n", global_tid));
1528   __kmp_assert_valid_gtid(global_tid);
1529 
1530   kmp_dyna_lock_t *lk = (kmp_dyna_lock_t *)crit;
1531   // Check if it is initialized.
1532   KMP_PUSH_PARTITIONED_TIMER(OMP_critical_wait);
1533   kmp_dyna_lockseq_t lockseq = __kmp_map_hint_to_lock(hint);
1534   if (*lk == 0) {
1535     if (KMP_IS_D_LOCK(lockseq)) {
1536       KMP_COMPARE_AND_STORE_ACQ32(
1537           (volatile kmp_int32 *)&((kmp_base_tas_lock_t *)crit)->poll, 0,
1538           KMP_GET_D_TAG(lockseq));
1539     } else {
1540       __kmp_init_indirect_csptr(crit, loc, global_tid, KMP_GET_I_TAG(lockseq));
1541     }
1542   }
1543   // Branch for accessing the actual lock object and set operation. This
1544   // branching is inevitable since this lock initialization does not follow the
1545   // normal dispatch path (lock table is not used).
1546   if (KMP_EXTRACT_D_TAG(lk) != 0) {
1547     lck = (kmp_user_lock_p)lk;
1548     if (__kmp_env_consistency_check) {
1549       __kmp_push_sync(global_tid, ct_critical, loc, lck,
1550                       __kmp_map_hint_to_lock(hint));
1551     }
1552 #if USE_ITT_BUILD
1553     __kmp_itt_critical_acquiring(lck);
1554 #endif
1555 #if OMPT_SUPPORT && OMPT_OPTIONAL
1556     if (ompt_enabled.enabled) {
1557       ti = __kmp_threads[global_tid]->th.ompt_thread_info;
1558       /* OMPT state update */
1559       prev_state = ti.state;
1560       ti.wait_id = (ompt_wait_id_t)(uintptr_t)lck;
1561       ti.state = ompt_state_wait_critical;
1562 
1563       /* OMPT event callback */
1564       if (ompt_enabled.ompt_callback_mutex_acquire) {
1565         ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
1566             ompt_mutex_critical, (unsigned int)hint,
1567             __ompt_get_mutex_impl_type(crit), (ompt_wait_id_t)(uintptr_t)lck,
1568             codeptr);
1569       }
1570     }
1571 #endif
1572 #if KMP_USE_INLINED_TAS
1573     if (lockseq == lockseq_tas && !__kmp_env_consistency_check) {
1574       KMP_ACQUIRE_TAS_LOCK(lck, global_tid);
1575     } else
1576 #elif KMP_USE_INLINED_FUTEX
1577     if (lockseq == lockseq_futex && !__kmp_env_consistency_check) {
1578       KMP_ACQUIRE_FUTEX_LOCK(lck, global_tid);
1579     } else
1580 #endif
1581     {
1582       KMP_D_LOCK_FUNC(lk, set)(lk, global_tid);
1583     }
1584   } else {
1585     kmp_indirect_lock_t *ilk = *((kmp_indirect_lock_t **)lk);
1586     lck = ilk->lock;
1587     if (__kmp_env_consistency_check) {
1588       __kmp_push_sync(global_tid, ct_critical, loc, lck,
1589                       __kmp_map_hint_to_lock(hint));
1590     }
1591 #if USE_ITT_BUILD
1592     __kmp_itt_critical_acquiring(lck);
1593 #endif
1594 #if OMPT_SUPPORT && OMPT_OPTIONAL
1595     if (ompt_enabled.enabled) {
1596       ti = __kmp_threads[global_tid]->th.ompt_thread_info;
1597       /* OMPT state update */
1598       prev_state = ti.state;
1599       ti.wait_id = (ompt_wait_id_t)(uintptr_t)lck;
1600       ti.state = ompt_state_wait_critical;
1601 
1602       /* OMPT event callback */
1603       if (ompt_enabled.ompt_callback_mutex_acquire) {
1604         ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
1605             ompt_mutex_critical, (unsigned int)hint,
1606             __ompt_get_mutex_impl_type(0, ilk), (ompt_wait_id_t)(uintptr_t)lck,
1607             codeptr);
1608       }
1609     }
1610 #endif
1611     KMP_I_LOCK_FUNC(ilk, set)(lck, global_tid);
1612   }
1613   KMP_POP_PARTITIONED_TIMER();
1614 
1615 #if USE_ITT_BUILD
1616   __kmp_itt_critical_acquired(lck);
1617 #endif /* USE_ITT_BUILD */
1618 #if OMPT_SUPPORT && OMPT_OPTIONAL
1619   if (ompt_enabled.enabled) {
1620     /* OMPT state update */
1621     ti.state = prev_state;
1622     ti.wait_id = 0;
1623 
1624     /* OMPT event callback */
1625     if (ompt_enabled.ompt_callback_mutex_acquired) {
1626       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
1627           ompt_mutex_critical, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
1628     }
1629   }
1630 #endif
1631 
1632   KMP_PUSH_PARTITIONED_TIMER(OMP_critical);
1633   KA_TRACE(15, ("__kmpc_critical: done T#%d\n", global_tid));
1634 } // __kmpc_critical_with_hint
1635 
1636 #endif // KMP_USE_DYNAMIC_LOCK
1637 
1638 /*!
1639 @ingroup WORK_SHARING
1640 @param loc  source location information.
1641 @param global_tid  global thread number .
1642 @param crit identity of the critical section. This could be a pointer to a lock
1643 associated with the critical section, or some other suitably unique value.
1644 
1645 Leave a critical section, releasing any lock that was held during its execution.
1646 */
1647 void __kmpc_end_critical(ident_t *loc, kmp_int32 global_tid,
1648                          kmp_critical_name *crit) {
1649   kmp_user_lock_p lck;
1650 
1651   KC_TRACE(10, ("__kmpc_end_critical: called T#%d\n", global_tid));
1652 
1653 #if KMP_USE_DYNAMIC_LOCK
1654   int locktag = KMP_EXTRACT_D_TAG(crit);
1655   if (locktag) {
1656     lck = (kmp_user_lock_p)crit;
1657     KMP_ASSERT(lck != NULL);
1658     if (__kmp_env_consistency_check) {
1659       __kmp_pop_sync(global_tid, ct_critical, loc);
1660     }
1661 #if USE_ITT_BUILD
1662     __kmp_itt_critical_releasing(lck);
1663 #endif
1664 #if KMP_USE_INLINED_TAS
1665     if (locktag == locktag_tas && !__kmp_env_consistency_check) {
1666       KMP_RELEASE_TAS_LOCK(lck, global_tid);
1667     } else
1668 #elif KMP_USE_INLINED_FUTEX
1669     if (locktag == locktag_futex && !__kmp_env_consistency_check) {
1670       KMP_RELEASE_FUTEX_LOCK(lck, global_tid);
1671     } else
1672 #endif
1673     {
1674       KMP_D_LOCK_FUNC(lck, unset)((kmp_dyna_lock_t *)lck, global_tid);
1675     }
1676   } else {
1677     kmp_indirect_lock_t *ilk =
1678         (kmp_indirect_lock_t *)TCR_PTR(*((kmp_indirect_lock_t **)crit));
1679     KMP_ASSERT(ilk != NULL);
1680     lck = ilk->lock;
1681     if (__kmp_env_consistency_check) {
1682       __kmp_pop_sync(global_tid, ct_critical, loc);
1683     }
1684 #if USE_ITT_BUILD
1685     __kmp_itt_critical_releasing(lck);
1686 #endif
1687     KMP_I_LOCK_FUNC(ilk, unset)(lck, global_tid);
1688   }
1689 
1690 #else // KMP_USE_DYNAMIC_LOCK
1691 
1692   if ((__kmp_user_lock_kind == lk_tas) &&
1693       (sizeof(lck->tas.lk.poll) <= OMP_CRITICAL_SIZE)) {
1694     lck = (kmp_user_lock_p)crit;
1695   }
1696 #if KMP_USE_FUTEX
1697   else if ((__kmp_user_lock_kind == lk_futex) &&
1698            (sizeof(lck->futex.lk.poll) <= OMP_CRITICAL_SIZE)) {
1699     lck = (kmp_user_lock_p)crit;
1700   }
1701 #endif
1702   else { // ticket, queuing or drdpa
1703     lck = (kmp_user_lock_p)TCR_PTR(*((kmp_user_lock_p *)crit));
1704   }
1705 
1706   KMP_ASSERT(lck != NULL);
1707 
1708   if (__kmp_env_consistency_check)
1709     __kmp_pop_sync(global_tid, ct_critical, loc);
1710 
1711 #if USE_ITT_BUILD
1712   __kmp_itt_critical_releasing(lck);
1713 #endif /* USE_ITT_BUILD */
1714   // Value of 'crit' should be good for using as a critical_id of the critical
1715   // section directive.
1716   __kmp_release_user_lock_with_checks(lck, global_tid);
1717 
1718 #endif // KMP_USE_DYNAMIC_LOCK
1719 
1720 #if OMPT_SUPPORT && OMPT_OPTIONAL
1721   /* OMPT release event triggers after lock is released; place here to trigger
1722    * for all #if branches */
1723   OMPT_STORE_RETURN_ADDRESS(global_tid);
1724   if (ompt_enabled.ompt_callback_mutex_released) {
1725     ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
1726         ompt_mutex_critical, (ompt_wait_id_t)(uintptr_t)lck,
1727         OMPT_LOAD_RETURN_ADDRESS(0));
1728   }
1729 #endif
1730 
1731   KMP_POP_PARTITIONED_TIMER();
1732   KA_TRACE(15, ("__kmpc_end_critical: done T#%d\n", global_tid));
1733 }
1734 
1735 /*!
1736 @ingroup SYNCHRONIZATION
1737 @param loc source location information
1738 @param global_tid thread id.
1739 @return one if the thread should execute the master block, zero otherwise
1740 
1741 Start execution of a combined barrier and master. The barrier is executed inside
1742 this function.
1743 */
1744 kmp_int32 __kmpc_barrier_master(ident_t *loc, kmp_int32 global_tid) {
1745   int status;
1746   KC_TRACE(10, ("__kmpc_barrier_master: called T#%d\n", global_tid));
1747   __kmp_assert_valid_gtid(global_tid);
1748 
1749   if (!TCR_4(__kmp_init_parallel))
1750     __kmp_parallel_initialize();
1751 
1752   __kmp_resume_if_soft_paused();
1753 
1754   if (__kmp_env_consistency_check)
1755     __kmp_check_barrier(global_tid, ct_barrier, loc);
1756 
1757 #if OMPT_SUPPORT
1758   ompt_frame_t *ompt_frame;
1759   if (ompt_enabled.enabled) {
1760     __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
1761     if (ompt_frame->enter_frame.ptr == NULL)
1762       ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1763   }
1764   OMPT_STORE_RETURN_ADDRESS(global_tid);
1765 #endif
1766 #if USE_ITT_NOTIFY
1767   __kmp_threads[global_tid]->th.th_ident = loc;
1768 #endif
1769   status = __kmp_barrier(bs_plain_barrier, global_tid, TRUE, 0, NULL, NULL);
1770 #if OMPT_SUPPORT && OMPT_OPTIONAL
1771   if (ompt_enabled.enabled) {
1772     ompt_frame->enter_frame = ompt_data_none;
1773   }
1774 #endif
1775 
1776   return (status != 0) ? 0 : 1;
1777 }
1778 
1779 /*!
1780 @ingroup SYNCHRONIZATION
1781 @param loc source location information
1782 @param global_tid thread id.
1783 
1784 Complete the execution of a combined barrier and master. This function should
1785 only be called at the completion of the <tt>master</tt> code. Other threads will
1786 still be waiting at the barrier and this call releases them.
1787 */
1788 void __kmpc_end_barrier_master(ident_t *loc, kmp_int32 global_tid) {
1789   KC_TRACE(10, ("__kmpc_end_barrier_master: called T#%d\n", global_tid));
1790   __kmp_assert_valid_gtid(global_tid);
1791   __kmp_end_split_barrier(bs_plain_barrier, global_tid);
1792 }
1793 
1794 /*!
1795 @ingroup SYNCHRONIZATION
1796 @param loc source location information
1797 @param global_tid thread id.
1798 @return one if the thread should execute the master block, zero otherwise
1799 
1800 Start execution of a combined barrier and master(nowait) construct.
1801 The barrier is executed inside this function.
1802 There is no equivalent "end" function, since the
1803 */
1804 kmp_int32 __kmpc_barrier_master_nowait(ident_t *loc, kmp_int32 global_tid) {
1805   kmp_int32 ret;
1806   KC_TRACE(10, ("__kmpc_barrier_master_nowait: called T#%d\n", global_tid));
1807   __kmp_assert_valid_gtid(global_tid);
1808 
1809   if (!TCR_4(__kmp_init_parallel))
1810     __kmp_parallel_initialize();
1811 
1812   __kmp_resume_if_soft_paused();
1813 
1814   if (__kmp_env_consistency_check) {
1815     if (loc == 0) {
1816       KMP_WARNING(ConstructIdentInvalid); // ??? What does it mean for the user?
1817     }
1818     __kmp_check_barrier(global_tid, ct_barrier, loc);
1819   }
1820 
1821 #if OMPT_SUPPORT
1822   ompt_frame_t *ompt_frame;
1823   if (ompt_enabled.enabled) {
1824     __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
1825     if (ompt_frame->enter_frame.ptr == NULL)
1826       ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1827   }
1828   OMPT_STORE_RETURN_ADDRESS(global_tid);
1829 #endif
1830 #if USE_ITT_NOTIFY
1831   __kmp_threads[global_tid]->th.th_ident = loc;
1832 #endif
1833   __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
1834 #if OMPT_SUPPORT && OMPT_OPTIONAL
1835   if (ompt_enabled.enabled) {
1836     ompt_frame->enter_frame = ompt_data_none;
1837   }
1838 #endif
1839 
1840   ret = __kmpc_master(loc, global_tid);
1841 
1842   if (__kmp_env_consistency_check) {
1843     /*  there's no __kmpc_end_master called; so the (stats) */
1844     /*  actions of __kmpc_end_master are done here          */
1845     if (ret) {
1846       /* only one thread should do the pop since only */
1847       /* one did the push (see __kmpc_master())       */
1848       __kmp_pop_sync(global_tid, ct_master, loc);
1849     }
1850   }
1851 
1852   return (ret);
1853 }
1854 
1855 /* The BARRIER for a SINGLE process section is always explicit   */
1856 /*!
1857 @ingroup WORK_SHARING
1858 @param loc  source location information
1859 @param global_tid  global thread number
1860 @return One if this thread should execute the single construct, zero otherwise.
1861 
1862 Test whether to execute a <tt>single</tt> construct.
1863 There are no implicit barriers in the two "single" calls, rather the compiler
1864 should introduce an explicit barrier if it is required.
1865 */
1866 
1867 kmp_int32 __kmpc_single(ident_t *loc, kmp_int32 global_tid) {
1868   __kmp_assert_valid_gtid(global_tid);
1869   kmp_int32 rc = __kmp_enter_single(global_tid, loc, TRUE);
1870 
1871   if (rc) {
1872     // We are going to execute the single statement, so we should count it.
1873     KMP_COUNT_BLOCK(OMP_SINGLE);
1874     KMP_PUSH_PARTITIONED_TIMER(OMP_single);
1875   }
1876 
1877 #if OMPT_SUPPORT && OMPT_OPTIONAL
1878   kmp_info_t *this_thr = __kmp_threads[global_tid];
1879   kmp_team_t *team = this_thr->th.th_team;
1880   int tid = __kmp_tid_from_gtid(global_tid);
1881 
1882   if (ompt_enabled.enabled) {
1883     if (rc) {
1884       if (ompt_enabled.ompt_callback_work) {
1885         ompt_callbacks.ompt_callback(ompt_callback_work)(
1886             ompt_work_single_executor, ompt_scope_begin,
1887             &(team->t.ompt_team_info.parallel_data),
1888             &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
1889             1, OMPT_GET_RETURN_ADDRESS(0));
1890       }
1891     } else {
1892       if (ompt_enabled.ompt_callback_work) {
1893         ompt_callbacks.ompt_callback(ompt_callback_work)(
1894             ompt_work_single_other, ompt_scope_begin,
1895             &(team->t.ompt_team_info.parallel_data),
1896             &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
1897             1, OMPT_GET_RETURN_ADDRESS(0));
1898         ompt_callbacks.ompt_callback(ompt_callback_work)(
1899             ompt_work_single_other, ompt_scope_end,
1900             &(team->t.ompt_team_info.parallel_data),
1901             &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
1902             1, OMPT_GET_RETURN_ADDRESS(0));
1903       }
1904     }
1905   }
1906 #endif
1907 
1908   return rc;
1909 }
1910 
1911 /*!
1912 @ingroup WORK_SHARING
1913 @param loc  source location information
1914 @param global_tid  global thread number
1915 
1916 Mark the end of a <tt>single</tt> construct.  This function should
1917 only be called by the thread that executed the block of code protected
1918 by the `single` construct.
1919 */
1920 void __kmpc_end_single(ident_t *loc, kmp_int32 global_tid) {
1921   __kmp_assert_valid_gtid(global_tid);
1922   __kmp_exit_single(global_tid);
1923   KMP_POP_PARTITIONED_TIMER();
1924 
1925 #if OMPT_SUPPORT && OMPT_OPTIONAL
1926   kmp_info_t *this_thr = __kmp_threads[global_tid];
1927   kmp_team_t *team = this_thr->th.th_team;
1928   int tid = __kmp_tid_from_gtid(global_tid);
1929 
1930   if (ompt_enabled.ompt_callback_work) {
1931     ompt_callbacks.ompt_callback(ompt_callback_work)(
1932         ompt_work_single_executor, ompt_scope_end,
1933         &(team->t.ompt_team_info.parallel_data),
1934         &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data), 1,
1935         OMPT_GET_RETURN_ADDRESS(0));
1936   }
1937 #endif
1938 }
1939 
1940 /*!
1941 @ingroup WORK_SHARING
1942 @param loc Source location
1943 @param global_tid Global thread id
1944 
1945 Mark the end of a statically scheduled loop.
1946 */
1947 void __kmpc_for_static_fini(ident_t *loc, kmp_int32 global_tid) {
1948   KMP_POP_PARTITIONED_TIMER();
1949   KE_TRACE(10, ("__kmpc_for_static_fini called T#%d\n", global_tid));
1950 
1951 #if OMPT_SUPPORT && OMPT_OPTIONAL
1952   if (ompt_enabled.ompt_callback_work) {
1953     ompt_work_t ompt_work_type = ompt_work_loop;
1954     ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
1955     ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
1956     // Determine workshare type
1957     if (loc != NULL) {
1958       if ((loc->flags & KMP_IDENT_WORK_LOOP) != 0) {
1959         ompt_work_type = ompt_work_loop;
1960       } else if ((loc->flags & KMP_IDENT_WORK_SECTIONS) != 0) {
1961         ompt_work_type = ompt_work_sections;
1962       } else if ((loc->flags & KMP_IDENT_WORK_DISTRIBUTE) != 0) {
1963         ompt_work_type = ompt_work_distribute;
1964       } else {
1965         // use default set above.
1966         // a warning about this case is provided in __kmpc_for_static_init
1967       }
1968       KMP_DEBUG_ASSERT(ompt_work_type);
1969     }
1970     ompt_callbacks.ompt_callback(ompt_callback_work)(
1971         ompt_work_type, ompt_scope_end, &(team_info->parallel_data),
1972         &(task_info->task_data), 0, OMPT_GET_RETURN_ADDRESS(0));
1973   }
1974 #endif
1975   if (__kmp_env_consistency_check)
1976     __kmp_pop_workshare(global_tid, ct_pdo, loc);
1977 }
1978 
1979 // User routines which take C-style arguments (call by value)
1980 // different from the Fortran equivalent routines
1981 
1982 void ompc_set_num_threads(int arg) {
1983   // !!!!! TODO: check the per-task binding
1984   __kmp_set_num_threads(arg, __kmp_entry_gtid());
1985 }
1986 
1987 void ompc_set_dynamic(int flag) {
1988   kmp_info_t *thread;
1989 
1990   /* For the thread-private implementation of the internal controls */
1991   thread = __kmp_entry_thread();
1992 
1993   __kmp_save_internal_controls(thread);
1994 
1995   set__dynamic(thread, flag ? true : false);
1996 }
1997 
1998 void ompc_set_nested(int flag) {
1999   kmp_info_t *thread;
2000 
2001   /* For the thread-private internal controls implementation */
2002   thread = __kmp_entry_thread();
2003 
2004   __kmp_save_internal_controls(thread);
2005 
2006   set__max_active_levels(thread, flag ? __kmp_dflt_max_active_levels : 1);
2007 }
2008 
2009 void ompc_set_max_active_levels(int max_active_levels) {
2010   /* TO DO */
2011   /* we want per-task implementation of this internal control */
2012 
2013   /* For the per-thread internal controls implementation */
2014   __kmp_set_max_active_levels(__kmp_entry_gtid(), max_active_levels);
2015 }
2016 
2017 void ompc_set_schedule(omp_sched_t kind, int modifier) {
2018   // !!!!! TODO: check the per-task binding
2019   __kmp_set_schedule(__kmp_entry_gtid(), (kmp_sched_t)kind, modifier);
2020 }
2021 
2022 int ompc_get_ancestor_thread_num(int level) {
2023   return __kmp_get_ancestor_thread_num(__kmp_entry_gtid(), level);
2024 }
2025 
2026 int ompc_get_team_size(int level) {
2027   return __kmp_get_team_size(__kmp_entry_gtid(), level);
2028 }
2029 
2030 /* OpenMP 5.0 Affinity Format API */
2031 void KMP_EXPAND_NAME(ompc_set_affinity_format)(char const *format) {
2032   if (!__kmp_init_serial) {
2033     __kmp_serial_initialize();
2034   }
2035   __kmp_strncpy_truncate(__kmp_affinity_format, KMP_AFFINITY_FORMAT_SIZE,
2036                          format, KMP_STRLEN(format) + 1);
2037 }
2038 
2039 size_t KMP_EXPAND_NAME(ompc_get_affinity_format)(char *buffer, size_t size) {
2040   size_t format_size;
2041   if (!__kmp_init_serial) {
2042     __kmp_serial_initialize();
2043   }
2044   format_size = KMP_STRLEN(__kmp_affinity_format);
2045   if (buffer && size) {
2046     __kmp_strncpy_truncate(buffer, size, __kmp_affinity_format,
2047                            format_size + 1);
2048   }
2049   return format_size;
2050 }
2051 
2052 void KMP_EXPAND_NAME(ompc_display_affinity)(char const *format) {
2053   int gtid;
2054   if (!TCR_4(__kmp_init_middle)) {
2055     __kmp_middle_initialize();
2056   }
2057   __kmp_assign_root_init_mask();
2058   gtid = __kmp_get_gtid();
2059 #if KMP_AFFINITY_SUPPORTED
2060   if (__kmp_threads[gtid]->th.th_team->t.t_level == 0 &&
2061       __kmp_affinity.flags.reset) {
2062     __kmp_reset_root_init_mask(gtid);
2063   }
2064 #endif
2065   __kmp_aux_display_affinity(gtid, format);
2066 }
2067 
2068 size_t KMP_EXPAND_NAME(ompc_capture_affinity)(char *buffer, size_t buf_size,
2069                                               char const *format) {
2070   int gtid;
2071   size_t num_required;
2072   kmp_str_buf_t capture_buf;
2073   if (!TCR_4(__kmp_init_middle)) {
2074     __kmp_middle_initialize();
2075   }
2076   __kmp_assign_root_init_mask();
2077   gtid = __kmp_get_gtid();
2078 #if KMP_AFFINITY_SUPPORTED
2079   if (__kmp_threads[gtid]->th.th_team->t.t_level == 0 &&
2080       __kmp_affinity.flags.reset) {
2081     __kmp_reset_root_init_mask(gtid);
2082   }
2083 #endif
2084   __kmp_str_buf_init(&capture_buf);
2085   num_required = __kmp_aux_capture_affinity(gtid, format, &capture_buf);
2086   if (buffer && buf_size) {
2087     __kmp_strncpy_truncate(buffer, buf_size, capture_buf.str,
2088                            capture_buf.used + 1);
2089   }
2090   __kmp_str_buf_free(&capture_buf);
2091   return num_required;
2092 }
2093 
2094 void kmpc_set_stacksize(int arg) {
2095   // __kmp_aux_set_stacksize initializes the library if needed
2096   __kmp_aux_set_stacksize(arg);
2097 }
2098 
2099 void kmpc_set_stacksize_s(size_t arg) {
2100   // __kmp_aux_set_stacksize initializes the library if needed
2101   __kmp_aux_set_stacksize(arg);
2102 }
2103 
2104 void kmpc_set_blocktime(int arg) {
2105   int gtid, tid, bt = arg;
2106   kmp_info_t *thread;
2107 
2108   gtid = __kmp_entry_gtid();
2109   tid = __kmp_tid_from_gtid(gtid);
2110   thread = __kmp_thread_from_gtid(gtid);
2111 
2112   __kmp_aux_convert_blocktime(&bt);
2113   __kmp_aux_set_blocktime(bt, thread, tid);
2114 }
2115 
2116 void kmpc_set_library(int arg) {
2117   // __kmp_user_set_library initializes the library if needed
2118   __kmp_user_set_library((enum library_type)arg);
2119 }
2120 
2121 void kmpc_set_defaults(char const *str) {
2122   // __kmp_aux_set_defaults initializes the library if needed
2123   __kmp_aux_set_defaults(str, KMP_STRLEN(str));
2124 }
2125 
2126 void kmpc_set_disp_num_buffers(int arg) {
2127   // ignore after initialization because some teams have already
2128   // allocated dispatch buffers
2129   if (__kmp_init_serial == FALSE && arg >= KMP_MIN_DISP_NUM_BUFF &&
2130       arg <= KMP_MAX_DISP_NUM_BUFF) {
2131     __kmp_dispatch_num_buffers = arg;
2132   }
2133 }
2134 
2135 int kmpc_set_affinity_mask_proc(int proc, void **mask) {
2136 #if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED
2137   return -1;
2138 #else
2139   if (!TCR_4(__kmp_init_middle)) {
2140     __kmp_middle_initialize();
2141   }
2142   __kmp_assign_root_init_mask();
2143   return __kmp_aux_set_affinity_mask_proc(proc, mask);
2144 #endif
2145 }
2146 
2147 int kmpc_unset_affinity_mask_proc(int proc, void **mask) {
2148 #if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED
2149   return -1;
2150 #else
2151   if (!TCR_4(__kmp_init_middle)) {
2152     __kmp_middle_initialize();
2153   }
2154   __kmp_assign_root_init_mask();
2155   return __kmp_aux_unset_affinity_mask_proc(proc, mask);
2156 #endif
2157 }
2158 
2159 int kmpc_get_affinity_mask_proc(int proc, void **mask) {
2160 #if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED
2161   return -1;
2162 #else
2163   if (!TCR_4(__kmp_init_middle)) {
2164     __kmp_middle_initialize();
2165   }
2166   __kmp_assign_root_init_mask();
2167   return __kmp_aux_get_affinity_mask_proc(proc, mask);
2168 #endif
2169 }
2170 
2171 /* -------------------------------------------------------------------------- */
2172 /*!
2173 @ingroup THREADPRIVATE
2174 @param loc       source location information
2175 @param gtid      global thread number
2176 @param cpy_size  size of the cpy_data buffer
2177 @param cpy_data  pointer to data to be copied
2178 @param cpy_func  helper function to call for copying data
2179 @param didit     flag variable: 1=single thread; 0=not single thread
2180 
2181 __kmpc_copyprivate implements the interface for the private data broadcast
2182 needed for the copyprivate clause associated with a single region in an
2183 OpenMP<sup>*</sup> program (both C and Fortran).
2184 All threads participating in the parallel region call this routine.
2185 One of the threads (called the single thread) should have the <tt>didit</tt>
2186 variable set to 1 and all other threads should have that variable set to 0.
2187 All threads pass a pointer to a data buffer (cpy_data) that they have built.
2188 
2189 The OpenMP specification forbids the use of nowait on the single region when a
2190 copyprivate clause is present. However, @ref __kmpc_copyprivate implements a
2191 barrier internally to avoid race conditions, so the code generation for the
2192 single region should avoid generating a barrier after the call to @ref
2193 __kmpc_copyprivate.
2194 
2195 The <tt>gtid</tt> parameter is the global thread id for the current thread.
2196 The <tt>loc</tt> parameter is a pointer to source location information.
2197 
2198 Internal implementation: The single thread will first copy its descriptor
2199 address (cpy_data) to a team-private location, then the other threads will each
2200 call the function pointed to by the parameter cpy_func, which carries out the
2201 copy by copying the data using the cpy_data buffer.
2202 
2203 The cpy_func routine used for the copy and the contents of the data area defined
2204 by cpy_data and cpy_size may be built in any fashion that will allow the copy
2205 to be done. For instance, the cpy_data buffer can hold the actual data to be
2206 copied or it may hold a list of pointers to the data. The cpy_func routine must
2207 interpret the cpy_data buffer appropriately.
2208 
2209 The interface to cpy_func is as follows:
2210 @code
2211 void cpy_func( void *destination, void *source )
2212 @endcode
2213 where void *destination is the cpy_data pointer for the thread being copied to
2214 and void *source is the cpy_data pointer for the thread being copied from.
2215 */
2216 void __kmpc_copyprivate(ident_t *loc, kmp_int32 gtid, size_t cpy_size,
2217                         void *cpy_data, void (*cpy_func)(void *, void *),
2218                         kmp_int32 didit) {
2219   void **data_ptr;
2220   KC_TRACE(10, ("__kmpc_copyprivate: called T#%d\n", gtid));
2221   __kmp_assert_valid_gtid(gtid);
2222 
2223   KMP_MB();
2224 
2225   data_ptr = &__kmp_team_from_gtid(gtid)->t.t_copypriv_data;
2226 
2227   if (__kmp_env_consistency_check) {
2228     if (loc == 0) {
2229       KMP_WARNING(ConstructIdentInvalid);
2230     }
2231   }
2232 
2233   // ToDo: Optimize the following two barriers into some kind of split barrier
2234 
2235   if (didit)
2236     *data_ptr = cpy_data;
2237 
2238 #if OMPT_SUPPORT
2239   ompt_frame_t *ompt_frame;
2240   if (ompt_enabled.enabled) {
2241     __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
2242     if (ompt_frame->enter_frame.ptr == NULL)
2243       ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
2244   }
2245   OMPT_STORE_RETURN_ADDRESS(gtid);
2246 #endif
2247 /* This barrier is not a barrier region boundary */
2248 #if USE_ITT_NOTIFY
2249   __kmp_threads[gtid]->th.th_ident = loc;
2250 #endif
2251   __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);
2252 
2253   if (!didit)
2254     (*cpy_func)(cpy_data, *data_ptr);
2255 
2256   // Consider next barrier a user-visible barrier for barrier region boundaries
2257   // Nesting checks are already handled by the single construct checks
2258   {
2259 #if OMPT_SUPPORT
2260     OMPT_STORE_RETURN_ADDRESS(gtid);
2261 #endif
2262 #if USE_ITT_NOTIFY
2263     __kmp_threads[gtid]->th.th_ident = loc; // TODO: check if it is needed (e.g.
2264 // tasks can overwrite the location)
2265 #endif
2266     __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);
2267 #if OMPT_SUPPORT && OMPT_OPTIONAL
2268     if (ompt_enabled.enabled) {
2269       ompt_frame->enter_frame = ompt_data_none;
2270     }
2271 #endif
2272   }
2273 }
2274 
2275 /* --------------------------------------------------------------------------*/
2276 /*!
2277 @ingroup THREADPRIVATE
2278 @param loc       source location information
2279 @param gtid      global thread number
2280 @param cpy_data  pointer to the data to be saved/copied or 0
2281 @return          the saved pointer to the data
2282 
2283 __kmpc_copyprivate_light is a lighter version of __kmpc_copyprivate:
2284 __kmpc_copyprivate_light only saves the pointer it's given (if it's not 0, so
2285 coming from single), and returns that pointer in all calls (for single thread
2286 it's not needed). This version doesn't do any actual data copying. Data copying
2287 has to be done somewhere else, e.g. inline in the generated code. Due to this,
2288 this function doesn't have any barrier at the end of the function, like
2289 __kmpc_copyprivate does, so generated code needs barrier after copying of all
2290 data was done.
2291 */
2292 void *__kmpc_copyprivate_light(ident_t *loc, kmp_int32 gtid, void *cpy_data) {
2293   void **data_ptr;
2294 
2295   KC_TRACE(10, ("__kmpc_copyprivate_light: called T#%d\n", gtid));
2296 
2297   KMP_MB();
2298 
2299   data_ptr = &__kmp_team_from_gtid(gtid)->t.t_copypriv_data;
2300 
2301   if (__kmp_env_consistency_check) {
2302     if (loc == 0) {
2303       KMP_WARNING(ConstructIdentInvalid);
2304     }
2305   }
2306 
2307   // ToDo: Optimize the following barrier
2308 
2309   if (cpy_data)
2310     *data_ptr = cpy_data;
2311 
2312 #if OMPT_SUPPORT
2313   ompt_frame_t *ompt_frame;
2314   if (ompt_enabled.enabled) {
2315     __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
2316     if (ompt_frame->enter_frame.ptr == NULL)
2317       ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
2318     OMPT_STORE_RETURN_ADDRESS(gtid);
2319   }
2320 #endif
2321 /* This barrier is not a barrier region boundary */
2322 #if USE_ITT_NOTIFY
2323   __kmp_threads[gtid]->th.th_ident = loc;
2324 #endif
2325   __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);
2326 
2327   return *data_ptr;
2328 }
2329 
2330 /* -------------------------------------------------------------------------- */
2331 
2332 #define INIT_LOCK __kmp_init_user_lock_with_checks
2333 #define INIT_NESTED_LOCK __kmp_init_nested_user_lock_with_checks
2334 #define ACQUIRE_LOCK __kmp_acquire_user_lock_with_checks
2335 #define ACQUIRE_LOCK_TIMED __kmp_acquire_user_lock_with_checks_timed
2336 #define ACQUIRE_NESTED_LOCK __kmp_acquire_nested_user_lock_with_checks
2337 #define ACQUIRE_NESTED_LOCK_TIMED                                              \
2338   __kmp_acquire_nested_user_lock_with_checks_timed
2339 #define RELEASE_LOCK __kmp_release_user_lock_with_checks
2340 #define RELEASE_NESTED_LOCK __kmp_release_nested_user_lock_with_checks
2341 #define TEST_LOCK __kmp_test_user_lock_with_checks
2342 #define TEST_NESTED_LOCK __kmp_test_nested_user_lock_with_checks
2343 #define DESTROY_LOCK __kmp_destroy_user_lock_with_checks
2344 #define DESTROY_NESTED_LOCK __kmp_destroy_nested_user_lock_with_checks
2345 
2346 // TODO: Make check abort messages use location info & pass it into
2347 // with_checks routines
2348 
2349 #if KMP_USE_DYNAMIC_LOCK
2350 
2351 // internal lock initializer
2352 static __forceinline void __kmp_init_lock_with_hint(ident_t *loc, void **lock,
2353                                                     kmp_dyna_lockseq_t seq) {
2354   if (KMP_IS_D_LOCK(seq)) {
2355     KMP_INIT_D_LOCK(lock, seq);
2356 #if USE_ITT_BUILD
2357     __kmp_itt_lock_creating((kmp_user_lock_p)lock, NULL);
2358 #endif
2359   } else {
2360     KMP_INIT_I_LOCK(lock, seq);
2361 #if USE_ITT_BUILD
2362     kmp_indirect_lock_t *ilk = KMP_LOOKUP_I_LOCK(lock);
2363     __kmp_itt_lock_creating(ilk->lock, loc);
2364 #endif
2365   }
2366 }
2367 
2368 // internal nest lock initializer
2369 static __forceinline void
2370 __kmp_init_nest_lock_with_hint(ident_t *loc, void **lock,
2371                                kmp_dyna_lockseq_t seq) {
2372 #if KMP_USE_TSX
2373   // Don't have nested lock implementation for speculative locks
2374   if (seq == lockseq_hle || seq == lockseq_rtm_queuing ||
2375       seq == lockseq_rtm_spin || seq == lockseq_adaptive)
2376     seq = __kmp_user_lock_seq;
2377 #endif
2378   switch (seq) {
2379   case lockseq_tas:
2380     seq = lockseq_nested_tas;
2381     break;
2382 #if KMP_USE_FUTEX
2383   case lockseq_futex:
2384     seq = lockseq_nested_futex;
2385     break;
2386 #endif
2387   case lockseq_ticket:
2388     seq = lockseq_nested_ticket;
2389     break;
2390   case lockseq_queuing:
2391     seq = lockseq_nested_queuing;
2392     break;
2393   case lockseq_drdpa:
2394     seq = lockseq_nested_drdpa;
2395     break;
2396   default:
2397     seq = lockseq_nested_queuing;
2398   }
2399   KMP_INIT_I_LOCK(lock, seq);
2400 #if USE_ITT_BUILD
2401   kmp_indirect_lock_t *ilk = KMP_LOOKUP_I_LOCK(lock);
2402   __kmp_itt_lock_creating(ilk->lock, loc);
2403 #endif
2404 }
2405 
2406 /* initialize the lock with a hint */
2407 void __kmpc_init_lock_with_hint(ident_t *loc, kmp_int32 gtid, void **user_lock,
2408                                 uintptr_t hint) {
2409   KMP_DEBUG_ASSERT(__kmp_init_serial);
2410   if (__kmp_env_consistency_check && user_lock == NULL) {
2411     KMP_FATAL(LockIsUninitialized, "omp_init_lock_with_hint");
2412   }
2413 
2414   __kmp_init_lock_with_hint(loc, user_lock, __kmp_map_hint_to_lock(hint));
2415 
2416 #if OMPT_SUPPORT && OMPT_OPTIONAL
2417   // This is the case, if called from omp_init_lock_with_hint:
2418   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2419   if (!codeptr)
2420     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2421   if (ompt_enabled.ompt_callback_lock_init) {
2422     ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2423         ompt_mutex_lock, (omp_lock_hint_t)hint,
2424         __ompt_get_mutex_impl_type(user_lock),
2425         (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2426   }
2427 #endif
2428 }
2429 
2430 /* initialize the lock with a hint */
2431 void __kmpc_init_nest_lock_with_hint(ident_t *loc, kmp_int32 gtid,
2432                                      void **user_lock, uintptr_t hint) {
2433   KMP_DEBUG_ASSERT(__kmp_init_serial);
2434   if (__kmp_env_consistency_check && user_lock == NULL) {
2435     KMP_FATAL(LockIsUninitialized, "omp_init_nest_lock_with_hint");
2436   }
2437 
2438   __kmp_init_nest_lock_with_hint(loc, user_lock, __kmp_map_hint_to_lock(hint));
2439 
2440 #if OMPT_SUPPORT && OMPT_OPTIONAL
2441   // This is the case, if called from omp_init_lock_with_hint:
2442   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2443   if (!codeptr)
2444     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2445   if (ompt_enabled.ompt_callback_lock_init) {
2446     ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2447         ompt_mutex_nest_lock, (omp_lock_hint_t)hint,
2448         __ompt_get_mutex_impl_type(user_lock),
2449         (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2450   }
2451 #endif
2452 }
2453 
2454 #endif // KMP_USE_DYNAMIC_LOCK
2455 
2456 /* initialize the lock */
2457 void __kmpc_init_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2458 #if KMP_USE_DYNAMIC_LOCK
2459 
2460   KMP_DEBUG_ASSERT(__kmp_init_serial);
2461   if (__kmp_env_consistency_check && user_lock == NULL) {
2462     KMP_FATAL(LockIsUninitialized, "omp_init_lock");
2463   }
2464   __kmp_init_lock_with_hint(loc, user_lock, __kmp_user_lock_seq);
2465 
2466 #if OMPT_SUPPORT && OMPT_OPTIONAL
2467   // This is the case, if called from omp_init_lock_with_hint:
2468   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2469   if (!codeptr)
2470     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2471   if (ompt_enabled.ompt_callback_lock_init) {
2472     ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2473         ompt_mutex_lock, omp_lock_hint_none,
2474         __ompt_get_mutex_impl_type(user_lock),
2475         (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2476   }
2477 #endif
2478 
2479 #else // KMP_USE_DYNAMIC_LOCK
2480 
2481   static char const *const func = "omp_init_lock";
2482   kmp_user_lock_p lck;
2483   KMP_DEBUG_ASSERT(__kmp_init_serial);
2484 
2485   if (__kmp_env_consistency_check) {
2486     if (user_lock == NULL) {
2487       KMP_FATAL(LockIsUninitialized, func);
2488     }
2489   }
2490 
2491   KMP_CHECK_USER_LOCK_INIT();
2492 
2493   if ((__kmp_user_lock_kind == lk_tas) &&
2494       (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
2495     lck = (kmp_user_lock_p)user_lock;
2496   }
2497 #if KMP_USE_FUTEX
2498   else if ((__kmp_user_lock_kind == lk_futex) &&
2499            (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
2500     lck = (kmp_user_lock_p)user_lock;
2501   }
2502 #endif
2503   else {
2504     lck = __kmp_user_lock_allocate(user_lock, gtid, 0);
2505   }
2506   INIT_LOCK(lck);
2507   __kmp_set_user_lock_location(lck, loc);
2508 
2509 #if OMPT_SUPPORT && OMPT_OPTIONAL
2510   // This is the case, if called from omp_init_lock_with_hint:
2511   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2512   if (!codeptr)
2513     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2514   if (ompt_enabled.ompt_callback_lock_init) {
2515     ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2516         ompt_mutex_lock, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
2517         (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2518   }
2519 #endif
2520 
2521 #if USE_ITT_BUILD
2522   __kmp_itt_lock_creating(lck);
2523 #endif /* USE_ITT_BUILD */
2524 
2525 #endif // KMP_USE_DYNAMIC_LOCK
2526 } // __kmpc_init_lock
2527 
2528 /* initialize the lock */
2529 void __kmpc_init_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2530 #if KMP_USE_DYNAMIC_LOCK
2531 
2532   KMP_DEBUG_ASSERT(__kmp_init_serial);
2533   if (__kmp_env_consistency_check && user_lock == NULL) {
2534     KMP_FATAL(LockIsUninitialized, "omp_init_nest_lock");
2535   }
2536   __kmp_init_nest_lock_with_hint(loc, user_lock, __kmp_user_lock_seq);
2537 
2538 #if OMPT_SUPPORT && OMPT_OPTIONAL
2539   // This is the case, if called from omp_init_lock_with_hint:
2540   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2541   if (!codeptr)
2542     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2543   if (ompt_enabled.ompt_callback_lock_init) {
2544     ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2545         ompt_mutex_nest_lock, omp_lock_hint_none,
2546         __ompt_get_mutex_impl_type(user_lock),
2547         (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2548   }
2549 #endif
2550 
2551 #else // KMP_USE_DYNAMIC_LOCK
2552 
2553   static char const *const func = "omp_init_nest_lock";
2554   kmp_user_lock_p lck;
2555   KMP_DEBUG_ASSERT(__kmp_init_serial);
2556 
2557   if (__kmp_env_consistency_check) {
2558     if (user_lock == NULL) {
2559       KMP_FATAL(LockIsUninitialized, func);
2560     }
2561   }
2562 
2563   KMP_CHECK_USER_LOCK_INIT();
2564 
2565   if ((__kmp_user_lock_kind == lk_tas) &&
2566       (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
2567        OMP_NEST_LOCK_T_SIZE)) {
2568     lck = (kmp_user_lock_p)user_lock;
2569   }
2570 #if KMP_USE_FUTEX
2571   else if ((__kmp_user_lock_kind == lk_futex) &&
2572            (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
2573             OMP_NEST_LOCK_T_SIZE)) {
2574     lck = (kmp_user_lock_p)user_lock;
2575   }
2576 #endif
2577   else {
2578     lck = __kmp_user_lock_allocate(user_lock, gtid, 0);
2579   }
2580 
2581   INIT_NESTED_LOCK(lck);
2582   __kmp_set_user_lock_location(lck, loc);
2583 
2584 #if OMPT_SUPPORT && OMPT_OPTIONAL
2585   // This is the case, if called from omp_init_lock_with_hint:
2586   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2587   if (!codeptr)
2588     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2589   if (ompt_enabled.ompt_callback_lock_init) {
2590     ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2591         ompt_mutex_nest_lock, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
2592         (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2593   }
2594 #endif
2595 
2596 #if USE_ITT_BUILD
2597   __kmp_itt_lock_creating(lck);
2598 #endif /* USE_ITT_BUILD */
2599 
2600 #endif // KMP_USE_DYNAMIC_LOCK
2601 } // __kmpc_init_nest_lock
2602 
2603 void __kmpc_destroy_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2604 #if KMP_USE_DYNAMIC_LOCK
2605 
2606 #if USE_ITT_BUILD
2607   kmp_user_lock_p lck;
2608   if (KMP_EXTRACT_D_TAG(user_lock) == 0) {
2609     lck = ((kmp_indirect_lock_t *)KMP_LOOKUP_I_LOCK(user_lock))->lock;
2610   } else {
2611     lck = (kmp_user_lock_p)user_lock;
2612   }
2613   __kmp_itt_lock_destroyed(lck);
2614 #endif
2615 #if OMPT_SUPPORT && OMPT_OPTIONAL
2616   // This is the case, if called from omp_init_lock_with_hint:
2617   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2618   if (!codeptr)
2619     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2620   if (ompt_enabled.ompt_callback_lock_destroy) {
2621     ompt_callbacks.ompt_callback(ompt_callback_lock_destroy)(
2622         ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2623   }
2624 #endif
2625   KMP_D_LOCK_FUNC(user_lock, destroy)((kmp_dyna_lock_t *)user_lock);
2626 #else
2627   kmp_user_lock_p lck;
2628 
2629   if ((__kmp_user_lock_kind == lk_tas) &&
2630       (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
2631     lck = (kmp_user_lock_p)user_lock;
2632   }
2633 #if KMP_USE_FUTEX
2634   else if ((__kmp_user_lock_kind == lk_futex) &&
2635            (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
2636     lck = (kmp_user_lock_p)user_lock;
2637   }
2638 #endif
2639   else {
2640     lck = __kmp_lookup_user_lock(user_lock, "omp_destroy_lock");
2641   }
2642 
2643 #if OMPT_SUPPORT && OMPT_OPTIONAL
2644   // This is the case, if called from omp_init_lock_with_hint:
2645   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2646   if (!codeptr)
2647     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2648   if (ompt_enabled.ompt_callback_lock_destroy) {
2649     ompt_callbacks.ompt_callback(ompt_callback_lock_destroy)(
2650         ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2651   }
2652 #endif
2653 
2654 #if USE_ITT_BUILD
2655   __kmp_itt_lock_destroyed(lck);
2656 #endif /* USE_ITT_BUILD */
2657   DESTROY_LOCK(lck);
2658 
2659   if ((__kmp_user_lock_kind == lk_tas) &&
2660       (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
2661     ;
2662   }
2663 #if KMP_USE_FUTEX
2664   else if ((__kmp_user_lock_kind == lk_futex) &&
2665            (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
2666     ;
2667   }
2668 #endif
2669   else {
2670     __kmp_user_lock_free(user_lock, gtid, lck);
2671   }
2672 #endif // KMP_USE_DYNAMIC_LOCK
2673 } // __kmpc_destroy_lock
2674 
2675 /* destroy the lock */
2676 void __kmpc_destroy_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2677 #if KMP_USE_DYNAMIC_LOCK
2678 
2679 #if USE_ITT_BUILD
2680   kmp_indirect_lock_t *ilk = KMP_LOOKUP_I_LOCK(user_lock);
2681   __kmp_itt_lock_destroyed(ilk->lock);
2682 #endif
2683 #if OMPT_SUPPORT && OMPT_OPTIONAL
2684   // This is the case, if called from omp_init_lock_with_hint:
2685   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2686   if (!codeptr)
2687     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2688   if (ompt_enabled.ompt_callback_lock_destroy) {
2689     ompt_callbacks.ompt_callback(ompt_callback_lock_destroy)(
2690         ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2691   }
2692 #endif
2693   KMP_D_LOCK_FUNC(user_lock, destroy)((kmp_dyna_lock_t *)user_lock);
2694 
2695 #else // KMP_USE_DYNAMIC_LOCK
2696 
2697   kmp_user_lock_p lck;
2698 
2699   if ((__kmp_user_lock_kind == lk_tas) &&
2700       (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
2701        OMP_NEST_LOCK_T_SIZE)) {
2702     lck = (kmp_user_lock_p)user_lock;
2703   }
2704 #if KMP_USE_FUTEX
2705   else if ((__kmp_user_lock_kind == lk_futex) &&
2706            (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
2707             OMP_NEST_LOCK_T_SIZE)) {
2708     lck = (kmp_user_lock_p)user_lock;
2709   }
2710 #endif
2711   else {
2712     lck = __kmp_lookup_user_lock(user_lock, "omp_destroy_nest_lock");
2713   }
2714 
2715 #if OMPT_SUPPORT && OMPT_OPTIONAL
2716   // This is the case, if called from omp_init_lock_with_hint:
2717   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2718   if (!codeptr)
2719     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2720   if (ompt_enabled.ompt_callback_lock_destroy) {
2721     ompt_callbacks.ompt_callback(ompt_callback_lock_destroy)(
2722         ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2723   }
2724 #endif
2725 
2726 #if USE_ITT_BUILD
2727   __kmp_itt_lock_destroyed(lck);
2728 #endif /* USE_ITT_BUILD */
2729 
2730   DESTROY_NESTED_LOCK(lck);
2731 
2732   if ((__kmp_user_lock_kind == lk_tas) &&
2733       (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
2734        OMP_NEST_LOCK_T_SIZE)) {
2735     ;
2736   }
2737 #if KMP_USE_FUTEX
2738   else if ((__kmp_user_lock_kind == lk_futex) &&
2739            (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
2740             OMP_NEST_LOCK_T_SIZE)) {
2741     ;
2742   }
2743 #endif
2744   else {
2745     __kmp_user_lock_free(user_lock, gtid, lck);
2746   }
2747 #endif // KMP_USE_DYNAMIC_LOCK
2748 } // __kmpc_destroy_nest_lock
2749 
2750 void __kmpc_set_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2751   KMP_COUNT_BLOCK(OMP_set_lock);
2752 #if KMP_USE_DYNAMIC_LOCK
2753   int tag = KMP_EXTRACT_D_TAG(user_lock);
2754 #if USE_ITT_BUILD
2755   __kmp_itt_lock_acquiring(
2756       (kmp_user_lock_p)
2757           user_lock); // itt function will get to the right lock object.
2758 #endif
2759 #if OMPT_SUPPORT && OMPT_OPTIONAL
2760   // This is the case, if called from omp_init_lock_with_hint:
2761   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2762   if (!codeptr)
2763     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2764   if (ompt_enabled.ompt_callback_mutex_acquire) {
2765     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
2766         ompt_mutex_lock, omp_lock_hint_none,
2767         __ompt_get_mutex_impl_type(user_lock),
2768         (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2769   }
2770 #endif
2771 #if KMP_USE_INLINED_TAS
2772   if (tag == locktag_tas && !__kmp_env_consistency_check) {
2773     KMP_ACQUIRE_TAS_LOCK(user_lock, gtid);
2774   } else
2775 #elif KMP_USE_INLINED_FUTEX
2776   if (tag == locktag_futex && !__kmp_env_consistency_check) {
2777     KMP_ACQUIRE_FUTEX_LOCK(user_lock, gtid);
2778   } else
2779 #endif
2780   {
2781     __kmp_direct_set[tag]((kmp_dyna_lock_t *)user_lock, gtid);
2782   }
2783 #if USE_ITT_BUILD
2784   __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock);
2785 #endif
2786 #if OMPT_SUPPORT && OMPT_OPTIONAL
2787   if (ompt_enabled.ompt_callback_mutex_acquired) {
2788     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
2789         ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2790   }
2791 #endif
2792 
2793 #else // KMP_USE_DYNAMIC_LOCK
2794 
2795   kmp_user_lock_p lck;
2796 
2797   if ((__kmp_user_lock_kind == lk_tas) &&
2798       (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
2799     lck = (kmp_user_lock_p)user_lock;
2800   }
2801 #if KMP_USE_FUTEX
2802   else if ((__kmp_user_lock_kind == lk_futex) &&
2803            (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
2804     lck = (kmp_user_lock_p)user_lock;
2805   }
2806 #endif
2807   else {
2808     lck = __kmp_lookup_user_lock(user_lock, "omp_set_lock");
2809   }
2810 
2811 #if USE_ITT_BUILD
2812   __kmp_itt_lock_acquiring(lck);
2813 #endif /* USE_ITT_BUILD */
2814 #if OMPT_SUPPORT && OMPT_OPTIONAL
2815   // This is the case, if called from omp_init_lock_with_hint:
2816   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2817   if (!codeptr)
2818     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2819   if (ompt_enabled.ompt_callback_mutex_acquire) {
2820     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
2821         ompt_mutex_lock, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
2822         (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2823   }
2824 #endif
2825 
2826   ACQUIRE_LOCK(lck, gtid);
2827 
2828 #if USE_ITT_BUILD
2829   __kmp_itt_lock_acquired(lck);
2830 #endif /* USE_ITT_BUILD */
2831 
2832 #if OMPT_SUPPORT && OMPT_OPTIONAL
2833   if (ompt_enabled.ompt_callback_mutex_acquired) {
2834     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
2835         ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2836   }
2837 #endif
2838 
2839 #endif // KMP_USE_DYNAMIC_LOCK
2840 }
2841 
2842 void __kmpc_set_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2843 #if KMP_USE_DYNAMIC_LOCK
2844 
2845 #if USE_ITT_BUILD
2846   __kmp_itt_lock_acquiring((kmp_user_lock_p)user_lock);
2847 #endif
2848 #if OMPT_SUPPORT && OMPT_OPTIONAL
2849   // This is the case, if called from omp_init_lock_with_hint:
2850   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2851   if (!codeptr)
2852     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2853   if (ompt_enabled.enabled) {
2854     if (ompt_enabled.ompt_callback_mutex_acquire) {
2855       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
2856           ompt_mutex_nest_lock, omp_lock_hint_none,
2857           __ompt_get_mutex_impl_type(user_lock),
2858           (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2859     }
2860   }
2861 #endif
2862   int acquire_status =
2863       KMP_D_LOCK_FUNC(user_lock, set)((kmp_dyna_lock_t *)user_lock, gtid);
2864   (void)acquire_status;
2865 #if USE_ITT_BUILD
2866   __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock);
2867 #endif
2868 
2869 #if OMPT_SUPPORT && OMPT_OPTIONAL
2870   if (ompt_enabled.enabled) {
2871     if (acquire_status == KMP_LOCK_ACQUIRED_FIRST) {
2872       if (ompt_enabled.ompt_callback_mutex_acquired) {
2873         // lock_first
2874         ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
2875             ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)user_lock,
2876             codeptr);
2877       }
2878     } else {
2879       if (ompt_enabled.ompt_callback_nest_lock) {
2880         // lock_next
2881         ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
2882             ompt_scope_begin, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2883       }
2884     }
2885   }
2886 #endif
2887 
2888 #else // KMP_USE_DYNAMIC_LOCK
2889   int acquire_status;
2890   kmp_user_lock_p lck;
2891 
2892   if ((__kmp_user_lock_kind == lk_tas) &&
2893       (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
2894        OMP_NEST_LOCK_T_SIZE)) {
2895     lck = (kmp_user_lock_p)user_lock;
2896   }
2897 #if KMP_USE_FUTEX
2898   else if ((__kmp_user_lock_kind == lk_futex) &&
2899            (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
2900             OMP_NEST_LOCK_T_SIZE)) {
2901     lck = (kmp_user_lock_p)user_lock;
2902   }
2903 #endif
2904   else {
2905     lck = __kmp_lookup_user_lock(user_lock, "omp_set_nest_lock");
2906   }
2907 
2908 #if USE_ITT_BUILD
2909   __kmp_itt_lock_acquiring(lck);
2910 #endif /* USE_ITT_BUILD */
2911 #if OMPT_SUPPORT && OMPT_OPTIONAL
2912   // This is the case, if called from omp_init_lock_with_hint:
2913   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2914   if (!codeptr)
2915     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2916   if (ompt_enabled.enabled) {
2917     if (ompt_enabled.ompt_callback_mutex_acquire) {
2918       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
2919           ompt_mutex_nest_lock, omp_lock_hint_none,
2920           __ompt_get_mutex_impl_type(), (ompt_wait_id_t)(uintptr_t)lck,
2921           codeptr);
2922     }
2923   }
2924 #endif
2925 
2926   ACQUIRE_NESTED_LOCK(lck, gtid, &acquire_status);
2927 
2928 #if USE_ITT_BUILD
2929   __kmp_itt_lock_acquired(lck);
2930 #endif /* USE_ITT_BUILD */
2931 
2932 #if OMPT_SUPPORT && OMPT_OPTIONAL
2933   if (ompt_enabled.enabled) {
2934     if (acquire_status == KMP_LOCK_ACQUIRED_FIRST) {
2935       if (ompt_enabled.ompt_callback_mutex_acquired) {
2936         // lock_first
2937         ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
2938             ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2939       }
2940     } else {
2941       if (ompt_enabled.ompt_callback_nest_lock) {
2942         // lock_next
2943         ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
2944             ompt_scope_begin, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2945       }
2946     }
2947   }
2948 #endif
2949 
2950 #endif // KMP_USE_DYNAMIC_LOCK
2951 }
2952 
2953 void __kmpc_unset_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2954 #if KMP_USE_DYNAMIC_LOCK
2955 
2956   int tag = KMP_EXTRACT_D_TAG(user_lock);
2957 #if USE_ITT_BUILD
2958   __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock);
2959 #endif
2960 #if KMP_USE_INLINED_TAS
2961   if (tag == locktag_tas && !__kmp_env_consistency_check) {
2962     KMP_RELEASE_TAS_LOCK(user_lock, gtid);
2963   } else
2964 #elif KMP_USE_INLINED_FUTEX
2965   if (tag == locktag_futex && !__kmp_env_consistency_check) {
2966     KMP_RELEASE_FUTEX_LOCK(user_lock, gtid);
2967   } else
2968 #endif
2969   {
2970     __kmp_direct_unset[tag]((kmp_dyna_lock_t *)user_lock, gtid);
2971   }
2972 
2973 #if OMPT_SUPPORT && OMPT_OPTIONAL
2974   // This is the case, if called from omp_init_lock_with_hint:
2975   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2976   if (!codeptr)
2977     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2978   if (ompt_enabled.ompt_callback_mutex_released) {
2979     ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
2980         ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2981   }
2982 #endif
2983 
2984 #else // KMP_USE_DYNAMIC_LOCK
2985 
2986   kmp_user_lock_p lck;
2987 
2988   /* Can't use serial interval since not block structured */
2989   /* release the lock */
2990 
2991   if ((__kmp_user_lock_kind == lk_tas) &&
2992       (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
2993 #if KMP_OS_LINUX &&                                                            \
2994     (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
2995 // "fast" path implemented to fix customer performance issue
2996 #if USE_ITT_BUILD
2997     __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock);
2998 #endif /* USE_ITT_BUILD */
2999     TCW_4(((kmp_user_lock_p)user_lock)->tas.lk.poll, 0);
3000     KMP_MB();
3001 
3002 #if OMPT_SUPPORT && OMPT_OPTIONAL
3003     // This is the case, if called from omp_init_lock_with_hint:
3004     void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
3005     if (!codeptr)
3006       codeptr = OMPT_GET_RETURN_ADDRESS(0);
3007     if (ompt_enabled.ompt_callback_mutex_released) {
3008       ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
3009           ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
3010     }
3011 #endif
3012 
3013     return;
3014 #else
3015     lck = (kmp_user_lock_p)user_lock;
3016 #endif
3017   }
3018 #if KMP_USE_FUTEX
3019   else if ((__kmp_user_lock_kind == lk_futex) &&
3020            (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
3021     lck = (kmp_user_lock_p)user_lock;
3022   }
3023 #endif
3024   else {
3025     lck = __kmp_lookup_user_lock(user_lock, "omp_unset_lock");
3026   }
3027 
3028 #if USE_ITT_BUILD
3029   __kmp_itt_lock_releasing(lck);
3030 #endif /* USE_ITT_BUILD */
3031 
3032   RELEASE_LOCK(lck, gtid);
3033 
3034 #if OMPT_SUPPORT && OMPT_OPTIONAL
3035   // This is the case, if called from omp_init_lock_with_hint:
3036   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
3037   if (!codeptr)
3038     codeptr = OMPT_GET_RETURN_ADDRESS(0);
3039   if (ompt_enabled.ompt_callback_mutex_released) {
3040     ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
3041         ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
3042   }
3043 #endif
3044 
3045 #endif // KMP_USE_DYNAMIC_LOCK
3046 }
3047 
3048 /* release the lock */
3049 void __kmpc_unset_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
3050 #if KMP_USE_DYNAMIC_LOCK
3051 
3052 #if USE_ITT_BUILD
3053   __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock);
3054 #endif
3055   int release_status =
3056       KMP_D_LOCK_FUNC(user_lock, unset)((kmp_dyna_lock_t *)user_lock, gtid);
3057   (void)release_status;
3058 
3059 #if OMPT_SUPPORT && OMPT_OPTIONAL
3060   // This is the case, if called from omp_init_lock_with_hint:
3061   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
3062   if (!codeptr)
3063     codeptr = OMPT_GET_RETURN_ADDRESS(0);
3064   if (ompt_enabled.enabled) {
3065     if (release_status == KMP_LOCK_RELEASED) {
3066       if (ompt_enabled.ompt_callback_mutex_released) {
3067         // release_lock_last
3068         ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
3069             ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)user_lock,
3070             codeptr);
3071       }
3072     } else if (ompt_enabled.ompt_callback_nest_lock) {
3073       // release_lock_prev
3074       ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
3075           ompt_scope_end, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
3076     }
3077   }
3078 #endif
3079 
3080 #else // KMP_USE_DYNAMIC_LOCK
3081 
3082   kmp_user_lock_p lck;
3083 
3084   /* Can't use serial interval since not block structured */
3085 
3086   if ((__kmp_user_lock_kind == lk_tas) &&
3087       (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
3088        OMP_NEST_LOCK_T_SIZE)) {
3089 #if KMP_OS_LINUX &&                                                            \
3090     (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
3091     // "fast" path implemented to fix customer performance issue
3092     kmp_tas_lock_t *tl = (kmp_tas_lock_t *)user_lock;
3093 #if USE_ITT_BUILD
3094     __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock);
3095 #endif /* USE_ITT_BUILD */
3096 
3097 #if OMPT_SUPPORT && OMPT_OPTIONAL
3098     int release_status = KMP_LOCK_STILL_HELD;
3099 #endif
3100 
3101     if (--(tl->lk.depth_locked) == 0) {
3102       TCW_4(tl->lk.poll, 0);
3103 #if OMPT_SUPPORT && OMPT_OPTIONAL
3104       release_status = KMP_LOCK_RELEASED;
3105 #endif
3106     }
3107     KMP_MB();
3108 
3109 #if OMPT_SUPPORT && OMPT_OPTIONAL
3110     // This is the case, if called from omp_init_lock_with_hint:
3111     void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
3112     if (!codeptr)
3113       codeptr = OMPT_GET_RETURN_ADDRESS(0);
3114     if (ompt_enabled.enabled) {
3115       if (release_status == KMP_LOCK_RELEASED) {
3116         if (ompt_enabled.ompt_callback_mutex_released) {
3117           // release_lock_last
3118           ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
3119               ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
3120         }
3121       } else if (ompt_enabled.ompt_callback_nest_lock) {
3122         // release_lock_previous
3123         ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
3124             ompt_mutex_scope_end, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
3125       }
3126     }
3127 #endif
3128 
3129     return;
3130 #else
3131     lck = (kmp_user_lock_p)user_lock;
3132 #endif
3133   }
3134 #if KMP_USE_FUTEX
3135   else if ((__kmp_user_lock_kind == lk_futex) &&
3136            (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
3137             OMP_NEST_LOCK_T_SIZE)) {
3138     lck = (kmp_user_lock_p)user_lock;
3139   }
3140 #endif
3141   else {
3142     lck = __kmp_lookup_user_lock(user_lock, "omp_unset_nest_lock");
3143   }
3144 
3145 #if USE_ITT_BUILD
3146   __kmp_itt_lock_releasing(lck);
3147 #endif /* USE_ITT_BUILD */
3148 
3149   int release_status;
3150   release_status = RELEASE_NESTED_LOCK(lck, gtid);
3151 #if OMPT_SUPPORT && OMPT_OPTIONAL
3152   // This is the case, if called from omp_init_lock_with_hint:
3153   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
3154   if (!codeptr)
3155     codeptr = OMPT_GET_RETURN_ADDRESS(0);
3156   if (ompt_enabled.enabled) {
3157     if (release_status == KMP_LOCK_RELEASED) {
3158       if (ompt_enabled.ompt_callback_mutex_released) {
3159         // release_lock_last
3160         ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
3161             ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
3162       }
3163     } else if (ompt_enabled.ompt_callback_nest_lock) {
3164       // release_lock_previous
3165       ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
3166           ompt_mutex_scope_end, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
3167     }
3168   }
3169 #endif
3170 
3171 #endif // KMP_USE_DYNAMIC_LOCK
3172 }
3173 
3174 /* try to acquire the lock */
3175 int __kmpc_test_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
3176   KMP_COUNT_BLOCK(OMP_test_lock);
3177 
3178 #if KMP_USE_DYNAMIC_LOCK
3179   int rc;
3180   int tag = KMP_EXTRACT_D_TAG(user_lock);
3181 #if USE_ITT_BUILD
3182   __kmp_itt_lock_acquiring((kmp_user_lock_p)user_lock);
3183 #endif
3184 #if OMPT_SUPPORT && OMPT_OPTIONAL
3185   // This is the case, if called from omp_init_lock_with_hint:
3186   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
3187   if (!codeptr)
3188     codeptr = OMPT_GET_RETURN_ADDRESS(0);
3189   if (ompt_enabled.ompt_callback_mutex_acquire) {
3190     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
3191         ompt_mutex_test_lock, omp_lock_hint_none,
3192         __ompt_get_mutex_impl_type(user_lock),
3193         (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
3194   }
3195 #endif
3196 #if KMP_USE_INLINED_TAS
3197   if (tag == locktag_tas && !__kmp_env_consistency_check) {
3198     KMP_TEST_TAS_LOCK(user_lock, gtid, rc);
3199   } else
3200 #elif KMP_USE_INLINED_FUTEX
3201   if (tag == locktag_futex && !__kmp_env_consistency_check) {
3202     KMP_TEST_FUTEX_LOCK(user_lock, gtid, rc);
3203   } else
3204 #endif
3205   {
3206     rc = __kmp_direct_test[tag]((kmp_dyna_lock_t *)user_lock, gtid);
3207   }
3208   if (rc) {
3209 #if USE_ITT_BUILD
3210     __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock);
3211 #endif
3212 #if OMPT_SUPPORT && OMPT_OPTIONAL
3213     if (ompt_enabled.ompt_callback_mutex_acquired) {
3214       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
3215           ompt_mutex_test_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
3216     }
3217 #endif
3218     return FTN_TRUE;
3219   } else {
3220 #if USE_ITT_BUILD
3221     __kmp_itt_lock_cancelled((kmp_user_lock_p)user_lock);
3222 #endif
3223     return FTN_FALSE;
3224   }
3225 
3226 #else // KMP_USE_DYNAMIC_LOCK
3227 
3228   kmp_user_lock_p lck;
3229   int rc;
3230 
3231   if ((__kmp_user_lock_kind == lk_tas) &&
3232       (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
3233     lck = (kmp_user_lock_p)user_lock;
3234   }
3235 #if KMP_USE_FUTEX
3236   else if ((__kmp_user_lock_kind == lk_futex) &&
3237            (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
3238     lck = (kmp_user_lock_p)user_lock;
3239   }
3240 #endif
3241   else {
3242     lck = __kmp_lookup_user_lock(user_lock, "omp_test_lock");
3243   }
3244 
3245 #if USE_ITT_BUILD
3246   __kmp_itt_lock_acquiring(lck);
3247 #endif /* USE_ITT_BUILD */
3248 #if OMPT_SUPPORT && OMPT_OPTIONAL
3249   // This is the case, if called from omp_init_lock_with_hint:
3250   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
3251   if (!codeptr)
3252     codeptr = OMPT_GET_RETURN_ADDRESS(0);
3253   if (ompt_enabled.ompt_callback_mutex_acquire) {
3254     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
3255         ompt_mutex_test_lock, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
3256         (ompt_wait_id_t)(uintptr_t)lck, codeptr);
3257   }
3258 #endif
3259 
3260   rc = TEST_LOCK(lck, gtid);
3261 #if USE_ITT_BUILD
3262   if (rc) {
3263     __kmp_itt_lock_acquired(lck);
3264   } else {
3265     __kmp_itt_lock_cancelled(lck);
3266   }
3267 #endif /* USE_ITT_BUILD */
3268 #if OMPT_SUPPORT && OMPT_OPTIONAL
3269   if (rc && ompt_enabled.ompt_callback_mutex_acquired) {
3270     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
3271         ompt_mutex_test_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
3272   }
3273 #endif
3274 
3275   return (rc ? FTN_TRUE : FTN_FALSE);
3276 
3277   /* Can't use serial interval since not block structured */
3278 
3279 #endif // KMP_USE_DYNAMIC_LOCK
3280 }
3281 
3282 /* try to acquire the lock */
3283 int __kmpc_test_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
3284 #if KMP_USE_DYNAMIC_LOCK
3285   int rc;
3286 #if USE_ITT_BUILD
3287   __kmp_itt_lock_acquiring((kmp_user_lock_p)user_lock);
3288 #endif
3289 #if OMPT_SUPPORT && OMPT_OPTIONAL
3290   // This is the case, if called from omp_init_lock_with_hint:
3291   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
3292   if (!codeptr)
3293     codeptr = OMPT_GET_RETURN_ADDRESS(0);
3294   if (ompt_enabled.ompt_callback_mutex_acquire) {
3295     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
3296         ompt_mutex_test_nest_lock, omp_lock_hint_none,
3297         __ompt_get_mutex_impl_type(user_lock),
3298         (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
3299   }
3300 #endif
3301   rc = KMP_D_LOCK_FUNC(user_lock, test)((kmp_dyna_lock_t *)user_lock, gtid);
3302 #if USE_ITT_BUILD
3303   if (rc) {
3304     __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock);
3305   } else {
3306     __kmp_itt_lock_cancelled((kmp_user_lock_p)user_lock);
3307   }
3308 #endif
3309 #if OMPT_SUPPORT && OMPT_OPTIONAL
3310   if (ompt_enabled.enabled && rc) {
3311     if (rc == 1) {
3312       if (ompt_enabled.ompt_callback_mutex_acquired) {
3313         // lock_first
3314         ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
3315             ompt_mutex_test_nest_lock, (ompt_wait_id_t)(uintptr_t)user_lock,
3316             codeptr);
3317       }
3318     } else {
3319       if (ompt_enabled.ompt_callback_nest_lock) {
3320         // lock_next
3321         ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
3322             ompt_scope_begin, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
3323       }
3324     }
3325   }
3326 #endif
3327   return rc;
3328 
3329 #else // KMP_USE_DYNAMIC_LOCK
3330 
3331   kmp_user_lock_p lck;
3332   int rc;
3333 
3334   if ((__kmp_user_lock_kind == lk_tas) &&
3335       (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
3336        OMP_NEST_LOCK_T_SIZE)) {
3337     lck = (kmp_user_lock_p)user_lock;
3338   }
3339 #if KMP_USE_FUTEX
3340   else if ((__kmp_user_lock_kind == lk_futex) &&
3341            (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
3342             OMP_NEST_LOCK_T_SIZE)) {
3343     lck = (kmp_user_lock_p)user_lock;
3344   }
3345 #endif
3346   else {
3347     lck = __kmp_lookup_user_lock(user_lock, "omp_test_nest_lock");
3348   }
3349 
3350 #if USE_ITT_BUILD
3351   __kmp_itt_lock_acquiring(lck);
3352 #endif /* USE_ITT_BUILD */
3353 
3354 #if OMPT_SUPPORT && OMPT_OPTIONAL
3355   // This is the case, if called from omp_init_lock_with_hint:
3356   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
3357   if (!codeptr)
3358     codeptr = OMPT_GET_RETURN_ADDRESS(0);
3359   if (ompt_enabled.enabled) &&
3360         ompt_enabled.ompt_callback_mutex_acquire) {
3361       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
3362           ompt_mutex_test_nest_lock, omp_lock_hint_none,
3363           __ompt_get_mutex_impl_type(), (ompt_wait_id_t)(uintptr_t)lck,
3364           codeptr);
3365     }
3366 #endif
3367 
3368   rc = TEST_NESTED_LOCK(lck, gtid);
3369 #if USE_ITT_BUILD
3370   if (rc) {
3371     __kmp_itt_lock_acquired(lck);
3372   } else {
3373     __kmp_itt_lock_cancelled(lck);
3374   }
3375 #endif /* USE_ITT_BUILD */
3376 #if OMPT_SUPPORT && OMPT_OPTIONAL
3377   if (ompt_enabled.enabled && rc) {
3378     if (rc == 1) {
3379       if (ompt_enabled.ompt_callback_mutex_acquired) {
3380         // lock_first
3381         ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
3382             ompt_mutex_test_nest_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
3383       }
3384     } else {
3385       if (ompt_enabled.ompt_callback_nest_lock) {
3386         // lock_next
3387         ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
3388             ompt_mutex_scope_begin, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
3389       }
3390     }
3391   }
3392 #endif
3393   return rc;
3394 
3395   /* Can't use serial interval since not block structured */
3396 
3397 #endif // KMP_USE_DYNAMIC_LOCK
3398 }
3399 
3400 // Interface to fast scalable reduce methods routines
3401 
3402 // keep the selected method in a thread local structure for cross-function
3403 // usage: will be used in __kmpc_end_reduce* functions;
3404 // another solution: to re-determine the method one more time in
3405 // __kmpc_end_reduce* functions (new prototype required then)
3406 // AT: which solution is better?
3407 #define __KMP_SET_REDUCTION_METHOD(gtid, rmethod)                              \
3408   ((__kmp_threads[(gtid)]->th.th_local.packed_reduction_method) = (rmethod))
3409 
3410 #define __KMP_GET_REDUCTION_METHOD(gtid)                                       \
3411   (__kmp_threads[(gtid)]->th.th_local.packed_reduction_method)
3412 
3413 // description of the packed_reduction_method variable: look at the macros in
3414 // kmp.h
3415 
3416 // used in a critical section reduce block
3417 static __forceinline void
3418 __kmp_enter_critical_section_reduce_block(ident_t *loc, kmp_int32 global_tid,
3419                                           kmp_critical_name *crit) {
3420 
3421   // this lock was visible to a customer and to the threading profile tool as a
3422   // serial overhead span (although it's used for an internal purpose only)
3423   //            why was it visible in previous implementation?
3424   //            should we keep it visible in new reduce block?
3425   kmp_user_lock_p lck;
3426 
3427 #if KMP_USE_DYNAMIC_LOCK
3428 
3429   kmp_dyna_lock_t *lk = (kmp_dyna_lock_t *)crit;
3430   // Check if it is initialized.
3431   if (*lk == 0) {
3432     if (KMP_IS_D_LOCK(__kmp_user_lock_seq)) {
3433       KMP_COMPARE_AND_STORE_ACQ32((volatile kmp_int32 *)crit, 0,
3434                                   KMP_GET_D_TAG(__kmp_user_lock_seq));
3435     } else {
3436       __kmp_init_indirect_csptr(crit, loc, global_tid,
3437                                 KMP_GET_I_TAG(__kmp_user_lock_seq));
3438     }
3439   }
3440   // Branch for accessing the actual lock object and set operation. This
3441   // branching is inevitable since this lock initialization does not follow the
3442   // normal dispatch path (lock table is not used).
3443   if (KMP_EXTRACT_D_TAG(lk) != 0) {
3444     lck = (kmp_user_lock_p)lk;
3445     KMP_DEBUG_ASSERT(lck != NULL);
3446     if (__kmp_env_consistency_check) {
3447       __kmp_push_sync(global_tid, ct_critical, loc, lck, __kmp_user_lock_seq);
3448     }
3449     KMP_D_LOCK_FUNC(lk, set)(lk, global_tid);
3450   } else {
3451     kmp_indirect_lock_t *ilk = *((kmp_indirect_lock_t **)lk);
3452     lck = ilk->lock;
3453     KMP_DEBUG_ASSERT(lck != NULL);
3454     if (__kmp_env_consistency_check) {
3455       __kmp_push_sync(global_tid, ct_critical, loc, lck, __kmp_user_lock_seq);
3456     }
3457     KMP_I_LOCK_FUNC(ilk, set)(lck, global_tid);
3458   }
3459 
3460 #else // KMP_USE_DYNAMIC_LOCK
3461 
3462   // We know that the fast reduction code is only emitted by Intel compilers
3463   // with 32 byte critical sections. If there isn't enough space, then we
3464   // have to use a pointer.
3465   if (__kmp_base_user_lock_size <= INTEL_CRITICAL_SIZE) {
3466     lck = (kmp_user_lock_p)crit;
3467   } else {
3468     lck = __kmp_get_critical_section_ptr(crit, loc, global_tid);
3469   }
3470   KMP_DEBUG_ASSERT(lck != NULL);
3471 
3472   if (__kmp_env_consistency_check)
3473     __kmp_push_sync(global_tid, ct_critical, loc, lck);
3474 
3475   __kmp_acquire_user_lock_with_checks(lck, global_tid);
3476 
3477 #endif // KMP_USE_DYNAMIC_LOCK
3478 }
3479 
3480 // used in a critical section reduce block
3481 static __forceinline void
3482 __kmp_end_critical_section_reduce_block(ident_t *loc, kmp_int32 global_tid,
3483                                         kmp_critical_name *crit) {
3484 
3485   kmp_user_lock_p lck;
3486 
3487 #if KMP_USE_DYNAMIC_LOCK
3488 
3489   if (KMP_IS_D_LOCK(__kmp_user_lock_seq)) {
3490     lck = (kmp_user_lock_p)crit;
3491     if (__kmp_env_consistency_check)
3492       __kmp_pop_sync(global_tid, ct_critical, loc);
3493     KMP_D_LOCK_FUNC(lck, unset)((kmp_dyna_lock_t *)lck, global_tid);
3494   } else {
3495     kmp_indirect_lock_t *ilk =
3496         (kmp_indirect_lock_t *)TCR_PTR(*((kmp_indirect_lock_t **)crit));
3497     if (__kmp_env_consistency_check)
3498       __kmp_pop_sync(global_tid, ct_critical, loc);
3499     KMP_I_LOCK_FUNC(ilk, unset)(ilk->lock, global_tid);
3500   }
3501 
3502 #else // KMP_USE_DYNAMIC_LOCK
3503 
3504   // We know that the fast reduction code is only emitted by Intel compilers
3505   // with 32 byte critical sections. If there isn't enough space, then we have
3506   // to use a pointer.
3507   if (__kmp_base_user_lock_size > 32) {
3508     lck = *((kmp_user_lock_p *)crit);
3509     KMP_ASSERT(lck != NULL);
3510   } else {
3511     lck = (kmp_user_lock_p)crit;
3512   }
3513 
3514   if (__kmp_env_consistency_check)
3515     __kmp_pop_sync(global_tid, ct_critical, loc);
3516 
3517   __kmp_release_user_lock_with_checks(lck, global_tid);
3518 
3519 #endif // KMP_USE_DYNAMIC_LOCK
3520 } // __kmp_end_critical_section_reduce_block
3521 
3522 static __forceinline int
3523 __kmp_swap_teams_for_teams_reduction(kmp_info_t *th, kmp_team_t **team_p,
3524                                      int *task_state) {
3525   kmp_team_t *team;
3526 
3527   // Check if we are inside the teams construct?
3528   if (th->th.th_teams_microtask) {
3529     *team_p = team = th->th.th_team;
3530     if (team->t.t_level == th->th.th_teams_level) {
3531       // This is reduction at teams construct.
3532       KMP_DEBUG_ASSERT(!th->th.th_info.ds.ds_tid); // AC: check that tid == 0
3533       // Let's swap teams temporarily for the reduction.
3534       th->th.th_info.ds.ds_tid = team->t.t_master_tid;
3535       th->th.th_team = team->t.t_parent;
3536       th->th.th_team_nproc = th->th.th_team->t.t_nproc;
3537       th->th.th_task_team = th->th.th_team->t.t_task_team[0];
3538       *task_state = th->th.th_task_state;
3539       th->th.th_task_state = 0;
3540 
3541       return 1;
3542     }
3543   }
3544   return 0;
3545 }
3546 
3547 static __forceinline void
3548 __kmp_restore_swapped_teams(kmp_info_t *th, kmp_team_t *team, int task_state) {
3549   // Restore thread structure swapped in __kmp_swap_teams_for_teams_reduction.
3550   th->th.th_info.ds.ds_tid = 0;
3551   th->th.th_team = team;
3552   th->th.th_team_nproc = team->t.t_nproc;
3553   th->th.th_task_team = team->t.t_task_team[task_state];
3554   __kmp_type_convert(task_state, &(th->th.th_task_state));
3555 }
3556 
3557 /* 2.a.i. Reduce Block without a terminating barrier */
3558 /*!
3559 @ingroup SYNCHRONIZATION
3560 @param loc source location information
3561 @param global_tid global thread number
3562 @param num_vars number of items (variables) to be reduced
3563 @param reduce_size size of data in bytes to be reduced
3564 @param reduce_data pointer to data to be reduced
3565 @param reduce_func callback function providing reduction operation on two
3566 operands and returning result of reduction in lhs_data
3567 @param lck pointer to the unique lock data structure
3568 @result 1 for the primary thread, 0 for all other team threads, 2 for all team
3569 threads if atomic reduction needed
3570 
3571 The nowait version is used for a reduce clause with the nowait argument.
3572 */
3573 kmp_int32
3574 __kmpc_reduce_nowait(ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars,
3575                      size_t reduce_size, void *reduce_data,
3576                      void (*reduce_func)(void *lhs_data, void *rhs_data),
3577                      kmp_critical_name *lck) {
3578 
3579   KMP_COUNT_BLOCK(REDUCE_nowait);
3580   int retval = 0;
3581   PACKED_REDUCTION_METHOD_T packed_reduction_method;
3582   kmp_info_t *th;
3583   kmp_team_t *team;
3584   int teams_swapped = 0, task_state;
3585   KA_TRACE(10, ("__kmpc_reduce_nowait() enter: called T#%d\n", global_tid));
3586   __kmp_assert_valid_gtid(global_tid);
3587 
3588   // why do we need this initialization here at all?
3589   // Reduction clause can not be used as a stand-alone directive.
3590 
3591   // do not call __kmp_serial_initialize(), it will be called by
3592   // __kmp_parallel_initialize() if needed
3593   // possible detection of false-positive race by the threadchecker ???
3594   if (!TCR_4(__kmp_init_parallel))
3595     __kmp_parallel_initialize();
3596 
3597   __kmp_resume_if_soft_paused();
3598 
3599 // check correctness of reduce block nesting
3600 #if KMP_USE_DYNAMIC_LOCK
3601   if (__kmp_env_consistency_check)
3602     __kmp_push_sync(global_tid, ct_reduce, loc, NULL, 0);
3603 #else
3604   if (__kmp_env_consistency_check)
3605     __kmp_push_sync(global_tid, ct_reduce, loc, NULL);
3606 #endif
3607 
3608   th = __kmp_thread_from_gtid(global_tid);
3609   teams_swapped = __kmp_swap_teams_for_teams_reduction(th, &team, &task_state);
3610 
3611   // packed_reduction_method value will be reused by __kmp_end_reduce* function,
3612   // the value should be kept in a variable
3613   // the variable should be either a construct-specific or thread-specific
3614   // property, not a team specific property
3615   //     (a thread can reach the next reduce block on the next construct, reduce
3616   //     method may differ on the next construct)
3617   // an ident_t "loc" parameter could be used as a construct-specific property
3618   // (what if loc == 0?)
3619   //     (if both construct-specific and team-specific variables were shared,
3620   //     then unness extra syncs should be needed)
3621   // a thread-specific variable is better regarding two issues above (next
3622   // construct and extra syncs)
3623   // a thread-specific "th_local.reduction_method" variable is used currently
3624   // each thread executes 'determine' and 'set' lines (no need to execute by one
3625   // thread, to avoid unness extra syncs)
3626 
3627   packed_reduction_method = __kmp_determine_reduction_method(
3628       loc, global_tid, num_vars, reduce_size, reduce_data, reduce_func, lck);
3629   __KMP_SET_REDUCTION_METHOD(global_tid, packed_reduction_method);
3630 
3631   OMPT_REDUCTION_DECL(th, global_tid);
3632   if (packed_reduction_method == critical_reduce_block) {
3633 
3634     OMPT_REDUCTION_BEGIN;
3635 
3636     __kmp_enter_critical_section_reduce_block(loc, global_tid, lck);
3637     retval = 1;
3638 
3639   } else if (packed_reduction_method == empty_reduce_block) {
3640 
3641     OMPT_REDUCTION_BEGIN;
3642 
3643     // usage: if team size == 1, no synchronization is required ( Intel
3644     // platforms only )
3645     retval = 1;
3646 
3647   } else if (packed_reduction_method == atomic_reduce_block) {
3648 
3649     retval = 2;
3650 
3651     // all threads should do this pop here (because __kmpc_end_reduce_nowait()
3652     // won't be called by the code gen)
3653     //     (it's not quite good, because the checking block has been closed by
3654     //     this 'pop',
3655     //      but atomic operation has not been executed yet, will be executed
3656     //      slightly later, literally on next instruction)
3657     if (__kmp_env_consistency_check)
3658       __kmp_pop_sync(global_tid, ct_reduce, loc);
3659 
3660   } else if (TEST_REDUCTION_METHOD(packed_reduction_method,
3661                                    tree_reduce_block)) {
3662 
3663 // AT: performance issue: a real barrier here
3664 // AT: (if primary thread is slow, other threads are blocked here waiting for
3665 //      the primary thread to come and release them)
3666 // AT: (it's not what a customer might expect specifying NOWAIT clause)
3667 // AT: (specifying NOWAIT won't result in improvement of performance, it'll
3668 //      be confusing to a customer)
3669 // AT: another implementation of *barrier_gather*nowait() (or some other design)
3670 // might go faster and be more in line with sense of NOWAIT
3671 // AT: TO DO: do epcc test and compare times
3672 
3673 // this barrier should be invisible to a customer and to the threading profile
3674 // tool (it's neither a terminating barrier nor customer's code, it's
3675 // used for an internal purpose)
3676 #if OMPT_SUPPORT
3677     // JP: can this barrier potentially leed to task scheduling?
3678     // JP: as long as there is a barrier in the implementation, OMPT should and
3679     // will provide the barrier events
3680     //         so we set-up the necessary frame/return addresses.
3681     ompt_frame_t *ompt_frame;
3682     if (ompt_enabled.enabled) {
3683       __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
3684       if (ompt_frame->enter_frame.ptr == NULL)
3685         ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
3686     }
3687     OMPT_STORE_RETURN_ADDRESS(global_tid);
3688 #endif
3689 #if USE_ITT_NOTIFY
3690     __kmp_threads[global_tid]->th.th_ident = loc;
3691 #endif
3692     retval =
3693         __kmp_barrier(UNPACK_REDUCTION_BARRIER(packed_reduction_method),
3694                       global_tid, FALSE, reduce_size, reduce_data, reduce_func);
3695     retval = (retval != 0) ? (0) : (1);
3696 #if OMPT_SUPPORT && OMPT_OPTIONAL
3697     if (ompt_enabled.enabled) {
3698       ompt_frame->enter_frame = ompt_data_none;
3699     }
3700 #endif
3701 
3702     // all other workers except primary thread should do this pop here
3703     //     ( none of other workers will get to __kmpc_end_reduce_nowait() )
3704     if (__kmp_env_consistency_check) {
3705       if (retval == 0) {
3706         __kmp_pop_sync(global_tid, ct_reduce, loc);
3707       }
3708     }
3709 
3710   } else {
3711 
3712     // should never reach this block
3713     KMP_ASSERT(0); // "unexpected method"
3714   }
3715   if (teams_swapped) {
3716     __kmp_restore_swapped_teams(th, team, task_state);
3717   }
3718   KA_TRACE(
3719       10,
3720       ("__kmpc_reduce_nowait() exit: called T#%d: method %08x, returns %08x\n",
3721        global_tid, packed_reduction_method, retval));
3722 
3723   return retval;
3724 }
3725 
3726 /*!
3727 @ingroup SYNCHRONIZATION
3728 @param loc source location information
3729 @param global_tid global thread id.
3730 @param lck pointer to the unique lock data structure
3731 
3732 Finish the execution of a reduce nowait.
3733 */
3734 void __kmpc_end_reduce_nowait(ident_t *loc, kmp_int32 global_tid,
3735                               kmp_critical_name *lck) {
3736 
3737   PACKED_REDUCTION_METHOD_T packed_reduction_method;
3738 
3739   KA_TRACE(10, ("__kmpc_end_reduce_nowait() enter: called T#%d\n", global_tid));
3740   __kmp_assert_valid_gtid(global_tid);
3741 
3742   packed_reduction_method = __KMP_GET_REDUCTION_METHOD(global_tid);
3743 
3744   OMPT_REDUCTION_DECL(__kmp_thread_from_gtid(global_tid), global_tid);
3745 
3746   if (packed_reduction_method == critical_reduce_block) {
3747 
3748     __kmp_end_critical_section_reduce_block(loc, global_tid, lck);
3749     OMPT_REDUCTION_END;
3750 
3751   } else if (packed_reduction_method == empty_reduce_block) {
3752 
3753     // usage: if team size == 1, no synchronization is required ( on Intel
3754     // platforms only )
3755 
3756     OMPT_REDUCTION_END;
3757 
3758   } else if (packed_reduction_method == atomic_reduce_block) {
3759 
3760     // neither primary thread nor other workers should get here
3761     //     (code gen does not generate this call in case 2: atomic reduce block)
3762     // actually it's better to remove this elseif at all;
3763     // after removal this value will checked by the 'else' and will assert
3764 
3765   } else if (TEST_REDUCTION_METHOD(packed_reduction_method,
3766                                    tree_reduce_block)) {
3767 
3768     // only primary thread gets here
3769     // OMPT: tree reduction is annotated in the barrier code
3770 
3771   } else {
3772 
3773     // should never reach this block
3774     KMP_ASSERT(0); // "unexpected method"
3775   }
3776 
3777   if (__kmp_env_consistency_check)
3778     __kmp_pop_sync(global_tid, ct_reduce, loc);
3779 
3780   KA_TRACE(10, ("__kmpc_end_reduce_nowait() exit: called T#%d: method %08x\n",
3781                 global_tid, packed_reduction_method));
3782 
3783   return;
3784 }
3785 
3786 /* 2.a.ii. Reduce Block with a terminating barrier */
3787 
3788 /*!
3789 @ingroup SYNCHRONIZATION
3790 @param loc source location information
3791 @param global_tid global thread number
3792 @param num_vars number of items (variables) to be reduced
3793 @param reduce_size size of data in bytes to be reduced
3794 @param reduce_data pointer to data to be reduced
3795 @param reduce_func callback function providing reduction operation on two
3796 operands and returning result of reduction in lhs_data
3797 @param lck pointer to the unique lock data structure
3798 @result 1 for the primary thread, 0 for all other team threads, 2 for all team
3799 threads if atomic reduction needed
3800 
3801 A blocking reduce that includes an implicit barrier.
3802 */
3803 kmp_int32 __kmpc_reduce(ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars,
3804                         size_t reduce_size, void *reduce_data,
3805                         void (*reduce_func)(void *lhs_data, void *rhs_data),
3806                         kmp_critical_name *lck) {
3807   KMP_COUNT_BLOCK(REDUCE_wait);
3808   int retval = 0;
3809   PACKED_REDUCTION_METHOD_T packed_reduction_method;
3810   kmp_info_t *th;
3811   kmp_team_t *team;
3812   int teams_swapped = 0, task_state;
3813 
3814   KA_TRACE(10, ("__kmpc_reduce() enter: called T#%d\n", global_tid));
3815   __kmp_assert_valid_gtid(global_tid);
3816 
3817   // why do we need this initialization here at all?
3818   // Reduction clause can not be a stand-alone directive.
3819 
3820   // do not call __kmp_serial_initialize(), it will be called by
3821   // __kmp_parallel_initialize() if needed
3822   // possible detection of false-positive race by the threadchecker ???
3823   if (!TCR_4(__kmp_init_parallel))
3824     __kmp_parallel_initialize();
3825 
3826   __kmp_resume_if_soft_paused();
3827 
3828 // check correctness of reduce block nesting
3829 #if KMP_USE_DYNAMIC_LOCK
3830   if (__kmp_env_consistency_check)
3831     __kmp_push_sync(global_tid, ct_reduce, loc, NULL, 0);
3832 #else
3833   if (__kmp_env_consistency_check)
3834     __kmp_push_sync(global_tid, ct_reduce, loc, NULL);
3835 #endif
3836 
3837   th = __kmp_thread_from_gtid(global_tid);
3838   teams_swapped = __kmp_swap_teams_for_teams_reduction(th, &team, &task_state);
3839 
3840   packed_reduction_method = __kmp_determine_reduction_method(
3841       loc, global_tid, num_vars, reduce_size, reduce_data, reduce_func, lck);
3842   __KMP_SET_REDUCTION_METHOD(global_tid, packed_reduction_method);
3843 
3844   OMPT_REDUCTION_DECL(th, global_tid);
3845 
3846   if (packed_reduction_method == critical_reduce_block) {
3847 
3848     OMPT_REDUCTION_BEGIN;
3849     __kmp_enter_critical_section_reduce_block(loc, global_tid, lck);
3850     retval = 1;
3851 
3852   } else if (packed_reduction_method == empty_reduce_block) {
3853 
3854     OMPT_REDUCTION_BEGIN;
3855     // usage: if team size == 1, no synchronization is required ( Intel
3856     // platforms only )
3857     retval = 1;
3858 
3859   } else if (packed_reduction_method == atomic_reduce_block) {
3860 
3861     retval = 2;
3862 
3863   } else if (TEST_REDUCTION_METHOD(packed_reduction_method,
3864                                    tree_reduce_block)) {
3865 
3866 // case tree_reduce_block:
3867 // this barrier should be visible to a customer and to the threading profile
3868 // tool (it's a terminating barrier on constructs if NOWAIT not specified)
3869 #if OMPT_SUPPORT
3870     ompt_frame_t *ompt_frame;
3871     if (ompt_enabled.enabled) {
3872       __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
3873       if (ompt_frame->enter_frame.ptr == NULL)
3874         ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
3875     }
3876     OMPT_STORE_RETURN_ADDRESS(global_tid);
3877 #endif
3878 #if USE_ITT_NOTIFY
3879     __kmp_threads[global_tid]->th.th_ident =
3880         loc; // needed for correct notification of frames
3881 #endif
3882     retval =
3883         __kmp_barrier(UNPACK_REDUCTION_BARRIER(packed_reduction_method),
3884                       global_tid, TRUE, reduce_size, reduce_data, reduce_func);
3885     retval = (retval != 0) ? (0) : (1);
3886 #if OMPT_SUPPORT && OMPT_OPTIONAL
3887     if (ompt_enabled.enabled) {
3888       ompt_frame->enter_frame = ompt_data_none;
3889     }
3890 #endif
3891 
3892     // all other workers except primary thread should do this pop here
3893     // (none of other workers except primary will enter __kmpc_end_reduce())
3894     if (__kmp_env_consistency_check) {
3895       if (retval == 0) { // 0: all other workers; 1: primary thread
3896         __kmp_pop_sync(global_tid, ct_reduce, loc);
3897       }
3898     }
3899 
3900   } else {
3901 
3902     // should never reach this block
3903     KMP_ASSERT(0); // "unexpected method"
3904   }
3905   if (teams_swapped) {
3906     __kmp_restore_swapped_teams(th, team, task_state);
3907   }
3908 
3909   KA_TRACE(10,
3910            ("__kmpc_reduce() exit: called T#%d: method %08x, returns %08x\n",
3911             global_tid, packed_reduction_method, retval));
3912   return retval;
3913 }
3914 
3915 /*!
3916 @ingroup SYNCHRONIZATION
3917 @param loc source location information
3918 @param global_tid global thread id.
3919 @param lck pointer to the unique lock data structure
3920 
3921 Finish the execution of a blocking reduce.
3922 The <tt>lck</tt> pointer must be the same as that used in the corresponding
3923 start function.
3924 */
3925 void __kmpc_end_reduce(ident_t *loc, kmp_int32 global_tid,
3926                        kmp_critical_name *lck) {
3927 
3928   PACKED_REDUCTION_METHOD_T packed_reduction_method;
3929   kmp_info_t *th;
3930   kmp_team_t *team;
3931   int teams_swapped = 0, task_state;
3932 
3933   KA_TRACE(10, ("__kmpc_end_reduce() enter: called T#%d\n", global_tid));
3934   __kmp_assert_valid_gtid(global_tid);
3935 
3936   th = __kmp_thread_from_gtid(global_tid);
3937   teams_swapped = __kmp_swap_teams_for_teams_reduction(th, &team, &task_state);
3938 
3939   packed_reduction_method = __KMP_GET_REDUCTION_METHOD(global_tid);
3940 
3941   // this barrier should be visible to a customer and to the threading profile
3942   // tool (it's a terminating barrier on constructs if NOWAIT not specified)
3943   OMPT_REDUCTION_DECL(th, global_tid);
3944 
3945   if (packed_reduction_method == critical_reduce_block) {
3946     __kmp_end_critical_section_reduce_block(loc, global_tid, lck);
3947 
3948     OMPT_REDUCTION_END;
3949 
3950 // TODO: implicit barrier: should be exposed
3951 #if OMPT_SUPPORT
3952     ompt_frame_t *ompt_frame;
3953     if (ompt_enabled.enabled) {
3954       __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
3955       if (ompt_frame->enter_frame.ptr == NULL)
3956         ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
3957     }
3958     OMPT_STORE_RETURN_ADDRESS(global_tid);
3959 #endif
3960 #if USE_ITT_NOTIFY
3961     __kmp_threads[global_tid]->th.th_ident = loc;
3962 #endif
3963     __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
3964 #if OMPT_SUPPORT && OMPT_OPTIONAL
3965     if (ompt_enabled.enabled) {
3966       ompt_frame->enter_frame = ompt_data_none;
3967     }
3968 #endif
3969 
3970   } else if (packed_reduction_method == empty_reduce_block) {
3971 
3972     OMPT_REDUCTION_END;
3973 
3974 // usage: if team size==1, no synchronization is required (Intel platforms only)
3975 
3976 // TODO: implicit barrier: should be exposed
3977 #if OMPT_SUPPORT
3978     ompt_frame_t *ompt_frame;
3979     if (ompt_enabled.enabled) {
3980       __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
3981       if (ompt_frame->enter_frame.ptr == NULL)
3982         ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
3983     }
3984     OMPT_STORE_RETURN_ADDRESS(global_tid);
3985 #endif
3986 #if USE_ITT_NOTIFY
3987     __kmp_threads[global_tid]->th.th_ident = loc;
3988 #endif
3989     __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
3990 #if OMPT_SUPPORT && OMPT_OPTIONAL
3991     if (ompt_enabled.enabled) {
3992       ompt_frame->enter_frame = ompt_data_none;
3993     }
3994 #endif
3995 
3996   } else if (packed_reduction_method == atomic_reduce_block) {
3997 
3998 #if OMPT_SUPPORT
3999     ompt_frame_t *ompt_frame;
4000     if (ompt_enabled.enabled) {
4001       __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
4002       if (ompt_frame->enter_frame.ptr == NULL)
4003         ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
4004     }
4005     OMPT_STORE_RETURN_ADDRESS(global_tid);
4006 #endif
4007 // TODO: implicit barrier: should be exposed
4008 #if USE_ITT_NOTIFY
4009     __kmp_threads[global_tid]->th.th_ident = loc;
4010 #endif
4011     __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
4012 #if OMPT_SUPPORT && OMPT_OPTIONAL
4013     if (ompt_enabled.enabled) {
4014       ompt_frame->enter_frame = ompt_data_none;
4015     }
4016 #endif
4017 
4018   } else if (TEST_REDUCTION_METHOD(packed_reduction_method,
4019                                    tree_reduce_block)) {
4020 
4021     // only primary thread executes here (primary releases all other workers)
4022     __kmp_end_split_barrier(UNPACK_REDUCTION_BARRIER(packed_reduction_method),
4023                             global_tid);
4024 
4025   } else {
4026 
4027     // should never reach this block
4028     KMP_ASSERT(0); // "unexpected method"
4029   }
4030   if (teams_swapped) {
4031     __kmp_restore_swapped_teams(th, team, task_state);
4032   }
4033 
4034   if (__kmp_env_consistency_check)
4035     __kmp_pop_sync(global_tid, ct_reduce, loc);
4036 
4037   KA_TRACE(10, ("__kmpc_end_reduce() exit: called T#%d: method %08x\n",
4038                 global_tid, packed_reduction_method));
4039 
4040   return;
4041 }
4042 
4043 #undef __KMP_GET_REDUCTION_METHOD
4044 #undef __KMP_SET_REDUCTION_METHOD
4045 
4046 /* end of interface to fast scalable reduce routines */
4047 
4048 kmp_uint64 __kmpc_get_taskid() {
4049 
4050   kmp_int32 gtid;
4051   kmp_info_t *thread;
4052 
4053   gtid = __kmp_get_gtid();
4054   if (gtid < 0) {
4055     return 0;
4056   }
4057   thread = __kmp_thread_from_gtid(gtid);
4058   return thread->th.th_current_task->td_task_id;
4059 
4060 } // __kmpc_get_taskid
4061 
4062 kmp_uint64 __kmpc_get_parent_taskid() {
4063 
4064   kmp_int32 gtid;
4065   kmp_info_t *thread;
4066   kmp_taskdata_t *parent_task;
4067 
4068   gtid = __kmp_get_gtid();
4069   if (gtid < 0) {
4070     return 0;
4071   }
4072   thread = __kmp_thread_from_gtid(gtid);
4073   parent_task = thread->th.th_current_task->td_parent;
4074   return (parent_task == NULL ? 0 : parent_task->td_task_id);
4075 
4076 } // __kmpc_get_parent_taskid
4077 
4078 /*!
4079 @ingroup WORK_SHARING
4080 @param loc  source location information.
4081 @param gtid  global thread number.
4082 @param num_dims  number of associated doacross loops.
4083 @param dims  info on loops bounds.
4084 
4085 Initialize doacross loop information.
4086 Expect compiler send us inclusive bounds,
4087 e.g. for(i=2;i<9;i+=2) lo=2, up=8, st=2.
4088 */
4089 void __kmpc_doacross_init(ident_t *loc, int gtid, int num_dims,
4090                           const struct kmp_dim *dims) {
4091   __kmp_assert_valid_gtid(gtid);
4092   int j, idx;
4093   kmp_int64 last, trace_count;
4094   kmp_info_t *th = __kmp_threads[gtid];
4095   kmp_team_t *team = th->th.th_team;
4096   kmp_uint32 *flags;
4097   kmp_disp_t *pr_buf = th->th.th_dispatch;
4098   dispatch_shared_info_t *sh_buf;
4099 
4100   KA_TRACE(
4101       20,
4102       ("__kmpc_doacross_init() enter: called T#%d, num dims %d, active %d\n",
4103        gtid, num_dims, !team->t.t_serialized));
4104   KMP_DEBUG_ASSERT(dims != NULL);
4105   KMP_DEBUG_ASSERT(num_dims > 0);
4106 
4107   if (team->t.t_serialized) {
4108     KA_TRACE(20, ("__kmpc_doacross_init() exit: serialized team\n"));
4109     return; // no dependencies if team is serialized
4110   }
4111   KMP_DEBUG_ASSERT(team->t.t_nproc > 1);
4112   idx = pr_buf->th_doacross_buf_idx++; // Increment index of shared buffer for
4113   // the next loop
4114   sh_buf = &team->t.t_disp_buffer[idx % __kmp_dispatch_num_buffers];
4115 
4116   // Save bounds info into allocated private buffer
4117   KMP_DEBUG_ASSERT(pr_buf->th_doacross_info == NULL);
4118   pr_buf->th_doacross_info = (kmp_int64 *)__kmp_thread_malloc(
4119       th, sizeof(kmp_int64) * (4 * num_dims + 1));
4120   KMP_DEBUG_ASSERT(pr_buf->th_doacross_info != NULL);
4121   pr_buf->th_doacross_info[0] =
4122       (kmp_int64)num_dims; // first element is number of dimensions
4123   // Save also address of num_done in order to access it later without knowing
4124   // the buffer index
4125   pr_buf->th_doacross_info[1] = (kmp_int64)&sh_buf->doacross_num_done;
4126   pr_buf->th_doacross_info[2] = dims[0].lo;
4127   pr_buf->th_doacross_info[3] = dims[0].up;
4128   pr_buf->th_doacross_info[4] = dims[0].st;
4129   last = 5;
4130   for (j = 1; j < num_dims; ++j) {
4131     kmp_int64
4132         range_length; // To keep ranges of all dimensions but the first dims[0]
4133     if (dims[j].st == 1) { // most common case
4134       // AC: should we care of ranges bigger than LLONG_MAX? (not for now)
4135       range_length = dims[j].up - dims[j].lo + 1;
4136     } else {
4137       if (dims[j].st > 0) {
4138         KMP_DEBUG_ASSERT(dims[j].up > dims[j].lo);
4139         range_length = (kmp_uint64)(dims[j].up - dims[j].lo) / dims[j].st + 1;
4140       } else { // negative increment
4141         KMP_DEBUG_ASSERT(dims[j].lo > dims[j].up);
4142         range_length =
4143             (kmp_uint64)(dims[j].lo - dims[j].up) / (-dims[j].st) + 1;
4144       }
4145     }
4146     pr_buf->th_doacross_info[last++] = range_length;
4147     pr_buf->th_doacross_info[last++] = dims[j].lo;
4148     pr_buf->th_doacross_info[last++] = dims[j].up;
4149     pr_buf->th_doacross_info[last++] = dims[j].st;
4150   }
4151 
4152   // Compute total trip count.
4153   // Start with range of dims[0] which we don't need to keep in the buffer.
4154   if (dims[0].st == 1) { // most common case
4155     trace_count = dims[0].up - dims[0].lo + 1;
4156   } else if (dims[0].st > 0) {
4157     KMP_DEBUG_ASSERT(dims[0].up > dims[0].lo);
4158     trace_count = (kmp_uint64)(dims[0].up - dims[0].lo) / dims[0].st + 1;
4159   } else { // negative increment
4160     KMP_DEBUG_ASSERT(dims[0].lo > dims[0].up);
4161     trace_count = (kmp_uint64)(dims[0].lo - dims[0].up) / (-dims[0].st) + 1;
4162   }
4163   for (j = 1; j < num_dims; ++j) {
4164     trace_count *= pr_buf->th_doacross_info[4 * j + 1]; // use kept ranges
4165   }
4166   KMP_DEBUG_ASSERT(trace_count > 0);
4167 
4168   // Check if shared buffer is not occupied by other loop (idx -
4169   // __kmp_dispatch_num_buffers)
4170   if (idx != sh_buf->doacross_buf_idx) {
4171     // Shared buffer is occupied, wait for it to be free
4172     __kmp_wait_4((volatile kmp_uint32 *)&sh_buf->doacross_buf_idx, idx,
4173                  __kmp_eq_4, NULL);
4174   }
4175 #if KMP_32_BIT_ARCH
4176   // Check if we are the first thread. After the CAS the first thread gets 0,
4177   // others get 1 if initialization is in progress, allocated pointer otherwise.
4178   // Treat pointer as volatile integer (value 0 or 1) until memory is allocated.
4179   flags = (kmp_uint32 *)KMP_COMPARE_AND_STORE_RET32(
4180       (volatile kmp_int32 *)&sh_buf->doacross_flags, NULL, 1);
4181 #else
4182   flags = (kmp_uint32 *)KMP_COMPARE_AND_STORE_RET64(
4183       (volatile kmp_int64 *)&sh_buf->doacross_flags, NULL, 1LL);
4184 #endif
4185   if (flags == NULL) {
4186     // we are the first thread, allocate the array of flags
4187     size_t size =
4188         (size_t)trace_count / 8 + 8; // in bytes, use single bit per iteration
4189     flags = (kmp_uint32 *)__kmp_thread_calloc(th, size, 1);
4190     KMP_MB();
4191     sh_buf->doacross_flags = flags;
4192   } else if (flags == (kmp_uint32 *)1) {
4193 #if KMP_32_BIT_ARCH
4194     // initialization is still in progress, need to wait
4195     while (*(volatile kmp_int32 *)&sh_buf->doacross_flags == 1)
4196 #else
4197     while (*(volatile kmp_int64 *)&sh_buf->doacross_flags == 1LL)
4198 #endif
4199       KMP_YIELD(TRUE);
4200     KMP_MB();
4201   } else {
4202     KMP_MB();
4203   }
4204   KMP_DEBUG_ASSERT(sh_buf->doacross_flags > (kmp_uint32 *)1); // check ptr value
4205   pr_buf->th_doacross_flags =
4206       sh_buf->doacross_flags; // save private copy in order to not
4207   // touch shared buffer on each iteration
4208   KA_TRACE(20, ("__kmpc_doacross_init() exit: T#%d\n", gtid));
4209 }
4210 
4211 void __kmpc_doacross_wait(ident_t *loc, int gtid, const kmp_int64 *vec) {
4212   __kmp_assert_valid_gtid(gtid);
4213   kmp_int64 shft;
4214   size_t num_dims, i;
4215   kmp_uint32 flag;
4216   kmp_int64 iter_number; // iteration number of "collapsed" loop nest
4217   kmp_info_t *th = __kmp_threads[gtid];
4218   kmp_team_t *team = th->th.th_team;
4219   kmp_disp_t *pr_buf;
4220   kmp_int64 lo, up, st;
4221 
4222   KA_TRACE(20, ("__kmpc_doacross_wait() enter: called T#%d\n", gtid));
4223   if (team->t.t_serialized) {
4224     KA_TRACE(20, ("__kmpc_doacross_wait() exit: serialized team\n"));
4225     return; // no dependencies if team is serialized
4226   }
4227 
4228   // calculate sequential iteration number and check out-of-bounds condition
4229   pr_buf = th->th.th_dispatch;
4230   KMP_DEBUG_ASSERT(pr_buf->th_doacross_info != NULL);
4231   num_dims = (size_t)pr_buf->th_doacross_info[0];
4232   lo = pr_buf->th_doacross_info[2];
4233   up = pr_buf->th_doacross_info[3];
4234   st = pr_buf->th_doacross_info[4];
4235 #if OMPT_SUPPORT && OMPT_OPTIONAL
4236   ompt_dependence_t deps[num_dims];
4237 #endif
4238   if (st == 1) { // most common case
4239     if (vec[0] < lo || vec[0] > up) {
4240       KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
4241                     "bounds [%lld,%lld]\n",
4242                     gtid, vec[0], lo, up));
4243       return;
4244     }
4245     iter_number = vec[0] - lo;
4246   } else if (st > 0) {
4247     if (vec[0] < lo || vec[0] > up) {
4248       KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
4249                     "bounds [%lld,%lld]\n",
4250                     gtid, vec[0], lo, up));
4251       return;
4252     }
4253     iter_number = (kmp_uint64)(vec[0] - lo) / st;
4254   } else { // negative increment
4255     if (vec[0] > lo || vec[0] < up) {
4256       KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
4257                     "bounds [%lld,%lld]\n",
4258                     gtid, vec[0], lo, up));
4259       return;
4260     }
4261     iter_number = (kmp_uint64)(lo - vec[0]) / (-st);
4262   }
4263 #if OMPT_SUPPORT && OMPT_OPTIONAL
4264   deps[0].variable.value = iter_number;
4265   deps[0].dependence_type = ompt_dependence_type_sink;
4266 #endif
4267   for (i = 1; i < num_dims; ++i) {
4268     kmp_int64 iter, ln;
4269     size_t j = i * 4;
4270     ln = pr_buf->th_doacross_info[j + 1];
4271     lo = pr_buf->th_doacross_info[j + 2];
4272     up = pr_buf->th_doacross_info[j + 3];
4273     st = pr_buf->th_doacross_info[j + 4];
4274     if (st == 1) {
4275       if (vec[i] < lo || vec[i] > up) {
4276         KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
4277                       "bounds [%lld,%lld]\n",
4278                       gtid, vec[i], lo, up));
4279         return;
4280       }
4281       iter = vec[i] - lo;
4282     } else if (st > 0) {
4283       if (vec[i] < lo || vec[i] > up) {
4284         KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
4285                       "bounds [%lld,%lld]\n",
4286                       gtid, vec[i], lo, up));
4287         return;
4288       }
4289       iter = (kmp_uint64)(vec[i] - lo) / st;
4290     } else { // st < 0
4291       if (vec[i] > lo || vec[i] < up) {
4292         KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
4293                       "bounds [%lld,%lld]\n",
4294                       gtid, vec[i], lo, up));
4295         return;
4296       }
4297       iter = (kmp_uint64)(lo - vec[i]) / (-st);
4298     }
4299     iter_number = iter + ln * iter_number;
4300 #if OMPT_SUPPORT && OMPT_OPTIONAL
4301     deps[i].variable.value = iter;
4302     deps[i].dependence_type = ompt_dependence_type_sink;
4303 #endif
4304   }
4305   shft = iter_number % 32; // use 32-bit granularity
4306   iter_number >>= 5; // divided by 32
4307   flag = 1 << shft;
4308   while ((flag & pr_buf->th_doacross_flags[iter_number]) == 0) {
4309     KMP_YIELD(TRUE);
4310   }
4311   KMP_MB();
4312 #if OMPT_SUPPORT && OMPT_OPTIONAL
4313   if (ompt_enabled.ompt_callback_dependences) {
4314     ompt_callbacks.ompt_callback(ompt_callback_dependences)(
4315         &(OMPT_CUR_TASK_INFO(th)->task_data), deps, (kmp_uint32)num_dims);
4316   }
4317 #endif
4318   KA_TRACE(20,
4319            ("__kmpc_doacross_wait() exit: T#%d wait for iter %lld completed\n",
4320             gtid, (iter_number << 5) + shft));
4321 }
4322 
4323 void __kmpc_doacross_post(ident_t *loc, int gtid, const kmp_int64 *vec) {
4324   __kmp_assert_valid_gtid(gtid);
4325   kmp_int64 shft;
4326   size_t num_dims, i;
4327   kmp_uint32 flag;
4328   kmp_int64 iter_number; // iteration number of "collapsed" loop nest
4329   kmp_info_t *th = __kmp_threads[gtid];
4330   kmp_team_t *team = th->th.th_team;
4331   kmp_disp_t *pr_buf;
4332   kmp_int64 lo, st;
4333 
4334   KA_TRACE(20, ("__kmpc_doacross_post() enter: called T#%d\n", gtid));
4335   if (team->t.t_serialized) {
4336     KA_TRACE(20, ("__kmpc_doacross_post() exit: serialized team\n"));
4337     return; // no dependencies if team is serialized
4338   }
4339 
4340   // calculate sequential iteration number (same as in "wait" but no
4341   // out-of-bounds checks)
4342   pr_buf = th->th.th_dispatch;
4343   KMP_DEBUG_ASSERT(pr_buf->th_doacross_info != NULL);
4344   num_dims = (size_t)pr_buf->th_doacross_info[0];
4345   lo = pr_buf->th_doacross_info[2];
4346   st = pr_buf->th_doacross_info[4];
4347 #if OMPT_SUPPORT && OMPT_OPTIONAL
4348   ompt_dependence_t deps[num_dims];
4349 #endif
4350   if (st == 1) { // most common case
4351     iter_number = vec[0] - lo;
4352   } else if (st > 0) {
4353     iter_number = (kmp_uint64)(vec[0] - lo) / st;
4354   } else { // negative increment
4355     iter_number = (kmp_uint64)(lo - vec[0]) / (-st);
4356   }
4357 #if OMPT_SUPPORT && OMPT_OPTIONAL
4358   deps[0].variable.value = iter_number;
4359   deps[0].dependence_type = ompt_dependence_type_source;
4360 #endif
4361   for (i = 1; i < num_dims; ++i) {
4362     kmp_int64 iter, ln;
4363     size_t j = i * 4;
4364     ln = pr_buf->th_doacross_info[j + 1];
4365     lo = pr_buf->th_doacross_info[j + 2];
4366     st = pr_buf->th_doacross_info[j + 4];
4367     if (st == 1) {
4368       iter = vec[i] - lo;
4369     } else if (st > 0) {
4370       iter = (kmp_uint64)(vec[i] - lo) / st;
4371     } else { // st < 0
4372       iter = (kmp_uint64)(lo - vec[i]) / (-st);
4373     }
4374     iter_number = iter + ln * iter_number;
4375 #if OMPT_SUPPORT && OMPT_OPTIONAL
4376     deps[i].variable.value = iter;
4377     deps[i].dependence_type = ompt_dependence_type_source;
4378 #endif
4379   }
4380 #if OMPT_SUPPORT && OMPT_OPTIONAL
4381   if (ompt_enabled.ompt_callback_dependences) {
4382     ompt_callbacks.ompt_callback(ompt_callback_dependences)(
4383         &(OMPT_CUR_TASK_INFO(th)->task_data), deps, (kmp_uint32)num_dims);
4384   }
4385 #endif
4386   shft = iter_number % 32; // use 32-bit granularity
4387   iter_number >>= 5; // divided by 32
4388   flag = 1 << shft;
4389   KMP_MB();
4390   if ((flag & pr_buf->th_doacross_flags[iter_number]) == 0)
4391     KMP_TEST_THEN_OR32(&pr_buf->th_doacross_flags[iter_number], flag);
4392   KA_TRACE(20, ("__kmpc_doacross_post() exit: T#%d iter %lld posted\n", gtid,
4393                 (iter_number << 5) + shft));
4394 }
4395 
4396 void __kmpc_doacross_fini(ident_t *loc, int gtid) {
4397   __kmp_assert_valid_gtid(gtid);
4398   kmp_int32 num_done;
4399   kmp_info_t *th = __kmp_threads[gtid];
4400   kmp_team_t *team = th->th.th_team;
4401   kmp_disp_t *pr_buf = th->th.th_dispatch;
4402 
4403   KA_TRACE(20, ("__kmpc_doacross_fini() enter: called T#%d\n", gtid));
4404   if (team->t.t_serialized) {
4405     KA_TRACE(20, ("__kmpc_doacross_fini() exit: serialized team %p\n", team));
4406     return; // nothing to do
4407   }
4408   num_done =
4409       KMP_TEST_THEN_INC32((kmp_uintptr_t)(pr_buf->th_doacross_info[1])) + 1;
4410   if (num_done == th->th.th_team_nproc) {
4411     // we are the last thread, need to free shared resources
4412     int idx = pr_buf->th_doacross_buf_idx - 1;
4413     dispatch_shared_info_t *sh_buf =
4414         &team->t.t_disp_buffer[idx % __kmp_dispatch_num_buffers];
4415     KMP_DEBUG_ASSERT(pr_buf->th_doacross_info[1] ==
4416                      (kmp_int64)&sh_buf->doacross_num_done);
4417     KMP_DEBUG_ASSERT(num_done == sh_buf->doacross_num_done);
4418     KMP_DEBUG_ASSERT(idx == sh_buf->doacross_buf_idx);
4419     __kmp_thread_free(th, CCAST(kmp_uint32 *, sh_buf->doacross_flags));
4420     sh_buf->doacross_flags = NULL;
4421     sh_buf->doacross_num_done = 0;
4422     sh_buf->doacross_buf_idx +=
4423         __kmp_dispatch_num_buffers; // free buffer for future re-use
4424   }
4425   // free private resources (need to keep buffer index forever)
4426   pr_buf->th_doacross_flags = NULL;
4427   __kmp_thread_free(th, (void *)pr_buf->th_doacross_info);
4428   pr_buf->th_doacross_info = NULL;
4429   KA_TRACE(20, ("__kmpc_doacross_fini() exit: T#%d\n", gtid));
4430 }
4431 
4432 /* OpenMP 5.1 Memory Management routines */
4433 void *omp_alloc(size_t size, omp_allocator_handle_t allocator) {
4434   return __kmp_alloc(__kmp_entry_gtid(), 0, size, allocator);
4435 }
4436 
4437 void *omp_aligned_alloc(size_t align, size_t size,
4438                         omp_allocator_handle_t allocator) {
4439   return __kmp_alloc(__kmp_entry_gtid(), align, size, allocator);
4440 }
4441 
4442 void *omp_calloc(size_t nmemb, size_t size, omp_allocator_handle_t allocator) {
4443   return __kmp_calloc(__kmp_entry_gtid(), 0, nmemb, size, allocator);
4444 }
4445 
4446 void *omp_aligned_calloc(size_t align, size_t nmemb, size_t size,
4447                          omp_allocator_handle_t allocator) {
4448   return __kmp_calloc(__kmp_entry_gtid(), align, nmemb, size, allocator);
4449 }
4450 
4451 void *omp_realloc(void *ptr, size_t size, omp_allocator_handle_t allocator,
4452                   omp_allocator_handle_t free_allocator) {
4453   return __kmp_realloc(__kmp_entry_gtid(), ptr, size, allocator,
4454                        free_allocator);
4455 }
4456 
4457 void omp_free(void *ptr, omp_allocator_handle_t allocator) {
4458   ___kmpc_free(__kmp_entry_gtid(), ptr, allocator);
4459 }
4460 /* end of OpenMP 5.1 Memory Management routines */
4461 
4462 int __kmpc_get_target_offload(void) {
4463   if (!__kmp_init_serial) {
4464     __kmp_serial_initialize();
4465   }
4466   return __kmp_target_offload;
4467 }
4468 
4469 int __kmpc_pause_resource(kmp_pause_status_t level) {
4470   if (!__kmp_init_serial) {
4471     return 1; // Can't pause if runtime is not initialized
4472   }
4473   return __kmp_pause_resource(level);
4474 }
4475 
4476 void __kmpc_error(ident_t *loc, int severity, const char *message) {
4477   if (!__kmp_init_serial)
4478     __kmp_serial_initialize();
4479 
4480   KMP_ASSERT(severity == severity_warning || severity == severity_fatal);
4481 
4482 #if OMPT_SUPPORT
4483   if (ompt_enabled.enabled && ompt_enabled.ompt_callback_error) {
4484     ompt_callbacks.ompt_callback(ompt_callback_error)(
4485         (ompt_severity_t)severity, message, KMP_STRLEN(message),
4486         OMPT_GET_RETURN_ADDRESS(0));
4487   }
4488 #endif // OMPT_SUPPORT
4489 
4490   char *src_loc;
4491   if (loc && loc->psource) {
4492     kmp_str_loc_t str_loc = __kmp_str_loc_init(loc->psource, false);
4493     src_loc =
4494         __kmp_str_format("%s:%d:%d", str_loc.file, str_loc.line, str_loc.col);
4495     __kmp_str_loc_free(&str_loc);
4496   } else {
4497     src_loc = __kmp_str_format("unknown");
4498   }
4499 
4500   if (severity == severity_warning)
4501     KMP_WARNING(UserDirectedWarning, src_loc, message);
4502   else
4503     KMP_FATAL(UserDirectedError, src_loc, message);
4504 
4505   __kmp_str_free(&src_loc);
4506 }
4507 
4508 // Mark begin of scope directive.
4509 void __kmpc_scope(ident_t *loc, kmp_int32 gtid, void *reserved) {
4510 // reserved is for extension of scope directive and not used.
4511 #if OMPT_SUPPORT && OMPT_OPTIONAL
4512   if (ompt_enabled.enabled && ompt_enabled.ompt_callback_work) {
4513     kmp_team_t *team = __kmp_threads[gtid]->th.th_team;
4514     int tid = __kmp_tid_from_gtid(gtid);
4515     ompt_callbacks.ompt_callback(ompt_callback_work)(
4516         ompt_work_scope, ompt_scope_begin,
4517         &(team->t.ompt_team_info.parallel_data),
4518         &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data), 1,
4519         OMPT_GET_RETURN_ADDRESS(0));
4520   }
4521 #endif // OMPT_SUPPORT && OMPT_OPTIONAL
4522 }
4523 
4524 // Mark end of scope directive
4525 void __kmpc_end_scope(ident_t *loc, kmp_int32 gtid, void *reserved) {
4526 // reserved is for extension of scope directive and not used.
4527 #if OMPT_SUPPORT && OMPT_OPTIONAL
4528   if (ompt_enabled.enabled && ompt_enabled.ompt_callback_work) {
4529     kmp_team_t *team = __kmp_threads[gtid]->th.th_team;
4530     int tid = __kmp_tid_from_gtid(gtid);
4531     ompt_callbacks.ompt_callback(ompt_callback_work)(
4532         ompt_work_scope, ompt_scope_end,
4533         &(team->t.ompt_team_info.parallel_data),
4534         &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data), 1,
4535         OMPT_GET_RETURN_ADDRESS(0));
4536   }
4537 #endif // OMPT_SUPPORT && OMPT_OPTIONAL
4538 }
4539 
4540 #ifdef KMP_USE_VERSION_SYMBOLS
4541 // For GOMP compatibility there are two versions of each omp_* API.
4542 // One is the plain C symbol and one is the Fortran symbol with an appended
4543 // underscore. When we implement a specific ompc_* version of an omp_*
4544 // function, we want the plain GOMP versioned symbol to alias the ompc_* version
4545 // instead of the Fortran versions in kmp_ftn_entry.h
4546 extern "C" {
4547 // Have to undef these from omp.h so they aren't translated into
4548 // their ompc counterparts in the KMP_VERSION_OMPC_SYMBOL macros below
4549 #ifdef omp_set_affinity_format
4550 #undef omp_set_affinity_format
4551 #endif
4552 #ifdef omp_get_affinity_format
4553 #undef omp_get_affinity_format
4554 #endif
4555 #ifdef omp_display_affinity
4556 #undef omp_display_affinity
4557 #endif
4558 #ifdef omp_capture_affinity
4559 #undef omp_capture_affinity
4560 #endif
4561 KMP_VERSION_OMPC_SYMBOL(ompc_set_affinity_format, omp_set_affinity_format, 50,
4562                         "OMP_5.0");
4563 KMP_VERSION_OMPC_SYMBOL(ompc_get_affinity_format, omp_get_affinity_format, 50,
4564                         "OMP_5.0");
4565 KMP_VERSION_OMPC_SYMBOL(ompc_display_affinity, omp_display_affinity, 50,
4566                         "OMP_5.0");
4567 KMP_VERSION_OMPC_SYMBOL(ompc_capture_affinity, omp_capture_affinity, 50,
4568                         "OMP_5.0");
4569 } // extern "C"
4570 #endif
4571