xref: /freebsd/contrib/llvm-project/openmp/runtime/src/kmp_csupport.cpp (revision da759cfa320d5076b075d15ff3f00ab3ba5634fd)
1 /*
2  * kmp_csupport.cpp -- kfront linkage support for OpenMP.
3  */
4 
5 //===----------------------------------------------------------------------===//
6 //
7 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8 // See https://llvm.org/LICENSE.txt for license information.
9 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #define __KMP_IMP
14 #include "omp.h" /* extern "C" declarations of user-visible routines */
15 #include "kmp.h"
16 #include "kmp_error.h"
17 #include "kmp_i18n.h"
18 #include "kmp_itt.h"
19 #include "kmp_lock.h"
20 #include "kmp_stats.h"
21 #include "ompt-specific.h"
22 
23 #define MAX_MESSAGE 512
24 
25 // flags will be used in future, e.g. to implement openmp_strict library
26 // restrictions
27 
28 /*!
29  * @ingroup STARTUP_SHUTDOWN
30  * @param loc   in   source location information
31  * @param flags in   for future use (currently ignored)
32  *
33  * Initialize the runtime library. This call is optional; if it is not made then
34  * it will be implicitly called by attempts to use other library functions.
35  */
36 void __kmpc_begin(ident_t *loc, kmp_int32 flags) {
37   // By default __kmpc_begin() is no-op.
38   char *env;
39   if ((env = getenv("KMP_INITIAL_THREAD_BIND")) != NULL &&
40       __kmp_str_match_true(env)) {
41     __kmp_middle_initialize();
42     KC_TRACE(10, ("__kmpc_begin: middle initialization called\n"));
43   } else if (__kmp_ignore_mppbeg() == FALSE) {
44     // By default __kmp_ignore_mppbeg() returns TRUE.
45     __kmp_internal_begin();
46     KC_TRACE(10, ("__kmpc_begin: called\n"));
47   }
48 }
49 
50 /*!
51  * @ingroup STARTUP_SHUTDOWN
52  * @param loc source location information
53  *
54  * Shutdown the runtime library. This is also optional, and even if called will
55  * not do anything unless the `KMP_IGNORE_MPPEND` environment variable is set to
56  * zero.
57  */
58 void __kmpc_end(ident_t *loc) {
59   // By default, __kmp_ignore_mppend() returns TRUE which makes __kmpc_end()
60   // call no-op. However, this can be overridden with KMP_IGNORE_MPPEND
61   // environment variable. If KMP_IGNORE_MPPEND is 0, __kmp_ignore_mppend()
62   // returns FALSE and __kmpc_end() will unregister this root (it can cause
63   // library shut down).
64   if (__kmp_ignore_mppend() == FALSE) {
65     KC_TRACE(10, ("__kmpc_end: called\n"));
66     KA_TRACE(30, ("__kmpc_end\n"));
67 
68     __kmp_internal_end_thread(-1);
69   }
70 #if KMP_OS_WINDOWS && OMPT_SUPPORT
71   // Normal exit process on Windows does not allow worker threads of the final
72   // parallel region to finish reporting their events, so shutting down the
73   // library here fixes the issue at least for the cases where __kmpc_end() is
74   // placed properly.
75   if (ompt_enabled.enabled)
76     __kmp_internal_end_library(__kmp_gtid_get_specific());
77 #endif
78 }
79 
80 /*!
81 @ingroup THREAD_STATES
82 @param loc Source location information.
83 @return The global thread index of the active thread.
84 
85 This function can be called in any context.
86 
87 If the runtime has ony been entered at the outermost level from a
88 single (necessarily non-OpenMP<sup>*</sup>) thread, then the thread number is
89 that which would be returned by omp_get_thread_num() in the outermost
90 active parallel construct. (Or zero if there is no active parallel
91 construct, since the master thread is necessarily thread zero).
92 
93 If multiple non-OpenMP threads all enter an OpenMP construct then this
94 will be a unique thread identifier among all the threads created by
95 the OpenMP runtime (but the value cannote be defined in terms of
96 OpenMP thread ids returned by omp_get_thread_num()).
97 */
98 kmp_int32 __kmpc_global_thread_num(ident_t *loc) {
99   kmp_int32 gtid = __kmp_entry_gtid();
100 
101   KC_TRACE(10, ("__kmpc_global_thread_num: T#%d\n", gtid));
102 
103   return gtid;
104 }
105 
106 /*!
107 @ingroup THREAD_STATES
108 @param loc Source location information.
109 @return The number of threads under control of the OpenMP<sup>*</sup> runtime
110 
111 This function can be called in any context.
112 It returns the total number of threads under the control of the OpenMP runtime.
113 That is not a number that can be determined by any OpenMP standard calls, since
114 the library may be called from more than one non-OpenMP thread, and this
115 reflects the total over all such calls. Similarly the runtime maintains
116 underlying threads even when they are not active (since the cost of creating
117 and destroying OS threads is high), this call counts all such threads even if
118 they are not waiting for work.
119 */
120 kmp_int32 __kmpc_global_num_threads(ident_t *loc) {
121   KC_TRACE(10,
122            ("__kmpc_global_num_threads: num_threads = %d\n", __kmp_all_nth));
123 
124   return TCR_4(__kmp_all_nth);
125 }
126 
127 /*!
128 @ingroup THREAD_STATES
129 @param loc Source location information.
130 @return The thread number of the calling thread in the innermost active parallel
131 construct.
132 */
133 kmp_int32 __kmpc_bound_thread_num(ident_t *loc) {
134   KC_TRACE(10, ("__kmpc_bound_thread_num: called\n"));
135   return __kmp_tid_from_gtid(__kmp_entry_gtid());
136 }
137 
138 /*!
139 @ingroup THREAD_STATES
140 @param loc Source location information.
141 @return The number of threads in the innermost active parallel construct.
142 */
143 kmp_int32 __kmpc_bound_num_threads(ident_t *loc) {
144   KC_TRACE(10, ("__kmpc_bound_num_threads: called\n"));
145 
146   return __kmp_entry_thread()->th.th_team->t.t_nproc;
147 }
148 
149 /*!
150  * @ingroup DEPRECATED
151  * @param loc location description
152  *
153  * This function need not be called. It always returns TRUE.
154  */
155 kmp_int32 __kmpc_ok_to_fork(ident_t *loc) {
156 #ifndef KMP_DEBUG
157 
158   return TRUE;
159 
160 #else
161 
162   const char *semi2;
163   const char *semi3;
164   int line_no;
165 
166   if (__kmp_par_range == 0) {
167     return TRUE;
168   }
169   semi2 = loc->psource;
170   if (semi2 == NULL) {
171     return TRUE;
172   }
173   semi2 = strchr(semi2, ';');
174   if (semi2 == NULL) {
175     return TRUE;
176   }
177   semi2 = strchr(semi2 + 1, ';');
178   if (semi2 == NULL) {
179     return TRUE;
180   }
181   if (__kmp_par_range_filename[0]) {
182     const char *name = semi2 - 1;
183     while ((name > loc->psource) && (*name != '/') && (*name != ';')) {
184       name--;
185     }
186     if ((*name == '/') || (*name == ';')) {
187       name++;
188     }
189     if (strncmp(__kmp_par_range_filename, name, semi2 - name)) {
190       return __kmp_par_range < 0;
191     }
192   }
193   semi3 = strchr(semi2 + 1, ';');
194   if (__kmp_par_range_routine[0]) {
195     if ((semi3 != NULL) && (semi3 > semi2) &&
196         (strncmp(__kmp_par_range_routine, semi2 + 1, semi3 - semi2 - 1))) {
197       return __kmp_par_range < 0;
198     }
199   }
200   if (KMP_SSCANF(semi3 + 1, "%d", &line_no) == 1) {
201     if ((line_no >= __kmp_par_range_lb) && (line_no <= __kmp_par_range_ub)) {
202       return __kmp_par_range > 0;
203     }
204     return __kmp_par_range < 0;
205   }
206   return TRUE;
207 
208 #endif /* KMP_DEBUG */
209 }
210 
211 /*!
212 @ingroup THREAD_STATES
213 @param loc Source location information.
214 @return 1 if this thread is executing inside an active parallel region, zero if
215 not.
216 */
217 kmp_int32 __kmpc_in_parallel(ident_t *loc) {
218   return __kmp_entry_thread()->th.th_root->r.r_active;
219 }
220 
221 /*!
222 @ingroup PARALLEL
223 @param loc source location information
224 @param global_tid global thread number
225 @param num_threads number of threads requested for this parallel construct
226 
227 Set the number of threads to be used by the next fork spawned by this thread.
228 This call is only required if the parallel construct has a `num_threads` clause.
229 */
230 void __kmpc_push_num_threads(ident_t *loc, kmp_int32 global_tid,
231                              kmp_int32 num_threads) {
232   KA_TRACE(20, ("__kmpc_push_num_threads: enter T#%d num_threads=%d\n",
233                 global_tid, num_threads));
234 
235   __kmp_push_num_threads(loc, global_tid, num_threads);
236 }
237 
238 void __kmpc_pop_num_threads(ident_t *loc, kmp_int32 global_tid) {
239   KA_TRACE(20, ("__kmpc_pop_num_threads: enter\n"));
240 
241   /* the num_threads are automatically popped */
242 }
243 
244 void __kmpc_push_proc_bind(ident_t *loc, kmp_int32 global_tid,
245                            kmp_int32 proc_bind) {
246   KA_TRACE(20, ("__kmpc_push_proc_bind: enter T#%d proc_bind=%d\n", global_tid,
247                 proc_bind));
248 
249   __kmp_push_proc_bind(loc, global_tid, (kmp_proc_bind_t)proc_bind);
250 }
251 
252 /*!
253 @ingroup PARALLEL
254 @param loc  source location information
255 @param argc  total number of arguments in the ellipsis
256 @param microtask  pointer to callback routine consisting of outlined parallel
257 construct
258 @param ...  pointers to shared variables that aren't global
259 
260 Do the actual fork and call the microtask in the relevant number of threads.
261 */
262 void __kmpc_fork_call(ident_t *loc, kmp_int32 argc, kmpc_micro microtask, ...) {
263   int gtid = __kmp_entry_gtid();
264 
265 #if (KMP_STATS_ENABLED)
266   // If we were in a serial region, then stop the serial timer, record
267   // the event, and start parallel region timer
268   stats_state_e previous_state = KMP_GET_THREAD_STATE();
269   if (previous_state == stats_state_e::SERIAL_REGION) {
270     KMP_EXCHANGE_PARTITIONED_TIMER(OMP_parallel_overhead);
271   } else {
272     KMP_PUSH_PARTITIONED_TIMER(OMP_parallel_overhead);
273   }
274   int inParallel = __kmpc_in_parallel(loc);
275   if (inParallel) {
276     KMP_COUNT_BLOCK(OMP_NESTED_PARALLEL);
277   } else {
278     KMP_COUNT_BLOCK(OMP_PARALLEL);
279   }
280 #endif
281 
282   // maybe to save thr_state is enough here
283   {
284     va_list ap;
285     va_start(ap, microtask);
286 
287 #if OMPT_SUPPORT
288     ompt_frame_t *ompt_frame;
289     if (ompt_enabled.enabled) {
290       kmp_info_t *master_th = __kmp_threads[gtid];
291       kmp_team_t *parent_team = master_th->th.th_team;
292       ompt_lw_taskteam_t *lwt = parent_team->t.ompt_serialized_team_info;
293       if (lwt)
294         ompt_frame = &(lwt->ompt_task_info.frame);
295       else {
296         int tid = __kmp_tid_from_gtid(gtid);
297         ompt_frame = &(
298             parent_team->t.t_implicit_task_taskdata[tid].ompt_task_info.frame);
299       }
300       ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
301       OMPT_STORE_RETURN_ADDRESS(gtid);
302     }
303 #endif
304 
305 #if INCLUDE_SSC_MARKS
306     SSC_MARK_FORKING();
307 #endif
308     __kmp_fork_call(loc, gtid, fork_context_intel, argc,
309                     VOLATILE_CAST(microtask_t) microtask, // "wrapped" task
310                     VOLATILE_CAST(launch_t) __kmp_invoke_task_func,
311 /* TODO: revert workaround for Intel(R) 64 tracker #96 */
312 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
313                     &ap
314 #else
315                     ap
316 #endif
317                     );
318 #if INCLUDE_SSC_MARKS
319     SSC_MARK_JOINING();
320 #endif
321     __kmp_join_call(loc, gtid
322 #if OMPT_SUPPORT
323                     ,
324                     fork_context_intel
325 #endif
326                     );
327 
328     va_end(ap);
329   }
330 
331 #if KMP_STATS_ENABLED
332   if (previous_state == stats_state_e::SERIAL_REGION) {
333     KMP_EXCHANGE_PARTITIONED_TIMER(OMP_serial);
334   } else {
335     KMP_POP_PARTITIONED_TIMER();
336   }
337 #endif // KMP_STATS_ENABLED
338 }
339 
340 /*!
341 @ingroup PARALLEL
342 @param loc source location information
343 @param global_tid global thread number
344 @param num_teams number of teams requested for the teams construct
345 @param num_threads number of threads per team requested for the teams construct
346 
347 Set the number of teams to be used by the teams construct.
348 This call is only required if the teams construct has a `num_teams` clause
349 or a `thread_limit` clause (or both).
350 */
351 void __kmpc_push_num_teams(ident_t *loc, kmp_int32 global_tid,
352                            kmp_int32 num_teams, kmp_int32 num_threads) {
353   KA_TRACE(20,
354            ("__kmpc_push_num_teams: enter T#%d num_teams=%d num_threads=%d\n",
355             global_tid, num_teams, num_threads));
356 
357   __kmp_push_num_teams(loc, global_tid, num_teams, num_threads);
358 }
359 
360 /*!
361 @ingroup PARALLEL
362 @param loc  source location information
363 @param argc  total number of arguments in the ellipsis
364 @param microtask  pointer to callback routine consisting of outlined teams
365 construct
366 @param ...  pointers to shared variables that aren't global
367 
368 Do the actual fork and call the microtask in the relevant number of threads.
369 */
370 void __kmpc_fork_teams(ident_t *loc, kmp_int32 argc, kmpc_micro microtask,
371                        ...) {
372   int gtid = __kmp_entry_gtid();
373   kmp_info_t *this_thr = __kmp_threads[gtid];
374   va_list ap;
375   va_start(ap, microtask);
376 
377 #if KMP_STATS_ENABLED
378   KMP_COUNT_BLOCK(OMP_TEAMS);
379   stats_state_e previous_state = KMP_GET_THREAD_STATE();
380   if (previous_state == stats_state_e::SERIAL_REGION) {
381     KMP_EXCHANGE_PARTITIONED_TIMER(OMP_teams_overhead);
382   } else {
383     KMP_PUSH_PARTITIONED_TIMER(OMP_teams_overhead);
384   }
385 #endif
386 
387   // remember teams entry point and nesting level
388   this_thr->th.th_teams_microtask = microtask;
389   this_thr->th.th_teams_level =
390       this_thr->th.th_team->t.t_level; // AC: can be >0 on host
391 
392 #if OMPT_SUPPORT
393   kmp_team_t *parent_team = this_thr->th.th_team;
394   int tid = __kmp_tid_from_gtid(gtid);
395   if (ompt_enabled.enabled) {
396     parent_team->t.t_implicit_task_taskdata[tid]
397         .ompt_task_info.frame.enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
398   }
399   OMPT_STORE_RETURN_ADDRESS(gtid);
400 #endif
401 
402   // check if __kmpc_push_num_teams called, set default number of teams
403   // otherwise
404   if (this_thr->th.th_teams_size.nteams == 0) {
405     __kmp_push_num_teams(loc, gtid, 0, 0);
406   }
407   KMP_DEBUG_ASSERT(this_thr->th.th_set_nproc >= 1);
408   KMP_DEBUG_ASSERT(this_thr->th.th_teams_size.nteams >= 1);
409   KMP_DEBUG_ASSERT(this_thr->th.th_teams_size.nth >= 1);
410 
411   __kmp_fork_call(loc, gtid, fork_context_intel, argc,
412                   VOLATILE_CAST(microtask_t)
413                       __kmp_teams_master, // "wrapped" task
414                   VOLATILE_CAST(launch_t) __kmp_invoke_teams_master,
415 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
416                   &ap
417 #else
418                   ap
419 #endif
420                   );
421   __kmp_join_call(loc, gtid
422 #if OMPT_SUPPORT
423                   ,
424                   fork_context_intel
425 #endif
426                   );
427 
428   // Pop current CG root off list
429   KMP_DEBUG_ASSERT(this_thr->th.th_cg_roots);
430   kmp_cg_root_t *tmp = this_thr->th.th_cg_roots;
431   this_thr->th.th_cg_roots = tmp->up;
432   KA_TRACE(100, ("__kmpc_fork_teams: Thread %p popping node %p and moving up"
433                  " to node %p. cg_nthreads was %d\n",
434                  this_thr, tmp, this_thr->th.th_cg_roots, tmp->cg_nthreads));
435   KMP_DEBUG_ASSERT(tmp->cg_nthreads);
436   int i = tmp->cg_nthreads--;
437   if (i == 1) { // check is we are the last thread in CG (not always the case)
438     __kmp_free(tmp);
439   }
440   // Restore current task's thread_limit from CG root
441   KMP_DEBUG_ASSERT(this_thr->th.th_cg_roots);
442   this_thr->th.th_current_task->td_icvs.thread_limit =
443       this_thr->th.th_cg_roots->cg_thread_limit;
444 
445   this_thr->th.th_teams_microtask = NULL;
446   this_thr->th.th_teams_level = 0;
447   *(kmp_int64 *)(&this_thr->th.th_teams_size) = 0L;
448   va_end(ap);
449 #if KMP_STATS_ENABLED
450   if (previous_state == stats_state_e::SERIAL_REGION) {
451     KMP_EXCHANGE_PARTITIONED_TIMER(OMP_serial);
452   } else {
453     KMP_POP_PARTITIONED_TIMER();
454   }
455 #endif // KMP_STATS_ENABLED
456 }
457 
458 // I don't think this function should ever have been exported.
459 // The __kmpc_ prefix was misapplied.  I'm fairly certain that no generated
460 // openmp code ever called it, but it's been exported from the RTL for so
461 // long that I'm afraid to remove the definition.
462 int __kmpc_invoke_task_func(int gtid) { return __kmp_invoke_task_func(gtid); }
463 
464 /*!
465 @ingroup PARALLEL
466 @param loc  source location information
467 @param global_tid  global thread number
468 
469 Enter a serialized parallel construct. This interface is used to handle a
470 conditional parallel region, like this,
471 @code
472 #pragma omp parallel if (condition)
473 @endcode
474 when the condition is false.
475 */
476 void __kmpc_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
477 // The implementation is now in kmp_runtime.cpp so that it can share static
478 // functions with kmp_fork_call since the tasks to be done are similar in
479 // each case.
480 #if OMPT_SUPPORT
481   OMPT_STORE_RETURN_ADDRESS(global_tid);
482 #endif
483   __kmp_serialized_parallel(loc, global_tid);
484 }
485 
486 /*!
487 @ingroup PARALLEL
488 @param loc  source location information
489 @param global_tid  global thread number
490 
491 Leave a serialized parallel construct.
492 */
493 void __kmpc_end_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
494   kmp_internal_control_t *top;
495   kmp_info_t *this_thr;
496   kmp_team_t *serial_team;
497 
498   KC_TRACE(10,
499            ("__kmpc_end_serialized_parallel: called by T#%d\n", global_tid));
500 
501   /* skip all this code for autopar serialized loops since it results in
502      unacceptable overhead */
503   if (loc != NULL && (loc->flags & KMP_IDENT_AUTOPAR))
504     return;
505 
506   // Not autopar code
507   if (!TCR_4(__kmp_init_parallel))
508     __kmp_parallel_initialize();
509 
510   __kmp_resume_if_soft_paused();
511 
512   this_thr = __kmp_threads[global_tid];
513   serial_team = this_thr->th.th_serial_team;
514 
515   kmp_task_team_t *task_team = this_thr->th.th_task_team;
516   // we need to wait for the proxy tasks before finishing the thread
517   if (task_team != NULL && task_team->tt.tt_found_proxy_tasks)
518     __kmp_task_team_wait(this_thr, serial_team USE_ITT_BUILD_ARG(NULL));
519 
520   KMP_MB();
521   KMP_DEBUG_ASSERT(serial_team);
522   KMP_ASSERT(serial_team->t.t_serialized);
523   KMP_DEBUG_ASSERT(this_thr->th.th_team == serial_team);
524   KMP_DEBUG_ASSERT(serial_team != this_thr->th.th_root->r.r_root_team);
525   KMP_DEBUG_ASSERT(serial_team->t.t_threads);
526   KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
527 
528 #if OMPT_SUPPORT
529   if (ompt_enabled.enabled &&
530       this_thr->th.ompt_thread_info.state != ompt_state_overhead) {
531     OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame = ompt_data_none;
532     if (ompt_enabled.ompt_callback_implicit_task) {
533       ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
534           ompt_scope_end, NULL, OMPT_CUR_TASK_DATA(this_thr), 1,
535           OMPT_CUR_TASK_INFO(this_thr)->thread_num, ompt_task_implicit);
536     }
537 
538     // reset clear the task id only after unlinking the task
539     ompt_data_t *parent_task_data;
540     __ompt_get_task_info_internal(1, NULL, &parent_task_data, NULL, NULL, NULL);
541 
542     if (ompt_enabled.ompt_callback_parallel_end) {
543       ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
544           &(serial_team->t.ompt_team_info.parallel_data), parent_task_data,
545           ompt_parallel_invoker_program | ompt_parallel_team,
546           OMPT_LOAD_RETURN_ADDRESS(global_tid));
547     }
548     __ompt_lw_taskteam_unlink(this_thr);
549     this_thr->th.ompt_thread_info.state = ompt_state_overhead;
550   }
551 #endif
552 
553   /* If necessary, pop the internal control stack values and replace the team
554    * values */
555   top = serial_team->t.t_control_stack_top;
556   if (top && top->serial_nesting_level == serial_team->t.t_serialized) {
557     copy_icvs(&serial_team->t.t_threads[0]->th.th_current_task->td_icvs, top);
558     serial_team->t.t_control_stack_top = top->next;
559     __kmp_free(top);
560   }
561 
562   // if( serial_team -> t.t_serialized > 1 )
563   serial_team->t.t_level--;
564 
565   /* pop dispatch buffers stack */
566   KMP_DEBUG_ASSERT(serial_team->t.t_dispatch->th_disp_buffer);
567   {
568     dispatch_private_info_t *disp_buffer =
569         serial_team->t.t_dispatch->th_disp_buffer;
570     serial_team->t.t_dispatch->th_disp_buffer =
571         serial_team->t.t_dispatch->th_disp_buffer->next;
572     __kmp_free(disp_buffer);
573   }
574   this_thr->th.th_def_allocator = serial_team->t.t_def_allocator; // restore
575 
576   --serial_team->t.t_serialized;
577   if (serial_team->t.t_serialized == 0) {
578 
579 /* return to the parallel section */
580 
581 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
582     if (__kmp_inherit_fp_control && serial_team->t.t_fp_control_saved) {
583       __kmp_clear_x87_fpu_status_word();
584       __kmp_load_x87_fpu_control_word(&serial_team->t.t_x87_fpu_control_word);
585       __kmp_load_mxcsr(&serial_team->t.t_mxcsr);
586     }
587 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
588 
589     this_thr->th.th_team = serial_team->t.t_parent;
590     this_thr->th.th_info.ds.ds_tid = serial_team->t.t_master_tid;
591 
592     /* restore values cached in the thread */
593     this_thr->th.th_team_nproc = serial_team->t.t_parent->t.t_nproc; /*  JPH */
594     this_thr->th.th_team_master =
595         serial_team->t.t_parent->t.t_threads[0]; /* JPH */
596     this_thr->th.th_team_serialized = this_thr->th.th_team->t.t_serialized;
597 
598     /* TODO the below shouldn't need to be adjusted for serialized teams */
599     this_thr->th.th_dispatch =
600         &this_thr->th.th_team->t.t_dispatch[serial_team->t.t_master_tid];
601 
602     __kmp_pop_current_task_from_thread(this_thr);
603 
604     KMP_ASSERT(this_thr->th.th_current_task->td_flags.executing == 0);
605     this_thr->th.th_current_task->td_flags.executing = 1;
606 
607     if (__kmp_tasking_mode != tskm_immediate_exec) {
608       // Copy the task team from the new child / old parent team to the thread.
609       this_thr->th.th_task_team =
610           this_thr->th.th_team->t.t_task_team[this_thr->th.th_task_state];
611       KA_TRACE(20,
612                ("__kmpc_end_serialized_parallel: T#%d restoring task_team %p / "
613                 "team %p\n",
614                 global_tid, this_thr->th.th_task_team, this_thr->th.th_team));
615     }
616   } else {
617     if (__kmp_tasking_mode != tskm_immediate_exec) {
618       KA_TRACE(20, ("__kmpc_end_serialized_parallel: T#%d decreasing nesting "
619                     "depth of serial team %p to %d\n",
620                     global_tid, serial_team, serial_team->t.t_serialized));
621     }
622   }
623 
624   if (__kmp_env_consistency_check)
625     __kmp_pop_parallel(global_tid, NULL);
626 #if OMPT_SUPPORT
627   if (ompt_enabled.enabled)
628     this_thr->th.ompt_thread_info.state =
629         ((this_thr->th.th_team_serialized) ? ompt_state_work_serial
630                                            : ompt_state_work_parallel);
631 #endif
632 }
633 
634 /*!
635 @ingroup SYNCHRONIZATION
636 @param loc  source location information.
637 
638 Execute <tt>flush</tt>. This is implemented as a full memory fence. (Though
639 depending on the memory ordering convention obeyed by the compiler
640 even that may not be necessary).
641 */
642 void __kmpc_flush(ident_t *loc) {
643   KC_TRACE(10, ("__kmpc_flush: called\n"));
644 
645   /* need explicit __mf() here since use volatile instead in library */
646   KMP_MB(); /* Flush all pending memory write invalidates.  */
647 
648 #if (KMP_ARCH_X86 || KMP_ARCH_X86_64)
649 #if KMP_MIC
650 // fence-style instructions do not exist, but lock; xaddl $0,(%rsp) can be used.
651 // We shouldn't need it, though, since the ABI rules require that
652 // * If the compiler generates NGO stores it also generates the fence
653 // * If users hand-code NGO stores they should insert the fence
654 // therefore no incomplete unordered stores should be visible.
655 #else
656   // C74404
657   // This is to address non-temporal store instructions (sfence needed).
658   // The clflush instruction is addressed either (mfence needed).
659   // Probably the non-temporal load monvtdqa instruction should also be
660   // addressed.
661   // mfence is a SSE2 instruction. Do not execute it if CPU is not SSE2.
662   if (!__kmp_cpuinfo.initialized) {
663     __kmp_query_cpuid(&__kmp_cpuinfo);
664   }
665   if (!__kmp_cpuinfo.sse2) {
666     // CPU cannot execute SSE2 instructions.
667   } else {
668 #if KMP_COMPILER_ICC
669     _mm_mfence();
670 #elif KMP_COMPILER_MSVC
671     MemoryBarrier();
672 #else
673     __sync_synchronize();
674 #endif // KMP_COMPILER_ICC
675   }
676 #endif // KMP_MIC
677 #elif (KMP_ARCH_ARM || KMP_ARCH_AARCH64 || KMP_ARCH_MIPS || KMP_ARCH_MIPS64 || \
678        KMP_ARCH_RISCV64)
679 // Nothing to see here move along
680 #elif KMP_ARCH_PPC64
681 // Nothing needed here (we have a real MB above).
682 #if KMP_OS_CNK
683   // The flushing thread needs to yield here; this prevents a
684   // busy-waiting thread from saturating the pipeline. flush is
685   // often used in loops like this:
686   // while (!flag) {
687   //   #pragma omp flush(flag)
688   // }
689   // and adding the yield here is good for at least a 10x speedup
690   // when running >2 threads per core (on the NAS LU benchmark).
691   __kmp_yield();
692 #endif
693 #else
694 #error Unknown or unsupported architecture
695 #endif
696 
697 #if OMPT_SUPPORT && OMPT_OPTIONAL
698   if (ompt_enabled.ompt_callback_flush) {
699     ompt_callbacks.ompt_callback(ompt_callback_flush)(
700         __ompt_get_thread_data_internal(), OMPT_GET_RETURN_ADDRESS(0));
701   }
702 #endif
703 }
704 
705 /* -------------------------------------------------------------------------- */
706 /*!
707 @ingroup SYNCHRONIZATION
708 @param loc source location information
709 @param global_tid thread id.
710 
711 Execute a barrier.
712 */
713 void __kmpc_barrier(ident_t *loc, kmp_int32 global_tid) {
714   KMP_COUNT_BLOCK(OMP_BARRIER);
715   KC_TRACE(10, ("__kmpc_barrier: called T#%d\n", global_tid));
716 
717   if (!TCR_4(__kmp_init_parallel))
718     __kmp_parallel_initialize();
719 
720   __kmp_resume_if_soft_paused();
721 
722   if (__kmp_env_consistency_check) {
723     if (loc == 0) {
724       KMP_WARNING(ConstructIdentInvalid); // ??? What does it mean for the user?
725     }
726     __kmp_check_barrier(global_tid, ct_barrier, loc);
727   }
728 
729 #if OMPT_SUPPORT
730   ompt_frame_t *ompt_frame;
731   if (ompt_enabled.enabled) {
732     __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
733     if (ompt_frame->enter_frame.ptr == NULL)
734       ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
735     OMPT_STORE_RETURN_ADDRESS(global_tid);
736   }
737 #endif
738   __kmp_threads[global_tid]->th.th_ident = loc;
739   // TODO: explicit barrier_wait_id:
740   //   this function is called when 'barrier' directive is present or
741   //   implicit barrier at the end of a worksharing construct.
742   // 1) better to add a per-thread barrier counter to a thread data structure
743   // 2) set to 0 when a new team is created
744   // 4) no sync is required
745 
746   __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
747 #if OMPT_SUPPORT && OMPT_OPTIONAL
748   if (ompt_enabled.enabled) {
749     ompt_frame->enter_frame = ompt_data_none;
750   }
751 #endif
752 }
753 
754 /* The BARRIER for a MASTER section is always explicit   */
755 /*!
756 @ingroup WORK_SHARING
757 @param loc  source location information.
758 @param global_tid  global thread number .
759 @return 1 if this thread should execute the <tt>master</tt> block, 0 otherwise.
760 */
761 kmp_int32 __kmpc_master(ident_t *loc, kmp_int32 global_tid) {
762   int status = 0;
763 
764   KC_TRACE(10, ("__kmpc_master: called T#%d\n", global_tid));
765 
766   if (!TCR_4(__kmp_init_parallel))
767     __kmp_parallel_initialize();
768 
769   __kmp_resume_if_soft_paused();
770 
771   if (KMP_MASTER_GTID(global_tid)) {
772     KMP_COUNT_BLOCK(OMP_MASTER);
773     KMP_PUSH_PARTITIONED_TIMER(OMP_master);
774     status = 1;
775   }
776 
777 #if OMPT_SUPPORT && OMPT_OPTIONAL
778   if (status) {
779     if (ompt_enabled.ompt_callback_master) {
780       kmp_info_t *this_thr = __kmp_threads[global_tid];
781       kmp_team_t *team = this_thr->th.th_team;
782 
783       int tid = __kmp_tid_from_gtid(global_tid);
784       ompt_callbacks.ompt_callback(ompt_callback_master)(
785           ompt_scope_begin, &(team->t.ompt_team_info.parallel_data),
786           &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
787           OMPT_GET_RETURN_ADDRESS(0));
788     }
789   }
790 #endif
791 
792   if (__kmp_env_consistency_check) {
793 #if KMP_USE_DYNAMIC_LOCK
794     if (status)
795       __kmp_push_sync(global_tid, ct_master, loc, NULL, 0);
796     else
797       __kmp_check_sync(global_tid, ct_master, loc, NULL, 0);
798 #else
799     if (status)
800       __kmp_push_sync(global_tid, ct_master, loc, NULL);
801     else
802       __kmp_check_sync(global_tid, ct_master, loc, NULL);
803 #endif
804   }
805 
806   return status;
807 }
808 
809 /*!
810 @ingroup WORK_SHARING
811 @param loc  source location information.
812 @param global_tid  global thread number .
813 
814 Mark the end of a <tt>master</tt> region. This should only be called by the
815 thread that executes the <tt>master</tt> region.
816 */
817 void __kmpc_end_master(ident_t *loc, kmp_int32 global_tid) {
818   KC_TRACE(10, ("__kmpc_end_master: called T#%d\n", global_tid));
819 
820   KMP_DEBUG_ASSERT(KMP_MASTER_GTID(global_tid));
821   KMP_POP_PARTITIONED_TIMER();
822 
823 #if OMPT_SUPPORT && OMPT_OPTIONAL
824   kmp_info_t *this_thr = __kmp_threads[global_tid];
825   kmp_team_t *team = this_thr->th.th_team;
826   if (ompt_enabled.ompt_callback_master) {
827     int tid = __kmp_tid_from_gtid(global_tid);
828     ompt_callbacks.ompt_callback(ompt_callback_master)(
829         ompt_scope_end, &(team->t.ompt_team_info.parallel_data),
830         &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
831         OMPT_GET_RETURN_ADDRESS(0));
832   }
833 #endif
834 
835   if (__kmp_env_consistency_check) {
836     if (global_tid < 0)
837       KMP_WARNING(ThreadIdentInvalid);
838 
839     if (KMP_MASTER_GTID(global_tid))
840       __kmp_pop_sync(global_tid, ct_master, loc);
841   }
842 }
843 
844 /*!
845 @ingroup WORK_SHARING
846 @param loc  source location information.
847 @param gtid  global thread number.
848 
849 Start execution of an <tt>ordered</tt> construct.
850 */
851 void __kmpc_ordered(ident_t *loc, kmp_int32 gtid) {
852   int cid = 0;
853   kmp_info_t *th;
854   KMP_DEBUG_ASSERT(__kmp_init_serial);
855 
856   KC_TRACE(10, ("__kmpc_ordered: called T#%d\n", gtid));
857 
858   if (!TCR_4(__kmp_init_parallel))
859     __kmp_parallel_initialize();
860 
861   __kmp_resume_if_soft_paused();
862 
863 #if USE_ITT_BUILD
864   __kmp_itt_ordered_prep(gtid);
865 // TODO: ordered_wait_id
866 #endif /* USE_ITT_BUILD */
867 
868   th = __kmp_threads[gtid];
869 
870 #if OMPT_SUPPORT && OMPT_OPTIONAL
871   kmp_team_t *team;
872   ompt_wait_id_t lck;
873   void *codeptr_ra;
874   if (ompt_enabled.enabled) {
875     OMPT_STORE_RETURN_ADDRESS(gtid);
876     team = __kmp_team_from_gtid(gtid);
877     lck = (ompt_wait_id_t)(uintptr_t)&team->t.t_ordered.dt.t_value;
878     /* OMPT state update */
879     th->th.ompt_thread_info.wait_id = lck;
880     th->th.ompt_thread_info.state = ompt_state_wait_ordered;
881 
882     /* OMPT event callback */
883     codeptr_ra = OMPT_LOAD_RETURN_ADDRESS(gtid);
884     if (ompt_enabled.ompt_callback_mutex_acquire) {
885       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
886           ompt_mutex_ordered, omp_lock_hint_none, kmp_mutex_impl_spin, lck,
887           codeptr_ra);
888     }
889   }
890 #endif
891 
892   if (th->th.th_dispatch->th_deo_fcn != 0)
893     (*th->th.th_dispatch->th_deo_fcn)(&gtid, &cid, loc);
894   else
895     __kmp_parallel_deo(&gtid, &cid, loc);
896 
897 #if OMPT_SUPPORT && OMPT_OPTIONAL
898   if (ompt_enabled.enabled) {
899     /* OMPT state update */
900     th->th.ompt_thread_info.state = ompt_state_work_parallel;
901     th->th.ompt_thread_info.wait_id = 0;
902 
903     /* OMPT event callback */
904     if (ompt_enabled.ompt_callback_mutex_acquired) {
905       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
906           ompt_mutex_ordered, (ompt_wait_id_t)(uintptr_t)lck, codeptr_ra);
907     }
908   }
909 #endif
910 
911 #if USE_ITT_BUILD
912   __kmp_itt_ordered_start(gtid);
913 #endif /* USE_ITT_BUILD */
914 }
915 
916 /*!
917 @ingroup WORK_SHARING
918 @param loc  source location information.
919 @param gtid  global thread number.
920 
921 End execution of an <tt>ordered</tt> construct.
922 */
923 void __kmpc_end_ordered(ident_t *loc, kmp_int32 gtid) {
924   int cid = 0;
925   kmp_info_t *th;
926 
927   KC_TRACE(10, ("__kmpc_end_ordered: called T#%d\n", gtid));
928 
929 #if USE_ITT_BUILD
930   __kmp_itt_ordered_end(gtid);
931 // TODO: ordered_wait_id
932 #endif /* USE_ITT_BUILD */
933 
934   th = __kmp_threads[gtid];
935 
936   if (th->th.th_dispatch->th_dxo_fcn != 0)
937     (*th->th.th_dispatch->th_dxo_fcn)(&gtid, &cid, loc);
938   else
939     __kmp_parallel_dxo(&gtid, &cid, loc);
940 
941 #if OMPT_SUPPORT && OMPT_OPTIONAL
942   OMPT_STORE_RETURN_ADDRESS(gtid);
943   if (ompt_enabled.ompt_callback_mutex_released) {
944     ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
945         ompt_mutex_ordered,
946         (ompt_wait_id_t)(uintptr_t)&__kmp_team_from_gtid(gtid)
947             ->t.t_ordered.dt.t_value,
948         OMPT_LOAD_RETURN_ADDRESS(gtid));
949   }
950 #endif
951 }
952 
953 #if KMP_USE_DYNAMIC_LOCK
954 
955 static __forceinline void
956 __kmp_init_indirect_csptr(kmp_critical_name *crit, ident_t const *loc,
957                           kmp_int32 gtid, kmp_indirect_locktag_t tag) {
958   // Pointer to the allocated indirect lock is written to crit, while indexing
959   // is ignored.
960   void *idx;
961   kmp_indirect_lock_t **lck;
962   lck = (kmp_indirect_lock_t **)crit;
963   kmp_indirect_lock_t *ilk = __kmp_allocate_indirect_lock(&idx, gtid, tag);
964   KMP_I_LOCK_FUNC(ilk, init)(ilk->lock);
965   KMP_SET_I_LOCK_LOCATION(ilk, loc);
966   KMP_SET_I_LOCK_FLAGS(ilk, kmp_lf_critical_section);
967   KA_TRACE(20,
968            ("__kmp_init_indirect_csptr: initialized indirect lock #%d\n", tag));
969 #if USE_ITT_BUILD
970   __kmp_itt_critical_creating(ilk->lock, loc);
971 #endif
972   int status = KMP_COMPARE_AND_STORE_PTR(lck, nullptr, ilk);
973   if (status == 0) {
974 #if USE_ITT_BUILD
975     __kmp_itt_critical_destroyed(ilk->lock);
976 #endif
977     // We don't really need to destroy the unclaimed lock here since it will be
978     // cleaned up at program exit.
979     // KMP_D_LOCK_FUNC(&idx, destroy)((kmp_dyna_lock_t *)&idx);
980   }
981   KMP_DEBUG_ASSERT(*lck != NULL);
982 }
983 
984 // Fast-path acquire tas lock
985 #define KMP_ACQUIRE_TAS_LOCK(lock, gtid)                                       \
986   {                                                                            \
987     kmp_tas_lock_t *l = (kmp_tas_lock_t *)lock;                                \
988     kmp_int32 tas_free = KMP_LOCK_FREE(tas);                                   \
989     kmp_int32 tas_busy = KMP_LOCK_BUSY(gtid + 1, tas);                         \
990     if (KMP_ATOMIC_LD_RLX(&l->lk.poll) != tas_free ||                          \
991         !__kmp_atomic_compare_store_acq(&l->lk.poll, tas_free, tas_busy)) {    \
992       kmp_uint32 spins;                                                        \
993       KMP_FSYNC_PREPARE(l);                                                    \
994       KMP_INIT_YIELD(spins);                                                   \
995       kmp_backoff_t backoff = __kmp_spin_backoff_params;                       \
996       do {                                                                     \
997         if (TCR_4(__kmp_nth) >                                                 \
998             (__kmp_avail_proc ? __kmp_avail_proc : __kmp_xproc)) {             \
999           KMP_YIELD(TRUE);                                                     \
1000         } else {                                                               \
1001           KMP_YIELD_SPIN(spins);                                               \
1002         }                                                                      \
1003         __kmp_spin_backoff(&backoff);                                          \
1004       } while (                                                                \
1005           KMP_ATOMIC_LD_RLX(&l->lk.poll) != tas_free ||                        \
1006           !__kmp_atomic_compare_store_acq(&l->lk.poll, tas_free, tas_busy));   \
1007     }                                                                          \
1008     KMP_FSYNC_ACQUIRED(l);                                                     \
1009   }
1010 
1011 // Fast-path test tas lock
1012 #define KMP_TEST_TAS_LOCK(lock, gtid, rc)                                      \
1013   {                                                                            \
1014     kmp_tas_lock_t *l = (kmp_tas_lock_t *)lock;                                \
1015     kmp_int32 tas_free = KMP_LOCK_FREE(tas);                                   \
1016     kmp_int32 tas_busy = KMP_LOCK_BUSY(gtid + 1, tas);                         \
1017     rc = KMP_ATOMIC_LD_RLX(&l->lk.poll) == tas_free &&                         \
1018          __kmp_atomic_compare_store_acq(&l->lk.poll, tas_free, tas_busy);      \
1019   }
1020 
1021 // Fast-path release tas lock
1022 #define KMP_RELEASE_TAS_LOCK(lock, gtid)                                       \
1023   { KMP_ATOMIC_ST_REL(&((kmp_tas_lock_t *)lock)->lk.poll, KMP_LOCK_FREE(tas)); }
1024 
1025 #if KMP_USE_FUTEX
1026 
1027 #include <sys/syscall.h>
1028 #include <unistd.h>
1029 #ifndef FUTEX_WAIT
1030 #define FUTEX_WAIT 0
1031 #endif
1032 #ifndef FUTEX_WAKE
1033 #define FUTEX_WAKE 1
1034 #endif
1035 
1036 // Fast-path acquire futex lock
1037 #define KMP_ACQUIRE_FUTEX_LOCK(lock, gtid)                                     \
1038   {                                                                            \
1039     kmp_futex_lock_t *ftx = (kmp_futex_lock_t *)lock;                          \
1040     kmp_int32 gtid_code = (gtid + 1) << 1;                                     \
1041     KMP_MB();                                                                  \
1042     KMP_FSYNC_PREPARE(ftx);                                                    \
1043     kmp_int32 poll_val;                                                        \
1044     while ((poll_val = KMP_COMPARE_AND_STORE_RET32(                            \
1045                 &(ftx->lk.poll), KMP_LOCK_FREE(futex),                         \
1046                 KMP_LOCK_BUSY(gtid_code, futex))) != KMP_LOCK_FREE(futex)) {   \
1047       kmp_int32 cond = KMP_LOCK_STRIP(poll_val) & 1;                           \
1048       if (!cond) {                                                             \
1049         if (!KMP_COMPARE_AND_STORE_RET32(&(ftx->lk.poll), poll_val,            \
1050                                          poll_val |                            \
1051                                              KMP_LOCK_BUSY(1, futex))) {       \
1052           continue;                                                            \
1053         }                                                                      \
1054         poll_val |= KMP_LOCK_BUSY(1, futex);                                   \
1055       }                                                                        \
1056       kmp_int32 rc;                                                            \
1057       if ((rc = syscall(__NR_futex, &(ftx->lk.poll), FUTEX_WAIT, poll_val,     \
1058                         NULL, NULL, 0)) != 0) {                                \
1059         continue;                                                              \
1060       }                                                                        \
1061       gtid_code |= 1;                                                          \
1062     }                                                                          \
1063     KMP_FSYNC_ACQUIRED(ftx);                                                   \
1064   }
1065 
1066 // Fast-path test futex lock
1067 #define KMP_TEST_FUTEX_LOCK(lock, gtid, rc)                                    \
1068   {                                                                            \
1069     kmp_futex_lock_t *ftx = (kmp_futex_lock_t *)lock;                          \
1070     if (KMP_COMPARE_AND_STORE_ACQ32(&(ftx->lk.poll), KMP_LOCK_FREE(futex),     \
1071                                     KMP_LOCK_BUSY(gtid + 1 << 1, futex))) {    \
1072       KMP_FSYNC_ACQUIRED(ftx);                                                 \
1073       rc = TRUE;                                                               \
1074     } else {                                                                   \
1075       rc = FALSE;                                                              \
1076     }                                                                          \
1077   }
1078 
1079 // Fast-path release futex lock
1080 #define KMP_RELEASE_FUTEX_LOCK(lock, gtid)                                     \
1081   {                                                                            \
1082     kmp_futex_lock_t *ftx = (kmp_futex_lock_t *)lock;                          \
1083     KMP_MB();                                                                  \
1084     KMP_FSYNC_RELEASING(ftx);                                                  \
1085     kmp_int32 poll_val =                                                       \
1086         KMP_XCHG_FIXED32(&(ftx->lk.poll), KMP_LOCK_FREE(futex));               \
1087     if (KMP_LOCK_STRIP(poll_val) & 1) {                                        \
1088       syscall(__NR_futex, &(ftx->lk.poll), FUTEX_WAKE,                         \
1089               KMP_LOCK_BUSY(1, futex), NULL, NULL, 0);                         \
1090     }                                                                          \
1091     KMP_MB();                                                                  \
1092     KMP_YIELD_OVERSUB();                                                       \
1093   }
1094 
1095 #endif // KMP_USE_FUTEX
1096 
1097 #else // KMP_USE_DYNAMIC_LOCK
1098 
1099 static kmp_user_lock_p __kmp_get_critical_section_ptr(kmp_critical_name *crit,
1100                                                       ident_t const *loc,
1101                                                       kmp_int32 gtid) {
1102   kmp_user_lock_p *lck_pp = (kmp_user_lock_p *)crit;
1103 
1104   // Because of the double-check, the following load doesn't need to be volatile
1105   kmp_user_lock_p lck = (kmp_user_lock_p)TCR_PTR(*lck_pp);
1106 
1107   if (lck == NULL) {
1108     void *idx;
1109 
1110     // Allocate & initialize the lock.
1111     // Remember alloc'ed locks in table in order to free them in __kmp_cleanup()
1112     lck = __kmp_user_lock_allocate(&idx, gtid, kmp_lf_critical_section);
1113     __kmp_init_user_lock_with_checks(lck);
1114     __kmp_set_user_lock_location(lck, loc);
1115 #if USE_ITT_BUILD
1116     __kmp_itt_critical_creating(lck);
1117 // __kmp_itt_critical_creating() should be called *before* the first usage
1118 // of underlying lock. It is the only place where we can guarantee it. There
1119 // are chances the lock will destroyed with no usage, but it is not a
1120 // problem, because this is not real event seen by user but rather setting
1121 // name for object (lock). See more details in kmp_itt.h.
1122 #endif /* USE_ITT_BUILD */
1123 
1124     // Use a cmpxchg instruction to slam the start of the critical section with
1125     // the lock pointer.  If another thread beat us to it, deallocate the lock,
1126     // and use the lock that the other thread allocated.
1127     int status = KMP_COMPARE_AND_STORE_PTR(lck_pp, 0, lck);
1128 
1129     if (status == 0) {
1130 // Deallocate the lock and reload the value.
1131 #if USE_ITT_BUILD
1132       __kmp_itt_critical_destroyed(lck);
1133 // Let ITT know the lock is destroyed and the same memory location may be reused
1134 // for another purpose.
1135 #endif /* USE_ITT_BUILD */
1136       __kmp_destroy_user_lock_with_checks(lck);
1137       __kmp_user_lock_free(&idx, gtid, lck);
1138       lck = (kmp_user_lock_p)TCR_PTR(*lck_pp);
1139       KMP_DEBUG_ASSERT(lck != NULL);
1140     }
1141   }
1142   return lck;
1143 }
1144 
1145 #endif // KMP_USE_DYNAMIC_LOCK
1146 
1147 /*!
1148 @ingroup WORK_SHARING
1149 @param loc  source location information.
1150 @param global_tid  global thread number .
1151 @param crit identity of the critical section. This could be a pointer to a lock
1152 associated with the critical section, or some other suitably unique value.
1153 
1154 Enter code protected by a `critical` construct.
1155 This function blocks until the executing thread can enter the critical section.
1156 */
1157 void __kmpc_critical(ident_t *loc, kmp_int32 global_tid,
1158                      kmp_critical_name *crit) {
1159 #if KMP_USE_DYNAMIC_LOCK
1160 #if OMPT_SUPPORT && OMPT_OPTIONAL
1161   OMPT_STORE_RETURN_ADDRESS(global_tid);
1162 #endif // OMPT_SUPPORT
1163   __kmpc_critical_with_hint(loc, global_tid, crit, omp_lock_hint_none);
1164 #else
1165   KMP_COUNT_BLOCK(OMP_CRITICAL);
1166 #if OMPT_SUPPORT && OMPT_OPTIONAL
1167   ompt_state_t prev_state = ompt_state_undefined;
1168   ompt_thread_info_t ti;
1169 #endif
1170   kmp_user_lock_p lck;
1171 
1172   KC_TRACE(10, ("__kmpc_critical: called T#%d\n", global_tid));
1173 
1174   // TODO: add THR_OVHD_STATE
1175 
1176   KMP_PUSH_PARTITIONED_TIMER(OMP_critical_wait);
1177   KMP_CHECK_USER_LOCK_INIT();
1178 
1179   if ((__kmp_user_lock_kind == lk_tas) &&
1180       (sizeof(lck->tas.lk.poll) <= OMP_CRITICAL_SIZE)) {
1181     lck = (kmp_user_lock_p)crit;
1182   }
1183 #if KMP_USE_FUTEX
1184   else if ((__kmp_user_lock_kind == lk_futex) &&
1185            (sizeof(lck->futex.lk.poll) <= OMP_CRITICAL_SIZE)) {
1186     lck = (kmp_user_lock_p)crit;
1187   }
1188 #endif
1189   else { // ticket, queuing or drdpa
1190     lck = __kmp_get_critical_section_ptr(crit, loc, global_tid);
1191   }
1192 
1193   if (__kmp_env_consistency_check)
1194     __kmp_push_sync(global_tid, ct_critical, loc, lck);
1195 
1196 // since the critical directive binds to all threads, not just the current
1197 // team we have to check this even if we are in a serialized team.
1198 // also, even if we are the uber thread, we still have to conduct the lock,
1199 // as we have to contend with sibling threads.
1200 
1201 #if USE_ITT_BUILD
1202   __kmp_itt_critical_acquiring(lck);
1203 #endif /* USE_ITT_BUILD */
1204 #if OMPT_SUPPORT && OMPT_OPTIONAL
1205   OMPT_STORE_RETURN_ADDRESS(gtid);
1206   void *codeptr_ra = NULL;
1207   if (ompt_enabled.enabled) {
1208     ti = __kmp_threads[global_tid]->th.ompt_thread_info;
1209     /* OMPT state update */
1210     prev_state = ti.state;
1211     ti.wait_id = (ompt_wait_id_t)(uintptr_t)lck;
1212     ti.state = ompt_state_wait_critical;
1213 
1214     /* OMPT event callback */
1215     codeptr_ra = OMPT_LOAD_RETURN_ADDRESS(gtid);
1216     if (ompt_enabled.ompt_callback_mutex_acquire) {
1217       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
1218           ompt_mutex_critical, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
1219           (ompt_wait_id_t)(uintptr_t)lck, codeptr_ra);
1220     }
1221   }
1222 #endif
1223   // Value of 'crit' should be good for using as a critical_id of the critical
1224   // section directive.
1225   __kmp_acquire_user_lock_with_checks(lck, global_tid);
1226 
1227 #if USE_ITT_BUILD
1228   __kmp_itt_critical_acquired(lck);
1229 #endif /* USE_ITT_BUILD */
1230 #if OMPT_SUPPORT && OMPT_OPTIONAL
1231   if (ompt_enabled.enabled) {
1232     /* OMPT state update */
1233     ti.state = prev_state;
1234     ti.wait_id = 0;
1235 
1236     /* OMPT event callback */
1237     if (ompt_enabled.ompt_callback_mutex_acquired) {
1238       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
1239           ompt_mutex_critical, (ompt_wait_id_t)(uintptr_t)lck, codeptr_ra);
1240     }
1241   }
1242 #endif
1243   KMP_POP_PARTITIONED_TIMER();
1244 
1245   KMP_PUSH_PARTITIONED_TIMER(OMP_critical);
1246   KA_TRACE(15, ("__kmpc_critical: done T#%d\n", global_tid));
1247 #endif // KMP_USE_DYNAMIC_LOCK
1248 }
1249 
1250 #if KMP_USE_DYNAMIC_LOCK
1251 
1252 // Converts the given hint to an internal lock implementation
1253 static __forceinline kmp_dyna_lockseq_t __kmp_map_hint_to_lock(uintptr_t hint) {
1254 #if KMP_USE_TSX
1255 #define KMP_TSX_LOCK(seq) lockseq_##seq
1256 #else
1257 #define KMP_TSX_LOCK(seq) __kmp_user_lock_seq
1258 #endif
1259 
1260 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
1261 #define KMP_CPUINFO_RTM (__kmp_cpuinfo.rtm)
1262 #else
1263 #define KMP_CPUINFO_RTM 0
1264 #endif
1265 
1266   // Hints that do not require further logic
1267   if (hint & kmp_lock_hint_hle)
1268     return KMP_TSX_LOCK(hle);
1269   if (hint & kmp_lock_hint_rtm)
1270     return KMP_CPUINFO_RTM ? KMP_TSX_LOCK(rtm) : __kmp_user_lock_seq;
1271   if (hint & kmp_lock_hint_adaptive)
1272     return KMP_CPUINFO_RTM ? KMP_TSX_LOCK(adaptive) : __kmp_user_lock_seq;
1273 
1274   // Rule out conflicting hints first by returning the default lock
1275   if ((hint & omp_lock_hint_contended) && (hint & omp_lock_hint_uncontended))
1276     return __kmp_user_lock_seq;
1277   if ((hint & omp_lock_hint_speculative) &&
1278       (hint & omp_lock_hint_nonspeculative))
1279     return __kmp_user_lock_seq;
1280 
1281   // Do not even consider speculation when it appears to be contended
1282   if (hint & omp_lock_hint_contended)
1283     return lockseq_queuing;
1284 
1285   // Uncontended lock without speculation
1286   if ((hint & omp_lock_hint_uncontended) && !(hint & omp_lock_hint_speculative))
1287     return lockseq_tas;
1288 
1289   // HLE lock for speculation
1290   if (hint & omp_lock_hint_speculative)
1291     return KMP_TSX_LOCK(hle);
1292 
1293   return __kmp_user_lock_seq;
1294 }
1295 
1296 #if OMPT_SUPPORT && OMPT_OPTIONAL
1297 #if KMP_USE_DYNAMIC_LOCK
1298 static kmp_mutex_impl_t
1299 __ompt_get_mutex_impl_type(void *user_lock, kmp_indirect_lock_t *ilock = 0) {
1300   if (user_lock) {
1301     switch (KMP_EXTRACT_D_TAG(user_lock)) {
1302     case 0:
1303       break;
1304 #if KMP_USE_FUTEX
1305     case locktag_futex:
1306       return kmp_mutex_impl_queuing;
1307 #endif
1308     case locktag_tas:
1309       return kmp_mutex_impl_spin;
1310 #if KMP_USE_TSX
1311     case locktag_hle:
1312       return kmp_mutex_impl_speculative;
1313 #endif
1314     default:
1315       return kmp_mutex_impl_none;
1316     }
1317     ilock = KMP_LOOKUP_I_LOCK(user_lock);
1318   }
1319   KMP_ASSERT(ilock);
1320   switch (ilock->type) {
1321 #if KMP_USE_TSX
1322   case locktag_adaptive:
1323   case locktag_rtm:
1324     return kmp_mutex_impl_speculative;
1325 #endif
1326   case locktag_nested_tas:
1327     return kmp_mutex_impl_spin;
1328 #if KMP_USE_FUTEX
1329   case locktag_nested_futex:
1330 #endif
1331   case locktag_ticket:
1332   case locktag_queuing:
1333   case locktag_drdpa:
1334   case locktag_nested_ticket:
1335   case locktag_nested_queuing:
1336   case locktag_nested_drdpa:
1337     return kmp_mutex_impl_queuing;
1338   default:
1339     return kmp_mutex_impl_none;
1340   }
1341 }
1342 #else
1343 // For locks without dynamic binding
1344 static kmp_mutex_impl_t __ompt_get_mutex_impl_type() {
1345   switch (__kmp_user_lock_kind) {
1346   case lk_tas:
1347     return kmp_mutex_impl_spin;
1348 #if KMP_USE_FUTEX
1349   case lk_futex:
1350 #endif
1351   case lk_ticket:
1352   case lk_queuing:
1353   case lk_drdpa:
1354     return kmp_mutex_impl_queuing;
1355 #if KMP_USE_TSX
1356   case lk_hle:
1357   case lk_rtm:
1358   case lk_adaptive:
1359     return kmp_mutex_impl_speculative;
1360 #endif
1361   default:
1362     return kmp_mutex_impl_none;
1363   }
1364 }
1365 #endif // KMP_USE_DYNAMIC_LOCK
1366 #endif // OMPT_SUPPORT && OMPT_OPTIONAL
1367 
1368 /*!
1369 @ingroup WORK_SHARING
1370 @param loc  source location information.
1371 @param global_tid  global thread number.
1372 @param crit identity of the critical section. This could be a pointer to a lock
1373 associated with the critical section, or some other suitably unique value.
1374 @param hint the lock hint.
1375 
1376 Enter code protected by a `critical` construct with a hint. The hint value is
1377 used to suggest a lock implementation. This function blocks until the executing
1378 thread can enter the critical section unless the hint suggests use of
1379 speculative execution and the hardware supports it.
1380 */
1381 void __kmpc_critical_with_hint(ident_t *loc, kmp_int32 global_tid,
1382                                kmp_critical_name *crit, uint32_t hint) {
1383   KMP_COUNT_BLOCK(OMP_CRITICAL);
1384   kmp_user_lock_p lck;
1385 #if OMPT_SUPPORT && OMPT_OPTIONAL
1386   ompt_state_t prev_state = ompt_state_undefined;
1387   ompt_thread_info_t ti;
1388   // This is the case, if called from __kmpc_critical:
1389   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(global_tid);
1390   if (!codeptr)
1391     codeptr = OMPT_GET_RETURN_ADDRESS(0);
1392 #endif
1393 
1394   KC_TRACE(10, ("__kmpc_critical: called T#%d\n", global_tid));
1395 
1396   kmp_dyna_lock_t *lk = (kmp_dyna_lock_t *)crit;
1397   // Check if it is initialized.
1398   KMP_PUSH_PARTITIONED_TIMER(OMP_critical_wait);
1399   if (*lk == 0) {
1400     kmp_dyna_lockseq_t lckseq = __kmp_map_hint_to_lock(hint);
1401     if (KMP_IS_D_LOCK(lckseq)) {
1402       KMP_COMPARE_AND_STORE_ACQ32((volatile kmp_int32 *)crit, 0,
1403                                   KMP_GET_D_TAG(lckseq));
1404     } else {
1405       __kmp_init_indirect_csptr(crit, loc, global_tid, KMP_GET_I_TAG(lckseq));
1406     }
1407   }
1408   // Branch for accessing the actual lock object and set operation. This
1409   // branching is inevitable since this lock initialization does not follow the
1410   // normal dispatch path (lock table is not used).
1411   if (KMP_EXTRACT_D_TAG(lk) != 0) {
1412     lck = (kmp_user_lock_p)lk;
1413     if (__kmp_env_consistency_check) {
1414       __kmp_push_sync(global_tid, ct_critical, loc, lck,
1415                       __kmp_map_hint_to_lock(hint));
1416     }
1417 #if USE_ITT_BUILD
1418     __kmp_itt_critical_acquiring(lck);
1419 #endif
1420 #if OMPT_SUPPORT && OMPT_OPTIONAL
1421     if (ompt_enabled.enabled) {
1422       ti = __kmp_threads[global_tid]->th.ompt_thread_info;
1423       /* OMPT state update */
1424       prev_state = ti.state;
1425       ti.wait_id = (ompt_wait_id_t)(uintptr_t)lck;
1426       ti.state = ompt_state_wait_critical;
1427 
1428       /* OMPT event callback */
1429       if (ompt_enabled.ompt_callback_mutex_acquire) {
1430         ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
1431             ompt_mutex_critical, (unsigned int)hint,
1432             __ompt_get_mutex_impl_type(crit), (ompt_wait_id_t)(uintptr_t)lck,
1433             codeptr);
1434       }
1435     }
1436 #endif
1437 #if KMP_USE_INLINED_TAS
1438     if (__kmp_user_lock_seq == lockseq_tas && !__kmp_env_consistency_check) {
1439       KMP_ACQUIRE_TAS_LOCK(lck, global_tid);
1440     } else
1441 #elif KMP_USE_INLINED_FUTEX
1442     if (__kmp_user_lock_seq == lockseq_futex && !__kmp_env_consistency_check) {
1443       KMP_ACQUIRE_FUTEX_LOCK(lck, global_tid);
1444     } else
1445 #endif
1446     {
1447       KMP_D_LOCK_FUNC(lk, set)(lk, global_tid);
1448     }
1449   } else {
1450     kmp_indirect_lock_t *ilk = *((kmp_indirect_lock_t **)lk);
1451     lck = ilk->lock;
1452     if (__kmp_env_consistency_check) {
1453       __kmp_push_sync(global_tid, ct_critical, loc, lck,
1454                       __kmp_map_hint_to_lock(hint));
1455     }
1456 #if USE_ITT_BUILD
1457     __kmp_itt_critical_acquiring(lck);
1458 #endif
1459 #if OMPT_SUPPORT && OMPT_OPTIONAL
1460     if (ompt_enabled.enabled) {
1461       ti = __kmp_threads[global_tid]->th.ompt_thread_info;
1462       /* OMPT state update */
1463       prev_state = ti.state;
1464       ti.wait_id = (ompt_wait_id_t)(uintptr_t)lck;
1465       ti.state = ompt_state_wait_critical;
1466 
1467       /* OMPT event callback */
1468       if (ompt_enabled.ompt_callback_mutex_acquire) {
1469         ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
1470             ompt_mutex_critical, (unsigned int)hint,
1471             __ompt_get_mutex_impl_type(0, ilk), (ompt_wait_id_t)(uintptr_t)lck,
1472             codeptr);
1473       }
1474     }
1475 #endif
1476     KMP_I_LOCK_FUNC(ilk, set)(lck, global_tid);
1477   }
1478   KMP_POP_PARTITIONED_TIMER();
1479 
1480 #if USE_ITT_BUILD
1481   __kmp_itt_critical_acquired(lck);
1482 #endif /* USE_ITT_BUILD */
1483 #if OMPT_SUPPORT && OMPT_OPTIONAL
1484   if (ompt_enabled.enabled) {
1485     /* OMPT state update */
1486     ti.state = prev_state;
1487     ti.wait_id = 0;
1488 
1489     /* OMPT event callback */
1490     if (ompt_enabled.ompt_callback_mutex_acquired) {
1491       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
1492           ompt_mutex_critical, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
1493     }
1494   }
1495 #endif
1496 
1497   KMP_PUSH_PARTITIONED_TIMER(OMP_critical);
1498   KA_TRACE(15, ("__kmpc_critical: done T#%d\n", global_tid));
1499 } // __kmpc_critical_with_hint
1500 
1501 #endif // KMP_USE_DYNAMIC_LOCK
1502 
1503 /*!
1504 @ingroup WORK_SHARING
1505 @param loc  source location information.
1506 @param global_tid  global thread number .
1507 @param crit identity of the critical section. This could be a pointer to a lock
1508 associated with the critical section, or some other suitably unique value.
1509 
1510 Leave a critical section, releasing any lock that was held during its execution.
1511 */
1512 void __kmpc_end_critical(ident_t *loc, kmp_int32 global_tid,
1513                          kmp_critical_name *crit) {
1514   kmp_user_lock_p lck;
1515 
1516   KC_TRACE(10, ("__kmpc_end_critical: called T#%d\n", global_tid));
1517 
1518 #if KMP_USE_DYNAMIC_LOCK
1519   if (KMP_IS_D_LOCK(__kmp_user_lock_seq)) {
1520     lck = (kmp_user_lock_p)crit;
1521     KMP_ASSERT(lck != NULL);
1522     if (__kmp_env_consistency_check) {
1523       __kmp_pop_sync(global_tid, ct_critical, loc);
1524     }
1525 #if USE_ITT_BUILD
1526     __kmp_itt_critical_releasing(lck);
1527 #endif
1528 #if KMP_USE_INLINED_TAS
1529     if (__kmp_user_lock_seq == lockseq_tas && !__kmp_env_consistency_check) {
1530       KMP_RELEASE_TAS_LOCK(lck, global_tid);
1531     } else
1532 #elif KMP_USE_INLINED_FUTEX
1533     if (__kmp_user_lock_seq == lockseq_futex && !__kmp_env_consistency_check) {
1534       KMP_RELEASE_FUTEX_LOCK(lck, global_tid);
1535     } else
1536 #endif
1537     {
1538       KMP_D_LOCK_FUNC(lck, unset)((kmp_dyna_lock_t *)lck, global_tid);
1539     }
1540   } else {
1541     kmp_indirect_lock_t *ilk =
1542         (kmp_indirect_lock_t *)TCR_PTR(*((kmp_indirect_lock_t **)crit));
1543     KMP_ASSERT(ilk != NULL);
1544     lck = ilk->lock;
1545     if (__kmp_env_consistency_check) {
1546       __kmp_pop_sync(global_tid, ct_critical, loc);
1547     }
1548 #if USE_ITT_BUILD
1549     __kmp_itt_critical_releasing(lck);
1550 #endif
1551     KMP_I_LOCK_FUNC(ilk, unset)(lck, global_tid);
1552   }
1553 
1554 #else // KMP_USE_DYNAMIC_LOCK
1555 
1556   if ((__kmp_user_lock_kind == lk_tas) &&
1557       (sizeof(lck->tas.lk.poll) <= OMP_CRITICAL_SIZE)) {
1558     lck = (kmp_user_lock_p)crit;
1559   }
1560 #if KMP_USE_FUTEX
1561   else if ((__kmp_user_lock_kind == lk_futex) &&
1562            (sizeof(lck->futex.lk.poll) <= OMP_CRITICAL_SIZE)) {
1563     lck = (kmp_user_lock_p)crit;
1564   }
1565 #endif
1566   else { // ticket, queuing or drdpa
1567     lck = (kmp_user_lock_p)TCR_PTR(*((kmp_user_lock_p *)crit));
1568   }
1569 
1570   KMP_ASSERT(lck != NULL);
1571 
1572   if (__kmp_env_consistency_check)
1573     __kmp_pop_sync(global_tid, ct_critical, loc);
1574 
1575 #if USE_ITT_BUILD
1576   __kmp_itt_critical_releasing(lck);
1577 #endif /* USE_ITT_BUILD */
1578   // Value of 'crit' should be good for using as a critical_id of the critical
1579   // section directive.
1580   __kmp_release_user_lock_with_checks(lck, global_tid);
1581 
1582 #endif // KMP_USE_DYNAMIC_LOCK
1583 
1584 #if OMPT_SUPPORT && OMPT_OPTIONAL
1585   /* OMPT release event triggers after lock is released; place here to trigger
1586    * for all #if branches */
1587   OMPT_STORE_RETURN_ADDRESS(global_tid);
1588   if (ompt_enabled.ompt_callback_mutex_released) {
1589     ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
1590         ompt_mutex_critical, (ompt_wait_id_t)(uintptr_t)lck,
1591         OMPT_LOAD_RETURN_ADDRESS(0));
1592   }
1593 #endif
1594 
1595   KMP_POP_PARTITIONED_TIMER();
1596   KA_TRACE(15, ("__kmpc_end_critical: done T#%d\n", global_tid));
1597 }
1598 
1599 /*!
1600 @ingroup SYNCHRONIZATION
1601 @param loc source location information
1602 @param global_tid thread id.
1603 @return one if the thread should execute the master block, zero otherwise
1604 
1605 Start execution of a combined barrier and master. The barrier is executed inside
1606 this function.
1607 */
1608 kmp_int32 __kmpc_barrier_master(ident_t *loc, kmp_int32 global_tid) {
1609   int status;
1610 
1611   KC_TRACE(10, ("__kmpc_barrier_master: called T#%d\n", global_tid));
1612 
1613   if (!TCR_4(__kmp_init_parallel))
1614     __kmp_parallel_initialize();
1615 
1616   __kmp_resume_if_soft_paused();
1617 
1618   if (__kmp_env_consistency_check)
1619     __kmp_check_barrier(global_tid, ct_barrier, loc);
1620 
1621 #if OMPT_SUPPORT
1622   ompt_frame_t *ompt_frame;
1623   if (ompt_enabled.enabled) {
1624     __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
1625     if (ompt_frame->enter_frame.ptr == NULL)
1626       ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1627     OMPT_STORE_RETURN_ADDRESS(global_tid);
1628   }
1629 #endif
1630 #if USE_ITT_NOTIFY
1631   __kmp_threads[global_tid]->th.th_ident = loc;
1632 #endif
1633   status = __kmp_barrier(bs_plain_barrier, global_tid, TRUE, 0, NULL, NULL);
1634 #if OMPT_SUPPORT && OMPT_OPTIONAL
1635   if (ompt_enabled.enabled) {
1636     ompt_frame->enter_frame = ompt_data_none;
1637   }
1638 #endif
1639 
1640   return (status != 0) ? 0 : 1;
1641 }
1642 
1643 /*!
1644 @ingroup SYNCHRONIZATION
1645 @param loc source location information
1646 @param global_tid thread id.
1647 
1648 Complete the execution of a combined barrier and master. This function should
1649 only be called at the completion of the <tt>master</tt> code. Other threads will
1650 still be waiting at the barrier and this call releases them.
1651 */
1652 void __kmpc_end_barrier_master(ident_t *loc, kmp_int32 global_tid) {
1653   KC_TRACE(10, ("__kmpc_end_barrier_master: called T#%d\n", global_tid));
1654 
1655   __kmp_end_split_barrier(bs_plain_barrier, global_tid);
1656 }
1657 
1658 /*!
1659 @ingroup SYNCHRONIZATION
1660 @param loc source location information
1661 @param global_tid thread id.
1662 @return one if the thread should execute the master block, zero otherwise
1663 
1664 Start execution of a combined barrier and master(nowait) construct.
1665 The barrier is executed inside this function.
1666 There is no equivalent "end" function, since the
1667 */
1668 kmp_int32 __kmpc_barrier_master_nowait(ident_t *loc, kmp_int32 global_tid) {
1669   kmp_int32 ret;
1670 
1671   KC_TRACE(10, ("__kmpc_barrier_master_nowait: called T#%d\n", global_tid));
1672 
1673   if (!TCR_4(__kmp_init_parallel))
1674     __kmp_parallel_initialize();
1675 
1676   __kmp_resume_if_soft_paused();
1677 
1678   if (__kmp_env_consistency_check) {
1679     if (loc == 0) {
1680       KMP_WARNING(ConstructIdentInvalid); // ??? What does it mean for the user?
1681     }
1682     __kmp_check_barrier(global_tid, ct_barrier, loc);
1683   }
1684 
1685 #if OMPT_SUPPORT
1686   ompt_frame_t *ompt_frame;
1687   if (ompt_enabled.enabled) {
1688     __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
1689     if (ompt_frame->enter_frame.ptr == NULL)
1690       ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1691     OMPT_STORE_RETURN_ADDRESS(global_tid);
1692   }
1693 #endif
1694 #if USE_ITT_NOTIFY
1695   __kmp_threads[global_tid]->th.th_ident = loc;
1696 #endif
1697   __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
1698 #if OMPT_SUPPORT && OMPT_OPTIONAL
1699   if (ompt_enabled.enabled) {
1700     ompt_frame->enter_frame = ompt_data_none;
1701   }
1702 #endif
1703 
1704   ret = __kmpc_master(loc, global_tid);
1705 
1706   if (__kmp_env_consistency_check) {
1707     /*  there's no __kmpc_end_master called; so the (stats) */
1708     /*  actions of __kmpc_end_master are done here          */
1709 
1710     if (global_tid < 0) {
1711       KMP_WARNING(ThreadIdentInvalid);
1712     }
1713     if (ret) {
1714       /* only one thread should do the pop since only */
1715       /* one did the push (see __kmpc_master())       */
1716 
1717       __kmp_pop_sync(global_tid, ct_master, loc);
1718     }
1719   }
1720 
1721   return (ret);
1722 }
1723 
1724 /* The BARRIER for a SINGLE process section is always explicit   */
1725 /*!
1726 @ingroup WORK_SHARING
1727 @param loc  source location information
1728 @param global_tid  global thread number
1729 @return One if this thread should execute the single construct, zero otherwise.
1730 
1731 Test whether to execute a <tt>single</tt> construct.
1732 There are no implicit barriers in the two "single" calls, rather the compiler
1733 should introduce an explicit barrier if it is required.
1734 */
1735 
1736 kmp_int32 __kmpc_single(ident_t *loc, kmp_int32 global_tid) {
1737   kmp_int32 rc = __kmp_enter_single(global_tid, loc, TRUE);
1738 
1739   if (rc) {
1740     // We are going to execute the single statement, so we should count it.
1741     KMP_COUNT_BLOCK(OMP_SINGLE);
1742     KMP_PUSH_PARTITIONED_TIMER(OMP_single);
1743   }
1744 
1745 #if OMPT_SUPPORT && OMPT_OPTIONAL
1746   kmp_info_t *this_thr = __kmp_threads[global_tid];
1747   kmp_team_t *team = this_thr->th.th_team;
1748   int tid = __kmp_tid_from_gtid(global_tid);
1749 
1750   if (ompt_enabled.enabled) {
1751     if (rc) {
1752       if (ompt_enabled.ompt_callback_work) {
1753         ompt_callbacks.ompt_callback(ompt_callback_work)(
1754             ompt_work_single_executor, ompt_scope_begin,
1755             &(team->t.ompt_team_info.parallel_data),
1756             &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
1757             1, OMPT_GET_RETURN_ADDRESS(0));
1758       }
1759     } else {
1760       if (ompt_enabled.ompt_callback_work) {
1761         ompt_callbacks.ompt_callback(ompt_callback_work)(
1762             ompt_work_single_other, ompt_scope_begin,
1763             &(team->t.ompt_team_info.parallel_data),
1764             &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
1765             1, OMPT_GET_RETURN_ADDRESS(0));
1766         ompt_callbacks.ompt_callback(ompt_callback_work)(
1767             ompt_work_single_other, ompt_scope_end,
1768             &(team->t.ompt_team_info.parallel_data),
1769             &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
1770             1, OMPT_GET_RETURN_ADDRESS(0));
1771       }
1772     }
1773   }
1774 #endif
1775 
1776   return rc;
1777 }
1778 
1779 /*!
1780 @ingroup WORK_SHARING
1781 @param loc  source location information
1782 @param global_tid  global thread number
1783 
1784 Mark the end of a <tt>single</tt> construct.  This function should
1785 only be called by the thread that executed the block of code protected
1786 by the `single` construct.
1787 */
1788 void __kmpc_end_single(ident_t *loc, kmp_int32 global_tid) {
1789   __kmp_exit_single(global_tid);
1790   KMP_POP_PARTITIONED_TIMER();
1791 
1792 #if OMPT_SUPPORT && OMPT_OPTIONAL
1793   kmp_info_t *this_thr = __kmp_threads[global_tid];
1794   kmp_team_t *team = this_thr->th.th_team;
1795   int tid = __kmp_tid_from_gtid(global_tid);
1796 
1797   if (ompt_enabled.ompt_callback_work) {
1798     ompt_callbacks.ompt_callback(ompt_callback_work)(
1799         ompt_work_single_executor, ompt_scope_end,
1800         &(team->t.ompt_team_info.parallel_data),
1801         &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data), 1,
1802         OMPT_GET_RETURN_ADDRESS(0));
1803   }
1804 #endif
1805 }
1806 
1807 /*!
1808 @ingroup WORK_SHARING
1809 @param loc Source location
1810 @param global_tid Global thread id
1811 
1812 Mark the end of a statically scheduled loop.
1813 */
1814 void __kmpc_for_static_fini(ident_t *loc, kmp_int32 global_tid) {
1815   KMP_POP_PARTITIONED_TIMER();
1816   KE_TRACE(10, ("__kmpc_for_static_fini called T#%d\n", global_tid));
1817 
1818 #if OMPT_SUPPORT && OMPT_OPTIONAL
1819   if (ompt_enabled.ompt_callback_work) {
1820     ompt_work_t ompt_work_type = ompt_work_loop;
1821     ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
1822     ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
1823     // Determine workshare type
1824     if (loc != NULL) {
1825       if ((loc->flags & KMP_IDENT_WORK_LOOP) != 0) {
1826         ompt_work_type = ompt_work_loop;
1827       } else if ((loc->flags & KMP_IDENT_WORK_SECTIONS) != 0) {
1828         ompt_work_type = ompt_work_sections;
1829       } else if ((loc->flags & KMP_IDENT_WORK_DISTRIBUTE) != 0) {
1830         ompt_work_type = ompt_work_distribute;
1831       } else {
1832         // use default set above.
1833         // a warning about this case is provided in __kmpc_for_static_init
1834       }
1835       KMP_DEBUG_ASSERT(ompt_work_type);
1836     }
1837     ompt_callbacks.ompt_callback(ompt_callback_work)(
1838         ompt_work_type, ompt_scope_end, &(team_info->parallel_data),
1839         &(task_info->task_data), 0, OMPT_GET_RETURN_ADDRESS(0));
1840   }
1841 #endif
1842   if (__kmp_env_consistency_check)
1843     __kmp_pop_workshare(global_tid, ct_pdo, loc);
1844 }
1845 
1846 // User routines which take C-style arguments (call by value)
1847 // different from the Fortran equivalent routines
1848 
1849 void ompc_set_num_threads(int arg) {
1850   // !!!!! TODO: check the per-task binding
1851   __kmp_set_num_threads(arg, __kmp_entry_gtid());
1852 }
1853 
1854 void ompc_set_dynamic(int flag) {
1855   kmp_info_t *thread;
1856 
1857   /* For the thread-private implementation of the internal controls */
1858   thread = __kmp_entry_thread();
1859 
1860   __kmp_save_internal_controls(thread);
1861 
1862   set__dynamic(thread, flag ? TRUE : FALSE);
1863 }
1864 
1865 void ompc_set_nested(int flag) {
1866   kmp_info_t *thread;
1867 
1868   /* For the thread-private internal controls implementation */
1869   thread = __kmp_entry_thread();
1870 
1871   __kmp_save_internal_controls(thread);
1872 
1873   set__max_active_levels(thread, flag ? __kmp_dflt_max_active_levels : 1);
1874 }
1875 
1876 void ompc_set_max_active_levels(int max_active_levels) {
1877   /* TO DO */
1878   /* we want per-task implementation of this internal control */
1879 
1880   /* For the per-thread internal controls implementation */
1881   __kmp_set_max_active_levels(__kmp_entry_gtid(), max_active_levels);
1882 }
1883 
1884 void ompc_set_schedule(omp_sched_t kind, int modifier) {
1885   // !!!!! TODO: check the per-task binding
1886   __kmp_set_schedule(__kmp_entry_gtid(), (kmp_sched_t)kind, modifier);
1887 }
1888 
1889 int ompc_get_ancestor_thread_num(int level) {
1890   return __kmp_get_ancestor_thread_num(__kmp_entry_gtid(), level);
1891 }
1892 
1893 int ompc_get_team_size(int level) {
1894   return __kmp_get_team_size(__kmp_entry_gtid(), level);
1895 }
1896 
1897 /* OpenMP 5.0 Affinity Format API */
1898 
1899 void ompc_set_affinity_format(char const *format) {
1900   if (!__kmp_init_serial) {
1901     __kmp_serial_initialize();
1902   }
1903   __kmp_strncpy_truncate(__kmp_affinity_format, KMP_AFFINITY_FORMAT_SIZE,
1904                          format, KMP_STRLEN(format) + 1);
1905 }
1906 
1907 size_t ompc_get_affinity_format(char *buffer, size_t size) {
1908   size_t format_size;
1909   if (!__kmp_init_serial) {
1910     __kmp_serial_initialize();
1911   }
1912   format_size = KMP_STRLEN(__kmp_affinity_format);
1913   if (buffer && size) {
1914     __kmp_strncpy_truncate(buffer, size, __kmp_affinity_format,
1915                            format_size + 1);
1916   }
1917   return format_size;
1918 }
1919 
1920 void ompc_display_affinity(char const *format) {
1921   int gtid;
1922   if (!TCR_4(__kmp_init_middle)) {
1923     __kmp_middle_initialize();
1924   }
1925   gtid = __kmp_get_gtid();
1926   __kmp_aux_display_affinity(gtid, format);
1927 }
1928 
1929 size_t ompc_capture_affinity(char *buffer, size_t buf_size,
1930                              char const *format) {
1931   int gtid;
1932   size_t num_required;
1933   kmp_str_buf_t capture_buf;
1934   if (!TCR_4(__kmp_init_middle)) {
1935     __kmp_middle_initialize();
1936   }
1937   gtid = __kmp_get_gtid();
1938   __kmp_str_buf_init(&capture_buf);
1939   num_required = __kmp_aux_capture_affinity(gtid, format, &capture_buf);
1940   if (buffer && buf_size) {
1941     __kmp_strncpy_truncate(buffer, buf_size, capture_buf.str,
1942                            capture_buf.used + 1);
1943   }
1944   __kmp_str_buf_free(&capture_buf);
1945   return num_required;
1946 }
1947 
1948 void kmpc_set_stacksize(int arg) {
1949   // __kmp_aux_set_stacksize initializes the library if needed
1950   __kmp_aux_set_stacksize(arg);
1951 }
1952 
1953 void kmpc_set_stacksize_s(size_t arg) {
1954   // __kmp_aux_set_stacksize initializes the library if needed
1955   __kmp_aux_set_stacksize(arg);
1956 }
1957 
1958 void kmpc_set_blocktime(int arg) {
1959   int gtid, tid;
1960   kmp_info_t *thread;
1961 
1962   gtid = __kmp_entry_gtid();
1963   tid = __kmp_tid_from_gtid(gtid);
1964   thread = __kmp_thread_from_gtid(gtid);
1965 
1966   __kmp_aux_set_blocktime(arg, thread, tid);
1967 }
1968 
1969 void kmpc_set_library(int arg) {
1970   // __kmp_user_set_library initializes the library if needed
1971   __kmp_user_set_library((enum library_type)arg);
1972 }
1973 
1974 void kmpc_set_defaults(char const *str) {
1975   // __kmp_aux_set_defaults initializes the library if needed
1976   __kmp_aux_set_defaults(str, KMP_STRLEN(str));
1977 }
1978 
1979 void kmpc_set_disp_num_buffers(int arg) {
1980   // ignore after initialization because some teams have already
1981   // allocated dispatch buffers
1982   if (__kmp_init_serial == 0 && arg > 0)
1983     __kmp_dispatch_num_buffers = arg;
1984 }
1985 
1986 int kmpc_set_affinity_mask_proc(int proc, void **mask) {
1987 #if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED
1988   return -1;
1989 #else
1990   if (!TCR_4(__kmp_init_middle)) {
1991     __kmp_middle_initialize();
1992   }
1993   return __kmp_aux_set_affinity_mask_proc(proc, mask);
1994 #endif
1995 }
1996 
1997 int kmpc_unset_affinity_mask_proc(int proc, void **mask) {
1998 #if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED
1999   return -1;
2000 #else
2001   if (!TCR_4(__kmp_init_middle)) {
2002     __kmp_middle_initialize();
2003   }
2004   return __kmp_aux_unset_affinity_mask_proc(proc, mask);
2005 #endif
2006 }
2007 
2008 int kmpc_get_affinity_mask_proc(int proc, void **mask) {
2009 #if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED
2010   return -1;
2011 #else
2012   if (!TCR_4(__kmp_init_middle)) {
2013     __kmp_middle_initialize();
2014   }
2015   return __kmp_aux_get_affinity_mask_proc(proc, mask);
2016 #endif
2017 }
2018 
2019 /* -------------------------------------------------------------------------- */
2020 /*!
2021 @ingroup THREADPRIVATE
2022 @param loc       source location information
2023 @param gtid      global thread number
2024 @param cpy_size  size of the cpy_data buffer
2025 @param cpy_data  pointer to data to be copied
2026 @param cpy_func  helper function to call for copying data
2027 @param didit     flag variable: 1=single thread; 0=not single thread
2028 
2029 __kmpc_copyprivate implements the interface for the private data broadcast
2030 needed for the copyprivate clause associated with a single region in an
2031 OpenMP<sup>*</sup> program (both C and Fortran).
2032 All threads participating in the parallel region call this routine.
2033 One of the threads (called the single thread) should have the <tt>didit</tt>
2034 variable set to 1 and all other threads should have that variable set to 0.
2035 All threads pass a pointer to a data buffer (cpy_data) that they have built.
2036 
2037 The OpenMP specification forbids the use of nowait on the single region when a
2038 copyprivate clause is present. However, @ref __kmpc_copyprivate implements a
2039 barrier internally to avoid race conditions, so the code generation for the
2040 single region should avoid generating a barrier after the call to @ref
2041 __kmpc_copyprivate.
2042 
2043 The <tt>gtid</tt> parameter is the global thread id for the current thread.
2044 The <tt>loc</tt> parameter is a pointer to source location information.
2045 
2046 Internal implementation: The single thread will first copy its descriptor
2047 address (cpy_data) to a team-private location, then the other threads will each
2048 call the function pointed to by the parameter cpy_func, which carries out the
2049 copy by copying the data using the cpy_data buffer.
2050 
2051 The cpy_func routine used for the copy and the contents of the data area defined
2052 by cpy_data and cpy_size may be built in any fashion that will allow the copy
2053 to be done. For instance, the cpy_data buffer can hold the actual data to be
2054 copied or it may hold a list of pointers to the data. The cpy_func routine must
2055 interpret the cpy_data buffer appropriately.
2056 
2057 The interface to cpy_func is as follows:
2058 @code
2059 void cpy_func( void *destination, void *source )
2060 @endcode
2061 where void *destination is the cpy_data pointer for the thread being copied to
2062 and void *source is the cpy_data pointer for the thread being copied from.
2063 */
2064 void __kmpc_copyprivate(ident_t *loc, kmp_int32 gtid, size_t cpy_size,
2065                         void *cpy_data, void (*cpy_func)(void *, void *),
2066                         kmp_int32 didit) {
2067   void **data_ptr;
2068 
2069   KC_TRACE(10, ("__kmpc_copyprivate: called T#%d\n", gtid));
2070 
2071   KMP_MB();
2072 
2073   data_ptr = &__kmp_team_from_gtid(gtid)->t.t_copypriv_data;
2074 
2075   if (__kmp_env_consistency_check) {
2076     if (loc == 0) {
2077       KMP_WARNING(ConstructIdentInvalid);
2078     }
2079   }
2080 
2081   // ToDo: Optimize the following two barriers into some kind of split barrier
2082 
2083   if (didit)
2084     *data_ptr = cpy_data;
2085 
2086 #if OMPT_SUPPORT
2087   ompt_frame_t *ompt_frame;
2088   if (ompt_enabled.enabled) {
2089     __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
2090     if (ompt_frame->enter_frame.ptr == NULL)
2091       ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
2092     OMPT_STORE_RETURN_ADDRESS(gtid);
2093   }
2094 #endif
2095 /* This barrier is not a barrier region boundary */
2096 #if USE_ITT_NOTIFY
2097   __kmp_threads[gtid]->th.th_ident = loc;
2098 #endif
2099   __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);
2100 
2101   if (!didit)
2102     (*cpy_func)(cpy_data, *data_ptr);
2103 
2104 // Consider next barrier a user-visible barrier for barrier region boundaries
2105 // Nesting checks are already handled by the single construct checks
2106 
2107 #if OMPT_SUPPORT
2108   if (ompt_enabled.enabled) {
2109     OMPT_STORE_RETURN_ADDRESS(gtid);
2110   }
2111 #endif
2112 #if USE_ITT_NOTIFY
2113   __kmp_threads[gtid]->th.th_ident = loc; // TODO: check if it is needed (e.g.
2114 // tasks can overwrite the location)
2115 #endif
2116   __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);
2117 #if OMPT_SUPPORT && OMPT_OPTIONAL
2118   if (ompt_enabled.enabled) {
2119     ompt_frame->enter_frame = ompt_data_none;
2120   }
2121 #endif
2122 }
2123 
2124 /* -------------------------------------------------------------------------- */
2125 
2126 #define INIT_LOCK __kmp_init_user_lock_with_checks
2127 #define INIT_NESTED_LOCK __kmp_init_nested_user_lock_with_checks
2128 #define ACQUIRE_LOCK __kmp_acquire_user_lock_with_checks
2129 #define ACQUIRE_LOCK_TIMED __kmp_acquire_user_lock_with_checks_timed
2130 #define ACQUIRE_NESTED_LOCK __kmp_acquire_nested_user_lock_with_checks
2131 #define ACQUIRE_NESTED_LOCK_TIMED                                              \
2132   __kmp_acquire_nested_user_lock_with_checks_timed
2133 #define RELEASE_LOCK __kmp_release_user_lock_with_checks
2134 #define RELEASE_NESTED_LOCK __kmp_release_nested_user_lock_with_checks
2135 #define TEST_LOCK __kmp_test_user_lock_with_checks
2136 #define TEST_NESTED_LOCK __kmp_test_nested_user_lock_with_checks
2137 #define DESTROY_LOCK __kmp_destroy_user_lock_with_checks
2138 #define DESTROY_NESTED_LOCK __kmp_destroy_nested_user_lock_with_checks
2139 
2140 // TODO: Make check abort messages use location info & pass it into
2141 // with_checks routines
2142 
2143 #if KMP_USE_DYNAMIC_LOCK
2144 
2145 // internal lock initializer
2146 static __forceinline void __kmp_init_lock_with_hint(ident_t *loc, void **lock,
2147                                                     kmp_dyna_lockseq_t seq) {
2148   if (KMP_IS_D_LOCK(seq)) {
2149     KMP_INIT_D_LOCK(lock, seq);
2150 #if USE_ITT_BUILD
2151     __kmp_itt_lock_creating((kmp_user_lock_p)lock, NULL);
2152 #endif
2153   } else {
2154     KMP_INIT_I_LOCK(lock, seq);
2155 #if USE_ITT_BUILD
2156     kmp_indirect_lock_t *ilk = KMP_LOOKUP_I_LOCK(lock);
2157     __kmp_itt_lock_creating(ilk->lock, loc);
2158 #endif
2159   }
2160 }
2161 
2162 // internal nest lock initializer
2163 static __forceinline void
2164 __kmp_init_nest_lock_with_hint(ident_t *loc, void **lock,
2165                                kmp_dyna_lockseq_t seq) {
2166 #if KMP_USE_TSX
2167   // Don't have nested lock implementation for speculative locks
2168   if (seq == lockseq_hle || seq == lockseq_rtm || seq == lockseq_adaptive)
2169     seq = __kmp_user_lock_seq;
2170 #endif
2171   switch (seq) {
2172   case lockseq_tas:
2173     seq = lockseq_nested_tas;
2174     break;
2175 #if KMP_USE_FUTEX
2176   case lockseq_futex:
2177     seq = lockseq_nested_futex;
2178     break;
2179 #endif
2180   case lockseq_ticket:
2181     seq = lockseq_nested_ticket;
2182     break;
2183   case lockseq_queuing:
2184     seq = lockseq_nested_queuing;
2185     break;
2186   case lockseq_drdpa:
2187     seq = lockseq_nested_drdpa;
2188     break;
2189   default:
2190     seq = lockseq_nested_queuing;
2191   }
2192   KMP_INIT_I_LOCK(lock, seq);
2193 #if USE_ITT_BUILD
2194   kmp_indirect_lock_t *ilk = KMP_LOOKUP_I_LOCK(lock);
2195   __kmp_itt_lock_creating(ilk->lock, loc);
2196 #endif
2197 }
2198 
2199 /* initialize the lock with a hint */
2200 void __kmpc_init_lock_with_hint(ident_t *loc, kmp_int32 gtid, void **user_lock,
2201                                 uintptr_t hint) {
2202   KMP_DEBUG_ASSERT(__kmp_init_serial);
2203   if (__kmp_env_consistency_check && user_lock == NULL) {
2204     KMP_FATAL(LockIsUninitialized, "omp_init_lock_with_hint");
2205   }
2206 
2207   __kmp_init_lock_with_hint(loc, user_lock, __kmp_map_hint_to_lock(hint));
2208 
2209 #if OMPT_SUPPORT && OMPT_OPTIONAL
2210   // This is the case, if called from omp_init_lock_with_hint:
2211   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2212   if (!codeptr)
2213     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2214   if (ompt_enabled.ompt_callback_lock_init) {
2215     ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2216         ompt_mutex_lock, (omp_lock_hint_t)hint,
2217         __ompt_get_mutex_impl_type(user_lock),
2218         (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2219   }
2220 #endif
2221 }
2222 
2223 /* initialize the lock with a hint */
2224 void __kmpc_init_nest_lock_with_hint(ident_t *loc, kmp_int32 gtid,
2225                                      void **user_lock, uintptr_t hint) {
2226   KMP_DEBUG_ASSERT(__kmp_init_serial);
2227   if (__kmp_env_consistency_check && user_lock == NULL) {
2228     KMP_FATAL(LockIsUninitialized, "omp_init_nest_lock_with_hint");
2229   }
2230 
2231   __kmp_init_nest_lock_with_hint(loc, user_lock, __kmp_map_hint_to_lock(hint));
2232 
2233 #if OMPT_SUPPORT && OMPT_OPTIONAL
2234   // This is the case, if called from omp_init_lock_with_hint:
2235   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2236   if (!codeptr)
2237     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2238   if (ompt_enabled.ompt_callback_lock_init) {
2239     ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2240         ompt_mutex_nest_lock, (omp_lock_hint_t)hint,
2241         __ompt_get_mutex_impl_type(user_lock),
2242         (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2243   }
2244 #endif
2245 }
2246 
2247 #endif // KMP_USE_DYNAMIC_LOCK
2248 
2249 /* initialize the lock */
2250 void __kmpc_init_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2251 #if KMP_USE_DYNAMIC_LOCK
2252 
2253   KMP_DEBUG_ASSERT(__kmp_init_serial);
2254   if (__kmp_env_consistency_check && user_lock == NULL) {
2255     KMP_FATAL(LockIsUninitialized, "omp_init_lock");
2256   }
2257   __kmp_init_lock_with_hint(loc, user_lock, __kmp_user_lock_seq);
2258 
2259 #if OMPT_SUPPORT && OMPT_OPTIONAL
2260   // This is the case, if called from omp_init_lock_with_hint:
2261   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2262   if (!codeptr)
2263     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2264   if (ompt_enabled.ompt_callback_lock_init) {
2265     ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2266         ompt_mutex_lock, omp_lock_hint_none,
2267         __ompt_get_mutex_impl_type(user_lock),
2268         (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2269   }
2270 #endif
2271 
2272 #else // KMP_USE_DYNAMIC_LOCK
2273 
2274   static char const *const func = "omp_init_lock";
2275   kmp_user_lock_p lck;
2276   KMP_DEBUG_ASSERT(__kmp_init_serial);
2277 
2278   if (__kmp_env_consistency_check) {
2279     if (user_lock == NULL) {
2280       KMP_FATAL(LockIsUninitialized, func);
2281     }
2282   }
2283 
2284   KMP_CHECK_USER_LOCK_INIT();
2285 
2286   if ((__kmp_user_lock_kind == lk_tas) &&
2287       (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
2288     lck = (kmp_user_lock_p)user_lock;
2289   }
2290 #if KMP_USE_FUTEX
2291   else if ((__kmp_user_lock_kind == lk_futex) &&
2292            (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
2293     lck = (kmp_user_lock_p)user_lock;
2294   }
2295 #endif
2296   else {
2297     lck = __kmp_user_lock_allocate(user_lock, gtid, 0);
2298   }
2299   INIT_LOCK(lck);
2300   __kmp_set_user_lock_location(lck, loc);
2301 
2302 #if OMPT_SUPPORT && OMPT_OPTIONAL
2303   // This is the case, if called from omp_init_lock_with_hint:
2304   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2305   if (!codeptr)
2306     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2307   if (ompt_enabled.ompt_callback_lock_init) {
2308     ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2309         ompt_mutex_lock, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
2310         (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2311   }
2312 #endif
2313 
2314 #if USE_ITT_BUILD
2315   __kmp_itt_lock_creating(lck);
2316 #endif /* USE_ITT_BUILD */
2317 
2318 #endif // KMP_USE_DYNAMIC_LOCK
2319 } // __kmpc_init_lock
2320 
2321 /* initialize the lock */
2322 void __kmpc_init_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2323 #if KMP_USE_DYNAMIC_LOCK
2324 
2325   KMP_DEBUG_ASSERT(__kmp_init_serial);
2326   if (__kmp_env_consistency_check && user_lock == NULL) {
2327     KMP_FATAL(LockIsUninitialized, "omp_init_nest_lock");
2328   }
2329   __kmp_init_nest_lock_with_hint(loc, user_lock, __kmp_user_lock_seq);
2330 
2331 #if OMPT_SUPPORT && OMPT_OPTIONAL
2332   // This is the case, if called from omp_init_lock_with_hint:
2333   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2334   if (!codeptr)
2335     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2336   if (ompt_enabled.ompt_callback_lock_init) {
2337     ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2338         ompt_mutex_nest_lock, omp_lock_hint_none,
2339         __ompt_get_mutex_impl_type(user_lock),
2340         (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2341   }
2342 #endif
2343 
2344 #else // KMP_USE_DYNAMIC_LOCK
2345 
2346   static char const *const func = "omp_init_nest_lock";
2347   kmp_user_lock_p lck;
2348   KMP_DEBUG_ASSERT(__kmp_init_serial);
2349 
2350   if (__kmp_env_consistency_check) {
2351     if (user_lock == NULL) {
2352       KMP_FATAL(LockIsUninitialized, func);
2353     }
2354   }
2355 
2356   KMP_CHECK_USER_LOCK_INIT();
2357 
2358   if ((__kmp_user_lock_kind == lk_tas) &&
2359       (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
2360        OMP_NEST_LOCK_T_SIZE)) {
2361     lck = (kmp_user_lock_p)user_lock;
2362   }
2363 #if KMP_USE_FUTEX
2364   else if ((__kmp_user_lock_kind == lk_futex) &&
2365            (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
2366             OMP_NEST_LOCK_T_SIZE)) {
2367     lck = (kmp_user_lock_p)user_lock;
2368   }
2369 #endif
2370   else {
2371     lck = __kmp_user_lock_allocate(user_lock, gtid, 0);
2372   }
2373 
2374   INIT_NESTED_LOCK(lck);
2375   __kmp_set_user_lock_location(lck, loc);
2376 
2377 #if OMPT_SUPPORT && OMPT_OPTIONAL
2378   // This is the case, if called from omp_init_lock_with_hint:
2379   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2380   if (!codeptr)
2381     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2382   if (ompt_enabled.ompt_callback_lock_init) {
2383     ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2384         ompt_mutex_nest_lock, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
2385         (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2386   }
2387 #endif
2388 
2389 #if USE_ITT_BUILD
2390   __kmp_itt_lock_creating(lck);
2391 #endif /* USE_ITT_BUILD */
2392 
2393 #endif // KMP_USE_DYNAMIC_LOCK
2394 } // __kmpc_init_nest_lock
2395 
2396 void __kmpc_destroy_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2397 #if KMP_USE_DYNAMIC_LOCK
2398 
2399 #if USE_ITT_BUILD
2400   kmp_user_lock_p lck;
2401   if (KMP_EXTRACT_D_TAG(user_lock) == 0) {
2402     lck = ((kmp_indirect_lock_t *)KMP_LOOKUP_I_LOCK(user_lock))->lock;
2403   } else {
2404     lck = (kmp_user_lock_p)user_lock;
2405   }
2406   __kmp_itt_lock_destroyed(lck);
2407 #endif
2408 #if OMPT_SUPPORT && OMPT_OPTIONAL
2409   // This is the case, if called from omp_init_lock_with_hint:
2410   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2411   if (!codeptr)
2412     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2413   if (ompt_enabled.ompt_callback_lock_destroy) {
2414     kmp_user_lock_p lck;
2415     if (KMP_EXTRACT_D_TAG(user_lock) == 0) {
2416       lck = ((kmp_indirect_lock_t *)KMP_LOOKUP_I_LOCK(user_lock))->lock;
2417     } else {
2418       lck = (kmp_user_lock_p)user_lock;
2419     }
2420     ompt_callbacks.ompt_callback(ompt_callback_lock_destroy)(
2421         ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2422   }
2423 #endif
2424   KMP_D_LOCK_FUNC(user_lock, destroy)((kmp_dyna_lock_t *)user_lock);
2425 #else
2426   kmp_user_lock_p lck;
2427 
2428   if ((__kmp_user_lock_kind == lk_tas) &&
2429       (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
2430     lck = (kmp_user_lock_p)user_lock;
2431   }
2432 #if KMP_USE_FUTEX
2433   else if ((__kmp_user_lock_kind == lk_futex) &&
2434            (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
2435     lck = (kmp_user_lock_p)user_lock;
2436   }
2437 #endif
2438   else {
2439     lck = __kmp_lookup_user_lock(user_lock, "omp_destroy_lock");
2440   }
2441 
2442 #if OMPT_SUPPORT && OMPT_OPTIONAL
2443   // This is the case, if called from omp_init_lock_with_hint:
2444   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2445   if (!codeptr)
2446     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2447   if (ompt_enabled.ompt_callback_lock_destroy) {
2448     ompt_callbacks.ompt_callback(ompt_callback_lock_destroy)(
2449         ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2450   }
2451 #endif
2452 
2453 #if USE_ITT_BUILD
2454   __kmp_itt_lock_destroyed(lck);
2455 #endif /* USE_ITT_BUILD */
2456   DESTROY_LOCK(lck);
2457 
2458   if ((__kmp_user_lock_kind == lk_tas) &&
2459       (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
2460     ;
2461   }
2462 #if KMP_USE_FUTEX
2463   else if ((__kmp_user_lock_kind == lk_futex) &&
2464            (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
2465     ;
2466   }
2467 #endif
2468   else {
2469     __kmp_user_lock_free(user_lock, gtid, lck);
2470   }
2471 #endif // KMP_USE_DYNAMIC_LOCK
2472 } // __kmpc_destroy_lock
2473 
2474 /* destroy the lock */
2475 void __kmpc_destroy_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2476 #if KMP_USE_DYNAMIC_LOCK
2477 
2478 #if USE_ITT_BUILD
2479   kmp_indirect_lock_t *ilk = KMP_LOOKUP_I_LOCK(user_lock);
2480   __kmp_itt_lock_destroyed(ilk->lock);
2481 #endif
2482 #if OMPT_SUPPORT && OMPT_OPTIONAL
2483   // This is the case, if called from omp_init_lock_with_hint:
2484   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2485   if (!codeptr)
2486     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2487   if (ompt_enabled.ompt_callback_lock_destroy) {
2488     ompt_callbacks.ompt_callback(ompt_callback_lock_destroy)(
2489         ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2490   }
2491 #endif
2492   KMP_D_LOCK_FUNC(user_lock, destroy)((kmp_dyna_lock_t *)user_lock);
2493 
2494 #else // KMP_USE_DYNAMIC_LOCK
2495 
2496   kmp_user_lock_p lck;
2497 
2498   if ((__kmp_user_lock_kind == lk_tas) &&
2499       (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
2500        OMP_NEST_LOCK_T_SIZE)) {
2501     lck = (kmp_user_lock_p)user_lock;
2502   }
2503 #if KMP_USE_FUTEX
2504   else if ((__kmp_user_lock_kind == lk_futex) &&
2505            (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
2506             OMP_NEST_LOCK_T_SIZE)) {
2507     lck = (kmp_user_lock_p)user_lock;
2508   }
2509 #endif
2510   else {
2511     lck = __kmp_lookup_user_lock(user_lock, "omp_destroy_nest_lock");
2512   }
2513 
2514 #if OMPT_SUPPORT && OMPT_OPTIONAL
2515   // This is the case, if called from omp_init_lock_with_hint:
2516   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2517   if (!codeptr)
2518     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2519   if (ompt_enabled.ompt_callback_lock_destroy) {
2520     ompt_callbacks.ompt_callback(ompt_callback_lock_destroy)(
2521         ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2522   }
2523 #endif
2524 
2525 #if USE_ITT_BUILD
2526   __kmp_itt_lock_destroyed(lck);
2527 #endif /* USE_ITT_BUILD */
2528 
2529   DESTROY_NESTED_LOCK(lck);
2530 
2531   if ((__kmp_user_lock_kind == lk_tas) &&
2532       (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
2533        OMP_NEST_LOCK_T_SIZE)) {
2534     ;
2535   }
2536 #if KMP_USE_FUTEX
2537   else if ((__kmp_user_lock_kind == lk_futex) &&
2538            (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
2539             OMP_NEST_LOCK_T_SIZE)) {
2540     ;
2541   }
2542 #endif
2543   else {
2544     __kmp_user_lock_free(user_lock, gtid, lck);
2545   }
2546 #endif // KMP_USE_DYNAMIC_LOCK
2547 } // __kmpc_destroy_nest_lock
2548 
2549 void __kmpc_set_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2550   KMP_COUNT_BLOCK(OMP_set_lock);
2551 #if KMP_USE_DYNAMIC_LOCK
2552   int tag = KMP_EXTRACT_D_TAG(user_lock);
2553 #if USE_ITT_BUILD
2554   __kmp_itt_lock_acquiring(
2555       (kmp_user_lock_p)
2556           user_lock); // itt function will get to the right lock object.
2557 #endif
2558 #if OMPT_SUPPORT && OMPT_OPTIONAL
2559   // This is the case, if called from omp_init_lock_with_hint:
2560   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2561   if (!codeptr)
2562     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2563   if (ompt_enabled.ompt_callback_mutex_acquire) {
2564     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
2565         ompt_mutex_lock, omp_lock_hint_none,
2566         __ompt_get_mutex_impl_type(user_lock),
2567         (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2568   }
2569 #endif
2570 #if KMP_USE_INLINED_TAS
2571   if (tag == locktag_tas && !__kmp_env_consistency_check) {
2572     KMP_ACQUIRE_TAS_LOCK(user_lock, gtid);
2573   } else
2574 #elif KMP_USE_INLINED_FUTEX
2575   if (tag == locktag_futex && !__kmp_env_consistency_check) {
2576     KMP_ACQUIRE_FUTEX_LOCK(user_lock, gtid);
2577   } else
2578 #endif
2579   {
2580     __kmp_direct_set[tag]((kmp_dyna_lock_t *)user_lock, gtid);
2581   }
2582 #if USE_ITT_BUILD
2583   __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock);
2584 #endif
2585 #if OMPT_SUPPORT && OMPT_OPTIONAL
2586   if (ompt_enabled.ompt_callback_mutex_acquired) {
2587     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
2588         ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2589   }
2590 #endif
2591 
2592 #else // KMP_USE_DYNAMIC_LOCK
2593 
2594   kmp_user_lock_p lck;
2595 
2596   if ((__kmp_user_lock_kind == lk_tas) &&
2597       (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
2598     lck = (kmp_user_lock_p)user_lock;
2599   }
2600 #if KMP_USE_FUTEX
2601   else if ((__kmp_user_lock_kind == lk_futex) &&
2602            (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
2603     lck = (kmp_user_lock_p)user_lock;
2604   }
2605 #endif
2606   else {
2607     lck = __kmp_lookup_user_lock(user_lock, "omp_set_lock");
2608   }
2609 
2610 #if USE_ITT_BUILD
2611   __kmp_itt_lock_acquiring(lck);
2612 #endif /* USE_ITT_BUILD */
2613 #if OMPT_SUPPORT && OMPT_OPTIONAL
2614   // This is the case, if called from omp_init_lock_with_hint:
2615   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2616   if (!codeptr)
2617     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2618   if (ompt_enabled.ompt_callback_mutex_acquire) {
2619     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
2620         ompt_mutex_lock, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
2621         (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2622   }
2623 #endif
2624 
2625   ACQUIRE_LOCK(lck, gtid);
2626 
2627 #if USE_ITT_BUILD
2628   __kmp_itt_lock_acquired(lck);
2629 #endif /* USE_ITT_BUILD */
2630 
2631 #if OMPT_SUPPORT && OMPT_OPTIONAL
2632   if (ompt_enabled.ompt_callback_mutex_acquired) {
2633     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
2634         ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2635   }
2636 #endif
2637 
2638 #endif // KMP_USE_DYNAMIC_LOCK
2639 }
2640 
2641 void __kmpc_set_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2642 #if KMP_USE_DYNAMIC_LOCK
2643 
2644 #if USE_ITT_BUILD
2645   __kmp_itt_lock_acquiring((kmp_user_lock_p)user_lock);
2646 #endif
2647 #if OMPT_SUPPORT && OMPT_OPTIONAL
2648   // This is the case, if called from omp_init_lock_with_hint:
2649   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2650   if (!codeptr)
2651     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2652   if (ompt_enabled.enabled) {
2653     if (ompt_enabled.ompt_callback_mutex_acquire) {
2654       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
2655           ompt_mutex_nest_lock, omp_lock_hint_none,
2656           __ompt_get_mutex_impl_type(user_lock),
2657           (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2658     }
2659   }
2660 #endif
2661   int acquire_status =
2662       KMP_D_LOCK_FUNC(user_lock, set)((kmp_dyna_lock_t *)user_lock, gtid);
2663   (void) acquire_status;
2664 #if USE_ITT_BUILD
2665   __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock);
2666 #endif
2667 
2668 #if OMPT_SUPPORT && OMPT_OPTIONAL
2669   if (ompt_enabled.enabled) {
2670     if (acquire_status == KMP_LOCK_ACQUIRED_FIRST) {
2671       if (ompt_enabled.ompt_callback_mutex_acquired) {
2672         // lock_first
2673         ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
2674             ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)user_lock,
2675             codeptr);
2676       }
2677     } else {
2678       if (ompt_enabled.ompt_callback_nest_lock) {
2679         // lock_next
2680         ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
2681             ompt_scope_begin, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2682       }
2683     }
2684   }
2685 #endif
2686 
2687 #else // KMP_USE_DYNAMIC_LOCK
2688   int acquire_status;
2689   kmp_user_lock_p lck;
2690 
2691   if ((__kmp_user_lock_kind == lk_tas) &&
2692       (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
2693        OMP_NEST_LOCK_T_SIZE)) {
2694     lck = (kmp_user_lock_p)user_lock;
2695   }
2696 #if KMP_USE_FUTEX
2697   else if ((__kmp_user_lock_kind == lk_futex) &&
2698            (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
2699             OMP_NEST_LOCK_T_SIZE)) {
2700     lck = (kmp_user_lock_p)user_lock;
2701   }
2702 #endif
2703   else {
2704     lck = __kmp_lookup_user_lock(user_lock, "omp_set_nest_lock");
2705   }
2706 
2707 #if USE_ITT_BUILD
2708   __kmp_itt_lock_acquiring(lck);
2709 #endif /* USE_ITT_BUILD */
2710 #if OMPT_SUPPORT && OMPT_OPTIONAL
2711   // This is the case, if called from omp_init_lock_with_hint:
2712   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2713   if (!codeptr)
2714     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2715   if (ompt_enabled.enabled) {
2716     if (ompt_enabled.ompt_callback_mutex_acquire) {
2717       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
2718           ompt_mutex_nest_lock, omp_lock_hint_none,
2719           __ompt_get_mutex_impl_type(), (ompt_wait_id_t)(uintptr_t)lck,
2720           codeptr);
2721     }
2722   }
2723 #endif
2724 
2725   ACQUIRE_NESTED_LOCK(lck, gtid, &acquire_status);
2726 
2727 #if USE_ITT_BUILD
2728   __kmp_itt_lock_acquired(lck);
2729 #endif /* USE_ITT_BUILD */
2730 
2731 #if OMPT_SUPPORT && OMPT_OPTIONAL
2732   if (ompt_enabled.enabled) {
2733     if (acquire_status == KMP_LOCK_ACQUIRED_FIRST) {
2734       if (ompt_enabled.ompt_callback_mutex_acquired) {
2735         // lock_first
2736         ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
2737             ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2738       }
2739     } else {
2740       if (ompt_enabled.ompt_callback_nest_lock) {
2741         // lock_next
2742         ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
2743             ompt_scope_begin, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2744       }
2745     }
2746   }
2747 #endif
2748 
2749 #endif // KMP_USE_DYNAMIC_LOCK
2750 }
2751 
2752 void __kmpc_unset_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2753 #if KMP_USE_DYNAMIC_LOCK
2754 
2755   int tag = KMP_EXTRACT_D_TAG(user_lock);
2756 #if USE_ITT_BUILD
2757   __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock);
2758 #endif
2759 #if KMP_USE_INLINED_TAS
2760   if (tag == locktag_tas && !__kmp_env_consistency_check) {
2761     KMP_RELEASE_TAS_LOCK(user_lock, gtid);
2762   } else
2763 #elif KMP_USE_INLINED_FUTEX
2764   if (tag == locktag_futex && !__kmp_env_consistency_check) {
2765     KMP_RELEASE_FUTEX_LOCK(user_lock, gtid);
2766   } else
2767 #endif
2768   {
2769     __kmp_direct_unset[tag]((kmp_dyna_lock_t *)user_lock, gtid);
2770   }
2771 
2772 #if OMPT_SUPPORT && OMPT_OPTIONAL
2773   // This is the case, if called from omp_init_lock_with_hint:
2774   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2775   if (!codeptr)
2776     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2777   if (ompt_enabled.ompt_callback_mutex_released) {
2778     ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
2779         ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2780   }
2781 #endif
2782 
2783 #else // KMP_USE_DYNAMIC_LOCK
2784 
2785   kmp_user_lock_p lck;
2786 
2787   /* Can't use serial interval since not block structured */
2788   /* release the lock */
2789 
2790   if ((__kmp_user_lock_kind == lk_tas) &&
2791       (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
2792 #if KMP_OS_LINUX &&                                                            \
2793     (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
2794 // "fast" path implemented to fix customer performance issue
2795 #if USE_ITT_BUILD
2796     __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock);
2797 #endif /* USE_ITT_BUILD */
2798     TCW_4(((kmp_user_lock_p)user_lock)->tas.lk.poll, 0);
2799     KMP_MB();
2800 
2801 #if OMPT_SUPPORT && OMPT_OPTIONAL
2802     // This is the case, if called from omp_init_lock_with_hint:
2803     void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2804     if (!codeptr)
2805       codeptr = OMPT_GET_RETURN_ADDRESS(0);
2806     if (ompt_enabled.ompt_callback_mutex_released) {
2807       ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
2808           ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2809     }
2810 #endif
2811 
2812     return;
2813 #else
2814     lck = (kmp_user_lock_p)user_lock;
2815 #endif
2816   }
2817 #if KMP_USE_FUTEX
2818   else if ((__kmp_user_lock_kind == lk_futex) &&
2819            (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
2820     lck = (kmp_user_lock_p)user_lock;
2821   }
2822 #endif
2823   else {
2824     lck = __kmp_lookup_user_lock(user_lock, "omp_unset_lock");
2825   }
2826 
2827 #if USE_ITT_BUILD
2828   __kmp_itt_lock_releasing(lck);
2829 #endif /* USE_ITT_BUILD */
2830 
2831   RELEASE_LOCK(lck, gtid);
2832 
2833 #if OMPT_SUPPORT && OMPT_OPTIONAL
2834   // This is the case, if called from omp_init_lock_with_hint:
2835   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2836   if (!codeptr)
2837     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2838   if (ompt_enabled.ompt_callback_mutex_released) {
2839     ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
2840         ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2841   }
2842 #endif
2843 
2844 #endif // KMP_USE_DYNAMIC_LOCK
2845 }
2846 
2847 /* release the lock */
2848 void __kmpc_unset_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2849 #if KMP_USE_DYNAMIC_LOCK
2850 
2851 #if USE_ITT_BUILD
2852   __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock);
2853 #endif
2854   int release_status =
2855       KMP_D_LOCK_FUNC(user_lock, unset)((kmp_dyna_lock_t *)user_lock, gtid);
2856   (void) release_status;
2857 
2858 #if OMPT_SUPPORT && OMPT_OPTIONAL
2859   // This is the case, if called from omp_init_lock_with_hint:
2860   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2861   if (!codeptr)
2862     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2863   if (ompt_enabled.enabled) {
2864     if (release_status == KMP_LOCK_RELEASED) {
2865       if (ompt_enabled.ompt_callback_mutex_released) {
2866         // release_lock_last
2867         ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
2868             ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)user_lock,
2869             codeptr);
2870       }
2871     } else if (ompt_enabled.ompt_callback_nest_lock) {
2872       // release_lock_prev
2873       ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
2874           ompt_scope_end, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2875     }
2876   }
2877 #endif
2878 
2879 #else // KMP_USE_DYNAMIC_LOCK
2880 
2881   kmp_user_lock_p lck;
2882 
2883   /* Can't use serial interval since not block structured */
2884 
2885   if ((__kmp_user_lock_kind == lk_tas) &&
2886       (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
2887        OMP_NEST_LOCK_T_SIZE)) {
2888 #if KMP_OS_LINUX &&                                                            \
2889     (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
2890     // "fast" path implemented to fix customer performance issue
2891     kmp_tas_lock_t *tl = (kmp_tas_lock_t *)user_lock;
2892 #if USE_ITT_BUILD
2893     __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock);
2894 #endif /* USE_ITT_BUILD */
2895 
2896 #if OMPT_SUPPORT && OMPT_OPTIONAL
2897     int release_status = KMP_LOCK_STILL_HELD;
2898 #endif
2899 
2900     if (--(tl->lk.depth_locked) == 0) {
2901       TCW_4(tl->lk.poll, 0);
2902 #if OMPT_SUPPORT && OMPT_OPTIONAL
2903       release_status = KMP_LOCK_RELEASED;
2904 #endif
2905     }
2906     KMP_MB();
2907 
2908 #if OMPT_SUPPORT && OMPT_OPTIONAL
2909     // This is the case, if called from omp_init_lock_with_hint:
2910     void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2911     if (!codeptr)
2912       codeptr = OMPT_GET_RETURN_ADDRESS(0);
2913     if (ompt_enabled.enabled) {
2914       if (release_status == KMP_LOCK_RELEASED) {
2915         if (ompt_enabled.ompt_callback_mutex_released) {
2916           // release_lock_last
2917           ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
2918               ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2919         }
2920       } else if (ompt_enabled.ompt_callback_nest_lock) {
2921         // release_lock_previous
2922         ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
2923             ompt_mutex_scope_end, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2924       }
2925     }
2926 #endif
2927 
2928     return;
2929 #else
2930     lck = (kmp_user_lock_p)user_lock;
2931 #endif
2932   }
2933 #if KMP_USE_FUTEX
2934   else if ((__kmp_user_lock_kind == lk_futex) &&
2935            (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
2936             OMP_NEST_LOCK_T_SIZE)) {
2937     lck = (kmp_user_lock_p)user_lock;
2938   }
2939 #endif
2940   else {
2941     lck = __kmp_lookup_user_lock(user_lock, "omp_unset_nest_lock");
2942   }
2943 
2944 #if USE_ITT_BUILD
2945   __kmp_itt_lock_releasing(lck);
2946 #endif /* USE_ITT_BUILD */
2947 
2948   int release_status;
2949   release_status = RELEASE_NESTED_LOCK(lck, gtid);
2950 #if OMPT_SUPPORT && OMPT_OPTIONAL
2951   // This is the case, if called from omp_init_lock_with_hint:
2952   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2953   if (!codeptr)
2954     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2955   if (ompt_enabled.enabled) {
2956     if (release_status == KMP_LOCK_RELEASED) {
2957       if (ompt_enabled.ompt_callback_mutex_released) {
2958         // release_lock_last
2959         ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
2960             ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2961       }
2962     } else if (ompt_enabled.ompt_callback_nest_lock) {
2963       // release_lock_previous
2964       ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
2965           ompt_mutex_scope_end, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2966     }
2967   }
2968 #endif
2969 
2970 #endif // KMP_USE_DYNAMIC_LOCK
2971 }
2972 
2973 /* try to acquire the lock */
2974 int __kmpc_test_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2975   KMP_COUNT_BLOCK(OMP_test_lock);
2976 
2977 #if KMP_USE_DYNAMIC_LOCK
2978   int rc;
2979   int tag = KMP_EXTRACT_D_TAG(user_lock);
2980 #if USE_ITT_BUILD
2981   __kmp_itt_lock_acquiring((kmp_user_lock_p)user_lock);
2982 #endif
2983 #if OMPT_SUPPORT && OMPT_OPTIONAL
2984   // This is the case, if called from omp_init_lock_with_hint:
2985   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2986   if (!codeptr)
2987     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2988   if (ompt_enabled.ompt_callback_mutex_acquire) {
2989     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
2990         ompt_mutex_lock, omp_lock_hint_none,
2991         __ompt_get_mutex_impl_type(user_lock),
2992         (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2993   }
2994 #endif
2995 #if KMP_USE_INLINED_TAS
2996   if (tag == locktag_tas && !__kmp_env_consistency_check) {
2997     KMP_TEST_TAS_LOCK(user_lock, gtid, rc);
2998   } else
2999 #elif KMP_USE_INLINED_FUTEX
3000   if (tag == locktag_futex && !__kmp_env_consistency_check) {
3001     KMP_TEST_FUTEX_LOCK(user_lock, gtid, rc);
3002   } else
3003 #endif
3004   {
3005     rc = __kmp_direct_test[tag]((kmp_dyna_lock_t *)user_lock, gtid);
3006   }
3007   if (rc) {
3008 #if USE_ITT_BUILD
3009     __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock);
3010 #endif
3011 #if OMPT_SUPPORT && OMPT_OPTIONAL
3012     if (ompt_enabled.ompt_callback_mutex_acquired) {
3013       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
3014           ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
3015     }
3016 #endif
3017     return FTN_TRUE;
3018   } else {
3019 #if USE_ITT_BUILD
3020     __kmp_itt_lock_cancelled((kmp_user_lock_p)user_lock);
3021 #endif
3022     return FTN_FALSE;
3023   }
3024 
3025 #else // KMP_USE_DYNAMIC_LOCK
3026 
3027   kmp_user_lock_p lck;
3028   int rc;
3029 
3030   if ((__kmp_user_lock_kind == lk_tas) &&
3031       (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
3032     lck = (kmp_user_lock_p)user_lock;
3033   }
3034 #if KMP_USE_FUTEX
3035   else if ((__kmp_user_lock_kind == lk_futex) &&
3036            (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
3037     lck = (kmp_user_lock_p)user_lock;
3038   }
3039 #endif
3040   else {
3041     lck = __kmp_lookup_user_lock(user_lock, "omp_test_lock");
3042   }
3043 
3044 #if USE_ITT_BUILD
3045   __kmp_itt_lock_acquiring(lck);
3046 #endif /* USE_ITT_BUILD */
3047 #if OMPT_SUPPORT && OMPT_OPTIONAL
3048   // This is the case, if called from omp_init_lock_with_hint:
3049   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
3050   if (!codeptr)
3051     codeptr = OMPT_GET_RETURN_ADDRESS(0);
3052   if (ompt_enabled.ompt_callback_mutex_acquire) {
3053     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
3054         ompt_mutex_lock, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
3055         (ompt_wait_id_t)(uintptr_t)lck, codeptr);
3056   }
3057 #endif
3058 
3059   rc = TEST_LOCK(lck, gtid);
3060 #if USE_ITT_BUILD
3061   if (rc) {
3062     __kmp_itt_lock_acquired(lck);
3063   } else {
3064     __kmp_itt_lock_cancelled(lck);
3065   }
3066 #endif /* USE_ITT_BUILD */
3067 #if OMPT_SUPPORT && OMPT_OPTIONAL
3068   if (rc && ompt_enabled.ompt_callback_mutex_acquired) {
3069     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
3070         ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
3071   }
3072 #endif
3073 
3074   return (rc ? FTN_TRUE : FTN_FALSE);
3075 
3076 /* Can't use serial interval since not block structured */
3077 
3078 #endif // KMP_USE_DYNAMIC_LOCK
3079 }
3080 
3081 /* try to acquire the lock */
3082 int __kmpc_test_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
3083 #if KMP_USE_DYNAMIC_LOCK
3084   int rc;
3085 #if USE_ITT_BUILD
3086   __kmp_itt_lock_acquiring((kmp_user_lock_p)user_lock);
3087 #endif
3088 #if OMPT_SUPPORT && OMPT_OPTIONAL
3089   // This is the case, if called from omp_init_lock_with_hint:
3090   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
3091   if (!codeptr)
3092     codeptr = OMPT_GET_RETURN_ADDRESS(0);
3093   if (ompt_enabled.ompt_callback_mutex_acquire) {
3094     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
3095         ompt_mutex_nest_lock, omp_lock_hint_none,
3096         __ompt_get_mutex_impl_type(user_lock),
3097         (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
3098   }
3099 #endif
3100   rc = KMP_D_LOCK_FUNC(user_lock, test)((kmp_dyna_lock_t *)user_lock, gtid);
3101 #if USE_ITT_BUILD
3102   if (rc) {
3103     __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock);
3104   } else {
3105     __kmp_itt_lock_cancelled((kmp_user_lock_p)user_lock);
3106   }
3107 #endif
3108 #if OMPT_SUPPORT && OMPT_OPTIONAL
3109   if (ompt_enabled.enabled && rc) {
3110     if (rc == 1) {
3111       if (ompt_enabled.ompt_callback_mutex_acquired) {
3112         // lock_first
3113         ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
3114             ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)user_lock,
3115             codeptr);
3116       }
3117     } else {
3118       if (ompt_enabled.ompt_callback_nest_lock) {
3119         // lock_next
3120         ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
3121             ompt_scope_begin, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
3122       }
3123     }
3124   }
3125 #endif
3126   return rc;
3127 
3128 #else // KMP_USE_DYNAMIC_LOCK
3129 
3130   kmp_user_lock_p lck;
3131   int rc;
3132 
3133   if ((__kmp_user_lock_kind == lk_tas) &&
3134       (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
3135        OMP_NEST_LOCK_T_SIZE)) {
3136     lck = (kmp_user_lock_p)user_lock;
3137   }
3138 #if KMP_USE_FUTEX
3139   else if ((__kmp_user_lock_kind == lk_futex) &&
3140            (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
3141             OMP_NEST_LOCK_T_SIZE)) {
3142     lck = (kmp_user_lock_p)user_lock;
3143   }
3144 #endif
3145   else {
3146     lck = __kmp_lookup_user_lock(user_lock, "omp_test_nest_lock");
3147   }
3148 
3149 #if USE_ITT_BUILD
3150   __kmp_itt_lock_acquiring(lck);
3151 #endif /* USE_ITT_BUILD */
3152 
3153 #if OMPT_SUPPORT && OMPT_OPTIONAL
3154   // This is the case, if called from omp_init_lock_with_hint:
3155   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
3156   if (!codeptr)
3157     codeptr = OMPT_GET_RETURN_ADDRESS(0);
3158   if (ompt_enabled.enabled) &&
3159         ompt_enabled.ompt_callback_mutex_acquire) {
3160       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
3161           ompt_mutex_nest_lock, omp_lock_hint_none,
3162           __ompt_get_mutex_impl_type(), (ompt_wait_id_t)(uintptr_t)lck,
3163           codeptr);
3164     }
3165 #endif
3166 
3167   rc = TEST_NESTED_LOCK(lck, gtid);
3168 #if USE_ITT_BUILD
3169   if (rc) {
3170     __kmp_itt_lock_acquired(lck);
3171   } else {
3172     __kmp_itt_lock_cancelled(lck);
3173   }
3174 #endif /* USE_ITT_BUILD */
3175 #if OMPT_SUPPORT && OMPT_OPTIONAL
3176   if (ompt_enabled.enabled && rc) {
3177     if (rc == 1) {
3178       if (ompt_enabled.ompt_callback_mutex_acquired) {
3179         // lock_first
3180         ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
3181             ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
3182       }
3183     } else {
3184       if (ompt_enabled.ompt_callback_nest_lock) {
3185         // lock_next
3186         ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
3187             ompt_mutex_scope_begin, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
3188       }
3189     }
3190   }
3191 #endif
3192   return rc;
3193 
3194 /* Can't use serial interval since not block structured */
3195 
3196 #endif // KMP_USE_DYNAMIC_LOCK
3197 }
3198 
3199 // Interface to fast scalable reduce methods routines
3200 
3201 // keep the selected method in a thread local structure for cross-function
3202 // usage: will be used in __kmpc_end_reduce* functions;
3203 // another solution: to re-determine the method one more time in
3204 // __kmpc_end_reduce* functions (new prototype required then)
3205 // AT: which solution is better?
3206 #define __KMP_SET_REDUCTION_METHOD(gtid, rmethod)                              \
3207   ((__kmp_threads[(gtid)]->th.th_local.packed_reduction_method) = (rmethod))
3208 
3209 #define __KMP_GET_REDUCTION_METHOD(gtid)                                       \
3210   (__kmp_threads[(gtid)]->th.th_local.packed_reduction_method)
3211 
3212 // description of the packed_reduction_method variable: look at the macros in
3213 // kmp.h
3214 
3215 // used in a critical section reduce block
3216 static __forceinline void
3217 __kmp_enter_critical_section_reduce_block(ident_t *loc, kmp_int32 global_tid,
3218                                           kmp_critical_name *crit) {
3219 
3220   // this lock was visible to a customer and to the threading profile tool as a
3221   // serial overhead span (although it's used for an internal purpose only)
3222   //            why was it visible in previous implementation?
3223   //            should we keep it visible in new reduce block?
3224   kmp_user_lock_p lck;
3225 
3226 #if KMP_USE_DYNAMIC_LOCK
3227 
3228   kmp_dyna_lock_t *lk = (kmp_dyna_lock_t *)crit;
3229   // Check if it is initialized.
3230   if (*lk == 0) {
3231     if (KMP_IS_D_LOCK(__kmp_user_lock_seq)) {
3232       KMP_COMPARE_AND_STORE_ACQ32((volatile kmp_int32 *)crit, 0,
3233                                   KMP_GET_D_TAG(__kmp_user_lock_seq));
3234     } else {
3235       __kmp_init_indirect_csptr(crit, loc, global_tid,
3236                                 KMP_GET_I_TAG(__kmp_user_lock_seq));
3237     }
3238   }
3239   // Branch for accessing the actual lock object and set operation. This
3240   // branching is inevitable since this lock initialization does not follow the
3241   // normal dispatch path (lock table is not used).
3242   if (KMP_EXTRACT_D_TAG(lk) != 0) {
3243     lck = (kmp_user_lock_p)lk;
3244     KMP_DEBUG_ASSERT(lck != NULL);
3245     if (__kmp_env_consistency_check) {
3246       __kmp_push_sync(global_tid, ct_critical, loc, lck, __kmp_user_lock_seq);
3247     }
3248     KMP_D_LOCK_FUNC(lk, set)(lk, global_tid);
3249   } else {
3250     kmp_indirect_lock_t *ilk = *((kmp_indirect_lock_t **)lk);
3251     lck = ilk->lock;
3252     KMP_DEBUG_ASSERT(lck != NULL);
3253     if (__kmp_env_consistency_check) {
3254       __kmp_push_sync(global_tid, ct_critical, loc, lck, __kmp_user_lock_seq);
3255     }
3256     KMP_I_LOCK_FUNC(ilk, set)(lck, global_tid);
3257   }
3258 
3259 #else // KMP_USE_DYNAMIC_LOCK
3260 
3261   // We know that the fast reduction code is only emitted by Intel compilers
3262   // with 32 byte critical sections. If there isn't enough space, then we
3263   // have to use a pointer.
3264   if (__kmp_base_user_lock_size <= INTEL_CRITICAL_SIZE) {
3265     lck = (kmp_user_lock_p)crit;
3266   } else {
3267     lck = __kmp_get_critical_section_ptr(crit, loc, global_tid);
3268   }
3269   KMP_DEBUG_ASSERT(lck != NULL);
3270 
3271   if (__kmp_env_consistency_check)
3272     __kmp_push_sync(global_tid, ct_critical, loc, lck);
3273 
3274   __kmp_acquire_user_lock_with_checks(lck, global_tid);
3275 
3276 #endif // KMP_USE_DYNAMIC_LOCK
3277 }
3278 
3279 // used in a critical section reduce block
3280 static __forceinline void
3281 __kmp_end_critical_section_reduce_block(ident_t *loc, kmp_int32 global_tid,
3282                                         kmp_critical_name *crit) {
3283 
3284   kmp_user_lock_p lck;
3285 
3286 #if KMP_USE_DYNAMIC_LOCK
3287 
3288   if (KMP_IS_D_LOCK(__kmp_user_lock_seq)) {
3289     lck = (kmp_user_lock_p)crit;
3290     if (__kmp_env_consistency_check)
3291       __kmp_pop_sync(global_tid, ct_critical, loc);
3292     KMP_D_LOCK_FUNC(lck, unset)((kmp_dyna_lock_t *)lck, global_tid);
3293   } else {
3294     kmp_indirect_lock_t *ilk =
3295         (kmp_indirect_lock_t *)TCR_PTR(*((kmp_indirect_lock_t **)crit));
3296     if (__kmp_env_consistency_check)
3297       __kmp_pop_sync(global_tid, ct_critical, loc);
3298     KMP_I_LOCK_FUNC(ilk, unset)(ilk->lock, global_tid);
3299   }
3300 
3301 #else // KMP_USE_DYNAMIC_LOCK
3302 
3303   // We know that the fast reduction code is only emitted by Intel compilers
3304   // with 32 byte critical sections. If there isn't enough space, then we have
3305   // to use a pointer.
3306   if (__kmp_base_user_lock_size > 32) {
3307     lck = *((kmp_user_lock_p *)crit);
3308     KMP_ASSERT(lck != NULL);
3309   } else {
3310     lck = (kmp_user_lock_p)crit;
3311   }
3312 
3313   if (__kmp_env_consistency_check)
3314     __kmp_pop_sync(global_tid, ct_critical, loc);
3315 
3316   __kmp_release_user_lock_with_checks(lck, global_tid);
3317 
3318 #endif // KMP_USE_DYNAMIC_LOCK
3319 } // __kmp_end_critical_section_reduce_block
3320 
3321 static __forceinline int
3322 __kmp_swap_teams_for_teams_reduction(kmp_info_t *th, kmp_team_t **team_p,
3323                                      int *task_state) {
3324   kmp_team_t *team;
3325 
3326   // Check if we are inside the teams construct?
3327   if (th->th.th_teams_microtask) {
3328     *team_p = team = th->th.th_team;
3329     if (team->t.t_level == th->th.th_teams_level) {
3330       // This is reduction at teams construct.
3331       KMP_DEBUG_ASSERT(!th->th.th_info.ds.ds_tid); // AC: check that tid == 0
3332       // Let's swap teams temporarily for the reduction.
3333       th->th.th_info.ds.ds_tid = team->t.t_master_tid;
3334       th->th.th_team = team->t.t_parent;
3335       th->th.th_team_nproc = th->th.th_team->t.t_nproc;
3336       th->th.th_task_team = th->th.th_team->t.t_task_team[0];
3337       *task_state = th->th.th_task_state;
3338       th->th.th_task_state = 0;
3339 
3340       return 1;
3341     }
3342   }
3343   return 0;
3344 }
3345 
3346 static __forceinline void
3347 __kmp_restore_swapped_teams(kmp_info_t *th, kmp_team_t *team, int task_state) {
3348   // Restore thread structure swapped in __kmp_swap_teams_for_teams_reduction.
3349   th->th.th_info.ds.ds_tid = 0;
3350   th->th.th_team = team;
3351   th->th.th_team_nproc = team->t.t_nproc;
3352   th->th.th_task_team = team->t.t_task_team[task_state];
3353   th->th.th_task_state = task_state;
3354 }
3355 
3356 /* 2.a.i. Reduce Block without a terminating barrier */
3357 /*!
3358 @ingroup SYNCHRONIZATION
3359 @param loc source location information
3360 @param global_tid global thread number
3361 @param num_vars number of items (variables) to be reduced
3362 @param reduce_size size of data in bytes to be reduced
3363 @param reduce_data pointer to data to be reduced
3364 @param reduce_func callback function providing reduction operation on two
3365 operands and returning result of reduction in lhs_data
3366 @param lck pointer to the unique lock data structure
3367 @result 1 for the master thread, 0 for all other team threads, 2 for all team
3368 threads if atomic reduction needed
3369 
3370 The nowait version is used for a reduce clause with the nowait argument.
3371 */
3372 kmp_int32
3373 __kmpc_reduce_nowait(ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars,
3374                      size_t reduce_size, void *reduce_data,
3375                      void (*reduce_func)(void *lhs_data, void *rhs_data),
3376                      kmp_critical_name *lck) {
3377 
3378   KMP_COUNT_BLOCK(REDUCE_nowait);
3379   int retval = 0;
3380   PACKED_REDUCTION_METHOD_T packed_reduction_method;
3381   kmp_info_t *th;
3382   kmp_team_t *team;
3383   int teams_swapped = 0, task_state;
3384   KA_TRACE(10, ("__kmpc_reduce_nowait() enter: called T#%d\n", global_tid));
3385 
3386   // why do we need this initialization here at all?
3387   // Reduction clause can not be used as a stand-alone directive.
3388 
3389   // do not call __kmp_serial_initialize(), it will be called by
3390   // __kmp_parallel_initialize() if needed
3391   // possible detection of false-positive race by the threadchecker ???
3392   if (!TCR_4(__kmp_init_parallel))
3393     __kmp_parallel_initialize();
3394 
3395   __kmp_resume_if_soft_paused();
3396 
3397 // check correctness of reduce block nesting
3398 #if KMP_USE_DYNAMIC_LOCK
3399   if (__kmp_env_consistency_check)
3400     __kmp_push_sync(global_tid, ct_reduce, loc, NULL, 0);
3401 #else
3402   if (__kmp_env_consistency_check)
3403     __kmp_push_sync(global_tid, ct_reduce, loc, NULL);
3404 #endif
3405 
3406   th = __kmp_thread_from_gtid(global_tid);
3407   teams_swapped = __kmp_swap_teams_for_teams_reduction(th, &team, &task_state);
3408 
3409   // packed_reduction_method value will be reused by __kmp_end_reduce* function,
3410   // the value should be kept in a variable
3411   // the variable should be either a construct-specific or thread-specific
3412   // property, not a team specific property
3413   //     (a thread can reach the next reduce block on the next construct, reduce
3414   //     method may differ on the next construct)
3415   // an ident_t "loc" parameter could be used as a construct-specific property
3416   // (what if loc == 0?)
3417   //     (if both construct-specific and team-specific variables were shared,
3418   //     then unness extra syncs should be needed)
3419   // a thread-specific variable is better regarding two issues above (next
3420   // construct and extra syncs)
3421   // a thread-specific "th_local.reduction_method" variable is used currently
3422   // each thread executes 'determine' and 'set' lines (no need to execute by one
3423   // thread, to avoid unness extra syncs)
3424 
3425   packed_reduction_method = __kmp_determine_reduction_method(
3426       loc, global_tid, num_vars, reduce_size, reduce_data, reduce_func, lck);
3427   __KMP_SET_REDUCTION_METHOD(global_tid, packed_reduction_method);
3428 
3429   OMPT_REDUCTION_DECL(th, global_tid);
3430   if (packed_reduction_method == critical_reduce_block) {
3431 
3432     OMPT_REDUCTION_BEGIN;
3433 
3434     __kmp_enter_critical_section_reduce_block(loc, global_tid, lck);
3435     retval = 1;
3436 
3437   } else if (packed_reduction_method == empty_reduce_block) {
3438 
3439     OMPT_REDUCTION_BEGIN;
3440 
3441     // usage: if team size == 1, no synchronization is required ( Intel
3442     // platforms only )
3443     retval = 1;
3444 
3445   } else if (packed_reduction_method == atomic_reduce_block) {
3446 
3447     retval = 2;
3448 
3449     // all threads should do this pop here (because __kmpc_end_reduce_nowait()
3450     // won't be called by the code gen)
3451     //     (it's not quite good, because the checking block has been closed by
3452     //     this 'pop',
3453     //      but atomic operation has not been executed yet, will be executed
3454     //      slightly later, literally on next instruction)
3455     if (__kmp_env_consistency_check)
3456       __kmp_pop_sync(global_tid, ct_reduce, loc);
3457 
3458   } else if (TEST_REDUCTION_METHOD(packed_reduction_method,
3459                                    tree_reduce_block)) {
3460 
3461 // AT: performance issue: a real barrier here
3462 // AT:     (if master goes slow, other threads are blocked here waiting for the
3463 // master to come and release them)
3464 // AT:     (it's not what a customer might expect specifying NOWAIT clause)
3465 // AT:     (specifying NOWAIT won't result in improvement of performance, it'll
3466 // be confusing to a customer)
3467 // AT: another implementation of *barrier_gather*nowait() (or some other design)
3468 // might go faster and be more in line with sense of NOWAIT
3469 // AT: TO DO: do epcc test and compare times
3470 
3471 // this barrier should be invisible to a customer and to the threading profile
3472 // tool (it's neither a terminating barrier nor customer's code, it's
3473 // used for an internal purpose)
3474 #if OMPT_SUPPORT
3475     // JP: can this barrier potentially leed to task scheduling?
3476     // JP: as long as there is a barrier in the implementation, OMPT should and
3477     // will provide the barrier events
3478     //         so we set-up the necessary frame/return addresses.
3479     ompt_frame_t *ompt_frame;
3480     if (ompt_enabled.enabled) {
3481       __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
3482       if (ompt_frame->enter_frame.ptr == NULL)
3483         ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
3484       OMPT_STORE_RETURN_ADDRESS(global_tid);
3485     }
3486 #endif
3487 #if USE_ITT_NOTIFY
3488     __kmp_threads[global_tid]->th.th_ident = loc;
3489 #endif
3490     retval =
3491         __kmp_barrier(UNPACK_REDUCTION_BARRIER(packed_reduction_method),
3492                       global_tid, FALSE, reduce_size, reduce_data, reduce_func);
3493     retval = (retval != 0) ? (0) : (1);
3494 #if OMPT_SUPPORT && OMPT_OPTIONAL
3495     if (ompt_enabled.enabled) {
3496       ompt_frame->enter_frame = ompt_data_none;
3497     }
3498 #endif
3499 
3500     // all other workers except master should do this pop here
3501     //     ( none of other workers will get to __kmpc_end_reduce_nowait() )
3502     if (__kmp_env_consistency_check) {
3503       if (retval == 0) {
3504         __kmp_pop_sync(global_tid, ct_reduce, loc);
3505       }
3506     }
3507 
3508   } else {
3509 
3510     // should never reach this block
3511     KMP_ASSERT(0); // "unexpected method"
3512   }
3513   if (teams_swapped) {
3514     __kmp_restore_swapped_teams(th, team, task_state);
3515   }
3516   KA_TRACE(
3517       10,
3518       ("__kmpc_reduce_nowait() exit: called T#%d: method %08x, returns %08x\n",
3519        global_tid, packed_reduction_method, retval));
3520 
3521   return retval;
3522 }
3523 
3524 /*!
3525 @ingroup SYNCHRONIZATION
3526 @param loc source location information
3527 @param global_tid global thread id.
3528 @param lck pointer to the unique lock data structure
3529 
3530 Finish the execution of a reduce nowait.
3531 */
3532 void __kmpc_end_reduce_nowait(ident_t *loc, kmp_int32 global_tid,
3533                               kmp_critical_name *lck) {
3534 
3535   PACKED_REDUCTION_METHOD_T packed_reduction_method;
3536 
3537   KA_TRACE(10, ("__kmpc_end_reduce_nowait() enter: called T#%d\n", global_tid));
3538 
3539   packed_reduction_method = __KMP_GET_REDUCTION_METHOD(global_tid);
3540 
3541   OMPT_REDUCTION_DECL(__kmp_thread_from_gtid(global_tid), global_tid);
3542 
3543   if (packed_reduction_method == critical_reduce_block) {
3544 
3545     __kmp_end_critical_section_reduce_block(loc, global_tid, lck);
3546     OMPT_REDUCTION_END;
3547 
3548   } else if (packed_reduction_method == empty_reduce_block) {
3549 
3550     // usage: if team size == 1, no synchronization is required ( on Intel
3551     // platforms only )
3552 
3553     OMPT_REDUCTION_END;
3554 
3555   } else if (packed_reduction_method == atomic_reduce_block) {
3556 
3557     // neither master nor other workers should get here
3558     //     (code gen does not generate this call in case 2: atomic reduce block)
3559     // actually it's better to remove this elseif at all;
3560     // after removal this value will checked by the 'else' and will assert
3561 
3562   } else if (TEST_REDUCTION_METHOD(packed_reduction_method,
3563                                    tree_reduce_block)) {
3564 
3565     // only master gets here
3566     // OMPT: tree reduction is annotated in the barrier code
3567 
3568   } else {
3569 
3570     // should never reach this block
3571     KMP_ASSERT(0); // "unexpected method"
3572   }
3573 
3574   if (__kmp_env_consistency_check)
3575     __kmp_pop_sync(global_tid, ct_reduce, loc);
3576 
3577   KA_TRACE(10, ("__kmpc_end_reduce_nowait() exit: called T#%d: method %08x\n",
3578                 global_tid, packed_reduction_method));
3579 
3580   return;
3581 }
3582 
3583 /* 2.a.ii. Reduce Block with a terminating barrier */
3584 
3585 /*!
3586 @ingroup SYNCHRONIZATION
3587 @param loc source location information
3588 @param global_tid global thread number
3589 @param num_vars number of items (variables) to be reduced
3590 @param reduce_size size of data in bytes to be reduced
3591 @param reduce_data pointer to data to be reduced
3592 @param reduce_func callback function providing reduction operation on two
3593 operands and returning result of reduction in lhs_data
3594 @param lck pointer to the unique lock data structure
3595 @result 1 for the master thread, 0 for all other team threads, 2 for all team
3596 threads if atomic reduction needed
3597 
3598 A blocking reduce that includes an implicit barrier.
3599 */
3600 kmp_int32 __kmpc_reduce(ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars,
3601                         size_t reduce_size, void *reduce_data,
3602                         void (*reduce_func)(void *lhs_data, void *rhs_data),
3603                         kmp_critical_name *lck) {
3604   KMP_COUNT_BLOCK(REDUCE_wait);
3605   int retval = 0;
3606   PACKED_REDUCTION_METHOD_T packed_reduction_method;
3607   kmp_info_t *th;
3608   kmp_team_t *team;
3609   int teams_swapped = 0, task_state;
3610 
3611   KA_TRACE(10, ("__kmpc_reduce() enter: called T#%d\n", global_tid));
3612 
3613   // why do we need this initialization here at all?
3614   // Reduction clause can not be a stand-alone directive.
3615 
3616   // do not call __kmp_serial_initialize(), it will be called by
3617   // __kmp_parallel_initialize() if needed
3618   // possible detection of false-positive race by the threadchecker ???
3619   if (!TCR_4(__kmp_init_parallel))
3620     __kmp_parallel_initialize();
3621 
3622   __kmp_resume_if_soft_paused();
3623 
3624 // check correctness of reduce block nesting
3625 #if KMP_USE_DYNAMIC_LOCK
3626   if (__kmp_env_consistency_check)
3627     __kmp_push_sync(global_tid, ct_reduce, loc, NULL, 0);
3628 #else
3629   if (__kmp_env_consistency_check)
3630     __kmp_push_sync(global_tid, ct_reduce, loc, NULL);
3631 #endif
3632 
3633   th = __kmp_thread_from_gtid(global_tid);
3634   teams_swapped = __kmp_swap_teams_for_teams_reduction(th, &team, &task_state);
3635 
3636   packed_reduction_method = __kmp_determine_reduction_method(
3637       loc, global_tid, num_vars, reduce_size, reduce_data, reduce_func, lck);
3638   __KMP_SET_REDUCTION_METHOD(global_tid, packed_reduction_method);
3639 
3640   OMPT_REDUCTION_DECL(th, global_tid);
3641 
3642   if (packed_reduction_method == critical_reduce_block) {
3643 
3644     OMPT_REDUCTION_BEGIN;
3645     __kmp_enter_critical_section_reduce_block(loc, global_tid, lck);
3646     retval = 1;
3647 
3648   } else if (packed_reduction_method == empty_reduce_block) {
3649 
3650     OMPT_REDUCTION_BEGIN;
3651     // usage: if team size == 1, no synchronization is required ( Intel
3652     // platforms only )
3653     retval = 1;
3654 
3655   } else if (packed_reduction_method == atomic_reduce_block) {
3656 
3657     retval = 2;
3658 
3659   } else if (TEST_REDUCTION_METHOD(packed_reduction_method,
3660                                    tree_reduce_block)) {
3661 
3662 // case tree_reduce_block:
3663 // this barrier should be visible to a customer and to the threading profile
3664 // tool (it's a terminating barrier on constructs if NOWAIT not specified)
3665 #if OMPT_SUPPORT
3666     ompt_frame_t *ompt_frame;
3667     if (ompt_enabled.enabled) {
3668       __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
3669       if (ompt_frame->enter_frame.ptr == NULL)
3670         ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
3671       OMPT_STORE_RETURN_ADDRESS(global_tid);
3672     }
3673 #endif
3674 #if USE_ITT_NOTIFY
3675     __kmp_threads[global_tid]->th.th_ident =
3676         loc; // needed for correct notification of frames
3677 #endif
3678     retval =
3679         __kmp_barrier(UNPACK_REDUCTION_BARRIER(packed_reduction_method),
3680                       global_tid, TRUE, reduce_size, reduce_data, reduce_func);
3681     retval = (retval != 0) ? (0) : (1);
3682 #if OMPT_SUPPORT && OMPT_OPTIONAL
3683     if (ompt_enabled.enabled) {
3684       ompt_frame->enter_frame = ompt_data_none;
3685     }
3686 #endif
3687 
3688     // all other workers except master should do this pop here
3689     // ( none of other workers except master will enter __kmpc_end_reduce() )
3690     if (__kmp_env_consistency_check) {
3691       if (retval == 0) { // 0: all other workers; 1: master
3692         __kmp_pop_sync(global_tid, ct_reduce, loc);
3693       }
3694     }
3695 
3696   } else {
3697 
3698     // should never reach this block
3699     KMP_ASSERT(0); // "unexpected method"
3700   }
3701   if (teams_swapped) {
3702     __kmp_restore_swapped_teams(th, team, task_state);
3703   }
3704 
3705   KA_TRACE(10,
3706            ("__kmpc_reduce() exit: called T#%d: method %08x, returns %08x\n",
3707             global_tid, packed_reduction_method, retval));
3708   return retval;
3709 }
3710 
3711 /*!
3712 @ingroup SYNCHRONIZATION
3713 @param loc source location information
3714 @param global_tid global thread id.
3715 @param lck pointer to the unique lock data structure
3716 
3717 Finish the execution of a blocking reduce.
3718 The <tt>lck</tt> pointer must be the same as that used in the corresponding
3719 start function.
3720 */
3721 void __kmpc_end_reduce(ident_t *loc, kmp_int32 global_tid,
3722                        kmp_critical_name *lck) {
3723 
3724   PACKED_REDUCTION_METHOD_T packed_reduction_method;
3725   kmp_info_t *th;
3726   kmp_team_t *team;
3727   int teams_swapped = 0, task_state;
3728 
3729   KA_TRACE(10, ("__kmpc_end_reduce() enter: called T#%d\n", global_tid));
3730 
3731   th = __kmp_thread_from_gtid(global_tid);
3732   teams_swapped = __kmp_swap_teams_for_teams_reduction(th, &team, &task_state);
3733 
3734   packed_reduction_method = __KMP_GET_REDUCTION_METHOD(global_tid);
3735 
3736   // this barrier should be visible to a customer and to the threading profile
3737   // tool (it's a terminating barrier on constructs if NOWAIT not specified)
3738   OMPT_REDUCTION_DECL(th, global_tid);
3739 
3740   if (packed_reduction_method == critical_reduce_block) {
3741     __kmp_end_critical_section_reduce_block(loc, global_tid, lck);
3742 
3743     OMPT_REDUCTION_END;
3744 
3745 // TODO: implicit barrier: should be exposed
3746 #if OMPT_SUPPORT
3747     ompt_frame_t *ompt_frame;
3748     if (ompt_enabled.enabled) {
3749       __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
3750       if (ompt_frame->enter_frame.ptr == NULL)
3751         ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
3752       OMPT_STORE_RETURN_ADDRESS(global_tid);
3753     }
3754 #endif
3755 #if USE_ITT_NOTIFY
3756     __kmp_threads[global_tid]->th.th_ident = loc;
3757 #endif
3758     __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
3759 #if OMPT_SUPPORT && OMPT_OPTIONAL
3760     if (ompt_enabled.enabled) {
3761       ompt_frame->enter_frame = ompt_data_none;
3762     }
3763 #endif
3764 
3765   } else if (packed_reduction_method == empty_reduce_block) {
3766 
3767     OMPT_REDUCTION_END;
3768 
3769 // usage: if team size==1, no synchronization is required (Intel platforms only)
3770 
3771 // TODO: implicit barrier: should be exposed
3772 #if OMPT_SUPPORT
3773     ompt_frame_t *ompt_frame;
3774     if (ompt_enabled.enabled) {
3775       __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
3776       if (ompt_frame->enter_frame.ptr == NULL)
3777         ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
3778       OMPT_STORE_RETURN_ADDRESS(global_tid);
3779     }
3780 #endif
3781 #if USE_ITT_NOTIFY
3782     __kmp_threads[global_tid]->th.th_ident = loc;
3783 #endif
3784     __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
3785 #if OMPT_SUPPORT && OMPT_OPTIONAL
3786     if (ompt_enabled.enabled) {
3787       ompt_frame->enter_frame = ompt_data_none;
3788     }
3789 #endif
3790 
3791   } else if (packed_reduction_method == atomic_reduce_block) {
3792 
3793 #if OMPT_SUPPORT
3794     ompt_frame_t *ompt_frame;
3795     if (ompt_enabled.enabled) {
3796       __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
3797       if (ompt_frame->enter_frame.ptr == NULL)
3798         ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
3799       OMPT_STORE_RETURN_ADDRESS(global_tid);
3800     }
3801 #endif
3802 // TODO: implicit barrier: should be exposed
3803 #if USE_ITT_NOTIFY
3804     __kmp_threads[global_tid]->th.th_ident = loc;
3805 #endif
3806     __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
3807 #if OMPT_SUPPORT && OMPT_OPTIONAL
3808     if (ompt_enabled.enabled) {
3809       ompt_frame->enter_frame = ompt_data_none;
3810     }
3811 #endif
3812 
3813   } else if (TEST_REDUCTION_METHOD(packed_reduction_method,
3814                                    tree_reduce_block)) {
3815 
3816     // only master executes here (master releases all other workers)
3817     __kmp_end_split_barrier(UNPACK_REDUCTION_BARRIER(packed_reduction_method),
3818                             global_tid);
3819 
3820   } else {
3821 
3822     // should never reach this block
3823     KMP_ASSERT(0); // "unexpected method"
3824   }
3825   if (teams_swapped) {
3826     __kmp_restore_swapped_teams(th, team, task_state);
3827   }
3828 
3829   if (__kmp_env_consistency_check)
3830     __kmp_pop_sync(global_tid, ct_reduce, loc);
3831 
3832   KA_TRACE(10, ("__kmpc_end_reduce() exit: called T#%d: method %08x\n",
3833                 global_tid, packed_reduction_method));
3834 
3835   return;
3836 }
3837 
3838 #undef __KMP_GET_REDUCTION_METHOD
3839 #undef __KMP_SET_REDUCTION_METHOD
3840 
3841 /* end of interface to fast scalable reduce routines */
3842 
3843 kmp_uint64 __kmpc_get_taskid() {
3844 
3845   kmp_int32 gtid;
3846   kmp_info_t *thread;
3847 
3848   gtid = __kmp_get_gtid();
3849   if (gtid < 0) {
3850     return 0;
3851   }
3852   thread = __kmp_thread_from_gtid(gtid);
3853   return thread->th.th_current_task->td_task_id;
3854 
3855 } // __kmpc_get_taskid
3856 
3857 kmp_uint64 __kmpc_get_parent_taskid() {
3858 
3859   kmp_int32 gtid;
3860   kmp_info_t *thread;
3861   kmp_taskdata_t *parent_task;
3862 
3863   gtid = __kmp_get_gtid();
3864   if (gtid < 0) {
3865     return 0;
3866   }
3867   thread = __kmp_thread_from_gtid(gtid);
3868   parent_task = thread->th.th_current_task->td_parent;
3869   return (parent_task == NULL ? 0 : parent_task->td_task_id);
3870 
3871 } // __kmpc_get_parent_taskid
3872 
3873 /*!
3874 @ingroup WORK_SHARING
3875 @param loc  source location information.
3876 @param gtid  global thread number.
3877 @param num_dims  number of associated doacross loops.
3878 @param dims  info on loops bounds.
3879 
3880 Initialize doacross loop information.
3881 Expect compiler send us inclusive bounds,
3882 e.g. for(i=2;i<9;i+=2) lo=2, up=8, st=2.
3883 */
3884 void __kmpc_doacross_init(ident_t *loc, int gtid, int num_dims,
3885                           const struct kmp_dim *dims) {
3886   int j, idx;
3887   kmp_int64 last, trace_count;
3888   kmp_info_t *th = __kmp_threads[gtid];
3889   kmp_team_t *team = th->th.th_team;
3890   kmp_uint32 *flags;
3891   kmp_disp_t *pr_buf = th->th.th_dispatch;
3892   dispatch_shared_info_t *sh_buf;
3893 
3894   KA_TRACE(
3895       20,
3896       ("__kmpc_doacross_init() enter: called T#%d, num dims %d, active %d\n",
3897        gtid, num_dims, !team->t.t_serialized));
3898   KMP_DEBUG_ASSERT(dims != NULL);
3899   KMP_DEBUG_ASSERT(num_dims > 0);
3900 
3901   if (team->t.t_serialized) {
3902     KA_TRACE(20, ("__kmpc_doacross_init() exit: serialized team\n"));
3903     return; // no dependencies if team is serialized
3904   }
3905   KMP_DEBUG_ASSERT(team->t.t_nproc > 1);
3906   idx = pr_buf->th_doacross_buf_idx++; // Increment index of shared buffer for
3907   // the next loop
3908   sh_buf = &team->t.t_disp_buffer[idx % __kmp_dispatch_num_buffers];
3909 
3910   // Save bounds info into allocated private buffer
3911   KMP_DEBUG_ASSERT(pr_buf->th_doacross_info == NULL);
3912   pr_buf->th_doacross_info = (kmp_int64 *)__kmp_thread_malloc(
3913       th, sizeof(kmp_int64) * (4 * num_dims + 1));
3914   KMP_DEBUG_ASSERT(pr_buf->th_doacross_info != NULL);
3915   pr_buf->th_doacross_info[0] =
3916       (kmp_int64)num_dims; // first element is number of dimensions
3917   // Save also address of num_done in order to access it later without knowing
3918   // the buffer index
3919   pr_buf->th_doacross_info[1] = (kmp_int64)&sh_buf->doacross_num_done;
3920   pr_buf->th_doacross_info[2] = dims[0].lo;
3921   pr_buf->th_doacross_info[3] = dims[0].up;
3922   pr_buf->th_doacross_info[4] = dims[0].st;
3923   last = 5;
3924   for (j = 1; j < num_dims; ++j) {
3925     kmp_int64
3926         range_length; // To keep ranges of all dimensions but the first dims[0]
3927     if (dims[j].st == 1) { // most common case
3928       // AC: should we care of ranges bigger than LLONG_MAX? (not for now)
3929       range_length = dims[j].up - dims[j].lo + 1;
3930     } else {
3931       if (dims[j].st > 0) {
3932         KMP_DEBUG_ASSERT(dims[j].up > dims[j].lo);
3933         range_length = (kmp_uint64)(dims[j].up - dims[j].lo) / dims[j].st + 1;
3934       } else { // negative increment
3935         KMP_DEBUG_ASSERT(dims[j].lo > dims[j].up);
3936         range_length =
3937             (kmp_uint64)(dims[j].lo - dims[j].up) / (-dims[j].st) + 1;
3938       }
3939     }
3940     pr_buf->th_doacross_info[last++] = range_length;
3941     pr_buf->th_doacross_info[last++] = dims[j].lo;
3942     pr_buf->th_doacross_info[last++] = dims[j].up;
3943     pr_buf->th_doacross_info[last++] = dims[j].st;
3944   }
3945 
3946   // Compute total trip count.
3947   // Start with range of dims[0] which we don't need to keep in the buffer.
3948   if (dims[0].st == 1) { // most common case
3949     trace_count = dims[0].up - dims[0].lo + 1;
3950   } else if (dims[0].st > 0) {
3951     KMP_DEBUG_ASSERT(dims[0].up > dims[0].lo);
3952     trace_count = (kmp_uint64)(dims[0].up - dims[0].lo) / dims[0].st + 1;
3953   } else { // negative increment
3954     KMP_DEBUG_ASSERT(dims[0].lo > dims[0].up);
3955     trace_count = (kmp_uint64)(dims[0].lo - dims[0].up) / (-dims[0].st) + 1;
3956   }
3957   for (j = 1; j < num_dims; ++j) {
3958     trace_count *= pr_buf->th_doacross_info[4 * j + 1]; // use kept ranges
3959   }
3960   KMP_DEBUG_ASSERT(trace_count > 0);
3961 
3962   // Check if shared buffer is not occupied by other loop (idx -
3963   // __kmp_dispatch_num_buffers)
3964   if (idx != sh_buf->doacross_buf_idx) {
3965     // Shared buffer is occupied, wait for it to be free
3966     __kmp_wait_4((volatile kmp_uint32 *)&sh_buf->doacross_buf_idx, idx,
3967                  __kmp_eq_4, NULL);
3968   }
3969 #if KMP_32_BIT_ARCH
3970   // Check if we are the first thread. After the CAS the first thread gets 0,
3971   // others get 1 if initialization is in progress, allocated pointer otherwise.
3972   // Treat pointer as volatile integer (value 0 or 1) until memory is allocated.
3973   flags = (kmp_uint32 *)KMP_COMPARE_AND_STORE_RET32(
3974       (volatile kmp_int32 *)&sh_buf->doacross_flags, NULL, 1);
3975 #else
3976   flags = (kmp_uint32 *)KMP_COMPARE_AND_STORE_RET64(
3977       (volatile kmp_int64 *)&sh_buf->doacross_flags, NULL, 1LL);
3978 #endif
3979   if (flags == NULL) {
3980     // we are the first thread, allocate the array of flags
3981     size_t size = trace_count / 8 + 8; // in bytes, use single bit per iteration
3982     flags = (kmp_uint32 *)__kmp_thread_calloc(th, size, 1);
3983     KMP_MB();
3984     sh_buf->doacross_flags = flags;
3985   } else if (flags == (kmp_uint32 *)1) {
3986 #if KMP_32_BIT_ARCH
3987     // initialization is still in progress, need to wait
3988     while (*(volatile kmp_int32 *)&sh_buf->doacross_flags == 1)
3989 #else
3990     while (*(volatile kmp_int64 *)&sh_buf->doacross_flags == 1LL)
3991 #endif
3992       KMP_YIELD(TRUE);
3993     KMP_MB();
3994   } else {
3995     KMP_MB();
3996   }
3997   KMP_DEBUG_ASSERT(sh_buf->doacross_flags > (kmp_uint32 *)1); // check ptr value
3998   pr_buf->th_doacross_flags =
3999       sh_buf->doacross_flags; // save private copy in order to not
4000   // touch shared buffer on each iteration
4001   KA_TRACE(20, ("__kmpc_doacross_init() exit: T#%d\n", gtid));
4002 }
4003 
4004 void __kmpc_doacross_wait(ident_t *loc, int gtid, const kmp_int64 *vec) {
4005   kmp_int32 shft, num_dims, i;
4006   kmp_uint32 flag;
4007   kmp_int64 iter_number; // iteration number of "collapsed" loop nest
4008   kmp_info_t *th = __kmp_threads[gtid];
4009   kmp_team_t *team = th->th.th_team;
4010   kmp_disp_t *pr_buf;
4011   kmp_int64 lo, up, st;
4012 
4013   KA_TRACE(20, ("__kmpc_doacross_wait() enter: called T#%d\n", gtid));
4014   if (team->t.t_serialized) {
4015     KA_TRACE(20, ("__kmpc_doacross_wait() exit: serialized team\n"));
4016     return; // no dependencies if team is serialized
4017   }
4018 
4019   // calculate sequential iteration number and check out-of-bounds condition
4020   pr_buf = th->th.th_dispatch;
4021   KMP_DEBUG_ASSERT(pr_buf->th_doacross_info != NULL);
4022   num_dims = pr_buf->th_doacross_info[0];
4023   lo = pr_buf->th_doacross_info[2];
4024   up = pr_buf->th_doacross_info[3];
4025   st = pr_buf->th_doacross_info[4];
4026   if (st == 1) { // most common case
4027     if (vec[0] < lo || vec[0] > up) {
4028       KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
4029                     "bounds [%lld,%lld]\n",
4030                     gtid, vec[0], lo, up));
4031       return;
4032     }
4033     iter_number = vec[0] - lo;
4034   } else if (st > 0) {
4035     if (vec[0] < lo || vec[0] > up) {
4036       KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
4037                     "bounds [%lld,%lld]\n",
4038                     gtid, vec[0], lo, up));
4039       return;
4040     }
4041     iter_number = (kmp_uint64)(vec[0] - lo) / st;
4042   } else { // negative increment
4043     if (vec[0] > lo || vec[0] < up) {
4044       KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
4045                     "bounds [%lld,%lld]\n",
4046                     gtid, vec[0], lo, up));
4047       return;
4048     }
4049     iter_number = (kmp_uint64)(lo - vec[0]) / (-st);
4050   }
4051   for (i = 1; i < num_dims; ++i) {
4052     kmp_int64 iter, ln;
4053     kmp_int32 j = i * 4;
4054     ln = pr_buf->th_doacross_info[j + 1];
4055     lo = pr_buf->th_doacross_info[j + 2];
4056     up = pr_buf->th_doacross_info[j + 3];
4057     st = pr_buf->th_doacross_info[j + 4];
4058     if (st == 1) {
4059       if (vec[i] < lo || vec[i] > up) {
4060         KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
4061                       "bounds [%lld,%lld]\n",
4062                       gtid, vec[i], lo, up));
4063         return;
4064       }
4065       iter = vec[i] - lo;
4066     } else if (st > 0) {
4067       if (vec[i] < lo || vec[i] > up) {
4068         KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
4069                       "bounds [%lld,%lld]\n",
4070                       gtid, vec[i], lo, up));
4071         return;
4072       }
4073       iter = (kmp_uint64)(vec[i] - lo) / st;
4074     } else { // st < 0
4075       if (vec[i] > lo || vec[i] < up) {
4076         KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
4077                       "bounds [%lld,%lld]\n",
4078                       gtid, vec[i], lo, up));
4079         return;
4080       }
4081       iter = (kmp_uint64)(lo - vec[i]) / (-st);
4082     }
4083     iter_number = iter + ln * iter_number;
4084   }
4085   shft = iter_number % 32; // use 32-bit granularity
4086   iter_number >>= 5; // divided by 32
4087   flag = 1 << shft;
4088   while ((flag & pr_buf->th_doacross_flags[iter_number]) == 0) {
4089     KMP_YIELD(TRUE);
4090   }
4091   KMP_MB();
4092   KA_TRACE(20,
4093            ("__kmpc_doacross_wait() exit: T#%d wait for iter %lld completed\n",
4094             gtid, (iter_number << 5) + shft));
4095 }
4096 
4097 void __kmpc_doacross_post(ident_t *loc, int gtid, const kmp_int64 *vec) {
4098   kmp_int32 shft, num_dims, i;
4099   kmp_uint32 flag;
4100   kmp_int64 iter_number; // iteration number of "collapsed" loop nest
4101   kmp_info_t *th = __kmp_threads[gtid];
4102   kmp_team_t *team = th->th.th_team;
4103   kmp_disp_t *pr_buf;
4104   kmp_int64 lo, st;
4105 
4106   KA_TRACE(20, ("__kmpc_doacross_post() enter: called T#%d\n", gtid));
4107   if (team->t.t_serialized) {
4108     KA_TRACE(20, ("__kmpc_doacross_post() exit: serialized team\n"));
4109     return; // no dependencies if team is serialized
4110   }
4111 
4112   // calculate sequential iteration number (same as in "wait" but no
4113   // out-of-bounds checks)
4114   pr_buf = th->th.th_dispatch;
4115   KMP_DEBUG_ASSERT(pr_buf->th_doacross_info != NULL);
4116   num_dims = pr_buf->th_doacross_info[0];
4117   lo = pr_buf->th_doacross_info[2];
4118   st = pr_buf->th_doacross_info[4];
4119   if (st == 1) { // most common case
4120     iter_number = vec[0] - lo;
4121   } else if (st > 0) {
4122     iter_number = (kmp_uint64)(vec[0] - lo) / st;
4123   } else { // negative increment
4124     iter_number = (kmp_uint64)(lo - vec[0]) / (-st);
4125   }
4126   for (i = 1; i < num_dims; ++i) {
4127     kmp_int64 iter, ln;
4128     kmp_int32 j = i * 4;
4129     ln = pr_buf->th_doacross_info[j + 1];
4130     lo = pr_buf->th_doacross_info[j + 2];
4131     st = pr_buf->th_doacross_info[j + 4];
4132     if (st == 1) {
4133       iter = vec[i] - lo;
4134     } else if (st > 0) {
4135       iter = (kmp_uint64)(vec[i] - lo) / st;
4136     } else { // st < 0
4137       iter = (kmp_uint64)(lo - vec[i]) / (-st);
4138     }
4139     iter_number = iter + ln * iter_number;
4140   }
4141   shft = iter_number % 32; // use 32-bit granularity
4142   iter_number >>= 5; // divided by 32
4143   flag = 1 << shft;
4144   KMP_MB();
4145   if ((flag & pr_buf->th_doacross_flags[iter_number]) == 0)
4146     KMP_TEST_THEN_OR32(&pr_buf->th_doacross_flags[iter_number], flag);
4147   KA_TRACE(20, ("__kmpc_doacross_post() exit: T#%d iter %lld posted\n", gtid,
4148                 (iter_number << 5) + shft));
4149 }
4150 
4151 void __kmpc_doacross_fini(ident_t *loc, int gtid) {
4152   kmp_int32 num_done;
4153   kmp_info_t *th = __kmp_threads[gtid];
4154   kmp_team_t *team = th->th.th_team;
4155   kmp_disp_t *pr_buf = th->th.th_dispatch;
4156 
4157   KA_TRACE(20, ("__kmpc_doacross_fini() enter: called T#%d\n", gtid));
4158   if (team->t.t_serialized) {
4159     KA_TRACE(20, ("__kmpc_doacross_fini() exit: serialized team %p\n", team));
4160     return; // nothing to do
4161   }
4162   num_done = KMP_TEST_THEN_INC32((kmp_int32 *)pr_buf->th_doacross_info[1]) + 1;
4163   if (num_done == th->th.th_team_nproc) {
4164     // we are the last thread, need to free shared resources
4165     int idx = pr_buf->th_doacross_buf_idx - 1;
4166     dispatch_shared_info_t *sh_buf =
4167         &team->t.t_disp_buffer[idx % __kmp_dispatch_num_buffers];
4168     KMP_DEBUG_ASSERT(pr_buf->th_doacross_info[1] ==
4169                      (kmp_int64)&sh_buf->doacross_num_done);
4170     KMP_DEBUG_ASSERT(num_done == sh_buf->doacross_num_done);
4171     KMP_DEBUG_ASSERT(idx == sh_buf->doacross_buf_idx);
4172     __kmp_thread_free(th, CCAST(kmp_uint32 *, sh_buf->doacross_flags));
4173     sh_buf->doacross_flags = NULL;
4174     sh_buf->doacross_num_done = 0;
4175     sh_buf->doacross_buf_idx +=
4176         __kmp_dispatch_num_buffers; // free buffer for future re-use
4177   }
4178   // free private resources (need to keep buffer index forever)
4179   pr_buf->th_doacross_flags = NULL;
4180   __kmp_thread_free(th, (void *)pr_buf->th_doacross_info);
4181   pr_buf->th_doacross_info = NULL;
4182   KA_TRACE(20, ("__kmpc_doacross_fini() exit: T#%d\n", gtid));
4183 }
4184 
4185 /* omp_alloc/omp_free only defined for C/C++, not for Fortran */
4186 void *omp_alloc(size_t size, omp_allocator_handle_t allocator) {
4187   return __kmpc_alloc(__kmp_entry_gtid(), size, allocator);
4188 }
4189 
4190 void omp_free(void *ptr, omp_allocator_handle_t allocator) {
4191   __kmpc_free(__kmp_entry_gtid(), ptr, allocator);
4192 }
4193 
4194 int __kmpc_get_target_offload(void) {
4195   if (!__kmp_init_serial) {
4196     __kmp_serial_initialize();
4197   }
4198   return __kmp_target_offload;
4199 }
4200 
4201 int __kmpc_pause_resource(kmp_pause_status_t level) {
4202   if (!__kmp_init_serial) {
4203     return 1; // Can't pause if runtime is not initialized
4204   }
4205   return __kmp_pause_resource(level);
4206 }
4207