xref: /freebsd/contrib/llvm-project/openmp/runtime/src/kmp_csupport.cpp (revision 77013d11e6483b970af25e13c9b892075742f7e5)
1 /*
2  * kmp_csupport.cpp -- kfront linkage support for OpenMP.
3  */
4 
5 //===----------------------------------------------------------------------===//
6 //
7 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8 // See https://llvm.org/LICENSE.txt for license information.
9 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #define __KMP_IMP
14 #include "omp.h" /* extern "C" declarations of user-visible routines */
15 #include "kmp.h"
16 #include "kmp_error.h"
17 #include "kmp_i18n.h"
18 #include "kmp_itt.h"
19 #include "kmp_lock.h"
20 #include "kmp_stats.h"
21 #include "ompt-specific.h"
22 
23 #define MAX_MESSAGE 512
24 
25 // flags will be used in the future, e.g. to implement openmp_strict library
26 // restrictions
27 
28 /*!
29  * @ingroup STARTUP_SHUTDOWN
30  * @param loc   in   source location information
31  * @param flags in   for future use (currently ignored)
32  *
33  * Initialize the runtime library. This call is optional; if it is not made then
34  * it will be implicitly called by attempts to use other library functions.
35  */
36 void __kmpc_begin(ident_t *loc, kmp_int32 flags) {
37   // By default __kmpc_begin() is no-op.
38   char *env;
39   if ((env = getenv("KMP_INITIAL_THREAD_BIND")) != NULL &&
40       __kmp_str_match_true(env)) {
41     __kmp_middle_initialize();
42     KC_TRACE(10, ("__kmpc_begin: middle initialization called\n"));
43   } else if (__kmp_ignore_mppbeg() == FALSE) {
44     // By default __kmp_ignore_mppbeg() returns TRUE.
45     __kmp_internal_begin();
46     KC_TRACE(10, ("__kmpc_begin: called\n"));
47   }
48 }
49 
50 /*!
51  * @ingroup STARTUP_SHUTDOWN
52  * @param loc source location information
53  *
54  * Shut down the runtime library. This call is also optional; even if it is
55  * made, it will do nothing unless the `KMP_IGNORE_MPPEND` environment
56  * variable is set to zero.
57  */
58 void __kmpc_end(ident_t *loc) {
59   // By default, __kmp_ignore_mppend() returns TRUE which makes __kmpc_end()
60   // call no-op. However, this can be overridden with KMP_IGNORE_MPPEND
61   // environment variable. If KMP_IGNORE_MPPEND is 0, __kmp_ignore_mppend()
62   // returns FALSE and __kmpc_end() will unregister this root (it can cause
63   // library shut down).
64   if (__kmp_ignore_mppend() == FALSE) {
65     KC_TRACE(10, ("__kmpc_end: called\n"));
66     KA_TRACE(30, ("__kmpc_end\n"));
67 
68     __kmp_internal_end_thread(-1);
69   }
70 #if KMP_OS_WINDOWS && OMPT_SUPPORT
71   // Normal exit process on Windows does not allow worker threads of the final
72   // parallel region to finish reporting their events, so shutting down the
73   // library here fixes the issue at least for the cases where __kmpc_end() is
74   // placed properly.
75   if (ompt_enabled.enabled)
76     __kmp_internal_end_library(__kmp_gtid_get_specific());
77 #endif
78 }
79 
80 /*!
81 @ingroup THREAD_STATES
82 @param loc Source location information.
83 @return The global thread index of the active thread.
84 
85 This function can be called in any context.
86 
87 If the runtime has only been entered at the outermost level from a
88 single (necessarily non-OpenMP<sup>*</sup>) thread, then the thread number is
89 that which would be returned by omp_get_thread_num() in the outermost
90 active parallel construct. (Or zero if there is no active parallel
91 construct, since the master thread is necessarily thread zero).
92 
93 If multiple non-OpenMP threads all enter an OpenMP construct then this
94 will be a unique thread identifier among all the threads created by
95 the OpenMP runtime (but the value cannot be defined in terms of
96 OpenMP thread ids returned by omp_get_thread_num()).
97 */
98 kmp_int32 __kmpc_global_thread_num(ident_t *loc) {
99   kmp_int32 gtid = __kmp_entry_gtid();
100 
101   KC_TRACE(10, ("__kmpc_global_thread_num: T#%d\n", gtid));
102 
103   return gtid;
104 }
105 
106 /*!
107 @ingroup THREAD_STATES
108 @param loc Source location information.
109 @return The number of threads under control of the OpenMP<sup>*</sup> runtime
110 
111 This function can be called in any context.
112 It returns the total number of threads under the control of the OpenMP runtime.
113 That is not a number that can be determined by any OpenMP standard calls, since
114 the library may be called from more than one non-OpenMP thread, and this
115 reflects the total over all such calls. Similarly, the runtime maintains
116 underlying threads even when they are not active (since the cost of creating
117 and destroying OS threads is high); this call counts all such threads even if
118 they are not waiting for work.
119 */
120 kmp_int32 __kmpc_global_num_threads(ident_t *loc) {
121   KC_TRACE(10,
122            ("__kmpc_global_num_threads: num_threads = %d\n", __kmp_all_nth));
123 
124   return TCR_4(__kmp_all_nth);
125 }
126 
127 /*!
128 @ingroup THREAD_STATES
129 @param loc Source location information.
130 @return The thread number of the calling thread in the innermost active parallel
131 construct.
132 */
133 kmp_int32 __kmpc_bound_thread_num(ident_t *loc) {
134   KC_TRACE(10, ("__kmpc_bound_thread_num: called\n"));
135   return __kmp_tid_from_gtid(__kmp_entry_gtid());
136 }
137 
138 /*!
139 @ingroup THREAD_STATES
140 @param loc Source location information.
141 @return The number of threads in the innermost active parallel construct.
142 */
143 kmp_int32 __kmpc_bound_num_threads(ident_t *loc) {
144   KC_TRACE(10, ("__kmpc_bound_num_threads: called\n"));
145 
146   return __kmp_entry_thread()->th.th_team->t.t_nproc;
147 }
148 
149 /*!
150  * @ingroup DEPRECATED
151  * @param loc location description
152  *
153  * This function need not be called. It always returns TRUE.
154  */
155 kmp_int32 __kmpc_ok_to_fork(ident_t *loc) {
156 #ifndef KMP_DEBUG
157 
158   return TRUE;
159 
160 #else
161 
162   const char *semi2;
163   const char *semi3;
164   int line_no;
165 
166   if (__kmp_par_range == 0) {
167     return TRUE;
168   }
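  // Editorial note (illustrative, not from the original source): the parsing
  // below assumes the usual ident_t::psource layout of semicolon-separated
  // fields, e.g. ";file.c;routine;42;13;;". The file name is the field before
  // the second ';', the routine name the field before the third ';', and the
  // line number is the field that follows it.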
169   semi2 = loc->psource;
170   if (semi2 == NULL) {
171     return TRUE;
172   }
173   semi2 = strchr(semi2, ';');
174   if (semi2 == NULL) {
175     return TRUE;
176   }
177   semi2 = strchr(semi2 + 1, ';');
178   if (semi2 == NULL) {
179     return TRUE;
180   }
181   if (__kmp_par_range_filename[0]) {
182     const char *name = semi2 - 1;
183     while ((name > loc->psource) && (*name != '/') && (*name != ';')) {
184       name--;
185     }
186     if ((*name == '/') || (*name == ';')) {
187       name++;
188     }
189     if (strncmp(__kmp_par_range_filename, name, semi2 - name)) {
190       return __kmp_par_range < 0;
191     }
192   }
193   semi3 = strchr(semi2 + 1, ';');
194   if (__kmp_par_range_routine[0]) {
195     if ((semi3 != NULL) && (semi3 > semi2) &&
196         (strncmp(__kmp_par_range_routine, semi2 + 1, semi3 - semi2 - 1))) {
197       return __kmp_par_range < 0;
198     }
199   }
200   if ((semi3 != NULL) && (KMP_SSCANF(semi3 + 1, "%d", &line_no) == 1)) {
201     if ((line_no >= __kmp_par_range_lb) && (line_no <= __kmp_par_range_ub)) {
202       return __kmp_par_range > 0;
203     }
204     return __kmp_par_range < 0;
205   }
206   return TRUE;
207 
208 #endif /* KMP_DEBUG */
209 }
210 
211 /*!
212 @ingroup THREAD_STATES
213 @param loc Source location information.
214 @return 1 if this thread is executing inside an active parallel region, zero if
215 not.
216 */
217 kmp_int32 __kmpc_in_parallel(ident_t *loc) {
218   return __kmp_entry_thread()->th.th_root->r.r_active;
219 }
220 
221 /*!
222 @ingroup PARALLEL
223 @param loc source location information
224 @param global_tid global thread number
225 @param num_threads number of threads requested for this parallel construct
226 
227 Set the number of threads to be used by the next fork spawned by this thread.
228 This call is only required if the parallel construct has a `num_threads` clause.
229 */
230 void __kmpc_push_num_threads(ident_t *loc, kmp_int32 global_tid,
231                              kmp_int32 num_threads) {
232   KA_TRACE(20, ("__kmpc_push_num_threads: enter T#%d num_threads=%d\n",
233                 global_tid, num_threads));
234   __kmp_assert_valid_gtid(global_tid);
235   __kmp_push_num_threads(loc, global_tid, num_threads);
236 }
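// Illustrative sketch (editorial addition, not in the original source): a
// compiler would typically lower
//     #pragma omp parallel num_threads(4)
// into calls like the following, where `loc` is the compiler-emitted ident_t
// and __outlined_fn is a hypothetical routine with the kmpc_micro signature:
//
//   kmp_int32 gtid = __kmpc_global_thread_num(&loc);
//   __kmpc_push_num_threads(&loc, gtid, 4);
//   __kmpc_fork_call(&loc, 0, (kmpc_micro)__outlined_fn);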
237 
238 void __kmpc_pop_num_threads(ident_t *loc, kmp_int32 global_tid) {
239   KA_TRACE(20, ("__kmpc_pop_num_threads: enter\n"));
240   /* the num_threads are automatically popped */
241 }
242 
243 void __kmpc_push_proc_bind(ident_t *loc, kmp_int32 global_tid,
244                            kmp_int32 proc_bind) {
245   KA_TRACE(20, ("__kmpc_push_proc_bind: enter T#%d proc_bind=%d\n", global_tid,
246                 proc_bind));
247   __kmp_assert_valid_gtid(global_tid);
248   __kmp_push_proc_bind(loc, global_tid, (kmp_proc_bind_t)proc_bind);
249 }
250 
251 /*!
252 @ingroup PARALLEL
253 @param loc  source location information
254 @param argc  total number of arguments in the ellipsis
255 @param microtask  pointer to callback routine consisting of outlined parallel
256 construct
257 @param ...  pointers to shared variables that aren't global
258 
259 Do the actual fork and call the microtask in the relevant number of threads.
260 */
261 void __kmpc_fork_call(ident_t *loc, kmp_int32 argc, kmpc_micro microtask, ...) {
262   int gtid = __kmp_entry_gtid();
263 
264 #if (KMP_STATS_ENABLED)
265   // If we were in a serial region, then stop the serial timer, record
266   // the event, and start parallel region timer
267   stats_state_e previous_state = KMP_GET_THREAD_STATE();
268   if (previous_state == stats_state_e::SERIAL_REGION) {
269     KMP_EXCHANGE_PARTITIONED_TIMER(OMP_parallel_overhead);
270   } else {
271     KMP_PUSH_PARTITIONED_TIMER(OMP_parallel_overhead);
272   }
273   int inParallel = __kmpc_in_parallel(loc);
274   if (inParallel) {
275     KMP_COUNT_BLOCK(OMP_NESTED_PARALLEL);
276   } else {
277     KMP_COUNT_BLOCK(OMP_PARALLEL);
278   }
279 #endif
280 
281   // maybe saving thr_state is enough here
282   {
283     va_list ap;
284     va_start(ap, microtask);
285 
286 #if OMPT_SUPPORT
287     ompt_frame_t *ompt_frame;
288     if (ompt_enabled.enabled) {
289       kmp_info_t *master_th = __kmp_threads[gtid];
290       kmp_team_t *parent_team = master_th->th.th_team;
291       ompt_lw_taskteam_t *lwt = parent_team->t.ompt_serialized_team_info;
292       if (lwt)
293         ompt_frame = &(lwt->ompt_task_info.frame);
294       else {
295         int tid = __kmp_tid_from_gtid(gtid);
296         ompt_frame = &(
297             parent_team->t.t_implicit_task_taskdata[tid].ompt_task_info.frame);
298       }
299       ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
300     }
301     OMPT_STORE_RETURN_ADDRESS(gtid);
302 #endif
303 
304 #if INCLUDE_SSC_MARKS
305     SSC_MARK_FORKING();
306 #endif
307     __kmp_fork_call(loc, gtid, fork_context_intel, argc,
308                     VOLATILE_CAST(microtask_t) microtask, // "wrapped" task
309                     VOLATILE_CAST(launch_t) __kmp_invoke_task_func,
310                     kmp_va_addr_of(ap));
311 #if INCLUDE_SSC_MARKS
312     SSC_MARK_JOINING();
313 #endif
314     __kmp_join_call(loc, gtid
315 #if OMPT_SUPPORT
316                     ,
317                     fork_context_intel
318 #endif
319                     );
320 
321     va_end(ap);
322   }
323 
324 #if KMP_STATS_ENABLED
325   if (previous_state == stats_state_e::SERIAL_REGION) {
326     KMP_EXCHANGE_PARTITIONED_TIMER(OMP_serial);
327     KMP_SET_THREAD_STATE(previous_state);
328   } else {
329     KMP_POP_PARTITIONED_TIMER();
330   }
331 #endif // KMP_STATS_ENABLED
332 }
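// Illustrative sketch (editorial addition, not in the original source): for
//     #pragma omp parallel shared(x)
// a compiler typically emits a hypothetical outlined routine __outlined_fn
// with the kmpc_micro signature and passes each non-global shared variable by
// address after the fixed arguments:
//
//   static void __outlined_fn(kmp_int32 *gtid, kmp_int32 *btid, int *x) {
//     // ... body of the parallel region, using *x ...
//   }
//   ...
//   __kmpc_fork_call(&loc, 1, (kmpc_micro)__outlined_fn, &x);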
333 
334 /*!
335 @ingroup PARALLEL
336 @param loc source location information
337 @param global_tid global thread number
338 @param num_teams number of teams requested for the teams construct
339 @param num_threads number of threads per team requested for the teams construct
340 
341 Set the number of teams to be used by the teams construct.
342 This call is only required if the teams construct has a `num_teams` clause
343 or a `thread_limit` clause (or both).
344 */
345 void __kmpc_push_num_teams(ident_t *loc, kmp_int32 global_tid,
346                            kmp_int32 num_teams, kmp_int32 num_threads) {
347   KA_TRACE(20,
348            ("__kmpc_push_num_teams: enter T#%d num_teams=%d num_threads=%d\n",
349             global_tid, num_teams, num_threads));
350   __kmp_assert_valid_gtid(global_tid);
351   __kmp_push_num_teams(loc, global_tid, num_teams, num_threads);
352 }
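// Illustrative sketch (editorial addition, not in the original source):
//     #pragma omp teams num_teams(2) thread_limit(4)
// would typically be lowered to a push followed by the teams fork, where
// __teams_outlined_fn is a hypothetical outlined routine:
//
//   kmp_int32 gtid = __kmpc_global_thread_num(&loc);
//   __kmpc_push_num_teams(&loc, gtid, /*num_teams=*/2, /*num_threads=*/4);
//   __kmpc_fork_teams(&loc, 0, (kmpc_micro)__teams_outlined_fn);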
353 
354 /*!
355 @ingroup PARALLEL
356 @param loc  source location information
357 @param argc  total number of arguments in the ellipsis
358 @param microtask  pointer to callback routine consisting of outlined teams
359 construct
360 @param ...  pointers to shared variables that aren't global
361 
362 Do the actual fork and call the microtask in the relevant number of threads.
363 */
364 void __kmpc_fork_teams(ident_t *loc, kmp_int32 argc, kmpc_micro microtask,
365                        ...) {
366   int gtid = __kmp_entry_gtid();
367   kmp_info_t *this_thr = __kmp_threads[gtid];
368   va_list ap;
369   va_start(ap, microtask);
370 
371 #if KMP_STATS_ENABLED
372   KMP_COUNT_BLOCK(OMP_TEAMS);
373   stats_state_e previous_state = KMP_GET_THREAD_STATE();
374   if (previous_state == stats_state_e::SERIAL_REGION) {
375     KMP_EXCHANGE_PARTITIONED_TIMER(OMP_teams_overhead);
376   } else {
377     KMP_PUSH_PARTITIONED_TIMER(OMP_teams_overhead);
378   }
379 #endif
380 
381   // remember teams entry point and nesting level
382   this_thr->th.th_teams_microtask = microtask;
383   this_thr->th.th_teams_level =
384       this_thr->th.th_team->t.t_level; // AC: can be >0 on host
385 
386 #if OMPT_SUPPORT
387   kmp_team_t *parent_team = this_thr->th.th_team;
388   int tid = __kmp_tid_from_gtid(gtid);
389   if (ompt_enabled.enabled) {
390     parent_team->t.t_implicit_task_taskdata[tid]
391         .ompt_task_info.frame.enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
392   }
393   OMPT_STORE_RETURN_ADDRESS(gtid);
394 #endif
395 
396   // check if __kmpc_push_num_teams was called; set the default number of teams
397   // otherwise
398   if (this_thr->th.th_teams_size.nteams == 0) {
399     __kmp_push_num_teams(loc, gtid, 0, 0);
400   }
401   KMP_DEBUG_ASSERT(this_thr->th.th_set_nproc >= 1);
402   KMP_DEBUG_ASSERT(this_thr->th.th_teams_size.nteams >= 1);
403   KMP_DEBUG_ASSERT(this_thr->th.th_teams_size.nth >= 1);
404 
405   __kmp_fork_call(
406       loc, gtid, fork_context_intel, argc,
407       VOLATILE_CAST(microtask_t) __kmp_teams_master, // "wrapped" task
408       VOLATILE_CAST(launch_t) __kmp_invoke_teams_master, kmp_va_addr_of(ap));
409   __kmp_join_call(loc, gtid
410 #if OMPT_SUPPORT
411                   ,
412                   fork_context_intel
413 #endif
414                   );
415 
416   // Pop current CG root off list
417   KMP_DEBUG_ASSERT(this_thr->th.th_cg_roots);
418   kmp_cg_root_t *tmp = this_thr->th.th_cg_roots;
419   this_thr->th.th_cg_roots = tmp->up;
420   KA_TRACE(100, ("__kmpc_fork_teams: Thread %p popping node %p and moving up"
421                  " to node %p. cg_nthreads was %d\n",
422                  this_thr, tmp, this_thr->th.th_cg_roots, tmp->cg_nthreads));
423   KMP_DEBUG_ASSERT(tmp->cg_nthreads);
424   int i = tmp->cg_nthreads--;
425   if (i == 1) { // check if we are the last thread in the CG (not always the case)
426     __kmp_free(tmp);
427   }
428   // Restore current task's thread_limit from CG root
429   KMP_DEBUG_ASSERT(this_thr->th.th_cg_roots);
430   this_thr->th.th_current_task->td_icvs.thread_limit =
431       this_thr->th.th_cg_roots->cg_thread_limit;
432 
433   this_thr->th.th_teams_microtask = NULL;
434   this_thr->th.th_teams_level = 0;
435   *(kmp_int64 *)(&this_thr->th.th_teams_size) = 0L;
436   va_end(ap);
437 #if KMP_STATS_ENABLED
438   if (previous_state == stats_state_e::SERIAL_REGION) {
439     KMP_EXCHANGE_PARTITIONED_TIMER(OMP_serial);
440     KMP_SET_THREAD_STATE(previous_state);
441   } else {
442     KMP_POP_PARTITIONED_TIMER();
443   }
444 #endif // KMP_STATS_ENABLED
445 }
446 
447 // I don't think this function should ever have been exported.
448 // The __kmpc_ prefix was misapplied.  I'm fairly certain that no generated
449 // openmp code ever called it, but it's been exported from the RTL for so
450 // long that I'm afraid to remove the definition.
451 int __kmpc_invoke_task_func(int gtid) { return __kmp_invoke_task_func(gtid); }
452 
453 /*!
454 @ingroup PARALLEL
455 @param loc  source location information
456 @param global_tid  global thread number
457 
458 Enter a serialized parallel construct. This interface is used to handle a
459 conditional parallel region, like this,
460 @code
461 #pragma omp parallel if (condition)
462 @endcode
463 when the condition is false.
464 */
465 void __kmpc_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
466   // The implementation is now in kmp_runtime.cpp so that it can share static
467   // functions with kmp_fork_call since the tasks to be done are similar in
468   // each case.
469   __kmp_assert_valid_gtid(global_tid);
470 #if OMPT_SUPPORT
471   OMPT_STORE_RETURN_ADDRESS(global_tid);
472 #endif
473   __kmp_serialized_parallel(loc, global_tid);
474 }
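// Illustrative sketch (editorial addition, not in the original source): for
//     #pragma omp parallel if (condition)
// a compiler typically tests the condition at run time; when it is true it
// calls __kmpc_fork_call as usual, and when it is false it runs the
// hypothetical outlined body __outlined_fn serially on the current thread,
// bracketed by this pair of calls:
//
//   kmp_int32 gtid = __kmpc_global_thread_num(&loc);
//   __kmpc_serialized_parallel(&loc, gtid);
//   kmp_int32 zero = 0;
//   __outlined_fn(&gtid, &zero /*, addresses of shared variables */);
//   __kmpc_end_serialized_parallel(&loc, gtid);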
475 
476 /*!
477 @ingroup PARALLEL
478 @param loc  source location information
479 @param global_tid  global thread number
480 
481 Leave a serialized parallel construct.
482 */
483 void __kmpc_end_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
484   kmp_internal_control_t *top;
485   kmp_info_t *this_thr;
486   kmp_team_t *serial_team;
487 
488   KC_TRACE(10,
489            ("__kmpc_end_serialized_parallel: called by T#%d\n", global_tid));
490 
491   /* skip all this code for autopar serialized loops since it results in
492      unacceptable overhead */
493   if (loc != NULL && (loc->flags & KMP_IDENT_AUTOPAR))
494     return;
495 
496   // Not autopar code
497   __kmp_assert_valid_gtid(global_tid);
498   if (!TCR_4(__kmp_init_parallel))
499     __kmp_parallel_initialize();
500 
501   __kmp_resume_if_soft_paused();
502 
503   this_thr = __kmp_threads[global_tid];
504   serial_team = this_thr->th.th_serial_team;
505 
506   kmp_task_team_t *task_team = this_thr->th.th_task_team;
507   // we need to wait for the proxy tasks before finishing the thread
508   if (task_team != NULL && task_team->tt.tt_found_proxy_tasks)
509     __kmp_task_team_wait(this_thr, serial_team USE_ITT_BUILD_ARG(NULL));
510 
511   KMP_MB();
512   KMP_DEBUG_ASSERT(serial_team);
513   KMP_ASSERT(serial_team->t.t_serialized);
514   KMP_DEBUG_ASSERT(this_thr->th.th_team == serial_team);
515   KMP_DEBUG_ASSERT(serial_team != this_thr->th.th_root->r.r_root_team);
516   KMP_DEBUG_ASSERT(serial_team->t.t_threads);
517   KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
518 
519 #if OMPT_SUPPORT
520   if (ompt_enabled.enabled &&
521       this_thr->th.ompt_thread_info.state != ompt_state_overhead) {
522     OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame = ompt_data_none;
523     if (ompt_enabled.ompt_callback_implicit_task) {
524       ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
525           ompt_scope_end, NULL, OMPT_CUR_TASK_DATA(this_thr), 1,
526           OMPT_CUR_TASK_INFO(this_thr)->thread_num, ompt_task_implicit);
527     }
528 
529     // reset/clear the task id only after unlinking the task
530     ompt_data_t *parent_task_data;
531     __ompt_get_task_info_internal(1, NULL, &parent_task_data, NULL, NULL, NULL);
532 
533     if (ompt_enabled.ompt_callback_parallel_end) {
534       ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
535           &(serial_team->t.ompt_team_info.parallel_data), parent_task_data,
536           ompt_parallel_invoker_program | ompt_parallel_team,
537           OMPT_LOAD_RETURN_ADDRESS(global_tid));
538     }
539     __ompt_lw_taskteam_unlink(this_thr);
540     this_thr->th.ompt_thread_info.state = ompt_state_overhead;
541   }
542 #endif
543 
544   /* If necessary, pop the internal control stack values and replace the team
545    * values */
546   top = serial_team->t.t_control_stack_top;
547   if (top && top->serial_nesting_level == serial_team->t.t_serialized) {
548     copy_icvs(&serial_team->t.t_threads[0]->th.th_current_task->td_icvs, top);
549     serial_team->t.t_control_stack_top = top->next;
550     __kmp_free(top);
551   }
552 
553   // if( serial_team -> t.t_serialized > 1 )
554   serial_team->t.t_level--;
555 
556   /* pop dispatch buffers stack */
557   KMP_DEBUG_ASSERT(serial_team->t.t_dispatch->th_disp_buffer);
558   {
559     dispatch_private_info_t *disp_buffer =
560         serial_team->t.t_dispatch->th_disp_buffer;
561     serial_team->t.t_dispatch->th_disp_buffer =
562         serial_team->t.t_dispatch->th_disp_buffer->next;
563     __kmp_free(disp_buffer);
564   }
565   this_thr->th.th_def_allocator = serial_team->t.t_def_allocator; // restore
566 
567   --serial_team->t.t_serialized;
568   if (serial_team->t.t_serialized == 0) {
569 
570 /* return to the parallel section */
571 
572 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
573     if (__kmp_inherit_fp_control && serial_team->t.t_fp_control_saved) {
574       __kmp_clear_x87_fpu_status_word();
575       __kmp_load_x87_fpu_control_word(&serial_team->t.t_x87_fpu_control_word);
576       __kmp_load_mxcsr(&serial_team->t.t_mxcsr);
577     }
578 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
579 
580     this_thr->th.th_team = serial_team->t.t_parent;
581     this_thr->th.th_info.ds.ds_tid = serial_team->t.t_master_tid;
582 
583     /* restore values cached in the thread */
584     this_thr->th.th_team_nproc = serial_team->t.t_parent->t.t_nproc; /*  JPH */
585     this_thr->th.th_team_master =
586         serial_team->t.t_parent->t.t_threads[0]; /* JPH */
587     this_thr->th.th_team_serialized = this_thr->th.th_team->t.t_serialized;
588 
589     /* TODO the below shouldn't need to be adjusted for serialized teams */
590     this_thr->th.th_dispatch =
591         &this_thr->th.th_team->t.t_dispatch[serial_team->t.t_master_tid];
592 
593     __kmp_pop_current_task_from_thread(this_thr);
594 
595     KMP_ASSERT(this_thr->th.th_current_task->td_flags.executing == 0);
596     this_thr->th.th_current_task->td_flags.executing = 1;
597 
598     if (__kmp_tasking_mode != tskm_immediate_exec) {
599       // Copy the task team from the new child / old parent team to the thread.
600       this_thr->th.th_task_team =
601           this_thr->th.th_team->t.t_task_team[this_thr->th.th_task_state];
602       KA_TRACE(20,
603                ("__kmpc_end_serialized_parallel: T#%d restoring task_team %p / "
604                 "team %p\n",
605                 global_tid, this_thr->th.th_task_team, this_thr->th.th_team));
606     }
607   } else {
608     if (__kmp_tasking_mode != tskm_immediate_exec) {
609       KA_TRACE(20, ("__kmpc_end_serialized_parallel: T#%d decreasing nesting "
610                     "depth of serial team %p to %d\n",
611                     global_tid, serial_team, serial_team->t.t_serialized));
612     }
613   }
614 
615   if (__kmp_env_consistency_check)
616     __kmp_pop_parallel(global_tid, NULL);
617 #if OMPT_SUPPORT
618   if (ompt_enabled.enabled)
619     this_thr->th.ompt_thread_info.state =
620         ((this_thr->th.th_team_serialized) ? ompt_state_work_serial
621                                            : ompt_state_work_parallel);
622 #endif
623 }
624 
625 /*!
626 @ingroup SYNCHRONIZATION
627 @param loc  source location information.
628 
629 Execute <tt>flush</tt>. This is implemented as a full memory fence. (Though
630 depending on the memory ordering convention obeyed by the compiler
631 even that may not be necessary).
632 */
633 void __kmpc_flush(ident_t *loc) {
634   KC_TRACE(10, ("__kmpc_flush: called\n"));
635 
636   /* need an explicit __mf() here since we use volatile instead in the library */
637   KMP_MB(); /* Flush all pending memory write invalidates.  */
638 
639 #if (KMP_ARCH_X86 || KMP_ARCH_X86_64)
640 #if KMP_MIC
641 // fence-style instructions do not exist, but lock; xaddl $0,(%rsp) can be used.
642 // We shouldn't need it, though, since the ABI rules require that
643 // * If the compiler generates NGO stores it also generates the fence
644 // * If users hand-code NGO stores they should insert the fence
645 // therefore no incomplete unordered stores should be visible.
646 #else
647   // C74404
648   // This is to address non-temporal store instructions (sfence needed).
649   // The clflush instruction is also addressed (mfence needed).
650   // Probably the non-temporal load movntdqa instruction should also be
651   // addressed.
652   // mfence is an SSE2 instruction. Do not execute it if the CPU is not SSE2.
653   if (!__kmp_cpuinfo.initialized) {
654     __kmp_query_cpuid(&__kmp_cpuinfo);
655   }
656   if (!__kmp_cpuinfo.sse2) {
657     // CPU cannot execute SSE2 instructions.
658   } else {
659 #if KMP_COMPILER_ICC
660     _mm_mfence();
661 #elif KMP_COMPILER_MSVC
662     MemoryBarrier();
663 #else
664     __sync_synchronize();
665 #endif // KMP_COMPILER_ICC
666   }
667 #endif // KMP_MIC
668 #elif (KMP_ARCH_ARM || KMP_ARCH_AARCH64 || KMP_ARCH_MIPS || KMP_ARCH_MIPS64 || \
669        KMP_ARCH_RISCV64)
670 // Nothing to see here; move along
671 #elif KMP_ARCH_PPC64
672 // Nothing needed here (we have a real MB above).
673 #else
674 #error Unknown or unsupported architecture
675 #endif
676 
677 #if OMPT_SUPPORT && OMPT_OPTIONAL
678   if (ompt_enabled.ompt_callback_flush) {
679     ompt_callbacks.ompt_callback(ompt_callback_flush)(
680         __ompt_get_thread_data_internal(), OMPT_GET_RETURN_ADDRESS(0));
681   }
682 #endif
683 }
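// Illustrative sketch (editorial addition, not in the original source): a bare
//     #pragma omp flush
// is typically lowered to a single call:
//
//   __kmpc_flush(&loc);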
684 
685 /* -------------------------------------------------------------------------- */
686 /*!
687 @ingroup SYNCHRONIZATION
688 @param loc source location information
689 @param global_tid thread id.
690 
691 Execute a barrier.
692 */
693 void __kmpc_barrier(ident_t *loc, kmp_int32 global_tid) {
694   KMP_COUNT_BLOCK(OMP_BARRIER);
695   KC_TRACE(10, ("__kmpc_barrier: called T#%d\n", global_tid));
696   __kmp_assert_valid_gtid(global_tid);
697 
698   if (!TCR_4(__kmp_init_parallel))
699     __kmp_parallel_initialize();
700 
701   __kmp_resume_if_soft_paused();
702 
703   if (__kmp_env_consistency_check) {
704     if (loc == 0) {
705       KMP_WARNING(ConstructIdentInvalid); // ??? What does it mean for the user?
706     }
707     __kmp_check_barrier(global_tid, ct_barrier, loc);
708   }
709 
710 #if OMPT_SUPPORT
711   ompt_frame_t *ompt_frame;
712   if (ompt_enabled.enabled) {
713     __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
714     if (ompt_frame->enter_frame.ptr == NULL)
715       ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
716   }
717   OMPT_STORE_RETURN_ADDRESS(global_tid);
718 #endif
719   __kmp_threads[global_tid]->th.th_ident = loc;
720   // TODO: explicit barrier_wait_id:
721   //   this function is called when 'barrier' directive is present or
722   //   implicit barrier at the end of a worksharing construct.
723   // 1) better to add a per-thread barrier counter to a thread data structure
724   // 2) set to 0 when a new team is created
725   // 3) no sync is required
726 
727   __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
728 #if OMPT_SUPPORT && OMPT_OPTIONAL
729   if (ompt_enabled.enabled) {
730     ompt_frame->enter_frame = ompt_data_none;
731   }
732 #endif
733 }
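// Illustrative sketch (editorial addition, not in the original source):
//     #pragma omp barrier
// is typically lowered to:
//
//   __kmpc_barrier(&loc, __kmpc_global_thread_num(&loc));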
734 
735 /* The BARRIER for a MASTER section is always explicit   */
736 /*!
737 @ingroup WORK_SHARING
738 @param loc  source location information.
739 @param global_tid  global thread number.
740 @return 1 if this thread should execute the <tt>master</tt> block, 0 otherwise.
741 */
742 kmp_int32 __kmpc_master(ident_t *loc, kmp_int32 global_tid) {
743   int status = 0;
744 
745   KC_TRACE(10, ("__kmpc_master: called T#%d\n", global_tid));
746   __kmp_assert_valid_gtid(global_tid);
747 
748   if (!TCR_4(__kmp_init_parallel))
749     __kmp_parallel_initialize();
750 
751   __kmp_resume_if_soft_paused();
752 
753   if (KMP_MASTER_GTID(global_tid)) {
754     KMP_COUNT_BLOCK(OMP_MASTER);
755     KMP_PUSH_PARTITIONED_TIMER(OMP_master);
756     status = 1;
757   }
758 
759 #if OMPT_SUPPORT && OMPT_OPTIONAL
760   if (status) {
761     if (ompt_enabled.ompt_callback_masked) {
762       kmp_info_t *this_thr = __kmp_threads[global_tid];
763       kmp_team_t *team = this_thr->th.th_team;
764 
765       int tid = __kmp_tid_from_gtid(global_tid);
766       ompt_callbacks.ompt_callback(ompt_callback_masked)(
767           ompt_scope_begin, &(team->t.ompt_team_info.parallel_data),
768           &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
769           OMPT_GET_RETURN_ADDRESS(0));
770     }
771   }
772 #endif
773 
774   if (__kmp_env_consistency_check) {
775 #if KMP_USE_DYNAMIC_LOCK
776     if (status)
777       __kmp_push_sync(global_tid, ct_master, loc, NULL, 0);
778     else
779       __kmp_check_sync(global_tid, ct_master, loc, NULL, 0);
780 #else
781     if (status)
782       __kmp_push_sync(global_tid, ct_master, loc, NULL);
783     else
784       __kmp_check_sync(global_tid, ct_master, loc, NULL);
785 #endif
786   }
787 
788   return status;
789 }
790 
791 /*!
792 @ingroup WORK_SHARING
793 @param loc  source location information.
794 @param global_tid  global thread number.
795 
796 Mark the end of a <tt>master</tt> region. This should only be called by the
797 thread that executes the <tt>master</tt> region.
798 */
799 void __kmpc_end_master(ident_t *loc, kmp_int32 global_tid) {
800   KC_TRACE(10, ("__kmpc_end_master: called T#%d\n", global_tid));
801   __kmp_assert_valid_gtid(global_tid);
802   KMP_DEBUG_ASSERT(KMP_MASTER_GTID(global_tid));
803   KMP_POP_PARTITIONED_TIMER();
804 
805 #if OMPT_SUPPORT && OMPT_OPTIONAL
806   kmp_info_t *this_thr = __kmp_threads[global_tid];
807   kmp_team_t *team = this_thr->th.th_team;
808   if (ompt_enabled.ompt_callback_masked) {
809     int tid = __kmp_tid_from_gtid(global_tid);
810     ompt_callbacks.ompt_callback(ompt_callback_masked)(
811         ompt_scope_end, &(team->t.ompt_team_info.parallel_data),
812         &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
813         OMPT_GET_RETURN_ADDRESS(0));
814   }
815 #endif
816 
817   if (__kmp_env_consistency_check) {
818     if (KMP_MASTER_GTID(global_tid))
819       __kmp_pop_sync(global_tid, ct_master, loc);
820   }
821 }
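// Illustrative sketch (editorial addition, not in the original source):
//     #pragma omp master
// is typically lowered so that only the thread for which __kmpc_master
// returns 1 runs the block and the matching end call (gtid as returned by
// __kmpc_global_thread_num):
//
//   if (__kmpc_master(&loc, gtid)) {
//     // ... master-only work ...
//     __kmpc_end_master(&loc, gtid);
//   }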
822 
823 /*!
824 @ingroup WORK_SHARING
825 @param loc  source location information.
826 @param gtid  global thread number.
827 
828 Start execution of an <tt>ordered</tt> construct.
829 */
830 void __kmpc_ordered(ident_t *loc, kmp_int32 gtid) {
831   int cid = 0;
832   kmp_info_t *th;
833   KMP_DEBUG_ASSERT(__kmp_init_serial);
834 
835   KC_TRACE(10, ("__kmpc_ordered: called T#%d\n", gtid));
836   __kmp_assert_valid_gtid(gtid);
837 
838   if (!TCR_4(__kmp_init_parallel))
839     __kmp_parallel_initialize();
840 
841   __kmp_resume_if_soft_paused();
842 
843 #if USE_ITT_BUILD
844   __kmp_itt_ordered_prep(gtid);
845 // TODO: ordered_wait_id
846 #endif /* USE_ITT_BUILD */
847 
848   th = __kmp_threads[gtid];
849 
850 #if OMPT_SUPPORT && OMPT_OPTIONAL
851   kmp_team_t *team;
852   ompt_wait_id_t lck;
853   void *codeptr_ra;
854   OMPT_STORE_RETURN_ADDRESS(gtid);
855   if (ompt_enabled.enabled) {
856     team = __kmp_team_from_gtid(gtid);
857     lck = (ompt_wait_id_t)(uintptr_t)&team->t.t_ordered.dt.t_value;
858     /* OMPT state update */
859     th->th.ompt_thread_info.wait_id = lck;
860     th->th.ompt_thread_info.state = ompt_state_wait_ordered;
861 
862     /* OMPT event callback */
863     codeptr_ra = OMPT_LOAD_RETURN_ADDRESS(gtid);
864     if (ompt_enabled.ompt_callback_mutex_acquire) {
865       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
866           ompt_mutex_ordered, omp_lock_hint_none, kmp_mutex_impl_spin, lck,
867           codeptr_ra);
868     }
869   }
870 #endif
871 
872   if (th->th.th_dispatch->th_deo_fcn != 0)
873     (*th->th.th_dispatch->th_deo_fcn)(&gtid, &cid, loc);
874   else
875     __kmp_parallel_deo(&gtid, &cid, loc);
876 
877 #if OMPT_SUPPORT && OMPT_OPTIONAL
878   if (ompt_enabled.enabled) {
879     /* OMPT state update */
880     th->th.ompt_thread_info.state = ompt_state_work_parallel;
881     th->th.ompt_thread_info.wait_id = 0;
882 
883     /* OMPT event callback */
884     if (ompt_enabled.ompt_callback_mutex_acquired) {
885       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
886           ompt_mutex_ordered, (ompt_wait_id_t)(uintptr_t)lck, codeptr_ra);
887     }
888   }
889 #endif
890 
891 #if USE_ITT_BUILD
892   __kmp_itt_ordered_start(gtid);
893 #endif /* USE_ITT_BUILD */
894 }
895 
896 /*!
897 @ingroup WORK_SHARING
898 @param loc  source location information.
899 @param gtid  global thread number.
900 
901 End execution of an <tt>ordered</tt> construct.
902 */
903 void __kmpc_end_ordered(ident_t *loc, kmp_int32 gtid) {
904   int cid = 0;
905   kmp_info_t *th;
906 
907   KC_TRACE(10, ("__kmpc_end_ordered: called T#%d\n", gtid));
908   __kmp_assert_valid_gtid(gtid);
909 
910 #if USE_ITT_BUILD
911   __kmp_itt_ordered_end(gtid);
912 // TODO: ordered_wait_id
913 #endif /* USE_ITT_BUILD */
914 
915   th = __kmp_threads[gtid];
916 
917   if (th->th.th_dispatch->th_dxo_fcn != 0)
918     (*th->th.th_dispatch->th_dxo_fcn)(&gtid, &cid, loc);
919   else
920     __kmp_parallel_dxo(&gtid, &cid, loc);
921 
922 #if OMPT_SUPPORT && OMPT_OPTIONAL
923   OMPT_STORE_RETURN_ADDRESS(gtid);
924   if (ompt_enabled.ompt_callback_mutex_released) {
925     ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
926         ompt_mutex_ordered,
927         (ompt_wait_id_t)(uintptr_t)&__kmp_team_from_gtid(gtid)
928             ->t.t_ordered.dt.t_value,
929         OMPT_LOAD_RETURN_ADDRESS(gtid));
930   }
931 #endif
932 }
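// Illustrative sketch (editorial addition, not in the original source): inside
// a loop generated for
//     #pragma omp for ordered
// an ordered block is typically bracketed as follows (gtid as returned by
// __kmpc_global_thread_num):
//
//   __kmpc_ordered(&loc, gtid);
//   // ... ordered block ...
//   __kmpc_end_ordered(&loc, gtid);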
933 
934 #if KMP_USE_DYNAMIC_LOCK
935 
936 static __forceinline void
937 __kmp_init_indirect_csptr(kmp_critical_name *crit, ident_t const *loc,
938                           kmp_int32 gtid, kmp_indirect_locktag_t tag) {
939   // Pointer to the allocated indirect lock is written to crit, while indexing
940   // is ignored.
941   void *idx;
942   kmp_indirect_lock_t **lck;
943   lck = (kmp_indirect_lock_t **)crit;
944   kmp_indirect_lock_t *ilk = __kmp_allocate_indirect_lock(&idx, gtid, tag);
945   KMP_I_LOCK_FUNC(ilk, init)(ilk->lock);
946   KMP_SET_I_LOCK_LOCATION(ilk, loc);
947   KMP_SET_I_LOCK_FLAGS(ilk, kmp_lf_critical_section);
948   KA_TRACE(20,
949            ("__kmp_init_indirect_csptr: initialized indirect lock #%d\n", tag));
950 #if USE_ITT_BUILD
951   __kmp_itt_critical_creating(ilk->lock, loc);
952 #endif
953   int status = KMP_COMPARE_AND_STORE_PTR(lck, nullptr, ilk);
954   if (status == 0) {
955 #if USE_ITT_BUILD
956     __kmp_itt_critical_destroyed(ilk->lock);
957 #endif
958     // We don't really need to destroy the unclaimed lock here since it will be
959     // cleaned up at program exit.
960     // KMP_D_LOCK_FUNC(&idx, destroy)((kmp_dyna_lock_t *)&idx);
961   }
962   KMP_DEBUG_ASSERT(*lck != NULL);
963 }
964 
965 // Fast-path acquire tas lock
966 #define KMP_ACQUIRE_TAS_LOCK(lock, gtid)                                       \
967   {                                                                            \
968     kmp_tas_lock_t *l = (kmp_tas_lock_t *)lock;                                \
969     kmp_int32 tas_free = KMP_LOCK_FREE(tas);                                   \
970     kmp_int32 tas_busy = KMP_LOCK_BUSY(gtid + 1, tas);                         \
971     if (KMP_ATOMIC_LD_RLX(&l->lk.poll) != tas_free ||                          \
972         !__kmp_atomic_compare_store_acq(&l->lk.poll, tas_free, tas_busy)) {    \
973       kmp_uint32 spins;                                                        \
974       KMP_FSYNC_PREPARE(l);                                                    \
975       KMP_INIT_YIELD(spins);                                                   \
976       kmp_backoff_t backoff = __kmp_spin_backoff_params;                       \
977       do {                                                                     \
978         if (TCR_4(__kmp_nth) >                                                 \
979             (__kmp_avail_proc ? __kmp_avail_proc : __kmp_xproc)) {             \
980           KMP_YIELD(TRUE);                                                     \
981         } else {                                                               \
982           KMP_YIELD_SPIN(spins);                                               \
983         }                                                                      \
984         __kmp_spin_backoff(&backoff);                                          \
985       } while (                                                                \
986           KMP_ATOMIC_LD_RLX(&l->lk.poll) != tas_free ||                        \
987           !__kmp_atomic_compare_store_acq(&l->lk.poll, tas_free, tas_busy));   \
988     }                                                                          \
989     KMP_FSYNC_ACQUIRED(l);                                                     \
990   }
991 
992 // Fast-path test tas lock
993 #define KMP_TEST_TAS_LOCK(lock, gtid, rc)                                      \
994   {                                                                            \
995     kmp_tas_lock_t *l = (kmp_tas_lock_t *)lock;                                \
996     kmp_int32 tas_free = KMP_LOCK_FREE(tas);                                   \
997     kmp_int32 tas_busy = KMP_LOCK_BUSY(gtid + 1, tas);                         \
998     rc = KMP_ATOMIC_LD_RLX(&l->lk.poll) == tas_free &&                         \
999          __kmp_atomic_compare_store_acq(&l->lk.poll, tas_free, tas_busy);      \
1000   }
1001 
1002 // Fast-path release tas lock
1003 #define KMP_RELEASE_TAS_LOCK(lock, gtid)                                       \
1004   { KMP_ATOMIC_ST_REL(&((kmp_tas_lock_t *)lock)->lk.poll, KMP_LOCK_FREE(tas)); }
1005 
1006 #if KMP_USE_FUTEX
1007 
1008 #include <sys/syscall.h>
1009 #include <unistd.h>
1010 #ifndef FUTEX_WAIT
1011 #define FUTEX_WAIT 0
1012 #endif
1013 #ifndef FUTEX_WAKE
1014 #define FUTEX_WAKE 1
1015 #endif
1016 
1017 // Fast-path acquire futex lock
1018 #define KMP_ACQUIRE_FUTEX_LOCK(lock, gtid)                                     \
1019   {                                                                            \
1020     kmp_futex_lock_t *ftx = (kmp_futex_lock_t *)lock;                          \
1021     kmp_int32 gtid_code = (gtid + 1) << 1;                                     \
1022     KMP_MB();                                                                  \
1023     KMP_FSYNC_PREPARE(ftx);                                                    \
1024     kmp_int32 poll_val;                                                        \
1025     while ((poll_val = KMP_COMPARE_AND_STORE_RET32(                            \
1026                 &(ftx->lk.poll), KMP_LOCK_FREE(futex),                         \
1027                 KMP_LOCK_BUSY(gtid_code, futex))) != KMP_LOCK_FREE(futex)) {   \
1028       kmp_int32 cond = KMP_LOCK_STRIP(poll_val) & 1;                           \
1029       if (!cond) {                                                             \
1030         if (!KMP_COMPARE_AND_STORE_RET32(&(ftx->lk.poll), poll_val,            \
1031                                          poll_val |                            \
1032                                              KMP_LOCK_BUSY(1, futex))) {       \
1033           continue;                                                            \
1034         }                                                                      \
1035         poll_val |= KMP_LOCK_BUSY(1, futex);                                   \
1036       }                                                                        \
1037       kmp_int32 rc;                                                            \
1038       if ((rc = syscall(__NR_futex, &(ftx->lk.poll), FUTEX_WAIT, poll_val,     \
1039                         NULL, NULL, 0)) != 0) {                                \
1040         continue;                                                              \
1041       }                                                                        \
1042       gtid_code |= 1;                                                          \
1043     }                                                                          \
1044     KMP_FSYNC_ACQUIRED(ftx);                                                   \
1045   }
1046 
1047 // Fast-path test futex lock
1048 #define KMP_TEST_FUTEX_LOCK(lock, gtid, rc)                                    \
1049   {                                                                            \
1050     kmp_futex_lock_t *ftx = (kmp_futex_lock_t *)lock;                          \
1051     if (KMP_COMPARE_AND_STORE_ACQ32(&(ftx->lk.poll), KMP_LOCK_FREE(futex),     \
1052                                     KMP_LOCK_BUSY((gtid + 1) << 1, futex))) {  \
1053       KMP_FSYNC_ACQUIRED(ftx);                                                 \
1054       rc = TRUE;                                                               \
1055     } else {                                                                   \
1056       rc = FALSE;                                                              \
1057     }                                                                          \
1058   }
1059 
1060 // Fast-path release futex lock
1061 #define KMP_RELEASE_FUTEX_LOCK(lock, gtid)                                     \
1062   {                                                                            \
1063     kmp_futex_lock_t *ftx = (kmp_futex_lock_t *)lock;                          \
1064     KMP_MB();                                                                  \
1065     KMP_FSYNC_RELEASING(ftx);                                                  \
1066     kmp_int32 poll_val =                                                       \
1067         KMP_XCHG_FIXED32(&(ftx->lk.poll), KMP_LOCK_FREE(futex));               \
1068     if (KMP_LOCK_STRIP(poll_val) & 1) {                                        \
1069       syscall(__NR_futex, &(ftx->lk.poll), FUTEX_WAKE,                         \
1070               KMP_LOCK_BUSY(1, futex), NULL, NULL, 0);                         \
1071     }                                                                          \
1072     KMP_MB();                                                                  \
1073     KMP_YIELD_OVERSUB();                                                       \
1074   }
1075 
1076 #endif // KMP_USE_FUTEX
1077 
1078 #else // KMP_USE_DYNAMIC_LOCK
1079 
1080 static kmp_user_lock_p __kmp_get_critical_section_ptr(kmp_critical_name *crit,
1081                                                       ident_t const *loc,
1082                                                       kmp_int32 gtid) {
1083   kmp_user_lock_p *lck_pp = (kmp_user_lock_p *)crit;
1084 
1085   // Because of the double-check, the following load doesn't need to be volatile
1086   kmp_user_lock_p lck = (kmp_user_lock_p)TCR_PTR(*lck_pp);
1087 
1088   if (lck == NULL) {
1089     void *idx;
1090 
1091     // Allocate & initialize the lock.
1092     // Remember alloc'ed locks in table in order to free them in __kmp_cleanup()
1093     lck = __kmp_user_lock_allocate(&idx, gtid, kmp_lf_critical_section);
1094     __kmp_init_user_lock_with_checks(lck);
1095     __kmp_set_user_lock_location(lck, loc);
1096 #if USE_ITT_BUILD
1097     __kmp_itt_critical_creating(lck);
1098 // __kmp_itt_critical_creating() should be called *before* the first usage
1099 // of the underlying lock. It is the only place where we can guarantee it. There
1100 // is a chance the lock will be destroyed with no usage, but that is not a
1101 // problem, because this is not a real event seen by the user but rather
1102 // setting a name for the object (lock). See more details in kmp_itt.h.
1103 #endif /* USE_ITT_BUILD */
1104 
1105     // Use a cmpxchg instruction to slam the start of the critical section with
1106     // the lock pointer.  If another thread beat us to it, deallocate the lock,
1107     // and use the lock that the other thread allocated.
1108     int status = KMP_COMPARE_AND_STORE_PTR(lck_pp, 0, lck);
1109 
1110     if (status == 0) {
1111 // Deallocate the lock and reload the value.
1112 #if USE_ITT_BUILD
1113       __kmp_itt_critical_destroyed(lck);
1114 // Let ITT know the lock is destroyed and the same memory location may be reused
1115 // for another purpose.
1116 #endif /* USE_ITT_BUILD */
1117       __kmp_destroy_user_lock_with_checks(lck);
1118       __kmp_user_lock_free(&idx, gtid, lck);
1119       lck = (kmp_user_lock_p)TCR_PTR(*lck_pp);
1120       KMP_DEBUG_ASSERT(lck != NULL);
1121     }
1122   }
1123   return lck;
1124 }
1125 
1126 #endif // KMP_USE_DYNAMIC_LOCK
1127 
1128 /*!
1129 @ingroup WORK_SHARING
1130 @param loc  source location information.
1131 @param global_tid  global thread number.
1132 @param crit identity of the critical section. This could be a pointer to a lock
1133 associated with the critical section, or some other suitably unique value.
1134 
1135 Enter code protected by a `critical` construct.
1136 This function blocks until the executing thread can enter the critical section.
1137 */
1138 void __kmpc_critical(ident_t *loc, kmp_int32 global_tid,
1139                      kmp_critical_name *crit) {
1140 #if KMP_USE_DYNAMIC_LOCK
1141 #if OMPT_SUPPORT && OMPT_OPTIONAL
1142   OMPT_STORE_RETURN_ADDRESS(global_tid);
1143 #endif // OMPT_SUPPORT && OMPT_OPTIONAL
1144   __kmpc_critical_with_hint(loc, global_tid, crit, omp_lock_hint_none);
1145 #else
1146   KMP_COUNT_BLOCK(OMP_CRITICAL);
1147 #if OMPT_SUPPORT && OMPT_OPTIONAL
1148   ompt_state_t prev_state = ompt_state_undefined;
1149   ompt_thread_info_t ti;
1150 #endif
1151   kmp_user_lock_p lck;
1152 
1153   KC_TRACE(10, ("__kmpc_critical: called T#%d\n", global_tid));
1154   __kmp_assert_valid_gtid(global_tid);
1155 
1156   // TODO: add THR_OVHD_STATE
1157 
1158   KMP_PUSH_PARTITIONED_TIMER(OMP_critical_wait);
1159   KMP_CHECK_USER_LOCK_INIT();
1160 
1161   if ((__kmp_user_lock_kind == lk_tas) &&
1162       (sizeof(lck->tas.lk.poll) <= OMP_CRITICAL_SIZE)) {
1163     lck = (kmp_user_lock_p)crit;
1164   }
1165 #if KMP_USE_FUTEX
1166   else if ((__kmp_user_lock_kind == lk_futex) &&
1167            (sizeof(lck->futex.lk.poll) <= OMP_CRITICAL_SIZE)) {
1168     lck = (kmp_user_lock_p)crit;
1169   }
1170 #endif
1171   else { // ticket, queuing or drdpa
1172     lck = __kmp_get_critical_section_ptr(crit, loc, global_tid);
1173   }
1174 
1175   if (__kmp_env_consistency_check)
1176     __kmp_push_sync(global_tid, ct_critical, loc, lck);
1177 
1178 // since the critical directive binds to all threads, not just the current
1179 // team, we have to check this even if we are in a serialized team.
1180 // also, even if we are the uber thread, we still have to take the lock,
1181 // as we have to contend with sibling threads.
1182 
1183 #if USE_ITT_BUILD
1184   __kmp_itt_critical_acquiring(lck);
1185 #endif /* USE_ITT_BUILD */
1186 #if OMPT_SUPPORT && OMPT_OPTIONAL
1187   OMPT_STORE_RETURN_ADDRESS(global_tid);
1188   void *codeptr_ra = NULL;
1189   if (ompt_enabled.enabled) {
1190     ti = __kmp_threads[global_tid]->th.ompt_thread_info;
1191     /* OMPT state update */
1192     prev_state = ti.state;
1193     ti.wait_id = (ompt_wait_id_t)(uintptr_t)lck;
1194     ti.state = ompt_state_wait_critical;
1195 
1196     /* OMPT event callback */
1197     codeptr_ra = OMPT_LOAD_RETURN_ADDRESS(global_tid);
1198     if (ompt_enabled.ompt_callback_mutex_acquire) {
1199       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
1200           ompt_mutex_critical, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
1201           (ompt_wait_id_t)(uintptr_t)lck, codeptr_ra);
1202     }
1203   }
1204 #endif
1205   // The value of 'crit' should be usable as the critical_id of the
1206   // critical section directive.
1207   __kmp_acquire_user_lock_with_checks(lck, global_tid);
1208 
1209 #if USE_ITT_BUILD
1210   __kmp_itt_critical_acquired(lck);
1211 #endif /* USE_ITT_BUILD */
1212 #if OMPT_SUPPORT && OMPT_OPTIONAL
1213   if (ompt_enabled.enabled) {
1214     /* OMPT state update */
1215     ti.state = prev_state;
1216     ti.wait_id = 0;
1217 
1218     /* OMPT event callback */
1219     if (ompt_enabled.ompt_callback_mutex_acquired) {
1220       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
1221           ompt_mutex_critical, (ompt_wait_id_t)(uintptr_t)lck, codeptr_ra);
1222     }
1223   }
1224 #endif
1225   KMP_POP_PARTITIONED_TIMER();
1226 
1227   KMP_PUSH_PARTITIONED_TIMER(OMP_critical);
1228   KA_TRACE(15, ("__kmpc_critical: done T#%d\n", global_tid));
1229 #endif // KMP_USE_DYNAMIC_LOCK
1230 }
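// Illustrative sketch (editorial addition, not in the original source): for
//     #pragma omp critical(name)
// a compiler typically emits one zero-initialized kmp_critical_name object per
// named critical section (here the hypothetical crit_name) and brackets the
// protected code with the enter/exit pair:
//
//   static kmp_critical_name crit_name;
//   ...
//   __kmpc_critical(&loc, gtid, &crit_name);
//   // ... protected code ...
//   __kmpc_end_critical(&loc, gtid, &crit_name);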
1231 
1232 #if KMP_USE_DYNAMIC_LOCK
1233 
1234 // Converts the given hint to an internal lock implementation
1235 static __forceinline kmp_dyna_lockseq_t __kmp_map_hint_to_lock(uintptr_t hint) {
1236 #if KMP_USE_TSX
1237 #define KMP_TSX_LOCK(seq) lockseq_##seq
1238 #else
1239 #define KMP_TSX_LOCK(seq) __kmp_user_lock_seq
1240 #endif
1241 
1242 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
1243 #define KMP_CPUINFO_RTM (__kmp_cpuinfo.rtm)
1244 #else
1245 #define KMP_CPUINFO_RTM 0
1246 #endif
1247 
1248   // Hints that do not require further logic
1249   if (hint & kmp_lock_hint_hle)
1250     return KMP_TSX_LOCK(hle);
1251   if (hint & kmp_lock_hint_rtm)
1252     return KMP_CPUINFO_RTM ? KMP_TSX_LOCK(rtm_queuing) : __kmp_user_lock_seq;
1253   if (hint & kmp_lock_hint_adaptive)
1254     return KMP_CPUINFO_RTM ? KMP_TSX_LOCK(adaptive) : __kmp_user_lock_seq;
1255 
1256   // Rule out conflicting hints first by returning the default lock
1257   if ((hint & omp_lock_hint_contended) && (hint & omp_lock_hint_uncontended))
1258     return __kmp_user_lock_seq;
1259   if ((hint & omp_lock_hint_speculative) &&
1260       (hint & omp_lock_hint_nonspeculative))
1261     return __kmp_user_lock_seq;
1262 
1263   // Do not even consider speculation when it appears to be contended
1264   if (hint & omp_lock_hint_contended)
1265     return lockseq_queuing;
1266 
1267   // Uncontended lock without speculation
1268   if ((hint & omp_lock_hint_uncontended) && !(hint & omp_lock_hint_speculative))
1269     return lockseq_tas;
1270 
1271   // Use RTM lock for speculation
1272   if (hint & omp_lock_hint_speculative)
1273     return KMP_CPUINFO_RTM ? KMP_TSX_LOCK(rtm_spin) : __kmp_user_lock_seq;
1274 
1275   return __kmp_user_lock_seq;
1276 }
1277 
1278 #if OMPT_SUPPORT && OMPT_OPTIONAL
1279 #if KMP_USE_DYNAMIC_LOCK
1280 static kmp_mutex_impl_t
1281 __ompt_get_mutex_impl_type(void *user_lock, kmp_indirect_lock_t *ilock = 0) {
1282   if (user_lock) {
1283     switch (KMP_EXTRACT_D_TAG(user_lock)) {
1284     case 0:
1285       break;
1286 #if KMP_USE_FUTEX
1287     case locktag_futex:
1288       return kmp_mutex_impl_queuing;
1289 #endif
1290     case locktag_tas:
1291       return kmp_mutex_impl_spin;
1292 #if KMP_USE_TSX
1293     case locktag_hle:
1294     case locktag_rtm_spin:
1295       return kmp_mutex_impl_speculative;
1296 #endif
1297     default:
1298       return kmp_mutex_impl_none;
1299     }
1300     ilock = KMP_LOOKUP_I_LOCK(user_lock);
1301   }
1302   KMP_ASSERT(ilock);
1303   switch (ilock->type) {
1304 #if KMP_USE_TSX
1305   case locktag_adaptive:
1306   case locktag_rtm_queuing:
1307     return kmp_mutex_impl_speculative;
1308 #endif
1309   case locktag_nested_tas:
1310     return kmp_mutex_impl_spin;
1311 #if KMP_USE_FUTEX
1312   case locktag_nested_futex:
1313 #endif
1314   case locktag_ticket:
1315   case locktag_queuing:
1316   case locktag_drdpa:
1317   case locktag_nested_ticket:
1318   case locktag_nested_queuing:
1319   case locktag_nested_drdpa:
1320     return kmp_mutex_impl_queuing;
1321   default:
1322     return kmp_mutex_impl_none;
1323   }
1324 }
1325 #else
1326 // For locks without dynamic binding
1327 static kmp_mutex_impl_t __ompt_get_mutex_impl_type() {
1328   switch (__kmp_user_lock_kind) {
1329   case lk_tas:
1330     return kmp_mutex_impl_spin;
1331 #if KMP_USE_FUTEX
1332   case lk_futex:
1333 #endif
1334   case lk_ticket:
1335   case lk_queuing:
1336   case lk_drdpa:
1337     return kmp_mutex_impl_queuing;
1338 #if KMP_USE_TSX
1339   case lk_hle:
1340   case lk_rtm_queuing:
1341   case lk_rtm_spin:
1342   case lk_adaptive:
1343     return kmp_mutex_impl_speculative;
1344 #endif
1345   default:
1346     return kmp_mutex_impl_none;
1347   }
1348 }
1349 #endif // KMP_USE_DYNAMIC_LOCK
1350 #endif // OMPT_SUPPORT && OMPT_OPTIONAL
1351 
1352 /*!
1353 @ingroup WORK_SHARING
1354 @param loc  source location information.
1355 @param global_tid  global thread number.
1356 @param crit identity of the critical section. This could be a pointer to a lock
1357 associated with the critical section, or some other suitably unique value.
1358 @param hint the lock hint.
1359 
1360 Enter code protected by a `critical` construct with a hint. The hint value is
1361 used to suggest a lock implementation. This function blocks until the executing
1362 thread can enter the critical section unless the hint suggests use of
1363 speculative execution and the hardware supports it.
1364 */
1365 void __kmpc_critical_with_hint(ident_t *loc, kmp_int32 global_tid,
1366                                kmp_critical_name *crit, uint32_t hint) {
1367   KMP_COUNT_BLOCK(OMP_CRITICAL);
1368   kmp_user_lock_p lck;
1369 #if OMPT_SUPPORT && OMPT_OPTIONAL
1370   ompt_state_t prev_state = ompt_state_undefined;
1371   ompt_thread_info_t ti;
1372   // This is the case if called from __kmpc_critical:
1373   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(global_tid);
1374   if (!codeptr)
1375     codeptr = OMPT_GET_RETURN_ADDRESS(0);
1376 #endif
1377 
1378   KC_TRACE(10, ("__kmpc_critical: called T#%d\n", global_tid));
1379   __kmp_assert_valid_gtid(global_tid);
1380 
1381   kmp_dyna_lock_t *lk = (kmp_dyna_lock_t *)crit;
1382   // Check if it is initialized.
1383   KMP_PUSH_PARTITIONED_TIMER(OMP_critical_wait);
1384   if (*lk == 0) {
1385     kmp_dyna_lockseq_t lckseq = __kmp_map_hint_to_lock(hint);
1386     if (KMP_IS_D_LOCK(lckseq)) {
1387       KMP_COMPARE_AND_STORE_ACQ32((volatile kmp_int32 *)crit, 0,
1388                                   KMP_GET_D_TAG(lckseq));
1389     } else {
1390       __kmp_init_indirect_csptr(crit, loc, global_tid, KMP_GET_I_TAG(lckseq));
1391     }
1392   }
1393   // Branch to access the actual lock object and perform the set operation.
1394   // This branching is inevitable since this lock initialization does not
1395   // follow the normal dispatch path (the lock table is not used).
1396   if (KMP_EXTRACT_D_TAG(lk) != 0) {
1397     lck = (kmp_user_lock_p)lk;
1398     if (__kmp_env_consistency_check) {
1399       __kmp_push_sync(global_tid, ct_critical, loc, lck,
1400                       __kmp_map_hint_to_lock(hint));
1401     }
1402 #if USE_ITT_BUILD
1403     __kmp_itt_critical_acquiring(lck);
1404 #endif
1405 #if OMPT_SUPPORT && OMPT_OPTIONAL
1406     if (ompt_enabled.enabled) {
1407       ti = __kmp_threads[global_tid]->th.ompt_thread_info;
1408       /* OMPT state update */
1409       prev_state = ti.state;
1410       ti.wait_id = (ompt_wait_id_t)(uintptr_t)lck;
1411       ti.state = ompt_state_wait_critical;
1412 
1413       /* OMPT event callback */
1414       if (ompt_enabled.ompt_callback_mutex_acquire) {
1415         ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
1416             ompt_mutex_critical, (unsigned int)hint,
1417             __ompt_get_mutex_impl_type(crit), (ompt_wait_id_t)(uintptr_t)lck,
1418             codeptr);
1419       }
1420     }
1421 #endif
1422 #if KMP_USE_INLINED_TAS
1423     if (__kmp_user_lock_seq == lockseq_tas && !__kmp_env_consistency_check) {
1424       KMP_ACQUIRE_TAS_LOCK(lck, global_tid);
1425     } else
1426 #elif KMP_USE_INLINED_FUTEX
1427     if (__kmp_user_lock_seq == lockseq_futex && !__kmp_env_consistency_check) {
1428       KMP_ACQUIRE_FUTEX_LOCK(lck, global_tid);
1429     } else
1430 #endif
1431     {
1432       KMP_D_LOCK_FUNC(lk, set)(lk, global_tid);
1433     }
1434   } else {
1435     kmp_indirect_lock_t *ilk = *((kmp_indirect_lock_t **)lk);
1436     lck = ilk->lock;
1437     if (__kmp_env_consistency_check) {
1438       __kmp_push_sync(global_tid, ct_critical, loc, lck,
1439                       __kmp_map_hint_to_lock(hint));
1440     }
1441 #if USE_ITT_BUILD
1442     __kmp_itt_critical_acquiring(lck);
1443 #endif
1444 #if OMPT_SUPPORT && OMPT_OPTIONAL
1445     if (ompt_enabled.enabled) {
1446       ti = __kmp_threads[global_tid]->th.ompt_thread_info;
1447       /* OMPT state update */
1448       prev_state = ti.state;
1449       ti.wait_id = (ompt_wait_id_t)(uintptr_t)lck;
1450       ti.state = ompt_state_wait_critical;
1451 
1452       /* OMPT event callback */
1453       if (ompt_enabled.ompt_callback_mutex_acquire) {
1454         ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
1455             ompt_mutex_critical, (unsigned int)hint,
1456             __ompt_get_mutex_impl_type(0, ilk), (ompt_wait_id_t)(uintptr_t)lck,
1457             codeptr);
1458       }
1459     }
1460 #endif
1461     KMP_I_LOCK_FUNC(ilk, set)(lck, global_tid);
1462   }
1463   KMP_POP_PARTITIONED_TIMER();
1464 
1465 #if USE_ITT_BUILD
1466   __kmp_itt_critical_acquired(lck);
1467 #endif /* USE_ITT_BUILD */
1468 #if OMPT_SUPPORT && OMPT_OPTIONAL
1469   if (ompt_enabled.enabled) {
1470     /* OMPT state update */
1471     ti.state = prev_state;
1472     ti.wait_id = 0;
1473 
1474     /* OMPT event callback */
1475     if (ompt_enabled.ompt_callback_mutex_acquired) {
1476       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
1477           ompt_mutex_critical, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
1478     }
1479   }
1480 #endif
1481 
1482   KMP_PUSH_PARTITIONED_TIMER(OMP_critical);
1483   KA_TRACE(15, ("__kmpc_critical: done T#%d\n", global_tid));
1484 } // __kmpc_critical_with_hint
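
/* Example (illustrative only, not part of the runtime): the user-level
   construct that a compiler may lower to __kmpc_critical_with_hint is a
   named critical section with a hint clause. The function and section
   names below are placeholders; the omp_sync_hint_* constants are the
   OpenMP 5.0 names from <omp.h>, and the exact lowering is
   compiler-dependent.

   #include <omp.h>

   int shared_counter = 0;

   void increment_many(int n) {
     #pragma omp parallel for
     for (int i = 0; i < n; ++i) {
       // The hint is advisory; a speculative lock is used only when the
       // hardware supports it (see the function comment above).
       #pragma omp critical(counter_cs) hint(omp_sync_hint_speculative)
       { shared_counter++; }
     }
   }
*/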
1485 
1486 #endif // KMP_USE_DYNAMIC_LOCK
1487 
1488 /*!
1489 @ingroup WORK_SHARING
1490 @param loc  source location information.
1491 @param global_tid  global thread number.
1492 @param crit identity of the critical section. This could be a pointer to a lock
1493 associated with the critical section, or some other suitably unique value.
1494 
1495 Leave a critical section, releasing any lock that was held during its execution.
1496 */
1497 void __kmpc_end_critical(ident_t *loc, kmp_int32 global_tid,
1498                          kmp_critical_name *crit) {
1499   kmp_user_lock_p lck;
1500 
1501   KC_TRACE(10, ("__kmpc_end_critical: called T#%d\n", global_tid));
1502 
1503 #if KMP_USE_DYNAMIC_LOCK
1504   if (KMP_IS_D_LOCK(__kmp_user_lock_seq)) {
1505     lck = (kmp_user_lock_p)crit;
1506     KMP_ASSERT(lck != NULL);
1507     if (__kmp_env_consistency_check) {
1508       __kmp_pop_sync(global_tid, ct_critical, loc);
1509     }
1510 #if USE_ITT_BUILD
1511     __kmp_itt_critical_releasing(lck);
1512 #endif
1513 #if KMP_USE_INLINED_TAS
1514     if (__kmp_user_lock_seq == lockseq_tas && !__kmp_env_consistency_check) {
1515       KMP_RELEASE_TAS_LOCK(lck, global_tid);
1516     } else
1517 #elif KMP_USE_INLINED_FUTEX
1518     if (__kmp_user_lock_seq == lockseq_futex && !__kmp_env_consistency_check) {
1519       KMP_RELEASE_FUTEX_LOCK(lck, global_tid);
1520     } else
1521 #endif
1522     {
1523       KMP_D_LOCK_FUNC(lck, unset)((kmp_dyna_lock_t *)lck, global_tid);
1524     }
1525   } else {
1526     kmp_indirect_lock_t *ilk =
1527         (kmp_indirect_lock_t *)TCR_PTR(*((kmp_indirect_lock_t **)crit));
1528     KMP_ASSERT(ilk != NULL);
1529     lck = ilk->lock;
1530     if (__kmp_env_consistency_check) {
1531       __kmp_pop_sync(global_tid, ct_critical, loc);
1532     }
1533 #if USE_ITT_BUILD
1534     __kmp_itt_critical_releasing(lck);
1535 #endif
1536     KMP_I_LOCK_FUNC(ilk, unset)(lck, global_tid);
1537   }
1538 
1539 #else // KMP_USE_DYNAMIC_LOCK
1540 
1541   if ((__kmp_user_lock_kind == lk_tas) &&
1542       (sizeof(lck->tas.lk.poll) <= OMP_CRITICAL_SIZE)) {
1543     lck = (kmp_user_lock_p)crit;
1544   }
1545 #if KMP_USE_FUTEX
1546   else if ((__kmp_user_lock_kind == lk_futex) &&
1547            (sizeof(lck->futex.lk.poll) <= OMP_CRITICAL_SIZE)) {
1548     lck = (kmp_user_lock_p)crit;
1549   }
1550 #endif
1551   else { // ticket, queuing or drdpa
1552     lck = (kmp_user_lock_p)TCR_PTR(*((kmp_user_lock_p *)crit));
1553   }
1554 
1555   KMP_ASSERT(lck != NULL);
1556 
1557   if (__kmp_env_consistency_check)
1558     __kmp_pop_sync(global_tid, ct_critical, loc);
1559 
1560 #if USE_ITT_BUILD
1561   __kmp_itt_critical_releasing(lck);
1562 #endif /* USE_ITT_BUILD */
1563   // The value of 'crit' should be usable as the critical_id of the critical
1564   // section directive.
1565   __kmp_release_user_lock_with_checks(lck, global_tid);
1566 
1567 #endif // KMP_USE_DYNAMIC_LOCK
1568 
1569 #if OMPT_SUPPORT && OMPT_OPTIONAL
1570   /* OMPT release event triggers after lock is released; place here to trigger
1571    * for all #if branches */
1572   OMPT_STORE_RETURN_ADDRESS(global_tid);
1573   if (ompt_enabled.ompt_callback_mutex_released) {
1574     ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
1575         ompt_mutex_critical, (ompt_wait_id_t)(uintptr_t)lck,
1576         OMPT_LOAD_RETURN_ADDRESS(0));
1577   }
1578 #endif
1579 
1580   KMP_POP_PARTITIONED_TIMER();
1581   KA_TRACE(15, ("__kmpc_end_critical: done T#%d\n", global_tid));
1582 }
1583 
1584 /*!
1585 @ingroup SYNCHRONIZATION
1586 @param loc source location information
1587 @param global_tid thread id.
1588 @return one if the thread should execute the master block, zero otherwise
1589 
1590 Start execution of a combined barrier and master. The barrier is executed inside
1591 this function.
1592 */
1593 kmp_int32 __kmpc_barrier_master(ident_t *loc, kmp_int32 global_tid) {
1594   int status;
1595   KC_TRACE(10, ("__kmpc_barrier_master: called T#%d\n", global_tid));
1596   __kmp_assert_valid_gtid(global_tid);
1597 
1598   if (!TCR_4(__kmp_init_parallel))
1599     __kmp_parallel_initialize();
1600 
1601   __kmp_resume_if_soft_paused();
1602 
1603   if (__kmp_env_consistency_check)
1604     __kmp_check_barrier(global_tid, ct_barrier, loc);
1605 
1606 #if OMPT_SUPPORT
1607   ompt_frame_t *ompt_frame;
1608   if (ompt_enabled.enabled) {
1609     __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
1610     if (ompt_frame->enter_frame.ptr == NULL)
1611       ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1612   }
1613   OMPT_STORE_RETURN_ADDRESS(global_tid);
1614 #endif
1615 #if USE_ITT_NOTIFY
1616   __kmp_threads[global_tid]->th.th_ident = loc;
1617 #endif
1618   status = __kmp_barrier(bs_plain_barrier, global_tid, TRUE, 0, NULL, NULL);
1619 #if OMPT_SUPPORT && OMPT_OPTIONAL
1620   if (ompt_enabled.enabled) {
1621     ompt_frame->enter_frame = ompt_data_none;
1622   }
1623 #endif
1624 
1625   return (status != 0) ? 0 : 1;
1626 }
1627 
1628 /*!
1629 @ingroup SYNCHRONIZATION
1630 @param loc source location information
1631 @param global_tid thread id.
1632 
1633 Complete the execution of a combined barrier and master. This function should
1634 only be called at the completion of the <tt>master</tt> code. Other threads will
1635 still be waiting at the barrier and this call releases them.
1636 */
1637 void __kmpc_end_barrier_master(ident_t *loc, kmp_int32 global_tid) {
1638   KC_TRACE(10, ("__kmpc_end_barrier_master: called T#%d\n", global_tid));
1639   __kmp_assert_valid_gtid(global_tid);
1640   __kmp_end_split_barrier(bs_plain_barrier, global_tid);
1641 }
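
/* Illustrative sketch (not part of the runtime): the calling sequence a
   compiler might generate around a master block that is preceded by a
   barrier. The outlined function name is a placeholder; real code
   generation is compiler-specific.

   void generated_code(ident_t *loc, kmp_int32 gtid) {
     if (__kmpc_barrier_master(loc, gtid)) {
       // ... statements executed by the master thread only ...
       __kmpc_end_barrier_master(loc, gtid); // releases the waiting threads
     }
   }
*/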
1642 
1643 /*!
1644 @ingroup SYNCHRONIZATION
1645 @param loc source location information
1646 @param global_tid thread id.
1647 @return one if the thread should execute the master block, zero otherwise
1648 
1649 Start execution of a combined barrier and master(nowait) construct.
1650 The barrier is executed inside this function.
1651 There is no equivalent "end" function, since the
1652 */
1653 kmp_int32 __kmpc_barrier_master_nowait(ident_t *loc, kmp_int32 global_tid) {
1654   kmp_int32 ret;
1655   KC_TRACE(10, ("__kmpc_barrier_master_nowait: called T#%d\n", global_tid));
1656   __kmp_assert_valid_gtid(global_tid);
1657 
1658   if (!TCR_4(__kmp_init_parallel))
1659     __kmp_parallel_initialize();
1660 
1661   __kmp_resume_if_soft_paused();
1662 
1663   if (__kmp_env_consistency_check) {
1664     if (loc == 0) {
1665       KMP_WARNING(ConstructIdentInvalid); // ??? What does it mean for the user?
1666     }
1667     __kmp_check_barrier(global_tid, ct_barrier, loc);
1668   }
1669 
1670 #if OMPT_SUPPORT
1671   ompt_frame_t *ompt_frame;
1672   if (ompt_enabled.enabled) {
1673     __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
1674     if (ompt_frame->enter_frame.ptr == NULL)
1675       ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1676   }
1677   OMPT_STORE_RETURN_ADDRESS(global_tid);
1678 #endif
1679 #if USE_ITT_NOTIFY
1680   __kmp_threads[global_tid]->th.th_ident = loc;
1681 #endif
1682   __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
1683 #if OMPT_SUPPORT && OMPT_OPTIONAL
1684   if (ompt_enabled.enabled) {
1685     ompt_frame->enter_frame = ompt_data_none;
1686   }
1687 #endif
1688 
1689   ret = __kmpc_master(loc, global_tid);
1690 
1691   if (__kmp_env_consistency_check) {
1692     /*  there's no __kmpc_end_master called; so the (stats) */
1693     /*  actions of __kmpc_end_master are done here          */
1694     if (ret) {
1695       /* only one thread should do the pop since only */
1696       /* one did the push (see __kmpc_master())       */
1697       __kmp_pop_sync(global_tid, ct_master, loc);
1698     }
1699   }
1700 
1701   return (ret);
1702 }
1703 
1704 /* The BARRIER for a SINGLE process section is always explicit   */
1705 /*!
1706 @ingroup WORK_SHARING
1707 @param loc  source location information
1708 @param global_tid  global thread number
1709 @return One if this thread should execute the single construct, zero otherwise.
1710 
1711 Test whether to execute a <tt>single</tt> construct.
1712 There are no implicit barriers in the two "single" calls; rather, the compiler
1713 should introduce an explicit barrier if one is required.
1714 */
1715 
1716 kmp_int32 __kmpc_single(ident_t *loc, kmp_int32 global_tid) {
1717   __kmp_assert_valid_gtid(global_tid);
1718   kmp_int32 rc = __kmp_enter_single(global_tid, loc, TRUE);
1719 
1720   if (rc) {
1721     // We are going to execute the single statement, so we should count it.
1722     KMP_COUNT_BLOCK(OMP_SINGLE);
1723     KMP_PUSH_PARTITIONED_TIMER(OMP_single);
1724   }
1725 
1726 #if OMPT_SUPPORT && OMPT_OPTIONAL
1727   kmp_info_t *this_thr = __kmp_threads[global_tid];
1728   kmp_team_t *team = this_thr->th.th_team;
1729   int tid = __kmp_tid_from_gtid(global_tid);
1730 
1731   if (ompt_enabled.enabled) {
1732     if (rc) {
1733       if (ompt_enabled.ompt_callback_work) {
1734         ompt_callbacks.ompt_callback(ompt_callback_work)(
1735             ompt_work_single_executor, ompt_scope_begin,
1736             &(team->t.ompt_team_info.parallel_data),
1737             &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
1738             1, OMPT_GET_RETURN_ADDRESS(0));
1739       }
1740     } else {
1741       if (ompt_enabled.ompt_callback_work) {
1742         ompt_callbacks.ompt_callback(ompt_callback_work)(
1743             ompt_work_single_other, ompt_scope_begin,
1744             &(team->t.ompt_team_info.parallel_data),
1745             &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
1746             1, OMPT_GET_RETURN_ADDRESS(0));
1747         ompt_callbacks.ompt_callback(ompt_callback_work)(
1748             ompt_work_single_other, ompt_scope_end,
1749             &(team->t.ompt_team_info.parallel_data),
1750             &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
1751             1, OMPT_GET_RETURN_ADDRESS(0));
1752       }
1753     }
1754   }
1755 #endif
1756 
1757   return rc;
1758 }
1759 
1760 /*!
1761 @ingroup WORK_SHARING
1762 @param loc  source location information
1763 @param global_tid  global thread number
1764 
1765 Mark the end of a <tt>single</tt> construct.  This function should
1766 only be called by the thread that executed the block of code protected
1767 by the `single` construct.
1768 */
1769 void __kmpc_end_single(ident_t *loc, kmp_int32 global_tid) {
1770   __kmp_assert_valid_gtid(global_tid);
1771   __kmp_exit_single(global_tid);
1772   KMP_POP_PARTITIONED_TIMER();
1773 
1774 #if OMPT_SUPPORT && OMPT_OPTIONAL
1775   kmp_info_t *this_thr = __kmp_threads[global_tid];
1776   kmp_team_t *team = this_thr->th.th_team;
1777   int tid = __kmp_tid_from_gtid(global_tid);
1778 
1779   if (ompt_enabled.ompt_callback_work) {
1780     ompt_callbacks.ompt_callback(ompt_callback_work)(
1781         ompt_work_single_executor, ompt_scope_end,
1782         &(team->t.ompt_team_info.parallel_data),
1783         &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data), 1,
1784         OMPT_GET_RETURN_ADDRESS(0));
1785   }
1786 #endif
1787 }
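
/* Illustrative sketch (not part of the runtime): one way a compiler might
   lower #pragma omp single, given that, as noted above, neither
   __kmpc_single nor __kmpc_end_single contains an implicit barrier. The
   outlined function name is a placeholder; real lowering is
   compiler-specific.

   void generated_code(ident_t *loc, kmp_int32 gtid) {
     if (__kmpc_single(loc, gtid)) {
       // ... body of the single region, executed by exactly one thread ...
       __kmpc_end_single(loc, gtid);
     }
     // Without a nowait clause the compiler emits the barrier explicitly.
     __kmpc_barrier(loc, gtid);
   }
*/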
1788 
1789 /*!
1790 @ingroup WORK_SHARING
1791 @param loc Source location
1792 @param global_tid Global thread id
1793 
1794 Mark the end of a statically scheduled loop.
1795 */
1796 void __kmpc_for_static_fini(ident_t *loc, kmp_int32 global_tid) {
1797   KMP_POP_PARTITIONED_TIMER();
1798   KE_TRACE(10, ("__kmpc_for_static_fini called T#%d\n", global_tid));
1799 
1800 #if OMPT_SUPPORT && OMPT_OPTIONAL
1801   if (ompt_enabled.ompt_callback_work) {
1802     ompt_work_t ompt_work_type = ompt_work_loop;
1803     ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
1804     ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
1805     // Determine workshare type
1806     if (loc != NULL) {
1807       if ((loc->flags & KMP_IDENT_WORK_LOOP) != 0) {
1808         ompt_work_type = ompt_work_loop;
1809       } else if ((loc->flags & KMP_IDENT_WORK_SECTIONS) != 0) {
1810         ompt_work_type = ompt_work_sections;
1811       } else if ((loc->flags & KMP_IDENT_WORK_DISTRIBUTE) != 0) {
1812         ompt_work_type = ompt_work_distribute;
1813       } else {
1814         // use default set above.
1815         // a warning about this case is provided in __kmpc_for_static_init
1816       }
1817       KMP_DEBUG_ASSERT(ompt_work_type);
1818     }
1819     ompt_callbacks.ompt_callback(ompt_callback_work)(
1820         ompt_work_type, ompt_scope_end, &(team_info->parallel_data),
1821         &(task_info->task_data), 0, OMPT_GET_RETURN_ADDRESS(0));
1822   }
1823 #endif
1824   if (__kmp_env_consistency_check)
1825     __kmp_pop_workshare(global_tid, ct_pdo, loc);
1826 }
1827 
1828 // User routines which take C-style arguments (call by value), unlike the
1829 // Fortran equivalent routines
1830 
1831 void ompc_set_num_threads(int arg) {
1832   // !!!!! TODO: check the per-task binding
1833   __kmp_set_num_threads(arg, __kmp_entry_gtid());
1834 }
1835 
1836 void ompc_set_dynamic(int flag) {
1837   kmp_info_t *thread;
1838 
1839   /* For the thread-private implementation of the internal controls */
1840   thread = __kmp_entry_thread();
1841 
1842   __kmp_save_internal_controls(thread);
1843 
1844   set__dynamic(thread, flag ? true : false);
1845 }
1846 
1847 void ompc_set_nested(int flag) {
1848   kmp_info_t *thread;
1849 
1850   /* For the thread-private internal controls implementation */
1851   thread = __kmp_entry_thread();
1852 
1853   __kmp_save_internal_controls(thread);
1854 
1855   set__max_active_levels(thread, flag ? __kmp_dflt_max_active_levels : 1);
1856 }
1857 
1858 void ompc_set_max_active_levels(int max_active_levels) {
1859   /* TO DO */
1860   /* we want per-task implementation of this internal control */
1861 
1862   /* For the per-thread internal controls implementation */
1863   __kmp_set_max_active_levels(__kmp_entry_gtid(), max_active_levels);
1864 }
1865 
1866 void ompc_set_schedule(omp_sched_t kind, int modifier) {
1867   // !!!!! TODO: check the per-task binding
1868   __kmp_set_schedule(__kmp_entry_gtid(), (kmp_sched_t)kind, modifier);
1869 }
1870 
1871 int ompc_get_ancestor_thread_num(int level) {
1872   return __kmp_get_ancestor_thread_num(__kmp_entry_gtid(), level);
1873 }
1874 
1875 int ompc_get_team_size(int level) {
1876   return __kmp_get_team_size(__kmp_entry_gtid(), level);
1877 }
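
/* Example (illustrative only): the user-level queries backed by
   ompc_get_ancestor_thread_num / ompc_get_team_size. The thread counts and
   the function name are placeholders; nesting must be enabled for the inner
   region to fork.

   #include <omp.h>
   #include <stdio.h>

   void show_nesting(void) {
     omp_set_max_active_levels(2); // allow one level of nested parallelism
     #pragma omp parallel num_threads(2)
     #pragma omp parallel num_threads(3)
     {
       printf("level %d: ancestor@1=%d team_size@1=%d\n", omp_get_level(),
              omp_get_ancestor_thread_num(1), omp_get_team_size(1));
     }
   }
*/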
1878 
1879 /* OpenMP 5.0 Affinity Format API */
1880 
1881 void ompc_set_affinity_format(char const *format) {
1882   if (!__kmp_init_serial) {
1883     __kmp_serial_initialize();
1884   }
1885   __kmp_strncpy_truncate(__kmp_affinity_format, KMP_AFFINITY_FORMAT_SIZE,
1886                          format, KMP_STRLEN(format) + 1);
1887 }
1888 
1889 size_t ompc_get_affinity_format(char *buffer, size_t size) {
1890   size_t format_size;
1891   if (!__kmp_init_serial) {
1892     __kmp_serial_initialize();
1893   }
1894   format_size = KMP_STRLEN(__kmp_affinity_format);
1895   if (buffer && size) {
1896     __kmp_strncpy_truncate(buffer, size, __kmp_affinity_format,
1897                            format_size + 1);
1898   }
1899   return format_size;
1900 }
1901 
1902 void ompc_display_affinity(char const *format) {
1903   int gtid;
1904   if (!TCR_4(__kmp_init_middle)) {
1905     __kmp_middle_initialize();
1906   }
1907   gtid = __kmp_get_gtid();
1908   __kmp_aux_display_affinity(gtid, format);
1909 }
1910 
1911 size_t ompc_capture_affinity(char *buffer, size_t buf_size,
1912                              char const *format) {
1913   int gtid;
1914   size_t num_required;
1915   kmp_str_buf_t capture_buf;
1916   if (!TCR_4(__kmp_init_middle)) {
1917     __kmp_middle_initialize();
1918   }
1919   gtid = __kmp_get_gtid();
1920   __kmp_str_buf_init(&capture_buf);
1921   num_required = __kmp_aux_capture_affinity(gtid, format, &capture_buf);
1922   if (buffer && buf_size) {
1923     __kmp_strncpy_truncate(buffer, buf_size, capture_buf.str,
1924                            capture_buf.used + 1);
1925   }
1926   __kmp_str_buf_free(&capture_buf);
1927   return num_required;
1928 }
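
/* Example (illustrative only): the OpenMP 5.0 affinity-format API that the
   ompc_* entry points above implement. The format string and buffer size
   are placeholders; passing NULL as the format uses the current
   affinity-format setting.

   #include <omp.h>
   #include <stdio.h>

   void report_affinity(void) {
     omp_set_affinity_format("T#%0.2n on host %H, affinity {%A}");
     #pragma omp parallel
     {
       char buf[256];
       size_t needed = omp_capture_affinity(buf, sizeof(buf), NULL);
       if (needed < sizeof(buf)) // buf was large enough to hold the string
         printf("%s\n", buf);
     }
   }
*/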
1929 
1930 void kmpc_set_stacksize(int arg) {
1931   // __kmp_aux_set_stacksize initializes the library if needed
1932   __kmp_aux_set_stacksize(arg);
1933 }
1934 
1935 void kmpc_set_stacksize_s(size_t arg) {
1936   // __kmp_aux_set_stacksize initializes the library if needed
1937   __kmp_aux_set_stacksize(arg);
1938 }
1939 
1940 void kmpc_set_blocktime(int arg) {
1941   int gtid, tid;
1942   kmp_info_t *thread;
1943 
1944   gtid = __kmp_entry_gtid();
1945   tid = __kmp_tid_from_gtid(gtid);
1946   thread = __kmp_thread_from_gtid(gtid);
1947 
1948   __kmp_aux_set_blocktime(arg, thread, tid);
1949 }
1950 
1951 void kmpc_set_library(int arg) {
1952   // __kmp_user_set_library initializes the library if needed
1953   __kmp_user_set_library((enum library_type)arg);
1954 }
1955 
1956 void kmpc_set_defaults(char const *str) {
1957   // __kmp_aux_set_defaults initializes the library if needed
1958   __kmp_aux_set_defaults(str, KMP_STRLEN(str));
1959 }
1960 
1961 void kmpc_set_disp_num_buffers(int arg) {
1962   // ignore after initialization because some teams have already
1963   // allocated dispatch buffers
1964   if (__kmp_init_serial == 0 && arg > 0)
1965     __kmp_dispatch_num_buffers = arg;
1966 }
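
/* Example (illustrative only): user-level kmp_* extension calls backed by
   the kmpc_set_* routines above, assuming the extension prototypes from
   this implementation's omp.h. The tuning values are placeholders, not
   recommendations; the corresponding KMP_* environment variables achieve
   the same effect.

   #include <omp.h>

   void tune_runtime(void) {
     kmp_set_blocktime(0);                 // idle workers sleep immediately
     kmp_set_stacksize_s((size_t)4 << 20); // 4 MiB worker stacks
     kmp_set_defaults("KMP_AFFINITY=compact");
     #pragma omp parallel
     {
       // ... parallel work ...
     }
   }
*/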
1967 
1968 int kmpc_set_affinity_mask_proc(int proc, void **mask) {
1969 #if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED
1970   return -1;
1971 #else
1972   if (!TCR_4(__kmp_init_middle)) {
1973     __kmp_middle_initialize();
1974   }
1975   return __kmp_aux_set_affinity_mask_proc(proc, mask);
1976 #endif
1977 }
1978 
1979 int kmpc_unset_affinity_mask_proc(int proc, void **mask) {
1980 #if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED
1981   return -1;
1982 #else
1983   if (!TCR_4(__kmp_init_middle)) {
1984     __kmp_middle_initialize();
1985   }
1986   return __kmp_aux_unset_affinity_mask_proc(proc, mask);
1987 #endif
1988 }
1989 
1990 int kmpc_get_affinity_mask_proc(int proc, void **mask) {
1991 #if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED
1992   return -1;
1993 #else
1994   if (!TCR_4(__kmp_init_middle)) {
1995     __kmp_middle_initialize();
1996   }
1997   return __kmp_aux_get_affinity_mask_proc(proc, mask);
1998 #endif
1999 }
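
/* Illustrative sketch: the kmp_* affinity-mask extension that the three
   entry points above support, used here to build a mask containing a single
   logical processor. The function name and error handling are placeholders;
   exact return-value conventions are implementation-defined, and the calls
   fail (returning -1 above) when affinity is unsupported or stubbed.

   #include <omp.h>

   int pin_to_proc(int proc) {
     kmp_affinity_mask_t mask;
     kmp_create_affinity_mask(&mask);
     if (kmp_set_affinity_mask_proc(proc, &mask) != 0) {
       kmp_destroy_affinity_mask(&mask);
       return -1;
     }
     int rc = kmp_set_affinity(&mask); // bind the calling thread to the mask
     kmp_destroy_affinity_mask(&mask);
     return rc;
   }
*/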
2000 
2001 /* -------------------------------------------------------------------------- */
2002 /*!
2003 @ingroup THREADPRIVATE
2004 @param loc       source location information
2005 @param gtid      global thread number
2006 @param cpy_size  size of the cpy_data buffer
2007 @param cpy_data  pointer to data to be copied
2008 @param cpy_func  helper function to call for copying data
2009 @param didit     flag variable: 1=single thread; 0=not single thread
2010 
2011 __kmpc_copyprivate implements the interface for the private data broadcast
2012 needed for the copyprivate clause associated with a single region in an
2013 OpenMP<sup>*</sup> program (both C and Fortran).
2014 All threads participating in the parallel region call this routine.
2015 One of the threads (called the single thread) should have the <tt>didit</tt>
2016 variable set to 1 and all other threads should have that variable set to 0.
2017 All threads pass a pointer to a data buffer (cpy_data) that they have built.
2018 
2019 The OpenMP specification forbids the use of nowait on the single region when a
2020 copyprivate clause is present. However, @ref __kmpc_copyprivate implements a
2021 barrier internally to avoid race conditions, so the code generation for the
2022 single region should avoid generating a barrier after the call to @ref
2023 __kmpc_copyprivate.
2024 
2025 The <tt>gtid</tt> parameter is the global thread id for the current thread.
2026 The <tt>loc</tt> parameter is a pointer to source location information.
2027 
2028 Internal implementation: the single thread first copies its descriptor
2029 address (cpy_data) to a team-private location; the other threads then each
2030 call the function pointed to by the parameter cpy_func, which carries out
2031 the copy using the cpy_data buffer.
2032 
2033 The cpy_func routine used for the copy and the contents of the data area defined
2034 by cpy_data and cpy_size may be built in any fashion that will allow the copy
2035 to be done. For instance, the cpy_data buffer can hold the actual data to be
2036 copied or it may hold a list of pointers to the data. The cpy_func routine must
2037 interpret the cpy_data buffer appropriately.
2038 
2039 The interface to cpy_func is as follows:
2040 @code
2041 void cpy_func( void *destination, void *source )
2042 @endcode
2043 where void *destination is the cpy_data pointer for the thread being copied to
2044 and void *source is the cpy_data pointer for the thread being copied from.
2045 */
2046 void __kmpc_copyprivate(ident_t *loc, kmp_int32 gtid, size_t cpy_size,
2047                         void *cpy_data, void (*cpy_func)(void *, void *),
2048                         kmp_int32 didit) {
2049   void **data_ptr;
2050   KC_TRACE(10, ("__kmpc_copyprivate: called T#%d\n", gtid));
2051   __kmp_assert_valid_gtid(gtid);
2052 
2053   KMP_MB();
2054 
2055   data_ptr = &__kmp_team_from_gtid(gtid)->t.t_copypriv_data;
2056 
2057   if (__kmp_env_consistency_check) {
2058     if (loc == 0) {
2059       KMP_WARNING(ConstructIdentInvalid);
2060     }
2061   }
2062 
2063   // ToDo: Optimize the following two barriers into some kind of split barrier
2064 
2065   if (didit)
2066     *data_ptr = cpy_data;
2067 
2068 #if OMPT_SUPPORT
2069   ompt_frame_t *ompt_frame;
2070   if (ompt_enabled.enabled) {
2071     __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
2072     if (ompt_frame->enter_frame.ptr == NULL)
2073       ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
2074   }
2075   OMPT_STORE_RETURN_ADDRESS(gtid);
2076 #endif
2077 /* This barrier is not a barrier region boundary */
2078 #if USE_ITT_NOTIFY
2079   __kmp_threads[gtid]->th.th_ident = loc;
2080 #endif
2081   __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);
2082 
2083   if (!didit)
2084     (*cpy_func)(cpy_data, *data_ptr);
2085 
2086 // Consider next barrier a user-visible barrier for barrier region boundaries
2087 // Nesting checks are already handled by the single construct checks
2088   {
2089 #if OMPT_SUPPORT
2090     OMPT_STORE_RETURN_ADDRESS(gtid);
2091 #endif
2092 #if USE_ITT_NOTIFY
2093     __kmp_threads[gtid]->th.th_ident = loc; // TODO: check if it is needed
2094     // (e.g. tasks can overwrite the location)
2095 #endif
2096     __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);
2097 #if OMPT_SUPPORT && OMPT_OPTIONAL
2098     if (ompt_enabled.enabled) {
2099       ompt_frame->enter_frame = ompt_data_none;
2100     }
2101 #endif
2102   }
2103 }
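
/* Example (illustrative only): the user-level construct this entry point
   implements. The copyprivate clause broadcasts the single thread's value
   of token to the private copies in every other thread of the team. Names
   are placeholders.

   #include <omp.h>
   #include <stdio.h>

   void broadcast_from_one(void) {
     int token;
     #pragma omp parallel private(token)
     {
       #pragma omp single copyprivate(token)
       { token = 42; }
       // Every thread now sees token == 42 in its private copy.
       printf("T#%d got %d\n", omp_get_thread_num(), token);
     }
   }
*/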
2104 
2105 /* -------------------------------------------------------------------------- */
2106 
2107 #define INIT_LOCK __kmp_init_user_lock_with_checks
2108 #define INIT_NESTED_LOCK __kmp_init_nested_user_lock_with_checks
2109 #define ACQUIRE_LOCK __kmp_acquire_user_lock_with_checks
2110 #define ACQUIRE_LOCK_TIMED __kmp_acquire_user_lock_with_checks_timed
2111 #define ACQUIRE_NESTED_LOCK __kmp_acquire_nested_user_lock_with_checks
2112 #define ACQUIRE_NESTED_LOCK_TIMED                                              \
2113   __kmp_acquire_nested_user_lock_with_checks_timed
2114 #define RELEASE_LOCK __kmp_release_user_lock_with_checks
2115 #define RELEASE_NESTED_LOCK __kmp_release_nested_user_lock_with_checks
2116 #define TEST_LOCK __kmp_test_user_lock_with_checks
2117 #define TEST_NESTED_LOCK __kmp_test_nested_user_lock_with_checks
2118 #define DESTROY_LOCK __kmp_destroy_user_lock_with_checks
2119 #define DESTROY_NESTED_LOCK __kmp_destroy_nested_user_lock_with_checks
2120 
2121 // TODO: Make check abort messages use location info & pass it into
2122 // with_checks routines
2123 
2124 #if KMP_USE_DYNAMIC_LOCK
2125 
2126 // internal lock initializer
2127 static __forceinline void __kmp_init_lock_with_hint(ident_t *loc, void **lock,
2128                                                     kmp_dyna_lockseq_t seq) {
2129   if (KMP_IS_D_LOCK(seq)) {
2130     KMP_INIT_D_LOCK(lock, seq);
2131 #if USE_ITT_BUILD
2132     __kmp_itt_lock_creating((kmp_user_lock_p)lock, NULL);
2133 #endif
2134   } else {
2135     KMP_INIT_I_LOCK(lock, seq);
2136 #if USE_ITT_BUILD
2137     kmp_indirect_lock_t *ilk = KMP_LOOKUP_I_LOCK(lock);
2138     __kmp_itt_lock_creating(ilk->lock, loc);
2139 #endif
2140   }
2141 }
2142 
2143 // internal nest lock initializer
2144 static __forceinline void
2145 __kmp_init_nest_lock_with_hint(ident_t *loc, void **lock,
2146                                kmp_dyna_lockseq_t seq) {
2147 #if KMP_USE_TSX
2148   // Don't have nested lock implementation for speculative locks
2149   if (seq == lockseq_hle || seq == lockseq_rtm_queuing ||
2150       seq == lockseq_rtm_spin || seq == lockseq_adaptive)
2151     seq = __kmp_user_lock_seq;
2152 #endif
2153   switch (seq) {
2154   case lockseq_tas:
2155     seq = lockseq_nested_tas;
2156     break;
2157 #if KMP_USE_FUTEX
2158   case lockseq_futex:
2159     seq = lockseq_nested_futex;
2160     break;
2161 #endif
2162   case lockseq_ticket:
2163     seq = lockseq_nested_ticket;
2164     break;
2165   case lockseq_queuing:
2166     seq = lockseq_nested_queuing;
2167     break;
2168   case lockseq_drdpa:
2169     seq = lockseq_nested_drdpa;
2170     break;
2171   default:
2172     seq = lockseq_nested_queuing;
2173   }
2174   KMP_INIT_I_LOCK(lock, seq);
2175 #if USE_ITT_BUILD
2176   kmp_indirect_lock_t *ilk = KMP_LOOKUP_I_LOCK(lock);
2177   __kmp_itt_lock_creating(ilk->lock, loc);
2178 #endif
2179 }
2180 
2181 /* initialize the lock with a hint */
2182 void __kmpc_init_lock_with_hint(ident_t *loc, kmp_int32 gtid, void **user_lock,
2183                                 uintptr_t hint) {
2184   KMP_DEBUG_ASSERT(__kmp_init_serial);
2185   if (__kmp_env_consistency_check && user_lock == NULL) {
2186     KMP_FATAL(LockIsUninitialized, "omp_init_lock_with_hint");
2187   }
2188 
2189   __kmp_init_lock_with_hint(loc, user_lock, __kmp_map_hint_to_lock(hint));
2190 
2191 #if OMPT_SUPPORT && OMPT_OPTIONAL
2192   // This is the case, if called from omp_init_lock_with_hint:
2193   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2194   if (!codeptr)
2195     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2196   if (ompt_enabled.ompt_callback_lock_init) {
2197     ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2198         ompt_mutex_lock, (omp_lock_hint_t)hint,
2199         __ompt_get_mutex_impl_type(user_lock),
2200         (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2201   }
2202 #endif
2203 }
2204 
2205 /* initialize the lock with a hint */
2206 void __kmpc_init_nest_lock_with_hint(ident_t *loc, kmp_int32 gtid,
2207                                      void **user_lock, uintptr_t hint) {
2208   KMP_DEBUG_ASSERT(__kmp_init_serial);
2209   if (__kmp_env_consistency_check && user_lock == NULL) {
2210     KMP_FATAL(LockIsUninitialized, "omp_init_nest_lock_with_hint");
2211   }
2212 
2213   __kmp_init_nest_lock_with_hint(loc, user_lock, __kmp_map_hint_to_lock(hint));
2214 
2215 #if OMPT_SUPPORT && OMPT_OPTIONAL
2216   // This is the case, if called from omp_init_lock_with_hint:
2217   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2218   if (!codeptr)
2219     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2220   if (ompt_enabled.ompt_callback_lock_init) {
2221     ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2222         ompt_mutex_nest_lock, (omp_lock_hint_t)hint,
2223         __ompt_get_mutex_impl_type(user_lock),
2224         (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2225   }
2226 #endif
2227 }
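
/* Example (illustrative only): the user-level API these two entry points
   back (see the comments referencing omp_init_lock_with_hint above). Lock
   names are placeholders; hints are advisory and the runtime maps them
   through __kmp_map_hint_to_lock, possibly falling back to the default
   lock kind.

   #include <omp.h>

   omp_lock_t cache_lock;
   omp_nest_lock_t tree_lock;

   void init_locks(void) {
     omp_init_lock_with_hint(&cache_lock, omp_sync_hint_contended);
     omp_init_nest_lock_with_hint(&tree_lock, omp_sync_hint_uncontended);
   }
*/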
2228 
2229 #endif // KMP_USE_DYNAMIC_LOCK
2230 
2231 /* initialize the lock */
2232 void __kmpc_init_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2233 #if KMP_USE_DYNAMIC_LOCK
2234 
2235   KMP_DEBUG_ASSERT(__kmp_init_serial);
2236   if (__kmp_env_consistency_check && user_lock == NULL) {
2237     KMP_FATAL(LockIsUninitialized, "omp_init_lock");
2238   }
2239   __kmp_init_lock_with_hint(loc, user_lock, __kmp_user_lock_seq);
2240 
2241 #if OMPT_SUPPORT && OMPT_OPTIONAL
2242   // This is the case, if called from omp_init_lock_with_hint:
2243   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2244   if (!codeptr)
2245     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2246   if (ompt_enabled.ompt_callback_lock_init) {
2247     ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2248         ompt_mutex_lock, omp_lock_hint_none,
2249         __ompt_get_mutex_impl_type(user_lock),
2250         (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2251   }
2252 #endif
2253 
2254 #else // KMP_USE_DYNAMIC_LOCK
2255 
2256   static char const *const func = "omp_init_lock";
2257   kmp_user_lock_p lck;
2258   KMP_DEBUG_ASSERT(__kmp_init_serial);
2259 
2260   if (__kmp_env_consistency_check) {
2261     if (user_lock == NULL) {
2262       KMP_FATAL(LockIsUninitialized, func);
2263     }
2264   }
2265 
2266   KMP_CHECK_USER_LOCK_INIT();
2267 
2268   if ((__kmp_user_lock_kind == lk_tas) &&
2269       (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
2270     lck = (kmp_user_lock_p)user_lock;
2271   }
2272 #if KMP_USE_FUTEX
2273   else if ((__kmp_user_lock_kind == lk_futex) &&
2274            (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
2275     lck = (kmp_user_lock_p)user_lock;
2276   }
2277 #endif
2278   else {
2279     lck = __kmp_user_lock_allocate(user_lock, gtid, 0);
2280   }
2281   INIT_LOCK(lck);
2282   __kmp_set_user_lock_location(lck, loc);
2283 
2284 #if OMPT_SUPPORT && OMPT_OPTIONAL
2285   // This is the case, if called from omp_init_lock_with_hint:
2286   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2287   if (!codeptr)
2288     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2289   if (ompt_enabled.ompt_callback_lock_init) {
2290     ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2291         ompt_mutex_lock, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
2292         (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2293   }
2294 #endif
2295 
2296 #if USE_ITT_BUILD
2297   __kmp_itt_lock_creating(lck);
2298 #endif /* USE_ITT_BUILD */
2299 
2300 #endif // KMP_USE_DYNAMIC_LOCK
2301 } // __kmpc_init_lock
2302 
2303 /* initialize the lock */
2304 void __kmpc_init_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2305 #if KMP_USE_DYNAMIC_LOCK
2306 
2307   KMP_DEBUG_ASSERT(__kmp_init_serial);
2308   if (__kmp_env_consistency_check && user_lock == NULL) {
2309     KMP_FATAL(LockIsUninitialized, "omp_init_nest_lock");
2310   }
2311   __kmp_init_nest_lock_with_hint(loc, user_lock, __kmp_user_lock_seq);
2312 
2313 #if OMPT_SUPPORT && OMPT_OPTIONAL
2314   // This is the case, if called from omp_init_lock_with_hint:
2315   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2316   if (!codeptr)
2317     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2318   if (ompt_enabled.ompt_callback_lock_init) {
2319     ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2320         ompt_mutex_nest_lock, omp_lock_hint_none,
2321         __ompt_get_mutex_impl_type(user_lock),
2322         (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2323   }
2324 #endif
2325 
2326 #else // KMP_USE_DYNAMIC_LOCK
2327 
2328   static char const *const func = "omp_init_nest_lock";
2329   kmp_user_lock_p lck;
2330   KMP_DEBUG_ASSERT(__kmp_init_serial);
2331 
2332   if (__kmp_env_consistency_check) {
2333     if (user_lock == NULL) {
2334       KMP_FATAL(LockIsUninitialized, func);
2335     }
2336   }
2337 
2338   KMP_CHECK_USER_LOCK_INIT();
2339 
2340   if ((__kmp_user_lock_kind == lk_tas) &&
2341       (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
2342        OMP_NEST_LOCK_T_SIZE)) {
2343     lck = (kmp_user_lock_p)user_lock;
2344   }
2345 #if KMP_USE_FUTEX
2346   else if ((__kmp_user_lock_kind == lk_futex) &&
2347            (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
2348             OMP_NEST_LOCK_T_SIZE)) {
2349     lck = (kmp_user_lock_p)user_lock;
2350   }
2351 #endif
2352   else {
2353     lck = __kmp_user_lock_allocate(user_lock, gtid, 0);
2354   }
2355 
2356   INIT_NESTED_LOCK(lck);
2357   __kmp_set_user_lock_location(lck, loc);
2358 
2359 #if OMPT_SUPPORT && OMPT_OPTIONAL
2360   // This is the case, if called from omp_init_lock_with_hint:
2361   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2362   if (!codeptr)
2363     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2364   if (ompt_enabled.ompt_callback_lock_init) {
2365     ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2366         ompt_mutex_nest_lock, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
2367         (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2368   }
2369 #endif
2370 
2371 #if USE_ITT_BUILD
2372   __kmp_itt_lock_creating(lck);
2373 #endif /* USE_ITT_BUILD */
2374 
2375 #endif // KMP_USE_DYNAMIC_LOCK
2376 } // __kmpc_init_nest_lock
2377 
2378 void __kmpc_destroy_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2379 #if KMP_USE_DYNAMIC_LOCK
2380 
2381 #if USE_ITT_BUILD
2382   kmp_user_lock_p lck;
2383   if (KMP_EXTRACT_D_TAG(user_lock) == 0) {
2384     lck = ((kmp_indirect_lock_t *)KMP_LOOKUP_I_LOCK(user_lock))->lock;
2385   } else {
2386     lck = (kmp_user_lock_p)user_lock;
2387   }
2388   __kmp_itt_lock_destroyed(lck);
2389 #endif
2390 #if OMPT_SUPPORT && OMPT_OPTIONAL
2391   // This is the case, if called from omp_init_lock_with_hint:
2392   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2393   if (!codeptr)
2394     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2395   if (ompt_enabled.ompt_callback_lock_destroy) {
2396     kmp_user_lock_p lck;
2397     if (KMP_EXTRACT_D_TAG(user_lock) == 0) {
2398       lck = ((kmp_indirect_lock_t *)KMP_LOOKUP_I_LOCK(user_lock))->lock;
2399     } else {
2400       lck = (kmp_user_lock_p)user_lock;
2401     }
2402     ompt_callbacks.ompt_callback(ompt_callback_lock_destroy)(
2403         ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2404   }
2405 #endif
2406   KMP_D_LOCK_FUNC(user_lock, destroy)((kmp_dyna_lock_t *)user_lock);
2407 #else
2408   kmp_user_lock_p lck;
2409 
2410   if ((__kmp_user_lock_kind == lk_tas) &&
2411       (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
2412     lck = (kmp_user_lock_p)user_lock;
2413   }
2414 #if KMP_USE_FUTEX
2415   else if ((__kmp_user_lock_kind == lk_futex) &&
2416            (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
2417     lck = (kmp_user_lock_p)user_lock;
2418   }
2419 #endif
2420   else {
2421     lck = __kmp_lookup_user_lock(user_lock, "omp_destroy_lock");
2422   }
2423 
2424 #if OMPT_SUPPORT && OMPT_OPTIONAL
2425   // This is the case, if called from omp_init_lock_with_hint:
2426   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2427   if (!codeptr)
2428     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2429   if (ompt_enabled.ompt_callback_lock_destroy) {
2430     ompt_callbacks.ompt_callback(ompt_callback_lock_destroy)(
2431         ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2432   }
2433 #endif
2434 
2435 #if USE_ITT_BUILD
2436   __kmp_itt_lock_destroyed(lck);
2437 #endif /* USE_ITT_BUILD */
2438   DESTROY_LOCK(lck);
2439 
2440   if ((__kmp_user_lock_kind == lk_tas) &&
2441       (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
2442     ;
2443   }
2444 #if KMP_USE_FUTEX
2445   else if ((__kmp_user_lock_kind == lk_futex) &&
2446            (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
2447     ;
2448   }
2449 #endif
2450   else {
2451     __kmp_user_lock_free(user_lock, gtid, lck);
2452   }
2453 #endif // KMP_USE_DYNAMIC_LOCK
2454 } // __kmpc_destroy_lock
2455 
2456 /* destroy the lock */
2457 void __kmpc_destroy_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2458 #if KMP_USE_DYNAMIC_LOCK
2459 
2460 #if USE_ITT_BUILD
2461   kmp_indirect_lock_t *ilk = KMP_LOOKUP_I_LOCK(user_lock);
2462   __kmp_itt_lock_destroyed(ilk->lock);
2463 #endif
2464 #if OMPT_SUPPORT && OMPT_OPTIONAL
2465   // This is the case, if called from omp_init_lock_with_hint:
2466   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2467   if (!codeptr)
2468     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2469   if (ompt_enabled.ompt_callback_lock_destroy) {
2470     ompt_callbacks.ompt_callback(ompt_callback_lock_destroy)(
2471         ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2472   }
2473 #endif
2474   KMP_D_LOCK_FUNC(user_lock, destroy)((kmp_dyna_lock_t *)user_lock);
2475 
2476 #else // KMP_USE_DYNAMIC_LOCK
2477 
2478   kmp_user_lock_p lck;
2479 
2480   if ((__kmp_user_lock_kind == lk_tas) &&
2481       (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
2482        OMP_NEST_LOCK_T_SIZE)) {
2483     lck = (kmp_user_lock_p)user_lock;
2484   }
2485 #if KMP_USE_FUTEX
2486   else if ((__kmp_user_lock_kind == lk_futex) &&
2487            (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
2488             OMP_NEST_LOCK_T_SIZE)) {
2489     lck = (kmp_user_lock_p)user_lock;
2490   }
2491 #endif
2492   else {
2493     lck = __kmp_lookup_user_lock(user_lock, "omp_destroy_nest_lock");
2494   }
2495 
2496 #if OMPT_SUPPORT && OMPT_OPTIONAL
2497   // This is the case, if called from omp_init_lock_with_hint:
2498   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2499   if (!codeptr)
2500     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2501   if (ompt_enabled.ompt_callback_lock_destroy) {
2502     ompt_callbacks.ompt_callback(ompt_callback_lock_destroy)(
2503         ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2504   }
2505 #endif
2506 
2507 #if USE_ITT_BUILD
2508   __kmp_itt_lock_destroyed(lck);
2509 #endif /* USE_ITT_BUILD */
2510 
2511   DESTROY_NESTED_LOCK(lck);
2512 
2513   if ((__kmp_user_lock_kind == lk_tas) &&
2514       (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
2515        OMP_NEST_LOCK_T_SIZE)) {
2516     ;
2517   }
2518 #if KMP_USE_FUTEX
2519   else if ((__kmp_user_lock_kind == lk_futex) &&
2520            (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
2521             OMP_NEST_LOCK_T_SIZE)) {
2522     ;
2523   }
2524 #endif
2525   else {
2526     __kmp_user_lock_free(user_lock, gtid, lck);
2527   }
2528 #endif // KMP_USE_DYNAMIC_LOCK
2529 } // __kmpc_destroy_nest_lock
2530 
2531 void __kmpc_set_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2532   KMP_COUNT_BLOCK(OMP_set_lock);
2533 #if KMP_USE_DYNAMIC_LOCK
2534   int tag = KMP_EXTRACT_D_TAG(user_lock);
2535 #if USE_ITT_BUILD
2536   __kmp_itt_lock_acquiring(
2537       (kmp_user_lock_p)
2538           user_lock); // itt function will get to the right lock object.
2539 #endif
2540 #if OMPT_SUPPORT && OMPT_OPTIONAL
2541   // This is the case, if called from omp_init_lock_with_hint:
2542   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2543   if (!codeptr)
2544     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2545   if (ompt_enabled.ompt_callback_mutex_acquire) {
2546     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
2547         ompt_mutex_lock, omp_lock_hint_none,
2548         __ompt_get_mutex_impl_type(user_lock),
2549         (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2550   }
2551 #endif
2552 #if KMP_USE_INLINED_TAS
2553   if (tag == locktag_tas && !__kmp_env_consistency_check) {
2554     KMP_ACQUIRE_TAS_LOCK(user_lock, gtid);
2555   } else
2556 #elif KMP_USE_INLINED_FUTEX
2557   if (tag == locktag_futex && !__kmp_env_consistency_check) {
2558     KMP_ACQUIRE_FUTEX_LOCK(user_lock, gtid);
2559   } else
2560 #endif
2561   {
2562     __kmp_direct_set[tag]((kmp_dyna_lock_t *)user_lock, gtid);
2563   }
2564 #if USE_ITT_BUILD
2565   __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock);
2566 #endif
2567 #if OMPT_SUPPORT && OMPT_OPTIONAL
2568   if (ompt_enabled.ompt_callback_mutex_acquired) {
2569     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
2570         ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2571   }
2572 #endif
2573 
2574 #else // KMP_USE_DYNAMIC_LOCK
2575 
2576   kmp_user_lock_p lck;
2577 
2578   if ((__kmp_user_lock_kind == lk_tas) &&
2579       (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
2580     lck = (kmp_user_lock_p)user_lock;
2581   }
2582 #if KMP_USE_FUTEX
2583   else if ((__kmp_user_lock_kind == lk_futex) &&
2584            (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
2585     lck = (kmp_user_lock_p)user_lock;
2586   }
2587 #endif
2588   else {
2589     lck = __kmp_lookup_user_lock(user_lock, "omp_set_lock");
2590   }
2591 
2592 #if USE_ITT_BUILD
2593   __kmp_itt_lock_acquiring(lck);
2594 #endif /* USE_ITT_BUILD */
2595 #if OMPT_SUPPORT && OMPT_OPTIONAL
2596   // This is the case, if called from omp_init_lock_with_hint:
2597   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2598   if (!codeptr)
2599     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2600   if (ompt_enabled.ompt_callback_mutex_acquire) {
2601     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
2602         ompt_mutex_lock, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
2603         (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2604   }
2605 #endif
2606 
2607   ACQUIRE_LOCK(lck, gtid);
2608 
2609 #if USE_ITT_BUILD
2610   __kmp_itt_lock_acquired(lck);
2611 #endif /* USE_ITT_BUILD */
2612 
2613 #if OMPT_SUPPORT && OMPT_OPTIONAL
2614   if (ompt_enabled.ompt_callback_mutex_acquired) {
2615     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
2616         ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2617   }
2618 #endif
2619 
2620 #endif // KMP_USE_DYNAMIC_LOCK
2621 }
2622 
2623 void __kmpc_set_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2624 #if KMP_USE_DYNAMIC_LOCK
2625 
2626 #if USE_ITT_BUILD
2627   __kmp_itt_lock_acquiring((kmp_user_lock_p)user_lock);
2628 #endif
2629 #if OMPT_SUPPORT && OMPT_OPTIONAL
2630   // This is the case, if called from omp_init_lock_with_hint:
2631   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2632   if (!codeptr)
2633     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2634   if (ompt_enabled.enabled) {
2635     if (ompt_enabled.ompt_callback_mutex_acquire) {
2636       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
2637           ompt_mutex_nest_lock, omp_lock_hint_none,
2638           __ompt_get_mutex_impl_type(user_lock),
2639           (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2640     }
2641   }
2642 #endif
2643   int acquire_status =
2644       KMP_D_LOCK_FUNC(user_lock, set)((kmp_dyna_lock_t *)user_lock, gtid);
2645   (void) acquire_status;
2646 #if USE_ITT_BUILD
2647   __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock);
2648 #endif
2649 
2650 #if OMPT_SUPPORT && OMPT_OPTIONAL
2651   if (ompt_enabled.enabled) {
2652     if (acquire_status == KMP_LOCK_ACQUIRED_FIRST) {
2653       if (ompt_enabled.ompt_callback_mutex_acquired) {
2654         // lock_first
2655         ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
2656             ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)user_lock,
2657             codeptr);
2658       }
2659     } else {
2660       if (ompt_enabled.ompt_callback_nest_lock) {
2661         // lock_next
2662         ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
2663             ompt_scope_begin, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2664       }
2665     }
2666   }
2667 #endif
2668 
2669 #else // KMP_USE_DYNAMIC_LOCK
2670   int acquire_status;
2671   kmp_user_lock_p lck;
2672 
2673   if ((__kmp_user_lock_kind == lk_tas) &&
2674       (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
2675        OMP_NEST_LOCK_T_SIZE)) {
2676     lck = (kmp_user_lock_p)user_lock;
2677   }
2678 #if KMP_USE_FUTEX
2679   else if ((__kmp_user_lock_kind == lk_futex) &&
2680            (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
2681             OMP_NEST_LOCK_T_SIZE)) {
2682     lck = (kmp_user_lock_p)user_lock;
2683   }
2684 #endif
2685   else {
2686     lck = __kmp_lookup_user_lock(user_lock, "omp_set_nest_lock");
2687   }
2688 
2689 #if USE_ITT_BUILD
2690   __kmp_itt_lock_acquiring(lck);
2691 #endif /* USE_ITT_BUILD */
2692 #if OMPT_SUPPORT && OMPT_OPTIONAL
2693   // This is the case, if called from omp_init_lock_with_hint:
2694   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2695   if (!codeptr)
2696     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2697   if (ompt_enabled.enabled) {
2698     if (ompt_enabled.ompt_callback_mutex_acquire) {
2699       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
2700           ompt_mutex_nest_lock, omp_lock_hint_none,
2701           __ompt_get_mutex_impl_type(), (ompt_wait_id_t)(uintptr_t)lck,
2702           codeptr);
2703     }
2704   }
2705 #endif
2706 
2707   ACQUIRE_NESTED_LOCK(lck, gtid, &acquire_status);
2708 
2709 #if USE_ITT_BUILD
2710   __kmp_itt_lock_acquired(lck);
2711 #endif /* USE_ITT_BUILD */
2712 
2713 #if OMPT_SUPPORT && OMPT_OPTIONAL
2714   if (ompt_enabled.enabled) {
2715     if (acquire_status == KMP_LOCK_ACQUIRED_FIRST) {
2716       if (ompt_enabled.ompt_callback_mutex_acquired) {
2717         // lock_first
2718         ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
2719             ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2720       }
2721     } else {
2722       if (ompt_enabled.ompt_callback_nest_lock) {
2723         // lock_next
2724         ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
2725             ompt_scope_begin, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2726       }
2727     }
2728   }
2729 #endif
2730 
2731 #endif // KMP_USE_DYNAMIC_LOCK
2732 }
2733 
2734 void __kmpc_unset_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2735 #if KMP_USE_DYNAMIC_LOCK
2736 
2737   int tag = KMP_EXTRACT_D_TAG(user_lock);
2738 #if USE_ITT_BUILD
2739   __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock);
2740 #endif
2741 #if KMP_USE_INLINED_TAS
2742   if (tag == locktag_tas && !__kmp_env_consistency_check) {
2743     KMP_RELEASE_TAS_LOCK(user_lock, gtid);
2744   } else
2745 #elif KMP_USE_INLINED_FUTEX
2746   if (tag == locktag_futex && !__kmp_env_consistency_check) {
2747     KMP_RELEASE_FUTEX_LOCK(user_lock, gtid);
2748   } else
2749 #endif
2750   {
2751     __kmp_direct_unset[tag]((kmp_dyna_lock_t *)user_lock, gtid);
2752   }
2753 
2754 #if OMPT_SUPPORT && OMPT_OPTIONAL
2755   // This is the case, if called from omp_init_lock_with_hint:
2756   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2757   if (!codeptr)
2758     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2759   if (ompt_enabled.ompt_callback_mutex_released) {
2760     ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
2761         ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2762   }
2763 #endif
2764 
2765 #else // KMP_USE_DYNAMIC_LOCK
2766 
2767   kmp_user_lock_p lck;
2768 
2769   /* Can't use serial interval since not block structured */
2770   /* release the lock */
2771 
2772   if ((__kmp_user_lock_kind == lk_tas) &&
2773       (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
2774 #if KMP_OS_LINUX &&                                                            \
2775     (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
2776 // "fast" path implemented to fix customer performance issue
2777 #if USE_ITT_BUILD
2778     __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock);
2779 #endif /* USE_ITT_BUILD */
2780     TCW_4(((kmp_user_lock_p)user_lock)->tas.lk.poll, 0);
2781     KMP_MB();
2782 
2783 #if OMPT_SUPPORT && OMPT_OPTIONAL
2784     // This is the case, if called from omp_init_lock_with_hint:
2785     void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2786     if (!codeptr)
2787       codeptr = OMPT_GET_RETURN_ADDRESS(0);
2788     if (ompt_enabled.ompt_callback_mutex_released) {
2789       ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
2790           ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2791     }
2792 #endif
2793 
2794     return;
2795 #else
2796     lck = (kmp_user_lock_p)user_lock;
2797 #endif
2798   }
2799 #if KMP_USE_FUTEX
2800   else if ((__kmp_user_lock_kind == lk_futex) &&
2801            (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
2802     lck = (kmp_user_lock_p)user_lock;
2803   }
2804 #endif
2805   else {
2806     lck = __kmp_lookup_user_lock(user_lock, "omp_unset_lock");
2807   }
2808 
2809 #if USE_ITT_BUILD
2810   __kmp_itt_lock_releasing(lck);
2811 #endif /* USE_ITT_BUILD */
2812 
2813   RELEASE_LOCK(lck, gtid);
2814 
2815 #if OMPT_SUPPORT && OMPT_OPTIONAL
2816   // This is the case, if called from omp_init_lock_with_hint:
2817   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2818   if (!codeptr)
2819     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2820   if (ompt_enabled.ompt_callback_mutex_released) {
2821     ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
2822         ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2823   }
2824 #endif
2825 
2826 #endif // KMP_USE_DYNAMIC_LOCK
2827 }
2828 
2829 /* release the lock */
2830 void __kmpc_unset_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2831 #if KMP_USE_DYNAMIC_LOCK
2832 
2833 #if USE_ITT_BUILD
2834   __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock);
2835 #endif
2836   int release_status =
2837       KMP_D_LOCK_FUNC(user_lock, unset)((kmp_dyna_lock_t *)user_lock, gtid);
2838   (void) release_status;
2839 
2840 #if OMPT_SUPPORT && OMPT_OPTIONAL
2841   // This is the case, if called from omp_init_lock_with_hint:
2842   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2843   if (!codeptr)
2844     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2845   if (ompt_enabled.enabled) {
2846     if (release_status == KMP_LOCK_RELEASED) {
2847       if (ompt_enabled.ompt_callback_mutex_released) {
2848         // release_lock_last
2849         ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
2850             ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)user_lock,
2851             codeptr);
2852       }
2853     } else if (ompt_enabled.ompt_callback_nest_lock) {
2854       // release_lock_prev
2855       ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
2856           ompt_scope_end, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2857     }
2858   }
2859 #endif
2860 
2861 #else // KMP_USE_DYNAMIC_LOCK
2862 
2863   kmp_user_lock_p lck;
2864 
2865   /* Can't use serial interval since not block structured */
2866 
2867   if ((__kmp_user_lock_kind == lk_tas) &&
2868       (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
2869        OMP_NEST_LOCK_T_SIZE)) {
2870 #if KMP_OS_LINUX &&                                                            \
2871     (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
2872     // "fast" path implemented to fix customer performance issue
2873     kmp_tas_lock_t *tl = (kmp_tas_lock_t *)user_lock;
2874 #if USE_ITT_BUILD
2875     __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock);
2876 #endif /* USE_ITT_BUILD */
2877 
2878 #if OMPT_SUPPORT && OMPT_OPTIONAL
2879     int release_status = KMP_LOCK_STILL_HELD;
2880 #endif
2881 
2882     if (--(tl->lk.depth_locked) == 0) {
2883       TCW_4(tl->lk.poll, 0);
2884 #if OMPT_SUPPORT && OMPT_OPTIONAL
2885       release_status = KMP_LOCK_RELEASED;
2886 #endif
2887     }
2888     KMP_MB();
2889 
2890 #if OMPT_SUPPORT && OMPT_OPTIONAL
2891     // This is the case, if called from omp_init_lock_with_hint:
2892     void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2893     if (!codeptr)
2894       codeptr = OMPT_GET_RETURN_ADDRESS(0);
2895     if (ompt_enabled.enabled) {
2896       if (release_status == KMP_LOCK_RELEASED) {
2897         if (ompt_enabled.ompt_callback_mutex_released) {
2898           // release_lock_last
2899           ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
2900               ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2901         }
2902       } else if (ompt_enabled.ompt_callback_nest_lock) {
2903         // release_lock_previous
2904         ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
2905             ompt_mutex_scope_end, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2906       }
2907     }
2908 #endif
2909 
2910     return;
2911 #else
2912     lck = (kmp_user_lock_p)user_lock;
2913 #endif
2914   }
2915 #if KMP_USE_FUTEX
2916   else if ((__kmp_user_lock_kind == lk_futex) &&
2917            (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
2918             OMP_NEST_LOCK_T_SIZE)) {
2919     lck = (kmp_user_lock_p)user_lock;
2920   }
2921 #endif
2922   else {
2923     lck = __kmp_lookup_user_lock(user_lock, "omp_unset_nest_lock");
2924   }
2925 
2926 #if USE_ITT_BUILD
2927   __kmp_itt_lock_releasing(lck);
2928 #endif /* USE_ITT_BUILD */
2929 
2930   int release_status;
2931   release_status = RELEASE_NESTED_LOCK(lck, gtid);
2932 #if OMPT_SUPPORT && OMPT_OPTIONAL
2933   // This is the case, if called from omp_init_lock_with_hint:
2934   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2935   if (!codeptr)
2936     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2937   if (ompt_enabled.enabled) {
2938     if (release_status == KMP_LOCK_RELEASED) {
2939       if (ompt_enabled.ompt_callback_mutex_released) {
2940         // release_lock_last
2941         ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
2942             ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2943       }
2944     } else if (ompt_enabled.ompt_callback_nest_lock) {
2945       // release_lock_previous
2946       ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
2947           ompt_mutex_scope_end, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2948     }
2949   }
2950 #endif
2951 
2952 #endif // KMP_USE_DYNAMIC_LOCK
2953 }
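
/* Example (illustrative only): user-level nestable-lock usage of the kind
   serviced by __kmpc_set_nest_lock / __kmpc_unset_nest_lock. Names are
   placeholders, and obj_lock is assumed to have been initialized elsewhere
   with omp_init_nest_lock.

   #include <omp.h>

   extern omp_nest_lock_t obj_lock;

   void update(int depth) {
     // A nestable lock may be re-acquired by its owner; each successful
     // set must be matched by an unset before other threads can acquire it.
     omp_set_nest_lock(&obj_lock);
     if (depth > 0)
       update(depth - 1);
     omp_unset_nest_lock(&obj_lock);
   }
*/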
2954 
2955 /* try to acquire the lock */
2956 int __kmpc_test_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2957   KMP_COUNT_BLOCK(OMP_test_lock);
2958 
2959 #if KMP_USE_DYNAMIC_LOCK
2960   int rc;
2961   int tag = KMP_EXTRACT_D_TAG(user_lock);
2962 #if USE_ITT_BUILD
2963   __kmp_itt_lock_acquiring((kmp_user_lock_p)user_lock);
2964 #endif
2965 #if OMPT_SUPPORT && OMPT_OPTIONAL
2966   // This is the case, if called from omp_init_lock_with_hint:
2967   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2968   if (!codeptr)
2969     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2970   if (ompt_enabled.ompt_callback_mutex_acquire) {
2971     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
2972         ompt_mutex_lock, omp_lock_hint_none,
2973         __ompt_get_mutex_impl_type(user_lock),
2974         (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2975   }
2976 #endif
2977 #if KMP_USE_INLINED_TAS
2978   if (tag == locktag_tas && !__kmp_env_consistency_check) {
2979     KMP_TEST_TAS_LOCK(user_lock, gtid, rc);
2980   } else
2981 #elif KMP_USE_INLINED_FUTEX
2982   if (tag == locktag_futex && !__kmp_env_consistency_check) {
2983     KMP_TEST_FUTEX_LOCK(user_lock, gtid, rc);
2984   } else
2985 #endif
2986   {
2987     rc = __kmp_direct_test[tag]((kmp_dyna_lock_t *)user_lock, gtid);
2988   }
2989   if (rc) {
2990 #if USE_ITT_BUILD
2991     __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock);
2992 #endif
2993 #if OMPT_SUPPORT && OMPT_OPTIONAL
2994     if (ompt_enabled.ompt_callback_mutex_acquired) {
2995       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
2996           ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2997     }
2998 #endif
2999     return FTN_TRUE;
3000   } else {
3001 #if USE_ITT_BUILD
3002     __kmp_itt_lock_cancelled((kmp_user_lock_p)user_lock);
3003 #endif
3004     return FTN_FALSE;
3005   }
3006 
3007 #else // KMP_USE_DYNAMIC_LOCK
3008 
3009   kmp_user_lock_p lck;
3010   int rc;
3011 
3012   if ((__kmp_user_lock_kind == lk_tas) &&
3013       (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
3014     lck = (kmp_user_lock_p)user_lock;
3015   }
3016 #if KMP_USE_FUTEX
3017   else if ((__kmp_user_lock_kind == lk_futex) &&
3018            (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
3019     lck = (kmp_user_lock_p)user_lock;
3020   }
3021 #endif
3022   else {
3023     lck = __kmp_lookup_user_lock(user_lock, "omp_test_lock");
3024   }
3025 
3026 #if USE_ITT_BUILD
3027   __kmp_itt_lock_acquiring(lck);
3028 #endif /* USE_ITT_BUILD */
3029 #if OMPT_SUPPORT && OMPT_OPTIONAL
3030   // This is the case, if called from omp_init_lock_with_hint:
3031   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
3032   if (!codeptr)
3033     codeptr = OMPT_GET_RETURN_ADDRESS(0);
3034   if (ompt_enabled.ompt_callback_mutex_acquire) {
3035     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
3036         ompt_mutex_lock, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
3037         (ompt_wait_id_t)(uintptr_t)lck, codeptr);
3038   }
3039 #endif
3040 
3041   rc = TEST_LOCK(lck, gtid);
3042 #if USE_ITT_BUILD
3043   if (rc) {
3044     __kmp_itt_lock_acquired(lck);
3045   } else {
3046     __kmp_itt_lock_cancelled(lck);
3047   }
3048 #endif /* USE_ITT_BUILD */
3049 #if OMPT_SUPPORT && OMPT_OPTIONAL
3050   if (rc && ompt_enabled.ompt_callback_mutex_acquired) {
3051     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
3052         ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
3053   }
3054 #endif
3055 
3056   return (rc ? FTN_TRUE : FTN_FALSE);
3057 
3058 /* Can't use serial interval since not block structured */
3059 
3060 #endif // KMP_USE_DYNAMIC_LOCK
3061 }
3062 
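/* A rough usage sketch of __kmpc_test_lock() above (illustrative only; the
   exact lowering of omp_test_lock() to these __kmpc_* entry points is
   compiler-specific, and 'loc', 'gtid' and 'lck_storage' are hypothetical
   names):

     void *lck_storage = NULL;
     __kmpc_init_lock(&loc, gtid, &lck_storage);
     if (__kmpc_test_lock(&loc, gtid, &lck_storage)) { // FTN_TRUE on success
       // lock acquired without blocking
       __kmpc_unset_lock(&loc, gtid, &lck_storage);
     }
     __kmpc_destroy_lock(&loc, gtid, &lck_storage);
*/
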
3063 /* try to acquire the lock */
3064 int __kmpc_test_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
3065 #if KMP_USE_DYNAMIC_LOCK
3066   int rc;
3067 #if USE_ITT_BUILD
3068   __kmp_itt_lock_acquiring((kmp_user_lock_p)user_lock);
3069 #endif
3070 #if OMPT_SUPPORT && OMPT_OPTIONAL
3071   // This is the case, if called from omp_init_lock_with_hint:
3072   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
3073   if (!codeptr)
3074     codeptr = OMPT_GET_RETURN_ADDRESS(0);
3075   if (ompt_enabled.ompt_callback_mutex_acquire) {
3076     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
3077         ompt_mutex_nest_lock, omp_lock_hint_none,
3078         __ompt_get_mutex_impl_type(user_lock),
3079         (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
3080   }
3081 #endif
3082   rc = KMP_D_LOCK_FUNC(user_lock, test)((kmp_dyna_lock_t *)user_lock, gtid);
3083 #if USE_ITT_BUILD
3084   if (rc) {
3085     __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock);
3086   } else {
3087     __kmp_itt_lock_cancelled((kmp_user_lock_p)user_lock);
3088   }
3089 #endif
3090 #if OMPT_SUPPORT && OMPT_OPTIONAL
3091   if (ompt_enabled.enabled && rc) {
3092     if (rc == 1) {
3093       if (ompt_enabled.ompt_callback_mutex_acquired) {
3094         // lock_first
3095         ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
3096             ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)user_lock,
3097             codeptr);
3098       }
3099     } else {
3100       if (ompt_enabled.ompt_callback_nest_lock) {
3101         // lock_next
3102         ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
3103             ompt_scope_begin, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
3104       }
3105     }
3106   }
3107 #endif
3108   return rc;
3109 
3110 #else // KMP_USE_DYNAMIC_LOCK
3111 
3112   kmp_user_lock_p lck;
3113   int rc;
3114 
3115   if ((__kmp_user_lock_kind == lk_tas) &&
3116       (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
3117        OMP_NEST_LOCK_T_SIZE)) {
3118     lck = (kmp_user_lock_p)user_lock;
3119   }
3120 #if KMP_USE_FUTEX
3121   else if ((__kmp_user_lock_kind == lk_futex) &&
3122            (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
3123             OMP_NEST_LOCK_T_SIZE)) {
3124     lck = (kmp_user_lock_p)user_lock;
3125   }
3126 #endif
3127   else {
3128     lck = __kmp_lookup_user_lock(user_lock, "omp_test_nest_lock");
3129   }
3130 
3131 #if USE_ITT_BUILD
3132   __kmp_itt_lock_acquiring(lck);
3133 #endif /* USE_ITT_BUILD */
3134 
3135 #if OMPT_SUPPORT && OMPT_OPTIONAL
3136   // This is the case, if called from omp_init_lock_with_hint:
3137   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
3138   if (!codeptr)
3139     codeptr = OMPT_GET_RETURN_ADDRESS(0);
3140   if (ompt_enabled.enabled &&
3141       ompt_enabled.ompt_callback_mutex_acquire) {
3142     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
3143         ompt_mutex_nest_lock, omp_lock_hint_none,
3144         __ompt_get_mutex_impl_type(), (ompt_wait_id_t)(uintptr_t)lck,
3145         codeptr);
3146   }
3147 #endif
3148 
3149   rc = TEST_NESTED_LOCK(lck, gtid);
3150 #if USE_ITT_BUILD
3151   if (rc) {
3152     __kmp_itt_lock_acquired(lck);
3153   } else {
3154     __kmp_itt_lock_cancelled(lck);
3155   }
3156 #endif /* USE_ITT_BUILD */
3157 #if OMPT_SUPPORT && OMPT_OPTIONAL
3158   if (ompt_enabled.enabled && rc) {
3159     if (rc == 1) {
3160       if (ompt_enabled.ompt_callback_mutex_acquired) {
3161         // lock_first
3162         ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
3163             ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
3164       }
3165     } else {
3166       if (ompt_enabled.ompt_callback_nest_lock) {
3167         // lock_next
3168         ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
3169             ompt_scope_begin, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
3170       }
3171     }
3172   }
3173 #endif
3174   return rc;
3175 
3176 /* Can't use serial interval since not block structured */
3177 
3178 #endif // KMP_USE_DYNAMIC_LOCK
3179 }
3180 
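/* Illustrative note on __kmpc_test_nest_lock() above: on success it returns
   the resulting nesting count (0 when the lock is owned by another thread),
   which is why the OMPT code treats rc == 1 as the first acquisition
   (mutex_acquired) and rc > 1 as a re-acquisition (nest_lock with
   ompt_scope_begin). A hypothetical caller:

     int depth = __kmpc_test_nest_lock(&loc, gtid, &nlck_storage);
     if (depth == 0) {
       // not acquired
     } else {
       // acquired; 'depth' is this thread's current nesting level
     }
*/
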
3181 // Interface to fast scalable reduce methods routines
3182 
3183 // keep the selected method in a thread local structure for cross-function
3184 // usage: will be used in __kmpc_end_reduce* functions;
3185 // another solution: to re-determine the method one more time in
3186 // __kmpc_end_reduce* functions (new prototype required then)
3187 // AT: which solution is better?
3188 #define __KMP_SET_REDUCTION_METHOD(gtid, rmethod)                              \
3189   ((__kmp_threads[(gtid)]->th.th_local.packed_reduction_method) = (rmethod))
3190 
3191 #define __KMP_GET_REDUCTION_METHOD(gtid)                                       \
3192   (__kmp_threads[(gtid)]->th.th_local.packed_reduction_method)
3193 
3194 // description of the packed_reduction_method variable: look at the macros in
3195 // kmp.h
3196 
3197 // used in a critical section reduce block
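/* Usage pattern of the two macros above, as it appears in the reduce entry
   points below: the __kmpc_reduce*() call determines and stores the method in
   the thread-local slot, and the matching __kmpc_end_reduce*() call re-loads
   it from the same slot:

     packed_reduction_method = __kmp_determine_reduction_method(
         loc, global_tid, num_vars, reduce_size, reduce_data, reduce_func, lck);
     __KMP_SET_REDUCTION_METHOD(global_tid, packed_reduction_method);
     ...
     packed_reduction_method = __KMP_GET_REDUCTION_METHOD(global_tid);
*/
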
3198 static __forceinline void
3199 __kmp_enter_critical_section_reduce_block(ident_t *loc, kmp_int32 global_tid,
3200                                           kmp_critical_name *crit) {
3201 
3202   // this lock was visible to a customer and to the threading profile tool as a
3203   // serial overhead span (although it's used for an internal purpose only)
3204   //            why was it visible in previous implementation?
3205   //            should we keep it visible in new reduce block?
3206   kmp_user_lock_p lck;
3207 
3208 #if KMP_USE_DYNAMIC_LOCK
3209 
3210   kmp_dyna_lock_t *lk = (kmp_dyna_lock_t *)crit;
3211   // Check if it is initialized.
3212   if (*lk == 0) {
3213     if (KMP_IS_D_LOCK(__kmp_user_lock_seq)) {
3214       KMP_COMPARE_AND_STORE_ACQ32((volatile kmp_int32 *)crit, 0,
3215                                   KMP_GET_D_TAG(__kmp_user_lock_seq));
3216     } else {
3217       __kmp_init_indirect_csptr(crit, loc, global_tid,
3218                                 KMP_GET_I_TAG(__kmp_user_lock_seq));
3219     }
3220   }
3221   // Branch for accessing the actual lock object and set operation. This
3222   // branching is inevitable since this lock initialization does not follow the
3223   // normal dispatch path (lock table is not used).
3224   if (KMP_EXTRACT_D_TAG(lk) != 0) {
3225     lck = (kmp_user_lock_p)lk;
3226     KMP_DEBUG_ASSERT(lck != NULL);
3227     if (__kmp_env_consistency_check) {
3228       __kmp_push_sync(global_tid, ct_critical, loc, lck, __kmp_user_lock_seq);
3229     }
3230     KMP_D_LOCK_FUNC(lk, set)(lk, global_tid);
3231   } else {
3232     kmp_indirect_lock_t *ilk = *((kmp_indirect_lock_t **)lk);
3233     lck = ilk->lock;
3234     KMP_DEBUG_ASSERT(lck != NULL);
3235     if (__kmp_env_consistency_check) {
3236       __kmp_push_sync(global_tid, ct_critical, loc, lck, __kmp_user_lock_seq);
3237     }
3238     KMP_I_LOCK_FUNC(ilk, set)(lck, global_tid);
3239   }
3240 
3241 #else // KMP_USE_DYNAMIC_LOCK
3242 
3243   // We know that the fast reduction code is only emitted by Intel compilers
3244   // with 32 byte critical sections. If there isn't enough space, then we
3245   // have to use a pointer.
3246   if (__kmp_base_user_lock_size <= INTEL_CRITICAL_SIZE) {
3247     lck = (kmp_user_lock_p)crit;
3248   } else {
3249     lck = __kmp_get_critical_section_ptr(crit, loc, global_tid);
3250   }
3251   KMP_DEBUG_ASSERT(lck != NULL);
3252 
3253   if (__kmp_env_consistency_check)
3254     __kmp_push_sync(global_tid, ct_critical, loc, lck);
3255 
3256   __kmp_acquire_user_lock_with_checks(lck, global_tid);
3257 
3258 #endif // KMP_USE_DYNAMIC_LOCK
3259 }
3260 
3261 // used in a critical section reduce block
3262 static __forceinline void
3263 __kmp_end_critical_section_reduce_block(ident_t *loc, kmp_int32 global_tid,
3264                                         kmp_critical_name *crit) {
3265 
3266   kmp_user_lock_p lck;
3267 
3268 #if KMP_USE_DYNAMIC_LOCK
3269 
3270   if (KMP_IS_D_LOCK(__kmp_user_lock_seq)) {
3271     lck = (kmp_user_lock_p)crit;
3272     if (__kmp_env_consistency_check)
3273       __kmp_pop_sync(global_tid, ct_critical, loc);
3274     KMP_D_LOCK_FUNC(lck, unset)((kmp_dyna_lock_t *)lck, global_tid);
3275   } else {
3276     kmp_indirect_lock_t *ilk =
3277         (kmp_indirect_lock_t *)TCR_PTR(*((kmp_indirect_lock_t **)crit));
3278     if (__kmp_env_consistency_check)
3279       __kmp_pop_sync(global_tid, ct_critical, loc);
3280     KMP_I_LOCK_FUNC(ilk, unset)(ilk->lock, global_tid);
3281   }
3282 
3283 #else // KMP_USE_DYNAMIC_LOCK
3284 
3285   // We know that the fast reduction code is only emitted by Intel compilers
3286   // with 32 byte critical sections. If there isn't enough space, then we have
3287   // to use a pointer.
3288   if (__kmp_base_user_lock_size > 32) {
3289     lck = *((kmp_user_lock_p *)crit);
3290     KMP_ASSERT(lck != NULL);
3291   } else {
3292     lck = (kmp_user_lock_p)crit;
3293   }
3294 
3295   if (__kmp_env_consistency_check)
3296     __kmp_pop_sync(global_tid, ct_critical, loc);
3297 
3298   __kmp_release_user_lock_with_checks(lck, global_tid);
3299 
3300 #endif // KMP_USE_DYNAMIC_LOCK
3301 } // __kmp_end_critical_section_reduce_block
3302 
3303 static __forceinline int
3304 __kmp_swap_teams_for_teams_reduction(kmp_info_t *th, kmp_team_t **team_p,
3305                                      int *task_state) {
3306   kmp_team_t *team;
3307 
3308   // Check if we are inside the teams construct?
3309   if (th->th.th_teams_microtask) {
3310     *team_p = team = th->th.th_team;
3311     if (team->t.t_level == th->th.th_teams_level) {
3312       // This is reduction at teams construct.
3313       KMP_DEBUG_ASSERT(!th->th.th_info.ds.ds_tid); // AC: check that tid == 0
3314       // Let's swap teams temporarily for the reduction.
3315       th->th.th_info.ds.ds_tid = team->t.t_master_tid;
3316       th->th.th_team = team->t.t_parent;
3317       th->th.th_team_nproc = th->th.th_team->t.t_nproc;
3318       th->th.th_task_team = th->th.th_team->t.t_task_team[0];
3319       *task_state = th->th.th_task_state;
3320       th->th.th_task_state = 0;
3321 
3322       return 1;
3323     }
3324   }
3325   return 0;
3326 }
3327 
3328 static __forceinline void
3329 __kmp_restore_swapped_teams(kmp_info_t *th, kmp_team_t *team, int task_state) {
3330   // Restore thread structure swapped in __kmp_swap_teams_for_teams_reduction.
3331   th->th.th_info.ds.ds_tid = 0;
3332   th->th.th_team = team;
3333   th->th.th_team_nproc = team->t.t_nproc;
3334   th->th.th_task_team = team->t.t_task_team[task_state];
3335   __kmp_type_convert(task_state, &(th->th.th_task_state));
3336 }
3337 
3338 /* 2.a.i. Reduce Block without a terminating barrier */
3339 /*!
3340 @ingroup SYNCHRONIZATION
3341 @param loc source location information
3342 @param global_tid global thread number
3343 @param num_vars number of items (variables) to be reduced
3344 @param reduce_size size of data in bytes to be reduced
3345 @param reduce_data pointer to data to be reduced
3346 @param reduce_func callback function providing reduction operation on two
3347 operands and returning result of reduction in lhs_data
3348 @param lck pointer to the unique lock data structure
3349 @result 1 for the master thread, 0 for all other team threads, 2 for all team
3350 threads if atomic reduction needed
3351 
3352 The nowait version is used for a reduce clause with the nowait argument.
3353 */
3354 kmp_int32
3355 __kmpc_reduce_nowait(ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars,
3356                      size_t reduce_size, void *reduce_data,
3357                      void (*reduce_func)(void *lhs_data, void *rhs_data),
3358                      kmp_critical_name *lck) {
3359 
3360   KMP_COUNT_BLOCK(REDUCE_nowait);
3361   int retval = 0;
3362   PACKED_REDUCTION_METHOD_T packed_reduction_method;
3363   kmp_info_t *th;
3364   kmp_team_t *team;
3365   int teams_swapped = 0, task_state;
3366   KA_TRACE(10, ("__kmpc_reduce_nowait() enter: called T#%d\n", global_tid));
3367   __kmp_assert_valid_gtid(global_tid);
3368 
3369   // why do we need this initialization here at all?
3370   // Reduction clause can not be used as a stand-alone directive.
3371 
3372   // do not call __kmp_serial_initialize(), it will be called by
3373   // __kmp_parallel_initialize() if needed
3374   // possible detection of false-positive race by the threadchecker ???
3375   if (!TCR_4(__kmp_init_parallel))
3376     __kmp_parallel_initialize();
3377 
3378   __kmp_resume_if_soft_paused();
3379 
3380 // check correctness of reduce block nesting
3381 #if KMP_USE_DYNAMIC_LOCK
3382   if (__kmp_env_consistency_check)
3383     __kmp_push_sync(global_tid, ct_reduce, loc, NULL, 0);
3384 #else
3385   if (__kmp_env_consistency_check)
3386     __kmp_push_sync(global_tid, ct_reduce, loc, NULL);
3387 #endif
3388 
3389   th = __kmp_thread_from_gtid(global_tid);
3390   teams_swapped = __kmp_swap_teams_for_teams_reduction(th, &team, &task_state);
3391 
3392   // packed_reduction_method value will be reused by __kmp_end_reduce* function,
3393   // the value should be kept in a variable
3394   // the variable should be either a construct-specific or thread-specific
3395   // property, not a team specific property
3396   //     (a thread can reach the next reduce block on the next construct, reduce
3397   //     method may differ on the next construct)
3398   // an ident_t "loc" parameter could be used as a construct-specific property
3399   // (what if loc == 0?)
3400   //     (if both construct-specific and team-specific variables were shared,
3401 //     then unnecessary extra syncs would be needed)
3402   // a thread-specific variable is better regarding two issues above (next
3403   // construct and extra syncs)
3404   // a thread-specific "th_local.reduction_method" variable is used currently
3405   // each thread executes 'determine' and 'set' lines (no need to execute by one
3406 // thread, to avoid unnecessary extra syncs)
3407 
3408   packed_reduction_method = __kmp_determine_reduction_method(
3409       loc, global_tid, num_vars, reduce_size, reduce_data, reduce_func, lck);
3410   __KMP_SET_REDUCTION_METHOD(global_tid, packed_reduction_method);
3411 
3412   OMPT_REDUCTION_DECL(th, global_tid);
3413   if (packed_reduction_method == critical_reduce_block) {
3414 
3415     OMPT_REDUCTION_BEGIN;
3416 
3417     __kmp_enter_critical_section_reduce_block(loc, global_tid, lck);
3418     retval = 1;
3419 
3420   } else if (packed_reduction_method == empty_reduce_block) {
3421 
3422     OMPT_REDUCTION_BEGIN;
3423 
3424     // usage: if team size == 1, no synchronization is required ( Intel
3425     // platforms only )
3426     retval = 1;
3427 
3428   } else if (packed_reduction_method == atomic_reduce_block) {
3429 
3430     retval = 2;
3431 
3432     // all threads should do this pop here (because __kmpc_end_reduce_nowait()
3433     // won't be called by the code gen)
3434     //     (it's not quite good, because the checking block has been closed by
3435     //     this 'pop',
3436     //      but atomic operation has not been executed yet, will be executed
3437     //      slightly later, literally on next instruction)
3438     if (__kmp_env_consistency_check)
3439       __kmp_pop_sync(global_tid, ct_reduce, loc);
3440 
3441   } else if (TEST_REDUCTION_METHOD(packed_reduction_method,
3442                                    tree_reduce_block)) {
3443 
3444 // AT: performance issue: a real barrier here
3445 // AT:     (if master goes slow, other threads are blocked here waiting for the
3446 // master to come and release them)
3447 // AT:     (it's not what a customer might expect specifying NOWAIT clause)
3448 // AT:     (specifying NOWAIT won't result in improvement of performance, it'll
3449 // be confusing to a customer)
3450 // AT: another implementation of *barrier_gather*nowait() (or some other design)
3451 // might go faster and be more in line with sense of NOWAIT
3452 // AT: TO DO: do epcc test and compare times
3453 
3454 // this barrier should be invisible to a customer and to the threading profile
3455 // tool (it's neither a terminating barrier nor customer's code, it's
3456 // used for an internal purpose)
3457 #if OMPT_SUPPORT
3458     // JP: can this barrier potentially lead to task scheduling?
3459     // JP: as long as there is a barrier in the implementation, OMPT should and
3460     // will provide the barrier events
3461     //         so we set-up the necessary frame/return addresses.
3462     ompt_frame_t *ompt_frame;
3463     if (ompt_enabled.enabled) {
3464       __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
3465       if (ompt_frame->enter_frame.ptr == NULL)
3466         ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
3467     }
3468     OMPT_STORE_RETURN_ADDRESS(global_tid);
3469 #endif
3470 #if USE_ITT_NOTIFY
3471     __kmp_threads[global_tid]->th.th_ident = loc;
3472 #endif
3473     retval =
3474         __kmp_barrier(UNPACK_REDUCTION_BARRIER(packed_reduction_method),
3475                       global_tid, FALSE, reduce_size, reduce_data, reduce_func);
3476     retval = (retval != 0) ? (0) : (1);
3477 #if OMPT_SUPPORT && OMPT_OPTIONAL
3478     if (ompt_enabled.enabled) {
3479       ompt_frame->enter_frame = ompt_data_none;
3480     }
3481 #endif
3482 
3483     // all other workers except master should do this pop here
3484     //     ( none of other workers will get to __kmpc_end_reduce_nowait() )
3485     if (__kmp_env_consistency_check) {
3486       if (retval == 0) {
3487         __kmp_pop_sync(global_tid, ct_reduce, loc);
3488       }
3489     }
3490 
3491   } else {
3492 
3493     // should never reach this block
3494     KMP_ASSERT(0); // "unexpected method"
3495   }
3496   if (teams_swapped) {
3497     __kmp_restore_swapped_teams(th, team, task_state);
3498   }
3499   KA_TRACE(
3500       10,
3501       ("__kmpc_reduce_nowait() exit: called T#%d: method %08x, returns %08x\n",
3502        global_tid, packed_reduction_method, retval));
3503 
3504   return retval;
3505 }
3506 
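/* A sketch of the code a compiler may generate around the nowait reduce entry
   points (illustrative; 'reducer' and 'crit' are hypothetical names and the
   real lowering is compiler-specific). A return of 1 means this thread
   combines the partial results itself and must call
   __kmpc_end_reduce_nowait(); 2 means the combination is done with atomics
   and no end call is generated; 0 means there is nothing more for this thread
   to do:

     switch (__kmpc_reduce_nowait(&loc, gtid, n, size, data, reducer, &crit)) {
     case 1:
       // combine partial results into the shared copy
       __kmpc_end_reduce_nowait(&loc, gtid, &crit);
       break;
     case 2:
       // update the shared copy with atomic operations
       break;
     default: // 0
       break;
     }
*/
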
3507 /*!
3508 @ingroup SYNCHRONIZATION
3509 @param loc source location information
3510 @param global_tid global thread id.
3511 @param lck pointer to the unique lock data structure
3512 
3513 Finish the execution of a reduce nowait.
3514 */
3515 void __kmpc_end_reduce_nowait(ident_t *loc, kmp_int32 global_tid,
3516                               kmp_critical_name *lck) {
3517 
3518   PACKED_REDUCTION_METHOD_T packed_reduction_method;
3519 
3520   KA_TRACE(10, ("__kmpc_end_reduce_nowait() enter: called T#%d\n", global_tid));
3521   __kmp_assert_valid_gtid(global_tid);
3522 
3523   packed_reduction_method = __KMP_GET_REDUCTION_METHOD(global_tid);
3524 
3525   OMPT_REDUCTION_DECL(__kmp_thread_from_gtid(global_tid), global_tid);
3526 
3527   if (packed_reduction_method == critical_reduce_block) {
3528 
3529     __kmp_end_critical_section_reduce_block(loc, global_tid, lck);
3530     OMPT_REDUCTION_END;
3531 
3532   } else if (packed_reduction_method == empty_reduce_block) {
3533 
3534     // usage: if team size == 1, no synchronization is required ( on Intel
3535     // platforms only )
3536 
3537     OMPT_REDUCTION_END;
3538 
3539   } else if (packed_reduction_method == atomic_reduce_block) {
3540 
3541     // neither master nor other workers should get here
3542     //     (code gen does not generate this call in case 2: atomic reduce block)
3543     // actually it's better to remove this elseif at all;
3544     // after removal this value will be checked by the 'else' and will assert
3545 
3546   } else if (TEST_REDUCTION_METHOD(packed_reduction_method,
3547                                    tree_reduce_block)) {
3548 
3549     // only master gets here
3550     // OMPT: tree reduction is annotated in the barrier code
3551 
3552   } else {
3553 
3554     // should never reach this block
3555     KMP_ASSERT(0); // "unexpected method"
3556   }
3557 
3558   if (__kmp_env_consistency_check)
3559     __kmp_pop_sync(global_tid, ct_reduce, loc);
3560 
3561   KA_TRACE(10, ("__kmpc_end_reduce_nowait() exit: called T#%d: method %08x\n",
3562                 global_tid, packed_reduction_method));
3563 
3564   return;
3565 }
3566 
3567 /* 2.a.ii. Reduce Block with a terminating barrier */
3568 
3569 /*!
3570 @ingroup SYNCHRONIZATION
3571 @param loc source location information
3572 @param global_tid global thread number
3573 @param num_vars number of items (variables) to be reduced
3574 @param reduce_size size of data in bytes to be reduced
3575 @param reduce_data pointer to data to be reduced
3576 @param reduce_func callback function providing reduction operation on two
3577 operands and returning result of reduction in lhs_data
3578 @param lck pointer to the unique lock data structure
3579 @result 1 for the master thread, 0 for all other team threads, 2 for all team
3580 threads if atomic reduction needed
3581 
3582 A blocking reduce that includes an implicit barrier.
3583 */
3584 kmp_int32 __kmpc_reduce(ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars,
3585                         size_t reduce_size, void *reduce_data,
3586                         void (*reduce_func)(void *lhs_data, void *rhs_data),
3587                         kmp_critical_name *lck) {
3588   KMP_COUNT_BLOCK(REDUCE_wait);
3589   int retval = 0;
3590   PACKED_REDUCTION_METHOD_T packed_reduction_method;
3591   kmp_info_t *th;
3592   kmp_team_t *team;
3593   int teams_swapped = 0, task_state;
3594 
3595   KA_TRACE(10, ("__kmpc_reduce() enter: called T#%d\n", global_tid));
3596   __kmp_assert_valid_gtid(global_tid);
3597 
3598   // why do we need this initialization here at all?
3599   // Reduction clause can not be a stand-alone directive.
3600 
3601   // do not call __kmp_serial_initialize(), it will be called by
3602   // __kmp_parallel_initialize() if needed
3603   // possible detection of false-positive race by the threadchecker ???
3604   if (!TCR_4(__kmp_init_parallel))
3605     __kmp_parallel_initialize();
3606 
3607   __kmp_resume_if_soft_paused();
3608 
3609 // check correctness of reduce block nesting
3610 #if KMP_USE_DYNAMIC_LOCK
3611   if (__kmp_env_consistency_check)
3612     __kmp_push_sync(global_tid, ct_reduce, loc, NULL, 0);
3613 #else
3614   if (__kmp_env_consistency_check)
3615     __kmp_push_sync(global_tid, ct_reduce, loc, NULL);
3616 #endif
3617 
3618   th = __kmp_thread_from_gtid(global_tid);
3619   teams_swapped = __kmp_swap_teams_for_teams_reduction(th, &team, &task_state);
3620 
3621   packed_reduction_method = __kmp_determine_reduction_method(
3622       loc, global_tid, num_vars, reduce_size, reduce_data, reduce_func, lck);
3623   __KMP_SET_REDUCTION_METHOD(global_tid, packed_reduction_method);
3624 
3625   OMPT_REDUCTION_DECL(th, global_tid);
3626 
3627   if (packed_reduction_method == critical_reduce_block) {
3628 
3629     OMPT_REDUCTION_BEGIN;
3630     __kmp_enter_critical_section_reduce_block(loc, global_tid, lck);
3631     retval = 1;
3632 
3633   } else if (packed_reduction_method == empty_reduce_block) {
3634 
3635     OMPT_REDUCTION_BEGIN;
3636     // usage: if team size == 1, no synchronization is required ( Intel
3637     // platforms only )
3638     retval = 1;
3639 
3640   } else if (packed_reduction_method == atomic_reduce_block) {
3641 
3642     retval = 2;
3643 
3644   } else if (TEST_REDUCTION_METHOD(packed_reduction_method,
3645                                    tree_reduce_block)) {
3646 
3647 // case tree_reduce_block:
3648 // this barrier should be visible to a customer and to the threading profile
3649 // tool (it's a terminating barrier on constructs if NOWAIT not specified)
3650 #if OMPT_SUPPORT
3651     ompt_frame_t *ompt_frame;
3652     if (ompt_enabled.enabled) {
3653       __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
3654       if (ompt_frame->enter_frame.ptr == NULL)
3655         ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
3656     }
3657     OMPT_STORE_RETURN_ADDRESS(global_tid);
3658 #endif
3659 #if USE_ITT_NOTIFY
3660     __kmp_threads[global_tid]->th.th_ident =
3661         loc; // needed for correct notification of frames
3662 #endif
3663     retval =
3664         __kmp_barrier(UNPACK_REDUCTION_BARRIER(packed_reduction_method),
3665                       global_tid, TRUE, reduce_size, reduce_data, reduce_func);
3666     retval = (retval != 0) ? (0) : (1);
3667 #if OMPT_SUPPORT && OMPT_OPTIONAL
3668     if (ompt_enabled.enabled) {
3669       ompt_frame->enter_frame = ompt_data_none;
3670     }
3671 #endif
3672 
3673     // all other workers except master should do this pop here
3674     // ( none of other workers except master will enter __kmpc_end_reduce() )
3675     if (__kmp_env_consistency_check) {
3676       if (retval == 0) { // 0: all other workers; 1: master
3677         __kmp_pop_sync(global_tid, ct_reduce, loc);
3678       }
3679     }
3680 
3681   } else {
3682 
3683     // should never reach this block
3684     KMP_ASSERT(0); // "unexpected method"
3685   }
3686   if (teams_swapped) {
3687     __kmp_restore_swapped_teams(th, team, task_state);
3688   }
3689 
3690   KA_TRACE(10,
3691            ("__kmpc_reduce() exit: called T#%d: method %08x, returns %08x\n",
3692             global_tid, packed_reduction_method, retval));
3693   return retval;
3694 }
3695 
3696 /*!
3697 @ingroup SYNCHRONIZATION
3698 @param loc source location information
3699 @param global_tid global thread id.
3700 @param lck pointer to the unique lock data structure
3701 
3702 Finish the execution of a blocking reduce.
3703 The <tt>lck</tt> pointer must be the same as that used in the corresponding
3704 start function.
3705 */
3706 void __kmpc_end_reduce(ident_t *loc, kmp_int32 global_tid,
3707                        kmp_critical_name *lck) {
3708 
3709   PACKED_REDUCTION_METHOD_T packed_reduction_method;
3710   kmp_info_t *th;
3711   kmp_team_t *team;
3712   int teams_swapped = 0, task_state;
3713 
3714   KA_TRACE(10, ("__kmpc_end_reduce() enter: called T#%d\n", global_tid));
3715   __kmp_assert_valid_gtid(global_tid);
3716 
3717   th = __kmp_thread_from_gtid(global_tid);
3718   teams_swapped = __kmp_swap_teams_for_teams_reduction(th, &team, &task_state);
3719 
3720   packed_reduction_method = __KMP_GET_REDUCTION_METHOD(global_tid);
3721 
3722   // this barrier should be visible to a customer and to the threading profile
3723   // tool (it's a terminating barrier on constructs if NOWAIT not specified)
3724   OMPT_REDUCTION_DECL(th, global_tid);
3725 
3726   if (packed_reduction_method == critical_reduce_block) {
3727     __kmp_end_critical_section_reduce_block(loc, global_tid, lck);
3728 
3729     OMPT_REDUCTION_END;
3730 
3731 // TODO: implicit barrier: should be exposed
3732 #if OMPT_SUPPORT
3733     ompt_frame_t *ompt_frame;
3734     if (ompt_enabled.enabled) {
3735       __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
3736       if (ompt_frame->enter_frame.ptr == NULL)
3737         ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
3738     }
3739     OMPT_STORE_RETURN_ADDRESS(global_tid);
3740 #endif
3741 #if USE_ITT_NOTIFY
3742     __kmp_threads[global_tid]->th.th_ident = loc;
3743 #endif
3744     __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
3745 #if OMPT_SUPPORT && OMPT_OPTIONAL
3746     if (ompt_enabled.enabled) {
3747       ompt_frame->enter_frame = ompt_data_none;
3748     }
3749 #endif
3750 
3751   } else if (packed_reduction_method == empty_reduce_block) {
3752 
3753     OMPT_REDUCTION_END;
3754 
3755 // usage: if team size==1, no synchronization is required (Intel platforms only)
3756 
3757 // TODO: implicit barrier: should be exposed
3758 #if OMPT_SUPPORT
3759     ompt_frame_t *ompt_frame;
3760     if (ompt_enabled.enabled) {
3761       __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
3762       if (ompt_frame->enter_frame.ptr == NULL)
3763         ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
3764     }
3765     OMPT_STORE_RETURN_ADDRESS(global_tid);
3766 #endif
3767 #if USE_ITT_NOTIFY
3768     __kmp_threads[global_tid]->th.th_ident = loc;
3769 #endif
3770     __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
3771 #if OMPT_SUPPORT && OMPT_OPTIONAL
3772     if (ompt_enabled.enabled) {
3773       ompt_frame->enter_frame = ompt_data_none;
3774     }
3775 #endif
3776 
3777   } else if (packed_reduction_method == atomic_reduce_block) {
3778 
3779 #if OMPT_SUPPORT
3780     ompt_frame_t *ompt_frame;
3781     if (ompt_enabled.enabled) {
3782       __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
3783       if (ompt_frame->enter_frame.ptr == NULL)
3784         ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
3785     }
3786     OMPT_STORE_RETURN_ADDRESS(global_tid);
3787 #endif
3788 // TODO: implicit barrier: should be exposed
3789 #if USE_ITT_NOTIFY
3790     __kmp_threads[global_tid]->th.th_ident = loc;
3791 #endif
3792     __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
3793 #if OMPT_SUPPORT && OMPT_OPTIONAL
3794     if (ompt_enabled.enabled) {
3795       ompt_frame->enter_frame = ompt_data_none;
3796     }
3797 #endif
3798 
3799   } else if (TEST_REDUCTION_METHOD(packed_reduction_method,
3800                                    tree_reduce_block)) {
3801 
3802     // only master executes here (master releases all other workers)
3803     __kmp_end_split_barrier(UNPACK_REDUCTION_BARRIER(packed_reduction_method),
3804                             global_tid);
3805 
3806   } else {
3807 
3808     // should never reach this block
3809     KMP_ASSERT(0); // "unexpected method"
3810   }
3811   if (teams_swapped) {
3812     __kmp_restore_swapped_teams(th, team, task_state);
3813   }
3814 
3815   if (__kmp_env_consistency_check)
3816     __kmp_pop_sync(global_tid, ct_reduce, loc);
3817 
3818   KA_TRACE(10, ("__kmpc_end_reduce() exit: called T#%d: method %08x\n",
3819                 global_tid, packed_reduction_method));
3820 
3821   return;
3822 }
3823 
3824 #undef __KMP_GET_REDUCTION_METHOD
3825 #undef __KMP_SET_REDUCTION_METHOD
3826 
3827 /* end of interface to fast scalable reduce routines */
3828 
3829 kmp_uint64 __kmpc_get_taskid() {
3830 
3831   kmp_int32 gtid;
3832   kmp_info_t *thread;
3833 
3834   gtid = __kmp_get_gtid();
3835   if (gtid < 0) {
3836     return 0;
3837   }
3838   thread = __kmp_thread_from_gtid(gtid);
3839   return thread->th.th_current_task->td_task_id;
3840 
3841 } // __kmpc_get_taskid
3842 
3843 kmp_uint64 __kmpc_get_parent_taskid() {
3844 
3845   kmp_int32 gtid;
3846   kmp_info_t *thread;
3847   kmp_taskdata_t *parent_task;
3848 
3849   gtid = __kmp_get_gtid();
3850   if (gtid < 0) {
3851     return 0;
3852   }
3853   thread = __kmp_thread_from_gtid(gtid);
3854   parent_task = thread->th.th_current_task->td_parent;
3855   return (parent_task == NULL ? 0 : parent_task->td_task_id);
3856 
3857 } // __kmpc_get_parent_taskid
3858 
3859 /*!
3860 @ingroup WORK_SHARING
3861 @param loc  source location information.
3862 @param gtid  global thread number.
3863 @param num_dims  number of associated doacross loops.
3864 @param dims  info on loops bounds.
3865 
3866 Initialize doacross loop information.
3867 Expect the compiler to send us inclusive bounds,
3868 e.g. for (i=2; i<9; i+=2): lo=2, up=8, st=2.
3869 */
3870 void __kmpc_doacross_init(ident_t *loc, int gtid, int num_dims,
3871                           const struct kmp_dim *dims) {
3872   __kmp_assert_valid_gtid(gtid);
3873   int j, idx;
3874   kmp_int64 last, trace_count;
3875   kmp_info_t *th = __kmp_threads[gtid];
3876   kmp_team_t *team = th->th.th_team;
3877   kmp_uint32 *flags;
3878   kmp_disp_t *pr_buf = th->th.th_dispatch;
3879   dispatch_shared_info_t *sh_buf;
3880 
3881   KA_TRACE(
3882       20,
3883       ("__kmpc_doacross_init() enter: called T#%d, num dims %d, active %d\n",
3884        gtid, num_dims, !team->t.t_serialized));
3885   KMP_DEBUG_ASSERT(dims != NULL);
3886   KMP_DEBUG_ASSERT(num_dims > 0);
3887 
3888   if (team->t.t_serialized) {
3889     KA_TRACE(20, ("__kmpc_doacross_init() exit: serialized team\n"));
3890     return; // no dependencies if team is serialized
3891   }
3892   KMP_DEBUG_ASSERT(team->t.t_nproc > 1);
3893   idx = pr_buf->th_doacross_buf_idx++; // Increment index of shared buffer for
3894   // the next loop
3895   sh_buf = &team->t.t_disp_buffer[idx % __kmp_dispatch_num_buffers];
3896 
3897   // Save bounds info into allocated private buffer
3898   KMP_DEBUG_ASSERT(pr_buf->th_doacross_info == NULL);
3899   pr_buf->th_doacross_info = (kmp_int64 *)__kmp_thread_malloc(
3900       th, sizeof(kmp_int64) * (4 * num_dims + 1));
3901   KMP_DEBUG_ASSERT(pr_buf->th_doacross_info != NULL);
3902   pr_buf->th_doacross_info[0] =
3903       (kmp_int64)num_dims; // first element is number of dimensions
3904   // Save also address of num_done in order to access it later without knowing
3905   // the buffer index
3906   pr_buf->th_doacross_info[1] = (kmp_int64)&sh_buf->doacross_num_done;
3907   pr_buf->th_doacross_info[2] = dims[0].lo;
3908   pr_buf->th_doacross_info[3] = dims[0].up;
3909   pr_buf->th_doacross_info[4] = dims[0].st;
3910   last = 5;
3911   for (j = 1; j < num_dims; ++j) {
3912     kmp_int64
3913         range_length; // To keep ranges of all dimensions but the first dims[0]
3914     if (dims[j].st == 1) { // most common case
3915       // AC: should we care of ranges bigger than LLONG_MAX? (not for now)
3916       range_length = dims[j].up - dims[j].lo + 1;
3917     } else {
3918       if (dims[j].st > 0) {
3919         KMP_DEBUG_ASSERT(dims[j].up > dims[j].lo);
3920         range_length = (kmp_uint64)(dims[j].up - dims[j].lo) / dims[j].st + 1;
3921       } else { // negative increment
3922         KMP_DEBUG_ASSERT(dims[j].lo > dims[j].up);
3923         range_length =
3924             (kmp_uint64)(dims[j].lo - dims[j].up) / (-dims[j].st) + 1;
3925       }
3926     }
3927     pr_buf->th_doacross_info[last++] = range_length;
3928     pr_buf->th_doacross_info[last++] = dims[j].lo;
3929     pr_buf->th_doacross_info[last++] = dims[j].up;
3930     pr_buf->th_doacross_info[last++] = dims[j].st;
3931   }
3932 
3933   // Compute total trip count.
3934   // Start with range of dims[0] which we don't need to keep in the buffer.
3935   if (dims[0].st == 1) { // most common case
3936     trace_count = dims[0].up - dims[0].lo + 1;
3937   } else if (dims[0].st > 0) {
3938     KMP_DEBUG_ASSERT(dims[0].up > dims[0].lo);
3939     trace_count = (kmp_uint64)(dims[0].up - dims[0].lo) / dims[0].st + 1;
3940   } else { // negative increment
3941     KMP_DEBUG_ASSERT(dims[0].lo > dims[0].up);
3942     trace_count = (kmp_uint64)(dims[0].lo - dims[0].up) / (-dims[0].st) + 1;
3943   }
3944   for (j = 1; j < num_dims; ++j) {
3945     trace_count *= pr_buf->th_doacross_info[4 * j + 1]; // use kept ranges
3946   }
3947   KMP_DEBUG_ASSERT(trace_count > 0);
3948 
3949   // Check if shared buffer is not occupied by other loop (idx -
3950   // __kmp_dispatch_num_buffers)
3951   if (idx != sh_buf->doacross_buf_idx) {
3952     // Shared buffer is occupied, wait for it to be free
3953     __kmp_wait_4((volatile kmp_uint32 *)&sh_buf->doacross_buf_idx, idx,
3954                  __kmp_eq_4, NULL);
3955   }
3956 #if KMP_32_BIT_ARCH
3957   // Check if we are the first thread. After the CAS the first thread gets 0,
3958   // others get 1 if initialization is in progress, allocated pointer otherwise.
3959   // Treat pointer as volatile integer (value 0 or 1) until memory is allocated.
3960   flags = (kmp_uint32 *)KMP_COMPARE_AND_STORE_RET32(
3961       (volatile kmp_int32 *)&sh_buf->doacross_flags, NULL, 1);
3962 #else
3963   flags = (kmp_uint32 *)KMP_COMPARE_AND_STORE_RET64(
3964       (volatile kmp_int64 *)&sh_buf->doacross_flags, NULL, 1LL);
3965 #endif
3966   if (flags == NULL) {
3967     // we are the first thread, allocate the array of flags
3968     size_t size =
3969         (size_t)trace_count / 8 + 8; // in bytes, use single bit per iteration
3970     flags = (kmp_uint32 *)__kmp_thread_calloc(th, size, 1);
3971     KMP_MB();
3972     sh_buf->doacross_flags = flags;
3973   } else if (flags == (kmp_uint32 *)1) {
3974 #if KMP_32_BIT_ARCH
3975     // initialization is still in progress, need to wait
3976     while (*(volatile kmp_int32 *)&sh_buf->doacross_flags == 1)
3977 #else
3978     while (*(volatile kmp_int64 *)&sh_buf->doacross_flags == 1LL)
3979 #endif
3980       KMP_YIELD(TRUE);
3981     KMP_MB();
3982   } else {
3983     KMP_MB();
3984   }
3985   KMP_DEBUG_ASSERT(sh_buf->doacross_flags > (kmp_uint32 *)1); // check ptr value
3986   pr_buf->th_doacross_flags =
3987       sh_buf->doacross_flags; // save private copy in order to not
3988   // touch shared buffer on each iteration
3989   KA_TRACE(20, ("__kmpc_doacross_init() exit: T#%d\n", gtid));
3990 }
3991 
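/* Layout of the private th_doacross_info buffer built above
   (4 * num_dims + 1 entries of kmp_int64):
     [0]                 number of dimensions
     [1]                 address of sh_buf->doacross_num_done
     [2], [3], [4]       lo, up, st of dimension 0
     [4*j+1] .. [4*j+4]  range_length, lo, up, st of dimension j, j >= 1
   __kmpc_doacross_wait() and __kmpc_doacross_post() below read the bounds back
   through these indices. */
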
3992 void __kmpc_doacross_wait(ident_t *loc, int gtid, const kmp_int64 *vec) {
3993   __kmp_assert_valid_gtid(gtid);
3994   kmp_int64 shft;
3995   size_t num_dims, i;
3996   kmp_uint32 flag;
3997   kmp_int64 iter_number; // iteration number of "collapsed" loop nest
3998   kmp_info_t *th = __kmp_threads[gtid];
3999   kmp_team_t *team = th->th.th_team;
4000   kmp_disp_t *pr_buf;
4001   kmp_int64 lo, up, st;
4002 
4003   KA_TRACE(20, ("__kmpc_doacross_wait() enter: called T#%d\n", gtid));
4004   if (team->t.t_serialized) {
4005     KA_TRACE(20, ("__kmpc_doacross_wait() exit: serialized team\n"));
4006     return; // no dependencies if team is serialized
4007   }
4008 
4009   // calculate sequential iteration number and check out-of-bounds condition
4010   pr_buf = th->th.th_dispatch;
4011   KMP_DEBUG_ASSERT(pr_buf->th_doacross_info != NULL);
4012   num_dims = (size_t)pr_buf->th_doacross_info[0];
4013   lo = pr_buf->th_doacross_info[2];
4014   up = pr_buf->th_doacross_info[3];
4015   st = pr_buf->th_doacross_info[4];
4016 #if OMPT_SUPPORT && OMPT_OPTIONAL
4017   ompt_dependence_t deps[num_dims];
4018 #endif
4019   if (st == 1) { // most common case
4020     if (vec[0] < lo || vec[0] > up) {
4021       KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
4022                     "bounds [%lld,%lld]\n",
4023                     gtid, vec[0], lo, up));
4024       return;
4025     }
4026     iter_number = vec[0] - lo;
4027   } else if (st > 0) {
4028     if (vec[0] < lo || vec[0] > up) {
4029       KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
4030                     "bounds [%lld,%lld]\n",
4031                     gtid, vec[0], lo, up));
4032       return;
4033     }
4034     iter_number = (kmp_uint64)(vec[0] - lo) / st;
4035   } else { // negative increment
4036     if (vec[0] > lo || vec[0] < up) {
4037       KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
4038                     "bounds [%lld,%lld]\n",
4039                     gtid, vec[0], lo, up));
4040       return;
4041     }
4042     iter_number = (kmp_uint64)(lo - vec[0]) / (-st);
4043   }
4044 #if OMPT_SUPPORT && OMPT_OPTIONAL
4045   deps[0].variable.value = iter_number;
4046   deps[0].dependence_type = ompt_dependence_type_sink;
4047 #endif
4048   for (i = 1; i < num_dims; ++i) {
4049     kmp_int64 iter, ln;
4050     size_t j = i * 4;
4051     ln = pr_buf->th_doacross_info[j + 1];
4052     lo = pr_buf->th_doacross_info[j + 2];
4053     up = pr_buf->th_doacross_info[j + 3];
4054     st = pr_buf->th_doacross_info[j + 4];
4055     if (st == 1) {
4056       if (vec[i] < lo || vec[i] > up) {
4057         KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
4058                       "bounds [%lld,%lld]\n",
4059                       gtid, vec[i], lo, up));
4060         return;
4061       }
4062       iter = vec[i] - lo;
4063     } else if (st > 0) {
4064       if (vec[i] < lo || vec[i] > up) {
4065         KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
4066                       "bounds [%lld,%lld]\n",
4067                       gtid, vec[i], lo, up));
4068         return;
4069       }
4070       iter = (kmp_uint64)(vec[i] - lo) / st;
4071     } else { // st < 0
4072       if (vec[i] > lo || vec[i] < up) {
4073         KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
4074                       "bounds [%lld,%lld]\n",
4075                       gtid, vec[i], lo, up));
4076         return;
4077       }
4078       iter = (kmp_uint64)(lo - vec[i]) / (-st);
4079     }
4080     iter_number = iter + ln * iter_number;
4081 #if OMPT_SUPPORT && OMPT_OPTIONAL
4082     deps[i].variable.value = iter;
4083     deps[i].dependence_type = ompt_dependence_type_sink;
4084 #endif
4085   }
4086   shft = iter_number % 32; // use 32-bit granularity
4087   iter_number >>= 5; // divided by 32
4088   flag = 1 << shft;
4089   while ((flag & pr_buf->th_doacross_flags[iter_number]) == 0) {
4090     KMP_YIELD(TRUE);
4091   }
4092   KMP_MB();
4093 #if OMPT_SUPPORT && OMPT_OPTIONAL
4094   if (ompt_enabled.ompt_callback_dependences) {
4095     ompt_callbacks.ompt_callback(ompt_callback_dependences)(
4096         &(OMPT_CUR_TASK_INFO(th)->task_data), deps, (kmp_uint32)num_dims);
4097   }
4098 #endif
4099   KA_TRACE(20,
4100            ("__kmpc_doacross_wait() exit: T#%d wait for iter %lld completed\n",
4101             gtid, (iter_number << 5) + shft));
4102 }
4103 
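/* Worked example of the flag indexing used above and in __kmpc_doacross_post()
   below: one completion bit per linearized iteration, packed into 32-bit
   words. Linearized iteration 70, for instance, maps to word 70 >> 5 == 2 and
   bit 70 % 32 == 6, so the wait spins until
   (pr_buf->th_doacross_flags[2] & (1 << 6)) becomes non-zero, and the matching
   post sets exactly that bit. */
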
4104 void __kmpc_doacross_post(ident_t *loc, int gtid, const kmp_int64 *vec) {
4105   __kmp_assert_valid_gtid(gtid);
4106   kmp_int64 shft;
4107   size_t num_dims, i;
4108   kmp_uint32 flag;
4109   kmp_int64 iter_number; // iteration number of "collapsed" loop nest
4110   kmp_info_t *th = __kmp_threads[gtid];
4111   kmp_team_t *team = th->th.th_team;
4112   kmp_disp_t *pr_buf;
4113   kmp_int64 lo, st;
4114 
4115   KA_TRACE(20, ("__kmpc_doacross_post() enter: called T#%d\n", gtid));
4116   if (team->t.t_serialized) {
4117     KA_TRACE(20, ("__kmpc_doacross_post() exit: serialized team\n"));
4118     return; // no dependencies if team is serialized
4119   }
4120 
4121   // calculate sequential iteration number (same as in "wait" but no
4122   // out-of-bounds checks)
4123   pr_buf = th->th.th_dispatch;
4124   KMP_DEBUG_ASSERT(pr_buf->th_doacross_info != NULL);
4125   num_dims = (size_t)pr_buf->th_doacross_info[0];
4126   lo = pr_buf->th_doacross_info[2];
4127   st = pr_buf->th_doacross_info[4];
4128 #if OMPT_SUPPORT && OMPT_OPTIONAL
4129   ompt_dependence_t deps[num_dims];
4130 #endif
4131   if (st == 1) { // most common case
4132     iter_number = vec[0] - lo;
4133   } else if (st > 0) {
4134     iter_number = (kmp_uint64)(vec[0] - lo) / st;
4135   } else { // negative increment
4136     iter_number = (kmp_uint64)(lo - vec[0]) / (-st);
4137   }
4138 #if OMPT_SUPPORT && OMPT_OPTIONAL
4139   deps[0].variable.value = iter_number;
4140   deps[0].dependence_type = ompt_dependence_type_source;
4141 #endif
4142   for (i = 1; i < num_dims; ++i) {
4143     kmp_int64 iter, ln;
4144     size_t j = i * 4;
4145     ln = pr_buf->th_doacross_info[j + 1];
4146     lo = pr_buf->th_doacross_info[j + 2];
4147     st = pr_buf->th_doacross_info[j + 4];
4148     if (st == 1) {
4149       iter = vec[i] - lo;
4150     } else if (st > 0) {
4151       iter = (kmp_uint64)(vec[i] - lo) / st;
4152     } else { // st < 0
4153       iter = (kmp_uint64)(lo - vec[i]) / (-st);
4154     }
4155     iter_number = iter + ln * iter_number;
4156 #if OMPT_SUPPORT && OMPT_OPTIONAL
4157     deps[i].variable.value = iter;
4158     deps[i].dependence_type = ompt_dependence_type_source;
4159 #endif
4160   }
4161 #if OMPT_SUPPORT && OMPT_OPTIONAL
4162   if (ompt_enabled.ompt_callback_dependences) {
4163     ompt_callbacks.ompt_callback(ompt_callback_dependences)(
4164         &(OMPT_CUR_TASK_INFO(th)->task_data), deps, (kmp_uint32)num_dims);
4165   }
4166 #endif
4167   shft = iter_number % 32; // use 32-bit granularity
4168   iter_number >>= 5; // divided by 32
4169   flag = 1 << shft;
4170   KMP_MB();
4171   if ((flag & pr_buf->th_doacross_flags[iter_number]) == 0)
4172     KMP_TEST_THEN_OR32(&pr_buf->th_doacross_flags[iter_number], flag);
4173   KA_TRACE(20, ("__kmpc_doacross_post() exit: T#%d iter %lld posted\n", gtid,
4174                 (iter_number << 5) + shft));
4175 }
4176 
4177 void __kmpc_doacross_fini(ident_t *loc, int gtid) {
4178   __kmp_assert_valid_gtid(gtid);
4179   kmp_int32 num_done;
4180   kmp_info_t *th = __kmp_threads[gtid];
4181   kmp_team_t *team = th->th.th_team;
4182   kmp_disp_t *pr_buf = th->th.th_dispatch;
4183 
4184   KA_TRACE(20, ("__kmpc_doacross_fini() enter: called T#%d\n", gtid));
4185   if (team->t.t_serialized) {
4186     KA_TRACE(20, ("__kmpc_doacross_fini() exit: serialized team %p\n", team));
4187     return; // nothing to do
4188   }
4189   num_done =
4190       KMP_TEST_THEN_INC32((kmp_uintptr_t)(pr_buf->th_doacross_info[1])) + 1;
4191   if (num_done == th->th.th_team_nproc) {
4192     // we are the last thread, need to free shared resources
4193     int idx = pr_buf->th_doacross_buf_idx - 1;
4194     dispatch_shared_info_t *sh_buf =
4195         &team->t.t_disp_buffer[idx % __kmp_dispatch_num_buffers];
4196     KMP_DEBUG_ASSERT(pr_buf->th_doacross_info[1] ==
4197                      (kmp_int64)&sh_buf->doacross_num_done);
4198     KMP_DEBUG_ASSERT(num_done == sh_buf->doacross_num_done);
4199     KMP_DEBUG_ASSERT(idx == sh_buf->doacross_buf_idx);
4200     __kmp_thread_free(th, CCAST(kmp_uint32 *, sh_buf->doacross_flags));
4201     sh_buf->doacross_flags = NULL;
4202     sh_buf->doacross_num_done = 0;
4203     sh_buf->doacross_buf_idx +=
4204         __kmp_dispatch_num_buffers; // free buffer for future re-use
4205   }
4206   // free private resources (need to keep buffer index forever)
4207   pr_buf->th_doacross_flags = NULL;
4208   __kmp_thread_free(th, (void *)pr_buf->th_doacross_info);
4209   pr_buf->th_doacross_info = NULL;
4210   KA_TRACE(20, ("__kmpc_doacross_fini() exit: T#%d\n", gtid));
4211 }
4212 
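/* Illustrative call sequence (the real lowering is compiler-specific) for a
   one-dimensional doacross loop such as the example in the
   __kmpc_doacross_init() documentation, for (i=2; i<9; i+=2) with
   depend(sink: i-2) / depend(source); the brace initializer assumes the
   kmp_dim fields are declared in the order lo, up, st:

     struct kmp_dim dim = {2, 8, 2}; // inclusive lower/upper bound and stride
     __kmpc_doacross_init(&loc, gtid, 1, &dim);
     // for each iteration i assigned to this thread:
     kmp_int64 sink_vec = i - 2;
     __kmpc_doacross_wait(&loc, gtid, &sink_vec); // ordered depend(sink: i-2)
     // ... ordered work ...
     kmp_int64 src_vec = i;
     __kmpc_doacross_post(&loc, gtid, &src_vec);  // ordered depend(source)
     // after the worksharing loop:
     __kmpc_doacross_fini(&loc, gtid);
*/
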
4213 /* omp_alloc/omp_calloc/omp_free only defined for C/C++, not for Fortran */
4214 void *omp_alloc(size_t size, omp_allocator_handle_t allocator) {
4215   return __kmpc_alloc(__kmp_entry_gtid(), size, allocator);
4216 }
4217 
4218 void *omp_calloc(size_t nmemb, size_t size, omp_allocator_handle_t allocator) {
4219   return __kmpc_calloc(__kmp_entry_gtid(), nmemb, size, allocator);
4220 }
4221 
4222 void *omp_realloc(void *ptr, size_t size, omp_allocator_handle_t allocator,
4223                   omp_allocator_handle_t free_allocator) {
4224   return __kmpc_realloc(__kmp_entry_gtid(), ptr, size, allocator,
4225                         free_allocator);
4226 }
4227 
4228 void omp_free(void *ptr, omp_allocator_handle_t allocator) {
4229   __kmpc_free(__kmp_entry_gtid(), ptr, allocator);
4230 }
4231 
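/* User-level example of the allocation API wrapped above (standard OpenMP 5.x
   C code using the predefined default allocator):

     double *buf = (double *)omp_alloc(1024 * sizeof(double),
                                       omp_default_mem_alloc);
     // ... use buf ...
     omp_free(buf, omp_default_mem_alloc);
*/
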
4232 int __kmpc_get_target_offload(void) {
4233   if (!__kmp_init_serial) {
4234     __kmp_serial_initialize();
4235   }
4236   return __kmp_target_offload;
4237 }
4238 
4239 int __kmpc_pause_resource(kmp_pause_status_t level) {
4240   if (!__kmp_init_serial) {
4241     return 1; // Can't pause if runtime is not initialized
4242   }
4243   return __kmp_pause_resource(level);
4244 }
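
/* For reference, a minimal user-side sketch of the corresponding OpenMP 5.0
   API; the user-level omp_pause_resource*() entry points ultimately reach the
   same __kmp_pause_resource() machinery used here:

     #include <omp.h>
     if (omp_pause_resource_all(omp_pause_soft) != 0) {
       // the runtime declined or was unable to pause its resources
     }
*/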
4245