xref: /freebsd/contrib/llvm-project/openmp/runtime/src/kmp_csupport.cpp (revision 5e801ac66d24704442eba426ed13c3effb8a34e7)
1 /*
2  * kmp_csupport.cpp -- kfront linkage support for OpenMP.
3  */
4 
5 //===----------------------------------------------------------------------===//
6 //
7 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8 // See https://llvm.org/LICENSE.txt for license information.
9 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #define __KMP_IMP
14 #include "omp.h" /* extern "C" declarations of user-visible routines */
15 #include "kmp.h"
16 #include "kmp_error.h"
17 #include "kmp_i18n.h"
18 #include "kmp_itt.h"
19 #include "kmp_lock.h"
20 #include "kmp_stats.h"
21 #include "ompt-specific.h"
22 
23 #define MAX_MESSAGE 512
24 
25 // flags will be used in the future, e.g. to implement openmp_strict library
26 // restrictions
27 
28 /*!
29  * @ingroup STARTUP_SHUTDOWN
30  * @param loc   in   source location information
31  * @param flags in   for future use (currently ignored)
32  *
33  * Initialize the runtime library. This call is optional; if it is not made then
34  * it will be implicitly called by attempts to use other library functions.
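 *
 * For illustration only (nothing requires this), a compiler could emit a
 * startup call along these lines, where loc is a compiler-generated ident_t
 * describing the call site:
 * @code
 * __kmpc_begin(&loc, 0); // the flags argument is currently ignored
 * @endcode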
35  */
36 void __kmpc_begin(ident_t *loc, kmp_int32 flags) {
37   // By default __kmpc_begin() is no-op.
38   char *env;
39   if ((env = getenv("KMP_INITIAL_THREAD_BIND")) != NULL &&
40       __kmp_str_match_true(env)) {
41     __kmp_middle_initialize();
42     __kmp_assign_root_init_mask();
43     KC_TRACE(10, ("__kmpc_begin: middle initialization called\n"));
44   } else if (__kmp_ignore_mppbeg() == FALSE) {
45     // By default __kmp_ignore_mppbeg() returns TRUE.
46     __kmp_internal_begin();
47     KC_TRACE(10, ("__kmpc_begin: called\n"));
48   }
49 }
50 
51 /*!
52  * @ingroup STARTUP_SHUTDOWN
53  * @param loc source location information
54  *
55  * Shutdown the runtime library. This is also optional, and even if called will
56  * not do anything unless the `KMP_IGNORE_MPPEND` environment variable is set to
57  * zero.
58  */
59 void __kmpc_end(ident_t *loc) {
60   // By default, __kmp_ignore_mppend() returns TRUE which makes __kmpc_end()
61   // call no-op. However, this can be overridden with KMP_IGNORE_MPPEND
62   // environment variable. If KMP_IGNORE_MPPEND is 0, __kmp_ignore_mppend()
63   // returns FALSE and __kmpc_end() will unregister this root (it can cause
64   // library shut down).
65   if (__kmp_ignore_mppend() == FALSE) {
66     KC_TRACE(10, ("__kmpc_end: called\n"));
67     KA_TRACE(30, ("__kmpc_end\n"));
68 
69     __kmp_internal_end_thread(-1);
70   }
71 #if KMP_OS_WINDOWS && OMPT_SUPPORT
72   // Normal exit process on Windows does not allow worker threads of the final
73   // parallel region to finish reporting their events, so shutting down the
74   // library here fixes the issue at least for the cases where __kmpc_end() is
75   // placed properly.
76   if (ompt_enabled.enabled)
77     __kmp_internal_end_library(__kmp_gtid_get_specific());
78 #endif
79 }
80 
81 /*!
82 @ingroup THREAD_STATES
83 @param loc Source location information.
84 @return The global thread index of the active thread.
85 
86 This function can be called in any context.
87 
88 If the runtime has only been entered at the outermost level from a
89 single (necessarily non-OpenMP<sup>*</sup>) thread, then the thread number is
90 that which would be returned by omp_get_thread_num() in the outermost
91 active parallel construct. (Or zero if there is no active parallel
92 construct, since the primary thread is necessarily thread zero).
93 
94 If multiple non-OpenMP threads all enter an OpenMP construct then this
95 will be a unique thread identifier among all the threads created by
96 the OpenMP runtime (but the value cannot be defined in terms of
97 OpenMP thread ids returned by omp_get_thread_num()).
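97 
As a sketch only (loc and the use of the result are illustrative), the index is
typically obtained once and then passed to other entry points that take a
global thread number:
@code
kmp_int32 gtid = __kmpc_global_thread_num(&loc);
__kmpc_barrier(&loc, gtid);
@endcode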
98 */
99 kmp_int32 __kmpc_global_thread_num(ident_t *loc) {
100   kmp_int32 gtid = __kmp_entry_gtid();
101 
102   KC_TRACE(10, ("__kmpc_global_thread_num: T#%d\n", gtid));
103 
104   return gtid;
105 }
106 
107 /*!
108 @ingroup THREAD_STATES
109 @param loc Source location information.
110 @return The number of threads under control of the OpenMP<sup>*</sup> runtime
111 
112 This function can be called in any context.
113 It returns the total number of threads under the control of the OpenMP runtime.
114 That is not a number that can be determined by any OpenMP standard calls, since
115 the library may be called from more than one non-OpenMP thread, and this
116 reflects the total over all such calls. Similarly the runtime maintains
117 underlying threads even when they are not active (since the cost of creating
118 and destroying OS threads is high), this call counts all such threads even if
119 they are not waiting for work.
120 */
121 kmp_int32 __kmpc_global_num_threads(ident_t *loc) {
122   KC_TRACE(10,
123            ("__kmpc_global_num_threads: num_threads = %d\n", __kmp_all_nth));
124 
125   return TCR_4(__kmp_all_nth);
126 }
127 
128 /*!
129 @ingroup THREAD_STATES
130 @param loc Source location information.
131 @return The thread number of the calling thread in the innermost active parallel
132 construct.
133 */
134 kmp_int32 __kmpc_bound_thread_num(ident_t *loc) {
135   KC_TRACE(10, ("__kmpc_bound_thread_num: called\n"));
136   return __kmp_tid_from_gtid(__kmp_entry_gtid());
137 }
138 
139 /*!
140 @ingroup THREAD_STATES
141 @param loc Source location information.
142 @return The number of threads in the innermost active parallel construct.
143 */
144 kmp_int32 __kmpc_bound_num_threads(ident_t *loc) {
145   KC_TRACE(10, ("__kmpc_bound_num_threads: called\n"));
146 
147   return __kmp_entry_thread()->th.th_team->t.t_nproc;
148 }
149 
150 /*!
151  * @ingroup DEPRECATED
152  * @param loc location description
153  *
154  * This function need not be called. It always returns TRUE.
155  */
156 kmp_int32 __kmpc_ok_to_fork(ident_t *loc) {
157 #ifndef KMP_DEBUG
158 
159   return TRUE;
160 
161 #else
162 
163   const char *semi2;
164   const char *semi3;
165   int line_no;
166 
167   if (__kmp_par_range == 0) {
168     return TRUE;
169   }
170   semi2 = loc->psource;
171   if (semi2 == NULL) {
172     return TRUE;
173   }
174   semi2 = strchr(semi2, ';');
175   if (semi2 == NULL) {
176     return TRUE;
177   }
178   semi2 = strchr(semi2 + 1, ';');
179   if (semi2 == NULL) {
180     return TRUE;
181   }
182   if (__kmp_par_range_filename[0]) {
183     const char *name = semi2 - 1;
184     while ((name > loc->psource) && (*name != '/') && (*name != ';')) {
185       name--;
186     }
187     if ((*name == '/') || (*name == ';')) {
188       name++;
189     }
190     if (strncmp(__kmp_par_range_filename, name, semi2 - name)) {
191       return __kmp_par_range < 0;
192     }
193   }
194   semi3 = strchr(semi2 + 1, ';');
195   if (__kmp_par_range_routine[0]) {
196     if ((semi3 != NULL) && (semi3 > semi2) &&
197         (strncmp(__kmp_par_range_routine, semi2 + 1, semi3 - semi2 - 1))) {
198       return __kmp_par_range < 0;
199     }
200   }
201   if (KMP_SSCANF(semi3 + 1, "%d", &line_no) == 1) {
202     if ((line_no >= __kmp_par_range_lb) && (line_no <= __kmp_par_range_ub)) {
203       return __kmp_par_range > 0;
204     }
205     return __kmp_par_range < 0;
206   }
207   return TRUE;
208 
209 #endif /* KMP_DEBUG */
210 }
211 
212 /*!
213 @ingroup THREAD_STATES
214 @param loc Source location information.
215 @return 1 if this thread is executing inside an active parallel region, zero if
216 not.
217 */
218 kmp_int32 __kmpc_in_parallel(ident_t *loc) {
219   return __kmp_entry_thread()->th.th_root->r.r_active;
220 }
221 
222 /*!
223 @ingroup PARALLEL
224 @param loc source location information
225 @param global_tid global thread number
226 @param num_threads number of threads requested for this parallel construct
227 
228 Set the number of threads to be used by the next fork spawned by this thread.
229 This call is only required if the parallel construct has a `num_threads` clause.
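229 
For example (a sketch; loc and gtid are illustrative), a construct such as
@code
#pragma omp parallel num_threads(4)
@endcode
may be lowered to a call __kmpc_push_num_threads(&loc, gtid, 4) immediately
before the corresponding __kmpc_fork_call.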
230 */
231 void __kmpc_push_num_threads(ident_t *loc, kmp_int32 global_tid,
232                              kmp_int32 num_threads) {
233   KA_TRACE(20, ("__kmpc_push_num_threads: enter T#%d num_threads=%d\n",
234                 global_tid, num_threads));
235   __kmp_assert_valid_gtid(global_tid);
236   __kmp_push_num_threads(loc, global_tid, num_threads);
237 }
238 
239 void __kmpc_pop_num_threads(ident_t *loc, kmp_int32 global_tid) {
240   KA_TRACE(20, ("__kmpc_pop_num_threads: enter\n"));
241   /* the num_threads are automatically popped */
242 }
243 
244 void __kmpc_push_proc_bind(ident_t *loc, kmp_int32 global_tid,
245                            kmp_int32 proc_bind) {
246   KA_TRACE(20, ("__kmpc_push_proc_bind: enter T#%d proc_bind=%d\n", global_tid,
247                 proc_bind));
248   __kmp_assert_valid_gtid(global_tid);
249   __kmp_push_proc_bind(loc, global_tid, (kmp_proc_bind_t)proc_bind);
250 }
251 
252 /*!
253 @ingroup PARALLEL
254 @param loc  source location information
255 @param argc  total number of arguments in the ellipsis
256 @param microtask  pointer to callback routine consisting of outlined parallel
257 construct
258 @param ...  pointers to shared variables that aren't global
259 
260 Do the actual fork and call the microtask in the relevant number of threads.
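260 
As a rough sketch of one possible lowering (the outlined routine, loc, and
variable names are illustrative, not the exact code any particular compiler
generates), a parallel region sharing one variable might become:
@code
static void outlined(kmp_int32 *gtid, kmp_int32 *bound_tid, int *x) {
  // body of "#pragma omp parallel shared(x)", accessing the shared *x
}

void user_func(int x) {
  __kmpc_fork_call(&loc, 1, (kmpc_micro)outlined, &x); // argc == 1 shared arg
}
@endcode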
261 */
262 void __kmpc_fork_call(ident_t *loc, kmp_int32 argc, kmpc_micro microtask, ...) {
263   int gtid = __kmp_entry_gtid();
264 
265 #if (KMP_STATS_ENABLED)
266   // If we were in a serial region, then stop the serial timer, record
267   // the event, and start parallel region timer
268   stats_state_e previous_state = KMP_GET_THREAD_STATE();
269   if (previous_state == stats_state_e::SERIAL_REGION) {
270     KMP_EXCHANGE_PARTITIONED_TIMER(OMP_parallel_overhead);
271   } else {
272     KMP_PUSH_PARTITIONED_TIMER(OMP_parallel_overhead);
273   }
274   int inParallel = __kmpc_in_parallel(loc);
275   if (inParallel) {
276     KMP_COUNT_BLOCK(OMP_NESTED_PARALLEL);
277   } else {
278     KMP_COUNT_BLOCK(OMP_PARALLEL);
279   }
280 #endif
281 
282   // maybe saving thr_state is enough here
283   {
284     va_list ap;
285     va_start(ap, microtask);
286 
287 #if OMPT_SUPPORT
288     ompt_frame_t *ompt_frame;
289     if (ompt_enabled.enabled) {
290       kmp_info_t *master_th = __kmp_threads[gtid];
291       ompt_frame = &master_th->th.th_current_task->ompt_task_info.frame;
292       ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
293     }
294     OMPT_STORE_RETURN_ADDRESS(gtid);
295 #endif
296 
297 #if INCLUDE_SSC_MARKS
298     SSC_MARK_FORKING();
299 #endif
300     __kmp_fork_call(loc, gtid, fork_context_intel, argc,
301                     VOLATILE_CAST(microtask_t) microtask, // "wrapped" task
302                     VOLATILE_CAST(launch_t) __kmp_invoke_task_func,
303                     kmp_va_addr_of(ap));
304 #if INCLUDE_SSC_MARKS
305     SSC_MARK_JOINING();
306 #endif
307     __kmp_join_call(loc, gtid
308 #if OMPT_SUPPORT
309                     ,
310                     fork_context_intel
311 #endif
312     );
313 
314     va_end(ap);
315 
316 #if OMPT_SUPPORT
317     if (ompt_enabled.enabled) {
318       ompt_frame->enter_frame = ompt_data_none;
319     }
320 #endif
321   }
322 
323 #if KMP_STATS_ENABLED
324   if (previous_state == stats_state_e::SERIAL_REGION) {
325     KMP_EXCHANGE_PARTITIONED_TIMER(OMP_serial);
326     KMP_SET_THREAD_STATE(previous_state);
327   } else {
328     KMP_POP_PARTITIONED_TIMER();
329   }
330 #endif // KMP_STATS_ENABLED
331 }
332 
333 /*!
334 @ingroup PARALLEL
335 @param loc source location information
336 @param global_tid global thread number
337 @param num_teams number of teams requested for the teams construct
338 @param num_threads number of threads per team requested for the teams construct
339 
340 Set the number of teams to be used by the teams construct.
341 This call is only required if the teams construct has a `num_teams` clause
342 or a `thread_limit` clause (or both).
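342 
For illustration only, a directive such as
@code
#pragma omp teams num_teams(8) thread_limit(4)
@endcode
might be lowered to __kmpc_push_num_teams(&loc, gtid, 8, 4) followed by the
matching __kmpc_fork_teams call (the exact lowering is compiler-specific).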
343 */
344 void __kmpc_push_num_teams(ident_t *loc, kmp_int32 global_tid,
345                            kmp_int32 num_teams, kmp_int32 num_threads) {
346   KA_TRACE(20,
347            ("__kmpc_push_num_teams: enter T#%d num_teams=%d num_threads=%d\n",
348             global_tid, num_teams, num_threads));
349   __kmp_assert_valid_gtid(global_tid);
350   __kmp_push_num_teams(loc, global_tid, num_teams, num_threads);
351 }
352 
353 /*!
354 @ingroup PARALLEL
355 @param loc source location information
356 @param global_tid global thread number
357 @param num_teams_lb lower bound on number of teams requested for the teams
358 construct
359 @param num_teams_ub upper bound on number of teams requested for the teams
360 construct
361 @param num_threads number of threads per team requested for the teams construct
362 
363 Set the number of teams to be used by the teams construct. The number of initial
364 teams created will be greater than or equal to the lower bound and less than or
365 equal to the upper bound.
366 This call is only required if the teams construct has a `num_teams` clause
367 or a `thread_limit` clause (or both).
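367 
For example (a sketch only), the OpenMP 5.1 range form
@code
#pragma omp teams num_teams(4:8)
@endcode
might be lowered to __kmpc_push_num_teams_51(&loc, gtid, 4, 8, 0), where a zero
num_threads leaves the per-team thread count to the runtime default.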
368 */
369 void __kmpc_push_num_teams_51(ident_t *loc, kmp_int32 global_tid,
370                               kmp_int32 num_teams_lb, kmp_int32 num_teams_ub,
371                               kmp_int32 num_threads) {
372   KA_TRACE(20, ("__kmpc_push_num_teams_51: enter T#%d num_teams_lb=%d"
373                 " num_teams_ub=%d num_threads=%d\n",
374                 global_tid, num_teams_lb, num_teams_ub, num_threads));
375   __kmp_assert_valid_gtid(global_tid);
376   __kmp_push_num_teams_51(loc, global_tid, num_teams_lb, num_teams_ub,
377                           num_threads);
378 }
379 
380 /*!
381 @ingroup PARALLEL
382 @param loc  source location information
383 @param argc  total number of arguments in the ellipsis
384 @param microtask  pointer to callback routine consisting of outlined teams
385 construct
386 @param ...  pointers to shared variables that aren't global
387 
388 Do the actual fork and call the microtask in the relevant number of threads.
389 */
390 void __kmpc_fork_teams(ident_t *loc, kmp_int32 argc, kmpc_micro microtask,
391                        ...) {
392   int gtid = __kmp_entry_gtid();
393   kmp_info_t *this_thr = __kmp_threads[gtid];
394   va_list ap;
395   va_start(ap, microtask);
396 
397 #if KMP_STATS_ENABLED
398   KMP_COUNT_BLOCK(OMP_TEAMS);
399   stats_state_e previous_state = KMP_GET_THREAD_STATE();
400   if (previous_state == stats_state_e::SERIAL_REGION) {
401     KMP_EXCHANGE_PARTITIONED_TIMER(OMP_teams_overhead);
402   } else {
403     KMP_PUSH_PARTITIONED_TIMER(OMP_teams_overhead);
404   }
405 #endif
406 
407   // remember teams entry point and nesting level
408   this_thr->th.th_teams_microtask = microtask;
409   this_thr->th.th_teams_level =
410       this_thr->th.th_team->t.t_level; // AC: can be >0 on host
411 
412 #if OMPT_SUPPORT
413   kmp_team_t *parent_team = this_thr->th.th_team;
414   int tid = __kmp_tid_from_gtid(gtid);
415   if (ompt_enabled.enabled) {
416     parent_team->t.t_implicit_task_taskdata[tid]
417         .ompt_task_info.frame.enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
418   }
419   OMPT_STORE_RETURN_ADDRESS(gtid);
420 #endif
421 
422   // check if __kmpc_push_num_teams was called; if not, set the default
423   // number of teams
424   if (this_thr->th.th_teams_size.nteams == 0) {
425     __kmp_push_num_teams(loc, gtid, 0, 0);
426   }
427   KMP_DEBUG_ASSERT(this_thr->th.th_set_nproc >= 1);
428   KMP_DEBUG_ASSERT(this_thr->th.th_teams_size.nteams >= 1);
429   KMP_DEBUG_ASSERT(this_thr->th.th_teams_size.nth >= 1);
430 
431   __kmp_fork_call(
432       loc, gtid, fork_context_intel, argc,
433       VOLATILE_CAST(microtask_t) __kmp_teams_master, // "wrapped" task
434       VOLATILE_CAST(launch_t) __kmp_invoke_teams_master, kmp_va_addr_of(ap));
435   __kmp_join_call(loc, gtid
436 #if OMPT_SUPPORT
437                   ,
438                   fork_context_intel
439 #endif
440   );
441 
442   // Pop current CG root off list
443   KMP_DEBUG_ASSERT(this_thr->th.th_cg_roots);
444   kmp_cg_root_t *tmp = this_thr->th.th_cg_roots;
445   this_thr->th.th_cg_roots = tmp->up;
446   KA_TRACE(100, ("__kmpc_fork_teams: Thread %p popping node %p and moving up"
447                  " to node %p. cg_nthreads was %d\n",
448                  this_thr, tmp, this_thr->th.th_cg_roots, tmp->cg_nthreads));
449   KMP_DEBUG_ASSERT(tmp->cg_nthreads);
450   int i = tmp->cg_nthreads--;
451   if (i == 1) { // check if we are the last thread in CG (not always the case)
452     __kmp_free(tmp);
453   }
454   // Restore current task's thread_limit from CG root
455   KMP_DEBUG_ASSERT(this_thr->th.th_cg_roots);
456   this_thr->th.th_current_task->td_icvs.thread_limit =
457       this_thr->th.th_cg_roots->cg_thread_limit;
458 
459   this_thr->th.th_teams_microtask = NULL;
460   this_thr->th.th_teams_level = 0;
461   *(kmp_int64 *)(&this_thr->th.th_teams_size) = 0L;
462   va_end(ap);
463 #if KMP_STATS_ENABLED
464   if (previous_state == stats_state_e::SERIAL_REGION) {
465     KMP_EXCHANGE_PARTITIONED_TIMER(OMP_serial);
466     KMP_SET_THREAD_STATE(previous_state);
467   } else {
468     KMP_POP_PARTITIONED_TIMER();
469   }
470 #endif // KMP_STATS_ENABLED
471 }
472 
473 // I don't think this function should ever have been exported.
474 // The __kmpc_ prefix was misapplied.  I'm fairly certain that no generated
475 // openmp code ever called it, but it's been exported from the RTL for so
476 // long that I'm afraid to remove the definition.
477 int __kmpc_invoke_task_func(int gtid) { return __kmp_invoke_task_func(gtid); }
478 
479 /*!
480 @ingroup PARALLEL
481 @param loc  source location information
482 @param global_tid  global thread number
483 
484 Enter a serialized parallel construct. This interface is used to handle a
485 conditional parallel region, like this,
486 @code
487 #pragma omp parallel if (condition)
488 @endcode
489 when the condition is false.
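489 
In that case a compiler might emit a sequence along these lines (the names
outlined, loc, gtid, and x are illustrative only):
@code
if (!condition) {
  kmp_int32 zero = 0;
  __kmpc_serialized_parallel(&loc, gtid);
  outlined(&gtid, &zero, &x); // run the region body on the current thread
  __kmpc_end_serialized_parallel(&loc, gtid);
}
@endcode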
490 */
491 void __kmpc_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
492   // The implementation is now in kmp_runtime.cpp so that it can share static
493   // functions with kmp_fork_call since the tasks to be done are similar in
494   // each case.
495   __kmp_assert_valid_gtid(global_tid);
496 #if OMPT_SUPPORT
497   OMPT_STORE_RETURN_ADDRESS(global_tid);
498 #endif
499   __kmp_serialized_parallel(loc, global_tid);
500 }
501 
502 /*!
503 @ingroup PARALLEL
504 @param loc  source location information
505 @param global_tid  global thread number
506 
507 Leave a serialized parallel construct.
508 */
509 void __kmpc_end_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
510   kmp_internal_control_t *top;
511   kmp_info_t *this_thr;
512   kmp_team_t *serial_team;
513 
514   KC_TRACE(10,
515            ("__kmpc_end_serialized_parallel: called by T#%d\n", global_tid));
516 
517   /* skip all this code for autopar serialized loops since it results in
518      unacceptable overhead */
519   if (loc != NULL && (loc->flags & KMP_IDENT_AUTOPAR))
520     return;
521 
522   // Not autopar code
523   __kmp_assert_valid_gtid(global_tid);
524   if (!TCR_4(__kmp_init_parallel))
525     __kmp_parallel_initialize();
526 
527   __kmp_resume_if_soft_paused();
528 
529   this_thr = __kmp_threads[global_tid];
530   serial_team = this_thr->th.th_serial_team;
531 
532   kmp_task_team_t *task_team = this_thr->th.th_task_team;
533   // we need to wait for the proxy tasks before finishing the thread
534   if (task_team != NULL && task_team->tt.tt_found_proxy_tasks)
535     __kmp_task_team_wait(this_thr, serial_team USE_ITT_BUILD_ARG(NULL));
536 
537   KMP_MB();
538   KMP_DEBUG_ASSERT(serial_team);
539   KMP_ASSERT(serial_team->t.t_serialized);
540   KMP_DEBUG_ASSERT(this_thr->th.th_team == serial_team);
541   KMP_DEBUG_ASSERT(serial_team != this_thr->th.th_root->r.r_root_team);
542   KMP_DEBUG_ASSERT(serial_team->t.t_threads);
543   KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
544 
545 #if OMPT_SUPPORT
546   if (ompt_enabled.enabled &&
547       this_thr->th.ompt_thread_info.state != ompt_state_overhead) {
548     OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame = ompt_data_none;
549     if (ompt_enabled.ompt_callback_implicit_task) {
550       ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
551           ompt_scope_end, NULL, OMPT_CUR_TASK_DATA(this_thr), 1,
552           OMPT_CUR_TASK_INFO(this_thr)->thread_num, ompt_task_implicit);
553     }
554 
555     // clear the task id only after unlinking the task
556     ompt_data_t *parent_task_data;
557     __ompt_get_task_info_internal(1, NULL, &parent_task_data, NULL, NULL, NULL);
558 
559     if (ompt_enabled.ompt_callback_parallel_end) {
560       ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
561           &(serial_team->t.ompt_team_info.parallel_data), parent_task_data,
562           ompt_parallel_invoker_program | ompt_parallel_team,
563           OMPT_LOAD_RETURN_ADDRESS(global_tid));
564     }
565     __ompt_lw_taskteam_unlink(this_thr);
566     this_thr->th.ompt_thread_info.state = ompt_state_overhead;
567   }
568 #endif
569 
570   /* If necessary, pop the internal control stack values and replace the team
571    * values */
572   top = serial_team->t.t_control_stack_top;
573   if (top && top->serial_nesting_level == serial_team->t.t_serialized) {
574     copy_icvs(&serial_team->t.t_threads[0]->th.th_current_task->td_icvs, top);
575     serial_team->t.t_control_stack_top = top->next;
576     __kmp_free(top);
577   }
578 
579   /* pop dispatch buffers stack */
580   KMP_DEBUG_ASSERT(serial_team->t.t_dispatch->th_disp_buffer);
581   {
582     dispatch_private_info_t *disp_buffer =
583         serial_team->t.t_dispatch->th_disp_buffer;
584     serial_team->t.t_dispatch->th_disp_buffer =
585         serial_team->t.t_dispatch->th_disp_buffer->next;
586     __kmp_free(disp_buffer);
587   }
588   this_thr->th.th_def_allocator = serial_team->t.t_def_allocator; // restore
589 
590   --serial_team->t.t_serialized;
591   if (serial_team->t.t_serialized == 0) {
592 
593     /* return to the parallel section */
594 
595 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
596     if (__kmp_inherit_fp_control && serial_team->t.t_fp_control_saved) {
597       __kmp_clear_x87_fpu_status_word();
598       __kmp_load_x87_fpu_control_word(&serial_team->t.t_x87_fpu_control_word);
599       __kmp_load_mxcsr(&serial_team->t.t_mxcsr);
600     }
601 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
602 
603     __kmp_pop_current_task_from_thread(this_thr);
604 #if OMPD_SUPPORT
605     if (ompd_state & OMPD_ENABLE_BP)
606       ompd_bp_parallel_end();
607 #endif
608 
609     this_thr->th.th_team = serial_team->t.t_parent;
610     this_thr->th.th_info.ds.ds_tid = serial_team->t.t_master_tid;
611 
612     /* restore values cached in the thread */
613     this_thr->th.th_team_nproc = serial_team->t.t_parent->t.t_nproc; /*  JPH */
614     this_thr->th.th_team_master =
615         serial_team->t.t_parent->t.t_threads[0]; /* JPH */
616     this_thr->th.th_team_serialized = this_thr->th.th_team->t.t_serialized;
617 
618     /* TODO the below shouldn't need to be adjusted for serialized teams */
619     this_thr->th.th_dispatch =
620         &this_thr->th.th_team->t.t_dispatch[serial_team->t.t_master_tid];
621 
622     KMP_ASSERT(this_thr->th.th_current_task->td_flags.executing == 0);
623     this_thr->th.th_current_task->td_flags.executing = 1;
624 
625     if (__kmp_tasking_mode != tskm_immediate_exec) {
626       // Copy the task team from the new child / old parent team to the thread.
627       this_thr->th.th_task_team =
628           this_thr->th.th_team->t.t_task_team[this_thr->th.th_task_state];
629       KA_TRACE(20,
630                ("__kmpc_end_serialized_parallel: T#%d restoring task_team %p / "
631                 "team %p\n",
632                 global_tid, this_thr->th.th_task_team, this_thr->th.th_team));
633     }
634   } else {
635     if (__kmp_tasking_mode != tskm_immediate_exec) {
636       KA_TRACE(20, ("__kmpc_end_serialized_parallel: T#%d decreasing nesting "
637                     "depth of serial team %p to %d\n",
638                     global_tid, serial_team, serial_team->t.t_serialized));
639     }
640   }
641 
642   serial_team->t.t_level--;
643   if (__kmp_env_consistency_check)
644     __kmp_pop_parallel(global_tid, NULL);
645 #if OMPT_SUPPORT
646   if (ompt_enabled.enabled)
647     this_thr->th.ompt_thread_info.state =
648         ((this_thr->th.th_team_serialized) ? ompt_state_work_serial
649                                            : ompt_state_work_parallel);
650 #endif
651 }
652 
653 /*!
654 @ingroup SYNCHRONIZATION
655 @param loc  source location information.
656 
657 Execute <tt>flush</tt>. This is implemented as a full memory fence. (Though
658 depending on the memory ordering convention obeyed by the compiler
659 even that may not be necessary).
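659 
A standalone flush directive, e.g.
@code
#pragma omp flush
@endcode
is typically lowered to a single call of the form __kmpc_flush(&loc), where loc
is the compiler-generated source location.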
660 */
661 void __kmpc_flush(ident_t *loc) {
662   KC_TRACE(10, ("__kmpc_flush: called\n"));
663 
664   /* need explicit __mf() here since we use volatile instead in the library */
665   KMP_MB(); /* Flush all pending memory write invalidates.  */
666 
667 #if (KMP_ARCH_X86 || KMP_ARCH_X86_64)
668 #if KMP_MIC
669 // fence-style instructions do not exist, but lock; xaddl $0,(%rsp) can be used.
670 // We shouldn't need it, though, since the ABI rules require that
671 // * If the compiler generates NGO stores it also generates the fence
672 // * If users hand-code NGO stores they should insert the fence
673 // therefore no incomplete unordered stores should be visible.
674 #else
675   // C74404
676   // This is to address non-temporal store instructions (sfence needed).
677   // The clflush instruction is also addressed (mfence needed).
678   // Probably the non-temporal load movntdqa instruction should also be
679   // addressed.
680   // mfence is a SSE2 instruction. Do not execute it if CPU is not SSE2.
681   if (!__kmp_cpuinfo.initialized) {
682     __kmp_query_cpuid(&__kmp_cpuinfo);
683   }
684   if (!__kmp_cpuinfo.flags.sse2) {
685     // CPU cannot execute SSE2 instructions.
686   } else {
687 #if KMP_COMPILER_ICC
688     _mm_mfence();
689 #elif KMP_COMPILER_MSVC
690     MemoryBarrier();
691 #else
692     __sync_synchronize();
693 #endif // KMP_COMPILER_ICC
694   }
695 #endif // KMP_MIC
696 #elif (KMP_ARCH_ARM || KMP_ARCH_AARCH64 || KMP_ARCH_MIPS || KMP_ARCH_MIPS64 || \
697        KMP_ARCH_RISCV64)
698 // Nothing to see here, move along
699 #elif KMP_ARCH_PPC64
700 // Nothing needed here (we have a real MB above).
701 #else
702 #error Unknown or unsupported architecture
703 #endif
704 
705 #if OMPT_SUPPORT && OMPT_OPTIONAL
706   if (ompt_enabled.ompt_callback_flush) {
707     ompt_callbacks.ompt_callback(ompt_callback_flush)(
708         __ompt_get_thread_data_internal(), OMPT_GET_RETURN_ADDRESS(0));
709   }
710 #endif
711 }
712 
713 /* -------------------------------------------------------------------------- */
714 /*!
715 @ingroup SYNCHRONIZATION
716 @param loc source location information
717 @param global_tid thread id.
718 
719 Execute a barrier.
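719 
For example, an explicit
@code
#pragma omp barrier
@endcode
is typically lowered to __kmpc_barrier(&loc, gtid), with loc and gtid being the
compiler-supplied source location and global thread number.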
720 */
721 void __kmpc_barrier(ident_t *loc, kmp_int32 global_tid) {
722   KMP_COUNT_BLOCK(OMP_BARRIER);
723   KC_TRACE(10, ("__kmpc_barrier: called T#%d\n", global_tid));
724   __kmp_assert_valid_gtid(global_tid);
725 
726   if (!TCR_4(__kmp_init_parallel))
727     __kmp_parallel_initialize();
728 
729   __kmp_resume_if_soft_paused();
730 
731   if (__kmp_env_consistency_check) {
732     if (loc == 0) {
733       KMP_WARNING(ConstructIdentInvalid); // ??? What does it mean for the user?
734     }
735     __kmp_check_barrier(global_tid, ct_barrier, loc);
736   }
737 
738 #if OMPT_SUPPORT
739   ompt_frame_t *ompt_frame;
740   if (ompt_enabled.enabled) {
741     __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
742     if (ompt_frame->enter_frame.ptr == NULL)
743       ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
744   }
745   OMPT_STORE_RETURN_ADDRESS(global_tid);
746 #endif
747   __kmp_threads[global_tid]->th.th_ident = loc;
748   // TODO: explicit barrier_wait_id:
749   //   this function is called when a 'barrier' directive is present or at the
750   //   implicit barrier at the end of a worksharing construct.
751   // 1) better to add a per-thread barrier counter to a thread data structure
752   // 2) set to 0 when a new team is created
753   // 3) no sync is required
754 
755   __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
756 #if OMPT_SUPPORT && OMPT_OPTIONAL
757   if (ompt_enabled.enabled) {
758     ompt_frame->enter_frame = ompt_data_none;
759   }
760 #endif
761 }
762 
763 /* The BARRIER for a MASTER section is always explicit   */
764 /*!
765 @ingroup WORK_SHARING
766 @param loc  source location information.
767 @param global_tid  global thread number.
768 @return 1 if this thread should execute the <tt>master</tt> block, 0 otherwise.
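768 
A sketch of the usual lowering pattern for a master region (names illustrative):
@code
if (__kmpc_master(&loc, gtid)) {
  // body of "#pragma omp master"
  __kmpc_end_master(&loc, gtid);
}
@endcode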
769 */
770 kmp_int32 __kmpc_master(ident_t *loc, kmp_int32 global_tid) {
771   int status = 0;
772 
773   KC_TRACE(10, ("__kmpc_master: called T#%d\n", global_tid));
774   __kmp_assert_valid_gtid(global_tid);
775 
776   if (!TCR_4(__kmp_init_parallel))
777     __kmp_parallel_initialize();
778 
779   __kmp_resume_if_soft_paused();
780 
781   if (KMP_MASTER_GTID(global_tid)) {
782     KMP_COUNT_BLOCK(OMP_MASTER);
783     KMP_PUSH_PARTITIONED_TIMER(OMP_master);
784     status = 1;
785   }
786 
787 #if OMPT_SUPPORT && OMPT_OPTIONAL
788   if (status) {
789     if (ompt_enabled.ompt_callback_masked) {
790       kmp_info_t *this_thr = __kmp_threads[global_tid];
791       kmp_team_t *team = this_thr->th.th_team;
792 
793       int tid = __kmp_tid_from_gtid(global_tid);
794       ompt_callbacks.ompt_callback(ompt_callback_masked)(
795           ompt_scope_begin, &(team->t.ompt_team_info.parallel_data),
796           &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
797           OMPT_GET_RETURN_ADDRESS(0));
798     }
799   }
800 #endif
801 
802   if (__kmp_env_consistency_check) {
803 #if KMP_USE_DYNAMIC_LOCK
804     if (status)
805       __kmp_push_sync(global_tid, ct_master, loc, NULL, 0);
806     else
807       __kmp_check_sync(global_tid, ct_master, loc, NULL, 0);
808 #else
809     if (status)
810       __kmp_push_sync(global_tid, ct_master, loc, NULL);
811     else
812       __kmp_check_sync(global_tid, ct_master, loc, NULL);
813 #endif
814   }
815 
816   return status;
817 }
818 
819 /*!
820 @ingroup WORK_SHARING
821 @param loc  source location information.
822 @param global_tid  global thread number.
823 
824 Mark the end of a <tt>master</tt> region. This should only be called by the
825 thread that executes the <tt>master</tt> region.
826 */
827 void __kmpc_end_master(ident_t *loc, kmp_int32 global_tid) {
828   KC_TRACE(10, ("__kmpc_end_master: called T#%d\n", global_tid));
829   __kmp_assert_valid_gtid(global_tid);
830   KMP_DEBUG_ASSERT(KMP_MASTER_GTID(global_tid));
831   KMP_POP_PARTITIONED_TIMER();
832 
833 #if OMPT_SUPPORT && OMPT_OPTIONAL
834   kmp_info_t *this_thr = __kmp_threads[global_tid];
835   kmp_team_t *team = this_thr->th.th_team;
836   if (ompt_enabled.ompt_callback_masked) {
837     int tid = __kmp_tid_from_gtid(global_tid);
838     ompt_callbacks.ompt_callback(ompt_callback_masked)(
839         ompt_scope_end, &(team->t.ompt_team_info.parallel_data),
840         &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
841         OMPT_GET_RETURN_ADDRESS(0));
842   }
843 #endif
844 
845   if (__kmp_env_consistency_check) {
846     if (KMP_MASTER_GTID(global_tid))
847       __kmp_pop_sync(global_tid, ct_master, loc);
848   }
849 }
850 
851 /*!
852 @ingroup WORK_SHARING
853 @param loc  source location information.
854 @param global_tid  global thread number.
855 @param filter result of evaluating filter clause on thread global_tid, or zero
856 if no filter clause present
857 @return 1 if this thread should execute the <tt>masked</tt> block, 0 otherwise.
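857 
A sketch of one possible lowering of a masked region with a filter clause
(filter_expr, loc, and gtid are illustrative):
@code
if (__kmpc_masked(&loc, gtid, filter_expr)) {
  // body of "#pragma omp masked filter(filter_expr)"
  __kmpc_end_masked(&loc, gtid);
}
@endcode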
858 */
859 kmp_int32 __kmpc_masked(ident_t *loc, kmp_int32 global_tid, kmp_int32 filter) {
860   int status = 0;
861   int tid;
862   KC_TRACE(10, ("__kmpc_masked: called T#%d\n", global_tid));
863   __kmp_assert_valid_gtid(global_tid);
864 
865   if (!TCR_4(__kmp_init_parallel))
866     __kmp_parallel_initialize();
867 
868   __kmp_resume_if_soft_paused();
869 
870   tid = __kmp_tid_from_gtid(global_tid);
871   if (tid == filter) {
872     KMP_COUNT_BLOCK(OMP_MASKED);
873     KMP_PUSH_PARTITIONED_TIMER(OMP_masked);
874     status = 1;
875   }
876 
877 #if OMPT_SUPPORT && OMPT_OPTIONAL
878   if (status) {
879     if (ompt_enabled.ompt_callback_masked) {
880       kmp_info_t *this_thr = __kmp_threads[global_tid];
881       kmp_team_t *team = this_thr->th.th_team;
882       ompt_callbacks.ompt_callback(ompt_callback_masked)(
883           ompt_scope_begin, &(team->t.ompt_team_info.parallel_data),
884           &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
885           OMPT_GET_RETURN_ADDRESS(0));
886     }
887   }
888 #endif
889 
890   if (__kmp_env_consistency_check) {
891 #if KMP_USE_DYNAMIC_LOCK
892     if (status)
893       __kmp_push_sync(global_tid, ct_masked, loc, NULL, 0);
894     else
895       __kmp_check_sync(global_tid, ct_masked, loc, NULL, 0);
896 #else
897     if (status)
898       __kmp_push_sync(global_tid, ct_masked, loc, NULL);
899     else
900       __kmp_check_sync(global_tid, ct_masked, loc, NULL);
901 #endif
902   }
903 
904   return status;
905 }
906 
907 /*!
908 @ingroup WORK_SHARING
909 @param loc  source location information.
910 @param global_tid  global thread number.
911 
912 Mark the end of a <tt>masked</tt> region. This should only be called by the
913 thread that executes the <tt>masked</tt> region.
914 */
915 void __kmpc_end_masked(ident_t *loc, kmp_int32 global_tid) {
916   KC_TRACE(10, ("__kmpc_end_masked: called T#%d\n", global_tid));
917   __kmp_assert_valid_gtid(global_tid);
918   KMP_POP_PARTITIONED_TIMER();
919 
920 #if OMPT_SUPPORT && OMPT_OPTIONAL
921   kmp_info_t *this_thr = __kmp_threads[global_tid];
922   kmp_team_t *team = this_thr->th.th_team;
923   if (ompt_enabled.ompt_callback_masked) {
924     int tid = __kmp_tid_from_gtid(global_tid);
925     ompt_callbacks.ompt_callback(ompt_callback_masked)(
926         ompt_scope_end, &(team->t.ompt_team_info.parallel_data),
927         &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
928         OMPT_GET_RETURN_ADDRESS(0));
929   }
930 #endif
931 
932   if (__kmp_env_consistency_check) {
933     __kmp_pop_sync(global_tid, ct_masked, loc);
934   }
935 }
936 
937 /*!
938 @ingroup WORK_SHARING
939 @param loc  source location information.
940 @param gtid  global thread number.
941 
942 Start execution of an <tt>ordered</tt> construct.
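942 
Within a loop that carries an ordered clause, each ordered block is typically
bracketed as follows (sketch only):
@code
__kmpc_ordered(&loc, gtid);
// body of "#pragma omp ordered"
__kmpc_end_ordered(&loc, gtid);
@endcode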
943 */
944 void __kmpc_ordered(ident_t *loc, kmp_int32 gtid) {
945   int cid = 0;
946   kmp_info_t *th;
947   KMP_DEBUG_ASSERT(__kmp_init_serial);
948 
949   KC_TRACE(10, ("__kmpc_ordered: called T#%d\n", gtid));
950   __kmp_assert_valid_gtid(gtid);
951 
952   if (!TCR_4(__kmp_init_parallel))
953     __kmp_parallel_initialize();
954 
955   __kmp_resume_if_soft_paused();
956 
957 #if USE_ITT_BUILD
958   __kmp_itt_ordered_prep(gtid);
959 // TODO: ordered_wait_id
960 #endif /* USE_ITT_BUILD */
961 
962   th = __kmp_threads[gtid];
963 
964 #if OMPT_SUPPORT && OMPT_OPTIONAL
965   kmp_team_t *team;
966   ompt_wait_id_t lck;
967   void *codeptr_ra;
968   OMPT_STORE_RETURN_ADDRESS(gtid);
969   if (ompt_enabled.enabled) {
970     team = __kmp_team_from_gtid(gtid);
971     lck = (ompt_wait_id_t)(uintptr_t)&team->t.t_ordered.dt.t_value;
972     /* OMPT state update */
973     th->th.ompt_thread_info.wait_id = lck;
974     th->th.ompt_thread_info.state = ompt_state_wait_ordered;
975 
976     /* OMPT event callback */
977     codeptr_ra = OMPT_LOAD_RETURN_ADDRESS(gtid);
978     if (ompt_enabled.ompt_callback_mutex_acquire) {
979       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
980           ompt_mutex_ordered, omp_lock_hint_none, kmp_mutex_impl_spin, lck,
981           codeptr_ra);
982     }
983   }
984 #endif
985 
986   if (th->th.th_dispatch->th_deo_fcn != 0)
987     (*th->th.th_dispatch->th_deo_fcn)(&gtid, &cid, loc);
988   else
989     __kmp_parallel_deo(&gtid, &cid, loc);
990 
991 #if OMPT_SUPPORT && OMPT_OPTIONAL
992   if (ompt_enabled.enabled) {
993     /* OMPT state update */
994     th->th.ompt_thread_info.state = ompt_state_work_parallel;
995     th->th.ompt_thread_info.wait_id = 0;
996 
997     /* OMPT event callback */
998     if (ompt_enabled.ompt_callback_mutex_acquired) {
999       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
1000           ompt_mutex_ordered, (ompt_wait_id_t)(uintptr_t)lck, codeptr_ra);
1001     }
1002   }
1003 #endif
1004 
1005 #if USE_ITT_BUILD
1006   __kmp_itt_ordered_start(gtid);
1007 #endif /* USE_ITT_BUILD */
1008 }
1009 
1010 /*!
1011 @ingroup WORK_SHARING
1012 @param loc  source location information.
1013 @param gtid  global thread number.
1014 
1015 End execution of an <tt>ordered</tt> construct.
1016 */
1017 void __kmpc_end_ordered(ident_t *loc, kmp_int32 gtid) {
1018   int cid = 0;
1019   kmp_info_t *th;
1020 
1021   KC_TRACE(10, ("__kmpc_end_ordered: called T#%d\n", gtid));
1022   __kmp_assert_valid_gtid(gtid);
1023 
1024 #if USE_ITT_BUILD
1025   __kmp_itt_ordered_end(gtid);
1026 // TODO: ordered_wait_id
1027 #endif /* USE_ITT_BUILD */
1028 
1029   th = __kmp_threads[gtid];
1030 
1031   if (th->th.th_dispatch->th_dxo_fcn != 0)
1032     (*th->th.th_dispatch->th_dxo_fcn)(&gtid, &cid, loc);
1033   else
1034     __kmp_parallel_dxo(&gtid, &cid, loc);
1035 
1036 #if OMPT_SUPPORT && OMPT_OPTIONAL
1037   OMPT_STORE_RETURN_ADDRESS(gtid);
1038   if (ompt_enabled.ompt_callback_mutex_released) {
1039     ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
1040         ompt_mutex_ordered,
1041         (ompt_wait_id_t)(uintptr_t)&__kmp_team_from_gtid(gtid)
1042             ->t.t_ordered.dt.t_value,
1043         OMPT_LOAD_RETURN_ADDRESS(gtid));
1044   }
1045 #endif
1046 }
1047 
1048 #if KMP_USE_DYNAMIC_LOCK
1049 
1050 static __forceinline void
1051 __kmp_init_indirect_csptr(kmp_critical_name *crit, ident_t const *loc,
1052                           kmp_int32 gtid, kmp_indirect_locktag_t tag) {
1053   // Pointer to the allocated indirect lock is written to crit, while indexing
1054   // is ignored.
1055   void *idx;
1056   kmp_indirect_lock_t **lck;
1057   lck = (kmp_indirect_lock_t **)crit;
1058   kmp_indirect_lock_t *ilk = __kmp_allocate_indirect_lock(&idx, gtid, tag);
1059   KMP_I_LOCK_FUNC(ilk, init)(ilk->lock);
1060   KMP_SET_I_LOCK_LOCATION(ilk, loc);
1061   KMP_SET_I_LOCK_FLAGS(ilk, kmp_lf_critical_section);
1062   KA_TRACE(20,
1063            ("__kmp_init_indirect_csptr: initialized indirect lock #%d\n", tag));
1064 #if USE_ITT_BUILD
1065   __kmp_itt_critical_creating(ilk->lock, loc);
1066 #endif
1067   int status = KMP_COMPARE_AND_STORE_PTR(lck, nullptr, ilk);
1068   if (status == 0) {
1069 #if USE_ITT_BUILD
1070     __kmp_itt_critical_destroyed(ilk->lock);
1071 #endif
1072     // We don't really need to destroy the unclaimed lock here since it will be
1073     // cleaned up at program exit.
1074     // KMP_D_LOCK_FUNC(&idx, destroy)((kmp_dyna_lock_t *)&idx);
1075   }
1076   KMP_DEBUG_ASSERT(*lck != NULL);
1077 }
1078 
1079 // Fast-path acquire tas lock
1080 #define KMP_ACQUIRE_TAS_LOCK(lock, gtid)                                       \
1081   {                                                                            \
1082     kmp_tas_lock_t *l = (kmp_tas_lock_t *)lock;                                \
1083     kmp_int32 tas_free = KMP_LOCK_FREE(tas);                                   \
1084     kmp_int32 tas_busy = KMP_LOCK_BUSY(gtid + 1, tas);                         \
1085     if (KMP_ATOMIC_LD_RLX(&l->lk.poll) != tas_free ||                          \
1086         !__kmp_atomic_compare_store_acq(&l->lk.poll, tas_free, tas_busy)) {    \
1087       kmp_uint32 spins;                                                        \
1088       KMP_FSYNC_PREPARE(l);                                                    \
1089       KMP_INIT_YIELD(spins);                                                   \
1090       kmp_backoff_t backoff = __kmp_spin_backoff_params;                       \
1091       do {                                                                     \
1092         if (TCR_4(__kmp_nth) >                                                 \
1093             (__kmp_avail_proc ? __kmp_avail_proc : __kmp_xproc)) {             \
1094           KMP_YIELD(TRUE);                                                     \
1095         } else {                                                               \
1096           KMP_YIELD_SPIN(spins);                                               \
1097         }                                                                      \
1098         __kmp_spin_backoff(&backoff);                                          \
1099       } while (                                                                \
1100           KMP_ATOMIC_LD_RLX(&l->lk.poll) != tas_free ||                        \
1101           !__kmp_atomic_compare_store_acq(&l->lk.poll, tas_free, tas_busy));   \
1102     }                                                                          \
1103     KMP_FSYNC_ACQUIRED(l);                                                     \
1104   }
1105 
1106 // Fast-path test tas lock
1107 #define KMP_TEST_TAS_LOCK(lock, gtid, rc)                                      \
1108   {                                                                            \
1109     kmp_tas_lock_t *l = (kmp_tas_lock_t *)lock;                                \
1110     kmp_int32 tas_free = KMP_LOCK_FREE(tas);                                   \
1111     kmp_int32 tas_busy = KMP_LOCK_BUSY(gtid + 1, tas);                         \
1112     rc = KMP_ATOMIC_LD_RLX(&l->lk.poll) == tas_free &&                         \
1113          __kmp_atomic_compare_store_acq(&l->lk.poll, tas_free, tas_busy);      \
1114   }
1115 
1116 // Fast-path release tas lock
1117 #define KMP_RELEASE_TAS_LOCK(lock, gtid)                                       \
1118   { KMP_ATOMIC_ST_REL(&((kmp_tas_lock_t *)lock)->lk.poll, KMP_LOCK_FREE(tas)); }
1119 
1120 #if KMP_USE_FUTEX
1121 
1122 #include <sys/syscall.h>
1123 #include <unistd.h>
1124 #ifndef FUTEX_WAIT
1125 #define FUTEX_WAIT 0
1126 #endif
1127 #ifndef FUTEX_WAKE
1128 #define FUTEX_WAKE 1
1129 #endif
1130 
1131 // Fast-path acquire futex lock
1132 #define KMP_ACQUIRE_FUTEX_LOCK(lock, gtid)                                     \
1133   {                                                                            \
1134     kmp_futex_lock_t *ftx = (kmp_futex_lock_t *)lock;                          \
1135     kmp_int32 gtid_code = (gtid + 1) << 1;                                     \
1136     KMP_MB();                                                                  \
1137     KMP_FSYNC_PREPARE(ftx);                                                    \
1138     kmp_int32 poll_val;                                                        \
1139     while ((poll_val = KMP_COMPARE_AND_STORE_RET32(                            \
1140                 &(ftx->lk.poll), KMP_LOCK_FREE(futex),                         \
1141                 KMP_LOCK_BUSY(gtid_code, futex))) != KMP_LOCK_FREE(futex)) {   \
1142       kmp_int32 cond = KMP_LOCK_STRIP(poll_val) & 1;                           \
1143       if (!cond) {                                                             \
1144         if (!KMP_COMPARE_AND_STORE_RET32(&(ftx->lk.poll), poll_val,            \
1145                                          poll_val |                            \
1146                                              KMP_LOCK_BUSY(1, futex))) {       \
1147           continue;                                                            \
1148         }                                                                      \
1149         poll_val |= KMP_LOCK_BUSY(1, futex);                                   \
1150       }                                                                        \
1151       kmp_int32 rc;                                                            \
1152       if ((rc = syscall(__NR_futex, &(ftx->lk.poll), FUTEX_WAIT, poll_val,     \
1153                         NULL, NULL, 0)) != 0) {                                \
1154         continue;                                                              \
1155       }                                                                        \
1156       gtid_code |= 1;                                                          \
1157     }                                                                          \
1158     KMP_FSYNC_ACQUIRED(ftx);                                                   \
1159   }
1160 
1161 // Fast-path test futex lock
1162 #define KMP_TEST_FUTEX_LOCK(lock, gtid, rc)                                    \
1163   {                                                                            \
1164     kmp_futex_lock_t *ftx = (kmp_futex_lock_t *)lock;                          \
1165     if (KMP_COMPARE_AND_STORE_ACQ32(&(ftx->lk.poll), KMP_LOCK_FREE(futex),     \
1166                                     KMP_LOCK_BUSY(gtid + 1 << 1, futex))) {    \
1167       KMP_FSYNC_ACQUIRED(ftx);                                                 \
1168       rc = TRUE;                                                               \
1169     } else {                                                                   \
1170       rc = FALSE;                                                              \
1171     }                                                                          \
1172   }
1173 
1174 // Fast-path release futex lock
1175 #define KMP_RELEASE_FUTEX_LOCK(lock, gtid)                                     \
1176   {                                                                            \
1177     kmp_futex_lock_t *ftx = (kmp_futex_lock_t *)lock;                          \
1178     KMP_MB();                                                                  \
1179     KMP_FSYNC_RELEASING(ftx);                                                  \
1180     kmp_int32 poll_val =                                                       \
1181         KMP_XCHG_FIXED32(&(ftx->lk.poll), KMP_LOCK_FREE(futex));               \
1182     if (KMP_LOCK_STRIP(poll_val) & 1) {                                        \
1183       syscall(__NR_futex, &(ftx->lk.poll), FUTEX_WAKE,                         \
1184               KMP_LOCK_BUSY(1, futex), NULL, NULL, 0);                         \
1185     }                                                                          \
1186     KMP_MB();                                                                  \
1187     KMP_YIELD_OVERSUB();                                                       \
1188   }
1189 
1190 #endif // KMP_USE_FUTEX
1191 
1192 #else // KMP_USE_DYNAMIC_LOCK
1193 
1194 static kmp_user_lock_p __kmp_get_critical_section_ptr(kmp_critical_name *crit,
1195                                                       ident_t const *loc,
1196                                                       kmp_int32 gtid) {
1197   kmp_user_lock_p *lck_pp = (kmp_user_lock_p *)crit;
1198 
1199   // Because of the double-check, the following load doesn't need to be volatile
1200   kmp_user_lock_p lck = (kmp_user_lock_p)TCR_PTR(*lck_pp);
1201 
1202   if (lck == NULL) {
1203     void *idx;
1204 
1205     // Allocate & initialize the lock.
1206     // Remember alloc'ed locks in table in order to free them in __kmp_cleanup()
1207     lck = __kmp_user_lock_allocate(&idx, gtid, kmp_lf_critical_section);
1208     __kmp_init_user_lock_with_checks(lck);
1209     __kmp_set_user_lock_location(lck, loc);
1210 #if USE_ITT_BUILD
1211     __kmp_itt_critical_creating(lck);
1212 // __kmp_itt_critical_creating() should be called *before* the first usage
1213 // of the underlying lock. It is the only place where we can guarantee it. There
1214 // is a chance the lock will be destroyed without ever being used, but that is
1215 // not a problem, because this is not a real event seen by the user but rather
1216 // sets a name for the object (lock). See kmp_itt.h for more details.
1217 #endif /* USE_ITT_BUILD */
1218 
1219     // Use a cmpxchg instruction to slam the start of the critical section with
1220     // the lock pointer.  If another thread beat us to it, deallocate the lock,
1221     // and use the lock that the other thread allocated.
1222     int status = KMP_COMPARE_AND_STORE_PTR(lck_pp, 0, lck);
1223 
1224     if (status == 0) {
1225 // Deallocate the lock and reload the value.
1226 #if USE_ITT_BUILD
1227       __kmp_itt_critical_destroyed(lck);
1228 // Let ITT know the lock is destroyed and the same memory location may be reused
1229 // for another purpose.
1230 #endif /* USE_ITT_BUILD */
1231       __kmp_destroy_user_lock_with_checks(lck);
1232       __kmp_user_lock_free(&idx, gtid, lck);
1233       lck = (kmp_user_lock_p)TCR_PTR(*lck_pp);
1234       KMP_DEBUG_ASSERT(lck != NULL);
1235     }
1236   }
1237   return lck;
1238 }
1239 
1240 #endif // KMP_USE_DYNAMIC_LOCK
1241 
1242 /*!
1243 @ingroup WORK_SHARING
1244 @param loc  source location information.
1245 @param global_tid  global thread number.
1246 @param crit identity of the critical section. This could be a pointer to a lock
1247 associated with the critical section, or some other suitably unique value.
1248 
1249 Enter code protected by a `critical` construct.
1250 This function blocks until the executing thread can enter the critical section.
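1250 
A sketch of one possible lowering of a named critical section; the lock variable
shown here is illustrative (compilers typically emit a zero-initialized
kmp_critical_name with static storage for each critical name):
@code
static kmp_critical_name crit_name_lck;
__kmpc_critical(&loc, gtid, &crit_name_lck);
// body of "#pragma omp critical (name)"
__kmpc_end_critical(&loc, gtid, &crit_name_lck);
@endcode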
1251 */
1252 void __kmpc_critical(ident_t *loc, kmp_int32 global_tid,
1253                      kmp_critical_name *crit) {
1254 #if KMP_USE_DYNAMIC_LOCK
1255 #if OMPT_SUPPORT && OMPT_OPTIONAL
1256   OMPT_STORE_RETURN_ADDRESS(global_tid);
1257 #endif // OMPT_SUPPORT
1258   __kmpc_critical_with_hint(loc, global_tid, crit, omp_lock_hint_none);
1259 #else
1260   KMP_COUNT_BLOCK(OMP_CRITICAL);
1261 #if OMPT_SUPPORT && OMPT_OPTIONAL
1262   ompt_state_t prev_state = ompt_state_undefined;
1263   ompt_thread_info_t ti;
1264 #endif
1265   kmp_user_lock_p lck;
1266 
1267   KC_TRACE(10, ("__kmpc_critical: called T#%d\n", global_tid));
1268   __kmp_assert_valid_gtid(global_tid);
1269 
1270   // TODO: add THR_OVHD_STATE
1271 
1272   KMP_PUSH_PARTITIONED_TIMER(OMP_critical_wait);
1273   KMP_CHECK_USER_LOCK_INIT();
1274 
1275   if ((__kmp_user_lock_kind == lk_tas) &&
1276       (sizeof(lck->tas.lk.poll) <= OMP_CRITICAL_SIZE)) {
1277     lck = (kmp_user_lock_p)crit;
1278   }
1279 #if KMP_USE_FUTEX
1280   else if ((__kmp_user_lock_kind == lk_futex) &&
1281            (sizeof(lck->futex.lk.poll) <= OMP_CRITICAL_SIZE)) {
1282     lck = (kmp_user_lock_p)crit;
1283   }
1284 #endif
1285   else { // ticket, queuing or drdpa
1286     lck = __kmp_get_critical_section_ptr(crit, loc, global_tid);
1287   }
1288 
1289   if (__kmp_env_consistency_check)
1290     __kmp_push_sync(global_tid, ct_critical, loc, lck);
1291 
1292     // Since the critical directive binds to all threads, not just the current
1293     // team, we have to check this even if we are in a serialized team.
1294     // Also, even if we are the uber thread, we still have to acquire the lock,
1295     // as we have to contend with sibling threads.
1296 
1297 #if USE_ITT_BUILD
1298   __kmp_itt_critical_acquiring(lck);
1299 #endif /* USE_ITT_BUILD */
1300 #if OMPT_SUPPORT && OMPT_OPTIONAL
1301   OMPT_STORE_RETURN_ADDRESS(global_tid);
1302   void *codeptr_ra = NULL;
1303   if (ompt_enabled.enabled) {
1304     ti = __kmp_threads[global_tid]->th.ompt_thread_info;
1305     /* OMPT state update */
1306     prev_state = ti.state;
1307     ti.wait_id = (ompt_wait_id_t)(uintptr_t)lck;
1308     ti.state = ompt_state_wait_critical;
1309 
1310     /* OMPT event callback */
1311     codeptr_ra = OMPT_LOAD_RETURN_ADDRESS(global_tid);
1312     if (ompt_enabled.ompt_callback_mutex_acquire) {
1313       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
1314           ompt_mutex_critical, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
1315           (ompt_wait_id_t)(uintptr_t)lck, codeptr_ra);
1316     }
1317   }
1318 #endif
1319   // The value of 'crit' should be usable as the critical_id of the critical
1320   // section directive.
1321   __kmp_acquire_user_lock_with_checks(lck, global_tid);
1322 
1323 #if USE_ITT_BUILD
1324   __kmp_itt_critical_acquired(lck);
1325 #endif /* USE_ITT_BUILD */
1326 #if OMPT_SUPPORT && OMPT_OPTIONAL
1327   if (ompt_enabled.enabled) {
1328     /* OMPT state update */
1329     ti.state = prev_state;
1330     ti.wait_id = 0;
1331 
1332     /* OMPT event callback */
1333     if (ompt_enabled.ompt_callback_mutex_acquired) {
1334       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
1335           ompt_mutex_critical, (ompt_wait_id_t)(uintptr_t)lck, codeptr_ra);
1336     }
1337   }
1338 #endif
1339   KMP_POP_PARTITIONED_TIMER();
1340 
1341   KMP_PUSH_PARTITIONED_TIMER(OMP_critical);
1342   KA_TRACE(15, ("__kmpc_critical: done T#%d\n", global_tid));
1343 #endif // KMP_USE_DYNAMIC_LOCK
1344 }
1345 
1346 #if KMP_USE_DYNAMIC_LOCK
1347 
1348 // Converts the given hint to an internal lock implementation
1349 static __forceinline kmp_dyna_lockseq_t __kmp_map_hint_to_lock(uintptr_t hint) {
1350 #if KMP_USE_TSX
1351 #define KMP_TSX_LOCK(seq) lockseq_##seq
1352 #else
1353 #define KMP_TSX_LOCK(seq) __kmp_user_lock_seq
1354 #endif
1355 
1356 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
1357 #define KMP_CPUINFO_RTM (__kmp_cpuinfo.flags.rtm)
1358 #else
1359 #define KMP_CPUINFO_RTM 0
1360 #endif
1361 
1362   // Hints that do not require further logic
1363   if (hint & kmp_lock_hint_hle)
1364     return KMP_TSX_LOCK(hle);
1365   if (hint & kmp_lock_hint_rtm)
1366     return KMP_CPUINFO_RTM ? KMP_TSX_LOCK(rtm_queuing) : __kmp_user_lock_seq;
1367   if (hint & kmp_lock_hint_adaptive)
1368     return KMP_CPUINFO_RTM ? KMP_TSX_LOCK(adaptive) : __kmp_user_lock_seq;
1369 
1370   // Rule out conflicting hints first by returning the default lock
1371   if ((hint & omp_lock_hint_contended) && (hint & omp_lock_hint_uncontended))
1372     return __kmp_user_lock_seq;
1373   if ((hint & omp_lock_hint_speculative) &&
1374       (hint & omp_lock_hint_nonspeculative))
1375     return __kmp_user_lock_seq;
1376 
1377   // Do not even consider speculation when it appears to be contended
1378   if (hint & omp_lock_hint_contended)
1379     return lockseq_queuing;
1380 
1381   // Uncontended lock without speculation
1382   if ((hint & omp_lock_hint_uncontended) && !(hint & omp_lock_hint_speculative))
1383     return lockseq_tas;
1384 
1385   // Use RTM lock for speculation
1386   if (hint & omp_lock_hint_speculative)
1387     return KMP_CPUINFO_RTM ? KMP_TSX_LOCK(rtm_spin) : __kmp_user_lock_seq;
1388 
1389   return __kmp_user_lock_seq;
1390 }
1391 
1392 #if OMPT_SUPPORT && OMPT_OPTIONAL
1393 #if KMP_USE_DYNAMIC_LOCK
1394 static kmp_mutex_impl_t
1395 __ompt_get_mutex_impl_type(void *user_lock, kmp_indirect_lock_t *ilock = 0) {
1396   if (user_lock) {
1397     switch (KMP_EXTRACT_D_TAG(user_lock)) {
1398     case 0:
1399       break;
1400 #if KMP_USE_FUTEX
1401     case locktag_futex:
1402       return kmp_mutex_impl_queuing;
1403 #endif
1404     case locktag_tas:
1405       return kmp_mutex_impl_spin;
1406 #if KMP_USE_TSX
1407     case locktag_hle:
1408     case locktag_rtm_spin:
1409       return kmp_mutex_impl_speculative;
1410 #endif
1411     default:
1412       return kmp_mutex_impl_none;
1413     }
1414     ilock = KMP_LOOKUP_I_LOCK(user_lock);
1415   }
1416   KMP_ASSERT(ilock);
1417   switch (ilock->type) {
1418 #if KMP_USE_TSX
1419   case locktag_adaptive:
1420   case locktag_rtm_queuing:
1421     return kmp_mutex_impl_speculative;
1422 #endif
1423   case locktag_nested_tas:
1424     return kmp_mutex_impl_spin;
1425 #if KMP_USE_FUTEX
1426   case locktag_nested_futex:
1427 #endif
1428   case locktag_ticket:
1429   case locktag_queuing:
1430   case locktag_drdpa:
1431   case locktag_nested_ticket:
1432   case locktag_nested_queuing:
1433   case locktag_nested_drdpa:
1434     return kmp_mutex_impl_queuing;
1435   default:
1436     return kmp_mutex_impl_none;
1437   }
1438 }
1439 #else
1440 // For locks without dynamic binding
1441 static kmp_mutex_impl_t __ompt_get_mutex_impl_type() {
1442   switch (__kmp_user_lock_kind) {
1443   case lk_tas:
1444     return kmp_mutex_impl_spin;
1445 #if KMP_USE_FUTEX
1446   case lk_futex:
1447 #endif
1448   case lk_ticket:
1449   case lk_queuing:
1450   case lk_drdpa:
1451     return kmp_mutex_impl_queuing;
1452 #if KMP_USE_TSX
1453   case lk_hle:
1454   case lk_rtm_queuing:
1455   case lk_rtm_spin:
1456   case lk_adaptive:
1457     return kmp_mutex_impl_speculative;
1458 #endif
1459   default:
1460     return kmp_mutex_impl_none;
1461   }
1462 }
1463 #endif // KMP_USE_DYNAMIC_LOCK
1464 #endif // OMPT_SUPPORT && OMPT_OPTIONAL
1465 
1466 /*!
1467 @ingroup WORK_SHARING
1468 @param loc  source location information.
1469 @param global_tid  global thread number.
1470 @param crit identity of the critical section. This could be a pointer to a lock
1471 associated with the critical section, or some other suitably unique value.
1472 @param hint the lock hint.
1473 
1474 Enter code protected by a `critical` construct with a hint. The hint value is
1475 used to suggest a lock implementation. This function blocks until the executing
1476 thread can enter the critical section unless the hint suggests use of
1477 speculative execution and the hardware supports it.
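
A minimal usage sketch (illustrative only; the `loc` and `gtid` arguments and
the statically allocated name are assumptions, not provided by this runtime):
@code
static kmp_critical_name crit_name; // zero-initialized; first entry picks the lock
__kmpc_critical_with_hint(&loc, gtid, &crit_name, omp_lock_hint_speculative);
// ... code protected by the critical construct ...
__kmpc_end_critical(&loc, gtid, &crit_name);
@endcode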
1478 */
1479 void __kmpc_critical_with_hint(ident_t *loc, kmp_int32 global_tid,
1480                                kmp_critical_name *crit, uint32_t hint) {
1481   KMP_COUNT_BLOCK(OMP_CRITICAL);
1482   kmp_user_lock_p lck;
1483 #if OMPT_SUPPORT && OMPT_OPTIONAL
1484   ompt_state_t prev_state = ompt_state_undefined;
1485   ompt_thread_info_t ti;
1486   // This is the case, if called from __kmpc_critical:
1487   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(global_tid);
1488   if (!codeptr)
1489     codeptr = OMPT_GET_RETURN_ADDRESS(0);
1490 #endif
1491 
1492   KC_TRACE(10, ("__kmpc_critical: called T#%d\n", global_tid));
1493   __kmp_assert_valid_gtid(global_tid);
1494 
1495   kmp_dyna_lock_t *lk = (kmp_dyna_lock_t *)crit;
1496   // Check if it is initialized.
1497   KMP_PUSH_PARTITIONED_TIMER(OMP_critical_wait);
1498   kmp_dyna_lockseq_t lockseq = __kmp_map_hint_to_lock(hint);
1499   if (*lk == 0) {
1500     if (KMP_IS_D_LOCK(lockseq)) {
1501       KMP_COMPARE_AND_STORE_ACQ32((volatile kmp_int32 *)crit, 0,
1502                                   KMP_GET_D_TAG(lockseq));
1503     } else {
1504       __kmp_init_indirect_csptr(crit, loc, global_tid, KMP_GET_I_TAG(lockseq));
1505     }
1506   }
1507   // Branch for accessing the actual lock object and set operation. This
1508   // branching is inevitable since this lock initialization does not follow the
1509   // normal dispatch path (lock table is not used).
1510   if (KMP_EXTRACT_D_TAG(lk) != 0) {
1511     lck = (kmp_user_lock_p)lk;
1512     if (__kmp_env_consistency_check) {
1513       __kmp_push_sync(global_tid, ct_critical, loc, lck,
1514                       __kmp_map_hint_to_lock(hint));
1515     }
1516 #if USE_ITT_BUILD
1517     __kmp_itt_critical_acquiring(lck);
1518 #endif
1519 #if OMPT_SUPPORT && OMPT_OPTIONAL
1520     if (ompt_enabled.enabled) {
1521       ti = __kmp_threads[global_tid]->th.ompt_thread_info;
1522       /* OMPT state update */
1523       prev_state = ti.state;
1524       ti.wait_id = (ompt_wait_id_t)(uintptr_t)lck;
1525       ti.state = ompt_state_wait_critical;
1526 
1527       /* OMPT event callback */
1528       if (ompt_enabled.ompt_callback_mutex_acquire) {
1529         ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
1530             ompt_mutex_critical, (unsigned int)hint,
1531             __ompt_get_mutex_impl_type(crit), (ompt_wait_id_t)(uintptr_t)lck,
1532             codeptr);
1533       }
1534     }
1535 #endif
1536 #if KMP_USE_INLINED_TAS
1537     if (lockseq == lockseq_tas && !__kmp_env_consistency_check) {
1538       KMP_ACQUIRE_TAS_LOCK(lck, global_tid);
1539     } else
1540 #elif KMP_USE_INLINED_FUTEX
1541     if (lockseq == lockseq_futex && !__kmp_env_consistency_check) {
1542       KMP_ACQUIRE_FUTEX_LOCK(lck, global_tid);
1543     } else
1544 #endif
1545     {
1546       KMP_D_LOCK_FUNC(lk, set)(lk, global_tid);
1547     }
1548   } else {
1549     kmp_indirect_lock_t *ilk = *((kmp_indirect_lock_t **)lk);
1550     lck = ilk->lock;
1551     if (__kmp_env_consistency_check) {
1552       __kmp_push_sync(global_tid, ct_critical, loc, lck,
1553                       __kmp_map_hint_to_lock(hint));
1554     }
1555 #if USE_ITT_BUILD
1556     __kmp_itt_critical_acquiring(lck);
1557 #endif
1558 #if OMPT_SUPPORT && OMPT_OPTIONAL
1559     if (ompt_enabled.enabled) {
1560       ti = __kmp_threads[global_tid]->th.ompt_thread_info;
1561       /* OMPT state update */
1562       prev_state = ti.state;
1563       ti.wait_id = (ompt_wait_id_t)(uintptr_t)lck;
1564       ti.state = ompt_state_wait_critical;
1565 
1566       /* OMPT event callback */
1567       if (ompt_enabled.ompt_callback_mutex_acquire) {
1568         ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
1569             ompt_mutex_critical, (unsigned int)hint,
1570             __ompt_get_mutex_impl_type(0, ilk), (ompt_wait_id_t)(uintptr_t)lck,
1571             codeptr);
1572       }
1573     }
1574 #endif
1575     KMP_I_LOCK_FUNC(ilk, set)(lck, global_tid);
1576   }
1577   KMP_POP_PARTITIONED_TIMER();
1578 
1579 #if USE_ITT_BUILD
1580   __kmp_itt_critical_acquired(lck);
1581 #endif /* USE_ITT_BUILD */
1582 #if OMPT_SUPPORT && OMPT_OPTIONAL
1583   if (ompt_enabled.enabled) {
1584     /* OMPT state update */
1585     ti.state = prev_state;
1586     ti.wait_id = 0;
1587 
1588     /* OMPT event callback */
1589     if (ompt_enabled.ompt_callback_mutex_acquired) {
1590       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
1591           ompt_mutex_critical, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
1592     }
1593   }
1594 #endif
1595 
1596   KMP_PUSH_PARTITIONED_TIMER(OMP_critical);
1597   KA_TRACE(15, ("__kmpc_critical: done T#%d\n", global_tid));
1598 } // __kmpc_critical_with_hint
1599 
1600 #endif // KMP_USE_DYNAMIC_LOCK
1601 
1602 /*!
1603 @ingroup WORK_SHARING
1604 @param loc  source location information.
1605 @param global_tid  global thread number.
1606 @param crit identity of the critical section. This could be a pointer to a lock
1607 associated with the critical section, or some other suitably unique value.
1608 
1609 Leave a critical section, releasing any lock that was held during its execution.
1610 */
1611 void __kmpc_end_critical(ident_t *loc, kmp_int32 global_tid,
1612                          kmp_critical_name *crit) {
1613   kmp_user_lock_p lck;
1614 
1615   KC_TRACE(10, ("__kmpc_end_critical: called T#%d\n", global_tid));
1616 
1617 #if KMP_USE_DYNAMIC_LOCK
1618   int locktag = KMP_EXTRACT_D_TAG(crit);
1619   if (locktag) {
1620     lck = (kmp_user_lock_p)crit;
1621     KMP_ASSERT(lck != NULL);
1622     if (__kmp_env_consistency_check) {
1623       __kmp_pop_sync(global_tid, ct_critical, loc);
1624     }
1625 #if USE_ITT_BUILD
1626     __kmp_itt_critical_releasing(lck);
1627 #endif
1628 #if KMP_USE_INLINED_TAS
1629     if (locktag == locktag_tas && !__kmp_env_consistency_check) {
1630       KMP_RELEASE_TAS_LOCK(lck, global_tid);
1631     } else
1632 #elif KMP_USE_INLINED_FUTEX
1633     if (locktag == locktag_futex && !__kmp_env_consistency_check) {
1634       KMP_RELEASE_FUTEX_LOCK(lck, global_tid);
1635     } else
1636 #endif
1637     {
1638       KMP_D_LOCK_FUNC(lck, unset)((kmp_dyna_lock_t *)lck, global_tid);
1639     }
1640   } else {
1641     kmp_indirect_lock_t *ilk =
1642         (kmp_indirect_lock_t *)TCR_PTR(*((kmp_indirect_lock_t **)crit));
1643     KMP_ASSERT(ilk != NULL);
1644     lck = ilk->lock;
1645     if (__kmp_env_consistency_check) {
1646       __kmp_pop_sync(global_tid, ct_critical, loc);
1647     }
1648 #if USE_ITT_BUILD
1649     __kmp_itt_critical_releasing(lck);
1650 #endif
1651     KMP_I_LOCK_FUNC(ilk, unset)(lck, global_tid);
1652   }
1653 
1654 #else // KMP_USE_DYNAMIC_LOCK
1655 
1656   if ((__kmp_user_lock_kind == lk_tas) &&
1657       (sizeof(lck->tas.lk.poll) <= OMP_CRITICAL_SIZE)) {
1658     lck = (kmp_user_lock_p)crit;
1659   }
1660 #if KMP_USE_FUTEX
1661   else if ((__kmp_user_lock_kind == lk_futex) &&
1662            (sizeof(lck->futex.lk.poll) <= OMP_CRITICAL_SIZE)) {
1663     lck = (kmp_user_lock_p)crit;
1664   }
1665 #endif
1666   else { // ticket, queuing or drdpa
1667     lck = (kmp_user_lock_p)TCR_PTR(*((kmp_user_lock_p *)crit));
1668   }
1669 
1670   KMP_ASSERT(lck != NULL);
1671 
1672   if (__kmp_env_consistency_check)
1673     __kmp_pop_sync(global_tid, ct_critical, loc);
1674 
1675 #if USE_ITT_BUILD
1676   __kmp_itt_critical_releasing(lck);
1677 #endif /* USE_ITT_BUILD */
1678   // Value of 'crit' should be good for using as a critical_id of the critical
1679   // section directive.
1680   __kmp_release_user_lock_with_checks(lck, global_tid);
1681 
1682 #endif // KMP_USE_DYNAMIC_LOCK
1683 
1684 #if OMPT_SUPPORT && OMPT_OPTIONAL
1685   /* OMPT release event triggers after lock is released; place here to trigger
1686    * for all #if branches */
1687   OMPT_STORE_RETURN_ADDRESS(global_tid);
1688   if (ompt_enabled.ompt_callback_mutex_released) {
1689     ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
1690         ompt_mutex_critical, (ompt_wait_id_t)(uintptr_t)lck,
1691         OMPT_LOAD_RETURN_ADDRESS(0));
1692   }
1693 #endif
1694 
1695   KMP_POP_PARTITIONED_TIMER();
1696   KA_TRACE(15, ("__kmpc_end_critical: done T#%d\n", global_tid));
1697 }
1698 
1699 /*!
1700 @ingroup SYNCHRONIZATION
1701 @param loc source location information
1702 @param global_tid thread id.
1703 @return one if the thread should execute the master block, zero otherwise
1704 
1705 Start execution of a combined barrier and master. The barrier is executed inside
1706 this function.
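
A sketch of the expected lowering (`loc` and `gtid` assumed to be available):
@code
if (__kmpc_barrier_master(&loc, gtid)) {
  // ... master code ...
  __kmpc_end_barrier_master(&loc, gtid);
}
@endcode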
1707 */
1708 kmp_int32 __kmpc_barrier_master(ident_t *loc, kmp_int32 global_tid) {
1709   int status;
1710   KC_TRACE(10, ("__kmpc_barrier_master: called T#%d\n", global_tid));
1711   __kmp_assert_valid_gtid(global_tid);
1712 
1713   if (!TCR_4(__kmp_init_parallel))
1714     __kmp_parallel_initialize();
1715 
1716   __kmp_resume_if_soft_paused();
1717 
1718   if (__kmp_env_consistency_check)
1719     __kmp_check_barrier(global_tid, ct_barrier, loc);
1720 
1721 #if OMPT_SUPPORT
1722   ompt_frame_t *ompt_frame;
1723   if (ompt_enabled.enabled) {
1724     __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
1725     if (ompt_frame->enter_frame.ptr == NULL)
1726       ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1727   }
1728   OMPT_STORE_RETURN_ADDRESS(global_tid);
1729 #endif
1730 #if USE_ITT_NOTIFY
1731   __kmp_threads[global_tid]->th.th_ident = loc;
1732 #endif
1733   status = __kmp_barrier(bs_plain_barrier, global_tid, TRUE, 0, NULL, NULL);
1734 #if OMPT_SUPPORT && OMPT_OPTIONAL
1735   if (ompt_enabled.enabled) {
1736     ompt_frame->enter_frame = ompt_data_none;
1737   }
1738 #endif
1739 
1740   return (status != 0) ? 0 : 1;
1741 }
1742 
1743 /*!
1744 @ingroup SYNCHRONIZATION
1745 @param loc source location information
1746 @param global_tid thread id.
1747 
1748 Complete the execution of a combined barrier and master. This function should
1749 only be called at the completion of the <tt>master</tt> code. Other threads will
1750 still be waiting at the barrier and this call releases them.
1751 */
1752 void __kmpc_end_barrier_master(ident_t *loc, kmp_int32 global_tid) {
1753   KC_TRACE(10, ("__kmpc_end_barrier_master: called T#%d\n", global_tid));
1754   __kmp_assert_valid_gtid(global_tid);
1755   __kmp_end_split_barrier(bs_plain_barrier, global_tid);
1756 }
1757 
1758 /*!
1759 @ingroup SYNCHRONIZATION
1760 @param loc source location information
1761 @param global_tid thread id.
1762 @return one if the thread should execute the master block, zero otherwise
1763 
1764 Start execution of a combined barrier and master(nowait) construct.
1765 The barrier is executed inside this function.
1766 There is no equivalent "end" function, since the other threads have already passed the barrier and do not wait for the master code to complete.
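
A sketch of the expected lowering (`loc` and `gtid` assumed to be available):
@code
if (__kmpc_barrier_master_nowait(&loc, gtid)) {
  // ... master code; no "end" call and no releasing barrier afterwards ...
}
@endcode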
1767 */
1768 kmp_int32 __kmpc_barrier_master_nowait(ident_t *loc, kmp_int32 global_tid) {
1769   kmp_int32 ret;
1770   KC_TRACE(10, ("__kmpc_barrier_master_nowait: called T#%d\n", global_tid));
1771   __kmp_assert_valid_gtid(global_tid);
1772 
1773   if (!TCR_4(__kmp_init_parallel))
1774     __kmp_parallel_initialize();
1775 
1776   __kmp_resume_if_soft_paused();
1777 
1778   if (__kmp_env_consistency_check) {
1779     if (loc == 0) {
1780       KMP_WARNING(ConstructIdentInvalid); // ??? What does it mean for the user?
1781     }
1782     __kmp_check_barrier(global_tid, ct_barrier, loc);
1783   }
1784 
1785 #if OMPT_SUPPORT
1786   ompt_frame_t *ompt_frame;
1787   if (ompt_enabled.enabled) {
1788     __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
1789     if (ompt_frame->enter_frame.ptr == NULL)
1790       ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1791   }
1792   OMPT_STORE_RETURN_ADDRESS(global_tid);
1793 #endif
1794 #if USE_ITT_NOTIFY
1795   __kmp_threads[global_tid]->th.th_ident = loc;
1796 #endif
1797   __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
1798 #if OMPT_SUPPORT && OMPT_OPTIONAL
1799   if (ompt_enabled.enabled) {
1800     ompt_frame->enter_frame = ompt_data_none;
1801   }
1802 #endif
1803 
1804   ret = __kmpc_master(loc, global_tid);
1805 
1806   if (__kmp_env_consistency_check) {
1807     /*  there's no __kmpc_end_master called; so the (stats) */
1808     /*  actions of __kmpc_end_master are done here          */
1809     if (ret) {
1810       /* only one thread should do the pop since only */
1811       /* one did the push (see __kmpc_master())       */
1812       __kmp_pop_sync(global_tid, ct_master, loc);
1813     }
1814   }
1815 
1816   return (ret);
1817 }
1818 
1819 /* The BARRIER for a SINGLE process section is always explicit   */
1820 /*!
1821 @ingroup WORK_SHARING
1822 @param loc  source location information
1823 @param global_tid  global thread number
1824 @return One if this thread should execute the single construct, zero otherwise.
1825 
1826 Test whether to execute a <tt>single</tt> construct.
1827 There are no implicit barriers in the two "single" calls; rather, the compiler
1828 should introduce an explicit barrier if one is required.
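
A sketch of the expected lowering; the trailing explicit barrier is only needed
when the construct carries no nowait clause (`loc` and `gtid` assumed):
@code
if (__kmpc_single(&loc, gtid)) {
  // ... single code ...
  __kmpc_end_single(&loc, gtid);
}
__kmpc_barrier(&loc, gtid);
@endcode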
1829 */
1830 
1831 kmp_int32 __kmpc_single(ident_t *loc, kmp_int32 global_tid) {
1832   __kmp_assert_valid_gtid(global_tid);
1833   kmp_int32 rc = __kmp_enter_single(global_tid, loc, TRUE);
1834 
1835   if (rc) {
1836     // We are going to execute the single statement, so we should count it.
1837     KMP_COUNT_BLOCK(OMP_SINGLE);
1838     KMP_PUSH_PARTITIONED_TIMER(OMP_single);
1839   }
1840 
1841 #if OMPT_SUPPORT && OMPT_OPTIONAL
1842   kmp_info_t *this_thr = __kmp_threads[global_tid];
1843   kmp_team_t *team = this_thr->th.th_team;
1844   int tid = __kmp_tid_from_gtid(global_tid);
1845 
1846   if (ompt_enabled.enabled) {
1847     if (rc) {
1848       if (ompt_enabled.ompt_callback_work) {
1849         ompt_callbacks.ompt_callback(ompt_callback_work)(
1850             ompt_work_single_executor, ompt_scope_begin,
1851             &(team->t.ompt_team_info.parallel_data),
1852             &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
1853             1, OMPT_GET_RETURN_ADDRESS(0));
1854       }
1855     } else {
1856       if (ompt_enabled.ompt_callback_work) {
1857         ompt_callbacks.ompt_callback(ompt_callback_work)(
1858             ompt_work_single_other, ompt_scope_begin,
1859             &(team->t.ompt_team_info.parallel_data),
1860             &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
1861             1, OMPT_GET_RETURN_ADDRESS(0));
1862         ompt_callbacks.ompt_callback(ompt_callback_work)(
1863             ompt_work_single_other, ompt_scope_end,
1864             &(team->t.ompt_team_info.parallel_data),
1865             &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
1866             1, OMPT_GET_RETURN_ADDRESS(0));
1867       }
1868     }
1869   }
1870 #endif
1871 
1872   return rc;
1873 }
1874 
1875 /*!
1876 @ingroup WORK_SHARING
1877 @param loc  source location information
1878 @param global_tid  global thread number
1879 
1880 Mark the end of a <tt>single</tt> construct.  This function should
1881 only be called by the thread that executed the block of code protected
1882 by the `single` construct.
1883 */
1884 void __kmpc_end_single(ident_t *loc, kmp_int32 global_tid) {
1885   __kmp_assert_valid_gtid(global_tid);
1886   __kmp_exit_single(global_tid);
1887   KMP_POP_PARTITIONED_TIMER();
1888 
1889 #if OMPT_SUPPORT && OMPT_OPTIONAL
1890   kmp_info_t *this_thr = __kmp_threads[global_tid];
1891   kmp_team_t *team = this_thr->th.th_team;
1892   int tid = __kmp_tid_from_gtid(global_tid);
1893 
1894   if (ompt_enabled.ompt_callback_work) {
1895     ompt_callbacks.ompt_callback(ompt_callback_work)(
1896         ompt_work_single_executor, ompt_scope_end,
1897         &(team->t.ompt_team_info.parallel_data),
1898         &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data), 1,
1899         OMPT_GET_RETURN_ADDRESS(0));
1900   }
1901 #endif
1902 }
1903 
1904 /*!
1905 @ingroup WORK_SHARING
1906 @param loc Source location
1907 @param global_tid Global thread id
1908 
1909 Mark the end of a statically scheduled loop.
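
Each call pairs with a preceding @ref __kmpc_for_static_init_4 (or one of its
sibling entry points). An illustrative sketch, assuming a non-chunked static
schedule and a 100-iteration loop (`loc` and `gtid` assumed):
@code
kmp_int32 last, lower = 0, upper = 99, stride = 1;
__kmpc_for_static_init_4(&loc, gtid, kmp_sch_static, &last, &lower, &upper,
                         &stride, 1, 1);
for (kmp_int32 i = lower; i <= upper; ++i) {
  // ... loop body for this thread's chunk ...
}
__kmpc_for_static_fini(&loc, gtid);
@endcode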
1910 */
1911 void __kmpc_for_static_fini(ident_t *loc, kmp_int32 global_tid) {
1912   KMP_POP_PARTITIONED_TIMER();
1913   KE_TRACE(10, ("__kmpc_for_static_fini called T#%d\n", global_tid));
1914 
1915 #if OMPT_SUPPORT && OMPT_OPTIONAL
1916   if (ompt_enabled.ompt_callback_work) {
1917     ompt_work_t ompt_work_type = ompt_work_loop;
1918     ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
1919     ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
1920     // Determine workshare type
1921     if (loc != NULL) {
1922       if ((loc->flags & KMP_IDENT_WORK_LOOP) != 0) {
1923         ompt_work_type = ompt_work_loop;
1924       } else if ((loc->flags & KMP_IDENT_WORK_SECTIONS) != 0) {
1925         ompt_work_type = ompt_work_sections;
1926       } else if ((loc->flags & KMP_IDENT_WORK_DISTRIBUTE) != 0) {
1927         ompt_work_type = ompt_work_distribute;
1928       } else {
1929         // use default set above.
1930         // a warning about this case is provided in __kmpc_for_static_init
1931       }
1932       KMP_DEBUG_ASSERT(ompt_work_type);
1933     }
1934     ompt_callbacks.ompt_callback(ompt_callback_work)(
1935         ompt_work_type, ompt_scope_end, &(team_info->parallel_data),
1936         &(task_info->task_data), 0, OMPT_GET_RETURN_ADDRESS(0));
1937   }
1938 #endif
1939   if (__kmp_env_consistency_check)
1940     __kmp_pop_workshare(global_tid, ct_pdo, loc);
1941 }
1942 
1943 // User routines which take C-style arguments (call by value), unlike the
1944 // Fortran equivalent routines
1945 
1946 void ompc_set_num_threads(int arg) {
1947   // !!!!! TODO: check the per-task binding
1948   __kmp_set_num_threads(arg, __kmp_entry_gtid());
1949 }
1950 
1951 void ompc_set_dynamic(int flag) {
1952   kmp_info_t *thread;
1953 
1954   /* For the thread-private implementation of the internal controls */
1955   thread = __kmp_entry_thread();
1956 
1957   __kmp_save_internal_controls(thread);
1958 
1959   set__dynamic(thread, flag ? true : false);
1960 }
1961 
1962 void ompc_set_nested(int flag) {
1963   kmp_info_t *thread;
1964 
1965   /* For the thread-private internal controls implementation */
1966   thread = __kmp_entry_thread();
1967 
1968   __kmp_save_internal_controls(thread);
1969 
1970   set__max_active_levels(thread, flag ? __kmp_dflt_max_active_levels : 1);
1971 }
1972 
1973 void ompc_set_max_active_levels(int max_active_levels) {
1974   /* TO DO */
1975   /* we want per-task implementation of this internal control */
1976 
1977   /* For the per-thread internal controls implementation */
1978   __kmp_set_max_active_levels(__kmp_entry_gtid(), max_active_levels);
1979 }
1980 
1981 void ompc_set_schedule(omp_sched_t kind, int modifier) {
1982   // !!!!! TODO: check the per-task binding
1983   __kmp_set_schedule(__kmp_entry_gtid(), (kmp_sched_t)kind, modifier);
1984 }
1985 
1986 int ompc_get_ancestor_thread_num(int level) {
1987   return __kmp_get_ancestor_thread_num(__kmp_entry_gtid(), level);
1988 }
1989 
1990 int ompc_get_team_size(int level) {
1991   return __kmp_get_team_size(__kmp_entry_gtid(), level);
1992 }
1993 
1994 /* OpenMP 5.0 Affinity Format API */
1995 void KMP_EXPAND_NAME(ompc_set_affinity_format)(char const *format) {
1996   if (!__kmp_init_serial) {
1997     __kmp_serial_initialize();
1998   }
1999   __kmp_strncpy_truncate(__kmp_affinity_format, KMP_AFFINITY_FORMAT_SIZE,
2000                          format, KMP_STRLEN(format) + 1);
2001 }
2002 
2003 size_t KMP_EXPAND_NAME(ompc_get_affinity_format)(char *buffer, size_t size) {
2004   size_t format_size;
2005   if (!__kmp_init_serial) {
2006     __kmp_serial_initialize();
2007   }
2008   format_size = KMP_STRLEN(__kmp_affinity_format);
2009   if (buffer && size) {
2010     __kmp_strncpy_truncate(buffer, size, __kmp_affinity_format,
2011                            format_size + 1);
2012   }
2013   return format_size;
2014 }
2015 
2016 void KMP_EXPAND_NAME(ompc_display_affinity)(char const *format) {
2017   int gtid;
2018   if (!TCR_4(__kmp_init_middle)) {
2019     __kmp_middle_initialize();
2020   }
2021   __kmp_assign_root_init_mask();
2022   gtid = __kmp_get_gtid();
2023   __kmp_aux_display_affinity(gtid, format);
2024 }
2025 
2026 size_t KMP_EXPAND_NAME(ompc_capture_affinity)(char *buffer, size_t buf_size,
2027                                               char const *format) {
2028   int gtid;
2029   size_t num_required;
2030   kmp_str_buf_t capture_buf;
2031   if (!TCR_4(__kmp_init_middle)) {
2032     __kmp_middle_initialize();
2033   }
2034   __kmp_assign_root_init_mask();
2035   gtid = __kmp_get_gtid();
2036   __kmp_str_buf_init(&capture_buf);
2037   num_required = __kmp_aux_capture_affinity(gtid, format, &capture_buf);
2038   if (buffer && buf_size) {
2039     __kmp_strncpy_truncate(buffer, buf_size, capture_buf.str,
2040                            capture_buf.used + 1);
2041   }
2042   __kmp_str_buf_free(&capture_buf);
2043   return num_required;
2044 }
2045 
2046 void kmpc_set_stacksize(int arg) {
2047   // __kmp_aux_set_stacksize initializes the library if needed
2048   __kmp_aux_set_stacksize(arg);
2049 }
2050 
2051 void kmpc_set_stacksize_s(size_t arg) {
2052   // __kmp_aux_set_stacksize initializes the library if needed
2053   __kmp_aux_set_stacksize(arg);
2054 }
2055 
2056 void kmpc_set_blocktime(int arg) {
2057   int gtid, tid;
2058   kmp_info_t *thread;
2059 
2060   gtid = __kmp_entry_gtid();
2061   tid = __kmp_tid_from_gtid(gtid);
2062   thread = __kmp_thread_from_gtid(gtid);
2063 
2064   __kmp_aux_set_blocktime(arg, thread, tid);
2065 }
2066 
2067 void kmpc_set_library(int arg) {
2068   // __kmp_user_set_library initializes the library if needed
2069   __kmp_user_set_library((enum library_type)arg);
2070 }
2071 
2072 void kmpc_set_defaults(char const *str) {
2073   // __kmp_aux_set_defaults initializes the library if needed
2074   __kmp_aux_set_defaults(str, KMP_STRLEN(str));
2075 }
2076 
2077 void kmpc_set_disp_num_buffers(int arg) {
2078   // ignore after initialization because some teams have already
2079   // allocated dispatch buffers
2080   if (__kmp_init_serial == FALSE && arg >= KMP_MIN_DISP_NUM_BUFF &&
2081       arg <= KMP_MAX_DISP_NUM_BUFF) {
2082     __kmp_dispatch_num_buffers = arg;
2083   }
2084 }
2085 
2086 int kmpc_set_affinity_mask_proc(int proc, void **mask) {
2087 #if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED
2088   return -1;
2089 #else
2090   if (!TCR_4(__kmp_init_middle)) {
2091     __kmp_middle_initialize();
2092   }
2093   __kmp_assign_root_init_mask();
2094   return __kmp_aux_set_affinity_mask_proc(proc, mask);
2095 #endif
2096 }
2097 
2098 int kmpc_unset_affinity_mask_proc(int proc, void **mask) {
2099 #if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED
2100   return -1;
2101 #else
2102   if (!TCR_4(__kmp_init_middle)) {
2103     __kmp_middle_initialize();
2104   }
2105   __kmp_assign_root_init_mask();
2106   return __kmp_aux_unset_affinity_mask_proc(proc, mask);
2107 #endif
2108 }
2109 
2110 int kmpc_get_affinity_mask_proc(int proc, void **mask) {
2111 #if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED
2112   return -1;
2113 #else
2114   if (!TCR_4(__kmp_init_middle)) {
2115     __kmp_middle_initialize();
2116   }
2117   __kmp_assign_root_init_mask();
2118   return __kmp_aux_get_affinity_mask_proc(proc, mask);
2119 #endif
2120 }
2121 
2122 /* -------------------------------------------------------------------------- */
2123 /*!
2124 @ingroup THREADPRIVATE
2125 @param loc       source location information
2126 @param gtid      global thread number
2127 @param cpy_size  size of the cpy_data buffer
2128 @param cpy_data  pointer to data to be copied
2129 @param cpy_func  helper function to call for copying data
2130 @param didit     flag variable: 1=single thread; 0=not single thread
2131 
2132 __kmpc_copyprivate implements the interface for the private data broadcast
2133 needed for the copyprivate clause associated with a single region in an
2134 OpenMP<sup>*</sup> program (both C and Fortran).
2135 All threads participating in the parallel region call this routine.
2136 One of the threads (called the single thread) should have the <tt>didit</tt>
2137 variable set to 1 and all other threads should have that variable set to 0.
2138 All threads pass a pointer to a data buffer (cpy_data) that they have built.
2139 
2140 The OpenMP specification forbids the use of nowait on the single region when a
2141 copyprivate clause is present. However, @ref __kmpc_copyprivate implements a
2142 barrier internally to avoid race conditions, so the code generation for the
2143 single region should avoid generating a barrier after the call to @ref
2144 __kmpc_copyprivate.
2145 
2146 The <tt>gtid</tt> parameter is the global thread id for the current thread.
2147 The <tt>loc</tt> parameter is a pointer to source location information.
2148 
2149 Internal implementation: The single thread will first copy its descriptor
2150 address (cpy_data) to a team-private location; then the other threads will each
2151 call the function pointed to by the parameter cpy_func, which copies the data
2152 from the single thread's buffer into the calling thread's own cpy_data buffer.
2153 
2154 The cpy_func routine used for the copy and the contents of the data area defined
2155 by cpy_data and cpy_size may be built in any fashion that will allow the copy
2156 to be done. For instance, the cpy_data buffer can hold the actual data to be
2157 copied or it may hold a list of pointers to the data. The cpy_func routine must
2158 interpret the cpy_data buffer appropriately.
2159 
2160 The interface to cpy_func is as follows:
2161 @code
2162 void cpy_func( void *destination, void *source )
2163 @endcode
2164 where void *destination is the cpy_data pointer for the thread being copied to
2165 and void *source is the cpy_data pointer for the thread being copied from.
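
An illustrative lowering of a <tt>single copyprivate(x)</tt> region where x is a
double; the helper cpy_x and the producer compute() are hypothetical
compiler-generated or user-side names (`loc` and `gtid` assumed):
@code
static void cpy_x(void *dst, void *src) { *(double *)dst = *(double *)src; }

double x;                                    // private copy in every thread
kmp_int32 didit = __kmpc_single(&loc, gtid);
if (didit) {
  x = compute();                             // executed by the single thread only
  __kmpc_end_single(&loc, gtid);
}
__kmpc_copyprivate(&loc, gtid, sizeof(double), &x, cpy_x, didit);
@endcode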
2166 */
2167 void __kmpc_copyprivate(ident_t *loc, kmp_int32 gtid, size_t cpy_size,
2168                         void *cpy_data, void (*cpy_func)(void *, void *),
2169                         kmp_int32 didit) {
2170   void **data_ptr;
2171   KC_TRACE(10, ("__kmpc_copyprivate: called T#%d\n", gtid));
2172   __kmp_assert_valid_gtid(gtid);
2173 
2174   KMP_MB();
2175 
2176   data_ptr = &__kmp_team_from_gtid(gtid)->t.t_copypriv_data;
2177 
2178   if (__kmp_env_consistency_check) {
2179     if (loc == 0) {
2180       KMP_WARNING(ConstructIdentInvalid);
2181     }
2182   }
2183 
2184   // ToDo: Optimize the following two barriers into some kind of split barrier
2185 
2186   if (didit)
2187     *data_ptr = cpy_data;
2188 
2189 #if OMPT_SUPPORT
2190   ompt_frame_t *ompt_frame;
2191   if (ompt_enabled.enabled) {
2192     __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
2193     if (ompt_frame->enter_frame.ptr == NULL)
2194       ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
2195   }
2196   OMPT_STORE_RETURN_ADDRESS(gtid);
2197 #endif
2198 /* This barrier is not a barrier region boundary */
2199 #if USE_ITT_NOTIFY
2200   __kmp_threads[gtid]->th.th_ident = loc;
2201 #endif
2202   __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);
2203 
2204   if (!didit)
2205     (*cpy_func)(cpy_data, *data_ptr);
2206 
2207   // Consider next barrier a user-visible barrier for barrier region boundaries
2208   // Nesting checks are already handled by the single construct checks
2209   {
2210 #if OMPT_SUPPORT
2211     OMPT_STORE_RETURN_ADDRESS(gtid);
2212 #endif
2213 #if USE_ITT_NOTIFY
2214     __kmp_threads[gtid]->th.th_ident = loc; // TODO: check if it is needed
2215     // (e.g. tasks can overwrite the location)
2216 #endif
2217     __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);
2218 #if OMPT_SUPPORT && OMPT_OPTIONAL
2219     if (ompt_enabled.enabled) {
2220       ompt_frame->enter_frame = ompt_data_none;
2221     }
2222 #endif
2223   }
2224 }
2225 
2226 /* -------------------------------------------------------------------------- */
2227 
2228 #define INIT_LOCK __kmp_init_user_lock_with_checks
2229 #define INIT_NESTED_LOCK __kmp_init_nested_user_lock_with_checks
2230 #define ACQUIRE_LOCK __kmp_acquire_user_lock_with_checks
2231 #define ACQUIRE_LOCK_TIMED __kmp_acquire_user_lock_with_checks_timed
2232 #define ACQUIRE_NESTED_LOCK __kmp_acquire_nested_user_lock_with_checks
2233 #define ACQUIRE_NESTED_LOCK_TIMED                                              \
2234   __kmp_acquire_nested_user_lock_with_checks_timed
2235 #define RELEASE_LOCK __kmp_release_user_lock_with_checks
2236 #define RELEASE_NESTED_LOCK __kmp_release_nested_user_lock_with_checks
2237 #define TEST_LOCK __kmp_test_user_lock_with_checks
2238 #define TEST_NESTED_LOCK __kmp_test_nested_user_lock_with_checks
2239 #define DESTROY_LOCK __kmp_destroy_user_lock_with_checks
2240 #define DESTROY_NESTED_LOCK __kmp_destroy_nested_user_lock_with_checks
2241 
2242 // TODO: Make check abort messages use location info & pass it into
2243 // with_checks routines
2244 
2245 #if KMP_USE_DYNAMIC_LOCK
2246 
2247 // internal lock initializer
2248 static __forceinline void __kmp_init_lock_with_hint(ident_t *loc, void **lock,
2249                                                     kmp_dyna_lockseq_t seq) {
2250   if (KMP_IS_D_LOCK(seq)) {
2251     KMP_INIT_D_LOCK(lock, seq);
2252 #if USE_ITT_BUILD
2253     __kmp_itt_lock_creating((kmp_user_lock_p)lock, NULL);
2254 #endif
2255   } else {
2256     KMP_INIT_I_LOCK(lock, seq);
2257 #if USE_ITT_BUILD
2258     kmp_indirect_lock_t *ilk = KMP_LOOKUP_I_LOCK(lock);
2259     __kmp_itt_lock_creating(ilk->lock, loc);
2260 #endif
2261   }
2262 }
2263 
2264 // internal nest lock initializer
2265 static __forceinline void
2266 __kmp_init_nest_lock_with_hint(ident_t *loc, void **lock,
2267                                kmp_dyna_lockseq_t seq) {
2268 #if KMP_USE_TSX
2269   // Don't have nested lock implementation for speculative locks
2270   if (seq == lockseq_hle || seq == lockseq_rtm_queuing ||
2271       seq == lockseq_rtm_spin || seq == lockseq_adaptive)
2272     seq = __kmp_user_lock_seq;
2273 #endif
2274   switch (seq) {
2275   case lockseq_tas:
2276     seq = lockseq_nested_tas;
2277     break;
2278 #if KMP_USE_FUTEX
2279   case lockseq_futex:
2280     seq = lockseq_nested_futex;
2281     break;
2282 #endif
2283   case lockseq_ticket:
2284     seq = lockseq_nested_ticket;
2285     break;
2286   case lockseq_queuing:
2287     seq = lockseq_nested_queuing;
2288     break;
2289   case lockseq_drdpa:
2290     seq = lockseq_nested_drdpa;
2291     break;
2292   default:
2293     seq = lockseq_nested_queuing;
2294   }
2295   KMP_INIT_I_LOCK(lock, seq);
2296 #if USE_ITT_BUILD
2297   kmp_indirect_lock_t *ilk = KMP_LOOKUP_I_LOCK(lock);
2298   __kmp_itt_lock_creating(ilk->lock, loc);
2299 #endif
2300 }
2301 
2302 /* initialize the lock with a hint */
2303 void __kmpc_init_lock_with_hint(ident_t *loc, kmp_int32 gtid, void **user_lock,
2304                                 uintptr_t hint) {
2305   KMP_DEBUG_ASSERT(__kmp_init_serial);
2306   if (__kmp_env_consistency_check && user_lock == NULL) {
2307     KMP_FATAL(LockIsUninitialized, "omp_init_lock_with_hint");
2308   }
2309 
2310   __kmp_init_lock_with_hint(loc, user_lock, __kmp_map_hint_to_lock(hint));
2311 
2312 #if OMPT_SUPPORT && OMPT_OPTIONAL
2313   // This is the case, if called from omp_init_lock_with_hint:
2314   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2315   if (!codeptr)
2316     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2317   if (ompt_enabled.ompt_callback_lock_init) {
2318     ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2319         ompt_mutex_lock, (omp_lock_hint_t)hint,
2320         __ompt_get_mutex_impl_type(user_lock),
2321         (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2322   }
2323 #endif
2324 }
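
// Illustrative user-level lifecycle that is expected to funnel into these entry
// points (a sketch; exactly how the omp_* wrappers forward to the __kmpc_*
// functions is an implementation detail assumed here, not defined in this file):
//   omp_lock_t l;
//   omp_init_lock_with_hint(&l, omp_lock_hint_speculative); // __kmpc_init_lock_with_hint
//   omp_set_lock(&l);                                       // __kmpc_set_lock
//   omp_unset_lock(&l);                                     // __kmpc_unset_lock
//   omp_destroy_lock(&l);                                   // __kmpc_destroy_lock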
2325 
2326 /* initialize the lock with a hint */
2327 void __kmpc_init_nest_lock_with_hint(ident_t *loc, kmp_int32 gtid,
2328                                      void **user_lock, uintptr_t hint) {
2329   KMP_DEBUG_ASSERT(__kmp_init_serial);
2330   if (__kmp_env_consistency_check && user_lock == NULL) {
2331     KMP_FATAL(LockIsUninitialized, "omp_init_nest_lock_with_hint");
2332   }
2333 
2334   __kmp_init_nest_lock_with_hint(loc, user_lock, __kmp_map_hint_to_lock(hint));
2335 
2336 #if OMPT_SUPPORT && OMPT_OPTIONAL
2337   // This is the case, if called from omp_init_lock_with_hint:
2338   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2339   if (!codeptr)
2340     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2341   if (ompt_enabled.ompt_callback_lock_init) {
2342     ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2343         ompt_mutex_nest_lock, (omp_lock_hint_t)hint,
2344         __ompt_get_mutex_impl_type(user_lock),
2345         (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2346   }
2347 #endif
2348 }
2349 
2350 #endif // KMP_USE_DYNAMIC_LOCK
2351 
2352 /* initialize the lock */
2353 void __kmpc_init_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2354 #if KMP_USE_DYNAMIC_LOCK
2355 
2356   KMP_DEBUG_ASSERT(__kmp_init_serial);
2357   if (__kmp_env_consistency_check && user_lock == NULL) {
2358     KMP_FATAL(LockIsUninitialized, "omp_init_lock");
2359   }
2360   __kmp_init_lock_with_hint(loc, user_lock, __kmp_user_lock_seq);
2361 
2362 #if OMPT_SUPPORT && OMPT_OPTIONAL
2363   // This is the case, if called from omp_init_lock_with_hint:
2364   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2365   if (!codeptr)
2366     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2367   if (ompt_enabled.ompt_callback_lock_init) {
2368     ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2369         ompt_mutex_lock, omp_lock_hint_none,
2370         __ompt_get_mutex_impl_type(user_lock),
2371         (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2372   }
2373 #endif
2374 
2375 #else // KMP_USE_DYNAMIC_LOCK
2376 
2377   static char const *const func = "omp_init_lock";
2378   kmp_user_lock_p lck;
2379   KMP_DEBUG_ASSERT(__kmp_init_serial);
2380 
2381   if (__kmp_env_consistency_check) {
2382     if (user_lock == NULL) {
2383       KMP_FATAL(LockIsUninitialized, func);
2384     }
2385   }
2386 
2387   KMP_CHECK_USER_LOCK_INIT();
2388 
2389   if ((__kmp_user_lock_kind == lk_tas) &&
2390       (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
2391     lck = (kmp_user_lock_p)user_lock;
2392   }
2393 #if KMP_USE_FUTEX
2394   else if ((__kmp_user_lock_kind == lk_futex) &&
2395            (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
2396     lck = (kmp_user_lock_p)user_lock;
2397   }
2398 #endif
2399   else {
2400     lck = __kmp_user_lock_allocate(user_lock, gtid, 0);
2401   }
2402   INIT_LOCK(lck);
2403   __kmp_set_user_lock_location(lck, loc);
2404 
2405 #if OMPT_SUPPORT && OMPT_OPTIONAL
2406   // This is the case, if called from omp_init_lock_with_hint:
2407   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2408   if (!codeptr)
2409     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2410   if (ompt_enabled.ompt_callback_lock_init) {
2411     ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2412         ompt_mutex_lock, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
2413         (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2414   }
2415 #endif
2416 
2417 #if USE_ITT_BUILD
2418   __kmp_itt_lock_creating(lck);
2419 #endif /* USE_ITT_BUILD */
2420 
2421 #endif // KMP_USE_DYNAMIC_LOCK
2422 } // __kmpc_init_lock
2423 
2424 /* initialize the lock */
2425 void __kmpc_init_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2426 #if KMP_USE_DYNAMIC_LOCK
2427 
2428   KMP_DEBUG_ASSERT(__kmp_init_serial);
2429   if (__kmp_env_consistency_check && user_lock == NULL) {
2430     KMP_FATAL(LockIsUninitialized, "omp_init_nest_lock");
2431   }
2432   __kmp_init_nest_lock_with_hint(loc, user_lock, __kmp_user_lock_seq);
2433 
2434 #if OMPT_SUPPORT && OMPT_OPTIONAL
2435   // This is the case, if called from omp_init_lock_with_hint:
2436   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2437   if (!codeptr)
2438     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2439   if (ompt_enabled.ompt_callback_lock_init) {
2440     ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2441         ompt_mutex_nest_lock, omp_lock_hint_none,
2442         __ompt_get_mutex_impl_type(user_lock),
2443         (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2444   }
2445 #endif
2446 
2447 #else // KMP_USE_DYNAMIC_LOCK
2448 
2449   static char const *const func = "omp_init_nest_lock";
2450   kmp_user_lock_p lck;
2451   KMP_DEBUG_ASSERT(__kmp_init_serial);
2452 
2453   if (__kmp_env_consistency_check) {
2454     if (user_lock == NULL) {
2455       KMP_FATAL(LockIsUninitialized, func);
2456     }
2457   }
2458 
2459   KMP_CHECK_USER_LOCK_INIT();
2460 
2461   if ((__kmp_user_lock_kind == lk_tas) &&
2462       (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
2463        OMP_NEST_LOCK_T_SIZE)) {
2464     lck = (kmp_user_lock_p)user_lock;
2465   }
2466 #if KMP_USE_FUTEX
2467   else if ((__kmp_user_lock_kind == lk_futex) &&
2468            (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
2469             OMP_NEST_LOCK_T_SIZE)) {
2470     lck = (kmp_user_lock_p)user_lock;
2471   }
2472 #endif
2473   else {
2474     lck = __kmp_user_lock_allocate(user_lock, gtid, 0);
2475   }
2476 
2477   INIT_NESTED_LOCK(lck);
2478   __kmp_set_user_lock_location(lck, loc);
2479 
2480 #if OMPT_SUPPORT && OMPT_OPTIONAL
2481   // This is the case, if called from omp_init_lock_with_hint:
2482   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2483   if (!codeptr)
2484     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2485   if (ompt_enabled.ompt_callback_lock_init) {
2486     ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2487         ompt_mutex_nest_lock, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
2488         (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2489   }
2490 #endif
2491 
2492 #if USE_ITT_BUILD
2493   __kmp_itt_lock_creating(lck);
2494 #endif /* USE_ITT_BUILD */
2495 
2496 #endif // KMP_USE_DYNAMIC_LOCK
2497 } // __kmpc_init_nest_lock
2498 
2499 void __kmpc_destroy_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2500 #if KMP_USE_DYNAMIC_LOCK
2501 
2502 #if USE_ITT_BUILD
2503   kmp_user_lock_p lck;
2504   if (KMP_EXTRACT_D_TAG(user_lock) == 0) {
2505     lck = ((kmp_indirect_lock_t *)KMP_LOOKUP_I_LOCK(user_lock))->lock;
2506   } else {
2507     lck = (kmp_user_lock_p)user_lock;
2508   }
2509   __kmp_itt_lock_destroyed(lck);
2510 #endif
2511 #if OMPT_SUPPORT && OMPT_OPTIONAL
2512   // This is the case, if called from omp_init_lock_with_hint:
2513   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2514   if (!codeptr)
2515     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2516   if (ompt_enabled.ompt_callback_lock_destroy) {
2517     ompt_callbacks.ompt_callback(ompt_callback_lock_destroy)(
2518         ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2519   }
2520 #endif
2521   KMP_D_LOCK_FUNC(user_lock, destroy)((kmp_dyna_lock_t *)user_lock);
2522 #else
2523   kmp_user_lock_p lck;
2524 
2525   if ((__kmp_user_lock_kind == lk_tas) &&
2526       (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
2527     lck = (kmp_user_lock_p)user_lock;
2528   }
2529 #if KMP_USE_FUTEX
2530   else if ((__kmp_user_lock_kind == lk_futex) &&
2531            (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
2532     lck = (kmp_user_lock_p)user_lock;
2533   }
2534 #endif
2535   else {
2536     lck = __kmp_lookup_user_lock(user_lock, "omp_destroy_lock");
2537   }
2538 
2539 #if OMPT_SUPPORT && OMPT_OPTIONAL
2540   // This is the case, if called from omp_init_lock_with_hint:
2541   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2542   if (!codeptr)
2543     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2544   if (ompt_enabled.ompt_callback_lock_destroy) {
2545     ompt_callbacks.ompt_callback(ompt_callback_lock_destroy)(
2546         ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2547   }
2548 #endif
2549 
2550 #if USE_ITT_BUILD
2551   __kmp_itt_lock_destroyed(lck);
2552 #endif /* USE_ITT_BUILD */
2553   DESTROY_LOCK(lck);
2554 
2555   if ((__kmp_user_lock_kind == lk_tas) &&
2556       (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
2557     ;
2558   }
2559 #if KMP_USE_FUTEX
2560   else if ((__kmp_user_lock_kind == lk_futex) &&
2561            (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
2562     ;
2563   }
2564 #endif
2565   else {
2566     __kmp_user_lock_free(user_lock, gtid, lck);
2567   }
2568 #endif // KMP_USE_DYNAMIC_LOCK
2569 } // __kmpc_destroy_lock
2570 
2571 /* destroy the lock */
2572 void __kmpc_destroy_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2573 #if KMP_USE_DYNAMIC_LOCK
2574 
2575 #if USE_ITT_BUILD
2576   kmp_indirect_lock_t *ilk = KMP_LOOKUP_I_LOCK(user_lock);
2577   __kmp_itt_lock_destroyed(ilk->lock);
2578 #endif
2579 #if OMPT_SUPPORT && OMPT_OPTIONAL
2580   // This is the case, if called from omp_init_lock_with_hint:
2581   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2582   if (!codeptr)
2583     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2584   if (ompt_enabled.ompt_callback_lock_destroy) {
2585     ompt_callbacks.ompt_callback(ompt_callback_lock_destroy)(
2586         ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2587   }
2588 #endif
2589   KMP_D_LOCK_FUNC(user_lock, destroy)((kmp_dyna_lock_t *)user_lock);
2590 
2591 #else // KMP_USE_DYNAMIC_LOCK
2592 
2593   kmp_user_lock_p lck;
2594 
2595   if ((__kmp_user_lock_kind == lk_tas) &&
2596       (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
2597        OMP_NEST_LOCK_T_SIZE)) {
2598     lck = (kmp_user_lock_p)user_lock;
2599   }
2600 #if KMP_USE_FUTEX
2601   else if ((__kmp_user_lock_kind == lk_futex) &&
2602            (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
2603             OMP_NEST_LOCK_T_SIZE)) {
2604     lck = (kmp_user_lock_p)user_lock;
2605   }
2606 #endif
2607   else {
2608     lck = __kmp_lookup_user_lock(user_lock, "omp_destroy_nest_lock");
2609   }
2610 
2611 #if OMPT_SUPPORT && OMPT_OPTIONAL
2612   // This is the case, if called from omp_init_lock_with_hint:
2613   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2614   if (!codeptr)
2615     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2616   if (ompt_enabled.ompt_callback_lock_destroy) {
2617     ompt_callbacks.ompt_callback(ompt_callback_lock_destroy)(
2618         ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2619   }
2620 #endif
2621 
2622 #if USE_ITT_BUILD
2623   __kmp_itt_lock_destroyed(lck);
2624 #endif /* USE_ITT_BUILD */
2625 
2626   DESTROY_NESTED_LOCK(lck);
2627 
2628   if ((__kmp_user_lock_kind == lk_tas) &&
2629       (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
2630        OMP_NEST_LOCK_T_SIZE)) {
2631     ;
2632   }
2633 #if KMP_USE_FUTEX
2634   else if ((__kmp_user_lock_kind == lk_futex) &&
2635            (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
2636             OMP_NEST_LOCK_T_SIZE)) {
2637     ;
2638   }
2639 #endif
2640   else {
2641     __kmp_user_lock_free(user_lock, gtid, lck);
2642   }
2643 #endif // KMP_USE_DYNAMIC_LOCK
2644 } // __kmpc_destroy_nest_lock
2645 
2646 void __kmpc_set_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2647   KMP_COUNT_BLOCK(OMP_set_lock);
2648 #if KMP_USE_DYNAMIC_LOCK
2649   int tag = KMP_EXTRACT_D_TAG(user_lock);
2650 #if USE_ITT_BUILD
2651   __kmp_itt_lock_acquiring(
2652       (kmp_user_lock_p)
2653           user_lock); // itt function will get to the right lock object.
2654 #endif
2655 #if OMPT_SUPPORT && OMPT_OPTIONAL
2656   // This is the case, if called from omp_init_lock_with_hint:
2657   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2658   if (!codeptr)
2659     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2660   if (ompt_enabled.ompt_callback_mutex_acquire) {
2661     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
2662         ompt_mutex_lock, omp_lock_hint_none,
2663         __ompt_get_mutex_impl_type(user_lock),
2664         (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2665   }
2666 #endif
2667 #if KMP_USE_INLINED_TAS
2668   if (tag == locktag_tas && !__kmp_env_consistency_check) {
2669     KMP_ACQUIRE_TAS_LOCK(user_lock, gtid);
2670   } else
2671 #elif KMP_USE_INLINED_FUTEX
2672   if (tag == locktag_futex && !__kmp_env_consistency_check) {
2673     KMP_ACQUIRE_FUTEX_LOCK(user_lock, gtid);
2674   } else
2675 #endif
2676   {
2677     __kmp_direct_set[tag]((kmp_dyna_lock_t *)user_lock, gtid);
2678   }
2679 #if USE_ITT_BUILD
2680   __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock);
2681 #endif
2682 #if OMPT_SUPPORT && OMPT_OPTIONAL
2683   if (ompt_enabled.ompt_callback_mutex_acquired) {
2684     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
2685         ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2686   }
2687 #endif
2688 
2689 #else // KMP_USE_DYNAMIC_LOCK
2690 
2691   kmp_user_lock_p lck;
2692 
2693   if ((__kmp_user_lock_kind == lk_tas) &&
2694       (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
2695     lck = (kmp_user_lock_p)user_lock;
2696   }
2697 #if KMP_USE_FUTEX
2698   else if ((__kmp_user_lock_kind == lk_futex) &&
2699            (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
2700     lck = (kmp_user_lock_p)user_lock;
2701   }
2702 #endif
2703   else {
2704     lck = __kmp_lookup_user_lock(user_lock, "omp_set_lock");
2705   }
2706 
2707 #if USE_ITT_BUILD
2708   __kmp_itt_lock_acquiring(lck);
2709 #endif /* USE_ITT_BUILD */
2710 #if OMPT_SUPPORT && OMPT_OPTIONAL
2711   // This is the case, if called from omp_init_lock_with_hint:
2712   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2713   if (!codeptr)
2714     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2715   if (ompt_enabled.ompt_callback_mutex_acquire) {
2716     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
2717         ompt_mutex_lock, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
2718         (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2719   }
2720 #endif
2721 
2722   ACQUIRE_LOCK(lck, gtid);
2723 
2724 #if USE_ITT_BUILD
2725   __kmp_itt_lock_acquired(lck);
2726 #endif /* USE_ITT_BUILD */
2727 
2728 #if OMPT_SUPPORT && OMPT_OPTIONAL
2729   if (ompt_enabled.ompt_callback_mutex_acquired) {
2730     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
2731         ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2732   }
2733 #endif
2734 
2735 #endif // KMP_USE_DYNAMIC_LOCK
2736 }
2737 
2738 void __kmpc_set_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2739 #if KMP_USE_DYNAMIC_LOCK
2740 
2741 #if USE_ITT_BUILD
2742   __kmp_itt_lock_acquiring((kmp_user_lock_p)user_lock);
2743 #endif
2744 #if OMPT_SUPPORT && OMPT_OPTIONAL
2745   // This is the case, if called from omp_init_lock_with_hint:
2746   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2747   if (!codeptr)
2748     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2749   if (ompt_enabled.enabled) {
2750     if (ompt_enabled.ompt_callback_mutex_acquire) {
2751       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
2752           ompt_mutex_nest_lock, omp_lock_hint_none,
2753           __ompt_get_mutex_impl_type(user_lock),
2754           (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2755     }
2756   }
2757 #endif
2758   int acquire_status =
2759       KMP_D_LOCK_FUNC(user_lock, set)((kmp_dyna_lock_t *)user_lock, gtid);
2760   (void)acquire_status;
2761 #if USE_ITT_BUILD
2762   __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock);
2763 #endif
2764 
2765 #if OMPT_SUPPORT && OMPT_OPTIONAL
2766   if (ompt_enabled.enabled) {
2767     if (acquire_status == KMP_LOCK_ACQUIRED_FIRST) {
2768       if (ompt_enabled.ompt_callback_mutex_acquired) {
2769         // lock_first
2770         ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
2771             ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)user_lock,
2772             codeptr);
2773       }
2774     } else {
2775       if (ompt_enabled.ompt_callback_nest_lock) {
2776         // lock_next
2777         ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
2778             ompt_scope_begin, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2779       }
2780     }
2781   }
2782 #endif
2783 
2784 #else // KMP_USE_DYNAMIC_LOCK
2785   int acquire_status;
2786   kmp_user_lock_p lck;
2787 
2788   if ((__kmp_user_lock_kind == lk_tas) &&
2789       (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
2790        OMP_NEST_LOCK_T_SIZE)) {
2791     lck = (kmp_user_lock_p)user_lock;
2792   }
2793 #if KMP_USE_FUTEX
2794   else if ((__kmp_user_lock_kind == lk_futex) &&
2795            (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
2796             OMP_NEST_LOCK_T_SIZE)) {
2797     lck = (kmp_user_lock_p)user_lock;
2798   }
2799 #endif
2800   else {
2801     lck = __kmp_lookup_user_lock(user_lock, "omp_set_nest_lock");
2802   }
2803 
2804 #if USE_ITT_BUILD
2805   __kmp_itt_lock_acquiring(lck);
2806 #endif /* USE_ITT_BUILD */
2807 #if OMPT_SUPPORT && OMPT_OPTIONAL
2808   // This is the case, if called from omp_init_lock_with_hint:
2809   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2810   if (!codeptr)
2811     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2812   if (ompt_enabled.enabled) {
2813     if (ompt_enabled.ompt_callback_mutex_acquire) {
2814       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
2815           ompt_mutex_nest_lock, omp_lock_hint_none,
2816           __ompt_get_mutex_impl_type(), (ompt_wait_id_t)(uintptr_t)lck,
2817           codeptr);
2818     }
2819   }
2820 #endif
2821 
2822   ACQUIRE_NESTED_LOCK(lck, gtid, &acquire_status);
2823 
2824 #if USE_ITT_BUILD
2825   __kmp_itt_lock_acquired(lck);
2826 #endif /* USE_ITT_BUILD */
2827 
2828 #if OMPT_SUPPORT && OMPT_OPTIONAL
2829   if (ompt_enabled.enabled) {
2830     if (acquire_status == KMP_LOCK_ACQUIRED_FIRST) {
2831       if (ompt_enabled.ompt_callback_mutex_acquired) {
2832         // lock_first
2833         ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
2834             ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2835       }
2836     } else {
2837       if (ompt_enabled.ompt_callback_nest_lock) {
2838         // lock_next
2839         ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
2840             ompt_scope_begin, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2841       }
2842     }
2843   }
2844 #endif
2845 
2846 #endif // KMP_USE_DYNAMIC_LOCK
2847 }
2848 
2849 void __kmpc_unset_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2850 #if KMP_USE_DYNAMIC_LOCK
2851 
2852   int tag = KMP_EXTRACT_D_TAG(user_lock);
2853 #if USE_ITT_BUILD
2854   __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock);
2855 #endif
2856 #if KMP_USE_INLINED_TAS
2857   if (tag == locktag_tas && !__kmp_env_consistency_check) {
2858     KMP_RELEASE_TAS_LOCK(user_lock, gtid);
2859   } else
2860 #elif KMP_USE_INLINED_FUTEX
2861   if (tag == locktag_futex && !__kmp_env_consistency_check) {
2862     KMP_RELEASE_FUTEX_LOCK(user_lock, gtid);
2863   } else
2864 #endif
2865   {
2866     __kmp_direct_unset[tag]((kmp_dyna_lock_t *)user_lock, gtid);
2867   }
2868 
2869 #if OMPT_SUPPORT && OMPT_OPTIONAL
2870   // This is the case, if called from omp_init_lock_with_hint:
2871   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2872   if (!codeptr)
2873     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2874   if (ompt_enabled.ompt_callback_mutex_released) {
2875     ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
2876         ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2877   }
2878 #endif
2879 
2880 #else // KMP_USE_DYNAMIC_LOCK
2881 
2882   kmp_user_lock_p lck;
2883 
2884   /* Can't use serial interval since not block structured */
2885   /* release the lock */
2886 
2887   if ((__kmp_user_lock_kind == lk_tas) &&
2888       (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
2889 #if KMP_OS_LINUX &&                                                            \
2890     (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
2891 // "fast" path implemented to fix customer performance issue
2892 #if USE_ITT_BUILD
2893     __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock);
2894 #endif /* USE_ITT_BUILD */
2895     TCW_4(((kmp_user_lock_p)user_lock)->tas.lk.poll, 0);
2896     KMP_MB();
2897 
2898 #if OMPT_SUPPORT && OMPT_OPTIONAL
2899     // This is the case, if called from omp_init_lock_with_hint:
2900     void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2901     if (!codeptr)
2902       codeptr = OMPT_GET_RETURN_ADDRESS(0);
2903     if (ompt_enabled.ompt_callback_mutex_released) {
2904       ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
2905           ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2906     }
2907 #endif
2908 
2909     return;
2910 #else
2911     lck = (kmp_user_lock_p)user_lock;
2912 #endif
2913   }
2914 #if KMP_USE_FUTEX
2915   else if ((__kmp_user_lock_kind == lk_futex) &&
2916            (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
2917     lck = (kmp_user_lock_p)user_lock;
2918   }
2919 #endif
2920   else {
2921     lck = __kmp_lookup_user_lock(user_lock, "omp_unset_lock");
2922   }
2923 
2924 #if USE_ITT_BUILD
2925   __kmp_itt_lock_releasing(lck);
2926 #endif /* USE_ITT_BUILD */
2927 
2928   RELEASE_LOCK(lck, gtid);
2929 
2930 #if OMPT_SUPPORT && OMPT_OPTIONAL
2931   // This is the case, if called from omp_init_lock_with_hint:
2932   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2933   if (!codeptr)
2934     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2935   if (ompt_enabled.ompt_callback_mutex_released) {
2936     ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
2937         ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2938   }
2939 #endif
2940 
2941 #endif // KMP_USE_DYNAMIC_LOCK
2942 }
2943 
2944 /* release the lock */
2945 void __kmpc_unset_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2946 #if KMP_USE_DYNAMIC_LOCK
2947 
2948 #if USE_ITT_BUILD
2949   __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock);
2950 #endif
2951   int release_status =
2952       KMP_D_LOCK_FUNC(user_lock, unset)((kmp_dyna_lock_t *)user_lock, gtid);
2953   (void)release_status;
2954 
2955 #if OMPT_SUPPORT && OMPT_OPTIONAL
2956   // This is the case, if called from omp_init_lock_with_hint:
2957   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2958   if (!codeptr)
2959     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2960   if (ompt_enabled.enabled) {
2961     if (release_status == KMP_LOCK_RELEASED) {
2962       if (ompt_enabled.ompt_callback_mutex_released) {
2963         // release_lock_last
2964         ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
2965             ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)user_lock,
2966             codeptr);
2967       }
2968     } else if (ompt_enabled.ompt_callback_nest_lock) {
2969       // release_lock_prev
2970       ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
2971           ompt_scope_end, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2972     }
2973   }
2974 #endif
2975 
2976 #else // KMP_USE_DYNAMIC_LOCK
2977 
2978   kmp_user_lock_p lck;
2979 
2980   /* Can't use serial interval since not block structured */
2981 
2982   if ((__kmp_user_lock_kind == lk_tas) &&
2983       (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
2984        OMP_NEST_LOCK_T_SIZE)) {
2985 #if KMP_OS_LINUX &&                                                            \
2986     (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
2987     // "fast" path implemented to fix customer performance issue
2988     kmp_tas_lock_t *tl = (kmp_tas_lock_t *)user_lock;
2989 #if USE_ITT_BUILD
2990     __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock);
2991 #endif /* USE_ITT_BUILD */
2992 
2993 #if OMPT_SUPPORT && OMPT_OPTIONAL
2994     int release_status = KMP_LOCK_STILL_HELD;
2995 #endif
2996 
2997     if (--(tl->lk.depth_locked) == 0) {
2998       TCW_4(tl->lk.poll, 0);
2999 #if OMPT_SUPPORT && OMPT_OPTIONAL
3000       release_status = KMP_LOCK_RELEASED;
3001 #endif
3002     }
3003     KMP_MB();
3004 
3005 #if OMPT_SUPPORT && OMPT_OPTIONAL
3006     // This is the case, if called from omp_init_lock_with_hint:
3007     void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
3008     if (!codeptr)
3009       codeptr = OMPT_GET_RETURN_ADDRESS(0);
3010     if (ompt_enabled.enabled) {
3011       if (release_status == KMP_LOCK_RELEASED) {
3012         if (ompt_enabled.ompt_callback_mutex_released) {
3013           // release_lock_last
3014           ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
3015               ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)user_lock,
3016         }
3017       } else if (ompt_enabled.ompt_callback_nest_lock) {
3018         // release_lock_previous
3019         ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
3020             ompt_scope_end, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
3021       }
3022     }
3023 #endif
3024 
3025     return;
3026 #else
3027     lck = (kmp_user_lock_p)user_lock;
3028 #endif
3029   }
3030 #if KMP_USE_FUTEX
3031   else if ((__kmp_user_lock_kind == lk_futex) &&
3032            (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
3033             OMP_NEST_LOCK_T_SIZE)) {
3034     lck = (kmp_user_lock_p)user_lock;
3035   }
3036 #endif
3037   else {
3038     lck = __kmp_lookup_user_lock(user_lock, "omp_unset_nest_lock");
3039   }
3040 
3041 #if USE_ITT_BUILD
3042   __kmp_itt_lock_releasing(lck);
3043 #endif /* USE_ITT_BUILD */
3044 
3045   int release_status;
3046   release_status = RELEASE_NESTED_LOCK(lck, gtid);
3047 #if OMPT_SUPPORT && OMPT_OPTIONAL
3048   // This is the case, if called from omp_init_lock_with_hint:
3049   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
3050   if (!codeptr)
3051     codeptr = OMPT_GET_RETURN_ADDRESS(0);
3052   if (ompt_enabled.enabled) {
3053     if (release_status == KMP_LOCK_RELEASED) {
3054       if (ompt_enabled.ompt_callback_mutex_released) {
3055         // release_lock_last
3056         ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
3057             ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
3058       }
3059     } else if (ompt_enabled.ompt_callback_nest_lock) {
3060       // release_lock_previous
3061       ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
3062           ompt_scope_end, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
3063     }
3064   }
3065 #endif
3066 
3067 #endif // KMP_USE_DYNAMIC_LOCK
3068 }
3069 
3070 /* try to acquire the lock */
3071 int __kmpc_test_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
3072   KMP_COUNT_BLOCK(OMP_test_lock);
3073 
3074 #if KMP_USE_DYNAMIC_LOCK
3075   int rc;
3076   int tag = KMP_EXTRACT_D_TAG(user_lock);
3077 #if USE_ITT_BUILD
3078   __kmp_itt_lock_acquiring((kmp_user_lock_p)user_lock);
3079 #endif
3080 #if OMPT_SUPPORT && OMPT_OPTIONAL
3081   // This is the case, if called from omp_init_lock_with_hint:
3082   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
3083   if (!codeptr)
3084     codeptr = OMPT_GET_RETURN_ADDRESS(0);
3085   if (ompt_enabled.ompt_callback_mutex_acquire) {
3086     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
3087         ompt_mutex_lock, omp_lock_hint_none,
3088         __ompt_get_mutex_impl_type(user_lock),
3089         (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
3090   }
3091 #endif
3092 #if KMP_USE_INLINED_TAS
3093   if (tag == locktag_tas && !__kmp_env_consistency_check) {
3094     KMP_TEST_TAS_LOCK(user_lock, gtid, rc);
3095   } else
3096 #elif KMP_USE_INLINED_FUTEX
3097   if (tag == locktag_futex && !__kmp_env_consistency_check) {
3098     KMP_TEST_FUTEX_LOCK(user_lock, gtid, rc);
3099   } else
3100 #endif
3101   {
3102     rc = __kmp_direct_test[tag]((kmp_dyna_lock_t *)user_lock, gtid);
3103   }
3104   if (rc) {
3105 #if USE_ITT_BUILD
3106     __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock);
3107 #endif
3108 #if OMPT_SUPPORT && OMPT_OPTIONAL
3109     if (ompt_enabled.ompt_callback_mutex_acquired) {
3110       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
3111           ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
3112     }
3113 #endif
3114     return FTN_TRUE;
3115   } else {
3116 #if USE_ITT_BUILD
3117     __kmp_itt_lock_cancelled((kmp_user_lock_p)user_lock);
3118 #endif
3119     return FTN_FALSE;
3120   }
3121 
3122 #else // KMP_USE_DYNAMIC_LOCK
3123 
3124   kmp_user_lock_p lck;
3125   int rc;
3126 
3127   if ((__kmp_user_lock_kind == lk_tas) &&
3128       (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
3129     lck = (kmp_user_lock_p)user_lock;
3130   }
3131 #if KMP_USE_FUTEX
3132   else if ((__kmp_user_lock_kind == lk_futex) &&
3133            (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
3134     lck = (kmp_user_lock_p)user_lock;
3135   }
3136 #endif
3137   else {
3138     lck = __kmp_lookup_user_lock(user_lock, "omp_test_lock");
3139   }
3140 
3141 #if USE_ITT_BUILD
3142   __kmp_itt_lock_acquiring(lck);
3143 #endif /* USE_ITT_BUILD */
3144 #if OMPT_SUPPORT && OMPT_OPTIONAL
3145   // This is the case, if called from omp_init_lock_with_hint:
3146   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
3147   if (!codeptr)
3148     codeptr = OMPT_GET_RETURN_ADDRESS(0);
3149   if (ompt_enabled.ompt_callback_mutex_acquire) {
3150     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
3151         ompt_mutex_lock, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
3152         (ompt_wait_id_t)(uintptr_t)lck, codeptr);
3153   }
3154 #endif
3155 
3156   rc = TEST_LOCK(lck, gtid);
3157 #if USE_ITT_BUILD
3158   if (rc) {
3159     __kmp_itt_lock_acquired(lck);
3160   } else {
3161     __kmp_itt_lock_cancelled(lck);
3162   }
3163 #endif /* USE_ITT_BUILD */
3164 #if OMPT_SUPPORT && OMPT_OPTIONAL
3165   if (rc && ompt_enabled.ompt_callback_mutex_acquired) {
3166     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
3167         ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
3168   }
3169 #endif
3170 
3171   return (rc ? FTN_TRUE : FTN_FALSE);
3172 
3173   /* Can't use serial interval since not block structured */
3174 
3175 #endif // KMP_USE_DYNAMIC_LOCK
3176 }
3177 
3178 /* try to acquire the lock */
3179 int __kmpc_test_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
3180 #if KMP_USE_DYNAMIC_LOCK
3181   int rc;
3182 #if USE_ITT_BUILD
3183   __kmp_itt_lock_acquiring((kmp_user_lock_p)user_lock);
3184 #endif
3185 #if OMPT_SUPPORT && OMPT_OPTIONAL
3186   // This is the case, if called from omp_init_lock_with_hint:
3187   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
3188   if (!codeptr)
3189     codeptr = OMPT_GET_RETURN_ADDRESS(0);
3190   if (ompt_enabled.ompt_callback_mutex_acquire) {
3191     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
3192         ompt_mutex_nest_lock, omp_lock_hint_none,
3193         __ompt_get_mutex_impl_type(user_lock),
3194         (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
3195   }
3196 #endif
3197   rc = KMP_D_LOCK_FUNC(user_lock, test)((kmp_dyna_lock_t *)user_lock, gtid);
3198 #if USE_ITT_BUILD
3199   if (rc) {
3200     __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock);
3201   } else {
3202     __kmp_itt_lock_cancelled((kmp_user_lock_p)user_lock);
3203   }
3204 #endif
3205 #if OMPT_SUPPORT && OMPT_OPTIONAL
3206   if (ompt_enabled.enabled && rc) {
3207     if (rc == 1) {
3208       if (ompt_enabled.ompt_callback_mutex_acquired) {
3209         // lock_first
3210         ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
3211             ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)user_lock,
3212             codeptr);
3213       }
3214     } else {
3215       if (ompt_enabled.ompt_callback_nest_lock) {
3216         // lock_next
3217         ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
3218             ompt_scope_begin, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
3219       }
3220     }
3221   }
3222 #endif
3223   return rc;
3224 
3225 #else // KMP_USE_DYNAMIC_LOCK
3226 
3227   kmp_user_lock_p lck;
3228   int rc;
3229 
3230   if ((__kmp_user_lock_kind == lk_tas) &&
3231       (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
3232        OMP_NEST_LOCK_T_SIZE)) {
3233     lck = (kmp_user_lock_p)user_lock;
3234   }
3235 #if KMP_USE_FUTEX
3236   else if ((__kmp_user_lock_kind == lk_futex) &&
3237            (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
3238             OMP_NEST_LOCK_T_SIZE)) {
3239     lck = (kmp_user_lock_p)user_lock;
3240   }
3241 #endif
3242   else {
3243     lck = __kmp_lookup_user_lock(user_lock, "omp_test_nest_lock");
3244   }
3245 
3246 #if USE_ITT_BUILD
3247   __kmp_itt_lock_acquiring(lck);
3248 #endif /* USE_ITT_BUILD */
3249 
3250 #if OMPT_SUPPORT && OMPT_OPTIONAL
3251   // This is the case, if called from omp_init_lock_with_hint:
3252   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
3253   if (!codeptr)
3254     codeptr = OMPT_GET_RETURN_ADDRESS(0);
3255   if (ompt_enabled.enabled &&
3256       ompt_enabled.ompt_callback_mutex_acquire) {
3257     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
3258         ompt_mutex_nest_lock, omp_lock_hint_none,
3259         __ompt_get_mutex_impl_type(), (ompt_wait_id_t)(uintptr_t)lck,
3260         codeptr);
3261   }
3262 #endif
3263 
3264   rc = TEST_NESTED_LOCK(lck, gtid);
3265 #if USE_ITT_BUILD
3266   if (rc) {
3267     __kmp_itt_lock_acquired(lck);
3268   } else {
3269     __kmp_itt_lock_cancelled(lck);
3270   }
3271 #endif /* USE_ITT_BUILD */
3272 #if OMPT_SUPPORT && OMPT_OPTIONAL
3273   if (ompt_enabled.enabled && rc) {
3274     if (rc == 1) {
3275       if (ompt_enabled.ompt_callback_mutex_acquired) {
3276         // lock_first
3277         ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
3278             ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
3279       }
3280     } else {
3281       if (ompt_enabled.ompt_callback_nest_lock) {
3282         // lock_next
3283         ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
3284             ompt_scope_begin, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
3285       }
3286     }
3287   }
3288 #endif
3289   return rc;
3290 
3291   /* Can't use serial interval since not block structured */
3292 
3293 #endif // KMP_USE_DYNAMIC_LOCK
3294 }
3295 
3296 // Interface to fast scalable reduce methods routines
3297 
3298 // keep the selected method in a thread local structure for cross-function
3299 // usage: will be used in __kmpc_end_reduce* functions;
3300 // another solution: to re-determine the method one more time in
3301 // __kmpc_end_reduce* functions (new prototype required then)
3302 // AT: which solution is better?
3303 #define __KMP_SET_REDUCTION_METHOD(gtid, rmethod)                              \
3304   ((__kmp_threads[(gtid)]->th.th_local.packed_reduction_method) = (rmethod))
3305 
3306 #define __KMP_GET_REDUCTION_METHOD(gtid)                                       \
3307   (__kmp_threads[(gtid)]->th.th_local.packed_reduction_method)
3308 
3309 // description of the packed_reduction_method variable: look at the macros in
3310 // kmp.h
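
// Hedged illustration of the intended pairing (not part of the runtime):
// within one thread, a reduction construct is expected to produce
//   packed_reduction_method = __kmp_determine_reduction_method(...);
//   __KMP_SET_REDUCTION_METHOD(gtid, packed_reduction_method);
//   ... user reduction code ...
//   method = __KMP_GET_REDUCTION_METHOD(gtid); // in the matching __kmpc_end_reduce*
// so the end-call sees exactly the method chosen at the start of the construct
// without re-running the selection logic.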
3311 
3312 // used in a critical section reduce block
3313 static __forceinline void
3314 __kmp_enter_critical_section_reduce_block(ident_t *loc, kmp_int32 global_tid,
3315                                           kmp_critical_name *crit) {
3316 
3317   // this lock was visible to a customer and to the threading profile tool as a
3318   // serial overhead span (although it's used for an internal purpose only)
3319   //            why was it visible in previous implementation?
3320   //            should we keep it visible in new reduce block?
3321   kmp_user_lock_p lck;
3322 
3323 #if KMP_USE_DYNAMIC_LOCK
3324 
3325   kmp_dyna_lock_t *lk = (kmp_dyna_lock_t *)crit;
3326   // Check if it is initialized.
3327   if (*lk == 0) {
3328     if (KMP_IS_D_LOCK(__kmp_user_lock_seq)) {
3329       KMP_COMPARE_AND_STORE_ACQ32((volatile kmp_int32 *)crit, 0,
3330                                   KMP_GET_D_TAG(__kmp_user_lock_seq));
3331     } else {
3332       __kmp_init_indirect_csptr(crit, loc, global_tid,
3333                                 KMP_GET_I_TAG(__kmp_user_lock_seq));
3334     }
3335   }
3336   // Branch for accessing the actual lock object and set operation. This
3337   // branching is inevitable since this lock initialization does not follow the
3338   // normal dispatch path (lock table is not used).
3339   if (KMP_EXTRACT_D_TAG(lk) != 0) {
3340     lck = (kmp_user_lock_p)lk;
3341     KMP_DEBUG_ASSERT(lck != NULL);
3342     if (__kmp_env_consistency_check) {
3343       __kmp_push_sync(global_tid, ct_critical, loc, lck, __kmp_user_lock_seq);
3344     }
3345     KMP_D_LOCK_FUNC(lk, set)(lk, global_tid);
3346   } else {
3347     kmp_indirect_lock_t *ilk = *((kmp_indirect_lock_t **)lk);
3348     lck = ilk->lock;
3349     KMP_DEBUG_ASSERT(lck != NULL);
3350     if (__kmp_env_consistency_check) {
3351       __kmp_push_sync(global_tid, ct_critical, loc, lck, __kmp_user_lock_seq);
3352     }
3353     KMP_I_LOCK_FUNC(ilk, set)(lck, global_tid);
3354   }
3355 
3356 #else // KMP_USE_DYNAMIC_LOCK
3357 
3358   // We know that the fast reduction code is only emitted by Intel compilers
3359   // with 32 byte critical sections. If there isn't enough space, then we
3360   // have to use a pointer.
3361   if (__kmp_base_user_lock_size <= INTEL_CRITICAL_SIZE) {
3362     lck = (kmp_user_lock_p)crit;
3363   } else {
3364     lck = __kmp_get_critical_section_ptr(crit, loc, global_tid);
3365   }
3366   KMP_DEBUG_ASSERT(lck != NULL);
3367 
3368   if (__kmp_env_consistency_check)
3369     __kmp_push_sync(global_tid, ct_critical, loc, lck);
3370 
3371   __kmp_acquire_user_lock_with_checks(lck, global_tid);
3372 
3373 #endif // KMP_USE_DYNAMIC_LOCK
3374 }
3375 
3376 // used in a critical section reduce block
3377 static __forceinline void
3378 __kmp_end_critical_section_reduce_block(ident_t *loc, kmp_int32 global_tid,
3379                                         kmp_critical_name *crit) {
3380 
3381   kmp_user_lock_p lck;
3382 
3383 #if KMP_USE_DYNAMIC_LOCK
3384 
3385   if (KMP_IS_D_LOCK(__kmp_user_lock_seq)) {
3386     lck = (kmp_user_lock_p)crit;
3387     if (__kmp_env_consistency_check)
3388       __kmp_pop_sync(global_tid, ct_critical, loc);
3389     KMP_D_LOCK_FUNC(lck, unset)((kmp_dyna_lock_t *)lck, global_tid);
3390   } else {
3391     kmp_indirect_lock_t *ilk =
3392         (kmp_indirect_lock_t *)TCR_PTR(*((kmp_indirect_lock_t **)crit));
3393     if (__kmp_env_consistency_check)
3394       __kmp_pop_sync(global_tid, ct_critical, loc);
3395     KMP_I_LOCK_FUNC(ilk, unset)(ilk->lock, global_tid);
3396   }
3397 
3398 #else // KMP_USE_DYNAMIC_LOCK
3399 
3400   // We know that the fast reduction code is only emitted by Intel compilers
3401   // with 32 byte critical sections. If there isn't enough space, then we have
3402   // to use a pointer.
3403   if (__kmp_base_user_lock_size > 32) {
3404     lck = *((kmp_user_lock_p *)crit);
3405     KMP_ASSERT(lck != NULL);
3406   } else {
3407     lck = (kmp_user_lock_p)crit;
3408   }
3409 
3410   if (__kmp_env_consistency_check)
3411     __kmp_pop_sync(global_tid, ct_critical, loc);
3412 
3413   __kmp_release_user_lock_with_checks(lck, global_tid);
3414 
3415 #endif // KMP_USE_DYNAMIC_LOCK
3416 } // __kmp_end_critical_section_reduce_block
3417 
3418 static __forceinline int
3419 __kmp_swap_teams_for_teams_reduction(kmp_info_t *th, kmp_team_t **team_p,
3420                                      int *task_state) {
3421   kmp_team_t *team;
3422 
3423   // Check if we are inside a teams construct.
3424   if (th->th.th_teams_microtask) {
3425     *team_p = team = th->th.th_team;
3426     if (team->t.t_level == th->th.th_teams_level) {
3427       // This is reduction at teams construct.
3428       KMP_DEBUG_ASSERT(!th->th.th_info.ds.ds_tid); // AC: check that tid == 0
3429       // Let's swap teams temporarily for the reduction.
3430       th->th.th_info.ds.ds_tid = team->t.t_master_tid;
3431       th->th.th_team = team->t.t_parent;
3432       th->th.th_team_nproc = th->th.th_team->t.t_nproc;
3433       th->th.th_task_team = th->th.th_team->t.t_task_team[0];
3434       *task_state = th->th.th_task_state;
3435       th->th.th_task_state = 0;
3436 
3437       return 1;
3438     }
3439   }
3440   return 0;
3441 }
3442 
3443 static __forceinline void
3444 __kmp_restore_swapped_teams(kmp_info_t *th, kmp_team_t *team, int task_state) {
3445   // Restore thread structure swapped in __kmp_swap_teams_for_teams_reduction.
3446   th->th.th_info.ds.ds_tid = 0;
3447   th->th.th_team = team;
3448   th->th.th_team_nproc = team->t.t_nproc;
3449   th->th.th_task_team = team->t.t_task_team[task_state];
3450   __kmp_type_convert(task_state, &(th->th.th_task_state));
3451 }
3452 
3453 /* 2.a.i. Reduce Block without a terminating barrier */
3454 /*!
3455 @ingroup SYNCHRONIZATION
3456 @param loc source location information
3457 @param global_tid global thread number
3458 @param num_vars number of items (variables) to be reduced
3459 @param reduce_size size of data in bytes to be reduced
3460 @param reduce_data pointer to data to be reduced
3461 @param reduce_func callback function providing reduction operation on two
3462 operands and returning result of reduction in lhs_data
3463 @param lck pointer to the unique lock data structure
3464 @result 1 for the primary thread, 0 for all other team threads, 2 for all team
3465 threads if atomic reduction needed
3466 
3467 The nowait version is used for a reduce clause with the nowait argument.
3468 */
3469 kmp_int32
3470 __kmpc_reduce_nowait(ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars,
3471                      size_t reduce_size, void *reduce_data,
3472                      void (*reduce_func)(void *lhs_data, void *rhs_data),
3473                      kmp_critical_name *lck) {
3474 
3475   KMP_COUNT_BLOCK(REDUCE_nowait);
3476   int retval = 0;
3477   PACKED_REDUCTION_METHOD_T packed_reduction_method;
3478   kmp_info_t *th;
3479   kmp_team_t *team;
3480   int teams_swapped = 0, task_state;
3481   KA_TRACE(10, ("__kmpc_reduce_nowait() enter: called T#%d\n", global_tid));
3482   __kmp_assert_valid_gtid(global_tid);
3483 
3484   // why do we need this initialization here at all?
3485   // A reduction clause cannot be used as a stand-alone directive.
3486 
3487   // do not call __kmp_serial_initialize(), it will be called by
3488   // __kmp_parallel_initialize() if needed
3489   // possible detection of false-positive race by the threadchecker ???
3490   if (!TCR_4(__kmp_init_parallel))
3491     __kmp_parallel_initialize();
3492 
3493   __kmp_resume_if_soft_paused();
3494 
3495 // check correctness of reduce block nesting
3496 #if KMP_USE_DYNAMIC_LOCK
3497   if (__kmp_env_consistency_check)
3498     __kmp_push_sync(global_tid, ct_reduce, loc, NULL, 0);
3499 #else
3500   if (__kmp_env_consistency_check)
3501     __kmp_push_sync(global_tid, ct_reduce, loc, NULL);
3502 #endif
3503 
3504   th = __kmp_thread_from_gtid(global_tid);
3505   teams_swapped = __kmp_swap_teams_for_teams_reduction(th, &team, &task_state);
3506 
3507   // packed_reduction_method value will be reused by __kmp_end_reduce* function,
3508   // the value should be kept in a variable
3509   // the variable should be either a construct-specific or thread-specific
3510   // property, not a team specific property
3511   //     (a thread can reach the next reduce block on the next construct, reduce
3512   //     method may differ on the next construct)
3513   // an ident_t "loc" parameter could be used as a construct-specific property
3514   // (what if loc == 0?)
3515   //     (if both construct-specific and team-specific variables were shared,
3516   //     then unnecessary extra syncs would be needed)
3517   // a thread-specific variable is better regarding two issues above (next
3518   // construct and extra syncs)
3519   // a thread-specific "th_local.reduction_method" variable is used currently
3520   // each thread executes 'determine' and 'set' lines (no need to execute by one
3521 // thread, to avoid unnecessary extra syncs)
3522 
3523   packed_reduction_method = __kmp_determine_reduction_method(
3524       loc, global_tid, num_vars, reduce_size, reduce_data, reduce_func, lck);
3525   __KMP_SET_REDUCTION_METHOD(global_tid, packed_reduction_method);
3526 
3527   OMPT_REDUCTION_DECL(th, global_tid);
3528   if (packed_reduction_method == critical_reduce_block) {
3529 
3530     OMPT_REDUCTION_BEGIN;
3531 
3532     __kmp_enter_critical_section_reduce_block(loc, global_tid, lck);
3533     retval = 1;
3534 
3535   } else if (packed_reduction_method == empty_reduce_block) {
3536 
3537     OMPT_REDUCTION_BEGIN;
3538 
3539     // usage: if team size == 1, no synchronization is required ( Intel
3540     // platforms only )
3541     retval = 1;
3542 
3543   } else if (packed_reduction_method == atomic_reduce_block) {
3544 
3545     retval = 2;
3546 
3547     // all threads should do this pop here (because __kmpc_end_reduce_nowait()
3548     // won't be called by the code gen)
3549     //     (it's not quite good, because the checking block has been closed by
3550     //     this 'pop',
3551     //      but atomic operation has not been executed yet, will be executed
3552     //      slightly later, literally on next instruction)
3553     if (__kmp_env_consistency_check)
3554       __kmp_pop_sync(global_tid, ct_reduce, loc);
3555 
3556   } else if (TEST_REDUCTION_METHOD(packed_reduction_method,
3557                                    tree_reduce_block)) {
3558 
3559 // AT: performance issue: a real barrier here
3560 // AT: (if primary thread is slow, other threads are blocked here waiting for
3561 //      the primary thread to come and release them)
3562 // AT: (it's not what a customer might expect specifying NOWAIT clause)
3563 // AT: (specifying NOWAIT won't result in improvement of performance, it'll
3564 //      be confusing to a customer)
3565 // AT: another implementation of *barrier_gather*nowait() (or some other design)
3566 // might go faster and be more in line with sense of NOWAIT
3567 // AT: TO DO: do epcc test and compare times
3568 
3569 // this barrier should be invisible to a customer and to the threading profile
3570 // tool (it's neither a terminating barrier nor customer's code, it's
3571 // used for an internal purpose)
3572 #if OMPT_SUPPORT
3573     // JP: can this barrier potentially lead to task scheduling?
3574     // JP: as long as there is a barrier in the implementation, OMPT should and
3575     // will provide the barrier events
3576     //         so we set-up the necessary frame/return addresses.
3577     ompt_frame_t *ompt_frame;
3578     if (ompt_enabled.enabled) {
3579       __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
3580       if (ompt_frame->enter_frame.ptr == NULL)
3581         ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
3582     }
3583     OMPT_STORE_RETURN_ADDRESS(global_tid);
3584 #endif
3585 #if USE_ITT_NOTIFY
3586     __kmp_threads[global_tid]->th.th_ident = loc;
3587 #endif
3588     retval =
3589         __kmp_barrier(UNPACK_REDUCTION_BARRIER(packed_reduction_method),
3590                       global_tid, FALSE, reduce_size, reduce_data, reduce_func);
3591     retval = (retval != 0) ? (0) : (1);
3592 #if OMPT_SUPPORT && OMPT_OPTIONAL
3593     if (ompt_enabled.enabled) {
3594       ompt_frame->enter_frame = ompt_data_none;
3595     }
3596 #endif
3597 
3598     // all other workers except primary thread should do this pop here
3599     //     ( none of other workers will get to __kmpc_end_reduce_nowait() )
3600     if (__kmp_env_consistency_check) {
3601       if (retval == 0) {
3602         __kmp_pop_sync(global_tid, ct_reduce, loc);
3603       }
3604     }
3605 
3606   } else {
3607 
3608     // should never reach this block
3609     KMP_ASSERT(0); // "unexpected method"
3610   }
3611   if (teams_swapped) {
3612     __kmp_restore_swapped_teams(th, team, task_state);
3613   }
3614   KA_TRACE(
3615       10,
3616       ("__kmpc_reduce_nowait() exit: called T#%d: method %08x, returns %08x\n",
3617        global_tid, packed_reduction_method, retval));
3618 
3619   return retval;
3620 }
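
// A minimal sketch (kept out of the build) of the calling pattern that
// compiler-generated code is expected to follow around __kmpc_reduce_nowait();
// the combiner, the private/shared variables and the atomic fallback comment
// are hypothetical placeholders, not part of this runtime.
#if 0
static void __example_add_combiner(void *lhs_data, void *rhs_data) {
  // combine rhs into lhs, as required of reduce_func
  *(int *)lhs_data += *(int *)rhs_data;
}

static void __example_reduce_nowait(ident_t *loc, kmp_int32 gtid, int *priv,
                                    int *shared, kmp_critical_name *crit) {
  switch (__kmpc_reduce_nowait(loc, gtid, /*num_vars=*/1, sizeof(int), priv,
                               __example_add_combiner, crit)) {
  case 1: // critical/empty/tree path: this thread combines, then ends
    *shared += *priv;
    __kmpc_end_reduce_nowait(loc, gtid, crit);
    break;
  case 2: // atomic path: each thread combines atomically;
          // __kmpc_end_reduce_nowait() is not called in this case
    break;
  default: // 0: tree path, non-primary threads have nothing left to do
    break;
  }
}
#endif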
3621 
3622 /*!
3623 @ingroup SYNCHRONIZATION
3624 @param loc source location information
3625 @param global_tid global thread id.
3626 @param lck pointer to the unique lock data structure
3627 
3628 Finish the execution of a reduce nowait.
3629 */
3630 void __kmpc_end_reduce_nowait(ident_t *loc, kmp_int32 global_tid,
3631                               kmp_critical_name *lck) {
3632 
3633   PACKED_REDUCTION_METHOD_T packed_reduction_method;
3634 
3635   KA_TRACE(10, ("__kmpc_end_reduce_nowait() enter: called T#%d\n", global_tid));
3636   __kmp_assert_valid_gtid(global_tid);
3637 
3638   packed_reduction_method = __KMP_GET_REDUCTION_METHOD(global_tid);
3639 
3640   OMPT_REDUCTION_DECL(__kmp_thread_from_gtid(global_tid), global_tid);
3641 
3642   if (packed_reduction_method == critical_reduce_block) {
3643 
3644     __kmp_end_critical_section_reduce_block(loc, global_tid, lck);
3645     OMPT_REDUCTION_END;
3646 
3647   } else if (packed_reduction_method == empty_reduce_block) {
3648 
3649     // usage: if team size == 1, no synchronization is required ( on Intel
3650     // platforms only )
3651 
3652     OMPT_REDUCTION_END;
3653 
3654   } else if (packed_reduction_method == atomic_reduce_block) {
3655 
3656     // neither primary thread nor other workers should get here
3657     //     (code gen does not generate this call in case 2: atomic reduce block)
3658     // actually it's better to remove this elseif at all;
3659     // after removal this value will checked by the 'else' and will assert
3660 
3661   } else if (TEST_REDUCTION_METHOD(packed_reduction_method,
3662                                    tree_reduce_block)) {
3663 
3664     // only primary thread gets here
3665     // OMPT: tree reduction is annotated in the barrier code
3666 
3667   } else {
3668 
3669     // should never reach this block
3670     KMP_ASSERT(0); // "unexpected method"
3671   }
3672 
3673   if (__kmp_env_consistency_check)
3674     __kmp_pop_sync(global_tid, ct_reduce, loc);
3675 
3676   KA_TRACE(10, ("__kmpc_end_reduce_nowait() exit: called T#%d: method %08x\n",
3677                 global_tid, packed_reduction_method));
3678 
3679   return;
3680 }
3681 
3682 /* 2.a.ii. Reduce Block with a terminating barrier */
3683 
3684 /*!
3685 @ingroup SYNCHRONIZATION
3686 @param loc source location information
3687 @param global_tid global thread number
3688 @param num_vars number of items (variables) to be reduced
3689 @param reduce_size size of data in bytes to be reduced
3690 @param reduce_data pointer to data to be reduced
3691 @param reduce_func callback function providing reduction operation on two
3692 operands and returning result of reduction in lhs_data
3693 @param lck pointer to the unique lock data structure
3694 @result 1 for the primary thread, 0 for all other team threads, 2 for all team
3695 threads if atomic reduction needed
3696 
3697 A blocking reduce that includes an implicit barrier.
3698 */
3699 kmp_int32 __kmpc_reduce(ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars,
3700                         size_t reduce_size, void *reduce_data,
3701                         void (*reduce_func)(void *lhs_data, void *rhs_data),
3702                         kmp_critical_name *lck) {
3703   KMP_COUNT_BLOCK(REDUCE_wait);
3704   int retval = 0;
3705   PACKED_REDUCTION_METHOD_T packed_reduction_method;
3706   kmp_info_t *th;
3707   kmp_team_t *team;
3708   int teams_swapped = 0, task_state;
3709 
3710   KA_TRACE(10, ("__kmpc_reduce() enter: called T#%d\n", global_tid));
3711   __kmp_assert_valid_gtid(global_tid);
3712 
3713   // why do we need this initialization here at all?
3714   // A reduction clause cannot be a stand-alone directive.
3715 
3716   // do not call __kmp_serial_initialize(), it will be called by
3717   // __kmp_parallel_initialize() if needed
3718   // possible detection of false-positive race by the threadchecker ???
3719   if (!TCR_4(__kmp_init_parallel))
3720     __kmp_parallel_initialize();
3721 
3722   __kmp_resume_if_soft_paused();
3723 
3724 // check correctness of reduce block nesting
3725 #if KMP_USE_DYNAMIC_LOCK
3726   if (__kmp_env_consistency_check)
3727     __kmp_push_sync(global_tid, ct_reduce, loc, NULL, 0);
3728 #else
3729   if (__kmp_env_consistency_check)
3730     __kmp_push_sync(global_tid, ct_reduce, loc, NULL);
3731 #endif
3732 
3733   th = __kmp_thread_from_gtid(global_tid);
3734   teams_swapped = __kmp_swap_teams_for_teams_reduction(th, &team, &task_state);
3735 
3736   packed_reduction_method = __kmp_determine_reduction_method(
3737       loc, global_tid, num_vars, reduce_size, reduce_data, reduce_func, lck);
3738   __KMP_SET_REDUCTION_METHOD(global_tid, packed_reduction_method);
3739 
3740   OMPT_REDUCTION_DECL(th, global_tid);
3741 
3742   if (packed_reduction_method == critical_reduce_block) {
3743 
3744     OMPT_REDUCTION_BEGIN;
3745     __kmp_enter_critical_section_reduce_block(loc, global_tid, lck);
3746     retval = 1;
3747 
3748   } else if (packed_reduction_method == empty_reduce_block) {
3749 
3750     OMPT_REDUCTION_BEGIN;
3751     // usage: if team size == 1, no synchronization is required ( Intel
3752     // platforms only )
3753     retval = 1;
3754 
3755   } else if (packed_reduction_method == atomic_reduce_block) {
3756 
3757     retval = 2;
3758 
3759   } else if (TEST_REDUCTION_METHOD(packed_reduction_method,
3760                                    tree_reduce_block)) {
3761 
3762 // case tree_reduce_block:
3763 // this barrier should be visible to a customer and to the threading profile
3764 // tool (it's a terminating barrier on constructs if NOWAIT not specified)
3765 #if OMPT_SUPPORT
3766     ompt_frame_t *ompt_frame;
3767     if (ompt_enabled.enabled) {
3768       __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
3769       if (ompt_frame->enter_frame.ptr == NULL)
3770         ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
3771     }
3772     OMPT_STORE_RETURN_ADDRESS(global_tid);
3773 #endif
3774 #if USE_ITT_NOTIFY
3775     __kmp_threads[global_tid]->th.th_ident =
3776         loc; // needed for correct notification of frames
3777 #endif
3778     retval =
3779         __kmp_barrier(UNPACK_REDUCTION_BARRIER(packed_reduction_method),
3780                       global_tid, TRUE, reduce_size, reduce_data, reduce_func);
3781     retval = (retval != 0) ? (0) : (1);
3782 #if OMPT_SUPPORT && OMPT_OPTIONAL
3783     if (ompt_enabled.enabled) {
3784       ompt_frame->enter_frame = ompt_data_none;
3785     }
3786 #endif
3787 
3788     // all other workers except primary thread should do this pop here
3789     // (none of other workers except primary will enter __kmpc_end_reduce())
3790     if (__kmp_env_consistency_check) {
3791       if (retval == 0) { // 0: all other workers; 1: primary thread
3792         __kmp_pop_sync(global_tid, ct_reduce, loc);
3793       }
3794     }
3795 
3796   } else {
3797 
3798     // should never reach this block
3799     KMP_ASSERT(0); // "unexpected method"
3800   }
3801   if (teams_swapped) {
3802     __kmp_restore_swapped_teams(th, team, task_state);
3803   }
3804 
3805   KA_TRACE(10,
3806            ("__kmpc_reduce() exit: called T#%d: method %08x, returns %08x\n",
3807             global_tid, packed_reduction_method, retval));
3808   return retval;
3809 }
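
// A hedged sketch (kept out of the build) of the blocking counterpart: case 1
// combines and then calls __kmpc_end_reduce(), which also supplies the
// terminating barrier; whether case 2 (atomic) also calls __kmpc_end_reduce()
// for that barrier is a code-generation choice, so it is only noted in a
// comment here. Names are hypothetical placeholders.
#if 0
static void __example_reduce_blocking(ident_t *loc, kmp_int32 gtid, int *priv,
                                      int *shared, kmp_critical_name *crit,
                                      void (*combiner)(void *, void *)) {
  switch (__kmpc_reduce(loc, gtid, /*num_vars=*/1, sizeof(int), priv, combiner,
                        crit)) {
  case 1:
    *shared += *priv; // combine, then release/barrier via __kmpc_end_reduce()
    __kmpc_end_reduce(loc, gtid, crit);
    break;
  case 2:
    // atomic combine goes here; the terminating barrier is then provided
    // either by __kmpc_end_reduce() or by separately emitted barrier code
    break;
  default: // 0: tree path, non-primary threads were released at the barrier
    break;
  }
}
#endif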
3810 
3811 /*!
3812 @ingroup SYNCHRONIZATION
3813 @param loc source location information
3814 @param global_tid global thread id.
3815 @param lck pointer to the unique lock data structure
3816 
3817 Finish the execution of a blocking reduce.
3818 The <tt>lck</tt> pointer must be the same as that used in the corresponding
3819 start function.
3820 */
3821 void __kmpc_end_reduce(ident_t *loc, kmp_int32 global_tid,
3822                        kmp_critical_name *lck) {
3823 
3824   PACKED_REDUCTION_METHOD_T packed_reduction_method;
3825   kmp_info_t *th;
3826   kmp_team_t *team;
3827   int teams_swapped = 0, task_state;
3828 
3829   KA_TRACE(10, ("__kmpc_end_reduce() enter: called T#%d\n", global_tid));
3830   __kmp_assert_valid_gtid(global_tid);
3831 
3832   th = __kmp_thread_from_gtid(global_tid);
3833   teams_swapped = __kmp_swap_teams_for_teams_reduction(th, &team, &task_state);
3834 
3835   packed_reduction_method = __KMP_GET_REDUCTION_METHOD(global_tid);
3836 
3837   // this barrier should be visible to a customer and to the threading profile
3838   // tool (it's a terminating barrier on constructs if NOWAIT not specified)
3839   OMPT_REDUCTION_DECL(th, global_tid);
3840 
3841   if (packed_reduction_method == critical_reduce_block) {
3842     __kmp_end_critical_section_reduce_block(loc, global_tid, lck);
3843 
3844     OMPT_REDUCTION_END;
3845 
3846 // TODO: implicit barrier: should be exposed
3847 #if OMPT_SUPPORT
3848     ompt_frame_t *ompt_frame;
3849     if (ompt_enabled.enabled) {
3850       __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
3851       if (ompt_frame->enter_frame.ptr == NULL)
3852         ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
3853     }
3854     OMPT_STORE_RETURN_ADDRESS(global_tid);
3855 #endif
3856 #if USE_ITT_NOTIFY
3857     __kmp_threads[global_tid]->th.th_ident = loc;
3858 #endif
3859     __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
3860 #if OMPT_SUPPORT && OMPT_OPTIONAL
3861     if (ompt_enabled.enabled) {
3862       ompt_frame->enter_frame = ompt_data_none;
3863     }
3864 #endif
3865 
3866   } else if (packed_reduction_method == empty_reduce_block) {
3867 
3868     OMPT_REDUCTION_END;
3869 
3870 // usage: if team size==1, no synchronization is required (Intel platforms only)
3871 
3872 // TODO: implicit barrier: should be exposed
3873 #if OMPT_SUPPORT
3874     ompt_frame_t *ompt_frame;
3875     if (ompt_enabled.enabled) {
3876       __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
3877       if (ompt_frame->enter_frame.ptr == NULL)
3878         ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
3879     }
3880     OMPT_STORE_RETURN_ADDRESS(global_tid);
3881 #endif
3882 #if USE_ITT_NOTIFY
3883     __kmp_threads[global_tid]->th.th_ident = loc;
3884 #endif
3885     __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
3886 #if OMPT_SUPPORT && OMPT_OPTIONAL
3887     if (ompt_enabled.enabled) {
3888       ompt_frame->enter_frame = ompt_data_none;
3889     }
3890 #endif
3891 
3892   } else if (packed_reduction_method == atomic_reduce_block) {
3893 
3894 #if OMPT_SUPPORT
3895     ompt_frame_t *ompt_frame;
3896     if (ompt_enabled.enabled) {
3897       __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
3898       if (ompt_frame->enter_frame.ptr == NULL)
3899         ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
3900     }
3901     OMPT_STORE_RETURN_ADDRESS(global_tid);
3902 #endif
3903 // TODO: implicit barrier: should be exposed
3904 #if USE_ITT_NOTIFY
3905     __kmp_threads[global_tid]->th.th_ident = loc;
3906 #endif
3907     __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
3908 #if OMPT_SUPPORT && OMPT_OPTIONAL
3909     if (ompt_enabled.enabled) {
3910       ompt_frame->enter_frame = ompt_data_none;
3911     }
3912 #endif
3913 
3914   } else if (TEST_REDUCTION_METHOD(packed_reduction_method,
3915                                    tree_reduce_block)) {
3916 
3917     // only primary thread executes here (primary releases all other workers)
3918     __kmp_end_split_barrier(UNPACK_REDUCTION_BARRIER(packed_reduction_method),
3919                             global_tid);
3920 
3921   } else {
3922 
3923     // should never reach this block
3924     KMP_ASSERT(0); // "unexpected method"
3925   }
3926   if (teams_swapped) {
3927     __kmp_restore_swapped_teams(th, team, task_state);
3928   }
3929 
3930   if (__kmp_env_consistency_check)
3931     __kmp_pop_sync(global_tid, ct_reduce, loc);
3932 
3933   KA_TRACE(10, ("__kmpc_end_reduce() exit: called T#%d: method %08x\n",
3934                 global_tid, packed_reduction_method));
3935 
3936   return;
3937 }
3938 
3939 #undef __KMP_GET_REDUCTION_METHOD
3940 #undef __KMP_SET_REDUCTION_METHOD
3941 
3942 /* end of interface to fast scalable reduce routines */
3943 
3944 kmp_uint64 __kmpc_get_taskid() {
3945 
3946   kmp_int32 gtid;
3947   kmp_info_t *thread;
3948 
3949   gtid = __kmp_get_gtid();
3950   if (gtid < 0) {
3951     return 0;
3952   }
3953   thread = __kmp_thread_from_gtid(gtid);
3954   return thread->th.th_current_task->td_task_id;
3955 
3956 } // __kmpc_get_taskid
3957 
3958 kmp_uint64 __kmpc_get_parent_taskid() {
3959 
3960   kmp_int32 gtid;
3961   kmp_info_t *thread;
3962   kmp_taskdata_t *parent_task;
3963 
3964   gtid = __kmp_get_gtid();
3965   if (gtid < 0) {
3966     return 0;
3967   }
3968   thread = __kmp_thread_from_gtid(gtid);
3969   parent_task = thread->th.th_current_task->td_parent;
3970   return (parent_task == NULL ? 0 : parent_task->td_task_id);
3971 
3972 } // __kmpc_get_parent_taskid
3973 
3974 /*!
3975 @ingroup WORK_SHARING
3976 @param loc  source location information.
3977 @param gtid  global thread number.
3978 @param num_dims  number of associated doacross loops.
3979 @param dims  info on loops bounds.
3980 
3981 Initialize doacross loop information.
3982 The compiler is expected to send us inclusive bounds,
3983 e.g. for(i=2;i<9;i+=2) lo=2, up=8, st=2.
3984 */
3985 void __kmpc_doacross_init(ident_t *loc, int gtid, int num_dims,
3986                           const struct kmp_dim *dims) {
3987   __kmp_assert_valid_gtid(gtid);
3988   int j, idx;
3989   kmp_int64 last, trace_count;
3990   kmp_info_t *th = __kmp_threads[gtid];
3991   kmp_team_t *team = th->th.th_team;
3992   kmp_uint32 *flags;
3993   kmp_disp_t *pr_buf = th->th.th_dispatch;
3994   dispatch_shared_info_t *sh_buf;
3995 
3996   KA_TRACE(
3997       20,
3998       ("__kmpc_doacross_init() enter: called T#%d, num dims %d, active %d\n",
3999        gtid, num_dims, !team->t.t_serialized));
4000   KMP_DEBUG_ASSERT(dims != NULL);
4001   KMP_DEBUG_ASSERT(num_dims > 0);
4002 
4003   if (team->t.t_serialized) {
4004     KA_TRACE(20, ("__kmpc_doacross_init() exit: serialized team\n"));
4005     return; // no dependencies if team is serialized
4006   }
4007   KMP_DEBUG_ASSERT(team->t.t_nproc > 1);
4008   idx = pr_buf->th_doacross_buf_idx++; // Increment index of shared buffer for
4009   // the next loop
4010   sh_buf = &team->t.t_disp_buffer[idx % __kmp_dispatch_num_buffers];
4011 
4012   // Save bounds info into allocated private buffer
4013   KMP_DEBUG_ASSERT(pr_buf->th_doacross_info == NULL);
4014   pr_buf->th_doacross_info = (kmp_int64 *)__kmp_thread_malloc(
4015       th, sizeof(kmp_int64) * (4 * num_dims + 1));
4016   KMP_DEBUG_ASSERT(pr_buf->th_doacross_info != NULL);
4017   pr_buf->th_doacross_info[0] =
4018       (kmp_int64)num_dims; // first element is number of dimensions
4019   // Save also address of num_done in order to access it later without knowing
4020   // the buffer index
4021   pr_buf->th_doacross_info[1] = (kmp_int64)&sh_buf->doacross_num_done;
4022   pr_buf->th_doacross_info[2] = dims[0].lo;
4023   pr_buf->th_doacross_info[3] = dims[0].up;
4024   pr_buf->th_doacross_info[4] = dims[0].st;
4025   last = 5;
4026   for (j = 1; j < num_dims; ++j) {
4027     kmp_int64
4028         range_length; // To keep ranges of all dimensions but the first dims[0]
4029     if (dims[j].st == 1) { // most common case
4030       // AC: should we care of ranges bigger than LLONG_MAX? (not for now)
4031       range_length = dims[j].up - dims[j].lo + 1;
4032     } else {
4033       if (dims[j].st > 0) {
4034         KMP_DEBUG_ASSERT(dims[j].up > dims[j].lo);
4035         range_length = (kmp_uint64)(dims[j].up - dims[j].lo) / dims[j].st + 1;
4036       } else { // negative increment
4037         KMP_DEBUG_ASSERT(dims[j].lo > dims[j].up);
4038         range_length =
4039             (kmp_uint64)(dims[j].lo - dims[j].up) / (-dims[j].st) + 1;
4040       }
4041     }
4042     pr_buf->th_doacross_info[last++] = range_length;
4043     pr_buf->th_doacross_info[last++] = dims[j].lo;
4044     pr_buf->th_doacross_info[last++] = dims[j].up;
4045     pr_buf->th_doacross_info[last++] = dims[j].st;
4046   }
4047 
4048   // Compute total trip count.
4049   // Start with range of dims[0] which we don't need to keep in the buffer.
4050   if (dims[0].st == 1) { // most common case
4051     trace_count = dims[0].up - dims[0].lo + 1;
4052   } else if (dims[0].st > 0) {
4053     KMP_DEBUG_ASSERT(dims[0].up > dims[0].lo);
4054     trace_count = (kmp_uint64)(dims[0].up - dims[0].lo) / dims[0].st + 1;
4055   } else { // negative increment
4056     KMP_DEBUG_ASSERT(dims[0].lo > dims[0].up);
4057     trace_count = (kmp_uint64)(dims[0].lo - dims[0].up) / (-dims[0].st) + 1;
4058   }
4059   for (j = 1; j < num_dims; ++j) {
4060     trace_count *= pr_buf->th_doacross_info[4 * j + 1]; // use kept ranges
4061   }
4062   KMP_DEBUG_ASSERT(trace_count > 0);
4063 
4064   // Check if shared buffer is not occupied by other loop (idx -
4065   // __kmp_dispatch_num_buffers)
4066   if (idx != sh_buf->doacross_buf_idx) {
4067     // Shared buffer is occupied, wait for it to be free
4068     __kmp_wait_4((volatile kmp_uint32 *)&sh_buf->doacross_buf_idx, idx,
4069                  __kmp_eq_4, NULL);
4070   }
4071 #if KMP_32_BIT_ARCH
4072   // Check if we are the first thread. After the CAS the first thread gets 0,
4073   // others get 1 if initialization is in progress, allocated pointer otherwise.
4074   // Treat pointer as volatile integer (value 0 or 1) until memory is allocated.
4075   flags = (kmp_uint32 *)KMP_COMPARE_AND_STORE_RET32(
4076       (volatile kmp_int32 *)&sh_buf->doacross_flags, NULL, 1);
4077 #else
4078   flags = (kmp_uint32 *)KMP_COMPARE_AND_STORE_RET64(
4079       (volatile kmp_int64 *)&sh_buf->doacross_flags, NULL, 1LL);
4080 #endif
4081   if (flags == NULL) {
4082     // we are the first thread, allocate the array of flags
4083     size_t size =
4084         (size_t)trace_count / 8 + 8; // in bytes, use single bit per iteration
4085     flags = (kmp_uint32 *)__kmp_thread_calloc(th, size, 1);
4086     KMP_MB();
4087     sh_buf->doacross_flags = flags;
4088   } else if (flags == (kmp_uint32 *)1) {
4089 #if KMP_32_BIT_ARCH
4090     // initialization is still in progress, need to wait
4091     while (*(volatile kmp_int32 *)&sh_buf->doacross_flags == 1)
4092 #else
4093     while (*(volatile kmp_int64 *)&sh_buf->doacross_flags == 1LL)
4094 #endif
4095       KMP_YIELD(TRUE);
4096     KMP_MB();
4097   } else {
4098     KMP_MB();
4099   }
4100   KMP_DEBUG_ASSERT(sh_buf->doacross_flags > (kmp_uint32 *)1); // check ptr value
4101   pr_buf->th_doacross_flags =
4102       sh_buf->doacross_flags; // save private copy in order to not
4103   // touch shared buffer on each iteration
4104   KA_TRACE(20, ("__kmpc_doacross_init() exit: T#%d\n", gtid));
4105 }
4106 
4107 void __kmpc_doacross_wait(ident_t *loc, int gtid, const kmp_int64 *vec) {
4108   __kmp_assert_valid_gtid(gtid);
4109   kmp_int64 shft;
4110   size_t num_dims, i;
4111   kmp_uint32 flag;
4112   kmp_int64 iter_number; // iteration number of "collapsed" loop nest
4113   kmp_info_t *th = __kmp_threads[gtid];
4114   kmp_team_t *team = th->th.th_team;
4115   kmp_disp_t *pr_buf;
4116   kmp_int64 lo, up, st;
4117 
4118   KA_TRACE(20, ("__kmpc_doacross_wait() enter: called T#%d\n", gtid));
4119   if (team->t.t_serialized) {
4120     KA_TRACE(20, ("__kmpc_doacross_wait() exit: serialized team\n"));
4121     return; // no dependencies if team is serialized
4122   }
4123 
4124   // calculate sequential iteration number and check out-of-bounds condition
4125   pr_buf = th->th.th_dispatch;
4126   KMP_DEBUG_ASSERT(pr_buf->th_doacross_info != NULL);
4127   num_dims = (size_t)pr_buf->th_doacross_info[0];
4128   lo = pr_buf->th_doacross_info[2];
4129   up = pr_buf->th_doacross_info[3];
4130   st = pr_buf->th_doacross_info[4];
4131 #if OMPT_SUPPORT && OMPT_OPTIONAL
4132   ompt_dependence_t deps[num_dims];
4133 #endif
4134   if (st == 1) { // most common case
4135     if (vec[0] < lo || vec[0] > up) {
4136       KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
4137                     "bounds [%lld,%lld]\n",
4138                     gtid, vec[0], lo, up));
4139       return;
4140     }
4141     iter_number = vec[0] - lo;
4142   } else if (st > 0) {
4143     if (vec[0] < lo || vec[0] > up) {
4144       KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
4145                     "bounds [%lld,%lld]\n",
4146                     gtid, vec[0], lo, up));
4147       return;
4148     }
4149     iter_number = (kmp_uint64)(vec[0] - lo) / st;
4150   } else { // negative increment
4151     if (vec[0] > lo || vec[0] < up) {
4152       KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
4153                     "bounds [%lld,%lld]\n",
4154                     gtid, vec[0], lo, up));
4155       return;
4156     }
4157     iter_number = (kmp_uint64)(lo - vec[0]) / (-st);
4158   }
4159 #if OMPT_SUPPORT && OMPT_OPTIONAL
4160   deps[0].variable.value = iter_number;
4161   deps[0].dependence_type = ompt_dependence_type_sink;
4162 #endif
4163   for (i = 1; i < num_dims; ++i) {
4164     kmp_int64 iter, ln;
4165     size_t j = i * 4;
4166     ln = pr_buf->th_doacross_info[j + 1];
4167     lo = pr_buf->th_doacross_info[j + 2];
4168     up = pr_buf->th_doacross_info[j + 3];
4169     st = pr_buf->th_doacross_info[j + 4];
4170     if (st == 1) {
4171       if (vec[i] < lo || vec[i] > up) {
4172         KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
4173                       "bounds [%lld,%lld]\n",
4174                       gtid, vec[i], lo, up));
4175         return;
4176       }
4177       iter = vec[i] - lo;
4178     } else if (st > 0) {
4179       if (vec[i] < lo || vec[i] > up) {
4180         KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
4181                       "bounds [%lld,%lld]\n",
4182                       gtid, vec[i], lo, up));
4183         return;
4184       }
4185       iter = (kmp_uint64)(vec[i] - lo) / st;
4186     } else { // st < 0
4187       if (vec[i] > lo || vec[i] < up) {
4188         KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
4189                       "bounds [%lld,%lld]\n",
4190                       gtid, vec[i], lo, up));
4191         return;
4192       }
4193       iter = (kmp_uint64)(lo - vec[i]) / (-st);
4194     }
4195     iter_number = iter + ln * iter_number;
4196 #if OMPT_SUPPORT && OMPT_OPTIONAL
4197     deps[i].variable.value = iter;
4198     deps[i].dependence_type = ompt_dependence_type_sink;
4199 #endif
4200   }
4201   shft = iter_number % 32; // use 32-bit granularity
4202   iter_number >>= 5; // divided by 32
4203   flag = 1 << shft;
4204   while ((flag & pr_buf->th_doacross_flags[iter_number]) == 0) {
4205     KMP_YIELD(TRUE);
4206   }
4207   KMP_MB();
4208 #if OMPT_SUPPORT && OMPT_OPTIONAL
4209   if (ompt_enabled.ompt_callback_dependences) {
4210     ompt_callbacks.ompt_callback(ompt_callback_dependences)(
4211         &(OMPT_CUR_TASK_INFO(th)->task_data), deps, (kmp_uint32)num_dims);
4212   }
4213 #endif
4214   KA_TRACE(20,
4215            ("__kmpc_doacross_wait() exit: T#%d wait for iter %lld completed\n",
4216             gtid, (iter_number << 5) + shft));
4217 }
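
// Worked example of the linearization above (illustrative numbers only):
// for a 2-D doacross with dims[0] = {lo=0, up=9, st=1} (range 10) and
// dims[1] = {lo=2, up=8, st=2} (range 4), the vector (i=3, j=6) gives
//   dim 0: iter = (3 - 0) / 1 = 3
//   dim 1: iter = (6 - 2) / 2 = 2, so iter_number = 2 + 4 * 3 = 14
// and the wait then spins on bit (14 % 32) = 14 of word (14 >> 5) = 0 in
// th_doacross_flags, the same bit __kmpc_doacross_post() sets for (3, 6).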
4218 
4219 void __kmpc_doacross_post(ident_t *loc, int gtid, const kmp_int64 *vec) {
4220   __kmp_assert_valid_gtid(gtid);
4221   kmp_int64 shft;
4222   size_t num_dims, i;
4223   kmp_uint32 flag;
4224   kmp_int64 iter_number; // iteration number of "collapsed" loop nest
4225   kmp_info_t *th = __kmp_threads[gtid];
4226   kmp_team_t *team = th->th.th_team;
4227   kmp_disp_t *pr_buf;
4228   kmp_int64 lo, st;
4229 
4230   KA_TRACE(20, ("__kmpc_doacross_post() enter: called T#%d\n", gtid));
4231   if (team->t.t_serialized) {
4232     KA_TRACE(20, ("__kmpc_doacross_post() exit: serialized team\n"));
4233     return; // no dependencies if team is serialized
4234   }
4235 
4236   // calculate sequential iteration number (same as in "wait" but no
4237   // out-of-bounds checks)
4238   pr_buf = th->th.th_dispatch;
4239   KMP_DEBUG_ASSERT(pr_buf->th_doacross_info != NULL);
4240   num_dims = (size_t)pr_buf->th_doacross_info[0];
4241   lo = pr_buf->th_doacross_info[2];
4242   st = pr_buf->th_doacross_info[4];
4243 #if OMPT_SUPPORT && OMPT_OPTIONAL
4244   ompt_dependence_t deps[num_dims];
4245 #endif
4246   if (st == 1) { // most common case
4247     iter_number = vec[0] - lo;
4248   } else if (st > 0) {
4249     iter_number = (kmp_uint64)(vec[0] - lo) / st;
4250   } else { // negative increment
4251     iter_number = (kmp_uint64)(lo - vec[0]) / (-st);
4252   }
4253 #if OMPT_SUPPORT && OMPT_OPTIONAL
4254   deps[0].variable.value = iter_number;
4255   deps[0].dependence_type = ompt_dependence_type_source;
4256 #endif
4257   for (i = 1; i < num_dims; ++i) {
4258     kmp_int64 iter, ln;
4259     size_t j = i * 4;
4260     ln = pr_buf->th_doacross_info[j + 1];
4261     lo = pr_buf->th_doacross_info[j + 2];
4262     st = pr_buf->th_doacross_info[j + 4];
4263     if (st == 1) {
4264       iter = vec[i] - lo;
4265     } else if (st > 0) {
4266       iter = (kmp_uint64)(vec[i] - lo) / st;
4267     } else { // st < 0
4268       iter = (kmp_uint64)(lo - vec[i]) / (-st);
4269     }
4270     iter_number = iter + ln * iter_number;
4271 #if OMPT_SUPPORT && OMPT_OPTIONAL
4272     deps[i].variable.value = iter;
4273     deps[i].dependence_type = ompt_dependence_type_source;
4274 #endif
4275   }
4276 #if OMPT_SUPPORT && OMPT_OPTIONAL
4277   if (ompt_enabled.ompt_callback_dependences) {
4278     ompt_callbacks.ompt_callback(ompt_callback_dependences)(
4279         &(OMPT_CUR_TASK_INFO(th)->task_data), deps, (kmp_uint32)num_dims);
4280   }
4281 #endif
4282   shft = iter_number % 32; // use 32-bit granularity
4283   iter_number >>= 5; // divided by 32
4284   flag = 1 << shft;
4285   KMP_MB();
4286   if ((flag & pr_buf->th_doacross_flags[iter_number]) == 0)
4287     KMP_TEST_THEN_OR32(&pr_buf->th_doacross_flags[iter_number], flag);
4288   KA_TRACE(20, ("__kmpc_doacross_post() exit: T#%d iter %lld posted\n", gtid,
4289                 (iter_number << 5) + shft));
4290 }
4291 
4292 void __kmpc_doacross_fini(ident_t *loc, int gtid) {
4293   __kmp_assert_valid_gtid(gtid);
4294   kmp_int32 num_done;
4295   kmp_info_t *th = __kmp_threads[gtid];
4296   kmp_team_t *team = th->th.th_team;
4297   kmp_disp_t *pr_buf = th->th.th_dispatch;
4298 
4299   KA_TRACE(20, ("__kmpc_doacross_fini() enter: called T#%d\n", gtid));
4300   if (team->t.t_serialized) {
4301     KA_TRACE(20, ("__kmpc_doacross_fini() exit: serialized team %p\n", team));
4302     return; // nothing to do
4303   }
4304   num_done =
4305       KMP_TEST_THEN_INC32((kmp_uintptr_t)(pr_buf->th_doacross_info[1])) + 1;
4306   if (num_done == th->th.th_team_nproc) {
4307     // we are the last thread, need to free shared resources
4308     int idx = pr_buf->th_doacross_buf_idx - 1;
4309     dispatch_shared_info_t *sh_buf =
4310         &team->t.t_disp_buffer[idx % __kmp_dispatch_num_buffers];
4311     KMP_DEBUG_ASSERT(pr_buf->th_doacross_info[1] ==
4312                      (kmp_int64)&sh_buf->doacross_num_done);
4313     KMP_DEBUG_ASSERT(num_done == sh_buf->doacross_num_done);
4314     KMP_DEBUG_ASSERT(idx == sh_buf->doacross_buf_idx);
4315     __kmp_thread_free(th, CCAST(kmp_uint32 *, sh_buf->doacross_flags));
4316     sh_buf->doacross_flags = NULL;
4317     sh_buf->doacross_num_done = 0;
4318     sh_buf->doacross_buf_idx +=
4319         __kmp_dispatch_num_buffers; // free buffer for future re-use
4320   }
4321   // free private resources (need to keep buffer index forever)
4322   pr_buf->th_doacross_flags = NULL;
4323   __kmp_thread_free(th, (void *)pr_buf->th_doacross_info);
4324   pr_buf->th_doacross_info = NULL;
4325   KA_TRACE(20, ("__kmpc_doacross_fini() exit: T#%d\n", gtid));
4326 }
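
// A hedged sketch (kept out of the build) of how a doacross loop such as
//   #pragma omp for ordered(1)
//   for (i = 2; i < 9; i += 2) { ... depend(sink: i - 2) ... depend(source) ... }
// is expected to drive these entry points; the bounds mirror the inclusive
// example in the __kmpc_doacross_init() description, everything else is a
// hypothetical placeholder.
#if 0
static void __example_doacross(ident_t *loc, int gtid) {
  struct kmp_dim dims[1];
  dims[0].lo = 2; // inclusive lower bound
  dims[0].up = 8; // inclusive upper bound
  dims[0].st = 2; // stride
  __kmpc_doacross_init(loc, gtid, /*num_dims=*/1, dims);
  for (kmp_int64 i = 2; i < 9; i += 2) { // iterations given to this thread
    kmp_int64 vec[1];
    vec[0] = i - 2; // depend(sink: i - 2); out-of-range sinks (here the first
                    // iteration) are simply ignored by the wait
    __kmpc_doacross_wait(loc, gtid, vec);
    /* ... ordered body ... */
    vec[0] = i; // depend(source)
    __kmpc_doacross_post(loc, gtid, vec);
  }
  __kmpc_doacross_fini(loc, gtid);
}
#endif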
4327 
4328 /* OpenMP 5.1 Memory Management routines */
4329 void *omp_alloc(size_t size, omp_allocator_handle_t allocator) {
4330   return __kmp_alloc(__kmp_entry_gtid(), 0, size, allocator);
4331 }
4332 
4333 void *omp_aligned_alloc(size_t align, size_t size,
4334                         omp_allocator_handle_t allocator) {
4335   return __kmp_alloc(__kmp_entry_gtid(), align, size, allocator);
4336 }
4337 
4338 void *omp_calloc(size_t nmemb, size_t size, omp_allocator_handle_t allocator) {
4339   return __kmp_calloc(__kmp_entry_gtid(), 0, nmemb, size, allocator);
4340 }
4341 
4342 void *omp_aligned_calloc(size_t align, size_t nmemb, size_t size,
4343                          omp_allocator_handle_t allocator) {
4344   return __kmp_calloc(__kmp_entry_gtid(), align, nmemb, size, allocator);
4345 }
4346 
4347 void *omp_realloc(void *ptr, size_t size, omp_allocator_handle_t allocator,
4348                   omp_allocator_handle_t free_allocator) {
4349   return __kmp_realloc(__kmp_entry_gtid(), ptr, size, allocator,
4350                         free_allocator);
4351 }
4352 
4353 void omp_free(void *ptr, omp_allocator_handle_t allocator) {
4354   ___kmpc_free(__kmp_entry_gtid(), ptr, allocator);
4355 }
4356 /* end of OpenMP 5.1 Memory Management routines */
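
// Hedged usage sketch (kept out of the build) for the allocator entry points
// above; omp_default_mem_alloc is one of the predefined allocator handles
// declared in omp.h, the remaining names are placeholders.
#if 0
static void __example_omp_alloc_usage(void) {
  int *buf = (int *)omp_aligned_alloc(64, 1024 * sizeof(int),
                                      omp_default_mem_alloc);
  if (buf != NULL) {
    buf[0] = 42; // use the allocation
    omp_free(buf, omp_default_mem_alloc);
  }
}
#endif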
4357 
4358 int __kmpc_get_target_offload(void) {
4359   if (!__kmp_init_serial) {
4360     __kmp_serial_initialize();
4361   }
4362   return __kmp_target_offload;
4363 }
4364 
4365 int __kmpc_pause_resource(kmp_pause_status_t level) {
4366   if (!__kmp_init_serial) {
4367     return 1; // Can't pause if runtime is not initialized
4368   }
4369   return __kmp_pause_resource(level);
4370 }
4371 
4372 void __kmpc_error(ident_t *loc, int severity, const char *message) {
4373   if (!__kmp_init_serial)
4374     __kmp_serial_initialize();
4375 
4376   KMP_ASSERT(severity == severity_warning || severity == severity_fatal);
4377 
4378 #if OMPT_SUPPORT
4379   if (ompt_enabled.enabled && ompt_enabled.ompt_callback_error) {
4380     ompt_callbacks.ompt_callback(ompt_callback_error)(
4381         (ompt_severity_t)severity, message, KMP_STRLEN(message),
4382         OMPT_GET_RETURN_ADDRESS(0));
4383   }
4384 #endif // OMPT_SUPPORT
4385 
4386   char *src_loc;
4387   if (loc && loc->psource) {
4388     kmp_str_loc_t str_loc = __kmp_str_loc_init(loc->psource, false);
4389     src_loc =
4390         __kmp_str_format("%s:%d:%d", str_loc.file, str_loc.line, str_loc.col);
4391     __kmp_str_loc_free(&str_loc);
4392   } else {
4393     src_loc = __kmp_str_format("unknown");
4394   }
4395 
4396   if (severity == severity_warning)
4397     KMP_WARNING(UserDirectedWarning, src_loc, message);
4398   else
4399     KMP_FATAL(UserDirectedError, src_loc, message);
4400 
4401   __kmp_str_free(&src_loc);
4402 }
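
// Hedged note: a directive such as
//   #pragma omp error at(execution) severity(warning) message("check input")
// is expected to be lowered to a call like
//   __kmpc_error(&loc, severity_warning, "check input");
// so the runtime formats the source location and raises the warning (or the
// fatal error for severity_fatal) as shown above.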
4403 
4404 // Mark begin of scope directive.
4405 void __kmpc_scope(ident_t *loc, kmp_int32 gtid, void *reserved) {
4406 // reserved is for extension of scope directive and not used.
4407 #if OMPT_SUPPORT && OMPT_OPTIONAL
4408   if (ompt_enabled.enabled && ompt_enabled.ompt_callback_work) {
4409     kmp_team_t *team = __kmp_threads[gtid]->th.th_team;
4410     int tid = __kmp_tid_from_gtid(gtid);
4411     ompt_callbacks.ompt_callback(ompt_callback_work)(
4412         ompt_work_scope, ompt_scope_begin,
4413         &(team->t.ompt_team_info.parallel_data),
4414         &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data), 1,
4415         OMPT_GET_RETURN_ADDRESS(0));
4416   }
4417 #endif // OMPT_SUPPORT && OMPT_OPTIONAL
4418 }
4419 
4420 // Mark end of scope directive
4421 void __kmpc_end_scope(ident_t *loc, kmp_int32 gtid, void *reserved) {
4422 // reserved is for extension of scope directive and not used.
4423 #if OMPT_SUPPORT && OMPT_OPTIONAL
4424   if (ompt_enabled.enabled && ompt_enabled.ompt_callback_work) {
4425     kmp_team_t *team = __kmp_threads[gtid]->th.th_team;
4426     int tid = __kmp_tid_from_gtid(gtid);
4427     ompt_callbacks.ompt_callback(ompt_callback_work)(
4428         ompt_work_scope, ompt_scope_end,
4429         &(team->t.ompt_team_info.parallel_data),
4430         &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data), 1,
4431         OMPT_GET_RETURN_ADDRESS(0));
4432   }
4433 #endif // OMPT_SUPPORT && OMPT_OPTIONAL
4434 }
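
// Hedged note: a scope construct such as
//   #pragma omp scope
//   { ... }
// is expected to be bracketed by __kmpc_scope(&loc, gtid, NULL) on entry and
// __kmpc_end_scope(&loc, gtid, NULL) on exit; these calls matter here only for
// the OMPT ompt_work_scope begin/end events dispatched above.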
4435 
4436 #ifdef KMP_USE_VERSION_SYMBOLS
4437 // For GOMP compatibility there are two versions of each omp_* API.
4438 // One is the plain C symbol and one is the Fortran symbol with an appended
4439 // underscore. When we implement a specific ompc_* version of an omp_*
4440 // function, we want the plain GOMP versioned symbol to alias the ompc_* version
4441 // instead of the Fortran versions in kmp_ftn_entry.h
4442 extern "C" {
4443 // Have to undef these from omp.h so they aren't translated into
4444 // their ompc counterparts in the KMP_VERSION_OMPC_SYMBOL macros below
4445 #ifdef omp_set_affinity_format
4446 #undef omp_set_affinity_format
4447 #endif
4448 #ifdef omp_get_affinity_format
4449 #undef omp_get_affinity_format
4450 #endif
4451 #ifdef omp_display_affinity
4452 #undef omp_display_affinity
4453 #endif
4454 #ifdef omp_capture_affinity
4455 #undef omp_capture_affinity
4456 #endif
4457 KMP_VERSION_OMPC_SYMBOL(ompc_set_affinity_format, omp_set_affinity_format, 50,
4458                         "OMP_5.0");
4459 KMP_VERSION_OMPC_SYMBOL(ompc_get_affinity_format, omp_get_affinity_format, 50,
4460                         "OMP_5.0");
4461 KMP_VERSION_OMPC_SYMBOL(ompc_display_affinity, omp_display_affinity, 50,
4462                         "OMP_5.0");
4463 KMP_VERSION_OMPC_SYMBOL(ompc_capture_affinity, omp_capture_affinity, 50,
4464                         "OMP_5.0");
4465 } // extern "C"
4466 #endif
4467