xref: /freebsd/contrib/llvm-project/openmp/runtime/src/kmp_csupport.cpp (revision 13ec1e3155c7e9bf037b12af186351b7fa9b9450)
1 /*
2  * kmp_csupport.cpp -- kfront linkage support for OpenMP.
3  */
4 
5 //===----------------------------------------------------------------------===//
6 //
7 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8 // See https://llvm.org/LICENSE.txt for license information.
9 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #define __KMP_IMP
14 #include "omp.h" /* extern "C" declarations of user-visible routines */
15 #include "kmp.h"
16 #include "kmp_error.h"
17 #include "kmp_i18n.h"
18 #include "kmp_itt.h"
19 #include "kmp_lock.h"
20 #include "kmp_stats.h"
21 #include "ompt-specific.h"
22 
23 #define MAX_MESSAGE 512
24 
25 // flags will be used in future, e.g. to implement openmp_strict library
26 // restrictions
27 
28 /*!
29  * @ingroup STARTUP_SHUTDOWN
30  * @param loc   in   source location information
31  * @param flags in   for future use (currently ignored)
32  *
33  * Initialize the runtime library. This call is optional; if it is not made then
34  * it will be implicitly called by attempts to use other library functions.
35  */
36 void __kmpc_begin(ident_t *loc, kmp_int32 flags) {
37   // By default __kmpc_begin() is no-op.
38   char *env;
39   if ((env = getenv("KMP_INITIAL_THREAD_BIND")) != NULL &&
40       __kmp_str_match_true(env)) {
41     __kmp_middle_initialize();
42     __kmp_assign_root_init_mask();
43     KC_TRACE(10, ("__kmpc_begin: middle initialization called\n"));
44   } else if (__kmp_ignore_mppbeg() == FALSE) {
45     // By default __kmp_ignore_mppbeg() returns TRUE.
46     __kmp_internal_begin();
47     KC_TRACE(10, ("__kmpc_begin: called\n"));
48   }
49 }
50 
51 /*!
52  * @ingroup STARTUP_SHUTDOWN
53  * @param loc source location information
54  *
55  * Shutdown the runtime library. This is also optional, and even if called will
56  * not do anything unless the `KMP_IGNORE_MPPEND` environment variable is set to
57  * zero.
58  */
59 void __kmpc_end(ident_t *loc) {
60   // By default, __kmp_ignore_mppend() returns TRUE which makes __kmpc_end()
61   // call no-op. However, this can be overridden with KMP_IGNORE_MPPEND
62   // environment variable. If KMP_IGNORE_MPPEND is 0, __kmp_ignore_mppend()
63   // returns FALSE and __kmpc_end() will unregister this root (it can cause
64   // library shut down).
65   if (__kmp_ignore_mppend() == FALSE) {
66     KC_TRACE(10, ("__kmpc_end: called\n"));
67     KA_TRACE(30, ("__kmpc_end\n"));
68 
69     __kmp_internal_end_thread(-1);
70   }
71 #if KMP_OS_WINDOWS && OMPT_SUPPORT
72   // Normal exit process on Windows does not allow worker threads of the final
73   // parallel region to finish reporting their events, so shutting down the
74   // library here fixes the issue at least for the cases where __kmpc_end() is
75   // placed properly.
76   if (ompt_enabled.enabled)
77     __kmp_internal_end_library(__kmp_gtid_get_specific());
78 #endif
79 }
80 
81 /*!
82 @ingroup THREAD_STATES
83 @param loc Source location information.
84 @return The global thread index of the active thread.
85 
86 This function can be called in any context.
87 
88 If the runtime has ony been entered at the outermost level from a
89 single (necessarily non-OpenMP<sup>*</sup>) thread, then the thread number is
90 that which would be returned by omp_get_thread_num() in the outermost
91 active parallel construct. (Or zero if there is no active parallel
92 construct, since the primary thread is necessarily thread zero).
93 
94 If multiple non-OpenMP threads all enter an OpenMP construct then this
95 will be a unique thread identifier among all the threads created by
96 the OpenMP runtime (but the value cannot be defined in terms of
97 OpenMP thread ids returned by omp_get_thread_num()).
98 */
99 kmp_int32 __kmpc_global_thread_num(ident_t *loc) {
100   kmp_int32 gtid = __kmp_entry_gtid();
101 
102   KC_TRACE(10, ("__kmpc_global_thread_num: T#%d\n", gtid));
103 
104   return gtid;
105 }
106 
107 /*!
108 @ingroup THREAD_STATES
109 @param loc Source location information.
110 @return The number of threads under control of the OpenMP<sup>*</sup> runtime
111 
112 This function can be called in any context.
113 It returns the total number of threads under the control of the OpenMP runtime.
114 That is not a number that can be determined by any OpenMP standard calls, since
115 the library may be called from more than one non-OpenMP thread, and this
116 reflects the total over all such calls. Similarly the runtime maintains
117 underlying threads even when they are not active (since the cost of creating
118 and destroying OS threads is high), this call counts all such threads even if
119 they are not waiting for work.
120 */
121 kmp_int32 __kmpc_global_num_threads(ident_t *loc) {
122   KC_TRACE(10,
123            ("__kmpc_global_num_threads: num_threads = %d\n", __kmp_all_nth));
124 
125   return TCR_4(__kmp_all_nth);
126 }
127 
128 /*!
129 @ingroup THREAD_STATES
130 @param loc Source location information.
131 @return The thread number of the calling thread in the innermost active parallel
132 construct.
133 */
134 kmp_int32 __kmpc_bound_thread_num(ident_t *loc) {
135   KC_TRACE(10, ("__kmpc_bound_thread_num: called\n"));
136   return __kmp_tid_from_gtid(__kmp_entry_gtid());
137 }
138 
139 /*!
140 @ingroup THREAD_STATES
141 @param loc Source location information.
142 @return The number of threads in the innermost active parallel construct.
143 */
144 kmp_int32 __kmpc_bound_num_threads(ident_t *loc) {
145   KC_TRACE(10, ("__kmpc_bound_num_threads: called\n"));
146 
147   return __kmp_entry_thread()->th.th_team->t.t_nproc;
148 }
149 
150 /*!
151  * @ingroup DEPRECATED
152  * @param loc location description
153  *
154  * This function need not be called. It always returns TRUE.
155  */
156 kmp_int32 __kmpc_ok_to_fork(ident_t *loc) {
157 #ifndef KMP_DEBUG
158 
159   return TRUE;
160 
161 #else
162 
163   const char *semi2;
164   const char *semi3;
165   int line_no;
166 
167   if (__kmp_par_range == 0) {
168     return TRUE;
169   }
170   semi2 = loc->psource;
171   if (semi2 == NULL) {
172     return TRUE;
173   }
174   semi2 = strchr(semi2, ';');
175   if (semi2 == NULL) {
176     return TRUE;
177   }
178   semi2 = strchr(semi2 + 1, ';');
179   if (semi2 == NULL) {
180     return TRUE;
181   }
182   if (__kmp_par_range_filename[0]) {
183     const char *name = semi2 - 1;
184     while ((name > loc->psource) && (*name != '/') && (*name != ';')) {
185       name--;
186     }
187     if ((*name == '/') || (*name == ';')) {
188       name++;
189     }
190     if (strncmp(__kmp_par_range_filename, name, semi2 - name)) {
191       return __kmp_par_range < 0;
192     }
193   }
194   semi3 = strchr(semi2 + 1, ';');
195   if (__kmp_par_range_routine[0]) {
196     if ((semi3 != NULL) && (semi3 > semi2) &&
197         (strncmp(__kmp_par_range_routine, semi2 + 1, semi3 - semi2 - 1))) {
198       return __kmp_par_range < 0;
199     }
200   }
201   if (KMP_SSCANF(semi3 + 1, "%d", &line_no) == 1) {
202     if ((line_no >= __kmp_par_range_lb) && (line_no <= __kmp_par_range_ub)) {
203       return __kmp_par_range > 0;
204     }
205     return __kmp_par_range < 0;
206   }
207   return TRUE;
208 
209 #endif /* KMP_DEBUG */
210 }
211 
212 /*!
213 @ingroup THREAD_STATES
214 @param loc Source location information.
215 @return 1 if this thread is executing inside an active parallel region, zero if
216 not.
217 */
218 kmp_int32 __kmpc_in_parallel(ident_t *loc) {
219   return __kmp_entry_thread()->th.th_root->r.r_active;
220 }
221 
222 /*!
223 @ingroup PARALLEL
224 @param loc source location information
225 @param global_tid global thread number
226 @param num_threads number of threads requested for this parallel construct
227 
228 Set the number of threads to be used by the next fork spawned by this thread.
229 This call is only required if the parallel construct has a `num_threads` clause.
230 */
231 void __kmpc_push_num_threads(ident_t *loc, kmp_int32 global_tid,
232                              kmp_int32 num_threads) {
233   KA_TRACE(20, ("__kmpc_push_num_threads: enter T#%d num_threads=%d\n",
234                 global_tid, num_threads));
235   __kmp_assert_valid_gtid(global_tid);
236   __kmp_push_num_threads(loc, global_tid, num_threads);
237 }
238 
239 void __kmpc_pop_num_threads(ident_t *loc, kmp_int32 global_tid) {
240   KA_TRACE(20, ("__kmpc_pop_num_threads: enter\n"));
241   /* the num_threads are automatically popped */
242 }
243 
244 void __kmpc_push_proc_bind(ident_t *loc, kmp_int32 global_tid,
245                            kmp_int32 proc_bind) {
246   KA_TRACE(20, ("__kmpc_push_proc_bind: enter T#%d proc_bind=%d\n", global_tid,
247                 proc_bind));
248   __kmp_assert_valid_gtid(global_tid);
249   __kmp_push_proc_bind(loc, global_tid, (kmp_proc_bind_t)proc_bind);
250 }
251 
252 /*!
253 @ingroup PARALLEL
254 @param loc  source location information
255 @param argc  total number of arguments in the ellipsis
256 @param microtask  pointer to callback routine consisting of outlined parallel
257 construct
258 @param ...  pointers to shared variables that aren't global
259 
260 Do the actual fork and call the microtask in the relevant number of threads.
261 */
262 void __kmpc_fork_call(ident_t *loc, kmp_int32 argc, kmpc_micro microtask, ...) {
263   int gtid = __kmp_entry_gtid();
264 
265 #if (KMP_STATS_ENABLED)
266   // If we were in a serial region, then stop the serial timer, record
267   // the event, and start parallel region timer
268   stats_state_e previous_state = KMP_GET_THREAD_STATE();
269   if (previous_state == stats_state_e::SERIAL_REGION) {
270     KMP_EXCHANGE_PARTITIONED_TIMER(OMP_parallel_overhead);
271   } else {
272     KMP_PUSH_PARTITIONED_TIMER(OMP_parallel_overhead);
273   }
274   int inParallel = __kmpc_in_parallel(loc);
275   if (inParallel) {
276     KMP_COUNT_BLOCK(OMP_NESTED_PARALLEL);
277   } else {
278     KMP_COUNT_BLOCK(OMP_PARALLEL);
279   }
280 #endif
281 
282   // maybe to save thr_state is enough here
283   {
284     va_list ap;
285     va_start(ap, microtask);
286 
287 #if OMPT_SUPPORT
288     ompt_frame_t *ompt_frame;
289     if (ompt_enabled.enabled) {
290       kmp_info_t *master_th = __kmp_threads[gtid];
291       kmp_team_t *parent_team = master_th->th.th_team;
292       ompt_lw_taskteam_t *lwt = parent_team->t.ompt_serialized_team_info;
293       if (lwt)
294         ompt_frame = &(lwt->ompt_task_info.frame);
295       else {
296         int tid = __kmp_tid_from_gtid(gtid);
297         ompt_frame = &(
298             parent_team->t.t_implicit_task_taskdata[tid].ompt_task_info.frame);
299       }
300       ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
301     }
302     OMPT_STORE_RETURN_ADDRESS(gtid);
303 #endif
304 
305 #if INCLUDE_SSC_MARKS
306     SSC_MARK_FORKING();
307 #endif
308     __kmp_fork_call(loc, gtid, fork_context_intel, argc,
309                     VOLATILE_CAST(microtask_t) microtask, // "wrapped" task
310                     VOLATILE_CAST(launch_t) __kmp_invoke_task_func,
311                     kmp_va_addr_of(ap));
312 #if INCLUDE_SSC_MARKS
313     SSC_MARK_JOINING();
314 #endif
315     __kmp_join_call(loc, gtid
316 #if OMPT_SUPPORT
317                     ,
318                     fork_context_intel
319 #endif
320     );
321 
322     va_end(ap);
323   }
324 
325 #if KMP_STATS_ENABLED
326   if (previous_state == stats_state_e::SERIAL_REGION) {
327     KMP_EXCHANGE_PARTITIONED_TIMER(OMP_serial);
328     KMP_SET_THREAD_STATE(previous_state);
329   } else {
330     KMP_POP_PARTITIONED_TIMER();
331   }
332 #endif // KMP_STATS_ENABLED
333 }
334 
335 /*!
336 @ingroup PARALLEL
337 @param loc source location information
338 @param global_tid global thread number
339 @param num_teams number of teams requested for the teams construct
340 @param num_threads number of threads per team requested for the teams construct
341 
342 Set the number of teams to be used by the teams construct.
343 This call is only required if the teams construct has a `num_teams` clause
344 or a `thread_limit` clause (or both).
345 */
346 void __kmpc_push_num_teams(ident_t *loc, kmp_int32 global_tid,
347                            kmp_int32 num_teams, kmp_int32 num_threads) {
348   KA_TRACE(20,
349            ("__kmpc_push_num_teams: enter T#%d num_teams=%d num_threads=%d\n",
350             global_tid, num_teams, num_threads));
351   __kmp_assert_valid_gtid(global_tid);
352   __kmp_push_num_teams(loc, global_tid, num_teams, num_threads);
353 }
354 
355 /*!
356 @ingroup PARALLEL
357 @param loc source location information
358 @param global_tid global thread number
359 @param num_teams_lo lower bound on number of teams requested for the teams
360 construct
361 @param num_teams_up upper bound on number of teams requested for the teams
362 construct
363 @param num_threads number of threads per team requested for the teams construct
364 
365 Set the number of teams to be used by the teams construct. The number of initial
366 teams cretaed will be greater than or equal to the lower bound and less than or
367 equal to the upper bound.
368 This call is only required if the teams construct has a `num_teams` clause
369 or a `thread_limit` clause (or both).
370 */
371 void __kmpc_push_num_teams_51(ident_t *loc, kmp_int32 global_tid,
372                               kmp_int32 num_teams_lb, kmp_int32 num_teams_ub,
373                               kmp_int32 num_threads) {
374   KA_TRACE(20, ("__kmpc_push_num_teams_51: enter T#%d num_teams_lb=%d"
375                 " num_teams_ub=%d num_threads=%d\n",
376                 global_tid, num_teams_lb, num_teams_ub, num_threads));
377   __kmp_assert_valid_gtid(global_tid);
378   __kmp_push_num_teams_51(loc, global_tid, num_teams_lb, num_teams_ub,
379                           num_threads);
380 }
381 
382 /*!
383 @ingroup PARALLEL
384 @param loc  source location information
385 @param argc  total number of arguments in the ellipsis
386 @param microtask  pointer to callback routine consisting of outlined teams
387 construct
388 @param ...  pointers to shared variables that aren't global
389 
390 Do the actual fork and call the microtask in the relevant number of threads.
391 */
392 void __kmpc_fork_teams(ident_t *loc, kmp_int32 argc, kmpc_micro microtask,
393                        ...) {
394   int gtid = __kmp_entry_gtid();
395   kmp_info_t *this_thr = __kmp_threads[gtid];
396   va_list ap;
397   va_start(ap, microtask);
398 
399 #if KMP_STATS_ENABLED
400   KMP_COUNT_BLOCK(OMP_TEAMS);
401   stats_state_e previous_state = KMP_GET_THREAD_STATE();
402   if (previous_state == stats_state_e::SERIAL_REGION) {
403     KMP_EXCHANGE_PARTITIONED_TIMER(OMP_teams_overhead);
404   } else {
405     KMP_PUSH_PARTITIONED_TIMER(OMP_teams_overhead);
406   }
407 #endif
408 
409   // remember teams entry point and nesting level
410   this_thr->th.th_teams_microtask = microtask;
411   this_thr->th.th_teams_level =
412       this_thr->th.th_team->t.t_level; // AC: can be >0 on host
413 
414 #if OMPT_SUPPORT
415   kmp_team_t *parent_team = this_thr->th.th_team;
416   int tid = __kmp_tid_from_gtid(gtid);
417   if (ompt_enabled.enabled) {
418     parent_team->t.t_implicit_task_taskdata[tid]
419         .ompt_task_info.frame.enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
420   }
421   OMPT_STORE_RETURN_ADDRESS(gtid);
422 #endif
423 
424   // check if __kmpc_push_num_teams called, set default number of teams
425   // otherwise
426   if (this_thr->th.th_teams_size.nteams == 0) {
427     __kmp_push_num_teams(loc, gtid, 0, 0);
428   }
429   KMP_DEBUG_ASSERT(this_thr->th.th_set_nproc >= 1);
430   KMP_DEBUG_ASSERT(this_thr->th.th_teams_size.nteams >= 1);
431   KMP_DEBUG_ASSERT(this_thr->th.th_teams_size.nth >= 1);
432 
433   __kmp_fork_call(
434       loc, gtid, fork_context_intel, argc,
435       VOLATILE_CAST(microtask_t) __kmp_teams_master, // "wrapped" task
436       VOLATILE_CAST(launch_t) __kmp_invoke_teams_master, kmp_va_addr_of(ap));
437   __kmp_join_call(loc, gtid
438 #if OMPT_SUPPORT
439                   ,
440                   fork_context_intel
441 #endif
442   );
443 
444   // Pop current CG root off list
445   KMP_DEBUG_ASSERT(this_thr->th.th_cg_roots);
446   kmp_cg_root_t *tmp = this_thr->th.th_cg_roots;
447   this_thr->th.th_cg_roots = tmp->up;
448   KA_TRACE(100, ("__kmpc_fork_teams: Thread %p popping node %p and moving up"
449                  " to node %p. cg_nthreads was %d\n",
450                  this_thr, tmp, this_thr->th.th_cg_roots, tmp->cg_nthreads));
451   KMP_DEBUG_ASSERT(tmp->cg_nthreads);
452   int i = tmp->cg_nthreads--;
453   if (i == 1) { // check is we are the last thread in CG (not always the case)
454     __kmp_free(tmp);
455   }
456   // Restore current task's thread_limit from CG root
457   KMP_DEBUG_ASSERT(this_thr->th.th_cg_roots);
458   this_thr->th.th_current_task->td_icvs.thread_limit =
459       this_thr->th.th_cg_roots->cg_thread_limit;
460 
461   this_thr->th.th_teams_microtask = NULL;
462   this_thr->th.th_teams_level = 0;
463   *(kmp_int64 *)(&this_thr->th.th_teams_size) = 0L;
464   va_end(ap);
465 #if KMP_STATS_ENABLED
466   if (previous_state == stats_state_e::SERIAL_REGION) {
467     KMP_EXCHANGE_PARTITIONED_TIMER(OMP_serial);
468     KMP_SET_THREAD_STATE(previous_state);
469   } else {
470     KMP_POP_PARTITIONED_TIMER();
471   }
472 #endif // KMP_STATS_ENABLED
473 }
474 
475 // I don't think this function should ever have been exported.
476 // The __kmpc_ prefix was misapplied.  I'm fairly certain that no generated
477 // openmp code ever called it, but it's been exported from the RTL for so
478 // long that I'm afraid to remove the definition.
479 int __kmpc_invoke_task_func(int gtid) { return __kmp_invoke_task_func(gtid); }
480 
481 /*!
482 @ingroup PARALLEL
483 @param loc  source location information
484 @param global_tid  global thread number
485 
486 Enter a serialized parallel construct. This interface is used to handle a
487 conditional parallel region, like this,
488 @code
489 #pragma omp parallel if (condition)
490 @endcode
491 when the condition is false.
492 */
493 void __kmpc_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
494   // The implementation is now in kmp_runtime.cpp so that it can share static
495   // functions with kmp_fork_call since the tasks to be done are similar in
496   // each case.
497   __kmp_assert_valid_gtid(global_tid);
498 #if OMPT_SUPPORT
499   OMPT_STORE_RETURN_ADDRESS(global_tid);
500 #endif
501   __kmp_serialized_parallel(loc, global_tid);
502 }
503 
504 /*!
505 @ingroup PARALLEL
506 @param loc  source location information
507 @param global_tid  global thread number
508 
509 Leave a serialized parallel construct.
510 */
511 void __kmpc_end_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
512   kmp_internal_control_t *top;
513   kmp_info_t *this_thr;
514   kmp_team_t *serial_team;
515 
516   KC_TRACE(10,
517            ("__kmpc_end_serialized_parallel: called by T#%d\n", global_tid));
518 
519   /* skip all this code for autopar serialized loops since it results in
520      unacceptable overhead */
521   if (loc != NULL && (loc->flags & KMP_IDENT_AUTOPAR))
522     return;
523 
524   // Not autopar code
525   __kmp_assert_valid_gtid(global_tid);
526   if (!TCR_4(__kmp_init_parallel))
527     __kmp_parallel_initialize();
528 
529   __kmp_resume_if_soft_paused();
530 
531   this_thr = __kmp_threads[global_tid];
532   serial_team = this_thr->th.th_serial_team;
533 
534   kmp_task_team_t *task_team = this_thr->th.th_task_team;
535   // we need to wait for the proxy tasks before finishing the thread
536   if (task_team != NULL && task_team->tt.tt_found_proxy_tasks)
537     __kmp_task_team_wait(this_thr, serial_team USE_ITT_BUILD_ARG(NULL));
538 
539   KMP_MB();
540   KMP_DEBUG_ASSERT(serial_team);
541   KMP_ASSERT(serial_team->t.t_serialized);
542   KMP_DEBUG_ASSERT(this_thr->th.th_team == serial_team);
543   KMP_DEBUG_ASSERT(serial_team != this_thr->th.th_root->r.r_root_team);
544   KMP_DEBUG_ASSERT(serial_team->t.t_threads);
545   KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
546 
547 #if OMPT_SUPPORT
548   if (ompt_enabled.enabled &&
549       this_thr->th.ompt_thread_info.state != ompt_state_overhead) {
550     OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame = ompt_data_none;
551     if (ompt_enabled.ompt_callback_implicit_task) {
552       ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
553           ompt_scope_end, NULL, OMPT_CUR_TASK_DATA(this_thr), 1,
554           OMPT_CUR_TASK_INFO(this_thr)->thread_num, ompt_task_implicit);
555     }
556 
557     // reset clear the task id only after unlinking the task
558     ompt_data_t *parent_task_data;
559     __ompt_get_task_info_internal(1, NULL, &parent_task_data, NULL, NULL, NULL);
560 
561     if (ompt_enabled.ompt_callback_parallel_end) {
562       ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
563           &(serial_team->t.ompt_team_info.parallel_data), parent_task_data,
564           ompt_parallel_invoker_program | ompt_parallel_team,
565           OMPT_LOAD_RETURN_ADDRESS(global_tid));
566     }
567     __ompt_lw_taskteam_unlink(this_thr);
568     this_thr->th.ompt_thread_info.state = ompt_state_overhead;
569   }
570 #endif
571 
572   /* If necessary, pop the internal control stack values and replace the team
573    * values */
574   top = serial_team->t.t_control_stack_top;
575   if (top && top->serial_nesting_level == serial_team->t.t_serialized) {
576     copy_icvs(&serial_team->t.t_threads[0]->th.th_current_task->td_icvs, top);
577     serial_team->t.t_control_stack_top = top->next;
578     __kmp_free(top);
579   }
580 
581   // if( serial_team -> t.t_serialized > 1 )
582   serial_team->t.t_level--;
583 
584   /* pop dispatch buffers stack */
585   KMP_DEBUG_ASSERT(serial_team->t.t_dispatch->th_disp_buffer);
586   {
587     dispatch_private_info_t *disp_buffer =
588         serial_team->t.t_dispatch->th_disp_buffer;
589     serial_team->t.t_dispatch->th_disp_buffer =
590         serial_team->t.t_dispatch->th_disp_buffer->next;
591     __kmp_free(disp_buffer);
592   }
593   this_thr->th.th_def_allocator = serial_team->t.t_def_allocator; // restore
594 
595   --serial_team->t.t_serialized;
596   if (serial_team->t.t_serialized == 0) {
597 
598     /* return to the parallel section */
599 
600 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
601     if (__kmp_inherit_fp_control && serial_team->t.t_fp_control_saved) {
602       __kmp_clear_x87_fpu_status_word();
603       __kmp_load_x87_fpu_control_word(&serial_team->t.t_x87_fpu_control_word);
604       __kmp_load_mxcsr(&serial_team->t.t_mxcsr);
605     }
606 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
607 
608 #if OMPD_SUPPORT
609     if (ompd_state & OMPD_ENABLE_BP)
610       ompd_bp_parallel_end();
611 #endif
612 
613     this_thr->th.th_team = serial_team->t.t_parent;
614     this_thr->th.th_info.ds.ds_tid = serial_team->t.t_master_tid;
615 
616     /* restore values cached in the thread */
617     this_thr->th.th_team_nproc = serial_team->t.t_parent->t.t_nproc; /*  JPH */
618     this_thr->th.th_team_master =
619         serial_team->t.t_parent->t.t_threads[0]; /* JPH */
620     this_thr->th.th_team_serialized = this_thr->th.th_team->t.t_serialized;
621 
622     /* TODO the below shouldn't need to be adjusted for serialized teams */
623     this_thr->th.th_dispatch =
624         &this_thr->th.th_team->t.t_dispatch[serial_team->t.t_master_tid];
625 
626     __kmp_pop_current_task_from_thread(this_thr);
627 
628     KMP_ASSERT(this_thr->th.th_current_task->td_flags.executing == 0);
629     this_thr->th.th_current_task->td_flags.executing = 1;
630 
631     if (__kmp_tasking_mode != tskm_immediate_exec) {
632       // Copy the task team from the new child / old parent team to the thread.
633       this_thr->th.th_task_team =
634           this_thr->th.th_team->t.t_task_team[this_thr->th.th_task_state];
635       KA_TRACE(20,
636                ("__kmpc_end_serialized_parallel: T#%d restoring task_team %p / "
637                 "team %p\n",
638                 global_tid, this_thr->th.th_task_team, this_thr->th.th_team));
639     }
640   } else {
641     if (__kmp_tasking_mode != tskm_immediate_exec) {
642       KA_TRACE(20, ("__kmpc_end_serialized_parallel: T#%d decreasing nesting "
643                     "depth of serial team %p to %d\n",
644                     global_tid, serial_team, serial_team->t.t_serialized));
645     }
646   }
647 
648   if (__kmp_env_consistency_check)
649     __kmp_pop_parallel(global_tid, NULL);
650 #if OMPT_SUPPORT
651   if (ompt_enabled.enabled)
652     this_thr->th.ompt_thread_info.state =
653         ((this_thr->th.th_team_serialized) ? ompt_state_work_serial
654                                            : ompt_state_work_parallel);
655 #endif
656 }
657 
658 /*!
659 @ingroup SYNCHRONIZATION
660 @param loc  source location information.
661 
662 Execute <tt>flush</tt>. This is implemented as a full memory fence. (Though
663 depending on the memory ordering convention obeyed by the compiler
664 even that may not be necessary).
665 */
666 void __kmpc_flush(ident_t *loc) {
667   KC_TRACE(10, ("__kmpc_flush: called\n"));
668 
669   /* need explicit __mf() here since use volatile instead in library */
670   KMP_MB(); /* Flush all pending memory write invalidates.  */
671 
672 #if (KMP_ARCH_X86 || KMP_ARCH_X86_64)
673 #if KMP_MIC
674 // fence-style instructions do not exist, but lock; xaddl $0,(%rsp) can be used.
675 // We shouldn't need it, though, since the ABI rules require that
676 // * If the compiler generates NGO stores it also generates the fence
677 // * If users hand-code NGO stores they should insert the fence
678 // therefore no incomplete unordered stores should be visible.
679 #else
680   // C74404
681   // This is to address non-temporal store instructions (sfence needed).
682   // The clflush instruction is addressed either (mfence needed).
683   // Probably the non-temporal load monvtdqa instruction should also be
684   // addressed.
685   // mfence is a SSE2 instruction. Do not execute it if CPU is not SSE2.
686   if (!__kmp_cpuinfo.initialized) {
687     __kmp_query_cpuid(&__kmp_cpuinfo);
688   }
689   if (!__kmp_cpuinfo.sse2) {
690     // CPU cannot execute SSE2 instructions.
691   } else {
692 #if KMP_COMPILER_ICC
693     _mm_mfence();
694 #elif KMP_COMPILER_MSVC
695     MemoryBarrier();
696 #else
697     __sync_synchronize();
698 #endif // KMP_COMPILER_ICC
699   }
700 #endif // KMP_MIC
701 #elif (KMP_ARCH_ARM || KMP_ARCH_AARCH64 || KMP_ARCH_MIPS || KMP_ARCH_MIPS64 || \
702        KMP_ARCH_RISCV64)
703 // Nothing to see here move along
704 #elif KMP_ARCH_PPC64
705 // Nothing needed here (we have a real MB above).
706 #else
707 #error Unknown or unsupported architecture
708 #endif
709 
710 #if OMPT_SUPPORT && OMPT_OPTIONAL
711   if (ompt_enabled.ompt_callback_flush) {
712     ompt_callbacks.ompt_callback(ompt_callback_flush)(
713         __ompt_get_thread_data_internal(), OMPT_GET_RETURN_ADDRESS(0));
714   }
715 #endif
716 }
717 
718 /* -------------------------------------------------------------------------- */
719 /*!
720 @ingroup SYNCHRONIZATION
721 @param loc source location information
722 @param global_tid thread id.
723 
724 Execute a barrier.
725 */
726 void __kmpc_barrier(ident_t *loc, kmp_int32 global_tid) {
727   KMP_COUNT_BLOCK(OMP_BARRIER);
728   KC_TRACE(10, ("__kmpc_barrier: called T#%d\n", global_tid));
729   __kmp_assert_valid_gtid(global_tid);
730 
731   if (!TCR_4(__kmp_init_parallel))
732     __kmp_parallel_initialize();
733 
734   __kmp_resume_if_soft_paused();
735 
736   if (__kmp_env_consistency_check) {
737     if (loc == 0) {
738       KMP_WARNING(ConstructIdentInvalid); // ??? What does it mean for the user?
739     }
740     __kmp_check_barrier(global_tid, ct_barrier, loc);
741   }
742 
743 #if OMPT_SUPPORT
744   ompt_frame_t *ompt_frame;
745   if (ompt_enabled.enabled) {
746     __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
747     if (ompt_frame->enter_frame.ptr == NULL)
748       ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
749   }
750   OMPT_STORE_RETURN_ADDRESS(global_tid);
751 #endif
752   __kmp_threads[global_tid]->th.th_ident = loc;
753   // TODO: explicit barrier_wait_id:
754   //   this function is called when 'barrier' directive is present or
755   //   implicit barrier at the end of a worksharing construct.
756   // 1) better to add a per-thread barrier counter to a thread data structure
757   // 2) set to 0 when a new team is created
758   // 4) no sync is required
759 
760   __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
761 #if OMPT_SUPPORT && OMPT_OPTIONAL
762   if (ompt_enabled.enabled) {
763     ompt_frame->enter_frame = ompt_data_none;
764   }
765 #endif
766 }
767 
768 /* The BARRIER for a MASTER section is always explicit   */
769 /*!
770 @ingroup WORK_SHARING
771 @param loc  source location information.
772 @param global_tid  global thread number .
773 @return 1 if this thread should execute the <tt>master</tt> block, 0 otherwise.
774 */
775 kmp_int32 __kmpc_master(ident_t *loc, kmp_int32 global_tid) {
776   int status = 0;
777 
778   KC_TRACE(10, ("__kmpc_master: called T#%d\n", global_tid));
779   __kmp_assert_valid_gtid(global_tid);
780 
781   if (!TCR_4(__kmp_init_parallel))
782     __kmp_parallel_initialize();
783 
784   __kmp_resume_if_soft_paused();
785 
786   if (KMP_MASTER_GTID(global_tid)) {
787     KMP_COUNT_BLOCK(OMP_MASTER);
788     KMP_PUSH_PARTITIONED_TIMER(OMP_master);
789     status = 1;
790   }
791 
792 #if OMPT_SUPPORT && OMPT_OPTIONAL
793   if (status) {
794     if (ompt_enabled.ompt_callback_masked) {
795       kmp_info_t *this_thr = __kmp_threads[global_tid];
796       kmp_team_t *team = this_thr->th.th_team;
797 
798       int tid = __kmp_tid_from_gtid(global_tid);
799       ompt_callbacks.ompt_callback(ompt_callback_masked)(
800           ompt_scope_begin, &(team->t.ompt_team_info.parallel_data),
801           &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
802           OMPT_GET_RETURN_ADDRESS(0));
803     }
804   }
805 #endif
806 
807   if (__kmp_env_consistency_check) {
808 #if KMP_USE_DYNAMIC_LOCK
809     if (status)
810       __kmp_push_sync(global_tid, ct_master, loc, NULL, 0);
811     else
812       __kmp_check_sync(global_tid, ct_master, loc, NULL, 0);
813 #else
814     if (status)
815       __kmp_push_sync(global_tid, ct_master, loc, NULL);
816     else
817       __kmp_check_sync(global_tid, ct_master, loc, NULL);
818 #endif
819   }
820 
821   return status;
822 }
823 
824 /*!
825 @ingroup WORK_SHARING
826 @param loc  source location information.
827 @param global_tid  global thread number .
828 
829 Mark the end of a <tt>master</tt> region. This should only be called by the
830 thread that executes the <tt>master</tt> region.
831 */
832 void __kmpc_end_master(ident_t *loc, kmp_int32 global_tid) {
833   KC_TRACE(10, ("__kmpc_end_master: called T#%d\n", global_tid));
834   __kmp_assert_valid_gtid(global_tid);
835   KMP_DEBUG_ASSERT(KMP_MASTER_GTID(global_tid));
836   KMP_POP_PARTITIONED_TIMER();
837 
838 #if OMPT_SUPPORT && OMPT_OPTIONAL
839   kmp_info_t *this_thr = __kmp_threads[global_tid];
840   kmp_team_t *team = this_thr->th.th_team;
841   if (ompt_enabled.ompt_callback_masked) {
842     int tid = __kmp_tid_from_gtid(global_tid);
843     ompt_callbacks.ompt_callback(ompt_callback_masked)(
844         ompt_scope_end, &(team->t.ompt_team_info.parallel_data),
845         &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
846         OMPT_GET_RETURN_ADDRESS(0));
847   }
848 #endif
849 
850   if (__kmp_env_consistency_check) {
851     if (KMP_MASTER_GTID(global_tid))
852       __kmp_pop_sync(global_tid, ct_master, loc);
853   }
854 }
855 
856 /*!
857 @ingroup WORK_SHARING
858 @param loc  source location information.
859 @param global_tid  global thread number.
860 @param filter result of evaluating filter clause on thread global_tid, or zero
861 if no filter clause present
862 @return 1 if this thread should execute the <tt>masked</tt> block, 0 otherwise.
863 */
864 kmp_int32 __kmpc_masked(ident_t *loc, kmp_int32 global_tid, kmp_int32 filter) {
865   int status = 0;
866   int tid;
867   KC_TRACE(10, ("__kmpc_masked: called T#%d\n", global_tid));
868   __kmp_assert_valid_gtid(global_tid);
869 
870   if (!TCR_4(__kmp_init_parallel))
871     __kmp_parallel_initialize();
872 
873   __kmp_resume_if_soft_paused();
874 
875   tid = __kmp_tid_from_gtid(global_tid);
876   if (tid == filter) {
877     KMP_COUNT_BLOCK(OMP_MASKED);
878     KMP_PUSH_PARTITIONED_TIMER(OMP_masked);
879     status = 1;
880   }
881 
882 #if OMPT_SUPPORT && OMPT_OPTIONAL
883   if (status) {
884     if (ompt_enabled.ompt_callback_masked) {
885       kmp_info_t *this_thr = __kmp_threads[global_tid];
886       kmp_team_t *team = this_thr->th.th_team;
887       ompt_callbacks.ompt_callback(ompt_callback_masked)(
888           ompt_scope_begin, &(team->t.ompt_team_info.parallel_data),
889           &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
890           OMPT_GET_RETURN_ADDRESS(0));
891     }
892   }
893 #endif
894 
895   if (__kmp_env_consistency_check) {
896 #if KMP_USE_DYNAMIC_LOCK
897     if (status)
898       __kmp_push_sync(global_tid, ct_masked, loc, NULL, 0);
899     else
900       __kmp_check_sync(global_tid, ct_masked, loc, NULL, 0);
901 #else
902     if (status)
903       __kmp_push_sync(global_tid, ct_masked, loc, NULL);
904     else
905       __kmp_check_sync(global_tid, ct_masked, loc, NULL);
906 #endif
907   }
908 
909   return status;
910 }
911 
912 /*!
913 @ingroup WORK_SHARING
914 @param loc  source location information.
915 @param global_tid  global thread number .
916 
917 Mark the end of a <tt>masked</tt> region. This should only be called by the
918 thread that executes the <tt>masked</tt> region.
919 */
920 void __kmpc_end_masked(ident_t *loc, kmp_int32 global_tid) {
921   KC_TRACE(10, ("__kmpc_end_masked: called T#%d\n", global_tid));
922   __kmp_assert_valid_gtid(global_tid);
923   KMP_POP_PARTITIONED_TIMER();
924 
925 #if OMPT_SUPPORT && OMPT_OPTIONAL
926   kmp_info_t *this_thr = __kmp_threads[global_tid];
927   kmp_team_t *team = this_thr->th.th_team;
928   if (ompt_enabled.ompt_callback_masked) {
929     int tid = __kmp_tid_from_gtid(global_tid);
930     ompt_callbacks.ompt_callback(ompt_callback_masked)(
931         ompt_scope_end, &(team->t.ompt_team_info.parallel_data),
932         &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
933         OMPT_GET_RETURN_ADDRESS(0));
934   }
935 #endif
936 
937   if (__kmp_env_consistency_check) {
938     __kmp_pop_sync(global_tid, ct_masked, loc);
939   }
940 }
941 
942 /*!
943 @ingroup WORK_SHARING
944 @param loc  source location information.
945 @param gtid  global thread number.
946 
947 Start execution of an <tt>ordered</tt> construct.
948 */
949 void __kmpc_ordered(ident_t *loc, kmp_int32 gtid) {
950   int cid = 0;
951   kmp_info_t *th;
952   KMP_DEBUG_ASSERT(__kmp_init_serial);
953 
954   KC_TRACE(10, ("__kmpc_ordered: called T#%d\n", gtid));
955   __kmp_assert_valid_gtid(gtid);
956 
957   if (!TCR_4(__kmp_init_parallel))
958     __kmp_parallel_initialize();
959 
960   __kmp_resume_if_soft_paused();
961 
962 #if USE_ITT_BUILD
963   __kmp_itt_ordered_prep(gtid);
964 // TODO: ordered_wait_id
965 #endif /* USE_ITT_BUILD */
966 
967   th = __kmp_threads[gtid];
968 
969 #if OMPT_SUPPORT && OMPT_OPTIONAL
970   kmp_team_t *team;
971   ompt_wait_id_t lck;
972   void *codeptr_ra;
973   OMPT_STORE_RETURN_ADDRESS(gtid);
974   if (ompt_enabled.enabled) {
975     team = __kmp_team_from_gtid(gtid);
976     lck = (ompt_wait_id_t)(uintptr_t)&team->t.t_ordered.dt.t_value;
977     /* OMPT state update */
978     th->th.ompt_thread_info.wait_id = lck;
979     th->th.ompt_thread_info.state = ompt_state_wait_ordered;
980 
981     /* OMPT event callback */
982     codeptr_ra = OMPT_LOAD_RETURN_ADDRESS(gtid);
983     if (ompt_enabled.ompt_callback_mutex_acquire) {
984       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
985           ompt_mutex_ordered, omp_lock_hint_none, kmp_mutex_impl_spin, lck,
986           codeptr_ra);
987     }
988   }
989 #endif
990 
991   if (th->th.th_dispatch->th_deo_fcn != 0)
992     (*th->th.th_dispatch->th_deo_fcn)(&gtid, &cid, loc);
993   else
994     __kmp_parallel_deo(&gtid, &cid, loc);
995 
996 #if OMPT_SUPPORT && OMPT_OPTIONAL
997   if (ompt_enabled.enabled) {
998     /* OMPT state update */
999     th->th.ompt_thread_info.state = ompt_state_work_parallel;
1000     th->th.ompt_thread_info.wait_id = 0;
1001 
1002     /* OMPT event callback */
1003     if (ompt_enabled.ompt_callback_mutex_acquired) {
1004       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
1005           ompt_mutex_ordered, (ompt_wait_id_t)(uintptr_t)lck, codeptr_ra);
1006     }
1007   }
1008 #endif
1009 
1010 #if USE_ITT_BUILD
1011   __kmp_itt_ordered_start(gtid);
1012 #endif /* USE_ITT_BUILD */
1013 }
1014 
1015 /*!
1016 @ingroup WORK_SHARING
1017 @param loc  source location information.
1018 @param gtid  global thread number.
1019 
1020 End execution of an <tt>ordered</tt> construct.
1021 */
1022 void __kmpc_end_ordered(ident_t *loc, kmp_int32 gtid) {
1023   int cid = 0;
1024   kmp_info_t *th;
1025 
1026   KC_TRACE(10, ("__kmpc_end_ordered: called T#%d\n", gtid));
1027   __kmp_assert_valid_gtid(gtid);
1028 
1029 #if USE_ITT_BUILD
1030   __kmp_itt_ordered_end(gtid);
1031 // TODO: ordered_wait_id
1032 #endif /* USE_ITT_BUILD */
1033 
1034   th = __kmp_threads[gtid];
1035 
1036   if (th->th.th_dispatch->th_dxo_fcn != 0)
1037     (*th->th.th_dispatch->th_dxo_fcn)(&gtid, &cid, loc);
1038   else
1039     __kmp_parallel_dxo(&gtid, &cid, loc);
1040 
1041 #if OMPT_SUPPORT && OMPT_OPTIONAL
1042   OMPT_STORE_RETURN_ADDRESS(gtid);
1043   if (ompt_enabled.ompt_callback_mutex_released) {
1044     ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
1045         ompt_mutex_ordered,
1046         (ompt_wait_id_t)(uintptr_t)&__kmp_team_from_gtid(gtid)
1047             ->t.t_ordered.dt.t_value,
1048         OMPT_LOAD_RETURN_ADDRESS(gtid));
1049   }
1050 #endif
1051 }
1052 
1053 #if KMP_USE_DYNAMIC_LOCK
1054 
1055 static __forceinline void
1056 __kmp_init_indirect_csptr(kmp_critical_name *crit, ident_t const *loc,
1057                           kmp_int32 gtid, kmp_indirect_locktag_t tag) {
1058   // Pointer to the allocated indirect lock is written to crit, while indexing
1059   // is ignored.
1060   void *idx;
1061   kmp_indirect_lock_t **lck;
1062   lck = (kmp_indirect_lock_t **)crit;
1063   kmp_indirect_lock_t *ilk = __kmp_allocate_indirect_lock(&idx, gtid, tag);
1064   KMP_I_LOCK_FUNC(ilk, init)(ilk->lock);
1065   KMP_SET_I_LOCK_LOCATION(ilk, loc);
1066   KMP_SET_I_LOCK_FLAGS(ilk, kmp_lf_critical_section);
1067   KA_TRACE(20,
1068            ("__kmp_init_indirect_csptr: initialized indirect lock #%d\n", tag));
1069 #if USE_ITT_BUILD
1070   __kmp_itt_critical_creating(ilk->lock, loc);
1071 #endif
1072   int status = KMP_COMPARE_AND_STORE_PTR(lck, nullptr, ilk);
1073   if (status == 0) {
1074 #if USE_ITT_BUILD
1075     __kmp_itt_critical_destroyed(ilk->lock);
1076 #endif
1077     // We don't really need to destroy the unclaimed lock here since it will be
1078     // cleaned up at program exit.
1079     // KMP_D_LOCK_FUNC(&idx, destroy)((kmp_dyna_lock_t *)&idx);
1080   }
1081   KMP_DEBUG_ASSERT(*lck != NULL);
1082 }
1083 
1084 // Fast-path acquire tas lock
1085 #define KMP_ACQUIRE_TAS_LOCK(lock, gtid)                                       \
1086   {                                                                            \
1087     kmp_tas_lock_t *l = (kmp_tas_lock_t *)lock;                                \
1088     kmp_int32 tas_free = KMP_LOCK_FREE(tas);                                   \
1089     kmp_int32 tas_busy = KMP_LOCK_BUSY(gtid + 1, tas);                         \
1090     if (KMP_ATOMIC_LD_RLX(&l->lk.poll) != tas_free ||                          \
1091         !__kmp_atomic_compare_store_acq(&l->lk.poll, tas_free, tas_busy)) {    \
1092       kmp_uint32 spins;                                                        \
1093       KMP_FSYNC_PREPARE(l);                                                    \
1094       KMP_INIT_YIELD(spins);                                                   \
1095       kmp_backoff_t backoff = __kmp_spin_backoff_params;                       \
1096       do {                                                                     \
1097         if (TCR_4(__kmp_nth) >                                                 \
1098             (__kmp_avail_proc ? __kmp_avail_proc : __kmp_xproc)) {             \
1099           KMP_YIELD(TRUE);                                                     \
1100         } else {                                                               \
1101           KMP_YIELD_SPIN(spins);                                               \
1102         }                                                                      \
1103         __kmp_spin_backoff(&backoff);                                          \
1104       } while (                                                                \
1105           KMP_ATOMIC_LD_RLX(&l->lk.poll) != tas_free ||                        \
1106           !__kmp_atomic_compare_store_acq(&l->lk.poll, tas_free, tas_busy));   \
1107     }                                                                          \
1108     KMP_FSYNC_ACQUIRED(l);                                                     \
1109   }
1110 
1111 // Fast-path test tas lock
1112 #define KMP_TEST_TAS_LOCK(lock, gtid, rc)                                      \
1113   {                                                                            \
1114     kmp_tas_lock_t *l = (kmp_tas_lock_t *)lock;                                \
1115     kmp_int32 tas_free = KMP_LOCK_FREE(tas);                                   \
1116     kmp_int32 tas_busy = KMP_LOCK_BUSY(gtid + 1, tas);                         \
1117     rc = KMP_ATOMIC_LD_RLX(&l->lk.poll) == tas_free &&                         \
1118          __kmp_atomic_compare_store_acq(&l->lk.poll, tas_free, tas_busy);      \
1119   }
1120 
1121 // Fast-path release tas lock
1122 #define KMP_RELEASE_TAS_LOCK(lock, gtid)                                       \
1123   { KMP_ATOMIC_ST_REL(&((kmp_tas_lock_t *)lock)->lk.poll, KMP_LOCK_FREE(tas)); }
1124 
1125 #if KMP_USE_FUTEX
1126 
1127 #include <sys/syscall.h>
1128 #include <unistd.h>
1129 #ifndef FUTEX_WAIT
1130 #define FUTEX_WAIT 0
1131 #endif
1132 #ifndef FUTEX_WAKE
1133 #define FUTEX_WAKE 1
1134 #endif
1135 
1136 // Fast-path acquire futex lock
1137 #define KMP_ACQUIRE_FUTEX_LOCK(lock, gtid)                                     \
1138   {                                                                            \
1139     kmp_futex_lock_t *ftx = (kmp_futex_lock_t *)lock;                          \
1140     kmp_int32 gtid_code = (gtid + 1) << 1;                                     \
1141     KMP_MB();                                                                  \
1142     KMP_FSYNC_PREPARE(ftx);                                                    \
1143     kmp_int32 poll_val;                                                        \
1144     while ((poll_val = KMP_COMPARE_AND_STORE_RET32(                            \
1145                 &(ftx->lk.poll), KMP_LOCK_FREE(futex),                         \
1146                 KMP_LOCK_BUSY(gtid_code, futex))) != KMP_LOCK_FREE(futex)) {   \
1147       kmp_int32 cond = KMP_LOCK_STRIP(poll_val) & 1;                           \
1148       if (!cond) {                                                             \
1149         if (!KMP_COMPARE_AND_STORE_RET32(&(ftx->lk.poll), poll_val,            \
1150                                          poll_val |                            \
1151                                              KMP_LOCK_BUSY(1, futex))) {       \
1152           continue;                                                            \
1153         }                                                                      \
1154         poll_val |= KMP_LOCK_BUSY(1, futex);                                   \
1155       }                                                                        \
1156       kmp_int32 rc;                                                            \
1157       if ((rc = syscall(__NR_futex, &(ftx->lk.poll), FUTEX_WAIT, poll_val,     \
1158                         NULL, NULL, 0)) != 0) {                                \
1159         continue;                                                              \
1160       }                                                                        \
1161       gtid_code |= 1;                                                          \
1162     }                                                                          \
1163     KMP_FSYNC_ACQUIRED(ftx);                                                   \
1164   }
1165 
1166 // Fast-path test futex lock
1167 #define KMP_TEST_FUTEX_LOCK(lock, gtid, rc)                                    \
1168   {                                                                            \
1169     kmp_futex_lock_t *ftx = (kmp_futex_lock_t *)lock;                          \
1170     if (KMP_COMPARE_AND_STORE_ACQ32(&(ftx->lk.poll), KMP_LOCK_FREE(futex),     \
1171                                     KMP_LOCK_BUSY(gtid + 1 << 1, futex))) {    \
1172       KMP_FSYNC_ACQUIRED(ftx);                                                 \
1173       rc = TRUE;                                                               \
1174     } else {                                                                   \
1175       rc = FALSE;                                                              \
1176     }                                                                          \
1177   }
1178 
1179 // Fast-path release futex lock
1180 #define KMP_RELEASE_FUTEX_LOCK(lock, gtid)                                     \
1181   {                                                                            \
1182     kmp_futex_lock_t *ftx = (kmp_futex_lock_t *)lock;                          \
1183     KMP_MB();                                                                  \
1184     KMP_FSYNC_RELEASING(ftx);                                                  \
1185     kmp_int32 poll_val =                                                       \
1186         KMP_XCHG_FIXED32(&(ftx->lk.poll), KMP_LOCK_FREE(futex));               \
1187     if (KMP_LOCK_STRIP(poll_val) & 1) {                                        \
1188       syscall(__NR_futex, &(ftx->lk.poll), FUTEX_WAKE,                         \
1189               KMP_LOCK_BUSY(1, futex), NULL, NULL, 0);                         \
1190     }                                                                          \
1191     KMP_MB();                                                                  \
1192     KMP_YIELD_OVERSUB();                                                       \
1193   }
1194 
1195 #endif // KMP_USE_FUTEX
1196 
1197 #else // KMP_USE_DYNAMIC_LOCK
1198 
1199 static kmp_user_lock_p __kmp_get_critical_section_ptr(kmp_critical_name *crit,
1200                                                       ident_t const *loc,
1201                                                       kmp_int32 gtid) {
1202   kmp_user_lock_p *lck_pp = (kmp_user_lock_p *)crit;
1203 
1204   // Because of the double-check, the following load doesn't need to be volatile
1205   kmp_user_lock_p lck = (kmp_user_lock_p)TCR_PTR(*lck_pp);
1206 
1207   if (lck == NULL) {
1208     void *idx;
1209 
1210     // Allocate & initialize the lock.
1211     // Remember alloc'ed locks in table in order to free them in __kmp_cleanup()
1212     lck = __kmp_user_lock_allocate(&idx, gtid, kmp_lf_critical_section);
1213     __kmp_init_user_lock_with_checks(lck);
1214     __kmp_set_user_lock_location(lck, loc);
1215 #if USE_ITT_BUILD
1216     __kmp_itt_critical_creating(lck);
1217 // __kmp_itt_critical_creating() should be called *before* the first usage
1218 // of underlying lock. It is the only place where we can guarantee it. There
1219 // are chances the lock will destroyed with no usage, but it is not a
1220 // problem, because this is not real event seen by user but rather setting
1221 // name for object (lock). See more details in kmp_itt.h.
1222 #endif /* USE_ITT_BUILD */
1223 
1224     // Use a cmpxchg instruction to slam the start of the critical section with
1225     // the lock pointer.  If another thread beat us to it, deallocate the lock,
1226     // and use the lock that the other thread allocated.
1227     int status = KMP_COMPARE_AND_STORE_PTR(lck_pp, 0, lck);
1228 
1229     if (status == 0) {
1230 // Deallocate the lock and reload the value.
1231 #if USE_ITT_BUILD
1232       __kmp_itt_critical_destroyed(lck);
1233 // Let ITT know the lock is destroyed and the same memory location may be reused
1234 // for another purpose.
1235 #endif /* USE_ITT_BUILD */
1236       __kmp_destroy_user_lock_with_checks(lck);
1237       __kmp_user_lock_free(&idx, gtid, lck);
1238       lck = (kmp_user_lock_p)TCR_PTR(*lck_pp);
1239       KMP_DEBUG_ASSERT(lck != NULL);
1240     }
1241   }
1242   return lck;
1243 }
1244 
1245 #endif // KMP_USE_DYNAMIC_LOCK
1246 
1247 /*!
1248 @ingroup WORK_SHARING
1249 @param loc  source location information.
1250 @param global_tid  global thread number.
1251 @param crit identity of the critical section. This could be a pointer to a lock
1252 associated with the critical section, or some other suitably unique value.
1253 
1254 Enter code protected by a `critical` construct.
1255 This function blocks until the executing thread can enter the critical section.
1256 */
1257 void __kmpc_critical(ident_t *loc, kmp_int32 global_tid,
1258                      kmp_critical_name *crit) {
1259 #if KMP_USE_DYNAMIC_LOCK
1260 #if OMPT_SUPPORT && OMPT_OPTIONAL
1261   OMPT_STORE_RETURN_ADDRESS(global_tid);
1262 #endif // OMPT_SUPPORT
1263   __kmpc_critical_with_hint(loc, global_tid, crit, omp_lock_hint_none);
1264 #else
1265   KMP_COUNT_BLOCK(OMP_CRITICAL);
1266 #if OMPT_SUPPORT && OMPT_OPTIONAL
1267   ompt_state_t prev_state = ompt_state_undefined;
1268   ompt_thread_info_t ti;
1269 #endif
1270   kmp_user_lock_p lck;
1271 
1272   KC_TRACE(10, ("__kmpc_critical: called T#%d\n", global_tid));
1273   __kmp_assert_valid_gtid(global_tid);
1274 
1275   // TODO: add THR_OVHD_STATE
1276 
1277   KMP_PUSH_PARTITIONED_TIMER(OMP_critical_wait);
1278   KMP_CHECK_USER_LOCK_INIT();
1279 
1280   if ((__kmp_user_lock_kind == lk_tas) &&
1281       (sizeof(lck->tas.lk.poll) <= OMP_CRITICAL_SIZE)) {
1282     lck = (kmp_user_lock_p)crit;
1283   }
1284 #if KMP_USE_FUTEX
1285   else if ((__kmp_user_lock_kind == lk_futex) &&
1286            (sizeof(lck->futex.lk.poll) <= OMP_CRITICAL_SIZE)) {
1287     lck = (kmp_user_lock_p)crit;
1288   }
1289 #endif
1290   else { // ticket, queuing or drdpa
1291     lck = __kmp_get_critical_section_ptr(crit, loc, global_tid);
1292   }
1293 
1294   if (__kmp_env_consistency_check)
1295     __kmp_push_sync(global_tid, ct_critical, loc, lck);
1296 
1297     // since the critical directive binds to all threads, not just the current
1298     // team we have to check this even if we are in a serialized team.
1299     // also, even if we are the uber thread, we still have to conduct the lock,
1300     // as we have to contend with sibling threads.
1301 
1302 #if USE_ITT_BUILD
1303   __kmp_itt_critical_acquiring(lck);
1304 #endif /* USE_ITT_BUILD */
1305 #if OMPT_SUPPORT && OMPT_OPTIONAL
1306   OMPT_STORE_RETURN_ADDRESS(gtid);
1307   void *codeptr_ra = NULL;
1308   if (ompt_enabled.enabled) {
1309     ti = __kmp_threads[global_tid]->th.ompt_thread_info;
1310     /* OMPT state update */
1311     prev_state = ti.state;
1312     ti.wait_id = (ompt_wait_id_t)(uintptr_t)lck;
1313     ti.state = ompt_state_wait_critical;
1314 
1315     /* OMPT event callback */
1316     codeptr_ra = OMPT_LOAD_RETURN_ADDRESS(gtid);
1317     if (ompt_enabled.ompt_callback_mutex_acquire) {
1318       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
1319           ompt_mutex_critical, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
1320           (ompt_wait_id_t)(uintptr_t)lck, codeptr_ra);
1321     }
1322   }
1323 #endif
1324   // Value of 'crit' should be good for using as a critical_id of the critical
1325   // section directive.
1326   __kmp_acquire_user_lock_with_checks(lck, global_tid);
1327 
1328 #if USE_ITT_BUILD
1329   __kmp_itt_critical_acquired(lck);
1330 #endif /* USE_ITT_BUILD */
1331 #if OMPT_SUPPORT && OMPT_OPTIONAL
1332   if (ompt_enabled.enabled) {
1333     /* OMPT state update */
1334     ti.state = prev_state;
1335     ti.wait_id = 0;
1336 
1337     /* OMPT event callback */
1338     if (ompt_enabled.ompt_callback_mutex_acquired) {
1339       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
1340           ompt_mutex_critical, (ompt_wait_id_t)(uintptr_t)lck, codeptr_ra);
1341     }
1342   }
1343 #endif
1344   KMP_POP_PARTITIONED_TIMER();
1345 
1346   KMP_PUSH_PARTITIONED_TIMER(OMP_critical);
1347   KA_TRACE(15, ("__kmpc_critical: done T#%d\n", global_tid));
1348 #endif // KMP_USE_DYNAMIC_LOCK
1349 }
1350 
1351 #if KMP_USE_DYNAMIC_LOCK
1352 
1353 // Converts the given hint to an internal lock implementation
1354 static __forceinline kmp_dyna_lockseq_t __kmp_map_hint_to_lock(uintptr_t hint) {
1355 #if KMP_USE_TSX
1356 #define KMP_TSX_LOCK(seq) lockseq_##seq
1357 #else
1358 #define KMP_TSX_LOCK(seq) __kmp_user_lock_seq
1359 #endif
1360 
1361 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
1362 #define KMP_CPUINFO_RTM (__kmp_cpuinfo.rtm)
1363 #else
1364 #define KMP_CPUINFO_RTM 0
1365 #endif
1366 
1367   // Hints that do not require further logic
1368   if (hint & kmp_lock_hint_hle)
1369     return KMP_TSX_LOCK(hle);
1370   if (hint & kmp_lock_hint_rtm)
1371     return KMP_CPUINFO_RTM ? KMP_TSX_LOCK(rtm_queuing) : __kmp_user_lock_seq;
1372   if (hint & kmp_lock_hint_adaptive)
1373     return KMP_CPUINFO_RTM ? KMP_TSX_LOCK(adaptive) : __kmp_user_lock_seq;
1374 
1375   // Rule out conflicting hints first by returning the default lock
1376   if ((hint & omp_lock_hint_contended) && (hint & omp_lock_hint_uncontended))
1377     return __kmp_user_lock_seq;
1378   if ((hint & omp_lock_hint_speculative) &&
1379       (hint & omp_lock_hint_nonspeculative))
1380     return __kmp_user_lock_seq;
1381 
1382   // Do not even consider speculation when it appears to be contended
1383   if (hint & omp_lock_hint_contended)
1384     return lockseq_queuing;
1385 
1386   // Uncontended lock without speculation
1387   if ((hint & omp_lock_hint_uncontended) && !(hint & omp_lock_hint_speculative))
1388     return lockseq_tas;
1389 
1390   // Use RTM lock for speculation
1391   if (hint & omp_lock_hint_speculative)
1392     return KMP_CPUINFO_RTM ? KMP_TSX_LOCK(rtm_spin) : __kmp_user_lock_seq;
1393 
1394   return __kmp_user_lock_seq;
1395 }
1396 
1397 #if OMPT_SUPPORT && OMPT_OPTIONAL
1398 #if KMP_USE_DYNAMIC_LOCK
1399 static kmp_mutex_impl_t
1400 __ompt_get_mutex_impl_type(void *user_lock, kmp_indirect_lock_t *ilock = 0) {
1401   if (user_lock) {
1402     switch (KMP_EXTRACT_D_TAG(user_lock)) {
1403     case 0:
1404       break;
1405 #if KMP_USE_FUTEX
1406     case locktag_futex:
1407       return kmp_mutex_impl_queuing;
1408 #endif
1409     case locktag_tas:
1410       return kmp_mutex_impl_spin;
1411 #if KMP_USE_TSX
1412     case locktag_hle:
1413     case locktag_rtm_spin:
1414       return kmp_mutex_impl_speculative;
1415 #endif
1416     default:
1417       return kmp_mutex_impl_none;
1418     }
1419     ilock = KMP_LOOKUP_I_LOCK(user_lock);
1420   }
1421   KMP_ASSERT(ilock);
1422   switch (ilock->type) {
1423 #if KMP_USE_TSX
1424   case locktag_adaptive:
1425   case locktag_rtm_queuing:
1426     return kmp_mutex_impl_speculative;
1427 #endif
1428   case locktag_nested_tas:
1429     return kmp_mutex_impl_spin;
1430 #if KMP_USE_FUTEX
1431   case locktag_nested_futex:
1432 #endif
1433   case locktag_ticket:
1434   case locktag_queuing:
1435   case locktag_drdpa:
1436   case locktag_nested_ticket:
1437   case locktag_nested_queuing:
1438   case locktag_nested_drdpa:
1439     return kmp_mutex_impl_queuing;
1440   default:
1441     return kmp_mutex_impl_none;
1442   }
1443 }
1444 #else
1445 // For locks without dynamic binding
1446 static kmp_mutex_impl_t __ompt_get_mutex_impl_type() {
1447   switch (__kmp_user_lock_kind) {
1448   case lk_tas:
1449     return kmp_mutex_impl_spin;
1450 #if KMP_USE_FUTEX
1451   case lk_futex:
1452 #endif
1453   case lk_ticket:
1454   case lk_queuing:
1455   case lk_drdpa:
1456     return kmp_mutex_impl_queuing;
1457 #if KMP_USE_TSX
1458   case lk_hle:
1459   case lk_rtm_queuing:
1460   case lk_rtm_spin:
1461   case lk_adaptive:
1462     return kmp_mutex_impl_speculative;
1463 #endif
1464   default:
1465     return kmp_mutex_impl_none;
1466   }
1467 }
1468 #endif // KMP_USE_DYNAMIC_LOCK
1469 #endif // OMPT_SUPPORT && OMPT_OPTIONAL
1470 
1471 /*!
1472 @ingroup WORK_SHARING
1473 @param loc  source location information.
1474 @param global_tid  global thread number.
1475 @param crit identity of the critical section. This could be a pointer to a lock
1476 associated with the critical section, or some other suitably unique value.
1477 @param hint the lock hint.
1478 
1479 Enter code protected by a `critical` construct with a hint. The hint value is
1480 used to suggest a lock implementation. This function blocks until the executing
1481 thread can enter the critical section unless the hint suggests use of
1482 speculative execution and the hardware supports it.
1483 */
1484 void __kmpc_critical_with_hint(ident_t *loc, kmp_int32 global_tid,
1485                                kmp_critical_name *crit, uint32_t hint) {
1486   KMP_COUNT_BLOCK(OMP_CRITICAL);
1487   kmp_user_lock_p lck;
1488 #if OMPT_SUPPORT && OMPT_OPTIONAL
1489   ompt_state_t prev_state = ompt_state_undefined;
1490   ompt_thread_info_t ti;
1491   // This is the case, if called from __kmpc_critical:
1492   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(global_tid);
1493   if (!codeptr)
1494     codeptr = OMPT_GET_RETURN_ADDRESS(0);
1495 #endif
1496 
1497   KC_TRACE(10, ("__kmpc_critical: called T#%d\n", global_tid));
1498   __kmp_assert_valid_gtid(global_tid);
1499 
1500   kmp_dyna_lock_t *lk = (kmp_dyna_lock_t *)crit;
1501   // Check if it is initialized.
1502   KMP_PUSH_PARTITIONED_TIMER(OMP_critical_wait);
1503   kmp_dyna_lockseq_t lockseq = __kmp_map_hint_to_lock(hint);
1504   if (*lk == 0) {
1505     if (KMP_IS_D_LOCK(lockseq)) {
1506       KMP_COMPARE_AND_STORE_ACQ32((volatile kmp_int32 *)crit, 0,
1507                                   KMP_GET_D_TAG(lockseq));
1508     } else {
1509       __kmp_init_indirect_csptr(crit, loc, global_tid, KMP_GET_I_TAG(lockseq));
1510     }
1511   }
1512   // Branch for accessing the actual lock object and set operation. This
1513   // branching is inevitable since this lock initialization does not follow the
1514   // normal dispatch path (lock table is not used).
1515   if (KMP_EXTRACT_D_TAG(lk) != 0) {
1516     lck = (kmp_user_lock_p)lk;
1517     if (__kmp_env_consistency_check) {
1518       __kmp_push_sync(global_tid, ct_critical, loc, lck,
1519                       __kmp_map_hint_to_lock(hint));
1520     }
1521 #if USE_ITT_BUILD
1522     __kmp_itt_critical_acquiring(lck);
1523 #endif
1524 #if OMPT_SUPPORT && OMPT_OPTIONAL
1525     if (ompt_enabled.enabled) {
1526       ti = __kmp_threads[global_tid]->th.ompt_thread_info;
1527       /* OMPT state update */
1528       prev_state = ti.state;
1529       ti.wait_id = (ompt_wait_id_t)(uintptr_t)lck;
1530       ti.state = ompt_state_wait_critical;
1531 
1532       /* OMPT event callback */
1533       if (ompt_enabled.ompt_callback_mutex_acquire) {
1534         ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
1535             ompt_mutex_critical, (unsigned int)hint,
1536             __ompt_get_mutex_impl_type(crit), (ompt_wait_id_t)(uintptr_t)lck,
1537             codeptr);
1538       }
1539     }
1540 #endif
1541 #if KMP_USE_INLINED_TAS
1542     if (lockseq == lockseq_tas && !__kmp_env_consistency_check) {
1543       KMP_ACQUIRE_TAS_LOCK(lck, global_tid);
1544     } else
1545 #elif KMP_USE_INLINED_FUTEX
1546     if (lockseq == lockseq_futex && !__kmp_env_consistency_check) {
1547       KMP_ACQUIRE_FUTEX_LOCK(lck, global_tid);
1548     } else
1549 #endif
1550     {
1551       KMP_D_LOCK_FUNC(lk, set)(lk, global_tid);
1552     }
1553   } else {
1554     kmp_indirect_lock_t *ilk = *((kmp_indirect_lock_t **)lk);
1555     lck = ilk->lock;
1556     if (__kmp_env_consistency_check) {
1557       __kmp_push_sync(global_tid, ct_critical, loc, lck,
1558                       __kmp_map_hint_to_lock(hint));
1559     }
1560 #if USE_ITT_BUILD
1561     __kmp_itt_critical_acquiring(lck);
1562 #endif
1563 #if OMPT_SUPPORT && OMPT_OPTIONAL
1564     if (ompt_enabled.enabled) {
1565       ti = __kmp_threads[global_tid]->th.ompt_thread_info;
1566       /* OMPT state update */
1567       prev_state = ti.state;
1568       ti.wait_id = (ompt_wait_id_t)(uintptr_t)lck;
1569       ti.state = ompt_state_wait_critical;
1570 
1571       /* OMPT event callback */
1572       if (ompt_enabled.ompt_callback_mutex_acquire) {
1573         ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
1574             ompt_mutex_critical, (unsigned int)hint,
1575             __ompt_get_mutex_impl_type(0, ilk), (ompt_wait_id_t)(uintptr_t)lck,
1576             codeptr);
1577       }
1578     }
1579 #endif
1580     KMP_I_LOCK_FUNC(ilk, set)(lck, global_tid);
1581   }
1582   KMP_POP_PARTITIONED_TIMER();
1583 
1584 #if USE_ITT_BUILD
1585   __kmp_itt_critical_acquired(lck);
1586 #endif /* USE_ITT_BUILD */
1587 #if OMPT_SUPPORT && OMPT_OPTIONAL
1588   if (ompt_enabled.enabled) {
1589     /* OMPT state update */
1590     ti.state = prev_state;
1591     ti.wait_id = 0;
1592 
1593     /* OMPT event callback */
1594     if (ompt_enabled.ompt_callback_mutex_acquired) {
1595       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
1596           ompt_mutex_critical, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
1597     }
1598   }
1599 #endif
1600 
1601   KMP_PUSH_PARTITIONED_TIMER(OMP_critical);
1602   KA_TRACE(15, ("__kmpc_critical: done T#%d\n", global_tid));
1603 } // __kmpc_critical_with_hint
1604 
1605 #endif // KMP_USE_DYNAMIC_LOCK
1606 
1607 /*!
1608 @ingroup WORK_SHARING
1609 @param loc  source location information.
1610 @param global_tid  global thread number .
1611 @param crit identity of the critical section. This could be a pointer to a lock
1612 associated with the critical section, or some other suitably unique value.
1613 
1614 Leave a critical section, releasing any lock that was held during its execution.
1615 */
1616 void __kmpc_end_critical(ident_t *loc, kmp_int32 global_tid,
1617                          kmp_critical_name *crit) {
1618   kmp_user_lock_p lck;
1619 
1620   KC_TRACE(10, ("__kmpc_end_critical: called T#%d\n", global_tid));
1621 
1622 #if KMP_USE_DYNAMIC_LOCK
1623   int locktag = KMP_EXTRACT_D_TAG(crit);
1624   if (locktag) {
1625     lck = (kmp_user_lock_p)crit;
1626     KMP_ASSERT(lck != NULL);
1627     if (__kmp_env_consistency_check) {
1628       __kmp_pop_sync(global_tid, ct_critical, loc);
1629     }
1630 #if USE_ITT_BUILD
1631     __kmp_itt_critical_releasing(lck);
1632 #endif
1633 #if KMP_USE_INLINED_TAS
1634     if (locktag == locktag_tas && !__kmp_env_consistency_check) {
1635       KMP_RELEASE_TAS_LOCK(lck, global_tid);
1636     } else
1637 #elif KMP_USE_INLINED_FUTEX
1638     if (locktag == locktag_futex && !__kmp_env_consistency_check) {
1639       KMP_RELEASE_FUTEX_LOCK(lck, global_tid);
1640     } else
1641 #endif
1642     {
1643       KMP_D_LOCK_FUNC(lck, unset)((kmp_dyna_lock_t *)lck, global_tid);
1644     }
1645   } else {
1646     kmp_indirect_lock_t *ilk =
1647         (kmp_indirect_lock_t *)TCR_PTR(*((kmp_indirect_lock_t **)crit));
1648     KMP_ASSERT(ilk != NULL);
1649     lck = ilk->lock;
1650     if (__kmp_env_consistency_check) {
1651       __kmp_pop_sync(global_tid, ct_critical, loc);
1652     }
1653 #if USE_ITT_BUILD
1654     __kmp_itt_critical_releasing(lck);
1655 #endif
1656     KMP_I_LOCK_FUNC(ilk, unset)(lck, global_tid);
1657   }
1658 
1659 #else // KMP_USE_DYNAMIC_LOCK
1660 
1661   if ((__kmp_user_lock_kind == lk_tas) &&
1662       (sizeof(lck->tas.lk.poll) <= OMP_CRITICAL_SIZE)) {
1663     lck = (kmp_user_lock_p)crit;
1664   }
1665 #if KMP_USE_FUTEX
1666   else if ((__kmp_user_lock_kind == lk_futex) &&
1667            (sizeof(lck->futex.lk.poll) <= OMP_CRITICAL_SIZE)) {
1668     lck = (kmp_user_lock_p)crit;
1669   }
1670 #endif
1671   else { // ticket, queuing or drdpa
1672     lck = (kmp_user_lock_p)TCR_PTR(*((kmp_user_lock_p *)crit));
1673   }
1674 
1675   KMP_ASSERT(lck != NULL);
1676 
1677   if (__kmp_env_consistency_check)
1678     __kmp_pop_sync(global_tid, ct_critical, loc);
1679 
1680 #if USE_ITT_BUILD
1681   __kmp_itt_critical_releasing(lck);
1682 #endif /* USE_ITT_BUILD */
1683   // Value of 'crit' should be good for using as a critical_id of the critical
1684   // section directive.
1685   __kmp_release_user_lock_with_checks(lck, global_tid);
1686 
1687 #endif // KMP_USE_DYNAMIC_LOCK
1688 
1689 #if OMPT_SUPPORT && OMPT_OPTIONAL
1690   /* OMPT release event triggers after lock is released; place here to trigger
1691    * for all #if branches */
1692   OMPT_STORE_RETURN_ADDRESS(global_tid);
1693   if (ompt_enabled.ompt_callback_mutex_released) {
1694     ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
1695         ompt_mutex_critical, (ompt_wait_id_t)(uintptr_t)lck,
1696         OMPT_LOAD_RETURN_ADDRESS(0));
1697   }
1698 #endif
1699 
1700   KMP_POP_PARTITIONED_TIMER();
1701   KA_TRACE(15, ("__kmpc_end_critical: done T#%d\n", global_tid));
1702 }
1703 
1704 /*!
1705 @ingroup SYNCHRONIZATION
1706 @param loc source location information
1707 @param global_tid thread id.
1708 @return one if the thread should execute the master block, zero otherwise
1709 
1710 Start execution of a combined barrier and master. The barrier is executed inside
1711 this function.
1712 */
1713 kmp_int32 __kmpc_barrier_master(ident_t *loc, kmp_int32 global_tid) {
1714   int status;
1715   KC_TRACE(10, ("__kmpc_barrier_master: called T#%d\n", global_tid));
1716   __kmp_assert_valid_gtid(global_tid);
1717 
1718   if (!TCR_4(__kmp_init_parallel))
1719     __kmp_parallel_initialize();
1720 
1721   __kmp_resume_if_soft_paused();
1722 
1723   if (__kmp_env_consistency_check)
1724     __kmp_check_barrier(global_tid, ct_barrier, loc);
1725 
1726 #if OMPT_SUPPORT
1727   ompt_frame_t *ompt_frame;
1728   if (ompt_enabled.enabled) {
1729     __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
1730     if (ompt_frame->enter_frame.ptr == NULL)
1731       ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1732   }
1733   OMPT_STORE_RETURN_ADDRESS(global_tid);
1734 #endif
1735 #if USE_ITT_NOTIFY
1736   __kmp_threads[global_tid]->th.th_ident = loc;
1737 #endif
1738   status = __kmp_barrier(bs_plain_barrier, global_tid, TRUE, 0, NULL, NULL);
1739 #if OMPT_SUPPORT && OMPT_OPTIONAL
1740   if (ompt_enabled.enabled) {
1741     ompt_frame->enter_frame = ompt_data_none;
1742   }
1743 #endif
1744 
1745   return (status != 0) ? 0 : 1;
1746 }
1747 
1748 /*!
1749 @ingroup SYNCHRONIZATION
1750 @param loc source location information
1751 @param global_tid thread id.
1752 
1753 Complete the execution of a combined barrier and master. This function should
1754 only be called at the completion of the <tt>master</tt> code. Other threads will
1755 still be waiting at the barrier and this call releases them.
1756 */
1757 void __kmpc_end_barrier_master(ident_t *loc, kmp_int32 global_tid) {
1758   KC_TRACE(10, ("__kmpc_end_barrier_master: called T#%d\n", global_tid));
1759   __kmp_assert_valid_gtid(global_tid);
1760   __kmp_end_split_barrier(bs_plain_barrier, global_tid);
1761 }
1762 
1763 /*!
1764 @ingroup SYNCHRONIZATION
1765 @param loc source location information
1766 @param global_tid thread id.
1767 @return one if the thread should execute the master block, zero otherwise
1768 
1769 Start execution of a combined barrier and master(nowait) construct.
1770 The barrier is executed inside this function.
1771 There is no equivalent "end" function, since the
1772 */
1773 kmp_int32 __kmpc_barrier_master_nowait(ident_t *loc, kmp_int32 global_tid) {
1774   kmp_int32 ret;
1775   KC_TRACE(10, ("__kmpc_barrier_master_nowait: called T#%d\n", global_tid));
1776   __kmp_assert_valid_gtid(global_tid);
1777 
1778   if (!TCR_4(__kmp_init_parallel))
1779     __kmp_parallel_initialize();
1780 
1781   __kmp_resume_if_soft_paused();
1782 
1783   if (__kmp_env_consistency_check) {
1784     if (loc == 0) {
1785       KMP_WARNING(ConstructIdentInvalid); // ??? What does it mean for the user?
1786     }
1787     __kmp_check_barrier(global_tid, ct_barrier, loc);
1788   }
1789 
1790 #if OMPT_SUPPORT
1791   ompt_frame_t *ompt_frame;
1792   if (ompt_enabled.enabled) {
1793     __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
1794     if (ompt_frame->enter_frame.ptr == NULL)
1795       ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1796   }
1797   OMPT_STORE_RETURN_ADDRESS(global_tid);
1798 #endif
1799 #if USE_ITT_NOTIFY
1800   __kmp_threads[global_tid]->th.th_ident = loc;
1801 #endif
1802   __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
1803 #if OMPT_SUPPORT && OMPT_OPTIONAL
1804   if (ompt_enabled.enabled) {
1805     ompt_frame->enter_frame = ompt_data_none;
1806   }
1807 #endif
1808 
1809   ret = __kmpc_master(loc, global_tid);
1810 
1811   if (__kmp_env_consistency_check) {
1812     /*  there's no __kmpc_end_master called; so the (stats) */
1813     /*  actions of __kmpc_end_master are done here          */
1814     if (ret) {
1815       /* only one thread should do the pop since only */
1816       /* one did the push (see __kmpc_master())       */
1817       __kmp_pop_sync(global_tid, ct_master, loc);
1818     }
1819   }
1820 
1821   return (ret);
1822 }
1823 
1824 /* The BARRIER for a SINGLE process section is always explicit   */
1825 /*!
1826 @ingroup WORK_SHARING
1827 @param loc  source location information
1828 @param global_tid  global thread number
1829 @return One if this thread should execute the single construct, zero otherwise.
1830 
1831 Test whether to execute a <tt>single</tt> construct.
1832 There are no implicit barriers in the two "single" calls, rather the compiler
1833 should introduce an explicit barrier if it is required.
1834 */
1835 
1836 kmp_int32 __kmpc_single(ident_t *loc, kmp_int32 global_tid) {
1837   __kmp_assert_valid_gtid(global_tid);
1838   kmp_int32 rc = __kmp_enter_single(global_tid, loc, TRUE);
1839 
1840   if (rc) {
1841     // We are going to execute the single statement, so we should count it.
1842     KMP_COUNT_BLOCK(OMP_SINGLE);
1843     KMP_PUSH_PARTITIONED_TIMER(OMP_single);
1844   }
1845 
1846 #if OMPT_SUPPORT && OMPT_OPTIONAL
1847   kmp_info_t *this_thr = __kmp_threads[global_tid];
1848   kmp_team_t *team = this_thr->th.th_team;
1849   int tid = __kmp_tid_from_gtid(global_tid);
1850 
1851   if (ompt_enabled.enabled) {
1852     if (rc) {
1853       if (ompt_enabled.ompt_callback_work) {
1854         ompt_callbacks.ompt_callback(ompt_callback_work)(
1855             ompt_work_single_executor, ompt_scope_begin,
1856             &(team->t.ompt_team_info.parallel_data),
1857             &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
1858             1, OMPT_GET_RETURN_ADDRESS(0));
1859       }
1860     } else {
1861       if (ompt_enabled.ompt_callback_work) {
1862         ompt_callbacks.ompt_callback(ompt_callback_work)(
1863             ompt_work_single_other, ompt_scope_begin,
1864             &(team->t.ompt_team_info.parallel_data),
1865             &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
1866             1, OMPT_GET_RETURN_ADDRESS(0));
1867         ompt_callbacks.ompt_callback(ompt_callback_work)(
1868             ompt_work_single_other, ompt_scope_end,
1869             &(team->t.ompt_team_info.parallel_data),
1870             &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
1871             1, OMPT_GET_RETURN_ADDRESS(0));
1872       }
1873     }
1874   }
1875 #endif
1876 
1877   return rc;
1878 }
1879 
1880 /*!
1881 @ingroup WORK_SHARING
1882 @param loc  source location information
1883 @param global_tid  global thread number
1884 
1885 Mark the end of a <tt>single</tt> construct.  This function should
1886 only be called by the thread that executed the block of code protected
1887 by the `single` construct.
1888 */
1889 void __kmpc_end_single(ident_t *loc, kmp_int32 global_tid) {
1890   __kmp_assert_valid_gtid(global_tid);
1891   __kmp_exit_single(global_tid);
1892   KMP_POP_PARTITIONED_TIMER();
1893 
1894 #if OMPT_SUPPORT && OMPT_OPTIONAL
1895   kmp_info_t *this_thr = __kmp_threads[global_tid];
1896   kmp_team_t *team = this_thr->th.th_team;
1897   int tid = __kmp_tid_from_gtid(global_tid);
1898 
1899   if (ompt_enabled.ompt_callback_work) {
1900     ompt_callbacks.ompt_callback(ompt_callback_work)(
1901         ompt_work_single_executor, ompt_scope_end,
1902         &(team->t.ompt_team_info.parallel_data),
1903         &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data), 1,
1904         OMPT_GET_RETURN_ADDRESS(0));
1905   }
1906 #endif
1907 }
1908 
1909 /*!
1910 @ingroup WORK_SHARING
1911 @param loc Source location
1912 @param global_tid Global thread id
1913 
1914 Mark the end of a statically scheduled loop.
1915 */
1916 void __kmpc_for_static_fini(ident_t *loc, kmp_int32 global_tid) {
1917   KMP_POP_PARTITIONED_TIMER();
1918   KE_TRACE(10, ("__kmpc_for_static_fini called T#%d\n", global_tid));
1919 
1920 #if OMPT_SUPPORT && OMPT_OPTIONAL
1921   if (ompt_enabled.ompt_callback_work) {
1922     ompt_work_t ompt_work_type = ompt_work_loop;
1923     ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
1924     ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
1925     // Determine workshare type
1926     if (loc != NULL) {
1927       if ((loc->flags & KMP_IDENT_WORK_LOOP) != 0) {
1928         ompt_work_type = ompt_work_loop;
1929       } else if ((loc->flags & KMP_IDENT_WORK_SECTIONS) != 0) {
1930         ompt_work_type = ompt_work_sections;
1931       } else if ((loc->flags & KMP_IDENT_WORK_DISTRIBUTE) != 0) {
1932         ompt_work_type = ompt_work_distribute;
1933       } else {
1934         // use default set above.
1935         // a warning about this case is provided in __kmpc_for_static_init
1936       }
1937       KMP_DEBUG_ASSERT(ompt_work_type);
1938     }
1939     ompt_callbacks.ompt_callback(ompt_callback_work)(
1940         ompt_work_type, ompt_scope_end, &(team_info->parallel_data),
1941         &(task_info->task_data), 0, OMPT_GET_RETURN_ADDRESS(0));
1942   }
1943 #endif
1944   if (__kmp_env_consistency_check)
1945     __kmp_pop_workshare(global_tid, ct_pdo, loc);
1946 }
1947 
1948 // User routines which take C-style arguments (call by value)
1949 // different from the Fortran equivalent routines
1950 
1951 void ompc_set_num_threads(int arg) {
1952   // !!!!! TODO: check the per-task binding
1953   __kmp_set_num_threads(arg, __kmp_entry_gtid());
1954 }
1955 
1956 void ompc_set_dynamic(int flag) {
1957   kmp_info_t *thread;
1958 
1959   /* For the thread-private implementation of the internal controls */
1960   thread = __kmp_entry_thread();
1961 
1962   __kmp_save_internal_controls(thread);
1963 
1964   set__dynamic(thread, flag ? true : false);
1965 }
1966 
1967 void ompc_set_nested(int flag) {
1968   kmp_info_t *thread;
1969 
1970   /* For the thread-private internal controls implementation */
1971   thread = __kmp_entry_thread();
1972 
1973   __kmp_save_internal_controls(thread);
1974 
1975   set__max_active_levels(thread, flag ? __kmp_dflt_max_active_levels : 1);
1976 }
1977 
1978 void ompc_set_max_active_levels(int max_active_levels) {
1979   /* TO DO */
1980   /* we want per-task implementation of this internal control */
1981 
1982   /* For the per-thread internal controls implementation */
1983   __kmp_set_max_active_levels(__kmp_entry_gtid(), max_active_levels);
1984 }
1985 
1986 void ompc_set_schedule(omp_sched_t kind, int modifier) {
1987   // !!!!! TODO: check the per-task binding
1988   __kmp_set_schedule(__kmp_entry_gtid(), (kmp_sched_t)kind, modifier);
1989 }
1990 
1991 int ompc_get_ancestor_thread_num(int level) {
1992   return __kmp_get_ancestor_thread_num(__kmp_entry_gtid(), level);
1993 }
1994 
1995 int ompc_get_team_size(int level) {
1996   return __kmp_get_team_size(__kmp_entry_gtid(), level);
1997 }
1998 
1999 /* OpenMP 5.0 Affinity Format API */
2000 void KMP_EXPAND_NAME(ompc_set_affinity_format)(char const *format) {
2001   if (!__kmp_init_serial) {
2002     __kmp_serial_initialize();
2003   }
2004   __kmp_strncpy_truncate(__kmp_affinity_format, KMP_AFFINITY_FORMAT_SIZE,
2005                          format, KMP_STRLEN(format) + 1);
2006 }
2007 
2008 size_t KMP_EXPAND_NAME(ompc_get_affinity_format)(char *buffer, size_t size) {
2009   size_t format_size;
2010   if (!__kmp_init_serial) {
2011     __kmp_serial_initialize();
2012   }
2013   format_size = KMP_STRLEN(__kmp_affinity_format);
2014   if (buffer && size) {
2015     __kmp_strncpy_truncate(buffer, size, __kmp_affinity_format,
2016                            format_size + 1);
2017   }
2018   return format_size;
2019 }
2020 
2021 void KMP_EXPAND_NAME(ompc_display_affinity)(char const *format) {
2022   int gtid;
2023   if (!TCR_4(__kmp_init_middle)) {
2024     __kmp_middle_initialize();
2025   }
2026   __kmp_assign_root_init_mask();
2027   gtid = __kmp_get_gtid();
2028   __kmp_aux_display_affinity(gtid, format);
2029 }
2030 
2031 size_t KMP_EXPAND_NAME(ompc_capture_affinity)(char *buffer, size_t buf_size,
2032                                               char const *format) {
2033   int gtid;
2034   size_t num_required;
2035   kmp_str_buf_t capture_buf;
2036   if (!TCR_4(__kmp_init_middle)) {
2037     __kmp_middle_initialize();
2038   }
2039   __kmp_assign_root_init_mask();
2040   gtid = __kmp_get_gtid();
2041   __kmp_str_buf_init(&capture_buf);
2042   num_required = __kmp_aux_capture_affinity(gtid, format, &capture_buf);
2043   if (buffer && buf_size) {
2044     __kmp_strncpy_truncate(buffer, buf_size, capture_buf.str,
2045                            capture_buf.used + 1);
2046   }
2047   __kmp_str_buf_free(&capture_buf);
2048   return num_required;
2049 }
2050 
2051 void kmpc_set_stacksize(int arg) {
2052   // __kmp_aux_set_stacksize initializes the library if needed
2053   __kmp_aux_set_stacksize(arg);
2054 }
2055 
2056 void kmpc_set_stacksize_s(size_t arg) {
2057   // __kmp_aux_set_stacksize initializes the library if needed
2058   __kmp_aux_set_stacksize(arg);
2059 }
2060 
2061 void kmpc_set_blocktime(int arg) {
2062   int gtid, tid;
2063   kmp_info_t *thread;
2064 
2065   gtid = __kmp_entry_gtid();
2066   tid = __kmp_tid_from_gtid(gtid);
2067   thread = __kmp_thread_from_gtid(gtid);
2068 
2069   __kmp_aux_set_blocktime(arg, thread, tid);
2070 }
2071 
2072 void kmpc_set_library(int arg) {
2073   // __kmp_user_set_library initializes the library if needed
2074   __kmp_user_set_library((enum library_type)arg);
2075 }
2076 
2077 void kmpc_set_defaults(char const *str) {
2078   // __kmp_aux_set_defaults initializes the library if needed
2079   __kmp_aux_set_defaults(str, KMP_STRLEN(str));
2080 }
2081 
2082 void kmpc_set_disp_num_buffers(int arg) {
2083   // ignore after initialization because some teams have already
2084   // allocated dispatch buffers
2085   if (__kmp_init_serial == FALSE && arg >= KMP_MIN_DISP_NUM_BUFF &&
2086       arg <= KMP_MAX_DISP_NUM_BUFF) {
2087     __kmp_dispatch_num_buffers = arg;
2088   }
2089 }
2090 
2091 int kmpc_set_affinity_mask_proc(int proc, void **mask) {
2092 #if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED
2093   return -1;
2094 #else
2095   if (!TCR_4(__kmp_init_middle)) {
2096     __kmp_middle_initialize();
2097   }
2098   __kmp_assign_root_init_mask();
2099   return __kmp_aux_set_affinity_mask_proc(proc, mask);
2100 #endif
2101 }
2102 
2103 int kmpc_unset_affinity_mask_proc(int proc, void **mask) {
2104 #if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED
2105   return -1;
2106 #else
2107   if (!TCR_4(__kmp_init_middle)) {
2108     __kmp_middle_initialize();
2109   }
2110   __kmp_assign_root_init_mask();
2111   return __kmp_aux_unset_affinity_mask_proc(proc, mask);
2112 #endif
2113 }
2114 
2115 int kmpc_get_affinity_mask_proc(int proc, void **mask) {
2116 #if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED
2117   return -1;
2118 #else
2119   if (!TCR_4(__kmp_init_middle)) {
2120     __kmp_middle_initialize();
2121   }
2122   __kmp_assign_root_init_mask();
2123   return __kmp_aux_get_affinity_mask_proc(proc, mask);
2124 #endif
2125 }
2126 
2127 /* -------------------------------------------------------------------------- */
2128 /*!
2129 @ingroup THREADPRIVATE
2130 @param loc       source location information
2131 @param gtid      global thread number
2132 @param cpy_size  size of the cpy_data buffer
2133 @param cpy_data  pointer to data to be copied
2134 @param cpy_func  helper function to call for copying data
2135 @param didit     flag variable: 1=single thread; 0=not single thread
2136 
2137 __kmpc_copyprivate implements the interface for the private data broadcast
2138 needed for the copyprivate clause associated with a single region in an
2139 OpenMP<sup>*</sup> program (both C and Fortran).
2140 All threads participating in the parallel region call this routine.
2141 One of the threads (called the single thread) should have the <tt>didit</tt>
2142 variable set to 1 and all other threads should have that variable set to 0.
2143 All threads pass a pointer to a data buffer (cpy_data) that they have built.
2144 
2145 The OpenMP specification forbids the use of nowait on the single region when a
2146 copyprivate clause is present. However, @ref __kmpc_copyprivate implements a
2147 barrier internally to avoid race conditions, so the code generation for the
2148 single region should avoid generating a barrier after the call to @ref
2149 __kmpc_copyprivate.
2150 
2151 The <tt>gtid</tt> parameter is the global thread id for the current thread.
2152 The <tt>loc</tt> parameter is a pointer to source location information.
2153 
2154 Internal implementation: The single thread will first copy its descriptor
2155 address (cpy_data) to a team-private location, then the other threads will each
2156 call the function pointed to by the parameter cpy_func, which carries out the
2157 copy by copying the data using the cpy_data buffer.
2158 
2159 The cpy_func routine used for the copy and the contents of the data area defined
2160 by cpy_data and cpy_size may be built in any fashion that will allow the copy
2161 to be done. For instance, the cpy_data buffer can hold the actual data to be
2162 copied or it may hold a list of pointers to the data. The cpy_func routine must
2163 interpret the cpy_data buffer appropriately.
2164 
2165 The interface to cpy_func is as follows:
2166 @code
2167 void cpy_func( void *destination, void *source )
2168 @endcode
2169 where void *destination is the cpy_data pointer for the thread being copied to
2170 and void *source is the cpy_data pointer for the thread being copied from.
2171 */
2172 void __kmpc_copyprivate(ident_t *loc, kmp_int32 gtid, size_t cpy_size,
2173                         void *cpy_data, void (*cpy_func)(void *, void *),
2174                         kmp_int32 didit) {
2175   void **data_ptr;
2176   KC_TRACE(10, ("__kmpc_copyprivate: called T#%d\n", gtid));
2177   __kmp_assert_valid_gtid(gtid);
2178 
2179   KMP_MB();
2180 
2181   data_ptr = &__kmp_team_from_gtid(gtid)->t.t_copypriv_data;
2182 
2183   if (__kmp_env_consistency_check) {
2184     if (loc == 0) {
2185       KMP_WARNING(ConstructIdentInvalid);
2186     }
2187   }
2188 
2189   // ToDo: Optimize the following two barriers into some kind of split barrier
2190 
2191   if (didit)
2192     *data_ptr = cpy_data;
2193 
2194 #if OMPT_SUPPORT
2195   ompt_frame_t *ompt_frame;
2196   if (ompt_enabled.enabled) {
2197     __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
2198     if (ompt_frame->enter_frame.ptr == NULL)
2199       ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
2200   }
2201   OMPT_STORE_RETURN_ADDRESS(gtid);
2202 #endif
2203 /* This barrier is not a barrier region boundary */
2204 #if USE_ITT_NOTIFY
2205   __kmp_threads[gtid]->th.th_ident = loc;
2206 #endif
2207   __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);
2208 
2209   if (!didit)
2210     (*cpy_func)(cpy_data, *data_ptr);
2211 
2212   // Consider next barrier a user-visible barrier for barrier region boundaries
2213   // Nesting checks are already handled by the single construct checks
2214   {
2215 #if OMPT_SUPPORT
2216     OMPT_STORE_RETURN_ADDRESS(gtid);
2217 #endif
2218 #if USE_ITT_NOTIFY
2219     __kmp_threads[gtid]->th.th_ident = loc; // TODO: check if it is needed (e.g.
2220 // tasks can overwrite the location)
2221 #endif
2222     __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);
2223 #if OMPT_SUPPORT && OMPT_OPTIONAL
2224     if (ompt_enabled.enabled) {
2225       ompt_frame->enter_frame = ompt_data_none;
2226     }
2227 #endif
2228   }
2229 }
2230 
2231 /* -------------------------------------------------------------------------- */
2232 
2233 #define INIT_LOCK __kmp_init_user_lock_with_checks
2234 #define INIT_NESTED_LOCK __kmp_init_nested_user_lock_with_checks
2235 #define ACQUIRE_LOCK __kmp_acquire_user_lock_with_checks
2236 #define ACQUIRE_LOCK_TIMED __kmp_acquire_user_lock_with_checks_timed
2237 #define ACQUIRE_NESTED_LOCK __kmp_acquire_nested_user_lock_with_checks
2238 #define ACQUIRE_NESTED_LOCK_TIMED                                              \
2239   __kmp_acquire_nested_user_lock_with_checks_timed
2240 #define RELEASE_LOCK __kmp_release_user_lock_with_checks
2241 #define RELEASE_NESTED_LOCK __kmp_release_nested_user_lock_with_checks
2242 #define TEST_LOCK __kmp_test_user_lock_with_checks
2243 #define TEST_NESTED_LOCK __kmp_test_nested_user_lock_with_checks
2244 #define DESTROY_LOCK __kmp_destroy_user_lock_with_checks
2245 #define DESTROY_NESTED_LOCK __kmp_destroy_nested_user_lock_with_checks
2246 
2247 // TODO: Make check abort messages use location info & pass it into
2248 // with_checks routines
2249 
2250 #if KMP_USE_DYNAMIC_LOCK
2251 
2252 // internal lock initializer
2253 static __forceinline void __kmp_init_lock_with_hint(ident_t *loc, void **lock,
2254                                                     kmp_dyna_lockseq_t seq) {
2255   if (KMP_IS_D_LOCK(seq)) {
2256     KMP_INIT_D_LOCK(lock, seq);
2257 #if USE_ITT_BUILD
2258     __kmp_itt_lock_creating((kmp_user_lock_p)lock, NULL);
2259 #endif
2260   } else {
2261     KMP_INIT_I_LOCK(lock, seq);
2262 #if USE_ITT_BUILD
2263     kmp_indirect_lock_t *ilk = KMP_LOOKUP_I_LOCK(lock);
2264     __kmp_itt_lock_creating(ilk->lock, loc);
2265 #endif
2266   }
2267 }
2268 
2269 // internal nest lock initializer
2270 static __forceinline void
2271 __kmp_init_nest_lock_with_hint(ident_t *loc, void **lock,
2272                                kmp_dyna_lockseq_t seq) {
2273 #if KMP_USE_TSX
2274   // Don't have nested lock implementation for speculative locks
2275   if (seq == lockseq_hle || seq == lockseq_rtm_queuing ||
2276       seq == lockseq_rtm_spin || seq == lockseq_adaptive)
2277     seq = __kmp_user_lock_seq;
2278 #endif
2279   switch (seq) {
2280   case lockseq_tas:
2281     seq = lockseq_nested_tas;
2282     break;
2283 #if KMP_USE_FUTEX
2284   case lockseq_futex:
2285     seq = lockseq_nested_futex;
2286     break;
2287 #endif
2288   case lockseq_ticket:
2289     seq = lockseq_nested_ticket;
2290     break;
2291   case lockseq_queuing:
2292     seq = lockseq_nested_queuing;
2293     break;
2294   case lockseq_drdpa:
2295     seq = lockseq_nested_drdpa;
2296     break;
2297   default:
2298     seq = lockseq_nested_queuing;
2299   }
2300   KMP_INIT_I_LOCK(lock, seq);
2301 #if USE_ITT_BUILD
2302   kmp_indirect_lock_t *ilk = KMP_LOOKUP_I_LOCK(lock);
2303   __kmp_itt_lock_creating(ilk->lock, loc);
2304 #endif
2305 }
2306 
2307 /* initialize the lock with a hint */
2308 void __kmpc_init_lock_with_hint(ident_t *loc, kmp_int32 gtid, void **user_lock,
2309                                 uintptr_t hint) {
2310   KMP_DEBUG_ASSERT(__kmp_init_serial);
2311   if (__kmp_env_consistency_check && user_lock == NULL) {
2312     KMP_FATAL(LockIsUninitialized, "omp_init_lock_with_hint");
2313   }
2314 
2315   __kmp_init_lock_with_hint(loc, user_lock, __kmp_map_hint_to_lock(hint));
2316 
2317 #if OMPT_SUPPORT && OMPT_OPTIONAL
2318   // This is the case, if called from omp_init_lock_with_hint:
2319   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2320   if (!codeptr)
2321     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2322   if (ompt_enabled.ompt_callback_lock_init) {
2323     ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2324         ompt_mutex_lock, (omp_lock_hint_t)hint,
2325         __ompt_get_mutex_impl_type(user_lock),
2326         (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2327   }
2328 #endif
2329 }
2330 
2331 /* initialize the lock with a hint */
2332 void __kmpc_init_nest_lock_with_hint(ident_t *loc, kmp_int32 gtid,
2333                                      void **user_lock, uintptr_t hint) {
2334   KMP_DEBUG_ASSERT(__kmp_init_serial);
2335   if (__kmp_env_consistency_check && user_lock == NULL) {
2336     KMP_FATAL(LockIsUninitialized, "omp_init_nest_lock_with_hint");
2337   }
2338 
2339   __kmp_init_nest_lock_with_hint(loc, user_lock, __kmp_map_hint_to_lock(hint));
2340 
2341 #if OMPT_SUPPORT && OMPT_OPTIONAL
2342   // This is the case, if called from omp_init_lock_with_hint:
2343   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2344   if (!codeptr)
2345     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2346   if (ompt_enabled.ompt_callback_lock_init) {
2347     ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2348         ompt_mutex_nest_lock, (omp_lock_hint_t)hint,
2349         __ompt_get_mutex_impl_type(user_lock),
2350         (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2351   }
2352 #endif
2353 }
2354 
2355 #endif // KMP_USE_DYNAMIC_LOCK
2356 
2357 /* initialize the lock */
2358 void __kmpc_init_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2359 #if KMP_USE_DYNAMIC_LOCK
2360 
2361   KMP_DEBUG_ASSERT(__kmp_init_serial);
2362   if (__kmp_env_consistency_check && user_lock == NULL) {
2363     KMP_FATAL(LockIsUninitialized, "omp_init_lock");
2364   }
2365   __kmp_init_lock_with_hint(loc, user_lock, __kmp_user_lock_seq);
2366 
2367 #if OMPT_SUPPORT && OMPT_OPTIONAL
2368   // This is the case, if called from omp_init_lock_with_hint:
2369   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2370   if (!codeptr)
2371     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2372   if (ompt_enabled.ompt_callback_lock_init) {
2373     ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2374         ompt_mutex_lock, omp_lock_hint_none,
2375         __ompt_get_mutex_impl_type(user_lock),
2376         (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2377   }
2378 #endif
2379 
2380 #else // KMP_USE_DYNAMIC_LOCK
2381 
2382   static char const *const func = "omp_init_lock";
2383   kmp_user_lock_p lck;
2384   KMP_DEBUG_ASSERT(__kmp_init_serial);
2385 
2386   if (__kmp_env_consistency_check) {
2387     if (user_lock == NULL) {
2388       KMP_FATAL(LockIsUninitialized, func);
2389     }
2390   }
2391 
2392   KMP_CHECK_USER_LOCK_INIT();
2393 
2394   if ((__kmp_user_lock_kind == lk_tas) &&
2395       (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
2396     lck = (kmp_user_lock_p)user_lock;
2397   }
2398 #if KMP_USE_FUTEX
2399   else if ((__kmp_user_lock_kind == lk_futex) &&
2400            (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
2401     lck = (kmp_user_lock_p)user_lock;
2402   }
2403 #endif
2404   else {
2405     lck = __kmp_user_lock_allocate(user_lock, gtid, 0);
2406   }
2407   INIT_LOCK(lck);
2408   __kmp_set_user_lock_location(lck, loc);
2409 
2410 #if OMPT_SUPPORT && OMPT_OPTIONAL
2411   // This is the case, if called from omp_init_lock_with_hint:
2412   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2413   if (!codeptr)
2414     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2415   if (ompt_enabled.ompt_callback_lock_init) {
2416     ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2417         ompt_mutex_lock, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
2418         (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2419   }
2420 #endif
2421 
2422 #if USE_ITT_BUILD
2423   __kmp_itt_lock_creating(lck);
2424 #endif /* USE_ITT_BUILD */
2425 
2426 #endif // KMP_USE_DYNAMIC_LOCK
2427 } // __kmpc_init_lock
2428 
2429 /* initialize the lock */
2430 void __kmpc_init_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2431 #if KMP_USE_DYNAMIC_LOCK
2432 
2433   KMP_DEBUG_ASSERT(__kmp_init_serial);
2434   if (__kmp_env_consistency_check && user_lock == NULL) {
2435     KMP_FATAL(LockIsUninitialized, "omp_init_nest_lock");
2436   }
2437   __kmp_init_nest_lock_with_hint(loc, user_lock, __kmp_user_lock_seq);
2438 
2439 #if OMPT_SUPPORT && OMPT_OPTIONAL
2440   // This is the case, if called from omp_init_lock_with_hint:
2441   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2442   if (!codeptr)
2443     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2444   if (ompt_enabled.ompt_callback_lock_init) {
2445     ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2446         ompt_mutex_nest_lock, omp_lock_hint_none,
2447         __ompt_get_mutex_impl_type(user_lock),
2448         (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2449   }
2450 #endif
2451 
2452 #else // KMP_USE_DYNAMIC_LOCK
2453 
2454   static char const *const func = "omp_init_nest_lock";
2455   kmp_user_lock_p lck;
2456   KMP_DEBUG_ASSERT(__kmp_init_serial);
2457 
2458   if (__kmp_env_consistency_check) {
2459     if (user_lock == NULL) {
2460       KMP_FATAL(LockIsUninitialized, func);
2461     }
2462   }
2463 
2464   KMP_CHECK_USER_LOCK_INIT();
2465 
2466   if ((__kmp_user_lock_kind == lk_tas) &&
2467       (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
2468        OMP_NEST_LOCK_T_SIZE)) {
2469     lck = (kmp_user_lock_p)user_lock;
2470   }
2471 #if KMP_USE_FUTEX
2472   else if ((__kmp_user_lock_kind == lk_futex) &&
2473            (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
2474             OMP_NEST_LOCK_T_SIZE)) {
2475     lck = (kmp_user_lock_p)user_lock;
2476   }
2477 #endif
2478   else {
2479     lck = __kmp_user_lock_allocate(user_lock, gtid, 0);
2480   }
2481 
2482   INIT_NESTED_LOCK(lck);
2483   __kmp_set_user_lock_location(lck, loc);
2484 
2485 #if OMPT_SUPPORT && OMPT_OPTIONAL
2486   // This is the case, if called from omp_init_lock_with_hint:
2487   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2488   if (!codeptr)
2489     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2490   if (ompt_enabled.ompt_callback_lock_init) {
2491     ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2492         ompt_mutex_nest_lock, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
2493         (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2494   }
2495 #endif
2496 
2497 #if USE_ITT_BUILD
2498   __kmp_itt_lock_creating(lck);
2499 #endif /* USE_ITT_BUILD */
2500 
2501 #endif // KMP_USE_DYNAMIC_LOCK
2502 } // __kmpc_init_nest_lock
2503 
2504 void __kmpc_destroy_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2505 #if KMP_USE_DYNAMIC_LOCK
2506 
2507 #if USE_ITT_BUILD
2508   kmp_user_lock_p lck;
2509   if (KMP_EXTRACT_D_TAG(user_lock) == 0) {
2510     lck = ((kmp_indirect_lock_t *)KMP_LOOKUP_I_LOCK(user_lock))->lock;
2511   } else {
2512     lck = (kmp_user_lock_p)user_lock;
2513   }
2514   __kmp_itt_lock_destroyed(lck);
2515 #endif
2516 #if OMPT_SUPPORT && OMPT_OPTIONAL
2517   // This is the case, if called from omp_init_lock_with_hint:
2518   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2519   if (!codeptr)
2520     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2521   if (ompt_enabled.ompt_callback_lock_destroy) {
2522     ompt_callbacks.ompt_callback(ompt_callback_lock_destroy)(
2523         ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2524   }
2525 #endif
2526   KMP_D_LOCK_FUNC(user_lock, destroy)((kmp_dyna_lock_t *)user_lock);
2527 #else
2528   kmp_user_lock_p lck;
2529 
2530   if ((__kmp_user_lock_kind == lk_tas) &&
2531       (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
2532     lck = (kmp_user_lock_p)user_lock;
2533   }
2534 #if KMP_USE_FUTEX
2535   else if ((__kmp_user_lock_kind == lk_futex) &&
2536            (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
2537     lck = (kmp_user_lock_p)user_lock;
2538   }
2539 #endif
2540   else {
2541     lck = __kmp_lookup_user_lock(user_lock, "omp_destroy_lock");
2542   }
2543 
2544 #if OMPT_SUPPORT && OMPT_OPTIONAL
2545   // This is the case, if called from omp_init_lock_with_hint:
2546   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2547   if (!codeptr)
2548     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2549   if (ompt_enabled.ompt_callback_lock_destroy) {
2550     ompt_callbacks.ompt_callback(ompt_callback_lock_destroy)(
2551         ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2552   }
2553 #endif
2554 
2555 #if USE_ITT_BUILD
2556   __kmp_itt_lock_destroyed(lck);
2557 #endif /* USE_ITT_BUILD */
2558   DESTROY_LOCK(lck);
2559 
2560   if ((__kmp_user_lock_kind == lk_tas) &&
2561       (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
2562     ;
2563   }
2564 #if KMP_USE_FUTEX
2565   else if ((__kmp_user_lock_kind == lk_futex) &&
2566            (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
2567     ;
2568   }
2569 #endif
2570   else {
2571     __kmp_user_lock_free(user_lock, gtid, lck);
2572   }
2573 #endif // KMP_USE_DYNAMIC_LOCK
2574 } // __kmpc_destroy_lock
2575 
2576 /* destroy the lock */
2577 void __kmpc_destroy_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2578 #if KMP_USE_DYNAMIC_LOCK
2579 
2580 #if USE_ITT_BUILD
2581   kmp_indirect_lock_t *ilk = KMP_LOOKUP_I_LOCK(user_lock);
2582   __kmp_itt_lock_destroyed(ilk->lock);
2583 #endif
2584 #if OMPT_SUPPORT && OMPT_OPTIONAL
2585   // This is the case, if called from omp_init_lock_with_hint:
2586   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2587   if (!codeptr)
2588     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2589   if (ompt_enabled.ompt_callback_lock_destroy) {
2590     ompt_callbacks.ompt_callback(ompt_callback_lock_destroy)(
2591         ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2592   }
2593 #endif
2594   KMP_D_LOCK_FUNC(user_lock, destroy)((kmp_dyna_lock_t *)user_lock);
2595 
2596 #else // KMP_USE_DYNAMIC_LOCK
2597 
2598   kmp_user_lock_p lck;
2599 
2600   if ((__kmp_user_lock_kind == lk_tas) &&
2601       (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
2602        OMP_NEST_LOCK_T_SIZE)) {
2603     lck = (kmp_user_lock_p)user_lock;
2604   }
2605 #if KMP_USE_FUTEX
2606   else if ((__kmp_user_lock_kind == lk_futex) &&
2607            (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
2608             OMP_NEST_LOCK_T_SIZE)) {
2609     lck = (kmp_user_lock_p)user_lock;
2610   }
2611 #endif
2612   else {
2613     lck = __kmp_lookup_user_lock(user_lock, "omp_destroy_nest_lock");
2614   }
2615 
2616 #if OMPT_SUPPORT && OMPT_OPTIONAL
2617   // This is the case, if called from omp_init_lock_with_hint:
2618   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2619   if (!codeptr)
2620     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2621   if (ompt_enabled.ompt_callback_lock_destroy) {
2622     ompt_callbacks.ompt_callback(ompt_callback_lock_destroy)(
2623         ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2624   }
2625 #endif
2626 
2627 #if USE_ITT_BUILD
2628   __kmp_itt_lock_destroyed(lck);
2629 #endif /* USE_ITT_BUILD */
2630 
2631   DESTROY_NESTED_LOCK(lck);
2632 
2633   if ((__kmp_user_lock_kind == lk_tas) &&
2634       (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
2635        OMP_NEST_LOCK_T_SIZE)) {
2636     ;
2637   }
2638 #if KMP_USE_FUTEX
2639   else if ((__kmp_user_lock_kind == lk_futex) &&
2640            (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
2641             OMP_NEST_LOCK_T_SIZE)) {
2642     ;
2643   }
2644 #endif
2645   else {
2646     __kmp_user_lock_free(user_lock, gtid, lck);
2647   }
2648 #endif // KMP_USE_DYNAMIC_LOCK
2649 } // __kmpc_destroy_nest_lock
2650 
2651 void __kmpc_set_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2652   KMP_COUNT_BLOCK(OMP_set_lock);
2653 #if KMP_USE_DYNAMIC_LOCK
2654   int tag = KMP_EXTRACT_D_TAG(user_lock);
2655 #if USE_ITT_BUILD
2656   __kmp_itt_lock_acquiring(
2657       (kmp_user_lock_p)
2658           user_lock); // itt function will get to the right lock object.
2659 #endif
2660 #if OMPT_SUPPORT && OMPT_OPTIONAL
2661   // This is the case, if called from omp_init_lock_with_hint:
2662   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2663   if (!codeptr)
2664     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2665   if (ompt_enabled.ompt_callback_mutex_acquire) {
2666     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
2667         ompt_mutex_lock, omp_lock_hint_none,
2668         __ompt_get_mutex_impl_type(user_lock),
2669         (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2670   }
2671 #endif
2672 #if KMP_USE_INLINED_TAS
2673   if (tag == locktag_tas && !__kmp_env_consistency_check) {
2674     KMP_ACQUIRE_TAS_LOCK(user_lock, gtid);
2675   } else
2676 #elif KMP_USE_INLINED_FUTEX
2677   if (tag == locktag_futex && !__kmp_env_consistency_check) {
2678     KMP_ACQUIRE_FUTEX_LOCK(user_lock, gtid);
2679   } else
2680 #endif
2681   {
2682     __kmp_direct_set[tag]((kmp_dyna_lock_t *)user_lock, gtid);
2683   }
2684 #if USE_ITT_BUILD
2685   __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock);
2686 #endif
2687 #if OMPT_SUPPORT && OMPT_OPTIONAL
2688   if (ompt_enabled.ompt_callback_mutex_acquired) {
2689     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
2690         ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2691   }
2692 #endif
2693 
2694 #else // KMP_USE_DYNAMIC_LOCK
2695 
2696   kmp_user_lock_p lck;
2697 
2698   if ((__kmp_user_lock_kind == lk_tas) &&
2699       (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
2700     lck = (kmp_user_lock_p)user_lock;
2701   }
2702 #if KMP_USE_FUTEX
2703   else if ((__kmp_user_lock_kind == lk_futex) &&
2704            (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
2705     lck = (kmp_user_lock_p)user_lock;
2706   }
2707 #endif
2708   else {
2709     lck = __kmp_lookup_user_lock(user_lock, "omp_set_lock");
2710   }
2711 
2712 #if USE_ITT_BUILD
2713   __kmp_itt_lock_acquiring(lck);
2714 #endif /* USE_ITT_BUILD */
2715 #if OMPT_SUPPORT && OMPT_OPTIONAL
2716   // This is the case, if called from omp_init_lock_with_hint:
2717   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2718   if (!codeptr)
2719     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2720   if (ompt_enabled.ompt_callback_mutex_acquire) {
2721     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
2722         ompt_mutex_lock, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
2723         (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2724   }
2725 #endif
2726 
2727   ACQUIRE_LOCK(lck, gtid);
2728 
2729 #if USE_ITT_BUILD
2730   __kmp_itt_lock_acquired(lck);
2731 #endif /* USE_ITT_BUILD */
2732 
2733 #if OMPT_SUPPORT && OMPT_OPTIONAL
2734   if (ompt_enabled.ompt_callback_mutex_acquired) {
2735     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
2736         ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2737   }
2738 #endif
2739 
2740 #endif // KMP_USE_DYNAMIC_LOCK
2741 }
2742 
2743 void __kmpc_set_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2744 #if KMP_USE_DYNAMIC_LOCK
2745 
2746 #if USE_ITT_BUILD
2747   __kmp_itt_lock_acquiring((kmp_user_lock_p)user_lock);
2748 #endif
2749 #if OMPT_SUPPORT && OMPT_OPTIONAL
2750   // This is the case, if called from omp_init_lock_with_hint:
2751   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2752   if (!codeptr)
2753     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2754   if (ompt_enabled.enabled) {
2755     if (ompt_enabled.ompt_callback_mutex_acquire) {
2756       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
2757           ompt_mutex_nest_lock, omp_lock_hint_none,
2758           __ompt_get_mutex_impl_type(user_lock),
2759           (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2760     }
2761   }
2762 #endif
2763   int acquire_status =
2764       KMP_D_LOCK_FUNC(user_lock, set)((kmp_dyna_lock_t *)user_lock, gtid);
2765   (void)acquire_status;
2766 #if USE_ITT_BUILD
2767   __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock);
2768 #endif
2769 
2770 #if OMPT_SUPPORT && OMPT_OPTIONAL
2771   if (ompt_enabled.enabled) {
2772     if (acquire_status == KMP_LOCK_ACQUIRED_FIRST) {
2773       if (ompt_enabled.ompt_callback_mutex_acquired) {
2774         // lock_first
2775         ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
2776             ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)user_lock,
2777             codeptr);
2778       }
2779     } else {
2780       if (ompt_enabled.ompt_callback_nest_lock) {
2781         // lock_next
2782         ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
2783             ompt_scope_begin, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2784       }
2785     }
2786   }
2787 #endif
2788 
2789 #else // KMP_USE_DYNAMIC_LOCK
2790   int acquire_status;
2791   kmp_user_lock_p lck;
2792 
2793   if ((__kmp_user_lock_kind == lk_tas) &&
2794       (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
2795        OMP_NEST_LOCK_T_SIZE)) {
2796     lck = (kmp_user_lock_p)user_lock;
2797   }
2798 #if KMP_USE_FUTEX
2799   else if ((__kmp_user_lock_kind == lk_futex) &&
2800            (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
2801             OMP_NEST_LOCK_T_SIZE)) {
2802     lck = (kmp_user_lock_p)user_lock;
2803   }
2804 #endif
2805   else {
2806     lck = __kmp_lookup_user_lock(user_lock, "omp_set_nest_lock");
2807   }
2808 
2809 #if USE_ITT_BUILD
2810   __kmp_itt_lock_acquiring(lck);
2811 #endif /* USE_ITT_BUILD */
2812 #if OMPT_SUPPORT && OMPT_OPTIONAL
2813   // This is the case, if called from omp_init_lock_with_hint:
2814   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2815   if (!codeptr)
2816     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2817   if (ompt_enabled.enabled) {
2818     if (ompt_enabled.ompt_callback_mutex_acquire) {
2819       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
2820           ompt_mutex_nest_lock, omp_lock_hint_none,
2821           __ompt_get_mutex_impl_type(), (ompt_wait_id_t)(uintptr_t)lck,
2822           codeptr);
2823     }
2824   }
2825 #endif
2826 
2827   ACQUIRE_NESTED_LOCK(lck, gtid, &acquire_status);
2828 
2829 #if USE_ITT_BUILD
2830   __kmp_itt_lock_acquired(lck);
2831 #endif /* USE_ITT_BUILD */
2832 
2833 #if OMPT_SUPPORT && OMPT_OPTIONAL
2834   if (ompt_enabled.enabled) {
2835     if (acquire_status == KMP_LOCK_ACQUIRED_FIRST) {
2836       if (ompt_enabled.ompt_callback_mutex_acquired) {
2837         // lock_first
2838         ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
2839             ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2840       }
2841     } else {
2842       if (ompt_enabled.ompt_callback_nest_lock) {
2843         // lock_next
2844         ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
2845             ompt_scope_begin, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2846       }
2847     }
2848   }
2849 #endif
2850 
2851 #endif // KMP_USE_DYNAMIC_LOCK
2852 }
2853 
2854 void __kmpc_unset_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2855 #if KMP_USE_DYNAMIC_LOCK
2856 
2857   int tag = KMP_EXTRACT_D_TAG(user_lock);
2858 #if USE_ITT_BUILD
2859   __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock);
2860 #endif
2861 #if KMP_USE_INLINED_TAS
2862   if (tag == locktag_tas && !__kmp_env_consistency_check) {
2863     KMP_RELEASE_TAS_LOCK(user_lock, gtid);
2864   } else
2865 #elif KMP_USE_INLINED_FUTEX
2866   if (tag == locktag_futex && !__kmp_env_consistency_check) {
2867     KMP_RELEASE_FUTEX_LOCK(user_lock, gtid);
2868   } else
2869 #endif
2870   {
2871     __kmp_direct_unset[tag]((kmp_dyna_lock_t *)user_lock, gtid);
2872   }
2873 
2874 #if OMPT_SUPPORT && OMPT_OPTIONAL
2875   // This is the case, if called from omp_init_lock_with_hint:
2876   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2877   if (!codeptr)
2878     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2879   if (ompt_enabled.ompt_callback_mutex_released) {
2880     ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
2881         ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2882   }
2883 #endif
2884 
2885 #else // KMP_USE_DYNAMIC_LOCK
2886 
2887   kmp_user_lock_p lck;
2888 
2889   /* Can't use serial interval since not block structured */
2890   /* release the lock */
2891 
2892   if ((__kmp_user_lock_kind == lk_tas) &&
2893       (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
2894 #if KMP_OS_LINUX &&                                                            \
2895     (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
2896 // "fast" path implemented to fix customer performance issue
2897 #if USE_ITT_BUILD
2898     __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock);
2899 #endif /* USE_ITT_BUILD */
2900     TCW_4(((kmp_user_lock_p)user_lock)->tas.lk.poll, 0);
2901     KMP_MB();
2902 
2903 #if OMPT_SUPPORT && OMPT_OPTIONAL
2904     // This is the case, if called from omp_init_lock_with_hint:
2905     void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2906     if (!codeptr)
2907       codeptr = OMPT_GET_RETURN_ADDRESS(0);
2908     if (ompt_enabled.ompt_callback_mutex_released) {
2909       ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
2910           ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2911     }
2912 #endif
2913 
2914     return;
2915 #else
2916     lck = (kmp_user_lock_p)user_lock;
2917 #endif
2918   }
2919 #if KMP_USE_FUTEX
2920   else if ((__kmp_user_lock_kind == lk_futex) &&
2921            (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
2922     lck = (kmp_user_lock_p)user_lock;
2923   }
2924 #endif
2925   else {
2926     lck = __kmp_lookup_user_lock(user_lock, "omp_unset_lock");
2927   }
2928 
2929 #if USE_ITT_BUILD
2930   __kmp_itt_lock_releasing(lck);
2931 #endif /* USE_ITT_BUILD */
2932 
2933   RELEASE_LOCK(lck, gtid);
2934 
2935 #if OMPT_SUPPORT && OMPT_OPTIONAL
2936   // This is the case, if called from omp_init_lock_with_hint:
2937   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2938   if (!codeptr)
2939     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2940   if (ompt_enabled.ompt_callback_mutex_released) {
2941     ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
2942         ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2943   }
2944 #endif
2945 
2946 #endif // KMP_USE_DYNAMIC_LOCK
2947 }
2948 
2949 /* release the lock */
2950 void __kmpc_unset_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2951 #if KMP_USE_DYNAMIC_LOCK
2952 
2953 #if USE_ITT_BUILD
2954   __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock);
2955 #endif
2956   int release_status =
2957       KMP_D_LOCK_FUNC(user_lock, unset)((kmp_dyna_lock_t *)user_lock, gtid);
2958   (void)release_status;
2959 
2960 #if OMPT_SUPPORT && OMPT_OPTIONAL
2961   // This is the case, if called from omp_init_lock_with_hint:
2962   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2963   if (!codeptr)
2964     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2965   if (ompt_enabled.enabled) {
2966     if (release_status == KMP_LOCK_RELEASED) {
2967       if (ompt_enabled.ompt_callback_mutex_released) {
2968         // release_lock_last
2969         ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
2970             ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)user_lock,
2971             codeptr);
2972       }
2973     } else if (ompt_enabled.ompt_callback_nest_lock) {
2974       // release_lock_prev
2975       ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
2976           ompt_scope_end, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2977     }
2978   }
2979 #endif
2980 
2981 #else // KMP_USE_DYNAMIC_LOCK
2982 
2983   kmp_user_lock_p lck;
2984 
2985   /* Can't use serial interval since not block structured */
2986 
2987   if ((__kmp_user_lock_kind == lk_tas) &&
2988       (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
2989        OMP_NEST_LOCK_T_SIZE)) {
2990 #if KMP_OS_LINUX &&                                                            \
2991     (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
2992     // "fast" path implemented to fix customer performance issue
2993     kmp_tas_lock_t *tl = (kmp_tas_lock_t *)user_lock;
2994 #if USE_ITT_BUILD
2995     __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock);
2996 #endif /* USE_ITT_BUILD */
2997 
2998 #if OMPT_SUPPORT && OMPT_OPTIONAL
2999     int release_status = KMP_LOCK_STILL_HELD;
3000 #endif
3001 
3002     if (--(tl->lk.depth_locked) == 0) {
3003       TCW_4(tl->lk.poll, 0);
3004 #if OMPT_SUPPORT && OMPT_OPTIONAL
3005       release_status = KMP_LOCK_RELEASED;
3006 #endif
3007     }
3008     KMP_MB();
3009 
3010 #if OMPT_SUPPORT && OMPT_OPTIONAL
3011     // This is the case, if called from omp_init_lock_with_hint:
3012     void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
3013     if (!codeptr)
3014       codeptr = OMPT_GET_RETURN_ADDRESS(0);
3015     if (ompt_enabled.enabled) {
3016       if (release_status == KMP_LOCK_RELEASED) {
3017         if (ompt_enabled.ompt_callback_mutex_released) {
3018           // release_lock_last
3019           ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
3020               ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
3021         }
3022       } else if (ompt_enabled.ompt_callback_nest_lock) {
3023         // release_lock_previous
3024         ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
3025             ompt_mutex_scope_end, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
3026       }
3027     }
3028 #endif
3029 
3030     return;
3031 #else
3032     lck = (kmp_user_lock_p)user_lock;
3033 #endif
3034   }
3035 #if KMP_USE_FUTEX
3036   else if ((__kmp_user_lock_kind == lk_futex) &&
3037            (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
3038             OMP_NEST_LOCK_T_SIZE)) {
3039     lck = (kmp_user_lock_p)user_lock;
3040   }
3041 #endif
3042   else {
3043     lck = __kmp_lookup_user_lock(user_lock, "omp_unset_nest_lock");
3044   }
3045 
3046 #if USE_ITT_BUILD
3047   __kmp_itt_lock_releasing(lck);
3048 #endif /* USE_ITT_BUILD */
3049 
3050   int release_status;
3051   release_status = RELEASE_NESTED_LOCK(lck, gtid);
3052 #if OMPT_SUPPORT && OMPT_OPTIONAL
3053   // This is the case, if called from omp_init_lock_with_hint:
3054   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
3055   if (!codeptr)
3056     codeptr = OMPT_GET_RETURN_ADDRESS(0);
3057   if (ompt_enabled.enabled) {
3058     if (release_status == KMP_LOCK_RELEASED) {
3059       if (ompt_enabled.ompt_callback_mutex_released) {
3060         // release_lock_last
3061         ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
3062             ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
3063       }
3064     } else if (ompt_enabled.ompt_callback_nest_lock) {
3065       // release_lock_previous
3066       ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
3067           ompt_mutex_scope_end, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
3068     }
3069   }
3070 #endif
3071 
3072 #endif // KMP_USE_DYNAMIC_LOCK
3073 }
3074 
3075 /* try to acquire the lock */
3076 int __kmpc_test_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
3077   KMP_COUNT_BLOCK(OMP_test_lock);
3078 
3079 #if KMP_USE_DYNAMIC_LOCK
3080   int rc;
3081   int tag = KMP_EXTRACT_D_TAG(user_lock);
3082 #if USE_ITT_BUILD
3083   __kmp_itt_lock_acquiring((kmp_user_lock_p)user_lock);
3084 #endif
3085 #if OMPT_SUPPORT && OMPT_OPTIONAL
3086   // This is the case, if called from omp_init_lock_with_hint:
3087   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
3088   if (!codeptr)
3089     codeptr = OMPT_GET_RETURN_ADDRESS(0);
3090   if (ompt_enabled.ompt_callback_mutex_acquire) {
3091     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
3092         ompt_mutex_lock, omp_lock_hint_none,
3093         __ompt_get_mutex_impl_type(user_lock),
3094         (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
3095   }
3096 #endif
3097 #if KMP_USE_INLINED_TAS
3098   if (tag == locktag_tas && !__kmp_env_consistency_check) {
3099     KMP_TEST_TAS_LOCK(user_lock, gtid, rc);
3100   } else
3101 #elif KMP_USE_INLINED_FUTEX
3102   if (tag == locktag_futex && !__kmp_env_consistency_check) {
3103     KMP_TEST_FUTEX_LOCK(user_lock, gtid, rc);
3104   } else
3105 #endif
3106   {
3107     rc = __kmp_direct_test[tag]((kmp_dyna_lock_t *)user_lock, gtid);
3108   }
3109   if (rc) {
3110 #if USE_ITT_BUILD
3111     __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock);
3112 #endif
3113 #if OMPT_SUPPORT && OMPT_OPTIONAL
3114     if (ompt_enabled.ompt_callback_mutex_acquired) {
3115       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
3116           ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
3117     }
3118 #endif
3119     return FTN_TRUE;
3120   } else {
3121 #if USE_ITT_BUILD
3122     __kmp_itt_lock_cancelled((kmp_user_lock_p)user_lock);
3123 #endif
3124     return FTN_FALSE;
3125   }
3126 
3127 #else // KMP_USE_DYNAMIC_LOCK
3128 
3129   kmp_user_lock_p lck;
3130   int rc;
3131 
3132   if ((__kmp_user_lock_kind == lk_tas) &&
3133       (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
3134     lck = (kmp_user_lock_p)user_lock;
3135   }
3136 #if KMP_USE_FUTEX
3137   else if ((__kmp_user_lock_kind == lk_futex) &&
3138            (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
3139     lck = (kmp_user_lock_p)user_lock;
3140   }
3141 #endif
3142   else {
3143     lck = __kmp_lookup_user_lock(user_lock, "omp_test_lock");
3144   }
3145 
3146 #if USE_ITT_BUILD
3147   __kmp_itt_lock_acquiring(lck);
3148 #endif /* USE_ITT_BUILD */
3149 #if OMPT_SUPPORT && OMPT_OPTIONAL
3150   // This is the case, if called from omp_init_lock_with_hint:
3151   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
3152   if (!codeptr)
3153     codeptr = OMPT_GET_RETURN_ADDRESS(0);
3154   if (ompt_enabled.ompt_callback_mutex_acquire) {
3155     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
3156         ompt_mutex_lock, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
3157         (ompt_wait_id_t)(uintptr_t)lck, codeptr);
3158   }
3159 #endif
3160 
3161   rc = TEST_LOCK(lck, gtid);
3162 #if USE_ITT_BUILD
3163   if (rc) {
3164     __kmp_itt_lock_acquired(lck);
3165   } else {
3166     __kmp_itt_lock_cancelled(lck);
3167   }
3168 #endif /* USE_ITT_BUILD */
3169 #if OMPT_SUPPORT && OMPT_OPTIONAL
3170   if (rc && ompt_enabled.ompt_callback_mutex_acquired) {
3171     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
3172         ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
3173   }
3174 #endif
3175 
3176   return (rc ? FTN_TRUE : FTN_FALSE);
3177 
3178   /* Can't use serial interval since not block structured */
3179 
3180 #endif // KMP_USE_DYNAMIC_LOCK
3181 }
3182 
3183 /* try to acquire the lock */
3184 int __kmpc_test_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
3185 #if KMP_USE_DYNAMIC_LOCK
3186   int rc;
3187 #if USE_ITT_BUILD
3188   __kmp_itt_lock_acquiring((kmp_user_lock_p)user_lock);
3189 #endif
3190 #if OMPT_SUPPORT && OMPT_OPTIONAL
3191   // This is the case, if called from omp_init_lock_with_hint:
3192   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
3193   if (!codeptr)
3194     codeptr = OMPT_GET_RETURN_ADDRESS(0);
3195   if (ompt_enabled.ompt_callback_mutex_acquire) {
3196     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
3197         ompt_mutex_nest_lock, omp_lock_hint_none,
3198         __ompt_get_mutex_impl_type(user_lock),
3199         (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
3200   }
3201 #endif
3202   rc = KMP_D_LOCK_FUNC(user_lock, test)((kmp_dyna_lock_t *)user_lock, gtid);
3203 #if USE_ITT_BUILD
3204   if (rc) {
3205     __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock);
3206   } else {
3207     __kmp_itt_lock_cancelled((kmp_user_lock_p)user_lock);
3208   }
3209 #endif
3210 #if OMPT_SUPPORT && OMPT_OPTIONAL
3211   if (ompt_enabled.enabled && rc) {
3212     if (rc == 1) {
3213       if (ompt_enabled.ompt_callback_mutex_acquired) {
3214         // lock_first
3215         ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
3216             ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)user_lock,
3217             codeptr);
3218       }
3219     } else {
3220       if (ompt_enabled.ompt_callback_nest_lock) {
3221         // lock_next
3222         ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
3223             ompt_scope_begin, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
3224       }
3225     }
3226   }
3227 #endif
3228   return rc;
3229 
3230 #else // KMP_USE_DYNAMIC_LOCK
3231 
3232   kmp_user_lock_p lck;
3233   int rc;
3234 
3235   if ((__kmp_user_lock_kind == lk_tas) &&
3236       (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
3237        OMP_NEST_LOCK_T_SIZE)) {
3238     lck = (kmp_user_lock_p)user_lock;
3239   }
3240 #if KMP_USE_FUTEX
3241   else if ((__kmp_user_lock_kind == lk_futex) &&
3242            (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
3243             OMP_NEST_LOCK_T_SIZE)) {
3244     lck = (kmp_user_lock_p)user_lock;
3245   }
3246 #endif
3247   else {
3248     lck = __kmp_lookup_user_lock(user_lock, "omp_test_nest_lock");
3249   }
3250 
3251 #if USE_ITT_BUILD
3252   __kmp_itt_lock_acquiring(lck);
3253 #endif /* USE_ITT_BUILD */
3254 
3255 #if OMPT_SUPPORT && OMPT_OPTIONAL
3256   // This is the case, if called from omp_init_lock_with_hint:
3257   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
3258   if (!codeptr)
3259     codeptr = OMPT_GET_RETURN_ADDRESS(0);
3260   if (ompt_enabled.enabled) &&
3261         ompt_enabled.ompt_callback_mutex_acquire) {
3262       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
3263           ompt_mutex_nest_lock, omp_lock_hint_none,
3264           __ompt_get_mutex_impl_type(), (ompt_wait_id_t)(uintptr_t)lck,
3265           codeptr);
3266     }
3267 #endif
3268 
3269   rc = TEST_NESTED_LOCK(lck, gtid);
3270 #if USE_ITT_BUILD
3271   if (rc) {
3272     __kmp_itt_lock_acquired(lck);
3273   } else {
3274     __kmp_itt_lock_cancelled(lck);
3275   }
3276 #endif /* USE_ITT_BUILD */
3277 #if OMPT_SUPPORT && OMPT_OPTIONAL
3278   if (ompt_enabled.enabled && rc) {
3279     if (rc == 1) {
3280       if (ompt_enabled.ompt_callback_mutex_acquired) {
3281         // lock_first
3282         ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
3283             ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
3284       }
3285     } else {
3286       if (ompt_enabled.ompt_callback_nest_lock) {
3287         // lock_next
3288         ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
3289             ompt_mutex_scope_begin, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
3290       }
3291     }
3292   }
3293 #endif
3294   return rc;
3295 
3296   /* Can't use serial interval since not block structured */
3297 
3298 #endif // KMP_USE_DYNAMIC_LOCK
3299 }
3300 
3301 // Interface to fast scalable reduce methods routines
3302 
3303 // keep the selected method in a thread local structure for cross-function
3304 // usage: will be used in __kmpc_end_reduce* functions;
3305 // another solution: to re-determine the method one more time in
3306 // __kmpc_end_reduce* functions (new prototype required then)
3307 // AT: which solution is better?
3308 #define __KMP_SET_REDUCTION_METHOD(gtid, rmethod)                              \
3309   ((__kmp_threads[(gtid)]->th.th_local.packed_reduction_method) = (rmethod))
3310 
3311 #define __KMP_GET_REDUCTION_METHOD(gtid)                                       \
3312   (__kmp_threads[(gtid)]->th.th_local.packed_reduction_method)
3313 
3314 // description of the packed_reduction_method variable: look at the macros in
3315 // kmp.h
3316 
3317 // used in a critical section reduce block
3318 static __forceinline void
3319 __kmp_enter_critical_section_reduce_block(ident_t *loc, kmp_int32 global_tid,
3320                                           kmp_critical_name *crit) {
3321 
3322   // this lock was visible to a customer and to the threading profile tool as a
3323   // serial overhead span (although it's used for an internal purpose only)
3324   //            why was it visible in previous implementation?
3325   //            should we keep it visible in new reduce block?
3326   kmp_user_lock_p lck;
3327 
3328 #if KMP_USE_DYNAMIC_LOCK
3329 
3330   kmp_dyna_lock_t *lk = (kmp_dyna_lock_t *)crit;
3331   // Check if it is initialized.
3332   if (*lk == 0) {
3333     if (KMP_IS_D_LOCK(__kmp_user_lock_seq)) {
3334       KMP_COMPARE_AND_STORE_ACQ32((volatile kmp_int32 *)crit, 0,
3335                                   KMP_GET_D_TAG(__kmp_user_lock_seq));
3336     } else {
3337       __kmp_init_indirect_csptr(crit, loc, global_tid,
3338                                 KMP_GET_I_TAG(__kmp_user_lock_seq));
3339     }
3340   }
3341   // Branch for accessing the actual lock object and set operation. This
3342   // branching is inevitable since this lock initialization does not follow the
3343   // normal dispatch path (lock table is not used).
3344   if (KMP_EXTRACT_D_TAG(lk) != 0) {
3345     lck = (kmp_user_lock_p)lk;
3346     KMP_DEBUG_ASSERT(lck != NULL);
3347     if (__kmp_env_consistency_check) {
3348       __kmp_push_sync(global_tid, ct_critical, loc, lck, __kmp_user_lock_seq);
3349     }
3350     KMP_D_LOCK_FUNC(lk, set)(lk, global_tid);
3351   } else {
3352     kmp_indirect_lock_t *ilk = *((kmp_indirect_lock_t **)lk);
3353     lck = ilk->lock;
3354     KMP_DEBUG_ASSERT(lck != NULL);
3355     if (__kmp_env_consistency_check) {
3356       __kmp_push_sync(global_tid, ct_critical, loc, lck, __kmp_user_lock_seq);
3357     }
3358     KMP_I_LOCK_FUNC(ilk, set)(lck, global_tid);
3359   }
3360 
3361 #else // KMP_USE_DYNAMIC_LOCK
3362 
3363   // We know that the fast reduction code is only emitted by Intel compilers
3364   // with 32 byte critical sections. If there isn't enough space, then we
3365   // have to use a pointer.
3366   if (__kmp_base_user_lock_size <= INTEL_CRITICAL_SIZE) {
3367     lck = (kmp_user_lock_p)crit;
3368   } else {
3369     lck = __kmp_get_critical_section_ptr(crit, loc, global_tid);
3370   }
3371   KMP_DEBUG_ASSERT(lck != NULL);
3372 
3373   if (__kmp_env_consistency_check)
3374     __kmp_push_sync(global_tid, ct_critical, loc, lck);
3375 
3376   __kmp_acquire_user_lock_with_checks(lck, global_tid);
3377 
3378 #endif // KMP_USE_DYNAMIC_LOCK
3379 }
3380 
3381 // used in a critical section reduce block
3382 static __forceinline void
3383 __kmp_end_critical_section_reduce_block(ident_t *loc, kmp_int32 global_tid,
3384                                         kmp_critical_name *crit) {
3385 
3386   kmp_user_lock_p lck;
3387 
3388 #if KMP_USE_DYNAMIC_LOCK
3389 
3390   if (KMP_IS_D_LOCK(__kmp_user_lock_seq)) {
3391     lck = (kmp_user_lock_p)crit;
3392     if (__kmp_env_consistency_check)
3393       __kmp_pop_sync(global_tid, ct_critical, loc);
3394     KMP_D_LOCK_FUNC(lck, unset)((kmp_dyna_lock_t *)lck, global_tid);
3395   } else {
3396     kmp_indirect_lock_t *ilk =
3397         (kmp_indirect_lock_t *)TCR_PTR(*((kmp_indirect_lock_t **)crit));
3398     if (__kmp_env_consistency_check)
3399       __kmp_pop_sync(global_tid, ct_critical, loc);
3400     KMP_I_LOCK_FUNC(ilk, unset)(ilk->lock, global_tid);
3401   }
3402 
3403 #else // KMP_USE_DYNAMIC_LOCK
3404 
3405   // We know that the fast reduction code is only emitted by Intel compilers
3406   // with 32 byte critical sections. If there isn't enough space, then we have
3407   // to use a pointer.
3408   if (__kmp_base_user_lock_size > 32) {
3409     lck = *((kmp_user_lock_p *)crit);
3410     KMP_ASSERT(lck != NULL);
3411   } else {
3412     lck = (kmp_user_lock_p)crit;
3413   }
3414 
3415   if (__kmp_env_consistency_check)
3416     __kmp_pop_sync(global_tid, ct_critical, loc);
3417 
3418   __kmp_release_user_lock_with_checks(lck, global_tid);
3419 
3420 #endif // KMP_USE_DYNAMIC_LOCK
3421 } // __kmp_end_critical_section_reduce_block
3422 
3423 static __forceinline int
3424 __kmp_swap_teams_for_teams_reduction(kmp_info_t *th, kmp_team_t **team_p,
3425                                      int *task_state) {
3426   kmp_team_t *team;
3427 
3428   // Check if we are inside the teams construct?
3429   if (th->th.th_teams_microtask) {
3430     *team_p = team = th->th.th_team;
3431     if (team->t.t_level == th->th.th_teams_level) {
3432       // This is reduction at teams construct.
3433       KMP_DEBUG_ASSERT(!th->th.th_info.ds.ds_tid); // AC: check that tid == 0
3434       // Let's swap teams temporarily for the reduction.
3435       th->th.th_info.ds.ds_tid = team->t.t_master_tid;
3436       th->th.th_team = team->t.t_parent;
3437       th->th.th_team_nproc = th->th.th_team->t.t_nproc;
3438       th->th.th_task_team = th->th.th_team->t.t_task_team[0];
3439       *task_state = th->th.th_task_state;
3440       th->th.th_task_state = 0;
3441 
3442       return 1;
3443     }
3444   }
3445   return 0;
3446 }
3447 
3448 static __forceinline void
3449 __kmp_restore_swapped_teams(kmp_info_t *th, kmp_team_t *team, int task_state) {
3450   // Restore thread structure swapped in __kmp_swap_teams_for_teams_reduction.
3451   th->th.th_info.ds.ds_tid = 0;
3452   th->th.th_team = team;
3453   th->th.th_team_nproc = team->t.t_nproc;
3454   th->th.th_task_team = team->t.t_task_team[task_state];
3455   __kmp_type_convert(task_state, &(th->th.th_task_state));
3456 }
3457 
3458 /* 2.a.i. Reduce Block without a terminating barrier */
3459 /*!
3460 @ingroup SYNCHRONIZATION
3461 @param loc source location information
3462 @param global_tid global thread number
3463 @param num_vars number of items (variables) to be reduced
3464 @param reduce_size size of data in bytes to be reduced
3465 @param reduce_data pointer to data to be reduced
3466 @param reduce_func callback function providing reduction operation on two
3467 operands and returning result of reduction in lhs_data
3468 @param lck pointer to the unique lock data structure
3469 @result 1 for the primary thread, 0 for all other team threads, 2 for all team
3470 threads if atomic reduction needed
3471 
3472 The nowait version is used for a reduce clause with the nowait argument.
3473 */
3474 kmp_int32
3475 __kmpc_reduce_nowait(ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars,
3476                      size_t reduce_size, void *reduce_data,
3477                      void (*reduce_func)(void *lhs_data, void *rhs_data),
3478                      kmp_critical_name *lck) {
3479 
3480   KMP_COUNT_BLOCK(REDUCE_nowait);
3481   int retval = 0;
3482   PACKED_REDUCTION_METHOD_T packed_reduction_method;
3483   kmp_info_t *th;
3484   kmp_team_t *team;
3485   int teams_swapped = 0, task_state;
3486   KA_TRACE(10, ("__kmpc_reduce_nowait() enter: called T#%d\n", global_tid));
3487   __kmp_assert_valid_gtid(global_tid);
3488 
3489   // why do we need this initialization here at all?
3490   // Reduction clause can not be used as a stand-alone directive.
3491 
3492   // do not call __kmp_serial_initialize(), it will be called by
3493   // __kmp_parallel_initialize() if needed
3494   // possible detection of false-positive race by the threadchecker ???
3495   if (!TCR_4(__kmp_init_parallel))
3496     __kmp_parallel_initialize();
3497 
3498   __kmp_resume_if_soft_paused();
3499 
3500 // check correctness of reduce block nesting
3501 #if KMP_USE_DYNAMIC_LOCK
3502   if (__kmp_env_consistency_check)
3503     __kmp_push_sync(global_tid, ct_reduce, loc, NULL, 0);
3504 #else
3505   if (__kmp_env_consistency_check)
3506     __kmp_push_sync(global_tid, ct_reduce, loc, NULL);
3507 #endif
3508 
3509   th = __kmp_thread_from_gtid(global_tid);
3510   teams_swapped = __kmp_swap_teams_for_teams_reduction(th, &team, &task_state);
3511 
3512   // packed_reduction_method value will be reused by __kmp_end_reduce* function,
3513   // the value should be kept in a variable
3514   // the variable should be either a construct-specific or thread-specific
3515   // property, not a team specific property
3516   //     (a thread can reach the next reduce block on the next construct, reduce
3517   //     method may differ on the next construct)
3518   // an ident_t "loc" parameter could be used as a construct-specific property
3519   // (what if loc == 0?)
3520   //     (if both construct-specific and team-specific variables were shared,
3521   //     then unness extra syncs should be needed)
3522   // a thread-specific variable is better regarding two issues above (next
3523   // construct and extra syncs)
3524   // a thread-specific "th_local.reduction_method" variable is used currently
3525   // each thread executes 'determine' and 'set' lines (no need to execute by one
3526   // thread, to avoid unness extra syncs)
3527 
3528   packed_reduction_method = __kmp_determine_reduction_method(
3529       loc, global_tid, num_vars, reduce_size, reduce_data, reduce_func, lck);
3530   __KMP_SET_REDUCTION_METHOD(global_tid, packed_reduction_method);
3531 
3532   OMPT_REDUCTION_DECL(th, global_tid);
3533   if (packed_reduction_method == critical_reduce_block) {
3534 
3535     OMPT_REDUCTION_BEGIN;
3536 
3537     __kmp_enter_critical_section_reduce_block(loc, global_tid, lck);
3538     retval = 1;
3539 
3540   } else if (packed_reduction_method == empty_reduce_block) {
3541 
3542     OMPT_REDUCTION_BEGIN;
3543 
3544     // usage: if team size == 1, no synchronization is required ( Intel
3545     // platforms only )
3546     retval = 1;
3547 
3548   } else if (packed_reduction_method == atomic_reduce_block) {
3549 
3550     retval = 2;
3551 
3552     // all threads should do this pop here (because __kmpc_end_reduce_nowait()
3553     // won't be called by the code gen)
3554     //     (it's not quite good, because the checking block has been closed by
3555     //     this 'pop',
3556     //      but atomic operation has not been executed yet, will be executed
3557     //      slightly later, literally on next instruction)
3558     if (__kmp_env_consistency_check)
3559       __kmp_pop_sync(global_tid, ct_reduce, loc);
3560 
3561   } else if (TEST_REDUCTION_METHOD(packed_reduction_method,
3562                                    tree_reduce_block)) {
3563 
3564 // AT: performance issue: a real barrier here
3565 // AT: (if primary thread is slow, other threads are blocked here waiting for
3566 //      the primary thread to come and release them)
3567 // AT: (it's not what a customer might expect specifying NOWAIT clause)
3568 // AT: (specifying NOWAIT won't result in improvement of performance, it'll
3569 //      be confusing to a customer)
3570 // AT: another implementation of *barrier_gather*nowait() (or some other design)
3571 // might go faster and be more in line with sense of NOWAIT
3572 // AT: TO DO: do epcc test and compare times
3573 
3574 // this barrier should be invisible to a customer and to the threading profile
3575 // tool (it's neither a terminating barrier nor customer's code, it's
3576 // used for an internal purpose)
3577 #if OMPT_SUPPORT
3578     // JP: can this barrier potentially leed to task scheduling?
3579     // JP: as long as there is a barrier in the implementation, OMPT should and
3580     // will provide the barrier events
3581     //         so we set-up the necessary frame/return addresses.
3582     ompt_frame_t *ompt_frame;
3583     if (ompt_enabled.enabled) {
3584       __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
3585       if (ompt_frame->enter_frame.ptr == NULL)
3586         ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
3587     }
3588     OMPT_STORE_RETURN_ADDRESS(global_tid);
3589 #endif
3590 #if USE_ITT_NOTIFY
3591     __kmp_threads[global_tid]->th.th_ident = loc;
3592 #endif
3593     retval =
3594         __kmp_barrier(UNPACK_REDUCTION_BARRIER(packed_reduction_method),
3595                       global_tid, FALSE, reduce_size, reduce_data, reduce_func);
3596     retval = (retval != 0) ? (0) : (1);
3597 #if OMPT_SUPPORT && OMPT_OPTIONAL
3598     if (ompt_enabled.enabled) {
3599       ompt_frame->enter_frame = ompt_data_none;
3600     }
3601 #endif
3602 
3603     // all other workers except primary thread should do this pop here
3604     //     ( none of other workers will get to __kmpc_end_reduce_nowait() )
3605     if (__kmp_env_consistency_check) {
3606       if (retval == 0) {
3607         __kmp_pop_sync(global_tid, ct_reduce, loc);
3608       }
3609     }
3610 
3611   } else {
3612 
3613     // should never reach this block
3614     KMP_ASSERT(0); // "unexpected method"
3615   }
3616   if (teams_swapped) {
3617     __kmp_restore_swapped_teams(th, team, task_state);
3618   }
3619   KA_TRACE(
3620       10,
3621       ("__kmpc_reduce_nowait() exit: called T#%d: method %08x, returns %08x\n",
3622        global_tid, packed_reduction_method, retval));
3623 
3624   return retval;
3625 }
3626 
3627 /*!
3628 @ingroup SYNCHRONIZATION
3629 @param loc source location information
3630 @param global_tid global thread id.
3631 @param lck pointer to the unique lock data structure
3632 
3633 Finish the execution of a reduce nowait.
3634 */
3635 void __kmpc_end_reduce_nowait(ident_t *loc, kmp_int32 global_tid,
3636                               kmp_critical_name *lck) {
3637 
3638   PACKED_REDUCTION_METHOD_T packed_reduction_method;
3639 
3640   KA_TRACE(10, ("__kmpc_end_reduce_nowait() enter: called T#%d\n", global_tid));
3641   __kmp_assert_valid_gtid(global_tid);
3642 
3643   packed_reduction_method = __KMP_GET_REDUCTION_METHOD(global_tid);
3644 
3645   OMPT_REDUCTION_DECL(__kmp_thread_from_gtid(global_tid), global_tid);
3646 
3647   if (packed_reduction_method == critical_reduce_block) {
3648 
3649     __kmp_end_critical_section_reduce_block(loc, global_tid, lck);
3650     OMPT_REDUCTION_END;
3651 
3652   } else if (packed_reduction_method == empty_reduce_block) {
3653 
3654     // usage: if team size == 1, no synchronization is required ( on Intel
3655     // platforms only )
3656 
3657     OMPT_REDUCTION_END;
3658 
3659   } else if (packed_reduction_method == atomic_reduce_block) {
3660 
3661     // neither primary thread nor other workers should get here
3662     //     (code gen does not generate this call in case 2: atomic reduce block)
3663     // actually it's better to remove this elseif at all;
3664     // after removal this value will checked by the 'else' and will assert
3665 
3666   } else if (TEST_REDUCTION_METHOD(packed_reduction_method,
3667                                    tree_reduce_block)) {
3668 
3669     // only primary thread gets here
3670     // OMPT: tree reduction is annotated in the barrier code
3671 
3672   } else {
3673 
3674     // should never reach this block
3675     KMP_ASSERT(0); // "unexpected method"
3676   }
3677 
3678   if (__kmp_env_consistency_check)
3679     __kmp_pop_sync(global_tid, ct_reduce, loc);
3680 
3681   KA_TRACE(10, ("__kmpc_end_reduce_nowait() exit: called T#%d: method %08x\n",
3682                 global_tid, packed_reduction_method));
3683 
3684   return;
3685 }
3686 
3687 /* 2.a.ii. Reduce Block with a terminating barrier */
3688 
3689 /*!
3690 @ingroup SYNCHRONIZATION
3691 @param loc source location information
3692 @param global_tid global thread number
3693 @param num_vars number of items (variables) to be reduced
3694 @param reduce_size size of data in bytes to be reduced
3695 @param reduce_data pointer to data to be reduced
3696 @param reduce_func callback function providing reduction operation on two
3697 operands and returning result of reduction in lhs_data
3698 @param lck pointer to the unique lock data structure
3699 @result 1 for the primary thread, 0 for all other team threads, 2 for all team
3700 threads if atomic reduction needed
3701 
3702 A blocking reduce that includes an implicit barrier.
3703 */
3704 kmp_int32 __kmpc_reduce(ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars,
3705                         size_t reduce_size, void *reduce_data,
3706                         void (*reduce_func)(void *lhs_data, void *rhs_data),
3707                         kmp_critical_name *lck) {
3708   KMP_COUNT_BLOCK(REDUCE_wait);
3709   int retval = 0;
3710   PACKED_REDUCTION_METHOD_T packed_reduction_method;
3711   kmp_info_t *th;
3712   kmp_team_t *team;
3713   int teams_swapped = 0, task_state;
3714 
3715   KA_TRACE(10, ("__kmpc_reduce() enter: called T#%d\n", global_tid));
3716   __kmp_assert_valid_gtid(global_tid);
3717 
3718   // why do we need this initialization here at all?
3719   // Reduction clause can not be a stand-alone directive.
3720 
3721   // do not call __kmp_serial_initialize(), it will be called by
3722   // __kmp_parallel_initialize() if needed
3723   // possible detection of false-positive race by the threadchecker ???
3724   if (!TCR_4(__kmp_init_parallel))
3725     __kmp_parallel_initialize();
3726 
3727   __kmp_resume_if_soft_paused();
3728 
3729 // check correctness of reduce block nesting
3730 #if KMP_USE_DYNAMIC_LOCK
3731   if (__kmp_env_consistency_check)
3732     __kmp_push_sync(global_tid, ct_reduce, loc, NULL, 0);
3733 #else
3734   if (__kmp_env_consistency_check)
3735     __kmp_push_sync(global_tid, ct_reduce, loc, NULL);
3736 #endif
3737 
3738   th = __kmp_thread_from_gtid(global_tid);
3739   teams_swapped = __kmp_swap_teams_for_teams_reduction(th, &team, &task_state);
3740 
3741   packed_reduction_method = __kmp_determine_reduction_method(
3742       loc, global_tid, num_vars, reduce_size, reduce_data, reduce_func, lck);
3743   __KMP_SET_REDUCTION_METHOD(global_tid, packed_reduction_method);
3744 
3745   OMPT_REDUCTION_DECL(th, global_tid);
3746 
3747   if (packed_reduction_method == critical_reduce_block) {
3748 
3749     OMPT_REDUCTION_BEGIN;
3750     __kmp_enter_critical_section_reduce_block(loc, global_tid, lck);
3751     retval = 1;
3752 
3753   } else if (packed_reduction_method == empty_reduce_block) {
3754 
3755     OMPT_REDUCTION_BEGIN;
3756     // usage: if team size == 1, no synchronization is required ( Intel
3757     // platforms only )
3758     retval = 1;
3759 
3760   } else if (packed_reduction_method == atomic_reduce_block) {
3761 
3762     retval = 2;
3763 
3764   } else if (TEST_REDUCTION_METHOD(packed_reduction_method,
3765                                    tree_reduce_block)) {
3766 
3767 // case tree_reduce_block:
3768 // this barrier should be visible to a customer and to the threading profile
3769 // tool (it's a terminating barrier on constructs if NOWAIT not specified)
3770 #if OMPT_SUPPORT
3771     ompt_frame_t *ompt_frame;
3772     if (ompt_enabled.enabled) {
3773       __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
3774       if (ompt_frame->enter_frame.ptr == NULL)
3775         ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
3776     }
3777     OMPT_STORE_RETURN_ADDRESS(global_tid);
3778 #endif
3779 #if USE_ITT_NOTIFY
3780     __kmp_threads[global_tid]->th.th_ident =
3781         loc; // needed for correct notification of frames
3782 #endif
3783     retval =
3784         __kmp_barrier(UNPACK_REDUCTION_BARRIER(packed_reduction_method),
3785                       global_tid, TRUE, reduce_size, reduce_data, reduce_func);
3786     retval = (retval != 0) ? (0) : (1);
3787 #if OMPT_SUPPORT && OMPT_OPTIONAL
3788     if (ompt_enabled.enabled) {
3789       ompt_frame->enter_frame = ompt_data_none;
3790     }
3791 #endif
3792 
3793     // all other workers except primary thread should do this pop here
3794     // (none of other workers except primary will enter __kmpc_end_reduce())
3795     if (__kmp_env_consistency_check) {
3796       if (retval == 0) { // 0: all other workers; 1: primary thread
3797         __kmp_pop_sync(global_tid, ct_reduce, loc);
3798       }
3799     }
3800 
3801   } else {
3802 
3803     // should never reach this block
3804     KMP_ASSERT(0); // "unexpected method"
3805   }
3806   if (teams_swapped) {
3807     __kmp_restore_swapped_teams(th, team, task_state);
3808   }
3809 
3810   KA_TRACE(10,
3811            ("__kmpc_reduce() exit: called T#%d: method %08x, returns %08x\n",
3812             global_tid, packed_reduction_method, retval));
3813   return retval;
3814 }
3815 
3816 /*!
3817 @ingroup SYNCHRONIZATION
3818 @param loc source location information
3819 @param global_tid global thread id.
3820 @param lck pointer to the unique lock data structure
3821 
3822 Finish the execution of a blocking reduce.
3823 The <tt>lck</tt> pointer must be the same as that used in the corresponding
3824 start function.
3825 */
3826 void __kmpc_end_reduce(ident_t *loc, kmp_int32 global_tid,
3827                        kmp_critical_name *lck) {
3828 
3829   PACKED_REDUCTION_METHOD_T packed_reduction_method;
3830   kmp_info_t *th;
3831   kmp_team_t *team;
3832   int teams_swapped = 0, task_state;
3833 
3834   KA_TRACE(10, ("__kmpc_end_reduce() enter: called T#%d\n", global_tid));
3835   __kmp_assert_valid_gtid(global_tid);
3836 
3837   th = __kmp_thread_from_gtid(global_tid);
3838   teams_swapped = __kmp_swap_teams_for_teams_reduction(th, &team, &task_state);
3839 
3840   packed_reduction_method = __KMP_GET_REDUCTION_METHOD(global_tid);
3841 
3842   // this barrier should be visible to a customer and to the threading profile
3843   // tool (it's a terminating barrier on constructs if NOWAIT not specified)
3844   OMPT_REDUCTION_DECL(th, global_tid);
3845 
3846   if (packed_reduction_method == critical_reduce_block) {
3847     __kmp_end_critical_section_reduce_block(loc, global_tid, lck);
3848 
3849     OMPT_REDUCTION_END;
3850 
3851 // TODO: implicit barrier: should be exposed
3852 #if OMPT_SUPPORT
3853     ompt_frame_t *ompt_frame;
3854     if (ompt_enabled.enabled) {
3855       __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
3856       if (ompt_frame->enter_frame.ptr == NULL)
3857         ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
3858     }
3859     OMPT_STORE_RETURN_ADDRESS(global_tid);
3860 #endif
3861 #if USE_ITT_NOTIFY
3862     __kmp_threads[global_tid]->th.th_ident = loc;
3863 #endif
3864     __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
3865 #if OMPT_SUPPORT && OMPT_OPTIONAL
3866     if (ompt_enabled.enabled) {
3867       ompt_frame->enter_frame = ompt_data_none;
3868     }
3869 #endif
3870 
3871   } else if (packed_reduction_method == empty_reduce_block) {
3872 
3873     OMPT_REDUCTION_END;
3874 
3875 // usage: if team size==1, no synchronization is required (Intel platforms only)
3876 
3877 // TODO: implicit barrier: should be exposed
3878 #if OMPT_SUPPORT
3879     ompt_frame_t *ompt_frame;
3880     if (ompt_enabled.enabled) {
3881       __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
3882       if (ompt_frame->enter_frame.ptr == NULL)
3883         ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
3884     }
3885     OMPT_STORE_RETURN_ADDRESS(global_tid);
3886 #endif
3887 #if USE_ITT_NOTIFY
3888     __kmp_threads[global_tid]->th.th_ident = loc;
3889 #endif
3890     __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
3891 #if OMPT_SUPPORT && OMPT_OPTIONAL
3892     if (ompt_enabled.enabled) {
3893       ompt_frame->enter_frame = ompt_data_none;
3894     }
3895 #endif
3896 
3897   } else if (packed_reduction_method == atomic_reduce_block) {
3898 
3899 #if OMPT_SUPPORT
3900     ompt_frame_t *ompt_frame;
3901     if (ompt_enabled.enabled) {
3902       __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
3903       if (ompt_frame->enter_frame.ptr == NULL)
3904         ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
3905     }
3906     OMPT_STORE_RETURN_ADDRESS(global_tid);
3907 #endif
3908 // TODO: implicit barrier: should be exposed
3909 #if USE_ITT_NOTIFY
3910     __kmp_threads[global_tid]->th.th_ident = loc;
3911 #endif
3912     __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
3913 #if OMPT_SUPPORT && OMPT_OPTIONAL
3914     if (ompt_enabled.enabled) {
3915       ompt_frame->enter_frame = ompt_data_none;
3916     }
3917 #endif
3918 
3919   } else if (TEST_REDUCTION_METHOD(packed_reduction_method,
3920                                    tree_reduce_block)) {
3921 
3922     // only primary thread executes here (primary releases all other workers)
3923     __kmp_end_split_barrier(UNPACK_REDUCTION_BARRIER(packed_reduction_method),
3924                             global_tid);
3925 
3926   } else {
3927 
3928     // should never reach this block
3929     KMP_ASSERT(0); // "unexpected method"
3930   }
3931   if (teams_swapped) {
3932     __kmp_restore_swapped_teams(th, team, task_state);
3933   }
3934 
3935   if (__kmp_env_consistency_check)
3936     __kmp_pop_sync(global_tid, ct_reduce, loc);
3937 
3938   KA_TRACE(10, ("__kmpc_end_reduce() exit: called T#%d: method %08x\n",
3939                 global_tid, packed_reduction_method));
3940 
3941   return;
3942 }
3943 
3944 #undef __KMP_GET_REDUCTION_METHOD
3945 #undef __KMP_SET_REDUCTION_METHOD
3946 
3947 /* end of interface to fast scalable reduce routines */
3948 
3949 kmp_uint64 __kmpc_get_taskid() {
3950 
3951   kmp_int32 gtid;
3952   kmp_info_t *thread;
3953 
3954   gtid = __kmp_get_gtid();
3955   if (gtid < 0) {
3956     return 0;
3957   }
3958   thread = __kmp_thread_from_gtid(gtid);
3959   return thread->th.th_current_task->td_task_id;
3960 
3961 } // __kmpc_get_taskid
3962 
3963 kmp_uint64 __kmpc_get_parent_taskid() {
3964 
3965   kmp_int32 gtid;
3966   kmp_info_t *thread;
3967   kmp_taskdata_t *parent_task;
3968 
3969   gtid = __kmp_get_gtid();
3970   if (gtid < 0) {
3971     return 0;
3972   }
3973   thread = __kmp_thread_from_gtid(gtid);
3974   parent_task = thread->th.th_current_task->td_parent;
3975   return (parent_task == NULL ? 0 : parent_task->td_task_id);
3976 
3977 } // __kmpc_get_parent_taskid
3978 
3979 /*!
3980 @ingroup WORK_SHARING
3981 @param loc  source location information.
3982 @param gtid  global thread number.
3983 @param num_dims  number of associated doacross loops.
3984 @param dims  info on loops bounds.
3985 
3986 Initialize doacross loop information.
3987 Expect compiler send us inclusive bounds,
3988 e.g. for(i=2;i<9;i+=2) lo=2, up=8, st=2.
3989 */
3990 void __kmpc_doacross_init(ident_t *loc, int gtid, int num_dims,
3991                           const struct kmp_dim *dims) {
3992   __kmp_assert_valid_gtid(gtid);
3993   int j, idx;
3994   kmp_int64 last, trace_count;
3995   kmp_info_t *th = __kmp_threads[gtid];
3996   kmp_team_t *team = th->th.th_team;
3997   kmp_uint32 *flags;
3998   kmp_disp_t *pr_buf = th->th.th_dispatch;
3999   dispatch_shared_info_t *sh_buf;
4000 
4001   KA_TRACE(
4002       20,
4003       ("__kmpc_doacross_init() enter: called T#%d, num dims %d, active %d\n",
4004        gtid, num_dims, !team->t.t_serialized));
4005   KMP_DEBUG_ASSERT(dims != NULL);
4006   KMP_DEBUG_ASSERT(num_dims > 0);
4007 
4008   if (team->t.t_serialized) {
4009     KA_TRACE(20, ("__kmpc_doacross_init() exit: serialized team\n"));
4010     return; // no dependencies if team is serialized
4011   }
4012   KMP_DEBUG_ASSERT(team->t.t_nproc > 1);
4013   idx = pr_buf->th_doacross_buf_idx++; // Increment index of shared buffer for
4014   // the next loop
4015   sh_buf = &team->t.t_disp_buffer[idx % __kmp_dispatch_num_buffers];
4016 
4017   // Save bounds info into allocated private buffer
4018   KMP_DEBUG_ASSERT(pr_buf->th_doacross_info == NULL);
4019   pr_buf->th_doacross_info = (kmp_int64 *)__kmp_thread_malloc(
4020       th, sizeof(kmp_int64) * (4 * num_dims + 1));
4021   KMP_DEBUG_ASSERT(pr_buf->th_doacross_info != NULL);
4022   pr_buf->th_doacross_info[0] =
4023       (kmp_int64)num_dims; // first element is number of dimensions
4024   // Save also address of num_done in order to access it later without knowing
4025   // the buffer index
4026   pr_buf->th_doacross_info[1] = (kmp_int64)&sh_buf->doacross_num_done;
4027   pr_buf->th_doacross_info[2] = dims[0].lo;
4028   pr_buf->th_doacross_info[3] = dims[0].up;
4029   pr_buf->th_doacross_info[4] = dims[0].st;
4030   last = 5;
4031   for (j = 1; j < num_dims; ++j) {
4032     kmp_int64
4033         range_length; // To keep ranges of all dimensions but the first dims[0]
4034     if (dims[j].st == 1) { // most common case
4035       // AC: should we care of ranges bigger than LLONG_MAX? (not for now)
4036       range_length = dims[j].up - dims[j].lo + 1;
4037     } else {
4038       if (dims[j].st > 0) {
4039         KMP_DEBUG_ASSERT(dims[j].up > dims[j].lo);
4040         range_length = (kmp_uint64)(dims[j].up - dims[j].lo) / dims[j].st + 1;
4041       } else { // negative increment
4042         KMP_DEBUG_ASSERT(dims[j].lo > dims[j].up);
4043         range_length =
4044             (kmp_uint64)(dims[j].lo - dims[j].up) / (-dims[j].st) + 1;
4045       }
4046     }
4047     pr_buf->th_doacross_info[last++] = range_length;
4048     pr_buf->th_doacross_info[last++] = dims[j].lo;
4049     pr_buf->th_doacross_info[last++] = dims[j].up;
4050     pr_buf->th_doacross_info[last++] = dims[j].st;
4051   }
4052 
4053   // Compute total trip count.
4054   // Start with range of dims[0] which we don't need to keep in the buffer.
4055   if (dims[0].st == 1) { // most common case
4056     trace_count = dims[0].up - dims[0].lo + 1;
4057   } else if (dims[0].st > 0) {
4058     KMP_DEBUG_ASSERT(dims[0].up > dims[0].lo);
4059     trace_count = (kmp_uint64)(dims[0].up - dims[0].lo) / dims[0].st + 1;
4060   } else { // negative increment
4061     KMP_DEBUG_ASSERT(dims[0].lo > dims[0].up);
4062     trace_count = (kmp_uint64)(dims[0].lo - dims[0].up) / (-dims[0].st) + 1;
4063   }
4064   for (j = 1; j < num_dims; ++j) {
4065     trace_count *= pr_buf->th_doacross_info[4 * j + 1]; // use kept ranges
4066   }
4067   KMP_DEBUG_ASSERT(trace_count > 0);
4068 
4069   // Check if shared buffer is not occupied by other loop (idx -
4070   // __kmp_dispatch_num_buffers)
4071   if (idx != sh_buf->doacross_buf_idx) {
4072     // Shared buffer is occupied, wait for it to be free
4073     __kmp_wait_4((volatile kmp_uint32 *)&sh_buf->doacross_buf_idx, idx,
4074                  __kmp_eq_4, NULL);
4075   }
4076 #if KMP_32_BIT_ARCH
4077   // Check if we are the first thread. After the CAS the first thread gets 0,
4078   // others get 1 if initialization is in progress, allocated pointer otherwise.
4079   // Treat pointer as volatile integer (value 0 or 1) until memory is allocated.
4080   flags = (kmp_uint32 *)KMP_COMPARE_AND_STORE_RET32(
4081       (volatile kmp_int32 *)&sh_buf->doacross_flags, NULL, 1);
4082 #else
4083   flags = (kmp_uint32 *)KMP_COMPARE_AND_STORE_RET64(
4084       (volatile kmp_int64 *)&sh_buf->doacross_flags, NULL, 1LL);
4085 #endif
4086   if (flags == NULL) {
4087     // we are the first thread, allocate the array of flags
4088     size_t size =
4089         (size_t)trace_count / 8 + 8; // in bytes, use single bit per iteration
4090     flags = (kmp_uint32 *)__kmp_thread_calloc(th, size, 1);
4091     KMP_MB();
4092     sh_buf->doacross_flags = flags;
4093   } else if (flags == (kmp_uint32 *)1) {
4094 #if KMP_32_BIT_ARCH
4095     // initialization is still in progress, need to wait
4096     while (*(volatile kmp_int32 *)&sh_buf->doacross_flags == 1)
4097 #else
4098     while (*(volatile kmp_int64 *)&sh_buf->doacross_flags == 1LL)
4099 #endif
4100       KMP_YIELD(TRUE);
4101     KMP_MB();
4102   } else {
4103     KMP_MB();
4104   }
4105   KMP_DEBUG_ASSERT(sh_buf->doacross_flags > (kmp_uint32 *)1); // check ptr value
4106   pr_buf->th_doacross_flags =
4107       sh_buf->doacross_flags; // save private copy in order to not
4108   // touch shared buffer on each iteration
4109   KA_TRACE(20, ("__kmpc_doacross_init() exit: T#%d\n", gtid));
4110 }
4111 
4112 void __kmpc_doacross_wait(ident_t *loc, int gtid, const kmp_int64 *vec) {
4113   __kmp_assert_valid_gtid(gtid);
4114   kmp_int64 shft;
4115   size_t num_dims, i;
4116   kmp_uint32 flag;
4117   kmp_int64 iter_number; // iteration number of "collapsed" loop nest
4118   kmp_info_t *th = __kmp_threads[gtid];
4119   kmp_team_t *team = th->th.th_team;
4120   kmp_disp_t *pr_buf;
4121   kmp_int64 lo, up, st;
4122 
4123   KA_TRACE(20, ("__kmpc_doacross_wait() enter: called T#%d\n", gtid));
4124   if (team->t.t_serialized) {
4125     KA_TRACE(20, ("__kmpc_doacross_wait() exit: serialized team\n"));
4126     return; // no dependencies if team is serialized
4127   }
4128 
4129   // calculate sequential iteration number and check out-of-bounds condition
4130   pr_buf = th->th.th_dispatch;
4131   KMP_DEBUG_ASSERT(pr_buf->th_doacross_info != NULL);
4132   num_dims = (size_t)pr_buf->th_doacross_info[0];
4133   lo = pr_buf->th_doacross_info[2];
4134   up = pr_buf->th_doacross_info[3];
4135   st = pr_buf->th_doacross_info[4];
4136 #if OMPT_SUPPORT && OMPT_OPTIONAL
4137   ompt_dependence_t deps[num_dims];
4138 #endif
4139   if (st == 1) { // most common case
4140     if (vec[0] < lo || vec[0] > up) {
4141       KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
4142                     "bounds [%lld,%lld]\n",
4143                     gtid, vec[0], lo, up));
4144       return;
4145     }
4146     iter_number = vec[0] - lo;
4147   } else if (st > 0) {
4148     if (vec[0] < lo || vec[0] > up) {
4149       KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
4150                     "bounds [%lld,%lld]\n",
4151                     gtid, vec[0], lo, up));
4152       return;
4153     }
4154     iter_number = (kmp_uint64)(vec[0] - lo) / st;
4155   } else { // negative increment
4156     if (vec[0] > lo || vec[0] < up) {
4157       KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
4158                     "bounds [%lld,%lld]\n",
4159                     gtid, vec[0], lo, up));
4160       return;
4161     }
4162     iter_number = (kmp_uint64)(lo - vec[0]) / (-st);
4163   }
4164 #if OMPT_SUPPORT && OMPT_OPTIONAL
4165   deps[0].variable.value = iter_number;
4166   deps[0].dependence_type = ompt_dependence_type_sink;
4167 #endif
4168   for (i = 1; i < num_dims; ++i) {
4169     kmp_int64 iter, ln;
4170     size_t j = i * 4;
4171     ln = pr_buf->th_doacross_info[j + 1];
4172     lo = pr_buf->th_doacross_info[j + 2];
4173     up = pr_buf->th_doacross_info[j + 3];
4174     st = pr_buf->th_doacross_info[j + 4];
4175     if (st == 1) {
4176       if (vec[i] < lo || vec[i] > up) {
4177         KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
4178                       "bounds [%lld,%lld]\n",
4179                       gtid, vec[i], lo, up));
4180         return;
4181       }
4182       iter = vec[i] - lo;
4183     } else if (st > 0) {
4184       if (vec[i] < lo || vec[i] > up) {
4185         KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
4186                       "bounds [%lld,%lld]\n",
4187                       gtid, vec[i], lo, up));
4188         return;
4189       }
4190       iter = (kmp_uint64)(vec[i] - lo) / st;
4191     } else { // st < 0
4192       if (vec[i] > lo || vec[i] < up) {
4193         KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
4194                       "bounds [%lld,%lld]\n",
4195                       gtid, vec[i], lo, up));
4196         return;
4197       }
4198       iter = (kmp_uint64)(lo - vec[i]) / (-st);
4199     }
4200     iter_number = iter + ln * iter_number;
4201 #if OMPT_SUPPORT && OMPT_OPTIONAL
4202     deps[i].variable.value = iter;
4203     deps[i].dependence_type = ompt_dependence_type_sink;
4204 #endif
4205   }
4206   shft = iter_number % 32; // use 32-bit granularity
4207   iter_number >>= 5; // divided by 32
4208   flag = 1 << shft;
4209   while ((flag & pr_buf->th_doacross_flags[iter_number]) == 0) {
4210     KMP_YIELD(TRUE);
4211   }
4212   KMP_MB();
4213 #if OMPT_SUPPORT && OMPT_OPTIONAL
4214   if (ompt_enabled.ompt_callback_dependences) {
4215     ompt_callbacks.ompt_callback(ompt_callback_dependences)(
4216         &(OMPT_CUR_TASK_INFO(th)->task_data), deps, (kmp_uint32)num_dims);
4217   }
4218 #endif
4219   KA_TRACE(20,
4220            ("__kmpc_doacross_wait() exit: T#%d wait for iter %lld completed\n",
4221             gtid, (iter_number << 5) + shft));
4222 }
4223 
4224 void __kmpc_doacross_post(ident_t *loc, int gtid, const kmp_int64 *vec) {
4225   __kmp_assert_valid_gtid(gtid);
4226   kmp_int64 shft;
4227   size_t num_dims, i;
4228   kmp_uint32 flag;
4229   kmp_int64 iter_number; // iteration number of "collapsed" loop nest
4230   kmp_info_t *th = __kmp_threads[gtid];
4231   kmp_team_t *team = th->th.th_team;
4232   kmp_disp_t *pr_buf;
4233   kmp_int64 lo, st;
4234 
4235   KA_TRACE(20, ("__kmpc_doacross_post() enter: called T#%d\n", gtid));
4236   if (team->t.t_serialized) {
4237     KA_TRACE(20, ("__kmpc_doacross_post() exit: serialized team\n"));
4238     return; // no dependencies if team is serialized
4239   }
4240 
4241   // calculate sequential iteration number (same as in "wait" but no
4242   // out-of-bounds checks)
4243   pr_buf = th->th.th_dispatch;
4244   KMP_DEBUG_ASSERT(pr_buf->th_doacross_info != NULL);
4245   num_dims = (size_t)pr_buf->th_doacross_info[0];
4246   lo = pr_buf->th_doacross_info[2];
4247   st = pr_buf->th_doacross_info[4];
4248 #if OMPT_SUPPORT && OMPT_OPTIONAL
4249   ompt_dependence_t deps[num_dims];
4250 #endif
4251   if (st == 1) { // most common case
4252     iter_number = vec[0] - lo;
4253   } else if (st > 0) {
4254     iter_number = (kmp_uint64)(vec[0] - lo) / st;
4255   } else { // negative increment
4256     iter_number = (kmp_uint64)(lo - vec[0]) / (-st);
4257   }
4258 #if OMPT_SUPPORT && OMPT_OPTIONAL
4259   deps[0].variable.value = iter_number;
4260   deps[0].dependence_type = ompt_dependence_type_source;
4261 #endif
4262   for (i = 1; i < num_dims; ++i) {
4263     kmp_int64 iter, ln;
4264     size_t j = i * 4;
4265     ln = pr_buf->th_doacross_info[j + 1];
4266     lo = pr_buf->th_doacross_info[j + 2];
4267     st = pr_buf->th_doacross_info[j + 4];
4268     if (st == 1) {
4269       iter = vec[i] - lo;
4270     } else if (st > 0) {
4271       iter = (kmp_uint64)(vec[i] - lo) / st;
4272     } else { // st < 0
4273       iter = (kmp_uint64)(lo - vec[i]) / (-st);
4274     }
4275     iter_number = iter + ln * iter_number;
4276 #if OMPT_SUPPORT && OMPT_OPTIONAL
4277     deps[i].variable.value = iter;
4278     deps[i].dependence_type = ompt_dependence_type_source;
4279 #endif
4280   }
4281 #if OMPT_SUPPORT && OMPT_OPTIONAL
4282   if (ompt_enabled.ompt_callback_dependences) {
4283     ompt_callbacks.ompt_callback(ompt_callback_dependences)(
4284         &(OMPT_CUR_TASK_INFO(th)->task_data), deps, (kmp_uint32)num_dims);
4285   }
4286 #endif
4287   shft = iter_number % 32; // use 32-bit granularity
4288   iter_number >>= 5; // divided by 32
4289   flag = 1 << shft;
4290   KMP_MB();
4291   if ((flag & pr_buf->th_doacross_flags[iter_number]) == 0)
4292     KMP_TEST_THEN_OR32(&pr_buf->th_doacross_flags[iter_number], flag);
4293   KA_TRACE(20, ("__kmpc_doacross_post() exit: T#%d iter %lld posted\n", gtid,
4294                 (iter_number << 5) + shft));
4295 }
4296 
4297 void __kmpc_doacross_fini(ident_t *loc, int gtid) {
4298   __kmp_assert_valid_gtid(gtid);
4299   kmp_int32 num_done;
4300   kmp_info_t *th = __kmp_threads[gtid];
4301   kmp_team_t *team = th->th.th_team;
4302   kmp_disp_t *pr_buf = th->th.th_dispatch;
4303 
4304   KA_TRACE(20, ("__kmpc_doacross_fini() enter: called T#%d\n", gtid));
4305   if (team->t.t_serialized) {
4306     KA_TRACE(20, ("__kmpc_doacross_fini() exit: serialized team %p\n", team));
4307     return; // nothing to do
4308   }
4309   num_done =
4310       KMP_TEST_THEN_INC32((kmp_uintptr_t)(pr_buf->th_doacross_info[1])) + 1;
4311   if (num_done == th->th.th_team_nproc) {
4312     // we are the last thread, need to free shared resources
4313     int idx = pr_buf->th_doacross_buf_idx - 1;
4314     dispatch_shared_info_t *sh_buf =
4315         &team->t.t_disp_buffer[idx % __kmp_dispatch_num_buffers];
4316     KMP_DEBUG_ASSERT(pr_buf->th_doacross_info[1] ==
4317                      (kmp_int64)&sh_buf->doacross_num_done);
4318     KMP_DEBUG_ASSERT(num_done == sh_buf->doacross_num_done);
4319     KMP_DEBUG_ASSERT(idx == sh_buf->doacross_buf_idx);
4320     __kmp_thread_free(th, CCAST(kmp_uint32 *, sh_buf->doacross_flags));
4321     sh_buf->doacross_flags = NULL;
4322     sh_buf->doacross_num_done = 0;
4323     sh_buf->doacross_buf_idx +=
4324         __kmp_dispatch_num_buffers; // free buffer for future re-use
4325   }
4326   // free private resources (need to keep buffer index forever)
4327   pr_buf->th_doacross_flags = NULL;
4328   __kmp_thread_free(th, (void *)pr_buf->th_doacross_info);
4329   pr_buf->th_doacross_info = NULL;
4330   KA_TRACE(20, ("__kmpc_doacross_fini() exit: T#%d\n", gtid));
4331 }
4332 
4333 /* omp_alloc/omp_calloc/omp_free only defined for C/C++, not for Fortran */
4334 void *omp_alloc(size_t size, omp_allocator_handle_t allocator) {
4335   return __kmpc_alloc(__kmp_entry_gtid(), size, allocator);
4336 }
4337 
4338 void *omp_calloc(size_t nmemb, size_t size, omp_allocator_handle_t allocator) {
4339   return __kmpc_calloc(__kmp_entry_gtid(), nmemb, size, allocator);
4340 }
4341 
4342 void *omp_realloc(void *ptr, size_t size, omp_allocator_handle_t allocator,
4343                   omp_allocator_handle_t free_allocator) {
4344   return __kmpc_realloc(__kmp_entry_gtid(), ptr, size, allocator,
4345                         free_allocator);
4346 }
4347 
4348 void omp_free(void *ptr, omp_allocator_handle_t allocator) {
4349   __kmpc_free(__kmp_entry_gtid(), ptr, allocator);
4350 }
4351 
4352 int __kmpc_get_target_offload(void) {
4353   if (!__kmp_init_serial) {
4354     __kmp_serial_initialize();
4355   }
4356   return __kmp_target_offload;
4357 }
4358 
4359 int __kmpc_pause_resource(kmp_pause_status_t level) {
4360   if (!__kmp_init_serial) {
4361     return 1; // Can't pause if runtime is not initialized
4362   }
4363   return __kmp_pause_resource(level);
4364 }
4365 
4366 void __kmpc_error(ident_t *loc, int severity, const char *message) {
4367   if (!__kmp_init_serial)
4368     __kmp_serial_initialize();
4369 
4370   KMP_ASSERT(severity == severity_warning || severity == severity_fatal);
4371 
4372 #if OMPT_SUPPORT
4373   if (ompt_enabled.enabled && ompt_enabled.ompt_callback_error) {
4374     ompt_callbacks.ompt_callback(ompt_callback_error)(
4375         (ompt_severity_t)severity, message, KMP_STRLEN(message),
4376         OMPT_GET_RETURN_ADDRESS(0));
4377   }
4378 #endif // OMPT_SUPPORT
4379 
4380   char *src_loc;
4381   if (loc && loc->psource) {
4382     kmp_str_loc_t str_loc = __kmp_str_loc_init(loc->psource, false);
4383     src_loc =
4384         __kmp_str_format("%s:%s:%s", str_loc.file, str_loc.line, str_loc.col);
4385     __kmp_str_loc_free(&str_loc);
4386   } else {
4387     src_loc = __kmp_str_format("unknown");
4388   }
4389 
4390   if (severity == severity_warning)
4391     KMP_WARNING(UserDirectedWarning, src_loc, message);
4392   else
4393     KMP_FATAL(UserDirectedError, src_loc, message);
4394 
4395   __kmp_str_free(&src_loc);
4396 }
4397 
4398 #ifdef KMP_USE_VERSION_SYMBOLS
4399 // For GOMP compatibility there are two versions of each omp_* API.
4400 // One is the plain C symbol and one is the Fortran symbol with an appended
4401 // underscore. When we implement a specific ompc_* version of an omp_*
4402 // function, we want the plain GOMP versioned symbol to alias the ompc_* version
4403 // instead of the Fortran versions in kmp_ftn_entry.h
4404 extern "C" {
4405 // Have to undef these from omp.h so they aren't translated into
4406 // their ompc counterparts in the KMP_VERSION_OMPC_SYMBOL macros below
4407 #ifdef omp_set_affinity_format
4408 #undef omp_set_affinity_format
4409 #endif
4410 #ifdef omp_get_affinity_format
4411 #undef omp_get_affinity_format
4412 #endif
4413 #ifdef omp_display_affinity
4414 #undef omp_display_affinity
4415 #endif
4416 #ifdef omp_capture_affinity
4417 #undef omp_capture_affinity
4418 #endif
4419 KMP_VERSION_OMPC_SYMBOL(ompc_set_affinity_format, omp_set_affinity_format, 50,
4420                         "OMP_5.0");
4421 KMP_VERSION_OMPC_SYMBOL(ompc_get_affinity_format, omp_get_affinity_format, 50,
4422                         "OMP_5.0");
4423 KMP_VERSION_OMPC_SYMBOL(ompc_display_affinity, omp_display_affinity, 50,
4424                         "OMP_5.0");
4425 KMP_VERSION_OMPC_SYMBOL(ompc_capture_affinity, omp_capture_affinity, 50,
4426                         "OMP_5.0");
4427 } // extern "C"
4428 #endif
4429