xref: /freebsd/contrib/llvm-project/openmp/runtime/src/kmp_csupport.cpp (revision 5f4c09dd85bff675e0ca63c55ea3c517e0fddfcc)
1 /*
2  * kmp_csupport.cpp -- kfront linkage support for OpenMP.
3  */
4 
5 //===----------------------------------------------------------------------===//
6 //
7 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8 // See https://llvm.org/LICENSE.txt for license information.
9 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #define __KMP_IMP
14 #include "omp.h" /* extern "C" declarations of user-visible routines */
15 #include "kmp.h"
16 #include "kmp_error.h"
17 #include "kmp_i18n.h"
18 #include "kmp_itt.h"
19 #include "kmp_lock.h"
20 #include "kmp_stats.h"
21 #include "ompt-specific.h"
22 
23 #define MAX_MESSAGE 512
24 
25 // flags will be used in future, e.g. to implement openmp_strict library
26 // restrictions
27 
28 /*!
29  * @ingroup STARTUP_SHUTDOWN
30  * @param loc   in   source location information
31  * @param flags in   for future use (currently ignored)
32  *
33  * Initialize the runtime library. This call is optional; if it is not made then
34  * it will be implicitly called by attempts to use other library functions.
35  */
36 void __kmpc_begin(ident_t *loc, kmp_int32 flags) {
37   // By default __kmpc_begin() is a no-op.
38   char *env;
39   if ((env = getenv("KMP_INITIAL_THREAD_BIND")) != NULL &&
40       __kmp_str_match_true(env)) {
41     __kmp_middle_initialize();
42     __kmp_assign_root_init_mask();
43     KC_TRACE(10, ("__kmpc_begin: middle initialization called\n"));
44   } else if (__kmp_ignore_mppbeg() == FALSE) {
45     // By default __kmp_ignore_mppbeg() returns TRUE.
46     __kmp_internal_begin();
47     KC_TRACE(10, ("__kmpc_begin: called\n"));
48   }
49 }
50 
51 /*!
52  * @ingroup STARTUP_SHUTDOWN
53  * @param loc source location information
54  *
55  * Shut down the runtime library. This is also optional, and even if it is
56  * called it will do nothing unless the `KMP_IGNORE_MPPEND` environment
57  * variable is set to zero.
58  */
59 void __kmpc_end(ident_t *loc) {
60   // By default, __kmp_ignore_mppend() returns TRUE, which makes the __kmpc_end()
61   // call a no-op. However, this can be overridden with the KMP_IGNORE_MPPEND
62   // environment variable. If KMP_IGNORE_MPPEND is 0, __kmp_ignore_mppend()
63   // returns FALSE and __kmpc_end() will unregister this root (which can cause
64   // library shutdown).
65   if (__kmp_ignore_mppend() == FALSE) {
66     KC_TRACE(10, ("__kmpc_end: called\n"));
67     KA_TRACE(30, ("__kmpc_end\n"));
68 
69     __kmp_internal_end_thread(-1);
70   }
71 #if KMP_OS_WINDOWS && OMPT_SUPPORT
72   // Normal exit process on Windows does not allow worker threads of the final
73   // parallel region to finish reporting their events, so shutting down the
74   // library here fixes the issue at least for the cases where __kmpc_end() is
75   // placed properly.
76   if (ompt_enabled.enabled)
77     __kmp_internal_end_library(__kmp_gtid_get_specific());
78 #endif
79 }
80 
81 /*!
82 @ingroup THREAD_STATES
83 @param loc Source location information.
84 @return The global thread index of the active thread.
85 
86 This function can be called in any context.
87 
88 If the runtime has only been entered at the outermost level from a
89 single (necessarily non-OpenMP<sup>*</sup>) thread, then the thread number is
90 that which would be returned by omp_get_thread_num() in the outermost
91 active parallel construct. (Or zero if there is no active parallel
92 construct, since the primary thread is necessarily thread zero).
93 
94 If multiple non-OpenMP threads all enter an OpenMP construct then this
95 will be a unique thread identifier among all the threads created by
96 the OpenMP runtime (but the value cannot be defined in terms of
97 OpenMP thread ids returned by omp_get_thread_num()).
98 */
99 kmp_int32 __kmpc_global_thread_num(ident_t *loc) {
100   kmp_int32 gtid = __kmp_entry_gtid();
101 
102   KC_TRACE(10, ("__kmpc_global_thread_num: T#%d\n", gtid));
103 
104   return gtid;
105 }
106 
107 /*!
108 @ingroup THREAD_STATES
109 @param loc Source location information.
110 @return The number of threads under control of the OpenMP<sup>*</sup> runtime
111 
112 This function can be called in any context.
113 It returns the total number of threads under the control of the OpenMP runtime.
114 That is not a number that can be determined by any OpenMP standard calls, since
115 the library may be called from more than one non-OpenMP thread, and this
116 reflects the total over all such calls. Similarly, the runtime maintains
117 underlying threads even when they are not active (since the cost of creating
118 and destroying OS threads is high), so this call counts all such threads even
119 if they are not currently doing any work.
120 */
121 kmp_int32 __kmpc_global_num_threads(ident_t *loc) {
122   KC_TRACE(10,
123            ("__kmpc_global_num_threads: num_threads = %d\n", __kmp_all_nth));
124 
125   return TCR_4(__kmp_all_nth);
126 }
127 
128 /*!
129 @ingroup THREAD_STATES
130 @param loc Source location information.
131 @return The thread number of the calling thread in the innermost active parallel
132 construct.
133 */
134 kmp_int32 __kmpc_bound_thread_num(ident_t *loc) {
135   KC_TRACE(10, ("__kmpc_bound_thread_num: called\n"));
136   return __kmp_tid_from_gtid(__kmp_entry_gtid());
137 }
138 
139 /*!
140 @ingroup THREAD_STATES
141 @param loc Source location information.
142 @return The number of threads in the innermost active parallel construct.
143 */
144 kmp_int32 __kmpc_bound_num_threads(ident_t *loc) {
145   KC_TRACE(10, ("__kmpc_bound_num_threads: called\n"));
146 
147   return __kmp_entry_thread()->th.th_team->t.t_nproc;
148 }
149 
150 /*!
151  * @ingroup DEPRECATED
152  * @param loc location description
153  *
154  * This function need not be called. It always returns TRUE.
155  */
156 kmp_int32 __kmpc_ok_to_fork(ident_t *loc) {
157 #ifndef KMP_DEBUG
158 
159   return TRUE;
160 
161 #else
162 
163   const char *semi2;
164   const char *semi3;
165   int line_no;
166 
167   if (__kmp_par_range == 0) {
168     return TRUE;
169   }
170   semi2 = loc->psource;
171   if (semi2 == NULL) {
172     return TRUE;
173   }
174   semi2 = strchr(semi2, ';');
175   if (semi2 == NULL) {
176     return TRUE;
177   }
178   semi2 = strchr(semi2 + 1, ';');
179   if (semi2 == NULL) {
180     return TRUE;
181   }
182   if (__kmp_par_range_filename[0]) {
183     const char *name = semi2 - 1;
184     while ((name > loc->psource) && (*name != '/') && (*name != ';')) {
185       name--;
186     }
187     if ((*name == '/') || (*name == ';')) {
188       name++;
189     }
190     if (strncmp(__kmp_par_range_filename, name, semi2 - name)) {
191       return __kmp_par_range < 0;
192     }
193   }
194   semi3 = strchr(semi2 + 1, ';');
195   if (__kmp_par_range_routine[0]) {
196     if ((semi3 != NULL) && (semi3 > semi2) &&
197         (strncmp(__kmp_par_range_routine, semi2 + 1, semi3 - semi2 - 1))) {
198       return __kmp_par_range < 0;
199     }
200   }
201   if ((semi3 != NULL) && (KMP_SSCANF(semi3 + 1, "%d", &line_no) == 1)) {
202     if ((line_no >= __kmp_par_range_lb) && (line_no <= __kmp_par_range_ub)) {
203       return __kmp_par_range > 0;
204     }
205     return __kmp_par_range < 0;
206   }
207   return TRUE;
208 
209 #endif /* KMP_DEBUG */
210 }
211 
212 /*!
213 @ingroup THREAD_STATES
214 @param loc Source location information.
215 @return 1 if this thread is executing inside an active parallel region, zero if
216 not.
217 */
218 kmp_int32 __kmpc_in_parallel(ident_t *loc) {
219   return __kmp_entry_thread()->th.th_root->r.r_active;
220 }
221 
222 /*!
223 @ingroup PARALLEL
224 @param loc source location information
225 @param global_tid global thread number
226 @param num_threads number of threads requested for this parallel construct
227 
228 Set the number of threads to be used by the next fork spawned by this thread.
229 This call is only required if the parallel construct has a `num_threads` clause.
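
As an illustration only (names and the exact sequence are placeholders, not the
verbatim output of any particular compiler), `#pragma omp parallel num_threads(4)`
might be lowered roughly as:
@code
kmp_int32 gtid = __kmpc_global_thread_num(&loc); // loc: compiler-emitted ident_t
__kmpc_push_num_threads(&loc, gtid, 4);          // honor the num_threads clause
__kmpc_fork_call(&loc, 0, (kmpc_micro)outlined); // outlined: hypothetical microtask
@endcode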
230 */
231 void __kmpc_push_num_threads(ident_t *loc, kmp_int32 global_tid,
232                              kmp_int32 num_threads) {
233   KA_TRACE(20, ("__kmpc_push_num_threads: enter T#%d num_threads=%d\n",
234                 global_tid, num_threads));
235   __kmp_assert_valid_gtid(global_tid);
236   __kmp_push_num_threads(loc, global_tid, num_threads);
237 }
238 
239 void __kmpc_pop_num_threads(ident_t *loc, kmp_int32 global_tid) {
240   KA_TRACE(20, ("__kmpc_pop_num_threads: enter\n"));
241   /* the num_threads are automatically popped */
242 }
243 
244 void __kmpc_push_proc_bind(ident_t *loc, kmp_int32 global_tid,
245                            kmp_int32 proc_bind) {
246   KA_TRACE(20, ("__kmpc_push_proc_bind: enter T#%d proc_bind=%d\n", global_tid,
247                 proc_bind));
248   __kmp_assert_valid_gtid(global_tid);
249   __kmp_push_proc_bind(loc, global_tid, (kmp_proc_bind_t)proc_bind);
250 }
251 
252 /*!
253 @ingroup PARALLEL
254 @param loc  source location information
255 @param argc  total number of arguments in the ellipsis
256 @param microtask  pointer to callback routine consisting of outlined parallel
257 construct
258 @param ...  pointers to shared variables that aren't global
259 
260 Do the actual fork and call the microtask in the relevant number of threads.
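
As a hedged sketch (not the verbatim output of any particular compiler), a
region such as `#pragma omp parallel shared(x)` around a call `body(&x)` might
be outlined into a microtask and lowered roughly as:
@code
// Hypothetical compiler-generated microtask; gtid/btid are filled in by the runtime.
void outlined(kmp_int32 *gtid, kmp_int32 *btid, int *x) { body(x); }

// At the parallel directive (loc is a compiler-emitted ident_t):
__kmpc_fork_call(&loc, 1, (kmpc_micro)outlined, &x);
@endcode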
261 */
262 void __kmpc_fork_call(ident_t *loc, kmp_int32 argc, kmpc_micro microtask, ...) {
263   int gtid = __kmp_entry_gtid();
264 
265 #if (KMP_STATS_ENABLED)
266   // If we were in a serial region, then stop the serial timer, record
267   // the event, and start parallel region timer
268   stats_state_e previous_state = KMP_GET_THREAD_STATE();
269   if (previous_state == stats_state_e::SERIAL_REGION) {
270     KMP_EXCHANGE_PARTITIONED_TIMER(OMP_parallel_overhead);
271   } else {
272     KMP_PUSH_PARTITIONED_TIMER(OMP_parallel_overhead);
273   }
274   int inParallel = __kmpc_in_parallel(loc);
275   if (inParallel) {
276     KMP_COUNT_BLOCK(OMP_NESTED_PARALLEL);
277   } else {
278     KMP_COUNT_BLOCK(OMP_PARALLEL);
279   }
280 #endif
281 
282   // Saving just the thread state might be sufficient here.
283   {
284     va_list ap;
285     va_start(ap, microtask);
286 
287 #if OMPT_SUPPORT
288     ompt_frame_t *ompt_frame;
289     if (ompt_enabled.enabled) {
290       kmp_info_t *master_th = __kmp_threads[gtid];
291       ompt_frame = &master_th->th.th_current_task->ompt_task_info.frame;
292       ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
293     }
294     OMPT_STORE_RETURN_ADDRESS(gtid);
295 #endif
296 
297 #if INCLUDE_SSC_MARKS
298     SSC_MARK_FORKING();
299 #endif
300     __kmp_fork_call(loc, gtid, fork_context_intel, argc,
301                     VOLATILE_CAST(microtask_t) microtask, // "wrapped" task
302                     VOLATILE_CAST(launch_t) __kmp_invoke_task_func,
303                     kmp_va_addr_of(ap));
304 #if INCLUDE_SSC_MARKS
305     SSC_MARK_JOINING();
306 #endif
307     __kmp_join_call(loc, gtid
308 #if OMPT_SUPPORT
309                     ,
310                     fork_context_intel
311 #endif
312     );
313 
314     va_end(ap);
315 
316 #if OMPT_SUPPORT
317     if (ompt_enabled.enabled) {
318       ompt_frame->enter_frame = ompt_data_none;
319     }
320 #endif
321   }
322 
323 #if KMP_STATS_ENABLED
324   if (previous_state == stats_state_e::SERIAL_REGION) {
325     KMP_EXCHANGE_PARTITIONED_TIMER(OMP_serial);
326     KMP_SET_THREAD_STATE(previous_state);
327   } else {
328     KMP_POP_PARTITIONED_TIMER();
329   }
330 #endif // KMP_STATS_ENABLED
331 }
332 
333 /*!
334 @ingroup PARALLEL
335 @param loc  source location information
336 @param microtask  pointer to callback routine consisting of outlined parallel
337 construct
338 @param cond  condition for running in parallel
339 @param args  struct of pointers to shared variables that aren't global
340 
341 Perform a fork only if the condition is true.
342 */
343 void __kmpc_fork_call_if(ident_t *loc, kmp_int32 argc, kmpc_micro microtask,
344                          kmp_int32 cond, void *args) {
345   int gtid = __kmp_entry_gtid();
346   int zero = 0;
347   if (cond) {
348     if (args)
349       __kmpc_fork_call(loc, argc, microtask, args);
350     else
351       __kmpc_fork_call(loc, argc, microtask);
352   } else {
353     __kmpc_serialized_parallel(loc, gtid);
354 
355     if (args)
356       microtask(&gtid, &zero, args);
357     else
358       microtask(&gtid, &zero);
359 
360     __kmpc_end_serialized_parallel(loc, gtid);
361   }
362 }
363 
364 /*!
365 @ingroup PARALLEL
366 @param loc source location information
367 @param global_tid global thread number
368 @param num_teams number of teams requested for the teams construct
369 @param num_threads number of threads per team requested for the teams construct
370 
371 Set the number of teams to be used by the teams construct.
372 This call is only required if the teams construct has a `num_teams` clause
373 or a `thread_limit` clause (or both).
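
For illustration only (the sequence shown is a sketch, not what any given
compiler emits verbatim), `#pragma omp teams num_teams(8) thread_limit(4)`
might be preceded by:
@code
__kmpc_push_num_teams(&loc, __kmpc_global_thread_num(&loc), 8, 4);
// ...followed by __kmpc_fork_teams() for the teams region itself.
@endcode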
374 */
375 void __kmpc_push_num_teams(ident_t *loc, kmp_int32 global_tid,
376                            kmp_int32 num_teams, kmp_int32 num_threads) {
377   KA_TRACE(20,
378            ("__kmpc_push_num_teams: enter T#%d num_teams=%d num_threads=%d\n",
379             global_tid, num_teams, num_threads));
380   __kmp_assert_valid_gtid(global_tid);
381   __kmp_push_num_teams(loc, global_tid, num_teams, num_threads);
382 }
383 
384 /*!
385 @ingroup PARALLEL
386 @param loc source location information
387 @param global_tid global thread number
388 @param num_teams_lb lower bound on number of teams requested for the teams
389 construct
390 @param num_teams_ub upper bound on number of teams requested for the teams
391 construct
392 @param num_threads number of threads per team requested for the teams construct
393 
394 Set the number of teams to be used by the teams construct. The number of initial
395 teams created will be greater than or equal to the lower bound and less than or
396 equal to the upper bound.
397 This call is only required if the teams construct has a `num_teams` clause
398 or a `thread_limit` clause (or both).
399 */
400 void __kmpc_push_num_teams_51(ident_t *loc, kmp_int32 global_tid,
401                               kmp_int32 num_teams_lb, kmp_int32 num_teams_ub,
402                               kmp_int32 num_threads) {
403   KA_TRACE(20, ("__kmpc_push_num_teams_51: enter T#%d num_teams_lb=%d"
404                 " num_teams_ub=%d num_threads=%d\n",
405                 global_tid, num_teams_lb, num_teams_ub, num_threads));
406   __kmp_assert_valid_gtid(global_tid);
407   __kmp_push_num_teams_51(loc, global_tid, num_teams_lb, num_teams_ub,
408                           num_threads);
409 }
410 
411 /*!
412 @ingroup PARALLEL
413 @param loc  source location information
414 @param argc  total number of arguments in the ellipsis
415 @param microtask  pointer to callback routine consisting of outlined teams
416 construct
417 @param ...  pointers to shared variables that aren't global
418 
419 Do the actual fork and call the microtask in the relevant number of threads.
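
As a hedged sketch (identifiers such as `teams_outlined` and `data` are
placeholders), a teams region could be outlined and launched roughly as:
@code
// Hypothetical compiler-generated microtask for the teams region body.
void teams_outlined(kmp_int32 *gtid, kmp_int32 *btid, double *data) { work(data); }

__kmpc_push_num_teams(&loc, __kmpc_global_thread_num(&loc), 8, 4); // optional clauses
__kmpc_fork_teams(&loc, 1, (kmpc_micro)teams_outlined, &data);
@endcode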
420 */
421 void __kmpc_fork_teams(ident_t *loc, kmp_int32 argc, kmpc_micro microtask,
422                        ...) {
423   int gtid = __kmp_entry_gtid();
424   kmp_info_t *this_thr = __kmp_threads[gtid];
425   va_list ap;
426   va_start(ap, microtask);
427 
428 #if KMP_STATS_ENABLED
429   KMP_COUNT_BLOCK(OMP_TEAMS);
430   stats_state_e previous_state = KMP_GET_THREAD_STATE();
431   if (previous_state == stats_state_e::SERIAL_REGION) {
432     KMP_EXCHANGE_PARTITIONED_TIMER(OMP_teams_overhead);
433   } else {
434     KMP_PUSH_PARTITIONED_TIMER(OMP_teams_overhead);
435   }
436 #endif
437 
438   // remember teams entry point and nesting level
439   this_thr->th.th_teams_microtask = microtask;
440   this_thr->th.th_teams_level =
441       this_thr->th.th_team->t.t_level; // AC: can be >0 on host
442 
443 #if OMPT_SUPPORT
444   kmp_team_t *parent_team = this_thr->th.th_team;
445   int tid = __kmp_tid_from_gtid(gtid);
446   if (ompt_enabled.enabled) {
447     parent_team->t.t_implicit_task_taskdata[tid]
448         .ompt_task_info.frame.enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
449   }
450   OMPT_STORE_RETURN_ADDRESS(gtid);
451 #endif
452 
453   // Check whether __kmpc_push_num_teams was called; if not, set the default
454   // number of teams.
455   if (this_thr->th.th_teams_size.nteams == 0) {
456     __kmp_push_num_teams(loc, gtid, 0, 0);
457   }
458   KMP_DEBUG_ASSERT(this_thr->th.th_set_nproc >= 1);
459   KMP_DEBUG_ASSERT(this_thr->th.th_teams_size.nteams >= 1);
460   KMP_DEBUG_ASSERT(this_thr->th.th_teams_size.nth >= 1);
461 
462   __kmp_fork_call(
463       loc, gtid, fork_context_intel, argc,
464       VOLATILE_CAST(microtask_t) __kmp_teams_master, // "wrapped" task
465       VOLATILE_CAST(launch_t) __kmp_invoke_teams_master, kmp_va_addr_of(ap));
466   __kmp_join_call(loc, gtid
467 #if OMPT_SUPPORT
468                   ,
469                   fork_context_intel
470 #endif
471   );
472 
473   // Pop current CG root off list
474   KMP_DEBUG_ASSERT(this_thr->th.th_cg_roots);
475   kmp_cg_root_t *tmp = this_thr->th.th_cg_roots;
476   this_thr->th.th_cg_roots = tmp->up;
477   KA_TRACE(100, ("__kmpc_fork_teams: Thread %p popping node %p and moving up"
478                  " to node %p. cg_nthreads was %d\n",
479                  this_thr, tmp, this_thr->th.th_cg_roots, tmp->cg_nthreads));
480   KMP_DEBUG_ASSERT(tmp->cg_nthreads);
481   int i = tmp->cg_nthreads--;
482   if (i == 1) { // check if we are the last thread in CG (not always the case)
483     __kmp_free(tmp);
484   }
485   // Restore current task's thread_limit from CG root
486   KMP_DEBUG_ASSERT(this_thr->th.th_cg_roots);
487   this_thr->th.th_current_task->td_icvs.thread_limit =
488       this_thr->th.th_cg_roots->cg_thread_limit;
489 
490   this_thr->th.th_teams_microtask = NULL;
491   this_thr->th.th_teams_level = 0;
492   *(kmp_int64 *)(&this_thr->th.th_teams_size) = 0L;
493   va_end(ap);
494 #if KMP_STATS_ENABLED
495   if (previous_state == stats_state_e::SERIAL_REGION) {
496     KMP_EXCHANGE_PARTITIONED_TIMER(OMP_serial);
497     KMP_SET_THREAD_STATE(previous_state);
498   } else {
499     KMP_POP_PARTITIONED_TIMER();
500   }
501 #endif // KMP_STATS_ENABLED
502 }
503 
504 // I don't think this function should ever have been exported.
505 // The __kmpc_ prefix was misapplied.  I'm fairly certain that no generated
506 // openmp code ever called it, but it's been exported from the RTL for so
507 // long that I'm afraid to remove the definition.
508 int __kmpc_invoke_task_func(int gtid) { return __kmp_invoke_task_func(gtid); }
509 
510 /*!
511 @ingroup PARALLEL
512 @param loc  source location information
513 @param global_tid  global thread number
514 
515 Enter a serialized parallel construct. This interface is used to handle a
516 conditional parallel region, like this,
517 @code
518 #pragma omp parallel if (condition)
519 @endcode
520 when the condition is false.
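
For illustration (mirroring what __kmpc_fork_call_if() above does internally;
`outlined`, `x` and `zero` are placeholders), the false branch of such a region
can be executed on the encountering thread as:
@code
kmp_int32 gtid = __kmpc_global_thread_num(&loc), zero = 0;
__kmpc_serialized_parallel(&loc, gtid);
outlined(&gtid, &zero, &x); // run the microtask directly on this thread
__kmpc_end_serialized_parallel(&loc, gtid);
@endcode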
521 */
522 void __kmpc_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
523   // The implementation is now in kmp_runtime.cpp so that it can share static
524   // functions with kmp_fork_call since the tasks to be done are similar in
525   // each case.
526   __kmp_assert_valid_gtid(global_tid);
527 #if OMPT_SUPPORT
528   OMPT_STORE_RETURN_ADDRESS(global_tid);
529 #endif
530   __kmp_serialized_parallel(loc, global_tid);
531 }
532 
533 /*!
534 @ingroup PARALLEL
535 @param loc  source location information
536 @param global_tid  global thread number
537 
538 Leave a serialized parallel construct.
539 */
540 void __kmpc_end_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
541   kmp_internal_control_t *top;
542   kmp_info_t *this_thr;
543   kmp_team_t *serial_team;
544 
545   KC_TRACE(10,
546            ("__kmpc_end_serialized_parallel: called by T#%d\n", global_tid));
547 
548   /* skip all this code for autopar serialized loops since it results in
549      unacceptable overhead */
550   if (loc != NULL && (loc->flags & KMP_IDENT_AUTOPAR))
551     return;
552 
553   // Not autopar code
554   __kmp_assert_valid_gtid(global_tid);
555   if (!TCR_4(__kmp_init_parallel))
556     __kmp_parallel_initialize();
557 
558   __kmp_resume_if_soft_paused();
559 
560   this_thr = __kmp_threads[global_tid];
561   serial_team = this_thr->th.th_serial_team;
562 
563   kmp_task_team_t *task_team = this_thr->th.th_task_team;
564   // we need to wait for the proxy tasks before finishing the thread
565   if (task_team != NULL && (task_team->tt.tt_found_proxy_tasks ||
566                             task_team->tt.tt_hidden_helper_task_encountered))
567     __kmp_task_team_wait(this_thr, serial_team USE_ITT_BUILD_ARG(NULL));
568 
569   KMP_MB();
570   KMP_DEBUG_ASSERT(serial_team);
571   KMP_ASSERT(serial_team->t.t_serialized);
572   KMP_DEBUG_ASSERT(this_thr->th.th_team == serial_team);
573   KMP_DEBUG_ASSERT(serial_team != this_thr->th.th_root->r.r_root_team);
574   KMP_DEBUG_ASSERT(serial_team->t.t_threads);
575   KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
576 
577 #if OMPT_SUPPORT
578   if (ompt_enabled.enabled &&
579       this_thr->th.ompt_thread_info.state != ompt_state_overhead) {
580     OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame = ompt_data_none;
581     if (ompt_enabled.ompt_callback_implicit_task) {
582       ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
583           ompt_scope_end, NULL, OMPT_CUR_TASK_DATA(this_thr), 1,
584           OMPT_CUR_TASK_INFO(this_thr)->thread_num, ompt_task_implicit);
585     }
586 
587     // Clear the task id only after unlinking the task.
588     ompt_data_t *parent_task_data;
589     __ompt_get_task_info_internal(1, NULL, &parent_task_data, NULL, NULL, NULL);
590 
591     if (ompt_enabled.ompt_callback_parallel_end) {
592       ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
593           &(serial_team->t.ompt_team_info.parallel_data), parent_task_data,
594           ompt_parallel_invoker_program | ompt_parallel_team,
595           OMPT_LOAD_RETURN_ADDRESS(global_tid));
596     }
597     __ompt_lw_taskteam_unlink(this_thr);
598     this_thr->th.ompt_thread_info.state = ompt_state_overhead;
599   }
600 #endif
601 
602   /* If necessary, pop the internal control stack values and replace the team
603    * values */
604   top = serial_team->t.t_control_stack_top;
605   if (top && top->serial_nesting_level == serial_team->t.t_serialized) {
606     copy_icvs(&serial_team->t.t_threads[0]->th.th_current_task->td_icvs, top);
607     serial_team->t.t_control_stack_top = top->next;
608     __kmp_free(top);
609   }
610 
611   /* pop dispatch buffers stack */
612   KMP_DEBUG_ASSERT(serial_team->t.t_dispatch->th_disp_buffer);
613   {
614     dispatch_private_info_t *disp_buffer =
615         serial_team->t.t_dispatch->th_disp_buffer;
616     serial_team->t.t_dispatch->th_disp_buffer =
617         serial_team->t.t_dispatch->th_disp_buffer->next;
618     __kmp_free(disp_buffer);
619   }
620   this_thr->th.th_def_allocator = serial_team->t.t_def_allocator; // restore
621 
622   --serial_team->t.t_serialized;
623   if (serial_team->t.t_serialized == 0) {
624 
625     /* return to the parallel section */
626 
627 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
628     if (__kmp_inherit_fp_control && serial_team->t.t_fp_control_saved) {
629       __kmp_clear_x87_fpu_status_word();
630       __kmp_load_x87_fpu_control_word(&serial_team->t.t_x87_fpu_control_word);
631       __kmp_load_mxcsr(&serial_team->t.t_mxcsr);
632     }
633 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
634 
635     __kmp_pop_current_task_from_thread(this_thr);
636 #if OMPD_SUPPORT
637     if (ompd_state & OMPD_ENABLE_BP)
638       ompd_bp_parallel_end();
639 #endif
640 
641     this_thr->th.th_team = serial_team->t.t_parent;
642     this_thr->th.th_info.ds.ds_tid = serial_team->t.t_master_tid;
643 
644     /* restore values cached in the thread */
645     this_thr->th.th_team_nproc = serial_team->t.t_parent->t.t_nproc; /*  JPH */
646     this_thr->th.th_team_master =
647         serial_team->t.t_parent->t.t_threads[0]; /* JPH */
648     this_thr->th.th_team_serialized = this_thr->th.th_team->t.t_serialized;
649 
650     /* TODO the below shouldn't need to be adjusted for serialized teams */
651     this_thr->th.th_dispatch =
652         &this_thr->th.th_team->t.t_dispatch[serial_team->t.t_master_tid];
653 
654     KMP_ASSERT(this_thr->th.th_current_task->td_flags.executing == 0);
655     this_thr->th.th_current_task->td_flags.executing = 1;
656 
657     if (__kmp_tasking_mode != tskm_immediate_exec) {
658       // Copy the task team from the new child / old parent team to the thread.
659       this_thr->th.th_task_team =
660           this_thr->th.th_team->t.t_task_team[this_thr->th.th_task_state];
661       KA_TRACE(20,
662                ("__kmpc_end_serialized_parallel: T#%d restoring task_team %p / "
663                 "team %p\n",
664                 global_tid, this_thr->th.th_task_team, this_thr->th.th_team));
665     }
666 #if KMP_AFFINITY_SUPPORTED
667     if (this_thr->th.th_team->t.t_level == 0 && __kmp_affinity.flags.reset) {
668       __kmp_reset_root_init_mask(global_tid);
669     }
670 #endif
671   } else {
672     if (__kmp_tasking_mode != tskm_immediate_exec) {
673       KA_TRACE(20, ("__kmpc_end_serialized_parallel: T#%d decreasing nesting "
674                     "depth of serial team %p to %d\n",
675                     global_tid, serial_team, serial_team->t.t_serialized));
676     }
677   }
678 
679   serial_team->t.t_level--;
680   if (__kmp_env_consistency_check)
681     __kmp_pop_parallel(global_tid, NULL);
682 #if OMPT_SUPPORT
683   if (ompt_enabled.enabled)
684     this_thr->th.ompt_thread_info.state =
685         ((this_thr->th.th_team_serialized) ? ompt_state_work_serial
686                                            : ompt_state_work_parallel);
687 #endif
688 }
689 
690 /*!
691 @ingroup SYNCHRONIZATION
692 @param loc  source location information.
693 
694 Execute <tt>flush</tt>. This is implemented as a full memory fence. (Though,
695 depending on the memory ordering convention obeyed by the compiler,
696 even that may not be necessary.)
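
For example (a sketch; `loc` denotes a compiler-emitted ident_t), a bare
`#pragma omp flush` simply becomes:
@code
__kmpc_flush(&loc);
@endcode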
697 */
698 void __kmpc_flush(ident_t *loc) {
699   KC_TRACE(10, ("__kmpc_flush: called\n"));
700 
701   /* Need an explicit memory fence (__mf()) here, since the library uses volatile accesses rather than atomics. */
702   KMP_MFENCE(); /* Flush all pending memory write invalidates.  */
703 
704 #if OMPT_SUPPORT && OMPT_OPTIONAL
705   if (ompt_enabled.ompt_callback_flush) {
706     ompt_callbacks.ompt_callback(ompt_callback_flush)(
707         __ompt_get_thread_data_internal(), OMPT_GET_RETURN_ADDRESS(0));
708   }
709 #endif
710 }
711 
712 /* -------------------------------------------------------------------------- */
713 /*!
714 @ingroup SYNCHRONIZATION
715 @param loc source location information
716 @param global_tid thread id.
717 
718 Execute a barrier.
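
For illustration only (a sketch, not verbatim compiler output; `loc` is a
compiler-emitted ident_t), `#pragma omp barrier` might be lowered as:
@code
__kmpc_barrier(&loc, __kmpc_global_thread_num(&loc));
@endcode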
719 */
720 void __kmpc_barrier(ident_t *loc, kmp_int32 global_tid) {
721   KMP_COUNT_BLOCK(OMP_BARRIER);
722   KC_TRACE(10, ("__kmpc_barrier: called T#%d\n", global_tid));
723   __kmp_assert_valid_gtid(global_tid);
724 
725   if (!TCR_4(__kmp_init_parallel))
726     __kmp_parallel_initialize();
727 
728   __kmp_resume_if_soft_paused();
729 
730   if (__kmp_env_consistency_check) {
731     if (loc == 0) {
732       KMP_WARNING(ConstructIdentInvalid); // ??? What does it mean for the user?
733     }
734     __kmp_check_barrier(global_tid, ct_barrier, loc);
735   }
736 
737 #if OMPT_SUPPORT
738   ompt_frame_t *ompt_frame;
739   if (ompt_enabled.enabled) {
740     __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
741     if (ompt_frame->enter_frame.ptr == NULL)
742       ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
743   }
744   OMPT_STORE_RETURN_ADDRESS(global_tid);
745 #endif
746   __kmp_threads[global_tid]->th.th_ident = loc;
747   // TODO: explicit barrier_wait_id:
748   //   this function is called when the 'barrier' directive is present or at the
749   //   implicit barrier at the end of a worksharing construct.
750   // 1) better to add a per-thread barrier counter to a thread data structure
751   // 2) set to 0 when a new team is created
752   // 3) no sync is required
753 
754   __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
755 #if OMPT_SUPPORT && OMPT_OPTIONAL
756   if (ompt_enabled.enabled) {
757     ompt_frame->enter_frame = ompt_data_none;
758   }
759 #endif
760 }
761 
762 /* The BARRIER for a MASTER section is always explicit   */
763 /*!
764 @ingroup WORK_SHARING
765 @param loc  source location information.
766 @param global_tid  global thread number.
767 @return 1 if this thread should execute the <tt>master</tt> block, 0 otherwise.
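
As a hedged sketch of the usual pairing (not verbatim compiler output), a
`#pragma omp master` block maps onto this entry point and __kmpc_end_master():
@code
if (__kmpc_master(&loc, gtid)) {
  /* body of the master region */
  __kmpc_end_master(&loc, gtid);
}
@endcode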
768 */
769 kmp_int32 __kmpc_master(ident_t *loc, kmp_int32 global_tid) {
770   int status = 0;
771 
772   KC_TRACE(10, ("__kmpc_master: called T#%d\n", global_tid));
773   __kmp_assert_valid_gtid(global_tid);
774 
775   if (!TCR_4(__kmp_init_parallel))
776     __kmp_parallel_initialize();
777 
778   __kmp_resume_if_soft_paused();
779 
780   if (KMP_MASTER_GTID(global_tid)) {
781     KMP_COUNT_BLOCK(OMP_MASTER);
782     KMP_PUSH_PARTITIONED_TIMER(OMP_master);
783     status = 1;
784   }
785 
786 #if OMPT_SUPPORT && OMPT_OPTIONAL
787   if (status) {
788     if (ompt_enabled.ompt_callback_masked) {
789       kmp_info_t *this_thr = __kmp_threads[global_tid];
790       kmp_team_t *team = this_thr->th.th_team;
791 
792       int tid = __kmp_tid_from_gtid(global_tid);
793       ompt_callbacks.ompt_callback(ompt_callback_masked)(
794           ompt_scope_begin, &(team->t.ompt_team_info.parallel_data),
795           &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
796           OMPT_GET_RETURN_ADDRESS(0));
797     }
798   }
799 #endif
800 
801   if (__kmp_env_consistency_check) {
802 #if KMP_USE_DYNAMIC_LOCK
803     if (status)
804       __kmp_push_sync(global_tid, ct_master, loc, NULL, 0);
805     else
806       __kmp_check_sync(global_tid, ct_master, loc, NULL, 0);
807 #else
808     if (status)
809       __kmp_push_sync(global_tid, ct_master, loc, NULL);
810     else
811       __kmp_check_sync(global_tid, ct_master, loc, NULL);
812 #endif
813   }
814 
815   return status;
816 }
817 
818 /*!
819 @ingroup WORK_SHARING
820 @param loc  source location information.
821 @param global_tid  global thread number.
822 
823 Mark the end of a <tt>master</tt> region. This should only be called by the
824 thread that executes the <tt>master</tt> region.
825 */
826 void __kmpc_end_master(ident_t *loc, kmp_int32 global_tid) {
827   KC_TRACE(10, ("__kmpc_end_master: called T#%d\n", global_tid));
828   __kmp_assert_valid_gtid(global_tid);
829   KMP_DEBUG_ASSERT(KMP_MASTER_GTID(global_tid));
830   KMP_POP_PARTITIONED_TIMER();
831 
832 #if OMPT_SUPPORT && OMPT_OPTIONAL
833   kmp_info_t *this_thr = __kmp_threads[global_tid];
834   kmp_team_t *team = this_thr->th.th_team;
835   if (ompt_enabled.ompt_callback_masked) {
836     int tid = __kmp_tid_from_gtid(global_tid);
837     ompt_callbacks.ompt_callback(ompt_callback_masked)(
838         ompt_scope_end, &(team->t.ompt_team_info.parallel_data),
839         &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
840         OMPT_GET_RETURN_ADDRESS(0));
841   }
842 #endif
843 
844   if (__kmp_env_consistency_check) {
845     if (KMP_MASTER_GTID(global_tid))
846       __kmp_pop_sync(global_tid, ct_master, loc);
847   }
848 }
849 
850 /*!
851 @ingroup WORK_SHARING
852 @param loc  source location information.
853 @param global_tid  global thread number.
854 @param filter result of evaluating filter clause on thread global_tid, or zero
855 if no filter clause present
856 @return 1 if this thread should execute the <tt>masked</tt> block, 0 otherwise.
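
For illustration (a sketch; the filter expression is evaluated by the caller),
a `#pragma omp masked filter(expr)` block follows the same pattern as master:
@code
if (__kmpc_masked(&loc, gtid, expr)) {
  /* body of the masked region */
  __kmpc_end_masked(&loc, gtid);
}
@endcode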
857 */
858 kmp_int32 __kmpc_masked(ident_t *loc, kmp_int32 global_tid, kmp_int32 filter) {
859   int status = 0;
860   int tid;
861   KC_TRACE(10, ("__kmpc_masked: called T#%d\n", global_tid));
862   __kmp_assert_valid_gtid(global_tid);
863 
864   if (!TCR_4(__kmp_init_parallel))
865     __kmp_parallel_initialize();
866 
867   __kmp_resume_if_soft_paused();
868 
869   tid = __kmp_tid_from_gtid(global_tid);
870   if (tid == filter) {
871     KMP_COUNT_BLOCK(OMP_MASKED);
872     KMP_PUSH_PARTITIONED_TIMER(OMP_masked);
873     status = 1;
874   }
875 
876 #if OMPT_SUPPORT && OMPT_OPTIONAL
877   if (status) {
878     if (ompt_enabled.ompt_callback_masked) {
879       kmp_info_t *this_thr = __kmp_threads[global_tid];
880       kmp_team_t *team = this_thr->th.th_team;
881       ompt_callbacks.ompt_callback(ompt_callback_masked)(
882           ompt_scope_begin, &(team->t.ompt_team_info.parallel_data),
883           &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
884           OMPT_GET_RETURN_ADDRESS(0));
885     }
886   }
887 #endif
888 
889   if (__kmp_env_consistency_check) {
890 #if KMP_USE_DYNAMIC_LOCK
891     if (status)
892       __kmp_push_sync(global_tid, ct_masked, loc, NULL, 0);
893     else
894       __kmp_check_sync(global_tid, ct_masked, loc, NULL, 0);
895 #else
896     if (status)
897       __kmp_push_sync(global_tid, ct_masked, loc, NULL);
898     else
899       __kmp_check_sync(global_tid, ct_masked, loc, NULL);
900 #endif
901   }
902 
903   return status;
904 }
905 
906 /*!
907 @ingroup WORK_SHARING
908 @param loc  source location information.
909 @param global_tid  global thread number.
910 
911 Mark the end of a <tt>masked</tt> region. This should only be called by the
912 thread that executes the <tt>masked</tt> region.
913 */
914 void __kmpc_end_masked(ident_t *loc, kmp_int32 global_tid) {
915   KC_TRACE(10, ("__kmpc_end_masked: called T#%d\n", global_tid));
916   __kmp_assert_valid_gtid(global_tid);
917   KMP_POP_PARTITIONED_TIMER();
918 
919 #if OMPT_SUPPORT && OMPT_OPTIONAL
920   kmp_info_t *this_thr = __kmp_threads[global_tid];
921   kmp_team_t *team = this_thr->th.th_team;
922   if (ompt_enabled.ompt_callback_masked) {
923     int tid = __kmp_tid_from_gtid(global_tid);
924     ompt_callbacks.ompt_callback(ompt_callback_masked)(
925         ompt_scope_end, &(team->t.ompt_team_info.parallel_data),
926         &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
927         OMPT_GET_RETURN_ADDRESS(0));
928   }
929 #endif
930 
931   if (__kmp_env_consistency_check) {
932     __kmp_pop_sync(global_tid, ct_masked, loc);
933   }
934 }
935 
936 /*!
937 @ingroup WORK_SHARING
938 @param loc  source location information.
939 @param gtid  global thread number.
940 
941 Start execution of an <tt>ordered</tt> construct.
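
As an illustrative sketch (not verbatim compiler output), each iteration of a
loop with an `ordered` block brackets that block with this call and
__kmpc_end_ordered():
@code
/* inside the body of a #pragma omp for ordered loop */
__kmpc_ordered(&loc, gtid);
/* ordered block: executed in iteration order */
__kmpc_end_ordered(&loc, gtid);
@endcode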
942 */
943 void __kmpc_ordered(ident_t *loc, kmp_int32 gtid) {
944   int cid = 0;
945   kmp_info_t *th;
946   KMP_DEBUG_ASSERT(__kmp_init_serial);
947 
948   KC_TRACE(10, ("__kmpc_ordered: called T#%d\n", gtid));
949   __kmp_assert_valid_gtid(gtid);
950 
951   if (!TCR_4(__kmp_init_parallel))
952     __kmp_parallel_initialize();
953 
954   __kmp_resume_if_soft_paused();
955 
956 #if USE_ITT_BUILD
957   __kmp_itt_ordered_prep(gtid);
958 // TODO: ordered_wait_id
959 #endif /* USE_ITT_BUILD */
960 
961   th = __kmp_threads[gtid];
962 
963 #if OMPT_SUPPORT && OMPT_OPTIONAL
964   kmp_team_t *team;
965   ompt_wait_id_t lck;
966   void *codeptr_ra;
967   OMPT_STORE_RETURN_ADDRESS(gtid);
968   if (ompt_enabled.enabled) {
969     team = __kmp_team_from_gtid(gtid);
970     lck = (ompt_wait_id_t)(uintptr_t)&team->t.t_ordered.dt.t_value;
971     /* OMPT state update */
972     th->th.ompt_thread_info.wait_id = lck;
973     th->th.ompt_thread_info.state = ompt_state_wait_ordered;
974 
975     /* OMPT event callback */
976     codeptr_ra = OMPT_LOAD_RETURN_ADDRESS(gtid);
977     if (ompt_enabled.ompt_callback_mutex_acquire) {
978       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
979           ompt_mutex_ordered, omp_lock_hint_none, kmp_mutex_impl_spin, lck,
980           codeptr_ra);
981     }
982   }
983 #endif
984 
985   if (th->th.th_dispatch->th_deo_fcn != 0)
986     (*th->th.th_dispatch->th_deo_fcn)(&gtid, &cid, loc);
987   else
988     __kmp_parallel_deo(&gtid, &cid, loc);
989 
990 #if OMPT_SUPPORT && OMPT_OPTIONAL
991   if (ompt_enabled.enabled) {
992     /* OMPT state update */
993     th->th.ompt_thread_info.state = ompt_state_work_parallel;
994     th->th.ompt_thread_info.wait_id = 0;
995 
996     /* OMPT event callback */
997     if (ompt_enabled.ompt_callback_mutex_acquired) {
998       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
999           ompt_mutex_ordered, (ompt_wait_id_t)(uintptr_t)lck, codeptr_ra);
1000     }
1001   }
1002 #endif
1003 
1004 #if USE_ITT_BUILD
1005   __kmp_itt_ordered_start(gtid);
1006 #endif /* USE_ITT_BUILD */
1007 }
1008 
1009 /*!
1010 @ingroup WORK_SHARING
1011 @param loc  source location information.
1012 @param gtid  global thread number.
1013 
1014 End execution of an <tt>ordered</tt> construct.
1015 */
1016 void __kmpc_end_ordered(ident_t *loc, kmp_int32 gtid) {
1017   int cid = 0;
1018   kmp_info_t *th;
1019 
1020   KC_TRACE(10, ("__kmpc_end_ordered: called T#%d\n", gtid));
1021   __kmp_assert_valid_gtid(gtid);
1022 
1023 #if USE_ITT_BUILD
1024   __kmp_itt_ordered_end(gtid);
1025 // TODO: ordered_wait_id
1026 #endif /* USE_ITT_BUILD */
1027 
1028   th = __kmp_threads[gtid];
1029 
1030   if (th->th.th_dispatch->th_dxo_fcn != 0)
1031     (*th->th.th_dispatch->th_dxo_fcn)(&gtid, &cid, loc);
1032   else
1033     __kmp_parallel_dxo(&gtid, &cid, loc);
1034 
1035 #if OMPT_SUPPORT && OMPT_OPTIONAL
1036   OMPT_STORE_RETURN_ADDRESS(gtid);
1037   if (ompt_enabled.ompt_callback_mutex_released) {
1038     ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
1039         ompt_mutex_ordered,
1040         (ompt_wait_id_t)(uintptr_t)&__kmp_team_from_gtid(gtid)
1041             ->t.t_ordered.dt.t_value,
1042         OMPT_LOAD_RETURN_ADDRESS(gtid));
1043   }
1044 #endif
1045 }
1046 
1047 #if KMP_USE_DYNAMIC_LOCK
1048 
1049 static __forceinline void
1050 __kmp_init_indirect_csptr(kmp_critical_name *crit, ident_t const *loc,
1051                           kmp_int32 gtid, kmp_indirect_locktag_t tag) {
1052   // The pointer to the allocated indirect lock is written to crit; the lock
1053   // index returned by the allocation is ignored.
1054   void *idx;
1055   kmp_indirect_lock_t **lck;
1056   lck = (kmp_indirect_lock_t **)crit;
1057   kmp_indirect_lock_t *ilk = __kmp_allocate_indirect_lock(&idx, gtid, tag);
1058   KMP_I_LOCK_FUNC(ilk, init)(ilk->lock);
1059   KMP_SET_I_LOCK_LOCATION(ilk, loc);
1060   KMP_SET_I_LOCK_FLAGS(ilk, kmp_lf_critical_section);
1061   KA_TRACE(20,
1062            ("__kmp_init_indirect_csptr: initialized indirect lock #%d\n", tag));
1063 #if USE_ITT_BUILD
1064   __kmp_itt_critical_creating(ilk->lock, loc);
1065 #endif
1066   int status = KMP_COMPARE_AND_STORE_PTR(lck, nullptr, ilk);
1067   if (status == 0) {
1068 #if USE_ITT_BUILD
1069     __kmp_itt_critical_destroyed(ilk->lock);
1070 #endif
1071     // We don't really need to destroy the unclaimed lock here since it will be
1072     // cleaned up at program exit.
1073     // KMP_D_LOCK_FUNC(&idx, destroy)((kmp_dyna_lock_t *)&idx);
1074   }
1075   KMP_DEBUG_ASSERT(*lck != NULL);
1076 }
1077 
1078 // Fast-path acquire tas lock
1079 #define KMP_ACQUIRE_TAS_LOCK(lock, gtid)                                       \
1080   {                                                                            \
1081     kmp_tas_lock_t *l = (kmp_tas_lock_t *)lock;                                \
1082     kmp_int32 tas_free = KMP_LOCK_FREE(tas);                                   \
1083     kmp_int32 tas_busy = KMP_LOCK_BUSY(gtid + 1, tas);                         \
1084     if (KMP_ATOMIC_LD_RLX(&l->lk.poll) != tas_free ||                          \
1085         !__kmp_atomic_compare_store_acq(&l->lk.poll, tas_free, tas_busy)) {    \
1086       kmp_uint32 spins;                                                        \
1087       KMP_FSYNC_PREPARE(l);                                                    \
1088       KMP_INIT_YIELD(spins);                                                   \
1089       kmp_backoff_t backoff = __kmp_spin_backoff_params;                       \
1090       do {                                                                     \
1091         if (TCR_4(__kmp_nth) >                                                 \
1092             (__kmp_avail_proc ? __kmp_avail_proc : __kmp_xproc)) {             \
1093           KMP_YIELD(TRUE);                                                     \
1094         } else {                                                               \
1095           KMP_YIELD_SPIN(spins);                                               \
1096         }                                                                      \
1097         __kmp_spin_backoff(&backoff);                                          \
1098       } while (                                                                \
1099           KMP_ATOMIC_LD_RLX(&l->lk.poll) != tas_free ||                        \
1100           !__kmp_atomic_compare_store_acq(&l->lk.poll, tas_free, tas_busy));   \
1101     }                                                                          \
1102     KMP_FSYNC_ACQUIRED(l);                                                     \
1103   }
1104 
1105 // Fast-path test tas lock
1106 #define KMP_TEST_TAS_LOCK(lock, gtid, rc)                                      \
1107   {                                                                            \
1108     kmp_tas_lock_t *l = (kmp_tas_lock_t *)lock;                                \
1109     kmp_int32 tas_free = KMP_LOCK_FREE(tas);                                   \
1110     kmp_int32 tas_busy = KMP_LOCK_BUSY(gtid + 1, tas);                         \
1111     rc = KMP_ATOMIC_LD_RLX(&l->lk.poll) == tas_free &&                         \
1112          __kmp_atomic_compare_store_acq(&l->lk.poll, tas_free, tas_busy);      \
1113   }
1114 
1115 // Fast-path release tas lock
1116 #define KMP_RELEASE_TAS_LOCK(lock, gtid)                                       \
1117   { KMP_ATOMIC_ST_REL(&((kmp_tas_lock_t *)lock)->lk.poll, KMP_LOCK_FREE(tas)); }
1118 
1119 #if KMP_USE_FUTEX
1120 
1121 #include <sys/syscall.h>
1122 #include <unistd.h>
1123 #ifndef FUTEX_WAIT
1124 #define FUTEX_WAIT 0
1125 #endif
1126 #ifndef FUTEX_WAKE
1127 #define FUTEX_WAKE 1
1128 #endif
1129 
1130 // Fast-path acquire futex lock
1131 #define KMP_ACQUIRE_FUTEX_LOCK(lock, gtid)                                     \
1132   {                                                                            \
1133     kmp_futex_lock_t *ftx = (kmp_futex_lock_t *)lock;                          \
1134     kmp_int32 gtid_code = (gtid + 1) << 1;                                     \
1135     KMP_MB();                                                                  \
1136     KMP_FSYNC_PREPARE(ftx);                                                    \
1137     kmp_int32 poll_val;                                                        \
1138     while ((poll_val = KMP_COMPARE_AND_STORE_RET32(                            \
1139                 &(ftx->lk.poll), KMP_LOCK_FREE(futex),                         \
1140                 KMP_LOCK_BUSY(gtid_code, futex))) != KMP_LOCK_FREE(futex)) {   \
1141       kmp_int32 cond = KMP_LOCK_STRIP(poll_val) & 1;                           \
1142       if (!cond) {                                                             \
1143         if (!KMP_COMPARE_AND_STORE_RET32(&(ftx->lk.poll), poll_val,            \
1144                                          poll_val |                            \
1145                                              KMP_LOCK_BUSY(1, futex))) {       \
1146           continue;                                                            \
1147         }                                                                      \
1148         poll_val |= KMP_LOCK_BUSY(1, futex);                                   \
1149       }                                                                        \
1150       kmp_int32 rc;                                                            \
1151       if ((rc = syscall(__NR_futex, &(ftx->lk.poll), FUTEX_WAIT, poll_val,     \
1152                         NULL, NULL, 0)) != 0) {                                \
1153         continue;                                                              \
1154       }                                                                        \
1155       gtid_code |= 1;                                                          \
1156     }                                                                          \
1157     KMP_FSYNC_ACQUIRED(ftx);                                                   \
1158   }
1159 
1160 // Fast-path test futex lock
1161 #define KMP_TEST_FUTEX_LOCK(lock, gtid, rc)                                    \
1162   {                                                                            \
1163     kmp_futex_lock_t *ftx = (kmp_futex_lock_t *)lock;                          \
1164     if (KMP_COMPARE_AND_STORE_ACQ32(&(ftx->lk.poll), KMP_LOCK_FREE(futex),     \
1165                                     KMP_LOCK_BUSY(gtid + 1 << 1, futex))) {    \
1166       KMP_FSYNC_ACQUIRED(ftx);                                                 \
1167       rc = TRUE;                                                               \
1168     } else {                                                                   \
1169       rc = FALSE;                                                              \
1170     }                                                                          \
1171   }
1172 
1173 // Fast-path release futex lock
1174 #define KMP_RELEASE_FUTEX_LOCK(lock, gtid)                                     \
1175   {                                                                            \
1176     kmp_futex_lock_t *ftx = (kmp_futex_lock_t *)lock;                          \
1177     KMP_MB();                                                                  \
1178     KMP_FSYNC_RELEASING(ftx);                                                  \
1179     kmp_int32 poll_val =                                                       \
1180         KMP_XCHG_FIXED32(&(ftx->lk.poll), KMP_LOCK_FREE(futex));               \
1181     if (KMP_LOCK_STRIP(poll_val) & 1) {                                        \
1182       syscall(__NR_futex, &(ftx->lk.poll), FUTEX_WAKE,                         \
1183               KMP_LOCK_BUSY(1, futex), NULL, NULL, 0);                         \
1184     }                                                                          \
1185     KMP_MB();                                                                  \
1186     KMP_YIELD_OVERSUB();                                                       \
1187   }
1188 
1189 #endif // KMP_USE_FUTEX
1190 
1191 #else // KMP_USE_DYNAMIC_LOCK
1192 
1193 static kmp_user_lock_p __kmp_get_critical_section_ptr(kmp_critical_name *crit,
1194                                                       ident_t const *loc,
1195                                                       kmp_int32 gtid) {
1196   kmp_user_lock_p *lck_pp = (kmp_user_lock_p *)crit;
1197 
1198   // Because of the double-check, the following load doesn't need to be volatile
1199   kmp_user_lock_p lck = (kmp_user_lock_p)TCR_PTR(*lck_pp);
1200 
1201   if (lck == NULL) {
1202     void *idx;
1203 
1204     // Allocate & initialize the lock.
1205     // Remember alloc'ed locks in table in order to free them in __kmp_cleanup()
1206     lck = __kmp_user_lock_allocate(&idx, gtid, kmp_lf_critical_section);
1207     __kmp_init_user_lock_with_checks(lck);
1208     __kmp_set_user_lock_location(lck, loc);
1209 #if USE_ITT_BUILD
1210     __kmp_itt_critical_creating(lck);
1211 // __kmp_itt_critical_creating() should be called *before* the first usage
1212 // of the underlying lock. It is the only place where we can guarantee it. The
1213 // lock may be destroyed without ever being used, but that is not a problem,
1214 // because this is not a real event seen by the user; it merely sets a name for
1215 // the object (lock). See more details in kmp_itt.h.
1216 #endif /* USE_ITT_BUILD */
1217 
1218     // Use a cmpxchg instruction to slam the start of the critical section with
1219     // the lock pointer.  If another thread beat us to it, deallocate the lock,
1220     // and use the lock that the other thread allocated.
1221     int status = KMP_COMPARE_AND_STORE_PTR(lck_pp, 0, lck);
1222 
1223     if (status == 0) {
1224 // Deallocate the lock and reload the value.
1225 #if USE_ITT_BUILD
1226       __kmp_itt_critical_destroyed(lck);
1227 // Let ITT know the lock is destroyed and the same memory location may be reused
1228 // for another purpose.
1229 #endif /* USE_ITT_BUILD */
1230       __kmp_destroy_user_lock_with_checks(lck);
1231       __kmp_user_lock_free(&idx, gtid, lck);
1232       lck = (kmp_user_lock_p)TCR_PTR(*lck_pp);
1233       KMP_DEBUG_ASSERT(lck != NULL);
1234     }
1235   }
1236   return lck;
1237 }
1238 
1239 #endif // KMP_USE_DYNAMIC_LOCK
1240 
1241 /*!
1242 @ingroup WORK_SHARING
1243 @param loc  source location information.
1244 @param global_tid  global thread number.
1245 @param crit identity of the critical section. This could be a pointer to a lock
1246 associated with the critical section, or some other suitably unique value.
1247 
1248 Enter code protected by a `critical` construct.
1249 This function blocks until the executing thread can enter the critical section.
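
As a hedged sketch (the lock name is a placeholder), a named critical section
such as `#pragma omp critical(io)` is identified by a zero-initialized,
statically allocated kmp_critical_name shared by every entry to that name, and
is bracketed with the matching __kmpc_end_critical() call:
@code
static kmp_critical_name crit_io; /* zero-initialized; identifies the section */
__kmpc_critical(&loc, gtid, &crit_io);
/* body of the critical section */
__kmpc_end_critical(&loc, gtid, &crit_io);
@endcode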
1250 */
1251 void __kmpc_critical(ident_t *loc, kmp_int32 global_tid,
1252                      kmp_critical_name *crit) {
1253 #if KMP_USE_DYNAMIC_LOCK
1254 #if OMPT_SUPPORT && OMPT_OPTIONAL
1255   OMPT_STORE_RETURN_ADDRESS(global_tid);
1256 #endif // OMPT_SUPPORT
1257   __kmpc_critical_with_hint(loc, global_tid, crit, omp_lock_hint_none);
1258 #else
1259   KMP_COUNT_BLOCK(OMP_CRITICAL);
1260 #if OMPT_SUPPORT && OMPT_OPTIONAL
1261   ompt_state_t prev_state = ompt_state_undefined;
1262   ompt_thread_info_t ti;
1263 #endif
1264   kmp_user_lock_p lck;
1265 
1266   KC_TRACE(10, ("__kmpc_critical: called T#%d\n", global_tid));
1267   __kmp_assert_valid_gtid(global_tid);
1268 
1269   // TODO: add THR_OVHD_STATE
1270 
1271   KMP_PUSH_PARTITIONED_TIMER(OMP_critical_wait);
1272   KMP_CHECK_USER_LOCK_INIT();
1273 
1274   if ((__kmp_user_lock_kind == lk_tas) &&
1275       (sizeof(lck->tas.lk.poll) <= OMP_CRITICAL_SIZE)) {
1276     lck = (kmp_user_lock_p)crit;
1277   }
1278 #if KMP_USE_FUTEX
1279   else if ((__kmp_user_lock_kind == lk_futex) &&
1280            (sizeof(lck->futex.lk.poll) <= OMP_CRITICAL_SIZE)) {
1281     lck = (kmp_user_lock_p)crit;
1282   }
1283 #endif
1284   else { // ticket, queuing or drdpa
1285     lck = __kmp_get_critical_section_ptr(crit, loc, global_tid);
1286   }
1287 
1288   if (__kmp_env_consistency_check)
1289     __kmp_push_sync(global_tid, ct_critical, loc, lck);
1290 
1291     // Since the critical directive binds to all threads, not just the current
1292     // team, we have to check this even if we are in a serialized team.
1293     // Also, even if we are the uber thread, we still have to acquire the lock,
1294     // as we have to contend with sibling threads.
1295 
1296 #if USE_ITT_BUILD
1297   __kmp_itt_critical_acquiring(lck);
1298 #endif /* USE_ITT_BUILD */
1299 #if OMPT_SUPPORT && OMPT_OPTIONAL
1300   OMPT_STORE_RETURN_ADDRESS(global_tid);
1301   void *codeptr_ra = NULL;
1302   if (ompt_enabled.enabled) {
1303     ti = __kmp_threads[global_tid]->th.ompt_thread_info;
1304     /* OMPT state update */
1305     prev_state = ti.state;
1306     ti.wait_id = (ompt_wait_id_t)(uintptr_t)lck;
1307     ti.state = ompt_state_wait_critical;
1308 
1309     /* OMPT event callback */
1310     codeptr_ra = OMPT_LOAD_RETURN_ADDRESS(global_tid);
1311     if (ompt_enabled.ompt_callback_mutex_acquire) {
1312       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
1313           ompt_mutex_critical, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
1314           (ompt_wait_id_t)(uintptr_t)lck, codeptr_ra);
1315     }
1316   }
1317 #endif
1318   // The value of 'crit' should be usable as a critical_id for the critical
1319   // section directive.
1320   __kmp_acquire_user_lock_with_checks(lck, global_tid);
1321 
1322 #if USE_ITT_BUILD
1323   __kmp_itt_critical_acquired(lck);
1324 #endif /* USE_ITT_BUILD */
1325 #if OMPT_SUPPORT && OMPT_OPTIONAL
1326   if (ompt_enabled.enabled) {
1327     /* OMPT state update */
1328     ti.state = prev_state;
1329     ti.wait_id = 0;
1330 
1331     /* OMPT event callback */
1332     if (ompt_enabled.ompt_callback_mutex_acquired) {
1333       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
1334           ompt_mutex_critical, (ompt_wait_id_t)(uintptr_t)lck, codeptr_ra);
1335     }
1336   }
1337 #endif
1338   KMP_POP_PARTITIONED_TIMER();
1339 
1340   KMP_PUSH_PARTITIONED_TIMER(OMP_critical);
1341   KA_TRACE(15, ("__kmpc_critical: done T#%d\n", global_tid));
1342 #endif // KMP_USE_DYNAMIC_LOCK
1343 }
1344 
1345 #if KMP_USE_DYNAMIC_LOCK
1346 
1347 // Converts the given hint to an internal lock implementation
1348 static __forceinline kmp_dyna_lockseq_t __kmp_map_hint_to_lock(uintptr_t hint) {
1349 #if KMP_USE_TSX
1350 #define KMP_TSX_LOCK(seq) lockseq_##seq
1351 #else
1352 #define KMP_TSX_LOCK(seq) __kmp_user_lock_seq
1353 #endif
1354 
1355 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
1356 #define KMP_CPUINFO_RTM (__kmp_cpuinfo.flags.rtm)
1357 #else
1358 #define KMP_CPUINFO_RTM 0
1359 #endif
1360 
1361   // Hints that do not require further logic
1362   if (hint & kmp_lock_hint_hle)
1363     return KMP_TSX_LOCK(hle);
1364   if (hint & kmp_lock_hint_rtm)
1365     return KMP_CPUINFO_RTM ? KMP_TSX_LOCK(rtm_queuing) : __kmp_user_lock_seq;
1366   if (hint & kmp_lock_hint_adaptive)
1367     return KMP_CPUINFO_RTM ? KMP_TSX_LOCK(adaptive) : __kmp_user_lock_seq;
1368 
1369   // Rule out conflicting hints first by returning the default lock
1370   if ((hint & omp_lock_hint_contended) && (hint & omp_lock_hint_uncontended))
1371     return __kmp_user_lock_seq;
1372   if ((hint & omp_lock_hint_speculative) &&
1373       (hint & omp_lock_hint_nonspeculative))
1374     return __kmp_user_lock_seq;
1375 
1376   // Do not even consider speculation when it appears to be contended
1377   if (hint & omp_lock_hint_contended)
1378     return lockseq_queuing;
1379 
1380   // Uncontended lock without speculation
1381   if ((hint & omp_lock_hint_uncontended) && !(hint & omp_lock_hint_speculative))
1382     return lockseq_tas;
1383 
1384   // Use RTM lock for speculation
1385   if (hint & omp_lock_hint_speculative)
1386     return KMP_CPUINFO_RTM ? KMP_TSX_LOCK(rtm_spin) : __kmp_user_lock_seq;
1387 
1388   return __kmp_user_lock_seq;
1389 }
1390 
1391 #if OMPT_SUPPORT && OMPT_OPTIONAL
1392 #if KMP_USE_DYNAMIC_LOCK
1393 static kmp_mutex_impl_t
1394 __ompt_get_mutex_impl_type(void *user_lock, kmp_indirect_lock_t *ilock = 0) {
1395   if (user_lock) {
1396     switch (KMP_EXTRACT_D_TAG(user_lock)) {
1397     case 0:
1398       break;
1399 #if KMP_USE_FUTEX
1400     case locktag_futex:
1401       return kmp_mutex_impl_queuing;
1402 #endif
1403     case locktag_tas:
1404       return kmp_mutex_impl_spin;
1405 #if KMP_USE_TSX
1406     case locktag_hle:
1407     case locktag_rtm_spin:
1408       return kmp_mutex_impl_speculative;
1409 #endif
1410     default:
1411       return kmp_mutex_impl_none;
1412     }
1413     ilock = KMP_LOOKUP_I_LOCK(user_lock);
1414   }
1415   KMP_ASSERT(ilock);
1416   switch (ilock->type) {
1417 #if KMP_USE_TSX
1418   case locktag_adaptive:
1419   case locktag_rtm_queuing:
1420     return kmp_mutex_impl_speculative;
1421 #endif
1422   case locktag_nested_tas:
1423     return kmp_mutex_impl_spin;
1424 #if KMP_USE_FUTEX
1425   case locktag_nested_futex:
1426 #endif
1427   case locktag_ticket:
1428   case locktag_queuing:
1429   case locktag_drdpa:
1430   case locktag_nested_ticket:
1431   case locktag_nested_queuing:
1432   case locktag_nested_drdpa:
1433     return kmp_mutex_impl_queuing;
1434   default:
1435     return kmp_mutex_impl_none;
1436   }
1437 }
1438 #else
1439 // For locks without dynamic binding
1440 static kmp_mutex_impl_t __ompt_get_mutex_impl_type() {
1441   switch (__kmp_user_lock_kind) {
1442   case lk_tas:
1443     return kmp_mutex_impl_spin;
1444 #if KMP_USE_FUTEX
1445   case lk_futex:
1446 #endif
1447   case lk_ticket:
1448   case lk_queuing:
1449   case lk_drdpa:
1450     return kmp_mutex_impl_queuing;
1451 #if KMP_USE_TSX
1452   case lk_hle:
1453   case lk_rtm_queuing:
1454   case lk_rtm_spin:
1455   case lk_adaptive:
1456     return kmp_mutex_impl_speculative;
1457 #endif
1458   default:
1459     return kmp_mutex_impl_none;
1460   }
1461 }
1462 #endif // KMP_USE_DYNAMIC_LOCK
1463 #endif // OMPT_SUPPORT && OMPT_OPTIONAL
1464 
1465 /*!
1466 @ingroup WORK_SHARING
1467 @param loc  source location information.
1468 @param global_tid  global thread number.
1469 @param crit identity of the critical section. This could be a pointer to a lock
1470 associated with the critical section, or some other suitably unique value.
1471 @param hint the lock hint.
1472 
1473 Enter code protected by a `critical` construct with a hint. The hint value is
1474 used to suggest a lock implementation. This function blocks until the executing
1475 thread can enter the critical section unless the hint suggests use of
1476 speculative execution and the hardware supports it.
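
A sketch of the calls a compiler might emit for a hinted critical construct
(illustrative only; the lowering is compiler-dependent and crit_name_lock is a
placeholder for the kmp_critical_name object):
@code
// #pragma omp critical (name) hint(omp_sync_hint_speculative)
__kmpc_critical_with_hint(&loc, gtid, &crit_name_lock,
                          omp_sync_hint_speculative);
// ... protected region ...
__kmpc_end_critical(&loc, gtid, &crit_name_lock);
@endcode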
1477 */
1478 void __kmpc_critical_with_hint(ident_t *loc, kmp_int32 global_tid,
1479                                kmp_critical_name *crit, uint32_t hint) {
1480   KMP_COUNT_BLOCK(OMP_CRITICAL);
1481   kmp_user_lock_p lck;
1482 #if OMPT_SUPPORT && OMPT_OPTIONAL
1483   ompt_state_t prev_state = ompt_state_undefined;
1484   ompt_thread_info_t ti;
1485   // This is the case, if called from __kmpc_critical:
1486   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(global_tid);
1487   if (!codeptr)
1488     codeptr = OMPT_GET_RETURN_ADDRESS(0);
1489 #endif
1490 
1491   KC_TRACE(10, ("__kmpc_critical: called T#%d\n", global_tid));
1492   __kmp_assert_valid_gtid(global_tid);
1493 
1494   kmp_dyna_lock_t *lk = (kmp_dyna_lock_t *)crit;
1495   // Check if it is initialized.
1496   KMP_PUSH_PARTITIONED_TIMER(OMP_critical_wait);
1497   kmp_dyna_lockseq_t lockseq = __kmp_map_hint_to_lock(hint);
1498   if (*lk == 0) {
1499     if (KMP_IS_D_LOCK(lockseq)) {
1500       KMP_COMPARE_AND_STORE_ACQ32((volatile kmp_int32 *)crit, 0,
1501                                   KMP_GET_D_TAG(lockseq));
1502     } else {
1503       __kmp_init_indirect_csptr(crit, loc, global_tid, KMP_GET_I_TAG(lockseq));
1504     }
1505   }
1506   // Branch for accessing the actual lock object and set operation. This
1507   // branching is inevitable since this lock initialization does not follow the
1508   // normal dispatch path (lock table is not used).
1509   if (KMP_EXTRACT_D_TAG(lk) != 0) {
1510     lck = (kmp_user_lock_p)lk;
1511     if (__kmp_env_consistency_check) {
1512       __kmp_push_sync(global_tid, ct_critical, loc, lck,
1513                       __kmp_map_hint_to_lock(hint));
1514     }
1515 #if USE_ITT_BUILD
1516     __kmp_itt_critical_acquiring(lck);
1517 #endif
1518 #if OMPT_SUPPORT && OMPT_OPTIONAL
1519     if (ompt_enabled.enabled) {
1520       ti = __kmp_threads[global_tid]->th.ompt_thread_info;
1521       /* OMPT state update */
1522       prev_state = ti.state;
1523       ti.wait_id = (ompt_wait_id_t)(uintptr_t)lck;
1524       ti.state = ompt_state_wait_critical;
1525 
1526       /* OMPT event callback */
1527       if (ompt_enabled.ompt_callback_mutex_acquire) {
1528         ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
1529             ompt_mutex_critical, (unsigned int)hint,
1530             __ompt_get_mutex_impl_type(crit), (ompt_wait_id_t)(uintptr_t)lck,
1531             codeptr);
1532       }
1533     }
1534 #endif
1535 #if KMP_USE_INLINED_TAS
1536     if (lockseq == lockseq_tas && !__kmp_env_consistency_check) {
1537       KMP_ACQUIRE_TAS_LOCK(lck, global_tid);
1538     } else
1539 #elif KMP_USE_INLINED_FUTEX
1540     if (lockseq == lockseq_futex && !__kmp_env_consistency_check) {
1541       KMP_ACQUIRE_FUTEX_LOCK(lck, global_tid);
1542     } else
1543 #endif
1544     {
1545       KMP_D_LOCK_FUNC(lk, set)(lk, global_tid);
1546     }
1547   } else {
1548     kmp_indirect_lock_t *ilk = *((kmp_indirect_lock_t **)lk);
1549     lck = ilk->lock;
1550     if (__kmp_env_consistency_check) {
1551       __kmp_push_sync(global_tid, ct_critical, loc, lck,
1552                       __kmp_map_hint_to_lock(hint));
1553     }
1554 #if USE_ITT_BUILD
1555     __kmp_itt_critical_acquiring(lck);
1556 #endif
1557 #if OMPT_SUPPORT && OMPT_OPTIONAL
1558     if (ompt_enabled.enabled) {
1559       ti = __kmp_threads[global_tid]->th.ompt_thread_info;
1560       /* OMPT state update */
1561       prev_state = ti.state;
1562       ti.wait_id = (ompt_wait_id_t)(uintptr_t)lck;
1563       ti.state = ompt_state_wait_critical;
1564 
1565       /* OMPT event callback */
1566       if (ompt_enabled.ompt_callback_mutex_acquire) {
1567         ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
1568             ompt_mutex_critical, (unsigned int)hint,
1569             __ompt_get_mutex_impl_type(0, ilk), (ompt_wait_id_t)(uintptr_t)lck,
1570             codeptr);
1571       }
1572     }
1573 #endif
1574     KMP_I_LOCK_FUNC(ilk, set)(lck, global_tid);
1575   }
1576   KMP_POP_PARTITIONED_TIMER();
1577 
1578 #if USE_ITT_BUILD
1579   __kmp_itt_critical_acquired(lck);
1580 #endif /* USE_ITT_BUILD */
1581 #if OMPT_SUPPORT && OMPT_OPTIONAL
1582   if (ompt_enabled.enabled) {
1583     /* OMPT state update */
1584     ti.state = prev_state;
1585     ti.wait_id = 0;
1586 
1587     /* OMPT event callback */
1588     if (ompt_enabled.ompt_callback_mutex_acquired) {
1589       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
1590           ompt_mutex_critical, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
1591     }
1592   }
1593 #endif
1594 
1595   KMP_PUSH_PARTITIONED_TIMER(OMP_critical);
1596   KA_TRACE(15, ("__kmpc_critical: done T#%d\n", global_tid));
1597 } // __kmpc_critical_with_hint
1598 
1599 #endif // KMP_USE_DYNAMIC_LOCK
1600 
1601 /*!
1602 @ingroup WORK_SHARING
1603 @param loc  source location information.
1604 @param global_tid  global thread number.
1605 @param crit identity of the critical section. This could be a pointer to a lock
1606 associated with the critical section, or some other suitably unique value.
1607 
1608 Leave a critical section, releasing any lock that was held during its execution.
1609 */
1610 void __kmpc_end_critical(ident_t *loc, kmp_int32 global_tid,
1611                          kmp_critical_name *crit) {
1612   kmp_user_lock_p lck;
1613 
1614   KC_TRACE(10, ("__kmpc_end_critical: called T#%d\n", global_tid));
1615 
1616 #if KMP_USE_DYNAMIC_LOCK
1617   int locktag = KMP_EXTRACT_D_TAG(crit);
1618   if (locktag) {
1619     lck = (kmp_user_lock_p)crit;
1620     KMP_ASSERT(lck != NULL);
1621     if (__kmp_env_consistency_check) {
1622       __kmp_pop_sync(global_tid, ct_critical, loc);
1623     }
1624 #if USE_ITT_BUILD
1625     __kmp_itt_critical_releasing(lck);
1626 #endif
1627 #if KMP_USE_INLINED_TAS
1628     if (locktag == locktag_tas && !__kmp_env_consistency_check) {
1629       KMP_RELEASE_TAS_LOCK(lck, global_tid);
1630     } else
1631 #elif KMP_USE_INLINED_FUTEX
1632     if (locktag == locktag_futex && !__kmp_env_consistency_check) {
1633       KMP_RELEASE_FUTEX_LOCK(lck, global_tid);
1634     } else
1635 #endif
1636     {
1637       KMP_D_LOCK_FUNC(lck, unset)((kmp_dyna_lock_t *)lck, global_tid);
1638     }
1639   } else {
1640     kmp_indirect_lock_t *ilk =
1641         (kmp_indirect_lock_t *)TCR_PTR(*((kmp_indirect_lock_t **)crit));
1642     KMP_ASSERT(ilk != NULL);
1643     lck = ilk->lock;
1644     if (__kmp_env_consistency_check) {
1645       __kmp_pop_sync(global_tid, ct_critical, loc);
1646     }
1647 #if USE_ITT_BUILD
1648     __kmp_itt_critical_releasing(lck);
1649 #endif
1650     KMP_I_LOCK_FUNC(ilk, unset)(lck, global_tid);
1651   }
1652 
1653 #else // KMP_USE_DYNAMIC_LOCK
1654 
1655   if ((__kmp_user_lock_kind == lk_tas) &&
1656       (sizeof(lck->tas.lk.poll) <= OMP_CRITICAL_SIZE)) {
1657     lck = (kmp_user_lock_p)crit;
1658   }
1659 #if KMP_USE_FUTEX
1660   else if ((__kmp_user_lock_kind == lk_futex) &&
1661            (sizeof(lck->futex.lk.poll) <= OMP_CRITICAL_SIZE)) {
1662     lck = (kmp_user_lock_p)crit;
1663   }
1664 #endif
1665   else { // ticket, queuing or drdpa
1666     lck = (kmp_user_lock_p)TCR_PTR(*((kmp_user_lock_p *)crit));
1667   }
1668 
1669   KMP_ASSERT(lck != NULL);
1670 
1671   if (__kmp_env_consistency_check)
1672     __kmp_pop_sync(global_tid, ct_critical, loc);
1673 
1674 #if USE_ITT_BUILD
1675   __kmp_itt_critical_releasing(lck);
1676 #endif /* USE_ITT_BUILD */
1677   // The value of 'crit' should be usable as the critical_id of the critical
1678   // section directive.
1679   __kmp_release_user_lock_with_checks(lck, global_tid);
1680 
1681 #endif // KMP_USE_DYNAMIC_LOCK
1682 
1683 #if OMPT_SUPPORT && OMPT_OPTIONAL
1684   /* OMPT release event triggers after lock is released; place here to trigger
1685    * for all #if branches */
1686   OMPT_STORE_RETURN_ADDRESS(global_tid);
1687   if (ompt_enabled.ompt_callback_mutex_released) {
1688     ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
1689         ompt_mutex_critical, (ompt_wait_id_t)(uintptr_t)lck,
1690         OMPT_LOAD_RETURN_ADDRESS(0));
1691   }
1692 #endif
1693 
1694   KMP_POP_PARTITIONED_TIMER();
1695   KA_TRACE(15, ("__kmpc_end_critical: done T#%d\n", global_tid));
1696 }
1697 
1698 /*!
1699 @ingroup SYNCHRONIZATION
1700 @param loc source location information
1701 @param global_tid thread id.
1702 @return one if the thread should execute the master block, zero otherwise
1703 
1704 Start execution of a combined barrier and master. The barrier is executed inside
1705 this function.
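
A typical usage pattern in generated code (an illustrative sketch, not a
prescribed lowering):
@code
if (__kmpc_barrier_master(&loc, gtid)) {
  // ... code executed only by the master thread ...
  __kmpc_end_barrier_master(&loc, gtid);
}
@endcode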
1706 */
1707 kmp_int32 __kmpc_barrier_master(ident_t *loc, kmp_int32 global_tid) {
1708   int status;
1709   KC_TRACE(10, ("__kmpc_barrier_master: called T#%d\n", global_tid));
1710   __kmp_assert_valid_gtid(global_tid);
1711 
1712   if (!TCR_4(__kmp_init_parallel))
1713     __kmp_parallel_initialize();
1714 
1715   __kmp_resume_if_soft_paused();
1716 
1717   if (__kmp_env_consistency_check)
1718     __kmp_check_barrier(global_tid, ct_barrier, loc);
1719 
1720 #if OMPT_SUPPORT
1721   ompt_frame_t *ompt_frame;
1722   if (ompt_enabled.enabled) {
1723     __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
1724     if (ompt_frame->enter_frame.ptr == NULL)
1725       ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1726   }
1727   OMPT_STORE_RETURN_ADDRESS(global_tid);
1728 #endif
1729 #if USE_ITT_NOTIFY
1730   __kmp_threads[global_tid]->th.th_ident = loc;
1731 #endif
1732   status = __kmp_barrier(bs_plain_barrier, global_tid, TRUE, 0, NULL, NULL);
1733 #if OMPT_SUPPORT && OMPT_OPTIONAL
1734   if (ompt_enabled.enabled) {
1735     ompt_frame->enter_frame = ompt_data_none;
1736   }
1737 #endif
1738 
1739   return (status != 0) ? 0 : 1;
1740 }
1741 
1742 /*!
1743 @ingroup SYNCHRONIZATION
1744 @param loc source location information
1745 @param global_tid thread id.
1746 
1747 Complete the execution of a combined barrier and master. This function should
1748 only be called at the completion of the <tt>master</tt> code. Other threads will
1749 still be waiting at the barrier and this call releases them.
1750 */
1751 void __kmpc_end_barrier_master(ident_t *loc, kmp_int32 global_tid) {
1752   KC_TRACE(10, ("__kmpc_end_barrier_master: called T#%d\n", global_tid));
1753   __kmp_assert_valid_gtid(global_tid);
1754   __kmp_end_split_barrier(bs_plain_barrier, global_tid);
1755 }
1756 
1757 /*!
1758 @ingroup SYNCHRONIZATION
1759 @param loc source location information
1760 @param global_tid thread id.
1761 @return one if the thread should execute the master block, zero otherwise
1762 
1763 Start execution of a combined barrier and master(nowait) construct.
1764 The barrier is executed inside this function.
1765 There is no equivalent "end" function, since the work of __kmpc_end_master()
is done inside this function.
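
An illustrative sketch of the expected usage; note that no closing call is
required:
@code
if (__kmpc_barrier_master_nowait(&loc, gtid)) {
  // ... code executed only by the master thread ...
}
@endcode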
1766 */
1767 kmp_int32 __kmpc_barrier_master_nowait(ident_t *loc, kmp_int32 global_tid) {
1768   kmp_int32 ret;
1769   KC_TRACE(10, ("__kmpc_barrier_master_nowait: called T#%d\n", global_tid));
1770   __kmp_assert_valid_gtid(global_tid);
1771 
1772   if (!TCR_4(__kmp_init_parallel))
1773     __kmp_parallel_initialize();
1774 
1775   __kmp_resume_if_soft_paused();
1776 
1777   if (__kmp_env_consistency_check) {
1778     if (loc == 0) {
1779       KMP_WARNING(ConstructIdentInvalid); // ??? What does it mean for the user?
1780     }
1781     __kmp_check_barrier(global_tid, ct_barrier, loc);
1782   }
1783 
1784 #if OMPT_SUPPORT
1785   ompt_frame_t *ompt_frame;
1786   if (ompt_enabled.enabled) {
1787     __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
1788     if (ompt_frame->enter_frame.ptr == NULL)
1789       ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1790   }
1791   OMPT_STORE_RETURN_ADDRESS(global_tid);
1792 #endif
1793 #if USE_ITT_NOTIFY
1794   __kmp_threads[global_tid]->th.th_ident = loc;
1795 #endif
1796   __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
1797 #if OMPT_SUPPORT && OMPT_OPTIONAL
1798   if (ompt_enabled.enabled) {
1799     ompt_frame->enter_frame = ompt_data_none;
1800   }
1801 #endif
1802 
1803   ret = __kmpc_master(loc, global_tid);
1804 
1805   if (__kmp_env_consistency_check) {
1806     /* There is no __kmpc_end_master() call, so the (stats) */
1807     /* actions of __kmpc_end_master are performed here.     */
1808     if (ret) {
1809       /* only one thread should do the pop since only */
1810       /* one did the push (see __kmpc_master())       */
1811       __kmp_pop_sync(global_tid, ct_master, loc);
1812     }
1813   }
1814 
1815   return (ret);
1816 }
1817 
1818 /* The BARRIER for a SINGLE construct is always explicit   */
1819 /*!
1820 @ingroup WORK_SHARING
1821 @param loc  source location information
1822 @param global_tid  global thread number
1823 @return One if this thread should execute the single construct, zero otherwise.
1824 
1825 Test whether to execute a <tt>single</tt> construct.
1826 There are no implicit barriers in the two "single" calls; rather, the compiler
1827 should introduce an explicit barrier if one is required.
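
A sketch of a possible lowering of a <tt>single</tt> construct without
<tt>nowait</tt> (illustrative only):
@code
if (__kmpc_single(&loc, gtid)) {
  // ... single block ...
  __kmpc_end_single(&loc, gtid);
}
__kmpc_barrier(&loc, gtid); // explicit barrier; omitted for "single nowait"
@endcode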
1828 */
1829 
1830 kmp_int32 __kmpc_single(ident_t *loc, kmp_int32 global_tid) {
1831   __kmp_assert_valid_gtid(global_tid);
1832   kmp_int32 rc = __kmp_enter_single(global_tid, loc, TRUE);
1833 
1834   if (rc) {
1835     // We are going to execute the single statement, so we should count it.
1836     KMP_COUNT_BLOCK(OMP_SINGLE);
1837     KMP_PUSH_PARTITIONED_TIMER(OMP_single);
1838   }
1839 
1840 #if OMPT_SUPPORT && OMPT_OPTIONAL
1841   kmp_info_t *this_thr = __kmp_threads[global_tid];
1842   kmp_team_t *team = this_thr->th.th_team;
1843   int tid = __kmp_tid_from_gtid(global_tid);
1844 
1845   if (ompt_enabled.enabled) {
1846     if (rc) {
1847       if (ompt_enabled.ompt_callback_work) {
1848         ompt_callbacks.ompt_callback(ompt_callback_work)(
1849             ompt_work_single_executor, ompt_scope_begin,
1850             &(team->t.ompt_team_info.parallel_data),
1851             &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
1852             1, OMPT_GET_RETURN_ADDRESS(0));
1853       }
1854     } else {
1855       if (ompt_enabled.ompt_callback_work) {
1856         ompt_callbacks.ompt_callback(ompt_callback_work)(
1857             ompt_work_single_other, ompt_scope_begin,
1858             &(team->t.ompt_team_info.parallel_data),
1859             &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
1860             1, OMPT_GET_RETURN_ADDRESS(0));
1861         ompt_callbacks.ompt_callback(ompt_callback_work)(
1862             ompt_work_single_other, ompt_scope_end,
1863             &(team->t.ompt_team_info.parallel_data),
1864             &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
1865             1, OMPT_GET_RETURN_ADDRESS(0));
1866       }
1867     }
1868   }
1869 #endif
1870 
1871   return rc;
1872 }
1873 
1874 /*!
1875 @ingroup WORK_SHARING
1876 @param loc  source location information
1877 @param global_tid  global thread number
1878 
1879 Mark the end of a <tt>single</tt> construct.  This function should
1880 only be called by the thread that executed the block of code protected
1881 by the `single` construct.
1882 */
1883 void __kmpc_end_single(ident_t *loc, kmp_int32 global_tid) {
1884   __kmp_assert_valid_gtid(global_tid);
1885   __kmp_exit_single(global_tid);
1886   KMP_POP_PARTITIONED_TIMER();
1887 
1888 #if OMPT_SUPPORT && OMPT_OPTIONAL
1889   kmp_info_t *this_thr = __kmp_threads[global_tid];
1890   kmp_team_t *team = this_thr->th.th_team;
1891   int tid = __kmp_tid_from_gtid(global_tid);
1892 
1893   if (ompt_enabled.ompt_callback_work) {
1894     ompt_callbacks.ompt_callback(ompt_callback_work)(
1895         ompt_work_single_executor, ompt_scope_end,
1896         &(team->t.ompt_team_info.parallel_data),
1897         &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data), 1,
1898         OMPT_GET_RETURN_ADDRESS(0));
1899   }
1900 #endif
1901 }
1902 
1903 /*!
1904 @ingroup WORK_SHARING
1905 @param loc Source location
1906 @param global_tid Global thread id
1907 
1908 Mark the end of a statically scheduled loop.
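
A sketch of where this call sits in the lowering of a statically scheduled
loop (illustrative only; the init call and its arguments are simplified):
@code
kmp_int32 lower = 0, upper = n - 1, stride = 1, last = 0;
__kmpc_for_static_init_4(&loc, gtid, kmp_sch_static, &last, &lower, &upper,
                         &stride, 1, 1);
for (kmp_int32 i = lower; i <= upper; ++i) {
  // ... loop body ...
}
__kmpc_for_static_fini(&loc, gtid);
@endcode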
1909 */
1910 void __kmpc_for_static_fini(ident_t *loc, kmp_int32 global_tid) {
1911   KMP_POP_PARTITIONED_TIMER();
1912   KE_TRACE(10, ("__kmpc_for_static_fini called T#%d\n", global_tid));
1913 
1914 #if OMPT_SUPPORT && OMPT_OPTIONAL
1915   if (ompt_enabled.ompt_callback_work) {
1916     ompt_work_t ompt_work_type = ompt_work_loop;
1917     ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
1918     ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
1919     // Determine workshare type
1920     if (loc != NULL) {
1921       if ((loc->flags & KMP_IDENT_WORK_LOOP) != 0) {
1922         ompt_work_type = ompt_work_loop;
1923       } else if ((loc->flags & KMP_IDENT_WORK_SECTIONS) != 0) {
1924         ompt_work_type = ompt_work_sections;
1925       } else if ((loc->flags & KMP_IDENT_WORK_DISTRIBUTE) != 0) {
1926         ompt_work_type = ompt_work_distribute;
1927       } else {
1928         // use default set above.
1929         // a warning about this case is provided in __kmpc_for_static_init
1930       }
1931       KMP_DEBUG_ASSERT(ompt_work_type);
1932     }
1933     ompt_callbacks.ompt_callback(ompt_callback_work)(
1934         ompt_work_type, ompt_scope_end, &(team_info->parallel_data),
1935         &(task_info->task_data), 0, OMPT_GET_RETURN_ADDRESS(0));
1936   }
1937 #endif
1938   if (__kmp_env_consistency_check)
1939     __kmp_pop_workshare(global_tid, ct_pdo, loc);
1940 }
1941 
1942 // User routines which take C-style arguments (call by value)
1943 // different from the Fortran equivalent routines
1944 
1945 void ompc_set_num_threads(int arg) {
1946   // !!!!! TODO: check the per-task binding
1947   __kmp_set_num_threads(arg, __kmp_entry_gtid());
1948 }
1949 
1950 void ompc_set_dynamic(int flag) {
1951   kmp_info_t *thread;
1952 
1953   /* For the thread-private implementation of the internal controls */
1954   thread = __kmp_entry_thread();
1955 
1956   __kmp_save_internal_controls(thread);
1957 
1958   set__dynamic(thread, flag ? true : false);
1959 }
1960 
1961 void ompc_set_nested(int flag) {
1962   kmp_info_t *thread;
1963 
1964   /* For the thread-private internal controls implementation */
1965   thread = __kmp_entry_thread();
1966 
1967   __kmp_save_internal_controls(thread);
1968 
1969   set__max_active_levels(thread, flag ? __kmp_dflt_max_active_levels : 1);
1970 }
1971 
1972 void ompc_set_max_active_levels(int max_active_levels) {
1973   /* TO DO */
1974   /* we want per-task implementation of this internal control */
1975 
1976   /* For the per-thread internal controls implementation */
1977   __kmp_set_max_active_levels(__kmp_entry_gtid(), max_active_levels);
1978 }
1979 
1980 void ompc_set_schedule(omp_sched_t kind, int modifier) {
1981   // !!!!! TODO: check the per-task binding
1982   __kmp_set_schedule(__kmp_entry_gtid(), (kmp_sched_t)kind, modifier);
1983 }
1984 
1985 int ompc_get_ancestor_thread_num(int level) {
1986   return __kmp_get_ancestor_thread_num(__kmp_entry_gtid(), level);
1987 }
1988 
1989 int ompc_get_team_size(int level) {
1990   return __kmp_get_team_size(__kmp_entry_gtid(), level);
1991 }
1992 
1993 /* OpenMP 5.0 Affinity Format API */
1994 void KMP_EXPAND_NAME(ompc_set_affinity_format)(char const *format) {
1995   if (!__kmp_init_serial) {
1996     __kmp_serial_initialize();
1997   }
1998   __kmp_strncpy_truncate(__kmp_affinity_format, KMP_AFFINITY_FORMAT_SIZE,
1999                          format, KMP_STRLEN(format) + 1);
2000 }
2001 
2002 size_t KMP_EXPAND_NAME(ompc_get_affinity_format)(char *buffer, size_t size) {
2003   size_t format_size;
2004   if (!__kmp_init_serial) {
2005     __kmp_serial_initialize();
2006   }
2007   format_size = KMP_STRLEN(__kmp_affinity_format);
2008   if (buffer && size) {
2009     __kmp_strncpy_truncate(buffer, size, __kmp_affinity_format,
2010                            format_size + 1);
2011   }
2012   return format_size;
2013 }
2014 
2015 void KMP_EXPAND_NAME(ompc_display_affinity)(char const *format) {
2016   int gtid;
2017   if (!TCR_4(__kmp_init_middle)) {
2018     __kmp_middle_initialize();
2019   }
2020   __kmp_assign_root_init_mask();
2021   gtid = __kmp_get_gtid();
2022 #if KMP_AFFINITY_SUPPORTED
2023   if (__kmp_threads[gtid]->th.th_team->t.t_level == 0 &&
2024       __kmp_affinity.flags.reset) {
2025     __kmp_reset_root_init_mask(gtid);
2026   }
2027 #endif
2028   __kmp_aux_display_affinity(gtid, format);
2029 }
2030 
2031 size_t KMP_EXPAND_NAME(ompc_capture_affinity)(char *buffer, size_t buf_size,
2032                                               char const *format) {
2033   int gtid;
2034   size_t num_required;
2035   kmp_str_buf_t capture_buf;
2036   if (!TCR_4(__kmp_init_middle)) {
2037     __kmp_middle_initialize();
2038   }
2039   __kmp_assign_root_init_mask();
2040   gtid = __kmp_get_gtid();
2041 #if KMP_AFFINITY_SUPPORTED
2042   if (__kmp_threads[gtid]->th.th_team->t.t_level == 0 &&
2043       __kmp_affinity.flags.reset) {
2044     __kmp_reset_root_init_mask(gtid);
2045   }
2046 #endif
2047   __kmp_str_buf_init(&capture_buf);
2048   num_required = __kmp_aux_capture_affinity(gtid, format, &capture_buf);
2049   if (buffer && buf_size) {
2050     __kmp_strncpy_truncate(buffer, buf_size, capture_buf.str,
2051                            capture_buf.used + 1);
2052   }
2053   __kmp_str_buf_free(&capture_buf);
2054   return num_required;
2055 }
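
/* Illustrative sketch only: the user-visible OpenMP 5.0 counterparts of the
   entry points above, as they might appear in application code:

     omp_set_affinity_format("host=%H tid=%n");
     omp_display_affinity(NULL); // NULL means: use the stored format
     char buf[256];
     size_t needed = omp_capture_affinity(buf, sizeof(buf), NULL);
*/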
2056 
2057 void kmpc_set_stacksize(int arg) {
2058   // __kmp_aux_set_stacksize initializes the library if needed
2059   __kmp_aux_set_stacksize(arg);
2060 }
2061 
2062 void kmpc_set_stacksize_s(size_t arg) {
2063   // __kmp_aux_set_stacksize initializes the library if needed
2064   __kmp_aux_set_stacksize(arg);
2065 }
2066 
2067 void kmpc_set_blocktime(int arg) {
2068   int gtid, tid;
2069   kmp_info_t *thread;
2070 
2071   gtid = __kmp_entry_gtid();
2072   tid = __kmp_tid_from_gtid(gtid);
2073   thread = __kmp_thread_from_gtid(gtid);
2074 
2075   __kmp_aux_set_blocktime(arg, thread, tid);
2076 }
2077 
2078 void kmpc_set_library(int arg) {
2079   // __kmp_user_set_library initializes the library if needed
2080   __kmp_user_set_library((enum library_type)arg);
2081 }
2082 
2083 void kmpc_set_defaults(char const *str) {
2084   // __kmp_aux_set_defaults initializes the library if needed
2085   __kmp_aux_set_defaults(str, KMP_STRLEN(str));
2086 }
2087 
2088 void kmpc_set_disp_num_buffers(int arg) {
2089   // ignore after initialization because some teams have already
2090   // allocated dispatch buffers
2091   if (__kmp_init_serial == FALSE && arg >= KMP_MIN_DISP_NUM_BUFF &&
2092       arg <= KMP_MAX_DISP_NUM_BUFF) {
2093     __kmp_dispatch_num_buffers = arg;
2094   }
2095 }
2096 
2097 int kmpc_set_affinity_mask_proc(int proc, void **mask) {
2098 #if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED
2099   return -1;
2100 #else
2101   if (!TCR_4(__kmp_init_middle)) {
2102     __kmp_middle_initialize();
2103   }
2104   __kmp_assign_root_init_mask();
2105   return __kmp_aux_set_affinity_mask_proc(proc, mask);
2106 #endif
2107 }
2108 
2109 int kmpc_unset_affinity_mask_proc(int proc, void **mask) {
2110 #if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED
2111   return -1;
2112 #else
2113   if (!TCR_4(__kmp_init_middle)) {
2114     __kmp_middle_initialize();
2115   }
2116   __kmp_assign_root_init_mask();
2117   return __kmp_aux_unset_affinity_mask_proc(proc, mask);
2118 #endif
2119 }
2120 
2121 int kmpc_get_affinity_mask_proc(int proc, void **mask) {
2122 #if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED
2123   return -1;
2124 #else
2125   if (!TCR_4(__kmp_init_middle)) {
2126     __kmp_middle_initialize();
2127   }
2128   __kmp_assign_root_init_mask();
2129   return __kmp_aux_get_affinity_mask_proc(proc, mask);
2130 #endif
2131 }
2132 
2133 /* -------------------------------------------------------------------------- */
2134 /*!
2135 @ingroup THREADPRIVATE
2136 @param loc       source location information
2137 @param gtid      global thread number
2138 @param cpy_size  size of the cpy_data buffer
2139 @param cpy_data  pointer to data to be copied
2140 @param cpy_func  helper function to call for copying data
2141 @param didit     flag variable: 1=single thread; 0=not single thread
2142 
2143 __kmpc_copyprivate implements the interface for the private data broadcast
2144 needed for the copyprivate clause associated with a single region in an
2145 OpenMP<sup>*</sup> program (both C and Fortran).
2146 All threads participating in the parallel region call this routine.
2147 One of the threads (called the single thread) should have the <tt>didit</tt>
2148 variable set to 1 and all other threads should have that variable set to 0.
2149 All threads pass a pointer to a data buffer (cpy_data) that they have built.
2150 
2151 The OpenMP specification forbids the use of nowait on the single region when a
2152 copyprivate clause is present. However, @ref __kmpc_copyprivate implements a
2153 barrier internally to avoid race conditions, so the code generation for the
2154 single region should avoid generating a barrier after the call to @ref
2155 __kmpc_copyprivate.
2156 
2157 The <tt>gtid</tt> parameter is the global thread id for the current thread.
2158 The <tt>loc</tt> parameter is a pointer to source location information.
2159 
2160 Internal implementation: The single thread will first copy its descriptor
2161 address (cpy_data) to a team-private location, then the other threads will each
2162 call the function pointed to by the parameter cpy_func, which performs the copy
2163 using its own cpy_data buffer as the destination and the saved buffer as source.
2164 
2165 The cpy_func routine used for the copy and the contents of the data area defined
2166 by cpy_data and cpy_size may be built in any fashion that will allow the copy
2167 to be done. For instance, the cpy_data buffer can hold the actual data to be
2168 copied or it may hold a list of pointers to the data. The cpy_func routine must
2169 interpret the cpy_data buffer appropriately.
2170 
2171 The interface to cpy_func is as follows:
2172 @code
2173 void cpy_func( void *destination, void *source )
2174 @endcode
2175 where void *destination is the cpy_data pointer for the thread being copied to
2176 and void *source is the cpy_data pointer for the thread being copied from.
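
For example, in the simple case where cpy_data points directly at a single
scalar, a copy routine might look like this (illustrative sketch only):
@code
static void cpy_func(void *destination, void *source) {
  // Copy the broadcast value from the single thread's buffer into this
  // thread's buffer.
  *(int *)destination = *(int *)source;
}
@endcode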
2177 */
2178 void __kmpc_copyprivate(ident_t *loc, kmp_int32 gtid, size_t cpy_size,
2179                         void *cpy_data, void (*cpy_func)(void *, void *),
2180                         kmp_int32 didit) {
2181   void **data_ptr;
2182   KC_TRACE(10, ("__kmpc_copyprivate: called T#%d\n", gtid));
2183   __kmp_assert_valid_gtid(gtid);
2184 
2185   KMP_MB();
2186 
2187   data_ptr = &__kmp_team_from_gtid(gtid)->t.t_copypriv_data;
2188 
2189   if (__kmp_env_consistency_check) {
2190     if (loc == 0) {
2191       KMP_WARNING(ConstructIdentInvalid);
2192     }
2193   }
2194 
2195   // ToDo: Optimize the following two barriers into some kind of split barrier
2196 
2197   if (didit)
2198     *data_ptr = cpy_data;
2199 
2200 #if OMPT_SUPPORT
2201   ompt_frame_t *ompt_frame;
2202   if (ompt_enabled.enabled) {
2203     __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
2204     if (ompt_frame->enter_frame.ptr == NULL)
2205       ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
2206   }
2207   OMPT_STORE_RETURN_ADDRESS(gtid);
2208 #endif
2209 /* This barrier is not a barrier region boundary */
2210 #if USE_ITT_NOTIFY
2211   __kmp_threads[gtid]->th.th_ident = loc;
2212 #endif
2213   __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);
2214 
2215   if (!didit)
2216     (*cpy_func)(cpy_data, *data_ptr);
2217 
2218   // Consider next barrier a user-visible barrier for barrier region boundaries
2219   // Nesting checks are already handled by the single construct checks
2220   {
2221 #if OMPT_SUPPORT
2222     OMPT_STORE_RETURN_ADDRESS(gtid);
2223 #endif
2224 #if USE_ITT_NOTIFY
2225     __kmp_threads[gtid]->th.th_ident = loc; // TODO: check if it is needed
2226     // (e.g. tasks can overwrite the location)
2227 #endif
2228     __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);
2229 #if OMPT_SUPPORT && OMPT_OPTIONAL
2230     if (ompt_enabled.enabled) {
2231       ompt_frame->enter_frame = ompt_data_none;
2232     }
2233 #endif
2234   }
2235 }
2236 
2237 /* --------------------------------------------------------------------------*/
2238 /*!
2239 @ingroup THREADPRIVATE
2240 @param loc       source location information
2241 @param gtid      global thread number
2242 @param cpy_data  pointer to the data to be saved/copied or 0
2243 @return          the saved pointer to the data
2244 
2245 __kmpc_copyprivate_light is a lighter-weight version of __kmpc_copyprivate: it
2246 only saves the pointer it is given (when that pointer is non-zero, i.e. when it
2247 comes from the single thread) and returns that pointer from every call (the
2248 single thread itself does not need it). This version performs no actual data
2249 copying; the copying must be done elsewhere, e.g. inline in the generated code.
2250 For the same reason this function does not end with a barrier, unlike
2251 __kmpc_copyprivate, so the generated code must place a barrier after all the
2252 data has been copied.
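
An illustrative sketch of the sequence the generated code is expected to
follow (copy_my_data_from and my_data are placeholders):
@code
void *src = __kmpc_copyprivate_light(&loc, gtid, didit ? my_data : NULL);
if (!didit)
  copy_my_data_from(src); // inline copy emitted by the compiler
__kmpc_barrier(&loc, gtid); // required: the light version has no final barrier
@endcode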
2253 */
2254 void *__kmpc_copyprivate_light(ident_t *loc, kmp_int32 gtid, void *cpy_data) {
2255   void **data_ptr;
2256 
2257   KC_TRACE(10, ("__kmpc_copyprivate_light: called T#%d\n", gtid));
2258 
2259   KMP_MB();
2260 
2261   data_ptr = &__kmp_team_from_gtid(gtid)->t.t_copypriv_data;
2262 
2263   if (__kmp_env_consistency_check) {
2264     if (loc == 0) {
2265       KMP_WARNING(ConstructIdentInvalid);
2266     }
2267   }
2268 
2269   // ToDo: Optimize the following barrier
2270 
2271   if (cpy_data)
2272     *data_ptr = cpy_data;
2273 
2274 #if OMPT_SUPPORT
2275   ompt_frame_t *ompt_frame;
2276   if (ompt_enabled.enabled) {
2277     __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
2278     if (ompt_frame->enter_frame.ptr == NULL)
2279       ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
2280     OMPT_STORE_RETURN_ADDRESS(gtid);
2281   }
2282 #endif
2283 /* This barrier is not a barrier region boundary */
2284 #if USE_ITT_NOTIFY
2285   __kmp_threads[gtid]->th.th_ident = loc;
2286 #endif
2287   __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);
2288 
2289   return *data_ptr;
2290 }
2291 
2292 /* -------------------------------------------------------------------------- */
2293 
2294 #define INIT_LOCK __kmp_init_user_lock_with_checks
2295 #define INIT_NESTED_LOCK __kmp_init_nested_user_lock_with_checks
2296 #define ACQUIRE_LOCK __kmp_acquire_user_lock_with_checks
2297 #define ACQUIRE_LOCK_TIMED __kmp_acquire_user_lock_with_checks_timed
2298 #define ACQUIRE_NESTED_LOCK __kmp_acquire_nested_user_lock_with_checks
2299 #define ACQUIRE_NESTED_LOCK_TIMED                                              \
2300   __kmp_acquire_nested_user_lock_with_checks_timed
2301 #define RELEASE_LOCK __kmp_release_user_lock_with_checks
2302 #define RELEASE_NESTED_LOCK __kmp_release_nested_user_lock_with_checks
2303 #define TEST_LOCK __kmp_test_user_lock_with_checks
2304 #define TEST_NESTED_LOCK __kmp_test_nested_user_lock_with_checks
2305 #define DESTROY_LOCK __kmp_destroy_user_lock_with_checks
2306 #define DESTROY_NESTED_LOCK __kmp_destroy_nested_user_lock_with_checks
2307 
2308 // TODO: Make check abort messages use location info & pass it into
2309 // with_checks routines
2310 
2311 #if KMP_USE_DYNAMIC_LOCK
2312 
2313 // internal lock initializer
2314 static __forceinline void __kmp_init_lock_with_hint(ident_t *loc, void **lock,
2315                                                     kmp_dyna_lockseq_t seq) {
2316   if (KMP_IS_D_LOCK(seq)) {
2317     KMP_INIT_D_LOCK(lock, seq);
2318 #if USE_ITT_BUILD
2319     __kmp_itt_lock_creating((kmp_user_lock_p)lock, NULL);
2320 #endif
2321   } else {
2322     KMP_INIT_I_LOCK(lock, seq);
2323 #if USE_ITT_BUILD
2324     kmp_indirect_lock_t *ilk = KMP_LOOKUP_I_LOCK(lock);
2325     __kmp_itt_lock_creating(ilk->lock, loc);
2326 #endif
2327   }
2328 }
2329 
2330 // internal nest lock initializer
2331 static __forceinline void
2332 __kmp_init_nest_lock_with_hint(ident_t *loc, void **lock,
2333                                kmp_dyna_lockseq_t seq) {
2334 #if KMP_USE_TSX
2335   // There is no nested lock implementation for the speculative locks
2336   if (seq == lockseq_hle || seq == lockseq_rtm_queuing ||
2337       seq == lockseq_rtm_spin || seq == lockseq_adaptive)
2338     seq = __kmp_user_lock_seq;
2339 #endif
2340   switch (seq) {
2341   case lockseq_tas:
2342     seq = lockseq_nested_tas;
2343     break;
2344 #if KMP_USE_FUTEX
2345   case lockseq_futex:
2346     seq = lockseq_nested_futex;
2347     break;
2348 #endif
2349   case lockseq_ticket:
2350     seq = lockseq_nested_ticket;
2351     break;
2352   case lockseq_queuing:
2353     seq = lockseq_nested_queuing;
2354     break;
2355   case lockseq_drdpa:
2356     seq = lockseq_nested_drdpa;
2357     break;
2358   default:
2359     seq = lockseq_nested_queuing;
2360   }
2361   KMP_INIT_I_LOCK(lock, seq);
2362 #if USE_ITT_BUILD
2363   kmp_indirect_lock_t *ilk = KMP_LOOKUP_I_LOCK(lock);
2364   __kmp_itt_lock_creating(ilk->lock, loc);
2365 #endif
2366 }
2367 
2368 /* initialize the lock with a hint */
2369 void __kmpc_init_lock_with_hint(ident_t *loc, kmp_int32 gtid, void **user_lock,
2370                                 uintptr_t hint) {
2371   KMP_DEBUG_ASSERT(__kmp_init_serial);
2372   if (__kmp_env_consistency_check && user_lock == NULL) {
2373     KMP_FATAL(LockIsUninitialized, "omp_init_lock_with_hint");
2374   }
2375 
2376   __kmp_init_lock_with_hint(loc, user_lock, __kmp_map_hint_to_lock(hint));
2377 
2378 #if OMPT_SUPPORT && OMPT_OPTIONAL
2379   // This is the case, if called from omp_init_lock_with_hint:
2380   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2381   if (!codeptr)
2382     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2383   if (ompt_enabled.ompt_callback_lock_init) {
2384     ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2385         ompt_mutex_lock, (omp_lock_hint_t)hint,
2386         __ompt_get_mutex_impl_type(user_lock),
2387         (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2388   }
2389 #endif
2390 }
2391 
2392 /* initialize the lock with a hint */
2393 void __kmpc_init_nest_lock_with_hint(ident_t *loc, kmp_int32 gtid,
2394                                      void **user_lock, uintptr_t hint) {
2395   KMP_DEBUG_ASSERT(__kmp_init_serial);
2396   if (__kmp_env_consistency_check && user_lock == NULL) {
2397     KMP_FATAL(LockIsUninitialized, "omp_init_nest_lock_with_hint");
2398   }
2399 
2400   __kmp_init_nest_lock_with_hint(loc, user_lock, __kmp_map_hint_to_lock(hint));
2401 
2402 #if OMPT_SUPPORT && OMPT_OPTIONAL
2403   // This is the case, if called from omp_init_lock_with_hint:
2404   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2405   if (!codeptr)
2406     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2407   if (ompt_enabled.ompt_callback_lock_init) {
2408     ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2409         ompt_mutex_nest_lock, (omp_lock_hint_t)hint,
2410         __ompt_get_mutex_impl_type(user_lock),
2411         (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2412   }
2413 #endif
2414 }
2415 
2416 #endif // KMP_USE_DYNAMIC_LOCK
2417 
2418 /* initialize the lock */
2419 void __kmpc_init_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2420 #if KMP_USE_DYNAMIC_LOCK
2421 
2422   KMP_DEBUG_ASSERT(__kmp_init_serial);
2423   if (__kmp_env_consistency_check && user_lock == NULL) {
2424     KMP_FATAL(LockIsUninitialized, "omp_init_lock");
2425   }
2426   __kmp_init_lock_with_hint(loc, user_lock, __kmp_user_lock_seq);
2427 
2428 #if OMPT_SUPPORT && OMPT_OPTIONAL
2429   // This is the case, if called from omp_init_lock_with_hint:
2430   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2431   if (!codeptr)
2432     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2433   if (ompt_enabled.ompt_callback_lock_init) {
2434     ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2435         ompt_mutex_lock, omp_lock_hint_none,
2436         __ompt_get_mutex_impl_type(user_lock),
2437         (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2438   }
2439 #endif
2440 
2441 #else // KMP_USE_DYNAMIC_LOCK
2442 
2443   static char const *const func = "omp_init_lock";
2444   kmp_user_lock_p lck;
2445   KMP_DEBUG_ASSERT(__kmp_init_serial);
2446 
2447   if (__kmp_env_consistency_check) {
2448     if (user_lock == NULL) {
2449       KMP_FATAL(LockIsUninitialized, func);
2450     }
2451   }
2452 
2453   KMP_CHECK_USER_LOCK_INIT();
2454 
2455   if ((__kmp_user_lock_kind == lk_tas) &&
2456       (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
2457     lck = (kmp_user_lock_p)user_lock;
2458   }
2459 #if KMP_USE_FUTEX
2460   else if ((__kmp_user_lock_kind == lk_futex) &&
2461            (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
2462     lck = (kmp_user_lock_p)user_lock;
2463   }
2464 #endif
2465   else {
2466     lck = __kmp_user_lock_allocate(user_lock, gtid, 0);
2467   }
2468   INIT_LOCK(lck);
2469   __kmp_set_user_lock_location(lck, loc);
2470 
2471 #if OMPT_SUPPORT && OMPT_OPTIONAL
2472   // This is the case, if called from omp_init_lock_with_hint:
2473   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2474   if (!codeptr)
2475     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2476   if (ompt_enabled.ompt_callback_lock_init) {
2477     ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2478         ompt_mutex_lock, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
2479         (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2480   }
2481 #endif
2482 
2483 #if USE_ITT_BUILD
2484   __kmp_itt_lock_creating(lck);
2485 #endif /* USE_ITT_BUILD */
2486 
2487 #endif // KMP_USE_DYNAMIC_LOCK
2488 } // __kmpc_init_lock
2489 
2490 /* initialize the lock */
2491 void __kmpc_init_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2492 #if KMP_USE_DYNAMIC_LOCK
2493 
2494   KMP_DEBUG_ASSERT(__kmp_init_serial);
2495   if (__kmp_env_consistency_check && user_lock == NULL) {
2496     KMP_FATAL(LockIsUninitialized, "omp_init_nest_lock");
2497   }
2498   __kmp_init_nest_lock_with_hint(loc, user_lock, __kmp_user_lock_seq);
2499 
2500 #if OMPT_SUPPORT && OMPT_OPTIONAL
2501   // This is the case, if called from omp_init_lock_with_hint:
2502   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2503   if (!codeptr)
2504     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2505   if (ompt_enabled.ompt_callback_lock_init) {
2506     ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2507         ompt_mutex_nest_lock, omp_lock_hint_none,
2508         __ompt_get_mutex_impl_type(user_lock),
2509         (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2510   }
2511 #endif
2512 
2513 #else // KMP_USE_DYNAMIC_LOCK
2514 
2515   static char const *const func = "omp_init_nest_lock";
2516   kmp_user_lock_p lck;
2517   KMP_DEBUG_ASSERT(__kmp_init_serial);
2518 
2519   if (__kmp_env_consistency_check) {
2520     if (user_lock == NULL) {
2521       KMP_FATAL(LockIsUninitialized, func);
2522     }
2523   }
2524 
2525   KMP_CHECK_USER_LOCK_INIT();
2526 
2527   if ((__kmp_user_lock_kind == lk_tas) &&
2528       (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
2529        OMP_NEST_LOCK_T_SIZE)) {
2530     lck = (kmp_user_lock_p)user_lock;
2531   }
2532 #if KMP_USE_FUTEX
2533   else if ((__kmp_user_lock_kind == lk_futex) &&
2534            (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
2535             OMP_NEST_LOCK_T_SIZE)) {
2536     lck = (kmp_user_lock_p)user_lock;
2537   }
2538 #endif
2539   else {
2540     lck = __kmp_user_lock_allocate(user_lock, gtid, 0);
2541   }
2542 
2543   INIT_NESTED_LOCK(lck);
2544   __kmp_set_user_lock_location(lck, loc);
2545 
2546 #if OMPT_SUPPORT && OMPT_OPTIONAL
2547   // This is the case, if called from omp_init_lock_with_hint:
2548   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2549   if (!codeptr)
2550     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2551   if (ompt_enabled.ompt_callback_lock_init) {
2552     ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2553         ompt_mutex_nest_lock, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
2554         (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2555   }
2556 #endif
2557 
2558 #if USE_ITT_BUILD
2559   __kmp_itt_lock_creating(lck);
2560 #endif /* USE_ITT_BUILD */
2561 
2562 #endif // KMP_USE_DYNAMIC_LOCK
2563 } // __kmpc_init_nest_lock
2564 
2565 void __kmpc_destroy_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2566 #if KMP_USE_DYNAMIC_LOCK
2567 
2568 #if USE_ITT_BUILD
2569   kmp_user_lock_p lck;
2570   if (KMP_EXTRACT_D_TAG(user_lock) == 0) {
2571     lck = ((kmp_indirect_lock_t *)KMP_LOOKUP_I_LOCK(user_lock))->lock;
2572   } else {
2573     lck = (kmp_user_lock_p)user_lock;
2574   }
2575   __kmp_itt_lock_destroyed(lck);
2576 #endif
2577 #if OMPT_SUPPORT && OMPT_OPTIONAL
2578   // This is the case, if called from omp_init_lock_with_hint:
2579   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2580   if (!codeptr)
2581     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2582   if (ompt_enabled.ompt_callback_lock_destroy) {
2583     ompt_callbacks.ompt_callback(ompt_callback_lock_destroy)(
2584         ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2585   }
2586 #endif
2587   KMP_D_LOCK_FUNC(user_lock, destroy)((kmp_dyna_lock_t *)user_lock);
2588 #else
2589   kmp_user_lock_p lck;
2590 
2591   if ((__kmp_user_lock_kind == lk_tas) &&
2592       (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
2593     lck = (kmp_user_lock_p)user_lock;
2594   }
2595 #if KMP_USE_FUTEX
2596   else if ((__kmp_user_lock_kind == lk_futex) &&
2597            (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
2598     lck = (kmp_user_lock_p)user_lock;
2599   }
2600 #endif
2601   else {
2602     lck = __kmp_lookup_user_lock(user_lock, "omp_destroy_lock");
2603   }
2604 
2605 #if OMPT_SUPPORT && OMPT_OPTIONAL
2606   // This is the case, if called from omp_init_lock_with_hint:
2607   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2608   if (!codeptr)
2609     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2610   if (ompt_enabled.ompt_callback_lock_destroy) {
2611     ompt_callbacks.ompt_callback(ompt_callback_lock_destroy)(
2612         ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2613   }
2614 #endif
2615 
2616 #if USE_ITT_BUILD
2617   __kmp_itt_lock_destroyed(lck);
2618 #endif /* USE_ITT_BUILD */
2619   DESTROY_LOCK(lck);
2620 
2621   if ((__kmp_user_lock_kind == lk_tas) &&
2622       (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
2623     ;
2624   }
2625 #if KMP_USE_FUTEX
2626   else if ((__kmp_user_lock_kind == lk_futex) &&
2627            (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
2628     ;
2629   }
2630 #endif
2631   else {
2632     __kmp_user_lock_free(user_lock, gtid, lck);
2633   }
2634 #endif // KMP_USE_DYNAMIC_LOCK
2635 } // __kmpc_destroy_lock
2636 
2637 /* destroy the lock */
2638 void __kmpc_destroy_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2639 #if KMP_USE_DYNAMIC_LOCK
2640 
2641 #if USE_ITT_BUILD
2642   kmp_indirect_lock_t *ilk = KMP_LOOKUP_I_LOCK(user_lock);
2643   __kmp_itt_lock_destroyed(ilk->lock);
2644 #endif
2645 #if OMPT_SUPPORT && OMPT_OPTIONAL
2646   // This is the case, if called from omp_init_lock_with_hint:
2647   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2648   if (!codeptr)
2649     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2650   if (ompt_enabled.ompt_callback_lock_destroy) {
2651     ompt_callbacks.ompt_callback(ompt_callback_lock_destroy)(
2652         ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2653   }
2654 #endif
2655   KMP_D_LOCK_FUNC(user_lock, destroy)((kmp_dyna_lock_t *)user_lock);
2656 
2657 #else // KMP_USE_DYNAMIC_LOCK
2658 
2659   kmp_user_lock_p lck;
2660 
2661   if ((__kmp_user_lock_kind == lk_tas) &&
2662       (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
2663        OMP_NEST_LOCK_T_SIZE)) {
2664     lck = (kmp_user_lock_p)user_lock;
2665   }
2666 #if KMP_USE_FUTEX
2667   else if ((__kmp_user_lock_kind == lk_futex) &&
2668            (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
2669             OMP_NEST_LOCK_T_SIZE)) {
2670     lck = (kmp_user_lock_p)user_lock;
2671   }
2672 #endif
2673   else {
2674     lck = __kmp_lookup_user_lock(user_lock, "omp_destroy_nest_lock");
2675   }
2676 
2677 #if OMPT_SUPPORT && OMPT_OPTIONAL
2678   // This is the case, if called from omp_init_lock_with_hint:
2679   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2680   if (!codeptr)
2681     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2682   if (ompt_enabled.ompt_callback_lock_destroy) {
2683     ompt_callbacks.ompt_callback(ompt_callback_lock_destroy)(
2684         ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2685   }
2686 #endif
2687 
2688 #if USE_ITT_BUILD
2689   __kmp_itt_lock_destroyed(lck);
2690 #endif /* USE_ITT_BUILD */
2691 
2692   DESTROY_NESTED_LOCK(lck);
2693 
2694   if ((__kmp_user_lock_kind == lk_tas) &&
2695       (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
2696        OMP_NEST_LOCK_T_SIZE)) {
2697     ;
2698   }
2699 #if KMP_USE_FUTEX
2700   else if ((__kmp_user_lock_kind == lk_futex) &&
2701            (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
2702             OMP_NEST_LOCK_T_SIZE)) {
2703     ;
2704   }
2705 #endif
2706   else {
2707     __kmp_user_lock_free(user_lock, gtid, lck);
2708   }
2709 #endif // KMP_USE_DYNAMIC_LOCK
2710 } // __kmpc_destroy_nest_lock
2711 
2712 void __kmpc_set_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2713   KMP_COUNT_BLOCK(OMP_set_lock);
2714 #if KMP_USE_DYNAMIC_LOCK
2715   int tag = KMP_EXTRACT_D_TAG(user_lock);
2716 #if USE_ITT_BUILD
2717   // The itt function will get to the right lock object.
2718   __kmp_itt_lock_acquiring(
2719       (kmp_user_lock_p)user_lock);
2720 #endif
2721 #if OMPT_SUPPORT && OMPT_OPTIONAL
2722   // This is the case, if called from omp_init_lock_with_hint:
2723   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2724   if (!codeptr)
2725     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2726   if (ompt_enabled.ompt_callback_mutex_acquire) {
2727     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
2728         ompt_mutex_lock, omp_lock_hint_none,
2729         __ompt_get_mutex_impl_type(user_lock),
2730         (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2731   }
2732 #endif
2733 #if KMP_USE_INLINED_TAS
2734   if (tag == locktag_tas && !__kmp_env_consistency_check) {
2735     KMP_ACQUIRE_TAS_LOCK(user_lock, gtid);
2736   } else
2737 #elif KMP_USE_INLINED_FUTEX
2738   if (tag == locktag_futex && !__kmp_env_consistency_check) {
2739     KMP_ACQUIRE_FUTEX_LOCK(user_lock, gtid);
2740   } else
2741 #endif
2742   {
2743     __kmp_direct_set[tag]((kmp_dyna_lock_t *)user_lock, gtid);
2744   }
2745 #if USE_ITT_BUILD
2746   __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock);
2747 #endif
2748 #if OMPT_SUPPORT && OMPT_OPTIONAL
2749   if (ompt_enabled.ompt_callback_mutex_acquired) {
2750     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
2751         ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2752   }
2753 #endif
2754 
2755 #else // KMP_USE_DYNAMIC_LOCK
2756 
2757   kmp_user_lock_p lck;
2758 
2759   if ((__kmp_user_lock_kind == lk_tas) &&
2760       (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
2761     lck = (kmp_user_lock_p)user_lock;
2762   }
2763 #if KMP_USE_FUTEX
2764   else if ((__kmp_user_lock_kind == lk_futex) &&
2765            (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
2766     lck = (kmp_user_lock_p)user_lock;
2767   }
2768 #endif
2769   else {
2770     lck = __kmp_lookup_user_lock(user_lock, "omp_set_lock");
2771   }
2772 
2773 #if USE_ITT_BUILD
2774   __kmp_itt_lock_acquiring(lck);
2775 #endif /* USE_ITT_BUILD */
2776 #if OMPT_SUPPORT && OMPT_OPTIONAL
2777   // This is the case, if called from omp_init_lock_with_hint:
2778   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2779   if (!codeptr)
2780     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2781   if (ompt_enabled.ompt_callback_mutex_acquire) {
2782     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
2783         ompt_mutex_lock, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
2784         (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2785   }
2786 #endif
2787 
2788   ACQUIRE_LOCK(lck, gtid);
2789 
2790 #if USE_ITT_BUILD
2791   __kmp_itt_lock_acquired(lck);
2792 #endif /* USE_ITT_BUILD */
2793 
2794 #if OMPT_SUPPORT && OMPT_OPTIONAL
2795   if (ompt_enabled.ompt_callback_mutex_acquired) {
2796     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
2797         ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2798   }
2799 #endif
2800 
2801 #endif // KMP_USE_DYNAMIC_LOCK
2802 }
2803 
2804 void __kmpc_set_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2805 #if KMP_USE_DYNAMIC_LOCK
2806 
2807 #if USE_ITT_BUILD
2808   __kmp_itt_lock_acquiring((kmp_user_lock_p)user_lock);
2809 #endif
2810 #if OMPT_SUPPORT && OMPT_OPTIONAL
2811   // This is the case, if called from omp_init_lock_with_hint:
2812   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2813   if (!codeptr)
2814     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2815   if (ompt_enabled.enabled) {
2816     if (ompt_enabled.ompt_callback_mutex_acquire) {
2817       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
2818           ompt_mutex_nest_lock, omp_lock_hint_none,
2819           __ompt_get_mutex_impl_type(user_lock),
2820           (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2821     }
2822   }
2823 #endif
2824   int acquire_status =
2825       KMP_D_LOCK_FUNC(user_lock, set)((kmp_dyna_lock_t *)user_lock, gtid);
2826   (void)acquire_status;
2827 #if USE_ITT_BUILD
2828   __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock);
2829 #endif
2830 
2831 #if OMPT_SUPPORT && OMPT_OPTIONAL
2832   if (ompt_enabled.enabled) {
2833     if (acquire_status == KMP_LOCK_ACQUIRED_FIRST) {
2834       if (ompt_enabled.ompt_callback_mutex_acquired) {
2835         // lock_first
2836         ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
2837             ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)user_lock,
2838             codeptr);
2839       }
2840     } else {
2841       if (ompt_enabled.ompt_callback_nest_lock) {
2842         // lock_next
2843         ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
2844             ompt_scope_begin, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2845       }
2846     }
2847   }
2848 #endif
2849 
2850 #else // KMP_USE_DYNAMIC_LOCK
2851   int acquire_status;
2852   kmp_user_lock_p lck;
2853 
2854   if ((__kmp_user_lock_kind == lk_tas) &&
2855       (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
2856        OMP_NEST_LOCK_T_SIZE)) {
2857     lck = (kmp_user_lock_p)user_lock;
2858   }
2859 #if KMP_USE_FUTEX
2860   else if ((__kmp_user_lock_kind == lk_futex) &&
2861            (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
2862             OMP_NEST_LOCK_T_SIZE)) {
2863     lck = (kmp_user_lock_p)user_lock;
2864   }
2865 #endif
2866   else {
2867     lck = __kmp_lookup_user_lock(user_lock, "omp_set_nest_lock");
2868   }
2869 
2870 #if USE_ITT_BUILD
2871   __kmp_itt_lock_acquiring(lck);
2872 #endif /* USE_ITT_BUILD */
2873 #if OMPT_SUPPORT && OMPT_OPTIONAL
2874   // This is the case, if called from omp_init_lock_with_hint:
2875   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2876   if (!codeptr)
2877     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2878   if (ompt_enabled.enabled) {
2879     if (ompt_enabled.ompt_callback_mutex_acquire) {
2880       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
2881           ompt_mutex_nest_lock, omp_lock_hint_none,
2882           __ompt_get_mutex_impl_type(), (ompt_wait_id_t)(uintptr_t)lck,
2883           codeptr);
2884     }
2885   }
2886 #endif
2887 
2888   ACQUIRE_NESTED_LOCK(lck, gtid, &acquire_status);
2889 
2890 #if USE_ITT_BUILD
2891   __kmp_itt_lock_acquired(lck);
2892 #endif /* USE_ITT_BUILD */
2893 
2894 #if OMPT_SUPPORT && OMPT_OPTIONAL
2895   if (ompt_enabled.enabled) {
2896     if (acquire_status == KMP_LOCK_ACQUIRED_FIRST) {
2897       if (ompt_enabled.ompt_callback_mutex_acquired) {
2898         // lock_first
2899         ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
2900             ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2901       }
2902     } else {
2903       if (ompt_enabled.ompt_callback_nest_lock) {
2904         // lock_next
2905         ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
2906             ompt_scope_begin, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2907       }
2908     }
2909   }
2910 #endif
2911 
2912 #endif // KMP_USE_DYNAMIC_LOCK
2913 }
2914 
2915 void __kmpc_unset_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2916 #if KMP_USE_DYNAMIC_LOCK
2917 
2918   int tag = KMP_EXTRACT_D_TAG(user_lock);
2919 #if USE_ITT_BUILD
2920   __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock);
2921 #endif
2922 #if KMP_USE_INLINED_TAS
2923   if (tag == locktag_tas && !__kmp_env_consistency_check) {
2924     KMP_RELEASE_TAS_LOCK(user_lock, gtid);
2925   } else
2926 #elif KMP_USE_INLINED_FUTEX
2927   if (tag == locktag_futex && !__kmp_env_consistency_check) {
2928     KMP_RELEASE_FUTEX_LOCK(user_lock, gtid);
2929   } else
2930 #endif
2931   {
2932     __kmp_direct_unset[tag]((kmp_dyna_lock_t *)user_lock, gtid);
2933   }
2934 
2935 #if OMPT_SUPPORT && OMPT_OPTIONAL
2936   // This is the case, if called from omp_init_lock_with_hint:
2937   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2938   if (!codeptr)
2939     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2940   if (ompt_enabled.ompt_callback_mutex_released) {
2941     ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
2942         ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2943   }
2944 #endif
2945 
2946 #else // KMP_USE_DYNAMIC_LOCK
2947 
2948   kmp_user_lock_p lck;
2949 
2950   /* Can't use serial interval since not block structured */
2951   /* release the lock */
2952 
2953   if ((__kmp_user_lock_kind == lk_tas) &&
2954       (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
2955 #if KMP_OS_LINUX &&                                                            \
2956     (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
2957 // "fast" path implemented to fix customer performance issue
2958 #if USE_ITT_BUILD
2959     __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock);
2960 #endif /* USE_ITT_BUILD */
2961     TCW_4(((kmp_user_lock_p)user_lock)->tas.lk.poll, 0);
2962     KMP_MB();
2963 
2964 #if OMPT_SUPPORT && OMPT_OPTIONAL
2965     // This is the case, if called from omp_init_lock_with_hint:
2966     void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2967     if (!codeptr)
2968       codeptr = OMPT_GET_RETURN_ADDRESS(0);
2969     if (ompt_enabled.ompt_callback_mutex_released) {
2970       ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
2971           ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2972     }
2973 #endif
2974 
2975     return;
2976 #else
2977     lck = (kmp_user_lock_p)user_lock;
2978 #endif
2979   }
2980 #if KMP_USE_FUTEX
2981   else if ((__kmp_user_lock_kind == lk_futex) &&
2982            (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
2983     lck = (kmp_user_lock_p)user_lock;
2984   }
2985 #endif
2986   else {
2987     lck = __kmp_lookup_user_lock(user_lock, "omp_unset_lock");
2988   }
2989 
2990 #if USE_ITT_BUILD
2991   __kmp_itt_lock_releasing(lck);
2992 #endif /* USE_ITT_BUILD */
2993 
2994   RELEASE_LOCK(lck, gtid);
2995 
2996 #if OMPT_SUPPORT && OMPT_OPTIONAL
2997   // This is the case, if called from omp_init_lock_with_hint:
2998   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2999   if (!codeptr)
3000     codeptr = OMPT_GET_RETURN_ADDRESS(0);
3001   if (ompt_enabled.ompt_callback_mutex_released) {
3002     ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
3003         ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
3004   }
3005 #endif
3006 
3007 #endif // KMP_USE_DYNAMIC_LOCK
3008 }
3009 
3010 /* release the lock */
3011 void __kmpc_unset_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
3012 #if KMP_USE_DYNAMIC_LOCK
3013 
3014 #if USE_ITT_BUILD
3015   __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock);
3016 #endif
3017   int release_status =
3018       KMP_D_LOCK_FUNC(user_lock, unset)((kmp_dyna_lock_t *)user_lock, gtid);
3019   (void)release_status;
3020 
3021 #if OMPT_SUPPORT && OMPT_OPTIONAL
3022   // This is the case, if called from omp_init_lock_with_hint:
3023   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
3024   if (!codeptr)
3025     codeptr = OMPT_GET_RETURN_ADDRESS(0);
3026   if (ompt_enabled.enabled) {
3027     if (release_status == KMP_LOCK_RELEASED) {
3028       if (ompt_enabled.ompt_callback_mutex_released) {
3029         // release_lock_last
3030         ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
3031             ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)user_lock,
3032             codeptr);
3033       }
3034     } else if (ompt_enabled.ompt_callback_nest_lock) {
3035       // release_lock_prev
3036       ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
3037           ompt_scope_end, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
3038     }
3039   }
3040 #endif
3041 
3042 #else // KMP_USE_DYNAMIC_LOCK
3043 
3044   kmp_user_lock_p lck;
3045 
3046   /* Can't use serial interval since not block structured */
3047 
3048   if ((__kmp_user_lock_kind == lk_tas) &&
3049       (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
3050        OMP_NEST_LOCK_T_SIZE)) {
3051 #if KMP_OS_LINUX &&                                                            \
3052     (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
3053     // "fast" path implemented to fix customer performance issue
3054     kmp_tas_lock_t *tl = (kmp_tas_lock_t *)user_lock;
3055 #if USE_ITT_BUILD
3056     __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock);
3057 #endif /* USE_ITT_BUILD */
3058 
3059 #if OMPT_SUPPORT && OMPT_OPTIONAL
3060     int release_status = KMP_LOCK_STILL_HELD;
3061 #endif
3062 
3063     if (--(tl->lk.depth_locked) == 0) {
3064       TCW_4(tl->lk.poll, 0);
3065 #if OMPT_SUPPORT && OMPT_OPTIONAL
3066       release_status = KMP_LOCK_RELEASED;
3067 #endif
3068     }
3069     KMP_MB();
3070 
3071 #if OMPT_SUPPORT && OMPT_OPTIONAL
3072     // This is the case, if called from omp_init_lock_with_hint:
3073     void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
3074     if (!codeptr)
3075       codeptr = OMPT_GET_RETURN_ADDRESS(0);
3076     if (ompt_enabled.enabled) {
3077       if (release_status == KMP_LOCK_RELEASED) {
3078         if (ompt_enabled.ompt_callback_mutex_released) {
3079           // release_lock_last
3080           ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
3081               ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
3082         }
3083       } else if (ompt_enabled.ompt_callback_nest_lock) {
3084         // release_lock_previous
3085         ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
3086             ompt_scope_end, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
3087       }
3088     }
3089 #endif
3090 
3091     return;
3092 #else
3093     lck = (kmp_user_lock_p)user_lock;
3094 #endif
3095   }
3096 #if KMP_USE_FUTEX
3097   else if ((__kmp_user_lock_kind == lk_futex) &&
3098            (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
3099             OMP_NEST_LOCK_T_SIZE)) {
3100     lck = (kmp_user_lock_p)user_lock;
3101   }
3102 #endif
3103   else {
3104     lck = __kmp_lookup_user_lock(user_lock, "omp_unset_nest_lock");
3105   }
3106 
3107 #if USE_ITT_BUILD
3108   __kmp_itt_lock_releasing(lck);
3109 #endif /* USE_ITT_BUILD */
3110 
3111   int release_status;
3112   release_status = RELEASE_NESTED_LOCK(lck, gtid);
3113 #if OMPT_SUPPORT && OMPT_OPTIONAL
3114   // This is the case, if called from omp_init_lock_with_hint:
3115   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
3116   if (!codeptr)
3117     codeptr = OMPT_GET_RETURN_ADDRESS(0);
3118   if (ompt_enabled.enabled) {
3119     if (release_status == KMP_LOCK_RELEASED) {
3120       if (ompt_enabled.ompt_callback_mutex_released) {
3121         // release_lock_last
3122         ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
3123             ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
3124       }
3125     } else if (ompt_enabled.ompt_callback_nest_lock) {
3126       // release_lock_previous
3127       ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
3128           ompt_scope_end, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
3129     }
3130   }
3131 #endif
3132 
3133 #endif // KMP_USE_DYNAMIC_LOCK
3134 }
3135 
3136 /* try to acquire the lock */
3137 int __kmpc_test_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
3138   KMP_COUNT_BLOCK(OMP_test_lock);
3139 
3140 #if KMP_USE_DYNAMIC_LOCK
3141   int rc;
3142   int tag = KMP_EXTRACT_D_TAG(user_lock);
3143 #if USE_ITT_BUILD
3144   __kmp_itt_lock_acquiring((kmp_user_lock_p)user_lock);
3145 #endif
3146 #if OMPT_SUPPORT && OMPT_OPTIONAL
3147   // This is the case, if called from omp_init_lock_with_hint:
3148   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
3149   if (!codeptr)
3150     codeptr = OMPT_GET_RETURN_ADDRESS(0);
3151   if (ompt_enabled.ompt_callback_mutex_acquire) {
3152     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
3153         ompt_mutex_lock, omp_lock_hint_none,
3154         __ompt_get_mutex_impl_type(user_lock),
3155         (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
3156   }
3157 #endif
3158 #if KMP_USE_INLINED_TAS
3159   if (tag == locktag_tas && !__kmp_env_consistency_check) {
3160     KMP_TEST_TAS_LOCK(user_lock, gtid, rc);
3161   } else
3162 #elif KMP_USE_INLINED_FUTEX
3163   if (tag == locktag_futex && !__kmp_env_consistency_check) {
3164     KMP_TEST_FUTEX_LOCK(user_lock, gtid, rc);
3165   } else
3166 #endif
3167   {
3168     rc = __kmp_direct_test[tag]((kmp_dyna_lock_t *)user_lock, gtid);
3169   }
3170   if (rc) {
3171 #if USE_ITT_BUILD
3172     __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock);
3173 #endif
3174 #if OMPT_SUPPORT && OMPT_OPTIONAL
3175     if (ompt_enabled.ompt_callback_mutex_acquired) {
3176       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
3177           ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
3178     }
3179 #endif
3180     return FTN_TRUE;
3181   } else {
3182 #if USE_ITT_BUILD
3183     __kmp_itt_lock_cancelled((kmp_user_lock_p)user_lock);
3184 #endif
3185     return FTN_FALSE;
3186   }
3187 
3188 #else // KMP_USE_DYNAMIC_LOCK
3189 
3190   kmp_user_lock_p lck;
3191   int rc;
3192 
3193   if ((__kmp_user_lock_kind == lk_tas) &&
3194       (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
3195     lck = (kmp_user_lock_p)user_lock;
3196   }
3197 #if KMP_USE_FUTEX
3198   else if ((__kmp_user_lock_kind == lk_futex) &&
3199            (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
3200     lck = (kmp_user_lock_p)user_lock;
3201   }
3202 #endif
3203   else {
3204     lck = __kmp_lookup_user_lock(user_lock, "omp_test_lock");
3205   }
3206 
3207 #if USE_ITT_BUILD
3208   __kmp_itt_lock_acquiring(lck);
3209 #endif /* USE_ITT_BUILD */
3210 #if OMPT_SUPPORT && OMPT_OPTIONAL
3211   // This is the case, if called from omp_init_lock_with_hint:
3212   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
3213   if (!codeptr)
3214     codeptr = OMPT_GET_RETURN_ADDRESS(0);
3215   if (ompt_enabled.ompt_callback_mutex_acquire) {
3216     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
3217         ompt_mutex_lock, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
3218         (ompt_wait_id_t)(uintptr_t)lck, codeptr);
3219   }
3220 #endif
3221 
3222   rc = TEST_LOCK(lck, gtid);
3223 #if USE_ITT_BUILD
3224   if (rc) {
3225     __kmp_itt_lock_acquired(lck);
3226   } else {
3227     __kmp_itt_lock_cancelled(lck);
3228   }
3229 #endif /* USE_ITT_BUILD */
3230 #if OMPT_SUPPORT && OMPT_OPTIONAL
3231   if (rc && ompt_enabled.ompt_callback_mutex_acquired) {
3232     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
3233         ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
3234   }
3235 #endif
3236 
3237   return (rc ? FTN_TRUE : FTN_FALSE);
3238 
3239   /* Can't use serial interval since not block structured */
3240 
3241 #endif // KMP_USE_DYNAMIC_LOCK
3242 }
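
/* Illustrative usage sketch (not part of the runtime): a front end targeting
   these entry points could emit a test-and-enter sequence like the one below.
   The "loc", "gtid" and "lk" objects are hypothetical placeholders for what a
   compiler would normally materialize.

     void *lk = NULL;
     __kmpc_init_lock(&loc, gtid, &lk);
     if (__kmpc_test_lock(&loc, gtid, &lk)) { // FTN_TRUE: lock acquired
       // ... critical work ...
       __kmpc_unset_lock(&loc, gtid, &lk);    // release, see above
     }
     __kmpc_destroy_lock(&loc, gtid, &lk);
*/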
3243 
3244 /* try to acquire the lock */
3245 int __kmpc_test_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
3246 #if KMP_USE_DYNAMIC_LOCK
3247   int rc;
3248 #if USE_ITT_BUILD
3249   __kmp_itt_lock_acquiring((kmp_user_lock_p)user_lock);
3250 #endif
3251 #if OMPT_SUPPORT && OMPT_OPTIONAL
3252   // This is the case, if called from omp_init_lock_with_hint:
3253   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
3254   if (!codeptr)
3255     codeptr = OMPT_GET_RETURN_ADDRESS(0);
3256   if (ompt_enabled.ompt_callback_mutex_acquire) {
3257     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
3258         ompt_mutex_nest_lock, omp_lock_hint_none,
3259         __ompt_get_mutex_impl_type(user_lock),
3260         (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
3261   }
3262 #endif
3263   rc = KMP_D_LOCK_FUNC(user_lock, test)((kmp_dyna_lock_t *)user_lock, gtid);
3264 #if USE_ITT_BUILD
3265   if (rc) {
3266     __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock);
3267   } else {
3268     __kmp_itt_lock_cancelled((kmp_user_lock_p)user_lock);
3269   }
3270 #endif
3271 #if OMPT_SUPPORT && OMPT_OPTIONAL
3272   if (ompt_enabled.enabled && rc) {
3273     if (rc == 1) {
3274       if (ompt_enabled.ompt_callback_mutex_acquired) {
3275         // lock_first
3276         ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
3277             ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)user_lock,
3278             codeptr);
3279       }
3280     } else {
3281       if (ompt_enabled.ompt_callback_nest_lock) {
3282         // lock_next
3283         ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
3284             ompt_scope_begin, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
3285       }
3286     }
3287   }
3288 #endif
3289   return rc;
3290 
3291 #else // KMP_USE_DYNAMIC_LOCK
3292 
3293   kmp_user_lock_p lck;
3294   int rc;
3295 
3296   if ((__kmp_user_lock_kind == lk_tas) &&
3297       (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
3298        OMP_NEST_LOCK_T_SIZE)) {
3299     lck = (kmp_user_lock_p)user_lock;
3300   }
3301 #if KMP_USE_FUTEX
3302   else if ((__kmp_user_lock_kind == lk_futex) &&
3303            (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
3304             OMP_NEST_LOCK_T_SIZE)) {
3305     lck = (kmp_user_lock_p)user_lock;
3306   }
3307 #endif
3308   else {
3309     lck = __kmp_lookup_user_lock(user_lock, "omp_test_nest_lock");
3310   }
3311 
3312 #if USE_ITT_BUILD
3313   __kmp_itt_lock_acquiring(lck);
3314 #endif /* USE_ITT_BUILD */
3315 
3316 #if OMPT_SUPPORT && OMPT_OPTIONAL
3317   // This is the case, if called from omp_init_lock_with_hint:
3318   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
3319   if (!codeptr)
3320     codeptr = OMPT_GET_RETURN_ADDRESS(0);
3321   if (ompt_enabled.enabled &&
3322       ompt_enabled.ompt_callback_mutex_acquire) {
3323     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
3324         ompt_mutex_nest_lock, omp_lock_hint_none,
3325         __ompt_get_mutex_impl_type(), (ompt_wait_id_t)(uintptr_t)lck,
3326         codeptr);
3327   }
3328 #endif
3329 
3330   rc = TEST_NESTED_LOCK(lck, gtid);
3331 #if USE_ITT_BUILD
3332   if (rc) {
3333     __kmp_itt_lock_acquired(lck);
3334   } else {
3335     __kmp_itt_lock_cancelled(lck);
3336   }
3337 #endif /* USE_ITT_BUILD */
3338 #if OMPT_SUPPORT && OMPT_OPTIONAL
3339   if (ompt_enabled.enabled && rc) {
3340     if (rc == 1) {
3341       if (ompt_enabled.ompt_callback_mutex_acquired) {
3342         // lock_first
3343         ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
3344             ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
3345       }
3346     } else {
3347       if (ompt_enabled.ompt_callback_nest_lock) {
3348         // lock_next
3349         ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
3350             ompt_scope_begin, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
3351       }
3352     }
3353   }
3354 #endif
3355   return rc;
3356 
3357   /* Can't use serial interval since not block structured */
3358 
3359 #endif // KMP_USE_DYNAMIC_LOCK
3360 }
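
/* Return-value sketch (informational, not normative): a nestable lock test is
   expected to return the new nesting count on success and 0 on failure, e.g.

     int d1 = __kmpc_test_nest_lock(&loc, gtid, &lk); // 1: first acquisition
     int d2 = __kmpc_test_nest_lock(&loc, gtid, &lk); // 2: re-acquired by owner

   which is why the OMPT code above reports mutex_acquired for rc == 1 and a
   nest_lock scope_begin for deeper counts. "loc", "gtid" and "lk" are
   hypothetical placeholders. */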
3361 
3362 // Interface to fast scalable reduce methods routines
3363 
3364 // keep the selected method in a thread local structure for cross-function
3365 // usage: will be used in __kmpc_end_reduce* functions;
3366 // another solution: to re-determine the method one more time in
3367 // __kmpc_end_reduce* functions (new prototype required then)
3368 // AT: which solution is better?
3369 #define __KMP_SET_REDUCTION_METHOD(gtid, rmethod)                              \
3370   ((__kmp_threads[(gtid)]->th.th_local.packed_reduction_method) = (rmethod))
3371 
3372 #define __KMP_GET_REDUCTION_METHOD(gtid)                                       \
3373   (__kmp_threads[(gtid)]->th.th_local.packed_reduction_method)
3374 
3375 // description of the packed_reduction_method variable: look at the macros in
3376 // kmp.h
3377 
3378 // used in a critical section reduce block
3379 static __forceinline void
3380 __kmp_enter_critical_section_reduce_block(ident_t *loc, kmp_int32 global_tid,
3381                                           kmp_critical_name *crit) {
3382 
3383   // this lock was visible to a customer and to the threading profile tool as a
3384   // serial overhead span (although it's used for an internal purpose only)
3385   //            why was it visible in previous implementation?
3386   //            should we keep it visible in new reduce block?
3387   kmp_user_lock_p lck;
3388 
3389 #if KMP_USE_DYNAMIC_LOCK
3390 
3391   kmp_dyna_lock_t *lk = (kmp_dyna_lock_t *)crit;
3392   // Check if it is initialized.
3393   if (*lk == 0) {
3394     if (KMP_IS_D_LOCK(__kmp_user_lock_seq)) {
3395       KMP_COMPARE_AND_STORE_ACQ32((volatile kmp_int32 *)crit, 0,
3396                                   KMP_GET_D_TAG(__kmp_user_lock_seq));
3397     } else {
3398       __kmp_init_indirect_csptr(crit, loc, global_tid,
3399                                 KMP_GET_I_TAG(__kmp_user_lock_seq));
3400     }
3401   }
3402   // Branch for accessing the actual lock object and set operation. This
3403   // branching is inevitable since this lock initialization does not follow the
3404   // normal dispatch path (lock table is not used).
3405   if (KMP_EXTRACT_D_TAG(lk) != 0) {
3406     lck = (kmp_user_lock_p)lk;
3407     KMP_DEBUG_ASSERT(lck != NULL);
3408     if (__kmp_env_consistency_check) {
3409       __kmp_push_sync(global_tid, ct_critical, loc, lck, __kmp_user_lock_seq);
3410     }
3411     KMP_D_LOCK_FUNC(lk, set)(lk, global_tid);
3412   } else {
3413     kmp_indirect_lock_t *ilk = *((kmp_indirect_lock_t **)lk);
3414     lck = ilk->lock;
3415     KMP_DEBUG_ASSERT(lck != NULL);
3416     if (__kmp_env_consistency_check) {
3417       __kmp_push_sync(global_tid, ct_critical, loc, lck, __kmp_user_lock_seq);
3418     }
3419     KMP_I_LOCK_FUNC(ilk, set)(lck, global_tid);
3420   }
3421 
3422 #else // KMP_USE_DYNAMIC_LOCK
3423 
3424   // We know that the fast reduction code is only emitted by Intel compilers
3425   // with 32 byte critical sections. If there isn't enough space, then we
3426   // have to use a pointer.
3427   if (__kmp_base_user_lock_size <= INTEL_CRITICAL_SIZE) {
3428     lck = (kmp_user_lock_p)crit;
3429   } else {
3430     lck = __kmp_get_critical_section_ptr(crit, loc, global_tid);
3431   }
3432   KMP_DEBUG_ASSERT(lck != NULL);
3433 
3434   if (__kmp_env_consistency_check)
3435     __kmp_push_sync(global_tid, ct_critical, loc, lck);
3436 
3437   __kmp_acquire_user_lock_with_checks(lck, global_tid);
3438 
3439 #endif // KMP_USE_DYNAMIC_LOCK
3440 }
3441 
3442 // used in a critical section reduce block
3443 static __forceinline void
3444 __kmp_end_critical_section_reduce_block(ident_t *loc, kmp_int32 global_tid,
3445                                         kmp_critical_name *crit) {
3446 
3447   kmp_user_lock_p lck;
3448 
3449 #if KMP_USE_DYNAMIC_LOCK
3450 
3451   if (KMP_IS_D_LOCK(__kmp_user_lock_seq)) {
3452     lck = (kmp_user_lock_p)crit;
3453     if (__kmp_env_consistency_check)
3454       __kmp_pop_sync(global_tid, ct_critical, loc);
3455     KMP_D_LOCK_FUNC(lck, unset)((kmp_dyna_lock_t *)lck, global_tid);
3456   } else {
3457     kmp_indirect_lock_t *ilk =
3458         (kmp_indirect_lock_t *)TCR_PTR(*((kmp_indirect_lock_t **)crit));
3459     if (__kmp_env_consistency_check)
3460       __kmp_pop_sync(global_tid, ct_critical, loc);
3461     KMP_I_LOCK_FUNC(ilk, unset)(ilk->lock, global_tid);
3462   }
3463 
3464 #else // KMP_USE_DYNAMIC_LOCK
3465 
3466   // We know that the fast reduction code is only emitted by Intel compilers
3467   // with 32 byte critical sections. If there isn't enough space, then we have
3468   // to use a pointer.
3469   if (__kmp_base_user_lock_size > 32) {
3470     lck = *((kmp_user_lock_p *)crit);
3471     KMP_ASSERT(lck != NULL);
3472   } else {
3473     lck = (kmp_user_lock_p)crit;
3474   }
3475 
3476   if (__kmp_env_consistency_check)
3477     __kmp_pop_sync(global_tid, ct_critical, loc);
3478 
3479   __kmp_release_user_lock_with_checks(lck, global_tid);
3480 
3481 #endif // KMP_USE_DYNAMIC_LOCK
3482 } // __kmp_end_critical_section_reduce_block
3483 
3484 static __forceinline int
3485 __kmp_swap_teams_for_teams_reduction(kmp_info_t *th, kmp_team_t **team_p,
3486                                      int *task_state) {
3487   kmp_team_t *team;
3488 
3489   // Check if we are inside a teams construct.
3490   if (th->th.th_teams_microtask) {
3491     *team_p = team = th->th.th_team;
3492     if (team->t.t_level == th->th.th_teams_level) {
3493       // This is reduction at teams construct.
3494       KMP_DEBUG_ASSERT(!th->th.th_info.ds.ds_tid); // AC: check that tid == 0
3495       // Let's swap teams temporarily for the reduction.
3496       th->th.th_info.ds.ds_tid = team->t.t_master_tid;
3497       th->th.th_team = team->t.t_parent;
3498       th->th.th_team_nproc = th->th.th_team->t.t_nproc;
3499       th->th.th_task_team = th->th.th_team->t.t_task_team[0];
3500       *task_state = th->th.th_task_state;
3501       th->th.th_task_state = 0;
3502 
3503       return 1;
3504     }
3505   }
3506   return 0;
3507 }
3508 
3509 static __forceinline void
3510 __kmp_restore_swapped_teams(kmp_info_t *th, kmp_team_t *team, int task_state) {
3511   // Restore thread structure swapped in __kmp_swap_teams_for_teams_reduction.
3512   th->th.th_info.ds.ds_tid = 0;
3513   th->th.th_team = team;
3514   th->th.th_team_nproc = team->t.t_nproc;
3515   th->th.th_task_team = team->t.t_task_team[task_state];
3516   __kmp_type_convert(task_state, &(th->th.th_task_state));
3517 }
3518 
3519 /* 2.a.i. Reduce Block without a terminating barrier */
3520 /*!
3521 @ingroup SYNCHRONIZATION
3522 @param loc source location information
3523 @param global_tid global thread number
3524 @param num_vars number of items (variables) to be reduced
3525 @param reduce_size size of data in bytes to be reduced
3526 @param reduce_data pointer to data to be reduced
3527 @param reduce_func callback function providing reduction operation on two
3528 operands and returning result of reduction in lhs_data
3529 @param lck pointer to the unique lock data structure
3530 @result 1 for the primary thread, 0 for all other team threads, 2 for all team
3531 threads if atomic reduction needed
3532 
3533 The nowait version is used for a reduce clause with the nowait argument.
3534 */
3535 kmp_int32
3536 __kmpc_reduce_nowait(ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars,
3537                      size_t reduce_size, void *reduce_data,
3538                      void (*reduce_func)(void *lhs_data, void *rhs_data),
3539                      kmp_critical_name *lck) {
3540 
3541   KMP_COUNT_BLOCK(REDUCE_nowait);
3542   int retval = 0;
3543   PACKED_REDUCTION_METHOD_T packed_reduction_method;
3544   kmp_info_t *th;
3545   kmp_team_t *team;
3546   int teams_swapped = 0, task_state;
3547   KA_TRACE(10, ("__kmpc_reduce_nowait() enter: called T#%d\n", global_tid));
3548   __kmp_assert_valid_gtid(global_tid);
3549 
3550   // why do we need this initialization here at all?
3551   // Reduction clause can not be used as a stand-alone directive.
3552 
3553   // do not call __kmp_serial_initialize(), it will be called by
3554   // __kmp_parallel_initialize() if needed
3555   // possible detection of false-positive race by the threadchecker ???
3556   if (!TCR_4(__kmp_init_parallel))
3557     __kmp_parallel_initialize();
3558 
3559   __kmp_resume_if_soft_paused();
3560 
3561 // check correctness of reduce block nesting
3562 #if KMP_USE_DYNAMIC_LOCK
3563   if (__kmp_env_consistency_check)
3564     __kmp_push_sync(global_tid, ct_reduce, loc, NULL, 0);
3565 #else
3566   if (__kmp_env_consistency_check)
3567     __kmp_push_sync(global_tid, ct_reduce, loc, NULL);
3568 #endif
3569 
3570   th = __kmp_thread_from_gtid(global_tid);
3571   teams_swapped = __kmp_swap_teams_for_teams_reduction(th, &team, &task_state);
3572 
3573   // packed_reduction_method value will be reused by __kmpc_end_reduce* functions,
3574   // the value should be kept in a variable
3575   // the variable should be either a construct-specific or thread-specific
3576   // property, not a team specific property
3577   //     (a thread can reach the next reduce block on the next construct, reduce
3578   //     method may differ on the next construct)
3579   // an ident_t "loc" parameter could be used as a construct-specific property
3580   // (what if loc == 0?)
3581   //     (if both construct-specific and team-specific variables were shared,
3582   //     then unnecessary extra syncs would be needed)
3583   // a thread-specific variable is better regarding two issues above (next
3584   // construct and extra syncs)
3585   // a thread-specific "th_local.reduction_method" variable is used currently
3586   // each thread executes 'determine' and 'set' lines (no need to execute by one
3587   // thread, to avoid unnecessary extra syncs)
3588 
3589   packed_reduction_method = __kmp_determine_reduction_method(
3590       loc, global_tid, num_vars, reduce_size, reduce_data, reduce_func, lck);
3591   __KMP_SET_REDUCTION_METHOD(global_tid, packed_reduction_method);
3592 
3593   OMPT_REDUCTION_DECL(th, global_tid);
3594   if (packed_reduction_method == critical_reduce_block) {
3595 
3596     OMPT_REDUCTION_BEGIN;
3597 
3598     __kmp_enter_critical_section_reduce_block(loc, global_tid, lck);
3599     retval = 1;
3600 
3601   } else if (packed_reduction_method == empty_reduce_block) {
3602 
3603     OMPT_REDUCTION_BEGIN;
3604 
3605     // usage: if team size == 1, no synchronization is required ( Intel
3606     // platforms only )
3607     retval = 1;
3608 
3609   } else if (packed_reduction_method == atomic_reduce_block) {
3610 
3611     retval = 2;
3612 
3613     // all threads should do this pop here (because __kmpc_end_reduce_nowait()
3614     // won't be called by the code gen)
3615     //     (this is not ideal, because the checking block has been closed by
3616     //     this 'pop',
3617     //      but the atomic operation has not been executed yet; it executes
3618     //      slightly later, literally on the next instruction)
3619     if (__kmp_env_consistency_check)
3620       __kmp_pop_sync(global_tid, ct_reduce, loc);
3621 
3622   } else if (TEST_REDUCTION_METHOD(packed_reduction_method,
3623                                    tree_reduce_block)) {
3624 
3625 // AT: performance issue: a real barrier here
3626 // AT: (if primary thread is slow, other threads are blocked here waiting for
3627 //      the primary thread to come and release them)
3628 // AT: (it's not what a customer might expect specifying NOWAIT clause)
3629 // AT: (specifying NOWAIT won't result in improvement of performance, it'll
3630 //      be confusing to a customer)
3631 // AT: another implementation of *barrier_gather*nowait() (or some other design)
3632 // might go faster and be more in line with the sense of NOWAIT
3633 // AT: TO DO: do epcc test and compare times
3634 
3635 // this barrier should be invisible to a customer and to the threading profile
3636 // tool (it's neither a terminating barrier nor customer's code, it's
3637 // used for an internal purpose)
3638 #if OMPT_SUPPORT
3639     // JP: can this barrier potentially lead to task scheduling?
3640     // JP: as long as there is a barrier in the implementation, OMPT should and
3641     // will provide the barrier events
3642     //         so we set up the necessary frame/return addresses.
3643     ompt_frame_t *ompt_frame;
3644     if (ompt_enabled.enabled) {
3645       __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
3646       if (ompt_frame->enter_frame.ptr == NULL)
3647         ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
3648     }
3649     OMPT_STORE_RETURN_ADDRESS(global_tid);
3650 #endif
3651 #if USE_ITT_NOTIFY
3652     __kmp_threads[global_tid]->th.th_ident = loc;
3653 #endif
3654     retval =
3655         __kmp_barrier(UNPACK_REDUCTION_BARRIER(packed_reduction_method),
3656                       global_tid, FALSE, reduce_size, reduce_data, reduce_func);
3657     retval = (retval != 0) ? (0) : (1);
3658 #if OMPT_SUPPORT && OMPT_OPTIONAL
3659     if (ompt_enabled.enabled) {
3660       ompt_frame->enter_frame = ompt_data_none;
3661     }
3662 #endif
3663 
3664     // all other workers except primary thread should do this pop here
3665     //     (none of the other workers will get to __kmpc_end_reduce_nowait())
3666     if (__kmp_env_consistency_check) {
3667       if (retval == 0) {
3668         __kmp_pop_sync(global_tid, ct_reduce, loc);
3669       }
3670     }
3671 
3672   } else {
3673 
3674     // should never reach this block
3675     KMP_ASSERT(0); // "unexpected method"
3676   }
3677   if (teams_swapped) {
3678     __kmp_restore_swapped_teams(th, team, task_state);
3679   }
3680   KA_TRACE(
3681       10,
3682       ("__kmpc_reduce_nowait() exit: called T#%d: method %08x, returns %08x\n",
3683        global_tid, packed_reduction_method, retval));
3684 
3685   return retval;
3686 }
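
/* Codegen sketch (hypothetical, for illustration only): a front end using this
   interface typically switches on the return value, along the lines of

     res = __kmpc_reduce_nowait(&loc, gtid, 1, sizeof(local), &local,
                                reduce_fn, &crit_name);
     switch (res) {
     case 1: // combine here (inside the critical section, or as primary thread
             // after the tree gather)
       global += local;
       __kmpc_end_reduce_nowait(&loc, gtid, &crit_name);
       break;
     case 2: // atomic method: every thread combines atomically, no end call
       __atomic_fetch_add(&global, local, __ATOMIC_RELAXED);
       break;
     default: // 0: this thread's data was already combined by the tree barrier
       break;
     }

   Names such as "local", "global", "reduce_fn" and "crit_name" are placeholders
   and not part of this runtime. */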
3687 
3688 /*!
3689 @ingroup SYNCHRONIZATION
3690 @param loc source location information
3691 @param global_tid global thread id.
3692 @param lck pointer to the unique lock data structure
3693 
3694 Finish the execution of a reduce nowait.
3695 */
3696 void __kmpc_end_reduce_nowait(ident_t *loc, kmp_int32 global_tid,
3697                               kmp_critical_name *lck) {
3698 
3699   PACKED_REDUCTION_METHOD_T packed_reduction_method;
3700 
3701   KA_TRACE(10, ("__kmpc_end_reduce_nowait() enter: called T#%d\n", global_tid));
3702   __kmp_assert_valid_gtid(global_tid);
3703 
3704   packed_reduction_method = __KMP_GET_REDUCTION_METHOD(global_tid);
3705 
3706   OMPT_REDUCTION_DECL(__kmp_thread_from_gtid(global_tid), global_tid);
3707 
3708   if (packed_reduction_method == critical_reduce_block) {
3709 
3710     __kmp_end_critical_section_reduce_block(loc, global_tid, lck);
3711     OMPT_REDUCTION_END;
3712 
3713   } else if (packed_reduction_method == empty_reduce_block) {
3714 
3715     // usage: if team size == 1, no synchronization is required ( on Intel
3716     // platforms only )
3717 
3718     OMPT_REDUCTION_END;
3719 
3720   } else if (packed_reduction_method == atomic_reduce_block) {
3721 
3722     // neither primary thread nor other workers should get here
3723     //     (code gen does not generate this call in case 2: atomic reduce block)
3724     // actually it would be better to remove this else-if entirely;
3725     // after removal this value will be checked by the 'else' and will assert
3726 
3727   } else if (TEST_REDUCTION_METHOD(packed_reduction_method,
3728                                    tree_reduce_block)) {
3729 
3730     // only primary thread gets here
3731     // OMPT: tree reduction is annotated in the barrier code
3732 
3733   } else {
3734 
3735     // should never reach this block
3736     KMP_ASSERT(0); // "unexpected method"
3737   }
3738 
3739   if (__kmp_env_consistency_check)
3740     __kmp_pop_sync(global_tid, ct_reduce, loc);
3741 
3742   KA_TRACE(10, ("__kmpc_end_reduce_nowait() exit: called T#%d: method %08x\n",
3743                 global_tid, packed_reduction_method));
3744 
3745   return;
3746 }
3747 
3748 /* 2.a.ii. Reduce Block with a terminating barrier */
3749 
3750 /*!
3751 @ingroup SYNCHRONIZATION
3752 @param loc source location information
3753 @param global_tid global thread number
3754 @param num_vars number of items (variables) to be reduced
3755 @param reduce_size size of data in bytes to be reduced
3756 @param reduce_data pointer to data to be reduced
3757 @param reduce_func callback function providing reduction operation on two
3758 operands and returning result of reduction in lhs_data
3759 @param lck pointer to the unique lock data structure
3760 @result 1 for the primary thread, 0 for all other team threads, 2 for all team
3761 threads if atomic reduction needed
3762 
3763 A blocking reduce that includes an implicit barrier.
3764 */
3765 kmp_int32 __kmpc_reduce(ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars,
3766                         size_t reduce_size, void *reduce_data,
3767                         void (*reduce_func)(void *lhs_data, void *rhs_data),
3768                         kmp_critical_name *lck) {
3769   KMP_COUNT_BLOCK(REDUCE_wait);
3770   int retval = 0;
3771   PACKED_REDUCTION_METHOD_T packed_reduction_method;
3772   kmp_info_t *th;
3773   kmp_team_t *team;
3774   int teams_swapped = 0, task_state;
3775 
3776   KA_TRACE(10, ("__kmpc_reduce() enter: called T#%d\n", global_tid));
3777   __kmp_assert_valid_gtid(global_tid);
3778 
3779   // why do we need this initialization here at all?
3780   // Reduction clause can not be a stand-alone directive.
3781 
3782   // do not call __kmp_serial_initialize(), it will be called by
3783   // __kmp_parallel_initialize() if needed
3784   // possible detection of false-positive race by the threadchecker ???
3785   if (!TCR_4(__kmp_init_parallel))
3786     __kmp_parallel_initialize();
3787 
3788   __kmp_resume_if_soft_paused();
3789 
3790 // check correctness of reduce block nesting
3791 #if KMP_USE_DYNAMIC_LOCK
3792   if (__kmp_env_consistency_check)
3793     __kmp_push_sync(global_tid, ct_reduce, loc, NULL, 0);
3794 #else
3795   if (__kmp_env_consistency_check)
3796     __kmp_push_sync(global_tid, ct_reduce, loc, NULL);
3797 #endif
3798 
3799   th = __kmp_thread_from_gtid(global_tid);
3800   teams_swapped = __kmp_swap_teams_for_teams_reduction(th, &team, &task_state);
3801 
3802   packed_reduction_method = __kmp_determine_reduction_method(
3803       loc, global_tid, num_vars, reduce_size, reduce_data, reduce_func, lck);
3804   __KMP_SET_REDUCTION_METHOD(global_tid, packed_reduction_method);
3805 
3806   OMPT_REDUCTION_DECL(th, global_tid);
3807 
3808   if (packed_reduction_method == critical_reduce_block) {
3809 
3810     OMPT_REDUCTION_BEGIN;
3811     __kmp_enter_critical_section_reduce_block(loc, global_tid, lck);
3812     retval = 1;
3813 
3814   } else if (packed_reduction_method == empty_reduce_block) {
3815 
3816     OMPT_REDUCTION_BEGIN;
3817     // usage: if team size == 1, no synchronization is required ( Intel
3818     // platforms only )
3819     retval = 1;
3820 
3821   } else if (packed_reduction_method == atomic_reduce_block) {
3822 
3823     retval = 2;
3824 
3825   } else if (TEST_REDUCTION_METHOD(packed_reduction_method,
3826                                    tree_reduce_block)) {
3827 
3828 // case tree_reduce_block:
3829 // this barrier should be visible to a customer and to the threading profile
3830 // tool (it's a terminating barrier on constructs if NOWAIT not specified)
3831 #if OMPT_SUPPORT
3832     ompt_frame_t *ompt_frame;
3833     if (ompt_enabled.enabled) {
3834       __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
3835       if (ompt_frame->enter_frame.ptr == NULL)
3836         ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
3837     }
3838     OMPT_STORE_RETURN_ADDRESS(global_tid);
3839 #endif
3840 #if USE_ITT_NOTIFY
3841     __kmp_threads[global_tid]->th.th_ident =
3842         loc; // needed for correct notification of frames
3843 #endif
3844     retval =
3845         __kmp_barrier(UNPACK_REDUCTION_BARRIER(packed_reduction_method),
3846                       global_tid, TRUE, reduce_size, reduce_data, reduce_func);
3847     retval = (retval != 0) ? (0) : (1);
3848 #if OMPT_SUPPORT && OMPT_OPTIONAL
3849     if (ompt_enabled.enabled) {
3850       ompt_frame->enter_frame = ompt_data_none;
3851     }
3852 #endif
3853 
3854     // all other workers except primary thread should do this pop here
3855     // (no worker other than the primary will enter __kmpc_end_reduce())
3856     if (__kmp_env_consistency_check) {
3857       if (retval == 0) { // 0: all other workers; 1: primary thread
3858         __kmp_pop_sync(global_tid, ct_reduce, loc);
3859       }
3860     }
3861 
3862   } else {
3863 
3864     // should never reach this block
3865     KMP_ASSERT(0); // "unexpected method"
3866   }
3867   if (teams_swapped) {
3868     __kmp_restore_swapped_teams(th, team, task_state);
3869   }
3870 
3871   KA_TRACE(10,
3872            ("__kmpc_reduce() exit: called T#%d: method %08x, returns %08x\n",
3873             global_tid, packed_reduction_method, retval));
3874   return retval;
3875 }
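
/* The blocking variant is expected to follow the same codegen pattern as the
   __kmpc_reduce_nowait() sketch above, except that threads receiving 1 or 2
   also call __kmpc_end_reduce() below, which supplies the terminating barrier
   (note that __kmpc_end_reduce() handles the atomic case, unlike
   __kmpc_end_reduce_nowait()). */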
3876 
3877 /*!
3878 @ingroup SYNCHRONIZATION
3879 @param loc source location information
3880 @param global_tid global thread id.
3881 @param lck pointer to the unique lock data structure
3882 
3883 Finish the execution of a blocking reduce.
3884 The <tt>lck</tt> pointer must be the same as that used in the corresponding
3885 start function.
3886 */
3887 void __kmpc_end_reduce(ident_t *loc, kmp_int32 global_tid,
3888                        kmp_critical_name *lck) {
3889 
3890   PACKED_REDUCTION_METHOD_T packed_reduction_method;
3891   kmp_info_t *th;
3892   kmp_team_t *team;
3893   int teams_swapped = 0, task_state;
3894 
3895   KA_TRACE(10, ("__kmpc_end_reduce() enter: called T#%d\n", global_tid));
3896   __kmp_assert_valid_gtid(global_tid);
3897 
3898   th = __kmp_thread_from_gtid(global_tid);
3899   teams_swapped = __kmp_swap_teams_for_teams_reduction(th, &team, &task_state);
3900 
3901   packed_reduction_method = __KMP_GET_REDUCTION_METHOD(global_tid);
3902 
3903   // this barrier should be visible to a customer and to the threading profile
3904   // tool (it's a terminating barrier on constructs if NOWAIT not specified)
3905   OMPT_REDUCTION_DECL(th, global_tid);
3906 
3907   if (packed_reduction_method == critical_reduce_block) {
3908     __kmp_end_critical_section_reduce_block(loc, global_tid, lck);
3909 
3910     OMPT_REDUCTION_END;
3911 
3912 // TODO: implicit barrier: should be exposed
3913 #if OMPT_SUPPORT
3914     ompt_frame_t *ompt_frame;
3915     if (ompt_enabled.enabled) {
3916       __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
3917       if (ompt_frame->enter_frame.ptr == NULL)
3918         ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
3919     }
3920     OMPT_STORE_RETURN_ADDRESS(global_tid);
3921 #endif
3922 #if USE_ITT_NOTIFY
3923     __kmp_threads[global_tid]->th.th_ident = loc;
3924 #endif
3925     __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
3926 #if OMPT_SUPPORT && OMPT_OPTIONAL
3927     if (ompt_enabled.enabled) {
3928       ompt_frame->enter_frame = ompt_data_none;
3929     }
3930 #endif
3931 
3932   } else if (packed_reduction_method == empty_reduce_block) {
3933 
3934     OMPT_REDUCTION_END;
3935 
3936 // usage: if team size==1, no synchronization is required (Intel platforms only)
3937 
3938 // TODO: implicit barrier: should be exposed
3939 #if OMPT_SUPPORT
3940     ompt_frame_t *ompt_frame;
3941     if (ompt_enabled.enabled) {
3942       __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
3943       if (ompt_frame->enter_frame.ptr == NULL)
3944         ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
3945     }
3946     OMPT_STORE_RETURN_ADDRESS(global_tid);
3947 #endif
3948 #if USE_ITT_NOTIFY
3949     __kmp_threads[global_tid]->th.th_ident = loc;
3950 #endif
3951     __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
3952 #if OMPT_SUPPORT && OMPT_OPTIONAL
3953     if (ompt_enabled.enabled) {
3954       ompt_frame->enter_frame = ompt_data_none;
3955     }
3956 #endif
3957 
3958   } else if (packed_reduction_method == atomic_reduce_block) {
3959 
3960 #if OMPT_SUPPORT
3961     ompt_frame_t *ompt_frame;
3962     if (ompt_enabled.enabled) {
3963       __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
3964       if (ompt_frame->enter_frame.ptr == NULL)
3965         ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
3966     }
3967     OMPT_STORE_RETURN_ADDRESS(global_tid);
3968 #endif
3969 // TODO: implicit barrier: should be exposed
3970 #if USE_ITT_NOTIFY
3971     __kmp_threads[global_tid]->th.th_ident = loc;
3972 #endif
3973     __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
3974 #if OMPT_SUPPORT && OMPT_OPTIONAL
3975     if (ompt_enabled.enabled) {
3976       ompt_frame->enter_frame = ompt_data_none;
3977     }
3978 #endif
3979 
3980   } else if (TEST_REDUCTION_METHOD(packed_reduction_method,
3981                                    tree_reduce_block)) {
3982 
3983     // only primary thread executes here (primary releases all other workers)
3984     __kmp_end_split_barrier(UNPACK_REDUCTION_BARRIER(packed_reduction_method),
3985                             global_tid);
3986 
3987   } else {
3988 
3989     // should never reach this block
3990     KMP_ASSERT(0); // "unexpected method"
3991   }
3992   if (teams_swapped) {
3993     __kmp_restore_swapped_teams(th, team, task_state);
3994   }
3995 
3996   if (__kmp_env_consistency_check)
3997     __kmp_pop_sync(global_tid, ct_reduce, loc);
3998 
3999   KA_TRACE(10, ("__kmpc_end_reduce() exit: called T#%d: method %08x\n",
4000                 global_tid, packed_reduction_method));
4001 
4002   return;
4003 }
4004 
4005 #undef __KMP_GET_REDUCTION_METHOD
4006 #undef __KMP_SET_REDUCTION_METHOD
4007 
4008 /* end of interface to fast scalable reduce routines */
4009 
4010 kmp_uint64 __kmpc_get_taskid() {
4011 
4012   kmp_int32 gtid;
4013   kmp_info_t *thread;
4014 
4015   gtid = __kmp_get_gtid();
4016   if (gtid < 0) {
4017     return 0;
4018   }
4019   thread = __kmp_thread_from_gtid(gtid);
4020   return thread->th.th_current_task->td_task_id;
4021 
4022 } // __kmpc_get_taskid
4023 
4024 kmp_uint64 __kmpc_get_parent_taskid() {
4025 
4026   kmp_int32 gtid;
4027   kmp_info_t *thread;
4028   kmp_taskdata_t *parent_task;
4029 
4030   gtid = __kmp_get_gtid();
4031   if (gtid < 0) {
4032     return 0;
4033   }
4034   thread = __kmp_thread_from_gtid(gtid);
4035   parent_task = thread->th.th_current_task->td_parent;
4036   return (parent_task == NULL ? 0 : parent_task->td_task_id);
4037 
4038 } // __kmpc_get_parent_taskid
4039 
4040 /*!
4041 @ingroup WORK_SHARING
4042 @param loc  source location information.
4043 @param gtid  global thread number.
4044 @param num_dims  number of associated doacross loops.
4045 @param dims  info on loops bounds.
4046 
4047 Initialize doacross loop information.
4048 Expect the compiler to send us inclusive bounds,
4049 e.g. for(i=2;i<9;i+=2) lo=2, up=8, st=2.
4050 */
4051 void __kmpc_doacross_init(ident_t *loc, int gtid, int num_dims,
4052                           const struct kmp_dim *dims) {
4053   __kmp_assert_valid_gtid(gtid);
4054   int j, idx;
4055   kmp_int64 last, trace_count;
4056   kmp_info_t *th = __kmp_threads[gtid];
4057   kmp_team_t *team = th->th.th_team;
4058   kmp_uint32 *flags;
4059   kmp_disp_t *pr_buf = th->th.th_dispatch;
4060   dispatch_shared_info_t *sh_buf;
4061 
4062   KA_TRACE(
4063       20,
4064       ("__kmpc_doacross_init() enter: called T#%d, num dims %d, active %d\n",
4065        gtid, num_dims, !team->t.t_serialized));
4066   KMP_DEBUG_ASSERT(dims != NULL);
4067   KMP_DEBUG_ASSERT(num_dims > 0);
4068 
4069   if (team->t.t_serialized) {
4070     KA_TRACE(20, ("__kmpc_doacross_init() exit: serialized team\n"));
4071     return; // no dependencies if team is serialized
4072   }
4073   KMP_DEBUG_ASSERT(team->t.t_nproc > 1);
4074   idx = pr_buf->th_doacross_buf_idx++; // Increment index of shared buffer for
4075   // the next loop
4076   sh_buf = &team->t.t_disp_buffer[idx % __kmp_dispatch_num_buffers];
4077 
4078   // Save bounds info into allocated private buffer
4079   KMP_DEBUG_ASSERT(pr_buf->th_doacross_info == NULL);
4080   pr_buf->th_doacross_info = (kmp_int64 *)__kmp_thread_malloc(
4081       th, sizeof(kmp_int64) * (4 * num_dims + 1));
4082   KMP_DEBUG_ASSERT(pr_buf->th_doacross_info != NULL);
4083   pr_buf->th_doacross_info[0] =
4084       (kmp_int64)num_dims; // first element is number of dimensions
4085   // Save also address of num_done in order to access it later without knowing
4086   // the buffer index
4087   pr_buf->th_doacross_info[1] = (kmp_int64)&sh_buf->doacross_num_done;
4088   pr_buf->th_doacross_info[2] = dims[0].lo;
4089   pr_buf->th_doacross_info[3] = dims[0].up;
4090   pr_buf->th_doacross_info[4] = dims[0].st;
4091   last = 5;
4092   for (j = 1; j < num_dims; ++j) {
4093     kmp_int64
4094         range_length; // To keep ranges of all dimensions but the first dims[0]
4095     if (dims[j].st == 1) { // most common case
4096       // AC: should we care of ranges bigger than LLONG_MAX? (not for now)
4097       range_length = dims[j].up - dims[j].lo + 1;
4098     } else {
4099       if (dims[j].st > 0) {
4100         KMP_DEBUG_ASSERT(dims[j].up > dims[j].lo);
4101         range_length = (kmp_uint64)(dims[j].up - dims[j].lo) / dims[j].st + 1;
4102       } else { // negative increment
4103         KMP_DEBUG_ASSERT(dims[j].lo > dims[j].up);
4104         range_length =
4105             (kmp_uint64)(dims[j].lo - dims[j].up) / (-dims[j].st) + 1;
4106       }
4107     }
4108     pr_buf->th_doacross_info[last++] = range_length;
4109     pr_buf->th_doacross_info[last++] = dims[j].lo;
4110     pr_buf->th_doacross_info[last++] = dims[j].up;
4111     pr_buf->th_doacross_info[last++] = dims[j].st;
4112   }
4113 
4114   // Compute total trip count.
4115   // Start with range of dims[0] which we don't need to keep in the buffer.
4116   if (dims[0].st == 1) { // most common case
4117     trace_count = dims[0].up - dims[0].lo + 1;
4118   } else if (dims[0].st > 0) {
4119     KMP_DEBUG_ASSERT(dims[0].up > dims[0].lo);
4120     trace_count = (kmp_uint64)(dims[0].up - dims[0].lo) / dims[0].st + 1;
4121   } else { // negative increment
4122     KMP_DEBUG_ASSERT(dims[0].lo > dims[0].up);
4123     trace_count = (kmp_uint64)(dims[0].lo - dims[0].up) / (-dims[0].st) + 1;
4124   }
4125   for (j = 1; j < num_dims; ++j) {
4126     trace_count *= pr_buf->th_doacross_info[4 * j + 1]; // use kept ranges
4127   }
4128   KMP_DEBUG_ASSERT(trace_count > 0);
4129 
4130   // Check if shared buffer is not occupied by other loop (idx -
4131   // __kmp_dispatch_num_buffers)
4132   if (idx != sh_buf->doacross_buf_idx) {
4133     // Shared buffer is occupied, wait for it to be free
4134     __kmp_wait_4((volatile kmp_uint32 *)&sh_buf->doacross_buf_idx, idx,
4135                  __kmp_eq_4, NULL);
4136   }
4137 #if KMP_32_BIT_ARCH
4138   // Check if we are the first thread. After the CAS the first thread gets 0,
4139   // others get 1 if initialization is in progress, allocated pointer otherwise.
4140   // Treat pointer as volatile integer (value 0 or 1) until memory is allocated.
4141   flags = (kmp_uint32 *)KMP_COMPARE_AND_STORE_RET32(
4142       (volatile kmp_int32 *)&sh_buf->doacross_flags, NULL, 1);
4143 #else
4144   flags = (kmp_uint32 *)KMP_COMPARE_AND_STORE_RET64(
4145       (volatile kmp_int64 *)&sh_buf->doacross_flags, NULL, 1LL);
4146 #endif
4147   if (flags == NULL) {
4148     // we are the first thread, allocate the array of flags
4149     size_t size =
4150         (size_t)trace_count / 8 + 8; // in bytes, use single bit per iteration
4151     flags = (kmp_uint32 *)__kmp_thread_calloc(th, size, 1);
4152     KMP_MB();
4153     sh_buf->doacross_flags = flags;
4154   } else if (flags == (kmp_uint32 *)1) {
4155 #if KMP_32_BIT_ARCH
4156     // initialization is still in progress, need to wait
4157     while (*(volatile kmp_int32 *)&sh_buf->doacross_flags == 1)
4158 #else
4159     while (*(volatile kmp_int64 *)&sh_buf->doacross_flags == 1LL)
4160 #endif
4161       KMP_YIELD(TRUE);
4162     KMP_MB();
4163   } else {
4164     KMP_MB();
4165   }
4166   KMP_DEBUG_ASSERT(sh_buf->doacross_flags > (kmp_uint32 *)1); // check ptr value
4167   pr_buf->th_doacross_flags =
4168       sh_buf->doacross_flags; // save private copy in order to not
4169   // touch shared buffer on each iteration
4170   KA_TRACE(20, ("__kmpc_doacross_init() exit: T#%d\n", gtid));
4171 }
4172 
4173 void __kmpc_doacross_wait(ident_t *loc, int gtid, const kmp_int64 *vec) {
4174   __kmp_assert_valid_gtid(gtid);
4175   kmp_int64 shft;
4176   size_t num_dims, i;
4177   kmp_uint32 flag;
4178   kmp_int64 iter_number; // iteration number of "collapsed" loop nest
4179   kmp_info_t *th = __kmp_threads[gtid];
4180   kmp_team_t *team = th->th.th_team;
4181   kmp_disp_t *pr_buf;
4182   kmp_int64 lo, up, st;
4183 
4184   KA_TRACE(20, ("__kmpc_doacross_wait() enter: called T#%d\n", gtid));
4185   if (team->t.t_serialized) {
4186     KA_TRACE(20, ("__kmpc_doacross_wait() exit: serialized team\n"));
4187     return; // no dependencies if team is serialized
4188   }
4189 
4190   // calculate sequential iteration number and check out-of-bounds condition
4191   pr_buf = th->th.th_dispatch;
4192   KMP_DEBUG_ASSERT(pr_buf->th_doacross_info != NULL);
4193   num_dims = (size_t)pr_buf->th_doacross_info[0];
4194   lo = pr_buf->th_doacross_info[2];
4195   up = pr_buf->th_doacross_info[3];
4196   st = pr_buf->th_doacross_info[4];
4197 #if OMPT_SUPPORT && OMPT_OPTIONAL
4198   ompt_dependence_t deps[num_dims];
4199 #endif
4200   if (st == 1) { // most common case
4201     if (vec[0] < lo || vec[0] > up) {
4202       KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
4203                     "bounds [%lld,%lld]\n",
4204                     gtid, vec[0], lo, up));
4205       return;
4206     }
4207     iter_number = vec[0] - lo;
4208   } else if (st > 0) {
4209     if (vec[0] < lo || vec[0] > up) {
4210       KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
4211                     "bounds [%lld,%lld]\n",
4212                     gtid, vec[0], lo, up));
4213       return;
4214     }
4215     iter_number = (kmp_uint64)(vec[0] - lo) / st;
4216   } else { // negative increment
4217     if (vec[0] > lo || vec[0] < up) {
4218       KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
4219                     "bounds [%lld,%lld]\n",
4220                     gtid, vec[0], lo, up));
4221       return;
4222     }
4223     iter_number = (kmp_uint64)(lo - vec[0]) / (-st);
4224   }
4225 #if OMPT_SUPPORT && OMPT_OPTIONAL
4226   deps[0].variable.value = iter_number;
4227   deps[0].dependence_type = ompt_dependence_type_sink;
4228 #endif
4229   for (i = 1; i < num_dims; ++i) {
4230     kmp_int64 iter, ln;
4231     size_t j = i * 4;
4232     ln = pr_buf->th_doacross_info[j + 1];
4233     lo = pr_buf->th_doacross_info[j + 2];
4234     up = pr_buf->th_doacross_info[j + 3];
4235     st = pr_buf->th_doacross_info[j + 4];
4236     if (st == 1) {
4237       if (vec[i] < lo || vec[i] > up) {
4238         KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
4239                       "bounds [%lld,%lld]\n",
4240                       gtid, vec[i], lo, up));
4241         return;
4242       }
4243       iter = vec[i] - lo;
4244     } else if (st > 0) {
4245       if (vec[i] < lo || vec[i] > up) {
4246         KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
4247                       "bounds [%lld,%lld]\n",
4248                       gtid, vec[i], lo, up));
4249         return;
4250       }
4251       iter = (kmp_uint64)(vec[i] - lo) / st;
4252     } else { // st < 0
4253       if (vec[i] > lo || vec[i] < up) {
4254         KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
4255                       "bounds [%lld,%lld]\n",
4256                       gtid, vec[i], lo, up));
4257         return;
4258       }
4259       iter = (kmp_uint64)(lo - vec[i]) / (-st);
4260     }
4261     iter_number = iter + ln * iter_number;
4262 #if OMPT_SUPPORT && OMPT_OPTIONAL
4263     deps[i].variable.value = iter;
4264     deps[i].dependence_type = ompt_dependence_type_sink;
4265 #endif
4266   }
4267   shft = iter_number % 32; // use 32-bit granularity
4268   iter_number >>= 5; // divided by 32
4269   flag = 1 << shft;
4270   while ((flag & pr_buf->th_doacross_flags[iter_number]) == 0) {
4271     KMP_YIELD(TRUE);
4272   }
4273   KMP_MB();
4274 #if OMPT_SUPPORT && OMPT_OPTIONAL
4275   if (ompt_enabled.ompt_callback_dependences) {
4276     ompt_callbacks.ompt_callback(ompt_callback_dependences)(
4277         &(OMPT_CUR_TASK_INFO(th)->task_data), deps, (kmp_uint32)num_dims);
4278   }
4279 #endif
4280   KA_TRACE(20,
4281            ("__kmpc_doacross_wait() exit: T#%d wait for iter %lld completed\n",
4282             gtid, (iter_number << 5) + shft));
4283 }
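
/* Worked example of the flag indexing above (informational): for linearized
   iteration number 70, shft = 70 % 32 = 6 and the word index is 70 >> 5 = 2,
   so the waiter spins on bit 6 of th_doacross_flags[2] until
   __kmpc_doacross_post() sets that same bit for the matching source
   iteration. */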
4284 
4285 void __kmpc_doacross_post(ident_t *loc, int gtid, const kmp_int64 *vec) {
4286   __kmp_assert_valid_gtid(gtid);
4287   kmp_int64 shft;
4288   size_t num_dims, i;
4289   kmp_uint32 flag;
4290   kmp_int64 iter_number; // iteration number of "collapsed" loop nest
4291   kmp_info_t *th = __kmp_threads[gtid];
4292   kmp_team_t *team = th->th.th_team;
4293   kmp_disp_t *pr_buf;
4294   kmp_int64 lo, st;
4295 
4296   KA_TRACE(20, ("__kmpc_doacross_post() enter: called T#%d\n", gtid));
4297   if (team->t.t_serialized) {
4298     KA_TRACE(20, ("__kmpc_doacross_post() exit: serialized team\n"));
4299     return; // no dependencies if team is serialized
4300   }
4301 
4302   // calculate sequential iteration number (same as in "wait" but no
4303   // out-of-bounds checks)
4304   pr_buf = th->th.th_dispatch;
4305   KMP_DEBUG_ASSERT(pr_buf->th_doacross_info != NULL);
4306   num_dims = (size_t)pr_buf->th_doacross_info[0];
4307   lo = pr_buf->th_doacross_info[2];
4308   st = pr_buf->th_doacross_info[4];
4309 #if OMPT_SUPPORT && OMPT_OPTIONAL
4310   ompt_dependence_t deps[num_dims];
4311 #endif
4312   if (st == 1) { // most common case
4313     iter_number = vec[0] - lo;
4314   } else if (st > 0) {
4315     iter_number = (kmp_uint64)(vec[0] - lo) / st;
4316   } else { // negative increment
4317     iter_number = (kmp_uint64)(lo - vec[0]) / (-st);
4318   }
4319 #if OMPT_SUPPORT && OMPT_OPTIONAL
4320   deps[0].variable.value = iter_number;
4321   deps[0].dependence_type = ompt_dependence_type_source;
4322 #endif
4323   for (i = 1; i < num_dims; ++i) {
4324     kmp_int64 iter, ln;
4325     size_t j = i * 4;
4326     ln = pr_buf->th_doacross_info[j + 1];
4327     lo = pr_buf->th_doacross_info[j + 2];
4328     st = pr_buf->th_doacross_info[j + 4];
4329     if (st == 1) {
4330       iter = vec[i] - lo;
4331     } else if (st > 0) {
4332       iter = (kmp_uint64)(vec[i] - lo) / st;
4333     } else { // st < 0
4334       iter = (kmp_uint64)(lo - vec[i]) / (-st);
4335     }
4336     iter_number = iter + ln * iter_number;
4337 #if OMPT_SUPPORT && OMPT_OPTIONAL
4338     deps[i].variable.value = iter;
4339     deps[i].dependence_type = ompt_dependence_type_source;
4340 #endif
4341   }
4342 #if OMPT_SUPPORT && OMPT_OPTIONAL
4343   if (ompt_enabled.ompt_callback_dependences) {
4344     ompt_callbacks.ompt_callback(ompt_callback_dependences)(
4345         &(OMPT_CUR_TASK_INFO(th)->task_data), deps, (kmp_uint32)num_dims);
4346   }
4347 #endif
4348   shft = iter_number % 32; // use 32-bit granularity
4349   iter_number >>= 5; // divided by 32
4350   flag = 1 << shft;
4351   KMP_MB();
4352   if ((flag & pr_buf->th_doacross_flags[iter_number]) == 0)
4353     KMP_TEST_THEN_OR32(&pr_buf->th_doacross_flags[iter_number], flag);
4354   KA_TRACE(20, ("__kmpc_doacross_post() exit: T#%d iter %lld posted\n", gtid,
4355                 (iter_number << 5) + shft));
4356 }
4357 
4358 void __kmpc_doacross_fini(ident_t *loc, int gtid) {
4359   __kmp_assert_valid_gtid(gtid);
4360   kmp_int32 num_done;
4361   kmp_info_t *th = __kmp_threads[gtid];
4362   kmp_team_t *team = th->th.th_team;
4363   kmp_disp_t *pr_buf = th->th.th_dispatch;
4364 
4365   KA_TRACE(20, ("__kmpc_doacross_fini() enter: called T#%d\n", gtid));
4366   if (team->t.t_serialized) {
4367     KA_TRACE(20, ("__kmpc_doacross_fini() exit: serialized team %p\n", team));
4368     return; // nothing to do
4369   }
4370   num_done =
4371       KMP_TEST_THEN_INC32((kmp_uintptr_t)(pr_buf->th_doacross_info[1])) + 1;
4372   if (num_done == th->th.th_team_nproc) {
4373     // we are the last thread, need to free shared resources
4374     int idx = pr_buf->th_doacross_buf_idx - 1;
4375     dispatch_shared_info_t *sh_buf =
4376         &team->t.t_disp_buffer[idx % __kmp_dispatch_num_buffers];
4377     KMP_DEBUG_ASSERT(pr_buf->th_doacross_info[1] ==
4378                      (kmp_int64)&sh_buf->doacross_num_done);
4379     KMP_DEBUG_ASSERT(num_done == sh_buf->doacross_num_done);
4380     KMP_DEBUG_ASSERT(idx == sh_buf->doacross_buf_idx);
4381     __kmp_thread_free(th, CCAST(kmp_uint32 *, sh_buf->doacross_flags));
4382     sh_buf->doacross_flags = NULL;
4383     sh_buf->doacross_num_done = 0;
4384     sh_buf->doacross_buf_idx +=
4385         __kmp_dispatch_num_buffers; // free buffer for future re-use
4386   }
4387   // free private resources (need to keep buffer index forever)
4388   pr_buf->th_doacross_flags = NULL;
4389   __kmp_thread_free(th, (void *)pr_buf->th_doacross_info);
4390   pr_buf->th_doacross_info = NULL;
4391   KA_TRACE(20, ("__kmpc_doacross_fini() exit: T#%d\n", gtid));
4392 }
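
/* Mapping sketch (hypothetical codegen, for illustration): a doacross loop such
   as

     #pragma omp for ordered(1)
     for (i = 2; i < 9; i += 2) {
       #pragma omp ordered depend(sink : i - 2)
       // ... consume the result of iteration i - 2 ...
       #pragma omp ordered depend(source)
       // ... produce the result of iteration i ...
     }

   is expected to lower to one __kmpc_doacross_init() per thread (lo=2, up=8,
   st=2, matching the example in the __kmpc_doacross_init() documentation), a
   __kmpc_doacross_wait() per depend(sink) vector, a __kmpc_doacross_post() per
   depend(source), and a final __kmpc_doacross_fini() once the loop finishes. */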
4393 
4394 /* OpenMP 5.1 Memory Management routines */
4395 void *omp_alloc(size_t size, omp_allocator_handle_t allocator) {
4396   return __kmp_alloc(__kmp_entry_gtid(), 0, size, allocator);
4397 }
4398 
4399 void *omp_aligned_alloc(size_t align, size_t size,
4400                         omp_allocator_handle_t allocator) {
4401   return __kmp_alloc(__kmp_entry_gtid(), align, size, allocator);
4402 }
4403 
4404 void *omp_calloc(size_t nmemb, size_t size, omp_allocator_handle_t allocator) {
4405   return __kmp_calloc(__kmp_entry_gtid(), 0, nmemb, size, allocator);
4406 }
4407 
4408 void *omp_aligned_calloc(size_t align, size_t nmemb, size_t size,
4409                          omp_allocator_handle_t allocator) {
4410   return __kmp_calloc(__kmp_entry_gtid(), align, nmemb, size, allocator);
4411 }
4412 
4413 void *omp_realloc(void *ptr, size_t size, omp_allocator_handle_t allocator,
4414                   omp_allocator_handle_t free_allocator) {
4415   return __kmp_realloc(__kmp_entry_gtid(), ptr, size, allocator,
4416                        free_allocator);
4417 }
4418 
4419 void omp_free(void *ptr, omp_allocator_handle_t allocator) {
4420   ___kmpc_free(__kmp_entry_gtid(), ptr, allocator);
4421 }
4422 /* end of OpenMP 5.1 Memory Management routines */
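
/* Usage sketch (user-level, not part of the runtime): these entry points back
   the OpenMP 5.x allocator API, e.g.

     double *buf = (double *)omp_aligned_alloc(64, 1024 * sizeof(double),
                                               omp_default_mem_alloc);
     if (buf) {
       // ... use buf ...
       omp_free(buf, omp_default_mem_alloc);
     }

   omp_default_mem_alloc is one of the predefined allocators declared in omp.h. */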
4423 
4424 int __kmpc_get_target_offload(void) {
4425   if (!__kmp_init_serial) {
4426     __kmp_serial_initialize();
4427   }
4428   return __kmp_target_offload;
4429 }
4430 
4431 int __kmpc_pause_resource(kmp_pause_status_t level) {
4432   if (!__kmp_init_serial) {
4433     return 1; // Can't pause if runtime is not initialized
4434   }
4435   return __kmp_pause_resource(level);
4436 }
4437 
4438 void __kmpc_error(ident_t *loc, int severity, const char *message) {
4439   if (!__kmp_init_serial)
4440     __kmp_serial_initialize();
4441 
4442   KMP_ASSERT(severity == severity_warning || severity == severity_fatal);
4443 
4444 #if OMPT_SUPPORT
4445   if (ompt_enabled.enabled && ompt_enabled.ompt_callback_error) {
4446     ompt_callbacks.ompt_callback(ompt_callback_error)(
4447         (ompt_severity_t)severity, message, KMP_STRLEN(message),
4448         OMPT_GET_RETURN_ADDRESS(0));
4449   }
4450 #endif // OMPT_SUPPORT
4451 
4452   char *src_loc;
4453   if (loc && loc->psource) {
4454     kmp_str_loc_t str_loc = __kmp_str_loc_init(loc->psource, false);
4455     src_loc =
4456         __kmp_str_format("%s:%d:%d", str_loc.file, str_loc.line, str_loc.col);
4457     __kmp_str_loc_free(&str_loc);
4458   } else {
4459     src_loc = __kmp_str_format("unknown");
4460   }
4461 
4462   if (severity == severity_warning)
4463     KMP_WARNING(UserDirectedWarning, src_loc, message);
4464   else
4465     KMP_FATAL(UserDirectedError, src_loc, message);
4466 
4467   __kmp_str_free(&src_loc);
4468 }
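
/* Directive sketch (illustrative): this entry point is the runtime side of the
   OpenMP 5.1 'error' directive, so user code along the lines of

     #pragma omp error severity(warning) at(execution) message("check input")

   is expected to reach __kmpc_error() with severity_warning and the given
   message at run time. */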
4469 
4470 // Mark begin of scope directive.
4471 void __kmpc_scope(ident_t *loc, kmp_int32 gtid, void *reserved) {
4472 // reserved is for extension of scope directive and not used.
4473 #if OMPT_SUPPORT && OMPT_OPTIONAL
4474   if (ompt_enabled.enabled && ompt_enabled.ompt_callback_work) {
4475     kmp_team_t *team = __kmp_threads[gtid]->th.th_team;
4476     int tid = __kmp_tid_from_gtid(gtid);
4477     ompt_callbacks.ompt_callback(ompt_callback_work)(
4478         ompt_work_scope, ompt_scope_begin,
4479         &(team->t.ompt_team_info.parallel_data),
4480         &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data), 1,
4481         OMPT_GET_RETURN_ADDRESS(0));
4482   }
4483 #endif // OMPT_SUPPORT && OMPT_OPTIONAL
4484 }
4485 
4486 // Mark end of scope directive
4487 void __kmpc_end_scope(ident_t *loc, kmp_int32 gtid, void *reserved) {
4488 // reserved is for extension of scope directive and not used.
4489 #if OMPT_SUPPORT && OMPT_OPTIONAL
4490   if (ompt_enabled.enabled && ompt_enabled.ompt_callback_work) {
4491     kmp_team_t *team = __kmp_threads[gtid]->th.th_team;
4492     int tid = __kmp_tid_from_gtid(gtid);
4493     ompt_callbacks.ompt_callback(ompt_callback_work)(
4494         ompt_work_scope, ompt_scope_end,
4495         &(team->t.ompt_team_info.parallel_data),
4496         &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data), 1,
4497         OMPT_GET_RETURN_ADDRESS(0));
4498   }
4499 #endif // OMPT_SUPPORT && OMPT_OPTIONAL
4500 }
4501 
4502 #ifdef KMP_USE_VERSION_SYMBOLS
4503 // For GOMP compatibility there are two versions of each omp_* API.
4504 // One is the plain C symbol and one is the Fortran symbol with an appended
4505 // underscore. When we implement a specific ompc_* version of an omp_*
4506 // function, we want the plain GOMP versioned symbol to alias the ompc_* version
4507 // instead of the Fortran versions in kmp_ftn_entry.h
4508 extern "C" {
4509 // Have to undef these from omp.h so they aren't translated into
4510 // their ompc counterparts in the KMP_VERSION_OMPC_SYMBOL macros below
4511 #ifdef omp_set_affinity_format
4512 #undef omp_set_affinity_format
4513 #endif
4514 #ifdef omp_get_affinity_format
4515 #undef omp_get_affinity_format
4516 #endif
4517 #ifdef omp_display_affinity
4518 #undef omp_display_affinity
4519 #endif
4520 #ifdef omp_capture_affinity
4521 #undef omp_capture_affinity
4522 #endif
4523 KMP_VERSION_OMPC_SYMBOL(ompc_set_affinity_format, omp_set_affinity_format, 50,
4524                         "OMP_5.0");
4525 KMP_VERSION_OMPC_SYMBOL(ompc_get_affinity_format, omp_get_affinity_format, 50,
4526                         "OMP_5.0");
4527 KMP_VERSION_OMPC_SYMBOL(ompc_display_affinity, omp_display_affinity, 50,
4528                         "OMP_5.0");
4529 KMP_VERSION_OMPC_SYMBOL(ompc_capture_affinity, omp_capture_affinity, 50,
4530                         "OMP_5.0");
4531 } // extern "C"
4532 #endif
4533