xref: /freebsd/contrib/llvm-project/openmp/runtime/src/kmp_csupport.cpp (revision 3dd5524264095ed8612c28908e13f80668eff2f9)
1 /*
2  * kmp_csupport.cpp -- kfront linkage support for OpenMP.
3  */
4 
5 //===----------------------------------------------------------------------===//
6 //
7 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8 // See https://llvm.org/LICENSE.txt for license information.
9 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #define __KMP_IMP
14 #include "omp.h" /* extern "C" declarations of user-visible routines */
15 #include "kmp.h"
16 #include "kmp_error.h"
17 #include "kmp_i18n.h"
18 #include "kmp_itt.h"
19 #include "kmp_lock.h"
20 #include "kmp_stats.h"
21 #include "ompt-specific.h"
22 
23 #define MAX_MESSAGE 512
24 
25 // flags will be used in future, e.g. to implement openmp_strict library
26 // restrictions
27 
28 /*!
29  * @ingroup STARTUP_SHUTDOWN
30  * @param loc   in   source location information
31  * @param flags in   for future use (currently ignored)
32  *
33  * Initialize the runtime library. This call is optional; if it is not made then
34  * it will be implicitly called by attempts to use other library functions.
35  */
36 void __kmpc_begin(ident_t *loc, kmp_int32 flags) {
37   // By default __kmpc_begin() is no-op.
38   char *env;
39   if ((env = getenv("KMP_INITIAL_THREAD_BIND")) != NULL &&
40       __kmp_str_match_true(env)) {
41     __kmp_middle_initialize();
42     __kmp_assign_root_init_mask();
43     KC_TRACE(10, ("__kmpc_begin: middle initialization called\n"));
44   } else if (__kmp_ignore_mppbeg() == FALSE) {
45     // By default __kmp_ignore_mppbeg() returns TRUE.
46     __kmp_internal_begin();
47     KC_TRACE(10, ("__kmpc_begin: called\n"));
48   }
49 }
50 
51 /*!
52  * @ingroup STARTUP_SHUTDOWN
53  * @param loc source location information
54  *
55  * Shutdown the runtime library. This is also optional, and even if called will
56  * not do anything unless the `KMP_IGNORE_MPPEND` environment variable is set to
57  * zero.
58  */
59 void __kmpc_end(ident_t *loc) {
60   // By default, __kmp_ignore_mppend() returns TRUE which makes __kmpc_end()
61   // call no-op. However, this can be overridden with KMP_IGNORE_MPPEND
62   // environment variable. If KMP_IGNORE_MPPEND is 0, __kmp_ignore_mppend()
63   // returns FALSE and __kmpc_end() will unregister this root (it can cause
64   // library shut down).
65   if (__kmp_ignore_mppend() == FALSE) {
66     KC_TRACE(10, ("__kmpc_end: called\n"));
67     KA_TRACE(30, ("__kmpc_end\n"));
68 
69     __kmp_internal_end_thread(-1);
70   }
71 #if KMP_OS_WINDOWS && OMPT_SUPPORT
72   // Normal exit process on Windows does not allow worker threads of the final
73   // parallel region to finish reporting their events, so shutting down the
74   // library here fixes the issue at least for the cases where __kmpc_end() is
75   // placed properly.
76   if (ompt_enabled.enabled)
77     __kmp_internal_end_library(__kmp_gtid_get_specific());
78 #endif
79 }
80 
81 /*!
82 @ingroup THREAD_STATES
83 @param loc Source location information.
84 @return The global thread index of the active thread.
85 
86 This function can be called in any context.
87 
88 If the runtime has ony been entered at the outermost level from a
89 single (necessarily non-OpenMP<sup>*</sup>) thread, then the thread number is
90 that which would be returned by omp_get_thread_num() in the outermost
91 active parallel construct. (Or zero if there is no active parallel
92 construct, since the primary thread is necessarily thread zero).
93 
94 If multiple non-OpenMP threads all enter an OpenMP construct then this
95 will be a unique thread identifier among all the threads created by
96 the OpenMP runtime (but the value cannot be defined in terms of
97 OpenMP thread ids returned by omp_get_thread_num()).
98 */
99 kmp_int32 __kmpc_global_thread_num(ident_t *loc) {
100   kmp_int32 gtid = __kmp_entry_gtid();
101 
102   KC_TRACE(10, ("__kmpc_global_thread_num: T#%d\n", gtid));
103 
104   return gtid;
105 }
106 
107 /*!
108 @ingroup THREAD_STATES
109 @param loc Source location information.
110 @return The number of threads under control of the OpenMP<sup>*</sup> runtime
111 
112 This function can be called in any context.
113 It returns the total number of threads under the control of the OpenMP runtime.
114 That is not a number that can be determined by any OpenMP standard calls, since
115 the library may be called from more than one non-OpenMP thread, and this
116 reflects the total over all such calls. Similarly the runtime maintains
117 underlying threads even when they are not active (since the cost of creating
118 and destroying OS threads is high), this call counts all such threads even if
119 they are not waiting for work.
120 */
121 kmp_int32 __kmpc_global_num_threads(ident_t *loc) {
122   KC_TRACE(10,
123            ("__kmpc_global_num_threads: num_threads = %d\n", __kmp_all_nth));
124 
125   return TCR_4(__kmp_all_nth);
126 }
127 
128 /*!
129 @ingroup THREAD_STATES
130 @param loc Source location information.
131 @return The thread number of the calling thread in the innermost active parallel
132 construct.
133 */
134 kmp_int32 __kmpc_bound_thread_num(ident_t *loc) {
135   KC_TRACE(10, ("__kmpc_bound_thread_num: called\n"));
136   return __kmp_tid_from_gtid(__kmp_entry_gtid());
137 }
138 
139 /*!
140 @ingroup THREAD_STATES
141 @param loc Source location information.
142 @return The number of threads in the innermost active parallel construct.
143 */
144 kmp_int32 __kmpc_bound_num_threads(ident_t *loc) {
145   KC_TRACE(10, ("__kmpc_bound_num_threads: called\n"));
146 
147   return __kmp_entry_thread()->th.th_team->t.t_nproc;
148 }
149 
150 /*!
151  * @ingroup DEPRECATED
152  * @param loc location description
153  *
154  * This function need not be called. It always returns TRUE.
155  */
156 kmp_int32 __kmpc_ok_to_fork(ident_t *loc) {
157 #ifndef KMP_DEBUG
158 
159   return TRUE;
160 
161 #else
162 
163   const char *semi2;
164   const char *semi3;
165   int line_no;
166 
167   if (__kmp_par_range == 0) {
168     return TRUE;
169   }
170   semi2 = loc->psource;
171   if (semi2 == NULL) {
172     return TRUE;
173   }
174   semi2 = strchr(semi2, ';');
175   if (semi2 == NULL) {
176     return TRUE;
177   }
178   semi2 = strchr(semi2 + 1, ';');
179   if (semi2 == NULL) {
180     return TRUE;
181   }
182   if (__kmp_par_range_filename[0]) {
183     const char *name = semi2 - 1;
184     while ((name > loc->psource) && (*name != '/') && (*name != ';')) {
185       name--;
186     }
187     if ((*name == '/') || (*name == ';')) {
188       name++;
189     }
190     if (strncmp(__kmp_par_range_filename, name, semi2 - name)) {
191       return __kmp_par_range < 0;
192     }
193   }
194   semi3 = strchr(semi2 + 1, ';');
195   if (__kmp_par_range_routine[0]) {
196     if ((semi3 != NULL) && (semi3 > semi2) &&
197         (strncmp(__kmp_par_range_routine, semi2 + 1, semi3 - semi2 - 1))) {
198       return __kmp_par_range < 0;
199     }
200   }
201   if (KMP_SSCANF(semi3 + 1, "%d", &line_no) == 1) {
202     if ((line_no >= __kmp_par_range_lb) && (line_no <= __kmp_par_range_ub)) {
203       return __kmp_par_range > 0;
204     }
205     return __kmp_par_range < 0;
206   }
207   return TRUE;
208 
209 #endif /* KMP_DEBUG */
210 }
211 
212 /*!
213 @ingroup THREAD_STATES
214 @param loc Source location information.
215 @return 1 if this thread is executing inside an active parallel region, zero if
216 not.
217 */
218 kmp_int32 __kmpc_in_parallel(ident_t *loc) {
219   return __kmp_entry_thread()->th.th_root->r.r_active;
220 }
221 
222 /*!
223 @ingroup PARALLEL
224 @param loc source location information
225 @param global_tid global thread number
226 @param num_threads number of threads requested for this parallel construct
227 
228 Set the number of threads to be used by the next fork spawned by this thread.
229 This call is only required if the parallel construct has a `num_threads` clause.
230 */
231 void __kmpc_push_num_threads(ident_t *loc, kmp_int32 global_tid,
232                              kmp_int32 num_threads) {
233   KA_TRACE(20, ("__kmpc_push_num_threads: enter T#%d num_threads=%d\n",
234                 global_tid, num_threads));
235   __kmp_assert_valid_gtid(global_tid);
236   __kmp_push_num_threads(loc, global_tid, num_threads);
237 }
238 
239 void __kmpc_pop_num_threads(ident_t *loc, kmp_int32 global_tid) {
240   KA_TRACE(20, ("__kmpc_pop_num_threads: enter\n"));
241   /* the num_threads are automatically popped */
242 }
243 
244 void __kmpc_push_proc_bind(ident_t *loc, kmp_int32 global_tid,
245                            kmp_int32 proc_bind) {
246   KA_TRACE(20, ("__kmpc_push_proc_bind: enter T#%d proc_bind=%d\n", global_tid,
247                 proc_bind));
248   __kmp_assert_valid_gtid(global_tid);
249   __kmp_push_proc_bind(loc, global_tid, (kmp_proc_bind_t)proc_bind);
250 }
251 
252 /*!
253 @ingroup PARALLEL
254 @param loc  source location information
255 @param argc  total number of arguments in the ellipsis
256 @param microtask  pointer to callback routine consisting of outlined parallel
257 construct
258 @param ...  pointers to shared variables that aren't global
259 
260 Do the actual fork and call the microtask in the relevant number of threads.
261 */
262 void __kmpc_fork_call(ident_t *loc, kmp_int32 argc, kmpc_micro microtask, ...) {
263   int gtid = __kmp_entry_gtid();
264 
265 #if (KMP_STATS_ENABLED)
266   // If we were in a serial region, then stop the serial timer, record
267   // the event, and start parallel region timer
268   stats_state_e previous_state = KMP_GET_THREAD_STATE();
269   if (previous_state == stats_state_e::SERIAL_REGION) {
270     KMP_EXCHANGE_PARTITIONED_TIMER(OMP_parallel_overhead);
271   } else {
272     KMP_PUSH_PARTITIONED_TIMER(OMP_parallel_overhead);
273   }
274   int inParallel = __kmpc_in_parallel(loc);
275   if (inParallel) {
276     KMP_COUNT_BLOCK(OMP_NESTED_PARALLEL);
277   } else {
278     KMP_COUNT_BLOCK(OMP_PARALLEL);
279   }
280 #endif
281 
282   // maybe to save thr_state is enough here
283   {
284     va_list ap;
285     va_start(ap, microtask);
286 
287 #if OMPT_SUPPORT
288     ompt_frame_t *ompt_frame;
289     if (ompt_enabled.enabled) {
290       kmp_info_t *master_th = __kmp_threads[gtid];
291       ompt_frame = &master_th->th.th_current_task->ompt_task_info.frame;
292       ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
293     }
294     OMPT_STORE_RETURN_ADDRESS(gtid);
295 #endif
296 
297 #if INCLUDE_SSC_MARKS
298     SSC_MARK_FORKING();
299 #endif
300     __kmp_fork_call(loc, gtid, fork_context_intel, argc,
301                     VOLATILE_CAST(microtask_t) microtask, // "wrapped" task
302                     VOLATILE_CAST(launch_t) __kmp_invoke_task_func,
303                     kmp_va_addr_of(ap));
304 #if INCLUDE_SSC_MARKS
305     SSC_MARK_JOINING();
306 #endif
307     __kmp_join_call(loc, gtid
308 #if OMPT_SUPPORT
309                     ,
310                     fork_context_intel
311 #endif
312     );
313 
314     va_end(ap);
315 
316 #if OMPT_SUPPORT
317     if (ompt_enabled.enabled) {
318       ompt_frame->enter_frame = ompt_data_none;
319     }
320 #endif
321   }
322 
323 #if KMP_STATS_ENABLED
324   if (previous_state == stats_state_e::SERIAL_REGION) {
325     KMP_EXCHANGE_PARTITIONED_TIMER(OMP_serial);
326     KMP_SET_THREAD_STATE(previous_state);
327   } else {
328     KMP_POP_PARTITIONED_TIMER();
329   }
330 #endif // KMP_STATS_ENABLED
331 }
332 
333 /*!
334 @ingroup PARALLEL
335 @param loc source location information
336 @param global_tid global thread number
337 @param num_teams number of teams requested for the teams construct
338 @param num_threads number of threads per team requested for the teams construct
339 
340 Set the number of teams to be used by the teams construct.
341 This call is only required if the teams construct has a `num_teams` clause
342 or a `thread_limit` clause (or both).
343 */
344 void __kmpc_push_num_teams(ident_t *loc, kmp_int32 global_tid,
345                            kmp_int32 num_teams, kmp_int32 num_threads) {
346   KA_TRACE(20,
347            ("__kmpc_push_num_teams: enter T#%d num_teams=%d num_threads=%d\n",
348             global_tid, num_teams, num_threads));
349   __kmp_assert_valid_gtid(global_tid);
350   __kmp_push_num_teams(loc, global_tid, num_teams, num_threads);
351 }
352 
353 /*!
354 @ingroup PARALLEL
355 @param loc source location information
356 @param global_tid global thread number
357 @param num_teams_lb lower bound on number of teams requested for the teams
358 construct
359 @param num_teams_ub upper bound on number of teams requested for the teams
360 construct
361 @param num_threads number of threads per team requested for the teams construct
362 
363 Set the number of teams to be used by the teams construct. The number of initial
364 teams cretaed will be greater than or equal to the lower bound and less than or
365 equal to the upper bound.
366 This call is only required if the teams construct has a `num_teams` clause
367 or a `thread_limit` clause (or both).
368 */
369 void __kmpc_push_num_teams_51(ident_t *loc, kmp_int32 global_tid,
370                               kmp_int32 num_teams_lb, kmp_int32 num_teams_ub,
371                               kmp_int32 num_threads) {
372   KA_TRACE(20, ("__kmpc_push_num_teams_51: enter T#%d num_teams_lb=%d"
373                 " num_teams_ub=%d num_threads=%d\n",
374                 global_tid, num_teams_lb, num_teams_ub, num_threads));
375   __kmp_assert_valid_gtid(global_tid);
376   __kmp_push_num_teams_51(loc, global_tid, num_teams_lb, num_teams_ub,
377                           num_threads);
378 }
379 
380 /*!
381 @ingroup PARALLEL
382 @param loc  source location information
383 @param argc  total number of arguments in the ellipsis
384 @param microtask  pointer to callback routine consisting of outlined teams
385 construct
386 @param ...  pointers to shared variables that aren't global
387 
388 Do the actual fork and call the microtask in the relevant number of threads.
389 */
390 void __kmpc_fork_teams(ident_t *loc, kmp_int32 argc, kmpc_micro microtask,
391                        ...) {
392   int gtid = __kmp_entry_gtid();
393   kmp_info_t *this_thr = __kmp_threads[gtid];
394   va_list ap;
395   va_start(ap, microtask);
396 
397 #if KMP_STATS_ENABLED
398   KMP_COUNT_BLOCK(OMP_TEAMS);
399   stats_state_e previous_state = KMP_GET_THREAD_STATE();
400   if (previous_state == stats_state_e::SERIAL_REGION) {
401     KMP_EXCHANGE_PARTITIONED_TIMER(OMP_teams_overhead);
402   } else {
403     KMP_PUSH_PARTITIONED_TIMER(OMP_teams_overhead);
404   }
405 #endif
406 
407   // remember teams entry point and nesting level
408   this_thr->th.th_teams_microtask = microtask;
409   this_thr->th.th_teams_level =
410       this_thr->th.th_team->t.t_level; // AC: can be >0 on host
411 
412 #if OMPT_SUPPORT
413   kmp_team_t *parent_team = this_thr->th.th_team;
414   int tid = __kmp_tid_from_gtid(gtid);
415   if (ompt_enabled.enabled) {
416     parent_team->t.t_implicit_task_taskdata[tid]
417         .ompt_task_info.frame.enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
418   }
419   OMPT_STORE_RETURN_ADDRESS(gtid);
420 #endif
421 
422   // check if __kmpc_push_num_teams called, set default number of teams
423   // otherwise
424   if (this_thr->th.th_teams_size.nteams == 0) {
425     __kmp_push_num_teams(loc, gtid, 0, 0);
426   }
427   KMP_DEBUG_ASSERT(this_thr->th.th_set_nproc >= 1);
428   KMP_DEBUG_ASSERT(this_thr->th.th_teams_size.nteams >= 1);
429   KMP_DEBUG_ASSERT(this_thr->th.th_teams_size.nth >= 1);
430 
431   __kmp_fork_call(
432       loc, gtid, fork_context_intel, argc,
433       VOLATILE_CAST(microtask_t) __kmp_teams_master, // "wrapped" task
434       VOLATILE_CAST(launch_t) __kmp_invoke_teams_master, kmp_va_addr_of(ap));
435   __kmp_join_call(loc, gtid
436 #if OMPT_SUPPORT
437                   ,
438                   fork_context_intel
439 #endif
440   );
441 
442   // Pop current CG root off list
443   KMP_DEBUG_ASSERT(this_thr->th.th_cg_roots);
444   kmp_cg_root_t *tmp = this_thr->th.th_cg_roots;
445   this_thr->th.th_cg_roots = tmp->up;
446   KA_TRACE(100, ("__kmpc_fork_teams: Thread %p popping node %p and moving up"
447                  " to node %p. cg_nthreads was %d\n",
448                  this_thr, tmp, this_thr->th.th_cg_roots, tmp->cg_nthreads));
449   KMP_DEBUG_ASSERT(tmp->cg_nthreads);
450   int i = tmp->cg_nthreads--;
451   if (i == 1) { // check is we are the last thread in CG (not always the case)
452     __kmp_free(tmp);
453   }
454   // Restore current task's thread_limit from CG root
455   KMP_DEBUG_ASSERT(this_thr->th.th_cg_roots);
456   this_thr->th.th_current_task->td_icvs.thread_limit =
457       this_thr->th.th_cg_roots->cg_thread_limit;
458 
459   this_thr->th.th_teams_microtask = NULL;
460   this_thr->th.th_teams_level = 0;
461   *(kmp_int64 *)(&this_thr->th.th_teams_size) = 0L;
462   va_end(ap);
463 #if KMP_STATS_ENABLED
464   if (previous_state == stats_state_e::SERIAL_REGION) {
465     KMP_EXCHANGE_PARTITIONED_TIMER(OMP_serial);
466     KMP_SET_THREAD_STATE(previous_state);
467   } else {
468     KMP_POP_PARTITIONED_TIMER();
469   }
470 #endif // KMP_STATS_ENABLED
471 }
472 
473 // I don't think this function should ever have been exported.
474 // The __kmpc_ prefix was misapplied.  I'm fairly certain that no generated
475 // openmp code ever called it, but it's been exported from the RTL for so
476 // long that I'm afraid to remove the definition.
477 int __kmpc_invoke_task_func(int gtid) { return __kmp_invoke_task_func(gtid); }
478 
479 /*!
480 @ingroup PARALLEL
481 @param loc  source location information
482 @param global_tid  global thread number
483 
484 Enter a serialized parallel construct. This interface is used to handle a
485 conditional parallel region, like this,
486 @code
487 #pragma omp parallel if (condition)
488 @endcode
489 when the condition is false.
490 */
491 void __kmpc_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
492   // The implementation is now in kmp_runtime.cpp so that it can share static
493   // functions with kmp_fork_call since the tasks to be done are similar in
494   // each case.
495   __kmp_assert_valid_gtid(global_tid);
496 #if OMPT_SUPPORT
497   OMPT_STORE_RETURN_ADDRESS(global_tid);
498 #endif
499   __kmp_serialized_parallel(loc, global_tid);
500 }
501 
502 /*!
503 @ingroup PARALLEL
504 @param loc  source location information
505 @param global_tid  global thread number
506 
507 Leave a serialized parallel construct.
508 */
509 void __kmpc_end_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
510   kmp_internal_control_t *top;
511   kmp_info_t *this_thr;
512   kmp_team_t *serial_team;
513 
514   KC_TRACE(10,
515            ("__kmpc_end_serialized_parallel: called by T#%d\n", global_tid));
516 
517   /* skip all this code for autopar serialized loops since it results in
518      unacceptable overhead */
519   if (loc != NULL && (loc->flags & KMP_IDENT_AUTOPAR))
520     return;
521 
522   // Not autopar code
523   __kmp_assert_valid_gtid(global_tid);
524   if (!TCR_4(__kmp_init_parallel))
525     __kmp_parallel_initialize();
526 
527   __kmp_resume_if_soft_paused();
528 
529   this_thr = __kmp_threads[global_tid];
530   serial_team = this_thr->th.th_serial_team;
531 
532   kmp_task_team_t *task_team = this_thr->th.th_task_team;
533   // we need to wait for the proxy tasks before finishing the thread
534   if (task_team != NULL && (task_team->tt.tt_found_proxy_tasks ||
535                             task_team->tt.tt_hidden_helper_task_encountered))
536     __kmp_task_team_wait(this_thr, serial_team USE_ITT_BUILD_ARG(NULL));
537 
538   KMP_MB();
539   KMP_DEBUG_ASSERT(serial_team);
540   KMP_ASSERT(serial_team->t.t_serialized);
541   KMP_DEBUG_ASSERT(this_thr->th.th_team == serial_team);
542   KMP_DEBUG_ASSERT(serial_team != this_thr->th.th_root->r.r_root_team);
543   KMP_DEBUG_ASSERT(serial_team->t.t_threads);
544   KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
545 
546 #if OMPT_SUPPORT
547   if (ompt_enabled.enabled &&
548       this_thr->th.ompt_thread_info.state != ompt_state_overhead) {
549     OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame = ompt_data_none;
550     if (ompt_enabled.ompt_callback_implicit_task) {
551       ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
552           ompt_scope_end, NULL, OMPT_CUR_TASK_DATA(this_thr), 1,
553           OMPT_CUR_TASK_INFO(this_thr)->thread_num, ompt_task_implicit);
554     }
555 
556     // reset clear the task id only after unlinking the task
557     ompt_data_t *parent_task_data;
558     __ompt_get_task_info_internal(1, NULL, &parent_task_data, NULL, NULL, NULL);
559 
560     if (ompt_enabled.ompt_callback_parallel_end) {
561       ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
562           &(serial_team->t.ompt_team_info.parallel_data), parent_task_data,
563           ompt_parallel_invoker_program | ompt_parallel_team,
564           OMPT_LOAD_RETURN_ADDRESS(global_tid));
565     }
566     __ompt_lw_taskteam_unlink(this_thr);
567     this_thr->th.ompt_thread_info.state = ompt_state_overhead;
568   }
569 #endif
570 
571   /* If necessary, pop the internal control stack values and replace the team
572    * values */
573   top = serial_team->t.t_control_stack_top;
574   if (top && top->serial_nesting_level == serial_team->t.t_serialized) {
575     copy_icvs(&serial_team->t.t_threads[0]->th.th_current_task->td_icvs, top);
576     serial_team->t.t_control_stack_top = top->next;
577     __kmp_free(top);
578   }
579 
580   /* pop dispatch buffers stack */
581   KMP_DEBUG_ASSERT(serial_team->t.t_dispatch->th_disp_buffer);
582   {
583     dispatch_private_info_t *disp_buffer =
584         serial_team->t.t_dispatch->th_disp_buffer;
585     serial_team->t.t_dispatch->th_disp_buffer =
586         serial_team->t.t_dispatch->th_disp_buffer->next;
587     __kmp_free(disp_buffer);
588   }
589   this_thr->th.th_def_allocator = serial_team->t.t_def_allocator; // restore
590 
591   --serial_team->t.t_serialized;
592   if (serial_team->t.t_serialized == 0) {
593 
594     /* return to the parallel section */
595 
596 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
597     if (__kmp_inherit_fp_control && serial_team->t.t_fp_control_saved) {
598       __kmp_clear_x87_fpu_status_word();
599       __kmp_load_x87_fpu_control_word(&serial_team->t.t_x87_fpu_control_word);
600       __kmp_load_mxcsr(&serial_team->t.t_mxcsr);
601     }
602 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
603 
604     __kmp_pop_current_task_from_thread(this_thr);
605 #if OMPD_SUPPORT
606     if (ompd_state & OMPD_ENABLE_BP)
607       ompd_bp_parallel_end();
608 #endif
609 
610     this_thr->th.th_team = serial_team->t.t_parent;
611     this_thr->th.th_info.ds.ds_tid = serial_team->t.t_master_tid;
612 
613     /* restore values cached in the thread */
614     this_thr->th.th_team_nproc = serial_team->t.t_parent->t.t_nproc; /*  JPH */
615     this_thr->th.th_team_master =
616         serial_team->t.t_parent->t.t_threads[0]; /* JPH */
617     this_thr->th.th_team_serialized = this_thr->th.th_team->t.t_serialized;
618 
619     /* TODO the below shouldn't need to be adjusted for serialized teams */
620     this_thr->th.th_dispatch =
621         &this_thr->th.th_team->t.t_dispatch[serial_team->t.t_master_tid];
622 
623     KMP_ASSERT(this_thr->th.th_current_task->td_flags.executing == 0);
624     this_thr->th.th_current_task->td_flags.executing = 1;
625 
626     if (__kmp_tasking_mode != tskm_immediate_exec) {
627       // Copy the task team from the new child / old parent team to the thread.
628       this_thr->th.th_task_team =
629           this_thr->th.th_team->t.t_task_team[this_thr->th.th_task_state];
630       KA_TRACE(20,
631                ("__kmpc_end_serialized_parallel: T#%d restoring task_team %p / "
632                 "team %p\n",
633                 global_tid, this_thr->th.th_task_team, this_thr->th.th_team));
634     }
635 #if KMP_AFFINITY_SUPPORTED
636     if (this_thr->th.th_team->t.t_level == 0 && __kmp_affin_reset) {
637       __kmp_reset_root_init_mask(global_tid);
638     }
639 #endif
640   } else {
641     if (__kmp_tasking_mode != tskm_immediate_exec) {
642       KA_TRACE(20, ("__kmpc_end_serialized_parallel: T#%d decreasing nesting "
643                     "depth of serial team %p to %d\n",
644                     global_tid, serial_team, serial_team->t.t_serialized));
645     }
646   }
647 
648   serial_team->t.t_level--;
649   if (__kmp_env_consistency_check)
650     __kmp_pop_parallel(global_tid, NULL);
651 #if OMPT_SUPPORT
652   if (ompt_enabled.enabled)
653     this_thr->th.ompt_thread_info.state =
654         ((this_thr->th.th_team_serialized) ? ompt_state_work_serial
655                                            : ompt_state_work_parallel);
656 #endif
657 }
658 
659 /*!
660 @ingroup SYNCHRONIZATION
661 @param loc  source location information.
662 
663 Execute <tt>flush</tt>. This is implemented as a full memory fence. (Though
664 depending on the memory ordering convention obeyed by the compiler
665 even that may not be necessary).
666 */
667 void __kmpc_flush(ident_t *loc) {
668   KC_TRACE(10, ("__kmpc_flush: called\n"));
669 
670   /* need explicit __mf() here since use volatile instead in library */
671   KMP_MB(); /* Flush all pending memory write invalidates.  */
672 
673 #if (KMP_ARCH_X86 || KMP_ARCH_X86_64)
674 #if KMP_MIC
675 // fence-style instructions do not exist, but lock; xaddl $0,(%rsp) can be used.
676 // We shouldn't need it, though, since the ABI rules require that
677 // * If the compiler generates NGO stores it also generates the fence
678 // * If users hand-code NGO stores they should insert the fence
679 // therefore no incomplete unordered stores should be visible.
680 #else
681   // C74404
682   // This is to address non-temporal store instructions (sfence needed).
683   // The clflush instruction is addressed either (mfence needed).
684   // Probably the non-temporal load monvtdqa instruction should also be
685   // addressed.
686   // mfence is a SSE2 instruction. Do not execute it if CPU is not SSE2.
687   if (!__kmp_cpuinfo.initialized) {
688     __kmp_query_cpuid(&__kmp_cpuinfo);
689   }
690   if (!__kmp_cpuinfo.flags.sse2) {
691     // CPU cannot execute SSE2 instructions.
692   } else {
693 #if KMP_COMPILER_ICC || KMP_COMPILER_ICX
694     _mm_mfence();
695 #elif KMP_COMPILER_MSVC
696     MemoryBarrier();
697 #else
698     __sync_synchronize();
699 #endif // KMP_COMPILER_ICC || KMP_COMPILER_ICX
700   }
701 #endif // KMP_MIC
702 #elif (KMP_ARCH_ARM || KMP_ARCH_AARCH64 || KMP_ARCH_MIPS || KMP_ARCH_MIPS64 || \
703        KMP_ARCH_RISCV64)
704 // Nothing to see here move along
705 #elif KMP_ARCH_PPC64
706 // Nothing needed here (we have a real MB above).
707 #else
708 #error Unknown or unsupported architecture
709 #endif
710 
711 #if OMPT_SUPPORT && OMPT_OPTIONAL
712   if (ompt_enabled.ompt_callback_flush) {
713     ompt_callbacks.ompt_callback(ompt_callback_flush)(
714         __ompt_get_thread_data_internal(), OMPT_GET_RETURN_ADDRESS(0));
715   }
716 #endif
717 }
718 
719 /* -------------------------------------------------------------------------- */
720 /*!
721 @ingroup SYNCHRONIZATION
722 @param loc source location information
723 @param global_tid thread id.
724 
725 Execute a barrier.
726 */
727 void __kmpc_barrier(ident_t *loc, kmp_int32 global_tid) {
728   KMP_COUNT_BLOCK(OMP_BARRIER);
729   KC_TRACE(10, ("__kmpc_barrier: called T#%d\n", global_tid));
730   __kmp_assert_valid_gtid(global_tid);
731 
732   if (!TCR_4(__kmp_init_parallel))
733     __kmp_parallel_initialize();
734 
735   __kmp_resume_if_soft_paused();
736 
737   if (__kmp_env_consistency_check) {
738     if (loc == 0) {
739       KMP_WARNING(ConstructIdentInvalid); // ??? What does it mean for the user?
740     }
741     __kmp_check_barrier(global_tid, ct_barrier, loc);
742   }
743 
744 #if OMPT_SUPPORT
745   ompt_frame_t *ompt_frame;
746   if (ompt_enabled.enabled) {
747     __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
748     if (ompt_frame->enter_frame.ptr == NULL)
749       ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
750   }
751   OMPT_STORE_RETURN_ADDRESS(global_tid);
752 #endif
753   __kmp_threads[global_tid]->th.th_ident = loc;
754   // TODO: explicit barrier_wait_id:
755   //   this function is called when 'barrier' directive is present or
756   //   implicit barrier at the end of a worksharing construct.
757   // 1) better to add a per-thread barrier counter to a thread data structure
758   // 2) set to 0 when a new team is created
759   // 4) no sync is required
760 
761   __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
762 #if OMPT_SUPPORT && OMPT_OPTIONAL
763   if (ompt_enabled.enabled) {
764     ompt_frame->enter_frame = ompt_data_none;
765   }
766 #endif
767 }
768 
769 /* The BARRIER for a MASTER section is always explicit   */
770 /*!
771 @ingroup WORK_SHARING
772 @param loc  source location information.
773 @param global_tid  global thread number .
774 @return 1 if this thread should execute the <tt>master</tt> block, 0 otherwise.
775 */
776 kmp_int32 __kmpc_master(ident_t *loc, kmp_int32 global_tid) {
777   int status = 0;
778 
779   KC_TRACE(10, ("__kmpc_master: called T#%d\n", global_tid));
780   __kmp_assert_valid_gtid(global_tid);
781 
782   if (!TCR_4(__kmp_init_parallel))
783     __kmp_parallel_initialize();
784 
785   __kmp_resume_if_soft_paused();
786 
787   if (KMP_MASTER_GTID(global_tid)) {
788     KMP_COUNT_BLOCK(OMP_MASTER);
789     KMP_PUSH_PARTITIONED_TIMER(OMP_master);
790     status = 1;
791   }
792 
793 #if OMPT_SUPPORT && OMPT_OPTIONAL
794   if (status) {
795     if (ompt_enabled.ompt_callback_masked) {
796       kmp_info_t *this_thr = __kmp_threads[global_tid];
797       kmp_team_t *team = this_thr->th.th_team;
798 
799       int tid = __kmp_tid_from_gtid(global_tid);
800       ompt_callbacks.ompt_callback(ompt_callback_masked)(
801           ompt_scope_begin, &(team->t.ompt_team_info.parallel_data),
802           &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
803           OMPT_GET_RETURN_ADDRESS(0));
804     }
805   }
806 #endif
807 
808   if (__kmp_env_consistency_check) {
809 #if KMP_USE_DYNAMIC_LOCK
810     if (status)
811       __kmp_push_sync(global_tid, ct_master, loc, NULL, 0);
812     else
813       __kmp_check_sync(global_tid, ct_master, loc, NULL, 0);
814 #else
815     if (status)
816       __kmp_push_sync(global_tid, ct_master, loc, NULL);
817     else
818       __kmp_check_sync(global_tid, ct_master, loc, NULL);
819 #endif
820   }
821 
822   return status;
823 }
824 
825 /*!
826 @ingroup WORK_SHARING
827 @param loc  source location information.
828 @param global_tid  global thread number.
829 
830 Mark the end of a <tt>master</tt> region. This should only be called by the
831 thread that executes the <tt>master</tt> region.

    The routine invokes KMP_POP_PARTITIONED_TIMER(), reports
    <tt>ompt_scope_end</tt> to the OMPT <tt>masked</tt> callback when that
    callback is enabled, and, when consistency checking is on, pops the
    <tt>ct_master</tt> entry for this thread.
832 */
833 void __kmpc_end_master(ident_t *loc, kmp_int32 global_tid) {
834   KC_TRACE(10, ("__kmpc_end_master: called T#%d\n", global_tid));
835   __kmp_assert_valid_gtid(global_tid);
836   KMP_DEBUG_ASSERT(KMP_MASTER_GTID(global_tid));
837   KMP_POP_PARTITIONED_TIMER();
838 
839 #if OMPT_SUPPORT && OMPT_OPTIONAL
840   kmp_info_t *this_thr = __kmp_threads[global_tid];
841   kmp_team_t *team = this_thr->th.th_team;
842   if (ompt_enabled.ompt_callback_masked) {
        // Report scope end with the team's parallel data and the task-data slot
        // selected by the tid that __kmp_tid_from_gtid() returns.
843     int tid = __kmp_tid_from_gtid(global_tid);
844     ompt_callbacks.ompt_callback(ompt_callback_masked)(
845         ompt_scope_end, &(team->t.ompt_team_info.parallel_data),
846         &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
847         OMPT_GET_RETURN_ADDRESS(0));
848   }
849 #endif
850 
851   if (__kmp_env_consistency_check) {
        // Same predicate as the KMP_DEBUG_ASSERT above, re-tested at run time
        // before the ct_master entry is popped.
852     if (KMP_MASTER_GTID(global_tid))
853       __kmp_pop_sync(global_tid, ct_master, loc);
854   }
855 }
856 
857 /*!
858 @ingroup WORK_SHARING
859 @param loc  source location information.
860 @param global_tid  global thread number.
861 @param filter result of evaluating filter clause on thread global_tid, or zero
862 if no filter clause present
863 @return 1 if this thread should execute the <tt>masked</tt> block, 0 otherwise.

    Start of a <tt>masked</tt> region. The calling thread is selected when the
    tid returned by __kmp_tid_from_gtid() for <tt>global_tid</tt> equals
    <tt>filter</tt>. Before the comparison the routine calls
    __kmp_parallel_initialize() if <tt>__kmp_init_parallel</tt> is not yet set and
    then __kmp_resume_if_soft_paused().
864 */
865 kmp_int32 __kmpc_masked(ident_t *loc, kmp_int32 global_tid, kmp_int32 filter) {
866   int status = 0;
867   int tid;
868   KC_TRACE(10, ("__kmpc_masked: called T#%d\n", global_tid));
869   __kmp_assert_valid_gtid(global_tid);
870 
871   if (!TCR_4(__kmp_init_parallel))
872     __kmp_parallel_initialize();
873 
874   __kmp_resume_if_soft_paused();
875 
876   tid = __kmp_tid_from_gtid(global_tid);
877   if (tid == filter) {
        // Selected thread: count the block and push the OMP_masked timer, which
        // __kmpc_end_masked() pops again.
878     KMP_COUNT_BLOCK(OMP_MASKED);
879     KMP_PUSH_PARTITIONED_TIMER(OMP_masked);
880     status = 1;
881   }
882 
883 #if OMPT_SUPPORT && OMPT_OPTIONAL
      // Only the selected thread reports ompt_scope_begin.
884   if (status) {
885     if (ompt_enabled.ompt_callback_masked) {
886       kmp_info_t *this_thr = __kmp_threads[global_tid];
887       kmp_team_t *team = this_thr->th.th_team;
888       ompt_callbacks.ompt_callback(ompt_callback_masked)(
889           ompt_scope_begin, &(team->t.ompt_team_info.parallel_data),
890           &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
891           OMPT_GET_RETURN_ADDRESS(0));
892     }
893   }
894 #endif
895 
      // Consistency checking: the selected thread pushes a ct_masked entry, every
      // other thread only runs the check. The two preprocessor branches differ
      // solely in the extra trailing argument of the dynamic-lock signatures.
896   if (__kmp_env_consistency_check) {
897 #if KMP_USE_DYNAMIC_LOCK
898     if (status)
899       __kmp_push_sync(global_tid, ct_masked, loc, NULL, 0);
900     else
901       __kmp_check_sync(global_tid, ct_masked, loc, NULL, 0);
902 #else
903     if (status)
904       __kmp_push_sync(global_tid, ct_masked, loc, NULL);
905     else
906       __kmp_check_sync(global_tid, ct_masked, loc, NULL);
907 #endif
908   }
909 
910   return status;
911 }
912 
913 /*!
914 @ingroup WORK_SHARING
915 @param loc  source location information.
916 @param global_tid  global thread number.
917 
918 Mark the end of a <tt>masked</tt> region. This should only be called by the
919 thread that executes the <tt>masked</tt> region.

    The routine invokes KMP_POP_PARTITIONED_TIMER(), reports
    <tt>ompt_scope_end</tt> to the OMPT <tt>masked</tt> callback when that
    callback is enabled, and, when consistency checking is on, pops the
    <tt>ct_masked</tt> entry for this thread.
920 */
921 void __kmpc_end_masked(ident_t *loc, kmp_int32 global_tid) {
922   KC_TRACE(10, ("__kmpc_end_masked: called T#%d\n", global_tid));
923   __kmp_assert_valid_gtid(global_tid);
      // Counterpart of the KMP_PUSH_PARTITIONED_TIMER(OMP_masked) in
      // __kmpc_masked().
924   KMP_POP_PARTITIONED_TIMER();
925 
926 #if OMPT_SUPPORT && OMPT_OPTIONAL
927   kmp_info_t *this_thr = __kmp_threads[global_tid];
928   kmp_team_t *team = this_thr->th.th_team;
929   if (ompt_enabled.ompt_callback_masked) {
        // Same arguments as the scope-begin report, with ompt_scope_end.
930     int tid = __kmp_tid_from_gtid(global_tid);
931     ompt_callbacks.ompt_callback(ompt_callback_masked)(
932         ompt_scope_end, &(team->t.ompt_team_info.parallel_data),
933         &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
934         OMPT_GET_RETURN_ADDRESS(0));
935   }
936 #endif
937 
938   if (__kmp_env_consistency_check) {
939     __kmp_pop_sync(global_tid, ct_masked, loc);
940   }
941 }
942 
943 /*!
944 @ingroup WORK_SHARING
945 @param loc  source location information.
946 @param gtid  global thread number.
947 
948 Start execution of an <tt>ordered</tt> construct.

    The entry hook is the thread's dispatch <tt>th_deo_fcn</tt> when that pointer
    is non-zero and __kmp_parallel_deo() otherwise. With OMPT enabled, the thread
    state is set to <tt>ompt_state_wait_ordered</tt> before the hook and to
    <tt>ompt_state_work_parallel</tt> after it, and the <tt>mutex_acquire</tt> /
    <tt>mutex_acquired</tt> callbacks are raised with the address of the team's
    <tt>t_ordered.dt.t_value</tt> as the wait id.
949 */
950 void __kmpc_ordered(ident_t *loc, kmp_int32 gtid) {
951   int cid = 0;
952   kmp_info_t *th;
953   KMP_DEBUG_ASSERT(__kmp_init_serial);
954 
955   KC_TRACE(10, ("__kmpc_ordered: called T#%d\n", gtid));
956   __kmp_assert_valid_gtid(gtid);
957 
958   if (!TCR_4(__kmp_init_parallel))
959     __kmp_parallel_initialize();
960 
961   __kmp_resume_if_soft_paused();
962 
963 #if USE_ITT_BUILD
964   __kmp_itt_ordered_prep(gtid);
965 // TODO: ordered_wait_id
966 #endif /* USE_ITT_BUILD */
967 
968   th = __kmp_threads[gtid];
969 
970 #if OMPT_SUPPORT && OMPT_OPTIONAL
      // NOTE(review): team, lck and codeptr_ra are assigned only inside the
      // ompt_enabled.enabled block below, and lck / codeptr_ra are read again in
      // the second ompt_enabled.enabled block after the hook; this relies on the
      // flag having the same value at both tests.
971   kmp_team_t *team;
972   ompt_wait_id_t lck;
973   void *codeptr_ra;
974   OMPT_STORE_RETURN_ADDRESS(gtid);
975   if (ompt_enabled.enabled) {
976     team = __kmp_team_from_gtid(gtid);
977     lck = (ompt_wait_id_t)(uintptr_t)&team->t.t_ordered.dt.t_value;
978     /* OMPT state update */
979     th->th.ompt_thread_info.wait_id = lck;
980     th->th.ompt_thread_info.state = ompt_state_wait_ordered;
981 
982     /* OMPT event callback */
983     codeptr_ra = OMPT_LOAD_RETURN_ADDRESS(gtid);
984     if (ompt_enabled.ompt_callback_mutex_acquire) {
985       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
986           ompt_mutex_ordered, omp_lock_hint_none, kmp_mutex_impl_spin, lck,
987           codeptr_ra);
988     }
989   }
990 #endif
991 
      // Run the ordered-entry hook: the per-dispatch function when one is
      // installed, the default __kmp_parallel_deo() otherwise.
992   if (th->th.th_dispatch->th_deo_fcn != 0)
993     (*th->th.th_dispatch->th_deo_fcn)(&gtid, &cid, loc);
994   else
995     __kmp_parallel_deo(&gtid, &cid, loc);
996 
997 #if OMPT_SUPPORT && OMPT_OPTIONAL
998   if (ompt_enabled.enabled) {
999     /* OMPT state update */
1000     th->th.ompt_thread_info.state = ompt_state_work_parallel;
1001     th->th.ompt_thread_info.wait_id = 0;
1002 
1003     /* OMPT event callback */
1004     if (ompt_enabled.ompt_callback_mutex_acquired) {
           // lck already has type ompt_wait_id_t; the cast chain below does not
           // change its value.
1005       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
1006           ompt_mutex_ordered, (ompt_wait_id_t)(uintptr_t)lck, codeptr_ra);
1007     }
1008   }
1009 #endif
1010 
1011 #if USE_ITT_BUILD
1012   __kmp_itt_ordered_start(gtid);
1013 #endif /* USE_ITT_BUILD */
1014 }
1015 
1016 /*!
1017 @ingroup WORK_SHARING
1018 @param loc  source location information.
1019 @param gtid  global thread number.
1020 
1021 End execution of an <tt>ordered</tt> construct.

     The exit hook is the thread's dispatch <tt>th_dxo_fcn</tt> when that pointer
     is non-zero and __kmp_parallel_dxo() otherwise. When the OMPT
     <tt>mutex_released</tt> callback is enabled it is raised after the hook, with
     the address of the team's <tt>t_ordered.dt.t_value</tt> as the wait id.
1022 */
1023 void __kmpc_end_ordered(ident_t *loc, kmp_int32 gtid) {
1024   int cid = 0;
1025   kmp_info_t *th;
1026 
1027   KC_TRACE(10, ("__kmpc_end_ordered: called T#%d\n", gtid));
1028   __kmp_assert_valid_gtid(gtid);
1029 
1030 #if USE_ITT_BUILD
1031   __kmp_itt_ordered_end(gtid);
1032 // TODO: ordered_wait_id
1033 #endif /* USE_ITT_BUILD */
1034 
1035   th = __kmp_threads[gtid];
1036 
       // Run the ordered-exit hook: the per-dispatch function when one is
       // installed, the default __kmp_parallel_dxo() otherwise.
1037   if (th->th.th_dispatch->th_dxo_fcn != 0)
1038     (*th->th.th_dispatch->th_dxo_fcn)(&gtid, &cid, loc);
1039   else
1040     __kmp_parallel_dxo(&gtid, &cid, loc);
1041 
1042 #if OMPT_SUPPORT && OMPT_OPTIONAL
1043   OMPT_STORE_RETURN_ADDRESS(gtid);
1044   if (ompt_enabled.ompt_callback_mutex_released) {
         // The wait id is computed the same way as in __kmpc_ordered().
1045     ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
1046         ompt_mutex_ordered,
1047         (ompt_wait_id_t)(uintptr_t)&__kmp_team_from_gtid(gtid)
1048             ->t.t_ordered.dt.t_value,
1049         OMPT_LOAD_RETURN_ADDRESS(gtid));
1050   }
1051 #endif
1052 }
1053 
1054 #if KMP_USE_DYNAMIC_LOCK
1055 
1056 static __forceinline void
1057 __kmp_init_indirect_csptr(kmp_critical_name *crit, ident_t const *loc,
1058                           kmp_int32 gtid, kmp_indirect_locktag_t tag) {
       // Allocates and initializes an indirect lock of kind `tag` for the critical
       // section named by `crit`, records `loc` and the kmp_lf_critical_section
       // flag on it, and then tries to publish it in *crit with a
       // compare-and-store from null. On return *crit is expected to be non-null
       // (checked by the debug assertion at the end).
1059   // Pointer to the allocated indirect lock is written to crit, while indexing
1060   // is ignored.
1061   void *idx;
1062   kmp_indirect_lock_t **lck;
1063   lck = (kmp_indirect_lock_t **)crit;
1064   kmp_indirect_lock_t *ilk = __kmp_allocate_indirect_lock(&idx, gtid, tag);
1065   KMP_I_LOCK_FUNC(ilk, init)(ilk->lock);
1066   KMP_SET_I_LOCK_LOCATION(ilk, loc);
1067   KMP_SET_I_LOCK_FLAGS(ilk, kmp_lf_critical_section);
1068   KA_TRACE(20,
1069            ("__kmp_init_indirect_csptr: initialized indirect lock #%d\n", tag));
1070 #if USE_ITT_BUILD
1071   __kmp_itt_critical_creating(ilk->lock, loc);
1072 #endif
       // Publish the new lock only if *crit is still null.
1073   int status = KMP_COMPARE_AND_STORE_PTR(lck, nullptr, ilk);
1074   if (status == 0) {
         // The store did not happen, so *crit already held a value; the lock
         // allocated above stays unclaimed.
1075 #if USE_ITT_BUILD
1076     __kmp_itt_critical_destroyed(ilk->lock);
1077 #endif
1078     // We don't really need to destroy the unclaimed lock here since it will be
1079     // cleaned up at program exit.
1080     // KMP_D_LOCK_FUNC(&idx, destroy)((kmp_dyna_lock_t *)&idx);
1081   }
1082   KMP_DEBUG_ASSERT(*lck != NULL);
1083 }
1084 
1085 // Fast-path acquire tas lock
     // Expands to a brace-enclosed block (not a do/while(0) statement) that
     // declares its own locals. It first tries once: a relaxed load of lk.poll
     // must equal KMP_LOCK_FREE(tas) and a compare-store with acquire semantics
     // must then install KMP_LOCK_BUSY(gtid + 1, tas). If that fails it loops,
     // yielding (KMP_YIELD(TRUE) when __kmp_nth exceeds __kmp_avail_proc, or
     // __kmp_xproc if that is zero; KMP_YIELD_SPIN(spins) otherwise) and calling
     // __kmp_spin_backoff() before repeating the same load + compare-store test.
     // `lock` and `gtid` are expanded without parentheses, so pass simple
     // expressions.
1086 #define KMP_ACQUIRE_TAS_LOCK(lock, gtid)                                       \
1087   {                                                                            \
1088     kmp_tas_lock_t *l = (kmp_tas_lock_t *)lock;                                \
1089     kmp_int32 tas_free = KMP_LOCK_FREE(tas);                                   \
1090     kmp_int32 tas_busy = KMP_LOCK_BUSY(gtid + 1, tas);                         \
1091     if (KMP_ATOMIC_LD_RLX(&l->lk.poll) != tas_free ||                          \
1092         !__kmp_atomic_compare_store_acq(&l->lk.poll, tas_free, tas_busy)) {    \
1093       kmp_uint32 spins;                                                        \
1094       KMP_FSYNC_PREPARE(l);                                                    \
1095       KMP_INIT_YIELD(spins);                                                   \
1096       kmp_backoff_t backoff = __kmp_spin_backoff_params;                       \
1097       do {                                                                     \
1098         if (TCR_4(__kmp_nth) >                                                 \
1099             (__kmp_avail_proc ? __kmp_avail_proc : __kmp_xproc)) {             \
1100           KMP_YIELD(TRUE);                                                     \
1101         } else {                                                               \
1102           KMP_YIELD_SPIN(spins);                                               \
1103         }                                                                      \
1104         __kmp_spin_backoff(&backoff);                                          \
1105       } while (                                                                \
1106           KMP_ATOMIC_LD_RLX(&l->lk.poll) != tas_free ||                        \
1107           !__kmp_atomic_compare_store_acq(&l->lk.poll, tas_free, tas_busy));   \
1108     }                                                                          \
1109     KMP_FSYNC_ACQUIRED(l);                                                     \
1110   }
1111 
1112 // Fast-path test tas lock
1113 #define KMP_TEST_TAS_LOCK(lock, gtid, rc)                                      \
1114   {                                                                            \
1115     kmp_tas_lock_t *l = (kmp_tas_lock_t *)lock;                                \
1116     kmp_int32 tas_free = KMP_LOCK_FREE(tas);                                   \
1117     kmp_int32 tas_busy = KMP_LOCK_BUSY(gtid + 1, tas);                         \
1118     rc = KMP_ATOMIC_LD_RLX(&l->lk.poll) == tas_free &&                         \
1119          __kmp_atomic_compare_store_acq(&l->lk.poll, tas_free, tas_busy);      \
1120   }
1121 
1122 // Fast-path release tas lock
1123 #define KMP_RELEASE_TAS_LOCK(lock, gtid)                                       \
1124   { KMP_ATOMIC_ST_REL(&((kmp_tas_lock_t *)lock)->lk.poll, KMP_LOCK_FREE(tas)); }
1125 
1126 #if KMP_USE_FUTEX
1127 
1128 #include <sys/syscall.h>
1129 #include <unistd.h>
1130 #ifndef FUTEX_WAIT
1131 #define FUTEX_WAIT 0
1132 #endif
1133 #ifndef FUTEX_WAKE
1134 #define FUTEX_WAKE 1
1135 #endif
1136 
1137 // Fast-path acquire futex lock
1138 #define KMP_ACQUIRE_FUTEX_LOCK(lock, gtid)                                     \
1139   {                                                                            \
1140     kmp_futex_lock_t *ftx = (kmp_futex_lock_t *)lock;                          \
1141     kmp_int32 gtid_code = (gtid + 1) << 1;                                     \
1142     KMP_MB();                                                                  \
1143     KMP_FSYNC_PREPARE(ftx);                                                    \
1144     kmp_int32 poll_val;                                                        \
1145     while ((poll_val = KMP_COMPARE_AND_STORE_RET32(                            \
1146                 &(ftx->lk.poll), KMP_LOCK_FREE(futex),                         \
1147                 KMP_LOCK_BUSY(gtid_code, futex))) != KMP_LOCK_FREE(futex)) {   \
1148       kmp_int32 cond = KMP_LOCK_STRIP(poll_val) & 1;                           \
1149       if (!cond) {                                                             \
1150         if (!KMP_COMPARE_AND_STORE_RET32(&(ftx->lk.poll), poll_val,            \
1151                                          poll_val |                            \
1152                                              KMP_LOCK_BUSY(1, futex))) {       \
1153           continue;                                                            \
1154         }                                                                      \
1155         poll_val |= KMP_LOCK_BUSY(1, futex);                                   \
1156       }                                                                        \
1157       kmp_int32 rc;                                                            \
1158       if ((rc = syscall(__NR_futex, &(ftx->lk.poll), FUTEX_WAIT, poll_val,     \
1159                         NULL, NULL, 0)) != 0) {                                \
1160         continue;                                                              \
1161       }                                                                        \
1162       gtid_code |= 1;                                                          \
1163     }                                                                          \
1164     KMP_FSYNC_ACQUIRED(ftx);                                                   \
1165   }
1166 
1167 // Fast-path test futex lock
1168 #define KMP_TEST_FUTEX_LOCK(lock, gtid, rc)                                    \
1169   {                                                                            \
1170     kmp_futex_lock_t *ftx = (kmp_futex_lock_t *)lock;                          \
1171     if (KMP_COMPARE_AND_STORE_ACQ32(&(ftx->lk.poll), KMP_LOCK_FREE(futex),     \
1172                                     KMP_LOCK_BUSY(gtid + 1 << 1, futex))) {    \
1173       KMP_FSYNC_ACQUIRED(ftx);                                                 \
1174       rc = TRUE;                                                               \
1175     } else {                                                                   \
1176       rc = FALSE;                                                              \
1177     }                                                                          \
1178   }
1179 
1180 // Fast-path release futex lock
1181 #define KMP_RELEASE_FUTEX_LOCK(lock, gtid)                                     \
1182   {                                                                            \
1183     kmp_futex_lock_t *ftx = (kmp_futex_lock_t *)lock;                          \
1184     KMP_MB();                                                                  \
1185     KMP_FSYNC_RELEASING(ftx);                                                  \
1186     kmp_int32 poll_val =                                                       \
1187         KMP_XCHG_FIXED32(&(ftx->lk.poll), KMP_LOCK_FREE(futex));               \
1188     if (KMP_LOCK_STRIP(poll_val) & 1) {                                        \
1189       syscall(__NR_futex, &(ftx->lk.poll), FUTEX_WAKE,                         \
1190               KMP_LOCK_BUSY(1, futex), NULL, NULL, 0);                         \
1191     }                                                                          \
1192     KMP_MB();                                                                  \
1193     KMP_YIELD_OVERSUB();                                                       \
1194   }
1195 
1196 #endif // KMP_USE_FUTEX
1197 
1198 #else // KMP_USE_DYNAMIC_LOCK
1199 
1200 static kmp_user_lock_p __kmp_get_critical_section_ptr(kmp_critical_name *crit,
1201                                                       ident_t const *loc,
1202                                                       kmp_int32 gtid) {
       // Returns the user lock whose pointer is stored at the start of `crit`,
       // creating it on first use. `loc` is recorded on a newly created lock and
       // `gtid` is passed to the lock allocate/free routines.
1203   kmp_user_lock_p *lck_pp = (kmp_user_lock_p *)crit;
1204 
1205   // Because of the double-check, the following load doesn't need to be volatile
1206   kmp_user_lock_p lck = (kmp_user_lock_p)TCR_PTR(*lck_pp);
1207 
1208   if (lck == NULL) {
1209     void *idx;
1210 
1211     // Allocate & initialize the lock.
1212     // Remember alloc'ed locks in table in order to free them in __kmp_cleanup()
1213     lck = __kmp_user_lock_allocate(&idx, gtid, kmp_lf_critical_section);
1214     __kmp_init_user_lock_with_checks(lck);
1215     __kmp_set_user_lock_location(lck, loc);
1216 #if USE_ITT_BUILD
1217     __kmp_itt_critical_creating(lck);
1218 // __kmp_itt_critical_creating() should be called *before* the first usage
1219 // of underlying lock. It is the only place where we can guarantee it. There
1220 // are chances the lock will destroyed with no usage, but it is not a
1221 // problem, because this is not real event seen by user but rather setting
1222 // name for object (lock). See more details in kmp_itt.h.
1223 #endif /* USE_ITT_BUILD */
1224 
1225     // Use a cmpxchg instruction to slam the start of the critical section with
1226     // the lock pointer.  If another thread beat us to it, deallocate the lock,
1227     // and use the lock that the other thread allocated.
1228     int status = KMP_COMPARE_AND_STORE_PTR(lck_pp, 0, lck);
1229 
1230     if (status == 0) {
1231 // Deallocate the lock and reload the value.
1232 #if USE_ITT_BUILD
1233       __kmp_itt_critical_destroyed(lck);
1234 // Let ITT know the lock is destroyed and the same memory location may be reused
1235 // for another purpose.
1236 #endif /* USE_ITT_BUILD */
1237       __kmp_destroy_user_lock_with_checks(lck);
1238       __kmp_user_lock_free(&idx, gtid, lck);
           // Pick up the pointer that was already stored in *lck_pp.
1239       lck = (kmp_user_lock_p)TCR_PTR(*lck_pp);
1240       KMP_DEBUG_ASSERT(lck != NULL);
1241     }
1242   }
1243   return lck;
1244 }
1245 
1246 #endif // KMP_USE_DYNAMIC_LOCK
1247 
1248 /*!
1249 @ingroup WORK_SHARING
1250 @param loc  source location information.
1251 @param global_tid  global thread number.
1252 @param crit identity of the critical section. This could be a pointer to a lock
1253 associated with the critical section, or some other suitably unique value.
1254 
1255 Enter code protected by a `critical` construct.
1256 This function blocks until the executing thread can enter the critical section.
1257 */
1258 void __kmpc_critical(ident_t *loc, kmp_int32 global_tid,
1259                      kmp_critical_name *crit) {
1260 #if KMP_USE_DYNAMIC_LOCK
1261 #if OMPT_SUPPORT && OMPT_OPTIONAL
1262   OMPT_STORE_RETURN_ADDRESS(global_tid);
1263 #endif // OMPT_SUPPORT
1264   __kmpc_critical_with_hint(loc, global_tid, crit, omp_lock_hint_none);
1265 #else
1266   KMP_COUNT_BLOCK(OMP_CRITICAL);
1267 #if OMPT_SUPPORT && OMPT_OPTIONAL
1268   ompt_state_t prev_state = ompt_state_undefined;
1269   ompt_thread_info_t ti;
1270 #endif
1271   kmp_user_lock_p lck;
1272 
1273   KC_TRACE(10, ("__kmpc_critical: called T#%d\n", global_tid));
1274   __kmp_assert_valid_gtid(global_tid);
1275 
1276   // TODO: add THR_OVHD_STATE
1277 
1278   KMP_PUSH_PARTITIONED_TIMER(OMP_critical_wait);
1279   KMP_CHECK_USER_LOCK_INIT();
1280 
1281   if ((__kmp_user_lock_kind == lk_tas) &&
1282       (sizeof(lck->tas.lk.poll) <= OMP_CRITICAL_SIZE)) {
1283     lck = (kmp_user_lock_p)crit;
1284   }
1285 #if KMP_USE_FUTEX
1286   else if ((__kmp_user_lock_kind == lk_futex) &&
1287            (sizeof(lck->futex.lk.poll) <= OMP_CRITICAL_SIZE)) {
1288     lck = (kmp_user_lock_p)crit;
1289   }
1290 #endif
1291   else { // ticket, queuing or drdpa
1292     lck = __kmp_get_critical_section_ptr(crit, loc, global_tid);
1293   }
1294 
1295   if (__kmp_env_consistency_check)
1296     __kmp_push_sync(global_tid, ct_critical, loc, lck);
1297 
1298     // since the critical directive binds to all threads, not just the current
1299     // team we have to check this even if we are in a serialized team.
1300     // also, even if we are the uber thread, we still have to conduct the lock,
1301     // as we have to contend with sibling threads.
1302 
1303 #if USE_ITT_BUILD
1304   __kmp_itt_critical_acquiring(lck);
1305 #endif /* USE_ITT_BUILD */
1306 #if OMPT_SUPPORT && OMPT_OPTIONAL
1307   OMPT_STORE_RETURN_ADDRESS(gtid);
1308   void *codeptr_ra = NULL;
1309   if (ompt_enabled.enabled) {
1310     ti = __kmp_threads[global_tid]->th.ompt_thread_info;
1311     /* OMPT state update */
1312     prev_state = ti.state;
1313     ti.wait_id = (ompt_wait_id_t)(uintptr_t)lck;
1314     ti.state = ompt_state_wait_critical;
1315 
1316     /* OMPT event callback */
1317     codeptr_ra = OMPT_LOAD_RETURN_ADDRESS(gtid);
1318     if (ompt_enabled.ompt_callback_mutex_acquire) {
1319       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
1320           ompt_mutex_critical, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
1321           (ompt_wait_id_t)(uintptr_t)lck, codeptr_ra);
1322     }
1323   }
1324 #endif
1325   // Value of 'crit' should be good for using as a critical_id of the critical
1326   // section directive.
1327   __kmp_acquire_user_lock_with_checks(lck, global_tid);
1328 
1329 #if USE_ITT_BUILD
1330   __kmp_itt_critical_acquired(lck);
1331 #endif /* USE_ITT_BUILD */
1332 #if OMPT_SUPPORT && OMPT_OPTIONAL
1333   if (ompt_enabled.enabled) {
1334     /* OMPT state update */
1335     ti.state = prev_state;
1336     ti.wait_id = 0;
1337 
1338     /* OMPT event callback */
1339     if (ompt_enabled.ompt_callback_mutex_acquired) {
1340       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
1341           ompt_mutex_critical, (ompt_wait_id_t)(uintptr_t)lck, codeptr_ra);
1342     }
1343   }
1344 #endif
1345   KMP_POP_PARTITIONED_TIMER();
1346 
1347   KMP_PUSH_PARTITIONED_TIMER(OMP_critical);
1348   KA_TRACE(15, ("__kmpc_critical: done T#%d\n", global_tid));
1349 #endif // KMP_USE_DYNAMIC_LOCK
1350 }
1351 
1352 #if KMP_USE_DYNAMIC_LOCK
1353 
1354 // Converts the given hint to an internal lock implementation
     // The tests run in a fixed order and the first match wins:
     //  1. kmp_lock_hint_hle, kmp_lock_hint_rtm, kmp_lock_hint_adaptive;
     //  2. contradictory OMP hints (contended + uncontended, or speculative +
     //     nonspeculative) -> __kmp_user_lock_seq;
     //  3. contended -> lockseq_queuing;
     //  4. uncontended without speculative -> lockseq_tas;
     //  5. speculative -> rtm_spin;
     //  6. anything else -> __kmp_user_lock_seq.
     // KMP_TSX_LOCK(seq) yields lockseq_<seq> only when KMP_USE_TSX is set and
     // __kmp_user_lock_seq otherwise; the rtm/adaptive/speculative cases also
     // require KMP_CPUINFO_RTM to be non-zero. Both helper macros are defined
     // inside this function and are not #undef'd within it.
1355 static __forceinline kmp_dyna_lockseq_t __kmp_map_hint_to_lock(uintptr_t hint) {
1356 #if KMP_USE_TSX
1357 #define KMP_TSX_LOCK(seq) lockseq_##seq
1358 #else
1359 #define KMP_TSX_LOCK(seq) __kmp_user_lock_seq
1360 #endif
1361 
1362 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
1363 #define KMP_CPUINFO_RTM (__kmp_cpuinfo.flags.rtm)
1364 #else
1365 #define KMP_CPUINFO_RTM 0
1366 #endif
1367 
1368   // Hints that do not require further logic
1369   if (hint & kmp_lock_hint_hle)
1370     return KMP_TSX_LOCK(hle);
1371   if (hint & kmp_lock_hint_rtm)
1372     return KMP_CPUINFO_RTM ? KMP_TSX_LOCK(rtm_queuing) : __kmp_user_lock_seq;
1373   if (hint & kmp_lock_hint_adaptive)
1374     return KMP_CPUINFO_RTM ? KMP_TSX_LOCK(adaptive) : __kmp_user_lock_seq;
1375 
1376   // Rule out conflicting hints first by returning the default lock
1377   if ((hint & omp_lock_hint_contended) && (hint & omp_lock_hint_uncontended))
1378     return __kmp_user_lock_seq;
1379   if ((hint & omp_lock_hint_speculative) &&
1380       (hint & omp_lock_hint_nonspeculative))
1381     return __kmp_user_lock_seq;
1382 
1383   // Do not even consider speculation when it appears to be contended
1384   if (hint & omp_lock_hint_contended)
1385     return lockseq_queuing;
1386 
1387   // Uncontended lock without speculation
1388   if ((hint & omp_lock_hint_uncontended) && !(hint & omp_lock_hint_speculative))
1389     return lockseq_tas;
1390 
1391   // Use RTM lock for speculation
1392   if (hint & omp_lock_hint_speculative)
1393     return KMP_CPUINFO_RTM ? KMP_TSX_LOCK(rtm_spin) : __kmp_user_lock_seq;
1394 
1395   return __kmp_user_lock_seq;
1396 }
1397 
1398 #if OMPT_SUPPORT && OMPT_OPTIONAL
1399 #if KMP_USE_DYNAMIC_LOCK
1400 static kmp_mutex_impl_t
1401 __ompt_get_mutex_impl_type(void *user_lock, kmp_indirect_lock_t *ilock = 0) {
       // Maps a lock to the kmp_mutex_impl_t value reported to OMPT callbacks.
       // Pass either `user_lock` (the lock word; `ilock` is then looked up when
       // the direct tag is 0) or a null `user_lock` together with a non-null
       // `ilock`. KMP_ASSERT fires if no indirect lock is available when one is
       // needed.
1402   if (user_lock) {
         // Direct locks are classified from the tag extracted from the lock word.
1403     switch (KMP_EXTRACT_D_TAG(user_lock)) {
1404     case 0:
           // Tag 0: not a direct lock; fall out of the switch to the lookup below.
1405       break;
1406 #if KMP_USE_FUTEX
1407     case locktag_futex:
1408       return kmp_mutex_impl_queuing;
1409 #endif
1410     case locktag_tas:
1411       return kmp_mutex_impl_spin;
1412 #if KMP_USE_TSX
1413     case locktag_hle:
1414     case locktag_rtm_spin:
1415       return kmp_mutex_impl_speculative;
1416 #endif
1417     default:
1418       return kmp_mutex_impl_none;
1419     }
1420     ilock = KMP_LOOKUP_I_LOCK(user_lock);
1421   }
1422   KMP_ASSERT(ilock);
       // Indirect locks are classified from their recorded type.
1423   switch (ilock->type) {
1424 #if KMP_USE_TSX
1425   case locktag_adaptive:
1426   case locktag_rtm_queuing:
1427     return kmp_mutex_impl_speculative;
1428 #endif
1429   case locktag_nested_tas:
1430     return kmp_mutex_impl_spin;
1431 #if KMP_USE_FUTEX
1432   case locktag_nested_futex:
1433 #endif
1434   case locktag_ticket:
1435   case locktag_queuing:
1436   case locktag_drdpa:
1437   case locktag_nested_ticket:
1438   case locktag_nested_queuing:
1439   case locktag_nested_drdpa:
1440     return kmp_mutex_impl_queuing;
1441   default:
1442     return kmp_mutex_impl_none;
1443   }
1444 }
1445 #else
1446 // For locks without dynamic binding
     // Maps the global __kmp_user_lock_kind to the kmp_mutex_impl_t value
     // reported to OMPT callbacks: tas -> spin; futex (when available), ticket,
     // queuing and drdpa -> queuing; the TSX kinds -> speculative; anything else
     // -> none.
     // NOTE(review): this definition sits in the #else branch of a
     // KMP_USE_DYNAMIC_LOCK test that is itself nested inside an enclosing
     // `#if KMP_USE_DYNAMIC_LOCK` region, so as laid out here it looks
     // unreachable - confirm against the non-dynamic-lock build.
1447 static kmp_mutex_impl_t __ompt_get_mutex_impl_type() {
1448   switch (__kmp_user_lock_kind) {
1449   case lk_tas:
1450     return kmp_mutex_impl_spin;
1451 #if KMP_USE_FUTEX
1452   case lk_futex:
1453 #endif
1454   case lk_ticket:
1455   case lk_queuing:
1456   case lk_drdpa:
1457     return kmp_mutex_impl_queuing;
1458 #if KMP_USE_TSX
1459   case lk_hle:
1460   case lk_rtm_queuing:
1461   case lk_rtm_spin:
1462   case lk_adaptive:
1463     return kmp_mutex_impl_speculative;
1464 #endif
1465   default:
1466     return kmp_mutex_impl_none;
1467   }
1468 }
1469 #endif // KMP_USE_DYNAMIC_LOCK
1470 #endif // OMPT_SUPPORT && OMPT_OPTIONAL
1471 
1472 /*!
1473 @ingroup WORK_SHARING
1474 @param loc  source location information.
1475 @param global_tid  global thread number.
1476 @param crit identity of the critical section. This could be a pointer to a lock
1477 associated with the critical section, or some other suitably unique value.
1478 @param hint the lock hint.
1479 
1480 Enter code protected by a `critical` construct with a hint. The hint value is
1481 used to suggest a lock implementation. This function blocks until the executing
1482 thread can enter the critical section unless the hint suggests use of
1483 speculative execution and the hardware supports it.

     On first use (<tt>*crit</tt> reads as zero) the lock is set up in place: a
     direct lock sequence stores its tag into <tt>crit</tt>, any other sequence
     gets an indirect lock through __kmp_init_indirect_csptr(). The acquire path
     is then chosen from the direct tag found in <tt>crit</tt>.
1484 */
1485 void __kmpc_critical_with_hint(ident_t *loc, kmp_int32 global_tid,
1486                                kmp_critical_name *crit, uint32_t hint) {
1487   KMP_COUNT_BLOCK(OMP_CRITICAL);
1488   kmp_user_lock_p lck;
1489 #if OMPT_SUPPORT && OMPT_OPTIONAL
1490   ompt_state_t prev_state = ompt_state_undefined;
1491   ompt_thread_info_t ti;
1492   // This is the case, if called from __kmpc_critical:
1493   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(global_tid);
       // No stored address: use this call's own return address instead.
1494   if (!codeptr)
1495     codeptr = OMPT_GET_RETURN_ADDRESS(0);
1496 #endif
1497 
       // The trace text names __kmpc_critical, the routine that forwards here.
1498   KC_TRACE(10, ("__kmpc_critical: called T#%d\n", global_tid));
1499   __kmp_assert_valid_gtid(global_tid);
1500 
1501   kmp_dyna_lock_t *lk = (kmp_dyna_lock_t *)crit;
1502   // Check if it is initialized.
1503   KMP_PUSH_PARTITIONED_TIMER(OMP_critical_wait);
1504   kmp_dyna_lockseq_t lockseq = __kmp_map_hint_to_lock(hint);
1505   if (*lk == 0) {
1506     if (KMP_IS_D_LOCK(lockseq)) {
           // Direct lock: install the tag only if the word is still 0; the
           // result of the compare-and-store is not inspected.
1507       KMP_COMPARE_AND_STORE_ACQ32((volatile kmp_int32 *)crit, 0,
1508                                   KMP_GET_D_TAG(lockseq));
1509     } else {
1510       __kmp_init_indirect_csptr(crit, loc, global_tid, KMP_GET_I_TAG(lockseq));
1511     }
1512   }
1513   // Branch for accessing the actual lock object and set operation. This
1514   // branching is inevitable since this lock initialization does not follow the
1515   // normal dispatch path (lock table is not used).
1516   if (KMP_EXTRACT_D_TAG(lk) != 0) {
         // Direct lock: the lock object is the critical name itself.
1517     lck = (kmp_user_lock_p)lk;
1518     if (__kmp_env_consistency_check) {
1519       __kmp_push_sync(global_tid, ct_critical, loc, lck,
1520                       __kmp_map_hint_to_lock(hint));
1521     }
1522 #if USE_ITT_BUILD
1523     __kmp_itt_critical_acquiring(lck);
1524 #endif
1525 #if OMPT_SUPPORT && OMPT_OPTIONAL
1526     if (ompt_enabled.enabled) {
           // NOTE(review): `ti` is a by-value copy of the thread's
           // ompt_thread_info, so the state/wait_id writes below land in the
           // local copy - confirm this is intended.
1527       ti = __kmp_threads[global_tid]->th.ompt_thread_info;
1528       /* OMPT state update */
1529       prev_state = ti.state;
1530       ti.wait_id = (ompt_wait_id_t)(uintptr_t)lck;
1531       ti.state = ompt_state_wait_critical;
1532 
1533       /* OMPT event callback */
1534       if (ompt_enabled.ompt_callback_mutex_acquire) {
1535         ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
1536             ompt_mutex_critical, (unsigned int)hint,
1537             __ompt_get_mutex_impl_type(crit), (ompt_wait_id_t)(uintptr_t)lck,
1538             codeptr);
1539       }
1540     }
1541 #endif
         // At most one inlined fast path is compiled in (#if / #elif); it is
         // bypassed when consistency checking is on, in which case the generic
         // direct-lock `set` function is used.
1542 #if KMP_USE_INLINED_TAS
1543     if (lockseq == lockseq_tas && !__kmp_env_consistency_check) {
1544       KMP_ACQUIRE_TAS_LOCK(lck, global_tid);
1545     } else
1546 #elif KMP_USE_INLINED_FUTEX
1547     if (lockseq == lockseq_futex && !__kmp_env_consistency_check) {
1548       KMP_ACQUIRE_FUTEX_LOCK(lck, global_tid);
1549     } else
1550 #endif
1551     {
1552       KMP_D_LOCK_FUNC(lk, set)(lk, global_tid);
1553     }
1554   } else {
         // Indirect lock: the critical name holds a pointer to the lock record.
1555     kmp_indirect_lock_t *ilk = *((kmp_indirect_lock_t **)lk);
1556     lck = ilk->lock;
1557     if (__kmp_env_consistency_check) {
1558       __kmp_push_sync(global_tid, ct_critical, loc, lck,
1559                       __kmp_map_hint_to_lock(hint));
1560     }
1561 #if USE_ITT_BUILD
1562     __kmp_itt_critical_acquiring(lck);
1563 #endif
1564 #if OMPT_SUPPORT && OMPT_OPTIONAL
1565     if (ompt_enabled.enabled) {
           // Same local-copy pattern as in the direct-lock branch above.
1566       ti = __kmp_threads[global_tid]->th.ompt_thread_info;
1567       /* OMPT state update */
1568       prev_state = ti.state;
1569       ti.wait_id = (ompt_wait_id_t)(uintptr_t)lck;
1570       ti.state = ompt_state_wait_critical;
1571 
1572       /* OMPT event callback */
1573       if (ompt_enabled.ompt_callback_mutex_acquire) {
1574         ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
1575             ompt_mutex_critical, (unsigned int)hint,
1576             __ompt_get_mutex_impl_type(0, ilk), (ompt_wait_id_t)(uintptr_t)lck,
1577             codeptr);
1578       }
1579     }
1580 #endif
1581     KMP_I_LOCK_FUNC(ilk, set)(lck, global_tid);
1582   }
       // The lock is held from here on: end the wait timer.
1583   KMP_POP_PARTITIONED_TIMER();
1584 
1585 #if USE_ITT_BUILD
1586   __kmp_itt_critical_acquired(lck);
1587 #endif /* USE_ITT_BUILD */
1588 #if OMPT_SUPPORT && OMPT_OPTIONAL
1589   if (ompt_enabled.enabled) {
1590     /* OMPT state update */
1591     ti.state = prev_state;
1592     ti.wait_id = 0;
1593 
1594     /* OMPT event callback */
1595     if (ompt_enabled.ompt_callback_mutex_acquired) {
1596       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
1597           ompt_mutex_critical, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
1598     }
1599   }
1600 #endif
1601 
1602   KMP_PUSH_PARTITIONED_TIMER(OMP_critical);
1603   KA_TRACE(15, ("__kmpc_critical: done T#%d\n", global_tid));
1604 } // __kmpc_critical_with_hint
1605 
1606 #endif // KMP_USE_DYNAMIC_LOCK
1607 
1608 /*!
1609 @ingroup WORK_SHARING
1610 @param loc  source location information.
1611 @param global_tid  global thread number .
1612 @param crit identity of the critical section. This could be a pointer to a lock
1613 associated with the critical section, or some other suitably unique value.
1614 
1615 Leave a critical section, releasing any lock that was held during its execution.
1616 */
1617 void __kmpc_end_critical(ident_t *loc, kmp_int32 global_tid,
1618                          kmp_critical_name *crit) {
1619   kmp_user_lock_p lck;
1620 
1621   KC_TRACE(10, ("__kmpc_end_critical: called T#%d\n", global_tid));
1622 
1623 #if KMP_USE_DYNAMIC_LOCK
1624   int locktag = KMP_EXTRACT_D_TAG(crit);
1625   if (locktag) {
1626     lck = (kmp_user_lock_p)crit;
1627     KMP_ASSERT(lck != NULL);
1628     if (__kmp_env_consistency_check) {
1629       __kmp_pop_sync(global_tid, ct_critical, loc);
1630     }
1631 #if USE_ITT_BUILD
1632     __kmp_itt_critical_releasing(lck);
1633 #endif
1634 #if KMP_USE_INLINED_TAS
1635     if (locktag == locktag_tas && !__kmp_env_consistency_check) {
1636       KMP_RELEASE_TAS_LOCK(lck, global_tid);
1637     } else
1638 #elif KMP_USE_INLINED_FUTEX
1639     if (locktag == locktag_futex && !__kmp_env_consistency_check) {
1640       KMP_RELEASE_FUTEX_LOCK(lck, global_tid);
1641     } else
1642 #endif
1643     {
1644       KMP_D_LOCK_FUNC(lck, unset)((kmp_dyna_lock_t *)lck, global_tid);
1645     }
1646   } else {
1647     kmp_indirect_lock_t *ilk =
1648         (kmp_indirect_lock_t *)TCR_PTR(*((kmp_indirect_lock_t **)crit));
1649     KMP_ASSERT(ilk != NULL);
1650     lck = ilk->lock;
1651     if (__kmp_env_consistency_check) {
1652       __kmp_pop_sync(global_tid, ct_critical, loc);
1653     }
1654 #if USE_ITT_BUILD
1655     __kmp_itt_critical_releasing(lck);
1656 #endif
1657     KMP_I_LOCK_FUNC(ilk, unset)(lck, global_tid);
1658   }
1659 
1660 #else // KMP_USE_DYNAMIC_LOCK
1661 
1662   if ((__kmp_user_lock_kind == lk_tas) &&
1663       (sizeof(lck->tas.lk.poll) <= OMP_CRITICAL_SIZE)) {
1664     lck = (kmp_user_lock_p)crit;
1665   }
1666 #if KMP_USE_FUTEX
1667   else if ((__kmp_user_lock_kind == lk_futex) &&
1668            (sizeof(lck->futex.lk.poll) <= OMP_CRITICAL_SIZE)) {
1669     lck = (kmp_user_lock_p)crit;
1670   }
1671 #endif
1672   else { // ticket, queuing or drdpa
1673     lck = (kmp_user_lock_p)TCR_PTR(*((kmp_user_lock_p *)crit));
1674   }
1675 
1676   KMP_ASSERT(lck != NULL);
1677 
1678   if (__kmp_env_consistency_check)
1679     __kmp_pop_sync(global_tid, ct_critical, loc);
1680 
1681 #if USE_ITT_BUILD
1682   __kmp_itt_critical_releasing(lck);
1683 #endif /* USE_ITT_BUILD */
1684   // Value of 'crit' should be good for using as a critical_id of the critical
1685   // section directive.
1686   __kmp_release_user_lock_with_checks(lck, global_tid);
1687 
1688 #endif // KMP_USE_DYNAMIC_LOCK
1689 
1690 #if OMPT_SUPPORT && OMPT_OPTIONAL
1691   /* OMPT release event triggers after lock is released; place here to trigger
1692    * for all #if branches */
1693   OMPT_STORE_RETURN_ADDRESS(global_tid);
1694   if (ompt_enabled.ompt_callback_mutex_released) {
1695     ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
1696         ompt_mutex_critical, (ompt_wait_id_t)(uintptr_t)lck,
1697         OMPT_LOAD_RETURN_ADDRESS(0));
1698   }
1699 #endif
1700 
1701   KMP_POP_PARTITIONED_TIMER();
1702   KA_TRACE(15, ("__kmpc_end_critical: done T#%d\n", global_tid));
1703 }
1704 
1705 /*!
1706 @ingroup SYNCHRONIZATION
1707 @param loc source location information
1708 @param global_tid thread id.
1709 @return one if the thread should execute the master block, zero otherwise
1710 
1711 Start execution of a combined barrier and master. The barrier is executed inside
1712 this function.
1713 */
1714 kmp_int32 __kmpc_barrier_master(ident_t *loc, kmp_int32 global_tid) {
1715   int status;
1716   KC_TRACE(10, ("__kmpc_barrier_master: called T#%d\n", global_tid));
1717   __kmp_assert_valid_gtid(global_tid);
1718 
1719   if (!TCR_4(__kmp_init_parallel))
1720     __kmp_parallel_initialize();
1721 
1722   __kmp_resume_if_soft_paused();
1723 
1724   if (__kmp_env_consistency_check)
1725     __kmp_check_barrier(global_tid, ct_barrier, loc);
1726 
1727 #if OMPT_SUPPORT
1728   ompt_frame_t *ompt_frame;
1729   if (ompt_enabled.enabled) {
1730     __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
1731     if (ompt_frame->enter_frame.ptr == NULL)
1732       ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1733   }
1734   OMPT_STORE_RETURN_ADDRESS(global_tid);
1735 #endif
1736 #if USE_ITT_NOTIFY
1737   __kmp_threads[global_tid]->th.th_ident = loc;
1738 #endif
1739   status = __kmp_barrier(bs_plain_barrier, global_tid, TRUE, 0, NULL, NULL);
1740 #if OMPT_SUPPORT && OMPT_OPTIONAL
1741   if (ompt_enabled.enabled) {
1742     ompt_frame->enter_frame = ompt_data_none;
1743   }
1744 #endif
1745 
1746   return (status != 0) ? 0 : 1;
1747 }
1748 
1749 /*!
1750 @ingroup SYNCHRONIZATION
1751 @param loc source location information
1752 @param global_tid thread id.
1753 
1754 Complete the execution of a combined barrier and master. This function should
1755 only be called at the completion of the <tt>master</tt> code. Other threads will
1756 still be waiting at the barrier and this call releases them.
1757 */
1758 void __kmpc_end_barrier_master(ident_t *loc, kmp_int32 global_tid) {
1759   KC_TRACE(10, ("__kmpc_end_barrier_master: called T#%d\n", global_tid));
1760   __kmp_assert_valid_gtid(global_tid);
1761   __kmp_end_split_barrier(bs_plain_barrier, global_tid);
1762 }
1763 
1764 /*!
1765 @ingroup SYNCHRONIZATION
1766 @param loc source location information
1767 @param global_tid thread id.
1768 @return one if the thread should execute the master block, zero otherwise
1769 
1770 Start execution of a combined barrier and master(nowait) construct.
1771 The barrier is executed inside this function.
1772 There is no equivalent "end" function, since the
1773 */
1774 kmp_int32 __kmpc_barrier_master_nowait(ident_t *loc, kmp_int32 global_tid) {
1775   kmp_int32 ret;
1776   KC_TRACE(10, ("__kmpc_barrier_master_nowait: called T#%d\n", global_tid));
1777   __kmp_assert_valid_gtid(global_tid);
1778 
1779   if (!TCR_4(__kmp_init_parallel))
1780     __kmp_parallel_initialize();
1781 
1782   __kmp_resume_if_soft_paused();
1783 
1784   if (__kmp_env_consistency_check) {
1785     if (loc == 0) {
1786       KMP_WARNING(ConstructIdentInvalid); // ??? What does it mean for the user?
1787     }
1788     __kmp_check_barrier(global_tid, ct_barrier, loc);
1789   }
1790 
1791 #if OMPT_SUPPORT
1792   ompt_frame_t *ompt_frame;
1793   if (ompt_enabled.enabled) {
1794     __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
1795     if (ompt_frame->enter_frame.ptr == NULL)
1796       ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1797   }
1798   OMPT_STORE_RETURN_ADDRESS(global_tid);
1799 #endif
1800 #if USE_ITT_NOTIFY
1801   __kmp_threads[global_tid]->th.th_ident = loc;
1802 #endif
1803   __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
1804 #if OMPT_SUPPORT && OMPT_OPTIONAL
1805   if (ompt_enabled.enabled) {
1806     ompt_frame->enter_frame = ompt_data_none;
1807   }
1808 #endif
1809 
1810   ret = __kmpc_master(loc, global_tid);
1811 
1812   if (__kmp_env_consistency_check) {
1813     /*  there's no __kmpc_end_master called; so the (stats) */
1814     /*  actions of __kmpc_end_master are done here          */
1815     if (ret) {
1816       /* only one thread should do the pop since only */
1817       /* one did the push (see __kmpc_master())       */
1818       __kmp_pop_sync(global_tid, ct_master, loc);
1819     }
1820   }
1821 
1822   return (ret);
1823 }
1824 
1825 /* The BARRIER for a SINGLE process section is always explicit   */
1826 /*!
1827 @ingroup WORK_SHARING
1828 @param loc  source location information
1829 @param global_tid  global thread number
1830 @return One if this thread should execute the single construct, zero otherwise.
1831 
1832 Test whether to execute a <tt>single</tt> construct.
1833 There are no implicit barriers in the two "single" calls, rather the compiler
1834 should introduce an explicit barrier if it is required.
1835 */
1836 
1837 kmp_int32 __kmpc_single(ident_t *loc, kmp_int32 global_tid) {
1838   __kmp_assert_valid_gtid(global_tid);
1839   kmp_int32 rc = __kmp_enter_single(global_tid, loc, TRUE);
1840 
1841   if (rc) {
1842     // We are going to execute the single statement, so we should count it.
1843     KMP_COUNT_BLOCK(OMP_SINGLE);
1844     KMP_PUSH_PARTITIONED_TIMER(OMP_single);
1845   }
1846 
1847 #if OMPT_SUPPORT && OMPT_OPTIONAL
1848   kmp_info_t *this_thr = __kmp_threads[global_tid];
1849   kmp_team_t *team = this_thr->th.th_team;
1850   int tid = __kmp_tid_from_gtid(global_tid);
1851 
1852   if (ompt_enabled.enabled) {
1853     if (rc) {
1854       if (ompt_enabled.ompt_callback_work) {
1855         ompt_callbacks.ompt_callback(ompt_callback_work)(
1856             ompt_work_single_executor, ompt_scope_begin,
1857             &(team->t.ompt_team_info.parallel_data),
1858             &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
1859             1, OMPT_GET_RETURN_ADDRESS(0));
1860       }
1861     } else {
1862       if (ompt_enabled.ompt_callback_work) {
1863         ompt_callbacks.ompt_callback(ompt_callback_work)(
1864             ompt_work_single_other, ompt_scope_begin,
1865             &(team->t.ompt_team_info.parallel_data),
1866             &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
1867             1, OMPT_GET_RETURN_ADDRESS(0));
1868         ompt_callbacks.ompt_callback(ompt_callback_work)(
1869             ompt_work_single_other, ompt_scope_end,
1870             &(team->t.ompt_team_info.parallel_data),
1871             &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
1872             1, OMPT_GET_RETURN_ADDRESS(0));
1873       }
1874     }
1875   }
1876 #endif
1877 
1878   return rc;
1879 }
1880 
1881 /*!
1882 @ingroup WORK_SHARING
1883 @param loc  source location information
1884 @param global_tid  global thread number
1885 
1886 Mark the end of a <tt>single</tt> construct.  This function should
1887 only be called by the thread that executed the block of code protected
1888 by the `single` construct.
1889 */
1890 void __kmpc_end_single(ident_t *loc, kmp_int32 global_tid) {
1891   __kmp_assert_valid_gtid(global_tid);
1892   __kmp_exit_single(global_tid);
1893   KMP_POP_PARTITIONED_TIMER();
1894 
1895 #if OMPT_SUPPORT && OMPT_OPTIONAL
1896   kmp_info_t *this_thr = __kmp_threads[global_tid];
1897   kmp_team_t *team = this_thr->th.th_team;
1898   int tid = __kmp_tid_from_gtid(global_tid);
1899 
1900   if (ompt_enabled.ompt_callback_work) {
1901     ompt_callbacks.ompt_callback(ompt_callback_work)(
1902         ompt_work_single_executor, ompt_scope_end,
1903         &(team->t.ompt_team_info.parallel_data),
1904         &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data), 1,
1905         OMPT_GET_RETURN_ADDRESS(0));
1906   }
1907 #endif
1908 }
1909 
1910 /*!
1911 @ingroup WORK_SHARING
1912 @param loc Source location
1913 @param global_tid Global thread id
1914 
1915 Mark the end of a statically scheduled loop.
1916 */
1917 void __kmpc_for_static_fini(ident_t *loc, kmp_int32 global_tid) {
1918   KMP_POP_PARTITIONED_TIMER();
1919   KE_TRACE(10, ("__kmpc_for_static_fini called T#%d\n", global_tid));
1920 
1921 #if OMPT_SUPPORT && OMPT_OPTIONAL
1922   if (ompt_enabled.ompt_callback_work) {
1923     ompt_work_t ompt_work_type = ompt_work_loop;
1924     ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
1925     ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
1926     // Determine workshare type
1927     if (loc != NULL) {
1928       if ((loc->flags & KMP_IDENT_WORK_LOOP) != 0) {
1929         ompt_work_type = ompt_work_loop;
1930       } else if ((loc->flags & KMP_IDENT_WORK_SECTIONS) != 0) {
1931         ompt_work_type = ompt_work_sections;
1932       } else if ((loc->flags & KMP_IDENT_WORK_DISTRIBUTE) != 0) {
1933         ompt_work_type = ompt_work_distribute;
1934       } else {
1935         // use default set above.
1936         // a warning about this case is provided in __kmpc_for_static_init
1937       }
1938       KMP_DEBUG_ASSERT(ompt_work_type);
1939     }
1940     ompt_callbacks.ompt_callback(ompt_callback_work)(
1941         ompt_work_type, ompt_scope_end, &(team_info->parallel_data),
1942         &(task_info->task_data), 0, OMPT_GET_RETURN_ADDRESS(0));
1943   }
1944 #endif
1945   if (__kmp_env_consistency_check)
1946     __kmp_pop_workshare(global_tid, ct_pdo, loc);
1947 }
1948 
1949 // User routines which take C-style arguments (call by value)
1950 // different from the Fortran equivalent routines
1951 
1952 void ompc_set_num_threads(int arg) {
1953   // !!!!! TODO: check the per-task binding
1954   __kmp_set_num_threads(arg, __kmp_entry_gtid());
1955 }
1956 
1957 void ompc_set_dynamic(int flag) {
1958   kmp_info_t *thread;
1959 
1960   /* For the thread-private implementation of the internal controls */
1961   thread = __kmp_entry_thread();
1962 
1963   __kmp_save_internal_controls(thread);
1964 
1965   set__dynamic(thread, flag ? true : false);
1966 }
1967 
1968 void ompc_set_nested(int flag) {
1969   kmp_info_t *thread;
1970 
1971   /* For the thread-private internal controls implementation */
1972   thread = __kmp_entry_thread();
1973 
1974   __kmp_save_internal_controls(thread);
1975 
1976   set__max_active_levels(thread, flag ? __kmp_dflt_max_active_levels : 1);
1977 }
1978 
1979 void ompc_set_max_active_levels(int max_active_levels) {
1980   /* TO DO */
1981   /* we want per-task implementation of this internal control */
1982 
1983   /* For the per-thread internal controls implementation */
1984   __kmp_set_max_active_levels(__kmp_entry_gtid(), max_active_levels);
1985 }
1986 
1987 void ompc_set_schedule(omp_sched_t kind, int modifier) {
1988   // !!!!! TODO: check the per-task binding
1989   __kmp_set_schedule(__kmp_entry_gtid(), (kmp_sched_t)kind, modifier);
1990 }
1991 
1992 int ompc_get_ancestor_thread_num(int level) {
1993   return __kmp_get_ancestor_thread_num(__kmp_entry_gtid(), level);
1994 }
1995 
1996 int ompc_get_team_size(int level) {
1997   return __kmp_get_team_size(__kmp_entry_gtid(), level);
1998 }
1999 
2000 /* OpenMP 5.0 Affinity Format API */
2001 void KMP_EXPAND_NAME(ompc_set_affinity_format)(char const *format) {
2002   if (!__kmp_init_serial) {
2003     __kmp_serial_initialize();
2004   }
2005   __kmp_strncpy_truncate(__kmp_affinity_format, KMP_AFFINITY_FORMAT_SIZE,
2006                          format, KMP_STRLEN(format) + 1);
2007 }
2008 
2009 size_t KMP_EXPAND_NAME(ompc_get_affinity_format)(char *buffer, size_t size) {
2010   size_t format_size;
2011   if (!__kmp_init_serial) {
2012     __kmp_serial_initialize();
2013   }
2014   format_size = KMP_STRLEN(__kmp_affinity_format);
2015   if (buffer && size) {
2016     __kmp_strncpy_truncate(buffer, size, __kmp_affinity_format,
2017                            format_size + 1);
2018   }
2019   return format_size;
2020 }
2021 
2022 void KMP_EXPAND_NAME(ompc_display_affinity)(char const *format) {
2023   int gtid;
2024   if (!TCR_4(__kmp_init_middle)) {
2025     __kmp_middle_initialize();
2026   }
2027   __kmp_assign_root_init_mask();
2028   gtid = __kmp_get_gtid();
2029 #if KMP_AFFINITY_SUPPORTED
2030   if (__kmp_threads[gtid]->th.th_team->t.t_level == 0 && __kmp_affin_reset) {
2031     __kmp_reset_root_init_mask(gtid);
2032   }
2033 #endif
2034   __kmp_aux_display_affinity(gtid, format);
2035 }
2036 
2037 size_t KMP_EXPAND_NAME(ompc_capture_affinity)(char *buffer, size_t buf_size,
2038                                               char const *format) {
2039   int gtid;
2040   size_t num_required;
2041   kmp_str_buf_t capture_buf;
2042   if (!TCR_4(__kmp_init_middle)) {
2043     __kmp_middle_initialize();
2044   }
2045   __kmp_assign_root_init_mask();
2046   gtid = __kmp_get_gtid();
2047 #if KMP_AFFINITY_SUPPORTED
2048   if (__kmp_threads[gtid]->th.th_team->t.t_level == 0 && __kmp_affin_reset) {
2049     __kmp_reset_root_init_mask(gtid);
2050   }
2051 #endif
2052   __kmp_str_buf_init(&capture_buf);
2053   num_required = __kmp_aux_capture_affinity(gtid, format, &capture_buf);
2054   if (buffer && buf_size) {
2055     __kmp_strncpy_truncate(buffer, buf_size, capture_buf.str,
2056                            capture_buf.used + 1);
2057   }
2058   __kmp_str_buf_free(&capture_buf);
2059   return num_required;
2060 }
2061 
2062 void kmpc_set_stacksize(int arg) {
2063   // __kmp_aux_set_stacksize initializes the library if needed
2064   __kmp_aux_set_stacksize(arg);
2065 }
2066 
2067 void kmpc_set_stacksize_s(size_t arg) {
2068   // __kmp_aux_set_stacksize initializes the library if needed
2069   __kmp_aux_set_stacksize(arg);
2070 }
2071 
2072 void kmpc_set_blocktime(int arg) {
2073   int gtid, tid;
2074   kmp_info_t *thread;
2075 
2076   gtid = __kmp_entry_gtid();
2077   tid = __kmp_tid_from_gtid(gtid);
2078   thread = __kmp_thread_from_gtid(gtid);
2079 
2080   __kmp_aux_set_blocktime(arg, thread, tid);
2081 }
2082 
2083 void kmpc_set_library(int arg) {
2084   // __kmp_user_set_library initializes the library if needed
2085   __kmp_user_set_library((enum library_type)arg);
2086 }
2087 
2088 void kmpc_set_defaults(char const *str) {
2089   // __kmp_aux_set_defaults initializes the library if needed
2090   __kmp_aux_set_defaults(str, KMP_STRLEN(str));
2091 }
2092 
2093 void kmpc_set_disp_num_buffers(int arg) {
2094   // ignore after initialization because some teams have already
2095   // allocated dispatch buffers
2096   if (__kmp_init_serial == FALSE && arg >= KMP_MIN_DISP_NUM_BUFF &&
2097       arg <= KMP_MAX_DISP_NUM_BUFF) {
2098     __kmp_dispatch_num_buffers = arg;
2099   }
2100 }
2101 
// Forward (proc, mask) to __kmp_aux_set_affinity_mask_proc and return its
// result. Returns -1 in stub builds or when affinity is not supported.
int kmpc_set_affinity_mask_proc(int proc, void **mask) {
#if !defined(KMP_STUB) && KMP_AFFINITY_SUPPORTED
  // Affinity queries need the middle initialization and the root's init mask.
  if (!TCR_4(__kmp_init_middle))
    __kmp_middle_initialize();
  __kmp_assign_root_init_mask();
  return __kmp_aux_set_affinity_mask_proc(proc, mask);
#else
  return -1;
#endif
}
2113 
// Forward (proc, mask) to __kmp_aux_unset_affinity_mask_proc and return its
// result. Returns -1 in stub builds or when affinity is not supported.
int kmpc_unset_affinity_mask_proc(int proc, void **mask) {
#if !defined(KMP_STUB) && KMP_AFFINITY_SUPPORTED
  // Affinity queries need the middle initialization and the root's init mask.
  if (!TCR_4(__kmp_init_middle))
    __kmp_middle_initialize();
  __kmp_assign_root_init_mask();
  return __kmp_aux_unset_affinity_mask_proc(proc, mask);
#else
  return -1;
#endif
}
2125 
// Forward (proc, mask) to __kmp_aux_get_affinity_mask_proc and return its
// result. Returns -1 in stub builds or when affinity is not supported.
int kmpc_get_affinity_mask_proc(int proc, void **mask) {
#if !defined(KMP_STUB) && KMP_AFFINITY_SUPPORTED
  // Affinity queries need the middle initialization and the root's init mask.
  if (!TCR_4(__kmp_init_middle))
    __kmp_middle_initialize();
  __kmp_assign_root_init_mask();
  return __kmp_aux_get_affinity_mask_proc(proc, mask);
#else
  return -1;
#endif
}
2137 
2138 /* -------------------------------------------------------------------------- */
2139 /*!
2140 @ingroup THREADPRIVATE
2141 @param loc       source location information
2142 @param gtid      global thread number
2143 @param cpy_size  size of the cpy_data buffer
2144 @param cpy_data  pointer to data to be copied
2145 @param cpy_func  helper function to call for copying data
2146 @param didit     flag variable: 1=single thread; 0=not single thread
2147 
2148 __kmpc_copyprivate implements the interface for the private data broadcast
2149 needed for the copyprivate clause associated with a single region in an
2150 OpenMP<sup>*</sup> program (both C and Fortran).
2151 All threads participating in the parallel region call this routine.
2152 One of the threads (called the single thread) should have the <tt>didit</tt>
2153 variable set to 1 and all other threads should have that variable set to 0.
2154 All threads pass a pointer to a data buffer (cpy_data) that they have built.
2155 
2156 The OpenMP specification forbids the use of nowait on the single region when a
2157 copyprivate clause is present. However, @ref __kmpc_copyprivate implements a
2158 barrier internally to avoid race conditions, so the code generation for the
2159 single region should avoid generating a barrier after the call to @ref
2160 __kmpc_copyprivate.
2161 
2162 The <tt>gtid</tt> parameter is the global thread id for the current thread.
2163 The <tt>loc</tt> parameter is a pointer to source location information.
2164 
2165 Internal implementation: The single thread will first copy its descriptor
2166 address (cpy_data) to a team-private location, then the other threads will each
2167 call the function pointed to by the parameter cpy_func, which carries out the
2168 copy by copying the data using the cpy_data buffer.
2169 
2170 The cpy_func routine used for the copy and the contents of the data area defined
2171 by cpy_data and cpy_size may be built in any fashion that will allow the copy
2172 to be done. For instance, the cpy_data buffer can hold the actual data to be
2173 copied or it may hold a list of pointers to the data. The cpy_func routine must
2174 interpret the cpy_data buffer appropriately.
2175 
2176 The interface to cpy_func is as follows:
2177 @code
2178 void cpy_func( void *destination, void *source )
2179 @endcode
2180 where void *destination is the cpy_data pointer for the thread being copied to
2181 and void *source is the cpy_data pointer for the thread being copied from.
2182 */
2183 void __kmpc_copyprivate(ident_t *loc, kmp_int32 gtid, size_t cpy_size,
2184                         void *cpy_data, void (*cpy_func)(void *, void *),
2185                         kmp_int32 didit) {
2186   void **data_ptr;
2187   KC_TRACE(10, ("__kmpc_copyprivate: called T#%d\n", gtid));
2188   __kmp_assert_valid_gtid(gtid);
2189 
2190   KMP_MB();
2191 
2192   data_ptr = &__kmp_team_from_gtid(gtid)->t.t_copypriv_data;
2193 
2194   if (__kmp_env_consistency_check) {
2195     if (loc == 0) {
2196       KMP_WARNING(ConstructIdentInvalid);
2197     }
2198   }
2199 
2200   // ToDo: Optimize the following two barriers into some kind of split barrier
2201 
2202   if (didit)
2203     *data_ptr = cpy_data;
2204 
2205 #if OMPT_SUPPORT
2206   ompt_frame_t *ompt_frame;
2207   if (ompt_enabled.enabled) {
2208     __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
2209     if (ompt_frame->enter_frame.ptr == NULL)
2210       ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
2211   }
2212   OMPT_STORE_RETURN_ADDRESS(gtid);
2213 #endif
2214 /* This barrier is not a barrier region boundary */
2215 #if USE_ITT_NOTIFY
2216   __kmp_threads[gtid]->th.th_ident = loc;
2217 #endif
2218   __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);
2219 
2220   if (!didit)
2221     (*cpy_func)(cpy_data, *data_ptr);
2222 
2223   // Consider next barrier a user-visible barrier for barrier region boundaries
2224   // Nesting checks are already handled by the single construct checks
2225   {
2226 #if OMPT_SUPPORT
2227     OMPT_STORE_RETURN_ADDRESS(gtid);
2228 #endif
2229 #if USE_ITT_NOTIFY
2230     __kmp_threads[gtid]->th.th_ident = loc; // TODO: check if it is needed (e.g.
2231 // tasks can overwrite the location)
2232 #endif
2233     __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);
2234 #if OMPT_SUPPORT && OMPT_OPTIONAL
2235     if (ompt_enabled.enabled) {
2236       ompt_frame->enter_frame = ompt_data_none;
2237     }
2238 #endif
2239   }
2240 }
2241 
2242 /* --------------------------------------------------------------------------*/
2243 /*!
2244 @ingroup THREADPRIVATE
2245 @param loc       source location information
2246 @param gtid      global thread number
2247 @param cpy_data  pointer to the data to be saved/copied or 0
2248 @return          the saved pointer to the data
2249 
2250 __kmpc_copyprivate_light is a lighter version of __kmpc_copyprivate:
2251 __kmpc_copyprivate_light only saves the pointer it's given (if it's not 0, so
2252 coming from single), and returns that pointer in all calls (for single thread
2253 it's not needed). This version doesn't do any actual data copying. Data copying
2254 has to be done somewhere else, e.g. inline in the generated code. Due to this,
2255 this function doesn't have any barrier at the end of the function, like
2256 __kmpc_copyprivate does, so generated code needs barrier after copying of all
2257 data was done.
2258 */
2259 void *__kmpc_copyprivate_light(ident_t *loc, kmp_int32 gtid, void *cpy_data) {
2260   void **data_ptr;
2261 
2262   KC_TRACE(10, ("__kmpc_copyprivate_light: called T#%d\n", gtid));
2263 
2264   KMP_MB();
2265 
2266   data_ptr = &__kmp_team_from_gtid(gtid)->t.t_copypriv_data;
2267 
2268   if (__kmp_env_consistency_check) {
2269     if (loc == 0) {
2270       KMP_WARNING(ConstructIdentInvalid);
2271     }
2272   }
2273 
2274   // ToDo: Optimize the following barrier
2275 
2276   if (cpy_data)
2277     *data_ptr = cpy_data;
2278 
2279 #if OMPT_SUPPORT
2280   ompt_frame_t *ompt_frame;
2281   if (ompt_enabled.enabled) {
2282     __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
2283     if (ompt_frame->enter_frame.ptr == NULL)
2284       ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
2285     OMPT_STORE_RETURN_ADDRESS(gtid);
2286   }
2287 #endif
2288 /* This barrier is not a barrier region boundary */
2289 #if USE_ITT_NOTIFY
2290   __kmp_threads[gtid]->th.th_ident = loc;
2291 #endif
2292   __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);
2293 
2294   return *data_ptr;
2295 }
2296 
2297 /* -------------------------------------------------------------------------- */
2298 
// Short aliases for the checked user-lock routines.
// Simple (non-nested) locks:
#define INIT_LOCK __kmp_init_user_lock_with_checks
#define ACQUIRE_LOCK __kmp_acquire_user_lock_with_checks
#define ACQUIRE_LOCK_TIMED __kmp_acquire_user_lock_with_checks_timed
#define TEST_LOCK __kmp_test_user_lock_with_checks
#define RELEASE_LOCK __kmp_release_user_lock_with_checks
#define DESTROY_LOCK __kmp_destroy_user_lock_with_checks
// Nested locks:
#define INIT_NESTED_LOCK __kmp_init_nested_user_lock_with_checks
#define ACQUIRE_NESTED_LOCK __kmp_acquire_nested_user_lock_with_checks
#define ACQUIRE_NESTED_LOCK_TIMED                                              \
  __kmp_acquire_nested_user_lock_with_checks_timed
#define TEST_NESTED_LOCK __kmp_test_nested_user_lock_with_checks
#define RELEASE_NESTED_LOCK __kmp_release_nested_user_lock_with_checks
#define DESTROY_NESTED_LOCK __kmp_destroy_nested_user_lock_with_checks
2312 
2313 // TODO: Make check abort messages use location info & pass it into
2314 // with_checks routines
2315 
2316 #if KMP_USE_DYNAMIC_LOCK
2317 
2318 // internal lock initializer
2319 static __forceinline void __kmp_init_lock_with_hint(ident_t *loc, void **lock,
2320                                                     kmp_dyna_lockseq_t seq) {
2321   if (KMP_IS_D_LOCK(seq)) {
2322     KMP_INIT_D_LOCK(lock, seq);
2323 #if USE_ITT_BUILD
2324     __kmp_itt_lock_creating((kmp_user_lock_p)lock, NULL);
2325 #endif
2326   } else {
2327     KMP_INIT_I_LOCK(lock, seq);
2328 #if USE_ITT_BUILD
2329     kmp_indirect_lock_t *ilk = KMP_LOOKUP_I_LOCK(lock);
2330     __kmp_itt_lock_creating(ilk->lock, loc);
2331 #endif
2332   }
2333 }
2334 
2335 // internal nest lock initializer
2336 static __forceinline void
2337 __kmp_init_nest_lock_with_hint(ident_t *loc, void **lock,
2338                                kmp_dyna_lockseq_t seq) {
2339 #if KMP_USE_TSX
2340   // Don't have nested lock implementation for speculative locks
2341   if (seq == lockseq_hle || seq == lockseq_rtm_queuing ||
2342       seq == lockseq_rtm_spin || seq == lockseq_adaptive)
2343     seq = __kmp_user_lock_seq;
2344 #endif
2345   switch (seq) {
2346   case lockseq_tas:
2347     seq = lockseq_nested_tas;
2348     break;
2349 #if KMP_USE_FUTEX
2350   case lockseq_futex:
2351     seq = lockseq_nested_futex;
2352     break;
2353 #endif
2354   case lockseq_ticket:
2355     seq = lockseq_nested_ticket;
2356     break;
2357   case lockseq_queuing:
2358     seq = lockseq_nested_queuing;
2359     break;
2360   case lockseq_drdpa:
2361     seq = lockseq_nested_drdpa;
2362     break;
2363   default:
2364     seq = lockseq_nested_queuing;
2365   }
2366   KMP_INIT_I_LOCK(lock, seq);
2367 #if USE_ITT_BUILD
2368   kmp_indirect_lock_t *ilk = KMP_LOOKUP_I_LOCK(lock);
2369   __kmp_itt_lock_creating(ilk->lock, loc);
2370 #endif
2371 }
2372 
2373 /* initialize the lock with a hint */
2374 void __kmpc_init_lock_with_hint(ident_t *loc, kmp_int32 gtid, void **user_lock,
2375                                 uintptr_t hint) {
2376   KMP_DEBUG_ASSERT(__kmp_init_serial);
2377   if (__kmp_env_consistency_check && user_lock == NULL) {
2378     KMP_FATAL(LockIsUninitialized, "omp_init_lock_with_hint");
2379   }
2380 
2381   __kmp_init_lock_with_hint(loc, user_lock, __kmp_map_hint_to_lock(hint));
2382 
2383 #if OMPT_SUPPORT && OMPT_OPTIONAL
2384   // This is the case, if called from omp_init_lock_with_hint:
2385   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2386   if (!codeptr)
2387     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2388   if (ompt_enabled.ompt_callback_lock_init) {
2389     ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2390         ompt_mutex_lock, (omp_lock_hint_t)hint,
2391         __ompt_get_mutex_impl_type(user_lock),
2392         (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2393   }
2394 #endif
2395 }
2396 
2397 /* initialize the lock with a hint */
2398 void __kmpc_init_nest_lock_with_hint(ident_t *loc, kmp_int32 gtid,
2399                                      void **user_lock, uintptr_t hint) {
2400   KMP_DEBUG_ASSERT(__kmp_init_serial);
2401   if (__kmp_env_consistency_check && user_lock == NULL) {
2402     KMP_FATAL(LockIsUninitialized, "omp_init_nest_lock_with_hint");
2403   }
2404 
2405   __kmp_init_nest_lock_with_hint(loc, user_lock, __kmp_map_hint_to_lock(hint));
2406 
2407 #if OMPT_SUPPORT && OMPT_OPTIONAL
2408   // This is the case, if called from omp_init_lock_with_hint:
2409   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2410   if (!codeptr)
2411     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2412   if (ompt_enabled.ompt_callback_lock_init) {
2413     ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2414         ompt_mutex_nest_lock, (omp_lock_hint_t)hint,
2415         __ompt_get_mutex_impl_type(user_lock),
2416         (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2417   }
2418 #endif
2419 }
2420 
2421 #endif // KMP_USE_DYNAMIC_LOCK
2422 
2423 /* initialize the lock */
2424 void __kmpc_init_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2425 #if KMP_USE_DYNAMIC_LOCK
2426 
2427   KMP_DEBUG_ASSERT(__kmp_init_serial);
2428   if (__kmp_env_consistency_check && user_lock == NULL) {
2429     KMP_FATAL(LockIsUninitialized, "omp_init_lock");
2430   }
2431   __kmp_init_lock_with_hint(loc, user_lock, __kmp_user_lock_seq);
2432 
2433 #if OMPT_SUPPORT && OMPT_OPTIONAL
2434   // This is the case, if called from omp_init_lock_with_hint:
2435   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2436   if (!codeptr)
2437     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2438   if (ompt_enabled.ompt_callback_lock_init) {
2439     ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2440         ompt_mutex_lock, omp_lock_hint_none,
2441         __ompt_get_mutex_impl_type(user_lock),
2442         (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2443   }
2444 #endif
2445 
2446 #else // KMP_USE_DYNAMIC_LOCK
2447 
2448   static char const *const func = "omp_init_lock";
2449   kmp_user_lock_p lck;
2450   KMP_DEBUG_ASSERT(__kmp_init_serial);
2451 
2452   if (__kmp_env_consistency_check) {
2453     if (user_lock == NULL) {
2454       KMP_FATAL(LockIsUninitialized, func);
2455     }
2456   }
2457 
2458   KMP_CHECK_USER_LOCK_INIT();
2459 
2460   if ((__kmp_user_lock_kind == lk_tas) &&
2461       (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
2462     lck = (kmp_user_lock_p)user_lock;
2463   }
2464 #if KMP_USE_FUTEX
2465   else if ((__kmp_user_lock_kind == lk_futex) &&
2466            (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
2467     lck = (kmp_user_lock_p)user_lock;
2468   }
2469 #endif
2470   else {
2471     lck = __kmp_user_lock_allocate(user_lock, gtid, 0);
2472   }
2473   INIT_LOCK(lck);
2474   __kmp_set_user_lock_location(lck, loc);
2475 
2476 #if OMPT_SUPPORT && OMPT_OPTIONAL
2477   // This is the case, if called from omp_init_lock_with_hint:
2478   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2479   if (!codeptr)
2480     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2481   if (ompt_enabled.ompt_callback_lock_init) {
2482     ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2483         ompt_mutex_lock, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
2484         (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2485   }
2486 #endif
2487 
2488 #if USE_ITT_BUILD
2489   __kmp_itt_lock_creating(lck);
2490 #endif /* USE_ITT_BUILD */
2491 
2492 #endif // KMP_USE_DYNAMIC_LOCK
2493 } // __kmpc_init_lock
2494 
2495 /* initialize the lock */
2496 void __kmpc_init_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2497 #if KMP_USE_DYNAMIC_LOCK
2498 
2499   KMP_DEBUG_ASSERT(__kmp_init_serial);
2500   if (__kmp_env_consistency_check && user_lock == NULL) {
2501     KMP_FATAL(LockIsUninitialized, "omp_init_nest_lock");
2502   }
2503   __kmp_init_nest_lock_with_hint(loc, user_lock, __kmp_user_lock_seq);
2504 
2505 #if OMPT_SUPPORT && OMPT_OPTIONAL
2506   // This is the case, if called from omp_init_lock_with_hint:
2507   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2508   if (!codeptr)
2509     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2510   if (ompt_enabled.ompt_callback_lock_init) {
2511     ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2512         ompt_mutex_nest_lock, omp_lock_hint_none,
2513         __ompt_get_mutex_impl_type(user_lock),
2514         (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2515   }
2516 #endif
2517 
2518 #else // KMP_USE_DYNAMIC_LOCK
2519 
2520   static char const *const func = "omp_init_nest_lock";
2521   kmp_user_lock_p lck;
2522   KMP_DEBUG_ASSERT(__kmp_init_serial);
2523 
2524   if (__kmp_env_consistency_check) {
2525     if (user_lock == NULL) {
2526       KMP_FATAL(LockIsUninitialized, func);
2527     }
2528   }
2529 
2530   KMP_CHECK_USER_LOCK_INIT();
2531 
2532   if ((__kmp_user_lock_kind == lk_tas) &&
2533       (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
2534        OMP_NEST_LOCK_T_SIZE)) {
2535     lck = (kmp_user_lock_p)user_lock;
2536   }
2537 #if KMP_USE_FUTEX
2538   else if ((__kmp_user_lock_kind == lk_futex) &&
2539            (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
2540             OMP_NEST_LOCK_T_SIZE)) {
2541     lck = (kmp_user_lock_p)user_lock;
2542   }
2543 #endif
2544   else {
2545     lck = __kmp_user_lock_allocate(user_lock, gtid, 0);
2546   }
2547 
2548   INIT_NESTED_LOCK(lck);
2549   __kmp_set_user_lock_location(lck, loc);
2550 
2551 #if OMPT_SUPPORT && OMPT_OPTIONAL
2552   // This is the case, if called from omp_init_lock_with_hint:
2553   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2554   if (!codeptr)
2555     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2556   if (ompt_enabled.ompt_callback_lock_init) {
2557     ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2558         ompt_mutex_nest_lock, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
2559         (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2560   }
2561 #endif
2562 
2563 #if USE_ITT_BUILD
2564   __kmp_itt_lock_creating(lck);
2565 #endif /* USE_ITT_BUILD */
2566 
2567 #endif // KMP_USE_DYNAMIC_LOCK
2568 } // __kmpc_init_nest_lock
2569 
2570 void __kmpc_destroy_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2571 #if KMP_USE_DYNAMIC_LOCK
2572 
2573 #if USE_ITT_BUILD
2574   kmp_user_lock_p lck;
2575   if (KMP_EXTRACT_D_TAG(user_lock) == 0) {
2576     lck = ((kmp_indirect_lock_t *)KMP_LOOKUP_I_LOCK(user_lock))->lock;
2577   } else {
2578     lck = (kmp_user_lock_p)user_lock;
2579   }
2580   __kmp_itt_lock_destroyed(lck);
2581 #endif
2582 #if OMPT_SUPPORT && OMPT_OPTIONAL
2583   // This is the case, if called from omp_init_lock_with_hint:
2584   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2585   if (!codeptr)
2586     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2587   if (ompt_enabled.ompt_callback_lock_destroy) {
2588     ompt_callbacks.ompt_callback(ompt_callback_lock_destroy)(
2589         ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2590   }
2591 #endif
2592   KMP_D_LOCK_FUNC(user_lock, destroy)((kmp_dyna_lock_t *)user_lock);
2593 #else
2594   kmp_user_lock_p lck;
2595 
2596   if ((__kmp_user_lock_kind == lk_tas) &&
2597       (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
2598     lck = (kmp_user_lock_p)user_lock;
2599   }
2600 #if KMP_USE_FUTEX
2601   else if ((__kmp_user_lock_kind == lk_futex) &&
2602            (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
2603     lck = (kmp_user_lock_p)user_lock;
2604   }
2605 #endif
2606   else {
2607     lck = __kmp_lookup_user_lock(user_lock, "omp_destroy_lock");
2608   }
2609 
2610 #if OMPT_SUPPORT && OMPT_OPTIONAL
2611   // This is the case, if called from omp_init_lock_with_hint:
2612   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2613   if (!codeptr)
2614     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2615   if (ompt_enabled.ompt_callback_lock_destroy) {
2616     ompt_callbacks.ompt_callback(ompt_callback_lock_destroy)(
2617         ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2618   }
2619 #endif
2620 
2621 #if USE_ITT_BUILD
2622   __kmp_itt_lock_destroyed(lck);
2623 #endif /* USE_ITT_BUILD */
2624   DESTROY_LOCK(lck);
2625 
2626   if ((__kmp_user_lock_kind == lk_tas) &&
2627       (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
2628     ;
2629   }
2630 #if KMP_USE_FUTEX
2631   else if ((__kmp_user_lock_kind == lk_futex) &&
2632            (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
2633     ;
2634   }
2635 #endif
2636   else {
2637     __kmp_user_lock_free(user_lock, gtid, lck);
2638   }
2639 #endif // KMP_USE_DYNAMIC_LOCK
2640 } // __kmpc_destroy_lock
2641 
2642 /* destroy the lock */
2643 void __kmpc_destroy_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2644 #if KMP_USE_DYNAMIC_LOCK
2645 
2646 #if USE_ITT_BUILD
2647   kmp_indirect_lock_t *ilk = KMP_LOOKUP_I_LOCK(user_lock);
2648   __kmp_itt_lock_destroyed(ilk->lock);
2649 #endif
2650 #if OMPT_SUPPORT && OMPT_OPTIONAL
2651   // This is the case, if called from omp_init_lock_with_hint:
2652   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2653   if (!codeptr)
2654     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2655   if (ompt_enabled.ompt_callback_lock_destroy) {
2656     ompt_callbacks.ompt_callback(ompt_callback_lock_destroy)(
2657         ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2658   }
2659 #endif
2660   KMP_D_LOCK_FUNC(user_lock, destroy)((kmp_dyna_lock_t *)user_lock);
2661 
2662 #else // KMP_USE_DYNAMIC_LOCK
2663 
2664   kmp_user_lock_p lck;
2665 
2666   if ((__kmp_user_lock_kind == lk_tas) &&
2667       (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
2668        OMP_NEST_LOCK_T_SIZE)) {
2669     lck = (kmp_user_lock_p)user_lock;
2670   }
2671 #if KMP_USE_FUTEX
2672   else if ((__kmp_user_lock_kind == lk_futex) &&
2673            (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
2674             OMP_NEST_LOCK_T_SIZE)) {
2675     lck = (kmp_user_lock_p)user_lock;
2676   }
2677 #endif
2678   else {
2679     lck = __kmp_lookup_user_lock(user_lock, "omp_destroy_nest_lock");
2680   }
2681 
2682 #if OMPT_SUPPORT && OMPT_OPTIONAL
2683   // This is the case, if called from omp_init_lock_with_hint:
2684   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2685   if (!codeptr)
2686     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2687   if (ompt_enabled.ompt_callback_lock_destroy) {
2688     ompt_callbacks.ompt_callback(ompt_callback_lock_destroy)(
2689         ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2690   }
2691 #endif
2692 
2693 #if USE_ITT_BUILD
2694   __kmp_itt_lock_destroyed(lck);
2695 #endif /* USE_ITT_BUILD */
2696 
2697   DESTROY_NESTED_LOCK(lck);
2698 
2699   if ((__kmp_user_lock_kind == lk_tas) &&
2700       (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
2701        OMP_NEST_LOCK_T_SIZE)) {
2702     ;
2703   }
2704 #if KMP_USE_FUTEX
2705   else if ((__kmp_user_lock_kind == lk_futex) &&
2706            (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
2707             OMP_NEST_LOCK_T_SIZE)) {
2708     ;
2709   }
2710 #endif
2711   else {
2712     __kmp_user_lock_free(user_lock, gtid, lck);
2713   }
2714 #endif // KMP_USE_DYNAMIC_LOCK
2715 } // __kmpc_destroy_nest_lock
2716 
/*!
 * Acquire a simple user lock.
 *
 * @param loc       source location information (not referenced here)
 * @param gtid      global thread id of the calling thread
 * @param user_lock address of the user's lock object
 *
 * Tool notifications bracket the acquisition: ITT "acquiring" and the OMPT
 * mutex_acquire callback are issued before the lock is taken, ITT "acquired"
 * and the OMPT mutex_acquired callback afterwards.
 */
void __kmpc_set_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
  KMP_COUNT_BLOCK(OMP_set_lock);
#if KMP_USE_DYNAMIC_LOCK
  int tag = KMP_EXTRACT_D_TAG(user_lock);
#if USE_ITT_BUILD
  __kmp_itt_lock_acquiring(
      (kmp_user_lock_p)
          user_lock); // itt function will get to the right lock object.
#endif
#if OMPT_SUPPORT && OMPT_OPTIONAL
  // Take the code pointer from OMPT_LOAD_RETURN_ADDRESS; when that yields
  // NULL, fall back to OMPT_GET_RETURN_ADDRESS(0).
  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
  if (!codeptr)
    codeptr = OMPT_GET_RETURN_ADDRESS(0);
  if (ompt_enabled.ompt_callback_mutex_acquire) {
    ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
        ompt_mutex_lock, omp_lock_hint_none,
        __ompt_get_mutex_impl_type(user_lock),
        (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
  }
#endif
  // The inlined TAS/futex acquire is only taken when consistency checking is
  // off; every other case dispatches through the __kmp_direct_set table. The
  // trailing "else" of the selected preprocessor branch binds to the braced
  // block that follows the #endif.
#if KMP_USE_INLINED_TAS
  if (tag == locktag_tas && !__kmp_env_consistency_check) {
    KMP_ACQUIRE_TAS_LOCK(user_lock, gtid);
  } else
#elif KMP_USE_INLINED_FUTEX
  if (tag == locktag_futex && !__kmp_env_consistency_check) {
    KMP_ACQUIRE_FUTEX_LOCK(user_lock, gtid);
  } else
#endif
  {
    __kmp_direct_set[tag]((kmp_dyna_lock_t *)user_lock, gtid);
  }
#if USE_ITT_BUILD
  __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock);
#endif
#if OMPT_SUPPORT && OMPT_OPTIONAL
  if (ompt_enabled.ompt_callback_mutex_acquired) {
    ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
        ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
  }
#endif

#else // KMP_USE_DYNAMIC_LOCK

  kmp_user_lock_p lck;

  // A TAS (or futex) lock whose poll word fits into OMP_LOCK_T_SIZE is used
  // directly in the user's storage; otherwise the lock object is looked up.
  if ((__kmp_user_lock_kind == lk_tas) &&
      (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
    lck = (kmp_user_lock_p)user_lock;
  }
#if KMP_USE_FUTEX
  else if ((__kmp_user_lock_kind == lk_futex) &&
           (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
    lck = (kmp_user_lock_p)user_lock;
  }
#endif
  else {
    lck = __kmp_lookup_user_lock(user_lock, "omp_set_lock");
  }

#if USE_ITT_BUILD
  __kmp_itt_lock_acquiring(lck);
#endif /* USE_ITT_BUILD */
#if OMPT_SUPPORT && OMPT_OPTIONAL
  // Take the code pointer from OMPT_LOAD_RETURN_ADDRESS; when that yields
  // NULL, fall back to OMPT_GET_RETURN_ADDRESS(0).
  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
  if (!codeptr)
    codeptr = OMPT_GET_RETURN_ADDRESS(0);
  if (ompt_enabled.ompt_callback_mutex_acquire) {
    ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
        ompt_mutex_lock, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
        (ompt_wait_id_t)(uintptr_t)lck, codeptr);
  }
#endif

  ACQUIRE_LOCK(lck, gtid);

#if USE_ITT_BUILD
  __kmp_itt_lock_acquired(lck);
#endif /* USE_ITT_BUILD */

#if OMPT_SUPPORT && OMPT_OPTIONAL
  if (ompt_enabled.ompt_callback_mutex_acquired) {
    ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
        ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
  }
#endif

#endif // KMP_USE_DYNAMIC_LOCK
}
2808 
2809 void __kmpc_set_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2810 #if KMP_USE_DYNAMIC_LOCK
2811 
2812 #if USE_ITT_BUILD
2813   __kmp_itt_lock_acquiring((kmp_user_lock_p)user_lock);
2814 #endif
2815 #if OMPT_SUPPORT && OMPT_OPTIONAL
2816   // This is the case, if called from omp_init_lock_with_hint:
2817   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2818   if (!codeptr)
2819     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2820   if (ompt_enabled.enabled) {
2821     if (ompt_enabled.ompt_callback_mutex_acquire) {
2822       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
2823           ompt_mutex_nest_lock, omp_lock_hint_none,
2824           __ompt_get_mutex_impl_type(user_lock),
2825           (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2826     }
2827   }
2828 #endif
2829   int acquire_status =
2830       KMP_D_LOCK_FUNC(user_lock, set)((kmp_dyna_lock_t *)user_lock, gtid);
2831   (void)acquire_status;
2832 #if USE_ITT_BUILD
2833   __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock);
2834 #endif
2835 
2836 #if OMPT_SUPPORT && OMPT_OPTIONAL
2837   if (ompt_enabled.enabled) {
2838     if (acquire_status == KMP_LOCK_ACQUIRED_FIRST) {
2839       if (ompt_enabled.ompt_callback_mutex_acquired) {
2840         // lock_first
2841         ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
2842             ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)user_lock,
2843             codeptr);
2844       }
2845     } else {
2846       if (ompt_enabled.ompt_callback_nest_lock) {
2847         // lock_next
2848         ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
2849             ompt_scope_begin, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2850       }
2851     }
2852   }
2853 #endif
2854 
2855 #else // KMP_USE_DYNAMIC_LOCK
2856   int acquire_status;
2857   kmp_user_lock_p lck;
2858 
2859   if ((__kmp_user_lock_kind == lk_tas) &&
2860       (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
2861        OMP_NEST_LOCK_T_SIZE)) {
2862     lck = (kmp_user_lock_p)user_lock;
2863   }
2864 #if KMP_USE_FUTEX
2865   else if ((__kmp_user_lock_kind == lk_futex) &&
2866            (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
2867             OMP_NEST_LOCK_T_SIZE)) {
2868     lck = (kmp_user_lock_p)user_lock;
2869   }
2870 #endif
2871   else {
2872     lck = __kmp_lookup_user_lock(user_lock, "omp_set_nest_lock");
2873   }
2874 
2875 #if USE_ITT_BUILD
2876   __kmp_itt_lock_acquiring(lck);
2877 #endif /* USE_ITT_BUILD */
2878 #if OMPT_SUPPORT && OMPT_OPTIONAL
2879   // This is the case, if called from omp_init_lock_with_hint:
2880   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2881   if (!codeptr)
2882     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2883   if (ompt_enabled.enabled) {
2884     if (ompt_enabled.ompt_callback_mutex_acquire) {
2885       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
2886           ompt_mutex_nest_lock, omp_lock_hint_none,
2887           __ompt_get_mutex_impl_type(), (ompt_wait_id_t)(uintptr_t)lck,
2888           codeptr);
2889     }
2890   }
2891 #endif
2892 
2893   ACQUIRE_NESTED_LOCK(lck, gtid, &acquire_status);
2894 
2895 #if USE_ITT_BUILD
2896   __kmp_itt_lock_acquired(lck);
2897 #endif /* USE_ITT_BUILD */
2898 
2899 #if OMPT_SUPPORT && OMPT_OPTIONAL
2900   if (ompt_enabled.enabled) {
2901     if (acquire_status == KMP_LOCK_ACQUIRED_FIRST) {
2902       if (ompt_enabled.ompt_callback_mutex_acquired) {
2903         // lock_first
2904         ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
2905             ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2906       }
2907     } else {
2908       if (ompt_enabled.ompt_callback_nest_lock) {
2909         // lock_next
2910         ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
2911             ompt_scope_begin, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2912       }
2913     }
2914   }
2915 #endif
2916 
2917 #endif // KMP_USE_DYNAMIC_LOCK
2918 }
2919 
2920 void __kmpc_unset_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2921 #if KMP_USE_DYNAMIC_LOCK
2922 
2923   int tag = KMP_EXTRACT_D_TAG(user_lock);
2924 #if USE_ITT_BUILD
2925   __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock);
2926 #endif
2927 #if KMP_USE_INLINED_TAS
2928   if (tag == locktag_tas && !__kmp_env_consistency_check) {
2929     KMP_RELEASE_TAS_LOCK(user_lock, gtid);
2930   } else
2931 #elif KMP_USE_INLINED_FUTEX
2932   if (tag == locktag_futex && !__kmp_env_consistency_check) {
2933     KMP_RELEASE_FUTEX_LOCK(user_lock, gtid);
2934   } else
2935 #endif
2936   {
2937     __kmp_direct_unset[tag]((kmp_dyna_lock_t *)user_lock, gtid);
2938   }
2939 
2940 #if OMPT_SUPPORT && OMPT_OPTIONAL
2941   // This is the case, if called from omp_init_lock_with_hint:
2942   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2943   if (!codeptr)
2944     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2945   if (ompt_enabled.ompt_callback_mutex_released) {
2946     ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
2947         ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2948   }
2949 #endif
2950 
2951 #else // KMP_USE_DYNAMIC_LOCK
2952 
2953   kmp_user_lock_p lck;
2954 
2955   /* Can't use serial interval since not block structured */
2956   /* release the lock */
2957 
2958   if ((__kmp_user_lock_kind == lk_tas) &&
2959       (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
2960 #if KMP_OS_LINUX &&                                                            \
2961     (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
2962 // "fast" path implemented to fix customer performance issue
2963 #if USE_ITT_BUILD
2964     __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock);
2965 #endif /* USE_ITT_BUILD */
2966     TCW_4(((kmp_user_lock_p)user_lock)->tas.lk.poll, 0);
2967     KMP_MB();
2968 
2969 #if OMPT_SUPPORT && OMPT_OPTIONAL
2970     // This is the case, if called from omp_init_lock_with_hint:
2971     void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2972     if (!codeptr)
2973       codeptr = OMPT_GET_RETURN_ADDRESS(0);
2974     if (ompt_enabled.ompt_callback_mutex_released) {
2975       ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
2976           ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2977     }
2978 #endif
2979 
2980     return;
2981 #else
2982     lck = (kmp_user_lock_p)user_lock;
2983 #endif
2984   }
2985 #if KMP_USE_FUTEX
2986   else if ((__kmp_user_lock_kind == lk_futex) &&
2987            (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
2988     lck = (kmp_user_lock_p)user_lock;
2989   }
2990 #endif
2991   else {
2992     lck = __kmp_lookup_user_lock(user_lock, "omp_unset_lock");
2993   }
2994 
2995 #if USE_ITT_BUILD
2996   __kmp_itt_lock_releasing(lck);
2997 #endif /* USE_ITT_BUILD */
2998 
2999   RELEASE_LOCK(lck, gtid);
3000 
3001 #if OMPT_SUPPORT && OMPT_OPTIONAL
3002   // This is the case, if called from omp_init_lock_with_hint:
3003   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
3004   if (!codeptr)
3005     codeptr = OMPT_GET_RETURN_ADDRESS(0);
3006   if (ompt_enabled.ompt_callback_mutex_released) {
3007     ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
3008         ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
3009   }
3010 #endif
3011 
3012 #endif // KMP_USE_DYNAMIC_LOCK
3013 }
3014 
3015 /* release the lock */
3016 void __kmpc_unset_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
3017 #if KMP_USE_DYNAMIC_LOCK
3018 
3019 #if USE_ITT_BUILD
3020   __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock);
3021 #endif
3022   int release_status =
3023       KMP_D_LOCK_FUNC(user_lock, unset)((kmp_dyna_lock_t *)user_lock, gtid);
3024   (void)release_status;
3025 
3026 #if OMPT_SUPPORT && OMPT_OPTIONAL
3027   // This is the case, if called from omp_init_lock_with_hint:
3028   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
3029   if (!codeptr)
3030     codeptr = OMPT_GET_RETURN_ADDRESS(0);
3031   if (ompt_enabled.enabled) {
3032     if (release_status == KMP_LOCK_RELEASED) {
3033       if (ompt_enabled.ompt_callback_mutex_released) {
3034         // release_lock_last
3035         ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
3036             ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)user_lock,
3037             codeptr);
3038       }
3039     } else if (ompt_enabled.ompt_callback_nest_lock) {
3040       // release_lock_prev
3041       ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
3042           ompt_scope_end, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
3043     }
3044   }
3045 #endif
3046 
3047 #else // KMP_USE_DYNAMIC_LOCK
3048 
3049   kmp_user_lock_p lck;
3050 
3051   /* Can't use serial interval since not block structured */
3052 
3053   if ((__kmp_user_lock_kind == lk_tas) &&
3054       (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
3055        OMP_NEST_LOCK_T_SIZE)) {
3056 #if KMP_OS_LINUX &&                                                            \
3057     (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
3058     // "fast" path implemented to fix customer performance issue
3059     kmp_tas_lock_t *tl = (kmp_tas_lock_t *)user_lock;
3060 #if USE_ITT_BUILD
3061     __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock);
3062 #endif /* USE_ITT_BUILD */
3063 
3064 #if OMPT_SUPPORT && OMPT_OPTIONAL
3065     int release_status = KMP_LOCK_STILL_HELD;
3066 #endif
3067 
3068     if (--(tl->lk.depth_locked) == 0) {
3069       TCW_4(tl->lk.poll, 0);
3070 #if OMPT_SUPPORT && OMPT_OPTIONAL
3071       release_status = KMP_LOCK_RELEASED;
3072 #endif
3073     }
3074     KMP_MB();
3075 
3076 #if OMPT_SUPPORT && OMPT_OPTIONAL
3077     // This is the case, if called from omp_init_lock_with_hint:
3078     void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
3079     if (!codeptr)
3080       codeptr = OMPT_GET_RETURN_ADDRESS(0);
3081     if (ompt_enabled.enabled) {
3082       if (release_status == KMP_LOCK_RELEASED) {
3083         if (ompt_enabled.ompt_callback_mutex_released) {
3084           // release_lock_last
3085           ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
3086               ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
3087         }
3088       } else if (ompt_enabled.ompt_callback_nest_lock) {
3089         // release_lock_previous
3090         ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
3091             ompt_mutex_scope_end, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
3092       }
3093     }
3094 #endif
3095 
3096     return;
3097 #else
3098     lck = (kmp_user_lock_p)user_lock;
3099 #endif
3100   }
3101 #if KMP_USE_FUTEX
3102   else if ((__kmp_user_lock_kind == lk_futex) &&
3103            (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
3104             OMP_NEST_LOCK_T_SIZE)) {
3105     lck = (kmp_user_lock_p)user_lock;
3106   }
3107 #endif
3108   else {
3109     lck = __kmp_lookup_user_lock(user_lock, "omp_unset_nest_lock");
3110   }
3111 
3112 #if USE_ITT_BUILD
3113   __kmp_itt_lock_releasing(lck);
3114 #endif /* USE_ITT_BUILD */
3115 
3116   int release_status;
3117   release_status = RELEASE_NESTED_LOCK(lck, gtid);
3118 #if OMPT_SUPPORT && OMPT_OPTIONAL
3119   // This is the case, if called from omp_init_lock_with_hint:
3120   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
3121   if (!codeptr)
3122     codeptr = OMPT_GET_RETURN_ADDRESS(0);
3123   if (ompt_enabled.enabled) {
3124     if (release_status == KMP_LOCK_RELEASED) {
3125       if (ompt_enabled.ompt_callback_mutex_released) {
3126         // release_lock_last
3127         ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
3128             ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
3129       }
3130     } else if (ompt_enabled.ompt_callback_nest_lock) {
3131       // release_lock_previous
3132       ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
3133           ompt_mutex_scope_end, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
3134     }
3135   }
3136 #endif
3137 
3138 #endif // KMP_USE_DYNAMIC_LOCK
3139 }
3140 
/*!
 * Try to acquire a simple user lock without blocking.
 *
 * @param loc       source location information (not referenced here)
 * @param gtid      global thread id of the calling thread
 * @param user_lock address of the user's lock object
 * @return FTN_TRUE when the lock was acquired, FTN_FALSE otherwise
 *
 * ITT is told "acquiring" before the attempt and then either "acquired" or
 * "cancelled"; the OMPT mutex_acquired callback fires only on success.
 *
 * NOTE(review): the OMPT callbacks here report the mutex kind as
 * ompt_mutex_lock; confirm whether a test-lock kind should be reported
 * instead.
 */
int __kmpc_test_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
  KMP_COUNT_BLOCK(OMP_test_lock);

#if KMP_USE_DYNAMIC_LOCK
  int rc;
  int tag = KMP_EXTRACT_D_TAG(user_lock);
#if USE_ITT_BUILD
  __kmp_itt_lock_acquiring((kmp_user_lock_p)user_lock);
#endif
#if OMPT_SUPPORT && OMPT_OPTIONAL
  // Take the code pointer from OMPT_LOAD_RETURN_ADDRESS; when that yields
  // NULL, fall back to OMPT_GET_RETURN_ADDRESS(0).
  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
  if (!codeptr)
    codeptr = OMPT_GET_RETURN_ADDRESS(0);
  if (ompt_enabled.ompt_callback_mutex_acquire) {
    ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
        ompt_mutex_lock, omp_lock_hint_none,
        __ompt_get_mutex_impl_type(user_lock),
        (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
  }
#endif
  // The inlined TAS/futex test is only taken when consistency checking is
  // off; every other case dispatches through the __kmp_direct_test table. The
  // trailing "else" of the selected preprocessor branch binds to the braced
  // block that follows the #endif.
#if KMP_USE_INLINED_TAS
  if (tag == locktag_tas && !__kmp_env_consistency_check) {
    KMP_TEST_TAS_LOCK(user_lock, gtid, rc);
  } else
#elif KMP_USE_INLINED_FUTEX
  if (tag == locktag_futex && !__kmp_env_consistency_check) {
    KMP_TEST_FUTEX_LOCK(user_lock, gtid, rc);
  } else
#endif
  {
    rc = __kmp_direct_test[tag]((kmp_dyna_lock_t *)user_lock, gtid);
  }
  if (rc) {
#if USE_ITT_BUILD
    __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock);
#endif
#if OMPT_SUPPORT && OMPT_OPTIONAL
    if (ompt_enabled.ompt_callback_mutex_acquired) {
      ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
          ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
    }
#endif
    return FTN_TRUE;
  } else {
#if USE_ITT_BUILD
    __kmp_itt_lock_cancelled((kmp_user_lock_p)user_lock);
#endif
    return FTN_FALSE;
  }

#else // KMP_USE_DYNAMIC_LOCK

  kmp_user_lock_p lck;
  int rc;

  // A TAS (or futex) lock whose poll word fits into OMP_LOCK_T_SIZE is used
  // directly in the user's storage; otherwise the lock object is looked up.
  if ((__kmp_user_lock_kind == lk_tas) &&
      (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
    lck = (kmp_user_lock_p)user_lock;
  }
#if KMP_USE_FUTEX
  else if ((__kmp_user_lock_kind == lk_futex) &&
           (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
    lck = (kmp_user_lock_p)user_lock;
  }
#endif
  else {
    lck = __kmp_lookup_user_lock(user_lock, "omp_test_lock");
  }

#if USE_ITT_BUILD
  __kmp_itt_lock_acquiring(lck);
#endif /* USE_ITT_BUILD */
#if OMPT_SUPPORT && OMPT_OPTIONAL
  // Take the code pointer from OMPT_LOAD_RETURN_ADDRESS; when that yields
  // NULL, fall back to OMPT_GET_RETURN_ADDRESS(0).
  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
  if (!codeptr)
    codeptr = OMPT_GET_RETURN_ADDRESS(0);
  if (ompt_enabled.ompt_callback_mutex_acquire) {
    ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
        ompt_mutex_lock, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
        (ompt_wait_id_t)(uintptr_t)lck, codeptr);
  }
#endif

  rc = TEST_LOCK(lck, gtid);
#if USE_ITT_BUILD
  if (rc) {
    __kmp_itt_lock_acquired(lck);
  } else {
    __kmp_itt_lock_cancelled(lck);
  }
#endif /* USE_ITT_BUILD */
#if OMPT_SUPPORT && OMPT_OPTIONAL
  if (rc && ompt_enabled.ompt_callback_mutex_acquired) {
    ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
        ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
  }
#endif

  // Normalise any non-zero result to FTN_TRUE.
  return (rc ? FTN_TRUE : FTN_FALSE);

  /* Can't use serial interval since not block structured */

#endif // KMP_USE_DYNAMIC_LOCK
}
3248 
3249 /* try to acquire the lock */
3250 int __kmpc_test_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
3251 #if KMP_USE_DYNAMIC_LOCK
3252   int rc;
3253 #if USE_ITT_BUILD
3254   __kmp_itt_lock_acquiring((kmp_user_lock_p)user_lock);
3255 #endif
3256 #if OMPT_SUPPORT && OMPT_OPTIONAL
3257   // This is the case, if called from omp_init_lock_with_hint:
3258   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
3259   if (!codeptr)
3260     codeptr = OMPT_GET_RETURN_ADDRESS(0);
3261   if (ompt_enabled.ompt_callback_mutex_acquire) {
3262     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
3263         ompt_mutex_nest_lock, omp_lock_hint_none,
3264         __ompt_get_mutex_impl_type(user_lock),
3265         (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
3266   }
3267 #endif
3268   rc = KMP_D_LOCK_FUNC(user_lock, test)((kmp_dyna_lock_t *)user_lock, gtid);
3269 #if USE_ITT_BUILD
3270   if (rc) {
3271     __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock);
3272   } else {
3273     __kmp_itt_lock_cancelled((kmp_user_lock_p)user_lock);
3274   }
3275 #endif
3276 #if OMPT_SUPPORT && OMPT_OPTIONAL
3277   if (ompt_enabled.enabled && rc) {
3278     if (rc == 1) {
3279       if (ompt_enabled.ompt_callback_mutex_acquired) {
3280         // lock_first
3281         ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
3282             ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)user_lock,
3283             codeptr);
3284       }
3285     } else {
3286       if (ompt_enabled.ompt_callback_nest_lock) {
3287         // lock_next
3288         ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
3289             ompt_scope_begin, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
3290       }
3291     }
3292   }
3293 #endif
3294   return rc;
3295 
3296 #else // KMP_USE_DYNAMIC_LOCK
3297 
3298   kmp_user_lock_p lck;
3299   int rc;
3300 
3301   if ((__kmp_user_lock_kind == lk_tas) &&
3302       (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
3303        OMP_NEST_LOCK_T_SIZE)) {
3304     lck = (kmp_user_lock_p)user_lock;
3305   }
3306 #if KMP_USE_FUTEX
3307   else if ((__kmp_user_lock_kind == lk_futex) &&
3308            (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
3309             OMP_NEST_LOCK_T_SIZE)) {
3310     lck = (kmp_user_lock_p)user_lock;
3311   }
3312 #endif
3313   else {
3314     lck = __kmp_lookup_user_lock(user_lock, "omp_test_nest_lock");
3315   }
3316 
3317 #if USE_ITT_BUILD
3318   __kmp_itt_lock_acquiring(lck);
3319 #endif /* USE_ITT_BUILD */
3320 
3321 #if OMPT_SUPPORT && OMPT_OPTIONAL
3322   // This is the case, if called from omp_init_lock_with_hint:
3323   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
3324   if (!codeptr)
3325     codeptr = OMPT_GET_RETURN_ADDRESS(0);
3326   if (ompt_enabled.enabled) &&
3327         ompt_enabled.ompt_callback_mutex_acquire) {
3328       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
3329           ompt_mutex_nest_lock, omp_lock_hint_none,
3330           __ompt_get_mutex_impl_type(), (ompt_wait_id_t)(uintptr_t)lck,
3331           codeptr);
3332     }
3333 #endif
3334 
3335   rc = TEST_NESTED_LOCK(lck, gtid);
3336 #if USE_ITT_BUILD
3337   if (rc) {
3338     __kmp_itt_lock_acquired(lck);
3339   } else {
3340     __kmp_itt_lock_cancelled(lck);
3341   }
3342 #endif /* USE_ITT_BUILD */
3343 #if OMPT_SUPPORT && OMPT_OPTIONAL
3344   if (ompt_enabled.enabled && rc) {
3345     if (rc == 1) {
3346       if (ompt_enabled.ompt_callback_mutex_acquired) {
3347         // lock_first
3348         ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
3349             ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
3350       }
3351     } else {
3352       if (ompt_enabled.ompt_callback_nest_lock) {
3353         // lock_next
3354         ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
3355             ompt_mutex_scope_begin, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
3356       }
3357     }
3358   }
3359 #endif
3360   return rc;
3361 
3362   /* Can't use serial interval since not block structured */
3363 
3364 #endif // KMP_USE_DYNAMIC_LOCK
3365 }
3366 
// Interface to the fast scalable reduce method routines

// The selected reduction method is kept in the thread's th_local structure so
// that it is available across functions: it will be used in the
// __kmpc_end_reduce* functions.
// An alternative would be to re-determine the method one more time in the
// __kmpc_end_reduce* functions (a new prototype would be required then).
// AT: which solution is better?

// Store rmethod in the packed_reduction_method field of thread gtid.
#define __KMP_SET_REDUCTION_METHOD(gtid, rmethod)                              \
  ((__kmp_threads[(gtid)]->th.th_local.packed_reduction_method) = (rmethod))

// Read the packed_reduction_method field of thread gtid.
#define __KMP_GET_REDUCTION_METHOD(gtid)                                       \
  (__kmp_threads[(gtid)]->th.th_local.packed_reduction_method)

// For a description of the packed_reduction_method variable, see the macros
// in kmp.h.
3382 
3383 // used in a critical section reduce block
3384 static __forceinline void
3385 __kmp_enter_critical_section_reduce_block(ident_t *loc, kmp_int32 global_tid,
3386                                           kmp_critical_name *crit) {
3387 
3388   // this lock was visible to a customer and to the threading profile tool as a
3389   // serial overhead span (although it's used for an internal purpose only)
3390   //            why was it visible in previous implementation?
3391   //            should we keep it visible in new reduce block?
3392   kmp_user_lock_p lck;
3393 
3394 #if KMP_USE_DYNAMIC_LOCK
3395 
3396   kmp_dyna_lock_t *lk = (kmp_dyna_lock_t *)crit;
3397   // Check if it is initialized.
3398   if (*lk == 0) {
3399     if (KMP_IS_D_LOCK(__kmp_user_lock_seq)) {
3400       KMP_COMPARE_AND_STORE_ACQ32((volatile kmp_int32 *)crit, 0,
3401                                   KMP_GET_D_TAG(__kmp_user_lock_seq));
3402     } else {
3403       __kmp_init_indirect_csptr(crit, loc, global_tid,
3404                                 KMP_GET_I_TAG(__kmp_user_lock_seq));
3405     }
3406   }
3407   // Branch for accessing the actual lock object and set operation. This
3408   // branching is inevitable since this lock initialization does not follow the
3409   // normal dispatch path (lock table is not used).
3410   if (KMP_EXTRACT_D_TAG(lk) != 0) {
3411     lck = (kmp_user_lock_p)lk;
3412     KMP_DEBUG_ASSERT(lck != NULL);
3413     if (__kmp_env_consistency_check) {
3414       __kmp_push_sync(global_tid, ct_critical, loc, lck, __kmp_user_lock_seq);
3415     }
3416     KMP_D_LOCK_FUNC(lk, set)(lk, global_tid);
3417   } else {
3418     kmp_indirect_lock_t *ilk = *((kmp_indirect_lock_t **)lk);
3419     lck = ilk->lock;
3420     KMP_DEBUG_ASSERT(lck != NULL);
3421     if (__kmp_env_consistency_check) {
3422       __kmp_push_sync(global_tid, ct_critical, loc, lck, __kmp_user_lock_seq);
3423     }
3424     KMP_I_LOCK_FUNC(ilk, set)(lck, global_tid);
3425   }
3426 
3427 #else // KMP_USE_DYNAMIC_LOCK
3428 
3429   // We know that the fast reduction code is only emitted by Intel compilers
3430   // with 32 byte critical sections. If there isn't enough space, then we
3431   // have to use a pointer.
3432   if (__kmp_base_user_lock_size <= INTEL_CRITICAL_SIZE) {
3433     lck = (kmp_user_lock_p)crit;
3434   } else {
3435     lck = __kmp_get_critical_section_ptr(crit, loc, global_tid);
3436   }
3437   KMP_DEBUG_ASSERT(lck != NULL);
3438 
3439   if (__kmp_env_consistency_check)
3440     __kmp_push_sync(global_tid, ct_critical, loc, lck);
3441 
3442   __kmp_acquire_user_lock_with_checks(lck, global_tid);
3443 
3444 #endif // KMP_USE_DYNAMIC_LOCK
3445 }
3446 
3447 // used in a critical section reduce block
3448 static __forceinline void
3449 __kmp_end_critical_section_reduce_block(ident_t *loc, kmp_int32 global_tid,
3450                                         kmp_critical_name *crit) {
3451 
3452   kmp_user_lock_p lck;
3453 
3454 #if KMP_USE_DYNAMIC_LOCK
3455 
3456   if (KMP_IS_D_LOCK(__kmp_user_lock_seq)) {
3457     lck = (kmp_user_lock_p)crit;
3458     if (__kmp_env_consistency_check)
3459       __kmp_pop_sync(global_tid, ct_critical, loc);
3460     KMP_D_LOCK_FUNC(lck, unset)((kmp_dyna_lock_t *)lck, global_tid);
3461   } else {
3462     kmp_indirect_lock_t *ilk =
3463         (kmp_indirect_lock_t *)TCR_PTR(*((kmp_indirect_lock_t **)crit));
3464     if (__kmp_env_consistency_check)
3465       __kmp_pop_sync(global_tid, ct_critical, loc);
3466     KMP_I_LOCK_FUNC(ilk, unset)(ilk->lock, global_tid);
3467   }
3468 
3469 #else // KMP_USE_DYNAMIC_LOCK
3470 
3471   // We know that the fast reduction code is only emitted by Intel compilers
3472   // with 32 byte critical sections. If there isn't enough space, then we have
3473   // to use a pointer.
3474   if (__kmp_base_user_lock_size > 32) {
3475     lck = *((kmp_user_lock_p *)crit);
3476     KMP_ASSERT(lck != NULL);
3477   } else {
3478     lck = (kmp_user_lock_p)crit;
3479   }
3480 
3481   if (__kmp_env_consistency_check)
3482     __kmp_pop_sync(global_tid, ct_critical, loc);
3483 
3484   __kmp_release_user_lock_with_checks(lck, global_tid);
3485 
3486 #endif // KMP_USE_DYNAMIC_LOCK
3487 } // __kmp_end_critical_section_reduce_block
3488 
3489 static __forceinline int
3490 __kmp_swap_teams_for_teams_reduction(kmp_info_t *th, kmp_team_t **team_p,
3491                                      int *task_state) {
3492   kmp_team_t *team;
3493 
3494   // Check if we are inside the teams construct?
3495   if (th->th.th_teams_microtask) {
3496     *team_p = team = th->th.th_team;
3497     if (team->t.t_level == th->th.th_teams_level) {
3498       // This is reduction at teams construct.
3499       KMP_DEBUG_ASSERT(!th->th.th_info.ds.ds_tid); // AC: check that tid == 0
3500       // Let's swap teams temporarily for the reduction.
3501       th->th.th_info.ds.ds_tid = team->t.t_master_tid;
3502       th->th.th_team = team->t.t_parent;
3503       th->th.th_team_nproc = th->th.th_team->t.t_nproc;
3504       th->th.th_task_team = th->th.th_team->t.t_task_team[0];
3505       *task_state = th->th.th_task_state;
3506       th->th.th_task_state = 0;
3507 
3508       return 1;
3509     }
3510   }
3511   return 0;
3512 }
3513 
3514 static __forceinline void
3515 __kmp_restore_swapped_teams(kmp_info_t *th, kmp_team_t *team, int task_state) {
3516   // Restore thread structure swapped in __kmp_swap_teams_for_teams_reduction.
3517   th->th.th_info.ds.ds_tid = 0;
3518   th->th.th_team = team;
3519   th->th.th_team_nproc = team->t.t_nproc;
3520   th->th.th_task_team = team->t.t_task_team[task_state];
3521   __kmp_type_convert(task_state, &(th->th.th_task_state));
3522 }
3523 
3524 /* 2.a.i. Reduce Block without a terminating barrier */
3525 /*!
3526 @ingroup SYNCHRONIZATION
3527 @param loc source location information
3528 @param global_tid global thread number
3529 @param num_vars number of items (variables) to be reduced
3530 @param reduce_size size of data in bytes to be reduced
3531 @param reduce_data pointer to data to be reduced
3532 @param reduce_func callback function providing reduction operation on two
3533 operands and returning result of reduction in lhs_data
3534 @param lck pointer to the unique lock data structure
3535 @result 1 for the primary thread, 0 for all other team threads, 2 for all team
3536 threads if atomic reduction needed
3537 
3538 The nowait version is used for a reduce clause with the nowait argument.
3539 */
3540 kmp_int32
3541 __kmpc_reduce_nowait(ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars,
3542                      size_t reduce_size, void *reduce_data,
3543                      void (*reduce_func)(void *lhs_data, void *rhs_data),
3544                      kmp_critical_name *lck) {
3545 
3546   KMP_COUNT_BLOCK(REDUCE_nowait);
3547   int retval = 0;
3548   PACKED_REDUCTION_METHOD_T packed_reduction_method;
3549   kmp_info_t *th;
3550   kmp_team_t *team;
3551   int teams_swapped = 0, task_state;
3552   KA_TRACE(10, ("__kmpc_reduce_nowait() enter: called T#%d\n", global_tid));
3553   __kmp_assert_valid_gtid(global_tid);
3554 
3555   // why do we need this initialization here at all?
3556   // Reduction clause can not be used as a stand-alone directive.
3557 
3558   // do not call __kmp_serial_initialize(), it will be called by
3559   // __kmp_parallel_initialize() if needed
3560   // possible detection of false-positive race by the threadchecker ???
3561   if (!TCR_4(__kmp_init_parallel))
3562     __kmp_parallel_initialize();
3563 
3564   __kmp_resume_if_soft_paused();
3565 
3566 // check correctness of reduce block nesting
3567 #if KMP_USE_DYNAMIC_LOCK
3568   if (__kmp_env_consistency_check)
3569     __kmp_push_sync(global_tid, ct_reduce, loc, NULL, 0);
3570 #else
3571   if (__kmp_env_consistency_check)
3572     __kmp_push_sync(global_tid, ct_reduce, loc, NULL);
3573 #endif
3574 
3575   th = __kmp_thread_from_gtid(global_tid);
3576   teams_swapped = __kmp_swap_teams_for_teams_reduction(th, &team, &task_state);
3577 
3578   // packed_reduction_method value will be reused by __kmp_end_reduce* function,
3579   // the value should be kept in a variable
3580   // the variable should be either a construct-specific or thread-specific
3581   // property, not a team specific property
3582   //     (a thread can reach the next reduce block on the next construct, reduce
3583   //     method may differ on the next construct)
3584   // an ident_t "loc" parameter could be used as a construct-specific property
3585   // (what if loc == 0?)
3586   //     (if both construct-specific and team-specific variables were shared,
3587   //     then unness extra syncs should be needed)
3588   // a thread-specific variable is better regarding two issues above (next
3589   // construct and extra syncs)
3590   // a thread-specific "th_local.reduction_method" variable is used currently
3591   // each thread executes 'determine' and 'set' lines (no need to execute by one
3592   // thread, to avoid unness extra syncs)
3593 
3594   packed_reduction_method = __kmp_determine_reduction_method(
3595       loc, global_tid, num_vars, reduce_size, reduce_data, reduce_func, lck);
3596   __KMP_SET_REDUCTION_METHOD(global_tid, packed_reduction_method);
3597 
3598   OMPT_REDUCTION_DECL(th, global_tid);
3599   if (packed_reduction_method == critical_reduce_block) {
3600 
3601     OMPT_REDUCTION_BEGIN;
3602 
3603     __kmp_enter_critical_section_reduce_block(loc, global_tid, lck);
3604     retval = 1;
3605 
3606   } else if (packed_reduction_method == empty_reduce_block) {
3607 
3608     OMPT_REDUCTION_BEGIN;
3609 
3610     // usage: if team size == 1, no synchronization is required ( Intel
3611     // platforms only )
3612     retval = 1;
3613 
3614   } else if (packed_reduction_method == atomic_reduce_block) {
3615 
3616     retval = 2;
3617 
3618     // all threads should do this pop here (because __kmpc_end_reduce_nowait()
3619     // won't be called by the code gen)
3620     //     (it's not quite good, because the checking block has been closed by
3621     //     this 'pop',
3622     //      but atomic operation has not been executed yet, will be executed
3623     //      slightly later, literally on next instruction)
3624     if (__kmp_env_consistency_check)
3625       __kmp_pop_sync(global_tid, ct_reduce, loc);
3626 
3627   } else if (TEST_REDUCTION_METHOD(packed_reduction_method,
3628                                    tree_reduce_block)) {
3629 
3630 // AT: performance issue: a real barrier here
3631 // AT: (if primary thread is slow, other threads are blocked here waiting for
3632 //      the primary thread to come and release them)
3633 // AT: (it's not what a customer might expect specifying NOWAIT clause)
3634 // AT: (specifying NOWAIT won't result in improvement of performance, it'll
3635 //      be confusing to a customer)
3636 // AT: another implementation of *barrier_gather*nowait() (or some other design)
3637 // might go faster and be more in line with sense of NOWAIT
3638 // AT: TO DO: do epcc test and compare times
3639 
3640 // this barrier should be invisible to a customer and to the threading profile
3641 // tool (it's neither a terminating barrier nor customer's code, it's
3642 // used for an internal purpose)
3643 #if OMPT_SUPPORT
3644     // JP: can this barrier potentially leed to task scheduling?
3645     // JP: as long as there is a barrier in the implementation, OMPT should and
3646     // will provide the barrier events
3647     //         so we set-up the necessary frame/return addresses.
3648     ompt_frame_t *ompt_frame;
3649     if (ompt_enabled.enabled) {
3650       __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
3651       if (ompt_frame->enter_frame.ptr == NULL)
3652         ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
3653     }
3654     OMPT_STORE_RETURN_ADDRESS(global_tid);
3655 #endif
3656 #if USE_ITT_NOTIFY
3657     __kmp_threads[global_tid]->th.th_ident = loc;
3658 #endif
3659     retval =
3660         __kmp_barrier(UNPACK_REDUCTION_BARRIER(packed_reduction_method),
3661                       global_tid, FALSE, reduce_size, reduce_data, reduce_func);
3662     retval = (retval != 0) ? (0) : (1);
3663 #if OMPT_SUPPORT && OMPT_OPTIONAL
3664     if (ompt_enabled.enabled) {
3665       ompt_frame->enter_frame = ompt_data_none;
3666     }
3667 #endif
3668 
3669     // all other workers except primary thread should do this pop here
3670     //     ( none of other workers will get to __kmpc_end_reduce_nowait() )
3671     if (__kmp_env_consistency_check) {
3672       if (retval == 0) {
3673         __kmp_pop_sync(global_tid, ct_reduce, loc);
3674       }
3675     }
3676 
3677   } else {
3678 
3679     // should never reach this block
3680     KMP_ASSERT(0); // "unexpected method"
3681   }
3682   if (teams_swapped) {
3683     __kmp_restore_swapped_teams(th, team, task_state);
3684   }
3685   KA_TRACE(
3686       10,
3687       ("__kmpc_reduce_nowait() exit: called T#%d: method %08x, returns %08x\n",
3688        global_tid, packed_reduction_method, retval));
3689 
3690   return retval;
3691 }
3692 
3693 /*!
3694 @ingroup SYNCHRONIZATION
3695 @param loc source location information
3696 @param global_tid global thread id.
3697 @param lck pointer to the unique lock data structure
3698 
3699 Finish the execution of a reduce nowait.
3700 */
3701 void __kmpc_end_reduce_nowait(ident_t *loc, kmp_int32 global_tid,
3702                               kmp_critical_name *lck) {
3703 
3704   PACKED_REDUCTION_METHOD_T packed_reduction_method;
3705 
3706   KA_TRACE(10, ("__kmpc_end_reduce_nowait() enter: called T#%d\n", global_tid));
3707   __kmp_assert_valid_gtid(global_tid);
3708 
3709   packed_reduction_method = __KMP_GET_REDUCTION_METHOD(global_tid);
3710 
3711   OMPT_REDUCTION_DECL(__kmp_thread_from_gtid(global_tid), global_tid);
3712 
3713   if (packed_reduction_method == critical_reduce_block) {
3714 
3715     __kmp_end_critical_section_reduce_block(loc, global_tid, lck);
3716     OMPT_REDUCTION_END;
3717 
3718   } else if (packed_reduction_method == empty_reduce_block) {
3719 
3720     // usage: if team size == 1, no synchronization is required ( on Intel
3721     // platforms only )
3722 
3723     OMPT_REDUCTION_END;
3724 
3725   } else if (packed_reduction_method == atomic_reduce_block) {
3726 
3727     // neither primary thread nor other workers should get here
3728     //     (code gen does not generate this call in case 2: atomic reduce block)
3729     // actually it's better to remove this elseif at all;
3730     // after removal this value will checked by the 'else' and will assert
3731 
3732   } else if (TEST_REDUCTION_METHOD(packed_reduction_method,
3733                                    tree_reduce_block)) {
3734 
3735     // only primary thread gets here
3736     // OMPT: tree reduction is annotated in the barrier code
3737 
3738   } else {
3739 
3740     // should never reach this block
3741     KMP_ASSERT(0); // "unexpected method"
3742   }
3743 
3744   if (__kmp_env_consistency_check)
3745     __kmp_pop_sync(global_tid, ct_reduce, loc);
3746 
3747   KA_TRACE(10, ("__kmpc_end_reduce_nowait() exit: called T#%d: method %08x\n",
3748                 global_tid, packed_reduction_method));
3749 
3750   return;
3751 }
3752 
3753 /* 2.a.ii. Reduce Block with a terminating barrier */
3754 
3755 /*!
3756 @ingroup SYNCHRONIZATION
3757 @param loc source location information
3758 @param global_tid global thread number
3759 @param num_vars number of items (variables) to be reduced
3760 @param reduce_size size of data in bytes to be reduced
3761 @param reduce_data pointer to data to be reduced
3762 @param reduce_func callback function providing reduction operation on two
3763 operands and returning result of reduction in lhs_data
3764 @param lck pointer to the unique lock data structure
3765 @result 1 for the primary thread, 0 for all other team threads, 2 for all team
3766 threads if atomic reduction needed
3767 
3768 A blocking reduce that includes an implicit barrier.
3769 */
3770 kmp_int32 __kmpc_reduce(ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars,
3771                         size_t reduce_size, void *reduce_data,
3772                         void (*reduce_func)(void *lhs_data, void *rhs_data),
3773                         kmp_critical_name *lck) {
3774   KMP_COUNT_BLOCK(REDUCE_wait);
3775   int retval = 0;
3776   PACKED_REDUCTION_METHOD_T packed_reduction_method;
3777   kmp_info_t *th;
3778   kmp_team_t *team;
3779   int teams_swapped = 0, task_state;
3780 
3781   KA_TRACE(10, ("__kmpc_reduce() enter: called T#%d\n", global_tid));
3782   __kmp_assert_valid_gtid(global_tid);
3783 
3784   // why do we need this initialization here at all?
3785   // Reduction clause can not be a stand-alone directive.
3786 
3787   // do not call __kmp_serial_initialize(), it will be called by
3788   // __kmp_parallel_initialize() if needed
3789   // possible detection of false-positive race by the threadchecker ???
3790   if (!TCR_4(__kmp_init_parallel))
3791     __kmp_parallel_initialize();
3792 
3793   __kmp_resume_if_soft_paused();
3794 
3795 // check correctness of reduce block nesting
3796 #if KMP_USE_DYNAMIC_LOCK
3797   if (__kmp_env_consistency_check)
3798     __kmp_push_sync(global_tid, ct_reduce, loc, NULL, 0);
3799 #else
3800   if (__kmp_env_consistency_check)
3801     __kmp_push_sync(global_tid, ct_reduce, loc, NULL);
3802 #endif
3803 
3804   th = __kmp_thread_from_gtid(global_tid);
3805   teams_swapped = __kmp_swap_teams_for_teams_reduction(th, &team, &task_state);
3806 
3807   packed_reduction_method = __kmp_determine_reduction_method(
3808       loc, global_tid, num_vars, reduce_size, reduce_data, reduce_func, lck);
3809   __KMP_SET_REDUCTION_METHOD(global_tid, packed_reduction_method);
3810 
3811   OMPT_REDUCTION_DECL(th, global_tid);
3812 
3813   if (packed_reduction_method == critical_reduce_block) {
3814 
3815     OMPT_REDUCTION_BEGIN;
3816     __kmp_enter_critical_section_reduce_block(loc, global_tid, lck);
3817     retval = 1;
3818 
3819   } else if (packed_reduction_method == empty_reduce_block) {
3820 
3821     OMPT_REDUCTION_BEGIN;
3822     // usage: if team size == 1, no synchronization is required ( Intel
3823     // platforms only )
3824     retval = 1;
3825 
3826   } else if (packed_reduction_method == atomic_reduce_block) {
3827 
3828     retval = 2;
3829 
3830   } else if (TEST_REDUCTION_METHOD(packed_reduction_method,
3831                                    tree_reduce_block)) {
3832 
3833 // case tree_reduce_block:
3834 // this barrier should be visible to a customer and to the threading profile
3835 // tool (it's a terminating barrier on constructs if NOWAIT not specified)
3836 #if OMPT_SUPPORT
3837     ompt_frame_t *ompt_frame;
3838     if (ompt_enabled.enabled) {
3839       __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
3840       if (ompt_frame->enter_frame.ptr == NULL)
3841         ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
3842     }
3843     OMPT_STORE_RETURN_ADDRESS(global_tid);
3844 #endif
3845 #if USE_ITT_NOTIFY
3846     __kmp_threads[global_tid]->th.th_ident =
3847         loc; // needed for correct notification of frames
3848 #endif
3849     retval =
3850         __kmp_barrier(UNPACK_REDUCTION_BARRIER(packed_reduction_method),
3851                       global_tid, TRUE, reduce_size, reduce_data, reduce_func);
3852     retval = (retval != 0) ? (0) : (1);
3853 #if OMPT_SUPPORT && OMPT_OPTIONAL
3854     if (ompt_enabled.enabled) {
3855       ompt_frame->enter_frame = ompt_data_none;
3856     }
3857 #endif
3858 
3859     // all other workers except primary thread should do this pop here
3860     // (none of other workers except primary will enter __kmpc_end_reduce())
3861     if (__kmp_env_consistency_check) {
3862       if (retval == 0) { // 0: all other workers; 1: primary thread
3863         __kmp_pop_sync(global_tid, ct_reduce, loc);
3864       }
3865     }
3866 
3867   } else {
3868 
3869     // should never reach this block
3870     KMP_ASSERT(0); // "unexpected method"
3871   }
3872   if (teams_swapped) {
3873     __kmp_restore_swapped_teams(th, team, task_state);
3874   }
3875 
3876   KA_TRACE(10,
3877            ("__kmpc_reduce() exit: called T#%d: method %08x, returns %08x\n",
3878             global_tid, packed_reduction_method, retval));
3879   return retval;
3880 }
3881 
3882 /*!
3883 @ingroup SYNCHRONIZATION
3884 @param loc source location information
3885 @param global_tid global thread id.
3886 @param lck pointer to the unique lock data structure
3887 
3888 Finish the execution of a blocking reduce.
3889 The <tt>lck</tt> pointer must be the same as that used in the corresponding
3890 start function.
3891 */
3892 void __kmpc_end_reduce(ident_t *loc, kmp_int32 global_tid,
3893                        kmp_critical_name *lck) {
3894 
3895   PACKED_REDUCTION_METHOD_T packed_reduction_method;
3896   kmp_info_t *th;
3897   kmp_team_t *team;
3898   int teams_swapped = 0, task_state;
3899 
3900   KA_TRACE(10, ("__kmpc_end_reduce() enter: called T#%d\n", global_tid));
3901   __kmp_assert_valid_gtid(global_tid);
3902 
3903   th = __kmp_thread_from_gtid(global_tid);
3904   teams_swapped = __kmp_swap_teams_for_teams_reduction(th, &team, &task_state);
3905 
3906   packed_reduction_method = __KMP_GET_REDUCTION_METHOD(global_tid);
3907 
3908   // this barrier should be visible to a customer and to the threading profile
3909   // tool (it's a terminating barrier on constructs if NOWAIT not specified)
3910   OMPT_REDUCTION_DECL(th, global_tid);
3911 
3912   if (packed_reduction_method == critical_reduce_block) {
3913     __kmp_end_critical_section_reduce_block(loc, global_tid, lck);
3914 
3915     OMPT_REDUCTION_END;
3916 
3917 // TODO: implicit barrier: should be exposed
3918 #if OMPT_SUPPORT
3919     ompt_frame_t *ompt_frame;
3920     if (ompt_enabled.enabled) {
3921       __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
3922       if (ompt_frame->enter_frame.ptr == NULL)
3923         ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
3924     }
3925     OMPT_STORE_RETURN_ADDRESS(global_tid);
3926 #endif
3927 #if USE_ITT_NOTIFY
3928     __kmp_threads[global_tid]->th.th_ident = loc;
3929 #endif
3930     __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
3931 #if OMPT_SUPPORT && OMPT_OPTIONAL
3932     if (ompt_enabled.enabled) {
3933       ompt_frame->enter_frame = ompt_data_none;
3934     }
3935 #endif
3936 
3937   } else if (packed_reduction_method == empty_reduce_block) {
3938 
3939     OMPT_REDUCTION_END;
3940 
3941 // usage: if team size==1, no synchronization is required (Intel platforms only)
3942 
3943 // TODO: implicit barrier: should be exposed
3944 #if OMPT_SUPPORT
3945     ompt_frame_t *ompt_frame;
3946     if (ompt_enabled.enabled) {
3947       __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
3948       if (ompt_frame->enter_frame.ptr == NULL)
3949         ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
3950     }
3951     OMPT_STORE_RETURN_ADDRESS(global_tid);
3952 #endif
3953 #if USE_ITT_NOTIFY
3954     __kmp_threads[global_tid]->th.th_ident = loc;
3955 #endif
3956     __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
3957 #if OMPT_SUPPORT && OMPT_OPTIONAL
3958     if (ompt_enabled.enabled) {
3959       ompt_frame->enter_frame = ompt_data_none;
3960     }
3961 #endif
3962 
3963   } else if (packed_reduction_method == atomic_reduce_block) {
3964 
3965 #if OMPT_SUPPORT
3966     ompt_frame_t *ompt_frame;
3967     if (ompt_enabled.enabled) {
3968       __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
3969       if (ompt_frame->enter_frame.ptr == NULL)
3970         ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
3971     }
3972     OMPT_STORE_RETURN_ADDRESS(global_tid);
3973 #endif
3974 // TODO: implicit barrier: should be exposed
3975 #if USE_ITT_NOTIFY
3976     __kmp_threads[global_tid]->th.th_ident = loc;
3977 #endif
3978     __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
3979 #if OMPT_SUPPORT && OMPT_OPTIONAL
3980     if (ompt_enabled.enabled) {
3981       ompt_frame->enter_frame = ompt_data_none;
3982     }
3983 #endif
3984 
3985   } else if (TEST_REDUCTION_METHOD(packed_reduction_method,
3986                                    tree_reduce_block)) {
3987 
3988     // only primary thread executes here (primary releases all other workers)
3989     __kmp_end_split_barrier(UNPACK_REDUCTION_BARRIER(packed_reduction_method),
3990                             global_tid);
3991 
3992   } else {
3993 
3994     // should never reach this block
3995     KMP_ASSERT(0); // "unexpected method"
3996   }
3997   if (teams_swapped) {
3998     __kmp_restore_swapped_teams(th, team, task_state);
3999   }
4000 
4001   if (__kmp_env_consistency_check)
4002     __kmp_pop_sync(global_tid, ct_reduce, loc);
4003 
4004   KA_TRACE(10, ("__kmpc_end_reduce() exit: called T#%d: method %08x\n",
4005                 global_tid, packed_reduction_method));
4006 
4007   return;
4008 }
4009 
// The reduction-method accessor macros are only used by the reduce entry
// points above, so they are removed again here.
#undef __KMP_GET_REDUCTION_METHOD
#undef __KMP_SET_REDUCTION_METHOD

/* end of interface to fast scalable reduce routines */
4014 
4015 kmp_uint64 __kmpc_get_taskid() {
4016 
4017   kmp_int32 gtid;
4018   kmp_info_t *thread;
4019 
4020   gtid = __kmp_get_gtid();
4021   if (gtid < 0) {
4022     return 0;
4023   }
4024   thread = __kmp_thread_from_gtid(gtid);
4025   return thread->th.th_current_task->td_task_id;
4026 
4027 } // __kmpc_get_taskid
4028 
4029 kmp_uint64 __kmpc_get_parent_taskid() {
4030 
4031   kmp_int32 gtid;
4032   kmp_info_t *thread;
4033   kmp_taskdata_t *parent_task;
4034 
4035   gtid = __kmp_get_gtid();
4036   if (gtid < 0) {
4037     return 0;
4038   }
4039   thread = __kmp_thread_from_gtid(gtid);
4040   parent_task = thread->th.th_current_task->td_parent;
4041   return (parent_task == NULL ? 0 : parent_task->td_task_id);
4042 
4043 } // __kmpc_get_parent_taskid
4044 
4045 /*!
4046 @ingroup WORK_SHARING
4047 @param loc  source location information.
4048 @param gtid  global thread number.
4049 @param num_dims  number of associated doacross loops.
4050 @param dims  info on loops bounds.
4051 
4052 Initialize doacross loop information.
4053 Expect compiler send us inclusive bounds,
4054 e.g. for(i=2;i<9;i+=2) lo=2, up=8, st=2.
4055 */
4056 void __kmpc_doacross_init(ident_t *loc, int gtid, int num_dims,
4057                           const struct kmp_dim *dims) {
4058   __kmp_assert_valid_gtid(gtid);
4059   int j, idx;
4060   kmp_int64 last, trace_count;
4061   kmp_info_t *th = __kmp_threads[gtid];
4062   kmp_team_t *team = th->th.th_team;
4063   kmp_uint32 *flags;
4064   kmp_disp_t *pr_buf = th->th.th_dispatch;
4065   dispatch_shared_info_t *sh_buf;
4066 
4067   KA_TRACE(
4068       20,
4069       ("__kmpc_doacross_init() enter: called T#%d, num dims %d, active %d\n",
4070        gtid, num_dims, !team->t.t_serialized));
4071   KMP_DEBUG_ASSERT(dims != NULL);
4072   KMP_DEBUG_ASSERT(num_dims > 0);
4073 
4074   if (team->t.t_serialized) {
4075     KA_TRACE(20, ("__kmpc_doacross_init() exit: serialized team\n"));
4076     return; // no dependencies if team is serialized
4077   }
4078   KMP_DEBUG_ASSERT(team->t.t_nproc > 1);
4079   idx = pr_buf->th_doacross_buf_idx++; // Increment index of shared buffer for
4080   // the next loop
4081   sh_buf = &team->t.t_disp_buffer[idx % __kmp_dispatch_num_buffers];
4082 
4083   // Save bounds info into allocated private buffer
4084   KMP_DEBUG_ASSERT(pr_buf->th_doacross_info == NULL);
4085   pr_buf->th_doacross_info = (kmp_int64 *)__kmp_thread_malloc(
4086       th, sizeof(kmp_int64) * (4 * num_dims + 1));
4087   KMP_DEBUG_ASSERT(pr_buf->th_doacross_info != NULL);
4088   pr_buf->th_doacross_info[0] =
4089       (kmp_int64)num_dims; // first element is number of dimensions
4090   // Save also address of num_done in order to access it later without knowing
4091   // the buffer index
4092   pr_buf->th_doacross_info[1] = (kmp_int64)&sh_buf->doacross_num_done;
4093   pr_buf->th_doacross_info[2] = dims[0].lo;
4094   pr_buf->th_doacross_info[3] = dims[0].up;
4095   pr_buf->th_doacross_info[4] = dims[0].st;
4096   last = 5;
4097   for (j = 1; j < num_dims; ++j) {
4098     kmp_int64
4099         range_length; // To keep ranges of all dimensions but the first dims[0]
4100     if (dims[j].st == 1) { // most common case
4101       // AC: should we care of ranges bigger than LLONG_MAX? (not for now)
4102       range_length = dims[j].up - dims[j].lo + 1;
4103     } else {
4104       if (dims[j].st > 0) {
4105         KMP_DEBUG_ASSERT(dims[j].up > dims[j].lo);
4106         range_length = (kmp_uint64)(dims[j].up - dims[j].lo) / dims[j].st + 1;
4107       } else { // negative increment
4108         KMP_DEBUG_ASSERT(dims[j].lo > dims[j].up);
4109         range_length =
4110             (kmp_uint64)(dims[j].lo - dims[j].up) / (-dims[j].st) + 1;
4111       }
4112     }
4113     pr_buf->th_doacross_info[last++] = range_length;
4114     pr_buf->th_doacross_info[last++] = dims[j].lo;
4115     pr_buf->th_doacross_info[last++] = dims[j].up;
4116     pr_buf->th_doacross_info[last++] = dims[j].st;
4117   }
4118 
4119   // Compute total trip count.
4120   // Start with range of dims[0] which we don't need to keep in the buffer.
4121   if (dims[0].st == 1) { // most common case
4122     trace_count = dims[0].up - dims[0].lo + 1;
4123   } else if (dims[0].st > 0) {
4124     KMP_DEBUG_ASSERT(dims[0].up > dims[0].lo);
4125     trace_count = (kmp_uint64)(dims[0].up - dims[0].lo) / dims[0].st + 1;
4126   } else { // negative increment
4127     KMP_DEBUG_ASSERT(dims[0].lo > dims[0].up);
4128     trace_count = (kmp_uint64)(dims[0].lo - dims[0].up) / (-dims[0].st) + 1;
4129   }
4130   for (j = 1; j < num_dims; ++j) {
4131     trace_count *= pr_buf->th_doacross_info[4 * j + 1]; // use kept ranges
4132   }
4133   KMP_DEBUG_ASSERT(trace_count > 0);
4134 
4135   // Check if shared buffer is not occupied by other loop (idx -
4136   // __kmp_dispatch_num_buffers)
4137   if (idx != sh_buf->doacross_buf_idx) {
4138     // Shared buffer is occupied, wait for it to be free
4139     __kmp_wait_4((volatile kmp_uint32 *)&sh_buf->doacross_buf_idx, idx,
4140                  __kmp_eq_4, NULL);
4141   }
4142 #if KMP_32_BIT_ARCH
4143   // Check if we are the first thread. After the CAS the first thread gets 0,
4144   // others get 1 if initialization is in progress, allocated pointer otherwise.
4145   // Treat pointer as volatile integer (value 0 or 1) until memory is allocated.
4146   flags = (kmp_uint32 *)KMP_COMPARE_AND_STORE_RET32(
4147       (volatile kmp_int32 *)&sh_buf->doacross_flags, NULL, 1);
4148 #else
4149   flags = (kmp_uint32 *)KMP_COMPARE_AND_STORE_RET64(
4150       (volatile kmp_int64 *)&sh_buf->doacross_flags, NULL, 1LL);
4151 #endif
4152   if (flags == NULL) {
4153     // we are the first thread, allocate the array of flags
4154     size_t size =
4155         (size_t)trace_count / 8 + 8; // in bytes, use single bit per iteration
4156     flags = (kmp_uint32 *)__kmp_thread_calloc(th, size, 1);
4157     KMP_MB();
4158     sh_buf->doacross_flags = flags;
4159   } else if (flags == (kmp_uint32 *)1) {
4160 #if KMP_32_BIT_ARCH
4161     // initialization is still in progress, need to wait
4162     while (*(volatile kmp_int32 *)&sh_buf->doacross_flags == 1)
4163 #else
4164     while (*(volatile kmp_int64 *)&sh_buf->doacross_flags == 1LL)
4165 #endif
4166       KMP_YIELD(TRUE);
4167     KMP_MB();
4168   } else {
4169     KMP_MB();
4170   }
4171   KMP_DEBUG_ASSERT(sh_buf->doacross_flags > (kmp_uint32 *)1); // check ptr value
4172   pr_buf->th_doacross_flags =
4173       sh_buf->doacross_flags; // save private copy in order to not
4174   // touch shared buffer on each iteration
4175   KA_TRACE(20, ("__kmpc_doacross_init() exit: T#%d\n", gtid));
4176 }
4177 
4178 void __kmpc_doacross_wait(ident_t *loc, int gtid, const kmp_int64 *vec) {
4179   __kmp_assert_valid_gtid(gtid);
4180   kmp_int64 shft;
4181   size_t num_dims, i;
4182   kmp_uint32 flag;
4183   kmp_int64 iter_number; // iteration number of "collapsed" loop nest
4184   kmp_info_t *th = __kmp_threads[gtid];
4185   kmp_team_t *team = th->th.th_team;
4186   kmp_disp_t *pr_buf;
4187   kmp_int64 lo, up, st;
4188 
4189   KA_TRACE(20, ("__kmpc_doacross_wait() enter: called T#%d\n", gtid));
4190   if (team->t.t_serialized) {
4191     KA_TRACE(20, ("__kmpc_doacross_wait() exit: serialized team\n"));
4192     return; // no dependencies if team is serialized
4193   }
4194 
4195   // calculate sequential iteration number and check out-of-bounds condition
4196   pr_buf = th->th.th_dispatch;
4197   KMP_DEBUG_ASSERT(pr_buf->th_doacross_info != NULL);
4198   num_dims = (size_t)pr_buf->th_doacross_info[0];
4199   lo = pr_buf->th_doacross_info[2];
4200   up = pr_buf->th_doacross_info[3];
4201   st = pr_buf->th_doacross_info[4];
4202 #if OMPT_SUPPORT && OMPT_OPTIONAL
4203   ompt_dependence_t deps[num_dims];
4204 #endif
4205   if (st == 1) { // most common case
4206     if (vec[0] < lo || vec[0] > up) {
4207       KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
4208                     "bounds [%lld,%lld]\n",
4209                     gtid, vec[0], lo, up));
4210       return;
4211     }
4212     iter_number = vec[0] - lo;
4213   } else if (st > 0) {
4214     if (vec[0] < lo || vec[0] > up) {
4215       KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
4216                     "bounds [%lld,%lld]\n",
4217                     gtid, vec[0], lo, up));
4218       return;
4219     }
4220     iter_number = (kmp_uint64)(vec[0] - lo) / st;
4221   } else { // negative increment
4222     if (vec[0] > lo || vec[0] < up) {
4223       KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
4224                     "bounds [%lld,%lld]\n",
4225                     gtid, vec[0], lo, up));
4226       return;
4227     }
4228     iter_number = (kmp_uint64)(lo - vec[0]) / (-st);
4229   }
4230 #if OMPT_SUPPORT && OMPT_OPTIONAL
4231   deps[0].variable.value = iter_number;
4232   deps[0].dependence_type = ompt_dependence_type_sink;
4233 #endif
4234   for (i = 1; i < num_dims; ++i) {
4235     kmp_int64 iter, ln;
4236     size_t j = i * 4;
4237     ln = pr_buf->th_doacross_info[j + 1];
4238     lo = pr_buf->th_doacross_info[j + 2];
4239     up = pr_buf->th_doacross_info[j + 3];
4240     st = pr_buf->th_doacross_info[j + 4];
4241     if (st == 1) {
4242       if (vec[i] < lo || vec[i] > up) {
4243         KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
4244                       "bounds [%lld,%lld]\n",
4245                       gtid, vec[i], lo, up));
4246         return;
4247       }
4248       iter = vec[i] - lo;
4249     } else if (st > 0) {
4250       if (vec[i] < lo || vec[i] > up) {
4251         KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
4252                       "bounds [%lld,%lld]\n",
4253                       gtid, vec[i], lo, up));
4254         return;
4255       }
4256       iter = (kmp_uint64)(vec[i] - lo) / st;
4257     } else { // st < 0
4258       if (vec[i] > lo || vec[i] < up) {
4259         KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
4260                       "bounds [%lld,%lld]\n",
4261                       gtid, vec[i], lo, up));
4262         return;
4263       }
4264       iter = (kmp_uint64)(lo - vec[i]) / (-st);
4265     }
4266     iter_number = iter + ln * iter_number;
4267 #if OMPT_SUPPORT && OMPT_OPTIONAL
4268     deps[i].variable.value = iter;
4269     deps[i].dependence_type = ompt_dependence_type_sink;
4270 #endif
4271   }
4272   shft = iter_number % 32; // use 32-bit granularity
4273   iter_number >>= 5; // divided by 32
4274   flag = 1 << shft;
4275   while ((flag & pr_buf->th_doacross_flags[iter_number]) == 0) {
4276     KMP_YIELD(TRUE);
4277   }
4278   KMP_MB();
4279 #if OMPT_SUPPORT && OMPT_OPTIONAL
4280   if (ompt_enabled.ompt_callback_dependences) {
4281     ompt_callbacks.ompt_callback(ompt_callback_dependences)(
4282         &(OMPT_CUR_TASK_INFO(th)->task_data), deps, (kmp_uint32)num_dims);
4283   }
4284 #endif
4285   KA_TRACE(20,
4286            ("__kmpc_doacross_wait() exit: T#%d wait for iter %lld completed\n",
4287             gtid, (iter_number << 5) + shft));
4288 }
4289 
4290 void __kmpc_doacross_post(ident_t *loc, int gtid, const kmp_int64 *vec) {
4291   __kmp_assert_valid_gtid(gtid);
4292   kmp_int64 shft;
4293   size_t num_dims, i;
4294   kmp_uint32 flag;
4295   kmp_int64 iter_number; // iteration number of "collapsed" loop nest
4296   kmp_info_t *th = __kmp_threads[gtid];
4297   kmp_team_t *team = th->th.th_team;
4298   kmp_disp_t *pr_buf;
4299   kmp_int64 lo, st;
4300 
4301   KA_TRACE(20, ("__kmpc_doacross_post() enter: called T#%d\n", gtid));
4302   if (team->t.t_serialized) {
4303     KA_TRACE(20, ("__kmpc_doacross_post() exit: serialized team\n"));
4304     return; // no dependencies if team is serialized
4305   }
4306 
4307   // calculate sequential iteration number (same as in "wait" but no
4308   // out-of-bounds checks)
4309   pr_buf = th->th.th_dispatch;
4310   KMP_DEBUG_ASSERT(pr_buf->th_doacross_info != NULL);
4311   num_dims = (size_t)pr_buf->th_doacross_info[0];
4312   lo = pr_buf->th_doacross_info[2];
4313   st = pr_buf->th_doacross_info[4];
4314 #if OMPT_SUPPORT && OMPT_OPTIONAL
4315   ompt_dependence_t deps[num_dims];
4316 #endif
4317   if (st == 1) { // most common case
4318     iter_number = vec[0] - lo;
4319   } else if (st > 0) {
4320     iter_number = (kmp_uint64)(vec[0] - lo) / st;
4321   } else { // negative increment
4322     iter_number = (kmp_uint64)(lo - vec[0]) / (-st);
4323   }
4324 #if OMPT_SUPPORT && OMPT_OPTIONAL
4325   deps[0].variable.value = iter_number;
4326   deps[0].dependence_type = ompt_dependence_type_source;
4327 #endif
4328   for (i = 1; i < num_dims; ++i) {
4329     kmp_int64 iter, ln;
4330     size_t j = i * 4;
4331     ln = pr_buf->th_doacross_info[j + 1];
4332     lo = pr_buf->th_doacross_info[j + 2];
4333     st = pr_buf->th_doacross_info[j + 4];
4334     if (st == 1) {
4335       iter = vec[i] - lo;
4336     } else if (st > 0) {
4337       iter = (kmp_uint64)(vec[i] - lo) / st;
4338     } else { // st < 0
4339       iter = (kmp_uint64)(lo - vec[i]) / (-st);
4340     }
4341     iter_number = iter + ln * iter_number;
4342 #if OMPT_SUPPORT && OMPT_OPTIONAL
4343     deps[i].variable.value = iter;
4344     deps[i].dependence_type = ompt_dependence_type_source;
4345 #endif
4346   }
4347 #if OMPT_SUPPORT && OMPT_OPTIONAL
4348   if (ompt_enabled.ompt_callback_dependences) {
4349     ompt_callbacks.ompt_callback(ompt_callback_dependences)(
4350         &(OMPT_CUR_TASK_INFO(th)->task_data), deps, (kmp_uint32)num_dims);
4351   }
4352 #endif
4353   shft = iter_number % 32; // use 32-bit granularity
4354   iter_number >>= 5; // divided by 32
4355   flag = 1 << shft;
4356   KMP_MB();
4357   if ((flag & pr_buf->th_doacross_flags[iter_number]) == 0)
4358     KMP_TEST_THEN_OR32(&pr_buf->th_doacross_flags[iter_number], flag);
4359   KA_TRACE(20, ("__kmpc_doacross_post() exit: T#%d iter %lld posted\n", gtid,
4360                 (iter_number << 5) + shft));
4361 }
4362 
4363 void __kmpc_doacross_fini(ident_t *loc, int gtid) {
4364   __kmp_assert_valid_gtid(gtid);
4365   kmp_int32 num_done;
4366   kmp_info_t *th = __kmp_threads[gtid];
4367   kmp_team_t *team = th->th.th_team;
4368   kmp_disp_t *pr_buf = th->th.th_dispatch;
4369 
4370   KA_TRACE(20, ("__kmpc_doacross_fini() enter: called T#%d\n", gtid));
4371   if (team->t.t_serialized) {
4372     KA_TRACE(20, ("__kmpc_doacross_fini() exit: serialized team %p\n", team));
4373     return; // nothing to do
4374   }
4375   num_done =
4376       KMP_TEST_THEN_INC32((kmp_uintptr_t)(pr_buf->th_doacross_info[1])) + 1;
4377   if (num_done == th->th.th_team_nproc) {
4378     // we are the last thread, need to free shared resources
4379     int idx = pr_buf->th_doacross_buf_idx - 1;
4380     dispatch_shared_info_t *sh_buf =
4381         &team->t.t_disp_buffer[idx % __kmp_dispatch_num_buffers];
4382     KMP_DEBUG_ASSERT(pr_buf->th_doacross_info[1] ==
4383                      (kmp_int64)&sh_buf->doacross_num_done);
4384     KMP_DEBUG_ASSERT(num_done == sh_buf->doacross_num_done);
4385     KMP_DEBUG_ASSERT(idx == sh_buf->doacross_buf_idx);
4386     __kmp_thread_free(th, CCAST(kmp_uint32 *, sh_buf->doacross_flags));
4387     sh_buf->doacross_flags = NULL;
4388     sh_buf->doacross_num_done = 0;
4389     sh_buf->doacross_buf_idx +=
4390         __kmp_dispatch_num_buffers; // free buffer for future re-use
4391   }
4392   // free private resources (need to keep buffer index forever)
4393   pr_buf->th_doacross_flags = NULL;
4394   __kmp_thread_free(th, (void *)pr_buf->th_doacross_info);
4395   pr_buf->th_doacross_info = NULL;
4396   KA_TRACE(20, ("__kmpc_doacross_fini() exit: T#%d\n", gtid));
4397 }
4398 
4399 /* OpenMP 5.1 Memory Management routines */
4400 void *omp_alloc(size_t size, omp_allocator_handle_t allocator) {
4401   return __kmp_alloc(__kmp_entry_gtid(), 0, size, allocator);
4402 }
4403 
4404 void *omp_aligned_alloc(size_t align, size_t size,
4405                         omp_allocator_handle_t allocator) {
4406   return __kmp_alloc(__kmp_entry_gtid(), align, size, allocator);
4407 }
4408 
4409 void *omp_calloc(size_t nmemb, size_t size, omp_allocator_handle_t allocator) {
4410   return __kmp_calloc(__kmp_entry_gtid(), 0, nmemb, size, allocator);
4411 }
4412 
4413 void *omp_aligned_calloc(size_t align, size_t nmemb, size_t size,
4414                          omp_allocator_handle_t allocator) {
4415   return __kmp_calloc(__kmp_entry_gtid(), align, nmemb, size, allocator);
4416 }
4417 
4418 void *omp_realloc(void *ptr, size_t size, omp_allocator_handle_t allocator,
4419                   omp_allocator_handle_t free_allocator) {
4420   return __kmp_realloc(__kmp_entry_gtid(), ptr, size, allocator,
4421                        free_allocator);
4422 }
4423 
4424 void omp_free(void *ptr, omp_allocator_handle_t allocator) {
4425   ___kmpc_free(__kmp_entry_gtid(), ptr, allocator);
4426 }
4427 /* end of OpenMP 5.1 Memory Management routines */
4428 
4429 int __kmpc_get_target_offload(void) {
4430   if (!__kmp_init_serial) {
4431     __kmp_serial_initialize();
4432   }
4433   return __kmp_target_offload;
4434 }
4435 
4436 int __kmpc_pause_resource(kmp_pause_status_t level) {
4437   if (!__kmp_init_serial) {
4438     return 1; // Can't pause if runtime is not initialized
4439   }
4440   return __kmp_pause_resource(level);
4441 }
4442 
4443 void __kmpc_error(ident_t *loc, int severity, const char *message) {
4444   if (!__kmp_init_serial)
4445     __kmp_serial_initialize();
4446 
4447   KMP_ASSERT(severity == severity_warning || severity == severity_fatal);
4448 
4449 #if OMPT_SUPPORT
4450   if (ompt_enabled.enabled && ompt_enabled.ompt_callback_error) {
4451     ompt_callbacks.ompt_callback(ompt_callback_error)(
4452         (ompt_severity_t)severity, message, KMP_STRLEN(message),
4453         OMPT_GET_RETURN_ADDRESS(0));
4454   }
4455 #endif // OMPT_SUPPORT
4456 
4457   char *src_loc;
4458   if (loc && loc->psource) {
4459     kmp_str_loc_t str_loc = __kmp_str_loc_init(loc->psource, false);
4460     src_loc =
4461         __kmp_str_format("%s:%s:%s", str_loc.file, str_loc.line, str_loc.col);
4462     __kmp_str_loc_free(&str_loc);
4463   } else {
4464     src_loc = __kmp_str_format("unknown");
4465   }
4466 
4467   if (severity == severity_warning)
4468     KMP_WARNING(UserDirectedWarning, src_loc, message);
4469   else
4470     KMP_FATAL(UserDirectedError, src_loc, message);
4471 
4472   __kmp_str_free(&src_loc);
4473 }
4474 
4475 // Mark begin of scope directive.
4476 void __kmpc_scope(ident_t *loc, kmp_int32 gtid, void *reserved) {
4477 // reserved is for extension of scope directive and not used.
4478 #if OMPT_SUPPORT && OMPT_OPTIONAL
4479   if (ompt_enabled.enabled && ompt_enabled.ompt_callback_work) {
4480     kmp_team_t *team = __kmp_threads[gtid]->th.th_team;
4481     int tid = __kmp_tid_from_gtid(gtid);
4482     ompt_callbacks.ompt_callback(ompt_callback_work)(
4483         ompt_work_scope, ompt_scope_begin,
4484         &(team->t.ompt_team_info.parallel_data),
4485         &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data), 1,
4486         OMPT_GET_RETURN_ADDRESS(0));
4487   }
4488 #endif // OMPT_SUPPORT && OMPT_OPTIONAL
4489 }
4490 
4491 // Mark end of scope directive
4492 void __kmpc_end_scope(ident_t *loc, kmp_int32 gtid, void *reserved) {
4493 // reserved is for extension of scope directive and not used.
4494 #if OMPT_SUPPORT && OMPT_OPTIONAL
4495   if (ompt_enabled.enabled && ompt_enabled.ompt_callback_work) {
4496     kmp_team_t *team = __kmp_threads[gtid]->th.th_team;
4497     int tid = __kmp_tid_from_gtid(gtid);
4498     ompt_callbacks.ompt_callback(ompt_callback_work)(
4499         ompt_work_scope, ompt_scope_end,
4500         &(team->t.ompt_team_info.parallel_data),
4501         &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data), 1,
4502         OMPT_GET_RETURN_ADDRESS(0));
4503   }
4504 #endif // OMPT_SUPPORT && OMPT_OPTIONAL
4505 }
4506 
#ifdef KMP_USE_VERSION_SYMBOLS
// For GOMP compatibility there are two versions of each omp_* API: the plain
// C symbol and the Fortran symbol with an appended underscore. Where a
// dedicated ompc_* implementation of an omp_* function exists, the plain GOMP
// versioned symbol must alias that ompc_* implementation rather than the
// Fortran version from kmp_ftn_entry.h.
extern "C" {
// omp.h may define these names as macros; drop the macros so the names are not
// translated into their ompc counterparts inside the KMP_VERSION_OMPC_SYMBOL
// invocations below. #undef of a name that is not a macro is ignored, so no
// #ifdef guard is required.
#undef omp_set_affinity_format
#undef omp_get_affinity_format
#undef omp_display_affinity
#undef omp_capture_affinity

KMP_VERSION_OMPC_SYMBOL(ompc_set_affinity_format, omp_set_affinity_format, 50,
                        "OMP_5.0");
KMP_VERSION_OMPC_SYMBOL(ompc_get_affinity_format, omp_get_affinity_format, 50,
                        "OMP_5.0");
KMP_VERSION_OMPC_SYMBOL(ompc_display_affinity, omp_display_affinity, 50,
                        "OMP_5.0");
KMP_VERSION_OMPC_SYMBOL(ompc_capture_affinity, omp_capture_affinity, 50,
                        "OMP_5.0");
} // extern "C"
#endif // KMP_USE_VERSION_SYMBOLS
4538