xref: /freebsd/contrib/llvm-project/openmp/runtime/src/kmp_csupport.cpp (revision 924226fba12cc9a228c73b956e1b7fa24c60b055)
1 /*
2  * kmp_csupport.cpp -- kfront linkage support for OpenMP.
3  */
4 
5 //===----------------------------------------------------------------------===//
6 //
7 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8 // See https://llvm.org/LICENSE.txt for license information.
9 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #define __KMP_IMP
14 #include "omp.h" /* extern "C" declarations of user-visible routines */
15 #include "kmp.h"
16 #include "kmp_error.h"
17 #include "kmp_i18n.h"
18 #include "kmp_itt.h"
19 #include "kmp_lock.h"
20 #include "kmp_stats.h"
21 #include "ompt-specific.h"
22 
23 #define MAX_MESSAGE 512
24 
25 // flags will be used in future, e.g. to implement openmp_strict library
26 // restrictions
27 
28 /*!
29  * @ingroup STARTUP_SHUTDOWN
30  * @param loc   in   source location information
31  * @param flags in   for future use (currently ignored)
32  *
33  * Initialize the runtime library. This call is optional; if it is not made then
34  * it will be implicitly called by attempts to use other library functions.
35  */
36 void __kmpc_begin(ident_t *loc, kmp_int32 flags) {
37   // By default __kmpc_begin() is no-op.
38   char *env;
39   if ((env = getenv("KMP_INITIAL_THREAD_BIND")) != NULL &&
40       __kmp_str_match_true(env)) {
41     __kmp_middle_initialize();
42     __kmp_assign_root_init_mask();
43     KC_TRACE(10, ("__kmpc_begin: middle initialization called\n"));
44   } else if (__kmp_ignore_mppbeg() == FALSE) {
45     // By default __kmp_ignore_mppbeg() returns TRUE.
46     __kmp_internal_begin();
47     KC_TRACE(10, ("__kmpc_begin: called\n"));
48   }
49 }
50 
51 /*!
52  * @ingroup STARTUP_SHUTDOWN
53  * @param loc source location information
54  *
55  * Shutdown the runtime library. This is also optional, and even if called will
56  * not do anything unless the `KMP_IGNORE_MPPEND` environment variable is set to
57  * zero.
58  */
59 void __kmpc_end(ident_t *loc) {
60   // By default, __kmp_ignore_mppend() returns TRUE which makes __kmpc_end()
61   // call no-op. However, this can be overridden with KMP_IGNORE_MPPEND
62   // environment variable. If KMP_IGNORE_MPPEND is 0, __kmp_ignore_mppend()
63   // returns FALSE and __kmpc_end() will unregister this root (it can cause
64   // library shut down).
65   if (__kmp_ignore_mppend() == FALSE) {
66     KC_TRACE(10, ("__kmpc_end: called\n"));
67     KA_TRACE(30, ("__kmpc_end\n"));
68 
69     __kmp_internal_end_thread(-1);
70   }
71 #if KMP_OS_WINDOWS && OMPT_SUPPORT
72   // Normal exit process on Windows does not allow worker threads of the final
73   // parallel region to finish reporting their events, so shutting down the
74   // library here fixes the issue at least for the cases where __kmpc_end() is
75   // placed properly.
76   if (ompt_enabled.enabled)
77     __kmp_internal_end_library(__kmp_gtid_get_specific());
78 #endif
79 }
80 
81 /*!
82 @ingroup THREAD_STATES
83 @param loc Source location information.
84 @return The global thread index of the active thread.
85 
86 This function can be called in any context.
87 
88 If the runtime has ony been entered at the outermost level from a
89 single (necessarily non-OpenMP<sup>*</sup>) thread, then the thread number is
90 that which would be returned by omp_get_thread_num() in the outermost
91 active parallel construct. (Or zero if there is no active parallel
92 construct, since the primary thread is necessarily thread zero).
93 
94 If multiple non-OpenMP threads all enter an OpenMP construct then this
95 will be a unique thread identifier among all the threads created by
96 the OpenMP runtime (but the value cannot be defined in terms of
97 OpenMP thread ids returned by omp_get_thread_num()).
98 */
99 kmp_int32 __kmpc_global_thread_num(ident_t *loc) {
100   kmp_int32 gtid = __kmp_entry_gtid();
101 
102   KC_TRACE(10, ("__kmpc_global_thread_num: T#%d\n", gtid));
103 
104   return gtid;
105 }
106 
107 /*!
108 @ingroup THREAD_STATES
109 @param loc Source location information.
110 @return The number of threads under control of the OpenMP<sup>*</sup> runtime
111 
112 This function can be called in any context.
113 It returns the total number of threads under the control of the OpenMP runtime.
114 That is not a number that can be determined by any OpenMP standard calls, since
115 the library may be called from more than one non-OpenMP thread, and this
116 reflects the total over all such calls. Similarly the runtime maintains
117 underlying threads even when they are not active (since the cost of creating
118 and destroying OS threads is high), this call counts all such threads even if
119 they are not waiting for work.
120 */
121 kmp_int32 __kmpc_global_num_threads(ident_t *loc) {
122   KC_TRACE(10,
123            ("__kmpc_global_num_threads: num_threads = %d\n", __kmp_all_nth));
124 
125   return TCR_4(__kmp_all_nth);
126 }
127 
128 /*!
129 @ingroup THREAD_STATES
130 @param loc Source location information.
131 @return The thread number of the calling thread in the innermost active parallel
132 construct.
133 */
134 kmp_int32 __kmpc_bound_thread_num(ident_t *loc) {
135   KC_TRACE(10, ("__kmpc_bound_thread_num: called\n"));
136   return __kmp_tid_from_gtid(__kmp_entry_gtid());
137 }
138 
139 /*!
140 @ingroup THREAD_STATES
141 @param loc Source location information.
142 @return The number of threads in the innermost active parallel construct.
143 */
144 kmp_int32 __kmpc_bound_num_threads(ident_t *loc) {
145   KC_TRACE(10, ("__kmpc_bound_num_threads: called\n"));
146 
147   return __kmp_entry_thread()->th.th_team->t.t_nproc;
148 }
149 
150 /*!
151  * @ingroup DEPRECATED
152  * @param loc location description
153  *
154  * This function need not be called. It always returns TRUE.
155  */
156 kmp_int32 __kmpc_ok_to_fork(ident_t *loc) {
157 #ifndef KMP_DEBUG
158 
159   return TRUE;
160 
161 #else
162 
163   const char *semi2;
164   const char *semi3;
165   int line_no;
166 
167   if (__kmp_par_range == 0) {
168     return TRUE;
169   }
170   semi2 = loc->psource;
171   if (semi2 == NULL) {
172     return TRUE;
173   }
174   semi2 = strchr(semi2, ';');
175   if (semi2 == NULL) {
176     return TRUE;
177   }
178   semi2 = strchr(semi2 + 1, ';');
179   if (semi2 == NULL) {
180     return TRUE;
181   }
182   if (__kmp_par_range_filename[0]) {
183     const char *name = semi2 - 1;
184     while ((name > loc->psource) && (*name != '/') && (*name != ';')) {
185       name--;
186     }
187     if ((*name == '/') || (*name == ';')) {
188       name++;
189     }
190     if (strncmp(__kmp_par_range_filename, name, semi2 - name)) {
191       return __kmp_par_range < 0;
192     }
193   }
194   semi3 = strchr(semi2 + 1, ';');
195   if (__kmp_par_range_routine[0]) {
196     if ((semi3 != NULL) && (semi3 > semi2) &&
197         (strncmp(__kmp_par_range_routine, semi2 + 1, semi3 - semi2 - 1))) {
198       return __kmp_par_range < 0;
199     }
200   }
201   if (KMP_SSCANF(semi3 + 1, "%d", &line_no) == 1) {
202     if ((line_no >= __kmp_par_range_lb) && (line_no <= __kmp_par_range_ub)) {
203       return __kmp_par_range > 0;
204     }
205     return __kmp_par_range < 0;
206   }
207   return TRUE;
208 
209 #endif /* KMP_DEBUG */
210 }
211 
212 /*!
213 @ingroup THREAD_STATES
214 @param loc Source location information.
215 @return 1 if this thread is executing inside an active parallel region, zero if
216 not.
217 */
218 kmp_int32 __kmpc_in_parallel(ident_t *loc) {
219   return __kmp_entry_thread()->th.th_root->r.r_active;
220 }
221 
222 /*!
223 @ingroup PARALLEL
224 @param loc source location information
225 @param global_tid global thread number
226 @param num_threads number of threads requested for this parallel construct
227 
228 Set the number of threads to be used by the next fork spawned by this thread.
229 This call is only required if the parallel construct has a `num_threads` clause.
230 */
231 void __kmpc_push_num_threads(ident_t *loc, kmp_int32 global_tid,
232                              kmp_int32 num_threads) {
233   KA_TRACE(20, ("__kmpc_push_num_threads: enter T#%d num_threads=%d\n",
234                 global_tid, num_threads));
235   __kmp_assert_valid_gtid(global_tid);
236   __kmp_push_num_threads(loc, global_tid, num_threads);
237 }
238 
239 void __kmpc_pop_num_threads(ident_t *loc, kmp_int32 global_tid) {
240   KA_TRACE(20, ("__kmpc_pop_num_threads: enter\n"));
241   /* the num_threads are automatically popped */
242 }
243 
244 void __kmpc_push_proc_bind(ident_t *loc, kmp_int32 global_tid,
245                            kmp_int32 proc_bind) {
246   KA_TRACE(20, ("__kmpc_push_proc_bind: enter T#%d proc_bind=%d\n", global_tid,
247                 proc_bind));
248   __kmp_assert_valid_gtid(global_tid);
249   __kmp_push_proc_bind(loc, global_tid, (kmp_proc_bind_t)proc_bind);
250 }
251 
252 /*!
253 @ingroup PARALLEL
254 @param loc  source location information
255 @param argc  total number of arguments in the ellipsis
256 @param microtask  pointer to callback routine consisting of outlined parallel
257 construct
258 @param ...  pointers to shared variables that aren't global
259 
260 Do the actual fork and call the microtask in the relevant number of threads.
261 */
262 void __kmpc_fork_call(ident_t *loc, kmp_int32 argc, kmpc_micro microtask, ...) {
263   int gtid = __kmp_entry_gtid();
264 
265 #if (KMP_STATS_ENABLED)
266   // If we were in a serial region, then stop the serial timer, record
267   // the event, and start parallel region timer
268   stats_state_e previous_state = KMP_GET_THREAD_STATE();
269   if (previous_state == stats_state_e::SERIAL_REGION) {
270     KMP_EXCHANGE_PARTITIONED_TIMER(OMP_parallel_overhead);
271   } else {
272     KMP_PUSH_PARTITIONED_TIMER(OMP_parallel_overhead);
273   }
274   int inParallel = __kmpc_in_parallel(loc);
275   if (inParallel) {
276     KMP_COUNT_BLOCK(OMP_NESTED_PARALLEL);
277   } else {
278     KMP_COUNT_BLOCK(OMP_PARALLEL);
279   }
280 #endif
281 
282   // maybe to save thr_state is enough here
283   {
284     va_list ap;
285     va_start(ap, microtask);
286 
287 #if OMPT_SUPPORT
288     ompt_frame_t *ompt_frame;
289     if (ompt_enabled.enabled) {
290       kmp_info_t *master_th = __kmp_threads[gtid];
291       ompt_frame = &master_th->th.th_current_task->ompt_task_info.frame;
292       ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
293     }
294     OMPT_STORE_RETURN_ADDRESS(gtid);
295 #endif
296 
297 #if INCLUDE_SSC_MARKS
298     SSC_MARK_FORKING();
299 #endif
300     __kmp_fork_call(loc, gtid, fork_context_intel, argc,
301                     VOLATILE_CAST(microtask_t) microtask, // "wrapped" task
302                     VOLATILE_CAST(launch_t) __kmp_invoke_task_func,
303                     kmp_va_addr_of(ap));
304 #if INCLUDE_SSC_MARKS
305     SSC_MARK_JOINING();
306 #endif
307     __kmp_join_call(loc, gtid
308 #if OMPT_SUPPORT
309                     ,
310                     fork_context_intel
311 #endif
312     );
313 
314     va_end(ap);
315 
316 #if OMPT_SUPPORT
317     if (ompt_enabled.enabled) {
318       ompt_frame->enter_frame = ompt_data_none;
319     }
320 #endif
321   }
322 
323 #if KMP_STATS_ENABLED
324   if (previous_state == stats_state_e::SERIAL_REGION) {
325     KMP_EXCHANGE_PARTITIONED_TIMER(OMP_serial);
326     KMP_SET_THREAD_STATE(previous_state);
327   } else {
328     KMP_POP_PARTITIONED_TIMER();
329   }
330 #endif // KMP_STATS_ENABLED
331 }
332 
333 /*!
334 @ingroup PARALLEL
335 @param loc source location information
336 @param global_tid global thread number
337 @param num_teams number of teams requested for the teams construct
338 @param num_threads number of threads per team requested for the teams construct
339 
340 Set the number of teams to be used by the teams construct.
341 This call is only required if the teams construct has a `num_teams` clause
342 or a `thread_limit` clause (or both).
343 */
344 void __kmpc_push_num_teams(ident_t *loc, kmp_int32 global_tid,
345                            kmp_int32 num_teams, kmp_int32 num_threads) {
346   KA_TRACE(20,
347            ("__kmpc_push_num_teams: enter T#%d num_teams=%d num_threads=%d\n",
348             global_tid, num_teams, num_threads));
349   __kmp_assert_valid_gtid(global_tid);
350   __kmp_push_num_teams(loc, global_tid, num_teams, num_threads);
351 }
352 
353 /*!
354 @ingroup PARALLEL
355 @param loc source location information
356 @param global_tid global thread number
357 @param num_teams_lo lower bound on number of teams requested for the teams
358 construct
359 @param num_teams_up upper bound on number of teams requested for the teams
360 construct
361 @param num_threads number of threads per team requested for the teams construct
362 
363 Set the number of teams to be used by the teams construct. The number of initial
364 teams cretaed will be greater than or equal to the lower bound and less than or
365 equal to the upper bound.
366 This call is only required if the teams construct has a `num_teams` clause
367 or a `thread_limit` clause (or both).
368 */
369 void __kmpc_push_num_teams_51(ident_t *loc, kmp_int32 global_tid,
370                               kmp_int32 num_teams_lb, kmp_int32 num_teams_ub,
371                               kmp_int32 num_threads) {
372   KA_TRACE(20, ("__kmpc_push_num_teams_51: enter T#%d num_teams_lb=%d"
373                 " num_teams_ub=%d num_threads=%d\n",
374                 global_tid, num_teams_lb, num_teams_ub, num_threads));
375   __kmp_assert_valid_gtid(global_tid);
376   __kmp_push_num_teams_51(loc, global_tid, num_teams_lb, num_teams_ub,
377                           num_threads);
378 }
379 
380 /*!
381 @ingroup PARALLEL
382 @param loc  source location information
383 @param argc  total number of arguments in the ellipsis
384 @param microtask  pointer to callback routine consisting of outlined teams
385 construct
386 @param ...  pointers to shared variables that aren't global
387 
388 Do the actual fork and call the microtask in the relevant number of threads.
389 */
390 void __kmpc_fork_teams(ident_t *loc, kmp_int32 argc, kmpc_micro microtask,
391                        ...) {
392   int gtid = __kmp_entry_gtid();
393   kmp_info_t *this_thr = __kmp_threads[gtid];
394   va_list ap;
395   va_start(ap, microtask);
396 
397 #if KMP_STATS_ENABLED
398   KMP_COUNT_BLOCK(OMP_TEAMS);
399   stats_state_e previous_state = KMP_GET_THREAD_STATE();
400   if (previous_state == stats_state_e::SERIAL_REGION) {
401     KMP_EXCHANGE_PARTITIONED_TIMER(OMP_teams_overhead);
402   } else {
403     KMP_PUSH_PARTITIONED_TIMER(OMP_teams_overhead);
404   }
405 #endif
406 
407   // remember teams entry point and nesting level
408   this_thr->th.th_teams_microtask = microtask;
409   this_thr->th.th_teams_level =
410       this_thr->th.th_team->t.t_level; // AC: can be >0 on host
411 
412 #if OMPT_SUPPORT
413   kmp_team_t *parent_team = this_thr->th.th_team;
414   int tid = __kmp_tid_from_gtid(gtid);
415   if (ompt_enabled.enabled) {
416     parent_team->t.t_implicit_task_taskdata[tid]
417         .ompt_task_info.frame.enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
418   }
419   OMPT_STORE_RETURN_ADDRESS(gtid);
420 #endif
421 
422   // check if __kmpc_push_num_teams called, set default number of teams
423   // otherwise
424   if (this_thr->th.th_teams_size.nteams == 0) {
425     __kmp_push_num_teams(loc, gtid, 0, 0);
426   }
427   KMP_DEBUG_ASSERT(this_thr->th.th_set_nproc >= 1);
428   KMP_DEBUG_ASSERT(this_thr->th.th_teams_size.nteams >= 1);
429   KMP_DEBUG_ASSERT(this_thr->th.th_teams_size.nth >= 1);
430 
431   __kmp_fork_call(
432       loc, gtid, fork_context_intel, argc,
433       VOLATILE_CAST(microtask_t) __kmp_teams_master, // "wrapped" task
434       VOLATILE_CAST(launch_t) __kmp_invoke_teams_master, kmp_va_addr_of(ap));
435   __kmp_join_call(loc, gtid
436 #if OMPT_SUPPORT
437                   ,
438                   fork_context_intel
439 #endif
440   );
441 
442   // Pop current CG root off list
443   KMP_DEBUG_ASSERT(this_thr->th.th_cg_roots);
444   kmp_cg_root_t *tmp = this_thr->th.th_cg_roots;
445   this_thr->th.th_cg_roots = tmp->up;
446   KA_TRACE(100, ("__kmpc_fork_teams: Thread %p popping node %p and moving up"
447                  " to node %p. cg_nthreads was %d\n",
448                  this_thr, tmp, this_thr->th.th_cg_roots, tmp->cg_nthreads));
449   KMP_DEBUG_ASSERT(tmp->cg_nthreads);
450   int i = tmp->cg_nthreads--;
451   if (i == 1) { // check is we are the last thread in CG (not always the case)
452     __kmp_free(tmp);
453   }
454   // Restore current task's thread_limit from CG root
455   KMP_DEBUG_ASSERT(this_thr->th.th_cg_roots);
456   this_thr->th.th_current_task->td_icvs.thread_limit =
457       this_thr->th.th_cg_roots->cg_thread_limit;
458 
459   this_thr->th.th_teams_microtask = NULL;
460   this_thr->th.th_teams_level = 0;
461   *(kmp_int64 *)(&this_thr->th.th_teams_size) = 0L;
462   va_end(ap);
463 #if KMP_STATS_ENABLED
464   if (previous_state == stats_state_e::SERIAL_REGION) {
465     KMP_EXCHANGE_PARTITIONED_TIMER(OMP_serial);
466     KMP_SET_THREAD_STATE(previous_state);
467   } else {
468     KMP_POP_PARTITIONED_TIMER();
469   }
470 #endif // KMP_STATS_ENABLED
471 }
472 
473 // I don't think this function should ever have been exported.
474 // The __kmpc_ prefix was misapplied.  I'm fairly certain that no generated
475 // openmp code ever called it, but it's been exported from the RTL for so
476 // long that I'm afraid to remove the definition.
477 int __kmpc_invoke_task_func(int gtid) { return __kmp_invoke_task_func(gtid); }
478 
479 /*!
480 @ingroup PARALLEL
481 @param loc  source location information
482 @param global_tid  global thread number
483 
484 Enter a serialized parallel construct. This interface is used to handle a
485 conditional parallel region, like this,
486 @code
487 #pragma omp parallel if (condition)
488 @endcode
489 when the condition is false.
490 */
491 void __kmpc_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
492   // The implementation is now in kmp_runtime.cpp so that it can share static
493   // functions with kmp_fork_call since the tasks to be done are similar in
494   // each case.
495   __kmp_assert_valid_gtid(global_tid);
496 #if OMPT_SUPPORT
497   OMPT_STORE_RETURN_ADDRESS(global_tid);
498 #endif
499   __kmp_serialized_parallel(loc, global_tid);
500 }
501 
502 /*!
503 @ingroup PARALLEL
504 @param loc  source location information
505 @param global_tid  global thread number
506 
507 Leave a serialized parallel construct.
508 */
509 void __kmpc_end_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
510   kmp_internal_control_t *top;
511   kmp_info_t *this_thr;
512   kmp_team_t *serial_team;
513 
514   KC_TRACE(10,
515            ("__kmpc_end_serialized_parallel: called by T#%d\n", global_tid));
516 
517   /* skip all this code for autopar serialized loops since it results in
518      unacceptable overhead */
519   if (loc != NULL && (loc->flags & KMP_IDENT_AUTOPAR))
520     return;
521 
522   // Not autopar code
523   __kmp_assert_valid_gtid(global_tid);
524   if (!TCR_4(__kmp_init_parallel))
525     __kmp_parallel_initialize();
526 
527   __kmp_resume_if_soft_paused();
528 
529   this_thr = __kmp_threads[global_tid];
530   serial_team = this_thr->th.th_serial_team;
531 
532   kmp_task_team_t *task_team = this_thr->th.th_task_team;
533   // we need to wait for the proxy tasks before finishing the thread
534   if (task_team != NULL && (task_team->tt.tt_found_proxy_tasks ||
535                             task_team->tt.tt_hidden_helper_task_encountered))
536     __kmp_task_team_wait(this_thr, serial_team USE_ITT_BUILD_ARG(NULL));
537 
538   KMP_MB();
539   KMP_DEBUG_ASSERT(serial_team);
540   KMP_ASSERT(serial_team->t.t_serialized);
541   KMP_DEBUG_ASSERT(this_thr->th.th_team == serial_team);
542   KMP_DEBUG_ASSERT(serial_team != this_thr->th.th_root->r.r_root_team);
543   KMP_DEBUG_ASSERT(serial_team->t.t_threads);
544   KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
545 
546 #if OMPT_SUPPORT
547   if (ompt_enabled.enabled &&
548       this_thr->th.ompt_thread_info.state != ompt_state_overhead) {
549     OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame = ompt_data_none;
550     if (ompt_enabled.ompt_callback_implicit_task) {
551       ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
552           ompt_scope_end, NULL, OMPT_CUR_TASK_DATA(this_thr), 1,
553           OMPT_CUR_TASK_INFO(this_thr)->thread_num, ompt_task_implicit);
554     }
555 
556     // reset clear the task id only after unlinking the task
557     ompt_data_t *parent_task_data;
558     __ompt_get_task_info_internal(1, NULL, &parent_task_data, NULL, NULL, NULL);
559 
560     if (ompt_enabled.ompt_callback_parallel_end) {
561       ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
562           &(serial_team->t.ompt_team_info.parallel_data), parent_task_data,
563           ompt_parallel_invoker_program | ompt_parallel_team,
564           OMPT_LOAD_RETURN_ADDRESS(global_tid));
565     }
566     __ompt_lw_taskteam_unlink(this_thr);
567     this_thr->th.ompt_thread_info.state = ompt_state_overhead;
568   }
569 #endif
570 
571   /* If necessary, pop the internal control stack values and replace the team
572    * values */
573   top = serial_team->t.t_control_stack_top;
574   if (top && top->serial_nesting_level == serial_team->t.t_serialized) {
575     copy_icvs(&serial_team->t.t_threads[0]->th.th_current_task->td_icvs, top);
576     serial_team->t.t_control_stack_top = top->next;
577     __kmp_free(top);
578   }
579 
580   /* pop dispatch buffers stack */
581   KMP_DEBUG_ASSERT(serial_team->t.t_dispatch->th_disp_buffer);
582   {
583     dispatch_private_info_t *disp_buffer =
584         serial_team->t.t_dispatch->th_disp_buffer;
585     serial_team->t.t_dispatch->th_disp_buffer =
586         serial_team->t.t_dispatch->th_disp_buffer->next;
587     __kmp_free(disp_buffer);
588   }
589   this_thr->th.th_def_allocator = serial_team->t.t_def_allocator; // restore
590 
591   --serial_team->t.t_serialized;
592   if (serial_team->t.t_serialized == 0) {
593 
594     /* return to the parallel section */
595 
596 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
597     if (__kmp_inherit_fp_control && serial_team->t.t_fp_control_saved) {
598       __kmp_clear_x87_fpu_status_word();
599       __kmp_load_x87_fpu_control_word(&serial_team->t.t_x87_fpu_control_word);
600       __kmp_load_mxcsr(&serial_team->t.t_mxcsr);
601     }
602 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
603 
604     __kmp_pop_current_task_from_thread(this_thr);
605 #if OMPD_SUPPORT
606     if (ompd_state & OMPD_ENABLE_BP)
607       ompd_bp_parallel_end();
608 #endif
609 
610     this_thr->th.th_team = serial_team->t.t_parent;
611     this_thr->th.th_info.ds.ds_tid = serial_team->t.t_master_tid;
612 
613     /* restore values cached in the thread */
614     this_thr->th.th_team_nproc = serial_team->t.t_parent->t.t_nproc; /*  JPH */
615     this_thr->th.th_team_master =
616         serial_team->t.t_parent->t.t_threads[0]; /* JPH */
617     this_thr->th.th_team_serialized = this_thr->th.th_team->t.t_serialized;
618 
619     /* TODO the below shouldn't need to be adjusted for serialized teams */
620     this_thr->th.th_dispatch =
621         &this_thr->th.th_team->t.t_dispatch[serial_team->t.t_master_tid];
622 
623     KMP_ASSERT(this_thr->th.th_current_task->td_flags.executing == 0);
624     this_thr->th.th_current_task->td_flags.executing = 1;
625 
626     if (__kmp_tasking_mode != tskm_immediate_exec) {
627       // Copy the task team from the new child / old parent team to the thread.
628       this_thr->th.th_task_team =
629           this_thr->th.th_team->t.t_task_team[this_thr->th.th_task_state];
630       KA_TRACE(20,
631                ("__kmpc_end_serialized_parallel: T#%d restoring task_team %p / "
632                 "team %p\n",
633                 global_tid, this_thr->th.th_task_team, this_thr->th.th_team));
634     }
635   } else {
636     if (__kmp_tasking_mode != tskm_immediate_exec) {
637       KA_TRACE(20, ("__kmpc_end_serialized_parallel: T#%d decreasing nesting "
638                     "depth of serial team %p to %d\n",
639                     global_tid, serial_team, serial_team->t.t_serialized));
640     }
641   }
642 
643   serial_team->t.t_level--;
644   if (__kmp_env_consistency_check)
645     __kmp_pop_parallel(global_tid, NULL);
646 #if OMPT_SUPPORT
647   if (ompt_enabled.enabled)
648     this_thr->th.ompt_thread_info.state =
649         ((this_thr->th.th_team_serialized) ? ompt_state_work_serial
650                                            : ompt_state_work_parallel);
651 #endif
652 }
653 
654 /*!
655 @ingroup SYNCHRONIZATION
656 @param loc  source location information.
657 
658 Execute <tt>flush</tt>. This is implemented as a full memory fence. (Though
659 depending on the memory ordering convention obeyed by the compiler
660 even that may not be necessary).
661 */
662 void __kmpc_flush(ident_t *loc) {
663   KC_TRACE(10, ("__kmpc_flush: called\n"));
664 
665   /* need explicit __mf() here since use volatile instead in library */
666   KMP_MB(); /* Flush all pending memory write invalidates.  */
667 
668 #if (KMP_ARCH_X86 || KMP_ARCH_X86_64)
669 #if KMP_MIC
670 // fence-style instructions do not exist, but lock; xaddl $0,(%rsp) can be used.
671 // We shouldn't need it, though, since the ABI rules require that
672 // * If the compiler generates NGO stores it also generates the fence
673 // * If users hand-code NGO stores they should insert the fence
674 // therefore no incomplete unordered stores should be visible.
675 #else
676   // C74404
677   // This is to address non-temporal store instructions (sfence needed).
678   // The clflush instruction is addressed either (mfence needed).
679   // Probably the non-temporal load monvtdqa instruction should also be
680   // addressed.
681   // mfence is a SSE2 instruction. Do not execute it if CPU is not SSE2.
682   if (!__kmp_cpuinfo.initialized) {
683     __kmp_query_cpuid(&__kmp_cpuinfo);
684   }
685   if (!__kmp_cpuinfo.flags.sse2) {
686     // CPU cannot execute SSE2 instructions.
687   } else {
688 #if KMP_COMPILER_ICC
689     _mm_mfence();
690 #elif KMP_COMPILER_MSVC
691     MemoryBarrier();
692 #else
693     __sync_synchronize();
694 #endif // KMP_COMPILER_ICC
695   }
696 #endif // KMP_MIC
697 #elif (KMP_ARCH_ARM || KMP_ARCH_AARCH64 || KMP_ARCH_MIPS || KMP_ARCH_MIPS64 || \
698        KMP_ARCH_RISCV64)
699 // Nothing to see here move along
700 #elif KMP_ARCH_PPC64
701 // Nothing needed here (we have a real MB above).
702 #else
703 #error Unknown or unsupported architecture
704 #endif
705 
706 #if OMPT_SUPPORT && OMPT_OPTIONAL
707   if (ompt_enabled.ompt_callback_flush) {
708     ompt_callbacks.ompt_callback(ompt_callback_flush)(
709         __ompt_get_thread_data_internal(), OMPT_GET_RETURN_ADDRESS(0));
710   }
711 #endif
712 }
713 
714 /* -------------------------------------------------------------------------- */
715 /*!
716 @ingroup SYNCHRONIZATION
717 @param loc source location information
718 @param global_tid thread id.
719 
720 Execute a barrier.
721 */
722 void __kmpc_barrier(ident_t *loc, kmp_int32 global_tid) {
723   KMP_COUNT_BLOCK(OMP_BARRIER);
724   KC_TRACE(10, ("__kmpc_barrier: called T#%d\n", global_tid));
725   __kmp_assert_valid_gtid(global_tid);
726 
727   if (!TCR_4(__kmp_init_parallel))
728     __kmp_parallel_initialize();
729 
730   __kmp_resume_if_soft_paused();
731 
732   if (__kmp_env_consistency_check) {
733     if (loc == 0) {
734       KMP_WARNING(ConstructIdentInvalid); // ??? What does it mean for the user?
735     }
736     __kmp_check_barrier(global_tid, ct_barrier, loc);
737   }
738 
739 #if OMPT_SUPPORT
740   ompt_frame_t *ompt_frame;
741   if (ompt_enabled.enabled) {
742     __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
743     if (ompt_frame->enter_frame.ptr == NULL)
744       ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
745   }
746   OMPT_STORE_RETURN_ADDRESS(global_tid);
747 #endif
748   __kmp_threads[global_tid]->th.th_ident = loc;
749   // TODO: explicit barrier_wait_id:
750   //   this function is called when 'barrier' directive is present or
751   //   implicit barrier at the end of a worksharing construct.
752   // 1) better to add a per-thread barrier counter to a thread data structure
753   // 2) set to 0 when a new team is created
754   // 4) no sync is required
755 
756   __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
757 #if OMPT_SUPPORT && OMPT_OPTIONAL
758   if (ompt_enabled.enabled) {
759     ompt_frame->enter_frame = ompt_data_none;
760   }
761 #endif
762 }
763 
764 /* The BARRIER for a MASTER section is always explicit   */
765 /*!
766 @ingroup WORK_SHARING
767 @param loc  source location information.
768 @param global_tid  global thread number .
769 @return 1 if this thread should execute the <tt>master</tt> block, 0 otherwise.
770 */
771 kmp_int32 __kmpc_master(ident_t *loc, kmp_int32 global_tid) {
772   int status = 0;
773 
774   KC_TRACE(10, ("__kmpc_master: called T#%d\n", global_tid));
775   __kmp_assert_valid_gtid(global_tid);
776 
777   if (!TCR_4(__kmp_init_parallel))
778     __kmp_parallel_initialize();
779 
780   __kmp_resume_if_soft_paused();
781 
782   if (KMP_MASTER_GTID(global_tid)) {
783     KMP_COUNT_BLOCK(OMP_MASTER);
784     KMP_PUSH_PARTITIONED_TIMER(OMP_master);
785     status = 1;
786   }
787 
788 #if OMPT_SUPPORT && OMPT_OPTIONAL
789   if (status) {
790     if (ompt_enabled.ompt_callback_masked) {
791       kmp_info_t *this_thr = __kmp_threads[global_tid];
792       kmp_team_t *team = this_thr->th.th_team;
793 
794       int tid = __kmp_tid_from_gtid(global_tid);
795       ompt_callbacks.ompt_callback(ompt_callback_masked)(
796           ompt_scope_begin, &(team->t.ompt_team_info.parallel_data),
797           &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
798           OMPT_GET_RETURN_ADDRESS(0));
799     }
800   }
801 #endif
802 
803   if (__kmp_env_consistency_check) {
804 #if KMP_USE_DYNAMIC_LOCK
805     if (status)
806       __kmp_push_sync(global_tid, ct_master, loc, NULL, 0);
807     else
808       __kmp_check_sync(global_tid, ct_master, loc, NULL, 0);
809 #else
810     if (status)
811       __kmp_push_sync(global_tid, ct_master, loc, NULL);
812     else
813       __kmp_check_sync(global_tid, ct_master, loc, NULL);
814 #endif
815   }
816 
817   return status;
818 }
819 
820 /*!
821 @ingroup WORK_SHARING
822 @param loc  source location information.
823 @param global_tid  global thread number .
824 
825 Mark the end of a <tt>master</tt> region. This should only be called by the
826 thread that executes the <tt>master</tt> region.
827 */
828 void __kmpc_end_master(ident_t *loc, kmp_int32 global_tid) {
829   KC_TRACE(10, ("__kmpc_end_master: called T#%d\n", global_tid));
830   __kmp_assert_valid_gtid(global_tid);
831   KMP_DEBUG_ASSERT(KMP_MASTER_GTID(global_tid));
832   KMP_POP_PARTITIONED_TIMER();
833 
834 #if OMPT_SUPPORT && OMPT_OPTIONAL
835   kmp_info_t *this_thr = __kmp_threads[global_tid];
836   kmp_team_t *team = this_thr->th.th_team;
837   if (ompt_enabled.ompt_callback_masked) {
838     int tid = __kmp_tid_from_gtid(global_tid);
839     ompt_callbacks.ompt_callback(ompt_callback_masked)(
840         ompt_scope_end, &(team->t.ompt_team_info.parallel_data),
841         &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
842         OMPT_GET_RETURN_ADDRESS(0));
843   }
844 #endif
845 
846   if (__kmp_env_consistency_check) {
847     if (KMP_MASTER_GTID(global_tid))
848       __kmp_pop_sync(global_tid, ct_master, loc);
849   }
850 }
851 
852 /*!
853 @ingroup WORK_SHARING
854 @param loc  source location information.
855 @param global_tid  global thread number.
856 @param filter result of evaluating filter clause on thread global_tid, or zero
857 if no filter clause present
858 @return 1 if this thread should execute the <tt>masked</tt> block, 0 otherwise.
859 */
860 kmp_int32 __kmpc_masked(ident_t *loc, kmp_int32 global_tid, kmp_int32 filter) {
861   int status = 0;
862   int tid;
863   KC_TRACE(10, ("__kmpc_masked: called T#%d\n", global_tid));
864   __kmp_assert_valid_gtid(global_tid);
865 
866   if (!TCR_4(__kmp_init_parallel))
867     __kmp_parallel_initialize();
868 
869   __kmp_resume_if_soft_paused();
870 
871   tid = __kmp_tid_from_gtid(global_tid);
872   if (tid == filter) {
873     KMP_COUNT_BLOCK(OMP_MASKED);
874     KMP_PUSH_PARTITIONED_TIMER(OMP_masked);
875     status = 1;
876   }
877 
878 #if OMPT_SUPPORT && OMPT_OPTIONAL
879   if (status) {
880     if (ompt_enabled.ompt_callback_masked) {
881       kmp_info_t *this_thr = __kmp_threads[global_tid];
882       kmp_team_t *team = this_thr->th.th_team;
883       ompt_callbacks.ompt_callback(ompt_callback_masked)(
884           ompt_scope_begin, &(team->t.ompt_team_info.parallel_data),
885           &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
886           OMPT_GET_RETURN_ADDRESS(0));
887     }
888   }
889 #endif
890 
891   if (__kmp_env_consistency_check) {
892 #if KMP_USE_DYNAMIC_LOCK
893     if (status)
894       __kmp_push_sync(global_tid, ct_masked, loc, NULL, 0);
895     else
896       __kmp_check_sync(global_tid, ct_masked, loc, NULL, 0);
897 #else
898     if (status)
899       __kmp_push_sync(global_tid, ct_masked, loc, NULL);
900     else
901       __kmp_check_sync(global_tid, ct_masked, loc, NULL);
902 #endif
903   }
904 
905   return status;
906 }
907 
908 /*!
909 @ingroup WORK_SHARING
910 @param loc  source location information.
911 @param global_tid  global thread number .
912 
913 Mark the end of a <tt>masked</tt> region. This should only be called by the
914 thread that executes the <tt>masked</tt> region.
915 */
916 void __kmpc_end_masked(ident_t *loc, kmp_int32 global_tid) {
917   KC_TRACE(10, ("__kmpc_end_masked: called T#%d\n", global_tid));
918   __kmp_assert_valid_gtid(global_tid);
919   KMP_POP_PARTITIONED_TIMER();
920 
921 #if OMPT_SUPPORT && OMPT_OPTIONAL
922   kmp_info_t *this_thr = __kmp_threads[global_tid];
923   kmp_team_t *team = this_thr->th.th_team;
924   if (ompt_enabled.ompt_callback_masked) {
925     int tid = __kmp_tid_from_gtid(global_tid);
926     ompt_callbacks.ompt_callback(ompt_callback_masked)(
927         ompt_scope_end, &(team->t.ompt_team_info.parallel_data),
928         &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
929         OMPT_GET_RETURN_ADDRESS(0));
930   }
931 #endif
932 
933   if (__kmp_env_consistency_check) {
934     __kmp_pop_sync(global_tid, ct_masked, loc);
935   }
936 }
937 
938 /*!
939 @ingroup WORK_SHARING
940 @param loc  source location information.
941 @param gtid  global thread number.
942 
943 Start execution of an <tt>ordered</tt> construct.
944 */
945 void __kmpc_ordered(ident_t *loc, kmp_int32 gtid) {
946   int cid = 0;
947   kmp_info_t *th;
948   KMP_DEBUG_ASSERT(__kmp_init_serial);
949 
950   KC_TRACE(10, ("__kmpc_ordered: called T#%d\n", gtid));
951   __kmp_assert_valid_gtid(gtid);
952 
953   if (!TCR_4(__kmp_init_parallel))
954     __kmp_parallel_initialize();
955 
956   __kmp_resume_if_soft_paused();
957 
958 #if USE_ITT_BUILD
959   __kmp_itt_ordered_prep(gtid);
960 // TODO: ordered_wait_id
961 #endif /* USE_ITT_BUILD */
962 
963   th = __kmp_threads[gtid];
964 
965 #if OMPT_SUPPORT && OMPT_OPTIONAL
966   kmp_team_t *team;
967   ompt_wait_id_t lck;
968   void *codeptr_ra;
969   OMPT_STORE_RETURN_ADDRESS(gtid);
970   if (ompt_enabled.enabled) {
971     team = __kmp_team_from_gtid(gtid);
972     lck = (ompt_wait_id_t)(uintptr_t)&team->t.t_ordered.dt.t_value;
973     /* OMPT state update */
974     th->th.ompt_thread_info.wait_id = lck;
975     th->th.ompt_thread_info.state = ompt_state_wait_ordered;
976 
977     /* OMPT event callback */
978     codeptr_ra = OMPT_LOAD_RETURN_ADDRESS(gtid);
979     if (ompt_enabled.ompt_callback_mutex_acquire) {
980       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
981           ompt_mutex_ordered, omp_lock_hint_none, kmp_mutex_impl_spin, lck,
982           codeptr_ra);
983     }
984   }
985 #endif
986 
987   if (th->th.th_dispatch->th_deo_fcn != 0)
988     (*th->th.th_dispatch->th_deo_fcn)(&gtid, &cid, loc);
989   else
990     __kmp_parallel_deo(&gtid, &cid, loc);
991 
992 #if OMPT_SUPPORT && OMPT_OPTIONAL
993   if (ompt_enabled.enabled) {
994     /* OMPT state update */
995     th->th.ompt_thread_info.state = ompt_state_work_parallel;
996     th->th.ompt_thread_info.wait_id = 0;
997 
998     /* OMPT event callback */
999     if (ompt_enabled.ompt_callback_mutex_acquired) {
1000       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
1001           ompt_mutex_ordered, (ompt_wait_id_t)(uintptr_t)lck, codeptr_ra);
1002     }
1003   }
1004 #endif
1005 
1006 #if USE_ITT_BUILD
1007   __kmp_itt_ordered_start(gtid);
1008 #endif /* USE_ITT_BUILD */
1009 }
1010 
1011 /*!
1012 @ingroup WORK_SHARING
1013 @param loc  source location information.
1014 @param gtid  global thread number.
1015 
1016 End execution of an <tt>ordered</tt> construct.
1017 */
1018 void __kmpc_end_ordered(ident_t *loc, kmp_int32 gtid) {
1019   int cid = 0;
1020   kmp_info_t *th;
1021 
1022   KC_TRACE(10, ("__kmpc_end_ordered: called T#%d\n", gtid));
1023   __kmp_assert_valid_gtid(gtid);
1024 
1025 #if USE_ITT_BUILD
1026   __kmp_itt_ordered_end(gtid);
1027 // TODO: ordered_wait_id
1028 #endif /* USE_ITT_BUILD */
1029 
1030   th = __kmp_threads[gtid];
1031 
1032   if (th->th.th_dispatch->th_dxo_fcn != 0)
1033     (*th->th.th_dispatch->th_dxo_fcn)(&gtid, &cid, loc);
1034   else
1035     __kmp_parallel_dxo(&gtid, &cid, loc);
1036 
1037 #if OMPT_SUPPORT && OMPT_OPTIONAL
1038   OMPT_STORE_RETURN_ADDRESS(gtid);
1039   if (ompt_enabled.ompt_callback_mutex_released) {
1040     ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
1041         ompt_mutex_ordered,
1042         (ompt_wait_id_t)(uintptr_t)&__kmp_team_from_gtid(gtid)
1043             ->t.t_ordered.dt.t_value,
1044         OMPT_LOAD_RETURN_ADDRESS(gtid));
1045   }
1046 #endif
1047 }
1048 
1049 #if KMP_USE_DYNAMIC_LOCK
1050 
1051 static __forceinline void
1052 __kmp_init_indirect_csptr(kmp_critical_name *crit, ident_t const *loc,
1053                           kmp_int32 gtid, kmp_indirect_locktag_t tag) {
1054   // Pointer to the allocated indirect lock is written to crit, while indexing
1055   // is ignored.
1056   void *idx;
1057   kmp_indirect_lock_t **lck;
1058   lck = (kmp_indirect_lock_t **)crit;
1059   kmp_indirect_lock_t *ilk = __kmp_allocate_indirect_lock(&idx, gtid, tag);
1060   KMP_I_LOCK_FUNC(ilk, init)(ilk->lock);
1061   KMP_SET_I_LOCK_LOCATION(ilk, loc);
1062   KMP_SET_I_LOCK_FLAGS(ilk, kmp_lf_critical_section);
1063   KA_TRACE(20,
1064            ("__kmp_init_indirect_csptr: initialized indirect lock #%d\n", tag));
1065 #if USE_ITT_BUILD
1066   __kmp_itt_critical_creating(ilk->lock, loc);
1067 #endif
1068   int status = KMP_COMPARE_AND_STORE_PTR(lck, nullptr, ilk);
1069   if (status == 0) {
1070 #if USE_ITT_BUILD
1071     __kmp_itt_critical_destroyed(ilk->lock);
1072 #endif
1073     // We don't really need to destroy the unclaimed lock here since it will be
1074     // cleaned up at program exit.
1075     // KMP_D_LOCK_FUNC(&idx, destroy)((kmp_dyna_lock_t *)&idx);
1076   }
1077   KMP_DEBUG_ASSERT(*lck != NULL);
1078 }
1079 
1080 // Fast-path acquire tas lock
1081 #define KMP_ACQUIRE_TAS_LOCK(lock, gtid)                                       \
1082   {                                                                            \
1083     kmp_tas_lock_t *l = (kmp_tas_lock_t *)lock;                                \
1084     kmp_int32 tas_free = KMP_LOCK_FREE(tas);                                   \
1085     kmp_int32 tas_busy = KMP_LOCK_BUSY(gtid + 1, tas);                         \
1086     if (KMP_ATOMIC_LD_RLX(&l->lk.poll) != tas_free ||                          \
1087         !__kmp_atomic_compare_store_acq(&l->lk.poll, tas_free, tas_busy)) {    \
1088       kmp_uint32 spins;                                                        \
1089       KMP_FSYNC_PREPARE(l);                                                    \
1090       KMP_INIT_YIELD(spins);                                                   \
1091       kmp_backoff_t backoff = __kmp_spin_backoff_params;                       \
1092       do {                                                                     \
1093         if (TCR_4(__kmp_nth) >                                                 \
1094             (__kmp_avail_proc ? __kmp_avail_proc : __kmp_xproc)) {             \
1095           KMP_YIELD(TRUE);                                                     \
1096         } else {                                                               \
1097           KMP_YIELD_SPIN(spins);                                               \
1098         }                                                                      \
1099         __kmp_spin_backoff(&backoff);                                          \
1100       } while (                                                                \
1101           KMP_ATOMIC_LD_RLX(&l->lk.poll) != tas_free ||                        \
1102           !__kmp_atomic_compare_store_acq(&l->lk.poll, tas_free, tas_busy));   \
1103     }                                                                          \
1104     KMP_FSYNC_ACQUIRED(l);                                                     \
1105   }
1106 
1107 // Fast-path test tas lock
1108 #define KMP_TEST_TAS_LOCK(lock, gtid, rc)                                      \
1109   {                                                                            \
1110     kmp_tas_lock_t *l = (kmp_tas_lock_t *)lock;                                \
1111     kmp_int32 tas_free = KMP_LOCK_FREE(tas);                                   \
1112     kmp_int32 tas_busy = KMP_LOCK_BUSY(gtid + 1, tas);                         \
1113     rc = KMP_ATOMIC_LD_RLX(&l->lk.poll) == tas_free &&                         \
1114          __kmp_atomic_compare_store_acq(&l->lk.poll, tas_free, tas_busy);      \
1115   }
1116 
1117 // Fast-path release tas lock
1118 #define KMP_RELEASE_TAS_LOCK(lock, gtid)                                       \
1119   { KMP_ATOMIC_ST_REL(&((kmp_tas_lock_t *)lock)->lk.poll, KMP_LOCK_FREE(tas)); }
1120 
1121 #if KMP_USE_FUTEX
1122 
1123 #include <sys/syscall.h>
1124 #include <unistd.h>
1125 #ifndef FUTEX_WAIT
1126 #define FUTEX_WAIT 0
1127 #endif
1128 #ifndef FUTEX_WAKE
1129 #define FUTEX_WAKE 1
1130 #endif
1131 
1132 // Fast-path acquire futex lock
1133 #define KMP_ACQUIRE_FUTEX_LOCK(lock, gtid)                                     \
1134   {                                                                            \
1135     kmp_futex_lock_t *ftx = (kmp_futex_lock_t *)lock;                          \
1136     kmp_int32 gtid_code = (gtid + 1) << 1;                                     \
1137     KMP_MB();                                                                  \
1138     KMP_FSYNC_PREPARE(ftx);                                                    \
1139     kmp_int32 poll_val;                                                        \
1140     while ((poll_val = KMP_COMPARE_AND_STORE_RET32(                            \
1141                 &(ftx->lk.poll), KMP_LOCK_FREE(futex),                         \
1142                 KMP_LOCK_BUSY(gtid_code, futex))) != KMP_LOCK_FREE(futex)) {   \
1143       kmp_int32 cond = KMP_LOCK_STRIP(poll_val) & 1;                           \
1144       if (!cond) {                                                             \
1145         if (!KMP_COMPARE_AND_STORE_RET32(&(ftx->lk.poll), poll_val,            \
1146                                          poll_val |                            \
1147                                              KMP_LOCK_BUSY(1, futex))) {       \
1148           continue;                                                            \
1149         }                                                                      \
1150         poll_val |= KMP_LOCK_BUSY(1, futex);                                   \
1151       }                                                                        \
1152       kmp_int32 rc;                                                            \
1153       if ((rc = syscall(__NR_futex, &(ftx->lk.poll), FUTEX_WAIT, poll_val,     \
1154                         NULL, NULL, 0)) != 0) {                                \
1155         continue;                                                              \
1156       }                                                                        \
1157       gtid_code |= 1;                                                          \
1158     }                                                                          \
1159     KMP_FSYNC_ACQUIRED(ftx);                                                   \
1160   }
1161 
1162 // Fast-path test futex lock
1163 #define KMP_TEST_FUTEX_LOCK(lock, gtid, rc)                                    \
1164   {                                                                            \
1165     kmp_futex_lock_t *ftx = (kmp_futex_lock_t *)lock;                          \
1166     if (KMP_COMPARE_AND_STORE_ACQ32(&(ftx->lk.poll), KMP_LOCK_FREE(futex),     \
1167                                     KMP_LOCK_BUSY(gtid + 1 << 1, futex))) {    \
1168       KMP_FSYNC_ACQUIRED(ftx);                                                 \
1169       rc = TRUE;                                                               \
1170     } else {                                                                   \
1171       rc = FALSE;                                                              \
1172     }                                                                          \
1173   }
1174 
1175 // Fast-path release futex lock
1176 #define KMP_RELEASE_FUTEX_LOCK(lock, gtid)                                     \
1177   {                                                                            \
1178     kmp_futex_lock_t *ftx = (kmp_futex_lock_t *)lock;                          \
1179     KMP_MB();                                                                  \
1180     KMP_FSYNC_RELEASING(ftx);                                                  \
1181     kmp_int32 poll_val =                                                       \
1182         KMP_XCHG_FIXED32(&(ftx->lk.poll), KMP_LOCK_FREE(futex));               \
1183     if (KMP_LOCK_STRIP(poll_val) & 1) {                                        \
1184       syscall(__NR_futex, &(ftx->lk.poll), FUTEX_WAKE,                         \
1185               KMP_LOCK_BUSY(1, futex), NULL, NULL, 0);                         \
1186     }                                                                          \
1187     KMP_MB();                                                                  \
1188     KMP_YIELD_OVERSUB();                                                       \
1189   }
1190 
1191 #endif // KMP_USE_FUTEX
1192 
1193 #else // KMP_USE_DYNAMIC_LOCK
1194 
1195 static kmp_user_lock_p __kmp_get_critical_section_ptr(kmp_critical_name *crit,
1196                                                       ident_t const *loc,
1197                                                       kmp_int32 gtid) {
1198   kmp_user_lock_p *lck_pp = (kmp_user_lock_p *)crit;
1199 
1200   // Because of the double-check, the following load doesn't need to be volatile
1201   kmp_user_lock_p lck = (kmp_user_lock_p)TCR_PTR(*lck_pp);
1202 
1203   if (lck == NULL) {
1204     void *idx;
1205 
1206     // Allocate & initialize the lock.
1207     // Remember alloc'ed locks in table in order to free them in __kmp_cleanup()
1208     lck = __kmp_user_lock_allocate(&idx, gtid, kmp_lf_critical_section);
1209     __kmp_init_user_lock_with_checks(lck);
1210     __kmp_set_user_lock_location(lck, loc);
1211 #if USE_ITT_BUILD
1212     __kmp_itt_critical_creating(lck);
1213 // __kmp_itt_critical_creating() should be called *before* the first usage
1214 // of underlying lock. It is the only place where we can guarantee it. There
1215 // are chances the lock will destroyed with no usage, but it is not a
1216 // problem, because this is not real event seen by user but rather setting
1217 // name for object (lock). See more details in kmp_itt.h.
1218 #endif /* USE_ITT_BUILD */
1219 
1220     // Use a cmpxchg instruction to slam the start of the critical section with
1221     // the lock pointer.  If another thread beat us to it, deallocate the lock,
1222     // and use the lock that the other thread allocated.
1223     int status = KMP_COMPARE_AND_STORE_PTR(lck_pp, 0, lck);
1224 
1225     if (status == 0) {
1226 // Deallocate the lock and reload the value.
1227 #if USE_ITT_BUILD
1228       __kmp_itt_critical_destroyed(lck);
1229 // Let ITT know the lock is destroyed and the same memory location may be reused
1230 // for another purpose.
1231 #endif /* USE_ITT_BUILD */
1232       __kmp_destroy_user_lock_with_checks(lck);
1233       __kmp_user_lock_free(&idx, gtid, lck);
1234       lck = (kmp_user_lock_p)TCR_PTR(*lck_pp);
1235       KMP_DEBUG_ASSERT(lck != NULL);
1236     }
1237   }
1238   return lck;
1239 }
1240 
1241 #endif // KMP_USE_DYNAMIC_LOCK
1242 
1243 /*!
1244 @ingroup WORK_SHARING
1245 @param loc  source location information.
1246 @param global_tid  global thread number.
1247 @param crit identity of the critical section. This could be a pointer to a lock
1248 associated with the critical section, or some other suitably unique value.
1249 
1250 Enter code protected by a `critical` construct.
1251 This function blocks until the executing thread can enter the critical section.
1252 */
1253 void __kmpc_critical(ident_t *loc, kmp_int32 global_tid,
1254                      kmp_critical_name *crit) {
1255 #if KMP_USE_DYNAMIC_LOCK
1256 #if OMPT_SUPPORT && OMPT_OPTIONAL
1257   OMPT_STORE_RETURN_ADDRESS(global_tid);
1258 #endif // OMPT_SUPPORT
1259   __kmpc_critical_with_hint(loc, global_tid, crit, omp_lock_hint_none);
1260 #else
1261   KMP_COUNT_BLOCK(OMP_CRITICAL);
1262 #if OMPT_SUPPORT && OMPT_OPTIONAL
1263   ompt_state_t prev_state = ompt_state_undefined;
1264   ompt_thread_info_t ti;
1265 #endif
1266   kmp_user_lock_p lck;
1267 
1268   KC_TRACE(10, ("__kmpc_critical: called T#%d\n", global_tid));
1269   __kmp_assert_valid_gtid(global_tid);
1270 
1271   // TODO: add THR_OVHD_STATE
1272 
1273   KMP_PUSH_PARTITIONED_TIMER(OMP_critical_wait);
1274   KMP_CHECK_USER_LOCK_INIT();
1275 
1276   if ((__kmp_user_lock_kind == lk_tas) &&
1277       (sizeof(lck->tas.lk.poll) <= OMP_CRITICAL_SIZE)) {
1278     lck = (kmp_user_lock_p)crit;
1279   }
1280 #if KMP_USE_FUTEX
1281   else if ((__kmp_user_lock_kind == lk_futex) &&
1282            (sizeof(lck->futex.lk.poll) <= OMP_CRITICAL_SIZE)) {
1283     lck = (kmp_user_lock_p)crit;
1284   }
1285 #endif
1286   else { // ticket, queuing or drdpa
1287     lck = __kmp_get_critical_section_ptr(crit, loc, global_tid);
1288   }
1289 
1290   if (__kmp_env_consistency_check)
1291     __kmp_push_sync(global_tid, ct_critical, loc, lck);
1292 
1293     // since the critical directive binds to all threads, not just the current
1294     // team we have to check this even if we are in a serialized team.
1295     // also, even if we are the uber thread, we still have to conduct the lock,
1296     // as we have to contend with sibling threads.
1297 
1298 #if USE_ITT_BUILD
1299   __kmp_itt_critical_acquiring(lck);
1300 #endif /* USE_ITT_BUILD */
1301 #if OMPT_SUPPORT && OMPT_OPTIONAL
1302   OMPT_STORE_RETURN_ADDRESS(gtid);
1303   void *codeptr_ra = NULL;
1304   if (ompt_enabled.enabled) {
1305     ti = __kmp_threads[global_tid]->th.ompt_thread_info;
1306     /* OMPT state update */
1307     prev_state = ti.state;
1308     ti.wait_id = (ompt_wait_id_t)(uintptr_t)lck;
1309     ti.state = ompt_state_wait_critical;
1310 
1311     /* OMPT event callback */
1312     codeptr_ra = OMPT_LOAD_RETURN_ADDRESS(gtid);
1313     if (ompt_enabled.ompt_callback_mutex_acquire) {
1314       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
1315           ompt_mutex_critical, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
1316           (ompt_wait_id_t)(uintptr_t)lck, codeptr_ra);
1317     }
1318   }
1319 #endif
1320   // Value of 'crit' should be good for using as a critical_id of the critical
1321   // section directive.
1322   __kmp_acquire_user_lock_with_checks(lck, global_tid);
1323 
1324 #if USE_ITT_BUILD
1325   __kmp_itt_critical_acquired(lck);
1326 #endif /* USE_ITT_BUILD */
1327 #if OMPT_SUPPORT && OMPT_OPTIONAL
1328   if (ompt_enabled.enabled) {
1329     /* OMPT state update */
1330     ti.state = prev_state;
1331     ti.wait_id = 0;
1332 
1333     /* OMPT event callback */
1334     if (ompt_enabled.ompt_callback_mutex_acquired) {
1335       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
1336           ompt_mutex_critical, (ompt_wait_id_t)(uintptr_t)lck, codeptr_ra);
1337     }
1338   }
1339 #endif
1340   KMP_POP_PARTITIONED_TIMER();
1341 
1342   KMP_PUSH_PARTITIONED_TIMER(OMP_critical);
1343   KA_TRACE(15, ("__kmpc_critical: done T#%d\n", global_tid));
1344 #endif // KMP_USE_DYNAMIC_LOCK
1345 }
1346 
1347 #if KMP_USE_DYNAMIC_LOCK
1348 
1349 // Converts the given hint to an internal lock implementation
1350 static __forceinline kmp_dyna_lockseq_t __kmp_map_hint_to_lock(uintptr_t hint) {
1351 #if KMP_USE_TSX
1352 #define KMP_TSX_LOCK(seq) lockseq_##seq
1353 #else
1354 #define KMP_TSX_LOCK(seq) __kmp_user_lock_seq
1355 #endif
1356 
1357 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
1358 #define KMP_CPUINFO_RTM (__kmp_cpuinfo.flags.rtm)
1359 #else
1360 #define KMP_CPUINFO_RTM 0
1361 #endif
1362 
1363   // Hints that do not require further logic
1364   if (hint & kmp_lock_hint_hle)
1365     return KMP_TSX_LOCK(hle);
1366   if (hint & kmp_lock_hint_rtm)
1367     return KMP_CPUINFO_RTM ? KMP_TSX_LOCK(rtm_queuing) : __kmp_user_lock_seq;
1368   if (hint & kmp_lock_hint_adaptive)
1369     return KMP_CPUINFO_RTM ? KMP_TSX_LOCK(adaptive) : __kmp_user_lock_seq;
1370 
1371   // Rule out conflicting hints first by returning the default lock
1372   if ((hint & omp_lock_hint_contended) && (hint & omp_lock_hint_uncontended))
1373     return __kmp_user_lock_seq;
1374   if ((hint & omp_lock_hint_speculative) &&
1375       (hint & omp_lock_hint_nonspeculative))
1376     return __kmp_user_lock_seq;
1377 
1378   // Do not even consider speculation when it appears to be contended
1379   if (hint & omp_lock_hint_contended)
1380     return lockseq_queuing;
1381 
1382   // Uncontended lock without speculation
1383   if ((hint & omp_lock_hint_uncontended) && !(hint & omp_lock_hint_speculative))
1384     return lockseq_tas;
1385 
1386   // Use RTM lock for speculation
1387   if (hint & omp_lock_hint_speculative)
1388     return KMP_CPUINFO_RTM ? KMP_TSX_LOCK(rtm_spin) : __kmp_user_lock_seq;
1389 
1390   return __kmp_user_lock_seq;
1391 }
1392 
1393 #if OMPT_SUPPORT && OMPT_OPTIONAL
1394 #if KMP_USE_DYNAMIC_LOCK
1395 static kmp_mutex_impl_t
1396 __ompt_get_mutex_impl_type(void *user_lock, kmp_indirect_lock_t *ilock = 0) {
1397   if (user_lock) {
1398     switch (KMP_EXTRACT_D_TAG(user_lock)) {
1399     case 0:
1400       break;
1401 #if KMP_USE_FUTEX
1402     case locktag_futex:
1403       return kmp_mutex_impl_queuing;
1404 #endif
1405     case locktag_tas:
1406       return kmp_mutex_impl_spin;
1407 #if KMP_USE_TSX
1408     case locktag_hle:
1409     case locktag_rtm_spin:
1410       return kmp_mutex_impl_speculative;
1411 #endif
1412     default:
1413       return kmp_mutex_impl_none;
1414     }
1415     ilock = KMP_LOOKUP_I_LOCK(user_lock);
1416   }
1417   KMP_ASSERT(ilock);
1418   switch (ilock->type) {
1419 #if KMP_USE_TSX
1420   case locktag_adaptive:
1421   case locktag_rtm_queuing:
1422     return kmp_mutex_impl_speculative;
1423 #endif
1424   case locktag_nested_tas:
1425     return kmp_mutex_impl_spin;
1426 #if KMP_USE_FUTEX
1427   case locktag_nested_futex:
1428 #endif
1429   case locktag_ticket:
1430   case locktag_queuing:
1431   case locktag_drdpa:
1432   case locktag_nested_ticket:
1433   case locktag_nested_queuing:
1434   case locktag_nested_drdpa:
1435     return kmp_mutex_impl_queuing;
1436   default:
1437     return kmp_mutex_impl_none;
1438   }
1439 }
1440 #else
1441 // For locks without dynamic binding
1442 static kmp_mutex_impl_t __ompt_get_mutex_impl_type() {
1443   switch (__kmp_user_lock_kind) {
1444   case lk_tas:
1445     return kmp_mutex_impl_spin;
1446 #if KMP_USE_FUTEX
1447   case lk_futex:
1448 #endif
1449   case lk_ticket:
1450   case lk_queuing:
1451   case lk_drdpa:
1452     return kmp_mutex_impl_queuing;
1453 #if KMP_USE_TSX
1454   case lk_hle:
1455   case lk_rtm_queuing:
1456   case lk_rtm_spin:
1457   case lk_adaptive:
1458     return kmp_mutex_impl_speculative;
1459 #endif
1460   default:
1461     return kmp_mutex_impl_none;
1462   }
1463 }
1464 #endif // KMP_USE_DYNAMIC_LOCK
1465 #endif // OMPT_SUPPORT && OMPT_OPTIONAL
1466 
1467 /*!
1468 @ingroup WORK_SHARING
1469 @param loc  source location information.
1470 @param global_tid  global thread number.
1471 @param crit identity of the critical section. This could be a pointer to a lock
1472 associated with the critical section, or some other suitably unique value.
1473 @param hint the lock hint.
1474 
1475 Enter code protected by a `critical` construct with a hint. The hint value is
1476 used to suggest a lock implementation. This function blocks until the executing
1477 thread can enter the critical section unless the hint suggests use of
1478 speculative execution and the hardware supports it.
1479 */
1480 void __kmpc_critical_with_hint(ident_t *loc, kmp_int32 global_tid,
1481                                kmp_critical_name *crit, uint32_t hint) {
1482   KMP_COUNT_BLOCK(OMP_CRITICAL);
1483   kmp_user_lock_p lck;
1484 #if OMPT_SUPPORT && OMPT_OPTIONAL
1485   ompt_state_t prev_state = ompt_state_undefined;
1486   ompt_thread_info_t ti;
1487   // This is the case, if called from __kmpc_critical:
1488   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(global_tid);
1489   if (!codeptr)
1490     codeptr = OMPT_GET_RETURN_ADDRESS(0);
1491 #endif
1492 
1493   KC_TRACE(10, ("__kmpc_critical: called T#%d\n", global_tid));
1494   __kmp_assert_valid_gtid(global_tid);
1495 
1496   kmp_dyna_lock_t *lk = (kmp_dyna_lock_t *)crit;
1497   // Check if it is initialized.
1498   KMP_PUSH_PARTITIONED_TIMER(OMP_critical_wait);
1499   kmp_dyna_lockseq_t lockseq = __kmp_map_hint_to_lock(hint);
1500   if (*lk == 0) {
1501     if (KMP_IS_D_LOCK(lockseq)) {
1502       KMP_COMPARE_AND_STORE_ACQ32((volatile kmp_int32 *)crit, 0,
1503                                   KMP_GET_D_TAG(lockseq));
1504     } else {
1505       __kmp_init_indirect_csptr(crit, loc, global_tid, KMP_GET_I_TAG(lockseq));
1506     }
1507   }
1508   // Branch for accessing the actual lock object and set operation. This
1509   // branching is inevitable since this lock initialization does not follow the
1510   // normal dispatch path (lock table is not used).
1511   if (KMP_EXTRACT_D_TAG(lk) != 0) {
1512     lck = (kmp_user_lock_p)lk;
1513     if (__kmp_env_consistency_check) {
1514       __kmp_push_sync(global_tid, ct_critical, loc, lck,
1515                       __kmp_map_hint_to_lock(hint));
1516     }
1517 #if USE_ITT_BUILD
1518     __kmp_itt_critical_acquiring(lck);
1519 #endif
1520 #if OMPT_SUPPORT && OMPT_OPTIONAL
1521     if (ompt_enabled.enabled) {
1522       ti = __kmp_threads[global_tid]->th.ompt_thread_info;
1523       /* OMPT state update */
1524       prev_state = ti.state;
1525       ti.wait_id = (ompt_wait_id_t)(uintptr_t)lck;
1526       ti.state = ompt_state_wait_critical;
1527 
1528       /* OMPT event callback */
1529       if (ompt_enabled.ompt_callback_mutex_acquire) {
1530         ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
1531             ompt_mutex_critical, (unsigned int)hint,
1532             __ompt_get_mutex_impl_type(crit), (ompt_wait_id_t)(uintptr_t)lck,
1533             codeptr);
1534       }
1535     }
1536 #endif
1537 #if KMP_USE_INLINED_TAS
1538     if (lockseq == lockseq_tas && !__kmp_env_consistency_check) {
1539       KMP_ACQUIRE_TAS_LOCK(lck, global_tid);
1540     } else
1541 #elif KMP_USE_INLINED_FUTEX
1542     if (lockseq == lockseq_futex && !__kmp_env_consistency_check) {
1543       KMP_ACQUIRE_FUTEX_LOCK(lck, global_tid);
1544     } else
1545 #endif
1546     {
1547       KMP_D_LOCK_FUNC(lk, set)(lk, global_tid);
1548     }
1549   } else {
1550     kmp_indirect_lock_t *ilk = *((kmp_indirect_lock_t **)lk);
1551     lck = ilk->lock;
1552     if (__kmp_env_consistency_check) {
1553       __kmp_push_sync(global_tid, ct_critical, loc, lck,
1554                       __kmp_map_hint_to_lock(hint));
1555     }
1556 #if USE_ITT_BUILD
1557     __kmp_itt_critical_acquiring(lck);
1558 #endif
1559 #if OMPT_SUPPORT && OMPT_OPTIONAL
1560     if (ompt_enabled.enabled) {
1561       ti = __kmp_threads[global_tid]->th.ompt_thread_info;
1562       /* OMPT state update */
1563       prev_state = ti.state;
1564       ti.wait_id = (ompt_wait_id_t)(uintptr_t)lck;
1565       ti.state = ompt_state_wait_critical;
1566 
1567       /* OMPT event callback */
1568       if (ompt_enabled.ompt_callback_mutex_acquire) {
1569         ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
1570             ompt_mutex_critical, (unsigned int)hint,
1571             __ompt_get_mutex_impl_type(0, ilk), (ompt_wait_id_t)(uintptr_t)lck,
1572             codeptr);
1573       }
1574     }
1575 #endif
1576     KMP_I_LOCK_FUNC(ilk, set)(lck, global_tid);
1577   }
1578   KMP_POP_PARTITIONED_TIMER();
1579 
1580 #if USE_ITT_BUILD
1581   __kmp_itt_critical_acquired(lck);
1582 #endif /* USE_ITT_BUILD */
1583 #if OMPT_SUPPORT && OMPT_OPTIONAL
1584   if (ompt_enabled.enabled) {
1585     /* OMPT state update */
1586     ti.state = prev_state;
1587     ti.wait_id = 0;
1588 
1589     /* OMPT event callback */
1590     if (ompt_enabled.ompt_callback_mutex_acquired) {
1591       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
1592           ompt_mutex_critical, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
1593     }
1594   }
1595 #endif
1596 
1597   KMP_PUSH_PARTITIONED_TIMER(OMP_critical);
1598   KA_TRACE(15, ("__kmpc_critical: done T#%d\n", global_tid));
1599 } // __kmpc_critical_with_hint
1600 
1601 #endif // KMP_USE_DYNAMIC_LOCK
1602 
1603 /*!
1604 @ingroup WORK_SHARING
1605 @param loc  source location information.
1606 @param global_tid  global thread number .
1607 @param crit identity of the critical section. This could be a pointer to a lock
1608 associated with the critical section, or some other suitably unique value.
1609 
1610 Leave a critical section, releasing any lock that was held during its execution.
1611 */
1612 void __kmpc_end_critical(ident_t *loc, kmp_int32 global_tid,
1613                          kmp_critical_name *crit) {
1614   kmp_user_lock_p lck;
1615 
1616   KC_TRACE(10, ("__kmpc_end_critical: called T#%d\n", global_tid));
1617 
1618 #if KMP_USE_DYNAMIC_LOCK
1619   int locktag = KMP_EXTRACT_D_TAG(crit);
1620   if (locktag) {
1621     lck = (kmp_user_lock_p)crit;
1622     KMP_ASSERT(lck != NULL);
1623     if (__kmp_env_consistency_check) {
1624       __kmp_pop_sync(global_tid, ct_critical, loc);
1625     }
1626 #if USE_ITT_BUILD
1627     __kmp_itt_critical_releasing(lck);
1628 #endif
1629 #if KMP_USE_INLINED_TAS
1630     if (locktag == locktag_tas && !__kmp_env_consistency_check) {
1631       KMP_RELEASE_TAS_LOCK(lck, global_tid);
1632     } else
1633 #elif KMP_USE_INLINED_FUTEX
1634     if (locktag == locktag_futex && !__kmp_env_consistency_check) {
1635       KMP_RELEASE_FUTEX_LOCK(lck, global_tid);
1636     } else
1637 #endif
1638     {
1639       KMP_D_LOCK_FUNC(lck, unset)((kmp_dyna_lock_t *)lck, global_tid);
1640     }
1641   } else {
1642     kmp_indirect_lock_t *ilk =
1643         (kmp_indirect_lock_t *)TCR_PTR(*((kmp_indirect_lock_t **)crit));
1644     KMP_ASSERT(ilk != NULL);
1645     lck = ilk->lock;
1646     if (__kmp_env_consistency_check) {
1647       __kmp_pop_sync(global_tid, ct_critical, loc);
1648     }
1649 #if USE_ITT_BUILD
1650     __kmp_itt_critical_releasing(lck);
1651 #endif
1652     KMP_I_LOCK_FUNC(ilk, unset)(lck, global_tid);
1653   }
1654 
1655 #else // KMP_USE_DYNAMIC_LOCK
1656 
1657   if ((__kmp_user_lock_kind == lk_tas) &&
1658       (sizeof(lck->tas.lk.poll) <= OMP_CRITICAL_SIZE)) {
1659     lck = (kmp_user_lock_p)crit;
1660   }
1661 #if KMP_USE_FUTEX
1662   else if ((__kmp_user_lock_kind == lk_futex) &&
1663            (sizeof(lck->futex.lk.poll) <= OMP_CRITICAL_SIZE)) {
1664     lck = (kmp_user_lock_p)crit;
1665   }
1666 #endif
1667   else { // ticket, queuing or drdpa
1668     lck = (kmp_user_lock_p)TCR_PTR(*((kmp_user_lock_p *)crit));
1669   }
1670 
1671   KMP_ASSERT(lck != NULL);
1672 
1673   if (__kmp_env_consistency_check)
1674     __kmp_pop_sync(global_tid, ct_critical, loc);
1675 
1676 #if USE_ITT_BUILD
1677   __kmp_itt_critical_releasing(lck);
1678 #endif /* USE_ITT_BUILD */
1679   // Value of 'crit' should be good for using as a critical_id of the critical
1680   // section directive.
1681   __kmp_release_user_lock_with_checks(lck, global_tid);
1682 
1683 #endif // KMP_USE_DYNAMIC_LOCK
1684 
1685 #if OMPT_SUPPORT && OMPT_OPTIONAL
1686   /* OMPT release event triggers after lock is released; place here to trigger
1687    * for all #if branches */
1688   OMPT_STORE_RETURN_ADDRESS(global_tid);
1689   if (ompt_enabled.ompt_callback_mutex_released) {
1690     ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
1691         ompt_mutex_critical, (ompt_wait_id_t)(uintptr_t)lck,
1692         OMPT_LOAD_RETURN_ADDRESS(0));
1693   }
1694 #endif
1695 
1696   KMP_POP_PARTITIONED_TIMER();
1697   KA_TRACE(15, ("__kmpc_end_critical: done T#%d\n", global_tid));
1698 }
1699 
1700 /*!
1701 @ingroup SYNCHRONIZATION
1702 @param loc source location information
1703 @param global_tid thread id.
1704 @return one if the thread should execute the master block, zero otherwise
1705 
1706 Start execution of a combined barrier and master. The barrier is executed inside
1707 this function.
1708 */
1709 kmp_int32 __kmpc_barrier_master(ident_t *loc, kmp_int32 global_tid) {
1710   int status;
1711   KC_TRACE(10, ("__kmpc_barrier_master: called T#%d\n", global_tid));
1712   __kmp_assert_valid_gtid(global_tid);
1713 
1714   if (!TCR_4(__kmp_init_parallel))
1715     __kmp_parallel_initialize();
1716 
1717   __kmp_resume_if_soft_paused();
1718 
1719   if (__kmp_env_consistency_check)
1720     __kmp_check_barrier(global_tid, ct_barrier, loc);
1721 
1722 #if OMPT_SUPPORT
1723   ompt_frame_t *ompt_frame;
1724   if (ompt_enabled.enabled) {
1725     __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
1726     if (ompt_frame->enter_frame.ptr == NULL)
1727       ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1728   }
1729   OMPT_STORE_RETURN_ADDRESS(global_tid);
1730 #endif
1731 #if USE_ITT_NOTIFY
1732   __kmp_threads[global_tid]->th.th_ident = loc;
1733 #endif
1734   status = __kmp_barrier(bs_plain_barrier, global_tid, TRUE, 0, NULL, NULL);
1735 #if OMPT_SUPPORT && OMPT_OPTIONAL
1736   if (ompt_enabled.enabled) {
1737     ompt_frame->enter_frame = ompt_data_none;
1738   }
1739 #endif
1740 
1741   return (status != 0) ? 0 : 1;
1742 }
1743 
1744 /*!
1745 @ingroup SYNCHRONIZATION
1746 @param loc source location information
1747 @param global_tid thread id.
1748 
1749 Complete the execution of a combined barrier and master. This function should
1750 only be called at the completion of the <tt>master</tt> code. Other threads will
1751 still be waiting at the barrier and this call releases them.
1752 */
1753 void __kmpc_end_barrier_master(ident_t *loc, kmp_int32 global_tid) {
1754   KC_TRACE(10, ("__kmpc_end_barrier_master: called T#%d\n", global_tid));
1755   __kmp_assert_valid_gtid(global_tid);
1756   __kmp_end_split_barrier(bs_plain_barrier, global_tid);
1757 }
1758 
1759 /*!
1760 @ingroup SYNCHRONIZATION
1761 @param loc source location information
1762 @param global_tid thread id.
1763 @return one if the thread should execute the master block, zero otherwise
1764 
1765 Start execution of a combined barrier and master(nowait) construct.
1766 The barrier is executed inside this function.
1767 There is no equivalent "end" function, since the
1768 */
1769 kmp_int32 __kmpc_barrier_master_nowait(ident_t *loc, kmp_int32 global_tid) {
1770   kmp_int32 ret;
1771   KC_TRACE(10, ("__kmpc_barrier_master_nowait: called T#%d\n", global_tid));
1772   __kmp_assert_valid_gtid(global_tid);
1773 
1774   if (!TCR_4(__kmp_init_parallel))
1775     __kmp_parallel_initialize();
1776 
1777   __kmp_resume_if_soft_paused();
1778 
1779   if (__kmp_env_consistency_check) {
1780     if (loc == 0) {
1781       KMP_WARNING(ConstructIdentInvalid); // ??? What does it mean for the user?
1782     }
1783     __kmp_check_barrier(global_tid, ct_barrier, loc);
1784   }
1785 
1786 #if OMPT_SUPPORT
1787   ompt_frame_t *ompt_frame;
1788   if (ompt_enabled.enabled) {
1789     __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
1790     if (ompt_frame->enter_frame.ptr == NULL)
1791       ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1792   }
1793   OMPT_STORE_RETURN_ADDRESS(global_tid);
1794 #endif
1795 #if USE_ITT_NOTIFY
1796   __kmp_threads[global_tid]->th.th_ident = loc;
1797 #endif
1798   __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
1799 #if OMPT_SUPPORT && OMPT_OPTIONAL
1800   if (ompt_enabled.enabled) {
1801     ompt_frame->enter_frame = ompt_data_none;
1802   }
1803 #endif
1804 
1805   ret = __kmpc_master(loc, global_tid);
1806 
1807   if (__kmp_env_consistency_check) {
1808     /*  there's no __kmpc_end_master called; so the (stats) */
1809     /*  actions of __kmpc_end_master are done here          */
1810     if (ret) {
1811       /* only one thread should do the pop since only */
1812       /* one did the push (see __kmpc_master())       */
1813       __kmp_pop_sync(global_tid, ct_master, loc);
1814     }
1815   }
1816 
1817   return (ret);
1818 }
1819 
1820 /* The BARRIER for a SINGLE process section is always explicit   */
1821 /*!
1822 @ingroup WORK_SHARING
1823 @param loc  source location information
1824 @param global_tid  global thread number
1825 @return One if this thread should execute the single construct, zero otherwise.
1826 
1827 Test whether to execute a <tt>single</tt> construct.
1828 There are no implicit barriers in the two "single" calls, rather the compiler
1829 should introduce an explicit barrier if it is required.
1830 */
1831 
1832 kmp_int32 __kmpc_single(ident_t *loc, kmp_int32 global_tid) {
1833   __kmp_assert_valid_gtid(global_tid);
1834   kmp_int32 rc = __kmp_enter_single(global_tid, loc, TRUE);
1835 
1836   if (rc) {
1837     // We are going to execute the single statement, so we should count it.
1838     KMP_COUNT_BLOCK(OMP_SINGLE);
1839     KMP_PUSH_PARTITIONED_TIMER(OMP_single);
1840   }
1841 
1842 #if OMPT_SUPPORT && OMPT_OPTIONAL
1843   kmp_info_t *this_thr = __kmp_threads[global_tid];
1844   kmp_team_t *team = this_thr->th.th_team;
1845   int tid = __kmp_tid_from_gtid(global_tid);
1846 
1847   if (ompt_enabled.enabled) {
1848     if (rc) {
1849       if (ompt_enabled.ompt_callback_work) {
1850         ompt_callbacks.ompt_callback(ompt_callback_work)(
1851             ompt_work_single_executor, ompt_scope_begin,
1852             &(team->t.ompt_team_info.parallel_data),
1853             &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
1854             1, OMPT_GET_RETURN_ADDRESS(0));
1855       }
1856     } else {
1857       if (ompt_enabled.ompt_callback_work) {
1858         ompt_callbacks.ompt_callback(ompt_callback_work)(
1859             ompt_work_single_other, ompt_scope_begin,
1860             &(team->t.ompt_team_info.parallel_data),
1861             &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
1862             1, OMPT_GET_RETURN_ADDRESS(0));
1863         ompt_callbacks.ompt_callback(ompt_callback_work)(
1864             ompt_work_single_other, ompt_scope_end,
1865             &(team->t.ompt_team_info.parallel_data),
1866             &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
1867             1, OMPT_GET_RETURN_ADDRESS(0));
1868       }
1869     }
1870   }
1871 #endif
1872 
1873   return rc;
1874 }
1875 
1876 /*!
1877 @ingroup WORK_SHARING
1878 @param loc  source location information
1879 @param global_tid  global thread number
1880 
1881 Mark the end of a <tt>single</tt> construct.  This function should
1882 only be called by the thread that executed the block of code protected
1883 by the `single` construct.
1884 */
1885 void __kmpc_end_single(ident_t *loc, kmp_int32 global_tid) {
1886   __kmp_assert_valid_gtid(global_tid);
1887   __kmp_exit_single(global_tid);
1888   KMP_POP_PARTITIONED_TIMER();
1889 
1890 #if OMPT_SUPPORT && OMPT_OPTIONAL
1891   kmp_info_t *this_thr = __kmp_threads[global_tid];
1892   kmp_team_t *team = this_thr->th.th_team;
1893   int tid = __kmp_tid_from_gtid(global_tid);
1894 
1895   if (ompt_enabled.ompt_callback_work) {
1896     ompt_callbacks.ompt_callback(ompt_callback_work)(
1897         ompt_work_single_executor, ompt_scope_end,
1898         &(team->t.ompt_team_info.parallel_data),
1899         &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data), 1,
1900         OMPT_GET_RETURN_ADDRESS(0));
1901   }
1902 #endif
1903 }
1904 
1905 /*!
1906 @ingroup WORK_SHARING
1907 @param loc Source location
1908 @param global_tid Global thread id
1909 
1910 Mark the end of a statically scheduled loop.
1911 */
1912 void __kmpc_for_static_fini(ident_t *loc, kmp_int32 global_tid) {
1913   KMP_POP_PARTITIONED_TIMER();
1914   KE_TRACE(10, ("__kmpc_for_static_fini called T#%d\n", global_tid));
1915 
1916 #if OMPT_SUPPORT && OMPT_OPTIONAL
1917   if (ompt_enabled.ompt_callback_work) {
1918     ompt_work_t ompt_work_type = ompt_work_loop;
1919     ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
1920     ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
1921     // Determine workshare type
1922     if (loc != NULL) {
1923       if ((loc->flags & KMP_IDENT_WORK_LOOP) != 0) {
1924         ompt_work_type = ompt_work_loop;
1925       } else if ((loc->flags & KMP_IDENT_WORK_SECTIONS) != 0) {
1926         ompt_work_type = ompt_work_sections;
1927       } else if ((loc->flags & KMP_IDENT_WORK_DISTRIBUTE) != 0) {
1928         ompt_work_type = ompt_work_distribute;
1929       } else {
1930         // use default set above.
1931         // a warning about this case is provided in __kmpc_for_static_init
1932       }
1933       KMP_DEBUG_ASSERT(ompt_work_type);
1934     }
1935     ompt_callbacks.ompt_callback(ompt_callback_work)(
1936         ompt_work_type, ompt_scope_end, &(team_info->parallel_data),
1937         &(task_info->task_data), 0, OMPT_GET_RETURN_ADDRESS(0));
1938   }
1939 #endif
1940   if (__kmp_env_consistency_check)
1941     __kmp_pop_workshare(global_tid, ct_pdo, loc);
1942 }
1943 
1944 // User routines which take C-style arguments (call by value)
1945 // different from the Fortran equivalent routines
1946 
1947 void ompc_set_num_threads(int arg) {
1948   // !!!!! TODO: check the per-task binding
1949   __kmp_set_num_threads(arg, __kmp_entry_gtid());
1950 }
1951 
1952 void ompc_set_dynamic(int flag) {
1953   kmp_info_t *thread;
1954 
1955   /* For the thread-private implementation of the internal controls */
1956   thread = __kmp_entry_thread();
1957 
1958   __kmp_save_internal_controls(thread);
1959 
1960   set__dynamic(thread, flag ? true : false);
1961 }
1962 
1963 void ompc_set_nested(int flag) {
1964   kmp_info_t *thread;
1965 
1966   /* For the thread-private internal controls implementation */
1967   thread = __kmp_entry_thread();
1968 
1969   __kmp_save_internal_controls(thread);
1970 
1971   set__max_active_levels(thread, flag ? __kmp_dflt_max_active_levels : 1);
1972 }
1973 
1974 void ompc_set_max_active_levels(int max_active_levels) {
1975   /* TO DO */
1976   /* we want per-task implementation of this internal control */
1977 
1978   /* For the per-thread internal controls implementation */
1979   __kmp_set_max_active_levels(__kmp_entry_gtid(), max_active_levels);
1980 }
1981 
1982 void ompc_set_schedule(omp_sched_t kind, int modifier) {
1983   // !!!!! TODO: check the per-task binding
1984   __kmp_set_schedule(__kmp_entry_gtid(), (kmp_sched_t)kind, modifier);
1985 }
1986 
1987 int ompc_get_ancestor_thread_num(int level) {
1988   return __kmp_get_ancestor_thread_num(__kmp_entry_gtid(), level);
1989 }
1990 
1991 int ompc_get_team_size(int level) {
1992   return __kmp_get_team_size(__kmp_entry_gtid(), level);
1993 }
1994 
1995 /* OpenMP 5.0 Affinity Format API */
1996 void KMP_EXPAND_NAME(ompc_set_affinity_format)(char const *format) {
1997   if (!__kmp_init_serial) {
1998     __kmp_serial_initialize();
1999   }
2000   __kmp_strncpy_truncate(__kmp_affinity_format, KMP_AFFINITY_FORMAT_SIZE,
2001                          format, KMP_STRLEN(format) + 1);
2002 }
2003 
2004 size_t KMP_EXPAND_NAME(ompc_get_affinity_format)(char *buffer, size_t size) {
2005   size_t format_size;
2006   if (!__kmp_init_serial) {
2007     __kmp_serial_initialize();
2008   }
2009   format_size = KMP_STRLEN(__kmp_affinity_format);
2010   if (buffer && size) {
2011     __kmp_strncpy_truncate(buffer, size, __kmp_affinity_format,
2012                            format_size + 1);
2013   }
2014   return format_size;
2015 }
2016 
2017 void KMP_EXPAND_NAME(ompc_display_affinity)(char const *format) {
2018   int gtid;
2019   if (!TCR_4(__kmp_init_middle)) {
2020     __kmp_middle_initialize();
2021   }
2022   __kmp_assign_root_init_mask();
2023   gtid = __kmp_get_gtid();
2024   __kmp_aux_display_affinity(gtid, format);
2025 }
2026 
2027 size_t KMP_EXPAND_NAME(ompc_capture_affinity)(char *buffer, size_t buf_size,
2028                                               char const *format) {
2029   int gtid;
2030   size_t num_required;
2031   kmp_str_buf_t capture_buf;
2032   if (!TCR_4(__kmp_init_middle)) {
2033     __kmp_middle_initialize();
2034   }
2035   __kmp_assign_root_init_mask();
2036   gtid = __kmp_get_gtid();
2037   __kmp_str_buf_init(&capture_buf);
2038   num_required = __kmp_aux_capture_affinity(gtid, format, &capture_buf);
2039   if (buffer && buf_size) {
2040     __kmp_strncpy_truncate(buffer, buf_size, capture_buf.str,
2041                            capture_buf.used + 1);
2042   }
2043   __kmp_str_buf_free(&capture_buf);
2044   return num_required;
2045 }
2046 
2047 void kmpc_set_stacksize(int arg) {
2048   // __kmp_aux_set_stacksize initializes the library if needed
2049   __kmp_aux_set_stacksize(arg);
2050 }
2051 
2052 void kmpc_set_stacksize_s(size_t arg) {
2053   // __kmp_aux_set_stacksize initializes the library if needed
2054   __kmp_aux_set_stacksize(arg);
2055 }
2056 
2057 void kmpc_set_blocktime(int arg) {
2058   int gtid, tid;
2059   kmp_info_t *thread;
2060 
2061   gtid = __kmp_entry_gtid();
2062   tid = __kmp_tid_from_gtid(gtid);
2063   thread = __kmp_thread_from_gtid(gtid);
2064 
2065   __kmp_aux_set_blocktime(arg, thread, tid);
2066 }
2067 
2068 void kmpc_set_library(int arg) {
2069   // __kmp_user_set_library initializes the library if needed
2070   __kmp_user_set_library((enum library_type)arg);
2071 }
2072 
2073 void kmpc_set_defaults(char const *str) {
2074   // __kmp_aux_set_defaults initializes the library if needed
2075   __kmp_aux_set_defaults(str, KMP_STRLEN(str));
2076 }
2077 
2078 void kmpc_set_disp_num_buffers(int arg) {
2079   // ignore after initialization because some teams have already
2080   // allocated dispatch buffers
2081   if (__kmp_init_serial == FALSE && arg >= KMP_MIN_DISP_NUM_BUFF &&
2082       arg <= KMP_MAX_DISP_NUM_BUFF) {
2083     __kmp_dispatch_num_buffers = arg;
2084   }
2085 }
2086 
2087 int kmpc_set_affinity_mask_proc(int proc, void **mask) {
2088 #if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED
2089   return -1;
2090 #else
2091   if (!TCR_4(__kmp_init_middle)) {
2092     __kmp_middle_initialize();
2093   }
2094   __kmp_assign_root_init_mask();
2095   return __kmp_aux_set_affinity_mask_proc(proc, mask);
2096 #endif
2097 }
2098 
2099 int kmpc_unset_affinity_mask_proc(int proc, void **mask) {
2100 #if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED
2101   return -1;
2102 #else
2103   if (!TCR_4(__kmp_init_middle)) {
2104     __kmp_middle_initialize();
2105   }
2106   __kmp_assign_root_init_mask();
2107   return __kmp_aux_unset_affinity_mask_proc(proc, mask);
2108 #endif
2109 }
2110 
2111 int kmpc_get_affinity_mask_proc(int proc, void **mask) {
2112 #if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED
2113   return -1;
2114 #else
2115   if (!TCR_4(__kmp_init_middle)) {
2116     __kmp_middle_initialize();
2117   }
2118   __kmp_assign_root_init_mask();
2119   return __kmp_aux_get_affinity_mask_proc(proc, mask);
2120 #endif
2121 }
2122 
2123 /* -------------------------------------------------------------------------- */
2124 /*!
2125 @ingroup THREADPRIVATE
2126 @param loc       source location information
2127 @param gtid      global thread number
2128 @param cpy_size  size of the cpy_data buffer
2129 @param cpy_data  pointer to data to be copied
2130 @param cpy_func  helper function to call for copying data
2131 @param didit     flag variable: 1=single thread; 0=not single thread
2132 
2133 __kmpc_copyprivate implements the interface for the private data broadcast
2134 needed for the copyprivate clause associated with a single region in an
2135 OpenMP<sup>*</sup> program (both C and Fortran).
2136 All threads participating in the parallel region call this routine.
2137 One of the threads (called the single thread) should have the <tt>didit</tt>
2138 variable set to 1 and all other threads should have that variable set to 0.
2139 All threads pass a pointer to a data buffer (cpy_data) that they have built.
2140 
2141 The OpenMP specification forbids the use of nowait on the single region when a
2142 copyprivate clause is present. However, @ref __kmpc_copyprivate implements a
2143 barrier internally to avoid race conditions, so the code generation for the
2144 single region should avoid generating a barrier after the call to @ref
2145 __kmpc_copyprivate.
2146 
2147 The <tt>gtid</tt> parameter is the global thread id for the current thread.
2148 The <tt>loc</tt> parameter is a pointer to source location information.
2149 
2150 Internal implementation: The single thread will first copy its descriptor
2151 address (cpy_data) to a team-private location, then the other threads will each
2152 call the function pointed to by the parameter cpy_func, which carries out the
2153 copy by copying the data using the cpy_data buffer.
2154 
2155 The cpy_func routine used for the copy and the contents of the data area defined
2156 by cpy_data and cpy_size may be built in any fashion that will allow the copy
2157 to be done. For instance, the cpy_data buffer can hold the actual data to be
2158 copied or it may hold a list of pointers to the data. The cpy_func routine must
2159 interpret the cpy_data buffer appropriately.
2160 
2161 The interface to cpy_func is as follows:
2162 @code
2163 void cpy_func( void *destination, void *source )
2164 @endcode
2165 where void *destination is the cpy_data pointer for the thread being copied to
2166 and void *source is the cpy_data pointer for the thread being copied from.
2167 */
2168 void __kmpc_copyprivate(ident_t *loc, kmp_int32 gtid, size_t cpy_size,
2169                         void *cpy_data, void (*cpy_func)(void *, void *),
2170                         kmp_int32 didit) {
2171   void **data_ptr;
2172   KC_TRACE(10, ("__kmpc_copyprivate: called T#%d\n", gtid));
2173   __kmp_assert_valid_gtid(gtid);
2174 
2175   KMP_MB();
2176 
2177   data_ptr = &__kmp_team_from_gtid(gtid)->t.t_copypriv_data;
2178 
2179   if (__kmp_env_consistency_check) {
2180     if (loc == 0) {
2181       KMP_WARNING(ConstructIdentInvalid);
2182     }
2183   }
2184 
2185   // ToDo: Optimize the following two barriers into some kind of split barrier
2186 
2187   if (didit)
2188     *data_ptr = cpy_data;
2189 
2190 #if OMPT_SUPPORT
2191   ompt_frame_t *ompt_frame;
2192   if (ompt_enabled.enabled) {
2193     __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
2194     if (ompt_frame->enter_frame.ptr == NULL)
2195       ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
2196   }
2197   OMPT_STORE_RETURN_ADDRESS(gtid);
2198 #endif
2199 /* This barrier is not a barrier region boundary */
2200 #if USE_ITT_NOTIFY
2201   __kmp_threads[gtid]->th.th_ident = loc;
2202 #endif
2203   __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);
2204 
2205   if (!didit)
2206     (*cpy_func)(cpy_data, *data_ptr);
2207 
2208   // Consider next barrier a user-visible barrier for barrier region boundaries
2209   // Nesting checks are already handled by the single construct checks
2210   {
2211 #if OMPT_SUPPORT
2212     OMPT_STORE_RETURN_ADDRESS(gtid);
2213 #endif
2214 #if USE_ITT_NOTIFY
2215     __kmp_threads[gtid]->th.th_ident = loc; // TODO: check if it is needed (e.g.
2216 // tasks can overwrite the location)
2217 #endif
2218     __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);
2219 #if OMPT_SUPPORT && OMPT_OPTIONAL
2220     if (ompt_enabled.enabled) {
2221       ompt_frame->enter_frame = ompt_data_none;
2222     }
2223 #endif
2224   }
2225 }
2226 
2227 /* -------------------------------------------------------------------------- */
2228 
2229 #define INIT_LOCK __kmp_init_user_lock_with_checks
2230 #define INIT_NESTED_LOCK __kmp_init_nested_user_lock_with_checks
2231 #define ACQUIRE_LOCK __kmp_acquire_user_lock_with_checks
2232 #define ACQUIRE_LOCK_TIMED __kmp_acquire_user_lock_with_checks_timed
2233 #define ACQUIRE_NESTED_LOCK __kmp_acquire_nested_user_lock_with_checks
2234 #define ACQUIRE_NESTED_LOCK_TIMED                                              \
2235   __kmp_acquire_nested_user_lock_with_checks_timed
2236 #define RELEASE_LOCK __kmp_release_user_lock_with_checks
2237 #define RELEASE_NESTED_LOCK __kmp_release_nested_user_lock_with_checks
2238 #define TEST_LOCK __kmp_test_user_lock_with_checks
2239 #define TEST_NESTED_LOCK __kmp_test_nested_user_lock_with_checks
2240 #define DESTROY_LOCK __kmp_destroy_user_lock_with_checks
2241 #define DESTROY_NESTED_LOCK __kmp_destroy_nested_user_lock_with_checks
2242 
2243 // TODO: Make check abort messages use location info & pass it into
2244 // with_checks routines
2245 
2246 #if KMP_USE_DYNAMIC_LOCK
2247 
2248 // internal lock initializer
2249 static __forceinline void __kmp_init_lock_with_hint(ident_t *loc, void **lock,
2250                                                     kmp_dyna_lockseq_t seq) {
2251   if (KMP_IS_D_LOCK(seq)) {
2252     KMP_INIT_D_LOCK(lock, seq);
2253 #if USE_ITT_BUILD
2254     __kmp_itt_lock_creating((kmp_user_lock_p)lock, NULL);
2255 #endif
2256   } else {
2257     KMP_INIT_I_LOCK(lock, seq);
2258 #if USE_ITT_BUILD
2259     kmp_indirect_lock_t *ilk = KMP_LOOKUP_I_LOCK(lock);
2260     __kmp_itt_lock_creating(ilk->lock, loc);
2261 #endif
2262   }
2263 }
2264 
2265 // internal nest lock initializer
2266 static __forceinline void
2267 __kmp_init_nest_lock_with_hint(ident_t *loc, void **lock,
2268                                kmp_dyna_lockseq_t seq) {
2269 #if KMP_USE_TSX
2270   // Don't have nested lock implementation for speculative locks
2271   if (seq == lockseq_hle || seq == lockseq_rtm_queuing ||
2272       seq == lockseq_rtm_spin || seq == lockseq_adaptive)
2273     seq = __kmp_user_lock_seq;
2274 #endif
2275   switch (seq) {
2276   case lockseq_tas:
2277     seq = lockseq_nested_tas;
2278     break;
2279 #if KMP_USE_FUTEX
2280   case lockseq_futex:
2281     seq = lockseq_nested_futex;
2282     break;
2283 #endif
2284   case lockseq_ticket:
2285     seq = lockseq_nested_ticket;
2286     break;
2287   case lockseq_queuing:
2288     seq = lockseq_nested_queuing;
2289     break;
2290   case lockseq_drdpa:
2291     seq = lockseq_nested_drdpa;
2292     break;
2293   default:
2294     seq = lockseq_nested_queuing;
2295   }
2296   KMP_INIT_I_LOCK(lock, seq);
2297 #if USE_ITT_BUILD
2298   kmp_indirect_lock_t *ilk = KMP_LOOKUP_I_LOCK(lock);
2299   __kmp_itt_lock_creating(ilk->lock, loc);
2300 #endif
2301 }
2302 
2303 /* initialize the lock with a hint */
2304 void __kmpc_init_lock_with_hint(ident_t *loc, kmp_int32 gtid, void **user_lock,
2305                                 uintptr_t hint) {
2306   KMP_DEBUG_ASSERT(__kmp_init_serial);
2307   if (__kmp_env_consistency_check && user_lock == NULL) {
2308     KMP_FATAL(LockIsUninitialized, "omp_init_lock_with_hint");
2309   }
2310 
2311   __kmp_init_lock_with_hint(loc, user_lock, __kmp_map_hint_to_lock(hint));
2312 
2313 #if OMPT_SUPPORT && OMPT_OPTIONAL
2314   // This is the case, if called from omp_init_lock_with_hint:
2315   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2316   if (!codeptr)
2317     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2318   if (ompt_enabled.ompt_callback_lock_init) {
2319     ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2320         ompt_mutex_lock, (omp_lock_hint_t)hint,
2321         __ompt_get_mutex_impl_type(user_lock),
2322         (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2323   }
2324 #endif
2325 }
2326 
2327 /* initialize the lock with a hint */
2328 void __kmpc_init_nest_lock_with_hint(ident_t *loc, kmp_int32 gtid,
2329                                      void **user_lock, uintptr_t hint) {
2330   KMP_DEBUG_ASSERT(__kmp_init_serial);
2331   if (__kmp_env_consistency_check && user_lock == NULL) {
2332     KMP_FATAL(LockIsUninitialized, "omp_init_nest_lock_with_hint");
2333   }
2334 
2335   __kmp_init_nest_lock_with_hint(loc, user_lock, __kmp_map_hint_to_lock(hint));
2336 
2337 #if OMPT_SUPPORT && OMPT_OPTIONAL
2338   // This is the case, if called from omp_init_lock_with_hint:
2339   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2340   if (!codeptr)
2341     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2342   if (ompt_enabled.ompt_callback_lock_init) {
2343     ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2344         ompt_mutex_nest_lock, (omp_lock_hint_t)hint,
2345         __ompt_get_mutex_impl_type(user_lock),
2346         (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2347   }
2348 #endif
2349 }
2350 
2351 #endif // KMP_USE_DYNAMIC_LOCK
2352 
2353 /* initialize the lock */
2354 void __kmpc_init_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2355 #if KMP_USE_DYNAMIC_LOCK
2356 
2357   KMP_DEBUG_ASSERT(__kmp_init_serial);
2358   if (__kmp_env_consistency_check && user_lock == NULL) {
2359     KMP_FATAL(LockIsUninitialized, "omp_init_lock");
2360   }
2361   __kmp_init_lock_with_hint(loc, user_lock, __kmp_user_lock_seq);
2362 
2363 #if OMPT_SUPPORT && OMPT_OPTIONAL
2364   // This is the case, if called from omp_init_lock_with_hint:
2365   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2366   if (!codeptr)
2367     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2368   if (ompt_enabled.ompt_callback_lock_init) {
2369     ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2370         ompt_mutex_lock, omp_lock_hint_none,
2371         __ompt_get_mutex_impl_type(user_lock),
2372         (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2373   }
2374 #endif
2375 
2376 #else // KMP_USE_DYNAMIC_LOCK
2377 
2378   static char const *const func = "omp_init_lock";
2379   kmp_user_lock_p lck;
2380   KMP_DEBUG_ASSERT(__kmp_init_serial);
2381 
2382   if (__kmp_env_consistency_check) {
2383     if (user_lock == NULL) {
2384       KMP_FATAL(LockIsUninitialized, func);
2385     }
2386   }
2387 
2388   KMP_CHECK_USER_LOCK_INIT();
2389 
2390   if ((__kmp_user_lock_kind == lk_tas) &&
2391       (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
2392     lck = (kmp_user_lock_p)user_lock;
2393   }
2394 #if KMP_USE_FUTEX
2395   else if ((__kmp_user_lock_kind == lk_futex) &&
2396            (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
2397     lck = (kmp_user_lock_p)user_lock;
2398   }
2399 #endif
2400   else {
2401     lck = __kmp_user_lock_allocate(user_lock, gtid, 0);
2402   }
2403   INIT_LOCK(lck);
2404   __kmp_set_user_lock_location(lck, loc);
2405 
2406 #if OMPT_SUPPORT && OMPT_OPTIONAL
2407   // This is the case, if called from omp_init_lock_with_hint:
2408   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2409   if (!codeptr)
2410     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2411   if (ompt_enabled.ompt_callback_lock_init) {
2412     ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2413         ompt_mutex_lock, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
2414         (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2415   }
2416 #endif
2417 
2418 #if USE_ITT_BUILD
2419   __kmp_itt_lock_creating(lck);
2420 #endif /* USE_ITT_BUILD */
2421 
2422 #endif // KMP_USE_DYNAMIC_LOCK
2423 } // __kmpc_init_lock
2424 
2425 /* initialize the lock */
2426 void __kmpc_init_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2427 #if KMP_USE_DYNAMIC_LOCK
2428 
2429   KMP_DEBUG_ASSERT(__kmp_init_serial);
2430   if (__kmp_env_consistency_check && user_lock == NULL) {
2431     KMP_FATAL(LockIsUninitialized, "omp_init_nest_lock");
2432   }
2433   __kmp_init_nest_lock_with_hint(loc, user_lock, __kmp_user_lock_seq);
2434 
2435 #if OMPT_SUPPORT && OMPT_OPTIONAL
2436   // This is the case, if called from omp_init_lock_with_hint:
2437   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2438   if (!codeptr)
2439     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2440   if (ompt_enabled.ompt_callback_lock_init) {
2441     ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2442         ompt_mutex_nest_lock, omp_lock_hint_none,
2443         __ompt_get_mutex_impl_type(user_lock),
2444         (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2445   }
2446 #endif
2447 
2448 #else // KMP_USE_DYNAMIC_LOCK
2449 
2450   static char const *const func = "omp_init_nest_lock";
2451   kmp_user_lock_p lck;
2452   KMP_DEBUG_ASSERT(__kmp_init_serial);
2453 
2454   if (__kmp_env_consistency_check) {
2455     if (user_lock == NULL) {
2456       KMP_FATAL(LockIsUninitialized, func);
2457     }
2458   }
2459 
2460   KMP_CHECK_USER_LOCK_INIT();
2461 
2462   if ((__kmp_user_lock_kind == lk_tas) &&
2463       (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
2464        OMP_NEST_LOCK_T_SIZE)) {
2465     lck = (kmp_user_lock_p)user_lock;
2466   }
2467 #if KMP_USE_FUTEX
2468   else if ((__kmp_user_lock_kind == lk_futex) &&
2469            (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
2470             OMP_NEST_LOCK_T_SIZE)) {
2471     lck = (kmp_user_lock_p)user_lock;
2472   }
2473 #endif
2474   else {
2475     lck = __kmp_user_lock_allocate(user_lock, gtid, 0);
2476   }
2477 
2478   INIT_NESTED_LOCK(lck);
2479   __kmp_set_user_lock_location(lck, loc);
2480 
2481 #if OMPT_SUPPORT && OMPT_OPTIONAL
2482   // This is the case, if called from omp_init_lock_with_hint:
2483   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2484   if (!codeptr)
2485     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2486   if (ompt_enabled.ompt_callback_lock_init) {
2487     ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2488         ompt_mutex_nest_lock, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
2489         (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2490   }
2491 #endif
2492 
2493 #if USE_ITT_BUILD
2494   __kmp_itt_lock_creating(lck);
2495 #endif /* USE_ITT_BUILD */
2496 
2497 #endif // KMP_USE_DYNAMIC_LOCK
2498 } // __kmpc_init_nest_lock
2499 
2500 void __kmpc_destroy_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2501 #if KMP_USE_DYNAMIC_LOCK
2502 
2503 #if USE_ITT_BUILD
2504   kmp_user_lock_p lck;
2505   if (KMP_EXTRACT_D_TAG(user_lock) == 0) {
2506     lck = ((kmp_indirect_lock_t *)KMP_LOOKUP_I_LOCK(user_lock))->lock;
2507   } else {
2508     lck = (kmp_user_lock_p)user_lock;
2509   }
2510   __kmp_itt_lock_destroyed(lck);
2511 #endif
2512 #if OMPT_SUPPORT && OMPT_OPTIONAL
2513   // This is the case, if called from omp_init_lock_with_hint:
2514   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2515   if (!codeptr)
2516     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2517   if (ompt_enabled.ompt_callback_lock_destroy) {
2518     ompt_callbacks.ompt_callback(ompt_callback_lock_destroy)(
2519         ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2520   }
2521 #endif
2522   KMP_D_LOCK_FUNC(user_lock, destroy)((kmp_dyna_lock_t *)user_lock);
2523 #else
2524   kmp_user_lock_p lck;
2525 
2526   if ((__kmp_user_lock_kind == lk_tas) &&
2527       (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
2528     lck = (kmp_user_lock_p)user_lock;
2529   }
2530 #if KMP_USE_FUTEX
2531   else if ((__kmp_user_lock_kind == lk_futex) &&
2532            (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
2533     lck = (kmp_user_lock_p)user_lock;
2534   }
2535 #endif
2536   else {
2537     lck = __kmp_lookup_user_lock(user_lock, "omp_destroy_lock");
2538   }
2539 
2540 #if OMPT_SUPPORT && OMPT_OPTIONAL
2541   // This is the case, if called from omp_init_lock_with_hint:
2542   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2543   if (!codeptr)
2544     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2545   if (ompt_enabled.ompt_callback_lock_destroy) {
2546     ompt_callbacks.ompt_callback(ompt_callback_lock_destroy)(
2547         ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2548   }
2549 #endif
2550 
2551 #if USE_ITT_BUILD
2552   __kmp_itt_lock_destroyed(lck);
2553 #endif /* USE_ITT_BUILD */
2554   DESTROY_LOCK(lck);
2555 
2556   if ((__kmp_user_lock_kind == lk_tas) &&
2557       (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
2558     ;
2559   }
2560 #if KMP_USE_FUTEX
2561   else if ((__kmp_user_lock_kind == lk_futex) &&
2562            (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
2563     ;
2564   }
2565 #endif
2566   else {
2567     __kmp_user_lock_free(user_lock, gtid, lck);
2568   }
2569 #endif // KMP_USE_DYNAMIC_LOCK
2570 } // __kmpc_destroy_lock
2571 
2572 /* destroy the lock */
2573 void __kmpc_destroy_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2574 #if KMP_USE_DYNAMIC_LOCK
2575 
2576 #if USE_ITT_BUILD
2577   kmp_indirect_lock_t *ilk = KMP_LOOKUP_I_LOCK(user_lock);
2578   __kmp_itt_lock_destroyed(ilk->lock);
2579 #endif
2580 #if OMPT_SUPPORT && OMPT_OPTIONAL
2581   // This is the case, if called from omp_init_lock_with_hint:
2582   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2583   if (!codeptr)
2584     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2585   if (ompt_enabled.ompt_callback_lock_destroy) {
2586     ompt_callbacks.ompt_callback(ompt_callback_lock_destroy)(
2587         ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2588   }
2589 #endif
2590   KMP_D_LOCK_FUNC(user_lock, destroy)((kmp_dyna_lock_t *)user_lock);
2591 
2592 #else // KMP_USE_DYNAMIC_LOCK
2593 
2594   kmp_user_lock_p lck;
2595 
2596   if ((__kmp_user_lock_kind == lk_tas) &&
2597       (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
2598        OMP_NEST_LOCK_T_SIZE)) {
2599     lck = (kmp_user_lock_p)user_lock;
2600   }
2601 #if KMP_USE_FUTEX
2602   else if ((__kmp_user_lock_kind == lk_futex) &&
2603            (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
2604             OMP_NEST_LOCK_T_SIZE)) {
2605     lck = (kmp_user_lock_p)user_lock;
2606   }
2607 #endif
2608   else {
2609     lck = __kmp_lookup_user_lock(user_lock, "omp_destroy_nest_lock");
2610   }
2611 
2612 #if OMPT_SUPPORT && OMPT_OPTIONAL
2613   // This is the case, if called from omp_init_lock_with_hint:
2614   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2615   if (!codeptr)
2616     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2617   if (ompt_enabled.ompt_callback_lock_destroy) {
2618     ompt_callbacks.ompt_callback(ompt_callback_lock_destroy)(
2619         ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2620   }
2621 #endif
2622 
2623 #if USE_ITT_BUILD
2624   __kmp_itt_lock_destroyed(lck);
2625 #endif /* USE_ITT_BUILD */
2626 
2627   DESTROY_NESTED_LOCK(lck);
2628 
2629   if ((__kmp_user_lock_kind == lk_tas) &&
2630       (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
2631        OMP_NEST_LOCK_T_SIZE)) {
2632     ;
2633   }
2634 #if KMP_USE_FUTEX
2635   else if ((__kmp_user_lock_kind == lk_futex) &&
2636            (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
2637             OMP_NEST_LOCK_T_SIZE)) {
2638     ;
2639   }
2640 #endif
2641   else {
2642     __kmp_user_lock_free(user_lock, gtid, lck);
2643   }
2644 #endif // KMP_USE_DYNAMIC_LOCK
2645 } // __kmpc_destroy_nest_lock
2646 
2647 void __kmpc_set_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2648   KMP_COUNT_BLOCK(OMP_set_lock);
2649 #if KMP_USE_DYNAMIC_LOCK
2650   int tag = KMP_EXTRACT_D_TAG(user_lock);
2651 #if USE_ITT_BUILD
2652   __kmp_itt_lock_acquiring(
2653       (kmp_user_lock_p)
2654           user_lock); // itt function will get to the right lock object.
2655 #endif
2656 #if OMPT_SUPPORT && OMPT_OPTIONAL
2657   // This is the case, if called from omp_init_lock_with_hint:
2658   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2659   if (!codeptr)
2660     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2661   if (ompt_enabled.ompt_callback_mutex_acquire) {
2662     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
2663         ompt_mutex_lock, omp_lock_hint_none,
2664         __ompt_get_mutex_impl_type(user_lock),
2665         (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2666   }
2667 #endif
2668 #if KMP_USE_INLINED_TAS
2669   if (tag == locktag_tas && !__kmp_env_consistency_check) {
2670     KMP_ACQUIRE_TAS_LOCK(user_lock, gtid);
2671   } else
2672 #elif KMP_USE_INLINED_FUTEX
2673   if (tag == locktag_futex && !__kmp_env_consistency_check) {
2674     KMP_ACQUIRE_FUTEX_LOCK(user_lock, gtid);
2675   } else
2676 #endif
2677   {
2678     __kmp_direct_set[tag]((kmp_dyna_lock_t *)user_lock, gtid);
2679   }
2680 #if USE_ITT_BUILD
2681   __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock);
2682 #endif
2683 #if OMPT_SUPPORT && OMPT_OPTIONAL
2684   if (ompt_enabled.ompt_callback_mutex_acquired) {
2685     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
2686         ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2687   }
2688 #endif
2689 
2690 #else // KMP_USE_DYNAMIC_LOCK
2691 
2692   kmp_user_lock_p lck;
2693 
2694   if ((__kmp_user_lock_kind == lk_tas) &&
2695       (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
2696     lck = (kmp_user_lock_p)user_lock;
2697   }
2698 #if KMP_USE_FUTEX
2699   else if ((__kmp_user_lock_kind == lk_futex) &&
2700            (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
2701     lck = (kmp_user_lock_p)user_lock;
2702   }
2703 #endif
2704   else {
2705     lck = __kmp_lookup_user_lock(user_lock, "omp_set_lock");
2706   }
2707 
2708 #if USE_ITT_BUILD
2709   __kmp_itt_lock_acquiring(lck);
2710 #endif /* USE_ITT_BUILD */
2711 #if OMPT_SUPPORT && OMPT_OPTIONAL
2712   // This is the case, if called from omp_init_lock_with_hint:
2713   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2714   if (!codeptr)
2715     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2716   if (ompt_enabled.ompt_callback_mutex_acquire) {
2717     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
2718         ompt_mutex_lock, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
2719         (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2720   }
2721 #endif
2722 
2723   ACQUIRE_LOCK(lck, gtid);
2724 
2725 #if USE_ITT_BUILD
2726   __kmp_itt_lock_acquired(lck);
2727 #endif /* USE_ITT_BUILD */
2728 
2729 #if OMPT_SUPPORT && OMPT_OPTIONAL
2730   if (ompt_enabled.ompt_callback_mutex_acquired) {
2731     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
2732         ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2733   }
2734 #endif
2735 
2736 #endif // KMP_USE_DYNAMIC_LOCK
2737 }
2738 
2739 void __kmpc_set_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2740 #if KMP_USE_DYNAMIC_LOCK
2741 
2742 #if USE_ITT_BUILD
2743   __kmp_itt_lock_acquiring((kmp_user_lock_p)user_lock);
2744 #endif
2745 #if OMPT_SUPPORT && OMPT_OPTIONAL
2746   // This is the case, if called from omp_init_lock_with_hint:
2747   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2748   if (!codeptr)
2749     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2750   if (ompt_enabled.enabled) {
2751     if (ompt_enabled.ompt_callback_mutex_acquire) {
2752       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
2753           ompt_mutex_nest_lock, omp_lock_hint_none,
2754           __ompt_get_mutex_impl_type(user_lock),
2755           (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2756     }
2757   }
2758 #endif
2759   int acquire_status =
2760       KMP_D_LOCK_FUNC(user_lock, set)((kmp_dyna_lock_t *)user_lock, gtid);
2761   (void)acquire_status;
2762 #if USE_ITT_BUILD
2763   __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock);
2764 #endif
2765 
2766 #if OMPT_SUPPORT && OMPT_OPTIONAL
2767   if (ompt_enabled.enabled) {
2768     if (acquire_status == KMP_LOCK_ACQUIRED_FIRST) {
2769       if (ompt_enabled.ompt_callback_mutex_acquired) {
2770         // lock_first
2771         ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
2772             ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)user_lock,
2773             codeptr);
2774       }
2775     } else {
2776       if (ompt_enabled.ompt_callback_nest_lock) {
2777         // lock_next
2778         ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
2779             ompt_scope_begin, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2780       }
2781     }
2782   }
2783 #endif
2784 
2785 #else // KMP_USE_DYNAMIC_LOCK
2786   int acquire_status;
2787   kmp_user_lock_p lck;
2788 
2789   if ((__kmp_user_lock_kind == lk_tas) &&
2790       (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
2791        OMP_NEST_LOCK_T_SIZE)) {
2792     lck = (kmp_user_lock_p)user_lock;
2793   }
2794 #if KMP_USE_FUTEX
2795   else if ((__kmp_user_lock_kind == lk_futex) &&
2796            (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
2797             OMP_NEST_LOCK_T_SIZE)) {
2798     lck = (kmp_user_lock_p)user_lock;
2799   }
2800 #endif
2801   else {
2802     lck = __kmp_lookup_user_lock(user_lock, "omp_set_nest_lock");
2803   }
2804 
2805 #if USE_ITT_BUILD
2806   __kmp_itt_lock_acquiring(lck);
2807 #endif /* USE_ITT_BUILD */
2808 #if OMPT_SUPPORT && OMPT_OPTIONAL
2809   // This is the case, if called from omp_init_lock_with_hint:
2810   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2811   if (!codeptr)
2812     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2813   if (ompt_enabled.enabled) {
2814     if (ompt_enabled.ompt_callback_mutex_acquire) {
2815       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
2816           ompt_mutex_nest_lock, omp_lock_hint_none,
2817           __ompt_get_mutex_impl_type(), (ompt_wait_id_t)(uintptr_t)lck,
2818           codeptr);
2819     }
2820   }
2821 #endif
2822 
2823   ACQUIRE_NESTED_LOCK(lck, gtid, &acquire_status);
2824 
2825 #if USE_ITT_BUILD
2826   __kmp_itt_lock_acquired(lck);
2827 #endif /* USE_ITT_BUILD */
2828 
2829 #if OMPT_SUPPORT && OMPT_OPTIONAL
2830   if (ompt_enabled.enabled) {
2831     if (acquire_status == KMP_LOCK_ACQUIRED_FIRST) {
2832       if (ompt_enabled.ompt_callback_mutex_acquired) {
2833         // lock_first
2834         ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
2835             ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2836       }
2837     } else {
2838       if (ompt_enabled.ompt_callback_nest_lock) {
2839         // lock_next
2840         ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
2841             ompt_scope_begin, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2842       }
2843     }
2844   }
2845 #endif
2846 
2847 #endif // KMP_USE_DYNAMIC_LOCK
2848 }
2849 
2850 void __kmpc_unset_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2851 #if KMP_USE_DYNAMIC_LOCK
2852 
2853   int tag = KMP_EXTRACT_D_TAG(user_lock);
2854 #if USE_ITT_BUILD
2855   __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock);
2856 #endif
2857 #if KMP_USE_INLINED_TAS
2858   if (tag == locktag_tas && !__kmp_env_consistency_check) {
2859     KMP_RELEASE_TAS_LOCK(user_lock, gtid);
2860   } else
2861 #elif KMP_USE_INLINED_FUTEX
2862   if (tag == locktag_futex && !__kmp_env_consistency_check) {
2863     KMP_RELEASE_FUTEX_LOCK(user_lock, gtid);
2864   } else
2865 #endif
2866   {
2867     __kmp_direct_unset[tag]((kmp_dyna_lock_t *)user_lock, gtid);
2868   }
2869 
2870 #if OMPT_SUPPORT && OMPT_OPTIONAL
2871   // This is the case, if called from omp_init_lock_with_hint:
2872   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2873   if (!codeptr)
2874     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2875   if (ompt_enabled.ompt_callback_mutex_released) {
2876     ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
2877         ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2878   }
2879 #endif
2880 
2881 #else // KMP_USE_DYNAMIC_LOCK
2882 
2883   kmp_user_lock_p lck;
2884 
2885   /* Can't use serial interval since not block structured */
2886   /* release the lock */
2887 
2888   if ((__kmp_user_lock_kind == lk_tas) &&
2889       (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
2890 #if KMP_OS_LINUX &&                                                            \
2891     (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
2892 // "fast" path implemented to fix customer performance issue
2893 #if USE_ITT_BUILD
2894     __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock);
2895 #endif /* USE_ITT_BUILD */
2896     TCW_4(((kmp_user_lock_p)user_lock)->tas.lk.poll, 0);
2897     KMP_MB();
2898 
2899 #if OMPT_SUPPORT && OMPT_OPTIONAL
2900     // This is the case, if called from omp_init_lock_with_hint:
2901     void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2902     if (!codeptr)
2903       codeptr = OMPT_GET_RETURN_ADDRESS(0);
2904     if (ompt_enabled.ompt_callback_mutex_released) {
2905       ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
2906           ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2907     }
2908 #endif
2909 
2910     return;
2911 #else
2912     lck = (kmp_user_lock_p)user_lock;
2913 #endif
2914   }
2915 #if KMP_USE_FUTEX
2916   else if ((__kmp_user_lock_kind == lk_futex) &&
2917            (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
2918     lck = (kmp_user_lock_p)user_lock;
2919   }
2920 #endif
2921   else {
2922     lck = __kmp_lookup_user_lock(user_lock, "omp_unset_lock");
2923   }
2924 
2925 #if USE_ITT_BUILD
2926   __kmp_itt_lock_releasing(lck);
2927 #endif /* USE_ITT_BUILD */
2928 
2929   RELEASE_LOCK(lck, gtid);
2930 
2931 #if OMPT_SUPPORT && OMPT_OPTIONAL
2932   // This is the case, if called from omp_init_lock_with_hint:
2933   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2934   if (!codeptr)
2935     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2936   if (ompt_enabled.ompt_callback_mutex_released) {
2937     ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
2938         ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2939   }
2940 #endif
2941 
2942 #endif // KMP_USE_DYNAMIC_LOCK
2943 }
2944 
2945 /* release the lock */
2946 void __kmpc_unset_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2947 #if KMP_USE_DYNAMIC_LOCK
2948 
2949 #if USE_ITT_BUILD
2950   __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock);
2951 #endif
2952   int release_status =
2953       KMP_D_LOCK_FUNC(user_lock, unset)((kmp_dyna_lock_t *)user_lock, gtid);
2954   (void)release_status;
2955 
2956 #if OMPT_SUPPORT && OMPT_OPTIONAL
2957   // This is the case, if called from omp_init_lock_with_hint:
2958   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2959   if (!codeptr)
2960     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2961   if (ompt_enabled.enabled) {
2962     if (release_status == KMP_LOCK_RELEASED) {
2963       if (ompt_enabled.ompt_callback_mutex_released) {
2964         // release_lock_last
2965         ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
2966             ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)user_lock,
2967             codeptr);
2968       }
2969     } else if (ompt_enabled.ompt_callback_nest_lock) {
2970       // release_lock_prev
2971       ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
2972           ompt_scope_end, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2973     }
2974   }
2975 #endif
2976 
2977 #else // KMP_USE_DYNAMIC_LOCK
2978 
2979   kmp_user_lock_p lck;
2980 
2981   /* Can't use serial interval since not block structured */
2982 
2983   if ((__kmp_user_lock_kind == lk_tas) &&
2984       (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
2985        OMP_NEST_LOCK_T_SIZE)) {
2986 #if KMP_OS_LINUX &&                                                            \
2987     (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
2988     // "fast" path implemented to fix customer performance issue
2989     kmp_tas_lock_t *tl = (kmp_tas_lock_t *)user_lock;
2990 #if USE_ITT_BUILD
2991     __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock);
2992 #endif /* USE_ITT_BUILD */
2993 
2994 #if OMPT_SUPPORT && OMPT_OPTIONAL
2995     int release_status = KMP_LOCK_STILL_HELD;
2996 #endif
2997 
2998     if (--(tl->lk.depth_locked) == 0) {
2999       TCW_4(tl->lk.poll, 0);
3000 #if OMPT_SUPPORT && OMPT_OPTIONAL
3001       release_status = KMP_LOCK_RELEASED;
3002 #endif
3003     }
3004     KMP_MB();
3005 
3006 #if OMPT_SUPPORT && OMPT_OPTIONAL
3007     // This is the case, if called from omp_init_lock_with_hint:
3008     void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
3009     if (!codeptr)
3010       codeptr = OMPT_GET_RETURN_ADDRESS(0);
3011     if (ompt_enabled.enabled) {
3012       if (release_status == KMP_LOCK_RELEASED) {
3013         if (ompt_enabled.ompt_callback_mutex_released) {
3014           // release_lock_last
3015           ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
3016               ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
3017         }
3018       } else if (ompt_enabled.ompt_callback_nest_lock) {
3019         // release_lock_previous
3020         ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
3021             ompt_mutex_scope_end, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
3022       }
3023     }
3024 #endif
3025 
3026     return;
3027 #else
3028     lck = (kmp_user_lock_p)user_lock;
3029 #endif
3030   }
3031 #if KMP_USE_FUTEX
3032   else if ((__kmp_user_lock_kind == lk_futex) &&
3033            (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
3034             OMP_NEST_LOCK_T_SIZE)) {
3035     lck = (kmp_user_lock_p)user_lock;
3036   }
3037 #endif
3038   else {
3039     lck = __kmp_lookup_user_lock(user_lock, "omp_unset_nest_lock");
3040   }
3041 
3042 #if USE_ITT_BUILD
3043   __kmp_itt_lock_releasing(lck);
3044 #endif /* USE_ITT_BUILD */
3045 
3046   int release_status;
3047   release_status = RELEASE_NESTED_LOCK(lck, gtid);
3048 #if OMPT_SUPPORT && OMPT_OPTIONAL
3049   // This is the case, if called from omp_init_lock_with_hint:
3050   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
3051   if (!codeptr)
3052     codeptr = OMPT_GET_RETURN_ADDRESS(0);
3053   if (ompt_enabled.enabled) {
3054     if (release_status == KMP_LOCK_RELEASED) {
3055       if (ompt_enabled.ompt_callback_mutex_released) {
3056         // release_lock_last
3057         ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
3058             ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
3059       }
3060     } else if (ompt_enabled.ompt_callback_nest_lock) {
3061       // release_lock_previous
3062       ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
3063           ompt_mutex_scope_end, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
3064     }
3065   }
3066 #endif
3067 
3068 #endif // KMP_USE_DYNAMIC_LOCK
3069 }
3070 
3071 /* try to acquire the lock */
3072 int __kmpc_test_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
3073   KMP_COUNT_BLOCK(OMP_test_lock);
3074 
3075 #if KMP_USE_DYNAMIC_LOCK
3076   int rc;
3077   int tag = KMP_EXTRACT_D_TAG(user_lock);
3078 #if USE_ITT_BUILD
3079   __kmp_itt_lock_acquiring((kmp_user_lock_p)user_lock);
3080 #endif
3081 #if OMPT_SUPPORT && OMPT_OPTIONAL
3082   // This is the case, if called from omp_init_lock_with_hint:
3083   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
3084   if (!codeptr)
3085     codeptr = OMPT_GET_RETURN_ADDRESS(0);
3086   if (ompt_enabled.ompt_callback_mutex_acquire) {
3087     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
3088         ompt_mutex_lock, omp_lock_hint_none,
3089         __ompt_get_mutex_impl_type(user_lock),
3090         (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
3091   }
3092 #endif
3093 #if KMP_USE_INLINED_TAS
3094   if (tag == locktag_tas && !__kmp_env_consistency_check) {
3095     KMP_TEST_TAS_LOCK(user_lock, gtid, rc);
3096   } else
3097 #elif KMP_USE_INLINED_FUTEX
3098   if (tag == locktag_futex && !__kmp_env_consistency_check) {
3099     KMP_TEST_FUTEX_LOCK(user_lock, gtid, rc);
3100   } else
3101 #endif
3102   {
3103     rc = __kmp_direct_test[tag]((kmp_dyna_lock_t *)user_lock, gtid);
3104   }
3105   if (rc) {
3106 #if USE_ITT_BUILD
3107     __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock);
3108 #endif
3109 #if OMPT_SUPPORT && OMPT_OPTIONAL
3110     if (ompt_enabled.ompt_callback_mutex_acquired) {
3111       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
3112           ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
3113     }
3114 #endif
3115     return FTN_TRUE;
3116   } else {
3117 #if USE_ITT_BUILD
3118     __kmp_itt_lock_cancelled((kmp_user_lock_p)user_lock);
3119 #endif
3120     return FTN_FALSE;
3121   }
3122 
3123 #else // KMP_USE_DYNAMIC_LOCK
3124 
3125   kmp_user_lock_p lck;
3126   int rc;
3127 
3128   if ((__kmp_user_lock_kind == lk_tas) &&
3129       (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
3130     lck = (kmp_user_lock_p)user_lock;
3131   }
3132 #if KMP_USE_FUTEX
3133   else if ((__kmp_user_lock_kind == lk_futex) &&
3134            (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
3135     lck = (kmp_user_lock_p)user_lock;
3136   }
3137 #endif
3138   else {
3139     lck = __kmp_lookup_user_lock(user_lock, "omp_test_lock");
3140   }
3141 
3142 #if USE_ITT_BUILD
3143   __kmp_itt_lock_acquiring(lck);
3144 #endif /* USE_ITT_BUILD */
3145 #if OMPT_SUPPORT && OMPT_OPTIONAL
3146   // This is the case, if called from omp_init_lock_with_hint:
3147   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
3148   if (!codeptr)
3149     codeptr = OMPT_GET_RETURN_ADDRESS(0);
3150   if (ompt_enabled.ompt_callback_mutex_acquire) {
3151     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
3152         ompt_mutex_lock, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
3153         (ompt_wait_id_t)(uintptr_t)lck, codeptr);
3154   }
3155 #endif
3156 
3157   rc = TEST_LOCK(lck, gtid);
3158 #if USE_ITT_BUILD
3159   if (rc) {
3160     __kmp_itt_lock_acquired(lck);
3161   } else {
3162     __kmp_itt_lock_cancelled(lck);
3163   }
3164 #endif /* USE_ITT_BUILD */
3165 #if OMPT_SUPPORT && OMPT_OPTIONAL
3166   if (rc && ompt_enabled.ompt_callback_mutex_acquired) {
3167     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
3168         ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
3169   }
3170 #endif
3171 
3172   return (rc ? FTN_TRUE : FTN_FALSE);
3173 
3174   /* Can't use serial interval since not block structured */
3175 
3176 #endif // KMP_USE_DYNAMIC_LOCK
3177 }
3178 
3179 /* try to acquire the lock */
3180 int __kmpc_test_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
3181 #if KMP_USE_DYNAMIC_LOCK
3182   int rc;
3183 #if USE_ITT_BUILD
3184   __kmp_itt_lock_acquiring((kmp_user_lock_p)user_lock);
3185 #endif
3186 #if OMPT_SUPPORT && OMPT_OPTIONAL
3187   // This is the case, if called from omp_init_lock_with_hint:
3188   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
3189   if (!codeptr)
3190     codeptr = OMPT_GET_RETURN_ADDRESS(0);
3191   if (ompt_enabled.ompt_callback_mutex_acquire) {
3192     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
3193         ompt_mutex_nest_lock, omp_lock_hint_none,
3194         __ompt_get_mutex_impl_type(user_lock),
3195         (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
3196   }
3197 #endif
3198   rc = KMP_D_LOCK_FUNC(user_lock, test)((kmp_dyna_lock_t *)user_lock, gtid);
3199 #if USE_ITT_BUILD
3200   if (rc) {
3201     __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock);
3202   } else {
3203     __kmp_itt_lock_cancelled((kmp_user_lock_p)user_lock);
3204   }
3205 #endif
3206 #if OMPT_SUPPORT && OMPT_OPTIONAL
3207   if (ompt_enabled.enabled && rc) {
3208     if (rc == 1) {
3209       if (ompt_enabled.ompt_callback_mutex_acquired) {
3210         // lock_first
3211         ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
3212             ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)user_lock,
3213             codeptr);
3214       }
3215     } else {
3216       if (ompt_enabled.ompt_callback_nest_lock) {
3217         // lock_next
3218         ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
3219             ompt_scope_begin, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
3220       }
3221     }
3222   }
3223 #endif
3224   return rc;
3225 
3226 #else // KMP_USE_DYNAMIC_LOCK
3227 
3228   kmp_user_lock_p lck;
3229   int rc;
3230 
3231   if ((__kmp_user_lock_kind == lk_tas) &&
3232       (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
3233        OMP_NEST_LOCK_T_SIZE)) {
3234     lck = (kmp_user_lock_p)user_lock;
3235   }
3236 #if KMP_USE_FUTEX
3237   else if ((__kmp_user_lock_kind == lk_futex) &&
3238            (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
3239             OMP_NEST_LOCK_T_SIZE)) {
3240     lck = (kmp_user_lock_p)user_lock;
3241   }
3242 #endif
3243   else {
3244     lck = __kmp_lookup_user_lock(user_lock, "omp_test_nest_lock");
3245   }
3246 
3247 #if USE_ITT_BUILD
3248   __kmp_itt_lock_acquiring(lck);
3249 #endif /* USE_ITT_BUILD */
3250 
3251 #if OMPT_SUPPORT && OMPT_OPTIONAL
3252   // This is the case, if called from omp_init_lock_with_hint:
3253   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
3254   if (!codeptr)
3255     codeptr = OMPT_GET_RETURN_ADDRESS(0);
3256   if (ompt_enabled.enabled) &&
3257         ompt_enabled.ompt_callback_mutex_acquire) {
3258       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
3259           ompt_mutex_nest_lock, omp_lock_hint_none,
3260           __ompt_get_mutex_impl_type(), (ompt_wait_id_t)(uintptr_t)lck,
3261           codeptr);
3262     }
3263 #endif
3264 
3265   rc = TEST_NESTED_LOCK(lck, gtid);
3266 #if USE_ITT_BUILD
3267   if (rc) {
3268     __kmp_itt_lock_acquired(lck);
3269   } else {
3270     __kmp_itt_lock_cancelled(lck);
3271   }
3272 #endif /* USE_ITT_BUILD */
3273 #if OMPT_SUPPORT && OMPT_OPTIONAL
3274   if (ompt_enabled.enabled && rc) {
3275     if (rc == 1) {
3276       if (ompt_enabled.ompt_callback_mutex_acquired) {
3277         // lock_first
3278         ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
3279             ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
3280       }
3281     } else {
3282       if (ompt_enabled.ompt_callback_nest_lock) {
3283         // lock_next
3284         ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
3285             ompt_mutex_scope_begin, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
3286       }
3287     }
3288   }
3289 #endif
3290   return rc;
3291 
3292   /* Can't use serial interval since not block structured */
3293 
3294 #endif // KMP_USE_DYNAMIC_LOCK
3295 }
3296 
3297 // Interface to fast scalable reduce methods routines
3298 
3299 // keep the selected method in a thread local structure for cross-function
3300 // usage: will be used in __kmpc_end_reduce* functions;
3301 // another solution: to re-determine the method one more time in
3302 // __kmpc_end_reduce* functions (new prototype required then)
3303 // AT: which solution is better?
3304 #define __KMP_SET_REDUCTION_METHOD(gtid, rmethod)                              \
3305   ((__kmp_threads[(gtid)]->th.th_local.packed_reduction_method) = (rmethod))
3306 
3307 #define __KMP_GET_REDUCTION_METHOD(gtid)                                       \
3308   (__kmp_threads[(gtid)]->th.th_local.packed_reduction_method)
3309 
3310 // description of the packed_reduction_method variable: look at the macros in
3311 // kmp.h
3312 
3313 // used in a critical section reduce block
3314 static __forceinline void
3315 __kmp_enter_critical_section_reduce_block(ident_t *loc, kmp_int32 global_tid,
3316                                           kmp_critical_name *crit) {
3317 
3318   // this lock was visible to a customer and to the threading profile tool as a
3319   // serial overhead span (although it's used for an internal purpose only)
3320   //            why was it visible in previous implementation?
3321   //            should we keep it visible in new reduce block?
3322   kmp_user_lock_p lck;
3323 
3324 #if KMP_USE_DYNAMIC_LOCK
3325 
3326   kmp_dyna_lock_t *lk = (kmp_dyna_lock_t *)crit;
3327   // Check if it is initialized.
3328   if (*lk == 0) {
3329     if (KMP_IS_D_LOCK(__kmp_user_lock_seq)) {
3330       KMP_COMPARE_AND_STORE_ACQ32((volatile kmp_int32 *)crit, 0,
3331                                   KMP_GET_D_TAG(__kmp_user_lock_seq));
3332     } else {
3333       __kmp_init_indirect_csptr(crit, loc, global_tid,
3334                                 KMP_GET_I_TAG(__kmp_user_lock_seq));
3335     }
3336   }
3337   // Branch for accessing the actual lock object and set operation. This
3338   // branching is inevitable since this lock initialization does not follow the
3339   // normal dispatch path (lock table is not used).
3340   if (KMP_EXTRACT_D_TAG(lk) != 0) {
3341     lck = (kmp_user_lock_p)lk;
3342     KMP_DEBUG_ASSERT(lck != NULL);
3343     if (__kmp_env_consistency_check) {
3344       __kmp_push_sync(global_tid, ct_critical, loc, lck, __kmp_user_lock_seq);
3345     }
3346     KMP_D_LOCK_FUNC(lk, set)(lk, global_tid);
3347   } else {
3348     kmp_indirect_lock_t *ilk = *((kmp_indirect_lock_t **)lk);
3349     lck = ilk->lock;
3350     KMP_DEBUG_ASSERT(lck != NULL);
3351     if (__kmp_env_consistency_check) {
3352       __kmp_push_sync(global_tid, ct_critical, loc, lck, __kmp_user_lock_seq);
3353     }
3354     KMP_I_LOCK_FUNC(ilk, set)(lck, global_tid);
3355   }
3356 
3357 #else // KMP_USE_DYNAMIC_LOCK
3358 
3359   // We know that the fast reduction code is only emitted by Intel compilers
3360   // with 32 byte critical sections. If there isn't enough space, then we
3361   // have to use a pointer.
3362   if (__kmp_base_user_lock_size <= INTEL_CRITICAL_SIZE) {
3363     lck = (kmp_user_lock_p)crit;
3364   } else {
3365     lck = __kmp_get_critical_section_ptr(crit, loc, global_tid);
3366   }
3367   KMP_DEBUG_ASSERT(lck != NULL);
3368 
3369   if (__kmp_env_consistency_check)
3370     __kmp_push_sync(global_tid, ct_critical, loc, lck);
3371 
3372   __kmp_acquire_user_lock_with_checks(lck, global_tid);
3373 
3374 #endif // KMP_USE_DYNAMIC_LOCK
3375 }
3376 
3377 // used in a critical section reduce block
3378 static __forceinline void
3379 __kmp_end_critical_section_reduce_block(ident_t *loc, kmp_int32 global_tid,
3380                                         kmp_critical_name *crit) {
3381 
3382   kmp_user_lock_p lck;
3383 
3384 #if KMP_USE_DYNAMIC_LOCK
3385 
3386   if (KMP_IS_D_LOCK(__kmp_user_lock_seq)) {
3387     lck = (kmp_user_lock_p)crit;
3388     if (__kmp_env_consistency_check)
3389       __kmp_pop_sync(global_tid, ct_critical, loc);
3390     KMP_D_LOCK_FUNC(lck, unset)((kmp_dyna_lock_t *)lck, global_tid);
3391   } else {
3392     kmp_indirect_lock_t *ilk =
3393         (kmp_indirect_lock_t *)TCR_PTR(*((kmp_indirect_lock_t **)crit));
3394     if (__kmp_env_consistency_check)
3395       __kmp_pop_sync(global_tid, ct_critical, loc);
3396     KMP_I_LOCK_FUNC(ilk, unset)(ilk->lock, global_tid);
3397   }
3398 
3399 #else // KMP_USE_DYNAMIC_LOCK
3400 
3401   // We know that the fast reduction code is only emitted by Intel compilers
3402   // with 32 byte critical sections. If there isn't enough space, then we have
3403   // to use a pointer.
3404   if (__kmp_base_user_lock_size > 32) {
3405     lck = *((kmp_user_lock_p *)crit);
3406     KMP_ASSERT(lck != NULL);
3407   } else {
3408     lck = (kmp_user_lock_p)crit;
3409   }
3410 
3411   if (__kmp_env_consistency_check)
3412     __kmp_pop_sync(global_tid, ct_critical, loc);
3413 
3414   __kmp_release_user_lock_with_checks(lck, global_tid);
3415 
3416 #endif // KMP_USE_DYNAMIC_LOCK
3417 } // __kmp_end_critical_section_reduce_block
3418 
3419 static __forceinline int
3420 __kmp_swap_teams_for_teams_reduction(kmp_info_t *th, kmp_team_t **team_p,
3421                                      int *task_state) {
3422   kmp_team_t *team;
3423 
3424   // Check if we are inside the teams construct?
3425   if (th->th.th_teams_microtask) {
3426     *team_p = team = th->th.th_team;
3427     if (team->t.t_level == th->th.th_teams_level) {
3428       // This is reduction at teams construct.
3429       KMP_DEBUG_ASSERT(!th->th.th_info.ds.ds_tid); // AC: check that tid == 0
3430       // Let's swap teams temporarily for the reduction.
3431       th->th.th_info.ds.ds_tid = team->t.t_master_tid;
3432       th->th.th_team = team->t.t_parent;
3433       th->th.th_team_nproc = th->th.th_team->t.t_nproc;
3434       th->th.th_task_team = th->th.th_team->t.t_task_team[0];
3435       *task_state = th->th.th_task_state;
3436       th->th.th_task_state = 0;
3437 
3438       return 1;
3439     }
3440   }
3441   return 0;
3442 }
3443 
3444 static __forceinline void
3445 __kmp_restore_swapped_teams(kmp_info_t *th, kmp_team_t *team, int task_state) {
3446   // Restore thread structure swapped in __kmp_swap_teams_for_teams_reduction.
3447   th->th.th_info.ds.ds_tid = 0;
3448   th->th.th_team = team;
3449   th->th.th_team_nproc = team->t.t_nproc;
3450   th->th.th_task_team = team->t.t_task_team[task_state];
3451   __kmp_type_convert(task_state, &(th->th.th_task_state));
3452 }
3453 
3454 /* 2.a.i. Reduce Block without a terminating barrier */
3455 /*!
3456 @ingroup SYNCHRONIZATION
3457 @param loc source location information
3458 @param global_tid global thread number
3459 @param num_vars number of items (variables) to be reduced
3460 @param reduce_size size of data in bytes to be reduced
3461 @param reduce_data pointer to data to be reduced
3462 @param reduce_func callback function providing reduction operation on two
3463 operands and returning result of reduction in lhs_data
3464 @param lck pointer to the unique lock data structure
3465 @result 1 for the primary thread, 0 for all other team threads, 2 for all team
3466 threads if atomic reduction needed
3467 
3468 The nowait version is used for a reduce clause with the nowait argument.
3469 */
3470 kmp_int32
3471 __kmpc_reduce_nowait(ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars,
3472                      size_t reduce_size, void *reduce_data,
3473                      void (*reduce_func)(void *lhs_data, void *rhs_data),
3474                      kmp_critical_name *lck) {
3475 
3476   KMP_COUNT_BLOCK(REDUCE_nowait);
3477   int retval = 0;
3478   PACKED_REDUCTION_METHOD_T packed_reduction_method;
3479   kmp_info_t *th;
3480   kmp_team_t *team;
3481   int teams_swapped = 0, task_state;
3482   KA_TRACE(10, ("__kmpc_reduce_nowait() enter: called T#%d\n", global_tid));
3483   __kmp_assert_valid_gtid(global_tid);
3484 
3485   // why do we need this initialization here at all?
3486   // Reduction clause can not be used as a stand-alone directive.
3487 
3488   // do not call __kmp_serial_initialize(), it will be called by
3489   // __kmp_parallel_initialize() if needed
3490   // possible detection of false-positive race by the threadchecker ???
3491   if (!TCR_4(__kmp_init_parallel))
3492     __kmp_parallel_initialize();
3493 
3494   __kmp_resume_if_soft_paused();
3495 
3496 // check correctness of reduce block nesting
3497 #if KMP_USE_DYNAMIC_LOCK
3498   if (__kmp_env_consistency_check)
3499     __kmp_push_sync(global_tid, ct_reduce, loc, NULL, 0);
3500 #else
3501   if (__kmp_env_consistency_check)
3502     __kmp_push_sync(global_tid, ct_reduce, loc, NULL);
3503 #endif
3504 
3505   th = __kmp_thread_from_gtid(global_tid);
3506   teams_swapped = __kmp_swap_teams_for_teams_reduction(th, &team, &task_state);
3507 
3508   // packed_reduction_method value will be reused by __kmp_end_reduce* function,
3509   // the value should be kept in a variable
3510   // the variable should be either a construct-specific or thread-specific
3511   // property, not a team specific property
3512   //     (a thread can reach the next reduce block on the next construct, reduce
3513   //     method may differ on the next construct)
3514   // an ident_t "loc" parameter could be used as a construct-specific property
3515   // (what if loc == 0?)
3516   //     (if both construct-specific and team-specific variables were shared,
3517   //     then unness extra syncs should be needed)
3518   // a thread-specific variable is better regarding two issues above (next
3519   // construct and extra syncs)
3520   // a thread-specific "th_local.reduction_method" variable is used currently
3521   // each thread executes 'determine' and 'set' lines (no need to execute by one
3522   // thread, to avoid unness extra syncs)
3523 
3524   packed_reduction_method = __kmp_determine_reduction_method(
3525       loc, global_tid, num_vars, reduce_size, reduce_data, reduce_func, lck);
3526   __KMP_SET_REDUCTION_METHOD(global_tid, packed_reduction_method);
3527 
3528   OMPT_REDUCTION_DECL(th, global_tid);
3529   if (packed_reduction_method == critical_reduce_block) {
3530 
3531     OMPT_REDUCTION_BEGIN;
3532 
3533     __kmp_enter_critical_section_reduce_block(loc, global_tid, lck);
3534     retval = 1;
3535 
3536   } else if (packed_reduction_method == empty_reduce_block) {
3537 
3538     OMPT_REDUCTION_BEGIN;
3539 
3540     // usage: if team size == 1, no synchronization is required ( Intel
3541     // platforms only )
3542     retval = 1;
3543 
3544   } else if (packed_reduction_method == atomic_reduce_block) {
3545 
3546     retval = 2;
3547 
3548     // all threads should do this pop here (because __kmpc_end_reduce_nowait()
3549     // won't be called by the code gen)
3550     //     (it's not quite good, because the checking block has been closed by
3551     //     this 'pop',
3552     //      but atomic operation has not been executed yet, will be executed
3553     //      slightly later, literally on next instruction)
3554     if (__kmp_env_consistency_check)
3555       __kmp_pop_sync(global_tid, ct_reduce, loc);
3556 
3557   } else if (TEST_REDUCTION_METHOD(packed_reduction_method,
3558                                    tree_reduce_block)) {
3559 
3560 // AT: performance issue: a real barrier here
3561 // AT: (if primary thread is slow, other threads are blocked here waiting for
3562 //      the primary thread to come and release them)
3563 // AT: (it's not what a customer might expect specifying NOWAIT clause)
3564 // AT: (specifying NOWAIT won't result in improvement of performance, it'll
3565 //      be confusing to a customer)
3566 // AT: another implementation of *barrier_gather*nowait() (or some other design)
3567 // might go faster and be more in line with sense of NOWAIT
3568 // AT: TO DO: do epcc test and compare times
3569 
3570 // this barrier should be invisible to a customer and to the threading profile
3571 // tool (it's neither a terminating barrier nor customer's code, it's
3572 // used for an internal purpose)
3573 #if OMPT_SUPPORT
3574     // JP: can this barrier potentially leed to task scheduling?
3575     // JP: as long as there is a barrier in the implementation, OMPT should and
3576     // will provide the barrier events
3577     //         so we set-up the necessary frame/return addresses.
3578     ompt_frame_t *ompt_frame;
3579     if (ompt_enabled.enabled) {
3580       __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
3581       if (ompt_frame->enter_frame.ptr == NULL)
3582         ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
3583     }
3584     OMPT_STORE_RETURN_ADDRESS(global_tid);
3585 #endif
3586 #if USE_ITT_NOTIFY
3587     __kmp_threads[global_tid]->th.th_ident = loc;
3588 #endif
3589     retval =
3590         __kmp_barrier(UNPACK_REDUCTION_BARRIER(packed_reduction_method),
3591                       global_tid, FALSE, reduce_size, reduce_data, reduce_func);
3592     retval = (retval != 0) ? (0) : (1);
3593 #if OMPT_SUPPORT && OMPT_OPTIONAL
3594     if (ompt_enabled.enabled) {
3595       ompt_frame->enter_frame = ompt_data_none;
3596     }
3597 #endif
3598 
3599     // all other workers except primary thread should do this pop here
3600     //     ( none of other workers will get to __kmpc_end_reduce_nowait() )
3601     if (__kmp_env_consistency_check) {
3602       if (retval == 0) {
3603         __kmp_pop_sync(global_tid, ct_reduce, loc);
3604       }
3605     }
3606 
3607   } else {
3608 
3609     // should never reach this block
3610     KMP_ASSERT(0); // "unexpected method"
3611   }
3612   if (teams_swapped) {
3613     __kmp_restore_swapped_teams(th, team, task_state);
3614   }
3615   KA_TRACE(
3616       10,
3617       ("__kmpc_reduce_nowait() exit: called T#%d: method %08x, returns %08x\n",
3618        global_tid, packed_reduction_method, retval));
3619 
3620   return retval;
3621 }
3622 
3623 /*!
3624 @ingroup SYNCHRONIZATION
3625 @param loc source location information
3626 @param global_tid global thread id.
3627 @param lck pointer to the unique lock data structure
3628 
3629 Finish the execution of a reduce nowait.
3630 */
3631 void __kmpc_end_reduce_nowait(ident_t *loc, kmp_int32 global_tid,
3632                               kmp_critical_name *lck) {
3633 
3634   PACKED_REDUCTION_METHOD_T packed_reduction_method;
3635 
3636   KA_TRACE(10, ("__kmpc_end_reduce_nowait() enter: called T#%d\n", global_tid));
3637   __kmp_assert_valid_gtid(global_tid);
3638 
3639   packed_reduction_method = __KMP_GET_REDUCTION_METHOD(global_tid);
3640 
3641   OMPT_REDUCTION_DECL(__kmp_thread_from_gtid(global_tid), global_tid);
3642 
3643   if (packed_reduction_method == critical_reduce_block) {
3644 
3645     __kmp_end_critical_section_reduce_block(loc, global_tid, lck);
3646     OMPT_REDUCTION_END;
3647 
3648   } else if (packed_reduction_method == empty_reduce_block) {
3649 
3650     // usage: if team size == 1, no synchronization is required ( on Intel
3651     // platforms only )
3652 
3653     OMPT_REDUCTION_END;
3654 
3655   } else if (packed_reduction_method == atomic_reduce_block) {
3656 
3657     // neither primary thread nor other workers should get here
3658     //     (code gen does not generate this call in case 2: atomic reduce block)
3659     // actually it's better to remove this elseif at all;
3660     // after removal this value will checked by the 'else' and will assert
3661 
3662   } else if (TEST_REDUCTION_METHOD(packed_reduction_method,
3663                                    tree_reduce_block)) {
3664 
3665     // only primary thread gets here
3666     // OMPT: tree reduction is annotated in the barrier code
3667 
3668   } else {
3669 
3670     // should never reach this block
3671     KMP_ASSERT(0); // "unexpected method"
3672   }
3673 
3674   if (__kmp_env_consistency_check)
3675     __kmp_pop_sync(global_tid, ct_reduce, loc);
3676 
3677   KA_TRACE(10, ("__kmpc_end_reduce_nowait() exit: called T#%d: method %08x\n",
3678                 global_tid, packed_reduction_method));
3679 
3680   return;
3681 }
3682 
3683 /* 2.a.ii. Reduce Block with a terminating barrier */
3684 
3685 /*!
3686 @ingroup SYNCHRONIZATION
3687 @param loc source location information
3688 @param global_tid global thread number
3689 @param num_vars number of items (variables) to be reduced
3690 @param reduce_size size of data in bytes to be reduced
3691 @param reduce_data pointer to data to be reduced
3692 @param reduce_func callback function providing reduction operation on two
3693 operands and returning result of reduction in lhs_data
3694 @param lck pointer to the unique lock data structure
3695 @result 1 for the primary thread, 0 for all other team threads, 2 for all team
3696 threads if atomic reduction needed
3697 
3698 A blocking reduce that includes an implicit barrier.
3699 */
3700 kmp_int32 __kmpc_reduce(ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars,
3701                         size_t reduce_size, void *reduce_data,
3702                         void (*reduce_func)(void *lhs_data, void *rhs_data),
3703                         kmp_critical_name *lck) {
3704   KMP_COUNT_BLOCK(REDUCE_wait);
3705   int retval = 0;
3706   PACKED_REDUCTION_METHOD_T packed_reduction_method;
3707   kmp_info_t *th;
3708   kmp_team_t *team;
3709   int teams_swapped = 0, task_state;
3710 
3711   KA_TRACE(10, ("__kmpc_reduce() enter: called T#%d\n", global_tid));
3712   __kmp_assert_valid_gtid(global_tid);
3713 
3714   // why do we need this initialization here at all?
3715   // Reduction clause can not be a stand-alone directive.
3716 
3717   // do not call __kmp_serial_initialize(), it will be called by
3718   // __kmp_parallel_initialize() if needed
3719   // possible detection of false-positive race by the threadchecker ???
3720   if (!TCR_4(__kmp_init_parallel))
3721     __kmp_parallel_initialize();
3722 
3723   __kmp_resume_if_soft_paused();
3724 
3725 // check correctness of reduce block nesting
3726 #if KMP_USE_DYNAMIC_LOCK
3727   if (__kmp_env_consistency_check)
3728     __kmp_push_sync(global_tid, ct_reduce, loc, NULL, 0);
3729 #else
3730   if (__kmp_env_consistency_check)
3731     __kmp_push_sync(global_tid, ct_reduce, loc, NULL);
3732 #endif
3733 
3734   th = __kmp_thread_from_gtid(global_tid);
3735   teams_swapped = __kmp_swap_teams_for_teams_reduction(th, &team, &task_state);
3736 
3737   packed_reduction_method = __kmp_determine_reduction_method(
3738       loc, global_tid, num_vars, reduce_size, reduce_data, reduce_func, lck);
3739   __KMP_SET_REDUCTION_METHOD(global_tid, packed_reduction_method);
3740 
3741   OMPT_REDUCTION_DECL(th, global_tid);
3742 
3743   if (packed_reduction_method == critical_reduce_block) {
3744 
3745     OMPT_REDUCTION_BEGIN;
3746     __kmp_enter_critical_section_reduce_block(loc, global_tid, lck);
3747     retval = 1;
3748 
3749   } else if (packed_reduction_method == empty_reduce_block) {
3750 
3751     OMPT_REDUCTION_BEGIN;
3752     // usage: if team size == 1, no synchronization is required ( Intel
3753     // platforms only )
3754     retval = 1;
3755 
3756   } else if (packed_reduction_method == atomic_reduce_block) {
3757 
3758     retval = 2;
3759 
3760   } else if (TEST_REDUCTION_METHOD(packed_reduction_method,
3761                                    tree_reduce_block)) {
3762 
3763 // case tree_reduce_block:
3764 // this barrier should be visible to a customer and to the threading profile
3765 // tool (it's a terminating barrier on constructs if NOWAIT not specified)
3766 #if OMPT_SUPPORT
3767     ompt_frame_t *ompt_frame;
3768     if (ompt_enabled.enabled) {
3769       __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
3770       if (ompt_frame->enter_frame.ptr == NULL)
3771         ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
3772     }
3773     OMPT_STORE_RETURN_ADDRESS(global_tid);
3774 #endif
3775 #if USE_ITT_NOTIFY
3776     __kmp_threads[global_tid]->th.th_ident =
3777         loc; // needed for correct notification of frames
3778 #endif
3779     retval =
3780         __kmp_barrier(UNPACK_REDUCTION_BARRIER(packed_reduction_method),
3781                       global_tid, TRUE, reduce_size, reduce_data, reduce_func);
3782     retval = (retval != 0) ? (0) : (1);
3783 #if OMPT_SUPPORT && OMPT_OPTIONAL
3784     if (ompt_enabled.enabled) {
3785       ompt_frame->enter_frame = ompt_data_none;
3786     }
3787 #endif
3788 
3789     // all other workers except primary thread should do this pop here
3790     // (none of other workers except primary will enter __kmpc_end_reduce())
3791     if (__kmp_env_consistency_check) {
3792       if (retval == 0) { // 0: all other workers; 1: primary thread
3793         __kmp_pop_sync(global_tid, ct_reduce, loc);
3794       }
3795     }
3796 
3797   } else {
3798 
3799     // should never reach this block
3800     KMP_ASSERT(0); // "unexpected method"
3801   }
3802   if (teams_swapped) {
3803     __kmp_restore_swapped_teams(th, team, task_state);
3804   }
3805 
3806   KA_TRACE(10,
3807            ("__kmpc_reduce() exit: called T#%d: method %08x, returns %08x\n",
3808             global_tid, packed_reduction_method, retval));
3809   return retval;
3810 }
3811 
3812 /*!
3813 @ingroup SYNCHRONIZATION
3814 @param loc source location information
3815 @param global_tid global thread id.
3816 @param lck pointer to the unique lock data structure
3817 
3818 Finish the execution of a blocking reduce.
3819 The <tt>lck</tt> pointer must be the same as that used in the corresponding
3820 start function.
3821 */
3822 void __kmpc_end_reduce(ident_t *loc, kmp_int32 global_tid,
3823                        kmp_critical_name *lck) {
3824 
3825   PACKED_REDUCTION_METHOD_T packed_reduction_method;
3826   kmp_info_t *th;
3827   kmp_team_t *team;
3828   int teams_swapped = 0, task_state;
3829 
3830   KA_TRACE(10, ("__kmpc_end_reduce() enter: called T#%d\n", global_tid));
3831   __kmp_assert_valid_gtid(global_tid);
3832 
3833   th = __kmp_thread_from_gtid(global_tid);
3834   teams_swapped = __kmp_swap_teams_for_teams_reduction(th, &team, &task_state);
3835 
3836   packed_reduction_method = __KMP_GET_REDUCTION_METHOD(global_tid);
3837 
3838   // this barrier should be visible to a customer and to the threading profile
3839   // tool (it's a terminating barrier on constructs if NOWAIT not specified)
3840   OMPT_REDUCTION_DECL(th, global_tid);
3841 
3842   if (packed_reduction_method == critical_reduce_block) {
3843     __kmp_end_critical_section_reduce_block(loc, global_tid, lck);
3844 
3845     OMPT_REDUCTION_END;
3846 
3847 // TODO: implicit barrier: should be exposed
3848 #if OMPT_SUPPORT
3849     ompt_frame_t *ompt_frame;
3850     if (ompt_enabled.enabled) {
3851       __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
3852       if (ompt_frame->enter_frame.ptr == NULL)
3853         ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
3854     }
3855     OMPT_STORE_RETURN_ADDRESS(global_tid);
3856 #endif
3857 #if USE_ITT_NOTIFY
3858     __kmp_threads[global_tid]->th.th_ident = loc;
3859 #endif
3860     __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
3861 #if OMPT_SUPPORT && OMPT_OPTIONAL
3862     if (ompt_enabled.enabled) {
3863       ompt_frame->enter_frame = ompt_data_none;
3864     }
3865 #endif
3866 
3867   } else if (packed_reduction_method == empty_reduce_block) {
3868 
3869     OMPT_REDUCTION_END;
3870 
3871 // usage: if team size==1, no synchronization is required (Intel platforms only)
3872 
3873 // TODO: implicit barrier: should be exposed
3874 #if OMPT_SUPPORT
3875     ompt_frame_t *ompt_frame;
3876     if (ompt_enabled.enabled) {
3877       __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
3878       if (ompt_frame->enter_frame.ptr == NULL)
3879         ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
3880     }
3881     OMPT_STORE_RETURN_ADDRESS(global_tid);
3882 #endif
3883 #if USE_ITT_NOTIFY
3884     __kmp_threads[global_tid]->th.th_ident = loc;
3885 #endif
3886     __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
3887 #if OMPT_SUPPORT && OMPT_OPTIONAL
3888     if (ompt_enabled.enabled) {
3889       ompt_frame->enter_frame = ompt_data_none;
3890     }
3891 #endif
3892 
3893   } else if (packed_reduction_method == atomic_reduce_block) {
3894 
3895 #if OMPT_SUPPORT
3896     ompt_frame_t *ompt_frame;
3897     if (ompt_enabled.enabled) {
3898       __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
3899       if (ompt_frame->enter_frame.ptr == NULL)
3900         ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
3901     }
3902     OMPT_STORE_RETURN_ADDRESS(global_tid);
3903 #endif
3904 // TODO: implicit barrier: should be exposed
3905 #if USE_ITT_NOTIFY
3906     __kmp_threads[global_tid]->th.th_ident = loc;
3907 #endif
3908     __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
3909 #if OMPT_SUPPORT && OMPT_OPTIONAL
3910     if (ompt_enabled.enabled) {
3911       ompt_frame->enter_frame = ompt_data_none;
3912     }
3913 #endif
3914 
3915   } else if (TEST_REDUCTION_METHOD(packed_reduction_method,
3916                                    tree_reduce_block)) {
3917 
3918     // only primary thread executes here (primary releases all other workers)
3919     __kmp_end_split_barrier(UNPACK_REDUCTION_BARRIER(packed_reduction_method),
3920                             global_tid);
3921 
3922   } else {
3923 
3924     // should never reach this block
3925     KMP_ASSERT(0); // "unexpected method"
3926   }
3927   if (teams_swapped) {
3928     __kmp_restore_swapped_teams(th, team, task_state);
3929   }
3930 
3931   if (__kmp_env_consistency_check)
3932     __kmp_pop_sync(global_tid, ct_reduce, loc);
3933 
3934   KA_TRACE(10, ("__kmpc_end_reduce() exit: called T#%d: method %08x\n",
3935                 global_tid, packed_reduction_method));
3936 
3937   return;
3938 }
3939 
3940 #undef __KMP_GET_REDUCTION_METHOD
3941 #undef __KMP_SET_REDUCTION_METHOD
3942 
3943 /* end of interface to fast scalable reduce routines */
3944 
3945 kmp_uint64 __kmpc_get_taskid() {
3946 
3947   kmp_int32 gtid;
3948   kmp_info_t *thread;
3949 
3950   gtid = __kmp_get_gtid();
3951   if (gtid < 0) {
3952     return 0;
3953   }
3954   thread = __kmp_thread_from_gtid(gtid);
3955   return thread->th.th_current_task->td_task_id;
3956 
3957 } // __kmpc_get_taskid
3958 
3959 kmp_uint64 __kmpc_get_parent_taskid() {
3960 
3961   kmp_int32 gtid;
3962   kmp_info_t *thread;
3963   kmp_taskdata_t *parent_task;
3964 
3965   gtid = __kmp_get_gtid();
3966   if (gtid < 0) {
3967     return 0;
3968   }
3969   thread = __kmp_thread_from_gtid(gtid);
3970   parent_task = thread->th.th_current_task->td_parent;
3971   return (parent_task == NULL ? 0 : parent_task->td_task_id);
3972 
3973 } // __kmpc_get_parent_taskid
3974 
3975 /*!
3976 @ingroup WORK_SHARING
3977 @param loc  source location information.
3978 @param gtid  global thread number.
3979 @param num_dims  number of associated doacross loops.
3980 @param dims  info on loops bounds.
3981 
3982 Initialize doacross loop information.
3983 Expect compiler send us inclusive bounds,
3984 e.g. for(i=2;i<9;i+=2) lo=2, up=8, st=2.
3985 */
3986 void __kmpc_doacross_init(ident_t *loc, int gtid, int num_dims,
3987                           const struct kmp_dim *dims) {
3988   __kmp_assert_valid_gtid(gtid);
3989   int j, idx;
3990   kmp_int64 last, trace_count;
3991   kmp_info_t *th = __kmp_threads[gtid];
3992   kmp_team_t *team = th->th.th_team;
3993   kmp_uint32 *flags;
3994   kmp_disp_t *pr_buf = th->th.th_dispatch;
3995   dispatch_shared_info_t *sh_buf;
3996 
3997   KA_TRACE(
3998       20,
3999       ("__kmpc_doacross_init() enter: called T#%d, num dims %d, active %d\n",
4000        gtid, num_dims, !team->t.t_serialized));
4001   KMP_DEBUG_ASSERT(dims != NULL);
4002   KMP_DEBUG_ASSERT(num_dims > 0);
4003 
4004   if (team->t.t_serialized) {
4005     KA_TRACE(20, ("__kmpc_doacross_init() exit: serialized team\n"));
4006     return; // no dependencies if team is serialized
4007   }
4008   KMP_DEBUG_ASSERT(team->t.t_nproc > 1);
4009   idx = pr_buf->th_doacross_buf_idx++; // Increment index of shared buffer for
4010   // the next loop
4011   sh_buf = &team->t.t_disp_buffer[idx % __kmp_dispatch_num_buffers];
4012 
4013   // Save bounds info into allocated private buffer
4014   KMP_DEBUG_ASSERT(pr_buf->th_doacross_info == NULL);
4015   pr_buf->th_doacross_info = (kmp_int64 *)__kmp_thread_malloc(
4016       th, sizeof(kmp_int64) * (4 * num_dims + 1));
4017   KMP_DEBUG_ASSERT(pr_buf->th_doacross_info != NULL);
4018   pr_buf->th_doacross_info[0] =
4019       (kmp_int64)num_dims; // first element is number of dimensions
4020   // Save also address of num_done in order to access it later without knowing
4021   // the buffer index
4022   pr_buf->th_doacross_info[1] = (kmp_int64)&sh_buf->doacross_num_done;
4023   pr_buf->th_doacross_info[2] = dims[0].lo;
4024   pr_buf->th_doacross_info[3] = dims[0].up;
4025   pr_buf->th_doacross_info[4] = dims[0].st;
4026   last = 5;
4027   for (j = 1; j < num_dims; ++j) {
4028     kmp_int64
4029         range_length; // To keep ranges of all dimensions but the first dims[0]
4030     if (dims[j].st == 1) { // most common case
4031       // AC: should we care of ranges bigger than LLONG_MAX? (not for now)
4032       range_length = dims[j].up - dims[j].lo + 1;
4033     } else {
4034       if (dims[j].st > 0) {
4035         KMP_DEBUG_ASSERT(dims[j].up > dims[j].lo);
4036         range_length = (kmp_uint64)(dims[j].up - dims[j].lo) / dims[j].st + 1;
4037       } else { // negative increment
4038         KMP_DEBUG_ASSERT(dims[j].lo > dims[j].up);
4039         range_length =
4040             (kmp_uint64)(dims[j].lo - dims[j].up) / (-dims[j].st) + 1;
4041       }
4042     }
4043     pr_buf->th_doacross_info[last++] = range_length;
4044     pr_buf->th_doacross_info[last++] = dims[j].lo;
4045     pr_buf->th_doacross_info[last++] = dims[j].up;
4046     pr_buf->th_doacross_info[last++] = dims[j].st;
4047   }
4048 
4049   // Compute total trip count.
4050   // Start with range of dims[0] which we don't need to keep in the buffer.
4051   if (dims[0].st == 1) { // most common case
4052     trace_count = dims[0].up - dims[0].lo + 1;
4053   } else if (dims[0].st > 0) {
4054     KMP_DEBUG_ASSERT(dims[0].up > dims[0].lo);
4055     trace_count = (kmp_uint64)(dims[0].up - dims[0].lo) / dims[0].st + 1;
4056   } else { // negative increment
4057     KMP_DEBUG_ASSERT(dims[0].lo > dims[0].up);
4058     trace_count = (kmp_uint64)(dims[0].lo - dims[0].up) / (-dims[0].st) + 1;
4059   }
4060   for (j = 1; j < num_dims; ++j) {
4061     trace_count *= pr_buf->th_doacross_info[4 * j + 1]; // use kept ranges
4062   }
4063   KMP_DEBUG_ASSERT(trace_count > 0);
4064 
4065   // Check if shared buffer is not occupied by other loop (idx -
4066   // __kmp_dispatch_num_buffers)
4067   if (idx != sh_buf->doacross_buf_idx) {
4068     // Shared buffer is occupied, wait for it to be free
4069     __kmp_wait_4((volatile kmp_uint32 *)&sh_buf->doacross_buf_idx, idx,
4070                  __kmp_eq_4, NULL);
4071   }
4072 #if KMP_32_BIT_ARCH
4073   // Check if we are the first thread. After the CAS the first thread gets 0,
4074   // others get 1 if initialization is in progress, allocated pointer otherwise.
4075   // Treat pointer as volatile integer (value 0 or 1) until memory is allocated.
4076   flags = (kmp_uint32 *)KMP_COMPARE_AND_STORE_RET32(
4077       (volatile kmp_int32 *)&sh_buf->doacross_flags, NULL, 1);
4078 #else
4079   flags = (kmp_uint32 *)KMP_COMPARE_AND_STORE_RET64(
4080       (volatile kmp_int64 *)&sh_buf->doacross_flags, NULL, 1LL);
4081 #endif
4082   if (flags == NULL) {
4083     // we are the first thread, allocate the array of flags
4084     size_t size =
4085         (size_t)trace_count / 8 + 8; // in bytes, use single bit per iteration
4086     flags = (kmp_uint32 *)__kmp_thread_calloc(th, size, 1);
4087     KMP_MB();
4088     sh_buf->doacross_flags = flags;
4089   } else if (flags == (kmp_uint32 *)1) {
4090 #if KMP_32_BIT_ARCH
4091     // initialization is still in progress, need to wait
4092     while (*(volatile kmp_int32 *)&sh_buf->doacross_flags == 1)
4093 #else
4094     while (*(volatile kmp_int64 *)&sh_buf->doacross_flags == 1LL)
4095 #endif
4096       KMP_YIELD(TRUE);
4097     KMP_MB();
4098   } else {
4099     KMP_MB();
4100   }
4101   KMP_DEBUG_ASSERT(sh_buf->doacross_flags > (kmp_uint32 *)1); // check ptr value
4102   pr_buf->th_doacross_flags =
4103       sh_buf->doacross_flags; // save private copy in order to not
4104   // touch shared buffer on each iteration
4105   KA_TRACE(20, ("__kmpc_doacross_init() exit: T#%d\n", gtid));
4106 }
4107 
4108 void __kmpc_doacross_wait(ident_t *loc, int gtid, const kmp_int64 *vec) {
4109   __kmp_assert_valid_gtid(gtid);
4110   kmp_int64 shft;
4111   size_t num_dims, i;
4112   kmp_uint32 flag;
4113   kmp_int64 iter_number; // iteration number of "collapsed" loop nest
4114   kmp_info_t *th = __kmp_threads[gtid];
4115   kmp_team_t *team = th->th.th_team;
4116   kmp_disp_t *pr_buf;
4117   kmp_int64 lo, up, st;
4118 
4119   KA_TRACE(20, ("__kmpc_doacross_wait() enter: called T#%d\n", gtid));
4120   if (team->t.t_serialized) {
4121     KA_TRACE(20, ("__kmpc_doacross_wait() exit: serialized team\n"));
4122     return; // no dependencies if team is serialized
4123   }
4124 
4125   // calculate sequential iteration number and check out-of-bounds condition
4126   pr_buf = th->th.th_dispatch;
4127   KMP_DEBUG_ASSERT(pr_buf->th_doacross_info != NULL);
4128   num_dims = (size_t)pr_buf->th_doacross_info[0];
4129   lo = pr_buf->th_doacross_info[2];
4130   up = pr_buf->th_doacross_info[3];
4131   st = pr_buf->th_doacross_info[4];
4132 #if OMPT_SUPPORT && OMPT_OPTIONAL
4133   ompt_dependence_t deps[num_dims];
4134 #endif
4135   if (st == 1) { // most common case
4136     if (vec[0] < lo || vec[0] > up) {
4137       KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
4138                     "bounds [%lld,%lld]\n",
4139                     gtid, vec[0], lo, up));
4140       return;
4141     }
4142     iter_number = vec[0] - lo;
4143   } else if (st > 0) {
4144     if (vec[0] < lo || vec[0] > up) {
4145       KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
4146                     "bounds [%lld,%lld]\n",
4147                     gtid, vec[0], lo, up));
4148       return;
4149     }
4150     iter_number = (kmp_uint64)(vec[0] - lo) / st;
4151   } else { // negative increment
4152     if (vec[0] > lo || vec[0] < up) {
4153       KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
4154                     "bounds [%lld,%lld]\n",
4155                     gtid, vec[0], lo, up));
4156       return;
4157     }
4158     iter_number = (kmp_uint64)(lo - vec[0]) / (-st);
4159   }
4160 #if OMPT_SUPPORT && OMPT_OPTIONAL
4161   deps[0].variable.value = iter_number;
4162   deps[0].dependence_type = ompt_dependence_type_sink;
4163 #endif
4164   for (i = 1; i < num_dims; ++i) {
4165     kmp_int64 iter, ln;
4166     size_t j = i * 4;
4167     ln = pr_buf->th_doacross_info[j + 1];
4168     lo = pr_buf->th_doacross_info[j + 2];
4169     up = pr_buf->th_doacross_info[j + 3];
4170     st = pr_buf->th_doacross_info[j + 4];
4171     if (st == 1) {
4172       if (vec[i] < lo || vec[i] > up) {
4173         KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
4174                       "bounds [%lld,%lld]\n",
4175                       gtid, vec[i], lo, up));
4176         return;
4177       }
4178       iter = vec[i] - lo;
4179     } else if (st > 0) {
4180       if (vec[i] < lo || vec[i] > up) {
4181         KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
4182                       "bounds [%lld,%lld]\n",
4183                       gtid, vec[i], lo, up));
4184         return;
4185       }
4186       iter = (kmp_uint64)(vec[i] - lo) / st;
4187     } else { // st < 0
4188       if (vec[i] > lo || vec[i] < up) {
4189         KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
4190                       "bounds [%lld,%lld]\n",
4191                       gtid, vec[i], lo, up));
4192         return;
4193       }
4194       iter = (kmp_uint64)(lo - vec[i]) / (-st);
4195     }
4196     iter_number = iter + ln * iter_number;
4197 #if OMPT_SUPPORT && OMPT_OPTIONAL
4198     deps[i].variable.value = iter;
4199     deps[i].dependence_type = ompt_dependence_type_sink;
4200 #endif
4201   }
4202   shft = iter_number % 32; // use 32-bit granularity
4203   iter_number >>= 5; // divided by 32
4204   flag = 1 << shft;
4205   while ((flag & pr_buf->th_doacross_flags[iter_number]) == 0) {
4206     KMP_YIELD(TRUE);
4207   }
4208   KMP_MB();
4209 #if OMPT_SUPPORT && OMPT_OPTIONAL
4210   if (ompt_enabled.ompt_callback_dependences) {
4211     ompt_callbacks.ompt_callback(ompt_callback_dependences)(
4212         &(OMPT_CUR_TASK_INFO(th)->task_data), deps, (kmp_uint32)num_dims);
4213   }
4214 #endif
4215   KA_TRACE(20,
4216            ("__kmpc_doacross_wait() exit: T#%d wait for iter %lld completed\n",
4217             gtid, (iter_number << 5) + shft));
4218 }
4219 
4220 void __kmpc_doacross_post(ident_t *loc, int gtid, const kmp_int64 *vec) {
4221   __kmp_assert_valid_gtid(gtid);
4222   kmp_int64 shft;
4223   size_t num_dims, i;
4224   kmp_uint32 flag;
4225   kmp_int64 iter_number; // iteration number of "collapsed" loop nest
4226   kmp_info_t *th = __kmp_threads[gtid];
4227   kmp_team_t *team = th->th.th_team;
4228   kmp_disp_t *pr_buf;
4229   kmp_int64 lo, st;
4230 
4231   KA_TRACE(20, ("__kmpc_doacross_post() enter: called T#%d\n", gtid));
4232   if (team->t.t_serialized) {
4233     KA_TRACE(20, ("__kmpc_doacross_post() exit: serialized team\n"));
4234     return; // no dependencies if team is serialized
4235   }
4236 
4237   // calculate sequential iteration number (same as in "wait" but no
4238   // out-of-bounds checks)
4239   pr_buf = th->th.th_dispatch;
4240   KMP_DEBUG_ASSERT(pr_buf->th_doacross_info != NULL);
4241   num_dims = (size_t)pr_buf->th_doacross_info[0];
4242   lo = pr_buf->th_doacross_info[2];
4243   st = pr_buf->th_doacross_info[4];
4244 #if OMPT_SUPPORT && OMPT_OPTIONAL
4245   ompt_dependence_t deps[num_dims];
4246 #endif
4247   if (st == 1) { // most common case
4248     iter_number = vec[0] - lo;
4249   } else if (st > 0) {
4250     iter_number = (kmp_uint64)(vec[0] - lo) / st;
4251   } else { // negative increment
4252     iter_number = (kmp_uint64)(lo - vec[0]) / (-st);
4253   }
4254 #if OMPT_SUPPORT && OMPT_OPTIONAL
4255   deps[0].variable.value = iter_number;
4256   deps[0].dependence_type = ompt_dependence_type_source;
4257 #endif
4258   for (i = 1; i < num_dims; ++i) {
4259     kmp_int64 iter, ln;
4260     size_t j = i * 4;
4261     ln = pr_buf->th_doacross_info[j + 1];
4262     lo = pr_buf->th_doacross_info[j + 2];
4263     st = pr_buf->th_doacross_info[j + 4];
4264     if (st == 1) {
4265       iter = vec[i] - lo;
4266     } else if (st > 0) {
4267       iter = (kmp_uint64)(vec[i] - lo) / st;
4268     } else { // st < 0
4269       iter = (kmp_uint64)(lo - vec[i]) / (-st);
4270     }
4271     iter_number = iter + ln * iter_number;
4272 #if OMPT_SUPPORT && OMPT_OPTIONAL
4273     deps[i].variable.value = iter;
4274     deps[i].dependence_type = ompt_dependence_type_source;
4275 #endif
4276   }
4277 #if OMPT_SUPPORT && OMPT_OPTIONAL
4278   if (ompt_enabled.ompt_callback_dependences) {
4279     ompt_callbacks.ompt_callback(ompt_callback_dependences)(
4280         &(OMPT_CUR_TASK_INFO(th)->task_data), deps, (kmp_uint32)num_dims);
4281   }
4282 #endif
4283   shft = iter_number % 32; // use 32-bit granularity
4284   iter_number >>= 5; // divided by 32
4285   flag = 1 << shft;
4286   KMP_MB();
4287   if ((flag & pr_buf->th_doacross_flags[iter_number]) == 0)
4288     KMP_TEST_THEN_OR32(&pr_buf->th_doacross_flags[iter_number], flag);
4289   KA_TRACE(20, ("__kmpc_doacross_post() exit: T#%d iter %lld posted\n", gtid,
4290                 (iter_number << 5) + shft));
4291 }
4292 
4293 void __kmpc_doacross_fini(ident_t *loc, int gtid) {
4294   __kmp_assert_valid_gtid(gtid);
4295   kmp_int32 num_done;
4296   kmp_info_t *th = __kmp_threads[gtid];
4297   kmp_team_t *team = th->th.th_team;
4298   kmp_disp_t *pr_buf = th->th.th_dispatch;
4299 
4300   KA_TRACE(20, ("__kmpc_doacross_fini() enter: called T#%d\n", gtid));
4301   if (team->t.t_serialized) {
4302     KA_TRACE(20, ("__kmpc_doacross_fini() exit: serialized team %p\n", team));
4303     return; // nothing to do
4304   }
4305   num_done =
4306       KMP_TEST_THEN_INC32((kmp_uintptr_t)(pr_buf->th_doacross_info[1])) + 1;
4307   if (num_done == th->th.th_team_nproc) {
4308     // we are the last thread, need to free shared resources
4309     int idx = pr_buf->th_doacross_buf_idx - 1;
4310     dispatch_shared_info_t *sh_buf =
4311         &team->t.t_disp_buffer[idx % __kmp_dispatch_num_buffers];
4312     KMP_DEBUG_ASSERT(pr_buf->th_doacross_info[1] ==
4313                      (kmp_int64)&sh_buf->doacross_num_done);
4314     KMP_DEBUG_ASSERT(num_done == sh_buf->doacross_num_done);
4315     KMP_DEBUG_ASSERT(idx == sh_buf->doacross_buf_idx);
4316     __kmp_thread_free(th, CCAST(kmp_uint32 *, sh_buf->doacross_flags));
4317     sh_buf->doacross_flags = NULL;
4318     sh_buf->doacross_num_done = 0;
4319     sh_buf->doacross_buf_idx +=
4320         __kmp_dispatch_num_buffers; // free buffer for future re-use
4321   }
4322   // free private resources (need to keep buffer index forever)
4323   pr_buf->th_doacross_flags = NULL;
4324   __kmp_thread_free(th, (void *)pr_buf->th_doacross_info);
4325   pr_buf->th_doacross_info = NULL;
4326   KA_TRACE(20, ("__kmpc_doacross_fini() exit: T#%d\n", gtid));
4327 }
4328 
4329 /* OpenMP 5.1 Memory Management routines */
4330 void *omp_alloc(size_t size, omp_allocator_handle_t allocator) {
4331   return __kmp_alloc(__kmp_entry_gtid(), 0, size, allocator);
4332 }
4333 
4334 void *omp_aligned_alloc(size_t align, size_t size,
4335                         omp_allocator_handle_t allocator) {
4336   return __kmp_alloc(__kmp_entry_gtid(), align, size, allocator);
4337 }
4338 
4339 void *omp_calloc(size_t nmemb, size_t size, omp_allocator_handle_t allocator) {
4340   return __kmp_calloc(__kmp_entry_gtid(), 0, nmemb, size, allocator);
4341 }
4342 
4343 void *omp_aligned_calloc(size_t align, size_t nmemb, size_t size,
4344                          omp_allocator_handle_t allocator) {
4345   return __kmp_calloc(__kmp_entry_gtid(), align, nmemb, size, allocator);
4346 }
4347 
4348 void *omp_realloc(void *ptr, size_t size, omp_allocator_handle_t allocator,
4349                   omp_allocator_handle_t free_allocator) {
4350   return __kmp_realloc(__kmp_entry_gtid(), ptr, size, allocator,
4351                         free_allocator);
4352 }
4353 
4354 void omp_free(void *ptr, omp_allocator_handle_t allocator) {
4355   ___kmpc_free(__kmp_entry_gtid(), ptr, allocator);
4356 }
4357 /* end of OpenMP 5.1 Memory Management routines */
4358 
4359 int __kmpc_get_target_offload(void) {
4360   if (!__kmp_init_serial) {
4361     __kmp_serial_initialize();
4362   }
4363   return __kmp_target_offload;
4364 }
4365 
4366 int __kmpc_pause_resource(kmp_pause_status_t level) {
4367   if (!__kmp_init_serial) {
4368     return 1; // Can't pause if runtime is not initialized
4369   }
4370   return __kmp_pause_resource(level);
4371 }
4372 
4373 void __kmpc_error(ident_t *loc, int severity, const char *message) {
4374   if (!__kmp_init_serial)
4375     __kmp_serial_initialize();
4376 
4377   KMP_ASSERT(severity == severity_warning || severity == severity_fatal);
4378 
4379 #if OMPT_SUPPORT
4380   if (ompt_enabled.enabled && ompt_enabled.ompt_callback_error) {
4381     ompt_callbacks.ompt_callback(ompt_callback_error)(
4382         (ompt_severity_t)severity, message, KMP_STRLEN(message),
4383         OMPT_GET_RETURN_ADDRESS(0));
4384   }
4385 #endif // OMPT_SUPPORT
4386 
4387   char *src_loc;
4388   if (loc && loc->psource) {
4389     kmp_str_loc_t str_loc = __kmp_str_loc_init(loc->psource, false);
4390     src_loc =
4391         __kmp_str_format("%s:%s:%s", str_loc.file, str_loc.line, str_loc.col);
4392     __kmp_str_loc_free(&str_loc);
4393   } else {
4394     src_loc = __kmp_str_format("unknown");
4395   }
4396 
4397   if (severity == severity_warning)
4398     KMP_WARNING(UserDirectedWarning, src_loc, message);
4399   else
4400     KMP_FATAL(UserDirectedError, src_loc, message);
4401 
4402   __kmp_str_free(&src_loc);
4403 }
4404 
4405 // Mark begin of scope directive.
4406 void __kmpc_scope(ident_t *loc, kmp_int32 gtid, void *reserved) {
4407 // reserved is for extension of scope directive and not used.
4408 #if OMPT_SUPPORT && OMPT_OPTIONAL
4409   if (ompt_enabled.enabled && ompt_enabled.ompt_callback_work) {
4410     kmp_team_t *team = __kmp_threads[gtid]->th.th_team;
4411     int tid = __kmp_tid_from_gtid(gtid);
4412     ompt_callbacks.ompt_callback(ompt_callback_work)(
4413         ompt_work_scope, ompt_scope_begin,
4414         &(team->t.ompt_team_info.parallel_data),
4415         &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data), 1,
4416         OMPT_GET_RETURN_ADDRESS(0));
4417   }
4418 #endif // OMPT_SUPPORT && OMPT_OPTIONAL
4419 }
4420 
4421 // Mark end of scope directive
4422 void __kmpc_end_scope(ident_t *loc, kmp_int32 gtid, void *reserved) {
4423 // reserved is for extension of scope directive and not used.
4424 #if OMPT_SUPPORT && OMPT_OPTIONAL
4425   if (ompt_enabled.enabled && ompt_enabled.ompt_callback_work) {
4426     kmp_team_t *team = __kmp_threads[gtid]->th.th_team;
4427     int tid = __kmp_tid_from_gtid(gtid);
4428     ompt_callbacks.ompt_callback(ompt_callback_work)(
4429         ompt_work_scope, ompt_scope_end,
4430         &(team->t.ompt_team_info.parallel_data),
4431         &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data), 1,
4432         OMPT_GET_RETURN_ADDRESS(0));
4433   }
4434 #endif // OMPT_SUPPORT && OMPT_OPTIONAL
4435 }
4436 
4437 #ifdef KMP_USE_VERSION_SYMBOLS
4438 // For GOMP compatibility there are two versions of each omp_* API.
4439 // One is the plain C symbol and one is the Fortran symbol with an appended
4440 // underscore. When we implement a specific ompc_* version of an omp_*
4441 // function, we want the plain GOMP versioned symbol to alias the ompc_* version
4442 // instead of the Fortran versions in kmp_ftn_entry.h
4443 extern "C" {
4444 // Have to undef these from omp.h so they aren't translated into
4445 // their ompc counterparts in the KMP_VERSION_OMPC_SYMBOL macros below
4446 #ifdef omp_set_affinity_format
4447 #undef omp_set_affinity_format
4448 #endif
4449 #ifdef omp_get_affinity_format
4450 #undef omp_get_affinity_format
4451 #endif
4452 #ifdef omp_display_affinity
4453 #undef omp_display_affinity
4454 #endif
4455 #ifdef omp_capture_affinity
4456 #undef omp_capture_affinity
4457 #endif
4458 KMP_VERSION_OMPC_SYMBOL(ompc_set_affinity_format, omp_set_affinity_format, 50,
4459                         "OMP_5.0");
4460 KMP_VERSION_OMPC_SYMBOL(ompc_get_affinity_format, omp_get_affinity_format, 50,
4461                         "OMP_5.0");
4462 KMP_VERSION_OMPC_SYMBOL(ompc_display_affinity, omp_display_affinity, 50,
4463                         "OMP_5.0");
4464 KMP_VERSION_OMPC_SYMBOL(ompc_capture_affinity, omp_capture_affinity, 50,
4465                         "OMP_5.0");
4466 } // extern "C"
4467 #endif
4468