xref: /freebsd/contrib/llvm-project/openmp/runtime/src/kmp_csupport.cpp (revision ee0fe82ee2892f5ece189db0eab38913aaab5f0f)
1 /*
2  * kmp_csupport.cpp -- kfront linkage support for OpenMP.
3  */
4 
5 //===----------------------------------------------------------------------===//
6 //
7 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8 // See https://llvm.org/LICENSE.txt for license information.
9 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #define __KMP_IMP
14 #include "omp.h" /* extern "C" declarations of user-visible routines */
15 #include "kmp.h"
16 #include "kmp_error.h"
17 #include "kmp_i18n.h"
18 #include "kmp_itt.h"
19 #include "kmp_lock.h"
20 #include "kmp_stats.h"
21 
22 #if OMPT_SUPPORT
23 #include "ompt-specific.h"
24 #endif
25 
26 #define MAX_MESSAGE 512
27 
28 // flags will be used in future, e.g. to implement openmp_strict library
29 // restrictions
30 
31 /*!
32  * @ingroup STARTUP_SHUTDOWN
33  * @param loc   in   source location information
34  * @param flags in   for future use (currently ignored)
35  *
36  * Initialize the runtime library. This call is optional; if it is not made then
37  * it will be implicitly called by attempts to use other library functions.
38  */
39 void __kmpc_begin(ident_t *loc, kmp_int32 flags) {
40   // By default __kmpc_begin() is no-op.
41   char *env;
42   if ((env = getenv("KMP_INITIAL_THREAD_BIND")) != NULL &&
43       __kmp_str_match_true(env)) {
44     __kmp_middle_initialize();
45     KC_TRACE(10, ("__kmpc_begin: middle initialization called\n"));
46   } else if (__kmp_ignore_mppbeg() == FALSE) {
47     // By default __kmp_ignore_mppbeg() returns TRUE.
48     __kmp_internal_begin();
49     KC_TRACE(10, ("__kmpc_begin: called\n"));
50   }
51 }
52 
53 /*!
54  * @ingroup STARTUP_SHUTDOWN
55  * @param loc source location information
56  *
57  * Shutdown the runtime library. This is also optional, and even if called will
58  * not do anything unless the `KMP_IGNORE_MPPEND` environment variable is set to
59  * zero.
60  */
61 void __kmpc_end(ident_t *loc) {
62   // By default, __kmp_ignore_mppend() returns TRUE which makes __kmpc_end()
63   // call no-op. However, this can be overridden with KMP_IGNORE_MPPEND
64   // environment variable. If KMP_IGNORE_MPPEND is 0, __kmp_ignore_mppend()
65   // returns FALSE and __kmpc_end() will unregister this root (it can cause
66   // library shut down).
67   if (__kmp_ignore_mppend() == FALSE) {
68     KC_TRACE(10, ("__kmpc_end: called\n"));
69     KA_TRACE(30, ("__kmpc_end\n"));
70 
71     __kmp_internal_end_thread(-1);
72   }
73 #if KMP_OS_WINDOWS && OMPT_SUPPORT
74   // Normal exit process on Windows does not allow worker threads of the final
75   // parallel region to finish reporting their events, so shutting down the
76   // library here fixes the issue at least for the cases where __kmpc_end() is
77   // placed properly.
78   if (ompt_enabled.enabled)
79     __kmp_internal_end_library(__kmp_gtid_get_specific());
80 #endif
81 }
82 
83 /*!
84 @ingroup THREAD_STATES
85 @param loc Source location information.
86 @return The global thread index of the active thread.
87 
88 This function can be called in any context.
89 
90 If the runtime has only been entered at the outermost level from a
91 single (necessarily non-OpenMP<sup>*</sup>) thread, then the thread number is
92 that which would be returned by omp_get_thread_num() in the outermost
93 active parallel construct. (Or zero if there is no active parallel
94 construct, since the master thread is necessarily thread zero).
95 
96 If multiple non-OpenMP threads all enter an OpenMP construct then this
97 will be a unique thread identifier among all the threads created by
98 the OpenMP runtime (but the value cannot be defined in terms of
99 OpenMP thread ids returned by omp_get_thread_num()).
100 */
101 kmp_int32 __kmpc_global_thread_num(ident_t *loc) {
102   kmp_int32 gtid = __kmp_entry_gtid();
103 
104   KC_TRACE(10, ("__kmpc_global_thread_num: T#%d\n", gtid));
105 
106   return gtid;
107 }
108 
109 /*!
110 @ingroup THREAD_STATES
111 @param loc Source location information.
112 @return The number of threads under control of the OpenMP<sup>*</sup> runtime
113 
114 This function can be called in any context.
115 It returns the total number of threads under the control of the OpenMP runtime.
116 That is not a number that can be determined by any OpenMP standard calls, since
117 the library may be called from more than one non-OpenMP thread, and this
118 reflects the total over all such calls. Similarly the runtime maintains
119 underlying threads even when they are not active (since the cost of creating
120 and destroying OS threads is high), this call counts all such threads even if
121 they are not waiting for work.
122 */
123 kmp_int32 __kmpc_global_num_threads(ident_t *loc) {
124   KC_TRACE(10,
125            ("__kmpc_global_num_threads: num_threads = %d\n", __kmp_all_nth));
126 
127   return TCR_4(__kmp_all_nth);
128 }
129 
130 /*!
131 @ingroup THREAD_STATES
132 @param loc Source location information.
133 @return The thread number of the calling thread in the innermost active parallel
134 construct.
135 */
136 kmp_int32 __kmpc_bound_thread_num(ident_t *loc) {
137   KC_TRACE(10, ("__kmpc_bound_thread_num: called\n"));
138   return __kmp_tid_from_gtid(__kmp_entry_gtid());
139 }
140 
141 /*!
142 @ingroup THREAD_STATES
143 @param loc Source location information.
144 @return The number of threads in the innermost active parallel construct.
145 */
146 kmp_int32 __kmpc_bound_num_threads(ident_t *loc) {
147   KC_TRACE(10, ("__kmpc_bound_num_threads: called\n"));
148 
149   return __kmp_entry_thread()->th.th_team->t.t_nproc;
150 }
151 
152 /*!
153  * @ingroup DEPRECATED
154  * @param loc location description
155  *
156  * This function need not be called. Non-debug builds always return TRUE.
157  */
158 kmp_int32 __kmpc_ok_to_fork(ident_t *loc) {
159 #ifndef KMP_DEBUG
160 
161   return TRUE;
162 
163 #else
164 
165   const char *semi2;
166   const char *semi3;
167   int line_no;
168 
169   if (__kmp_par_range == 0) {
170     return TRUE;
171   }
172   semi2 = loc->psource;
173   if (semi2 == NULL) {
174     return TRUE;
175   }
176   semi2 = strchr(semi2, ';');
177   if (semi2 == NULL) {
178     return TRUE;
179   }
180   semi2 = strchr(semi2 + 1, ';');
181   if (semi2 == NULL) {
182     return TRUE;
183   }
184   if (__kmp_par_range_filename[0]) {
185     const char *name = semi2 - 1;
186     while ((name > loc->psource) && (*name != '/') && (*name != ';')) {
187       name--;
188     }
189     if ((*name == '/') || (*name == ';')) {
190       name++;
191     }
192     if (strncmp(__kmp_par_range_filename, name, semi2 - name)) {
193       return __kmp_par_range < 0;
194     }
195   }
196   semi3 = strchr(semi2 + 1, ';');
197   if (__kmp_par_range_routine[0]) {
198     if ((semi3 != NULL) && (semi3 > semi2) &&
199         (strncmp(__kmp_par_range_routine, semi2 + 1, semi3 - semi2 - 1))) {
200       return __kmp_par_range < 0;
201     }
202   }
203   if (KMP_SSCANF(semi3 + 1, "%d", &line_no) == 1) {
204     if ((line_no >= __kmp_par_range_lb) && (line_no <= __kmp_par_range_ub)) {
205       return __kmp_par_range > 0;
206     }
207     return __kmp_par_range < 0;
208   }
209   return TRUE;
210 
211 #endif /* KMP_DEBUG */
212 }
213 
214 /*!
215 @ingroup THREAD_STATES
216 @param loc Source location information.
217 @return 1 if this thread is executing inside an active parallel region, zero if
218 not.
219 */
220 kmp_int32 __kmpc_in_parallel(ident_t *loc) {
221   return __kmp_entry_thread()->th.th_root->r.r_active;
222 }
223 
224 /*!
225 @ingroup PARALLEL
226 @param loc source location information
227 @param global_tid global thread number
228 @param num_threads number of threads requested for this parallel construct
229 
230 Set the number of threads to be used by the next fork spawned by this thread.
231 This call is only required if the parallel construct has a `num_threads` clause.
232 */
233 void __kmpc_push_num_threads(ident_t *loc, kmp_int32 global_tid,
234                              kmp_int32 num_threads) {
235   KA_TRACE(20, ("__kmpc_push_num_threads: enter T#%d num_threads=%d\n",
236                 global_tid, num_threads));
237 
238   __kmp_push_num_threads(loc, global_tid, num_threads);
239 }
240 
241 void __kmpc_pop_num_threads(ident_t *loc, kmp_int32 global_tid) {
242   KA_TRACE(20, ("__kmpc_pop_num_threads: enter\n"));
243 
244   /* the num_threads are automatically popped */
245 }
246 
247 void __kmpc_push_proc_bind(ident_t *loc, kmp_int32 global_tid,
248                            kmp_int32 proc_bind) {
249   KA_TRACE(20, ("__kmpc_push_proc_bind: enter T#%d proc_bind=%d\n", global_tid,
250                 proc_bind));
251 
252   __kmp_push_proc_bind(loc, global_tid, (kmp_proc_bind_t)proc_bind);
253 }
254 
255 /*!
256 @ingroup PARALLEL
257 @param loc  source location information
258 @param argc  total number of arguments in the ellipsis
259 @param microtask  pointer to callback routine consisting of outlined parallel
260 construct
261 @param ...  pointers to shared variables that aren't global
262 
263 Do the actual fork and call the microtask in the relevant number of threads.
264 */
265 void __kmpc_fork_call(ident_t *loc, kmp_int32 argc, kmpc_micro microtask, ...) {
266   int gtid = __kmp_entry_gtid();
267 
268 #if (KMP_STATS_ENABLED)
269   // If we were in a serial region, then stop the serial timer, record
270   // the event, and start parallel region timer
271   stats_state_e previous_state = KMP_GET_THREAD_STATE();
272   if (previous_state == stats_state_e::SERIAL_REGION) {
273     KMP_EXCHANGE_PARTITIONED_TIMER(OMP_parallel_overhead);
274   } else {
275     KMP_PUSH_PARTITIONED_TIMER(OMP_parallel_overhead);
276   }
277   int inParallel = __kmpc_in_parallel(loc);
278   if (inParallel) {
279     KMP_COUNT_BLOCK(OMP_NESTED_PARALLEL);
280   } else {
281     KMP_COUNT_BLOCK(OMP_PARALLEL);
282   }
283 #endif
284 
285   // maybe to save thr_state is enough here
286   {
287     va_list ap;
288     va_start(ap, microtask);
289 
290 #if OMPT_SUPPORT
291     ompt_frame_t *ompt_frame;
292     if (ompt_enabled.enabled) {
293       kmp_info_t *master_th = __kmp_threads[gtid];
294       kmp_team_t *parent_team = master_th->th.th_team;
295       ompt_lw_taskteam_t *lwt = parent_team->t.ompt_serialized_team_info;
296       if (lwt)
297         ompt_frame = &(lwt->ompt_task_info.frame);
298       else {
299         int tid = __kmp_tid_from_gtid(gtid);
300         ompt_frame = &(
301             parent_team->t.t_implicit_task_taskdata[tid].ompt_task_info.frame);
302       }
303       ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
304       OMPT_STORE_RETURN_ADDRESS(gtid);
305     }
306 #endif
307 
308 #if INCLUDE_SSC_MARKS
309     SSC_MARK_FORKING();
310 #endif
311     __kmp_fork_call(loc, gtid, fork_context_intel, argc,
312                     VOLATILE_CAST(microtask_t) microtask, // "wrapped" task
313                     VOLATILE_CAST(launch_t) __kmp_invoke_task_func,
314 /* TODO: revert workaround for Intel(R) 64 tracker #96 */
315 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
316                     &ap
317 #else
318                     ap
319 #endif
320                     );
321 #if INCLUDE_SSC_MARKS
322     SSC_MARK_JOINING();
323 #endif
324     __kmp_join_call(loc, gtid
325 #if OMPT_SUPPORT
326                     ,
327                     fork_context_intel
328 #endif
329                     );
330 
331     va_end(ap);
332   }
333 
334 #if KMP_STATS_ENABLED
335   if (previous_state == stats_state_e::SERIAL_REGION) {
336     KMP_EXCHANGE_PARTITIONED_TIMER(OMP_serial);
337   } else {
338     KMP_POP_PARTITIONED_TIMER();
339   }
340 #endif // KMP_STATS_ENABLED
341 }
342 
343 /*!
344 @ingroup PARALLEL
345 @param loc source location information
346 @param global_tid global thread number
347 @param num_teams number of teams requested for the teams construct
348 @param num_threads number of threads per team requested for the teams construct
349 
350 Set the number of teams to be used by the teams construct.
351 This call is only required if the teams construct has a `num_teams` clause
352 or a `thread_limit` clause (or both).
353 */
354 void __kmpc_push_num_teams(ident_t *loc, kmp_int32 global_tid,
355                            kmp_int32 num_teams, kmp_int32 num_threads) {
356   KA_TRACE(20,
357            ("__kmpc_push_num_teams: enter T#%d num_teams=%d num_threads=%d\n",
358             global_tid, num_teams, num_threads));
359 
360   __kmp_push_num_teams(loc, global_tid, num_teams, num_threads);
361 }
362 
363 /*!
364 @ingroup PARALLEL
365 @param loc  source location information
366 @param argc  total number of arguments in the ellipsis
367 @param microtask  pointer to callback routine consisting of outlined teams
368 construct
369 @param ...  pointers to shared variables that aren't global
370 
371 Do the actual fork and call the microtask in the relevant number of threads.
372 */
373 void __kmpc_fork_teams(ident_t *loc, kmp_int32 argc, kmpc_micro microtask,
374                        ...) {
375   int gtid = __kmp_entry_gtid();
376   kmp_info_t *this_thr = __kmp_threads[gtid];
377   va_list ap;
378   va_start(ap, microtask);
379 
380 #if KMP_STATS_ENABLED
381   KMP_COUNT_BLOCK(OMP_TEAMS);
382   stats_state_e previous_state = KMP_GET_THREAD_STATE();
383   if (previous_state == stats_state_e::SERIAL_REGION) {
384     KMP_EXCHANGE_PARTITIONED_TIMER(OMP_teams_overhead);
385   } else {
386     KMP_PUSH_PARTITIONED_TIMER(OMP_teams_overhead);
387   }
388 #endif
389 
390   // remember teams entry point and nesting level
391   this_thr->th.th_teams_microtask = microtask;
392   this_thr->th.th_teams_level =
393       this_thr->th.th_team->t.t_level; // AC: can be >0 on host
394 
395 #if OMPT_SUPPORT
396   kmp_team_t *parent_team = this_thr->th.th_team;
397   int tid = __kmp_tid_from_gtid(gtid);
398   if (ompt_enabled.enabled) {
399     parent_team->t.t_implicit_task_taskdata[tid]
400         .ompt_task_info.frame.enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
401   }
402   OMPT_STORE_RETURN_ADDRESS(gtid);
403 #endif
404 
405   // check if __kmpc_push_num_teams called, set default number of teams
406   // otherwise
407   if (this_thr->th.th_teams_size.nteams == 0) {
408     __kmp_push_num_teams(loc, gtid, 0, 0);
409   }
410   KMP_DEBUG_ASSERT(this_thr->th.th_set_nproc >= 1);
411   KMP_DEBUG_ASSERT(this_thr->th.th_teams_size.nteams >= 1);
412   KMP_DEBUG_ASSERT(this_thr->th.th_teams_size.nth >= 1);
413 
414   __kmp_fork_call(loc, gtid, fork_context_intel, argc,
415                   VOLATILE_CAST(microtask_t)
416                       __kmp_teams_master, // "wrapped" task
417                   VOLATILE_CAST(launch_t) __kmp_invoke_teams_master,
418 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
419                   &ap
420 #else
421                   ap
422 #endif
423                   );
424   __kmp_join_call(loc, gtid
425 #if OMPT_SUPPORT
426                   ,
427                   fork_context_intel
428 #endif
429                   );
430 
431   // Pop current CG root off list
432   KMP_DEBUG_ASSERT(this_thr->th.th_cg_roots);
433   kmp_cg_root_t *tmp = this_thr->th.th_cg_roots;
434   this_thr->th.th_cg_roots = tmp->up;
435   KA_TRACE(100, ("__kmpc_fork_teams: Thread %p popping node %p and moving up"
436                  " to node %p. cg_nthreads was %d\n",
437                  this_thr, tmp, this_thr->th.th_cg_roots, tmp->cg_nthreads));
438   KMP_DEBUG_ASSERT(tmp->cg_nthreads);
439   int i = tmp->cg_nthreads--;
440   if (i == 1) { // check is we are the last thread in CG (not always the case)
441     __kmp_free(tmp);
442   }
443   // Restore current task's thread_limit from CG root
444   KMP_DEBUG_ASSERT(this_thr->th.th_cg_roots);
445   this_thr->th.th_current_task->td_icvs.thread_limit =
446       this_thr->th.th_cg_roots->cg_thread_limit;
447 
448   this_thr->th.th_teams_microtask = NULL;
449   this_thr->th.th_teams_level = 0;
450   *(kmp_int64 *)(&this_thr->th.th_teams_size) = 0L;
451   va_end(ap);
452 #if KMP_STATS_ENABLED
453   if (previous_state == stats_state_e::SERIAL_REGION) {
454     KMP_EXCHANGE_PARTITIONED_TIMER(OMP_serial);
455   } else {
456     KMP_POP_PARTITIONED_TIMER();
457   }
458 #endif // KMP_STATS_ENABLED
459 }
460 
461 // I don't think this function should ever have been exported.
462 // The __kmpc_ prefix was misapplied.  I'm fairly certain that no generated
463 // openmp code ever called it, but it's been exported from the RTL for so
464 // long that I'm afraid to remove the definition.
465 int __kmpc_invoke_task_func(int gtid) { return __kmp_invoke_task_func(gtid); }
466 
467 /*!
468 @ingroup PARALLEL
469 @param loc  source location information
470 @param global_tid  global thread number
471 
472 Enter a serialized parallel construct. This interface is used to handle a
473 conditional parallel region, like this,
474 @code
475 #pragma omp parallel if (condition)
476 @endcode
477 when the condition is false.
478 */
479 void __kmpc_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
480 // The implementation is now in kmp_runtime.cpp so that it can share static
481 // functions with kmp_fork_call since the tasks to be done are similar in
482 // each case.
483 #if OMPT_SUPPORT
484   OMPT_STORE_RETURN_ADDRESS(global_tid);
485 #endif
486   __kmp_serialized_parallel(loc, global_tid);
487 }
488 
489 /*!
490 @ingroup PARALLEL
491 @param loc  source location information
492 @param global_tid  global thread number
493 
494 Leave a serialized parallel construct.
495 */
496 void __kmpc_end_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
497   kmp_internal_control_t *top;
498   kmp_info_t *this_thr;
499   kmp_team_t *serial_team;
500 
501   KC_TRACE(10,
502            ("__kmpc_end_serialized_parallel: called by T#%d\n", global_tid));
503 
504   /* skip all this code for autopar serialized loops since it results in
505      unacceptable overhead */
506   if (loc != NULL && (loc->flags & KMP_IDENT_AUTOPAR))
507     return;
508 
509   // Not autopar code
510   if (!TCR_4(__kmp_init_parallel))
511     __kmp_parallel_initialize();
512 
513   __kmp_resume_if_soft_paused();
514 
515   this_thr = __kmp_threads[global_tid];
516   serial_team = this_thr->th.th_serial_team;
517 
518   kmp_task_team_t *task_team = this_thr->th.th_task_team;
519   // we need to wait for the proxy tasks before finishing the thread
520   if (task_team != NULL && task_team->tt.tt_found_proxy_tasks)
521     __kmp_task_team_wait(this_thr, serial_team USE_ITT_BUILD_ARG(NULL));
522 
523   KMP_MB();
524   KMP_DEBUG_ASSERT(serial_team);
525   KMP_ASSERT(serial_team->t.t_serialized);
526   KMP_DEBUG_ASSERT(this_thr->th.th_team == serial_team);
527   KMP_DEBUG_ASSERT(serial_team != this_thr->th.th_root->r.r_root_team);
528   KMP_DEBUG_ASSERT(serial_team->t.t_threads);
529   KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
530 
531 #if OMPT_SUPPORT
532   if (ompt_enabled.enabled &&
533       this_thr->th.ompt_thread_info.state != ompt_state_overhead) {
534     OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame = ompt_data_none;
535     if (ompt_enabled.ompt_callback_implicit_task) {
536       ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
537           ompt_scope_end, NULL, OMPT_CUR_TASK_DATA(this_thr), 1,
538           OMPT_CUR_TASK_INFO(this_thr)->thread_num, ompt_task_implicit);
539     }
540 
541     // reset clear the task id only after unlinking the task
542     ompt_data_t *parent_task_data;
543     __ompt_get_task_info_internal(1, NULL, &parent_task_data, NULL, NULL, NULL);
544 
545     if (ompt_enabled.ompt_callback_parallel_end) {
546       ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
547           &(serial_team->t.ompt_team_info.parallel_data), parent_task_data,
548           ompt_parallel_invoker_program, OMPT_LOAD_RETURN_ADDRESS(global_tid));
549     }
550     __ompt_lw_taskteam_unlink(this_thr);
551     this_thr->th.ompt_thread_info.state = ompt_state_overhead;
552   }
553 #endif
554 
555   /* If necessary, pop the internal control stack values and replace the team
556    * values */
557   top = serial_team->t.t_control_stack_top;
558   if (top && top->serial_nesting_level == serial_team->t.t_serialized) {
559     copy_icvs(&serial_team->t.t_threads[0]->th.th_current_task->td_icvs, top);
560     serial_team->t.t_control_stack_top = top->next;
561     __kmp_free(top);
562   }
563 
564   // if( serial_team -> t.t_serialized > 1 )
565   serial_team->t.t_level--;
566 
567   /* pop dispatch buffers stack */
568   KMP_DEBUG_ASSERT(serial_team->t.t_dispatch->th_disp_buffer);
569   {
570     dispatch_private_info_t *disp_buffer =
571         serial_team->t.t_dispatch->th_disp_buffer;
572     serial_team->t.t_dispatch->th_disp_buffer =
573         serial_team->t.t_dispatch->th_disp_buffer->next;
574     __kmp_free(disp_buffer);
575   }
576   this_thr->th.th_def_allocator = serial_team->t.t_def_allocator; // restore
577 
578   --serial_team->t.t_serialized;
579   if (serial_team->t.t_serialized == 0) {
580 
581 /* return to the parallel section */
582 
583 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
584     if (__kmp_inherit_fp_control && serial_team->t.t_fp_control_saved) {
585       __kmp_clear_x87_fpu_status_word();
586       __kmp_load_x87_fpu_control_word(&serial_team->t.t_x87_fpu_control_word);
587       __kmp_load_mxcsr(&serial_team->t.t_mxcsr);
588     }
589 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
590 
591     this_thr->th.th_team = serial_team->t.t_parent;
592     this_thr->th.th_info.ds.ds_tid = serial_team->t.t_master_tid;
593 
594     /* restore values cached in the thread */
595     this_thr->th.th_team_nproc = serial_team->t.t_parent->t.t_nproc; /*  JPH */
596     this_thr->th.th_team_master =
597         serial_team->t.t_parent->t.t_threads[0]; /* JPH */
598     this_thr->th.th_team_serialized = this_thr->th.th_team->t.t_serialized;
599 
600     /* TODO the below shouldn't need to be adjusted for serialized teams */
601     this_thr->th.th_dispatch =
602         &this_thr->th.th_team->t.t_dispatch[serial_team->t.t_master_tid];
603 
604     __kmp_pop_current_task_from_thread(this_thr);
605 
606     KMP_ASSERT(this_thr->th.th_current_task->td_flags.executing == 0);
607     this_thr->th.th_current_task->td_flags.executing = 1;
608 
609     if (__kmp_tasking_mode != tskm_immediate_exec) {
610       // Copy the task team from the new child / old parent team to the thread.
611       this_thr->th.th_task_team =
612           this_thr->th.th_team->t.t_task_team[this_thr->th.th_task_state];
613       KA_TRACE(20,
614                ("__kmpc_end_serialized_parallel: T#%d restoring task_team %p / "
615                 "team %p\n",
616                 global_tid, this_thr->th.th_task_team, this_thr->th.th_team));
617     }
618   } else {
619     if (__kmp_tasking_mode != tskm_immediate_exec) {
620       KA_TRACE(20, ("__kmpc_end_serialized_parallel: T#%d decreasing nesting "
621                     "depth of serial team %p to %d\n",
622                     global_tid, serial_team, serial_team->t.t_serialized));
623     }
624   }
625 
626   if (__kmp_env_consistency_check)
627     __kmp_pop_parallel(global_tid, NULL);
628 #if OMPT_SUPPORT
629   if (ompt_enabled.enabled)
630     this_thr->th.ompt_thread_info.state =
631         ((this_thr->th.th_team_serialized) ? ompt_state_work_serial
632                                            : ompt_state_work_parallel);
633 #endif
634 }
635 
636 /*!
637 @ingroup SYNCHRONIZATION
638 @param loc  source location information.
639 
640 Execute <tt>flush</tt>. This is implemented as a full memory fence. (Though
641 depending on the memory ordering convention obeyed by the compiler
642 even that may not be necessary).
643 */
644 void __kmpc_flush(ident_t *loc) {
645   KC_TRACE(10, ("__kmpc_flush: called\n"));
646 
647   /* need explicit __mf() here since use volatile instead in library */
648   KMP_MB(); /* Flush all pending memory write invalidates.  */
649 
650 #if (KMP_ARCH_X86 || KMP_ARCH_X86_64)
651 #if KMP_MIC
652 // fence-style instructions do not exist, but lock; xaddl $0,(%rsp) can be used.
653 // We shouldn't need it, though, since the ABI rules require that
654 // * If the compiler generates NGO stores it also generates the fence
655 // * If users hand-code NGO stores they should insert the fence
656 // therefore no incomplete unordered stores should be visible.
657 #else
658   // C74404
659   // This is to address non-temporal store instructions (sfence needed).
660   // The clflush instruction is addressed either (mfence needed).
661   // Probably the non-temporal load monvtdqa instruction should also be
662   // addressed.
663   // mfence is a SSE2 instruction. Do not execute it if CPU is not SSE2.
664   if (!__kmp_cpuinfo.initialized) {
665     __kmp_query_cpuid(&__kmp_cpuinfo);
666   }
667   if (!__kmp_cpuinfo.sse2) {
668     // CPU cannot execute SSE2 instructions.
669   } else {
670 #if KMP_COMPILER_ICC
671     _mm_mfence();
672 #elif KMP_COMPILER_MSVC
673     MemoryBarrier();
674 #else
675     __sync_synchronize();
676 #endif // KMP_COMPILER_ICC
677   }
678 #endif // KMP_MIC
679 #elif (KMP_ARCH_ARM || KMP_ARCH_AARCH64 || KMP_ARCH_MIPS || KMP_ARCH_MIPS64)
680 // Nothing to see here move along
681 #elif KMP_ARCH_PPC64
682 // Nothing needed here (we have a real MB above).
683 #if KMP_OS_CNK
684   // The flushing thread needs to yield here; this prevents a
685   // busy-waiting thread from saturating the pipeline. flush is
686   // often used in loops like this:
687   // while (!flag) {
688   //   #pragma omp flush(flag)
689   // }
690   // and adding the yield here is good for at least a 10x speedup
691   // when running >2 threads per core (on the NAS LU benchmark).
692   __kmp_yield();
693 #endif
694 #else
695 #error Unknown or unsupported architecture
696 #endif
697 
698 #if OMPT_SUPPORT && OMPT_OPTIONAL
699   if (ompt_enabled.ompt_callback_flush) {
700     ompt_callbacks.ompt_callback(ompt_callback_flush)(
701         __ompt_get_thread_data_internal(), OMPT_GET_RETURN_ADDRESS(0));
702   }
703 #endif
704 }
705 
706 /* -------------------------------------------------------------------------- */
707 /*!
708 @ingroup SYNCHRONIZATION
709 @param loc source location information
710 @param global_tid thread id.
711 
712 Execute a barrier.
713 */
714 void __kmpc_barrier(ident_t *loc, kmp_int32 global_tid) {
715   KMP_COUNT_BLOCK(OMP_BARRIER);
716   KC_TRACE(10, ("__kmpc_barrier: called T#%d\n", global_tid));
717 
718   if (!TCR_4(__kmp_init_parallel))
719     __kmp_parallel_initialize();
720 
721   __kmp_resume_if_soft_paused();
722 
723   if (__kmp_env_consistency_check) {
724     if (loc == 0) {
725       KMP_WARNING(ConstructIdentInvalid); // ??? What does it mean for the user?
726     }
727     __kmp_check_barrier(global_tid, ct_barrier, loc);
728   }
729 
730 #if OMPT_SUPPORT
731   ompt_frame_t *ompt_frame;
732   if (ompt_enabled.enabled) {
733     __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
734     if (ompt_frame->enter_frame.ptr == NULL)
735       ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
736     OMPT_STORE_RETURN_ADDRESS(global_tid);
737   }
738 #endif
739   __kmp_threads[global_tid]->th.th_ident = loc;
740   // TODO: explicit barrier_wait_id:
741   //   this function is called when 'barrier' directive is present or
742   //   implicit barrier at the end of a worksharing construct.
743   // 1) better to add a per-thread barrier counter to a thread data structure
744   // 2) set to 0 when a new team is created
745   // 4) no sync is required
746 
747   __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
748 #if OMPT_SUPPORT && OMPT_OPTIONAL
749   if (ompt_enabled.enabled) {
750     ompt_frame->enter_frame = ompt_data_none;
751   }
752 #endif
753 }
754 
755 /* The BARRIER for a MASTER section is always explicit   */
756 /*!
757 @ingroup WORK_SHARING
758 @param loc  source location information.
759 @param global_tid  global thread number .
760 @return 1 if this thread should execute the <tt>master</tt> block, 0 otherwise.
761 */
762 kmp_int32 __kmpc_master(ident_t *loc, kmp_int32 global_tid) {
763   int status = 0;
764 
765   KC_TRACE(10, ("__kmpc_master: called T#%d\n", global_tid));
766 
767   if (!TCR_4(__kmp_init_parallel))
768     __kmp_parallel_initialize();
769 
770   __kmp_resume_if_soft_paused();
771 
772   if (KMP_MASTER_GTID(global_tid)) {
773     KMP_COUNT_BLOCK(OMP_MASTER);
774     KMP_PUSH_PARTITIONED_TIMER(OMP_master);
775     status = 1;
776   }
777 
778 #if OMPT_SUPPORT && OMPT_OPTIONAL
779   if (status) {
780     if (ompt_enabled.ompt_callback_master) {
781       kmp_info_t *this_thr = __kmp_threads[global_tid];
782       kmp_team_t *team = this_thr->th.th_team;
783 
784       int tid = __kmp_tid_from_gtid(global_tid);
785       ompt_callbacks.ompt_callback(ompt_callback_master)(
786           ompt_scope_begin, &(team->t.ompt_team_info.parallel_data),
787           &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
788           OMPT_GET_RETURN_ADDRESS(0));
789     }
790   }
791 #endif
792 
793   if (__kmp_env_consistency_check) {
794 #if KMP_USE_DYNAMIC_LOCK
795     if (status)
796       __kmp_push_sync(global_tid, ct_master, loc, NULL, 0);
797     else
798       __kmp_check_sync(global_tid, ct_master, loc, NULL, 0);
799 #else
800     if (status)
801       __kmp_push_sync(global_tid, ct_master, loc, NULL);
802     else
803       __kmp_check_sync(global_tid, ct_master, loc, NULL);
804 #endif
805   }
806 
807   return status;
808 }
809 
810 /*!
811 @ingroup WORK_SHARING
812 @param loc  source location information.
813 @param global_tid  global thread number .
814 
815 Mark the end of a <tt>master</tt> region. This should only be called by the
816 thread that executes the <tt>master</tt> region.
817 */
818 void __kmpc_end_master(ident_t *loc, kmp_int32 global_tid) {
819   KC_TRACE(10, ("__kmpc_end_master: called T#%d\n", global_tid));
820 
821   KMP_DEBUG_ASSERT(KMP_MASTER_GTID(global_tid));
822   KMP_POP_PARTITIONED_TIMER();
823 
824 #if OMPT_SUPPORT && OMPT_OPTIONAL
825   kmp_info_t *this_thr = __kmp_threads[global_tid];
826   kmp_team_t *team = this_thr->th.th_team;
827   if (ompt_enabled.ompt_callback_master) {
828     int tid = __kmp_tid_from_gtid(global_tid);
829     ompt_callbacks.ompt_callback(ompt_callback_master)(
830         ompt_scope_end, &(team->t.ompt_team_info.parallel_data),
831         &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
832         OMPT_GET_RETURN_ADDRESS(0));
833   }
834 #endif
835 
836   if (__kmp_env_consistency_check) {
837     if (global_tid < 0)
838       KMP_WARNING(ThreadIdentInvalid);
839 
840     if (KMP_MASTER_GTID(global_tid))
841       __kmp_pop_sync(global_tid, ct_master, loc);
842   }
843 }
844 
845 /*!
846 @ingroup WORK_SHARING
847 @param loc  source location information.
848 @param gtid  global thread number.
849 
850 Start execution of an <tt>ordered</tt> construct.
851 */
852 void __kmpc_ordered(ident_t *loc, kmp_int32 gtid) {
853   int cid = 0;
854   kmp_info_t *th;
855   KMP_DEBUG_ASSERT(__kmp_init_serial);
856 
857   KC_TRACE(10, ("__kmpc_ordered: called T#%d\n", gtid));
858 
859   if (!TCR_4(__kmp_init_parallel))
860     __kmp_parallel_initialize();
861 
862   __kmp_resume_if_soft_paused();
863 
864 #if USE_ITT_BUILD
865   __kmp_itt_ordered_prep(gtid);
866 // TODO: ordered_wait_id
867 #endif /* USE_ITT_BUILD */
868 
869   th = __kmp_threads[gtid];
870 
871 #if OMPT_SUPPORT && OMPT_OPTIONAL
872   kmp_team_t *team;
873   ompt_wait_id_t lck;
874   void *codeptr_ra;
875   if (ompt_enabled.enabled) {
876     OMPT_STORE_RETURN_ADDRESS(gtid);
877     team = __kmp_team_from_gtid(gtid);
878     lck = (ompt_wait_id_t)(uintptr_t)&team->t.t_ordered.dt.t_value;
879     /* OMPT state update */
880     th->th.ompt_thread_info.wait_id = lck;
881     th->th.ompt_thread_info.state = ompt_state_wait_ordered;
882 
883     /* OMPT event callback */
884     codeptr_ra = OMPT_LOAD_RETURN_ADDRESS(gtid);
885     if (ompt_enabled.ompt_callback_mutex_acquire) {
886       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
887           ompt_mutex_ordered, omp_lock_hint_none, kmp_mutex_impl_spin, lck,
888           codeptr_ra);
889     }
890   }
891 #endif
892 
893   if (th->th.th_dispatch->th_deo_fcn != 0)
894     (*th->th.th_dispatch->th_deo_fcn)(&gtid, &cid, loc);
895   else
896     __kmp_parallel_deo(&gtid, &cid, loc);
897 
898 #if OMPT_SUPPORT && OMPT_OPTIONAL
899   if (ompt_enabled.enabled) {
900     /* OMPT state update */
901     th->th.ompt_thread_info.state = ompt_state_work_parallel;
902     th->th.ompt_thread_info.wait_id = 0;
903 
904     /* OMPT event callback */
905     if (ompt_enabled.ompt_callback_mutex_acquired) {
906       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
907           ompt_mutex_ordered, (ompt_wait_id_t)(uintptr_t)lck, codeptr_ra);
908     }
909   }
910 #endif
911 
912 #if USE_ITT_BUILD
913   __kmp_itt_ordered_start(gtid);
914 #endif /* USE_ITT_BUILD */
915 }
916 
917 /*!
918 @ingroup WORK_SHARING
919 @param loc  source location information.
920 @param gtid  global thread number.
921 
922 End execution of an <tt>ordered</tt> construct.
923 */
924 void __kmpc_end_ordered(ident_t *loc, kmp_int32 gtid) {
925   int cid = 0;
926   kmp_info_t *th;
927 
928   KC_TRACE(10, ("__kmpc_end_ordered: called T#%d\n", gtid));
929 
930 #if USE_ITT_BUILD
931   __kmp_itt_ordered_end(gtid);
932 // TODO: ordered_wait_id
933 #endif /* USE_ITT_BUILD */
934 
935   th = __kmp_threads[gtid];
936 
937   if (th->th.th_dispatch->th_dxo_fcn != 0)
938     (*th->th.th_dispatch->th_dxo_fcn)(&gtid, &cid, loc);
939   else
940     __kmp_parallel_dxo(&gtid, &cid, loc);
941 
942 #if OMPT_SUPPORT && OMPT_OPTIONAL
943   OMPT_STORE_RETURN_ADDRESS(gtid);
944   if (ompt_enabled.ompt_callback_mutex_released) {
945     ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
946         ompt_mutex_ordered,
947         (ompt_wait_id_t)(uintptr_t)&__kmp_team_from_gtid(gtid)
948             ->t.t_ordered.dt.t_value,
949         OMPT_LOAD_RETURN_ADDRESS(gtid));
950   }
951 #endif
952 }
953 
954 #if KMP_USE_DYNAMIC_LOCK
955 
956 static __forceinline void
957 __kmp_init_indirect_csptr(kmp_critical_name *crit, ident_t const *loc,
958                           kmp_int32 gtid, kmp_indirect_locktag_t tag) {
959   // Pointer to the allocated indirect lock is written to crit, while indexing
960   // is ignored.
961   void *idx;
962   kmp_indirect_lock_t **lck;
963   lck = (kmp_indirect_lock_t **)crit;
964   kmp_indirect_lock_t *ilk = __kmp_allocate_indirect_lock(&idx, gtid, tag);
965   KMP_I_LOCK_FUNC(ilk, init)(ilk->lock);
966   KMP_SET_I_LOCK_LOCATION(ilk, loc);
967   KMP_SET_I_LOCK_FLAGS(ilk, kmp_lf_critical_section);
968   KA_TRACE(20,
969            ("__kmp_init_indirect_csptr: initialized indirect lock #%d\n", tag));
970 #if USE_ITT_BUILD
971   __kmp_itt_critical_creating(ilk->lock, loc);
972 #endif
973   int status = KMP_COMPARE_AND_STORE_PTR(lck, nullptr, ilk);
974   if (status == 0) {
975 #if USE_ITT_BUILD
976     __kmp_itt_critical_destroyed(ilk->lock);
977 #endif
978     // We don't really need to destroy the unclaimed lock here since it will be
979     // cleaned up at program exit.
980     // KMP_D_LOCK_FUNC(&idx, destroy)((kmp_dyna_lock_t *)&idx);
981   }
982   KMP_DEBUG_ASSERT(*lck != NULL);
983 }
984 
// Fast-path acquire tas lock.
//   lock - pointer to the lock; it is cast to kmp_tas_lock_t *
//   gtid - global thread id of the acquirer; the busy value stored in the poll
//          word is built from gtid + 1
// The macro first tries a relaxed load plus an acquire compare-and-store of
// the poll word from the free value to the busy value. If that fails it loops:
// it yields unconditionally when __kmp_nth exceeds the number of available
// processors, otherwise uses KMP_YIELD_SPIN, applies __kmp_spin_backoff, and
// retries until the compare-and-store succeeds.
// Macro arguments are parenthesized so that arbitrary expressions can be
// passed safely.
#define KMP_ACQUIRE_TAS_LOCK(lock, gtid)                                       \
  {                                                                            \
    kmp_tas_lock_t *l = (kmp_tas_lock_t *)(lock);                              \
    kmp_int32 tas_free = KMP_LOCK_FREE(tas);                                   \
    kmp_int32 tas_busy = KMP_LOCK_BUSY((gtid) + 1, tas);                       \
    if (KMP_ATOMIC_LD_RLX(&l->lk.poll) != tas_free ||                          \
        !__kmp_atomic_compare_store_acq(&l->lk.poll, tas_free, tas_busy)) {    \
      kmp_uint32 spins;                                                        \
      KMP_FSYNC_PREPARE(l);                                                    \
      KMP_INIT_YIELD(spins);                                                   \
      kmp_backoff_t backoff = __kmp_spin_backoff_params;                       \
      do {                                                                     \
        if (TCR_4(__kmp_nth) >                                                 \
            (__kmp_avail_proc ? __kmp_avail_proc : __kmp_xproc)) {             \
          KMP_YIELD(TRUE);                                                     \
        } else {                                                               \
          KMP_YIELD_SPIN(spins);                                               \
        }                                                                      \
        __kmp_spin_backoff(&backoff);                                          \
      } while (                                                                \
          KMP_ATOMIC_LD_RLX(&l->lk.poll) != tas_free ||                        \
          !__kmp_atomic_compare_store_acq(&l->lk.poll, tas_free, tas_busy));   \
    }                                                                          \
    KMP_FSYNC_ACQUIRED(l);                                                     \
  }
1011 
// Fast-path test tas lock.
//   lock - pointer to the lock; it is cast to kmp_tas_lock_t *
//   gtid - global thread id of the caller; the busy value is built from
//          gtid + 1
//   rc   - lvalue that receives the result: non-zero when the poll word was
//          free and the acquire compare-and-store to the busy value succeeded,
//          zero otherwise
// A single attempt is made; the macro never spins.
// Macro arguments are parenthesized so that arbitrary expressions can be
// passed safely.
#define KMP_TEST_TAS_LOCK(lock, gtid, rc)                                      \
  {                                                                            \
    kmp_tas_lock_t *l = (kmp_tas_lock_t *)(lock);                              \
    kmp_int32 tas_free = KMP_LOCK_FREE(tas);                                   \
    kmp_int32 tas_busy = KMP_LOCK_BUSY((gtid) + 1, tas);                       \
    (rc) = KMP_ATOMIC_LD_RLX(&l->lk.poll) == tas_free &&                       \
           __kmp_atomic_compare_store_acq(&l->lk.poll, tas_free, tas_busy);    \
  }
1021 
// Fast-path release tas lock: stores the free value into the poll word with
// release ordering.
//   lock - pointer to the lock; it is cast to kmp_tas_lock_t *
//   gtid - unused; kept so all fast-path macros share the same shape
// The lock argument is parenthesized so that arbitrary expressions can be
// passed safely.
#define KMP_RELEASE_TAS_LOCK(lock, gtid)                                       \
  {                                                                            \
    KMP_ATOMIC_ST_REL(&((kmp_tas_lock_t *)(lock))->lk.poll,                    \
                      KMP_LOCK_FREE(tas));                                     \
  }
1025 
1026 #if KMP_USE_FUTEX
1027 
1028 #include <sys/syscall.h>
1029 #include <unistd.h>
1030 #ifndef FUTEX_WAIT
1031 #define FUTEX_WAIT 0
1032 #endif
1033 #ifndef FUTEX_WAKE
1034 #define FUTEX_WAKE 1
1035 #endif
1036 
// Fast-path acquire futex lock.
//   lock - pointer to the lock; it is cast to kmp_futex_lock_t *
//   gtid - global thread id of the acquirer; the value stored in the poll word
//          is built from (gtid + 1) << 1, which leaves bit 0 clear
// The macro loops on a compare-and-store of the poll word from the free value
// to the busy value. While the lock is held by someone else it makes sure bit 0
// of the (stripped) poll value is set -- the same bit the release macro tests
// before issuing FUTEX_WAKE -- and then blocks in FUTEX_WAIT on the poll word.
// A non-zero return from the futex call simply restarts the loop. After a
// successful wait, bit 0 is also set in the value this thread will store, so
// the bit stays set once this thread owns the lock.
// Macro arguments are parenthesized so that arbitrary expressions can be
// passed safely.
#define KMP_ACQUIRE_FUTEX_LOCK(lock, gtid)                                     \
  {                                                                            \
    kmp_futex_lock_t *ftx = (kmp_futex_lock_t *)(lock);                        \
    kmp_int32 gtid_code = ((gtid) + 1) << 1;                                   \
    KMP_MB();                                                                  \
    KMP_FSYNC_PREPARE(ftx);                                                    \
    kmp_int32 poll_val;                                                        \
    while ((poll_val = KMP_COMPARE_AND_STORE_RET32(                            \
                &(ftx->lk.poll), KMP_LOCK_FREE(futex),                         \
                KMP_LOCK_BUSY(gtid_code, futex))) != KMP_LOCK_FREE(futex)) {   \
      kmp_int32 cond = KMP_LOCK_STRIP(poll_val) & 1;                           \
      if (!cond) {                                                             \
        if (!KMP_COMPARE_AND_STORE_RET32(&(ftx->lk.poll), poll_val,            \
                                         poll_val |                            \
                                             KMP_LOCK_BUSY(1, futex))) {       \
          continue;                                                            \
        }                                                                      \
        poll_val |= KMP_LOCK_BUSY(1, futex);                                   \
      }                                                                        \
      kmp_int32 rc;                                                            \
      if ((rc = syscall(__NR_futex, &(ftx->lk.poll), FUTEX_WAIT, poll_val,     \
                        NULL, NULL, 0)) != 0) {                                \
        continue;                                                              \
      }                                                                        \
      gtid_code |= 1;                                                          \
    }                                                                          \
    KMP_FSYNC_ACQUIRED(ftx);                                                   \
  }
1066 
// Fast-path test futex lock.
//   lock - pointer to the lock; it is cast to kmp_futex_lock_t *
//   gtid - global thread id of the caller; the value stored in the poll word
//          is built from (gtid + 1) << 1
//   rc   - lvalue that receives TRUE when the single acquire compare-and-store
//          from the free value succeeded, FALSE otherwise
// The shift is written with explicit parentheses (it previously relied on '+'
// binding tighter than '<<'), and macro arguments are parenthesized so that
// arbitrary expressions can be passed safely.
#define KMP_TEST_FUTEX_LOCK(lock, gtid, rc)                                    \
  {                                                                            \
    kmp_futex_lock_t *ftx = (kmp_futex_lock_t *)(lock);                        \
    if (KMP_COMPARE_AND_STORE_ACQ32(                                           \
            &(ftx->lk.poll), KMP_LOCK_FREE(futex),                             \
            KMP_LOCK_BUSY(((gtid) + 1) << 1, futex))) {                        \
      KMP_FSYNC_ACQUIRED(ftx);                                                 \
      (rc) = TRUE;                                                             \
    } else {                                                                   \
      (rc) = FALSE;                                                            \
    }                                                                          \
  }
1079 
// Fast-path release futex lock.
//   lock - pointer to the lock; it is cast to kmp_futex_lock_t *
//   gtid - unused; kept so all fast-path macros share the same shape
// The poll word is atomically exchanged with the free value. If bit 0 of the
// previous (stripped) value was set, FUTEX_WAKE is issued on the poll word.
// The macro ends with a memory barrier and KMP_YIELD_OVERSUB().
// The lock argument is parenthesized so that arbitrary expressions can be
// passed safely.
#define KMP_RELEASE_FUTEX_LOCK(lock, gtid)                                     \
  {                                                                            \
    kmp_futex_lock_t *ftx = (kmp_futex_lock_t *)(lock);                        \
    KMP_MB();                                                                  \
    KMP_FSYNC_RELEASING(ftx);                                                  \
    kmp_int32 poll_val =                                                       \
        KMP_XCHG_FIXED32(&(ftx->lk.poll), KMP_LOCK_FREE(futex));               \
    if (KMP_LOCK_STRIP(poll_val) & 1) {                                        \
      syscall(__NR_futex, &(ftx->lk.poll), FUTEX_WAKE,                         \
              KMP_LOCK_BUSY(1, futex), NULL, NULL, 0);                         \
    }                                                                          \
    KMP_MB();                                                                  \
    KMP_YIELD_OVERSUB();                                                       \
  }
1095 
1096 #endif // KMP_USE_FUTEX
1097 
1098 #else // KMP_USE_DYNAMIC_LOCK
1099 
1100 static kmp_user_lock_p __kmp_get_critical_section_ptr(kmp_critical_name *crit,
1101                                                       ident_t const *loc,
1102                                                       kmp_int32 gtid) {
1103   kmp_user_lock_p *lck_pp = (kmp_user_lock_p *)crit;
1104 
1105   // Because of the double-check, the following load doesn't need to be volatile
1106   kmp_user_lock_p lck = (kmp_user_lock_p)TCR_PTR(*lck_pp);
1107 
1108   if (lck == NULL) {
1109     void *idx;
1110 
1111     // Allocate & initialize the lock.
1112     // Remember alloc'ed locks in table in order to free them in __kmp_cleanup()
1113     lck = __kmp_user_lock_allocate(&idx, gtid, kmp_lf_critical_section);
1114     __kmp_init_user_lock_with_checks(lck);
1115     __kmp_set_user_lock_location(lck, loc);
1116 #if USE_ITT_BUILD
1117     __kmp_itt_critical_creating(lck);
1118 // __kmp_itt_critical_creating() should be called *before* the first usage
1119 // of underlying lock. It is the only place where we can guarantee it. There
1120 // are chances the lock will destroyed with no usage, but it is not a
1121 // problem, because this is not real event seen by user but rather setting
1122 // name for object (lock). See more details in kmp_itt.h.
1123 #endif /* USE_ITT_BUILD */
1124 
1125     // Use a cmpxchg instruction to slam the start of the critical section with
1126     // the lock pointer.  If another thread beat us to it, deallocate the lock,
1127     // and use the lock that the other thread allocated.
1128     int status = KMP_COMPARE_AND_STORE_PTR(lck_pp, 0, lck);
1129 
1130     if (status == 0) {
1131 // Deallocate the lock and reload the value.
1132 #if USE_ITT_BUILD
1133       __kmp_itt_critical_destroyed(lck);
1134 // Let ITT know the lock is destroyed and the same memory location may be reused
1135 // for another purpose.
1136 #endif /* USE_ITT_BUILD */
1137       __kmp_destroy_user_lock_with_checks(lck);
1138       __kmp_user_lock_free(&idx, gtid, lck);
1139       lck = (kmp_user_lock_p)TCR_PTR(*lck_pp);
1140       KMP_DEBUG_ASSERT(lck != NULL);
1141     }
1142   }
1143   return lck;
1144 }
1145 
1146 #endif // KMP_USE_DYNAMIC_LOCK
1147 
1148 /*!
1149 @ingroup WORK_SHARING
1150 @param loc  source location information.
1151 @param global_tid  global thread number .
1152 @param crit identity of the critical section. This could be a pointer to a lock
1153 associated with the critical section, or some other suitably unique value.
1154 
1155 Enter code protected by a `critical` construct.
1156 This function blocks until the executing thread can enter the critical section.
1157 */
1158 void __kmpc_critical(ident_t *loc, kmp_int32 global_tid,
1159                      kmp_critical_name *crit) {
1160 #if KMP_USE_DYNAMIC_LOCK
1161 #if OMPT_SUPPORT && OMPT_OPTIONAL
1162   OMPT_STORE_RETURN_ADDRESS(global_tid);
1163 #endif // OMPT_SUPPORT
1164   __kmpc_critical_with_hint(loc, global_tid, crit, omp_lock_hint_none);
1165 #else
1166   KMP_COUNT_BLOCK(OMP_CRITICAL);
1167 #if OMPT_SUPPORT && OMPT_OPTIONAL
1168   ompt_state_t prev_state = ompt_state_undefined;
1169   ompt_thread_info_t ti;
1170 #endif
1171   kmp_user_lock_p lck;
1172 
1173   KC_TRACE(10, ("__kmpc_critical: called T#%d\n", global_tid));
1174 
1175   // TODO: add THR_OVHD_STATE
1176 
1177   KMP_PUSH_PARTITIONED_TIMER(OMP_critical_wait);
1178   KMP_CHECK_USER_LOCK_INIT();
1179 
1180   if ((__kmp_user_lock_kind == lk_tas) &&
1181       (sizeof(lck->tas.lk.poll) <= OMP_CRITICAL_SIZE)) {
1182     lck = (kmp_user_lock_p)crit;
1183   }
1184 #if KMP_USE_FUTEX
1185   else if ((__kmp_user_lock_kind == lk_futex) &&
1186            (sizeof(lck->futex.lk.poll) <= OMP_CRITICAL_SIZE)) {
1187     lck = (kmp_user_lock_p)crit;
1188   }
1189 #endif
1190   else { // ticket, queuing or drdpa
1191     lck = __kmp_get_critical_section_ptr(crit, loc, global_tid);
1192   }
1193 
1194   if (__kmp_env_consistency_check)
1195     __kmp_push_sync(global_tid, ct_critical, loc, lck);
1196 
1197 // since the critical directive binds to all threads, not just the current
1198 // team we have to check this even if we are in a serialized team.
1199 // also, even if we are the uber thread, we still have to conduct the lock,
1200 // as we have to contend with sibling threads.
1201 
1202 #if USE_ITT_BUILD
1203   __kmp_itt_critical_acquiring(lck);
1204 #endif /* USE_ITT_BUILD */
1205 #if OMPT_SUPPORT && OMPT_OPTIONAL
1206   OMPT_STORE_RETURN_ADDRESS(gtid);
1207   void *codeptr_ra = NULL;
1208   if (ompt_enabled.enabled) {
1209     ti = __kmp_threads[global_tid]->th.ompt_thread_info;
1210     /* OMPT state update */
1211     prev_state = ti.state;
1212     ti.wait_id = (ompt_wait_id_t)(uintptr_t)lck;
1213     ti.state = ompt_state_wait_critical;
1214 
1215     /* OMPT event callback */
1216     codeptr_ra = OMPT_LOAD_RETURN_ADDRESS(gtid);
1217     if (ompt_enabled.ompt_callback_mutex_acquire) {
1218       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
1219           ompt_mutex_critical, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
1220           (ompt_wait_id_t)(uintptr_t)lck, codeptr_ra);
1221     }
1222   }
1223 #endif
1224   // Value of 'crit' should be good for using as a critical_id of the critical
1225   // section directive.
1226   __kmp_acquire_user_lock_with_checks(lck, global_tid);
1227 
1228 #if USE_ITT_BUILD
1229   __kmp_itt_critical_acquired(lck);
1230 #endif /* USE_ITT_BUILD */
1231 #if OMPT_SUPPORT && OMPT_OPTIONAL
1232   if (ompt_enabled.enabled) {
1233     /* OMPT state update */
1234     ti.state = prev_state;
1235     ti.wait_id = 0;
1236 
1237     /* OMPT event callback */
1238     if (ompt_enabled.ompt_callback_mutex_acquired) {
1239       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
1240           ompt_mutex_critical, (ompt_wait_id_t)(uintptr_t)lck, codeptr_ra);
1241     }
1242   }
1243 #endif
1244   KMP_POP_PARTITIONED_TIMER();
1245 
1246   KMP_PUSH_PARTITIONED_TIMER(OMP_critical);
1247   KA_TRACE(15, ("__kmpc_critical: done T#%d\n", global_tid));
1248 #endif // KMP_USE_DYNAMIC_LOCK
1249 }
1250 
1251 #if KMP_USE_DYNAMIC_LOCK
1252 
1253 // Converts the given hint to an internal lock implementation
1254 static __forceinline kmp_dyna_lockseq_t __kmp_map_hint_to_lock(uintptr_t hint) {
1255 #if KMP_USE_TSX
1256 #define KMP_TSX_LOCK(seq) lockseq_##seq
1257 #else
1258 #define KMP_TSX_LOCK(seq) __kmp_user_lock_seq
1259 #endif
1260 
1261 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
1262 #define KMP_CPUINFO_RTM (__kmp_cpuinfo.rtm)
1263 #else
1264 #define KMP_CPUINFO_RTM 0
1265 #endif
1266 
1267   // Hints that do not require further logic
1268   if (hint & kmp_lock_hint_hle)
1269     return KMP_TSX_LOCK(hle);
1270   if (hint & kmp_lock_hint_rtm)
1271     return KMP_CPUINFO_RTM ? KMP_TSX_LOCK(rtm) : __kmp_user_lock_seq;
1272   if (hint & kmp_lock_hint_adaptive)
1273     return KMP_CPUINFO_RTM ? KMP_TSX_LOCK(adaptive) : __kmp_user_lock_seq;
1274 
1275   // Rule out conflicting hints first by returning the default lock
1276   if ((hint & omp_lock_hint_contended) && (hint & omp_lock_hint_uncontended))
1277     return __kmp_user_lock_seq;
1278   if ((hint & omp_lock_hint_speculative) &&
1279       (hint & omp_lock_hint_nonspeculative))
1280     return __kmp_user_lock_seq;
1281 
1282   // Do not even consider speculation when it appears to be contended
1283   if (hint & omp_lock_hint_contended)
1284     return lockseq_queuing;
1285 
1286   // Uncontended lock without speculation
1287   if ((hint & omp_lock_hint_uncontended) && !(hint & omp_lock_hint_speculative))
1288     return lockseq_tas;
1289 
1290   // HLE lock for speculation
1291   if (hint & omp_lock_hint_speculative)
1292     return KMP_TSX_LOCK(hle);
1293 
1294   return __kmp_user_lock_seq;
1295 }
1296 
1297 #if OMPT_SUPPORT && OMPT_OPTIONAL
1298 #if KMP_USE_DYNAMIC_LOCK
1299 static kmp_mutex_impl_t
1300 __ompt_get_mutex_impl_type(void *user_lock, kmp_indirect_lock_t *ilock = 0) {
1301   if (user_lock) {
1302     switch (KMP_EXTRACT_D_TAG(user_lock)) {
1303     case 0:
1304       break;
1305 #if KMP_USE_FUTEX
1306     case locktag_futex:
1307       return kmp_mutex_impl_queuing;
1308 #endif
1309     case locktag_tas:
1310       return kmp_mutex_impl_spin;
1311 #if KMP_USE_TSX
1312     case locktag_hle:
1313       return kmp_mutex_impl_speculative;
1314 #endif
1315     default:
1316       return kmp_mutex_impl_none;
1317     }
1318     ilock = KMP_LOOKUP_I_LOCK(user_lock);
1319   }
1320   KMP_ASSERT(ilock);
1321   switch (ilock->type) {
1322 #if KMP_USE_TSX
1323   case locktag_adaptive:
1324   case locktag_rtm:
1325     return kmp_mutex_impl_speculative;
1326 #endif
1327   case locktag_nested_tas:
1328     return kmp_mutex_impl_spin;
1329 #if KMP_USE_FUTEX
1330   case locktag_nested_futex:
1331 #endif
1332   case locktag_ticket:
1333   case locktag_queuing:
1334   case locktag_drdpa:
1335   case locktag_nested_ticket:
1336   case locktag_nested_queuing:
1337   case locktag_nested_drdpa:
1338     return kmp_mutex_impl_queuing;
1339   default:
1340     return kmp_mutex_impl_none;
1341   }
1342 }
1343 #else
1344 // For locks without dynamic binding
1345 static kmp_mutex_impl_t __ompt_get_mutex_impl_type() {
1346   switch (__kmp_user_lock_kind) {
1347   case lk_tas:
1348     return kmp_mutex_impl_spin;
1349 #if KMP_USE_FUTEX
1350   case lk_futex:
1351 #endif
1352   case lk_ticket:
1353   case lk_queuing:
1354   case lk_drdpa:
1355     return kmp_mutex_impl_queuing;
1356 #if KMP_USE_TSX
1357   case lk_hle:
1358   case lk_rtm:
1359   case lk_adaptive:
1360     return kmp_mutex_impl_speculative;
1361 #endif
1362   default:
1363     return kmp_mutex_impl_none;
1364   }
1365 }
1366 #endif // KMP_USE_DYNAMIC_LOCK
1367 #endif // OMPT_SUPPORT && OMPT_OPTIONAL
1368 
1369 /*!
1370 @ingroup WORK_SHARING
1371 @param loc  source location information.
1372 @param global_tid  global thread number.
1373 @param crit identity of the critical section. This could be a pointer to a lock
1374 associated with the critical section, or some other suitably unique value.
1375 @param hint the lock hint.
1376 
1377 Enter code protected by a `critical` construct with a hint. The hint value is
1378 used to suggest a lock implementation. This function blocks until the executing
1379 thread can enter the critical section unless the hint suggests use of
1380 speculative execution and the hardware supports it.
1381 */
1382 void __kmpc_critical_with_hint(ident_t *loc, kmp_int32 global_tid,
1383                                kmp_critical_name *crit, uint32_t hint) {
1384   KMP_COUNT_BLOCK(OMP_CRITICAL);
1385   kmp_user_lock_p lck;
1386 #if OMPT_SUPPORT && OMPT_OPTIONAL
1387   ompt_state_t prev_state = ompt_state_undefined;
1388   ompt_thread_info_t ti;
1389   // This is the case, if called from __kmpc_critical:
1390   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(global_tid);
1391   if (!codeptr)
1392     codeptr = OMPT_GET_RETURN_ADDRESS(0);
1393 #endif
1394 
1395   KC_TRACE(10, ("__kmpc_critical: called T#%d\n", global_tid));
1396 
1397   kmp_dyna_lock_t *lk = (kmp_dyna_lock_t *)crit;
1398   // Check if it is initialized.
1399   KMP_PUSH_PARTITIONED_TIMER(OMP_critical_wait);
1400   if (*lk == 0) {
1401     kmp_dyna_lockseq_t lckseq = __kmp_map_hint_to_lock(hint);
1402     if (KMP_IS_D_LOCK(lckseq)) {
1403       KMP_COMPARE_AND_STORE_ACQ32((volatile kmp_int32 *)crit, 0,
1404                                   KMP_GET_D_TAG(lckseq));
1405     } else {
1406       __kmp_init_indirect_csptr(crit, loc, global_tid, KMP_GET_I_TAG(lckseq));
1407     }
1408   }
1409   // Branch for accessing the actual lock object and set operation. This
1410   // branching is inevitable since this lock initialization does not follow the
1411   // normal dispatch path (lock table is not used).
1412   if (KMP_EXTRACT_D_TAG(lk) != 0) {
1413     lck = (kmp_user_lock_p)lk;
1414     if (__kmp_env_consistency_check) {
1415       __kmp_push_sync(global_tid, ct_critical, loc, lck,
1416                       __kmp_map_hint_to_lock(hint));
1417     }
1418 #if USE_ITT_BUILD
1419     __kmp_itt_critical_acquiring(lck);
1420 #endif
1421 #if OMPT_SUPPORT && OMPT_OPTIONAL
1422     if (ompt_enabled.enabled) {
1423       ti = __kmp_threads[global_tid]->th.ompt_thread_info;
1424       /* OMPT state update */
1425       prev_state = ti.state;
1426       ti.wait_id = (ompt_wait_id_t)(uintptr_t)lck;
1427       ti.state = ompt_state_wait_critical;
1428 
1429       /* OMPT event callback */
1430       if (ompt_enabled.ompt_callback_mutex_acquire) {
1431         ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
1432             ompt_mutex_critical, (unsigned int)hint,
1433             __ompt_get_mutex_impl_type(crit), (ompt_wait_id_t)(uintptr_t)lck,
1434             codeptr);
1435       }
1436     }
1437 #endif
1438 #if KMP_USE_INLINED_TAS
1439     if (__kmp_user_lock_seq == lockseq_tas && !__kmp_env_consistency_check) {
1440       KMP_ACQUIRE_TAS_LOCK(lck, global_tid);
1441     } else
1442 #elif KMP_USE_INLINED_FUTEX
1443     if (__kmp_user_lock_seq == lockseq_futex && !__kmp_env_consistency_check) {
1444       KMP_ACQUIRE_FUTEX_LOCK(lck, global_tid);
1445     } else
1446 #endif
1447     {
1448       KMP_D_LOCK_FUNC(lk, set)(lk, global_tid);
1449     }
1450   } else {
1451     kmp_indirect_lock_t *ilk = *((kmp_indirect_lock_t **)lk);
1452     lck = ilk->lock;
1453     if (__kmp_env_consistency_check) {
1454       __kmp_push_sync(global_tid, ct_critical, loc, lck,
1455                       __kmp_map_hint_to_lock(hint));
1456     }
1457 #if USE_ITT_BUILD
1458     __kmp_itt_critical_acquiring(lck);
1459 #endif
1460 #if OMPT_SUPPORT && OMPT_OPTIONAL
1461     if (ompt_enabled.enabled) {
1462       ti = __kmp_threads[global_tid]->th.ompt_thread_info;
1463       /* OMPT state update */
1464       prev_state = ti.state;
1465       ti.wait_id = (ompt_wait_id_t)(uintptr_t)lck;
1466       ti.state = ompt_state_wait_critical;
1467 
1468       /* OMPT event callback */
1469       if (ompt_enabled.ompt_callback_mutex_acquire) {
1470         ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
1471             ompt_mutex_critical, (unsigned int)hint,
1472             __ompt_get_mutex_impl_type(0, ilk), (ompt_wait_id_t)(uintptr_t)lck,
1473             codeptr);
1474       }
1475     }
1476 #endif
1477     KMP_I_LOCK_FUNC(ilk, set)(lck, global_tid);
1478   }
1479   KMP_POP_PARTITIONED_TIMER();
1480 
1481 #if USE_ITT_BUILD
1482   __kmp_itt_critical_acquired(lck);
1483 #endif /* USE_ITT_BUILD */
1484 #if OMPT_SUPPORT && OMPT_OPTIONAL
1485   if (ompt_enabled.enabled) {
1486     /* OMPT state update */
1487     ti.state = prev_state;
1488     ti.wait_id = 0;
1489 
1490     /* OMPT event callback */
1491     if (ompt_enabled.ompt_callback_mutex_acquired) {
1492       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
1493           ompt_mutex_critical, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
1494     }
1495   }
1496 #endif
1497 
1498   KMP_PUSH_PARTITIONED_TIMER(OMP_critical);
1499   KA_TRACE(15, ("__kmpc_critical: done T#%d\n", global_tid));
1500 } // __kmpc_critical_with_hint
1501 
1502 #endif // KMP_USE_DYNAMIC_LOCK
1503 
1504 /*!
1505 @ingroup WORK_SHARING
1506 @param loc  source location information.
1507 @param global_tid  global thread number .
1508 @param crit identity of the critical section. This could be a pointer to a lock
1509 associated with the critical section, or some other suitably unique value.
1510 
1511 Leave a critical section, releasing any lock that was held during its execution.
1512 */
1513 void __kmpc_end_critical(ident_t *loc, kmp_int32 global_tid,
1514                          kmp_critical_name *crit) {
1515   kmp_user_lock_p lck;
1516 
1517   KC_TRACE(10, ("__kmpc_end_critical: called T#%d\n", global_tid));
1518 
1519 #if KMP_USE_DYNAMIC_LOCK
1520   if (KMP_IS_D_LOCK(__kmp_user_lock_seq)) {
1521     lck = (kmp_user_lock_p)crit;
1522     KMP_ASSERT(lck != NULL);
1523     if (__kmp_env_consistency_check) {
1524       __kmp_pop_sync(global_tid, ct_critical, loc);
1525     }
1526 #if USE_ITT_BUILD
1527     __kmp_itt_critical_releasing(lck);
1528 #endif
1529 #if KMP_USE_INLINED_TAS
1530     if (__kmp_user_lock_seq == lockseq_tas && !__kmp_env_consistency_check) {
1531       KMP_RELEASE_TAS_LOCK(lck, global_tid);
1532     } else
1533 #elif KMP_USE_INLINED_FUTEX
1534     if (__kmp_user_lock_seq == lockseq_futex && !__kmp_env_consistency_check) {
1535       KMP_RELEASE_FUTEX_LOCK(lck, global_tid);
1536     } else
1537 #endif
1538     {
1539       KMP_D_LOCK_FUNC(lck, unset)((kmp_dyna_lock_t *)lck, global_tid);
1540     }
1541   } else {
1542     kmp_indirect_lock_t *ilk =
1543         (kmp_indirect_lock_t *)TCR_PTR(*((kmp_indirect_lock_t **)crit));
1544     KMP_ASSERT(ilk != NULL);
1545     lck = ilk->lock;
1546     if (__kmp_env_consistency_check) {
1547       __kmp_pop_sync(global_tid, ct_critical, loc);
1548     }
1549 #if USE_ITT_BUILD
1550     __kmp_itt_critical_releasing(lck);
1551 #endif
1552     KMP_I_LOCK_FUNC(ilk, unset)(lck, global_tid);
1553   }
1554 
1555 #else // KMP_USE_DYNAMIC_LOCK
1556 
1557   if ((__kmp_user_lock_kind == lk_tas) &&
1558       (sizeof(lck->tas.lk.poll) <= OMP_CRITICAL_SIZE)) {
1559     lck = (kmp_user_lock_p)crit;
1560   }
1561 #if KMP_USE_FUTEX
1562   else if ((__kmp_user_lock_kind == lk_futex) &&
1563            (sizeof(lck->futex.lk.poll) <= OMP_CRITICAL_SIZE)) {
1564     lck = (kmp_user_lock_p)crit;
1565   }
1566 #endif
1567   else { // ticket, queuing or drdpa
1568     lck = (kmp_user_lock_p)TCR_PTR(*((kmp_user_lock_p *)crit));
1569   }
1570 
1571   KMP_ASSERT(lck != NULL);
1572 
1573   if (__kmp_env_consistency_check)
1574     __kmp_pop_sync(global_tid, ct_critical, loc);
1575 
1576 #if USE_ITT_BUILD
1577   __kmp_itt_critical_releasing(lck);
1578 #endif /* USE_ITT_BUILD */
1579   // Value of 'crit' should be good for using as a critical_id of the critical
1580   // section directive.
1581   __kmp_release_user_lock_with_checks(lck, global_tid);
1582 
1583 #endif // KMP_USE_DYNAMIC_LOCK
1584 
1585 #if OMPT_SUPPORT && OMPT_OPTIONAL
1586   /* OMPT release event triggers after lock is released; place here to trigger
1587    * for all #if branches */
1588   OMPT_STORE_RETURN_ADDRESS(global_tid);
1589   if (ompt_enabled.ompt_callback_mutex_released) {
1590     ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
1591         ompt_mutex_critical, (ompt_wait_id_t)(uintptr_t)lck,
1592         OMPT_LOAD_RETURN_ADDRESS(0));
1593   }
1594 #endif
1595 
1596   KMP_POP_PARTITIONED_TIMER();
1597   KA_TRACE(15, ("__kmpc_end_critical: done T#%d\n", global_tid));
1598 }
1599 
1600 /*!
1601 @ingroup SYNCHRONIZATION
1602 @param loc source location information
1603 @param global_tid thread id.
1604 @return one if the thread should execute the master block, zero otherwise
1605 
1606 Start execution of a combined barrier and master. The barrier is executed inside
1607 this function.
1608 */
1609 kmp_int32 __kmpc_barrier_master(ident_t *loc, kmp_int32 global_tid) {
1610   int status;
1611 
1612   KC_TRACE(10, ("__kmpc_barrier_master: called T#%d\n", global_tid));
1613 
1614   if (!TCR_4(__kmp_init_parallel))
1615     __kmp_parallel_initialize();
1616 
1617   __kmp_resume_if_soft_paused();
1618 
1619   if (__kmp_env_consistency_check)
1620     __kmp_check_barrier(global_tid, ct_barrier, loc);
1621 
1622 #if OMPT_SUPPORT
1623   ompt_frame_t *ompt_frame;
1624   if (ompt_enabled.enabled) {
1625     __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
1626     if (ompt_frame->enter_frame.ptr == NULL)
1627       ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1628     OMPT_STORE_RETURN_ADDRESS(global_tid);
1629   }
1630 #endif
1631 #if USE_ITT_NOTIFY
1632   __kmp_threads[global_tid]->th.th_ident = loc;
1633 #endif
1634   status = __kmp_barrier(bs_plain_barrier, global_tid, TRUE, 0, NULL, NULL);
1635 #if OMPT_SUPPORT && OMPT_OPTIONAL
1636   if (ompt_enabled.enabled) {
1637     ompt_frame->enter_frame = ompt_data_none;
1638   }
1639 #endif
1640 
1641   return (status != 0) ? 0 : 1;
1642 }
1643 
1644 /*!
1645 @ingroup SYNCHRONIZATION
1646 @param loc source location information
1647 @param global_tid thread id.
1648 
1649 Complete the execution of a combined barrier and master. This function should
1650 only be called at the completion of the <tt>master</tt> code. Other threads will
1651 still be waiting at the barrier and this call releases them.
1652 */
1653 void __kmpc_end_barrier_master(ident_t *loc, kmp_int32 global_tid) {
1654   KC_TRACE(10, ("__kmpc_end_barrier_master: called T#%d\n", global_tid));
1655 
1656   __kmp_end_split_barrier(bs_plain_barrier, global_tid);
1657 }
1658 
1659 /*!
1660 @ingroup SYNCHRONIZATION
1661 @param loc source location information
1662 @param global_tid thread id.
1663 @return one if the thread should execute the master block, zero otherwise
1664 
1665 Start execution of a combined barrier and master(nowait) construct.
1666 The barrier is executed inside this function.
1667 There is no equivalent "end" function, since the
1668 */
1669 kmp_int32 __kmpc_barrier_master_nowait(ident_t *loc, kmp_int32 global_tid) {
1670   kmp_int32 ret;
1671 
1672   KC_TRACE(10, ("__kmpc_barrier_master_nowait: called T#%d\n", global_tid));
1673 
1674   if (!TCR_4(__kmp_init_parallel))
1675     __kmp_parallel_initialize();
1676 
1677   __kmp_resume_if_soft_paused();
1678 
1679   if (__kmp_env_consistency_check) {
1680     if (loc == 0) {
1681       KMP_WARNING(ConstructIdentInvalid); // ??? What does it mean for the user?
1682     }
1683     __kmp_check_barrier(global_tid, ct_barrier, loc);
1684   }
1685 
1686 #if OMPT_SUPPORT
1687   ompt_frame_t *ompt_frame;
1688   if (ompt_enabled.enabled) {
1689     __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
1690     if (ompt_frame->enter_frame.ptr == NULL)
1691       ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1692     OMPT_STORE_RETURN_ADDRESS(global_tid);
1693   }
1694 #endif
1695 #if USE_ITT_NOTIFY
1696   __kmp_threads[global_tid]->th.th_ident = loc;
1697 #endif
1698   __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
1699 #if OMPT_SUPPORT && OMPT_OPTIONAL
1700   if (ompt_enabled.enabled) {
1701     ompt_frame->enter_frame = ompt_data_none;
1702   }
1703 #endif
1704 
1705   ret = __kmpc_master(loc, global_tid);
1706 
1707   if (__kmp_env_consistency_check) {
1708     /*  there's no __kmpc_end_master called; so the (stats) */
1709     /*  actions of __kmpc_end_master are done here          */
1710 
1711     if (global_tid < 0) {
1712       KMP_WARNING(ThreadIdentInvalid);
1713     }
1714     if (ret) {
1715       /* only one thread should do the pop since only */
1716       /* one did the push (see __kmpc_master())       */
1717 
1718       __kmp_pop_sync(global_tid, ct_master, loc);
1719     }
1720   }
1721 
1722   return (ret);
1723 }
1724 
1725 /* The BARRIER for a SINGLE process section is always explicit   */
1726 /*!
1727 @ingroup WORK_SHARING
1728 @param loc  source location information
1729 @param global_tid  global thread number
1730 @return One if this thread should execute the single construct, zero otherwise.
1731 
1732 Test whether to execute a <tt>single</tt> construct.
1733 There are no implicit barriers in the two "single" calls, rather the compiler
1734 should introduce an explicit barrier if it is required.
1735 */
1736 
1737 kmp_int32 __kmpc_single(ident_t *loc, kmp_int32 global_tid) {
1738   kmp_int32 rc = __kmp_enter_single(global_tid, loc, TRUE);
1739 
1740   if (rc) {
1741     // We are going to execute the single statement, so we should count it.
1742     KMP_COUNT_BLOCK(OMP_SINGLE);
1743     KMP_PUSH_PARTITIONED_TIMER(OMP_single);
1744   }
1745 
1746 #if OMPT_SUPPORT && OMPT_OPTIONAL
1747   kmp_info_t *this_thr = __kmp_threads[global_tid];
1748   kmp_team_t *team = this_thr->th.th_team;
1749   int tid = __kmp_tid_from_gtid(global_tid);
1750 
1751   if (ompt_enabled.enabled) {
1752     if (rc) {
1753       if (ompt_enabled.ompt_callback_work) {
1754         ompt_callbacks.ompt_callback(ompt_callback_work)(
1755             ompt_work_single_executor, ompt_scope_begin,
1756             &(team->t.ompt_team_info.parallel_data),
1757             &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
1758             1, OMPT_GET_RETURN_ADDRESS(0));
1759       }
1760     } else {
1761       if (ompt_enabled.ompt_callback_work) {
1762         ompt_callbacks.ompt_callback(ompt_callback_work)(
1763             ompt_work_single_other, ompt_scope_begin,
1764             &(team->t.ompt_team_info.parallel_data),
1765             &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
1766             1, OMPT_GET_RETURN_ADDRESS(0));
1767         ompt_callbacks.ompt_callback(ompt_callback_work)(
1768             ompt_work_single_other, ompt_scope_end,
1769             &(team->t.ompt_team_info.parallel_data),
1770             &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
1771             1, OMPT_GET_RETURN_ADDRESS(0));
1772       }
1773     }
1774   }
1775 #endif
1776 
1777   return rc;
1778 }
1779 
1780 /*!
1781 @ingroup WORK_SHARING
1782 @param loc  source location information
1783 @param global_tid  global thread number
1784 
1785 Mark the end of a <tt>single</tt> construct.  This function should
1786 only be called by the thread that executed the block of code protected
1787 by the `single` construct.
1788 */
1789 void __kmpc_end_single(ident_t *loc, kmp_int32 global_tid) {
1790   __kmp_exit_single(global_tid);
1791   KMP_POP_PARTITIONED_TIMER();
1792 
1793 #if OMPT_SUPPORT && OMPT_OPTIONAL
1794   kmp_info_t *this_thr = __kmp_threads[global_tid];
1795   kmp_team_t *team = this_thr->th.th_team;
1796   int tid = __kmp_tid_from_gtid(global_tid);
1797 
1798   if (ompt_enabled.ompt_callback_work) {
1799     ompt_callbacks.ompt_callback(ompt_callback_work)(
1800         ompt_work_single_executor, ompt_scope_end,
1801         &(team->t.ompt_team_info.parallel_data),
1802         &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data), 1,
1803         OMPT_GET_RETURN_ADDRESS(0));
1804   }
1805 #endif
1806 }
1807 
1808 /*!
1809 @ingroup WORK_SHARING
1810 @param loc Source location
1811 @param global_tid Global thread id
1812 
1813 Mark the end of a statically scheduled loop.
1814 */
1815 void __kmpc_for_static_fini(ident_t *loc, kmp_int32 global_tid) {
1816   KMP_POP_PARTITIONED_TIMER();
1817   KE_TRACE(10, ("__kmpc_for_static_fini called T#%d\n", global_tid));
1818 
1819 #if OMPT_SUPPORT && OMPT_OPTIONAL
1820   if (ompt_enabled.ompt_callback_work) {
1821     ompt_work_t ompt_work_type = ompt_work_loop;
1822     ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
1823     ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
1824     // Determine workshare type
1825     if (loc != NULL) {
1826       if ((loc->flags & KMP_IDENT_WORK_LOOP) != 0) {
1827         ompt_work_type = ompt_work_loop;
1828       } else if ((loc->flags & KMP_IDENT_WORK_SECTIONS) != 0) {
1829         ompt_work_type = ompt_work_sections;
1830       } else if ((loc->flags & KMP_IDENT_WORK_DISTRIBUTE) != 0) {
1831         ompt_work_type = ompt_work_distribute;
1832       } else {
1833         // use default set above.
1834         // a warning about this case is provided in __kmpc_for_static_init
1835       }
1836       KMP_DEBUG_ASSERT(ompt_work_type);
1837     }
1838     ompt_callbacks.ompt_callback(ompt_callback_work)(
1839         ompt_work_type, ompt_scope_end, &(team_info->parallel_data),
1840         &(task_info->task_data), 0, OMPT_GET_RETURN_ADDRESS(0));
1841   }
1842 #endif
1843   if (__kmp_env_consistency_check)
1844     __kmp_pop_workshare(global_tid, ct_pdo, loc);
1845 }
1846 
1847 // User routines which take C-style arguments (call by value)
1848 // different from the Fortran equivalent routines
1849 
1850 void ompc_set_num_threads(int arg) {
1851   // !!!!! TODO: check the per-task binding
1852   __kmp_set_num_threads(arg, __kmp_entry_gtid());
1853 }
1854 
1855 void ompc_set_dynamic(int flag) {
1856   kmp_info_t *thread;
1857 
1858   /* For the thread-private implementation of the internal controls */
1859   thread = __kmp_entry_thread();
1860 
1861   __kmp_save_internal_controls(thread);
1862 
1863   set__dynamic(thread, flag ? TRUE : FALSE);
1864 }
1865 
1866 void ompc_set_nested(int flag) {
1867   kmp_info_t *thread;
1868 
1869   /* For the thread-private internal controls implementation */
1870   thread = __kmp_entry_thread();
1871 
1872   __kmp_save_internal_controls(thread);
1873 
1874   set__max_active_levels(thread, flag ? __kmp_dflt_max_active_levels : 1);
1875 }
1876 
1877 void ompc_set_max_active_levels(int max_active_levels) {
1878   /* TO DO */
1879   /* we want per-task implementation of this internal control */
1880 
1881   /* For the per-thread internal controls implementation */
1882   __kmp_set_max_active_levels(__kmp_entry_gtid(), max_active_levels);
1883 }
1884 
1885 void ompc_set_schedule(omp_sched_t kind, int modifier) {
1886   // !!!!! TODO: check the per-task binding
1887   __kmp_set_schedule(__kmp_entry_gtid(), (kmp_sched_t)kind, modifier);
1888 }
1889 
1890 int ompc_get_ancestor_thread_num(int level) {
1891   return __kmp_get_ancestor_thread_num(__kmp_entry_gtid(), level);
1892 }
1893 
1894 int ompc_get_team_size(int level) {
1895   return __kmp_get_team_size(__kmp_entry_gtid(), level);
1896 }
1897 
1898 /* OpenMP 5.0 Affinity Format API */
1899 
1900 void ompc_set_affinity_format(char const *format) {
1901   if (!__kmp_init_serial) {
1902     __kmp_serial_initialize();
1903   }
1904   __kmp_strncpy_truncate(__kmp_affinity_format, KMP_AFFINITY_FORMAT_SIZE,
1905                          format, KMP_STRLEN(format) + 1);
1906 }
1907 
1908 size_t ompc_get_affinity_format(char *buffer, size_t size) {
1909   size_t format_size;
1910   if (!__kmp_init_serial) {
1911     __kmp_serial_initialize();
1912   }
1913   format_size = KMP_STRLEN(__kmp_affinity_format);
1914   if (buffer && size) {
1915     __kmp_strncpy_truncate(buffer, size, __kmp_affinity_format,
1916                            format_size + 1);
1917   }
1918   return format_size;
1919 }
1920 
1921 void ompc_display_affinity(char const *format) {
1922   int gtid;
1923   if (!TCR_4(__kmp_init_middle)) {
1924     __kmp_middle_initialize();
1925   }
1926   gtid = __kmp_get_gtid();
1927   __kmp_aux_display_affinity(gtid, format);
1928 }
1929 
1930 size_t ompc_capture_affinity(char *buffer, size_t buf_size,
1931                              char const *format) {
1932   int gtid;
1933   size_t num_required;
1934   kmp_str_buf_t capture_buf;
1935   if (!TCR_4(__kmp_init_middle)) {
1936     __kmp_middle_initialize();
1937   }
1938   gtid = __kmp_get_gtid();
1939   __kmp_str_buf_init(&capture_buf);
1940   num_required = __kmp_aux_capture_affinity(gtid, format, &capture_buf);
1941   if (buffer && buf_size) {
1942     __kmp_strncpy_truncate(buffer, buf_size, capture_buf.str,
1943                            capture_buf.used + 1);
1944   }
1945   __kmp_str_buf_free(&capture_buf);
1946   return num_required;
1947 }
1948 
1949 void kmpc_set_stacksize(int arg) {
1950   // __kmp_aux_set_stacksize initializes the library if needed
1951   __kmp_aux_set_stacksize(arg);
1952 }
1953 
// size_t flavour of kmpc_set_stacksize: forwards the requested stack size
// unchanged.
void kmpc_set_stacksize_s(size_t arg) {
  // __kmp_aux_set_stacksize initializes the library if needed
  __kmp_aux_set_stacksize(arg);
}
1958 
1959 void kmpc_set_blocktime(int arg) {
1960   int gtid, tid;
1961   kmp_info_t *thread;
1962 
1963   gtid = __kmp_entry_gtid();
1964   tid = __kmp_tid_from_gtid(gtid);
1965   thread = __kmp_thread_from_gtid(gtid);
1966 
1967   __kmp_aux_set_blocktime(arg, thread, tid);
1968 }
1969 
1970 void kmpc_set_library(int arg) {
1971   // __kmp_user_set_library initializes the library if needed
1972   __kmp_user_set_library((enum library_type)arg);
1973 }
1974 
1975 void kmpc_set_defaults(char const *str) {
1976   // __kmp_aux_set_defaults initializes the library if needed
1977   __kmp_aux_set_defaults(str, KMP_STRLEN(str));
1978 }
1979 
1980 void kmpc_set_disp_num_buffers(int arg) {
1981   // ignore after initialization because some teams have already
1982   // allocated dispatch buffers
1983   if (__kmp_init_serial == 0 && arg > 0)
1984     __kmp_dispatch_num_buffers = arg;
1985 }
1986 
// Forward to __kmp_aux_set_affinity_mask_proc for `proc` and `mask`.
// Returns -1 in stub builds or when affinity is not supported.
int kmpc_set_affinity_mask_proc(int proc, void **mask) {
#if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED
  return -1;
#else
  // Make sure middle initialization has run before touching masks.
  if (!TCR_4(__kmp_init_middle)) {
    __kmp_middle_initialize();
  }
  const int rc = __kmp_aux_set_affinity_mask_proc(proc, mask);
  return rc;
#endif
}
1997 
// Forward to __kmp_aux_unset_affinity_mask_proc for `proc` and `mask`.
// Returns -1 in stub builds or when affinity is not supported.
int kmpc_unset_affinity_mask_proc(int proc, void **mask) {
#if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED
  return -1;
#else
  // Make sure middle initialization has run before touching masks.
  if (!TCR_4(__kmp_init_middle)) {
    __kmp_middle_initialize();
  }
  const int rc = __kmp_aux_unset_affinity_mask_proc(proc, mask);
  return rc;
#endif
}
2008 
// Forward to __kmp_aux_get_affinity_mask_proc for `proc` and `mask`.
// Returns -1 in stub builds or when affinity is not supported.
int kmpc_get_affinity_mask_proc(int proc, void **mask) {
#if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED
  return -1;
#else
  // Make sure middle initialization has run before touching masks.
  if (!TCR_4(__kmp_init_middle)) {
    __kmp_middle_initialize();
  }
  const int rc = __kmp_aux_get_affinity_mask_proc(proc, mask);
  return rc;
#endif
}
2019 
2020 /* -------------------------------------------------------------------------- */
2021 /*!
2022 @ingroup THREADPRIVATE
2023 @param loc       source location information
2024 @param gtid      global thread number
2025 @param cpy_size  size of the cpy_data buffer
2026 @param cpy_data  pointer to data to be copied
2027 @param cpy_func  helper function to call for copying data
2028 @param didit     flag variable: 1=single thread; 0=not single thread
2029 
2030 __kmpc_copyprivate implements the interface for the private data broadcast
2031 needed for the copyprivate clause associated with a single region in an
2032 OpenMP<sup>*</sup> program (both C and Fortran).
2033 All threads participating in the parallel region call this routine.
2034 One of the threads (called the single thread) should have the <tt>didit</tt>
2035 variable set to 1 and all other threads should have that variable set to 0.
2036 All threads pass a pointer to a data buffer (cpy_data) that they have built.
2037 
2038 The OpenMP specification forbids the use of nowait on the single region when a
2039 copyprivate clause is present. However, @ref __kmpc_copyprivate implements a
2040 barrier internally to avoid race conditions, so the code generation for the
2041 single region should avoid generating a barrier after the call to @ref
2042 __kmpc_copyprivate.
2043 
2044 The <tt>gtid</tt> parameter is the global thread id for the current thread.
2045 The <tt>loc</tt> parameter is a pointer to source location information.
2046 
2047 Internal implementation: The single thread will first copy its descriptor
2048 address (cpy_data) to a team-private location, then the other threads will each
2049 call the function pointed to by the parameter cpy_func, which carries out the
2050 copy by copying the data using the cpy_data buffer.
2051 
2052 The cpy_func routine used for the copy and the contents of the data area defined
2053 by cpy_data and cpy_size may be built in any fashion that will allow the copy
2054 to be done. For instance, the cpy_data buffer can hold the actual data to be
2055 copied or it may hold a list of pointers to the data. The cpy_func routine must
2056 interpret the cpy_data buffer appropriately.
2057 
2058 The interface to cpy_func is as follows:
2059 @code
2060 void cpy_func( void *destination, void *source )
2061 @endcode
2062 where void *destination is the cpy_data pointer for the thread being copied to
2063 and void *source is the cpy_data pointer for the thread being copied from.
2064 */
2065 void __kmpc_copyprivate(ident_t *loc, kmp_int32 gtid, size_t cpy_size,
2066                         void *cpy_data, void (*cpy_func)(void *, void *),
2067                         kmp_int32 didit) {
2068   void **data_ptr;
2069 
2070   KC_TRACE(10, ("__kmpc_copyprivate: called T#%d\n", gtid));
2071 
2072   KMP_MB();
2073 
2074   data_ptr = &__kmp_team_from_gtid(gtid)->t.t_copypriv_data;
2075 
2076   if (__kmp_env_consistency_check) {
2077     if (loc == 0) {
2078       KMP_WARNING(ConstructIdentInvalid);
2079     }
2080   }
2081 
2082   // ToDo: Optimize the following two barriers into some kind of split barrier
2083 
2084   if (didit)
2085     *data_ptr = cpy_data;
2086 
2087 #if OMPT_SUPPORT
2088   ompt_frame_t *ompt_frame;
2089   if (ompt_enabled.enabled) {
2090     __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
2091     if (ompt_frame->enter_frame.ptr == NULL)
2092       ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
2093     OMPT_STORE_RETURN_ADDRESS(gtid);
2094   }
2095 #endif
2096 /* This barrier is not a barrier region boundary */
2097 #if USE_ITT_NOTIFY
2098   __kmp_threads[gtid]->th.th_ident = loc;
2099 #endif
2100   __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);
2101 
2102   if (!didit)
2103     (*cpy_func)(cpy_data, *data_ptr);
2104 
2105 // Consider next barrier a user-visible barrier for barrier region boundaries
2106 // Nesting checks are already handled by the single construct checks
2107 
2108 #if OMPT_SUPPORT
2109   if (ompt_enabled.enabled) {
2110     OMPT_STORE_RETURN_ADDRESS(gtid);
2111   }
2112 #endif
2113 #if USE_ITT_NOTIFY
2114   __kmp_threads[gtid]->th.th_ident = loc; // TODO: check if it is needed (e.g.
2115 // tasks can overwrite the location)
2116 #endif
2117   __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);
2118 #if OMPT_SUPPORT && OMPT_OPTIONAL
2119   if (ompt_enabled.enabled) {
2120     ompt_frame->enter_frame = ompt_data_none;
2121   }
2122 #endif
2123 }
2124 
2125 /* -------------------------------------------------------------------------- */
2126 
// Short aliases for the "with_checks" user-lock routines: simple locks.
#define INIT_LOCK __kmp_init_user_lock_with_checks
#define ACQUIRE_LOCK __kmp_acquire_user_lock_with_checks
#define ACQUIRE_LOCK_TIMED __kmp_acquire_user_lock_with_checks_timed
#define RELEASE_LOCK __kmp_release_user_lock_with_checks
#define TEST_LOCK __kmp_test_user_lock_with_checks
#define DESTROY_LOCK __kmp_destroy_user_lock_with_checks

// Short aliases for the "with_checks" user-lock routines: nested locks.
#define INIT_NESTED_LOCK __kmp_init_nested_user_lock_with_checks
#define ACQUIRE_NESTED_LOCK __kmp_acquire_nested_user_lock_with_checks
#define ACQUIRE_NESTED_LOCK_TIMED                                              \
  __kmp_acquire_nested_user_lock_with_checks_timed
#define RELEASE_NESTED_LOCK __kmp_release_nested_user_lock_with_checks
#define TEST_NESTED_LOCK __kmp_test_nested_user_lock_with_checks
#define DESTROY_NESTED_LOCK __kmp_destroy_nested_user_lock_with_checks
2140 
2141 // TODO: Make check abort messages use location info & pass it into
2142 // with_checks routines
2143 
2144 #if KMP_USE_DYNAMIC_LOCK
2145 
2146 // internal lock initializer
2147 static __forceinline void __kmp_init_lock_with_hint(ident_t *loc, void **lock,
2148                                                     kmp_dyna_lockseq_t seq) {
2149   if (KMP_IS_D_LOCK(seq)) {
2150     KMP_INIT_D_LOCK(lock, seq);
2151 #if USE_ITT_BUILD
2152     __kmp_itt_lock_creating((kmp_user_lock_p)lock, NULL);
2153 #endif
2154   } else {
2155     KMP_INIT_I_LOCK(lock, seq);
2156 #if USE_ITT_BUILD
2157     kmp_indirect_lock_t *ilk = KMP_LOOKUP_I_LOCK(lock);
2158     __kmp_itt_lock_creating(ilk->lock, loc);
2159 #endif
2160   }
2161 }
2162 
2163 // internal nest lock initializer
2164 static __forceinline void
2165 __kmp_init_nest_lock_with_hint(ident_t *loc, void **lock,
2166                                kmp_dyna_lockseq_t seq) {
2167 #if KMP_USE_TSX
2168   // Don't have nested lock implementation for speculative locks
2169   if (seq == lockseq_hle || seq == lockseq_rtm || seq == lockseq_adaptive)
2170     seq = __kmp_user_lock_seq;
2171 #endif
2172   switch (seq) {
2173   case lockseq_tas:
2174     seq = lockseq_nested_tas;
2175     break;
2176 #if KMP_USE_FUTEX
2177   case lockseq_futex:
2178     seq = lockseq_nested_futex;
2179     break;
2180 #endif
2181   case lockseq_ticket:
2182     seq = lockseq_nested_ticket;
2183     break;
2184   case lockseq_queuing:
2185     seq = lockseq_nested_queuing;
2186     break;
2187   case lockseq_drdpa:
2188     seq = lockseq_nested_drdpa;
2189     break;
2190   default:
2191     seq = lockseq_nested_queuing;
2192   }
2193   KMP_INIT_I_LOCK(lock, seq);
2194 #if USE_ITT_BUILD
2195   kmp_indirect_lock_t *ilk = KMP_LOOKUP_I_LOCK(lock);
2196   __kmp_itt_lock_creating(ilk->lock, loc);
2197 #endif
2198 }
2199 
2200 /* initialize the lock with a hint */
2201 void __kmpc_init_lock_with_hint(ident_t *loc, kmp_int32 gtid, void **user_lock,
2202                                 uintptr_t hint) {
2203   KMP_DEBUG_ASSERT(__kmp_init_serial);
2204   if (__kmp_env_consistency_check && user_lock == NULL) {
2205     KMP_FATAL(LockIsUninitialized, "omp_init_lock_with_hint");
2206   }
2207 
2208   __kmp_init_lock_with_hint(loc, user_lock, __kmp_map_hint_to_lock(hint));
2209 
2210 #if OMPT_SUPPORT && OMPT_OPTIONAL
2211   // This is the case, if called from omp_init_lock_with_hint:
2212   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2213   if (!codeptr)
2214     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2215   if (ompt_enabled.ompt_callback_lock_init) {
2216     ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2217         ompt_mutex_lock, (omp_lock_hint_t)hint,
2218         __ompt_get_mutex_impl_type(user_lock),
2219         (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2220   }
2221 #endif
2222 }
2223 
2224 /* initialize the lock with a hint */
2225 void __kmpc_init_nest_lock_with_hint(ident_t *loc, kmp_int32 gtid,
2226                                      void **user_lock, uintptr_t hint) {
2227   KMP_DEBUG_ASSERT(__kmp_init_serial);
2228   if (__kmp_env_consistency_check && user_lock == NULL) {
2229     KMP_FATAL(LockIsUninitialized, "omp_init_nest_lock_with_hint");
2230   }
2231 
2232   __kmp_init_nest_lock_with_hint(loc, user_lock, __kmp_map_hint_to_lock(hint));
2233 
2234 #if OMPT_SUPPORT && OMPT_OPTIONAL
2235   // This is the case, if called from omp_init_lock_with_hint:
2236   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2237   if (!codeptr)
2238     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2239   if (ompt_enabled.ompt_callback_lock_init) {
2240     ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2241         ompt_mutex_nest_lock, (omp_lock_hint_t)hint,
2242         __ompt_get_mutex_impl_type(user_lock),
2243         (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2244   }
2245 #endif
2246 }
2247 
2248 #endif // KMP_USE_DYNAMIC_LOCK
2249 
2250 /* initialize the lock */
2251 void __kmpc_init_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2252 #if KMP_USE_DYNAMIC_LOCK
2253 
2254   KMP_DEBUG_ASSERT(__kmp_init_serial);
2255   if (__kmp_env_consistency_check && user_lock == NULL) {
2256     KMP_FATAL(LockIsUninitialized, "omp_init_lock");
2257   }
2258   __kmp_init_lock_with_hint(loc, user_lock, __kmp_user_lock_seq);
2259 
2260 #if OMPT_SUPPORT && OMPT_OPTIONAL
2261   // This is the case, if called from omp_init_lock_with_hint:
2262   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2263   if (!codeptr)
2264     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2265   if (ompt_enabled.ompt_callback_lock_init) {
2266     ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2267         ompt_mutex_lock, omp_lock_hint_none,
2268         __ompt_get_mutex_impl_type(user_lock),
2269         (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2270   }
2271 #endif
2272 
2273 #else // KMP_USE_DYNAMIC_LOCK
2274 
2275   static char const *const func = "omp_init_lock";
2276   kmp_user_lock_p lck;
2277   KMP_DEBUG_ASSERT(__kmp_init_serial);
2278 
2279   if (__kmp_env_consistency_check) {
2280     if (user_lock == NULL) {
2281       KMP_FATAL(LockIsUninitialized, func);
2282     }
2283   }
2284 
2285   KMP_CHECK_USER_LOCK_INIT();
2286 
2287   if ((__kmp_user_lock_kind == lk_tas) &&
2288       (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
2289     lck = (kmp_user_lock_p)user_lock;
2290   }
2291 #if KMP_USE_FUTEX
2292   else if ((__kmp_user_lock_kind == lk_futex) &&
2293            (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
2294     lck = (kmp_user_lock_p)user_lock;
2295   }
2296 #endif
2297   else {
2298     lck = __kmp_user_lock_allocate(user_lock, gtid, 0);
2299   }
2300   INIT_LOCK(lck);
2301   __kmp_set_user_lock_location(lck, loc);
2302 
2303 #if OMPT_SUPPORT && OMPT_OPTIONAL
2304   // This is the case, if called from omp_init_lock_with_hint:
2305   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2306   if (!codeptr)
2307     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2308   if (ompt_enabled.ompt_callback_lock_init) {
2309     ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2310         ompt_mutex_lock, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
2311         (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2312   }
2313 #endif
2314 
2315 #if USE_ITT_BUILD
2316   __kmp_itt_lock_creating(lck);
2317 #endif /* USE_ITT_BUILD */
2318 
2319 #endif // KMP_USE_DYNAMIC_LOCK
2320 } // __kmpc_init_lock
2321 
2322 /* initialize the lock */
2323 void __kmpc_init_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2324 #if KMP_USE_DYNAMIC_LOCK
2325 
2326   KMP_DEBUG_ASSERT(__kmp_init_serial);
2327   if (__kmp_env_consistency_check && user_lock == NULL) {
2328     KMP_FATAL(LockIsUninitialized, "omp_init_nest_lock");
2329   }
2330   __kmp_init_nest_lock_with_hint(loc, user_lock, __kmp_user_lock_seq);
2331 
2332 #if OMPT_SUPPORT && OMPT_OPTIONAL
2333   // This is the case, if called from omp_init_lock_with_hint:
2334   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2335   if (!codeptr)
2336     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2337   if (ompt_enabled.ompt_callback_lock_init) {
2338     ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2339         ompt_mutex_nest_lock, omp_lock_hint_none,
2340         __ompt_get_mutex_impl_type(user_lock),
2341         (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2342   }
2343 #endif
2344 
2345 #else // KMP_USE_DYNAMIC_LOCK
2346 
2347   static char const *const func = "omp_init_nest_lock";
2348   kmp_user_lock_p lck;
2349   KMP_DEBUG_ASSERT(__kmp_init_serial);
2350 
2351   if (__kmp_env_consistency_check) {
2352     if (user_lock == NULL) {
2353       KMP_FATAL(LockIsUninitialized, func);
2354     }
2355   }
2356 
2357   KMP_CHECK_USER_LOCK_INIT();
2358 
2359   if ((__kmp_user_lock_kind == lk_tas) &&
2360       (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
2361        OMP_NEST_LOCK_T_SIZE)) {
2362     lck = (kmp_user_lock_p)user_lock;
2363   }
2364 #if KMP_USE_FUTEX
2365   else if ((__kmp_user_lock_kind == lk_futex) &&
2366            (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
2367             OMP_NEST_LOCK_T_SIZE)) {
2368     lck = (kmp_user_lock_p)user_lock;
2369   }
2370 #endif
2371   else {
2372     lck = __kmp_user_lock_allocate(user_lock, gtid, 0);
2373   }
2374 
2375   INIT_NESTED_LOCK(lck);
2376   __kmp_set_user_lock_location(lck, loc);
2377 
2378 #if OMPT_SUPPORT && OMPT_OPTIONAL
2379   // This is the case, if called from omp_init_lock_with_hint:
2380   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2381   if (!codeptr)
2382     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2383   if (ompt_enabled.ompt_callback_lock_init) {
2384     ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2385         ompt_mutex_nest_lock, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
2386         (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2387   }
2388 #endif
2389 
2390 #if USE_ITT_BUILD
2391   __kmp_itt_lock_creating(lck);
2392 #endif /* USE_ITT_BUILD */
2393 
2394 #endif // KMP_USE_DYNAMIC_LOCK
2395 } // __kmpc_init_nest_lock
2396 
2397 void __kmpc_destroy_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2398 #if KMP_USE_DYNAMIC_LOCK
2399 
2400 #if USE_ITT_BUILD
2401   kmp_user_lock_p lck;
2402   if (KMP_EXTRACT_D_TAG(user_lock) == 0) {
2403     lck = ((kmp_indirect_lock_t *)KMP_LOOKUP_I_LOCK(user_lock))->lock;
2404   } else {
2405     lck = (kmp_user_lock_p)user_lock;
2406   }
2407   __kmp_itt_lock_destroyed(lck);
2408 #endif
2409 #if OMPT_SUPPORT && OMPT_OPTIONAL
2410   // This is the case, if called from omp_init_lock_with_hint:
2411   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2412   if (!codeptr)
2413     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2414   if (ompt_enabled.ompt_callback_lock_destroy) {
2415     kmp_user_lock_p lck;
2416     if (KMP_EXTRACT_D_TAG(user_lock) == 0) {
2417       lck = ((kmp_indirect_lock_t *)KMP_LOOKUP_I_LOCK(user_lock))->lock;
2418     } else {
2419       lck = (kmp_user_lock_p)user_lock;
2420     }
2421     ompt_callbacks.ompt_callback(ompt_callback_lock_destroy)(
2422         ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2423   }
2424 #endif
2425   KMP_D_LOCK_FUNC(user_lock, destroy)((kmp_dyna_lock_t *)user_lock);
2426 #else
2427   kmp_user_lock_p lck;
2428 
2429   if ((__kmp_user_lock_kind == lk_tas) &&
2430       (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
2431     lck = (kmp_user_lock_p)user_lock;
2432   }
2433 #if KMP_USE_FUTEX
2434   else if ((__kmp_user_lock_kind == lk_futex) &&
2435            (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
2436     lck = (kmp_user_lock_p)user_lock;
2437   }
2438 #endif
2439   else {
2440     lck = __kmp_lookup_user_lock(user_lock, "omp_destroy_lock");
2441   }
2442 
2443 #if OMPT_SUPPORT && OMPT_OPTIONAL
2444   // This is the case, if called from omp_init_lock_with_hint:
2445   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2446   if (!codeptr)
2447     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2448   if (ompt_enabled.ompt_callback_lock_destroy) {
2449     ompt_callbacks.ompt_callback(ompt_callback_lock_destroy)(
2450         ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2451   }
2452 #endif
2453 
2454 #if USE_ITT_BUILD
2455   __kmp_itt_lock_destroyed(lck);
2456 #endif /* USE_ITT_BUILD */
2457   DESTROY_LOCK(lck);
2458 
2459   if ((__kmp_user_lock_kind == lk_tas) &&
2460       (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
2461     ;
2462   }
2463 #if KMP_USE_FUTEX
2464   else if ((__kmp_user_lock_kind == lk_futex) &&
2465            (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
2466     ;
2467   }
2468 #endif
2469   else {
2470     __kmp_user_lock_free(user_lock, gtid, lck);
2471   }
2472 #endif // KMP_USE_DYNAMIC_LOCK
2473 } // __kmpc_destroy_lock
2474 
2475 /* destroy the lock */
2476 void __kmpc_destroy_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2477 #if KMP_USE_DYNAMIC_LOCK
2478 
2479 #if USE_ITT_BUILD
2480   kmp_indirect_lock_t *ilk = KMP_LOOKUP_I_LOCK(user_lock);
2481   __kmp_itt_lock_destroyed(ilk->lock);
2482 #endif
2483 #if OMPT_SUPPORT && OMPT_OPTIONAL
2484   // This is the case, if called from omp_init_lock_with_hint:
2485   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2486   if (!codeptr)
2487     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2488   if (ompt_enabled.ompt_callback_lock_destroy) {
2489     ompt_callbacks.ompt_callback(ompt_callback_lock_destroy)(
2490         ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2491   }
2492 #endif
2493   KMP_D_LOCK_FUNC(user_lock, destroy)((kmp_dyna_lock_t *)user_lock);
2494 
2495 #else // KMP_USE_DYNAMIC_LOCK
2496 
2497   kmp_user_lock_p lck;
2498 
2499   if ((__kmp_user_lock_kind == lk_tas) &&
2500       (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
2501        OMP_NEST_LOCK_T_SIZE)) {
2502     lck = (kmp_user_lock_p)user_lock;
2503   }
2504 #if KMP_USE_FUTEX
2505   else if ((__kmp_user_lock_kind == lk_futex) &&
2506            (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
2507             OMP_NEST_LOCK_T_SIZE)) {
2508     lck = (kmp_user_lock_p)user_lock;
2509   }
2510 #endif
2511   else {
2512     lck = __kmp_lookup_user_lock(user_lock, "omp_destroy_nest_lock");
2513   }
2514 
2515 #if OMPT_SUPPORT && OMPT_OPTIONAL
2516   // This is the case, if called from omp_init_lock_with_hint:
2517   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2518   if (!codeptr)
2519     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2520   if (ompt_enabled.ompt_callback_lock_destroy) {
2521     ompt_callbacks.ompt_callback(ompt_callback_lock_destroy)(
2522         ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2523   }
2524 #endif
2525 
2526 #if USE_ITT_BUILD
2527   __kmp_itt_lock_destroyed(lck);
2528 #endif /* USE_ITT_BUILD */
2529 
2530   DESTROY_NESTED_LOCK(lck);
2531 
2532   if ((__kmp_user_lock_kind == lk_tas) &&
2533       (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
2534        OMP_NEST_LOCK_T_SIZE)) {
2535     ;
2536   }
2537 #if KMP_USE_FUTEX
2538   else if ((__kmp_user_lock_kind == lk_futex) &&
2539            (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
2540             OMP_NEST_LOCK_T_SIZE)) {
2541     ;
2542   }
2543 #endif
2544   else {
2545     __kmp_user_lock_free(user_lock, gtid, lck);
2546   }
2547 #endif // KMP_USE_DYNAMIC_LOCK
2548 } // __kmpc_destroy_nest_lock
2549 
2550 void __kmpc_set_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2551   KMP_COUNT_BLOCK(OMP_set_lock);
2552 #if KMP_USE_DYNAMIC_LOCK
2553   int tag = KMP_EXTRACT_D_TAG(user_lock);
2554 #if USE_ITT_BUILD
2555   __kmp_itt_lock_acquiring(
2556       (kmp_user_lock_p)
2557           user_lock); // itt function will get to the right lock object.
2558 #endif
2559 #if OMPT_SUPPORT && OMPT_OPTIONAL
2560   // This is the case, if called from omp_init_lock_with_hint:
2561   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2562   if (!codeptr)
2563     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2564   if (ompt_enabled.ompt_callback_mutex_acquire) {
2565     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
2566         ompt_mutex_lock, omp_lock_hint_none,
2567         __ompt_get_mutex_impl_type(user_lock),
2568         (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2569   }
2570 #endif
2571 #if KMP_USE_INLINED_TAS
2572   if (tag == locktag_tas && !__kmp_env_consistency_check) {
2573     KMP_ACQUIRE_TAS_LOCK(user_lock, gtid);
2574   } else
2575 #elif KMP_USE_INLINED_FUTEX
2576   if (tag == locktag_futex && !__kmp_env_consistency_check) {
2577     KMP_ACQUIRE_FUTEX_LOCK(user_lock, gtid);
2578   } else
2579 #endif
2580   {
2581     __kmp_direct_set[tag]((kmp_dyna_lock_t *)user_lock, gtid);
2582   }
2583 #if USE_ITT_BUILD
2584   __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock);
2585 #endif
2586 #if OMPT_SUPPORT && OMPT_OPTIONAL
2587   if (ompt_enabled.ompt_callback_mutex_acquired) {
2588     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
2589         ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2590   }
2591 #endif
2592 
2593 #else // KMP_USE_DYNAMIC_LOCK
2594 
2595   kmp_user_lock_p lck;
2596 
2597   if ((__kmp_user_lock_kind == lk_tas) &&
2598       (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
2599     lck = (kmp_user_lock_p)user_lock;
2600   }
2601 #if KMP_USE_FUTEX
2602   else if ((__kmp_user_lock_kind == lk_futex) &&
2603            (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
2604     lck = (kmp_user_lock_p)user_lock;
2605   }
2606 #endif
2607   else {
2608     lck = __kmp_lookup_user_lock(user_lock, "omp_set_lock");
2609   }
2610 
2611 #if USE_ITT_BUILD
2612   __kmp_itt_lock_acquiring(lck);
2613 #endif /* USE_ITT_BUILD */
2614 #if OMPT_SUPPORT && OMPT_OPTIONAL
2615   // This is the case, if called from omp_init_lock_with_hint:
2616   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2617   if (!codeptr)
2618     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2619   if (ompt_enabled.ompt_callback_mutex_acquire) {
2620     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
2621         ompt_mutex_lock, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
2622         (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2623   }
2624 #endif
2625 
2626   ACQUIRE_LOCK(lck, gtid);
2627 
2628 #if USE_ITT_BUILD
2629   __kmp_itt_lock_acquired(lck);
2630 #endif /* USE_ITT_BUILD */
2631 
2632 #if OMPT_SUPPORT && OMPT_OPTIONAL
2633   if (ompt_enabled.ompt_callback_mutex_acquired) {
2634     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
2635         ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2636   }
2637 #endif
2638 
2639 #endif // KMP_USE_DYNAMIC_LOCK
2640 }
2641 
2642 void __kmpc_set_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2643 #if KMP_USE_DYNAMIC_LOCK
2644 
2645 #if USE_ITT_BUILD
2646   __kmp_itt_lock_acquiring((kmp_user_lock_p)user_lock);
2647 #endif
2648 #if OMPT_SUPPORT && OMPT_OPTIONAL
2649   // This is the case, if called from omp_init_lock_with_hint:
2650   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2651   if (!codeptr)
2652     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2653   if (ompt_enabled.enabled) {
2654     if (ompt_enabled.ompt_callback_mutex_acquire) {
2655       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
2656           ompt_mutex_nest_lock, omp_lock_hint_none,
2657           __ompt_get_mutex_impl_type(user_lock),
2658           (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2659     }
2660   }
2661 #endif
2662   int acquire_status =
2663       KMP_D_LOCK_FUNC(user_lock, set)((kmp_dyna_lock_t *)user_lock, gtid);
2664   (void) acquire_status;
2665 #if USE_ITT_BUILD
2666   __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock);
2667 #endif
2668 
2669 #if OMPT_SUPPORT && OMPT_OPTIONAL
2670   if (ompt_enabled.enabled) {
2671     if (acquire_status == KMP_LOCK_ACQUIRED_FIRST) {
2672       if (ompt_enabled.ompt_callback_mutex_acquired) {
2673         // lock_first
2674         ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
2675             ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)user_lock,
2676             codeptr);
2677       }
2678     } else {
2679       if (ompt_enabled.ompt_callback_nest_lock) {
2680         // lock_next
2681         ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
2682             ompt_scope_begin, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2683       }
2684     }
2685   }
2686 #endif
2687 
2688 #else // KMP_USE_DYNAMIC_LOCK
2689   int acquire_status;
2690   kmp_user_lock_p lck;
2691 
2692   if ((__kmp_user_lock_kind == lk_tas) &&
2693       (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
2694        OMP_NEST_LOCK_T_SIZE)) {
2695     lck = (kmp_user_lock_p)user_lock;
2696   }
2697 #if KMP_USE_FUTEX
2698   else if ((__kmp_user_lock_kind == lk_futex) &&
2699            (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
2700             OMP_NEST_LOCK_T_SIZE)) {
2701     lck = (kmp_user_lock_p)user_lock;
2702   }
2703 #endif
2704   else {
2705     lck = __kmp_lookup_user_lock(user_lock, "omp_set_nest_lock");
2706   }
2707 
2708 #if USE_ITT_BUILD
2709   __kmp_itt_lock_acquiring(lck);
2710 #endif /* USE_ITT_BUILD */
2711 #if OMPT_SUPPORT && OMPT_OPTIONAL
2712   // This is the case, if called from omp_init_lock_with_hint:
2713   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2714   if (!codeptr)
2715     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2716   if (ompt_enabled.enabled) {
2717     if (ompt_enabled.ompt_callback_mutex_acquire) {
2718       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
2719           ompt_mutex_nest_lock, omp_lock_hint_none,
2720           __ompt_get_mutex_impl_type(), (ompt_wait_id_t)(uintptr_t)lck,
2721           codeptr);
2722     }
2723   }
2724 #endif
2725 
2726   ACQUIRE_NESTED_LOCK(lck, gtid, &acquire_status);
2727 
2728 #if USE_ITT_BUILD
2729   __kmp_itt_lock_acquired(lck);
2730 #endif /* USE_ITT_BUILD */
2731 
2732 #if OMPT_SUPPORT && OMPT_OPTIONAL
2733   if (ompt_enabled.enabled) {
2734     if (acquire_status == KMP_LOCK_ACQUIRED_FIRST) {
2735       if (ompt_enabled.ompt_callback_mutex_acquired) {
2736         // lock_first
2737         ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
2738             ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2739       }
2740     } else {
2741       if (ompt_enabled.ompt_callback_nest_lock) {
2742         // lock_next
2743         ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
2744             ompt_scope_begin, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2745       }
2746     }
2747   }
2748 #endif
2749 
2750 #endif // KMP_USE_DYNAMIC_LOCK
2751 }
2752 
2753 void __kmpc_unset_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2754 #if KMP_USE_DYNAMIC_LOCK
2755 
2756   int tag = KMP_EXTRACT_D_TAG(user_lock);
2757 #if USE_ITT_BUILD
2758   __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock);
2759 #endif
2760 #if KMP_USE_INLINED_TAS
2761   if (tag == locktag_tas && !__kmp_env_consistency_check) {
2762     KMP_RELEASE_TAS_LOCK(user_lock, gtid);
2763   } else
2764 #elif KMP_USE_INLINED_FUTEX
2765   if (tag == locktag_futex && !__kmp_env_consistency_check) {
2766     KMP_RELEASE_FUTEX_LOCK(user_lock, gtid);
2767   } else
2768 #endif
2769   {
2770     __kmp_direct_unset[tag]((kmp_dyna_lock_t *)user_lock, gtid);
2771   }
2772 
2773 #if OMPT_SUPPORT && OMPT_OPTIONAL
2774   // This is the case, if called from omp_init_lock_with_hint:
2775   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2776   if (!codeptr)
2777     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2778   if (ompt_enabled.ompt_callback_mutex_released) {
2779     ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
2780         ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2781   }
2782 #endif
2783 
2784 #else // KMP_USE_DYNAMIC_LOCK
2785 
2786   kmp_user_lock_p lck;
2787 
2788   /* Can't use serial interval since not block structured */
2789   /* release the lock */
2790 
2791   if ((__kmp_user_lock_kind == lk_tas) &&
2792       (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
2793 #if KMP_OS_LINUX &&                                                            \
2794     (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
2795 // "fast" path implemented to fix customer performance issue
2796 #if USE_ITT_BUILD
2797     __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock);
2798 #endif /* USE_ITT_BUILD */
2799     TCW_4(((kmp_user_lock_p)user_lock)->tas.lk.poll, 0);
2800     KMP_MB();
2801 
2802 #if OMPT_SUPPORT && OMPT_OPTIONAL
2803     // This is the case, if called from omp_init_lock_with_hint:
2804     void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2805     if (!codeptr)
2806       codeptr = OMPT_GET_RETURN_ADDRESS(0);
2807     if (ompt_enabled.ompt_callback_mutex_released) {
2808       ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
2809           ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2810     }
2811 #endif
2812 
2813     return;
2814 #else
2815     lck = (kmp_user_lock_p)user_lock;
2816 #endif
2817   }
2818 #if KMP_USE_FUTEX
2819   else if ((__kmp_user_lock_kind == lk_futex) &&
2820            (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
2821     lck = (kmp_user_lock_p)user_lock;
2822   }
2823 #endif
2824   else {
2825     lck = __kmp_lookup_user_lock(user_lock, "omp_unset_lock");
2826   }
2827 
2828 #if USE_ITT_BUILD
2829   __kmp_itt_lock_releasing(lck);
2830 #endif /* USE_ITT_BUILD */
2831 
2832   RELEASE_LOCK(lck, gtid);
2833 
2834 #if OMPT_SUPPORT && OMPT_OPTIONAL
2835   // This is the case, if called from omp_init_lock_with_hint:
2836   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2837   if (!codeptr)
2838     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2839   if (ompt_enabled.ompt_callback_mutex_released) {
2840     ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
2841         ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2842   }
2843 #endif
2844 
2845 #endif // KMP_USE_DYNAMIC_LOCK
2846 }
2847 
2848 /* release the lock */
2849 void __kmpc_unset_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2850 #if KMP_USE_DYNAMIC_LOCK
2851 
2852 #if USE_ITT_BUILD
2853   __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock);
2854 #endif
2855   int release_status =
2856       KMP_D_LOCK_FUNC(user_lock, unset)((kmp_dyna_lock_t *)user_lock, gtid);
2857   (void) release_status;
2858 
2859 #if OMPT_SUPPORT && OMPT_OPTIONAL
2860   // This is the case, if called from omp_init_lock_with_hint:
2861   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2862   if (!codeptr)
2863     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2864   if (ompt_enabled.enabled) {
2865     if (release_status == KMP_LOCK_RELEASED) {
2866       if (ompt_enabled.ompt_callback_mutex_released) {
2867         // release_lock_last
2868         ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
2869             ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)user_lock,
2870             codeptr);
2871       }
2872     } else if (ompt_enabled.ompt_callback_nest_lock) {
2873       // release_lock_prev
2874       ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
2875           ompt_scope_end, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2876     }
2877   }
2878 #endif
2879 
2880 #else // KMP_USE_DYNAMIC_LOCK
2881 
2882   kmp_user_lock_p lck;
2883 
2884   /* Can't use serial interval since not block structured */
2885 
2886   if ((__kmp_user_lock_kind == lk_tas) &&
2887       (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
2888        OMP_NEST_LOCK_T_SIZE)) {
2889 #if KMP_OS_LINUX &&                                                            \
2890     (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
2891     // "fast" path implemented to fix customer performance issue
2892     kmp_tas_lock_t *tl = (kmp_tas_lock_t *)user_lock;
2893 #if USE_ITT_BUILD
2894     __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock);
2895 #endif /* USE_ITT_BUILD */
2896 
2897 #if OMPT_SUPPORT && OMPT_OPTIONAL
2898     int release_status = KMP_LOCK_STILL_HELD;
2899 #endif
2900 
2901     if (--(tl->lk.depth_locked) == 0) {
2902       TCW_4(tl->lk.poll, 0);
2903 #if OMPT_SUPPORT && OMPT_OPTIONAL
2904       release_status = KMP_LOCK_RELEASED;
2905 #endif
2906     }
2907     KMP_MB();
2908 
2909 #if OMPT_SUPPORT && OMPT_OPTIONAL
2910     // This is the case, if called from omp_init_lock_with_hint:
2911     void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2912     if (!codeptr)
2913       codeptr = OMPT_GET_RETURN_ADDRESS(0);
2914     if (ompt_enabled.enabled) {
2915       if (release_status == KMP_LOCK_RELEASED) {
2916         if (ompt_enabled.ompt_callback_mutex_released) {
2917           // release_lock_last
2918           ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
2919               ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2920         }
2921       } else if (ompt_enabled.ompt_callback_nest_lock) {
2922         // release_lock_previous
2923         ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
2924             ompt_mutex_scope_end, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2925       }
2926     }
2927 #endif
2928 
2929     return;
2930 #else
2931     lck = (kmp_user_lock_p)user_lock;
2932 #endif
2933   }
2934 #if KMP_USE_FUTEX
2935   else if ((__kmp_user_lock_kind == lk_futex) &&
2936            (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
2937             OMP_NEST_LOCK_T_SIZE)) {
2938     lck = (kmp_user_lock_p)user_lock;
2939   }
2940 #endif
2941   else {
2942     lck = __kmp_lookup_user_lock(user_lock, "omp_unset_nest_lock");
2943   }
2944 
2945 #if USE_ITT_BUILD
2946   __kmp_itt_lock_releasing(lck);
2947 #endif /* USE_ITT_BUILD */
2948 
2949   int release_status;
2950   release_status = RELEASE_NESTED_LOCK(lck, gtid);
2951 #if OMPT_SUPPORT && OMPT_OPTIONAL
2952   // This is the case, if called from omp_init_lock_with_hint:
2953   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2954   if (!codeptr)
2955     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2956   if (ompt_enabled.enabled) {
2957     if (release_status == KMP_LOCK_RELEASED) {
2958       if (ompt_enabled.ompt_callback_mutex_released) {
2959         // release_lock_last
2960         ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
2961             ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2962       }
2963     } else if (ompt_enabled.ompt_callback_nest_lock) {
2964       // release_lock_previous
2965       ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
2966           ompt_mutex_scope_end, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2967     }
2968   }
2969 #endif
2970 
2971 #endif // KMP_USE_DYNAMIC_LOCK
2972 }
2973 
2974 /* try to acquire the lock */
2975 int __kmpc_test_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2976   KMP_COUNT_BLOCK(OMP_test_lock);
2977 
2978 #if KMP_USE_DYNAMIC_LOCK
2979   int rc;
2980   int tag = KMP_EXTRACT_D_TAG(user_lock);
2981 #if USE_ITT_BUILD
2982   __kmp_itt_lock_acquiring((kmp_user_lock_p)user_lock);
2983 #endif
2984 #if OMPT_SUPPORT && OMPT_OPTIONAL
2985   // This is the case, if called from omp_init_lock_with_hint:
2986   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2987   if (!codeptr)
2988     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2989   if (ompt_enabled.ompt_callback_mutex_acquire) {
2990     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
2991         ompt_mutex_lock, omp_lock_hint_none,
2992         __ompt_get_mutex_impl_type(user_lock),
2993         (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2994   }
2995 #endif
2996 #if KMP_USE_INLINED_TAS
2997   if (tag == locktag_tas && !__kmp_env_consistency_check) {
2998     KMP_TEST_TAS_LOCK(user_lock, gtid, rc);
2999   } else
3000 #elif KMP_USE_INLINED_FUTEX
3001   if (tag == locktag_futex && !__kmp_env_consistency_check) {
3002     KMP_TEST_FUTEX_LOCK(user_lock, gtid, rc);
3003   } else
3004 #endif
3005   {
3006     rc = __kmp_direct_test[tag]((kmp_dyna_lock_t *)user_lock, gtid);
3007   }
3008   if (rc) {
3009 #if USE_ITT_BUILD
3010     __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock);
3011 #endif
3012 #if OMPT_SUPPORT && OMPT_OPTIONAL
3013     if (ompt_enabled.ompt_callback_mutex_acquired) {
3014       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
3015           ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
3016     }
3017 #endif
3018     return FTN_TRUE;
3019   } else {
3020 #if USE_ITT_BUILD
3021     __kmp_itt_lock_cancelled((kmp_user_lock_p)user_lock);
3022 #endif
3023     return FTN_FALSE;
3024   }
3025 
3026 #else // KMP_USE_DYNAMIC_LOCK
3027 
3028   kmp_user_lock_p lck;
3029   int rc;
3030 
3031   if ((__kmp_user_lock_kind == lk_tas) &&
3032       (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
3033     lck = (kmp_user_lock_p)user_lock;
3034   }
3035 #if KMP_USE_FUTEX
3036   else if ((__kmp_user_lock_kind == lk_futex) &&
3037            (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
3038     lck = (kmp_user_lock_p)user_lock;
3039   }
3040 #endif
3041   else {
3042     lck = __kmp_lookup_user_lock(user_lock, "omp_test_lock");
3043   }
3044 
3045 #if USE_ITT_BUILD
3046   __kmp_itt_lock_acquiring(lck);
3047 #endif /* USE_ITT_BUILD */
3048 #if OMPT_SUPPORT && OMPT_OPTIONAL
3049   // This is the case, if called from omp_init_lock_with_hint:
3050   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
3051   if (!codeptr)
3052     codeptr = OMPT_GET_RETURN_ADDRESS(0);
3053   if (ompt_enabled.ompt_callback_mutex_acquire) {
3054     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
3055         ompt_mutex_lock, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
3056         (ompt_wait_id_t)(uintptr_t)lck, codeptr);
3057   }
3058 #endif
3059 
3060   rc = TEST_LOCK(lck, gtid);
3061 #if USE_ITT_BUILD
3062   if (rc) {
3063     __kmp_itt_lock_acquired(lck);
3064   } else {
3065     __kmp_itt_lock_cancelled(lck);
3066   }
3067 #endif /* USE_ITT_BUILD */
3068 #if OMPT_SUPPORT && OMPT_OPTIONAL
3069   if (rc && ompt_enabled.ompt_callback_mutex_acquired) {
3070     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
3071         ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
3072   }
3073 #endif
3074 
3075   return (rc ? FTN_TRUE : FTN_FALSE);
3076 
3077 /* Can't use serial interval since not block structured */
3078 
3079 #endif // KMP_USE_DYNAMIC_LOCK
3080 }
3081 
3082 /* try to acquire the lock */
3083 int __kmpc_test_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
3084 #if KMP_USE_DYNAMIC_LOCK
3085   int rc;
3086 #if USE_ITT_BUILD
3087   __kmp_itt_lock_acquiring((kmp_user_lock_p)user_lock);
3088 #endif
3089 #if OMPT_SUPPORT && OMPT_OPTIONAL
3090   // This is the case, if called from omp_init_lock_with_hint:
3091   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
3092   if (!codeptr)
3093     codeptr = OMPT_GET_RETURN_ADDRESS(0);
3094   if (ompt_enabled.ompt_callback_mutex_acquire) {
3095     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
3096         ompt_mutex_nest_lock, omp_lock_hint_none,
3097         __ompt_get_mutex_impl_type(user_lock),
3098         (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
3099   }
3100 #endif
3101   rc = KMP_D_LOCK_FUNC(user_lock, test)((kmp_dyna_lock_t *)user_lock, gtid);
3102 #if USE_ITT_BUILD
3103   if (rc) {
3104     __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock);
3105   } else {
3106     __kmp_itt_lock_cancelled((kmp_user_lock_p)user_lock);
3107   }
3108 #endif
3109 #if OMPT_SUPPORT && OMPT_OPTIONAL
3110   if (ompt_enabled.enabled && rc) {
3111     if (rc == 1) {
3112       if (ompt_enabled.ompt_callback_mutex_acquired) {
3113         // lock_first
3114         ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
3115             ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)user_lock,
3116             codeptr);
3117       }
3118     } else {
3119       if (ompt_enabled.ompt_callback_nest_lock) {
3120         // lock_next
3121         ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
3122             ompt_scope_begin, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
3123       }
3124     }
3125   }
3126 #endif
3127   return rc;
3128 
3129 #else // KMP_USE_DYNAMIC_LOCK
3130 
3131   kmp_user_lock_p lck;
3132   int rc;
3133 
3134   if ((__kmp_user_lock_kind == lk_tas) &&
3135       (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
3136        OMP_NEST_LOCK_T_SIZE)) {
3137     lck = (kmp_user_lock_p)user_lock;
3138   }
3139 #if KMP_USE_FUTEX
3140   else if ((__kmp_user_lock_kind == lk_futex) &&
3141            (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
3142             OMP_NEST_LOCK_T_SIZE)) {
3143     lck = (kmp_user_lock_p)user_lock;
3144   }
3145 #endif
3146   else {
3147     lck = __kmp_lookup_user_lock(user_lock, "omp_test_nest_lock");
3148   }
3149 
3150 #if USE_ITT_BUILD
3151   __kmp_itt_lock_acquiring(lck);
3152 #endif /* USE_ITT_BUILD */
3153 
3154 #if OMPT_SUPPORT && OMPT_OPTIONAL
3155   // This is the case, if called from omp_init_lock_with_hint:
3156   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
3157   if (!codeptr)
3158     codeptr = OMPT_GET_RETURN_ADDRESS(0);
3159   if (ompt_enabled.enabled) &&
3160         ompt_enabled.ompt_callback_mutex_acquire) {
3161       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
3162           ompt_mutex_nest_lock, omp_lock_hint_none,
3163           __ompt_get_mutex_impl_type(), (ompt_wait_id_t)(uintptr_t)lck,
3164           codeptr);
3165     }
3166 #endif
3167 
3168   rc = TEST_NESTED_LOCK(lck, gtid);
3169 #if USE_ITT_BUILD
3170   if (rc) {
3171     __kmp_itt_lock_acquired(lck);
3172   } else {
3173     __kmp_itt_lock_cancelled(lck);
3174   }
3175 #endif /* USE_ITT_BUILD */
3176 #if OMPT_SUPPORT && OMPT_OPTIONAL
3177   if (ompt_enabled.enabled && rc) {
3178     if (rc == 1) {
3179       if (ompt_enabled.ompt_callback_mutex_acquired) {
3180         // lock_first
3181         ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
3182             ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
3183       }
3184     } else {
3185       if (ompt_enabled.ompt_callback_nest_lock) {
3186         // lock_next
3187         ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
3188             ompt_mutex_scope_begin, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
3189       }
3190     }
3191   }
3192 #endif
3193   return rc;
3194 
3195 /* Can't use serial interval since not block structured */
3196 
3197 #endif // KMP_USE_DYNAMIC_LOCK
3198 }
3199 
3200 // Interface to fast scalable reduce methods routines
3201 
3202 // keep the selected method in a thread local structure for cross-function
3203 // usage: will be used in __kmpc_end_reduce* functions;
3204 // another solution: to re-determine the method one more time in
3205 // __kmpc_end_reduce* functions (new prototype required then)
3206 // AT: which solution is better?
// Store `method` as the packed reduction method in the th_local area of the
// thread whose global id is `tid`.
#define __KMP_SET_REDUCTION_METHOD(tid, method)                                \
  ((__kmp_threads[(tid)]->th.th_local.packed_reduction_method) = (method))

// Read back the packed reduction method stored for the thread whose global id
// is `tid`.
#define __KMP_GET_REDUCTION_METHOD(tid)                                        \
  (__kmp_threads[(tid)]->th.th_local.packed_reduction_method)
3212 
3213 // description of the packed_reduction_method variable: look at the macros in
3214 // kmp.h
3215 
3216 // used in a critical section reduce block
3217 static __forceinline void
3218 __kmp_enter_critical_section_reduce_block(ident_t *loc, kmp_int32 global_tid,
3219                                           kmp_critical_name *crit) {
3220 
3221   // this lock was visible to a customer and to the threading profile tool as a
3222   // serial overhead span (although it's used for an internal purpose only)
3223   //            why was it visible in previous implementation?
3224   //            should we keep it visible in new reduce block?
3225   kmp_user_lock_p lck;
3226 
3227 #if KMP_USE_DYNAMIC_LOCK
3228 
3229   kmp_dyna_lock_t *lk = (kmp_dyna_lock_t *)crit;
3230   // Check if it is initialized.
3231   if (*lk == 0) {
3232     if (KMP_IS_D_LOCK(__kmp_user_lock_seq)) {
3233       KMP_COMPARE_AND_STORE_ACQ32((volatile kmp_int32 *)crit, 0,
3234                                   KMP_GET_D_TAG(__kmp_user_lock_seq));
3235     } else {
3236       __kmp_init_indirect_csptr(crit, loc, global_tid,
3237                                 KMP_GET_I_TAG(__kmp_user_lock_seq));
3238     }
3239   }
3240   // Branch for accessing the actual lock object and set operation. This
3241   // branching is inevitable since this lock initialization does not follow the
3242   // normal dispatch path (lock table is not used).
3243   if (KMP_EXTRACT_D_TAG(lk) != 0) {
3244     lck = (kmp_user_lock_p)lk;
3245     KMP_DEBUG_ASSERT(lck != NULL);
3246     if (__kmp_env_consistency_check) {
3247       __kmp_push_sync(global_tid, ct_critical, loc, lck, __kmp_user_lock_seq);
3248     }
3249     KMP_D_LOCK_FUNC(lk, set)(lk, global_tid);
3250   } else {
3251     kmp_indirect_lock_t *ilk = *((kmp_indirect_lock_t **)lk);
3252     lck = ilk->lock;
3253     KMP_DEBUG_ASSERT(lck != NULL);
3254     if (__kmp_env_consistency_check) {
3255       __kmp_push_sync(global_tid, ct_critical, loc, lck, __kmp_user_lock_seq);
3256     }
3257     KMP_I_LOCK_FUNC(ilk, set)(lck, global_tid);
3258   }
3259 
3260 #else // KMP_USE_DYNAMIC_LOCK
3261 
3262   // We know that the fast reduction code is only emitted by Intel compilers
3263   // with 32 byte critical sections. If there isn't enough space, then we
3264   // have to use a pointer.
3265   if (__kmp_base_user_lock_size <= INTEL_CRITICAL_SIZE) {
3266     lck = (kmp_user_lock_p)crit;
3267   } else {
3268     lck = __kmp_get_critical_section_ptr(crit, loc, global_tid);
3269   }
3270   KMP_DEBUG_ASSERT(lck != NULL);
3271 
3272   if (__kmp_env_consistency_check)
3273     __kmp_push_sync(global_tid, ct_critical, loc, lck);
3274 
3275   __kmp_acquire_user_lock_with_checks(lck, global_tid);
3276 
3277 #endif // KMP_USE_DYNAMIC_LOCK
3278 }
3279 
3280 // used in a critical section reduce block
3281 static __forceinline void
3282 __kmp_end_critical_section_reduce_block(ident_t *loc, kmp_int32 global_tid,
3283                                         kmp_critical_name *crit) {
3284 
3285   kmp_user_lock_p lck;
3286 
3287 #if KMP_USE_DYNAMIC_LOCK
3288 
3289   if (KMP_IS_D_LOCK(__kmp_user_lock_seq)) {
3290     lck = (kmp_user_lock_p)crit;
3291     if (__kmp_env_consistency_check)
3292       __kmp_pop_sync(global_tid, ct_critical, loc);
3293     KMP_D_LOCK_FUNC(lck, unset)((kmp_dyna_lock_t *)lck, global_tid);
3294   } else {
3295     kmp_indirect_lock_t *ilk =
3296         (kmp_indirect_lock_t *)TCR_PTR(*((kmp_indirect_lock_t **)crit));
3297     if (__kmp_env_consistency_check)
3298       __kmp_pop_sync(global_tid, ct_critical, loc);
3299     KMP_I_LOCK_FUNC(ilk, unset)(ilk->lock, global_tid);
3300   }
3301 
3302 #else // KMP_USE_DYNAMIC_LOCK
3303 
3304   // We know that the fast reduction code is only emitted by Intel compilers
3305   // with 32 byte critical sections. If there isn't enough space, then we have
3306   // to use a pointer.
3307   if (__kmp_base_user_lock_size > 32) {
3308     lck = *((kmp_user_lock_p *)crit);
3309     KMP_ASSERT(lck != NULL);
3310   } else {
3311     lck = (kmp_user_lock_p)crit;
3312   }
3313 
3314   if (__kmp_env_consistency_check)
3315     __kmp_pop_sync(global_tid, ct_critical, loc);
3316 
3317   __kmp_release_user_lock_with_checks(lck, global_tid);
3318 
3319 #endif // KMP_USE_DYNAMIC_LOCK
3320 } // __kmp_end_critical_section_reduce_block
3321 
3322 static __forceinline int
3323 __kmp_swap_teams_for_teams_reduction(kmp_info_t *th, kmp_team_t **team_p,
3324                                      int *task_state) {
3325   kmp_team_t *team;
3326 
3327   // Check if we are inside the teams construct?
3328   if (th->th.th_teams_microtask) {
3329     *team_p = team = th->th.th_team;
3330     if (team->t.t_level == th->th.th_teams_level) {
3331       // This is reduction at teams construct.
3332       KMP_DEBUG_ASSERT(!th->th.th_info.ds.ds_tid); // AC: check that tid == 0
3333       // Let's swap teams temporarily for the reduction.
3334       th->th.th_info.ds.ds_tid = team->t.t_master_tid;
3335       th->th.th_team = team->t.t_parent;
3336       th->th.th_team_nproc = th->th.th_team->t.t_nproc;
3337       th->th.th_task_team = th->th.th_team->t.t_task_team[0];
3338       *task_state = th->th.th_task_state;
3339       th->th.th_task_state = 0;
3340 
3341       return 1;
3342     }
3343   }
3344   return 0;
3345 }
3346 
3347 static __forceinline void
3348 __kmp_restore_swapped_teams(kmp_info_t *th, kmp_team_t *team, int task_state) {
3349   // Restore thread structure swapped in __kmp_swap_teams_for_teams_reduction.
3350   th->th.th_info.ds.ds_tid = 0;
3351   th->th.th_team = team;
3352   th->th.th_team_nproc = team->t.t_nproc;
3353   th->th.th_task_team = team->t.t_task_team[task_state];
3354   th->th.th_task_state = task_state;
3355 }
3356 
3357 /* 2.a.i. Reduce Block without a terminating barrier */
3358 /*!
3359 @ingroup SYNCHRONIZATION
3360 @param loc source location information
3361 @param global_tid global thread number
3362 @param num_vars number of items (variables) to be reduced
3363 @param reduce_size size of data in bytes to be reduced
3364 @param reduce_data pointer to data to be reduced
3365 @param reduce_func callback function providing reduction operation on two
3366 operands and returning result of reduction in lhs_data
3367 @param lck pointer to the unique lock data structure
3368 @result 1 for the master thread, 0 for all other team threads, 2 for all team
3369 threads if atomic reduction needed
3370 
3371 The nowait version is used for a reduce clause with the nowait argument.
3372 */
3373 kmp_int32
3374 __kmpc_reduce_nowait(ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars,
3375                      size_t reduce_size, void *reduce_data,
3376                      void (*reduce_func)(void *lhs_data, void *rhs_data),
3377                      kmp_critical_name *lck) {
3378 
3379   KMP_COUNT_BLOCK(REDUCE_nowait);
3380   int retval = 0;
3381   PACKED_REDUCTION_METHOD_T packed_reduction_method;
3382   kmp_info_t *th;
3383   kmp_team_t *team;
3384   int teams_swapped = 0, task_state;
3385   KA_TRACE(10, ("__kmpc_reduce_nowait() enter: called T#%d\n", global_tid));
3386 
3387   // why do we need this initialization here at all?
3388   // Reduction clause can not be used as a stand-alone directive.
3389 
3390   // do not call __kmp_serial_initialize(), it will be called by
3391   // __kmp_parallel_initialize() if needed
3392   // possible detection of false-positive race by the threadchecker ???
3393   if (!TCR_4(__kmp_init_parallel))
3394     __kmp_parallel_initialize();
3395 
3396   __kmp_resume_if_soft_paused();
3397 
3398 // check correctness of reduce block nesting
3399 #if KMP_USE_DYNAMIC_LOCK
3400   if (__kmp_env_consistency_check)
3401     __kmp_push_sync(global_tid, ct_reduce, loc, NULL, 0);
3402 #else
3403   if (__kmp_env_consistency_check)
3404     __kmp_push_sync(global_tid, ct_reduce, loc, NULL);
3405 #endif
3406 
3407   th = __kmp_thread_from_gtid(global_tid);
3408   teams_swapped = __kmp_swap_teams_for_teams_reduction(th, &team, &task_state);
3409 
3410   // packed_reduction_method value will be reused by __kmp_end_reduce* function,
3411   // the value should be kept in a variable
3412   // the variable should be either a construct-specific or thread-specific
3413   // property, not a team specific property
3414   //     (a thread can reach the next reduce block on the next construct, reduce
3415   //     method may differ on the next construct)
3416   // an ident_t "loc" parameter could be used as a construct-specific property
3417   // (what if loc == 0?)
3418   //     (if both construct-specific and team-specific variables were shared,
3419   //     then unness extra syncs should be needed)
3420   // a thread-specific variable is better regarding two issues above (next
3421   // construct and extra syncs)
3422   // a thread-specific "th_local.reduction_method" variable is used currently
3423   // each thread executes 'determine' and 'set' lines (no need to execute by one
3424   // thread, to avoid unness extra syncs)
3425 
3426   packed_reduction_method = __kmp_determine_reduction_method(
3427       loc, global_tid, num_vars, reduce_size, reduce_data, reduce_func, lck);
3428   __KMP_SET_REDUCTION_METHOD(global_tid, packed_reduction_method);
3429 
3430   if (packed_reduction_method == critical_reduce_block) {
3431 
3432     __kmp_enter_critical_section_reduce_block(loc, global_tid, lck);
3433     retval = 1;
3434 
3435   } else if (packed_reduction_method == empty_reduce_block) {
3436 
3437     // usage: if team size == 1, no synchronization is required ( Intel
3438     // platforms only )
3439     retval = 1;
3440 
3441   } else if (packed_reduction_method == atomic_reduce_block) {
3442 
3443     retval = 2;
3444 
3445     // all threads should do this pop here (because __kmpc_end_reduce_nowait()
3446     // won't be called by the code gen)
3447     //     (it's not quite good, because the checking block has been closed by
3448     //     this 'pop',
3449     //      but atomic operation has not been executed yet, will be executed
3450     //      slightly later, literally on next instruction)
3451     if (__kmp_env_consistency_check)
3452       __kmp_pop_sync(global_tid, ct_reduce, loc);
3453 
3454   } else if (TEST_REDUCTION_METHOD(packed_reduction_method,
3455                                    tree_reduce_block)) {
3456 
3457 // AT: performance issue: a real barrier here
3458 // AT:     (if master goes slow, other threads are blocked here waiting for the
3459 // master to come and release them)
3460 // AT:     (it's not what a customer might expect specifying NOWAIT clause)
3461 // AT:     (specifying NOWAIT won't result in improvement of performance, it'll
3462 // be confusing to a customer)
3463 // AT: another implementation of *barrier_gather*nowait() (or some other design)
3464 // might go faster and be more in line with sense of NOWAIT
3465 // AT: TO DO: do epcc test and compare times
3466 
3467 // this barrier should be invisible to a customer and to the threading profile
3468 // tool (it's neither a terminating barrier nor customer's code, it's
3469 // used for an internal purpose)
3470 #if OMPT_SUPPORT
3471     // JP: can this barrier potentially leed to task scheduling?
3472     // JP: as long as there is a barrier in the implementation, OMPT should and
3473     // will provide the barrier events
3474     //         so we set-up the necessary frame/return addresses.
3475     ompt_frame_t *ompt_frame;
3476     if (ompt_enabled.enabled) {
3477       __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
3478       if (ompt_frame->enter_frame.ptr == NULL)
3479         ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
3480       OMPT_STORE_RETURN_ADDRESS(global_tid);
3481     }
3482 #endif
3483 #if USE_ITT_NOTIFY
3484     __kmp_threads[global_tid]->th.th_ident = loc;
3485 #endif
3486     retval =
3487         __kmp_barrier(UNPACK_REDUCTION_BARRIER(packed_reduction_method),
3488                       global_tid, FALSE, reduce_size, reduce_data, reduce_func);
3489     retval = (retval != 0) ? (0) : (1);
3490 #if OMPT_SUPPORT && OMPT_OPTIONAL
3491     if (ompt_enabled.enabled) {
3492       ompt_frame->enter_frame = ompt_data_none;
3493     }
3494 #endif
3495 
3496     // all other workers except master should do this pop here
3497     //     ( none of other workers will get to __kmpc_end_reduce_nowait() )
3498     if (__kmp_env_consistency_check) {
3499       if (retval == 0) {
3500         __kmp_pop_sync(global_tid, ct_reduce, loc);
3501       }
3502     }
3503 
3504   } else {
3505 
3506     // should never reach this block
3507     KMP_ASSERT(0); // "unexpected method"
3508   }
3509   if (teams_swapped) {
3510     __kmp_restore_swapped_teams(th, team, task_state);
3511   }
3512   KA_TRACE(
3513       10,
3514       ("__kmpc_reduce_nowait() exit: called T#%d: method %08x, returns %08x\n",
3515        global_tid, packed_reduction_method, retval));
3516 
3517   return retval;
3518 }
3519 
3520 /*!
3521 @ingroup SYNCHRONIZATION
3522 @param loc source location information
3523 @param global_tid global thread id.
3524 @param lck pointer to the unique lock data structure
3525 
3526 Finish the execution of a reduce nowait.
3527 */
3528 void __kmpc_end_reduce_nowait(ident_t *loc, kmp_int32 global_tid,
3529                               kmp_critical_name *lck) {
3530 
3531   PACKED_REDUCTION_METHOD_T packed_reduction_method;
3532 
3533   KA_TRACE(10, ("__kmpc_end_reduce_nowait() enter: called T#%d\n", global_tid));
3534 
3535   packed_reduction_method = __KMP_GET_REDUCTION_METHOD(global_tid);
3536 
3537   if (packed_reduction_method == critical_reduce_block) {
3538 
3539     __kmp_end_critical_section_reduce_block(loc, global_tid, lck);
3540 
3541   } else if (packed_reduction_method == empty_reduce_block) {
3542 
3543     // usage: if team size == 1, no synchronization is required ( on Intel
3544     // platforms only )
3545 
3546   } else if (packed_reduction_method == atomic_reduce_block) {
3547 
3548     // neither master nor other workers should get here
3549     //     (code gen does not generate this call in case 2: atomic reduce block)
3550     // actually it's better to remove this elseif at all;
3551     // after removal this value will checked by the 'else' and will assert
3552 
3553   } else if (TEST_REDUCTION_METHOD(packed_reduction_method,
3554                                    tree_reduce_block)) {
3555 
3556     // only master gets here
3557 
3558   } else {
3559 
3560     // should never reach this block
3561     KMP_ASSERT(0); // "unexpected method"
3562   }
3563 
3564   if (__kmp_env_consistency_check)
3565     __kmp_pop_sync(global_tid, ct_reduce, loc);
3566 
3567   KA_TRACE(10, ("__kmpc_end_reduce_nowait() exit: called T#%d: method %08x\n",
3568                 global_tid, packed_reduction_method));
3569 
3570   return;
3571 }
3572 
3573 /* 2.a.ii. Reduce Block with a terminating barrier */
3574 
3575 /*!
3576 @ingroup SYNCHRONIZATION
3577 @param loc source location information
3578 @param global_tid global thread number
3579 @param num_vars number of items (variables) to be reduced
3580 @param reduce_size size of data in bytes to be reduced
3581 @param reduce_data pointer to data to be reduced
3582 @param reduce_func callback function providing reduction operation on two
3583 operands and returning result of reduction in lhs_data
3584 @param lck pointer to the unique lock data structure
3585 @result 1 for the master thread, 0 for all other team threads, 2 for all team
3586 threads if atomic reduction needed
3587 
3588 A blocking reduce that includes an implicit barrier.
3589 */
3590 kmp_int32 __kmpc_reduce(ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars,
3591                         size_t reduce_size, void *reduce_data,
3592                         void (*reduce_func)(void *lhs_data, void *rhs_data),
3593                         kmp_critical_name *lck) {
3594   KMP_COUNT_BLOCK(REDUCE_wait);
3595   int retval = 0;
3596   PACKED_REDUCTION_METHOD_T packed_reduction_method;
3597   kmp_info_t *th;
3598   kmp_team_t *team;
3599   int teams_swapped = 0, task_state;
3600 
3601   KA_TRACE(10, ("__kmpc_reduce() enter: called T#%d\n", global_tid));
3602 
3603   // why do we need this initialization here at all?
3604   // Reduction clause can not be a stand-alone directive.
3605 
3606   // do not call __kmp_serial_initialize(), it will be called by
3607   // __kmp_parallel_initialize() if needed
3608   // possible detection of false-positive race by the threadchecker ???
3609   if (!TCR_4(__kmp_init_parallel))
3610     __kmp_parallel_initialize();
3611 
3612   __kmp_resume_if_soft_paused();
3613 
3614 // check correctness of reduce block nesting
3615 #if KMP_USE_DYNAMIC_LOCK
3616   if (__kmp_env_consistency_check)
3617     __kmp_push_sync(global_tid, ct_reduce, loc, NULL, 0);
3618 #else
3619   if (__kmp_env_consistency_check)
3620     __kmp_push_sync(global_tid, ct_reduce, loc, NULL);
3621 #endif
3622 
3623   th = __kmp_thread_from_gtid(global_tid);
3624   teams_swapped = __kmp_swap_teams_for_teams_reduction(th, &team, &task_state);
3625 
3626   packed_reduction_method = __kmp_determine_reduction_method(
3627       loc, global_tid, num_vars, reduce_size, reduce_data, reduce_func, lck);
3628   __KMP_SET_REDUCTION_METHOD(global_tid, packed_reduction_method);
3629 
3630   if (packed_reduction_method == critical_reduce_block) {
3631 
3632     __kmp_enter_critical_section_reduce_block(loc, global_tid, lck);
3633     retval = 1;
3634 
3635   } else if (packed_reduction_method == empty_reduce_block) {
3636 
3637     // usage: if team size == 1, no synchronization is required ( Intel
3638     // platforms only )
3639     retval = 1;
3640 
3641   } else if (packed_reduction_method == atomic_reduce_block) {
3642 
3643     retval = 2;
3644 
3645   } else if (TEST_REDUCTION_METHOD(packed_reduction_method,
3646                                    tree_reduce_block)) {
3647 
3648 // case tree_reduce_block:
3649 // this barrier should be visible to a customer and to the threading profile
3650 // tool (it's a terminating barrier on constructs if NOWAIT not specified)
3651 #if OMPT_SUPPORT
3652     ompt_frame_t *ompt_frame;
3653     if (ompt_enabled.enabled) {
3654       __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
3655       if (ompt_frame->enter_frame.ptr == NULL)
3656         ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
3657       OMPT_STORE_RETURN_ADDRESS(global_tid);
3658     }
3659 #endif
3660 #if USE_ITT_NOTIFY
3661     __kmp_threads[global_tid]->th.th_ident =
3662         loc; // needed for correct notification of frames
3663 #endif
3664     retval =
3665         __kmp_barrier(UNPACK_REDUCTION_BARRIER(packed_reduction_method),
3666                       global_tid, TRUE, reduce_size, reduce_data, reduce_func);
3667     retval = (retval != 0) ? (0) : (1);
3668 #if OMPT_SUPPORT && OMPT_OPTIONAL
3669     if (ompt_enabled.enabled) {
3670       ompt_frame->enter_frame = ompt_data_none;
3671     }
3672 #endif
3673 
3674     // all other workers except master should do this pop here
3675     // ( none of other workers except master will enter __kmpc_end_reduce() )
3676     if (__kmp_env_consistency_check) {
3677       if (retval == 0) { // 0: all other workers; 1: master
3678         __kmp_pop_sync(global_tid, ct_reduce, loc);
3679       }
3680     }
3681 
3682   } else {
3683 
3684     // should never reach this block
3685     KMP_ASSERT(0); // "unexpected method"
3686   }
3687   if (teams_swapped) {
3688     __kmp_restore_swapped_teams(th, team, task_state);
3689   }
3690 
3691   KA_TRACE(10,
3692            ("__kmpc_reduce() exit: called T#%d: method %08x, returns %08x\n",
3693             global_tid, packed_reduction_method, retval));
3694   return retval;
3695 }
3696 
3697 /*!
3698 @ingroup SYNCHRONIZATION
3699 @param loc source location information
3700 @param global_tid global thread id.
3701 @param lck pointer to the unique lock data structure
3702 
3703 Finish the execution of a blocking reduce.
3704 The <tt>lck</tt> pointer must be the same as that used in the corresponding
3705 start function.
3706 */
3707 void __kmpc_end_reduce(ident_t *loc, kmp_int32 global_tid,
3708                        kmp_critical_name *lck) {
3709 
3710   PACKED_REDUCTION_METHOD_T packed_reduction_method;
3711   kmp_info_t *th;
3712   kmp_team_t *team;
3713   int teams_swapped = 0, task_state;
3714 
3715   KA_TRACE(10, ("__kmpc_end_reduce() enter: called T#%d\n", global_tid));
3716 
3717   th = __kmp_thread_from_gtid(global_tid);
3718   teams_swapped = __kmp_swap_teams_for_teams_reduction(th, &team, &task_state);
3719 
3720   packed_reduction_method = __KMP_GET_REDUCTION_METHOD(global_tid);
3721 
3722   // this barrier should be visible to a customer and to the threading profile
3723   // tool (it's a terminating barrier on constructs if NOWAIT not specified)
3724 
3725   if (packed_reduction_method == critical_reduce_block) {
3726     __kmp_end_critical_section_reduce_block(loc, global_tid, lck);
3727 
3728 // TODO: implicit barrier: should be exposed
3729 #if OMPT_SUPPORT
3730     ompt_frame_t *ompt_frame;
3731     if (ompt_enabled.enabled) {
3732       __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
3733       if (ompt_frame->enter_frame.ptr == NULL)
3734         ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
3735       OMPT_STORE_RETURN_ADDRESS(global_tid);
3736     }
3737 #endif
3738 #if USE_ITT_NOTIFY
3739     __kmp_threads[global_tid]->th.th_ident = loc;
3740 #endif
3741     __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
3742 #if OMPT_SUPPORT && OMPT_OPTIONAL
3743     if (ompt_enabled.enabled) {
3744       ompt_frame->enter_frame = ompt_data_none;
3745     }
3746 #endif
3747 
3748   } else if (packed_reduction_method == empty_reduce_block) {
3749 
3750 // usage: if team size==1, no synchronization is required (Intel platforms only)
3751 
3752 // TODO: implicit barrier: should be exposed
3753 #if OMPT_SUPPORT
3754     ompt_frame_t *ompt_frame;
3755     if (ompt_enabled.enabled) {
3756       __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
3757       if (ompt_frame->enter_frame.ptr == NULL)
3758         ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
3759       OMPT_STORE_RETURN_ADDRESS(global_tid);
3760     }
3761 #endif
3762 #if USE_ITT_NOTIFY
3763     __kmp_threads[global_tid]->th.th_ident = loc;
3764 #endif
3765     __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
3766 #if OMPT_SUPPORT && OMPT_OPTIONAL
3767     if (ompt_enabled.enabled) {
3768       ompt_frame->enter_frame = ompt_data_none;
3769     }
3770 #endif
3771 
3772   } else if (packed_reduction_method == atomic_reduce_block) {
3773 
3774 #if OMPT_SUPPORT
3775     ompt_frame_t *ompt_frame;
3776     if (ompt_enabled.enabled) {
3777       __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
3778       if (ompt_frame->enter_frame.ptr == NULL)
3779         ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
3780       OMPT_STORE_RETURN_ADDRESS(global_tid);
3781     }
3782 #endif
3783 // TODO: implicit barrier: should be exposed
3784 #if USE_ITT_NOTIFY
3785     __kmp_threads[global_tid]->th.th_ident = loc;
3786 #endif
3787     __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
3788 #if OMPT_SUPPORT && OMPT_OPTIONAL
3789     if (ompt_enabled.enabled) {
3790       ompt_frame->enter_frame = ompt_data_none;
3791     }
3792 #endif
3793 
3794   } else if (TEST_REDUCTION_METHOD(packed_reduction_method,
3795                                    tree_reduce_block)) {
3796 
3797     // only master executes here (master releases all other workers)
3798     __kmp_end_split_barrier(UNPACK_REDUCTION_BARRIER(packed_reduction_method),
3799                             global_tid);
3800 
3801   } else {
3802 
3803     // should never reach this block
3804     KMP_ASSERT(0); // "unexpected method"
3805   }
3806   if (teams_swapped) {
3807     __kmp_restore_swapped_teams(th, team, task_state);
3808   }
3809 
3810   if (__kmp_env_consistency_check)
3811     __kmp_pop_sync(global_tid, ct_reduce, loc);
3812 
3813   KA_TRACE(10, ("__kmpc_end_reduce() exit: called T#%d: method %08x\n",
3814                 global_tid, packed_reduction_method));
3815 
3816   return;
3817 }
3818 
3819 #undef __KMP_GET_REDUCTION_METHOD
3820 #undef __KMP_SET_REDUCTION_METHOD
3821 
3822 /* end of interface to fast scalable reduce routines */
3823 
3824 kmp_uint64 __kmpc_get_taskid() {
3825 
3826   kmp_int32 gtid;
3827   kmp_info_t *thread;
3828 
3829   gtid = __kmp_get_gtid();
3830   if (gtid < 0) {
3831     return 0;
3832   }
3833   thread = __kmp_thread_from_gtid(gtid);
3834   return thread->th.th_current_task->td_task_id;
3835 
3836 } // __kmpc_get_taskid
3837 
3838 kmp_uint64 __kmpc_get_parent_taskid() {
3839 
3840   kmp_int32 gtid;
3841   kmp_info_t *thread;
3842   kmp_taskdata_t *parent_task;
3843 
3844   gtid = __kmp_get_gtid();
3845   if (gtid < 0) {
3846     return 0;
3847   }
3848   thread = __kmp_thread_from_gtid(gtid);
3849   parent_task = thread->th.th_current_task->td_parent;
3850   return (parent_task == NULL ? 0 : parent_task->td_task_id);
3851 
3852 } // __kmpc_get_parent_taskid
3853 
3854 /*!
3855 @ingroup WORK_SHARING
3856 @param loc  source location information.
3857 @param gtid  global thread number.
3858 @param num_dims  number of associated doacross loops.
3859 @param dims  info on loops bounds.
3860 
3861 Initialize doacross loop information.
3862 Expect compiler send us inclusive bounds,
3863 e.g. for(i=2;i<9;i+=2) lo=2, up=8, st=2.
3864 */
3865 void __kmpc_doacross_init(ident_t *loc, int gtid, int num_dims,
3866                           const struct kmp_dim *dims) {
3867   int j, idx;
3868   kmp_int64 last, trace_count;
3869   kmp_info_t *th = __kmp_threads[gtid];
3870   kmp_team_t *team = th->th.th_team;
3871   kmp_uint32 *flags;
3872   kmp_disp_t *pr_buf = th->th.th_dispatch;
3873   dispatch_shared_info_t *sh_buf;
3874 
3875   KA_TRACE(
3876       20,
3877       ("__kmpc_doacross_init() enter: called T#%d, num dims %d, active %d\n",
3878        gtid, num_dims, !team->t.t_serialized));
3879   KMP_DEBUG_ASSERT(dims != NULL);
3880   KMP_DEBUG_ASSERT(num_dims > 0);
3881 
3882   if (team->t.t_serialized) {
3883     KA_TRACE(20, ("__kmpc_doacross_init() exit: serialized team\n"));
3884     return; // no dependencies if team is serialized
3885   }
3886   KMP_DEBUG_ASSERT(team->t.t_nproc > 1);
3887   idx = pr_buf->th_doacross_buf_idx++; // Increment index of shared buffer for
3888   // the next loop
3889   sh_buf = &team->t.t_disp_buffer[idx % __kmp_dispatch_num_buffers];
3890 
3891   // Save bounds info into allocated private buffer
3892   KMP_DEBUG_ASSERT(pr_buf->th_doacross_info == NULL);
3893   pr_buf->th_doacross_info = (kmp_int64 *)__kmp_thread_malloc(
3894       th, sizeof(kmp_int64) * (4 * num_dims + 1));
3895   KMP_DEBUG_ASSERT(pr_buf->th_doacross_info != NULL);
3896   pr_buf->th_doacross_info[0] =
3897       (kmp_int64)num_dims; // first element is number of dimensions
3898   // Save also address of num_done in order to access it later without knowing
3899   // the buffer index
3900   pr_buf->th_doacross_info[1] = (kmp_int64)&sh_buf->doacross_num_done;
3901   pr_buf->th_doacross_info[2] = dims[0].lo;
3902   pr_buf->th_doacross_info[3] = dims[0].up;
3903   pr_buf->th_doacross_info[4] = dims[0].st;
3904   last = 5;
3905   for (j = 1; j < num_dims; ++j) {
3906     kmp_int64
3907         range_length; // To keep ranges of all dimensions but the first dims[0]
3908     if (dims[j].st == 1) { // most common case
3909       // AC: should we care of ranges bigger than LLONG_MAX? (not for now)
3910       range_length = dims[j].up - dims[j].lo + 1;
3911     } else {
3912       if (dims[j].st > 0) {
3913         KMP_DEBUG_ASSERT(dims[j].up > dims[j].lo);
3914         range_length = (kmp_uint64)(dims[j].up - dims[j].lo) / dims[j].st + 1;
3915       } else { // negative increment
3916         KMP_DEBUG_ASSERT(dims[j].lo > dims[j].up);
3917         range_length =
3918             (kmp_uint64)(dims[j].lo - dims[j].up) / (-dims[j].st) + 1;
3919       }
3920     }
3921     pr_buf->th_doacross_info[last++] = range_length;
3922     pr_buf->th_doacross_info[last++] = dims[j].lo;
3923     pr_buf->th_doacross_info[last++] = dims[j].up;
3924     pr_buf->th_doacross_info[last++] = dims[j].st;
3925   }
3926 
3927   // Compute total trip count.
3928   // Start with range of dims[0] which we don't need to keep in the buffer.
3929   if (dims[0].st == 1) { // most common case
3930     trace_count = dims[0].up - dims[0].lo + 1;
3931   } else if (dims[0].st > 0) {
3932     KMP_DEBUG_ASSERT(dims[0].up > dims[0].lo);
3933     trace_count = (kmp_uint64)(dims[0].up - dims[0].lo) / dims[0].st + 1;
3934   } else { // negative increment
3935     KMP_DEBUG_ASSERT(dims[0].lo > dims[0].up);
3936     trace_count = (kmp_uint64)(dims[0].lo - dims[0].up) / (-dims[0].st) + 1;
3937   }
3938   for (j = 1; j < num_dims; ++j) {
3939     trace_count *= pr_buf->th_doacross_info[4 * j + 1]; // use kept ranges
3940   }
3941   KMP_DEBUG_ASSERT(trace_count > 0);
3942 
3943   // Check if shared buffer is not occupied by other loop (idx -
3944   // __kmp_dispatch_num_buffers)
3945   if (idx != sh_buf->doacross_buf_idx) {
3946     // Shared buffer is occupied, wait for it to be free
3947     __kmp_wait_4((volatile kmp_uint32 *)&sh_buf->doacross_buf_idx, idx,
3948                  __kmp_eq_4, NULL);
3949   }
3950 #if KMP_32_BIT_ARCH
3951   // Check if we are the first thread. After the CAS the first thread gets 0,
3952   // others get 1 if initialization is in progress, allocated pointer otherwise.
3953   // Treat pointer as volatile integer (value 0 or 1) until memory is allocated.
3954   flags = (kmp_uint32 *)KMP_COMPARE_AND_STORE_RET32(
3955       (volatile kmp_int32 *)&sh_buf->doacross_flags, NULL, 1);
3956 #else
3957   flags = (kmp_uint32 *)KMP_COMPARE_AND_STORE_RET64(
3958       (volatile kmp_int64 *)&sh_buf->doacross_flags, NULL, 1LL);
3959 #endif
3960   if (flags == NULL) {
3961     // we are the first thread, allocate the array of flags
3962     size_t size = trace_count / 8 + 8; // in bytes, use single bit per iteration
3963     flags = (kmp_uint32 *)__kmp_thread_calloc(th, size, 1);
3964     KMP_MB();
3965     sh_buf->doacross_flags = flags;
3966   } else if (flags == (kmp_uint32 *)1) {
3967 #if KMP_32_BIT_ARCH
3968     // initialization is still in progress, need to wait
3969     while (*(volatile kmp_int32 *)&sh_buf->doacross_flags == 1)
3970 #else
3971     while (*(volatile kmp_int64 *)&sh_buf->doacross_flags == 1LL)
3972 #endif
3973       KMP_YIELD(TRUE);
3974     KMP_MB();
3975   } else {
3976     KMP_MB();
3977   }
3978   KMP_DEBUG_ASSERT(sh_buf->doacross_flags > (kmp_uint32 *)1); // check ptr value
3979   pr_buf->th_doacross_flags =
3980       sh_buf->doacross_flags; // save private copy in order to not
3981   // touch shared buffer on each iteration
3982   KA_TRACE(20, ("__kmpc_doacross_init() exit: T#%d\n", gtid));
3983 }
3984 
3985 void __kmpc_doacross_wait(ident_t *loc, int gtid, const kmp_int64 *vec) {
3986   kmp_int32 shft, num_dims, i;
3987   kmp_uint32 flag;
3988   kmp_int64 iter_number; // iteration number of "collapsed" loop nest
3989   kmp_info_t *th = __kmp_threads[gtid];
3990   kmp_team_t *team = th->th.th_team;
3991   kmp_disp_t *pr_buf;
3992   kmp_int64 lo, up, st;
3993 
3994   KA_TRACE(20, ("__kmpc_doacross_wait() enter: called T#%d\n", gtid));
3995   if (team->t.t_serialized) {
3996     KA_TRACE(20, ("__kmpc_doacross_wait() exit: serialized team\n"));
3997     return; // no dependencies if team is serialized
3998   }
3999 
4000   // calculate sequential iteration number and check out-of-bounds condition
4001   pr_buf = th->th.th_dispatch;
4002   KMP_DEBUG_ASSERT(pr_buf->th_doacross_info != NULL);
4003   num_dims = pr_buf->th_doacross_info[0];
4004   lo = pr_buf->th_doacross_info[2];
4005   up = pr_buf->th_doacross_info[3];
4006   st = pr_buf->th_doacross_info[4];
4007   if (st == 1) { // most common case
4008     if (vec[0] < lo || vec[0] > up) {
4009       KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
4010                     "bounds [%lld,%lld]\n",
4011                     gtid, vec[0], lo, up));
4012       return;
4013     }
4014     iter_number = vec[0] - lo;
4015   } else if (st > 0) {
4016     if (vec[0] < lo || vec[0] > up) {
4017       KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
4018                     "bounds [%lld,%lld]\n",
4019                     gtid, vec[0], lo, up));
4020       return;
4021     }
4022     iter_number = (kmp_uint64)(vec[0] - lo) / st;
4023   } else { // negative increment
4024     if (vec[0] > lo || vec[0] < up) {
4025       KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
4026                     "bounds [%lld,%lld]\n",
4027                     gtid, vec[0], lo, up));
4028       return;
4029     }
4030     iter_number = (kmp_uint64)(lo - vec[0]) / (-st);
4031   }
4032   for (i = 1; i < num_dims; ++i) {
4033     kmp_int64 iter, ln;
4034     kmp_int32 j = i * 4;
4035     ln = pr_buf->th_doacross_info[j + 1];
4036     lo = pr_buf->th_doacross_info[j + 2];
4037     up = pr_buf->th_doacross_info[j + 3];
4038     st = pr_buf->th_doacross_info[j + 4];
4039     if (st == 1) {
4040       if (vec[i] < lo || vec[i] > up) {
4041         KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
4042                       "bounds [%lld,%lld]\n",
4043                       gtid, vec[i], lo, up));
4044         return;
4045       }
4046       iter = vec[i] - lo;
4047     } else if (st > 0) {
4048       if (vec[i] < lo || vec[i] > up) {
4049         KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
4050                       "bounds [%lld,%lld]\n",
4051                       gtid, vec[i], lo, up));
4052         return;
4053       }
4054       iter = (kmp_uint64)(vec[i] - lo) / st;
4055     } else { // st < 0
4056       if (vec[i] > lo || vec[i] < up) {
4057         KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
4058                       "bounds [%lld,%lld]\n",
4059                       gtid, vec[i], lo, up));
4060         return;
4061       }
4062       iter = (kmp_uint64)(lo - vec[i]) / (-st);
4063     }
4064     iter_number = iter + ln * iter_number;
4065   }
4066   shft = iter_number % 32; // use 32-bit granularity
4067   iter_number >>= 5; // divided by 32
4068   flag = 1 << shft;
4069   while ((flag & pr_buf->th_doacross_flags[iter_number]) == 0) {
4070     KMP_YIELD(TRUE);
4071   }
4072   KMP_MB();
4073   KA_TRACE(20,
4074            ("__kmpc_doacross_wait() exit: T#%d wait for iter %lld completed\n",
4075             gtid, (iter_number << 5) + shft));
4076 }
4077 
4078 void __kmpc_doacross_post(ident_t *loc, int gtid, const kmp_int64 *vec) {
4079   kmp_int32 shft, num_dims, i;
4080   kmp_uint32 flag;
4081   kmp_int64 iter_number; // iteration number of "collapsed" loop nest
4082   kmp_info_t *th = __kmp_threads[gtid];
4083   kmp_team_t *team = th->th.th_team;
4084   kmp_disp_t *pr_buf;
4085   kmp_int64 lo, st;
4086 
4087   KA_TRACE(20, ("__kmpc_doacross_post() enter: called T#%d\n", gtid));
4088   if (team->t.t_serialized) {
4089     KA_TRACE(20, ("__kmpc_doacross_post() exit: serialized team\n"));
4090     return; // no dependencies if team is serialized
4091   }
4092 
4093   // calculate sequential iteration number (same as in "wait" but no
4094   // out-of-bounds checks)
4095   pr_buf = th->th.th_dispatch;
4096   KMP_DEBUG_ASSERT(pr_buf->th_doacross_info != NULL);
4097   num_dims = pr_buf->th_doacross_info[0];
4098   lo = pr_buf->th_doacross_info[2];
4099   st = pr_buf->th_doacross_info[4];
4100   if (st == 1) { // most common case
4101     iter_number = vec[0] - lo;
4102   } else if (st > 0) {
4103     iter_number = (kmp_uint64)(vec[0] - lo) / st;
4104   } else { // negative increment
4105     iter_number = (kmp_uint64)(lo - vec[0]) / (-st);
4106   }
4107   for (i = 1; i < num_dims; ++i) {
4108     kmp_int64 iter, ln;
4109     kmp_int32 j = i * 4;
4110     ln = pr_buf->th_doacross_info[j + 1];
4111     lo = pr_buf->th_doacross_info[j + 2];
4112     st = pr_buf->th_doacross_info[j + 4];
4113     if (st == 1) {
4114       iter = vec[i] - lo;
4115     } else if (st > 0) {
4116       iter = (kmp_uint64)(vec[i] - lo) / st;
4117     } else { // st < 0
4118       iter = (kmp_uint64)(lo - vec[i]) / (-st);
4119     }
4120     iter_number = iter + ln * iter_number;
4121   }
4122   shft = iter_number % 32; // use 32-bit granularity
4123   iter_number >>= 5; // divided by 32
4124   flag = 1 << shft;
4125   KMP_MB();
4126   if ((flag & pr_buf->th_doacross_flags[iter_number]) == 0)
4127     KMP_TEST_THEN_OR32(&pr_buf->th_doacross_flags[iter_number], flag);
4128   KA_TRACE(20, ("__kmpc_doacross_post() exit: T#%d iter %lld posted\n", gtid,
4129                 (iter_number << 5) + shft));
4130 }
4131 
4132 void __kmpc_doacross_fini(ident_t *loc, int gtid) {
4133   kmp_int32 num_done;
4134   kmp_info_t *th = __kmp_threads[gtid];
4135   kmp_team_t *team = th->th.th_team;
4136   kmp_disp_t *pr_buf = th->th.th_dispatch;
4137 
4138   KA_TRACE(20, ("__kmpc_doacross_fini() enter: called T#%d\n", gtid));
4139   if (team->t.t_serialized) {
4140     KA_TRACE(20, ("__kmpc_doacross_fini() exit: serialized team %p\n", team));
4141     return; // nothing to do
4142   }
4143   num_done = KMP_TEST_THEN_INC32((kmp_int32 *)pr_buf->th_doacross_info[1]) + 1;
4144   if (num_done == th->th.th_team_nproc) {
4145     // we are the last thread, need to free shared resources
4146     int idx = pr_buf->th_doacross_buf_idx - 1;
4147     dispatch_shared_info_t *sh_buf =
4148         &team->t.t_disp_buffer[idx % __kmp_dispatch_num_buffers];
4149     KMP_DEBUG_ASSERT(pr_buf->th_doacross_info[1] ==
4150                      (kmp_int64)&sh_buf->doacross_num_done);
4151     KMP_DEBUG_ASSERT(num_done == sh_buf->doacross_num_done);
4152     KMP_DEBUG_ASSERT(idx == sh_buf->doacross_buf_idx);
4153     __kmp_thread_free(th, CCAST(kmp_uint32 *, sh_buf->doacross_flags));
4154     sh_buf->doacross_flags = NULL;
4155     sh_buf->doacross_num_done = 0;
4156     sh_buf->doacross_buf_idx +=
4157         __kmp_dispatch_num_buffers; // free buffer for future re-use
4158   }
4159   // free private resources (need to keep buffer index forever)
4160   pr_buf->th_doacross_flags = NULL;
4161   __kmp_thread_free(th, (void *)pr_buf->th_doacross_info);
4162   pr_buf->th_doacross_info = NULL;
4163   KA_TRACE(20, ("__kmpc_doacross_fini() exit: T#%d\n", gtid));
4164 }
4165 
4166 /* omp_alloc/omp_free only defined for C/C++, not for Fortran */
4167 void *omp_alloc(size_t size, omp_allocator_handle_t allocator) {
4168   return __kmpc_alloc(__kmp_entry_gtid(), size, allocator);
4169 }
4170 
4171 void omp_free(void *ptr, omp_allocator_handle_t allocator) {
4172   __kmpc_free(__kmp_entry_gtid(), ptr, allocator);
4173 }
4174 
4175 int __kmpc_get_target_offload(void) {
4176   if (!__kmp_init_serial) {
4177     __kmp_serial_initialize();
4178   }
4179   return __kmp_target_offload;
4180 }
4181 
4182 int __kmpc_pause_resource(kmp_pause_status_t level) {
4183   if (!__kmp_init_serial) {
4184     return 1; // Can't pause if runtime is not initialized
4185   }
4186   return __kmp_pause_resource(level);
4187 }
4188