xref: /freebsd/contrib/llvm-project/openmp/runtime/src/kmp_csupport.cpp (revision e2eeea75eb8b6dd50c1298067a0655880d186734)
1 /*
2  * kmp_csupport.cpp -- kfront linkage support for OpenMP.
3  */
4 
5 //===----------------------------------------------------------------------===//
6 //
7 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8 // See https://llvm.org/LICENSE.txt for license information.
9 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #define __KMP_IMP
14 #include "omp.h" /* extern "C" declarations of user-visible routines */
15 #include "kmp.h"
16 #include "kmp_error.h"
17 #include "kmp_i18n.h"
18 #include "kmp_itt.h"
19 #include "kmp_lock.h"
20 #include "kmp_stats.h"
21 #include "ompt-specific.h"
22 
23 #define MAX_MESSAGE 512
24 
25 // flags will be used in future, e.g. to implement openmp_strict library
26 // restrictions
27 
28 /*!
29  * @ingroup STARTUP_SHUTDOWN
30  * @param loc   in   source location information
31  * @param flags in   for future use (currently ignored)
32  *
33  * Initialize the runtime library. This call is optional; if it is not made then
34  * it will be implicitly called by attempts to use other library functions.
35  */
36 void __kmpc_begin(ident_t *loc, kmp_int32 flags) {
37   // By default __kmpc_begin() is no-op.
38   char *env;
39   if ((env = getenv("KMP_INITIAL_THREAD_BIND")) != NULL &&
40       __kmp_str_match_true(env)) {
41     __kmp_middle_initialize();
42     KC_TRACE(10, ("__kmpc_begin: middle initialization called\n"));
43   } else if (__kmp_ignore_mppbeg() == FALSE) {
44     // By default __kmp_ignore_mppbeg() returns TRUE.
45     __kmp_internal_begin();
46     KC_TRACE(10, ("__kmpc_begin: called\n"));
47   }
48 }
49 
50 /*!
51  * @ingroup STARTUP_SHUTDOWN
52  * @param loc source location information
53  *
54  * Shutdown the runtime library. This is also optional, and even if called will
55  * not do anything unless the `KMP_IGNORE_MPPEND` environment variable is set to
56  * zero.
57  */
58 void __kmpc_end(ident_t *loc) {
59   // By default, __kmp_ignore_mppend() returns TRUE which makes __kmpc_end()
60   // call no-op. However, this can be overridden with KMP_IGNORE_MPPEND
61   // environment variable. If KMP_IGNORE_MPPEND is 0, __kmp_ignore_mppend()
62   // returns FALSE and __kmpc_end() will unregister this root (it can cause
63   // library shut down).
64   if (__kmp_ignore_mppend() == FALSE) {
65     KC_TRACE(10, ("__kmpc_end: called\n"));
66     KA_TRACE(30, ("__kmpc_end\n"));
67 
68     __kmp_internal_end_thread(-1);
69   }
70 #if KMP_OS_WINDOWS && OMPT_SUPPORT
71   // Normal exit process on Windows does not allow worker threads of the final
72   // parallel region to finish reporting their events, so shutting down the
73   // library here fixes the issue at least for the cases where __kmpc_end() is
74   // placed properly.
75   if (ompt_enabled.enabled)
76     __kmp_internal_end_library(__kmp_gtid_get_specific());
77 #endif
78 }
79 
80 /*!
81 @ingroup THREAD_STATES
82 @param loc Source location information.
83 @return The global thread index of the active thread.
84 
85 This function can be called in any context.
86 
87 If the runtime has ony been entered at the outermost level from a
88 single (necessarily non-OpenMP<sup>*</sup>) thread, then the thread number is
89 that which would be returned by omp_get_thread_num() in the outermost
90 active parallel construct. (Or zero if there is no active parallel
91 construct, since the master thread is necessarily thread zero).
92 
93 If multiple non-OpenMP threads all enter an OpenMP construct then this
94 will be a unique thread identifier among all the threads created by
95 the OpenMP runtime (but the value cannot be defined in terms of
96 OpenMP thread ids returned by omp_get_thread_num()).
97 */
98 kmp_int32 __kmpc_global_thread_num(ident_t *loc) {
99   kmp_int32 gtid = __kmp_entry_gtid();
100 
101   KC_TRACE(10, ("__kmpc_global_thread_num: T#%d\n", gtid));
102 
103   return gtid;
104 }
105 
106 /*!
107 @ingroup THREAD_STATES
108 @param loc Source location information.
109 @return The number of threads under control of the OpenMP<sup>*</sup> runtime
110 
111 This function can be called in any context.
112 It returns the total number of threads under the control of the OpenMP runtime.
113 That is not a number that can be determined by any OpenMP standard calls, since
114 the library may be called from more than one non-OpenMP thread, and this
115 reflects the total over all such calls. Similarly the runtime maintains
116 underlying threads even when they are not active (since the cost of creating
117 and destroying OS threads is high), this call counts all such threads even if
118 they are not waiting for work.
119 */
120 kmp_int32 __kmpc_global_num_threads(ident_t *loc) {
121   KC_TRACE(10,
122            ("__kmpc_global_num_threads: num_threads = %d\n", __kmp_all_nth));
123 
124   return TCR_4(__kmp_all_nth);
125 }
126 
127 /*!
128 @ingroup THREAD_STATES
129 @param loc Source location information.
130 @return The thread number of the calling thread in the innermost active parallel
131 construct.
132 */
133 kmp_int32 __kmpc_bound_thread_num(ident_t *loc) {
134   KC_TRACE(10, ("__kmpc_bound_thread_num: called\n"));
135   return __kmp_tid_from_gtid(__kmp_entry_gtid());
136 }
137 
138 /*!
139 @ingroup THREAD_STATES
140 @param loc Source location information.
141 @return The number of threads in the innermost active parallel construct.
142 */
143 kmp_int32 __kmpc_bound_num_threads(ident_t *loc) {
144   KC_TRACE(10, ("__kmpc_bound_num_threads: called\n"));
145 
146   return __kmp_entry_thread()->th.th_team->t.t_nproc;
147 }
148 
149 /*!
150  * @ingroup DEPRECATED
151  * @param loc location description
152  *
153  * This function need not be called. It always returns TRUE.
154  */
155 kmp_int32 __kmpc_ok_to_fork(ident_t *loc) {
156 #ifndef KMP_DEBUG
157 
158   return TRUE;
159 
160 #else
161 
162   const char *semi2;
163   const char *semi3;
164   int line_no;
165 
166   if (__kmp_par_range == 0) {
167     return TRUE;
168   }
169   semi2 = loc->psource;
170   if (semi2 == NULL) {
171     return TRUE;
172   }
173   semi2 = strchr(semi2, ';');
174   if (semi2 == NULL) {
175     return TRUE;
176   }
177   semi2 = strchr(semi2 + 1, ';');
178   if (semi2 == NULL) {
179     return TRUE;
180   }
181   if (__kmp_par_range_filename[0]) {
182     const char *name = semi2 - 1;
183     while ((name > loc->psource) && (*name != '/') && (*name != ';')) {
184       name--;
185     }
186     if ((*name == '/') || (*name == ';')) {
187       name++;
188     }
189     if (strncmp(__kmp_par_range_filename, name, semi2 - name)) {
190       return __kmp_par_range < 0;
191     }
192   }
193   semi3 = strchr(semi2 + 1, ';');
194   if (__kmp_par_range_routine[0]) {
195     if ((semi3 != NULL) && (semi3 > semi2) &&
196         (strncmp(__kmp_par_range_routine, semi2 + 1, semi3 - semi2 - 1))) {
197       return __kmp_par_range < 0;
198     }
199   }
200   if (KMP_SSCANF(semi3 + 1, "%d", &line_no) == 1) {
201     if ((line_no >= __kmp_par_range_lb) && (line_no <= __kmp_par_range_ub)) {
202       return __kmp_par_range > 0;
203     }
204     return __kmp_par_range < 0;
205   }
206   return TRUE;
207 
208 #endif /* KMP_DEBUG */
209 }
210 
211 /*!
212 @ingroup THREAD_STATES
213 @param loc Source location information.
214 @return 1 if this thread is executing inside an active parallel region, zero if
215 not.
216 */
217 kmp_int32 __kmpc_in_parallel(ident_t *loc) {
218   return __kmp_entry_thread()->th.th_root->r.r_active;
219 }
220 
221 /*!
222 @ingroup PARALLEL
223 @param loc source location information
224 @param global_tid global thread number
225 @param num_threads number of threads requested for this parallel construct
226 
227 Set the number of threads to be used by the next fork spawned by this thread.
228 This call is only required if the parallel construct has a `num_threads` clause.
229 */
230 void __kmpc_push_num_threads(ident_t *loc, kmp_int32 global_tid,
231                              kmp_int32 num_threads) {
232   KA_TRACE(20, ("__kmpc_push_num_threads: enter T#%d num_threads=%d\n",
233                 global_tid, num_threads));
234 
235   __kmp_push_num_threads(loc, global_tid, num_threads);
236 }
237 
238 void __kmpc_pop_num_threads(ident_t *loc, kmp_int32 global_tid) {
239   KA_TRACE(20, ("__kmpc_pop_num_threads: enter\n"));
240 
241   /* the num_threads are automatically popped */
242 }
243 
244 void __kmpc_push_proc_bind(ident_t *loc, kmp_int32 global_tid,
245                            kmp_int32 proc_bind) {
246   KA_TRACE(20, ("__kmpc_push_proc_bind: enter T#%d proc_bind=%d\n", global_tid,
247                 proc_bind));
248 
249   __kmp_push_proc_bind(loc, global_tid, (kmp_proc_bind_t)proc_bind);
250 }
251 
252 /*!
253 @ingroup PARALLEL
254 @param loc  source location information
255 @param argc  total number of arguments in the ellipsis
256 @param microtask  pointer to callback routine consisting of outlined parallel
257 construct
258 @param ...  pointers to shared variables that aren't global
259 
260 Do the actual fork and call the microtask in the relevant number of threads.
261 */
262 void __kmpc_fork_call(ident_t *loc, kmp_int32 argc, kmpc_micro microtask, ...) {
263   int gtid = __kmp_entry_gtid();
264 
265 #if (KMP_STATS_ENABLED)
266   // If we were in a serial region, then stop the serial timer, record
267   // the event, and start parallel region timer
268   stats_state_e previous_state = KMP_GET_THREAD_STATE();
269   if (previous_state == stats_state_e::SERIAL_REGION) {
270     KMP_EXCHANGE_PARTITIONED_TIMER(OMP_parallel_overhead);
271   } else {
272     KMP_PUSH_PARTITIONED_TIMER(OMP_parallel_overhead);
273   }
274   int inParallel = __kmpc_in_parallel(loc);
275   if (inParallel) {
276     KMP_COUNT_BLOCK(OMP_NESTED_PARALLEL);
277   } else {
278     KMP_COUNT_BLOCK(OMP_PARALLEL);
279   }
280 #endif
281 
282   // maybe to save thr_state is enough here
283   {
284     va_list ap;
285     va_start(ap, microtask);
286 
287 #if OMPT_SUPPORT
288     ompt_frame_t *ompt_frame;
289     if (ompt_enabled.enabled) {
290       kmp_info_t *master_th = __kmp_threads[gtid];
291       kmp_team_t *parent_team = master_th->th.th_team;
292       ompt_lw_taskteam_t *lwt = parent_team->t.ompt_serialized_team_info;
293       if (lwt)
294         ompt_frame = &(lwt->ompt_task_info.frame);
295       else {
296         int tid = __kmp_tid_from_gtid(gtid);
297         ompt_frame = &(
298             parent_team->t.t_implicit_task_taskdata[tid].ompt_task_info.frame);
299       }
300       ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
301       OMPT_STORE_RETURN_ADDRESS(gtid);
302     }
303 #endif
304 
305 #if INCLUDE_SSC_MARKS
306     SSC_MARK_FORKING();
307 #endif
308     __kmp_fork_call(loc, gtid, fork_context_intel, argc,
309                     VOLATILE_CAST(microtask_t) microtask, // "wrapped" task
310                     VOLATILE_CAST(launch_t) __kmp_invoke_task_func,
311                     kmp_va_addr_of(ap));
312 #if INCLUDE_SSC_MARKS
313     SSC_MARK_JOINING();
314 #endif
315     __kmp_join_call(loc, gtid
316 #if OMPT_SUPPORT
317                     ,
318                     fork_context_intel
319 #endif
320                     );
321 
322     va_end(ap);
323   }
324 
325 #if KMP_STATS_ENABLED
326   if (previous_state == stats_state_e::SERIAL_REGION) {
327     KMP_EXCHANGE_PARTITIONED_TIMER(OMP_serial);
328   } else {
329     KMP_POP_PARTITIONED_TIMER();
330   }
331 #endif // KMP_STATS_ENABLED
332 }
333 
334 /*!
335 @ingroup PARALLEL
336 @param loc source location information
337 @param global_tid global thread number
338 @param num_teams number of teams requested for the teams construct
339 @param num_threads number of threads per team requested for the teams construct
340 
341 Set the number of teams to be used by the teams construct.
342 This call is only required if the teams construct has a `num_teams` clause
343 or a `thread_limit` clause (or both).
344 */
345 void __kmpc_push_num_teams(ident_t *loc, kmp_int32 global_tid,
346                            kmp_int32 num_teams, kmp_int32 num_threads) {
347   KA_TRACE(20,
348            ("__kmpc_push_num_teams: enter T#%d num_teams=%d num_threads=%d\n",
349             global_tid, num_teams, num_threads));
350 
351   __kmp_push_num_teams(loc, global_tid, num_teams, num_threads);
352 }
353 
354 /*!
355 @ingroup PARALLEL
356 @param loc  source location information
357 @param argc  total number of arguments in the ellipsis
358 @param microtask  pointer to callback routine consisting of outlined teams
359 construct
360 @param ...  pointers to shared variables that aren't global
361 
362 Do the actual fork and call the microtask in the relevant number of threads.
363 */
364 void __kmpc_fork_teams(ident_t *loc, kmp_int32 argc, kmpc_micro microtask,
365                        ...) {
366   int gtid = __kmp_entry_gtid();
367   kmp_info_t *this_thr = __kmp_threads[gtid];
368   va_list ap;
369   va_start(ap, microtask);
370 
371 #if KMP_STATS_ENABLED
372   KMP_COUNT_BLOCK(OMP_TEAMS);
373   stats_state_e previous_state = KMP_GET_THREAD_STATE();
374   if (previous_state == stats_state_e::SERIAL_REGION) {
375     KMP_EXCHANGE_PARTITIONED_TIMER(OMP_teams_overhead);
376   } else {
377     KMP_PUSH_PARTITIONED_TIMER(OMP_teams_overhead);
378   }
379 #endif
380 
381   // remember teams entry point and nesting level
382   this_thr->th.th_teams_microtask = microtask;
383   this_thr->th.th_teams_level =
384       this_thr->th.th_team->t.t_level; // AC: can be >0 on host
385 
386 #if OMPT_SUPPORT
387   kmp_team_t *parent_team = this_thr->th.th_team;
388   int tid = __kmp_tid_from_gtid(gtid);
389   if (ompt_enabled.enabled) {
390     parent_team->t.t_implicit_task_taskdata[tid]
391         .ompt_task_info.frame.enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
392   }
393   OMPT_STORE_RETURN_ADDRESS(gtid);
394 #endif
395 
396   // check if __kmpc_push_num_teams called, set default number of teams
397   // otherwise
398   if (this_thr->th.th_teams_size.nteams == 0) {
399     __kmp_push_num_teams(loc, gtid, 0, 0);
400   }
401   KMP_DEBUG_ASSERT(this_thr->th.th_set_nproc >= 1);
402   KMP_DEBUG_ASSERT(this_thr->th.th_teams_size.nteams >= 1);
403   KMP_DEBUG_ASSERT(this_thr->th.th_teams_size.nth >= 1);
404 
405   __kmp_fork_call(
406       loc, gtid, fork_context_intel, argc,
407       VOLATILE_CAST(microtask_t) __kmp_teams_master, // "wrapped" task
408       VOLATILE_CAST(launch_t) __kmp_invoke_teams_master, kmp_va_addr_of(ap));
409   __kmp_join_call(loc, gtid
410 #if OMPT_SUPPORT
411                   ,
412                   fork_context_intel
413 #endif
414                   );
415 
416   // Pop current CG root off list
417   KMP_DEBUG_ASSERT(this_thr->th.th_cg_roots);
418   kmp_cg_root_t *tmp = this_thr->th.th_cg_roots;
419   this_thr->th.th_cg_roots = tmp->up;
420   KA_TRACE(100, ("__kmpc_fork_teams: Thread %p popping node %p and moving up"
421                  " to node %p. cg_nthreads was %d\n",
422                  this_thr, tmp, this_thr->th.th_cg_roots, tmp->cg_nthreads));
423   KMP_DEBUG_ASSERT(tmp->cg_nthreads);
424   int i = tmp->cg_nthreads--;
425   if (i == 1) { // check is we are the last thread in CG (not always the case)
426     __kmp_free(tmp);
427   }
428   // Restore current task's thread_limit from CG root
429   KMP_DEBUG_ASSERT(this_thr->th.th_cg_roots);
430   this_thr->th.th_current_task->td_icvs.thread_limit =
431       this_thr->th.th_cg_roots->cg_thread_limit;
432 
433   this_thr->th.th_teams_microtask = NULL;
434   this_thr->th.th_teams_level = 0;
435   *(kmp_int64 *)(&this_thr->th.th_teams_size) = 0L;
436   va_end(ap);
437 #if KMP_STATS_ENABLED
438   if (previous_state == stats_state_e::SERIAL_REGION) {
439     KMP_EXCHANGE_PARTITIONED_TIMER(OMP_serial);
440   } else {
441     KMP_POP_PARTITIONED_TIMER();
442   }
443 #endif // KMP_STATS_ENABLED
444 }
445 
446 // I don't think this function should ever have been exported.
447 // The __kmpc_ prefix was misapplied.  I'm fairly certain that no generated
448 // openmp code ever called it, but it's been exported from the RTL for so
449 // long that I'm afraid to remove the definition.
450 int __kmpc_invoke_task_func(int gtid) { return __kmp_invoke_task_func(gtid); }
451 
452 /*!
453 @ingroup PARALLEL
454 @param loc  source location information
455 @param global_tid  global thread number
456 
457 Enter a serialized parallel construct. This interface is used to handle a
458 conditional parallel region, like this,
459 @code
460 #pragma omp parallel if (condition)
461 @endcode
462 when the condition is false.
463 */
464 void __kmpc_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
465 // The implementation is now in kmp_runtime.cpp so that it can share static
466 // functions with kmp_fork_call since the tasks to be done are similar in
467 // each case.
468 #if OMPT_SUPPORT
469   OMPT_STORE_RETURN_ADDRESS(global_tid);
470 #endif
471   __kmp_serialized_parallel(loc, global_tid);
472 }
473 
474 /*!
475 @ingroup PARALLEL
476 @param loc  source location information
477 @param global_tid  global thread number
478 
479 Leave a serialized parallel construct.
480 */
481 void __kmpc_end_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
482   kmp_internal_control_t *top;
483   kmp_info_t *this_thr;
484   kmp_team_t *serial_team;
485 
486   KC_TRACE(10,
487            ("__kmpc_end_serialized_parallel: called by T#%d\n", global_tid));
488 
489   /* skip all this code for autopar serialized loops since it results in
490      unacceptable overhead */
491   if (loc != NULL && (loc->flags & KMP_IDENT_AUTOPAR))
492     return;
493 
494   // Not autopar code
495   if (!TCR_4(__kmp_init_parallel))
496     __kmp_parallel_initialize();
497 
498   __kmp_resume_if_soft_paused();
499 
500   this_thr = __kmp_threads[global_tid];
501   serial_team = this_thr->th.th_serial_team;
502 
503   kmp_task_team_t *task_team = this_thr->th.th_task_team;
504   // we need to wait for the proxy tasks before finishing the thread
505   if (task_team != NULL && task_team->tt.tt_found_proxy_tasks)
506     __kmp_task_team_wait(this_thr, serial_team USE_ITT_BUILD_ARG(NULL));
507 
508   KMP_MB();
509   KMP_DEBUG_ASSERT(serial_team);
510   KMP_ASSERT(serial_team->t.t_serialized);
511   KMP_DEBUG_ASSERT(this_thr->th.th_team == serial_team);
512   KMP_DEBUG_ASSERT(serial_team != this_thr->th.th_root->r.r_root_team);
513   KMP_DEBUG_ASSERT(serial_team->t.t_threads);
514   KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
515 
516 #if OMPT_SUPPORT
517   if (ompt_enabled.enabled &&
518       this_thr->th.ompt_thread_info.state != ompt_state_overhead) {
519     OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame = ompt_data_none;
520     if (ompt_enabled.ompt_callback_implicit_task) {
521       ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
522           ompt_scope_end, NULL, OMPT_CUR_TASK_DATA(this_thr), 1,
523           OMPT_CUR_TASK_INFO(this_thr)->thread_num, ompt_task_implicit);
524     }
525 
526     // reset clear the task id only after unlinking the task
527     ompt_data_t *parent_task_data;
528     __ompt_get_task_info_internal(1, NULL, &parent_task_data, NULL, NULL, NULL);
529 
530     if (ompt_enabled.ompt_callback_parallel_end) {
531       ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
532           &(serial_team->t.ompt_team_info.parallel_data), parent_task_data,
533           ompt_parallel_invoker_program | ompt_parallel_team,
534           OMPT_LOAD_RETURN_ADDRESS(global_tid));
535     }
536     __ompt_lw_taskteam_unlink(this_thr);
537     this_thr->th.ompt_thread_info.state = ompt_state_overhead;
538   }
539 #endif
540 
541   /* If necessary, pop the internal control stack values and replace the team
542    * values */
543   top = serial_team->t.t_control_stack_top;
544   if (top && top->serial_nesting_level == serial_team->t.t_serialized) {
545     copy_icvs(&serial_team->t.t_threads[0]->th.th_current_task->td_icvs, top);
546     serial_team->t.t_control_stack_top = top->next;
547     __kmp_free(top);
548   }
549 
550   // if( serial_team -> t.t_serialized > 1 )
551   serial_team->t.t_level--;
552 
553   /* pop dispatch buffers stack */
554   KMP_DEBUG_ASSERT(serial_team->t.t_dispatch->th_disp_buffer);
555   {
556     dispatch_private_info_t *disp_buffer =
557         serial_team->t.t_dispatch->th_disp_buffer;
558     serial_team->t.t_dispatch->th_disp_buffer =
559         serial_team->t.t_dispatch->th_disp_buffer->next;
560     __kmp_free(disp_buffer);
561   }
562   this_thr->th.th_def_allocator = serial_team->t.t_def_allocator; // restore
563 
564   --serial_team->t.t_serialized;
565   if (serial_team->t.t_serialized == 0) {
566 
567 /* return to the parallel section */
568 
569 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
570     if (__kmp_inherit_fp_control && serial_team->t.t_fp_control_saved) {
571       __kmp_clear_x87_fpu_status_word();
572       __kmp_load_x87_fpu_control_word(&serial_team->t.t_x87_fpu_control_word);
573       __kmp_load_mxcsr(&serial_team->t.t_mxcsr);
574     }
575 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
576 
577     this_thr->th.th_team = serial_team->t.t_parent;
578     this_thr->th.th_info.ds.ds_tid = serial_team->t.t_master_tid;
579 
580     /* restore values cached in the thread */
581     this_thr->th.th_team_nproc = serial_team->t.t_parent->t.t_nproc; /*  JPH */
582     this_thr->th.th_team_master =
583         serial_team->t.t_parent->t.t_threads[0]; /* JPH */
584     this_thr->th.th_team_serialized = this_thr->th.th_team->t.t_serialized;
585 
586     /* TODO the below shouldn't need to be adjusted for serialized teams */
587     this_thr->th.th_dispatch =
588         &this_thr->th.th_team->t.t_dispatch[serial_team->t.t_master_tid];
589 
590     __kmp_pop_current_task_from_thread(this_thr);
591 
592     KMP_ASSERT(this_thr->th.th_current_task->td_flags.executing == 0);
593     this_thr->th.th_current_task->td_flags.executing = 1;
594 
595     if (__kmp_tasking_mode != tskm_immediate_exec) {
596       // Copy the task team from the new child / old parent team to the thread.
597       this_thr->th.th_task_team =
598           this_thr->th.th_team->t.t_task_team[this_thr->th.th_task_state];
599       KA_TRACE(20,
600                ("__kmpc_end_serialized_parallel: T#%d restoring task_team %p / "
601                 "team %p\n",
602                 global_tid, this_thr->th.th_task_team, this_thr->th.th_team));
603     }
604   } else {
605     if (__kmp_tasking_mode != tskm_immediate_exec) {
606       KA_TRACE(20, ("__kmpc_end_serialized_parallel: T#%d decreasing nesting "
607                     "depth of serial team %p to %d\n",
608                     global_tid, serial_team, serial_team->t.t_serialized));
609     }
610   }
611 
612   if (__kmp_env_consistency_check)
613     __kmp_pop_parallel(global_tid, NULL);
614 #if OMPT_SUPPORT
615   if (ompt_enabled.enabled)
616     this_thr->th.ompt_thread_info.state =
617         ((this_thr->th.th_team_serialized) ? ompt_state_work_serial
618                                            : ompt_state_work_parallel);
619 #endif
620 }
621 
622 /*!
623 @ingroup SYNCHRONIZATION
624 @param loc  source location information.
625 
626 Execute <tt>flush</tt>. This is implemented as a full memory fence. (Though
627 depending on the memory ordering convention obeyed by the compiler
628 even that may not be necessary).
629 */
630 void __kmpc_flush(ident_t *loc) {
631   KC_TRACE(10, ("__kmpc_flush: called\n"));
632 
633   /* need explicit __mf() here since use volatile instead in library */
634   KMP_MB(); /* Flush all pending memory write invalidates.  */
635 
636 #if (KMP_ARCH_X86 || KMP_ARCH_X86_64)
637 #if KMP_MIC
638 // fence-style instructions do not exist, but lock; xaddl $0,(%rsp) can be used.
639 // We shouldn't need it, though, since the ABI rules require that
640 // * If the compiler generates NGO stores it also generates the fence
641 // * If users hand-code NGO stores they should insert the fence
642 // therefore no incomplete unordered stores should be visible.
643 #else
644   // C74404
645   // This is to address non-temporal store instructions (sfence needed).
646   // The clflush instruction is addressed either (mfence needed).
647   // Probably the non-temporal load monvtdqa instruction should also be
648   // addressed.
649   // mfence is a SSE2 instruction. Do not execute it if CPU is not SSE2.
650   if (!__kmp_cpuinfo.initialized) {
651     __kmp_query_cpuid(&__kmp_cpuinfo);
652   }
653   if (!__kmp_cpuinfo.sse2) {
654     // CPU cannot execute SSE2 instructions.
655   } else {
656 #if KMP_COMPILER_ICC
657     _mm_mfence();
658 #elif KMP_COMPILER_MSVC
659     MemoryBarrier();
660 #else
661     __sync_synchronize();
662 #endif // KMP_COMPILER_ICC
663   }
664 #endif // KMP_MIC
665 #elif (KMP_ARCH_ARM || KMP_ARCH_AARCH64 || KMP_ARCH_MIPS || KMP_ARCH_MIPS64 || \
666        KMP_ARCH_RISCV64)
667 // Nothing to see here move along
668 #elif KMP_ARCH_PPC64
669 // Nothing needed here (we have a real MB above).
670 #if KMP_OS_CNK
671   // The flushing thread needs to yield here; this prevents a
672   // busy-waiting thread from saturating the pipeline. flush is
673   // often used in loops like this:
674   // while (!flag) {
675   //   #pragma omp flush(flag)
676   // }
677   // and adding the yield here is good for at least a 10x speedup
678   // when running >2 threads per core (on the NAS LU benchmark).
679   __kmp_yield();
680 #endif
681 #else
682 #error Unknown or unsupported architecture
683 #endif
684 
685 #if OMPT_SUPPORT && OMPT_OPTIONAL
686   if (ompt_enabled.ompt_callback_flush) {
687     ompt_callbacks.ompt_callback(ompt_callback_flush)(
688         __ompt_get_thread_data_internal(), OMPT_GET_RETURN_ADDRESS(0));
689   }
690 #endif
691 }
692 
693 /* -------------------------------------------------------------------------- */
694 /*!
695 @ingroup SYNCHRONIZATION
696 @param loc source location information
697 @param global_tid thread id.
698 
699 Execute a barrier.
700 */
701 void __kmpc_barrier(ident_t *loc, kmp_int32 global_tid) {
702   KMP_COUNT_BLOCK(OMP_BARRIER);
703   KC_TRACE(10, ("__kmpc_barrier: called T#%d\n", global_tid));
704 
705   if (!TCR_4(__kmp_init_parallel))
706     __kmp_parallel_initialize();
707 
708   __kmp_resume_if_soft_paused();
709 
710   if (__kmp_env_consistency_check) {
711     if (loc == 0) {
712       KMP_WARNING(ConstructIdentInvalid); // ??? What does it mean for the user?
713     }
714     __kmp_check_barrier(global_tid, ct_barrier, loc);
715   }
716 
717 #if OMPT_SUPPORT
718   ompt_frame_t *ompt_frame;
719   if (ompt_enabled.enabled) {
720     __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
721     if (ompt_frame->enter_frame.ptr == NULL)
722       ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
723     OMPT_STORE_RETURN_ADDRESS(global_tid);
724   }
725 #endif
726   __kmp_threads[global_tid]->th.th_ident = loc;
727   // TODO: explicit barrier_wait_id:
728   //   this function is called when 'barrier' directive is present or
729   //   implicit barrier at the end of a worksharing construct.
730   // 1) better to add a per-thread barrier counter to a thread data structure
731   // 2) set to 0 when a new team is created
732   // 4) no sync is required
733 
734   __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
735 #if OMPT_SUPPORT && OMPT_OPTIONAL
736   if (ompt_enabled.enabled) {
737     ompt_frame->enter_frame = ompt_data_none;
738   }
739 #endif
740 }
741 
742 /* The BARRIER for a MASTER section is always explicit   */
743 /*!
744 @ingroup WORK_SHARING
745 @param loc  source location information.
746 @param global_tid  global thread number .
747 @return 1 if this thread should execute the <tt>master</tt> block, 0 otherwise.
748 */
749 kmp_int32 __kmpc_master(ident_t *loc, kmp_int32 global_tid) {
750   int status = 0;
751 
752   KC_TRACE(10, ("__kmpc_master: called T#%d\n", global_tid));
753 
754   if (!TCR_4(__kmp_init_parallel))
755     __kmp_parallel_initialize();
756 
757   __kmp_resume_if_soft_paused();
758 
759   if (KMP_MASTER_GTID(global_tid)) {
760     KMP_COUNT_BLOCK(OMP_MASTER);
761     KMP_PUSH_PARTITIONED_TIMER(OMP_master);
762     status = 1;
763   }
764 
765 #if OMPT_SUPPORT && OMPT_OPTIONAL
766   if (status) {
767     if (ompt_enabled.ompt_callback_master) {
768       kmp_info_t *this_thr = __kmp_threads[global_tid];
769       kmp_team_t *team = this_thr->th.th_team;
770 
771       int tid = __kmp_tid_from_gtid(global_tid);
772       ompt_callbacks.ompt_callback(ompt_callback_master)(
773           ompt_scope_begin, &(team->t.ompt_team_info.parallel_data),
774           &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
775           OMPT_GET_RETURN_ADDRESS(0));
776     }
777   }
778 #endif
779 
780   if (__kmp_env_consistency_check) {
781 #if KMP_USE_DYNAMIC_LOCK
782     if (status)
783       __kmp_push_sync(global_tid, ct_master, loc, NULL, 0);
784     else
785       __kmp_check_sync(global_tid, ct_master, loc, NULL, 0);
786 #else
787     if (status)
788       __kmp_push_sync(global_tid, ct_master, loc, NULL);
789     else
790       __kmp_check_sync(global_tid, ct_master, loc, NULL);
791 #endif
792   }
793 
794   return status;
795 }
796 
797 /*!
798 @ingroup WORK_SHARING
799 @param loc  source location information.
800 @param global_tid  global thread number .
801 
802 Mark the end of a <tt>master</tt> region. This should only be called by the
803 thread that executes the <tt>master</tt> region.
804 */
805 void __kmpc_end_master(ident_t *loc, kmp_int32 global_tid) {
806   KC_TRACE(10, ("__kmpc_end_master: called T#%d\n", global_tid));
807 
808   KMP_DEBUG_ASSERT(KMP_MASTER_GTID(global_tid));
809   KMP_POP_PARTITIONED_TIMER();
810 
811 #if OMPT_SUPPORT && OMPT_OPTIONAL
812   kmp_info_t *this_thr = __kmp_threads[global_tid];
813   kmp_team_t *team = this_thr->th.th_team;
814   if (ompt_enabled.ompt_callback_master) {
815     int tid = __kmp_tid_from_gtid(global_tid);
816     ompt_callbacks.ompt_callback(ompt_callback_master)(
817         ompt_scope_end, &(team->t.ompt_team_info.parallel_data),
818         &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
819         OMPT_GET_RETURN_ADDRESS(0));
820   }
821 #endif
822 
823   if (__kmp_env_consistency_check) {
824     if (global_tid < 0)
825       KMP_WARNING(ThreadIdentInvalid);
826 
827     if (KMP_MASTER_GTID(global_tid))
828       __kmp_pop_sync(global_tid, ct_master, loc);
829   }
830 }
831 
832 /*!
833 @ingroup WORK_SHARING
834 @param loc  source location information.
835 @param gtid  global thread number.
836 
837 Start execution of an <tt>ordered</tt> construct.
838 */
839 void __kmpc_ordered(ident_t *loc, kmp_int32 gtid) {
840   int cid = 0;
841   kmp_info_t *th;
842   KMP_DEBUG_ASSERT(__kmp_init_serial);
843 
844   KC_TRACE(10, ("__kmpc_ordered: called T#%d\n", gtid));
845 
846   if (!TCR_4(__kmp_init_parallel))
847     __kmp_parallel_initialize();
848 
849   __kmp_resume_if_soft_paused();
850 
851 #if USE_ITT_BUILD
852   __kmp_itt_ordered_prep(gtid);
853 // TODO: ordered_wait_id
854 #endif /* USE_ITT_BUILD */
855 
856   th = __kmp_threads[gtid];
857 
858 #if OMPT_SUPPORT && OMPT_OPTIONAL
859   kmp_team_t *team;
860   ompt_wait_id_t lck;
861   void *codeptr_ra;
862   if (ompt_enabled.enabled) {
863     OMPT_STORE_RETURN_ADDRESS(gtid);
864     team = __kmp_team_from_gtid(gtid);
865     lck = (ompt_wait_id_t)(uintptr_t)&team->t.t_ordered.dt.t_value;
866     /* OMPT state update */
867     th->th.ompt_thread_info.wait_id = lck;
868     th->th.ompt_thread_info.state = ompt_state_wait_ordered;
869 
870     /* OMPT event callback */
871     codeptr_ra = OMPT_LOAD_RETURN_ADDRESS(gtid);
872     if (ompt_enabled.ompt_callback_mutex_acquire) {
873       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
874           ompt_mutex_ordered, omp_lock_hint_none, kmp_mutex_impl_spin, lck,
875           codeptr_ra);
876     }
877   }
878 #endif
879 
880   if (th->th.th_dispatch->th_deo_fcn != 0)
881     (*th->th.th_dispatch->th_deo_fcn)(&gtid, &cid, loc);
882   else
883     __kmp_parallel_deo(&gtid, &cid, loc);
884 
885 #if OMPT_SUPPORT && OMPT_OPTIONAL
886   if (ompt_enabled.enabled) {
887     /* OMPT state update */
888     th->th.ompt_thread_info.state = ompt_state_work_parallel;
889     th->th.ompt_thread_info.wait_id = 0;
890 
891     /* OMPT event callback */
892     if (ompt_enabled.ompt_callback_mutex_acquired) {
893       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
894           ompt_mutex_ordered, (ompt_wait_id_t)(uintptr_t)lck, codeptr_ra);
895     }
896   }
897 #endif
898 
899 #if USE_ITT_BUILD
900   __kmp_itt_ordered_start(gtid);
901 #endif /* USE_ITT_BUILD */
902 }
903 
904 /*!
905 @ingroup WORK_SHARING
906 @param loc  source location information.
907 @param gtid  global thread number.
908 
909 End execution of an <tt>ordered</tt> construct.
910 */
911 void __kmpc_end_ordered(ident_t *loc, kmp_int32 gtid) {
912   int cid = 0;
913   kmp_info_t *th;
914 
915   KC_TRACE(10, ("__kmpc_end_ordered: called T#%d\n", gtid));
916 
917 #if USE_ITT_BUILD
918   __kmp_itt_ordered_end(gtid);
919 // TODO: ordered_wait_id
920 #endif /* USE_ITT_BUILD */
921 
922   th = __kmp_threads[gtid];
923 
924   if (th->th.th_dispatch->th_dxo_fcn != 0)
925     (*th->th.th_dispatch->th_dxo_fcn)(&gtid, &cid, loc);
926   else
927     __kmp_parallel_dxo(&gtid, &cid, loc);
928 
929 #if OMPT_SUPPORT && OMPT_OPTIONAL
930   OMPT_STORE_RETURN_ADDRESS(gtid);
931   if (ompt_enabled.ompt_callback_mutex_released) {
932     ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
933         ompt_mutex_ordered,
934         (ompt_wait_id_t)(uintptr_t)&__kmp_team_from_gtid(gtid)
935             ->t.t_ordered.dt.t_value,
936         OMPT_LOAD_RETURN_ADDRESS(gtid));
937   }
938 #endif
939 }
940 
941 #if KMP_USE_DYNAMIC_LOCK
942 
943 static __forceinline void
944 __kmp_init_indirect_csptr(kmp_critical_name *crit, ident_t const *loc,
945                           kmp_int32 gtid, kmp_indirect_locktag_t tag) {
946   // Pointer to the allocated indirect lock is written to crit, while indexing
947   // is ignored.
948   void *idx;
949   kmp_indirect_lock_t **lck;
950   lck = (kmp_indirect_lock_t **)crit;
951   kmp_indirect_lock_t *ilk = __kmp_allocate_indirect_lock(&idx, gtid, tag);
952   KMP_I_LOCK_FUNC(ilk, init)(ilk->lock);
953   KMP_SET_I_LOCK_LOCATION(ilk, loc);
954   KMP_SET_I_LOCK_FLAGS(ilk, kmp_lf_critical_section);
955   KA_TRACE(20,
956            ("__kmp_init_indirect_csptr: initialized indirect lock #%d\n", tag));
957 #if USE_ITT_BUILD
958   __kmp_itt_critical_creating(ilk->lock, loc);
959 #endif
960   int status = KMP_COMPARE_AND_STORE_PTR(lck, nullptr, ilk);
961   if (status == 0) {
962 #if USE_ITT_BUILD
963     __kmp_itt_critical_destroyed(ilk->lock);
964 #endif
965     // We don't really need to destroy the unclaimed lock here since it will be
966     // cleaned up at program exit.
967     // KMP_D_LOCK_FUNC(&idx, destroy)((kmp_dyna_lock_t *)&idx);
968   }
969   KMP_DEBUG_ASSERT(*lck != NULL);
970 }
971 
972 // Fast-path acquire tas lock
973 #define KMP_ACQUIRE_TAS_LOCK(lock, gtid)                                       \
974   {                                                                            \
975     kmp_tas_lock_t *l = (kmp_tas_lock_t *)lock;                                \
976     kmp_int32 tas_free = KMP_LOCK_FREE(tas);                                   \
977     kmp_int32 tas_busy = KMP_LOCK_BUSY(gtid + 1, tas);                         \
978     if (KMP_ATOMIC_LD_RLX(&l->lk.poll) != tas_free ||                          \
979         !__kmp_atomic_compare_store_acq(&l->lk.poll, tas_free, tas_busy)) {    \
980       kmp_uint32 spins;                                                        \
981       KMP_FSYNC_PREPARE(l);                                                    \
982       KMP_INIT_YIELD(spins);                                                   \
983       kmp_backoff_t backoff = __kmp_spin_backoff_params;                       \
984       do {                                                                     \
985         if (TCR_4(__kmp_nth) >                                                 \
986             (__kmp_avail_proc ? __kmp_avail_proc : __kmp_xproc)) {             \
987           KMP_YIELD(TRUE);                                                     \
988         } else {                                                               \
989           KMP_YIELD_SPIN(spins);                                               \
990         }                                                                      \
991         __kmp_spin_backoff(&backoff);                                          \
992       } while (                                                                \
993           KMP_ATOMIC_LD_RLX(&l->lk.poll) != tas_free ||                        \
994           !__kmp_atomic_compare_store_acq(&l->lk.poll, tas_free, tas_busy));   \
995     }                                                                          \
996     KMP_FSYNC_ACQUIRED(l);                                                     \
997   }
998 
999 // Fast-path test tas lock
1000 #define KMP_TEST_TAS_LOCK(lock, gtid, rc)                                      \
1001   {                                                                            \
1002     kmp_tas_lock_t *l = (kmp_tas_lock_t *)lock;                                \
1003     kmp_int32 tas_free = KMP_LOCK_FREE(tas);                                   \
1004     kmp_int32 tas_busy = KMP_LOCK_BUSY(gtid + 1, tas);                         \
1005     rc = KMP_ATOMIC_LD_RLX(&l->lk.poll) == tas_free &&                         \
1006          __kmp_atomic_compare_store_acq(&l->lk.poll, tas_free, tas_busy);      \
1007   }
1008 
1009 // Fast-path release tas lock
1010 #define KMP_RELEASE_TAS_LOCK(lock, gtid)                                       \
1011   { KMP_ATOMIC_ST_REL(&((kmp_tas_lock_t *)lock)->lk.poll, KMP_LOCK_FREE(tas)); }
1012 
1013 #if KMP_USE_FUTEX
1014 
1015 #include <sys/syscall.h>
1016 #include <unistd.h>
1017 #ifndef FUTEX_WAIT
1018 #define FUTEX_WAIT 0
1019 #endif
1020 #ifndef FUTEX_WAKE
1021 #define FUTEX_WAKE 1
1022 #endif
1023 
1024 // Fast-path acquire futex lock
1025 #define KMP_ACQUIRE_FUTEX_LOCK(lock, gtid)                                     \
1026   {                                                                            \
1027     kmp_futex_lock_t *ftx = (kmp_futex_lock_t *)lock;                          \
1028     kmp_int32 gtid_code = (gtid + 1) << 1;                                     \
1029     KMP_MB();                                                                  \
1030     KMP_FSYNC_PREPARE(ftx);                                                    \
1031     kmp_int32 poll_val;                                                        \
1032     while ((poll_val = KMP_COMPARE_AND_STORE_RET32(                            \
1033                 &(ftx->lk.poll), KMP_LOCK_FREE(futex),                         \
1034                 KMP_LOCK_BUSY(gtid_code, futex))) != KMP_LOCK_FREE(futex)) {   \
1035       kmp_int32 cond = KMP_LOCK_STRIP(poll_val) & 1;                           \
1036       if (!cond) {                                                             \
1037         if (!KMP_COMPARE_AND_STORE_RET32(&(ftx->lk.poll), poll_val,            \
1038                                          poll_val |                            \
1039                                              KMP_LOCK_BUSY(1, futex))) {       \
1040           continue;                                                            \
1041         }                                                                      \
1042         poll_val |= KMP_LOCK_BUSY(1, futex);                                   \
1043       }                                                                        \
1044       kmp_int32 rc;                                                            \
1045       if ((rc = syscall(__NR_futex, &(ftx->lk.poll), FUTEX_WAIT, poll_val,     \
1046                         NULL, NULL, 0)) != 0) {                                \
1047         continue;                                                              \
1048       }                                                                        \
1049       gtid_code |= 1;                                                          \
1050     }                                                                          \
1051     KMP_FSYNC_ACQUIRED(ftx);                                                   \
1052   }
1053 
1054 // Fast-path test futex lock
1055 #define KMP_TEST_FUTEX_LOCK(lock, gtid, rc)                                    \
1056   {                                                                            \
1057     kmp_futex_lock_t *ftx = (kmp_futex_lock_t *)lock;                          \
1058     if (KMP_COMPARE_AND_STORE_ACQ32(&(ftx->lk.poll), KMP_LOCK_FREE(futex),     \
1059                                     KMP_LOCK_BUSY(gtid + 1 << 1, futex))) {    \
1060       KMP_FSYNC_ACQUIRED(ftx);                                                 \
1061       rc = TRUE;                                                               \
1062     } else {                                                                   \
1063       rc = FALSE;                                                              \
1064     }                                                                          \
1065   }
1066 
1067 // Fast-path release futex lock
1068 #define KMP_RELEASE_FUTEX_LOCK(lock, gtid)                                     \
1069   {                                                                            \
1070     kmp_futex_lock_t *ftx = (kmp_futex_lock_t *)lock;                          \
1071     KMP_MB();                                                                  \
1072     KMP_FSYNC_RELEASING(ftx);                                                  \
1073     kmp_int32 poll_val =                                                       \
1074         KMP_XCHG_FIXED32(&(ftx->lk.poll), KMP_LOCK_FREE(futex));               \
1075     if (KMP_LOCK_STRIP(poll_val) & 1) {                                        \
1076       syscall(__NR_futex, &(ftx->lk.poll), FUTEX_WAKE,                         \
1077               KMP_LOCK_BUSY(1, futex), NULL, NULL, 0);                         \
1078     }                                                                          \
1079     KMP_MB();                                                                  \
1080     KMP_YIELD_OVERSUB();                                                       \
1081   }
1082 
1083 #endif // KMP_USE_FUTEX
1084 
1085 #else // KMP_USE_DYNAMIC_LOCK
1086 
1087 static kmp_user_lock_p __kmp_get_critical_section_ptr(kmp_critical_name *crit,
1088                                                       ident_t const *loc,
1089                                                       kmp_int32 gtid) {
1090   kmp_user_lock_p *lck_pp = (kmp_user_lock_p *)crit;
1091 
1092   // Because of the double-check, the following load doesn't need to be volatile
1093   kmp_user_lock_p lck = (kmp_user_lock_p)TCR_PTR(*lck_pp);
1094 
1095   if (lck == NULL) {
1096     void *idx;
1097 
1098     // Allocate & initialize the lock.
1099     // Remember alloc'ed locks in table in order to free them in __kmp_cleanup()
1100     lck = __kmp_user_lock_allocate(&idx, gtid, kmp_lf_critical_section);
1101     __kmp_init_user_lock_with_checks(lck);
1102     __kmp_set_user_lock_location(lck, loc);
1103 #if USE_ITT_BUILD
1104     __kmp_itt_critical_creating(lck);
1105 // __kmp_itt_critical_creating() should be called *before* the first usage
1106 // of underlying lock. It is the only place where we can guarantee it. There
1107 // are chances the lock will destroyed with no usage, but it is not a
1108 // problem, because this is not real event seen by user but rather setting
1109 // name for object (lock). See more details in kmp_itt.h.
1110 #endif /* USE_ITT_BUILD */
1111 
1112     // Use a cmpxchg instruction to slam the start of the critical section with
1113     // the lock pointer.  If another thread beat us to it, deallocate the lock,
1114     // and use the lock that the other thread allocated.
1115     int status = KMP_COMPARE_AND_STORE_PTR(lck_pp, 0, lck);
1116 
1117     if (status == 0) {
1118 // Deallocate the lock and reload the value.
1119 #if USE_ITT_BUILD
1120       __kmp_itt_critical_destroyed(lck);
1121 // Let ITT know the lock is destroyed and the same memory location may be reused
1122 // for another purpose.
1123 #endif /* USE_ITT_BUILD */
1124       __kmp_destroy_user_lock_with_checks(lck);
1125       __kmp_user_lock_free(&idx, gtid, lck);
1126       lck = (kmp_user_lock_p)TCR_PTR(*lck_pp);
1127       KMP_DEBUG_ASSERT(lck != NULL);
1128     }
1129   }
1130   return lck;
1131 }
1132 
1133 #endif // KMP_USE_DYNAMIC_LOCK
1134 
1135 /*!
1136 @ingroup WORK_SHARING
1137 @param loc  source location information.
1138 @param global_tid  global thread number .
1139 @param crit identity of the critical section. This could be a pointer to a lock
1140 associated with the critical section, or some other suitably unique value.
1141 
1142 Enter code protected by a `critical` construct.
1143 This function blocks until the executing thread can enter the critical section.
1144 */
1145 void __kmpc_critical(ident_t *loc, kmp_int32 global_tid,
1146                      kmp_critical_name *crit) {
1147 #if KMP_USE_DYNAMIC_LOCK
1148 #if OMPT_SUPPORT && OMPT_OPTIONAL
1149   OMPT_STORE_RETURN_ADDRESS(global_tid);
1150 #endif // OMPT_SUPPORT
1151   __kmpc_critical_with_hint(loc, global_tid, crit, omp_lock_hint_none);
1152 #else
1153   KMP_COUNT_BLOCK(OMP_CRITICAL);
1154 #if OMPT_SUPPORT && OMPT_OPTIONAL
1155   ompt_state_t prev_state = ompt_state_undefined;
1156   ompt_thread_info_t ti;
1157 #endif
1158   kmp_user_lock_p lck;
1159 
1160   KC_TRACE(10, ("__kmpc_critical: called T#%d\n", global_tid));
1161 
1162   // TODO: add THR_OVHD_STATE
1163 
1164   KMP_PUSH_PARTITIONED_TIMER(OMP_critical_wait);
1165   KMP_CHECK_USER_LOCK_INIT();
1166 
1167   if ((__kmp_user_lock_kind == lk_tas) &&
1168       (sizeof(lck->tas.lk.poll) <= OMP_CRITICAL_SIZE)) {
1169     lck = (kmp_user_lock_p)crit;
1170   }
1171 #if KMP_USE_FUTEX
1172   else if ((__kmp_user_lock_kind == lk_futex) &&
1173            (sizeof(lck->futex.lk.poll) <= OMP_CRITICAL_SIZE)) {
1174     lck = (kmp_user_lock_p)crit;
1175   }
1176 #endif
1177   else { // ticket, queuing or drdpa
1178     lck = __kmp_get_critical_section_ptr(crit, loc, global_tid);
1179   }
1180 
1181   if (__kmp_env_consistency_check)
1182     __kmp_push_sync(global_tid, ct_critical, loc, lck);
1183 
1184 // since the critical directive binds to all threads, not just the current
1185 // team we have to check this even if we are in a serialized team.
1186 // also, even if we are the uber thread, we still have to conduct the lock,
1187 // as we have to contend with sibling threads.
1188 
1189 #if USE_ITT_BUILD
1190   __kmp_itt_critical_acquiring(lck);
1191 #endif /* USE_ITT_BUILD */
1192 #if OMPT_SUPPORT && OMPT_OPTIONAL
1193   OMPT_STORE_RETURN_ADDRESS(gtid);
1194   void *codeptr_ra = NULL;
1195   if (ompt_enabled.enabled) {
1196     ti = __kmp_threads[global_tid]->th.ompt_thread_info;
1197     /* OMPT state update */
1198     prev_state = ti.state;
1199     ti.wait_id = (ompt_wait_id_t)(uintptr_t)lck;
1200     ti.state = ompt_state_wait_critical;
1201 
1202     /* OMPT event callback */
1203     codeptr_ra = OMPT_LOAD_RETURN_ADDRESS(gtid);
1204     if (ompt_enabled.ompt_callback_mutex_acquire) {
1205       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
1206           ompt_mutex_critical, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
1207           (ompt_wait_id_t)(uintptr_t)lck, codeptr_ra);
1208     }
1209   }
1210 #endif
1211   // Value of 'crit' should be good for using as a critical_id of the critical
1212   // section directive.
1213   __kmp_acquire_user_lock_with_checks(lck, global_tid);
1214 
1215 #if USE_ITT_BUILD
1216   __kmp_itt_critical_acquired(lck);
1217 #endif /* USE_ITT_BUILD */
1218 #if OMPT_SUPPORT && OMPT_OPTIONAL
1219   if (ompt_enabled.enabled) {
1220     /* OMPT state update */
1221     ti.state = prev_state;
1222     ti.wait_id = 0;
1223 
1224     /* OMPT event callback */
1225     if (ompt_enabled.ompt_callback_mutex_acquired) {
1226       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
1227           ompt_mutex_critical, (ompt_wait_id_t)(uintptr_t)lck, codeptr_ra);
1228     }
1229   }
1230 #endif
1231   KMP_POP_PARTITIONED_TIMER();
1232 
1233   KMP_PUSH_PARTITIONED_TIMER(OMP_critical);
1234   KA_TRACE(15, ("__kmpc_critical: done T#%d\n", global_tid));
1235 #endif // KMP_USE_DYNAMIC_LOCK
1236 }
1237 
1238 #if KMP_USE_DYNAMIC_LOCK
1239 
1240 // Converts the given hint to an internal lock implementation
1241 static __forceinline kmp_dyna_lockseq_t __kmp_map_hint_to_lock(uintptr_t hint) {
1242 #if KMP_USE_TSX
1243 #define KMP_TSX_LOCK(seq) lockseq_##seq
1244 #else
1245 #define KMP_TSX_LOCK(seq) __kmp_user_lock_seq
1246 #endif
1247 
1248 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
1249 #define KMP_CPUINFO_RTM (__kmp_cpuinfo.rtm)
1250 #else
1251 #define KMP_CPUINFO_RTM 0
1252 #endif
1253 
1254   // Hints that do not require further logic
1255   if (hint & kmp_lock_hint_hle)
1256     return KMP_TSX_LOCK(hle);
1257   if (hint & kmp_lock_hint_rtm)
1258     return KMP_CPUINFO_RTM ? KMP_TSX_LOCK(rtm) : __kmp_user_lock_seq;
1259   if (hint & kmp_lock_hint_adaptive)
1260     return KMP_CPUINFO_RTM ? KMP_TSX_LOCK(adaptive) : __kmp_user_lock_seq;
1261 
1262   // Rule out conflicting hints first by returning the default lock
1263   if ((hint & omp_lock_hint_contended) && (hint & omp_lock_hint_uncontended))
1264     return __kmp_user_lock_seq;
1265   if ((hint & omp_lock_hint_speculative) &&
1266       (hint & omp_lock_hint_nonspeculative))
1267     return __kmp_user_lock_seq;
1268 
1269   // Do not even consider speculation when it appears to be contended
1270   if (hint & omp_lock_hint_contended)
1271     return lockseq_queuing;
1272 
1273   // Uncontended lock without speculation
1274   if ((hint & omp_lock_hint_uncontended) && !(hint & omp_lock_hint_speculative))
1275     return lockseq_tas;
1276 
1277   // HLE lock for speculation
1278   if (hint & omp_lock_hint_speculative)
1279     return KMP_TSX_LOCK(hle);
1280 
1281   return __kmp_user_lock_seq;
1282 }
1283 
1284 #if OMPT_SUPPORT && OMPT_OPTIONAL
1285 #if KMP_USE_DYNAMIC_LOCK
1286 static kmp_mutex_impl_t
1287 __ompt_get_mutex_impl_type(void *user_lock, kmp_indirect_lock_t *ilock = 0) {
1288   if (user_lock) {
1289     switch (KMP_EXTRACT_D_TAG(user_lock)) {
1290     case 0:
1291       break;
1292 #if KMP_USE_FUTEX
1293     case locktag_futex:
1294       return kmp_mutex_impl_queuing;
1295 #endif
1296     case locktag_tas:
1297       return kmp_mutex_impl_spin;
1298 #if KMP_USE_TSX
1299     case locktag_hle:
1300       return kmp_mutex_impl_speculative;
1301 #endif
1302     default:
1303       return kmp_mutex_impl_none;
1304     }
1305     ilock = KMP_LOOKUP_I_LOCK(user_lock);
1306   }
1307   KMP_ASSERT(ilock);
1308   switch (ilock->type) {
1309 #if KMP_USE_TSX
1310   case locktag_adaptive:
1311   case locktag_rtm:
1312     return kmp_mutex_impl_speculative;
1313 #endif
1314   case locktag_nested_tas:
1315     return kmp_mutex_impl_spin;
1316 #if KMP_USE_FUTEX
1317   case locktag_nested_futex:
1318 #endif
1319   case locktag_ticket:
1320   case locktag_queuing:
1321   case locktag_drdpa:
1322   case locktag_nested_ticket:
1323   case locktag_nested_queuing:
1324   case locktag_nested_drdpa:
1325     return kmp_mutex_impl_queuing;
1326   default:
1327     return kmp_mutex_impl_none;
1328   }
1329 }
1330 #else
1331 // For locks without dynamic binding
1332 static kmp_mutex_impl_t __ompt_get_mutex_impl_type() {
1333   switch (__kmp_user_lock_kind) {
1334   case lk_tas:
1335     return kmp_mutex_impl_spin;
1336 #if KMP_USE_FUTEX
1337   case lk_futex:
1338 #endif
1339   case lk_ticket:
1340   case lk_queuing:
1341   case lk_drdpa:
1342     return kmp_mutex_impl_queuing;
1343 #if KMP_USE_TSX
1344   case lk_hle:
1345   case lk_rtm:
1346   case lk_adaptive:
1347     return kmp_mutex_impl_speculative;
1348 #endif
1349   default:
1350     return kmp_mutex_impl_none;
1351   }
1352 }
1353 #endif // KMP_USE_DYNAMIC_LOCK
1354 #endif // OMPT_SUPPORT && OMPT_OPTIONAL
1355 
1356 /*!
1357 @ingroup WORK_SHARING
1358 @param loc  source location information.
1359 @param global_tid  global thread number.
1360 @param crit identity of the critical section. This could be a pointer to a lock
1361 associated with the critical section, or some other suitably unique value.
1362 @param hint the lock hint.
1363 
1364 Enter code protected by a `critical` construct with a hint. The hint value is
1365 used to suggest a lock implementation. This function blocks until the executing
1366 thread can enter the critical section unless the hint suggests use of
1367 speculative execution and the hardware supports it.
1368 */
1369 void __kmpc_critical_with_hint(ident_t *loc, kmp_int32 global_tid,
1370                                kmp_critical_name *crit, uint32_t hint) {
1371   KMP_COUNT_BLOCK(OMP_CRITICAL);
1372   kmp_user_lock_p lck;
1373 #if OMPT_SUPPORT && OMPT_OPTIONAL
1374   ompt_state_t prev_state = ompt_state_undefined;
1375   ompt_thread_info_t ti;
1376   // This is the case, if called from __kmpc_critical:
1377   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(global_tid);
1378   if (!codeptr)
1379     codeptr = OMPT_GET_RETURN_ADDRESS(0);
1380 #endif
1381 
1382   KC_TRACE(10, ("__kmpc_critical: called T#%d\n", global_tid));
1383 
1384   kmp_dyna_lock_t *lk = (kmp_dyna_lock_t *)crit;
1385   // Check if it is initialized.
1386   KMP_PUSH_PARTITIONED_TIMER(OMP_critical_wait);
1387   if (*lk == 0) {
1388     kmp_dyna_lockseq_t lckseq = __kmp_map_hint_to_lock(hint);
1389     if (KMP_IS_D_LOCK(lckseq)) {
1390       KMP_COMPARE_AND_STORE_ACQ32((volatile kmp_int32 *)crit, 0,
1391                                   KMP_GET_D_TAG(lckseq));
1392     } else {
1393       __kmp_init_indirect_csptr(crit, loc, global_tid, KMP_GET_I_TAG(lckseq));
1394     }
1395   }
1396   // Branch for accessing the actual lock object and set operation. This
1397   // branching is inevitable since this lock initialization does not follow the
1398   // normal dispatch path (lock table is not used).
1399   if (KMP_EXTRACT_D_TAG(lk) != 0) {
1400     lck = (kmp_user_lock_p)lk;
1401     if (__kmp_env_consistency_check) {
1402       __kmp_push_sync(global_tid, ct_critical, loc, lck,
1403                       __kmp_map_hint_to_lock(hint));
1404     }
1405 #if USE_ITT_BUILD
1406     __kmp_itt_critical_acquiring(lck);
1407 #endif
1408 #if OMPT_SUPPORT && OMPT_OPTIONAL
1409     if (ompt_enabled.enabled) {
1410       ti = __kmp_threads[global_tid]->th.ompt_thread_info;
1411       /* OMPT state update */
1412       prev_state = ti.state;
1413       ti.wait_id = (ompt_wait_id_t)(uintptr_t)lck;
1414       ti.state = ompt_state_wait_critical;
1415 
1416       /* OMPT event callback */
1417       if (ompt_enabled.ompt_callback_mutex_acquire) {
1418         ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
1419             ompt_mutex_critical, (unsigned int)hint,
1420             __ompt_get_mutex_impl_type(crit), (ompt_wait_id_t)(uintptr_t)lck,
1421             codeptr);
1422       }
1423     }
1424 #endif
1425 #if KMP_USE_INLINED_TAS
1426     if (__kmp_user_lock_seq == lockseq_tas && !__kmp_env_consistency_check) {
1427       KMP_ACQUIRE_TAS_LOCK(lck, global_tid);
1428     } else
1429 #elif KMP_USE_INLINED_FUTEX
1430     if (__kmp_user_lock_seq == lockseq_futex && !__kmp_env_consistency_check) {
1431       KMP_ACQUIRE_FUTEX_LOCK(lck, global_tid);
1432     } else
1433 #endif
1434     {
1435       KMP_D_LOCK_FUNC(lk, set)(lk, global_tid);
1436     }
1437   } else {
1438     kmp_indirect_lock_t *ilk = *((kmp_indirect_lock_t **)lk);
1439     lck = ilk->lock;
1440     if (__kmp_env_consistency_check) {
1441       __kmp_push_sync(global_tid, ct_critical, loc, lck,
1442                       __kmp_map_hint_to_lock(hint));
1443     }
1444 #if USE_ITT_BUILD
1445     __kmp_itt_critical_acquiring(lck);
1446 #endif
1447 #if OMPT_SUPPORT && OMPT_OPTIONAL
1448     if (ompt_enabled.enabled) {
1449       ti = __kmp_threads[global_tid]->th.ompt_thread_info;
1450       /* OMPT state update */
1451       prev_state = ti.state;
1452       ti.wait_id = (ompt_wait_id_t)(uintptr_t)lck;
1453       ti.state = ompt_state_wait_critical;
1454 
1455       /* OMPT event callback */
1456       if (ompt_enabled.ompt_callback_mutex_acquire) {
1457         ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
1458             ompt_mutex_critical, (unsigned int)hint,
1459             __ompt_get_mutex_impl_type(0, ilk), (ompt_wait_id_t)(uintptr_t)lck,
1460             codeptr);
1461       }
1462     }
1463 #endif
1464     KMP_I_LOCK_FUNC(ilk, set)(lck, global_tid);
1465   }
1466   KMP_POP_PARTITIONED_TIMER();
1467 
1468 #if USE_ITT_BUILD
1469   __kmp_itt_critical_acquired(lck);
1470 #endif /* USE_ITT_BUILD */
1471 #if OMPT_SUPPORT && OMPT_OPTIONAL
1472   if (ompt_enabled.enabled) {
1473     /* OMPT state update */
1474     ti.state = prev_state;
1475     ti.wait_id = 0;
1476 
1477     /* OMPT event callback */
1478     if (ompt_enabled.ompt_callback_mutex_acquired) {
1479       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
1480           ompt_mutex_critical, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
1481     }
1482   }
1483 #endif
1484 
1485   KMP_PUSH_PARTITIONED_TIMER(OMP_critical);
1486   KA_TRACE(15, ("__kmpc_critical: done T#%d\n", global_tid));
1487 } // __kmpc_critical_with_hint
1488 
1489 #endif // KMP_USE_DYNAMIC_LOCK
1490 
1491 /*!
1492 @ingroup WORK_SHARING
1493 @param loc  source location information.
1494 @param global_tid  global thread number .
1495 @param crit identity of the critical section. This could be a pointer to a lock
1496 associated with the critical section, or some other suitably unique value.
1497 
1498 Leave a critical section, releasing any lock that was held during its execution.
1499 */
1500 void __kmpc_end_critical(ident_t *loc, kmp_int32 global_tid,
1501                          kmp_critical_name *crit) {
1502   kmp_user_lock_p lck;
1503 
1504   KC_TRACE(10, ("__kmpc_end_critical: called T#%d\n", global_tid));
1505 
1506 #if KMP_USE_DYNAMIC_LOCK
1507   if (KMP_IS_D_LOCK(__kmp_user_lock_seq)) {
1508     lck = (kmp_user_lock_p)crit;
1509     KMP_ASSERT(lck != NULL);
1510     if (__kmp_env_consistency_check) {
1511       __kmp_pop_sync(global_tid, ct_critical, loc);
1512     }
1513 #if USE_ITT_BUILD
1514     __kmp_itt_critical_releasing(lck);
1515 #endif
1516 #if KMP_USE_INLINED_TAS
1517     if (__kmp_user_lock_seq == lockseq_tas && !__kmp_env_consistency_check) {
1518       KMP_RELEASE_TAS_LOCK(lck, global_tid);
1519     } else
1520 #elif KMP_USE_INLINED_FUTEX
1521     if (__kmp_user_lock_seq == lockseq_futex && !__kmp_env_consistency_check) {
1522       KMP_RELEASE_FUTEX_LOCK(lck, global_tid);
1523     } else
1524 #endif
1525     {
1526       KMP_D_LOCK_FUNC(lck, unset)((kmp_dyna_lock_t *)lck, global_tid);
1527     }
1528   } else {
1529     kmp_indirect_lock_t *ilk =
1530         (kmp_indirect_lock_t *)TCR_PTR(*((kmp_indirect_lock_t **)crit));
1531     KMP_ASSERT(ilk != NULL);
1532     lck = ilk->lock;
1533     if (__kmp_env_consistency_check) {
1534       __kmp_pop_sync(global_tid, ct_critical, loc);
1535     }
1536 #if USE_ITT_BUILD
1537     __kmp_itt_critical_releasing(lck);
1538 #endif
1539     KMP_I_LOCK_FUNC(ilk, unset)(lck, global_tid);
1540   }
1541 
1542 #else // KMP_USE_DYNAMIC_LOCK
1543 
1544   if ((__kmp_user_lock_kind == lk_tas) &&
1545       (sizeof(lck->tas.lk.poll) <= OMP_CRITICAL_SIZE)) {
1546     lck = (kmp_user_lock_p)crit;
1547   }
1548 #if KMP_USE_FUTEX
1549   else if ((__kmp_user_lock_kind == lk_futex) &&
1550            (sizeof(lck->futex.lk.poll) <= OMP_CRITICAL_SIZE)) {
1551     lck = (kmp_user_lock_p)crit;
1552   }
1553 #endif
1554   else { // ticket, queuing or drdpa
1555     lck = (kmp_user_lock_p)TCR_PTR(*((kmp_user_lock_p *)crit));
1556   }
1557 
1558   KMP_ASSERT(lck != NULL);
1559 
1560   if (__kmp_env_consistency_check)
1561     __kmp_pop_sync(global_tid, ct_critical, loc);
1562 
1563 #if USE_ITT_BUILD
1564   __kmp_itt_critical_releasing(lck);
1565 #endif /* USE_ITT_BUILD */
1566   // Value of 'crit' should be good for using as a critical_id of the critical
1567   // section directive.
1568   __kmp_release_user_lock_with_checks(lck, global_tid);
1569 
1570 #endif // KMP_USE_DYNAMIC_LOCK
1571 
1572 #if OMPT_SUPPORT && OMPT_OPTIONAL
1573   /* OMPT release event triggers after lock is released; place here to trigger
1574    * for all #if branches */
1575   OMPT_STORE_RETURN_ADDRESS(global_tid);
1576   if (ompt_enabled.ompt_callback_mutex_released) {
1577     ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
1578         ompt_mutex_critical, (ompt_wait_id_t)(uintptr_t)lck,
1579         OMPT_LOAD_RETURN_ADDRESS(0));
1580   }
1581 #endif
1582 
1583   KMP_POP_PARTITIONED_TIMER();
1584   KA_TRACE(15, ("__kmpc_end_critical: done T#%d\n", global_tid));
1585 }
1586 
1587 /*!
1588 @ingroup SYNCHRONIZATION
1589 @param loc source location information
1590 @param global_tid thread id.
1591 @return one if the thread should execute the master block, zero otherwise
1592 
1593 Start execution of a combined barrier and master. The barrier is executed inside
1594 this function.
1595 */
1596 kmp_int32 __kmpc_barrier_master(ident_t *loc, kmp_int32 global_tid) {
1597   int status;
1598 
1599   KC_TRACE(10, ("__kmpc_barrier_master: called T#%d\n", global_tid));
1600 
1601   if (!TCR_4(__kmp_init_parallel))
1602     __kmp_parallel_initialize();
1603 
1604   __kmp_resume_if_soft_paused();
1605 
1606   if (__kmp_env_consistency_check)
1607     __kmp_check_barrier(global_tid, ct_barrier, loc);
1608 
1609 #if OMPT_SUPPORT
1610   ompt_frame_t *ompt_frame;
1611   if (ompt_enabled.enabled) {
1612     __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
1613     if (ompt_frame->enter_frame.ptr == NULL)
1614       ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1615     OMPT_STORE_RETURN_ADDRESS(global_tid);
1616   }
1617 #endif
1618 #if USE_ITT_NOTIFY
1619   __kmp_threads[global_tid]->th.th_ident = loc;
1620 #endif
1621   status = __kmp_barrier(bs_plain_barrier, global_tid, TRUE, 0, NULL, NULL);
1622 #if OMPT_SUPPORT && OMPT_OPTIONAL
1623   if (ompt_enabled.enabled) {
1624     ompt_frame->enter_frame = ompt_data_none;
1625   }
1626 #endif
1627 
1628   return (status != 0) ? 0 : 1;
1629 }
1630 
1631 /*!
1632 @ingroup SYNCHRONIZATION
1633 @param loc source location information
1634 @param global_tid thread id.
1635 
1636 Complete the execution of a combined barrier and master. This function should
1637 only be called at the completion of the <tt>master</tt> code. Other threads will
1638 still be waiting at the barrier and this call releases them.
1639 */
1640 void __kmpc_end_barrier_master(ident_t *loc, kmp_int32 global_tid) {
1641   KC_TRACE(10, ("__kmpc_end_barrier_master: called T#%d\n", global_tid));
1642 
1643   __kmp_end_split_barrier(bs_plain_barrier, global_tid);
1644 }
1645 
1646 /*!
1647 @ingroup SYNCHRONIZATION
1648 @param loc source location information
1649 @param global_tid thread id.
1650 @return one if the thread should execute the master block, zero otherwise
1651 
1652 Start execution of a combined barrier and master(nowait) construct.
1653 The barrier is executed inside this function.
1654 There is no equivalent "end" function, since the
1655 */
1656 kmp_int32 __kmpc_barrier_master_nowait(ident_t *loc, kmp_int32 global_tid) {
1657   kmp_int32 ret;
1658 
1659   KC_TRACE(10, ("__kmpc_barrier_master_nowait: called T#%d\n", global_tid));
1660 
1661   if (!TCR_4(__kmp_init_parallel))
1662     __kmp_parallel_initialize();
1663 
1664   __kmp_resume_if_soft_paused();
1665 
1666   if (__kmp_env_consistency_check) {
1667     if (loc == 0) {
1668       KMP_WARNING(ConstructIdentInvalid); // ??? What does it mean for the user?
1669     }
1670     __kmp_check_barrier(global_tid, ct_barrier, loc);
1671   }
1672 
1673 #if OMPT_SUPPORT
1674   ompt_frame_t *ompt_frame;
1675   if (ompt_enabled.enabled) {
1676     __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
1677     if (ompt_frame->enter_frame.ptr == NULL)
1678       ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1679     OMPT_STORE_RETURN_ADDRESS(global_tid);
1680   }
1681 #endif
1682 #if USE_ITT_NOTIFY
1683   __kmp_threads[global_tid]->th.th_ident = loc;
1684 #endif
1685   __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
1686 #if OMPT_SUPPORT && OMPT_OPTIONAL
1687   if (ompt_enabled.enabled) {
1688     ompt_frame->enter_frame = ompt_data_none;
1689   }
1690 #endif
1691 
1692   ret = __kmpc_master(loc, global_tid);
1693 
1694   if (__kmp_env_consistency_check) {
1695     /*  there's no __kmpc_end_master called; so the (stats) */
1696     /*  actions of __kmpc_end_master are done here          */
1697 
1698     if (global_tid < 0) {
1699       KMP_WARNING(ThreadIdentInvalid);
1700     }
1701     if (ret) {
1702       /* only one thread should do the pop since only */
1703       /* one did the push (see __kmpc_master())       */
1704 
1705       __kmp_pop_sync(global_tid, ct_master, loc);
1706     }
1707   }
1708 
1709   return (ret);
1710 }
1711 
1712 /* The BARRIER for a SINGLE process section is always explicit   */
1713 /*!
1714 @ingroup WORK_SHARING
1715 @param loc  source location information
1716 @param global_tid  global thread number
1717 @return One if this thread should execute the single construct, zero otherwise.
1718 
1719 Test whether to execute a <tt>single</tt> construct.
1720 There are no implicit barriers in the two "single" calls, rather the compiler
1721 should introduce an explicit barrier if it is required.
1722 */
1723 
1724 kmp_int32 __kmpc_single(ident_t *loc, kmp_int32 global_tid) {
1725   kmp_int32 rc = __kmp_enter_single(global_tid, loc, TRUE);
1726 
1727   if (rc) {
1728     // We are going to execute the single statement, so we should count it.
1729     KMP_COUNT_BLOCK(OMP_SINGLE);
1730     KMP_PUSH_PARTITIONED_TIMER(OMP_single);
1731   }
1732 
1733 #if OMPT_SUPPORT && OMPT_OPTIONAL
1734   kmp_info_t *this_thr = __kmp_threads[global_tid];
1735   kmp_team_t *team = this_thr->th.th_team;
1736   int tid = __kmp_tid_from_gtid(global_tid);
1737 
1738   if (ompt_enabled.enabled) {
1739     if (rc) {
1740       if (ompt_enabled.ompt_callback_work) {
1741         ompt_callbacks.ompt_callback(ompt_callback_work)(
1742             ompt_work_single_executor, ompt_scope_begin,
1743             &(team->t.ompt_team_info.parallel_data),
1744             &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
1745             1, OMPT_GET_RETURN_ADDRESS(0));
1746       }
1747     } else {
1748       if (ompt_enabled.ompt_callback_work) {
1749         ompt_callbacks.ompt_callback(ompt_callback_work)(
1750             ompt_work_single_other, ompt_scope_begin,
1751             &(team->t.ompt_team_info.parallel_data),
1752             &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
1753             1, OMPT_GET_RETURN_ADDRESS(0));
1754         ompt_callbacks.ompt_callback(ompt_callback_work)(
1755             ompt_work_single_other, ompt_scope_end,
1756             &(team->t.ompt_team_info.parallel_data),
1757             &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
1758             1, OMPT_GET_RETURN_ADDRESS(0));
1759       }
1760     }
1761   }
1762 #endif
1763 
1764   return rc;
1765 }
1766 
1767 /*!
1768 @ingroup WORK_SHARING
1769 @param loc  source location information
1770 @param global_tid  global thread number
1771 
1772 Mark the end of a <tt>single</tt> construct.  This function should
1773 only be called by the thread that executed the block of code protected
1774 by the `single` construct.
1775 */
1776 void __kmpc_end_single(ident_t *loc, kmp_int32 global_tid) {
1777   __kmp_exit_single(global_tid);
1778   KMP_POP_PARTITIONED_TIMER();
1779 
1780 #if OMPT_SUPPORT && OMPT_OPTIONAL
1781   kmp_info_t *this_thr = __kmp_threads[global_tid];
1782   kmp_team_t *team = this_thr->th.th_team;
1783   int tid = __kmp_tid_from_gtid(global_tid);
1784 
1785   if (ompt_enabled.ompt_callback_work) {
1786     ompt_callbacks.ompt_callback(ompt_callback_work)(
1787         ompt_work_single_executor, ompt_scope_end,
1788         &(team->t.ompt_team_info.parallel_data),
1789         &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data), 1,
1790         OMPT_GET_RETURN_ADDRESS(0));
1791   }
1792 #endif
1793 }
1794 
1795 /*!
1796 @ingroup WORK_SHARING
1797 @param loc Source location
1798 @param global_tid Global thread id
1799 
1800 Mark the end of a statically scheduled loop.
1801 */
1802 void __kmpc_for_static_fini(ident_t *loc, kmp_int32 global_tid) {
1803   KMP_POP_PARTITIONED_TIMER();
1804   KE_TRACE(10, ("__kmpc_for_static_fini called T#%d\n", global_tid));
1805 
1806 #if OMPT_SUPPORT && OMPT_OPTIONAL
1807   if (ompt_enabled.ompt_callback_work) {
1808     ompt_work_t ompt_work_type = ompt_work_loop;
1809     ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
1810     ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
1811     // Determine workshare type
1812     if (loc != NULL) {
1813       if ((loc->flags & KMP_IDENT_WORK_LOOP) != 0) {
1814         ompt_work_type = ompt_work_loop;
1815       } else if ((loc->flags & KMP_IDENT_WORK_SECTIONS) != 0) {
1816         ompt_work_type = ompt_work_sections;
1817       } else if ((loc->flags & KMP_IDENT_WORK_DISTRIBUTE) != 0) {
1818         ompt_work_type = ompt_work_distribute;
1819       } else {
1820         // use default set above.
1821         // a warning about this case is provided in __kmpc_for_static_init
1822       }
1823       KMP_DEBUG_ASSERT(ompt_work_type);
1824     }
1825     ompt_callbacks.ompt_callback(ompt_callback_work)(
1826         ompt_work_type, ompt_scope_end, &(team_info->parallel_data),
1827         &(task_info->task_data), 0, OMPT_GET_RETURN_ADDRESS(0));
1828   }
1829 #endif
1830   if (__kmp_env_consistency_check)
1831     __kmp_pop_workshare(global_tid, ct_pdo, loc);
1832 }
1833 
1834 // User routines which take C-style arguments (call by value)
1835 // different from the Fortran equivalent routines
1836 
1837 void ompc_set_num_threads(int arg) {
1838   // !!!!! TODO: check the per-task binding
1839   __kmp_set_num_threads(arg, __kmp_entry_gtid());
1840 }
1841 
1842 void ompc_set_dynamic(int flag) {
1843   kmp_info_t *thread;
1844 
1845   /* For the thread-private implementation of the internal controls */
1846   thread = __kmp_entry_thread();
1847 
1848   __kmp_save_internal_controls(thread);
1849 
1850   set__dynamic(thread, flag ? TRUE : FALSE);
1851 }
1852 
1853 void ompc_set_nested(int flag) {
1854   kmp_info_t *thread;
1855 
1856   /* For the thread-private internal controls implementation */
1857   thread = __kmp_entry_thread();
1858 
1859   __kmp_save_internal_controls(thread);
1860 
1861   set__max_active_levels(thread, flag ? __kmp_dflt_max_active_levels : 1);
1862 }
1863 
1864 void ompc_set_max_active_levels(int max_active_levels) {
1865   /* TO DO */
1866   /* we want per-task implementation of this internal control */
1867 
1868   /* For the per-thread internal controls implementation */
1869   __kmp_set_max_active_levels(__kmp_entry_gtid(), max_active_levels);
1870 }
1871 
1872 void ompc_set_schedule(omp_sched_t kind, int modifier) {
1873   // !!!!! TODO: check the per-task binding
1874   __kmp_set_schedule(__kmp_entry_gtid(), (kmp_sched_t)kind, modifier);
1875 }
1876 
1877 int ompc_get_ancestor_thread_num(int level) {
1878   return __kmp_get_ancestor_thread_num(__kmp_entry_gtid(), level);
1879 }
1880 
1881 int ompc_get_team_size(int level) {
1882   return __kmp_get_team_size(__kmp_entry_gtid(), level);
1883 }
1884 
1885 /* OpenMP 5.0 Affinity Format API */
1886 
1887 void ompc_set_affinity_format(char const *format) {
1888   if (!__kmp_init_serial) {
1889     __kmp_serial_initialize();
1890   }
1891   __kmp_strncpy_truncate(__kmp_affinity_format, KMP_AFFINITY_FORMAT_SIZE,
1892                          format, KMP_STRLEN(format) + 1);
1893 }
1894 
1895 size_t ompc_get_affinity_format(char *buffer, size_t size) {
1896   size_t format_size;
1897   if (!__kmp_init_serial) {
1898     __kmp_serial_initialize();
1899   }
1900   format_size = KMP_STRLEN(__kmp_affinity_format);
1901   if (buffer && size) {
1902     __kmp_strncpy_truncate(buffer, size, __kmp_affinity_format,
1903                            format_size + 1);
1904   }
1905   return format_size;
1906 }
1907 
1908 void ompc_display_affinity(char const *format) {
1909   int gtid;
1910   if (!TCR_4(__kmp_init_middle)) {
1911     __kmp_middle_initialize();
1912   }
1913   gtid = __kmp_get_gtid();
1914   __kmp_aux_display_affinity(gtid, format);
1915 }
1916 
1917 size_t ompc_capture_affinity(char *buffer, size_t buf_size,
1918                              char const *format) {
1919   int gtid;
1920   size_t num_required;
1921   kmp_str_buf_t capture_buf;
1922   if (!TCR_4(__kmp_init_middle)) {
1923     __kmp_middle_initialize();
1924   }
1925   gtid = __kmp_get_gtid();
1926   __kmp_str_buf_init(&capture_buf);
1927   num_required = __kmp_aux_capture_affinity(gtid, format, &capture_buf);
1928   if (buffer && buf_size) {
1929     __kmp_strncpy_truncate(buffer, buf_size, capture_buf.str,
1930                            capture_buf.used + 1);
1931   }
1932   __kmp_str_buf_free(&capture_buf);
1933   return num_required;
1934 }
1935 
1936 void kmpc_set_stacksize(int arg) {
1937   // __kmp_aux_set_stacksize initializes the library if needed
1938   __kmp_aux_set_stacksize(arg);
1939 }
1940 
1941 void kmpc_set_stacksize_s(size_t arg) {
1942   // __kmp_aux_set_stacksize initializes the library if needed
1943   __kmp_aux_set_stacksize(arg);
1944 }
1945 
1946 void kmpc_set_blocktime(int arg) {
1947   int gtid, tid;
1948   kmp_info_t *thread;
1949 
1950   gtid = __kmp_entry_gtid();
1951   tid = __kmp_tid_from_gtid(gtid);
1952   thread = __kmp_thread_from_gtid(gtid);
1953 
1954   __kmp_aux_set_blocktime(arg, thread, tid);
1955 }
1956 
1957 void kmpc_set_library(int arg) {
1958   // __kmp_user_set_library initializes the library if needed
1959   __kmp_user_set_library((enum library_type)arg);
1960 }
1961 
1962 void kmpc_set_defaults(char const *str) {
1963   // __kmp_aux_set_defaults initializes the library if needed
1964   __kmp_aux_set_defaults(str, KMP_STRLEN(str));
1965 }
1966 
1967 void kmpc_set_disp_num_buffers(int arg) {
1968   // ignore after initialization because some teams have already
1969   // allocated dispatch buffers
1970   if (__kmp_init_serial == 0 && arg > 0)
1971     __kmp_dispatch_num_buffers = arg;
1972 }
1973 
1974 int kmpc_set_affinity_mask_proc(int proc, void **mask) {
1975 #if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED
1976   return -1;
1977 #else
1978   if (!TCR_4(__kmp_init_middle)) {
1979     __kmp_middle_initialize();
1980   }
1981   return __kmp_aux_set_affinity_mask_proc(proc, mask);
1982 #endif
1983 }
1984 
1985 int kmpc_unset_affinity_mask_proc(int proc, void **mask) {
1986 #if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED
1987   return -1;
1988 #else
1989   if (!TCR_4(__kmp_init_middle)) {
1990     __kmp_middle_initialize();
1991   }
1992   return __kmp_aux_unset_affinity_mask_proc(proc, mask);
1993 #endif
1994 }
1995 
1996 int kmpc_get_affinity_mask_proc(int proc, void **mask) {
1997 #if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED
1998   return -1;
1999 #else
2000   if (!TCR_4(__kmp_init_middle)) {
2001     __kmp_middle_initialize();
2002   }
2003   return __kmp_aux_get_affinity_mask_proc(proc, mask);
2004 #endif
2005 }
2006 
2007 /* -------------------------------------------------------------------------- */
2008 /*!
2009 @ingroup THREADPRIVATE
2010 @param loc       source location information
2011 @param gtid      global thread number
2012 @param cpy_size  size of the cpy_data buffer
2013 @param cpy_data  pointer to data to be copied
2014 @param cpy_func  helper function to call for copying data
2015 @param didit     flag variable: 1=single thread; 0=not single thread
2016 
2017 __kmpc_copyprivate implements the interface for the private data broadcast
2018 needed for the copyprivate clause associated with a single region in an
2019 OpenMP<sup>*</sup> program (both C and Fortran).
2020 All threads participating in the parallel region call this routine.
2021 One of the threads (called the single thread) should have the <tt>didit</tt>
2022 variable set to 1 and all other threads should have that variable set to 0.
2023 All threads pass a pointer to a data buffer (cpy_data) that they have built.
2024 
2025 The OpenMP specification forbids the use of nowait on the single region when a
2026 copyprivate clause is present. However, @ref __kmpc_copyprivate implements a
2027 barrier internally to avoid race conditions, so the code generation for the
2028 single region should avoid generating a barrier after the call to @ref
2029 __kmpc_copyprivate.
2030 
2031 The <tt>gtid</tt> parameter is the global thread id for the current thread.
2032 The <tt>loc</tt> parameter is a pointer to source location information.
2033 
2034 Internal implementation: The single thread will first copy its descriptor
2035 address (cpy_data) to a team-private location, then the other threads will each
2036 call the function pointed to by the parameter cpy_func, which carries out the
2037 copy by copying the data using the cpy_data buffer.
2038 
2039 The cpy_func routine used for the copy and the contents of the data area defined
2040 by cpy_data and cpy_size may be built in any fashion that will allow the copy
2041 to be done. For instance, the cpy_data buffer can hold the actual data to be
2042 copied or it may hold a list of pointers to the data. The cpy_func routine must
2043 interpret the cpy_data buffer appropriately.
2044 
2045 The interface to cpy_func is as follows:
2046 @code
2047 void cpy_func( void *destination, void *source )
2048 @endcode
2049 where void *destination is the cpy_data pointer for the thread being copied to
2050 and void *source is the cpy_data pointer for the thread being copied from.
2051 */
2052 void __kmpc_copyprivate(ident_t *loc, kmp_int32 gtid, size_t cpy_size,
2053                         void *cpy_data, void (*cpy_func)(void *, void *),
2054                         kmp_int32 didit) {
2055   void **data_ptr;
2056 
2057   KC_TRACE(10, ("__kmpc_copyprivate: called T#%d\n", gtid));
2058 
2059   KMP_MB();
2060 
2061   data_ptr = &__kmp_team_from_gtid(gtid)->t.t_copypriv_data;
2062 
2063   if (__kmp_env_consistency_check) {
2064     if (loc == 0) {
2065       KMP_WARNING(ConstructIdentInvalid);
2066     }
2067   }
2068 
2069   // ToDo: Optimize the following two barriers into some kind of split barrier
2070 
2071   if (didit)
2072     *data_ptr = cpy_data;
2073 
2074 #if OMPT_SUPPORT
2075   ompt_frame_t *ompt_frame;
2076   if (ompt_enabled.enabled) {
2077     __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
2078     if (ompt_frame->enter_frame.ptr == NULL)
2079       ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
2080     OMPT_STORE_RETURN_ADDRESS(gtid);
2081   }
2082 #endif
2083 /* This barrier is not a barrier region boundary */
2084 #if USE_ITT_NOTIFY
2085   __kmp_threads[gtid]->th.th_ident = loc;
2086 #endif
2087   __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);
2088 
2089   if (!didit)
2090     (*cpy_func)(cpy_data, *data_ptr);
2091 
2092 // Consider next barrier a user-visible barrier for barrier region boundaries
2093 // Nesting checks are already handled by the single construct checks
2094 
2095 #if OMPT_SUPPORT
2096   if (ompt_enabled.enabled) {
2097     OMPT_STORE_RETURN_ADDRESS(gtid);
2098   }
2099 #endif
2100 #if USE_ITT_NOTIFY
2101   __kmp_threads[gtid]->th.th_ident = loc; // TODO: check if it is needed (e.g.
2102 // tasks can overwrite the location)
2103 #endif
2104   __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);
2105 #if OMPT_SUPPORT && OMPT_OPTIONAL
2106   if (ompt_enabled.enabled) {
2107     ompt_frame->enter_frame = ompt_data_none;
2108   }
2109 #endif
2110 }
2111 
2112 /* -------------------------------------------------------------------------- */
2113 
2114 #define INIT_LOCK __kmp_init_user_lock_with_checks
2115 #define INIT_NESTED_LOCK __kmp_init_nested_user_lock_with_checks
2116 #define ACQUIRE_LOCK __kmp_acquire_user_lock_with_checks
2117 #define ACQUIRE_LOCK_TIMED __kmp_acquire_user_lock_with_checks_timed
2118 #define ACQUIRE_NESTED_LOCK __kmp_acquire_nested_user_lock_with_checks
2119 #define ACQUIRE_NESTED_LOCK_TIMED                                              \
2120   __kmp_acquire_nested_user_lock_with_checks_timed
2121 #define RELEASE_LOCK __kmp_release_user_lock_with_checks
2122 #define RELEASE_NESTED_LOCK __kmp_release_nested_user_lock_with_checks
2123 #define TEST_LOCK __kmp_test_user_lock_with_checks
2124 #define TEST_NESTED_LOCK __kmp_test_nested_user_lock_with_checks
2125 #define DESTROY_LOCK __kmp_destroy_user_lock_with_checks
2126 #define DESTROY_NESTED_LOCK __kmp_destroy_nested_user_lock_with_checks
2127 
2128 // TODO: Make check abort messages use location info & pass it into
2129 // with_checks routines
2130 
2131 #if KMP_USE_DYNAMIC_LOCK
2132 
2133 // internal lock initializer
2134 static __forceinline void __kmp_init_lock_with_hint(ident_t *loc, void **lock,
2135                                                     kmp_dyna_lockseq_t seq) {
2136   if (KMP_IS_D_LOCK(seq)) {
2137     KMP_INIT_D_LOCK(lock, seq);
2138 #if USE_ITT_BUILD
2139     __kmp_itt_lock_creating((kmp_user_lock_p)lock, NULL);
2140 #endif
2141   } else {
2142     KMP_INIT_I_LOCK(lock, seq);
2143 #if USE_ITT_BUILD
2144     kmp_indirect_lock_t *ilk = KMP_LOOKUP_I_LOCK(lock);
2145     __kmp_itt_lock_creating(ilk->lock, loc);
2146 #endif
2147   }
2148 }
2149 
2150 // internal nest lock initializer
2151 static __forceinline void
2152 __kmp_init_nest_lock_with_hint(ident_t *loc, void **lock,
2153                                kmp_dyna_lockseq_t seq) {
2154 #if KMP_USE_TSX
2155   // Don't have nested lock implementation for speculative locks
2156   if (seq == lockseq_hle || seq == lockseq_rtm || seq == lockseq_adaptive)
2157     seq = __kmp_user_lock_seq;
2158 #endif
2159   switch (seq) {
2160   case lockseq_tas:
2161     seq = lockseq_nested_tas;
2162     break;
2163 #if KMP_USE_FUTEX
2164   case lockseq_futex:
2165     seq = lockseq_nested_futex;
2166     break;
2167 #endif
2168   case lockseq_ticket:
2169     seq = lockseq_nested_ticket;
2170     break;
2171   case lockseq_queuing:
2172     seq = lockseq_nested_queuing;
2173     break;
2174   case lockseq_drdpa:
2175     seq = lockseq_nested_drdpa;
2176     break;
2177   default:
2178     seq = lockseq_nested_queuing;
2179   }
2180   KMP_INIT_I_LOCK(lock, seq);
2181 #if USE_ITT_BUILD
2182   kmp_indirect_lock_t *ilk = KMP_LOOKUP_I_LOCK(lock);
2183   __kmp_itt_lock_creating(ilk->lock, loc);
2184 #endif
2185 }
2186 
2187 /* initialize the lock with a hint */
2188 void __kmpc_init_lock_with_hint(ident_t *loc, kmp_int32 gtid, void **user_lock,
2189                                 uintptr_t hint) {
2190   KMP_DEBUG_ASSERT(__kmp_init_serial);
2191   if (__kmp_env_consistency_check && user_lock == NULL) {
2192     KMP_FATAL(LockIsUninitialized, "omp_init_lock_with_hint");
2193   }
2194 
2195   __kmp_init_lock_with_hint(loc, user_lock, __kmp_map_hint_to_lock(hint));
2196 
2197 #if OMPT_SUPPORT && OMPT_OPTIONAL
2198   // This is the case, if called from omp_init_lock_with_hint:
2199   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2200   if (!codeptr)
2201     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2202   if (ompt_enabled.ompt_callback_lock_init) {
2203     ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2204         ompt_mutex_lock, (omp_lock_hint_t)hint,
2205         __ompt_get_mutex_impl_type(user_lock),
2206         (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2207   }
2208 #endif
2209 }
2210 
2211 /* initialize the lock with a hint */
2212 void __kmpc_init_nest_lock_with_hint(ident_t *loc, kmp_int32 gtid,
2213                                      void **user_lock, uintptr_t hint) {
2214   KMP_DEBUG_ASSERT(__kmp_init_serial);
2215   if (__kmp_env_consistency_check && user_lock == NULL) {
2216     KMP_FATAL(LockIsUninitialized, "omp_init_nest_lock_with_hint");
2217   }
2218 
2219   __kmp_init_nest_lock_with_hint(loc, user_lock, __kmp_map_hint_to_lock(hint));
2220 
2221 #if OMPT_SUPPORT && OMPT_OPTIONAL
2222   // This is the case, if called from omp_init_lock_with_hint:
2223   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2224   if (!codeptr)
2225     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2226   if (ompt_enabled.ompt_callback_lock_init) {
2227     ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2228         ompt_mutex_nest_lock, (omp_lock_hint_t)hint,
2229         __ompt_get_mutex_impl_type(user_lock),
2230         (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2231   }
2232 #endif
2233 }
2234 
2235 #endif // KMP_USE_DYNAMIC_LOCK
2236 
2237 /* initialize the lock */
2238 void __kmpc_init_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2239 #if KMP_USE_DYNAMIC_LOCK
2240 
2241   KMP_DEBUG_ASSERT(__kmp_init_serial);
2242   if (__kmp_env_consistency_check && user_lock == NULL) {
2243     KMP_FATAL(LockIsUninitialized, "omp_init_lock");
2244   }
2245   __kmp_init_lock_with_hint(loc, user_lock, __kmp_user_lock_seq);
2246 
2247 #if OMPT_SUPPORT && OMPT_OPTIONAL
2248   // This is the case, if called from omp_init_lock_with_hint:
2249   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2250   if (!codeptr)
2251     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2252   if (ompt_enabled.ompt_callback_lock_init) {
2253     ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2254         ompt_mutex_lock, omp_lock_hint_none,
2255         __ompt_get_mutex_impl_type(user_lock),
2256         (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2257   }
2258 #endif
2259 
2260 #else // KMP_USE_DYNAMIC_LOCK
2261 
2262   static char const *const func = "omp_init_lock";
2263   kmp_user_lock_p lck;
2264   KMP_DEBUG_ASSERT(__kmp_init_serial);
2265 
2266   if (__kmp_env_consistency_check) {
2267     if (user_lock == NULL) {
2268       KMP_FATAL(LockIsUninitialized, func);
2269     }
2270   }
2271 
2272   KMP_CHECK_USER_LOCK_INIT();
2273 
2274   if ((__kmp_user_lock_kind == lk_tas) &&
2275       (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
2276     lck = (kmp_user_lock_p)user_lock;
2277   }
2278 #if KMP_USE_FUTEX
2279   else if ((__kmp_user_lock_kind == lk_futex) &&
2280            (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
2281     lck = (kmp_user_lock_p)user_lock;
2282   }
2283 #endif
2284   else {
2285     lck = __kmp_user_lock_allocate(user_lock, gtid, 0);
2286   }
2287   INIT_LOCK(lck);
2288   __kmp_set_user_lock_location(lck, loc);
2289 
2290 #if OMPT_SUPPORT && OMPT_OPTIONAL
2291   // This is the case, if called from omp_init_lock_with_hint:
2292   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2293   if (!codeptr)
2294     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2295   if (ompt_enabled.ompt_callback_lock_init) {
2296     ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2297         ompt_mutex_lock, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
2298         (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2299   }
2300 #endif
2301 
2302 #if USE_ITT_BUILD
2303   __kmp_itt_lock_creating(lck);
2304 #endif /* USE_ITT_BUILD */
2305 
2306 #endif // KMP_USE_DYNAMIC_LOCK
2307 } // __kmpc_init_lock
2308 
2309 /* initialize the lock */
2310 void __kmpc_init_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2311 #if KMP_USE_DYNAMIC_LOCK
2312 
2313   KMP_DEBUG_ASSERT(__kmp_init_serial);
2314   if (__kmp_env_consistency_check && user_lock == NULL) {
2315     KMP_FATAL(LockIsUninitialized, "omp_init_nest_lock");
2316   }
2317   __kmp_init_nest_lock_with_hint(loc, user_lock, __kmp_user_lock_seq);
2318 
2319 #if OMPT_SUPPORT && OMPT_OPTIONAL
2320   // This is the case, if called from omp_init_lock_with_hint:
2321   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2322   if (!codeptr)
2323     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2324   if (ompt_enabled.ompt_callback_lock_init) {
2325     ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2326         ompt_mutex_nest_lock, omp_lock_hint_none,
2327         __ompt_get_mutex_impl_type(user_lock),
2328         (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2329   }
2330 #endif
2331 
2332 #else // KMP_USE_DYNAMIC_LOCK
2333 
2334   static char const *const func = "omp_init_nest_lock";
2335   kmp_user_lock_p lck;
2336   KMP_DEBUG_ASSERT(__kmp_init_serial);
2337 
2338   if (__kmp_env_consistency_check) {
2339     if (user_lock == NULL) {
2340       KMP_FATAL(LockIsUninitialized, func);
2341     }
2342   }
2343 
2344   KMP_CHECK_USER_LOCK_INIT();
2345 
2346   if ((__kmp_user_lock_kind == lk_tas) &&
2347       (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
2348        OMP_NEST_LOCK_T_SIZE)) {
2349     lck = (kmp_user_lock_p)user_lock;
2350   }
2351 #if KMP_USE_FUTEX
2352   else if ((__kmp_user_lock_kind == lk_futex) &&
2353            (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
2354             OMP_NEST_LOCK_T_SIZE)) {
2355     lck = (kmp_user_lock_p)user_lock;
2356   }
2357 #endif
2358   else {
2359     lck = __kmp_user_lock_allocate(user_lock, gtid, 0);
2360   }
2361 
2362   INIT_NESTED_LOCK(lck);
2363   __kmp_set_user_lock_location(lck, loc);
2364 
2365 #if OMPT_SUPPORT && OMPT_OPTIONAL
2366   // This is the case, if called from omp_init_lock_with_hint:
2367   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2368   if (!codeptr)
2369     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2370   if (ompt_enabled.ompt_callback_lock_init) {
2371     ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2372         ompt_mutex_nest_lock, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
2373         (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2374   }
2375 #endif
2376 
2377 #if USE_ITT_BUILD
2378   __kmp_itt_lock_creating(lck);
2379 #endif /* USE_ITT_BUILD */
2380 
2381 #endif // KMP_USE_DYNAMIC_LOCK
2382 } // __kmpc_init_nest_lock
2383 
2384 void __kmpc_destroy_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2385 #if KMP_USE_DYNAMIC_LOCK
2386 
2387 #if USE_ITT_BUILD
2388   kmp_user_lock_p lck;
2389   if (KMP_EXTRACT_D_TAG(user_lock) == 0) {
2390     lck = ((kmp_indirect_lock_t *)KMP_LOOKUP_I_LOCK(user_lock))->lock;
2391   } else {
2392     lck = (kmp_user_lock_p)user_lock;
2393   }
2394   __kmp_itt_lock_destroyed(lck);
2395 #endif
2396 #if OMPT_SUPPORT && OMPT_OPTIONAL
2397   // This is the case, if called from omp_init_lock_with_hint:
2398   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2399   if (!codeptr)
2400     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2401   if (ompt_enabled.ompt_callback_lock_destroy) {
2402     kmp_user_lock_p lck;
2403     if (KMP_EXTRACT_D_TAG(user_lock) == 0) {
2404       lck = ((kmp_indirect_lock_t *)KMP_LOOKUP_I_LOCK(user_lock))->lock;
2405     } else {
2406       lck = (kmp_user_lock_p)user_lock;
2407     }
2408     ompt_callbacks.ompt_callback(ompt_callback_lock_destroy)(
2409         ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2410   }
2411 #endif
2412   KMP_D_LOCK_FUNC(user_lock, destroy)((kmp_dyna_lock_t *)user_lock);
2413 #else
2414   kmp_user_lock_p lck;
2415 
2416   if ((__kmp_user_lock_kind == lk_tas) &&
2417       (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
2418     lck = (kmp_user_lock_p)user_lock;
2419   }
2420 #if KMP_USE_FUTEX
2421   else if ((__kmp_user_lock_kind == lk_futex) &&
2422            (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
2423     lck = (kmp_user_lock_p)user_lock;
2424   }
2425 #endif
2426   else {
2427     lck = __kmp_lookup_user_lock(user_lock, "omp_destroy_lock");
2428   }
2429 
2430 #if OMPT_SUPPORT && OMPT_OPTIONAL
2431   // This is the case, if called from omp_init_lock_with_hint:
2432   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2433   if (!codeptr)
2434     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2435   if (ompt_enabled.ompt_callback_lock_destroy) {
2436     ompt_callbacks.ompt_callback(ompt_callback_lock_destroy)(
2437         ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2438   }
2439 #endif
2440 
2441 #if USE_ITT_BUILD
2442   __kmp_itt_lock_destroyed(lck);
2443 #endif /* USE_ITT_BUILD */
2444   DESTROY_LOCK(lck);
2445 
2446   if ((__kmp_user_lock_kind == lk_tas) &&
2447       (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
2448     ;
2449   }
2450 #if KMP_USE_FUTEX
2451   else if ((__kmp_user_lock_kind == lk_futex) &&
2452            (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
2453     ;
2454   }
2455 #endif
2456   else {
2457     __kmp_user_lock_free(user_lock, gtid, lck);
2458   }
2459 #endif // KMP_USE_DYNAMIC_LOCK
2460 } // __kmpc_destroy_lock
2461 
2462 /* destroy the lock */
2463 void __kmpc_destroy_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2464 #if KMP_USE_DYNAMIC_LOCK
2465 
2466 #if USE_ITT_BUILD
2467   kmp_indirect_lock_t *ilk = KMP_LOOKUP_I_LOCK(user_lock);
2468   __kmp_itt_lock_destroyed(ilk->lock);
2469 #endif
2470 #if OMPT_SUPPORT && OMPT_OPTIONAL
2471   // This is the case, if called from omp_init_lock_with_hint:
2472   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2473   if (!codeptr)
2474     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2475   if (ompt_enabled.ompt_callback_lock_destroy) {
2476     ompt_callbacks.ompt_callback(ompt_callback_lock_destroy)(
2477         ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2478   }
2479 #endif
2480   KMP_D_LOCK_FUNC(user_lock, destroy)((kmp_dyna_lock_t *)user_lock);
2481 
2482 #else // KMP_USE_DYNAMIC_LOCK
2483 
2484   kmp_user_lock_p lck;
2485 
2486   if ((__kmp_user_lock_kind == lk_tas) &&
2487       (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
2488        OMP_NEST_LOCK_T_SIZE)) {
2489     lck = (kmp_user_lock_p)user_lock;
2490   }
2491 #if KMP_USE_FUTEX
2492   else if ((__kmp_user_lock_kind == lk_futex) &&
2493            (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
2494             OMP_NEST_LOCK_T_SIZE)) {
2495     lck = (kmp_user_lock_p)user_lock;
2496   }
2497 #endif
2498   else {
2499     lck = __kmp_lookup_user_lock(user_lock, "omp_destroy_nest_lock");
2500   }
2501 
2502 #if OMPT_SUPPORT && OMPT_OPTIONAL
2503   // This is the case, if called from omp_init_lock_with_hint:
2504   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2505   if (!codeptr)
2506     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2507   if (ompt_enabled.ompt_callback_lock_destroy) {
2508     ompt_callbacks.ompt_callback(ompt_callback_lock_destroy)(
2509         ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2510   }
2511 #endif
2512 
2513 #if USE_ITT_BUILD
2514   __kmp_itt_lock_destroyed(lck);
2515 #endif /* USE_ITT_BUILD */
2516 
2517   DESTROY_NESTED_LOCK(lck);
2518 
2519   if ((__kmp_user_lock_kind == lk_tas) &&
2520       (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
2521        OMP_NEST_LOCK_T_SIZE)) {
2522     ;
2523   }
2524 #if KMP_USE_FUTEX
2525   else if ((__kmp_user_lock_kind == lk_futex) &&
2526            (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
2527             OMP_NEST_LOCK_T_SIZE)) {
2528     ;
2529   }
2530 #endif
2531   else {
2532     __kmp_user_lock_free(user_lock, gtid, lck);
2533   }
2534 #endif // KMP_USE_DYNAMIC_LOCK
2535 } // __kmpc_destroy_nest_lock
2536 
2537 void __kmpc_set_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2538   KMP_COUNT_BLOCK(OMP_set_lock);
2539 #if KMP_USE_DYNAMIC_LOCK
2540   int tag = KMP_EXTRACT_D_TAG(user_lock);
2541 #if USE_ITT_BUILD
2542   __kmp_itt_lock_acquiring(
2543       (kmp_user_lock_p)
2544           user_lock); // itt function will get to the right lock object.
2545 #endif
2546 #if OMPT_SUPPORT && OMPT_OPTIONAL
2547   // This is the case, if called from omp_init_lock_with_hint:
2548   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2549   if (!codeptr)
2550     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2551   if (ompt_enabled.ompt_callback_mutex_acquire) {
2552     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
2553         ompt_mutex_lock, omp_lock_hint_none,
2554         __ompt_get_mutex_impl_type(user_lock),
2555         (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2556   }
2557 #endif
2558 #if KMP_USE_INLINED_TAS
2559   if (tag == locktag_tas && !__kmp_env_consistency_check) {
2560     KMP_ACQUIRE_TAS_LOCK(user_lock, gtid);
2561   } else
2562 #elif KMP_USE_INLINED_FUTEX
2563   if (tag == locktag_futex && !__kmp_env_consistency_check) {
2564     KMP_ACQUIRE_FUTEX_LOCK(user_lock, gtid);
2565   } else
2566 #endif
2567   {
2568     __kmp_direct_set[tag]((kmp_dyna_lock_t *)user_lock, gtid);
2569   }
2570 #if USE_ITT_BUILD
2571   __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock);
2572 #endif
2573 #if OMPT_SUPPORT && OMPT_OPTIONAL
2574   if (ompt_enabled.ompt_callback_mutex_acquired) {
2575     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
2576         ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2577   }
2578 #endif
2579 
2580 #else // KMP_USE_DYNAMIC_LOCK
2581 
2582   kmp_user_lock_p lck;
2583 
2584   if ((__kmp_user_lock_kind == lk_tas) &&
2585       (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
2586     lck = (kmp_user_lock_p)user_lock;
2587   }
2588 #if KMP_USE_FUTEX
2589   else if ((__kmp_user_lock_kind == lk_futex) &&
2590            (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
2591     lck = (kmp_user_lock_p)user_lock;
2592   }
2593 #endif
2594   else {
2595     lck = __kmp_lookup_user_lock(user_lock, "omp_set_lock");
2596   }
2597 
2598 #if USE_ITT_BUILD
2599   __kmp_itt_lock_acquiring(lck);
2600 #endif /* USE_ITT_BUILD */
2601 #if OMPT_SUPPORT && OMPT_OPTIONAL
2602   // This is the case, if called from omp_init_lock_with_hint:
2603   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2604   if (!codeptr)
2605     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2606   if (ompt_enabled.ompt_callback_mutex_acquire) {
2607     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
2608         ompt_mutex_lock, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
2609         (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2610   }
2611 #endif
2612 
2613   ACQUIRE_LOCK(lck, gtid);
2614 
2615 #if USE_ITT_BUILD
2616   __kmp_itt_lock_acquired(lck);
2617 #endif /* USE_ITT_BUILD */
2618 
2619 #if OMPT_SUPPORT && OMPT_OPTIONAL
2620   if (ompt_enabled.ompt_callback_mutex_acquired) {
2621     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
2622         ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2623   }
2624 #endif
2625 
2626 #endif // KMP_USE_DYNAMIC_LOCK
2627 }
2628 
2629 void __kmpc_set_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2630 #if KMP_USE_DYNAMIC_LOCK
2631 
2632 #if USE_ITT_BUILD
2633   __kmp_itt_lock_acquiring((kmp_user_lock_p)user_lock);
2634 #endif
2635 #if OMPT_SUPPORT && OMPT_OPTIONAL
2636   // This is the case, if called from omp_init_lock_with_hint:
2637   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2638   if (!codeptr)
2639     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2640   if (ompt_enabled.enabled) {
2641     if (ompt_enabled.ompt_callback_mutex_acquire) {
2642       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
2643           ompt_mutex_nest_lock, omp_lock_hint_none,
2644           __ompt_get_mutex_impl_type(user_lock),
2645           (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2646     }
2647   }
2648 #endif
2649   int acquire_status =
2650       KMP_D_LOCK_FUNC(user_lock, set)((kmp_dyna_lock_t *)user_lock, gtid);
2651   (void) acquire_status;
2652 #if USE_ITT_BUILD
2653   __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock);
2654 #endif
2655 
2656 #if OMPT_SUPPORT && OMPT_OPTIONAL
2657   if (ompt_enabled.enabled) {
2658     if (acquire_status == KMP_LOCK_ACQUIRED_FIRST) {
2659       if (ompt_enabled.ompt_callback_mutex_acquired) {
2660         // lock_first
2661         ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
2662             ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)user_lock,
2663             codeptr);
2664       }
2665     } else {
2666       if (ompt_enabled.ompt_callback_nest_lock) {
2667         // lock_next
2668         ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
2669             ompt_scope_begin, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2670       }
2671     }
2672   }
2673 #endif
2674 
2675 #else // KMP_USE_DYNAMIC_LOCK
2676   int acquire_status;
2677   kmp_user_lock_p lck;
2678 
2679   if ((__kmp_user_lock_kind == lk_tas) &&
2680       (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
2681        OMP_NEST_LOCK_T_SIZE)) {
2682     lck = (kmp_user_lock_p)user_lock;
2683   }
2684 #if KMP_USE_FUTEX
2685   else if ((__kmp_user_lock_kind == lk_futex) &&
2686            (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
2687             OMP_NEST_LOCK_T_SIZE)) {
2688     lck = (kmp_user_lock_p)user_lock;
2689   }
2690 #endif
2691   else {
2692     lck = __kmp_lookup_user_lock(user_lock, "omp_set_nest_lock");
2693   }
2694 
2695 #if USE_ITT_BUILD
2696   __kmp_itt_lock_acquiring(lck);
2697 #endif /* USE_ITT_BUILD */
2698 #if OMPT_SUPPORT && OMPT_OPTIONAL
2699   // This is the case, if called from omp_init_lock_with_hint:
2700   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2701   if (!codeptr)
2702     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2703   if (ompt_enabled.enabled) {
2704     if (ompt_enabled.ompt_callback_mutex_acquire) {
2705       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
2706           ompt_mutex_nest_lock, omp_lock_hint_none,
2707           __ompt_get_mutex_impl_type(), (ompt_wait_id_t)(uintptr_t)lck,
2708           codeptr);
2709     }
2710   }
2711 #endif
2712 
2713   ACQUIRE_NESTED_LOCK(lck, gtid, &acquire_status);
2714 
2715 #if USE_ITT_BUILD
2716   __kmp_itt_lock_acquired(lck);
2717 #endif /* USE_ITT_BUILD */
2718 
2719 #if OMPT_SUPPORT && OMPT_OPTIONAL
2720   if (ompt_enabled.enabled) {
2721     if (acquire_status == KMP_LOCK_ACQUIRED_FIRST) {
2722       if (ompt_enabled.ompt_callback_mutex_acquired) {
2723         // lock_first
2724         ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
2725             ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2726       }
2727     } else {
2728       if (ompt_enabled.ompt_callback_nest_lock) {
2729         // lock_next
2730         ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
2731             ompt_scope_begin, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2732       }
2733     }
2734   }
2735 #endif
2736 
2737 #endif // KMP_USE_DYNAMIC_LOCK
2738 }
2739 
2740 void __kmpc_unset_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2741 #if KMP_USE_DYNAMIC_LOCK
2742 
2743   int tag = KMP_EXTRACT_D_TAG(user_lock);
2744 #if USE_ITT_BUILD
2745   __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock);
2746 #endif
2747 #if KMP_USE_INLINED_TAS
2748   if (tag == locktag_tas && !__kmp_env_consistency_check) {
2749     KMP_RELEASE_TAS_LOCK(user_lock, gtid);
2750   } else
2751 #elif KMP_USE_INLINED_FUTEX
2752   if (tag == locktag_futex && !__kmp_env_consistency_check) {
2753     KMP_RELEASE_FUTEX_LOCK(user_lock, gtid);
2754   } else
2755 #endif
2756   {
2757     __kmp_direct_unset[tag]((kmp_dyna_lock_t *)user_lock, gtid);
2758   }
2759 
2760 #if OMPT_SUPPORT && OMPT_OPTIONAL
2761   // This is the case, if called from omp_init_lock_with_hint:
2762   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2763   if (!codeptr)
2764     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2765   if (ompt_enabled.ompt_callback_mutex_released) {
2766     ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
2767         ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2768   }
2769 #endif
2770 
2771 #else // KMP_USE_DYNAMIC_LOCK
2772 
2773   kmp_user_lock_p lck;
2774 
2775   /* Can't use serial interval since not block structured */
2776   /* release the lock */
2777 
2778   if ((__kmp_user_lock_kind == lk_tas) &&
2779       (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
2780 #if KMP_OS_LINUX &&                                                            \
2781     (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
2782 // "fast" path implemented to fix customer performance issue
2783 #if USE_ITT_BUILD
2784     __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock);
2785 #endif /* USE_ITT_BUILD */
2786     TCW_4(((kmp_user_lock_p)user_lock)->tas.lk.poll, 0);
2787     KMP_MB();
2788 
2789 #if OMPT_SUPPORT && OMPT_OPTIONAL
2790     // This is the case, if called from omp_init_lock_with_hint:
2791     void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2792     if (!codeptr)
2793       codeptr = OMPT_GET_RETURN_ADDRESS(0);
2794     if (ompt_enabled.ompt_callback_mutex_released) {
2795       ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
2796           ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2797     }
2798 #endif
2799 
2800     return;
2801 #else
2802     lck = (kmp_user_lock_p)user_lock;
2803 #endif
2804   }
2805 #if KMP_USE_FUTEX
2806   else if ((__kmp_user_lock_kind == lk_futex) &&
2807            (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
2808     lck = (kmp_user_lock_p)user_lock;
2809   }
2810 #endif
2811   else {
2812     lck = __kmp_lookup_user_lock(user_lock, "omp_unset_lock");
2813   }
2814 
2815 #if USE_ITT_BUILD
2816   __kmp_itt_lock_releasing(lck);
2817 #endif /* USE_ITT_BUILD */
2818 
2819   RELEASE_LOCK(lck, gtid);
2820 
2821 #if OMPT_SUPPORT && OMPT_OPTIONAL
2822   // This is the case, if called from omp_init_lock_with_hint:
2823   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2824   if (!codeptr)
2825     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2826   if (ompt_enabled.ompt_callback_mutex_released) {
2827     ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
2828         ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2829   }
2830 #endif
2831 
2832 #endif // KMP_USE_DYNAMIC_LOCK
2833 }
2834 
2835 /* release the lock */
2836 void __kmpc_unset_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2837 #if KMP_USE_DYNAMIC_LOCK
2838 
2839 #if USE_ITT_BUILD
2840   __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock);
2841 #endif
2842   int release_status =
2843       KMP_D_LOCK_FUNC(user_lock, unset)((kmp_dyna_lock_t *)user_lock, gtid);
2844   (void) release_status;
2845 
2846 #if OMPT_SUPPORT && OMPT_OPTIONAL
2847   // This is the case, if called from omp_init_lock_with_hint:
2848   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2849   if (!codeptr)
2850     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2851   if (ompt_enabled.enabled) {
2852     if (release_status == KMP_LOCK_RELEASED) {
2853       if (ompt_enabled.ompt_callback_mutex_released) {
2854         // release_lock_last
2855         ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
2856             ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)user_lock,
2857             codeptr);
2858       }
2859     } else if (ompt_enabled.ompt_callback_nest_lock) {
2860       // release_lock_prev
2861       ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
2862           ompt_scope_end, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2863     }
2864   }
2865 #endif
2866 
2867 #else // KMP_USE_DYNAMIC_LOCK
2868 
2869   kmp_user_lock_p lck;
2870 
2871   /* Can't use serial interval since not block structured */
2872 
2873   if ((__kmp_user_lock_kind == lk_tas) &&
2874       (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
2875        OMP_NEST_LOCK_T_SIZE)) {
2876 #if KMP_OS_LINUX &&                                                            \
2877     (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
2878     // "fast" path implemented to fix customer performance issue
2879     kmp_tas_lock_t *tl = (kmp_tas_lock_t *)user_lock;
2880 #if USE_ITT_BUILD
2881     __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock);
2882 #endif /* USE_ITT_BUILD */
2883 
2884 #if OMPT_SUPPORT && OMPT_OPTIONAL
2885     int release_status = KMP_LOCK_STILL_HELD;
2886 #endif
2887 
2888     if (--(tl->lk.depth_locked) == 0) {
2889       TCW_4(tl->lk.poll, 0);
2890 #if OMPT_SUPPORT && OMPT_OPTIONAL
2891       release_status = KMP_LOCK_RELEASED;
2892 #endif
2893     }
2894     KMP_MB();
2895 
2896 #if OMPT_SUPPORT && OMPT_OPTIONAL
2897     // This is the case, if called from omp_init_lock_with_hint:
2898     void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2899     if (!codeptr)
2900       codeptr = OMPT_GET_RETURN_ADDRESS(0);
2901     if (ompt_enabled.enabled) {
2902       if (release_status == KMP_LOCK_RELEASED) {
2903         if (ompt_enabled.ompt_callback_mutex_released) {
2904           // release_lock_last
2905           ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
2906               ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2907         }
2908       } else if (ompt_enabled.ompt_callback_nest_lock) {
2909         // release_lock_previous
2910         ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
2911             ompt_mutex_scope_end, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2912       }
2913     }
2914 #endif
2915 
2916     return;
2917 #else
2918     lck = (kmp_user_lock_p)user_lock;
2919 #endif
2920   }
2921 #if KMP_USE_FUTEX
2922   else if ((__kmp_user_lock_kind == lk_futex) &&
2923            (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
2924             OMP_NEST_LOCK_T_SIZE)) {
2925     lck = (kmp_user_lock_p)user_lock;
2926   }
2927 #endif
2928   else {
2929     lck = __kmp_lookup_user_lock(user_lock, "omp_unset_nest_lock");
2930   }
2931 
2932 #if USE_ITT_BUILD
2933   __kmp_itt_lock_releasing(lck);
2934 #endif /* USE_ITT_BUILD */
2935 
2936   int release_status;
2937   release_status = RELEASE_NESTED_LOCK(lck, gtid);
2938 #if OMPT_SUPPORT && OMPT_OPTIONAL
2939   // This is the case, if called from omp_init_lock_with_hint:
2940   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2941   if (!codeptr)
2942     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2943   if (ompt_enabled.enabled) {
2944     if (release_status == KMP_LOCK_RELEASED) {
2945       if (ompt_enabled.ompt_callback_mutex_released) {
2946         // release_lock_last
2947         ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
2948             ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2949       }
2950     } else if (ompt_enabled.ompt_callback_nest_lock) {
2951       // release_lock_previous
2952       ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
2953           ompt_mutex_scope_end, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2954     }
2955   }
2956 #endif
2957 
2958 #endif // KMP_USE_DYNAMIC_LOCK
2959 }
2960 
2961 /* try to acquire the lock */
2962 int __kmpc_test_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2963   KMP_COUNT_BLOCK(OMP_test_lock);
2964 
2965 #if KMP_USE_DYNAMIC_LOCK
2966   int rc;
2967   int tag = KMP_EXTRACT_D_TAG(user_lock);
2968 #if USE_ITT_BUILD
2969   __kmp_itt_lock_acquiring((kmp_user_lock_p)user_lock);
2970 #endif
2971 #if OMPT_SUPPORT && OMPT_OPTIONAL
2972   // This is the case, if called from omp_init_lock_with_hint:
2973   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2974   if (!codeptr)
2975     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2976   if (ompt_enabled.ompt_callback_mutex_acquire) {
2977     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
2978         ompt_mutex_lock, omp_lock_hint_none,
2979         __ompt_get_mutex_impl_type(user_lock),
2980         (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2981   }
2982 #endif
2983 #if KMP_USE_INLINED_TAS
2984   if (tag == locktag_tas && !__kmp_env_consistency_check) {
2985     KMP_TEST_TAS_LOCK(user_lock, gtid, rc);
2986   } else
2987 #elif KMP_USE_INLINED_FUTEX
2988   if (tag == locktag_futex && !__kmp_env_consistency_check) {
2989     KMP_TEST_FUTEX_LOCK(user_lock, gtid, rc);
2990   } else
2991 #endif
2992   {
2993     rc = __kmp_direct_test[tag]((kmp_dyna_lock_t *)user_lock, gtid);
2994   }
2995   if (rc) {
2996 #if USE_ITT_BUILD
2997     __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock);
2998 #endif
2999 #if OMPT_SUPPORT && OMPT_OPTIONAL
3000     if (ompt_enabled.ompt_callback_mutex_acquired) {
3001       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
3002           ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
3003     }
3004 #endif
3005     return FTN_TRUE;
3006   } else {
3007 #if USE_ITT_BUILD
3008     __kmp_itt_lock_cancelled((kmp_user_lock_p)user_lock);
3009 #endif
3010     return FTN_FALSE;
3011   }
3012 
3013 #else // KMP_USE_DYNAMIC_LOCK
3014 
3015   kmp_user_lock_p lck;
3016   int rc;
3017 
3018   if ((__kmp_user_lock_kind == lk_tas) &&
3019       (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
3020     lck = (kmp_user_lock_p)user_lock;
3021   }
3022 #if KMP_USE_FUTEX
3023   else if ((__kmp_user_lock_kind == lk_futex) &&
3024            (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
3025     lck = (kmp_user_lock_p)user_lock;
3026   }
3027 #endif
3028   else {
3029     lck = __kmp_lookup_user_lock(user_lock, "omp_test_lock");
3030   }
3031 
3032 #if USE_ITT_BUILD
3033   __kmp_itt_lock_acquiring(lck);
3034 #endif /* USE_ITT_BUILD */
3035 #if OMPT_SUPPORT && OMPT_OPTIONAL
3036   // This is the case, if called from omp_init_lock_with_hint:
3037   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
3038   if (!codeptr)
3039     codeptr = OMPT_GET_RETURN_ADDRESS(0);
3040   if (ompt_enabled.ompt_callback_mutex_acquire) {
3041     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
3042         ompt_mutex_lock, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
3043         (ompt_wait_id_t)(uintptr_t)lck, codeptr);
3044   }
3045 #endif
3046 
3047   rc = TEST_LOCK(lck, gtid);
3048 #if USE_ITT_BUILD
3049   if (rc) {
3050     __kmp_itt_lock_acquired(lck);
3051   } else {
3052     __kmp_itt_lock_cancelled(lck);
3053   }
3054 #endif /* USE_ITT_BUILD */
3055 #if OMPT_SUPPORT && OMPT_OPTIONAL
3056   if (rc && ompt_enabled.ompt_callback_mutex_acquired) {
3057     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
3058         ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
3059   }
3060 #endif
3061 
3062   return (rc ? FTN_TRUE : FTN_FALSE);
3063 
3064 /* Can't use serial interval since not block structured */
3065 
3066 #endif // KMP_USE_DYNAMIC_LOCK
3067 }
3068 
3069 /* try to acquire the lock */
3070 int __kmpc_test_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
3071 #if KMP_USE_DYNAMIC_LOCK
3072   int rc;
3073 #if USE_ITT_BUILD
3074   __kmp_itt_lock_acquiring((kmp_user_lock_p)user_lock);
3075 #endif
3076 #if OMPT_SUPPORT && OMPT_OPTIONAL
3077   // This is the case, if called from omp_init_lock_with_hint:
3078   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
3079   if (!codeptr)
3080     codeptr = OMPT_GET_RETURN_ADDRESS(0);
3081   if (ompt_enabled.ompt_callback_mutex_acquire) {
3082     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
3083         ompt_mutex_nest_lock, omp_lock_hint_none,
3084         __ompt_get_mutex_impl_type(user_lock),
3085         (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
3086   }
3087 #endif
3088   rc = KMP_D_LOCK_FUNC(user_lock, test)((kmp_dyna_lock_t *)user_lock, gtid);
3089 #if USE_ITT_BUILD
3090   if (rc) {
3091     __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock);
3092   } else {
3093     __kmp_itt_lock_cancelled((kmp_user_lock_p)user_lock);
3094   }
3095 #endif
3096 #if OMPT_SUPPORT && OMPT_OPTIONAL
3097   if (ompt_enabled.enabled && rc) {
3098     if (rc == 1) {
3099       if (ompt_enabled.ompt_callback_mutex_acquired) {
3100         // lock_first
3101         ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
3102             ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)user_lock,
3103             codeptr);
3104       }
3105     } else {
3106       if (ompt_enabled.ompt_callback_nest_lock) {
3107         // lock_next
3108         ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
3109             ompt_scope_begin, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
3110       }
3111     }
3112   }
3113 #endif
3114   return rc;
3115 
3116 #else // KMP_USE_DYNAMIC_LOCK
3117 
3118   kmp_user_lock_p lck;
3119   int rc;
3120 
3121   if ((__kmp_user_lock_kind == lk_tas) &&
3122       (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
3123        OMP_NEST_LOCK_T_SIZE)) {
3124     lck = (kmp_user_lock_p)user_lock;
3125   }
3126 #if KMP_USE_FUTEX
3127   else if ((__kmp_user_lock_kind == lk_futex) &&
3128            (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
3129             OMP_NEST_LOCK_T_SIZE)) {
3130     lck = (kmp_user_lock_p)user_lock;
3131   }
3132 #endif
3133   else {
3134     lck = __kmp_lookup_user_lock(user_lock, "omp_test_nest_lock");
3135   }
3136 
3137 #if USE_ITT_BUILD
3138   __kmp_itt_lock_acquiring(lck);
3139 #endif /* USE_ITT_BUILD */
3140 
3141 #if OMPT_SUPPORT && OMPT_OPTIONAL
3142   // This is the case, if called from omp_init_lock_with_hint:
3143   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
3144   if (!codeptr)
3145     codeptr = OMPT_GET_RETURN_ADDRESS(0);
3146   if (ompt_enabled.enabled) &&
3147         ompt_enabled.ompt_callback_mutex_acquire) {
3148       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
3149           ompt_mutex_nest_lock, omp_lock_hint_none,
3150           __ompt_get_mutex_impl_type(), (ompt_wait_id_t)(uintptr_t)lck,
3151           codeptr);
3152     }
3153 #endif
3154 
3155   rc = TEST_NESTED_LOCK(lck, gtid);
3156 #if USE_ITT_BUILD
3157   if (rc) {
3158     __kmp_itt_lock_acquired(lck);
3159   } else {
3160     __kmp_itt_lock_cancelled(lck);
3161   }
3162 #endif /* USE_ITT_BUILD */
3163 #if OMPT_SUPPORT && OMPT_OPTIONAL
3164   if (ompt_enabled.enabled && rc) {
3165     if (rc == 1) {
3166       if (ompt_enabled.ompt_callback_mutex_acquired) {
3167         // lock_first
3168         ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
3169             ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
3170       }
3171     } else {
3172       if (ompt_enabled.ompt_callback_nest_lock) {
3173         // lock_next
3174         ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
3175             ompt_mutex_scope_begin, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
3176       }
3177     }
3178   }
3179 #endif
3180   return rc;
3181 
3182 /* Can't use serial interval since not block structured */
3183 
3184 #endif // KMP_USE_DYNAMIC_LOCK
3185 }
3186 
3187 // Interface to fast scalable reduce methods routines
3188 
3189 // keep the selected method in a thread local structure for cross-function
3190 // usage: will be used in __kmpc_end_reduce* functions;
3191 // another solution: to re-determine the method one more time in
3192 // __kmpc_end_reduce* functions (new prototype required then)
3193 // AT: which solution is better?
3194 #define __KMP_SET_REDUCTION_METHOD(gtid, rmethod)                              \
3195   ((__kmp_threads[(gtid)]->th.th_local.packed_reduction_method) = (rmethod))
3196 
3197 #define __KMP_GET_REDUCTION_METHOD(gtid)                                       \
3198   (__kmp_threads[(gtid)]->th.th_local.packed_reduction_method)
3199 
3200 // description of the packed_reduction_method variable: look at the macros in
3201 // kmp.h
3202 
3203 // used in a critical section reduce block
3204 static __forceinline void
3205 __kmp_enter_critical_section_reduce_block(ident_t *loc, kmp_int32 global_tid,
3206                                           kmp_critical_name *crit) {
3207 
3208   // this lock was visible to a customer and to the threading profile tool as a
3209   // serial overhead span (although it's used for an internal purpose only)
3210   //            why was it visible in previous implementation?
3211   //            should we keep it visible in new reduce block?
3212   kmp_user_lock_p lck;
3213 
3214 #if KMP_USE_DYNAMIC_LOCK
3215 
3216   kmp_dyna_lock_t *lk = (kmp_dyna_lock_t *)crit;
3217   // Check if it is initialized.
3218   if (*lk == 0) {
3219     if (KMP_IS_D_LOCK(__kmp_user_lock_seq)) {
3220       KMP_COMPARE_AND_STORE_ACQ32((volatile kmp_int32 *)crit, 0,
3221                                   KMP_GET_D_TAG(__kmp_user_lock_seq));
3222     } else {
3223       __kmp_init_indirect_csptr(crit, loc, global_tid,
3224                                 KMP_GET_I_TAG(__kmp_user_lock_seq));
3225     }
3226   }
3227   // Branch for accessing the actual lock object and set operation. This
3228   // branching is inevitable since this lock initialization does not follow the
3229   // normal dispatch path (lock table is not used).
3230   if (KMP_EXTRACT_D_TAG(lk) != 0) {
3231     lck = (kmp_user_lock_p)lk;
3232     KMP_DEBUG_ASSERT(lck != NULL);
3233     if (__kmp_env_consistency_check) {
3234       __kmp_push_sync(global_tid, ct_critical, loc, lck, __kmp_user_lock_seq);
3235     }
3236     KMP_D_LOCK_FUNC(lk, set)(lk, global_tid);
3237   } else {
3238     kmp_indirect_lock_t *ilk = *((kmp_indirect_lock_t **)lk);
3239     lck = ilk->lock;
3240     KMP_DEBUG_ASSERT(lck != NULL);
3241     if (__kmp_env_consistency_check) {
3242       __kmp_push_sync(global_tid, ct_critical, loc, lck, __kmp_user_lock_seq);
3243     }
3244     KMP_I_LOCK_FUNC(ilk, set)(lck, global_tid);
3245   }
3246 
3247 #else // KMP_USE_DYNAMIC_LOCK
3248 
3249   // We know that the fast reduction code is only emitted by Intel compilers
3250   // with 32 byte critical sections. If there isn't enough space, then we
3251   // have to use a pointer.
3252   if (__kmp_base_user_lock_size <= INTEL_CRITICAL_SIZE) {
3253     lck = (kmp_user_lock_p)crit;
3254   } else {
3255     lck = __kmp_get_critical_section_ptr(crit, loc, global_tid);
3256   }
3257   KMP_DEBUG_ASSERT(lck != NULL);
3258 
3259   if (__kmp_env_consistency_check)
3260     __kmp_push_sync(global_tid, ct_critical, loc, lck);
3261 
3262   __kmp_acquire_user_lock_with_checks(lck, global_tid);
3263 
3264 #endif // KMP_USE_DYNAMIC_LOCK
3265 }
3266 
3267 // used in a critical section reduce block
3268 static __forceinline void
3269 __kmp_end_critical_section_reduce_block(ident_t *loc, kmp_int32 global_tid,
3270                                         kmp_critical_name *crit) {
3271 
3272   kmp_user_lock_p lck;
3273 
3274 #if KMP_USE_DYNAMIC_LOCK
3275 
3276   if (KMP_IS_D_LOCK(__kmp_user_lock_seq)) {
3277     lck = (kmp_user_lock_p)crit;
3278     if (__kmp_env_consistency_check)
3279       __kmp_pop_sync(global_tid, ct_critical, loc);
3280     KMP_D_LOCK_FUNC(lck, unset)((kmp_dyna_lock_t *)lck, global_tid);
3281   } else {
3282     kmp_indirect_lock_t *ilk =
3283         (kmp_indirect_lock_t *)TCR_PTR(*((kmp_indirect_lock_t **)crit));
3284     if (__kmp_env_consistency_check)
3285       __kmp_pop_sync(global_tid, ct_critical, loc);
3286     KMP_I_LOCK_FUNC(ilk, unset)(ilk->lock, global_tid);
3287   }
3288 
3289 #else // KMP_USE_DYNAMIC_LOCK
3290 
3291   // We know that the fast reduction code is only emitted by Intel compilers
3292   // with 32 byte critical sections. If there isn't enough space, then we have
3293   // to use a pointer.
3294   if (__kmp_base_user_lock_size > 32) {
3295     lck = *((kmp_user_lock_p *)crit);
3296     KMP_ASSERT(lck != NULL);
3297   } else {
3298     lck = (kmp_user_lock_p)crit;
3299   }
3300 
3301   if (__kmp_env_consistency_check)
3302     __kmp_pop_sync(global_tid, ct_critical, loc);
3303 
3304   __kmp_release_user_lock_with_checks(lck, global_tid);
3305 
3306 #endif // KMP_USE_DYNAMIC_LOCK
3307 } // __kmp_end_critical_section_reduce_block
3308 
3309 static __forceinline int
3310 __kmp_swap_teams_for_teams_reduction(kmp_info_t *th, kmp_team_t **team_p,
3311                                      int *task_state) {
3312   kmp_team_t *team;
3313 
3314   // Check if we are inside the teams construct?
3315   if (th->th.th_teams_microtask) {
3316     *team_p = team = th->th.th_team;
3317     if (team->t.t_level == th->th.th_teams_level) {
3318       // This is reduction at teams construct.
3319       KMP_DEBUG_ASSERT(!th->th.th_info.ds.ds_tid); // AC: check that tid == 0
3320       // Let's swap teams temporarily for the reduction.
3321       th->th.th_info.ds.ds_tid = team->t.t_master_tid;
3322       th->th.th_team = team->t.t_parent;
3323       th->th.th_team_nproc = th->th.th_team->t.t_nproc;
3324       th->th.th_task_team = th->th.th_team->t.t_task_team[0];
3325       *task_state = th->th.th_task_state;
3326       th->th.th_task_state = 0;
3327 
3328       return 1;
3329     }
3330   }
3331   return 0;
3332 }
3333 
3334 static __forceinline void
3335 __kmp_restore_swapped_teams(kmp_info_t *th, kmp_team_t *team, int task_state) {
3336   // Restore thread structure swapped in __kmp_swap_teams_for_teams_reduction.
3337   th->th.th_info.ds.ds_tid = 0;
3338   th->th.th_team = team;
3339   th->th.th_team_nproc = team->t.t_nproc;
3340   th->th.th_task_team = team->t.t_task_team[task_state];
3341   th->th.th_task_state = task_state;
3342 }
3343 
3344 /* 2.a.i. Reduce Block without a terminating barrier */
3345 /*!
3346 @ingroup SYNCHRONIZATION
3347 @param loc source location information
3348 @param global_tid global thread number
3349 @param num_vars number of items (variables) to be reduced
3350 @param reduce_size size of data in bytes to be reduced
3351 @param reduce_data pointer to data to be reduced
3352 @param reduce_func callback function providing reduction operation on two
3353 operands and returning result of reduction in lhs_data
3354 @param lck pointer to the unique lock data structure
3355 @result 1 for the master thread, 0 for all other team threads, 2 for all team
3356 threads if atomic reduction needed
3357 
3358 The nowait version is used for a reduce clause with the nowait argument.
3359 */
3360 kmp_int32
3361 __kmpc_reduce_nowait(ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars,
3362                      size_t reduce_size, void *reduce_data,
3363                      void (*reduce_func)(void *lhs_data, void *rhs_data),
3364                      kmp_critical_name *lck) {
3365 
3366   KMP_COUNT_BLOCK(REDUCE_nowait);
3367   int retval = 0;
3368   PACKED_REDUCTION_METHOD_T packed_reduction_method;
3369   kmp_info_t *th;
3370   kmp_team_t *team;
3371   int teams_swapped = 0, task_state;
3372   KA_TRACE(10, ("__kmpc_reduce_nowait() enter: called T#%d\n", global_tid));
3373 
3374   // why do we need this initialization here at all?
3375   // Reduction clause can not be used as a stand-alone directive.
3376 
3377   // do not call __kmp_serial_initialize(), it will be called by
3378   // __kmp_parallel_initialize() if needed
3379   // possible detection of false-positive race by the threadchecker ???
3380   if (!TCR_4(__kmp_init_parallel))
3381     __kmp_parallel_initialize();
3382 
3383   __kmp_resume_if_soft_paused();
3384 
3385 // check correctness of reduce block nesting
3386 #if KMP_USE_DYNAMIC_LOCK
3387   if (__kmp_env_consistency_check)
3388     __kmp_push_sync(global_tid, ct_reduce, loc, NULL, 0);
3389 #else
3390   if (__kmp_env_consistency_check)
3391     __kmp_push_sync(global_tid, ct_reduce, loc, NULL);
3392 #endif
3393 
3394   th = __kmp_thread_from_gtid(global_tid);
3395   teams_swapped = __kmp_swap_teams_for_teams_reduction(th, &team, &task_state);
3396 
3397   // packed_reduction_method value will be reused by __kmp_end_reduce* function,
3398   // the value should be kept in a variable
3399   // the variable should be either a construct-specific or thread-specific
3400   // property, not a team specific property
3401   //     (a thread can reach the next reduce block on the next construct, reduce
3402   //     method may differ on the next construct)
3403   // an ident_t "loc" parameter could be used as a construct-specific property
3404   // (what if loc == 0?)
3405   //     (if both construct-specific and team-specific variables were shared,
3406   //     then unness extra syncs should be needed)
3407   // a thread-specific variable is better regarding two issues above (next
3408   // construct and extra syncs)
3409   // a thread-specific "th_local.reduction_method" variable is used currently
3410   // each thread executes 'determine' and 'set' lines (no need to execute by one
3411   // thread, to avoid unness extra syncs)
3412 
3413   packed_reduction_method = __kmp_determine_reduction_method(
3414       loc, global_tid, num_vars, reduce_size, reduce_data, reduce_func, lck);
3415   __KMP_SET_REDUCTION_METHOD(global_tid, packed_reduction_method);
3416 
3417   OMPT_REDUCTION_DECL(th, global_tid);
3418   if (packed_reduction_method == critical_reduce_block) {
3419 
3420     OMPT_REDUCTION_BEGIN;
3421 
3422     __kmp_enter_critical_section_reduce_block(loc, global_tid, lck);
3423     retval = 1;
3424 
3425   } else if (packed_reduction_method == empty_reduce_block) {
3426 
3427     OMPT_REDUCTION_BEGIN;
3428 
3429     // usage: if team size == 1, no synchronization is required ( Intel
3430     // platforms only )
3431     retval = 1;
3432 
3433   } else if (packed_reduction_method == atomic_reduce_block) {
3434 
3435     retval = 2;
3436 
3437     // all threads should do this pop here (because __kmpc_end_reduce_nowait()
3438     // won't be called by the code gen)
3439     //     (it's not quite good, because the checking block has been closed by
3440     //     this 'pop',
3441     //      but atomic operation has not been executed yet, will be executed
3442     //      slightly later, literally on next instruction)
3443     if (__kmp_env_consistency_check)
3444       __kmp_pop_sync(global_tid, ct_reduce, loc);
3445 
3446   } else if (TEST_REDUCTION_METHOD(packed_reduction_method,
3447                                    tree_reduce_block)) {
3448 
3449 // AT: performance issue: a real barrier here
3450 // AT:     (if master goes slow, other threads are blocked here waiting for the
3451 // master to come and release them)
3452 // AT:     (it's not what a customer might expect specifying NOWAIT clause)
3453 // AT:     (specifying NOWAIT won't result in improvement of performance, it'll
3454 // be confusing to a customer)
3455 // AT: another implementation of *barrier_gather*nowait() (or some other design)
3456 // might go faster and be more in line with sense of NOWAIT
3457 // AT: TO DO: do epcc test and compare times
3458 
3459 // this barrier should be invisible to a customer and to the threading profile
3460 // tool (it's neither a terminating barrier nor customer's code, it's
3461 // used for an internal purpose)
3462 #if OMPT_SUPPORT
3463     // JP: can this barrier potentially leed to task scheduling?
3464     // JP: as long as there is a barrier in the implementation, OMPT should and
3465     // will provide the barrier events
3466     //         so we set-up the necessary frame/return addresses.
3467     ompt_frame_t *ompt_frame;
3468     if (ompt_enabled.enabled) {
3469       __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
3470       if (ompt_frame->enter_frame.ptr == NULL)
3471         ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
3472       OMPT_STORE_RETURN_ADDRESS(global_tid);
3473     }
3474 #endif
3475 #if USE_ITT_NOTIFY
3476     __kmp_threads[global_tid]->th.th_ident = loc;
3477 #endif
3478     retval =
3479         __kmp_barrier(UNPACK_REDUCTION_BARRIER(packed_reduction_method),
3480                       global_tid, FALSE, reduce_size, reduce_data, reduce_func);
3481     retval = (retval != 0) ? (0) : (1);
3482 #if OMPT_SUPPORT && OMPT_OPTIONAL
3483     if (ompt_enabled.enabled) {
3484       ompt_frame->enter_frame = ompt_data_none;
3485     }
3486 #endif
3487 
3488     // all other workers except master should do this pop here
3489     //     ( none of other workers will get to __kmpc_end_reduce_nowait() )
3490     if (__kmp_env_consistency_check) {
3491       if (retval == 0) {
3492         __kmp_pop_sync(global_tid, ct_reduce, loc);
3493       }
3494     }
3495 
3496   } else {
3497 
3498     // should never reach this block
3499     KMP_ASSERT(0); // "unexpected method"
3500   }
3501   if (teams_swapped) {
3502     __kmp_restore_swapped_teams(th, team, task_state);
3503   }
3504   KA_TRACE(
3505       10,
3506       ("__kmpc_reduce_nowait() exit: called T#%d: method %08x, returns %08x\n",
3507        global_tid, packed_reduction_method, retval));
3508 
3509   return retval;
3510 }
3511 
3512 /*!
3513 @ingroup SYNCHRONIZATION
3514 @param loc source location information
3515 @param global_tid global thread id.
3516 @param lck pointer to the unique lock data structure
3517 
3518 Finish the execution of a reduce nowait.
3519 */
3520 void __kmpc_end_reduce_nowait(ident_t *loc, kmp_int32 global_tid,
3521                               kmp_critical_name *lck) {
3522 
3523   PACKED_REDUCTION_METHOD_T packed_reduction_method;
3524 
3525   KA_TRACE(10, ("__kmpc_end_reduce_nowait() enter: called T#%d\n", global_tid));
3526 
3527   packed_reduction_method = __KMP_GET_REDUCTION_METHOD(global_tid);
3528 
3529   OMPT_REDUCTION_DECL(__kmp_thread_from_gtid(global_tid), global_tid);
3530 
3531   if (packed_reduction_method == critical_reduce_block) {
3532 
3533     __kmp_end_critical_section_reduce_block(loc, global_tid, lck);
3534     OMPT_REDUCTION_END;
3535 
3536   } else if (packed_reduction_method == empty_reduce_block) {
3537 
3538     // usage: if team size == 1, no synchronization is required ( on Intel
3539     // platforms only )
3540 
3541     OMPT_REDUCTION_END;
3542 
3543   } else if (packed_reduction_method == atomic_reduce_block) {
3544 
3545     // neither master nor other workers should get here
3546     //     (code gen does not generate this call in case 2: atomic reduce block)
3547     // actually it's better to remove this elseif at all;
3548     // after removal this value will checked by the 'else' and will assert
3549 
3550   } else if (TEST_REDUCTION_METHOD(packed_reduction_method,
3551                                    tree_reduce_block)) {
3552 
3553     // only master gets here
3554     // OMPT: tree reduction is annotated in the barrier code
3555 
3556   } else {
3557 
3558     // should never reach this block
3559     KMP_ASSERT(0); // "unexpected method"
3560   }
3561 
3562   if (__kmp_env_consistency_check)
3563     __kmp_pop_sync(global_tid, ct_reduce, loc);
3564 
3565   KA_TRACE(10, ("__kmpc_end_reduce_nowait() exit: called T#%d: method %08x\n",
3566                 global_tid, packed_reduction_method));
3567 
3568   return;
3569 }
3570 
3571 /* 2.a.ii. Reduce Block with a terminating barrier */
3572 
3573 /*!
3574 @ingroup SYNCHRONIZATION
3575 @param loc source location information
3576 @param global_tid global thread number
3577 @param num_vars number of items (variables) to be reduced
3578 @param reduce_size size of data in bytes to be reduced
3579 @param reduce_data pointer to data to be reduced
3580 @param reduce_func callback function providing reduction operation on two
3581 operands and returning result of reduction in lhs_data
3582 @param lck pointer to the unique lock data structure
3583 @result 1 for the master thread, 0 for all other team threads, 2 for all team
3584 threads if atomic reduction needed
3585 
3586 A blocking reduce that includes an implicit barrier.
3587 */
3588 kmp_int32 __kmpc_reduce(ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars,
3589                         size_t reduce_size, void *reduce_data,
3590                         void (*reduce_func)(void *lhs_data, void *rhs_data),
3591                         kmp_critical_name *lck) {
3592   KMP_COUNT_BLOCK(REDUCE_wait);
3593   int retval = 0;
3594   PACKED_REDUCTION_METHOD_T packed_reduction_method;
3595   kmp_info_t *th;
3596   kmp_team_t *team;
3597   int teams_swapped = 0, task_state;
3598 
3599   KA_TRACE(10, ("__kmpc_reduce() enter: called T#%d\n", global_tid));
3600 
3601   // why do we need this initialization here at all?
3602   // Reduction clause can not be a stand-alone directive.
3603 
3604   // do not call __kmp_serial_initialize(), it will be called by
3605   // __kmp_parallel_initialize() if needed
3606   // possible detection of false-positive race by the threadchecker ???
3607   if (!TCR_4(__kmp_init_parallel))
3608     __kmp_parallel_initialize();
3609 
3610   __kmp_resume_if_soft_paused();
3611 
3612 // check correctness of reduce block nesting
3613 #if KMP_USE_DYNAMIC_LOCK
3614   if (__kmp_env_consistency_check)
3615     __kmp_push_sync(global_tid, ct_reduce, loc, NULL, 0);
3616 #else
3617   if (__kmp_env_consistency_check)
3618     __kmp_push_sync(global_tid, ct_reduce, loc, NULL);
3619 #endif
3620 
3621   th = __kmp_thread_from_gtid(global_tid);
3622   teams_swapped = __kmp_swap_teams_for_teams_reduction(th, &team, &task_state);
3623 
3624   packed_reduction_method = __kmp_determine_reduction_method(
3625       loc, global_tid, num_vars, reduce_size, reduce_data, reduce_func, lck);
3626   __KMP_SET_REDUCTION_METHOD(global_tid, packed_reduction_method);
3627 
3628   OMPT_REDUCTION_DECL(th, global_tid);
3629 
3630   if (packed_reduction_method == critical_reduce_block) {
3631 
3632     OMPT_REDUCTION_BEGIN;
3633     __kmp_enter_critical_section_reduce_block(loc, global_tid, lck);
3634     retval = 1;
3635 
3636   } else if (packed_reduction_method == empty_reduce_block) {
3637 
3638     OMPT_REDUCTION_BEGIN;
3639     // usage: if team size == 1, no synchronization is required ( Intel
3640     // platforms only )
3641     retval = 1;
3642 
3643   } else if (packed_reduction_method == atomic_reduce_block) {
3644 
3645     retval = 2;
3646 
3647   } else if (TEST_REDUCTION_METHOD(packed_reduction_method,
3648                                    tree_reduce_block)) {
3649 
3650 // case tree_reduce_block:
3651 // this barrier should be visible to a customer and to the threading profile
3652 // tool (it's a terminating barrier on constructs if NOWAIT not specified)
3653 #if OMPT_SUPPORT
3654     ompt_frame_t *ompt_frame;
3655     if (ompt_enabled.enabled) {
3656       __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
3657       if (ompt_frame->enter_frame.ptr == NULL)
3658         ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
3659       OMPT_STORE_RETURN_ADDRESS(global_tid);
3660     }
3661 #endif
3662 #if USE_ITT_NOTIFY
3663     __kmp_threads[global_tid]->th.th_ident =
3664         loc; // needed for correct notification of frames
3665 #endif
3666     retval =
3667         __kmp_barrier(UNPACK_REDUCTION_BARRIER(packed_reduction_method),
3668                       global_tid, TRUE, reduce_size, reduce_data, reduce_func);
3669     retval = (retval != 0) ? (0) : (1);
3670 #if OMPT_SUPPORT && OMPT_OPTIONAL
3671     if (ompt_enabled.enabled) {
3672       ompt_frame->enter_frame = ompt_data_none;
3673     }
3674 #endif
3675 
3676     // all other workers except master should do this pop here
3677     // ( none of other workers except master will enter __kmpc_end_reduce() )
3678     if (__kmp_env_consistency_check) {
3679       if (retval == 0) { // 0: all other workers; 1: master
3680         __kmp_pop_sync(global_tid, ct_reduce, loc);
3681       }
3682     }
3683 
3684   } else {
3685 
3686     // should never reach this block
3687     KMP_ASSERT(0); // "unexpected method"
3688   }
3689   if (teams_swapped) {
3690     __kmp_restore_swapped_teams(th, team, task_state);
3691   }
3692 
3693   KA_TRACE(10,
3694            ("__kmpc_reduce() exit: called T#%d: method %08x, returns %08x\n",
3695             global_tid, packed_reduction_method, retval));
3696   return retval;
3697 }
3698 
3699 /*!
3700 @ingroup SYNCHRONIZATION
3701 @param loc source location information
3702 @param global_tid global thread id.
3703 @param lck pointer to the unique lock data structure
3704 
3705 Finish the execution of a blocking reduce.
3706 The <tt>lck</tt> pointer must be the same as that used in the corresponding
3707 start function.
3708 */
3709 void __kmpc_end_reduce(ident_t *loc, kmp_int32 global_tid,
3710                        kmp_critical_name *lck) {
3711 
3712   PACKED_REDUCTION_METHOD_T packed_reduction_method;
3713   kmp_info_t *th;
3714   kmp_team_t *team;
3715   int teams_swapped = 0, task_state;
3716 
3717   KA_TRACE(10, ("__kmpc_end_reduce() enter: called T#%d\n", global_tid));
3718 
3719   th = __kmp_thread_from_gtid(global_tid);
3720   teams_swapped = __kmp_swap_teams_for_teams_reduction(th, &team, &task_state);
3721 
3722   packed_reduction_method = __KMP_GET_REDUCTION_METHOD(global_tid);
3723 
3724   // this barrier should be visible to a customer and to the threading profile
3725   // tool (it's a terminating barrier on constructs if NOWAIT not specified)
3726   OMPT_REDUCTION_DECL(th, global_tid);
3727 
3728   if (packed_reduction_method == critical_reduce_block) {
3729     __kmp_end_critical_section_reduce_block(loc, global_tid, lck);
3730 
3731     OMPT_REDUCTION_END;
3732 
3733 // TODO: implicit barrier: should be exposed
3734 #if OMPT_SUPPORT
3735     ompt_frame_t *ompt_frame;
3736     if (ompt_enabled.enabled) {
3737       __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
3738       if (ompt_frame->enter_frame.ptr == NULL)
3739         ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
3740       OMPT_STORE_RETURN_ADDRESS(global_tid);
3741     }
3742 #endif
3743 #if USE_ITT_NOTIFY
3744     __kmp_threads[global_tid]->th.th_ident = loc;
3745 #endif
3746     __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
3747 #if OMPT_SUPPORT && OMPT_OPTIONAL
3748     if (ompt_enabled.enabled) {
3749       ompt_frame->enter_frame = ompt_data_none;
3750     }
3751 #endif
3752 
3753   } else if (packed_reduction_method == empty_reduce_block) {
3754 
3755     OMPT_REDUCTION_END;
3756 
3757 // usage: if team size==1, no synchronization is required (Intel platforms only)
3758 
3759 // TODO: implicit barrier: should be exposed
3760 #if OMPT_SUPPORT
3761     ompt_frame_t *ompt_frame;
3762     if (ompt_enabled.enabled) {
3763       __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
3764       if (ompt_frame->enter_frame.ptr == NULL)
3765         ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
3766       OMPT_STORE_RETURN_ADDRESS(global_tid);
3767     }
3768 #endif
3769 #if USE_ITT_NOTIFY
3770     __kmp_threads[global_tid]->th.th_ident = loc;
3771 #endif
3772     __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
3773 #if OMPT_SUPPORT && OMPT_OPTIONAL
3774     if (ompt_enabled.enabled) {
3775       ompt_frame->enter_frame = ompt_data_none;
3776     }
3777 #endif
3778 
3779   } else if (packed_reduction_method == atomic_reduce_block) {
3780 
3781 #if OMPT_SUPPORT
3782     ompt_frame_t *ompt_frame;
3783     if (ompt_enabled.enabled) {
3784       __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
3785       if (ompt_frame->enter_frame.ptr == NULL)
3786         ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
3787       OMPT_STORE_RETURN_ADDRESS(global_tid);
3788     }
3789 #endif
3790 // TODO: implicit barrier: should be exposed
3791 #if USE_ITT_NOTIFY
3792     __kmp_threads[global_tid]->th.th_ident = loc;
3793 #endif
3794     __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
3795 #if OMPT_SUPPORT && OMPT_OPTIONAL
3796     if (ompt_enabled.enabled) {
3797       ompt_frame->enter_frame = ompt_data_none;
3798     }
3799 #endif
3800 
3801   } else if (TEST_REDUCTION_METHOD(packed_reduction_method,
3802                                    tree_reduce_block)) {
3803 
3804     // only master executes here (master releases all other workers)
3805     __kmp_end_split_barrier(UNPACK_REDUCTION_BARRIER(packed_reduction_method),
3806                             global_tid);
3807 
3808   } else {
3809 
3810     // should never reach this block
3811     KMP_ASSERT(0); // "unexpected method"
3812   }
3813   if (teams_swapped) {
3814     __kmp_restore_swapped_teams(th, team, task_state);
3815   }
3816 
3817   if (__kmp_env_consistency_check)
3818     __kmp_pop_sync(global_tid, ct_reduce, loc);
3819 
3820   KA_TRACE(10, ("__kmpc_end_reduce() exit: called T#%d: method %08x\n",
3821                 global_tid, packed_reduction_method));
3822 
3823   return;
3824 }
3825 
3826 #undef __KMP_GET_REDUCTION_METHOD
3827 #undef __KMP_SET_REDUCTION_METHOD
3828 
3829 /* end of interface to fast scalable reduce routines */
3830 
3831 kmp_uint64 __kmpc_get_taskid() {
3832 
3833   kmp_int32 gtid;
3834   kmp_info_t *thread;
3835 
3836   gtid = __kmp_get_gtid();
3837   if (gtid < 0) {
3838     return 0;
3839   }
3840   thread = __kmp_thread_from_gtid(gtid);
3841   return thread->th.th_current_task->td_task_id;
3842 
3843 } // __kmpc_get_taskid
3844 
3845 kmp_uint64 __kmpc_get_parent_taskid() {
3846 
3847   kmp_int32 gtid;
3848   kmp_info_t *thread;
3849   kmp_taskdata_t *parent_task;
3850 
3851   gtid = __kmp_get_gtid();
3852   if (gtid < 0) {
3853     return 0;
3854   }
3855   thread = __kmp_thread_from_gtid(gtid);
3856   parent_task = thread->th.th_current_task->td_parent;
3857   return (parent_task == NULL ? 0 : parent_task->td_task_id);
3858 
3859 } // __kmpc_get_parent_taskid
3860 
3861 /*!
3862 @ingroup WORK_SHARING
3863 @param loc  source location information.
3864 @param gtid  global thread number.
3865 @param num_dims  number of associated doacross loops.
3866 @param dims  info on loops bounds.
3867 
3868 Initialize doacross loop information.
3869 Expect compiler send us inclusive bounds,
3870 e.g. for(i=2;i<9;i+=2) lo=2, up=8, st=2.
3871 */
3872 void __kmpc_doacross_init(ident_t *loc, int gtid, int num_dims,
3873                           const struct kmp_dim *dims) {
3874   int j, idx;
3875   kmp_int64 last, trace_count;
3876   kmp_info_t *th = __kmp_threads[gtid];
3877   kmp_team_t *team = th->th.th_team;
3878   kmp_uint32 *flags;
3879   kmp_disp_t *pr_buf = th->th.th_dispatch;
3880   dispatch_shared_info_t *sh_buf;
3881 
3882   KA_TRACE(
3883       20,
3884       ("__kmpc_doacross_init() enter: called T#%d, num dims %d, active %d\n",
3885        gtid, num_dims, !team->t.t_serialized));
3886   KMP_DEBUG_ASSERT(dims != NULL);
3887   KMP_DEBUG_ASSERT(num_dims > 0);
3888 
3889   if (team->t.t_serialized) {
3890     KA_TRACE(20, ("__kmpc_doacross_init() exit: serialized team\n"));
3891     return; // no dependencies if team is serialized
3892   }
3893   KMP_DEBUG_ASSERT(team->t.t_nproc > 1);
3894   idx = pr_buf->th_doacross_buf_idx++; // Increment index of shared buffer for
3895   // the next loop
3896   sh_buf = &team->t.t_disp_buffer[idx % __kmp_dispatch_num_buffers];
3897 
3898   // Save bounds info into allocated private buffer
3899   KMP_DEBUG_ASSERT(pr_buf->th_doacross_info == NULL);
3900   pr_buf->th_doacross_info = (kmp_int64 *)__kmp_thread_malloc(
3901       th, sizeof(kmp_int64) * (4 * num_dims + 1));
3902   KMP_DEBUG_ASSERT(pr_buf->th_doacross_info != NULL);
3903   pr_buf->th_doacross_info[0] =
3904       (kmp_int64)num_dims; // first element is number of dimensions
3905   // Save also address of num_done in order to access it later without knowing
3906   // the buffer index
3907   pr_buf->th_doacross_info[1] = (kmp_int64)&sh_buf->doacross_num_done;
3908   pr_buf->th_doacross_info[2] = dims[0].lo;
3909   pr_buf->th_doacross_info[3] = dims[0].up;
3910   pr_buf->th_doacross_info[4] = dims[0].st;
3911   last = 5;
3912   for (j = 1; j < num_dims; ++j) {
3913     kmp_int64
3914         range_length; // To keep ranges of all dimensions but the first dims[0]
3915     if (dims[j].st == 1) { // most common case
3916       // AC: should we care of ranges bigger than LLONG_MAX? (not for now)
3917       range_length = dims[j].up - dims[j].lo + 1;
3918     } else {
3919       if (dims[j].st > 0) {
3920         KMP_DEBUG_ASSERT(dims[j].up > dims[j].lo);
3921         range_length = (kmp_uint64)(dims[j].up - dims[j].lo) / dims[j].st + 1;
3922       } else { // negative increment
3923         KMP_DEBUG_ASSERT(dims[j].lo > dims[j].up);
3924         range_length =
3925             (kmp_uint64)(dims[j].lo - dims[j].up) / (-dims[j].st) + 1;
3926       }
3927     }
3928     pr_buf->th_doacross_info[last++] = range_length;
3929     pr_buf->th_doacross_info[last++] = dims[j].lo;
3930     pr_buf->th_doacross_info[last++] = dims[j].up;
3931     pr_buf->th_doacross_info[last++] = dims[j].st;
3932   }
3933 
3934   // Compute total trip count.
3935   // Start with range of dims[0] which we don't need to keep in the buffer.
3936   if (dims[0].st == 1) { // most common case
3937     trace_count = dims[0].up - dims[0].lo + 1;
3938   } else if (dims[0].st > 0) {
3939     KMP_DEBUG_ASSERT(dims[0].up > dims[0].lo);
3940     trace_count = (kmp_uint64)(dims[0].up - dims[0].lo) / dims[0].st + 1;
3941   } else { // negative increment
3942     KMP_DEBUG_ASSERT(dims[0].lo > dims[0].up);
3943     trace_count = (kmp_uint64)(dims[0].lo - dims[0].up) / (-dims[0].st) + 1;
3944   }
3945   for (j = 1; j < num_dims; ++j) {
3946     trace_count *= pr_buf->th_doacross_info[4 * j + 1]; // use kept ranges
3947   }
3948   KMP_DEBUG_ASSERT(trace_count > 0);
3949 
3950   // Check if shared buffer is not occupied by other loop (idx -
3951   // __kmp_dispatch_num_buffers)
3952   if (idx != sh_buf->doacross_buf_idx) {
3953     // Shared buffer is occupied, wait for it to be free
3954     __kmp_wait_4((volatile kmp_uint32 *)&sh_buf->doacross_buf_idx, idx,
3955                  __kmp_eq_4, NULL);
3956   }
3957 #if KMP_32_BIT_ARCH
3958   // Check if we are the first thread. After the CAS the first thread gets 0,
3959   // others get 1 if initialization is in progress, allocated pointer otherwise.
3960   // Treat pointer as volatile integer (value 0 or 1) until memory is allocated.
3961   flags = (kmp_uint32 *)KMP_COMPARE_AND_STORE_RET32(
3962       (volatile kmp_int32 *)&sh_buf->doacross_flags, NULL, 1);
3963 #else
3964   flags = (kmp_uint32 *)KMP_COMPARE_AND_STORE_RET64(
3965       (volatile kmp_int64 *)&sh_buf->doacross_flags, NULL, 1LL);
3966 #endif
3967   if (flags == NULL) {
3968     // we are the first thread, allocate the array of flags
3969     size_t size = trace_count / 8 + 8; // in bytes, use single bit per iteration
3970     flags = (kmp_uint32 *)__kmp_thread_calloc(th, size, 1);
3971     KMP_MB();
3972     sh_buf->doacross_flags = flags;
3973   } else if (flags == (kmp_uint32 *)1) {
3974 #if KMP_32_BIT_ARCH
3975     // initialization is still in progress, need to wait
3976     while (*(volatile kmp_int32 *)&sh_buf->doacross_flags == 1)
3977 #else
3978     while (*(volatile kmp_int64 *)&sh_buf->doacross_flags == 1LL)
3979 #endif
3980       KMP_YIELD(TRUE);
3981     KMP_MB();
3982   } else {
3983     KMP_MB();
3984   }
3985   KMP_DEBUG_ASSERT(sh_buf->doacross_flags > (kmp_uint32 *)1); // check ptr value
3986   pr_buf->th_doacross_flags =
3987       sh_buf->doacross_flags; // save private copy in order to not
3988   // touch shared buffer on each iteration
3989   KA_TRACE(20, ("__kmpc_doacross_init() exit: T#%d\n", gtid));
3990 }
3991 
3992 void __kmpc_doacross_wait(ident_t *loc, int gtid, const kmp_int64 *vec) {
3993   kmp_int32 shft, num_dims, i;
3994   kmp_uint32 flag;
3995   kmp_int64 iter_number; // iteration number of "collapsed" loop nest
3996   kmp_info_t *th = __kmp_threads[gtid];
3997   kmp_team_t *team = th->th.th_team;
3998   kmp_disp_t *pr_buf;
3999   kmp_int64 lo, up, st;
4000 
4001   KA_TRACE(20, ("__kmpc_doacross_wait() enter: called T#%d\n", gtid));
4002   if (team->t.t_serialized) {
4003     KA_TRACE(20, ("__kmpc_doacross_wait() exit: serialized team\n"));
4004     return; // no dependencies if team is serialized
4005   }
4006 
4007   // calculate sequential iteration number and check out-of-bounds condition
4008   pr_buf = th->th.th_dispatch;
4009   KMP_DEBUG_ASSERT(pr_buf->th_doacross_info != NULL);
4010   num_dims = pr_buf->th_doacross_info[0];
4011   lo = pr_buf->th_doacross_info[2];
4012   up = pr_buf->th_doacross_info[3];
4013   st = pr_buf->th_doacross_info[4];
4014 #if OMPT_SUPPORT && OMPT_OPTIONAL
4015   ompt_dependence_t deps[num_dims];
4016 #endif
4017   if (st == 1) { // most common case
4018     if (vec[0] < lo || vec[0] > up) {
4019       KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
4020                     "bounds [%lld,%lld]\n",
4021                     gtid, vec[0], lo, up));
4022       return;
4023     }
4024     iter_number = vec[0] - lo;
4025   } else if (st > 0) {
4026     if (vec[0] < lo || vec[0] > up) {
4027       KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
4028                     "bounds [%lld,%lld]\n",
4029                     gtid, vec[0], lo, up));
4030       return;
4031     }
4032     iter_number = (kmp_uint64)(vec[0] - lo) / st;
4033   } else { // negative increment
4034     if (vec[0] > lo || vec[0] < up) {
4035       KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
4036                     "bounds [%lld,%lld]\n",
4037                     gtid, vec[0], lo, up));
4038       return;
4039     }
4040     iter_number = (kmp_uint64)(lo - vec[0]) / (-st);
4041   }
4042 #if OMPT_SUPPORT && OMPT_OPTIONAL
4043   deps[0].variable.value = iter_number;
4044   deps[0].dependence_type = ompt_dependence_type_sink;
4045 #endif
4046   for (i = 1; i < num_dims; ++i) {
4047     kmp_int64 iter, ln;
4048     kmp_int32 j = i * 4;
4049     ln = pr_buf->th_doacross_info[j + 1];
4050     lo = pr_buf->th_doacross_info[j + 2];
4051     up = pr_buf->th_doacross_info[j + 3];
4052     st = pr_buf->th_doacross_info[j + 4];
4053     if (st == 1) {
4054       if (vec[i] < lo || vec[i] > up) {
4055         KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
4056                       "bounds [%lld,%lld]\n",
4057                       gtid, vec[i], lo, up));
4058         return;
4059       }
4060       iter = vec[i] - lo;
4061     } else if (st > 0) {
4062       if (vec[i] < lo || vec[i] > up) {
4063         KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
4064                       "bounds [%lld,%lld]\n",
4065                       gtid, vec[i], lo, up));
4066         return;
4067       }
4068       iter = (kmp_uint64)(vec[i] - lo) / st;
4069     } else { // st < 0
4070       if (vec[i] > lo || vec[i] < up) {
4071         KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
4072                       "bounds [%lld,%lld]\n",
4073                       gtid, vec[i], lo, up));
4074         return;
4075       }
4076       iter = (kmp_uint64)(lo - vec[i]) / (-st);
4077     }
4078     iter_number = iter + ln * iter_number;
4079 #if OMPT_SUPPORT && OMPT_OPTIONAL
4080     deps[i].variable.value = iter;
4081     deps[i].dependence_type = ompt_dependence_type_sink;
4082 #endif
4083   }
4084   shft = iter_number % 32; // use 32-bit granularity
4085   iter_number >>= 5; // divided by 32
4086   flag = 1 << shft;
4087   while ((flag & pr_buf->th_doacross_flags[iter_number]) == 0) {
4088     KMP_YIELD(TRUE);
4089   }
4090   KMP_MB();
4091 #if OMPT_SUPPORT && OMPT_OPTIONAL
4092   if (ompt_enabled.ompt_callback_dependences) {
4093     ompt_callbacks.ompt_callback(ompt_callback_dependences)(
4094         &(OMPT_CUR_TASK_INFO(th)->task_data), deps, num_dims);
4095   }
4096 #endif
4097   KA_TRACE(20,
4098            ("__kmpc_doacross_wait() exit: T#%d wait for iter %lld completed\n",
4099             gtid, (iter_number << 5) + shft));
4100 }
4101 
4102 void __kmpc_doacross_post(ident_t *loc, int gtid, const kmp_int64 *vec) {
4103   kmp_int32 shft, num_dims, i;
4104   kmp_uint32 flag;
4105   kmp_int64 iter_number; // iteration number of "collapsed" loop nest
4106   kmp_info_t *th = __kmp_threads[gtid];
4107   kmp_team_t *team = th->th.th_team;
4108   kmp_disp_t *pr_buf;
4109   kmp_int64 lo, st;
4110 
4111   KA_TRACE(20, ("__kmpc_doacross_post() enter: called T#%d\n", gtid));
4112   if (team->t.t_serialized) {
4113     KA_TRACE(20, ("__kmpc_doacross_post() exit: serialized team\n"));
4114     return; // no dependencies if team is serialized
4115   }
4116 
4117   // calculate sequential iteration number (same as in "wait" but no
4118   // out-of-bounds checks)
4119   pr_buf = th->th.th_dispatch;
4120   KMP_DEBUG_ASSERT(pr_buf->th_doacross_info != NULL);
4121   num_dims = pr_buf->th_doacross_info[0];
4122   lo = pr_buf->th_doacross_info[2];
4123   st = pr_buf->th_doacross_info[4];
4124 #if OMPT_SUPPORT && OMPT_OPTIONAL
4125   ompt_dependence_t deps[num_dims];
4126 #endif
4127   if (st == 1) { // most common case
4128     iter_number = vec[0] - lo;
4129   } else if (st > 0) {
4130     iter_number = (kmp_uint64)(vec[0] - lo) / st;
4131   } else { // negative increment
4132     iter_number = (kmp_uint64)(lo - vec[0]) / (-st);
4133   }
4134 #if OMPT_SUPPORT && OMPT_OPTIONAL
4135   deps[0].variable.value = iter_number;
4136   deps[0].dependence_type = ompt_dependence_type_source;
4137 #endif
4138   for (i = 1; i < num_dims; ++i) {
4139     kmp_int64 iter, ln;
4140     kmp_int32 j = i * 4;
4141     ln = pr_buf->th_doacross_info[j + 1];
4142     lo = pr_buf->th_doacross_info[j + 2];
4143     st = pr_buf->th_doacross_info[j + 4];
4144     if (st == 1) {
4145       iter = vec[i] - lo;
4146     } else if (st > 0) {
4147       iter = (kmp_uint64)(vec[i] - lo) / st;
4148     } else { // st < 0
4149       iter = (kmp_uint64)(lo - vec[i]) / (-st);
4150     }
4151     iter_number = iter + ln * iter_number;
4152 #if OMPT_SUPPORT && OMPT_OPTIONAL
4153     deps[i].variable.value = iter;
4154     deps[i].dependence_type = ompt_dependence_type_source;
4155 #endif
4156   }
4157 #if OMPT_SUPPORT && OMPT_OPTIONAL
4158   if (ompt_enabled.ompt_callback_dependences) {
4159     ompt_callbacks.ompt_callback(ompt_callback_dependences)(
4160         &(OMPT_CUR_TASK_INFO(th)->task_data), deps, num_dims);
4161   }
4162 #endif
4163   shft = iter_number % 32; // use 32-bit granularity
4164   iter_number >>= 5; // divided by 32
4165   flag = 1 << shft;
4166   KMP_MB();
4167   if ((flag & pr_buf->th_doacross_flags[iter_number]) == 0)
4168     KMP_TEST_THEN_OR32(&pr_buf->th_doacross_flags[iter_number], flag);
4169   KA_TRACE(20, ("__kmpc_doacross_post() exit: T#%d iter %lld posted\n", gtid,
4170                 (iter_number << 5) + shft));
4171 }
4172 
4173 void __kmpc_doacross_fini(ident_t *loc, int gtid) {
4174   kmp_int32 num_done;
4175   kmp_info_t *th = __kmp_threads[gtid];
4176   kmp_team_t *team = th->th.th_team;
4177   kmp_disp_t *pr_buf = th->th.th_dispatch;
4178 
4179   KA_TRACE(20, ("__kmpc_doacross_fini() enter: called T#%d\n", gtid));
4180   if (team->t.t_serialized) {
4181     KA_TRACE(20, ("__kmpc_doacross_fini() exit: serialized team %p\n", team));
4182     return; // nothing to do
4183   }
4184   num_done = KMP_TEST_THEN_INC32((kmp_int32 *)pr_buf->th_doacross_info[1]) + 1;
4185   if (num_done == th->th.th_team_nproc) {
4186     // we are the last thread, need to free shared resources
4187     int idx = pr_buf->th_doacross_buf_idx - 1;
4188     dispatch_shared_info_t *sh_buf =
4189         &team->t.t_disp_buffer[idx % __kmp_dispatch_num_buffers];
4190     KMP_DEBUG_ASSERT(pr_buf->th_doacross_info[1] ==
4191                      (kmp_int64)&sh_buf->doacross_num_done);
4192     KMP_DEBUG_ASSERT(num_done == sh_buf->doacross_num_done);
4193     KMP_DEBUG_ASSERT(idx == sh_buf->doacross_buf_idx);
4194     __kmp_thread_free(th, CCAST(kmp_uint32 *, sh_buf->doacross_flags));
4195     sh_buf->doacross_flags = NULL;
4196     sh_buf->doacross_num_done = 0;
4197     sh_buf->doacross_buf_idx +=
4198         __kmp_dispatch_num_buffers; // free buffer for future re-use
4199   }
4200   // free private resources (need to keep buffer index forever)
4201   pr_buf->th_doacross_flags = NULL;
4202   __kmp_thread_free(th, (void *)pr_buf->th_doacross_info);
4203   pr_buf->th_doacross_info = NULL;
4204   KA_TRACE(20, ("__kmpc_doacross_fini() exit: T#%d\n", gtid));
4205 }
4206 
4207 /* omp_alloc/omp_free only defined for C/C++, not for Fortran */
4208 void *omp_alloc(size_t size, omp_allocator_handle_t allocator) {
4209   return __kmpc_alloc(__kmp_entry_gtid(), size, allocator);
4210 }
4211 
4212 void omp_free(void *ptr, omp_allocator_handle_t allocator) {
4213   __kmpc_free(__kmp_entry_gtid(), ptr, allocator);
4214 }
4215 
4216 int __kmpc_get_target_offload(void) {
4217   if (!__kmp_init_serial) {
4218     __kmp_serial_initialize();
4219   }
4220   return __kmp_target_offload;
4221 }
4222 
4223 int __kmpc_pause_resource(kmp_pause_status_t level) {
4224   if (!__kmp_init_serial) {
4225     return 1; // Can't pause if runtime is not initialized
4226   }
4227   return __kmp_pause_resource(level);
4228 }
4229