/*
 * kmp_runtime.cpp -- KPTS runtime support library
 */

//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "kmp.h"
#include "kmp_affinity.h"
#include "kmp_atomic.h"
#include "kmp_environment.h"
#include "kmp_error.h"
#include "kmp_i18n.h"
#include "kmp_io.h"
#include "kmp_itt.h"
#include "kmp_settings.h"
#include "kmp_stats.h"
#include "kmp_str.h"
#include "kmp_wait_release.h"
#include "kmp_wrapper_getpid.h"
#include "kmp_dispatch.h"
#include "kmp_utils.h"
#if KMP_USE_HIER_SCHED
#include "kmp_dispatch_hier.h"
#endif

#if OMPT_SUPPORT
#include "ompt-specific.h"
#endif
#if OMPD_SUPPORT
#include "ompd-specific.h"
#endif

#if OMP_PROFILING_SUPPORT
#include "llvm/Support/TimeProfiler.h"
static char *ProfileTraceFile = nullptr;
#endif

/* these are temporary issues to be dealt with */
#define KMP_USE_PRCTL 0

#if KMP_OS_WINDOWS
#include <process.h>
#endif

#ifndef KMP_USE_SHM
// Windows and WASI do not need these include files as they don't use shared
// memory.
#else
#include <sys/mman.h>
#include <sys/stat.h>
#include <fcntl.h>
#define SHM_SIZE 1024
#endif

#if defined(KMP_GOMP_COMPAT)
char const __kmp_version_alt_comp[] =
    KMP_VERSION_PREFIX "alternative compiler support: yes";
#endif /* defined(KMP_GOMP_COMPAT) */

char const __kmp_version_omp_api[] =
    KMP_VERSION_PREFIX "API version: 5.0 (201611)";

#ifdef KMP_DEBUG
char const __kmp_version_lock[] =
    KMP_VERSION_PREFIX "lock type: run time selectable";
#endif /* KMP_DEBUG */

#define KMP_MIN(x, y) ((x) < (y) ? (x) : (y))

/* ------------------------------------------------------------------------ */

#if KMP_USE_MONITOR
kmp_info_t __kmp_monitor;
#endif

/* Forward declarations */

void __kmp_cleanup(void);

static void __kmp_initialize_info(kmp_info_t *, kmp_team_t *, int tid,
                                  int gtid);
static void __kmp_initialize_team(kmp_team_t *team, int new_nproc,
                                  kmp_internal_control_t *new_icvs,
                                  ident_t *loc);
#if KMP_AFFINITY_SUPPORTED
static void __kmp_partition_places(kmp_team_t *team,
                                   int update_master_only = 0);
#endif
static void __kmp_do_serial_initialize(void);
void __kmp_fork_barrier(int gtid, int tid);
void __kmp_join_barrier(int gtid);
void __kmp_setup_icv_copy(kmp_team_t *team, int new_nproc,
                          kmp_internal_control_t *new_icvs, ident_t *loc);

#ifdef USE_LOAD_BALANCE
static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc);
#endif

static int __kmp_expand_threads(int nNeed);
#if KMP_OS_WINDOWS
static int __kmp_unregister_root_other_thread(int gtid);
#endif
static void __kmp_reap_thread(kmp_info_t *thread, int is_root);
kmp_info_t *__kmp_thread_pool_insert_pt = NULL;

void __kmp_resize_dist_barrier(kmp_team_t *team, int old_nthreads,
                               int new_nthreads);
void __kmp_add_threads_to_team(kmp_team_t *team, int new_nthreads);

static kmp_nested_nthreads_t *__kmp_override_nested_nth(kmp_info_t *thr,
                                                        int level) {
  kmp_nested_nthreads_t *new_nested_nth =
      (kmp_nested_nthreads_t *)KMP_INTERNAL_MALLOC(
          sizeof(kmp_nested_nthreads_t));
  int new_size = level + thr->th.th_set_nested_nth_sz;
  new_nested_nth->nth = (int *)KMP_INTERNAL_MALLOC(new_size * sizeof(int));
  for (int i = 0; i < level + 1; ++i)
    new_nested_nth->nth[i] = 0;
  for (int i = level + 1, j = 1; i < new_size; ++i, ++j)
    new_nested_nth->nth[i] = thr->th.th_set_nested_nth[j];
  new_nested_nth->size = new_nested_nth->used = new_size;
  return new_nested_nth;
}
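
// A minimal standalone sketch (illustration only, not part of the runtime) of
// the array layout built by __kmp_override_nested_nth above: slots [0, level]
// are zeroed and the tail is copied from the thread's set_nested_nth list,
// skipping its slot 0. std::vector stands in for the internal allocators.
#if 0
#include <vector>

static std::vector<int> override_nested_nth_sketch(
    const std::vector<int> &set_nested_nth, int level) {
  // new_size = level + set_nested_nth_sz, as in the runtime helper.
  std::vector<int> out(level + set_nested_nth.size(), 0);
  for (std::size_t j = 1; j < set_nested_nth.size(); ++j)
    out[level + j] = set_nested_nth[j]; // copy tail starting at index 1
  return out;
}
#endif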

/* Calculate the identifier of the current thread */
/* fast (and somewhat portable) way to get unique identifier of executing
   thread. Returns KMP_GTID_DNE if we haven't been assigned a gtid. */
int __kmp_get_global_thread_id() {
  int i;
  kmp_info_t **other_threads;
  size_t stack_data;
  char *stack_addr;
  size_t stack_size;
  char *stack_base;

  KA_TRACE(
      1000,
      ("*** __kmp_get_global_thread_id: entering, nproc=%d  all_nproc=%d\n",
       __kmp_nth, __kmp_all_nth));

  /* JPH - to handle the case where __kmpc_end(0) is called immediately prior to
     a parallel region, made it return KMP_GTID_DNE to force serial_initialize
     by caller. Had to handle KMP_GTID_DNE at all call-sites, or else guarantee
     __kmp_init_gtid for this to work. */

  if (!TCR_4(__kmp_init_gtid))
    return KMP_GTID_DNE;

#ifdef KMP_TDATA_GTID
  if (TCR_4(__kmp_gtid_mode) >= 3) {
    KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using TDATA\n"));
    return __kmp_gtid;
  }
#endif
  if (TCR_4(__kmp_gtid_mode) >= 2) {
    KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using keyed TLS\n"));
    return __kmp_gtid_get_specific();
  }
  KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using internal alg.\n"));

  stack_addr = (char *)&stack_data;
  other_threads = __kmp_threads;

  /* ATT: The code below is a source of potential bugs due to unsynchronized
     access to __kmp_threads array. For example:
     1. Current thread loads other_threads[i] to thr and checks it, it is
        non-NULL.
     2. Current thread is suspended by OS.
     3. Another thread unregisters and finishes (debug versions of free()
        may fill memory with something like 0xEF).
     4. Current thread is resumed.
     5. Current thread reads junk from *thr.
     TODO: Fix it.  --ln  */

  for (i = 0; i < __kmp_threads_capacity; i++) {

    kmp_info_t *thr = (kmp_info_t *)TCR_SYNC_PTR(other_threads[i]);
    if (!thr)
      continue;

    stack_size = (size_t)TCR_PTR(thr->th.th_info.ds.ds_stacksize);
    stack_base = (char *)TCR_PTR(thr->th.th_info.ds.ds_stackbase);

    /* stack grows down -- search through all of the active threads */

    if (stack_addr <= stack_base) {
      size_t stack_diff = stack_base - stack_addr;

      if (stack_diff <= stack_size) {
        /* The only way we can be closer than the allocated */
        /* stack size is if we are running on this thread. */
        // __kmp_gtid_get_specific can return a negative value because this
        // function can be called by a thread destructor. However, before the
        // thread destructor is called, the value of the corresponding
        // thread-specific data will be reset to NULL.
        KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() < 0 ||
                         __kmp_gtid_get_specific() == i);
        return i;
      }
    }
  }

  /* get specific to try and determine our gtid */
  KA_TRACE(1000,
           ("*** __kmp_get_global_thread_id: internal alg. failed to find "
            "thread, using TLS\n"));
  i = __kmp_gtid_get_specific();

  /*fprintf( stderr, "=== %d\n", i );  */ /* GROO */

  /* if we haven't been assigned a gtid, then return the code */
  if (i < 0)
    return i;

  // other_threads[i] can be nullptr at this point because the corresponding
  // thread could have already been destructed. This can happen when this
  // function is called from the end-library routine.
  if (!TCR_SYNC_PTR(other_threads[i]))
    return i;

  /* dynamically updated stack window for uber threads to avoid get_specific
     call */
  if (!TCR_4(other_threads[i]->th.th_info.ds.ds_stackgrow)) {
    KMP_FATAL(StackOverflow, i);
  }

  stack_base = (char *)other_threads[i]->th.th_info.ds.ds_stackbase;
  if (stack_addr > stack_base) {
    TCW_PTR(other_threads[i]->th.th_info.ds.ds_stackbase, stack_addr);
    TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize,
            other_threads[i]->th.th_info.ds.ds_stacksize + stack_addr -
                stack_base);
  } else {
    TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize,
            stack_base - stack_addr);
  }

  /* Reprint stack bounds for ubermaster since they have been refined */
  if (__kmp_storage_map) {
    char *stack_end = (char *)other_threads[i]->th.th_info.ds.ds_stackbase;
    char *stack_beg = stack_end - other_threads[i]->th.th_info.ds.ds_stacksize;
    __kmp_print_storage_map_gtid(i, stack_beg, stack_end,
                                 other_threads[i]->th.th_info.ds.ds_stacksize,
                                 "th_%d stack (refinement)", i);
  }
  return i;
}
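
// A minimal standalone sketch (illustration only) of the stack-window test the
// internal algorithm above applies to each registered thread: on a
// downward-growing stack, an address belongs to the thread iff it is at most
// ds_stacksize bytes below ds_stackbase. Names here are hypothetical.
#if 0
#include <cstddef>
#include <cstdio>

static bool addr_in_stack(const char *addr, const char *stack_base,
                          std::size_t stack_size) {
  return addr <= stack_base &&
         static_cast<std::size_t>(stack_base - addr) <= stack_size;
}

int main() {
  char top;                                 // approximates this stack's base
  const std::size_t assumed_size = 1 << 20; // assumed 1 MiB stack window
  char deeper;                              // a local further down the stack
  std::printf("on stack: %d\n", addr_in_stack(&deeper, &top, assumed_size));
  return 0;
}
#endif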

int __kmp_get_global_thread_id_reg() {
  int gtid;

  if (!__kmp_init_serial) {
    gtid = KMP_GTID_DNE;
  } else
#ifdef KMP_TDATA_GTID
      if (TCR_4(__kmp_gtid_mode) >= 3) {
    KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using TDATA\n"));
    gtid = __kmp_gtid;
  } else
#endif
      if (TCR_4(__kmp_gtid_mode) >= 2) {
    KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using keyed TLS\n"));
    gtid = __kmp_gtid_get_specific();
  } else {
    KA_TRACE(1000,
             ("*** __kmp_get_global_thread_id_reg: using internal alg.\n"));
    gtid = __kmp_get_global_thread_id();
  }

  /* we must be a new uber master sibling thread */
  if (gtid == KMP_GTID_DNE) {
    KA_TRACE(10,
             ("__kmp_get_global_thread_id_reg: Encountered new root thread. "
              "Registering a new gtid.\n"));
    __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
    if (!__kmp_init_serial) {
      __kmp_do_serial_initialize();
      gtid = __kmp_gtid_get_specific();
    } else {
      gtid = __kmp_register_root(FALSE);
    }
    __kmp_release_bootstrap_lock(&__kmp_initz_lock);
    /*__kmp_printf( "+++ %d\n", gtid ); */ /* GROO */
  }

  KMP_DEBUG_ASSERT(gtid >= 0);

  return gtid;
}

/* caller must hold forkjoin_lock */
void __kmp_check_stack_overlap(kmp_info_t *th) {
  int f;
  char *stack_beg = NULL;
  char *stack_end = NULL;
  int gtid;

  KA_TRACE(10, ("__kmp_check_stack_overlap: called\n"));
  if (__kmp_storage_map) {
    stack_end = (char *)th->th.th_info.ds.ds_stackbase;
    stack_beg = stack_end - th->th.th_info.ds.ds_stacksize;

    gtid = __kmp_gtid_from_thread(th);

    if (gtid == KMP_GTID_MONITOR) {
      __kmp_print_storage_map_gtid(
          gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize,
          "th_%s stack (%s)", "mon",
          (th->th.th_info.ds.ds_stackgrow) ? "initial" : "actual");
    } else {
      __kmp_print_storage_map_gtid(
          gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize,
          "th_%d stack (%s)", gtid,
          (th->th.th_info.ds.ds_stackgrow) ? "initial" : "actual");
    }
  }

  /* No point in checking ubermaster threads since they use refinement and
   * cannot overlap */
  gtid = __kmp_gtid_from_thread(th);
  if (__kmp_env_checks == TRUE && !KMP_UBER_GTID(gtid)) {
    KA_TRACE(10,
             ("__kmp_check_stack_overlap: performing extensive checking\n"));
    if (stack_beg == NULL) {
      stack_end = (char *)th->th.th_info.ds.ds_stackbase;
      stack_beg = stack_end - th->th.th_info.ds.ds_stacksize;
    }

    for (f = 0; f < __kmp_threads_capacity; f++) {
      kmp_info_t *f_th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[f]);

      if (f_th && f_th != th) {
        char *other_stack_end =
            (char *)TCR_PTR(f_th->th.th_info.ds.ds_stackbase);
        char *other_stack_beg =
            other_stack_end - (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize);
        if ((stack_beg > other_stack_beg && stack_beg < other_stack_end) ||
            (stack_end > other_stack_beg && stack_end < other_stack_end)) {

          /* Print the other stack values before the abort */
          if (__kmp_storage_map)
            __kmp_print_storage_map_gtid(
                -1, other_stack_beg, other_stack_end,
                (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize),
                "th_%d stack (overlapped)", __kmp_gtid_from_thread(f_th));

          __kmp_fatal(KMP_MSG(StackOverlap), KMP_HNT(ChangeStackLimit),
                      __kmp_msg_null);
        }
      }
    }
  }
  KA_TRACE(10, ("__kmp_check_stack_overlap: returning\n"));
}
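
// A minimal sketch (illustration only) of the pairwise interval test behind
// __kmp_check_stack_overlap above: two stacks [beg, end) collide iff each
// begins before the other ends, which is what the endpoint checks detect.
#if 0
static bool stacks_overlap(const char *beg_a, const char *end_a,
                           const char *beg_b, const char *end_b) {
  // Standard half-open interval intersection test.
  return beg_a < end_b && beg_b < end_a;
}
#endif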

/* ------------------------------------------------------------------------ */

void __kmp_infinite_loop(void) {
  static int done = FALSE;

  while (!done) {
    KMP_YIELD(TRUE);
  }
}

#define MAX_MESSAGE 512

void __kmp_print_storage_map_gtid(int gtid, void *p1, void *p2, size_t size,
                                  char const *format, ...) {
  char buffer[MAX_MESSAGE];
  va_list ap;

  va_start(ap, format);
  KMP_SNPRINTF(buffer, sizeof(buffer), "OMP storage map: %p %p%8lu %s\n", p1,
               p2, (unsigned long)size, format);
  __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock);
  __kmp_vprintf(kmp_err, buffer, ap);
#if KMP_PRINT_DATA_PLACEMENT
  int node;
  if (gtid >= 0) {
    if (p1 <= p2 && (char *)p2 - (char *)p1 == size) {
      if (__kmp_storage_map_verbose) {
        node = __kmp_get_host_node(p1);
        if (node < 0) /* doesn't work, so don't try this next time */
          __kmp_storage_map_verbose = FALSE;
        else {
          char *last;
          int lastNode;
          int localProc = __kmp_get_cpu_from_gtid(gtid);

          const int page_size = KMP_GET_PAGE_SIZE();

          p1 = (void *)((size_t)p1 & ~((size_t)page_size - 1));
          p2 = (void *)(((size_t)p2 - 1) & ~((size_t)page_size - 1));
          if (localProc >= 0)
            __kmp_printf_no_lock("  GTID %d localNode %d\n", gtid,
                                 localProc >> 1);
          else
            __kmp_printf_no_lock("  GTID %d\n", gtid);
#if KMP_USE_PRCTL
          /* The more elaborate format is disabled for now because of the prctl
           * hanging bug. */
          do {
            last = p1;
            lastNode = node;
            /* This loop collates adjacent pages with the same host node. */
            do {
              p1 = (char *)p1 + page_size;
            } while (p1 <= p2 && (node = __kmp_get_host_node(p1)) == lastNode);
            __kmp_printf_no_lock("    %p-%p memNode %d\n", last, (char *)p1 - 1,
                                 lastNode);
          } while (p1 <= p2);
#else
          __kmp_printf_no_lock("    %p-%p memNode %d\n", p1,
                               (char *)p1 + (page_size - 1),
                               __kmp_get_host_node(p1));
          if (p1 < p2) {
            __kmp_printf_no_lock("    %p-%p memNode %d\n", p2,
                                 (char *)p2 + (page_size - 1),
                                 __kmp_get_host_node(p2));
          }
#endif
        }
      }
    } else
      __kmp_printf_no_lock("  %s\n", KMP_I18N_STR(StorageMapWarning));
  }
#endif /* KMP_PRINT_DATA_PLACEMENT */
  __kmp_release_bootstrap_lock(&__kmp_stdio_lock);

  va_end(ap);
}
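
// A minimal sketch (illustration only) of the page-alignment arithmetic used
// above: `p & ~(page_size - 1)` rounds an address down to its page boundary,
// which is valid whenever page_size is a power of two.
#if 0
#include <cstdint>

static void *page_floor(void *p, std::uintptr_t page_size) {
  // Clearing the low bits keeps only the page-aligned part of the address.
  return (void *)((std::uintptr_t)p & ~(page_size - 1));
}
#endif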

void __kmp_warn(char const *format, ...) {
  char buffer[MAX_MESSAGE];
  va_list ap;

  if (__kmp_generate_warnings == kmp_warnings_off) {
    return;
  }

  va_start(ap, format);

  KMP_SNPRINTF(buffer, sizeof(buffer), "OMP warning: %s\n", format);
  __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock);
  __kmp_vprintf(kmp_err, buffer, ap);
  __kmp_release_bootstrap_lock(&__kmp_stdio_lock);

  va_end(ap);
}

void __kmp_abort_process() {
  // Later threads may stall here, but that's ok because abort() will kill them.
  __kmp_acquire_bootstrap_lock(&__kmp_exit_lock);

  if (__kmp_debug_buf) {
    __kmp_dump_debug_buffer();
  }

#if KMP_OS_WINDOWS
  // Let other threads know of abnormal termination and prevent deadlock
  // if abort happened during library initialization or shutdown
  __kmp_global.g.g_abort = SIGABRT;

  /* On Windows* OS, by default abort() causes a pop-up error box, which stalls
     nightly testing. Unfortunately, we cannot reliably suppress pop-up error
     boxes. _set_abort_behavior() works well, but this function is not
     available in VS7 (this is not a problem for a DLL, but it is a problem
     for a static OpenMP RTL). SetErrorMode (and so, the timelimit utility)
     does not help, at least in some versions of MS C RTL.

     It seems the following sequence is the only way to simulate abort() and
     avoid the pop-up error box. */
  raise(SIGABRT);
  _exit(3); // Just in case, if signal ignored, exit anyway.
#else
  __kmp_unregister_library();
  abort();
#endif

  __kmp_infinite_loop();
  __kmp_release_bootstrap_lock(&__kmp_exit_lock);

} // __kmp_abort_process

void __kmp_abort_thread(void) {
  // TODO: Eliminate g_abort global variable and this function.
  // In case of abort just call abort(), it will kill all the threads.
  __kmp_infinite_loop();
} // __kmp_abort_thread

/* Print out the storage map for the major kmp_info_t thread data structures
   that are allocated together. */

static void __kmp_print_thread_storage_map(kmp_info_t *thr, int gtid) {
  __kmp_print_storage_map_gtid(gtid, thr, thr + 1, sizeof(kmp_info_t), "th_%d",
                               gtid);

  __kmp_print_storage_map_gtid(gtid, &thr->th.th_info, &thr->th.th_team,
                               sizeof(kmp_desc_t), "th_%d.th_info", gtid);

  __kmp_print_storage_map_gtid(gtid, &thr->th.th_local, &thr->th.th_pri_head,
                               sizeof(kmp_local_t), "th_%d.th_local", gtid);

  __kmp_print_storage_map_gtid(
      gtid, &thr->th.th_bar[0], &thr->th.th_bar[bs_last_barrier],
      sizeof(kmp_balign_t) * bs_last_barrier, "th_%d.th_bar", gtid);

  __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_plain_barrier],
                               &thr->th.th_bar[bs_plain_barrier + 1],
                               sizeof(kmp_balign_t), "th_%d.th_bar[plain]",
                               gtid);

  __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_forkjoin_barrier],
                               &thr->th.th_bar[bs_forkjoin_barrier + 1],
                               sizeof(kmp_balign_t), "th_%d.th_bar[forkjoin]",
                               gtid);

#if KMP_FAST_REDUCTION_BARRIER
  __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_reduction_barrier],
                               &thr->th.th_bar[bs_reduction_barrier + 1],
                               sizeof(kmp_balign_t), "th_%d.th_bar[reduction]",
                               gtid);
#endif // KMP_FAST_REDUCTION_BARRIER
}

/* Print out the storage map for the major kmp_team_t team data structures
   that are allocated together. */

static void __kmp_print_team_storage_map(const char *header, kmp_team_t *team,
                                         int team_id, int num_thr) {
  int num_disp_buff = team->t.t_max_nproc > 1 ? __kmp_dispatch_num_buffers : 2;
  __kmp_print_storage_map_gtid(-1, team, team + 1, sizeof(kmp_team_t), "%s_%d",
                               header, team_id);

  __kmp_print_storage_map_gtid(-1, &team->t.t_bar[0],
                               &team->t.t_bar[bs_last_barrier],
                               sizeof(kmp_balign_team_t) * bs_last_barrier,
                               "%s_%d.t_bar", header, team_id);

  __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_plain_barrier],
                               &team->t.t_bar[bs_plain_barrier + 1],
                               sizeof(kmp_balign_team_t), "%s_%d.t_bar[plain]",
                               header, team_id);

  __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_forkjoin_barrier],
                               &team->t.t_bar[bs_forkjoin_barrier + 1],
                               sizeof(kmp_balign_team_t),
                               "%s_%d.t_bar[forkjoin]", header, team_id);

#if KMP_FAST_REDUCTION_BARRIER
  __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_reduction_barrier],
                               &team->t.t_bar[bs_reduction_barrier + 1],
                               sizeof(kmp_balign_team_t),
                               "%s_%d.t_bar[reduction]", header, team_id);
#endif // KMP_FAST_REDUCTION_BARRIER

  __kmp_print_storage_map_gtid(
      -1, &team->t.t_dispatch[0], &team->t.t_dispatch[num_thr],
      sizeof(kmp_disp_t) * num_thr, "%s_%d.t_dispatch", header, team_id);

  __kmp_print_storage_map_gtid(
      -1, &team->t.t_threads[0], &team->t.t_threads[num_thr],
      sizeof(kmp_info_t *) * num_thr, "%s_%d.t_threads", header, team_id);

  __kmp_print_storage_map_gtid(-1, &team->t.t_disp_buffer[0],
                               &team->t.t_disp_buffer[num_disp_buff],
                               sizeof(dispatch_shared_info_t) * num_disp_buff,
                               "%s_%d.t_disp_buffer", header, team_id);
}

static void __kmp_init_allocator() {
  __kmp_init_memkind();
  __kmp_init_target_mem();
}
static void __kmp_fini_allocator() {
  __kmp_fini_target_mem();
  __kmp_fini_memkind();
}

/* ------------------------------------------------------------------------ */

#if ENABLE_LIBOMPTARGET
static void __kmp_init_omptarget() {
  __kmp_init_target_task();
}
#endif

/* ------------------------------------------------------------------------ */

#if KMP_DYNAMIC_LIB
#if KMP_OS_WINDOWS

BOOL WINAPI DllMain(HINSTANCE hInstDLL, DWORD fdwReason, LPVOID lpReserved) {
  //__kmp_acquire_bootstrap_lock( &__kmp_initz_lock );

  switch (fdwReason) {

  case DLL_PROCESS_ATTACH:
    KA_TRACE(10, ("DllMain: PROCESS_ATTACH\n"));

    return TRUE;

  case DLL_PROCESS_DETACH:
    KA_TRACE(10, ("DllMain: PROCESS_DETACH T#%d\n", __kmp_gtid_get_specific()));

    // According to Windows* documentation for DllMain entry point:
    // for DLL_PROCESS_DETACH, lpReserved is used for telling the difference:
    //   lpReserved == NULL when FreeLibrary() is called,
    //   lpReserved != NULL when the process is terminated.
    // When FreeLibrary() is called, worker threads remain alive. So the
    // runtime's state is consistent and executing proper shutdown is OK.
    // When the process is terminated, worker threads have exited or been
    // forcefully terminated by the OS and only the shutdown thread remains.
    // This can leave the runtime in an inconsistent state.
    // Hence, only attempt proper cleanup when FreeLibrary() is called.
    // Otherwise, rely on OS to reclaim resources.
    if (lpReserved == NULL)
      __kmp_internal_end_library(__kmp_gtid_get_specific());

    return TRUE;

  case DLL_THREAD_ATTACH:
    KA_TRACE(10, ("DllMain: THREAD_ATTACH\n"));

    /* if we want to register new siblings all the time here call
     * __kmp_get_gtid(); */
    return TRUE;

  case DLL_THREAD_DETACH:
    KA_TRACE(10, ("DllMain: THREAD_DETACH T#%d\n", __kmp_gtid_get_specific()));

    __kmp_internal_end_thread(__kmp_gtid_get_specific());
    return TRUE;
  }

  return TRUE;
}

#endif /* KMP_OS_WINDOWS */
#endif /* KMP_DYNAMIC_LIB */

/* __kmp_parallel_deo -- Wait until it's our turn. */
void __kmp_parallel_deo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
  int gtid = *gtid_ref;
#ifdef BUILD_PARALLEL_ORDERED
  kmp_team_t *team = __kmp_team_from_gtid(gtid);
#endif /* BUILD_PARALLEL_ORDERED */

  if (__kmp_env_consistency_check) {
    if (__kmp_threads[gtid]->th.th_root->r.r_active)
#if KMP_USE_DYNAMIC_LOCK
      __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL, 0);
#else
      __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL);
#endif
  }
#ifdef BUILD_PARALLEL_ORDERED
  if (!team->t.t_serialized) {
    KMP_MB();
    KMP_WAIT(&team->t.t_ordered.dt.t_value, __kmp_tid_from_gtid(gtid), KMP_EQ,
             NULL);
    KMP_MB();
  }
#endif /* BUILD_PARALLEL_ORDERED */
}

/* __kmp_parallel_dxo -- Signal the next task. */
void __kmp_parallel_dxo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
  int gtid = *gtid_ref;
#ifdef BUILD_PARALLEL_ORDERED
  int tid = __kmp_tid_from_gtid(gtid);
  kmp_team_t *team = __kmp_team_from_gtid(gtid);
#endif /* BUILD_PARALLEL_ORDERED */

  if (__kmp_env_consistency_check) {
    if (__kmp_threads[gtid]->th.th_root->r.r_active)
      __kmp_pop_sync(gtid, ct_ordered_in_parallel, loc_ref);
  }
#ifdef BUILD_PARALLEL_ORDERED
  if (!team->t.t_serialized) {
    KMP_MB(); /* Flush all pending memory write invalidates.  */

    /* use the tid of the next thread in this team */
    /* TODO replace with general release procedure */
    team->t.t_ordered.dt.t_value = ((tid + 1) % team->t.t_nproc);

    KMP_MB(); /* Flush all pending memory write invalidates.  */
  }
#endif /* BUILD_PARALLEL_ORDERED */
}
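
// A minimal standalone sketch (illustration only) of the turn-passing protocol
// implemented by __kmp_parallel_deo/__kmp_parallel_dxo above: each thread
// spins until the shared counter equals its tid, runs its ordered work, then
// hands the turn to (tid + 1) % nproc. Names here are hypothetical.
#if 0
#include <atomic>

static std::atomic<int> turn{0};

static void ordered_section(int tid, int nproc, void (*work)(int)) {
  while (turn.load(std::memory_order_acquire) != tid) {
    // spin until it is our turn (the runtime uses KMP_WAIT instead)
  }
  work(tid); // body of the ordered construct
  turn.store((tid + 1) % nproc, std::memory_order_release);
}
#endif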

/* ------------------------------------------------------------------------ */
/* The BARRIER for a SINGLE process section is always explicit */

int __kmp_enter_single(int gtid, ident_t *id_ref, int push_ws) {
  int status;
  kmp_info_t *th;
  kmp_team_t *team;

  if (!TCR_4(__kmp_init_parallel))
    __kmp_parallel_initialize();
  __kmp_resume_if_soft_paused();

  th = __kmp_threads[gtid];
  team = th->th.th_team;
  status = 0;

  th->th.th_ident = id_ref;

  if (team->t.t_serialized) {
    status = 1;
  } else {
    kmp_int32 old_this = th->th.th_local.this_construct;

    ++th->th.th_local.this_construct;
    /* try to set team count to thread count--success means thread got the
       single block */
    /* TODO: Should this be acquire or release? */
    if (team->t.t_construct == old_this) {
      status = __kmp_atomic_compare_store_acq(&team->t.t_construct, old_this,
                                              th->th.th_local.this_construct);
    }
#if USE_ITT_BUILD
    if (__itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 &&
        KMP_MASTER_GTID(gtid) && th->th.th_teams_microtask == NULL &&
        team->t.t_active_level == 1) {
      // Only report metadata by primary thread of active team at level 1
      __kmp_itt_metadata_single(id_ref);
    }
#endif /* USE_ITT_BUILD */
  }

  if (__kmp_env_consistency_check) {
    if (status && push_ws) {
      __kmp_push_workshare(gtid, ct_psingle, id_ref);
    } else {
      __kmp_check_workshare(gtid, ct_psingle, id_ref);
    }
  }
#if USE_ITT_BUILD
  if (status) {
    __kmp_itt_single_start(gtid);
  }
#endif /* USE_ITT_BUILD */
  return status;
}
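
// A minimal sketch (illustration only) of the single-winner election used in
// __kmp_enter_single above: every thread advances its private construct count,
// and the one thread whose compare-and-swap moves the shared team counter
// forward executes the single block.
#if 0
#include <atomic>

static bool enter_single_sketch(std::atomic<int> &team_construct,
                                int &my_construct) {
  int old_val = my_construct++;
  int expected = old_val;
  // Exactly one thread per construct instance wins this exchange.
  return team_construct.compare_exchange_strong(expected, my_construct,
                                                std::memory_order_acquire);
}
#endif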

void __kmp_exit_single(int gtid) {
#if USE_ITT_BUILD
  __kmp_itt_single_end(gtid);
#endif /* USE_ITT_BUILD */
  if (__kmp_env_consistency_check)
    __kmp_pop_workshare(gtid, ct_psingle, NULL);
}

/* determine if we can go parallel or must use a serialized parallel region and
 * how many threads we can use
 * set_nthreads is the number of threads requested for the team
 * returns 1 if we should serialize or only use one thread,
 * otherwise the number of threads to use
 * The forkjoin lock is held by the caller. */
static int __kmp_reserve_threads(kmp_root_t *root, kmp_team_t *parent_team,
                                 int master_tid, int set_nthreads,
                                 int enter_teams) {
  int capacity;
  int new_nthreads;
  KMP_DEBUG_ASSERT(__kmp_init_serial);
  KMP_DEBUG_ASSERT(root && parent_team);
  kmp_info_t *this_thr = parent_team->t.t_threads[master_tid];

  // If dyn-var is set, dynamically adjust the number of desired threads,
  // according to the method specified by dynamic_mode.
  new_nthreads = set_nthreads;
  if (!get__dynamic_2(parent_team, master_tid)) {
    ;
  }
#ifdef USE_LOAD_BALANCE
  else if (__kmp_global.g.g_dynamic_mode == dynamic_load_balance) {
    new_nthreads = __kmp_load_balance_nproc(root, set_nthreads);
    if (new_nthreads == 1) {
      KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced "
                    "reservation to 1 thread\n",
                    master_tid));
      return 1;
    }
    if (new_nthreads < set_nthreads) {
      KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced "
                    "reservation to %d threads\n",
                    master_tid, new_nthreads));
    }
  }
#endif /* USE_LOAD_BALANCE */
  else if (__kmp_global.g.g_dynamic_mode == dynamic_thread_limit) {
    new_nthreads = __kmp_avail_proc - __kmp_nth +
                   (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
    if (new_nthreads <= 1) {
      KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced "
                    "reservation to 1 thread\n",
                    master_tid));
      return 1;
    }
    if (new_nthreads < set_nthreads) {
      KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced "
                    "reservation to %d threads\n",
                    master_tid, new_nthreads));
    } else {
      new_nthreads = set_nthreads;
    }
  } else if (__kmp_global.g.g_dynamic_mode == dynamic_random) {
    if (set_nthreads > 2) {
      new_nthreads = __kmp_get_random(parent_team->t.t_threads[master_tid]);
      new_nthreads = (new_nthreads % set_nthreads) + 1;
      if (new_nthreads == 1) {
        KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced "
                      "reservation to 1 thread\n",
                      master_tid));
        return 1;
      }
      if (new_nthreads < set_nthreads) {
        KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced "
                      "reservation to %d threads\n",
                      master_tid, new_nthreads));
      }
    }
  } else {
    KMP_ASSERT(0);
  }

  // Respect KMP_ALL_THREADS/KMP_DEVICE_THREAD_LIMIT.
  if (__kmp_nth + new_nthreads -
          (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
      __kmp_max_nth) {
    int tl_nthreads = __kmp_max_nth - __kmp_nth +
                      (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
    if (tl_nthreads <= 0) {
      tl_nthreads = 1;
    }

    // If dyn-var is false, emit a 1-time warning.
    if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
      __kmp_reserve_warn = 1;
      __kmp_msg(kmp_ms_warning,
                KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads),
                KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
    }
    if (tl_nthreads == 1) {
      KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT "
                    "reduced reservation to 1 thread\n",
                    master_tid));
      return 1;
    }
    KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT reduced "
                  "reservation to %d threads\n",
                  master_tid, tl_nthreads));
    new_nthreads = tl_nthreads;
  }

  // Respect OMP_THREAD_LIMIT
  int cg_nthreads = this_thr->th.th_cg_roots->cg_nthreads;
  int max_cg_threads = this_thr->th.th_cg_roots->cg_thread_limit;
  if (cg_nthreads + new_nthreads -
          (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
      max_cg_threads) {
    int tl_nthreads = max_cg_threads - cg_nthreads +
                      (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
    if (tl_nthreads <= 0) {
      tl_nthreads = 1;
    }

    // If dyn-var is false, emit a 1-time warning.
    if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
      __kmp_reserve_warn = 1;
      __kmp_msg(kmp_ms_warning,
                KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads),
                KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
    }
    if (tl_nthreads == 1) {
      KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT "
                    "reduced reservation to 1 thread\n",
                    master_tid));
      return 1;
    }
    KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT reduced "
                  "reservation to %d threads\n",
                  master_tid, tl_nthreads));
    new_nthreads = tl_nthreads;
  }

  // Check if the threads array is large enough, or needs expanding.
  // See comment in __kmp_register_root() about the adjustment if
  // __kmp_threads[0] == NULL.
  capacity = __kmp_threads_capacity;
  if (TCR_PTR(__kmp_threads[0]) == NULL) {
    --capacity;
  }
  // If it is not for initializing the hidden helper team, we need to take
  // __kmp_hidden_helper_threads_num out of the capacity because it is included
  // in __kmp_threads_capacity.
  if (__kmp_enable_hidden_helper && !TCR_4(__kmp_init_hidden_helper_threads)) {
    capacity -= __kmp_hidden_helper_threads_num;
  }
  if (__kmp_nth + new_nthreads -
          (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
      capacity) {
    // Expand the threads array.
    int slotsRequired = __kmp_nth + new_nthreads -
                        (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) -
                        capacity;
    int slotsAdded = __kmp_expand_threads(slotsRequired);
    if (slotsAdded < slotsRequired) {
      // The threads array was not expanded enough.
      new_nthreads -= (slotsRequired - slotsAdded);
      KMP_ASSERT(new_nthreads >= 1);

      // If dyn-var is false, emit a 1-time warning.
      if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
        __kmp_reserve_warn = 1;
        if (__kmp_tp_cached) {
          __kmp_msg(kmp_ms_warning,
                    KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads),
                    KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity),
                    KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null);
        } else {
          __kmp_msg(kmp_ms_warning,
                    KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads),
                    KMP_HNT(SystemLimitOnThreads), __kmp_msg_null);
        }
      }
    }
  }

#ifdef KMP_DEBUG
  if (new_nthreads == 1) {
    KC_TRACE(10,
             ("__kmp_reserve_threads: T#%d serializing team after reclaiming "
              "dead roots and rechecking; requested %d threads\n",
              __kmp_get_gtid(), set_nthreads));
  } else {
    KC_TRACE(10, ("__kmp_reserve_threads: T#%d allocating %d threads; requested"
                  " %d threads\n",
                  __kmp_get_gtid(), new_nthreads, set_nthreads));
  }
#endif // KMP_DEBUG

  if (this_thr->th.th_nt_strict && new_nthreads < set_nthreads) {
    __kmpc_error(this_thr->th.th_nt_loc, this_thr->th.th_nt_sev,
                 this_thr->th.th_nt_msg);
  }
  return new_nthreads;
}
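
// A minimal sketch (illustration only) of the limit arithmetic applied
// repeatedly above: with a thread limit, a current running count, and a
// master (or hot team) already included in that count, the reservation is
// clamped to limit - current + already_counted, and never below 1.
#if 0
static int clamp_to_limit(int requested, int limit, int current,
                          int already_counted) {
  if (current + requested - already_counted <= limit)
    return requested; // fits under the limit unchanged
  int allowed = limit - current + already_counted;
  return allowed > 0 ? allowed : 1;
}
#endif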

/* Allocate threads from the thread pool and assign them to the new team. We
   are assured that there are enough threads available, because we checked on
   that earlier while holding the forkjoin lock. */
static void __kmp_fork_team_threads(kmp_root_t *root, kmp_team_t *team,
                                    kmp_info_t *master_th, int master_gtid,
                                    int fork_teams_workers) {
  int i;
  int use_hot_team;

  KA_TRACE(10, ("__kmp_fork_team_threads: new_nprocs = %d\n", team->t.t_nproc));
  KMP_DEBUG_ASSERT(master_gtid == __kmp_get_gtid());
  KMP_MB();

  /* first, let's setup the primary thread */
  master_th->th.th_info.ds.ds_tid = 0;
  master_th->th.th_team = team;
  master_th->th.th_team_nproc = team->t.t_nproc;
  master_th->th.th_team_master = master_th;
  master_th->th.th_team_serialized = FALSE;
  master_th->th.th_dispatch = &team->t.t_dispatch[0];

/* make sure we are not the optimized hot team */
#if KMP_NESTED_HOT_TEAMS
  use_hot_team = 0;
  kmp_hot_team_ptr_t *hot_teams = master_th->th.th_hot_teams;
  if (hot_teams) { // hot teams array is not allocated if
    // KMP_HOT_TEAMS_MAX_LEVEL=0
    int level = team->t.t_active_level - 1; // index in array of hot teams
    if (master_th->th.th_teams_microtask) { // are we inside the teams?
      if (master_th->th.th_teams_size.nteams > 1) {
        ++level; // level was not increased in teams construct for
        // team_of_masters
      }
      if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
          master_th->th.th_teams_level == team->t.t_level) {
        ++level; // level was not increased in teams construct for
        // team_of_workers before the parallel
      } // team->t.t_level will be increased inside parallel
    }
    if (level < __kmp_hot_teams_max_level) {
      if (hot_teams[level].hot_team) {
        // hot team has already been allocated for given level
        KMP_DEBUG_ASSERT(hot_teams[level].hot_team == team);
        use_hot_team = 1; // the team is ready to use
      } else {
        use_hot_team = 0; // AC: threads are not allocated yet
        hot_teams[level].hot_team = team; // remember new hot team
        hot_teams[level].hot_team_nth = team->t.t_nproc;
      }
    } else {
      use_hot_team = 0;
    }
  }
#else
  use_hot_team = team == root->r.r_hot_team;
#endif
  if (!use_hot_team) {

    /* install the primary thread */
    team->t.t_threads[0] = master_th;
    __kmp_initialize_info(master_th, team, 0, master_gtid);

    /* now, install the worker threads */
    for (i = 1; i < team->t.t_nproc; i++) {

      /* fork or reallocate a new thread and install it in team */
      kmp_info_t *thr = __kmp_allocate_thread(root, team, i);
      team->t.t_threads[i] = thr;
      KMP_DEBUG_ASSERT(thr);
      KMP_DEBUG_ASSERT(thr->th.th_team == team);
      /* align team and thread arrived states */
      KA_TRACE(20, ("__kmp_fork_team_threads: T#%d(%d:%d) init arrived "
                    "T#%d(%d:%d) join =%llu, plain=%llu\n",
                    __kmp_gtid_from_tid(0, team), team->t.t_id, 0,
                    __kmp_gtid_from_tid(i, team), team->t.t_id, i,
                    team->t.t_bar[bs_forkjoin_barrier].b_arrived,
                    team->t.t_bar[bs_plain_barrier].b_arrived));
      thr->th.th_teams_microtask = master_th->th.th_teams_microtask;
      thr->th.th_teams_level = master_th->th.th_teams_level;
      thr->th.th_teams_size = master_th->th.th_teams_size;
      { // Initialize threads' barrier data.
        int b;
        kmp_balign_t *balign = team->t.t_threads[i]->th.th_bar;
        for (b = 0; b < bs_last_barrier; ++b) {
          balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
          KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
#if USE_DEBUGGER
          balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
#endif
        }
      }
    }

#if KMP_AFFINITY_SUPPORTED
    // Do not partition the places list for teams construct workers who
    // haven't actually been forked to do real work yet. This partitioning
    // will take place in the parallel region nested within the teams construct.
    if (!fork_teams_workers) {
      __kmp_partition_places(team);
    }
#endif

    if (team->t.t_nproc > 1 &&
        __kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
      team->t.b->update_num_threads(team->t.t_nproc);
      __kmp_add_threads_to_team(team, team->t.t_nproc);
    }
  }

  // Take care of primary thread's task state
  if (__kmp_tasking_mode != tskm_immediate_exec) {
    if (use_hot_team) {
      KMP_DEBUG_ASSERT_TASKTEAM_INVARIANT(team->t.t_parent, master_th);
      KA_TRACE(
          20,
          ("__kmp_fork_team_threads: Primary T#%d pushing task_team %p / team "
           "%p, new task_team %p / team %p\n",
           __kmp_gtid_from_thread(master_th), master_th->th.th_task_team,
           team->t.t_parent, team->t.t_task_team[master_th->th.th_task_state],
           team));

      // Store primary thread's current task state on new team
      KMP_CHECK_UPDATE(team->t.t_primary_task_state,
                       master_th->th.th_task_state);

      // Restore primary thread's task state to hot team's state
      // by using thread 1's task state
      if (team->t.t_nproc > 1) {
        KMP_DEBUG_ASSERT(team->t.t_threads[1]->th.th_task_state == 0 ||
                         team->t.t_threads[1]->th.th_task_state == 1);
        KMP_CHECK_UPDATE(master_th->th.th_task_state,
                         team->t.t_threads[1]->th.th_task_state);
      } else {
        master_th->th.th_task_state = 0;
      }
    } else {
      // Store primary thread's current task_state on new team
      KMP_CHECK_UPDATE(team->t.t_primary_task_state,
                       master_th->th.th_task_state);
      // Are not using hot team, so set task state to 0.
      master_th->th.th_task_state = 0;
    }
  }

  if (__kmp_display_affinity && team->t.t_display_affinity != 1) {
    for (i = 0; i < team->t.t_nproc; i++) {
      kmp_info_t *thr = team->t.t_threads[i];
      if (thr->th.th_prev_num_threads != team->t.t_nproc ||
          thr->th.th_prev_level != team->t.t_level) {
        team->t.t_display_affinity = 1;
        break;
      }
    }
  }

  KMP_MB();
}

#if KMP_ARCH_X86 || KMP_ARCH_X86_64
// Propagate any changes to the floating point control registers out to the
// team. We try to avoid unnecessary writes to the relevant cache line in the
// team structure, so we don't make changes unless they are needed.
inline static void propagateFPControl(kmp_team_t *team) {
  if (__kmp_inherit_fp_control) {
    kmp_int16 x87_fpu_control_word;
    kmp_uint32 mxcsr;

    // Get primary thread's values of FPU control flags (both X87 and vector)
    __kmp_store_x87_fpu_control_word(&x87_fpu_control_word);
    __kmp_store_mxcsr(&mxcsr);
    mxcsr &= KMP_X86_MXCSR_MASK;

    // There is no point looking at t_fp_control_saved here.
    // If it is TRUE, we still have to update the values if they are different
    // from those we now have. If it is FALSE we didn't save anything yet, but
    // our objective is the same. We have to ensure that the values in the team
    // are the same as those we have.
    // So, this code achieves what we need whether or not t_fp_control_saved is
    // true. By checking whether the value needs updating we avoid unnecessary
    // writes that would put the cache-line into a written state, causing all
    // threads in the team to have to read it again.
    KMP_CHECK_UPDATE(team->t.t_x87_fpu_control_word, x87_fpu_control_word);
    KMP_CHECK_UPDATE(team->t.t_mxcsr, mxcsr);
    // Although we don't use this value, other code in the runtime wants to know
    // whether it should restore them. So we must ensure it is correct.
    KMP_CHECK_UPDATE(team->t.t_fp_control_saved, TRUE);
  } else {
    // Similarly here. Don't write to this cache-line in the team structure
    // unless we have to.
    KMP_CHECK_UPDATE(team->t.t_fp_control_saved, FALSE);
  }
}

// Do the opposite, setting the hardware registers to the updated values from
// the team.
inline static void updateHWFPControl(kmp_team_t *team) {
  if (__kmp_inherit_fp_control && team->t.t_fp_control_saved) {
    // Only reset the fp control regs if they have been changed in the team
    // structure by the parallel region that we are exiting.
    kmp_int16 x87_fpu_control_word;
    kmp_uint32 mxcsr;
    __kmp_store_x87_fpu_control_word(&x87_fpu_control_word);
    __kmp_store_mxcsr(&mxcsr);
    mxcsr &= KMP_X86_MXCSR_MASK;

    if (team->t.t_x87_fpu_control_word != x87_fpu_control_word) {
      __kmp_clear_x87_fpu_status_word();
      __kmp_load_x87_fpu_control_word(&team->t.t_x87_fpu_control_word);
    }

    if (team->t.t_mxcsr != mxcsr) {
      __kmp_load_mxcsr(&team->t.t_mxcsr);
    }
  }
}
#else
#define propagateFPControl(x) ((void)0)
#define updateHWFPControl(x) ((void)0)
#endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
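
// A minimal sketch (illustration only) of the check-before-write idiom that
// KMP_CHECK_UPDATE applies above: skipping stores that would not change the
// value keeps the team's cache line clean, so other threads reading it are
// not forced to refetch it.
#if 0
template <typename T> static void check_update(T &dst, const T &val) {
  if (dst != val) // only dirty the cache line when the value really changes
    dst = val;
}
#endif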

static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team,
                                     int realloc); // forward declaration

/* Run a parallel region that has been serialized, so runs only in a team of the
   single primary thread. */
void __kmp_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
  kmp_info_t *this_thr;
  kmp_team_t *serial_team;

  KC_TRACE(10, ("__kmpc_serialized_parallel: called by T#%d\n", global_tid));

  /* Skip all this code for autopar serialized loops since it results in
     unacceptable overhead */
  if (loc != NULL && (loc->flags & KMP_IDENT_AUTOPAR))
    return;

  if (!TCR_4(__kmp_init_parallel))
    __kmp_parallel_initialize();
  __kmp_resume_if_soft_paused();

  this_thr = __kmp_threads[global_tid];
  serial_team = this_thr->th.th_serial_team;

  /* utilize the serialized team held by this thread */
  KMP_DEBUG_ASSERT(serial_team);
  KMP_MB();

  kmp_proc_bind_t proc_bind = this_thr->th.th_set_proc_bind;
  if (this_thr->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
    proc_bind = proc_bind_false;
  } else if (proc_bind == proc_bind_default) {
    // No proc_bind clause was specified, so use the current value
    // of proc-bind-var for this parallel region.
    proc_bind = this_thr->th.th_current_task->td_icvs.proc_bind;
  }
  // Reset for next parallel region
  this_thr->th.th_set_proc_bind = proc_bind_default;

  // OpenMP 6.0 12.1.2 requires the num_threads 'strict' modifier to also have
  // effect when parallel execution is disabled by a corresponding if clause
  // attached to the parallel directive.
  if (this_thr->th.th_nt_strict && this_thr->th.th_set_nproc > 1)
    __kmpc_error(this_thr->th.th_nt_loc, this_thr->th.th_nt_sev,
                 this_thr->th.th_nt_msg);
  // Reset num_threads for next parallel region
  this_thr->th.th_set_nproc = 0;

#if OMPT_SUPPORT
  ompt_data_t ompt_parallel_data = ompt_data_none;
  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(global_tid);
  if (ompt_enabled.enabled &&
      this_thr->th.ompt_thread_info.state != ompt_state_overhead) {

    ompt_task_info_t *parent_task_info;
    parent_task_info = OMPT_CUR_TASK_INFO(this_thr);

    parent_task_info->frame.enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
    if (ompt_enabled.ompt_callback_parallel_begin) {
      int team_size = 1;

      ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)(
          &(parent_task_info->task_data), &(parent_task_info->frame),
          &ompt_parallel_data, team_size,
          ompt_parallel_invoker_program | ompt_parallel_team, codeptr);
    }
  }
#endif // OMPT_SUPPORT

  if (this_thr->th.th_team != serial_team) {
    // Nested level will be an index in the nested nthreads array
    int level = this_thr->th.th_team->t.t_level;

    if (serial_team->t.t_serialized) {
      /* this serial team was already used
         TODO increase performance by making these locks more specific */
1254       kmp_team_t *new_team;
1255 
1256       __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
1257 
1258       new_team =
1259           __kmp_allocate_team(this_thr->th.th_root, 1, 1,
1260 #if OMPT_SUPPORT
1261                               ompt_parallel_data,
1262 #endif
1263                               proc_bind, &this_thr->th.th_current_task->td_icvs,
1264                               0 USE_NESTED_HOT_ARG(NULL));
1265       __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
1266       KMP_ASSERT(new_team);
1267 
1268       /* setup new serialized team and install it */
1269       new_team->t.t_threads[0] = this_thr;
1270       new_team->t.t_parent = this_thr->th.th_team;
1271       serial_team = new_team;
1272       this_thr->th.th_serial_team = serial_team;
1273 
1274       KF_TRACE(
1275           10,
1276           ("__kmpc_serialized_parallel: T#%d allocated new serial team %p\n",
1277            global_tid, serial_team));
1278 
1279       /* TODO the above breaks the requirement that if we run out of resources,
1280          then we can still guarantee that serialized teams are ok, since we may
1281          need to allocate a new one */
1282     } else {
1283       KF_TRACE(
1284           10,
1285           ("__kmpc_serialized_parallel: T#%d reusing cached serial team %p\n",
1286            global_tid, serial_team));
1287     }
1288 
1289     /* we have to initialize this serial team */
1290     KMP_DEBUG_ASSERT(serial_team->t.t_threads);
1291     KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
1292     KMP_DEBUG_ASSERT(this_thr->th.th_team != serial_team);
1293     serial_team->t.t_ident = loc;
1294     serial_team->t.t_serialized = 1;
1295     serial_team->t.t_nproc = 1;
1296     serial_team->t.t_parent = this_thr->th.th_team;
1297     if (this_thr->th.th_team->t.t_nested_nth)
1298       serial_team->t.t_nested_nth = this_thr->th.th_team->t.t_nested_nth;
1299     else
1300       serial_team->t.t_nested_nth = &__kmp_nested_nth;
1301     // Save previous team's task state on serial team structure
1302     serial_team->t.t_primary_task_state = this_thr->th.th_task_state;
1303     serial_team->t.t_sched.sched = this_thr->th.th_team->t.t_sched.sched;
1304     this_thr->th.th_team = serial_team;
1305     serial_team->t.t_master_tid = this_thr->th.th_info.ds.ds_tid;
1306 
1307     KF_TRACE(10, ("__kmpc_serialized_parallel: T#%d curtask=%p\n", global_tid,
1308                   this_thr->th.th_current_task));
1309     KMP_ASSERT(this_thr->th.th_current_task->td_flags.executing == 1);
1310     this_thr->th.th_current_task->td_flags.executing = 0;
1311 
1312     __kmp_push_current_task_to_thread(this_thr, serial_team, 0);
1313 
1314     /* TODO: GEH: do ICVs work for nested serialized teams? Don't we need an
1315        implicit task for each serialized task represented by
1316        team->t.t_serialized? */
1317     copy_icvs(&this_thr->th.th_current_task->td_icvs,
1318               &this_thr->th.th_current_task->td_parent->td_icvs);
1319 
1320     // Thread value exists in the nested nthreads array for the next nested
1321     // level
1322     kmp_nested_nthreads_t *nested_nth = &__kmp_nested_nth;
1323     if (this_thr->th.th_team->t.t_nested_nth)
1324       nested_nth = this_thr->th.th_team->t.t_nested_nth;
1325     if (nested_nth->used && (level + 1 < nested_nth->used)) {
1326       this_thr->th.th_current_task->td_icvs.nproc = nested_nth->nth[level + 1];
1327     }
1328 
1329     if (__kmp_nested_proc_bind.used &&
1330         (level + 1 < __kmp_nested_proc_bind.used)) {
1331       this_thr->th.th_current_task->td_icvs.proc_bind =
1332           __kmp_nested_proc_bind.bind_types[level + 1];
1333     }
1334 
1335 #if USE_DEBUGGER
1336     serial_team->t.t_pkfn = (microtask_t)(~0); // For the debugger.
1337 #endif
1338     this_thr->th.th_info.ds.ds_tid = 0;
1339 
1340     /* set thread cache values */
1341     this_thr->th.th_team_nproc = 1;
1342     this_thr->th.th_team_master = this_thr;
1343     this_thr->th.th_team_serialized = 1;
1344     this_thr->th.th_task_team = NULL;
1345     this_thr->th.th_task_state = 0;
1346 
1347     serial_team->t.t_level = serial_team->t.t_parent->t.t_level + 1;
1348     serial_team->t.t_active_level = serial_team->t.t_parent->t.t_active_level;
1349     serial_team->t.t_def_allocator = this_thr->th.th_def_allocator; // save
1350 
1351     propagateFPControl(serial_team);
1352 
1353     /* check if we need to allocate dispatch buffers stack */
1354     KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
1355     if (!serial_team->t.t_dispatch->th_disp_buffer) {
1356       serial_team->t.t_dispatch->th_disp_buffer =
1357           (dispatch_private_info_t *)__kmp_allocate(
1358               sizeof(dispatch_private_info_t));
1359     }
1360     this_thr->th.th_dispatch = serial_team->t.t_dispatch;
1361 
1362     KMP_MB();
1363 
1364   } else {
1365     /* this serialized team is already being used,
1366      * that's fine, just add another nested level */
1367     KMP_DEBUG_ASSERT(this_thr->th.th_team == serial_team);
1368     KMP_DEBUG_ASSERT(serial_team->t.t_threads);
1369     KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
1370     ++serial_team->t.t_serialized;
1371     this_thr->th.th_team_serialized = serial_team->t.t_serialized;
1372 
1373     // Nested level will be an index in the nested nthreads array
1374     int level = this_thr->th.th_team->t.t_level;
1375     // Thread value exists in the nested nthreads array for the next nested
1376     // level
1377 
1378     kmp_nested_nthreads_t *nested_nth = &__kmp_nested_nth;
1379     if (serial_team->t.t_nested_nth)
1380       nested_nth = serial_team->t.t_nested_nth;
1381     if (nested_nth->used && (level + 1 < nested_nth->used)) {
1382       this_thr->th.th_current_task->td_icvs.nproc = nested_nth->nth[level + 1];
1383     }
1384 
1385     serial_team->t.t_level++;
1386     KF_TRACE(10, ("__kmpc_serialized_parallel: T#%d increasing nesting level "
1387                   "of serial team %p to %d\n",
1388                   global_tid, serial_team, serial_team->t.t_level));
1389 
1390     /* allocate/push dispatch buffers stack */
1391     KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
1392     {
1393       dispatch_private_info_t *disp_buffer =
1394           (dispatch_private_info_t *)__kmp_allocate(
1395               sizeof(dispatch_private_info_t));
1396       disp_buffer->next = serial_team->t.t_dispatch->th_disp_buffer;
1397       serial_team->t.t_dispatch->th_disp_buffer = disp_buffer;
1398     }
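    // The dispatch buffers pushed above form a simple LIFO list: each nested
    // serialized level gets a fresh dispatch_private_info_t so that, e.g., a
    // workshare loop in the inner region does not clobber the loop-scheduling
    // state of an enclosing serialized region. A sketch of the resulting shape:
    //   th_disp_buffer -> [level N] -> [level N-1] -> ... -> NULL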
1399     this_thr->th.th_dispatch = serial_team->t.t_dispatch;
1400 
1401     /* allocate/push task team stack */
1402     __kmp_push_task_team_node(this_thr, serial_team);
1403 
1404     KMP_MB();
1405   }
1406   KMP_CHECK_UPDATE(serial_team->t.t_cancel_request, cancel_noreq);
1407 
1408   // Perform the display affinity functionality for
1409   // serialized parallel regions
1410   if (__kmp_display_affinity) {
1411     if (this_thr->th.th_prev_level != serial_team->t.t_level ||
1412         this_thr->th.th_prev_num_threads != 1) {
1413       // NULL means use the affinity-format-var ICV
1414       __kmp_aux_display_affinity(global_tid, NULL);
1415       this_thr->th.th_prev_level = serial_team->t.t_level;
1416       this_thr->th.th_prev_num_threads = 1;
1417     }
1418   }
1419 
1420   if (__kmp_env_consistency_check)
1421     __kmp_push_parallel(global_tid, NULL);
1422 #if OMPT_SUPPORT
1423   serial_team->t.ompt_team_info.master_return_address = codeptr;
1424   if (ompt_enabled.enabled &&
1425       this_thr->th.ompt_thread_info.state != ompt_state_overhead) {
1426     OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame.ptr =
1427         OMPT_GET_FRAME_ADDRESS(0);
1428 
1429     ompt_lw_taskteam_t lw_taskteam;
1430     __ompt_lw_taskteam_init(&lw_taskteam, this_thr, global_tid,
1431                             &ompt_parallel_data, codeptr);
1432 
1433     __ompt_lw_taskteam_link(&lw_taskteam, this_thr, 1);
1434     // Don't use lw_taskteam after linking. Content was swapped.
1435 
1436     /* OMPT implicit task begin */
1437     if (ompt_enabled.ompt_callback_implicit_task) {
1438       ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1439           ompt_scope_begin, OMPT_CUR_TEAM_DATA(this_thr),
1440           OMPT_CUR_TASK_DATA(this_thr), 1, __kmp_tid_from_gtid(global_tid),
1441           ompt_task_implicit); // TODO: Can this be ompt_task_initial?
1442       OMPT_CUR_TASK_INFO(this_thr)->thread_num =
1443           __kmp_tid_from_gtid(global_tid);
1444     }
1445 
1446     /* OMPT state */
1447     this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
1448     OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame.ptr =
1449         OMPT_GET_FRAME_ADDRESS(0);
1450   }
1451 #endif
1452 }
1453 
1454 // Test if this fork is for a team closely nested in a teams construct
1455 static inline bool __kmp_is_fork_in_teams(kmp_info_t *master_th,
1456                                           microtask_t microtask, int level,
1457                                           int teams_level, kmp_va_list ap) {
1458   return (master_th->th.th_teams_microtask && ap &&
1459           microtask != (microtask_t)__kmp_teams_master && level == teams_level);
1460 }
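// Illustrative user code satisfying __kmp_is_fork_in_teams (a sketch; clause
// values are arbitrary):
//   #pragma omp teams num_teams(2)
//   {
//     #pragma omp parallel // closely nested in teams -> fork-in-teams path
//     { /* ... */ }
//   }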
1461 
1462 // Test if this fork is for the teams construct, i.e. to form the outer league
1463 // of teams
1464 static inline bool __kmp_is_entering_teams(int active_level, int level,
1465                                            int teams_level, kmp_va_list ap) {
1466   return ((ap == NULL && active_level == 0) ||
1467           (ap && teams_level > 0 && teams_level == level));
1468 }
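// A sketch of __kmp_is_entering_teams's two disjuncts (informal reading):
// either this fork has no varargs and no parallel region is active yet, or it
// carries args and sits exactly at the teams nesting level -- both mean we are
// forming the outer league, e.g. for a top-level "#pragma omp teams".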
1469 
1470 // AC: This is the start of a parallel region nested inside a teams construct.
1471 // The team is actual (hot); all workers are ready at the fork barrier.
1472 // No lock is needed to initialize the team a bit and then release the workers.
1473 static inline int
1474 __kmp_fork_in_teams(ident_t *loc, int gtid, kmp_team_t *parent_team,
1475                     kmp_int32 argc, kmp_info_t *master_th, kmp_root_t *root,
1476                     enum fork_context_e call_context, microtask_t microtask,
1477                     launch_t invoker, int master_set_numthreads, int level,
1478 #if OMPT_SUPPORT
1479                     ompt_data_t ompt_parallel_data, void *return_address,
1480 #endif
1481                     kmp_va_list ap) {
1482   void **argv;
1483   int i;
1484 
1485   parent_team->t.t_ident = loc;
1486   __kmp_alloc_argv_entries(argc, parent_team, TRUE);
1487   parent_team->t.t_argc = argc;
1488   argv = (void **)parent_team->t.t_argv;
1489   for (i = argc - 1; i >= 0; --i) {
1490     *argv++ = va_arg(kmp_va_deref(ap), void *);
1491   }
1492   // Increment our nested depth levels, but do not increase the serialization
1493   if (parent_team == master_th->th.th_serial_team) {
1494     // AC: we are in serialized parallel
1495     __kmpc_serialized_parallel(loc, gtid);
1496     KMP_DEBUG_ASSERT(parent_team->t.t_serialized > 1);
1497 
1498     if (call_context == fork_context_gnu) {
1499       // AC: need to decrement t_serialized for enquiry functions to work
1500       // correctly; it will be restored at join time
1501       parent_team->t.t_serialized--;
1502       return TRUE;
1503     }
1504 
1505 #if OMPD_SUPPORT
1506     parent_team->t.t_pkfn = microtask;
1507 #endif
1508 
1509 #if OMPT_SUPPORT
1510     void *dummy;
1511     void **exit_frame_p;
1512     ompt_data_t *implicit_task_data;
1513     ompt_lw_taskteam_t lw_taskteam;
1514 
1515     if (ompt_enabled.enabled) {
1516       __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1517                               &ompt_parallel_data, return_address);
1518       exit_frame_p = &(lw_taskteam.ompt_task_info.frame.exit_frame.ptr);
1519 
1520       __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
1521       // Don't use lw_taskteam after linking. Content was swapped.
1522 
1523       /* OMPT implicit task begin */
1524       implicit_task_data = OMPT_CUR_TASK_DATA(master_th);
1525       if (ompt_enabled.ompt_callback_implicit_task) {
1526         OMPT_CUR_TASK_INFO(master_th)->thread_num = __kmp_tid_from_gtid(gtid);
1527         ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1528             ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th), implicit_task_data,
1529             1, OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
1530       }
1531 
1532       /* OMPT state */
1533       master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1534     } else {
1535       exit_frame_p = &dummy;
1536     }
1537 #endif
1538 
1539     // AC: need to decrement t_serialized for enquiry functions to work
1540     // correctly; it will be restored at join time
1541     parent_team->t.t_serialized--;
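    // ("Enquiry functions" refers to OpenMP API queries such as
    // omp_get_num_threads(), which consult the team's serialization state.)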
1542 
1543     {
1544       KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1545       KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1546       __kmp_invoke_microtask(microtask, gtid, 0, argc, parent_team->t.t_argv
1547 #if OMPT_SUPPORT
1548                              ,
1549                              exit_frame_p
1550 #endif
1551                              );
1552     }
1553 
1554 #if OMPT_SUPPORT
1555     if (ompt_enabled.enabled) {
1556       *exit_frame_p = NULL;
1557       OMPT_CUR_TASK_INFO(master_th)->frame.exit_frame = ompt_data_none;
1558       if (ompt_enabled.ompt_callback_implicit_task) {
1559         ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1560             ompt_scope_end, NULL, implicit_task_data, 1,
1561             OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
1562       }
1563       ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
1564       __ompt_lw_taskteam_unlink(master_th);
1565       if (ompt_enabled.ompt_callback_parallel_end) {
1566         ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1567             &ompt_parallel_data, OMPT_CUR_TASK_DATA(master_th),
1568             OMPT_INVOKER(call_context) | ompt_parallel_team, return_address);
1569       }
1570       master_th->th.ompt_thread_info.state = ompt_state_overhead;
1571     }
1572 #endif
1573     return TRUE;
1574   }
1575 
1576   parent_team->t.t_pkfn = microtask;
1577   parent_team->t.t_invoke = invoker;
1578   KMP_ATOMIC_INC(&root->r.r_in_parallel);
1579   parent_team->t.t_active_level++;
1580   parent_team->t.t_level++;
1581   parent_team->t.t_def_allocator = master_th->th.th_def_allocator; // save
1582 
1583   // If fewer threads than the thread limit were allocated to the team, update
1584   // the thread limit here. th_teams_size.nth is specific to this team, which
1585   // is nested in a teams construct; the team is fully created, and we're about
1586   // to do the actual fork. Best to do this here so that the subsequent uses
1587   // below and in the join have the correct value.
1588   master_th->th.th_teams_size.nth = parent_team->t.t_nproc;
1589 
1590 #if OMPT_SUPPORT
1591   if (ompt_enabled.enabled) {
1592     ompt_lw_taskteam_t lw_taskteam;
1593     __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid, &ompt_parallel_data,
1594                             return_address);
1595     __ompt_lw_taskteam_link(&lw_taskteam, master_th, 1, true);
1596   }
1597 #endif
1598 
1599   /* Change number of threads in the team if requested */
1600   if (master_set_numthreads) { // The parallel has num_threads clause
1601     if (master_set_numthreads <= master_th->th.th_teams_size.nth) {
1602       // AC: can only reduce the number of threads dynamically; cannot increase it
1603       kmp_info_t **other_threads = parent_team->t.t_threads;
1604       // NOTE: if using distributed barrier, we need to run this code block
1605       // even when the team size appears not to have changed from the max.
1606       int old_proc = master_th->th.th_teams_size.nth;
1607       if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
1608         __kmp_resize_dist_barrier(parent_team, old_proc, master_set_numthreads);
1609         __kmp_add_threads_to_team(parent_team, master_set_numthreads);
1610       }
1611       parent_team->t.t_nproc = master_set_numthreads;
1612       for (i = 0; i < master_set_numthreads; ++i) {
1613         other_threads[i]->th.th_team_nproc = master_set_numthreads;
1614       }
1615     }
1616     // Keep extra threads hot in the team for possible subsequent parallel regions
1617     master_th->th.th_set_nproc = 0;
1618   }
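  // E.g., with teams of 8 threads each, a nested "#pragma omp parallel
  // num_threads(4)" shrinks the hot team to 4 in the block above; the other 4
  // threads stay parked in the team for a later, wider parallel region.
  // (Counts are illustrative.)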
1619 
1620 #if USE_DEBUGGER
1621   if (__kmp_debugging) { // Let debugger override number of threads.
1622     int nth = __kmp_omp_num_threads(loc);
1623     if (nth > 0) { // 0 means debugger doesn't want to change num threads
1624       master_set_numthreads = nth;
1625     }
1626   }
1627 #endif
1628 
1629   // Figure out the proc_bind policy for the nested parallel within teams
1630   kmp_proc_bind_t proc_bind = master_th->th.th_set_proc_bind;
1631   // proc_bind_default means don't update
1632   kmp_proc_bind_t proc_bind_icv = proc_bind_default;
1633   if (master_th->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
1634     proc_bind = proc_bind_false;
1635   } else {
1636     // No proc_bind clause specified; use current proc-bind-var
1637     if (proc_bind == proc_bind_default) {
1638       proc_bind = master_th->th.th_current_task->td_icvs.proc_bind;
1639     }
1640     /* else: The proc_bind policy was specified explicitly on the parallel clause.
1641        This overrides proc-bind-var for this parallel region, but does not
1642        change proc-bind-var. */
1643     // Figure out the value of proc-bind-var for the child threads.
1644     if ((level + 1 < __kmp_nested_proc_bind.used) &&
1645         (__kmp_nested_proc_bind.bind_types[level + 1] !=
1646          master_th->th.th_current_task->td_icvs.proc_bind)) {
1647       proc_bind_icv = __kmp_nested_proc_bind.bind_types[level + 1];
1648     }
1649   }
1650   KMP_CHECK_UPDATE(parent_team->t.t_proc_bind, proc_bind);
1651   // Need to change the bind-var ICV to the correct value for each implicit task
1652   if (proc_bind_icv != proc_bind_default &&
1653       master_th->th.th_current_task->td_icvs.proc_bind != proc_bind_icv) {
1654     kmp_info_t **other_threads = parent_team->t.t_threads;
1655     for (i = 0; i < master_th->th.th_team_nproc; ++i) {
1656       other_threads[i]->th.th_current_task->td_icvs.proc_bind = proc_bind_icv;
1657     }
1658   }
1659   // Reset for next parallel region
1660   master_th->th.th_set_proc_bind = proc_bind_default;
1661 
1662 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1663   if (((__itt_frame_submit_v3_ptr && __itt_get_timestamp_ptr) ||
1664        KMP_ITT_DEBUG) &&
1665       __kmp_forkjoin_frames_mode == 3 &&
1666       parent_team->t.t_active_level == 1 // only report frames at level 1
1667       && master_th->th.th_teams_size.nteams == 1) {
1668     kmp_uint64 tmp_time = __itt_get_timestamp();
1669     master_th->th.th_frame_time = tmp_time;
1670     parent_team->t.t_region_time = tmp_time;
1671   }
1672   if (__itt_stack_caller_create_ptr) {
1673     KMP_DEBUG_ASSERT(parent_team->t.t_stack_id == NULL);
1674     // create new stack stitching id before entering fork barrier
1675     parent_team->t.t_stack_id = __kmp_itt_stack_caller_create();
1676   }
1677 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
1678 #if KMP_AFFINITY_SUPPORTED
1679   __kmp_partition_places(parent_team);
1680 #endif
1681 
1682   KF_TRACE(10, ("__kmp_fork_in_teams: before internal fork: root=%p, team=%p, "
1683                 "master_th=%p, gtid=%d\n",
1684                 root, parent_team, master_th, gtid));
1685   __kmp_internal_fork(loc, gtid, parent_team);
1686   KF_TRACE(10, ("__kmp_fork_in_teams: after internal fork: root=%p, team=%p, "
1687                 "master_th=%p, gtid=%d\n",
1688                 root, parent_team, master_th, gtid));
1689 
1690   if (call_context == fork_context_gnu)
1691     return TRUE;
1692 
1693   /* Invoke microtask for PRIMARY thread */
1694   KA_TRACE(20, ("__kmp_fork_in_teams: T#%d(%d:0) invoke microtask = %p\n", gtid,
1695                 parent_team->t.t_id, parent_team->t.t_pkfn));
1696 
1697   if (!parent_team->t.t_invoke(gtid)) {
1698     KMP_ASSERT2(0, "cannot invoke microtask for PRIMARY thread");
1699   }
1700   KA_TRACE(20, ("__kmp_fork_in_teams: T#%d(%d:0) done microtask = %p\n", gtid,
1701                 parent_team->t.t_id, parent_team->t.t_pkfn));
1702   KMP_MB(); /* Flush all pending memory write invalidates.  */
1703 
1704   KA_TRACE(20, ("__kmp_fork_in_teams: parallel exit T#%d\n", gtid));
1705 
1706   return TRUE;
1707 }
1708 
1709 // Create a serialized parallel region
1710 static inline int
1711 __kmp_serial_fork_call(ident_t *loc, int gtid, enum fork_context_e call_context,
1712                        kmp_int32 argc, microtask_t microtask, launch_t invoker,
1713                        kmp_info_t *master_th, kmp_team_t *parent_team,
1714 #if OMPT_SUPPORT
1715                        ompt_data_t *ompt_parallel_data, void **return_address,
1716                        ompt_data_t **parent_task_data,
1717 #endif
1718                        kmp_va_list ap) {
1719   kmp_team_t *team;
1720   int i;
1721   void **argv;
1722 
1723 /* josh todo: hypothetical question: what do we do for OS X*? */
1724 #if KMP_OS_LINUX &&                                                            \
1725     (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
1726   SimpleVLA<void *> args(argc);
1727 #else
1728   void **args = (void **)KMP_ALLOCA(argc * sizeof(void *));
1729 #endif /* KMP_OS_LINUX && ( KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || \
1730           KMP_ARCH_AARCH64) */
1731 
1732   KA_TRACE(
1733       20, ("__kmp_serial_fork_call: T#%d serializing parallel region\n", gtid));
1734 
1735   __kmpc_serialized_parallel(loc, gtid);
1736 
1737 #if OMPD_SUPPORT
1738   master_th->th.th_serial_team->t.t_pkfn = microtask;
1739 #endif
1740 
1741   if (call_context == fork_context_intel) {
1742     /* TODO: this is clumsy; use the compiler itself to pass the args! :) */
1743     master_th->th.th_serial_team->t.t_ident = loc;
1744     if (!ap) {
1745       // revert change made in __kmpc_serialized_parallel()
1746       master_th->th.th_serial_team->t.t_level--;
1747 // Get args from parent team for teams construct
1748 
1749 #if OMPT_SUPPORT
1750       void *dummy;
1751       void **exit_frame_p;
1752       ompt_task_info_t *task_info;
1753       ompt_lw_taskteam_t lw_taskteam;
1754 
1755       if (ompt_enabled.enabled) {
1756         __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1757                                 ompt_parallel_data, *return_address);
1758 
1759         __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
1760         // Don't use lw_taskteam after linking. Content was swapped.
1761         task_info = OMPT_CUR_TASK_INFO(master_th);
1762         exit_frame_p = &(task_info->frame.exit_frame.ptr);
1763         if (ompt_enabled.ompt_callback_implicit_task) {
1764           OMPT_CUR_TASK_INFO(master_th)->thread_num = __kmp_tid_from_gtid(gtid);
1765           ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1766               ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1767               &(task_info->task_data), 1,
1768               OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
1769         }
1770 
1771         /* OMPT state */
1772         master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1773       } else {
1774         exit_frame_p = &dummy;
1775       }
1776 #endif
1777 
1778       {
1779         KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1780         KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1781         __kmp_invoke_microtask(microtask, gtid, 0, argc, parent_team->t.t_argv
1782 #if OMPT_SUPPORT
1783                                ,
1784                                exit_frame_p
1785 #endif
1786                                );
1787       }
1788 
1789 #if OMPT_SUPPORT
1790       if (ompt_enabled.enabled) {
1791         *exit_frame_p = NULL;
1792         if (ompt_enabled.ompt_callback_implicit_task) {
1793           ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1794               ompt_scope_end, NULL, &(task_info->task_data), 1,
1795               OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
1796         }
1797         *ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
1798         __ompt_lw_taskteam_unlink(master_th);
1799         if (ompt_enabled.ompt_callback_parallel_end) {
1800           ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1801               ompt_parallel_data, *parent_task_data,
1802               OMPT_INVOKER(call_context) | ompt_parallel_team, *return_address);
1803         }
1804         master_th->th.ompt_thread_info.state = ompt_state_overhead;
1805       }
1806 #endif
1807     } else if (microtask == (microtask_t)__kmp_teams_master) {
1808       KMP_DEBUG_ASSERT(master_th->th.th_team == master_th->th.th_serial_team);
1809       team = master_th->th.th_team;
1810       // team->t.t_pkfn = microtask;
1811       team->t.t_invoke = invoker;
1812       __kmp_alloc_argv_entries(argc, team, TRUE);
1813       team->t.t_argc = argc;
1814       argv = (void **)team->t.t_argv;
1815       for (i = argc - 1; i >= 0; --i)
1816         *argv++ = va_arg(kmp_va_deref(ap), void *);
1817       // AC: revert change made in __kmpc_serialized_parallel()
1818       //     because initial code in teams should have level=0
1819       team->t.t_level--;
1820       // AC: call special invoker for outer "parallel" of teams construct
1821       invoker(gtid);
1822 #if OMPT_SUPPORT
1823       if (ompt_enabled.enabled) {
1824         ompt_task_info_t *task_info = OMPT_CUR_TASK_INFO(master_th);
1825         if (ompt_enabled.ompt_callback_implicit_task) {
1826           ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1827               ompt_scope_end, NULL, &(task_info->task_data), 0,
1828               OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_initial);
1829         }
1830         if (ompt_enabled.ompt_callback_parallel_end) {
1831           ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1832               ompt_parallel_data, *parent_task_data,
1833               OMPT_INVOKER(call_context) | ompt_parallel_league,
1834               *return_address);
1835         }
1836         master_th->th.ompt_thread_info.state = ompt_state_overhead;
1837       }
1838 #endif
1839     } else {
1840       argv = args;
1841       for (i = argc - 1; i >= 0; --i)
1842         *argv++ = va_arg(kmp_va_deref(ap), void *);
1843       KMP_MB();
1844 
1845 #if OMPT_SUPPORT
1846       void *dummy;
1847       void **exit_frame_p;
1848       ompt_task_info_t *task_info;
1849       ompt_lw_taskteam_t lw_taskteam;
1850       ompt_data_t *implicit_task_data;
1851 
1852       if (ompt_enabled.enabled) {
1853         __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1854                                 ompt_parallel_data, *return_address);
1855         __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
1856         // Don't use lw_taskteam after linking. Content was swapped.
1857         task_info = OMPT_CUR_TASK_INFO(master_th);
1858         exit_frame_p = &(task_info->frame.exit_frame.ptr);
1859 
1860         /* OMPT implicit task begin */
1861         implicit_task_data = OMPT_CUR_TASK_DATA(master_th);
1862         if (ompt_enabled.ompt_callback_implicit_task) {
1863           ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1864               ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1865               implicit_task_data, 1, __kmp_tid_from_gtid(gtid),
1866               ompt_task_implicit);
1867           OMPT_CUR_TASK_INFO(master_th)->thread_num = __kmp_tid_from_gtid(gtid);
1868         }
1869 
1870         /* OMPT state */
1871         master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1872       } else {
1873         exit_frame_p = &dummy;
1874       }
1875 #endif
1876 
1877       {
1878         KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1879         KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1880         __kmp_invoke_microtask(microtask, gtid, 0, argc, args
1881 #if OMPT_SUPPORT
1882                                ,
1883                                exit_frame_p
1884 #endif
1885                                );
1886       }
1887 
1888 #if OMPT_SUPPORT
1889       if (ompt_enabled.enabled) {
1890         *exit_frame_p = NULL;
1891         if (ompt_enabled.ompt_callback_implicit_task) {
1892           ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1893               ompt_scope_end, NULL, &(task_info->task_data), 1,
1894               OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
1895         }
1896 
1897         *ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
1898         __ompt_lw_taskteam_unlink(master_th);
1899         if (ompt_enabled.ompt_callback_parallel_end) {
1900           ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1901               ompt_parallel_data, *parent_task_data,
1902               OMPT_INVOKER(call_context) | ompt_parallel_team, *return_address);
1903         }
1904         master_th->th.ompt_thread_info.state = ompt_state_overhead;
1905       }
1906 #endif
1907     }
1908   } else if (call_context == fork_context_gnu) {
1909 #if OMPT_SUPPORT
1910     if (ompt_enabled.enabled) {
1911       ompt_lw_taskteam_t lwt;
1912       __ompt_lw_taskteam_init(&lwt, master_th, gtid, ompt_parallel_data,
1913                               *return_address);
1914 
1915       lwt.ompt_task_info.frame.exit_frame = ompt_data_none;
1916       __ompt_lw_taskteam_link(&lwt, master_th, 1);
1917     }
1918 // Don't use lw_taskteam after linking. Content was swapped.
1919 #endif
1920 
1921     // we were called from GNU native code
1922     KA_TRACE(20, ("__kmp_serial_fork_call: T#%d serial exit\n", gtid));
1923     return FALSE;
1924   } else {
1925     KMP_ASSERT2(call_context < fork_context_last,
1926                 "__kmp_serial_fork_call: unknown fork_context parameter");
1927   }
1928 
1929   KA_TRACE(20, ("__kmp_serial_fork_call: T#%d serial exit\n", gtid));
1930   KMP_MB();
1931   return FALSE;
1932 }
1933 
1934 /* most of the work for a fork */
1935 /* return true if we really went parallel, false if serialized */
1936 int __kmp_fork_call(ident_t *loc, int gtid,
1937                     enum fork_context_e call_context, // Intel, GNU, ...
1938                     kmp_int32 argc, microtask_t microtask, launch_t invoker,
1939                     kmp_va_list ap) {
1940   void **argv;
1941   int i;
1942   int master_tid;
1943   int master_this_cons;
1944   kmp_team_t *team;
1945   kmp_team_t *parent_team;
1946   kmp_info_t *master_th;
1947   kmp_root_t *root;
1948   int nthreads;
1949   int master_active;
1950   int master_set_numthreads;
1951   int task_thread_limit = 0;
1952   int level;
1953   int active_level;
1954   int teams_level;
1955 #if KMP_NESTED_HOT_TEAMS
1956   kmp_hot_team_ptr_t **p_hot_teams;
1957 #endif
1958   { // KMP_TIME_BLOCK
1959     KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_fork_call);
1960     KMP_COUNT_VALUE(OMP_PARALLEL_args, argc);
1961 
1962     KA_TRACE(20, ("__kmp_fork_call: enter T#%d\n", gtid));
1963     if (__kmp_stkpadding > 0 && __kmp_root[gtid] != NULL) {
1964       /* Some systems prefer the stack for the root thread(s) to start with */
1965       /* some gap from the parent stack to prevent false sharing. */
1966       void *dummy = KMP_ALLOCA(__kmp_stkpadding);
1967       /* The two lines below ensure this does not get optimized out */
1968       if (__kmp_stkpadding > KMP_MAX_STKPADDING)
1969         __kmp_stkpadding += (short)((kmp_int64)dummy);
1970     }
1971 
1972     /* initialize if needed */
1973     KMP_DEBUG_ASSERT(
1974         __kmp_init_serial); // AC: potentially unsafe, not in sync with shutdown
1975     if (!TCR_4(__kmp_init_parallel))
1976       __kmp_parallel_initialize();
1977     __kmp_resume_if_soft_paused();
1978 
1979     /* setup current data */
1980     // AC: potentially unsafe, not in sync with library shutdown,
1981     // __kmp_threads can be freed
1982     master_th = __kmp_threads[gtid];
1983 
1984     parent_team = master_th->th.th_team;
1985     master_tid = master_th->th.th_info.ds.ds_tid;
1986     master_this_cons = master_th->th.th_local.this_construct;
1987     root = master_th->th.th_root;
1988     master_active = root->r.r_active;
1989     master_set_numthreads = master_th->th.th_set_nproc;
1990     task_thread_limit =
1991         master_th->th.th_current_task->td_icvs.task_thread_limit;
1992 
1993 #if OMPT_SUPPORT
1994     ompt_data_t ompt_parallel_data = ompt_data_none;
1995     ompt_data_t *parent_task_data = NULL;
1996     ompt_frame_t *ompt_frame = NULL;
1997     void *return_address = NULL;
1998 
1999     if (ompt_enabled.enabled) {
2000       __ompt_get_task_info_internal(0, NULL, &parent_task_data, &ompt_frame,
2001                                     NULL, NULL);
2002       return_address = OMPT_LOAD_RETURN_ADDRESS(gtid);
2003     }
2004 #endif
2005 
2006     // Assign affinity to root thread if it hasn't happened yet
2007     __kmp_assign_root_init_mask();
2008 
2009     // Nested level will be an index in the nested nthreads array
2010     level = parent_team->t.t_level;
2011     // used to launch non-serial teams even if nesting is not allowed
2012     active_level = parent_team->t.t_active_level;
2013     // needed to check nesting inside the teams construct
2014     teams_level = master_th->th.th_teams_level;
2015 #if KMP_NESTED_HOT_TEAMS
2016     p_hot_teams = &master_th->th.th_hot_teams;
2017     if (*p_hot_teams == NULL && __kmp_hot_teams_max_level > 0) {
2018       *p_hot_teams = (kmp_hot_team_ptr_t *)__kmp_allocate(
2019           sizeof(kmp_hot_team_ptr_t) * __kmp_hot_teams_max_level);
2020       (*p_hot_teams)[0].hot_team = root->r.r_hot_team;
2021       // it is either the actual hot team or not needed (when active_level > 0)
2022       (*p_hot_teams)[0].hot_team_nth = 1;
2023     }
2024 #endif
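    // Note: __kmp_hot_teams_max_level is typically set via the
    // KMP_HOT_TEAMS_MAX_LEVEL environment variable; e.g. a value of 2 keeps
    // teams at nesting levels 0 and 1 hot (threads kept alive between
    // regions). The value 2 is illustrative, not a requirement of this code.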
2025 
2026 #if OMPT_SUPPORT
2027     if (ompt_enabled.enabled) {
2028       if (ompt_enabled.ompt_callback_parallel_begin) {
2029         int team_size = master_set_numthreads
2030                             ? master_set_numthreads
2031                             : get__nproc_2(parent_team, master_tid);
2032         int flags = OMPT_INVOKER(call_context) |
2033                     ((microtask == (microtask_t)__kmp_teams_master)
2034                          ? ompt_parallel_league
2035                          : ompt_parallel_team);
2036         ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)(
2037             parent_task_data, ompt_frame, &ompt_parallel_data, team_size, flags,
2038             return_address);
2039       }
2040       master_th->th.ompt_thread_info.state = ompt_state_overhead;
2041     }
2042 #endif
2043 
2044     master_th->th.th_ident = loc;
2045 
2046     // Parallel closely nested in teams construct:
2047     if (__kmp_is_fork_in_teams(master_th, microtask, level, teams_level, ap)) {
2048       return __kmp_fork_in_teams(loc, gtid, parent_team, argc, master_th, root,
2049                                  call_context, microtask, invoker,
2050                                  master_set_numthreads, level,
2051 #if OMPT_SUPPORT
2052                                  ompt_parallel_data, return_address,
2053 #endif
2054                                  ap);
2055     } // End parallel closely nested in teams construct
2056 
2057     // Need this to happen before we determine the number of threads, not while
2058     // we are allocating the team
2059     //__kmp_push_current_task_to_thread(master_th, parent_team, 0);
2060 
2061     KMP_DEBUG_ASSERT_TASKTEAM_INVARIANT(parent_team, master_th);
2062 
2063     // Determine the number of threads
2064     int enter_teams =
2065         __kmp_is_entering_teams(active_level, level, teams_level, ap);
2066     if ((!enter_teams &&
2067          (parent_team->t.t_active_level >=
2068           master_th->th.th_current_task->td_icvs.max_active_levels)) ||
2069         (__kmp_library == library_serial)) {
2070       KC_TRACE(10, ("__kmp_fork_call: T#%d serializing team\n", gtid));
2071       nthreads = 1;
2072     } else {
2073       nthreads = master_set_numthreads
2074                      ? master_set_numthreads
2075                      // TODO: get nproc directly from current task
2076                      : get__nproc_2(parent_team, master_tid);
2077       // Use the thread_limit set for the current target task if it exists;
2078       // otherwise go with the deduced nthreads
2079       nthreads = task_thread_limit > 0 && task_thread_limit < nthreads
2080                      ? task_thread_limit
2081                      : nthreads;
2082       // Check if we need to take the forkjoin lock (no need for a serialized
2083       // parallel region outside of a teams construct).
2084       if (nthreads > 1) {
2085         /* determine how many new threads we can use */
2086         __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
2087         /* AC: If we execute teams from parallel region (on host), then teams
2088            should be created but each can only have 1 thread if nesting is
2089            disabled. If teams called from serial region, then teams and their
2090            threads should be created regardless of the nesting setting. */
2091         nthreads = __kmp_reserve_threads(root, parent_team, master_tid,
2092                                          nthreads, enter_teams);
2093         if (nthreads == 1) {
2094           // Free the lock for single-thread execution here; for multi-thread
2095           // execution it will be freed later, after the team of threads has
2096           // been created and initialized
2097           __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2098         }
2099       }
2100     }
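    // Illustrative ways nthreads can end up 1 here (taking the serialized path
    // below): num_threads(1) on the directive, __kmp_reserve_threads()
    // returning 1 when the thread pool is exhausted or nesting is disabled,
    // exceeding max-active-levels, or KMP_LIBRARY=serial
    // (__kmp_library == library_serial).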
2101     KMP_DEBUG_ASSERT(nthreads > 0);
2102 
2103     // If we temporarily changed the set number of threads, restore it now
2104     master_th->th.th_set_nproc = 0;
2105 
2106     if (nthreads == 1) {
2107       return __kmp_serial_fork_call(loc, gtid, call_context, argc, microtask,
2108                                     invoker, master_th, parent_team,
2109 #if OMPT_SUPPORT
2110                                     &ompt_parallel_data, &return_address,
2111                                     &parent_task_data,
2112 #endif
2113                                     ap);
2114     } // if (nthreads == 1)
2115 
2116     // GEH: only modify the executing flag in the case when not serialized;
2117     //      the serialized case is handled in __kmpc_serialized_parallel
2118     KF_TRACE(10, ("__kmp_fork_call: parent_team_aclevel=%d, master_th=%p, "
2119                   "curtask=%p, curtask_max_aclevel=%d\n",
2120                   parent_team->t.t_active_level, master_th,
2121                   master_th->th.th_current_task,
2122                   master_th->th.th_current_task->td_icvs.max_active_levels));
2123     // TODO: GEH - cannot do this assertion because root thread not set up as
2124     // executing
2125     // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 1 );
2126     master_th->th.th_current_task->td_flags.executing = 0;
2127 
2128     if (!master_th->th.th_teams_microtask || level > teams_level) {
2129       /* Increment our nested depth level */
2130       KMP_ATOMIC_INC(&root->r.r_in_parallel);
2131     }
2132 
2133     // See if we need to make a copy of the ICVs.
2134     int nthreads_icv = master_th->th.th_current_task->td_icvs.nproc;
2135     kmp_nested_nthreads_t *nested_nth = NULL;
2136     if (!master_th->th.th_set_nested_nth &&
2137         (level + 1 < parent_team->t.t_nested_nth->used) &&
2138         (parent_team->t.t_nested_nth->nth[level + 1] != nthreads_icv)) {
2139       nthreads_icv = parent_team->t.t_nested_nth->nth[level + 1];
2140     } else if (master_th->th.th_set_nested_nth) {
2141       nested_nth = __kmp_override_nested_nth(master_th, level);
2142       if ((level + 1 < nested_nth->used) &&
2143           (nested_nth->nth[level + 1] != nthreads_icv))
2144         nthreads_icv = nested_nth->nth[level + 1];
2145       else
2146         nthreads_icv = 0; // don't update
2147     } else {
2148       nthreads_icv = 0; // don't update
2149     }
2150 
2151     // Figure out the proc_bind_policy for the new team.
2152     kmp_proc_bind_t proc_bind = master_th->th.th_set_proc_bind;
2153     // proc_bind_default means don't update
2154     kmp_proc_bind_t proc_bind_icv = proc_bind_default;
2155     if (master_th->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
2156       proc_bind = proc_bind_false;
2157     } else {
2158       // No proc_bind clause specified; use current proc-bind-var for this
2159       // parallel region
2160       if (proc_bind == proc_bind_default) {
2161         proc_bind = master_th->th.th_current_task->td_icvs.proc_bind;
2162       }
2163       // Have teams construct take proc_bind value from KMP_TEAMS_PROC_BIND
2164       if (master_th->th.th_teams_microtask &&
2165           microtask == (microtask_t)__kmp_teams_master) {
2166         proc_bind = __kmp_teams_proc_bind;
2167       }
2168       /* else: The proc_bind policy was specified explicitly on the parallel clause.
2169          This overrides proc-bind-var for this parallel region, but does not
2170          change proc-bind-var. */
2171       // Figure out the value of proc-bind-var for the child threads.
2172       if ((level + 1 < __kmp_nested_proc_bind.used) &&
2173           (__kmp_nested_proc_bind.bind_types[level + 1] !=
2174            master_th->th.th_current_task->td_icvs.proc_bind)) {
2175         // Do not modify the proc-bind ICV for the two teams construct forks;
2176         // they just let the proc-bind ICV pass through
2177         if (!master_th->th.th_teams_microtask ||
2178             !(microtask == (microtask_t)__kmp_teams_master || ap == NULL))
2179           proc_bind_icv = __kmp_nested_proc_bind.bind_types[level + 1];
2180       }
2181     }
2182 
2183     // Reset for next parallel region
2184     master_th->th.th_set_proc_bind = proc_bind_default;
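    // Net effect of the logic above (a summary, not new policy): proc_bind_false
    // in the ICV disables binding outright; for the fork of the teams master,
    // KMP_TEAMS_PROC_BIND (__kmp_teams_proc_bind) takes precedence; otherwise an
    // explicit proc_bind clause overrides proc-bind-var for this region, and the
    // OMP_PROC_BIND per-level list feeds the children's bind-var.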
2185 
2186     if ((nthreads_icv > 0) || (proc_bind_icv != proc_bind_default)) {
2187       kmp_internal_control_t new_icvs;
2188       copy_icvs(&new_icvs, &master_th->th.th_current_task->td_icvs);
2189       new_icvs.next = NULL;
2190       if (nthreads_icv > 0) {
2191         new_icvs.nproc = nthreads_icv;
2192       }
2193       if (proc_bind_icv != proc_bind_default) {
2194         new_icvs.proc_bind = proc_bind_icv;
2195       }
2196 
2197       /* allocate a new parallel team */
2198       KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n"));
2199       team = __kmp_allocate_team(root, nthreads, nthreads,
2200 #if OMPT_SUPPORT
2201                                  ompt_parallel_data,
2202 #endif
2203                                  proc_bind, &new_icvs,
2204                                  argc USE_NESTED_HOT_ARG(master_th));
2205       if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar)
2206         copy_icvs((kmp_internal_control_t *)team->t.b->team_icvs, &new_icvs);
2207     } else {
2208       /* allocate a new parallel team */
2209       KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n"));
2210       team = __kmp_allocate_team(root, nthreads, nthreads,
2211 #if OMPT_SUPPORT
2212                                  ompt_parallel_data,
2213 #endif
2214                                  proc_bind,
2215                                  &master_th->th.th_current_task->td_icvs,
2216                                  argc USE_NESTED_HOT_ARG(master_th));
2217       if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar)
2218         copy_icvs((kmp_internal_control_t *)team->t.b->team_icvs,
2219                   &master_th->th.th_current_task->td_icvs);
2220     }
2221     KF_TRACE(
2222         10, ("__kmp_fork_call: after __kmp_allocate_team - team = %p\n", team));
2223 
2224     /* setup the new team */
2225     KMP_CHECK_UPDATE(team->t.t_master_tid, master_tid);
2226     KMP_CHECK_UPDATE(team->t.t_master_this_cons, master_this_cons);
2227     KMP_CHECK_UPDATE(team->t.t_ident, loc);
2228     KMP_CHECK_UPDATE(team->t.t_parent, parent_team);
2229     KMP_CHECK_UPDATE_SYNC(team->t.t_pkfn, microtask);
2230 #if OMPT_SUPPORT
2231     KMP_CHECK_UPDATE_SYNC(team->t.ompt_team_info.master_return_address,
2232                           return_address);
2233 #endif
2234     KMP_CHECK_UPDATE(team->t.t_invoke, invoker); // TODO move to root, maybe
2235     // TODO: parent_team->t.t_level == INT_MAX ???
2236     if (!master_th->th.th_teams_microtask || level > teams_level) {
2237       int new_level = parent_team->t.t_level + 1;
2238       KMP_CHECK_UPDATE(team->t.t_level, new_level);
2239       new_level = parent_team->t.t_active_level + 1;
2240       KMP_CHECK_UPDATE(team->t.t_active_level, new_level);
2241     } else {
2242       // AC: Do not increase parallel level at start of the teams construct
2243       int new_level = parent_team->t.t_level;
2244       KMP_CHECK_UPDATE(team->t.t_level, new_level);
2245       new_level = parent_team->t.t_active_level;
2246       KMP_CHECK_UPDATE(team->t.t_active_level, new_level);
2247     }
2248     kmp_r_sched_t new_sched = get__sched_2(parent_team, master_tid);
2249     // set primary thread's schedule as new run-time schedule
2250     KMP_CHECK_UPDATE(team->t.t_sched.sched, new_sched.sched);
2251 
2252     KMP_CHECK_UPDATE(team->t.t_cancel_request, cancel_noreq);
2253     KMP_CHECK_UPDATE(team->t.t_def_allocator, master_th->th.th_def_allocator);
2254 
2255     // Check if hot team has potentially outdated list, and if so, free it
2256     if (team->t.t_nested_nth &&
2257         team->t.t_nested_nth != parent_team->t.t_nested_nth) {
2258       KMP_INTERNAL_FREE(team->t.t_nested_nth->nth);
2259       KMP_INTERNAL_FREE(team->t.t_nested_nth);
2260       team->t.t_nested_nth = NULL;
2261     }
2262     team->t.t_nested_nth = parent_team->t.t_nested_nth;
2263     if (master_th->th.th_set_nested_nth) {
2264       if (!nested_nth)
2265         nested_nth = __kmp_override_nested_nth(master_th, level);
2266       team->t.t_nested_nth = nested_nth;
2267       KMP_INTERNAL_FREE(master_th->th.th_set_nested_nth);
2268       master_th->th.th_set_nested_nth = NULL;
2269       master_th->th.th_set_nested_nth_sz = 0;
2270       master_th->th.th_nt_strict = false;
2271     }
2272 
2273     // Update the floating point rounding in the team if required.
2274     propagateFPControl(team);
2275 #if OMPD_SUPPORT
2276     if (ompd_state & OMPD_ENABLE_BP)
2277       ompd_bp_parallel_begin();
2278 #endif
2279 
2280     KA_TRACE(
2281         20,
2282         ("__kmp_fork_call: T#%d(%d:%d)->(%d:0) created a team of %d threads\n",
2283          gtid, parent_team->t.t_id, team->t.t_master_tid, team->t.t_id,
2284          team->t.t_nproc));
2285     KMP_DEBUG_ASSERT(team != root->r.r_hot_team ||
2286                      (team->t.t_master_tid == 0 &&
2287                       (team->t.t_parent == root->r.r_root_team ||
2288                        team->t.t_parent->t.t_serialized)));
2289     KMP_MB();
2290 
2291     /* now, setup the arguments */
2292     argv = (void **)team->t.t_argv;
2293     if (ap) {
2294       for (i = argc - 1; i >= 0; --i) {
2295         void *new_argv = va_arg(kmp_va_deref(ap), void *);
2296         KMP_CHECK_UPDATE(*argv, new_argv);
2297         argv++;
2298       }
2299     } else {
2300       for (i = 0; i < argc; ++i) {
2301         // Get args from parent team for teams construct
2302         KMP_CHECK_UPDATE(argv[i], team->t.t_parent->t.t_argv[i]);
2303       }
2304     }
2305 
2306     /* now actually fork the threads */
2307     KMP_CHECK_UPDATE(team->t.t_master_active, master_active);
2308     if (!root->r.r_active) // Only do assignment if it prevents cache ping-pong
2309       root->r.r_active = TRUE;
2310 
2311     __kmp_fork_team_threads(root, team, master_th, gtid, !ap);
2312     __kmp_setup_icv_copy(team, nthreads,
2313                          &master_th->th.th_current_task->td_icvs, loc);
2314 
2315 #if OMPT_SUPPORT
2316     master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
2317 #endif
2318 
2319     __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2320 
2321 #if USE_ITT_BUILD
2322     if (team->t.t_active_level == 1 // only report frames at level 1
2323         && !master_th->th.th_teams_microtask) { // not in teams construct
2324 #if USE_ITT_NOTIFY
2325       if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
2326           (__kmp_forkjoin_frames_mode == 3 ||
2327            __kmp_forkjoin_frames_mode == 1)) {
2328         kmp_uint64 tmp_time = 0;
2329         if (__itt_get_timestamp_ptr)
2330           tmp_time = __itt_get_timestamp();
2331         // Internal fork - report frame begin
2332         master_th->th.th_frame_time = tmp_time;
2333         if (__kmp_forkjoin_frames_mode == 3)
2334           team->t.t_region_time = tmp_time;
2335       } else
2336 // only one notification scheme (either "submit" or "forking/joined", not both)
2337 #endif /* USE_ITT_NOTIFY */
2338         if ((__itt_frame_begin_v3_ptr || KMP_ITT_DEBUG) &&
2339             __kmp_forkjoin_frames && !__kmp_forkjoin_frames_mode) {
2340           // Mark start of "parallel" region for Intel(R) VTune(TM) analyzer.
2341           __kmp_itt_region_forking(gtid, team->t.t_nproc, 0);
2342         }
2343     }
2344 #endif /* USE_ITT_BUILD */
2345 
2346     /* now go on and do the work */
2347     KMP_DEBUG_ASSERT(team == __kmp_threads[gtid]->th.th_team);
2348     KMP_MB();
2349     KF_TRACE(10,
2350              ("__kmp_internal_fork : root=%p, team=%p, master_th=%p, gtid=%d\n",
2351               root, team, master_th, gtid));
2352 
2353 #if USE_ITT_BUILD
2354     if (__itt_stack_caller_create_ptr) {
2355       // create new stack stitching id before entering fork barrier
2356       if (!enter_teams) {
2357         KMP_DEBUG_ASSERT(team->t.t_stack_id == NULL);
2358         team->t.t_stack_id = __kmp_itt_stack_caller_create();
2359       } else if (parent_team->t.t_serialized) {
2360         // keep stack stitching id in the serialized parent_team;
2361         // current team will be used for parallel inside the teams;
2362         // if parent_team is active, then it already keeps stack stitching id
2363         // for the league of teams
2364         KMP_DEBUG_ASSERT(parent_team->t.t_stack_id == NULL);
2365         parent_team->t.t_stack_id = __kmp_itt_stack_caller_create();
2366       }
2367     }
2368 #endif /* USE_ITT_BUILD */
2369 
2370     // AC: skip __kmp_internal_fork at the teams construct; let only the
2371     // primary threads execute
2372     if (ap) {
2373       __kmp_internal_fork(loc, gtid, team);
2374       KF_TRACE(10, ("__kmp_internal_fork : after : root=%p, team=%p, "
2375                     "master_th=%p, gtid=%d\n",
2376                     root, team, master_th, gtid));
2377     }
2378 
2379     if (call_context == fork_context_gnu) {
2380       KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
2381       return TRUE;
2382     }
2383 
2384     /* Invoke microtask for PRIMARY thread */
2385     KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n", gtid,
2386                   team->t.t_id, team->t.t_pkfn));
2387   } // END of timer KMP_fork_call block
2388 
2389 #if KMP_STATS_ENABLED
2390   // If beginning a teams construct, then change thread state
2391   stats_state_e previous_state = KMP_GET_THREAD_STATE();
2392   if (!ap) {
2393     KMP_SET_THREAD_STATE(stats_state_e::TEAMS_REGION);
2394   }
2395 #endif
2396 
2397   if (!team->t.t_invoke(gtid)) {
2398     KMP_ASSERT2(0, "cannot invoke microtask for PRIMARY thread");
2399   }
2400 
2401 #if KMP_STATS_ENABLED
2402   // If was beginning of a teams construct, then reset thread state
2403   if (!ap) {
2404     KMP_SET_THREAD_STATE(previous_state);
2405   }
2406 #endif
2407 
2408   KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n", gtid,
2409                 team->t.t_id, team->t.t_pkfn));
2410   KMP_MB(); /* Flush all pending memory write invalidates.  */
2411 
2412   KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
2413 #if OMPT_SUPPORT
2414   if (ompt_enabled.enabled) {
2415     master_th->th.ompt_thread_info.state = ompt_state_overhead;
2416   }
2417 #endif
2418 
2419   return TRUE;
2420 }
2421 
2422 #if OMPT_SUPPORT
2423 static inline void __kmp_join_restore_state(kmp_info_t *thread,
2424                                             kmp_team_t *team) {
2425   // restore state outside the region
2426   thread->th.ompt_thread_info.state =
2427       ((team->t.t_serialized) ? ompt_state_work_serial
2428                               : ompt_state_work_parallel);
2429 }
2430 
2431 static inline void __kmp_join_ompt(int gtid, kmp_info_t *thread,
2432                                    kmp_team_t *team, ompt_data_t *parallel_data,
2433                                    int flags, void *codeptr) {
2434   ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2435   if (ompt_enabled.ompt_callback_parallel_end) {
2436     ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
2437         parallel_data, &(task_info->task_data), flags, codeptr);
2438   }
2439 
2440   task_info->frame.enter_frame = ompt_data_none;
2441   __kmp_join_restore_state(thread, team);
2442 }
2443 #endif
2444 
2445 void __kmp_join_call(ident_t *loc, int gtid
2446 #if OMPT_SUPPORT
2447                      ,
2448                      enum fork_context_e fork_context
2449 #endif
2450                      ,
2451                      int exit_teams) {
2452   KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_join_call);
2453   kmp_team_t *team;
2454   kmp_team_t *parent_team;
2455   kmp_info_t *master_th;
2456   kmp_root_t *root;
2457   int master_active;
2458 
2459   KA_TRACE(20, ("__kmp_join_call: enter T#%d\n", gtid));
2460 
2461   /* setup current data */
2462   master_th = __kmp_threads[gtid];
2463   root = master_th->th.th_root;
2464   team = master_th->th.th_team;
2465   parent_team = team->t.t_parent;
2466 
2467   master_th->th.th_ident = loc;
2468 
2469 #if OMPT_SUPPORT
2470   void *team_microtask = (void *)team->t.t_pkfn;
2471   // For the GOMP interface with a serialized parallel region, we need
2472   // __kmpc_end_serialized_parallel to call the hooks for the OMPT
2473   // end-implicit-task and end-parallel events.
2474   if (ompt_enabled.enabled &&
2475       !(team->t.t_serialized && fork_context == fork_context_gnu)) {
2476     master_th->th.ompt_thread_info.state = ompt_state_overhead;
2477   }
2478 #endif
2479 
2480 #if KMP_DEBUG
2481   if (__kmp_tasking_mode != tskm_immediate_exec && !exit_teams) {
2482     KA_TRACE(20, ("__kmp_join_call: T#%d, old team = %p old task_team = %p, "
2483                   "th_task_team = %p\n",
2484                   __kmp_gtid_from_thread(master_th), team,
2485                   team->t.t_task_team[master_th->th.th_task_state],
2486                   master_th->th.th_task_team));
2487     KMP_DEBUG_ASSERT_TASKTEAM_INVARIANT(team, master_th);
2488   }
2489 #endif
2490 
2491   if (team->t.t_serialized) {
2492     if (master_th->th.th_teams_microtask) {
2493       // We are in teams construct
2494       int level = team->t.t_level;
2495       int tlevel = master_th->th.th_teams_level;
2496       if (level == tlevel) {
2497         // AC: we haven't incremented it earlier, at the start of the teams
2498         //     construct, so do it here, at the end of the teams construct
2499         team->t.t_level++;
2500       } else if (level == tlevel + 1) {
2501         // AC: we are exiting a parallel region inside the teams construct;
2502         // increment serialization so it can be restored in the next call to
2503         // __kmpc_end_serialized_parallel
2504         team->t.t_serialized++;
2505       }
2506     }
2507     __kmpc_end_serialized_parallel(loc, gtid);
2508 
2509 #if OMPT_SUPPORT
2510     if (ompt_enabled.enabled) {
2511       if (fork_context == fork_context_gnu) {
2512         __ompt_lw_taskteam_unlink(master_th);
2513       }
2514       __kmp_join_restore_state(master_th, parent_team);
2515     }
2516 #endif
2517 
2518     return;
2519   }
2520 
2521   master_active = team->t.t_master_active;
2522 
2523   if (!exit_teams) {
2524     // AC: No barrier for internal teams at exit from the teams construct,
2525     //     but there is a barrier for the external team (league).
2526     __kmp_internal_join(loc, gtid, team);
2527 #if USE_ITT_BUILD
2528     if (__itt_stack_caller_create_ptr) {
2529       KMP_DEBUG_ASSERT(team->t.t_stack_id != NULL);
2530       // destroy the stack stitching id after join barrier
2531       __kmp_itt_stack_caller_destroy((__itt_caller)team->t.t_stack_id);
2532       team->t.t_stack_id = NULL;
2533     }
2534 #endif
2535   } else {
2536     master_th->th.th_task_state =
2537         0; // AC: no tasking in teams (outside of any parallel region)
2538 #if USE_ITT_BUILD
2539     if (__itt_stack_caller_create_ptr && parent_team->t.t_serialized) {
2540       KMP_DEBUG_ASSERT(parent_team->t.t_stack_id != NULL);
2541       // destroy the stack stitching id on exit from the teams construct
2542       // if parent_team is active, then the id will be destroyed later on
2543       // by the master of the league of teams
2544       __kmp_itt_stack_caller_destroy((__itt_caller)parent_team->t.t_stack_id);
2545       parent_team->t.t_stack_id = NULL;
2546     }
2547 #endif
2548   }
2549 
2550   KMP_MB();
2551 
2552 #if OMPT_SUPPORT
2553   ompt_data_t *parallel_data = &(team->t.ompt_team_info.parallel_data);
2554   void *codeptr = team->t.ompt_team_info.master_return_address;
2555 #endif
2556 
2557 #if USE_ITT_BUILD
2558   // Mark end of "parallel" region for Intel(R) VTune(TM) analyzer.
2559   if (team->t.t_active_level == 1 &&
2560       (!master_th->th.th_teams_microtask || /* not in teams construct */
2561        master_th->th.th_teams_size.nteams == 1)) {
2562     master_th->th.th_ident = loc;
2563     // only one notification scheme (either "submit" or "forking/joined", not
2564     // both)
2565     if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
2566         __kmp_forkjoin_frames_mode == 3)
2567       __kmp_itt_frame_submit(gtid, team->t.t_region_time,
2568                              master_th->th.th_frame_time, 0, loc,
2569                              master_th->th.th_team_nproc, 1);
2570     else if ((__itt_frame_end_v3_ptr || KMP_ITT_DEBUG) &&
2571              !__kmp_forkjoin_frames_mode && __kmp_forkjoin_frames)
2572       __kmp_itt_region_joined(gtid);
2573   } // active_level == 1
2574 #endif /* USE_ITT_BUILD */
2575 
2576 #if KMP_AFFINITY_SUPPORTED
2577   if (!exit_teams) {
2578     // Restore master thread's partition.
2579     master_th->th.th_first_place = team->t.t_first_place;
2580     master_th->th.th_last_place = team->t.t_last_place;
2581   }
2582 #endif // KMP_AFFINITY_SUPPORTED
2583 
2584   if (master_th->th.th_teams_microtask && !exit_teams &&
2585       team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
2586       team->t.t_level == master_th->th.th_teams_level + 1) {
2587 // AC: We need to leave the team structure intact at the end of parallel
2588 // inside the teams construct, so that at the next parallel same (hot) team
2589 // works, only adjust nesting levels
2590 #if OMPT_SUPPORT
2591     ompt_data_t ompt_parallel_data = ompt_data_none;
2592     if (ompt_enabled.enabled) {
2593       ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2594       if (ompt_enabled.ompt_callback_implicit_task) {
2595         int ompt_team_size = team->t.t_nproc;
2596         ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
2597             ompt_scope_end, NULL, &(task_info->task_data), ompt_team_size,
2598             OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
2599       }
2600       task_info->frame.exit_frame = ompt_data_none;
2601       task_info->task_data = ompt_data_none;
2602       ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
2603       __ompt_lw_taskteam_unlink(master_th);
2604     }
2605 #endif
2606     /* Decrement our nested depth level */
2607     team->t.t_level--;
2608     team->t.t_active_level--;
2609     KMP_ATOMIC_DEC(&root->r.r_in_parallel);
2610 
2611     // Restore number of threads in the team if needed. This code relies on
2612     // the proper adjustment of th_teams_size.nth after the fork in
2613     // __kmp_teams_master on each teams primary thread in the case that
2614     // __kmp_reserve_threads reduced it.
2615     if (master_th->th.th_team_nproc < master_th->th.th_teams_size.nth) {
2616       int old_num = master_th->th.th_team_nproc;
2617       int new_num = master_th->th.th_teams_size.nth;
2618       kmp_info_t **other_threads = team->t.t_threads;
2619       team->t.t_nproc = new_num;
2620       for (int i = 0; i < old_num; ++i) {
2621         other_threads[i]->th.th_team_nproc = new_num;
2622       }
2623       // Adjust the states of the unused threads of the team
2624       for (int i = old_num; i < new_num; ++i) {
2625         // Re-initialize thread's barrier data.
2626         KMP_DEBUG_ASSERT(other_threads[i]);
2627         kmp_balign_t *balign = other_threads[i]->th.th_bar;
2628         for (int b = 0; b < bs_last_barrier; ++b) {
2629           balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
2630           KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
2631 #if USE_DEBUGGER
2632           balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
2633 #endif
2634         }
2635         if (__kmp_tasking_mode != tskm_immediate_exec) {
2636           // Synchronize thread's task state
2637           other_threads[i]->th.th_task_state = master_th->th.th_task_state;
2638         }
2639       }
2640     }
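    // E.g., if a nested num_threads(2) clause shrank the hot team from
    // th_teams_size.nth == 4 down to 2 for the previous parallel, the block
    // above restores the team to 4 and re-primes the barrier and task state of
    // threads 2..3 for the next parallel region. (Counts are illustrative.)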
2641 
2642 #if OMPT_SUPPORT
2643     if (ompt_enabled.enabled) {
2644       __kmp_join_ompt(gtid, master_th, parent_team, &ompt_parallel_data,
2645                       OMPT_INVOKER(fork_context) | ompt_parallel_team, codeptr);
2646     }
2647 #endif
2648 
2649     return;
2650   }
2651 
2652   /* do cleanup and restore the parent team */
2653   master_th->th.th_info.ds.ds_tid = team->t.t_master_tid;
2654   master_th->th.th_local.this_construct = team->t.t_master_this_cons;
2655 
2656   master_th->th.th_dispatch = &parent_team->t.t_dispatch[team->t.t_master_tid];
2657 
2658   /* jc: The following lock has instructions with REL and ACQ semantics,
2659      separating the parallel user code called in this parallel region
2660      from the serial user code called after this function returns. */
2661   __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
2662 
2663   if (!master_th->th.th_teams_microtask ||
2664       team->t.t_level > master_th->th.th_teams_level) {
2665     /* Decrement our nested depth level */
2666     KMP_ATOMIC_DEC(&root->r.r_in_parallel);
2667   }
2668   KMP_DEBUG_ASSERT(root->r.r_in_parallel >= 0);
2669 
2670 #if OMPT_SUPPORT
2671   if (ompt_enabled.enabled) {
2672     ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2673     if (ompt_enabled.ompt_callback_implicit_task) {
2674       int flags = (team_microtask == (void *)__kmp_teams_master)
2675                       ? ompt_task_initial
2676                       : ompt_task_implicit;
2677       int ompt_team_size = (flags == ompt_task_initial) ? 0 : team->t.t_nproc;
2678       ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
2679           ompt_scope_end, NULL, &(task_info->task_data), ompt_team_size,
2680           OMPT_CUR_TASK_INFO(master_th)->thread_num, flags);
2681     }
2682     task_info->frame.exit_frame = ompt_data_none;
2683     task_info->task_data = ompt_data_none;
2684   }
2685 #endif
2686 
2687   KF_TRACE(10, ("__kmp_join_call1: T#%d, this_thread=%p team=%p\n", 0,
2688                 master_th, team));
2689   __kmp_pop_current_task_from_thread(master_th);
2690 
2691   master_th->th.th_def_allocator = team->t.t_def_allocator;
2692 
2693 #if OMPD_SUPPORT
2694   if (ompd_state & OMPD_ENABLE_BP)
2695     ompd_bp_parallel_end();
2696 #endif
2697   updateHWFPControl(team);
2698 
2699   if (root->r.r_active != master_active)
2700     root->r.r_active = master_active;
2701 
2702   __kmp_free_team(root, team USE_NESTED_HOT_ARG(
2703                             master_th)); // this will free worker threads
2704 
2705   /* This race was fun to find. Keep the following inside the critical
2706      region; otherwise assertions may fail occasionally, since the old team may
2707      be reallocated and the hierarchy appears inconsistent. It is actually safe
2708      to run unlocked and won't cause any bugs, but it does trigger those
2709      assertion failures. It's only one deref&assign, so keep it in the lock. */
2710   master_th->th.th_team = parent_team;
2711   master_th->th.th_team_nproc = parent_team->t.t_nproc;
2712   master_th->th.th_team_master = parent_team->t.t_threads[0];
2713   master_th->th.th_team_serialized = parent_team->t.t_serialized;
2714 
2715   /* restore serialized team, if need be */
2716   if (parent_team->t.t_serialized &&
2717       parent_team != master_th->th.th_serial_team &&
2718       parent_team != root->r.r_root_team) {
2719     __kmp_free_team(root,
2720                     master_th->th.th_serial_team USE_NESTED_HOT_ARG(NULL));
2721     master_th->th.th_serial_team = parent_team;
2722   }
2723 
2724   if (__kmp_tasking_mode != tskm_immediate_exec) {
2725     // Restore primary thread's task state from team structure
2726     KMP_DEBUG_ASSERT(team->t.t_primary_task_state == 0 ||
2727                      team->t.t_primary_task_state == 1);
2728     master_th->th.th_task_state = (kmp_uint8)team->t.t_primary_task_state;
2729 
2730     // Copy the task team from the parent team to the primary thread
2731     master_th->th.th_task_team =
2732         parent_team->t.t_task_team[master_th->th.th_task_state];
2733     KA_TRACE(20,
2734              ("__kmp_join_call: Primary T#%d restoring task_team %p, team %p\n",
2735               __kmp_gtid_from_thread(master_th), master_th->th.th_task_team,
2736               parent_team));
2737   }
2738 
2739   // TODO: GEH - cannot do this assertion because root thread not set up as
2740   // executing
2741   // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 0 );
2742   master_th->th.th_current_task->td_flags.executing = 1;
2743 
2744   __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2745 
2746 #if KMP_AFFINITY_SUPPORTED
2747   if (master_th->th.th_team->t.t_level == 0 && __kmp_affinity.flags.reset) {
2748     __kmp_reset_root_init_mask(gtid);
2749   }
2750 #endif
2751 #if OMPT_SUPPORT
2752   int flags =
2753       OMPT_INVOKER(fork_context) |
2754       ((team_microtask == (void *)__kmp_teams_master) ? ompt_parallel_league
2755                                                       : ompt_parallel_team);
2756   if (ompt_enabled.enabled) {
2757     __kmp_join_ompt(gtid, master_th, parent_team, parallel_data, flags,
2758                     codeptr);
2759   }
2760 #endif
2761 
2762   KMP_MB();
2763   KA_TRACE(20, ("__kmp_join_call: exit T#%d\n", gtid));
2764 }
2765 
2766 /* Check whether we should push an internal control record onto the
2767    serial team stack.  If so, do it.  */
2768 void __kmp_save_internal_controls(kmp_info_t *thread) {
2769 
2770   if (thread->th.th_team != thread->th.th_serial_team) {
2771     return;
2772   }
2773   if (thread->th.th_team->t.t_serialized > 1) {
2774     int push = 0;
2775 
2776     if (thread->th.th_team->t.t_control_stack_top == NULL) {
2777       push = 1;
2778     } else {
2779       if (thread->th.th_team->t.t_control_stack_top->serial_nesting_level !=
2780           thread->th.th_team->t.t_serialized) {
2781         push = 1;
2782       }
2783     }
2784     if (push) { /* push a record on the serial team's stack */
2785       kmp_internal_control_t *control =
2786           (kmp_internal_control_t *)__kmp_allocate(
2787               sizeof(kmp_internal_control_t));
2788 
2789       copy_icvs(control, &thread->th.th_current_task->td_icvs);
2790 
2791       control->serial_nesting_level = thread->th.th_team->t.t_serialized;
2792 
2793       control->next = thread->th.th_team->t.t_control_stack_top;
2794       thread->th.th_team->t.t_control_stack_top = control;
2795     }
2796   }
2797 }
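
// Exposition-only sketch (not part of the runtime): callers that mutate an
// ICV are expected to invoke __kmp_save_internal_controls() first, so the
// pushed record can restore the previous values when the enclosing serialized
// region ends. __kmp_set_num_threads() below follows this pattern; a
// hypothetical setter would look like:
#if 0
static void __example_set_icv(int gtid, int value) { // hypothetical helper
  kmp_info_t *thread = __kmp_threads[gtid];
  __kmp_save_internal_controls(thread); // push ICVs if this serialized
                                        // nesting level has no record yet
  thread->th.th_current_task->td_icvs.max_active_levels = value;
}
#endif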
2798 
2799 /* Changes set_nproc */
2800 void __kmp_set_num_threads(int new_nth, int gtid) {
2801   kmp_info_t *thread;
2802   kmp_root_t *root;
2803 
2804   KF_TRACE(10, ("__kmp_set_num_threads: new __kmp_nth = %d\n", new_nth));
2805   KMP_DEBUG_ASSERT(__kmp_init_serial);
2806 
2807   if (new_nth < 1)
2808     new_nth = 1;
2809   else if (new_nth > __kmp_max_nth)
2810     new_nth = __kmp_max_nth;
2811 
2812   KMP_COUNT_VALUE(OMP_set_numthreads, new_nth);
2813   thread = __kmp_threads[gtid];
2814   if (thread->th.th_current_task->td_icvs.nproc == new_nth)
2815     return; // nothing to do
2816 
2817   __kmp_save_internal_controls(thread);
2818 
2819   set__nproc(thread, new_nth);
2820 
2821   // If this omp_set_num_threads() call will cause the hot team size to be
2822   // reduced (in the absence of a num_threads clause), then reduce it now,
2823   // rather than waiting for the next parallel region.
2824   root = thread->th.th_root;
2825   if (__kmp_init_parallel && (!root->r.r_active) &&
2826       (root->r.r_hot_team->t.t_nproc > new_nth)
2827 #if KMP_NESTED_HOT_TEAMS
2828       && __kmp_hot_teams_max_level && !__kmp_hot_teams_mode
2829 #endif
2830   ) {
2831     kmp_team_t *hot_team = root->r.r_hot_team;
2832     int f;
2833 
2834     __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
2835 
2836     if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
2837       __kmp_resize_dist_barrier(hot_team, hot_team->t.t_nproc, new_nth);
2838     }
2839     // Release the extra threads we don't need any more.
2840     for (f = new_nth; f < hot_team->t.t_nproc; f++) {
2841       KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL);
2842       if (__kmp_tasking_mode != tskm_immediate_exec) {
2843         // When decreasing the team size, threads no longer in the team
2844         // should unreference the task team.
2845         hot_team->t.t_threads[f]->th.th_task_team = NULL;
2846       }
2847       __kmp_free_thread(hot_team->t.t_threads[f]);
2848       hot_team->t.t_threads[f] = NULL;
2849     }
2850     hot_team->t.t_nproc = new_nth;
2851 #if KMP_NESTED_HOT_TEAMS
2852     if (thread->th.th_hot_teams) {
2853       KMP_DEBUG_ASSERT(hot_team == thread->th.th_hot_teams[0].hot_team);
2854       thread->th.th_hot_teams[0].hot_team_nth = new_nth;
2855     }
2856 #endif
2857 
2858     if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
2859       hot_team->t.b->update_num_threads(new_nth);
2860       __kmp_add_threads_to_team(hot_team, new_nth);
2861     }
2862 
2863     __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2864 
2865     // Update the t_nproc field in the threads that are still active.
2866     for (f = 0; f < new_nth; f++) {
2867       KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL);
2868       hot_team->t.t_threads[f]->th.th_team_nproc = new_nth;
2869     }
2870     // Special flag marking a size change caused by omp_set_num_threads()
2871     hot_team->t.t_size_changed = -1;
2872   }
2873 }
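
// Exposition-only, user-level view of the hot-team shrink above (a sketch,
// assuming default settings; it is not compiled into the runtime): after a
// parallel region has populated the hot team, a smaller value passed to
// omp_set_num_threads() releases the extra threads immediately rather than
// at the next fork.
#if 0
#include <omp.h>
int main() {
  #pragma omp parallel // hot team grows to the default team size
  {}
  omp_set_num_threads(2); // extra hot-team threads are freed right here
  #pragma omp parallel // subsequent forks reuse the 2-thread hot team
  {}
  return 0;
}
#endif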
2874 
2875 /* Changes max_active_levels */
2876 void __kmp_set_max_active_levels(int gtid, int max_active_levels) {
2877   kmp_info_t *thread;
2878 
2879   KF_TRACE(10, ("__kmp_set_max_active_levels: new max_active_levels for thread "
2880                 "%d = (%d)\n",
2881                 gtid, max_active_levels));
2882   KMP_DEBUG_ASSERT(__kmp_init_serial);
2883 
2884   // validate max_active_levels
2885   if (max_active_levels < 0) {
2886     KMP_WARNING(ActiveLevelsNegative, max_active_levels);
2887     // We ignore this call if the user has specified a negative value.
2888     // The current setting won't be changed. The last valid setting will be
2889     // used. A warning will be issued (if warnings are allowed as controlled by
2890     // the KMP_WARNINGS env var).
2891     KF_TRACE(10, ("__kmp_set_max_active_levels: the call is ignored: new "
2892                   "max_active_levels for thread %d = (%d)\n",
2893                   gtid, max_active_levels));
2894     return;
2895   }
2896   if (max_active_levels <= KMP_MAX_ACTIVE_LEVELS_LIMIT) {
2897     // OK: max_active_levels is within the valid range
2898     // [ 0; KMP_MAX_ACTIVE_LEVELS_LIMIT ].
2899     // A zero value is allowed (implementation-defined behavior).
2900   } else {
2901     KMP_WARNING(ActiveLevelsExceedLimit, max_active_levels,
2902                 KMP_MAX_ACTIVE_LEVELS_LIMIT);
2903     max_active_levels = KMP_MAX_ACTIVE_LEVELS_LIMIT;
2904     // The current upper limit is MAX_INT (implementation-defined behavior).
2905     // If the input exceeds the upper limit, clamp it to the upper limit
2906     // (implementation-defined behavior).
2907     // Note: as long as the limit is MAX_INT, control can never reach here.
2908   }
2909   KF_TRACE(10, ("__kmp_set_max_active_levels: after validation: new "
2910                 "max_active_levels for thread %d = (%d)\n",
2911                 gtid, max_active_levels));
2912 
2913   thread = __kmp_threads[gtid];
2914 
2915   __kmp_save_internal_controls(thread);
2916 
2917   set__max_active_levels(thread, max_active_levels);
2918 }
2919 
2920 /* Gets max_active_levels */
2921 int __kmp_get_max_active_levels(int gtid) {
2922   kmp_info_t *thread;
2923 
2924   KF_TRACE(10, ("__kmp_get_max_active_levels: thread %d\n", gtid));
2925   KMP_DEBUG_ASSERT(__kmp_init_serial);
2926 
2927   thread = __kmp_threads[gtid];
2928   KMP_DEBUG_ASSERT(thread->th.th_current_task);
2929   KF_TRACE(10, ("__kmp_get_max_active_levels: thread %d, curtask=%p, "
2930                 "curtask_maxaclevel=%d\n",
2931                 gtid, thread->th.th_current_task,
2932                 thread->th.th_current_task->td_icvs.max_active_levels));
2933   return thread->th.th_current_task->td_icvs.max_active_levels;
2934 }
2935 
2936 // nteams-var per-device ICV
2937 void __kmp_set_num_teams(int num_teams) {
2938   if (num_teams > 0)
2939     __kmp_nteams = num_teams;
2940 }
2941 int __kmp_get_max_teams(void) { return __kmp_nteams; }
2942 // teams-thread-limit-var per-device ICV
2943 void __kmp_set_teams_thread_limit(int limit) {
2944   if (limit > 0)
2945     __kmp_teams_thread_limit = limit;
2946 }
2947 int __kmp_get_teams_thread_limit(void) { return __kmp_teams_thread_limit; }
2948 
2949 KMP_BUILD_ASSERT(sizeof(kmp_sched_t) == sizeof(int));
2950 KMP_BUILD_ASSERT(sizeof(enum sched_type) == sizeof(int));
2951 
2952 /* Changes def_sched_var ICV values (run-time schedule kind and chunk) */
2953 void __kmp_set_schedule(int gtid, kmp_sched_t kind, int chunk) {
2954   kmp_info_t *thread;
2955   kmp_sched_t orig_kind;
2956   //    kmp_team_t *team;
2957 
2958   KF_TRACE(10, ("__kmp_set_schedule: new schedule for thread %d = (%d, %d)\n",
2959                 gtid, (int)kind, chunk));
2960   KMP_DEBUG_ASSERT(__kmp_init_serial);
2961 
2962   // Check if the kind parameter is valid; correct it if needed.
2963   // Valid parameters should fit in one of two intervals - standard or extended:
2964   //       <lower>, <valid>, <upper_std>, <lower_ext>, <valid>, <upper>
2965   // 2008-01-25: 0,  1 - 4,       5,         100,     101 - 102, 103
2966   orig_kind = kind;
2967   kind = __kmp_sched_without_mods(kind);
2968 
2969   if (kind <= kmp_sched_lower || kind >= kmp_sched_upper ||
2970       (kind <= kmp_sched_lower_ext && kind >= kmp_sched_upper_std)) {
2971     // TODO: Hint needs attention in case we change the default schedule.
2972     __kmp_msg(kmp_ms_warning, KMP_MSG(ScheduleKindOutOfRange, kind),
2973               KMP_HNT(DefaultScheduleKindUsed, "static, no chunk"),
2974               __kmp_msg_null);
2975     kind = kmp_sched_default;
2976     chunk = 0; // ignore chunk value in case of bad kind
2977   }
2978 
2979   thread = __kmp_threads[gtid];
2980 
2981   __kmp_save_internal_controls(thread);
2982 
2983   if (kind < kmp_sched_upper_std) {
2984     if (kind == kmp_sched_static && chunk < KMP_DEFAULT_CHUNK) {
2985       // differentiate static chunked vs. unchunked: an invalid chunk value
2986       // indicates the unchunked schedule (which is the default)
2987       thread->th.th_current_task->td_icvs.sched.r_sched_type = kmp_sch_static;
2988     } else {
2989       thread->th.th_current_task->td_icvs.sched.r_sched_type =
2990           __kmp_sch_map[kind - kmp_sched_lower - 1];
2991     }
2992   } else {
2993     //    __kmp_sch_map[ kind - kmp_sched_lower_ext + kmp_sched_upper_std -
2994     //    kmp_sched_lower - 2 ];
2995     thread->th.th_current_task->td_icvs.sched.r_sched_type =
2996         __kmp_sch_map[kind - kmp_sched_lower_ext + kmp_sched_upper_std -
2997                       kmp_sched_lower - 2];
2998   }
2999   __kmp_sched_apply_mods_intkind(
3000       orig_kind, &(thread->th.th_current_task->td_icvs.sched.r_sched_type));
3001   if (kind == kmp_sched_auto || chunk < 1) {
3002     // ignore the chunk parameter for auto schedules and invalid chunk values
3003     thread->th.th_current_task->td_icvs.sched.chunk = KMP_DEFAULT_CHUNK;
3004   } else {
3005     thread->th.th_current_task->td_icvs.sched.chunk = chunk;
3006   }
3007 }
3008 
3009 /* Gets def_sched_var ICV values */
3010 void __kmp_get_schedule(int gtid, kmp_sched_t *kind, int *chunk) {
3011   kmp_info_t *thread;
3012   enum sched_type th_type;
3013 
3014   KF_TRACE(10, ("__kmp_get_schedule: thread %d\n", gtid));
3015   KMP_DEBUG_ASSERT(__kmp_init_serial);
3016 
3017   thread = __kmp_threads[gtid];
3018 
3019   th_type = thread->th.th_current_task->td_icvs.sched.r_sched_type;
3020   switch (SCHEDULE_WITHOUT_MODIFIERS(th_type)) {
3021   case kmp_sch_static:
3022   case kmp_sch_static_greedy:
3023   case kmp_sch_static_balanced:
3024     *kind = kmp_sched_static;
3025     __kmp_sched_apply_mods_stdkind(kind, th_type);
3026     *chunk = 0; // chunk was not set; signal this fact with a zero value
3027     return;
3028   case kmp_sch_static_chunked:
3029     *kind = kmp_sched_static;
3030     break;
3031   case kmp_sch_dynamic_chunked:
3032     *kind = kmp_sched_dynamic;
3033     break;
3034   case kmp_sch_guided_chunked:
3035   case kmp_sch_guided_iterative_chunked:
3036   case kmp_sch_guided_analytical_chunked:
3037     *kind = kmp_sched_guided;
3038     break;
3039   case kmp_sch_auto:
3040     *kind = kmp_sched_auto;
3041     break;
3042   case kmp_sch_trapezoidal:
3043     *kind = kmp_sched_trapezoidal;
3044     break;
3045 #if KMP_STATIC_STEAL_ENABLED
3046   case kmp_sch_static_steal:
3047     *kind = kmp_sched_static_steal;
3048     break;
3049 #endif
3050   default:
3051     KMP_FATAL(UnknownSchedulingType, th_type);
3052   }
3053 
3054   __kmp_sched_apply_mods_stdkind(kind, th_type);
3055   *chunk = thread->th.th_current_task->td_icvs.sched.chunk;
3056 }
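
// Exposition-only sketch of how these two routines surface through the
// standard API (assuming the usual libomp entry points, where
// omp_set_schedule()/omp_get_schedule() forward to
// __kmp_set_schedule()/__kmp_get_schedule() for the calling thread):
#if 0
#include <omp.h>
int main() {
  omp_set_schedule(omp_sched_guided, 4); // run-time schedule: guided, chunk 4
  omp_sched_t kind;
  int chunk;
  omp_get_schedule(&kind, &chunk); // kind == omp_sched_guided, chunk == 4
  return 0;
}
#endif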
3057 
3058 int __kmp_get_ancestor_thread_num(int gtid, int level) {
3059 
3060   int ii, dd;
3061   kmp_team_t *team;
3062   kmp_info_t *thr;
3063 
3064   KF_TRACE(10, ("__kmp_get_ancestor_thread_num: thread %d %d\n", gtid, level));
3065   KMP_DEBUG_ASSERT(__kmp_init_serial);
3066 
3067   // validate level
3068   if (level == 0)
3069     return 0;
3070   if (level < 0)
3071     return -1;
3072   thr = __kmp_threads[gtid];
3073   team = thr->th.th_team;
3074   ii = team->t.t_level;
3075   if (level > ii)
3076     return -1;
3077 
3078   if (thr->th.th_teams_microtask) {
3079     // AC: we are in a teams region where multiple nested teams share a level
3080     int tlevel = thr->th.th_teams_level; // the level of the teams construct
3081     if (level <=
3082         tlevel) { // otherwise the usual algorithm works (won't touch teams)
3083       KMP_DEBUG_ASSERT(ii >= tlevel);
3084       // AC: Since we need to walk past the teams league, we artificially
3085       // increase ii
3086       if (ii == tlevel) {
3087         ii += 2; // three teams have the same level
3088       } else {
3089         ii++; // two teams have the same level
3090       }
3091     }
3092   }
3093 
3094   if (ii == level)
3095     return __kmp_tid_from_gtid(gtid);
3096 
3097   dd = team->t.t_serialized;
3098   level++;
3099   while (ii > level) {
3100     for (dd = team->t.t_serialized; (dd > 0) && (ii > level); dd--, ii--) {
3101     }
3102     if ((team->t.t_serialized) && (!dd)) {
3103       team = team->t.t_parent;
3104       continue;
3105     }
3106     if (ii > level) {
3107       team = team->t.t_parent;
3108       dd = team->t.t_serialized;
3109       ii--;
3110     }
3111   }
3112 
3113   return (dd > 1) ? (0) : (team->t.t_master_tid);
3114 }
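
// Exposition-only sketch of the semantics the walk above implements, as seen
// from the standard API (omp_get_ancestor_thread_num() reaches this routine):
#if 0
#include <omp.h>
#include <stdio.h>
int main() {
  omp_set_max_active_levels(2);
  #pragma omp parallel num_threads(2)
  #pragma omp parallel num_threads(2)
  {
    // level 2: this thread's own thread num; level 1: the parent's thread
    // num in the outer team; level 0: always 0 (the initial thread)
    printf("%d %d %d\n", omp_get_ancestor_thread_num(2),
           omp_get_ancestor_thread_num(1), omp_get_ancestor_thread_num(0));
  }
  return 0;
}
#endif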
3115 
3116 int __kmp_get_team_size(int gtid, int level) {
3117 
3118   int ii, dd;
3119   kmp_team_t *team;
3120   kmp_info_t *thr;
3121 
3122   KF_TRACE(10, ("__kmp_get_team_size: thread %d %d\n", gtid, level));
3123   KMP_DEBUG_ASSERT(__kmp_init_serial);
3124 
3125   // validate level
3126   if (level == 0)
3127     return 1;
3128   if (level < 0)
3129     return -1;
3130   thr = __kmp_threads[gtid];
3131   team = thr->th.th_team;
3132   ii = team->t.t_level;
3133   if (level > ii)
3134     return -1;
3135 
3136   if (thr->th.th_teams_microtask) {
3137     // AC: we are in a teams region where multiple nested teams share a level
3138     int tlevel = thr->th.th_teams_level; // the level of the teams construct
3139     if (level <=
3140         tlevel) { // otherwise the usual algorithm works (won't touch teams)
3141       KMP_DEBUG_ASSERT(ii >= tlevel);
3142       // AC: Since we need to walk past the teams league, we artificially
3143       // increase ii
3144       if (ii == tlevel) {
3145         ii += 2; // three teams have the same level
3146       } else {
3147         ii++; // two teams have the same level
3148       }
3149     }
3150   }
3151 
3152   while (ii > level) {
3153     for (dd = team->t.t_serialized; (dd > 0) && (ii > level); dd--, ii--) {
3154     }
3155     if (team->t.t_serialized && (!dd)) {
3156       team = team->t.t_parent;
3157       continue;
3158     }
3159     if (ii > level) {
3160       team = team->t.t_parent;
3161       ii--;
3162     }
3163   }
3164 
3165   return team->t.t_nproc;
3166 }
3167 
3168 kmp_r_sched_t __kmp_get_schedule_global() {
3169   // This routine was created because the pairs (__kmp_sched, __kmp_chunk)
3170   // and (__kmp_static, __kmp_guided) may be changed by kmp_set_defaults
3171   // independently, so the up-to-date schedule can be obtained here.
3172 
3173   kmp_r_sched_t r_sched;
3174 
3175   // create the schedule from 4 globals: __kmp_sched, __kmp_chunk,
3176   // __kmp_static, __kmp_guided. __kmp_sched should keep its original value,
3177   // so that the user can set KMP_SCHEDULE multiple times and thus have
3178   // different run-time schedules in different roots (even in OMP 2.5)
3179   enum sched_type s = SCHEDULE_WITHOUT_MODIFIERS(__kmp_sched);
3180   enum sched_type sched_modifiers = SCHEDULE_GET_MODIFIERS(__kmp_sched);
3181   if (s == kmp_sch_static) {
3182     // replace STATIC with more detailed schedule (balanced or greedy)
3183     r_sched.r_sched_type = __kmp_static;
3184   } else if (s == kmp_sch_guided_chunked) {
3185     // replace GUIDED with more detailed schedule (iterative or analytical)
3186     r_sched.r_sched_type = __kmp_guided;
3187   } else { // (STATIC_CHUNKED), or (DYNAMIC_CHUNKED), or other
3188     r_sched.r_sched_type = __kmp_sched;
3189   }
3190   SCHEDULE_SET_MODIFIERS(r_sched.r_sched_type, sched_modifiers);
3191 
3192   if (__kmp_chunk < KMP_DEFAULT_CHUNK) {
3193     // __kmp_chunk may be wrong here (if it was not ever set)
3194     r_sched.chunk = KMP_DEFAULT_CHUNK;
3195   } else {
3196     r_sched.chunk = __kmp_chunk;
3197   }
3198 
3199   return r_sched;
3200 }
3201 
3202 /* Allocate (realloc == FALSE) or reallocate (realloc == TRUE)
3203    at least argc *t_argv entries for the requested team. */
3204 static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team, int realloc) {
3205 
3206   KMP_DEBUG_ASSERT(team);
3207   if (!realloc || argc > team->t.t_max_argc) {
3208 
3209     KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: needed entries=%d, "
3210                    "current entries=%d\n",
3211                    team->t.t_id, argc, (realloc) ? team->t.t_max_argc : 0));
3212     /* if previously allocated heap space for args, free them */
3213     if (realloc && team->t.t_argv != &team->t.t_inline_argv[0])
3214       __kmp_free((void *)team->t.t_argv);
3215 
3216     if (argc <= KMP_INLINE_ARGV_ENTRIES) {
3217       /* use unused space in the cache line for arguments */
3218       team->t.t_max_argc = KMP_INLINE_ARGV_ENTRIES;
3219       KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: inline allocate %d "
3220                      "argv entries\n",
3221                      team->t.t_id, team->t.t_max_argc));
3222       team->t.t_argv = &team->t.t_inline_argv[0];
3223       if (__kmp_storage_map) {
3224         __kmp_print_storage_map_gtid(
3225             -1, &team->t.t_inline_argv[0],
3226             &team->t.t_inline_argv[KMP_INLINE_ARGV_ENTRIES],
3227             (sizeof(void *) * KMP_INLINE_ARGV_ENTRIES), "team_%d.t_inline_argv",
3228             team->t.t_id);
3229       }
3230     } else {
3231       /* allocate space for arguments in the heap */
3232       team->t.t_max_argc = (argc <= (KMP_MIN_MALLOC_ARGV_ENTRIES >> 1))
3233                                ? KMP_MIN_MALLOC_ARGV_ENTRIES
3234                                : 2 * argc;
3235       KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: dynamic allocate %d "
3236                      "argv entries\n",
3237                      team->t.t_id, team->t.t_max_argc));
3238       team->t.t_argv =
3239           (void **)__kmp_page_allocate(sizeof(void *) * team->t.t_max_argc);
3240       if (__kmp_storage_map) {
3241         __kmp_print_storage_map_gtid(-1, &team->t.t_argv[0],
3242                                      &team->t.t_argv[team->t.t_max_argc],
3243                                      sizeof(void *) * team->t.t_max_argc,
3244                                      "team_%d.t_argv", team->t.t_id);
3245       }
3246     }
3247   }
3248 }
3249 
3250 static void __kmp_allocate_team_arrays(kmp_team_t *team, int max_nth) {
3251   int i;
3252   int num_disp_buff = max_nth > 1 ? __kmp_dispatch_num_buffers : 2;
3253   team->t.t_threads =
3254       (kmp_info_t **)__kmp_allocate(sizeof(kmp_info_t *) * max_nth);
3255   team->t.t_disp_buffer = (dispatch_shared_info_t *)__kmp_allocate(
3256       sizeof(dispatch_shared_info_t) * num_disp_buff);
3257   team->t.t_dispatch =
3258       (kmp_disp_t *)__kmp_allocate(sizeof(kmp_disp_t) * max_nth);
3259   team->t.t_implicit_task_taskdata =
3260       (kmp_taskdata_t *)__kmp_allocate(sizeof(kmp_taskdata_t) * max_nth);
3261   team->t.t_max_nproc = max_nth;
3262 
3263   /* setup dispatch buffers */
3264   for (i = 0; i < num_disp_buff; ++i) {
3265     team->t.t_disp_buffer[i].buffer_index = i;
3266     team->t.t_disp_buffer[i].doacross_buf_idx = i;
3267   }
3268 }
3269 
3270 static void __kmp_free_team_arrays(kmp_team_t *team) {
3271   /* Note: this does not free the threads in t_threads (__kmp_free_threads) */
3272   int i;
3273   for (i = 0; i < team->t.t_max_nproc; ++i) {
3274     if (team->t.t_dispatch[i].th_disp_buffer != NULL) {
3275       __kmp_free(team->t.t_dispatch[i].th_disp_buffer);
3276       team->t.t_dispatch[i].th_disp_buffer = NULL;
3277     }
3278   }
3279 #if KMP_USE_HIER_SCHED
3280   __kmp_dispatch_free_hierarchies(team);
3281 #endif
3282   __kmp_free(team->t.t_threads);
3283   __kmp_free(team->t.t_disp_buffer);
3284   __kmp_free(team->t.t_dispatch);
3285   __kmp_free(team->t.t_implicit_task_taskdata);
3286   team->t.t_threads = NULL;
3287   team->t.t_disp_buffer = NULL;
3288   team->t.t_dispatch = NULL;
3289   team->t.t_implicit_task_taskdata = NULL;
3290 }
3291 
3292 static void __kmp_reallocate_team_arrays(kmp_team_t *team, int max_nth) {
3293   kmp_info_t **oldThreads = team->t.t_threads;
3294 
3295   __kmp_free(team->t.t_disp_buffer);
3296   __kmp_free(team->t.t_dispatch);
3297   __kmp_free(team->t.t_implicit_task_taskdata);
3298   __kmp_allocate_team_arrays(team, max_nth);
3299 
3300   KMP_MEMCPY(team->t.t_threads, oldThreads,
3301              team->t.t_nproc * sizeof(kmp_info_t *));
3302 
3303   __kmp_free(oldThreads);
3304 }
3305 
3306 static kmp_internal_control_t __kmp_get_global_icvs(void) {
3307 
3308   kmp_r_sched_t r_sched =
3309       __kmp_get_schedule_global(); // get current state of scheduling globals
3310 
3311   KMP_DEBUG_ASSERT(__kmp_nested_proc_bind.used > 0);
3312 
3313   kmp_internal_control_t g_icvs = {
3314     0, // int serial_nesting_level; //corresponds to value of th_team_serialized
3315     (kmp_int8)__kmp_global.g.g_dynamic, // internal control for dynamic
3316     // adjustment of threads (per thread)
3317     (kmp_int8)__kmp_env_blocktime, // int bt_set; //internal control for
3318     // whether blocktime is explicitly set
3319     __kmp_dflt_blocktime, // int blocktime; //internal control for blocktime
3320 #if KMP_USE_MONITOR
3321     __kmp_bt_intervals, // int bt_intervals; //internal control for blocktime
3322 // intervals
3323 #endif
3324     __kmp_dflt_team_nth, // int nproc; //internal control for # of threads for
3325     // next parallel region (per thread)
3326     // (use a max ub on value if __kmp_parallel_initialize not called yet)
3327     __kmp_cg_max_nth, // int thread_limit;
3328     __kmp_task_max_nth, // int task_thread_limit; // to set the thread_limit
3329     // on task. This is used in the case of target thread_limit
3330     __kmp_dflt_max_active_levels, // int max_active_levels; //internal control
3331     // for max_active_levels
3332     r_sched, // kmp_r_sched_t sched; //internal control for runtime schedule
3333     // {sched,chunk} pair
3334     __kmp_nested_proc_bind.bind_types[0],
3335     __kmp_default_device,
3336     NULL // struct kmp_internal_control *next;
3337   };
3338 
3339   return g_icvs;
3340 }
3341 
3342 static kmp_internal_control_t __kmp_get_x_global_icvs(const kmp_team_t *team) {
3343 
3344   kmp_internal_control_t gx_icvs;
3345   gx_icvs.serial_nesting_level =
3346       0; // probably = team->t.t_serialized, as in __kmp_save_internal_controls
3347   copy_icvs(&gx_icvs, &team->t.t_threads[0]->th.th_current_task->td_icvs);
3348   gx_icvs.next = NULL;
3349 
3350   return gx_icvs;
3351 }
3352 
3353 static void __kmp_initialize_root(kmp_root_t *root) {
3354   int f;
3355   kmp_team_t *root_team;
3356   kmp_team_t *hot_team;
3357   int hot_team_max_nth;
3358   kmp_r_sched_t r_sched =
3359       __kmp_get_schedule_global(); // get current state of scheduling globals
3360   kmp_internal_control_t r_icvs = __kmp_get_global_icvs();
3361   KMP_DEBUG_ASSERT(root);
3362   KMP_ASSERT(!root->r.r_begin);
3363 
3364   /* setup the root state structure */
3365   __kmp_init_lock(&root->r.r_begin_lock);
3366   root->r.r_begin = FALSE;
3367   root->r.r_active = FALSE;
3368   root->r.r_in_parallel = 0;
3369   root->r.r_blocktime = __kmp_dflt_blocktime;
3370 #if KMP_AFFINITY_SUPPORTED
3371   root->r.r_affinity_assigned = FALSE;
3372 #endif
3373 
3374   /* setup the root team for this task */
3375   /* allocate the root team structure */
3376   KF_TRACE(10, ("__kmp_initialize_root: before root_team\n"));
3377 
3378   root_team =
3379       __kmp_allocate_team(root,
3380                           1, // new_nproc
3381                           1, // max_nproc
3382 #if OMPT_SUPPORT
3383                           ompt_data_none, // root parallel id
3384 #endif
3385                           __kmp_nested_proc_bind.bind_types[0], &r_icvs,
3386                           0 // argc
3387                           USE_NESTED_HOT_ARG(NULL) // primary thread is unknown
3388       );
3389 #if USE_DEBUGGER
3390   // Non-NULL value should be assigned to make the debugger display the root
3391   // team.
3392   TCW_SYNC_PTR(root_team->t.t_pkfn, (microtask_t)(~0));
3393 #endif
3394 
3395   KF_TRACE(10, ("__kmp_initialize_root: after root_team = %p\n", root_team));
3396 
3397   root->r.r_root_team = root_team;
3398   root_team->t.t_control_stack_top = NULL;
3399 
3400   /* initialize root team */
3401   root_team->t.t_threads[0] = NULL;
3402   root_team->t.t_nproc = 1;
3403   root_team->t.t_serialized = 1;
3404   // TODO???: root_team->t.t_max_active_levels = __kmp_dflt_max_active_levels;
3405   root_team->t.t_sched.sched = r_sched.sched;
3406   root_team->t.t_nested_nth = &__kmp_nested_nth;
3407   KA_TRACE(
3408       20,
3409       ("__kmp_initialize_root: init root team %d arrived: join=%u, plain=%u\n",
3410        root_team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
3411 
3412   /* setup the hot team for this task */
3413   /* allocate the hot team structure */
3414   KF_TRACE(10, ("__kmp_initialize_root: before hot_team\n"));
3415 
3416   hot_team =
3417       __kmp_allocate_team(root,
3418                           1, // new_nproc
3419                           __kmp_dflt_team_nth_ub * 2, // max_nproc
3420 #if OMPT_SUPPORT
3421                           ompt_data_none, // root parallel id
3422 #endif
3423                           __kmp_nested_proc_bind.bind_types[0], &r_icvs,
3424                           0 // argc
3425                           USE_NESTED_HOT_ARG(NULL) // primary thread is unknown
3426       );
3427   KF_TRACE(10, ("__kmp_initialize_root: after hot_team = %p\n", hot_team));
3428 
3429   root->r.r_hot_team = hot_team;
3430   root_team->t.t_control_stack_top = NULL;
3431 
3432   /* first-time initialization */
3433   hot_team->t.t_parent = root_team;
3434 
3435   /* initialize hot team */
3436   hot_team_max_nth = hot_team->t.t_max_nproc;
3437   for (f = 0; f < hot_team_max_nth; ++f) {
3438     hot_team->t.t_threads[f] = NULL;
3439   }
3440   hot_team->t.t_nproc = 1;
3441   // TODO???: hot_team->t.t_max_active_levels = __kmp_dflt_max_active_levels;
3442   hot_team->t.t_sched.sched = r_sched.sched;
3443   hot_team->t.t_size_changed = 0;
3444   hot_team->t.t_nested_nth = &__kmp_nested_nth;
3445 }
3446 
3447 #ifdef KMP_DEBUG
3448 
3449 typedef struct kmp_team_list_item {
3450   kmp_team_p const *entry;
3451   struct kmp_team_list_item *next;
3452 } kmp_team_list_item_t;
3453 typedef kmp_team_list_item_t *kmp_team_list_t;
3454 
3455 static void __kmp_print_structure_team_accum( // Add team to list of teams.
3456     kmp_team_list_t list, // List of teams.
3457     kmp_team_p const *team // Team to add.
3458 ) {
3459 
3460   // List must terminate with item where both entry and next are NULL.
3461   // Team is added to the list only once.
3462   // List is sorted in ascending order by team id.
3463   // Team id is *not* a key.
3464 
3465   kmp_team_list_t l;
3466 
3467   KMP_DEBUG_ASSERT(list != NULL);
3468   if (team == NULL) {
3469     return;
3470   }
3471 
3472   __kmp_print_structure_team_accum(list, team->t.t_parent);
3473   __kmp_print_structure_team_accum(list, team->t.t_next_pool);
3474 
3475   // Search list for the team.
3476   l = list;
3477   while (l->next != NULL && l->entry != team) {
3478     l = l->next;
3479   }
3480   if (l->next != NULL) {
3481     return; // Team has been added before, exit.
3482   }
3483 
3484   // Team is not found. Search list again for insertion point.
3485   l = list;
3486   while (l->next != NULL && l->entry->t.t_id <= team->t.t_id) {
3487     l = l->next;
3488   }
3489 
3490   // Insert team.
3491   {
3492     kmp_team_list_item_t *item = (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC(
3493         sizeof(kmp_team_list_item_t));
3494     *item = *l;
3495     l->entry = team;
3496     l->next = item;
3497   }
3498 }
3499 
3500 static void __kmp_print_structure_team(char const *title,
3501                                        kmp_team_p const *team) {
3503   __kmp_printf("%s", title);
3504   if (team != NULL) {
3505     __kmp_printf("%2x %p\n", team->t.t_id, team);
3506   } else {
3507     __kmp_printf(" - (nil)\n");
3508   }
3509 }
3510 
3511 static void __kmp_print_structure_thread(char const *title,
3512                                          kmp_info_p const *thread) {
3513   __kmp_printf("%s", title);
3514   if (thread != NULL) {
3515     __kmp_printf("%2d %p\n", thread->th.th_info.ds.ds_gtid, thread);
3516   } else {
3517     __kmp_printf(" - (nil)\n");
3518   }
3519 }
3520 
3521 void __kmp_print_structure(void) {
3522 
3523   kmp_team_list_t list;
3524 
3525   // Initialize list of teams.
3526   list =
3527       (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC(sizeof(kmp_team_list_item_t));
3528   list->entry = NULL;
3529   list->next = NULL;
3530 
3531   __kmp_printf("\n------------------------------\nGlobal Thread "
3532                "Table\n------------------------------\n");
3533   {
3534     int gtid;
3535     for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3536       __kmp_printf("%2d", gtid);
3537       if (__kmp_threads != NULL) {
3538         __kmp_printf(" %p", __kmp_threads[gtid]);
3539       }
3540       if (__kmp_root != NULL) {
3541         __kmp_printf(" %p", __kmp_root[gtid]);
3542       }
3543       __kmp_printf("\n");
3544     }
3545   }
3546 
3547   // Print out __kmp_threads array.
3548   __kmp_printf("\n------------------------------\nThreads\n--------------------"
3549                "----------\n");
3550   if (__kmp_threads != NULL) {
3551     int gtid;
3552     for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3553       kmp_info_t const *thread = __kmp_threads[gtid];
3554       if (thread != NULL) {
3555         __kmp_printf("GTID %2d %p:\n", gtid, thread);
3556         __kmp_printf("    Our Root:        %p\n", thread->th.th_root);
3557         __kmp_print_structure_team("    Our Team:     ", thread->th.th_team);
3558         __kmp_print_structure_team("    Serial Team:  ",
3559                                    thread->th.th_serial_team);
3560         __kmp_printf("    Threads:      %2d\n", thread->th.th_team_nproc);
3561         __kmp_print_structure_thread("    Primary:      ",
3562                                      thread->th.th_team_master);
3563         __kmp_printf("    Serialized?:  %2d\n", thread->th.th_team_serialized);
3564         __kmp_printf("    Set NProc:    %2d\n", thread->th.th_set_nproc);
3565         __kmp_printf("    Set Proc Bind: %2d\n", thread->th.th_set_proc_bind);
3566         __kmp_print_structure_thread("    Next in pool: ",
3567                                      thread->th.th_next_pool);
3568         __kmp_printf("\n");
3569         __kmp_print_structure_team_accum(list, thread->th.th_team);
3570         __kmp_print_structure_team_accum(list, thread->th.th_serial_team);
3571       }
3572     }
3573   } else {
3574     __kmp_printf("Threads array is not allocated.\n");
3575   }
3576 
3577   // Print out __kmp_root array.
3578   __kmp_printf("\n------------------------------\nUbers\n----------------------"
3579                "--------\n");
3580   if (__kmp_root != NULL) {
3581     int gtid;
3582     for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3583       kmp_root_t const *root = __kmp_root[gtid];
3584       if (root != NULL) {
3585         __kmp_printf("GTID %2d %p:\n", gtid, root);
3586         __kmp_print_structure_team("    Root Team:    ", root->r.r_root_team);
3587         __kmp_print_structure_team("    Hot Team:     ", root->r.r_hot_team);
3588         __kmp_print_structure_thread("    Uber Thread:  ",
3589                                      root->r.r_uber_thread);
3590         __kmp_printf("    Active?:      %2d\n", root->r.r_active);
3591         __kmp_printf("    In Parallel:  %2d\n",
3592                      KMP_ATOMIC_LD_RLX(&root->r.r_in_parallel));
3593         __kmp_printf("\n");
3594         __kmp_print_structure_team_accum(list, root->r.r_root_team);
3595         __kmp_print_structure_team_accum(list, root->r.r_hot_team);
3596       }
3597     }
3598   } else {
3599     __kmp_printf("Ubers array is not allocated.\n");
3600   }
3601 
3602   __kmp_printf("\n------------------------------\nTeams\n----------------------"
3603                "--------\n");
3604   while (list->next != NULL) {
3605     kmp_team_p const *team = list->entry;
3606     int i;
3607     __kmp_printf("Team %2x %p:\n", team->t.t_id, team);
3608     __kmp_print_structure_team("    Parent Team:      ", team->t.t_parent);
3609     __kmp_printf("    Primary TID:      %2d\n", team->t.t_master_tid);
3610     __kmp_printf("    Max threads:      %2d\n", team->t.t_max_nproc);
3611     __kmp_printf("    Levels of serial: %2d\n", team->t.t_serialized);
3612     __kmp_printf("    Number threads:   %2d\n", team->t.t_nproc);
3613     for (i = 0; i < team->t.t_nproc; ++i) {
3614       __kmp_printf("    Thread %2d:      ", i);
3615       __kmp_print_structure_thread("", team->t.t_threads[i]);
3616     }
3617     __kmp_print_structure_team("    Next in pool:     ", team->t.t_next_pool);
3618     __kmp_printf("\n");
3619     list = list->next;
3620   }
3621 
3622   // Print out __kmp_thread_pool and __kmp_team_pool.
3623   __kmp_printf("\n------------------------------\nPools\n----------------------"
3624                "--------\n");
3625   __kmp_print_structure_thread("Thread pool:          ",
3626                                CCAST(kmp_info_t *, __kmp_thread_pool));
3627   __kmp_print_structure_team("Team pool:            ",
3628                              CCAST(kmp_team_t *, __kmp_team_pool));
3629   __kmp_printf("\n");
3630 
3631   // Free team list.
3632   while (list != NULL) {
3633     kmp_team_list_item_t *item = list;
3634     list = list->next;
3635     KMP_INTERNAL_FREE(item);
3636   }
3637 }
3638 
3639 #endif
3640 
3641 //---------------------------------------------------------------------------
3642 //  Stuff for per-thread fast random number generator
3643 //  Table of primes
3644 static const unsigned __kmp_primes[] = {
3645     0x9e3779b1, 0xffe6cc59, 0x2109f6dd, 0x43977ab5, 0xba5703f5, 0xb495a877,
3646     0xe1626741, 0x79695e6b, 0xbc98c09f, 0xd5bee2b3, 0x287488f9, 0x3af18231,
3647     0x9677cd4d, 0xbe3a6929, 0xadc6a877, 0xdcf0674b, 0xbe4d6fe9, 0x5f15e201,
3648     0x99afc3fd, 0xf3f16801, 0xe222cfff, 0x24ba5fdb, 0x0620452d, 0x79f149e3,
3649     0xc8b93f49, 0x972702cd, 0xb07dd827, 0x6c97d5ed, 0x085a3d61, 0x46eb5ea7,
3650     0x3d9910ed, 0x2e687b5b, 0x29609227, 0x6eb081f1, 0x0954c4e1, 0x9d114db9,
3651     0x542acfa9, 0xb3e6bd7b, 0x0742d917, 0xe9f3ffa7, 0x54581edb, 0xf2480f45,
3652     0x0bb9288f, 0xef1affc7, 0x85fa0ca7, 0x3ccc14db, 0xe6baf34b, 0x343377f7,
3653     0x5ca19031, 0xe6d9293b, 0xf0a9f391, 0x5d2e980b, 0xfc411073, 0xc3749363,
3654     0xb892d829, 0x3549366b, 0x629750ad, 0xb98294e5, 0x892d9483, 0xc235baf3,
3655     0x3d2402a3, 0x6bdef3c9, 0xbec333cd, 0x40c9520f};
3656 
3657 //---------------------------------------------------------------------------
3658 //  __kmp_get_random: Get a random number using a linear congruential method.
3659 unsigned short __kmp_get_random(kmp_info_t *thread) {
3660   unsigned x = thread->th.th_x;
3661   unsigned short r = (unsigned short)(x >> 16);
3662 
3663   thread->th.th_x = x * thread->th.th_a + 1;
3664 
3665   KA_TRACE(30, ("__kmp_get_random: THREAD: %d, RETURN: %u\n",
3666                 thread->th.th_info.ds.ds_tid, r));
3667 
3668   return r;
3669 }
3670 //--------------------------------------------------------
3671 // __kmp_init_random: Initialize a random number generator
3672 void __kmp_init_random(kmp_info_t *thread) {
3673   unsigned seed = thread->th.th_info.ds.ds_tid;
3674 
3675   thread->th.th_a =
3676       __kmp_primes[seed % (sizeof(__kmp_primes) / sizeof(__kmp_primes[0]))];
3677   thread->th.th_x = (seed + 1) * thread->th.th_a + 1;
3678   KA_TRACE(30,
3679            ("__kmp_init_random: THREAD: %u; A: %u\n", seed, thread->th.th_a));
3680 }
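
// Exposition-only sketch: the generator above is a linear congruential
// recurrence x_{n+1} = a * x_n + 1 (mod 2^32), with the per-thread multiplier
// a drawn from __kmp_primes and the high 16 bits of the state returned as the
// result (hypothetical helper, not compiled):
#if 0
static unsigned short __example_draw(unsigned *x, unsigned a) {
  unsigned short r = (unsigned short)(*x >> 16); // high bits are most random
  *x = *x * a + 1; // advance the state; unsigned wraparound gives mod 2^32
  return r;
}
#endif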
3681 
3682 #if KMP_OS_WINDOWS
3683 /* reclaim array entries for root threads that are already dead, returns number
3684  * reclaimed */
3685 static int __kmp_reclaim_dead_roots(void) {
3686   int i, r = 0;
3687 
3688   for (i = 0; i < __kmp_threads_capacity; ++i) {
3689     if (KMP_UBER_GTID(i) &&
3690         !__kmp_still_running((kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[i])) &&
3691         !__kmp_root[i]
3692              ->r.r_active) { // AC: reclaim only roots that died non-active
3693       r += __kmp_unregister_root_other_thread(i);
3694     }
3695   }
3696   return r;
3697 }
3698 #endif
3699 
3700 /* This function attempts to create free entries in __kmp_threads and
3701    __kmp_root, and returns the number of free entries generated.
3702 
3703    For Windows* OS static library, the first mechanism used is to reclaim array
3704    entries for root threads that are already dead.
3705 
3706    On all platforms, expansion is attempted on the arrays __kmp_threads and
3707    __kmp_root, with appropriate update to __kmp_threads_capacity. Array
3708    capacity is increased by doubling with clipping to __kmp_tp_capacity, if
3709    threadprivate cache array has been created. Synchronization with
3710    __kmpc_threadprivate_cached is done using __kmp_tp_cached_lock.
3711 
3712    After any dead root reclamation, if the clipping value allows array expansion
3713    to result in the generation of a total of nNeed free slots, the function does
3714    that expansion. If not, nothing is done beyond the possible initial root
3715    thread reclamation.
3716 
3717    If any argument is negative, the behavior is undefined. */
3718 static int __kmp_expand_threads(int nNeed) {
3719   int added = 0;
3720   int minimumRequiredCapacity;
3721   int newCapacity;
3722   kmp_info_t **newThreads;
3723   kmp_root_t **newRoot;
3724 
3725   // All calls to __kmp_expand_threads should be under __kmp_forkjoin_lock, so
3726   // resizing __kmp_threads does not need additional protection if foreign
3727   // threads are present
3728 
3729 #if KMP_OS_WINDOWS && !KMP_DYNAMIC_LIB
3730   /* only for Windows static library */
3731   /* reclaim array entries for root threads that are already dead */
3732   added = __kmp_reclaim_dead_roots();
3733 
3734   if (nNeed) {
3735     nNeed -= added;
3736     if (nNeed < 0)
3737       nNeed = 0;
3738   }
3739 #endif
3740   if (nNeed <= 0)
3741     return added;
3742 
3743   // Note that __kmp_threads_capacity is not bounded by __kmp_max_nth. If
3744   // __kmp_max_nth is set to some value less than __kmp_sys_max_nth by the
3745   // user via KMP_DEVICE_THREAD_LIMIT, then __kmp_threads_capacity may become
3746   // > __kmp_max_nth in one of two ways:
3747   //
3748   // 1) The initialization thread (gtid = 0) exits.  __kmp_threads[0]
3749   //    may not be reused by another thread, so we may need to increase
3750   //    __kmp_threads_capacity to __kmp_max_nth + 1.
3751   //
3752   // 2) New foreign root(s) are encountered.  We always register new foreign
3753   //    roots. This may cause a smaller # of threads to be allocated at
3754   //    subsequent parallel regions, but the worker threads hang around (and
3755   //    eventually go to sleep) and need slots in the __kmp_threads[] array.
3756   //
3757   // Anyway, that is the reason for moving the check to see if
3758   // __kmp_max_nth was exceeded into __kmp_reserve_threads()
3759   // instead of having it performed here. -BB
3760 
3761   KMP_DEBUG_ASSERT(__kmp_sys_max_nth >= __kmp_threads_capacity);
3762 
3763   /* compute expansion headroom to check if we can expand */
3764   if (__kmp_sys_max_nth - __kmp_threads_capacity < nNeed) {
3765     /* possible expansion too small -- give up */
3766     return added;
3767   }
3768   minimumRequiredCapacity = __kmp_threads_capacity + nNeed;
3769 
3770   newCapacity = __kmp_threads_capacity;
3771   do {
3772     newCapacity = newCapacity <= (__kmp_sys_max_nth >> 1) ? (newCapacity << 1)
3773                                                           : __kmp_sys_max_nth;
3774   } while (newCapacity < minimumRequiredCapacity);
3775   newThreads = (kmp_info_t **)__kmp_allocate(
3776       (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * newCapacity + CACHE_LINE);
3777   newRoot =
3778       (kmp_root_t **)((char *)newThreads + sizeof(kmp_info_t *) * newCapacity);
3779   KMP_MEMCPY(newThreads, __kmp_threads,
3780              __kmp_threads_capacity * sizeof(kmp_info_t *));
3781   KMP_MEMCPY(newRoot, __kmp_root,
3782              __kmp_threads_capacity * sizeof(kmp_root_t *));
3783   // Put the old __kmp_threads array on a list. Any ongoing references to the
3784   // old array remain valid. This list is cleaned up at library shutdown.
3785   kmp_old_threads_list_t *node =
3786       (kmp_old_threads_list_t *)__kmp_allocate(sizeof(kmp_old_threads_list_t));
3787   node->threads = __kmp_threads;
3788   node->next = __kmp_old_threads_list;
3789   __kmp_old_threads_list = node;
3790 
3791   *(kmp_info_t * *volatile *)&__kmp_threads = newThreads;
3792   *(kmp_root_t * *volatile *)&__kmp_root = newRoot;
3793   added += newCapacity - __kmp_threads_capacity;
3794   *(volatile int *)&__kmp_threads_capacity = newCapacity;
3795 
3796   if (newCapacity > __kmp_tp_capacity) {
3797     __kmp_acquire_bootstrap_lock(&__kmp_tp_cached_lock);
3798     if (__kmp_tp_cached && newCapacity > __kmp_tp_capacity) {
3799       __kmp_threadprivate_resize_cache(newCapacity);
3800     } else { // increase __kmp_tp_capacity to correspond with kmp_threads size
3801       *(volatile int *)&__kmp_tp_capacity = newCapacity;
3802     }
3803     __kmp_release_bootstrap_lock(&__kmp_tp_cached_lock);
3804   }
3805 
3806   return added;
3807 }
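
// Exposition-only sketch of the growth policy above: double the capacity
// until the request is covered, clipping at __kmp_sys_max_nth (arithmetic
// only; hypothetical helper, not compiled):
#if 0
static int __example_new_capacity(int cap, int need, int sys_max) {
  // the caller guarantees required <= sys_max (see the headroom check above)
  int required = cap + need;
  do { // double, but never past the system-wide maximum
    cap = (cap <= (sys_max >> 1)) ? (cap << 1) : sys_max;
  } while (cap < required);
  return cap;
}
#endif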
3808 
3809 /* Register the current thread as a root thread and obtain our gtid. We must
3810    have the __kmp_initz_lock held at this point. Argument TRUE only if are the
3811    thread that calls from __kmp_do_serial_initialize() */
3812 int __kmp_register_root(int initial_thread) {
3813   kmp_info_t *root_thread;
3814   kmp_root_t *root;
3815   int gtid;
3816   int capacity;
3817   __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
3818   KA_TRACE(20, ("__kmp_register_root: entered\n"));
3819   KMP_MB();
3820 
3821   /* 2007-03-02:
3822      If the initial thread has not invoked the OpenMP RTL yet and this thread
3823      is not an initial one, the "__kmp_all_nth >= __kmp_threads_capacity"
3824      condition does not work as expected -- it may return false (meaning there
3825      is at least one empty slot in the __kmp_threads array), while the only
3826      free slot may be #0, which is reserved for the initial thread and so
3827      cannot be used for this one. The following code works around this bug.
3828 
3829      However, the right solution seems to be not to reserve slot #0 for the
3830      initial thread, because:
3831      (1) there is no magic in slot #0,
3832      (2) we cannot detect the initial thread reliably (the first thread that
3833         does serial initialization may not be a real initial thread).
3834   */
3835   capacity = __kmp_threads_capacity;
3836   if (!initial_thread && TCR_PTR(__kmp_threads[0]) == NULL) {
3837     --capacity;
3838   }
3839 
3840   // If it is not for initializing the hidden helper team, we need to take
3841   // __kmp_hidden_helper_threads_num out of the capacity because it is included
3842   // in __kmp_threads_capacity.
3843   if (__kmp_enable_hidden_helper && !TCR_4(__kmp_init_hidden_helper_threads)) {
3844     capacity -= __kmp_hidden_helper_threads_num;
3845   }
3846 
3847   /* see if there are too many threads */
3848   if (__kmp_all_nth >= capacity && !__kmp_expand_threads(1)) {
3849     if (__kmp_tp_cached) {
3850       __kmp_fatal(KMP_MSG(CantRegisterNewThread),
3851                   KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity),
3852                   KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null);
3853     } else {
3854       __kmp_fatal(KMP_MSG(CantRegisterNewThread), KMP_HNT(SystemLimitOnThreads),
3855                   __kmp_msg_null);
3856     }
3857   }
3858 
3859   // When hidden helper tasks are enabled, __kmp_threads is organized as follows:
3860   // 0: initial thread, also a regular OpenMP thread.
3861   // [1, __kmp_hidden_helper_threads_num]: slots for hidden helper threads.
3862   // [__kmp_hidden_helper_threads_num + 1, __kmp_threads_capacity): slots for
3863   // regular OpenMP threads.
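  // For example (illustrative values only): with
  // __kmp_hidden_helper_threads_num == 8 and a capacity of 16, gtid 0 is the
  // initial thread, gtids 1-8 are hidden helpers, and gtids 9-15 hold regular
  // OpenMP threads.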
3864   if (TCR_4(__kmp_init_hidden_helper_threads)) {
3865     // Find an available thread slot for hidden helper thread. Slots for hidden
3866     // helper threads start from 1 to __kmp_hidden_helper_threads_num.
3867     for (gtid = 1; TCR_PTR(__kmp_threads[gtid]) != NULL &&
3868                    gtid <= __kmp_hidden_helper_threads_num;
3869          gtid++)
3870       ;
3871     KMP_ASSERT(gtid <= __kmp_hidden_helper_threads_num);
3872     KA_TRACE(1, ("__kmp_register_root: found slot in threads array for "
3873                  "hidden helper thread: T#%d\n",
3874                  gtid));
3875   } else {
3876     /* find an available thread slot */
3877     // Don't reassign the zero slot since we need that to only be used by
3878     // initial thread. Slots for hidden helper threads should also be skipped.
3879     if (initial_thread && TCR_PTR(__kmp_threads[0]) == NULL) {
3880       gtid = 0;
3881     } else {
3882       for (gtid = __kmp_hidden_helper_threads_num + 1;
3883            TCR_PTR(__kmp_threads[gtid]) != NULL; gtid++)
3884         ;
3885     }
3886     KA_TRACE(
3887         1, ("__kmp_register_root: found slot in threads array: T#%d\n", gtid));
3888     KMP_ASSERT(gtid < __kmp_threads_capacity);
3889   }
3890 
3891   /* update global accounting */
3892   __kmp_all_nth++;
3893   TCW_4(__kmp_nth, __kmp_nth + 1);
3894 
3895   // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search) for low
3896   // numbers of procs, and method #2 (keyed API call) for higher numbers.
3897   if (__kmp_adjust_gtid_mode) {
3898     if (__kmp_all_nth >= __kmp_tls_gtid_min) {
3899       if (TCR_4(__kmp_gtid_mode) != 2) {
3900         TCW_4(__kmp_gtid_mode, 2);
3901       }
3902     } else {
3903       if (TCR_4(__kmp_gtid_mode) != 1) {
3904         TCW_4(__kmp_gtid_mode, 1);
3905       }
3906     }
3907   }
3908 
3909 #ifdef KMP_ADJUST_BLOCKTIME
3910   /* Adjust blocktime to zero if necessary            */
3911   /* Middle initialization might not have occurred yet */
3912   if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
3913     if (__kmp_nth > __kmp_avail_proc) {
3914       __kmp_zero_bt = TRUE;
3915     }
3916   }
3917 #endif /* KMP_ADJUST_BLOCKTIME */
3918 
3919   /* setup this new hierarchy */
3920   if (!(root = __kmp_root[gtid])) {
3921     root = __kmp_root[gtid] = (kmp_root_t *)__kmp_allocate(sizeof(kmp_root_t));
3922     KMP_DEBUG_ASSERT(!root->r.r_root_team);
3923   }
3924 
3925 #if KMP_STATS_ENABLED
3926   // Initialize stats as soon as possible (right after gtid assignment).
3927   __kmp_stats_thread_ptr = __kmp_stats_list->push_back(gtid);
3928   __kmp_stats_thread_ptr->startLife();
3929   KMP_SET_THREAD_STATE(SERIAL_REGION);
3930   KMP_INIT_PARTITIONED_TIMERS(OMP_serial);
3931 #endif
3932   __kmp_initialize_root(root);
3933 
3934   /* setup new root thread structure */
3935   if (root->r.r_uber_thread) {
3936     root_thread = root->r.r_uber_thread;
3937   } else {
3938     root_thread = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t));
3939     if (__kmp_storage_map) {
3940       __kmp_print_thread_storage_map(root_thread, gtid);
3941     }
3942     root_thread->th.th_info.ds.ds_gtid = gtid;
3943 #if OMPT_SUPPORT
3944     root_thread->th.ompt_thread_info.thread_data = ompt_data_none;
3945 #endif
3946     root_thread->th.th_root = root;
3947     if (__kmp_env_consistency_check) {
3948       root_thread->th.th_cons = __kmp_allocate_cons_stack(gtid);
3949     }
3950 #if USE_FAST_MEMORY
3951     __kmp_initialize_fast_memory(root_thread);
3952 #endif /* USE_FAST_MEMORY */
3953 
3954 #if KMP_USE_BGET
3955     KMP_DEBUG_ASSERT(root_thread->th.th_local.bget_data == NULL);
3956     __kmp_initialize_bget(root_thread);
3957 #endif
3958     __kmp_init_random(root_thread); // Initialize random number generator
3959   }
3960 
3961   /* setup the serial team held in reserve by the root thread */
3962   if (!root_thread->th.th_serial_team) {
3963     kmp_internal_control_t r_icvs = __kmp_get_global_icvs();
3964     KF_TRACE(10, ("__kmp_register_root: before serial_team\n"));
3965     root_thread->th.th_serial_team = __kmp_allocate_team(
3966         root, 1, 1,
3967 #if OMPT_SUPPORT
3968         ompt_data_none, // root parallel id
3969 #endif
3970         proc_bind_default, &r_icvs, 0 USE_NESTED_HOT_ARG(NULL));
3971   }
3972   KMP_ASSERT(root_thread->th.th_serial_team);
3973   KF_TRACE(10, ("__kmp_register_root: after serial_team = %p\n",
3974                 root_thread->th.th_serial_team));
3975 
3976   /* drop root_thread into place */
3977   TCW_SYNC_PTR(__kmp_threads[gtid], root_thread);
3978 
3979   root->r.r_root_team->t.t_threads[0] = root_thread;
3980   root->r.r_hot_team->t.t_threads[0] = root_thread;
3981   root_thread->th.th_serial_team->t.t_threads[0] = root_thread;
3982   // AC: the team is created in reserve, not for execution (it is unused now).
3983   root_thread->th.th_serial_team->t.t_serialized = 0;
3984   root->r.r_uber_thread = root_thread;
3985 
3986   /* initialize the thread, get it ready to go */
3987   __kmp_initialize_info(root_thread, root->r.r_root_team, 0, gtid);
3988   TCW_4(__kmp_init_gtid, TRUE);
3989 
3990   /* prepare the primary thread for get_gtid() */
3991   __kmp_gtid_set_specific(gtid);
3992 
3993 #if USE_ITT_BUILD
3994   __kmp_itt_thread_name(gtid);
3995 #endif /* USE_ITT_BUILD */
3996 
3997 #ifdef KMP_TDATA_GTID
3998   __kmp_gtid = gtid;
3999 #endif
4000   __kmp_create_worker(gtid, root_thread, __kmp_stksize);
4001   KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() == gtid);
4002 
4003   KA_TRACE(20, ("__kmp_register_root: T#%d init T#%d(%d:%d) arrived: join=%u, "
4004                 "plain=%u\n",
4005                 gtid, __kmp_gtid_from_tid(0, root->r.r_hot_team),
4006                 root->r.r_hot_team->t.t_id, 0, KMP_INIT_BARRIER_STATE,
4007                 KMP_INIT_BARRIER_STATE));
4008   { // Initialize barrier data.
4009     int b;
4010     for (b = 0; b < bs_last_barrier; ++b) {
4011       root_thread->th.th_bar[b].bb.b_arrived = KMP_INIT_BARRIER_STATE;
4012 #if USE_DEBUGGER
4013       root_thread->th.th_bar[b].bb.b_worker_arrived = 0;
4014 #endif
4015     }
4016   }
4017   KMP_DEBUG_ASSERT(root->r.r_hot_team->t.t_bar[bs_forkjoin_barrier].b_arrived ==
4018                    KMP_INIT_BARRIER_STATE);
4019 
4020 #if KMP_AFFINITY_SUPPORTED
4021   root_thread->th.th_current_place = KMP_PLACE_UNDEFINED;
4022   root_thread->th.th_new_place = KMP_PLACE_UNDEFINED;
4023   root_thread->th.th_first_place = KMP_PLACE_UNDEFINED;
4024   root_thread->th.th_last_place = KMP_PLACE_UNDEFINED;
4025 #endif /* KMP_AFFINITY_SUPPORTED */
4026   root_thread->th.th_def_allocator = __kmp_def_allocator;
4027   root_thread->th.th_prev_level = 0;
4028   root_thread->th.th_prev_num_threads = 1;
4029 
4030   kmp_cg_root_t *tmp = (kmp_cg_root_t *)__kmp_allocate(sizeof(kmp_cg_root_t));
4031   tmp->cg_root = root_thread;
4032   tmp->cg_thread_limit = __kmp_cg_max_nth;
4033   tmp->cg_nthreads = 1;
4034   KA_TRACE(100, ("__kmp_register_root: Thread %p created node %p with"
4035                  " cg_nthreads init to 1\n",
4036                  root_thread, tmp));
4037   tmp->up = NULL;
4038   root_thread->th.th_cg_roots = tmp;
4039 
4040   __kmp_root_counter++;
4041 
4042 #if OMPT_SUPPORT
4043   if (ompt_enabled.enabled) {
4044     kmp_info_t *root_thread = ompt_get_thread();
4046 
4047     ompt_set_thread_state(root_thread, ompt_state_overhead);
4048 
4049     if (ompt_enabled.ompt_callback_thread_begin) {
4050       ompt_callbacks.ompt_callback(ompt_callback_thread_begin)(
4051           ompt_thread_initial, __ompt_get_thread_data_internal());
4052     }
4053     ompt_data_t *task_data;
4054     ompt_data_t *parallel_data;
4055     __ompt_get_task_info_internal(0, NULL, &task_data, NULL, &parallel_data,
4056                                   NULL);
4057     if (ompt_enabled.ompt_callback_implicit_task) {
4058       ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
4059           ompt_scope_begin, parallel_data, task_data, 1, 1, ompt_task_initial);
4060     }
4061 
4062     ompt_set_thread_state(root_thread, ompt_state_work_serial);
4063   }
4064 #endif
4065 #if OMPD_SUPPORT
4066   if (ompd_state & OMPD_ENABLE_BP)
4067     ompd_bp_thread_begin();
4068 #endif
4069 
4070   KMP_MB();
4071   __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
4072 
4073   return gtid;
4074 }
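// Editor's note: the OMPT block above fires ompt_callback_thread_begin and an
// initial implicit-task begin when a root registers. Below is a minimal,
// compiled-out sketch of a first-party tool that would observe those events,
// using only entry points defined by the OpenMP 5.x tools interface
// (omp-tools.h, ompt_start_tool, ompt_set_callback); it is illustrative only
// and not part of this runtime.
#if 0
#include <omp-tools.h>
#include <stdio.h>

static void on_thread_begin(ompt_thread_t thread_type,
                            ompt_data_t *thread_data) {
  // ompt_thread_initial identifies a root (initial) thread
  printf("thread_begin: type=%d\n", (int)thread_type);
}

static int tool_init(ompt_function_lookup_t lookup, int initial_device_num,
                     ompt_data_t *tool_data) {
  ompt_set_callback_t set_cb =
      (ompt_set_callback_t)lookup("ompt_set_callback");
  set_cb(ompt_callback_thread_begin, (ompt_callback_t)on_thread_begin);
  return 1; // nonzero: keep the tool active
}

static void tool_fini(ompt_data_t *tool_data) {}

ompt_start_tool_result_t *ompt_start_tool(unsigned int omp_version,
                                          const char *runtime_version) {
  static ompt_start_tool_result_t result = {tool_init, tool_fini, {0}};
  return &result;
}
#endif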
4075 
4076 #if KMP_NESTED_HOT_TEAMS
4077 static int __kmp_free_hot_teams(kmp_root_t *root, kmp_info_t *thr, int level,
4078                                 const int max_level) {
4079   int i, n, nth;
4080   kmp_hot_team_ptr_t *hot_teams = thr->th.th_hot_teams;
4081   if (!hot_teams || !hot_teams[level].hot_team) {
4082     return 0;
4083   }
4084   KMP_DEBUG_ASSERT(level < max_level);
4085   kmp_team_t *team = hot_teams[level].hot_team;
4086   nth = hot_teams[level].hot_team_nth;
4087   n = nth - 1; // primary thread is not freed
4088   if (level < max_level - 1) {
4089     for (i = 0; i < nth; ++i) {
4090       kmp_info_t *th = team->t.t_threads[i];
4091       n += __kmp_free_hot_teams(root, th, level + 1, max_level);
4092       if (i > 0 && th->th.th_hot_teams) {
4093         __kmp_free(th->th.th_hot_teams);
4094         th->th.th_hot_teams = NULL;
4095       }
4096     }
4097   }
4098   __kmp_free_team(root, team, NULL);
4099   return n;
4100 }
4101 #endif
4102 
4103 // Resets a root thread and clears its root and hot teams.
4104 // Returns the number of __kmp_threads entries directly and indirectly freed.
4105 static int __kmp_reset_root(int gtid, kmp_root_t *root) {
4106   kmp_team_t *root_team = root->r.r_root_team;
4107   kmp_team_t *hot_team = root->r.r_hot_team;
4108   int n = hot_team->t.t_nproc;
4109   int i;
4110 
4111   KMP_DEBUG_ASSERT(!root->r.r_active);
4112 
4113   root->r.r_root_team = NULL;
4114   root->r.r_hot_team = NULL;
4115   // __kmp_free_team() does not free hot teams, so we have to clear r_hot_team
4116   // before call to __kmp_free_team().
4117   __kmp_free_team(root, root_team USE_NESTED_HOT_ARG(NULL));
4118 #if KMP_NESTED_HOT_TEAMS
4119   if (__kmp_hot_teams_max_level >
4120       0) { // need to free nested hot teams and their threads if any
4121     for (i = 0; i < hot_team->t.t_nproc; ++i) {
4122       kmp_info_t *th = hot_team->t.t_threads[i];
4123       if (__kmp_hot_teams_max_level > 1) {
4124         n += __kmp_free_hot_teams(root, th, 1, __kmp_hot_teams_max_level);
4125       }
4126       if (th->th.th_hot_teams) {
4127         __kmp_free(th->th.th_hot_teams);
4128         th->th.th_hot_teams = NULL;
4129       }
4130     }
4131   }
4132 #endif
4133   __kmp_free_team(root, hot_team USE_NESTED_HOT_ARG(NULL));
4134 
4135   // Before we can reap the thread, we need to make certain that all other
4136   // threads in the teams that had this root as ancestor have stopped trying to
4137   // steal tasks.
4138   if (__kmp_tasking_mode != tskm_immediate_exec) {
4139     __kmp_wait_to_unref_task_teams();
4140   }
4141 
4142 #if KMP_OS_WINDOWS
4143   /* Close Handle of root duplicated in __kmp_create_worker (tr #62919) */
4144   KA_TRACE(
4145       10, ("__kmp_reset_root: free handle, th = %p, handle = %" KMP_UINTPTR_SPEC
4146            "\n",
4147            (LPVOID) & (root->r.r_uber_thread->th),
4148            root->r.r_uber_thread->th.th_info.ds.ds_thread));
4149   __kmp_free_handle(root->r.r_uber_thread->th.th_info.ds.ds_thread);
4150 #endif /* KMP_OS_WINDOWS */
4151 
4152 #if OMPD_SUPPORT
4153   if (ompd_state & OMPD_ENABLE_BP)
4154     ompd_bp_thread_end();
4155 #endif
4156 
4157 #if OMPT_SUPPORT
4158   ompt_data_t *task_data;
4159   ompt_data_t *parallel_data;
4160   __ompt_get_task_info_internal(0, NULL, &task_data, NULL, &parallel_data,
4161                                 NULL);
4162   if (ompt_enabled.ompt_callback_implicit_task) {
4163     ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
4164         ompt_scope_end, parallel_data, task_data, 0, 1, ompt_task_initial);
4165   }
4166   if (ompt_enabled.ompt_callback_thread_end) {
4167     ompt_callbacks.ompt_callback(ompt_callback_thread_end)(
4168         &(root->r.r_uber_thread->th.ompt_thread_info.thread_data));
4169   }
4170 #endif
4171 
4172   TCW_4(__kmp_nth,
4173         __kmp_nth - 1); // __kmp_reap_thread will decrement __kmp_all_nth.
4174   i = root->r.r_uber_thread->th.th_cg_roots->cg_nthreads--;
4175   KA_TRACE(100, ("__kmp_reset_root: Thread %p decrement cg_nthreads on node %p"
4176                  " to %d\n",
4177                  root->r.r_uber_thread, root->r.r_uber_thread->th.th_cg_roots,
4178                  root->r.r_uber_thread->th.th_cg_roots->cg_nthreads));
4179   if (i == 1) {
4180     // need to free contention group structure
4181     KMP_DEBUG_ASSERT(root->r.r_uber_thread ==
4182                      root->r.r_uber_thread->th.th_cg_roots->cg_root);
4183     KMP_DEBUG_ASSERT(root->r.r_uber_thread->th.th_cg_roots->up == NULL);
4184     __kmp_free(root->r.r_uber_thread->th.th_cg_roots);
4185     root->r.r_uber_thread->th.th_cg_roots = NULL;
4186   }
4187   __kmp_reap_thread(root->r.r_uber_thread, 1);
4188 
4189   // We cannot put the root thread into __kmp_thread_pool, so we have to
4190   // reap it instead of freeing it.
4191   root->r.r_uber_thread = NULL;
4192   /* mark root as no longer in use */
4193   root->r.r_begin = FALSE;
4194 
4195   return n;
4196 }
4197 
4198 void __kmp_unregister_root_current_thread(int gtid) {
4199   KA_TRACE(1, ("__kmp_unregister_root_current_thread: enter T#%d\n", gtid));
4200   /* This lock should be OK, since unregister_root_current_thread is never
4201      called during an abort, only during a normal close. Furthermore, if you
4202      hold the forkjoin lock, you should never try to take the initz lock. */
4203   __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
4204   if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
4205     KC_TRACE(10, ("__kmp_unregister_root_current_thread: already finished, "
4206                   "exiting T#%d\n",
4207                   gtid));
4208     __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
4209     return;
4210   }
4211   kmp_root_t *root = __kmp_root[gtid];
4212 
4213   KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
4214   KMP_ASSERT(KMP_UBER_GTID(gtid));
4215   KMP_ASSERT(root == __kmp_threads[gtid]->th.th_root);
4216   KMP_ASSERT(root->r.r_active == FALSE);
4217 
4218   KMP_MB();
4219 
4220   kmp_info_t *thread = __kmp_threads[gtid];
4221   kmp_team_t *team = thread->th.th_team;
4222   kmp_task_team_t *task_team = thread->th.th_task_team;
4223 
4224   // we need to wait for the proxy tasks before finishing the thread
4225   if (task_team != NULL && (task_team->tt.tt_found_proxy_tasks ||
4226                             task_team->tt.tt_hidden_helper_task_encountered)) {
4227 #if OMPT_SUPPORT
4228     // the runtime is shutting down so we won't report any events
4229     thread->th.ompt_thread_info.state = ompt_state_undefined;
4230 #endif
4231     __kmp_task_team_wait(thread, team USE_ITT_BUILD_ARG(NULL));
4232   }
4233 
4234   __kmp_reset_root(gtid, root);
4235 
4236   KMP_MB();
4237   KC_TRACE(10,
4238            ("__kmp_unregister_root_current_thread: T#%d unregistered\n", gtid));
4239 
4240   __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
4241 }
4242 
4243 #if KMP_OS_WINDOWS
4244 /* __kmp_forkjoin_lock must already be held.
4245    Unregisters a root thread that is not the current thread.  Returns the number
4246    of __kmp_threads entries freed as a result. */
4247 static int __kmp_unregister_root_other_thread(int gtid) {
4248   kmp_root_t *root = __kmp_root[gtid];
4249   int r;
4250 
4251   KA_TRACE(1, ("__kmp_unregister_root_other_thread: enter T#%d\n", gtid));
4252   KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
4253   KMP_ASSERT(KMP_UBER_GTID(gtid));
4254   KMP_ASSERT(root == __kmp_threads[gtid]->th.th_root);
4255   KMP_ASSERT(root->r.r_active == FALSE);
4256 
4257   r = __kmp_reset_root(gtid, root);
4258   KC_TRACE(10,
4259            ("__kmp_unregister_root_other_thread: T#%d unregistered\n", gtid));
4260   return r;
4261 }
4262 #endif
4263 
4264 #if KMP_DEBUG
4265 void __kmp_task_info() {
4266 
4267   kmp_int32 gtid = __kmp_entry_gtid();
4268   kmp_int32 tid = __kmp_tid_from_gtid(gtid);
4269   kmp_info_t *this_thr = __kmp_threads[gtid];
4270   kmp_team_t *steam = this_thr->th.th_serial_team;
4271   kmp_team_t *team = this_thr->th.th_team;
4272 
4273   __kmp_printf(
4274       "__kmp_task_info: gtid=%d tid=%d t_thread=%p team=%p steam=%p curtask=%p "
4275       "ptask=%p\n",
4276       gtid, tid, this_thr, team, steam, this_thr->th.th_current_task,
4277       team->t.t_implicit_task_taskdata[tid].td_parent);
4278 }
4279 #endif // KMP_DEBUG
4280 
4281 /* TODO optimize with one big memclr, take out what isn't needed, split
4282    responsibility to workers as much as possible, and delay initialization of
4283    features as much as possible  */
4284 static void __kmp_initialize_info(kmp_info_t *this_thr, kmp_team_t *team,
4285                                   int tid, int gtid) {
4286   /* this_thr->th.th_info.ds.ds_gtid is setup in
4287      kmp_allocate_thread/create_worker.
4288      this_thr->th.th_serial_team is setup in __kmp_allocate_thread */
4289   KMP_DEBUG_ASSERT(this_thr != NULL);
4290   KMP_DEBUG_ASSERT(this_thr->th.th_serial_team);
4291   KMP_DEBUG_ASSERT(team);
4292   KMP_DEBUG_ASSERT(team->t.t_threads);
4293   KMP_DEBUG_ASSERT(team->t.t_dispatch);
4294   kmp_info_t *master = team->t.t_threads[0];
4295   KMP_DEBUG_ASSERT(master);
4296   KMP_DEBUG_ASSERT(master->th.th_root);
4297 
4298   KMP_MB();
4299 
4300   TCW_SYNC_PTR(this_thr->th.th_team, team);
4301 
4302   this_thr->th.th_info.ds.ds_tid = tid;
4303   this_thr->th.th_set_nproc = 0;
4304   if (__kmp_tasking_mode != tskm_immediate_exec)
4305     // When tasking is possible, threads are not safe to reap until they are
4306     // done tasking; this will be set when tasking code is exited in wait
4307     this_thr->th.th_reap_state = KMP_NOT_SAFE_TO_REAP;
4308   else // no tasking --> always safe to reap
4309     this_thr->th.th_reap_state = KMP_SAFE_TO_REAP;
4310   this_thr->th.th_set_proc_bind = proc_bind_default;
4311 
4312 #if KMP_AFFINITY_SUPPORTED
4313   this_thr->th.th_new_place = this_thr->th.th_current_place;
4314 #endif
4315   this_thr->th.th_root = master->th.th_root;
4316 
4317   /* setup the thread's cache of the team structure */
4318   this_thr->th.th_team_nproc = team->t.t_nproc;
4319   this_thr->th.th_team_master = master;
4320   this_thr->th.th_team_serialized = team->t.t_serialized;
4321 
4322   KMP_DEBUG_ASSERT(team->t.t_implicit_task_taskdata);
4323 
4324   KF_TRACE(10, ("__kmp_initialize_info1: T#%d:%d this_thread=%p curtask=%p\n",
4325                 tid, gtid, this_thr, this_thr->th.th_current_task));
4326 
4327   __kmp_init_implicit_task(this_thr->th.th_team_master->th.th_ident, this_thr,
4328                            team, tid, TRUE);
4329 
4330   KF_TRACE(10, ("__kmp_initialize_info2: T#%d:%d this_thread=%p curtask=%p\n",
4331                 tid, gtid, this_thr, this_thr->th.th_current_task));
4332   // TODO: Initialize ICVs from parent; GEH - isn't that already done in
4333   // __kmp_initialize_team()?
4334 
4335   /* TODO no worksharing in speculative threads */
4336   this_thr->th.th_dispatch = &team->t.t_dispatch[tid];
4337 
4338   this_thr->th.th_local.this_construct = 0;
4339 
4340   if (!this_thr->th.th_pri_common) {
4341     this_thr->th.th_pri_common =
4342         (struct common_table *)__kmp_allocate(sizeof(struct common_table));
4343     if (__kmp_storage_map) {
4344       __kmp_print_storage_map_gtid(
4345           gtid, this_thr->th.th_pri_common, this_thr->th.th_pri_common + 1,
4346           sizeof(struct common_table), "th_%d.th_pri_common\n", gtid);
4347     }
4348     this_thr->th.th_pri_head = NULL;
4349   }
4350 
4351   if (this_thr != master && // Primary thread's CG root is initialized elsewhere
4352       this_thr->th.th_cg_roots != master->th.th_cg_roots) { // CG root not set
4353     // Make new thread's CG root same as primary thread's
4354     KMP_DEBUG_ASSERT(master->th.th_cg_roots);
4355     kmp_cg_root_t *tmp = this_thr->th.th_cg_roots;
4356     if (tmp) {
4357       // worker changes CG, need to check if old CG should be freed
4358       int i = tmp->cg_nthreads--;
4359       KA_TRACE(100, ("__kmp_initialize_info: Thread %p decrement cg_nthreads"
4360                      " on node %p of thread %p to %d\n",
4361                      this_thr, tmp, tmp->cg_root, tmp->cg_nthreads));
4362       if (i == 1) {
4363         __kmp_free(tmp); // last thread left CG --> free it
4364       }
4365     }
4366     this_thr->th.th_cg_roots = master->th.th_cg_roots;
4367     // Increment new thread's CG root's counter to add the new thread
4368     this_thr->th.th_cg_roots->cg_nthreads++;
4369     KA_TRACE(100, ("__kmp_initialize_info: Thread %p increment cg_nthreads on"
4370                    " node %p of thread %p to %d\n",
4371                    this_thr, this_thr->th.th_cg_roots,
4372                    this_thr->th.th_cg_roots->cg_root,
4373                    this_thr->th.th_cg_roots->cg_nthreads));
4374     this_thr->th.th_current_task->td_icvs.thread_limit =
4375         this_thr->th.th_cg_roots->cg_thread_limit;
4376   }
4377 
4378   /* Initialize dynamic dispatch */
4379   {
4380     volatile kmp_disp_t *dispatch = this_thr->th.th_dispatch;
4381     // Use team max_nproc since this will never change for the team.
4382     size_t disp_size =
4383         sizeof(dispatch_private_info_t) *
4384         (team->t.t_max_nproc == 1 ? 1 : __kmp_dispatch_num_buffers);
4385     KD_TRACE(10, ("__kmp_initialize_info: T#%d max_nproc: %d\n", gtid,
4386                   team->t.t_max_nproc));
4387     KMP_ASSERT(dispatch);
4388     KMP_DEBUG_ASSERT(team->t.t_dispatch);
4389     KMP_DEBUG_ASSERT(dispatch == &team->t.t_dispatch[tid]);
4390 
4391     dispatch->th_disp_index = 0;
4392     dispatch->th_doacross_buf_idx = 0;
4393     if (!dispatch->th_disp_buffer) {
4394       dispatch->th_disp_buffer =
4395           (dispatch_private_info_t *)__kmp_allocate(disp_size);
4396 
4397       if (__kmp_storage_map) {
4398         __kmp_print_storage_map_gtid(
4399             gtid, &dispatch->th_disp_buffer[0],
4400             &dispatch->th_disp_buffer[team->t.t_max_nproc == 1
4401                                           ? 1
4402                                           : __kmp_dispatch_num_buffers],
4403             disp_size,
4404             "th_%d.th_dispatch.th_disp_buffer "
4405             "(team_%d.t_dispatch[%d].th_disp_buffer)",
4406             gtid, team->t.t_id, gtid);
4407       }
4408     } else {
4409       memset(&dispatch->th_disp_buffer[0], '\0', disp_size);
4410     }
4411 
4412     dispatch->th_dispatch_pr_current = 0;
4413     dispatch->th_dispatch_sh_current = 0;
4414 
4415     dispatch->th_deo_fcn = 0; /* ORDERED     */
4416     dispatch->th_dxo_fcn = 0; /* END ORDERED */
4417   }
4418 
4419   this_thr->th.th_next_pool = NULL;
4420 
4421   KMP_DEBUG_ASSERT(!this_thr->th.th_spin_here);
4422   KMP_DEBUG_ASSERT(this_thr->th.th_next_waiting == 0);
4423 
4424   KMP_MB();
4425 }
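// Editor's note: the th_cg_roots bookkeeping in __kmp_initialize_info above
// follows an intrusive-refcount discipline: a thread joining a contention
// group increments cg_nthreads, and the last thread to leave frees the node.
// A compiled-out sketch of that pattern, with hypothetical names:
#if 0
#include <stdlib.h>

struct cg_node {
  int nthreads; // number of threads currently referencing this group
};

static void cg_join(struct cg_node *cg) { cg->nthreads++; }

static void cg_leave(struct cg_node *cg) {
  int old = cg->nthreads--; // the runtime captures the pre-decrement value
  if (old == 1)
    free(cg); // last thread out reclaims the node
}
#endif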
4426 
4427 /* Allocate a new thread for the requesting team. This is only called from
4428    within a forkjoin critical section. We first try to take an available
4429    thread from the thread pool; if none is available, we fork a new one,
4430    assuming we can create one. The caller is expected to have checked this
4431    first. (See the compiled-out sketch after this function.) */
4432 kmp_info_t *__kmp_allocate_thread(kmp_root_t *root, kmp_team_t *team,
4433                                   int new_tid) {
4434   kmp_team_t *serial_team;
4435   kmp_info_t *new_thr;
4436   int new_gtid;
4437 
4438   KA_TRACE(20, ("__kmp_allocate_thread: T#%d\n", __kmp_get_gtid()));
4439   KMP_DEBUG_ASSERT(root && team);
4440 #if !KMP_NESTED_HOT_TEAMS
4441   KMP_DEBUG_ASSERT(KMP_MASTER_GTID(__kmp_get_gtid()));
4442 #endif
4443   KMP_MB();
4444 
4445   /* First, try to get one from the thread pool unless the allocating thread
4446    * is the main hidden helper thread. The hidden helper team should always
4447    * allocate new OS threads. */
4448   if (__kmp_thread_pool && !KMP_HIDDEN_HELPER_TEAM(team)) {
4449     new_thr = CCAST(kmp_info_t *, __kmp_thread_pool);
4450     __kmp_thread_pool = (volatile kmp_info_t *)new_thr->th.th_next_pool;
4451     if (new_thr == __kmp_thread_pool_insert_pt) {
4452       __kmp_thread_pool_insert_pt = NULL;
4453     }
4454     TCW_4(new_thr->th.th_in_pool, FALSE);
4455     __kmp_suspend_initialize_thread(new_thr);
4456     __kmp_lock_suspend_mx(new_thr);
4457     if (new_thr->th.th_active_in_pool == TRUE) {
4458       KMP_DEBUG_ASSERT(new_thr->th.th_active == TRUE);
4459       KMP_ATOMIC_DEC(&__kmp_thread_pool_active_nth);
4460       new_thr->th.th_active_in_pool = FALSE;
4461     }
4462     __kmp_unlock_suspend_mx(new_thr);
4463 
4464     KA_TRACE(20, ("__kmp_allocate_thread: T#%d using thread T#%d\n",
4465                   __kmp_get_gtid(), new_thr->th.th_info.ds.ds_gtid));
4466     KMP_ASSERT(!new_thr->th.th_team);
4467     KMP_DEBUG_ASSERT(__kmp_nth < __kmp_threads_capacity);
4468 
4469     /* setup the thread structure */
4470     __kmp_initialize_info(new_thr, team, new_tid,
4471                           new_thr->th.th_info.ds.ds_gtid);
4472     KMP_DEBUG_ASSERT(new_thr->th.th_serial_team);
4473 
4474     TCW_4(__kmp_nth, __kmp_nth + 1);
4475 
4476     new_thr->th.th_task_state = 0;
4477 
4478     if (__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
4479       // Make sure pool thread has transitioned to waiting on own thread struct
4480       KMP_DEBUG_ASSERT(new_thr->th.th_used_in_team.load() == 0);
4481       // Thread activated in __kmp_allocate_team when increasing team size
4482     }
4483 
4484 #ifdef KMP_ADJUST_BLOCKTIME
4485     /* Adjust blocktime back to zero if necessary */
4486     /* Middle initialization might not have occurred yet */
4487     if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
4488       if (__kmp_nth > __kmp_avail_proc) {
4489         __kmp_zero_bt = TRUE;
4490       }
4491     }
4492 #endif /* KMP_ADJUST_BLOCKTIME */
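    // (Explanatory note: __kmp_zero_bt forces blocktime to zero once there
    // are more threads than available procs, so idle threads sleep promptly
    // instead of spin-waiting on oversubscribed cores.)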
4493 
4494 #if KMP_DEBUG
4495     // If the thread entered the pool via __kmp_free_thread, wait_flag
4496     // should not be KMP_BARRIER_PARENT_FLAG.
4497     int b;
4498     kmp_balign_t *balign = new_thr->th.th_bar;
4499     for (b = 0; b < bs_last_barrier; ++b)
4500       KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
4501 #endif
4502 
4503     KF_TRACE(10, ("__kmp_allocate_thread: T#%d using thread %p T#%d\n",
4504                   __kmp_get_gtid(), new_thr, new_thr->th.th_info.ds.ds_gtid));
4505 
4506     KMP_MB();
4507     return new_thr;
4508   }
4509 
4510   /* no, we'll fork a new one */
4511   KMP_ASSERT(KMP_HIDDEN_HELPER_TEAM(team) || __kmp_nth == __kmp_all_nth);
4512   KMP_ASSERT(__kmp_all_nth < __kmp_threads_capacity);
4513 
4514 #if KMP_USE_MONITOR
4515   // If this is the first worker thread the RTL is creating, then also
4516   // launch the monitor thread.  We try to do this as early as possible.
4517   if (!TCR_4(__kmp_init_monitor)) {
4518     __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
4519     if (!TCR_4(__kmp_init_monitor)) {
4520       KF_TRACE(10, ("before __kmp_create_monitor\n"));
4521       TCW_4(__kmp_init_monitor, 1);
4522       __kmp_create_monitor(&__kmp_monitor);
4523       KF_TRACE(10, ("after __kmp_create_monitor\n"));
4524 #if KMP_OS_WINDOWS
4525       // AC: wait until monitor has started. This is a fix for CQ232808.
4526       // The reason is that if the library is loaded/unloaded in a loop with
4527       // small (parallel) work in between, then there is a high probability
4528       // that the monitor thread starts after the library has shut down. At
4529       // shutdown it is too late to cope with the problem, because when the
4530       // primary thread is in DllMain (process detach) the monitor has no
4531       // chance to start (it is blocked), and the primary thread has no means
4532       // to inform the monitor that the library is gone, because all the
4533       // memory the monitor can access is about to be released/reset.
4534       while (TCR_4(__kmp_init_monitor) < 2) {
4535         KMP_YIELD(TRUE);
4536       }
4537       KF_TRACE(10, ("after monitor thread has started\n"));
4538 #endif
4539     }
4540     __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
4541   }
4542 #endif
4543 
4544   KMP_MB();
4545 
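  // Explanatory note: gtid 0 belongs to the initial root; when hidden helper
  // threads are enabled they occupy gtids 1..__kmp_hidden_helper_threads_num,
  // so ordinary workers are numbered after that range.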
4546   {
4547     int new_start_gtid = TCR_4(__kmp_init_hidden_helper_threads)
4548                              ? 1
4549                              : __kmp_hidden_helper_threads_num + 1;
4550 
4551     for (new_gtid = new_start_gtid; TCR_PTR(__kmp_threads[new_gtid]) != NULL;
4552          ++new_gtid) {
4553       KMP_DEBUG_ASSERT(new_gtid < __kmp_threads_capacity);
4554     }
4555 
4556     if (TCR_4(__kmp_init_hidden_helper_threads)) {
4557       KMP_DEBUG_ASSERT(new_gtid <= __kmp_hidden_helper_threads_num);
4558     }
4559   }
4560 
4561   /* allocate space for it. */
4562   new_thr = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t));
4563 
4564   new_thr->th.th_nt_strict = false;
4565   new_thr->th.th_nt_loc = NULL;
4566   new_thr->th.th_nt_sev = severity_fatal;
4567   new_thr->th.th_nt_msg = NULL;
4568 
4569   TCW_SYNC_PTR(__kmp_threads[new_gtid], new_thr);
4570 
4571 #if USE_ITT_BUILD && USE_ITT_NOTIFY && KMP_DEBUG
4572   // suppress race conditions detection on synchronization flags in debug mode
4573   // this helps to analyze library internals eliminating false positives
4574   __itt_suppress_mark_range(
4575       __itt_suppress_range, __itt_suppress_threading_errors,
4576       &new_thr->th.th_sleep_loc, sizeof(new_thr->th.th_sleep_loc));
4577   __itt_suppress_mark_range(
4578       __itt_suppress_range, __itt_suppress_threading_errors,
4579       &new_thr->th.th_reap_state, sizeof(new_thr->th.th_reap_state));
4580 #if KMP_OS_WINDOWS
4581   __itt_suppress_mark_range(
4582       __itt_suppress_range, __itt_suppress_threading_errors,
4583       &new_thr->th.th_suspend_init, sizeof(new_thr->th.th_suspend_init));
4584 #else
4585   __itt_suppress_mark_range(__itt_suppress_range,
4586                             __itt_suppress_threading_errors,
4587                             &new_thr->th.th_suspend_init_count,
4588                             sizeof(new_thr->th.th_suspend_init_count));
4589 #endif
4590   // TODO: check if we need to also suppress b_arrived flags
4591   __itt_suppress_mark_range(__itt_suppress_range,
4592                             __itt_suppress_threading_errors,
4593                             CCAST(kmp_uint64 *, &new_thr->th.th_bar[0].bb.b_go),
4594                             sizeof(new_thr->th.th_bar[0].bb.b_go));
4595   __itt_suppress_mark_range(__itt_suppress_range,
4596                             __itt_suppress_threading_errors,
4597                             CCAST(kmp_uint64 *, &new_thr->th.th_bar[1].bb.b_go),
4598                             sizeof(new_thr->th.th_bar[1].bb.b_go));
4599   __itt_suppress_mark_range(__itt_suppress_range,
4600                             __itt_suppress_threading_errors,
4601                             CCAST(kmp_uint64 *, &new_thr->th.th_bar[2].bb.b_go),
4602                             sizeof(new_thr->th.th_bar[2].bb.b_go));
4603 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY && KMP_DEBUG */
4604   if (__kmp_storage_map) {
4605     __kmp_print_thread_storage_map(new_thr, new_gtid);
4606   }
4607 
4608   // add the reserve serialized team, initialized from the team's primary thread
4609   {
4610     kmp_internal_control_t r_icvs = __kmp_get_x_global_icvs(team);
4611     KF_TRACE(10, ("__kmp_allocate_thread: before th_serial/serial_team\n"));
4612     new_thr->th.th_serial_team = serial_team =
4613         (kmp_team_t *)__kmp_allocate_team(root, 1, 1,
4614 #if OMPT_SUPPORT
4615                                           ompt_data_none, // root parallel id
4616 #endif
4617                                           proc_bind_default, &r_icvs,
4618                                           0 USE_NESTED_HOT_ARG(NULL));
4619   }
4620   KMP_ASSERT(serial_team);
4621   serial_team->t.t_serialized = 0; // AC: the team is created in reserve,
4622   // not for execution (it is unused for now).
4623   serial_team->t.t_threads[0] = new_thr;
4624   KF_TRACE(10,
4625            ("__kmp_allocate_thread: after th_serial/serial_team : new_thr=%p\n",
4626             new_thr));
4627 
4628   /* setup the thread structures */
4629   __kmp_initialize_info(new_thr, team, new_tid, new_gtid);
4630 
4631 #if USE_FAST_MEMORY
4632   __kmp_initialize_fast_memory(new_thr);
4633 #endif /* USE_FAST_MEMORY */
4634 
4635 #if KMP_USE_BGET
4636   KMP_DEBUG_ASSERT(new_thr->th.th_local.bget_data == NULL);
4637   __kmp_initialize_bget(new_thr);
4638 #endif
4639 
4640   __kmp_init_random(new_thr); // Initialize random number generator
4641 
4642   /* Initialize these only once when thread is grabbed for a team allocation */
4643   KA_TRACE(20,
4644            ("__kmp_allocate_thread: T#%d init go fork=%u, plain=%u\n",
4645             __kmp_get_gtid(), KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
4646 
4647   int b;
4648   kmp_balign_t *balign = new_thr->th.th_bar;
4649   for (b = 0; b < bs_last_barrier; ++b) {
4650     balign[b].bb.b_go = KMP_INIT_BARRIER_STATE;
4651     balign[b].bb.team = NULL;
4652     balign[b].bb.wait_flag = KMP_BARRIER_NOT_WAITING;
4653     balign[b].bb.use_oncore_barrier = 0;
4654   }
4655 
4656   TCW_PTR(new_thr->th.th_sleep_loc, NULL);
4657   new_thr->th.th_sleep_loc_type = flag_unset;
4658 
4659   new_thr->th.th_spin_here = FALSE;
4660   new_thr->th.th_next_waiting = 0;
4661 #if KMP_OS_UNIX
4662   new_thr->th.th_blocking = false;
4663 #endif
4664 
4665 #if KMP_AFFINITY_SUPPORTED
4666   new_thr->th.th_current_place = KMP_PLACE_UNDEFINED;
4667   new_thr->th.th_new_place = KMP_PLACE_UNDEFINED;
4668   new_thr->th.th_first_place = KMP_PLACE_UNDEFINED;
4669   new_thr->th.th_last_place = KMP_PLACE_UNDEFINED;
4670 #endif
4671   new_thr->th.th_def_allocator = __kmp_def_allocator;
4672   new_thr->th.th_prev_level = 0;
4673   new_thr->th.th_prev_num_threads = 1;
4674 
4675   TCW_4(new_thr->th.th_in_pool, FALSE);
4676   new_thr->th.th_active_in_pool = FALSE;
4677   TCW_4(new_thr->th.th_active, TRUE);
4678 
4679   new_thr->th.th_set_nested_nth = NULL;
4680   new_thr->th.th_set_nested_nth_sz = 0;
4681 
4682   /* adjust the global counters */
4683   __kmp_all_nth++;
4684   __kmp_nth++;
4685 
4686   // If __kmp_adjust_gtid_mode is set, then we use method #1 (sp search) for
4687   // low numbers of threads, and method #2 (keyed API call) for higher numbers.
4688   if (__kmp_adjust_gtid_mode) {
4689     if (__kmp_all_nth >= __kmp_tls_gtid_min) {
4690       if (TCR_4(__kmp_gtid_mode) != 2) {
4691         TCW_4(__kmp_gtid_mode, 2);
4692       }
4693     } else {
4694       if (TCR_4(__kmp_gtid_mode) != 1) {
4695         TCW_4(__kmp_gtid_mode, 1);
4696       }
4697     }
4698   }
4699 
4700 #ifdef KMP_ADJUST_BLOCKTIME
4701   /* Adjust blocktime back to zero if necessary       */
4702   /* Middle initialization might not have occurred yet */
4703   if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
4704     if (__kmp_nth > __kmp_avail_proc) {
4705       __kmp_zero_bt = TRUE;
4706     }
4707   }
4708 #endif /* KMP_ADJUST_BLOCKTIME */
4709 
4710 #if KMP_AFFINITY_SUPPORTED
4711   // Set the affinity and topology information for new thread
4712   __kmp_affinity_set_init_mask(new_gtid, /*isa_root=*/FALSE);
4713 #endif
4714 
4715   /* actually fork it and create the new worker thread */
4716   KF_TRACE(
4717       10, ("__kmp_allocate_thread: before __kmp_create_worker: %p\n", new_thr));
4718   __kmp_create_worker(new_gtid, new_thr, __kmp_stksize);
4719   KF_TRACE(10,
4720            ("__kmp_allocate_thread: after __kmp_create_worker: %p\n", new_thr));
4721 
4722   KA_TRACE(20, ("__kmp_allocate_thread: T#%d forked T#%d\n", __kmp_get_gtid(),
4723                 new_gtid));
4724   KMP_MB();
4725   return new_thr;
4726 }
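// Editor's note: a compiled-out sketch (hypothetical helper names) of the
// "pool first, then fork" strategy __kmp_allocate_thread implements above;
// the real routine additionally handles hidden helper teams, gtid selection,
// ITT suppression ranges, and barrier/affinity initialization.
#if 0
kmp_info_t *allocate_thread_sketch(kmp_root_t *root, kmp_team_t *team,
                                   int tid) {
  kmp_info_t *t = try_pop_thread_pool(); // reuse a parked thread if possible
  if (t != NULL) {
    rebind_to_team(t, team, tid); // corresponds to __kmp_initialize_info
    return t;
  }
  t = allocate_thread_struct(); // fresh bookkeeping for a brand-new thread
  rebind_to_team(t, team, tid);
  create_os_worker(t); // corresponds to __kmp_create_worker
  return t;
}
#endif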
4727 
4728 /* Reinitialize team for reuse.
4729    The hot team code calls this routine at every fork barrier, so the EPCC
4730    barrier tests are extremely sensitive to changes in it, especially writes
4731    to the team struct, which cause a cache invalidation in all threads.
4732    IF YOU TOUCH THIS ROUTINE, RUN EPCC C SYNCBENCH ON A BIG-IRON MACHINE!!! */
4733 static void __kmp_reinitialize_team(kmp_team_t *team,
4734                                     kmp_internal_control_t *new_icvs,
4735                                     ident_t *loc) {
4736   KF_TRACE(10, ("__kmp_reinitialize_team: enter this_thread=%p team=%p\n",
4737                 team->t.t_threads[0], team));
4738   KMP_DEBUG_ASSERT(team && new_icvs);
4739   KMP_DEBUG_ASSERT((!TCR_4(__kmp_init_parallel)) || new_icvs->nproc);
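  // Explanatory note: KMP_CHECK_UPDATE stores only when the value actually
  // differs, keeping this hot path from dirtying team-struct cache lines
  // needlessly (see the EPCC warning above).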
4740   KMP_CHECK_UPDATE(team->t.t_ident, loc);
4741 
4742   KMP_CHECK_UPDATE(team->t.t_id, KMP_GEN_TEAM_ID());
4743   // Copy ICVs to the primary thread's implicit taskdata
4744   __kmp_init_implicit_task(loc, team->t.t_threads[0], team, 0, FALSE);
4745   copy_icvs(&team->t.t_implicit_task_taskdata[0].td_icvs, new_icvs);
4746 
4747   KF_TRACE(10, ("__kmp_reinitialize_team: exit this_thread=%p team=%p\n",
4748                 team->t.t_threads[0], team));
4749 }
4750 
4751 /* Initialize the team data structure.
4752    This assumes the t_threads and t_max_nproc are already set.
4753    Also, we don't touch the arguments */
4754 static void __kmp_initialize_team(kmp_team_t *team, int new_nproc,
4755                                   kmp_internal_control_t *new_icvs,
4756                                   ident_t *loc) {
4757   KF_TRACE(10, ("__kmp_initialize_team: enter: team=%p\n", team));
4758 
4759   /* verify */
4760   KMP_DEBUG_ASSERT(team);
4761   KMP_DEBUG_ASSERT(new_nproc <= team->t.t_max_nproc);
4762   KMP_DEBUG_ASSERT(team->t.t_threads);
4763   KMP_MB();
4764 
4765   team->t.t_master_tid = 0; /* not needed */
4766   /* team->t.t_master_bar;        not needed */
4767   team->t.t_serialized = new_nproc > 1 ? 0 : 1;
4768   team->t.t_nproc = new_nproc;
4769 
4770   /* team->t.t_parent     = NULL; TODO not needed & would mess up hot team */
4771   team->t.t_next_pool = NULL;
4772   /* memset( team->t.t_threads, 0, sizeof(kmp_info_t*)*new_nproc ); would mess
4773    * up hot team */
4774 
4775   TCW_SYNC_PTR(team->t.t_pkfn, NULL); /* not needed */
4776   team->t.t_invoke = NULL; /* not needed */
4777 
4778   // TODO???: team->t.t_max_active_levels       = new_max_active_levels;
4779   team->t.t_sched.sched = new_icvs->sched.sched;
4780 
4781 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
4782   team->t.t_fp_control_saved = FALSE; /* not needed */
4783   team->t.t_x87_fpu_control_word = 0; /* not needed */
4784   team->t.t_mxcsr = 0; /* not needed */
4785 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
4786 
4787   team->t.t_construct = 0;
4788 
4789   team->t.t_ordered.dt.t_value = 0;
4790   team->t.t_master_active = FALSE;
4791 
4792 #ifdef KMP_DEBUG
4793   team->t.t_copypriv_data = NULL; /* not necessary, but nice for debugging */
4794 #endif
4795 #if KMP_OS_WINDOWS
4796   team->t.t_copyin_counter = 0; /* for barrier-free copyin implementation */
4797 #endif
4798 
4799   team->t.t_control_stack_top = NULL;
4800 
4801   __kmp_reinitialize_team(team, new_icvs, loc);
4802 
4803   KMP_MB();
4804   KF_TRACE(10, ("__kmp_initialize_team: exit: team=%p\n", team));
4805 }
4806 
4807 #if KMP_AFFINITY_SUPPORTED
4808 static inline void __kmp_set_thread_place(kmp_team_t *team, kmp_info_t *th,
4809                                           int first, int last, int newp) {
4810   th->th.th_first_place = first;
4811   th->th.th_last_place = last;
4812   th->th.th_new_place = newp;
4813   if (newp != th->th.th_current_place) {
4814     if (__kmp_display_affinity && team->t.t_display_affinity != 1)
4815       team->t.t_display_affinity = 1;
4816     // Copy topology information associated with the new place
4817     th->th.th_topology_ids = __kmp_affinity.ids[th->th.th_new_place];
4818     th->th.th_topology_attrs = __kmp_affinity.attrs[th->th.th_new_place];
4819   }
4820 }
4821 
4822 // __kmp_partition_places() is the heart of the OpenMP 4.0 affinity mechanism.
4823 // It calculates the worker + primary thread's partition based upon the parent
4824 // thread's partition, and binds each worker to a place in its partition.
4825 // The primary thread's partition should already include its current binding.
4826 static void __kmp_partition_places(kmp_team_t *team, int update_master_only) {
4827   // Do not partition places for the hidden helper team
4828   if (KMP_HIDDEN_HELPER_TEAM(team))
4829     return;
4830   // Copy the primary thread's place partition to the team struct
4831   kmp_info_t *master_th = team->t.t_threads[0];
4832   KMP_DEBUG_ASSERT(master_th != NULL);
4833   kmp_proc_bind_t proc_bind = team->t.t_proc_bind;
4834   int first_place = master_th->th.th_first_place;
4835   int last_place = master_th->th.th_last_place;
4836   int masters_place = master_th->th.th_current_place;
4837   int num_masks = __kmp_affinity.num_masks;
4838   team->t.t_first_place = first_place;
4839   team->t.t_last_place = last_place;
4840 
4841   KA_TRACE(20, ("__kmp_partition_places: enter: proc_bind = %d T#%d(%d:0) "
4842                 "bound to place %d partition = [%d,%d]\n",
4843                 proc_bind, __kmp_gtid_from_thread(team->t.t_threads[0]),
4844                 team->t.t_id, masters_place, first_place, last_place));
4845 
4846   switch (proc_bind) {
4847 
4848   case proc_bind_default:
4849     // Serial teams might have the proc_bind policy set to proc_bind_default.
4850     // Not an issue -- we don't rebind primary thread for any proc_bind policy.
4851     KMP_DEBUG_ASSERT(team->t.t_nproc == 1);
4852     break;
4853 
4854   case proc_bind_primary: {
4855     int f;
4856     int n_th = team->t.t_nproc;
4857     for (f = 1; f < n_th; f++) {
4858       kmp_info_t *th = team->t.t_threads[f];
4859       KMP_DEBUG_ASSERT(th != NULL);
4860       __kmp_set_thread_place(team, th, first_place, last_place, masters_place);
4861 
4862       KA_TRACE(100, ("__kmp_partition_places: primary: T#%d(%d:%d) place %d "
4863                      "partition = [%d,%d]\n",
4864                      __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id,
4865                      f, masters_place, first_place, last_place));
4866     }
4867   } break;
4868 
4869   case proc_bind_close: {
4870     int f;
4871     int n_th = team->t.t_nproc;
4872     int n_places;
4873     if (first_place <= last_place) {
4874       n_places = last_place - first_place + 1;
4875     } else {
4876       n_places = num_masks - first_place + last_place + 1;
4877     }
4878     if (n_th <= n_places) {
4879       int place = masters_place;
4880       for (f = 1; f < n_th; f++) {
4881         kmp_info_t *th = team->t.t_threads[f];
4882         KMP_DEBUG_ASSERT(th != NULL);
4883 
4884         if (place == last_place) {
4885           place = first_place;
4886         } else if (place == (num_masks - 1)) {
4887           place = 0;
4888         } else {
4889           place++;
4890         }
4891         __kmp_set_thread_place(team, th, first_place, last_place, place);
4892 
4893         KA_TRACE(100, ("__kmp_partition_places: close: T#%d(%d:%d) place %d "
4894                        "partition = [%d,%d]\n",
4895                        __kmp_gtid_from_thread(team->t.t_threads[f]),
4896                        team->t.t_id, f, place, first_place, last_place));
4897       }
4898     } else {
4899       int S, rem, gap, s_count;
4900       S = n_th / n_places;
4901       s_count = 0;
4902       rem = n_th - (S * n_places);
4903       gap = rem > 0 ? n_places / rem : n_places;
4904       int place = masters_place;
4905       int gap_ct = gap;
4906       for (f = 0; f < n_th; f++) {
4907         kmp_info_t *th = team->t.t_threads[f];
4908         KMP_DEBUG_ASSERT(th != NULL);
4909 
4910         __kmp_set_thread_place(team, th, first_place, last_place, place);
4911         s_count++;
4912 
4913         if ((s_count == S) && rem && (gap_ct == gap)) {
4914           // do nothing, add an extra thread to place on next iteration
4915         } else if ((s_count == S + 1) && rem && (gap_ct == gap)) {
4916           // we added an extra thread to this place; move to next place
4917           if (place == last_place) {
4918             place = first_place;
4919           } else if (place == (num_masks - 1)) {
4920             place = 0;
4921           } else {
4922             place++;
4923           }
4924           s_count = 0;
4925           gap_ct = 1;
4926           rem--;
4927         } else if (s_count == S) { // place full; don't add extra
4928           if (place == last_place) {
4929             place = first_place;
4930           } else if (place == (num_masks - 1)) {
4931             place = 0;
4932           } else {
4933             place++;
4934           }
4935           gap_ct++;
4936           s_count = 0;
4937         }
4938 
4939         KA_TRACE(100,
4940                  ("__kmp_partition_places: close: T#%d(%d:%d) place %d "
4941                   "partition = [%d,%d]\n",
4942                   __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id, f,
4943                   th->th.th_new_place, first_place, last_place));
4944       }
4945       KMP_DEBUG_ASSERT(place == masters_place);
4946     }
4947   } break;
4948 
4949   case proc_bind_spread: {
4950     int f;
4951     int n_th = team->t.t_nproc;
4952     int n_places;
4953     int thidx;
4954     if (first_place <= last_place) {
4955       n_places = last_place - first_place + 1;
4956     } else {
4957       n_places = num_masks - first_place + last_place + 1;
4958     }
4959     if (n_th <= n_places) {
4960       int place = -1;
4961 
4962       if (n_places != num_masks) {
4963         int S = n_places / n_th;
4964         int s_count, rem, gap, gap_ct;
4965 
4966         place = masters_place;
4967         rem = n_places - n_th * S;
4968         gap = rem ? n_th / rem : 1;
4969         gap_ct = gap;
4970         thidx = n_th;
4971         if (update_master_only == 1)
4972           thidx = 1;
4973         for (f = 0; f < thidx; f++) {
4974           kmp_info_t *th = team->t.t_threads[f];
4975           KMP_DEBUG_ASSERT(th != NULL);
4976 
4977           int fplace = place, nplace = place;
4978           s_count = 1;
4979           while (s_count < S) {
4980             if (place == last_place) {
4981               place = first_place;
4982             } else if (place == (num_masks - 1)) {
4983               place = 0;
4984             } else {
4985               place++;
4986             }
4987             s_count++;
4988           }
4989           if (rem && (gap_ct == gap)) {
4990             if (place == last_place) {
4991               place = first_place;
4992             } else if (place == (num_masks - 1)) {
4993               place = 0;
4994             } else {
4995               place++;
4996             }
4997             rem--;
4998             gap_ct = 0;
4999           }
5000           __kmp_set_thread_place(team, th, fplace, place, nplace);
5001           gap_ct++;
5002 
5003           if (place == last_place) {
5004             place = first_place;
5005           } else if (place == (num_masks - 1)) {
5006             place = 0;
5007           } else {
5008             place++;
5009           }
5010 
5011           KA_TRACE(100,
5012                    ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
5013                     "partition = [%d,%d], num_masks: %u\n",
5014                     __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id,
5015                     f, th->th.th_new_place, th->th.th_first_place,
5016                     th->th.th_last_place, num_masks));
5017         }
5018       } else {
5019         /* Given a uniform space of available computation places, we can
5020            create T partitions of round(P/T) size and put the threads into
5021            the first place of each partition. */
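        // Worked example (explanatory note): with n_places = 8, n_th = 3, and
        // masters_place = 0, spacing = (8 + 1) / 3 = 3.0, giving partitions
        // [0,2], [3,5], and [6,7] (the last is clipped by the bounds checks
        // below).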
5022         double current = static_cast<double>(masters_place);
5023         double spacing =
5024             (static_cast<double>(n_places + 1) / static_cast<double>(n_th));
5025         int first, last;
5026         kmp_info_t *th;
5027 
5028         thidx = n_th + 1;
5029         if (update_master_only == 1)
5030           thidx = 1;
5031         for (f = 0; f < thidx; f++) {
5032           first = static_cast<int>(current);
5033           last = static_cast<int>(current + spacing) - 1;
5034           KMP_DEBUG_ASSERT(last >= first);
5035           if (first >= n_places) {
5036             if (masters_place) {
5037               first -= n_places;
5038               last -= n_places;
5039               if (first == (masters_place + 1)) {
5040                 KMP_DEBUG_ASSERT(f == n_th);
5041                 first--;
5042               }
5043               if (last == masters_place) {
5044                 KMP_DEBUG_ASSERT(f == (n_th - 1));
5045                 last--;
5046               }
5047             } else {
5048               KMP_DEBUG_ASSERT(f == n_th);
5049               first = 0;
5050               last = 0;
5051             }
5052           }
5053           if (last >= n_places) {
5054             last = (n_places - 1);
5055           }
5056           place = first;
5057           current += spacing;
5058           if (f < n_th) {
5059             KMP_DEBUG_ASSERT(0 <= first);
5060             KMP_DEBUG_ASSERT(n_places > first);
5061             KMP_DEBUG_ASSERT(0 <= last);
5062             KMP_DEBUG_ASSERT(n_places > last);
5063             KMP_DEBUG_ASSERT(last_place >= first_place);
5064             th = team->t.t_threads[f];
5065             KMP_DEBUG_ASSERT(th);
5066             __kmp_set_thread_place(team, th, first, last, place);
5067             KA_TRACE(100,
5068                      ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
5069                       "partition = [%d,%d], spacing = %.4f\n",
5070                       __kmp_gtid_from_thread(team->t.t_threads[f]),
5071                       team->t.t_id, f, th->th.th_new_place,
5072                       th->th.th_first_place, th->th.th_last_place, spacing));
5073           }
5074         }
5075       }
5076       KMP_DEBUG_ASSERT(update_master_only || place == masters_place);
5077     } else {
5078       int S, rem, gap, s_count;
5079       S = n_th / n_places;
5080       s_count = 0;
5081       rem = n_th - (S * n_places);
5082       gap = rem > 0 ? n_places / rem : n_places;
5083       int place = masters_place;
5084       int gap_ct = gap;
5085       thidx = n_th;
5086       if (update_master_only == 1)
5087         thidx = 1;
5088       for (f = 0; f < thidx; f++) {
5089         kmp_info_t *th = team->t.t_threads[f];
5090         KMP_DEBUG_ASSERT(th != NULL);
5091 
5092         __kmp_set_thread_place(team, th, place, place, place);
5093         s_count++;
5094 
5095         if ((s_count == S) && rem && (gap_ct == gap)) {
5096           // do nothing, add an extra thread to place on next iteration
5097         } else if ((s_count == S + 1) && rem && (gap_ct == gap)) {
5098           // we added an extra thread to this place; move on to next place
5099           if (place == last_place) {
5100             place = first_place;
5101           } else if (place == (num_masks - 1)) {
5102             place = 0;
5103           } else {
5104             place++;
5105           }
5106           s_count = 0;
5107           gap_ct = 1;
5108           rem--;
5109         } else if (s_count == S) { // place is full; don't add extra thread
5110           if (place == last_place) {
5111             place = first_place;
5112           } else if (place == (num_masks - 1)) {
5113             place = 0;
5114           } else {
5115             place++;
5116           }
5117           gap_ct++;
5118           s_count = 0;
5119         }
5120 
5121         KA_TRACE(100, ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
5122                        "partition = [%d,%d]\n",
5123                        __kmp_gtid_from_thread(team->t.t_threads[f]),
5124                        team->t.t_id, f, th->th.th_new_place,
5125                        th->th.th_first_place, th->th.th_last_place));
5126       }
5127       KMP_DEBUG_ASSERT(update_master_only || place == masters_place);
5128     }
5129   } break;
5130 
5131   default:
5132     break;
5133   }
5134 
5135   KA_TRACE(20, ("__kmp_partition_places: exit T#%d\n", team->t.t_id));
5136 }
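// Editor's note: a compiled-out sketch of the oversubscription arithmetic
// shared by the proc_bind_close and proc_bind_spread cases above when
// n_th > n_places: every place receives S = n_th / n_places threads, and the
// remaining rem threads are handed out one per place, roughly every
// gap = n_places / rem places starting from the primary thread's place.
#if 0
static void close_overflow_sketch(int n_th, int n_places, int *count) {
  int S = n_th / n_places;       // base number of threads per place
  int rem = n_th - S * n_places; // places receiving one extra thread
  int gap = rem > 0 ? n_places / rem : n_places;
  for (int p = 0; p < n_places; ++p) // p: offset from the primary's place
    count[p] = S + ((rem > 0 && p % gap == 0 && p / gap < rem) ? 1 : 0);
}
#endif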
5137 
5138 #endif // KMP_AFFINITY_SUPPORTED
5139 
5140 /* allocate a new team data structure to use.  take one off of the free pool if
5141    available */
5142 kmp_team_t *
5143 __kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc,
5144 #if OMPT_SUPPORT
5145                     ompt_data_t ompt_parallel_data,
5146 #endif
5147                     kmp_proc_bind_t new_proc_bind,
5148                     kmp_internal_control_t *new_icvs,
5149                     int argc USE_NESTED_HOT_ARG(kmp_info_t *master)) {
5150   KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_allocate_team);
5151   int f;
5152   kmp_team_t *team;
5153   int use_hot_team = !root->r.r_active;
5154   int level = 0;
5155   int do_place_partition = 1;
5156 
5157   KA_TRACE(20, ("__kmp_allocate_team: called\n"));
5158   KMP_DEBUG_ASSERT(new_nproc >= 1 && argc >= 0);
5159   KMP_DEBUG_ASSERT(max_nproc >= new_nproc);
5160   KMP_MB();
5161 
5162 #if KMP_NESTED_HOT_TEAMS
5163   kmp_hot_team_ptr_t *hot_teams;
5164   if (master) {
5165     team = master->th.th_team;
5166     level = team->t.t_active_level;
5167     if (master->th.th_teams_microtask) { // in teams construct?
5168       if (master->th.th_teams_size.nteams > 1 &&
5169           ( // #teams > 1
5170               team->t.t_pkfn ==
5171                   (microtask_t)__kmp_teams_master || // inner fork of the teams
5172               master->th.th_teams_level <
5173                   team->t.t_level)) { // or nested parallel inside the teams
5174         ++level; // do not increment if #teams==1 or for the outer fork of
5175         // the teams; increment otherwise
5176       }
5177       // Do not perform the place partition if inner fork of the teams
5178       // Wait until nested parallel region encountered inside teams construct
5179       if ((master->th.th_teams_size.nteams == 1 &&
5180            master->th.th_teams_level >= team->t.t_level) ||
5181           (team->t.t_pkfn == (microtask_t)__kmp_teams_master))
5182         do_place_partition = 0;
5183     }
5184     hot_teams = master->th.th_hot_teams;
5185     if (level < __kmp_hot_teams_max_level && hot_teams &&
5186         hot_teams[level].hot_team) {
5187       // hot team has already been allocated for given level
5188       use_hot_team = 1;
5189     } else {
5190       use_hot_team = 0;
5191     }
5192   } else {
5193     // check we won't access uninitialized hot_teams, just in case
5194     KMP_DEBUG_ASSERT(new_nproc == 1);
5195   }
5196 #endif
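  // (Explanatory note: a "hot" team is kept alive between consecutive
  // parallel regions of a root so its workers never return to the thread
  // pool; reuse below mostly reduces to resizing the team and refreshing
  // ICVs instead of re-forking threads.)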
5197   // Optimization to use a "hot" team
5198   if (use_hot_team && new_nproc > 1) {
5199     KMP_DEBUG_ASSERT(new_nproc <= max_nproc);
5200 #if KMP_NESTED_HOT_TEAMS
5201     team = hot_teams[level].hot_team;
5202 #else
5203     team = root->r.r_hot_team;
5204 #endif
5205 #if KMP_DEBUG
5206     if (__kmp_tasking_mode != tskm_immediate_exec) {
5207       KA_TRACE(20, ("__kmp_allocate_team: hot team task_team[0] = %p "
5208                     "task_team[1] = %p before reinit\n",
5209                     team->t.t_task_team[0], team->t.t_task_team[1]));
5210     }
5211 #endif
5212 
5213     if (team->t.t_nproc != new_nproc &&
5214         __kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5215       // Distributed barrier may need a resize
5216       int old_nthr = team->t.t_nproc;
5217       __kmp_resize_dist_barrier(team, old_nthr, new_nproc);
5218     }
5219 
5220     // If not doing the place partition, then reset the team's proc bind
5221     // to indicate that partitioning of all threads still needs to take place
5222     if (do_place_partition == 0)
5223       team->t.t_proc_bind = proc_bind_default;
5224     // Has the number of threads changed?
5225     /* Let's assume the most common case is that the number of threads is
5226        unchanged, and put that case first. */
5227     if (team->t.t_nproc == new_nproc) { // Check changes in number of threads
5228       KA_TRACE(20, ("__kmp_allocate_team: reusing hot team\n"));
5229       // This case can mean that omp_set_num_threads() was called and the hot
5230       // team size was already reduced, so we check the special flag
5231       if (team->t.t_size_changed == -1) {
5232         team->t.t_size_changed = 1;
5233       } else {
5234         KMP_CHECK_UPDATE(team->t.t_size_changed, 0);
5235       }
5236 
5237       // TODO???: team->t.t_max_active_levels = new_max_active_levels;
5238       kmp_r_sched_t new_sched = new_icvs->sched;
5239       // set primary thread's schedule as new run-time schedule
5240       KMP_CHECK_UPDATE(team->t.t_sched.sched, new_sched.sched);
5241 
5242       __kmp_reinitialize_team(team, new_icvs,
5243                               root->r.r_uber_thread->th.th_ident);
5244 
5245       KF_TRACE(10, ("__kmp_allocate_team2: T#%d, this_thread=%p team=%p\n", 0,
5246                     team->t.t_threads[0], team));
5247       __kmp_push_current_task_to_thread(team->t.t_threads[0], team, 0);
5248 
5249 #if KMP_AFFINITY_SUPPORTED
5250       if ((team->t.t_size_changed == 0) &&
5251           (team->t.t_proc_bind == new_proc_bind)) {
5252         if (new_proc_bind == proc_bind_spread) {
5253           if (do_place_partition) {
5254             // add flag to update only master for spread
5255             __kmp_partition_places(team, 1);
5256           }
5257         }
5258         KA_TRACE(200, ("__kmp_allocate_team: reusing hot team #%d bindings: "
5259                        "proc_bind = %d, partition = [%d,%d]\n",
5260                        team->t.t_id, new_proc_bind, team->t.t_first_place,
5261                        team->t.t_last_place));
5262       } else {
5263         if (do_place_partition) {
5264           KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5265           __kmp_partition_places(team);
5266         }
5267       }
5268 #else
5269       KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5270 #endif /* KMP_AFFINITY_SUPPORTED */
5271     } else if (team->t.t_nproc > new_nproc) {
5272       KA_TRACE(20,
5273                ("__kmp_allocate_team: decreasing hot team thread count to %d\n",
5274                 new_nproc));
5275 
5276       team->t.t_size_changed = 1;
5277       if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5278         // Barrier size already reduced earlier in this function
5279         // Activate team threads via th_used_in_team
5280         __kmp_add_threads_to_team(team, new_nproc);
5281       }
5282       // When decreasing team size, threads no longer in the team should
5283       // unref task team.
5284       if (__kmp_tasking_mode != tskm_immediate_exec) {
5285         for (f = new_nproc; f < team->t.t_nproc; f++) {
5286           kmp_info_t *th = team->t.t_threads[f];
5287           KMP_DEBUG_ASSERT(th);
5288           th->th.th_task_team = NULL;
5289         }
5290       }
5291 #if KMP_NESTED_HOT_TEAMS
5292       if (__kmp_hot_teams_mode == 0) {
5293         // AC: saved thread count should match the team's value in this mode;
5294         // it can be bigger in mode 1, when the hot team has threads in reserve
5295         KMP_DEBUG_ASSERT(hot_teams[level].hot_team_nth == team->t.t_nproc);
5296         hot_teams[level].hot_team_nth = new_nproc;
5297 #endif // KMP_NESTED_HOT_TEAMS
5298         /* release the extra threads we don't need any more */
5299         for (f = new_nproc; f < team->t.t_nproc; f++) {
5300           KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5301           __kmp_free_thread(team->t.t_threads[f]);
5302           team->t.t_threads[f] = NULL;
5303         }
5304 #if KMP_NESTED_HOT_TEAMS
5305       } // (__kmp_hot_teams_mode == 0)
5306       else {
5307         // When keeping extra threads in team, switch threads to wait on own
5308         // b_go flag
5309         for (f = new_nproc; f < team->t.t_nproc; ++f) {
5310           KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5311           kmp_balign_t *balign = team->t.t_threads[f]->th.th_bar;
5312           for (int b = 0; b < bs_last_barrier; ++b) {
5313             if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG) {
5314               balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG;
5315             }
5316             KMP_CHECK_UPDATE(balign[b].bb.leaf_kids, 0);
5317           }
5318         }
5319       }
5320 #endif // KMP_NESTED_HOT_TEAMS
5321       team->t.t_nproc = new_nproc;
5322       // TODO???: team->t.t_max_active_levels = new_max_active_levels;
5323       KMP_CHECK_UPDATE(team->t.t_sched.sched, new_icvs->sched.sched);
5324       __kmp_reinitialize_team(team, new_icvs,
5325                               root->r.r_uber_thread->th.th_ident);
5326 
5327       // Update remaining threads
5328       for (f = 0; f < new_nproc; ++f) {
5329         team->t.t_threads[f]->th.th_team_nproc = new_nproc;
5330       }
5331 
5332       // restore the current task state of the primary thread: should be the
5333       // implicit task
5334       KF_TRACE(10, ("__kmp_allocate_team: T#%d, this_thread=%p team=%p\n", 0,
5335                     team->t.t_threads[0], team));
5336 
5337       __kmp_push_current_task_to_thread(team->t.t_threads[0], team, 0);
5338 
5339 #ifdef KMP_DEBUG
5340       for (f = 0; f < team->t.t_nproc; f++) {
5341         KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
5342                          team->t.t_threads[f]->th.th_team_nproc ==
5343                              team->t.t_nproc);
5344       }
5345 #endif
5346 
5347       if (do_place_partition) {
5348         KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5349 #if KMP_AFFINITY_SUPPORTED
5350         __kmp_partition_places(team);
5351 #endif
5352       }
5353     } else { // team->t.t_nproc < new_nproc
5354 
5355       KA_TRACE(20,
5356                ("__kmp_allocate_team: increasing hot team thread count to %d\n",
5357                 new_nproc));
5358       int old_nproc = team->t.t_nproc; // save old value and use to update only
5359       team->t.t_size_changed = 1;
5360 
5361 #if KMP_NESTED_HOT_TEAMS
5362       int avail_threads = hot_teams[level].hot_team_nth;
5363       if (new_nproc < avail_threads)
5364         avail_threads = new_nproc;
5365       kmp_info_t **other_threads = team->t.t_threads;
5366       for (f = team->t.t_nproc; f < avail_threads; ++f) {
5367         // Adjust barrier data of reserved threads (if any) of the team
5368         // Other data will be set in __kmp_initialize_info() below.
5369         int b;
5370         kmp_balign_t *balign = other_threads[f]->th.th_bar;
5371         for (b = 0; b < bs_last_barrier; ++b) {
5372           balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5373           KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
5374 #if USE_DEBUGGER
5375           balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5376 #endif
5377         }
5378       }
5379       if (hot_teams[level].hot_team_nth >= new_nproc) {
5380         // we have all needed threads in reserve, no need to allocate any
5381         // this is only possible in mode 1; mode 0 cannot have reserved threads
5382         KMP_DEBUG_ASSERT(__kmp_hot_teams_mode == 1);
5383         team->t.t_nproc = new_nproc; // just get reserved threads involved
5384       } else {
5385         // We may have some threads in reserve, but not enough;
5386         // get reserved threads involved if any.
5387         team->t.t_nproc = hot_teams[level].hot_team_nth;
5388         hot_teams[level].hot_team_nth = new_nproc; // adjust hot team max size
5389 #endif // KMP_NESTED_HOT_TEAMS
5390         if (team->t.t_max_nproc < new_nproc) {
5391           /* reallocate larger arrays */
5392           __kmp_reallocate_team_arrays(team, new_nproc);
5393           __kmp_reinitialize_team(team, new_icvs, NULL);
5394         }
5395 
5396 #if (KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_NETBSD || KMP_OS_DRAGONFLY) &&   \
5397     KMP_AFFINITY_SUPPORTED
5398         /* Temporarily set full mask for primary thread before creation of
5399            workers. The reason is that workers inherit the affinity from the
5400            primary thread, so if many workers are created on a single core
5401            in quick succession, they may not get a chance to set their own
5402            affinity for a long time. */
5403         kmp_affinity_raii_t new_temp_affinity{__kmp_affin_fullMask};
5404 #endif
5405 
5406         /* allocate new threads for the hot team */
5407         for (f = team->t.t_nproc; f < new_nproc; f++) {
5408           kmp_info_t *new_worker = __kmp_allocate_thread(root, team, f);
5409           KMP_DEBUG_ASSERT(new_worker);
5410           team->t.t_threads[f] = new_worker;
5411 
5412           KA_TRACE(20,
5413                    ("__kmp_allocate_team: team %d init T#%d(%d:%d) arrived: "
5414                     "join=%llu, plain=%llu\n",
5415                     team->t.t_id, __kmp_gtid_from_tid(f, team), team->t.t_id, f,
5416                     team->t.t_bar[bs_forkjoin_barrier].b_arrived,
5417                     team->t.t_bar[bs_plain_barrier].b_arrived));
5418 
5419           { // Initialize barrier data for new threads.
5420             int b;
5421             kmp_balign_t *balign = new_worker->th.th_bar;
5422             for (b = 0; b < bs_last_barrier; ++b) {
5423               balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5424               KMP_DEBUG_ASSERT(balign[b].bb.wait_flag !=
5425                                KMP_BARRIER_PARENT_FLAG);
5426 #if USE_DEBUGGER
5427               balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5428 #endif
5429             }
5430           }
5431         }
5432 
5433 #if (KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_NETBSD || KMP_OS_DRAGONFLY) &&   \
5434     KMP_AFFINITY_SUPPORTED
5435         /* Restore initial primary thread's affinity mask */
5436         new_temp_affinity.restore();
5437 #endif
5438 #if KMP_NESTED_HOT_TEAMS
5439       } // end of check of t_nproc vs. new_nproc vs. hot_team_nth
5440 #endif // KMP_NESTED_HOT_TEAMS
5441       if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5442         // Barrier size already increased earlier in this function
5443         // Activate team threads via th_used_in_team
5444         __kmp_add_threads_to_team(team, new_nproc);
5445       }
5446       /* make sure everyone is synchronized */
5447       // (this also initializes the new threads allocated above)
5448       __kmp_initialize_team(team, new_nproc, new_icvs,
5449                             root->r.r_uber_thread->th.th_ident);
5450 
5451       /* reinitialize the threads */
5452       KMP_DEBUG_ASSERT(team->t.t_nproc == new_nproc);
5453       for (f = 0; f < team->t.t_nproc; ++f)
5454         __kmp_initialize_info(team->t.t_threads[f], team, f,
5455                               __kmp_gtid_from_tid(f, team));
5456 
5457       // set th_task_state for new threads from an existing hot team thread's state
5458       kmp_uint8 old_state = team->t.t_threads[old_nproc - 1]->th.th_task_state;
5459       for (f = old_nproc; f < team->t.t_nproc; ++f)
5460         team->t.t_threads[f]->th.th_task_state = old_state;
5461 
5462 #ifdef KMP_DEBUG
5463       for (f = 0; f < team->t.t_nproc; ++f) {
5464         KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
5465                          team->t.t_threads[f]->th.th_team_nproc ==
5466                              team->t.t_nproc);
5467       }
5468 #endif
5469 
5470       if (do_place_partition) {
5471         KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5472 #if KMP_AFFINITY_SUPPORTED
5473         __kmp_partition_places(team);
5474 #endif
5475       }
5476     } // Check changes in number of threads
5477 
5478     if (master->th.th_teams_microtask) {
5479       for (f = 1; f < new_nproc; ++f) {
5480         // propagate teams construct specific info to workers
5481         kmp_info_t *thr = team->t.t_threads[f];
5482         thr->th.th_teams_microtask = master->th.th_teams_microtask;
5483         thr->th.th_teams_level = master->th.th_teams_level;
5484         thr->th.th_teams_size = master->th.th_teams_size;
5485       }
5486     }
5487 #if KMP_NESTED_HOT_TEAMS
5488     if (level) {
5489       // Sync barrier state for nested hot teams, not needed for outermost hot
5490       // team.
5491       for (f = 1; f < new_nproc; ++f) {
5492         kmp_info_t *thr = team->t.t_threads[f];
5493         int b;
5494         kmp_balign_t *balign = thr->th.th_bar;
5495         for (b = 0; b < bs_last_barrier; ++b) {
5496           balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5497           KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
5498 #if USE_DEBUGGER
5499           balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5500 #endif
5501         }
5502       }
5503     }
5504 #endif // KMP_NESTED_HOT_TEAMS
5505 
5506     /* reallocate space for arguments if necessary */
5507     __kmp_alloc_argv_entries(argc, team, TRUE);
5508     KMP_CHECK_UPDATE(team->t.t_argc, argc);
5509     // The hot team re-uses the previous task team,
5510     // if untouched during the previous release->gather phase.
5511 
5512     KF_TRACE(10, (" hot_team = %p\n", team));
5513 
5514 #if KMP_DEBUG
5515     if (__kmp_tasking_mode != tskm_immediate_exec) {
5516       KA_TRACE(20, ("__kmp_allocate_team: hot team task_team[0] = %p "
5517                     "task_team[1] = %p after reinit\n",
5518                     team->t.t_task_team[0], team->t.t_task_team[1]));
5519     }
5520 #endif
5521 
5522 #if OMPT_SUPPORT
5523     __ompt_team_assign_id(team, ompt_parallel_data);
5524 #endif
5525 
5526     KMP_MB();
5527 
5528     return team;
5529   }
5530 
5531   /* next, let's try to take one from the team pool */
5532   KMP_MB();
5533   for (team = CCAST(kmp_team_t *, __kmp_team_pool); (team);) {
5534     /* TODO: consider resizing undersized teams instead of reaping them, now
5535        that we have a resizing mechanism */
5536     if (team->t.t_max_nproc >= max_nproc) {
5537       /* take this team from the team pool */
5538       __kmp_team_pool = team->t.t_next_pool;
5539 
5540       if (max_nproc > 1 &&
5541           __kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5542         if (!team->t.b) { // Allocate barrier structure
5543           team->t.b = distributedBarrier::allocate(__kmp_dflt_team_nth_ub);
5544         }
5545       }
5546 
5547       /* setup the team for fresh use */
5548       __kmp_initialize_team(team, new_nproc, new_icvs, NULL);
5549 
5550       KA_TRACE(20, ("__kmp_allocate_team: setting task_team[0] %p and "
5551                     "task_team[1] %p to NULL\n",
5552                     &team->t.t_task_team[0], &team->t.t_task_team[1]));
5553       team->t.t_task_team[0] = NULL;
5554       team->t.t_task_team[1] = NULL;
5555 
5556       /* reallocate space for arguments if necessary */
5557       __kmp_alloc_argv_entries(argc, team, TRUE);
5558       KMP_CHECK_UPDATE(team->t.t_argc, argc);
5559 
5560       KA_TRACE(
5561           20, ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n",
5562                team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
5563       { // Initialize barrier data.
5564         int b;
5565         for (b = 0; b < bs_last_barrier; ++b) {
5566           team->t.t_bar[b].b_arrived = KMP_INIT_BARRIER_STATE;
5567 #if USE_DEBUGGER
5568           team->t.t_bar[b].b_master_arrived = 0;
5569           team->t.t_bar[b].b_team_arrived = 0;
5570 #endif
5571         }
5572       }
5573 
5574       team->t.t_proc_bind = new_proc_bind;
5575 
5576       KA_TRACE(20, ("__kmp_allocate_team: using team from pool %d.\n",
5577                     team->t.t_id));
5578 
5579 #if OMPT_SUPPORT
5580       __ompt_team_assign_id(team, ompt_parallel_data);
5581 #endif
5582 
5583       team->t.t_nested_nth = NULL;
5584 
5585       KMP_MB();
5586 
5587       return team;
5588     }
5589 
5590     /* reap team if it is too small, then loop back and check the next one */
5591     // not sure if this is wise, but it will be redone during the hot-teams
5592     // rewrite.
5593     /* TODO: Use technique to find the right size hot-team, don't reap them */
5594     team = __kmp_reap_team(team);
5595     __kmp_team_pool = team;
5596   }
5597 
5598   /* nothing available in the pool, no matter, make a new team! */
5599   KMP_MB();
5600   team = (kmp_team_t *)__kmp_allocate(sizeof(kmp_team_t));
5601 
5602   /* and set it up */
5603   team->t.t_max_nproc = max_nproc;
5604   if (max_nproc > 1 &&
5605       __kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5606     // Allocate barrier structure
5607     team->t.b = distributedBarrier::allocate(__kmp_dflt_team_nth_ub);
5608   }
5609 
5610   /* NOTE well: for some reason, allocating one big buffer and dividing it up
5611      seems to really hurt performance on the P4, so let's not use this. */
5612   __kmp_allocate_team_arrays(team, max_nproc);
5613 
5614   KA_TRACE(20, ("__kmp_allocate_team: making a new team\n"));
5615   __kmp_initialize_team(team, new_nproc, new_icvs, NULL);
5616 
5617   KA_TRACE(20, ("__kmp_allocate_team: setting task_team[0] %p and task_team[1] "
5618                 "%p to NULL\n",
5619                 &team->t.t_task_team[0], &team->t.t_task_team[1]));
5620   team->t.t_task_team[0] = NULL; // to be removed, as __kmp_allocate zeroes
5621   // memory, no need to duplicate
5622   team->t.t_task_team[1] = NULL; // to be removed, as __kmp_allocate zeroes
5623   // memory, no need to duplicate
5624 
5625   if (__kmp_storage_map) {
5626     __kmp_print_team_storage_map("team", team, team->t.t_id, new_nproc);
5627   }
5628 
5629   /* allocate space for arguments */
5630   __kmp_alloc_argv_entries(argc, team, FALSE);
5631   team->t.t_argc = argc;
5632 
5633   KA_TRACE(20,
5634            ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n",
5635             team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
5636   { // Initialize barrier data.
5637     int b;
5638     for (b = 0; b < bs_last_barrier; ++b) {
5639       team->t.t_bar[b].b_arrived = KMP_INIT_BARRIER_STATE;
5640 #if USE_DEBUGGER
5641       team->t.t_bar[b].b_master_arrived = 0;
5642       team->t.t_bar[b].b_team_arrived = 0;
5643 #endif
5644     }
5645   }
5646 
5647   team->t.t_proc_bind = new_proc_bind;
5648 
5649 #if OMPT_SUPPORT
5650   __ompt_team_assign_id(team, ompt_parallel_data);
5651   team->t.ompt_serialized_team_info = NULL;
5652 #endif
5653 
5654   KMP_MB();
5655 
5656   team->t.t_nested_nth = NULL;
5657 
5658   KA_TRACE(20, ("__kmp_allocate_team: done creating a new team %d.\n",
5659                 team->t.t_id));
5660 
5661   return team;
5662 }
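// Allocation strategy above, in outline: (1) reuse and resize the hot team
// where possible; (2) otherwise take an adequately sized team from
// __kmp_team_pool, reaping undersized pool entries along the way;
// (3) otherwise allocate and initialize a brand-new team.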
5663 
5664 /* TODO implement hot-teams at all levels */
5665 /* TODO implement lazy thread release on demand (disband request) */
5666 
5667 /* free the team.  return it to the team pool.  release all the threads
5668  * associated with it */
5669 void __kmp_free_team(kmp_root_t *root,
5670                      kmp_team_t *team USE_NESTED_HOT_ARG(kmp_info_t *master)) {
5671   int f;
5672   KA_TRACE(20, ("__kmp_free_team: T#%d freeing team %d\n", __kmp_get_gtid(),
5673                 team->t.t_id));
5674 
5675   /* verify state */
5676   KMP_DEBUG_ASSERT(root);
5677   KMP_DEBUG_ASSERT(team);
5678   KMP_DEBUG_ASSERT(team->t.t_nproc <= team->t.t_max_nproc);
5679   KMP_DEBUG_ASSERT(team->t.t_threads);
5680 
5681   int use_hot_team = team == root->r.r_hot_team;
5682 #if KMP_NESTED_HOT_TEAMS
5683   int level;
5684   if (master) {
5685     level = team->t.t_active_level - 1;
5686     if (master->th.th_teams_microtask) { // in teams construct?
5687       if (master->th.th_teams_size.nteams > 1) {
5688         ++level; // level was not increased in teams construct for
5689         // team_of_masters
5690       }
5691       if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
5692           master->th.th_teams_level == team->t.t_level) {
5693         ++level; // level was not increased in teams construct for
5694         // team_of_workers before the parallel
5695       } // team->t.t_level will be increased inside parallel
5696     }
5697 #if KMP_DEBUG
5698     kmp_hot_team_ptr_t *hot_teams = master->th.th_hot_teams;
5699 #endif
5700     if (level < __kmp_hot_teams_max_level) {
5701       KMP_DEBUG_ASSERT(team == hot_teams[level].hot_team);
5702       use_hot_team = 1;
5703     }
5704   }
5705 #endif // KMP_NESTED_HOT_TEAMS
5706 
5707   /* team is done working */
5708   TCW_SYNC_PTR(team->t.t_pkfn,
5709                NULL); // Important for Debugging Support Library.
5710 #if KMP_OS_WINDOWS
5711   team->t.t_copyin_counter = 0; // init counter for possible reuse
5712 #endif
5713   // Do not reset pointer to parent team to NULL for hot teams.
5714 
5715   /* if this is not a hot team, release its threads */
5716   if (!use_hot_team) {
5717     if (__kmp_tasking_mode != tskm_immediate_exec) {
5718       // Wait for threads to reach reapable state
5719       for (f = 1; f < team->t.t_nproc; ++f) {
5720         KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5721         kmp_info_t *th = team->t.t_threads[f];
5722         volatile kmp_uint32 *state = &th->th.th_reap_state;
5723         while (*state != KMP_SAFE_TO_REAP) {
5724 #if KMP_OS_WINDOWS
5725           // On Windows a thread can be killed at any time, check this
5726           DWORD ecode;
5727           if (!__kmp_is_thread_alive(th, &ecode)) {
5728             *state = KMP_SAFE_TO_REAP; // reset the flag for dead thread
5729             break;
5730           }
5731 #endif
5732           // first check if thread is sleeping
5733           if (th->th.th_sleep_loc)
5734             __kmp_null_resume_wrapper(th);
5735           KMP_CPU_PAUSE();
5736         }
5737       }
5738 
5739       // Delete task teams
5740       int tt_idx;
5741       for (tt_idx = 0; tt_idx < 2; ++tt_idx) {
5742         kmp_task_team_t *task_team = team->t.t_task_team[tt_idx];
5743         if (task_team != NULL) {
5744           for (f = 0; f < team->t.t_nproc; ++f) { // threads unref task teams
5745             KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5746             team->t.t_threads[f]->th.th_task_team = NULL;
5747           }
5748           KA_TRACE(
5749               20,
5750               ("__kmp_free_team: T#%d deactivating task_team %p on team %d\n",
5751                __kmp_get_gtid(), task_team, team->t.t_id));
5752 #if KMP_NESTED_HOT_TEAMS
5753           __kmp_free_task_team(master, task_team);
5754 #endif
5755           team->t.t_task_team[tt_idx] = NULL;
5756         }
5757       }
5758     }
5759 
5760     // Before clearing parent pointer, check if nested_nth list should be freed
5761     if (team->t.t_nested_nth && team->t.t_nested_nth != &__kmp_nested_nth &&
5762         team->t.t_nested_nth != team->t.t_parent->t.t_nested_nth) {
5763       KMP_INTERNAL_FREE(team->t.t_nested_nth->nth);
5764       KMP_INTERNAL_FREE(team->t.t_nested_nth);
5765     }
5766     team->t.t_nested_nth = NULL;
5767 
5768     // Reset pointer to parent team only for non-hot teams.
5769     team->t.t_parent = NULL;
5770     team->t.t_level = 0;
5771     team->t.t_active_level = 0;
5772 
5773     /* free the worker threads */
5774     for (f = 1; f < team->t.t_nproc; ++f) {
5775       KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5776       if (__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5777         (void)KMP_COMPARE_AND_STORE_ACQ32(
5778             &(team->t.t_threads[f]->th.th_used_in_team), 1, 2);
5779       }
5780       __kmp_free_thread(team->t.t_threads[f]);
5781     }
5782 
5783     if (__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5784       if (team->t.b) {
5785         // wake up thread at old location
5786         team->t.b->go_release();
5787         if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
5788           for (f = 1; f < team->t.t_nproc; ++f) {
5789             if (team->t.b->sleep[f].sleep) {
5790               __kmp_atomic_resume_64(
5791                   team->t.t_threads[f]->th.th_info.ds.ds_gtid,
5792                   (kmp_atomic_flag_64<> *)NULL);
5793             }
5794           }
5795         }
5796         // Wait for threads to be removed from team
5797         for (int f = 1; f < team->t.t_nproc; ++f) {
5798           while (team->t.t_threads[f]->th.th_used_in_team.load() != 0)
5799             KMP_CPU_PAUSE();
5800         }
5801       }
5802     }
5803 
5804     for (f = 1; f < team->t.t_nproc; ++f) {
5805       team->t.t_threads[f] = NULL;
5806     }
5807 
5808     if (team->t.t_max_nproc > 1 &&
5809         __kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5810       distributedBarrier::deallocate(team->t.b);
5811       team->t.b = NULL;
5812     }
5813     /* put the team back in the team pool */
5814     /* TODO limit size of team pool, call reap_team if pool too large */
5815     team->t.t_next_pool = CCAST(kmp_team_t *, __kmp_team_pool);
5816     __kmp_team_pool = (volatile kmp_team_t *)team;
5817   } else { // Check if team was created for primary threads in teams construct
5818     // See if first worker is a CG root
5819     KMP_DEBUG_ASSERT(team->t.t_threads[1] &&
5820                      team->t.t_threads[1]->th.th_cg_roots);
5821     if (team->t.t_threads[1]->th.th_cg_roots->cg_root == team->t.t_threads[1]) {
5822       // Clean up the CG root nodes on workers so that this team can be re-used
5823       for (f = 1; f < team->t.t_nproc; ++f) {
5824         kmp_info_t *thr = team->t.t_threads[f];
5825         KMP_DEBUG_ASSERT(thr && thr->th.th_cg_roots &&
5826                          thr->th.th_cg_roots->cg_root == thr);
5827         // Pop current CG root off list
5828         kmp_cg_root_t *tmp = thr->th.th_cg_roots;
5829         thr->th.th_cg_roots = tmp->up;
5830         KA_TRACE(100, ("__kmp_free_team: Thread %p popping node %p and moving"
5831                        " up to node %p. cg_nthreads was %d\n",
5832                        thr, tmp, thr->th.th_cg_roots, tmp->cg_nthreads));
5833         int i = tmp->cg_nthreads--;
5834         if (i == 1) {
5835           __kmp_free(tmp); // free CG if we are the last thread in it
5836         }
5837         // Restore current task's thread_limit from CG root
5838         if (thr->th.th_cg_roots)
5839           thr->th.th_current_task->td_icvs.thread_limit =
5840               thr->th.th_cg_roots->cg_thread_limit;
5841       }
5842     }
5843   }
5844 
5845   KMP_MB();
5846 }
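// The CG-root cleanup above pops nodes off an intrusive per-thread stack and
// frees a node once its last reference is gone. An illustrative sketch using
// a simplified, hypothetical node type (not the runtime's kmp_cg_root_t):
//
//   struct cg_node {
//     cg_node *up; // next node up the contention-group chain
//     int nthreads; // threads still referencing this node
//   };
//   // Pop the top node; free it when the last thread lets go of it.
//   static cg_node *cg_pop(cg_node *top) {
//     cg_node *up = top->up;
//     if (--top->nthreads == 0)
//       free(top);
//     return up;
//   }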
5847 
5848 /* reap the team.  destroy it, reclaim all its resources and free its memory */
5849 kmp_team_t *__kmp_reap_team(kmp_team_t *team) {
5850   kmp_team_t *next_pool = team->t.t_next_pool;
5851 
5852   KMP_DEBUG_ASSERT(team);
5853   KMP_DEBUG_ASSERT(team->t.t_dispatch);
5854   KMP_DEBUG_ASSERT(team->t.t_disp_buffer);
5855   KMP_DEBUG_ASSERT(team->t.t_threads);
5856   KMP_DEBUG_ASSERT(team->t.t_argv);
5857 
5858   /* TODO clean the threads that are a part of this? */
5859 
5860   /* free stuff */
5861   __kmp_free_team_arrays(team);
5862   if (team->t.t_argv != &team->t.t_inline_argv[0])
5863     __kmp_free((void *)team->t.t_argv);
5864   __kmp_free(team);
5865 
5866   KMP_MB();
5867   return next_pool;
5868 }
5869 
5870 // Free the thread.  Don't reap it, just place it on the pool of available
5871 // threads.
5872 //
5873 // Changes for Quad issue 527845: We need a predictable OMP tid <-> gtid
5874 // binding for the affinity mechanism to be useful.
5875 //
5876 // Now, we always keep the free list (__kmp_thread_pool) sorted by gtid.
5877 // However, we want to avoid a potential performance problem by always
5878 // scanning through the list to find the correct point at which to insert
5879 // the thread (potential N**2 behavior).  To do this we keep track of the
5880 // last place a thread struct was inserted (__kmp_thread_pool_insert_pt).
5881 // With single-level parallelism, threads will always be added to the tail
5882 // of the list, kept track of by __kmp_thread_pool_insert_pt.  With nested
5883 // parallelism, all bets are off and we may need to scan through the entire
5884 // free list.
5885 //
5886 // This change also has a potentially large performance benefit, for some
5887 // applications.  Previously, as threads were freed from the hot team, they
5888 // would be placed back on the free list in inverse order.  If the hot team
5889 // grew back to its original size, then the freed threads would be placed
5890 // back on the hot team in reverse order.  This could cause bad cache
5891 // locality problems on programs where the size of the hot team regularly
5892 // grew and shrank.
5893 //
5894 // Now, for single-level parallelism, the OMP tid is always == gtid.
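//
// An illustrative sketch (hypothetical names, not runtime code) of the
// hint-based sorted insert described above:
//
//   struct node { int gtid; node *next; };
//   // Insert n, keeping the list sorted by gtid. 'hint' caches the last
//   // insertion point so repeated tail inserts (the single-level case)
//   // stay O(1); otherwise we fall back to scanning from the head.
//   static node *sorted_insert(node **head, node *hint, node *n) {
//     node **scan = (hint && hint->gtid < n->gtid) ? &hint->next : head;
//     while (*scan && (*scan)->gtid < n->gtid)
//       scan = &(*scan)->next;
//     n->next = *scan;
//     *scan = n;
//     return n; // the inserted node becomes the new hint
//   }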
5895 void __kmp_free_thread(kmp_info_t *this_th) {
5896   int gtid;
5897   kmp_info_t **scan;
5898 
5899   KA_TRACE(20, ("__kmp_free_thread: T#%d putting T#%d back on free pool.\n",
5900                 __kmp_get_gtid(), this_th->th.th_info.ds.ds_gtid));
5901 
5902   KMP_DEBUG_ASSERT(this_th);
5903 
5904   // When moving a thread to the pool, switch it to wait on its own b_go flag
5905   // and leave it with no team (NULL team pointer).
5906   int b;
5907   kmp_balign_t *balign = this_th->th.th_bar;
5908   for (b = 0; b < bs_last_barrier; ++b) {
5909     if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG)
5910       balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG;
5911     balign[b].bb.team = NULL;
5912     balign[b].bb.leaf_kids = 0;
5913   }
5914   this_th->th.th_task_state = 0;
5915   this_th->th.th_reap_state = KMP_SAFE_TO_REAP;
5916 
5917   /* put thread back on the free pool */
5918   TCW_PTR(this_th->th.th_team, NULL);
5919   TCW_PTR(this_th->th.th_root, NULL);
5920   TCW_PTR(this_th->th.th_dispatch, NULL); /* NOT NEEDED */
5921 
5922   while (this_th->th.th_cg_roots) {
5923     this_th->th.th_cg_roots->cg_nthreads--;
5924     KA_TRACE(100, ("__kmp_free_thread: Thread %p decrement cg_nthreads on node"
5925                    " %p of thread %p to %d\n",
5926                    this_th, this_th->th.th_cg_roots,
5927                    this_th->th.th_cg_roots->cg_root,
5928                    this_th->th.th_cg_roots->cg_nthreads));
5929     kmp_cg_root_t *tmp = this_th->th.th_cg_roots;
5930     if (tmp->cg_root == this_th) { // Thread is a cg_root
5931       KMP_DEBUG_ASSERT(tmp->cg_nthreads == 0);
5932       KA_TRACE(
5933           5, ("__kmp_free_thread: Thread %p freeing node %p\n", this_th, tmp));
5934       this_th->th.th_cg_roots = tmp->up;
5935       __kmp_free(tmp);
5936     } else { // Worker thread
5937       if (tmp->cg_nthreads == 0) { // last thread leaves contention group
5938         __kmp_free(tmp);
5939       }
5940       this_th->th.th_cg_roots = NULL;
5941       break;
5942     }
5943   }
5944 
5945   /* If the implicit task assigned to this thread can be used by other
5946    * threads, then multiple threads may share the task data and try to free
5947    * it in __kmp_reap_thread at exit. This duplicate use of the task data can
5948    * happen with higher probability when the hot team is disabled, but it can
5949    * occur even when the hot team is enabled. */
5950   __kmp_free_implicit_task(this_th);
5951   this_th->th.th_current_task = NULL;
5952 
5953   // If the __kmp_thread_pool_insert_pt is already past the new insert
5954   // point, then we need to re-scan the entire list.
5955   gtid = this_th->th.th_info.ds.ds_gtid;
5956   if (__kmp_thread_pool_insert_pt != NULL) {
5957     KMP_DEBUG_ASSERT(__kmp_thread_pool != NULL);
5958     if (__kmp_thread_pool_insert_pt->th.th_info.ds.ds_gtid > gtid) {
5959       __kmp_thread_pool_insert_pt = NULL;
5960     }
5961   }
5962 
5963   // Scan down the list to find the place to insert the thread.
5964   // scan is the address of a link in the list, possibly the address of
5965   // __kmp_thread_pool itself.
5966   //
5967   // In the absence of nested parallelism, the for loop will have 0 iterations.
5968   if (__kmp_thread_pool_insert_pt != NULL) {
5969     scan = &(__kmp_thread_pool_insert_pt->th.th_next_pool);
5970   } else {
5971     scan = CCAST(kmp_info_t **, &__kmp_thread_pool);
5972   }
5973   for (; (*scan != NULL) && ((*scan)->th.th_info.ds.ds_gtid < gtid);
5974        scan = &((*scan)->th.th_next_pool))
5975     ;
5976 
5977   // Insert the new element on the list, and set __kmp_thread_pool_insert_pt
5978   // to its address.
5979   TCW_PTR(this_th->th.th_next_pool, *scan);
5980   __kmp_thread_pool_insert_pt = *scan = this_th;
5981   KMP_DEBUG_ASSERT((this_th->th.th_next_pool == NULL) ||
5982                    (this_th->th.th_info.ds.ds_gtid <
5983                     this_th->th.th_next_pool->th.th_info.ds.ds_gtid));
5984   TCW_4(this_th->th.th_in_pool, TRUE);
5985   __kmp_suspend_initialize_thread(this_th);
5986   __kmp_lock_suspend_mx(this_th);
5987   if (this_th->th.th_active == TRUE) {
5988     KMP_ATOMIC_INC(&__kmp_thread_pool_active_nth);
5989     this_th->th.th_active_in_pool = TRUE;
5990   }
5991 #if KMP_DEBUG
5992   else {
5993     KMP_DEBUG_ASSERT(this_th->th.th_active_in_pool == FALSE);
5994   }
5995 #endif
5996   __kmp_unlock_suspend_mx(this_th);
5997 
5998   TCW_4(__kmp_nth, __kmp_nth - 1);
5999 
6000 #ifdef KMP_ADJUST_BLOCKTIME
6001   /* Adjust blocktime back to user setting or default if necessary */
6002   /* Middle initialization might never have occurred                */
6003   if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
6004     KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
6005     if (__kmp_nth <= __kmp_avail_proc) {
6006       __kmp_zero_bt = FALSE;
6007     }
6008   }
6009 #endif /* KMP_ADJUST_BLOCKTIME */
6010 
6011   KMP_MB();
6012 }
6013 
6014 /* ------------------------------------------------------------------------ */
6015 
6016 void *__kmp_launch_thread(kmp_info_t *this_thr) {
6017 #if OMP_PROFILING_SUPPORT
6018   ProfileTraceFile = getenv("LIBOMPTARGET_PROFILE");
6019   // TODO: add a configuration option for time granularity
6020   if (ProfileTraceFile)
6021     llvm::timeTraceProfilerInitialize(500 /* us */, "libomptarget");
6022 #endif
6023 
6024   int gtid = this_thr->th.th_info.ds.ds_gtid;
6025   /*    void                 *stack_data;*/
6026   kmp_team_t **volatile pteam;
6027 
6028   KMP_MB();
6029   KA_TRACE(10, ("__kmp_launch_thread: T#%d start\n", gtid));
6030 
6031   if (__kmp_env_consistency_check) {
6032     this_thr->th.th_cons = __kmp_allocate_cons_stack(gtid); // ATT: Memory leak?
6033   }
6034 
6035 #if OMPD_SUPPORT
6036   if (ompd_state & OMPD_ENABLE_BP)
6037     ompd_bp_thread_begin();
6038 #endif
6039 
6040 #if OMPT_SUPPORT
6041   ompt_data_t *thread_data = nullptr;
6042   if (ompt_enabled.enabled) {
6043     thread_data = &(this_thr->th.ompt_thread_info.thread_data);
6044     *thread_data = ompt_data_none;
6045 
6046     this_thr->th.ompt_thread_info.state = ompt_state_overhead;
6047     this_thr->th.ompt_thread_info.wait_id = 0;
6048     this_thr->th.ompt_thread_info.idle_frame = OMPT_GET_FRAME_ADDRESS(0);
6049     this_thr->th.ompt_thread_info.parallel_flags = 0;
6050     if (ompt_enabled.ompt_callback_thread_begin) {
6051       ompt_callbacks.ompt_callback(ompt_callback_thread_begin)(
6052           ompt_thread_worker, thread_data);
6053     }
6054     this_thr->th.ompt_thread_info.state = ompt_state_idle;
6055   }
6056 #endif
6057 
6058   /* This is the place where threads wait for work */
6059   while (!TCR_4(__kmp_global.g.g_done)) {
6060     KMP_DEBUG_ASSERT(this_thr == __kmp_threads[gtid]);
6061     KMP_MB();
6062 
6063     /* wait for work to do */
6064     KA_TRACE(20, ("__kmp_launch_thread: T#%d waiting for work\n", gtid));
6065 
6066     /* No tid yet since not part of a team */
6067     __kmp_fork_barrier(gtid, KMP_GTID_DNE);
6068 
6069 #if OMPT_SUPPORT
6070     if (ompt_enabled.enabled) {
6071       this_thr->th.ompt_thread_info.state = ompt_state_overhead;
6072     }
6073 #endif
6074 
6075     pteam = &this_thr->th.th_team;
6076 
6077     /* have we been allocated? */
6078     if (TCR_SYNC_PTR(*pteam) && !TCR_4(__kmp_global.g.g_done)) {
6079       /* we were just woken up, so run our new task */
6080       if (TCR_SYNC_PTR((*pteam)->t.t_pkfn) != NULL) {
6081         int rc;
6082         KA_TRACE(20,
6083                  ("__kmp_launch_thread: T#%d(%d:%d) invoke microtask = %p\n",
6084                   gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid),
6085                   (*pteam)->t.t_pkfn));
6086 
6087         updateHWFPControl(*pteam);
6088 
6089 #if OMPT_SUPPORT
6090         if (ompt_enabled.enabled) {
6091           this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
6092         }
6093 #endif
6094 
6095         rc = (*pteam)->t.t_invoke(gtid);
6096         KMP_ASSERT(rc);
6097 
6098         KMP_MB();
6099         KA_TRACE(20, ("__kmp_launch_thread: T#%d(%d:%d) done microtask = %p\n",
6100                       gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid),
6101                       (*pteam)->t.t_pkfn));
6102       }
6103 #if OMPT_SUPPORT
6104       if (ompt_enabled.enabled) {
6105         /* no frame set while outside task */
6106         __ompt_get_task_info_object(0)->frame.exit_frame = ompt_data_none;
6107 
6108         this_thr->th.ompt_thread_info.state = ompt_state_overhead;
6109       }
6110 #endif
6111       /* join barrier after parallel region */
6112       __kmp_join_barrier(gtid);
6113     }
6114   }
6115 
6116 #if OMPD_SUPPORT
6117   if (ompd_state & OMPD_ENABLE_BP)
6118     ompd_bp_thread_end();
6119 #endif
6120 
6121 #if OMPT_SUPPORT
6122   if (ompt_enabled.ompt_callback_thread_end) {
6123     ompt_callbacks.ompt_callback(ompt_callback_thread_end)(thread_data);
6124   }
6125 #endif
6126 
6127   this_thr->th.th_task_team = NULL;
6128   /* run the destructors for the threadprivate data for this thread */
6129   __kmp_common_destroy_gtid(gtid);
6130 
6131   KA_TRACE(10, ("__kmp_launch_thread: T#%d done\n", gtid));
6132   KMP_MB();
6133 
6134 #if OMP_PROFILING_SUPPORT
6135   llvm::timeTraceProfilerFinishThread();
6136 #endif
6137   return this_thr;
6138 }
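// Worker lifecycle implemented above, in outline (pseudocode, not a policy
// statement):
//
//   while (!__kmp_global.g.g_done) {
//     __kmp_fork_barrier(gtid, KMP_GTID_DNE); // wait to be handed a team
//     if (team assigned and not shutting down) {
//       if (team has a microtask)
//         (*pteam)->t.t_invoke(gtid); // run the microtask
//       __kmp_join_barrier(gtid); // rendezvous after the parallel region
//     }
//   }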
6139 
6140 /* ------------------------------------------------------------------------ */
6141 
6142 void __kmp_internal_end_dest(void *specific_gtid) {
6143   // Make sure no significant bits are lost
6144   int gtid;
6145   __kmp_type_convert((kmp_intptr_t)specific_gtid - 1, &gtid);
6146 
6147   KA_TRACE(30, ("__kmp_internal_end_dest: T#%d\n", gtid));
6148   /* NOTE: the gtid is stored as gtid+1 in the thread-local storage,
6149    * because 0 is reserved for the nothing-stored case */
6150 
6151   __kmp_internal_end_thread(gtid);
6152 }
6153 
6154 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB
6155 
6156 __attribute__((destructor)) void __kmp_internal_end_dtor(void) {
6157   __kmp_internal_end_atexit();
6158 }
6159 
6160 #endif
6161 
6162 /* [Windows] josh: when the atexit handler is called, there may still be more
6163    than one thread alive */
6164 void __kmp_internal_end_atexit(void) {
6165   KA_TRACE(30, ("__kmp_internal_end_atexit\n"));
6166   /* [Windows]
6167      josh: ideally, we want to completely shut down the library in this atexit
6168      handler, but stat code that depends on thread specific data for gtid fails
6169      because that data becomes unavailable at some point during the shutdown, so
6170      we call __kmp_internal_end_thread instead. We should eventually remove the
6171      dependency on __kmp_get_specific_gtid in the stat code and use
6172      __kmp_internal_end_library to cleanly shut down the library.
6173 
6174      // TODO: Can some of this comment about GVS be removed?
6175      I suspect that the offending stat code is executed when the calling thread
6176      tries to clean up a dead root thread's data structures, resulting in GVS
6177      code trying to close the GVS structures for that thread, but since the stat
6178      code uses __kmp_get_specific_gtid to get the gtid with the assumption that
6179      the calling thread is cleaning up itself instead of another thread, it gets
6180      confused. This happens because allowing a thread to unregister and clean up
6181      another thread is a recent modification for addressing an issue.
6182      Based on the current design (20050722), a thread may end up
6183      trying to unregister another thread only if thread death does not trigger
6184      the calling of __kmp_internal_end_thread.  For Linux* OS, there is the
6185      thread specific data destructor function to detect thread death. For
6186      Windows dynamic, there is DllMain(THREAD_DETACH). For Windows static, there
6187      is nothing.  Thus, the workaround is applicable only for the Windows static
6188      stat library. */
6189   __kmp_internal_end_library(-1);
6190 #if KMP_OS_WINDOWS
6191   __kmp_close_console();
6192 #endif
6193 }
6194 
6195 static void __kmp_reap_thread(kmp_info_t *thread, int is_root) {
6196   // It is assumed __kmp_forkjoin_lock is acquired.
6197 
6198   int gtid;
6199 
6200   KMP_DEBUG_ASSERT(thread != NULL);
6201 
6202   gtid = thread->th.th_info.ds.ds_gtid;
6203 
6204   if (!is_root) {
6205     if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
6206       /* Assume the threads are at the fork barrier here */
6207       KA_TRACE(
6208           20, ("__kmp_reap_thread: releasing T#%d from fork barrier for reap\n",
6209                gtid));
6210       if (__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
6211         while (
6212             !KMP_COMPARE_AND_STORE_ACQ32(&(thread->th.th_used_in_team), 0, 3))
6213           KMP_CPU_PAUSE();
6214         __kmp_resume_32(gtid, (kmp_flag_32<false, false> *)NULL);
6215       } else {
6216         /* Need release fence here to prevent seg faults for tree forkjoin
6217            barrier (GEH) */
6218         kmp_flag_64<> flag(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go,
6219                            thread);
6220         __kmp_release_64(&flag);
6221       }
6222     }
6223 
6224     // Terminate OS thread.
6225     __kmp_reap_worker(thread);
6226 
6227     // The thread was killed asynchronously.  If it was actively
6228     // spinning in the thread pool, decrement the global count.
6229     //
6230     // There is a small timing hole here - if the worker thread was just waking
6231     // up after sleeping in the pool, had reset its th_active_in_pool flag but
6232     // not decremented the global counter __kmp_thread_pool_active_nth yet, then
6233     // the global counter might not get updated.
6234     //
6235     // Currently, this can only happen as the library is unloaded,
6236     // so there are no harmful side effects.
6237     if (thread->th.th_active_in_pool) {
6238       thread->th.th_active_in_pool = FALSE;
6239       KMP_ATOMIC_DEC(&__kmp_thread_pool_active_nth);
6240       KMP_DEBUG_ASSERT(__kmp_thread_pool_active_nth >= 0);
6241     }
6242   }
6243 
6244   __kmp_free_implicit_task(thread);
6245 
6246 // Free the fast memory for tasking
6247 #if USE_FAST_MEMORY
6248   __kmp_free_fast_memory(thread);
6249 #endif /* USE_FAST_MEMORY */
6250 
6251   __kmp_suspend_uninitialize_thread(thread);
6252 
6253   KMP_DEBUG_ASSERT(__kmp_threads[gtid] == thread);
6254   TCW_SYNC_PTR(__kmp_threads[gtid], NULL);
6255 
6256   --__kmp_all_nth;
6257   // __kmp_nth was decremented when the thread was added to the pool.
6258 
6259 #ifdef KMP_ADJUST_BLOCKTIME
6260   /* Adjust blocktime back to user setting or default if necessary */
6261   /* Middle initialization might never have occurred                */
6262   if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
6263     KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
6264     if (__kmp_nth <= __kmp_avail_proc) {
6265       __kmp_zero_bt = FALSE;
6266     }
6267   }
6268 #endif /* KMP_ADJUST_BLOCKTIME */
6269 
6270   /* free the memory being used */
6271   if (__kmp_env_consistency_check) {
6272     if (thread->th.th_cons) {
6273       __kmp_free_cons_stack(thread->th.th_cons);
6274       thread->th.th_cons = NULL;
6275     }
6276   }
6277 
6278   if (thread->th.th_pri_common != NULL) {
6279     __kmp_free(thread->th.th_pri_common);
6280     thread->th.th_pri_common = NULL;
6281   }
6282 
6283 #if KMP_USE_BGET
6284   if (thread->th.th_local.bget_data != NULL) {
6285     __kmp_finalize_bget(thread);
6286   }
6287 #endif
6288 
6289 #if KMP_AFFINITY_SUPPORTED
6290   if (thread->th.th_affin_mask != NULL) {
6291     KMP_CPU_FREE(thread->th.th_affin_mask);
6292     thread->th.th_affin_mask = NULL;
6293   }
6294 #endif /* KMP_AFFINITY_SUPPORTED */
6295 
6296 #if KMP_USE_HIER_SCHED
6297   if (thread->th.th_hier_bar_data != NULL) {
6298     __kmp_free(thread->th.th_hier_bar_data);
6299     thread->th.th_hier_bar_data = NULL;
6300   }
6301 #endif
6302 
6303   __kmp_reap_team(thread->th.th_serial_team);
6304   thread->th.th_serial_team = NULL;
6305   __kmp_free(thread);
6306 
6307   KMP_MB();
6308 
6309 } // __kmp_reap_thread
6310 
6311 static void __kmp_itthash_clean(kmp_info_t *th) {
6312 #if USE_ITT_NOTIFY
6313   if (__kmp_itt_region_domains.count > 0) {
6314     for (int i = 0; i < KMP_MAX_FRAME_DOMAINS; ++i) {
6315       kmp_itthash_entry_t *bucket = __kmp_itt_region_domains.buckets[i];
6316       while (bucket) {
6317         kmp_itthash_entry_t *next = bucket->next_in_bucket;
6318         __kmp_thread_free(th, bucket);
6319         bucket = next;
6320       }
6321     }
6322   }
6323   if (__kmp_itt_barrier_domains.count > 0) {
6324     for (int i = 0; i < KMP_MAX_FRAME_DOMAINS; ++i) {
6325       kmp_itthash_entry_t *bucket = __kmp_itt_barrier_domains.buckets[i];
6326       while (bucket) {
6327         kmp_itthash_entry_t *next = bucket->next_in_bucket;
6328         __kmp_thread_free(th, bucket);
6329         bucket = next;
6330       }
6331     }
6332   }
6333 #endif
6334 }
6335 
6336 static void __kmp_internal_end(void) {
6337   int i;
6338 
6339   /* First, unregister the library */
6340   __kmp_unregister_library();
6341 
6342 #if KMP_OS_WINDOWS
6343   /* In Win static library, we can't tell when a root actually dies, so we
6344      reclaim the data structures for any root threads that have died but not
6345      unregistered themselves, in order to shut down cleanly.
6346      In Win dynamic library we also can't tell when a thread dies.  */
6347   __kmp_reclaim_dead_roots(); // AC: moved here to always clean resources of
6348 // dead roots
6349 #endif
6350 
6351   for (i = 0; i < __kmp_threads_capacity; i++)
6352     if (__kmp_root[i])
6353       if (__kmp_root[i]->r.r_active)
6354         break;
6355   KMP_MB(); /* Flush all pending memory write invalidates.  */
6356   TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6357 
6358   if (i < __kmp_threads_capacity) {
6359 #if KMP_USE_MONITOR
6360     // 2009-09-08 (lev): Other alive roots found. Why do we kill the monitor??
6361     KMP_MB(); /* Flush all pending memory write invalidates.  */
6362 
6363     // Need to check that monitor was initialized before reaping it. If we are
6364     // called from __kmp_atfork_child (which sets __kmp_init_parallel = 0), then
6365     // __kmp_monitor will appear to contain valid data, but it is only valid in
6366     // the parent process, not the child.
6367     // New behavior (201008): instead of keying off of the flag
6368     // __kmp_init_parallel, the monitor thread creation is keyed off
6369     // of the new flag __kmp_init_monitor.
6370     __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
6371     if (TCR_4(__kmp_init_monitor)) {
6372       __kmp_reap_monitor(&__kmp_monitor);
6373       TCW_4(__kmp_init_monitor, 0);
6374     }
6375     __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
6376     KA_TRACE(10, ("__kmp_internal_end: monitor reaped\n"));
6377 #endif // KMP_USE_MONITOR
6378   } else {
6379 /* TODO move this to cleanup code */
6380 #ifdef KMP_DEBUG
6381     /* make sure that everything has properly ended */
6382     for (i = 0; i < __kmp_threads_capacity; i++) {
6383       if (__kmp_root[i]) {
6384         //                    KMP_ASSERT( ! KMP_UBER_GTID( i ) );         // AC:
6385         //                    there can be uber threads alive here
6386         KMP_ASSERT(!__kmp_root[i]->r.r_active); // TODO: can they be active?
6387       }
6388     }
6389 #endif
6390 
6391     KMP_MB();
6392 
6393     // Reap the worker threads.
6394     // This is valid for now, but be careful if threads are reaped sooner.
6395     while (__kmp_thread_pool != NULL) { // Loop through all threads in the pool.
6396       // Get the next thread from the pool.
6397       kmp_info_t *thread = CCAST(kmp_info_t *, __kmp_thread_pool);
6398       __kmp_thread_pool = thread->th.th_next_pool;
6399       // Reap it.
6400       KMP_DEBUG_ASSERT(thread->th.th_reap_state == KMP_SAFE_TO_REAP);
6401       thread->th.th_next_pool = NULL;
6402       thread->th.th_in_pool = FALSE;
6403       __kmp_reap_thread(thread, 0);
6404     }
6405     __kmp_thread_pool_insert_pt = NULL;
6406 
6407     // Reap teams.
6408     while (__kmp_team_pool != NULL) { // Loop through all teams in the pool.
6409       // Get the next team from the pool.
6410       kmp_team_t *team = CCAST(kmp_team_t *, __kmp_team_pool);
6411       __kmp_team_pool = team->t.t_next_pool;
6412       // Reap it.
6413       team->t.t_next_pool = NULL;
6414       __kmp_reap_team(team);
6415     }
6416 
6417     __kmp_reap_task_teams();
6418 
6419 #if KMP_OS_UNIX
6420     // Threads that are not reaped should not access any resources since they
6421     // are going to be deallocated soon, so the shutdown sequence should wait
6422     // until all threads either exit the final spin-waiting loop or begin
6423     // sleeping after the given blocktime.
6424     for (i = 0; i < __kmp_threads_capacity; i++) {
6425       kmp_info_t *thr = __kmp_threads[i];
6426       while (thr && KMP_ATOMIC_LD_ACQ(&thr->th.th_blocking))
6427         KMP_CPU_PAUSE();
6428     }
6429 #endif
6430 
6431     for (i = 0; i < __kmp_threads_capacity; ++i) {
6432       // TBD: Add some checking...
6433       // Something like KMP_DEBUG_ASSERT( __kmp_thread[ i ] == NULL );
6434     }
6435 
6436     /* Make sure all threadprivate destructors get run by joining with all
6437        worker threads before resetting this flag */
6438     TCW_SYNC_4(__kmp_init_common, FALSE);
6439 
6440     KA_TRACE(10, ("__kmp_internal_end: all workers reaped\n"));
6441     KMP_MB();
6442 
6443 #if KMP_USE_MONITOR
6444     // See note above: One of the possible fixes for CQ138434 / CQ140126
6445     //
6446     // FIXME: push both code fragments down and CSE them?
6447     // push them into __kmp_cleanup() ?
6448     __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
6449     if (TCR_4(__kmp_init_monitor)) {
6450       __kmp_reap_monitor(&__kmp_monitor);
6451       TCW_4(__kmp_init_monitor, 0);
6452     }
6453     __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
6454     KA_TRACE(10, ("__kmp_internal_end: monitor reaped\n"));
6455 #endif
6456   } /* else !__kmp_global.t_active */
6457   TCW_4(__kmp_init_gtid, FALSE);
6458   KMP_MB(); /* Flush all pending memory write invalidates.  */
6459 
6460   __kmp_cleanup();
6461 #if OMPT_SUPPORT
6462   ompt_fini();
6463 #endif
6464 }
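// Shutdown sequence above, in outline: unregister the library; if some root
// is still active, reap only the monitor thread (when KMP_USE_MONITOR);
// otherwise reap every pooled worker thread and team, reap the task teams,
// wait out any threads still spin-waiting (on Unix), and finish with
// __kmp_cleanup() and, if enabled, ompt_fini().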
6465 
6466 void __kmp_internal_end_library(int gtid_req) {
6467   /* if we have already cleaned up, don't try again, it wouldn't be pretty */
6468   /* this shouldn't be a race condition because __kmp_internal_end() is the
6469      only place to clear __kmp_serial_init */
6470   /* we'll check this later too, after we get the lock */
6471   // 2009-09-06: We do not set g_abort without setting g_done. This check looks
6472   // redundant, because the next check will work in any case.
6473   if (__kmp_global.g.g_abort) {
6474     KA_TRACE(11, ("__kmp_internal_end_library: abort, exiting\n"));
6475     /* TODO abort? */
6476     return;
6477   }
6478   if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6479     KA_TRACE(10, ("__kmp_internal_end_library: already finished\n"));
6480     return;
6481   }
6482 
6483   // If hidden helper team has been initialized, we need to deinit it
6484   if (TCR_4(__kmp_init_hidden_helper) &&
6485       !TCR_4(__kmp_hidden_helper_team_done)) {
6486     TCW_SYNC_4(__kmp_hidden_helper_team_done, TRUE);
6487     // First release the main thread to let it continue its work
6488     __kmp_hidden_helper_main_thread_release();
6489     // Wait until the hidden helper team has been destroyed
6490     __kmp_hidden_helper_threads_deinitz_wait();
6491   }
6492 
6493   KMP_MB(); /* Flush all pending memory write invalidates.  */
6494   /* find out who we are and what we should do */
6495   {
6496     int gtid = (gtid_req >= 0) ? gtid_req : __kmp_gtid_get_specific();
6497     KA_TRACE(
6498         10, ("__kmp_internal_end_library: enter T#%d  (%d)\n", gtid, gtid_req));
6499     if (gtid == KMP_GTID_SHUTDOWN) {
6500       KA_TRACE(10, ("__kmp_internal_end_library: !__kmp_init_runtime, system "
6501                     "already shutdown\n"));
6502       return;
6503     } else if (gtid == KMP_GTID_MONITOR) {
6504       KA_TRACE(10, ("__kmp_internal_end_library: monitor thread, gtid not "
6505                     "registered, or system shutdown\n"));
6506       return;
6507     } else if (gtid == KMP_GTID_DNE) {
6508       KA_TRACE(10, ("__kmp_internal_end_library: gtid not registered or system "
6509                     "shutdown\n"));
6510       /* we don't know who we are, but we may still shut down the library */
6511     } else if (KMP_UBER_GTID(gtid)) {
6512       /* unregister ourselves as an uber thread.  gtid is no longer valid */
6513       if (__kmp_root[gtid]->r.r_active) {
6514         __kmp_global.g.g_abort = -1;
6515         TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6516         __kmp_unregister_library();
6517         KA_TRACE(10,
6518                  ("__kmp_internal_end_library: root still active, abort T#%d\n",
6519                   gtid));
6520         return;
6521       } else {
6522         __kmp_itthash_clean(__kmp_threads[gtid]);
6523         KA_TRACE(
6524             10,
6525             ("__kmp_internal_end_library: unregistering sibling T#%d\n", gtid));
6526         __kmp_unregister_root_current_thread(gtid);
6527       }
6528     } else {
6529 /* worker threads may call this function through the atexit handler, if they
6530  * call exit() */
6531 /* For now, skip the usual subsequent processing and just dump the debug buffer.
6532    TODO: do a thorough shutdown instead */
6533 #ifdef DUMP_DEBUG_ON_EXIT
6534       if (__kmp_debug_buf)
6535         __kmp_dump_debug_buffer();
6536 #endif
6537       // We unregister the library here; once registration switched to shm on
6538       // Linux, skipping this would leave lots of stale files in /dev/shm.
6539       // Clean up the shared memory file before exiting.
6540       __kmp_unregister_library();
6541       return;
6542     }
6543   }
6544   /* synchronize the termination process */
6545   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6546 
6547   /* have we already finished */
6548   if (__kmp_global.g.g_abort) {
6549     KA_TRACE(10, ("__kmp_internal_end_library: abort, exiting\n"));
6550     /* TODO abort? */
6551     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6552     return;
6553   }
6554   if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6555     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6556     return;
6557   }
6558 
6559   /* We need this lock to enforce mutex between this reading of
6560      __kmp_threads_capacity and the writing by __kmp_register_root.
6561      Alternatively, we can use a counter of roots that is atomically updated by
6562      __kmp_get_global_thread_id_reg, __kmp_do_serial_initialize and
6563      __kmp_internal_end_*.  */
6564   __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
6565 
6566   /* now we can safely conduct the actual termination */
6567   __kmp_internal_end();
6568 
6569   __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6570   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6571 
6572   KA_TRACE(10, ("__kmp_internal_end_library: exit\n"));
6573 
6574 #ifdef DUMP_DEBUG_ON_EXIT
6575   if (__kmp_debug_buf)
6576     __kmp_dump_debug_buffer();
6577 #endif
6578 
6579 #if KMP_OS_WINDOWS
6580   __kmp_close_console();
6581 #endif
6582 
6583   __kmp_fini_allocator();
6584 
6585 } // __kmp_internal_end_library
6586 
6587 void __kmp_internal_end_thread(int gtid_req) {
6588   int i;
6589 
6590   /* if we have already cleaned up, don't try again, it wouldn't be pretty */
6591   /* this shouldn't be a race condition because __kmp_internal_end() is the
6592    * only place to clear __kmp_serial_init */
6593   /* we'll check this later too, after we get the lock */
6594   // 2009-09-06: We do not set g_abort without setting g_done. This check looks
6595   // redundant, because the next check will work in any case.
6596   if (__kmp_global.g.g_abort) {
6597     KA_TRACE(11, ("__kmp_internal_end_thread: abort, exiting\n"));
6598     /* TODO abort? */
6599     return;
6600   }
6601   if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6602     KA_TRACE(10, ("__kmp_internal_end_thread: already finished\n"));
6603     return;
6604   }
6605 
6606   // If hidden helper team has been initialized, we need to deinit it
6607   if (TCR_4(__kmp_init_hidden_helper) &&
6608       !TCR_4(__kmp_hidden_helper_team_done)) {
6609     TCW_SYNC_4(__kmp_hidden_helper_team_done, TRUE);
6610     // First release the main thread to let it continue its work
6611     __kmp_hidden_helper_main_thread_release();
6612     // Wait until the hidden helper team has been destroyed
6613     __kmp_hidden_helper_threads_deinitz_wait();
6614   }
6615 
6616   KMP_MB(); /* Flush all pending memory write invalidates.  */
6617 
6618   /* find out who we are and what we should do */
6619   {
6620     int gtid = (gtid_req >= 0) ? gtid_req : __kmp_gtid_get_specific();
6621     KA_TRACE(10,
6622              ("__kmp_internal_end_thread: enter T#%d  (%d)\n", gtid, gtid_req));
6623     if (gtid == KMP_GTID_SHUTDOWN) {
6624       KA_TRACE(10, ("__kmp_internal_end_thread: !__kmp_init_runtime, system "
6625                     "already shutdown\n"));
6626       return;
6627     } else if (gtid == KMP_GTID_MONITOR) {
6628       KA_TRACE(10, ("__kmp_internal_end_thread: monitor thread, gtid not "
6629                     "registered, or system shutdown\n"));
6630       return;
6631     } else if (gtid == KMP_GTID_DNE) {
6632       KA_TRACE(10, ("__kmp_internal_end_thread: gtid not registered or system "
6633                     "shutdown\n"));
6634       return;
6635       /* we don't know who we are */
6636     } else if (KMP_UBER_GTID(gtid)) {
6637       /* unregister ourselves as an uber thread.  gtid is no longer valid */
6638       if (__kmp_root[gtid]->r.r_active) {
6639         __kmp_global.g.g_abort = -1;
6640         TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6641         KA_TRACE(10,
6642                  ("__kmp_internal_end_thread: root still active, abort T#%d\n",
6643                   gtid));
6644         return;
6645       } else {
6646         KA_TRACE(10, ("__kmp_internal_end_thread: unregistering sibling T#%d\n",
6647                       gtid));
6648         __kmp_unregister_root_current_thread(gtid);
6649       }
6650     } else {
6651       /* just a worker thread, let's leave */
6652       KA_TRACE(10, ("__kmp_internal_end_thread: worker thread T#%d\n", gtid));
6653 
6654       if (gtid >= 0) {
6655         __kmp_threads[gtid]->th.th_task_team = NULL;
6656       }
6657 
6658       KA_TRACE(10,
6659                ("__kmp_internal_end_thread: worker thread done, exiting T#%d\n",
6660                 gtid));
6661       return;
6662     }
6663   }
6664 #if KMP_DYNAMIC_LIB
6665   if (__kmp_pause_status != kmp_hard_paused)
6666   // AC: let's not shut down the dynamic library at the exit of an uber thread;
6667   // it is better to shut down later, in the library destructor.
6668   {
6669     KA_TRACE(10, ("__kmp_internal_end_thread: exiting T#%d\n", gtid_req));
6670     return;
6671   }
6672 #endif
6673   /* synchronize the termination process */
6674   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6675 
6676   /* have we already finished */
6677   if (__kmp_global.g.g_abort) {
6678     KA_TRACE(10, ("__kmp_internal_end_thread: abort, exiting\n"));
6679     /* TODO abort? */
6680     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6681     return;
6682   }
6683   if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6684     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6685     return;
6686   }
6687 
6688   /* We need this lock to enforce mutex between this reading of
6689      __kmp_threads_capacity and the writing by __kmp_register_root.
6690      Alternatively, we can use a counter of roots that is atomically updated by
6691      __kmp_get_global_thread_id_reg, __kmp_do_serial_initialize and
6692      __kmp_internal_end_*.  */
6693 
6694   /* should we finish the run-time?  are all siblings done? */
6695   __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
6696 
6697   for (i = 0; i < __kmp_threads_capacity; ++i) {
6698     if (KMP_UBER_GTID(i)) {
6699       KA_TRACE(
6700           10,
6701           ("__kmp_internal_end_thread: remaining sibling task: gtid==%d\n", i));
6702       __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6703       __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6704       return;
6705     }
6706   }
6707 
6708   /* now we can safely conduct the actual termination */
6709 
6710   __kmp_internal_end();
6711 
6712   __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6713   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6714 
6715   KA_TRACE(10, ("__kmp_internal_end_thread: exit T#%d\n", gtid_req));
6716 
6717 #ifdef DUMP_DEBUG_ON_EXIT
6718   if (__kmp_debug_buf)
6719     __kmp_dump_debug_buffer();
6720 #endif
6721 } // __kmp_internal_end_thread
6722 
6723 // -----------------------------------------------------------------------------
6724 // Library registration stuff.
6725 
6726 static long __kmp_registration_flag = 0;
6727 // Random value used to indicate library initialization.
6728 static char *__kmp_registration_str = NULL;
6729 // Value to be saved in env var __KMP_REGISTERED_LIB_<pid>.
6730 
6731 static inline char *__kmp_reg_status_name() {
6732 /* On RHEL 3u5 if linked statically, getpid() returns different values in
6733    each thread. If registration and unregistration go in different threads
6734    (omp_misc_other_root_exit.cpp test case), the name of the registered_lib_env
6735    env var cannot be found, because the name will contain a different pid. */
6736 // macOS* complains about name being too long with additional getuid()
6737 #if KMP_OS_UNIX && !KMP_OS_DARWIN && KMP_DYNAMIC_LIB
6738   return __kmp_str_format("__KMP_REGISTERED_LIB_%d_%d", (int)getpid(),
6739                           (int)getuid());
6740 #else
6741   return __kmp_str_format("__KMP_REGISTERED_LIB_%d", (int)getpid());
6742 #endif
6743 } // __kmp_reg_status_name
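// E.g., with a hypothetical pid of 4242 and uid of 1000, this yields
// "__KMP_REGISTERED_LIB_4242_1000" on non-macOS Unix dynamic builds, and
// "__KMP_REGISTERED_LIB_4242" everywhere else.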
6744 
6745 #if defined(KMP_USE_SHM)
6746 bool __kmp_shm_available = false;
6747 bool __kmp_tmp_available = false;
6748 // If /dev/shm is not accessible, we will create a temporary file under /tmp.
6749 char *temp_reg_status_file_name = nullptr;
6750 #endif
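// Registration handshake, in outline: the runtime writes
// "<flag address>-<flag value>-<library file>" into a well-known place (an
// environment variable, a /dev/shm segment, or a /tmp file); a second copy
// of the runtime that finds an existing value must decide whether it belongs
// to a live instance. A minimal sketch of such a check, using a hypothetical
// helper (not the runtime's actual parsing code) and assuming the
// "%p-%lx-%s" format written below:
//
//   static bool value_matches_self(const char *val, long flag) {
//     void *addr = nullptr;
//     long f = 0;
//     char file[256];
//     return sscanf(val, "%p-%lx-%255s", &addr, &f, file) == 3 &&
//            addr == (void *)&__kmp_registration_flag && f == flag;
//   }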
6751 
6752 void __kmp_register_library_startup(void) {
6753 
6754   char *name = __kmp_reg_status_name(); // Name of the environment variable.
6755   int done = 0;
6756   union {
6757     double dtime;
6758     long ltime;
6759   } time;
6760 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
6761   __kmp_initialize_system_tick();
6762 #endif
6763   __kmp_read_system_time(&time.dtime);
6764   __kmp_registration_flag = 0xCAFE0000L | (time.ltime & 0x0000FFFFL);
6765   __kmp_registration_str =
6766       __kmp_str_format("%p-%lx-%s", &__kmp_registration_flag,
6767                        __kmp_registration_flag, KMP_LIBRARY_FILE);
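  // With hypothetical values, the resulting string might read
  // "0x7f12a4c0d040-cafe3a7b-libomp.so": the flag's address, the flag's
  // value, and the library file name, which together identify this
  // particular runtime instance.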
6768 
6769   KA_TRACE(50, ("__kmp_register_library_startup: %s=\"%s\"\n", name,
6770                 __kmp_registration_str));
6771 
6772   while (!done) {
6773 
6774     char *value = NULL; // Actual value of the environment variable.
6775 
6776 #if defined(KMP_USE_SHM)
6777     char *shm_name = nullptr;
6778     char *data1 = nullptr;
6779     __kmp_shm_available = __kmp_detect_shm();
6780     if (__kmp_shm_available) {
6781       int fd1 = -1;
6782       shm_name = __kmp_str_format("/%s", name);
6783       int shm_preexist = 0;
6784       fd1 = shm_open(shm_name, O_CREAT | O_EXCL | O_RDWR, 0600);
6785       if ((fd1 == -1) && (errno == EEXIST)) {
6786         // file didn't open because it already exists.
6787         // try opening existing file
6788         fd1 = shm_open(shm_name, O_RDWR, 0600);
6789         if (fd1 == -1) { // file didn't open
6790           KMP_WARNING(FunctionError, "Can't open SHM");
6791           __kmp_shm_available = false;
6792         } else { // able to open existing file
6793           shm_preexist = 1;
6794         }
6795       }
6796       if (__kmp_shm_available && shm_preexist == 0) { // SHM created, set size
6797         if (ftruncate(fd1, SHM_SIZE) == -1) { // error occurred setting size
6798           KMP_WARNING(FunctionError, "Can't set size of SHM");
6799           __kmp_shm_available = false;
6800         }
6801       }
6802       if (__kmp_shm_available) { // SHM exists, now map it
6803         data1 = (char *)mmap(0, SHM_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED,
6804                              fd1, 0);
6805         if (data1 == MAP_FAILED) { // failed to map shared memory
6806           KMP_WARNING(FunctionError, "Can't map SHM");
6807           __kmp_shm_available = false;
6808         }
6809       }
6810       if (__kmp_shm_available) { // SHM mapped
6811         if (shm_preexist == 0) { // set data to SHM, set value
6812           KMP_STRCPY_S(data1, SHM_SIZE, __kmp_registration_str);
6813         }
6814         // Read value from either what we just wrote or existing file.
6815         value = __kmp_str_format("%s", data1); // read value from SHM
6816         munmap(data1, SHM_SIZE);
6817       }
6818       if (fd1 != -1)
6819         close(fd1);
6820     }
6821     if (!__kmp_shm_available)
6822       __kmp_tmp_available = __kmp_detect_tmp();
6823     if (!__kmp_shm_available && __kmp_tmp_available) {
6824       // SHM failed to work due to an error other than that the file already
6825       // exists. Try to create a temp file under /tmp.
6826       // If /tmp isn't accessible either, fall back to the environment variable.
6827       // TODO: /tmp might not always be the temporary directory. For now we do
6828       // not consider TMPDIR.
6829       int fd1 = -1;
6830       temp_reg_status_file_name = __kmp_str_format("/tmp/%s", name);
6831       int tmp_preexist = 0;
6832       fd1 = open(temp_reg_status_file_name, O_CREAT | O_EXCL | O_RDWR, 0600);
6833       if ((fd1 == -1) && (errno == EEXIST)) {
6834         // file didn't open because it already exists.
6835         // try opening existing file
6836         fd1 = open(temp_reg_status_file_name, O_RDWR, 0600);
6837         if (fd1 == -1) { // file didn't open
6838           KMP_WARNING(FunctionError, "Can't open TEMP");
6839           __kmp_tmp_available = false;
6840         } else {
6841           tmp_preexist = 1;
6842         }
6843       }
6844       if (__kmp_tmp_available && tmp_preexist == 0) {
6845         // we created the /tmp file; now set its size
6846         if (ftruncate(fd1, SHM_SIZE) == -1) { // error occurred setting size
6847           KMP_WARNING(FunctionError, "Can't set size of /tmp file");
6848           __kmp_tmp_available = false;
6849         }
6850       }
6851       if (__kmp_tmp_available) {
6852         data1 = (char *)mmap(0, SHM_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED,
6853                              fd1, 0);
6854         if (data1 == MAP_FAILED) { // failed to map /tmp
6855           KMP_WARNING(FunctionError, "Can't map /tmp");
6856           __kmp_tmp_available = false;
6857         }
6858       }
6859       if (__kmp_tmp_available) {
6860         if (tmp_preexist == 0) { // set data to TMP, set value
6861           KMP_STRCPY_S(data1, SHM_SIZE, __kmp_registration_str);
6862         }
6863         // Read value from either what we just wrote or existing file.
6864         value = __kmp_str_format("%s", data1); // read value from /tmp file
6865         munmap(data1, SHM_SIZE);
6866       }
6867       if (fd1 != -1)
6868         close(fd1);
6869     }
6870     if (!__kmp_shm_available && !__kmp_tmp_available) {
6871       // no /dev/shm and no /tmp -- fall back to environment variable
6872       // Set environment variable, but do not overwrite if it exists.
6873       __kmp_env_set(name, __kmp_registration_str, 0);
6874       // read value to see if it got set
6875       value = __kmp_env_get(name);
6876     }
6877 #else // Windows, or Unix with a static library
6878     // Set environment variable, but do not overwrite if it exists.
6879     __kmp_env_set(name, __kmp_registration_str, 0);
6880     // read value to see if it got set
6881     value = __kmp_env_get(name);
6882 #endif
6883 
6884     if (value != NULL && strcmp(value, __kmp_registration_str) == 0) {
6885       done = 1; // Ok, environment variable set successfully, exit the loop.
6886     } else {
6887       // Oops. Write failed. Another copy of OpenMP RTL is in memory.
6888       // Check whether it is alive or dead.
6889       int neighbor = 0; // 0 -- unknown status, 1 -- alive, 2 -- dead.
6890       char *tail = value;
6891       char *flag_addr_str = NULL;
6892       char *flag_val_str = NULL;
6893       char const *file_name = NULL;
6894       __kmp_str_split(tail, '-', &flag_addr_str, &tail);
6895       __kmp_str_split(tail, '-', &flag_val_str, &tail);
6896       file_name = tail;
6897       if (tail != NULL) {
6898         unsigned long *flag_addr = 0;
6899         unsigned long flag_val = 0;
6900         KMP_SSCANF(flag_addr_str, "%p", RCAST(void **, &flag_addr));
6901         KMP_SSCANF(flag_val_str, "%lx", &flag_val);
6902         if (flag_addr != 0 && flag_val != 0 && strcmp(file_name, "") != 0) {
6903           // First, check whether environment-encoded address is mapped into
6904           // addr space.
6905           // If so, dereference it to see if it still has the right value.
6906           if (__kmp_is_address_mapped(flag_addr) && *flag_addr == flag_val) {
6907             neighbor = 1;
6908           } else {
6909             // If not, then we know the other copy of the library is no longer
6910             // running.
6911             neighbor = 2;
6912           }
6913         }
6914       }
6915       switch (neighbor) {
6916       case 0: // Cannot parse environment variable -- neighbor status unknown.
6917         // Assume it is the incompatible format of a future version of the
6918         // library. Assume the other library is alive.
6919         // WARN( ... ); // TODO: Issue a warning.
6920         file_name = "unknown library";
6921         KMP_FALLTHROUGH();
6922       // Attention! Falling through to the next case. That's intentional.
6923       case 1: { // Neighbor is alive.
6924         // Check it is allowed.
6925         char *duplicate_ok = __kmp_env_get("KMP_DUPLICATE_LIB_OK");
6926         if (!__kmp_str_match_true(duplicate_ok)) {
6927           // That's not allowed. Issue fatal error.
6928           __kmp_fatal(KMP_MSG(DuplicateLibrary, KMP_LIBRARY_FILE, file_name),
6929                       KMP_HNT(DuplicateLibrary), __kmp_msg_null);
6930         }
6931         KMP_INTERNAL_FREE(duplicate_ok);
6932         __kmp_duplicate_library_ok = 1;
6933         done = 1; // Exit the loop.
6934       } break;
6935       case 2: { // Neighbor is dead.
6936 
6937 #if defined(KMP_USE_SHM)
6938         if (__kmp_shm_available) { // close shared memory.
6939           shm_unlink(shm_name); // this removes file in /dev/shm
6940         } else if (__kmp_tmp_available) {
6941           unlink(temp_reg_status_file_name); // this removes the temp file
6942         } else {
6943           // Clear the variable and try to register library again.
6944           __kmp_env_unset(name);
6945         }
6946 #else
6947         // Clear the variable and try to register library again.
6948         __kmp_env_unset(name);
6949 #endif
6950       } break;
6951       default: {
6952         KMP_DEBUG_ASSERT(0);
6953       } break;
6954       }
6955     }
6956     KMP_INTERNAL_FREE((void *)value);
6957 #if defined(KMP_USE_SHM)
6958     if (shm_name)
6959       KMP_INTERNAL_FREE((void *)shm_name);
6960 #endif
6961   } // while
6962   KMP_INTERNAL_FREE((void *)name);
6963 
6964 } // func __kmp_register_library_startup
6965 
6966 void __kmp_unregister_library(void) {
6967 
6968   char *name = __kmp_reg_status_name();
6969   char *value = NULL;
6970 
6971 #if defined(KMP_USE_SHM)
6972   char *shm_name = nullptr;
6973   int fd1;
6974   if (__kmp_shm_available) {
6975     shm_name = __kmp_str_format("/%s", name);
6976     fd1 = shm_open(shm_name, O_RDONLY, 0600);
6977     if (fd1 != -1) { // File opened successfully
6978       char *data1 = (char *)mmap(0, SHM_SIZE, PROT_READ, MAP_SHARED, fd1, 0);
6979       if (data1 != MAP_FAILED) {
6980         value = __kmp_str_format("%s", data1); // read value from SHM
6981         munmap(data1, SHM_SIZE);
6982       }
6983       close(fd1);
6984     }
6985   } else if (__kmp_tmp_available) { // try /tmp
6986     fd1 = open(temp_reg_status_file_name, O_RDONLY);
6987     if (fd1 != -1) { // File opened successfully
6988       char *data1 = (char *)mmap(0, SHM_SIZE, PROT_READ, MAP_SHARED, fd1, 0);
6989       if (data1 != MAP_FAILED) {
6990         value = __kmp_str_format("%s", data1); // read value from /tmp
6991         munmap(data1, SHM_SIZE);
6992       }
6993       close(fd1);
6994     }
6995   } else { // fall back to environment variable
6996     value = __kmp_env_get(name);
6997   }
6998 #else
6999   value = __kmp_env_get(name);
7000 #endif
7001 
7002   KMP_DEBUG_ASSERT(__kmp_registration_flag != 0);
7003   KMP_DEBUG_ASSERT(__kmp_registration_str != NULL);
7004   if (value != NULL && strcmp(value, __kmp_registration_str) == 0) {
7005 //  Ok, this is our variable. Delete it.
7006 #if defined(KMP_USE_SHM)
7007     if (__kmp_shm_available) {
7008       shm_unlink(shm_name); // this removes file in /dev/shm
7009     } else if (__kmp_tmp_available) {
7010       unlink(temp_reg_status_file_name); // this removes the temp file
7011     } else {
7012       __kmp_env_unset(name);
7013     }
7014 #else
7015     __kmp_env_unset(name);
7016 #endif
7017   }
7018 
7019 #if defined(KMP_USE_SHM)
7020   if (shm_name)
7021     KMP_INTERNAL_FREE(shm_name);
7022   if (temp_reg_status_file_name)
7023     KMP_INTERNAL_FREE(temp_reg_status_file_name);
7024 #endif
7025 
7026   KMP_INTERNAL_FREE(__kmp_registration_str);
7027   KMP_INTERNAL_FREE(value);
7028   KMP_INTERNAL_FREE(name);
7029 
7030   __kmp_registration_flag = 0;
7031   __kmp_registration_str = NULL;
7032 
7033 } // __kmp_unregister_library
7034 
7035 // End of Library registration stuff.
7036 // -----------------------------------------------------------------------------
7037 
7038 #if KMP_MIC_SUPPORTED
7039 
7040 static void __kmp_check_mic_type() {
7041   kmp_cpuid_t cpuid_state = {0};
7042   kmp_cpuid_t *cs_p = &cpuid_state;
7043   __kmp_x86_cpuid(1, 0, cs_p);
7044   // We don't support mic1 at the moment
7045   if ((cs_p->eax & 0xff0) == 0xB10) {
7046     __kmp_mic_type = mic2;
7047   } else if ((cs_p->eax & 0xf0ff0) == 0x50670) {
7048     __kmp_mic_type = mic3;
7049   } else {
7050     __kmp_mic_type = non_mic;
7051   }
7052 }
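// Decoding note for the masks above: CPUID leaf 1 returns the processor
// signature in EAX as stepping[3:0], model[7:4], family[11:8],
// ext.model[19:16]. Masking with 0xff0 keeps family/model, so 0xB10 is
// family 0xB, model 1 (KNC); masking with 0xf0ff0 also keeps the extended
// model, so 0x50670 is family 6, display model 0x57 (KNL).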
7053 
7054 #endif /* KMP_MIC_SUPPORTED */
7055 
7056 #if KMP_HAVE_UMWAIT
7057 static void __kmp_user_level_mwait_init() {
7058   struct kmp_cpuid buf;
7059   __kmp_x86_cpuid(7, 0, &buf);
7060   __kmp_waitpkg_enabled = ((buf.ecx >> 5) & 1);
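  // CPUID.(EAX=07H,ECX=0):ECX[5] is the WAITPKG feature flag, which
  // advertises the umwait/umonitor/tpause instructions gated below.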
7061   __kmp_umwait_enabled = __kmp_waitpkg_enabled && __kmp_user_level_mwait;
7062   __kmp_tpause_enabled = __kmp_waitpkg_enabled && (__kmp_tpause_state > 0);
7063   KF_TRACE(30, ("__kmp_user_level_mwait_init: __kmp_umwait_enabled = %d\n",
7064                 __kmp_umwait_enabled));
7065 }
7066 #elif KMP_HAVE_MWAIT
7067 #ifndef AT_INTELPHIUSERMWAIT
7068 // Spurious, non-existent value that should always fail to return anything.
7069 // Will be replaced with the correct value once it is known.
7070 #define AT_INTELPHIUSERMWAIT 10000
7071 #endif
7072 // The getauxval() function is available in RHEL7 and SLES12. If the RTL is
7073 // built on a system with an earlier OS, we'll use the following internal
7074 // function when the entry is not found.
7075 unsigned long getauxval(unsigned long) KMP_WEAK_ATTRIBUTE_EXTERNAL;
7076 unsigned long getauxval(unsigned long) { return 0; }
7077 
7078 static void __kmp_user_level_mwait_init() {
7079   // When getauxval() and the correct value of AT_INTELPHIUSERMWAIT are
7080   // available, use them to determine whether user-level mwait is enabled.
7081   // Otherwise, forcibly set __kmp_mwait_enabled=TRUE on Intel MIC if the
7082   // environment variable KMP_USER_LEVEL_MWAIT was set to TRUE.
7083   if (__kmp_mic_type == mic3) {
7084     unsigned long res = getauxval(AT_INTELPHIUSERMWAIT);
7085     if ((res & 0x1) || __kmp_user_level_mwait) {
7086       __kmp_mwait_enabled = TRUE;
7087       if (__kmp_user_level_mwait) {
7088         KMP_INFORM(EnvMwaitWarn);
7089       }
7090     } else {
7091       __kmp_mwait_enabled = FALSE;
7092     }
7093   }
7094   KF_TRACE(30, ("__kmp_user_level_mwait_init: __kmp_mic_type = %d, "
7095                 "__kmp_mwait_enabled = %d\n",
7096                 __kmp_mic_type, __kmp_mwait_enabled));
7097 }
7098 #endif /* KMP_HAVE_UMWAIT */
7099 
7100 static void __kmp_do_serial_initialize(void) {
7101   int i, gtid;
7102   size_t size;
7103 
7104   KA_TRACE(10, ("__kmp_do_serial_initialize: enter\n"));
7105 
7106   KMP_DEBUG_ASSERT(sizeof(kmp_int32) == 4);
7107   KMP_DEBUG_ASSERT(sizeof(kmp_uint32) == 4);
7108   KMP_DEBUG_ASSERT(sizeof(kmp_int64) == 8);
7109   KMP_DEBUG_ASSERT(sizeof(kmp_uint64) == 8);
7110   KMP_DEBUG_ASSERT(sizeof(kmp_intptr_t) == sizeof(void *));
7111 
7112 #if OMPT_SUPPORT
7113   ompt_pre_init();
7114 #endif
7115 #if OMPD_SUPPORT
7116   __kmp_env_dump();
7117   ompd_init();
7118 #endif
7119 
7120   __kmp_validate_locks();
7121 
7122 #if ENABLE_LIBOMPTARGET
7123   /* Initialize functions from libomptarget */
7124   __kmp_init_omptarget();
7125 #endif
7126 
7127   /* Initialize internal memory allocator */
7128   __kmp_init_allocator();
7129 
7130   /* Register the library startup via an environment variable or a mapped
7131      shared-memory file, and check whether another copy of the library is
7132      already registered. Since a forked child process is often terminated, we
7133      postpone registration until middle initialization in the child. */
7134   if (__kmp_need_register_serial)
7135     __kmp_register_library_startup();
7136 
7137   /* TODO reinitialization of library */
7138   if (TCR_4(__kmp_global.g.g_done)) {
7139     KA_TRACE(10, ("__kmp_do_serial_initialize: reinitialization of library\n"));
7140   }
7141 
7142   __kmp_global.g.g_abort = 0;
7143   TCW_SYNC_4(__kmp_global.g.g_done, FALSE);
7144 
7145 /* initialize the locks */
7146 #if KMP_USE_ADAPTIVE_LOCKS
7147 #if KMP_DEBUG_ADAPTIVE_LOCKS
7148   __kmp_init_speculative_stats();
7149 #endif
7150 #endif
7151 #if KMP_STATS_ENABLED
7152   __kmp_stats_init();
7153 #endif
7154   __kmp_init_lock(&__kmp_global_lock);
7155   __kmp_init_atomic_lock(&__kmp_atomic_lock);
7156   __kmp_init_atomic_lock(&__kmp_atomic_lock_1i);
7157   __kmp_init_atomic_lock(&__kmp_atomic_lock_2i);
7158   __kmp_init_atomic_lock(&__kmp_atomic_lock_4i);
7159   __kmp_init_atomic_lock(&__kmp_atomic_lock_4r);
7160   __kmp_init_atomic_lock(&__kmp_atomic_lock_8i);
7161   __kmp_init_atomic_lock(&__kmp_atomic_lock_8r);
7162   __kmp_init_atomic_lock(&__kmp_atomic_lock_8c);
7163   __kmp_init_atomic_lock(&__kmp_atomic_lock_10r);
7164   __kmp_init_atomic_lock(&__kmp_atomic_lock_16r);
7165   __kmp_init_atomic_lock(&__kmp_atomic_lock_16c);
7166   __kmp_init_atomic_lock(&__kmp_atomic_lock_20c);
7167   __kmp_init_atomic_lock(&__kmp_atomic_lock_32c);
7168   __kmp_init_bootstrap_lock(&__kmp_forkjoin_lock);
7169   __kmp_init_bootstrap_lock(&__kmp_exit_lock);
7170 #if KMP_USE_MONITOR
7171   __kmp_init_bootstrap_lock(&__kmp_monitor_lock);
7172 #endif
7173   __kmp_init_bootstrap_lock(&__kmp_tp_cached_lock);
7174 
7175   /* conduct initialization and initial setup of configuration */
7176 
7177   __kmp_runtime_initialize();
7178 
7179 #if KMP_MIC_SUPPORTED
7180   __kmp_check_mic_type();
7181 #endif
7182 
7183 // Some global variable initialization moved here from kmp_env_initialize()
7184 #ifdef KMP_DEBUG
7185   kmp_diag = 0;
7186 #endif
7187   __kmp_abort_delay = 0;
7188 
7189   // From __kmp_init_dflt_team_nth()
7190   /* assume the entire machine will be used */
7191   __kmp_dflt_team_nth_ub = __kmp_xproc;
7192   if (__kmp_dflt_team_nth_ub < KMP_MIN_NTH) {
7193     __kmp_dflt_team_nth_ub = KMP_MIN_NTH;
7194   }
7195   if (__kmp_dflt_team_nth_ub > __kmp_sys_max_nth) {
7196     __kmp_dflt_team_nth_ub = __kmp_sys_max_nth;
7197   }
7198   __kmp_max_nth = __kmp_sys_max_nth;
7199   __kmp_cg_max_nth = __kmp_sys_max_nth;
7200   __kmp_teams_max_nth = __kmp_xproc; // set a "reasonable" default
7201   if (__kmp_teams_max_nth > __kmp_sys_max_nth) {
7202     __kmp_teams_max_nth = __kmp_sys_max_nth;
7203   }
7204 
7205   // Three vars below moved here from __kmp_env_initialize() "KMP_BLOCKTIME"
7206   // part
7207   __kmp_dflt_blocktime = KMP_DEFAULT_BLOCKTIME;
7208 #if KMP_USE_MONITOR
7209   __kmp_monitor_wakeups =
7210       KMP_WAKEUPS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups);
7211   __kmp_bt_intervals =
7212       KMP_INTERVALS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups);
7213 #endif
7214   // From "KMP_LIBRARY" part of __kmp_env_initialize()
7215   __kmp_library = library_throughput;
7216   // From KMP_SCHEDULE initialization
7217   __kmp_static = kmp_sch_static_balanced;
7218 // AC: do not use analytical here, because it is non-monotonic
7219 //__kmp_guided = kmp_sch_guided_iterative_chunked;
7220 //__kmp_auto = kmp_sch_guided_analytical_chunked; // AC: it is the default, no
7221 // need to repeat assignment
7222 // Barrier initialization. Moved here from __kmp_env_initialize() Barrier branch
7223 // bit control and barrier method control parts
7224 #if KMP_FAST_REDUCTION_BARRIER
7225 #define kmp_reduction_barrier_gather_bb ((int)1)
7226 #define kmp_reduction_barrier_release_bb ((int)1)
7227 #define kmp_reduction_barrier_gather_pat __kmp_barrier_gather_pat_dflt
7228 #define kmp_reduction_barrier_release_pat __kmp_barrier_release_pat_dflt
7229 #endif // KMP_FAST_REDUCTION_BARRIER
7230   for (i = bs_plain_barrier; i < bs_last_barrier; i++) {
7231     __kmp_barrier_gather_branch_bits[i] = __kmp_barrier_gather_bb_dflt;
7232     __kmp_barrier_release_branch_bits[i] = __kmp_barrier_release_bb_dflt;
7233     __kmp_barrier_gather_pattern[i] = __kmp_barrier_gather_pat_dflt;
7234     __kmp_barrier_release_pattern[i] = __kmp_barrier_release_pat_dflt;
7235 #if KMP_FAST_REDUCTION_BARRIER
7236     if (i == bs_reduction_barrier) { // tested and confirmed on ALTIX
7237       // only (lin_64): hyper,1
7238       __kmp_barrier_gather_branch_bits[i] = kmp_reduction_barrier_gather_bb;
7239       __kmp_barrier_release_branch_bits[i] = kmp_reduction_barrier_release_bb;
7240       __kmp_barrier_gather_pattern[i] = kmp_reduction_barrier_gather_pat;
7241       __kmp_barrier_release_pattern[i] = kmp_reduction_barrier_release_pat;
7242     }
7243 #endif // KMP_FAST_REDUCTION_BARRIER
7244   }
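// Note on branch bits (a sketch of how the barrier algorithms consume them):
// a branch-bits value of k gives each node in the tree/hyper barriers a
// fan-out of 2^k, so the reduction barrier's gather/release value of 1 above
// selects a binary tree.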
7245 #if KMP_FAST_REDUCTION_BARRIER
7246 #undef kmp_reduction_barrier_release_pat
7247 #undef kmp_reduction_barrier_gather_pat
7248 #undef kmp_reduction_barrier_release_bb
7249 #undef kmp_reduction_barrier_gather_bb
7250 #endif // KMP_FAST_REDUCTION_BARRIER
7251 #if KMP_MIC_SUPPORTED
7252   if (__kmp_mic_type == mic2) { // KNC
7253     // AC: plane=3,2, forkjoin=2,1 are optimal for 240 threads on KNC
7254     __kmp_barrier_gather_branch_bits[bs_plain_barrier] = 3; // plain gather
7255     __kmp_barrier_release_branch_bits[bs_forkjoin_barrier] =
7256         1; // forkjoin release
7257     __kmp_barrier_gather_pattern[bs_forkjoin_barrier] = bp_hierarchical_bar;
7258     __kmp_barrier_release_pattern[bs_forkjoin_barrier] = bp_hierarchical_bar;
7259   }
7260 #if KMP_FAST_REDUCTION_BARRIER
7261   if (__kmp_mic_type == mic2) { // KNC
7262     __kmp_barrier_gather_pattern[bs_reduction_barrier] = bp_hierarchical_bar;
7263     __kmp_barrier_release_pattern[bs_reduction_barrier] = bp_hierarchical_bar;
7264   }
7265 #endif // KMP_FAST_REDUCTION_BARRIER
7266 #endif // KMP_MIC_SUPPORTED
7267 
7268 // From KMP_CHECKS initialization
7269 #ifdef KMP_DEBUG
7270   __kmp_env_checks = TRUE; /* development versions have the extra checks */
7271 #else
7272   __kmp_env_checks = FALSE; /* port versions do not have the extra checks */
7273 #endif
7274 
7275   // From "KMP_FOREIGN_THREADS_THREADPRIVATE" initialization
7276   __kmp_foreign_tp = TRUE;
7277 
7278   __kmp_global.g.g_dynamic = FALSE;
7279   __kmp_global.g.g_dynamic_mode = dynamic_default;
7280 
7281   __kmp_init_nesting_mode();
7282 
7283   __kmp_env_initialize(NULL);
7284 
7285 #if KMP_HAVE_MWAIT || KMP_HAVE_UMWAIT
7286   __kmp_user_level_mwait_init();
7287 #endif
7288 // Print all messages in message catalog for testing purposes.
7289 #ifdef KMP_DEBUG
7290   char const *val = __kmp_env_get("KMP_DUMP_CATALOG");
7291   if (__kmp_str_match_true(val)) {
7292     kmp_str_buf_t buffer;
7293     __kmp_str_buf_init(&buffer);
7294     __kmp_i18n_dump_catalog(&buffer);
7295     __kmp_printf("%s", buffer.str);
7296     __kmp_str_buf_free(&buffer);
7297   }
7298   __kmp_env_free(&val);
7299 #endif
7300 
7301   __kmp_threads_capacity =
7302       __kmp_initial_threads_capacity(__kmp_dflt_team_nth_ub);
7303   // Moved here from __kmp_env_initialize() "KMP_ALL_THREADPRIVATE" part
7304   __kmp_tp_capacity = __kmp_default_tp_capacity(
7305       __kmp_dflt_team_nth_ub, __kmp_max_nth, __kmp_allThreadsSpecified);
7306 
7307   // If the library is shut down properly, these pools must be NULL. Just in
7308   // case, set them to NULL -- some memory may leak, but subsequent code will
7309   // work even if pools are not freed.
7310   KMP_DEBUG_ASSERT(__kmp_thread_pool == NULL);
7311   KMP_DEBUG_ASSERT(__kmp_thread_pool_insert_pt == NULL);
7312   KMP_DEBUG_ASSERT(__kmp_team_pool == NULL);
7313   __kmp_thread_pool = NULL;
7314   __kmp_thread_pool_insert_pt = NULL;
7315   __kmp_team_pool = NULL;
7316 
7317   /* Allocate all of the variable sized records */
7318   /* NOTE: __kmp_threads_capacity entries are allocated, but the arrays are
7319    * expandable */
7320   /* Since allocation is cache-aligned, just add extra padding at the end */
7321   size =
7322       (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * __kmp_threads_capacity +
7323       CACHE_LINE;
7324   __kmp_threads = (kmp_info_t **)__kmp_allocate(size);
7325   __kmp_root = (kmp_root_t **)((char *)__kmp_threads +
7326                                sizeof(kmp_info_t *) * __kmp_threads_capacity);
7327 
7328   /* init thread counts */
7329   KMP_DEBUG_ASSERT(__kmp_all_nth ==
7330                    0); // Asserts fail if the library is reinitializing and
7331   KMP_DEBUG_ASSERT(__kmp_nth == 0); // something was wrong in termination.
7332   __kmp_all_nth = 0;
7333   __kmp_nth = 0;
7334 
7335   /* setup the uber master thread and hierarchy */
7336   gtid = __kmp_register_root(TRUE);
7337   KA_TRACE(10, ("__kmp_do_serial_initialize  T#%d\n", gtid));
7338   KMP_ASSERT(KMP_UBER_GTID(gtid));
7339   KMP_ASSERT(KMP_INITIAL_GTID(gtid));
7340 
7341   KMP_MB(); /* Flush all pending memory write invalidates.  */
7342 
7343   __kmp_common_initialize();
7344 
7345 #if KMP_OS_UNIX
7346   /* invoke the child fork handler */
7347   __kmp_register_atfork();
7348 #endif
7349 
7350 #if !KMP_DYNAMIC_LIB ||                                                        \
7351     ((KMP_COMPILER_ICC || KMP_COMPILER_ICX) && KMP_OS_DARWIN)
7352   {
7353     /* Invoke the exit handler when the program finishes, only for static
7354        library and macOS* dynamic. For other dynamic libraries, we already
7355        have _fini and DllMain. */
7356     int rc = atexit(__kmp_internal_end_atexit);
7357     if (rc != 0) {
7358       __kmp_fatal(KMP_MSG(FunctionError, "atexit()"), KMP_ERR(rc),
7359                   __kmp_msg_null);
7360     }
7361   }
7362 #endif
7363 
7364 #if KMP_HANDLE_SIGNALS
7365 #if KMP_OS_UNIX
7366   /* NOTE: make sure that this is called before the user installs their own
7367      signal handlers so that the user handlers are called first. this way they
7368      can return false, not call our handler, avoid terminating the library, and
7369      continue execution where they left off. */
7370   __kmp_install_signals(FALSE);
7371 #endif /* KMP_OS_UNIX */
7372 #if KMP_OS_WINDOWS
7373   __kmp_install_signals(TRUE);
7374 #endif /* KMP_OS_WINDOWS */
7375 #endif
7376 
7377   /* we have finished the serial initialization */
7378   __kmp_init_counter++;
7379 
7380   __kmp_init_serial = TRUE;
7381 
7382   if (__kmp_version) {
7383     __kmp_print_version_1();
7384   }
7385 
7386   if (__kmp_settings) {
7387     __kmp_env_print();
7388   }
7389 
7390   if (__kmp_display_env || __kmp_display_env_verbose) {
7391     __kmp_env_print_2();
7392   }
7393 
7394 #if OMPT_SUPPORT
7395   ompt_post_init();
7396 #endif
7397 
7398   KMP_MB();
7399 
7400   KA_TRACE(10, ("__kmp_do_serial_initialize: exit\n"));
7401 }
7402 
7403 void __kmp_serial_initialize(void) {
7404   if (__kmp_init_serial) {
7405     return;
7406   }
7407   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7408   if (__kmp_init_serial) {
7409     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7410     return;
7411   }
7412   __kmp_do_serial_initialize();
7413   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7414 }
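// The check / acquire __kmp_initz_lock / re-check sequence above is the usual
// double-checked initialization idiom: the unlocked fast path avoids lock
// traffic once initialization is done, while the re-check under the lock
// serializes racing first-time initializers. __kmp_middle_initialize() and
// __kmp_parallel_initialize() below follow the same pattern.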
7415 
7416 static void __kmp_do_middle_initialize(void) {
7417   int i, j;
7418   int prev_dflt_team_nth;
7419 
7420   if (!__kmp_init_serial) {
7421     __kmp_do_serial_initialize();
7422   }
7423 
7424   KA_TRACE(10, ("__kmp_middle_initialize: enter\n"));
7425 
7426   if (UNLIKELY(!__kmp_need_register_serial)) {
7427     // We are in a forked child process. The registration was skipped during
7428     // serial initialization in __kmp_atfork_child handler. Do it here.
7429     __kmp_register_library_startup();
7430   }
7431 
7432   // Save the previous value for the __kmp_dflt_team_nth so that
7433   // we can avoid some reinitialization if it hasn't changed.
7434   prev_dflt_team_nth = __kmp_dflt_team_nth;
7435 
7436 #if KMP_AFFINITY_SUPPORTED
7437   // __kmp_affinity_initialize() will try to set __kmp_ncores to the
7438   // number of cores on the machine.
7439   __kmp_affinity_initialize(__kmp_affinity);
7440 
7441 #endif /* KMP_AFFINITY_SUPPORTED */
7442 
7443   KMP_ASSERT(__kmp_xproc > 0);
7444   if (__kmp_avail_proc == 0) {
7445     __kmp_avail_proc = __kmp_xproc;
7446   }
7447 
7448   // If there were empty places in num_threads list (OMP_NUM_THREADS=,,2,3),
7449   // correct them now
7450   j = 0;
7451   while ((j < __kmp_nested_nth.used) && !__kmp_nested_nth.nth[j]) {
7452     __kmp_nested_nth.nth[j] = __kmp_dflt_team_nth = __kmp_dflt_team_nth_ub =
7453         __kmp_avail_proc;
7454     j++;
7455   }
7456 
7457   if (__kmp_dflt_team_nth == 0) {
7458 #ifdef KMP_DFLT_NTH_CORES
7459     // Default #threads = #cores
7460     __kmp_dflt_team_nth = __kmp_ncores;
7461     KA_TRACE(20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = "
7462                   "__kmp_ncores (%d)\n",
7463                   __kmp_dflt_team_nth));
7464 #else
7465     // Default #threads = #available OS procs
7466     __kmp_dflt_team_nth = __kmp_avail_proc;
7467     KA_TRACE(20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = "
7468                   "__kmp_avail_proc(%d)\n",
7469                   __kmp_dflt_team_nth));
7470 #endif /* KMP_DFLT_NTH_CORES */
7471   }
7472 
7473   if (__kmp_dflt_team_nth < KMP_MIN_NTH) {
7474     __kmp_dflt_team_nth = KMP_MIN_NTH;
7475   }
7476   if (__kmp_dflt_team_nth > __kmp_sys_max_nth) {
7477     __kmp_dflt_team_nth = __kmp_sys_max_nth;
7478   }
7479 
7480   if (__kmp_nesting_mode > 0)
7481     __kmp_set_nesting_mode_threads();
7482 
7483   // There's no harm in continuing if the following check fails,
7484   // but it indicates an error in the previous logic.
7485   KMP_DEBUG_ASSERT(__kmp_dflt_team_nth <= __kmp_dflt_team_nth_ub);
7486 
7487   if (__kmp_dflt_team_nth != prev_dflt_team_nth) {
7488     // Run through the __kmp_threads array and set the num threads icv for each
7489     // root thread that is currently registered with the RTL (which has not
7490     // already explicitly set its nthreads-var with a call to
7491     // omp_set_num_threads()).
7492     for (i = 0; i < __kmp_threads_capacity; i++) {
7493       kmp_info_t *thread = __kmp_threads[i];
7494       if (thread == NULL)
7495         continue;
7496       if (thread->th.th_current_task->td_icvs.nproc != 0)
7497         continue;
7498 
7499       set__nproc(__kmp_threads[i], __kmp_dflt_team_nth);
7500     }
7501   }
7502   KA_TRACE(
7503       20,
7504       ("__kmp_middle_initialize: final value for __kmp_dflt_team_nth = %d\n",
7505        __kmp_dflt_team_nth));
7506 
7507 #ifdef KMP_ADJUST_BLOCKTIME
7508   /* Adjust blocktime to zero if necessary  now that __kmp_avail_proc is set */
7509   if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
7510     KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
7511     if (__kmp_nth > __kmp_avail_proc) {
7512       __kmp_zero_bt = TRUE;
7513     }
7514   }
7515 #endif /* KMP_ADJUST_BLOCKTIME */
7516 
7517   /* we have finished middle initialization */
7518   TCW_SYNC_4(__kmp_init_middle, TRUE);
7519 
7520   KA_TRACE(10, ("__kmp_do_middle_initialize: exit\n"));
7521 }
7522 
7523 void __kmp_middle_initialize(void) {
7524   if (__kmp_init_middle) {
7525     return;
7526   }
7527   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7528   if (__kmp_init_middle) {
7529     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7530     return;
7531   }
7532   __kmp_do_middle_initialize();
7533   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7534 }
7535 
7536 void __kmp_parallel_initialize(void) {
7537   int gtid = __kmp_entry_gtid(); // this might be a new root
7538 
7539   /* synchronize parallel initialization (for sibling) */
7540   if (TCR_4(__kmp_init_parallel))
7541     return;
7542   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7543   if (TCR_4(__kmp_init_parallel)) {
7544     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7545     return;
7546   }
7547 
7548   /* TODO reinitialization after we have already shut down */
7549   if (TCR_4(__kmp_global.g.g_done)) {
7550     KA_TRACE(
7551         10,
7552         ("__kmp_parallel_initialize: attempt to init while shutting down\n"));
7553     __kmp_infinite_loop();
7554   }
7555 
7556   /* jc: The lock __kmp_initz_lock is already held, so calling
7557      __kmp_serial_initialize would cause a deadlock.  So we call
7558      __kmp_do_serial_initialize directly. */
7559   if (!__kmp_init_middle) {
7560     __kmp_do_middle_initialize();
7561   }
7562   __kmp_assign_root_init_mask();
7563   __kmp_resume_if_hard_paused();
7564 
7565   /* begin initialization */
7566   KA_TRACE(10, ("__kmp_parallel_initialize: enter\n"));
7567   KMP_ASSERT(KMP_UBER_GTID(gtid));
7568 
7569 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
7570   // Save the FP control regs.
7571   // Worker threads will set theirs to these values at thread startup.
7572   __kmp_store_x87_fpu_control_word(&__kmp_init_x87_fpu_control_word);
7573   __kmp_store_mxcsr(&__kmp_init_mxcsr);
7574   __kmp_init_mxcsr &= KMP_X86_MXCSR_MASK;
7575 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
7576 
7577 #if KMP_OS_UNIX
7578 #if KMP_HANDLE_SIGNALS
7579   /*  must be after __kmp_serial_initialize  */
7580   __kmp_install_signals(TRUE);
7581 #endif
7582 #endif
7583 
7584   __kmp_suspend_initialize();
7585 
7586 #if defined(USE_LOAD_BALANCE)
7587   if (__kmp_global.g.g_dynamic_mode == dynamic_default) {
7588     __kmp_global.g.g_dynamic_mode = dynamic_load_balance;
7589   }
7590 #else
7591   if (__kmp_global.g.g_dynamic_mode == dynamic_default) {
7592     __kmp_global.g.g_dynamic_mode = dynamic_thread_limit;
7593   }
7594 #endif
7595 
7596   if (__kmp_version) {
7597     __kmp_print_version_2();
7598   }
7599 
7600   /* we have finished parallel initialization */
7601   TCW_SYNC_4(__kmp_init_parallel, TRUE);
7602 
7603   KMP_MB();
7604   KA_TRACE(10, ("__kmp_parallel_initialize: exit\n"));
7605 
7606   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7607 }
7608 
7609 void __kmp_hidden_helper_initialize() {
7610   if (TCR_4(__kmp_init_hidden_helper))
7611     return;
7612 
7613   // __kmp_parallel_initialize is required before we initialize hidden helper
7614   if (!TCR_4(__kmp_init_parallel))
7615     __kmp_parallel_initialize();
7616 
7617   // Double check. Note that this double check should not be placed before
7618   // __kmp_parallel_initialize as that would cause a deadlock.
7619   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7620   if (TCR_4(__kmp_init_hidden_helper)) {
7621     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7622     return;
7623   }
7624 
7625 #if KMP_AFFINITY_SUPPORTED
7626   // Initialize hidden helper affinity settings.
7627   // The above __kmp_parallel_initialize() will initialize
7628   // regular affinity (and topology) if not already done.
7629   if (!__kmp_hh_affinity.flags.initialized)
7630     __kmp_affinity_initialize(__kmp_hh_affinity);
7631 #endif
7632 
7633   // Set the count of hidden helper tasks to be executed to zero
7634   KMP_ATOMIC_ST_REL(&__kmp_unexecuted_hidden_helper_tasks, 0);
7635 
7636   // Set the global variable indicating that we're initializing hidden helper
7637   // team/threads
7638   TCW_SYNC_4(__kmp_init_hidden_helper_threads, TRUE);
7639 
7640   // Platform independent initialization
7641   __kmp_do_initialize_hidden_helper_threads();
7642 
7643   // Wait here for the finish of initialization of hidden helper teams
7644   __kmp_hidden_helper_threads_initz_wait();
7645 
7646   // We have finished hidden helper initialization
7647   TCW_SYNC_4(__kmp_init_hidden_helper, TRUE);
7648 
7649   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7650 }
7651 
7652 /* ------------------------------------------------------------------------ */
7653 
7654 void __kmp_run_before_invoked_task(int gtid, int tid, kmp_info_t *this_thr,
7655                                    kmp_team_t *team) {
7656   kmp_disp_t *dispatch;
7657 
7658   KMP_MB();
7659 
7660   /* none of the threads have encountered any constructs, yet. */
7661   this_thr->th.th_local.this_construct = 0;
7662 #if KMP_CACHE_MANAGE
7663   KMP_CACHE_PREFETCH(&this_thr->th.th_bar[bs_forkjoin_barrier].bb.b_arrived);
7664 #endif /* KMP_CACHE_MANAGE */
7665   dispatch = (kmp_disp_t *)TCR_PTR(this_thr->th.th_dispatch);
7666   KMP_DEBUG_ASSERT(dispatch);
7667   KMP_DEBUG_ASSERT(team->t.t_dispatch);
7668   // KMP_DEBUG_ASSERT( this_thr->th.th_dispatch == &team->t.t_dispatch[
7669   // this_thr->th.th_info.ds.ds_tid ] );
7670 
7671   dispatch->th_disp_index = 0; /* reset the dispatch buffer counter */
7672   dispatch->th_doacross_buf_idx = 0; // reset doacross dispatch buffer counter
7673   if (__kmp_env_consistency_check)
7674     __kmp_push_parallel(gtid, team->t.t_ident);
7675 
7676   KMP_MB(); /* Flush all pending memory write invalidates.  */
7677 }
7678 
7679 void __kmp_run_after_invoked_task(int gtid, int tid, kmp_info_t *this_thr,
7680                                   kmp_team_t *team) {
7681   if (__kmp_env_consistency_check)
7682     __kmp_pop_parallel(gtid, team->t.t_ident);
7683 
7684   __kmp_finish_implicit_task(this_thr);
7685 }
7686 
7687 int __kmp_invoke_task_func(int gtid) {
7688   int rc;
7689   int tid = __kmp_tid_from_gtid(gtid);
7690   kmp_info_t *this_thr = __kmp_threads[gtid];
7691   kmp_team_t *team = this_thr->th.th_team;
7692 
7693   __kmp_run_before_invoked_task(gtid, tid, this_thr, team);
7694 #if USE_ITT_BUILD
7695   if (__itt_stack_caller_create_ptr) {
7696     // inform ittnotify about entering user's code
7697     if (team->t.t_stack_id != NULL) {
7698       __kmp_itt_stack_callee_enter((__itt_caller)team->t.t_stack_id);
7699     } else {
7700       KMP_DEBUG_ASSERT(team->t.t_parent->t.t_stack_id != NULL);
7701       __kmp_itt_stack_callee_enter(
7702           (__itt_caller)team->t.t_parent->t.t_stack_id);
7703     }
7704   }
7705 #endif /* USE_ITT_BUILD */
7706 #if INCLUDE_SSC_MARKS
7707   SSC_MARK_INVOKING();
7708 #endif
7709 
7710 #if OMPT_SUPPORT
7711   void *dummy;
7712   void **exit_frame_p;
7713   ompt_data_t *my_task_data;
7714   ompt_data_t *my_parallel_data;
7715   int ompt_team_size;
7716 
7717   if (ompt_enabled.enabled) {
7718     exit_frame_p = &(team->t.t_implicit_task_taskdata[tid]
7719                          .ompt_task_info.frame.exit_frame.ptr);
7720   } else {
7721     exit_frame_p = &dummy;
7722   }
7723 
7724   my_task_data =
7725       &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data);
7726   my_parallel_data = &(team->t.ompt_team_info.parallel_data);
7727   if (ompt_enabled.ompt_callback_implicit_task) {
7728     ompt_team_size = team->t.t_nproc;
7729     ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
7730         ompt_scope_begin, my_parallel_data, my_task_data, ompt_team_size,
7731         __kmp_tid_from_gtid(gtid), ompt_task_implicit);
7732     OMPT_CUR_TASK_INFO(this_thr)->thread_num = __kmp_tid_from_gtid(gtid);
7733   }
7734 #endif
7735 
7736 #if KMP_STATS_ENABLED
7737   stats_state_e previous_state = KMP_GET_THREAD_STATE();
7738   if (previous_state == stats_state_e::TEAMS_REGION) {
7739     KMP_PUSH_PARTITIONED_TIMER(OMP_teams);
7740   } else {
7741     KMP_PUSH_PARTITIONED_TIMER(OMP_parallel);
7742   }
7743   KMP_SET_THREAD_STATE(IMPLICIT_TASK);
7744 #endif
7745 
7746   rc = __kmp_invoke_microtask((microtask_t)TCR_SYNC_PTR(team->t.t_pkfn), gtid,
7747                               tid, (int)team->t.t_argc, (void **)team->t.t_argv
7748 #if OMPT_SUPPORT
7749                               ,
7750                               exit_frame_p
7751 #endif
7752   );
7753 #if OMPT_SUPPORT
7754   *exit_frame_p = NULL;
7755   this_thr->th.ompt_thread_info.parallel_flags = ompt_parallel_team;
7756 #endif
7757 
7758 #if KMP_STATS_ENABLED
7759   if (previous_state == stats_state_e::TEAMS_REGION) {
7760     KMP_SET_THREAD_STATE(previous_state);
7761   }
7762   KMP_POP_PARTITIONED_TIMER();
7763 #endif
7764 
7765 #if USE_ITT_BUILD
7766   if (__itt_stack_caller_create_ptr) {
7767     // inform ittnotify about leaving user's code
7768     if (team->t.t_stack_id != NULL) {
7769       __kmp_itt_stack_callee_leave((__itt_caller)team->t.t_stack_id);
7770     } else {
7771       KMP_DEBUG_ASSERT(team->t.t_parent->t.t_stack_id != NULL);
7772       __kmp_itt_stack_callee_leave(
7773           (__itt_caller)team->t.t_parent->t.t_stack_id);
7774     }
7775   }
7776 #endif /* USE_ITT_BUILD */
7777   __kmp_run_after_invoked_task(gtid, tid, this_thr, team);
7778 
7779   return rc;
7780 }
7781 
7782 void __kmp_teams_master(int gtid) {
7783   // This routine is called by all primary threads in the teams construct.
7784   kmp_info_t *thr = __kmp_threads[gtid];
7785   kmp_team_t *team = thr->th.th_team;
7786   ident_t *loc = team->t.t_ident;
7787   thr->th.th_set_nproc = thr->th.th_teams_size.nth;
7788   KMP_DEBUG_ASSERT(thr->th.th_teams_microtask);
7789   KMP_DEBUG_ASSERT(thr->th.th_set_nproc);
7790   KA_TRACE(20, ("__kmp_teams_master: T#%d, Tid %d, microtask %p\n", gtid,
7791                 __kmp_tid_from_gtid(gtid), thr->th.th_teams_microtask));
7792 
7793   // This thread is a new CG root.  Set up the proper variables.
7794   kmp_cg_root_t *tmp = (kmp_cg_root_t *)__kmp_allocate(sizeof(kmp_cg_root_t));
7795   tmp->cg_root = thr; // Make thr the CG root
7796   // Init to thread limit stored when league primary threads were forked
7797   tmp->cg_thread_limit = thr->th.th_current_task->td_icvs.thread_limit;
7798   tmp->cg_nthreads = 1; // Init counter to one active thread, this one
7799   KA_TRACE(100, ("__kmp_teams_master: Thread %p created node %p and init"
7800                  " cg_nthreads to 1\n",
7801                  thr, tmp));
7802   tmp->up = thr->th.th_cg_roots;
7803   thr->th.th_cg_roots = tmp;
7804 
7805 // Launch the league of teams now, but do not let workers execute
7806 // (they hang on the fork barrier until the next parallel region)
7807 #if INCLUDE_SSC_MARKS
7808   SSC_MARK_FORKING();
7809 #endif
7810   __kmp_fork_call(loc, gtid, fork_context_intel, team->t.t_argc,
7811                   (microtask_t)thr->th.th_teams_microtask, // "wrapped" task
7812                   VOLATILE_CAST(launch_t) __kmp_invoke_task_func, NULL);
7813 #if INCLUDE_SSC_MARKS
7814   SSC_MARK_JOINING();
7815 #endif
7816   // If the team size was reduced from the limit, set it to the new size
7817   if (thr->th.th_team_nproc < thr->th.th_teams_size.nth)
7818     thr->th.th_teams_size.nth = thr->th.th_team_nproc;
7819   // AC: the last parameter "1" eliminates the join barrier, which won't work
7820   // because worker threads are in a fork barrier awaiting more parallel regions
7821   __kmp_join_call(loc, gtid
7822 #if OMPT_SUPPORT
7823                   ,
7824                   fork_context_intel
7825 #endif
7826                   ,
7827                   1);
7828 }
7829 
7830 int __kmp_invoke_teams_master(int gtid) {
7831   kmp_info_t *this_thr = __kmp_threads[gtid];
7832   kmp_team_t *team = this_thr->th.th_team;
7833 #if KMP_DEBUG
7834   if (!__kmp_threads[gtid]->th.th_team->t.t_serialized)
7835     KMP_DEBUG_ASSERT((void *)__kmp_threads[gtid]->th.th_team->t.t_pkfn ==
7836                      (void *)__kmp_teams_master);
7837 #endif
7838   __kmp_run_before_invoked_task(gtid, 0, this_thr, team);
7839 #if OMPT_SUPPORT
7840   int tid = __kmp_tid_from_gtid(gtid);
7841   ompt_data_t *task_data =
7842       &team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data;
7843   ompt_data_t *parallel_data = &team->t.ompt_team_info.parallel_data;
7844   if (ompt_enabled.ompt_callback_implicit_task) {
7845     ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
7846         ompt_scope_begin, parallel_data, task_data, team->t.t_nproc, tid,
7847         ompt_task_initial);
7848     OMPT_CUR_TASK_INFO(this_thr)->thread_num = tid;
7849   }
7850 #endif
7851   __kmp_teams_master(gtid);
7852 #if OMPT_SUPPORT
7853   this_thr->th.ompt_thread_info.parallel_flags = ompt_parallel_league;
7854 #endif
7855   __kmp_run_after_invoked_task(gtid, 0, this_thr, team);
7856   return 1;
7857 }
7858 
7859 /* This sets the requested number of threads for the next parallel region
7860    encountered by this team. Since this should be enclosed in the forkjoin
7861    critical section, it should avoid race conditions with asymmetrical nested
7862    parallelism. */
7863 void __kmp_push_num_threads(ident_t *id, int gtid, int num_threads) {
7864   kmp_info_t *thr = __kmp_threads[gtid];
7865 
7866   if (num_threads > 0)
7867     thr->th.th_set_nproc = num_threads;
7868 }
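// Usage sketch (illustrative): for a directive such as
//   #pragma omp parallel num_threads(4)
// the compiler emits a call to the __kmpc_push_num_threads() entry point,
// which forwards here before the fork; the fork path then consumes and resets
// th_set_nproc when sizing the new team.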
7869 
7870 void __kmp_push_num_threads_list(ident_t *id, int gtid, kmp_uint32 list_length,
7871                                  int *num_threads_list) {
7872   kmp_info_t *thr = __kmp_threads[gtid];
7873 
7874   KMP_DEBUG_ASSERT(list_length > 1);
7875 
7876   if (num_threads_list[0] > 0)
7877     thr->th.th_set_nproc = num_threads_list[0];
7878   thr->th.th_set_nested_nth =
7879       (int *)KMP_INTERNAL_MALLOC(list_length * sizeof(int));
7880   for (kmp_uint32 i = 0; i < list_length; ++i)
7881     thr->th.th_set_nested_nth[i] = num_threads_list[i];
7882   thr->th.th_set_nested_nth_sz = list_length;
7883 }
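// Sketch of the intent (based on the fields set above): a clause such as
//   num_threads(4, 2)
// requests 4 threads for the immediate parallel region via th_set_nproc and
// records the full list in th_set_nested_nth so deeper nesting levels can use
// the remaining values.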
7884 
7885 void __kmp_set_strict_num_threads(ident_t *loc, int gtid, int sev,
7886                                   const char *msg) {
7887   kmp_info_t *thr = __kmp_threads[gtid];
7888   thr->th.th_nt_strict = true;
7889   thr->th.th_nt_loc = loc;
7890   // if sev is unset, default to fatal
7891   if (sev == severity_warning)
7892     thr->th.th_nt_sev = sev;
7893   else
7894     thr->th.th_nt_sev = severity_fatal;
7895   // if msg is unset, use an appropriate message
7896   if (msg)
7897     thr->th.th_nt_msg = msg;
7898   else
7899     thr->th.th_nt_msg = "Cannot form team with number of threads specified by "
7900                         "strict num_threads clause.";
7901 }
7902 
7903 static void __kmp_push_thread_limit(kmp_info_t *thr, int num_teams,
7904                                     int num_threads) {
7905   KMP_DEBUG_ASSERT(thr);
7906   // Remember the number of threads for inner parallel regions
7907   if (!TCR_4(__kmp_init_middle))
7908     __kmp_middle_initialize(); // get internal globals calculated
7909   __kmp_assign_root_init_mask();
7910   KMP_DEBUG_ASSERT(__kmp_avail_proc);
7911   KMP_DEBUG_ASSERT(__kmp_dflt_team_nth);
7912 
7913   if (num_threads == 0) {
7914     if (__kmp_teams_thread_limit > 0) {
7915       num_threads = __kmp_teams_thread_limit;
7916     } else {
7917       num_threads = __kmp_avail_proc / num_teams;
7918     }
7919     // adjust num_threads without a warning, as it is not a user setting
7920     // num_threads = min(num_threads, nthreads-var, thread-limit-var)
7921     // no thread_limit clause specified - do not change thread-limit-var ICV
7922     if (num_threads > __kmp_dflt_team_nth) {
7923       num_threads = __kmp_dflt_team_nth; // honor nthreads-var ICV
7924     }
7925     if (num_threads > thr->th.th_current_task->td_icvs.thread_limit) {
7926       num_threads = thr->th.th_current_task->td_icvs.thread_limit;
7927     } // prevent team size to exceed thread-limit-var
7928     if (num_teams * num_threads > __kmp_teams_max_nth) {
7929       num_threads = __kmp_teams_max_nth / num_teams;
7930     }
7931     if (num_threads == 0) {
7932       num_threads = 1;
7933     }
7934   } else {
7935     if (num_threads < 0) {
7936       __kmp_msg(kmp_ms_warning, KMP_MSG(CantFormThrTeam, num_threads, 1),
7937                 __kmp_msg_null);
7938       num_threads = 1;
7939     }
7940     // This thread will be the primary thread of the league of primary threads
7941     // Store new thread limit; old limit is saved in th_cg_roots list
7942     thr->th.th_current_task->td_icvs.thread_limit = num_threads;
7943     // num_threads = min(num_threads, nthreads-var)
7944     if (num_threads > __kmp_dflt_team_nth) {
7945       num_threads = __kmp_dflt_team_nth; // honor nthreads-var ICV
7946     }
7947     if (num_teams * num_threads > __kmp_teams_max_nth) {
7948       int new_threads = __kmp_teams_max_nth / num_teams;
7949       if (new_threads == 0) {
7950         new_threads = 1;
7951       }
7952       if (new_threads != num_threads) {
7953         if (!__kmp_reserve_warn) { // user asked for too many threads
7954           __kmp_reserve_warn = 1; // conflicts with KMP_TEAMS_THREAD_LIMIT
7955           __kmp_msg(kmp_ms_warning,
7956                     KMP_MSG(CantFormThrTeam, num_threads, new_threads),
7957                     KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
7958         }
7959       }
7960       num_threads = new_threads;
7961     }
7962   }
7963   thr->th.th_teams_size.nth = num_threads;
7964 }
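// Worked example (hypothetical machine): with __kmp_avail_proc == 8,
// num_teams == 2, no thread_limit clause, and no KMP_TEAMS_THREAD_LIMIT,
// num_threads starts as 8 / 2 == 4 and is then clipped against nthreads-var,
// thread-limit-var, and __kmp_teams_max_nth before landing in
// th_teams_size.nth.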
7965 
7966 /* this sets the requested number of teams for the teams region and/or
7967    the number of threads for the next parallel region encountered  */
7968 void __kmp_push_num_teams(ident_t *id, int gtid, int num_teams,
7969                           int num_threads) {
7970   kmp_info_t *thr = __kmp_threads[gtid];
7971   if (num_teams < 0) {
7972     // OpenMP specification requires requested values to be positive,
7973     // but people can send us any value, so we'd better check
7974     __kmp_msg(kmp_ms_warning, KMP_MSG(NumTeamsNotPositive, num_teams, 1),
7975               __kmp_msg_null);
7976     num_teams = 1;
7977   }
7978   if (num_teams == 0) {
7979     if (__kmp_nteams > 0) {
7980       num_teams = __kmp_nteams;
7981     } else {
7982       num_teams = 1; // default number of teams is 1.
7983     }
7984   }
7985   if (num_teams > __kmp_teams_max_nth) { // too many teams requested?
7986     if (!__kmp_reserve_warn) {
7987       __kmp_reserve_warn = 1;
7988       __kmp_msg(kmp_ms_warning,
7989                 KMP_MSG(CantFormThrTeam, num_teams, __kmp_teams_max_nth),
7990                 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
7991     }
7992     num_teams = __kmp_teams_max_nth;
7993   }
7994   // Set number of teams (number of threads in the outer "parallel" of the
7995   // teams)
7996   thr->th.th_set_nproc = thr->th.th_teams_size.nteams = num_teams;
7997 
7998   __kmp_push_thread_limit(thr, num_teams, num_threads);
7999 }
8000 
8001 /* This sets the requested number of teams for the teams region and/or
8002    the number of threads for the next parallel region encountered  */
8003 void __kmp_push_num_teams_51(ident_t *id, int gtid, int num_teams_lb,
8004                              int num_teams_ub, int num_threads) {
8005   kmp_info_t *thr = __kmp_threads[gtid];
8006   KMP_DEBUG_ASSERT(num_teams_lb >= 0 && num_teams_ub >= 0);
8007   KMP_DEBUG_ASSERT(num_teams_ub >= num_teams_lb);
8008   KMP_DEBUG_ASSERT(num_threads >= 0);
8009 
8010   if (num_teams_lb > num_teams_ub) {
8011     __kmp_fatal(KMP_MSG(FailedToCreateTeam, num_teams_lb, num_teams_ub),
8012                 KMP_HNT(SetNewBound, __kmp_teams_max_nth), __kmp_msg_null);
8013   }
8014 
8015   int num_teams = 1; // default number of teams is 1.
8016 
8017   if (num_teams_lb == 0 && num_teams_ub > 0)
8018     num_teams_lb = num_teams_ub;
8019 
8020   if (num_teams_lb == 0 && num_teams_ub == 0) { // no num_teams clause
8021     num_teams = (__kmp_nteams > 0) ? __kmp_nteams : num_teams;
8022     if (num_teams > __kmp_teams_max_nth) {
8023       if (!__kmp_reserve_warn) {
8024         __kmp_reserve_warn = 1;
8025         __kmp_msg(kmp_ms_warning,
8026                   KMP_MSG(CantFormThrTeam, num_teams, __kmp_teams_max_nth),
8027                   KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
8028       }
8029       num_teams = __kmp_teams_max_nth;
8030     }
8031   } else if (num_teams_lb == num_teams_ub) { // requires exact number of teams
8032     num_teams = num_teams_ub;
8033   } else { // num_teams_lb <= num_teams <= num_teams_ub
8034     if (num_threads <= 0) {
8035       if (num_teams_ub > __kmp_teams_max_nth) {
8036         num_teams = num_teams_lb;
8037       } else {
8038         num_teams = num_teams_ub;
8039       }
8040     } else {
8041       num_teams = (num_threads > __kmp_teams_max_nth)
8042                       ? num_teams
8043                       : __kmp_teams_max_nth / num_threads;
8044       if (num_teams < num_teams_lb) {
8045         num_teams = num_teams_lb;
8046       } else if (num_teams > num_teams_ub) {
8047         num_teams = num_teams_ub;
8048       }
8049     }
8050   }
8051   // Set number of teams (number of threads in the outer "parallel" of the
8052   // teams)
8053   thr->th.th_set_nproc = thr->th.th_teams_size.nteams = num_teams;
8054 
8055   __kmp_push_thread_limit(thr, num_teams, num_threads);
8056 }
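// Worked example (hypothetical values): num_teams_lb == 2, num_teams_ub == 8,
// num_threads == 4, __kmp_teams_max_nth == 16. The final branch computes
// 16 / 4 == 4 teams, which already lies within [2, 8], so num_teams == 4.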
8057 
8058 // Set the proc_bind var to use in the following parallel region.
8059 void __kmp_push_proc_bind(ident_t *id, int gtid, kmp_proc_bind_t proc_bind) {
8060   kmp_info_t *thr = __kmp_threads[gtid];
8061   thr->th.th_set_proc_bind = proc_bind;
8062 }
8063 
8064 /* Launch the worker threads into the microtask. */
8065 
8066 void __kmp_internal_fork(ident_t *id, int gtid, kmp_team_t *team) {
8067   kmp_info_t *this_thr = __kmp_threads[gtid];
8068 
8069 #ifdef KMP_DEBUG
8070   int f;
8071 #endif /* KMP_DEBUG */
8072 
8073   KMP_DEBUG_ASSERT(team);
8074   KMP_DEBUG_ASSERT(this_thr->th.th_team == team);
8075   KMP_ASSERT(KMP_MASTER_GTID(gtid));
8076   KMP_MB(); /* Flush all pending memory write invalidates.  */
8077 
8078   team->t.t_construct = 0; /* no single directives seen yet */
8079   team->t.t_ordered.dt.t_value =
8080       0; /* thread 0 enters the ordered section first */
8081 
8082   /* Reset the identifiers on the dispatch buffer */
8083   KMP_DEBUG_ASSERT(team->t.t_disp_buffer);
8084   if (team->t.t_max_nproc > 1) {
8085     int i;
8086     for (i = 0; i < __kmp_dispatch_num_buffers; ++i) {
8087       team->t.t_disp_buffer[i].buffer_index = i;
8088       team->t.t_disp_buffer[i].doacross_buf_idx = i;
8089     }
8090   } else {
8091     team->t.t_disp_buffer[0].buffer_index = 0;
8092     team->t.t_disp_buffer[0].doacross_buf_idx = 0;
8093   }
8094 
8095   KMP_MB(); /* Flush all pending memory write invalidates.  */
8096   KMP_ASSERT(this_thr->th.th_team == team);
8097 
8098 #ifdef KMP_DEBUG
8099   for (f = 0; f < team->t.t_nproc; f++) {
8100     KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
8101                      team->t.t_threads[f]->th.th_team_nproc == team->t.t_nproc);
8102   }
8103 #endif /* KMP_DEBUG */
8104 
8105   /* release the worker threads so they may begin working */
8106   __kmp_fork_barrier(gtid, 0);
8107 }
8108 
8109 void __kmp_internal_join(ident_t *id, int gtid, kmp_team_t *team) {
8110   kmp_info_t *this_thr = __kmp_threads[gtid];
8111 
8112   KMP_DEBUG_ASSERT(team);
8113   KMP_DEBUG_ASSERT(this_thr->th.th_team == team);
8114   KMP_ASSERT(KMP_MASTER_GTID(gtid));
8115   KMP_MB(); /* Flush all pending memory write invalidates.  */
8116 
8117   /* Join barrier after fork */
8118 
8119 #ifdef KMP_DEBUG
8120   if (__kmp_threads[gtid] &&
8121       __kmp_threads[gtid]->th.th_team_nproc != team->t.t_nproc) {
8122     __kmp_printf("GTID: %d, __kmp_threads[%d]=%p\n", gtid, gtid,
8123                  __kmp_threads[gtid]);
8124     __kmp_printf("__kmp_threads[%d]->th.th_team_nproc=%d, TEAM: %p, "
8125                  "team->t.t_nproc=%d\n",
8126                  gtid, __kmp_threads[gtid]->th.th_team_nproc, team,
8127                  team->t.t_nproc);
8128     __kmp_print_structure();
8129   }
8130   KMP_DEBUG_ASSERT(__kmp_threads[gtid] &&
8131                    __kmp_threads[gtid]->th.th_team_nproc == team->t.t_nproc);
8132 #endif /* KMP_DEBUG */
8133 
8134   __kmp_join_barrier(gtid); /* wait for everyone */
8135 #if OMPT_SUPPORT
8136   ompt_state_t ompt_state = this_thr->th.ompt_thread_info.state;
8137   if (ompt_enabled.enabled &&
8138       (ompt_state == ompt_state_wait_barrier_teams ||
8139        ompt_state == ompt_state_wait_barrier_implicit_parallel)) {
8140     int ds_tid = this_thr->th.th_info.ds.ds_tid;
8141     ompt_data_t *task_data = OMPT_CUR_TASK_DATA(this_thr);
8142     this_thr->th.ompt_thread_info.state = ompt_state_overhead;
8143 #if OMPT_OPTIONAL
8144     void *codeptr = NULL;
8145     if (KMP_MASTER_TID(ds_tid) &&
8146         (ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) ||
8147          ompt_callbacks.ompt_callback(ompt_callback_sync_region)))
8148       codeptr = OMPT_CUR_TEAM_INFO(this_thr)->master_return_address;
8149 
8150     ompt_sync_region_t sync_kind = ompt_sync_region_barrier_implicit_parallel;
8151     if (this_thr->th.ompt_thread_info.parallel_flags & ompt_parallel_league)
8152       sync_kind = ompt_sync_region_barrier_teams;
8153     if (ompt_enabled.ompt_callback_sync_region_wait) {
8154       ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
8155           sync_kind, ompt_scope_end, NULL, task_data, codeptr);
8156     }
8157     if (ompt_enabled.ompt_callback_sync_region) {
8158       ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
8159           sync_kind, ompt_scope_end, NULL, task_data, codeptr);
8160     }
8161 #endif
8162     if (!KMP_MASTER_TID(ds_tid) && ompt_enabled.ompt_callback_implicit_task) {
8163       ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
8164           ompt_scope_end, NULL, task_data, 0, ds_tid,
8165           ompt_task_implicit); // TODO: Can this be ompt_task_initial?
8166     }
8167   }
8168 #endif
8169 
8170   KMP_MB(); /* Flush all pending memory write invalidates.  */
8171   KMP_ASSERT(this_thr->th.th_team == team);
8172 }
8173 
8174 /* ------------------------------------------------------------------------ */
8175 
8176 #ifdef USE_LOAD_BALANCE
8177 
8178 // Return the number of worker threads actively spinning in the hot team, if
8179 // we are at the outermost level of parallelism. Otherwise, return 0.
8180 static int __kmp_active_hot_team_nproc(kmp_root_t *root) {
8181   int i;
8182   int retval;
8183   kmp_team_t *hot_team;
8184 
8185   if (root->r.r_active) {
8186     return 0;
8187   }
8188   hot_team = root->r.r_hot_team;
8189   if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME) {
8190     return hot_team->t.t_nproc - 1; // Don't count primary thread
8191   }
8192 
8193   // Skip the primary thread - it is accounted for elsewhere.
8194   retval = 0;
8195   for (i = 1; i < hot_team->t.t_nproc; i++) {
8196     if (hot_team->t.t_threads[i]->th.th_active) {
8197       retval++;
8198     }
8199   }
8200   return retval;
8201 }
8202 
8203 // Perform an automatic adjustment to the number of
8204 // threads used by the next parallel region.
8205 static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc) {
8206   int retval;
8207   int pool_active;
8208   int hot_team_active;
8209   int team_curr_active;
8210   int system_active;
8211 
8212   KB_TRACE(20, ("__kmp_load_balance_nproc: called root:%p set_nproc:%d\n", root,
8213                 set_nproc));
8214   KMP_DEBUG_ASSERT(root);
8215   KMP_DEBUG_ASSERT(root->r.r_root_team->t.t_threads[0]
8216                        ->th.th_current_task->td_icvs.dynamic == TRUE);
8217   KMP_DEBUG_ASSERT(set_nproc > 1);
8218 
8219   if (set_nproc == 1) {
8220     KB_TRACE(20, ("__kmp_load_balance_nproc: serial execution.\n"));
8221     return 1;
8222   }
8223 
  // Threads that are active in the thread pool, active in the hot team for
  // this particular root (if we are at the outermost parallel level), and the
  // currently executing thread (which will become the primary thread) are all
  // available to add to the new team, but they currently contribute to the
  // system load and must be accounted for.
8229   pool_active = __kmp_thread_pool_active_nth;
8230   hot_team_active = __kmp_active_hot_team_nproc(root);
8231   team_curr_active = pool_active + hot_team_active + 1;
8232 
8233   // Check the system load.
8234   system_active = __kmp_get_load_balance(__kmp_avail_proc + team_curr_active);
8235   KB_TRACE(30, ("__kmp_load_balance_nproc: system active = %d pool active = %d "
8236                 "hot team active = %d\n",
8237                 system_active, pool_active, hot_team_active));
8238 
8239   if (system_active < 0) {
8240     // There was an error reading the necessary info from /proc, so use the
8241     // thread limit algorithm instead. Once we set __kmp_global.g.g_dynamic_mode
8242     // = dynamic_thread_limit, we shouldn't wind up getting back here.
8243     __kmp_global.g.g_dynamic_mode = dynamic_thread_limit;
8244     KMP_WARNING(CantLoadBalUsing, "KMP_DYNAMIC_MODE=thread limit");
8245 
8246     // Make this call behave like the thread limit algorithm.
8247     retval = __kmp_avail_proc - __kmp_nth +
8248              (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
8249     if (retval > set_nproc) {
8250       retval = set_nproc;
8251     }
8252     if (retval < KMP_MIN_NTH) {
8253       retval = KMP_MIN_NTH;
8254     }
8255 
8256     KB_TRACE(20, ("__kmp_load_balance_nproc: thread limit exit. retval:%d\n",
8257                   retval));
8258     return retval;
8259   }
8260 
  // There is a slight delay before the load balance algorithm detects newly
  // running procs. The real system load at this instant should be at least as
  // large as the number of active OpenMP threads available to add to the team.
8264   if (system_active < team_curr_active) {
8265     system_active = team_curr_active;
8266   }
8267   retval = __kmp_avail_proc - system_active + team_curr_active;
8268   if (retval > set_nproc) {
8269     retval = set_nproc;
8270   }
8271   if (retval < KMP_MIN_NTH) {
8272     retval = KMP_MIN_NTH;
8273   }
8274 
8275   KB_TRACE(20, ("__kmp_load_balance_nproc: exit. retval:%d\n", retval));
8276   return retval;
8277 } // __kmp_load_balance_nproc()
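
// A worked example of the computation above (illustrative numbers only): with
// __kmp_avail_proc = 8, pool_active = 2, and hot_team_active = 3, the
// currently executing thread makes team_curr_active = 2 + 3 + 1 = 6. If
// __kmp_get_load_balance() then reports system_active = 7, the result is
// retval = 8 - 7 + 6 = 7, which is finally clipped into
// [KMP_MIN_NTH, set_nproc].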
8278 
8279 #endif /* USE_LOAD_BALANCE */
8280 
8281 /* ------------------------------------------------------------------------ */
8282 
8283 /* NOTE: this is called with the __kmp_init_lock held */
8284 void __kmp_cleanup(void) {
8285   int f;
8286 
8287   KA_TRACE(10, ("__kmp_cleanup: enter\n"));
8288 
8289   if (TCR_4(__kmp_init_parallel)) {
8290 #if KMP_HANDLE_SIGNALS
8291     __kmp_remove_signals();
8292 #endif
8293     TCW_4(__kmp_init_parallel, FALSE);
8294   }
8295 
8296   if (TCR_4(__kmp_init_middle)) {
8297 #if KMP_AFFINITY_SUPPORTED
8298     __kmp_affinity_uninitialize();
8299 #endif /* KMP_AFFINITY_SUPPORTED */
8300     __kmp_cleanup_hierarchy();
8301     TCW_4(__kmp_init_middle, FALSE);
8302   }
8303 
8304   KA_TRACE(10, ("__kmp_cleanup: go serial cleanup\n"));
8305 
8306   if (__kmp_init_serial) {
8307     __kmp_runtime_destroy();
8308     __kmp_init_serial = FALSE;
8309   }
8310 
8311   __kmp_cleanup_threadprivate_caches();
8312 
8313   for (f = 0; f < __kmp_threads_capacity; f++) {
8314     if (__kmp_root[f] != NULL) {
8315       __kmp_free(__kmp_root[f]);
8316       __kmp_root[f] = NULL;
8317     }
8318   }
8319   __kmp_free(__kmp_threads);
  // __kmp_threads and __kmp_root were allocated as a single block, so freeing
  // __kmp_threads releases both; there is no need to free __kmp_root
  // separately.
8322   __kmp_threads = NULL;
8323   __kmp_root = NULL;
8324   __kmp_threads_capacity = 0;
8325 
8326   // Free old __kmp_threads arrays if they exist.
8327   kmp_old_threads_list_t *ptr = __kmp_old_threads_list;
8328   while (ptr) {
8329     kmp_old_threads_list_t *next = ptr->next;
8330     __kmp_free(ptr->threads);
8331     __kmp_free(ptr);
8332     ptr = next;
8333   }
8334 
8335 #if KMP_USE_DYNAMIC_LOCK
8336   __kmp_cleanup_indirect_user_locks();
8337 #else
8338   __kmp_cleanup_user_locks();
8339 #endif
8340 #if OMPD_SUPPORT
8341   if (ompd_state) {
8342     __kmp_free(ompd_env_block);
8343     ompd_env_block = NULL;
8344     ompd_env_block_size = 0;
8345   }
8346 #endif
8347 
8348 #if KMP_AFFINITY_SUPPORTED
8349   KMP_INTERNAL_FREE(CCAST(char *, __kmp_cpuinfo_file));
8350   __kmp_cpuinfo_file = NULL;
8351 #endif /* KMP_AFFINITY_SUPPORTED */
8352 
8353 #if KMP_USE_ADAPTIVE_LOCKS
8354 #if KMP_DEBUG_ADAPTIVE_LOCKS
8355   __kmp_print_speculative_stats();
8356 #endif
8357 #endif
8358   KMP_INTERNAL_FREE(__kmp_nested_nth.nth);
8359   __kmp_nested_nth.nth = NULL;
8360   __kmp_nested_nth.size = 0;
8361   __kmp_nested_nth.used = 0;
8362 
8363   KMP_INTERNAL_FREE(__kmp_nested_proc_bind.bind_types);
8364   __kmp_nested_proc_bind.bind_types = NULL;
8365   __kmp_nested_proc_bind.size = 0;
8366   __kmp_nested_proc_bind.used = 0;
8367   if (__kmp_affinity_format) {
8368     KMP_INTERNAL_FREE(__kmp_affinity_format);
8369     __kmp_affinity_format = NULL;
8370   }
8371 
8372   __kmp_i18n_catclose();
8373 
8374 #if KMP_USE_HIER_SCHED
8375   __kmp_hier_scheds.deallocate();
8376 #endif
8377 
8378 #if KMP_STATS_ENABLED
8379   __kmp_stats_fini();
8380 #endif
8381 
8382   KA_TRACE(10, ("__kmp_cleanup: exit\n"));
8383 }
8384 
8385 /* ------------------------------------------------------------------------ */
8386 
8387 int __kmp_ignore_mppbeg(void) {
8388   char *env;
8389 
8390   if ((env = getenv("KMP_IGNORE_MPPBEG")) != NULL) {
8391     if (__kmp_str_match_false(env))
8392       return FALSE;
8393   }
  // By default __kmpc_begin() is a no-op.
8395   return TRUE;
8396 }
8397 
8398 int __kmp_ignore_mppend(void) {
8399   char *env;
8400 
8401   if ((env = getenv("KMP_IGNORE_MPPEND")) != NULL) {
8402     if (__kmp_str_match_false(env))
8403       return FALSE;
8404   }
  // By default __kmpc_end() is a no-op.
8406   return TRUE;
8407 }
8408 
8409 void __kmp_internal_begin(void) {
8410   int gtid;
8411   kmp_root_t *root;
8412 
  /* this is a very important step as it registers new sibling threads
     and assigns each new uber thread a gtid */
8415   gtid = __kmp_entry_gtid();
8416   root = __kmp_threads[gtid]->th.th_root;
8417   KMP_ASSERT(KMP_UBER_GTID(gtid));
8418 
8419   if (root->r.r_begin)
8420     return;
8421   __kmp_acquire_lock(&root->r.r_begin_lock, gtid);
8422   if (root->r.r_begin) {
8423     __kmp_release_lock(&root->r.r_begin_lock, gtid);
8424     return;
8425   }
8426 
8427   root->r.r_begin = TRUE;
8428 
8429   __kmp_release_lock(&root->r.r_begin_lock, gtid);
8430 }
8431 
8432 /* ------------------------------------------------------------------------ */
8433 
8434 void __kmp_user_set_library(enum library_type arg) {
8435   int gtid;
8436   kmp_root_t *root;
8437   kmp_info_t *thread;
8438 
8439   /* first, make sure we are initialized so we can get our gtid */
8440 
8441   gtid = __kmp_entry_gtid();
8442   thread = __kmp_threads[gtid];
8443 
8444   root = thread->th.th_root;
8445 
8446   KA_TRACE(20, ("__kmp_user_set_library: enter T#%d, arg: %d, %d\n", gtid, arg,
8447                 library_serial));
8448   if (root->r.r_in_parallel) { /* Must be called in serial section of top-level
8449                                   thread */
8450     KMP_WARNING(SetLibraryIncorrectCall);
8451     return;
8452   }
8453 
8454   switch (arg) {
8455   case library_serial:
8456     thread->th.th_set_nproc = 0;
8457     set__nproc(thread, 1);
8458     break;
8459   case library_turnaround:
8460     thread->th.th_set_nproc = 0;
8461     set__nproc(thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth
8462                                            : __kmp_dflt_team_nth_ub);
8463     break;
8464   case library_throughput:
8465     thread->th.th_set_nproc = 0;
8466     set__nproc(thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth
8467                                            : __kmp_dflt_team_nth_ub);
8468     break;
8469   default:
8470     KMP_FATAL(UnknownLibraryType, arg);
8471   }
8472 
8473   __kmp_aux_set_library(arg);
8474 }
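
// Reached from the kmp_set_library() family of KMP extensions (e.g.,
// kmp_set_library_throughput()); as the check above enforces, it must be
// called from the serial part of the program, not from inside a parallel
// region.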
8475 
8476 void __kmp_aux_set_stacksize(size_t arg) {
8477   if (!__kmp_init_serial)
8478     __kmp_serial_initialize();
8479 
8480 #if KMP_OS_DARWIN
8481   if (arg & (0x1000 - 1)) {
8482     arg &= ~(0x1000 - 1);
8483     if (arg + 0x1000) /* check for overflow if we round up */
8484       arg += 0x1000;
8485   }
8486 #endif
8487   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
8488 
8489   /* only change the default stacksize before the first parallel region */
8490   if (!TCR_4(__kmp_init_parallel)) {
8491     size_t value = arg; /* argument is in bytes */
8492 
8493     if (value < __kmp_sys_min_stksize)
8494       value = __kmp_sys_min_stksize;
8495     else if (value > KMP_MAX_STKSIZE)
8496       value = KMP_MAX_STKSIZE;
8497 
8498     __kmp_stksize = value;
8499 
    __kmp_env_stksize = TRUE; /* behave as if KMP_STACKSIZE was specified */
8501   }
8502 
8503   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
8504 }
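
// Reached from the kmp_set_stacksize()/kmp_set_stacksize_s() KMP extensions.
// Note that the value only takes effect before the first parallel region and
// is clipped into [__kmp_sys_min_stksize, KMP_MAX_STKSIZE] above.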
8505 
8506 /* set the behaviour of the runtime library */
8507 /* TODO this can cause some odd behaviour with sibling parallelism... */
8508 void __kmp_aux_set_library(enum library_type arg) {
8509   __kmp_library = arg;
8510 
8511   switch (__kmp_library) {
8512   case library_serial: {
8513     KMP_INFORM(LibraryIsSerial);
8514   } break;
8515   case library_turnaround:
8516     if (__kmp_use_yield == 1 && !__kmp_use_yield_exp_set)
8517       __kmp_use_yield = 2; // only yield when oversubscribed
8518     break;
8519   case library_throughput:
8520     if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME)
8521       __kmp_dflt_blocktime = KMP_DEFAULT_BLOCKTIME;
8522     break;
8523   default:
8524     KMP_FATAL(UnknownLibraryType, arg);
8525   }
8526 }
8527 
/* Get team information common to all teams-construct API routines */
// Returns NULL if not in a teams construct
8530 static kmp_team_t *__kmp_aux_get_team_info(int &teams_serialized) {
8531   kmp_info_t *thr = __kmp_entry_thread();
8532   teams_serialized = 0;
8533   if (thr->th.th_teams_microtask) {
8534     kmp_team_t *team = thr->th.th_team;
8535     int tlevel = thr->th.th_teams_level; // the level of the teams construct
8536     int ii = team->t.t_level;
8537     teams_serialized = team->t.t_serialized;
8538     int level = tlevel + 1;
8539     KMP_DEBUG_ASSERT(ii >= tlevel);
8540     while (ii > level) {
8541       for (teams_serialized = team->t.t_serialized;
8542            (teams_serialized > 0) && (ii > level); teams_serialized--, ii--) {
8543       }
8544       if (team->t.t_serialized && (!teams_serialized)) {
8545         team = team->t.t_parent;
8546         continue;
8547       }
8548       if (ii > level) {
8549         team = team->t.t_parent;
8550         ii--;
8551       }
8552     }
8553     return team;
8554   }
8555   return NULL;
8556 }
8557 
8558 int __kmp_aux_get_team_num() {
8559   int serialized;
8560   kmp_team_t *team = __kmp_aux_get_team_info(serialized);
8561   if (team) {
8562     if (serialized > 1) {
8563       return 0; // teams region is serialized ( 1 team of 1 thread ).
8564     } else {
8565       return team->t.t_master_tid;
8566     }
8567   }
8568   return 0;
8569 }
8570 
8571 int __kmp_aux_get_num_teams() {
8572   int serialized;
8573   kmp_team_t *team = __kmp_aux_get_team_info(serialized);
8574   if (team) {
8575     if (serialized > 1) {
8576       return 1;
8577     } else {
8578       return team->t.t_parent->t.t_nproc;
8579     }
8580   }
8581   return 1;
8582 }
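
// A sketch of the user-visible behavior (routed through the helpers above):
// outside a teams construct, omp_get_num_teams() returns 1 and
// omp_get_team_num() returns 0; inside "#pragma omp teams num_teams(4)" they
// return 4 and the team index 0..3, respectively.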
8583 
8584 /* ------------------------------------------------------------------------ */
8585 
8586 /*
8587  * Affinity Format Parser
8588  *
 * A field has the form: %[[[0].]size]type
8590  * % and type are required (%% means print a literal '%')
8591  * type is either single char or long name surrounded by {},
8592  * e.g., N or {num_threads}
8593  * 0 => leading zeros
8594  * . => right justified when size is specified
8595  * by default output is left justified
8596  * size is the *minimum* field length
8597  * All other characters are printed as is
8598  *
8599  * Available field types:
 * t {team_num}          - omp_get_team_num()
 * T {num_teams}         - omp_get_num_teams()
 * L {nesting_level}     - omp_get_level()
 * n {thread_num}        - omp_get_thread_num()
 * N {num_threads}       - omp_get_num_threads()
 * a {ancestor_tnum}     - omp_get_ancestor_thread_num(omp_get_level()-1)
 * H {host}              - name of host machine
 * P {process_id}        - process id (integer)
 * i {native_thread_id}  - native thread identifier (integer)
 * A {thread_affinity}   - comma separated list of integers or integer ranges
 *                         (values of affinity mask)
8609  *
8610  * Implementation-specific field types can be added
8611  * If a type is unknown, print "undefined"
8612  */
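
// For example (illustrative values only), with the format
// "host=%H pid=%P tid=%0.4n/%N", thread 2 of a 4-thread team on host
// "node01" with pid 1234 would produce "host=node01 pid=1234 tid=0002/4":
// '0' pads with zeros, '.' right-justifies, and 4 is the minimum field width.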
8613 
// Structure holding the short name, long name, and corresponding data type
// for snprintf. A table of these represents the full set of valid keyword
// field types.
8617 typedef struct kmp_affinity_format_field_t {
  char short_name; // single-character name from the spec, e.g., 'L'
  const char *long_name; // long name from the spec, e.g., "nesting_level"
8620   char field_format; // data type for snprintf (typically 'd' or 's'
8621   // for integer or string)
8622 } kmp_affinity_format_field_t;
8623 
8624 static const kmp_affinity_format_field_t __kmp_affinity_format_table[] = {
8625 #if KMP_AFFINITY_SUPPORTED
8626     {'A', "thread_affinity", 's'},
8627 #endif
8628     {'t', "team_num", 'd'},
8629     {'T', "num_teams", 'd'},
8630     {'L', "nesting_level", 'd'},
8631     {'n', "thread_num", 'd'},
8632     {'N', "num_threads", 'd'},
8633     {'a', "ancestor_tnum", 'd'},
8634     {'H', "host", 's'},
8635     {'P', "process_id", 'd'},
8636     {'i', "native_thread_id", 'd'}};
8637 
// Return the number of characters it takes to hold the field
8639 static int __kmp_aux_capture_affinity_field(int gtid, const kmp_info_t *th,
8640                                             const char **ptr,
8641                                             kmp_str_buf_t *field_buffer) {
8642   int rc, format_index, field_value;
8643   const char *width_left, *width_right;
8644   bool pad_zeros, right_justify, parse_long_name, found_valid_name;
8645   static const int FORMAT_SIZE = 20;
8646   char format[FORMAT_SIZE] = {0};
8647   char absolute_short_name = 0;
8648 
8649   KMP_DEBUG_ASSERT(gtid >= 0);
8650   KMP_DEBUG_ASSERT(th);
8651   KMP_DEBUG_ASSERT(**ptr == '%');
8652   KMP_DEBUG_ASSERT(field_buffer);
8653 
8654   __kmp_str_buf_clear(field_buffer);
8655 
8656   // Skip the initial %
8657   (*ptr)++;
8658 
8659   // Check for %% first
8660   if (**ptr == '%') {
8661     __kmp_str_buf_cat(field_buffer, "%", 1);
8662     (*ptr)++; // skip over the second %
8663     return 1;
8664   }
8665 
8666   // Parse field modifiers if they are present
8667   pad_zeros = false;
8668   if (**ptr == '0') {
8669     pad_zeros = true;
8670     (*ptr)++; // skip over 0
8671   }
8672   right_justify = false;
8673   if (**ptr == '.') {
8674     right_justify = true;
8675     (*ptr)++; // skip over .
8676   }
8677   // Parse width of field: [width_left, width_right)
8678   width_left = width_right = NULL;
8679   if (**ptr >= '0' && **ptr <= '9') {
8680     width_left = *ptr;
8681     SKIP_DIGITS(*ptr);
8682     width_right = *ptr;
8683   }
8684 
8685   // Create the format for KMP_SNPRINTF based on flags parsed above
8686   format_index = 0;
8687   format[format_index++] = '%';
8688   if (!right_justify)
8689     format[format_index++] = '-';
8690   if (pad_zeros)
8691     format[format_index++] = '0';
8692   if (width_left && width_right) {
8693     int i = 0;
    // Only allow 8-digit number widths.
    // This also prevents overflowing the format buffer.
8696     while (i < 8 && width_left < width_right) {
8697       format[format_index++] = *width_left;
8698       width_left++;
8699       i++;
8700     }
8701   }
8702 
8703   // Parse a name (long or short)
8704   // Canonicalize the name into absolute_short_name
8705   found_valid_name = false;
8706   parse_long_name = (**ptr == '{');
8707   if (parse_long_name)
8708     (*ptr)++; // skip initial left brace
8709   for (size_t i = 0; i < sizeof(__kmp_affinity_format_table) /
8710                              sizeof(__kmp_affinity_format_table[0]);
8711        ++i) {
8712     char short_name = __kmp_affinity_format_table[i].short_name;
8713     const char *long_name = __kmp_affinity_format_table[i].long_name;
8714     char field_format = __kmp_affinity_format_table[i].field_format;
8715     if (parse_long_name) {
8716       size_t length = KMP_STRLEN(long_name);
8717       if (strncmp(*ptr, long_name, length) == 0) {
8718         found_valid_name = true;
8719         (*ptr) += length; // skip the long name
8720       }
8721     } else if (**ptr == short_name) {
8722       found_valid_name = true;
8723       (*ptr)++; // skip the short name
8724     }
8725     if (found_valid_name) {
8726       format[format_index++] = field_format;
8727       format[format_index++] = '\0';
8728       absolute_short_name = short_name;
8729       break;
8730     }
8731   }
8732   if (parse_long_name) {
8733     if (**ptr != '}') {
8734       absolute_short_name = 0;
8735     } else {
8736       (*ptr)++; // skip over the right brace
8737     }
8738   }
8739 
8740   // Attempt to fill the buffer with the requested
8741   // value using snprintf within __kmp_str_buf_print()
8742   switch (absolute_short_name) {
8743   case 't':
8744     rc = __kmp_str_buf_print(field_buffer, format, __kmp_aux_get_team_num());
8745     break;
8746   case 'T':
8747     rc = __kmp_str_buf_print(field_buffer, format, __kmp_aux_get_num_teams());
8748     break;
8749   case 'L':
8750     rc = __kmp_str_buf_print(field_buffer, format, th->th.th_team->t.t_level);
8751     break;
8752   case 'n':
8753     rc = __kmp_str_buf_print(field_buffer, format, __kmp_tid_from_gtid(gtid));
8754     break;
8755   case 'H': {
8756     static const int BUFFER_SIZE = 256;
8757     char buf[BUFFER_SIZE];
8758     __kmp_expand_host_name(buf, BUFFER_SIZE);
8759     rc = __kmp_str_buf_print(field_buffer, format, buf);
8760   } break;
8761   case 'P':
8762     rc = __kmp_str_buf_print(field_buffer, format, getpid());
8763     break;
8764   case 'i':
8765     rc = __kmp_str_buf_print(field_buffer, format, __kmp_gettid());
8766     break;
8767   case 'N':
8768     rc = __kmp_str_buf_print(field_buffer, format, th->th.th_team->t.t_nproc);
8769     break;
8770   case 'a':
8771     field_value =
8772         __kmp_get_ancestor_thread_num(gtid, th->th.th_team->t.t_level - 1);
8773     rc = __kmp_str_buf_print(field_buffer, format, field_value);
8774     break;
8775 #if KMP_AFFINITY_SUPPORTED
8776   case 'A': {
8777     kmp_str_buf_t buf;
8778     __kmp_str_buf_init(&buf);
8779     __kmp_affinity_str_buf_mask(&buf, th->th.th_affin_mask);
8780     rc = __kmp_str_buf_print(field_buffer, format, buf.str);
8781     __kmp_str_buf_free(&buf);
8782   } break;
8783 #endif
8784   default:
    // According to the spec, if an implementation does not have info for a
    // field type, then "undefined" is printed.
8787     rc = __kmp_str_buf_print(field_buffer, "%s", "undefined");
8788     // Skip the field
8789     if (parse_long_name) {
8790       SKIP_TOKEN(*ptr);
8791       if (**ptr == '}')
8792         (*ptr)++;
8793     } else {
8794       (*ptr)++;
8795     }
8796   }
8797 
8798   KMP_ASSERT(format_index <= FORMAT_SIZE);
8799   return rc;
8800 }
8801 
8802 /*
 * Return the number of characters needed to hold the affinity string
 * (not including the null terminator).
 * The resulting string is printed to buffer, which the caller can then
 * handle afterwards.
8807  */
8808 size_t __kmp_aux_capture_affinity(int gtid, const char *format,
8809                                   kmp_str_buf_t *buffer) {
8810   const char *parse_ptr;
8811   size_t retval;
8812   const kmp_info_t *th;
8813   kmp_str_buf_t field;
8814 
8815   KMP_DEBUG_ASSERT(buffer);
8816   KMP_DEBUG_ASSERT(gtid >= 0);
8817 
8818   __kmp_str_buf_init(&field);
8819   __kmp_str_buf_clear(buffer);
8820 
8821   th = __kmp_threads[gtid];
8822   retval = 0;
8823 
  // If format is NULL or a zero-length string, then we use the
  // affinity-format-var ICV
8826   parse_ptr = format;
8827   if (parse_ptr == NULL || *parse_ptr == '\0') {
8828     parse_ptr = __kmp_affinity_format;
8829   }
8830   KMP_DEBUG_ASSERT(parse_ptr);
8831 
8832   while (*parse_ptr != '\0') {
8833     // Parse a field
8834     if (*parse_ptr == '%') {
8835       // Put field in the buffer
8836       int rc = __kmp_aux_capture_affinity_field(gtid, th, &parse_ptr, &field);
8837       __kmp_str_buf_catbuf(buffer, &field);
8838       retval += rc;
8839     } else {
8840       // Put literal character in buffer
8841       __kmp_str_buf_cat(buffer, parse_ptr, 1);
8842       retval++;
8843       parse_ptr++;
8844     }
8845   }
8846   __kmp_str_buf_free(&field);
8847   return retval;
8848 }
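
// A minimal usage sketch (not compiled into the runtime) showing how the
// user-level OpenMP 5.0 routine omp_capture_affinity() exercises
// __kmp_aux_capture_affinity(); the printed values are illustrative:
#if 0
#include <omp.h>
#include <stdio.h>
int main(void) {
#pragma omp parallel num_threads(2)
  {
    char buf[256];
    // Returns the number of characters needed, excluding the null terminator.
    size_t needed = omp_capture_affinity(buf, sizeof(buf), "tid=%n/%N host=%H");
    printf("needed=%zu captured=\"%s\"\n", needed, buf);
  }
  return 0;
}
#endif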
8849 
8850 // Displays the affinity string to stdout
8851 void __kmp_aux_display_affinity(int gtid, const char *format) {
8852   kmp_str_buf_t buf;
8853   __kmp_str_buf_init(&buf);
8854   __kmp_aux_capture_affinity(gtid, format, &buf);
8855   __kmp_fprintf(kmp_out, "%s" KMP_END_OF_LINE, buf.str);
8856   __kmp_str_buf_free(&buf);
8857 }
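
// A short sketch (not compiled into the runtime) of the user-level display
// path; omp_display_affinity() reaches this routine, and a NULL or empty
// format falls back to the affinity-format-var ICV as handled in
// __kmp_aux_capture_affinity() above:
#if 0
#include <omp.h>
int main(void) {
  omp_set_affinity_format("OMP: host=%H tid=%n/%N");
#pragma omp parallel num_threads(2)
  omp_display_affinity(NULL); // NULL -> use affinity-format-var
  return 0;
}
#endif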
8858 
8859 /* ------------------------------------------------------------------------ */
8860 void __kmp_aux_set_blocktime(int arg, kmp_info_t *thread, int tid) {
8861   int blocktime = arg; /* argument is in microseconds */
8862 #if KMP_USE_MONITOR
8863   int bt_intervals;
8864 #endif
8865   kmp_int8 bt_set;
8866 
8867   __kmp_save_internal_controls(thread);
8868 
8869   /* Normalize and set blocktime for the teams */
8870   if (blocktime < KMP_MIN_BLOCKTIME)
8871     blocktime = KMP_MIN_BLOCKTIME;
8872   else if (blocktime > KMP_MAX_BLOCKTIME)
8873     blocktime = KMP_MAX_BLOCKTIME;
8874 
8875   set__blocktime_team(thread->th.th_team, tid, blocktime);
8876   set__blocktime_team(thread->th.th_serial_team, 0, blocktime);
8877 
8878 #if KMP_USE_MONITOR
8879   /* Calculate and set blocktime intervals for the teams */
8880   bt_intervals = KMP_INTERVALS_FROM_BLOCKTIME(blocktime, __kmp_monitor_wakeups);
8881 
8882   set__bt_intervals_team(thread->th.th_team, tid, bt_intervals);
8883   set__bt_intervals_team(thread->th.th_serial_team, 0, bt_intervals);
8884 #endif
8885 
  /* Record that blocktime was explicitly set (bt_set = TRUE) */
8887   bt_set = TRUE;
8888 
8889   set__bt_set_team(thread->th.th_team, tid, bt_set);
8890   set__bt_set_team(thread->th.th_serial_team, 0, bt_set);
8891 #if KMP_USE_MONITOR
8892   KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d, "
8893                 "bt_intervals=%d, monitor_updates=%d\n",
8894                 __kmp_gtid_from_tid(tid, thread->th.th_team),
8895                 thread->th.th_team->t.t_id, tid, blocktime, bt_intervals,
8896                 __kmp_monitor_wakeups));
8897 #else
8898   KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d\n",
8899                 __kmp_gtid_from_tid(tid, thread->th.th_team),
8900                 thread->th.th_team->t.t_id, tid, blocktime));
8901 #endif
8902 }
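
// For example, KMP_BLOCKTIME=0 makes workers go to sleep as soon as they
// finish a parallel region, while KMP_BLOCKTIME=infinite (KMP_MAX_BLOCKTIME)
// keeps them spin-waiting indefinitely; other values are clipped into
// [KMP_MIN_BLOCKTIME, KMP_MAX_BLOCKTIME] as shown above.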
8903 
8904 void __kmp_aux_set_defaults(char const *str, size_t len) {
8905   if (!__kmp_init_serial) {
8906     __kmp_serial_initialize();
8907   }
8908   __kmp_env_initialize(str);
8909 
8910   if (__kmp_settings || __kmp_display_env || __kmp_display_env_verbose) {
8911     __kmp_env_print();
8912   }
8913 } // __kmp_aux_set_defaults
8914 
8915 /* ------------------------------------------------------------------------ */
8916 /* internal fast reduction routines */
8917 
8918 PACKED_REDUCTION_METHOD_T
8919 __kmp_determine_reduction_method(
8920     ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars, size_t reduce_size,
8921     void *reduce_data, void (*reduce_func)(void *lhs_data, void *rhs_data),
8922     kmp_critical_name *lck) {
8923 
8924   // Default reduction method: critical construct ( lck != NULL, like in current
8925   // PAROPT )
8926   // If ( reduce_data!=NULL && reduce_func!=NULL ): the tree-reduction method
8927   // can be selected by RTL
8928   // If loc->flags contains KMP_IDENT_ATOMIC_REDUCE, the atomic reduce method
8929   // can be selected by RTL
// Finally, it's up to the OpenMP RTL to decide which method to select among
// those generated by PAROPT.
8932 
8933   PACKED_REDUCTION_METHOD_T retval;
8934 
8935   int team_size;
8936 
8937   KMP_DEBUG_ASSERT(lck); // it would be nice to test ( lck != 0 )
8938 
8939 #define FAST_REDUCTION_ATOMIC_METHOD_GENERATED                                 \
8940   (loc &&                                                                      \
8941    ((loc->flags & (KMP_IDENT_ATOMIC_REDUCE)) == (KMP_IDENT_ATOMIC_REDUCE)))
8942 #define FAST_REDUCTION_TREE_METHOD_GENERATED ((reduce_data) && (reduce_func))
8943 
8944   retval = critical_reduce_block;
8945 
  // an alternative way of getting the team size (with one more dynamic
  // dereference) is slower
8947   team_size = __kmp_get_team_num_threads(global_tid);
8948   if (team_size == 1) {
8949 
8950     retval = empty_reduce_block;
8951 
8952   } else {
8953 
8954     int atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;
8955 
8956 #if KMP_ARCH_X86_64 || KMP_ARCH_PPC64 || KMP_ARCH_AARCH64 ||                   \
8957     KMP_ARCH_MIPS64 || KMP_ARCH_RISCV64 || KMP_ARCH_LOONGARCH64 ||             \
8958     KMP_ARCH_VE || KMP_ARCH_S390X || KMP_ARCH_WASM
8959 
8960 #if KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD ||     \
8961     KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN || KMP_OS_HAIKU ||       \
8962     KMP_OS_HURD || KMP_OS_SOLARIS || KMP_OS_WASI || KMP_OS_AIX
8963 
8964     int teamsize_cutoff = 4;
8965 
8966 #if KMP_MIC_SUPPORTED
8967     if (__kmp_mic_type != non_mic) {
8968       teamsize_cutoff = 8;
8969     }
8970 #endif
8971     int tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
8972     if (tree_available) {
8973       if (team_size <= teamsize_cutoff) {
8974         if (atomic_available) {
8975           retval = atomic_reduce_block;
8976         }
8977       } else {
8978         retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER;
8979       }
8980     } else if (atomic_available) {
8981       retval = atomic_reduce_block;
8982     }
8983 #else
8984 #error "Unknown or unsupported OS"
8985 #endif // KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD ||
8986        // KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN || KMP_OS_HAIKU ||
8987        // KMP_OS_HURD || KMP_OS_SOLARIS || KMP_OS_WASI || KMP_OS_AIX
8988 
8989 #elif KMP_ARCH_X86 || KMP_ARCH_ARM || KMP_ARCH_AARCH || KMP_ARCH_MIPS ||       \
8990     KMP_ARCH_WASM || KMP_ARCH_PPC || KMP_ARCH_AARCH64_32 || KMP_ARCH_SPARC
8991 
8992 #if KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD ||     \
8993     KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_HAIKU || KMP_OS_HURD ||         \
8994     KMP_OS_SOLARIS || KMP_OS_WASI || KMP_OS_AIX
8995 
8996     // basic tuning
8997 
8998     if (atomic_available) {
8999       if (num_vars <= 2) { // && ( team_size <= 8 ) due to false-sharing ???
9000         retval = atomic_reduce_block;
9001       }
9002     } // otherwise: use critical section
9003 
9004 #elif KMP_OS_DARWIN
9005 
9006     int tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
9007     if (atomic_available && (num_vars <= 3)) {
9008       retval = atomic_reduce_block;
9009     } else if (tree_available) {
9010       if ((reduce_size > (9 * sizeof(kmp_real64))) &&
9011           (reduce_size < (2000 * sizeof(kmp_real64)))) {
9012         retval = TREE_REDUCE_BLOCK_WITH_PLAIN_BARRIER;
9013       }
9014     } // otherwise: use critical section
9015 
9016 #else
9017 #error "Unknown or unsupported OS"
9018 #endif
9019 
9020 #else
9021 #error "Unknown or unsupported architecture"
9022 #endif
9023   }
9024 
9025   // KMP_FORCE_REDUCTION
9026 
9027   // If the team is serialized (team_size == 1), ignore the forced reduction
9028   // method and stay with the unsynchronized method (empty_reduce_block)
9029   if (__kmp_force_reduction_method != reduction_method_not_defined &&
9030       team_size != 1) {
9031 
9032     PACKED_REDUCTION_METHOD_T forced_retval = critical_reduce_block;
9033 
9034     int atomic_available, tree_available;
9035 
9036     switch ((forced_retval = __kmp_force_reduction_method)) {
9037     case critical_reduce_block:
9038       KMP_ASSERT(lck); // lck should be != 0
9039       break;
9040 
9041     case atomic_reduce_block:
9042       atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;
9043       if (!atomic_available) {
9044         KMP_WARNING(RedMethodNotSupported, "atomic");
9045         forced_retval = critical_reduce_block;
9046       }
9047       break;
9048 
9049     case tree_reduce_block:
9050       tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
9051       if (!tree_available) {
9052         KMP_WARNING(RedMethodNotSupported, "tree");
9053         forced_retval = critical_reduce_block;
9054       } else {
9055 #if KMP_FAST_REDUCTION_BARRIER
9056         forced_retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER;
9057 #endif
9058       }
9059       break;
9060 
9061     default:
9062       KMP_ASSERT(0); // "unsupported method specified"
9063     }
9064 
9065     retval = forced_retval;
9066   }
9067 
9068   KA_TRACE(10, ("reduction method selected=%08x\n", retval));
9069 
9070 #undef FAST_REDUCTION_TREE_METHOD_GENERATED
9071 #undef FAST_REDUCTION_ATOMIC_METHOD_GENERATED
9072 
9073   return (retval);
9074 }
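
// In summary, as implemented above: team_size == 1 selects empty_reduce_block.
// Otherwise the choice among critical_reduce_block, atomic_reduce_block, and
// the tree-reduction variants depends on what the compiler generated
// (reduce_data/reduce_func for tree, KMP_IDENT_ATOMIC_REDUCE for atomic), on
// the architecture/OS tuning blocks, and on the team size; KMP_FORCE_REDUCTION
// can override the choice for non-serialized teams.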
9075 // this function is for testing set/get/determine reduce method
9076 kmp_int32 __kmp_get_reduce_method(void) {
9077   return ((__kmp_entry_thread()->th.th_local.packed_reduction_method) >> 8);
9078 }
9079 
9080 // Soft pause sets up threads to ignore blocktime and just go to sleep.
9081 // Spin-wait code checks __kmp_pause_status and reacts accordingly.
9082 void __kmp_soft_pause() { __kmp_pause_status = kmp_soft_paused; }
9083 
9084 // Hard pause shuts down the runtime completely.  Resume happens naturally when
9085 // OpenMP is used subsequently.
9086 void __kmp_hard_pause() {
9087   __kmp_pause_status = kmp_hard_paused;
9088   __kmp_internal_end_thread(-1);
9089 }
9090 
9091 // Soft resume sets __kmp_pause_status, and wakes up all threads.
9092 void __kmp_resume_if_soft_paused() {
9093   if (__kmp_pause_status == kmp_soft_paused) {
9094     __kmp_pause_status = kmp_not_paused;
9095 
9096     for (int gtid = 1; gtid < __kmp_threads_capacity; ++gtid) {
9097       kmp_info_t *thread = __kmp_threads[gtid];
9098       if (thread) { // Wake it if sleeping
9099         kmp_flag_64<> fl(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go,
9100                          thread);
9101         if (fl.is_sleeping())
9102           fl.resume(gtid);
9103         else if (__kmp_try_suspend_mx(thread)) { // got suspend lock
9104           __kmp_unlock_suspend_mx(thread); // unlock it; it won't sleep
9105         } else { // thread holds the lock and may sleep soon
9106           do { // until either the thread sleeps, or we can get the lock
9107             if (fl.is_sleeping()) {
9108               fl.resume(gtid);
9109               break;
9110             } else if (__kmp_try_suspend_mx(thread)) {
9111               __kmp_unlock_suspend_mx(thread);
9112               break;
9113             }
9114           } while (1);
9115         }
9116       }
9117     }
9118   }
9119 }
9120 
9121 // This function is called via __kmpc_pause_resource. Returns 0 if successful.
9122 // TODO: add warning messages
9123 int __kmp_pause_resource(kmp_pause_status_t level) {
9124   if (level == kmp_not_paused) { // requesting resume
9125     if (__kmp_pause_status == kmp_not_paused) {
9126       // error message about runtime not being paused, so can't resume
9127       return 1;
9128     } else {
9129       KMP_DEBUG_ASSERT(__kmp_pause_status == kmp_soft_paused ||
9130                        __kmp_pause_status == kmp_hard_paused);
9131       __kmp_pause_status = kmp_not_paused;
9132       return 0;
9133     }
9134   } else if (level == kmp_soft_paused) { // requesting soft pause
9135     if (__kmp_pause_status != kmp_not_paused) {
9136       // error message about already being paused
9137       return 1;
9138     } else {
9139       __kmp_soft_pause();
9140       return 0;
9141     }
9142   } else if (level == kmp_hard_paused || level == kmp_stop_tool_paused) {
9143     // requesting hard pause or stop_tool pause
9144     if (__kmp_pause_status != kmp_not_paused) {
9145       // error message about already being paused
9146       return 1;
9147     } else {
9148       __kmp_hard_pause();
9149       return 0;
9150     }
9151   } else {
9152     // error message about invalid level
9153     return 1;
9154   }
9155 }
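
// A minimal sketch (not compiled into the runtime) of how a program reaches
// this path through the OpenMP 5.0 pause API; the soft pause here eventually
// calls __kmp_soft_pause() above:
#if 0
#include <omp.h>
int main(void) {
#pragma omp parallel
  { /* warm up the runtime */ }
  // Returns 0 on success, nonzero otherwise (matching the convention above).
  int rc = omp_pause_resource_all(omp_pause_soft);
#pragma omp parallel
  { /* workers are resumed automatically on the next parallel region */ }
  return rc;
}
#endif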
9156 
9157 void __kmp_omp_display_env(int verbose) {
9158   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
9159   if (__kmp_init_serial == 0)
9160     __kmp_do_serial_initialize();
9161   __kmp_display_env_impl(!verbose, verbose);
9162   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
9163 }
9164 
// The team size is changing, so the distributed barrier must be resized
9166 void __kmp_resize_dist_barrier(kmp_team_t *team, int old_nthreads,
9167                                int new_nthreads) {
9168   KMP_DEBUG_ASSERT(__kmp_barrier_release_pattern[bs_forkjoin_barrier] ==
9169                    bp_dist_bar);
9170   kmp_info_t **other_threads = team->t.t_threads;
9171 
9172   // We want all the workers to stop waiting on the barrier while we adjust the
9173   // size of the team.
9174   for (int f = 1; f < old_nthreads; ++f) {
9175     KMP_DEBUG_ASSERT(other_threads[f] != NULL);
9176     // Ignore threads that are already inactive or not present in the team
9177     if (team->t.t_threads[f]->th.th_used_in_team.load() == 0) {
9178       // teams construct causes thread_limit to get passed in, and some of
9179       // those could be inactive; just ignore them
9180       continue;
9181     }
    // If the thread is still transitioning to the in_use state, wait for it
9183     if (team->t.t_threads[f]->th.th_used_in_team.load() == 3) {
9184       while (team->t.t_threads[f]->th.th_used_in_team.load() == 3)
9185         KMP_CPU_PAUSE();
9186     }
9187     // The thread should be in_use now
9188     KMP_DEBUG_ASSERT(team->t.t_threads[f]->th.th_used_in_team.load() == 1);
9189     // Transition to unused state
9190     team->t.t_threads[f]->th.th_used_in_team.store(2);
9191     KMP_DEBUG_ASSERT(team->t.t_threads[f]->th.th_used_in_team.load() == 2);
9192   }
9193   // Release all the workers
9194   team->t.b->go_release();
9195 
9196   KMP_MFENCE();
9197 
9198   // Workers should see transition status 2 and move to 0; but may need to be
9199   // woken up first
9200   int count = old_nthreads - 1;
9201   while (count > 0) {
9202     count = old_nthreads - 1;
9203     for (int f = 1; f < old_nthreads; ++f) {
9204       if (other_threads[f]->th.th_used_in_team.load() != 0) {
9205         if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) { // Wake up the workers
9206           kmp_atomic_flag_64<> *flag = (kmp_atomic_flag_64<> *)CCAST(
9207               void *, other_threads[f]->th.th_sleep_loc);
9208           __kmp_atomic_resume_64(other_threads[f]->th.th_info.ds.ds_gtid, flag);
9209         }
9210       } else {
9211         KMP_DEBUG_ASSERT(team->t.t_threads[f]->th.th_used_in_team.load() == 0);
9212         count--;
9213       }
9214     }
9215   }
9216   // Now update the barrier size
9217   team->t.b->update_num_threads(new_nthreads);
9218   team->t.b->go_reset();
9219 }
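
// th_used_in_team states used by the resize protocol above and by
// __kmp_add_threads_to_team() below: 1 = in use by the team,
// 2 = transitioning out of the team (set here), 3 = transitioning back into
// the team, and 0 = not part of the team.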
9220 
9221 void __kmp_add_threads_to_team(kmp_team_t *team, int new_nthreads) {
9222   // Add the threads back to the team
9223   KMP_DEBUG_ASSERT(team);
9224   // Threads were paused and pointed at th_used_in_team temporarily during a
9225   // resize of the team. We're going to set th_used_in_team to 3 to indicate to
9226   // the thread that it should transition itself back into the team. Then, if
9227   // blocktime isn't infinite, the thread could be sleeping, so we send a resume
9228   // to wake it up.
9229   for (int f = 1; f < new_nthreads; ++f) {
9230     KMP_DEBUG_ASSERT(team->t.t_threads[f]);
9231     (void)KMP_COMPARE_AND_STORE_ACQ32(
9232         &(team->t.t_threads[f]->th.th_used_in_team), 0, 3);
9233     if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) { // Wake up sleeping threads
9234       __kmp_resume_32(team->t.t_threads[f]->th.th_info.ds.ds_gtid,
9235                       (kmp_flag_32<false, false> *)NULL);
9236     }
9237   }
  // The threads should be transitioning into the team; when they are done,
  // they will have set th_used_in_team to 1. This loop forces the primary
  // thread to wait until all threads have moved into the team and are waiting
  // in the barrier.
9241   int count = new_nthreads - 1;
9242   while (count > 0) {
9243     count = new_nthreads - 1;
9244     for (int f = 1; f < new_nthreads; ++f) {
9245       if (team->t.t_threads[f]->th.th_used_in_team.load() == 1) {
9246         count--;
9247       }
9248     }
9249   }
9250 }
9251 
9252 // Globals and functions for hidden helper task
9253 kmp_info_t **__kmp_hidden_helper_threads;
9254 kmp_info_t *__kmp_hidden_helper_main_thread;
9255 std::atomic<kmp_int32> __kmp_unexecuted_hidden_helper_tasks;
9256 #if KMP_OS_LINUX
9257 kmp_int32 __kmp_hidden_helper_threads_num = 8;
9258 kmp_int32 __kmp_enable_hidden_helper = TRUE;
9259 #else
9260 kmp_int32 __kmp_hidden_helper_threads_num = 0;
9261 kmp_int32 __kmp_enable_hidden_helper = FALSE;
9262 #endif
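
// Hidden helper threads form a separate thread team used to execute hidden
// helper tasks (for example, tasks generated for "target nowait" regions)
// asynchronously from the regular workers; the defaults above enable 8 such
// threads on Linux and disable the feature on other platforms.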
9263 
9264 namespace {
9265 std::atomic<kmp_int32> __kmp_hit_hidden_helper_threads_num;
9266 
9267 void __kmp_hidden_helper_wrapper_fn(int *gtid, int *, ...) {
  // This is an explicit synchronization of all hidden helper threads, in case
  // a regular thread pushes a hidden helper task to a hidden helper thread
  // that has not been awakened even once since the main thread released the
  // helpers after creating the team.
9272   KMP_ATOMIC_INC(&__kmp_hit_hidden_helper_threads_num);
9273   while (KMP_ATOMIC_LD_ACQ(&__kmp_hit_hidden_helper_threads_num) !=
9274          __kmp_hidden_helper_threads_num)
9275     ;
9276 
9277   // If main thread, then wait for signal
9278   if (__kmpc_master(nullptr, *gtid)) {
9279     // First, unset the initial state and release the initial thread
9280     TCW_4(__kmp_init_hidden_helper_threads, FALSE);
9281     __kmp_hidden_helper_initz_release();
9282     __kmp_hidden_helper_main_thread_wait();
9283     // Now wake up all worker threads
9284     for (int i = 1; i < __kmp_hit_hidden_helper_threads_num; ++i) {
9285       __kmp_hidden_helper_worker_thread_signal();
9286     }
9287   }
9288 }
9289 } // namespace
9290 
9291 void __kmp_hidden_helper_threads_initz_routine() {
9292   // Create a new root for hidden helper team/threads
9293   const int gtid = __kmp_register_root(TRUE);
9294   __kmp_hidden_helper_main_thread = __kmp_threads[gtid];
9295   __kmp_hidden_helper_threads = &__kmp_threads[gtid];
9296   __kmp_hidden_helper_main_thread->th.th_set_nproc =
9297       __kmp_hidden_helper_threads_num;
9298 
9299   KMP_ATOMIC_ST_REL(&__kmp_hit_hidden_helper_threads_num, 0);
9300 
9301   __kmpc_fork_call(nullptr, 0, __kmp_hidden_helper_wrapper_fn);
9302 
9303   // Set the initialization flag to FALSE
9304   TCW_SYNC_4(__kmp_init_hidden_helper, FALSE);
9305 
9306   __kmp_hidden_helper_threads_deinitz_release();
9307 }
9308 
9309 /* Nesting Mode:
9310    Set via KMP_NESTING_MODE, which takes an integer.
9311    Note: we skip duplicate topology levels, and skip levels with only
9312       one entity.
9313    KMP_NESTING_MODE=0 is the default, and doesn't use nesting mode.
9314    KMP_NESTING_MODE=1 sets as many nesting levels as there are distinct levels
9315       in the topology, and initializes the number of threads at each of those
9316       levels to the number of entities at each level, respectively, below the
9317       entity at the parent level.
9318    KMP_NESTING_MODE=N, where N>1, attempts to create up to N nesting levels,
9319       but starts with nesting OFF -- max-active-levels-var is 1 -- and requires
      the user to turn nesting on explicitly. This is an even more
      experimental option within an already experimental feature, and may
      change or go away in the future.
9323 */
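
// For example (a hypothetical topology): on a machine with 2 sockets, 8 cores
// per socket, and 2 hw-threads per core, KMP_NESTING_MODE=1 would produce
// three nesting levels with 2, 8, and 2 threads respectively (one thread per
// entity below the entity at the parent level).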
9324 
9325 // Allocate space to store nesting levels
9326 void __kmp_init_nesting_mode() {
9327   int levels = KMP_HW_LAST;
9328   __kmp_nesting_mode_nlevels = levels;
9329   __kmp_nesting_nth_level = (int *)KMP_INTERNAL_MALLOC(levels * sizeof(int));
9330   for (int i = 0; i < levels; ++i)
9331     __kmp_nesting_nth_level[i] = 0;
9332   if (__kmp_nested_nth.size < levels) {
9333     __kmp_nested_nth.nth =
9334         (int *)KMP_INTERNAL_REALLOC(__kmp_nested_nth.nth, levels * sizeof(int));
9335     __kmp_nested_nth.size = levels;
9336   }
9337 }
9338 
// Set # threads for top levels of nesting; must be called after the topology
// is set
9340 void __kmp_set_nesting_mode_threads() {
9341   kmp_info_t *thread = __kmp_threads[__kmp_entry_gtid()];
9342 
9343   if (__kmp_nesting_mode == 1)
9344     __kmp_nesting_mode_nlevels = KMP_MAX_ACTIVE_LEVELS_LIMIT;
9345   else if (__kmp_nesting_mode > 1)
9346     __kmp_nesting_mode_nlevels = __kmp_nesting_mode;
9347 
9348   if (__kmp_topology) { // use topology info
9349     int loc, hw_level;
9350     for (loc = 0, hw_level = 0; hw_level < __kmp_topology->get_depth() &&
9351                                 loc < __kmp_nesting_mode_nlevels;
9352          loc++, hw_level++) {
9353       __kmp_nesting_nth_level[loc] = __kmp_topology->get_ratio(hw_level);
9354       if (__kmp_nesting_nth_level[loc] == 1)
9355         loc--;
9356     }
9357     // Make sure all cores are used
9358     if (__kmp_nesting_mode > 1 && loc > 1) {
9359       int core_level = __kmp_topology->get_level(KMP_HW_CORE);
9360       int num_cores = __kmp_topology->get_count(core_level);
9361       int upper_levels = 1;
9362       for (int level = 0; level < loc - 1; ++level)
9363         upper_levels *= __kmp_nesting_nth_level[level];
9364       if (upper_levels * __kmp_nesting_nth_level[loc - 1] < num_cores)
9365         __kmp_nesting_nth_level[loc - 1] =
9366             num_cores / __kmp_nesting_nth_level[loc - 2];
9367     }
9368     __kmp_nesting_mode_nlevels = loc;
9369     __kmp_nested_nth.used = __kmp_nesting_mode_nlevels;
9370   } else { // no topology info available; provide a reasonable guesstimation
9371     if (__kmp_avail_proc >= 4) {
9372       __kmp_nesting_nth_level[0] = __kmp_avail_proc / 2;
9373       __kmp_nesting_nth_level[1] = 2;
9374       __kmp_nesting_mode_nlevels = 2;
9375     } else {
9376       __kmp_nesting_nth_level[0] = __kmp_avail_proc;
9377       __kmp_nesting_mode_nlevels = 1;
9378     }
9379     __kmp_nested_nth.used = __kmp_nesting_mode_nlevels;
9380   }
9381   for (int i = 0; i < __kmp_nesting_mode_nlevels; ++i) {
9382     __kmp_nested_nth.nth[i] = __kmp_nesting_nth_level[i];
9383   }
9384   set__nproc(thread, __kmp_nesting_nth_level[0]);
9385   if (__kmp_nesting_mode > 1 && __kmp_nesting_mode_nlevels > __kmp_nesting_mode)
9386     __kmp_nesting_mode_nlevels = __kmp_nesting_mode;
9387   if (get__max_active_levels(thread) > 1) {
9388     // if max levels was set, set nesting mode levels to same
9389     __kmp_nesting_mode_nlevels = get__max_active_levels(thread);
9390   }
9391   if (__kmp_nesting_mode == 1) // turn on nesting for this case only
9392     set__max_active_levels(thread, __kmp_nesting_mode_nlevels);
9393 }
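
// Worked examples for the no-topology fallback above (illustrative numbers):
// __kmp_avail_proc = 8 takes the first branch and yields two nesting levels
// of 4 and 2 threads; __kmp_avail_proc = 2 yields a single level of 2 threads.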
9394 
9395 // Empty symbols to export (see exports_so.txt) when feature is disabled
9396 extern "C" {
9397 #if !KMP_STATS_ENABLED
9398 void __kmp_reset_stats() {}
9399 #endif
9400 #if !USE_DEBUGGER
9401 int __kmp_omp_debug_struct_info = FALSE;
9402 int __kmp_debugging = FALSE;
9403 #endif
9404 #if !USE_ITT_BUILD || !USE_ITT_NOTIFY
9405 void __kmp_itt_fini_ittlib() {}
9406 void __kmp_itt_init_ittlib() {}
9407 #endif
9408 }
9409 
9410 // end of file
9411