xref: /freebsd/contrib/llvm-project/openmp/runtime/src/kmp_runtime.cpp (revision e6bfd18d21b225af6a0ed67ceeaf1293b7b9eba5)
1 /*
2  * kmp_runtime.cpp -- KPTS runtime support library
3  */
4 
5 //===----------------------------------------------------------------------===//
6 //
7 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8 // See https://llvm.org/LICENSE.txt for license information.
9 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "kmp.h"
14 #include "kmp_affinity.h"
15 #include "kmp_atomic.h"
16 #include "kmp_environment.h"
17 #include "kmp_error.h"
18 #include "kmp_i18n.h"
19 #include "kmp_io.h"
20 #include "kmp_itt.h"
21 #include "kmp_settings.h"
22 #include "kmp_stats.h"
23 #include "kmp_str.h"
24 #include "kmp_wait_release.h"
25 #include "kmp_wrapper_getpid.h"
26 #include "kmp_dispatch.h"
27 #if KMP_USE_HIER_SCHED
28 #include "kmp_dispatch_hier.h"
29 #endif
30 
31 #if OMPT_SUPPORT
32 #include "ompt-specific.h"
33 #endif
34 #if OMPD_SUPPORT
35 #include "ompd-specific.h"
36 #endif
37 
38 #if OMP_PROFILING_SUPPORT
39 #include "llvm/Support/TimeProfiler.h"
40 static char *ProfileTraceFile = nullptr;
41 #endif
42 
43 /* these are temporary issues to be dealt with */
44 #define KMP_USE_PRCTL 0
45 
46 #if KMP_OS_WINDOWS
47 #include <process.h>
48 #endif
49 
50 #if KMP_OS_WINDOWS
51 // Windows does not need these include files because it does not use shared memory
52 #else
53 #include <sys/mman.h>
54 #include <sys/stat.h>
55 #include <fcntl.h>
56 #define SHM_SIZE 1024
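// Note (descriptive, based on the shm usage later in this file): SHM_SIZE is
// the size, in bytes, of the shared-memory segment that non-Windows builds map
// for library registration in __kmp_register_library_startup().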
57 #endif
58 
59 #if defined(KMP_GOMP_COMPAT)
60 char const __kmp_version_alt_comp[] =
61     KMP_VERSION_PREFIX "alternative compiler support: yes";
62 #endif /* defined(KMP_GOMP_COMPAT) */
63 
64 char const __kmp_version_omp_api[] =
65     KMP_VERSION_PREFIX "API version: 5.0 (201611)";
66 
67 #ifdef KMP_DEBUG
68 char const __kmp_version_lock[] =
69     KMP_VERSION_PREFIX "lock type: run time selectable";
70 #endif /* KMP_DEBUG */
71 
72 #define KMP_MIN(x, y) ((x) < (y) ? (x) : (y))
73 
74 /* ------------------------------------------------------------------------ */
75 
76 #if KMP_USE_MONITOR
77 kmp_info_t __kmp_monitor;
78 #endif
79 
80 /* Forward declarations */
81 
82 void __kmp_cleanup(void);
83 
84 static void __kmp_initialize_info(kmp_info_t *, kmp_team_t *, int tid,
85                                   int gtid);
86 static void __kmp_initialize_team(kmp_team_t *team, int new_nproc,
87                                   kmp_internal_control_t *new_icvs,
88                                   ident_t *loc);
89 #if KMP_AFFINITY_SUPPORTED
90 static void __kmp_partition_places(kmp_team_t *team,
91                                    int update_master_only = 0);
92 #endif
93 static void __kmp_do_serial_initialize(void);
94 void __kmp_fork_barrier(int gtid, int tid);
95 void __kmp_join_barrier(int gtid);
96 void __kmp_setup_icv_copy(kmp_team_t *team, int new_nproc,
97                           kmp_internal_control_t *new_icvs, ident_t *loc);
98 
99 #ifdef USE_LOAD_BALANCE
100 static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc);
101 #endif
102 
103 static int __kmp_expand_threads(int nNeed);
104 #if KMP_OS_WINDOWS
105 static int __kmp_unregister_root_other_thread(int gtid);
106 #endif
107 static void __kmp_reap_thread(kmp_info_t *thread, int is_root);
108 kmp_info_t *__kmp_thread_pool_insert_pt = NULL;
109 
110 void __kmp_resize_dist_barrier(kmp_team_t *team, int old_nthreads,
111                                int new_nthreads);
112 void __kmp_add_threads_to_team(kmp_team_t *team, int new_nthreads);
113 
114 /* Calculate the identifier of the current thread */
115 /* fast (and somewhat portable) way to get unique identifier of executing
116    thread. Returns KMP_GTID_DNE if we haven't been assigned a gtid. */
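/* Lookup strategy, selected by __kmp_gtid_mode (see the branches below):
   mode >= 3 reads the gtid from a thread-local (TDATA) variable, mode >= 2
   uses keyed TLS via __kmp_gtid_get_specific(), and otherwise the gtid is
   recovered by searching the registered threads for the stack that contains
   the address of a local variable. */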
117 int __kmp_get_global_thread_id() {
118   int i;
119   kmp_info_t **other_threads;
120   size_t stack_data;
121   char *stack_addr;
122   size_t stack_size;
123   char *stack_base;
124 
125   KA_TRACE(
126       1000,
127       ("*** __kmp_get_global_thread_id: entering, nproc=%d  all_nproc=%d\n",
128        __kmp_nth, __kmp_all_nth));
129 
130   /* JPH - to handle the case where __kmpc_end(0) is called immediately prior to
131      a parallel region, made it return KMP_GTID_DNE to force serial_initialize
132      by caller. Had to handle KMP_GTID_DNE at all call-sites, or else guarantee
133      __kmp_init_gtid for this to work. */
134 
135   if (!TCR_4(__kmp_init_gtid))
136     return KMP_GTID_DNE;
137 
138 #ifdef KMP_TDATA_GTID
139   if (TCR_4(__kmp_gtid_mode) >= 3) {
140     KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using TDATA\n"));
141     return __kmp_gtid;
142   }
143 #endif
144   if (TCR_4(__kmp_gtid_mode) >= 2) {
145     KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using keyed TLS\n"));
146     return __kmp_gtid_get_specific();
147   }
148   KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using internal alg.\n"));
149 
150   stack_addr = (char *)&stack_data;
151   other_threads = __kmp_threads;
152 
153   /* ATT: The code below is a source of potential bugs due to unsynchronized
154      access to __kmp_threads array. For example:
155      1. Current thread loads other_threads[i] to thr and checks it, it is
156         non-NULL.
157      2. Current thread is suspended by OS.
158      3. Another thread unregisters and finishes (debug versions of free()
159         may fill memory with something like 0xEF).
160      4. Current thread is resumed.
161      5. Current thread reads junk from *thr.
162      TODO: Fix it.  --ln  */
163 
164   for (i = 0; i < __kmp_threads_capacity; i++) {
165 
166     kmp_info_t *thr = (kmp_info_t *)TCR_SYNC_PTR(other_threads[i]);
167     if (!thr)
168       continue;
169 
170     stack_size = (size_t)TCR_PTR(thr->th.th_info.ds.ds_stacksize);
171     stack_base = (char *)TCR_PTR(thr->th.th_info.ds.ds_stackbase);
172 
173     /* stack grows down -- search through all of the active threads */
174 
175     if (stack_addr <= stack_base) {
176       size_t stack_diff = stack_base - stack_addr;
177 
178       if (stack_diff <= stack_size) {
179         /* The only way we can be closer than the allocated */
180         /* stack size is if we are running on this thread. */
181         KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() == i);
182         return i;
183       }
184     }
185   }
186 
187   /* fall back to get_specific to try to determine our gtid */
188   KA_TRACE(1000,
189            ("*** __kmp_get_global_thread_id: internal alg. failed to find "
190             "thread, using TLS\n"));
191   i = __kmp_gtid_get_specific();
192 
193   /*fprintf( stderr, "=== %d\n", i );  */ /* GROO */
194 
195   /* if we haven't been assigned a gtid, then return that code */
196   if (i < 0)
197     return i;
198 
199   /* dynamically updated stack window for uber threads to avoid get_specific
200      call */
201   if (!TCR_4(other_threads[i]->th.th_info.ds.ds_stackgrow)) {
202     KMP_FATAL(StackOverflow, i);
203   }
204 
205   stack_base = (char *)other_threads[i]->th.th_info.ds.ds_stackbase;
206   if (stack_addr > stack_base) {
207     TCW_PTR(other_threads[i]->th.th_info.ds.ds_stackbase, stack_addr);
208     TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize,
209             other_threads[i]->th.th_info.ds.ds_stacksize + stack_addr -
210                 stack_base);
211   } else {
212     TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize,
213             stack_base - stack_addr);
214   }
215 
216   /* Reprint stack bounds for ubermaster since they have been refined */
217   if (__kmp_storage_map) {
218     char *stack_end = (char *)other_threads[i]->th.th_info.ds.ds_stackbase;
219     char *stack_beg = stack_end - other_threads[i]->th.th_info.ds.ds_stacksize;
220     __kmp_print_storage_map_gtid(i, stack_beg, stack_end,
221                                  other_threads[i]->th.th_info.ds.ds_stacksize,
222                                  "th_%d stack (refinement)", i);
223   }
224   return i;
225 }
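/* Illustrative sketch (not part of the runtime): the stack-membership test in
   the loop above amounts to asking whether a local address lies inside
   [stack_base - stack_size, stack_base], since stacks grow downward here:

     char local;
     char *base = (char *)thr->th.th_info.ds.ds_stackbase;
     size_t size = (size_t)thr->th.th_info.ds.ds_stacksize;
     int on_this_stack = (&local <= base) && ((size_t)(base - &local) <= size);
*/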
226 
227 int __kmp_get_global_thread_id_reg() {
228   int gtid;
229 
230   if (!__kmp_init_serial) {
231     gtid = KMP_GTID_DNE;
232   } else
233 #ifdef KMP_TDATA_GTID
234       if (TCR_4(__kmp_gtid_mode) >= 3) {
235     KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using TDATA\n"));
236     gtid = __kmp_gtid;
237   } else
238 #endif
239       if (TCR_4(__kmp_gtid_mode) >= 2) {
240     KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using keyed TLS\n"));
241     gtid = __kmp_gtid_get_specific();
242   } else {
243     KA_TRACE(1000,
244              ("*** __kmp_get_global_thread_id_reg: using internal alg.\n"));
245     gtid = __kmp_get_global_thread_id();
246   }
247 
248   /* we must be a new uber master sibling thread */
249   if (gtid == KMP_GTID_DNE) {
250     KA_TRACE(10,
251              ("__kmp_get_global_thread_id_reg: Encountered new root thread. "
252               "Registering a new gtid.\n"));
253     __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
254     if (!__kmp_init_serial) {
255       __kmp_do_serial_initialize();
256       gtid = __kmp_gtid_get_specific();
257     } else {
258       gtid = __kmp_register_root(FALSE);
259     }
260     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
261     /*__kmp_printf( "+++ %d\n", gtid ); */ /* GROO */
262   }
263 
264   KMP_DEBUG_ASSERT(gtid >= 0);
265 
266   return gtid;
267 }
268 
269 /* caller must hold forkjoin_lock */
270 void __kmp_check_stack_overlap(kmp_info_t *th) {
271   int f;
272   char *stack_beg = NULL;
273   char *stack_end = NULL;
274   int gtid;
275 
276   KA_TRACE(10, ("__kmp_check_stack_overlap: called\n"));
277   if (__kmp_storage_map) {
278     stack_end = (char *)th->th.th_info.ds.ds_stackbase;
279     stack_beg = stack_end - th->th.th_info.ds.ds_stacksize;
280 
281     gtid = __kmp_gtid_from_thread(th);
282 
283     if (gtid == KMP_GTID_MONITOR) {
284       __kmp_print_storage_map_gtid(
285           gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize,
286           "th_%s stack (%s)", "mon",
287           (th->th.th_info.ds.ds_stackgrow) ? "initial" : "actual");
288     } else {
289       __kmp_print_storage_map_gtid(
290           gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize,
291           "th_%d stack (%s)", gtid,
292           (th->th.th_info.ds.ds_stackgrow) ? "initial" : "actual");
293     }
294   }
295 
296   /* No point in checking ubermaster threads since they use refinement and
297    * cannot overlap */
298   gtid = __kmp_gtid_from_thread(th);
299   if (__kmp_env_checks == TRUE && !KMP_UBER_GTID(gtid)) {
300     KA_TRACE(10,
301              ("__kmp_check_stack_overlap: performing extensive checking\n"));
302     if (stack_beg == NULL) {
303       stack_end = (char *)th->th.th_info.ds.ds_stackbase;
304       stack_beg = stack_end - th->th.th_info.ds.ds_stacksize;
305     }
306 
307     for (f = 0; f < __kmp_threads_capacity; f++) {
308       kmp_info_t *f_th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[f]);
309 
310       if (f_th && f_th != th) {
311         char *other_stack_end =
312             (char *)TCR_PTR(f_th->th.th_info.ds.ds_stackbase);
313         char *other_stack_beg =
314             other_stack_end - (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize);
315         if ((stack_beg > other_stack_beg && stack_beg < other_stack_end) ||
316             (stack_end > other_stack_beg && stack_end < other_stack_end)) {
317 
318           /* Print the other stack values before the abort */
319           if (__kmp_storage_map)
320             __kmp_print_storage_map_gtid(
321                 -1, other_stack_beg, other_stack_end,
322                 (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize),
323                 "th_%d stack (overlapped)", __kmp_gtid_from_thread(f_th));
324 
325           __kmp_fatal(KMP_MSG(StackOverlap), KMP_HNT(ChangeStackLimit),
326                       __kmp_msg_null);
327         }
328       }
329     }
330   }
331   KA_TRACE(10, ("__kmp_check_stack_overlap: returning\n"));
332 }
333 
334 /* ------------------------------------------------------------------------ */
335 
336 void __kmp_infinite_loop(void) {
337   static int done = FALSE;
338 
339   while (!done) {
340     KMP_YIELD(TRUE);
341   }
342 }
343 
344 #define MAX_MESSAGE 512
345 
346 void __kmp_print_storage_map_gtid(int gtid, void *p1, void *p2, size_t size,
347                                   char const *format, ...) {
348   char buffer[MAX_MESSAGE];
349   va_list ap;
350 
351   va_start(ap, format);
352   KMP_SNPRINTF(buffer, sizeof(buffer), "OMP storage map: %p %p%8lu %s\n", p1,
353                p2, (unsigned long)size, format);
354   __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock);
355   __kmp_vprintf(kmp_err, buffer, ap);
356 #if KMP_PRINT_DATA_PLACEMENT
357   int node;
358   if (gtid >= 0) {
359     if (p1 <= p2 && (char *)p2 - (char *)p1 == size) {
360       if (__kmp_storage_map_verbose) {
361         node = __kmp_get_host_node(p1);
362         if (node < 0) /* doesn't work, so don't try this next time */
363           __kmp_storage_map_verbose = FALSE;
364         else {
365           char *last;
366           int lastNode;
367           int localProc = __kmp_get_cpu_from_gtid(gtid);
368 
369           const int page_size = KMP_GET_PAGE_SIZE();
370 
371           p1 = (void *)((size_t)p1 & ~((size_t)page_size - 1));
372           p2 = (void *)(((size_t)p2 - 1) & ~((size_t)page_size - 1));
373           if (localProc >= 0)
374             __kmp_printf_no_lock("  GTID %d localNode %d\n", gtid,
375                                  localProc >> 1);
376           else
377             __kmp_printf_no_lock("  GTID %d\n", gtid);
378 #if KMP_USE_PRCTL
379           /* The more elaborate format is disabled for now because of the prctl
380            * hanging bug. */
381           do {
382             last = p1;
383             lastNode = node;
384             /* This loop collates adjacent pages with the same host node. */
385             do {
386               (char *)p1 += page_size;
387             } while (p1 <= p2 && (node = __kmp_get_host_node(p1)) == lastNode);
388             __kmp_printf_no_lock("    %p-%p memNode %d\n", last, (char *)p1 - 1,
389                                  lastNode);
390           } while (p1 <= p2);
391 #else
392           __kmp_printf_no_lock("    %p-%p memNode %d\n", p1,
393                                (char *)p1 + (page_size - 1),
394                                __kmp_get_host_node(p1));
395           if (p1 < p2) {
396             __kmp_printf_no_lock("    %p-%p memNode %d\n", p2,
397                                  (char *)p2 + (page_size - 1),
398                                  __kmp_get_host_node(p2));
399           }
400 #endif
401         }
402       }
403     } else
404       __kmp_printf_no_lock("  %s\n", KMP_I18N_STR(StorageMapWarning));
405   }
406 #endif /* KMP_PRINT_DATA_PLACEMENT */
407   __kmp_release_bootstrap_lock(&__kmp_stdio_lock);
408 }
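/* Example of the resulting output (illustrative values only):
     OMP storage map: 0x7f0000000000 0x7f0000002000    8192 th_0 stack (initial)
   i.e. the caller-supplied format string is expanded after the fixed
   "OMP storage map: %p %p%8lu" prefix built above. */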
409 
410 void __kmp_warn(char const *format, ...) {
411   char buffer[MAX_MESSAGE];
412   va_list ap;
413 
414   if (__kmp_generate_warnings == kmp_warnings_off) {
415     return;
416   }
417 
418   va_start(ap, format);
419 
420   KMP_SNPRINTF(buffer, sizeof(buffer), "OMP warning: %s\n", format);
421   __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock);
422   __kmp_vprintf(kmp_err, buffer, ap);
423   __kmp_release_bootstrap_lock(&__kmp_stdio_lock);
424 
425   va_end(ap);
426 }
427 
428 void __kmp_abort_process() {
429   // Later threads may stall here, but that's ok because abort() will kill them.
430   __kmp_acquire_bootstrap_lock(&__kmp_exit_lock);
431 
432   if (__kmp_debug_buf) {
433     __kmp_dump_debug_buffer();
434   }
435 
436   if (KMP_OS_WINDOWS) {
437     // Let other threads know of abnormal termination and prevent deadlock
438     // if abort happened during library initialization or shutdown
439     __kmp_global.g.g_abort = SIGABRT;
440 
441     /* On Windows* OS by default abort() causes pop-up error box, which stalls
442        nightly testing. Unfortunately, we cannot reliably suppress pop-up error
443        boxes. _set_abort_behavior() works well, but this function is not
444        available in VS7 (this is not a problem for a DLL, but it is a problem for
445        the static OpenMP RTL). SetErrorMode (and so the timelimit utility) does not
446        help, at least in some versions of MS C RTL.
447 
448        It seems the following sequence is the only way to simulate abort() and
449        avoid pop-up error box. */
450     raise(SIGABRT);
451     _exit(3); // Just in case, if signal ignored, exit anyway.
452   } else {
453     __kmp_unregister_library();
454     abort();
455   }
456 
457   __kmp_infinite_loop();
458   __kmp_release_bootstrap_lock(&__kmp_exit_lock);
459 
460 } // __kmp_abort_process
461 
462 void __kmp_abort_thread(void) {
463   // TODO: Eliminate g_abort global variable and this function.
464   // In case of abort just call abort(); it will kill all the threads.
465   __kmp_infinite_loop();
466 } // __kmp_abort_thread
467 
468 /* Print out the storage map for the major kmp_info_t thread data structures
469    that are allocated together. */
470 
471 static void __kmp_print_thread_storage_map(kmp_info_t *thr, int gtid) {
472   __kmp_print_storage_map_gtid(gtid, thr, thr + 1, sizeof(kmp_info_t), "th_%d",
473                                gtid);
474 
475   __kmp_print_storage_map_gtid(gtid, &thr->th.th_info, &thr->th.th_team,
476                                sizeof(kmp_desc_t), "th_%d.th_info", gtid);
477 
478   __kmp_print_storage_map_gtid(gtid, &thr->th.th_local, &thr->th.th_pri_head,
479                                sizeof(kmp_local_t), "th_%d.th_local", gtid);
480 
481   __kmp_print_storage_map_gtid(
482       gtid, &thr->th.th_bar[0], &thr->th.th_bar[bs_last_barrier],
483       sizeof(kmp_balign_t) * bs_last_barrier, "th_%d.th_bar", gtid);
484 
485   __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_plain_barrier],
486                                &thr->th.th_bar[bs_plain_barrier + 1],
487                                sizeof(kmp_balign_t), "th_%d.th_bar[plain]",
488                                gtid);
489 
490   __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_forkjoin_barrier],
491                                &thr->th.th_bar[bs_forkjoin_barrier + 1],
492                                sizeof(kmp_balign_t), "th_%d.th_bar[forkjoin]",
493                                gtid);
494 
495 #if KMP_FAST_REDUCTION_BARRIER
496   __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_reduction_barrier],
497                                &thr->th.th_bar[bs_reduction_barrier + 1],
498                                sizeof(kmp_balign_t), "th_%d.th_bar[reduction]",
499                                gtid);
500 #endif // KMP_FAST_REDUCTION_BARRIER
501 }
502 
503 /* Print out the storage map for the major kmp_team_t team data structures
504    that are allocated together. */
505 
506 static void __kmp_print_team_storage_map(const char *header, kmp_team_t *team,
507                                          int team_id, int num_thr) {
508   int num_disp_buff = team->t.t_max_nproc > 1 ? __kmp_dispatch_num_buffers : 2;
509   __kmp_print_storage_map_gtid(-1, team, team + 1, sizeof(kmp_team_t), "%s_%d",
510                                header, team_id);
511 
512   __kmp_print_storage_map_gtid(-1, &team->t.t_bar[0],
513                                &team->t.t_bar[bs_last_barrier],
514                                sizeof(kmp_balign_team_t) * bs_last_barrier,
515                                "%s_%d.t_bar", header, team_id);
516 
517   __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_plain_barrier],
518                                &team->t.t_bar[bs_plain_barrier + 1],
519                                sizeof(kmp_balign_team_t), "%s_%d.t_bar[plain]",
520                                header, team_id);
521 
522   __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_forkjoin_barrier],
523                                &team->t.t_bar[bs_forkjoin_barrier + 1],
524                                sizeof(kmp_balign_team_t),
525                                "%s_%d.t_bar[forkjoin]", header, team_id);
526 
527 #if KMP_FAST_REDUCTION_BARRIER
528   __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_reduction_barrier],
529                                &team->t.t_bar[bs_reduction_barrier + 1],
530                                sizeof(kmp_balign_team_t),
531                                "%s_%d.t_bar[reduction]", header, team_id);
532 #endif // KMP_FAST_REDUCTION_BARRIER
533 
534   __kmp_print_storage_map_gtid(
535       -1, &team->t.t_dispatch[0], &team->t.t_dispatch[num_thr],
536       sizeof(kmp_disp_t) * num_thr, "%s_%d.t_dispatch", header, team_id);
537 
538   __kmp_print_storage_map_gtid(
539       -1, &team->t.t_threads[0], &team->t.t_threads[num_thr],
540       sizeof(kmp_info_t *) * num_thr, "%s_%d.t_threads", header, team_id);
541 
542   __kmp_print_storage_map_gtid(-1, &team->t.t_disp_buffer[0],
543                                &team->t.t_disp_buffer[num_disp_buff],
544                                sizeof(dispatch_shared_info_t) * num_disp_buff,
545                                "%s_%d.t_disp_buffer", header, team_id);
546 }
547 
548 static void __kmp_init_allocator() {
549   __kmp_init_memkind();
550   __kmp_init_target_mem();
551 }
552 static void __kmp_fini_allocator() { __kmp_fini_memkind(); }
553 
554 /* ------------------------------------------------------------------------ */
555 
556 #if KMP_DYNAMIC_LIB
557 #if KMP_OS_WINDOWS
558 
559 BOOL WINAPI DllMain(HINSTANCE hInstDLL, DWORD fdwReason, LPVOID lpReserved) {
560   //__kmp_acquire_bootstrap_lock( &__kmp_initz_lock );
561 
562   switch (fdwReason) {
563 
564   case DLL_PROCESS_ATTACH:
565     KA_TRACE(10, ("DllMain: PROCESS_ATTACH\n"));
566 
567     return TRUE;
568 
569   case DLL_PROCESS_DETACH:
570     KA_TRACE(10, ("DllMain: PROCESS_DETACH T#%d\n", __kmp_gtid_get_specific()));
571 
572     // According to Windows* documentation for DllMain entry point:
573     // for DLL_PROCESS_DETACH, lpReserved is used for telling the difference:
574     //   lpReserved == NULL when FreeLibrary() is called,
575     //   lpReserved != NULL when the process is terminated.
576     // When FreeLibrary() is called, worker threads remain alive. So the
577     // runtime's state is consistent and executing proper shutdown is OK.
578     // When the process is terminated, worker threads have exited or been
579     // forcefully terminated by the OS and only the shutdown thread remains.
580     // This can leave the runtime in an inconsistent state.
581     // Hence, only attempt proper cleanup when FreeLibrary() is called.
582     // Otherwise, rely on OS to reclaim resources.
583     if (lpReserved == NULL)
584       __kmp_internal_end_library(__kmp_gtid_get_specific());
585 
586     return TRUE;
587 
588   case DLL_THREAD_ATTACH:
589     KA_TRACE(10, ("DllMain: THREAD_ATTACH\n"));
590 
591     /* if we want to register new sibling threads every time, call
592      * __kmp_get_gtid() here */
593     return TRUE;
594 
595   case DLL_THREAD_DETACH:
596     KA_TRACE(10, ("DllMain: THREAD_DETACH T#%d\n", __kmp_gtid_get_specific()));
597 
598     __kmp_internal_end_thread(__kmp_gtid_get_specific());
599     return TRUE;
600   }
601 
602   return TRUE;
603 }
604 
605 #endif /* KMP_OS_WINDOWS */
606 #endif /* KMP_DYNAMIC_LIB */
607 
608 /* __kmp_parallel_deo -- Wait until it's our turn. */
609 void __kmp_parallel_deo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
610   int gtid = *gtid_ref;
611 #ifdef BUILD_PARALLEL_ORDERED
612   kmp_team_t *team = __kmp_team_from_gtid(gtid);
613 #endif /* BUILD_PARALLEL_ORDERED */
614 
615   if (__kmp_env_consistency_check) {
616     if (__kmp_threads[gtid]->th.th_root->r.r_active)
617 #if KMP_USE_DYNAMIC_LOCK
618       __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL, 0);
619 #else
620       __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL);
621 #endif
622   }
623 #ifdef BUILD_PARALLEL_ORDERED
624   if (!team->t.t_serialized) {
625     KMP_MB();
626     KMP_WAIT(&team->t.t_ordered.dt.t_value, __kmp_tid_from_gtid(gtid), KMP_EQ,
627              NULL);
628     KMP_MB();
629   }
630 #endif /* BUILD_PARALLEL_ORDERED */
631 }
632 
633 /* __kmp_parallel_dxo -- Signal the next task. */
634 void __kmp_parallel_dxo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
635   int gtid = *gtid_ref;
636 #ifdef BUILD_PARALLEL_ORDERED
637   int tid = __kmp_tid_from_gtid(gtid);
638   kmp_team_t *team = __kmp_team_from_gtid(gtid);
639 #endif /* BUILD_PARALLEL_ORDERED */
640 
641   if (__kmp_env_consistency_check) {
642     if (__kmp_threads[gtid]->th.th_root->r.r_active)
643       __kmp_pop_sync(gtid, ct_ordered_in_parallel, loc_ref);
644   }
645 #ifdef BUILD_PARALLEL_ORDERED
646   if (!team->t.t_serialized) {
647     KMP_MB(); /* Flush all pending memory write invalidates.  */
648 
649     /* use the tid of the next thread in this team */
650     /* TODO replace with general release procedure */
651     team->t.t_ordered.dt.t_value = ((tid + 1) % team->t.t_nproc);
652 
653     KMP_MB(); /* Flush all pending memory write invalidates.  */
654   }
655 #endif /* BUILD_PARALLEL_ORDERED */
656 }
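/* Taken together (when BUILD_PARALLEL_ORDERED is defined), __kmp_parallel_deo
   and __kmp_parallel_dxo implement ORDERED as a ticket-style handoff on
   team->t.t_ordered.dt.t_value: a thread waits in deo until the counter equals
   its own tid, runs its ordered chunk, then in dxo passes the turn to
   (tid + 1) % nproc. */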
657 
658 /* ------------------------------------------------------------------------ */
659 /* The BARRIER for a SINGLE process section is always explicit   */
660 
661 int __kmp_enter_single(int gtid, ident_t *id_ref, int push_ws) {
662   int status;
663   kmp_info_t *th;
664   kmp_team_t *team;
665 
666   if (!TCR_4(__kmp_init_parallel))
667     __kmp_parallel_initialize();
668   __kmp_resume_if_soft_paused();
669 
670   th = __kmp_threads[gtid];
671   team = th->th.th_team;
672   status = 0;
673 
674   th->th.th_ident = id_ref;
675 
676   if (team->t.t_serialized) {
677     status = 1;
678   } else {
679     kmp_int32 old_this = th->th.th_local.this_construct;
680 
681     ++th->th.th_local.this_construct;
682     /* try to set team count to thread count--success means thread got the
683        single block */
684     /* TODO: Should this be acquire or release? */
685     if (team->t.t_construct == old_this) {
686       status = __kmp_atomic_compare_store_acq(&team->t.t_construct, old_this,
687                                               th->th.th_local.this_construct);
688     }
689 #if USE_ITT_BUILD
690     if (__itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 &&
691         KMP_MASTER_GTID(gtid) && th->th.th_teams_microtask == NULL &&
692         team->t.t_active_level == 1) {
693       // Only report metadata by primary thread of active team at level 1
694       __kmp_itt_metadata_single(id_ref);
695     }
696 #endif /* USE_ITT_BUILD */
697   }
698 
699   if (__kmp_env_consistency_check) {
700     if (status && push_ws) {
701       __kmp_push_workshare(gtid, ct_psingle, id_ref);
702     } else {
703       __kmp_check_workshare(gtid, ct_psingle, id_ref);
704     }
705   }
706 #if USE_ITT_BUILD
707   if (status) {
708     __kmp_itt_single_start(gtid);
709   }
710 #endif /* USE_ITT_BUILD */
711   return status;
712 }
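/* Illustrative sketch (not part of the runtime): the SINGLE election above is
   essentially a compare-and-swap race on the team-wide construct counter,
   roughly

     kmp_int32 old_count = th->th.th_local.this_construct++;
     int won = __kmp_atomic_compare_store_acq(&team->t.t_construct, old_count,
                                              old_count + 1);

   so exactly one thread claims each SINGLE instance; the rest see the counter
   already advanced and skip the block. */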
713 
714 void __kmp_exit_single(int gtid) {
715 #if USE_ITT_BUILD
716   __kmp_itt_single_end(gtid);
717 #endif /* USE_ITT_BUILD */
718   if (__kmp_env_consistency_check)
719     __kmp_pop_workshare(gtid, ct_psingle, NULL);
720 }
721 
722 /* determine if we can go parallel or must use a serialized parallel region and
723  * how many threads we can use
724  * set_nthreads is the number of threads requested for the team
725  * returns 1 if we should serialize or only use one thread,
726  * otherwise the number of threads to use
727  * The forkjoin lock is held by the caller. */
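/* The requested count is clamped in stages below: first by the dynamic
   adjustment mode (load balance, thread limit, or random), then by
   KMP_ALL_THREADS/KMP_DEVICE_THREAD_LIMIT (__kmp_max_nth), then by
   OMP_THREAD_LIMIT (the contention group limit), and finally by how far the
   __kmp_threads array can actually be expanded. */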
728 static int __kmp_reserve_threads(kmp_root_t *root, kmp_team_t *parent_team,
729                                  int master_tid, int set_nthreads,
730                                  int enter_teams) {
731   int capacity;
732   int new_nthreads;
733   KMP_DEBUG_ASSERT(__kmp_init_serial);
734   KMP_DEBUG_ASSERT(root && parent_team);
735   kmp_info_t *this_thr = parent_team->t.t_threads[master_tid];
736 
737   // If dyn-var is set, dynamically adjust the number of desired threads,
738   // according to the method specified by dynamic_mode.
739   new_nthreads = set_nthreads;
740   if (!get__dynamic_2(parent_team, master_tid)) {
741     ;
742   }
743 #ifdef USE_LOAD_BALANCE
744   else if (__kmp_global.g.g_dynamic_mode == dynamic_load_balance) {
745     new_nthreads = __kmp_load_balance_nproc(root, set_nthreads);
746     if (new_nthreads == 1) {
747       KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced "
748                     "reservation to 1 thread\n",
749                     master_tid));
750       return 1;
751     }
752     if (new_nthreads < set_nthreads) {
753       KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced "
754                     "reservation to %d threads\n",
755                     master_tid, new_nthreads));
756     }
757   }
758 #endif /* USE_LOAD_BALANCE */
759   else if (__kmp_global.g.g_dynamic_mode == dynamic_thread_limit) {
760     new_nthreads = __kmp_avail_proc - __kmp_nth +
761                    (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
762     if (new_nthreads <= 1) {
763       KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced "
764                     "reservation to 1 thread\n",
765                     master_tid));
766       return 1;
767     }
768     if (new_nthreads < set_nthreads) {
769       KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced "
770                     "reservation to %d threads\n",
771                     master_tid, new_nthreads));
772     } else {
773       new_nthreads = set_nthreads;
774     }
775   } else if (__kmp_global.g.g_dynamic_mode == dynamic_random) {
776     if (set_nthreads > 2) {
777       new_nthreads = __kmp_get_random(parent_team->t.t_threads[master_tid]);
778       new_nthreads = (new_nthreads % set_nthreads) + 1;
779       if (new_nthreads == 1) {
780         KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced "
781                       "reservation to 1 thread\n",
782                       master_tid));
783         return 1;
784       }
785       if (new_nthreads < set_nthreads) {
786         KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced "
787                       "reservation to %d threads\n",
788                       master_tid, new_nthreads));
789       }
790     }
791   } else {
792     KMP_ASSERT(0);
793   }
794 
795   // Respect KMP_ALL_THREADS/KMP_DEVICE_THREAD_LIMIT.
796   if (__kmp_nth + new_nthreads -
797           (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
798       __kmp_max_nth) {
799     int tl_nthreads = __kmp_max_nth - __kmp_nth +
800                       (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
801     if (tl_nthreads <= 0) {
802       tl_nthreads = 1;
803     }
804 
805     // If dyn-var is false, emit a 1-time warning.
806     if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
807       __kmp_reserve_warn = 1;
808       __kmp_msg(kmp_ms_warning,
809                 KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads),
810                 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
811     }
812     if (tl_nthreads == 1) {
813       KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT "
814                     "reduced reservation to 1 thread\n",
815                     master_tid));
816       return 1;
817     }
818     KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT reduced "
819                   "reservation to %d threads\n",
820                   master_tid, tl_nthreads));
821     new_nthreads = tl_nthreads;
822   }
823 
824   // Respect OMP_THREAD_LIMIT
825   int cg_nthreads = this_thr->th.th_cg_roots->cg_nthreads;
826   int max_cg_threads = this_thr->th.th_cg_roots->cg_thread_limit;
827   if (cg_nthreads + new_nthreads -
828           (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
829       max_cg_threads) {
830     int tl_nthreads = max_cg_threads - cg_nthreads +
831                       (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
832     if (tl_nthreads <= 0) {
833       tl_nthreads = 1;
834     }
835 
836     // If dyn-var is false, emit a 1-time warning.
837     if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
838       __kmp_reserve_warn = 1;
839       __kmp_msg(kmp_ms_warning,
840                 KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads),
841                 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
842     }
843     if (tl_nthreads == 1) {
844       KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT "
845                     "reduced reservation to 1 thread\n",
846                     master_tid));
847       return 1;
848     }
849     KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT reduced "
850                   "reservation to %d threads\n",
851                   master_tid, tl_nthreads));
852     new_nthreads = tl_nthreads;
853   }
854 
855   // Check if the threads array is large enough, or needs expanding.
856   // See comment in __kmp_register_root() about the adjustment if
857   // __kmp_threads[0] == NULL.
858   capacity = __kmp_threads_capacity;
859   if (TCR_PTR(__kmp_threads[0]) == NULL) {
860     --capacity;
861   }
862   // If it is not for initializing the hidden helper team, we need to take
863   // __kmp_hidden_helper_threads_num out of the capacity because it is included
864   // in __kmp_threads_capacity.
865   if (__kmp_enable_hidden_helper && !TCR_4(__kmp_init_hidden_helper_threads)) {
866     capacity -= __kmp_hidden_helper_threads_num;
867   }
868   if (__kmp_nth + new_nthreads -
869           (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
870       capacity) {
871     // Expand the threads array.
872     int slotsRequired = __kmp_nth + new_nthreads -
873                         (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) -
874                         capacity;
875     int slotsAdded = __kmp_expand_threads(slotsRequired);
876     if (slotsAdded < slotsRequired) {
877       // The threads array was not expanded enough.
878       new_nthreads -= (slotsRequired - slotsAdded);
879       KMP_ASSERT(new_nthreads >= 1);
880 
881       // If dyn-var is false, emit a 1-time warning.
882       if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
883         __kmp_reserve_warn = 1;
884         if (__kmp_tp_cached) {
885           __kmp_msg(kmp_ms_warning,
886                     KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads),
887                     KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity),
888                     KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null);
889         } else {
890           __kmp_msg(kmp_ms_warning,
891                     KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads),
892                     KMP_HNT(SystemLimitOnThreads), __kmp_msg_null);
893         }
894       }
895     }
896   }
897 
898 #ifdef KMP_DEBUG
899   if (new_nthreads == 1) {
900     KC_TRACE(10,
901              ("__kmp_reserve_threads: T#%d serializing team after reclaiming "
902               "dead roots and rechecking; requested %d threads\n",
903               __kmp_get_gtid(), set_nthreads));
904   } else {
905     KC_TRACE(10, ("__kmp_reserve_threads: T#%d allocating %d threads; requested"
906                   " %d threads\n",
907                   __kmp_get_gtid(), new_nthreads, set_nthreads));
908   }
909 #endif // KMP_DEBUG
910   return new_nthreads;
911 }
912 
913 /* Allocate threads from the thread pool and assign them to the new team. We are
914    assured that there are enough threads available, because we checked on that
915    earlier while holding the forkjoin lock */
916 static void __kmp_fork_team_threads(kmp_root_t *root, kmp_team_t *team,
917                                     kmp_info_t *master_th, int master_gtid,
918                                     int fork_teams_workers) {
919   int i;
920   int use_hot_team;
921 
922   KA_TRACE(10, ("__kmp_fork_team_threads: new_nprocs = %d\n", team->t.t_nproc));
923   KMP_DEBUG_ASSERT(master_gtid == __kmp_get_gtid());
924   KMP_MB();
925 
926   /* first, let's setup the primary thread */
927   master_th->th.th_info.ds.ds_tid = 0;
928   master_th->th.th_team = team;
929   master_th->th.th_team_nproc = team->t.t_nproc;
930   master_th->th.th_team_master = master_th;
931   master_th->th.th_team_serialized = FALSE;
932   master_th->th.th_dispatch = &team->t.t_dispatch[0];
933 
934 /* make sure we are not the optimized hot team */
935 #if KMP_NESTED_HOT_TEAMS
936   use_hot_team = 0;
937   kmp_hot_team_ptr_t *hot_teams = master_th->th.th_hot_teams;
938   if (hot_teams) { // hot teams array is not allocated if
939     // KMP_HOT_TEAMS_MAX_LEVEL=0
940     int level = team->t.t_active_level - 1; // index in array of hot teams
941     if (master_th->th.th_teams_microtask) { // are we inside the teams?
942       if (master_th->th.th_teams_size.nteams > 1) {
943         ++level; // level was not increased in teams construct for
944         // team_of_masters
945       }
946       if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
947           master_th->th.th_teams_level == team->t.t_level) {
948         ++level; // level was not increased in teams construct for
949         // team_of_workers before the parallel
950       } // team->t.t_level will be increased inside parallel
951     }
952     if (level < __kmp_hot_teams_max_level) {
953       if (hot_teams[level].hot_team) {
954         // hot team has already been allocated for given level
955         KMP_DEBUG_ASSERT(hot_teams[level].hot_team == team);
956         use_hot_team = 1; // the team is ready to use
957       } else {
958         use_hot_team = 0; // AC: threads are not allocated yet
959         hot_teams[level].hot_team = team; // remember new hot team
960         hot_teams[level].hot_team_nth = team->t.t_nproc;
961       }
962     } else {
963       use_hot_team = 0;
964     }
965   }
966 #else
967   use_hot_team = team == root->r.r_hot_team;
968 #endif
969   if (!use_hot_team) {
970 
971     /* install the primary thread */
972     team->t.t_threads[0] = master_th;
973     __kmp_initialize_info(master_th, team, 0, master_gtid);
974 
975     /* now, install the worker threads */
976     for (i = 1; i < team->t.t_nproc; i++) {
977 
978       /* fork or reallocate a new thread and install it in team */
979       kmp_info_t *thr = __kmp_allocate_thread(root, team, i);
980       team->t.t_threads[i] = thr;
981       KMP_DEBUG_ASSERT(thr);
982       KMP_DEBUG_ASSERT(thr->th.th_team == team);
983       /* align team and thread arrived states */
984       KA_TRACE(20, ("__kmp_fork_team_threads: T#%d(%d:%d) init arrived "
985                     "T#%d(%d:%d) join =%llu, plain=%llu\n",
986                     __kmp_gtid_from_tid(0, team), team->t.t_id, 0,
987                     __kmp_gtid_from_tid(i, team), team->t.t_id, i,
988                     team->t.t_bar[bs_forkjoin_barrier].b_arrived,
989                     team->t.t_bar[bs_plain_barrier].b_arrived));
990       thr->th.th_teams_microtask = master_th->th.th_teams_microtask;
991       thr->th.th_teams_level = master_th->th.th_teams_level;
992       thr->th.th_teams_size = master_th->th.th_teams_size;
993       { // Initialize threads' barrier data.
994         int b;
995         kmp_balign_t *balign = team->t.t_threads[i]->th.th_bar;
996         for (b = 0; b < bs_last_barrier; ++b) {
997           balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
998           KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
999 #if USE_DEBUGGER
1000           balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
1001 #endif
1002         }
1003       }
1004     }
1005 
1006 #if KMP_AFFINITY_SUPPORTED
1007     // Do not partition the places list for teams construct workers who
1008     // haven't actually been forked to do real work yet. This partitioning
1009     // will take place in the parallel region nested within the teams construct.
1010     if (!fork_teams_workers) {
1011       __kmp_partition_places(team);
1012     }
1013 #endif
1014   }
1015 
1016   if (__kmp_display_affinity && team->t.t_display_affinity != 1) {
1017     for (i = 0; i < team->t.t_nproc; i++) {
1018       kmp_info_t *thr = team->t.t_threads[i];
1019       if (thr->th.th_prev_num_threads != team->t.t_nproc ||
1020           thr->th.th_prev_level != team->t.t_level) {
1021         team->t.t_display_affinity = 1;
1022         break;
1023       }
1024     }
1025   }
1026 
1027   KMP_MB();
1028 }
1029 
1030 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
1031 // Propagate any changes to the floating point control registers out to the team
1032 // We try to avoid unnecessary writes to the relevant cache line in the team
1033 // structure, so we don't make changes unless they are needed.
1034 inline static void propagateFPControl(kmp_team_t *team) {
1035   if (__kmp_inherit_fp_control) {
1036     kmp_int16 x87_fpu_control_word;
1037     kmp_uint32 mxcsr;
1038 
1039     // Get primary thread's values of FPU control flags (both X87 and vector)
1040     __kmp_store_x87_fpu_control_word(&x87_fpu_control_word);
1041     __kmp_store_mxcsr(&mxcsr);
1042     mxcsr &= KMP_X86_MXCSR_MASK;
1043 
1044     // There is no point looking at t_fp_control_saved here.
1045     // If it is TRUE, we still have to update the values if they are different
1046     // from those we now have. If it is FALSE we didn't save anything yet, but
1047     // our objective is the same. We have to ensure that the values in the team
1048     // are the same as those we have.
1049     // So, this code achieves what we need whether or not t_fp_control_saved is
1050     // true. By checking whether the value needs updating we avoid unnecessary
1051     // writes that would put the cache-line into a written state, causing all
1052     // threads in the team to have to read it again.
1053     KMP_CHECK_UPDATE(team->t.t_x87_fpu_control_word, x87_fpu_control_word);
1054     KMP_CHECK_UPDATE(team->t.t_mxcsr, mxcsr);
1055     // Although we don't use this value, other code in the runtime wants to know
1056     // whether it should restore them. So we must ensure it is correct.
1057     KMP_CHECK_UPDATE(team->t.t_fp_control_saved, TRUE);
1058   } else {
1059     // Similarly here. Don't write to this cache-line in the team structure
1060     // unless we have to.
1061     KMP_CHECK_UPDATE(team->t.t_fp_control_saved, FALSE);
1062   }
1063 }
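// Illustrative sketch (an assumption about kmp.h, not a quote of it):
// KMP_CHECK_UPDATE is expected to expand to the check-before-write idiom used
// above, roughly
//   if ((dst) != (src)) (dst) = (src);
// so the team's cache line stays shared whenever nothing actually changed.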
1064 
1065 // Do the opposite, setting the hardware registers to the updated values from
1066 // the team.
1067 inline static void updateHWFPControl(kmp_team_t *team) {
1068   if (__kmp_inherit_fp_control && team->t.t_fp_control_saved) {
1069     // Only reset the fp control regs if they have been changed in the team by
1070     // the parallel region that we are exiting.
1071     kmp_int16 x87_fpu_control_word;
1072     kmp_uint32 mxcsr;
1073     __kmp_store_x87_fpu_control_word(&x87_fpu_control_word);
1074     __kmp_store_mxcsr(&mxcsr);
1075     mxcsr &= KMP_X86_MXCSR_MASK;
1076 
1077     if (team->t.t_x87_fpu_control_word != x87_fpu_control_word) {
1078       __kmp_clear_x87_fpu_status_word();
1079       __kmp_load_x87_fpu_control_word(&team->t.t_x87_fpu_control_word);
1080     }
1081 
1082     if (team->t.t_mxcsr != mxcsr) {
1083       __kmp_load_mxcsr(&team->t.t_mxcsr);
1084     }
1085   }
1086 }
1087 #else
1088 #define propagateFPControl(x) ((void)0)
1089 #define updateHWFPControl(x) ((void)0)
1090 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
1091 
1092 static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team,
1093                                      int realloc); // forward declaration
1094 
1095 /* Run a parallel region that has been serialized, so it runs only in a team of the
1096    single primary thread. */
1097 void __kmp_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
1098   kmp_info_t *this_thr;
1099   kmp_team_t *serial_team;
1100 
1101   KC_TRACE(10, ("__kmpc_serialized_parallel: called by T#%d\n", global_tid));
1102 
1103   /* Skip all this code for autopar serialized loops since it results in
1104      unacceptable overhead */
1105   if (loc != NULL && (loc->flags & KMP_IDENT_AUTOPAR))
1106     return;
1107 
1108   if (!TCR_4(__kmp_init_parallel))
1109     __kmp_parallel_initialize();
1110   __kmp_resume_if_soft_paused();
1111 
1112   this_thr = __kmp_threads[global_tid];
1113   serial_team = this_thr->th.th_serial_team;
1114 
1115   /* utilize the serialized team held by this thread */
1116   KMP_DEBUG_ASSERT(serial_team);
1117   KMP_MB();
1118 
1119   if (__kmp_tasking_mode != tskm_immediate_exec) {
1120     KMP_DEBUG_ASSERT(
1121         this_thr->th.th_task_team ==
1122         this_thr->th.th_team->t.t_task_team[this_thr->th.th_task_state]);
1123     KMP_DEBUG_ASSERT(serial_team->t.t_task_team[this_thr->th.th_task_state] ==
1124                      NULL);
1125     KA_TRACE(20, ("__kmpc_serialized_parallel: T#%d pushing task_team %p / "
1126                   "team %p, new task_team = NULL\n",
1127                   global_tid, this_thr->th.th_task_team, this_thr->th.th_team));
1128     this_thr->th.th_task_team = NULL;
1129   }
1130 
1131   kmp_proc_bind_t proc_bind = this_thr->th.th_set_proc_bind;
1132   if (this_thr->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
1133     proc_bind = proc_bind_false;
1134   } else if (proc_bind == proc_bind_default) {
1135     // No proc_bind clause was specified, so use the current value
1136     // of proc-bind-var for this parallel region.
1137     proc_bind = this_thr->th.th_current_task->td_icvs.proc_bind;
1138   }
1139   // Reset for next parallel region
1140   this_thr->th.th_set_proc_bind = proc_bind_default;
1141 
1142 #if OMPT_SUPPORT
1143   ompt_data_t ompt_parallel_data = ompt_data_none;
1144   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(global_tid);
1145   if (ompt_enabled.enabled &&
1146       this_thr->th.ompt_thread_info.state != ompt_state_overhead) {
1147 
1148     ompt_task_info_t *parent_task_info;
1149     parent_task_info = OMPT_CUR_TASK_INFO(this_thr);
1150 
1151     parent_task_info->frame.enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1152     if (ompt_enabled.ompt_callback_parallel_begin) {
1153       int team_size = 1;
1154 
1155       ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)(
1156           &(parent_task_info->task_data), &(parent_task_info->frame),
1157           &ompt_parallel_data, team_size,
1158           ompt_parallel_invoker_program | ompt_parallel_team, codeptr);
1159     }
1160   }
1161 #endif // OMPT_SUPPORT
1162 
1163   if (this_thr->th.th_team != serial_team) {
1164     // Nested level will be an index in the nested nthreads array
1165     int level = this_thr->th.th_team->t.t_level;
1166 
1167     if (serial_team->t.t_serialized) {
1168       /* this serial team was already used
1169          TODO increase performance by making these locks more specific */
1170       kmp_team_t *new_team;
1171 
1172       __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
1173 
1174       new_team =
1175           __kmp_allocate_team(this_thr->th.th_root, 1, 1,
1176 #if OMPT_SUPPORT
1177                               ompt_parallel_data,
1178 #endif
1179                               proc_bind, &this_thr->th.th_current_task->td_icvs,
1180                               0 USE_NESTED_HOT_ARG(NULL));
1181       __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
1182       KMP_ASSERT(new_team);
1183 
1184       /* setup new serialized team and install it */
1185       new_team->t.t_threads[0] = this_thr;
1186       new_team->t.t_parent = this_thr->th.th_team;
1187       serial_team = new_team;
1188       this_thr->th.th_serial_team = serial_team;
1189 
1190       KF_TRACE(
1191           10,
1192           ("__kmpc_serialized_parallel: T#%d allocated new serial team %p\n",
1193            global_tid, serial_team));
1194 
1195       /* TODO the above breaks the requirement that if we run out of resources,
1196          then we can still guarantee that serialized teams are ok, since we may
1197          need to allocate a new one */
1198     } else {
1199       KF_TRACE(
1200           10,
1201           ("__kmpc_serialized_parallel: T#%d reusing cached serial team %p\n",
1202            global_tid, serial_team));
1203     }
1204 
1205     /* we have to initialize this serial team */
1206     KMP_DEBUG_ASSERT(serial_team->t.t_threads);
1207     KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
1208     KMP_DEBUG_ASSERT(this_thr->th.th_team != serial_team);
1209     serial_team->t.t_ident = loc;
1210     serial_team->t.t_serialized = 1;
1211     serial_team->t.t_nproc = 1;
1212     serial_team->t.t_parent = this_thr->th.th_team;
1213     serial_team->t.t_sched.sched = this_thr->th.th_team->t.t_sched.sched;
1214     this_thr->th.th_team = serial_team;
1215     serial_team->t.t_master_tid = this_thr->th.th_info.ds.ds_tid;
1216 
1217     KF_TRACE(10, ("__kmpc_serialized_parallel: T#%d curtask=%p\n", global_tid,
1218                   this_thr->th.th_current_task));
1219     KMP_ASSERT(this_thr->th.th_current_task->td_flags.executing == 1);
1220     this_thr->th.th_current_task->td_flags.executing = 0;
1221 
1222     __kmp_push_current_task_to_thread(this_thr, serial_team, 0);
1223 
1224     /* TODO: GEH: do ICVs work for nested serialized teams? Don't we need an
1225        implicit task for each serialized task represented by
1226        team->t.t_serialized? */
1227     copy_icvs(&this_thr->th.th_current_task->td_icvs,
1228               &this_thr->th.th_current_task->td_parent->td_icvs);
1229 
1230     // Thread value exists in the nested nthreads array for the next nested
1231     // level
1232     if (__kmp_nested_nth.used && (level + 1 < __kmp_nested_nth.used)) {
1233       this_thr->th.th_current_task->td_icvs.nproc =
1234           __kmp_nested_nth.nth[level + 1];
1235     }
1236 
1237     if (__kmp_nested_proc_bind.used &&
1238         (level + 1 < __kmp_nested_proc_bind.used)) {
1239       this_thr->th.th_current_task->td_icvs.proc_bind =
1240           __kmp_nested_proc_bind.bind_types[level + 1];
1241     }
1242 
1243 #if USE_DEBUGGER
1244     serial_team->t.t_pkfn = (microtask_t)(~0); // For the debugger.
1245 #endif
1246     this_thr->th.th_info.ds.ds_tid = 0;
1247 
1248     /* set thread cache values */
1249     this_thr->th.th_team_nproc = 1;
1250     this_thr->th.th_team_master = this_thr;
1251     this_thr->th.th_team_serialized = 1;
1252 
1253     serial_team->t.t_level = serial_team->t.t_parent->t.t_level + 1;
1254     serial_team->t.t_active_level = serial_team->t.t_parent->t.t_active_level;
1255     serial_team->t.t_def_allocator = this_thr->th.th_def_allocator; // save
1256 
1257     propagateFPControl(serial_team);
1258 
1259     /* check if we need to allocate dispatch buffers stack */
1260     KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
1261     if (!serial_team->t.t_dispatch->th_disp_buffer) {
1262       serial_team->t.t_dispatch->th_disp_buffer =
1263           (dispatch_private_info_t *)__kmp_allocate(
1264               sizeof(dispatch_private_info_t));
1265     }
1266     this_thr->th.th_dispatch = serial_team->t.t_dispatch;
1267 
1268     KMP_MB();
1269 
1270   } else {
1271     /* this serialized team is already being used,
1272      * that's fine, just add another nested level */
1273     KMP_DEBUG_ASSERT(this_thr->th.th_team == serial_team);
1274     KMP_DEBUG_ASSERT(serial_team->t.t_threads);
1275     KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
1276     ++serial_team->t.t_serialized;
1277     this_thr->th.th_team_serialized = serial_team->t.t_serialized;
1278 
1279     // Nested level will be an index in the nested nthreads array
1280     int level = this_thr->th.th_team->t.t_level;
1281     // Thread value exists in the nested nthreads array for the next nested
1282     // level
1283     if (__kmp_nested_nth.used && (level + 1 < __kmp_nested_nth.used)) {
1284       this_thr->th.th_current_task->td_icvs.nproc =
1285           __kmp_nested_nth.nth[level + 1];
1286     }
1287     serial_team->t.t_level++;
1288     KF_TRACE(10, ("__kmpc_serialized_parallel: T#%d increasing nesting level "
1289                   "of serial team %p to %d\n",
1290                   global_tid, serial_team, serial_team->t.t_level));
1291 
1292     /* allocate/push dispatch buffers stack */
1293     KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
1294     {
1295       dispatch_private_info_t *disp_buffer =
1296           (dispatch_private_info_t *)__kmp_allocate(
1297               sizeof(dispatch_private_info_t));
1298       disp_buffer->next = serial_team->t.t_dispatch->th_disp_buffer;
1299       serial_team->t.t_dispatch->th_disp_buffer = disp_buffer;
1300     }
1301     this_thr->th.th_dispatch = serial_team->t.t_dispatch;
1302 
1303     KMP_MB();
1304   }
1305   KMP_CHECK_UPDATE(serial_team->t.t_cancel_request, cancel_noreq);
1306 
1307   // Perform the display affinity functionality for
1308   // serialized parallel regions
1309   if (__kmp_display_affinity) {
1310     if (this_thr->th.th_prev_level != serial_team->t.t_level ||
1311         this_thr->th.th_prev_num_threads != 1) {
1312       // NULL means use the affinity-format-var ICV
1313       __kmp_aux_display_affinity(global_tid, NULL);
1314       this_thr->th.th_prev_level = serial_team->t.t_level;
1315       this_thr->th.th_prev_num_threads = 1;
1316     }
1317   }
1318 
1319   if (__kmp_env_consistency_check)
1320     __kmp_push_parallel(global_tid, NULL);
1321 #if OMPT_SUPPORT
1322   serial_team->t.ompt_team_info.master_return_address = codeptr;
1323   if (ompt_enabled.enabled &&
1324       this_thr->th.ompt_thread_info.state != ompt_state_overhead) {
1325     OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame.ptr =
1326         OMPT_GET_FRAME_ADDRESS(0);
1327 
1328     ompt_lw_taskteam_t lw_taskteam;
1329     __ompt_lw_taskteam_init(&lw_taskteam, this_thr, global_tid,
1330                             &ompt_parallel_data, codeptr);
1331 
1332     __ompt_lw_taskteam_link(&lw_taskteam, this_thr, 1);
1333     // don't use lw_taskteam after linking. content was swapped
1334 
1335     /* OMPT implicit task begin */
1336     if (ompt_enabled.ompt_callback_implicit_task) {
1337       ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1338           ompt_scope_begin, OMPT_CUR_TEAM_DATA(this_thr),
1339           OMPT_CUR_TASK_DATA(this_thr), 1, __kmp_tid_from_gtid(global_tid),
1340           ompt_task_implicit); // TODO: Can this be ompt_task_initial?
1341       OMPT_CUR_TASK_INFO(this_thr)->thread_num =
1342           __kmp_tid_from_gtid(global_tid);
1343     }
1344 
1345     /* OMPT state */
1346     this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
1347     OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame.ptr =
1348         OMPT_GET_FRAME_ADDRESS(0);
1349   }
1350 #endif
1351 }
1352 
1353 /* most of the work for a fork */
1354 /* return true if we really went parallel, false if serialized */
1355 int __kmp_fork_call(ident_t *loc, int gtid,
1356                     enum fork_context_e call_context, // Intel, GNU, ...
1357                     kmp_int32 argc, microtask_t microtask, launch_t invoker,
1358                     kmp_va_list ap) {
1359   void **argv;
1360   int i;
1361   int master_tid;
1362   int master_this_cons;
1363   kmp_team_t *team;
1364   kmp_team_t *parent_team;
1365   kmp_info_t *master_th;
1366   kmp_root_t *root;
1367   int nthreads;
1368   int master_active;
1369   int master_set_numthreads;
1370   int level;
1371   int active_level;
1372   int teams_level;
1373 #if KMP_NESTED_HOT_TEAMS
1374   kmp_hot_team_ptr_t **p_hot_teams;
1375 #endif
1376   { // KMP_TIME_BLOCK
1377     KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_fork_call);
1378     KMP_COUNT_VALUE(OMP_PARALLEL_args, argc);
1379 
1380     KA_TRACE(20, ("__kmp_fork_call: enter T#%d\n", gtid));
1381     if (__kmp_stkpadding > 0 && __kmp_root[gtid] != NULL) {
1382       /* Some systems prefer the stack for the root thread(s) to start with */
1383       /* some gap from the parent stack to prevent false sharing. */
1384       void *dummy = KMP_ALLOCA(__kmp_stkpadding);
1385       /* These 2 lines below are so this does not get optimized out */
1386       if (__kmp_stkpadding > KMP_MAX_STKPADDING)
1387         __kmp_stkpadding += (short)((kmp_int64)dummy);
1388     }
1389 
1390     /* initialize if needed */
1391     KMP_DEBUG_ASSERT(
1392         __kmp_init_serial); // AC: potentially unsafe, not in sync with shutdown
1393     if (!TCR_4(__kmp_init_parallel))
1394       __kmp_parallel_initialize();
1395     __kmp_resume_if_soft_paused();
1396 
1397     /* setup current data */
1398     master_th = __kmp_threads[gtid]; // AC: potentially unsafe, not in sync with
1399     // shutdown
1400     parent_team = master_th->th.th_team;
1401     master_tid = master_th->th.th_info.ds.ds_tid;
1402     master_this_cons = master_th->th.th_local.this_construct;
1403     root = master_th->th.th_root;
1404     master_active = root->r.r_active;
1405     master_set_numthreads = master_th->th.th_set_nproc;
1406 
1407 #if OMPT_SUPPORT
1408     ompt_data_t ompt_parallel_data = ompt_data_none;
1409     ompt_data_t *parent_task_data;
1410     ompt_frame_t *ompt_frame;
1411     ompt_data_t *implicit_task_data;
1412     void *return_address = NULL;
1413 
1414     if (ompt_enabled.enabled) {
1415       __ompt_get_task_info_internal(0, NULL, &parent_task_data, &ompt_frame,
1416                                     NULL, NULL);
1417       return_address = OMPT_LOAD_RETURN_ADDRESS(gtid);
1418     }
1419 #endif
1420 
1421     // Assign affinity to root thread if it hasn't happened yet
1422     __kmp_assign_root_init_mask();
1423 
1424     // Nested level will be an index in the nested nthreads array
1425     level = parent_team->t.t_level;
1426     // used to launch non-serial teams even if nesting is not allowed
1427     active_level = parent_team->t.t_active_level;
1428     // needed to check nesting inside the teams construct
1429     teams_level = master_th->th.th_teams_level;
1430 #if KMP_NESTED_HOT_TEAMS
1431     p_hot_teams = &master_th->th.th_hot_teams;
1432     if (*p_hot_teams == NULL && __kmp_hot_teams_max_level > 0) {
1433       *p_hot_teams = (kmp_hot_team_ptr_t *)__kmp_allocate(
1434           sizeof(kmp_hot_team_ptr_t) * __kmp_hot_teams_max_level);
1435       (*p_hot_teams)[0].hot_team = root->r.r_hot_team;
1436       // it is either the actual hot team or not needed (when active_level > 0)
1437       (*p_hot_teams)[0].hot_team_nth = 1;
1438     }
1439 #endif
1440 
1441 #if OMPT_SUPPORT
1442     if (ompt_enabled.enabled) {
1443       if (ompt_enabled.ompt_callback_parallel_begin) {
1444         int team_size = master_set_numthreads
1445                             ? master_set_numthreads
1446                             : get__nproc_2(parent_team, master_tid);
1447         int flags = OMPT_INVOKER(call_context) |
1448                     ((microtask == (microtask_t)__kmp_teams_master)
1449                          ? ompt_parallel_league
1450                          : ompt_parallel_team);
1451         ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)(
1452             parent_task_data, ompt_frame, &ompt_parallel_data, team_size, flags,
1453             return_address);
1454       }
1455       master_th->th.ompt_thread_info.state = ompt_state_overhead;
1456     }
1457 #endif
1458 
1459     master_th->th.th_ident = loc;
1460 
1461     if (master_th->th.th_teams_microtask && ap &&
1462         microtask != (microtask_t)__kmp_teams_master && level == teams_level) {
1463       // AC: This is the start of a parallel nested inside a teams construct.
1464       // The team is actual (hot); all workers are ready at the fork barrier.
1465       // No lock is needed to initialize the team a bit, then release workers.
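      // Illustrative (hypothetical) user code that reaches this path:
      //   #pragma omp teams num_teams(2)
      //   #pragma omp parallel num_threads(4)
      //   { ... }
      // i.e. the parallel construct immediately nested in a teams construct.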
1466       parent_team->t.t_ident = loc;
1467       __kmp_alloc_argv_entries(argc, parent_team, TRUE);
1468       parent_team->t.t_argc = argc;
1469       argv = (void **)parent_team->t.t_argv;
1470       for (i = argc - 1; i >= 0; --i)
1471         *argv++ = va_arg(kmp_va_deref(ap), void *);
1472       // Increment our nested depth level, but do not increase serialization
1473       if (parent_team == master_th->th.th_serial_team) {
1474         // AC: we are in serialized parallel
1475         __kmpc_serialized_parallel(loc, gtid);
1476         KMP_DEBUG_ASSERT(parent_team->t.t_serialized > 1);
1477 
1478         if (call_context == fork_context_gnu) {
1479           // AC: need to decrement t_serialized for enquiry functions to work
1480           // correctly, will restore at join time
1481           parent_team->t.t_serialized--;
1482           return TRUE;
1483         }
1484 
1485 #if OMPD_SUPPORT
1486         parent_team->t.t_pkfn = microtask;
1487 #endif
1488 
1489 #if OMPT_SUPPORT
1490         void *dummy;
1491         void **exit_frame_p;
1492 
1493         ompt_lw_taskteam_t lw_taskteam;
1494 
1495         if (ompt_enabled.enabled) {
1496           __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1497                                   &ompt_parallel_data, return_address);
1498           exit_frame_p = &(lw_taskteam.ompt_task_info.frame.exit_frame.ptr);
1499 
1500           __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
1501           // don't use lw_taskteam after linking. content was swapped
1502 
1503           /* OMPT implicit task begin */
1504           implicit_task_data = OMPT_CUR_TASK_DATA(master_th);
1505           if (ompt_enabled.ompt_callback_implicit_task) {
1506             OMPT_CUR_TASK_INFO(master_th)->thread_num =
1507                 __kmp_tid_from_gtid(gtid);
1508             ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1509                 ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1510                 implicit_task_data, 1,
1511                 OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
1512           }
1513 
1514           /* OMPT state */
1515           master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1516         } else {
1517           exit_frame_p = &dummy;
1518         }
1519 #endif
1520         // AC: need to decrement t_serialized for enquiry functions to work
1521         // correctly, will restore at join time
1522         parent_team->t.t_serialized--;
1523 
1524         {
1525           KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1526           KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1527           __kmp_invoke_microtask(microtask, gtid, 0, argc, parent_team->t.t_argv
1528 #if OMPT_SUPPORT
1529                                  ,
1530                                  exit_frame_p
1531 #endif
1532           );
1533         }
1534 
1535 #if OMPT_SUPPORT
1536         if (ompt_enabled.enabled) {
1537           *exit_frame_p = NULL;
1538           OMPT_CUR_TASK_INFO(master_th)->frame.exit_frame = ompt_data_none;
1539           if (ompt_enabled.ompt_callback_implicit_task) {
1540             ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1541                 ompt_scope_end, NULL, implicit_task_data, 1,
1542                 OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
1543           }
1544           ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
1545           __ompt_lw_taskteam_unlink(master_th);
1546           if (ompt_enabled.ompt_callback_parallel_end) {
1547             ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1548                 &ompt_parallel_data, OMPT_CUR_TASK_DATA(master_th),
1549                 OMPT_INVOKER(call_context) | ompt_parallel_team,
1550                 return_address);
1551           }
1552           master_th->th.ompt_thread_info.state = ompt_state_overhead;
1553         }
1554 #endif
1555         return TRUE;
1556       }
1557 
1558       parent_team->t.t_pkfn = microtask;
1559       parent_team->t.t_invoke = invoker;
1560       KMP_ATOMIC_INC(&root->r.r_in_parallel);
1561       parent_team->t.t_active_level++;
1562       parent_team->t.t_level++;
1563       parent_team->t.t_def_allocator = master_th->th.th_def_allocator; // save
1564 
1565 #if OMPT_SUPPORT
1566       if (ompt_enabled.enabled) {
1567         ompt_lw_taskteam_t lw_taskteam;
1568         __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1569                                 &ompt_parallel_data, return_address);
1570         __ompt_lw_taskteam_link(&lw_taskteam, master_th, 1, true);
1571       }
1572 #endif
1573 
1574       /* Change number of threads in the team if requested */
1575       if (master_set_numthreads) { // The parallel has num_threads clause
1576         if (master_set_numthreads <= master_th->th.th_teams_size.nth) {
1577           // AC: we can only reduce the number of threads dynamically, not increase
1578           kmp_info_t **other_threads = parent_team->t.t_threads;
1579           // NOTE: if using distributed barrier, we need to run this code block
1580           // even when the team size appears not to have changed from the max.
1581           int old_proc = master_th->th.th_teams_size.nth;
1582           if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] ==
1583               bp_dist_bar) {
1584             __kmp_resize_dist_barrier(parent_team, old_proc,
1585                                       master_set_numthreads);
1586             __kmp_add_threads_to_team(parent_team, master_set_numthreads);
1587           }
1588           parent_team->t.t_nproc = master_set_numthreads;
1589           for (i = 0; i < master_set_numthreads; ++i) {
1590             other_threads[i]->th.th_team_nproc = master_set_numthreads;
1591           }
1592         }
1593         // Keep extra threads hot in the team for possible subsequent parallels
1594         master_th->th.th_set_nproc = 0;
1595       }
1596 
1597 #if USE_DEBUGGER
1598       if (__kmp_debugging) { // Let debugger override number of threads.
1599         int nth = __kmp_omp_num_threads(loc);
1600         if (nth > 0) { // 0 means debugger doesn't want to change num threads
1601           master_set_numthreads = nth;
1602         }
1603       }
1604 #endif
1605 
1606       // Figure out the proc_bind policy for the nested parallel within teams
1607       kmp_proc_bind_t proc_bind = master_th->th.th_set_proc_bind;
1608       // proc_bind_default means don't update
1609       kmp_proc_bind_t proc_bind_icv = proc_bind_default;
1610       if (master_th->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
1611         proc_bind = proc_bind_false;
1612       } else {
1613         // No proc_bind clause specified; use current proc-bind-var
1614         if (proc_bind == proc_bind_default) {
1615           proc_bind = master_th->th.th_current_task->td_icvs.proc_bind;
1616         }
1617         /* else: The proc_bind policy was specified explicitly on parallel
1618            clause.
1619            This overrides proc-bind-var for this parallel region, but does not
1620            change proc-bind-var. */
1621         // Figure the value of proc-bind-var for the child threads.
1622         if ((level + 1 < __kmp_nested_proc_bind.used) &&
1623             (__kmp_nested_proc_bind.bind_types[level + 1] !=
1624              master_th->th.th_current_task->td_icvs.proc_bind)) {
1625           proc_bind_icv = __kmp_nested_proc_bind.bind_types[level + 1];
1626         }
1627       }
1628       KMP_CHECK_UPDATE(parent_team->t.t_proc_bind, proc_bind);
1629       // Need to change the bind-var ICV to correct value for each implicit task
1630       if (proc_bind_icv != proc_bind_default &&
1631           master_th->th.th_current_task->td_icvs.proc_bind != proc_bind_icv) {
1632         kmp_info_t **other_threads = parent_team->t.t_threads;
1633         for (i = 0; i < master_th->th.th_team_nproc; ++i) {
1634           other_threads[i]->th.th_current_task->td_icvs.proc_bind =
1635               proc_bind_icv;
1636         }
1637       }
1638       // Reset for next parallel region
1639       master_th->th.th_set_proc_bind = proc_bind_default;
1640 
1641 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1642       if (((__itt_frame_submit_v3_ptr && __itt_get_timestamp_ptr) ||
1643            KMP_ITT_DEBUG) &&
1644           __kmp_forkjoin_frames_mode == 3 &&
1645           parent_team->t.t_active_level == 1 // only report frames at level 1
1646           && master_th->th.th_teams_size.nteams == 1) {
1647         kmp_uint64 tmp_time = __itt_get_timestamp();
1648         master_th->th.th_frame_time = tmp_time;
1649         parent_team->t.t_region_time = tmp_time;
1650       }
1651       if (__itt_stack_caller_create_ptr) {
1652         KMP_DEBUG_ASSERT(parent_team->t.t_stack_id == NULL);
1653         // create new stack stitching id before entering fork barrier
1654         parent_team->t.t_stack_id = __kmp_itt_stack_caller_create();
1655       }
1656 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
1657 #if KMP_AFFINITY_SUPPORTED
1658       __kmp_partition_places(parent_team);
1659 #endif
1660 
1661       KF_TRACE(10, ("__kmp_fork_call: before internal fork: root=%p, team=%p, "
1662                     "master_th=%p, gtid=%d\n",
1663                     root, parent_team, master_th, gtid));
1664       __kmp_internal_fork(loc, gtid, parent_team);
1665       KF_TRACE(10, ("__kmp_fork_call: after internal fork: root=%p, team=%p, "
1666                     "master_th=%p, gtid=%d\n",
1667                     root, parent_team, master_th, gtid));
1668 
1669       if (call_context == fork_context_gnu)
1670         return TRUE;
1671 
1672       /* Invoke microtask for PRIMARY thread */
1673       KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n", gtid,
1674                     parent_team->t.t_id, parent_team->t.t_pkfn));
1675 
1676       if (!parent_team->t.t_invoke(gtid)) {
1677         KMP_ASSERT2(0, "cannot invoke microtask for PRIMARY thread");
1678       }
1679       KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n", gtid,
1680                     parent_team->t.t_id, parent_team->t.t_pkfn));
1681       KMP_MB(); /* Flush all pending memory write invalidates.  */
1682 
1683       KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
1684 
1685       return TRUE;
1686     } // Parallel closely nested in teams construct
1687 
1688 #if KMP_DEBUG
1689     if (__kmp_tasking_mode != tskm_immediate_exec) {
1690       KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
1691                        parent_team->t.t_task_team[master_th->th.th_task_state]);
1692     }
1693 #endif
1694 
1695     // Need this to happen before we determine the number of threads, not while
1696     // we are allocating the team
1697     //__kmp_push_current_task_to_thread(master_th, parent_team, 0);
1698     int enter_teams = 0;
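    // enter_teams is set when this fork either creates the league of teams
    // itself (ap == NULL at active_level 0) or is the parallel construct
    // immediately nested inside the teams construct (teams_level == level).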
1699     if (parent_team->t.t_active_level >=
1700         master_th->th.th_current_task->td_icvs.max_active_levels) {
1701       nthreads = 1;
1702     } else {
1703       enter_teams = ((ap == NULL && active_level == 0) ||
1704                      (ap && teams_level > 0 && teams_level == level));
1705       nthreads = master_set_numthreads
1706                      ? master_set_numthreads
1707                      // TODO: get nproc directly from current task
1708                      : get__nproc_2(parent_team, master_tid);
1709       // Check whether we need to take the forkjoin lock (no need for a
1710       // serialized parallel outside of a teams construct). This code was moved
1711       // here from __kmp_reserve_threads() to speed up nested serialized parallels.
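      // For example (sketch): with max-active-levels-var == 1, a nested
      // parallel encountered while the root is already active is serialized
      // here (nthreads = 1) without taking __kmp_forkjoin_lock; the same
      // happens when the library mode is serial (e.g., KMP_LIBRARY=serial).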
1712       if (nthreads > 1) {
1713         if ((get__max_active_levels(master_th) == 1 &&
1714              (root->r.r_in_parallel && !enter_teams)) ||
1715             (__kmp_library == library_serial)) {
1716           KC_TRACE(10, ("__kmp_fork_call: T#%d serializing team; requested %d"
1717                         " threads\n",
1718                         gtid, nthreads));
1719           nthreads = 1;
1720         }
1721       }
1722       if (nthreads > 1) {
1723         /* determine how many new threads we can use */
1724         __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
1725         /* AC: If we execute teams from a parallel region (on the host), then
1726            teams should be created, but each can have only 1 thread if nesting is
1727            disabled. If teams is called from a serial region, then teams and their
1728            threads should be created regardless of the nesting setting. */
1729         nthreads = __kmp_reserve_threads(root, parent_team, master_tid,
1730                                          nthreads, enter_teams);
1731         if (nthreads == 1) {
1732           // Free lock for single thread execution here; for multi-thread
1733           // execution it will be freed later after team of threads created
1734           // and initialized
1735           __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
1736         }
1737       }
1738     }
1739     KMP_DEBUG_ASSERT(nthreads > 0);
1740 
1741     // If we temporarily changed the set number of threads then restore it now
1742     master_th->th.th_set_nproc = 0;
1743 
1744     /* create a serialized parallel region? */
1745     if (nthreads == 1) {
1746 /* josh todo: hypothetical question: what do we do for OS X*? */
1747 #if KMP_OS_LINUX &&                                                            \
1748     (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
1749       void *args[argc];
1750 #else
1751       void **args = (void **)KMP_ALLOCA(argc * sizeof(void *));
1752 #endif /* KMP_OS_LINUX && ( KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || \
1753           KMP_ARCH_AARCH64) */
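      // args receives the microtask arguments pulled from the varargs list for
      // the inline (serialized) invocation below; a VLA is used where it is
      // known to be safe, otherwise a KMP_ALLOCA'd buffer.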
1754 
1755       KA_TRACE(20,
1756                ("__kmp_fork_call: T#%d serializing parallel region\n", gtid));
1757 
1758       __kmpc_serialized_parallel(loc, gtid);
1759 
1760 #if OMPD_SUPPORT
1761       master_th->th.th_serial_team->t.t_pkfn = microtask;
1762 #endif
1763 
1764       if (call_context == fork_context_intel) {
1765         /* TODO this sucks, use the compiler itself to pass args! :) */
1766         master_th->th.th_serial_team->t.t_ident = loc;
1767         if (!ap) {
1768           // revert change made in __kmpc_serialized_parallel()
1769           master_th->th.th_serial_team->t.t_level--;
1770           // Get args from parent team for teams construct
1771 
1772 #if OMPT_SUPPORT
1773           void *dummy;
1774           void **exit_frame_p;
1775           ompt_task_info_t *task_info;
1776 
1777           ompt_lw_taskteam_t lw_taskteam;
1778 
1779           if (ompt_enabled.enabled) {
1780             __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1781                                     &ompt_parallel_data, return_address);
1782 
1783             __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
1784             // don't use lw_taskteam after linking. content was swapped
1785 
1786             task_info = OMPT_CUR_TASK_INFO(master_th);
1787             exit_frame_p = &(task_info->frame.exit_frame.ptr);
1788             if (ompt_enabled.ompt_callback_implicit_task) {
1789               OMPT_CUR_TASK_INFO(master_th)->thread_num =
1790                   __kmp_tid_from_gtid(gtid);
1791               ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1792                   ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1793                   &(task_info->task_data), 1,
1794                   OMPT_CUR_TASK_INFO(master_th)->thread_num,
1795                   ompt_task_implicit);
1796             }
1797 
1798             /* OMPT state */
1799             master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1800           } else {
1801             exit_frame_p = &dummy;
1802           }
1803 #endif
1804 
1805           {
1806             KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1807             KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1808             __kmp_invoke_microtask(microtask, gtid, 0, argc,
1809                                    parent_team->t.t_argv
1810 #if OMPT_SUPPORT
1811                                    ,
1812                                    exit_frame_p
1813 #endif
1814             );
1815           }
1816 
1817 #if OMPT_SUPPORT
1818           if (ompt_enabled.enabled) {
1819             *exit_frame_p = NULL;
1820             if (ompt_enabled.ompt_callback_implicit_task) {
1821               ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1822                   ompt_scope_end, NULL, &(task_info->task_data), 1,
1823                   OMPT_CUR_TASK_INFO(master_th)->thread_num,
1824                   ompt_task_implicit);
1825             }
1826             ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
1827             __ompt_lw_taskteam_unlink(master_th);
1828             if (ompt_enabled.ompt_callback_parallel_end) {
1829               ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1830                   &ompt_parallel_data, parent_task_data,
1831                   OMPT_INVOKER(call_context) | ompt_parallel_team,
1832                   return_address);
1833             }
1834             master_th->th.ompt_thread_info.state = ompt_state_overhead;
1835           }
1836 #endif
1837         } else if (microtask == (microtask_t)__kmp_teams_master) {
1838           KMP_DEBUG_ASSERT(master_th->th.th_team ==
1839                            master_th->th.th_serial_team);
1840           team = master_th->th.th_team;
1841           // team->t.t_pkfn = microtask;
1842           team->t.t_invoke = invoker;
1843           __kmp_alloc_argv_entries(argc, team, TRUE);
1844           team->t.t_argc = argc;
1845           argv = (void **)team->t.t_argv;
1846           if (ap) {
1847             for (i = argc - 1; i >= 0; --i)
1848               *argv++ = va_arg(kmp_va_deref(ap), void *);
1849           } else {
1850             for (i = 0; i < argc; ++i)
1851               // Get args from parent team for teams construct
1852               argv[i] = parent_team->t.t_argv[i];
1853           }
1854           // AC: revert change made in __kmpc_serialized_parallel()
1855           //     because initial code in teams should have level=0
1856           team->t.t_level--;
1857           // AC: call special invoker for outer "parallel" of teams construct
1858           invoker(gtid);
1859 #if OMPT_SUPPORT
1860           if (ompt_enabled.enabled) {
1861             ompt_task_info_t *task_info = OMPT_CUR_TASK_INFO(master_th);
1862             if (ompt_enabled.ompt_callback_implicit_task) {
1863               ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1864                   ompt_scope_end, NULL, &(task_info->task_data), 0,
1865                   OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_initial);
1866             }
1867             if (ompt_enabled.ompt_callback_parallel_end) {
1868               ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1869                   &ompt_parallel_data, parent_task_data,
1870                   OMPT_INVOKER(call_context) | ompt_parallel_league,
1871                   return_address);
1872             }
1873             master_th->th.ompt_thread_info.state = ompt_state_overhead;
1874           }
1875 #endif
1876         } else {
1877           argv = args;
1878           for (i = argc - 1; i >= 0; --i)
1879             *argv++ = va_arg(kmp_va_deref(ap), void *);
1880           KMP_MB();
1881 
1882 #if OMPT_SUPPORT
1883           void *dummy;
1884           void **exit_frame_p;
1885           ompt_task_info_t *task_info;
1886 
1887           ompt_lw_taskteam_t lw_taskteam;
1888 
1889           if (ompt_enabled.enabled) {
1890             __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1891                                     &ompt_parallel_data, return_address);
1892             __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
1893             // don't use lw_taskteam after linking. content was swapped
1894             task_info = OMPT_CUR_TASK_INFO(master_th);
1895             exit_frame_p = &(task_info->frame.exit_frame.ptr);
1896 
1897             /* OMPT implicit task begin */
1898             implicit_task_data = OMPT_CUR_TASK_DATA(master_th);
1899             if (ompt_enabled.ompt_callback_implicit_task) {
1900               ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1901                   ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1902                   implicit_task_data, 1, __kmp_tid_from_gtid(gtid),
1903                   ompt_task_implicit);
1904               OMPT_CUR_TASK_INFO(master_th)->thread_num =
1905                   __kmp_tid_from_gtid(gtid);
1906             }
1907 
1908             /* OMPT state */
1909             master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1910           } else {
1911             exit_frame_p = &dummy;
1912           }
1913 #endif
1914 
1915           {
1916             KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1917             KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1918             __kmp_invoke_microtask(microtask, gtid, 0, argc, args
1919 #if OMPT_SUPPORT
1920                                    ,
1921                                    exit_frame_p
1922 #endif
1923             );
1924           }
1925 
1926 #if OMPT_SUPPORT
1927           if (ompt_enabled.enabled) {
1928             *exit_frame_p = NULL;
1929             if (ompt_enabled.ompt_callback_implicit_task) {
1930               ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1931                   ompt_scope_end, NULL, &(task_info->task_data), 1,
1932                   OMPT_CUR_TASK_INFO(master_th)->thread_num,
1933                   ompt_task_implicit);
1934             }
1935 
1936             ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
1937             __ompt_lw_taskteam_unlink(master_th);
1938             if (ompt_enabled.ompt_callback_parallel_end) {
1939               ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1940                   &ompt_parallel_data, parent_task_data,
1941                   OMPT_INVOKER(call_context) | ompt_parallel_team,
1942                   return_address);
1943             }
1944             master_th->th.ompt_thread_info.state = ompt_state_overhead;
1945           }
1946 #endif
1947         }
1948       } else if (call_context == fork_context_gnu) {
1949 #if OMPT_SUPPORT
1950         ompt_lw_taskteam_t lwt;
1951         __ompt_lw_taskteam_init(&lwt, master_th, gtid, &ompt_parallel_data,
1952                                 return_address);
1953 
1954         lwt.ompt_task_info.frame.exit_frame = ompt_data_none;
1955         __ompt_lw_taskteam_link(&lwt, master_th, 1);
1956 // don't use lw_taskteam after linking. content was swapped
1957 #endif
1958 
1959         // we were called from GNU native code
1960         KA_TRACE(20, ("__kmp_fork_call: T#%d serial exit\n", gtid));
1961         return FALSE;
1962       } else {
1963         KMP_ASSERT2(call_context < fork_context_last,
1964                     "__kmp_fork_call: unknown fork_context parameter");
1965       }
1966 
1967       KA_TRACE(20, ("__kmp_fork_call: T#%d serial exit\n", gtid));
1968       KMP_MB();
1969       return FALSE;
1970     } // if (nthreads == 1)
1971 
1972     // GEH: only modify the executing flag in the case when not serialized
1973     //      serialized case is handled in kmpc_serialized_parallel
1974     KF_TRACE(10, ("__kmp_fork_call: parent_team_aclevel=%d, master_th=%p, "
1975                   "curtask=%p, curtask_max_aclevel=%d\n",
1976                   parent_team->t.t_active_level, master_th,
1977                   master_th->th.th_current_task,
1978                   master_th->th.th_current_task->td_icvs.max_active_levels));
1979     // TODO: GEH - cannot do this assertion because root thread not set up as
1980     // executing
1981     // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 1 );
1982     master_th->th.th_current_task->td_flags.executing = 0;
1983 
1984     if (!master_th->th.th_teams_microtask || level > teams_level) {
1985       /* Increment our nested depth level */
1986       KMP_ATOMIC_INC(&root->r.r_in_parallel);
1987     }
1988 
1989     // See if we need to make a copy of the ICVs.
1990     int nthreads_icv = master_th->th.th_current_task->td_icvs.nproc;
1991     if ((level + 1 < __kmp_nested_nth.used) &&
1992         (__kmp_nested_nth.nth[level + 1] != nthreads_icv)) {
1993       nthreads_icv = __kmp_nested_nth.nth[level + 1];
1994     } else {
1995       nthreads_icv = 0; // don't update
1996     }
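    // Sketch (assumed example): with a nesting list such as OMP_NUM_THREADS=4,2
    // __kmp_nested_nth holds {4, 2}, so a fork at level 0 picks up 2 here as
    // the nproc ICV to be copied into the new team's implicit tasks.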
1997 
1998     // Figure out the proc_bind_policy for the new team.
1999     kmp_proc_bind_t proc_bind = master_th->th.th_set_proc_bind;
2000     // proc_bind_default means don't update
2001     kmp_proc_bind_t proc_bind_icv = proc_bind_default;
2002     if (master_th->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
2003       proc_bind = proc_bind_false;
2004     } else {
2005       // No proc_bind clause specified; use current proc-bind-var for this
2006       // parallel region
2007       if (proc_bind == proc_bind_default) {
2008         proc_bind = master_th->th.th_current_task->td_icvs.proc_bind;
2009       }
2010       // Have teams construct take proc_bind value from KMP_TEAMS_PROC_BIND
2011       if (master_th->th.th_teams_microtask &&
2012           microtask == (microtask_t)__kmp_teams_master) {
2013         proc_bind = __kmp_teams_proc_bind;
2014       }
2015       /* else: The proc_bind policy was specified explicitly on parallel clause.
2016          This overrides proc-bind-var for this parallel region, but does not
2017          change proc-bind-var. */
2018       // Figure the value of proc-bind-var for the child threads.
2019       if ((level + 1 < __kmp_nested_proc_bind.used) &&
2020           (__kmp_nested_proc_bind.bind_types[level + 1] !=
2021            master_th->th.th_current_task->td_icvs.proc_bind)) {
2022         // Do not modify the proc bind icv for the two teams construct forks
2023         // They just let the proc bind icv pass through
2024         if (!master_th->th.th_teams_microtask ||
2025             !(microtask == (microtask_t)__kmp_teams_master || ap == NULL))
2026           proc_bind_icv = __kmp_nested_proc_bind.bind_types[level + 1];
2027       }
2028     }
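    // Sketch (assumed example): with OMP_PROC_BIND=spread,close, a parallel at
    // level 0 whose clause requests a specific binding uses that binding for
    // this team (proc_bind), while proc_bind_icv picks up the next list entry
    // (close) as the bind-var to be inherited by the child implicit tasks.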
2029 
2030     // Reset for next parallel region
2031     master_th->th.th_set_proc_bind = proc_bind_default;
2032 
2033     if ((nthreads_icv > 0) || (proc_bind_icv != proc_bind_default)) {
2034       kmp_internal_control_t new_icvs;
2035       copy_icvs(&new_icvs, &master_th->th.th_current_task->td_icvs);
2036       new_icvs.next = NULL;
2037       if (nthreads_icv > 0) {
2038         new_icvs.nproc = nthreads_icv;
2039       }
2040       if (proc_bind_icv != proc_bind_default) {
2041         new_icvs.proc_bind = proc_bind_icv;
2042       }
2043 
2044       /* allocate a new parallel team */
2045       KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n"));
2046       team = __kmp_allocate_team(root, nthreads, nthreads,
2047 #if OMPT_SUPPORT
2048                                  ompt_parallel_data,
2049 #endif
2050                                  proc_bind, &new_icvs,
2051                                  argc USE_NESTED_HOT_ARG(master_th));
2052       if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar)
2053         copy_icvs((kmp_internal_control_t *)team->t.b->team_icvs, &new_icvs);
2054     } else {
2055       /* allocate a new parallel team */
2056       KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n"));
2057       team = __kmp_allocate_team(root, nthreads, nthreads,
2058 #if OMPT_SUPPORT
2059                                  ompt_parallel_data,
2060 #endif
2061                                  proc_bind,
2062                                  &master_th->th.th_current_task->td_icvs,
2063                                  argc USE_NESTED_HOT_ARG(master_th));
2064       if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar)
2065         copy_icvs((kmp_internal_control_t *)team->t.b->team_icvs,
2066                   &master_th->th.th_current_task->td_icvs);
2067     }
2068     KF_TRACE(
2069         10, ("__kmp_fork_call: after __kmp_allocate_team - team = %p\n", team));
2070 
2071     /* setup the new team */
2072     KMP_CHECK_UPDATE(team->t.t_master_tid, master_tid);
2073     KMP_CHECK_UPDATE(team->t.t_master_this_cons, master_this_cons);
2074     KMP_CHECK_UPDATE(team->t.t_ident, loc);
2075     KMP_CHECK_UPDATE(team->t.t_parent, parent_team);
2076     KMP_CHECK_UPDATE_SYNC(team->t.t_pkfn, microtask);
2077 #if OMPT_SUPPORT
2078     KMP_CHECK_UPDATE_SYNC(team->t.ompt_team_info.master_return_address,
2079                           return_address);
2080 #endif
2081     KMP_CHECK_UPDATE(team->t.t_invoke, invoker); // TODO move to root, maybe
2082     // TODO: parent_team->t.t_level == INT_MAX ???
2083     if (!master_th->th.th_teams_microtask || level > teams_level) {
2084       int new_level = parent_team->t.t_level + 1;
2085       KMP_CHECK_UPDATE(team->t.t_level, new_level);
2086       new_level = parent_team->t.t_active_level + 1;
2087       KMP_CHECK_UPDATE(team->t.t_active_level, new_level);
2088     } else {
2089       // AC: Do not increase parallel level at start of the teams construct
2090       int new_level = parent_team->t.t_level;
2091       KMP_CHECK_UPDATE(team->t.t_level, new_level);
2092       new_level = parent_team->t.t_active_level;
2093       KMP_CHECK_UPDATE(team->t.t_active_level, new_level);
2094     }
2095     kmp_r_sched_t new_sched = get__sched_2(parent_team, master_tid);
2096     // set primary thread's schedule as new run-time schedule
2097     KMP_CHECK_UPDATE(team->t.t_sched.sched, new_sched.sched);
2098 
2099     KMP_CHECK_UPDATE(team->t.t_cancel_request, cancel_noreq);
2100     KMP_CHECK_UPDATE(team->t.t_def_allocator, master_th->th.th_def_allocator);
2101 
2102     // Update the floating point rounding in the team if required.
2103     propagateFPControl(team);
2104 #if OMPD_SUPPORT
2105     if (ompd_state & OMPD_ENABLE_BP)
2106       ompd_bp_parallel_begin();
2107 #endif
2108 
2109     if (__kmp_tasking_mode != tskm_immediate_exec) {
2110       // Set the primary thread's task team to the team's task team. Unless this
2111       // is a hot team, it should be NULL.
2112       KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
2113                        parent_team->t.t_task_team[master_th->th.th_task_state]);
2114       KA_TRACE(20, ("__kmp_fork_call: Primary T#%d pushing task_team %p / team "
2115                     "%p, new task_team %p / team %p\n",
2116                     __kmp_gtid_from_thread(master_th),
2117                     master_th->th.th_task_team, parent_team,
2118                     team->t.t_task_team[master_th->th.th_task_state], team));
2119 
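      // th_task_state_memo_stack saves the primary thread's task_state across
      // nested forks; the value pushed here is popped and restored in
      // __kmp_join_call when this nested region finishes.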
2120       if (active_level || master_th->th.th_task_team) {
2121         // Save the primary thread's task_state on the memo stack
2122         KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack);
2123         if (master_th->th.th_task_state_top >=
2124             master_th->th.th_task_state_stack_sz) { // increase size
2125           kmp_uint32 new_size = 2 * master_th->th.th_task_state_stack_sz;
2126           kmp_uint8 *old_stack, *new_stack;
2127           kmp_uint32 i;
2128           new_stack = (kmp_uint8 *)__kmp_allocate(new_size);
2129           for (i = 0; i < master_th->th.th_task_state_stack_sz; ++i) {
2130             new_stack[i] = master_th->th.th_task_state_memo_stack[i];
2131           }
2132           for (i = master_th->th.th_task_state_stack_sz; i < new_size;
2133                ++i) { // zero-init rest of stack
2134             new_stack[i] = 0;
2135           }
2136           old_stack = master_th->th.th_task_state_memo_stack;
2137           master_th->th.th_task_state_memo_stack = new_stack;
2138           master_th->th.th_task_state_stack_sz = new_size;
2139           __kmp_free(old_stack);
2140         }
2141         // Store primary thread's task_state on stack
2142         master_th->th
2143             .th_task_state_memo_stack[master_th->th.th_task_state_top] =
2144             master_th->th.th_task_state;
2145         master_th->th.th_task_state_top++;
2146 #if KMP_NESTED_HOT_TEAMS
2147         if (master_th->th.th_hot_teams &&
2148             active_level < __kmp_hot_teams_max_level &&
2149             team == master_th->th.th_hot_teams[active_level].hot_team) {
2150           // Restore primary thread's nested state if nested hot team
2151           master_th->th.th_task_state =
2152               master_th->th
2153                   .th_task_state_memo_stack[master_th->th.th_task_state_top];
2154         } else {
2155 #endif
2156           master_th->th.th_task_state = 0;
2157 #if KMP_NESTED_HOT_TEAMS
2158         }
2159 #endif
2160       }
2161 #if !KMP_NESTED_HOT_TEAMS
2162       KMP_DEBUG_ASSERT((master_th->th.th_task_team == NULL) ||
2163                        (team == root->r.r_hot_team));
2164 #endif
2165     }
2166 
2167     KA_TRACE(
2168         20,
2169         ("__kmp_fork_call: T#%d(%d:%d)->(%d:0) created a team of %d threads\n",
2170          gtid, parent_team->t.t_id, team->t.t_master_tid, team->t.t_id,
2171          team->t.t_nproc));
2172     KMP_DEBUG_ASSERT(team != root->r.r_hot_team ||
2173                      (team->t.t_master_tid == 0 &&
2174                       (team->t.t_parent == root->r.r_root_team ||
2175                        team->t.t_parent->t.t_serialized)));
2176     KMP_MB();
2177 
2178     /* now, setup the arguments */
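    // KMP_CHECK_UPDATE stores only when the new value differs from the old
    // one, which avoids dirtying cache lines that worker threads may be
    // reading (same idea as the r_active check below).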
2179     argv = (void **)team->t.t_argv;
2180     if (ap) {
2181       for (i = argc - 1; i >= 0; --i) {
2182         void *new_argv = va_arg(kmp_va_deref(ap), void *);
2183         KMP_CHECK_UPDATE(*argv, new_argv);
2184         argv++;
2185       }
2186     } else {
2187       for (i = 0; i < argc; ++i) {
2188         // Get args from parent team for teams construct
2189         KMP_CHECK_UPDATE(argv[i], team->t.t_parent->t.t_argv[i]);
2190       }
2191     }
2192 
2193     /* now actually fork the threads */
2194     KMP_CHECK_UPDATE(team->t.t_master_active, master_active);
2195     if (!root->r.r_active) // Only do assignment if it prevents cache ping-pong
2196       root->r.r_active = TRUE;
2197 
2198     __kmp_fork_team_threads(root, team, master_th, gtid, !ap);
2199     __kmp_setup_icv_copy(team, nthreads,
2200                          &master_th->th.th_current_task->td_icvs, loc);
2201 
2202 #if OMPT_SUPPORT
2203     master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
2204 #endif
2205 
2206     __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2207 
2208 #if USE_ITT_BUILD
2209     if (team->t.t_active_level == 1 // only report frames at level 1
2210         && !master_th->th.th_teams_microtask) { // not in teams construct
2211 #if USE_ITT_NOTIFY
2212       if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
2213           (__kmp_forkjoin_frames_mode == 3 ||
2214            __kmp_forkjoin_frames_mode == 1)) {
2215         kmp_uint64 tmp_time = 0;
2216         if (__itt_get_timestamp_ptr)
2217           tmp_time = __itt_get_timestamp();
2218         // Internal fork - report frame begin
2219         master_th->th.th_frame_time = tmp_time;
2220         if (__kmp_forkjoin_frames_mode == 3)
2221           team->t.t_region_time = tmp_time;
2222       } else
2223 // only one notification scheme (either "submit" or "forking/joined", not both)
2224 #endif /* USE_ITT_NOTIFY */
2225         if ((__itt_frame_begin_v3_ptr || KMP_ITT_DEBUG) &&
2226             __kmp_forkjoin_frames && !__kmp_forkjoin_frames_mode) {
2227           // Mark start of "parallel" region for Intel(R) VTune(TM) analyzer.
2228           __kmp_itt_region_forking(gtid, team->t.t_nproc, 0);
2229         }
2230     }
2231 #endif /* USE_ITT_BUILD */
2232 
2233     /* now go on and do the work */
2234     KMP_DEBUG_ASSERT(team == __kmp_threads[gtid]->th.th_team);
2235     KMP_MB();
2236     KF_TRACE(10,
2237              ("__kmp_internal_fork : root=%p, team=%p, master_th=%p, gtid=%d\n",
2238               root, team, master_th, gtid));
2239 
2240 #if USE_ITT_BUILD
2241     if (__itt_stack_caller_create_ptr) {
2242       // create new stack stitching id before entering fork barrier
2243       if (!enter_teams) {
2244         KMP_DEBUG_ASSERT(team->t.t_stack_id == NULL);
2245         team->t.t_stack_id = __kmp_itt_stack_caller_create();
2246       } else if (parent_team->t.t_serialized) {
2247         // keep stack stitching id in the serialized parent_team;
2248         // current team will be used for parallel inside the teams;
2249         // if parent_team is active, then it already keeps stack stitching id
2250         // for the league of teams
2251         KMP_DEBUG_ASSERT(parent_team->t.t_stack_id == NULL);
2252         parent_team->t.t_stack_id = __kmp_itt_stack_caller_create();
2253       }
2254     }
2255 #endif /* USE_ITT_BUILD */
2256 
2257     // AC: skip __kmp_internal_fork at teams construct, let only primary
2258     // threads execute
2259     if (ap) {
2260       __kmp_internal_fork(loc, gtid, team);
2261       KF_TRACE(10, ("__kmp_internal_fork : after : root=%p, team=%p, "
2262                     "master_th=%p, gtid=%d\n",
2263                     root, team, master_th, gtid));
2264     }
2265 
2266     if (call_context == fork_context_gnu) {
2267       KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
2268       return TRUE;
2269     }
2270 
2271     /* Invoke microtask for PRIMARY thread */
2272     KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n", gtid,
2273                   team->t.t_id, team->t.t_pkfn));
2274   } // END of timer KMP_fork_call block
2275 
2276 #if KMP_STATS_ENABLED
2277   // If beginning a teams construct, then change thread state
2278   stats_state_e previous_state = KMP_GET_THREAD_STATE();
2279   if (!ap) {
2280     KMP_SET_THREAD_STATE(stats_state_e::TEAMS_REGION);
2281   }
2282 #endif
2283 
2284   if (!team->t.t_invoke(gtid)) {
2285     KMP_ASSERT2(0, "cannot invoke microtask for PRIMARY thread");
2286   }
2287 
2288 #if KMP_STATS_ENABLED
2289   // If was beginning of a teams construct, then reset thread state
2290   if (!ap) {
2291     KMP_SET_THREAD_STATE(previous_state);
2292   }
2293 #endif
2294 
2295   KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n", gtid,
2296                 team->t.t_id, team->t.t_pkfn));
2297   KMP_MB(); /* Flush all pending memory write invalidates.  */
2298 
2299   KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
2300 #if OMPT_SUPPORT
2301   if (ompt_enabled.enabled) {
2302     master_th->th.ompt_thread_info.state = ompt_state_overhead;
2303   }
2304 #endif
2305 
2306   return TRUE;
2307 }
2308 
2309 #if OMPT_SUPPORT
2310 static inline void __kmp_join_restore_state(kmp_info_t *thread,
2311                                             kmp_team_t *team) {
2312   // restore state outside the region
2313   thread->th.ompt_thread_info.state =
2314       ((team->t.t_serialized) ? ompt_state_work_serial
2315                               : ompt_state_work_parallel);
2316 }
2317 
2318 static inline void __kmp_join_ompt(int gtid, kmp_info_t *thread,
2319                                    kmp_team_t *team, ompt_data_t *parallel_data,
2320                                    int flags, void *codeptr) {
2321   ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2322   if (ompt_enabled.ompt_callback_parallel_end) {
2323     ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
2324         parallel_data, &(task_info->task_data), flags, codeptr);
2325   }
2326 
2327   task_info->frame.enter_frame = ompt_data_none;
2328   __kmp_join_restore_state(thread, team);
2329 }
2330 #endif
2331 
2332 void __kmp_join_call(ident_t *loc, int gtid
2333 #if OMPT_SUPPORT
2334                      ,
2335                      enum fork_context_e fork_context
2336 #endif
2337                      ,
2338                      int exit_teams) {
2339   KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_join_call);
2340   kmp_team_t *team;
2341   kmp_team_t *parent_team;
2342   kmp_info_t *master_th;
2343   kmp_root_t *root;
2344   int master_active;
2345 
2346   KA_TRACE(20, ("__kmp_join_call: enter T#%d\n", gtid));
2347 
2348   /* setup current data */
2349   master_th = __kmp_threads[gtid];
2350   root = master_th->th.th_root;
2351   team = master_th->th.th_team;
2352   parent_team = team->t.t_parent;
2353 
2354   master_th->th.th_ident = loc;
2355 
2356 #if OMPT_SUPPORT
2357   void *team_microtask = (void *)team->t.t_pkfn;
2358   // For the GOMP interface with a serialized parallel, we need
2359   // __kmpc_end_serialized_parallel to call the hooks for the OMPT
2360   // end-implicit-task and end-parallel events.
2361   if (ompt_enabled.enabled &&
2362       !(team->t.t_serialized && fork_context == fork_context_gnu)) {
2363     master_th->th.ompt_thread_info.state = ompt_state_overhead;
2364   }
2365 #endif
2366 
2367 #if KMP_DEBUG
2368   if (__kmp_tasking_mode != tskm_immediate_exec && !exit_teams) {
2369     KA_TRACE(20, ("__kmp_join_call: T#%d, old team = %p old task_team = %p, "
2370                   "th_task_team = %p\n",
2371                   __kmp_gtid_from_thread(master_th), team,
2372                   team->t.t_task_team[master_th->th.th_task_state],
2373                   master_th->th.th_task_team));
2374     KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
2375                      team->t.t_task_team[master_th->th.th_task_state]);
2376   }
2377 #endif
2378 
2379   if (team->t.t_serialized) {
2380     if (master_th->th.th_teams_microtask) {
2381       // We are in teams construct
2382       int level = team->t.t_level;
2383       int tlevel = master_th->th.th_teams_level;
2384       if (level == tlevel) {
2385         // AC: we haven't incremented it earlier at start of teams construct,
2386         //     so do it here - at the end of teams construct
2387         team->t.t_level++;
2388       } else if (level == tlevel + 1) {
2389         // AC: we are exiting parallel inside teams, need to increment
2390         // serialization in order to restore it in the next call to
2391         // __kmpc_end_serialized_parallel
2392         team->t.t_serialized++;
2393       }
2394     }
2395     __kmpc_end_serialized_parallel(loc, gtid);
2396 
2397 #if OMPT_SUPPORT
2398     if (ompt_enabled.enabled) {
2399       __kmp_join_restore_state(master_th, parent_team);
2400     }
2401 #endif
2402 
2403     return;
2404   }
2405 
2406   master_active = team->t.t_master_active;
2407 
2408   if (!exit_teams) {
2409     // AC: No barrier for internal teams at exit from teams construct.
2410     //     But there is barrier for external team (league).
2411     __kmp_internal_join(loc, gtid, team);
2412 #if USE_ITT_BUILD
2413     if (__itt_stack_caller_create_ptr) {
2414       KMP_DEBUG_ASSERT(team->t.t_stack_id != NULL);
2415       // destroy the stack stitching id after join barrier
2416       __kmp_itt_stack_caller_destroy((__itt_caller)team->t.t_stack_id);
2417       team->t.t_stack_id = NULL;
2418     }
2419 #endif
2420   } else {
2421     master_th->th.th_task_state =
2422         0; // AC: no tasking in teams (outside of any parallel region)
2423 #if USE_ITT_BUILD
2424     if (__itt_stack_caller_create_ptr && parent_team->t.t_serialized) {
2425       KMP_DEBUG_ASSERT(parent_team->t.t_stack_id != NULL);
2426       // destroy the stack stitching id on exit from the teams construct
2427       // if parent_team is active, then the id will be destroyed later on
2428       // by the master of the league of teams
2429       __kmp_itt_stack_caller_destroy((__itt_caller)parent_team->t.t_stack_id);
2430       parent_team->t.t_stack_id = NULL;
2431     }
2432 #endif
2433 
2434     if (team->t.t_nproc > 1 &&
2435         __kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
2436       team->t.b->update_num_threads(team->t.t_nproc);
2437       __kmp_add_threads_to_team(team, team->t.t_nproc);
2438     }
2439   }
2440 
2441   KMP_MB();
2442 
2443 #if OMPT_SUPPORT
2444   ompt_data_t *parallel_data = &(team->t.ompt_team_info.parallel_data);
2445   void *codeptr = team->t.ompt_team_info.master_return_address;
2446 #endif
2447 
2448 #if USE_ITT_BUILD
2449   // Mark end of "parallel" region for Intel(R) VTune(TM) analyzer.
2450   if (team->t.t_active_level == 1 &&
2451       (!master_th->th.th_teams_microtask || /* not in teams construct */
2452        master_th->th.th_teams_size.nteams == 1)) {
2453     master_th->th.th_ident = loc;
2454     // only one notification scheme (either "submit" or "forking/joined", not
2455     // both)
2456     if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
2457         __kmp_forkjoin_frames_mode == 3)
2458       __kmp_itt_frame_submit(gtid, team->t.t_region_time,
2459                              master_th->th.th_frame_time, 0, loc,
2460                              master_th->th.th_team_nproc, 1);
2461     else if ((__itt_frame_end_v3_ptr || KMP_ITT_DEBUG) &&
2462              !__kmp_forkjoin_frames_mode && __kmp_forkjoin_frames)
2463       __kmp_itt_region_joined(gtid);
2464   } // active_level == 1
2465 #endif /* USE_ITT_BUILD */
2466 
2467 #if KMP_AFFINITY_SUPPORTED
2468   if (!exit_teams) {
2469     // Restore master thread's partition.
2470     master_th->th.th_first_place = team->t.t_first_place;
2471     master_th->th.th_last_place = team->t.t_last_place;
2472   }
2473 #endif // KMP_AFFINITY_SUPPORTED
2474 
2475   if (master_th->th.th_teams_microtask && !exit_teams &&
2476       team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
2477       team->t.t_level == master_th->th.th_teams_level + 1) {
2478 // AC: We need to leave the team structure intact at the end of parallel
2479 // inside the teams construct, so that at the next parallel same (hot) team
2480 // works, only adjust nesting levels
2481 #if OMPT_SUPPORT
2482     ompt_data_t ompt_parallel_data = ompt_data_none;
2483     if (ompt_enabled.enabled) {
2484       ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2485       if (ompt_enabled.ompt_callback_implicit_task) {
2486         int ompt_team_size = team->t.t_nproc;
2487         ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
2488             ompt_scope_end, NULL, &(task_info->task_data), ompt_team_size,
2489             OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
2490       }
2491       task_info->frame.exit_frame = ompt_data_none;
2492       task_info->task_data = ompt_data_none;
2493       ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
2494       __ompt_lw_taskteam_unlink(master_th);
2495     }
2496 #endif
2497     /* Decrement our nested depth level */
2498     team->t.t_level--;
2499     team->t.t_active_level--;
2500     KMP_ATOMIC_DEC(&root->r.r_in_parallel);
2501 
2502     // Restore number of threads in the team if needed. This code relies on
2503     // the proper adjustment of th_teams_size.nth after the fork in
2504     // __kmp_teams_master on each teams primary thread in the case that
2505     // __kmp_reserve_threads reduced it.
2506     if (master_th->th.th_team_nproc < master_th->th.th_teams_size.nth) {
2507       int old_num = master_th->th.th_team_nproc;
2508       int new_num = master_th->th.th_teams_size.nth;
2509       kmp_info_t **other_threads = team->t.t_threads;
2510       team->t.t_nproc = new_num;
2511       for (int i = 0; i < old_num; ++i) {
2512         other_threads[i]->th.th_team_nproc = new_num;
2513       }
2514       // Adjust the states of the unused threads of the team
2515       for (int i = old_num; i < new_num; ++i) {
2516         // Re-initialize thread's barrier data.
2517         KMP_DEBUG_ASSERT(other_threads[i]);
2518         kmp_balign_t *balign = other_threads[i]->th.th_bar;
2519         for (int b = 0; b < bs_last_barrier; ++b) {
2520           balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
2521           KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
2522 #if USE_DEBUGGER
2523           balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
2524 #endif
2525         }
2526         if (__kmp_tasking_mode != tskm_immediate_exec) {
2527           // Synchronize thread's task state
2528           other_threads[i]->th.th_task_state = master_th->th.th_task_state;
2529         }
2530       }
2531     }
2532 
2533 #if OMPT_SUPPORT
2534     if (ompt_enabled.enabled) {
2535       __kmp_join_ompt(gtid, master_th, parent_team, &ompt_parallel_data,
2536                       OMPT_INVOKER(fork_context) | ompt_parallel_team, codeptr);
2537     }
2538 #endif
2539 
2540     return;
2541   }
2542 
2543   /* do cleanup and restore the parent team */
2544   master_th->th.th_info.ds.ds_tid = team->t.t_master_tid;
2545   master_th->th.th_local.this_construct = team->t.t_master_this_cons;
2546 
2547   master_th->th.th_dispatch = &parent_team->t.t_dispatch[team->t.t_master_tid];
2548 
2549   /* jc: The following lock has instructions with REL and ACQ semantics,
2550      separating the parallel user code called in this parallel region
2551      from the serial user code called after this function returns. */
2552   __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
2553 
2554   if (!master_th->th.th_teams_microtask ||
2555       team->t.t_level > master_th->th.th_teams_level) {
2556     /* Decrement our nested depth level */
2557     KMP_ATOMIC_DEC(&root->r.r_in_parallel);
2558   }
2559   KMP_DEBUG_ASSERT(root->r.r_in_parallel >= 0);
2560 
2561 #if OMPT_SUPPORT
2562   if (ompt_enabled.enabled) {
2563     ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2564     if (ompt_enabled.ompt_callback_implicit_task) {
2565       int flags = (team_microtask == (void *)__kmp_teams_master)
2566                       ? ompt_task_initial
2567                       : ompt_task_implicit;
2568       int ompt_team_size = (flags == ompt_task_initial) ? 0 : team->t.t_nproc;
2569       ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
2570           ompt_scope_end, NULL, &(task_info->task_data), ompt_team_size,
2571           OMPT_CUR_TASK_INFO(master_th)->thread_num, flags);
2572     }
2573     task_info->frame.exit_frame = ompt_data_none;
2574     task_info->task_data = ompt_data_none;
2575   }
2576 #endif
2577 
2578   KF_TRACE(10, ("__kmp_join_call1: T#%d, this_thread=%p team=%p\n", 0,
2579                 master_th, team));
2580   __kmp_pop_current_task_from_thread(master_th);
2581 
2582   master_th->th.th_def_allocator = team->t.t_def_allocator;
2583 
2584 #if OMPD_SUPPORT
2585   if (ompd_state & OMPD_ENABLE_BP)
2586     ompd_bp_parallel_end();
2587 #endif
2588   updateHWFPControl(team);
2589 
2590   if (root->r.r_active != master_active)
2591     root->r.r_active = master_active;
2592 
2593   __kmp_free_team(root, team USE_NESTED_HOT_ARG(
2594                             master_th)); // this will free worker threads
2595 
2596   /* This race was fun to find. Make sure the following stays in the critical
2597      region; otherwise assertions may fail occasionally since the old team may
2598      be reallocated and the hierarchy would appear inconsistent. It is actually
2599      safe to run and will not cause any bugs, but it will cause those assertion
2600      failures. It is only one deref & assign, so it may as well stay here. */
2601   master_th->th.th_team = parent_team;
2602   master_th->th.th_team_nproc = parent_team->t.t_nproc;
2603   master_th->th.th_team_master = parent_team->t.t_threads[0];
2604   master_th->th.th_team_serialized = parent_team->t.t_serialized;
2605 
2606   /* restore serialized team, if need be */
2607   if (parent_team->t.t_serialized &&
2608       parent_team != master_th->th.th_serial_team &&
2609       parent_team != root->r.r_root_team) {
2610     __kmp_free_team(root,
2611                     master_th->th.th_serial_team USE_NESTED_HOT_ARG(NULL));
2612     master_th->th.th_serial_team = parent_team;
2613   }
2614 
2615   if (__kmp_tasking_mode != tskm_immediate_exec) {
2616     if (master_th->th.th_task_state_top >
2617         0) { // Restore task state from memo stack
2618       KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack);
2619       // Remember primary thread's state if we re-use this nested hot team
2620       master_th->th.th_task_state_memo_stack[master_th->th.th_task_state_top] =
2621           master_th->th.th_task_state;
2622       --master_th->th.th_task_state_top; // pop
2623       // Now restore state at this level
2624       master_th->th.th_task_state =
2625           master_th->th
2626               .th_task_state_memo_stack[master_th->th.th_task_state_top];
2627     }
2628     // Copy the task team from the parent team to the primary thread
2629     master_th->th.th_task_team =
2630         parent_team->t.t_task_team[master_th->th.th_task_state];
2631     KA_TRACE(20,
2632              ("__kmp_join_call: Primary T#%d restoring task_team %p, team %p\n",
2633               __kmp_gtid_from_thread(master_th), master_th->th.th_task_team,
2634               parent_team));
2635   }
2636 
2637   // TODO: GEH - cannot do this assertion because root thread not set up as
2638   // executing
2639   // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 0 );
2640   master_th->th.th_current_task->td_flags.executing = 1;
2641 
2642   __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2643 
2644 #if KMP_AFFINITY_SUPPORTED
2645   if (master_th->th.th_team->t.t_level == 0 && __kmp_affin_reset) {
2646     __kmp_reset_root_init_mask(gtid);
2647   }
2648 #endif
2649 #if OMPT_SUPPORT
2650   int flags =
2651       OMPT_INVOKER(fork_context) |
2652       ((team_microtask == (void *)__kmp_teams_master) ? ompt_parallel_league
2653                                                       : ompt_parallel_team);
2654   if (ompt_enabled.enabled) {
2655     __kmp_join_ompt(gtid, master_th, parent_team, parallel_data, flags,
2656                     codeptr);
2657   }
2658 #endif
2659 
2660   KMP_MB();
2661   KA_TRACE(20, ("__kmp_join_call: exit T#%d\n", gtid));
2662 }
2663 
2664 /* Check whether we should push an internal control record onto the
2665    serial team stack.  If so, do it.  */
2666 void __kmp_save_internal_controls(kmp_info_t *thread) {
2667 
2668   if (thread->th.th_team != thread->th.th_serial_team) {
2669     return;
2670   }
2671   if (thread->th.th_team->t.t_serialized > 1) {
2672     int push = 0;
2673 
2674     if (thread->th.th_team->t.t_control_stack_top == NULL) {
2675       push = 1;
2676     } else {
2677       if (thread->th.th_team->t.t_control_stack_top->serial_nesting_level !=
2678           thread->th.th_team->t.t_serialized) {
2679         push = 1;
2680       }
2681     }
2682     if (push) { /* push a record on the serial team's stack */
2683       kmp_internal_control_t *control =
2684           (kmp_internal_control_t *)__kmp_allocate(
2685               sizeof(kmp_internal_control_t));
2686 
2687       copy_icvs(control, &thread->th.th_current_task->td_icvs);
2688 
2689       control->serial_nesting_level = thread->th.th_team->t.t_serialized;
2690 
2691       control->next = thread->th.th_team->t.t_control_stack_top;
2692       thread->th.th_team->t.t_control_stack_top = control;
2693     }
2694   }
2695 }
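// For reference, the ICV setters below all call __kmp_save_internal_controls()
// right before modifying an ICV, e.g. in __kmp_set_num_threads():
//   __kmp_save_internal_controls(thread);
//   set__nproc(thread, new_nth);
// so the pushed record captures the ICVs in effect at the current serial
// nesting level before they are changed.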
2696 
2697 /* Changes set_nproc */
2698 void __kmp_set_num_threads(int new_nth, int gtid) {
2699   kmp_info_t *thread;
2700   kmp_root_t *root;
2701 
2702   KF_TRACE(10, ("__kmp_set_num_threads: new __kmp_nth = %d\n", new_nth));
2703   KMP_DEBUG_ASSERT(__kmp_init_serial);
2704 
2705   if (new_nth < 1)
2706     new_nth = 1;
2707   else if (new_nth > __kmp_max_nth)
2708     new_nth = __kmp_max_nth;
2709 
2710   KMP_COUNT_VALUE(OMP_set_numthreads, new_nth);
2711   thread = __kmp_threads[gtid];
2712   if (thread->th.th_current_task->td_icvs.nproc == new_nth)
2713     return; // nothing to do
2714 
2715   __kmp_save_internal_controls(thread);
2716 
2717   set__nproc(thread, new_nth);
2718 
2719   // If this omp_set_num_threads() call will cause the hot team size to be
2720   // reduced (in the absence of a num_threads clause), then reduce it now,
2721   // rather than waiting for the next parallel region.
2722   root = thread->th.th_root;
2723   if (__kmp_init_parallel && (!root->r.r_active) &&
2724       (root->r.r_hot_team->t.t_nproc > new_nth)
2725 #if KMP_NESTED_HOT_TEAMS
2726       && __kmp_hot_teams_max_level && !__kmp_hot_teams_mode
2727 #endif
2728   ) {
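    // Shrink sequence: resize the distributed barrier if that release pattern
    // is in use, release the threads at indices [new_nth, t_nproc), shrink
    // t_nproc, update the nested hot team bookkeeping, and refresh
    // th_team_nproc in the threads that remain.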
2729     kmp_team_t *hot_team = root->r.r_hot_team;
2730     int f;
2731 
2732     __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
2733 
2734     if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
2735       __kmp_resize_dist_barrier(hot_team, hot_team->t.t_nproc, new_nth);
2736     }
2737     // Release the extra threads we don't need any more.
2738     for (f = new_nth; f < hot_team->t.t_nproc; f++) {
2739       KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL);
2740       if (__kmp_tasking_mode != tskm_immediate_exec) {
2741         // When decreasing team size, threads no longer in the team should unref
2742         // task team.
2743         hot_team->t.t_threads[f]->th.th_task_team = NULL;
2744       }
2745       __kmp_free_thread(hot_team->t.t_threads[f]);
2746       hot_team->t.t_threads[f] = NULL;
2747     }
2748     hot_team->t.t_nproc = new_nth;
2749 #if KMP_NESTED_HOT_TEAMS
2750     if (thread->th.th_hot_teams) {
2751       KMP_DEBUG_ASSERT(hot_team == thread->th.th_hot_teams[0].hot_team);
2752       thread->th.th_hot_teams[0].hot_team_nth = new_nth;
2753     }
2754 #endif
2755 
2756     if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
2757       hot_team->t.b->update_num_threads(new_nth);
2758       __kmp_add_threads_to_team(hot_team, new_nth);
2759     }
2760 
2761     __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2762 
2763     // Update the t_nproc field in the threads that are still active.
2764     for (f = 0; f < new_nth; f++) {
2765       KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL);
2766       hot_team->t.t_threads[f]->th.th_team_nproc = new_nth;
2767     }
2768     // Special flag: the hot team size was changed by an omp_set_num_threads() call
2769     hot_team->t.t_size_changed = -1;
2770   }
2771 }
2772 
2773 /* Changes max_active_levels */
2774 void __kmp_set_max_active_levels(int gtid, int max_active_levels) {
2775   kmp_info_t *thread;
2776 
2777   KF_TRACE(10, ("__kmp_set_max_active_levels: new max_active_levels for thread "
2778                 "%d = (%d)\n",
2779                 gtid, max_active_levels));
2780   KMP_DEBUG_ASSERT(__kmp_init_serial);
2781 
2782   // validate max_active_levels
2783   if (max_active_levels < 0) {
2784     KMP_WARNING(ActiveLevelsNegative, max_active_levels);
2785     // We ignore this call if the user has specified a negative value.
2786     // The current setting won't be changed. The last valid setting will be
2787     // used. A warning will be issued (if warnings are allowed as controlled by
2788     // the KMP_WARNINGS env var).
2789     KF_TRACE(10, ("__kmp_set_max_active_levels: the call is ignored: new "
2790                   "max_active_levels for thread %d = (%d)\n",
2791                   gtid, max_active_levels));
2792     return;
2793   }
2794   if (max_active_levels <= KMP_MAX_ACTIVE_LEVELS_LIMIT) {
2795     // OK: max_active_levels is within the valid range
2796     // [ 0; KMP_MAX_ACTIVE_LEVELS_LIMIT ].
2797     // We allow a zero value. (implementation defined behavior)
2798   } else {
2799     KMP_WARNING(ActiveLevelsExceedLimit, max_active_levels,
2800                 KMP_MAX_ACTIVE_LEVELS_LIMIT);
2801     max_active_levels = KMP_MAX_ACTIVE_LEVELS_LIMIT;
2802     // Current upper limit is MAX_INT. (implementation defined behavior)
2803     // If the input exceeds the upper limit, we correct the input to be the
2804     // upper limit. (implementation defined behavior)
2805     // Actually, the flow should never get here while the limit is MAX_INT.
2806   }
2807   KF_TRACE(10, ("__kmp_set_max_active_levels: after validation: new "
2808                 "max_active_levels for thread %d = (%d)\n",
2809                 gtid, max_active_levels));
2810 
2811   thread = __kmp_threads[gtid];
2812 
2813   __kmp_save_internal_controls(thread);
2814 
2815   set__max_active_levels(thread, max_active_levels);
2816 }
2817 
2818 /* Gets max_active_levels */
2819 int __kmp_get_max_active_levels(int gtid) {
2820   kmp_info_t *thread;
2821 
2822   KF_TRACE(10, ("__kmp_get_max_active_levels: thread %d\n", gtid));
2823   KMP_DEBUG_ASSERT(__kmp_init_serial);
2824 
2825   thread = __kmp_threads[gtid];
2826   KMP_DEBUG_ASSERT(thread->th.th_current_task);
2827   KF_TRACE(10, ("__kmp_get_max_active_levels: thread %d, curtask=%p, "
2828                 "curtask_maxaclevel=%d\n",
2829                 gtid, thread->th.th_current_task,
2830                 thread->th.th_current_task->td_icvs.max_active_levels));
2831   return thread->th.th_current_task->td_icvs.max_active_levels;
2832 }
2833 
2834 // nteams-var per-device ICV
2835 void __kmp_set_num_teams(int num_teams) {
2836   if (num_teams > 0)
2837     __kmp_nteams = num_teams;
2838 }
2839 int __kmp_get_max_teams(void) { return __kmp_nteams; }
2840 // teams-thread-limit-var per-device ICV
2841 void __kmp_set_teams_thread_limit(int limit) {
2842   if (limit > 0)
2843     __kmp_teams_thread_limit = limit;
2844 }
2845 int __kmp_get_teams_thread_limit(void) { return __kmp_teams_thread_limit; }
2846 
2847 KMP_BUILD_ASSERT(sizeof(kmp_sched_t) == sizeof(int));
2848 KMP_BUILD_ASSERT(sizeof(enum sched_type) == sizeof(int));
2849 
2850 /* Changes def_sched_var ICV values (run-time schedule kind and chunk) */
2851 void __kmp_set_schedule(int gtid, kmp_sched_t kind, int chunk) {
2852   kmp_info_t *thread;
2853   kmp_sched_t orig_kind;
2854   //    kmp_team_t *team;
2855 
2856   KF_TRACE(10, ("__kmp_set_schedule: new schedule for thread %d = (%d, %d)\n",
2857                 gtid, (int)kind, chunk));
2858   KMP_DEBUG_ASSERT(__kmp_init_serial);
2859 
2860   // Check if the kind parameter is valid, correct if needed.
2861   // Valid parameters should fit in one of two intervals - standard or extended:
2862   //       <lower>, <valid>, <upper_std>, <lower_ext>, <valid>, <upper>
2863   // 2008-01-25: 0,  1 - 4,       5,         100,     101 - 102, 103
2864   orig_kind = kind;
2865   kind = __kmp_sched_without_mods(kind);
2866 
2867   if (kind <= kmp_sched_lower || kind >= kmp_sched_upper ||
2868       (kind <= kmp_sched_lower_ext && kind >= kmp_sched_upper_std)) {
2869     // TODO: Hint needs attention in case we change the default schedule.
2870     __kmp_msg(kmp_ms_warning, KMP_MSG(ScheduleKindOutOfRange, kind),
2871               KMP_HNT(DefaultScheduleKindUsed, "static, no chunk"),
2872               __kmp_msg_null);
2873     kind = kmp_sched_default;
2874     chunk = 0; // ignore chunk value in case of bad kind
2875   }
2876 
2877   thread = __kmp_threads[gtid];
2878 
2879   __kmp_save_internal_controls(thread);
2880 
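  // Map the user-visible kmp_sched_t kind onto an internal sched_type via
  // __kmp_sch_map.  With the ranges listed in the comment above (standard
  // kinds 1-4, extended kinds 101-102), the standard kinds land at indices
  // 0-3 and the extended kinds immediately after them, which is what the two
  // index expressions below compute.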
2881   if (kind < kmp_sched_upper_std) {
2882     if (kind == kmp_sched_static && chunk < KMP_DEFAULT_CHUNK) {
2883       // differentiate static chunked vs. unchunked: chunk should be invalid
2884       // to indicate an unchunked schedule (which is the default)
2885       thread->th.th_current_task->td_icvs.sched.r_sched_type = kmp_sch_static;
2886     } else {
2887       thread->th.th_current_task->td_icvs.sched.r_sched_type =
2888           __kmp_sch_map[kind - kmp_sched_lower - 1];
2889     }
2890   } else {
2891     //    __kmp_sch_map[ kind - kmp_sched_lower_ext + kmp_sched_upper_std -
2892     //    kmp_sched_lower - 2 ];
2893     thread->th.th_current_task->td_icvs.sched.r_sched_type =
2894         __kmp_sch_map[kind - kmp_sched_lower_ext + kmp_sched_upper_std -
2895                       kmp_sched_lower - 2];
2896   }
2897   __kmp_sched_apply_mods_intkind(
2898       orig_kind, &(thread->th.th_current_task->td_icvs.sched.r_sched_type));
2899   if (kind == kmp_sched_auto || chunk < 1) {
2900     // ignore parameter chunk for schedule auto
2901     thread->th.th_current_task->td_icvs.sched.chunk = KMP_DEFAULT_CHUNK;
2902   } else {
2903     thread->th.th_current_task->td_icvs.sched.chunk = chunk;
2904   }
2905 }
2906 
2907 /* Gets def_sched_var ICV values */
2908 void __kmp_get_schedule(int gtid, kmp_sched_t *kind, int *chunk) {
2909   kmp_info_t *thread;
2910   enum sched_type th_type;
2911 
2912   KF_TRACE(10, ("__kmp_get_schedule: thread %d\n", gtid));
2913   KMP_DEBUG_ASSERT(__kmp_init_serial);
2914 
2915   thread = __kmp_threads[gtid];
2916 
2917   th_type = thread->th.th_current_task->td_icvs.sched.r_sched_type;
2918   switch (SCHEDULE_WITHOUT_MODIFIERS(th_type)) {
2919   case kmp_sch_static:
2920   case kmp_sch_static_greedy:
2921   case kmp_sch_static_balanced:
2922     *kind = kmp_sched_static;
2923     __kmp_sched_apply_mods_stdkind(kind, th_type);
2924     *chunk = 0; // chunk was not set, try to show this fact via zero value
2925     return;
2926   case kmp_sch_static_chunked:
2927     *kind = kmp_sched_static;
2928     break;
2929   case kmp_sch_dynamic_chunked:
2930     *kind = kmp_sched_dynamic;
2931     break;
2932   case kmp_sch_guided_chunked:
2933   case kmp_sch_guided_iterative_chunked:
2934   case kmp_sch_guided_analytical_chunked:
2935     *kind = kmp_sched_guided;
2936     break;
2937   case kmp_sch_auto:
2938     *kind = kmp_sched_auto;
2939     break;
2940   case kmp_sch_trapezoidal:
2941     *kind = kmp_sched_trapezoidal;
2942     break;
2943 #if KMP_STATIC_STEAL_ENABLED
2944   case kmp_sch_static_steal:
2945     *kind = kmp_sched_static_steal;
2946     break;
2947 #endif
2948   default:
2949     KMP_FATAL(UnknownSchedulingType, th_type);
2950   }
2951 
2952   __kmp_sched_apply_mods_stdkind(kind, th_type);
2953   *chunk = thread->th.th_current_task->td_icvs.sched.chunk;
2954 }
2955 
2956 int __kmp_get_ancestor_thread_num(int gtid, int level) {
2957 
2958   int ii, dd;
2959   kmp_team_t *team;
2960   kmp_info_t *thr;
2961 
2962   KF_TRACE(10, ("__kmp_get_ancestor_thread_num: thread %d %d\n", gtid, level));
2963   KMP_DEBUG_ASSERT(__kmp_init_serial);
2964 
2965   // validate level
2966   if (level == 0)
2967     return 0;
2968   if (level < 0)
2969     return -1;
2970   thr = __kmp_threads[gtid];
2971   team = thr->th.th_team;
2972   ii = team->t.t_level;
2973   if (level > ii)
2974     return -1;
2975 
2976   if (thr->th.th_teams_microtask) {
2977     // AC: we are in a teams region where multiple nested teams have the same level
2978     int tlevel = thr->th.th_teams_level; // the level of the teams construct
2979     if (level <=
2980         tlevel) { // otherwise the usual algorithm works (will not touch the teams)
2981       KMP_DEBUG_ASSERT(ii >= tlevel);
2982       // AC: Since we need to pass through the teams league, we artificially
2983       // increase ii
2984       if (ii == tlevel) {
2985         ii += 2; // three teams have the same level
2986       } else {
2987         ii++; // two teams have the same level
2988       }
2989     }
2990   }
2991 
2992   if (ii == level)
2993     return __kmp_tid_from_gtid(gtid);
2994 
2995   dd = team->t.t_serialized;
2996   level++;
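  // Walk up the team hierarchy: within each team first consume its serialized
  // nesting levels (dd), then step to the parent team, until ii reaches the
  // requested level.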
2997   while (ii > level) {
2998     for (dd = team->t.t_serialized; (dd > 0) && (ii > level); dd--, ii--) {
2999     }
3000     if ((team->t.t_serialized) && (!dd)) {
3001       team = team->t.t_parent;
3002       continue;
3003     }
3004     if (ii > level) {
3005       team = team->t.t_parent;
3006       dd = team->t.t_serialized;
3007       ii--;
3008     }
3009   }
3010 
3011   return (dd > 1) ? (0) : (team->t.t_master_tid);
3012 }
3013 
3014 int __kmp_get_team_size(int gtid, int level) {
3015 
3016   int ii, dd;
3017   kmp_team_t *team;
3018   kmp_info_t *thr;
3019 
3020   KF_TRACE(10, ("__kmp_get_team_size: thread %d %d\n", gtid, level));
3021   KMP_DEBUG_ASSERT(__kmp_init_serial);
3022 
3023   // validate level
3024   if (level == 0)
3025     return 1;
3026   if (level < 0)
3027     return -1;
3028   thr = __kmp_threads[gtid];
3029   team = thr->th.th_team;
3030   ii = team->t.t_level;
3031   if (level > ii)
3032     return -1;
3033 
3034   if (thr->th.th_teams_microtask) {
3035     // AC: we are in a teams region where multiple nested teams have the same level
3036     int tlevel = thr->th.th_teams_level; // the level of the teams construct
3037     if (level <=
3038         tlevel) { // otherwise the usual algorithm works (will not touch the teams)
3039       KMP_DEBUG_ASSERT(ii >= tlevel);
3040       // AC: Since we need to pass through the teams league, we artificially
3041       // increase ii
3042       if (ii == tlevel) {
3043         ii += 2; // three teams have the same level
3044       } else {
3045         ii++; // two teams have the same level
3046       }
3047     }
3048   }
3049 
3050   while (ii > level) {
3051     for (dd = team->t.t_serialized; (dd > 0) && (ii > level); dd--, ii--) {
3052     }
3053     if (team->t.t_serialized && (!dd)) {
3054       team = team->t.t_parent;
3055       continue;
3056     }
3057     if (ii > level) {
3058       team = team->t.t_parent;
3059       ii--;
3060     }
3061   }
3062 
3063   return team->t.t_nproc;
3064 }
3065 
3066 kmp_r_sched_t __kmp_get_schedule_global() {
3067   // This routine was created because the pairs (__kmp_sched, __kmp_chunk) and
3068   // (__kmp_static, __kmp_guided) may be changed by kmp_set_defaults
3069   // independently, so the updated schedule can be obtained here.
3070 
3071   kmp_r_sched_t r_sched;
3072 
3073   // Create the schedule from 4 globals: __kmp_sched, __kmp_chunk, __kmp_static,
3074   // __kmp_guided. __kmp_sched should keep its original value, so that the user
3075   // can set KMP_SCHEDULE multiple times and thus have different run-time
3076   // schedules in different roots (even in OMP 2.5).
3077   enum sched_type s = SCHEDULE_WITHOUT_MODIFIERS(__kmp_sched);
3078   enum sched_type sched_modifiers = SCHEDULE_GET_MODIFIERS(__kmp_sched);
3079   if (s == kmp_sch_static) {
3080     // replace STATIC with more detailed schedule (balanced or greedy)
3081     r_sched.r_sched_type = __kmp_static;
3082   } else if (s == kmp_sch_guided_chunked) {
3083     // replace GUIDED with more detailed schedule (iterative or analytical)
3084     r_sched.r_sched_type = __kmp_guided;
3085   } else { // (STATIC_CHUNKED), or (DYNAMIC_CHUNKED), or other
3086     r_sched.r_sched_type = __kmp_sched;
3087   }
3088   SCHEDULE_SET_MODIFIERS(r_sched.r_sched_type, sched_modifiers);
3089 
3090   if (__kmp_chunk < KMP_DEFAULT_CHUNK) {
3091     // __kmp_chunk may be wrong here (if it was never set)
3092     r_sched.chunk = KMP_DEFAULT_CHUNK;
3093   } else {
3094     r_sched.chunk = __kmp_chunk;
3095   }
3096 
3097   return r_sched;
3098 }
3099 
3100 /* Allocate (realloc == FALSE) or reallocate (realloc == TRUE)
3101    at least argc *t_argv entries for the requested team. */
3102 static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team, int realloc) {
3103 
3104   KMP_DEBUG_ASSERT(team);
3105   if (!realloc || argc > team->t.t_max_argc) {
3106 
3107     KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: needed entries=%d, "
3108                    "current entries=%d\n",
3109                    team->t.t_id, argc, (realloc) ? team->t.t_max_argc : 0));
3110     /* if previously allocated heap space for args, free them */
3111     if (realloc && team->t.t_argv != &team->t.t_inline_argv[0])
3112       __kmp_free((void *)team->t.t_argv);
3113 
3114     if (argc <= KMP_INLINE_ARGV_ENTRIES) {
3115       /* use unused space in the cache line for arguments */
3116       team->t.t_max_argc = KMP_INLINE_ARGV_ENTRIES;
3117       KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: inline allocate %d "
3118                      "argv entries\n",
3119                      team->t.t_id, team->t.t_max_argc));
3120       team->t.t_argv = &team->t.t_inline_argv[0];
3121       if (__kmp_storage_map) {
3122         __kmp_print_storage_map_gtid(
3123             -1, &team->t.t_inline_argv[0],
3124             &team->t.t_inline_argv[KMP_INLINE_ARGV_ENTRIES],
3125             (sizeof(void *) * KMP_INLINE_ARGV_ENTRIES), "team_%d.t_inline_argv",
3126             team->t.t_id);
3127       }
3128     } else {
3129       /* allocate space for arguments in the heap */
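      /* Sizing policy: never allocate fewer than KMP_MIN_MALLOC_ARGV_ENTRIES
         entries; once argc exceeds half of that minimum, allocate 2 * argc to
         leave headroom for later calls with a similar argc. */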
3130       team->t.t_max_argc = (argc <= (KMP_MIN_MALLOC_ARGV_ENTRIES >> 1))
3131                                ? KMP_MIN_MALLOC_ARGV_ENTRIES
3132                                : 2 * argc;
3133       KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: dynamic allocate %d "
3134                      "argv entries\n",
3135                      team->t.t_id, team->t.t_max_argc));
3136       team->t.t_argv =
3137           (void **)__kmp_page_allocate(sizeof(void *) * team->t.t_max_argc);
3138       if (__kmp_storage_map) {
3139         __kmp_print_storage_map_gtid(-1, &team->t.t_argv[0],
3140                                      &team->t.t_argv[team->t.t_max_argc],
3141                                      sizeof(void *) * team->t.t_max_argc,
3142                                      "team_%d.t_argv", team->t.t_id);
3143       }
3144     }
3145   }
3146 }
3147 
3148 static void __kmp_allocate_team_arrays(kmp_team_t *team, int max_nth) {
3149   int i;
3150   int num_disp_buff = max_nth > 1 ? __kmp_dispatch_num_buffers : 2;
3151   team->t.t_threads =
3152       (kmp_info_t **)__kmp_allocate(sizeof(kmp_info_t *) * max_nth);
3153   team->t.t_disp_buffer = (dispatch_shared_info_t *)__kmp_allocate(
3154       sizeof(dispatch_shared_info_t) * num_disp_buff);
3155   team->t.t_dispatch =
3156       (kmp_disp_t *)__kmp_allocate(sizeof(kmp_disp_t) * max_nth);
3157   team->t.t_implicit_task_taskdata =
3158       (kmp_taskdata_t *)__kmp_allocate(sizeof(kmp_taskdata_t) * max_nth);
3159   team->t.t_max_nproc = max_nth;
3160 
3161   /* setup dispatch buffers */
3162   for (i = 0; i < num_disp_buff; ++i) {
3163     team->t.t_disp_buffer[i].buffer_index = i;
3164     team->t.t_disp_buffer[i].doacross_buf_idx = i;
3165   }
3166 }
3167 
3168 static void __kmp_free_team_arrays(kmp_team_t *team) {
3169   /* Note: this does not free the threads in t_threads (__kmp_free_threads) */
3170   int i;
3171   for (i = 0; i < team->t.t_max_nproc; ++i) {
3172     if (team->t.t_dispatch[i].th_disp_buffer != NULL) {
3173       __kmp_free(team->t.t_dispatch[i].th_disp_buffer);
3174       team->t.t_dispatch[i].th_disp_buffer = NULL;
3175     }
3176   }
3177 #if KMP_USE_HIER_SCHED
3178   __kmp_dispatch_free_hierarchies(team);
3179 #endif
3180   __kmp_free(team->t.t_threads);
3181   __kmp_free(team->t.t_disp_buffer);
3182   __kmp_free(team->t.t_dispatch);
3183   __kmp_free(team->t.t_implicit_task_taskdata);
3184   team->t.t_threads = NULL;
3185   team->t.t_disp_buffer = NULL;
3186   team->t.t_dispatch = NULL;
3187   team->t.t_implicit_task_taskdata = 0;
3188 }
3189 
3190 static void __kmp_reallocate_team_arrays(kmp_team_t *team, int max_nth) {
3191   kmp_info_t **oldThreads = team->t.t_threads;
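  // Only the t_threads contents survive the reallocation below; the dispatch
  // buffers, per-thread dispatch structures, and implicit task data are
  // recreated from scratch by __kmp_allocate_team_arrays().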
3192 
3193   __kmp_free(team->t.t_disp_buffer);
3194   __kmp_free(team->t.t_dispatch);
3195   __kmp_free(team->t.t_implicit_task_taskdata);
3196   __kmp_allocate_team_arrays(team, max_nth);
3197 
3198   KMP_MEMCPY(team->t.t_threads, oldThreads,
3199              team->t.t_nproc * sizeof(kmp_info_t *));
3200 
3201   __kmp_free(oldThreads);
3202 }
3203 
3204 static kmp_internal_control_t __kmp_get_global_icvs(void) {
3205 
3206   kmp_r_sched_t r_sched =
3207       __kmp_get_schedule_global(); // get current state of scheduling globals
3208 
3209   KMP_DEBUG_ASSERT(__kmp_nested_proc_bind.used > 0);
3210 
3211   kmp_internal_control_t g_icvs = {
3212     0, // int serial_nesting_level; //corresponds to value of th_team_serialized
3213     (kmp_int8)__kmp_global.g.g_dynamic, // internal control for dynamic
3214     // adjustment of threads (per thread)
3215     (kmp_int8)__kmp_env_blocktime, // int bt_set; //internal control for
3216     // whether blocktime is explicitly set
3217     __kmp_dflt_blocktime, // int blocktime; //internal control for blocktime
3218 #if KMP_USE_MONITOR
3219     __kmp_bt_intervals, // int bt_intervals; //internal control for blocktime
3220 // intervals
3221 #endif
3222     __kmp_dflt_team_nth, // int nproc; //internal control for # of threads for
3223     // next parallel region (per thread)
3224     // (use a max ub on value if __kmp_parallel_initialize not called yet)
3225     __kmp_cg_max_nth, // int thread_limit;
3226     __kmp_dflt_max_active_levels, // int max_active_levels; //internal control
3227     // for max_active_levels
3228     r_sched, // kmp_r_sched_t sched; //internal control for runtime schedule
3229     // {sched,chunk} pair
3230     __kmp_nested_proc_bind.bind_types[0],
3231     __kmp_default_device,
3232     NULL // struct kmp_internal_control *next;
3233   };
3234 
3235   return g_icvs;
3236 }
3237 
3238 static kmp_internal_control_t __kmp_get_x_global_icvs(const kmp_team_t *team) {
3239 
3240   kmp_internal_control_t gx_icvs;
3241   gx_icvs.serial_nesting_level =
3242       0; // probably = team->t.t_serialized, as in __kmp_save_internal_controls
3243   copy_icvs(&gx_icvs, &team->t.t_threads[0]->th.th_current_task->td_icvs);
3244   gx_icvs.next = NULL;
3245 
3246   return gx_icvs;
3247 }
3248 
3249 static void __kmp_initialize_root(kmp_root_t *root) {
3250   int f;
3251   kmp_team_t *root_team;
3252   kmp_team_t *hot_team;
3253   int hot_team_max_nth;
3254   kmp_r_sched_t r_sched =
3255       __kmp_get_schedule_global(); // get current state of scheduling globals
3256   kmp_internal_control_t r_icvs = __kmp_get_global_icvs();
3257   KMP_DEBUG_ASSERT(root);
3258   KMP_ASSERT(!root->r.r_begin);
3259 
3260   /* setup the root state structure */
3261   __kmp_init_lock(&root->r.r_begin_lock);
3262   root->r.r_begin = FALSE;
3263   root->r.r_active = FALSE;
3264   root->r.r_in_parallel = 0;
3265   root->r.r_blocktime = __kmp_dflt_blocktime;
3266 #if KMP_AFFINITY_SUPPORTED
3267   root->r.r_affinity_assigned = FALSE;
3268 #endif
3269 
3270   /* setup the root team for this task */
3271   /* allocate the root team structure */
3272   KF_TRACE(10, ("__kmp_initialize_root: before root_team\n"));
3273 
3274   root_team =
3275       __kmp_allocate_team(root,
3276                           1, // new_nproc
3277                           1, // max_nproc
3278 #if OMPT_SUPPORT
3279                           ompt_data_none, // root parallel id
3280 #endif
3281                           __kmp_nested_proc_bind.bind_types[0], &r_icvs,
3282                           0 // argc
3283                           USE_NESTED_HOT_ARG(NULL) // primary thread is unknown
3284       );
3285 #if USE_DEBUGGER
3286   // Non-NULL value should be assigned to make the debugger display the root
3287   // team.
3288   TCW_SYNC_PTR(root_team->t.t_pkfn, (microtask_t)(~0));
3289 #endif
3290 
3291   KF_TRACE(10, ("__kmp_initialize_root: after root_team = %p\n", root_team));
3292 
3293   root->r.r_root_team = root_team;
3294   root_team->t.t_control_stack_top = NULL;
3295 
3296   /* initialize root team */
3297   root_team->t.t_threads[0] = NULL;
3298   root_team->t.t_nproc = 1;
3299   root_team->t.t_serialized = 1;
3300   // TODO???: root_team->t.t_max_active_levels = __kmp_dflt_max_active_levels;
3301   root_team->t.t_sched.sched = r_sched.sched;
3302   KA_TRACE(
3303       20,
3304       ("__kmp_initialize_root: init root team %d arrived: join=%u, plain=%u\n",
3305        root_team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
3306 
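  // The root team just set up is a serialized team of exactly one thread; the
  // hot team allocated next is the team reused for this root's parallel
  // regions and is sized for up to __kmp_dflt_team_nth_ub * 2 threads.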
3307   /* setup the  hot team for this task */
3308   /* allocate the hot team structure */
3309   KF_TRACE(10, ("__kmp_initialize_root: before hot_team\n"));
3310 
3311   hot_team =
3312       __kmp_allocate_team(root,
3313                           1, // new_nproc
3314                           __kmp_dflt_team_nth_ub * 2, // max_nproc
3315 #if OMPT_SUPPORT
3316                           ompt_data_none, // root parallel id
3317 #endif
3318                           __kmp_nested_proc_bind.bind_types[0], &r_icvs,
3319                           0 // argc
3320                           USE_NESTED_HOT_ARG(NULL) // primary thread is unknown
3321       );
3322   KF_TRACE(10, ("__kmp_initialize_root: after hot_team = %p\n", hot_team));
3323 
3324   root->r.r_hot_team = hot_team;
3325   root_team->t.t_control_stack_top = NULL;
3326 
3327   /* first-time initialization */
3328   hot_team->t.t_parent = root_team;
3329 
3330   /* initialize hot team */
3331   hot_team_max_nth = hot_team->t.t_max_nproc;
3332   for (f = 0; f < hot_team_max_nth; ++f) {
3333     hot_team->t.t_threads[f] = NULL;
3334   }
3335   hot_team->t.t_nproc = 1;
3336   // TODO???: hot_team->t.t_max_active_levels = __kmp_dflt_max_active_levels;
3337   hot_team->t.t_sched.sched = r_sched.sched;
3338   hot_team->t.t_size_changed = 0;
3339 }
3340 
3341 #ifdef KMP_DEBUG
3342 
3343 typedef struct kmp_team_list_item {
3344   kmp_team_p const *entry;
3345   struct kmp_team_list_item *next;
3346 } kmp_team_list_item_t;
3347 typedef kmp_team_list_item_t *kmp_team_list_t;
3348 
3349 static void __kmp_print_structure_team_accum( // Add team to list of teams.
3350     kmp_team_list_t list, // List of teams.
3351     kmp_team_p const *team // Team to add.
3352 ) {
3353 
3354   // List must terminate with item where both entry and next are NULL.
3355   // Team is added to the list only once.
3356   // List is sorted in ascending order by team id.
3357   // Team id is *not* a key.
3358 
3359   kmp_team_list_t l;
3360 
3361   KMP_DEBUG_ASSERT(list != NULL);
3362   if (team == NULL) {
3363     return;
3364   }
3365 
3366   __kmp_print_structure_team_accum(list, team->t.t_parent);
3367   __kmp_print_structure_team_accum(list, team->t.t_next_pool);
3368 
3369   // Search list for the team.
3370   l = list;
3371   while (l->next != NULL && l->entry != team) {
3372     l = l->next;
3373   }
3374   if (l->next != NULL) {
3375     return; // Team has been added before, exit.
3376   }
3377 
3378   // Team is not found. Search list again for insertion point.
3379   l = list;
3380   while (l->next != NULL && l->entry->t.t_id <= team->t.t_id) {
3381     l = l->next;
3382   }
3383 
3384   // Insert team.
3385   {
3386     kmp_team_list_item_t *item = (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC(
3387         sizeof(kmp_team_list_item_t));
3388     *item = *l;
3389     l->entry = team;
3390     l->next = item;
3391   }
3392 }
3393 
3394 static void __kmp_print_structure_team(char const *title, kmp_team_p const *team
3395 
3396 ) {
3397   __kmp_printf("%s", title);
3398   if (team != NULL) {
3399     __kmp_printf("%2x %p\n", team->t.t_id, team);
3400   } else {
3401     __kmp_printf(" - (nil)\n");
3402   }
3403 }
3404 
3405 static void __kmp_print_structure_thread(char const *title,
3406                                          kmp_info_p const *thread) {
3407   __kmp_printf("%s", title);
3408   if (thread != NULL) {
3409     __kmp_printf("%2d %p\n", thread->th.th_info.ds.ds_gtid, thread);
3410   } else {
3411     __kmp_printf(" - (nil)\n");
3412   }
3413 }
3414 
3415 void __kmp_print_structure(void) {
3416 
3417   kmp_team_list_t list;
3418 
3419   // Initialize list of teams.
3420   list =
3421       (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC(sizeof(kmp_team_list_item_t));
3422   list->entry = NULL;
3423   list->next = NULL;
3424 
3425   __kmp_printf("\n------------------------------\nGlobal Thread "
3426                "Table\n------------------------------\n");
3427   {
3428     int gtid;
3429     for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3430       __kmp_printf("%2d", gtid);
3431       if (__kmp_threads != NULL) {
3432         __kmp_printf(" %p", __kmp_threads[gtid]);
3433       }
3434       if (__kmp_root != NULL) {
3435         __kmp_printf(" %p", __kmp_root[gtid]);
3436       }
3437       __kmp_printf("\n");
3438     }
3439   }
3440 
3441   // Print out __kmp_threads array.
3442   __kmp_printf("\n------------------------------\nThreads\n--------------------"
3443                "----------\n");
3444   if (__kmp_threads != NULL) {
3445     int gtid;
3446     for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3447       kmp_info_t const *thread = __kmp_threads[gtid];
3448       if (thread != NULL) {
3449         __kmp_printf("GTID %2d %p:\n", gtid, thread);
3450         __kmp_printf("    Our Root:        %p\n", thread->th.th_root);
3451         __kmp_print_structure_team("    Our Team:     ", thread->th.th_team);
3452         __kmp_print_structure_team("    Serial Team:  ",
3453                                    thread->th.th_serial_team);
3454         __kmp_printf("    Threads:      %2d\n", thread->th.th_team_nproc);
3455         __kmp_print_structure_thread("    Primary:      ",
3456                                      thread->th.th_team_master);
3457         __kmp_printf("    Serialized?:  %2d\n", thread->th.th_team_serialized);
3458         __kmp_printf("    Set NProc:    %2d\n", thread->th.th_set_nproc);
3459         __kmp_printf("    Set Proc Bind: %2d\n", thread->th.th_set_proc_bind);
3460         __kmp_print_structure_thread("    Next in pool: ",
3461                                      thread->th.th_next_pool);
3462         __kmp_printf("\n");
3463         __kmp_print_structure_team_accum(list, thread->th.th_team);
3464         __kmp_print_structure_team_accum(list, thread->th.th_serial_team);
3465       }
3466     }
3467   } else {
3468     __kmp_printf("Threads array is not allocated.\n");
3469   }
3470 
3471   // Print out __kmp_root array.
3472   __kmp_printf("\n------------------------------\nUbers\n----------------------"
3473                "--------\n");
3474   if (__kmp_root != NULL) {
3475     int gtid;
3476     for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3477       kmp_root_t const *root = __kmp_root[gtid];
3478       if (root != NULL) {
3479         __kmp_printf("GTID %2d %p:\n", gtid, root);
3480         __kmp_print_structure_team("    Root Team:    ", root->r.r_root_team);
3481         __kmp_print_structure_team("    Hot Team:     ", root->r.r_hot_team);
3482         __kmp_print_structure_thread("    Uber Thread:  ",
3483                                      root->r.r_uber_thread);
3484         __kmp_printf("    Active?:      %2d\n", root->r.r_active);
3485         __kmp_printf("    In Parallel:  %2d\n",
3486                      KMP_ATOMIC_LD_RLX(&root->r.r_in_parallel));
3487         __kmp_printf("\n");
3488         __kmp_print_structure_team_accum(list, root->r.r_root_team);
3489         __kmp_print_structure_team_accum(list, root->r.r_hot_team);
3490       }
3491     }
3492   } else {
3493     __kmp_printf("Ubers array is not allocated.\n");
3494   }
3495 
3496   __kmp_printf("\n------------------------------\nTeams\n----------------------"
3497                "--------\n");
3498   while (list->next != NULL) {
3499     kmp_team_p const *team = list->entry;
3500     int i;
3501     __kmp_printf("Team %2x %p:\n", team->t.t_id, team);
3502     __kmp_print_structure_team("    Parent Team:      ", team->t.t_parent);
3503     __kmp_printf("    Primary TID:      %2d\n", team->t.t_master_tid);
3504     __kmp_printf("    Max threads:      %2d\n", team->t.t_max_nproc);
3505     __kmp_printf("    Levels of serial: %2d\n", team->t.t_serialized);
3506     __kmp_printf("    Number threads:   %2d\n", team->t.t_nproc);
3507     for (i = 0; i < team->t.t_nproc; ++i) {
3508       __kmp_printf("    Thread %2d:      ", i);
3509       __kmp_print_structure_thread("", team->t.t_threads[i]);
3510     }
3511     __kmp_print_structure_team("    Next in pool:     ", team->t.t_next_pool);
3512     __kmp_printf("\n");
3513     list = list->next;
3514   }
3515 
3516   // Print out __kmp_thread_pool and __kmp_team_pool.
3517   __kmp_printf("\n------------------------------\nPools\n----------------------"
3518                "--------\n");
3519   __kmp_print_structure_thread("Thread pool:          ",
3520                                CCAST(kmp_info_t *, __kmp_thread_pool));
3521   __kmp_print_structure_team("Team pool:            ",
3522                              CCAST(kmp_team_t *, __kmp_team_pool));
3523   __kmp_printf("\n");
3524 
3525   // Free team list.
3526   while (list != NULL) {
3527     kmp_team_list_item_t *item = list;
3528     list = list->next;
3529     KMP_INTERNAL_FREE(item);
3530   }
3531 }
3532 
3533 #endif
3534 
3535 //---------------------------------------------------------------------------
3536 //  Stuff for per-thread fast random number generator
3537 //  Table of primes
3538 static const unsigned __kmp_primes[] = {
3539     0x9e3779b1, 0xffe6cc59, 0x2109f6dd, 0x43977ab5, 0xba5703f5, 0xb495a877,
3540     0xe1626741, 0x79695e6b, 0xbc98c09f, 0xd5bee2b3, 0x287488f9, 0x3af18231,
3541     0x9677cd4d, 0xbe3a6929, 0xadc6a877, 0xdcf0674b, 0xbe4d6fe9, 0x5f15e201,
3542     0x99afc3fd, 0xf3f16801, 0xe222cfff, 0x24ba5fdb, 0x0620452d, 0x79f149e3,
3543     0xc8b93f49, 0x972702cd, 0xb07dd827, 0x6c97d5ed, 0x085a3d61, 0x46eb5ea7,
3544     0x3d9910ed, 0x2e687b5b, 0x29609227, 0x6eb081f1, 0x0954c4e1, 0x9d114db9,
3545     0x542acfa9, 0xb3e6bd7b, 0x0742d917, 0xe9f3ffa7, 0x54581edb, 0xf2480f45,
3546     0x0bb9288f, 0xef1affc7, 0x85fa0ca7, 0x3ccc14db, 0xe6baf34b, 0x343377f7,
3547     0x5ca19031, 0xe6d9293b, 0xf0a9f391, 0x5d2e980b, 0xfc411073, 0xc3749363,
3548     0xb892d829, 0x3549366b, 0x629750ad, 0xb98294e5, 0x892d9483, 0xc235baf3,
3549     0x3d2402a3, 0x6bdef3c9, 0xbec333cd, 0x40c9520f};
3550 
3551 //---------------------------------------------------------------------------
3552 //  __kmp_get_random: Get a random number using a linear congruential method.
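//  The per-thread state is updated as x = a * x + 1 (mod 2^32), where 'a' is a
//  per-thread multiplier drawn from __kmp_primes (see __kmp_init_random); the
//  high 16 bits of the previous x are returned as the sample.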
3553 unsigned short __kmp_get_random(kmp_info_t *thread) {
3554   unsigned x = thread->th.th_x;
3555   unsigned short r = (unsigned short)(x >> 16);
3556 
3557   thread->th.th_x = x * thread->th.th_a + 1;
3558 
3559   KA_TRACE(30, ("__kmp_get_random: THREAD: %d, RETURN: %u\n",
3560                 thread->th.th_info.ds.ds_tid, r));
3561 
3562   return r;
3563 }
3564 //--------------------------------------------------------
3565 // __kmp_init_random: Initialize a random number generator
3566 void __kmp_init_random(kmp_info_t *thread) {
3567   unsigned seed = thread->th.th_info.ds.ds_tid;
3568 
3569   thread->th.th_a =
3570       __kmp_primes[seed % (sizeof(__kmp_primes) / sizeof(__kmp_primes[0]))];
3571   thread->th.th_x = (seed + 1) * thread->th.th_a + 1;
3572   KA_TRACE(30,
3573            ("__kmp_init_random: THREAD: %u; A: %u\n", seed, thread->th.th_a));
3574 }
3575 
3576 #if KMP_OS_WINDOWS
3577 /* reclaim array entries for root threads that are already dead, returns number
3578  * reclaimed */
3579 static int __kmp_reclaim_dead_roots(void) {
3580   int i, r = 0;
3581 
3582   for (i = 0; i < __kmp_threads_capacity; ++i) {
3583     if (KMP_UBER_GTID(i) &&
3584         !__kmp_still_running((kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[i])) &&
3585         !__kmp_root[i]
3586              ->r.r_active) { // AC: reclaim only roots died in non-active state
3587       r += __kmp_unregister_root_other_thread(i);
3588     }
3589   }
3590   return r;
3591 }
3592 #endif
3593 
3594 /* This function attempts to create free entries in __kmp_threads and
3595    __kmp_root, and returns the number of free entries generated.
3596 
3597    For Windows* OS static library, the first mechanism used is to reclaim array
3598    entries for root threads that are already dead.
3599 
3600    On all platforms, expansion is attempted on the arrays __kmp_threads and
3601    __kmp_root, with appropriate update to __kmp_threads_capacity. Array
3602    capacity is increased by doubling with clipping to __kmp_tp_capacity, if
3603    threadprivate cache array has been created. Synchronization with
3604    __kmpc_threadprivate_cached is done using __kmp_tp_cached_lock.
3605 
3606    After any dead root reclamation, if the clipping value allows array expansion
3607    to result in the generation of a total of nNeed free slots, the function does
3608    that expansion. If not, nothing is done beyond the possible initial root
3609    thread reclamation.
3610 
3611    If any argument is negative, the behavior is undefined. */
3612 static int __kmp_expand_threads(int nNeed) {
3613   int added = 0;
3614   int minimumRequiredCapacity;
3615   int newCapacity;
3616   kmp_info_t **newThreads;
3617   kmp_root_t **newRoot;
3618 
3619   // All calls to __kmp_expand_threads should be under __kmp_forkjoin_lock, so
3620   // resizing __kmp_threads does not need additional protection if foreign
3621   // threads are present
3622 
3623 #if KMP_OS_WINDOWS && !KMP_DYNAMIC_LIB
3624   /* only for Windows static library */
3625   /* reclaim array entries for root threads that are already dead */
3626   added = __kmp_reclaim_dead_roots();
3627 
3628   if (nNeed) {
3629     nNeed -= added;
3630     if (nNeed < 0)
3631       nNeed = 0;
3632   }
3633 #endif
3634   if (nNeed <= 0)
3635     return added;
3636 
3637   // Note that __kmp_threads_capacity is not bounded by __kmp_max_nth. If
3638   // __kmp_max_nth is set to some value less than __kmp_sys_max_nth by the
3639   // user via KMP_DEVICE_THREAD_LIMIT, then __kmp_threads_capacity may become
3640   // > __kmp_max_nth in one of two ways:
3641   //
3642   // 1) The initialization thread (gtid = 0) exits.  __kmp_threads[0]
3643   //    may not be reused by another thread, so we may need to increase
3644   //    __kmp_threads_capacity to __kmp_max_nth + 1.
3645   //
3646   // 2) New foreign root(s) are encountered.  We always register new foreign
3647   //    roots. This may cause a smaller # of threads to be allocated at
3648   //    subsequent parallel regions, but the worker threads hang around (and
3649   //    eventually go to sleep) and need slots in the __kmp_threads[] array.
3650   //
3651   // Anyway, that is the reason for moving the check to see if
3652   // __kmp_max_nth was exceeded into __kmp_reserve_threads()
3653   // instead of having it performed here. -BB
3654 
3655   KMP_DEBUG_ASSERT(__kmp_sys_max_nth >= __kmp_threads_capacity);
3656 
3657   /* compute expansion headroom to check if we can expand */
3658   if (__kmp_sys_max_nth - __kmp_threads_capacity < nNeed) {
3659     /* possible expansion too small -- give up */
3660     return added;
3661   }
3662   minimumRequiredCapacity = __kmp_threads_capacity + nNeed;
3663 
3664   newCapacity = __kmp_threads_capacity;
3665   do {
3666     newCapacity = newCapacity <= (__kmp_sys_max_nth >> 1) ? (newCapacity << 1)
3667                                                           : __kmp_sys_max_nth;
3668   } while (newCapacity < minimumRequiredCapacity);
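  // For example, with __kmp_threads_capacity == 32, nNeed == 40, and a
  // sufficiently large __kmp_sys_max_nth, minimumRequiredCapacity is 72 and
  // newCapacity doubles 32 -> 64 -> 128 before the loop exits.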
3669   newThreads = (kmp_info_t **)__kmp_allocate(
3670       (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * newCapacity + CACHE_LINE);
3671   newRoot =
3672       (kmp_root_t **)((char *)newThreads + sizeof(kmp_info_t *) * newCapacity);
3673   KMP_MEMCPY(newThreads, __kmp_threads,
3674              __kmp_threads_capacity * sizeof(kmp_info_t *));
3675   KMP_MEMCPY(newRoot, __kmp_root,
3676              __kmp_threads_capacity * sizeof(kmp_root_t *));
3677   // Put the old __kmp_threads array on a list. Any ongoing references to the
3678   // old array remain valid. The list is cleaned up at library shutdown.
3679   kmp_old_threads_list_t *node =
3680       (kmp_old_threads_list_t *)__kmp_allocate(sizeof(kmp_old_threads_list_t));
3681   node->threads = __kmp_threads;
3682   node->next = __kmp_old_threads_list;
3683   __kmp_old_threads_list = node;
3684 
3685   *(kmp_info_t * *volatile *)&__kmp_threads = newThreads;
3686   *(kmp_root_t * *volatile *)&__kmp_root = newRoot;
3687   added += newCapacity - __kmp_threads_capacity;
3688   *(volatile int *)&__kmp_threads_capacity = newCapacity;
3689 
3690   if (newCapacity > __kmp_tp_capacity) {
3691     __kmp_acquire_bootstrap_lock(&__kmp_tp_cached_lock);
3692     if (__kmp_tp_cached && newCapacity > __kmp_tp_capacity) {
3693       __kmp_threadprivate_resize_cache(newCapacity);
3694     } else { // increase __kmp_tp_capacity to correspond with kmp_threads size
3695       *(volatile int *)&__kmp_tp_capacity = newCapacity;
3696     }
3697     __kmp_release_bootstrap_lock(&__kmp_tp_cached_lock);
3698   }
3699 
3700   return added;
3701 }
3702 
3703 /* Register the current thread as a root thread and obtain our gtid. We must
3704    have the __kmp_initz_lock held at this point. The argument is TRUE only if
3705    the caller is the thread that calls from __kmp_do_serial_initialize(). */
3706 int __kmp_register_root(int initial_thread) {
3707   kmp_info_t *root_thread;
3708   kmp_root_t *root;
3709   int gtid;
3710   int capacity;
3711   __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
3712   KA_TRACE(20, ("__kmp_register_root: entered\n"));
3713   KMP_MB();
3714 
3715   /* 2007-03-02:
3716      If the initial thread has not invoked the OpenMP RTL yet, and this thread
3717      is not the initial one, the "__kmp_all_nth >= __kmp_threads_capacity"
3718      condition does not work as expected -- it may evaluate to false (meaning
3719      there is at least one empty slot in the __kmp_threads array), but it is
3720      possible that the only free slot is #0, which is reserved for the initial
3721      thread and so cannot be used for this one. The following code works around
3722      this bug.
3723      However, the right solution seems to be not to reserve slot #0 for the
3724      initial thread, because:
3725      (1) there is no magic in slot #0,
3726      (2) we cannot detect the initial thread reliably (the first thread that
3727         performs serial initialization may not be the real initial thread).
3728   */
3729   capacity = __kmp_threads_capacity;
3730   if (!initial_thread && TCR_PTR(__kmp_threads[0]) == NULL) {
3731     --capacity;
3732   }
3733 
3734   // If it is not for initializing the hidden helper team, we need to take
3735   // __kmp_hidden_helper_threads_num out of the capacity because it is included
3736   // in __kmp_threads_capacity.
3737   if (__kmp_enable_hidden_helper && !TCR_4(__kmp_init_hidden_helper_threads)) {
3738     capacity -= __kmp_hidden_helper_threads_num;
3739   }
3740 
3741   /* see if there are too many threads */
3742   if (__kmp_all_nth >= capacity && !__kmp_expand_threads(1)) {
3743     if (__kmp_tp_cached) {
3744       __kmp_fatal(KMP_MSG(CantRegisterNewThread),
3745                   KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity),
3746                   KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null);
3747     } else {
3748       __kmp_fatal(KMP_MSG(CantRegisterNewThread), KMP_HNT(SystemLimitOnThreads),
3749                   __kmp_msg_null);
3750     }
3751   }
3752 
3753   // When hidden helper task is enabled, __kmp_threads is organized as follows:
3754   // 0: initial thread, also a regular OpenMP thread.
3755   // [1, __kmp_hidden_helper_threads_num]: slots for hidden helper threads.
3756   // [__kmp_hidden_helper_threads_num + 1, __kmp_threads_capacity): slots for
3757   // regular OpenMP threads.
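  // For example, with __kmp_hidden_helper_threads_num == 8, gtid 0 is the
  // initial thread, gtids 1..8 are reserved for hidden helper threads, and
  // regular roots are placed at gtid 9 and above.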
3758   if (TCR_4(__kmp_init_hidden_helper_threads)) {
3759     // Find an available thread slot for a hidden helper thread. Slots for
3760     // hidden helper threads occupy indices 1 to __kmp_hidden_helper_threads_num.
3761     for (gtid = 1; TCR_PTR(__kmp_threads[gtid]) != NULL &&
3762                    gtid <= __kmp_hidden_helper_threads_num;
3763          gtid++)
3764       ;
3765     KMP_ASSERT(gtid <= __kmp_hidden_helper_threads_num);
3766     KA_TRACE(1, ("__kmp_register_root: found slot in threads array for "
3767                  "hidden helper thread: T#%d\n",
3768                  gtid));
3769   } else {
3770     /* find an available thread slot */
3771     // Don't reassign the zero slot since we need that to only be used by
3772     // initial thread. Slots for hidden helper threads should also be skipped.
3773     if (initial_thread && TCR_PTR(__kmp_threads[0]) == NULL) {
3774       gtid = 0;
3775     } else {
3776       for (gtid = __kmp_hidden_helper_threads_num + 1;
3777            TCR_PTR(__kmp_threads[gtid]) != NULL; gtid++)
3778         ;
3779     }
3780     KA_TRACE(
3781         1, ("__kmp_register_root: found slot in threads array: T#%d\n", gtid));
3782     KMP_ASSERT(gtid < __kmp_threads_capacity);
3783   }
3784 
3785   /* update global accounting */
3786   __kmp_all_nth++;
3787   TCW_4(__kmp_nth, __kmp_nth + 1);
3788 
3789   // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search) for low
3790   // numbers of procs, and method #2 (keyed API call) for higher numbers.
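  // i.e. once the number of registered threads reaches __kmp_tls_gtid_min we
  // switch to gtid mode 2, and switch back to mode 1 if the count later falls
  // below the threshold.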
3791   if (__kmp_adjust_gtid_mode) {
3792     if (__kmp_all_nth >= __kmp_tls_gtid_min) {
3793       if (TCR_4(__kmp_gtid_mode) != 2) {
3794         TCW_4(__kmp_gtid_mode, 2);
3795       }
3796     } else {
3797       if (TCR_4(__kmp_gtid_mode) != 1) {
3798         TCW_4(__kmp_gtid_mode, 1);
3799       }
3800     }
3801   }
3802 
3803 #ifdef KMP_ADJUST_BLOCKTIME
3804   /* Adjust blocktime to zero if necessary            */
3805   /* Middle initialization might not have occurred yet */
3806   if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
3807     if (__kmp_nth > __kmp_avail_proc) {
3808       __kmp_zero_bt = TRUE;
3809     }
3810   }
3811 #endif /* KMP_ADJUST_BLOCKTIME */
3812 
3813   /* setup this new hierarchy */
3814   if (!(root = __kmp_root[gtid])) {
3815     root = __kmp_root[gtid] = (kmp_root_t *)__kmp_allocate(sizeof(kmp_root_t));
3816     KMP_DEBUG_ASSERT(!root->r.r_root_team);
3817   }
3818 
3819 #if KMP_STATS_ENABLED
3820   // Initialize stats as soon as possible (right after gtid assignment).
3821   __kmp_stats_thread_ptr = __kmp_stats_list->push_back(gtid);
3822   __kmp_stats_thread_ptr->startLife();
3823   KMP_SET_THREAD_STATE(SERIAL_REGION);
3824   KMP_INIT_PARTITIONED_TIMERS(OMP_serial);
3825 #endif
3826   __kmp_initialize_root(root);
3827 
3828   /* setup new root thread structure */
3829   if (root->r.r_uber_thread) {
3830     root_thread = root->r.r_uber_thread;
3831   } else {
3832     root_thread = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t));
3833     if (__kmp_storage_map) {
3834       __kmp_print_thread_storage_map(root_thread, gtid);
3835     }
3836     root_thread->th.th_info.ds.ds_gtid = gtid;
3837 #if OMPT_SUPPORT
3838     root_thread->th.ompt_thread_info.thread_data = ompt_data_none;
3839 #endif
3840     root_thread->th.th_root = root;
3841     if (__kmp_env_consistency_check) {
3842       root_thread->th.th_cons = __kmp_allocate_cons_stack(gtid);
3843     }
3844 #if USE_FAST_MEMORY
3845     __kmp_initialize_fast_memory(root_thread);
3846 #endif /* USE_FAST_MEMORY */
3847 
3848 #if KMP_USE_BGET
3849     KMP_DEBUG_ASSERT(root_thread->th.th_local.bget_data == NULL);
3850     __kmp_initialize_bget(root_thread);
3851 #endif
3852     __kmp_init_random(root_thread); // Initialize random number generator
3853   }
3854 
3855   /* setup the serial team held in reserve by the root thread */
3856   if (!root_thread->th.th_serial_team) {
3857     kmp_internal_control_t r_icvs = __kmp_get_global_icvs();
3858     KF_TRACE(10, ("__kmp_register_root: before serial_team\n"));
3859     root_thread->th.th_serial_team = __kmp_allocate_team(
3860         root, 1, 1,
3861 #if OMPT_SUPPORT
3862         ompt_data_none, // root parallel id
3863 #endif
3864         proc_bind_default, &r_icvs, 0 USE_NESTED_HOT_ARG(NULL));
3865   }
3866   KMP_ASSERT(root_thread->th.th_serial_team);
3867   KF_TRACE(10, ("__kmp_register_root: after serial_team = %p\n",
3868                 root_thread->th.th_serial_team));
3869 
3870   /* drop root_thread into place */
3871   TCW_SYNC_PTR(__kmp_threads[gtid], root_thread);
3872 
3873   root->r.r_root_team->t.t_threads[0] = root_thread;
3874   root->r.r_hot_team->t.t_threads[0] = root_thread;
3875   root_thread->th.th_serial_team->t.t_threads[0] = root_thread;
3876   // AC: the team is created in reserve, not for execution (it is unused for now).
3877   root_thread->th.th_serial_team->t.t_serialized = 0;
3878   root->r.r_uber_thread = root_thread;
3879 
3880   /* initialize the thread, get it ready to go */
3881   __kmp_initialize_info(root_thread, root->r.r_root_team, 0, gtid);
3882   TCW_4(__kmp_init_gtid, TRUE);
3883 
3884   /* prepare the primary thread for get_gtid() */
3885   __kmp_gtid_set_specific(gtid);
3886 
3887 #if USE_ITT_BUILD
3888   __kmp_itt_thread_name(gtid);
3889 #endif /* USE_ITT_BUILD */
3890 
3891 #ifdef KMP_TDATA_GTID
3892   __kmp_gtid = gtid;
3893 #endif
3894   __kmp_create_worker(gtid, root_thread, __kmp_stksize);
3895   KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() == gtid);
3896 
3897   KA_TRACE(20, ("__kmp_register_root: T#%d init T#%d(%d:%d) arrived: join=%u, "
3898                 "plain=%u\n",
3899                 gtid, __kmp_gtid_from_tid(0, root->r.r_hot_team),
3900                 root->r.r_hot_team->t.t_id, 0, KMP_INIT_BARRIER_STATE,
3901                 KMP_INIT_BARRIER_STATE));
3902   { // Initialize barrier data.
3903     int b;
3904     for (b = 0; b < bs_last_barrier; ++b) {
3905       root_thread->th.th_bar[b].bb.b_arrived = KMP_INIT_BARRIER_STATE;
3906 #if USE_DEBUGGER
3907       root_thread->th.th_bar[b].bb.b_worker_arrived = 0;
3908 #endif
3909     }
3910   }
3911   KMP_DEBUG_ASSERT(root->r.r_hot_team->t.t_bar[bs_forkjoin_barrier].b_arrived ==
3912                    KMP_INIT_BARRIER_STATE);
3913 
3914 #if KMP_AFFINITY_SUPPORTED
3915   root_thread->th.th_current_place = KMP_PLACE_UNDEFINED;
3916   root_thread->th.th_new_place = KMP_PLACE_UNDEFINED;
3917   root_thread->th.th_first_place = KMP_PLACE_UNDEFINED;
3918   root_thread->th.th_last_place = KMP_PLACE_UNDEFINED;
3919 #endif /* KMP_AFFINITY_SUPPORTED */
3920   root_thread->th.th_def_allocator = __kmp_def_allocator;
3921   root_thread->th.th_prev_level = 0;
3922   root_thread->th.th_prev_num_threads = 1;
3923 
3924   kmp_cg_root_t *tmp = (kmp_cg_root_t *)__kmp_allocate(sizeof(kmp_cg_root_t));
3925   tmp->cg_root = root_thread;
3926   tmp->cg_thread_limit = __kmp_cg_max_nth;
3927   tmp->cg_nthreads = 1;
3928   KA_TRACE(100, ("__kmp_register_root: Thread %p created node %p with"
3929                  " cg_nthreads init to 1\n",
3930                  root_thread, tmp));
3931   tmp->up = NULL;
3932   root_thread->th.th_cg_roots = tmp;
3933 
3934   __kmp_root_counter++;
3935 
3936 #if OMPT_SUPPORT
3937   if (!initial_thread && ompt_enabled.enabled) {
3938 
3939     kmp_info_t *root_thread = ompt_get_thread();
3940 
3941     ompt_set_thread_state(root_thread, ompt_state_overhead);
3942 
3943     if (ompt_enabled.ompt_callback_thread_begin) {
3944       ompt_callbacks.ompt_callback(ompt_callback_thread_begin)(
3945           ompt_thread_initial, __ompt_get_thread_data_internal());
3946     }
3947     ompt_data_t *task_data;
3948     ompt_data_t *parallel_data;
3949     __ompt_get_task_info_internal(0, NULL, &task_data, NULL, &parallel_data,
3950                                   NULL);
3951     if (ompt_enabled.ompt_callback_implicit_task) {
3952       ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
3953           ompt_scope_begin, parallel_data, task_data, 1, 1, ompt_task_initial);
3954     }
3955 
3956     ompt_set_thread_state(root_thread, ompt_state_work_serial);
3957   }
3958 #endif
3959 #if OMPD_SUPPORT
3960   if (ompd_state & OMPD_ENABLE_BP)
3961     ompd_bp_thread_begin();
3962 #endif
3963 
3964   KMP_MB();
3965   __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
3966 
3967   return gtid;
3968 }
3969 
3970 #if KMP_NESTED_HOT_TEAMS
3971 static int __kmp_free_hot_teams(kmp_root_t *root, kmp_info_t *thr, int level,
3972                                 const int max_level) {
3973   int i, n, nth;
3974   kmp_hot_team_ptr_t *hot_teams = thr->th.th_hot_teams;
3975   if (!hot_teams || !hot_teams[level].hot_team) {
3976     return 0;
3977   }
3978   KMP_DEBUG_ASSERT(level < max_level);
3979   kmp_team_t *team = hot_teams[level].hot_team;
3980   nth = hot_teams[level].hot_team_nth;
3981   n = nth - 1; // primary thread is not freed
3982   if (level < max_level - 1) {
3983     for (i = 0; i < nth; ++i) {
3984       kmp_info_t *th = team->t.t_threads[i];
3985       n += __kmp_free_hot_teams(root, th, level + 1, max_level);
3986       if (i > 0 && th->th.th_hot_teams) {
3987         __kmp_free(th->th.th_hot_teams);
3988         th->th.th_hot_teams = NULL;
3989       }
3990     }
3991   }
3992   __kmp_free_team(root, team, NULL);
3993   return n;
3994 }
3995 #endif
3996 
3997 // Resets a root thread and clears its root and hot teams.
3998 // Returns the number of __kmp_threads entries directly and indirectly freed.
3999 static int __kmp_reset_root(int gtid, kmp_root_t *root) {
4000   kmp_team_t *root_team = root->r.r_root_team;
4001   kmp_team_t *hot_team = root->r.r_hot_team;
4002   int n = hot_team->t.t_nproc;
4003   int i;
4004 
4005   KMP_DEBUG_ASSERT(!root->r.r_active);
4006 
4007   root->r.r_root_team = NULL;
4008   root->r.r_hot_team = NULL;
4009   // __kmp_free_team() does not free hot teams, so we have to clear r_hot_team
4010   // before call to __kmp_free_team().
4011   __kmp_free_team(root, root_team USE_NESTED_HOT_ARG(NULL));
4012 #if KMP_NESTED_HOT_TEAMS
4013   if (__kmp_hot_teams_max_level >
4014       0) { // need to free nested hot teams and their threads if any
4015     for (i = 0; i < hot_team->t.t_nproc; ++i) {
4016       kmp_info_t *th = hot_team->t.t_threads[i];
4017       if (__kmp_hot_teams_max_level > 1) {
4018         n += __kmp_free_hot_teams(root, th, 1, __kmp_hot_teams_max_level);
4019       }
4020       if (th->th.th_hot_teams) {
4021         __kmp_free(th->th.th_hot_teams);
4022         th->th.th_hot_teams = NULL;
4023       }
4024     }
4025   }
4026 #endif
4027   __kmp_free_team(root, hot_team USE_NESTED_HOT_ARG(NULL));
4028 
4029   // Before we can reap the thread, we need to make certain that all other
4030   // threads in the teams that had this root as ancestor have stopped trying to
4031   // steal tasks.
4032   if (__kmp_tasking_mode != tskm_immediate_exec) {
4033     __kmp_wait_to_unref_task_teams();
4034   }
4035 
4036 #if KMP_OS_WINDOWS
4037   /* Close Handle of root duplicated in __kmp_create_worker (tr #62919) */
4038   KA_TRACE(
4039       10, ("__kmp_reset_root: free handle, th = %p, handle = %" KMP_UINTPTR_SPEC
4040            "\n",
4041            (LPVOID) & (root->r.r_uber_thread->th),
4042            root->r.r_uber_thread->th.th_info.ds.ds_thread));
4043   __kmp_free_handle(root->r.r_uber_thread->th.th_info.ds.ds_thread);
4044 #endif /* KMP_OS_WINDOWS */
4045 
4046 #if OMPD_SUPPORT
4047   if (ompd_state & OMPD_ENABLE_BP)
4048     ompd_bp_thread_end();
4049 #endif
4050 
4051 #if OMPT_SUPPORT
4052   ompt_data_t *task_data;
4053   ompt_data_t *parallel_data;
4054   __ompt_get_task_info_internal(0, NULL, &task_data, NULL, &parallel_data,
4055                                 NULL);
4056   if (ompt_enabled.ompt_callback_implicit_task) {
4057     ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
4058         ompt_scope_end, parallel_data, task_data, 0, 1, ompt_task_initial);
4059   }
4060   if (ompt_enabled.ompt_callback_thread_end) {
4061     ompt_callbacks.ompt_callback(ompt_callback_thread_end)(
4062         &(root->r.r_uber_thread->th.ompt_thread_info.thread_data));
4063   }
4064 #endif
4065 
4066   TCW_4(__kmp_nth,
4067         __kmp_nth - 1); // __kmp_reap_thread will decrement __kmp_all_nth.
4068   i = root->r.r_uber_thread->th.th_cg_roots->cg_nthreads--;
4069   KA_TRACE(100, ("__kmp_reset_root: Thread %p decrement cg_nthreads on node %p"
4070                  " to %d\n",
4071                  root->r.r_uber_thread, root->r.r_uber_thread->th.th_cg_roots,
4072                  root->r.r_uber_thread->th.th_cg_roots->cg_nthreads));
4073   if (i == 1) {
4074     // need to free contention group structure
4075     KMP_DEBUG_ASSERT(root->r.r_uber_thread ==
4076                      root->r.r_uber_thread->th.th_cg_roots->cg_root);
4077     KMP_DEBUG_ASSERT(root->r.r_uber_thread->th.th_cg_roots->up == NULL);
4078     __kmp_free(root->r.r_uber_thread->th.th_cg_roots);
4079     root->r.r_uber_thread->th.th_cg_roots = NULL;
4080   }
4081   __kmp_reap_thread(root->r.r_uber_thread, 1);
4082 
4083   // We cannot put the root thread into __kmp_thread_pool, so we have to reap
4084   // it instead of freeing it.
4085   root->r.r_uber_thread = NULL;
4086   /* mark root as no longer in use */
4087   root->r.r_begin = FALSE;
4088 
4089   return n;
4090 }
4091 
4092 void __kmp_unregister_root_current_thread(int gtid) {
4093   KA_TRACE(1, ("__kmp_unregister_root_current_thread: enter T#%d\n", gtid));
4094   /* This lock should be OK, since unregister_root_current_thread is never
4095      called during an abort, only during a normal close. Furthermore, if you
4096      hold the forkjoin lock, you should never try to get the initz lock. */
4097   __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
4098   if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
4099     KC_TRACE(10, ("__kmp_unregister_root_current_thread: already finished, "
4100                   "exiting T#%d\n",
4101                   gtid));
4102     __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
4103     return;
4104   }
4105   kmp_root_t *root = __kmp_root[gtid];
4106 
4107   KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
4108   KMP_ASSERT(KMP_UBER_GTID(gtid));
4109   KMP_ASSERT(root == __kmp_threads[gtid]->th.th_root);
4110   KMP_ASSERT(root->r.r_active == FALSE);
4111 
4112   KMP_MB();
4113 
4114   kmp_info_t *thread = __kmp_threads[gtid];
4115   kmp_team_t *team = thread->th.th_team;
4116   kmp_task_team_t *task_team = thread->th.th_task_team;
4117 
4118   // we need to wait for the proxy tasks before finishing the thread
4119   if (task_team != NULL && (task_team->tt.tt_found_proxy_tasks ||
4120                             task_team->tt.tt_hidden_helper_task_encountered)) {
4121 #if OMPT_SUPPORT
4122     // the runtime is shutting down so we won't report any events
4123     thread->th.ompt_thread_info.state = ompt_state_undefined;
4124 #endif
4125     __kmp_task_team_wait(thread, team USE_ITT_BUILD_ARG(NULL));
4126   }
4127 
4128   __kmp_reset_root(gtid, root);
4129 
4130   KMP_MB();
4131   KC_TRACE(10,
4132            ("__kmp_unregister_root_current_thread: T#%d unregistered\n", gtid));
4133 
4134   __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
4135 }
4136 
4137 #if KMP_OS_WINDOWS
4138 /* __kmp_forkjoin_lock must already be held.
4139    Unregisters a root thread that is not the current thread.  Returns the number
4140    of __kmp_threads entries freed as a result. */
4141 static int __kmp_unregister_root_other_thread(int gtid) {
4142   kmp_root_t *root = __kmp_root[gtid];
4143   int r;
4144 
4145   KA_TRACE(1, ("__kmp_unregister_root_other_thread: enter T#%d\n", gtid));
4146   KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
4147   KMP_ASSERT(KMP_UBER_GTID(gtid));
4148   KMP_ASSERT(root == __kmp_threads[gtid]->th.th_root);
4149   KMP_ASSERT(root->r.r_active == FALSE);
4150 
4151   r = __kmp_reset_root(gtid, root);
4152   KC_TRACE(10,
4153            ("__kmp_unregister_root_other_thread: T#%d unregistered\n", gtid));
4154   return r;
4155 }
4156 #endif
4157 
4158 #if KMP_DEBUG
4159 void __kmp_task_info() {
4160 
4161   kmp_int32 gtid = __kmp_entry_gtid();
4162   kmp_int32 tid = __kmp_tid_from_gtid(gtid);
4163   kmp_info_t *this_thr = __kmp_threads[gtid];
4164   kmp_team_t *steam = this_thr->th.th_serial_team;
4165   kmp_team_t *team = this_thr->th.th_team;
4166 
4167   __kmp_printf(
4168       "__kmp_task_info: gtid=%d tid=%d t_thread=%p team=%p steam=%p curtask=%p "
4169       "ptask=%p\n",
4170       gtid, tid, this_thr, team, steam, this_thr->th.th_current_task,
4171       team->t.t_implicit_task_taskdata[tid].td_parent);
4172 }
4173 #endif // KMP_DEBUG
4174 
4175 /* TODO optimize with one big memclr, take out what isn't needed, split
4176    responsibility to workers as much as possible, and delay initialization of
4177    features as much as possible  */
4178 static void __kmp_initialize_info(kmp_info_t *this_thr, kmp_team_t *team,
4179                                   int tid, int gtid) {
4180   /* this_thr->th.th_info.ds.ds_gtid is set up in
4181      kmp_allocate_thread/create_worker.
4182      this_thr->th.th_serial_team is set up in __kmp_allocate_thread */
4183   KMP_DEBUG_ASSERT(this_thr != NULL);
4184   KMP_DEBUG_ASSERT(this_thr->th.th_serial_team);
4185   KMP_DEBUG_ASSERT(team);
4186   KMP_DEBUG_ASSERT(team->t.t_threads);
4187   KMP_DEBUG_ASSERT(team->t.t_dispatch);
4188   kmp_info_t *master = team->t.t_threads[0];
4189   KMP_DEBUG_ASSERT(master);
4190   KMP_DEBUG_ASSERT(master->th.th_root);
4191 
4192   KMP_MB();
4193 
4194   TCW_SYNC_PTR(this_thr->th.th_team, team);
4195 
4196   this_thr->th.th_info.ds.ds_tid = tid;
4197   this_thr->th.th_set_nproc = 0;
4198   if (__kmp_tasking_mode != tskm_immediate_exec)
4199     // When tasking is possible, threads are not safe to reap until they are done
4200     // tasking; this flag is set when the tasking code is exited in the wait routine
4201     this_thr->th.th_reap_state = KMP_NOT_SAFE_TO_REAP;
4202   else // no tasking --> always safe to reap
4203     this_thr->th.th_reap_state = KMP_SAFE_TO_REAP;
4204   this_thr->th.th_set_proc_bind = proc_bind_default;
4205 #if KMP_AFFINITY_SUPPORTED
4206   this_thr->th.th_new_place = this_thr->th.th_current_place;
4207 #endif
4208   this_thr->th.th_root = master->th.th_root;
4209 
4210   /* setup the thread's cache of the team structure */
4211   this_thr->th.th_team_nproc = team->t.t_nproc;
4212   this_thr->th.th_team_master = master;
4213   this_thr->th.th_team_serialized = team->t.t_serialized;
4214 
4215   KMP_DEBUG_ASSERT(team->t.t_implicit_task_taskdata);
4216 
4217   KF_TRACE(10, ("__kmp_initialize_info1: T#%d:%d this_thread=%p curtask=%p\n",
4218                 tid, gtid, this_thr, this_thr->th.th_current_task));
4219 
4220   __kmp_init_implicit_task(this_thr->th.th_team_master->th.th_ident, this_thr,
4221                            team, tid, TRUE);
4222 
4223   KF_TRACE(10, ("__kmp_initialize_info2: T#%d:%d this_thread=%p curtask=%p\n",
4224                 tid, gtid, this_thr, this_thr->th.th_current_task));
4225   // TODO: Initialize ICVs from parent; GEH - isn't that already done in
4226   // __kmp_initialize_team()?
4227 
4228   /* TODO no worksharing in speculative threads */
4229   this_thr->th.th_dispatch = &team->t.t_dispatch[tid];
4230 
4231   this_thr->th.th_local.this_construct = 0;
4232 
4233   if (!this_thr->th.th_pri_common) {
4234     this_thr->th.th_pri_common =
4235         (struct common_table *)__kmp_allocate(sizeof(struct common_table));
4236     if (__kmp_storage_map) {
4237       __kmp_print_storage_map_gtid(
4238           gtid, this_thr->th.th_pri_common, this_thr->th.th_pri_common + 1,
4239           sizeof(struct common_table), "th_%d.th_pri_common\n", gtid);
4240     }
4241     this_thr->th.th_pri_head = NULL;
4242   }
4243 
4244   if (this_thr != master && // Primary thread's CG root is initialized elsewhere
4245       this_thr->th.th_cg_roots != master->th.th_cg_roots) { // CG root not set
4246     // Make new thread's CG root same as primary thread's
4247     KMP_DEBUG_ASSERT(master->th.th_cg_roots);
4248     kmp_cg_root_t *tmp = this_thr->th.th_cg_roots;
4249     if (tmp) {
4250       // worker changes CG, need to check if old CG should be freed
4251       int i = tmp->cg_nthreads--;
4252       KA_TRACE(100, ("__kmp_initialize_info: Thread %p decrement cg_nthreads"
4253                      " on node %p of thread %p to %d\n",
4254                      this_thr, tmp, tmp->cg_root, tmp->cg_nthreads));
4255       if (i == 1) {
4256         __kmp_free(tmp); // last thread left CG --> free it
4257       }
4258     }
4259     this_thr->th.th_cg_roots = master->th.th_cg_roots;
4260     // Increment new thread's CG root's counter to add the new thread
4261     this_thr->th.th_cg_roots->cg_nthreads++;
4262     KA_TRACE(100, ("__kmp_initialize_info: Thread %p increment cg_nthreads on"
4263                    " node %p of thread %p to %d\n",
4264                    this_thr, this_thr->th.th_cg_roots,
4265                    this_thr->th.th_cg_roots->cg_root,
4266                    this_thr->th.th_cg_roots->cg_nthreads));
4267     this_thr->th.th_current_task->td_icvs.thread_limit =
4268         this_thr->th.th_cg_roots->cg_thread_limit;
4269   }
4270 
4271   /* Initialize dynamic dispatch */
4272   {
4273     volatile kmp_disp_t *dispatch = this_thr->th.th_dispatch;
4274     // Use team max_nproc since this will never change for the team.
4275     size_t disp_size =
4276         sizeof(dispatch_private_info_t) *
4277         (team->t.t_max_nproc == 1 ? 1 : __kmp_dispatch_num_buffers);
4278     KD_TRACE(10, ("__kmp_initialize_info: T#%d max_nproc: %d\n", gtid,
4279                   team->t.t_max_nproc));
4280     KMP_ASSERT(dispatch);
4281     KMP_DEBUG_ASSERT(team->t.t_dispatch);
4282     KMP_DEBUG_ASSERT(dispatch == &team->t.t_dispatch[tid]);
4283 
4284     dispatch->th_disp_index = 0;
4285     dispatch->th_doacross_buf_idx = 0;
4286     if (!dispatch->th_disp_buffer) {
4287       dispatch->th_disp_buffer =
4288           (dispatch_private_info_t *)__kmp_allocate(disp_size);
4289 
4290       if (__kmp_storage_map) {
4291         __kmp_print_storage_map_gtid(
4292             gtid, &dispatch->th_disp_buffer[0],
4293             &dispatch->th_disp_buffer[team->t.t_max_nproc == 1
4294                                           ? 1
4295                                           : __kmp_dispatch_num_buffers],
4296             disp_size,
4297             "th_%d.th_dispatch.th_disp_buffer "
4298             "(team_%d.t_dispatch[%d].th_disp_buffer)",
4299             gtid, team->t.t_id, gtid);
4300       }
4301     } else {
4302       memset(&dispatch->th_disp_buffer[0], '\0', disp_size);
4303     }
4304 
4305     dispatch->th_dispatch_pr_current = 0;
4306     dispatch->th_dispatch_sh_current = 0;
4307 
4308     dispatch->th_deo_fcn = 0; /* ORDERED     */
4309     dispatch->th_dxo_fcn = 0; /* END ORDERED */
4310   }
4311 
4312   this_thr->th.th_next_pool = NULL;
4313 
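       // Lazily allocate the task-state memo stack, which is used to save and
       // restore th_task_state across nested parallel (hot team) levels; it
       // starts with room for 4 entries and is zero-initialized.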
4314   if (!this_thr->th.th_task_state_memo_stack) {
4315     size_t i;
4316     this_thr->th.th_task_state_memo_stack =
4317         (kmp_uint8 *)__kmp_allocate(4 * sizeof(kmp_uint8));
4318     this_thr->th.th_task_state_top = 0;
4319     this_thr->th.th_task_state_stack_sz = 4;
4320     for (i = 0; i < this_thr->th.th_task_state_stack_sz;
4321          ++i) // zero init the stack
4322       this_thr->th.th_task_state_memo_stack[i] = 0;
4323   }
4324 
4325   KMP_DEBUG_ASSERT(!this_thr->th.th_spin_here);
4326   KMP_DEBUG_ASSERT(this_thr->th.th_next_waiting == 0);
4327 
4328   KMP_MB();
4329 }
4330 
4331 /* Allocate a new thread for the requesting team. This is only called from
4332    within a forkjoin critical section. We first try to get an available thread
4333    from the thread pool. If none is available, we fork a new one, assuming we
4334    are able to create one; this should be assured, as the caller is expected to
4335    check that first. */
4336 kmp_info_t *__kmp_allocate_thread(kmp_root_t *root, kmp_team_t *team,
4337                                   int new_tid) {
4338   kmp_team_t *serial_team;
4339   kmp_info_t *new_thr;
4340   int new_gtid;
4341 
4342   KA_TRACE(20, ("__kmp_allocate_thread: T#%d\n", __kmp_get_gtid()));
4343   KMP_DEBUG_ASSERT(root && team);
4344 #if !KMP_NESTED_HOT_TEAMS
4345   KMP_DEBUG_ASSERT(KMP_MASTER_GTID(__kmp_get_gtid()));
4346 #endif
4347   KMP_MB();
4348 
4349   /* first, try to get one from the thread pool */
4350   if (__kmp_thread_pool) {
4351     new_thr = CCAST(kmp_info_t *, __kmp_thread_pool);
4352     __kmp_thread_pool = (volatile kmp_info_t *)new_thr->th.th_next_pool;
4353     if (new_thr == __kmp_thread_pool_insert_pt) {
4354       __kmp_thread_pool_insert_pt = NULL;
4355     }
4356     TCW_4(new_thr->th.th_in_pool, FALSE);
4357     __kmp_suspend_initialize_thread(new_thr);
4358     __kmp_lock_suspend_mx(new_thr);
4359     if (new_thr->th.th_active_in_pool == TRUE) {
4360       KMP_DEBUG_ASSERT(new_thr->th.th_active == TRUE);
4361       KMP_ATOMIC_DEC(&__kmp_thread_pool_active_nth);
4362       new_thr->th.th_active_in_pool = FALSE;
4363     }
4364     __kmp_unlock_suspend_mx(new_thr);
4365 
4366     KA_TRACE(20, ("__kmp_allocate_thread: T#%d using thread T#%d\n",
4367                   __kmp_get_gtid(), new_thr->th.th_info.ds.ds_gtid));
4368     KMP_ASSERT(!new_thr->th.th_team);
4369     KMP_DEBUG_ASSERT(__kmp_nth < __kmp_threads_capacity);
4370 
4371     /* setup the thread structure */
4372     __kmp_initialize_info(new_thr, team, new_tid,
4373                           new_thr->th.th_info.ds.ds_gtid);
4374     KMP_DEBUG_ASSERT(new_thr->th.th_serial_team);
4375 
4376     TCW_4(__kmp_nth, __kmp_nth + 1);
4377 
4378     new_thr->th.th_task_state = 0;
4379     new_thr->th.th_task_state_top = 0;
4380     new_thr->th.th_task_state_stack_sz = 4;
4381 
4382     if (__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
4383       // Make sure pool thread has transitioned to waiting on own thread struct
4384       KMP_DEBUG_ASSERT(new_thr->th.th_used_in_team.load() == 0);
4385       // Thread activated in __kmp_allocate_team when increasing team size
4386     }
4387 
4388 #ifdef KMP_ADJUST_BLOCKTIME
4389     /* Adjust blocktime back to zero if necessary */
4390     /* Middle initialization might not have occurred yet */
4391     if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
4392       if (__kmp_nth > __kmp_avail_proc) {
4393         __kmp_zero_bt = TRUE;
4394       }
4395     }
4396 #endif /* KMP_ADJUST_BLOCKTIME */
4397 
4398 #if KMP_DEBUG
4399     // If the thread entered the pool via __kmp_free_thread, wait_flag should
4400     // not equal KMP_BARRIER_PARENT_FLAG.
4401     int b;
4402     kmp_balign_t *balign = new_thr->th.th_bar;
4403     for (b = 0; b < bs_last_barrier; ++b)
4404       KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
4405 #endif
4406 
4407     KF_TRACE(10, ("__kmp_allocate_thread: T#%d using thread %p T#%d\n",
4408                   __kmp_get_gtid(), new_thr, new_thr->th.th_info.ds.ds_gtid));
4409 
4410     KMP_MB();
4411     return new_thr;
4412   }
4413 
4414   /* no, we'll fork a new one */
4415   KMP_ASSERT(__kmp_nth == __kmp_all_nth);
4416   KMP_ASSERT(__kmp_all_nth < __kmp_threads_capacity);
4417 
4418 #if KMP_USE_MONITOR
4419   // If this is the first worker thread the RTL is creating, then also
4420   // launch the monitor thread.  We try to do this as early as possible.
4421   if (!TCR_4(__kmp_init_monitor)) {
4422     __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
4423     if (!TCR_4(__kmp_init_monitor)) {
4424       KF_TRACE(10, ("before __kmp_create_monitor\n"));
4425       TCW_4(__kmp_init_monitor, 1);
4426       __kmp_create_monitor(&__kmp_monitor);
4427       KF_TRACE(10, ("after __kmp_create_monitor\n"));
4428 #if KMP_OS_WINDOWS
4429       // AC: wait until the monitor has started. This is a fix for CQ232808.
4430       // The reason is that if the library is loaded/unloaded in a loop with
4431       // small (parallel) work in between, then there is a high probability that
4432       // the monitor thread starts after the library has shut down. At shutdown
4433       // it is too late to cope with the problem, because when the primary thread
4434       // is in DllMain (process detach) the monitor has no chance to start (it is
4435       // blocked), and the primary thread has no means to inform the monitor that
4436       // the library has gone, because all the memory which the monitor can
4437       // access is going to be released/reset.
4438       while (TCR_4(__kmp_init_monitor) < 2) {
4439         KMP_YIELD(TRUE);
4440       }
4441       KF_TRACE(10, ("after monitor thread has started\n"));
4442 #endif
4443     }
4444     __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
4445   }
4446 #endif
4447 
4448   KMP_MB();
4449 
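       // Find the lowest free gtid slot for the new thread. Hidden helper
       // threads occupy the low gtids, so while they are being initialized the
       // search starts at gtid 1; otherwise regular threads are placed after
       // the hidden helper range.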
4450   {
4451     int new_start_gtid = TCR_4(__kmp_init_hidden_helper_threads)
4452                              ? 1
4453                              : __kmp_hidden_helper_threads_num + 1;
4454 
4455     for (new_gtid = new_start_gtid; TCR_PTR(__kmp_threads[new_gtid]) != NULL;
4456          ++new_gtid) {
4457       KMP_DEBUG_ASSERT(new_gtid < __kmp_threads_capacity);
4458     }
4459 
4460     if (TCR_4(__kmp_init_hidden_helper_threads)) {
4461       KMP_DEBUG_ASSERT(new_gtid <= __kmp_hidden_helper_threads_num);
4462     }
4463   }
4464 
4465   /* allocate space for it. */
4466   new_thr = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t));
4467 
4468   TCW_SYNC_PTR(__kmp_threads[new_gtid], new_thr);
4469 
4470 #if USE_ITT_BUILD && USE_ITT_NOTIFY && KMP_DEBUG
4471   // Suppress race condition detection on synchronization flags in debug mode;
4472   // this helps to analyze library internals by eliminating false positives.
4473   __itt_suppress_mark_range(
4474       __itt_suppress_range, __itt_suppress_threading_errors,
4475       &new_thr->th.th_sleep_loc, sizeof(new_thr->th.th_sleep_loc));
4476   __itt_suppress_mark_range(
4477       __itt_suppress_range, __itt_suppress_threading_errors,
4478       &new_thr->th.th_reap_state, sizeof(new_thr->th.th_reap_state));
4479 #if KMP_OS_WINDOWS
4480   __itt_suppress_mark_range(
4481       __itt_suppress_range, __itt_suppress_threading_errors,
4482       &new_thr->th.th_suspend_init, sizeof(new_thr->th.th_suspend_init));
4483 #else
4484   __itt_suppress_mark_range(__itt_suppress_range,
4485                             __itt_suppress_threading_errors,
4486                             &new_thr->th.th_suspend_init_count,
4487                             sizeof(new_thr->th.th_suspend_init_count));
4488 #endif
4489   // TODO: check if we need to also suppress b_arrived flags
4490   __itt_suppress_mark_range(__itt_suppress_range,
4491                             __itt_suppress_threading_errors,
4492                             CCAST(kmp_uint64 *, &new_thr->th.th_bar[0].bb.b_go),
4493                             sizeof(new_thr->th.th_bar[0].bb.b_go));
4494   __itt_suppress_mark_range(__itt_suppress_range,
4495                             __itt_suppress_threading_errors,
4496                             CCAST(kmp_uint64 *, &new_thr->th.th_bar[1].bb.b_go),
4497                             sizeof(new_thr->th.th_bar[1].bb.b_go));
4498   __itt_suppress_mark_range(__itt_suppress_range,
4499                             __itt_suppress_threading_errors,
4500                             CCAST(kmp_uint64 *, &new_thr->th.th_bar[2].bb.b_go),
4501                             sizeof(new_thr->th.th_bar[2].bb.b_go));
4502 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY && KMP_DEBUG */
4503   if (__kmp_storage_map) {
4504     __kmp_print_thread_storage_map(new_thr, new_gtid);
4505   }
4506 
4507   // add the reserve serialized team, initialized from the team's primary thread
4508   {
4509     kmp_internal_control_t r_icvs = __kmp_get_x_global_icvs(team);
4510     KF_TRACE(10, ("__kmp_allocate_thread: before th_serial/serial_team\n"));
4511     new_thr->th.th_serial_team = serial_team =
4512         (kmp_team_t *)__kmp_allocate_team(root, 1, 1,
4513 #if OMPT_SUPPORT
4514                                           ompt_data_none, // root parallel id
4515 #endif
4516                                           proc_bind_default, &r_icvs,
4517                                           0 USE_NESTED_HOT_ARG(NULL));
4518   }
4519   KMP_ASSERT(serial_team);
4520   serial_team->t.t_serialized = 0; // AC: the team is created in reserve, not for
4521   // execution (it is unused for now).
4522   serial_team->t.t_threads[0] = new_thr;
4523   KF_TRACE(10,
4524            ("__kmp_allocate_thread: after th_serial/serial_team : new_thr=%p\n",
4525             new_thr));
4526 
4527   /* setup the thread structures */
4528   __kmp_initialize_info(new_thr, team, new_tid, new_gtid);
4529 
4530 #if USE_FAST_MEMORY
4531   __kmp_initialize_fast_memory(new_thr);
4532 #endif /* USE_FAST_MEMORY */
4533 
4534 #if KMP_USE_BGET
4535   KMP_DEBUG_ASSERT(new_thr->th.th_local.bget_data == NULL);
4536   __kmp_initialize_bget(new_thr);
4537 #endif
4538 
4539   __kmp_init_random(new_thr); // Initialize random number generator
4540 
4541   /* Initialize these only once when thread is grabbed for a team allocation */
4542   KA_TRACE(20,
4543            ("__kmp_allocate_thread: T#%d init go fork=%u, plain=%u\n",
4544             __kmp_get_gtid(), KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
4545 
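       // Reset the new thread's per-barrier bookkeeping (go flag, team pointer,
       // wait flag) to the initial state for every barrier type.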
4546   int b;
4547   kmp_balign_t *balign = new_thr->th.th_bar;
4548   for (b = 0; b < bs_last_barrier; ++b) {
4549     balign[b].bb.b_go = KMP_INIT_BARRIER_STATE;
4550     balign[b].bb.team = NULL;
4551     balign[b].bb.wait_flag = KMP_BARRIER_NOT_WAITING;
4552     balign[b].bb.use_oncore_barrier = 0;
4553   }
4554 
4555   TCW_PTR(new_thr->th.th_sleep_loc, NULL);
4556   new_thr->th.th_sleep_loc_type = flag_unset;
4557 
4558   new_thr->th.th_spin_here = FALSE;
4559   new_thr->th.th_next_waiting = 0;
4560 #if KMP_OS_UNIX
4561   new_thr->th.th_blocking = false;
4562 #endif
4563 
4564 #if KMP_AFFINITY_SUPPORTED
4565   new_thr->th.th_current_place = KMP_PLACE_UNDEFINED;
4566   new_thr->th.th_new_place = KMP_PLACE_UNDEFINED;
4567   new_thr->th.th_first_place = KMP_PLACE_UNDEFINED;
4568   new_thr->th.th_last_place = KMP_PLACE_UNDEFINED;
4569 #endif
4570   new_thr->th.th_def_allocator = __kmp_def_allocator;
4571   new_thr->th.th_prev_level = 0;
4572   new_thr->th.th_prev_num_threads = 1;
4573 
4574   TCW_4(new_thr->th.th_in_pool, FALSE);
4575   new_thr->th.th_active_in_pool = FALSE;
4576   TCW_4(new_thr->th.th_active, TRUE);
4577 
4578   /* adjust the global counters */
4579   __kmp_all_nth++;
4580   __kmp_nth++;
4581 
4582   // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search) for low
4583   // numbers of procs, and method #2 (keyed API call) for higher numbers.
4584   if (__kmp_adjust_gtid_mode) {
4585     if (__kmp_all_nth >= __kmp_tls_gtid_min) {
4586       if (TCR_4(__kmp_gtid_mode) != 2) {
4587         TCW_4(__kmp_gtid_mode, 2);
4588       }
4589     } else {
4590       if (TCR_4(__kmp_gtid_mode) != 1) {
4591         TCW_4(__kmp_gtid_mode, 1);
4592       }
4593     }
4594   }
4595 
4596 #ifdef KMP_ADJUST_BLOCKTIME
4597   /* Adjust blocktime back to zero if necessary       */
4598   /* Middle initialization might not have occurred yet */
4599   if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
4600     if (__kmp_nth > __kmp_avail_proc) {
4601       __kmp_zero_bt = TRUE;
4602     }
4603   }
4604 #endif /* KMP_ADJUST_BLOCKTIME */
4605 
4606   /* actually fork it and create the new worker thread */
4607   KF_TRACE(
4608       10, ("__kmp_allocate_thread: before __kmp_create_worker: %p\n", new_thr));
4609   __kmp_create_worker(new_gtid, new_thr, __kmp_stksize);
4610   KF_TRACE(10,
4611            ("__kmp_allocate_thread: after __kmp_create_worker: %p\n", new_thr));
4612 
4613   KA_TRACE(20, ("__kmp_allocate_thread: T#%d forked T#%d\n", __kmp_get_gtid(),
4614                 new_gtid));
4615   KMP_MB();
4616   return new_thr;
4617 }
4618 
4619 /* Reinitialize a team for reuse.
4620    The hot team code calls this routine at every fork barrier, so the EPCC
4621    barrier tests are extremely sensitive to changes in it, especially writes to
4622    the team struct, which cause a cache invalidation in all threads.
4623    IF YOU TOUCH THIS ROUTINE, RUN EPCC C SYNCBENCH ON A BIG-IRON MACHINE!!! */
4624 static void __kmp_reinitialize_team(kmp_team_t *team,
4625                                     kmp_internal_control_t *new_icvs,
4626                                     ident_t *loc) {
4627   KF_TRACE(10, ("__kmp_reinitialize_team: enter this_thread=%p team=%p\n",
4628                 team->t.t_threads[0], team));
4629   KMP_DEBUG_ASSERT(team && new_icvs);
4630   KMP_DEBUG_ASSERT((!TCR_4(__kmp_init_parallel)) || new_icvs->nproc);
4631   KMP_CHECK_UPDATE(team->t.t_ident, loc);
4632 
4633   KMP_CHECK_UPDATE(team->t.t_id, KMP_GEN_TEAM_ID());
4634   // Copy ICVs to the primary thread's implicit taskdata
4635   __kmp_init_implicit_task(loc, team->t.t_threads[0], team, 0, FALSE);
4636   copy_icvs(&team->t.t_implicit_task_taskdata[0].td_icvs, new_icvs);
4637 
4638   KF_TRACE(10, ("__kmp_reinitialize_team: exit this_thread=%p team=%p\n",
4639                 team->t.t_threads[0], team));
4640 }
4641 
4642 /* Initialize the team data structure.
4643    This assumes the t_threads and t_max_nproc are already set.
4644    Also, we don't touch the arguments */
4645 static void __kmp_initialize_team(kmp_team_t *team, int new_nproc,
4646                                   kmp_internal_control_t *new_icvs,
4647                                   ident_t *loc) {
4648   KF_TRACE(10, ("__kmp_initialize_team: enter: team=%p\n", team));
4649 
4650   /* verify */
4651   KMP_DEBUG_ASSERT(team);
4652   KMP_DEBUG_ASSERT(new_nproc <= team->t.t_max_nproc);
4653   KMP_DEBUG_ASSERT(team->t.t_threads);
4654   KMP_MB();
4655 
4656   team->t.t_master_tid = 0; /* not needed */
4657   /* team->t.t_master_bar;        not needed */
4658   team->t.t_serialized = new_nproc > 1 ? 0 : 1;
4659   team->t.t_nproc = new_nproc;
4660 
4661   /* team->t.t_parent     = NULL; TODO not needed & would mess up hot team */
4662   team->t.t_next_pool = NULL;
4663   /* memset( team->t.t_threads, 0, sizeof(kmp_info_t*)*new_nproc ); would mess
4664    * up hot team */
4665 
4666   TCW_SYNC_PTR(team->t.t_pkfn, NULL); /* not needed */
4667   team->t.t_invoke = NULL; /* not needed */
4668 
4669   // TODO???: team->t.t_max_active_levels       = new_max_active_levels;
4670   team->t.t_sched.sched = new_icvs->sched.sched;
4671 
4672 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
4673   team->t.t_fp_control_saved = FALSE; /* not needed */
4674   team->t.t_x87_fpu_control_word = 0; /* not needed */
4675   team->t.t_mxcsr = 0; /* not needed */
4676 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
4677 
4678   team->t.t_construct = 0;
4679 
4680   team->t.t_ordered.dt.t_value = 0;
4681   team->t.t_master_active = FALSE;
4682 
4683 #ifdef KMP_DEBUG
4684   team->t.t_copypriv_data = NULL; /* not necessary, but nice for debugging */
4685 #endif
4686 #if KMP_OS_WINDOWS
4687   team->t.t_copyin_counter = 0; /* for barrier-free copyin implementation */
4688 #endif
4689 
4690   team->t.t_control_stack_top = NULL;
4691 
4692   __kmp_reinitialize_team(team, new_icvs, loc);
4693 
4694   KMP_MB();
4695   KF_TRACE(10, ("__kmp_initialize_team: exit: team=%p\n", team));
4696 }
4697 
4698 #if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED
4699 /* Sets the full mask for the thread, saving the old mask; no changes to structures. */
4700 static void
4701 __kmp_set_thread_affinity_mask_full_tmp(kmp_affin_mask_t *old_mask) {
4702   if (KMP_AFFINITY_CAPABLE()) {
4703     int status;
4704     if (old_mask != NULL) {
4705       status = __kmp_get_system_affinity(old_mask, TRUE);
4706       int error = errno;
4707       if (status != 0) {
4708         __kmp_fatal(KMP_MSG(ChangeThreadAffMaskError), KMP_ERR(error),
4709                     __kmp_msg_null);
4710       }
4711     }
4712     __kmp_set_system_affinity(__kmp_affin_fullMask, TRUE);
4713   }
4714 }
4715 #endif
4716 
4717 #if KMP_AFFINITY_SUPPORTED
4718 
4719 // __kmp_partition_places() is the heart of the OpenMP 4.0 affinity mechanism.
4720 // It calculates the worker + primary thread's partition based upon the parent
4721 // thread's partition, and binds each worker to a place in its partition.
4722 // The primary thread's partition should already include its current binding.
4723 static void __kmp_partition_places(kmp_team_t *team, int update_master_only) {
4724   // Do not partition places for the hidden helper team
4725   if (KMP_HIDDEN_HELPER_TEAM(team))
4726     return;
4727   // Copy the primary thread's place partition to the team struct
4728   kmp_info_t *master_th = team->t.t_threads[0];
4729   KMP_DEBUG_ASSERT(master_th != NULL);
4730   kmp_proc_bind_t proc_bind = team->t.t_proc_bind;
4731   int first_place = master_th->th.th_first_place;
4732   int last_place = master_th->th.th_last_place;
4733   int masters_place = master_th->th.th_current_place;
4734   team->t.t_first_place = first_place;
4735   team->t.t_last_place = last_place;
4736 
4737   KA_TRACE(20, ("__kmp_partition_places: enter: proc_bind = %d T#%d(%d:0) "
4738                 "bound to place %d partition = [%d,%d]\n",
4739                 proc_bind, __kmp_gtid_from_thread(team->t.t_threads[0]),
4740                 team->t.t_id, masters_place, first_place, last_place));
4741 
4742   switch (proc_bind) {
4743 
4744   case proc_bind_default:
4745     // Serial teams might have the proc_bind policy set to proc_bind_default.
4746     // Not an issue -- we don't rebind primary thread for any proc_bind policy.
4747     KMP_DEBUG_ASSERT(team->t.t_nproc == 1);
4748     break;
4749 
4750   case proc_bind_primary: {
4751     int f;
4752     int n_th = team->t.t_nproc;
4753     for (f = 1; f < n_th; f++) {
4754       kmp_info_t *th = team->t.t_threads[f];
4755       KMP_DEBUG_ASSERT(th != NULL);
4756       th->th.th_first_place = first_place;
4757       th->th.th_last_place = last_place;
4758       th->th.th_new_place = masters_place;
4759       if (__kmp_display_affinity && masters_place != th->th.th_current_place &&
4760           team->t.t_display_affinity != 1) {
4761         team->t.t_display_affinity = 1;
4762       }
4763 
4764       KA_TRACE(100, ("__kmp_partition_places: primary: T#%d(%d:%d) place %d "
4765                      "partition = [%d,%d]\n",
4766                      __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id,
4767                      f, masters_place, first_place, last_place));
4768     }
4769   } break;
4770 
4771   case proc_bind_close: {
4772     int f;
4773     int n_th = team->t.t_nproc;
4774     int n_places;
4775     if (first_place <= last_place) {
4776       n_places = last_place - first_place + 1;
4777     } else {
4778       n_places = __kmp_affinity_num_masks - first_place + last_place + 1;
4779     }
4780     if (n_th <= n_places) {
4781       int place = masters_place;
4782       for (f = 1; f < n_th; f++) {
4783         kmp_info_t *th = team->t.t_threads[f];
4784         KMP_DEBUG_ASSERT(th != NULL);
4785 
4786         if (place == last_place) {
4787           place = first_place;
4788         } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4789           place = 0;
4790         } else {
4791           place++;
4792         }
4793         th->th.th_first_place = first_place;
4794         th->th.th_last_place = last_place;
4795         th->th.th_new_place = place;
4796         if (__kmp_display_affinity && place != th->th.th_current_place &&
4797             team->t.t_display_affinity != 1) {
4798           team->t.t_display_affinity = 1;
4799         }
4800 
4801         KA_TRACE(100, ("__kmp_partition_places: close: T#%d(%d:%d) place %d "
4802                        "partition = [%d,%d]\n",
4803                        __kmp_gtid_from_thread(team->t.t_threads[f]),
4804                        team->t.t_id, f, place, first_place, last_place));
4805       }
4806     } else {
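           // More threads than places: distribute the threads over the places as
           // evenly as possible. Each place gets S = n_th/n_places threads, and
           // the rem leftover threads are spread out, one extra thread per place,
           // every 'gap' places. For example (an illustrative case, not from the
           // source): n_th=10, n_places=4 gives S=2, rem=2, gap=2, so starting
           // from the primary thread's place the places receive 3, 2, 3 and 2
           // threads.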
4807       int S, rem, gap, s_count;
4808       S = n_th / n_places;
4809       s_count = 0;
4810       rem = n_th - (S * n_places);
4811       gap = rem > 0 ? n_places / rem : n_places;
4812       int place = masters_place;
4813       int gap_ct = gap;
4814       for (f = 0; f < n_th; f++) {
4815         kmp_info_t *th = team->t.t_threads[f];
4816         KMP_DEBUG_ASSERT(th != NULL);
4817 
4818         th->th.th_first_place = first_place;
4819         th->th.th_last_place = last_place;
4820         th->th.th_new_place = place;
4821         if (__kmp_display_affinity && place != th->th.th_current_place &&
4822             team->t.t_display_affinity != 1) {
4823           team->t.t_display_affinity = 1;
4824         }
4825         s_count++;
4826 
4827         if ((s_count == S) && rem && (gap_ct == gap)) {
4828           // do nothing, add an extra thread to place on next iteration
4829         } else if ((s_count == S + 1) && rem && (gap_ct == gap)) {
4830           // we added an extra thread to this place; move to next place
4831           if (place == last_place) {
4832             place = first_place;
4833           } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4834             place = 0;
4835           } else {
4836             place++;
4837           }
4838           s_count = 0;
4839           gap_ct = 1;
4840           rem--;
4841         } else if (s_count == S) { // place full; don't add extra
4842           if (place == last_place) {
4843             place = first_place;
4844           } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4845             place = 0;
4846           } else {
4847             place++;
4848           }
4849           gap_ct++;
4850           s_count = 0;
4851         }
4852 
4853         KA_TRACE(100,
4854                  ("__kmp_partition_places: close: T#%d(%d:%d) place %d "
4855                   "partition = [%d,%d]\n",
4856                   __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id, f,
4857                   th->th.th_new_place, first_place, last_place));
4858       }
4859       KMP_DEBUG_ASSERT(place == masters_place);
4860     }
4861   } break;
4862 
4863   case proc_bind_spread: {
4864     int f;
4865     int n_th = team->t.t_nproc;
4866     int n_places;
4867     int thidx;
4868     if (first_place <= last_place) {
4869       n_places = last_place - first_place + 1;
4870     } else {
4871       n_places = __kmp_affinity_num_masks - first_place + last_place + 1;
4872     }
4873     if (n_th <= n_places) {
4874       int place = -1;
4875 
4876       if (n_places != static_cast<int>(__kmp_affinity_num_masks)) {
4877         int S = n_places / n_th;
4878         int s_count, rem, gap, gap_ct;
4879 
4880         place = masters_place;
4881         rem = n_places - n_th * S;
4882         gap = rem ? n_th / rem : 1;
4883         gap_ct = gap;
4884         thidx = n_th;
4885         if (update_master_only == 1)
4886           thidx = 1;
4887         for (f = 0; f < thidx; f++) {
4888           kmp_info_t *th = team->t.t_threads[f];
4889           KMP_DEBUG_ASSERT(th != NULL);
4890 
4891           th->th.th_first_place = place;
4892           th->th.th_new_place = place;
4893           if (__kmp_display_affinity && place != th->th.th_current_place &&
4894               team->t.t_display_affinity != 1) {
4895             team->t.t_display_affinity = 1;
4896           }
4897           s_count = 1;
4898           while (s_count < S) {
4899             if (place == last_place) {
4900               place = first_place;
4901             } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4902               place = 0;
4903             } else {
4904               place++;
4905             }
4906             s_count++;
4907           }
4908           if (rem && (gap_ct == gap)) {
4909             if (place == last_place) {
4910               place = first_place;
4911             } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4912               place = 0;
4913             } else {
4914               place++;
4915             }
4916             rem--;
4917             gap_ct = 0;
4918           }
4919           th->th.th_last_place = place;
4920           gap_ct++;
4921 
4922           if (place == last_place) {
4923             place = first_place;
4924           } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4925             place = 0;
4926           } else {
4927             place++;
4928           }
4929 
4930           KA_TRACE(100,
4931                    ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
4932                     "partition = [%d,%d], __kmp_affinity_num_masks: %u\n",
4933                     __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id,
4934                     f, th->th.th_new_place, th->th.th_first_place,
4935                     th->th.th_last_place, __kmp_affinity_num_masks));
4936         }
4937       } else {
4938         /* Given a uniform space of available computation places, we can create
4939            T partitions of roughly P/T places each and put a thread into the
4940            first place of each partition. */
4941         double current = static_cast<double>(masters_place);
4942         double spacing =
4943             (static_cast<double>(n_places + 1) / static_cast<double>(n_th));
4944         int first, last;
4945         kmp_info_t *th;
4946 
4947         thidx = n_th + 1;
4948         if (update_master_only == 1)
4949           thidx = 1;
4950         for (f = 0; f < thidx; f++) {
4951           first = static_cast<int>(current);
4952           last = static_cast<int>(current + spacing) - 1;
4953           KMP_DEBUG_ASSERT(last >= first);
4954           if (first >= n_places) {
4955             if (masters_place) {
4956               first -= n_places;
4957               last -= n_places;
4958               if (first == (masters_place + 1)) {
4959                 KMP_DEBUG_ASSERT(f == n_th);
4960                 first--;
4961               }
4962               if (last == masters_place) {
4963                 KMP_DEBUG_ASSERT(f == (n_th - 1));
4964                 last--;
4965               }
4966             } else {
4967               KMP_DEBUG_ASSERT(f == n_th);
4968               first = 0;
4969               last = 0;
4970             }
4971           }
4972           if (last >= n_places) {
4973             last = (n_places - 1);
4974           }
4975           place = first;
4976           current += spacing;
4977           if (f < n_th) {
4978             KMP_DEBUG_ASSERT(0 <= first);
4979             KMP_DEBUG_ASSERT(n_places > first);
4980             KMP_DEBUG_ASSERT(0 <= last);
4981             KMP_DEBUG_ASSERT(n_places > last);
4982             KMP_DEBUG_ASSERT(last_place >= first_place);
4983             th = team->t.t_threads[f];
4984             KMP_DEBUG_ASSERT(th);
4985             th->th.th_first_place = first;
4986             th->th.th_new_place = place;
4987             th->th.th_last_place = last;
4988             if (__kmp_display_affinity && place != th->th.th_current_place &&
4989                 team->t.t_display_affinity != 1) {
4990               team->t.t_display_affinity = 1;
4991             }
4992             KA_TRACE(100,
4993                      ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
4994                       "partition = [%d,%d], spacing = %.4f\n",
4995                       __kmp_gtid_from_thread(team->t.t_threads[f]),
4996                       team->t.t_id, f, th->th.th_new_place,
4997                       th->th.th_first_place, th->th.th_last_place, spacing));
4998           }
4999         }
5000       }
5001       KMP_DEBUG_ASSERT(update_master_only || place == masters_place);
5002     } else {
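           // More threads than places: as in the proc_bind_close case above,
           // places receive S or S+1 threads apiece, with the extras spread every
           // 'gap' places; here, in addition, each thread's partition is narrowed
           // to the single place it is assigned to.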
5003       int S, rem, gap, s_count;
5004       S = n_th / n_places;
5005       s_count = 0;
5006       rem = n_th - (S * n_places);
5007       gap = rem > 0 ? n_places / rem : n_places;
5008       int place = masters_place;
5009       int gap_ct = gap;
5010       thidx = n_th;
5011       if (update_master_only == 1)
5012         thidx = 1;
5013       for (f = 0; f < thidx; f++) {
5014         kmp_info_t *th = team->t.t_threads[f];
5015         KMP_DEBUG_ASSERT(th != NULL);
5016 
5017         th->th.th_first_place = place;
5018         th->th.th_last_place = place;
5019         th->th.th_new_place = place;
5020         if (__kmp_display_affinity && place != th->th.th_current_place &&
5021             team->t.t_display_affinity != 1) {
5022           team->t.t_display_affinity = 1;
5023         }
5024         s_count++;
5025 
5026         if ((s_count == S) && rem && (gap_ct == gap)) {
5027           // do nothing, add an extra thread to place on next iteration
5028         } else if ((s_count == S + 1) && rem && (gap_ct == gap)) {
5029           // we added an extra thread to this place; move on to next place
5030           if (place == last_place) {
5031             place = first_place;
5032           } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
5033             place = 0;
5034           } else {
5035             place++;
5036           }
5037           s_count = 0;
5038           gap_ct = 1;
5039           rem--;
5040         } else if (s_count == S) { // place is full; don't add extra thread
5041           if (place == last_place) {
5042             place = first_place;
5043           } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
5044             place = 0;
5045           } else {
5046             place++;
5047           }
5048           gap_ct++;
5049           s_count = 0;
5050         }
5051 
5052         KA_TRACE(100, ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
5053                        "partition = [%d,%d]\n",
5054                        __kmp_gtid_from_thread(team->t.t_threads[f]),
5055                        team->t.t_id, f, th->th.th_new_place,
5056                        th->th.th_first_place, th->th.th_last_place));
5057       }
5058       KMP_DEBUG_ASSERT(update_master_only || place == masters_place);
5059     }
5060   } break;
5061 
5062   default:
5063     break;
5064   }
5065 
5066   KA_TRACE(20, ("__kmp_partition_places: exit T#%d\n", team->t.t_id));
5067 }
5068 
5069 #endif // KMP_AFFINITY_SUPPORTED
5070 
5071 /* allocate a new team data structure to use.  take one off of the free pool if
5072    available */
5073 kmp_team_t *
5074 __kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc,
5075 #if OMPT_SUPPORT
5076                     ompt_data_t ompt_parallel_data,
5077 #endif
5078                     kmp_proc_bind_t new_proc_bind,
5079                     kmp_internal_control_t *new_icvs,
5080                     int argc USE_NESTED_HOT_ARG(kmp_info_t *master)) {
5081   KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_allocate_team);
5082   int f;
5083   kmp_team_t *team;
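       // The root's (outermost) hot team can only be reused when the root is not
       // already inside an active parallel region; nested hot teams may override
       // this decision below when KMP_NESTED_HOT_TEAMS is enabled.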
5084   int use_hot_team = !root->r.r_active;
5085   int level = 0;
5086   int do_place_partition = 1;
5087 
5088   KA_TRACE(20, ("__kmp_allocate_team: called\n"));
5089   KMP_DEBUG_ASSERT(new_nproc >= 1 && argc >= 0);
5090   KMP_DEBUG_ASSERT(max_nproc >= new_nproc);
5091   KMP_MB();
5092 
5093 #if KMP_NESTED_HOT_TEAMS
5094   kmp_hot_team_ptr_t *hot_teams;
5095   if (master) {
5096     team = master->th.th_team;
5097     level = team->t.t_active_level;
5098     if (master->th.th_teams_microtask) { // in teams construct?
5099       if (master->th.th_teams_size.nteams > 1 &&
5100           ( // #teams > 1
5101               team->t.t_pkfn ==
5102                   (microtask_t)__kmp_teams_master || // inner fork of the teams
5103               master->th.th_teams_level <
5104                   team->t.t_level)) { // or nested parallel inside the teams
5105         ++level; // do not increment if #teams==1 or for the outer fork of the
5106         // teams construct; increment otherwise
5107       }
5108       // Do not perform the place partition for the inner fork of the teams
5109       // construct; wait until a nested parallel region is encountered inside it
5110       if ((master->th.th_teams_size.nteams == 1 &&
5111            master->th.th_teams_level >= team->t.t_level) ||
5112           (team->t.t_pkfn == (microtask_t)__kmp_teams_master))
5113         do_place_partition = 0;
5114     }
5115     hot_teams = master->th.th_hot_teams;
5116     if (level < __kmp_hot_teams_max_level && hot_teams &&
5117         hot_teams[level].hot_team) {
5118       // hot team has already been allocated for given level
5119       use_hot_team = 1;
5120     } else {
5121       use_hot_team = 0;
5122     }
5123   } else {
5124     // check we won't access uninitialized hot_teams, just in case
5125     KMP_DEBUG_ASSERT(new_nproc == 1);
5126   }
5127 #endif
5128   // Optimization to use a "hot" team
5129   if (use_hot_team && new_nproc > 1) {
5130     KMP_DEBUG_ASSERT(new_nproc <= max_nproc);
5131 #if KMP_NESTED_HOT_TEAMS
5132     team = hot_teams[level].hot_team;
5133 #else
5134     team = root->r.r_hot_team;
5135 #endif
5136 #if KMP_DEBUG
5137     if (__kmp_tasking_mode != tskm_immediate_exec) {
5138       KA_TRACE(20, ("__kmp_allocate_team: hot team task_team[0] = %p "
5139                     "task_team[1] = %p before reinit\n",
5140                     team->t.t_task_team[0], team->t.t_task_team[1]));
5141     }
5142 #endif
5143 
5144     if (team->t.t_nproc != new_nproc &&
5145         __kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5146       // Distributed barrier may need a resize
5147       int old_nthr = team->t.t_nproc;
5148       __kmp_resize_dist_barrier(team, old_nthr, new_nproc);
5149     }
5150 
5151     // If not doing the place partition, then reset the team's proc bind
5152     // to indicate that partitioning of all threads still needs to take place
5153     if (do_place_partition == 0)
5154       team->t.t_proc_bind = proc_bind_default;
5155     // Has the number of threads changed?
5156     /* Let's assume the most common case is that the number of threads is
5157        unchanged, and put that case first. */
5158     if (team->t.t_nproc == new_nproc) { // Check changes in number of threads
5159       KA_TRACE(20, ("__kmp_allocate_team: reusing hot team\n"));
5160       // This case can mean that omp_set_num_threads() was called and the hot
5161       // team size was already reduced, so we check the special flag
5162       if (team->t.t_size_changed == -1) {
5163         team->t.t_size_changed = 1;
5164       } else {
5165         KMP_CHECK_UPDATE(team->t.t_size_changed, 0);
5166       }
5167 
5168       // TODO???: team->t.t_max_active_levels = new_max_active_levels;
5169       kmp_r_sched_t new_sched = new_icvs->sched;
5170       // set primary thread's schedule as new run-time schedule
5171       KMP_CHECK_UPDATE(team->t.t_sched.sched, new_sched.sched);
5172 
5173       __kmp_reinitialize_team(team, new_icvs,
5174                               root->r.r_uber_thread->th.th_ident);
5175 
5176       KF_TRACE(10, ("__kmp_allocate_team2: T#%d, this_thread=%p team=%p\n", 0,
5177                     team->t.t_threads[0], team));
5178       __kmp_push_current_task_to_thread(team->t.t_threads[0], team, 0);
5179 
5180 #if KMP_AFFINITY_SUPPORTED
5181       if ((team->t.t_size_changed == 0) &&
5182           (team->t.t_proc_bind == new_proc_bind)) {
5183         if (new_proc_bind == proc_bind_spread) {
5184           if (do_place_partition) {
5185             // add flag to update only master for spread
5186             __kmp_partition_places(team, 1);
5187           }
5188         }
5189         KA_TRACE(200, ("__kmp_allocate_team: reusing hot team #%d bindings: "
5190                        "proc_bind = %d, partition = [%d,%d]\n",
5191                        team->t.t_id, new_proc_bind, team->t.t_first_place,
5192                        team->t.t_last_place));
5193       } else {
5194         if (do_place_partition) {
5195           KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5196           __kmp_partition_places(team);
5197         }
5198       }
5199 #else
5200       KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5201 #endif /* KMP_AFFINITY_SUPPORTED */
5202     } else if (team->t.t_nproc > new_nproc) {
5203       KA_TRACE(20,
5204                ("__kmp_allocate_team: decreasing hot team thread count to %d\n",
5205                 new_nproc));
5206 
5207       team->t.t_size_changed = 1;
5208       if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5209         // Barrier size already reduced earlier in this function
5210         // Activate team threads via th_used_in_team
5211         __kmp_add_threads_to_team(team, new_nproc);
5212       }
5213 #if KMP_NESTED_HOT_TEAMS
5214       if (__kmp_hot_teams_mode == 0) {
5215         // AC: the saved number of threads should correspond to the team's value
5216         // in this mode; it can be bigger in mode 1, when the hot team has reserve threads
5217         KMP_DEBUG_ASSERT(hot_teams[level].hot_team_nth == team->t.t_nproc);
5218         hot_teams[level].hot_team_nth = new_nproc;
5219 #endif // KMP_NESTED_HOT_TEAMS
5220         /* release the extra threads we don't need any more */
5221         for (f = new_nproc; f < team->t.t_nproc; f++) {
5222           KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5223           if (__kmp_tasking_mode != tskm_immediate_exec) {
5224             // When decreasing team size, threads no longer in the team should
5225             // unref task team.
5226             team->t.t_threads[f]->th.th_task_team = NULL;
5227           }
5228           __kmp_free_thread(team->t.t_threads[f]);
5229           team->t.t_threads[f] = NULL;
5230         }
5231 #if KMP_NESTED_HOT_TEAMS
5232       } // (__kmp_hot_teams_mode == 0)
5233       else {
5234         // When keeping extra threads in team, switch threads to wait on own
5235         // b_go flag
5236         for (f = new_nproc; f < team->t.t_nproc; ++f) {
5237           KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5238           kmp_balign_t *balign = team->t.t_threads[f]->th.th_bar;
5239           for (int b = 0; b < bs_last_barrier; ++b) {
5240             if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG) {
5241               balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG;
5242             }
5243             KMP_CHECK_UPDATE(balign[b].bb.leaf_kids, 0);
5244           }
5245         }
5246       }
5247 #endif // KMP_NESTED_HOT_TEAMS
5248       team->t.t_nproc = new_nproc;
5249       // TODO???: team->t.t_max_active_levels = new_max_active_levels;
5250       KMP_CHECK_UPDATE(team->t.t_sched.sched, new_icvs->sched.sched);
5251       __kmp_reinitialize_team(team, new_icvs,
5252                               root->r.r_uber_thread->th.th_ident);
5253 
5254       // Update remaining threads
5255       for (f = 0; f < new_nproc; ++f) {
5256         team->t.t_threads[f]->th.th_team_nproc = new_nproc;
5257       }
5258 
5259       // restore the current task state of the primary thread: should be the
5260       // implicit task
5261       KF_TRACE(10, ("__kmp_allocate_team: T#%d, this_thread=%p team=%p\n", 0,
5262                     team->t.t_threads[0], team));
5263 
5264       __kmp_push_current_task_to_thread(team->t.t_threads[0], team, 0);
5265 
5266 #ifdef KMP_DEBUG
5267       for (f = 0; f < team->t.t_nproc; f++) {
5268         KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
5269                          team->t.t_threads[f]->th.th_team_nproc ==
5270                              team->t.t_nproc);
5271       }
5272 #endif
5273 
5274       if (do_place_partition) {
5275         KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5276 #if KMP_AFFINITY_SUPPORTED
5277         __kmp_partition_places(team);
5278 #endif
5279       }
5280     } else { // team->t.t_nproc < new_nproc
5281 #if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED
5282       kmp_affin_mask_t *old_mask;
5283       if (KMP_AFFINITY_CAPABLE()) {
5284         KMP_CPU_ALLOC(old_mask);
5285       }
5286 #endif
5287 
5288       KA_TRACE(20,
5289                ("__kmp_allocate_team: increasing hot team thread count to %d\n",
5290                 new_nproc));
5291       int old_nproc = team->t.t_nproc; // save old value and use to update only
5292       team->t.t_size_changed = 1;
5293 
5294 #if KMP_NESTED_HOT_TEAMS
5295       int avail_threads = hot_teams[level].hot_team_nth;
5296       if (new_nproc < avail_threads)
5297         avail_threads = new_nproc;
5298       kmp_info_t **other_threads = team->t.t_threads;
5299       for (f = team->t.t_nproc; f < avail_threads; ++f) {
5300         // Adjust barrier data of reserved threads (if any) of the team
5301         // Other data will be set in __kmp_initialize_info() below.
5302         int b;
5303         kmp_balign_t *balign = other_threads[f]->th.th_bar;
5304         for (b = 0; b < bs_last_barrier; ++b) {
5305           balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5306           KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
5307 #if USE_DEBUGGER
5308           balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5309 #endif
5310         }
5311       }
5312       if (hot_teams[level].hot_team_nth >= new_nproc) {
5313         // we have all needed threads in reserve, so there is no need to allocate any;
5314         // this is only possible in mode 1, since mode 0 cannot have reserved threads
5315         KMP_DEBUG_ASSERT(__kmp_hot_teams_mode == 1);
5316         team->t.t_nproc = new_nproc; // just get reserved threads involved
5317       } else {
5318         // We may have some threads in reserve, but not enough;
5319         // get reserved threads involved if any.
5320         team->t.t_nproc = hot_teams[level].hot_team_nth;
5321         hot_teams[level].hot_team_nth = new_nproc; // adjust hot team max size
5322 #endif // KMP_NESTED_HOT_TEAMS
5323         if (team->t.t_max_nproc < new_nproc) {
5324           /* reallocate larger arrays */
5325           __kmp_reallocate_team_arrays(team, new_nproc);
5326           __kmp_reinitialize_team(team, new_icvs, NULL);
5327         }
5328 
5329 #if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED
5330         /* Temporarily set full mask for primary thread before creation of
5331            workers. The reason is that workers inherit the affinity from the
5332            primary thread, so if a lot of workers are created on a single
5333            core quickly, they don't get a chance to set their own affinity for
5334            a long time. */
5335         __kmp_set_thread_affinity_mask_full_tmp(old_mask);
5336 #endif
5337 
5338         /* allocate new threads for the hot team */
5339         for (f = team->t.t_nproc; f < new_nproc; f++) {
5340           kmp_info_t *new_worker = __kmp_allocate_thread(root, team, f);
5341           KMP_DEBUG_ASSERT(new_worker);
5342           team->t.t_threads[f] = new_worker;
5343 
5344           KA_TRACE(20,
5345                    ("__kmp_allocate_team: team %d init T#%d arrived: "
5346                     "join=%llu, plain=%llu\n",
5347                     team->t.t_id, __kmp_gtid_from_tid(f, team), team->t.t_id, f,
5348                     team->t.t_bar[bs_forkjoin_barrier].b_arrived,
5349                     team->t.t_bar[bs_plain_barrier].b_arrived));
5350 
5351           { // Initialize barrier data for new threads.
5352             int b;
5353             kmp_balign_t *balign = new_worker->th.th_bar;
5354             for (b = 0; b < bs_last_barrier; ++b) {
5355               balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5356               KMP_DEBUG_ASSERT(balign[b].bb.wait_flag !=
5357                                KMP_BARRIER_PARENT_FLAG);
5358 #if USE_DEBUGGER
5359               balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5360 #endif
5361             }
5362           }
5363         }
5364 
5365 #if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED
5366         if (KMP_AFFINITY_CAPABLE()) {
5367           /* Restore initial primary thread's affinity mask */
5368           __kmp_set_system_affinity(old_mask, TRUE);
5369           KMP_CPU_FREE(old_mask);
5370         }
5371 #endif
5372 #if KMP_NESTED_HOT_TEAMS
5373       } // end of check of t_nproc vs. new_nproc vs. hot_team_nth
5374 #endif // KMP_NESTED_HOT_TEAMS
5375       if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5376         // Barrier size already increased earlier in this function
5377         // Activate team threads via th_used_in_team
5378         __kmp_add_threads_to_team(team, new_nproc);
5379       }
5380       /* make sure everyone is synchronized */
5381       // new threads below
5382       __kmp_initialize_team(team, new_nproc, new_icvs,
5383                             root->r.r_uber_thread->th.th_ident);
5384 
5385       /* reinitialize the threads */
5386       KMP_DEBUG_ASSERT(team->t.t_nproc == new_nproc);
5387       for (f = 0; f < team->t.t_nproc; ++f)
5388         __kmp_initialize_info(team->t.t_threads[f], team, f,
5389                               __kmp_gtid_from_tid(f, team));
5390 
5391       if (level) { // set th_task_state for new threads in nested hot team
5392         // __kmp_initialize_info() no longer zeroes th_task_state, so we should
5393         // only need to set the th_task_state for the new threads. th_task_state
5394         // for primary thread will not be accurate until after this in
5395         // __kmp_fork_call(), so we look to the primary thread's memo_stack to
5396         // get the correct value.
5397         for (f = old_nproc; f < team->t.t_nproc; ++f)
5398           team->t.t_threads[f]->th.th_task_state =
5399               team->t.t_threads[0]->th.th_task_state_memo_stack[level];
5400       } else { // set th_task_state for new threads in non-nested hot team
5401         // copy primary thread's state
5402         kmp_uint8 old_state = team->t.t_threads[0]->th.th_task_state;
5403         for (f = old_nproc; f < team->t.t_nproc; ++f)
5404           team->t.t_threads[f]->th.th_task_state = old_state;
5405       }
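      // Illustration (hypothetical level): for a nested hot team at level 2, a
      // new worker takes t_threads[0]'s th_task_state_memo_stack[2] entry,
      // while workers added to the outermost hot team just copy the primary
      // thread's current th_task_state.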
5406 
5407 #ifdef KMP_DEBUG
5408       for (f = 0; f < team->t.t_nproc; ++f) {
5409         KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
5410                          team->t.t_threads[f]->th.th_team_nproc ==
5411                              team->t.t_nproc);
5412       }
5413 #endif
5414 
5415       if (do_place_partition) {
5416         KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5417 #if KMP_AFFINITY_SUPPORTED
5418         __kmp_partition_places(team);
5419 #endif
5420       }
5421     } // Check changes in number of threads
5422 
5423     kmp_info_t *master = team->t.t_threads[0];
5424     if (master->th.th_teams_microtask) {
5425       for (f = 1; f < new_nproc; ++f) {
5426         // propagate teams construct specific info to workers
5427         kmp_info_t *thr = team->t.t_threads[f];
5428         thr->th.th_teams_microtask = master->th.th_teams_microtask;
5429         thr->th.th_teams_level = master->th.th_teams_level;
5430         thr->th.th_teams_size = master->th.th_teams_size;
5431       }
5432     }
5433 #if KMP_NESTED_HOT_TEAMS
5434     if (level) {
5435       // Sync barrier state for nested hot teams, not needed for outermost hot
5436       // team.
5437       for (f = 1; f < new_nproc; ++f) {
5438         kmp_info_t *thr = team->t.t_threads[f];
5439         int b;
5440         kmp_balign_t *balign = thr->th.th_bar;
5441         for (b = 0; b < bs_last_barrier; ++b) {
5442           balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5443           KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
5444 #if USE_DEBUGGER
5445           balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5446 #endif
5447         }
5448       }
5449     }
5450 #endif // KMP_NESTED_HOT_TEAMS
5451 
5452     /* reallocate space for arguments if necessary */
5453     __kmp_alloc_argv_entries(argc, team, TRUE);
5454     KMP_CHECK_UPDATE(team->t.t_argc, argc);
5455     // The hot team re-uses the previous task team,
5456     // if untouched during the previous release->gather phase.
5457 
5458     KF_TRACE(10, (" hot_team = %p\n", team));
5459 
5460 #if KMP_DEBUG
5461     if (__kmp_tasking_mode != tskm_immediate_exec) {
5462       KA_TRACE(20, ("__kmp_allocate_team: hot team task_team[0] = %p "
5463                     "task_team[1] = %p after reinit\n",
5464                     team->t.t_task_team[0], team->t.t_task_team[1]));
5465     }
5466 #endif
5467 
5468 #if OMPT_SUPPORT
5469     __ompt_team_assign_id(team, ompt_parallel_data);
5470 #endif
5471 
5472     KMP_MB();
5473 
5474     return team;
5475   }
5476 
5477   /* next, let's try to take one from the team pool */
5478   KMP_MB();
5479   for (team = CCAST(kmp_team_t *, __kmp_team_pool); (team);) {
5480     /* TODO: consider resizing undersized teams instead of reaping them, now
5481        that we have a resizing mechanism */
5482     if (team->t.t_max_nproc >= max_nproc) {
5483       /* take this team from the team pool */
5484       __kmp_team_pool = team->t.t_next_pool;
5485 
5486       if (max_nproc > 1 &&
5487           __kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5488         if (!team->t.b) { // Allocate barrier structure
5489           team->t.b = distributedBarrier::allocate(__kmp_dflt_team_nth_ub);
5490         }
5491       }
5492 
5493       /* setup the team for fresh use */
5494       __kmp_initialize_team(team, new_nproc, new_icvs, NULL);
5495 
5496       KA_TRACE(20, ("__kmp_allocate_team: setting task_team[0] %p and "
5497                     "task_team[1] %p to NULL\n",
5498                     &team->t.t_task_team[0], &team->t.t_task_team[1]));
5499       team->t.t_task_team[0] = NULL;
5500       team->t.t_task_team[1] = NULL;
5501 
5502       /* reallocate space for arguments if necessary */
5503       __kmp_alloc_argv_entries(argc, team, TRUE);
5504       KMP_CHECK_UPDATE(team->t.t_argc, argc);
5505 
5506       KA_TRACE(
5507           20, ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n",
5508                team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
5509       { // Initialize barrier data.
5510         int b;
5511         for (b = 0; b < bs_last_barrier; ++b) {
5512           team->t.t_bar[b].b_arrived = KMP_INIT_BARRIER_STATE;
5513 #if USE_DEBUGGER
5514           team->t.t_bar[b].b_master_arrived = 0;
5515           team->t.t_bar[b].b_team_arrived = 0;
5516 #endif
5517         }
5518       }
5519 
5520       team->t.t_proc_bind = new_proc_bind;
5521 
5522       KA_TRACE(20, ("__kmp_allocate_team: using team from pool %d.\n",
5523                     team->t.t_id));
5524 
5525 #if OMPT_SUPPORT
5526       __ompt_team_assign_id(team, ompt_parallel_data);
5527 #endif
5528 
5529       KMP_MB();
5530 
5531       return team;
5532     }
5533 
5534     /* reap team if it is too small, then loop back and check the next one */
5535     // not sure if this is wise, but this will be redone during the
5536     // hot-teams rewrite.
5537     /* TODO: Use technique to find the right size hot-team, don't reap them */
5538     team = __kmp_reap_team(team);
5539     __kmp_team_pool = team;
5540   }
5541 
5542   /* nothing available in the pool, no matter, make a new team! */
5543   KMP_MB();
5544   team = (kmp_team_t *)__kmp_allocate(sizeof(kmp_team_t));
5545 
5546   /* and set it up */
5547   team->t.t_max_nproc = max_nproc;
5548   if (max_nproc > 1 &&
5549       __kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5550     // Allocate barrier structure
5551     team->t.b = distributedBarrier::allocate(__kmp_dflt_team_nth_ub);
5552   }
5553 
5554   /* NOTE well, for some reason allocating one big buffer and dividing it up
5555      seems to really hurt performance a lot on the P4, so let's not use this */
5556   __kmp_allocate_team_arrays(team, max_nproc);
5557 
5558   KA_TRACE(20, ("__kmp_allocate_team: making a new team\n"));
5559   __kmp_initialize_team(team, new_nproc, new_icvs, NULL);
5560 
5561   KA_TRACE(20, ("__kmp_allocate_team: setting task_team[0] %p and task_team[1] "
5562                 "%p to NULL\n",
5563                 &team->t.t_task_team[0], &team->t.t_task_team[1]));
5564   team->t.t_task_team[0] = NULL; // to be removed, as __kmp_allocate zeroes
5565   // memory, no need to duplicate
5566   team->t.t_task_team[1] = NULL; // to be removed, as __kmp_allocate zeroes
5567   // memory, no need to duplicate
5568 
5569   if (__kmp_storage_map) {
5570     __kmp_print_team_storage_map("team", team, team->t.t_id, new_nproc);
5571   }
5572 
5573   /* allocate space for arguments */
5574   __kmp_alloc_argv_entries(argc, team, FALSE);
5575   team->t.t_argc = argc;
5576 
5577   KA_TRACE(20,
5578            ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n",
5579             team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
5580   { // Initialize barrier data.
5581     int b;
5582     for (b = 0; b < bs_last_barrier; ++b) {
5583       team->t.t_bar[b].b_arrived = KMP_INIT_BARRIER_STATE;
5584 #if USE_DEBUGGER
5585       team->t.t_bar[b].b_master_arrived = 0;
5586       team->t.t_bar[b].b_team_arrived = 0;
5587 #endif
5588     }
5589   }
5590 
5591   team->t.t_proc_bind = new_proc_bind;
5592 
5593 #if OMPT_SUPPORT
5594   __ompt_team_assign_id(team, ompt_parallel_data);
5595   team->t.ompt_serialized_team_info = NULL;
5596 #endif
5597 
5598   KMP_MB();
5599 
5600   KA_TRACE(20, ("__kmp_allocate_team: done creating a new team %d.\n",
5601                 team->t.t_id));
5602 
5603   return team;
5604 }
5605 
5606 /* TODO implement hot-teams at all levels */
5607 /* TODO implement lazy thread release on demand (disband request) */
5608 
5609 /* free the team.  return it to the team pool.  release all the threads
5610  * associated with it */
5611 void __kmp_free_team(kmp_root_t *root,
5612                      kmp_team_t *team USE_NESTED_HOT_ARG(kmp_info_t *master)) {
5613   int f;
5614   KA_TRACE(20, ("__kmp_free_team: T#%d freeing team %d\n", __kmp_get_gtid(),
5615                 team->t.t_id));
5616 
5617   /* verify state */
5618   KMP_DEBUG_ASSERT(root);
5619   KMP_DEBUG_ASSERT(team);
5620   KMP_DEBUG_ASSERT(team->t.t_nproc <= team->t.t_max_nproc);
5621   KMP_DEBUG_ASSERT(team->t.t_threads);
5622 
5623   int use_hot_team = team == root->r.r_hot_team;
5624 #if KMP_NESTED_HOT_TEAMS
5625   int level;
5626   if (master) {
5627     level = team->t.t_active_level - 1;
5628     if (master->th.th_teams_microtask) { // in teams construct?
5629       if (master->th.th_teams_size.nteams > 1) {
5630         ++level; // level was not increased in teams construct for
5631         // team_of_masters
5632       }
5633       if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
5634           master->th.th_teams_level == team->t.t_level) {
5635         ++level; // level was not increased in teams construct for
5636         // team_of_workers before the parallel
5637       } // team->t.t_level will be increased inside parallel
5638     }
5639 #if KMP_DEBUG
5640     kmp_hot_team_ptr_t *hot_teams = master->th.th_hot_teams;
5641 #endif
5642     if (level < __kmp_hot_teams_max_level) {
5643       KMP_DEBUG_ASSERT(team == hot_teams[level].hot_team);
5644       use_hot_team = 1;
5645     }
5646   }
5647 #endif // KMP_NESTED_HOT_TEAMS
5648 
5649   /* team is done working */
5650   TCW_SYNC_PTR(team->t.t_pkfn,
5651                NULL); // Important for Debugging Support Library.
5652 #if KMP_OS_WINDOWS
5653   team->t.t_copyin_counter = 0; // init counter for possible reuse
5654 #endif
5655   // Do not reset pointer to parent team to NULL for hot teams.
5656 
5657   /* if we are non-hot team, release our threads */
5658   if (!use_hot_team) {
5659     if (__kmp_tasking_mode != tskm_immediate_exec) {
5660       // Wait for threads to reach reapable state
5661       for (f = 1; f < team->t.t_nproc; ++f) {
5662         KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5663         kmp_info_t *th = team->t.t_threads[f];
5664         volatile kmp_uint32 *state = &th->th.th_reap_state;
5665         while (*state != KMP_SAFE_TO_REAP) {
5666 #if KMP_OS_WINDOWS
5667           // On Windows a thread can be killed at any time, check this
5668           DWORD ecode;
5669           if (!__kmp_is_thread_alive(th, &ecode)) {
5670             *state = KMP_SAFE_TO_REAP; // reset the flag for dead thread
5671             break;
5672           }
5673 #endif
5674           // first check if thread is sleeping
5675           kmp_flag_64<> fl(&th->th.th_bar[bs_forkjoin_barrier].bb.b_go, th);
5676           if (fl.is_sleeping())
5677             fl.resume(__kmp_gtid_from_thread(th));
5678           KMP_CPU_PAUSE();
5679         }
5680       }
5681 
5682       // Delete task teams
5683       int tt_idx;
5684       for (tt_idx = 0; tt_idx < 2; ++tt_idx) {
5685         kmp_task_team_t *task_team = team->t.t_task_team[tt_idx];
5686         if (task_team != NULL) {
5687           for (f = 0; f < team->t.t_nproc; ++f) { // threads unref task teams
5688             KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5689             team->t.t_threads[f]->th.th_task_team = NULL;
5690           }
5691           KA_TRACE(
5692               20,
5693               ("__kmp_free_team: T#%d deactivating task_team %p on team %d\n",
5694                __kmp_get_gtid(), task_team, team->t.t_id));
5695 #if KMP_NESTED_HOT_TEAMS
5696           __kmp_free_task_team(master, task_team);
5697 #endif
5698           team->t.t_task_team[tt_idx] = NULL;
5699         }
5700       }
5701     }
5702 
5703     // Reset pointer to parent team only for non-hot teams.
5704     team->t.t_parent = NULL;
5705     team->t.t_level = 0;
5706     team->t.t_active_level = 0;
5707 
5708     /* free the worker threads */
5709     for (f = 1; f < team->t.t_nproc; ++f) {
5710       KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5711       if (__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5712         KMP_COMPARE_AND_STORE_ACQ32(&(team->t.t_threads[f]->th.th_used_in_team),
5713                                     1, 2);
5714       }
5715       __kmp_free_thread(team->t.t_threads[f]);
5716     }
5717 
5718     if (__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5719       if (team->t.b) {
5720         // wake up thread at old location
5721         team->t.b->go_release();
5722         if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
5723           for (f = 1; f < team->t.t_nproc; ++f) {
5724             if (team->t.b->sleep[f].sleep) {
5725               __kmp_atomic_resume_64(
5726                   team->t.t_threads[f]->th.th_info.ds.ds_gtid,
5727                   (kmp_atomic_flag_64<> *)NULL);
5728             }
5729           }
5730         }
5731         // Wait for threads to be removed from team
5732         for (int f = 1; f < team->t.t_nproc; ++f) {
5733           while (team->t.t_threads[f]->th.th_used_in_team.load() != 0)
5734             KMP_CPU_PAUSE();
5735         }
5736       }
5737     }
5738 
5739     for (f = 1; f < team->t.t_nproc; ++f) {
5740       team->t.t_threads[f] = NULL;
5741     }
5742 
5743     if (team->t.t_max_nproc > 1 &&
5744         __kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5745       distributedBarrier::deallocate(team->t.b);
5746       team->t.b = NULL;
5747     }
5748     /* put the team back in the team pool */
5749     /* TODO limit size of team pool, call reap_team if pool too large */
5750     team->t.t_next_pool = CCAST(kmp_team_t *, __kmp_team_pool);
5751     __kmp_team_pool = (volatile kmp_team_t *)team;
5752   } else { // Check if team was created for primary threads in teams construct
5753     // See if first worker is a CG root
5754     KMP_DEBUG_ASSERT(team->t.t_threads[1] &&
5755                      team->t.t_threads[1]->th.th_cg_roots);
5756     if (team->t.t_threads[1]->th.th_cg_roots->cg_root == team->t.t_threads[1]) {
5757       // Clean up the CG root nodes on workers so that this team can be re-used
5758       for (f = 1; f < team->t.t_nproc; ++f) {
5759         kmp_info_t *thr = team->t.t_threads[f];
5760         KMP_DEBUG_ASSERT(thr && thr->th.th_cg_roots &&
5761                          thr->th.th_cg_roots->cg_root == thr);
5762         // Pop current CG root off list
5763         kmp_cg_root_t *tmp = thr->th.th_cg_roots;
5764         thr->th.th_cg_roots = tmp->up;
5765         KA_TRACE(100, ("__kmp_free_team: Thread %p popping node %p and moving"
5766                        " up to node %p. cg_nthreads was %d\n",
5767                        thr, tmp, thr->th.th_cg_roots, tmp->cg_nthreads));
5768         int i = tmp->cg_nthreads--;
5769         if (i == 1) {
5770           __kmp_free(tmp); // free CG if we are the last thread in it
5771         }
5772         // Restore current task's thread_limit from CG root
5773         if (thr->th.th_cg_roots)
5774           thr->th.th_current_task->td_icvs.thread_limit =
5775               thr->th.th_cg_roots->cg_thread_limit;
5776       }
5777     }
5778   }
5779 
5780   KMP_MB();
5781 }
5782 
5783 /* reap the team.  destroy it, reclaim all its resources and free its memory */
5784 kmp_team_t *__kmp_reap_team(kmp_team_t *team) {
5785   kmp_team_t *next_pool = team->t.t_next_pool;
5786 
5787   KMP_DEBUG_ASSERT(team);
5788   KMP_DEBUG_ASSERT(team->t.t_dispatch);
5789   KMP_DEBUG_ASSERT(team->t.t_disp_buffer);
5790   KMP_DEBUG_ASSERT(team->t.t_threads);
5791   KMP_DEBUG_ASSERT(team->t.t_argv);
5792 
5793   /* TODO clean the threads that are a part of this? */
5794 
5795   /* free stuff */
5796   __kmp_free_team_arrays(team);
5797   if (team->t.t_argv != &team->t.t_inline_argv[0])
5798     __kmp_free((void *)team->t.t_argv);
5799   __kmp_free(team);
5800 
5801   KMP_MB();
5802   return next_pool;
5803 }
5804 
5805 // Free the thread.  Don't reap it, just place it on the pool of available
5806 // threads.
5807 //
5808 // Changes for Quad issue 527845: We need a predictable OMP tid <-> gtid
5809 // binding for the affinity mechanism to be useful.
5810 //
5811 // Now, we always keep the free list (__kmp_thread_pool) sorted by gtid.
5812 // However, we want to avoid a potential performance problem by always
5813 // scanning through the list to find the correct point at which to insert
5814 // the thread (potential N**2 behavior).  To do this we keep track of the
5815 // last place a thread struct was inserted (__kmp_thread_pool_insert_pt).
5816 // With single-level parallelism, threads will always be added to the tail
5817 // of the list, kept track of by __kmp_thread_pool_insert_pt.  With nested
5818 // parallelism, all bets are off and we may need to scan through the entire
5819 // free list.
5820 //
5821 // This change also has a potentially large performance benefit, for some
5822 // applications.  Previously, as threads were freed from the hot team, they
5823 // would be placed back on the free list in inverse order.  If the hot team
5824 // grew back to its original size, then the freed threads would be placed
5825 // back on the hot team in reverse order.  This could cause bad cache
5826 // locality problems on programs where the size of the hot team regularly
5827 // grew and shrunk.
5828 //
5829 // Now, for single-level parallelism, the OMP tid is always == gtid.
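//
// Illustrative sketch (hypothetical gtids): with a gtid-sorted pool
// [T#1 -> T#3 -> T#5] and __kmp_thread_pool_insert_pt remembering T#5,
// freeing T#6 appends at the remembered tail with no scan.  Freeing T#2
// (possible with nested parallelism) sorts before the insert point, so the
// insert point is reset and the list is re-scanned from the head, placing
// T#2 between T#1 and T#3.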
5830 void __kmp_free_thread(kmp_info_t *this_th) {
5831   int gtid;
5832   kmp_info_t **scan;
5833 
5834   KA_TRACE(20, ("__kmp_free_thread: T#%d putting T#%d back on free pool.\n",
5835                 __kmp_get_gtid(), this_th->th.th_info.ds.ds_gtid));
5836 
5837   KMP_DEBUG_ASSERT(this_th);
5838 
5839   // When moving thread to pool, switch thread to wait on own b_go flag, and
5840   // uninitialized (NULL team).
5841   int b;
5842   kmp_balign_t *balign = this_th->th.th_bar;
5843   for (b = 0; b < bs_last_barrier; ++b) {
5844     if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG)
5845       balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG;
5846     balign[b].bb.team = NULL;
5847     balign[b].bb.leaf_kids = 0;
5848   }
5849   this_th->th.th_task_state = 0;
5850   this_th->th.th_reap_state = KMP_SAFE_TO_REAP;
5851 
5852   /* put thread back on the free pool */
5853   TCW_PTR(this_th->th.th_team, NULL);
5854   TCW_PTR(this_th->th.th_root, NULL);
5855   TCW_PTR(this_th->th.th_dispatch, NULL); /* NOT NEEDED */
5856 
5857   while (this_th->th.th_cg_roots) {
5858     this_th->th.th_cg_roots->cg_nthreads--;
5859     KA_TRACE(100, ("__kmp_free_thread: Thread %p decrement cg_nthreads on node"
5860                    " %p of thread  %p to %d\n",
5861                    this_th, this_th->th.th_cg_roots,
5862                    this_th->th.th_cg_roots->cg_root,
5863                    this_th->th.th_cg_roots->cg_nthreads));
5864     kmp_cg_root_t *tmp = this_th->th.th_cg_roots;
5865     if (tmp->cg_root == this_th) { // Thread is a cg_root
5866       KMP_DEBUG_ASSERT(tmp->cg_nthreads == 0);
5867       KA_TRACE(
5868           5, ("__kmp_free_thread: Thread %p freeing node %p\n", this_th, tmp));
5869       this_th->th.th_cg_roots = tmp->up;
5870       __kmp_free(tmp);
5871     } else { // Worker thread
5872       if (tmp->cg_nthreads == 0) { // last thread leaves contention group
5873         __kmp_free(tmp);
5874       }
5875       this_th->th.th_cg_roots = NULL;
5876       break;
5877     }
5878   }
5879 
5880   /* If the implicit task assigned to this thread can be used by other threads,
5881    * then multiple threads can share the task data and try to free it in
5882    * __kmp_reap_thread at exit. This duplicate use of the task data is more
5883    * likely when the hot team is disabled, but it can occur even when the hot
5884    * team is enabled. */
5885   __kmp_free_implicit_task(this_th);
5886   this_th->th.th_current_task = NULL;
5887 
5888   // If the __kmp_thread_pool_insert_pt is already past the new insert
5889   // point, then we need to re-scan the entire list.
5890   gtid = this_th->th.th_info.ds.ds_gtid;
5891   if (__kmp_thread_pool_insert_pt != NULL) {
5892     KMP_DEBUG_ASSERT(__kmp_thread_pool != NULL);
5893     if (__kmp_thread_pool_insert_pt->th.th_info.ds.ds_gtid > gtid) {
5894       __kmp_thread_pool_insert_pt = NULL;
5895     }
5896   }
5897 
5898   // Scan down the list to find the place to insert the thread.
5899   // scan is the address of a link in the list, possibly the address of
5900   // __kmp_thread_pool itself.
5901   //
5902   // In the absence of nested parallelism, the for loop will have 0 iterations.
5903   if (__kmp_thread_pool_insert_pt != NULL) {
5904     scan = &(__kmp_thread_pool_insert_pt->th.th_next_pool);
5905   } else {
5906     scan = CCAST(kmp_info_t **, &__kmp_thread_pool);
5907   }
5908   for (; (*scan != NULL) && ((*scan)->th.th_info.ds.ds_gtid < gtid);
5909        scan = &((*scan)->th.th_next_pool))
5910     ;
5911 
5912   // Insert the new element on the list, and set __kmp_thread_pool_insert_pt
5913   // to its address.
5914   TCW_PTR(this_th->th.th_next_pool, *scan);
5915   __kmp_thread_pool_insert_pt = *scan = this_th;
5916   KMP_DEBUG_ASSERT((this_th->th.th_next_pool == NULL) ||
5917                    (this_th->th.th_info.ds.ds_gtid <
5918                     this_th->th.th_next_pool->th.th_info.ds.ds_gtid));
5919   TCW_4(this_th->th.th_in_pool, TRUE);
5920   __kmp_suspend_initialize_thread(this_th);
5921   __kmp_lock_suspend_mx(this_th);
5922   if (this_th->th.th_active == TRUE) {
5923     KMP_ATOMIC_INC(&__kmp_thread_pool_active_nth);
5924     this_th->th.th_active_in_pool = TRUE;
5925   }
5926 #if KMP_DEBUG
5927   else {
5928     KMP_DEBUG_ASSERT(this_th->th.th_active_in_pool == FALSE);
5929   }
5930 #endif
5931   __kmp_unlock_suspend_mx(this_th);
5932 
5933   TCW_4(__kmp_nth, __kmp_nth - 1);
5934 
5935 #ifdef KMP_ADJUST_BLOCKTIME
5936   /* Adjust blocktime back to user setting or default if necessary */
5937   /* Middle initialization might never have occurred                */
5938   if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
5939     KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
5940     if (__kmp_nth <= __kmp_avail_proc) {
5941       __kmp_zero_bt = FALSE;
5942     }
5943   }
5944 #endif /* KMP_ADJUST_BLOCKTIME */
5945 
5946   KMP_MB();
5947 }
5948 
5949 /* ------------------------------------------------------------------------ */
5950 
5951 void *__kmp_launch_thread(kmp_info_t *this_thr) {
5952 #if OMP_PROFILING_SUPPORT
5953   ProfileTraceFile = getenv("LIBOMPTARGET_PROFILE");
5954   // TODO: add a configuration option for time granularity
5955   if (ProfileTraceFile)
5956     llvm::timeTraceProfilerInitialize(500 /* us */, "libomptarget");
5957 #endif
5958 
5959   int gtid = this_thr->th.th_info.ds.ds_gtid;
5960   /*    void                 *stack_data;*/
5961   kmp_team_t **volatile pteam;
5962 
5963   KMP_MB();
5964   KA_TRACE(10, ("__kmp_launch_thread: T#%d start\n", gtid));
5965 
5966   if (__kmp_env_consistency_check) {
5967     this_thr->th.th_cons = __kmp_allocate_cons_stack(gtid); // ATT: Memory leak?
5968   }
5969 
5970 #if OMPD_SUPPORT
5971   if (ompd_state & OMPD_ENABLE_BP)
5972     ompd_bp_thread_begin();
5973 #endif
5974 
5975 #if OMPT_SUPPORT
5976   ompt_data_t *thread_data = nullptr;
5977   if (ompt_enabled.enabled) {
5978     thread_data = &(this_thr->th.ompt_thread_info.thread_data);
5979     *thread_data = ompt_data_none;
5980 
5981     this_thr->th.ompt_thread_info.state = ompt_state_overhead;
5982     this_thr->th.ompt_thread_info.wait_id = 0;
5983     this_thr->th.ompt_thread_info.idle_frame = OMPT_GET_FRAME_ADDRESS(0);
5984     this_thr->th.ompt_thread_info.parallel_flags = 0;
5985     if (ompt_enabled.ompt_callback_thread_begin) {
5986       ompt_callbacks.ompt_callback(ompt_callback_thread_begin)(
5987           ompt_thread_worker, thread_data);
5988     }
5989     this_thr->th.ompt_thread_info.state = ompt_state_idle;
5990   }
5991 #endif
5992 
5993   /* This is the place where threads wait for work */
5994   while (!TCR_4(__kmp_global.g.g_done)) {
5995     KMP_DEBUG_ASSERT(this_thr == __kmp_threads[gtid]);
5996     KMP_MB();
5997 
5998     /* wait for work to do */
5999     KA_TRACE(20, ("__kmp_launch_thread: T#%d waiting for work\n", gtid));
6000 
6001     /* No tid yet since not part of a team */
6002     __kmp_fork_barrier(gtid, KMP_GTID_DNE);
6003 
6004 #if OMPT_SUPPORT
6005     if (ompt_enabled.enabled) {
6006       this_thr->th.ompt_thread_info.state = ompt_state_overhead;
6007     }
6008 #endif
6009 
6010     pteam = &this_thr->th.th_team;
6011 
6012     /* have we been allocated? */
6013     if (TCR_SYNC_PTR(*pteam) && !TCR_4(__kmp_global.g.g_done)) {
6014       /* we were just woken up, so run our new task */
6015       if (TCR_SYNC_PTR((*pteam)->t.t_pkfn) != NULL) {
6016         int rc;
6017         KA_TRACE(20,
6018                  ("__kmp_launch_thread: T#%d(%d:%d) invoke microtask = %p\n",
6019                   gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid),
6020                   (*pteam)->t.t_pkfn));
6021 
6022         updateHWFPControl(*pteam);
6023 
6024 #if OMPT_SUPPORT
6025         if (ompt_enabled.enabled) {
6026           this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
6027         }
6028 #endif
6029 
6030         rc = (*pteam)->t.t_invoke(gtid);
6031         KMP_ASSERT(rc);
6032 
6033         KMP_MB();
6034         KA_TRACE(20, ("__kmp_launch_thread: T#%d(%d:%d) done microtask = %p\n",
6035                       gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid),
6036                       (*pteam)->t.t_pkfn));
6037       }
6038 #if OMPT_SUPPORT
6039       if (ompt_enabled.enabled) {
6040         /* no frame set while outside task */
6041         __ompt_get_task_info_object(0)->frame.exit_frame = ompt_data_none;
6042 
6043         this_thr->th.ompt_thread_info.state = ompt_state_overhead;
6044       }
6045 #endif
6046       /* join barrier after parallel region */
6047       __kmp_join_barrier(gtid);
6048     }
6049   }
6050   TCR_SYNC_PTR((intptr_t)__kmp_global.g.g_done);
6051 
6052 #if OMPD_SUPPORT
6053   if (ompd_state & OMPD_ENABLE_BP)
6054     ompd_bp_thread_end();
6055 #endif
6056 
6057 #if OMPT_SUPPORT
6058   if (ompt_enabled.ompt_callback_thread_end) {
6059     ompt_callbacks.ompt_callback(ompt_callback_thread_end)(thread_data);
6060   }
6061 #endif
6062 
6063   this_thr->th.th_task_team = NULL;
6064   /* run the destructors for the threadprivate data for this thread */
6065   __kmp_common_destroy_gtid(gtid);
6066 
6067   KA_TRACE(10, ("__kmp_launch_thread: T#%d done\n", gtid));
6068   KMP_MB();
6069 
6070 #if OMP_PROFILING_SUPPORT
6071   llvm::timeTraceProfilerFinishThread();
6072 #endif
6073   return this_thr;
6074 }
6075 
6076 /* ------------------------------------------------------------------------ */
6077 
6078 void __kmp_internal_end_dest(void *specific_gtid) {
6079   // Make sure no significant bits are lost
6080   int gtid;
6081   __kmp_type_convert((kmp_intptr_t)specific_gtid - 1, &gtid);
6082 
6083   KA_TRACE(30, ("__kmp_internal_end_dest: T#%d\n", gtid));
6084   /* NOTE: the gtid is stored as gtid+1 in the thread-local storage;
6085    * this is because 0 is reserved for the nothing-stored case */
6086 
6087   __kmp_internal_end_thread(gtid);
6088 }
6089 
6090 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB
6091 
6092 __attribute__((destructor)) void __kmp_internal_end_dtor(void) {
6093   __kmp_internal_end_atexit();
6094 }
6095 
6096 #endif
6097 
6098 /* [Windows] josh: when the atexit handler is called, there may still be more
6099    than one thread alive */
6100 void __kmp_internal_end_atexit(void) {
6101   KA_TRACE(30, ("__kmp_internal_end_atexit\n"));
6102   /* [Windows]
6103      josh: ideally, we want to completely shutdown the library in this atexit
6104      handler, but stat code that depends on thread specific data for gtid fails
6105      because that data becomes unavailable at some point during the shutdown, so
6106      we call __kmp_internal_end_thread instead. We should eventually remove the
6107      dependency on __kmp_get_specific_gtid in the stat code and use
6108      __kmp_internal_end_library to cleanly shutdown the library.
6109 
6110      // TODO: Can some of this comment about GVS be removed?
6111      I suspect that the offending stat code is executed when the calling thread
6112      tries to clean up a dead root thread's data structures, resulting in GVS
6113      code trying to close the GVS structures for that thread, but since the stat
6114      code uses __kmp_get_specific_gtid to get the gtid with the assumption that
6115      the calling thread is cleaning up itself instead of another thread, it gets
6116      confused. This happens because allowing a thread to unregister and cleanup
6117      another thread is a recent modification for addressing an issue.
6118      Based on the current design (20050722), a thread may end up
6119      trying to unregister another thread only if thread death does not trigger
6120      the calling of __kmp_internal_end_thread.  For Linux* OS, there is the
6121      thread specific data destructor function to detect thread death. For
6122      Windows dynamic, there is DllMain(THREAD_DETACH). For Windows static, there
6123      is nothing.  Thus, the workaround is applicable only for Windows static
6124      stat library. */
6125   __kmp_internal_end_library(-1);
6126 #if KMP_OS_WINDOWS
6127   __kmp_close_console();
6128 #endif
6129 }
6130 
6131 static void __kmp_reap_thread(kmp_info_t *thread, int is_root) {
6132   // It is assumed __kmp_forkjoin_lock is acquired.
6133 
6134   int gtid;
6135 
6136   KMP_DEBUG_ASSERT(thread != NULL);
6137 
6138   gtid = thread->th.th_info.ds.ds_gtid;
6139 
6140   if (!is_root) {
6141     if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
6142       /* Assume the threads are at the fork barrier here */
6143       KA_TRACE(
6144           20, ("__kmp_reap_thread: releasing T#%d from fork barrier for reap\n",
6145                gtid));
6146       if (__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
6147         while (
6148             !KMP_COMPARE_AND_STORE_ACQ32(&(thread->th.th_used_in_team), 0, 3))
6149           KMP_CPU_PAUSE();
6150         __kmp_resume_32(gtid, (kmp_flag_32<false, false> *)NULL);
6151       } else {
6152         /* Need release fence here to prevent seg faults for tree forkjoin
6153            barrier (GEH) */
6154         kmp_flag_64<> flag(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go,
6155                            thread);
6156         __kmp_release_64(&flag);
6157       }
6158     }
6159 
6160     // Terminate OS thread.
6161     __kmp_reap_worker(thread);
6162 
6163     // The thread was killed asynchronously.  If it was actively
6164     // spinning in the thread pool, decrement the global count.
6165     //
6166     // There is a small timing hole here - if the worker thread was just waking
6167     // up after sleeping in the pool, had reset its th_active_in_pool flag but
6168     // not decremented the global counter __kmp_thread_pool_active_nth yet, then
6169     // the global counter might not get updated.
6170     //
6171     // Currently, this can only happen as the library is unloaded,
6172     // so there are no harmful side effects.
6173     if (thread->th.th_active_in_pool) {
6174       thread->th.th_active_in_pool = FALSE;
6175       KMP_ATOMIC_DEC(&__kmp_thread_pool_active_nth);
6176       KMP_DEBUG_ASSERT(__kmp_thread_pool_active_nth >= 0);
6177     }
6178   }
6179 
6180   __kmp_free_implicit_task(thread);
6181 
6182 // Free the fast memory for tasking
6183 #if USE_FAST_MEMORY
6184   __kmp_free_fast_memory(thread);
6185 #endif /* USE_FAST_MEMORY */
6186 
6187   __kmp_suspend_uninitialize_thread(thread);
6188 
6189   KMP_DEBUG_ASSERT(__kmp_threads[gtid] == thread);
6190   TCW_SYNC_PTR(__kmp_threads[gtid], NULL);
6191 
6192   --__kmp_all_nth;
6193   // __kmp_nth was decremented when thread is added to the pool.
6194 
6195 #ifdef KMP_ADJUST_BLOCKTIME
6196   /* Adjust blocktime back to user setting or default if necessary */
6197   /* Middle initialization might never have occurred                */
6198   if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
6199     KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
6200     if (__kmp_nth <= __kmp_avail_proc) {
6201       __kmp_zero_bt = FALSE;
6202     }
6203   }
6204 #endif /* KMP_ADJUST_BLOCKTIME */
6205 
6206   /* free the memory being used */
6207   if (__kmp_env_consistency_check) {
6208     if (thread->th.th_cons) {
6209       __kmp_free_cons_stack(thread->th.th_cons);
6210       thread->th.th_cons = NULL;
6211     }
6212   }
6213 
6214   if (thread->th.th_pri_common != NULL) {
6215     __kmp_free(thread->th.th_pri_common);
6216     thread->th.th_pri_common = NULL;
6217   }
6218 
6219   if (thread->th.th_task_state_memo_stack != NULL) {
6220     __kmp_free(thread->th.th_task_state_memo_stack);
6221     thread->th.th_task_state_memo_stack = NULL;
6222   }
6223 
6224 #if KMP_USE_BGET
6225   if (thread->th.th_local.bget_data != NULL) {
6226     __kmp_finalize_bget(thread);
6227   }
6228 #endif
6229 
6230 #if KMP_AFFINITY_SUPPORTED
6231   if (thread->th.th_affin_mask != NULL) {
6232     KMP_CPU_FREE(thread->th.th_affin_mask);
6233     thread->th.th_affin_mask = NULL;
6234   }
6235 #endif /* KMP_AFFINITY_SUPPORTED */
6236 
6237 #if KMP_USE_HIER_SCHED
6238   if (thread->th.th_hier_bar_data != NULL) {
6239     __kmp_free(thread->th.th_hier_bar_data);
6240     thread->th.th_hier_bar_data = NULL;
6241   }
6242 #endif
6243 
6244   __kmp_reap_team(thread->th.th_serial_team);
6245   thread->th.th_serial_team = NULL;
6246   __kmp_free(thread);
6247 
6248   KMP_MB();
6249 
6250 } // __kmp_reap_thread
6251 
6252 static void __kmp_itthash_clean(kmp_info_t *th) {
6253 #if USE_ITT_NOTIFY
6254   if (__kmp_itt_region_domains.count > 0) {
6255     for (int i = 0; i < KMP_MAX_FRAME_DOMAINS; ++i) {
6256       kmp_itthash_entry_t *bucket = __kmp_itt_region_domains.buckets[i];
6257       while (bucket) {
6258         kmp_itthash_entry_t *next = bucket->next_in_bucket;
6259         __kmp_thread_free(th, bucket);
6260         bucket = next;
6261       }
6262     }
6263   }
6264   if (__kmp_itt_barrier_domains.count > 0) {
6265     for (int i = 0; i < KMP_MAX_FRAME_DOMAINS; ++i) {
6266       kmp_itthash_entry_t *bucket = __kmp_itt_barrier_domains.buckets[i];
6267       while (bucket) {
6268         kmp_itthash_entry_t *next = bucket->next_in_bucket;
6269         __kmp_thread_free(th, bucket);
6270         bucket = next;
6271       }
6272     }
6273   }
6274 #endif
6275 }
6276 
6277 static void __kmp_internal_end(void) {
6278   int i;
6279 
6280   /* First, unregister the library */
6281   __kmp_unregister_library();
6282 
6283 #if KMP_OS_WINDOWS
6284   /* In Win static library, we can't tell when a root actually dies, so we
6285      reclaim the data structures for any root threads that have died but not
6286      unregistered themselves, in order to shut down cleanly.
6287      In Win dynamic library we also can't tell when a thread dies.  */
6288   __kmp_reclaim_dead_roots(); // AC: moved here to always clean resources of
6289 // dead roots
6290 #endif
6291 
6292   for (i = 0; i < __kmp_threads_capacity; i++)
6293     if (__kmp_root[i])
6294       if (__kmp_root[i]->r.r_active)
6295         break;
6296   KMP_MB(); /* Flush all pending memory write invalidates.  */
6297   TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6298 
6299   if (i < __kmp_threads_capacity) {
6300 #if KMP_USE_MONITOR
6301     // 2009-09-08 (lev): Other alive roots found. Why do we kill the monitor??
6302     KMP_MB(); /* Flush all pending memory write invalidates.  */
6303 
6304     // Need to check that monitor was initialized before reaping it. If we are
6305     // called from __kmp_atfork_child (which sets __kmp_init_parallel = 0), then
6306     // __kmp_monitor will appear to contain valid data, but it is only valid in
6307     // the parent process, not the child.
6308     // New behavior (201008): instead of keying off of the flag
6309     // __kmp_init_parallel, the monitor thread creation is keyed off
6310     // of the new flag __kmp_init_monitor.
6311     __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
6312     if (TCR_4(__kmp_init_monitor)) {
6313       __kmp_reap_monitor(&__kmp_monitor);
6314       TCW_4(__kmp_init_monitor, 0);
6315     }
6316     __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
6317     KA_TRACE(10, ("__kmp_internal_end: monitor reaped\n"));
6318 #endif // KMP_USE_MONITOR
6319   } else {
6320 /* TODO move this to cleanup code */
6321 #ifdef KMP_DEBUG
6322     /* make sure that everything has properly ended */
6323     for (i = 0; i < __kmp_threads_capacity; i++) {
6324       if (__kmp_root[i]) {
6325         //                    KMP_ASSERT( ! KMP_UBER_GTID( i ) );         // AC:
6326         //                    there can be uber threads alive here
6327         KMP_ASSERT(!__kmp_root[i]->r.r_active); // TODO: can they be active?
6328       }
6329     }
6330 #endif
6331 
6332     KMP_MB();
6333 
6334     // Reap the worker threads.
6335     // This is valid for now, but be careful if threads are reaped sooner.
6336     while (__kmp_thread_pool != NULL) { // Loop thru all the threads in the pool.
6337       // Get the next thread from the pool.
6338       kmp_info_t *thread = CCAST(kmp_info_t *, __kmp_thread_pool);
6339       __kmp_thread_pool = thread->th.th_next_pool;
6340       // Reap it.
6341       KMP_DEBUG_ASSERT(thread->th.th_reap_state == KMP_SAFE_TO_REAP);
6342       thread->th.th_next_pool = NULL;
6343       thread->th.th_in_pool = FALSE;
6344       __kmp_reap_thread(thread, 0);
6345     }
6346     __kmp_thread_pool_insert_pt = NULL;
6347 
6348     // Reap teams.
6349     while (__kmp_team_pool != NULL) { // Loop thru all the teams in the pool.
6350       // Get the next team from the pool.
6351       kmp_team_t *team = CCAST(kmp_team_t *, __kmp_team_pool);
6352       __kmp_team_pool = team->t.t_next_pool;
6353       // Reap it.
6354       team->t.t_next_pool = NULL;
6355       __kmp_reap_team(team);
6356     }
6357 
6358     __kmp_reap_task_teams();
6359 
6360 #if KMP_OS_UNIX
6361     // Threads that are not reaped should not access any resources since they
6362     // are going to be deallocated soon, so the shutdown sequence should wait
6363     // until all threads either exit the final spin-waiting loop or begin
6364     // sleeping after the given blocktime.
6365     for (i = 0; i < __kmp_threads_capacity; i++) {
6366       kmp_info_t *thr = __kmp_threads[i];
6367       while (thr && KMP_ATOMIC_LD_ACQ(&thr->th.th_blocking))
6368         KMP_CPU_PAUSE();
6369     }
6370 #endif
6371 
6372     for (i = 0; i < __kmp_threads_capacity; ++i) {
6373       // TBD: Add some checking...
6374       // Something like KMP_DEBUG_ASSERT( __kmp_thread[ i ] == NULL );
6375     }
6376 
6377     /* Make sure all threadprivate destructors get run by joining with all
6378        worker threads before resetting this flag */
6379     TCW_SYNC_4(__kmp_init_common, FALSE);
6380 
6381     KA_TRACE(10, ("__kmp_internal_end: all workers reaped\n"));
6382     KMP_MB();
6383 
6384 #if KMP_USE_MONITOR
6385     // See note above: One of the possible fixes for CQ138434 / CQ140126
6386     //
6387     // FIXME: push both code fragments down and CSE them?
6388     // push them into __kmp_cleanup() ?
6389     __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
6390     if (TCR_4(__kmp_init_monitor)) {
6391       __kmp_reap_monitor(&__kmp_monitor);
6392       TCW_4(__kmp_init_monitor, 0);
6393     }
6394     __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
6395     KA_TRACE(10, ("__kmp_internal_end: monitor reaped\n"));
6396 #endif
6397   } /* else !__kmp_global.t_active */
6398   TCW_4(__kmp_init_gtid, FALSE);
6399   KMP_MB(); /* Flush all pending memory write invalidates.  */
6400 
6401   __kmp_cleanup();
6402 #if OMPT_SUPPORT
6403   ompt_fini();
6404 #endif
6405 }
6406 
6407 void __kmp_internal_end_library(int gtid_req) {
6408   /* if we have already cleaned up, don't try again, it wouldn't be pretty */
6409   /* this shouldn't be a race condition because __kmp_internal_end() is the
6410      only place to clear __kmp_serial_init */
6411   /* we'll check this later too, after we get the lock */
6412   // 2009-09-06: We do not set g_abort without setting g_done. This check looks
6413   // redundant, because the next check will work in any case.
6414   if (__kmp_global.g.g_abort) {
6415     KA_TRACE(11, ("__kmp_internal_end_library: abort, exiting\n"));
6416     /* TODO abort? */
6417     return;
6418   }
6419   if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6420     KA_TRACE(10, ("__kmp_internal_end_library: already finished\n"));
6421     return;
6422   }
6423 
6424   // If hidden helper team has been initialized, we need to deinit it
6425   if (TCR_4(__kmp_init_hidden_helper) &&
6426       !TCR_4(__kmp_hidden_helper_team_done)) {
6427     TCW_SYNC_4(__kmp_hidden_helper_team_done, TRUE);
6428     // First release the main thread to let it continue its work
6429     __kmp_hidden_helper_main_thread_release();
6430     // Wait until the hidden helper team has been destroyed
6431     __kmp_hidden_helper_threads_deinitz_wait();
6432   }
6433 
6434   KMP_MB(); /* Flush all pending memory write invalidates.  */
6435   /* find out who we are and what we should do */
6436   {
6437     int gtid = (gtid_req >= 0) ? gtid_req : __kmp_gtid_get_specific();
6438     KA_TRACE(
6439         10, ("__kmp_internal_end_library: enter T#%d  (%d)\n", gtid, gtid_req));
6440     if (gtid == KMP_GTID_SHUTDOWN) {
6441       KA_TRACE(10, ("__kmp_internal_end_library: !__kmp_init_runtime, system "
6442                     "already shutdown\n"));
6443       return;
6444     } else if (gtid == KMP_GTID_MONITOR) {
6445       KA_TRACE(10, ("__kmp_internal_end_library: monitor thread, gtid not "
6446                     "registered, or system shutdown\n"));
6447       return;
6448     } else if (gtid == KMP_GTID_DNE) {
6449       KA_TRACE(10, ("__kmp_internal_end_library: gtid not registered or system "
6450                     "shutdown\n"));
6451       /* we don't know who we are, but we may still shutdown the library */
6452     } else if (KMP_UBER_GTID(gtid)) {
6453       /* unregister ourselves as an uber thread.  gtid is no longer valid */
6454       if (__kmp_root[gtid]->r.r_active) {
6455         __kmp_global.g.g_abort = -1;
6456         TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6457         __kmp_unregister_library();
6458         KA_TRACE(10,
6459                  ("__kmp_internal_end_library: root still active, abort T#%d\n",
6460                   gtid));
6461         return;
6462       } else {
6463         __kmp_itthash_clean(__kmp_threads[gtid]);
6464         KA_TRACE(
6465             10,
6466             ("__kmp_internal_end_library: unregistering sibling T#%d\n", gtid));
6467         __kmp_unregister_root_current_thread(gtid);
6468       }
6469     } else {
6470 /* worker threads may call this function through the atexit handler, if they
6471  * call exit() */
6472 /* For now, skip the usual subsequent processing and just dump the debug buffer.
6473    TODO: do a thorough shutdown instead */
6474 #ifdef DUMP_DEBUG_ON_EXIT
6475       if (__kmp_debug_buf)
6476         __kmp_dump_debug_buffer();
6477 #endif
6478       // The unregister-library call was added here when we switched to shm on
6479       // Linux; without it, lots of files would be left behind in /dev/shm.
6480       // Clean up the shared memory file before exiting.
6481       __kmp_unregister_library();
6482       return;
6483     }
6484   }
6485   /* synchronize the termination process */
6486   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6487 
6488   /* have we already finished */
6489   if (__kmp_global.g.g_abort) {
6490     KA_TRACE(10, ("__kmp_internal_end_library: abort, exiting\n"));
6491     /* TODO abort? */
6492     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6493     return;
6494   }
6495   if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6496     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6497     return;
6498   }
6499 
6500   /* We need this lock to enforce mutex between this reading of
6501      __kmp_threads_capacity and the writing by __kmp_register_root.
6502      Alternatively, we can use a counter of roots that is atomically updated by
6503      __kmp_get_global_thread_id_reg, __kmp_do_serial_initialize and
6504      __kmp_internal_end_*.  */
6505   __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
6506 
6507   /* now we can safely conduct the actual termination */
6508   __kmp_internal_end();
6509 
6510   __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6511   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6512 
6513   KA_TRACE(10, ("__kmp_internal_end_library: exit\n"));
6514 
6515 #ifdef DUMP_DEBUG_ON_EXIT
6516   if (__kmp_debug_buf)
6517     __kmp_dump_debug_buffer();
6518 #endif
6519 
6520 #if KMP_OS_WINDOWS
6521   __kmp_close_console();
6522 #endif
6523 
6524   __kmp_fini_allocator();
6525 
6526 } // __kmp_internal_end_library
6527 
6528 void __kmp_internal_end_thread(int gtid_req) {
6529   int i;
6530 
6531   /* if we have already cleaned up, don't try again, it wouldn't be pretty */
6532   /* this shouldn't be a race condition because __kmp_internal_end() is the
6533    * only place to clear __kmp_serial_init */
6534   /* we'll check this later too, after we get the lock */
6535   // 2009-09-06: We do not set g_abort without setting g_done. This check looks
6536   // redundant, because the next check will work in any case.
6537   if (__kmp_global.g.g_abort) {
6538     KA_TRACE(11, ("__kmp_internal_end_thread: abort, exiting\n"));
6539     /* TODO abort? */
6540     return;
6541   }
6542   if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6543     KA_TRACE(10, ("__kmp_internal_end_thread: already finished\n"));
6544     return;
6545   }
6546 
6547   // If hidden helper team has been initialized, we need to deinit it
6548   if (TCR_4(__kmp_init_hidden_helper) &&
6549       !TCR_4(__kmp_hidden_helper_team_done)) {
6550     TCW_SYNC_4(__kmp_hidden_helper_team_done, TRUE);
6551     // First release the main thread to let it continue its work
6552     __kmp_hidden_helper_main_thread_release();
6553     // Wait until the hidden helper team has been destroyed
6554     __kmp_hidden_helper_threads_deinitz_wait();
6555   }
6556 
6557   KMP_MB(); /* Flush all pending memory write invalidates.  */
6558 
6559   /* find out who we are and what we should do */
6560   {
6561     int gtid = (gtid_req >= 0) ? gtid_req : __kmp_gtid_get_specific();
6562     KA_TRACE(10,
6563              ("__kmp_internal_end_thread: enter T#%d  (%d)\n", gtid, gtid_req));
6564     if (gtid == KMP_GTID_SHUTDOWN) {
6565       KA_TRACE(10, ("__kmp_internal_end_thread: !__kmp_init_runtime, system "
6566                     "already shutdown\n"));
6567       return;
6568     } else if (gtid == KMP_GTID_MONITOR) {
6569       KA_TRACE(10, ("__kmp_internal_end_thread: monitor thread, gtid not "
6570                     "registered, or system shutdown\n"));
6571       return;
6572     } else if (gtid == KMP_GTID_DNE) {
6573       KA_TRACE(10, ("__kmp_internal_end_thread: gtid not registered or system "
6574                     "shutdown\n"));
6575       return;
6576       /* we don't know who we are */
6577     } else if (KMP_UBER_GTID(gtid)) {
6578       /* unregister ourselves as an uber thread.  gtid is no longer valid */
6579       if (__kmp_root[gtid]->r.r_active) {
6580         __kmp_global.g.g_abort = -1;
6581         TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6582         KA_TRACE(10,
6583                  ("__kmp_internal_end_thread: root still active, abort T#%d\n",
6584                   gtid));
6585         return;
6586       } else {
6587         KA_TRACE(10, ("__kmp_internal_end_thread: unregistering sibling T#%d\n",
6588                       gtid));
6589         __kmp_unregister_root_current_thread(gtid);
6590       }
6591     } else {
6592       /* just a worker thread, let's leave */
6593       KA_TRACE(10, ("__kmp_internal_end_thread: worker thread T#%d\n", gtid));
6594 
6595       if (gtid >= 0) {
6596         __kmp_threads[gtid]->th.th_task_team = NULL;
6597       }
6598 
6599       KA_TRACE(10,
6600                ("__kmp_internal_end_thread: worker thread done, exiting T#%d\n",
6601                 gtid));
6602       return;
6603     }
6604   }
6605 #if KMP_DYNAMIC_LIB
6606   if (__kmp_pause_status != kmp_hard_paused)
6607   // AC: let's not shut down the dynamic library at the exit of an uber thread,
6608   // because it is better to shut down later, in the library destructor.
6609   {
6610     KA_TRACE(10, ("__kmp_internal_end_thread: exiting T#%d\n", gtid_req));
6611     return;
6612   }
6613 #endif
6614   /* synchronize the termination process */
6615   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6616 
6617   /* have we already finished */
6618   if (__kmp_global.g.g_abort) {
6619     KA_TRACE(10, ("__kmp_internal_end_thread: abort, exiting\n"));
6620     /* TODO abort? */
6621     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6622     return;
6623   }
6624   if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6625     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6626     return;
6627   }
6628 
6629   /* We need this lock to enforce mutex between this reading of
6630      __kmp_threads_capacity and the writing by __kmp_register_root.
6631      Alternatively, we can use a counter of roots that is atomically updated by
6632      __kmp_get_global_thread_id_reg, __kmp_do_serial_initialize and
6633      __kmp_internal_end_*.  */
6634 
6635   /* should we finish the run-time?  are all siblings done? */
6636   __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
6637 
6638   for (i = 0; i < __kmp_threads_capacity; ++i) {
6639     if (KMP_UBER_GTID(i)) {
6640       KA_TRACE(
6641           10,
6642           ("__kmp_internal_end_thread: remaining sibling task: gtid==%d\n", i));
6643       __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6644       __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6645       return;
6646     }
6647   }
6648 
6649   /* now we can safely conduct the actual termination */
6650 
6651   __kmp_internal_end();
6652 
6653   __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6654   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6655 
6656   KA_TRACE(10, ("__kmp_internal_end_thread: exit T#%d\n", gtid_req));
6657 
6658 #ifdef DUMP_DEBUG_ON_EXIT
6659   if (__kmp_debug_buf)
6660     __kmp_dump_debug_buffer();
6661 #endif
6662 } // __kmp_internal_end_thread
6663 
6664 // -----------------------------------------------------------------------------
6665 // Library registration stuff.
6666 
6667 static long __kmp_registration_flag = 0;
6668 // Random value used to indicate library initialization.
6669 static char *__kmp_registration_str = NULL;
6670 // Value to be saved in env var __KMP_REGISTERED_LIB_<pid>.
6671 
6672 static inline char *__kmp_reg_status_name() {
6673 /* On RHEL 3u5 if linked statically, getpid() returns different values in
6674    each thread. If registration and unregistration go in different threads
6675    (omp_misc_other_root_exit.cpp test case), the name of registered_lib_env
6676    env var cannot be found, because the name will contain a different pid. */
6677 // macOS* complains about name being too long with additional getuid()
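// For illustration only (hypothetical pid/uid): the resulting name looks like
// __KMP_REGISTERED_LIB_12345_1000 when the pid+uid variant below is used, and
// __KMP_REGISTERED_LIB_12345 otherwise.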
6678 #if KMP_OS_UNIX && !KMP_OS_DARWIN && KMP_DYNAMIC_LIB
6679   return __kmp_str_format("__KMP_REGISTERED_LIB_%d_%d", (int)getpid(),
6680                           (int)getuid());
6681 #else
6682   return __kmp_str_format("__KMP_REGISTERED_LIB_%d", (int)getpid());
6683 #endif
6684 } // __kmp_reg_status_get
6685 
6686 void __kmp_register_library_startup(void) {
6687 
6688   char *name = __kmp_reg_status_name(); // Name of the environment variable.
6689   int done = 0;
6690   union {
6691     double dtime;
6692     long ltime;
6693   } time;
6694 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
6695   __kmp_initialize_system_tick();
6696 #endif
6697   __kmp_read_system_time(&time.dtime);
6698   __kmp_registration_flag = 0xCAFE0000L | (time.ltime & 0x0000FFFFL);
6699   __kmp_registration_str =
6700       __kmp_str_format("%p-%lx-%s", &__kmp_registration_flag,
6701                        __kmp_registration_flag, KMP_LIBRARY_FILE);
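  // The registration string has the shape
  // "<address of flag>-<flag value in hex>-<library file>", e.g. (hypothetical
  // values) "0x7f0a2c001234-cafe5678-libomp.so"; the parsing further below
  // splits it back apart on '-' to recover these three fields.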
6702 
6703   KA_TRACE(50, ("__kmp_register_library_startup: %s=\"%s\"\n", name,
6704                 __kmp_registration_str));
6705 
6706   while (!done) {
6707 
6708     char *value = NULL; // Actual value of the environment variable.
6709 
6710 #if defined(KMP_USE_SHM)
6711     char *shm_name = __kmp_str_format("/%s", name);
6712     int shm_preexist = 0;
6713     char *data1;
6714     int fd1 = shm_open(shm_name, O_CREAT | O_EXCL | O_RDWR, 0666);
6715     if ((fd1 == -1) && (errno == EEXIST)) {
6716       // file didn't open because it already exists.
6717       // try opening existing file
6718       fd1 = shm_open(shm_name, O_RDWR, 0666);
6719       if (fd1 == -1) { // file didn't open
6720         // error out here
6721         __kmp_fatal(KMP_MSG(FunctionError, "Can't open SHM"), KMP_ERR(0),
6722                     __kmp_msg_null);
6723       } else {
6724         // able to open existing file
6725         shm_preexist = 1;
6726       }
6727     } else if (fd1 == -1) { // SHM didn't open; the error was something other
6728       // than 'already exists'.
6729       // error out here.
6730       __kmp_fatal(KMP_MSG(FunctionError, "Can't open SHM2"), KMP_ERR(errno),
6731                   __kmp_msg_null);
6732     }
6733     if (shm_preexist == 0) {
6734       // we created the SHM; now set its size
6735       if (ftruncate(fd1, SHM_SIZE) == -1) {
6736         // error occurred setting the size;
6737         __kmp_fatal(KMP_MSG(FunctionError, "Can't set size of SHM"),
6738                     KMP_ERR(errno), __kmp_msg_null);
6739       }
6740     }
6741     data1 =
6742         (char *)mmap(0, SHM_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED, fd1, 0);
6743     if (data1 == MAP_FAILED) {
6744       // failed to map shared memory
6745       __kmp_fatal(KMP_MSG(FunctionError, "Can't map SHM"), KMP_ERR(errno),
6746                   __kmp_msg_null);
6747     }
6748     if (shm_preexist == 0) { // set data to SHM, set value
6749       KMP_STRCPY_S(data1, SHM_SIZE, __kmp_registration_str);
6750     }
6751     // Read value from either what we just wrote or existing file.
6752     value = __kmp_str_format("%s", data1); // read value from SHM
6753     munmap(data1, SHM_SIZE);
6754     close(fd1);
6755 #else // Windows and unix with static library
6756     // Set the environment variable, but do not overwrite it if it already exists.
6757     __kmp_env_set(name, __kmp_registration_str, 0);
6758     // read value to see if it got set
6759     value = __kmp_env_get(name);
6760 #endif
6761 
6762     if (value != NULL && strcmp(value, __kmp_registration_str) == 0) {
6763       done = 1; // Ok, environment variable set successfully, exit the loop.
6764     } else {
6765       // Oops. Write failed. Another copy of OpenMP RTL is in memory.
6766       // Check whether it is alive or dead.
6767       int neighbor = 0; // 0 -- unknown status, 1 -- alive, 2 -- dead.
6768       char *tail = value;
6769       char *flag_addr_str = NULL;
6770       char *flag_val_str = NULL;
6771       char const *file_name = NULL;
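      // Split the stored value back into "<flag address>-<flag value>-<file
      // name>" fields.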
6772       __kmp_str_split(tail, '-', &flag_addr_str, &tail);
6773       __kmp_str_split(tail, '-', &flag_val_str, &tail);
6774       file_name = tail;
6775       if (tail != NULL) {
6776         unsigned long *flag_addr = 0;
6777         unsigned long flag_val = 0;
6778         KMP_SSCANF(flag_addr_str, "%p", RCAST(void **, &flag_addr));
6779         KMP_SSCANF(flag_val_str, "%lx", &flag_val);
6780         if (flag_addr != 0 && flag_val != 0 && strcmp(file_name, "") != 0) {
6781           // First, check whether environment-encoded address is mapped into
6782           // addr space.
6783           // If so, dereference it to see if it still has the right value.
6784           if (__kmp_is_address_mapped(flag_addr) && *flag_addr == flag_val) {
6785             neighbor = 1;
6786           } else {
6787             // If not, then we know the other copy of the library is no longer
6788             // running.
6789             neighbor = 2;
6790           }
6791         }
6792       }
6793       switch (neighbor) {
6794       case 0: // Cannot parse environment variable -- neighbor status unknown.
        // Assume it is the incompatible format of a future version of the
        // library, and assume the other library is alive.
6797         // WARN( ... ); // TODO: Issue a warning.
6798         file_name = "unknown library";
6799         KMP_FALLTHROUGH();
      // Attention! Falling through to the next case. That's intentional.
6801       case 1: { // Neighbor is alive.
6802         // Check it is allowed.
6803         char *duplicate_ok = __kmp_env_get("KMP_DUPLICATE_LIB_OK");
6804         if (!__kmp_str_match_true(duplicate_ok)) {
6805           // That's not allowed. Issue fatal error.
6806           __kmp_fatal(KMP_MSG(DuplicateLibrary, KMP_LIBRARY_FILE, file_name),
6807                       KMP_HNT(DuplicateLibrary), __kmp_msg_null);
6808         }
6809         KMP_INTERNAL_FREE(duplicate_ok);
6810         __kmp_duplicate_library_ok = 1;
6811         done = 1; // Exit the loop.
6812       } break;
6813       case 2: { // Neighbor is dead.
6814 
6815 #if defined(KMP_USE_SHM)
        // Close the shared memory.
        shm_unlink(shm_name); // this removes the file in /dev/shm
6818 #else
6819         // Clear the variable and try to register library again.
6820         __kmp_env_unset(name);
6821 #endif
6822       } break;
6823       default: {
6824         KMP_DEBUG_ASSERT(0);
6825       } break;
6826       }
6827     }
6828     KMP_INTERNAL_FREE((void *)value);
6829 #if defined(KMP_USE_SHM)
6830     KMP_INTERNAL_FREE((void *)shm_name);
6831 #endif
6832   } // while
6833   KMP_INTERNAL_FREE((void *)name);
6834 
6835 } // func __kmp_register_library_startup
6836 
6837 void __kmp_unregister_library(void) {
6838 
6839   char *name = __kmp_reg_status_name();
6840   char *value = NULL;
6841 
6842 #if defined(KMP_USE_SHM)
6843   char *shm_name = __kmp_str_format("/%s", name);
6844   int fd1 = shm_open(shm_name, O_RDONLY, 0666);
6845   if (fd1 == -1) {
6846     // file did not open. return.
6847     return;
6848   }
6849   char *data1 = (char *)mmap(0, SHM_SIZE, PROT_READ, MAP_SHARED, fd1, 0);
6850   if (data1 != MAP_FAILED) {
6851     value = __kmp_str_format("%s", data1); // read value from SHM
6852     munmap(data1, SHM_SIZE);
6853   }
6854   close(fd1);
6855 #else
6856   value = __kmp_env_get(name);
6857 #endif
6858 
6859   KMP_DEBUG_ASSERT(__kmp_registration_flag != 0);
6860   KMP_DEBUG_ASSERT(__kmp_registration_str != NULL);
6861   if (value != NULL && strcmp(value, __kmp_registration_str) == 0) {
6862 //  Ok, this is our variable. Delete it.
6863 #if defined(KMP_USE_SHM)
    shm_unlink(shm_name); // this removes the file in /dev/shm
6865 #else
6866     __kmp_env_unset(name);
6867 #endif
6868   }
6869 
6870 #if defined(KMP_USE_SHM)
6871   KMP_INTERNAL_FREE(shm_name);
6872 #endif
6873 
6874   KMP_INTERNAL_FREE(__kmp_registration_str);
6875   KMP_INTERNAL_FREE(value);
6876   KMP_INTERNAL_FREE(name);
6877 
6878   __kmp_registration_flag = 0;
6879   __kmp_registration_str = NULL;
6880 
6881 } // __kmp_unregister_library
6882 
6883 // End of Library registration stuff.
6884 // -----------------------------------------------------------------------------
6885 
6886 #if KMP_MIC_SUPPORTED
6887 
6888 static void __kmp_check_mic_type() {
6889   kmp_cpuid_t cpuid_state = {0};
6890   kmp_cpuid_t *cs_p = &cpuid_state;
6891   __kmp_x86_cpuid(1, 0, cs_p);
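  // CPUID leaf 1 EAX holds the family/model: family 0xB, model 1 is Knights
  // Corner (KNC); family 6, model 0x57 is Knights Landing (KNL).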
6892   // We don't support mic1 at the moment
6893   if ((cs_p->eax & 0xff0) == 0xB10) {
6894     __kmp_mic_type = mic2;
6895   } else if ((cs_p->eax & 0xf0ff0) == 0x50670) {
6896     __kmp_mic_type = mic3;
6897   } else {
6898     __kmp_mic_type = non_mic;
6899   }
6900 }
6901 
6902 #endif /* KMP_MIC_SUPPORTED */
6903 
6904 #if KMP_HAVE_UMWAIT
6905 static void __kmp_user_level_mwait_init() {
6906   struct kmp_cpuid buf;
6907   __kmp_x86_cpuid(7, 0, &buf);
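  // CPUID leaf 7, sub-leaf 0: ECX bit 5 is the WAITPKG feature bit
  // (UMONITOR/UMWAIT/TPAUSE support).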
6908   __kmp_waitpkg_enabled = ((buf.ecx >> 5) & 1);
6909   __kmp_umwait_enabled = __kmp_waitpkg_enabled && __kmp_user_level_mwait;
6910   __kmp_tpause_enabled = __kmp_waitpkg_enabled && (__kmp_tpause_state > 0);
6911   KF_TRACE(30, ("__kmp_user_level_mwait_init: __kmp_umwait_enabled = %d\n",
6912                 __kmp_umwait_enabled));
6913 }
6914 #elif KMP_HAVE_MWAIT
6915 #ifndef AT_INTELPHIUSERMWAIT
// Spurious, non-existent value that should always fail to return anything.
// It will be replaced with the correct value once that value is known.
6918 #define AT_INTELPHIUSERMWAIT 10000
6919 #endif
// The getauxval() function is available in RHEL7 and SLES12. If the RTL is
// built on a system with an earlier OS, we'll use the following internal
// function when the entry is not found.
6923 unsigned long getauxval(unsigned long) KMP_WEAK_ATTRIBUTE_EXTERNAL;
6924 unsigned long getauxval(unsigned long) { return 0; }
6925 
6926 static void __kmp_user_level_mwait_init() {
  // When getauxval() and the correct value of AT_INTELPHIUSERMWAIT are
  // available, use them to determine whether user-level mwait is enabled.
  // Otherwise, forcibly set __kmp_mwait_enabled=TRUE on Intel MIC if the
  // environment variable KMP_USER_LEVEL_MWAIT was set to TRUE.
6931   if (__kmp_mic_type == mic3) {
6932     unsigned long res = getauxval(AT_INTELPHIUSERMWAIT);
6933     if ((res & 0x1) || __kmp_user_level_mwait) {
6934       __kmp_mwait_enabled = TRUE;
6935       if (__kmp_user_level_mwait) {
6936         KMP_INFORM(EnvMwaitWarn);
6937       }
6938     } else {
6939       __kmp_mwait_enabled = FALSE;
6940     }
6941   }
6942   KF_TRACE(30, ("__kmp_user_level_mwait_init: __kmp_mic_type = %d, "
6943                 "__kmp_mwait_enabled = %d\n",
6944                 __kmp_mic_type, __kmp_mwait_enabled));
6945 }
6946 #endif /* KMP_HAVE_UMWAIT */
6947 
6948 static void __kmp_do_serial_initialize(void) {
6949   int i, gtid;
6950   size_t size;
6951 
6952   KA_TRACE(10, ("__kmp_do_serial_initialize: enter\n"));
6953 
6954   KMP_DEBUG_ASSERT(sizeof(kmp_int32) == 4);
6955   KMP_DEBUG_ASSERT(sizeof(kmp_uint32) == 4);
6956   KMP_DEBUG_ASSERT(sizeof(kmp_int64) == 8);
6957   KMP_DEBUG_ASSERT(sizeof(kmp_uint64) == 8);
6958   KMP_DEBUG_ASSERT(sizeof(kmp_intptr_t) == sizeof(void *));
6959 
6960 #if OMPT_SUPPORT
6961   ompt_pre_init();
6962 #endif
6963 #if OMPD_SUPPORT
6964   __kmp_env_dump();
6965   ompd_init();
6966 #endif
6967 
6968   __kmp_validate_locks();
6969 
6970   /* Initialize internal memory allocator */
6971   __kmp_init_allocator();
6972 
  /* Register the library startup via an environment variable or via a mapped
     shared memory file and check whether another copy of the library is
     already registered. Since a forked child process is often terminated, we
     postpone the registration until middle initialization in the child. */
6977   if (__kmp_need_register_serial)
6978     __kmp_register_library_startup();
6979 
6980   /* TODO reinitialization of library */
6981   if (TCR_4(__kmp_global.g.g_done)) {
6982     KA_TRACE(10, ("__kmp_do_serial_initialize: reinitialization of library\n"));
6983   }
6984 
6985   __kmp_global.g.g_abort = 0;
6986   TCW_SYNC_4(__kmp_global.g.g_done, FALSE);
6987 
6988 /* initialize the locks */
6989 #if KMP_USE_ADAPTIVE_LOCKS
6990 #if KMP_DEBUG_ADAPTIVE_LOCKS
6991   __kmp_init_speculative_stats();
6992 #endif
6993 #endif
6994 #if KMP_STATS_ENABLED
6995   __kmp_stats_init();
6996 #endif
6997   __kmp_init_lock(&__kmp_global_lock);
6998   __kmp_init_queuing_lock(&__kmp_dispatch_lock);
6999   __kmp_init_lock(&__kmp_debug_lock);
7000   __kmp_init_atomic_lock(&__kmp_atomic_lock);
7001   __kmp_init_atomic_lock(&__kmp_atomic_lock_1i);
7002   __kmp_init_atomic_lock(&__kmp_atomic_lock_2i);
7003   __kmp_init_atomic_lock(&__kmp_atomic_lock_4i);
7004   __kmp_init_atomic_lock(&__kmp_atomic_lock_4r);
7005   __kmp_init_atomic_lock(&__kmp_atomic_lock_8i);
7006   __kmp_init_atomic_lock(&__kmp_atomic_lock_8r);
7007   __kmp_init_atomic_lock(&__kmp_atomic_lock_8c);
7008   __kmp_init_atomic_lock(&__kmp_atomic_lock_10r);
7009   __kmp_init_atomic_lock(&__kmp_atomic_lock_16r);
7010   __kmp_init_atomic_lock(&__kmp_atomic_lock_16c);
7011   __kmp_init_atomic_lock(&__kmp_atomic_lock_20c);
7012   __kmp_init_atomic_lock(&__kmp_atomic_lock_32c);
7013   __kmp_init_bootstrap_lock(&__kmp_forkjoin_lock);
7014   __kmp_init_bootstrap_lock(&__kmp_exit_lock);
7015 #if KMP_USE_MONITOR
7016   __kmp_init_bootstrap_lock(&__kmp_monitor_lock);
7017 #endif
7018   __kmp_init_bootstrap_lock(&__kmp_tp_cached_lock);
7019 
7020   /* conduct initialization and initial setup of configuration */
7021 
7022   __kmp_runtime_initialize();
7023 
7024 #if KMP_MIC_SUPPORTED
7025   __kmp_check_mic_type();
7026 #endif
7027 
7028 // Some global variable initialization moved here from kmp_env_initialize()
7029 #ifdef KMP_DEBUG
7030   kmp_diag = 0;
7031 #endif
7032   __kmp_abort_delay = 0;
7033 
7034   // From __kmp_init_dflt_team_nth()
7035   /* assume the entire machine will be used */
7036   __kmp_dflt_team_nth_ub = __kmp_xproc;
7037   if (__kmp_dflt_team_nth_ub < KMP_MIN_NTH) {
7038     __kmp_dflt_team_nth_ub = KMP_MIN_NTH;
7039   }
7040   if (__kmp_dflt_team_nth_ub > __kmp_sys_max_nth) {
7041     __kmp_dflt_team_nth_ub = __kmp_sys_max_nth;
7042   }
7043   __kmp_max_nth = __kmp_sys_max_nth;
7044   __kmp_cg_max_nth = __kmp_sys_max_nth;
7045   __kmp_teams_max_nth = __kmp_xproc; // set a "reasonable" default
7046   if (__kmp_teams_max_nth > __kmp_sys_max_nth) {
7047     __kmp_teams_max_nth = __kmp_sys_max_nth;
7048   }
7049 
7050   // Three vars below moved here from __kmp_env_initialize() "KMP_BLOCKTIME"
7051   // part
7052   __kmp_dflt_blocktime = KMP_DEFAULT_BLOCKTIME;
7053 #if KMP_USE_MONITOR
7054   __kmp_monitor_wakeups =
7055       KMP_WAKEUPS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups);
7056   __kmp_bt_intervals =
7057       KMP_INTERVALS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups);
7058 #endif
7059   // From "KMP_LIBRARY" part of __kmp_env_initialize()
7060   __kmp_library = library_throughput;
7061   // From KMP_SCHEDULE initialization
7062   __kmp_static = kmp_sch_static_balanced;
7063 // AC: do not use analytical here, because it is non-monotonous
7064 //__kmp_guided = kmp_sch_guided_iterative_chunked;
7065 //__kmp_auto = kmp_sch_guided_analytical_chunked; // AC: it is the default, no
7066 // need to repeat assignment
7067 // Barrier initialization. Moved here from __kmp_env_initialize() Barrier branch
7068 // bit control and barrier method control parts
7069 #if KMP_FAST_REDUCTION_BARRIER
7070 #define kmp_reduction_barrier_gather_bb ((int)1)
7071 #define kmp_reduction_barrier_release_bb ((int)1)
7072 #define kmp_reduction_barrier_gather_pat __kmp_barrier_gather_pat_dflt
7073 #define kmp_reduction_barrier_release_pat __kmp_barrier_release_pat_dflt
7074 #endif // KMP_FAST_REDUCTION_BARRIER
7075   for (i = bs_plain_barrier; i < bs_last_barrier; i++) {
7076     __kmp_barrier_gather_branch_bits[i] = __kmp_barrier_gather_bb_dflt;
7077     __kmp_barrier_release_branch_bits[i] = __kmp_barrier_release_bb_dflt;
7078     __kmp_barrier_gather_pattern[i] = __kmp_barrier_gather_pat_dflt;
7079     __kmp_barrier_release_pattern[i] = __kmp_barrier_release_pat_dflt;
7080 #if KMP_FAST_REDUCTION_BARRIER
    if (i == bs_reduction_barrier) {
      // Tested and confirmed on ALTIX only (lin_64): hyper,1
7083       __kmp_barrier_gather_branch_bits[i] = kmp_reduction_barrier_gather_bb;
7084       __kmp_barrier_release_branch_bits[i] = kmp_reduction_barrier_release_bb;
7085       __kmp_barrier_gather_pattern[i] = kmp_reduction_barrier_gather_pat;
7086       __kmp_barrier_release_pattern[i] = kmp_reduction_barrier_release_pat;
7087     }
7088 #endif // KMP_FAST_REDUCTION_BARRIER
7089   }
7090 #if KMP_FAST_REDUCTION_BARRIER
7091 #undef kmp_reduction_barrier_release_pat
7092 #undef kmp_reduction_barrier_gather_pat
7093 #undef kmp_reduction_barrier_release_bb
7094 #undef kmp_reduction_barrier_gather_bb
7095 #endif // KMP_FAST_REDUCTION_BARRIER
7096 #if KMP_MIC_SUPPORTED
7097   if (__kmp_mic_type == mic2) { // KNC
    // AC: plain=3,2, forkjoin=2,1 are optimal for 240 threads on KNC
7099     __kmp_barrier_gather_branch_bits[bs_plain_barrier] = 3; // plain gather
7100     __kmp_barrier_release_branch_bits[bs_forkjoin_barrier] =
7101         1; // forkjoin release
7102     __kmp_barrier_gather_pattern[bs_forkjoin_barrier] = bp_hierarchical_bar;
7103     __kmp_barrier_release_pattern[bs_forkjoin_barrier] = bp_hierarchical_bar;
7104   }
7105 #if KMP_FAST_REDUCTION_BARRIER
7106   if (__kmp_mic_type == mic2) { // KNC
7107     __kmp_barrier_gather_pattern[bs_reduction_barrier] = bp_hierarchical_bar;
7108     __kmp_barrier_release_pattern[bs_reduction_barrier] = bp_hierarchical_bar;
7109   }
7110 #endif // KMP_FAST_REDUCTION_BARRIER
7111 #endif // KMP_MIC_SUPPORTED
7112 
7113 // From KMP_CHECKS initialization
7114 #ifdef KMP_DEBUG
7115   __kmp_env_checks = TRUE; /* development versions have the extra checks */
7116 #else
7117   __kmp_env_checks = FALSE; /* port versions do not have the extra checks */
7118 #endif
7119 
7120   // From "KMP_FOREIGN_THREADS_THREADPRIVATE" initialization
7121   __kmp_foreign_tp = TRUE;
7122 
7123   __kmp_global.g.g_dynamic = FALSE;
7124   __kmp_global.g.g_dynamic_mode = dynamic_default;
7125 
7126   __kmp_init_nesting_mode();
7127 
7128   __kmp_env_initialize(NULL);
7129 
7130 #if KMP_HAVE_MWAIT || KMP_HAVE_UMWAIT
7131   __kmp_user_level_mwait_init();
7132 #endif
7133 // Print all messages in message catalog for testing purposes.
7134 #ifdef KMP_DEBUG
7135   char const *val = __kmp_env_get("KMP_DUMP_CATALOG");
7136   if (__kmp_str_match_true(val)) {
7137     kmp_str_buf_t buffer;
7138     __kmp_str_buf_init(&buffer);
7139     __kmp_i18n_dump_catalog(&buffer);
7140     __kmp_printf("%s", buffer.str);
7141     __kmp_str_buf_free(&buffer);
7142   }
7143   __kmp_env_free(&val);
7144 #endif
7145 
7146   __kmp_threads_capacity =
7147       __kmp_initial_threads_capacity(__kmp_dflt_team_nth_ub);
7148   // Moved here from __kmp_env_initialize() "KMP_ALL_THREADPRIVATE" part
7149   __kmp_tp_capacity = __kmp_default_tp_capacity(
7150       __kmp_dflt_team_nth_ub, __kmp_max_nth, __kmp_allThreadsSpecified);
7151 
7152   // If the library is shut down properly, both pools must be NULL. Just in
7153   // case, set them to NULL -- some memory may leak, but subsequent code will
7154   // work even if pools are not freed.
7155   KMP_DEBUG_ASSERT(__kmp_thread_pool == NULL);
7156   KMP_DEBUG_ASSERT(__kmp_thread_pool_insert_pt == NULL);
7157   KMP_DEBUG_ASSERT(__kmp_team_pool == NULL);
7158   __kmp_thread_pool = NULL;
7159   __kmp_thread_pool_insert_pt = NULL;
7160   __kmp_team_pool = NULL;
7161 
7162   /* Allocate all of the variable sized records */
7163   /* NOTE: __kmp_threads_capacity entries are allocated, but the arrays are
7164    * expandable */
7165   /* Since allocation is cache-aligned, just add extra padding at the end */
7166   size =
7167       (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * __kmp_threads_capacity +
7168       CACHE_LINE;
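  // __kmp_threads and __kmp_root share this single allocation; the root array
  // starts immediately after the threads array (see __kmp_cleanup()).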
7169   __kmp_threads = (kmp_info_t **)__kmp_allocate(size);
7170   __kmp_root = (kmp_root_t **)((char *)__kmp_threads +
7171                                sizeof(kmp_info_t *) * __kmp_threads_capacity);
7172 
7173   /* init thread counts */
7174   KMP_DEBUG_ASSERT(__kmp_all_nth ==
7175                    0); // Asserts fail if the library is reinitializing and
7176   KMP_DEBUG_ASSERT(__kmp_nth == 0); // something was wrong in termination.
7177   __kmp_all_nth = 0;
7178   __kmp_nth = 0;
7179 
7180   /* setup the uber master thread and hierarchy */
7181   gtid = __kmp_register_root(TRUE);
7182   KA_TRACE(10, ("__kmp_do_serial_initialize  T#%d\n", gtid));
7183   KMP_ASSERT(KMP_UBER_GTID(gtid));
7184   KMP_ASSERT(KMP_INITIAL_GTID(gtid));
7185 
7186   KMP_MB(); /* Flush all pending memory write invalidates.  */
7187 
7188   __kmp_common_initialize();
7189 
7190 #if KMP_OS_UNIX
7191   /* invoke the child fork handler */
7192   __kmp_register_atfork();
7193 #endif
7194 
7195 #if !KMP_DYNAMIC_LIB
7196   {
7197     /* Invoke the exit handler when the program finishes, only for static
7198        library. For dynamic library, we already have _fini and DllMain. */
7199     int rc = atexit(__kmp_internal_end_atexit);
7200     if (rc != 0) {
7201       __kmp_fatal(KMP_MSG(FunctionError, "atexit()"), KMP_ERR(rc),
7202                   __kmp_msg_null);
7203     }
7204   }
7205 #endif
7206 
7207 #if KMP_HANDLE_SIGNALS
7208 #if KMP_OS_UNIX
7209   /* NOTE: make sure that this is called before the user installs their own
     signal handlers so that the user handlers are called first. This way they
7211      can return false, not call our handler, avoid terminating the library, and
7212      continue execution where they left off. */
7213   __kmp_install_signals(FALSE);
7214 #endif /* KMP_OS_UNIX */
7215 #if KMP_OS_WINDOWS
7216   __kmp_install_signals(TRUE);
7217 #endif /* KMP_OS_WINDOWS */
7218 #endif
7219 
7220   /* we have finished the serial initialization */
7221   __kmp_init_counter++;
7222 
7223   __kmp_init_serial = TRUE;
7224 
7225   if (__kmp_settings) {
7226     __kmp_env_print();
7227   }
7228 
7229   if (__kmp_display_env || __kmp_display_env_verbose) {
7230     __kmp_env_print_2();
7231   }
7232 
7233 #if OMPT_SUPPORT
7234   ompt_post_init();
7235 #endif
7236 
7237   KMP_MB();
7238 
7239   KA_TRACE(10, ("__kmp_do_serial_initialize: exit\n"));
7240 }
7241 
7242 void __kmp_serial_initialize(void) {
7243   if (__kmp_init_serial) {
7244     return;
7245   }
7246   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7247   if (__kmp_init_serial) {
7248     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7249     return;
7250   }
7251   __kmp_do_serial_initialize();
7252   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7253 }
7254 
7255 static void __kmp_do_middle_initialize(void) {
7256   int i, j;
7257   int prev_dflt_team_nth;
7258 
7259   if (!__kmp_init_serial) {
7260     __kmp_do_serial_initialize();
7261   }
7262 
7263   KA_TRACE(10, ("__kmp_middle_initialize: enter\n"));
7264 
7265   if (UNLIKELY(!__kmp_need_register_serial)) {
7266     // We are in a forked child process. The registration was skipped during
    // serial initialization in the __kmp_atfork_child handler. Do it here.
7268     __kmp_register_library_startup();
7269   }
7270 
7271   // Save the previous value for the __kmp_dflt_team_nth so that
7272   // we can avoid some reinitialization if it hasn't changed.
7273   prev_dflt_team_nth = __kmp_dflt_team_nth;
7274 
7275 #if KMP_AFFINITY_SUPPORTED
7276   // __kmp_affinity_initialize() will try to set __kmp_ncores to the
7277   // number of cores on the machine.
7278   __kmp_affinity_initialize();
7279 
7280 #endif /* KMP_AFFINITY_SUPPORTED */
7281 
7282   KMP_ASSERT(__kmp_xproc > 0);
7283   if (__kmp_avail_proc == 0) {
7284     __kmp_avail_proc = __kmp_xproc;
7285   }
7286 
7287   // If there were empty places in num_threads list (OMP_NUM_THREADS=,,2,3),
7288   // correct them now
7289   j = 0;
7290   while ((j < __kmp_nested_nth.used) && !__kmp_nested_nth.nth[j]) {
7291     __kmp_nested_nth.nth[j] = __kmp_dflt_team_nth = __kmp_dflt_team_nth_ub =
7292         __kmp_avail_proc;
7293     j++;
7294   }
7295 
7296   if (__kmp_dflt_team_nth == 0) {
7297 #ifdef KMP_DFLT_NTH_CORES
7298     // Default #threads = #cores
7299     __kmp_dflt_team_nth = __kmp_ncores;
7300     KA_TRACE(20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = "
7301                   "__kmp_ncores (%d)\n",
7302                   __kmp_dflt_team_nth));
7303 #else
7304     // Default #threads = #available OS procs
7305     __kmp_dflt_team_nth = __kmp_avail_proc;
7306     KA_TRACE(20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = "
7307                   "__kmp_avail_proc(%d)\n",
7308                   __kmp_dflt_team_nth));
7309 #endif /* KMP_DFLT_NTH_CORES */
7310   }
7311 
7312   if (__kmp_dflt_team_nth < KMP_MIN_NTH) {
7313     __kmp_dflt_team_nth = KMP_MIN_NTH;
7314   }
7315   if (__kmp_dflt_team_nth > __kmp_sys_max_nth) {
7316     __kmp_dflt_team_nth = __kmp_sys_max_nth;
7317   }
7318 
7319   if (__kmp_nesting_mode > 0)
7320     __kmp_set_nesting_mode_threads();
7321 
7322   // There's no harm in continuing if the following check fails,
7323   // but it indicates an error in the previous logic.
7324   KMP_DEBUG_ASSERT(__kmp_dflt_team_nth <= __kmp_dflt_team_nth_ub);
7325 
7326   if (__kmp_dflt_team_nth != prev_dflt_team_nth) {
7327     // Run through the __kmp_threads array and set the num threads icv for each
7328     // root thread that is currently registered with the RTL (which has not
7329     // already explicitly set its nthreads-var with a call to
7330     // omp_set_num_threads()).
7331     for (i = 0; i < __kmp_threads_capacity; i++) {
7332       kmp_info_t *thread = __kmp_threads[i];
7333       if (thread == NULL)
7334         continue;
7335       if (thread->th.th_current_task->td_icvs.nproc != 0)
7336         continue;
7337 
7338       set__nproc(__kmp_threads[i], __kmp_dflt_team_nth);
7339     }
7340   }
7341   KA_TRACE(
7342       20,
7343       ("__kmp_middle_initialize: final value for __kmp_dflt_team_nth = %d\n",
7344        __kmp_dflt_team_nth));
7345 
7346 #ifdef KMP_ADJUST_BLOCKTIME
7347   /* Adjust blocktime to zero if necessary  now that __kmp_avail_proc is set */
7348   if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
7349     KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
7350     if (__kmp_nth > __kmp_avail_proc) {
7351       __kmp_zero_bt = TRUE;
7352     }
7353   }
7354 #endif /* KMP_ADJUST_BLOCKTIME */
7355 
7356   /* we have finished middle initialization */
7357   TCW_SYNC_4(__kmp_init_middle, TRUE);
7358 
7359   KA_TRACE(10, ("__kmp_do_middle_initialize: exit\n"));
7360 }
7361 
7362 void __kmp_middle_initialize(void) {
7363   if (__kmp_init_middle) {
7364     return;
7365   }
7366   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7367   if (__kmp_init_middle) {
7368     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7369     return;
7370   }
7371   __kmp_do_middle_initialize();
7372   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7373 }
7374 
7375 void __kmp_parallel_initialize(void) {
7376   int gtid = __kmp_entry_gtid(); // this might be a new root
7377 
7378   /* synchronize parallel initialization (for sibling) */
7379   if (TCR_4(__kmp_init_parallel))
7380     return;
7381   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7382   if (TCR_4(__kmp_init_parallel)) {
7383     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7384     return;
7385   }
7386 
7387   /* TODO reinitialization after we have already shut down */
7388   if (TCR_4(__kmp_global.g.g_done)) {
7389     KA_TRACE(
7390         10,
7391         ("__kmp_parallel_initialize: attempt to init while shutting down\n"));
7392     __kmp_infinite_loop();
7393   }
7394 
7395   /* jc: The lock __kmp_initz_lock is already held, so calling
7396      __kmp_serial_initialize would cause a deadlock.  So we call
7397      __kmp_do_serial_initialize directly. */
7398   if (!__kmp_init_middle) {
7399     __kmp_do_middle_initialize();
7400   }
7401   __kmp_assign_root_init_mask();
7402   __kmp_resume_if_hard_paused();
7403 
7404   /* begin initialization */
7405   KA_TRACE(10, ("__kmp_parallel_initialize: enter\n"));
7406   KMP_ASSERT(KMP_UBER_GTID(gtid));
7407 
7408 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
7409   // Save the FP control regs.
7410   // Worker threads will set theirs to these values at thread startup.
7411   __kmp_store_x87_fpu_control_word(&__kmp_init_x87_fpu_control_word);
7412   __kmp_store_mxcsr(&__kmp_init_mxcsr);
7413   __kmp_init_mxcsr &= KMP_X86_MXCSR_MASK;
7414 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
7415 
7416 #if KMP_OS_UNIX
7417 #if KMP_HANDLE_SIGNALS
7418   /*  must be after __kmp_serial_initialize  */
7419   __kmp_install_signals(TRUE);
7420 #endif
7421 #endif
7422 
7423   __kmp_suspend_initialize();
7424 
7425 #if defined(USE_LOAD_BALANCE)
7426   if (__kmp_global.g.g_dynamic_mode == dynamic_default) {
7427     __kmp_global.g.g_dynamic_mode = dynamic_load_balance;
7428   }
7429 #else
7430   if (__kmp_global.g.g_dynamic_mode == dynamic_default) {
7431     __kmp_global.g.g_dynamic_mode = dynamic_thread_limit;
7432   }
7433 #endif
7434 
7435   if (__kmp_version) {
7436     __kmp_print_version_2();
7437   }
7438 
7439   /* we have finished parallel initialization */
7440   TCW_SYNC_4(__kmp_init_parallel, TRUE);
7441 
7442   KMP_MB();
7443   KA_TRACE(10, ("__kmp_parallel_initialize: exit\n"));
7444 
7445   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7446 }
7447 
7448 void __kmp_hidden_helper_initialize() {
7449   if (TCR_4(__kmp_init_hidden_helper))
7450     return;
7451 
7452   // __kmp_parallel_initialize is required before we initialize hidden helper
7453   if (!TCR_4(__kmp_init_parallel))
7454     __kmp_parallel_initialize();
7455 
  // Double check. Note that this double check should not be placed before
  // __kmp_parallel_initialize as it will cause a deadlock.
7458   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7459   if (TCR_4(__kmp_init_hidden_helper)) {
7460     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7461     return;
7462   }
7463 
7464   // Set the count of hidden helper tasks to be executed to zero
7465   KMP_ATOMIC_ST_REL(&__kmp_unexecuted_hidden_helper_tasks, 0);
7466 
7467   // Set the global variable indicating that we're initializing hidden helper
7468   // team/threads
7469   TCW_SYNC_4(__kmp_init_hidden_helper_threads, TRUE);
7470 
7471   // Platform independent initialization
7472   __kmp_do_initialize_hidden_helper_threads();
7473 
  // Wait here for the initialization of the hidden helper teams to finish
7475   __kmp_hidden_helper_threads_initz_wait();
7476 
7477   // We have finished hidden helper initialization
7478   TCW_SYNC_4(__kmp_init_hidden_helper, TRUE);
7479 
7480   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7481 }
7482 
7483 /* ------------------------------------------------------------------------ */
7484 
7485 void __kmp_run_before_invoked_task(int gtid, int tid, kmp_info_t *this_thr,
7486                                    kmp_team_t *team) {
7487   kmp_disp_t *dispatch;
7488 
7489   KMP_MB();
7490 
7491   /* none of the threads have encountered any constructs, yet. */
7492   this_thr->th.th_local.this_construct = 0;
7493 #if KMP_CACHE_MANAGE
7494   KMP_CACHE_PREFETCH(&this_thr->th.th_bar[bs_forkjoin_barrier].bb.b_arrived);
7495 #endif /* KMP_CACHE_MANAGE */
7496   dispatch = (kmp_disp_t *)TCR_PTR(this_thr->th.th_dispatch);
7497   KMP_DEBUG_ASSERT(dispatch);
7498   KMP_DEBUG_ASSERT(team->t.t_dispatch);
7499   // KMP_DEBUG_ASSERT( this_thr->th.th_dispatch == &team->t.t_dispatch[
7500   // this_thr->th.th_info.ds.ds_tid ] );
7501 
7502   dispatch->th_disp_index = 0; /* reset the dispatch buffer counter */
7503   dispatch->th_doacross_buf_idx = 0; // reset doacross dispatch buffer counter
7504   if (__kmp_env_consistency_check)
7505     __kmp_push_parallel(gtid, team->t.t_ident);
7506 
7507   KMP_MB(); /* Flush all pending memory write invalidates.  */
7508 }
7509 
7510 void __kmp_run_after_invoked_task(int gtid, int tid, kmp_info_t *this_thr,
7511                                   kmp_team_t *team) {
7512   if (__kmp_env_consistency_check)
7513     __kmp_pop_parallel(gtid, team->t.t_ident);
7514 
7515   __kmp_finish_implicit_task(this_thr);
7516 }
7517 
7518 int __kmp_invoke_task_func(int gtid) {
7519   int rc;
7520   int tid = __kmp_tid_from_gtid(gtid);
7521   kmp_info_t *this_thr = __kmp_threads[gtid];
7522   kmp_team_t *team = this_thr->th.th_team;
7523 
7524   __kmp_run_before_invoked_task(gtid, tid, this_thr, team);
7525 #if USE_ITT_BUILD
7526   if (__itt_stack_caller_create_ptr) {
7527     // inform ittnotify about entering user's code
7528     if (team->t.t_stack_id != NULL) {
7529       __kmp_itt_stack_callee_enter((__itt_caller)team->t.t_stack_id);
7530     } else {
7531       KMP_DEBUG_ASSERT(team->t.t_parent->t.t_stack_id != NULL);
7532       __kmp_itt_stack_callee_enter(
7533           (__itt_caller)team->t.t_parent->t.t_stack_id);
7534     }
7535   }
7536 #endif /* USE_ITT_BUILD */
7537 #if INCLUDE_SSC_MARKS
7538   SSC_MARK_INVOKING();
7539 #endif
7540 
7541 #if OMPT_SUPPORT
7542   void *dummy;
7543   void **exit_frame_p;
7544   ompt_data_t *my_task_data;
7545   ompt_data_t *my_parallel_data;
7546   int ompt_team_size;
7547 
7548   if (ompt_enabled.enabled) {
7549     exit_frame_p = &(team->t.t_implicit_task_taskdata[tid]
7550                          .ompt_task_info.frame.exit_frame.ptr);
7551   } else {
7552     exit_frame_p = &dummy;
7553   }
7554 
7555   my_task_data =
7556       &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data);
7557   my_parallel_data = &(team->t.ompt_team_info.parallel_data);
7558   if (ompt_enabled.ompt_callback_implicit_task) {
7559     ompt_team_size = team->t.t_nproc;
7560     ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
7561         ompt_scope_begin, my_parallel_data, my_task_data, ompt_team_size,
7562         __kmp_tid_from_gtid(gtid), ompt_task_implicit);
7563     OMPT_CUR_TASK_INFO(this_thr)->thread_num = __kmp_tid_from_gtid(gtid);
7564   }
7565 #endif
7566 
7567 #if KMP_STATS_ENABLED
7568   stats_state_e previous_state = KMP_GET_THREAD_STATE();
7569   if (previous_state == stats_state_e::TEAMS_REGION) {
7570     KMP_PUSH_PARTITIONED_TIMER(OMP_teams);
7571   } else {
7572     KMP_PUSH_PARTITIONED_TIMER(OMP_parallel);
7573   }
7574   KMP_SET_THREAD_STATE(IMPLICIT_TASK);
7575 #endif
7576 
7577   rc = __kmp_invoke_microtask((microtask_t)TCR_SYNC_PTR(team->t.t_pkfn), gtid,
7578                               tid, (int)team->t.t_argc, (void **)team->t.t_argv
7579 #if OMPT_SUPPORT
7580                               ,
7581                               exit_frame_p
7582 #endif
7583   );
7584 #if OMPT_SUPPORT
7585   *exit_frame_p = NULL;
7586   this_thr->th.ompt_thread_info.parallel_flags |= ompt_parallel_team;
7587 #endif
7588 
7589 #if KMP_STATS_ENABLED
7590   if (previous_state == stats_state_e::TEAMS_REGION) {
7591     KMP_SET_THREAD_STATE(previous_state);
7592   }
7593   KMP_POP_PARTITIONED_TIMER();
7594 #endif
7595 
7596 #if USE_ITT_BUILD
7597   if (__itt_stack_caller_create_ptr) {
7598     // inform ittnotify about leaving user's code
7599     if (team->t.t_stack_id != NULL) {
7600       __kmp_itt_stack_callee_leave((__itt_caller)team->t.t_stack_id);
7601     } else {
7602       KMP_DEBUG_ASSERT(team->t.t_parent->t.t_stack_id != NULL);
7603       __kmp_itt_stack_callee_leave(
7604           (__itt_caller)team->t.t_parent->t.t_stack_id);
7605     }
7606   }
7607 #endif /* USE_ITT_BUILD */
7608   __kmp_run_after_invoked_task(gtid, tid, this_thr, team);
7609 
7610   return rc;
7611 }
7612 
7613 void __kmp_teams_master(int gtid) {
  // This routine is called by all primary threads in the teams construct
7615   kmp_info_t *thr = __kmp_threads[gtid];
7616   kmp_team_t *team = thr->th.th_team;
7617   ident_t *loc = team->t.t_ident;
7618   thr->th.th_set_nproc = thr->th.th_teams_size.nth;
7619   KMP_DEBUG_ASSERT(thr->th.th_teams_microtask);
7620   KMP_DEBUG_ASSERT(thr->th.th_set_nproc);
7621   KA_TRACE(20, ("__kmp_teams_master: T#%d, Tid %d, microtask %p\n", gtid,
7622                 __kmp_tid_from_gtid(gtid), thr->th.th_teams_microtask));
7623 
7624   // This thread is a new CG root.  Set up the proper variables.
7625   kmp_cg_root_t *tmp = (kmp_cg_root_t *)__kmp_allocate(sizeof(kmp_cg_root_t));
7626   tmp->cg_root = thr; // Make thr the CG root
  // Init to the thread limit stored when the league's primary threads were
  // forked
7628   tmp->cg_thread_limit = thr->th.th_current_task->td_icvs.thread_limit;
7629   tmp->cg_nthreads = 1; // Init counter to one active thread, this one
7630   KA_TRACE(100, ("__kmp_teams_master: Thread %p created node %p and init"
7631                  " cg_nthreads to 1\n",
7632                  thr, tmp));
7633   tmp->up = thr->th.th_cg_roots;
7634   thr->th.th_cg_roots = tmp;
7635 
// Launch the league of teams now, but do not let workers execute
// (they hang on the fork barrier until the next parallel region)
7638 #if INCLUDE_SSC_MARKS
7639   SSC_MARK_FORKING();
7640 #endif
7641   __kmp_fork_call(loc, gtid, fork_context_intel, team->t.t_argc,
7642                   (microtask_t)thr->th.th_teams_microtask, // "wrapped" task
7643                   VOLATILE_CAST(launch_t) __kmp_invoke_task_func, NULL);
7644 #if INCLUDE_SSC_MARKS
7645   SSC_MARK_JOINING();
7646 #endif
7647   // If the team size was reduced from the limit, set it to the new size
7648   if (thr->th.th_team_nproc < thr->th.th_teams_size.nth)
7649     thr->th.th_teams_size.nth = thr->th.th_team_nproc;
  // AC: last parameter "1" eliminates the join barrier, which won't work
  // because worker threads are in a fork barrier waiting for more parallel
  // regions
7652   __kmp_join_call(loc, gtid
7653 #if OMPT_SUPPORT
7654                   ,
7655                   fork_context_intel
7656 #endif
7657                   ,
7658                   1);
7659 }
7660 
7661 int __kmp_invoke_teams_master(int gtid) {
7662   kmp_info_t *this_thr = __kmp_threads[gtid];
7663   kmp_team_t *team = this_thr->th.th_team;
7664 #if KMP_DEBUG
7665   if (!__kmp_threads[gtid]->th.th_team->t.t_serialized)
7666     KMP_DEBUG_ASSERT((void *)__kmp_threads[gtid]->th.th_team->t.t_pkfn ==
7667                      (void *)__kmp_teams_master);
7668 #endif
7669   __kmp_run_before_invoked_task(gtid, 0, this_thr, team);
7670 #if OMPT_SUPPORT
7671   int tid = __kmp_tid_from_gtid(gtid);
7672   ompt_data_t *task_data =
7673       &team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data;
7674   ompt_data_t *parallel_data = &team->t.ompt_team_info.parallel_data;
7675   if (ompt_enabled.ompt_callback_implicit_task) {
7676     ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
7677         ompt_scope_begin, parallel_data, task_data, team->t.t_nproc, tid,
7678         ompt_task_initial);
7679     OMPT_CUR_TASK_INFO(this_thr)->thread_num = tid;
7680   }
7681 #endif
7682   __kmp_teams_master(gtid);
7683 #if OMPT_SUPPORT
7684   this_thr->th.ompt_thread_info.parallel_flags |= ompt_parallel_league;
7685 #endif
7686   __kmp_run_after_invoked_task(gtid, 0, this_thr, team);
7687   return 1;
7688 }
7689 
/* This sets the requested number of threads for the next parallel region
   encountered by this team. Since this should be enclosed in the forkjoin
   critical section, it should avoid race conditions with asymmetrical nested
   parallelism. */
7694 
7695 void __kmp_push_num_threads(ident_t *id, int gtid, int num_threads) {
7696   kmp_info_t *thr = __kmp_threads[gtid];
7697 
7698   if (num_threads > 0)
7699     thr->th.th_set_nproc = num_threads;
7700 }
7701 
7702 static void __kmp_push_thread_limit(kmp_info_t *thr, int num_teams,
7703                                     int num_threads) {
7704   KMP_DEBUG_ASSERT(thr);
7705   // Remember the number of threads for inner parallel regions
7706   if (!TCR_4(__kmp_init_middle))
7707     __kmp_middle_initialize(); // get internal globals calculated
7708   __kmp_assign_root_init_mask();
7709   KMP_DEBUG_ASSERT(__kmp_avail_proc);
7710   KMP_DEBUG_ASSERT(__kmp_dflt_team_nth);
7711 
7712   if (num_threads == 0) {
7713     if (__kmp_teams_thread_limit > 0) {
7714       num_threads = __kmp_teams_thread_limit;
7715     } else {
7716       num_threads = __kmp_avail_proc / num_teams;
7717     }
    // adjust num_threads w/o warning as it is not a user setting
7719     // num_threads = min(num_threads, nthreads-var, thread-limit-var)
7720     // no thread_limit clause specified -  do not change thread-limit-var ICV
7721     if (num_threads > __kmp_dflt_team_nth) {
7722       num_threads = __kmp_dflt_team_nth; // honor nthreads-var ICV
7723     }
7724     if (num_threads > thr->th.th_current_task->td_icvs.thread_limit) {
7725       num_threads = thr->th.th_current_task->td_icvs.thread_limit;
    } // prevent the team size from exceeding thread-limit-var
7727     if (num_teams * num_threads > __kmp_teams_max_nth) {
7728       num_threads = __kmp_teams_max_nth / num_teams;
7729     }
7730     if (num_threads == 0) {
7731       num_threads = 1;
7732     }
7733   } else {
7734     if (num_threads < 0) {
7735       __kmp_msg(kmp_ms_warning, KMP_MSG(CantFormThrTeam, num_threads, 1),
7736                 __kmp_msg_null);
7737       num_threads = 1;
7738     }
    // This thread will be the primary thread of the league's primary threads.
    // Store the new thread limit; the old limit is saved in the th_cg_roots
    // list.
7741     thr->th.th_current_task->td_icvs.thread_limit = num_threads;
7742     // num_threads = min(num_threads, nthreads-var)
7743     if (num_threads > __kmp_dflt_team_nth) {
7744       num_threads = __kmp_dflt_team_nth; // honor nthreads-var ICV
7745     }
7746     if (num_teams * num_threads > __kmp_teams_max_nth) {
7747       int new_threads = __kmp_teams_max_nth / num_teams;
7748       if (new_threads == 0) {
7749         new_threads = 1;
7750       }
7751       if (new_threads != num_threads) {
7752         if (!__kmp_reserve_warn) { // user asked for too many threads
7753           __kmp_reserve_warn = 1; // conflicts with KMP_TEAMS_THREAD_LIMIT
7754           __kmp_msg(kmp_ms_warning,
7755                     KMP_MSG(CantFormThrTeam, num_threads, new_threads),
7756                     KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
7757         }
7758       }
7759       num_threads = new_threads;
7760     }
7761   }
7762   thr->th.th_teams_size.nth = num_threads;
7763 }
7764 
7765 /* this sets the requested number of teams for the teams region and/or
7766    the number of threads for the next parallel region encountered  */
7767 void __kmp_push_num_teams(ident_t *id, int gtid, int num_teams,
7768                           int num_threads) {
7769   kmp_info_t *thr = __kmp_threads[gtid];
7770   if (num_teams < 0) {
    // The OpenMP specification requires requested values to be positive,
    // but people can send us any value, so we'd better check.
7773     __kmp_msg(kmp_ms_warning, KMP_MSG(NumTeamsNotPositive, num_teams, 1),
7774               __kmp_msg_null);
7775     num_teams = 1;
7776   }
7777   if (num_teams == 0) {
7778     if (__kmp_nteams > 0) {
7779       num_teams = __kmp_nteams;
7780     } else {
7781       num_teams = 1; // default number of teams is 1.
7782     }
7783   }
  if (num_teams > __kmp_teams_max_nth) { // were too many teams requested?
7785     if (!__kmp_reserve_warn) {
7786       __kmp_reserve_warn = 1;
7787       __kmp_msg(kmp_ms_warning,
7788                 KMP_MSG(CantFormThrTeam, num_teams, __kmp_teams_max_nth),
7789                 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
7790     }
7791     num_teams = __kmp_teams_max_nth;
7792   }
7793   // Set number of teams (number of threads in the outer "parallel" of the
7794   // teams)
7795   thr->th.th_set_nproc = thr->th.th_teams_size.nteams = num_teams;
7796 
7797   __kmp_push_thread_limit(thr, num_teams, num_threads);
7798 }
7799 
7800 /* This sets the requested number of teams for the teams region and/or
7801    the number of threads for the next parallel region encountered  */
7802 void __kmp_push_num_teams_51(ident_t *id, int gtid, int num_teams_lb,
7803                              int num_teams_ub, int num_threads) {
7804   kmp_info_t *thr = __kmp_threads[gtid];
7805   KMP_DEBUG_ASSERT(num_teams_lb >= 0 && num_teams_ub >= 0);
7806   KMP_DEBUG_ASSERT(num_teams_ub >= num_teams_lb);
7807   KMP_DEBUG_ASSERT(num_threads >= 0);
7808 
7809   if (num_teams_lb > num_teams_ub) {
7810     __kmp_fatal(KMP_MSG(FailedToCreateTeam, num_teams_lb, num_teams_ub),
7811                 KMP_HNT(SetNewBound, __kmp_teams_max_nth), __kmp_msg_null);
7812   }
7813 
  int num_teams = 1; // default number of teams is 1.
7815 
7816   if (num_teams_lb == 0 && num_teams_ub > 0)
7817     num_teams_lb = num_teams_ub;
7818 
7819   if (num_teams_lb == 0 && num_teams_ub == 0) { // no num_teams clause
7820     num_teams = (__kmp_nteams > 0) ? __kmp_nteams : num_teams;
7821     if (num_teams > __kmp_teams_max_nth) {
7822       if (!__kmp_reserve_warn) {
7823         __kmp_reserve_warn = 1;
7824         __kmp_msg(kmp_ms_warning,
7825                   KMP_MSG(CantFormThrTeam, num_teams, __kmp_teams_max_nth),
7826                   KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
7827       }
7828       num_teams = __kmp_teams_max_nth;
7829     }
7830   } else if (num_teams_lb == num_teams_ub) { // requires exact number of teams
7831     num_teams = num_teams_ub;
7832   } else { // num_teams_lb <= num_teams <= num_teams_ub
7833     if (num_threads <= 0) {
7834       if (num_teams_ub > __kmp_teams_max_nth) {
7835         num_teams = num_teams_lb;
7836       } else {
7837         num_teams = num_teams_ub;
7838       }
7839     } else {
7840       num_teams = (num_threads > __kmp_teams_max_nth)
7841                       ? num_teams
7842                       : __kmp_teams_max_nth / num_threads;
7843       if (num_teams < num_teams_lb) {
7844         num_teams = num_teams_lb;
7845       } else if (num_teams > num_teams_ub) {
7846         num_teams = num_teams_ub;
7847       }
7848     }
7849   }
7850   // Set number of teams (number of threads in the outer "parallel" of the
7851   // teams)
7852   thr->th.th_set_nproc = thr->th.th_teams_size.nteams = num_teams;
7853 
7854   __kmp_push_thread_limit(thr, num_teams, num_threads);
7855 }
7856 
7857 // Set the proc_bind var to use in the following parallel region.
7858 void __kmp_push_proc_bind(ident_t *id, int gtid, kmp_proc_bind_t proc_bind) {
7859   kmp_info_t *thr = __kmp_threads[gtid];
7860   thr->th.th_set_proc_bind = proc_bind;
7861 }
7862 
7863 /* Launch the worker threads into the microtask. */
7864 
7865 void __kmp_internal_fork(ident_t *id, int gtid, kmp_team_t *team) {
7866   kmp_info_t *this_thr = __kmp_threads[gtid];
7867 
7868 #ifdef KMP_DEBUG
7869   int f;
7870 #endif /* KMP_DEBUG */
7871 
7872   KMP_DEBUG_ASSERT(team);
7873   KMP_DEBUG_ASSERT(this_thr->th.th_team == team);
7874   KMP_ASSERT(KMP_MASTER_GTID(gtid));
7875   KMP_MB(); /* Flush all pending memory write invalidates.  */
7876 
7877   team->t.t_construct = 0; /* no single directives seen yet */
7878   team->t.t_ordered.dt.t_value =
7879       0; /* thread 0 enters the ordered section first */
7880 
7881   /* Reset the identifiers on the dispatch buffer */
7882   KMP_DEBUG_ASSERT(team->t.t_disp_buffer);
7883   if (team->t.t_max_nproc > 1) {
7884     int i;
7885     for (i = 0; i < __kmp_dispatch_num_buffers; ++i) {
7886       team->t.t_disp_buffer[i].buffer_index = i;
7887       team->t.t_disp_buffer[i].doacross_buf_idx = i;
7888     }
7889   } else {
7890     team->t.t_disp_buffer[0].buffer_index = 0;
7891     team->t.t_disp_buffer[0].doacross_buf_idx = 0;
7892   }
7893 
7894   KMP_MB(); /* Flush all pending memory write invalidates.  */
7895   KMP_ASSERT(this_thr->th.th_team == team);
7896 
7897 #ifdef KMP_DEBUG
7898   for (f = 0; f < team->t.t_nproc; f++) {
7899     KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
7900                      team->t.t_threads[f]->th.th_team_nproc == team->t.t_nproc);
7901   }
7902 #endif /* KMP_DEBUG */
7903 
7904   /* release the worker threads so they may begin working */
7905   __kmp_fork_barrier(gtid, 0);
7906 }
7907 
7908 void __kmp_internal_join(ident_t *id, int gtid, kmp_team_t *team) {
7909   kmp_info_t *this_thr = __kmp_threads[gtid];
7910 
7911   KMP_DEBUG_ASSERT(team);
7912   KMP_DEBUG_ASSERT(this_thr->th.th_team == team);
7913   KMP_ASSERT(KMP_MASTER_GTID(gtid));
7914   KMP_MB(); /* Flush all pending memory write invalidates.  */
7915 
7916   /* Join barrier after fork */
7917 
7918 #ifdef KMP_DEBUG
7919   if (__kmp_threads[gtid] &&
7920       __kmp_threads[gtid]->th.th_team_nproc != team->t.t_nproc) {
7921     __kmp_printf("GTID: %d, __kmp_threads[%d]=%p\n", gtid, gtid,
7922                  __kmp_threads[gtid]);
7923     __kmp_printf("__kmp_threads[%d]->th.th_team_nproc=%d, TEAM: %p, "
7924                  "team->t.t_nproc=%d\n",
7925                  gtid, __kmp_threads[gtid]->th.th_team_nproc, team,
7926                  team->t.t_nproc);
7927     __kmp_print_structure();
7928   }
7929   KMP_DEBUG_ASSERT(__kmp_threads[gtid] &&
7930                    __kmp_threads[gtid]->th.th_team_nproc == team->t.t_nproc);
7931 #endif /* KMP_DEBUG */
7932 
7933   __kmp_join_barrier(gtid); /* wait for everyone */
7934 #if OMPT_SUPPORT
7935   if (ompt_enabled.enabled &&
7936       this_thr->th.ompt_thread_info.state == ompt_state_wait_barrier_implicit) {
7937     int ds_tid = this_thr->th.th_info.ds.ds_tid;
7938     ompt_data_t *task_data = OMPT_CUR_TASK_DATA(this_thr);
7939     this_thr->th.ompt_thread_info.state = ompt_state_overhead;
7940 #if OMPT_OPTIONAL
7941     void *codeptr = NULL;
7942     if (KMP_MASTER_TID(ds_tid) &&
7943         (ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) ||
7944          ompt_callbacks.ompt_callback(ompt_callback_sync_region)))
7945       codeptr = OMPT_CUR_TEAM_INFO(this_thr)->master_return_address;
7946 
7947     if (ompt_enabled.ompt_callback_sync_region_wait) {
7948       ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
7949           ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data,
7950           codeptr);
7951     }
7952     if (ompt_enabled.ompt_callback_sync_region) {
7953       ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
7954           ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data,
7955           codeptr);
7956     }
7957 #endif
7958     if (!KMP_MASTER_TID(ds_tid) && ompt_enabled.ompt_callback_implicit_task) {
7959       ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
7960           ompt_scope_end, NULL, task_data, 0, ds_tid,
7961           ompt_task_implicit); // TODO: Can this be ompt_task_initial?
7962     }
7963   }
7964 #endif
7965 
7966   KMP_MB(); /* Flush all pending memory write invalidates.  */
7967   KMP_ASSERT(this_thr->th.th_team == team);
7968 }
7969 
7970 /* ------------------------------------------------------------------------ */
7971 
7972 #ifdef USE_LOAD_BALANCE
7973 
// Return the number of worker threads actively spinning in the hot team, if
// we are at the outermost level of parallelism. Otherwise, return 0.
7976 static int __kmp_active_hot_team_nproc(kmp_root_t *root) {
7977   int i;
7978   int retval;
7979   kmp_team_t *hot_team;
7980 
7981   if (root->r.r_active) {
7982     return 0;
7983   }
7984   hot_team = root->r.r_hot_team;
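  // With "infinite" blocktime, workers never go to sleep, so the whole hot
  // team (minus the primary thread) counts as active.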
7985   if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME) {
7986     return hot_team->t.t_nproc - 1; // Don't count primary thread
7987   }
7988 
7989   // Skip the primary thread - it is accounted for elsewhere.
7990   retval = 0;
7991   for (i = 1; i < hot_team->t.t_nproc; i++) {
7992     if (hot_team->t.t_threads[i]->th.th_active) {
7993       retval++;
7994     }
7995   }
7996   return retval;
7997 }
7998 
7999 // Perform an automatic adjustment to the number of
8000 // threads used by the next parallel region.
8001 static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc) {
8002   int retval;
8003   int pool_active;
8004   int hot_team_active;
8005   int team_curr_active;
8006   int system_active;
8007 
8008   KB_TRACE(20, ("__kmp_load_balance_nproc: called root:%p set_nproc:%d\n", root,
8009                 set_nproc));
8010   KMP_DEBUG_ASSERT(root);
8011   KMP_DEBUG_ASSERT(root->r.r_root_team->t.t_threads[0]
8012                        ->th.th_current_task->td_icvs.dynamic == TRUE);
8013   KMP_DEBUG_ASSERT(set_nproc > 1);
8014 
8015   if (set_nproc == 1) {
8016     KB_TRACE(20, ("__kmp_load_balance_nproc: serial execution.\n"));
8017     return 1;
8018   }
8019 
8020   // Threads that are active in the thread pool, active in the hot team for this
8021   // particular root (if we are at the outer par level), and the currently
8022   // executing thread (to become the primary thread) are available to add to the
8023   // new team, but are currently contributing to the system load, and must be
8024   // accounted for.
8025   pool_active = __kmp_thread_pool_active_nth;
8026   hot_team_active = __kmp_active_hot_team_nproc(root);
8027   team_curr_active = pool_active + hot_team_active + 1;
8028 
8029   // Check the system load.
8030   system_active = __kmp_get_load_balance(__kmp_avail_proc + team_curr_active);
8031   KB_TRACE(30, ("__kmp_load_balance_nproc: system active = %d pool active = %d "
8032                 "hot team active = %d\n",
8033                 system_active, pool_active, hot_team_active));
8034 
8035   if (system_active < 0) {
8036     // There was an error reading the necessary info from /proc, so use the
8037     // thread limit algorithm instead. Once we set __kmp_global.g.g_dynamic_mode
8038     // = dynamic_thread_limit, we shouldn't wind up getting back here.
8039     __kmp_global.g.g_dynamic_mode = dynamic_thread_limit;
8040     KMP_WARNING(CantLoadBalUsing, "KMP_DYNAMIC_MODE=thread limit");
8041 
8042     // Make this call behave like the thread limit algorithm.
8043     retval = __kmp_avail_proc - __kmp_nth +
8044              (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
8045     if (retval > set_nproc) {
8046       retval = set_nproc;
8047     }
8048     if (retval < KMP_MIN_NTH) {
8049       retval = KMP_MIN_NTH;
8050     }
8051 
8052     KB_TRACE(20, ("__kmp_load_balance_nproc: thread limit exit. retval:%d\n",
8053                   retval));
8054     return retval;
8055   }
8056 
8057   // There is a slight delay in the load balance algorithm in detecting new
8058   // running procs. The real system load at this instant should be at least as
  // large as the #active OMP threads that are available to add to the team.
8060   if (system_active < team_curr_active) {
8061     system_active = team_curr_active;
8062   }
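  // The new team's candidate threads (pool, hot team, this thread) are
  // already counted in system_active, so add team_curr_active back when
  // computing the available headroom.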
8063   retval = __kmp_avail_proc - system_active + team_curr_active;
8064   if (retval > set_nproc) {
8065     retval = set_nproc;
8066   }
8067   if (retval < KMP_MIN_NTH) {
8068     retval = KMP_MIN_NTH;
8069   }
8070 
8071   KB_TRACE(20, ("__kmp_load_balance_nproc: exit. retval:%d\n", retval));
8072   return retval;
8073 } // __kmp_load_balance_nproc()
8074 
8075 #endif /* USE_LOAD_BALANCE */
8076 
8077 /* ------------------------------------------------------------------------ */
8078 
8079 /* NOTE: this is called with the __kmp_init_lock held */
8080 void __kmp_cleanup(void) {
8081   int f;
8082 
8083   KA_TRACE(10, ("__kmp_cleanup: enter\n"));
8084 
8085   if (TCR_4(__kmp_init_parallel)) {
8086 #if KMP_HANDLE_SIGNALS
8087     __kmp_remove_signals();
8088 #endif
8089     TCW_4(__kmp_init_parallel, FALSE);
8090   }
8091 
8092   if (TCR_4(__kmp_init_middle)) {
8093 #if KMP_AFFINITY_SUPPORTED
8094     __kmp_affinity_uninitialize();
8095 #endif /* KMP_AFFINITY_SUPPORTED */
8096     __kmp_cleanup_hierarchy();
8097     TCW_4(__kmp_init_middle, FALSE);
8098   }
8099 
8100   KA_TRACE(10, ("__kmp_cleanup: go serial cleanup\n"));
8101 
8102   if (__kmp_init_serial) {
8103     __kmp_runtime_destroy();
8104     __kmp_init_serial = FALSE;
8105   }
8106 
8107   __kmp_cleanup_threadprivate_caches();
8108 
8109   for (f = 0; f < __kmp_threads_capacity; f++) {
8110     if (__kmp_root[f] != NULL) {
8111       __kmp_free(__kmp_root[f]);
8112       __kmp_root[f] = NULL;
8113     }
8114   }
8115   __kmp_free(__kmp_threads);
  // __kmp_threads and __kmp_root were allocated at once, as a single block,
  // so there is no need to free __kmp_root separately.
8118   __kmp_threads = NULL;
8119   __kmp_root = NULL;
8120   __kmp_threads_capacity = 0;
8121 
8122   // Free old __kmp_threads arrays if they exist.
8123   kmp_old_threads_list_t *ptr = __kmp_old_threads_list;
8124   while (ptr) {
8125     kmp_old_threads_list_t *next = ptr->next;
8126     __kmp_free(ptr->threads);
8127     __kmp_free(ptr);
8128     ptr = next;
8129   }
8130 
8131 #if KMP_USE_DYNAMIC_LOCK
8132   __kmp_cleanup_indirect_user_locks();
8133 #else
8134   __kmp_cleanup_user_locks();
8135 #endif
8136 #if OMPD_SUPPORT
8137   if (ompd_state) {
8138     __kmp_free(ompd_env_block);
8139     ompd_env_block = NULL;
8140     ompd_env_block_size = 0;
8141   }
8142 #endif
8143 
8144 #if KMP_AFFINITY_SUPPORTED
8145   KMP_INTERNAL_FREE(CCAST(char *, __kmp_cpuinfo_file));
8146   __kmp_cpuinfo_file = NULL;
8147 #endif /* KMP_AFFINITY_SUPPORTED */
8148 
8149 #if KMP_USE_ADAPTIVE_LOCKS
8150 #if KMP_DEBUG_ADAPTIVE_LOCKS
8151   __kmp_print_speculative_stats();
8152 #endif
8153 #endif
8154   KMP_INTERNAL_FREE(__kmp_nested_nth.nth);
8155   __kmp_nested_nth.nth = NULL;
8156   __kmp_nested_nth.size = 0;
8157   __kmp_nested_nth.used = 0;
8158   KMP_INTERNAL_FREE(__kmp_nested_proc_bind.bind_types);
8159   __kmp_nested_proc_bind.bind_types = NULL;
8160   __kmp_nested_proc_bind.size = 0;
8161   __kmp_nested_proc_bind.used = 0;
8162   if (__kmp_affinity_format) {
8163     KMP_INTERNAL_FREE(__kmp_affinity_format);
8164     __kmp_affinity_format = NULL;
8165   }
8166 
8167   __kmp_i18n_catclose();
8168 
8169 #if KMP_USE_HIER_SCHED
8170   __kmp_hier_scheds.deallocate();
8171 #endif
8172 
8173 #if KMP_STATS_ENABLED
8174   __kmp_stats_fini();
8175 #endif
8176 
8177   KA_TRACE(10, ("__kmp_cleanup: exit\n"));
8178 }
8179 
8180 /* ------------------------------------------------------------------------ */
8181 
8182 int __kmp_ignore_mppbeg(void) {
8183   char *env;
8184 
8185   if ((env = getenv("KMP_IGNORE_MPPBEG")) != NULL) {
8186     if (__kmp_str_match_false(env))
8187       return FALSE;
8188   }
  // By default __kmpc_begin() is a no-op.
8190   return TRUE;
8191 }
8192 
8193 int __kmp_ignore_mppend(void) {
8194   char *env;
8195 
8196   if ((env = getenv("KMP_IGNORE_MPPEND")) != NULL) {
8197     if (__kmp_str_match_false(env))
8198       return FALSE;
8199   }
  // By default __kmpc_end() is a no-op.
8201   return TRUE;
8202 }
8203 
8204 void __kmp_internal_begin(void) {
8205   int gtid;
8206   kmp_root_t *root;
8207 
  /* This is a very important step, as it will register new sibling threads
     and assign these new uber threads a new gtid. */
8210   gtid = __kmp_entry_gtid();
8211   root = __kmp_threads[gtid]->th.th_root;
8212   KMP_ASSERT(KMP_UBER_GTID(gtid));
8213 
8214   if (root->r.r_begin)
8215     return;
8216   __kmp_acquire_lock(&root->r.r_begin_lock, gtid);
8217   if (root->r.r_begin) {
8218     __kmp_release_lock(&root->r.r_begin_lock, gtid);
8219     return;
8220   }
8221 
8222   root->r.r_begin = TRUE;
8223 
8224   __kmp_release_lock(&root->r.r_begin_lock, gtid);
8225 }
8226 
8227 /* ------------------------------------------------------------------------ */
8228 
8229 void __kmp_user_set_library(enum library_type arg) {
8230   int gtid;
8231   kmp_root_t *root;
8232   kmp_info_t *thread;
8233 
8234   /* first, make sure we are initialized so we can get our gtid */
8235 
8236   gtid = __kmp_entry_gtid();
8237   thread = __kmp_threads[gtid];
8238 
8239   root = thread->th.th_root;
8240 
8241   KA_TRACE(20, ("__kmp_user_set_library: enter T#%d, arg: %d, %d\n", gtid, arg,
8242                 library_serial));
8243   if (root->r.r_in_parallel) { /* Must be called in serial section of top-level
8244                                   thread */
8245     KMP_WARNING(SetLibraryIncorrectCall);
8246     return;
8247   }
8248 
8249   switch (arg) {
8250   case library_serial:
8251     thread->th.th_set_nproc = 0;
8252     set__nproc(thread, 1);
8253     break;
8254   case library_turnaround:
8255     thread->th.th_set_nproc = 0;
8256     set__nproc(thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth
8257                                            : __kmp_dflt_team_nth_ub);
8258     break;
8259   case library_throughput:
8260     thread->th.th_set_nproc = 0;
8261     set__nproc(thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth
8262                                            : __kmp_dflt_team_nth_ub);
8263     break;
8264   default:
8265     KMP_FATAL(UnknownLibraryType, arg);
8266   }
8267 
8268   __kmp_aux_set_library(arg);
8269 }
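
// This is the backing for the KMP_LIBRARY setting and, presumably, for the
// kmp_set_library*() extension entry points. A hedged user-level sketch:
//
//   // KMP_LIBRARY=throughput in the environment, or, if the extension API is
//   // available, from code before the first parallel region:
//   kmp_set_library_throughput(); // maps to library_throughput here
//
// library_serial pins the default team size to 1; turnaround and throughput
// keep the default team size and differ only in how idle threads wait (see
// __kmp_aux_set_library below).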
8270 
8271 void __kmp_aux_set_stacksize(size_t arg) {
8272   if (!__kmp_init_serial)
8273     __kmp_serial_initialize();
8274 
8275 #if KMP_OS_DARWIN
8276   if (arg & (0x1000 - 1)) {
8277     arg &= ~(0x1000 - 1);
8278     if (arg + 0x1000) /* check for overflow if we round up */
8279       arg += 0x1000;
8280   }
8281 #endif
8282   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
8283 
8284   /* only change the default stacksize before the first parallel region */
8285   if (!TCR_4(__kmp_init_parallel)) {
8286     size_t value = arg; /* argument is in bytes */
8287 
8288     if (value < __kmp_sys_min_stksize)
8289       value = __kmp_sys_min_stksize;
8290     else if (value > KMP_MAX_STKSIZE)
8291       value = KMP_MAX_STKSIZE;
8292 
8293     __kmp_stksize = value;
8294 
8295     __kmp_env_stksize = TRUE; /* behave as if KMP_STACKSIZE was specified */
8296   }
8297 
8298   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
8299 }
8300 
8301 /* set the behaviour of the runtime library */
8302 /* TODO this can cause some odd behaviour with sibling parallelism... */
8303 void __kmp_aux_set_library(enum library_type arg) {
8304   __kmp_library = arg;
8305 
8306   switch (__kmp_library) {
8307   case library_serial: {
8308     KMP_INFORM(LibraryIsSerial);
8309   } break;
8310   case library_turnaround:
8311     if (__kmp_use_yield == 1 && !__kmp_use_yield_exp_set)
8312       __kmp_use_yield = 2; // only yield when oversubscribed
8313     break;
8314   case library_throughput:
8315     if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME)
8316       __kmp_dflt_blocktime = KMP_DEFAULT_BLOCKTIME;
8317     break;
8318   default:
8319     KMP_FATAL(UnknownLibraryType, arg);
8320   }
8321 }
8322 
8323 /* Getting team information common for all team API */
8324 // Returns NULL if not in teams construct
8325 static kmp_team_t *__kmp_aux_get_team_info(int &teams_serialized) {
8326   kmp_info_t *thr = __kmp_entry_thread();
8327   teams_serialized = 0;
8328   if (thr->th.th_teams_microtask) {
8329     kmp_team_t *team = thr->th.th_team;
8330     int tlevel = thr->th.th_teams_level; // the level of the teams construct
8331     int ii = team->t.t_level;
8332     teams_serialized = team->t.t_serialized;
8333     int level = tlevel + 1;
8334     KMP_DEBUG_ASSERT(ii >= tlevel);
8335     while (ii > level) {
8336       for (teams_serialized = team->t.t_serialized;
8337            (teams_serialized > 0) && (ii > level); teams_serialized--, ii--) {
8338       }
8339       if (team->t.t_serialized && (!teams_serialized)) {
8340         team = team->t.t_parent;
8341         continue;
8342       }
8343       if (ii > level) {
8344         team = team->t.t_parent;
8345         ii--;
8346       }
8347     }
8348     return team;
8349   }
8350   return NULL;
8351 }
8352 
8353 int __kmp_aux_get_team_num() {
8354   int serialized;
8355   kmp_team_t *team = __kmp_aux_get_team_info(serialized);
8356   if (team) {
8357     if (serialized > 1) {
8358       return 0; // teams region is serialized ( 1 team of 1 thread ).
8359     } else {
8360       return team->t.t_master_tid;
8361     }
8362   }
8363   return 0;
8364 }
8365 
8366 int __kmp_aux_get_num_teams() {
8367   int serialized;
8368   kmp_team_t *team = __kmp_aux_get_team_info(serialized);
8369   if (team) {
8370     if (serialized > 1) {
8371       return 1;
8372     } else {
8373       return team->t.t_parent->t.t_nproc;
8374     }
8375   }
8376   return 1;
8377 }
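
// These helpers presumably back omp_get_team_num() and omp_get_num_teams().
// Illustrative only (assuming 4 teams are actually created):
//
//   #pragma omp teams num_teams(4)
//   {
//     int t = omp_get_team_num();  // 0..3, via __kmp_aux_get_team_num()
//     int n = omp_get_num_teams(); // 4,    via __kmp_aux_get_num_teams()
//   }
//
// Outside a teams construct both report the single implicit team: 0 and 1.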
8378 
8379 /* ------------------------------------------------------------------------ */
8380 
8381 /*
8382  * Affinity Format Parser
8383  *
8384  * Field is in form of: %[[[0].]size]type
8385  * % and type are required (%% means print a literal '%')
8386  * type is either single char or long name surrounded by {},
8387  * e.g., N or {num_threads}
8388  * 0 => leading zeros
8389  * . => right justified when size is specified
8390  * by default output is left justified
8391  * size is the *minimum* field length
8392  * All other characters are printed as is
8393  *
8394  * Available field types (see __kmp_affinity_format_table below):
8395  * L {nesting_level}     - omp_get_level()
8396  * n {thread_num}        - omp_get_thread_num()
8397  * H {host}              - name of host machine
8398  * P {process_id}        - process id (integer)
8399  * i {native_thread_id}  - native thread identifier (integer)
8400  * N {num_threads}       - omp_get_num_threads()
8401  * a {ancestor_tnum}     - omp_get_ancestor_thread_num(omp_get_level()-1)
8402  * A {thread_affinity}   - comma separated list of integers or integer ranges
8403  *                         (values of affinity mask)
8404  *
8405  * Implementation-specific field types can be added
8406  * If a type is unknown, print "undefined"
8407  */
8408 
8409 // Structure holding the short name, long name, and corresponding data type
8410 // for snprintf.  A table of these will represent the entire valid keyword
8411 // field types.
8412 typedef struct kmp_affinity_format_field_t {
8413   char short_name; // from spec e.g., L -> thread level
8414   const char *long_name; // from spec thread_level -> thread level
8415   char field_format; // data type for snprintf (typically 'd' or 's'
8416   // for integer or string)
8417 } kmp_affinity_format_field_t;
8418 
8419 static const kmp_affinity_format_field_t __kmp_affinity_format_table[] = {
8420 #if KMP_AFFINITY_SUPPORTED
8421     {'A', "thread_affinity", 's'},
8422 #endif
8423     {'t', "team_num", 'd'},
8424     {'T', "num_teams", 'd'},
8425     {'L', "nesting_level", 'd'},
8426     {'n', "thread_num", 'd'},
8427     {'N', "num_threads", 'd'},
8428     {'a', "ancestor_tnum", 'd'},
8429     {'H', "host", 's'},
8430     {'P', "process_id", 'd'},
8431     {'i', "native_thread_id", 'd'}};
8432 
8433 // Return the number of characters it takes to hold the field
8434 static int __kmp_aux_capture_affinity_field(int gtid, const kmp_info_t *th,
8435                                             const char **ptr,
8436                                             kmp_str_buf_t *field_buffer) {
8437   int rc, format_index, field_value;
8438   const char *width_left, *width_right;
8439   bool pad_zeros, right_justify, parse_long_name, found_valid_name;
8440   static const int FORMAT_SIZE = 20;
8441   char format[FORMAT_SIZE] = {0};
8442   char absolute_short_name = 0;
8443 
8444   KMP_DEBUG_ASSERT(gtid >= 0);
8445   KMP_DEBUG_ASSERT(th);
8446   KMP_DEBUG_ASSERT(**ptr == '%');
8447   KMP_DEBUG_ASSERT(field_buffer);
8448 
8449   __kmp_str_buf_clear(field_buffer);
8450 
8451   // Skip the initial %
8452   (*ptr)++;
8453 
8454   // Check for %% first
8455   if (**ptr == '%') {
8456     __kmp_str_buf_cat(field_buffer, "%", 1);
8457     (*ptr)++; // skip over the second %
8458     return 1;
8459   }
8460 
8461   // Parse field modifiers if they are present
8462   pad_zeros = false;
8463   if (**ptr == '0') {
8464     pad_zeros = true;
8465     (*ptr)++; // skip over 0
8466   }
8467   right_justify = false;
8468   if (**ptr == '.') {
8469     right_justify = true;
8470     (*ptr)++; // skip over .
8471   }
8472   // Parse width of field: [width_left, width_right)
8473   width_left = width_right = NULL;
8474   if (**ptr >= '0' && **ptr <= '9') {
8475     width_left = *ptr;
8476     SKIP_DIGITS(*ptr);
8477     width_right = *ptr;
8478   }
8479 
8480   // Create the format for KMP_SNPRINTF based on flags parsed above
8481   format_index = 0;
8482   format[format_index++] = '%';
8483   if (!right_justify)
8484     format[format_index++] = '-';
8485   if (pad_zeros)
8486     format[format_index++] = '0';
8487   if (width_left && width_right) {
8488     int i = 0;
8489     // Only allow 8 digit number widths.
8490     // This also prevents overflowing format variable
8491     while (i < 8 && width_left < width_right) {
8492       format[format_index++] = *width_left;
8493       width_left++;
8494       i++;
8495     }
8496   }
8497 
8498   // Parse a name (long or short)
8499   // Canonicalize the name into absolute_short_name
8500   found_valid_name = false;
8501   parse_long_name = (**ptr == '{');
8502   if (parse_long_name)
8503     (*ptr)++; // skip initial left brace
8504   for (size_t i = 0; i < sizeof(__kmp_affinity_format_table) /
8505                              sizeof(__kmp_affinity_format_table[0]);
8506        ++i) {
8507     char short_name = __kmp_affinity_format_table[i].short_name;
8508     const char *long_name = __kmp_affinity_format_table[i].long_name;
8509     char field_format = __kmp_affinity_format_table[i].field_format;
8510     if (parse_long_name) {
8511       size_t length = KMP_STRLEN(long_name);
8512       if (strncmp(*ptr, long_name, length) == 0) {
8513         found_valid_name = true;
8514         (*ptr) += length; // skip the long name
8515       }
8516     } else if (**ptr == short_name) {
8517       found_valid_name = true;
8518       (*ptr)++; // skip the short name
8519     }
8520     if (found_valid_name) {
8521       format[format_index++] = field_format;
8522       format[format_index++] = '\0';
8523       absolute_short_name = short_name;
8524       break;
8525     }
8526   }
8527   if (parse_long_name) {
8528     if (**ptr != '}') {
8529       absolute_short_name = 0;
8530     } else {
8531       (*ptr)++; // skip over the right brace
8532     }
8533   }
8534 
8535   // Attempt to fill the buffer with the requested
8536   // value using snprintf within __kmp_str_buf_print()
8537   switch (absolute_short_name) {
8538   case 't':
8539     rc = __kmp_str_buf_print(field_buffer, format, __kmp_aux_get_team_num());
8540     break;
8541   case 'T':
8542     rc = __kmp_str_buf_print(field_buffer, format, __kmp_aux_get_num_teams());
8543     break;
8544   case 'L':
8545     rc = __kmp_str_buf_print(field_buffer, format, th->th.th_team->t.t_level);
8546     break;
8547   case 'n':
8548     rc = __kmp_str_buf_print(field_buffer, format, __kmp_tid_from_gtid(gtid));
8549     break;
8550   case 'H': {
8551     static const int BUFFER_SIZE = 256;
8552     char buf[BUFFER_SIZE];
8553     __kmp_expand_host_name(buf, BUFFER_SIZE);
8554     rc = __kmp_str_buf_print(field_buffer, format, buf);
8555   } break;
8556   case 'P':
8557     rc = __kmp_str_buf_print(field_buffer, format, getpid());
8558     break;
8559   case 'i':
8560     rc = __kmp_str_buf_print(field_buffer, format, __kmp_gettid());
8561     break;
8562   case 'N':
8563     rc = __kmp_str_buf_print(field_buffer, format, th->th.th_team->t.t_nproc);
8564     break;
8565   case 'a':
8566     field_value =
8567         __kmp_get_ancestor_thread_num(gtid, th->th.th_team->t.t_level - 1);
8568     rc = __kmp_str_buf_print(field_buffer, format, field_value);
8569     break;
8570 #if KMP_AFFINITY_SUPPORTED
8571   case 'A': {
8572     kmp_str_buf_t buf;
8573     __kmp_str_buf_init(&buf);
8574     __kmp_affinity_str_buf_mask(&buf, th->th.th_affin_mask);
8575     rc = __kmp_str_buf_print(field_buffer, format, buf.str);
8576     __kmp_str_buf_free(&buf);
8577   } break;
8578 #endif
8579   default:
8580     // According to the spec, if an implementation does not have info for the
8581     // field type, then "undefined" is printed
8582     rc = __kmp_str_buf_print(field_buffer, "%s", "undefined");
8583     // Skip the field
8584     if (parse_long_name) {
8585       SKIP_TOKEN(*ptr);
8586       if (**ptr == '}')
8587         (*ptr)++;
8588     } else {
8589       (*ptr)++;
8590     }
8591   }
8592 
8593   KMP_ASSERT(format_index <= FORMAT_SIZE);
8594   return rc;
8595 }
8596 
8597 /*
8598  * Return the number of characters needed to hold the affinity string
8599  * (not including the terminating null byte).
8600  * The resulting string is printed into buffer, which the caller can then
8601  * handle afterwards.
8602  */
8603 size_t __kmp_aux_capture_affinity(int gtid, const char *format,
8604                                   kmp_str_buf_t *buffer) {
8605   const char *parse_ptr;
8606   size_t retval;
8607   const kmp_info_t *th;
8608   kmp_str_buf_t field;
8609 
8610   KMP_DEBUG_ASSERT(buffer);
8611   KMP_DEBUG_ASSERT(gtid >= 0);
8612 
8613   __kmp_str_buf_init(&field);
8614   __kmp_str_buf_clear(buffer);
8615 
8616   th = __kmp_threads[gtid];
8617   retval = 0;
8618 
8619   // If format is NULL or zero-length string, then we use
8620   // affinity-format-var ICV
8621   parse_ptr = format;
8622   if (parse_ptr == NULL || *parse_ptr == '\0') {
8623     parse_ptr = __kmp_affinity_format;
8624   }
8625   KMP_DEBUG_ASSERT(parse_ptr);
8626 
8627   while (*parse_ptr != '\0') {
8628     // Parse a field
8629     if (*parse_ptr == '%') {
8630       // Put field in the buffer
8631       int rc = __kmp_aux_capture_affinity_field(gtid, th, &parse_ptr, &field);
8632       __kmp_str_buf_catbuf(buffer, &field);
8633       retval += rc;
8634     } else {
8635       // Put literal character in buffer
8636       __kmp_str_buf_cat(buffer, parse_ptr, 1);
8637       retval++;
8638       parse_ptr++;
8639     }
8640   }
8641   __kmp_str_buf_free(&field);
8642   return retval;
8643 }
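
// This is presumably what backs omp_capture_affinity() (OpenMP 5.0). A hedged
// sketch of the user-level contract:
//
//   char buf[256];
//   size_t needed = omp_capture_affinity(buf, sizeof(buf), "%n of %N on %H");
//   if (needed >= sizeof(buf)) { /* the expansion was truncated */ }
//
// i.e. the return value is the full expanded length excluding the terminating
// null byte, independent of the supplied buffer size.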
8644 
8645 // Displays the affinity string to stdout
8646 void __kmp_aux_display_affinity(int gtid, const char *format) {
8647   kmp_str_buf_t buf;
8648   __kmp_str_buf_init(&buf);
8649   __kmp_aux_capture_affinity(gtid, format, &buf);
8650   __kmp_fprintf(kmp_out, "%s" KMP_END_OF_LINE, buf.str);
8651   __kmp_str_buf_free(&buf);
8652 }
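
// Presumed user-facing path: omp_display_affinity() (OpenMP 5.0) and the
// OMP_DISPLAY_AFFINITY / OMP_AFFINITY_FORMAT environment variables. A minimal
// sketch:
//
//   #include <omp.h>
//   int main(void) {
//   #pragma omp parallel
//     omp_display_affinity(NULL); // NULL/empty format => affinity-format-var
//     return 0;
//   }
//
// With a NULL or empty format string, __kmp_aux_capture_affinity() above falls
// back to __kmp_affinity_format.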
8653 
8654 /* ------------------------------------------------------------------------ */
8655 
8656 void __kmp_aux_set_blocktime(int arg, kmp_info_t *thread, int tid) {
8657   int blocktime = arg; /* argument is in milliseconds */
8658 #if KMP_USE_MONITOR
8659   int bt_intervals;
8660 #endif
8661   kmp_int8 bt_set;
8662 
8663   __kmp_save_internal_controls(thread);
8664 
8665   /* Normalize and set blocktime for the teams */
8666   if (blocktime < KMP_MIN_BLOCKTIME)
8667     blocktime = KMP_MIN_BLOCKTIME;
8668   else if (blocktime > KMP_MAX_BLOCKTIME)
8669     blocktime = KMP_MAX_BLOCKTIME;
8670 
8671   set__blocktime_team(thread->th.th_team, tid, blocktime);
8672   set__blocktime_team(thread->th.th_serial_team, 0, blocktime);
8673 
8674 #if KMP_USE_MONITOR
8675   /* Calculate and set blocktime intervals for the teams */
8676   bt_intervals = KMP_INTERVALS_FROM_BLOCKTIME(blocktime, __kmp_monitor_wakeups);
8677 
8678   set__bt_intervals_team(thread->th.th_team, tid, bt_intervals);
8679   set__bt_intervals_team(thread->th.th_serial_team, 0, bt_intervals);
8680 #endif
8681 
8682   /* Record that blocktime has been explicitly set */
8683   bt_set = TRUE;
8684 
8685   set__bt_set_team(thread->th.th_team, tid, bt_set);
8686   set__bt_set_team(thread->th.th_serial_team, 0, bt_set);
8687 #if KMP_USE_MONITOR
8688   KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d, "
8689                 "bt_intervals=%d, monitor_updates=%d\n",
8690                 __kmp_gtid_from_tid(tid, thread->th.th_team),
8691                 thread->th.th_team->t.t_id, tid, blocktime, bt_intervals,
8692                 __kmp_monitor_wakeups));
8693 #else
8694   KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d\n",
8695                 __kmp_gtid_from_tid(tid, thread->th.th_team),
8696                 thread->th.th_team->t.t_id, tid, blocktime));
8697 #endif
8698 }
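
// The corresponding user knobs are KMP_BLOCKTIME and, presumably, the
// kmp_set_blocktime() extension, e.g.
//
//   kmp_set_blocktime(0); // workers stop spinning and sleep right after a region
//
// The argument is clamped to [KMP_MIN_BLOCKTIME, KMP_MAX_BLOCKTIME] above
// before being stored per team.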
8699 
8700 void __kmp_aux_set_defaults(char const *str, size_t len) {
8701   if (!__kmp_init_serial) {
8702     __kmp_serial_initialize();
8703   }
8704   __kmp_env_initialize(str);
8705 
8706   if (__kmp_settings || __kmp_display_env || __kmp_display_env_verbose) {
8707     __kmp_env_print();
8708   }
8709 } // __kmp_aux_set_defaults
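
// A hedged example via the kmp_set_defaults() extension, which is expected to
// land here:
//
//   kmp_set_defaults("KMP_BLOCKTIME=0"); // same syntax as the environment
//
// As with the environment, settings that are only consulted at initialization
// should be applied before the first parallel region.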
8710 
8711 /* ------------------------------------------------------------------------ */
8712 /* internal fast reduction routines */
8713 
8714 PACKED_REDUCTION_METHOD_T
8715 __kmp_determine_reduction_method(
8716     ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars, size_t reduce_size,
8717     void *reduce_data, void (*reduce_func)(void *lhs_data, void *rhs_data),
8718     kmp_critical_name *lck) {
8719 
8720   // Default reduction method: critical construct ( lck != NULL, like in current
8721   // PAROPT )
8722   // If ( reduce_data!=NULL && reduce_func!=NULL ): the tree-reduction method
8723   // can be selected by RTL
8724   // If loc->flags contains KMP_IDENT_ATOMIC_REDUCE, the atomic reduce method
8725   // can be selected by RTL
8726   // Finally, it's up to the OpenMP RTL to decide which method to select
8727   // among those generated by PAROPT.
8728 
8729   PACKED_REDUCTION_METHOD_T retval;
8730 
8731   int team_size;
8732 
8733   KMP_DEBUG_ASSERT(loc); // it would be nice to test ( loc != 0 )
8734   KMP_DEBUG_ASSERT(lck); // it would be nice to test ( lck != 0 )
8735 
8736 #define FAST_REDUCTION_ATOMIC_METHOD_GENERATED                                 \
8737   (loc &&                                                                      \
8738    ((loc->flags & (KMP_IDENT_ATOMIC_REDUCE)) == (KMP_IDENT_ATOMIC_REDUCE)))
8739 #define FAST_REDUCTION_TREE_METHOD_GENERATED ((reduce_data) && (reduce_func))
8740 
8741   retval = critical_reduce_block;
8742 
8743   // another way of getting the team size (with 1 dynamic dereference) is slower
8744   team_size = __kmp_get_team_num_threads(global_tid);
8745   if (team_size == 1) {
8746 
8747     retval = empty_reduce_block;
8748 
8749   } else {
8750 
8751     int atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;
8752 
8753 #if KMP_ARCH_X86_64 || KMP_ARCH_PPC64 || KMP_ARCH_AARCH64 ||                   \
8754     KMP_ARCH_MIPS64 || KMP_ARCH_RISCV64
8755 
8756 #if KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD ||     \
8757     KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN || KMP_OS_HURD
8758 
8759     int teamsize_cutoff = 4;
8760 
8761 #if KMP_MIC_SUPPORTED
8762     if (__kmp_mic_type != non_mic) {
8763       teamsize_cutoff = 8;
8764     }
8765 #endif
8766     int tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
8767     if (tree_available) {
8768       if (team_size <= teamsize_cutoff) {
8769         if (atomic_available) {
8770           retval = atomic_reduce_block;
8771         }
8772       } else {
8773         retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER;
8774       }
8775     } else if (atomic_available) {
8776       retval = atomic_reduce_block;
8777     }
8778 #else
8779 #error "Unknown or unsupported OS"
8780 #endif // KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD ||
8781        // KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN || KMP_OS_HURD
8782 
8783 #elif KMP_ARCH_X86 || KMP_ARCH_ARM || KMP_ARCH_AARCH || KMP_ARCH_MIPS
8784 
8785 #if KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_WINDOWS || KMP_OS_HURD
8786 
8787     // basic tuning
8788 
8789     if (atomic_available) {
8790       if (num_vars <= 2) { // && ( team_size <= 8 ) due to false-sharing ???
8791         retval = atomic_reduce_block;
8792       }
8793     } // otherwise: use critical section
8794 
8795 #elif KMP_OS_DARWIN
8796 
8797     int tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
8798     if (atomic_available && (num_vars <= 3)) {
8799       retval = atomic_reduce_block;
8800     } else if (tree_available) {
8801       if ((reduce_size > (9 * sizeof(kmp_real64))) &&
8802           (reduce_size < (2000 * sizeof(kmp_real64)))) {
8803         retval = TREE_REDUCE_BLOCK_WITH_PLAIN_BARRIER;
8804       }
8805     } // otherwise: use critical section
8806 
8807 #else
8808 #error "Unknown or unsupported OS"
8809 #endif
8810 
8811 #else
8812 #error "Unknown or unsupported architecture"
8813 #endif
8814   }
8815 
8816   // KMP_FORCE_REDUCTION
8817 
8818   // If the team is serialized (team_size == 1), ignore the forced reduction
8819   // method and stay with the unsynchronized method (empty_reduce_block)
8820   if (__kmp_force_reduction_method != reduction_method_not_defined &&
8821       team_size != 1) {
8822 
8823     PACKED_REDUCTION_METHOD_T forced_retval = critical_reduce_block;
8824 
8825     int atomic_available, tree_available;
8826 
8827     switch ((forced_retval = __kmp_force_reduction_method)) {
8828     case critical_reduce_block:
8829       KMP_ASSERT(lck); // lck should be != 0
8830       break;
8831 
8832     case atomic_reduce_block:
8833       atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;
8834       if (!atomic_available) {
8835         KMP_WARNING(RedMethodNotSupported, "atomic");
8836         forced_retval = critical_reduce_block;
8837       }
8838       break;
8839 
8840     case tree_reduce_block:
8841       tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
8842       if (!tree_available) {
8843         KMP_WARNING(RedMethodNotSupported, "tree");
8844         forced_retval = critical_reduce_block;
8845       } else {
8846 #if KMP_FAST_REDUCTION_BARRIER
8847         forced_retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER;
8848 #endif
8849       }
8850       break;
8851 
8852     default:
8853       KMP_ASSERT(0); // "unsupported method specified"
8854     }
8855 
8856     retval = forced_retval;
8857   }
8858 
8859   KA_TRACE(10, ("reduction method selected=%08x\n", retval));
8860 
8861 #undef FAST_REDUCTION_TREE_METHOD_GENERATED
8862 #undef FAST_REDUCTION_ATOMIC_METHOD_GENERATED
8863 
8864   return (retval);
8865 }
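
// For orientation (hedged, compiler dependent): for a construct like
//
//   #pragma omp parallel for reduction(+ : sum)
//
// the compiler typically emits calls to __kmpc_reduce_nowait()/__kmpc_reduce()
// with reduce_data/reduce_func describing the tree-reduction callback and lck
// naming a critical section; those entry points consult
// __kmp_determine_reduction_method() above to choose among the critical,
// atomic, and tree variants at run time.
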
8866 // this function is for testing set/get/determine reduce method
8867 kmp_int32 __kmp_get_reduce_method(void) {
8868   return ((__kmp_entry_thread()->th.th_local.packed_reduction_method) >> 8);
8869 }
8870 
8871 // Soft pause sets up threads to ignore blocktime and just go to sleep.
8872 // Spin-wait code checks __kmp_pause_status and reacts accordingly.
8873 void __kmp_soft_pause() { __kmp_pause_status = kmp_soft_paused; }
8874 
8875 // Hard pause shuts down the runtime completely.  Resume happens naturally when
8876 // OpenMP is used subsequently.
8877 void __kmp_hard_pause() {
8878   __kmp_pause_status = kmp_hard_paused;
8879   __kmp_internal_end_thread(-1);
8880 }
8881 
8882 // Soft resume sets __kmp_pause_status, and wakes up all threads.
8883 void __kmp_resume_if_soft_paused() {
8884   if (__kmp_pause_status == kmp_soft_paused) {
8885     __kmp_pause_status = kmp_not_paused;
8886 
8887     for (int gtid = 1; gtid < __kmp_threads_capacity; ++gtid) {
8888       kmp_info_t *thread = __kmp_threads[gtid];
8889       if (thread) { // Wake it if sleeping
8890         kmp_flag_64<> fl(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go,
8891                          thread);
8892         if (fl.is_sleeping())
8893           fl.resume(gtid);
8894         else if (__kmp_try_suspend_mx(thread)) { // got suspend lock
8895           __kmp_unlock_suspend_mx(thread); // unlock it; it won't sleep
8896         } else { // thread holds the lock and may sleep soon
8897           do { // until either the thread sleeps, or we can get the lock
8898             if (fl.is_sleeping()) {
8899               fl.resume(gtid);
8900               break;
8901             } else if (__kmp_try_suspend_mx(thread)) {
8902               __kmp_unlock_suspend_mx(thread);
8903               break;
8904             }
8905           } while (1);
8906         }
8907       }
8908     }
8909   }
8910 }
8911 
8912 // This function is called via __kmpc_pause_resource. Returns 0 if successful.
8913 // TODO: add warning messages
8914 int __kmp_pause_resource(kmp_pause_status_t level) {
8915   if (level == kmp_not_paused) { // requesting resume
8916     if (__kmp_pause_status == kmp_not_paused) {
8917       // error message about runtime not being paused, so can't resume
8918       return 1;
8919     } else {
8920       KMP_DEBUG_ASSERT(__kmp_pause_status == kmp_soft_paused ||
8921                        __kmp_pause_status == kmp_hard_paused);
8922       __kmp_pause_status = kmp_not_paused;
8923       return 0;
8924     }
8925   } else if (level == kmp_soft_paused) { // requesting soft pause
8926     if (__kmp_pause_status != kmp_not_paused) {
8927       // error message about already being paused
8928       return 1;
8929     } else {
8930       __kmp_soft_pause();
8931       return 0;
8932     }
8933   } else if (level == kmp_hard_paused) { // requesting hard pause
8934     if (__kmp_pause_status != kmp_not_paused) {
8935       // error message about already being paused
8936       return 1;
8937     } else {
8938       __kmp_hard_pause();
8939       return 0;
8940     }
8941   } else {
8942     // error message about invalid level
8943     return 1;
8944   }
8945 }
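
// Presumed standard entry point (OpenMP 5.0): omp_pause_resource_all(), e.g.
//
//   omp_pause_resource_all(omp_pause_soft); // park threads, keep runtime state
//   omp_pause_resource_all(omp_pause_hard); // full runtime shutdown
//
// Both return 0 on success; per the logic above, pausing while already paused,
// or requesting a resume (kmp_not_paused) while not paused, returns nonzero.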
8946 
8947 void __kmp_omp_display_env(int verbose) {
8948   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
8949   if (__kmp_init_serial == 0)
8950     __kmp_do_serial_initialize();
8951   __kmp_display_env_impl(!verbose, verbose);
8952   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
8953 }
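
// Presumably reached from omp_display_env() (OpenMP 5.1) as well as from
// OMP_DISPLAY_ENV at startup, e.g.
//
//   omp_display_env(0); // brief listing, like OMP_DISPLAY_ENV=TRUE
//   omp_display_env(1); // verbose listing, like OMP_DISPLAY_ENV=VERBOSE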
8954 
8955 // The team size is changing, so distributed barrier must be modified
8956 void __kmp_resize_dist_barrier(kmp_team_t *team, int old_nthreads,
8957                                int new_nthreads) {
8958   KMP_DEBUG_ASSERT(__kmp_barrier_release_pattern[bs_forkjoin_barrier] ==
8959                    bp_dist_bar);
8960   kmp_info_t **other_threads = team->t.t_threads;
8961 
8962   // We want all the workers to stop waiting on the barrier while we adjust the
8963   // size of the team.
8964   for (int f = 1; f < old_nthreads; ++f) {
8965     KMP_DEBUG_ASSERT(other_threads[f] != NULL);
8966     // Ignore threads that are already inactive or not present in the team
8967     if (team->t.t_threads[f]->th.th_used_in_team.load() == 0) {
8968       // teams construct causes thread_limit to get passed in, and some of
8969       // those could be inactive; just ignore them
8970       continue;
8971     }
8972     // If thread is transitioning still to in_use state, wait for it
8973     if (team->t.t_threads[f]->th.th_used_in_team.load() == 3) {
8974       while (team->t.t_threads[f]->th.th_used_in_team.load() == 3)
8975         KMP_CPU_PAUSE();
8976     }
8977     // The thread should be in_use now
8978     KMP_DEBUG_ASSERT(team->t.t_threads[f]->th.th_used_in_team.load() == 1);
8979     // Transition to unused state
8980     team->t.t_threads[f]->th.th_used_in_team.store(2);
8981     KMP_DEBUG_ASSERT(team->t.t_threads[f]->th.th_used_in_team.load() == 2);
8982   }
8983   // Release all the workers
8984   team->t.b->go_release();
8985 
8986   KMP_MFENCE();
8987 
8988   // Workers should see transition status 2 and move to 0; but may need to be
8989   // woken up first
8990   int count = old_nthreads - 1;
8991   while (count > 0) {
8992     count = old_nthreads - 1;
8993     for (int f = 1; f < old_nthreads; ++f) {
8994       if (other_threads[f]->th.th_used_in_team.load() != 0) {
8995         if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) { // Wake up the workers
8996           kmp_atomic_flag_64<> *flag = (kmp_atomic_flag_64<> *)CCAST(
8997               void *, other_threads[f]->th.th_sleep_loc);
8998           __kmp_atomic_resume_64(other_threads[f]->th.th_info.ds.ds_gtid, flag);
8999         }
9000       } else {
9001         KMP_DEBUG_ASSERT(team->t.t_threads[f]->th.th_used_in_team.load() == 0);
9002         count--;
9003       }
9004     }
9005   }
9006   // Now update the barrier size
9007   team->t.b->update_num_threads(new_nthreads);
9008   team->t.b->go_reset();
9009 }
9010 
9011 void __kmp_add_threads_to_team(kmp_team_t *team, int new_nthreads) {
9012   // Add the threads back to the team
9013   KMP_DEBUG_ASSERT(team);
9014   // Threads were paused and pointed at th_used_in_team temporarily during a
9015   // resize of the team. We're going to set th_used_in_team to 3 to indicate to
9016   // the thread that it should transition itself back into the team. Then, if
9017   // blocktime isn't infinite, the thread could be sleeping, so we send a resume
9018   // to wake it up.
9019   for (int f = 1; f < new_nthreads; ++f) {
9020     KMP_DEBUG_ASSERT(team->t.t_threads[f]);
9021     KMP_COMPARE_AND_STORE_ACQ32(&(team->t.t_threads[f]->th.th_used_in_team), 0,
9022                                 3);
9023     if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) { // Wake up sleeping threads
9024       __kmp_resume_32(team->t.t_threads[f]->th.th_info.ds.ds_gtid,
9025                       (kmp_flag_32<false, false> *)NULL);
9026     }
9027   }
9028   // The threads should be transitioning to the team; when they are done, they
9029   // should have set th_used_in_team to 1. This loop forces the primary thread
9030   // to wait until all threads have moved into the team and are in the barrier.
9031   int count = new_nthreads - 1;
9032   while (count > 0) {
9033     count = new_nthreads - 1;
9034     for (int f = 1; f < new_nthreads; ++f) {
9035       if (team->t.t_threads[f]->th.th_used_in_team.load() == 1) {
9036         count--;
9037       }
9038     }
9039   }
9040 }
9041 
9042 // Globals and functions for hidden helper task
9043 kmp_info_t **__kmp_hidden_helper_threads;
9044 kmp_info_t *__kmp_hidden_helper_main_thread;
9045 std::atomic<kmp_int32> __kmp_unexecuted_hidden_helper_tasks;
9046 #if KMP_OS_LINUX
9047 kmp_int32 __kmp_hidden_helper_threads_num = 8;
9048 kmp_int32 __kmp_enable_hidden_helper = TRUE;
9049 #else
9050 kmp_int32 __kmp_hidden_helper_threads_num = 0;
9051 kmp_int32 __kmp_enable_hidden_helper = FALSE;
9052 #endif
9053 
9054 namespace {
9055 std::atomic<kmp_int32> __kmp_hit_hidden_helper_threads_num;
9056 
9057 void __kmp_hidden_helper_wrapper_fn(int *gtid, int *, ...) {
9058   // This is an explicit synchronization across all hidden helper threads. It
9059   // covers the case where a regular thread pushes a hidden helper task to a
9060   // hidden helper thread that has not yet been awakened since being released
9061   // by the main thread after the team was created.
9062   KMP_ATOMIC_INC(&__kmp_hit_hidden_helper_threads_num);
9063   while (KMP_ATOMIC_LD_ACQ(&__kmp_hit_hidden_helper_threads_num) !=
9064          __kmp_hidden_helper_threads_num)
9065     ;
9066 
9067   // If main thread, then wait for signal
9068   if (__kmpc_master(nullptr, *gtid)) {
9069     // First, unset the initial state and release the initial thread
9070     TCW_4(__kmp_init_hidden_helper_threads, FALSE);
9071     __kmp_hidden_helper_initz_release();
9072     __kmp_hidden_helper_main_thread_wait();
9073     // Now wake up all worker threads
9074     for (int i = 1; i < __kmp_hit_hidden_helper_threads_num; ++i) {
9075       __kmp_hidden_helper_worker_thread_signal();
9076     }
9077   }
9078 }
9079 } // namespace
9080 
9081 void __kmp_hidden_helper_threads_initz_routine() {
9082   // Create a new root for hidden helper team/threads
9083   const int gtid = __kmp_register_root(TRUE);
9084   __kmp_hidden_helper_main_thread = __kmp_threads[gtid];
9085   __kmp_hidden_helper_threads = &__kmp_threads[gtid];
9086   __kmp_hidden_helper_main_thread->th.th_set_nproc =
9087       __kmp_hidden_helper_threads_num;
9088 
9089   KMP_ATOMIC_ST_REL(&__kmp_hit_hidden_helper_threads_num, 0);
9090 
9091   __kmpc_fork_call(nullptr, 0, __kmp_hidden_helper_wrapper_fn);
9092 
9093   // Set the initialization flag to FALSE
9094   TCW_SYNC_4(__kmp_init_hidden_helper, FALSE);
9095 
9096   __kmp_hidden_helper_threads_deinitz_release();
9097 }
9098 
9099 /* Nesting Mode:
9100    Set via KMP_NESTING_MODE, which takes an integer.
9101    Note: we skip duplicate topology levels, and skip levels with only
9102       one entity.
9103    KMP_NESTING_MODE=0 is the default, and doesn't use nesting mode.
9104    KMP_NESTING_MODE=1 sets as many nesting levels as there are distinct levels
9105       in the topology, and initializes the number of threads at each of those
9106       levels to the number of entities at each level, respectively, below the
9107       entity at the parent level.
9108    KMP_NESTING_MODE=N, where N>1, attempts to create up to N nesting levels,
9109       but starts with nesting OFF -- max-active-levels-var is 1 -- and requires
9110       the user to turn nesting on explicitly. This is an even more experimental
9111       option to this experimental feature, and may change or go away in the
9112       future.
9113 */
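
// Illustration (hypothetical machine): on a 2-socket box with 16 cores per
// socket and 2 hardware threads per core, KMP_NESTING_MODE=1 would roughly
// configure three nesting levels of 2, 16, and 2 threads respectively
// (duplicate levels and levels with a single entity are skipped); this is what
// __kmp_set_nesting_mode_threads() below derives from the topology.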
9114 
9115 // Allocate space to store nesting levels
9116 void __kmp_init_nesting_mode() {
9117   int levels = KMP_HW_LAST;
9118   __kmp_nesting_mode_nlevels = levels;
9119   __kmp_nesting_nth_level = (int *)KMP_INTERNAL_MALLOC(levels * sizeof(int));
9120   for (int i = 0; i < levels; ++i)
9121     __kmp_nesting_nth_level[i] = 0;
9122   if (__kmp_nested_nth.size < levels) {
9123     __kmp_nested_nth.nth =
9124         (int *)KMP_INTERNAL_REALLOC(__kmp_nested_nth.nth, levels * sizeof(int));
9125     __kmp_nested_nth.size = levels;
9126   }
9127 }
9128 
9129 // Set # threads for top levels of nesting; must be called after topology set
9130 void __kmp_set_nesting_mode_threads() {
9131   kmp_info_t *thread = __kmp_threads[__kmp_entry_gtid()];
9132 
9133   if (__kmp_nesting_mode == 1)
9134     __kmp_nesting_mode_nlevels = KMP_MAX_ACTIVE_LEVELS_LIMIT;
9135   else if (__kmp_nesting_mode > 1)
9136     __kmp_nesting_mode_nlevels = __kmp_nesting_mode;
9137 
9138   if (__kmp_topology) { // use topology info
9139     int loc, hw_level;
9140     for (loc = 0, hw_level = 0; hw_level < __kmp_topology->get_depth() &&
9141                                 loc < __kmp_nesting_mode_nlevels;
9142          loc++, hw_level++) {
9143       __kmp_nesting_nth_level[loc] = __kmp_topology->get_ratio(hw_level);
9144       if (__kmp_nesting_nth_level[loc] == 1)
9145         loc--;
9146     }
9147     // Make sure all cores are used
9148     if (__kmp_nesting_mode > 1 && loc > 1) {
9149       int core_level = __kmp_topology->get_level(KMP_HW_CORE);
9150       int num_cores = __kmp_topology->get_count(core_level);
9151       int upper_levels = 1;
9152       for (int level = 0; level < loc - 1; ++level)
9153         upper_levels *= __kmp_nesting_nth_level[level];
9154       if (upper_levels * __kmp_nesting_nth_level[loc - 1] < num_cores)
9155         __kmp_nesting_nth_level[loc - 1] =
9156             num_cores / __kmp_nesting_nth_level[loc - 2];
9157     }
9158     __kmp_nesting_mode_nlevels = loc;
9159     __kmp_nested_nth.used = __kmp_nesting_mode_nlevels;
9160   } else { // no topology info available; provide a reasonable guess
9161     if (__kmp_avail_proc >= 4) {
9162       __kmp_nesting_nth_level[0] = __kmp_avail_proc / 2;
9163       __kmp_nesting_nth_level[1] = 2;
9164       __kmp_nesting_mode_nlevels = 2;
9165     } else {
9166       __kmp_nesting_nth_level[0] = __kmp_avail_proc;
9167       __kmp_nesting_mode_nlevels = 1;
9168     }
9169     __kmp_nested_nth.used = __kmp_nesting_mode_nlevels;
9170   }
9171   for (int i = 0; i < __kmp_nesting_mode_nlevels; ++i) {
9172     __kmp_nested_nth.nth[i] = __kmp_nesting_nth_level[i];
9173   }
9174   set__nproc(thread, __kmp_nesting_nth_level[0]);
9175   if (__kmp_nesting_mode > 1 && __kmp_nesting_mode_nlevels > __kmp_nesting_mode)
9176     __kmp_nesting_mode_nlevels = __kmp_nesting_mode;
9177   if (get__max_active_levels(thread) > 1) {
9178     // if max levels was set, set nesting mode levels to same
9179     __kmp_nesting_mode_nlevels = get__max_active_levels(thread);
9180   }
9181   if (__kmp_nesting_mode == 1) // turn on nesting for this case only
9182     set__max_active_levels(thread, __kmp_nesting_mode_nlevels);
9183 }
9184