xref: /freebsd/contrib/llvm-project/openmp/runtime/src/kmp_runtime.cpp (revision e9e8876a4d6afc1ad5315faaa191b25121a813d7)
1 /*
2  * kmp_runtime.cpp -- KPTS runtime support library
3  */
4 
5 //===----------------------------------------------------------------------===//
6 //
7 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8 // See https://llvm.org/LICENSE.txt for license information.
9 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "kmp.h"
14 #include "kmp_affinity.h"
15 #include "kmp_atomic.h"
16 #include "kmp_environment.h"
17 #include "kmp_error.h"
18 #include "kmp_i18n.h"
19 #include "kmp_io.h"
20 #include "kmp_itt.h"
21 #include "kmp_settings.h"
22 #include "kmp_stats.h"
23 #include "kmp_str.h"
24 #include "kmp_wait_release.h"
25 #include "kmp_wrapper_getpid.h"
26 #include "kmp_dispatch.h"
27 #if KMP_USE_HIER_SCHED
28 #include "kmp_dispatch_hier.h"
29 #endif
30 
31 #if OMPT_SUPPORT
32 #include "ompt-specific.h"
33 #endif
34 #if OMPD_SUPPORT
35 #include "ompd-specific.h"
36 #endif
37 
38 #if OMP_PROFILING_SUPPORT
39 #include "llvm/Support/TimeProfiler.h"
40 static char *ProfileTraceFile = nullptr;
41 #endif
42 
43 /* these are temporary issues to be dealt with */
44 #define KMP_USE_PRCTL 0
45 
46 #if KMP_OS_WINDOWS
47 #include <process.h>
48 #endif
49 
50 #if KMP_OS_WINDOWS
51 // windows does not need include files as it doesn't use shared memory
52 #else
53 #include <sys/mman.h>
54 #include <sys/stat.h>
55 #include <fcntl.h>
56 #define SHM_SIZE 1024
57 #endif
58 
59 #if defined(KMP_GOMP_COMPAT)
60 char const __kmp_version_alt_comp[] =
61     KMP_VERSION_PREFIX "alternative compiler support: yes";
62 #endif /* defined(KMP_GOMP_COMPAT) */
63 
64 char const __kmp_version_omp_api[] =
65     KMP_VERSION_PREFIX "API version: 5.0 (201611)";
66 
67 #ifdef KMP_DEBUG
68 char const __kmp_version_lock[] =
69     KMP_VERSION_PREFIX "lock type: run time selectable";
70 #endif /* KMP_DEBUG */
71 
72 #define KMP_MIN(x, y) ((x) < (y) ? (x) : (y))
73 
74 /* ------------------------------------------------------------------------ */
75 
76 #if KMP_USE_MONITOR
77 kmp_info_t __kmp_monitor;
78 #endif
79 
80 /* Forward declarations */
81 
82 void __kmp_cleanup(void);
83 
84 static void __kmp_initialize_info(kmp_info_t *, kmp_team_t *, int tid,
85                                   int gtid);
86 static void __kmp_initialize_team(kmp_team_t *team, int new_nproc,
87                                   kmp_internal_control_t *new_icvs,
88                                   ident_t *loc);
89 #if KMP_AFFINITY_SUPPORTED
90 static void __kmp_partition_places(kmp_team_t *team,
91                                    int update_master_only = 0);
92 #endif
93 static void __kmp_do_serial_initialize(void);
94 void __kmp_fork_barrier(int gtid, int tid);
95 void __kmp_join_barrier(int gtid);
96 void __kmp_setup_icv_copy(kmp_team_t *team, int new_nproc,
97                           kmp_internal_control_t *new_icvs, ident_t *loc);
98 
99 #ifdef USE_LOAD_BALANCE
100 static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc);
101 #endif
102 
103 static int __kmp_expand_threads(int nNeed);
104 #if KMP_OS_WINDOWS
105 static int __kmp_unregister_root_other_thread(int gtid);
106 #endif
107 static void __kmp_reap_thread(kmp_info_t *thread, int is_root);
108 kmp_info_t *__kmp_thread_pool_insert_pt = NULL;
109 
110 /* Calculate the identifier of the current thread */
111 /* fast (and somewhat portable) way to get unique identifier of executing
112    thread. Returns KMP_GTID_DNE if we haven't been assigned a gtid. */
113 int __kmp_get_global_thread_id() {
114   int i;
115   kmp_info_t **other_threads;
116   size_t stack_data;
117   char *stack_addr;
118   size_t stack_size;
119   char *stack_base;
120 
121   KA_TRACE(
122       1000,
123       ("*** __kmp_get_global_thread_id: entering, nproc=%d  all_nproc=%d\n",
124        __kmp_nth, __kmp_all_nth));
125 
126   /* JPH - to handle the case where __kmpc_end(0) is called immediately prior to
127      a parallel region, made it return KMP_GTID_DNE to force serial_initialize
128      by caller. Had to handle KMP_GTID_DNE at all call-sites, or else guarantee
129      __kmp_init_gtid for this to work. */
130 
131   if (!TCR_4(__kmp_init_gtid))
132     return KMP_GTID_DNE;
133 
134 #ifdef KMP_TDATA_GTID
135   if (TCR_4(__kmp_gtid_mode) >= 3) {
136     KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using TDATA\n"));
137     return __kmp_gtid;
138   }
139 #endif
140   if (TCR_4(__kmp_gtid_mode) >= 2) {
141     KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using keyed TLS\n"));
142     return __kmp_gtid_get_specific();
143   }
144   KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using internal alg.\n"));
145 
146   stack_addr = (char *)&stack_data;
147   other_threads = __kmp_threads;
148 
149   /* ATT: The code below is a source of potential bugs due to unsynchronized
150      access to __kmp_threads array. For example:
151      1. Current thread loads other_threads[i] to thr and checks it, it is
152         non-NULL.
153      2. Current thread is suspended by OS.
154      3. Another thread unregisters and finishes (debug versions of free()
155         may fill memory with something like 0xEF).
156      4. Current thread is resumed.
157      5. Current thread reads junk from *thr.
158      TODO: Fix it.  --ln  */
159 
160   for (i = 0; i < __kmp_threads_capacity; i++) {
161 
162     kmp_info_t *thr = (kmp_info_t *)TCR_SYNC_PTR(other_threads[i]);
163     if (!thr)
164       continue;
165 
166     stack_size = (size_t)TCR_PTR(thr->th.th_info.ds.ds_stacksize);
167     stack_base = (char *)TCR_PTR(thr->th.th_info.ds.ds_stackbase);
168 
169     /* stack grows down -- search through all of the active threads */
170 
171     if (stack_addr <= stack_base) {
172       size_t stack_diff = stack_base - stack_addr;
173 
174       if (stack_diff <= stack_size) {
175         /* The only way we can be closer than the allocated */
176         /* stack size is if we are running on this thread. */
177         KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() == i);
178         return i;
179       }
180     }
181   }
182 
183   /* get specific to try and determine our gtid */
184   KA_TRACE(1000,
185            ("*** __kmp_get_global_thread_id: internal alg. failed to find "
186             "thread, using TLS\n"));
187   i = __kmp_gtid_get_specific();
188 
189   /*fprintf( stderr, "=== %d\n", i );  */ /* GROO */
190 
191   /* if we havn't been assigned a gtid, then return code */
192   if (i < 0)
193     return i;
194 
195   /* dynamically updated stack window for uber threads to avoid get_specific
196      call */
197   if (!TCR_4(other_threads[i]->th.th_info.ds.ds_stackgrow)) {
198     KMP_FATAL(StackOverflow, i);
199   }
200 
201   stack_base = (char *)other_threads[i]->th.th_info.ds.ds_stackbase;
202   if (stack_addr > stack_base) {
203     TCW_PTR(other_threads[i]->th.th_info.ds.ds_stackbase, stack_addr);
204     TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize,
205             other_threads[i]->th.th_info.ds.ds_stacksize + stack_addr -
206                 stack_base);
207   } else {
208     TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize,
209             stack_base - stack_addr);
210   }
211 
212   /* Reprint stack bounds for ubermaster since they have been refined */
213   if (__kmp_storage_map) {
214     char *stack_end = (char *)other_threads[i]->th.th_info.ds.ds_stackbase;
215     char *stack_beg = stack_end - other_threads[i]->th.th_info.ds.ds_stacksize;
216     __kmp_print_storage_map_gtid(i, stack_beg, stack_end,
217                                  other_threads[i]->th.th_info.ds.ds_stacksize,
218                                  "th_%d stack (refinement)", i);
219   }
220   return i;
221 }
222 
223 int __kmp_get_global_thread_id_reg() {
224   int gtid;
225 
226   if (!__kmp_init_serial) {
227     gtid = KMP_GTID_DNE;
228   } else
229 #ifdef KMP_TDATA_GTID
230       if (TCR_4(__kmp_gtid_mode) >= 3) {
231     KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using TDATA\n"));
232     gtid = __kmp_gtid;
233   } else
234 #endif
235       if (TCR_4(__kmp_gtid_mode) >= 2) {
236     KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using keyed TLS\n"));
237     gtid = __kmp_gtid_get_specific();
238   } else {
239     KA_TRACE(1000,
240              ("*** __kmp_get_global_thread_id_reg: using internal alg.\n"));
241     gtid = __kmp_get_global_thread_id();
242   }
243 
244   /* we must be a new uber master sibling thread */
245   if (gtid == KMP_GTID_DNE) {
246     KA_TRACE(10,
247              ("__kmp_get_global_thread_id_reg: Encountered new root thread. "
248               "Registering a new gtid.\n"));
249     __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
250     if (!__kmp_init_serial) {
251       __kmp_do_serial_initialize();
252       gtid = __kmp_gtid_get_specific();
253     } else {
254       gtid = __kmp_register_root(FALSE);
255     }
256     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
257     /*__kmp_printf( "+++ %d\n", gtid ); */ /* GROO */
258   }
259 
260   KMP_DEBUG_ASSERT(gtid >= 0);
261 
262   return gtid;
263 }
264 
265 /* caller must hold forkjoin_lock */
266 void __kmp_check_stack_overlap(kmp_info_t *th) {
267   int f;
268   char *stack_beg = NULL;
269   char *stack_end = NULL;
270   int gtid;
271 
272   KA_TRACE(10, ("__kmp_check_stack_overlap: called\n"));
273   if (__kmp_storage_map) {
274     stack_end = (char *)th->th.th_info.ds.ds_stackbase;
275     stack_beg = stack_end - th->th.th_info.ds.ds_stacksize;
276 
277     gtid = __kmp_gtid_from_thread(th);
278 
279     if (gtid == KMP_GTID_MONITOR) {
280       __kmp_print_storage_map_gtid(
281           gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize,
282           "th_%s stack (%s)", "mon",
283           (th->th.th_info.ds.ds_stackgrow) ? "initial" : "actual");
284     } else {
285       __kmp_print_storage_map_gtid(
286           gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize,
287           "th_%d stack (%s)", gtid,
288           (th->th.th_info.ds.ds_stackgrow) ? "initial" : "actual");
289     }
290   }
291 
292   /* No point in checking ubermaster threads since they use refinement and
293    * cannot overlap */
294   gtid = __kmp_gtid_from_thread(th);
295   if (__kmp_env_checks == TRUE && !KMP_UBER_GTID(gtid)) {
296     KA_TRACE(10,
297              ("__kmp_check_stack_overlap: performing extensive checking\n"));
298     if (stack_beg == NULL) {
299       stack_end = (char *)th->th.th_info.ds.ds_stackbase;
300       stack_beg = stack_end - th->th.th_info.ds.ds_stacksize;
301     }
302 
303     for (f = 0; f < __kmp_threads_capacity; f++) {
304       kmp_info_t *f_th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[f]);
305 
306       if (f_th && f_th != th) {
307         char *other_stack_end =
308             (char *)TCR_PTR(f_th->th.th_info.ds.ds_stackbase);
309         char *other_stack_beg =
310             other_stack_end - (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize);
311         if ((stack_beg > other_stack_beg && stack_beg < other_stack_end) ||
312             (stack_end > other_stack_beg && stack_end < other_stack_end)) {
313 
314           /* Print the other stack values before the abort */
315           if (__kmp_storage_map)
316             __kmp_print_storage_map_gtid(
317                 -1, other_stack_beg, other_stack_end,
318                 (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize),
319                 "th_%d stack (overlapped)", __kmp_gtid_from_thread(f_th));
320 
321           __kmp_fatal(KMP_MSG(StackOverlap), KMP_HNT(ChangeStackLimit),
322                       __kmp_msg_null);
323         }
324       }
325     }
326   }
327   KA_TRACE(10, ("__kmp_check_stack_overlap: returning\n"));
328 }
329 
330 /* ------------------------------------------------------------------------ */
331 
332 void __kmp_infinite_loop(void) {
333   static int done = FALSE;
334 
335   while (!done) {
336     KMP_YIELD(TRUE);
337   }
338 }
339 
340 #define MAX_MESSAGE 512
341 
342 void __kmp_print_storage_map_gtid(int gtid, void *p1, void *p2, size_t size,
343                                   char const *format, ...) {
344   char buffer[MAX_MESSAGE];
345   va_list ap;
346 
347   va_start(ap, format);
348   KMP_SNPRINTF(buffer, sizeof(buffer), "OMP storage map: %p %p%8lu %s\n", p1,
349                p2, (unsigned long)size, format);
350   __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock);
351   __kmp_vprintf(kmp_err, buffer, ap);
352 #if KMP_PRINT_DATA_PLACEMENT
353   int node;
354   if (gtid >= 0) {
355     if (p1 <= p2 && (char *)p2 - (char *)p1 == size) {
356       if (__kmp_storage_map_verbose) {
357         node = __kmp_get_host_node(p1);
358         if (node < 0) /* doesn't work, so don't try this next time */
359           __kmp_storage_map_verbose = FALSE;
360         else {
361           char *last;
362           int lastNode;
363           int localProc = __kmp_get_cpu_from_gtid(gtid);
364 
365           const int page_size = KMP_GET_PAGE_SIZE();
366 
367           p1 = (void *)((size_t)p1 & ~((size_t)page_size - 1));
368           p2 = (void *)(((size_t)p2 - 1) & ~((size_t)page_size - 1));
369           if (localProc >= 0)
370             __kmp_printf_no_lock("  GTID %d localNode %d\n", gtid,
371                                  localProc >> 1);
372           else
373             __kmp_printf_no_lock("  GTID %d\n", gtid);
374 #if KMP_USE_PRCTL
375           /* The more elaborate format is disabled for now because of the prctl
376            * hanging bug. */
377           do {
378             last = p1;
379             lastNode = node;
380             /* This loop collates adjacent pages with the same host node. */
381             do {
382               (char *)p1 += page_size;
383             } while (p1 <= p2 && (node = __kmp_get_host_node(p1)) == lastNode);
384             __kmp_printf_no_lock("    %p-%p memNode %d\n", last, (char *)p1 - 1,
385                                  lastNode);
386           } while (p1 <= p2);
387 #else
388           __kmp_printf_no_lock("    %p-%p memNode %d\n", p1,
389                                (char *)p1 + (page_size - 1),
390                                __kmp_get_host_node(p1));
391           if (p1 < p2) {
392             __kmp_printf_no_lock("    %p-%p memNode %d\n", p2,
393                                  (char *)p2 + (page_size - 1),
394                                  __kmp_get_host_node(p2));
395           }
396 #endif
397         }
398       }
399     } else
400       __kmp_printf_no_lock("  %s\n", KMP_I18N_STR(StorageMapWarning));
401   }
402 #endif /* KMP_PRINT_DATA_PLACEMENT */
403   __kmp_release_bootstrap_lock(&__kmp_stdio_lock);
404 }
405 
406 void __kmp_warn(char const *format, ...) {
407   char buffer[MAX_MESSAGE];
408   va_list ap;
409 
410   if (__kmp_generate_warnings == kmp_warnings_off) {
411     return;
412   }
413 
414   va_start(ap, format);
415 
416   KMP_SNPRINTF(buffer, sizeof(buffer), "OMP warning: %s\n", format);
417   __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock);
418   __kmp_vprintf(kmp_err, buffer, ap);
419   __kmp_release_bootstrap_lock(&__kmp_stdio_lock);
420 
421   va_end(ap);
422 }
423 
424 void __kmp_abort_process() {
425   // Later threads may stall here, but that's ok because abort() will kill them.
426   __kmp_acquire_bootstrap_lock(&__kmp_exit_lock);
427 
428   if (__kmp_debug_buf) {
429     __kmp_dump_debug_buffer();
430   }
431 
432   if (KMP_OS_WINDOWS) {
433     // Let other threads know of abnormal termination and prevent deadlock
434     // if abort happened during library initialization or shutdown
435     __kmp_global.g.g_abort = SIGABRT;
436 
437     /* On Windows* OS by default abort() causes pop-up error box, which stalls
438        nightly testing. Unfortunately, we cannot reliably suppress pop-up error
439        boxes. _set_abort_behavior() works well, but this function is not
440        available in VS7 (this is not problem for DLL, but it is a problem for
441        static OpenMP RTL). SetErrorMode (and so, timelimit utility) does not
442        help, at least in some versions of MS C RTL.
443 
444        It seems following sequence is the only way to simulate abort() and
445        avoid pop-up error box. */
446     raise(SIGABRT);
447     _exit(3); // Just in case, if signal ignored, exit anyway.
448   } else {
449     __kmp_unregister_library();
450     abort();
451   }
452 
453   __kmp_infinite_loop();
454   __kmp_release_bootstrap_lock(&__kmp_exit_lock);
455 
456 } // __kmp_abort_process
457 
458 void __kmp_abort_thread(void) {
459   // TODO: Eliminate g_abort global variable and this function.
460   // In case of abort just call abort(), it will kill all the threads.
461   __kmp_infinite_loop();
462 } // __kmp_abort_thread
463 
464 /* Print out the storage map for the major kmp_info_t thread data structures
465    that are allocated together. */
466 
467 static void __kmp_print_thread_storage_map(kmp_info_t *thr, int gtid) {
468   __kmp_print_storage_map_gtid(gtid, thr, thr + 1, sizeof(kmp_info_t), "th_%d",
469                                gtid);
470 
471   __kmp_print_storage_map_gtid(gtid, &thr->th.th_info, &thr->th.th_team,
472                                sizeof(kmp_desc_t), "th_%d.th_info", gtid);
473 
474   __kmp_print_storage_map_gtid(gtid, &thr->th.th_local, &thr->th.th_pri_head,
475                                sizeof(kmp_local_t), "th_%d.th_local", gtid);
476 
477   __kmp_print_storage_map_gtid(
478       gtid, &thr->th.th_bar[0], &thr->th.th_bar[bs_last_barrier],
479       sizeof(kmp_balign_t) * bs_last_barrier, "th_%d.th_bar", gtid);
480 
481   __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_plain_barrier],
482                                &thr->th.th_bar[bs_plain_barrier + 1],
483                                sizeof(kmp_balign_t), "th_%d.th_bar[plain]",
484                                gtid);
485 
486   __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_forkjoin_barrier],
487                                &thr->th.th_bar[bs_forkjoin_barrier + 1],
488                                sizeof(kmp_balign_t), "th_%d.th_bar[forkjoin]",
489                                gtid);
490 
491 #if KMP_FAST_REDUCTION_BARRIER
492   __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_reduction_barrier],
493                                &thr->th.th_bar[bs_reduction_barrier + 1],
494                                sizeof(kmp_balign_t), "th_%d.th_bar[reduction]",
495                                gtid);
496 #endif // KMP_FAST_REDUCTION_BARRIER
497 }
498 
499 /* Print out the storage map for the major kmp_team_t team data structures
500    that are allocated together. */
501 
502 static void __kmp_print_team_storage_map(const char *header, kmp_team_t *team,
503                                          int team_id, int num_thr) {
504   int num_disp_buff = team->t.t_max_nproc > 1 ? __kmp_dispatch_num_buffers : 2;
505   __kmp_print_storage_map_gtid(-1, team, team + 1, sizeof(kmp_team_t), "%s_%d",
506                                header, team_id);
507 
508   __kmp_print_storage_map_gtid(-1, &team->t.t_bar[0],
509                                &team->t.t_bar[bs_last_barrier],
510                                sizeof(kmp_balign_team_t) * bs_last_barrier,
511                                "%s_%d.t_bar", header, team_id);
512 
513   __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_plain_barrier],
514                                &team->t.t_bar[bs_plain_barrier + 1],
515                                sizeof(kmp_balign_team_t), "%s_%d.t_bar[plain]",
516                                header, team_id);
517 
518   __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_forkjoin_barrier],
519                                &team->t.t_bar[bs_forkjoin_barrier + 1],
520                                sizeof(kmp_balign_team_t),
521                                "%s_%d.t_bar[forkjoin]", header, team_id);
522 
523 #if KMP_FAST_REDUCTION_BARRIER
524   __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_reduction_barrier],
525                                &team->t.t_bar[bs_reduction_barrier + 1],
526                                sizeof(kmp_balign_team_t),
527                                "%s_%d.t_bar[reduction]", header, team_id);
528 #endif // KMP_FAST_REDUCTION_BARRIER
529 
530   __kmp_print_storage_map_gtid(
531       -1, &team->t.t_dispatch[0], &team->t.t_dispatch[num_thr],
532       sizeof(kmp_disp_t) * num_thr, "%s_%d.t_dispatch", header, team_id);
533 
534   __kmp_print_storage_map_gtid(
535       -1, &team->t.t_threads[0], &team->t.t_threads[num_thr],
536       sizeof(kmp_info_t *) * num_thr, "%s_%d.t_threads", header, team_id);
537 
538   __kmp_print_storage_map_gtid(-1, &team->t.t_disp_buffer[0],
539                                &team->t.t_disp_buffer[num_disp_buff],
540                                sizeof(dispatch_shared_info_t) * num_disp_buff,
541                                "%s_%d.t_disp_buffer", header, team_id);
542 }
543 
544 static void __kmp_init_allocator() {
545   __kmp_init_memkind();
546   __kmp_init_target_mem();
547 }
548 static void __kmp_fini_allocator() { __kmp_fini_memkind(); }
549 
550 /* ------------------------------------------------------------------------ */
551 
552 #if KMP_DYNAMIC_LIB
553 #if KMP_OS_WINDOWS
554 
555 BOOL WINAPI DllMain(HINSTANCE hInstDLL, DWORD fdwReason, LPVOID lpReserved) {
556   //__kmp_acquire_bootstrap_lock( &__kmp_initz_lock );
557 
558   switch (fdwReason) {
559 
560   case DLL_PROCESS_ATTACH:
561     KA_TRACE(10, ("DllMain: PROCESS_ATTACH\n"));
562 
563     return TRUE;
564 
565   case DLL_PROCESS_DETACH:
566     KA_TRACE(10, ("DllMain: PROCESS_DETACH T#%d\n", __kmp_gtid_get_specific()));
567 
568     // According to Windows* documentation for DllMain entry point:
569     // for DLL_PROCESS_DETACH, lpReserved is used for telling the difference:
570     //   lpReserved == NULL when FreeLibrary() is called,
571     //   lpReserved != NULL when the process is terminated.
572     // When FreeLibrary() is called, worker threads remain alive. So the
573     // runtime's state is consistent and executing proper shutdown is OK.
574     // When the process is terminated, worker threads have exited or been
575     // forcefully terminated by the OS and only the shutdown thread remains.
576     // This can leave the runtime in an inconsistent state.
577     // Hence, only attempt proper cleanup when FreeLibrary() is called.
578     // Otherwise, rely on OS to reclaim resources.
579     if (lpReserved == NULL)
580       __kmp_internal_end_library(__kmp_gtid_get_specific());
581 
582     return TRUE;
583 
584   case DLL_THREAD_ATTACH:
585     KA_TRACE(10, ("DllMain: THREAD_ATTACH\n"));
586 
587     /* if we want to register new siblings all the time here call
588      * __kmp_get_gtid(); */
589     return TRUE;
590 
591   case DLL_THREAD_DETACH:
592     KA_TRACE(10, ("DllMain: THREAD_DETACH T#%d\n", __kmp_gtid_get_specific()));
593 
594     __kmp_internal_end_thread(__kmp_gtid_get_specific());
595     return TRUE;
596   }
597 
598   return TRUE;
599 }
600 
601 #endif /* KMP_OS_WINDOWS */
602 #endif /* KMP_DYNAMIC_LIB */
603 
604 /* __kmp_parallel_deo -- Wait until it's our turn. */
605 void __kmp_parallel_deo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
606   int gtid = *gtid_ref;
607 #ifdef BUILD_PARALLEL_ORDERED
608   kmp_team_t *team = __kmp_team_from_gtid(gtid);
609 #endif /* BUILD_PARALLEL_ORDERED */
610 
611   if (__kmp_env_consistency_check) {
612     if (__kmp_threads[gtid]->th.th_root->r.r_active)
613 #if KMP_USE_DYNAMIC_LOCK
614       __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL, 0);
615 #else
616       __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL);
617 #endif
618   }
619 #ifdef BUILD_PARALLEL_ORDERED
620   if (!team->t.t_serialized) {
621     KMP_MB();
622     KMP_WAIT(&team->t.t_ordered.dt.t_value, __kmp_tid_from_gtid(gtid), KMP_EQ,
623              NULL);
624     KMP_MB();
625   }
626 #endif /* BUILD_PARALLEL_ORDERED */
627 }
628 
629 /* __kmp_parallel_dxo -- Signal the next task. */
630 void __kmp_parallel_dxo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
631   int gtid = *gtid_ref;
632 #ifdef BUILD_PARALLEL_ORDERED
633   int tid = __kmp_tid_from_gtid(gtid);
634   kmp_team_t *team = __kmp_team_from_gtid(gtid);
635 #endif /* BUILD_PARALLEL_ORDERED */
636 
637   if (__kmp_env_consistency_check) {
638     if (__kmp_threads[gtid]->th.th_root->r.r_active)
639       __kmp_pop_sync(gtid, ct_ordered_in_parallel, loc_ref);
640   }
641 #ifdef BUILD_PARALLEL_ORDERED
642   if (!team->t.t_serialized) {
643     KMP_MB(); /* Flush all pending memory write invalidates.  */
644 
645     /* use the tid of the next thread in this team */
646     /* TODO replace with general release procedure */
647     team->t.t_ordered.dt.t_value = ((tid + 1) % team->t.t_nproc);
648 
649     KMP_MB(); /* Flush all pending memory write invalidates.  */
650   }
651 #endif /* BUILD_PARALLEL_ORDERED */
652 }
653 
654 /* ------------------------------------------------------------------------ */
655 /* The BARRIER for a SINGLE process section is always explicit   */
656 
657 int __kmp_enter_single(int gtid, ident_t *id_ref, int push_ws) {
658   int status;
659   kmp_info_t *th;
660   kmp_team_t *team;
661 
662   if (!TCR_4(__kmp_init_parallel))
663     __kmp_parallel_initialize();
664   __kmp_resume_if_soft_paused();
665 
666   th = __kmp_threads[gtid];
667   team = th->th.th_team;
668   status = 0;
669 
670   th->th.th_ident = id_ref;
671 
672   if (team->t.t_serialized) {
673     status = 1;
674   } else {
675     kmp_int32 old_this = th->th.th_local.this_construct;
676 
677     ++th->th.th_local.this_construct;
678     /* try to set team count to thread count--success means thread got the
679        single block */
680     /* TODO: Should this be acquire or release? */
681     if (team->t.t_construct == old_this) {
682       status = __kmp_atomic_compare_store_acq(&team->t.t_construct, old_this,
683                                               th->th.th_local.this_construct);
684     }
685 #if USE_ITT_BUILD
686     if (__itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 &&
687         KMP_MASTER_GTID(gtid) && th->th.th_teams_microtask == NULL &&
688         team->t.t_active_level == 1) {
689       // Only report metadata by primary thread of active team at level 1
690       __kmp_itt_metadata_single(id_ref);
691     }
692 #endif /* USE_ITT_BUILD */
693   }
694 
695   if (__kmp_env_consistency_check) {
696     if (status && push_ws) {
697       __kmp_push_workshare(gtid, ct_psingle, id_ref);
698     } else {
699       __kmp_check_workshare(gtid, ct_psingle, id_ref);
700     }
701   }
702 #if USE_ITT_BUILD
703   if (status) {
704     __kmp_itt_single_start(gtid);
705   }
706 #endif /* USE_ITT_BUILD */
707   return status;
708 }
709 
710 void __kmp_exit_single(int gtid) {
711 #if USE_ITT_BUILD
712   __kmp_itt_single_end(gtid);
713 #endif /* USE_ITT_BUILD */
714   if (__kmp_env_consistency_check)
715     __kmp_pop_workshare(gtid, ct_psingle, NULL);
716 }
717 
718 /* determine if we can go parallel or must use a serialized parallel region and
719  * how many threads we can use
720  * set_nproc is the number of threads requested for the team
721  * returns 0 if we should serialize or only use one thread,
722  * otherwise the number of threads to use
723  * The forkjoin lock is held by the caller. */
724 static int __kmp_reserve_threads(kmp_root_t *root, kmp_team_t *parent_team,
725                                  int master_tid, int set_nthreads,
726                                  int enter_teams) {
727   int capacity;
728   int new_nthreads;
729   KMP_DEBUG_ASSERT(__kmp_init_serial);
730   KMP_DEBUG_ASSERT(root && parent_team);
731   kmp_info_t *this_thr = parent_team->t.t_threads[master_tid];
732 
733   // If dyn-var is set, dynamically adjust the number of desired threads,
734   // according to the method specified by dynamic_mode.
735   new_nthreads = set_nthreads;
736   if (!get__dynamic_2(parent_team, master_tid)) {
737     ;
738   }
739 #ifdef USE_LOAD_BALANCE
740   else if (__kmp_global.g.g_dynamic_mode == dynamic_load_balance) {
741     new_nthreads = __kmp_load_balance_nproc(root, set_nthreads);
742     if (new_nthreads == 1) {
743       KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced "
744                     "reservation to 1 thread\n",
745                     master_tid));
746       return 1;
747     }
748     if (new_nthreads < set_nthreads) {
749       KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced "
750                     "reservation to %d threads\n",
751                     master_tid, new_nthreads));
752     }
753   }
754 #endif /* USE_LOAD_BALANCE */
755   else if (__kmp_global.g.g_dynamic_mode == dynamic_thread_limit) {
756     new_nthreads = __kmp_avail_proc - __kmp_nth +
757                    (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
758     if (new_nthreads <= 1) {
759       KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced "
760                     "reservation to 1 thread\n",
761                     master_tid));
762       return 1;
763     }
764     if (new_nthreads < set_nthreads) {
765       KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced "
766                     "reservation to %d threads\n",
767                     master_tid, new_nthreads));
768     } else {
769       new_nthreads = set_nthreads;
770     }
771   } else if (__kmp_global.g.g_dynamic_mode == dynamic_random) {
772     if (set_nthreads > 2) {
773       new_nthreads = __kmp_get_random(parent_team->t.t_threads[master_tid]);
774       new_nthreads = (new_nthreads % set_nthreads) + 1;
775       if (new_nthreads == 1) {
776         KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced "
777                       "reservation to 1 thread\n",
778                       master_tid));
779         return 1;
780       }
781       if (new_nthreads < set_nthreads) {
782         KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced "
783                       "reservation to %d threads\n",
784                       master_tid, new_nthreads));
785       }
786     }
787   } else {
788     KMP_ASSERT(0);
789   }
790 
791   // Respect KMP_ALL_THREADS/KMP_DEVICE_THREAD_LIMIT.
792   if (__kmp_nth + new_nthreads -
793           (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
794       __kmp_max_nth) {
795     int tl_nthreads = __kmp_max_nth - __kmp_nth +
796                       (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
797     if (tl_nthreads <= 0) {
798       tl_nthreads = 1;
799     }
800 
801     // If dyn-var is false, emit a 1-time warning.
802     if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
803       __kmp_reserve_warn = 1;
804       __kmp_msg(kmp_ms_warning,
805                 KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads),
806                 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
807     }
808     if (tl_nthreads == 1) {
809       KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT "
810                     "reduced reservation to 1 thread\n",
811                     master_tid));
812       return 1;
813     }
814     KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT reduced "
815                   "reservation to %d threads\n",
816                   master_tid, tl_nthreads));
817     new_nthreads = tl_nthreads;
818   }
819 
820   // Respect OMP_THREAD_LIMIT
821   int cg_nthreads = this_thr->th.th_cg_roots->cg_nthreads;
822   int max_cg_threads = this_thr->th.th_cg_roots->cg_thread_limit;
823   if (cg_nthreads + new_nthreads -
824           (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
825       max_cg_threads) {
826     int tl_nthreads = max_cg_threads - cg_nthreads +
827                       (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
828     if (tl_nthreads <= 0) {
829       tl_nthreads = 1;
830     }
831 
832     // If dyn-var is false, emit a 1-time warning.
833     if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
834       __kmp_reserve_warn = 1;
835       __kmp_msg(kmp_ms_warning,
836                 KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads),
837                 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
838     }
839     if (tl_nthreads == 1) {
840       KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT "
841                     "reduced reservation to 1 thread\n",
842                     master_tid));
843       return 1;
844     }
845     KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT reduced "
846                   "reservation to %d threads\n",
847                   master_tid, tl_nthreads));
848     new_nthreads = tl_nthreads;
849   }
850 
851   // Check if the threads array is large enough, or needs expanding.
852   // See comment in __kmp_register_root() about the adjustment if
853   // __kmp_threads[0] == NULL.
854   capacity = __kmp_threads_capacity;
855   if (TCR_PTR(__kmp_threads[0]) == NULL) {
856     --capacity;
857   }
858   // If it is not for initializing the hidden helper team, we need to take
859   // __kmp_hidden_helper_threads_num out of the capacity because it is included
860   // in __kmp_threads_capacity.
861   if (__kmp_enable_hidden_helper && !TCR_4(__kmp_init_hidden_helper_threads)) {
862     capacity -= __kmp_hidden_helper_threads_num;
863   }
864   if (__kmp_nth + new_nthreads -
865           (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
866       capacity) {
867     // Expand the threads array.
868     int slotsRequired = __kmp_nth + new_nthreads -
869                         (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) -
870                         capacity;
871     int slotsAdded = __kmp_expand_threads(slotsRequired);
872     if (slotsAdded < slotsRequired) {
873       // The threads array was not expanded enough.
874       new_nthreads -= (slotsRequired - slotsAdded);
875       KMP_ASSERT(new_nthreads >= 1);
876 
877       // If dyn-var is false, emit a 1-time warning.
878       if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
879         __kmp_reserve_warn = 1;
880         if (__kmp_tp_cached) {
881           __kmp_msg(kmp_ms_warning,
882                     KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads),
883                     KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity),
884                     KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null);
885         } else {
886           __kmp_msg(kmp_ms_warning,
887                     KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads),
888                     KMP_HNT(SystemLimitOnThreads), __kmp_msg_null);
889         }
890       }
891     }
892   }
893 
894 #ifdef KMP_DEBUG
895   if (new_nthreads == 1) {
896     KC_TRACE(10,
897              ("__kmp_reserve_threads: T#%d serializing team after reclaiming "
898               "dead roots and rechecking; requested %d threads\n",
899               __kmp_get_gtid(), set_nthreads));
900   } else {
901     KC_TRACE(10, ("__kmp_reserve_threads: T#%d allocating %d threads; requested"
902                   " %d threads\n",
903                   __kmp_get_gtid(), new_nthreads, set_nthreads));
904   }
905 #endif // KMP_DEBUG
906   return new_nthreads;
907 }
908 
909 /* Allocate threads from the thread pool and assign them to the new team. We are
910    assured that there are enough threads available, because we checked on that
911    earlier within critical section forkjoin */
912 static void __kmp_fork_team_threads(kmp_root_t *root, kmp_team_t *team,
913                                     kmp_info_t *master_th, int master_gtid) {
914   int i;
915   int use_hot_team;
916 
917   KA_TRACE(10, ("__kmp_fork_team_threads: new_nprocs = %d\n", team->t.t_nproc));
918   KMP_DEBUG_ASSERT(master_gtid == __kmp_get_gtid());
919   KMP_MB();
920 
921   /* first, let's setup the primary thread */
922   master_th->th.th_info.ds.ds_tid = 0;
923   master_th->th.th_team = team;
924   master_th->th.th_team_nproc = team->t.t_nproc;
925   master_th->th.th_team_master = master_th;
926   master_th->th.th_team_serialized = FALSE;
927   master_th->th.th_dispatch = &team->t.t_dispatch[0];
928 
929 /* make sure we are not the optimized hot team */
930 #if KMP_NESTED_HOT_TEAMS
931   use_hot_team = 0;
932   kmp_hot_team_ptr_t *hot_teams = master_th->th.th_hot_teams;
933   if (hot_teams) { // hot teams array is not allocated if
934     // KMP_HOT_TEAMS_MAX_LEVEL=0
935     int level = team->t.t_active_level - 1; // index in array of hot teams
936     if (master_th->th.th_teams_microtask) { // are we inside the teams?
937       if (master_th->th.th_teams_size.nteams > 1) {
938         ++level; // level was not increased in teams construct for
939         // team_of_masters
940       }
941       if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
942           master_th->th.th_teams_level == team->t.t_level) {
943         ++level; // level was not increased in teams construct for
944         // team_of_workers before the parallel
945       } // team->t.t_level will be increased inside parallel
946     }
947     if (level < __kmp_hot_teams_max_level) {
948       if (hot_teams[level].hot_team) {
949         // hot team has already been allocated for given level
950         KMP_DEBUG_ASSERT(hot_teams[level].hot_team == team);
951         use_hot_team = 1; // the team is ready to use
952       } else {
953         use_hot_team = 0; // AC: threads are not allocated yet
954         hot_teams[level].hot_team = team; // remember new hot team
955         hot_teams[level].hot_team_nth = team->t.t_nproc;
956       }
957     } else {
958       use_hot_team = 0;
959     }
960   }
961 #else
962   use_hot_team = team == root->r.r_hot_team;
963 #endif
964   if (!use_hot_team) {
965 
966     /* install the primary thread */
967     team->t.t_threads[0] = master_th;
968     __kmp_initialize_info(master_th, team, 0, master_gtid);
969 
970     /* now, install the worker threads */
971     for (i = 1; i < team->t.t_nproc; i++) {
972 
973       /* fork or reallocate a new thread and install it in team */
974       kmp_info_t *thr = __kmp_allocate_thread(root, team, i);
975       team->t.t_threads[i] = thr;
976       KMP_DEBUG_ASSERT(thr);
977       KMP_DEBUG_ASSERT(thr->th.th_team == team);
978       /* align team and thread arrived states */
979       KA_TRACE(20, ("__kmp_fork_team_threads: T#%d(%d:%d) init arrived "
980                     "T#%d(%d:%d) join =%llu, plain=%llu\n",
981                     __kmp_gtid_from_tid(0, team), team->t.t_id, 0,
982                     __kmp_gtid_from_tid(i, team), team->t.t_id, i,
983                     team->t.t_bar[bs_forkjoin_barrier].b_arrived,
984                     team->t.t_bar[bs_plain_barrier].b_arrived));
985       thr->th.th_teams_microtask = master_th->th.th_teams_microtask;
986       thr->th.th_teams_level = master_th->th.th_teams_level;
987       thr->th.th_teams_size = master_th->th.th_teams_size;
988       { // Initialize threads' barrier data.
989         int b;
990         kmp_balign_t *balign = team->t.t_threads[i]->th.th_bar;
991         for (b = 0; b < bs_last_barrier; ++b) {
992           balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
993           KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
994 #if USE_DEBUGGER
995           balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
996 #endif
997         }
998       }
999     }
1000 
1001 #if KMP_AFFINITY_SUPPORTED
1002     __kmp_partition_places(team);
1003 #endif
1004   }
1005 
1006   if (__kmp_display_affinity && team->t.t_display_affinity != 1) {
1007     for (i = 0; i < team->t.t_nproc; i++) {
1008       kmp_info_t *thr = team->t.t_threads[i];
1009       if (thr->th.th_prev_num_threads != team->t.t_nproc ||
1010           thr->th.th_prev_level != team->t.t_level) {
1011         team->t.t_display_affinity = 1;
1012         break;
1013       }
1014     }
1015   }
1016 
1017   KMP_MB();
1018 }
1019 
1020 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
1021 // Propagate any changes to the floating point control registers out to the team
1022 // We try to avoid unnecessary writes to the relevant cache line in the team
1023 // structure, so we don't make changes unless they are needed.
1024 inline static void propagateFPControl(kmp_team_t *team) {
1025   if (__kmp_inherit_fp_control) {
1026     kmp_int16 x87_fpu_control_word;
1027     kmp_uint32 mxcsr;
1028 
1029     // Get primary thread's values of FPU control flags (both X87 and vector)
1030     __kmp_store_x87_fpu_control_word(&x87_fpu_control_word);
1031     __kmp_store_mxcsr(&mxcsr);
1032     mxcsr &= KMP_X86_MXCSR_MASK;
1033 
1034     // There is no point looking at t_fp_control_saved here.
1035     // If it is TRUE, we still have to update the values if they are different
1036     // from those we now have. If it is FALSE we didn't save anything yet, but
1037     // our objective is the same. We have to ensure that the values in the team
1038     // are the same as those we have.
1039     // So, this code achieves what we need whether or not t_fp_control_saved is
1040     // true. By checking whether the value needs updating we avoid unnecessary
1041     // writes that would put the cache-line into a written state, causing all
1042     // threads in the team to have to read it again.
1043     KMP_CHECK_UPDATE(team->t.t_x87_fpu_control_word, x87_fpu_control_word);
1044     KMP_CHECK_UPDATE(team->t.t_mxcsr, mxcsr);
1045     // Although we don't use this value, other code in the runtime wants to know
1046     // whether it should restore them. So we must ensure it is correct.
1047     KMP_CHECK_UPDATE(team->t.t_fp_control_saved, TRUE);
1048   } else {
1049     // Similarly here. Don't write to this cache-line in the team structure
1050     // unless we have to.
1051     KMP_CHECK_UPDATE(team->t.t_fp_control_saved, FALSE);
1052   }
1053 }
1054 
1055 // Do the opposite, setting the hardware registers to the updated values from
1056 // the team.
1057 inline static void updateHWFPControl(kmp_team_t *team) {
1058   if (__kmp_inherit_fp_control && team->t.t_fp_control_saved) {
1059     // Only reset the fp control regs if they have been changed in the team.
1060     // the parallel region that we are exiting.
1061     kmp_int16 x87_fpu_control_word;
1062     kmp_uint32 mxcsr;
1063     __kmp_store_x87_fpu_control_word(&x87_fpu_control_word);
1064     __kmp_store_mxcsr(&mxcsr);
1065     mxcsr &= KMP_X86_MXCSR_MASK;
1066 
1067     if (team->t.t_x87_fpu_control_word != x87_fpu_control_word) {
1068       __kmp_clear_x87_fpu_status_word();
1069       __kmp_load_x87_fpu_control_word(&team->t.t_x87_fpu_control_word);
1070     }
1071 
1072     if (team->t.t_mxcsr != mxcsr) {
1073       __kmp_load_mxcsr(&team->t.t_mxcsr);
1074     }
1075   }
1076 }
1077 #else
1078 #define propagateFPControl(x) ((void)0)
1079 #define updateHWFPControl(x) ((void)0)
1080 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
1081 
1082 static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team,
1083                                      int realloc); // forward declaration
1084 
1085 /* Run a parallel region that has been serialized, so runs only in a team of the
1086    single primary thread. */
1087 void __kmp_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
1088   kmp_info_t *this_thr;
1089   kmp_team_t *serial_team;
1090 
1091   KC_TRACE(10, ("__kmpc_serialized_parallel: called by T#%d\n", global_tid));
1092 
1093   /* Skip all this code for autopar serialized loops since it results in
1094      unacceptable overhead */
1095   if (loc != NULL && (loc->flags & KMP_IDENT_AUTOPAR))
1096     return;
1097 
1098   if (!TCR_4(__kmp_init_parallel))
1099     __kmp_parallel_initialize();
1100   __kmp_resume_if_soft_paused();
1101 
1102   this_thr = __kmp_threads[global_tid];
1103   serial_team = this_thr->th.th_serial_team;
1104 
1105   /* utilize the serialized team held by this thread */
1106   KMP_DEBUG_ASSERT(serial_team);
1107   KMP_MB();
1108 
1109   if (__kmp_tasking_mode != tskm_immediate_exec) {
1110     KMP_DEBUG_ASSERT(
1111         this_thr->th.th_task_team ==
1112         this_thr->th.th_team->t.t_task_team[this_thr->th.th_task_state]);
1113     KMP_DEBUG_ASSERT(serial_team->t.t_task_team[this_thr->th.th_task_state] ==
1114                      NULL);
1115     KA_TRACE(20, ("__kmpc_serialized_parallel: T#%d pushing task_team %p / "
1116                   "team %p, new task_team = NULL\n",
1117                   global_tid, this_thr->th.th_task_team, this_thr->th.th_team));
1118     this_thr->th.th_task_team = NULL;
1119   }
1120 
1121   kmp_proc_bind_t proc_bind = this_thr->th.th_set_proc_bind;
1122   if (this_thr->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
1123     proc_bind = proc_bind_false;
1124   } else if (proc_bind == proc_bind_default) {
1125     // No proc_bind clause was specified, so use the current value
1126     // of proc-bind-var for this parallel region.
1127     proc_bind = this_thr->th.th_current_task->td_icvs.proc_bind;
1128   }
1129   // Reset for next parallel region
1130   this_thr->th.th_set_proc_bind = proc_bind_default;
1131 
1132 #if OMPT_SUPPORT
1133   ompt_data_t ompt_parallel_data = ompt_data_none;
1134   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(global_tid);
1135   if (ompt_enabled.enabled &&
1136       this_thr->th.ompt_thread_info.state != ompt_state_overhead) {
1137 
1138     ompt_task_info_t *parent_task_info;
1139     parent_task_info = OMPT_CUR_TASK_INFO(this_thr);
1140 
1141     parent_task_info->frame.enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1142     if (ompt_enabled.ompt_callback_parallel_begin) {
1143       int team_size = 1;
1144 
1145       ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)(
1146           &(parent_task_info->task_data), &(parent_task_info->frame),
1147           &ompt_parallel_data, team_size,
1148           ompt_parallel_invoker_program | ompt_parallel_team, codeptr);
1149     }
1150   }
1151 #endif // OMPT_SUPPORT
1152 
1153   if (this_thr->th.th_team != serial_team) {
1154     // Nested level will be an index in the nested nthreads array
1155     int level = this_thr->th.th_team->t.t_level;
1156 
1157     if (serial_team->t.t_serialized) {
1158       /* this serial team was already used
1159          TODO increase performance by making this locks more specific */
1160       kmp_team_t *new_team;
1161 
1162       __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
1163 
1164       new_team =
1165           __kmp_allocate_team(this_thr->th.th_root, 1, 1,
1166 #if OMPT_SUPPORT
1167                               ompt_parallel_data,
1168 #endif
1169                               proc_bind, &this_thr->th.th_current_task->td_icvs,
1170                               0 USE_NESTED_HOT_ARG(NULL));
1171       __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
1172       KMP_ASSERT(new_team);
1173 
1174       /* setup new serialized team and install it */
1175       new_team->t.t_threads[0] = this_thr;
1176       new_team->t.t_parent = this_thr->th.th_team;
1177       serial_team = new_team;
1178       this_thr->th.th_serial_team = serial_team;
1179 
1180       KF_TRACE(
1181           10,
1182           ("__kmpc_serialized_parallel: T#%d allocated new serial team %p\n",
1183            global_tid, serial_team));
1184 
1185       /* TODO the above breaks the requirement that if we run out of resources,
1186          then we can still guarantee that serialized teams are ok, since we may
1187          need to allocate a new one */
1188     } else {
1189       KF_TRACE(
1190           10,
1191           ("__kmpc_serialized_parallel: T#%d reusing cached serial team %p\n",
1192            global_tid, serial_team));
1193     }
1194 
1195     /* we have to initialize this serial team */
1196     KMP_DEBUG_ASSERT(serial_team->t.t_threads);
1197     KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
1198     KMP_DEBUG_ASSERT(this_thr->th.th_team != serial_team);
1199     serial_team->t.t_ident = loc;
1200     serial_team->t.t_serialized = 1;
1201     serial_team->t.t_nproc = 1;
1202     serial_team->t.t_parent = this_thr->th.th_team;
1203     serial_team->t.t_sched.sched = this_thr->th.th_team->t.t_sched.sched;
1204     this_thr->th.th_team = serial_team;
1205     serial_team->t.t_master_tid = this_thr->th.th_info.ds.ds_tid;
1206 
1207     KF_TRACE(10, ("__kmpc_serialized_parallel: T#d curtask=%p\n", global_tid,
1208                   this_thr->th.th_current_task));
1209     KMP_ASSERT(this_thr->th.th_current_task->td_flags.executing == 1);
1210     this_thr->th.th_current_task->td_flags.executing = 0;
1211 
1212     __kmp_push_current_task_to_thread(this_thr, serial_team, 0);
1213 
1214     /* TODO: GEH: do ICVs work for nested serialized teams? Don't we need an
1215        implicit task for each serialized task represented by
1216        team->t.t_serialized? */
1217     copy_icvs(&this_thr->th.th_current_task->td_icvs,
1218               &this_thr->th.th_current_task->td_parent->td_icvs);
1219 
1220     // Thread value exists in the nested nthreads array for the next nested
1221     // level
1222     if (__kmp_nested_nth.used && (level + 1 < __kmp_nested_nth.used)) {
1223       this_thr->th.th_current_task->td_icvs.nproc =
1224           __kmp_nested_nth.nth[level + 1];
1225     }
1226 
1227     if (__kmp_nested_proc_bind.used &&
1228         (level + 1 < __kmp_nested_proc_bind.used)) {
1229       this_thr->th.th_current_task->td_icvs.proc_bind =
1230           __kmp_nested_proc_bind.bind_types[level + 1];
1231     }
1232 
1233 #if USE_DEBUGGER
1234     serial_team->t.t_pkfn = (microtask_t)(~0); // For the debugger.
1235 #endif
1236     this_thr->th.th_info.ds.ds_tid = 0;
1237 
1238     /* set thread cache values */
1239     this_thr->th.th_team_nproc = 1;
1240     this_thr->th.th_team_master = this_thr;
1241     this_thr->th.th_team_serialized = 1;
1242 
1243     serial_team->t.t_level = serial_team->t.t_parent->t.t_level + 1;
1244     serial_team->t.t_active_level = serial_team->t.t_parent->t.t_active_level;
1245     serial_team->t.t_def_allocator = this_thr->th.th_def_allocator; // save
1246 
1247     propagateFPControl(serial_team);
1248 
1249     /* check if we need to allocate dispatch buffers stack */
1250     KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
1251     if (!serial_team->t.t_dispatch->th_disp_buffer) {
1252       serial_team->t.t_dispatch->th_disp_buffer =
1253           (dispatch_private_info_t *)__kmp_allocate(
1254               sizeof(dispatch_private_info_t));
1255     }
1256     this_thr->th.th_dispatch = serial_team->t.t_dispatch;
1257 
1258     KMP_MB();
1259 
1260   } else {
1261     /* this serialized team is already being used,
1262      * that's fine, just add another nested level */
1263     KMP_DEBUG_ASSERT(this_thr->th.th_team == serial_team);
1264     KMP_DEBUG_ASSERT(serial_team->t.t_threads);
1265     KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
1266     ++serial_team->t.t_serialized;
1267     this_thr->th.th_team_serialized = serial_team->t.t_serialized;
1268 
1269     // Nested level will be an index in the nested nthreads array
1270     int level = this_thr->th.th_team->t.t_level;
1271     // Thread value exists in the nested nthreads array for the next nested
1272     // level
1273     if (__kmp_nested_nth.used && (level + 1 < __kmp_nested_nth.used)) {
1274       this_thr->th.th_current_task->td_icvs.nproc =
1275           __kmp_nested_nth.nth[level + 1];
1276     }
1277     serial_team->t.t_level++;
1278     KF_TRACE(10, ("__kmpc_serialized_parallel: T#%d increasing nesting level "
1279                   "of serial team %p to %d\n",
1280                   global_tid, serial_team, serial_team->t.t_level));
1281 
1282     /* allocate/push dispatch buffers stack */
1283     KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
1284     {
1285       dispatch_private_info_t *disp_buffer =
1286           (dispatch_private_info_t *)__kmp_allocate(
1287               sizeof(dispatch_private_info_t));
1288       disp_buffer->next = serial_team->t.t_dispatch->th_disp_buffer;
1289       serial_team->t.t_dispatch->th_disp_buffer = disp_buffer;
1290     }
1291     this_thr->th.th_dispatch = serial_team->t.t_dispatch;
1292 
1293     KMP_MB();
1294   }
1295   KMP_CHECK_UPDATE(serial_team->t.t_cancel_request, cancel_noreq);
1296 
1297   // Perform the display affinity functionality for
1298   // serialized parallel regions
1299   if (__kmp_display_affinity) {
1300     if (this_thr->th.th_prev_level != serial_team->t.t_level ||
1301         this_thr->th.th_prev_num_threads != 1) {
1302       // NULL means use the affinity-format-var ICV
1303       __kmp_aux_display_affinity(global_tid, NULL);
1304       this_thr->th.th_prev_level = serial_team->t.t_level;
1305       this_thr->th.th_prev_num_threads = 1;
1306     }
1307   }
1308 
1309   if (__kmp_env_consistency_check)
1310     __kmp_push_parallel(global_tid, NULL);
1311 #if OMPT_SUPPORT
1312   serial_team->t.ompt_team_info.master_return_address = codeptr;
1313   if (ompt_enabled.enabled &&
1314       this_thr->th.ompt_thread_info.state != ompt_state_overhead) {
1315     OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame.ptr =
1316         OMPT_GET_FRAME_ADDRESS(0);
1317 
1318     ompt_lw_taskteam_t lw_taskteam;
1319     __ompt_lw_taskteam_init(&lw_taskteam, this_thr, global_tid,
1320                             &ompt_parallel_data, codeptr);
1321 
1322     __ompt_lw_taskteam_link(&lw_taskteam, this_thr, 1);
1323     // don't use lw_taskteam after linking. content was swaped
1324 
1325     /* OMPT implicit task begin */
1326     if (ompt_enabled.ompt_callback_implicit_task) {
1327       ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1328           ompt_scope_begin, OMPT_CUR_TEAM_DATA(this_thr),
1329           OMPT_CUR_TASK_DATA(this_thr), 1, __kmp_tid_from_gtid(global_tid),
1330           ompt_task_implicit); // TODO: Can this be ompt_task_initial?
1331       OMPT_CUR_TASK_INFO(this_thr)->thread_num =
1332           __kmp_tid_from_gtid(global_tid);
1333     }
1334 
1335     /* OMPT state */
1336     this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
1337     OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame.ptr =
1338         OMPT_GET_FRAME_ADDRESS(0);
1339   }
1340 #endif
1341 }
1342 
1343 /* most of the work for a fork */
1344 /* return true if we really went parallel, false if serialized */
1345 int __kmp_fork_call(ident_t *loc, int gtid,
1346                     enum fork_context_e call_context, // Intel, GNU, ...
1347                     kmp_int32 argc, microtask_t microtask, launch_t invoker,
1348                     kmp_va_list ap) {
1349   void **argv;
1350   int i;
1351   int master_tid;
1352   int master_this_cons;
1353   kmp_team_t *team;
1354   kmp_team_t *parent_team;
1355   kmp_info_t *master_th;
1356   kmp_root_t *root;
1357   int nthreads;
1358   int master_active;
1359   int master_set_numthreads;
1360   int level;
1361   int active_level;
1362   int teams_level;
1363 #if KMP_NESTED_HOT_TEAMS
1364   kmp_hot_team_ptr_t **p_hot_teams;
1365 #endif
1366   { // KMP_TIME_BLOCK
1367     KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_fork_call);
1368     KMP_COUNT_VALUE(OMP_PARALLEL_args, argc);
1369 
1370     KA_TRACE(20, ("__kmp_fork_call: enter T#%d\n", gtid));
1371     if (__kmp_stkpadding > 0 && __kmp_root[gtid] != NULL) {
1372       /* Some systems prefer the stack for the root thread(s) to start with */
1373       /* some gap from the parent stack to prevent false sharing. */
1374       void *dummy = KMP_ALLOCA(__kmp_stkpadding);
1375       /* These 2 lines below are so this does not get optimized out */
1376       if (__kmp_stkpadding > KMP_MAX_STKPADDING)
1377         __kmp_stkpadding += (short)((kmp_int64)dummy);
1378     }
1379 
1380     /* initialize if needed */
1381     KMP_DEBUG_ASSERT(
1382         __kmp_init_serial); // AC: potentially unsafe, not in sync with shutdown
1383     if (!TCR_4(__kmp_init_parallel))
1384       __kmp_parallel_initialize();
1385     __kmp_resume_if_soft_paused();
1386 
1387     /* setup current data */
1388     master_th = __kmp_threads[gtid]; // AC: potentially unsafe, not in sync with
1389     // shutdown
1390     parent_team = master_th->th.th_team;
1391     master_tid = master_th->th.th_info.ds.ds_tid;
1392     master_this_cons = master_th->th.th_local.this_construct;
1393     root = master_th->th.th_root;
1394     master_active = root->r.r_active;
1395     master_set_numthreads = master_th->th.th_set_nproc;
1396 
1397 #if OMPT_SUPPORT
1398     ompt_data_t ompt_parallel_data = ompt_data_none;
1399     ompt_data_t *parent_task_data;
1400     ompt_frame_t *ompt_frame;
1401     ompt_data_t *implicit_task_data;
1402     void *return_address = NULL;
1403 
1404     if (ompt_enabled.enabled) {
1405       __ompt_get_task_info_internal(0, NULL, &parent_task_data, &ompt_frame,
1406                                     NULL, NULL);
1407       return_address = OMPT_LOAD_RETURN_ADDRESS(gtid);
1408     }
1409 #endif
1410 
1411     // Assign affinity to root thread if it hasn't happened yet
1412     __kmp_assign_root_init_mask();
1413 
1414     // Nested level will be an index in the nested nthreads array
1415     level = parent_team->t.t_level;
1416     // used to launch non-serial teams even if nested is not allowed
1417     active_level = parent_team->t.t_active_level;
1418     // needed to check nesting inside the teams
1419     teams_level = master_th->th.th_teams_level;
1420 #if KMP_NESTED_HOT_TEAMS
1421     p_hot_teams = &master_th->th.th_hot_teams;
1422     if (*p_hot_teams == NULL && __kmp_hot_teams_max_level > 0) {
1423       *p_hot_teams = (kmp_hot_team_ptr_t *)__kmp_allocate(
1424           sizeof(kmp_hot_team_ptr_t) * __kmp_hot_teams_max_level);
1425       (*p_hot_teams)[0].hot_team = root->r.r_hot_team;
1426       // it is either actual or not needed (when active_level > 0)
1427       (*p_hot_teams)[0].hot_team_nth = 1;
1428     }
1429 #endif
1430 
1431 #if OMPT_SUPPORT
1432     if (ompt_enabled.enabled) {
1433       if (ompt_enabled.ompt_callback_parallel_begin) {
1434         int team_size = master_set_numthreads
1435                             ? master_set_numthreads
1436                             : get__nproc_2(parent_team, master_tid);
1437         int flags = OMPT_INVOKER(call_context) |
1438                     ((microtask == (microtask_t)__kmp_teams_master)
1439                          ? ompt_parallel_league
1440                          : ompt_parallel_team);
1441         ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)(
1442             parent_task_data, ompt_frame, &ompt_parallel_data, team_size, flags,
1443             return_address);
1444       }
1445       master_th->th.ompt_thread_info.state = ompt_state_overhead;
1446     }
1447 #endif
1448 
1449     master_th->th.th_ident = loc;
1450 
1451     if (master_th->th.th_teams_microtask && ap &&
1452         microtask != (microtask_t)__kmp_teams_master && level == teams_level) {
1453       // AC: This is start of parallel that is nested inside teams construct.
1454       // The team is actual (hot), all workers are ready at the fork barrier.
1455       // No lock needed to initialize the team a bit, then free workers.
1456       parent_team->t.t_ident = loc;
1457       __kmp_alloc_argv_entries(argc, parent_team, TRUE);
1458       parent_team->t.t_argc = argc;
1459       argv = (void **)parent_team->t.t_argv;
1460       for (i = argc - 1; i >= 0; --i)
1461         *argv++ = va_arg(kmp_va_deref(ap), void *);
1462       // Increment our nested depth levels, but not increase the serialization
1463       if (parent_team == master_th->th.th_serial_team) {
1464         // AC: we are in serialized parallel
1465         __kmpc_serialized_parallel(loc, gtid);
1466         KMP_DEBUG_ASSERT(parent_team->t.t_serialized > 1);
1467 
1468         if (call_context == fork_context_gnu) {
1469           // AC: need to decrement t_serialized for enquiry functions to work
1470           // correctly, will restore at join time
1471           parent_team->t.t_serialized--;
1472           return TRUE;
1473         }
1474 
1475 #if OMPD_SUPPORT
1476         parent_team->t.t_pkfn = microtask;
1477 #endif
1478 
1479 #if OMPT_SUPPORT
1480         void *dummy;
1481         void **exit_frame_p;
1482 
1483         ompt_lw_taskteam_t lw_taskteam;
1484 
1485         if (ompt_enabled.enabled) {
1486           __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1487                                   &ompt_parallel_data, return_address);
1488           exit_frame_p = &(lw_taskteam.ompt_task_info.frame.exit_frame.ptr);
1489 
1490           __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
1491           // don't use lw_taskteam after linking. content was swaped
1492 
1493           /* OMPT implicit task begin */
1494           implicit_task_data = OMPT_CUR_TASK_DATA(master_th);
1495           if (ompt_enabled.ompt_callback_implicit_task) {
1496             OMPT_CUR_TASK_INFO(master_th)->thread_num =
1497                 __kmp_tid_from_gtid(gtid);
1498             ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1499                 ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1500                 implicit_task_data, 1,
1501                 OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
1502           }
1503 
1504           /* OMPT state */
1505           master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1506         } else {
1507           exit_frame_p = &dummy;
1508         }
1509 #endif
1510         // AC: need to decrement t_serialized for enquiry functions to work
1511         // correctly, will restore at join time
1512         parent_team->t.t_serialized--;
1513 
1514         {
1515           KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1516           KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1517           __kmp_invoke_microtask(microtask, gtid, 0, argc, parent_team->t.t_argv
1518 #if OMPT_SUPPORT
1519                                  ,
1520                                  exit_frame_p
1521 #endif
1522           );
1523         }
1524 
1525 #if OMPT_SUPPORT
1526         if (ompt_enabled.enabled) {
1527           *exit_frame_p = NULL;
1528           OMPT_CUR_TASK_INFO(master_th)->frame.exit_frame = ompt_data_none;
1529           if (ompt_enabled.ompt_callback_implicit_task) {
1530             ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1531                 ompt_scope_end, NULL, implicit_task_data, 1,
1532                 OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
1533           }
1534           ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
1535           __ompt_lw_taskteam_unlink(master_th);
1536           if (ompt_enabled.ompt_callback_parallel_end) {
1537             ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1538                 &ompt_parallel_data, OMPT_CUR_TASK_DATA(master_th),
1539                 OMPT_INVOKER(call_context) | ompt_parallel_team,
1540                 return_address);
1541           }
1542           master_th->th.ompt_thread_info.state = ompt_state_overhead;
1543         }
1544 #endif
1545         return TRUE;
1546       }
1547 
1548       parent_team->t.t_pkfn = microtask;
1549       parent_team->t.t_invoke = invoker;
1550       KMP_ATOMIC_INC(&root->r.r_in_parallel);
1551       parent_team->t.t_active_level++;
1552       parent_team->t.t_level++;
1553       parent_team->t.t_def_allocator = master_th->th.th_def_allocator; // save
1554 
1555 #if OMPT_SUPPORT
1556       if (ompt_enabled.enabled) {
1557         ompt_lw_taskteam_t lw_taskteam;
1558         __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1559                                 &ompt_parallel_data, return_address);
1560         __ompt_lw_taskteam_link(&lw_taskteam, master_th, 1, true);
1561       }
1562 #endif
1563 
1564       /* Change number of threads in the team if requested */
1565       if (master_set_numthreads) { // The parallel has num_threads clause
1566         if (master_set_numthreads < master_th->th.th_teams_size.nth) {
1567           // AC: only can reduce number of threads dynamically, can't increase
1568           kmp_info_t **other_threads = parent_team->t.t_threads;
1569           parent_team->t.t_nproc = master_set_numthreads;
1570           for (i = 0; i < master_set_numthreads; ++i) {
1571             other_threads[i]->th.th_team_nproc = master_set_numthreads;
1572           }
1573           // Keep extra threads hot in the team for possible next parallels
1574         }
1575         master_th->th.th_set_nproc = 0;
1576       }
1577 
1578 #if USE_DEBUGGER
1579       if (__kmp_debugging) { // Let debugger override number of threads.
1580         int nth = __kmp_omp_num_threads(loc);
1581         if (nth > 0) { // 0 means debugger doesn't want to change num threads
1582           master_set_numthreads = nth;
1583         }
1584       }
1585 #endif
1586 
1587 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1588       if (((__itt_frame_submit_v3_ptr && __itt_get_timestamp_ptr) ||
1589            KMP_ITT_DEBUG) &&
1590           __kmp_forkjoin_frames_mode == 3 &&
1591           parent_team->t.t_active_level == 1 // only report frames at level 1
1592           && master_th->th.th_teams_size.nteams == 1) {
1593         kmp_uint64 tmp_time = __itt_get_timestamp();
1594         master_th->th.th_frame_time = tmp_time;
1595         parent_team->t.t_region_time = tmp_time;
1596       }
1597       if (__itt_stack_caller_create_ptr) {
1598         KMP_DEBUG_ASSERT(parent_team->t.t_stack_id == NULL);
1599         // create new stack stitching id before entering fork barrier
1600         parent_team->t.t_stack_id = __kmp_itt_stack_caller_create();
1601       }
1602 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
1603 
1604       KF_TRACE(10, ("__kmp_fork_call: before internal fork: root=%p, team=%p, "
1605                     "master_th=%p, gtid=%d\n",
1606                     root, parent_team, master_th, gtid));
1607       __kmp_internal_fork(loc, gtid, parent_team);
1608       KF_TRACE(10, ("__kmp_fork_call: after internal fork: root=%p, team=%p, "
1609                     "master_th=%p, gtid=%d\n",
1610                     root, parent_team, master_th, gtid));
1611 
1612       if (call_context == fork_context_gnu)
1613         return TRUE;
1614 
1615       /* Invoke microtask for PRIMARY thread */
1616       KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n", gtid,
1617                     parent_team->t.t_id, parent_team->t.t_pkfn));
1618 
1619       if (!parent_team->t.t_invoke(gtid)) {
1620         KMP_ASSERT2(0, "cannot invoke microtask for PRIMARY thread");
1621       }
1622       KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n", gtid,
1623                     parent_team->t.t_id, parent_team->t.t_pkfn));
1624       KMP_MB(); /* Flush all pending memory write invalidates.  */
1625 
1626       KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
1627 
1628       return TRUE;
1629     } // Parallel closely nested in teams construct
1630 
1631 #if KMP_DEBUG
1632     if (__kmp_tasking_mode != tskm_immediate_exec) {
1633       KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
1634                        parent_team->t.t_task_team[master_th->th.th_task_state]);
1635     }
1636 #endif
1637 
1638     int enter_teams = 0;
1639     if (parent_team->t.t_active_level >=
1640         master_th->th.th_current_task->td_icvs.max_active_levels) {
1641       nthreads = 1;
1642     } else {
1643       enter_teams = ((ap == NULL && active_level == 0) ||
1644                      (ap && teams_level > 0 && teams_level == level));
1645       nthreads =
1646           master_set_numthreads
1647               ? master_set_numthreads
1648               : get__nproc_2(
1649                     parent_team,
1650                     master_tid); // TODO: get nproc directly from current task
1651 
1652       // Check if we need to take forkjoin lock? (no need for serialized
1653       // parallel out of teams construct). This code moved here from
1654       // __kmp_reserve_threads() to speedup nested serialized parallels.
1655       if (nthreads > 1) {
1656         if ((get__max_active_levels(master_th) == 1 &&
1657              (root->r.r_in_parallel && !enter_teams)) ||
1658             (__kmp_library == library_serial)) {
1659           KC_TRACE(10, ("__kmp_fork_call: T#%d serializing team; requested %d"
1660                         " threads\n",
1661                         gtid, nthreads));
1662           nthreads = 1;
1663         }
1664       }
1665       if (nthreads > 1) {
1666         /* determine how many new threads we can use */
1667         __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
1668         /* AC: If we execute teams from parallel region (on host), then teams
1669            should be created but each can only have 1 thread if nesting is
1670            disabled. If teams called from serial region, then teams and their
1671            threads should be created regardless of the nesting setting. */
1672         nthreads = __kmp_reserve_threads(root, parent_team, master_tid,
1673                                          nthreads, enter_teams);
1674         if (nthreads == 1) {
1675           // Free lock for single thread execution here; for multi-thread
1676           // execution it will be freed later after team of threads created
1677           // and initialized
1678           __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
1679         }
1680       }
1681     }
1682     KMP_DEBUG_ASSERT(nthreads > 0);
1683 
1684     // If we temporarily changed the set number of threads then restore it now
1685     master_th->th.th_set_nproc = 0;
1686 
1687     /* create a serialized parallel region? */
1688     if (nthreads == 1) {
1689 /* josh todo: hypothetical question: what do we do for OS X*? */
1690 #if KMP_OS_LINUX &&                                                            \
1691     (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
1692       void *args[argc];
1693 #else
1694       void **args = (void **)KMP_ALLOCA(argc * sizeof(void *));
1695 #endif /* KMP_OS_LINUX && ( KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || \
1696           KMP_ARCH_AARCH64) */
1697 
1698       KA_TRACE(20,
1699                ("__kmp_fork_call: T#%d serializing parallel region\n", gtid));
1700 
1701       __kmpc_serialized_parallel(loc, gtid);
1702 
1703 #if OMPD_SUPPORT
1704       master_th->th.th_serial_team->t.t_pkfn = microtask;
1705 #endif
1706 
1707       if (call_context == fork_context_intel) {
1708         /* TODO this sucks, use the compiler itself to pass args! :) */
1709         master_th->th.th_serial_team->t.t_ident = loc;
1710         if (!ap) {
1711           // revert change made in __kmpc_serialized_parallel()
1712           master_th->th.th_serial_team->t.t_level--;
1713           // Get args from parent team for teams construct
1714 
1715 #if OMPT_SUPPORT
1716           void *dummy;
1717           void **exit_frame_p;
1718           ompt_task_info_t *task_info;
1719 
1720           ompt_lw_taskteam_t lw_taskteam;
1721 
1722           if (ompt_enabled.enabled) {
1723             __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1724                                     &ompt_parallel_data, return_address);
1725 
1726             __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
1727             // don't use lw_taskteam after linking. content was swaped
1728 
1729             task_info = OMPT_CUR_TASK_INFO(master_th);
1730             exit_frame_p = &(task_info->frame.exit_frame.ptr);
1731             if (ompt_enabled.ompt_callback_implicit_task) {
1732               OMPT_CUR_TASK_INFO(master_th)->thread_num =
1733                   __kmp_tid_from_gtid(gtid);
1734               ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1735                   ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1736                   &(task_info->task_data), 1,
1737                   OMPT_CUR_TASK_INFO(master_th)->thread_num,
1738                   ompt_task_implicit);
1739             }
1740 
1741             /* OMPT state */
1742             master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1743           } else {
1744             exit_frame_p = &dummy;
1745           }
1746 #endif
1747 
1748           {
1749             KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1750             KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1751             __kmp_invoke_microtask(microtask, gtid, 0, argc,
1752                                    parent_team->t.t_argv
1753 #if OMPT_SUPPORT
1754                                    ,
1755                                    exit_frame_p
1756 #endif
1757             );
1758           }
1759 
1760 #if OMPT_SUPPORT
1761           if (ompt_enabled.enabled) {
1762             *exit_frame_p = NULL;
1763             if (ompt_enabled.ompt_callback_implicit_task) {
1764               ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1765                   ompt_scope_end, NULL, &(task_info->task_data), 1,
1766                   OMPT_CUR_TASK_INFO(master_th)->thread_num,
1767                   ompt_task_implicit);
1768             }
1769             ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
1770             __ompt_lw_taskteam_unlink(master_th);
1771             if (ompt_enabled.ompt_callback_parallel_end) {
1772               ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1773                   &ompt_parallel_data, parent_task_data,
1774                   OMPT_INVOKER(call_context) | ompt_parallel_team,
1775                   return_address);
1776             }
1777             master_th->th.ompt_thread_info.state = ompt_state_overhead;
1778           }
1779 #endif
1780         } else if (microtask == (microtask_t)__kmp_teams_master) {
1781           KMP_DEBUG_ASSERT(master_th->th.th_team ==
1782                            master_th->th.th_serial_team);
1783           team = master_th->th.th_team;
1784           // team->t.t_pkfn = microtask;
1785           team->t.t_invoke = invoker;
1786           __kmp_alloc_argv_entries(argc, team, TRUE);
1787           team->t.t_argc = argc;
1788           argv = (void **)team->t.t_argv;
1789           if (ap) {
1790             for (i = argc - 1; i >= 0; --i)
1791               *argv++ = va_arg(kmp_va_deref(ap), void *);
1792           } else {
1793             for (i = 0; i < argc; ++i)
1794               // Get args from parent team for teams construct
1795               argv[i] = parent_team->t.t_argv[i];
1796           }
1797           // AC: revert change made in __kmpc_serialized_parallel()
1798           //     because initial code in teams should have level=0
1799           team->t.t_level--;
1800           // AC: call special invoker for outer "parallel" of teams construct
1801           invoker(gtid);
1802 #if OMPT_SUPPORT
1803           if (ompt_enabled.enabled) {
1804             ompt_task_info_t *task_info = OMPT_CUR_TASK_INFO(master_th);
1805             if (ompt_enabled.ompt_callback_implicit_task) {
1806               ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1807                   ompt_scope_end, NULL, &(task_info->task_data), 0,
1808                   OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_initial);
1809             }
1810             if (ompt_enabled.ompt_callback_parallel_end) {
1811               ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1812                   &ompt_parallel_data, parent_task_data,
1813                   OMPT_INVOKER(call_context) | ompt_parallel_league,
1814                   return_address);
1815             }
1816             master_th->th.ompt_thread_info.state = ompt_state_overhead;
1817           }
1818 #endif
1819         } else {
1820           argv = args;
1821           for (i = argc - 1; i >= 0; --i)
1822             *argv++ = va_arg(kmp_va_deref(ap), void *);
1823           KMP_MB();
1824 
1825 #if OMPT_SUPPORT
1826           void *dummy;
1827           void **exit_frame_p;
1828           ompt_task_info_t *task_info;
1829 
1830           ompt_lw_taskteam_t lw_taskteam;
1831 
1832           if (ompt_enabled.enabled) {
1833             __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1834                                     &ompt_parallel_data, return_address);
1835             __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
1836             // don't use lw_taskteam after linking. content was swaped
1837             task_info = OMPT_CUR_TASK_INFO(master_th);
1838             exit_frame_p = &(task_info->frame.exit_frame.ptr);
1839 
1840             /* OMPT implicit task begin */
1841             implicit_task_data = OMPT_CUR_TASK_DATA(master_th);
1842             if (ompt_enabled.ompt_callback_implicit_task) {
1843               ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1844                   ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1845                   implicit_task_data, 1, __kmp_tid_from_gtid(gtid),
1846                   ompt_task_implicit);
1847               OMPT_CUR_TASK_INFO(master_th)->thread_num =
1848                   __kmp_tid_from_gtid(gtid);
1849             }
1850 
1851             /* OMPT state */
1852             master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1853           } else {
1854             exit_frame_p = &dummy;
1855           }
1856 #endif
1857 
1858           {
1859             KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1860             KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1861             __kmp_invoke_microtask(microtask, gtid, 0, argc, args
1862 #if OMPT_SUPPORT
1863                                    ,
1864                                    exit_frame_p
1865 #endif
1866             );
1867           }
1868 
1869 #if OMPT_SUPPORT
1870           if (ompt_enabled.enabled) {
1871             *exit_frame_p = NULL;
1872             if (ompt_enabled.ompt_callback_implicit_task) {
1873               ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1874                   ompt_scope_end, NULL, &(task_info->task_data), 1,
1875                   OMPT_CUR_TASK_INFO(master_th)->thread_num,
1876                   ompt_task_implicit);
1877             }
1878 
1879             ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
1880             __ompt_lw_taskteam_unlink(master_th);
1881             if (ompt_enabled.ompt_callback_parallel_end) {
1882               ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1883                   &ompt_parallel_data, parent_task_data,
1884                   OMPT_INVOKER(call_context) | ompt_parallel_team,
1885                   return_address);
1886             }
1887             master_th->th.ompt_thread_info.state = ompt_state_overhead;
1888           }
1889 #endif
1890         }
1891       } else if (call_context == fork_context_gnu) {
1892 #if OMPT_SUPPORT
1893         ompt_lw_taskteam_t lwt;
1894         __ompt_lw_taskteam_init(&lwt, master_th, gtid, &ompt_parallel_data,
1895                                 return_address);
1896 
1897         lwt.ompt_task_info.frame.exit_frame = ompt_data_none;
1898         __ompt_lw_taskteam_link(&lwt, master_th, 1);
1899 // don't use lw_taskteam after linking. content was swaped
1900 #endif
1901 
1902         // we were called from GNU native code
1903         KA_TRACE(20, ("__kmp_fork_call: T#%d serial exit\n", gtid));
1904         return FALSE;
1905       } else {
1906         KMP_ASSERT2(call_context < fork_context_last,
1907                     "__kmp_fork_call: unknown fork_context parameter");
1908       }
1909 
1910       KA_TRACE(20, ("__kmp_fork_call: T#%d serial exit\n", gtid));
1911       KMP_MB();
1912       return FALSE;
1913     } // if (nthreads == 1)
1914 
1915     // GEH: only modify the executing flag in the case when not serialized
1916     //      serialized case is handled in kmpc_serialized_parallel
1917     KF_TRACE(10, ("__kmp_fork_call: parent_team_aclevel=%d, master_th=%p, "
1918                   "curtask=%p, curtask_max_aclevel=%d\n",
1919                   parent_team->t.t_active_level, master_th,
1920                   master_th->th.th_current_task,
1921                   master_th->th.th_current_task->td_icvs.max_active_levels));
1922     // TODO: GEH - cannot do this assertion because root thread not set up as
1923     // executing
1924     // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 1 );
1925     master_th->th.th_current_task->td_flags.executing = 0;
1926 
1927     if (!master_th->th.th_teams_microtask || level > teams_level) {
1928       /* Increment our nested depth level */
1929       KMP_ATOMIC_INC(&root->r.r_in_parallel);
1930     }
1931 
1932     // See if we need to make a copy of the ICVs.
1933     int nthreads_icv = master_th->th.th_current_task->td_icvs.nproc;
1934     if ((level + 1 < __kmp_nested_nth.used) &&
1935         (__kmp_nested_nth.nth[level + 1] != nthreads_icv)) {
1936       nthreads_icv = __kmp_nested_nth.nth[level + 1];
1937     } else {
1938       nthreads_icv = 0; // don't update
1939     }
1940 
1941     // Figure out the proc_bind_policy for the new team.
1942     kmp_proc_bind_t proc_bind = master_th->th.th_set_proc_bind;
1943     kmp_proc_bind_t proc_bind_icv =
1944         proc_bind_default; // proc_bind_default means don't update
1945     if (master_th->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
1946       proc_bind = proc_bind_false;
1947     } else {
1948       if (proc_bind == proc_bind_default) {
1949         // No proc_bind clause specified; use current proc-bind-var for this
1950         // parallel region
1951         proc_bind = master_th->th.th_current_task->td_icvs.proc_bind;
1952       }
1953       /* else: The proc_bind policy was specified explicitly on parallel clause.
1954          This overrides proc-bind-var for this parallel region, but does not
1955          change proc-bind-var. */
1956       // Figure the value of proc-bind-var for the child threads.
1957       if ((level + 1 < __kmp_nested_proc_bind.used) &&
1958           (__kmp_nested_proc_bind.bind_types[level + 1] !=
1959            master_th->th.th_current_task->td_icvs.proc_bind)) {
1960         proc_bind_icv = __kmp_nested_proc_bind.bind_types[level + 1];
1961       }
1962     }
1963 
1964     // Reset for next parallel region
1965     master_th->th.th_set_proc_bind = proc_bind_default;
1966 
1967     if ((nthreads_icv > 0) || (proc_bind_icv != proc_bind_default)) {
1968       kmp_internal_control_t new_icvs;
1969       copy_icvs(&new_icvs, &master_th->th.th_current_task->td_icvs);
1970       new_icvs.next = NULL;
1971       if (nthreads_icv > 0) {
1972         new_icvs.nproc = nthreads_icv;
1973       }
1974       if (proc_bind_icv != proc_bind_default) {
1975         new_icvs.proc_bind = proc_bind_icv;
1976       }
1977 
1978       /* allocate a new parallel team */
1979       KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n"));
1980       team = __kmp_allocate_team(root, nthreads, nthreads,
1981 #if OMPT_SUPPORT
1982                                  ompt_parallel_data,
1983 #endif
1984                                  proc_bind, &new_icvs,
1985                                  argc USE_NESTED_HOT_ARG(master_th));
1986     } else {
1987       /* allocate a new parallel team */
1988       KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n"));
1989       team = __kmp_allocate_team(root, nthreads, nthreads,
1990 #if OMPT_SUPPORT
1991                                  ompt_parallel_data,
1992 #endif
1993                                  proc_bind,
1994                                  &master_th->th.th_current_task->td_icvs,
1995                                  argc USE_NESTED_HOT_ARG(master_th));
1996     }
1997     KF_TRACE(
1998         10, ("__kmp_fork_call: after __kmp_allocate_team - team = %p\n", team));
1999 
2000     /* setup the new team */
2001     KMP_CHECK_UPDATE(team->t.t_master_tid, master_tid);
2002     KMP_CHECK_UPDATE(team->t.t_master_this_cons, master_this_cons);
2003     KMP_CHECK_UPDATE(team->t.t_ident, loc);
2004     KMP_CHECK_UPDATE(team->t.t_parent, parent_team);
2005     KMP_CHECK_UPDATE_SYNC(team->t.t_pkfn, microtask);
2006 #if OMPT_SUPPORT
2007     KMP_CHECK_UPDATE_SYNC(team->t.ompt_team_info.master_return_address,
2008                           return_address);
2009 #endif
2010     KMP_CHECK_UPDATE(team->t.t_invoke, invoker); // TODO move to root, maybe
2011     // TODO: parent_team->t.t_level == INT_MAX ???
2012     if (!master_th->th.th_teams_microtask || level > teams_level) {
2013       int new_level = parent_team->t.t_level + 1;
2014       KMP_CHECK_UPDATE(team->t.t_level, new_level);
2015       new_level = parent_team->t.t_active_level + 1;
2016       KMP_CHECK_UPDATE(team->t.t_active_level, new_level);
2017     } else {
2018       // AC: Do not increase parallel level at start of the teams construct
2019       int new_level = parent_team->t.t_level;
2020       KMP_CHECK_UPDATE(team->t.t_level, new_level);
2021       new_level = parent_team->t.t_active_level;
2022       KMP_CHECK_UPDATE(team->t.t_active_level, new_level);
2023     }
2024     kmp_r_sched_t new_sched = get__sched_2(parent_team, master_tid);
2025     // set primary thread's schedule as new run-time schedule
2026     KMP_CHECK_UPDATE(team->t.t_sched.sched, new_sched.sched);
2027 
2028     KMP_CHECK_UPDATE(team->t.t_cancel_request, cancel_noreq);
2029     KMP_CHECK_UPDATE(team->t.t_def_allocator, master_th->th.th_def_allocator);
2030 
2031     // Update the floating point rounding in the team if required.
2032     propagateFPControl(team);
2033 #if OMPD_SUPPORT
2034     if (ompd_state & OMPD_ENABLE_BP)
2035       ompd_bp_parallel_begin();
2036 #endif
2037 
2038     if (__kmp_tasking_mode != tskm_immediate_exec) {
2039       // Set primary thread's task team to team's task team. Unless this is hot
2040       // team, it should be NULL.
2041       KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
2042                        parent_team->t.t_task_team[master_th->th.th_task_state]);
2043       KA_TRACE(20, ("__kmp_fork_call: Primary T#%d pushing task_team %p / team "
2044                     "%p, new task_team %p / team %p\n",
2045                     __kmp_gtid_from_thread(master_th),
2046                     master_th->th.th_task_team, parent_team,
2047                     team->t.t_task_team[master_th->th.th_task_state], team));
2048 
2049       if (active_level || master_th->th.th_task_team) {
2050         // Take a memo of primary thread's task_state
2051         KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack);
2052         if (master_th->th.th_task_state_top >=
2053             master_th->th.th_task_state_stack_sz) { // increase size
2054           kmp_uint32 new_size = 2 * master_th->th.th_task_state_stack_sz;
2055           kmp_uint8 *old_stack, *new_stack;
2056           kmp_uint32 i;
2057           new_stack = (kmp_uint8 *)__kmp_allocate(new_size);
2058           for (i = 0; i < master_th->th.th_task_state_stack_sz; ++i) {
2059             new_stack[i] = master_th->th.th_task_state_memo_stack[i];
2060           }
2061           for (i = master_th->th.th_task_state_stack_sz; i < new_size;
2062                ++i) { // zero-init rest of stack
2063             new_stack[i] = 0;
2064           }
2065           old_stack = master_th->th.th_task_state_memo_stack;
2066           master_th->th.th_task_state_memo_stack = new_stack;
2067           master_th->th.th_task_state_stack_sz = new_size;
2068           __kmp_free(old_stack);
2069         }
2070         // Store primary thread's task_state on stack
2071         master_th->th
2072             .th_task_state_memo_stack[master_th->th.th_task_state_top] =
2073             master_th->th.th_task_state;
2074         master_th->th.th_task_state_top++;
2075 #if KMP_NESTED_HOT_TEAMS
2076         if (master_th->th.th_hot_teams &&
2077             active_level < __kmp_hot_teams_max_level &&
2078             team == master_th->th.th_hot_teams[active_level].hot_team) {
2079           // Restore primary thread's nested state if nested hot team
2080           master_th->th.th_task_state =
2081               master_th->th
2082                   .th_task_state_memo_stack[master_th->th.th_task_state_top];
2083         } else {
2084 #endif
2085           master_th->th.th_task_state = 0;
2086 #if KMP_NESTED_HOT_TEAMS
2087         }
2088 #endif
2089       }
2090 #if !KMP_NESTED_HOT_TEAMS
2091       KMP_DEBUG_ASSERT((master_th->th.th_task_team == NULL) ||
2092                        (team == root->r.r_hot_team));
2093 #endif
2094     }
2095 
2096     KA_TRACE(
2097         20,
2098         ("__kmp_fork_call: T#%d(%d:%d)->(%d:0) created a team of %d threads\n",
2099          gtid, parent_team->t.t_id, team->t.t_master_tid, team->t.t_id,
2100          team->t.t_nproc));
2101     KMP_DEBUG_ASSERT(team != root->r.r_hot_team ||
2102                      (team->t.t_master_tid == 0 &&
2103                       (team->t.t_parent == root->r.r_root_team ||
2104                        team->t.t_parent->t.t_serialized)));
2105     KMP_MB();
2106 
2107     /* now, setup the arguments */
2108     argv = (void **)team->t.t_argv;
2109     if (ap) {
2110       for (i = argc - 1; i >= 0; --i) {
2111         void *new_argv = va_arg(kmp_va_deref(ap), void *);
2112         KMP_CHECK_UPDATE(*argv, new_argv);
2113         argv++;
2114       }
2115     } else {
2116       for (i = 0; i < argc; ++i) {
2117         // Get args from parent team for teams construct
2118         KMP_CHECK_UPDATE(argv[i], team->t.t_parent->t.t_argv[i]);
2119       }
2120     }
2121 
2122     /* now actually fork the threads */
2123     KMP_CHECK_UPDATE(team->t.t_master_active, master_active);
2124     if (!root->r.r_active) // Only do assignment if it prevents cache ping-pong
2125       root->r.r_active = TRUE;
2126 
2127     __kmp_fork_team_threads(root, team, master_th, gtid);
2128     __kmp_setup_icv_copy(team, nthreads,
2129                          &master_th->th.th_current_task->td_icvs, loc);
2130 
2131 #if OMPT_SUPPORT
2132     master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
2133 #endif
2134 
2135     __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2136 
2137 #if USE_ITT_BUILD
2138     if (team->t.t_active_level == 1 // only report frames at level 1
2139         && !master_th->th.th_teams_microtask) { // not in teams construct
2140 #if USE_ITT_NOTIFY
2141       if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
2142           (__kmp_forkjoin_frames_mode == 3 ||
2143            __kmp_forkjoin_frames_mode == 1)) {
2144         kmp_uint64 tmp_time = 0;
2145         if (__itt_get_timestamp_ptr)
2146           tmp_time = __itt_get_timestamp();
2147         // Internal fork - report frame begin
2148         master_th->th.th_frame_time = tmp_time;
2149         if (__kmp_forkjoin_frames_mode == 3)
2150           team->t.t_region_time = tmp_time;
2151       } else
2152 // only one notification scheme (either "submit" or "forking/joined", not both)
2153 #endif /* USE_ITT_NOTIFY */
2154           if ((__itt_frame_begin_v3_ptr || KMP_ITT_DEBUG) &&
2155               __kmp_forkjoin_frames && !__kmp_forkjoin_frames_mode) {
2156         // Mark start of "parallel" region for Intel(R) VTune(TM) analyzer.
2157         __kmp_itt_region_forking(gtid, team->t.t_nproc, 0);
2158       }
2159     }
2160 #endif /* USE_ITT_BUILD */
2161 
2162     /* now go on and do the work */
2163     KMP_DEBUG_ASSERT(team == __kmp_threads[gtid]->th.th_team);
2164     KMP_MB();
2165     KF_TRACE(10,
2166              ("__kmp_internal_fork : root=%p, team=%p, master_th=%p, gtid=%d\n",
2167               root, team, master_th, gtid));
2168 
2169 #if USE_ITT_BUILD
2170     if (__itt_stack_caller_create_ptr) {
2171       // create new stack stitching id before entering fork barrier
2172       if (!enter_teams) {
2173         KMP_DEBUG_ASSERT(team->t.t_stack_id == NULL);
2174         team->t.t_stack_id = __kmp_itt_stack_caller_create();
2175       } else if (parent_team->t.t_serialized) {
2176         // keep stack stitching id in the serialized parent_team;
2177         // current team will be used for parallel inside the teams;
2178         // if parent_team is active, then it already keeps stack stitching id
2179         // for the league of teams
2180         KMP_DEBUG_ASSERT(parent_team->t.t_stack_id == NULL);
2181         parent_team->t.t_stack_id = __kmp_itt_stack_caller_create();
2182       }
2183     }
2184 #endif /* USE_ITT_BUILD */
2185 
2186     // AC: skip __kmp_internal_fork at teams construct, let only primary
2187     // threads execute
2188     if (ap) {
2189       __kmp_internal_fork(loc, gtid, team);
2190       KF_TRACE(10, ("__kmp_internal_fork : after : root=%p, team=%p, "
2191                     "master_th=%p, gtid=%d\n",
2192                     root, team, master_th, gtid));
2193     }
2194 
2195     if (call_context == fork_context_gnu) {
2196       KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
2197       return TRUE;
2198     }
2199 
2200     /* Invoke microtask for PRIMARY thread */
2201     KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n", gtid,
2202                   team->t.t_id, team->t.t_pkfn));
2203   } // END of timer KMP_fork_call block
2204 
2205 #if KMP_STATS_ENABLED
2206   // If beginning a teams construct, then change thread state
2207   stats_state_e previous_state = KMP_GET_THREAD_STATE();
2208   if (!ap) {
2209     KMP_SET_THREAD_STATE(stats_state_e::TEAMS_REGION);
2210   }
2211 #endif
2212 
2213   if (!team->t.t_invoke(gtid)) {
2214     KMP_ASSERT2(0, "cannot invoke microtask for PRIMARY thread");
2215   }
2216 
2217 #if KMP_STATS_ENABLED
2218   // If was beginning of a teams construct, then reset thread state
2219   if (!ap) {
2220     KMP_SET_THREAD_STATE(previous_state);
2221   }
2222 #endif
2223 
2224   KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n", gtid,
2225                 team->t.t_id, team->t.t_pkfn));
2226   KMP_MB(); /* Flush all pending memory write invalidates.  */
2227 
2228   KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
2229 #if OMPT_SUPPORT
2230   if (ompt_enabled.enabled) {
2231     master_th->th.ompt_thread_info.state = ompt_state_overhead;
2232   }
2233 #endif
2234 
2235   return TRUE;
2236 }
2237 
2238 #if OMPT_SUPPORT
2239 static inline void __kmp_join_restore_state(kmp_info_t *thread,
2240                                             kmp_team_t *team) {
2241   // restore state outside the region
2242   thread->th.ompt_thread_info.state =
2243       ((team->t.t_serialized) ? ompt_state_work_serial
2244                               : ompt_state_work_parallel);
2245 }
2246 
2247 static inline void __kmp_join_ompt(int gtid, kmp_info_t *thread,
2248                                    kmp_team_t *team, ompt_data_t *parallel_data,
2249                                    int flags, void *codeptr) {
2250   ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2251   if (ompt_enabled.ompt_callback_parallel_end) {
2252     ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
2253         parallel_data, &(task_info->task_data), flags, codeptr);
2254   }
2255 
2256   task_info->frame.enter_frame = ompt_data_none;
2257   __kmp_join_restore_state(thread, team);
2258 }
2259 #endif
2260 
// Leave the team the calling thread (__kmp_threads[gtid]) is currently in and
// switch that thread back to the parent team.
//
// Parameters:
//   loc          - source location; stored into the thread's th_ident.
//   gtid         - global thread id used to look up the calling thread.
//   fork_context - (OMPT builds only) used to pick the invoker flag for the
//                  parallel-end callback and to detect the GNU interface
//                  with a serialized team.
//   exit_teams   - when non-zero the join barrier (__kmp_internal_join) is
//                  skipped and the thread's task state is reset to 0.
//
// The function has three exits:
//   1. Serialized team: delegates to __kmpc_end_serialized_parallel and
//      returns.
//   2. Parallel region directly inside a teams construct: the team is kept,
//      only nesting levels / thread counts are adjusted, then returns.
//   3. General case: under __kmp_forkjoin_lock the team is released via
//      __kmp_free_team and the thread's team-related fields are reset from
//      the parent team.
void __kmp_join_call(ident_t *loc, int gtid
#if OMPT_SUPPORT
                     ,
                     enum fork_context_e fork_context
#endif
                     ,
                     int exit_teams) {
  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_join_call);
  kmp_team_t *team;
  kmp_team_t *parent_team;
  kmp_info_t *master_th;
  kmp_root_t *root;
  int master_active;

  KA_TRACE(20, ("__kmp_join_call: enter T#%d\n", gtid));

  /* setup current data */
  master_th = __kmp_threads[gtid];
  root = master_th->th.th_root;
  team = master_th->th.th_team;
  parent_team = team->t.t_parent;

  master_th->th.th_ident = loc;

#if OMPT_SUPPORT
  // Remember the microtask now; it is compared against __kmp_teams_master
  // further down, after the team has been handed to __kmp_free_team.
  void *team_microtask = (void *)team->t.t_pkfn;
  // For GOMP interface with serialized parallel, need the
  // __kmpc_end_serialized_parallel to call hooks for OMPT end-implicit-task
  // and end-parallel events.
  if (ompt_enabled.enabled &&
      !(team->t.t_serialized && fork_context == fork_context_gnu)) {
    master_th->th.ompt_thread_info.state = ompt_state_overhead;
  }
#endif

#if KMP_DEBUG
  // Debug-only consistency check: the thread's task team must be the one the
  // team holds for the thread's current task state.
  if (__kmp_tasking_mode != tskm_immediate_exec && !exit_teams) {
    KA_TRACE(20, ("__kmp_join_call: T#%d, old team = %p old task_team = %p, "
                  "th_task_team = %p\n",
                  __kmp_gtid_from_thread(master_th), team,
                  team->t.t_task_team[master_th->th.th_task_state],
                  master_th->th.th_task_team));
    KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
                     team->t.t_task_team[master_th->th.th_task_state]);
  }
#endif

  // Exit 1: serialized team.
  if (team->t.t_serialized) {
    if (master_th->th.th_teams_microtask) {
      // We are in teams construct
      int level = team->t.t_level;
      int tlevel = master_th->th.th_teams_level;
      if (level == tlevel) {
        // AC: we haven't incremented it earlier at start of teams construct,
        //     so do it here - at the end of teams construct
        team->t.t_level++;
      } else if (level == tlevel + 1) {
        // AC: we are exiting parallel inside teams, need to increment
        // serialization in order to restore it in the next call to
        // __kmpc_end_serialized_parallel
        team->t.t_serialized++;
      }
    }
    __kmpc_end_serialized_parallel(loc, gtid);

#if OMPT_SUPPORT
    if (ompt_enabled.enabled) {
      __kmp_join_restore_state(master_th, parent_team);
    }
#endif

    return;
  }

  // Saved here so it can be written back to root->r.r_active near the end.
  master_active = team->t.t_master_active;

  if (!exit_teams) {
    // AC: No barrier for internal teams at exit from teams construct.
    //     But there is barrier for external team (league).
    __kmp_internal_join(loc, gtid, team);
#if USE_ITT_BUILD
    if (__itt_stack_caller_create_ptr) {
      KMP_DEBUG_ASSERT(team->t.t_stack_id != NULL);
      // destroy the stack stitching id after join barrier
      __kmp_itt_stack_caller_destroy((__itt_caller)team->t.t_stack_id);
      team->t.t_stack_id = NULL;
    }
#endif
  } else {
    master_th->th.th_task_state =
        0; // AC: no tasking in teams (out of any parallel)
#if USE_ITT_BUILD
    if (__itt_stack_caller_create_ptr && parent_team->t.t_serialized) {
      KMP_DEBUG_ASSERT(parent_team->t.t_stack_id != NULL);
      // destroy the stack stitching id on exit from the teams construct
      // if parent_team is active, then the id will be destroyed later on
      // by master of the league of teams
      __kmp_itt_stack_caller_destroy((__itt_caller)parent_team->t.t_stack_id);
      parent_team->t.t_stack_id = NULL;
    }
#endif
  }

  KMP_MB();

#if OMPT_SUPPORT
  // Captured before the team may be released; passed to __kmp_join_ompt at
  // the end of the general path.
  ompt_data_t *parallel_data = &(team->t.ompt_team_info.parallel_data);
  void *codeptr = team->t.ompt_team_info.master_return_address;
#endif

#if USE_ITT_BUILD
  // Mark end of "parallel" region for Intel(R) VTune(TM) analyzer.
  if (team->t.t_active_level == 1 &&
      (!master_th->th.th_teams_microtask || /* not in teams construct */
       master_th->th.th_teams_size.nteams == 1)) {
    master_th->th.th_ident = loc;
    // only one notification scheme (either "submit" or "forking/joined", not
    // both)
    if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
        __kmp_forkjoin_frames_mode == 3)
      __kmp_itt_frame_submit(gtid, team->t.t_region_time,
                             master_th->th.th_frame_time, 0, loc,
                             master_th->th.th_team_nproc, 1);
    else if ((__itt_frame_end_v3_ptr || KMP_ITT_DEBUG) &&
             !__kmp_forkjoin_frames_mode && __kmp_forkjoin_frames)
      __kmp_itt_region_joined(gtid);
  } // active_level == 1
#endif /* USE_ITT_BUILD */

  // Exit 2: parallel region one level below the teams construct, and the
  // team's microtask is not __kmp_teams_master.
  if (master_th->th.th_teams_microtask && !exit_teams &&
      team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
      team->t.t_level == master_th->th.th_teams_level + 1) {
// AC: We need to leave the team structure intact at the end of parallel
// inside the teams construct, so that at the next parallel same (hot) team
// works, only adjust nesting levels
#if OMPT_SUPPORT
    ompt_data_t ompt_parallel_data = ompt_data_none;
    if (ompt_enabled.enabled) {
      ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
      if (ompt_enabled.ompt_callback_implicit_task) {
        int ompt_team_size = team->t.t_nproc;
        ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
            ompt_scope_end, NULL, &(task_info->task_data), ompt_team_size,
            OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
      }
      task_info->frame.exit_frame = ompt_data_none;
      task_info->task_data = ompt_data_none;
      // Copy the team data by value before the lightweight task team is
      // unlinked; the copy is what gets passed to __kmp_join_ompt below.
      ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
      __ompt_lw_taskteam_unlink(master_th);
    }
#endif
    /* Decrement our nested depth level */
    team->t.t_level--;
    team->t.t_active_level--;
    KMP_ATOMIC_DEC(&root->r.r_in_parallel);

    // Restore number of threads in the team if needed. This code relies on
    // the proper adjustment of th_teams_size.nth after the fork in
    // __kmp_teams_master on each teams primary thread in the case that
    // __kmp_reserve_threads reduced it.
    if (master_th->th.th_team_nproc < master_th->th.th_teams_size.nth) {
      int old_num = master_th->th.th_team_nproc;
      int new_num = master_th->th.th_teams_size.nth;
      kmp_info_t **other_threads = team->t.t_threads;
      team->t.t_nproc = new_num;
      // Threads that took part in the region only need the new team size.
      for (int i = 0; i < old_num; ++i) {
        other_threads[i]->th.th_team_nproc = new_num;
      }
      // Adjust states of non-used threads of the team
      for (int i = old_num; i < new_num; ++i) {
        // Re-initialize thread's barrier data.
        KMP_DEBUG_ASSERT(other_threads[i]);
        kmp_balign_t *balign = other_threads[i]->th.th_bar;
        for (int b = 0; b < bs_last_barrier; ++b) {
          balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
          KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
#if USE_DEBUGGER
          balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
#endif
        }
        if (__kmp_tasking_mode != tskm_immediate_exec) {
          // Synchronize thread's task state
          other_threads[i]->th.th_task_state = master_th->th.th_task_state;
        }
      }
    }

#if OMPT_SUPPORT
    if (ompt_enabled.enabled) {
      __kmp_join_ompt(gtid, master_th, parent_team, &ompt_parallel_data,
                      OMPT_INVOKER(fork_context) | ompt_parallel_team, codeptr);
    }
#endif

    return;
  }

  // Exit 3: general path.
  /* do cleanup and restore the parent team */
  master_th->th.th_info.ds.ds_tid = team->t.t_master_tid;
  master_th->th.th_local.this_construct = team->t.t_master_this_cons;

  master_th->th.th_dispatch = &parent_team->t.t_dispatch[team->t.t_master_tid];

  /* jc: The following lock has instructions with REL and ACQ semantics,
     separating the parallel user code called in this parallel region
     from the serial user code called after this function returns. */
  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);

  if (!master_th->th.th_teams_microtask ||
      team->t.t_level > master_th->th.th_teams_level) {
    /* Decrement our nested depth level */
    KMP_ATOMIC_DEC(&root->r.r_in_parallel);
  }
  KMP_DEBUG_ASSERT(root->r.r_in_parallel >= 0);

#if OMPT_SUPPORT
  if (ompt_enabled.enabled) {
    ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
    if (ompt_enabled.ompt_callback_implicit_task) {
      // A team whose microtask is __kmp_teams_master is reported as an
      // initial task with team size 0; any other team as an implicit task
      // with the team's thread count.
      int flags = (team_microtask == (void *)__kmp_teams_master)
                      ? ompt_task_initial
                      : ompt_task_implicit;
      int ompt_team_size = (flags == ompt_task_initial) ? 0 : team->t.t_nproc;
      ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
          ompt_scope_end, NULL, &(task_info->task_data), ompt_team_size,
          OMPT_CUR_TASK_INFO(master_th)->thread_num, flags);
    }
    task_info->frame.exit_frame = ompt_data_none;
    task_info->task_data = ompt_data_none;
  }
#endif

  // NOTE(review): the thread id argument of this trace is the literal 0, not
  // gtid - confirm whether that is intended.
  KF_TRACE(10, ("__kmp_join_call1: T#%d, this_thread=%p team=%p\n", 0,
                master_th, team));
  __kmp_pop_current_task_from_thread(master_th);

#if KMP_AFFINITY_SUPPORTED
  // Restore master thread's partition.
  master_th->th.th_first_place = team->t.t_first_place;
  master_th->th.th_last_place = team->t.t_last_place;
#endif // KMP_AFFINITY_SUPPORTED
  master_th->th.th_def_allocator = team->t.t_def_allocator;

#if OMPD_SUPPORT
  if (ompd_state & OMPD_ENABLE_BP)
    ompd_bp_parallel_end();
#endif
  updateHWFPControl(team);

  // Write back the value saved from team->t.t_master_active above.
  if (root->r.r_active != master_active)
    root->r.r_active = master_active;

  __kmp_free_team(root, team USE_NESTED_HOT_ARG(
                            master_th)); // this will free worker threads

  /* this race was fun to find. make sure the following is in the critical
     region otherwise assertions may fail occasionally since the old team may be
     reallocated and the hierarchy appears inconsistent. it is actually safe to
     run and won't cause any bugs, but will cause those assertion failures. it's
     only one deref&assign so might as well put this in the critical region */
  master_th->th.th_team = parent_team;
  master_th->th.th_team_nproc = parent_team->t.t_nproc;
  master_th->th.th_team_master = parent_team->t.t_threads[0];
  master_th->th.th_team_serialized = parent_team->t.t_serialized;

  /* restore serialized team, if need be */
  if (parent_team->t.t_serialized &&
      parent_team != master_th->th.th_serial_team &&
      parent_team != root->r.r_root_team) {
    __kmp_free_team(root,
                    master_th->th.th_serial_team USE_NESTED_HOT_ARG(NULL));
    master_th->th.th_serial_team = parent_team;
  }

  if (__kmp_tasking_mode != tskm_immediate_exec) {
    if (master_th->th.th_task_state_top >
        0) { // Restore task state from memo stack
      KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack);
      // Remember primary thread's state if we re-use this nested hot team
      master_th->th.th_task_state_memo_stack[master_th->th.th_task_state_top] =
          master_th->th.th_task_state;
      --master_th->th.th_task_state_top; // pop
      // Now restore state at this level
      master_th->th.th_task_state =
          master_th->th
              .th_task_state_memo_stack[master_th->th.th_task_state_top];
    }
    // Copy the task team from the parent team to the primary thread
    master_th->th.th_task_team =
        parent_team->t.t_task_team[master_th->th.th_task_state];
    KA_TRACE(20,
             ("__kmp_join_call: Primary T#%d restoring task_team %p, team %p\n",
              __kmp_gtid_from_thread(master_th), master_th->th.th_task_team,
              parent_team));
  }

  // TODO: GEH - cannot do this assertion because root thread not set up as
  // executing
  // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 0 );
  master_th->th.th_current_task->td_flags.executing = 1;

  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);

#if OMPT_SUPPORT
  // The parallel-end callback is issued after the fork/join lock has been
  // released; league vs. team is chosen from the microtask saved on entry.
  int flags =
      OMPT_INVOKER(fork_context) |
      ((team_microtask == (void *)__kmp_teams_master) ? ompt_parallel_league
                                                      : ompt_parallel_team);
  if (ompt_enabled.enabled) {
    __kmp_join_ompt(gtid, master_th, parent_team, parallel_data, flags,
                    codeptr);
  }
#endif

  KMP_MB();
  KA_TRACE(20, ("__kmp_join_call: exit T#%d\n", gtid));
}
2578 
2579 /* Check whether we should push an internal control record onto the
2580    serial team stack.  If so, do it.  */
2581 void __kmp_save_internal_controls(kmp_info_t *thread) {
2582 
2583   if (thread->th.th_team != thread->th.th_serial_team) {
2584     return;
2585   }
2586   if (thread->th.th_team->t.t_serialized > 1) {
2587     int push = 0;
2588 
2589     if (thread->th.th_team->t.t_control_stack_top == NULL) {
2590       push = 1;
2591     } else {
2592       if (thread->th.th_team->t.t_control_stack_top->serial_nesting_level !=
2593           thread->th.th_team->t.t_serialized) {
2594         push = 1;
2595       }
2596     }
2597     if (push) { /* push a record on the serial team's stack */
2598       kmp_internal_control_t *control =
2599           (kmp_internal_control_t *)__kmp_allocate(
2600               sizeof(kmp_internal_control_t));
2601 
2602       copy_icvs(control, &thread->th.th_current_task->td_icvs);
2603 
2604       control->serial_nesting_level = thread->th.th_team->t.t_serialized;
2605 
2606       control->next = thread->th.th_team->t.t_control_stack_top;
2607       thread->th.th_team->t.t_control_stack_top = control;
2608     }
2609   }
2610 }
2611 
2612 /* Changes set_nproc */
2613 void __kmp_set_num_threads(int new_nth, int gtid) {
2614   kmp_info_t *thread;
2615   kmp_root_t *root;
2616 
2617   KF_TRACE(10, ("__kmp_set_num_threads: new __kmp_nth = %d\n", new_nth));
2618   KMP_DEBUG_ASSERT(__kmp_init_serial);
2619 
2620   if (new_nth < 1)
2621     new_nth = 1;
2622   else if (new_nth > __kmp_max_nth)
2623     new_nth = __kmp_max_nth;
2624 
2625   KMP_COUNT_VALUE(OMP_set_numthreads, new_nth);
2626   thread = __kmp_threads[gtid];
2627   if (thread->th.th_current_task->td_icvs.nproc == new_nth)
2628     return; // nothing to do
2629 
2630   __kmp_save_internal_controls(thread);
2631 
2632   set__nproc(thread, new_nth);
2633 
2634   // If this omp_set_num_threads() call will cause the hot team size to be
2635   // reduced (in the absence of a num_threads clause), then reduce it now,
2636   // rather than waiting for the next parallel region.
2637   root = thread->th.th_root;
2638   if (__kmp_init_parallel && (!root->r.r_active) &&
2639       (root->r.r_hot_team->t.t_nproc > new_nth)
2640 #if KMP_NESTED_HOT_TEAMS
2641       && __kmp_hot_teams_max_level && !__kmp_hot_teams_mode
2642 #endif
2643   ) {
2644     kmp_team_t *hot_team = root->r.r_hot_team;
2645     int f;
2646 
2647     __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
2648 
2649     // Release the extra threads we don't need any more.
2650     for (f = new_nth; f < hot_team->t.t_nproc; f++) {
2651       KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL);
2652       if (__kmp_tasking_mode != tskm_immediate_exec) {
2653         // When decreasing team size, threads no longer in the team should unref
2654         // task team.
2655         hot_team->t.t_threads[f]->th.th_task_team = NULL;
2656       }
2657       __kmp_free_thread(hot_team->t.t_threads[f]);
2658       hot_team->t.t_threads[f] = NULL;
2659     }
2660     hot_team->t.t_nproc = new_nth;
2661 #if KMP_NESTED_HOT_TEAMS
2662     if (thread->th.th_hot_teams) {
2663       KMP_DEBUG_ASSERT(hot_team == thread->th.th_hot_teams[0].hot_team);
2664       thread->th.th_hot_teams[0].hot_team_nth = new_nth;
2665     }
2666 #endif
2667 
2668     __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2669 
2670     // Update the t_nproc field in the threads that are still active.
2671     for (f = 0; f < new_nth; f++) {
2672       KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL);
2673       hot_team->t.t_threads[f]->th.th_team_nproc = new_nth;
2674     }
2675     // Special flag in case omp_set_num_threads() call
2676     hot_team->t.t_size_changed = -1;
2677   }
2678 }
2679 
2680 /* Changes max_active_levels */
2681 void __kmp_set_max_active_levels(int gtid, int max_active_levels) {
2682   kmp_info_t *thread;
2683 
2684   KF_TRACE(10, ("__kmp_set_max_active_levels: new max_active_levels for thread "
2685                 "%d = (%d)\n",
2686                 gtid, max_active_levels));
2687   KMP_DEBUG_ASSERT(__kmp_init_serial);
2688 
2689   // validate max_active_levels
2690   if (max_active_levels < 0) {
2691     KMP_WARNING(ActiveLevelsNegative, max_active_levels);
2692     // We ignore this call if the user has specified a negative value.
2693     // The current setting won't be changed. The last valid setting will be
2694     // used. A warning will be issued (if warnings are allowed as controlled by
2695     // the KMP_WARNINGS env var).
2696     KF_TRACE(10, ("__kmp_set_max_active_levels: the call is ignored: new "
2697                   "max_active_levels for thread %d = (%d)\n",
2698                   gtid, max_active_levels));
2699     return;
2700   }
2701   if (max_active_levels <= KMP_MAX_ACTIVE_LEVELS_LIMIT) {
2702     // it's OK, the max_active_levels is within the valid range: [ 0;
2703     // KMP_MAX_ACTIVE_LEVELS_LIMIT ]
2704     // We allow a zero value. (implementation defined behavior)
2705   } else {
2706     KMP_WARNING(ActiveLevelsExceedLimit, max_active_levels,
2707                 KMP_MAX_ACTIVE_LEVELS_LIMIT);
2708     max_active_levels = KMP_MAX_ACTIVE_LEVELS_LIMIT;
2709     // Current upper limit is MAX_INT. (implementation defined behavior)
2710     // If the input exceeds the upper limit, we correct the input to be the
2711     // upper limit. (implementation defined behavior)
2712     // Actually, the flow should never get here until we use MAX_INT limit.
2713   }
2714   KF_TRACE(10, ("__kmp_set_max_active_levels: after validation: new "
2715                 "max_active_levels for thread %d = (%d)\n",
2716                 gtid, max_active_levels));
2717 
2718   thread = __kmp_threads[gtid];
2719 
2720   __kmp_save_internal_controls(thread);
2721 
2722   set__max_active_levels(thread, max_active_levels);
2723 }
2724 
2725 /* Gets max_active_levels */
2726 int __kmp_get_max_active_levels(int gtid) {
2727   kmp_info_t *thread;
2728 
2729   KF_TRACE(10, ("__kmp_get_max_active_levels: thread %d\n", gtid));
2730   KMP_DEBUG_ASSERT(__kmp_init_serial);
2731 
2732   thread = __kmp_threads[gtid];
2733   KMP_DEBUG_ASSERT(thread->th.th_current_task);
2734   KF_TRACE(10, ("__kmp_get_max_active_levels: thread %d, curtask=%p, "
2735                 "curtask_maxaclevel=%d\n",
2736                 gtid, thread->th.th_current_task,
2737                 thread->th.th_current_task->td_icvs.max_active_levels));
2738   return thread->th.th_current_task->td_icvs.max_active_levels;
2739 }
2740 
2741 // nteams-var per-device ICV
2742 void __kmp_set_num_teams(int num_teams) {
2743   if (num_teams > 0)
2744     __kmp_nteams = num_teams;
2745 }
2746 int __kmp_get_max_teams(void) { return __kmp_nteams; }
2747 // teams-thread-limit-var per-device ICV
2748 void __kmp_set_teams_thread_limit(int limit) {
2749   if (limit > 0)
2750     __kmp_teams_thread_limit = limit;
2751 }
2752 int __kmp_get_teams_thread_limit(void) { return __kmp_teams_thread_limit; }
2753 
// Build-time checks that both schedule enumerations have the size of an int.
// NOTE(review): presumably required because the schedule routines below pass
// these values through int-typed interfaces - confirm against callers.
KMP_BUILD_ASSERT(sizeof(kmp_sched_t) == sizeof(int));
KMP_BUILD_ASSERT(sizeof(enum sched_type) == sizeof(int));
2756 
2757 /* Changes def_sched_var ICV values (run-time schedule kind and chunk) */
2758 void __kmp_set_schedule(int gtid, kmp_sched_t kind, int chunk) {
2759   kmp_info_t *thread;
2760   kmp_sched_t orig_kind;
2761   //    kmp_team_t *team;
2762 
2763   KF_TRACE(10, ("__kmp_set_schedule: new schedule for thread %d = (%d, %d)\n",
2764                 gtid, (int)kind, chunk));
2765   KMP_DEBUG_ASSERT(__kmp_init_serial);
2766 
2767   // Check if the kind parameter is valid, correct if needed.
2768   // Valid parameters should fit in one of two intervals - standard or extended:
2769   //       <lower>, <valid>, <upper_std>, <lower_ext>, <valid>, <upper>
2770   // 2008-01-25: 0,  1 - 4,       5,         100,     101 - 102, 103
2771   orig_kind = kind;
2772   kind = __kmp_sched_without_mods(kind);
2773 
2774   if (kind <= kmp_sched_lower || kind >= kmp_sched_upper ||
2775       (kind <= kmp_sched_lower_ext && kind >= kmp_sched_upper_std)) {
2776     // TODO: Hint needs attention in case we change the default schedule.
2777     __kmp_msg(kmp_ms_warning, KMP_MSG(ScheduleKindOutOfRange, kind),
2778               KMP_HNT(DefaultScheduleKindUsed, "static, no chunk"),
2779               __kmp_msg_null);
2780     kind = kmp_sched_default;
2781     chunk = 0; // ignore chunk value in case of bad kind
2782   }
2783 
2784   thread = __kmp_threads[gtid];
2785 
2786   __kmp_save_internal_controls(thread);
2787 
2788   if (kind < kmp_sched_upper_std) {
2789     if (kind == kmp_sched_static && chunk < KMP_DEFAULT_CHUNK) {
2790       // differ static chunked vs. unchunked:  chunk should be invalid to
2791       // indicate unchunked schedule (which is the default)
2792       thread->th.th_current_task->td_icvs.sched.r_sched_type = kmp_sch_static;
2793     } else {
2794       thread->th.th_current_task->td_icvs.sched.r_sched_type =
2795           __kmp_sch_map[kind - kmp_sched_lower - 1];
2796     }
2797   } else {
2798     //    __kmp_sch_map[ kind - kmp_sched_lower_ext + kmp_sched_upper_std -
2799     //    kmp_sched_lower - 2 ];
2800     thread->th.th_current_task->td_icvs.sched.r_sched_type =
2801         __kmp_sch_map[kind - kmp_sched_lower_ext + kmp_sched_upper_std -
2802                       kmp_sched_lower - 2];
2803   }
2804   __kmp_sched_apply_mods_intkind(
2805       orig_kind, &(thread->th.th_current_task->td_icvs.sched.r_sched_type));
2806   if (kind == kmp_sched_auto || chunk < 1) {
2807     // ignore parameter chunk for schedule auto
2808     thread->th.th_current_task->td_icvs.sched.chunk = KMP_DEFAULT_CHUNK;
2809   } else {
2810     thread->th.th_current_task->td_icvs.sched.chunk = chunk;
2811   }
2812 }
2813 
2814 /* Gets def_sched_var ICV values */
2815 void __kmp_get_schedule(int gtid, kmp_sched_t *kind, int *chunk) {
2816   kmp_info_t *thread;
2817   enum sched_type th_type;
2818 
2819   KF_TRACE(10, ("__kmp_get_schedule: thread %d\n", gtid));
2820   KMP_DEBUG_ASSERT(__kmp_init_serial);
2821 
2822   thread = __kmp_threads[gtid];
2823 
2824   th_type = thread->th.th_current_task->td_icvs.sched.r_sched_type;
2825   switch (SCHEDULE_WITHOUT_MODIFIERS(th_type)) {
2826   case kmp_sch_static:
2827   case kmp_sch_static_greedy:
2828   case kmp_sch_static_balanced:
2829     *kind = kmp_sched_static;
2830     __kmp_sched_apply_mods_stdkind(kind, th_type);
2831     *chunk = 0; // chunk was not set, try to show this fact via zero value
2832     return;
2833   case kmp_sch_static_chunked:
2834     *kind = kmp_sched_static;
2835     break;
2836   case kmp_sch_dynamic_chunked:
2837     *kind = kmp_sched_dynamic;
2838     break;
2839   case kmp_sch_guided_chunked:
2840   case kmp_sch_guided_iterative_chunked:
2841   case kmp_sch_guided_analytical_chunked:
2842     *kind = kmp_sched_guided;
2843     break;
2844   case kmp_sch_auto:
2845     *kind = kmp_sched_auto;
2846     break;
2847   case kmp_sch_trapezoidal:
2848     *kind = kmp_sched_trapezoidal;
2849     break;
2850 #if KMP_STATIC_STEAL_ENABLED
2851   case kmp_sch_static_steal:
2852     *kind = kmp_sched_static_steal;
2853     break;
2854 #endif
2855   default:
2856     KMP_FATAL(UnknownSchedulingType, th_type);
2857   }
2858 
2859   __kmp_sched_apply_mods_stdkind(kind, th_type);
2860   *chunk = thread->th.th_current_task->td_icvs.sched.chunk;
2861 }
2862 
2863 int __kmp_get_ancestor_thread_num(int gtid, int level) {
2864 
2865   int ii, dd;
2866   kmp_team_t *team;
2867   kmp_info_t *thr;
2868 
2869   KF_TRACE(10, ("__kmp_get_ancestor_thread_num: thread %d %d\n", gtid, level));
2870   KMP_DEBUG_ASSERT(__kmp_init_serial);
2871 
2872   // validate level
2873   if (level == 0)
2874     return 0;
2875   if (level < 0)
2876     return -1;
2877   thr = __kmp_threads[gtid];
2878   team = thr->th.th_team;
2879   ii = team->t.t_level;
2880   if (level > ii)
2881     return -1;
2882 
2883   if (thr->th.th_teams_microtask) {
2884     // AC: we are in teams region where multiple nested teams have same level
2885     int tlevel = thr->th.th_teams_level; // the level of the teams construct
2886     if (level <=
2887         tlevel) { // otherwise usual algorithm works (will not touch the teams)
2888       KMP_DEBUG_ASSERT(ii >= tlevel);
2889       // AC: As we need to pass by the teams league, we need to artificially
2890       // increase ii
2891       if (ii == tlevel) {
2892         ii += 2; // three teams have same level
2893       } else {
2894         ii++; // two teams have same level
2895       }
2896     }
2897   }
2898 
2899   if (ii == level)
2900     return __kmp_tid_from_gtid(gtid);
2901 
2902   dd = team->t.t_serialized;
2903   level++;
2904   while (ii > level) {
2905     for (dd = team->t.t_serialized; (dd > 0) && (ii > level); dd--, ii--) {
2906     }
2907     if ((team->t.t_serialized) && (!dd)) {
2908       team = team->t.t_parent;
2909       continue;
2910     }
2911     if (ii > level) {
2912       team = team->t.t_parent;
2913       dd = team->t.t_serialized;
2914       ii--;
2915     }
2916   }
2917 
2918   return (dd > 1) ? (0) : (team->t.t_master_tid);
2919 }
2920 
2921 int __kmp_get_team_size(int gtid, int level) {
2922 
2923   int ii, dd;
2924   kmp_team_t *team;
2925   kmp_info_t *thr;
2926 
2927   KF_TRACE(10, ("__kmp_get_team_size: thread %d %d\n", gtid, level));
2928   KMP_DEBUG_ASSERT(__kmp_init_serial);
2929 
2930   // validate level
2931   if (level == 0)
2932     return 1;
2933   if (level < 0)
2934     return -1;
2935   thr = __kmp_threads[gtid];
2936   team = thr->th.th_team;
2937   ii = team->t.t_level;
2938   if (level > ii)
2939     return -1;
2940 
2941   if (thr->th.th_teams_microtask) {
2942     // AC: we are in teams region where multiple nested teams have same level
2943     int tlevel = thr->th.th_teams_level; // the level of the teams construct
2944     if (level <=
2945         tlevel) { // otherwise usual algorithm works (will not touch the teams)
2946       KMP_DEBUG_ASSERT(ii >= tlevel);
2947       // AC: As we need to pass by the teams league, we need to artificially
2948       // increase ii
2949       if (ii == tlevel) {
2950         ii += 2; // three teams have same level
2951       } else {
2952         ii++; // two teams have same level
2953       }
2954     }
2955   }
2956 
2957   while (ii > level) {
2958     for (dd = team->t.t_serialized; (dd > 0) && (ii > level); dd--, ii--) {
2959     }
2960     if (team->t.t_serialized && (!dd)) {
2961       team = team->t.t_parent;
2962       continue;
2963     }
2964     if (ii > level) {
2965       team = team->t.t_parent;
2966       ii--;
2967     }
2968   }
2969 
2970   return team->t.t_nproc;
2971 }
2972 
2973 kmp_r_sched_t __kmp_get_schedule_global() {
2974   // This routine created because pairs (__kmp_sched, __kmp_chunk) and
2975   // (__kmp_static, __kmp_guided) may be changed by kmp_set_defaults
2976   // independently. So one can get the updated schedule here.
2977 
2978   kmp_r_sched_t r_sched;
2979 
2980   // create schedule from 4 globals: __kmp_sched, __kmp_chunk, __kmp_static,
2981   // __kmp_guided. __kmp_sched should keep original value, so that user can set
2982   // KMP_SCHEDULE multiple times, and thus have different run-time schedules in
2983   // different roots (even in OMP 2.5)
2984   enum sched_type s = SCHEDULE_WITHOUT_MODIFIERS(__kmp_sched);
2985   enum sched_type sched_modifiers = SCHEDULE_GET_MODIFIERS(__kmp_sched);
2986   if (s == kmp_sch_static) {
2987     // replace STATIC with more detailed schedule (balanced or greedy)
2988     r_sched.r_sched_type = __kmp_static;
2989   } else if (s == kmp_sch_guided_chunked) {
2990     // replace GUIDED with more detailed schedule (iterative or analytical)
2991     r_sched.r_sched_type = __kmp_guided;
2992   } else { // (STATIC_CHUNKED), or (DYNAMIC_CHUNKED), or other
2993     r_sched.r_sched_type = __kmp_sched;
2994   }
2995   SCHEDULE_SET_MODIFIERS(r_sched.r_sched_type, sched_modifiers);
2996 
2997   if (__kmp_chunk < KMP_DEFAULT_CHUNK) {
2998     // __kmp_chunk may be wrong here (if it was not ever set)
2999     r_sched.chunk = KMP_DEFAULT_CHUNK;
3000   } else {
3001     r_sched.chunk = __kmp_chunk;
3002   }
3003 
3004   return r_sched;
3005 }
3006 
3007 /* Allocate (realloc == FALSE) * or reallocate (realloc == TRUE)
3008    at least argc number of *t_argv entries for the requested team. */
3009 static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team, int realloc) {
3010 
3011   KMP_DEBUG_ASSERT(team);
3012   if (!realloc || argc > team->t.t_max_argc) {
3013 
3014     KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: needed entries=%d, "
3015                    "current entries=%d\n",
3016                    team->t.t_id, argc, (realloc) ? team->t.t_max_argc : 0));
3017     /* if previously allocated heap space for args, free them */
3018     if (realloc && team->t.t_argv != &team->t.t_inline_argv[0])
3019       __kmp_free((void *)team->t.t_argv);
3020 
3021     if (argc <= KMP_INLINE_ARGV_ENTRIES) {
3022       /* use unused space in the cache line for arguments */
3023       team->t.t_max_argc = KMP_INLINE_ARGV_ENTRIES;
3024       KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: inline allocate %d "
3025                      "argv entries\n",
3026                      team->t.t_id, team->t.t_max_argc));
3027       team->t.t_argv = &team->t.t_inline_argv[0];
3028       if (__kmp_storage_map) {
3029         __kmp_print_storage_map_gtid(
3030             -1, &team->t.t_inline_argv[0],
3031             &team->t.t_inline_argv[KMP_INLINE_ARGV_ENTRIES],
3032             (sizeof(void *) * KMP_INLINE_ARGV_ENTRIES), "team_%d.t_inline_argv",
3033             team->t.t_id);
3034       }
3035     } else {
3036       /* allocate space for arguments in the heap */
3037       team->t.t_max_argc = (argc <= (KMP_MIN_MALLOC_ARGV_ENTRIES >> 1))
3038                                ? KMP_MIN_MALLOC_ARGV_ENTRIES
3039                                : 2 * argc;
3040       KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: dynamic allocate %d "
3041                      "argv entries\n",
3042                      team->t.t_id, team->t.t_max_argc));
3043       team->t.t_argv =
3044           (void **)__kmp_page_allocate(sizeof(void *) * team->t.t_max_argc);
3045       if (__kmp_storage_map) {
3046         __kmp_print_storage_map_gtid(-1, &team->t.t_argv[0],
3047                                      &team->t.t_argv[team->t.t_max_argc],
3048                                      sizeof(void *) * team->t.t_max_argc,
3049                                      "team_%d.t_argv", team->t.t_id);
3050       }
3051     }
3052   }
3053 }
3054 
3055 static void __kmp_allocate_team_arrays(kmp_team_t *team, int max_nth) {
3056   int i;
3057   int num_disp_buff = max_nth > 1 ? __kmp_dispatch_num_buffers : 2;
3058   team->t.t_threads =
3059       (kmp_info_t **)__kmp_allocate(sizeof(kmp_info_t *) * max_nth);
3060   team->t.t_disp_buffer = (dispatch_shared_info_t *)__kmp_allocate(
3061       sizeof(dispatch_shared_info_t) * num_disp_buff);
3062   team->t.t_dispatch =
3063       (kmp_disp_t *)__kmp_allocate(sizeof(kmp_disp_t) * max_nth);
3064   team->t.t_implicit_task_taskdata =
3065       (kmp_taskdata_t *)__kmp_allocate(sizeof(kmp_taskdata_t) * max_nth);
3066   team->t.t_max_nproc = max_nth;
3067 
3068   /* setup dispatch buffers */
3069   for (i = 0; i < num_disp_buff; ++i) {
3070     team->t.t_disp_buffer[i].buffer_index = i;
3071     team->t.t_disp_buffer[i].doacross_buf_idx = i;
3072   }
3073 }
3074 
3075 static void __kmp_free_team_arrays(kmp_team_t *team) {
3076   /* Note: this does not free the threads in t_threads (__kmp_free_threads) */
3077   int i;
3078   for (i = 0; i < team->t.t_max_nproc; ++i) {
3079     if (team->t.t_dispatch[i].th_disp_buffer != NULL) {
3080       __kmp_free(team->t.t_dispatch[i].th_disp_buffer);
3081       team->t.t_dispatch[i].th_disp_buffer = NULL;
3082     }
3083   }
3084 #if KMP_USE_HIER_SCHED
3085   __kmp_dispatch_free_hierarchies(team);
3086 #endif
3087   __kmp_free(team->t.t_threads);
3088   __kmp_free(team->t.t_disp_buffer);
3089   __kmp_free(team->t.t_dispatch);
3090   __kmp_free(team->t.t_implicit_task_taskdata);
3091   team->t.t_threads = NULL;
3092   team->t.t_disp_buffer = NULL;
3093   team->t.t_dispatch = NULL;
3094   team->t.t_implicit_task_taskdata = 0;
3095 }
3096 
3097 static void __kmp_reallocate_team_arrays(kmp_team_t *team, int max_nth) {
3098   kmp_info_t **oldThreads = team->t.t_threads;
3099 
3100   __kmp_free(team->t.t_disp_buffer);
3101   __kmp_free(team->t.t_dispatch);
3102   __kmp_free(team->t.t_implicit_task_taskdata);
3103   __kmp_allocate_team_arrays(team, max_nth);
3104 
3105   KMP_MEMCPY(team->t.t_threads, oldThreads,
3106              team->t.t_nproc * sizeof(kmp_info_t *));
3107 
3108   __kmp_free(oldThreads);
3109 }
3110 
// Build a kmp_internal_control_t populated from the runtime's current global
// settings and return it by value. The initializer below is positional, so
// the order of the values must match the member order of the structure
// (including the member that only exists when KMP_USE_MONITOR is set).
static kmp_internal_control_t __kmp_get_global_icvs(void) {

  kmp_r_sched_t r_sched =
      __kmp_get_schedule_global(); // get current state of scheduling globals

  // bind_types[0] is read below, so the nested proc-bind list must not be
  // empty.
  KMP_DEBUG_ASSERT(__kmp_nested_proc_bind.used > 0);

  kmp_internal_control_t g_icvs = {
    0, // int serial_nesting_level; //corresponds to value of th_team_serialized
    (kmp_int8)__kmp_global.g.g_dynamic, // internal control for dynamic
    // adjustment of threads (per thread)
    (kmp_int8)__kmp_env_blocktime, // int bt_set; //internal control for
    // whether blocktime is explicitly set
    __kmp_dflt_blocktime, // int blocktime; //internal control for blocktime
#if KMP_USE_MONITOR
    __kmp_bt_intervals, // int bt_intervals; //internal control for blocktime
// intervals
#endif
    __kmp_dflt_team_nth, // int nproc; //internal control for # of threads for
    // next parallel region (per thread)
    // (use a max ub on value if __kmp_parallel_initialize not called yet)
    __kmp_cg_max_nth, // int thread_limit;
    __kmp_dflt_max_active_levels, // int max_active_levels; //internal control
    // for max_active_levels
    r_sched, // kmp_r_sched_t sched; //internal control for runtime schedule
    // {sched,chunk} pair
    __kmp_nested_proc_bind.bind_types[0], // first entry of the nested
    // proc-bind list
    __kmp_default_device, // global default device setting
    NULL // struct kmp_internal_control *next;
  };

  return g_icvs;
}
3144 
3145 static kmp_internal_control_t __kmp_get_x_global_icvs(const kmp_team_t *team) {
3146 
3147   kmp_internal_control_t gx_icvs;
3148   gx_icvs.serial_nesting_level =
3149       0; // probably =team->t.t_serial like in save_inter_controls
3150   copy_icvs(&gx_icvs, &team->t.t_threads[0]->th.th_current_task->td_icvs);
3151   gx_icvs.next = NULL;
3152 
3153   return gx_icvs;
3154 }
3155 
3156 static void __kmp_initialize_root(kmp_root_t *root) {
3157   int f;
3158   kmp_team_t *root_team;
3159   kmp_team_t *hot_team;
3160   int hot_team_max_nth;
3161   kmp_r_sched_t r_sched =
3162       __kmp_get_schedule_global(); // get current state of scheduling globals
3163   kmp_internal_control_t r_icvs = __kmp_get_global_icvs();
3164   KMP_DEBUG_ASSERT(root);
3165   KMP_ASSERT(!root->r.r_begin);
3166 
3167   /* setup the root state structure */
3168   __kmp_init_lock(&root->r.r_begin_lock);
3169   root->r.r_begin = FALSE;
3170   root->r.r_active = FALSE;
3171   root->r.r_in_parallel = 0;
3172   root->r.r_blocktime = __kmp_dflt_blocktime;
3173 #if KMP_AFFINITY_SUPPORTED
3174   root->r.r_affinity_assigned = FALSE;
3175 #endif
3176 
3177   /* setup the root team for this task */
3178   /* allocate the root team structure */
3179   KF_TRACE(10, ("__kmp_initialize_root: before root_team\n"));
3180 
3181   root_team =
3182       __kmp_allocate_team(root,
3183                           1, // new_nproc
3184                           1, // max_nproc
3185 #if OMPT_SUPPORT
3186                           ompt_data_none, // root parallel id
3187 #endif
3188                           __kmp_nested_proc_bind.bind_types[0], &r_icvs,
3189                           0 // argc
3190                           USE_NESTED_HOT_ARG(NULL) // primary thread is unknown
3191                           );
3192 #if USE_DEBUGGER
3193   // Non-NULL value should be assigned to make the debugger display the root
3194   // team.
3195   TCW_SYNC_PTR(root_team->t.t_pkfn, (microtask_t)(~0));
3196 #endif
3197 
3198   KF_TRACE(10, ("__kmp_initialize_root: after root_team = %p\n", root_team));
3199 
3200   root->r.r_root_team = root_team;
3201   root_team->t.t_control_stack_top = NULL;
3202 
3203   /* initialize root team */
3204   root_team->t.t_threads[0] = NULL;
3205   root_team->t.t_nproc = 1;
3206   root_team->t.t_serialized = 1;
3207   // TODO???: root_team->t.t_max_active_levels = __kmp_dflt_max_active_levels;
3208   root_team->t.t_sched.sched = r_sched.sched;
3209   KA_TRACE(
3210       20,
3211       ("__kmp_initialize_root: init root team %d arrived: join=%u, plain=%u\n",
3212        root_team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
3213 
3214   /* setup the  hot team for this task */
3215   /* allocate the hot team structure */
3216   KF_TRACE(10, ("__kmp_initialize_root: before hot_team\n"));
3217 
3218   hot_team =
3219       __kmp_allocate_team(root,
3220                           1, // new_nproc
3221                           __kmp_dflt_team_nth_ub * 2, // max_nproc
3222 #if OMPT_SUPPORT
3223                           ompt_data_none, // root parallel id
3224 #endif
3225                           __kmp_nested_proc_bind.bind_types[0], &r_icvs,
3226                           0 // argc
3227                           USE_NESTED_HOT_ARG(NULL) // primary thread is unknown
3228                           );
3229   KF_TRACE(10, ("__kmp_initialize_root: after hot_team = %p\n", hot_team));
3230 
3231   root->r.r_hot_team = hot_team;
3232   root_team->t.t_control_stack_top = NULL;
3233 
3234   /* first-time initialization */
3235   hot_team->t.t_parent = root_team;
3236 
3237   /* initialize hot team */
3238   hot_team_max_nth = hot_team->t.t_max_nproc;
3239   for (f = 0; f < hot_team_max_nth; ++f) {
3240     hot_team->t.t_threads[f] = NULL;
3241   }
3242   hot_team->t.t_nproc = 1;
3243   // TODO???: hot_team->t.t_max_active_levels = __kmp_dflt_max_active_levels;
3244   hot_team->t.t_sched.sched = r_sched.sched;
3245   hot_team->t.t_size_changed = 0;
3246 }
3247 
3248 #ifdef KMP_DEBUG
3249 
// Node of the singly linked team list built by the debug structure dump.
// The list ends with a node whose entry and next are both NULL.
typedef struct kmp_team_list_item {
  kmp_team_p const *entry; // team referenced by this node
  struct kmp_team_list_item *next; // following node in the list
} kmp_team_list_item_t;
// A list is referred to through a pointer to its first node.
typedef kmp_team_list_item_t *kmp_team_list_t;
3255 
3256 static void __kmp_print_structure_team_accum( // Add team to list of teams.
3257     kmp_team_list_t list, // List of teams.
3258     kmp_team_p const *team // Team to add.
3259 ) {
3260 
3261   // List must terminate with item where both entry and next are NULL.
3262   // Team is added to the list only once.
3263   // List is sorted in ascending order by team id.
3264   // Team id is *not* a key.
3265 
3266   kmp_team_list_t l;
3267 
3268   KMP_DEBUG_ASSERT(list != NULL);
3269   if (team == NULL) {
3270     return;
3271   }
3272 
3273   __kmp_print_structure_team_accum(list, team->t.t_parent);
3274   __kmp_print_structure_team_accum(list, team->t.t_next_pool);
3275 
3276   // Search list for the team.
3277   l = list;
3278   while (l->next != NULL && l->entry != team) {
3279     l = l->next;
3280   }
3281   if (l->next != NULL) {
3282     return; // Team has been added before, exit.
3283   }
3284 
3285   // Team is not found. Search list again for insertion point.
3286   l = list;
3287   while (l->next != NULL && l->entry->t.t_id <= team->t.t_id) {
3288     l = l->next;
3289   }
3290 
3291   // Insert team.
3292   {
3293     kmp_team_list_item_t *item = (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC(
3294         sizeof(kmp_team_list_item_t));
3295     *item = *l;
3296     l->entry = team;
3297     l->next = item;
3298   }
3299 }
3300 
3301 static void __kmp_print_structure_team(char const *title, kmp_team_p const *team
3302 
3303 ) {
3304   __kmp_printf("%s", title);
3305   if (team != NULL) {
3306     __kmp_printf("%2x %p\n", team->t.t_id, team);
3307   } else {
3308     __kmp_printf(" - (nil)\n");
3309   }
3310 }
3311 
3312 static void __kmp_print_structure_thread(char const *title,
3313                                          kmp_info_p const *thread) {
3314   __kmp_printf("%s", title);
3315   if (thread != NULL) {
3316     __kmp_printf("%2d %p\n", thread->th.th_info.ds.ds_gtid, thread);
3317   } else {
3318     __kmp_printf(" - (nil)\n");
3319   }
3320 }
3321 
3322 void __kmp_print_structure(void) {
3323 
3324   kmp_team_list_t list;
3325 
3326   // Initialize list of teams.
3327   list =
3328       (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC(sizeof(kmp_team_list_item_t));
3329   list->entry = NULL;
3330   list->next = NULL;
3331 
3332   __kmp_printf("\n------------------------------\nGlobal Thread "
3333                "Table\n------------------------------\n");
3334   {
3335     int gtid;
3336     for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3337       __kmp_printf("%2d", gtid);
3338       if (__kmp_threads != NULL) {
3339         __kmp_printf(" %p", __kmp_threads[gtid]);
3340       }
3341       if (__kmp_root != NULL) {
3342         __kmp_printf(" %p", __kmp_root[gtid]);
3343       }
3344       __kmp_printf("\n");
3345     }
3346   }
3347 
3348   // Print out __kmp_threads array.
3349   __kmp_printf("\n------------------------------\nThreads\n--------------------"
3350                "----------\n");
3351   if (__kmp_threads != NULL) {
3352     int gtid;
3353     for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3354       kmp_info_t const *thread = __kmp_threads[gtid];
3355       if (thread != NULL) {
3356         __kmp_printf("GTID %2d %p:\n", gtid, thread);
3357         __kmp_printf("    Our Root:        %p\n", thread->th.th_root);
3358         __kmp_print_structure_team("    Our Team:     ", thread->th.th_team);
3359         __kmp_print_structure_team("    Serial Team:  ",
3360                                    thread->th.th_serial_team);
3361         __kmp_printf("    Threads:      %2d\n", thread->th.th_team_nproc);
3362         __kmp_print_structure_thread("    Primary:      ",
3363                                      thread->th.th_team_master);
3364         __kmp_printf("    Serialized?:  %2d\n", thread->th.th_team_serialized);
3365         __kmp_printf("    Set NProc:    %2d\n", thread->th.th_set_nproc);
3366         __kmp_printf("    Set Proc Bind: %2d\n", thread->th.th_set_proc_bind);
3367         __kmp_print_structure_thread("    Next in pool: ",
3368                                      thread->th.th_next_pool);
3369         __kmp_printf("\n");
3370         __kmp_print_structure_team_accum(list, thread->th.th_team);
3371         __kmp_print_structure_team_accum(list, thread->th.th_serial_team);
3372       }
3373     }
3374   } else {
3375     __kmp_printf("Threads array is not allocated.\n");
3376   }
3377 
3378   // Print out __kmp_root array.
3379   __kmp_printf("\n------------------------------\nUbers\n----------------------"
3380                "--------\n");
3381   if (__kmp_root != NULL) {
3382     int gtid;
3383     for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3384       kmp_root_t const *root = __kmp_root[gtid];
3385       if (root != NULL) {
3386         __kmp_printf("GTID %2d %p:\n", gtid, root);
3387         __kmp_print_structure_team("    Root Team:    ", root->r.r_root_team);
3388         __kmp_print_structure_team("    Hot Team:     ", root->r.r_hot_team);
3389         __kmp_print_structure_thread("    Uber Thread:  ",
3390                                      root->r.r_uber_thread);
3391         __kmp_printf("    Active?:      %2d\n", root->r.r_active);
3392         __kmp_printf("    In Parallel:  %2d\n",
3393                      KMP_ATOMIC_LD_RLX(&root->r.r_in_parallel));
3394         __kmp_printf("\n");
3395         __kmp_print_structure_team_accum(list, root->r.r_root_team);
3396         __kmp_print_structure_team_accum(list, root->r.r_hot_team);
3397       }
3398     }
3399   } else {
3400     __kmp_printf("Ubers array is not allocated.\n");
3401   }
3402 
3403   __kmp_printf("\n------------------------------\nTeams\n----------------------"
3404                "--------\n");
3405   while (list->next != NULL) {
3406     kmp_team_p const *team = list->entry;
3407     int i;
3408     __kmp_printf("Team %2x %p:\n", team->t.t_id, team);
3409     __kmp_print_structure_team("    Parent Team:      ", team->t.t_parent);
3410     __kmp_printf("    Primary TID:      %2d\n", team->t.t_master_tid);
3411     __kmp_printf("    Max threads:      %2d\n", team->t.t_max_nproc);
3412     __kmp_printf("    Levels of serial: %2d\n", team->t.t_serialized);
3413     __kmp_printf("    Number threads:   %2d\n", team->t.t_nproc);
3414     for (i = 0; i < team->t.t_nproc; ++i) {
3415       __kmp_printf("    Thread %2d:      ", i);
3416       __kmp_print_structure_thread("", team->t.t_threads[i]);
3417     }
3418     __kmp_print_structure_team("    Next in pool:     ", team->t.t_next_pool);
3419     __kmp_printf("\n");
3420     list = list->next;
3421   }
3422 
3423   // Print out __kmp_thread_pool and __kmp_team_pool.
3424   __kmp_printf("\n------------------------------\nPools\n----------------------"
3425                "--------\n");
3426   __kmp_print_structure_thread("Thread pool:          ",
3427                                CCAST(kmp_info_t *, __kmp_thread_pool));
3428   __kmp_print_structure_team("Team pool:            ",
3429                              CCAST(kmp_team_t *, __kmp_team_pool));
3430   __kmp_printf("\n");
3431 
3432   // Free team list.
3433   while (list != NULL) {
3434     kmp_team_list_item_t *item = list;
3435     list = list->next;
3436     KMP_INTERNAL_FREE(item);
3437   }
3438 }
3439 
3440 #endif
3441 
//---------------------------------------------------------------------------
//  Stuff for per-thread fast random number generator
//  Table of primes
//  __kmp_init_random() picks one of these 64 values, indexed by the thread's
//  tid modulo the table size, as that thread's generator multiplier (th_a).
static const unsigned __kmp_primes[] = {
    0x9e3779b1, 0xffe6cc59, 0x2109f6dd, 0x43977ab5, 0xba5703f5, 0xb495a877,
    0xe1626741, 0x79695e6b, 0xbc98c09f, 0xd5bee2b3, 0x287488f9, 0x3af18231,
    0x9677cd4d, 0xbe3a6929, 0xadc6a877, 0xdcf0674b, 0xbe4d6fe9, 0x5f15e201,
    0x99afc3fd, 0xf3f16801, 0xe222cfff, 0x24ba5fdb, 0x0620452d, 0x79f149e3,
    0xc8b93f49, 0x972702cd, 0xb07dd827, 0x6c97d5ed, 0x085a3d61, 0x46eb5ea7,
    0x3d9910ed, 0x2e687b5b, 0x29609227, 0x6eb081f1, 0x0954c4e1, 0x9d114db9,
    0x542acfa9, 0xb3e6bd7b, 0x0742d917, 0xe9f3ffa7, 0x54581edb, 0xf2480f45,
    0x0bb9288f, 0xef1affc7, 0x85fa0ca7, 0x3ccc14db, 0xe6baf34b, 0x343377f7,
    0x5ca19031, 0xe6d9293b, 0xf0a9f391, 0x5d2e980b, 0xfc411073, 0xc3749363,
    0xb892d829, 0x3549366b, 0x629750ad, 0xb98294e5, 0x892d9483, 0xc235baf3,
    0x3d2402a3, 0x6bdef3c9, 0xbec333cd, 0x40c9520f};
3457 
3458 //---------------------------------------------------------------------------
3459 //  __kmp_get_random: Get a random number using a linear congruential method.
3460 unsigned short __kmp_get_random(kmp_info_t *thread) {
3461   unsigned x = thread->th.th_x;
3462   unsigned short r = (unsigned short)(x >> 16);
3463 
3464   thread->th.th_x = x * thread->th.th_a + 1;
3465 
3466   KA_TRACE(30, ("__kmp_get_random: THREAD: %d, RETURN: %u\n",
3467                 thread->th.th_info.ds.ds_tid, r));
3468 
3469   return r;
3470 }
3471 //--------------------------------------------------------
3472 // __kmp_init_random: Initialize a random number generator
3473 void __kmp_init_random(kmp_info_t *thread) {
3474   unsigned seed = thread->th.th_info.ds.ds_tid;
3475 
3476   thread->th.th_a =
3477       __kmp_primes[seed % (sizeof(__kmp_primes) / sizeof(__kmp_primes[0]))];
3478   thread->th.th_x = (seed + 1) * thread->th.th_a + 1;
3479   KA_TRACE(30,
3480            ("__kmp_init_random: THREAD: %u; A: %u\n", seed, thread->th.th_a));
3481 }
3482 
3483 #if KMP_OS_WINDOWS
3484 /* reclaim array entries for root threads that are already dead, returns number
3485  * reclaimed */
3486 static int __kmp_reclaim_dead_roots(void) {
3487   int i, r = 0;
3488 
3489   for (i = 0; i < __kmp_threads_capacity; ++i) {
3490     if (KMP_UBER_GTID(i) &&
3491         !__kmp_still_running((kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[i])) &&
3492         !__kmp_root[i]
3493              ->r.r_active) { // AC: reclaim only roots died in non-active state
3494       r += __kmp_unregister_root_other_thread(i);
3495     }
3496   }
3497   return r;
3498 }
3499 #endif
3500 
/* This function attempts to create free entries in __kmp_threads and
   __kmp_root, and returns the number of free entries generated.

   For Windows* OS static library, the first mechanism used is to reclaim array
   entries for root threads that are already dead.

   On all platforms, expansion is attempted on the arrays __kmp_threads and
   __kmp_root, with appropriate update to __kmp_threads_capacity. Array
   capacity is increased by doubling with clipping to __kmp_sys_max_nth (see
   the loop below). If the new capacity exceeds __kmp_tp_capacity, the
   threadprivate cache is resized when it has been created, otherwise
   __kmp_tp_capacity is simply raised. Synchronization with
   __kmpc_threadprivate_cached is done using __kmp_tp_cached_lock.

   After any dead root reclamation, if the clipping value allows array expansion
   to result in the generation of a total of nNeed free slots, the function does
   that expansion. If not, nothing is done beyond the possible initial root
   thread reclamation.

   If any argument is negative, the behavior is undefined. */
static int __kmp_expand_threads(int nNeed) {
  int added = 0; // number of free entries generated so far
  int minimumRequiredCapacity;
  int newCapacity;
  kmp_info_t **newThreads;
  kmp_root_t **newRoot;

  // All calls to __kmp_expand_threads should be under __kmp_forkjoin_lock, so
  // resizing __kmp_threads does not need additional protection if foreign
  // threads are present

#if KMP_OS_WINDOWS && !KMP_DYNAMIC_LIB
  /* only for Windows static library */
  /* reclaim array entries for root threads that are already dead */
  added = __kmp_reclaim_dead_roots();

  // Entries reclaimed above count towards the request.
  if (nNeed) {
    nNeed -= added;
    if (nNeed < 0)
      nNeed = 0;
  }
#endif
  // Nothing (more) is needed: report what reclamation produced, if anything.
  if (nNeed <= 0)
    return added;

  // Note that __kmp_threads_capacity is not bounded by __kmp_max_nth. If
  // __kmp_max_nth is set to some value less than __kmp_sys_max_nth by the
  // user via KMP_DEVICE_THREAD_LIMIT, then __kmp_threads_capacity may become
  // > __kmp_max_nth in one of two ways:
  //
  // 1) The initialization thread (gtid = 0) exits.  __kmp_threads[0]
  //    may not be reused by another thread, so we may need to increase
  //    __kmp_threads_capacity to __kmp_max_nth + 1.
  //
  // 2) New foreign root(s) are encountered.  We always register new foreign
  //    roots. This may cause a smaller # of threads to be allocated at
  //    subsequent parallel regions, but the worker threads hang around (and
  //    eventually go to sleep) and need slots in the __kmp_threads[] array.
  //
  // Anyway, that is the reason for moving the check to see if
  // __kmp_max_nth was exceeded into __kmp_reserve_threads()
  // instead of having it performed here. -BB

  KMP_DEBUG_ASSERT(__kmp_sys_max_nth >= __kmp_threads_capacity);

  /* compute expansion headroom to check if we can expand */
  if (__kmp_sys_max_nth - __kmp_threads_capacity < nNeed) {
    /* possible expansion too small -- give up */
    return added;
  }
  minimumRequiredCapacity = __kmp_threads_capacity + nNeed;

  // Double the capacity until it covers the request, clipping to
  // __kmp_sys_max_nth once doubling would go past it.
  newCapacity = __kmp_threads_capacity;
  do {
    newCapacity = newCapacity <= (__kmp_sys_max_nth >> 1) ? (newCapacity << 1)
                                                          : __kmp_sys_max_nth;
  } while (newCapacity < minimumRequiredCapacity);
  // One allocation holds both arrays: newCapacity thread pointers followed
  // directly by newCapacity root pointers.
  newThreads = (kmp_info_t **)__kmp_allocate(
      (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * newCapacity + CACHE_LINE);
  newRoot =
      (kmp_root_t **)((char *)newThreads + sizeof(kmp_info_t *) * newCapacity);
  // Carry the existing entries over into the new arrays.
  KMP_MEMCPY(newThreads, __kmp_threads,
             __kmp_threads_capacity * sizeof(kmp_info_t *));
  KMP_MEMCPY(newRoot, __kmp_root,
             __kmp_threads_capacity * sizeof(kmp_root_t *));

  // Install the new arrays first, then release the old thread array, and only
  // then store the larger capacity.
  // NOTE(review): only the old __kmp_threads pointer is freed; presumably the
  // old __kmp_root array lives in that same allocation, as it does for the
  // arrays created here -- confirm against the initial allocation.
  kmp_info_t **temp_threads = __kmp_threads;
  *(kmp_info_t * *volatile *)&__kmp_threads = newThreads;
  *(kmp_root_t * *volatile *)&__kmp_root = newRoot;
  __kmp_free(temp_threads);
  added += newCapacity - __kmp_threads_capacity;
  *(volatile int *)&__kmp_threads_capacity = newCapacity;

  // The first test is made without the lock; the condition is re-tested once
  // __kmp_tp_cached_lock is held.
  if (newCapacity > __kmp_tp_capacity) {
    __kmp_acquire_bootstrap_lock(&__kmp_tp_cached_lock);
    if (__kmp_tp_cached && newCapacity > __kmp_tp_capacity) {
      __kmp_threadprivate_resize_cache(newCapacity);
    } else { // increase __kmp_tp_capacity to correspond with kmp_threads size
      *(volatile int *)&__kmp_tp_capacity = newCapacity;
    }
    __kmp_release_bootstrap_lock(&__kmp_tp_cached_lock);
  }

  return added;
}
3604 
/* Register the current thread as a root thread and obtain our gtid. We must
   have the __kmp_initz_lock held at this point. The argument is TRUE only if
   we are the thread that calls from __kmp_do_serial_initialize() */
3608 int __kmp_register_root(int initial_thread) {
3609   kmp_info_t *root_thread;
3610   kmp_root_t *root;
3611   int gtid;
3612   int capacity;
3613   __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
3614   KA_TRACE(20, ("__kmp_register_root: entered\n"));
3615   KMP_MB();
3616 
3617   /* 2007-03-02:
3618      If initial thread did not invoke OpenMP RTL yet, and this thread is not an
3619      initial one, "__kmp_all_nth >= __kmp_threads_capacity" condition does not
3620      work as expected -- it may return false (that means there is at least one
3621      empty slot in __kmp_threads array), but it is possible the only free slot
3622      is #0, which is reserved for initial thread and so cannot be used for this
3623      one. Following code workarounds this bug.
3624 
3625      However, right solution seems to be not reserving slot #0 for initial
3626      thread because:
3627      (1) there is no magic in slot #0,
3628      (2) we cannot detect initial thread reliably (the first thread which does
3629         serial initialization may be not a real initial thread).
3630   */
3631   capacity = __kmp_threads_capacity;
3632   if (!initial_thread && TCR_PTR(__kmp_threads[0]) == NULL) {
3633     --capacity;
3634   }
3635 
3636   // If it is not for initializing the hidden helper team, we need to take
3637   // __kmp_hidden_helper_threads_num out of the capacity because it is included
3638   // in __kmp_threads_capacity.
3639   if (__kmp_enable_hidden_helper && !TCR_4(__kmp_init_hidden_helper_threads)) {
3640     capacity -= __kmp_hidden_helper_threads_num;
3641   }
3642 
3643   /* see if there are too many threads */
3644   if (__kmp_all_nth >= capacity && !__kmp_expand_threads(1)) {
3645     if (__kmp_tp_cached) {
3646       __kmp_fatal(KMP_MSG(CantRegisterNewThread),
3647                   KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity),
3648                   KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null);
3649     } else {
3650       __kmp_fatal(KMP_MSG(CantRegisterNewThread), KMP_HNT(SystemLimitOnThreads),
3651                   __kmp_msg_null);
3652     }
3653   }
3654 
3655   // When hidden helper task is enabled, __kmp_threads is organized as follows:
3656   // 0: initial thread, also a regular OpenMP thread.
3657   // [1, __kmp_hidden_helper_threads_num]: slots for hidden helper threads.
3658   // [__kmp_hidden_helper_threads_num + 1, __kmp_threads_capacity): slots for
3659   // regular OpenMP threads.
3660   if (TCR_4(__kmp_init_hidden_helper_threads)) {
3661     // Find an available thread slot for hidden helper thread. Slots for hidden
3662     // helper threads start from 1 to __kmp_hidden_helper_threads_num.
3663     for (gtid = 1; TCR_PTR(__kmp_threads[gtid]) != NULL &&
3664                    gtid <= __kmp_hidden_helper_threads_num;
3665          gtid++)
3666       ;
3667     KMP_ASSERT(gtid <= __kmp_hidden_helper_threads_num);
3668     KA_TRACE(1, ("__kmp_register_root: found slot in threads array for "
3669                  "hidden helper thread: T#%d\n",
3670                  gtid));
3671   } else {
3672     /* find an available thread slot */
3673     // Don't reassign the zero slot since we need that to only be used by
3674     // initial thread. Slots for hidden helper threads should also be skipped.
3675     if (initial_thread && TCR_PTR(__kmp_threads[0]) == NULL) {
3676       gtid = 0;
3677     } else {
3678       for (gtid = __kmp_hidden_helper_threads_num + 1;
3679            TCR_PTR(__kmp_threads[gtid]) != NULL; gtid++)
3680         ;
3681     }
3682     KA_TRACE(
3683         1, ("__kmp_register_root: found slot in threads array: T#%d\n", gtid));
3684     KMP_ASSERT(gtid < __kmp_threads_capacity);
3685   }
3686 
3687   /* update global accounting */
3688   __kmp_all_nth++;
3689   TCW_4(__kmp_nth, __kmp_nth + 1);
3690 
3691   // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search) for low
3692   // numbers of procs, and method #2 (keyed API call) for higher numbers.
3693   if (__kmp_adjust_gtid_mode) {
3694     if (__kmp_all_nth >= __kmp_tls_gtid_min) {
3695       if (TCR_4(__kmp_gtid_mode) != 2) {
3696         TCW_4(__kmp_gtid_mode, 2);
3697       }
3698     } else {
3699       if (TCR_4(__kmp_gtid_mode) != 1) {
3700         TCW_4(__kmp_gtid_mode, 1);
3701       }
3702     }
3703   }
3704 
3705 #ifdef KMP_ADJUST_BLOCKTIME
3706   /* Adjust blocktime to zero if necessary            */
3707   /* Middle initialization might not have occurred yet */
3708   if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
3709     if (__kmp_nth > __kmp_avail_proc) {
3710       __kmp_zero_bt = TRUE;
3711     }
3712   }
3713 #endif /* KMP_ADJUST_BLOCKTIME */
3714 
3715   /* setup this new hierarchy */
3716   if (!(root = __kmp_root[gtid])) {
3717     root = __kmp_root[gtid] = (kmp_root_t *)__kmp_allocate(sizeof(kmp_root_t));
3718     KMP_DEBUG_ASSERT(!root->r.r_root_team);
3719   }
3720 
3721 #if KMP_STATS_ENABLED
3722   // Initialize stats as soon as possible (right after gtid assignment).
3723   __kmp_stats_thread_ptr = __kmp_stats_list->push_back(gtid);
3724   __kmp_stats_thread_ptr->startLife();
3725   KMP_SET_THREAD_STATE(SERIAL_REGION);
3726   KMP_INIT_PARTITIONED_TIMERS(OMP_serial);
3727 #endif
3728   __kmp_initialize_root(root);
3729 
3730   /* setup new root thread structure */
3731   if (root->r.r_uber_thread) {
3732     root_thread = root->r.r_uber_thread;
3733   } else {
3734     root_thread = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t));
3735     if (__kmp_storage_map) {
3736       __kmp_print_thread_storage_map(root_thread, gtid);
3737     }
3738     root_thread->th.th_info.ds.ds_gtid = gtid;
3739 #if OMPT_SUPPORT
3740     root_thread->th.ompt_thread_info.thread_data = ompt_data_none;
3741 #endif
3742     root_thread->th.th_root = root;
3743     if (__kmp_env_consistency_check) {
3744       root_thread->th.th_cons = __kmp_allocate_cons_stack(gtid);
3745     }
3746 #if USE_FAST_MEMORY
3747     __kmp_initialize_fast_memory(root_thread);
3748 #endif /* USE_FAST_MEMORY */
3749 
3750 #if KMP_USE_BGET
3751     KMP_DEBUG_ASSERT(root_thread->th.th_local.bget_data == NULL);
3752     __kmp_initialize_bget(root_thread);
3753 #endif
3754     __kmp_init_random(root_thread); // Initialize random number generator
3755   }
3756 
3757   /* setup the serial team held in reserve by the root thread */
3758   if (!root_thread->th.th_serial_team) {
3759     kmp_internal_control_t r_icvs = __kmp_get_global_icvs();
3760     KF_TRACE(10, ("__kmp_register_root: before serial_team\n"));
3761     root_thread->th.th_serial_team = __kmp_allocate_team(
3762         root, 1, 1,
3763 #if OMPT_SUPPORT
3764         ompt_data_none, // root parallel id
3765 #endif
3766         proc_bind_default, &r_icvs, 0 USE_NESTED_HOT_ARG(NULL));
3767   }
3768   KMP_ASSERT(root_thread->th.th_serial_team);
3769   KF_TRACE(10, ("__kmp_register_root: after serial_team = %p\n",
3770                 root_thread->th.th_serial_team));
3771 
3772   /* drop root_thread into place */
3773   TCW_SYNC_PTR(__kmp_threads[gtid], root_thread);
3774 
3775   root->r.r_root_team->t.t_threads[0] = root_thread;
3776   root->r.r_hot_team->t.t_threads[0] = root_thread;
3777   root_thread->th.th_serial_team->t.t_threads[0] = root_thread;
3778   // AC: the team created in reserve, not for execution (it is unused for now).
3779   root_thread->th.th_serial_team->t.t_serialized = 0;
3780   root->r.r_uber_thread = root_thread;
3781 
3782   /* initialize the thread, get it ready to go */
3783   __kmp_initialize_info(root_thread, root->r.r_root_team, 0, gtid);
3784   TCW_4(__kmp_init_gtid, TRUE);
3785 
3786   /* prepare the primary thread for get_gtid() */
3787   __kmp_gtid_set_specific(gtid);
3788 
3789 #if USE_ITT_BUILD
3790   __kmp_itt_thread_name(gtid);
3791 #endif /* USE_ITT_BUILD */
3792 
3793 #ifdef KMP_TDATA_GTID
3794   __kmp_gtid = gtid;
3795 #endif
3796   __kmp_create_worker(gtid, root_thread, __kmp_stksize);
3797   KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() == gtid);
3798 
3799   KA_TRACE(20, ("__kmp_register_root: T#%d init T#%d(%d:%d) arrived: join=%u, "
3800                 "plain=%u\n",
3801                 gtid, __kmp_gtid_from_tid(0, root->r.r_hot_team),
3802                 root->r.r_hot_team->t.t_id, 0, KMP_INIT_BARRIER_STATE,
3803                 KMP_INIT_BARRIER_STATE));
3804   { // Initialize barrier data.
3805     int b;
3806     for (b = 0; b < bs_last_barrier; ++b) {
3807       root_thread->th.th_bar[b].bb.b_arrived = KMP_INIT_BARRIER_STATE;
3808 #if USE_DEBUGGER
3809       root_thread->th.th_bar[b].bb.b_worker_arrived = 0;
3810 #endif
3811     }
3812   }
3813   KMP_DEBUG_ASSERT(root->r.r_hot_team->t.t_bar[bs_forkjoin_barrier].b_arrived ==
3814                    KMP_INIT_BARRIER_STATE);
3815 
3816 #if KMP_AFFINITY_SUPPORTED
3817   root_thread->th.th_current_place = KMP_PLACE_UNDEFINED;
3818   root_thread->th.th_new_place = KMP_PLACE_UNDEFINED;
3819   root_thread->th.th_first_place = KMP_PLACE_UNDEFINED;
3820   root_thread->th.th_last_place = KMP_PLACE_UNDEFINED;
3821 #endif /* KMP_AFFINITY_SUPPORTED */
3822   root_thread->th.th_def_allocator = __kmp_def_allocator;
3823   root_thread->th.th_prev_level = 0;
3824   root_thread->th.th_prev_num_threads = 1;
3825 
3826   kmp_cg_root_t *tmp = (kmp_cg_root_t *)__kmp_allocate(sizeof(kmp_cg_root_t));
3827   tmp->cg_root = root_thread;
3828   tmp->cg_thread_limit = __kmp_cg_max_nth;
3829   tmp->cg_nthreads = 1;
3830   KA_TRACE(100, ("__kmp_register_root: Thread %p created node %p with"
3831                  " cg_nthreads init to 1\n",
3832                  root_thread, tmp));
3833   tmp->up = NULL;
3834   root_thread->th.th_cg_roots = tmp;
3835 
3836   __kmp_root_counter++;
3837 
3838 #if OMPT_SUPPORT
3839   if (!initial_thread && ompt_enabled.enabled) {
3840 
3841     kmp_info_t *root_thread = ompt_get_thread();
3842 
3843     ompt_set_thread_state(root_thread, ompt_state_overhead);
3844 
3845     if (ompt_enabled.ompt_callback_thread_begin) {
3846       ompt_callbacks.ompt_callback(ompt_callback_thread_begin)(
3847           ompt_thread_initial, __ompt_get_thread_data_internal());
3848     }
3849     ompt_data_t *task_data;
3850     ompt_data_t *parallel_data;
3851     __ompt_get_task_info_internal(0, NULL, &task_data, NULL, &parallel_data,
3852                                   NULL);
3853     if (ompt_enabled.ompt_callback_implicit_task) {
3854       ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
3855           ompt_scope_begin, parallel_data, task_data, 1, 1, ompt_task_initial);
3856     }
3857 
3858     ompt_set_thread_state(root_thread, ompt_state_work_serial);
3859   }
3860 #endif
3861 #if OMPD_SUPPORT
3862   if (ompd_state & OMPD_ENABLE_BP)
3863     ompd_bp_thread_begin();
3864 #endif
3865 
3866   KMP_MB();
3867   __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
3868 
3869   return gtid;
3870 }
3871 
3872 #if KMP_NESTED_HOT_TEAMS
3873 static int __kmp_free_hot_teams(kmp_root_t *root, kmp_info_t *thr, int level,
3874                                 const int max_level) {
3875   int i, n, nth;
3876   kmp_hot_team_ptr_t *hot_teams = thr->th.th_hot_teams;
3877   if (!hot_teams || !hot_teams[level].hot_team) {
3878     return 0;
3879   }
3880   KMP_DEBUG_ASSERT(level < max_level);
3881   kmp_team_t *team = hot_teams[level].hot_team;
3882   nth = hot_teams[level].hot_team_nth;
3883   n = nth - 1; // primary thread is not freed
3884   if (level < max_level - 1) {
3885     for (i = 0; i < nth; ++i) {
3886       kmp_info_t *th = team->t.t_threads[i];
3887       n += __kmp_free_hot_teams(root, th, level + 1, max_level);
3888       if (i > 0 && th->th.th_hot_teams) {
3889         __kmp_free(th->th.th_hot_teams);
3890         th->th.th_hot_teams = NULL;
3891       }
3892     }
3893   }
3894   __kmp_free_team(root, team, NULL);
3895   return n;
3896 }
3897 #endif
3898 
3899 // Resets a root thread and clear its root and hot teams.
3900 // Returns the number of __kmp_threads entries directly and indirectly freed.
3901 static int __kmp_reset_root(int gtid, kmp_root_t *root) {
3902   kmp_team_t *root_team = root->r.r_root_team;
3903   kmp_team_t *hot_team = root->r.r_hot_team;
3904   int n = hot_team->t.t_nproc;
3905   int i;
3906 
3907   KMP_DEBUG_ASSERT(!root->r.r_active);
3908 
3909   root->r.r_root_team = NULL;
3910   root->r.r_hot_team = NULL;
3911   // __kmp_free_team() does not free hot teams, so we have to clear r_hot_team
3912   // before call to __kmp_free_team().
3913   __kmp_free_team(root, root_team USE_NESTED_HOT_ARG(NULL));
3914 #if KMP_NESTED_HOT_TEAMS
3915   if (__kmp_hot_teams_max_level >
3916       0) { // need to free nested hot teams and their threads if any
3917     for (i = 0; i < hot_team->t.t_nproc; ++i) {
3918       kmp_info_t *th = hot_team->t.t_threads[i];
3919       if (__kmp_hot_teams_max_level > 1) {
3920         n += __kmp_free_hot_teams(root, th, 1, __kmp_hot_teams_max_level);
3921       }
3922       if (th->th.th_hot_teams) {
3923         __kmp_free(th->th.th_hot_teams);
3924         th->th.th_hot_teams = NULL;
3925       }
3926     }
3927   }
3928 #endif
3929   __kmp_free_team(root, hot_team USE_NESTED_HOT_ARG(NULL));
3930 
3931   // Before we can reap the thread, we need to make certain that all other
3932   // threads in the teams that had this root as ancestor have stopped trying to
3933   // steal tasks.
3934   if (__kmp_tasking_mode != tskm_immediate_exec) {
3935     __kmp_wait_to_unref_task_teams();
3936   }
3937 
3938 #if KMP_OS_WINDOWS
3939   /* Close Handle of root duplicated in __kmp_create_worker (tr #62919) */
3940   KA_TRACE(
3941       10, ("__kmp_reset_root: free handle, th = %p, handle = %" KMP_UINTPTR_SPEC
3942            "\n",
3943            (LPVOID) & (root->r.r_uber_thread->th),
3944            root->r.r_uber_thread->th.th_info.ds.ds_thread));
3945   __kmp_free_handle(root->r.r_uber_thread->th.th_info.ds.ds_thread);
3946 #endif /* KMP_OS_WINDOWS */
3947 
3948 #if OMPD_SUPPORT
3949   if (ompd_state & OMPD_ENABLE_BP)
3950     ompd_bp_thread_end();
3951 #endif
3952 
3953 #if OMPT_SUPPORT
3954   ompt_data_t *task_data;
3955   ompt_data_t *parallel_data;
3956   __ompt_get_task_info_internal(0, NULL, &task_data, NULL, &parallel_data,
3957                                 NULL);
3958   if (ompt_enabled.ompt_callback_implicit_task) {
3959     ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
3960         ompt_scope_end, parallel_data, task_data, 0, 1, ompt_task_initial);
3961   }
3962   if (ompt_enabled.ompt_callback_thread_end) {
3963     ompt_callbacks.ompt_callback(ompt_callback_thread_end)(
3964         &(root->r.r_uber_thread->th.ompt_thread_info.thread_data));
3965   }
3966 #endif
3967 
3968   TCW_4(__kmp_nth,
3969         __kmp_nth - 1); // __kmp_reap_thread will decrement __kmp_all_nth.
3970   i = root->r.r_uber_thread->th.th_cg_roots->cg_nthreads--;
3971   KA_TRACE(100, ("__kmp_reset_root: Thread %p decrement cg_nthreads on node %p"
3972                  " to %d\n",
3973                  root->r.r_uber_thread, root->r.r_uber_thread->th.th_cg_roots,
3974                  root->r.r_uber_thread->th.th_cg_roots->cg_nthreads));
3975   if (i == 1) {
3976     // need to free contention group structure
3977     KMP_DEBUG_ASSERT(root->r.r_uber_thread ==
3978                      root->r.r_uber_thread->th.th_cg_roots->cg_root);
3979     KMP_DEBUG_ASSERT(root->r.r_uber_thread->th.th_cg_roots->up == NULL);
3980     __kmp_free(root->r.r_uber_thread->th.th_cg_roots);
3981     root->r.r_uber_thread->th.th_cg_roots = NULL;
3982   }
3983   __kmp_reap_thread(root->r.r_uber_thread, 1);
3984 
3985   // We canot put root thread to __kmp_thread_pool, so we have to reap it
3986   // instead of freeing.
3987   root->r.r_uber_thread = NULL;
3988   /* mark root as no longer in use */
3989   root->r.r_begin = FALSE;
3990 
3991   return n;
3992 }
3993 
3994 void __kmp_unregister_root_current_thread(int gtid) {
3995   KA_TRACE(1, ("__kmp_unregister_root_current_thread: enter T#%d\n", gtid));
3996   /* this lock should be ok, since unregister_root_current_thread is never
3997      called during an abort, only during a normal close. furthermore, if you
3998      have the forkjoin lock, you should never try to get the initz lock */
3999   __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
4000   if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
4001     KC_TRACE(10, ("__kmp_unregister_root_current_thread: already finished, "
4002                   "exiting T#%d\n",
4003                   gtid));
4004     __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
4005     return;
4006   }
4007   kmp_root_t *root = __kmp_root[gtid];
4008 
4009   KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
4010   KMP_ASSERT(KMP_UBER_GTID(gtid));
4011   KMP_ASSERT(root == __kmp_threads[gtid]->th.th_root);
4012   KMP_ASSERT(root->r.r_active == FALSE);
4013 
4014   KMP_MB();
4015 
4016   kmp_info_t *thread = __kmp_threads[gtid];
4017   kmp_team_t *team = thread->th.th_team;
4018   kmp_task_team_t *task_team = thread->th.th_task_team;
4019 
4020   // we need to wait for the proxy tasks before finishing the thread
4021   if (task_team != NULL && task_team->tt.tt_found_proxy_tasks) {
4022 #if OMPT_SUPPORT
4023     // the runtime is shutting down so we won't report any events
4024     thread->th.ompt_thread_info.state = ompt_state_undefined;
4025 #endif
4026     __kmp_task_team_wait(thread, team USE_ITT_BUILD_ARG(NULL));
4027   }
4028 
4029   __kmp_reset_root(gtid, root);
4030 
4031   KMP_MB();
4032   KC_TRACE(10,
4033            ("__kmp_unregister_root_current_thread: T#%d unregistered\n", gtid));
4034 
4035   __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
4036 }
4037 
4038 #if KMP_OS_WINDOWS
4039 /* __kmp_forkjoin_lock must be already held
4040    Unregisters a root thread that is not the current thread.  Returns the number
4041    of __kmp_threads entries freed as a result. */
4042 static int __kmp_unregister_root_other_thread(int gtid) {
4043   kmp_root_t *root = __kmp_root[gtid];
4044   int r;
4045 
4046   KA_TRACE(1, ("__kmp_unregister_root_other_thread: enter T#%d\n", gtid));
4047   KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
4048   KMP_ASSERT(KMP_UBER_GTID(gtid));
4049   KMP_ASSERT(root == __kmp_threads[gtid]->th.th_root);
4050   KMP_ASSERT(root->r.r_active == FALSE);
4051 
4052   r = __kmp_reset_root(gtid, root);
4053   KC_TRACE(10,
4054            ("__kmp_unregister_root_other_thread: T#%d unregistered\n", gtid));
4055   return r;
4056 }
4057 #endif
4058 
4059 #if KMP_DEBUG
4060 void __kmp_task_info() {
4061 
4062   kmp_int32 gtid = __kmp_entry_gtid();
4063   kmp_int32 tid = __kmp_tid_from_gtid(gtid);
4064   kmp_info_t *this_thr = __kmp_threads[gtid];
4065   kmp_team_t *steam = this_thr->th.th_serial_team;
4066   kmp_team_t *team = this_thr->th.th_team;
4067 
4068   __kmp_printf(
4069       "__kmp_task_info: gtid=%d tid=%d t_thread=%p team=%p steam=%p curtask=%p "
4070       "ptask=%p\n",
4071       gtid, tid, this_thr, team, steam, this_thr->th.th_current_task,
4072       team->t.t_implicit_task_taskdata[tid].td_parent);
4073 }
4074 #endif // KMP_DEBUG
4075 
4076 /* TODO optimize with one big memclr, take out what isn't needed, split
4077    responsibility to workers as much as possible, and delay initialization of
4078    features as much as possible  */
4079 static void __kmp_initialize_info(kmp_info_t *this_thr, kmp_team_t *team,
4080                                   int tid, int gtid) {
4081   /* this_thr->th.th_info.ds.ds_gtid is setup in
4082      kmp_allocate_thread/create_worker.
4083      this_thr->th.th_serial_team is setup in __kmp_allocate_thread */
4084   KMP_DEBUG_ASSERT(this_thr != NULL);
4085   KMP_DEBUG_ASSERT(this_thr->th.th_serial_team);
4086   KMP_DEBUG_ASSERT(team);
4087   KMP_DEBUG_ASSERT(team->t.t_threads);
4088   KMP_DEBUG_ASSERT(team->t.t_dispatch);
4089   kmp_info_t *master = team->t.t_threads[0];
4090   KMP_DEBUG_ASSERT(master);
4091   KMP_DEBUG_ASSERT(master->th.th_root);
4092 
4093   KMP_MB();
4094 
4095   TCW_SYNC_PTR(this_thr->th.th_team, team);
4096 
4097   this_thr->th.th_info.ds.ds_tid = tid;
4098   this_thr->th.th_set_nproc = 0;
4099   if (__kmp_tasking_mode != tskm_immediate_exec)
4100     // When tasking is possible, threads are not safe to reap until they are
4101     // done tasking; this will be set when tasking code is exited in wait
4102     this_thr->th.th_reap_state = KMP_NOT_SAFE_TO_REAP;
4103   else // no tasking --> always safe to reap
4104     this_thr->th.th_reap_state = KMP_SAFE_TO_REAP;
4105   this_thr->th.th_set_proc_bind = proc_bind_default;
4106 #if KMP_AFFINITY_SUPPORTED
4107   this_thr->th.th_new_place = this_thr->th.th_current_place;
4108 #endif
4109   this_thr->th.th_root = master->th.th_root;
4110 
4111   /* setup the thread's cache of the team structure */
4112   this_thr->th.th_team_nproc = team->t.t_nproc;
4113   this_thr->th.th_team_master = master;
4114   this_thr->th.th_team_serialized = team->t.t_serialized;
4115   TCW_PTR(this_thr->th.th_sleep_loc, NULL);
4116 
4117   KMP_DEBUG_ASSERT(team->t.t_implicit_task_taskdata);
4118 
4119   KF_TRACE(10, ("__kmp_initialize_info1: T#%d:%d this_thread=%p curtask=%p\n",
4120                 tid, gtid, this_thr, this_thr->th.th_current_task));
4121 
4122   __kmp_init_implicit_task(this_thr->th.th_team_master->th.th_ident, this_thr,
4123                            team, tid, TRUE);
4124 
4125   KF_TRACE(10, ("__kmp_initialize_info2: T#%d:%d this_thread=%p curtask=%p\n",
4126                 tid, gtid, this_thr, this_thr->th.th_current_task));
4127   // TODO: Initialize ICVs from parent; GEH - isn't that already done in
4128   // __kmp_initialize_team()?
4129 
4130   /* TODO no worksharing in speculative threads */
4131   this_thr->th.th_dispatch = &team->t.t_dispatch[tid];
4132 
4133   this_thr->th.th_local.this_construct = 0;
4134 
4135   if (!this_thr->th.th_pri_common) {
4136     this_thr->th.th_pri_common =
4137         (struct common_table *)__kmp_allocate(sizeof(struct common_table));
4138     if (__kmp_storage_map) {
4139       __kmp_print_storage_map_gtid(
4140           gtid, this_thr->th.th_pri_common, this_thr->th.th_pri_common + 1,
4141           sizeof(struct common_table), "th_%d.th_pri_common\n", gtid);
4142     }
4143     this_thr->th.th_pri_head = NULL;
4144   }
4145 
4146   if (this_thr != master && // Primary thread's CG root is initialized elsewhere
4147       this_thr->th.th_cg_roots != master->th.th_cg_roots) { // CG root not set
4148     // Make new thread's CG root same as primary thread's
4149     KMP_DEBUG_ASSERT(master->th.th_cg_roots);
4150     kmp_cg_root_t *tmp = this_thr->th.th_cg_roots;
4151     if (tmp) {
4152       // worker changes CG, need to check if old CG should be freed
4153       int i = tmp->cg_nthreads--;
4154       KA_TRACE(100, ("__kmp_initialize_info: Thread %p decrement cg_nthreads"
4155                      " on node %p of thread %p to %d\n",
4156                      this_thr, tmp, tmp->cg_root, tmp->cg_nthreads));
4157       if (i == 1) {
4158         __kmp_free(tmp); // last thread left CG --> free it
4159       }
4160     }
4161     this_thr->th.th_cg_roots = master->th.th_cg_roots;
4162     // Increment new thread's CG root's counter to add the new thread
4163     this_thr->th.th_cg_roots->cg_nthreads++;
4164     KA_TRACE(100, ("__kmp_initialize_info: Thread %p increment cg_nthreads on"
4165                    " node %p of thread %p to %d\n",
4166                    this_thr, this_thr->th.th_cg_roots,
4167                    this_thr->th.th_cg_roots->cg_root,
4168                    this_thr->th.th_cg_roots->cg_nthreads));
4169     this_thr->th.th_current_task->td_icvs.thread_limit =
4170         this_thr->th.th_cg_roots->cg_thread_limit;
4171   }
4172 
4173   /* Initialize dynamic dispatch */
4174   {
4175     volatile kmp_disp_t *dispatch = this_thr->th.th_dispatch;
4176     // Use team max_nproc since this will never change for the team.
4177     size_t disp_size =
4178         sizeof(dispatch_private_info_t) *
4179         (team->t.t_max_nproc == 1 ? 1 : __kmp_dispatch_num_buffers);
4180     KD_TRACE(10, ("__kmp_initialize_info: T#%d max_nproc: %d\n", gtid,
4181                   team->t.t_max_nproc));
4182     KMP_ASSERT(dispatch);
4183     KMP_DEBUG_ASSERT(team->t.t_dispatch);
4184     KMP_DEBUG_ASSERT(dispatch == &team->t.t_dispatch[tid]);
4185 
4186     dispatch->th_disp_index = 0;
4187     dispatch->th_doacross_buf_idx = 0;
4188     if (!dispatch->th_disp_buffer) {
4189       dispatch->th_disp_buffer =
4190           (dispatch_private_info_t *)__kmp_allocate(disp_size);
4191 
4192       if (__kmp_storage_map) {
4193         __kmp_print_storage_map_gtid(
4194             gtid, &dispatch->th_disp_buffer[0],
4195             &dispatch->th_disp_buffer[team->t.t_max_nproc == 1
4196                                           ? 1
4197                                           : __kmp_dispatch_num_buffers],
4198             disp_size,
4199             "th_%d.th_dispatch.th_disp_buffer "
4200             "(team_%d.t_dispatch[%d].th_disp_buffer)",
4201             gtid, team->t.t_id, gtid);
4202       }
4203     } else {
4204       memset(&dispatch->th_disp_buffer[0], '\0', disp_size);
4205     }
4206 
4207     dispatch->th_dispatch_pr_current = 0;
4208     dispatch->th_dispatch_sh_current = 0;
4209 
4210     dispatch->th_deo_fcn = 0; /* ORDERED     */
4211     dispatch->th_dxo_fcn = 0; /* END ORDERED */
4212   }
4213 
4214   this_thr->th.th_next_pool = NULL;
4215 
4216   if (!this_thr->th.th_task_state_memo_stack) {
4217     size_t i;
4218     this_thr->th.th_task_state_memo_stack =
4219         (kmp_uint8 *)__kmp_allocate(4 * sizeof(kmp_uint8));
4220     this_thr->th.th_task_state_top = 0;
4221     this_thr->th.th_task_state_stack_sz = 4;
4222     for (i = 0; i < this_thr->th.th_task_state_stack_sz;
4223          ++i) // zero init the stack
4224       this_thr->th.th_task_state_memo_stack[i] = 0;
4225   }
4226 
4227   KMP_DEBUG_ASSERT(!this_thr->th.th_spin_here);
4228   KMP_DEBUG_ASSERT(this_thr->th.th_next_waiting == 0);
4229 
4230   KMP_MB();
4231 }
4232 
4233 /* allocate a new thread for the requesting team. this is only called from
4234    within a forkjoin critical section. we will first try to get an available
4235    thread from the thread pool. if none is available, we will fork a new one
4236    assuming we are able to create a new one. this should be assured, as the
4237    caller should check on this first. */
4238 kmp_info_t *__kmp_allocate_thread(kmp_root_t *root, kmp_team_t *team,
4239                                   int new_tid) {
4240   kmp_team_t *serial_team;
4241   kmp_info_t *new_thr;
4242   int new_gtid;
4243 
4244   KA_TRACE(20, ("__kmp_allocate_thread: T#%d\n", __kmp_get_gtid()));
4245   KMP_DEBUG_ASSERT(root && team);
4246 #if !KMP_NESTED_HOT_TEAMS
4247   KMP_DEBUG_ASSERT(KMP_MASTER_GTID(__kmp_get_gtid()));
4248 #endif
4249   KMP_MB();
4250 
4251   /* first, try to get one from the thread pool */
4252   if (__kmp_thread_pool) {
4253     new_thr = CCAST(kmp_info_t *, __kmp_thread_pool);
4254     __kmp_thread_pool = (volatile kmp_info_t *)new_thr->th.th_next_pool;
4255     if (new_thr == __kmp_thread_pool_insert_pt) {
4256       __kmp_thread_pool_insert_pt = NULL;
4257     }
4258     TCW_4(new_thr->th.th_in_pool, FALSE);
4259     __kmp_suspend_initialize_thread(new_thr);
4260     __kmp_lock_suspend_mx(new_thr);
4261     if (new_thr->th.th_active_in_pool == TRUE) {
4262       KMP_DEBUG_ASSERT(new_thr->th.th_active == TRUE);
4263       KMP_ATOMIC_DEC(&__kmp_thread_pool_active_nth);
4264       new_thr->th.th_active_in_pool = FALSE;
4265     }
4266     __kmp_unlock_suspend_mx(new_thr);
4267 
4268     KA_TRACE(20, ("__kmp_allocate_thread: T#%d using thread T#%d\n",
4269                   __kmp_get_gtid(), new_thr->th.th_info.ds.ds_gtid));
4270     KMP_ASSERT(!new_thr->th.th_team);
4271     KMP_DEBUG_ASSERT(__kmp_nth < __kmp_threads_capacity);
4272 
4273     /* setup the thread structure */
4274     __kmp_initialize_info(new_thr, team, new_tid,
4275                           new_thr->th.th_info.ds.ds_gtid);
4276     KMP_DEBUG_ASSERT(new_thr->th.th_serial_team);
4277 
4278     TCW_4(__kmp_nth, __kmp_nth + 1);
4279 
4280     new_thr->th.th_task_state = 0;
4281     new_thr->th.th_task_state_top = 0;
4282     new_thr->th.th_task_state_stack_sz = 4;
4283 
4284 #ifdef KMP_ADJUST_BLOCKTIME
4285     /* Adjust blocktime back to zero if necessary */
4286     /* Middle initialization might not have occurred yet */
4287     if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
4288       if (__kmp_nth > __kmp_avail_proc) {
4289         __kmp_zero_bt = TRUE;
4290       }
4291     }
4292 #endif /* KMP_ADJUST_BLOCKTIME */
4293 
4294 #if KMP_DEBUG
4295     // If thread entered pool via __kmp_free_thread, wait_flag should !=
4296     // KMP_BARRIER_PARENT_FLAG.
4297     int b;
4298     kmp_balign_t *balign = new_thr->th.th_bar;
4299     for (b = 0; b < bs_last_barrier; ++b)
4300       KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
4301 #endif
4302 
4303     KF_TRACE(10, ("__kmp_allocate_thread: T#%d using thread %p T#%d\n",
4304                   __kmp_get_gtid(), new_thr, new_thr->th.th_info.ds.ds_gtid));
4305 
4306     KMP_MB();
4307     return new_thr;
4308   }
4309 
4310   /* no, well fork a new one */
4311   KMP_ASSERT(__kmp_nth == __kmp_all_nth);
4312   KMP_ASSERT(__kmp_all_nth < __kmp_threads_capacity);
4313 
4314 #if KMP_USE_MONITOR
4315   // If this is the first worker thread the RTL is creating, then also
4316   // launch the monitor thread.  We try to do this as early as possible.
4317   if (!TCR_4(__kmp_init_monitor)) {
4318     __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
4319     if (!TCR_4(__kmp_init_monitor)) {
4320       KF_TRACE(10, ("before __kmp_create_monitor\n"));
4321       TCW_4(__kmp_init_monitor, 1);
4322       __kmp_create_monitor(&__kmp_monitor);
4323       KF_TRACE(10, ("after __kmp_create_monitor\n"));
4324 #if KMP_OS_WINDOWS
4325       // AC: wait until monitor has started. This is a fix for CQ232808.
4326       // The reason is that if the library is loaded/unloaded in a loop with
4327       // small (parallel) work in between, then there is high probability that
4328       // monitor thread started after the library shutdown. At shutdown it is
4329       // too late to cope with the problem, because when the primary thread is
4330       // in DllMain (process detach) the monitor has no chances to start (it is
4331       // blocked), and primary thread has no means to inform the monitor that
4332       // the library has gone, because all the memory which the monitor can
4333       // access is going to be released/reset.
4334       while (TCR_4(__kmp_init_monitor) < 2) {
4335         KMP_YIELD(TRUE);
4336       }
4337       KF_TRACE(10, ("after monitor thread has started\n"));
4338 #endif
4339     }
4340     __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
4341   }
4342 #endif
4343 
4344   KMP_MB();
4345 
4346   {
4347     int new_start_gtid = TCR_4(__kmp_init_hidden_helper_threads)
4348                              ? 1
4349                              : __kmp_hidden_helper_threads_num + 1;
4350 
4351     for (new_gtid = new_start_gtid; TCR_PTR(__kmp_threads[new_gtid]) != NULL;
4352          ++new_gtid) {
4353       KMP_DEBUG_ASSERT(new_gtid < __kmp_threads_capacity);
4354     }
4355 
4356     if (TCR_4(__kmp_init_hidden_helper_threads)) {
4357       KMP_DEBUG_ASSERT(new_gtid <= __kmp_hidden_helper_threads_num);
4358     }
4359   }
4360 
4361   /* allocate space for it. */
4362   new_thr = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t));
4363 
4364   TCW_SYNC_PTR(__kmp_threads[new_gtid], new_thr);
4365 
4366 #if USE_ITT_BUILD && USE_ITT_NOTIFY && KMP_DEBUG
4367   // suppress race conditions detection on synchronization flags in debug mode
4368   // this helps to analyze library internals eliminating false positives
4369   __itt_suppress_mark_range(
4370       __itt_suppress_range, __itt_suppress_threading_errors,
4371       &new_thr->th.th_sleep_loc, sizeof(new_thr->th.th_sleep_loc));
4372   __itt_suppress_mark_range(
4373       __itt_suppress_range, __itt_suppress_threading_errors,
4374       &new_thr->th.th_reap_state, sizeof(new_thr->th.th_reap_state));
4375 #if KMP_OS_WINDOWS
4376   __itt_suppress_mark_range(
4377       __itt_suppress_range, __itt_suppress_threading_errors,
4378       &new_thr->th.th_suspend_init, sizeof(new_thr->th.th_suspend_init));
4379 #else
4380   __itt_suppress_mark_range(__itt_suppress_range,
4381                             __itt_suppress_threading_errors,
4382                             &new_thr->th.th_suspend_init_count,
4383                             sizeof(new_thr->th.th_suspend_init_count));
4384 #endif
4385   // TODO: check if we need to also suppress b_arrived flags
4386   __itt_suppress_mark_range(__itt_suppress_range,
4387                             __itt_suppress_threading_errors,
4388                             CCAST(kmp_uint64 *, &new_thr->th.th_bar[0].bb.b_go),
4389                             sizeof(new_thr->th.th_bar[0].bb.b_go));
4390   __itt_suppress_mark_range(__itt_suppress_range,
4391                             __itt_suppress_threading_errors,
4392                             CCAST(kmp_uint64 *, &new_thr->th.th_bar[1].bb.b_go),
4393                             sizeof(new_thr->th.th_bar[1].bb.b_go));
4394   __itt_suppress_mark_range(__itt_suppress_range,
4395                             __itt_suppress_threading_errors,
4396                             CCAST(kmp_uint64 *, &new_thr->th.th_bar[2].bb.b_go),
4397                             sizeof(new_thr->th.th_bar[2].bb.b_go));
4398 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY && KMP_DEBUG */
4399   if (__kmp_storage_map) {
4400     __kmp_print_thread_storage_map(new_thr, new_gtid);
4401   }
4402 
4403   // add the reserve serialized team, initialized from the team's primary thread
4404   {
4405     kmp_internal_control_t r_icvs = __kmp_get_x_global_icvs(team);
4406     KF_TRACE(10, ("__kmp_allocate_thread: before th_serial/serial_team\n"));
4407     new_thr->th.th_serial_team = serial_team =
4408         (kmp_team_t *)__kmp_allocate_team(root, 1, 1,
4409 #if OMPT_SUPPORT
4410                                           ompt_data_none, // root parallel id
4411 #endif
4412                                           proc_bind_default, &r_icvs,
4413                                           0 USE_NESTED_HOT_ARG(NULL));
4414   }
4415   KMP_ASSERT(serial_team);
4416   serial_team->t.t_serialized = 0; // AC: the team created in reserve, not for
4417   // execution (it is unused for now).
4418   serial_team->t.t_threads[0] = new_thr;
4419   KF_TRACE(10,
4420            ("__kmp_allocate_thread: after th_serial/serial_team : new_thr=%p\n",
4421             new_thr));
4422 
4423   /* setup the thread structures */
4424   __kmp_initialize_info(new_thr, team, new_tid, new_gtid);
4425 
4426 #if USE_FAST_MEMORY
4427   __kmp_initialize_fast_memory(new_thr);
4428 #endif /* USE_FAST_MEMORY */
4429 
4430 #if KMP_USE_BGET
4431   KMP_DEBUG_ASSERT(new_thr->th.th_local.bget_data == NULL);
4432   __kmp_initialize_bget(new_thr);
4433 #endif
4434 
4435   __kmp_init_random(new_thr); // Initialize random number generator
4436 
4437   /* Initialize these only once when thread is grabbed for a team allocation */
4438   KA_TRACE(20,
4439            ("__kmp_allocate_thread: T#%d init go fork=%u, plain=%u\n",
4440             __kmp_get_gtid(), KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
4441 
4442   int b;
4443   kmp_balign_t *balign = new_thr->th.th_bar;
4444   for (b = 0; b < bs_last_barrier; ++b) {
4445     balign[b].bb.b_go = KMP_INIT_BARRIER_STATE;
4446     balign[b].bb.team = NULL;
4447     balign[b].bb.wait_flag = KMP_BARRIER_NOT_WAITING;
4448     balign[b].bb.use_oncore_barrier = 0;
4449   }
4450 
4451   new_thr->th.th_spin_here = FALSE;
4452   new_thr->th.th_next_waiting = 0;
4453 #if KMP_OS_UNIX
4454   new_thr->th.th_blocking = false;
4455 #endif
4456 
4457 #if KMP_AFFINITY_SUPPORTED
4458   new_thr->th.th_current_place = KMP_PLACE_UNDEFINED;
4459   new_thr->th.th_new_place = KMP_PLACE_UNDEFINED;
4460   new_thr->th.th_first_place = KMP_PLACE_UNDEFINED;
4461   new_thr->th.th_last_place = KMP_PLACE_UNDEFINED;
4462 #endif
4463   new_thr->th.th_def_allocator = __kmp_def_allocator;
4464   new_thr->th.th_prev_level = 0;
4465   new_thr->th.th_prev_num_threads = 1;
4466 
4467   TCW_4(new_thr->th.th_in_pool, FALSE);
4468   new_thr->th.th_active_in_pool = FALSE;
4469   TCW_4(new_thr->th.th_active, TRUE);
4470 
4471   /* adjust the global counters */
4472   __kmp_all_nth++;
4473   __kmp_nth++;
4474 
4475   // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search) for low
4476   // numbers of procs, and method #2 (keyed API call) for higher numbers.
4477   if (__kmp_adjust_gtid_mode) {
4478     if (__kmp_all_nth >= __kmp_tls_gtid_min) {
4479       if (TCR_4(__kmp_gtid_mode) != 2) {
4480         TCW_4(__kmp_gtid_mode, 2);
4481       }
4482     } else {
4483       if (TCR_4(__kmp_gtid_mode) != 1) {
4484         TCW_4(__kmp_gtid_mode, 1);
4485       }
4486     }
4487   }
4488 
4489 #ifdef KMP_ADJUST_BLOCKTIME
4490   /* Adjust blocktime back to zero if necessary       */
4491   /* Middle initialization might not have occurred yet */
4492   if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
4493     if (__kmp_nth > __kmp_avail_proc) {
4494       __kmp_zero_bt = TRUE;
4495     }
4496   }
4497 #endif /* KMP_ADJUST_BLOCKTIME */
4498 
4499   /* actually fork it and create the new worker thread */
4500   KF_TRACE(
4501       10, ("__kmp_allocate_thread: before __kmp_create_worker: %p\n", new_thr));
4502   __kmp_create_worker(new_gtid, new_thr, __kmp_stksize);
4503   KF_TRACE(10,
4504            ("__kmp_allocate_thread: after __kmp_create_worker: %p\n", new_thr));
4505 
4506   KA_TRACE(20, ("__kmp_allocate_thread: T#%d forked T#%d\n", __kmp_get_gtid(),
4507                 new_gtid));
4508   KMP_MB();
4509   return new_thr;
4510 }
4511 
4512 /* Reinitialize team for reuse.
4513    The hot team code calls this case at every fork barrier, so EPCC barrier
4514    test are extremely sensitive to changes in it, esp. writes to the team
4515    struct, which cause a cache invalidation in all threads.
4516    IF YOU TOUCH THIS ROUTINE, RUN EPCC C SYNCBENCH ON A BIG-IRON MACHINE!!! */
4517 static void __kmp_reinitialize_team(kmp_team_t *team,
4518                                     kmp_internal_control_t *new_icvs,
4519                                     ident_t *loc) {
4520   KF_TRACE(10, ("__kmp_reinitialize_team: enter this_thread=%p team=%p\n",
4521                 team->t.t_threads[0], team));
4522   KMP_DEBUG_ASSERT(team && new_icvs);
4523   KMP_DEBUG_ASSERT((!TCR_4(__kmp_init_parallel)) || new_icvs->nproc);
4524   KMP_CHECK_UPDATE(team->t.t_ident, loc);
4525 
4526   KMP_CHECK_UPDATE(team->t.t_id, KMP_GEN_TEAM_ID());
4527   // Copy ICVs to the primary thread's implicit taskdata
4528   __kmp_init_implicit_task(loc, team->t.t_threads[0], team, 0, FALSE);
4529   copy_icvs(&team->t.t_implicit_task_taskdata[0].td_icvs, new_icvs);
4530 
4531   KF_TRACE(10, ("__kmp_reinitialize_team: exit this_thread=%p team=%p\n",
4532                 team->t.t_threads[0], team));
4533 }
4534 
4535 /* Initialize the team data structure.
4536    This assumes the t_threads and t_max_nproc are already set.
4537    Also, we don't touch the arguments */
// Resets the per-team bookkeeping fields of `team` for a team of `new_nproc`
// threads and then delegates to __kmp_reinitialize_team().
//   team      - team to set up; must be non-NULL with t_threads already
//               allocated and t_max_nproc >= new_nproc (debug-asserted below).
//   new_nproc - value stored into t_nproc; anything not greater than 1 marks
//               the team as serialized.
//   new_icvs  - internal control variables; the run-time schedule is copied
//               from here and the pointer is forwarded to
//               __kmp_reinitialize_team().
//   loc       - source location, forwarded to __kmp_reinitialize_team().
// The t_threads array and t_parent are deliberately left untouched (see the
// commented-out statements below) because clearing them would break hot teams.
4538 static void __kmp_initialize_team(kmp_team_t *team, int new_nproc,
4539                                   kmp_internal_control_t *new_icvs,
4540                                   ident_t *loc) {
4541   KF_TRACE(10, ("__kmp_initialize_team: enter: team=%p\n", team));
4542 
4543   /* verify */
4544   KMP_DEBUG_ASSERT(team);
4545   KMP_DEBUG_ASSERT(new_nproc <= team->t.t_max_nproc);
4546   KMP_DEBUG_ASSERT(team->t.t_threads);
       // KMP_MB() is issued here and again after all fields have been written,
       // so the field updates are bracketed by the two calls.
4547   KMP_MB();
4548 
4549   team->t.t_master_tid = 0; /* not needed */
4550   /* team->t.t_master_bar;        not needed */
       // A team with at most one thread is flagged as serialized.
4551   team->t.t_serialized = new_nproc > 1 ? 0 : 1;
4552   team->t.t_nproc = new_nproc;
4553 
4554   /* team->t.t_parent     = NULL; TODO not needed & would mess up hot team */
       // Detach the team from the pool list link.
4555   team->t.t_next_pool = NULL;
4556   /* memset( team->t.t_threads, 0, sizeof(kmp_info_t*)*new_nproc ); would mess
4557    * up hot team */
4558 
       // No microtask / invoker is attached yet; both are cleared here.
4559   TCW_SYNC_PTR(team->t.t_pkfn, NULL); /* not needed */
4560   team->t.t_invoke = NULL; /* not needed */
4561 
4562   // TODO???: team->t.t_max_active_levels       = new_max_active_levels;
       // Take the schedule from the ICVs supplied by the caller.
4563   team->t.t_sched.sched = new_icvs->sched.sched;
4564 
4565 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
       // x86 only: clear the saved floating-point control state.
4566   team->t.t_fp_control_saved = FALSE; /* not needed */
4567   team->t.t_x87_fpu_control_word = 0; /* not needed */
4568   team->t.t_mxcsr = 0; /* not needed */
4569 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
4570 
4571   team->t.t_construct = 0;
4572 
4573   team->t.t_ordered.dt.t_value = 0;
4574   team->t.t_master_active = FALSE;
4575 
4576 #ifdef KMP_DEBUG
4577   team->t.t_copypriv_data = NULL; /* not necessary, but nice for debugging */
4578 #endif
4579 #if KMP_OS_WINDOWS
4580   team->t.t_copyin_counter = 0; /* for barrier-free copyin implementation */
4581 #endif
4582 
4583   team->t.t_control_stack_top = NULL;
4584 
       // __kmp_reinitialize_team() updates t_ident and t_id, initializes the
       // implicit task of thread 0 and copies new_icvs into that task's ICVs.
4585   __kmp_reinitialize_team(team, new_icvs, loc);
4586 
4587   KMP_MB();
4588   KF_TRACE(10, ("__kmp_initialize_team: exit: team=%p\n", team));
4589 }
4590 
#if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED
// Switches the calling thread to the full affinity mask
// (__kmp_affin_fullMask). When `old_mask` is non-NULL the thread's current
// system affinity is first read into it so the caller can restore it later;
// failure to read it is fatal. Does nothing when affinity is not capable.
// No runtime data structures are modified.
static void
__kmp_set_thread_affinity_mask_full_tmp(kmp_affin_mask_t *old_mask) {
  if (!KMP_AFFINITY_CAPABLE()) {
    return;
  }

  if (old_mask != NULL) {
    int rc = __kmp_get_system_affinity(old_mask, TRUE);
    // Capture errno right after the query, before anything can overwrite it.
    int saved_errno = errno;
    if (rc != 0) {
      __kmp_fatal(KMP_MSG(ChangeThreadAffMaskError), KMP_ERR(saved_errno),
                  __kmp_msg_null);
    }
  }

  __kmp_set_system_affinity(__kmp_affin_fullMask, TRUE);
}
#endif
4609 
#if KMP_AFFINITY_SUPPORTED

// Returns the number of places in the partition [first_place, last_place].
// A partition whose first place is greater than its last place wraps around
// the end of the global place list (__kmp_affinity_num_masks entries).
static inline int __kmp_place_partition_size(int first_place, int last_place) {
  if (first_place <= last_place)
    return last_place - first_place + 1;
  return static_cast<int>(__kmp_affinity_num_masks - first_place + last_place +
                          1);
}

// Returns the place that follows `place` when walking the partition
// [first_place, last_place]: stepping past last_place goes back to
// first_place, and stepping past the last global place goes back to place 0.
static inline int __kmp_next_place_in_partition(int place, int first_place,
                                                int last_place) {
  if (place == last_place)
    return first_place;
  if (place == (int)(__kmp_affinity_num_masks - 1))
    return 0;
  return place + 1;
}

// Stores the partition [first, last] and the target place `newp` in thread
// `th`, and raises the team's t_display_affinity flag when affinity display is
// enabled and the thread's target place differs from its current place.
static inline void __kmp_assign_thread_place(kmp_team_t *team, kmp_info_t *th,
                                             int first, int last, int newp) {
  th->th.th_first_place = first;
  th->th.th_last_place = last;
  th->th.th_new_place = newp;
  if (__kmp_display_affinity && newp != th->th.th_current_place &&
      team->t.t_display_affinity != 1) {
    team->t.t_display_affinity = 1;
  }
}

// __kmp_partition_places() is the heart of the OpenMP 4.0 affinity mechanism.
// It calculates the worker + primary thread's partition based upon the parent
// thread's partition, and records for each thread the place it should be
// bound to (th_new_place) together with its own partition
// (th_first_place / th_last_place).
// The primary thread's partition should already include its current binding.
//
//   team               - team whose threads are partitioned; the partition of
//                        team->t.t_threads[0] is copied into the team struct.
//                        Nothing is done for the hidden helper team.
//   update_master_only - only consulted by the proc_bind_spread policy: when
//                        equal to 1, only thread 0 is updated.
//
// Policies (team->t.t_proc_bind):
//   default - expected only for single-thread teams; nothing is changed.
//   primary - every worker gets the primary thread's partition and place.
//   close   - workers keep the whole partition and are placed on consecutive
//             places after the primary thread's place; with more threads than
//             places, each place receives S or S+1 threads.
//   spread  - each thread receives its own sub-partition; with more threads
//             than places, each thread's sub-partition is a single place.
static void __kmp_partition_places(kmp_team_t *team, int update_master_only) {
  // Do not partition places for the hidden helper team
  if (KMP_HIDDEN_HELPER_TEAM(team))
    return;
  // Copy the primary thread's place partition to the team struct
  kmp_info_t *master_th = team->t.t_threads[0];
  KMP_DEBUG_ASSERT(master_th != NULL);
  kmp_proc_bind_t proc_bind = team->t.t_proc_bind;
  int first_place = master_th->th.th_first_place;
  int last_place = master_th->th.th_last_place;
  int masters_place = master_th->th.th_current_place;
  team->t.t_first_place = first_place;
  team->t.t_last_place = last_place;

  KA_TRACE(20, ("__kmp_partition_places: enter: proc_bind = %d T#%d(%d:0) "
                "bound to place %d partition = [%d,%d]\n",
                proc_bind, __kmp_gtid_from_thread(team->t.t_threads[0]),
                team->t.t_id, masters_place, first_place, last_place));

  switch (proc_bind) {

  case proc_bind_default:
    // Serial teams might have the proc_bind policy set to proc_bind_default.
    // Not an issue -- we don't rebind primary thread for any proc_bind policy.
    KMP_DEBUG_ASSERT(team->t.t_nproc == 1);
    break;

  case proc_bind_primary: {
    int f;
    int n_th = team->t.t_nproc;
    for (f = 1; f < n_th; f++) {
      kmp_info_t *th = team->t.t_threads[f];
      KMP_DEBUG_ASSERT(th != NULL);
      __kmp_assign_thread_place(team, th, first_place, last_place,
                                masters_place);

      KA_TRACE(100, ("__kmp_partition_places: primary: T#%d(%d:%d) place %d "
                     "partition = [%d,%d]\n",
                     __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id,
                     f, masters_place, first_place, last_place));
    }
  } break;

  case proc_bind_close: {
    int f;
    int n_th = team->t.t_nproc;
    int n_places = __kmp_place_partition_size(first_place, last_place);
    if (n_th <= n_places) {
      // Enough places: worker f goes on the f-th place after the primary's.
      int place = masters_place;
      for (f = 1; f < n_th; f++) {
        kmp_info_t *th = team->t.t_threads[f];
        KMP_DEBUG_ASSERT(th != NULL);

        place = __kmp_next_place_in_partition(place, first_place, last_place);
        __kmp_assign_thread_place(team, th, first_place, last_place, place);

        KA_TRACE(100, ("__kmp_partition_places: close: T#%d(%d:%d) place %d "
                       "partition = [%d,%d]\n",
                       __kmp_gtid_from_thread(team->t.t_threads[f]),
                       team->t.t_id, f, place, first_place, last_place));
      }
    } else {
      // More threads than places: every place gets S threads, and `rem`
      // places get one extra. An extra thread is added to a place whenever
      // gap_ct (places visited since the last extra) has reached `gap`.
      int S, rem, gap, s_count;
      S = n_th / n_places;
      s_count = 0;
      rem = n_th - (S * n_places);
      gap = rem > 0 ? n_places / rem : n_places;
      int place = masters_place;
      int gap_ct = gap;
      for (f = 0; f < n_th; f++) {
        kmp_info_t *th = team->t.t_threads[f];
        KMP_DEBUG_ASSERT(th != NULL);

        __kmp_assign_thread_place(team, th, first_place, last_place, place);
        s_count++;

        if ((s_count == S) && rem && (gap_ct == gap)) {
          // do nothing, add an extra thread to place on next iteration
        } else if ((s_count == S + 1) && rem && (gap_ct == gap)) {
          // we added an extra thread to this place; move to next place
          place = __kmp_next_place_in_partition(place, first_place, last_place);
          s_count = 0;
          gap_ct = 1;
          rem--;
        } else if (s_count == S) { // place full; don't add extra
          place = __kmp_next_place_in_partition(place, first_place, last_place);
          gap_ct++;
          s_count = 0;
        }

        KA_TRACE(100,
                 ("__kmp_partition_places: close: T#%d(%d:%d) place %d "
                  "partition = [%d,%d]\n",
                  __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id, f,
                  th->th.th_new_place, first_place, last_place));
      }
      KMP_DEBUG_ASSERT(place == masters_place);
    }
  } break;

  case proc_bind_spread: {
    int f;
    int n_th = team->t.t_nproc;
    int n_places = __kmp_place_partition_size(first_place, last_place);
    int thidx;
    if (n_th <= n_places) {
      int place = -1;

      if (n_places != static_cast<int>(__kmp_affinity_num_masks)) {
        // The partition is a proper subset of all places: walk it from the
        // primary's place, giving each thread S consecutive places, and one
        // additional place to `rem` of the threads (every `gap` threads).
        int S = n_places / n_th;
        int s_count, rem, gap, gap_ct;

        place = masters_place;
        rem = n_places - n_th * S;
        gap = rem ? n_th / rem : 1;
        gap_ct = gap;
        thidx = n_th;
        if (update_master_only == 1)
          thidx = 1;
        for (f = 0; f < thidx; f++) {
          kmp_info_t *th = team->t.t_threads[f];
          KMP_DEBUG_ASSERT(th != NULL);

          // The sub-partition starts at the current place; the thread itself
          // is placed on that first place.
          int sub_first = place;
          for (s_count = 1; s_count < S; s_count++) {
            place =
                __kmp_next_place_in_partition(place, first_place, last_place);
          }
          if (rem && (gap_ct == gap)) {
            place =
                __kmp_next_place_in_partition(place, first_place, last_place);
            rem--;
            gap_ct = 0;
          }
          // `place` is now the last place of this thread's sub-partition.
          __kmp_assign_thread_place(team, th, sub_first, place, sub_first);
          gap_ct++;

          // Step to the first place of the next thread's sub-partition.
          place = __kmp_next_place_in_partition(place, first_place, last_place);

          KA_TRACE(100,
                   ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
                    "partition = [%d,%d], __kmp_affinity_num_masks: %u\n",
                    __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id,
                    f, th->th.th_new_place, th->th.th_first_place,
                    th->th.th_last_place, __kmp_affinity_num_masks));
        }
      } else {
        /* Having uniform space of available computation places I can create
           T partitions of round(P/T) size and put threads into the first
           place of each partition. */
        double current = static_cast<double>(masters_place);
        double spacing =
            (static_cast<double>(n_places + 1) / static_cast<double>(n_th));
        int first, last;
        kmp_info_t *th;

        // One extra iteration (f == n_th) is run without touching a thread;
        // it only brings `place` back around for the assertion after the loop.
        thidx = n_th + 1;
        if (update_master_only == 1)
          thidx = 1;
        for (f = 0; f < thidx; f++) {
          first = static_cast<int>(current);
          last = static_cast<int>(current + spacing) - 1;
          KMP_DEBUG_ASSERT(last >= first);
          if (first >= n_places) {
            // Ran past the end of the place list: wrap back to the start.
            if (masters_place) {
              first -= n_places;
              last -= n_places;
              if (first == (masters_place + 1)) {
                KMP_DEBUG_ASSERT(f == n_th);
                first--;
              }
              if (last == masters_place) {
                KMP_DEBUG_ASSERT(f == (n_th - 1));
                last--;
              }
            } else {
              KMP_DEBUG_ASSERT(f == n_th);
              first = 0;
              last = 0;
            }
          }
          if (last >= n_places) {
            last = (n_places - 1);
          }
          place = first;
          current += spacing;
          if (f < n_th) {
            KMP_DEBUG_ASSERT(0 <= first);
            KMP_DEBUG_ASSERT(n_places > first);
            KMP_DEBUG_ASSERT(0 <= last);
            KMP_DEBUG_ASSERT(n_places > last);
            KMP_DEBUG_ASSERT(last_place >= first_place);
            th = team->t.t_threads[f];
            KMP_DEBUG_ASSERT(th);
            __kmp_assign_thread_place(team, th, first, last, place);
            KA_TRACE(100,
                     ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
                      "partition = [%d,%d], spacing = %.4f\n",
                      __kmp_gtid_from_thread(team->t.t_threads[f]),
                      team->t.t_id, f, th->th.th_new_place,
                      th->th.th_first_place, th->th.th_last_place, spacing));
          }
        }
      }
      KMP_DEBUG_ASSERT(update_master_only || place == masters_place);
    } else {
      // More threads than places: same S / rem / gap distribution as in the
      // oversubscribed "close" case, but each thread's partition is narrowed
      // to the single place it is put on.
      int S, rem, gap, s_count;
      S = n_th / n_places;
      s_count = 0;
      rem = n_th - (S * n_places);
      gap = rem > 0 ? n_places / rem : n_places;
      int place = masters_place;
      int gap_ct = gap;
      thidx = n_th;
      if (update_master_only == 1)
        thidx = 1;
      for (f = 0; f < thidx; f++) {
        kmp_info_t *th = team->t.t_threads[f];
        KMP_DEBUG_ASSERT(th != NULL);

        __kmp_assign_thread_place(team, th, place, place, place);
        s_count++;

        if ((s_count == S) && rem && (gap_ct == gap)) {
          // do nothing, add an extra thread to place on next iteration
        } else if ((s_count == S + 1) && rem && (gap_ct == gap)) {
          // we added an extra thread to this place; move on to next place
          place = __kmp_next_place_in_partition(place, first_place, last_place);
          s_count = 0;
          gap_ct = 1;
          rem--;
        } else if (s_count == S) { // place is full; don't add extra thread
          place = __kmp_next_place_in_partition(place, first_place, last_place);
          gap_ct++;
          s_count = 0;
        }

        KA_TRACE(100, ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
                       "partition = [%d,%d]\n",
                       __kmp_gtid_from_thread(team->t.t_threads[f]),
                       team->t.t_id, f, th->th.th_new_place,
                       th->th.th_first_place, th->th.th_last_place));
      }
      KMP_DEBUG_ASSERT(update_master_only || place == masters_place);
    }
  } break;

  default:
    break;
  }

  KA_TRACE(20, ("__kmp_partition_places: exit T#%d\n", team->t.t_id));
}

#endif // KMP_AFFINITY_SUPPORTED
4963 
4964 /* allocate a new team data structure to use.  take one off of the free pool if
4965    available */
4966 kmp_team_t *
4967 __kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc,
4968 #if OMPT_SUPPORT
4969                     ompt_data_t ompt_parallel_data,
4970 #endif
4971                     kmp_proc_bind_t new_proc_bind,
4972                     kmp_internal_control_t *new_icvs,
4973                     int argc USE_NESTED_HOT_ARG(kmp_info_t *master)) {
4974   KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_allocate_team);
4975   int f;
4976   kmp_team_t *team;
4977   int use_hot_team = !root->r.r_active;
4978   int level = 0;
4979 
4980   KA_TRACE(20, ("__kmp_allocate_team: called\n"));
4981   KMP_DEBUG_ASSERT(new_nproc >= 1 && argc >= 0);
4982   KMP_DEBUG_ASSERT(max_nproc >= new_nproc);
4983   KMP_MB();
4984 
4985 #if KMP_NESTED_HOT_TEAMS
4986   kmp_hot_team_ptr_t *hot_teams;
4987   if (master) {
4988     team = master->th.th_team;
4989     level = team->t.t_active_level;
4990     if (master->th.th_teams_microtask) { // in teams construct?
4991       if (master->th.th_teams_size.nteams > 1 &&
4992           ( // #teams > 1
4993               team->t.t_pkfn ==
4994                   (microtask_t)__kmp_teams_master || // inner fork of the teams
4995               master->th.th_teams_level <
4996                   team->t.t_level)) { // or nested parallel inside the teams
4997         ++level; // not increment if #teams==1, or for outer fork of the teams;
4998         // increment otherwise
4999       }
5000     }
5001     hot_teams = master->th.th_hot_teams;
5002     if (level < __kmp_hot_teams_max_level && hot_teams &&
5003         hot_teams[level].hot_team) {
5004       // hot team has already been allocated for given level
5005       use_hot_team = 1;
5006     } else {
5007       use_hot_team = 0;
5008     }
5009   } else {
5010     // check we won't access uninitialized hot_teams, just in case
5011     KMP_DEBUG_ASSERT(new_nproc == 1);
5012   }
5013 #endif
5014   // Optimization to use a "hot" team
5015   if (use_hot_team && new_nproc > 1) {
5016     KMP_DEBUG_ASSERT(new_nproc <= max_nproc);
5017 #if KMP_NESTED_HOT_TEAMS
5018     team = hot_teams[level].hot_team;
5019 #else
5020     team = root->r.r_hot_team;
5021 #endif
5022 #if KMP_DEBUG
5023     if (__kmp_tasking_mode != tskm_immediate_exec) {
5024       KA_TRACE(20, ("__kmp_allocate_team: hot team task_team[0] = %p "
5025                     "task_team[1] = %p before reinit\n",
5026                     team->t.t_task_team[0], team->t.t_task_team[1]));
5027     }
5028 #endif
5029 
5030     // Has the number of threads changed?
5031     /* Let's assume the most common case is that the number of threads is
5032        unchanged, and put that case first. */
5033     if (team->t.t_nproc == new_nproc) { // Check changes in number of threads
5034       KA_TRACE(20, ("__kmp_allocate_team: reusing hot team\n"));
5035       // This case can mean that omp_set_num_threads() was called and the hot
5036       // team size was already reduced, so we check the special flag
5037       if (team->t.t_size_changed == -1) {
5038         team->t.t_size_changed = 1;
5039       } else {
5040         KMP_CHECK_UPDATE(team->t.t_size_changed, 0);
5041       }
5042 
5043       // TODO???: team->t.t_max_active_levels = new_max_active_levels;
5044       kmp_r_sched_t new_sched = new_icvs->sched;
5045       // set primary thread's schedule as new run-time schedule
5046       KMP_CHECK_UPDATE(team->t.t_sched.sched, new_sched.sched);
5047 
5048       __kmp_reinitialize_team(team, new_icvs,
5049                               root->r.r_uber_thread->th.th_ident);
5050 
5051       KF_TRACE(10, ("__kmp_allocate_team2: T#%d, this_thread=%p team=%p\n", 0,
5052                     team->t.t_threads[0], team));
5053       __kmp_push_current_task_to_thread(team->t.t_threads[0], team, 0);
5054 
5055 #if KMP_AFFINITY_SUPPORTED
5056       if ((team->t.t_size_changed == 0) &&
5057           (team->t.t_proc_bind == new_proc_bind)) {
5058         if (new_proc_bind == proc_bind_spread) {
5059           __kmp_partition_places(
5060               team, 1); // add flag to update only master for spread
5061         }
5062         KA_TRACE(200, ("__kmp_allocate_team: reusing hot team #%d bindings: "
5063                        "proc_bind = %d, partition = [%d,%d]\n",
5064                        team->t.t_id, new_proc_bind, team->t.t_first_place,
5065                        team->t.t_last_place));
5066       } else {
5067         KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5068         __kmp_partition_places(team);
5069       }
5070 #else
5071       KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5072 #endif /* KMP_AFFINITY_SUPPORTED */
5073     } else if (team->t.t_nproc > new_nproc) {
5074       KA_TRACE(20,
5075                ("__kmp_allocate_team: decreasing hot team thread count to %d\n",
5076                 new_nproc));
5077 
5078       team->t.t_size_changed = 1;
5079 #if KMP_NESTED_HOT_TEAMS
5080       if (__kmp_hot_teams_mode == 0) {
5081         // AC: saved number of threads should correspond to team's value in this
5082         // mode, can be bigger in mode 1, when hot team has threads in reserve
5083         KMP_DEBUG_ASSERT(hot_teams[level].hot_team_nth == team->t.t_nproc);
5084         hot_teams[level].hot_team_nth = new_nproc;
5085 #endif // KMP_NESTED_HOT_TEAMS
5086         /* release the extra threads we don't need any more */
5087         for (f = new_nproc; f < team->t.t_nproc; f++) {
5088           KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5089           if (__kmp_tasking_mode != tskm_immediate_exec) {
5090             // When decreasing team size, threads no longer in the team should
5091             // unref task team.
5092             team->t.t_threads[f]->th.th_task_team = NULL;
5093           }
5094           __kmp_free_thread(team->t.t_threads[f]);
5095           team->t.t_threads[f] = NULL;
5096         }
5097 #if KMP_NESTED_HOT_TEAMS
5098       } // (__kmp_hot_teams_mode == 0)
5099       else {
5100         // When keeping extra threads in team, switch threads to wait on own
5101         // b_go flag
5102         for (f = new_nproc; f < team->t.t_nproc; ++f) {
5103           KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5104           kmp_balign_t *balign = team->t.t_threads[f]->th.th_bar;
5105           for (int b = 0; b < bs_last_barrier; ++b) {
5106             if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG) {
5107               balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG;
5108             }
5109             KMP_CHECK_UPDATE(balign[b].bb.leaf_kids, 0);
5110           }
5111         }
5112       }
5113 #endif // KMP_NESTED_HOT_TEAMS
5114       team->t.t_nproc = new_nproc;
5115       // TODO???: team->t.t_max_active_levels = new_max_active_levels;
5116       KMP_CHECK_UPDATE(team->t.t_sched.sched, new_icvs->sched.sched);
5117       __kmp_reinitialize_team(team, new_icvs,
5118                               root->r.r_uber_thread->th.th_ident);
5119 
5120       // Update remaining threads
5121       for (f = 0; f < new_nproc; ++f) {
5122         team->t.t_threads[f]->th.th_team_nproc = new_nproc;
5123       }
5124 
5125       // restore the current task state of the primary thread: should be the
5126       // implicit task
5127       KF_TRACE(10, ("__kmp_allocate_team: T#%d, this_thread=%p team=%p\n", 0,
5128                     team->t.t_threads[0], team));
5129 
5130       __kmp_push_current_task_to_thread(team->t.t_threads[0], team, 0);
5131 
5132 #ifdef KMP_DEBUG
5133       for (f = 0; f < team->t.t_nproc; f++) {
5134         KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
5135                          team->t.t_threads[f]->th.th_team_nproc ==
5136                              team->t.t_nproc);
5137       }
5138 #endif
5139 
5140       KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5141 #if KMP_AFFINITY_SUPPORTED
5142       __kmp_partition_places(team);
5143 #endif
5144     } else { // team->t.t_nproc < new_nproc
5145 #if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED
5146       kmp_affin_mask_t *old_mask;
5147       if (KMP_AFFINITY_CAPABLE()) {
5148         KMP_CPU_ALLOC(old_mask);
5149       }
5150 #endif
5151 
5152       KA_TRACE(20,
5153                ("__kmp_allocate_team: increasing hot team thread count to %d\n",
5154                 new_nproc));
5155 
5156       team->t.t_size_changed = 1;
5157 
5158 #if KMP_NESTED_HOT_TEAMS
5159       int avail_threads = hot_teams[level].hot_team_nth;
5160       if (new_nproc < avail_threads)
5161         avail_threads = new_nproc;
5162       kmp_info_t **other_threads = team->t.t_threads;
5163       for (f = team->t.t_nproc; f < avail_threads; ++f) {
5164         // Adjust barrier data of reserved threads (if any) of the team
5165         // Other data will be set in __kmp_initialize_info() below.
5166         int b;
5167         kmp_balign_t *balign = other_threads[f]->th.th_bar;
5168         for (b = 0; b < bs_last_barrier; ++b) {
5169           balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5170           KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
5171 #if USE_DEBUGGER
5172           balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5173 #endif
5174         }
5175       }
5176       if (hot_teams[level].hot_team_nth >= new_nproc) {
5177         // we have all needed threads in reserve, no need to allocate any
5178         // this only possible in mode 1, cannot have reserved threads in mode 0
5179         KMP_DEBUG_ASSERT(__kmp_hot_teams_mode == 1);
5180         team->t.t_nproc = new_nproc; // just get reserved threads involved
5181       } else {
5182         // we may have some threads in reserve, but not enough
5183         team->t.t_nproc =
5184             hot_teams[level]
5185                 .hot_team_nth; // get reserved threads involved if any
5186         hot_teams[level].hot_team_nth = new_nproc; // adjust hot team max size
5187 #endif // KMP_NESTED_HOT_TEAMS
5188         if (team->t.t_max_nproc < new_nproc) {
5189           /* reallocate larger arrays */
5190           __kmp_reallocate_team_arrays(team, new_nproc);
5191           __kmp_reinitialize_team(team, new_icvs, NULL);
5192         }
5193 
5194 #if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED
5195         /* Temporarily set full mask for primary thread before creation of
5196            workers. The reason is that workers inherit the affinity from the
5197            primary thread, so if a lot of workers are created on the single
5198            core quickly, they don't get a chance to set their own affinity for
5199            a long time. */
5200         __kmp_set_thread_affinity_mask_full_tmp(old_mask);
5201 #endif
5202 
5203         /* allocate new threads for the hot team */
5204         for (f = team->t.t_nproc; f < new_nproc; f++) {
5205           kmp_info_t *new_worker = __kmp_allocate_thread(root, team, f);
5206           KMP_DEBUG_ASSERT(new_worker);
5207           team->t.t_threads[f] = new_worker;
5208 
5209           KA_TRACE(20,
5210                    ("__kmp_allocate_team: team %d init T#%d arrived: "
5211                     "join=%llu, plain=%llu\n",
5212                     team->t.t_id, __kmp_gtid_from_tid(f, team), team->t.t_id, f,
5213                     team->t.t_bar[bs_forkjoin_barrier].b_arrived,
5214                     team->t.t_bar[bs_plain_barrier].b_arrived));
5215 
5216           { // Initialize barrier data for new threads.
5217             int b;
5218             kmp_balign_t *balign = new_worker->th.th_bar;
5219             for (b = 0; b < bs_last_barrier; ++b) {
5220               balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5221               KMP_DEBUG_ASSERT(balign[b].bb.wait_flag !=
5222                                KMP_BARRIER_PARENT_FLAG);
5223 #if USE_DEBUGGER
5224               balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5225 #endif
5226             }
5227           }
5228         }
5229 
5230 #if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED
5231         if (KMP_AFFINITY_CAPABLE()) {
5232           /* Restore initial primary thread's affinity mask */
5233           __kmp_set_system_affinity(old_mask, TRUE);
5234           KMP_CPU_FREE(old_mask);
5235         }
5236 #endif
5237 #if KMP_NESTED_HOT_TEAMS
5238       } // end of check of t_nproc vs. new_nproc vs. hot_team_nth
5239 #endif // KMP_NESTED_HOT_TEAMS
5240       /* make sure everyone is syncronized */
5241       int old_nproc = team->t.t_nproc; // save old value and use to update only
5242       // new threads below
5243       __kmp_initialize_team(team, new_nproc, new_icvs,
5244                             root->r.r_uber_thread->th.th_ident);
5245 
5246       /* reinitialize the threads */
5247       KMP_DEBUG_ASSERT(team->t.t_nproc == new_nproc);
5248       for (f = 0; f < team->t.t_nproc; ++f)
5249         __kmp_initialize_info(team->t.t_threads[f], team, f,
5250                               __kmp_gtid_from_tid(f, team));
5251 
5252       if (level) { // set th_task_state for new threads in nested hot team
5253         // __kmp_initialize_info() no longer zeroes th_task_state, so we should
5254         // only need to set the th_task_state for the new threads. th_task_state
5255         // for primary thread will not be accurate until after this in
5256         // __kmp_fork_call(), so we look to the primary thread's memo_stack to
5257         // get the correct value.
5258         for (f = old_nproc; f < team->t.t_nproc; ++f)
5259           team->t.t_threads[f]->th.th_task_state =
5260               team->t.t_threads[0]->th.th_task_state_memo_stack[level];
5261       } else { // set th_task_state for new threads in non-nested hot team
5262         // copy primary thread's state
5263         kmp_uint8 old_state = team->t.t_threads[0]->th.th_task_state;
5264         for (f = old_nproc; f < team->t.t_nproc; ++f)
5265           team->t.t_threads[f]->th.th_task_state = old_state;
5266       }
5267 
5268 #ifdef KMP_DEBUG
5269       for (f = 0; f < team->t.t_nproc; ++f) {
5270         KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
5271                          team->t.t_threads[f]->th.th_team_nproc ==
5272                              team->t.t_nproc);
5273       }
5274 #endif
5275 
5276       KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5277 #if KMP_AFFINITY_SUPPORTED
5278       __kmp_partition_places(team);
5279 #endif
5280     } // Check changes in number of threads
5281 
5282     kmp_info_t *master = team->t.t_threads[0];
5283     if (master->th.th_teams_microtask) {
5284       for (f = 1; f < new_nproc; ++f) {
5285         // propagate teams construct specific info to workers
5286         kmp_info_t *thr = team->t.t_threads[f];
5287         thr->th.th_teams_microtask = master->th.th_teams_microtask;
5288         thr->th.th_teams_level = master->th.th_teams_level;
5289         thr->th.th_teams_size = master->th.th_teams_size;
5290       }
5291     }
5292 #if KMP_NESTED_HOT_TEAMS
5293     if (level) {
5294       // Sync barrier state for nested hot teams, not needed for outermost hot
5295       // team.
5296       for (f = 1; f < new_nproc; ++f) {
5297         kmp_info_t *thr = team->t.t_threads[f];
5298         int b;
5299         kmp_balign_t *balign = thr->th.th_bar;
5300         for (b = 0; b < bs_last_barrier; ++b) {
5301           balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5302           KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
5303 #if USE_DEBUGGER
5304           balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5305 #endif
5306         }
5307       }
5308     }
5309 #endif // KMP_NESTED_HOT_TEAMS
5310 
5311     /* reallocate space for arguments if necessary */
5312     __kmp_alloc_argv_entries(argc, team, TRUE);
5313     KMP_CHECK_UPDATE(team->t.t_argc, argc);
5314     // The hot team re-uses the previous task team,
5315     // if untouched during the previous release->gather phase.
5316 
5317     KF_TRACE(10, (" hot_team = %p\n", team));
5318 
5319 #if KMP_DEBUG
5320     if (__kmp_tasking_mode != tskm_immediate_exec) {
5321       KA_TRACE(20, ("__kmp_allocate_team: hot team task_team[0] = %p "
5322                     "task_team[1] = %p after reinit\n",
5323                     team->t.t_task_team[0], team->t.t_task_team[1]));
5324     }
5325 #endif
5326 
5327 #if OMPT_SUPPORT
5328     __ompt_team_assign_id(team, ompt_parallel_data);
5329 #endif
5330 
5331     KMP_MB();
5332 
5333     return team;
5334   }
5335 
5336   /* next, let's try to take one from the team pool */
5337   KMP_MB();
5338   for (team = CCAST(kmp_team_t *, __kmp_team_pool); (team);) {
5339     /* TODO: consider resizing undersized teams instead of reaping them, now
5340        that we have a resizing mechanism */
5341     if (team->t.t_max_nproc >= max_nproc) {
5342       /* take this team from the team pool */
5343       __kmp_team_pool = team->t.t_next_pool;
5344 
5345       /* setup the team for fresh use */
5346       __kmp_initialize_team(team, new_nproc, new_icvs, NULL);
5347 
5348       KA_TRACE(20, ("__kmp_allocate_team: setting task_team[0] %p and "
5349                     "task_team[1] %p to NULL\n",
5350                     &team->t.t_task_team[0], &team->t.t_task_team[1]));
5351       team->t.t_task_team[0] = NULL;
5352       team->t.t_task_team[1] = NULL;
5353 
5354       /* reallocate space for arguments if necessary */
5355       __kmp_alloc_argv_entries(argc, team, TRUE);
5356       KMP_CHECK_UPDATE(team->t.t_argc, argc);
5357 
5358       KA_TRACE(
5359           20, ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n",
5360                team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
5361       { // Initialize barrier data.
5362         int b;
5363         for (b = 0; b < bs_last_barrier; ++b) {
5364           team->t.t_bar[b].b_arrived = KMP_INIT_BARRIER_STATE;
5365 #if USE_DEBUGGER
5366           team->t.t_bar[b].b_master_arrived = 0;
5367           team->t.t_bar[b].b_team_arrived = 0;
5368 #endif
5369         }
5370       }
5371 
5372       team->t.t_proc_bind = new_proc_bind;
5373 
5374       KA_TRACE(20, ("__kmp_allocate_team: using team from pool %d.\n",
5375                     team->t.t_id));
5376 
5377 #if OMPT_SUPPORT
5378       __ompt_team_assign_id(team, ompt_parallel_data);
5379 #endif
5380 
5381       KMP_MB();
5382 
5383       return team;
5384     }
5385 
5386     /* reap team if it is too small, then loop back and check the next one */
5387     // not sure if this is wise, but, will be redone during the hot-teams
5388     // rewrite.
5389     /* TODO: Use technique to find the right size hot-team, don't reap them */
5390     team = __kmp_reap_team(team);
5391     __kmp_team_pool = team;
5392   }
5393 
5394   /* nothing available in the pool, no matter, make a new team! */
5395   KMP_MB();
5396   team = (kmp_team_t *)__kmp_allocate(sizeof(kmp_team_t));
5397 
5398   /* and set it up */
5399   team->t.t_max_nproc = max_nproc;
5400   /* NOTE well, for some reason allocating one big buffer and dividing it up
5401      seems to really hurt performance a lot on the P4, so, let's not use this */
5402   __kmp_allocate_team_arrays(team, max_nproc);
5403 
5404   KA_TRACE(20, ("__kmp_allocate_team: making a new team\n"));
5405   __kmp_initialize_team(team, new_nproc, new_icvs, NULL);
5406 
5407   KA_TRACE(20, ("__kmp_allocate_team: setting task_team[0] %p and task_team[1] "
5408                 "%p to NULL\n",
5409                 &team->t.t_task_team[0], &team->t.t_task_team[1]));
5410   team->t.t_task_team[0] = NULL; // to be removed, as __kmp_allocate zeroes
5411   // memory, no need to duplicate
5412   team->t.t_task_team[1] = NULL; // to be removed, as __kmp_allocate zeroes
5413   // memory, no need to duplicate
5414 
5415   if (__kmp_storage_map) {
5416     __kmp_print_team_storage_map("team", team, team->t.t_id, new_nproc);
5417   }
5418 
5419   /* allocate space for arguments */
5420   __kmp_alloc_argv_entries(argc, team, FALSE);
5421   team->t.t_argc = argc;
5422 
5423   KA_TRACE(20,
5424            ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n",
5425             team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
5426   { // Initialize barrier data.
5427     int b;
5428     for (b = 0; b < bs_last_barrier; ++b) {
5429       team->t.t_bar[b].b_arrived = KMP_INIT_BARRIER_STATE;
5430 #if USE_DEBUGGER
5431       team->t.t_bar[b].b_master_arrived = 0;
5432       team->t.t_bar[b].b_team_arrived = 0;
5433 #endif
5434     }
5435   }
5436 
5437   team->t.t_proc_bind = new_proc_bind;
5438 
5439 #if OMPT_SUPPORT
5440   __ompt_team_assign_id(team, ompt_parallel_data);
5441   team->t.ompt_serialized_team_info = NULL;
5442 #endif
5443 
5444   KMP_MB();
5445 
5446   KA_TRACE(20, ("__kmp_allocate_team: done creating a new team %d.\n",
5447                 team->t.t_id));
5448 
5449   return team;
5450 }
5451 
5452 /* TODO implement hot-teams at all levels */
5453 /* TODO implement lazy thread release on demand (disband request) */
5454 
5455 /* free the team.  return it to the team pool.  release all the threads
5456  * associated with it */
5457 void __kmp_free_team(kmp_root_t *root,
5458                      kmp_team_t *team USE_NESTED_HOT_ARG(kmp_info_t *master)) {
5459   int f;
5460   KA_TRACE(20, ("__kmp_free_team: T#%d freeing team %d\n", __kmp_get_gtid(),
5461                 team->t.t_id));
5462 
5463   /* verify state */
5464   KMP_DEBUG_ASSERT(root);
5465   KMP_DEBUG_ASSERT(team);
5466   KMP_DEBUG_ASSERT(team->t.t_nproc <= team->t.t_max_nproc);
5467   KMP_DEBUG_ASSERT(team->t.t_threads);
5468 
5469   int use_hot_team = team == root->r.r_hot_team;
5470 #if KMP_NESTED_HOT_TEAMS
5471   int level;
5472   kmp_hot_team_ptr_t *hot_teams;
5473   if (master) {
5474     level = team->t.t_active_level - 1;
5475     if (master->th.th_teams_microtask) { // in teams construct?
5476       if (master->th.th_teams_size.nteams > 1) {
5477         ++level; // level was not increased in teams construct for
5478         // team_of_masters
5479       }
5480       if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
5481           master->th.th_teams_level == team->t.t_level) {
5482         ++level; // level was not increased in teams construct for
5483         // team_of_workers before the parallel
5484       } // team->t.t_level will be increased inside parallel
5485     }
5486     hot_teams = master->th.th_hot_teams;
5487     if (level < __kmp_hot_teams_max_level) {
5488       KMP_DEBUG_ASSERT(team == hot_teams[level].hot_team);
5489       use_hot_team = 1;
5490     }
5491   }
5492 #endif // KMP_NESTED_HOT_TEAMS
5493 
5494   /* team is done working */
5495   TCW_SYNC_PTR(team->t.t_pkfn,
5496                NULL); // Important for Debugging Support Library.
5497 #if KMP_OS_WINDOWS
5498   team->t.t_copyin_counter = 0; // init counter for possible reuse
5499 #endif
5500   // Do not reset pointer to parent team to NULL for hot teams.
5501 
5502   /* if we are non-hot team, release our threads */
5503   if (!use_hot_team) {
5504     if (__kmp_tasking_mode != tskm_immediate_exec) {
5505       // Wait for threads to reach reapable state
5506       for (f = 1; f < team->t.t_nproc; ++f) {
5507         KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5508         kmp_info_t *th = team->t.t_threads[f];
5509         volatile kmp_uint32 *state = &th->th.th_reap_state;
5510         while (*state != KMP_SAFE_TO_REAP) {
5511 #if KMP_OS_WINDOWS
5512           // On Windows a thread can be killed at any time, check this
5513           DWORD ecode;
5514           if (!__kmp_is_thread_alive(th, &ecode)) {
5515             *state = KMP_SAFE_TO_REAP; // reset the flag for dead thread
5516             break;
5517           }
5518 #endif
5519           // first check if thread is sleeping
5520           kmp_flag_64<> fl(&th->th.th_bar[bs_forkjoin_barrier].bb.b_go, th);
5521           if (fl.is_sleeping())
5522             fl.resume(__kmp_gtid_from_thread(th));
5523           KMP_CPU_PAUSE();
5524         }
5525       }
5526 
5527       // Delete task teams
5528       int tt_idx;
5529       for (tt_idx = 0; tt_idx < 2; ++tt_idx) {
5530         kmp_task_team_t *task_team = team->t.t_task_team[tt_idx];
5531         if (task_team != NULL) {
5532           for (f = 0; f < team->t.t_nproc; ++f) { // threads unref task teams
5533             KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5534             team->t.t_threads[f]->th.th_task_team = NULL;
5535           }
5536           KA_TRACE(
5537               20,
5538               ("__kmp_free_team: T#%d deactivating task_team %p on team %d\n",
5539                __kmp_get_gtid(), task_team, team->t.t_id));
5540 #if KMP_NESTED_HOT_TEAMS
5541           __kmp_free_task_team(master, task_team);
5542 #endif
5543           team->t.t_task_team[tt_idx] = NULL;
5544         }
5545       }
5546     }
5547 
5548     // Reset pointer to parent team only for non-hot teams.
5549     team->t.t_parent = NULL;
5550     team->t.t_level = 0;
5551     team->t.t_active_level = 0;
5552 
5553     /* free the worker threads */
5554     for (f = 1; f < team->t.t_nproc; ++f) {
5555       KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5556       __kmp_free_thread(team->t.t_threads[f]);
5557       team->t.t_threads[f] = NULL;
5558     }
5559 
5560     /* put the team back in the team pool */
5561     /* TODO limit size of team pool, call reap_team if pool too large */
5562     team->t.t_next_pool = CCAST(kmp_team_t *, __kmp_team_pool);
5563     __kmp_team_pool = (volatile kmp_team_t *)team;
5564   } else { // Check if team was created for primary threads in teams construct
5565     // See if first worker is a CG root
5566     KMP_DEBUG_ASSERT(team->t.t_threads[1] &&
5567                      team->t.t_threads[1]->th.th_cg_roots);
5568     if (team->t.t_threads[1]->th.th_cg_roots->cg_root == team->t.t_threads[1]) {
5569       // Clean up the CG root nodes on workers so that this team can be re-used
5570       for (f = 1; f < team->t.t_nproc; ++f) {
5571         kmp_info_t *thr = team->t.t_threads[f];
5572         KMP_DEBUG_ASSERT(thr && thr->th.th_cg_roots &&
5573                          thr->th.th_cg_roots->cg_root == thr);
5574         // Pop current CG root off list
5575         kmp_cg_root_t *tmp = thr->th.th_cg_roots;
5576         thr->th.th_cg_roots = tmp->up;
5577         KA_TRACE(100, ("__kmp_free_team: Thread %p popping node %p and moving"
5578                        " up to node %p. cg_nthreads was %d\n",
5579                        thr, tmp, thr->th.th_cg_roots, tmp->cg_nthreads));
5580         int i = tmp->cg_nthreads--;
5581         if (i == 1) {
5582           __kmp_free(tmp); // free CG if we are the last thread in it
5583         }
5584         // Restore current task's thread_limit from CG root
5585         if (thr->th.th_cg_roots)
5586           thr->th.th_current_task->td_icvs.thread_limit =
5587               thr->th.th_cg_roots->cg_thread_limit;
5588       }
5589     }
5590   }
5591 
5592   KMP_MB();
5593 }
5594 
5595 /* reap the team.  destroy it, reclaim all its resources and free its memory */
5596 kmp_team_t *__kmp_reap_team(kmp_team_t *team) {
5597   kmp_team_t *next_pool = team->t.t_next_pool;
5598 
5599   KMP_DEBUG_ASSERT(team);
5600   KMP_DEBUG_ASSERT(team->t.t_dispatch);
5601   KMP_DEBUG_ASSERT(team->t.t_disp_buffer);
5602   KMP_DEBUG_ASSERT(team->t.t_threads);
5603   KMP_DEBUG_ASSERT(team->t.t_argv);
5604 
5605   /* TODO clean the threads that are a part of this? */
5606 
5607   /* free stuff */
5608   __kmp_free_team_arrays(team);
5609   if (team->t.t_argv != &team->t.t_inline_argv[0])
5610     __kmp_free((void *)team->t.t_argv);
5611   __kmp_free(team);
5612 
5613   KMP_MB();
5614   return next_pool;
5615 }
5616 
5617 // Free the thread.  Don't reap it, just place it on the pool of available
5618 // threads.
5619 //
5620 // Changes for Quad issue 527845: We need a predictable OMP tid <-> gtid
5621 // binding for the affinity mechanism to be useful.
5622 //
5623 // Now, we always keep the free list (__kmp_thread_pool) sorted by gtid.
5624 // However, we want to avoid a potential performance problem by always
5625 // scanning through the list to find the correct point at which to insert
5626 // the thread (potential N**2 behavior).  To do this we keep track of the
5627 // last place a thread struct was inserted (__kmp_thread_pool_insert_pt).
5628 // With single-level parallelism, threads will always be added to the tail
5629 // of the list, kept track of by __kmp_thread_pool_insert_pt.  With nested
5630 // parallelism, all bets are off and we may need to scan through the entire
5631 // free list.
5632 //
5633 // This change also has a potentially large performance benefit, for some
5634 // applications.  Previously, as threads were freed from the hot team, they
5635 // would be placed back on the free list in inverse order.  If the hot team
5636 // grew back to it's original size, then the freed thread would be placed
5637 // back on the hot team in reverse order.  This could cause bad cache
5638 // locality problems on programs where the size of the hot team regularly
5639 // grew and shrunk.
5640 //
5641 // Now, for single-level parallelism, the OMP tid is always == gtid.
5642 void __kmp_free_thread(kmp_info_t *this_th) {
5643   int gtid;
5644   kmp_info_t **scan;
5645 
5646   KA_TRACE(20, ("__kmp_free_thread: T#%d putting T#%d back on free pool.\n",
5647                 __kmp_get_gtid(), this_th->th.th_info.ds.ds_gtid));
5648 
5649   KMP_DEBUG_ASSERT(this_th);
5650 
5651   // When moving thread to pool, switch thread to wait on own b_go flag, and
5652   // uninitialized (NULL team).
5653   int b;
5654   kmp_balign_t *balign = this_th->th.th_bar;
5655   for (b = 0; b < bs_last_barrier; ++b) {
5656     if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG)
5657       balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG;
5658     balign[b].bb.team = NULL;
5659     balign[b].bb.leaf_kids = 0;
5660   }
5661   this_th->th.th_task_state = 0;
5662   this_th->th.th_reap_state = KMP_SAFE_TO_REAP;
5663 
5664   /* put thread back on the free pool */
5665   TCW_PTR(this_th->th.th_team, NULL);
5666   TCW_PTR(this_th->th.th_root, NULL);
5667   TCW_PTR(this_th->th.th_dispatch, NULL); /* NOT NEEDED */
5668 
5669   while (this_th->th.th_cg_roots) {
5670     this_th->th.th_cg_roots->cg_nthreads--;
5671     KA_TRACE(100, ("__kmp_free_thread: Thread %p decrement cg_nthreads on node"
5672                    " %p of thread  %p to %d\n",
5673                    this_th, this_th->th.th_cg_roots,
5674                    this_th->th.th_cg_roots->cg_root,
5675                    this_th->th.th_cg_roots->cg_nthreads));
5676     kmp_cg_root_t *tmp = this_th->th.th_cg_roots;
5677     if (tmp->cg_root == this_th) { // Thread is a cg_root
5678       KMP_DEBUG_ASSERT(tmp->cg_nthreads == 0);
5679       KA_TRACE(
5680           5, ("__kmp_free_thread: Thread %p freeing node %p\n", this_th, tmp));
5681       this_th->th.th_cg_roots = tmp->up;
5682       __kmp_free(tmp);
5683     } else { // Worker thread
5684       if (tmp->cg_nthreads == 0) { // last thread leaves contention group
5685         __kmp_free(tmp);
5686       }
5687       this_th->th.th_cg_roots = NULL;
5688       break;
5689     }
5690   }
5691 
5692   /* If the implicit task assigned to this thread can be used by other threads
5693    * -> multiple threads can share the data and try to free the task at
5694    * __kmp_reap_thread at exit. This duplicate use of the task data can happen
5695    * with higher probability when hot team is disabled but can occurs even when
5696    * the hot team is enabled */
5697   __kmp_free_implicit_task(this_th);
5698   this_th->th.th_current_task = NULL;
5699 
5700   // If the __kmp_thread_pool_insert_pt is already past the new insert
5701   // point, then we need to re-scan the entire list.
5702   gtid = this_th->th.th_info.ds.ds_gtid;
5703   if (__kmp_thread_pool_insert_pt != NULL) {
5704     KMP_DEBUG_ASSERT(__kmp_thread_pool != NULL);
5705     if (__kmp_thread_pool_insert_pt->th.th_info.ds.ds_gtid > gtid) {
5706       __kmp_thread_pool_insert_pt = NULL;
5707     }
5708   }
5709 
5710   // Scan down the list to find the place to insert the thread.
5711   // scan is the address of a link in the list, possibly the address of
5712   // __kmp_thread_pool itself.
5713   //
5714   // In the absence of nested parallelism, the for loop will have 0 iterations.
5715   if (__kmp_thread_pool_insert_pt != NULL) {
5716     scan = &(__kmp_thread_pool_insert_pt->th.th_next_pool);
5717   } else {
5718     scan = CCAST(kmp_info_t **, &__kmp_thread_pool);
5719   }
5720   for (; (*scan != NULL) && ((*scan)->th.th_info.ds.ds_gtid < gtid);
5721        scan = &((*scan)->th.th_next_pool))
5722     ;
5723 
5724   // Insert the new element on the list, and set __kmp_thread_pool_insert_pt
5725   // to its address.
5726   TCW_PTR(this_th->th.th_next_pool, *scan);
5727   __kmp_thread_pool_insert_pt = *scan = this_th;
5728   KMP_DEBUG_ASSERT((this_th->th.th_next_pool == NULL) ||
5729                    (this_th->th.th_info.ds.ds_gtid <
5730                     this_th->th.th_next_pool->th.th_info.ds.ds_gtid));
5731   TCW_4(this_th->th.th_in_pool, TRUE);
5732   __kmp_suspend_initialize_thread(this_th);
5733   __kmp_lock_suspend_mx(this_th);
5734   if (this_th->th.th_active == TRUE) {
5735     KMP_ATOMIC_INC(&__kmp_thread_pool_active_nth);
5736     this_th->th.th_active_in_pool = TRUE;
5737   }
5738 #if KMP_DEBUG
5739   else {
5740     KMP_DEBUG_ASSERT(this_th->th.th_active_in_pool == FALSE);
5741   }
5742 #endif
5743   __kmp_unlock_suspend_mx(this_th);
5744 
5745   TCW_4(__kmp_nth, __kmp_nth - 1);
5746 
5747 #ifdef KMP_ADJUST_BLOCKTIME
5748   /* Adjust blocktime back to user setting or default if necessary */
5749   /* Middle initialization might never have occurred                */
5750   if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
5751     KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
5752     if (__kmp_nth <= __kmp_avail_proc) {
5753       __kmp_zero_bt = FALSE;
5754     }
5755   }
5756 #endif /* KMP_ADJUST_BLOCKTIME */
5757 
5758   KMP_MB();
5759 }
5760 
5761 /* ------------------------------------------------------------------------ */
5762 
5763 void *__kmp_launch_thread(kmp_info_t *this_thr) {
5764 #if OMP_PROFILING_SUPPORT
5765   ProfileTraceFile = getenv("LIBOMPTARGET_PROFILE");
5766   // TODO: add a configuration option for time granularity
5767   if (ProfileTraceFile)
5768     llvm::timeTraceProfilerInitialize(500 /* us */, "libomptarget");
5769 #endif
5770 
5771   int gtid = this_thr->th.th_info.ds.ds_gtid;
5772   /*    void                 *stack_data;*/
5773   kmp_team_t **volatile pteam;
5774 
5775   KMP_MB();
5776   KA_TRACE(10, ("__kmp_launch_thread: T#%d start\n", gtid));
5777 
5778   if (__kmp_env_consistency_check) {
5779     this_thr->th.th_cons = __kmp_allocate_cons_stack(gtid); // ATT: Memory leak?
5780   }
5781 
5782 #if OMPD_SUPPORT
5783   if (ompd_state & OMPD_ENABLE_BP)
5784     ompd_bp_thread_begin();
5785 #endif
5786 
5787 #if OMPT_SUPPORT
5788   ompt_data_t *thread_data = nullptr;
5789   if (ompt_enabled.enabled) {
5790     thread_data = &(this_thr->th.ompt_thread_info.thread_data);
5791     *thread_data = ompt_data_none;
5792 
5793     this_thr->th.ompt_thread_info.state = ompt_state_overhead;
5794     this_thr->th.ompt_thread_info.wait_id = 0;
5795     this_thr->th.ompt_thread_info.idle_frame = OMPT_GET_FRAME_ADDRESS(0);
5796     this_thr->th.ompt_thread_info.parallel_flags = 0;
5797     if (ompt_enabled.ompt_callback_thread_begin) {
5798       ompt_callbacks.ompt_callback(ompt_callback_thread_begin)(
5799           ompt_thread_worker, thread_data);
5800     }
5801     this_thr->th.ompt_thread_info.state = ompt_state_idle;
5802   }
5803 #endif
5804 
5805   /* This is the place where threads wait for work */
5806   while (!TCR_4(__kmp_global.g.g_done)) {
5807     KMP_DEBUG_ASSERT(this_thr == __kmp_threads[gtid]);
5808     KMP_MB();
5809 
5810     /* wait for work to do */
5811     KA_TRACE(20, ("__kmp_launch_thread: T#%d waiting for work\n", gtid));
5812 
5813     /* No tid yet since not part of a team */
5814     __kmp_fork_barrier(gtid, KMP_GTID_DNE);
5815 
5816 #if OMPT_SUPPORT
5817     if (ompt_enabled.enabled) {
5818       this_thr->th.ompt_thread_info.state = ompt_state_overhead;
5819     }
5820 #endif
5821 
5822     pteam = &this_thr->th.th_team;
5823 
5824     /* have we been allocated? */
5825     if (TCR_SYNC_PTR(*pteam) && !TCR_4(__kmp_global.g.g_done)) {
5826       /* we were just woken up, so run our new task */
5827       if (TCR_SYNC_PTR((*pteam)->t.t_pkfn) != NULL) {
5828         int rc;
5829         KA_TRACE(20,
5830                  ("__kmp_launch_thread: T#%d(%d:%d) invoke microtask = %p\n",
5831                   gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid),
5832                   (*pteam)->t.t_pkfn));
5833 
5834         updateHWFPControl(*pteam);
5835 
5836 #if OMPT_SUPPORT
5837         if (ompt_enabled.enabled) {
5838           this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
5839         }
5840 #endif
5841 
5842         rc = (*pteam)->t.t_invoke(gtid);
5843         KMP_ASSERT(rc);
5844 
5845         KMP_MB();
5846         KA_TRACE(20, ("__kmp_launch_thread: T#%d(%d:%d) done microtask = %p\n",
5847                       gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid),
5848                       (*pteam)->t.t_pkfn));
5849       }
5850 #if OMPT_SUPPORT
5851       if (ompt_enabled.enabled) {
5852         /* no frame set while outside task */
5853         __ompt_get_task_info_object(0)->frame.exit_frame = ompt_data_none;
5854 
5855         this_thr->th.ompt_thread_info.state = ompt_state_overhead;
5856       }
5857 #endif
5858       /* join barrier after parallel region */
5859       __kmp_join_barrier(gtid);
5860     }
5861   }
5862   TCR_SYNC_PTR((intptr_t)__kmp_global.g.g_done);
5863 
5864 #if OMPD_SUPPORT
5865   if (ompd_state & OMPD_ENABLE_BP)
5866     ompd_bp_thread_end();
5867 #endif
5868 
5869 #if OMPT_SUPPORT
5870   if (ompt_enabled.ompt_callback_thread_end) {
5871     ompt_callbacks.ompt_callback(ompt_callback_thread_end)(thread_data);
5872   }
5873 #endif
5874 
5875   this_thr->th.th_task_team = NULL;
5876   /* run the destructors for the threadprivate data for this thread */
5877   __kmp_common_destroy_gtid(gtid);
5878 
5879   KA_TRACE(10, ("__kmp_launch_thread: T#%d done\n", gtid));
5880   KMP_MB();
5881 
5882 #if OMP_PROFILING_SUPPORT
5883   llvm::timeTraceProfilerFinishThread();
5884 #endif
5885   return this_thr;
5886 }
5887 
5888 /* ------------------------------------------------------------------------ */
5889 
5890 void __kmp_internal_end_dest(void *specific_gtid) {
5891   // Make sure no significant bits are lost
5892   int gtid;
5893   __kmp_type_convert((kmp_intptr_t)specific_gtid - 1, &gtid);
5894 
5895   KA_TRACE(30, ("__kmp_internal_end_dest: T#%d\n", gtid));
5896   /* NOTE: the gtid is stored as gitd+1 in the thread-local-storage
5897    * this is because 0 is reserved for the nothing-stored case */
5898 
5899   __kmp_internal_end_thread(gtid);
5900 }
5901 
5902 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB
5903 
5904 __attribute__((destructor)) void __kmp_internal_end_dtor(void) {
5905   __kmp_internal_end_atexit();
5906 }
5907 
5908 #endif
5909 
5910 /* [Windows] josh: when the atexit handler is called, there may still be more
5911    than one thread alive */
5912 void __kmp_internal_end_atexit(void) {
5913   KA_TRACE(30, ("__kmp_internal_end_atexit\n"));
5914   /* [Windows]
5915      josh: ideally, we want to completely shutdown the library in this atexit
5916      handler, but stat code that depends on thread specific data for gtid fails
5917      because that data becomes unavailable at some point during the shutdown, so
5918      we call __kmp_internal_end_thread instead. We should eventually remove the
5919      dependency on __kmp_get_specific_gtid in the stat code and use
5920      __kmp_internal_end_library to cleanly shutdown the library.
5921 
5922      // TODO: Can some of this comment about GVS be removed?
5923      I suspect that the offending stat code is executed when the calling thread
5924      tries to clean up a dead root thread's data structures, resulting in GVS
5925      code trying to close the GVS structures for that thread, but since the stat
5926      code uses __kmp_get_specific_gtid to get the gtid with the assumption that
5927      the calling thread is cleaning up itself instead of another thread, it get
5928      confused. This happens because allowing a thread to unregister and cleanup
5929      another thread is a recent modification for addressing an issue.
5930      Based on the current design (20050722), a thread may end up
5931      trying to unregister another thread only if thread death does not trigger
5932      the calling of __kmp_internal_end_thread.  For Linux* OS, there is the
5933      thread specific data destructor function to detect thread death. For
5934      Windows dynamic, there is DllMain(THREAD_DETACH). For Windows static, there
5935      is nothing.  Thus, the workaround is applicable only for Windows static
5936      stat library. */
5937   __kmp_internal_end_library(-1);
5938 #if KMP_OS_WINDOWS
5939   __kmp_close_console();
5940 #endif
5941 }
5942 
5943 static void __kmp_reap_thread(kmp_info_t *thread, int is_root) {
5944   // It is assumed __kmp_forkjoin_lock is acquired.
5945 
5946   int gtid;
5947 
5948   KMP_DEBUG_ASSERT(thread != NULL);
5949 
5950   gtid = thread->th.th_info.ds.ds_gtid;
5951 
5952   if (!is_root) {
5953     if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
5954       /* Assume the threads are at the fork barrier here */
5955       KA_TRACE(
5956           20, ("__kmp_reap_thread: releasing T#%d from fork barrier for reap\n",
5957                gtid));
5958       /* Need release fence here to prevent seg faults for tree forkjoin barrier
5959        * (GEH) */
5960       kmp_flag_64<> flag(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go,
5961                          thread);
5962       __kmp_release_64(&flag);
5963     }
5964 
5965     // Terminate OS thread.
5966     __kmp_reap_worker(thread);
5967 
5968     // The thread was killed asynchronously.  If it was actively
5969     // spinning in the thread pool, decrement the global count.
5970     //
5971     // There is a small timing hole here - if the worker thread was just waking
5972     // up after sleeping in the pool, had reset it's th_active_in_pool flag but
5973     // not decremented the global counter __kmp_thread_pool_active_nth yet, then
5974     // the global counter might not get updated.
5975     //
5976     // Currently, this can only happen as the library is unloaded,
5977     // so there are no harmful side effects.
5978     if (thread->th.th_active_in_pool) {
5979       thread->th.th_active_in_pool = FALSE;
5980       KMP_ATOMIC_DEC(&__kmp_thread_pool_active_nth);
5981       KMP_DEBUG_ASSERT(__kmp_thread_pool_active_nth >= 0);
5982     }
5983   }
5984 
5985   __kmp_free_implicit_task(thread);
5986 
5987 // Free the fast memory for tasking
5988 #if USE_FAST_MEMORY
5989   __kmp_free_fast_memory(thread);
5990 #endif /* USE_FAST_MEMORY */
5991 
5992   __kmp_suspend_uninitialize_thread(thread);
5993 
5994   KMP_DEBUG_ASSERT(__kmp_threads[gtid] == thread);
5995   TCW_SYNC_PTR(__kmp_threads[gtid], NULL);
5996 
5997   --__kmp_all_nth;
5998   // __kmp_nth was decremented when thread is added to the pool.
5999 
6000 #ifdef KMP_ADJUST_BLOCKTIME
6001   /* Adjust blocktime back to user setting or default if necessary */
6002   /* Middle initialization might never have occurred                */
6003   if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
6004     KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
6005     if (__kmp_nth <= __kmp_avail_proc) {
6006       __kmp_zero_bt = FALSE;
6007     }
6008   }
6009 #endif /* KMP_ADJUST_BLOCKTIME */
6010 
6011   /* free the memory being used */
6012   if (__kmp_env_consistency_check) {
6013     if (thread->th.th_cons) {
6014       __kmp_free_cons_stack(thread->th.th_cons);
6015       thread->th.th_cons = NULL;
6016     }
6017   }
6018 
6019   if (thread->th.th_pri_common != NULL) {
6020     __kmp_free(thread->th.th_pri_common);
6021     thread->th.th_pri_common = NULL;
6022   }
6023 
6024   if (thread->th.th_task_state_memo_stack != NULL) {
6025     __kmp_free(thread->th.th_task_state_memo_stack);
6026     thread->th.th_task_state_memo_stack = NULL;
6027   }
6028 
6029 #if KMP_USE_BGET
6030   if (thread->th.th_local.bget_data != NULL) {
6031     __kmp_finalize_bget(thread);
6032   }
6033 #endif
6034 
6035 #if KMP_AFFINITY_SUPPORTED
6036   if (thread->th.th_affin_mask != NULL) {
6037     KMP_CPU_FREE(thread->th.th_affin_mask);
6038     thread->th.th_affin_mask = NULL;
6039   }
6040 #endif /* KMP_AFFINITY_SUPPORTED */
6041 
6042 #if KMP_USE_HIER_SCHED
6043   if (thread->th.th_hier_bar_data != NULL) {
6044     __kmp_free(thread->th.th_hier_bar_data);
6045     thread->th.th_hier_bar_data = NULL;
6046   }
6047 #endif
6048 
6049   __kmp_reap_team(thread->th.th_serial_team);
6050   thread->th.th_serial_team = NULL;
6051   __kmp_free(thread);
6052 
6053   KMP_MB();
6054 
6055 } // __kmp_reap_thread
6056 
6057 static void __kmp_internal_end(void) {
6058   int i;
6059 
6060   /* First, unregister the library */
6061   __kmp_unregister_library();
6062 
6063 #if KMP_OS_WINDOWS
6064   /* In Win static library, we can't tell when a root actually dies, so we
6065      reclaim the data structures for any root threads that have died but not
6066      unregistered themselves, in order to shut down cleanly.
6067      In Win dynamic library we also can't tell when a thread dies.  */
6068   __kmp_reclaim_dead_roots(); // AC: moved here to always clean resources of
6069 // dead roots
6070 #endif
6071 
6072   for (i = 0; i < __kmp_threads_capacity; i++)
6073     if (__kmp_root[i])
6074       if (__kmp_root[i]->r.r_active)
6075         break;
6076   KMP_MB(); /* Flush all pending memory write invalidates.  */
6077   TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6078 
6079   if (i < __kmp_threads_capacity) {
6080 #if KMP_USE_MONITOR
6081     // 2009-09-08 (lev): Other alive roots found. Why do we kill the monitor??
6082     KMP_MB(); /* Flush all pending memory write invalidates.  */
6083 
6084     // Need to check that monitor was initialized before reaping it. If we are
6085     // called form __kmp_atfork_child (which sets __kmp_init_parallel = 0), then
6086     // __kmp_monitor will appear to contain valid data, but it is only valid in
6087     // the parent process, not the child.
6088     // New behavior (201008): instead of keying off of the flag
6089     // __kmp_init_parallel, the monitor thread creation is keyed off
6090     // of the new flag __kmp_init_monitor.
6091     __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
6092     if (TCR_4(__kmp_init_monitor)) {
6093       __kmp_reap_monitor(&__kmp_monitor);
6094       TCW_4(__kmp_init_monitor, 0);
6095     }
6096     __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
6097     KA_TRACE(10, ("__kmp_internal_end: monitor reaped\n"));
6098 #endif // KMP_USE_MONITOR
6099   } else {
6100 /* TODO move this to cleanup code */
6101 #ifdef KMP_DEBUG
6102     /* make sure that everything has properly ended */
6103     for (i = 0; i < __kmp_threads_capacity; i++) {
6104       if (__kmp_root[i]) {
6105         //                    KMP_ASSERT( ! KMP_UBER_GTID( i ) );         // AC:
6106         //                    there can be uber threads alive here
6107         KMP_ASSERT(!__kmp_root[i]->r.r_active); // TODO: can they be active?
6108       }
6109     }
6110 #endif
6111 
6112     KMP_MB();
6113 
6114     // Reap the worker threads.
6115     // This is valid for now, but be careful if threads are reaped sooner.
6116     while (__kmp_thread_pool != NULL) { // Loop thru all the thread in the pool.
6117       // Get the next thread from the pool.
6118       kmp_info_t *thread = CCAST(kmp_info_t *, __kmp_thread_pool);
6119       __kmp_thread_pool = thread->th.th_next_pool;
6120       // Reap it.
6121       KMP_DEBUG_ASSERT(thread->th.th_reap_state == KMP_SAFE_TO_REAP);
6122       thread->th.th_next_pool = NULL;
6123       thread->th.th_in_pool = FALSE;
6124       __kmp_reap_thread(thread, 0);
6125     }
6126     __kmp_thread_pool_insert_pt = NULL;
6127 
6128     // Reap teams.
6129     while (__kmp_team_pool != NULL) { // Loop thru all the teams in the pool.
6130       // Get the next team from the pool.
6131       kmp_team_t *team = CCAST(kmp_team_t *, __kmp_team_pool);
6132       __kmp_team_pool = team->t.t_next_pool;
6133       // Reap it.
6134       team->t.t_next_pool = NULL;
6135       __kmp_reap_team(team);
6136     }
6137 
6138     __kmp_reap_task_teams();
6139 
6140 #if KMP_OS_UNIX
6141     // Threads that are not reaped should not access any resources since they
6142     // are going to be deallocated soon, so the shutdown sequence should wait
6143     // until all threads either exit the final spin-waiting loop or begin
6144     // sleeping after the given blocktime.
6145     for (i = 0; i < __kmp_threads_capacity; i++) {
6146       kmp_info_t *thr = __kmp_threads[i];
6147       while (thr && KMP_ATOMIC_LD_ACQ(&thr->th.th_blocking))
6148         KMP_CPU_PAUSE();
6149     }
6150 #endif
6151 
6152     for (i = 0; i < __kmp_threads_capacity; ++i) {
6153       // TBD: Add some checking...
6154       // Something like KMP_DEBUG_ASSERT( __kmp_thread[ i ] == NULL );
6155     }
6156 
6157     /* Make sure all threadprivate destructors get run by joining with all
6158        worker threads before resetting this flag */
6159     TCW_SYNC_4(__kmp_init_common, FALSE);
6160 
6161     KA_TRACE(10, ("__kmp_internal_end: all workers reaped\n"));
6162     KMP_MB();
6163 
6164 #if KMP_USE_MONITOR
6165     // See note above: One of the possible fixes for CQ138434 / CQ140126
6166     //
6167     // FIXME: push both code fragments down and CSE them?
6168     // push them into __kmp_cleanup() ?
6169     __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
6170     if (TCR_4(__kmp_init_monitor)) {
6171       __kmp_reap_monitor(&__kmp_monitor);
6172       TCW_4(__kmp_init_monitor, 0);
6173     }
6174     __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
6175     KA_TRACE(10, ("__kmp_internal_end: monitor reaped\n"));
6176 #endif
6177   } /* else !__kmp_global.t_active */
6178   TCW_4(__kmp_init_gtid, FALSE);
6179   KMP_MB(); /* Flush all pending memory write invalidates.  */
6180 
6181   __kmp_cleanup();
6182 #if OMPT_SUPPORT
6183   ompt_fini();
6184 #endif
6185 }
6186 
6187 void __kmp_internal_end_library(int gtid_req) {
6188   /* if we have already cleaned up, don't try again, it wouldn't be pretty */
6189   /* this shouldn't be a race condition because __kmp_internal_end() is the
6190      only place to clear __kmp_serial_init */
6191   /* we'll check this later too, after we get the lock */
6192   // 2009-09-06: We do not set g_abort without setting g_done. This check looks
6193   // redundant, because the next check will work in any case.
6194   if (__kmp_global.g.g_abort) {
6195     KA_TRACE(11, ("__kmp_internal_end_library: abort, exiting\n"));
6196     /* TODO abort? */
6197     return;
6198   }
6199   if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6200     KA_TRACE(10, ("__kmp_internal_end_library: already finished\n"));
6201     return;
6202   }
6203 
6204   // If hidden helper team has been initialized, we need to deinit it
6205   if (TCR_4(__kmp_init_hidden_helper) &&
6206       !TCR_4(__kmp_hidden_helper_team_done)) {
6207     TCW_SYNC_4(__kmp_hidden_helper_team_done, TRUE);
6208     // First release the main thread to let it continue its work
6209     __kmp_hidden_helper_main_thread_release();
6210     // Wait until the hidden helper team has been destroyed
6211     __kmp_hidden_helper_threads_deinitz_wait();
6212   }
6213 
6214   KMP_MB(); /* Flush all pending memory write invalidates.  */
6215   /* find out who we are and what we should do */
6216   {
6217     int gtid = (gtid_req >= 0) ? gtid_req : __kmp_gtid_get_specific();
6218     KA_TRACE(
6219         10, ("__kmp_internal_end_library: enter T#%d  (%d)\n", gtid, gtid_req));
6220     if (gtid == KMP_GTID_SHUTDOWN) {
6221       KA_TRACE(10, ("__kmp_internal_end_library: !__kmp_init_runtime, system "
6222                     "already shutdown\n"));
6223       return;
6224     } else if (gtid == KMP_GTID_MONITOR) {
6225       KA_TRACE(10, ("__kmp_internal_end_library: monitor thread, gtid not "
6226                     "registered, or system shutdown\n"));
6227       return;
6228     } else if (gtid == KMP_GTID_DNE) {
6229       KA_TRACE(10, ("__kmp_internal_end_library: gtid not registered or system "
6230                     "shutdown\n"));
6231       /* we don't know who we are, but we may still shutdown the library */
6232     } else if (KMP_UBER_GTID(gtid)) {
6233       /* unregister ourselves as an uber thread.  gtid is no longer valid */
6234       if (__kmp_root[gtid]->r.r_active) {
6235         __kmp_global.g.g_abort = -1;
6236         TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6237         __kmp_unregister_library();
6238         KA_TRACE(10,
6239                  ("__kmp_internal_end_library: root still active, abort T#%d\n",
6240                   gtid));
6241         return;
6242       } else {
6243         KA_TRACE(
6244             10,
6245             ("__kmp_internal_end_library: unregistering sibling T#%d\n", gtid));
6246         __kmp_unregister_root_current_thread(gtid);
6247       }
6248     } else {
6249 /* worker threads may call this function through the atexit handler, if they
6250  * call exit() */
6251 /* For now, skip the usual subsequent processing and just dump the debug buffer.
6252    TODO: do a thorough shutdown instead */
6253 #ifdef DUMP_DEBUG_ON_EXIT
6254       if (__kmp_debug_buf)
6255         __kmp_dump_debug_buffer();
6256 #endif
6257       // added unregister library call here when we switch to shm linux
6258       // if we don't, it will leave lots of files in /dev/shm
6259       // cleanup shared memory file before exiting.
6260       __kmp_unregister_library();
6261       return;
6262     }
6263   }
6264   /* synchronize the termination process */
6265   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6266 
6267   /* have we already finished */
6268   if (__kmp_global.g.g_abort) {
6269     KA_TRACE(10, ("__kmp_internal_end_library: abort, exiting\n"));
6270     /* TODO abort? */
6271     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6272     return;
6273   }
6274   if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6275     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6276     return;
6277   }
6278 
6279   /* We need this lock to enforce mutex between this reading of
6280      __kmp_threads_capacity and the writing by __kmp_register_root.
6281      Alternatively, we can use a counter of roots that is atomically updated by
6282      __kmp_get_global_thread_id_reg, __kmp_do_serial_initialize and
6283      __kmp_internal_end_*.  */
6284   __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
6285 
6286   /* now we can safely conduct the actual termination */
6287   __kmp_internal_end();
6288 
6289   __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6290   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6291 
6292   KA_TRACE(10, ("__kmp_internal_end_library: exit\n"));
6293 
6294 #ifdef DUMP_DEBUG_ON_EXIT
6295   if (__kmp_debug_buf)
6296     __kmp_dump_debug_buffer();
6297 #endif
6298 
6299 #if KMP_OS_WINDOWS
6300   __kmp_close_console();
6301 #endif
6302 
6303   __kmp_fini_allocator();
6304 
6305 } // __kmp_internal_end_library
6306 
6307 void __kmp_internal_end_thread(int gtid_req) {
6308   int i;
6309 
6310   /* if we have already cleaned up, don't try again, it wouldn't be pretty */
6311   /* this shouldn't be a race condition because __kmp_internal_end() is the
6312    * only place to clear __kmp_serial_init */
6313   /* we'll check this later too, after we get the lock */
6314   // 2009-09-06: We do not set g_abort without setting g_done. This check looks
6315   // redundant, because the next check will work in any case.
6316   if (__kmp_global.g.g_abort) {
6317     KA_TRACE(11, ("__kmp_internal_end_thread: abort, exiting\n"));
6318     /* TODO abort? */
6319     return;
6320   }
6321   if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6322     KA_TRACE(10, ("__kmp_internal_end_thread: already finished\n"));
6323     return;
6324   }
6325 
6326   // If hidden helper team has been initialized, we need to deinit it
6327   if (TCR_4(__kmp_init_hidden_helper) &&
6328       !TCR_4(__kmp_hidden_helper_team_done)) {
6329     TCW_SYNC_4(__kmp_hidden_helper_team_done, TRUE);
6330     // First release the main thread to let it continue its work
6331     __kmp_hidden_helper_main_thread_release();
6332     // Wait until the hidden helper team has been destroyed
6333     __kmp_hidden_helper_threads_deinitz_wait();
6334   }
6335 
6336   KMP_MB(); /* Flush all pending memory write invalidates.  */
6337 
6338   /* find out who we are and what we should do */
6339   {
6340     int gtid = (gtid_req >= 0) ? gtid_req : __kmp_gtid_get_specific();
6341     KA_TRACE(10,
6342              ("__kmp_internal_end_thread: enter T#%d  (%d)\n", gtid, gtid_req));
6343     if (gtid == KMP_GTID_SHUTDOWN) {
6344       KA_TRACE(10, ("__kmp_internal_end_thread: !__kmp_init_runtime, system "
6345                     "already shutdown\n"));
6346       return;
6347     } else if (gtid == KMP_GTID_MONITOR) {
6348       KA_TRACE(10, ("__kmp_internal_end_thread: monitor thread, gtid not "
6349                     "registered, or system shutdown\n"));
6350       return;
6351     } else if (gtid == KMP_GTID_DNE) {
6352       KA_TRACE(10, ("__kmp_internal_end_thread: gtid not registered or system "
6353                     "shutdown\n"));
6354       return;
6355       /* we don't know who we are */
6356     } else if (KMP_UBER_GTID(gtid)) {
6357       /* unregister ourselves as an uber thread.  gtid is no longer valid */
6358       if (__kmp_root[gtid]->r.r_active) {
6359         __kmp_global.g.g_abort = -1;
6360         TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6361         KA_TRACE(10,
6362                  ("__kmp_internal_end_thread: root still active, abort T#%d\n",
6363                   gtid));
6364         return;
6365       } else {
6366         KA_TRACE(10, ("__kmp_internal_end_thread: unregistering sibling T#%d\n",
6367                       gtid));
6368         __kmp_unregister_root_current_thread(gtid);
6369       }
6370     } else {
6371       /* just a worker thread, let's leave */
6372       KA_TRACE(10, ("__kmp_internal_end_thread: worker thread T#%d\n", gtid));
6373 
6374       if (gtid >= 0) {
6375         __kmp_threads[gtid]->th.th_task_team = NULL;
6376       }
6377 
6378       KA_TRACE(10,
6379                ("__kmp_internal_end_thread: worker thread done, exiting T#%d\n",
6380                 gtid));
6381       return;
6382     }
6383   }
6384 #if KMP_DYNAMIC_LIB
6385   if (__kmp_pause_status != kmp_hard_paused)
6386   // AC: lets not shutdown the dynamic library at the exit of uber thread,
6387   // because we will better shutdown later in the library destructor.
6388   {
6389     KA_TRACE(10, ("__kmp_internal_end_thread: exiting T#%d\n", gtid_req));
6390     return;
6391   }
6392 #endif
6393   /* synchronize the termination process */
6394   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6395 
6396   /* have we already finished */
6397   if (__kmp_global.g.g_abort) {
6398     KA_TRACE(10, ("__kmp_internal_end_thread: abort, exiting\n"));
6399     /* TODO abort? */
6400     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6401     return;
6402   }
6403   if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6404     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6405     return;
6406   }
6407 
6408   /* We need this lock to enforce mutex between this reading of
6409      __kmp_threads_capacity and the writing by __kmp_register_root.
6410      Alternatively, we can use a counter of roots that is atomically updated by
6411      __kmp_get_global_thread_id_reg, __kmp_do_serial_initialize and
6412      __kmp_internal_end_*.  */
6413 
6414   /* should we finish the run-time?  are all siblings done? */
6415   __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
6416 
6417   for (i = 0; i < __kmp_threads_capacity; ++i) {
6418     if (KMP_UBER_GTID(i)) {
6419       KA_TRACE(
6420           10,
6421           ("__kmp_internal_end_thread: remaining sibling task: gtid==%d\n", i));
6422       __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6423       __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6424       return;
6425     }
6426   }
6427 
6428   /* now we can safely conduct the actual termination */
6429 
6430   __kmp_internal_end();
6431 
6432   __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6433   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6434 
6435   KA_TRACE(10, ("__kmp_internal_end_thread: exit T#%d\n", gtid_req));
6436 
6437 #ifdef DUMP_DEBUG_ON_EXIT
6438   if (__kmp_debug_buf)
6439     __kmp_dump_debug_buffer();
6440 #endif
6441 } // __kmp_internal_end_thread
6442 
6443 // -----------------------------------------------------------------------------
6444 // Library registration stuff.
6445 
6446 static long __kmp_registration_flag = 0;
6447 // Random value used to indicate library initialization.
6448 static char *__kmp_registration_str = NULL;
6449 // Value to be saved in env var __KMP_REGISTERED_LIB_<pid>.
6450 
6451 static inline char *__kmp_reg_status_name() {
6452 /* On RHEL 3u5 if linked statically, getpid() returns different values in
6453    each thread. If registration and unregistration go in different threads
6454    (omp_misc_other_root_exit.cpp test case), the name of registered_lib_env
6455    env var can not be found, because the name will contain different pid. */
6456 // macOS* complains about name being too long with additional getuid()
6457 #if KMP_OS_UNIX && !KMP_OS_DARWIN && KMP_DYNAMIC_LIB
6458   return __kmp_str_format("__KMP_REGISTERED_LIB_%d_%d", (int)getpid(),
6459                           (int)getuid());
6460 #else
6461   return __kmp_str_format("__KMP_REGISTERED_LIB_%d", (int)getpid());
6462 #endif
6463 } // __kmp_reg_status_get
6464 
6465 void __kmp_register_library_startup(void) {
6466 
6467   char *name = __kmp_reg_status_name(); // Name of the environment variable.
6468   int done = 0;
6469   union {
6470     double dtime;
6471     long ltime;
6472   } time;
6473 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
6474   __kmp_initialize_system_tick();
6475 #endif
6476   __kmp_read_system_time(&time.dtime);
6477   __kmp_registration_flag = 0xCAFE0000L | (time.ltime & 0x0000FFFFL);
6478   __kmp_registration_str =
6479       __kmp_str_format("%p-%lx-%s", &__kmp_registration_flag,
6480                        __kmp_registration_flag, KMP_LIBRARY_FILE);
6481 
6482   KA_TRACE(50, ("__kmp_register_library_startup: %s=\"%s\"\n", name,
6483                 __kmp_registration_str));
6484 
6485   while (!done) {
6486 
6487     char *value = NULL; // Actual value of the environment variable.
6488 
6489 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB // shared memory is with dynamic library
6490     char *shm_name = __kmp_str_format("/%s", name);
6491     int shm_preexist = 0;
6492     char *data1;
6493     int fd1 = shm_open(shm_name, O_CREAT | O_EXCL | O_RDWR, 0666);
6494     if ((fd1 == -1) && (errno == EEXIST)) {
6495       // file didn't open because it already exists.
6496       // try opening existing file
6497       fd1 = shm_open(shm_name, O_RDWR, 0666);
6498       if (fd1 == -1) { // file didn't open
6499         // error out here
6500         __kmp_fatal(KMP_MSG(FunctionError, "Can't open SHM"), KMP_ERR(0),
6501                     __kmp_msg_null);
6502       } else {
6503         // able to open existing file
6504         shm_preexist = 1;
6505       }
6506     } else if (fd1 == -1) { // SHM didn't open; it was due to error other than
6507       // already exists.
6508       // error out here.
6509       __kmp_fatal(KMP_MSG(FunctionError, "Can't open SHM2"), KMP_ERR(errno),
6510                   __kmp_msg_null);
6511     }
6512     if (shm_preexist == 0) {
6513       // we created SHM now set size
6514       if (ftruncate(fd1, SHM_SIZE) == -1) {
6515         // error occured setting size;
6516         __kmp_fatal(KMP_MSG(FunctionError, "Can't set size of SHM"),
6517                     KMP_ERR(errno), __kmp_msg_null);
6518       }
6519     }
6520     data1 =
6521         (char *)mmap(0, SHM_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED, fd1, 0);
6522     if (data1 == MAP_FAILED) {
6523       // failed to map shared memory
6524       __kmp_fatal(KMP_MSG(FunctionError, "Can't map SHM"), KMP_ERR(errno),
6525                   __kmp_msg_null);
6526     }
6527     if (shm_preexist == 0) { // set data to SHM, set value
6528       KMP_STRCPY_S(data1, SHM_SIZE, __kmp_registration_str);
6529     }
6530     // Read value from either what we just wrote or existing file.
6531     value = __kmp_str_format("%s", data1); // read value from SHM
6532     munmap(data1, SHM_SIZE);
6533     close(fd1);
6534 #else // Windows and unix with static library
6535     // Set environment variable, but do not overwrite if it is exist.
6536     __kmp_env_set(name, __kmp_registration_str, 0);
6537     // read value to see if it got set
6538     value = __kmp_env_get(name);
6539 #endif
6540 
6541     if (value != NULL && strcmp(value, __kmp_registration_str) == 0) {
6542       done = 1; // Ok, environment variable set successfully, exit the loop.
6543     } else {
6544       // Oops. Write failed. Another copy of OpenMP RTL is in memory.
6545       // Check whether it alive or dead.
6546       int neighbor = 0; // 0 -- unknown status, 1 -- alive, 2 -- dead.
6547       char *tail = value;
6548       char *flag_addr_str = NULL;
6549       char *flag_val_str = NULL;
6550       char const *file_name = NULL;
6551       __kmp_str_split(tail, '-', &flag_addr_str, &tail);
6552       __kmp_str_split(tail, '-', &flag_val_str, &tail);
6553       file_name = tail;
6554       if (tail != NULL) {
6555         unsigned long *flag_addr = 0;
6556         unsigned long flag_val = 0;
6557         KMP_SSCANF(flag_addr_str, "%p", RCAST(void **, &flag_addr));
6558         KMP_SSCANF(flag_val_str, "%lx", &flag_val);
6559         if (flag_addr != 0 && flag_val != 0 && strcmp(file_name, "") != 0) {
6560           // First, check whether environment-encoded address is mapped into
6561           // addr space.
6562           // If so, dereference it to see if it still has the right value.
6563           if (__kmp_is_address_mapped(flag_addr) && *flag_addr == flag_val) {
6564             neighbor = 1;
6565           } else {
6566             // If not, then we know the other copy of the library is no longer
6567             // running.
6568             neighbor = 2;
6569           }
6570         }
6571       }
6572       switch (neighbor) {
6573       case 0: // Cannot parse environment variable -- neighbor status unknown.
6574         // Assume it is the incompatible format of future version of the
6575         // library. Assume the other library is alive.
6576         // WARN( ... ); // TODO: Issue a warning.
6577         file_name = "unknown library";
6578         KMP_FALLTHROUGH();
6579       // Attention! Falling to the next case. That's intentional.
6580       case 1: { // Neighbor is alive.
6581         // Check it is allowed.
6582         char *duplicate_ok = __kmp_env_get("KMP_DUPLICATE_LIB_OK");
6583         if (!__kmp_str_match_true(duplicate_ok)) {
6584           // That's not allowed. Issue fatal error.
6585           __kmp_fatal(KMP_MSG(DuplicateLibrary, KMP_LIBRARY_FILE, file_name),
6586                       KMP_HNT(DuplicateLibrary), __kmp_msg_null);
6587         }
6588         KMP_INTERNAL_FREE(duplicate_ok);
6589         __kmp_duplicate_library_ok = 1;
6590         done = 1; // Exit the loop.
6591       } break;
6592       case 2: { // Neighbor is dead.
6593 
6594 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB // shared memory is with dynamic library
6595         // close shared memory.
6596         shm_unlink(shm_name); // this removes file in /dev/shm
6597 #else
6598         // Clear the variable and try to register library again.
6599         __kmp_env_unset(name);
6600 #endif
6601       } break;
6602       default: {
6603         KMP_DEBUG_ASSERT(0);
6604       } break;
6605       }
6606     }
6607     KMP_INTERNAL_FREE((void *)value);
6608 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB // shared memory is with dynamic library
6609     KMP_INTERNAL_FREE((void *)shm_name);
6610 #endif
6611   } // while
6612   KMP_INTERNAL_FREE((void *)name);
6613 
6614 } // func __kmp_register_library_startup
6615 
6616 void __kmp_unregister_library(void) {
6617 
6618   char *name = __kmp_reg_status_name();
6619   char *value = NULL;
6620 
6621 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB // shared memory is with dynamic library
6622   char *shm_name = __kmp_str_format("/%s", name);
6623   int fd1 = shm_open(shm_name, O_RDONLY, 0666);
6624   if (fd1 == -1) {
6625     // file did not open. return.
6626     return;
6627   }
6628   char *data1 = (char *)mmap(0, SHM_SIZE, PROT_READ, MAP_SHARED, fd1, 0);
6629   if (data1 != MAP_FAILED) {
6630     value = __kmp_str_format("%s", data1); // read value from SHM
6631     munmap(data1, SHM_SIZE);
6632   }
6633   close(fd1);
6634 #else
6635   value = __kmp_env_get(name);
6636 #endif
6637 
6638   KMP_DEBUG_ASSERT(__kmp_registration_flag != 0);
6639   KMP_DEBUG_ASSERT(__kmp_registration_str != NULL);
6640   if (value != NULL && strcmp(value, __kmp_registration_str) == 0) {
6641 //  Ok, this is our variable. Delete it.
6642 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB // shared memory is with dynamic library
6643     shm_unlink(shm_name); // this removes file in /dev/shm
6644 #else
6645     __kmp_env_unset(name);
6646 #endif
6647   }
6648 
6649 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB // shared memory is with dynamic library
6650   KMP_INTERNAL_FREE(shm_name);
6651 #endif
6652 
6653   KMP_INTERNAL_FREE(__kmp_registration_str);
6654   KMP_INTERNAL_FREE(value);
6655   KMP_INTERNAL_FREE(name);
6656 
6657   __kmp_registration_flag = 0;
6658   __kmp_registration_str = NULL;
6659 
6660 } // __kmp_unregister_library
6661 
6662 // End of Library registration stuff.
6663 // -----------------------------------------------------------------------------
6664 
6665 #if KMP_MIC_SUPPORTED
6666 
6667 static void __kmp_check_mic_type() {
6668   kmp_cpuid_t cpuid_state = {0};
6669   kmp_cpuid_t *cs_p = &cpuid_state;
6670   __kmp_x86_cpuid(1, 0, cs_p);
6671   // We don't support mic1 at the moment
6672   if ((cs_p->eax & 0xff0) == 0xB10) {
6673     __kmp_mic_type = mic2;
6674   } else if ((cs_p->eax & 0xf0ff0) == 0x50670) {
6675     __kmp_mic_type = mic3;
6676   } else {
6677     __kmp_mic_type = non_mic;
6678   }
6679 }
6680 
6681 #endif /* KMP_MIC_SUPPORTED */
6682 
6683 #if KMP_HAVE_UMWAIT
6684 static void __kmp_user_level_mwait_init() {
6685   struct kmp_cpuid buf;
6686   __kmp_x86_cpuid(7, 0, &buf);
6687   __kmp_umwait_enabled = ((buf.ecx >> 5) & 1) && __kmp_user_level_mwait;
6688   KF_TRACE(30, ("__kmp_user_level_mwait_init: __kmp_umwait_enabled = %d\n",
6689                 __kmp_umwait_enabled));
6690 }
6691 #elif KMP_HAVE_MWAIT
6692 #ifndef AT_INTELPHIUSERMWAIT
6693 // Spurious, non-existent value that should always fail to return anything.
6694 // Will be replaced with the correct value when we know that.
6695 #define AT_INTELPHIUSERMWAIT 10000
6696 #endif
6697 // getauxval() function is available in RHEL7 and SLES12. If a system with an
6698 // earlier OS is used to build the RTL, we'll use the following internal
6699 // function when the entry is not found.
6700 unsigned long getauxval(unsigned long) KMP_WEAK_ATTRIBUTE_EXTERNAL;
6701 unsigned long getauxval(unsigned long) { return 0; }
6702 
6703 static void __kmp_user_level_mwait_init() {
6704   // When getauxval() and correct value of AT_INTELPHIUSERMWAIT are available
6705   // use them to find if the user-level mwait is enabled. Otherwise, forcibly
6706   // set __kmp_mwait_enabled=TRUE on Intel MIC if the environment variable
6707   // KMP_USER_LEVEL_MWAIT was set to TRUE.
6708   if (__kmp_mic_type == mic3) {
6709     unsigned long res = getauxval(AT_INTELPHIUSERMWAIT);
6710     if ((res & 0x1) || __kmp_user_level_mwait) {
6711       __kmp_mwait_enabled = TRUE;
6712       if (__kmp_user_level_mwait) {
6713         KMP_INFORM(EnvMwaitWarn);
6714       }
6715     } else {
6716       __kmp_mwait_enabled = FALSE;
6717     }
6718   }
6719   KF_TRACE(30, ("__kmp_user_level_mwait_init: __kmp_mic_type = %d, "
6720                 "__kmp_mwait_enabled = %d\n",
6721                 __kmp_mic_type, __kmp_mwait_enabled));
6722 }
6723 #endif /* KMP_HAVE_UMWAIT */
6724 
6725 static void __kmp_do_serial_initialize(void) {
6726   int i, gtid;
6727   size_t size;
6728 
6729   KA_TRACE(10, ("__kmp_do_serial_initialize: enter\n"));
6730 
6731   KMP_DEBUG_ASSERT(sizeof(kmp_int32) == 4);
6732   KMP_DEBUG_ASSERT(sizeof(kmp_uint32) == 4);
6733   KMP_DEBUG_ASSERT(sizeof(kmp_int64) == 8);
6734   KMP_DEBUG_ASSERT(sizeof(kmp_uint64) == 8);
6735   KMP_DEBUG_ASSERT(sizeof(kmp_intptr_t) == sizeof(void *));
6736 
6737 #if OMPT_SUPPORT
6738   ompt_pre_init();
6739 #endif
6740 #if OMPD_SUPPORT
6741   __kmp_env_dump();
6742   ompd_init();
6743 #endif
6744 
6745   __kmp_validate_locks();
6746 
6747   /* Initialize internal memory allocator */
6748   __kmp_init_allocator();
6749 
6750   /* Register the library startup via an environment variable and check to see
6751      whether another copy of the library is already registered. */
6752 
6753   __kmp_register_library_startup();
6754 
6755   /* TODO reinitialization of library */
6756   if (TCR_4(__kmp_global.g.g_done)) {
6757     KA_TRACE(10, ("__kmp_do_serial_initialize: reinitialization of library\n"));
6758   }
6759 
6760   __kmp_global.g.g_abort = 0;
6761   TCW_SYNC_4(__kmp_global.g.g_done, FALSE);
6762 
6763 /* initialize the locks */
6764 #if KMP_USE_ADAPTIVE_LOCKS
6765 #if KMP_DEBUG_ADAPTIVE_LOCKS
6766   __kmp_init_speculative_stats();
6767 #endif
6768 #endif
6769 #if KMP_STATS_ENABLED
6770   __kmp_stats_init();
6771 #endif
6772   __kmp_init_lock(&__kmp_global_lock);
6773   __kmp_init_queuing_lock(&__kmp_dispatch_lock);
6774   __kmp_init_lock(&__kmp_debug_lock);
6775   __kmp_init_atomic_lock(&__kmp_atomic_lock);
6776   __kmp_init_atomic_lock(&__kmp_atomic_lock_1i);
6777   __kmp_init_atomic_lock(&__kmp_atomic_lock_2i);
6778   __kmp_init_atomic_lock(&__kmp_atomic_lock_4i);
6779   __kmp_init_atomic_lock(&__kmp_atomic_lock_4r);
6780   __kmp_init_atomic_lock(&__kmp_atomic_lock_8i);
6781   __kmp_init_atomic_lock(&__kmp_atomic_lock_8r);
6782   __kmp_init_atomic_lock(&__kmp_atomic_lock_8c);
6783   __kmp_init_atomic_lock(&__kmp_atomic_lock_10r);
6784   __kmp_init_atomic_lock(&__kmp_atomic_lock_16r);
6785   __kmp_init_atomic_lock(&__kmp_atomic_lock_16c);
6786   __kmp_init_atomic_lock(&__kmp_atomic_lock_20c);
6787   __kmp_init_atomic_lock(&__kmp_atomic_lock_32c);
6788   __kmp_init_bootstrap_lock(&__kmp_forkjoin_lock);
6789   __kmp_init_bootstrap_lock(&__kmp_exit_lock);
6790 #if KMP_USE_MONITOR
6791   __kmp_init_bootstrap_lock(&__kmp_monitor_lock);
6792 #endif
6793   __kmp_init_bootstrap_lock(&__kmp_tp_cached_lock);
6794 
6795   /* conduct initialization and initial setup of configuration */
6796 
6797   __kmp_runtime_initialize();
6798 
6799 #if KMP_MIC_SUPPORTED
6800   __kmp_check_mic_type();
6801 #endif
6802 
6803 // Some global variable initialization moved here from kmp_env_initialize()
6804 #ifdef KMP_DEBUG
6805   kmp_diag = 0;
6806 #endif
6807   __kmp_abort_delay = 0;
6808 
6809   // From __kmp_init_dflt_team_nth()
6810   /* assume the entire machine will be used */
6811   __kmp_dflt_team_nth_ub = __kmp_xproc;
6812   if (__kmp_dflt_team_nth_ub < KMP_MIN_NTH) {
6813     __kmp_dflt_team_nth_ub = KMP_MIN_NTH;
6814   }
6815   if (__kmp_dflt_team_nth_ub > __kmp_sys_max_nth) {
6816     __kmp_dflt_team_nth_ub = __kmp_sys_max_nth;
6817   }
6818   __kmp_max_nth = __kmp_sys_max_nth;
6819   __kmp_cg_max_nth = __kmp_sys_max_nth;
6820   __kmp_teams_max_nth = __kmp_xproc; // set a "reasonable" default
6821   if (__kmp_teams_max_nth > __kmp_sys_max_nth) {
6822     __kmp_teams_max_nth = __kmp_sys_max_nth;
6823   }
6824 
6825   // Three vars below moved here from __kmp_env_initialize() "KMP_BLOCKTIME"
6826   // part
6827   __kmp_dflt_blocktime = KMP_DEFAULT_BLOCKTIME;
6828 #if KMP_USE_MONITOR
6829   __kmp_monitor_wakeups =
6830       KMP_WAKEUPS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups);
6831   __kmp_bt_intervals =
6832       KMP_INTERVALS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups);
6833 #endif
6834   // From "KMP_LIBRARY" part of __kmp_env_initialize()
6835   __kmp_library = library_throughput;
6836   // From KMP_SCHEDULE initialization
6837   __kmp_static = kmp_sch_static_balanced;
6838 // AC: do not use analytical here, because it is non-monotonous
6839 //__kmp_guided = kmp_sch_guided_iterative_chunked;
6840 //__kmp_auto = kmp_sch_guided_analytical_chunked; // AC: it is the default, no
6841 // need to repeat assignment
6842 // Barrier initialization. Moved here from __kmp_env_initialize() Barrier branch
6843 // bit control and barrier method control parts
6844 #if KMP_FAST_REDUCTION_BARRIER
6845 #define kmp_reduction_barrier_gather_bb ((int)1)
6846 #define kmp_reduction_barrier_release_bb ((int)1)
6847 #define kmp_reduction_barrier_gather_pat bp_hyper_bar
6848 #define kmp_reduction_barrier_release_pat bp_hyper_bar
6849 #endif // KMP_FAST_REDUCTION_BARRIER
6850   for (i = bs_plain_barrier; i < bs_last_barrier; i++) {
6851     __kmp_barrier_gather_branch_bits[i] = __kmp_barrier_gather_bb_dflt;
6852     __kmp_barrier_release_branch_bits[i] = __kmp_barrier_release_bb_dflt;
6853     __kmp_barrier_gather_pattern[i] = __kmp_barrier_gather_pat_dflt;
6854     __kmp_barrier_release_pattern[i] = __kmp_barrier_release_pat_dflt;
6855 #if KMP_FAST_REDUCTION_BARRIER
6856     if (i == bs_reduction_barrier) { // tested and confirmed on ALTIX only (
6857       // lin_64 ): hyper,1
6858       __kmp_barrier_gather_branch_bits[i] = kmp_reduction_barrier_gather_bb;
6859       __kmp_barrier_release_branch_bits[i] = kmp_reduction_barrier_release_bb;
6860       __kmp_barrier_gather_pattern[i] = kmp_reduction_barrier_gather_pat;
6861       __kmp_barrier_release_pattern[i] = kmp_reduction_barrier_release_pat;
6862     }
6863 #endif // KMP_FAST_REDUCTION_BARRIER
6864   }
6865 #if KMP_FAST_REDUCTION_BARRIER
6866 #undef kmp_reduction_barrier_release_pat
6867 #undef kmp_reduction_barrier_gather_pat
6868 #undef kmp_reduction_barrier_release_bb
6869 #undef kmp_reduction_barrier_gather_bb
6870 #endif // KMP_FAST_REDUCTION_BARRIER
6871 #if KMP_MIC_SUPPORTED
6872   if (__kmp_mic_type == mic2) { // KNC
6873     // AC: plane=3,2, forkjoin=2,1 are optimal for 240 threads on KNC
6874     __kmp_barrier_gather_branch_bits[bs_plain_barrier] = 3; // plain gather
6875     __kmp_barrier_release_branch_bits[bs_forkjoin_barrier] =
6876         1; // forkjoin release
6877     __kmp_barrier_gather_pattern[bs_forkjoin_barrier] = bp_hierarchical_bar;
6878     __kmp_barrier_release_pattern[bs_forkjoin_barrier] = bp_hierarchical_bar;
6879   }
6880 #if KMP_FAST_REDUCTION_BARRIER
6881   if (__kmp_mic_type == mic2) { // KNC
6882     __kmp_barrier_gather_pattern[bs_reduction_barrier] = bp_hierarchical_bar;
6883     __kmp_barrier_release_pattern[bs_reduction_barrier] = bp_hierarchical_bar;
6884   }
6885 #endif // KMP_FAST_REDUCTION_BARRIER
6886 #endif // KMP_MIC_SUPPORTED
6887 
6888 // From KMP_CHECKS initialization
6889 #ifdef KMP_DEBUG
6890   __kmp_env_checks = TRUE; /* development versions have the extra checks */
6891 #else
6892   __kmp_env_checks = FALSE; /* port versions do not have the extra checks */
6893 #endif
6894 
6895   // From "KMP_FOREIGN_THREADS_THREADPRIVATE" initialization
6896   __kmp_foreign_tp = TRUE;
6897 
6898   __kmp_global.g.g_dynamic = FALSE;
6899   __kmp_global.g.g_dynamic_mode = dynamic_default;
6900 
6901   __kmp_init_nesting_mode();
6902 
6903   __kmp_env_initialize(NULL);
6904 
6905 #if KMP_HAVE_MWAIT || KMP_HAVE_UMWAIT
6906   __kmp_user_level_mwait_init();
6907 #endif
6908 // Print all messages in message catalog for testing purposes.
6909 #ifdef KMP_DEBUG
6910   char const *val = __kmp_env_get("KMP_DUMP_CATALOG");
6911   if (__kmp_str_match_true(val)) {
6912     kmp_str_buf_t buffer;
6913     __kmp_str_buf_init(&buffer);
6914     __kmp_i18n_dump_catalog(&buffer);
6915     __kmp_printf("%s", buffer.str);
6916     __kmp_str_buf_free(&buffer);
6917   }
6918   __kmp_env_free(&val);
6919 #endif
6920 
6921   __kmp_threads_capacity =
6922       __kmp_initial_threads_capacity(__kmp_dflt_team_nth_ub);
6923   // Moved here from __kmp_env_initialize() "KMP_ALL_THREADPRIVATE" part
6924   __kmp_tp_capacity = __kmp_default_tp_capacity(
6925       __kmp_dflt_team_nth_ub, __kmp_max_nth, __kmp_allThreadsSpecified);
6926 
6927   // If the library is shut down properly, both pools must be NULL. Just in
6928   // case, set them to NULL -- some memory may leak, but subsequent code will
6929   // work even if pools are not freed.
6930   KMP_DEBUG_ASSERT(__kmp_thread_pool == NULL);
6931   KMP_DEBUG_ASSERT(__kmp_thread_pool_insert_pt == NULL);
6932   KMP_DEBUG_ASSERT(__kmp_team_pool == NULL);
6933   __kmp_thread_pool = NULL;
6934   __kmp_thread_pool_insert_pt = NULL;
6935   __kmp_team_pool = NULL;
6936 
6937   /* Allocate all of the variable sized records */
6938   /* NOTE: __kmp_threads_capacity entries are allocated, but the arrays are
6939    * expandable */
6940   /* Since allocation is cache-aligned, just add extra padding at the end */
6941   size =
6942       (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * __kmp_threads_capacity +
6943       CACHE_LINE;
6944   __kmp_threads = (kmp_info_t **)__kmp_allocate(size);
6945   __kmp_root = (kmp_root_t **)((char *)__kmp_threads +
6946                                sizeof(kmp_info_t *) * __kmp_threads_capacity);
6947 
6948   /* init thread counts */
6949   KMP_DEBUG_ASSERT(__kmp_all_nth ==
6950                    0); // Asserts fail if the library is reinitializing and
6951   KMP_DEBUG_ASSERT(__kmp_nth == 0); // something was wrong in termination.
6952   __kmp_all_nth = 0;
6953   __kmp_nth = 0;
6954 
6955   /* setup the uber master thread and hierarchy */
6956   gtid = __kmp_register_root(TRUE);
6957   KA_TRACE(10, ("__kmp_do_serial_initialize  T#%d\n", gtid));
6958   KMP_ASSERT(KMP_UBER_GTID(gtid));
6959   KMP_ASSERT(KMP_INITIAL_GTID(gtid));
6960 
6961   KMP_MB(); /* Flush all pending memory write invalidates.  */
6962 
6963   __kmp_common_initialize();
6964 
6965 #if KMP_OS_UNIX
6966   /* invoke the child fork handler */
6967   __kmp_register_atfork();
6968 #endif
6969 
6970 #if !KMP_DYNAMIC_LIB
6971   {
6972     /* Invoke the exit handler when the program finishes, only for static
6973        library. For dynamic library, we already have _fini and DllMain. */
6974     int rc = atexit(__kmp_internal_end_atexit);
6975     if (rc != 0) {
6976       __kmp_fatal(KMP_MSG(FunctionError, "atexit()"), KMP_ERR(rc),
6977                   __kmp_msg_null);
6978     }
6979   }
6980 #endif
6981 
6982 #if KMP_HANDLE_SIGNALS
6983 #if KMP_OS_UNIX
6984   /* NOTE: make sure that this is called before the user installs their own
6985      signal handlers so that the user handlers are called first. this way they
6986      can return false, not call our handler, avoid terminating the library, and
6987      continue execution where they left off. */
6988   __kmp_install_signals(FALSE);
6989 #endif /* KMP_OS_UNIX */
6990 #if KMP_OS_WINDOWS
6991   __kmp_install_signals(TRUE);
6992 #endif /* KMP_OS_WINDOWS */
6993 #endif
6994 
6995   /* we have finished the serial initialization */
6996   __kmp_init_counter++;
6997 
6998   __kmp_init_serial = TRUE;
6999 
7000   if (__kmp_settings) {
7001     __kmp_env_print();
7002   }
7003 
7004   if (__kmp_display_env || __kmp_display_env_verbose) {
7005     __kmp_env_print_2();
7006   }
7007 
7008 #if OMPT_SUPPORT
7009   ompt_post_init();
7010 #endif
7011 
7012   KMP_MB();
7013 
7014   KA_TRACE(10, ("__kmp_do_serial_initialize: exit\n"));
7015 }
7016 
7017 void __kmp_serial_initialize(void) {
7018   if (__kmp_init_serial) {
7019     return;
7020   }
7021   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7022   if (__kmp_init_serial) {
7023     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7024     return;
7025   }
7026   __kmp_do_serial_initialize();
7027   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7028 }
7029 
7030 static void __kmp_do_middle_initialize(void) {
7031   int i, j;
7032   int prev_dflt_team_nth;
7033 
7034   if (!__kmp_init_serial) {
7035     __kmp_do_serial_initialize();
7036   }
7037 
7038   KA_TRACE(10, ("__kmp_middle_initialize: enter\n"));
7039 
7040   // Save the previous value for the __kmp_dflt_team_nth so that
7041   // we can avoid some reinitialization if it hasn't changed.
7042   prev_dflt_team_nth = __kmp_dflt_team_nth;
7043 
7044 #if KMP_AFFINITY_SUPPORTED
7045   // __kmp_affinity_initialize() will try to set __kmp_ncores to the
7046   // number of cores on the machine.
7047   __kmp_affinity_initialize();
7048 
7049 #endif /* KMP_AFFINITY_SUPPORTED */
7050 
7051   KMP_ASSERT(__kmp_xproc > 0);
7052   if (__kmp_avail_proc == 0) {
7053     __kmp_avail_proc = __kmp_xproc;
7054   }
7055 
7056   // If there were empty places in num_threads list (OMP_NUM_THREADS=,,2,3),
7057   // correct them now
7058   j = 0;
7059   while ((j < __kmp_nested_nth.used) && !__kmp_nested_nth.nth[j]) {
7060     __kmp_nested_nth.nth[j] = __kmp_dflt_team_nth = __kmp_dflt_team_nth_ub =
7061         __kmp_avail_proc;
7062     j++;
7063   }
7064 
7065   if (__kmp_dflt_team_nth == 0) {
7066 #ifdef KMP_DFLT_NTH_CORES
7067     // Default #threads = #cores
7068     __kmp_dflt_team_nth = __kmp_ncores;
7069     KA_TRACE(20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = "
7070                   "__kmp_ncores (%d)\n",
7071                   __kmp_dflt_team_nth));
7072 #else
7073     // Default #threads = #available OS procs
7074     __kmp_dflt_team_nth = __kmp_avail_proc;
7075     KA_TRACE(20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = "
7076                   "__kmp_avail_proc(%d)\n",
7077                   __kmp_dflt_team_nth));
7078 #endif /* KMP_DFLT_NTH_CORES */
7079   }
7080 
7081   if (__kmp_dflt_team_nth < KMP_MIN_NTH) {
7082     __kmp_dflt_team_nth = KMP_MIN_NTH;
7083   }
7084   if (__kmp_dflt_team_nth > __kmp_sys_max_nth) {
7085     __kmp_dflt_team_nth = __kmp_sys_max_nth;
7086   }
7087 
7088   if (__kmp_nesting_mode > 0)
7089     __kmp_set_nesting_mode_threads();
7090 
7091   // There's no harm in continuing if the following check fails,
7092   // but it indicates an error in the previous logic.
7093   KMP_DEBUG_ASSERT(__kmp_dflt_team_nth <= __kmp_dflt_team_nth_ub);
7094 
7095   if (__kmp_dflt_team_nth != prev_dflt_team_nth) {
7096     // Run through the __kmp_threads array and set the num threads icv for each
7097     // root thread that is currently registered with the RTL (which has not
7098     // already explicitly set its nthreads-var with a call to
7099     // omp_set_num_threads()).
7100     for (i = 0; i < __kmp_threads_capacity; i++) {
7101       kmp_info_t *thread = __kmp_threads[i];
7102       if (thread == NULL)
7103         continue;
7104       if (thread->th.th_current_task->td_icvs.nproc != 0)
7105         continue;
7106 
7107       set__nproc(__kmp_threads[i], __kmp_dflt_team_nth);
7108     }
7109   }
7110   KA_TRACE(
7111       20,
7112       ("__kmp_middle_initialize: final value for __kmp_dflt_team_nth = %d\n",
7113        __kmp_dflt_team_nth));
7114 
7115 #ifdef KMP_ADJUST_BLOCKTIME
7116   /* Adjust blocktime to zero if necessary  now that __kmp_avail_proc is set */
7117   if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
7118     KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
7119     if (__kmp_nth > __kmp_avail_proc) {
7120       __kmp_zero_bt = TRUE;
7121     }
7122   }
7123 #endif /* KMP_ADJUST_BLOCKTIME */
7124 
7125   /* we have finished middle initialization */
7126   TCW_SYNC_4(__kmp_init_middle, TRUE);
7127 
7128   KA_TRACE(10, ("__kmp_do_middle_initialize: exit\n"));
7129 }
7130 
7131 void __kmp_middle_initialize(void) {
7132   if (__kmp_init_middle) {
7133     return;
7134   }
7135   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7136   if (__kmp_init_middle) {
7137     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7138     return;
7139   }
7140   __kmp_do_middle_initialize();
7141   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7142 }
7143 
7144 void __kmp_parallel_initialize(void) {
7145   int gtid = __kmp_entry_gtid(); // this might be a new root
7146 
7147   /* synchronize parallel initialization (for sibling) */
7148   if (TCR_4(__kmp_init_parallel))
7149     return;
7150   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7151   if (TCR_4(__kmp_init_parallel)) {
7152     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7153     return;
7154   }
7155 
7156   /* TODO reinitialization after we have already shut down */
7157   if (TCR_4(__kmp_global.g.g_done)) {
7158     KA_TRACE(
7159         10,
7160         ("__kmp_parallel_initialize: attempt to init while shutting down\n"));
7161     __kmp_infinite_loop();
7162   }
7163 
7164   /* jc: The lock __kmp_initz_lock is already held, so calling
7165      __kmp_serial_initialize would cause a deadlock.  So we call
7166      __kmp_do_serial_initialize directly. */
7167   if (!__kmp_init_middle) {
7168     __kmp_do_middle_initialize();
7169   }
7170   __kmp_assign_root_init_mask();
7171   __kmp_resume_if_hard_paused();
7172 
7173   /* begin initialization */
7174   KA_TRACE(10, ("__kmp_parallel_initialize: enter\n"));
7175   KMP_ASSERT(KMP_UBER_GTID(gtid));
7176 
7177 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
7178   // Save the FP control regs.
7179   // Worker threads will set theirs to these values at thread startup.
7180   __kmp_store_x87_fpu_control_word(&__kmp_init_x87_fpu_control_word);
7181   __kmp_store_mxcsr(&__kmp_init_mxcsr);
7182   __kmp_init_mxcsr &= KMP_X86_MXCSR_MASK;
7183 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
7184 
7185 #if KMP_OS_UNIX
7186 #if KMP_HANDLE_SIGNALS
7187   /*  must be after __kmp_serial_initialize  */
7188   __kmp_install_signals(TRUE);
7189 #endif
7190 #endif
7191 
7192   __kmp_suspend_initialize();
7193 
7194 #if defined(USE_LOAD_BALANCE)
7195   if (__kmp_global.g.g_dynamic_mode == dynamic_default) {
7196     __kmp_global.g.g_dynamic_mode = dynamic_load_balance;
7197   }
7198 #else
7199   if (__kmp_global.g.g_dynamic_mode == dynamic_default) {
7200     __kmp_global.g.g_dynamic_mode = dynamic_thread_limit;
7201   }
7202 #endif
7203 
7204   if (__kmp_version) {
7205     __kmp_print_version_2();
7206   }
7207 
7208   /* we have finished parallel initialization */
7209   TCW_SYNC_4(__kmp_init_parallel, TRUE);
7210 
7211   KMP_MB();
7212   KA_TRACE(10, ("__kmp_parallel_initialize: exit\n"));
7213 
7214   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7215 }
7216 
7217 void __kmp_hidden_helper_initialize() {
7218   if (TCR_4(__kmp_init_hidden_helper))
7219     return;
7220 
7221   // __kmp_parallel_initialize is required before we initialize hidden helper
7222   if (!TCR_4(__kmp_init_parallel))
7223     __kmp_parallel_initialize();
7224 
7225   // Double check. Note that this double check should not be placed before
7226   // __kmp_parallel_initialize as it will cause dead lock.
7227   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7228   if (TCR_4(__kmp_init_hidden_helper)) {
7229     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7230     return;
7231   }
7232 
7233   // Set the count of hidden helper tasks to be executed to zero
7234   KMP_ATOMIC_ST_REL(&__kmp_unexecuted_hidden_helper_tasks, 0);
7235 
7236   // Set the global variable indicating that we're initializing hidden helper
7237   // team/threads
7238   TCW_SYNC_4(__kmp_init_hidden_helper_threads, TRUE);
7239 
7240   // Platform independent initialization
7241   __kmp_do_initialize_hidden_helper_threads();
7242 
7243   // Wait here for the finish of initialization of hidden helper teams
7244   __kmp_hidden_helper_threads_initz_wait();
7245 
7246   // We have finished hidden helper initialization
7247   TCW_SYNC_4(__kmp_init_hidden_helper, TRUE);
7248 
7249   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7250 }
7251 
7252 /* ------------------------------------------------------------------------ */
7253 
7254 void __kmp_run_before_invoked_task(int gtid, int tid, kmp_info_t *this_thr,
7255                                    kmp_team_t *team) {
7256   kmp_disp_t *dispatch;
7257 
7258   KMP_MB();
7259 
7260   /* none of the threads have encountered any constructs, yet. */
7261   this_thr->th.th_local.this_construct = 0;
7262 #if KMP_CACHE_MANAGE
7263   KMP_CACHE_PREFETCH(&this_thr->th.th_bar[bs_forkjoin_barrier].bb.b_arrived);
7264 #endif /* KMP_CACHE_MANAGE */
7265   dispatch = (kmp_disp_t *)TCR_PTR(this_thr->th.th_dispatch);
7266   KMP_DEBUG_ASSERT(dispatch);
7267   KMP_DEBUG_ASSERT(team->t.t_dispatch);
7268   // KMP_DEBUG_ASSERT( this_thr->th.th_dispatch == &team->t.t_dispatch[
7269   // this_thr->th.th_info.ds.ds_tid ] );
7270 
7271   dispatch->th_disp_index = 0; /* reset the dispatch buffer counter */
7272   dispatch->th_doacross_buf_idx = 0; // reset doacross dispatch buffer counter
7273   if (__kmp_env_consistency_check)
7274     __kmp_push_parallel(gtid, team->t.t_ident);
7275 
7276   KMP_MB(); /* Flush all pending memory write invalidates.  */
7277 }
7278 
7279 void __kmp_run_after_invoked_task(int gtid, int tid, kmp_info_t *this_thr,
7280                                   kmp_team_t *team) {
7281   if (__kmp_env_consistency_check)
7282     __kmp_pop_parallel(gtid, team->t.t_ident);
7283 
7284   __kmp_finish_implicit_task(this_thr);
7285 }
7286 
7287 int __kmp_invoke_task_func(int gtid) {
7288   int rc;
7289   int tid = __kmp_tid_from_gtid(gtid);
7290   kmp_info_t *this_thr = __kmp_threads[gtid];
7291   kmp_team_t *team = this_thr->th.th_team;
7292 
7293   __kmp_run_before_invoked_task(gtid, tid, this_thr, team);
7294 #if USE_ITT_BUILD
7295   if (__itt_stack_caller_create_ptr) {
7296     // inform ittnotify about entering user's code
7297     if (team->t.t_stack_id != NULL) {
7298       __kmp_itt_stack_callee_enter((__itt_caller)team->t.t_stack_id);
7299     } else {
7300       KMP_DEBUG_ASSERT(team->t.t_parent->t.t_stack_id != NULL);
7301       __kmp_itt_stack_callee_enter(
7302           (__itt_caller)team->t.t_parent->t.t_stack_id);
7303     }
7304   }
7305 #endif /* USE_ITT_BUILD */
7306 #if INCLUDE_SSC_MARKS
7307   SSC_MARK_INVOKING();
7308 #endif
7309 
7310 #if OMPT_SUPPORT
7311   void *dummy;
7312   void **exit_frame_p;
7313   ompt_data_t *my_task_data;
7314   ompt_data_t *my_parallel_data;
7315   int ompt_team_size;
7316 
7317   if (ompt_enabled.enabled) {
7318     exit_frame_p = &(team->t.t_implicit_task_taskdata[tid]
7319                          .ompt_task_info.frame.exit_frame.ptr);
7320   } else {
7321     exit_frame_p = &dummy;
7322   }
7323 
7324   my_task_data =
7325       &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data);
7326   my_parallel_data = &(team->t.ompt_team_info.parallel_data);
7327   if (ompt_enabled.ompt_callback_implicit_task) {
7328     ompt_team_size = team->t.t_nproc;
7329     ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
7330         ompt_scope_begin, my_parallel_data, my_task_data, ompt_team_size,
7331         __kmp_tid_from_gtid(gtid), ompt_task_implicit);
7332     OMPT_CUR_TASK_INFO(this_thr)->thread_num = __kmp_tid_from_gtid(gtid);
7333   }
7334 #endif
7335 
7336 #if KMP_STATS_ENABLED
7337   stats_state_e previous_state = KMP_GET_THREAD_STATE();
7338   if (previous_state == stats_state_e::TEAMS_REGION) {
7339     KMP_PUSH_PARTITIONED_TIMER(OMP_teams);
7340   } else {
7341     KMP_PUSH_PARTITIONED_TIMER(OMP_parallel);
7342   }
7343   KMP_SET_THREAD_STATE(IMPLICIT_TASK);
7344 #endif
7345 
7346   rc = __kmp_invoke_microtask((microtask_t)TCR_SYNC_PTR(team->t.t_pkfn), gtid,
7347                               tid, (int)team->t.t_argc, (void **)team->t.t_argv
7348 #if OMPT_SUPPORT
7349                               ,
7350                               exit_frame_p
7351 #endif
7352   );
7353 #if OMPT_SUPPORT
7354   *exit_frame_p = NULL;
7355   this_thr->th.ompt_thread_info.parallel_flags |= ompt_parallel_team;
7356 #endif
7357 
7358 #if KMP_STATS_ENABLED
7359   if (previous_state == stats_state_e::TEAMS_REGION) {
7360     KMP_SET_THREAD_STATE(previous_state);
7361   }
7362   KMP_POP_PARTITIONED_TIMER();
7363 #endif
7364 
7365 #if USE_ITT_BUILD
7366   if (__itt_stack_caller_create_ptr) {
7367     // inform ittnotify about leaving user's code
7368     if (team->t.t_stack_id != NULL) {
7369       __kmp_itt_stack_callee_leave((__itt_caller)team->t.t_stack_id);
7370     } else {
7371       KMP_DEBUG_ASSERT(team->t.t_parent->t.t_stack_id != NULL);
7372       __kmp_itt_stack_callee_leave(
7373           (__itt_caller)team->t.t_parent->t.t_stack_id);
7374     }
7375   }
7376 #endif /* USE_ITT_BUILD */
7377   __kmp_run_after_invoked_task(gtid, tid, this_thr, team);
7378 
7379   return rc;
7380 }
7381 
7382 void __kmp_teams_master(int gtid) {
7383   // This routine is called by all primary threads in teams construct
7384   kmp_info_t *thr = __kmp_threads[gtid];
7385   kmp_team_t *team = thr->th.th_team;
7386   ident_t *loc = team->t.t_ident;
7387   thr->th.th_set_nproc = thr->th.th_teams_size.nth;
7388   KMP_DEBUG_ASSERT(thr->th.th_teams_microtask);
7389   KMP_DEBUG_ASSERT(thr->th.th_set_nproc);
7390   KA_TRACE(20, ("__kmp_teams_master: T#%d, Tid %d, microtask %p\n", gtid,
7391                 __kmp_tid_from_gtid(gtid), thr->th.th_teams_microtask));
7392 
7393   // This thread is a new CG root.  Set up the proper variables.
7394   kmp_cg_root_t *tmp = (kmp_cg_root_t *)__kmp_allocate(sizeof(kmp_cg_root_t));
7395   tmp->cg_root = thr; // Make thr the CG root
7396   // Init to thread limit stored when league primary threads were forked
7397   tmp->cg_thread_limit = thr->th.th_current_task->td_icvs.thread_limit;
7398   tmp->cg_nthreads = 1; // Init counter to one active thread, this one
7399   KA_TRACE(100, ("__kmp_teams_master: Thread %p created node %p and init"
7400                  " cg_nthreads to 1\n",
7401                  thr, tmp));
7402   tmp->up = thr->th.th_cg_roots;
7403   thr->th.th_cg_roots = tmp;
7404 
7405 // Launch league of teams now, but not let workers execute
7406 // (they hang on fork barrier until next parallel)
7407 #if INCLUDE_SSC_MARKS
7408   SSC_MARK_FORKING();
7409 #endif
7410   __kmp_fork_call(loc, gtid, fork_context_intel, team->t.t_argc,
7411                   (microtask_t)thr->th.th_teams_microtask, // "wrapped" task
7412                   VOLATILE_CAST(launch_t) __kmp_invoke_task_func, NULL);
7413 #if INCLUDE_SSC_MARKS
7414   SSC_MARK_JOINING();
7415 #endif
7416   // If the team size was reduced from the limit, set it to the new size
7417   if (thr->th.th_team_nproc < thr->th.th_teams_size.nth)
7418     thr->th.th_teams_size.nth = thr->th.th_team_nproc;
7419   // AC: last parameter "1" eliminates join barrier which won't work because
7420   // worker threads are in a fork barrier waiting for more parallel regions
7421   __kmp_join_call(loc, gtid
7422 #if OMPT_SUPPORT
7423                   ,
7424                   fork_context_intel
7425 #endif
7426                   ,
7427                   1);
7428 }
7429 
7430 int __kmp_invoke_teams_master(int gtid) {
7431   kmp_info_t *this_thr = __kmp_threads[gtid];
7432   kmp_team_t *team = this_thr->th.th_team;
7433 #if KMP_DEBUG
7434   if (!__kmp_threads[gtid]->th.th_team->t.t_serialized)
7435     KMP_DEBUG_ASSERT((void *)__kmp_threads[gtid]->th.th_team->t.t_pkfn ==
7436                      (void *)__kmp_teams_master);
7437 #endif
7438   __kmp_run_before_invoked_task(gtid, 0, this_thr, team);
7439 #if OMPT_SUPPORT
7440   int tid = __kmp_tid_from_gtid(gtid);
7441   ompt_data_t *task_data =
7442       &team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data;
7443   ompt_data_t *parallel_data = &team->t.ompt_team_info.parallel_data;
7444   if (ompt_enabled.ompt_callback_implicit_task) {
7445     ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
7446         ompt_scope_begin, parallel_data, task_data, team->t.t_nproc, tid,
7447         ompt_task_initial);
7448     OMPT_CUR_TASK_INFO(this_thr)->thread_num = tid;
7449   }
7450 #endif
7451   __kmp_teams_master(gtid);
7452 #if OMPT_SUPPORT
7453   this_thr->th.ompt_thread_info.parallel_flags |= ompt_parallel_league;
7454 #endif
7455   __kmp_run_after_invoked_task(gtid, 0, this_thr, team);
7456   return 1;
7457 }
7458 
7459 /* this sets the requested number of threads for the next parallel region
7460    encountered by this team. since this should be enclosed in the forkjoin
7461    critical section it should avoid race conditions with asymmetrical nested
7462    parallelism */
7463 
7464 void __kmp_push_num_threads(ident_t *id, int gtid, int num_threads) {
7465   kmp_info_t *thr = __kmp_threads[gtid];
7466 
7467   if (num_threads > 0)
7468     thr->th.th_set_nproc = num_threads;
7469 }
7470 
7471 static void __kmp_push_thread_limit(kmp_info_t *thr, int num_teams,
7472                                     int num_threads) {
7473   KMP_DEBUG_ASSERT(thr);
7474   // Remember the number of threads for inner parallel regions
7475   if (!TCR_4(__kmp_init_middle))
7476     __kmp_middle_initialize(); // get internal globals calculated
7477   __kmp_assign_root_init_mask();
7478   KMP_DEBUG_ASSERT(__kmp_avail_proc);
7479   KMP_DEBUG_ASSERT(__kmp_dflt_team_nth);
7480 
7481   if (num_threads == 0) {
7482     if (__kmp_teams_thread_limit > 0) {
7483       num_threads = __kmp_teams_thread_limit;
7484     } else {
7485       num_threads = __kmp_avail_proc / num_teams;
7486     }
7487     // adjust num_threads w/o warning as it is not user setting
7488     // num_threads = min(num_threads, nthreads-var, thread-limit-var)
7489     // no thread_limit clause specified -  do not change thread-limit-var ICV
7490     if (num_threads > __kmp_dflt_team_nth) {
7491       num_threads = __kmp_dflt_team_nth; // honor nthreads-var ICV
7492     }
7493     if (num_threads > thr->th.th_current_task->td_icvs.thread_limit) {
7494       num_threads = thr->th.th_current_task->td_icvs.thread_limit;
7495     } // prevent team size to exceed thread-limit-var
7496     if (num_teams * num_threads > __kmp_teams_max_nth) {
7497       num_threads = __kmp_teams_max_nth / num_teams;
7498     }
7499     if (num_threads == 0) {
7500       num_threads = 1;
7501     }
7502   } else {
7503     // This thread will be the primary thread of the league primary threads
7504     // Store new thread limit; old limit is saved in th_cg_roots list
7505     thr->th.th_current_task->td_icvs.thread_limit = num_threads;
7506     // num_threads = min(num_threads, nthreads-var)
7507     if (num_threads > __kmp_dflt_team_nth) {
7508       num_threads = __kmp_dflt_team_nth; // honor nthreads-var ICV
7509     }
7510     if (num_teams * num_threads > __kmp_teams_max_nth) {
7511       int new_threads = __kmp_teams_max_nth / num_teams;
7512       if (new_threads == 0) {
7513         new_threads = 1;
7514       }
7515       if (new_threads != num_threads) {
7516         if (!__kmp_reserve_warn) { // user asked for too many threads
7517           __kmp_reserve_warn = 1; // conflicts with KMP_TEAMS_THREAD_LIMIT
7518           __kmp_msg(kmp_ms_warning,
7519                     KMP_MSG(CantFormThrTeam, num_threads, new_threads),
7520                     KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
7521         }
7522       }
7523       num_threads = new_threads;
7524     }
7525   }
7526   thr->th.th_teams_size.nth = num_threads;
7527 }
7528 
7529 /* this sets the requested number of teams for the teams region and/or
7530    the number of threads for the next parallel region encountered  */
7531 void __kmp_push_num_teams(ident_t *id, int gtid, int num_teams,
7532                           int num_threads) {
7533   kmp_info_t *thr = __kmp_threads[gtid];
7534   KMP_DEBUG_ASSERT(num_teams >= 0);
7535   KMP_DEBUG_ASSERT(num_threads >= 0);
7536 
7537   if (num_teams == 0) {
7538     if (__kmp_nteams > 0) {
7539       num_teams = __kmp_nteams;
7540     } else {
7541       num_teams = 1; // default number of teams is 1.
7542     }
7543   }
7544   if (num_teams > __kmp_teams_max_nth) { // if too many teams requested?
7545     if (!__kmp_reserve_warn) {
7546       __kmp_reserve_warn = 1;
7547       __kmp_msg(kmp_ms_warning,
7548                 KMP_MSG(CantFormThrTeam, num_teams, __kmp_teams_max_nth),
7549                 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
7550     }
7551     num_teams = __kmp_teams_max_nth;
7552   }
7553   // Set number of teams (number of threads in the outer "parallel" of the
7554   // teams)
7555   thr->th.th_set_nproc = thr->th.th_teams_size.nteams = num_teams;
7556 
7557   __kmp_push_thread_limit(thr, num_teams, num_threads);
7558 }
7559 
7560 /* This sets the requested number of teams for the teams region and/or
7561    the number of threads for the next parallel region encountered  */
7562 void __kmp_push_num_teams_51(ident_t *id, int gtid, int num_teams_lb,
7563                              int num_teams_ub, int num_threads) {
7564   kmp_info_t *thr = __kmp_threads[gtid];
7565   KMP_DEBUG_ASSERT(num_teams_lb >= 0 && num_teams_ub >= 0);
7566   KMP_DEBUG_ASSERT(num_teams_ub >= num_teams_lb);
7567   KMP_DEBUG_ASSERT(num_threads >= 0);
7568 
7569   if (num_teams_lb > num_teams_ub) {
7570     __kmp_fatal(KMP_MSG(FailedToCreateTeam, num_teams_lb, num_teams_ub),
7571                 KMP_HNT(SetNewBound, __kmp_teams_max_nth), __kmp_msg_null);
7572   }
7573 
7574   int num_teams = 1; // defalt number of teams is 1.
7575 
7576   if (num_teams_lb == 0 && num_teams_ub > 0)
7577     num_teams_lb = num_teams_ub;
7578 
7579   if (num_teams_lb == 0 && num_teams_ub == 0) { // no num_teams clause
7580     num_teams = (__kmp_nteams > 0) ? __kmp_nteams : num_teams;
7581     if (num_teams > __kmp_teams_max_nth) {
7582       if (!__kmp_reserve_warn) {
7583         __kmp_reserve_warn = 1;
7584         __kmp_msg(kmp_ms_warning,
7585                   KMP_MSG(CantFormThrTeam, num_teams, __kmp_teams_max_nth),
7586                   KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
7587       }
7588       num_teams = __kmp_teams_max_nth;
7589     }
7590   } else if (num_teams_lb == num_teams_ub) { // requires exact number of teams
7591     num_teams = num_teams_ub;
7592   } else { // num_teams_lb <= num_teams <= num_teams_ub
7593     if (num_threads == 0) {
7594       if (num_teams_ub > __kmp_teams_max_nth) {
7595         num_teams = num_teams_lb;
7596       } else {
7597         num_teams = num_teams_ub;
7598       }
7599     } else {
7600       num_teams = (num_threads > __kmp_teams_max_nth)
7601                       ? num_teams
7602                       : __kmp_teams_max_nth / num_threads;
7603       if (num_teams < num_teams_lb) {
7604         num_teams = num_teams_lb;
7605       } else if (num_teams > num_teams_ub) {
7606         num_teams = num_teams_ub;
7607       }
7608     }
7609   }
7610   // Set number of teams (number of threads in the outer "parallel" of the
7611   // teams)
7612   thr->th.th_set_nproc = thr->th.th_teams_size.nteams = num_teams;
7613 
7614   __kmp_push_thread_limit(thr, num_teams, num_threads);
7615 }
7616 
7617 // Set the proc_bind var to use in the following parallel region.
7618 void __kmp_push_proc_bind(ident_t *id, int gtid, kmp_proc_bind_t proc_bind) {
7619   kmp_info_t *thr = __kmp_threads[gtid];
7620   thr->th.th_set_proc_bind = proc_bind;
7621 }
7622 
7623 /* Launch the worker threads into the microtask. */
7624 
7625 void __kmp_internal_fork(ident_t *id, int gtid, kmp_team_t *team) {
7626   kmp_info_t *this_thr = __kmp_threads[gtid];
7627 
7628 #ifdef KMP_DEBUG
7629   int f;
7630 #endif /* KMP_DEBUG */
7631 
7632   KMP_DEBUG_ASSERT(team);
7633   KMP_DEBUG_ASSERT(this_thr->th.th_team == team);
7634   KMP_ASSERT(KMP_MASTER_GTID(gtid));
7635   KMP_MB(); /* Flush all pending memory write invalidates.  */
7636 
7637   team->t.t_construct = 0; /* no single directives seen yet */
7638   team->t.t_ordered.dt.t_value =
7639       0; /* thread 0 enters the ordered section first */
7640 
7641   /* Reset the identifiers on the dispatch buffer */
7642   KMP_DEBUG_ASSERT(team->t.t_disp_buffer);
7643   if (team->t.t_max_nproc > 1) {
7644     int i;
7645     for (i = 0; i < __kmp_dispatch_num_buffers; ++i) {
7646       team->t.t_disp_buffer[i].buffer_index = i;
7647       team->t.t_disp_buffer[i].doacross_buf_idx = i;
7648     }
7649   } else {
7650     team->t.t_disp_buffer[0].buffer_index = 0;
7651     team->t.t_disp_buffer[0].doacross_buf_idx = 0;
7652   }
7653 
7654   KMP_MB(); /* Flush all pending memory write invalidates.  */
7655   KMP_ASSERT(this_thr->th.th_team == team);
7656 
7657 #ifdef KMP_DEBUG
7658   for (f = 0; f < team->t.t_nproc; f++) {
7659     KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
7660                      team->t.t_threads[f]->th.th_team_nproc == team->t.t_nproc);
7661   }
7662 #endif /* KMP_DEBUG */
7663 
7664   /* release the worker threads so they may begin working */
7665   __kmp_fork_barrier(gtid, 0);
7666 }
7667 
7668 void __kmp_internal_join(ident_t *id, int gtid, kmp_team_t *team) {
7669   kmp_info_t *this_thr = __kmp_threads[gtid];
7670 
7671   KMP_DEBUG_ASSERT(team);
7672   KMP_DEBUG_ASSERT(this_thr->th.th_team == team);
7673   KMP_ASSERT(KMP_MASTER_GTID(gtid));
7674   KMP_MB(); /* Flush all pending memory write invalidates.  */
7675 
7676   /* Join barrier after fork */
7677 
7678 #ifdef KMP_DEBUG
7679   if (__kmp_threads[gtid] &&
7680       __kmp_threads[gtid]->th.th_team_nproc != team->t.t_nproc) {
7681     __kmp_printf("GTID: %d, __kmp_threads[%d]=%p\n", gtid, gtid,
7682                  __kmp_threads[gtid]);
7683     __kmp_printf("__kmp_threads[%d]->th.th_team_nproc=%d, TEAM: %p, "
7684                  "team->t.t_nproc=%d\n",
7685                  gtid, __kmp_threads[gtid]->th.th_team_nproc, team,
7686                  team->t.t_nproc);
7687     __kmp_print_structure();
7688   }
7689   KMP_DEBUG_ASSERT(__kmp_threads[gtid] &&
7690                    __kmp_threads[gtid]->th.th_team_nproc == team->t.t_nproc);
7691 #endif /* KMP_DEBUG */
7692 
7693   __kmp_join_barrier(gtid); /* wait for everyone */
7694 #if OMPT_SUPPORT
7695   if (ompt_enabled.enabled &&
7696       this_thr->th.ompt_thread_info.state == ompt_state_wait_barrier_implicit) {
7697     int ds_tid = this_thr->th.th_info.ds.ds_tid;
7698     ompt_data_t *task_data = OMPT_CUR_TASK_DATA(this_thr);
7699     this_thr->th.ompt_thread_info.state = ompt_state_overhead;
7700 #if OMPT_OPTIONAL
7701     void *codeptr = NULL;
7702     if (KMP_MASTER_TID(ds_tid) &&
7703         (ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) ||
7704          ompt_callbacks.ompt_callback(ompt_callback_sync_region)))
7705       codeptr = OMPT_CUR_TEAM_INFO(this_thr)->master_return_address;
7706 
7707     if (ompt_enabled.ompt_callback_sync_region_wait) {
7708       ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
7709           ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data,
7710           codeptr);
7711     }
7712     if (ompt_enabled.ompt_callback_sync_region) {
7713       ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
7714           ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data,
7715           codeptr);
7716     }
7717 #endif
7718     if (!KMP_MASTER_TID(ds_tid) && ompt_enabled.ompt_callback_implicit_task) {
7719       ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
7720           ompt_scope_end, NULL, task_data, 0, ds_tid,
7721           ompt_task_implicit); // TODO: Can this be ompt_task_initial?
7722     }
7723   }
7724 #endif
7725 
7726   KMP_MB(); /* Flush all pending memory write invalidates.  */
7727   KMP_ASSERT(this_thr->th.th_team == team);
7728 }
7729 
7730 /* ------------------------------------------------------------------------ */
7731 
7732 #ifdef USE_LOAD_BALANCE
7733 
7734 // Return the worker threads actively spinning in the hot team, if we
7735 // are at the outermost level of parallelism.  Otherwise, return 0.
7736 static int __kmp_active_hot_team_nproc(kmp_root_t *root) {
7737   int i;
7738   int retval;
7739   kmp_team_t *hot_team;
7740 
7741   if (root->r.r_active) {
7742     return 0;
7743   }
7744   hot_team = root->r.r_hot_team;
7745   if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME) {
7746     return hot_team->t.t_nproc - 1; // Don't count primary thread
7747   }
7748 
7749   // Skip the primary thread - it is accounted for elsewhere.
7750   retval = 0;
7751   for (i = 1; i < hot_team->t.t_nproc; i++) {
7752     if (hot_team->t.t_threads[i]->th.th_active) {
7753       retval++;
7754     }
7755   }
7756   return retval;
7757 }
7758 
7759 // Perform an automatic adjustment to the number of
7760 // threads used by the next parallel region.
7761 static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc) {
7762   int retval;
7763   int pool_active;
7764   int hot_team_active;
7765   int team_curr_active;
7766   int system_active;
7767 
7768   KB_TRACE(20, ("__kmp_load_balance_nproc: called root:%p set_nproc:%d\n", root,
7769                 set_nproc));
7770   KMP_DEBUG_ASSERT(root);
7771   KMP_DEBUG_ASSERT(root->r.r_root_team->t.t_threads[0]
7772                        ->th.th_current_task->td_icvs.dynamic == TRUE);
7773   KMP_DEBUG_ASSERT(set_nproc > 1);
7774 
7775   if (set_nproc == 1) {
7776     KB_TRACE(20, ("__kmp_load_balance_nproc: serial execution.\n"));
7777     return 1;
7778   }
7779 
7780   // Threads that are active in the thread pool, active in the hot team for this
7781   // particular root (if we are at the outer par level), and the currently
7782   // executing thread (to become the primary thread) are available to add to the
7783   // new team, but are currently contributing to the system load, and must be
7784   // accounted for.
7785   pool_active = __kmp_thread_pool_active_nth;
7786   hot_team_active = __kmp_active_hot_team_nproc(root);
7787   team_curr_active = pool_active + hot_team_active + 1;
7788 
7789   // Check the system load.
7790   system_active = __kmp_get_load_balance(__kmp_avail_proc + team_curr_active);
7791   KB_TRACE(30, ("__kmp_load_balance_nproc: system active = %d pool active = %d "
7792                 "hot team active = %d\n",
7793                 system_active, pool_active, hot_team_active));
7794 
7795   if (system_active < 0) {
7796     // There was an error reading the necessary info from /proc, so use the
7797     // thread limit algorithm instead. Once we set __kmp_global.g.g_dynamic_mode
7798     // = dynamic_thread_limit, we shouldn't wind up getting back here.
7799     __kmp_global.g.g_dynamic_mode = dynamic_thread_limit;
7800     KMP_WARNING(CantLoadBalUsing, "KMP_DYNAMIC_MODE=thread limit");
7801 
7802     // Make this call behave like the thread limit algorithm.
7803     retval = __kmp_avail_proc - __kmp_nth +
7804              (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
7805     if (retval > set_nproc) {
7806       retval = set_nproc;
7807     }
7808     if (retval < KMP_MIN_NTH) {
7809       retval = KMP_MIN_NTH;
7810     }
7811 
7812     KB_TRACE(20, ("__kmp_load_balance_nproc: thread limit exit. retval:%d\n",
7813                   retval));
7814     return retval;
7815   }
7816 
7817   // There is a slight delay in the load balance algorithm in detecting new
7818   // running procs. The real system load at this instant should be at least as
7819   // large as the #active omp thread that are available to add to the team.
7820   if (system_active < team_curr_active) {
7821     system_active = team_curr_active;
7822   }
7823   retval = __kmp_avail_proc - system_active + team_curr_active;
7824   if (retval > set_nproc) {
7825     retval = set_nproc;
7826   }
7827   if (retval < KMP_MIN_NTH) {
7828     retval = KMP_MIN_NTH;
7829   }
7830 
7831   KB_TRACE(20, ("__kmp_load_balance_nproc: exit. retval:%d\n", retval));
7832   return retval;
7833 } // __kmp_load_balance_nproc()
7834 
7835 #endif /* USE_LOAD_BALANCE */
7836 
7837 /* ------------------------------------------------------------------------ */
7838 
7839 /* NOTE: this is called with the __kmp_init_lock held */
7840 void __kmp_cleanup(void) {
7841   int f;
7842 
7843   KA_TRACE(10, ("__kmp_cleanup: enter\n"));
7844 
7845   if (TCR_4(__kmp_init_parallel)) {
7846 #if KMP_HANDLE_SIGNALS
7847     __kmp_remove_signals();
7848 #endif
7849     TCW_4(__kmp_init_parallel, FALSE);
7850   }
7851 
7852   if (TCR_4(__kmp_init_middle)) {
7853 #if KMP_AFFINITY_SUPPORTED
7854     __kmp_affinity_uninitialize();
7855 #endif /* KMP_AFFINITY_SUPPORTED */
7856     __kmp_cleanup_hierarchy();
7857     TCW_4(__kmp_init_middle, FALSE);
7858   }
7859 
7860   KA_TRACE(10, ("__kmp_cleanup: go serial cleanup\n"));
7861 
7862   if (__kmp_init_serial) {
7863     __kmp_runtime_destroy();
7864     __kmp_init_serial = FALSE;
7865   }
7866 
7867   __kmp_cleanup_threadprivate_caches();
7868 
7869   for (f = 0; f < __kmp_threads_capacity; f++) {
7870     if (__kmp_root[f] != NULL) {
7871       __kmp_free(__kmp_root[f]);
7872       __kmp_root[f] = NULL;
7873     }
7874   }
7875   __kmp_free(__kmp_threads);
7876   // __kmp_threads and __kmp_root were allocated at once, as single block, so
7877   // there is no need in freeing __kmp_root.
7878   __kmp_threads = NULL;
7879   __kmp_root = NULL;
7880   __kmp_threads_capacity = 0;
7881 
7882 #if KMP_USE_DYNAMIC_LOCK
7883   __kmp_cleanup_indirect_user_locks();
7884 #else
7885   __kmp_cleanup_user_locks();
7886 #endif
7887 #if OMPD_SUPPORT
7888   if (ompd_state) {
7889     __kmp_free(ompd_env_block);
7890     ompd_env_block = NULL;
7891     ompd_env_block_size = 0;
7892   }
7893 #endif
7894 
7895 #if KMP_AFFINITY_SUPPORTED
7896   KMP_INTERNAL_FREE(CCAST(char *, __kmp_cpuinfo_file));
7897   __kmp_cpuinfo_file = NULL;
7898 #endif /* KMP_AFFINITY_SUPPORTED */
7899 
7900 #if KMP_USE_ADAPTIVE_LOCKS
7901 #if KMP_DEBUG_ADAPTIVE_LOCKS
7902   __kmp_print_speculative_stats();
7903 #endif
7904 #endif
7905   KMP_INTERNAL_FREE(__kmp_nested_nth.nth);
7906   __kmp_nested_nth.nth = NULL;
7907   __kmp_nested_nth.size = 0;
7908   __kmp_nested_nth.used = 0;
7909   KMP_INTERNAL_FREE(__kmp_nested_proc_bind.bind_types);
7910   __kmp_nested_proc_bind.bind_types = NULL;
7911   __kmp_nested_proc_bind.size = 0;
7912   __kmp_nested_proc_bind.used = 0;
7913   if (__kmp_affinity_format) {
7914     KMP_INTERNAL_FREE(__kmp_affinity_format);
7915     __kmp_affinity_format = NULL;
7916   }
7917 
7918   __kmp_i18n_catclose();
7919 
7920 #if KMP_USE_HIER_SCHED
7921   __kmp_hier_scheds.deallocate();
7922 #endif
7923 
7924 #if KMP_STATS_ENABLED
7925   __kmp_stats_fini();
7926 #endif
7927 
7928   KA_TRACE(10, ("__kmp_cleanup: exit\n"));
7929 }
7930 
7931 /* ------------------------------------------------------------------------ */
7932 
7933 int __kmp_ignore_mppbeg(void) {
7934   char *env;
7935 
7936   if ((env = getenv("KMP_IGNORE_MPPBEG")) != NULL) {
7937     if (__kmp_str_match_false(env))
7938       return FALSE;
7939   }
7940   // By default __kmpc_begin() is no-op.
7941   return TRUE;
7942 }
7943 
7944 int __kmp_ignore_mppend(void) {
7945   char *env;
7946 
7947   if ((env = getenv("KMP_IGNORE_MPPEND")) != NULL) {
7948     if (__kmp_str_match_false(env))
7949       return FALSE;
7950   }
7951   // By default __kmpc_end() is no-op.
7952   return TRUE;
7953 }
7954 
7955 void __kmp_internal_begin(void) {
7956   int gtid;
7957   kmp_root_t *root;
7958 
7959   /* this is a very important step as it will register new sibling threads
7960      and assign these new uber threads a new gtid */
7961   gtid = __kmp_entry_gtid();
7962   root = __kmp_threads[gtid]->th.th_root;
7963   KMP_ASSERT(KMP_UBER_GTID(gtid));
7964 
7965   if (root->r.r_begin)
7966     return;
7967   __kmp_acquire_lock(&root->r.r_begin_lock, gtid);
7968   if (root->r.r_begin) {
7969     __kmp_release_lock(&root->r.r_begin_lock, gtid);
7970     return;
7971   }
7972 
7973   root->r.r_begin = TRUE;
7974 
7975   __kmp_release_lock(&root->r.r_begin_lock, gtid);
7976 }
7977 
7978 /* ------------------------------------------------------------------------ */
7979 
7980 void __kmp_user_set_library(enum library_type arg) {
7981   int gtid;
7982   kmp_root_t *root;
7983   kmp_info_t *thread;
7984 
7985   /* first, make sure we are initialized so we can get our gtid */
7986 
7987   gtid = __kmp_entry_gtid();
7988   thread = __kmp_threads[gtid];
7989 
7990   root = thread->th.th_root;
7991 
7992   KA_TRACE(20, ("__kmp_user_set_library: enter T#%d, arg: %d, %d\n", gtid, arg,
7993                 library_serial));
7994   if (root->r.r_in_parallel) { /* Must be called in serial section of top-level
7995                                   thread */
7996     KMP_WARNING(SetLibraryIncorrectCall);
7997     return;
7998   }
7999 
8000   switch (arg) {
8001   case library_serial:
8002     thread->th.th_set_nproc = 0;
8003     set__nproc(thread, 1);
8004     break;
8005   case library_turnaround:
8006     thread->th.th_set_nproc = 0;
8007     set__nproc(thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth
8008                                            : __kmp_dflt_team_nth_ub);
8009     break;
8010   case library_throughput:
8011     thread->th.th_set_nproc = 0;
8012     set__nproc(thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth
8013                                            : __kmp_dflt_team_nth_ub);
8014     break;
8015   default:
8016     KMP_FATAL(UnknownLibraryType, arg);
8017   }
8018 
8019   __kmp_aux_set_library(arg);
8020 }
8021 
8022 void __kmp_aux_set_stacksize(size_t arg) {
8023   if (!__kmp_init_serial)
8024     __kmp_serial_initialize();
8025 
8026 #if KMP_OS_DARWIN
8027   if (arg & (0x1000 - 1)) {
8028     arg &= ~(0x1000 - 1);
8029     if (arg + 0x1000) /* check for overflow if we round up */
8030       arg += 0x1000;
8031   }
8032 #endif
8033   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
8034 
8035   /* only change the default stacksize before the first parallel region */
8036   if (!TCR_4(__kmp_init_parallel)) {
8037     size_t value = arg; /* argument is in bytes */
8038 
8039     if (value < __kmp_sys_min_stksize)
8040       value = __kmp_sys_min_stksize;
8041     else if (value > KMP_MAX_STKSIZE)
8042       value = KMP_MAX_STKSIZE;
8043 
8044     __kmp_stksize = value;
8045 
8046     __kmp_env_stksize = TRUE; /* was KMP_STACKSIZE specified? */
8047   }
8048 
8049   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
8050 }
8051 
8052 /* set the behaviour of the runtime library */
8053 /* TODO this can cause some odd behaviour with sibling parallelism... */
8054 void __kmp_aux_set_library(enum library_type arg) {
8055   __kmp_library = arg;
8056 
8057   switch (__kmp_library) {
8058   case library_serial: {
8059     KMP_INFORM(LibraryIsSerial);
8060   } break;
8061   case library_turnaround:
8062     if (__kmp_use_yield == 1 && !__kmp_use_yield_exp_set)
8063       __kmp_use_yield = 2; // only yield when oversubscribed
8064     break;
8065   case library_throughput:
8066     if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME)
8067       __kmp_dflt_blocktime = 200;
8068     break;
8069   default:
8070     KMP_FATAL(UnknownLibraryType, arg);
8071   }
8072 }
8073 
8074 /* Getting team information common for all team API */
8075 // Returns NULL if not in teams construct
8076 static kmp_team_t *__kmp_aux_get_team_info(int &teams_serialized) {
8077   kmp_info_t *thr = __kmp_entry_thread();
8078   teams_serialized = 0;
8079   if (thr->th.th_teams_microtask) {
8080     kmp_team_t *team = thr->th.th_team;
8081     int tlevel = thr->th.th_teams_level; // the level of the teams construct
8082     int ii = team->t.t_level;
8083     teams_serialized = team->t.t_serialized;
8084     int level = tlevel + 1;
8085     KMP_DEBUG_ASSERT(ii >= tlevel);
8086     while (ii > level) {
8087       for (teams_serialized = team->t.t_serialized;
8088            (teams_serialized > 0) && (ii > level); teams_serialized--, ii--) {
8089       }
8090       if (team->t.t_serialized && (!teams_serialized)) {
8091         team = team->t.t_parent;
8092         continue;
8093       }
8094       if (ii > level) {
8095         team = team->t.t_parent;
8096         ii--;
8097       }
8098     }
8099     return team;
8100   }
8101   return NULL;
8102 }
8103 
8104 int __kmp_aux_get_team_num() {
8105   int serialized;
8106   kmp_team_t *team = __kmp_aux_get_team_info(serialized);
8107   if (team) {
8108     if (serialized > 1) {
8109       return 0; // teams region is serialized ( 1 team of 1 thread ).
8110     } else {
8111       return team->t.t_master_tid;
8112     }
8113   }
8114   return 0;
8115 }
8116 
8117 int __kmp_aux_get_num_teams() {
8118   int serialized;
8119   kmp_team_t *team = __kmp_aux_get_team_info(serialized);
8120   if (team) {
8121     if (serialized > 1) {
8122       return 1;
8123     } else {
8124       return team->t.t_parent->t.t_nproc;
8125     }
8126   }
8127   return 1;
8128 }
8129 
8130 /* ------------------------------------------------------------------------ */
8131 
/*
 * Affinity Format Parser
 *
 * Field is in form of: %[[[0].]size]type
 * % and type are required (%% means print a literal '%')
 * type is either single char or long name surrounded by {},
 * e.g., N or {num_threads}
 * 0 => leading zeros
 * . => right justified when size is specified
 * by default output is left justified
 * size is the *minimum* field length
 * All other characters are printed as is
 *
 * Available field types (as listed in __kmp_affinity_format_table):
 * t {team_num}          - omp_get_team_num()
 * T {num_teams}         - omp_get_num_teams()
 * L {nesting_level}     - omp_get_level()
 * n {thread_num}        - omp_get_thread_num()
 * N {num_threads}       - omp_get_num_threads()
 * a {ancestor_tnum}     - omp_get_ancestor_thread_num(omp_get_level()-1)
 * H {host}              - name of host machine
 * P {process_id}        - process id (integer)
 * i {native_thread_id}  - native thread identifier (integer)
 * A {thread_affinity}   - comma separated list of integers or integer ranges
 *                         (values of affinity mask); only available when
 *                         KMP_AFFINITY_SUPPORTED
 *
 * Implementation-specific field types can be added
 * If a type is unknown, print "undefined"
 */
8159 
// One row of the affinity-format keyword table. Each row ties the short
// (single character) spelling of a field to its long spelling and to the
// snprintf conversion used to render the field's value.
typedef struct kmp_affinity_format_field_t {
  // Short name from the spec, e.g. 'L' for the nesting level.
  char short_name;
  // Long name from the spec, written between braces in a format string,
  // e.g. "nesting_level".
  const char *long_name;
  // Conversion character for snprintf (typically 'd' for an integer or 's'
  // for a string).
  char field_format;
} kmp_affinity_format_field_t;
8169 
8170 static const kmp_affinity_format_field_t __kmp_affinity_format_table[] = {
8171 #if KMP_AFFINITY_SUPPORTED
8172     {'A', "thread_affinity", 's'},
8173 #endif
8174     {'t', "team_num", 'd'},
8175     {'T', "num_teams", 'd'},
8176     {'L', "nesting_level", 'd'},
8177     {'n', "thread_num", 'd'},
8178     {'N', "num_threads", 'd'},
8179     {'a', "ancestor_tnum", 'd'},
8180     {'H', "host", 's'},
8181     {'P', "process_id", 'd'},
8182     {'i', "native_thread_id", 'd'}};
8183 
8184 // Return the number of characters it takes to hold field
8185 static int __kmp_aux_capture_affinity_field(int gtid, const kmp_info_t *th,
8186                                             const char **ptr,
8187                                             kmp_str_buf_t *field_buffer) {
8188   int rc, format_index, field_value;
8189   const char *width_left, *width_right;
8190   bool pad_zeros, right_justify, parse_long_name, found_valid_name;
8191   static const int FORMAT_SIZE = 20;
8192   char format[FORMAT_SIZE] = {0};
8193   char absolute_short_name = 0;
8194 
8195   KMP_DEBUG_ASSERT(gtid >= 0);
8196   KMP_DEBUG_ASSERT(th);
8197   KMP_DEBUG_ASSERT(**ptr == '%');
8198   KMP_DEBUG_ASSERT(field_buffer);
8199 
8200   __kmp_str_buf_clear(field_buffer);
8201 
8202   // Skip the initial %
8203   (*ptr)++;
8204 
8205   // Check for %% first
8206   if (**ptr == '%') {
8207     __kmp_str_buf_cat(field_buffer, "%", 1);
8208     (*ptr)++; // skip over the second %
8209     return 1;
8210   }
8211 
8212   // Parse field modifiers if they are present
8213   pad_zeros = false;
8214   if (**ptr == '0') {
8215     pad_zeros = true;
8216     (*ptr)++; // skip over 0
8217   }
8218   right_justify = false;
8219   if (**ptr == '.') {
8220     right_justify = true;
8221     (*ptr)++; // skip over .
8222   }
8223   // Parse width of field: [width_left, width_right)
8224   width_left = width_right = NULL;
8225   if (**ptr >= '0' && **ptr <= '9') {
8226     width_left = *ptr;
8227     SKIP_DIGITS(*ptr);
8228     width_right = *ptr;
8229   }
8230 
8231   // Create the format for KMP_SNPRINTF based on flags parsed above
8232   format_index = 0;
8233   format[format_index++] = '%';
8234   if (!right_justify)
8235     format[format_index++] = '-';
8236   if (pad_zeros)
8237     format[format_index++] = '0';
8238   if (width_left && width_right) {
8239     int i = 0;
8240     // Only allow 8 digit number widths.
8241     // This also prevents overflowing format variable
8242     while (i < 8 && width_left < width_right) {
8243       format[format_index++] = *width_left;
8244       width_left++;
8245       i++;
8246     }
8247   }
8248 
8249   // Parse a name (long or short)
8250   // Canonicalize the name into absolute_short_name
8251   found_valid_name = false;
8252   parse_long_name = (**ptr == '{');
8253   if (parse_long_name)
8254     (*ptr)++; // skip initial left brace
8255   for (size_t i = 0; i < sizeof(__kmp_affinity_format_table) /
8256                              sizeof(__kmp_affinity_format_table[0]);
8257        ++i) {
8258     char short_name = __kmp_affinity_format_table[i].short_name;
8259     const char *long_name = __kmp_affinity_format_table[i].long_name;
8260     char field_format = __kmp_affinity_format_table[i].field_format;
8261     if (parse_long_name) {
8262       size_t length = KMP_STRLEN(long_name);
8263       if (strncmp(*ptr, long_name, length) == 0) {
8264         found_valid_name = true;
8265         (*ptr) += length; // skip the long name
8266       }
8267     } else if (**ptr == short_name) {
8268       found_valid_name = true;
8269       (*ptr)++; // skip the short name
8270     }
8271     if (found_valid_name) {
8272       format[format_index++] = field_format;
8273       format[format_index++] = '\0';
8274       absolute_short_name = short_name;
8275       break;
8276     }
8277   }
8278   if (parse_long_name) {
8279     if (**ptr != '}') {
8280       absolute_short_name = 0;
8281     } else {
8282       (*ptr)++; // skip over the right brace
8283     }
8284   }
8285 
8286   // Attempt to fill the buffer with the requested
8287   // value using snprintf within __kmp_str_buf_print()
8288   switch (absolute_short_name) {
8289   case 't':
8290     rc = __kmp_str_buf_print(field_buffer, format, __kmp_aux_get_team_num());
8291     break;
8292   case 'T':
8293     rc = __kmp_str_buf_print(field_buffer, format, __kmp_aux_get_num_teams());
8294     break;
8295   case 'L':
8296     rc = __kmp_str_buf_print(field_buffer, format, th->th.th_team->t.t_level);
8297     break;
8298   case 'n':
8299     rc = __kmp_str_buf_print(field_buffer, format, __kmp_tid_from_gtid(gtid));
8300     break;
8301   case 'H': {
8302     static const int BUFFER_SIZE = 256;
8303     char buf[BUFFER_SIZE];
8304     __kmp_expand_host_name(buf, BUFFER_SIZE);
8305     rc = __kmp_str_buf_print(field_buffer, format, buf);
8306   } break;
8307   case 'P':
8308     rc = __kmp_str_buf_print(field_buffer, format, getpid());
8309     break;
8310   case 'i':
8311     rc = __kmp_str_buf_print(field_buffer, format, __kmp_gettid());
8312     break;
8313   case 'N':
8314     rc = __kmp_str_buf_print(field_buffer, format, th->th.th_team->t.t_nproc);
8315     break;
8316   case 'a':
8317     field_value =
8318         __kmp_get_ancestor_thread_num(gtid, th->th.th_team->t.t_level - 1);
8319     rc = __kmp_str_buf_print(field_buffer, format, field_value);
8320     break;
8321 #if KMP_AFFINITY_SUPPORTED
8322   case 'A': {
8323     kmp_str_buf_t buf;
8324     __kmp_str_buf_init(&buf);
8325     __kmp_affinity_str_buf_mask(&buf, th->th.th_affin_mask);
8326     rc = __kmp_str_buf_print(field_buffer, format, buf.str);
8327     __kmp_str_buf_free(&buf);
8328   } break;
8329 #endif
8330   default:
8331     // According to spec, If an implementation does not have info for field
8332     // type, then "undefined" is printed
8333     rc = __kmp_str_buf_print(field_buffer, "%s", "undefined");
8334     // Skip the field
8335     if (parse_long_name) {
8336       SKIP_TOKEN(*ptr);
8337       if (**ptr == '}')
8338         (*ptr)++;
8339     } else {
8340       (*ptr)++;
8341     }
8342   }
8343 
8344   KMP_ASSERT(format_index <= FORMAT_SIZE);
8345   return rc;
8346 }
8347 
8348 /*
8349  * Return number of characters needed to hold the affinity string
8350  * (not including null byte character)
8351  * The resultant string is printed to buffer, which the caller can then
8352  * handle afterwards
8353  */
8354 size_t __kmp_aux_capture_affinity(int gtid, const char *format,
8355                                   kmp_str_buf_t *buffer) {
8356   const char *parse_ptr;
8357   size_t retval;
8358   const kmp_info_t *th;
8359   kmp_str_buf_t field;
8360 
8361   KMP_DEBUG_ASSERT(buffer);
8362   KMP_DEBUG_ASSERT(gtid >= 0);
8363 
8364   __kmp_str_buf_init(&field);
8365   __kmp_str_buf_clear(buffer);
8366 
8367   th = __kmp_threads[gtid];
8368   retval = 0;
8369 
8370   // If format is NULL or zero-length string, then we use
8371   // affinity-format-var ICV
8372   parse_ptr = format;
8373   if (parse_ptr == NULL || *parse_ptr == '\0') {
8374     parse_ptr = __kmp_affinity_format;
8375   }
8376   KMP_DEBUG_ASSERT(parse_ptr);
8377 
8378   while (*parse_ptr != '\0') {
8379     // Parse a field
8380     if (*parse_ptr == '%') {
8381       // Put field in the buffer
8382       int rc = __kmp_aux_capture_affinity_field(gtid, th, &parse_ptr, &field);
8383       __kmp_str_buf_catbuf(buffer, &field);
8384       retval += rc;
8385     } else {
8386       // Put literal character in buffer
8387       __kmp_str_buf_cat(buffer, parse_ptr, 1);
8388       retval++;
8389       parse_ptr++;
8390     }
8391   }
8392   __kmp_str_buf_free(&field);
8393   return retval;
8394 }
8395 
8396 // Displays the affinity string to stdout
8397 void __kmp_aux_display_affinity(int gtid, const char *format) {
8398   kmp_str_buf_t buf;
8399   __kmp_str_buf_init(&buf);
8400   __kmp_aux_capture_affinity(gtid, format, &buf);
8401   __kmp_fprintf(kmp_out, "%s" KMP_END_OF_LINE, buf.str);
8402   __kmp_str_buf_free(&buf);
8403 }
8404 
8405 /* ------------------------------------------------------------------------ */
8406 
8407 void __kmp_aux_set_blocktime(int arg, kmp_info_t *thread, int tid) {
8408   int blocktime = arg; /* argument is in milliseconds */
8409 #if KMP_USE_MONITOR
8410   int bt_intervals;
8411 #endif
8412   kmp_int8 bt_set;
8413 
8414   __kmp_save_internal_controls(thread);
8415 
8416   /* Normalize and set blocktime for the teams */
8417   if (blocktime < KMP_MIN_BLOCKTIME)
8418     blocktime = KMP_MIN_BLOCKTIME;
8419   else if (blocktime > KMP_MAX_BLOCKTIME)
8420     blocktime = KMP_MAX_BLOCKTIME;
8421 
8422   set__blocktime_team(thread->th.th_team, tid, blocktime);
8423   set__blocktime_team(thread->th.th_serial_team, 0, blocktime);
8424 
8425 #if KMP_USE_MONITOR
8426   /* Calculate and set blocktime intervals for the teams */
8427   bt_intervals = KMP_INTERVALS_FROM_BLOCKTIME(blocktime, __kmp_monitor_wakeups);
8428 
8429   set__bt_intervals_team(thread->th.th_team, tid, bt_intervals);
8430   set__bt_intervals_team(thread->th.th_serial_team, 0, bt_intervals);
8431 #endif
8432 
8433   /* Set whether blocktime has been set to "TRUE" */
8434   bt_set = TRUE;
8435 
8436   set__bt_set_team(thread->th.th_team, tid, bt_set);
8437   set__bt_set_team(thread->th.th_serial_team, 0, bt_set);
8438 #if KMP_USE_MONITOR
8439   KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d, "
8440                 "bt_intervals=%d, monitor_updates=%d\n",
8441                 __kmp_gtid_from_tid(tid, thread->th.th_team),
8442                 thread->th.th_team->t.t_id, tid, blocktime, bt_intervals,
8443                 __kmp_monitor_wakeups));
8444 #else
8445   KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d\n",
8446                 __kmp_gtid_from_tid(tid, thread->th.th_team),
8447                 thread->th.th_team->t.t_id, tid, blocktime));
8448 #endif
8449 }
8450 
8451 void __kmp_aux_set_defaults(char const *str, size_t len) {
8452   if (!__kmp_init_serial) {
8453     __kmp_serial_initialize();
8454   }
8455   __kmp_env_initialize(str);
8456 
8457   if (__kmp_settings || __kmp_display_env || __kmp_display_env_verbose) {
8458     __kmp_env_print();
8459   }
8460 } // __kmp_aux_set_defaults
8461 
8462 /* ------------------------------------------------------------------------ */
8463 /* internal fast reduction routines */
8464 
8465 PACKED_REDUCTION_METHOD_T
8466 __kmp_determine_reduction_method(
8467     ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars, size_t reduce_size,
8468     void *reduce_data, void (*reduce_func)(void *lhs_data, void *rhs_data),
8469     kmp_critical_name *lck) {
8470 
8471   // Default reduction method: critical construct ( lck != NULL, like in current
8472   // PAROPT )
8473   // If ( reduce_data!=NULL && reduce_func!=NULL ): the tree-reduction method
8474   // can be selected by RTL
8475   // If loc->flags contains KMP_IDENT_ATOMIC_REDUCE, the atomic reduce method
8476   // can be selected by RTL
8477   // Finally, it's up to OpenMP RTL to make a decision on which method to select
8478   // among generated by PAROPT.
8479 
8480   PACKED_REDUCTION_METHOD_T retval;
8481 
8482   int team_size;
8483 
8484   KMP_DEBUG_ASSERT(loc); // it would be nice to test ( loc != 0 )
8485   KMP_DEBUG_ASSERT(lck); // it would be nice to test ( lck != 0 )
8486 
8487 #define FAST_REDUCTION_ATOMIC_METHOD_GENERATED                                 \
8488   ((loc->flags & (KMP_IDENT_ATOMIC_REDUCE)) == (KMP_IDENT_ATOMIC_REDUCE))
8489 #define FAST_REDUCTION_TREE_METHOD_GENERATED ((reduce_data) && (reduce_func))
8490 
8491   retval = critical_reduce_block;
8492 
8493   // another choice of getting a team size (with 1 dynamic deference) is slower
8494   team_size = __kmp_get_team_num_threads(global_tid);
8495   if (team_size == 1) {
8496 
8497     retval = empty_reduce_block;
8498 
8499   } else {
8500 
8501     int atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;
8502 
8503 #if KMP_ARCH_X86_64 || KMP_ARCH_PPC64 || KMP_ARCH_AARCH64 ||                   \
8504     KMP_ARCH_MIPS64 || KMP_ARCH_RISCV64
8505 
8506 #if KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD ||     \
8507     KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN || KMP_OS_HURD
8508 
8509     int teamsize_cutoff = 4;
8510 
8511 #if KMP_MIC_SUPPORTED
8512     if (__kmp_mic_type != non_mic) {
8513       teamsize_cutoff = 8;
8514     }
8515 #endif
8516     int tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
8517     if (tree_available) {
8518       if (team_size <= teamsize_cutoff) {
8519         if (atomic_available) {
8520           retval = atomic_reduce_block;
8521         }
8522       } else {
8523         retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER;
8524       }
8525     } else if (atomic_available) {
8526       retval = atomic_reduce_block;
8527     }
8528 #else
8529 #error "Unknown or unsupported OS"
8530 #endif // KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD ||
8531        // KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN || KMP_OS_HURD
8532 
8533 #elif KMP_ARCH_X86 || KMP_ARCH_ARM || KMP_ARCH_AARCH || KMP_ARCH_MIPS
8534 
8535 #if KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_WINDOWS || KMP_OS_HURD
8536 
8537     // basic tuning
8538 
8539     if (atomic_available) {
8540       if (num_vars <= 2) { // && ( team_size <= 8 ) due to false-sharing ???
8541         retval = atomic_reduce_block;
8542       }
8543     } // otherwise: use critical section
8544 
8545 #elif KMP_OS_DARWIN
8546 
8547     int tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
8548     if (atomic_available && (num_vars <= 3)) {
8549       retval = atomic_reduce_block;
8550     } else if (tree_available) {
8551       if ((reduce_size > (9 * sizeof(kmp_real64))) &&
8552           (reduce_size < (2000 * sizeof(kmp_real64)))) {
8553         retval = TREE_REDUCE_BLOCK_WITH_PLAIN_BARRIER;
8554       }
8555     } // otherwise: use critical section
8556 
8557 #else
8558 #error "Unknown or unsupported OS"
8559 #endif
8560 
8561 #else
8562 #error "Unknown or unsupported architecture"
8563 #endif
8564   }
8565 
8566   // KMP_FORCE_REDUCTION
8567 
8568   // If the team is serialized (team_size == 1), ignore the forced reduction
8569   // method and stay with the unsynchronized method (empty_reduce_block)
8570   if (__kmp_force_reduction_method != reduction_method_not_defined &&
8571       team_size != 1) {
8572 
8573     PACKED_REDUCTION_METHOD_T forced_retval = critical_reduce_block;
8574 
8575     int atomic_available, tree_available;
8576 
8577     switch ((forced_retval = __kmp_force_reduction_method)) {
8578     case critical_reduce_block:
8579       KMP_ASSERT(lck); // lck should be != 0
8580       break;
8581 
8582     case atomic_reduce_block:
8583       atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;
8584       if (!atomic_available) {
8585         KMP_WARNING(RedMethodNotSupported, "atomic");
8586         forced_retval = critical_reduce_block;
8587       }
8588       break;
8589 
8590     case tree_reduce_block:
8591       tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
8592       if (!tree_available) {
8593         KMP_WARNING(RedMethodNotSupported, "tree");
8594         forced_retval = critical_reduce_block;
8595       } else {
8596 #if KMP_FAST_REDUCTION_BARRIER
8597         forced_retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER;
8598 #endif
8599       }
8600       break;
8601 
8602     default:
8603       KMP_ASSERT(0); // "unsupported method specified"
8604     }
8605 
8606     retval = forced_retval;
8607   }
8608 
8609   KA_TRACE(10, ("reduction method selected=%08x\n", retval));
8610 
8611 #undef FAST_REDUCTION_TREE_METHOD_GENERATED
8612 #undef FAST_REDUCTION_ATOMIC_METHOD_GENERATED
8613 
8614   return (retval);
8615 }
8616 // this function is for testing set/get/determine reduce method
8617 kmp_int32 __kmp_get_reduce_method(void) {
8618   return ((__kmp_entry_thread()->th.th_local.packed_reduction_method) >> 8);
8619 }
8620 
8621 // Soft pause sets up threads to ignore blocktime and just go to sleep.
8622 // Spin-wait code checks __kmp_pause_status and reacts accordingly.
8623 void __kmp_soft_pause() { __kmp_pause_status = kmp_soft_paused; }
8624 
8625 // Hard pause shuts down the runtime completely.  Resume happens naturally when
8626 // OpenMP is used subsequently.
8627 void __kmp_hard_pause() {
8628   __kmp_pause_status = kmp_hard_paused;
8629   __kmp_internal_end_thread(-1);
8630 }
8631 
8632 // Soft resume sets __kmp_pause_status, and wakes up all threads.
8633 void __kmp_resume_if_soft_paused() {
8634   if (__kmp_pause_status == kmp_soft_paused) {
8635     __kmp_pause_status = kmp_not_paused;
8636 
8637     for (int gtid = 1; gtid < __kmp_threads_capacity; ++gtid) {
8638       kmp_info_t *thread = __kmp_threads[gtid];
8639       if (thread) { // Wake it if sleeping
8640         kmp_flag_64<> fl(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go,
8641                          thread);
8642         if (fl.is_sleeping())
8643           fl.resume(gtid);
8644         else if (__kmp_try_suspend_mx(thread)) { // got suspend lock
8645           __kmp_unlock_suspend_mx(thread); // unlock it; it won't sleep
8646         } else { // thread holds the lock and may sleep soon
8647           do { // until either the thread sleeps, or we can get the lock
8648             if (fl.is_sleeping()) {
8649               fl.resume(gtid);
8650               break;
8651             } else if (__kmp_try_suspend_mx(thread)) {
8652               __kmp_unlock_suspend_mx(thread);
8653               break;
8654             }
8655           } while (1);
8656         }
8657       }
8658     }
8659   }
8660 }
8661 
8662 // This function is called via __kmpc_pause_resource. Returns 0 if successful.
8663 // TODO: add warning messages
8664 int __kmp_pause_resource(kmp_pause_status_t level) {
8665   if (level == kmp_not_paused) { // requesting resume
8666     if (__kmp_pause_status == kmp_not_paused) {
8667       // error message about runtime not being paused, so can't resume
8668       return 1;
8669     } else {
8670       KMP_DEBUG_ASSERT(__kmp_pause_status == kmp_soft_paused ||
8671                        __kmp_pause_status == kmp_hard_paused);
8672       __kmp_pause_status = kmp_not_paused;
8673       return 0;
8674     }
8675   } else if (level == kmp_soft_paused) { // requesting soft pause
8676     if (__kmp_pause_status != kmp_not_paused) {
8677       // error message about already being paused
8678       return 1;
8679     } else {
8680       __kmp_soft_pause();
8681       return 0;
8682     }
8683   } else if (level == kmp_hard_paused) { // requesting hard pause
8684     if (__kmp_pause_status != kmp_not_paused) {
8685       // error message about already being paused
8686       return 1;
8687     } else {
8688       __kmp_hard_pause();
8689       return 0;
8690     }
8691   } else {
8692     // error message about invalid level
8693     return 1;
8694   }
8695 }
8696 
8697 void __kmp_omp_display_env(int verbose) {
8698   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
8699   if (__kmp_init_serial == 0)
8700     __kmp_do_serial_initialize();
8701   __kmp_display_env_impl(!verbose, verbose);
8702   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
8703 }
8704 
8705 // Globals and functions for hidden helper task
8706 kmp_info_t **__kmp_hidden_helper_threads;
8707 kmp_info_t *__kmp_hidden_helper_main_thread;
8708 std::atomic<kmp_int32> __kmp_unexecuted_hidden_helper_tasks;
8709 #if KMP_OS_LINUX
8710 kmp_int32 __kmp_hidden_helper_threads_num = 8;
8711 kmp_int32 __kmp_enable_hidden_helper = TRUE;
8712 #else
8713 kmp_int32 __kmp_hidden_helper_threads_num = 0;
8714 kmp_int32 __kmp_enable_hidden_helper = FALSE;
8715 #endif
8716 
8717 namespace {
8718 std::atomic<kmp_int32> __kmp_hit_hidden_helper_threads_num;
8719 
8720 void __kmp_hidden_helper_wrapper_fn(int *gtid, int *, ...) {
8721   // This is an explicit synchronization on all hidden helper threads in case
8722   // that when a regular thread pushes a hidden helper task to one hidden
8723   // helper thread, the thread has not been awaken once since they're released
8724   // by the main thread after creating the team.
8725   KMP_ATOMIC_INC(&__kmp_hit_hidden_helper_threads_num);
8726   while (KMP_ATOMIC_LD_ACQ(&__kmp_hit_hidden_helper_threads_num) !=
8727          __kmp_hidden_helper_threads_num)
8728     ;
8729 
8730   // If main thread, then wait for signal
8731   if (__kmpc_master(nullptr, *gtid)) {
8732     // First, unset the initial state and release the initial thread
8733     TCW_4(__kmp_init_hidden_helper_threads, FALSE);
8734     __kmp_hidden_helper_initz_release();
8735     __kmp_hidden_helper_main_thread_wait();
8736     // Now wake up all worker threads
8737     for (int i = 1; i < __kmp_hit_hidden_helper_threads_num; ++i) {
8738       __kmp_hidden_helper_worker_thread_signal();
8739     }
8740   }
8741 }
8742 } // namespace
8743 
8744 void __kmp_hidden_helper_threads_initz_routine() {
8745   // Create a new root for hidden helper team/threads
8746   const int gtid = __kmp_register_root(TRUE);
8747   __kmp_hidden_helper_main_thread = __kmp_threads[gtid];
8748   __kmp_hidden_helper_threads = &__kmp_threads[gtid];
8749   __kmp_hidden_helper_main_thread->th.th_set_nproc =
8750       __kmp_hidden_helper_threads_num;
8751 
8752   KMP_ATOMIC_ST_REL(&__kmp_hit_hidden_helper_threads_num, 0);
8753 
8754   __kmpc_fork_call(nullptr, 0, __kmp_hidden_helper_wrapper_fn);
8755 
8756   // Set the initialization flag to FALSE
8757   TCW_SYNC_4(__kmp_init_hidden_helper, FALSE);
8758 
8759   __kmp_hidden_helper_threads_deinitz_release();
8760 }
8761 
8762 /* Nesting Mode:
8763    Set via KMP_NESTING_MODE, which takes an integer.
8764    Note: we skip duplicate topology levels, and skip levels with only
8765       one entity.
8766    KMP_NESTING_MODE=0 is the default, and doesn't use nesting mode.
8767    KMP_NESTING_MODE=1 sets as many nesting levels as there are distinct levels
8768       in the topology, and initializes the number of threads at each of those
8769       levels to the number of entities at each level, respectively, below the
8770       entity at the parent level.
8771    KMP_NESTING_MODE=N, where N>1, attempts to create up to N nesting levels,
8772       but starts with nesting OFF -- max-active-levels-var is 1 -- and requires
8773       the user to turn nesting on explicitly. This is an even more experimental
8774       option to this experimental feature, and may change or go away in the
8775       future.
8776 */
8777 
8778 // Allocate space to store nesting levels
8779 void __kmp_init_nesting_mode() {
8780   int levels = KMP_HW_LAST;
8781   __kmp_nesting_mode_nlevels = levels;
8782   __kmp_nesting_nth_level = (int *)KMP_INTERNAL_MALLOC(levels * sizeof(int));
8783   for (int i = 0; i < levels; ++i)
8784     __kmp_nesting_nth_level[i] = 0;
8785   if (__kmp_nested_nth.size < levels) {
8786     __kmp_nested_nth.nth =
8787         (int *)KMP_INTERNAL_REALLOC(__kmp_nested_nth.nth, levels * sizeof(int));
8788     __kmp_nested_nth.size = levels;
8789   }
8790 }
8791 
8792 // Set # threads for top levels of nesting; must be called after topology set
8793 void __kmp_set_nesting_mode_threads() {
8794   kmp_info_t *thread = __kmp_threads[__kmp_entry_gtid()];
8795 
8796   if (__kmp_nesting_mode == 1)
8797     __kmp_nesting_mode_nlevels = KMP_MAX_ACTIVE_LEVELS_LIMIT;
8798   else if (__kmp_nesting_mode > 1)
8799     __kmp_nesting_mode_nlevels = __kmp_nesting_mode;
8800 
8801   if (__kmp_topology) { // use topology info
8802     int loc, hw_level;
8803     for (loc = 0, hw_level = 0; hw_level < __kmp_topology->get_depth() &&
8804                                 loc < __kmp_nesting_mode_nlevels;
8805          loc++, hw_level++) {
8806       __kmp_nesting_nth_level[loc] = __kmp_topology->get_ratio(hw_level);
8807       if (__kmp_nesting_nth_level[loc] == 1)
8808         loc--;
8809     }
8810     // Make sure all cores are used
8811     if (__kmp_nesting_mode > 1 && loc > 1) {
8812       int core_level = __kmp_topology->get_level(KMP_HW_CORE);
8813       int num_cores = __kmp_topology->get_count(core_level);
8814       int upper_levels = 1;
8815       for (int level = 0; level < loc - 1; ++level)
8816         upper_levels *= __kmp_nesting_nth_level[level];
8817       if (upper_levels * __kmp_nesting_nth_level[loc - 1] < num_cores)
8818         __kmp_nesting_nth_level[loc - 1] =
8819             num_cores / __kmp_nesting_nth_level[loc - 2];
8820     }
8821     __kmp_nesting_mode_nlevels = loc;
8822     __kmp_nested_nth.used = __kmp_nesting_mode_nlevels;
8823   } else { // no topology info available; provide a reasonable guesstimation
8824     if (__kmp_avail_proc >= 4) {
8825       __kmp_nesting_nth_level[0] = __kmp_avail_proc / 2;
8826       __kmp_nesting_nth_level[1] = 2;
8827       __kmp_nesting_mode_nlevels = 2;
8828     } else {
8829       __kmp_nesting_nth_level[0] = __kmp_avail_proc;
8830       __kmp_nesting_mode_nlevels = 1;
8831     }
8832     __kmp_nested_nth.used = __kmp_nesting_mode_nlevels;
8833   }
8834   for (int i = 0; i < __kmp_nesting_mode_nlevels; ++i) {
8835     __kmp_nested_nth.nth[i] = __kmp_nesting_nth_level[i];
8836   }
8837   set__nproc(thread, __kmp_nesting_nth_level[0]);
8838   if (__kmp_nesting_mode > 1 && __kmp_nesting_mode_nlevels > __kmp_nesting_mode)
8839     __kmp_nesting_mode_nlevels = __kmp_nesting_mode;
8840   if (get__max_active_levels(thread) > 1) {
8841     // if max levels was set, set nesting mode levels to same
8842     __kmp_nesting_mode_nlevels = get__max_active_levels(thread);
8843   }
8844   if (__kmp_nesting_mode == 1) // turn on nesting for this case only
8845     set__max_active_levels(thread, __kmp_nesting_mode_nlevels);
8846 }
8847