xref: /freebsd/contrib/llvm-project/openmp/runtime/src/kmp_runtime.cpp (revision c8e7f78a3d28ff6e6223ed136ada8e1e2f34965e)
1 /*
2  * kmp_runtime.cpp -- KPTS runtime support library
3  */
4 
5 //===----------------------------------------------------------------------===//
6 //
7 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8 // See https://llvm.org/LICENSE.txt for license information.
9 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "kmp.h"
14 #include "kmp_affinity.h"
15 #include "kmp_atomic.h"
16 #include "kmp_environment.h"
17 #include "kmp_error.h"
18 #include "kmp_i18n.h"
19 #include "kmp_io.h"
20 #include "kmp_itt.h"
21 #include "kmp_settings.h"
22 #include "kmp_stats.h"
23 #include "kmp_str.h"
24 #include "kmp_wait_release.h"
25 #include "kmp_wrapper_getpid.h"
26 #include "kmp_dispatch.h"
27 #if KMP_USE_HIER_SCHED
28 #include "kmp_dispatch_hier.h"
29 #endif
30 
31 #if OMPT_SUPPORT
32 #include "ompt-specific.h"
33 #endif
34 #if OMPD_SUPPORT
35 #include "ompd-specific.h"
36 #endif
37 
38 #if OMP_PROFILING_SUPPORT
39 #include "llvm/Support/TimeProfiler.h"
40 static char *ProfileTraceFile = nullptr;
41 #endif
42 
43 /* these are temporary issues to be dealt with */
44 #define KMP_USE_PRCTL 0
45 
46 #if KMP_OS_WINDOWS
47 #include <process.h>
48 #endif
49 
50 #if KMP_OS_WINDOWS
51 // windows does not need include files as it doesn't use shared memory
52 #else
53 #include <sys/mman.h>
54 #include <sys/stat.h>
55 #include <fcntl.h>
56 #define SHM_SIZE 1024
57 #endif
58 
59 #if defined(KMP_GOMP_COMPAT)
60 char const __kmp_version_alt_comp[] =
61     KMP_VERSION_PREFIX "alternative compiler support: yes";
62 #endif /* defined(KMP_GOMP_COMPAT) */
63 
64 char const __kmp_version_omp_api[] =
65     KMP_VERSION_PREFIX "API version: 5.0 (201611)";
66 
67 #ifdef KMP_DEBUG
68 char const __kmp_version_lock[] =
69     KMP_VERSION_PREFIX "lock type: run time selectable";
70 #endif /* KMP_DEBUG */
71 
72 #define KMP_MIN(x, y) ((x) < (y) ? (x) : (y))
73 
74 /* ------------------------------------------------------------------------ */
75 
76 #if KMP_USE_MONITOR
77 kmp_info_t __kmp_monitor;
78 #endif
79 
80 /* Forward declarations */
81 
82 void __kmp_cleanup(void);
83 
84 static void __kmp_initialize_info(kmp_info_t *, kmp_team_t *, int tid,
85                                   int gtid);
86 static void __kmp_initialize_team(kmp_team_t *team, int new_nproc,
87                                   kmp_internal_control_t *new_icvs,
88                                   ident_t *loc);
89 #if KMP_AFFINITY_SUPPORTED
90 static void __kmp_partition_places(kmp_team_t *team,
91                                    int update_master_only = 0);
92 #endif
93 static void __kmp_do_serial_initialize(void);
94 void __kmp_fork_barrier(int gtid, int tid);
95 void __kmp_join_barrier(int gtid);
96 void __kmp_setup_icv_copy(kmp_team_t *team, int new_nproc,
97                           kmp_internal_control_t *new_icvs, ident_t *loc);
98 
99 #ifdef USE_LOAD_BALANCE
100 static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc);
101 #endif
102 
103 static int __kmp_expand_threads(int nNeed);
104 #if KMP_OS_WINDOWS
105 static int __kmp_unregister_root_other_thread(int gtid);
106 #endif
107 static void __kmp_reap_thread(kmp_info_t *thread, int is_root);
108 kmp_info_t *__kmp_thread_pool_insert_pt = NULL;
109 
110 void __kmp_resize_dist_barrier(kmp_team_t *team, int old_nthreads,
111                                int new_nthreads);
112 void __kmp_add_threads_to_team(kmp_team_t *team, int new_nthreads);
113 
114 /* Calculate the identifier of the current thread */
115 /* fast (and somewhat portable) way to get unique identifier of executing
116    thread. Returns KMP_GTID_DNE if we haven't been assigned a gtid. */
117 int __kmp_get_global_thread_id() {
118   int i;
119   kmp_info_t **other_threads;
120   size_t stack_data;
121   char *stack_addr;
122   size_t stack_size;
123   char *stack_base;
124 
125   KA_TRACE(
126       1000,
127       ("*** __kmp_get_global_thread_id: entering, nproc=%d  all_nproc=%d\n",
128        __kmp_nth, __kmp_all_nth));
129 
130   /* JPH - to handle the case where __kmpc_end(0) is called immediately prior to
131      a parallel region, made it return KMP_GTID_DNE to force serial_initialize
132      by caller. Had to handle KMP_GTID_DNE at all call-sites, or else guarantee
133      __kmp_init_gtid for this to work. */
134 
135   if (!TCR_4(__kmp_init_gtid))
136     return KMP_GTID_DNE;
137 
138 #ifdef KMP_TDATA_GTID
139   if (TCR_4(__kmp_gtid_mode) >= 3) {
140     KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using TDATA\n"));
141     return __kmp_gtid;
142   }
143 #endif
144   if (TCR_4(__kmp_gtid_mode) >= 2) {
145     KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using keyed TLS\n"));
146     return __kmp_gtid_get_specific();
147   }
148   KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using internal alg.\n"));
149 
150   stack_addr = (char *)&stack_data;
151   other_threads = __kmp_threads;
152 
153   /* ATT: The code below is a source of potential bugs due to unsynchronized
154      access to __kmp_threads array. For example:
155      1. Current thread loads other_threads[i] to thr and checks it, it is
156         non-NULL.
157      2. Current thread is suspended by OS.
158      3. Another thread unregisters and finishes (debug versions of free()
159         may fill memory with something like 0xEF).
160      4. Current thread is resumed.
161      5. Current thread reads junk from *thr.
162      TODO: Fix it.  --ln  */
163 
164   for (i = 0; i < __kmp_threads_capacity; i++) {
165 
166     kmp_info_t *thr = (kmp_info_t *)TCR_SYNC_PTR(other_threads[i]);
167     if (!thr)
168       continue;
169 
170     stack_size = (size_t)TCR_PTR(thr->th.th_info.ds.ds_stacksize);
171     stack_base = (char *)TCR_PTR(thr->th.th_info.ds.ds_stackbase);
172 
173     /* stack grows down -- search through all of the active threads */
174 
175     if (stack_addr <= stack_base) {
176       size_t stack_diff = stack_base - stack_addr;
177 
178       if (stack_diff <= stack_size) {
179         /* The only way we can be closer than the allocated */
180         /* stack size is if we are running on this thread. */
181         KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() == i);
182         return i;
183       }
184     }
185   }
186 
187   /* get specific to try and determine our gtid */
188   KA_TRACE(1000,
189            ("*** __kmp_get_global_thread_id: internal alg. failed to find "
190             "thread, using TLS\n"));
191   i = __kmp_gtid_get_specific();
192 
193   /*fprintf( stderr, "=== %d\n", i );  */ /* GROO */
194 
195   /* if we havn't been assigned a gtid, then return code */
196   if (i < 0)
197     return i;
198 
199   /* dynamically updated stack window for uber threads to avoid get_specific
200      call */
201   if (!TCR_4(other_threads[i]->th.th_info.ds.ds_stackgrow)) {
202     KMP_FATAL(StackOverflow, i);
203   }
204 
205   stack_base = (char *)other_threads[i]->th.th_info.ds.ds_stackbase;
206   if (stack_addr > stack_base) {
207     TCW_PTR(other_threads[i]->th.th_info.ds.ds_stackbase, stack_addr);
208     TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize,
209             other_threads[i]->th.th_info.ds.ds_stacksize + stack_addr -
210                 stack_base);
211   } else {
212     TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize,
213             stack_base - stack_addr);
214   }
215 
216   /* Reprint stack bounds for ubermaster since they have been refined */
217   if (__kmp_storage_map) {
218     char *stack_end = (char *)other_threads[i]->th.th_info.ds.ds_stackbase;
219     char *stack_beg = stack_end - other_threads[i]->th.th_info.ds.ds_stacksize;
220     __kmp_print_storage_map_gtid(i, stack_beg, stack_end,
221                                  other_threads[i]->th.th_info.ds.ds_stacksize,
222                                  "th_%d stack (refinement)", i);
223   }
224   return i;
225 }
226 
227 int __kmp_get_global_thread_id_reg() {
228   int gtid;
229 
230   if (!__kmp_init_serial) {
231     gtid = KMP_GTID_DNE;
232   } else
233 #ifdef KMP_TDATA_GTID
234       if (TCR_4(__kmp_gtid_mode) >= 3) {
235     KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using TDATA\n"));
236     gtid = __kmp_gtid;
237   } else
238 #endif
239       if (TCR_4(__kmp_gtid_mode) >= 2) {
240     KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using keyed TLS\n"));
241     gtid = __kmp_gtid_get_specific();
242   } else {
243     KA_TRACE(1000,
244              ("*** __kmp_get_global_thread_id_reg: using internal alg.\n"));
245     gtid = __kmp_get_global_thread_id();
246   }
247 
248   /* we must be a new uber master sibling thread */
249   if (gtid == KMP_GTID_DNE) {
250     KA_TRACE(10,
251              ("__kmp_get_global_thread_id_reg: Encountered new root thread. "
252               "Registering a new gtid.\n"));
253     __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
254     if (!__kmp_init_serial) {
255       __kmp_do_serial_initialize();
256       gtid = __kmp_gtid_get_specific();
257     } else {
258       gtid = __kmp_register_root(FALSE);
259     }
260     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
261     /*__kmp_printf( "+++ %d\n", gtid ); */ /* GROO */
262   }
263 
264   KMP_DEBUG_ASSERT(gtid >= 0);
265 
266   return gtid;
267 }
268 
269 /* caller must hold forkjoin_lock */
270 void __kmp_check_stack_overlap(kmp_info_t *th) {
271   int f;
272   char *stack_beg = NULL;
273   char *stack_end = NULL;
274   int gtid;
275 
276   KA_TRACE(10, ("__kmp_check_stack_overlap: called\n"));
277   if (__kmp_storage_map) {
278     stack_end = (char *)th->th.th_info.ds.ds_stackbase;
279     stack_beg = stack_end - th->th.th_info.ds.ds_stacksize;
280 
281     gtid = __kmp_gtid_from_thread(th);
282 
283     if (gtid == KMP_GTID_MONITOR) {
284       __kmp_print_storage_map_gtid(
285           gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize,
286           "th_%s stack (%s)", "mon",
287           (th->th.th_info.ds.ds_stackgrow) ? "initial" : "actual");
288     } else {
289       __kmp_print_storage_map_gtid(
290           gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize,
291           "th_%d stack (%s)", gtid,
292           (th->th.th_info.ds.ds_stackgrow) ? "initial" : "actual");
293     }
294   }
295 
296   /* No point in checking ubermaster threads since they use refinement and
297    * cannot overlap */
298   gtid = __kmp_gtid_from_thread(th);
299   if (__kmp_env_checks == TRUE && !KMP_UBER_GTID(gtid)) {
300     KA_TRACE(10,
301              ("__kmp_check_stack_overlap: performing extensive checking\n"));
302     if (stack_beg == NULL) {
303       stack_end = (char *)th->th.th_info.ds.ds_stackbase;
304       stack_beg = stack_end - th->th.th_info.ds.ds_stacksize;
305     }
306 
307     for (f = 0; f < __kmp_threads_capacity; f++) {
308       kmp_info_t *f_th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[f]);
309 
310       if (f_th && f_th != th) {
311         char *other_stack_end =
312             (char *)TCR_PTR(f_th->th.th_info.ds.ds_stackbase);
313         char *other_stack_beg =
314             other_stack_end - (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize);
315         if ((stack_beg > other_stack_beg && stack_beg < other_stack_end) ||
316             (stack_end > other_stack_beg && stack_end < other_stack_end)) {
317 
318           /* Print the other stack values before the abort */
319           if (__kmp_storage_map)
320             __kmp_print_storage_map_gtid(
321                 -1, other_stack_beg, other_stack_end,
322                 (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize),
323                 "th_%d stack (overlapped)", __kmp_gtid_from_thread(f_th));
324 
325           __kmp_fatal(KMP_MSG(StackOverlap), KMP_HNT(ChangeStackLimit),
326                       __kmp_msg_null);
327         }
328       }
329     }
330   }
331   KA_TRACE(10, ("__kmp_check_stack_overlap: returning\n"));
332 }
333 
334 /* ------------------------------------------------------------------------ */
335 
336 void __kmp_infinite_loop(void) {
337   static int done = FALSE;
338 
339   while (!done) {
340     KMP_YIELD(TRUE);
341   }
342 }
343 
344 #define MAX_MESSAGE 512
345 
346 void __kmp_print_storage_map_gtid(int gtid, void *p1, void *p2, size_t size,
347                                   char const *format, ...) {
348   char buffer[MAX_MESSAGE];
349   va_list ap;
350 
351   va_start(ap, format);
352   KMP_SNPRINTF(buffer, sizeof(buffer), "OMP storage map: %p %p%8lu %s\n", p1,
353                p2, (unsigned long)size, format);
354   __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock);
355   __kmp_vprintf(kmp_err, buffer, ap);
356 #if KMP_PRINT_DATA_PLACEMENT
357   int node;
358   if (gtid >= 0) {
359     if (p1 <= p2 && (char *)p2 - (char *)p1 == size) {
360       if (__kmp_storage_map_verbose) {
361         node = __kmp_get_host_node(p1);
362         if (node < 0) /* doesn't work, so don't try this next time */
363           __kmp_storage_map_verbose = FALSE;
364         else {
365           char *last;
366           int lastNode;
367           int localProc = __kmp_get_cpu_from_gtid(gtid);
368 
369           const int page_size = KMP_GET_PAGE_SIZE();
370 
371           p1 = (void *)((size_t)p1 & ~((size_t)page_size - 1));
372           p2 = (void *)(((size_t)p2 - 1) & ~((size_t)page_size - 1));
373           if (localProc >= 0)
374             __kmp_printf_no_lock("  GTID %d localNode %d\n", gtid,
375                                  localProc >> 1);
376           else
377             __kmp_printf_no_lock("  GTID %d\n", gtid);
378 #if KMP_USE_PRCTL
379           /* The more elaborate format is disabled for now because of the prctl
380            * hanging bug. */
381           do {
382             last = p1;
383             lastNode = node;
384             /* This loop collates adjacent pages with the same host node. */
385             do {
386               (char *)p1 += page_size;
387             } while (p1 <= p2 && (node = __kmp_get_host_node(p1)) == lastNode);
388             __kmp_printf_no_lock("    %p-%p memNode %d\n", last, (char *)p1 - 1,
389                                  lastNode);
390           } while (p1 <= p2);
391 #else
392           __kmp_printf_no_lock("    %p-%p memNode %d\n", p1,
393                                (char *)p1 + (page_size - 1),
394                                __kmp_get_host_node(p1));
395           if (p1 < p2) {
396             __kmp_printf_no_lock("    %p-%p memNode %d\n", p2,
397                                  (char *)p2 + (page_size - 1),
398                                  __kmp_get_host_node(p2));
399           }
400 #endif
401         }
402       }
403     } else
404       __kmp_printf_no_lock("  %s\n", KMP_I18N_STR(StorageMapWarning));
405   }
406 #endif /* KMP_PRINT_DATA_PLACEMENT */
407   __kmp_release_bootstrap_lock(&__kmp_stdio_lock);
408 
409   va_end(ap);
410 }
411 
412 void __kmp_warn(char const *format, ...) {
413   char buffer[MAX_MESSAGE];
414   va_list ap;
415 
416   if (__kmp_generate_warnings == kmp_warnings_off) {
417     return;
418   }
419 
420   va_start(ap, format);
421 
422   KMP_SNPRINTF(buffer, sizeof(buffer), "OMP warning: %s\n", format);
423   __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock);
424   __kmp_vprintf(kmp_err, buffer, ap);
425   __kmp_release_bootstrap_lock(&__kmp_stdio_lock);
426 
427   va_end(ap);
428 }
429 
430 void __kmp_abort_process() {
431   // Later threads may stall here, but that's ok because abort() will kill them.
432   __kmp_acquire_bootstrap_lock(&__kmp_exit_lock);
433 
434   if (__kmp_debug_buf) {
435     __kmp_dump_debug_buffer();
436   }
437 
438   if (KMP_OS_WINDOWS) {
439     // Let other threads know of abnormal termination and prevent deadlock
440     // if abort happened during library initialization or shutdown
441     __kmp_global.g.g_abort = SIGABRT;
442 
443     /* On Windows* OS by default abort() causes pop-up error box, which stalls
444        nightly testing. Unfortunately, we cannot reliably suppress pop-up error
445        boxes. _set_abort_behavior() works well, but this function is not
446        available in VS7 (this is not problem for DLL, but it is a problem for
447        static OpenMP RTL). SetErrorMode (and so, timelimit utility) does not
448        help, at least in some versions of MS C RTL.
449 
450        It seems following sequence is the only way to simulate abort() and
451        avoid pop-up error box. */
452     raise(SIGABRT);
453     _exit(3); // Just in case, if signal ignored, exit anyway.
454   } else {
455     __kmp_unregister_library();
456     abort();
457   }
458 
459   __kmp_infinite_loop();
460   __kmp_release_bootstrap_lock(&__kmp_exit_lock);
461 
462 } // __kmp_abort_process
463 
464 void __kmp_abort_thread(void) {
465   // TODO: Eliminate g_abort global variable and this function.
466   // In case of abort just call abort(), it will kill all the threads.
467   __kmp_infinite_loop();
468 } // __kmp_abort_thread
469 
470 /* Print out the storage map for the major kmp_info_t thread data structures
471    that are allocated together. */
472 
473 static void __kmp_print_thread_storage_map(kmp_info_t *thr, int gtid) {
474   __kmp_print_storage_map_gtid(gtid, thr, thr + 1, sizeof(kmp_info_t), "th_%d",
475                                gtid);
476 
477   __kmp_print_storage_map_gtid(gtid, &thr->th.th_info, &thr->th.th_team,
478                                sizeof(kmp_desc_t), "th_%d.th_info", gtid);
479 
480   __kmp_print_storage_map_gtid(gtid, &thr->th.th_local, &thr->th.th_pri_head,
481                                sizeof(kmp_local_t), "th_%d.th_local", gtid);
482 
483   __kmp_print_storage_map_gtid(
484       gtid, &thr->th.th_bar[0], &thr->th.th_bar[bs_last_barrier],
485       sizeof(kmp_balign_t) * bs_last_barrier, "th_%d.th_bar", gtid);
486 
487   __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_plain_barrier],
488                                &thr->th.th_bar[bs_plain_barrier + 1],
489                                sizeof(kmp_balign_t), "th_%d.th_bar[plain]",
490                                gtid);
491 
492   __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_forkjoin_barrier],
493                                &thr->th.th_bar[bs_forkjoin_barrier + 1],
494                                sizeof(kmp_balign_t), "th_%d.th_bar[forkjoin]",
495                                gtid);
496 
497 #if KMP_FAST_REDUCTION_BARRIER
498   __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_reduction_barrier],
499                                &thr->th.th_bar[bs_reduction_barrier + 1],
500                                sizeof(kmp_balign_t), "th_%d.th_bar[reduction]",
501                                gtid);
502 #endif // KMP_FAST_REDUCTION_BARRIER
503 }
504 
505 /* Print out the storage map for the major kmp_team_t team data structures
506    that are allocated together. */
507 
508 static void __kmp_print_team_storage_map(const char *header, kmp_team_t *team,
509                                          int team_id, int num_thr) {
510   int num_disp_buff = team->t.t_max_nproc > 1 ? __kmp_dispatch_num_buffers : 2;
511   __kmp_print_storage_map_gtid(-1, team, team + 1, sizeof(kmp_team_t), "%s_%d",
512                                header, team_id);
513 
514   __kmp_print_storage_map_gtid(-1, &team->t.t_bar[0],
515                                &team->t.t_bar[bs_last_barrier],
516                                sizeof(kmp_balign_team_t) * bs_last_barrier,
517                                "%s_%d.t_bar", header, team_id);
518 
519   __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_plain_barrier],
520                                &team->t.t_bar[bs_plain_barrier + 1],
521                                sizeof(kmp_balign_team_t), "%s_%d.t_bar[plain]",
522                                header, team_id);
523 
524   __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_forkjoin_barrier],
525                                &team->t.t_bar[bs_forkjoin_barrier + 1],
526                                sizeof(kmp_balign_team_t),
527                                "%s_%d.t_bar[forkjoin]", header, team_id);
528 
529 #if KMP_FAST_REDUCTION_BARRIER
530   __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_reduction_barrier],
531                                &team->t.t_bar[bs_reduction_barrier + 1],
532                                sizeof(kmp_balign_team_t),
533                                "%s_%d.t_bar[reduction]", header, team_id);
534 #endif // KMP_FAST_REDUCTION_BARRIER
535 
536   __kmp_print_storage_map_gtid(
537       -1, &team->t.t_dispatch[0], &team->t.t_dispatch[num_thr],
538       sizeof(kmp_disp_t) * num_thr, "%s_%d.t_dispatch", header, team_id);
539 
540   __kmp_print_storage_map_gtid(
541       -1, &team->t.t_threads[0], &team->t.t_threads[num_thr],
542       sizeof(kmp_info_t *) * num_thr, "%s_%d.t_threads", header, team_id);
543 
544   __kmp_print_storage_map_gtid(-1, &team->t.t_disp_buffer[0],
545                                &team->t.t_disp_buffer[num_disp_buff],
546                                sizeof(dispatch_shared_info_t) * num_disp_buff,
547                                "%s_%d.t_disp_buffer", header, team_id);
548 }
549 
550 static void __kmp_init_allocator() {
551   __kmp_init_memkind();
552   __kmp_init_target_mem();
553 }
554 static void __kmp_fini_allocator() { __kmp_fini_memkind(); }
555 
556 /* ------------------------------------------------------------------------ */
557 
558 #if ENABLE_LIBOMPTARGET
559 static void __kmp_init_omptarget() {
560   __kmp_init_target_task();
561 }
562 #endif
563 
564 /* ------------------------------------------------------------------------ */
565 
566 #if KMP_DYNAMIC_LIB
567 #if KMP_OS_WINDOWS
568 
569 BOOL WINAPI DllMain(HINSTANCE hInstDLL, DWORD fdwReason, LPVOID lpReserved) {
570   //__kmp_acquire_bootstrap_lock( &__kmp_initz_lock );
571 
572   switch (fdwReason) {
573 
574   case DLL_PROCESS_ATTACH:
575     KA_TRACE(10, ("DllMain: PROCESS_ATTACH\n"));
576 
577     return TRUE;
578 
579   case DLL_PROCESS_DETACH:
580     KA_TRACE(10, ("DllMain: PROCESS_DETACH T#%d\n", __kmp_gtid_get_specific()));
581 
582     // According to Windows* documentation for DllMain entry point:
583     // for DLL_PROCESS_DETACH, lpReserved is used for telling the difference:
584     //   lpReserved == NULL when FreeLibrary() is called,
585     //   lpReserved != NULL when the process is terminated.
586     // When FreeLibrary() is called, worker threads remain alive. So the
587     // runtime's state is consistent and executing proper shutdown is OK.
588     // When the process is terminated, worker threads have exited or been
589     // forcefully terminated by the OS and only the shutdown thread remains.
590     // This can leave the runtime in an inconsistent state.
591     // Hence, only attempt proper cleanup when FreeLibrary() is called.
592     // Otherwise, rely on OS to reclaim resources.
593     if (lpReserved == NULL)
594       __kmp_internal_end_library(__kmp_gtid_get_specific());
595 
596     return TRUE;
597 
598   case DLL_THREAD_ATTACH:
599     KA_TRACE(10, ("DllMain: THREAD_ATTACH\n"));
600 
601     /* if we want to register new siblings all the time here call
602      * __kmp_get_gtid(); */
603     return TRUE;
604 
605   case DLL_THREAD_DETACH:
606     KA_TRACE(10, ("DllMain: THREAD_DETACH T#%d\n", __kmp_gtid_get_specific()));
607 
608     __kmp_internal_end_thread(__kmp_gtid_get_specific());
609     return TRUE;
610   }
611 
612   return TRUE;
613 }
614 
615 #endif /* KMP_OS_WINDOWS */
616 #endif /* KMP_DYNAMIC_LIB */
617 
618 /* __kmp_parallel_deo -- Wait until it's our turn. */
619 void __kmp_parallel_deo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
620   int gtid = *gtid_ref;
621 #ifdef BUILD_PARALLEL_ORDERED
622   kmp_team_t *team = __kmp_team_from_gtid(gtid);
623 #endif /* BUILD_PARALLEL_ORDERED */
624 
625   if (__kmp_env_consistency_check) {
626     if (__kmp_threads[gtid]->th.th_root->r.r_active)
627 #if KMP_USE_DYNAMIC_LOCK
628       __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL, 0);
629 #else
630       __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL);
631 #endif
632   }
633 #ifdef BUILD_PARALLEL_ORDERED
634   if (!team->t.t_serialized) {
635     KMP_MB();
636     KMP_WAIT(&team->t.t_ordered.dt.t_value, __kmp_tid_from_gtid(gtid), KMP_EQ,
637              NULL);
638     KMP_MB();
639   }
640 #endif /* BUILD_PARALLEL_ORDERED */
641 }
642 
643 /* __kmp_parallel_dxo -- Signal the next task. */
644 void __kmp_parallel_dxo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
645   int gtid = *gtid_ref;
646 #ifdef BUILD_PARALLEL_ORDERED
647   int tid = __kmp_tid_from_gtid(gtid);
648   kmp_team_t *team = __kmp_team_from_gtid(gtid);
649 #endif /* BUILD_PARALLEL_ORDERED */
650 
651   if (__kmp_env_consistency_check) {
652     if (__kmp_threads[gtid]->th.th_root->r.r_active)
653       __kmp_pop_sync(gtid, ct_ordered_in_parallel, loc_ref);
654   }
655 #ifdef BUILD_PARALLEL_ORDERED
656   if (!team->t.t_serialized) {
657     KMP_MB(); /* Flush all pending memory write invalidates.  */
658 
659     /* use the tid of the next thread in this team */
660     /* TODO replace with general release procedure */
661     team->t.t_ordered.dt.t_value = ((tid + 1) % team->t.t_nproc);
662 
663     KMP_MB(); /* Flush all pending memory write invalidates.  */
664   }
665 #endif /* BUILD_PARALLEL_ORDERED */
666 }
667 
668 /* ------------------------------------------------------------------------ */
669 /* The BARRIER for a SINGLE process section is always explicit   */
670 
671 int __kmp_enter_single(int gtid, ident_t *id_ref, int push_ws) {
672   int status;
673   kmp_info_t *th;
674   kmp_team_t *team;
675 
676   if (!TCR_4(__kmp_init_parallel))
677     __kmp_parallel_initialize();
678   __kmp_resume_if_soft_paused();
679 
680   th = __kmp_threads[gtid];
681   team = th->th.th_team;
682   status = 0;
683 
684   th->th.th_ident = id_ref;
685 
686   if (team->t.t_serialized) {
687     status = 1;
688   } else {
689     kmp_int32 old_this = th->th.th_local.this_construct;
690 
691     ++th->th.th_local.this_construct;
692     /* try to set team count to thread count--success means thread got the
693        single block */
694     /* TODO: Should this be acquire or release? */
695     if (team->t.t_construct == old_this) {
696       status = __kmp_atomic_compare_store_acq(&team->t.t_construct, old_this,
697                                               th->th.th_local.this_construct);
698     }
699 #if USE_ITT_BUILD
700     if (__itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 &&
701         KMP_MASTER_GTID(gtid) && th->th.th_teams_microtask == NULL &&
702         team->t.t_active_level == 1) {
703       // Only report metadata by primary thread of active team at level 1
704       __kmp_itt_metadata_single(id_ref);
705     }
706 #endif /* USE_ITT_BUILD */
707   }
708 
709   if (__kmp_env_consistency_check) {
710     if (status && push_ws) {
711       __kmp_push_workshare(gtid, ct_psingle, id_ref);
712     } else {
713       __kmp_check_workshare(gtid, ct_psingle, id_ref);
714     }
715   }
716 #if USE_ITT_BUILD
717   if (status) {
718     __kmp_itt_single_start(gtid);
719   }
720 #endif /* USE_ITT_BUILD */
721   return status;
722 }
723 
724 void __kmp_exit_single(int gtid) {
725 #if USE_ITT_BUILD
726   __kmp_itt_single_end(gtid);
727 #endif /* USE_ITT_BUILD */
728   if (__kmp_env_consistency_check)
729     __kmp_pop_workshare(gtid, ct_psingle, NULL);
730 }
731 
732 /* determine if we can go parallel or must use a serialized parallel region and
733  * how many threads we can use
734  * set_nproc is the number of threads requested for the team
735  * returns 0 if we should serialize or only use one thread,
736  * otherwise the number of threads to use
737  * The forkjoin lock is held by the caller. */
738 static int __kmp_reserve_threads(kmp_root_t *root, kmp_team_t *parent_team,
739                                  int master_tid, int set_nthreads,
740                                  int enter_teams) {
741   int capacity;
742   int new_nthreads;
743   KMP_DEBUG_ASSERT(__kmp_init_serial);
744   KMP_DEBUG_ASSERT(root && parent_team);
745   kmp_info_t *this_thr = parent_team->t.t_threads[master_tid];
746 
747   // If dyn-var is set, dynamically adjust the number of desired threads,
748   // according to the method specified by dynamic_mode.
749   new_nthreads = set_nthreads;
750   if (!get__dynamic_2(parent_team, master_tid)) {
751     ;
752   }
753 #ifdef USE_LOAD_BALANCE
754   else if (__kmp_global.g.g_dynamic_mode == dynamic_load_balance) {
755     new_nthreads = __kmp_load_balance_nproc(root, set_nthreads);
756     if (new_nthreads == 1) {
757       KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced "
758                     "reservation to 1 thread\n",
759                     master_tid));
760       return 1;
761     }
762     if (new_nthreads < set_nthreads) {
763       KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced "
764                     "reservation to %d threads\n",
765                     master_tid, new_nthreads));
766     }
767   }
768 #endif /* USE_LOAD_BALANCE */
769   else if (__kmp_global.g.g_dynamic_mode == dynamic_thread_limit) {
770     new_nthreads = __kmp_avail_proc - __kmp_nth +
771                    (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
772     if (new_nthreads <= 1) {
773       KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced "
774                     "reservation to 1 thread\n",
775                     master_tid));
776       return 1;
777     }
778     if (new_nthreads < set_nthreads) {
779       KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced "
780                     "reservation to %d threads\n",
781                     master_tid, new_nthreads));
782     } else {
783       new_nthreads = set_nthreads;
784     }
785   } else if (__kmp_global.g.g_dynamic_mode == dynamic_random) {
786     if (set_nthreads > 2) {
787       new_nthreads = __kmp_get_random(parent_team->t.t_threads[master_tid]);
788       new_nthreads = (new_nthreads % set_nthreads) + 1;
789       if (new_nthreads == 1) {
790         KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced "
791                       "reservation to 1 thread\n",
792                       master_tid));
793         return 1;
794       }
795       if (new_nthreads < set_nthreads) {
796         KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced "
797                       "reservation to %d threads\n",
798                       master_tid, new_nthreads));
799       }
800     }
801   } else {
802     KMP_ASSERT(0);
803   }
804 
805   // Respect KMP_ALL_THREADS/KMP_DEVICE_THREAD_LIMIT.
806   if (__kmp_nth + new_nthreads -
807           (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
808       __kmp_max_nth) {
809     int tl_nthreads = __kmp_max_nth - __kmp_nth +
810                       (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
811     if (tl_nthreads <= 0) {
812       tl_nthreads = 1;
813     }
814 
815     // If dyn-var is false, emit a 1-time warning.
816     if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
817       __kmp_reserve_warn = 1;
818       __kmp_msg(kmp_ms_warning,
819                 KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads),
820                 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
821     }
822     if (tl_nthreads == 1) {
823       KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT "
824                     "reduced reservation to 1 thread\n",
825                     master_tid));
826       return 1;
827     }
828     KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT reduced "
829                   "reservation to %d threads\n",
830                   master_tid, tl_nthreads));
831     new_nthreads = tl_nthreads;
832   }
833 
834   // Respect OMP_THREAD_LIMIT
835   int cg_nthreads = this_thr->th.th_cg_roots->cg_nthreads;
836   int max_cg_threads = this_thr->th.th_cg_roots->cg_thread_limit;
837   if (cg_nthreads + new_nthreads -
838           (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
839       max_cg_threads) {
840     int tl_nthreads = max_cg_threads - cg_nthreads +
841                       (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
842     if (tl_nthreads <= 0) {
843       tl_nthreads = 1;
844     }
845 
846     // If dyn-var is false, emit a 1-time warning.
847     if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
848       __kmp_reserve_warn = 1;
849       __kmp_msg(kmp_ms_warning,
850                 KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads),
851                 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
852     }
853     if (tl_nthreads == 1) {
854       KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT "
855                     "reduced reservation to 1 thread\n",
856                     master_tid));
857       return 1;
858     }
859     KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT reduced "
860                   "reservation to %d threads\n",
861                   master_tid, tl_nthreads));
862     new_nthreads = tl_nthreads;
863   }
864 
865   // Check if the threads array is large enough, or needs expanding.
866   // See comment in __kmp_register_root() about the adjustment if
867   // __kmp_threads[0] == NULL.
868   capacity = __kmp_threads_capacity;
869   if (TCR_PTR(__kmp_threads[0]) == NULL) {
870     --capacity;
871   }
872   // If it is not for initializing the hidden helper team, we need to take
873   // __kmp_hidden_helper_threads_num out of the capacity because it is included
874   // in __kmp_threads_capacity.
875   if (__kmp_enable_hidden_helper && !TCR_4(__kmp_init_hidden_helper_threads)) {
876     capacity -= __kmp_hidden_helper_threads_num;
877   }
878   if (__kmp_nth + new_nthreads -
879           (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
880       capacity) {
881     // Expand the threads array.
882     int slotsRequired = __kmp_nth + new_nthreads -
883                         (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) -
884                         capacity;
885     int slotsAdded = __kmp_expand_threads(slotsRequired);
886     if (slotsAdded < slotsRequired) {
887       // The threads array was not expanded enough.
888       new_nthreads -= (slotsRequired - slotsAdded);
889       KMP_ASSERT(new_nthreads >= 1);
890 
891       // If dyn-var is false, emit a 1-time warning.
892       if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
893         __kmp_reserve_warn = 1;
894         if (__kmp_tp_cached) {
895           __kmp_msg(kmp_ms_warning,
896                     KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads),
897                     KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity),
898                     KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null);
899         } else {
900           __kmp_msg(kmp_ms_warning,
901                     KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads),
902                     KMP_HNT(SystemLimitOnThreads), __kmp_msg_null);
903         }
904       }
905     }
906   }
907 
908 #ifdef KMP_DEBUG
909   if (new_nthreads == 1) {
910     KC_TRACE(10,
911              ("__kmp_reserve_threads: T#%d serializing team after reclaiming "
912               "dead roots and rechecking; requested %d threads\n",
913               __kmp_get_gtid(), set_nthreads));
914   } else {
915     KC_TRACE(10, ("__kmp_reserve_threads: T#%d allocating %d threads; requested"
916                   " %d threads\n",
917                   __kmp_get_gtid(), new_nthreads, set_nthreads));
918   }
919 #endif // KMP_DEBUG
920   return new_nthreads;
921 }
922 
923 /* Allocate threads from the thread pool and assign them to the new team. We are
924    assured that there are enough threads available, because we checked on that
925    earlier within critical section forkjoin */
926 static void __kmp_fork_team_threads(kmp_root_t *root, kmp_team_t *team,
927                                     kmp_info_t *master_th, int master_gtid,
928                                     int fork_teams_workers) {
929   int i;
930   int use_hot_team;
931 
932   KA_TRACE(10, ("__kmp_fork_team_threads: new_nprocs = %d\n", team->t.t_nproc));
933   KMP_DEBUG_ASSERT(master_gtid == __kmp_get_gtid());
934   KMP_MB();
935 
936   /* first, let's setup the primary thread */
937   master_th->th.th_info.ds.ds_tid = 0;
938   master_th->th.th_team = team;
939   master_th->th.th_team_nproc = team->t.t_nproc;
940   master_th->th.th_team_master = master_th;
941   master_th->th.th_team_serialized = FALSE;
942   master_th->th.th_dispatch = &team->t.t_dispatch[0];
943 
944 /* make sure we are not the optimized hot team */
945 #if KMP_NESTED_HOT_TEAMS
946   use_hot_team = 0;
947   kmp_hot_team_ptr_t *hot_teams = master_th->th.th_hot_teams;
948   if (hot_teams) { // hot teams array is not allocated if
949     // KMP_HOT_TEAMS_MAX_LEVEL=0
950     int level = team->t.t_active_level - 1; // index in array of hot teams
951     if (master_th->th.th_teams_microtask) { // are we inside the teams?
952       if (master_th->th.th_teams_size.nteams > 1) {
953         ++level; // level was not increased in teams construct for
954         // team_of_masters
955       }
956       if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
957           master_th->th.th_teams_level == team->t.t_level) {
958         ++level; // level was not increased in teams construct for
959         // team_of_workers before the parallel
960       } // team->t.t_level will be increased inside parallel
961     }
962     if (level < __kmp_hot_teams_max_level) {
963       if (hot_teams[level].hot_team) {
964         // hot team has already been allocated for given level
965         KMP_DEBUG_ASSERT(hot_teams[level].hot_team == team);
966         use_hot_team = 1; // the team is ready to use
967       } else {
968         use_hot_team = 0; // AC: threads are not allocated yet
969         hot_teams[level].hot_team = team; // remember new hot team
970         hot_teams[level].hot_team_nth = team->t.t_nproc;
971       }
972     } else {
973       use_hot_team = 0;
974     }
975   }
976 #else
977   use_hot_team = team == root->r.r_hot_team;
978 #endif
979   if (!use_hot_team) {
980 
981     /* install the primary thread */
982     team->t.t_threads[0] = master_th;
983     __kmp_initialize_info(master_th, team, 0, master_gtid);
984 
985     /* now, install the worker threads */
986     for (i = 1; i < team->t.t_nproc; i++) {
987 
988       /* fork or reallocate a new thread and install it in team */
989       kmp_info_t *thr = __kmp_allocate_thread(root, team, i);
990       team->t.t_threads[i] = thr;
991       KMP_DEBUG_ASSERT(thr);
992       KMP_DEBUG_ASSERT(thr->th.th_team == team);
993       /* align team and thread arrived states */
994       KA_TRACE(20, ("__kmp_fork_team_threads: T#%d(%d:%d) init arrived "
995                     "T#%d(%d:%d) join =%llu, plain=%llu\n",
996                     __kmp_gtid_from_tid(0, team), team->t.t_id, 0,
997                     __kmp_gtid_from_tid(i, team), team->t.t_id, i,
998                     team->t.t_bar[bs_forkjoin_barrier].b_arrived,
999                     team->t.t_bar[bs_plain_barrier].b_arrived));
1000       thr->th.th_teams_microtask = master_th->th.th_teams_microtask;
1001       thr->th.th_teams_level = master_th->th.th_teams_level;
1002       thr->th.th_teams_size = master_th->th.th_teams_size;
1003       { // Initialize threads' barrier data.
1004         int b;
1005         kmp_balign_t *balign = team->t.t_threads[i]->th.th_bar;
1006         for (b = 0; b < bs_last_barrier; ++b) {
1007           balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
1008           KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
1009 #if USE_DEBUGGER
1010           balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
1011 #endif
1012         }
1013       }
1014     }
1015 
1016 #if KMP_AFFINITY_SUPPORTED
1017     // Do not partition the places list for teams construct workers who
1018     // haven't actually been forked to do real work yet. This partitioning
1019     // will take place in the parallel region nested within the teams construct.
1020     if (!fork_teams_workers) {
1021       __kmp_partition_places(team);
1022     }
1023 #endif
1024 
1025     if (team->t.t_nproc > 1 &&
1026         __kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
1027       team->t.b->update_num_threads(team->t.t_nproc);
1028       __kmp_add_threads_to_team(team, team->t.t_nproc);
1029     }
1030   }
1031 
1032   if (__kmp_display_affinity && team->t.t_display_affinity != 1) {
1033     for (i = 0; i < team->t.t_nproc; i++) {
1034       kmp_info_t *thr = team->t.t_threads[i];
1035       if (thr->th.th_prev_num_threads != team->t.t_nproc ||
1036           thr->th.th_prev_level != team->t.t_level) {
1037         team->t.t_display_affinity = 1;
1038         break;
1039       }
1040     }
1041   }
1042 
1043   KMP_MB();
1044 }
1045 
1046 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
1047 // Propagate any changes to the floating point control registers out to the team
1048 // We try to avoid unnecessary writes to the relevant cache line in the team
1049 // structure, so we don't make changes unless they are needed.
1050 inline static void propagateFPControl(kmp_team_t *team) {
1051   if (__kmp_inherit_fp_control) {
1052     kmp_int16 x87_fpu_control_word;
1053     kmp_uint32 mxcsr;
1054 
1055     // Get primary thread's values of FPU control flags (both X87 and vector)
1056     __kmp_store_x87_fpu_control_word(&x87_fpu_control_word);
1057     __kmp_store_mxcsr(&mxcsr);
1058     mxcsr &= KMP_X86_MXCSR_MASK;
1059 
1060     // There is no point looking at t_fp_control_saved here.
1061     // If it is TRUE, we still have to update the values if they are different
1062     // from those we now have. If it is FALSE we didn't save anything yet, but
1063     // our objective is the same. We have to ensure that the values in the team
1064     // are the same as those we have.
1065     // So, this code achieves what we need whether or not t_fp_control_saved is
1066     // true. By checking whether the value needs updating we avoid unnecessary
1067     // writes that would put the cache-line into a written state, causing all
1068     // threads in the team to have to read it again.
1069     KMP_CHECK_UPDATE(team->t.t_x87_fpu_control_word, x87_fpu_control_word);
1070     KMP_CHECK_UPDATE(team->t.t_mxcsr, mxcsr);
1071     // Although we don't use this value, other code in the runtime wants to know
1072     // whether it should restore them. So we must ensure it is correct.
1073     KMP_CHECK_UPDATE(team->t.t_fp_control_saved, TRUE);
1074   } else {
1075     // Similarly here. Don't write to this cache-line in the team structure
1076     // unless we have to.
1077     KMP_CHECK_UPDATE(team->t.t_fp_control_saved, FALSE);
1078   }
1079 }
1080 
1081 // Do the opposite, setting the hardware registers to the updated values from
1082 // the team.
1083 inline static void updateHWFPControl(kmp_team_t *team) {
1084   if (__kmp_inherit_fp_control && team->t.t_fp_control_saved) {
1085     // Only reset the fp control regs if they have been changed in the team.
1086     // the parallel region that we are exiting.
1087     kmp_int16 x87_fpu_control_word;
1088     kmp_uint32 mxcsr;
1089     __kmp_store_x87_fpu_control_word(&x87_fpu_control_word);
1090     __kmp_store_mxcsr(&mxcsr);
1091     mxcsr &= KMP_X86_MXCSR_MASK;
1092 
1093     if (team->t.t_x87_fpu_control_word != x87_fpu_control_word) {
1094       __kmp_clear_x87_fpu_status_word();
1095       __kmp_load_x87_fpu_control_word(&team->t.t_x87_fpu_control_word);
1096     }
1097 
1098     if (team->t.t_mxcsr != mxcsr) {
1099       __kmp_load_mxcsr(&team->t.t_mxcsr);
1100     }
1101   }
1102 }
1103 #else
1104 #define propagateFPControl(x) ((void)0)
1105 #define updateHWFPControl(x) ((void)0)
1106 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
1107 
1108 static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team,
1109                                      int realloc); // forward declaration
1110 
1111 /* Run a parallel region that has been serialized, so runs only in a team of the
1112    single primary thread. */
1113 void __kmp_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
1114   kmp_info_t *this_thr;
1115   kmp_team_t *serial_team;
1116 
1117   KC_TRACE(10, ("__kmpc_serialized_parallel: called by T#%d\n", global_tid));
1118 
1119   /* Skip all this code for autopar serialized loops since it results in
1120      unacceptable overhead */
1121   if (loc != NULL && (loc->flags & KMP_IDENT_AUTOPAR))
1122     return;
1123 
1124   if (!TCR_4(__kmp_init_parallel))
1125     __kmp_parallel_initialize();
1126   __kmp_resume_if_soft_paused();
1127 
1128   this_thr = __kmp_threads[global_tid];
1129   serial_team = this_thr->th.th_serial_team;
1130 
1131   /* utilize the serialized team held by this thread */
1132   KMP_DEBUG_ASSERT(serial_team);
1133   KMP_MB();
1134 
1135   if (__kmp_tasking_mode != tskm_immediate_exec) {
1136     KMP_DEBUG_ASSERT(
1137         this_thr->th.th_task_team ==
1138         this_thr->th.th_team->t.t_task_team[this_thr->th.th_task_state]);
1139     KMP_DEBUG_ASSERT(serial_team->t.t_task_team[this_thr->th.th_task_state] ==
1140                      NULL);
1141     KA_TRACE(20, ("__kmpc_serialized_parallel: T#%d pushing task_team %p / "
1142                   "team %p, new task_team = NULL\n",
1143                   global_tid, this_thr->th.th_task_team, this_thr->th.th_team));
1144     this_thr->th.th_task_team = NULL;
1145   }
1146 
1147   kmp_proc_bind_t proc_bind = this_thr->th.th_set_proc_bind;
1148   if (this_thr->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
1149     proc_bind = proc_bind_false;
1150   } else if (proc_bind == proc_bind_default) {
1151     // No proc_bind clause was specified, so use the current value
1152     // of proc-bind-var for this parallel region.
1153     proc_bind = this_thr->th.th_current_task->td_icvs.proc_bind;
1154   }
1155   // Reset for next parallel region
1156   this_thr->th.th_set_proc_bind = proc_bind_default;
1157 
1158   // Reset num_threads for next parallel region
1159   this_thr->th.th_set_nproc = 0;
1160 
1161 #if OMPT_SUPPORT
1162   ompt_data_t ompt_parallel_data = ompt_data_none;
1163   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(global_tid);
1164   if (ompt_enabled.enabled &&
1165       this_thr->th.ompt_thread_info.state != ompt_state_overhead) {
1166 
1167     ompt_task_info_t *parent_task_info;
1168     parent_task_info = OMPT_CUR_TASK_INFO(this_thr);
1169 
1170     parent_task_info->frame.enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1171     if (ompt_enabled.ompt_callback_parallel_begin) {
1172       int team_size = 1;
1173 
1174       ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)(
1175           &(parent_task_info->task_data), &(parent_task_info->frame),
1176           &ompt_parallel_data, team_size,
1177           ompt_parallel_invoker_program | ompt_parallel_team, codeptr);
1178     }
1179   }
1180 #endif // OMPT_SUPPORT
1181 
1182   if (this_thr->th.th_team != serial_team) {
1183     // Nested level will be an index in the nested nthreads array
1184     int level = this_thr->th.th_team->t.t_level;
1185 
1186     if (serial_team->t.t_serialized) {
1187       /* this serial team was already used
1188          TODO increase performance by making this locks more specific */
1189       kmp_team_t *new_team;
1190 
1191       __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
1192 
1193       new_team =
1194           __kmp_allocate_team(this_thr->th.th_root, 1, 1,
1195 #if OMPT_SUPPORT
1196                               ompt_parallel_data,
1197 #endif
1198                               proc_bind, &this_thr->th.th_current_task->td_icvs,
1199                               0 USE_NESTED_HOT_ARG(NULL));
1200       __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
1201       KMP_ASSERT(new_team);
1202 
1203       /* setup new serialized team and install it */
1204       new_team->t.t_threads[0] = this_thr;
1205       new_team->t.t_parent = this_thr->th.th_team;
1206       serial_team = new_team;
1207       this_thr->th.th_serial_team = serial_team;
1208 
1209       KF_TRACE(
1210           10,
1211           ("__kmpc_serialized_parallel: T#%d allocated new serial team %p\n",
1212            global_tid, serial_team));
1213 
1214       /* TODO the above breaks the requirement that if we run out of resources,
1215          then we can still guarantee that serialized teams are ok, since we may
1216          need to allocate a new one */
1217     } else {
1218       KF_TRACE(
1219           10,
1220           ("__kmpc_serialized_parallel: T#%d reusing cached serial team %p\n",
1221            global_tid, serial_team));
1222     }
1223 
1224     /* we have to initialize this serial team */
1225     KMP_DEBUG_ASSERT(serial_team->t.t_threads);
1226     KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
1227     KMP_DEBUG_ASSERT(this_thr->th.th_team != serial_team);
1228     serial_team->t.t_ident = loc;
1229     serial_team->t.t_serialized = 1;
1230     serial_team->t.t_nproc = 1;
1231     serial_team->t.t_parent = this_thr->th.th_team;
1232     serial_team->t.t_sched.sched = this_thr->th.th_team->t.t_sched.sched;
1233     this_thr->th.th_team = serial_team;
1234     serial_team->t.t_master_tid = this_thr->th.th_info.ds.ds_tid;
1235 
1236     KF_TRACE(10, ("__kmpc_serialized_parallel: T#%d curtask=%p\n", global_tid,
1237                   this_thr->th.th_current_task));
1238     KMP_ASSERT(this_thr->th.th_current_task->td_flags.executing == 1);
1239     this_thr->th.th_current_task->td_flags.executing = 0;
1240 
1241     __kmp_push_current_task_to_thread(this_thr, serial_team, 0);
1242 
1243     /* TODO: GEH: do ICVs work for nested serialized teams? Don't we need an
1244        implicit task for each serialized task represented by
1245        team->t.t_serialized? */
1246     copy_icvs(&this_thr->th.th_current_task->td_icvs,
1247               &this_thr->th.th_current_task->td_parent->td_icvs);
1248 
1249     // Thread value exists in the nested nthreads array for the next nested
1250     // level
1251     if (__kmp_nested_nth.used && (level + 1 < __kmp_nested_nth.used)) {
1252       this_thr->th.th_current_task->td_icvs.nproc =
1253           __kmp_nested_nth.nth[level + 1];
1254     }
1255 
1256     if (__kmp_nested_proc_bind.used &&
1257         (level + 1 < __kmp_nested_proc_bind.used)) {
1258       this_thr->th.th_current_task->td_icvs.proc_bind =
1259           __kmp_nested_proc_bind.bind_types[level + 1];
1260     }
1261 
1262 #if USE_DEBUGGER
1263     serial_team->t.t_pkfn = (microtask_t)(~0); // For the debugger.
1264 #endif
1265     this_thr->th.th_info.ds.ds_tid = 0;
1266 
1267     /* set thread cache values */
1268     this_thr->th.th_team_nproc = 1;
1269     this_thr->th.th_team_master = this_thr;
1270     this_thr->th.th_team_serialized = 1;
1271 
1272     serial_team->t.t_level = serial_team->t.t_parent->t.t_level + 1;
1273     serial_team->t.t_active_level = serial_team->t.t_parent->t.t_active_level;
1274     serial_team->t.t_def_allocator = this_thr->th.th_def_allocator; // save
1275 
1276     propagateFPControl(serial_team);
1277 
1278     /* check if we need to allocate dispatch buffers stack */
1279     KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
1280     if (!serial_team->t.t_dispatch->th_disp_buffer) {
1281       serial_team->t.t_dispatch->th_disp_buffer =
1282           (dispatch_private_info_t *)__kmp_allocate(
1283               sizeof(dispatch_private_info_t));
1284     }
1285     this_thr->th.th_dispatch = serial_team->t.t_dispatch;
1286 
1287     KMP_MB();
1288 
1289   } else {
1290     /* this serialized team is already being used,
1291      * that's fine, just add another nested level */
1292     KMP_DEBUG_ASSERT(this_thr->th.th_team == serial_team);
1293     KMP_DEBUG_ASSERT(serial_team->t.t_threads);
1294     KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
1295     ++serial_team->t.t_serialized;
1296     this_thr->th.th_team_serialized = serial_team->t.t_serialized;
1297 
1298     // Nested level will be an index in the nested nthreads array
1299     int level = this_thr->th.th_team->t.t_level;
1300     // Thread value exists in the nested nthreads array for the next nested
1301     // level
1302     if (__kmp_nested_nth.used && (level + 1 < __kmp_nested_nth.used)) {
1303       this_thr->th.th_current_task->td_icvs.nproc =
1304           __kmp_nested_nth.nth[level + 1];
1305     }
1306     serial_team->t.t_level++;
1307     KF_TRACE(10, ("__kmpc_serialized_parallel: T#%d increasing nesting level "
1308                   "of serial team %p to %d\n",
1309                   global_tid, serial_team, serial_team->t.t_level));
1310 
1311     /* allocate/push dispatch buffers stack */
1312     KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
1313     {
1314       dispatch_private_info_t *disp_buffer =
1315           (dispatch_private_info_t *)__kmp_allocate(
1316               sizeof(dispatch_private_info_t));
1317       disp_buffer->next = serial_team->t.t_dispatch->th_disp_buffer;
1318       serial_team->t.t_dispatch->th_disp_buffer = disp_buffer;
1319     }
1320     this_thr->th.th_dispatch = serial_team->t.t_dispatch;
1321 
1322     KMP_MB();
1323   }
1324   KMP_CHECK_UPDATE(serial_team->t.t_cancel_request, cancel_noreq);
1325 
1326   // Perform the display affinity functionality for
1327   // serialized parallel regions
1328   if (__kmp_display_affinity) {
1329     if (this_thr->th.th_prev_level != serial_team->t.t_level ||
1330         this_thr->th.th_prev_num_threads != 1) {
1331       // NULL means use the affinity-format-var ICV
1332       __kmp_aux_display_affinity(global_tid, NULL);
1333       this_thr->th.th_prev_level = serial_team->t.t_level;
1334       this_thr->th.th_prev_num_threads = 1;
1335     }
1336   }
1337 
1338   if (__kmp_env_consistency_check)
1339     __kmp_push_parallel(global_tid, NULL);
1340 #if OMPT_SUPPORT
1341   serial_team->t.ompt_team_info.master_return_address = codeptr;
1342   if (ompt_enabled.enabled &&
1343       this_thr->th.ompt_thread_info.state != ompt_state_overhead) {
1344     OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame.ptr =
1345         OMPT_GET_FRAME_ADDRESS(0);
1346 
1347     ompt_lw_taskteam_t lw_taskteam;
1348     __ompt_lw_taskteam_init(&lw_taskteam, this_thr, global_tid,
1349                             &ompt_parallel_data, codeptr);
1350 
1351     __ompt_lw_taskteam_link(&lw_taskteam, this_thr, 1);
1352     // don't use lw_taskteam after linking. content was swaped
1353 
1354     /* OMPT implicit task begin */
1355     if (ompt_enabled.ompt_callback_implicit_task) {
1356       ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1357           ompt_scope_begin, OMPT_CUR_TEAM_DATA(this_thr),
1358           OMPT_CUR_TASK_DATA(this_thr), 1, __kmp_tid_from_gtid(global_tid),
1359           ompt_task_implicit); // TODO: Can this be ompt_task_initial?
1360       OMPT_CUR_TASK_INFO(this_thr)->thread_num =
1361           __kmp_tid_from_gtid(global_tid);
1362     }
1363 
1364     /* OMPT state */
1365     this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
1366     OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame.ptr =
1367         OMPT_GET_FRAME_ADDRESS(0);
1368   }
1369 #endif
1370 }
1371 
1372 // Test if this fork is for a team closely nested in a teams construct
1373 static inline bool __kmp_is_fork_in_teams(kmp_info_t *master_th,
1374                                           microtask_t microtask, int level,
1375                                           int teams_level, kmp_va_list ap) {
1376   return (master_th->th.th_teams_microtask && ap &&
1377           microtask != (microtask_t)__kmp_teams_master && level == teams_level);
1378 }
1379 
1380 // Test if this fork is for the teams construct, i.e. to form the outer league
1381 // of teams
1382 static inline bool __kmp_is_entering_teams(int active_level, int level,
1383                                            int teams_level, kmp_va_list ap) {
1384   return ((ap == NULL && active_level == 0) ||
1385           (ap && teams_level > 0 && teams_level == level));
1386 }
1387 
1388 // AC: This is start of parallel that is nested inside teams construct.
1389 // The team is actual (hot), all workers are ready at the fork barrier.
1390 // No lock needed to initialize the team a bit, then free workers.
1391 static inline int
1392 __kmp_fork_in_teams(ident_t *loc, int gtid, kmp_team_t *parent_team,
1393                     kmp_int32 argc, kmp_info_t *master_th, kmp_root_t *root,
1394                     enum fork_context_e call_context, microtask_t microtask,
1395                     launch_t invoker, int master_set_numthreads, int level,
1396 #if OMPT_SUPPORT
1397                     ompt_data_t ompt_parallel_data, void *return_address,
1398 #endif
1399                     kmp_va_list ap) {
1400   void **argv;
1401   int i;
1402 
1403   parent_team->t.t_ident = loc;
1404   __kmp_alloc_argv_entries(argc, parent_team, TRUE);
1405   parent_team->t.t_argc = argc;
1406   argv = (void **)parent_team->t.t_argv;
1407   for (i = argc - 1; i >= 0; --i) {
1408     *argv++ = va_arg(kmp_va_deref(ap), void *);
1409   }
1410   // Increment our nested depth levels, but not increase the serialization
1411   if (parent_team == master_th->th.th_serial_team) {
1412     // AC: we are in serialized parallel
1413     __kmpc_serialized_parallel(loc, gtid);
1414     KMP_DEBUG_ASSERT(parent_team->t.t_serialized > 1);
1415 
1416     if (call_context == fork_context_gnu) {
1417       // AC: need to decrement t_serialized for enquiry functions to work
1418       // correctly, will restore at join time
1419       parent_team->t.t_serialized--;
1420       return TRUE;
1421     }
1422 
1423 #if OMPD_SUPPORT
1424     parent_team->t.t_pkfn = microtask;
1425 #endif
1426 
1427 #if OMPT_SUPPORT
1428     void *dummy;
1429     void **exit_frame_p;
1430     ompt_data_t *implicit_task_data;
1431     ompt_lw_taskteam_t lw_taskteam;
1432 
1433     if (ompt_enabled.enabled) {
1434       __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1435                               &ompt_parallel_data, return_address);
1436       exit_frame_p = &(lw_taskteam.ompt_task_info.frame.exit_frame.ptr);
1437 
1438       __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
1439       // Don't use lw_taskteam after linking. Content was swapped.
1440 
1441       /* OMPT implicit task begin */
1442       implicit_task_data = OMPT_CUR_TASK_DATA(master_th);
1443       if (ompt_enabled.ompt_callback_implicit_task) {
1444         OMPT_CUR_TASK_INFO(master_th)->thread_num = __kmp_tid_from_gtid(gtid);
1445         ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1446             ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th), implicit_task_data,
1447             1, OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
1448       }
1449 
1450       /* OMPT state */
1451       master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1452     } else {
1453       exit_frame_p = &dummy;
1454     }
1455 #endif
1456 
1457     // AC: need to decrement t_serialized for enquiry functions to work
1458     // correctly, will restore at join time
1459     parent_team->t.t_serialized--;
1460 
1461     {
1462       KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1463       KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1464       __kmp_invoke_microtask(microtask, gtid, 0, argc, parent_team->t.t_argv
1465 #if OMPT_SUPPORT
1466                              ,
1467                              exit_frame_p
1468 #endif
1469                              );
1470     }
1471 
1472 #if OMPT_SUPPORT
1473     if (ompt_enabled.enabled) {
1474       *exit_frame_p = NULL;
1475       OMPT_CUR_TASK_INFO(master_th)->frame.exit_frame = ompt_data_none;
1476       if (ompt_enabled.ompt_callback_implicit_task) {
1477         ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1478             ompt_scope_end, NULL, implicit_task_data, 1,
1479             OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
1480       }
1481       ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
1482       __ompt_lw_taskteam_unlink(master_th);
1483       if (ompt_enabled.ompt_callback_parallel_end) {
1484         ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1485             &ompt_parallel_data, OMPT_CUR_TASK_DATA(master_th),
1486             OMPT_INVOKER(call_context) | ompt_parallel_team, return_address);
1487       }
1488       master_th->th.ompt_thread_info.state = ompt_state_overhead;
1489     }
1490 #endif
1491     return TRUE;
1492   }
1493 
1494   parent_team->t.t_pkfn = microtask;
1495   parent_team->t.t_invoke = invoker;
1496   KMP_ATOMIC_INC(&root->r.r_in_parallel);
1497   parent_team->t.t_active_level++;
1498   parent_team->t.t_level++;
1499   parent_team->t.t_def_allocator = master_th->th.th_def_allocator; // save
1500 
1501   // If the threads allocated to the team are less than the thread limit, update
1502   // the thread limit here. th_teams_size.nth is specific to this team nested
1503   // in a teams construct, the team is fully created, and we're about to do
1504   // the actual fork. Best to do this here so that the subsequent uses below
1505   // and in the join have the correct value.
1506   master_th->th.th_teams_size.nth = parent_team->t.t_nproc;
1507 
1508 #if OMPT_SUPPORT
1509   if (ompt_enabled.enabled) {
1510     ompt_lw_taskteam_t lw_taskteam;
1511     __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid, &ompt_parallel_data,
1512                             return_address);
1513     __ompt_lw_taskteam_link(&lw_taskteam, master_th, 1, true);
1514   }
1515 #endif
1516 
1517   /* Change number of threads in the team if requested */
1518   if (master_set_numthreads) { // The parallel has num_threads clause
1519     if (master_set_numthreads <= master_th->th.th_teams_size.nth) {
1520       // AC: only can reduce number of threads dynamically, can't increase
1521       kmp_info_t **other_threads = parent_team->t.t_threads;
1522       // NOTE: if using distributed barrier, we need to run this code block
1523       // even when the team size appears not to have changed from the max.
1524       int old_proc = master_th->th.th_teams_size.nth;
1525       if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
1526         __kmp_resize_dist_barrier(parent_team, old_proc, master_set_numthreads);
1527         __kmp_add_threads_to_team(parent_team, master_set_numthreads);
1528       }
1529       parent_team->t.t_nproc = master_set_numthreads;
1530       for (i = 0; i < master_set_numthreads; ++i) {
1531         other_threads[i]->th.th_team_nproc = master_set_numthreads;
1532       }
1533     }
1534     // Keep extra threads hot in the team for possible next parallels
1535     master_th->th.th_set_nproc = 0;
1536   }
1537 
1538 #if USE_DEBUGGER
1539   if (__kmp_debugging) { // Let debugger override number of threads.
1540     int nth = __kmp_omp_num_threads(loc);
1541     if (nth > 0) { // 0 means debugger doesn't want to change num threads
1542       master_set_numthreads = nth;
1543     }
1544   }
1545 #endif
1546 
1547   // Figure out the proc_bind policy for the nested parallel within teams
1548   kmp_proc_bind_t proc_bind = master_th->th.th_set_proc_bind;
1549   // proc_bind_default means don't update
1550   kmp_proc_bind_t proc_bind_icv = proc_bind_default;
1551   if (master_th->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
1552     proc_bind = proc_bind_false;
1553   } else {
1554     // No proc_bind clause specified; use current proc-bind-var
1555     if (proc_bind == proc_bind_default) {
1556       proc_bind = master_th->th.th_current_task->td_icvs.proc_bind;
1557     }
1558     /* else: The proc_bind policy was specified explicitly on parallel clause.
1559        This overrides proc-bind-var for this parallel region, but does not
1560        change proc-bind-var. */
1561     // Figure the value of proc-bind-var for the child threads.
1562     if ((level + 1 < __kmp_nested_proc_bind.used) &&
1563         (__kmp_nested_proc_bind.bind_types[level + 1] !=
1564          master_th->th.th_current_task->td_icvs.proc_bind)) {
1565       proc_bind_icv = __kmp_nested_proc_bind.bind_types[level + 1];
1566     }
1567   }
1568   KMP_CHECK_UPDATE(parent_team->t.t_proc_bind, proc_bind);
1569   // Need to change the bind-var ICV to correct value for each implicit task
1570   if (proc_bind_icv != proc_bind_default &&
1571       master_th->th.th_current_task->td_icvs.proc_bind != proc_bind_icv) {
1572     kmp_info_t **other_threads = parent_team->t.t_threads;
1573     for (i = 0; i < master_th->th.th_team_nproc; ++i) {
1574       other_threads[i]->th.th_current_task->td_icvs.proc_bind = proc_bind_icv;
1575     }
1576   }
1577   // Reset for next parallel region
1578   master_th->th.th_set_proc_bind = proc_bind_default;
1579 
1580 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1581   if (((__itt_frame_submit_v3_ptr && __itt_get_timestamp_ptr) ||
1582        KMP_ITT_DEBUG) &&
1583       __kmp_forkjoin_frames_mode == 3 &&
1584       parent_team->t.t_active_level == 1 // only report frames at level 1
1585       && master_th->th.th_teams_size.nteams == 1) {
1586     kmp_uint64 tmp_time = __itt_get_timestamp();
1587     master_th->th.th_frame_time = tmp_time;
1588     parent_team->t.t_region_time = tmp_time;
1589   }
1590   if (__itt_stack_caller_create_ptr) {
1591     KMP_DEBUG_ASSERT(parent_team->t.t_stack_id == NULL);
1592     // create new stack stitching id before entering fork barrier
1593     parent_team->t.t_stack_id = __kmp_itt_stack_caller_create();
1594   }
1595 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
1596 #if KMP_AFFINITY_SUPPORTED
1597   __kmp_partition_places(parent_team);
1598 #endif
1599 
1600   KF_TRACE(10, ("__kmp_fork_in_teams: before internal fork: root=%p, team=%p, "
1601                 "master_th=%p, gtid=%d\n",
1602                 root, parent_team, master_th, gtid));
1603   __kmp_internal_fork(loc, gtid, parent_team);
1604   KF_TRACE(10, ("__kmp_fork_in_teams: after internal fork: root=%p, team=%p, "
1605                 "master_th=%p, gtid=%d\n",
1606                 root, parent_team, master_th, gtid));
1607 
1608   if (call_context == fork_context_gnu)
1609     return TRUE;
1610 
1611   /* Invoke microtask for PRIMARY thread */
1612   KA_TRACE(20, ("__kmp_fork_in_teams: T#%d(%d:0) invoke microtask = %p\n", gtid,
1613                 parent_team->t.t_id, parent_team->t.t_pkfn));
1614 
1615   if (!parent_team->t.t_invoke(gtid)) {
1616     KMP_ASSERT2(0, "cannot invoke microtask for PRIMARY thread");
1617   }
1618   KA_TRACE(20, ("__kmp_fork_in_teams: T#%d(%d:0) done microtask = %p\n", gtid,
1619                 parent_team->t.t_id, parent_team->t.t_pkfn));
1620   KMP_MB(); /* Flush all pending memory write invalidates.  */
1621 
1622   KA_TRACE(20, ("__kmp_fork_in_teams: parallel exit T#%d\n", gtid));
1623 
1624   return TRUE;
1625 }
1626 
1627 // Create a serialized parallel region
1628 static inline int
1629 __kmp_serial_fork_call(ident_t *loc, int gtid, enum fork_context_e call_context,
1630                        kmp_int32 argc, microtask_t microtask, launch_t invoker,
1631                        kmp_info_t *master_th, kmp_team_t *parent_team,
1632 #if OMPT_SUPPORT
1633                        ompt_data_t *ompt_parallel_data, void **return_address,
1634                        ompt_data_t **parent_task_data,
1635 #endif
1636                        kmp_va_list ap) {
1637   kmp_team_t *team;
1638   int i;
1639   void **argv;
1640 
1641 /* josh todo: hypothetical question: what do we do for OS X*? */
1642 #if KMP_OS_LINUX &&                                                            \
1643     (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
1644   void *args[argc];
1645 #else
1646   void **args = (void **)KMP_ALLOCA(argc * sizeof(void *));
1647 #endif /* KMP_OS_LINUX && ( KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || \
1648           KMP_ARCH_AARCH64) */
1649 
1650   KA_TRACE(
1651       20, ("__kmp_serial_fork_call: T#%d serializing parallel region\n", gtid));
1652 
1653   __kmpc_serialized_parallel(loc, gtid);
1654 
1655 #if OMPD_SUPPORT
1656   master_th->th.th_serial_team->t.t_pkfn = microtask;
1657 #endif
1658 
1659   if (call_context == fork_context_intel) {
1660     /* TODO this sucks, use the compiler itself to pass args! :) */
1661     master_th->th.th_serial_team->t.t_ident = loc;
1662     if (!ap) {
1663       // revert change made in __kmpc_serialized_parallel()
1664       master_th->th.th_serial_team->t.t_level--;
1665 // Get args from parent team for teams construct
1666 
1667 #if OMPT_SUPPORT
1668       void *dummy;
1669       void **exit_frame_p;
1670       ompt_task_info_t *task_info;
1671       ompt_lw_taskteam_t lw_taskteam;
1672 
1673       if (ompt_enabled.enabled) {
1674         __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1675                                 ompt_parallel_data, *return_address);
1676 
1677         __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
1678         // don't use lw_taskteam after linking. content was swaped
1679         task_info = OMPT_CUR_TASK_INFO(master_th);
1680         exit_frame_p = &(task_info->frame.exit_frame.ptr);
1681         if (ompt_enabled.ompt_callback_implicit_task) {
1682           OMPT_CUR_TASK_INFO(master_th)->thread_num = __kmp_tid_from_gtid(gtid);
1683           ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1684               ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1685               &(task_info->task_data), 1,
1686               OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
1687         }
1688 
1689         /* OMPT state */
1690         master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1691       } else {
1692         exit_frame_p = &dummy;
1693       }
1694 #endif
1695 
1696       {
1697         KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1698         KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1699         __kmp_invoke_microtask(microtask, gtid, 0, argc, parent_team->t.t_argv
1700 #if OMPT_SUPPORT
1701                                ,
1702                                exit_frame_p
1703 #endif
1704                                );
1705       }
1706 
1707 #if OMPT_SUPPORT
1708       if (ompt_enabled.enabled) {
1709         *exit_frame_p = NULL;
1710         if (ompt_enabled.ompt_callback_implicit_task) {
1711           ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1712               ompt_scope_end, NULL, &(task_info->task_data), 1,
1713               OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
1714         }
1715         *ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
1716         __ompt_lw_taskteam_unlink(master_th);
1717         if (ompt_enabled.ompt_callback_parallel_end) {
1718           ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1719               ompt_parallel_data, *parent_task_data,
1720               OMPT_INVOKER(call_context) | ompt_parallel_team, *return_address);
1721         }
1722         master_th->th.ompt_thread_info.state = ompt_state_overhead;
1723       }
1724 #endif
1725     } else if (microtask == (microtask_t)__kmp_teams_master) {
1726       KMP_DEBUG_ASSERT(master_th->th.th_team == master_th->th.th_serial_team);
1727       team = master_th->th.th_team;
1728       // team->t.t_pkfn = microtask;
1729       team->t.t_invoke = invoker;
1730       __kmp_alloc_argv_entries(argc, team, TRUE);
1731       team->t.t_argc = argc;
1732       argv = (void **)team->t.t_argv;
1733       if (ap) {
1734         for (i = argc - 1; i >= 0; --i)
1735           *argv++ = va_arg(kmp_va_deref(ap), void *);
1736       } else {
1737         for (i = 0; i < argc; ++i)
1738           // Get args from parent team for teams construct
1739           argv[i] = parent_team->t.t_argv[i];
1740       }
1741       // AC: revert change made in __kmpc_serialized_parallel()
1742       //     because initial code in teams should have level=0
1743       team->t.t_level--;
1744       // AC: call special invoker for outer "parallel" of teams construct
1745       invoker(gtid);
1746 #if OMPT_SUPPORT
1747       if (ompt_enabled.enabled) {
1748         ompt_task_info_t *task_info = OMPT_CUR_TASK_INFO(master_th);
1749         if (ompt_enabled.ompt_callback_implicit_task) {
1750           ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1751               ompt_scope_end, NULL, &(task_info->task_data), 0,
1752               OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_initial);
1753         }
1754         if (ompt_enabled.ompt_callback_parallel_end) {
1755           ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1756               ompt_parallel_data, *parent_task_data,
1757               OMPT_INVOKER(call_context) | ompt_parallel_league,
1758               *return_address);
1759         }
1760         master_th->th.ompt_thread_info.state = ompt_state_overhead;
1761       }
1762 #endif
1763     } else {
1764       argv = args;
1765       for (i = argc - 1; i >= 0; --i)
1766         *argv++ = va_arg(kmp_va_deref(ap), void *);
1767       KMP_MB();
1768 
1769 #if OMPT_SUPPORT
1770       void *dummy;
1771       void **exit_frame_p;
1772       ompt_task_info_t *task_info;
1773       ompt_lw_taskteam_t lw_taskteam;
1774       ompt_data_t *implicit_task_data;
1775 
1776       if (ompt_enabled.enabled) {
1777         __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1778                                 ompt_parallel_data, *return_address);
1779         __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
1780         // don't use lw_taskteam after linking. content was swaped
1781         task_info = OMPT_CUR_TASK_INFO(master_th);
1782         exit_frame_p = &(task_info->frame.exit_frame.ptr);
1783 
1784         /* OMPT implicit task begin */
1785         implicit_task_data = OMPT_CUR_TASK_DATA(master_th);
1786         if (ompt_enabled.ompt_callback_implicit_task) {
1787           ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1788               ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1789               implicit_task_data, 1, __kmp_tid_from_gtid(gtid),
1790               ompt_task_implicit);
1791           OMPT_CUR_TASK_INFO(master_th)->thread_num = __kmp_tid_from_gtid(gtid);
1792         }
1793 
1794         /* OMPT state */
1795         master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1796       } else {
1797         exit_frame_p = &dummy;
1798       }
1799 #endif
1800 
1801       {
1802         KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1803         KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1804         __kmp_invoke_microtask(microtask, gtid, 0, argc, args
1805 #if OMPT_SUPPORT
1806                                ,
1807                                exit_frame_p
1808 #endif
1809                                );
1810       }
1811 
1812 #if OMPT_SUPPORT
1813       if (ompt_enabled.enabled) {
1814         *exit_frame_p = NULL;
1815         if (ompt_enabled.ompt_callback_implicit_task) {
1816           ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1817               ompt_scope_end, NULL, &(task_info->task_data), 1,
1818               OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
1819         }
1820 
1821         *ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
1822         __ompt_lw_taskteam_unlink(master_th);
1823         if (ompt_enabled.ompt_callback_parallel_end) {
1824           ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1825               ompt_parallel_data, *parent_task_data,
1826               OMPT_INVOKER(call_context) | ompt_parallel_team, *return_address);
1827         }
1828         master_th->th.ompt_thread_info.state = ompt_state_overhead;
1829       }
1830 #endif
1831     }
1832   } else if (call_context == fork_context_gnu) {
1833 #if OMPT_SUPPORT
1834     if (ompt_enabled.enabled) {
1835       ompt_lw_taskteam_t lwt;
1836       __ompt_lw_taskteam_init(&lwt, master_th, gtid, ompt_parallel_data,
1837                               *return_address);
1838 
1839       lwt.ompt_task_info.frame.exit_frame = ompt_data_none;
1840       __ompt_lw_taskteam_link(&lwt, master_th, 1);
1841     }
1842 // don't use lw_taskteam after linking. content was swaped
1843 #endif
1844 
1845     // we were called from GNU native code
1846     KA_TRACE(20, ("__kmp_serial_fork_call: T#%d serial exit\n", gtid));
1847     return FALSE;
1848   } else {
1849     KMP_ASSERT2(call_context < fork_context_last,
1850                 "__kmp_serial_fork_call: unknown fork_context parameter");
1851   }
1852 
1853   KA_TRACE(20, ("__kmp_serial_fork_call: T#%d serial exit\n", gtid));
1854   KMP_MB();
1855   return FALSE;
1856 }
1857 
1858 /* most of the work for a fork */
1859 /* return true if we really went parallel, false if serialized */
1860 int __kmp_fork_call(ident_t *loc, int gtid,
1861                     enum fork_context_e call_context, // Intel, GNU, ...
1862                     kmp_int32 argc, microtask_t microtask, launch_t invoker,
1863                     kmp_va_list ap) {
1864   void **argv;
1865   int i;
1866   int master_tid;
1867   int master_this_cons;
1868   kmp_team_t *team;
1869   kmp_team_t *parent_team;
1870   kmp_info_t *master_th;
1871   kmp_root_t *root;
1872   int nthreads;
1873   int master_active;
1874   int master_set_numthreads;
1875   int level;
1876   int active_level;
1877   int teams_level;
1878 #if KMP_NESTED_HOT_TEAMS
1879   kmp_hot_team_ptr_t **p_hot_teams;
1880 #endif
1881   { // KMP_TIME_BLOCK
1882     KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_fork_call);
1883     KMP_COUNT_VALUE(OMP_PARALLEL_args, argc);
1884 
1885     KA_TRACE(20, ("__kmp_fork_call: enter T#%d\n", gtid));
1886     if (__kmp_stkpadding > 0 && __kmp_root[gtid] != NULL) {
1887       /* Some systems prefer the stack for the root thread(s) to start with */
1888       /* some gap from the parent stack to prevent false sharing. */
1889       void *dummy = KMP_ALLOCA(__kmp_stkpadding);
1890       /* These 2 lines below are so this does not get optimized out */
1891       if (__kmp_stkpadding > KMP_MAX_STKPADDING)
1892         __kmp_stkpadding += (short)((kmp_int64)dummy);
1893     }
1894 
1895     /* initialize if needed */
1896     KMP_DEBUG_ASSERT(
1897         __kmp_init_serial); // AC: potentially unsafe, not in sync with shutdown
1898     if (!TCR_4(__kmp_init_parallel))
1899       __kmp_parallel_initialize();
1900     __kmp_resume_if_soft_paused();
1901 
1902     /* setup current data */
1903     // AC: potentially unsafe, not in sync with library shutdown,
1904     // __kmp_threads can be freed
1905     master_th = __kmp_threads[gtid];
1906 
1907     parent_team = master_th->th.th_team;
1908     master_tid = master_th->th.th_info.ds.ds_tid;
1909     master_this_cons = master_th->th.th_local.this_construct;
1910     root = master_th->th.th_root;
1911     master_active = root->r.r_active;
1912     master_set_numthreads = master_th->th.th_set_nproc;
1913 
1914 #if OMPT_SUPPORT
1915     ompt_data_t ompt_parallel_data = ompt_data_none;
1916     ompt_data_t *parent_task_data;
1917     ompt_frame_t *ompt_frame;
1918     void *return_address = NULL;
1919 
1920     if (ompt_enabled.enabled) {
1921       __ompt_get_task_info_internal(0, NULL, &parent_task_data, &ompt_frame,
1922                                     NULL, NULL);
1923       return_address = OMPT_LOAD_RETURN_ADDRESS(gtid);
1924     }
1925 #endif
1926 
1927     // Assign affinity to root thread if it hasn't happened yet
1928     __kmp_assign_root_init_mask();
1929 
1930     // Nested level will be an index in the nested nthreads array
1931     level = parent_team->t.t_level;
1932     // used to launch non-serial teams even if nested is not allowed
1933     active_level = parent_team->t.t_active_level;
1934     // needed to check nesting inside the teams
1935     teams_level = master_th->th.th_teams_level;
1936 #if KMP_NESTED_HOT_TEAMS
1937     p_hot_teams = &master_th->th.th_hot_teams;
1938     if (*p_hot_teams == NULL && __kmp_hot_teams_max_level > 0) {
1939       *p_hot_teams = (kmp_hot_team_ptr_t *)__kmp_allocate(
1940           sizeof(kmp_hot_team_ptr_t) * __kmp_hot_teams_max_level);
1941       (*p_hot_teams)[0].hot_team = root->r.r_hot_team;
1942       // it is either actual or not needed (when active_level > 0)
1943       (*p_hot_teams)[0].hot_team_nth = 1;
1944     }
1945 #endif
1946 
1947 #if OMPT_SUPPORT
1948     if (ompt_enabled.enabled) {
1949       if (ompt_enabled.ompt_callback_parallel_begin) {
1950         int team_size = master_set_numthreads
1951                             ? master_set_numthreads
1952                             : get__nproc_2(parent_team, master_tid);
1953         int flags = OMPT_INVOKER(call_context) |
1954                     ((microtask == (microtask_t)__kmp_teams_master)
1955                          ? ompt_parallel_league
1956                          : ompt_parallel_team);
1957         ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)(
1958             parent_task_data, ompt_frame, &ompt_parallel_data, team_size, flags,
1959             return_address);
1960       }
1961       master_th->th.ompt_thread_info.state = ompt_state_overhead;
1962     }
1963 #endif
1964 
1965     master_th->th.th_ident = loc;
1966 
1967     // Parallel closely nested in teams construct:
1968     if (__kmp_is_fork_in_teams(master_th, microtask, level, teams_level, ap)) {
1969       return __kmp_fork_in_teams(loc, gtid, parent_team, argc, master_th, root,
1970                                  call_context, microtask, invoker,
1971                                  master_set_numthreads, level,
1972 #if OMPT_SUPPORT
1973                                  ompt_parallel_data, return_address,
1974 #endif
1975                                  ap);
1976     } // End parallel closely nested in teams construct
1977 
1978 #if KMP_DEBUG
1979     if (__kmp_tasking_mode != tskm_immediate_exec) {
1980       KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
1981                        parent_team->t.t_task_team[master_th->th.th_task_state]);
1982     }
1983 #endif
1984 
1985     // Need this to happen before we determine the number of threads, not while
1986     // we are allocating the team
1987     //__kmp_push_current_task_to_thread(master_th, parent_team, 0);
1988 
1989     // Determine the number of threads
1990     int enter_teams =
1991         __kmp_is_entering_teams(active_level, level, teams_level, ap);
1992     if ((!enter_teams &&
1993          (parent_team->t.t_active_level >=
1994           master_th->th.th_current_task->td_icvs.max_active_levels)) ||
1995         (__kmp_library == library_serial)) {
1996       KC_TRACE(10, ("__kmp_fork_call: T#%d serializing team\n", gtid));
1997       nthreads = 1;
1998     } else {
1999       nthreads = master_set_numthreads
2000                      ? master_set_numthreads
2001                      // TODO: get nproc directly from current task
2002                      : get__nproc_2(parent_team, master_tid);
2003       // Check if we need to take forkjoin lock? (no need for serialized
2004       // parallel out of teams construct).
2005       if (nthreads > 1) {
2006         /* determine how many new threads we can use */
2007         __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
2008         /* AC: If we execute teams from parallel region (on host), then teams
2009            should be created but each can only have 1 thread if nesting is
2010            disabled. If teams called from serial region, then teams and their
2011            threads should be created regardless of the nesting setting. */
2012         nthreads = __kmp_reserve_threads(root, parent_team, master_tid,
2013                                          nthreads, enter_teams);
2014         if (nthreads == 1) {
2015           // Free lock for single thread execution here; for multi-thread
2016           // execution it will be freed later after team of threads created
2017           // and initialized
2018           __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2019         }
2020       }
2021     }
2022     KMP_DEBUG_ASSERT(nthreads > 0);
2023 
2024     // If we temporarily changed the set number of threads then restore it now
2025     master_th->th.th_set_nproc = 0;
2026 
2027     if (nthreads == 1) {
2028       return __kmp_serial_fork_call(loc, gtid, call_context, argc, microtask,
2029                                     invoker, master_th, parent_team,
2030 #if OMPT_SUPPORT
2031                                     &ompt_parallel_data, &return_address,
2032                                     &parent_task_data,
2033 #endif
2034                                     ap);
2035     } // if (nthreads == 1)
2036 
2037     // GEH: only modify the executing flag in the case when not serialized
2038     //      serialized case is handled in kmpc_serialized_parallel
2039     KF_TRACE(10, ("__kmp_fork_call: parent_team_aclevel=%d, master_th=%p, "
2040                   "curtask=%p, curtask_max_aclevel=%d\n",
2041                   parent_team->t.t_active_level, master_th,
2042                   master_th->th.th_current_task,
2043                   master_th->th.th_current_task->td_icvs.max_active_levels));
2044     // TODO: GEH - cannot do this assertion because root thread not set up as
2045     // executing
2046     // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 1 );
2047     master_th->th.th_current_task->td_flags.executing = 0;
2048 
2049     if (!master_th->th.th_teams_microtask || level > teams_level) {
2050       /* Increment our nested depth level */
2051       KMP_ATOMIC_INC(&root->r.r_in_parallel);
2052     }
2053 
2054     // See if we need to make a copy of the ICVs.
2055     int nthreads_icv = master_th->th.th_current_task->td_icvs.nproc;
2056     if ((level + 1 < __kmp_nested_nth.used) &&
2057         (__kmp_nested_nth.nth[level + 1] != nthreads_icv)) {
2058       nthreads_icv = __kmp_nested_nth.nth[level + 1];
2059     } else {
2060       nthreads_icv = 0; // don't update
2061     }
2062 
2063     // Figure out the proc_bind_policy for the new team.
2064     kmp_proc_bind_t proc_bind = master_th->th.th_set_proc_bind;
2065     // proc_bind_default means don't update
2066     kmp_proc_bind_t proc_bind_icv = proc_bind_default;
2067     if (master_th->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
2068       proc_bind = proc_bind_false;
2069     } else {
2070       // No proc_bind clause specified; use current proc-bind-var for this
2071       // parallel region
2072       if (proc_bind == proc_bind_default) {
2073         proc_bind = master_th->th.th_current_task->td_icvs.proc_bind;
2074       }
2075       // Have teams construct take proc_bind value from KMP_TEAMS_PROC_BIND
2076       if (master_th->th.th_teams_microtask &&
2077           microtask == (microtask_t)__kmp_teams_master) {
2078         proc_bind = __kmp_teams_proc_bind;
2079       }
2080       /* else: The proc_bind policy was specified explicitly on parallel clause.
2081          This overrides proc-bind-var for this parallel region, but does not
2082          change proc-bind-var. */
2083       // Figure the value of proc-bind-var for the child threads.
2084       if ((level + 1 < __kmp_nested_proc_bind.used) &&
2085           (__kmp_nested_proc_bind.bind_types[level + 1] !=
2086            master_th->th.th_current_task->td_icvs.proc_bind)) {
2087         // Do not modify the proc bind icv for the two teams construct forks
2088         // They just let the proc bind icv pass through
2089         if (!master_th->th.th_teams_microtask ||
2090             !(microtask == (microtask_t)__kmp_teams_master || ap == NULL))
2091           proc_bind_icv = __kmp_nested_proc_bind.bind_types[level + 1];
2092       }
2093     }
2094 
2095     // Reset for next parallel region
2096     master_th->th.th_set_proc_bind = proc_bind_default;
2097 
2098     if ((nthreads_icv > 0) || (proc_bind_icv != proc_bind_default)) {
2099       kmp_internal_control_t new_icvs;
2100       copy_icvs(&new_icvs, &master_th->th.th_current_task->td_icvs);
2101       new_icvs.next = NULL;
2102       if (nthreads_icv > 0) {
2103         new_icvs.nproc = nthreads_icv;
2104       }
2105       if (proc_bind_icv != proc_bind_default) {
2106         new_icvs.proc_bind = proc_bind_icv;
2107       }
2108 
2109       /* allocate a new parallel team */
2110       KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n"));
2111       team = __kmp_allocate_team(root, nthreads, nthreads,
2112 #if OMPT_SUPPORT
2113                                  ompt_parallel_data,
2114 #endif
2115                                  proc_bind, &new_icvs,
2116                                  argc USE_NESTED_HOT_ARG(master_th));
2117       if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar)
2118         copy_icvs((kmp_internal_control_t *)team->t.b->team_icvs, &new_icvs);
2119     } else {
2120       /* allocate a new parallel team */
2121       KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n"));
2122       team = __kmp_allocate_team(root, nthreads, nthreads,
2123 #if OMPT_SUPPORT
2124                                  ompt_parallel_data,
2125 #endif
2126                                  proc_bind,
2127                                  &master_th->th.th_current_task->td_icvs,
2128                                  argc USE_NESTED_HOT_ARG(master_th));
2129       if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar)
2130         copy_icvs((kmp_internal_control_t *)team->t.b->team_icvs,
2131                   &master_th->th.th_current_task->td_icvs);
2132     }
2133     KF_TRACE(
2134         10, ("__kmp_fork_call: after __kmp_allocate_team - team = %p\n", team));
2135 
2136     /* setup the new team */
2137     KMP_CHECK_UPDATE(team->t.t_master_tid, master_tid);
2138     KMP_CHECK_UPDATE(team->t.t_master_this_cons, master_this_cons);
2139     KMP_CHECK_UPDATE(team->t.t_ident, loc);
2140     KMP_CHECK_UPDATE(team->t.t_parent, parent_team);
2141     KMP_CHECK_UPDATE_SYNC(team->t.t_pkfn, microtask);
2142 #if OMPT_SUPPORT
2143     KMP_CHECK_UPDATE_SYNC(team->t.ompt_team_info.master_return_address,
2144                           return_address);
2145 #endif
2146     KMP_CHECK_UPDATE(team->t.t_invoke, invoker); // TODO move to root, maybe
2147     // TODO: parent_team->t.t_level == INT_MAX ???
2148     if (!master_th->th.th_teams_microtask || level > teams_level) {
2149       int new_level = parent_team->t.t_level + 1;
2150       KMP_CHECK_UPDATE(team->t.t_level, new_level);
2151       new_level = parent_team->t.t_active_level + 1;
2152       KMP_CHECK_UPDATE(team->t.t_active_level, new_level);
2153     } else {
2154       // AC: Do not increase parallel level at start of the teams construct
2155       int new_level = parent_team->t.t_level;
2156       KMP_CHECK_UPDATE(team->t.t_level, new_level);
2157       new_level = parent_team->t.t_active_level;
2158       KMP_CHECK_UPDATE(team->t.t_active_level, new_level);
2159     }
2160     kmp_r_sched_t new_sched = get__sched_2(parent_team, master_tid);
2161     // set primary thread's schedule as new run-time schedule
2162     KMP_CHECK_UPDATE(team->t.t_sched.sched, new_sched.sched);
2163 
2164     KMP_CHECK_UPDATE(team->t.t_cancel_request, cancel_noreq);
2165     KMP_CHECK_UPDATE(team->t.t_def_allocator, master_th->th.th_def_allocator);
2166 
2167     // Update the floating point rounding in the team if required.
2168     propagateFPControl(team);
2169 #if OMPD_SUPPORT
2170     if (ompd_state & OMPD_ENABLE_BP)
2171       ompd_bp_parallel_begin();
2172 #endif
2173 
2174     if (__kmp_tasking_mode != tskm_immediate_exec) {
2175       // Set primary thread's task team to team's task team. Unless this is hot
2176       // team, it should be NULL.
2177       KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
2178                        parent_team->t.t_task_team[master_th->th.th_task_state]);
2179       KA_TRACE(20, ("__kmp_fork_call: Primary T#%d pushing task_team %p / team "
2180                     "%p, new task_team %p / team %p\n",
2181                     __kmp_gtid_from_thread(master_th),
2182                     master_th->th.th_task_team, parent_team,
2183                     team->t.t_task_team[master_th->th.th_task_state], team));
2184 
2185       if (active_level || master_th->th.th_task_team) {
2186         // Take a memo of primary thread's task_state
2187         KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack);
2188         if (master_th->th.th_task_state_top >=
2189             master_th->th.th_task_state_stack_sz) { // increase size
2190           kmp_uint32 new_size = 2 * master_th->th.th_task_state_stack_sz;
2191           kmp_uint8 *old_stack, *new_stack;
2192           kmp_uint32 i;
2193           new_stack = (kmp_uint8 *)__kmp_allocate(new_size);
2194           for (i = 0; i < master_th->th.th_task_state_stack_sz; ++i) {
2195             new_stack[i] = master_th->th.th_task_state_memo_stack[i];
2196           }
2197           for (i = master_th->th.th_task_state_stack_sz; i < new_size;
2198                ++i) { // zero-init rest of stack
2199             new_stack[i] = 0;
2200           }
2201           old_stack = master_th->th.th_task_state_memo_stack;
2202           master_th->th.th_task_state_memo_stack = new_stack;
2203           master_th->th.th_task_state_stack_sz = new_size;
2204           __kmp_free(old_stack);
2205         }
2206         // Store primary thread's task_state on stack
2207         master_th->th
2208             .th_task_state_memo_stack[master_th->th.th_task_state_top] =
2209             master_th->th.th_task_state;
2210         master_th->th.th_task_state_top++;
2211 #if KMP_NESTED_HOT_TEAMS
2212         if (master_th->th.th_hot_teams &&
2213             active_level < __kmp_hot_teams_max_level &&
2214             team == master_th->th.th_hot_teams[active_level].hot_team) {
2215           // Restore primary thread's nested state if nested hot team
2216           master_th->th.th_task_state =
2217               master_th->th
2218                   .th_task_state_memo_stack[master_th->th.th_task_state_top];
2219         } else {
2220 #endif
2221           master_th->th.th_task_state = 0;
2222 #if KMP_NESTED_HOT_TEAMS
2223         }
2224 #endif
2225       }
2226 #if !KMP_NESTED_HOT_TEAMS
2227       KMP_DEBUG_ASSERT((master_th->th.th_task_team == NULL) ||
2228                        (team == root->r.r_hot_team));
2229 #endif
2230     }
2231 
2232     KA_TRACE(
2233         20,
2234         ("__kmp_fork_call: T#%d(%d:%d)->(%d:0) created a team of %d threads\n",
2235          gtid, parent_team->t.t_id, team->t.t_master_tid, team->t.t_id,
2236          team->t.t_nproc));
2237     KMP_DEBUG_ASSERT(team != root->r.r_hot_team ||
2238                      (team->t.t_master_tid == 0 &&
2239                       (team->t.t_parent == root->r.r_root_team ||
2240                        team->t.t_parent->t.t_serialized)));
2241     KMP_MB();
2242 
2243     /* now, setup the arguments */
2244     argv = (void **)team->t.t_argv;
2245     if (ap) {
2246       for (i = argc - 1; i >= 0; --i) {
2247         void *new_argv = va_arg(kmp_va_deref(ap), void *);
2248         KMP_CHECK_UPDATE(*argv, new_argv);
2249         argv++;
2250       }
2251     } else {
2252       for (i = 0; i < argc; ++i) {
2253         // Get args from parent team for teams construct
2254         KMP_CHECK_UPDATE(argv[i], team->t.t_parent->t.t_argv[i]);
2255       }
2256     }
2257 
2258     /* now actually fork the threads */
2259     KMP_CHECK_UPDATE(team->t.t_master_active, master_active);
2260     if (!root->r.r_active) // Only do assignment if it prevents cache ping-pong
2261       root->r.r_active = TRUE;
2262 
2263     __kmp_fork_team_threads(root, team, master_th, gtid, !ap);
2264     __kmp_setup_icv_copy(team, nthreads,
2265                          &master_th->th.th_current_task->td_icvs, loc);
2266 
2267 #if OMPT_SUPPORT
2268     master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
2269 #endif
2270 
2271     __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2272 
2273 #if USE_ITT_BUILD
2274     if (team->t.t_active_level == 1 // only report frames at level 1
2275         && !master_th->th.th_teams_microtask) { // not in teams construct
2276 #if USE_ITT_NOTIFY
2277       if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
2278           (__kmp_forkjoin_frames_mode == 3 ||
2279            __kmp_forkjoin_frames_mode == 1)) {
2280         kmp_uint64 tmp_time = 0;
2281         if (__itt_get_timestamp_ptr)
2282           tmp_time = __itt_get_timestamp();
2283         // Internal fork - report frame begin
2284         master_th->th.th_frame_time = tmp_time;
2285         if (__kmp_forkjoin_frames_mode == 3)
2286           team->t.t_region_time = tmp_time;
2287       } else
2288 // only one notification scheme (either "submit" or "forking/joined", not both)
2289 #endif /* USE_ITT_NOTIFY */
2290         if ((__itt_frame_begin_v3_ptr || KMP_ITT_DEBUG) &&
2291             __kmp_forkjoin_frames && !__kmp_forkjoin_frames_mode) {
2292           // Mark start of "parallel" region for Intel(R) VTune(TM) analyzer.
2293           __kmp_itt_region_forking(gtid, team->t.t_nproc, 0);
2294         }
2295     }
2296 #endif /* USE_ITT_BUILD */
2297 
2298     /* now go on and do the work */
2299     KMP_DEBUG_ASSERT(team == __kmp_threads[gtid]->th.th_team);
2300     KMP_MB();
2301     KF_TRACE(10,
2302              ("__kmp_internal_fork : root=%p, team=%p, master_th=%p, gtid=%d\n",
2303               root, team, master_th, gtid));
2304 
2305 #if USE_ITT_BUILD
2306     if (__itt_stack_caller_create_ptr) {
2307       // create new stack stitching id before entering fork barrier
2308       if (!enter_teams) {
2309         KMP_DEBUG_ASSERT(team->t.t_stack_id == NULL);
2310         team->t.t_stack_id = __kmp_itt_stack_caller_create();
2311       } else if (parent_team->t.t_serialized) {
2312         // keep stack stitching id in the serialized parent_team;
2313         // current team will be used for parallel inside the teams;
2314         // if parent_team is active, then it already keeps stack stitching id
2315         // for the league of teams
2316         KMP_DEBUG_ASSERT(parent_team->t.t_stack_id == NULL);
2317         parent_team->t.t_stack_id = __kmp_itt_stack_caller_create();
2318       }
2319     }
2320 #endif /* USE_ITT_BUILD */
2321 
2322     // AC: skip __kmp_internal_fork at teams construct, let only primary
2323     // threads execute
2324     if (ap) {
2325       __kmp_internal_fork(loc, gtid, team);
2326       KF_TRACE(10, ("__kmp_internal_fork : after : root=%p, team=%p, "
2327                     "master_th=%p, gtid=%d\n",
2328                     root, team, master_th, gtid));
2329     }
2330 
2331     if (call_context == fork_context_gnu) {
2332       KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
2333       return TRUE;
2334     }
2335 
2336     /* Invoke microtask for PRIMARY thread */
2337     KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n", gtid,
2338                   team->t.t_id, team->t.t_pkfn));
2339   } // END of timer KMP_fork_call block
2340 
2341 #if KMP_STATS_ENABLED
2342   // If beginning a teams construct, then change thread state
2343   stats_state_e previous_state = KMP_GET_THREAD_STATE();
2344   if (!ap) {
2345     KMP_SET_THREAD_STATE(stats_state_e::TEAMS_REGION);
2346   }
2347 #endif
2348 
2349   if (!team->t.t_invoke(gtid)) {
2350     KMP_ASSERT2(0, "cannot invoke microtask for PRIMARY thread");
2351   }
2352 
2353 #if KMP_STATS_ENABLED
2354   // If was beginning of a teams construct, then reset thread state
2355   if (!ap) {
2356     KMP_SET_THREAD_STATE(previous_state);
2357   }
2358 #endif
2359 
2360   KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n", gtid,
2361                 team->t.t_id, team->t.t_pkfn));
2362   KMP_MB(); /* Flush all pending memory write invalidates.  */
2363 
2364   KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
2365 #if OMPT_SUPPORT
2366   if (ompt_enabled.enabled) {
2367     master_th->th.ompt_thread_info.state = ompt_state_overhead;
2368   }
2369 #endif
2370 
2371   return TRUE;
2372 }
2373 
2374 #if OMPT_SUPPORT
2375 static inline void __kmp_join_restore_state(kmp_info_t *thread,
2376                                             kmp_team_t *team) {
2377   // restore state outside the region
2378   thread->th.ompt_thread_info.state =
2379       ((team->t.t_serialized) ? ompt_state_work_serial
2380                               : ompt_state_work_parallel);
2381 }
2382 
2383 static inline void __kmp_join_ompt(int gtid, kmp_info_t *thread,
2384                                    kmp_team_t *team, ompt_data_t *parallel_data,
2385                                    int flags, void *codeptr) {
2386   ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2387   if (ompt_enabled.ompt_callback_parallel_end) {
2388     ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
2389         parallel_data, &(task_info->task_data), flags, codeptr);
2390   }
2391 
2392   task_info->frame.enter_frame = ompt_data_none;
2393   __kmp_join_restore_state(thread, team);
2394 }
2395 #endif
2396 
2397 void __kmp_join_call(ident_t *loc, int gtid
2398 #if OMPT_SUPPORT
2399                      ,
2400                      enum fork_context_e fork_context
2401 #endif
2402                      ,
2403                      int exit_teams) {
2404   KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_join_call);
2405   kmp_team_t *team;
2406   kmp_team_t *parent_team;
2407   kmp_info_t *master_th;
2408   kmp_root_t *root;
2409   int master_active;
2410 
2411   KA_TRACE(20, ("__kmp_join_call: enter T#%d\n", gtid));
2412 
2413   /* setup current data */
2414   master_th = __kmp_threads[gtid];
2415   root = master_th->th.th_root;
2416   team = master_th->th.th_team;
2417   parent_team = team->t.t_parent;
2418 
2419   master_th->th.th_ident = loc;
2420 
2421 #if OMPT_SUPPORT
2422   void *team_microtask = (void *)team->t.t_pkfn;
2423   // For GOMP interface with serialized parallel, need the
2424   // __kmpc_end_serialized_parallel to call hooks for OMPT end-implicit-task
2425   // and end-parallel events.
2426   if (ompt_enabled.enabled &&
2427       !(team->t.t_serialized && fork_context == fork_context_gnu)) {
2428     master_th->th.ompt_thread_info.state = ompt_state_overhead;
2429   }
2430 #endif
2431 
2432 #if KMP_DEBUG
2433   if (__kmp_tasking_mode != tskm_immediate_exec && !exit_teams) {
2434     KA_TRACE(20, ("__kmp_join_call: T#%d, old team = %p old task_team = %p, "
2435                   "th_task_team = %p\n",
2436                   __kmp_gtid_from_thread(master_th), team,
2437                   team->t.t_task_team[master_th->th.th_task_state],
2438                   master_th->th.th_task_team));
2439     KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
2440                      team->t.t_task_team[master_th->th.th_task_state]);
2441   }
2442 #endif
2443 
2444   if (team->t.t_serialized) {
2445     if (master_th->th.th_teams_microtask) {
2446       // We are in teams construct
2447       int level = team->t.t_level;
2448       int tlevel = master_th->th.th_teams_level;
2449       if (level == tlevel) {
2450         // AC: we haven't incremented it earlier at start of teams construct,
2451         //     so do it here - at the end of teams construct
2452         team->t.t_level++;
2453       } else if (level == tlevel + 1) {
2454         // AC: we are exiting parallel inside teams, need to increment
2455         // serialization in order to restore it in the next call to
2456         // __kmpc_end_serialized_parallel
2457         team->t.t_serialized++;
2458       }
2459     }
2460     __kmpc_end_serialized_parallel(loc, gtid);
2461 
2462 #if OMPT_SUPPORT
2463     if (ompt_enabled.enabled) {
2464       if (fork_context == fork_context_gnu) {
2465         __ompt_lw_taskteam_unlink(master_th);
2466       }
2467       __kmp_join_restore_state(master_th, parent_team);
2468     }
2469 #endif
2470 
2471     return;
2472   }
2473 
2474   master_active = team->t.t_master_active;
2475 
2476   if (!exit_teams) {
2477     // AC: No barrier for internal teams at exit from teams construct.
2478     //     But there is barrier for external team (league).
2479     __kmp_internal_join(loc, gtid, team);
2480 #if USE_ITT_BUILD
2481     if (__itt_stack_caller_create_ptr) {
2482       KMP_DEBUG_ASSERT(team->t.t_stack_id != NULL);
2483       // destroy the stack stitching id after join barrier
2484       __kmp_itt_stack_caller_destroy((__itt_caller)team->t.t_stack_id);
2485       team->t.t_stack_id = NULL;
2486     }
2487 #endif
2488   } else {
2489     master_th->th.th_task_state =
2490         0; // AC: no tasking in teams (out of any parallel)
2491 #if USE_ITT_BUILD
2492     if (__itt_stack_caller_create_ptr && parent_team->t.t_serialized) {
2493       KMP_DEBUG_ASSERT(parent_team->t.t_stack_id != NULL);
2494       // destroy the stack stitching id on exit from the teams construct
2495       // if parent_team is active, then the id will be destroyed later on
2496       // by master of the league of teams
2497       __kmp_itt_stack_caller_destroy((__itt_caller)parent_team->t.t_stack_id);
2498       parent_team->t.t_stack_id = NULL;
2499     }
2500 #endif
2501   }
2502 
2503   KMP_MB();
2504 
2505 #if OMPT_SUPPORT
2506   ompt_data_t *parallel_data = &(team->t.ompt_team_info.parallel_data);
2507   void *codeptr = team->t.ompt_team_info.master_return_address;
2508 #endif
2509 
2510 #if USE_ITT_BUILD
2511   // Mark end of "parallel" region for Intel(R) VTune(TM) analyzer.
2512   if (team->t.t_active_level == 1 &&
2513       (!master_th->th.th_teams_microtask || /* not in teams construct */
2514        master_th->th.th_teams_size.nteams == 1)) {
2515     master_th->th.th_ident = loc;
2516     // only one notification scheme (either "submit" or "forking/joined", not
2517     // both)
2518     if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
2519         __kmp_forkjoin_frames_mode == 3)
2520       __kmp_itt_frame_submit(gtid, team->t.t_region_time,
2521                              master_th->th.th_frame_time, 0, loc,
2522                              master_th->th.th_team_nproc, 1);
2523     else if ((__itt_frame_end_v3_ptr || KMP_ITT_DEBUG) &&
2524              !__kmp_forkjoin_frames_mode && __kmp_forkjoin_frames)
2525       __kmp_itt_region_joined(gtid);
2526   } // active_level == 1
2527 #endif /* USE_ITT_BUILD */
2528 
2529 #if KMP_AFFINITY_SUPPORTED
2530   if (!exit_teams) {
2531     // Restore master thread's partition.
2532     master_th->th.th_first_place = team->t.t_first_place;
2533     master_th->th.th_last_place = team->t.t_last_place;
2534   }
2535 #endif // KMP_AFFINITY_SUPPORTED
2536 
2537   if (master_th->th.th_teams_microtask && !exit_teams &&
2538       team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
2539       team->t.t_level == master_th->th.th_teams_level + 1) {
2540 // AC: We need to leave the team structure intact at the end of parallel
2541 // inside the teams construct, so that at the next parallel same (hot) team
2542 // works, only adjust nesting levels
2543 #if OMPT_SUPPORT
2544     ompt_data_t ompt_parallel_data = ompt_data_none;
2545     if (ompt_enabled.enabled) {
2546       ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2547       if (ompt_enabled.ompt_callback_implicit_task) {
2548         int ompt_team_size = team->t.t_nproc;
2549         ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
2550             ompt_scope_end, NULL, &(task_info->task_data), ompt_team_size,
2551             OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
2552       }
2553       task_info->frame.exit_frame = ompt_data_none;
2554       task_info->task_data = ompt_data_none;
2555       ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
2556       __ompt_lw_taskteam_unlink(master_th);
2557     }
2558 #endif
2559     /* Decrement our nested depth level */
2560     team->t.t_level--;
2561     team->t.t_active_level--;
2562     KMP_ATOMIC_DEC(&root->r.r_in_parallel);
2563 
2564     // Restore number of threads in the team if needed. This code relies on
2565     // the proper adjustment of th_teams_size.nth after the fork in
2566     // __kmp_teams_master on each teams primary thread in the case that
2567     // __kmp_reserve_threads reduced it.
2568     if (master_th->th.th_team_nproc < master_th->th.th_teams_size.nth) {
2569       int old_num = master_th->th.th_team_nproc;
2570       int new_num = master_th->th.th_teams_size.nth;
2571       kmp_info_t **other_threads = team->t.t_threads;
2572       team->t.t_nproc = new_num;
2573       for (int i = 0; i < old_num; ++i) {
2574         other_threads[i]->th.th_team_nproc = new_num;
2575       }
2576       // Adjust states of non-used threads of the team
2577       for (int i = old_num; i < new_num; ++i) {
2578         // Re-initialize thread's barrier data.
2579         KMP_DEBUG_ASSERT(other_threads[i]);
2580         kmp_balign_t *balign = other_threads[i]->th.th_bar;
2581         for (int b = 0; b < bs_last_barrier; ++b) {
2582           balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
2583           KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
2584 #if USE_DEBUGGER
2585           balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
2586 #endif
2587         }
2588         if (__kmp_tasking_mode != tskm_immediate_exec) {
2589           // Synchronize thread's task state
2590           other_threads[i]->th.th_task_state = master_th->th.th_task_state;
2591         }
2592       }
2593     }
2594 
2595 #if OMPT_SUPPORT
2596     if (ompt_enabled.enabled) {
2597       __kmp_join_ompt(gtid, master_th, parent_team, &ompt_parallel_data,
2598                       OMPT_INVOKER(fork_context) | ompt_parallel_team, codeptr);
2599     }
2600 #endif
2601 
2602     return;
2603   }
2604 
2605   /* do cleanup and restore the parent team */
2606   master_th->th.th_info.ds.ds_tid = team->t.t_master_tid;
2607   master_th->th.th_local.this_construct = team->t.t_master_this_cons;
2608 
2609   master_th->th.th_dispatch = &parent_team->t.t_dispatch[team->t.t_master_tid];
2610 
2611   /* jc: The following lock has instructions with REL and ACQ semantics,
2612      separating the parallel user code called in this parallel region
2613      from the serial user code called after this function returns. */
2614   __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
2615 
2616   if (!master_th->th.th_teams_microtask ||
2617       team->t.t_level > master_th->th.th_teams_level) {
2618     /* Decrement our nested depth level */
2619     KMP_ATOMIC_DEC(&root->r.r_in_parallel);
2620   }
2621   KMP_DEBUG_ASSERT(root->r.r_in_parallel >= 0);
2622 
2623 #if OMPT_SUPPORT
2624   if (ompt_enabled.enabled) {
2625     ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2626     if (ompt_enabled.ompt_callback_implicit_task) {
2627       int flags = (team_microtask == (void *)__kmp_teams_master)
2628                       ? ompt_task_initial
2629                       : ompt_task_implicit;
2630       int ompt_team_size = (flags == ompt_task_initial) ? 0 : team->t.t_nproc;
2631       ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
2632           ompt_scope_end, NULL, &(task_info->task_data), ompt_team_size,
2633           OMPT_CUR_TASK_INFO(master_th)->thread_num, flags);
2634     }
2635     task_info->frame.exit_frame = ompt_data_none;
2636     task_info->task_data = ompt_data_none;
2637   }
2638 #endif
2639 
2640   KF_TRACE(10, ("__kmp_join_call1: T#%d, this_thread=%p team=%p\n", 0,
2641                 master_th, team));
2642   __kmp_pop_current_task_from_thread(master_th);
2643 
2644   master_th->th.th_def_allocator = team->t.t_def_allocator;
2645 
2646 #if OMPD_SUPPORT
2647   if (ompd_state & OMPD_ENABLE_BP)
2648     ompd_bp_parallel_end();
2649 #endif
2650   updateHWFPControl(team);
2651 
2652   if (root->r.r_active != master_active)
2653     root->r.r_active = master_active;
2654 
2655   __kmp_free_team(root, team USE_NESTED_HOT_ARG(
2656                             master_th)); // this will free worker threads
2657 
2658   /* this race was fun to find. make sure the following is in the critical
2659      region otherwise assertions may fail occasionally since the old team may be
2660      reallocated and the hierarchy appears inconsistent. it is actually safe to
2661      run and won't cause any bugs, but will cause those assertion failures. it's
2662      only one deref&assign so might as well put this in the critical region */
2663   master_th->th.th_team = parent_team;
2664   master_th->th.th_team_nproc = parent_team->t.t_nproc;
2665   master_th->th.th_team_master = parent_team->t.t_threads[0];
2666   master_th->th.th_team_serialized = parent_team->t.t_serialized;
2667 
2668   /* restore serialized team, if need be */
2669   if (parent_team->t.t_serialized &&
2670       parent_team != master_th->th.th_serial_team &&
2671       parent_team != root->r.r_root_team) {
2672     __kmp_free_team(root,
2673                     master_th->th.th_serial_team USE_NESTED_HOT_ARG(NULL));
2674     master_th->th.th_serial_team = parent_team;
2675   }
2676 
2677   if (__kmp_tasking_mode != tskm_immediate_exec) {
2678     if (master_th->th.th_task_state_top >
2679         0) { // Restore task state from memo stack
2680       KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack);
2681       // Remember primary thread's state if we re-use this nested hot team
2682       master_th->th.th_task_state_memo_stack[master_th->th.th_task_state_top] =
2683           master_th->th.th_task_state;
2684       --master_th->th.th_task_state_top; // pop
2685       // Now restore state at this level
2686       master_th->th.th_task_state =
2687           master_th->th
2688               .th_task_state_memo_stack[master_th->th.th_task_state_top];
2689     } else if (team != root->r.r_hot_team) {
2690       // Reset the task state of primary thread if we are not hot team because
2691       // in this case all the worker threads will be free, and their task state
2692       // will be reset. If not reset the primary's, the task state will be
2693       // inconsistent.
2694       master_th->th.th_task_state = 0;
2695     }
2696     // Copy the task team from the parent team to the primary thread
2697     master_th->th.th_task_team =
2698         parent_team->t.t_task_team[master_th->th.th_task_state];
2699     KA_TRACE(20,
2700              ("__kmp_join_call: Primary T#%d restoring task_team %p, team %p\n",
2701               __kmp_gtid_from_thread(master_th), master_th->th.th_task_team,
2702               parent_team));
2703   }
2704 
2705   // TODO: GEH - cannot do this assertion because root thread not set up as
2706   // executing
2707   // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 0 );
2708   master_th->th.th_current_task->td_flags.executing = 1;
2709 
2710   __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2711 
2712 #if KMP_AFFINITY_SUPPORTED
2713   if (master_th->th.th_team->t.t_level == 0 && __kmp_affinity.flags.reset) {
2714     __kmp_reset_root_init_mask(gtid);
2715   }
2716 #endif
2717 #if OMPT_SUPPORT
2718   int flags =
2719       OMPT_INVOKER(fork_context) |
2720       ((team_microtask == (void *)__kmp_teams_master) ? ompt_parallel_league
2721                                                       : ompt_parallel_team);
2722   if (ompt_enabled.enabled) {
2723     __kmp_join_ompt(gtid, master_th, parent_team, parallel_data, flags,
2724                     codeptr);
2725   }
2726 #endif
2727 
2728   KMP_MB();
2729   KA_TRACE(20, ("__kmp_join_call: exit T#%d\n", gtid));
2730 }
2731 
2732 /* Check whether we should push an internal control record onto the
2733    serial team stack.  If so, do it.  */
2734 void __kmp_save_internal_controls(kmp_info_t *thread) {
2735 
2736   if (thread->th.th_team != thread->th.th_serial_team) {
2737     return;
2738   }
2739   if (thread->th.th_team->t.t_serialized > 1) {
2740     int push = 0;
2741 
2742     if (thread->th.th_team->t.t_control_stack_top == NULL) {
2743       push = 1;
2744     } else {
2745       if (thread->th.th_team->t.t_control_stack_top->serial_nesting_level !=
2746           thread->th.th_team->t.t_serialized) {
2747         push = 1;
2748       }
2749     }
2750     if (push) { /* push a record on the serial team's stack */
2751       kmp_internal_control_t *control =
2752           (kmp_internal_control_t *)__kmp_allocate(
2753               sizeof(kmp_internal_control_t));
2754 
2755       copy_icvs(control, &thread->th.th_current_task->td_icvs);
2756 
2757       control->serial_nesting_level = thread->th.th_team->t.t_serialized;
2758 
2759       control->next = thread->th.th_team->t.t_control_stack_top;
2760       thread->th.th_team->t.t_control_stack_top = control;
2761     }
2762   }
2763 }
2764 
2765 /* Changes set_nproc */
2766 void __kmp_set_num_threads(int new_nth, int gtid) {
2767   kmp_info_t *thread;
2768   kmp_root_t *root;
2769 
2770   KF_TRACE(10, ("__kmp_set_num_threads: new __kmp_nth = %d\n", new_nth));
2771   KMP_DEBUG_ASSERT(__kmp_init_serial);
2772 
2773   if (new_nth < 1)
2774     new_nth = 1;
2775   else if (new_nth > __kmp_max_nth)
2776     new_nth = __kmp_max_nth;
2777 
2778   KMP_COUNT_VALUE(OMP_set_numthreads, new_nth);
2779   thread = __kmp_threads[gtid];
2780   if (thread->th.th_current_task->td_icvs.nproc == new_nth)
2781     return; // nothing to do
2782 
2783   __kmp_save_internal_controls(thread);
2784 
2785   set__nproc(thread, new_nth);
2786 
2787   // If this omp_set_num_threads() call will cause the hot team size to be
2788   // reduced (in the absence of a num_threads clause), then reduce it now,
2789   // rather than waiting for the next parallel region.
2790   root = thread->th.th_root;
2791   if (__kmp_init_parallel && (!root->r.r_active) &&
2792       (root->r.r_hot_team->t.t_nproc > new_nth)
2793 #if KMP_NESTED_HOT_TEAMS
2794       && __kmp_hot_teams_max_level && !__kmp_hot_teams_mode
2795 #endif
2796   ) {
2797     kmp_team_t *hot_team = root->r.r_hot_team;
2798     int f;
2799 
2800     __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
2801 
2802     if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
2803       __kmp_resize_dist_barrier(hot_team, hot_team->t.t_nproc, new_nth);
2804     }
2805     // Release the extra threads we don't need any more.
2806     for (f = new_nth; f < hot_team->t.t_nproc; f++) {
2807       KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL);
2808       if (__kmp_tasking_mode != tskm_immediate_exec) {
2809         // When decreasing team size, threads no longer in the team should unref
2810         // task team.
2811         hot_team->t.t_threads[f]->th.th_task_team = NULL;
2812       }
2813       __kmp_free_thread(hot_team->t.t_threads[f]);
2814       hot_team->t.t_threads[f] = NULL;
2815     }
2816     hot_team->t.t_nproc = new_nth;
2817 #if KMP_NESTED_HOT_TEAMS
2818     if (thread->th.th_hot_teams) {
2819       KMP_DEBUG_ASSERT(hot_team == thread->th.th_hot_teams[0].hot_team);
2820       thread->th.th_hot_teams[0].hot_team_nth = new_nth;
2821     }
2822 #endif
2823 
2824     if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
2825       hot_team->t.b->update_num_threads(new_nth);
2826       __kmp_add_threads_to_team(hot_team, new_nth);
2827     }
2828 
2829     __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2830 
2831     // Update the t_nproc field in the threads that are still active.
2832     for (f = 0; f < new_nth; f++) {
2833       KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL);
2834       hot_team->t.t_threads[f]->th.th_team_nproc = new_nth;
2835     }
2836     // Special flag in case omp_set_num_threads() call
2837     hot_team->t.t_size_changed = -1;
2838   }
2839 }
2840 
2841 /* Changes max_active_levels */
2842 void __kmp_set_max_active_levels(int gtid, int max_active_levels) {
2843   kmp_info_t *thread;
2844 
2845   KF_TRACE(10, ("__kmp_set_max_active_levels: new max_active_levels for thread "
2846                 "%d = (%d)\n",
2847                 gtid, max_active_levels));
2848   KMP_DEBUG_ASSERT(__kmp_init_serial);
2849 
2850   // validate max_active_levels
2851   if (max_active_levels < 0) {
2852     KMP_WARNING(ActiveLevelsNegative, max_active_levels);
2853     // We ignore this call if the user has specified a negative value.
2854     // The current setting won't be changed. The last valid setting will be
2855     // used. A warning will be issued (if warnings are allowed as controlled by
2856     // the KMP_WARNINGS env var).
2857     KF_TRACE(10, ("__kmp_set_max_active_levels: the call is ignored: new "
2858                   "max_active_levels for thread %d = (%d)\n",
2859                   gtid, max_active_levels));
2860     return;
2861   }
2862   if (max_active_levels <= KMP_MAX_ACTIVE_LEVELS_LIMIT) {
2863     // it's OK, the max_active_levels is within the valid range: [ 0;
2864     // KMP_MAX_ACTIVE_LEVELS_LIMIT ]
2865     // We allow a zero value. (implementation defined behavior)
2866   } else {
2867     KMP_WARNING(ActiveLevelsExceedLimit, max_active_levels,
2868                 KMP_MAX_ACTIVE_LEVELS_LIMIT);
2869     max_active_levels = KMP_MAX_ACTIVE_LEVELS_LIMIT;
2870     // Current upper limit is MAX_INT. (implementation defined behavior)
2871     // If the input exceeds the upper limit, we correct the input to be the
2872     // upper limit. (implementation defined behavior)
2873     // Actually, the flow should never get here until we use MAX_INT limit.
2874   }
2875   KF_TRACE(10, ("__kmp_set_max_active_levels: after validation: new "
2876                 "max_active_levels for thread %d = (%d)\n",
2877                 gtid, max_active_levels));
2878 
2879   thread = __kmp_threads[gtid];
2880 
2881   __kmp_save_internal_controls(thread);
2882 
2883   set__max_active_levels(thread, max_active_levels);
2884 }
2885 
2886 /* Gets max_active_levels */
2887 int __kmp_get_max_active_levels(int gtid) {
2888   kmp_info_t *thread;
2889 
2890   KF_TRACE(10, ("__kmp_get_max_active_levels: thread %d\n", gtid));
2891   KMP_DEBUG_ASSERT(__kmp_init_serial);
2892 
2893   thread = __kmp_threads[gtid];
2894   KMP_DEBUG_ASSERT(thread->th.th_current_task);
2895   KF_TRACE(10, ("__kmp_get_max_active_levels: thread %d, curtask=%p, "
2896                 "curtask_maxaclevel=%d\n",
2897                 gtid, thread->th.th_current_task,
2898                 thread->th.th_current_task->td_icvs.max_active_levels));
2899   return thread->th.th_current_task->td_icvs.max_active_levels;
2900 }
2901 
2902 // nteams-var per-device ICV
2903 void __kmp_set_num_teams(int num_teams) {
2904   if (num_teams > 0)
2905     __kmp_nteams = num_teams;
2906 }
2907 int __kmp_get_max_teams(void) { return __kmp_nteams; }
2908 // teams-thread-limit-var per-device ICV
2909 void __kmp_set_teams_thread_limit(int limit) {
2910   if (limit > 0)
2911     __kmp_teams_thread_limit = limit;
2912 }
2913 int __kmp_get_teams_thread_limit(void) { return __kmp_teams_thread_limit; }
2914 
2915 KMP_BUILD_ASSERT(sizeof(kmp_sched_t) == sizeof(int));
2916 KMP_BUILD_ASSERT(sizeof(enum sched_type) == sizeof(int));
2917 
2918 /* Changes def_sched_var ICV values (run-time schedule kind and chunk) */
2919 void __kmp_set_schedule(int gtid, kmp_sched_t kind, int chunk) {
2920   kmp_info_t *thread;
2921   kmp_sched_t orig_kind;
2922   //    kmp_team_t *team;
2923 
2924   KF_TRACE(10, ("__kmp_set_schedule: new schedule for thread %d = (%d, %d)\n",
2925                 gtid, (int)kind, chunk));
2926   KMP_DEBUG_ASSERT(__kmp_init_serial);
2927 
2928   // Check if the kind parameter is valid, correct if needed.
2929   // Valid parameters should fit in one of two intervals - standard or extended:
2930   //       <lower>, <valid>, <upper_std>, <lower_ext>, <valid>, <upper>
2931   // 2008-01-25: 0,  1 - 4,       5,         100,     101 - 102, 103
2932   orig_kind = kind;
2933   kind = __kmp_sched_without_mods(kind);
2934 
2935   if (kind <= kmp_sched_lower || kind >= kmp_sched_upper ||
2936       (kind <= kmp_sched_lower_ext && kind >= kmp_sched_upper_std)) {
2937     // TODO: Hint needs attention in case we change the default schedule.
2938     __kmp_msg(kmp_ms_warning, KMP_MSG(ScheduleKindOutOfRange, kind),
2939               KMP_HNT(DefaultScheduleKindUsed, "static, no chunk"),
2940               __kmp_msg_null);
2941     kind = kmp_sched_default;
2942     chunk = 0; // ignore chunk value in case of bad kind
2943   }
2944 
2945   thread = __kmp_threads[gtid];
2946 
2947   __kmp_save_internal_controls(thread);
2948 
2949   if (kind < kmp_sched_upper_std) {
2950     if (kind == kmp_sched_static && chunk < KMP_DEFAULT_CHUNK) {
2951       // differ static chunked vs. unchunked:  chunk should be invalid to
2952       // indicate unchunked schedule (which is the default)
2953       thread->th.th_current_task->td_icvs.sched.r_sched_type = kmp_sch_static;
2954     } else {
2955       thread->th.th_current_task->td_icvs.sched.r_sched_type =
2956           __kmp_sch_map[kind - kmp_sched_lower - 1];
2957     }
2958   } else {
2959     //    __kmp_sch_map[ kind - kmp_sched_lower_ext + kmp_sched_upper_std -
2960     //    kmp_sched_lower - 2 ];
2961     thread->th.th_current_task->td_icvs.sched.r_sched_type =
2962         __kmp_sch_map[kind - kmp_sched_lower_ext + kmp_sched_upper_std -
2963                       kmp_sched_lower - 2];
2964   }
2965   __kmp_sched_apply_mods_intkind(
2966       orig_kind, &(thread->th.th_current_task->td_icvs.sched.r_sched_type));
2967   if (kind == kmp_sched_auto || chunk < 1) {
2968     // ignore parameter chunk for schedule auto
2969     thread->th.th_current_task->td_icvs.sched.chunk = KMP_DEFAULT_CHUNK;
2970   } else {
2971     thread->th.th_current_task->td_icvs.sched.chunk = chunk;
2972   }
2973 }
2974 
2975 /* Gets def_sched_var ICV values */
2976 void __kmp_get_schedule(int gtid, kmp_sched_t *kind, int *chunk) {
2977   kmp_info_t *thread;
2978   enum sched_type th_type;
2979 
2980   KF_TRACE(10, ("__kmp_get_schedule: thread %d\n", gtid));
2981   KMP_DEBUG_ASSERT(__kmp_init_serial);
2982 
2983   thread = __kmp_threads[gtid];
2984 
2985   th_type = thread->th.th_current_task->td_icvs.sched.r_sched_type;
2986   switch (SCHEDULE_WITHOUT_MODIFIERS(th_type)) {
2987   case kmp_sch_static:
2988   case kmp_sch_static_greedy:
2989   case kmp_sch_static_balanced:
2990     *kind = kmp_sched_static;
2991     __kmp_sched_apply_mods_stdkind(kind, th_type);
2992     *chunk = 0; // chunk was not set, try to show this fact via zero value
2993     return;
2994   case kmp_sch_static_chunked:
2995     *kind = kmp_sched_static;
2996     break;
2997   case kmp_sch_dynamic_chunked:
2998     *kind = kmp_sched_dynamic;
2999     break;
3000   case kmp_sch_guided_chunked:
3001   case kmp_sch_guided_iterative_chunked:
3002   case kmp_sch_guided_analytical_chunked:
3003     *kind = kmp_sched_guided;
3004     break;
3005   case kmp_sch_auto:
3006     *kind = kmp_sched_auto;
3007     break;
3008   case kmp_sch_trapezoidal:
3009     *kind = kmp_sched_trapezoidal;
3010     break;
3011 #if KMP_STATIC_STEAL_ENABLED
3012   case kmp_sch_static_steal:
3013     *kind = kmp_sched_static_steal;
3014     break;
3015 #endif
3016   default:
3017     KMP_FATAL(UnknownSchedulingType, th_type);
3018   }
3019 
3020   __kmp_sched_apply_mods_stdkind(kind, th_type);
3021   *chunk = thread->th.th_current_task->td_icvs.sched.chunk;
3022 }
3023 
3024 int __kmp_get_ancestor_thread_num(int gtid, int level) {
3025 
3026   int ii, dd;
3027   kmp_team_t *team;
3028   kmp_info_t *thr;
3029 
3030   KF_TRACE(10, ("__kmp_get_ancestor_thread_num: thread %d %d\n", gtid, level));
3031   KMP_DEBUG_ASSERT(__kmp_init_serial);
3032 
3033   // validate level
3034   if (level == 0)
3035     return 0;
3036   if (level < 0)
3037     return -1;
3038   thr = __kmp_threads[gtid];
3039   team = thr->th.th_team;
3040   ii = team->t.t_level;
3041   if (level > ii)
3042     return -1;
3043 
3044   if (thr->th.th_teams_microtask) {
3045     // AC: we are in teams region where multiple nested teams have same level
3046     int tlevel = thr->th.th_teams_level; // the level of the teams construct
3047     if (level <=
3048         tlevel) { // otherwise usual algorithm works (will not touch the teams)
3049       KMP_DEBUG_ASSERT(ii >= tlevel);
3050       // AC: As we need to pass by the teams league, we need to artificially
3051       // increase ii
3052       if (ii == tlevel) {
3053         ii += 2; // three teams have same level
3054       } else {
3055         ii++; // two teams have same level
3056       }
3057     }
3058   }
3059 
3060   if (ii == level)
3061     return __kmp_tid_from_gtid(gtid);
3062 
3063   dd = team->t.t_serialized;
3064   level++;
3065   while (ii > level) {
3066     for (dd = team->t.t_serialized; (dd > 0) && (ii > level); dd--, ii--) {
3067     }
3068     if ((team->t.t_serialized) && (!dd)) {
3069       team = team->t.t_parent;
3070       continue;
3071     }
3072     if (ii > level) {
3073       team = team->t.t_parent;
3074       dd = team->t.t_serialized;
3075       ii--;
3076     }
3077   }
3078 
3079   return (dd > 1) ? (0) : (team->t.t_master_tid);
3080 }
3081 
3082 int __kmp_get_team_size(int gtid, int level) {
3083 
3084   int ii, dd;
3085   kmp_team_t *team;
3086   kmp_info_t *thr;
3087 
3088   KF_TRACE(10, ("__kmp_get_team_size: thread %d %d\n", gtid, level));
3089   KMP_DEBUG_ASSERT(__kmp_init_serial);
3090 
3091   // validate level
3092   if (level == 0)
3093     return 1;
3094   if (level < 0)
3095     return -1;
3096   thr = __kmp_threads[gtid];
3097   team = thr->th.th_team;
3098   ii = team->t.t_level;
3099   if (level > ii)
3100     return -1;
3101 
3102   if (thr->th.th_teams_microtask) {
3103     // AC: we are in teams region where multiple nested teams have same level
3104     int tlevel = thr->th.th_teams_level; // the level of the teams construct
3105     if (level <=
3106         tlevel) { // otherwise usual algorithm works (will not touch the teams)
3107       KMP_DEBUG_ASSERT(ii >= tlevel);
3108       // AC: As we need to pass by the teams league, we need to artificially
3109       // increase ii
3110       if (ii == tlevel) {
3111         ii += 2; // three teams have same level
3112       } else {
3113         ii++; // two teams have same level
3114       }
3115     }
3116   }
3117 
3118   while (ii > level) {
3119     for (dd = team->t.t_serialized; (dd > 0) && (ii > level); dd--, ii--) {
3120     }
3121     if (team->t.t_serialized && (!dd)) {
3122       team = team->t.t_parent;
3123       continue;
3124     }
3125     if (ii > level) {
3126       team = team->t.t_parent;
3127       ii--;
3128     }
3129   }
3130 
3131   return team->t.t_nproc;
3132 }
3133 
3134 kmp_r_sched_t __kmp_get_schedule_global() {
3135   // This routine created because pairs (__kmp_sched, __kmp_chunk) and
3136   // (__kmp_static, __kmp_guided) may be changed by kmp_set_defaults
3137   // independently. So one can get the updated schedule here.
3138 
3139   kmp_r_sched_t r_sched;
3140 
3141   // create schedule from 4 globals: __kmp_sched, __kmp_chunk, __kmp_static,
3142   // __kmp_guided. __kmp_sched should keep original value, so that user can set
3143   // KMP_SCHEDULE multiple times, and thus have different run-time schedules in
3144   // different roots (even in OMP 2.5)
3145   enum sched_type s = SCHEDULE_WITHOUT_MODIFIERS(__kmp_sched);
3146   enum sched_type sched_modifiers = SCHEDULE_GET_MODIFIERS(__kmp_sched);
3147   if (s == kmp_sch_static) {
3148     // replace STATIC with more detailed schedule (balanced or greedy)
3149     r_sched.r_sched_type = __kmp_static;
3150   } else if (s == kmp_sch_guided_chunked) {
3151     // replace GUIDED with more detailed schedule (iterative or analytical)
3152     r_sched.r_sched_type = __kmp_guided;
3153   } else { // (STATIC_CHUNKED), or (DYNAMIC_CHUNKED), or other
3154     r_sched.r_sched_type = __kmp_sched;
3155   }
3156   SCHEDULE_SET_MODIFIERS(r_sched.r_sched_type, sched_modifiers);
3157 
3158   if (__kmp_chunk < KMP_DEFAULT_CHUNK) {
3159     // __kmp_chunk may be wrong here (if it was not ever set)
3160     r_sched.chunk = KMP_DEFAULT_CHUNK;
3161   } else {
3162     r_sched.chunk = __kmp_chunk;
3163   }
3164 
3165   return r_sched;
3166 }
3167 
3168 /* Allocate (realloc == FALSE) * or reallocate (realloc == TRUE)
3169    at least argc number of *t_argv entries for the requested team. */
3170 static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team, int realloc) {
3171 
3172   KMP_DEBUG_ASSERT(team);
3173   if (!realloc || argc > team->t.t_max_argc) {
3174 
3175     KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: needed entries=%d, "
3176                    "current entries=%d\n",
3177                    team->t.t_id, argc, (realloc) ? team->t.t_max_argc : 0));
3178     /* if previously allocated heap space for args, free them */
3179     if (realloc && team->t.t_argv != &team->t.t_inline_argv[0])
3180       __kmp_free((void *)team->t.t_argv);
3181 
3182     if (argc <= KMP_INLINE_ARGV_ENTRIES) {
3183       /* use unused space in the cache line for arguments */
3184       team->t.t_max_argc = KMP_INLINE_ARGV_ENTRIES;
3185       KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: inline allocate %d "
3186                      "argv entries\n",
3187                      team->t.t_id, team->t.t_max_argc));
3188       team->t.t_argv = &team->t.t_inline_argv[0];
3189       if (__kmp_storage_map) {
3190         __kmp_print_storage_map_gtid(
3191             -1, &team->t.t_inline_argv[0],
3192             &team->t.t_inline_argv[KMP_INLINE_ARGV_ENTRIES],
3193             (sizeof(void *) * KMP_INLINE_ARGV_ENTRIES), "team_%d.t_inline_argv",
3194             team->t.t_id);
3195       }
3196     } else {
3197       /* allocate space for arguments in the heap */
3198       team->t.t_max_argc = (argc <= (KMP_MIN_MALLOC_ARGV_ENTRIES >> 1))
3199                                ? KMP_MIN_MALLOC_ARGV_ENTRIES
3200                                : 2 * argc;
3201       KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: dynamic allocate %d "
3202                      "argv entries\n",
3203                      team->t.t_id, team->t.t_max_argc));
3204       team->t.t_argv =
3205           (void **)__kmp_page_allocate(sizeof(void *) * team->t.t_max_argc);
3206       if (__kmp_storage_map) {
3207         __kmp_print_storage_map_gtid(-1, &team->t.t_argv[0],
3208                                      &team->t.t_argv[team->t.t_max_argc],
3209                                      sizeof(void *) * team->t.t_max_argc,
3210                                      "team_%d.t_argv", team->t.t_id);
3211       }
3212     }
3213   }
3214 }
3215 
3216 static void __kmp_allocate_team_arrays(kmp_team_t *team, int max_nth) {
3217   int i;
3218   int num_disp_buff = max_nth > 1 ? __kmp_dispatch_num_buffers : 2;
3219   team->t.t_threads =
3220       (kmp_info_t **)__kmp_allocate(sizeof(kmp_info_t *) * max_nth);
3221   team->t.t_disp_buffer = (dispatch_shared_info_t *)__kmp_allocate(
3222       sizeof(dispatch_shared_info_t) * num_disp_buff);
3223   team->t.t_dispatch =
3224       (kmp_disp_t *)__kmp_allocate(sizeof(kmp_disp_t) * max_nth);
3225   team->t.t_implicit_task_taskdata =
3226       (kmp_taskdata_t *)__kmp_allocate(sizeof(kmp_taskdata_t) * max_nth);
3227   team->t.t_max_nproc = max_nth;
3228 
3229   /* setup dispatch buffers */
3230   for (i = 0; i < num_disp_buff; ++i) {
3231     team->t.t_disp_buffer[i].buffer_index = i;
3232     team->t.t_disp_buffer[i].doacross_buf_idx = i;
3233   }
3234 }
3235 
3236 static void __kmp_free_team_arrays(kmp_team_t *team) {
3237   /* Note: this does not free the threads in t_threads (__kmp_free_threads) */
3238   int i;
3239   for (i = 0; i < team->t.t_max_nproc; ++i) {
3240     if (team->t.t_dispatch[i].th_disp_buffer != NULL) {
3241       __kmp_free(team->t.t_dispatch[i].th_disp_buffer);
3242       team->t.t_dispatch[i].th_disp_buffer = NULL;
3243     }
3244   }
3245 #if KMP_USE_HIER_SCHED
3246   __kmp_dispatch_free_hierarchies(team);
3247 #endif
3248   __kmp_free(team->t.t_threads);
3249   __kmp_free(team->t.t_disp_buffer);
3250   __kmp_free(team->t.t_dispatch);
3251   __kmp_free(team->t.t_implicit_task_taskdata);
3252   team->t.t_threads = NULL;
3253   team->t.t_disp_buffer = NULL;
3254   team->t.t_dispatch = NULL;
3255   team->t.t_implicit_task_taskdata = 0;
3256 }
3257 
3258 static void __kmp_reallocate_team_arrays(kmp_team_t *team, int max_nth) {
3259   kmp_info_t **oldThreads = team->t.t_threads;
3260 
3261   __kmp_free(team->t.t_disp_buffer);
3262   __kmp_free(team->t.t_dispatch);
3263   __kmp_free(team->t.t_implicit_task_taskdata);
3264   __kmp_allocate_team_arrays(team, max_nth);
3265 
3266   KMP_MEMCPY(team->t.t_threads, oldThreads,
3267              team->t.t_nproc * sizeof(kmp_info_t *));
3268 
3269   __kmp_free(oldThreads);
3270 }
3271 
3272 static kmp_internal_control_t __kmp_get_global_icvs(void) {
3273 
3274   kmp_r_sched_t r_sched =
3275       __kmp_get_schedule_global(); // get current state of scheduling globals
3276 
3277   KMP_DEBUG_ASSERT(__kmp_nested_proc_bind.used > 0);
3278 
3279   kmp_internal_control_t g_icvs = {
3280     0, // int serial_nesting_level; //corresponds to value of th_team_serialized
3281     (kmp_int8)__kmp_global.g.g_dynamic, // internal control for dynamic
3282     // adjustment of threads (per thread)
3283     (kmp_int8)__kmp_env_blocktime, // int bt_set; //internal control for
3284     // whether blocktime is explicitly set
3285     __kmp_dflt_blocktime, // int blocktime; //internal control for blocktime
3286 #if KMP_USE_MONITOR
3287     __kmp_bt_intervals, // int bt_intervals; //internal control for blocktime
3288 // intervals
3289 #endif
3290     __kmp_dflt_team_nth, // int nproc; //internal control for # of threads for
3291     // next parallel region (per thread)
3292     // (use a max ub on value if __kmp_parallel_initialize not called yet)
3293     __kmp_cg_max_nth, // int thread_limit;
3294     __kmp_dflt_max_active_levels, // int max_active_levels; //internal control
3295     // for max_active_levels
3296     r_sched, // kmp_r_sched_t sched; //internal control for runtime schedule
3297     // {sched,chunk} pair
3298     __kmp_nested_proc_bind.bind_types[0],
3299     __kmp_default_device,
3300     NULL // struct kmp_internal_control *next;
3301   };
3302 
3303   return g_icvs;
3304 }
3305 
3306 static kmp_internal_control_t __kmp_get_x_global_icvs(const kmp_team_t *team) {
3307 
3308   kmp_internal_control_t gx_icvs;
3309   gx_icvs.serial_nesting_level =
3310       0; // probably =team->t.t_serial like in save_inter_controls
3311   copy_icvs(&gx_icvs, &team->t.t_threads[0]->th.th_current_task->td_icvs);
3312   gx_icvs.next = NULL;
3313 
3314   return gx_icvs;
3315 }
3316 
3317 static void __kmp_initialize_root(kmp_root_t *root) {
3318   int f;
3319   kmp_team_t *root_team;
3320   kmp_team_t *hot_team;
3321   int hot_team_max_nth;
3322   kmp_r_sched_t r_sched =
3323       __kmp_get_schedule_global(); // get current state of scheduling globals
3324   kmp_internal_control_t r_icvs = __kmp_get_global_icvs();
3325   KMP_DEBUG_ASSERT(root);
3326   KMP_ASSERT(!root->r.r_begin);
3327 
3328   /* setup the root state structure */
3329   __kmp_init_lock(&root->r.r_begin_lock);
3330   root->r.r_begin = FALSE;
3331   root->r.r_active = FALSE;
3332   root->r.r_in_parallel = 0;
3333   root->r.r_blocktime = __kmp_dflt_blocktime;
3334 #if KMP_AFFINITY_SUPPORTED
3335   root->r.r_affinity_assigned = FALSE;
3336 #endif
3337 
3338   /* setup the root team for this task */
3339   /* allocate the root team structure */
3340   KF_TRACE(10, ("__kmp_initialize_root: before root_team\n"));
3341 
3342   root_team =
3343       __kmp_allocate_team(root,
3344                           1, // new_nproc
3345                           1, // max_nproc
3346 #if OMPT_SUPPORT
3347                           ompt_data_none, // root parallel id
3348 #endif
3349                           __kmp_nested_proc_bind.bind_types[0], &r_icvs,
3350                           0 // argc
3351                           USE_NESTED_HOT_ARG(NULL) // primary thread is unknown
3352       );
3353 #if USE_DEBUGGER
3354   // Non-NULL value should be assigned to make the debugger display the root
3355   // team.
3356   TCW_SYNC_PTR(root_team->t.t_pkfn, (microtask_t)(~0));
3357 #endif
3358 
3359   KF_TRACE(10, ("__kmp_initialize_root: after root_team = %p\n", root_team));
3360 
3361   root->r.r_root_team = root_team;
3362   root_team->t.t_control_stack_top = NULL;
3363 
3364   /* initialize root team */
3365   root_team->t.t_threads[0] = NULL;
3366   root_team->t.t_nproc = 1;
3367   root_team->t.t_serialized = 1;
3368   // TODO???: root_team->t.t_max_active_levels = __kmp_dflt_max_active_levels;
3369   root_team->t.t_sched.sched = r_sched.sched;
3370   KA_TRACE(
3371       20,
3372       ("__kmp_initialize_root: init root team %d arrived: join=%u, plain=%u\n",
3373        root_team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
3374 
3375   /* setup the  hot team for this task */
3376   /* allocate the hot team structure */
3377   KF_TRACE(10, ("__kmp_initialize_root: before hot_team\n"));
3378 
3379   hot_team =
3380       __kmp_allocate_team(root,
3381                           1, // new_nproc
3382                           __kmp_dflt_team_nth_ub * 2, // max_nproc
3383 #if OMPT_SUPPORT
3384                           ompt_data_none, // root parallel id
3385 #endif
3386                           __kmp_nested_proc_bind.bind_types[0], &r_icvs,
3387                           0 // argc
3388                           USE_NESTED_HOT_ARG(NULL) // primary thread is unknown
3389       );
3390   KF_TRACE(10, ("__kmp_initialize_root: after hot_team = %p\n", hot_team));
3391 
3392   root->r.r_hot_team = hot_team;
3393   root_team->t.t_control_stack_top = NULL;
3394 
3395   /* first-time initialization */
3396   hot_team->t.t_parent = root_team;
3397 
3398   /* initialize hot team */
3399   hot_team_max_nth = hot_team->t.t_max_nproc;
3400   for (f = 0; f < hot_team_max_nth; ++f) {
3401     hot_team->t.t_threads[f] = NULL;
3402   }
3403   hot_team->t.t_nproc = 1;
3404   // TODO???: hot_team->t.t_max_active_levels = __kmp_dflt_max_active_levels;
3405   hot_team->t.t_sched.sched = r_sched.sched;
3406   hot_team->t.t_size_changed = 0;
3407 }
3408 
3409 #ifdef KMP_DEBUG
3410 
3411 typedef struct kmp_team_list_item {
3412   kmp_team_p const *entry;
3413   struct kmp_team_list_item *next;
3414 } kmp_team_list_item_t;
3415 typedef kmp_team_list_item_t *kmp_team_list_t;
3416 
3417 static void __kmp_print_structure_team_accum( // Add team to list of teams.
3418     kmp_team_list_t list, // List of teams.
3419     kmp_team_p const *team // Team to add.
3420 ) {
3421 
3422   // List must terminate with item where both entry and next are NULL.
3423   // Team is added to the list only once.
3424   // List is sorted in ascending order by team id.
3425   // Team id is *not* a key.
3426 
3427   kmp_team_list_t l;
3428 
3429   KMP_DEBUG_ASSERT(list != NULL);
3430   if (team == NULL) {
3431     return;
3432   }
3433 
3434   __kmp_print_structure_team_accum(list, team->t.t_parent);
3435   __kmp_print_structure_team_accum(list, team->t.t_next_pool);
3436 
3437   // Search list for the team.
3438   l = list;
3439   while (l->next != NULL && l->entry != team) {
3440     l = l->next;
3441   }
3442   if (l->next != NULL) {
3443     return; // Team has been added before, exit.
3444   }
3445 
3446   // Team is not found. Search list again for insertion point.
3447   l = list;
3448   while (l->next != NULL && l->entry->t.t_id <= team->t.t_id) {
3449     l = l->next;
3450   }
3451 
3452   // Insert team.
3453   {
3454     kmp_team_list_item_t *item = (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC(
3455         sizeof(kmp_team_list_item_t));
3456     *item = *l;
3457     l->entry = team;
3458     l->next = item;
3459   }
3460 }
3461 
3462 static void __kmp_print_structure_team(char const *title, kmp_team_p const *team
3463 
3464 ) {
3465   __kmp_printf("%s", title);
3466   if (team != NULL) {
3467     __kmp_printf("%2x %p\n", team->t.t_id, team);
3468   } else {
3469     __kmp_printf(" - (nil)\n");
3470   }
3471 }
3472 
3473 static void __kmp_print_structure_thread(char const *title,
3474                                          kmp_info_p const *thread) {
3475   __kmp_printf("%s", title);
3476   if (thread != NULL) {
3477     __kmp_printf("%2d %p\n", thread->th.th_info.ds.ds_gtid, thread);
3478   } else {
3479     __kmp_printf(" - (nil)\n");
3480   }
3481 }
3482 
3483 void __kmp_print_structure(void) {
3484 
3485   kmp_team_list_t list;
3486 
3487   // Initialize list of teams.
3488   list =
3489       (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC(sizeof(kmp_team_list_item_t));
3490   list->entry = NULL;
3491   list->next = NULL;
3492 
3493   __kmp_printf("\n------------------------------\nGlobal Thread "
3494                "Table\n------------------------------\n");
3495   {
3496     int gtid;
3497     for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3498       __kmp_printf("%2d", gtid);
3499       if (__kmp_threads != NULL) {
3500         __kmp_printf(" %p", __kmp_threads[gtid]);
3501       }
3502       if (__kmp_root != NULL) {
3503         __kmp_printf(" %p", __kmp_root[gtid]);
3504       }
3505       __kmp_printf("\n");
3506     }
3507   }
3508 
3509   // Print out __kmp_threads array.
3510   __kmp_printf("\n------------------------------\nThreads\n--------------------"
3511                "----------\n");
3512   if (__kmp_threads != NULL) {
3513     int gtid;
3514     for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3515       kmp_info_t const *thread = __kmp_threads[gtid];
3516       if (thread != NULL) {
3517         __kmp_printf("GTID %2d %p:\n", gtid, thread);
3518         __kmp_printf("    Our Root:        %p\n", thread->th.th_root);
3519         __kmp_print_structure_team("    Our Team:     ", thread->th.th_team);
3520         __kmp_print_structure_team("    Serial Team:  ",
3521                                    thread->th.th_serial_team);
3522         __kmp_printf("    Threads:      %2d\n", thread->th.th_team_nproc);
3523         __kmp_print_structure_thread("    Primary:      ",
3524                                      thread->th.th_team_master);
3525         __kmp_printf("    Serialized?:  %2d\n", thread->th.th_team_serialized);
3526         __kmp_printf("    Set NProc:    %2d\n", thread->th.th_set_nproc);
3527         __kmp_printf("    Set Proc Bind: %2d\n", thread->th.th_set_proc_bind);
3528         __kmp_print_structure_thread("    Next in pool: ",
3529                                      thread->th.th_next_pool);
3530         __kmp_printf("\n");
3531         __kmp_print_structure_team_accum(list, thread->th.th_team);
3532         __kmp_print_structure_team_accum(list, thread->th.th_serial_team);
3533       }
3534     }
3535   } else {
3536     __kmp_printf("Threads array is not allocated.\n");
3537   }
3538 
3539   // Print out __kmp_root array.
3540   __kmp_printf("\n------------------------------\nUbers\n----------------------"
3541                "--------\n");
3542   if (__kmp_root != NULL) {
3543     int gtid;
3544     for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3545       kmp_root_t const *root = __kmp_root[gtid];
3546       if (root != NULL) {
3547         __kmp_printf("GTID %2d %p:\n", gtid, root);
3548         __kmp_print_structure_team("    Root Team:    ", root->r.r_root_team);
3549         __kmp_print_structure_team("    Hot Team:     ", root->r.r_hot_team);
3550         __kmp_print_structure_thread("    Uber Thread:  ",
3551                                      root->r.r_uber_thread);
3552         __kmp_printf("    Active?:      %2d\n", root->r.r_active);
3553         __kmp_printf("    In Parallel:  %2d\n",
3554                      KMP_ATOMIC_LD_RLX(&root->r.r_in_parallel));
3555         __kmp_printf("\n");
3556         __kmp_print_structure_team_accum(list, root->r.r_root_team);
3557         __kmp_print_structure_team_accum(list, root->r.r_hot_team);
3558       }
3559     }
3560   } else {
3561     __kmp_printf("Ubers array is not allocated.\n");
3562   }
3563 
3564   __kmp_printf("\n------------------------------\nTeams\n----------------------"
3565                "--------\n");
3566   while (list->next != NULL) {
3567     kmp_team_p const *team = list->entry;
3568     int i;
3569     __kmp_printf("Team %2x %p:\n", team->t.t_id, team);
3570     __kmp_print_structure_team("    Parent Team:      ", team->t.t_parent);
3571     __kmp_printf("    Primary TID:      %2d\n", team->t.t_master_tid);
3572     __kmp_printf("    Max threads:      %2d\n", team->t.t_max_nproc);
3573     __kmp_printf("    Levels of serial: %2d\n", team->t.t_serialized);
3574     __kmp_printf("    Number threads:   %2d\n", team->t.t_nproc);
3575     for (i = 0; i < team->t.t_nproc; ++i) {
3576       __kmp_printf("    Thread %2d:      ", i);
3577       __kmp_print_structure_thread("", team->t.t_threads[i]);
3578     }
3579     __kmp_print_structure_team("    Next in pool:     ", team->t.t_next_pool);
3580     __kmp_printf("\n");
3581     list = list->next;
3582   }
3583 
3584   // Print out __kmp_thread_pool and __kmp_team_pool.
3585   __kmp_printf("\n------------------------------\nPools\n----------------------"
3586                "--------\n");
3587   __kmp_print_structure_thread("Thread pool:          ",
3588                                CCAST(kmp_info_t *, __kmp_thread_pool));
3589   __kmp_print_structure_team("Team pool:            ",
3590                              CCAST(kmp_team_t *, __kmp_team_pool));
3591   __kmp_printf("\n");
3592 
3593   // Free team list.
3594   while (list != NULL) {
3595     kmp_team_list_item_t *item = list;
3596     list = list->next;
3597     KMP_INTERNAL_FREE(item);
3598   }
3599 }
3600 
3601 #endif
3602 
3603 //---------------------------------------------------------------------------
3604 //  Stuff for per-thread fast random number generator
3605 //  Table of primes
3606 static const unsigned __kmp_primes[] = {
3607     0x9e3779b1, 0xffe6cc59, 0x2109f6dd, 0x43977ab5, 0xba5703f5, 0xb495a877,
3608     0xe1626741, 0x79695e6b, 0xbc98c09f, 0xd5bee2b3, 0x287488f9, 0x3af18231,
3609     0x9677cd4d, 0xbe3a6929, 0xadc6a877, 0xdcf0674b, 0xbe4d6fe9, 0x5f15e201,
3610     0x99afc3fd, 0xf3f16801, 0xe222cfff, 0x24ba5fdb, 0x0620452d, 0x79f149e3,
3611     0xc8b93f49, 0x972702cd, 0xb07dd827, 0x6c97d5ed, 0x085a3d61, 0x46eb5ea7,
3612     0x3d9910ed, 0x2e687b5b, 0x29609227, 0x6eb081f1, 0x0954c4e1, 0x9d114db9,
3613     0x542acfa9, 0xb3e6bd7b, 0x0742d917, 0xe9f3ffa7, 0x54581edb, 0xf2480f45,
3614     0x0bb9288f, 0xef1affc7, 0x85fa0ca7, 0x3ccc14db, 0xe6baf34b, 0x343377f7,
3615     0x5ca19031, 0xe6d9293b, 0xf0a9f391, 0x5d2e980b, 0xfc411073, 0xc3749363,
3616     0xb892d829, 0x3549366b, 0x629750ad, 0xb98294e5, 0x892d9483, 0xc235baf3,
3617     0x3d2402a3, 0x6bdef3c9, 0xbec333cd, 0x40c9520f};
3618 
3619 //---------------------------------------------------------------------------
3620 //  __kmp_get_random: Get a random number using a linear congruential method.
3621 unsigned short __kmp_get_random(kmp_info_t *thread) {
3622   unsigned x = thread->th.th_x;
3623   unsigned short r = (unsigned short)(x >> 16);
3624 
3625   thread->th.th_x = x * thread->th.th_a + 1;
3626 
3627   KA_TRACE(30, ("__kmp_get_random: THREAD: %d, RETURN: %u\n",
3628                 thread->th.th_info.ds.ds_tid, r));
3629 
3630   return r;
3631 }
3632 //--------------------------------------------------------
3633 // __kmp_init_random: Initialize a random number generator
3634 void __kmp_init_random(kmp_info_t *thread) {
3635   unsigned seed = thread->th.th_info.ds.ds_tid;
3636 
3637   thread->th.th_a =
3638       __kmp_primes[seed % (sizeof(__kmp_primes) / sizeof(__kmp_primes[0]))];
3639   thread->th.th_x = (seed + 1) * thread->th.th_a + 1;
3640   KA_TRACE(30,
3641            ("__kmp_init_random: THREAD: %u; A: %u\n", seed, thread->th.th_a));
3642 }
3643 
3644 #if KMP_OS_WINDOWS
3645 /* reclaim array entries for root threads that are already dead, returns number
3646  * reclaimed */
3647 static int __kmp_reclaim_dead_roots(void) {
3648   int i, r = 0;
3649 
3650   for (i = 0; i < __kmp_threads_capacity; ++i) {
3651     if (KMP_UBER_GTID(i) &&
3652         !__kmp_still_running((kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[i])) &&
3653         !__kmp_root[i]
3654              ->r.r_active) { // AC: reclaim only roots died in non-active state
3655       r += __kmp_unregister_root_other_thread(i);
3656     }
3657   }
3658   return r;
3659 }
3660 #endif
3661 
3662 /* This function attempts to create free entries in __kmp_threads and
3663    __kmp_root, and returns the number of free entries generated.
3664 
3665    For Windows* OS static library, the first mechanism used is to reclaim array
3666    entries for root threads that are already dead.
3667 
3668    On all platforms, expansion is attempted on the arrays __kmp_threads_ and
3669    __kmp_root, with appropriate update to __kmp_threads_capacity. Array
3670    capacity is increased by doubling with clipping to __kmp_tp_capacity, if
3671    threadprivate cache array has been created. Synchronization with
3672    __kmpc_threadprivate_cached is done using __kmp_tp_cached_lock.
3673 
3674    After any dead root reclamation, if the clipping value allows array expansion
3675    to result in the generation of a total of nNeed free slots, the function does
3676    that expansion. If not, nothing is done beyond the possible initial root
3677    thread reclamation.
3678 
3679    If any argument is negative, the behavior is undefined. */
3680 static int __kmp_expand_threads(int nNeed) {
3681   int added = 0;
3682   int minimumRequiredCapacity;
3683   int newCapacity;
3684   kmp_info_t **newThreads;
3685   kmp_root_t **newRoot;
3686 
3687   // All calls to __kmp_expand_threads should be under __kmp_forkjoin_lock, so
3688   // resizing __kmp_threads does not need additional protection if foreign
3689   // threads are present
3690 
3691 #if KMP_OS_WINDOWS && !KMP_DYNAMIC_LIB
3692   /* only for Windows static library */
3693   /* reclaim array entries for root threads that are already dead */
3694   added = __kmp_reclaim_dead_roots();
3695 
3696   if (nNeed) {
3697     nNeed -= added;
3698     if (nNeed < 0)
3699       nNeed = 0;
3700   }
3701 #endif
3702   if (nNeed <= 0)
3703     return added;
3704 
3705   // Note that __kmp_threads_capacity is not bounded by __kmp_max_nth. If
3706   // __kmp_max_nth is set to some value less than __kmp_sys_max_nth by the
3707   // user via KMP_DEVICE_THREAD_LIMIT, then __kmp_threads_capacity may become
3708   // > __kmp_max_nth in one of two ways:
3709   //
3710   // 1) The initialization thread (gtid = 0) exits.  __kmp_threads[0]
3711   //    may not be reused by another thread, so we may need to increase
3712   //    __kmp_threads_capacity to __kmp_max_nth + 1.
3713   //
3714   // 2) New foreign root(s) are encountered.  We always register new foreign
3715   //    roots. This may cause a smaller # of threads to be allocated at
3716   //    subsequent parallel regions, but the worker threads hang around (and
3717   //    eventually go to sleep) and need slots in the __kmp_threads[] array.
3718   //
3719   // Anyway, that is the reason for moving the check to see if
3720   // __kmp_max_nth was exceeded into __kmp_reserve_threads()
3721   // instead of having it performed here. -BB
3722 
3723   KMP_DEBUG_ASSERT(__kmp_sys_max_nth >= __kmp_threads_capacity);
3724 
3725   /* compute expansion headroom to check if we can expand */
3726   if (__kmp_sys_max_nth - __kmp_threads_capacity < nNeed) {
3727     /* possible expansion too small -- give up */
3728     return added;
3729   }
3730   minimumRequiredCapacity = __kmp_threads_capacity + nNeed;
3731 
3732   newCapacity = __kmp_threads_capacity;
3733   do {
3734     newCapacity = newCapacity <= (__kmp_sys_max_nth >> 1) ? (newCapacity << 1)
3735                                                           : __kmp_sys_max_nth;
3736   } while (newCapacity < minimumRequiredCapacity);
3737   newThreads = (kmp_info_t **)__kmp_allocate(
3738       (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * newCapacity + CACHE_LINE);
3739   newRoot =
3740       (kmp_root_t **)((char *)newThreads + sizeof(kmp_info_t *) * newCapacity);
3741   KMP_MEMCPY(newThreads, __kmp_threads,
3742              __kmp_threads_capacity * sizeof(kmp_info_t *));
3743   KMP_MEMCPY(newRoot, __kmp_root,
3744              __kmp_threads_capacity * sizeof(kmp_root_t *));
3745   // Put old __kmp_threads array on a list. Any ongoing references to the old
3746   // list will be valid. This list is cleaned up at library shutdown.
3747   kmp_old_threads_list_t *node =
3748       (kmp_old_threads_list_t *)__kmp_allocate(sizeof(kmp_old_threads_list_t));
3749   node->threads = __kmp_threads;
3750   node->next = __kmp_old_threads_list;
3751   __kmp_old_threads_list = node;
3752 
3753   *(kmp_info_t * *volatile *)&__kmp_threads = newThreads;
3754   *(kmp_root_t * *volatile *)&__kmp_root = newRoot;
3755   added += newCapacity - __kmp_threads_capacity;
3756   *(volatile int *)&__kmp_threads_capacity = newCapacity;
3757 
3758   if (newCapacity > __kmp_tp_capacity) {
3759     __kmp_acquire_bootstrap_lock(&__kmp_tp_cached_lock);
3760     if (__kmp_tp_cached && newCapacity > __kmp_tp_capacity) {
3761       __kmp_threadprivate_resize_cache(newCapacity);
3762     } else { // increase __kmp_tp_capacity to correspond with kmp_threads size
3763       *(volatile int *)&__kmp_tp_capacity = newCapacity;
3764     }
3765     __kmp_release_bootstrap_lock(&__kmp_tp_cached_lock);
3766   }
3767 
3768   return added;
3769 }
3770 
3771 /* Register the current thread as a root thread and obtain our gtid. We must
3772    have the __kmp_initz_lock held at this point. Argument TRUE only if are the
3773    thread that calls from __kmp_do_serial_initialize() */
3774 int __kmp_register_root(int initial_thread) {
3775   kmp_info_t *root_thread;
3776   kmp_root_t *root;
3777   int gtid;
3778   int capacity;
3779   __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
3780   KA_TRACE(20, ("__kmp_register_root: entered\n"));
3781   KMP_MB();
3782 
3783   /* 2007-03-02:
3784      If initial thread did not invoke OpenMP RTL yet, and this thread is not an
3785      initial one, "__kmp_all_nth >= __kmp_threads_capacity" condition does not
3786      work as expected -- it may return false (that means there is at least one
3787      empty slot in __kmp_threads array), but it is possible the only free slot
3788      is #0, which is reserved for initial thread and so cannot be used for this
3789      one. Following code workarounds this bug.
3790 
3791      However, right solution seems to be not reserving slot #0 for initial
3792      thread because:
3793      (1) there is no magic in slot #0,
3794      (2) we cannot detect initial thread reliably (the first thread which does
3795         serial initialization may be not a real initial thread).
3796   */
3797   capacity = __kmp_threads_capacity;
3798   if (!initial_thread && TCR_PTR(__kmp_threads[0]) == NULL) {
3799     --capacity;
3800   }
3801 
3802   // If it is not for initializing the hidden helper team, we need to take
3803   // __kmp_hidden_helper_threads_num out of the capacity because it is included
3804   // in __kmp_threads_capacity.
3805   if (__kmp_enable_hidden_helper && !TCR_4(__kmp_init_hidden_helper_threads)) {
3806     capacity -= __kmp_hidden_helper_threads_num;
3807   }
3808 
3809   /* see if there are too many threads */
3810   if (__kmp_all_nth >= capacity && !__kmp_expand_threads(1)) {
3811     if (__kmp_tp_cached) {
3812       __kmp_fatal(KMP_MSG(CantRegisterNewThread),
3813                   KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity),
3814                   KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null);
3815     } else {
3816       __kmp_fatal(KMP_MSG(CantRegisterNewThread), KMP_HNT(SystemLimitOnThreads),
3817                   __kmp_msg_null);
3818     }
3819   }
3820 
3821   // When hidden helper task is enabled, __kmp_threads is organized as follows:
3822   // 0: initial thread, also a regular OpenMP thread.
3823   // [1, __kmp_hidden_helper_threads_num]: slots for hidden helper threads.
3824   // [__kmp_hidden_helper_threads_num + 1, __kmp_threads_capacity): slots for
3825   // regular OpenMP threads.
3826   if (TCR_4(__kmp_init_hidden_helper_threads)) {
3827     // Find an available thread slot for hidden helper thread. Slots for hidden
3828     // helper threads start from 1 to __kmp_hidden_helper_threads_num.
3829     for (gtid = 1; TCR_PTR(__kmp_threads[gtid]) != NULL &&
3830                    gtid <= __kmp_hidden_helper_threads_num;
3831          gtid++)
3832       ;
3833     KMP_ASSERT(gtid <= __kmp_hidden_helper_threads_num);
3834     KA_TRACE(1, ("__kmp_register_root: found slot in threads array for "
3835                  "hidden helper thread: T#%d\n",
3836                  gtid));
3837   } else {
3838     /* find an available thread slot */
3839     // Don't reassign the zero slot since we need that to only be used by
3840     // initial thread. Slots for hidden helper threads should also be skipped.
3841     if (initial_thread && TCR_PTR(__kmp_threads[0]) == NULL) {
3842       gtid = 0;
3843     } else {
3844       for (gtid = __kmp_hidden_helper_threads_num + 1;
3845            TCR_PTR(__kmp_threads[gtid]) != NULL; gtid++)
3846         ;
3847     }
3848     KA_TRACE(
3849         1, ("__kmp_register_root: found slot in threads array: T#%d\n", gtid));
3850     KMP_ASSERT(gtid < __kmp_threads_capacity);
3851   }
3852 
3853   /* update global accounting */
3854   __kmp_all_nth++;
3855   TCW_4(__kmp_nth, __kmp_nth + 1);
3856 
3857   // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search) for low
3858   // numbers of procs, and method #2 (keyed API call) for higher numbers.
3859   if (__kmp_adjust_gtid_mode) {
3860     if (__kmp_all_nth >= __kmp_tls_gtid_min) {
3861       if (TCR_4(__kmp_gtid_mode) != 2) {
3862         TCW_4(__kmp_gtid_mode, 2);
3863       }
3864     } else {
3865       if (TCR_4(__kmp_gtid_mode) != 1) {
3866         TCW_4(__kmp_gtid_mode, 1);
3867       }
3868     }
3869   }
3870 
3871 #ifdef KMP_ADJUST_BLOCKTIME
3872   /* Adjust blocktime to zero if necessary            */
3873   /* Middle initialization might not have occurred yet */
3874   if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
3875     if (__kmp_nth > __kmp_avail_proc) {
3876       __kmp_zero_bt = TRUE;
3877     }
3878   }
3879 #endif /* KMP_ADJUST_BLOCKTIME */
3880 
3881   /* setup this new hierarchy */
3882   if (!(root = __kmp_root[gtid])) {
3883     root = __kmp_root[gtid] = (kmp_root_t *)__kmp_allocate(sizeof(kmp_root_t));
3884     KMP_DEBUG_ASSERT(!root->r.r_root_team);
3885   }
3886 
3887 #if KMP_STATS_ENABLED
3888   // Initialize stats as soon as possible (right after gtid assignment).
3889   __kmp_stats_thread_ptr = __kmp_stats_list->push_back(gtid);
3890   __kmp_stats_thread_ptr->startLife();
3891   KMP_SET_THREAD_STATE(SERIAL_REGION);
3892   KMP_INIT_PARTITIONED_TIMERS(OMP_serial);
3893 #endif
3894   __kmp_initialize_root(root);
3895 
3896   /* setup new root thread structure */
3897   if (root->r.r_uber_thread) {
3898     root_thread = root->r.r_uber_thread;
3899   } else {
3900     root_thread = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t));
3901     if (__kmp_storage_map) {
3902       __kmp_print_thread_storage_map(root_thread, gtid);
3903     }
3904     root_thread->th.th_info.ds.ds_gtid = gtid;
3905 #if OMPT_SUPPORT
3906     root_thread->th.ompt_thread_info.thread_data = ompt_data_none;
3907 #endif
3908     root_thread->th.th_root = root;
3909     if (__kmp_env_consistency_check) {
3910       root_thread->th.th_cons = __kmp_allocate_cons_stack(gtid);
3911     }
3912 #if USE_FAST_MEMORY
3913     __kmp_initialize_fast_memory(root_thread);
3914 #endif /* USE_FAST_MEMORY */
3915 
3916 #if KMP_USE_BGET
3917     KMP_DEBUG_ASSERT(root_thread->th.th_local.bget_data == NULL);
3918     __kmp_initialize_bget(root_thread);
3919 #endif
3920     __kmp_init_random(root_thread); // Initialize random number generator
3921   }
3922 
3923   /* setup the serial team held in reserve by the root thread */
3924   if (!root_thread->th.th_serial_team) {
3925     kmp_internal_control_t r_icvs = __kmp_get_global_icvs();
3926     KF_TRACE(10, ("__kmp_register_root: before serial_team\n"));
3927     root_thread->th.th_serial_team = __kmp_allocate_team(
3928         root, 1, 1,
3929 #if OMPT_SUPPORT
3930         ompt_data_none, // root parallel id
3931 #endif
3932         proc_bind_default, &r_icvs, 0 USE_NESTED_HOT_ARG(NULL));
3933   }
3934   KMP_ASSERT(root_thread->th.th_serial_team);
3935   KF_TRACE(10, ("__kmp_register_root: after serial_team = %p\n",
3936                 root_thread->th.th_serial_team));
3937 
3938   /* drop root_thread into place */
3939   TCW_SYNC_PTR(__kmp_threads[gtid], root_thread);
3940 
3941   root->r.r_root_team->t.t_threads[0] = root_thread;
3942   root->r.r_hot_team->t.t_threads[0] = root_thread;
3943   root_thread->th.th_serial_team->t.t_threads[0] = root_thread;
3944   // AC: the team created in reserve, not for execution (it is unused for now).
3945   root_thread->th.th_serial_team->t.t_serialized = 0;
3946   root->r.r_uber_thread = root_thread;
3947 
3948   /* initialize the thread, get it ready to go */
3949   __kmp_initialize_info(root_thread, root->r.r_root_team, 0, gtid);
3950   TCW_4(__kmp_init_gtid, TRUE);
3951 
3952   /* prepare the primary thread for get_gtid() */
3953   __kmp_gtid_set_specific(gtid);
3954 
3955 #if USE_ITT_BUILD
3956   __kmp_itt_thread_name(gtid);
3957 #endif /* USE_ITT_BUILD */
3958 
3959 #ifdef KMP_TDATA_GTID
3960   __kmp_gtid = gtid;
3961 #endif
3962   __kmp_create_worker(gtid, root_thread, __kmp_stksize);
3963   KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() == gtid);
3964 
3965   KA_TRACE(20, ("__kmp_register_root: T#%d init T#%d(%d:%d) arrived: join=%u, "
3966                 "plain=%u\n",
3967                 gtid, __kmp_gtid_from_tid(0, root->r.r_hot_team),
3968                 root->r.r_hot_team->t.t_id, 0, KMP_INIT_BARRIER_STATE,
3969                 KMP_INIT_BARRIER_STATE));
3970   { // Initialize barrier data.
3971     int b;
3972     for (b = 0; b < bs_last_barrier; ++b) {
3973       root_thread->th.th_bar[b].bb.b_arrived = KMP_INIT_BARRIER_STATE;
3974 #if USE_DEBUGGER
3975       root_thread->th.th_bar[b].bb.b_worker_arrived = 0;
3976 #endif
3977     }
3978   }
3979   KMP_DEBUG_ASSERT(root->r.r_hot_team->t.t_bar[bs_forkjoin_barrier].b_arrived ==
3980                    KMP_INIT_BARRIER_STATE);
3981 
3982 #if KMP_AFFINITY_SUPPORTED
3983   root_thread->th.th_current_place = KMP_PLACE_UNDEFINED;
3984   root_thread->th.th_new_place = KMP_PLACE_UNDEFINED;
3985   root_thread->th.th_first_place = KMP_PLACE_UNDEFINED;
3986   root_thread->th.th_last_place = KMP_PLACE_UNDEFINED;
3987 #endif /* KMP_AFFINITY_SUPPORTED */
3988   root_thread->th.th_def_allocator = __kmp_def_allocator;
3989   root_thread->th.th_prev_level = 0;
3990   root_thread->th.th_prev_num_threads = 1;
3991 
3992   kmp_cg_root_t *tmp = (kmp_cg_root_t *)__kmp_allocate(sizeof(kmp_cg_root_t));
3993   tmp->cg_root = root_thread;
3994   tmp->cg_thread_limit = __kmp_cg_max_nth;
3995   tmp->cg_nthreads = 1;
3996   KA_TRACE(100, ("__kmp_register_root: Thread %p created node %p with"
3997                  " cg_nthreads init to 1\n",
3998                  root_thread, tmp));
3999   tmp->up = NULL;
4000   root_thread->th.th_cg_roots = tmp;
4001 
4002   __kmp_root_counter++;
4003 
4004 #if OMPT_SUPPORT
4005   if (!initial_thread && ompt_enabled.enabled) {
4006 
4007     kmp_info_t *root_thread = ompt_get_thread();
4008 
4009     ompt_set_thread_state(root_thread, ompt_state_overhead);
4010 
4011     if (ompt_enabled.ompt_callback_thread_begin) {
4012       ompt_callbacks.ompt_callback(ompt_callback_thread_begin)(
4013           ompt_thread_initial, __ompt_get_thread_data_internal());
4014     }
4015     ompt_data_t *task_data;
4016     ompt_data_t *parallel_data;
4017     __ompt_get_task_info_internal(0, NULL, &task_data, NULL, &parallel_data,
4018                                   NULL);
4019     if (ompt_enabled.ompt_callback_implicit_task) {
4020       ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
4021           ompt_scope_begin, parallel_data, task_data, 1, 1, ompt_task_initial);
4022     }
4023 
4024     ompt_set_thread_state(root_thread, ompt_state_work_serial);
4025   }
4026 #endif
4027 #if OMPD_SUPPORT
4028   if (ompd_state & OMPD_ENABLE_BP)
4029     ompd_bp_thread_begin();
4030 #endif
4031 
4032   KMP_MB();
4033   __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
4034 
4035   return gtid;
4036 }
4037 
4038 #if KMP_NESTED_HOT_TEAMS
4039 static int __kmp_free_hot_teams(kmp_root_t *root, kmp_info_t *thr, int level,
4040                                 const int max_level) {
4041   int i, n, nth;
4042   kmp_hot_team_ptr_t *hot_teams = thr->th.th_hot_teams;
4043   if (!hot_teams || !hot_teams[level].hot_team) {
4044     return 0;
4045   }
4046   KMP_DEBUG_ASSERT(level < max_level);
4047   kmp_team_t *team = hot_teams[level].hot_team;
4048   nth = hot_teams[level].hot_team_nth;
4049   n = nth - 1; // primary thread is not freed
4050   if (level < max_level - 1) {
4051     for (i = 0; i < nth; ++i) {
4052       kmp_info_t *th = team->t.t_threads[i];
4053       n += __kmp_free_hot_teams(root, th, level + 1, max_level);
4054       if (i > 0 && th->th.th_hot_teams) {
4055         __kmp_free(th->th.th_hot_teams);
4056         th->th.th_hot_teams = NULL;
4057       }
4058     }
4059   }
4060   __kmp_free_team(root, team, NULL);
4061   return n;
4062 }
4063 #endif
4064 
4065 // Resets a root thread and clear its root and hot teams.
4066 // Returns the number of __kmp_threads entries directly and indirectly freed.
4067 static int __kmp_reset_root(int gtid, kmp_root_t *root) {
4068   kmp_team_t *root_team = root->r.r_root_team;
4069   kmp_team_t *hot_team = root->r.r_hot_team;
4070   int n = hot_team->t.t_nproc;
4071   int i;
4072 
4073   KMP_DEBUG_ASSERT(!root->r.r_active);
4074 
4075   root->r.r_root_team = NULL;
4076   root->r.r_hot_team = NULL;
4077   // __kmp_free_team() does not free hot teams, so we have to clear r_hot_team
4078   // before call to __kmp_free_team().
4079   __kmp_free_team(root, root_team USE_NESTED_HOT_ARG(NULL));
4080 #if KMP_NESTED_HOT_TEAMS
4081   if (__kmp_hot_teams_max_level >
4082       0) { // need to free nested hot teams and their threads if any
4083     for (i = 0; i < hot_team->t.t_nproc; ++i) {
4084       kmp_info_t *th = hot_team->t.t_threads[i];
4085       if (__kmp_hot_teams_max_level > 1) {
4086         n += __kmp_free_hot_teams(root, th, 1, __kmp_hot_teams_max_level);
4087       }
4088       if (th->th.th_hot_teams) {
4089         __kmp_free(th->th.th_hot_teams);
4090         th->th.th_hot_teams = NULL;
4091       }
4092     }
4093   }
4094 #endif
4095   __kmp_free_team(root, hot_team USE_NESTED_HOT_ARG(NULL));
4096 
4097   // Before we can reap the thread, we need to make certain that all other
4098   // threads in the teams that had this root as ancestor have stopped trying to
4099   // steal tasks.
4100   if (__kmp_tasking_mode != tskm_immediate_exec) {
4101     __kmp_wait_to_unref_task_teams();
4102   }
4103 
4104 #if KMP_OS_WINDOWS
4105   /* Close Handle of root duplicated in __kmp_create_worker (tr #62919) */
4106   KA_TRACE(
4107       10, ("__kmp_reset_root: free handle, th = %p, handle = %" KMP_UINTPTR_SPEC
4108            "\n",
4109            (LPVOID) & (root->r.r_uber_thread->th),
4110            root->r.r_uber_thread->th.th_info.ds.ds_thread));
4111   __kmp_free_handle(root->r.r_uber_thread->th.th_info.ds.ds_thread);
4112 #endif /* KMP_OS_WINDOWS */
4113 
4114 #if OMPD_SUPPORT
4115   if (ompd_state & OMPD_ENABLE_BP)
4116     ompd_bp_thread_end();
4117 #endif
4118 
4119 #if OMPT_SUPPORT
4120   ompt_data_t *task_data;
4121   ompt_data_t *parallel_data;
4122   __ompt_get_task_info_internal(0, NULL, &task_data, NULL, &parallel_data,
4123                                 NULL);
4124   if (ompt_enabled.ompt_callback_implicit_task) {
4125     ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
4126         ompt_scope_end, parallel_data, task_data, 0, 1, ompt_task_initial);
4127   }
4128   if (ompt_enabled.ompt_callback_thread_end) {
4129     ompt_callbacks.ompt_callback(ompt_callback_thread_end)(
4130         &(root->r.r_uber_thread->th.ompt_thread_info.thread_data));
4131   }
4132 #endif
4133 
4134   TCW_4(__kmp_nth,
4135         __kmp_nth - 1); // __kmp_reap_thread will decrement __kmp_all_nth.
4136   i = root->r.r_uber_thread->th.th_cg_roots->cg_nthreads--;
4137   KA_TRACE(100, ("__kmp_reset_root: Thread %p decrement cg_nthreads on node %p"
4138                  " to %d\n",
4139                  root->r.r_uber_thread, root->r.r_uber_thread->th.th_cg_roots,
4140                  root->r.r_uber_thread->th.th_cg_roots->cg_nthreads));
4141   if (i == 1) {
4142     // need to free contention group structure
4143     KMP_DEBUG_ASSERT(root->r.r_uber_thread ==
4144                      root->r.r_uber_thread->th.th_cg_roots->cg_root);
4145     KMP_DEBUG_ASSERT(root->r.r_uber_thread->th.th_cg_roots->up == NULL);
4146     __kmp_free(root->r.r_uber_thread->th.th_cg_roots);
4147     root->r.r_uber_thread->th.th_cg_roots = NULL;
4148   }
4149   __kmp_reap_thread(root->r.r_uber_thread, 1);
4150 
4151   // We canot put root thread to __kmp_thread_pool, so we have to reap it
4152   // instead of freeing.
4153   root->r.r_uber_thread = NULL;
4154   /* mark root as no longer in use */
4155   root->r.r_begin = FALSE;
4156 
4157   return n;
4158 }
4159 
4160 void __kmp_unregister_root_current_thread(int gtid) {
4161   KA_TRACE(1, ("__kmp_unregister_root_current_thread: enter T#%d\n", gtid));
4162   /* this lock should be ok, since unregister_root_current_thread is never
4163      called during an abort, only during a normal close. furthermore, if you
4164      have the forkjoin lock, you should never try to get the initz lock */
4165   __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
4166   if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
4167     KC_TRACE(10, ("__kmp_unregister_root_current_thread: already finished, "
4168                   "exiting T#%d\n",
4169                   gtid));
4170     __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
4171     return;
4172   }
4173   kmp_root_t *root = __kmp_root[gtid];
4174 
4175   KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
4176   KMP_ASSERT(KMP_UBER_GTID(gtid));
4177   KMP_ASSERT(root == __kmp_threads[gtid]->th.th_root);
4178   KMP_ASSERT(root->r.r_active == FALSE);
4179 
4180   KMP_MB();
4181 
4182   kmp_info_t *thread = __kmp_threads[gtid];
4183   kmp_team_t *team = thread->th.th_team;
4184   kmp_task_team_t *task_team = thread->th.th_task_team;
4185 
4186   // we need to wait for the proxy tasks before finishing the thread
4187   if (task_team != NULL && (task_team->tt.tt_found_proxy_tasks ||
4188                             task_team->tt.tt_hidden_helper_task_encountered)) {
4189 #if OMPT_SUPPORT
4190     // the runtime is shutting down so we won't report any events
4191     thread->th.ompt_thread_info.state = ompt_state_undefined;
4192 #endif
4193     __kmp_task_team_wait(thread, team USE_ITT_BUILD_ARG(NULL));
4194   }
4195 
4196   __kmp_reset_root(gtid, root);
4197 
4198   KMP_MB();
4199   KC_TRACE(10,
4200            ("__kmp_unregister_root_current_thread: T#%d unregistered\n", gtid));
4201 
4202   __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
4203 }
4204 
4205 #if KMP_OS_WINDOWS
4206 /* __kmp_forkjoin_lock must be already held
4207    Unregisters a root thread that is not the current thread.  Returns the number
4208    of __kmp_threads entries freed as a result. */
4209 static int __kmp_unregister_root_other_thread(int gtid) {
4210   kmp_root_t *root = __kmp_root[gtid];
4211   int r;
4212 
4213   KA_TRACE(1, ("__kmp_unregister_root_other_thread: enter T#%d\n", gtid));
4214   KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
4215   KMP_ASSERT(KMP_UBER_GTID(gtid));
4216   KMP_ASSERT(root == __kmp_threads[gtid]->th.th_root);
4217   KMP_ASSERT(root->r.r_active == FALSE);
4218 
4219   r = __kmp_reset_root(gtid, root);
4220   KC_TRACE(10,
4221            ("__kmp_unregister_root_other_thread: T#%d unregistered\n", gtid));
4222   return r;
4223 }
4224 #endif
4225 
4226 #if KMP_DEBUG
4227 void __kmp_task_info() {
4228 
4229   kmp_int32 gtid = __kmp_entry_gtid();
4230   kmp_int32 tid = __kmp_tid_from_gtid(gtid);
4231   kmp_info_t *this_thr = __kmp_threads[gtid];
4232   kmp_team_t *steam = this_thr->th.th_serial_team;
4233   kmp_team_t *team = this_thr->th.th_team;
4234 
4235   __kmp_printf(
4236       "__kmp_task_info: gtid=%d tid=%d t_thread=%p team=%p steam=%p curtask=%p "
4237       "ptask=%p\n",
4238       gtid, tid, this_thr, team, steam, this_thr->th.th_current_task,
4239       team->t.t_implicit_task_taskdata[tid].td_parent);
4240 }
4241 #endif // KMP_DEBUG
4242 
4243 /* TODO optimize with one big memclr, take out what isn't needed, split
4244    responsibility to workers as much as possible, and delay initialization of
4245    features as much as possible  */
4246 static void __kmp_initialize_info(kmp_info_t *this_thr, kmp_team_t *team,
4247                                   int tid, int gtid) {
4248   /* this_thr->th.th_info.ds.ds_gtid is setup in
4249      kmp_allocate_thread/create_worker.
4250      this_thr->th.th_serial_team is setup in __kmp_allocate_thread */
4251   KMP_DEBUG_ASSERT(this_thr != NULL);
4252   KMP_DEBUG_ASSERT(this_thr->th.th_serial_team);
4253   KMP_DEBUG_ASSERT(team);
4254   KMP_DEBUG_ASSERT(team->t.t_threads);
4255   KMP_DEBUG_ASSERT(team->t.t_dispatch);
4256   kmp_info_t *master = team->t.t_threads[0];
4257   KMP_DEBUG_ASSERT(master);
4258   KMP_DEBUG_ASSERT(master->th.th_root);
4259 
4260   KMP_MB();
4261 
4262   TCW_SYNC_PTR(this_thr->th.th_team, team);
4263 
4264   this_thr->th.th_info.ds.ds_tid = tid;
4265   this_thr->th.th_set_nproc = 0;
4266   if (__kmp_tasking_mode != tskm_immediate_exec)
4267     // When tasking is possible, threads are not safe to reap until they are
4268     // done tasking; this will be set when tasking code is exited in wait
4269     this_thr->th.th_reap_state = KMP_NOT_SAFE_TO_REAP;
4270   else // no tasking --> always safe to reap
4271     this_thr->th.th_reap_state = KMP_SAFE_TO_REAP;
4272   this_thr->th.th_set_proc_bind = proc_bind_default;
4273 #if KMP_AFFINITY_SUPPORTED
4274   this_thr->th.th_new_place = this_thr->th.th_current_place;
4275 #endif
4276   this_thr->th.th_root = master->th.th_root;
4277 
4278   /* setup the thread's cache of the team structure */
4279   this_thr->th.th_team_nproc = team->t.t_nproc;
4280   this_thr->th.th_team_master = master;
4281   this_thr->th.th_team_serialized = team->t.t_serialized;
4282 
4283   KMP_DEBUG_ASSERT(team->t.t_implicit_task_taskdata);
4284 
4285   KF_TRACE(10, ("__kmp_initialize_info1: T#%d:%d this_thread=%p curtask=%p\n",
4286                 tid, gtid, this_thr, this_thr->th.th_current_task));
4287 
4288   __kmp_init_implicit_task(this_thr->th.th_team_master->th.th_ident, this_thr,
4289                            team, tid, TRUE);
4290 
4291   KF_TRACE(10, ("__kmp_initialize_info2: T#%d:%d this_thread=%p curtask=%p\n",
4292                 tid, gtid, this_thr, this_thr->th.th_current_task));
4293   // TODO: Initialize ICVs from parent; GEH - isn't that already done in
4294   // __kmp_initialize_team()?
4295 
4296   /* TODO no worksharing in speculative threads */
4297   this_thr->th.th_dispatch = &team->t.t_dispatch[tid];
4298 
4299   this_thr->th.th_local.this_construct = 0;
4300 
4301   if (!this_thr->th.th_pri_common) {
4302     this_thr->th.th_pri_common =
4303         (struct common_table *)__kmp_allocate(sizeof(struct common_table));
4304     if (__kmp_storage_map) {
4305       __kmp_print_storage_map_gtid(
4306           gtid, this_thr->th.th_pri_common, this_thr->th.th_pri_common + 1,
4307           sizeof(struct common_table), "th_%d.th_pri_common\n", gtid);
4308     }
4309     this_thr->th.th_pri_head = NULL;
4310   }
4311 
4312   if (this_thr != master && // Primary thread's CG root is initialized elsewhere
4313       this_thr->th.th_cg_roots != master->th.th_cg_roots) { // CG root not set
4314     // Make new thread's CG root same as primary thread's
4315     KMP_DEBUG_ASSERT(master->th.th_cg_roots);
4316     kmp_cg_root_t *tmp = this_thr->th.th_cg_roots;
4317     if (tmp) {
4318       // worker changes CG, need to check if old CG should be freed
4319       int i = tmp->cg_nthreads--;
4320       KA_TRACE(100, ("__kmp_initialize_info: Thread %p decrement cg_nthreads"
4321                      " on node %p of thread %p to %d\n",
4322                      this_thr, tmp, tmp->cg_root, tmp->cg_nthreads));
4323       if (i == 1) {
4324         __kmp_free(tmp); // last thread left CG --> free it
4325       }
4326     }
4327     this_thr->th.th_cg_roots = master->th.th_cg_roots;
4328     // Increment new thread's CG root's counter to add the new thread
4329     this_thr->th.th_cg_roots->cg_nthreads++;
4330     KA_TRACE(100, ("__kmp_initialize_info: Thread %p increment cg_nthreads on"
4331                    " node %p of thread %p to %d\n",
4332                    this_thr, this_thr->th.th_cg_roots,
4333                    this_thr->th.th_cg_roots->cg_root,
4334                    this_thr->th.th_cg_roots->cg_nthreads));
4335     this_thr->th.th_current_task->td_icvs.thread_limit =
4336         this_thr->th.th_cg_roots->cg_thread_limit;
4337   }
4338 
4339   /* Initialize dynamic dispatch */
4340   {
4341     volatile kmp_disp_t *dispatch = this_thr->th.th_dispatch;
4342     // Use team max_nproc since this will never change for the team.
4343     size_t disp_size =
4344         sizeof(dispatch_private_info_t) *
4345         (team->t.t_max_nproc == 1 ? 1 : __kmp_dispatch_num_buffers);
4346     KD_TRACE(10, ("__kmp_initialize_info: T#%d max_nproc: %d\n", gtid,
4347                   team->t.t_max_nproc));
4348     KMP_ASSERT(dispatch);
4349     KMP_DEBUG_ASSERT(team->t.t_dispatch);
4350     KMP_DEBUG_ASSERT(dispatch == &team->t.t_dispatch[tid]);
4351 
4352     dispatch->th_disp_index = 0;
4353     dispatch->th_doacross_buf_idx = 0;
4354     if (!dispatch->th_disp_buffer) {
4355       dispatch->th_disp_buffer =
4356           (dispatch_private_info_t *)__kmp_allocate(disp_size);
4357 
4358       if (__kmp_storage_map) {
4359         __kmp_print_storage_map_gtid(
4360             gtid, &dispatch->th_disp_buffer[0],
4361             &dispatch->th_disp_buffer[team->t.t_max_nproc == 1
4362                                           ? 1
4363                                           : __kmp_dispatch_num_buffers],
4364             disp_size,
4365             "th_%d.th_dispatch.th_disp_buffer "
4366             "(team_%d.t_dispatch[%d].th_disp_buffer)",
4367             gtid, team->t.t_id, gtid);
4368       }
4369     } else {
4370       memset(&dispatch->th_disp_buffer[0], '\0', disp_size);
4371     }
4372 
4373     dispatch->th_dispatch_pr_current = 0;
4374     dispatch->th_dispatch_sh_current = 0;
4375 
4376     dispatch->th_deo_fcn = 0; /* ORDERED     */
4377     dispatch->th_dxo_fcn = 0; /* END ORDERED */
4378   }
4379 
4380   this_thr->th.th_next_pool = NULL;
4381 
4382   if (!this_thr->th.th_task_state_memo_stack) {
4383     size_t i;
4384     this_thr->th.th_task_state_memo_stack =
4385         (kmp_uint8 *)__kmp_allocate(4 * sizeof(kmp_uint8));
4386     this_thr->th.th_task_state_top = 0;
4387     this_thr->th.th_task_state_stack_sz = 4;
4388     for (i = 0; i < this_thr->th.th_task_state_stack_sz;
4389          ++i) // zero init the stack
4390       this_thr->th.th_task_state_memo_stack[i] = 0;
4391   }
4392 
4393   KMP_DEBUG_ASSERT(!this_thr->th.th_spin_here);
4394   KMP_DEBUG_ASSERT(this_thr->th.th_next_waiting == 0);
4395 
4396   KMP_MB();
4397 }
4398 
4399 /* allocate a new thread for the requesting team. this is only called from
4400    within a forkjoin critical section. we will first try to get an available
4401    thread from the thread pool. if none is available, we will fork a new one
4402    assuming we are able to create a new one. this should be assured, as the
4403    caller should check on this first. */
4404 kmp_info_t *__kmp_allocate_thread(kmp_root_t *root, kmp_team_t *team,
4405                                   int new_tid) {
4406   kmp_team_t *serial_team;
4407   kmp_info_t *new_thr;
4408   int new_gtid;
4409 
4410   KA_TRACE(20, ("__kmp_allocate_thread: T#%d\n", __kmp_get_gtid()));
4411   KMP_DEBUG_ASSERT(root && team);
4412 #if !KMP_NESTED_HOT_TEAMS
4413   KMP_DEBUG_ASSERT(KMP_MASTER_GTID(__kmp_get_gtid()));
4414 #endif
4415   KMP_MB();
4416 
4417   /* first, try to get one from the thread pool */
4418   if (__kmp_thread_pool) {
4419     new_thr = CCAST(kmp_info_t *, __kmp_thread_pool);
4420     __kmp_thread_pool = (volatile kmp_info_t *)new_thr->th.th_next_pool;
4421     if (new_thr == __kmp_thread_pool_insert_pt) {
4422       __kmp_thread_pool_insert_pt = NULL;
4423     }
4424     TCW_4(new_thr->th.th_in_pool, FALSE);
4425     __kmp_suspend_initialize_thread(new_thr);
4426     __kmp_lock_suspend_mx(new_thr);
4427     if (new_thr->th.th_active_in_pool == TRUE) {
4428       KMP_DEBUG_ASSERT(new_thr->th.th_active == TRUE);
4429       KMP_ATOMIC_DEC(&__kmp_thread_pool_active_nth);
4430       new_thr->th.th_active_in_pool = FALSE;
4431     }
4432     __kmp_unlock_suspend_mx(new_thr);
4433 
4434     KA_TRACE(20, ("__kmp_allocate_thread: T#%d using thread T#%d\n",
4435                   __kmp_get_gtid(), new_thr->th.th_info.ds.ds_gtid));
4436     KMP_ASSERT(!new_thr->th.th_team);
4437     KMP_DEBUG_ASSERT(__kmp_nth < __kmp_threads_capacity);
4438 
4439     /* setup the thread structure */
4440     __kmp_initialize_info(new_thr, team, new_tid,
4441                           new_thr->th.th_info.ds.ds_gtid);
4442     KMP_DEBUG_ASSERT(new_thr->th.th_serial_team);
4443 
4444     TCW_4(__kmp_nth, __kmp_nth + 1);
4445 
4446     new_thr->th.th_task_state = 0;
4447     new_thr->th.th_task_state_top = 0;
4448     new_thr->th.th_task_state_stack_sz = 4;
4449 
4450     if (__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
4451       // Make sure pool thread has transitioned to waiting on own thread struct
4452       KMP_DEBUG_ASSERT(new_thr->th.th_used_in_team.load() == 0);
4453       // Thread activated in __kmp_allocate_team when increasing team size
4454     }
4455 
4456 #ifdef KMP_ADJUST_BLOCKTIME
4457     /* Adjust blocktime back to zero if necessary */
4458     /* Middle initialization might not have occurred yet */
4459     if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
4460       if (__kmp_nth > __kmp_avail_proc) {
4461         __kmp_zero_bt = TRUE;
4462       }
4463     }
4464 #endif /* KMP_ADJUST_BLOCKTIME */
4465 
4466 #if KMP_DEBUG
4467     // If thread entered pool via __kmp_free_thread, wait_flag should !=
4468     // KMP_BARRIER_PARENT_FLAG.
4469     int b;
4470     kmp_balign_t *balign = new_thr->th.th_bar;
4471     for (b = 0; b < bs_last_barrier; ++b)
4472       KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
4473 #endif
4474 
4475     KF_TRACE(10, ("__kmp_allocate_thread: T#%d using thread %p T#%d\n",
4476                   __kmp_get_gtid(), new_thr, new_thr->th.th_info.ds.ds_gtid));
4477 
4478     KMP_MB();
4479     return new_thr;
4480   }
4481 
4482   /* no, well fork a new one */
4483   KMP_ASSERT(__kmp_nth == __kmp_all_nth);
4484   KMP_ASSERT(__kmp_all_nth < __kmp_threads_capacity);
4485 
4486 #if KMP_USE_MONITOR
4487   // If this is the first worker thread the RTL is creating, then also
4488   // launch the monitor thread.  We try to do this as early as possible.
4489   if (!TCR_4(__kmp_init_monitor)) {
4490     __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
4491     if (!TCR_4(__kmp_init_monitor)) {
4492       KF_TRACE(10, ("before __kmp_create_monitor\n"));
4493       TCW_4(__kmp_init_monitor, 1);
4494       __kmp_create_monitor(&__kmp_monitor);
4495       KF_TRACE(10, ("after __kmp_create_monitor\n"));
4496 #if KMP_OS_WINDOWS
4497       // AC: wait until monitor has started. This is a fix for CQ232808.
4498       // The reason is that if the library is loaded/unloaded in a loop with
4499       // small (parallel) work in between, then there is high probability that
4500       // monitor thread started after the library shutdown. At shutdown it is
4501       // too late to cope with the problem, because when the primary thread is
4502       // in DllMain (process detach) the monitor has no chances to start (it is
4503       // blocked), and primary thread has no means to inform the monitor that
4504       // the library has gone, because all the memory which the monitor can
4505       // access is going to be released/reset.
4506       while (TCR_4(__kmp_init_monitor) < 2) {
4507         KMP_YIELD(TRUE);
4508       }
4509       KF_TRACE(10, ("after monitor thread has started\n"));
4510 #endif
4511     }
4512     __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
4513   }
4514 #endif
4515 
4516   KMP_MB();
4517 
4518   {
4519     int new_start_gtid = TCR_4(__kmp_init_hidden_helper_threads)
4520                              ? 1
4521                              : __kmp_hidden_helper_threads_num + 1;
4522 
4523     for (new_gtid = new_start_gtid; TCR_PTR(__kmp_threads[new_gtid]) != NULL;
4524          ++new_gtid) {
4525       KMP_DEBUG_ASSERT(new_gtid < __kmp_threads_capacity);
4526     }
4527 
4528     if (TCR_4(__kmp_init_hidden_helper_threads)) {
4529       KMP_DEBUG_ASSERT(new_gtid <= __kmp_hidden_helper_threads_num);
4530     }
4531   }
4532 
4533   /* allocate space for it. */
4534   new_thr = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t));
4535 
4536   TCW_SYNC_PTR(__kmp_threads[new_gtid], new_thr);
4537 
4538 #if USE_ITT_BUILD && USE_ITT_NOTIFY && KMP_DEBUG
4539   // suppress race conditions detection on synchronization flags in debug mode
4540   // this helps to analyze library internals eliminating false positives
4541   __itt_suppress_mark_range(
4542       __itt_suppress_range, __itt_suppress_threading_errors,
4543       &new_thr->th.th_sleep_loc, sizeof(new_thr->th.th_sleep_loc));
4544   __itt_suppress_mark_range(
4545       __itt_suppress_range, __itt_suppress_threading_errors,
4546       &new_thr->th.th_reap_state, sizeof(new_thr->th.th_reap_state));
4547 #if KMP_OS_WINDOWS
4548   __itt_suppress_mark_range(
4549       __itt_suppress_range, __itt_suppress_threading_errors,
4550       &new_thr->th.th_suspend_init, sizeof(new_thr->th.th_suspend_init));
4551 #else
4552   __itt_suppress_mark_range(__itt_suppress_range,
4553                             __itt_suppress_threading_errors,
4554                             &new_thr->th.th_suspend_init_count,
4555                             sizeof(new_thr->th.th_suspend_init_count));
4556 #endif
4557   // TODO: check if we need to also suppress b_arrived flags
4558   __itt_suppress_mark_range(__itt_suppress_range,
4559                             __itt_suppress_threading_errors,
4560                             CCAST(kmp_uint64 *, &new_thr->th.th_bar[0].bb.b_go),
4561                             sizeof(new_thr->th.th_bar[0].bb.b_go));
4562   __itt_suppress_mark_range(__itt_suppress_range,
4563                             __itt_suppress_threading_errors,
4564                             CCAST(kmp_uint64 *, &new_thr->th.th_bar[1].bb.b_go),
4565                             sizeof(new_thr->th.th_bar[1].bb.b_go));
4566   __itt_suppress_mark_range(__itt_suppress_range,
4567                             __itt_suppress_threading_errors,
4568                             CCAST(kmp_uint64 *, &new_thr->th.th_bar[2].bb.b_go),
4569                             sizeof(new_thr->th.th_bar[2].bb.b_go));
4570 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY && KMP_DEBUG */
4571   if (__kmp_storage_map) {
4572     __kmp_print_thread_storage_map(new_thr, new_gtid);
4573   }
4574 
4575   // add the reserve serialized team, initialized from the team's primary thread
4576   {
4577     kmp_internal_control_t r_icvs = __kmp_get_x_global_icvs(team);
4578     KF_TRACE(10, ("__kmp_allocate_thread: before th_serial/serial_team\n"));
4579     new_thr->th.th_serial_team = serial_team =
4580         (kmp_team_t *)__kmp_allocate_team(root, 1, 1,
4581 #if OMPT_SUPPORT
4582                                           ompt_data_none, // root parallel id
4583 #endif
4584                                           proc_bind_default, &r_icvs,
4585                                           0 USE_NESTED_HOT_ARG(NULL));
4586   }
4587   KMP_ASSERT(serial_team);
4588   serial_team->t.t_serialized = 0; // AC: the team created in reserve, not for
4589   // execution (it is unused for now).
4590   serial_team->t.t_threads[0] = new_thr;
4591   KF_TRACE(10,
4592            ("__kmp_allocate_thread: after th_serial/serial_team : new_thr=%p\n",
4593             new_thr));
4594 
4595   /* setup the thread structures */
4596   __kmp_initialize_info(new_thr, team, new_tid, new_gtid);
4597 
4598 #if USE_FAST_MEMORY
4599   __kmp_initialize_fast_memory(new_thr);
4600 #endif /* USE_FAST_MEMORY */
4601 
4602 #if KMP_USE_BGET
4603   KMP_DEBUG_ASSERT(new_thr->th.th_local.bget_data == NULL);
4604   __kmp_initialize_bget(new_thr);
4605 #endif
4606 
4607   __kmp_init_random(new_thr); // Initialize random number generator
4608 
4609   /* Initialize these only once when thread is grabbed for a team allocation */
4610   KA_TRACE(20,
4611            ("__kmp_allocate_thread: T#%d init go fork=%u, plain=%u\n",
4612             __kmp_get_gtid(), KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
4613 
4614   int b;
4615   kmp_balign_t *balign = new_thr->th.th_bar;
4616   for (b = 0; b < bs_last_barrier; ++b) {
4617     balign[b].bb.b_go = KMP_INIT_BARRIER_STATE;
4618     balign[b].bb.team = NULL;
4619     balign[b].bb.wait_flag = KMP_BARRIER_NOT_WAITING;
4620     balign[b].bb.use_oncore_barrier = 0;
4621   }
4622 
4623   TCW_PTR(new_thr->th.th_sleep_loc, NULL);
4624   new_thr->th.th_sleep_loc_type = flag_unset;
4625 
4626   new_thr->th.th_spin_here = FALSE;
4627   new_thr->th.th_next_waiting = 0;
4628 #if KMP_OS_UNIX
4629   new_thr->th.th_blocking = false;
4630 #endif
4631 
4632 #if KMP_AFFINITY_SUPPORTED
4633   new_thr->th.th_current_place = KMP_PLACE_UNDEFINED;
4634   new_thr->th.th_new_place = KMP_PLACE_UNDEFINED;
4635   new_thr->th.th_first_place = KMP_PLACE_UNDEFINED;
4636   new_thr->th.th_last_place = KMP_PLACE_UNDEFINED;
4637 #endif
4638   new_thr->th.th_def_allocator = __kmp_def_allocator;
4639   new_thr->th.th_prev_level = 0;
4640   new_thr->th.th_prev_num_threads = 1;
4641 
4642   TCW_4(new_thr->th.th_in_pool, FALSE);
4643   new_thr->th.th_active_in_pool = FALSE;
4644   TCW_4(new_thr->th.th_active, TRUE);
4645 
4646   /* adjust the global counters */
4647   __kmp_all_nth++;
4648   __kmp_nth++;
4649 
4650   // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search) for low
4651   // numbers of procs, and method #2 (keyed API call) for higher numbers.
4652   if (__kmp_adjust_gtid_mode) {
4653     if (__kmp_all_nth >= __kmp_tls_gtid_min) {
4654       if (TCR_4(__kmp_gtid_mode) != 2) {
4655         TCW_4(__kmp_gtid_mode, 2);
4656       }
4657     } else {
4658       if (TCR_4(__kmp_gtid_mode) != 1) {
4659         TCW_4(__kmp_gtid_mode, 1);
4660       }
4661     }
4662   }
4663 
4664 #ifdef KMP_ADJUST_BLOCKTIME
4665   /* Adjust blocktime back to zero if necessary       */
4666   /* Middle initialization might not have occurred yet */
4667   if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
4668     if (__kmp_nth > __kmp_avail_proc) {
4669       __kmp_zero_bt = TRUE;
4670     }
4671   }
4672 #endif /* KMP_ADJUST_BLOCKTIME */
4673 
4674   /* actually fork it and create the new worker thread */
4675   KF_TRACE(
4676       10, ("__kmp_allocate_thread: before __kmp_create_worker: %p\n", new_thr));
4677   __kmp_create_worker(new_gtid, new_thr, __kmp_stksize);
4678   KF_TRACE(10,
4679            ("__kmp_allocate_thread: after __kmp_create_worker: %p\n", new_thr));
4680 
4681   KA_TRACE(20, ("__kmp_allocate_thread: T#%d forked T#%d\n", __kmp_get_gtid(),
4682                 new_gtid));
4683   KMP_MB();
4684   return new_thr;
4685 }
4686 
4687 /* Reinitialize team for reuse.
4688    The hot team code calls this case at every fork barrier, so EPCC barrier
4689    test are extremely sensitive to changes in it, esp. writes to the team
4690    struct, which cause a cache invalidation in all threads.
4691    IF YOU TOUCH THIS ROUTINE, RUN EPCC C SYNCBENCH ON A BIG-IRON MACHINE!!! */
4692 static void __kmp_reinitialize_team(kmp_team_t *team,
4693                                     kmp_internal_control_t *new_icvs,
4694                                     ident_t *loc) {
4695   KF_TRACE(10, ("__kmp_reinitialize_team: enter this_thread=%p team=%p\n",
4696                 team->t.t_threads[0], team));
4697   KMP_DEBUG_ASSERT(team && new_icvs);
4698   KMP_DEBUG_ASSERT((!TCR_4(__kmp_init_parallel)) || new_icvs->nproc);
4699   KMP_CHECK_UPDATE(team->t.t_ident, loc);
4700 
4701   KMP_CHECK_UPDATE(team->t.t_id, KMP_GEN_TEAM_ID());
4702   // Copy ICVs to the primary thread's implicit taskdata
4703   __kmp_init_implicit_task(loc, team->t.t_threads[0], team, 0, FALSE);
4704   copy_icvs(&team->t.t_implicit_task_taskdata[0].td_icvs, new_icvs);
4705 
4706   KF_TRACE(10, ("__kmp_reinitialize_team: exit this_thread=%p team=%p\n",
4707                 team->t.t_threads[0], team));
4708 }
4709 
4710 /* Initialize the team data structure.
4711    This assumes the t_threads and t_max_nproc are already set.
4712    Also, we don't touch the arguments */
4713 static void __kmp_initialize_team(kmp_team_t *team, int new_nproc,
4714                                   kmp_internal_control_t *new_icvs,
4715                                   ident_t *loc) {
4716   KF_TRACE(10, ("__kmp_initialize_team: enter: team=%p\n", team));
4717 
4718   /* verify */
4719   KMP_DEBUG_ASSERT(team);
4720   KMP_DEBUG_ASSERT(new_nproc <= team->t.t_max_nproc);
4721   KMP_DEBUG_ASSERT(team->t.t_threads);
4722   KMP_MB();
4723 
4724   team->t.t_master_tid = 0; /* not needed */
4725   /* team->t.t_master_bar;        not needed */
4726   team->t.t_serialized = new_nproc > 1 ? 0 : 1;
4727   team->t.t_nproc = new_nproc;
4728 
4729   /* team->t.t_parent     = NULL; TODO not needed & would mess up hot team */
4730   team->t.t_next_pool = NULL;
4731   /* memset( team->t.t_threads, 0, sizeof(kmp_info_t*)*new_nproc ); would mess
4732    * up hot team */
4733 
4734   TCW_SYNC_PTR(team->t.t_pkfn, NULL); /* not needed */
4735   team->t.t_invoke = NULL; /* not needed */
4736 
4737   // TODO???: team->t.t_max_active_levels       = new_max_active_levels;
4738   team->t.t_sched.sched = new_icvs->sched.sched;
4739 
4740 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
4741   team->t.t_fp_control_saved = FALSE; /* not needed */
4742   team->t.t_x87_fpu_control_word = 0; /* not needed */
4743   team->t.t_mxcsr = 0; /* not needed */
4744 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
4745 
4746   team->t.t_construct = 0;
4747 
4748   team->t.t_ordered.dt.t_value = 0;
4749   team->t.t_master_active = FALSE;
4750 
4751 #ifdef KMP_DEBUG
4752   team->t.t_copypriv_data = NULL; /* not necessary, but nice for debugging */
4753 #endif
4754 #if KMP_OS_WINDOWS
4755   team->t.t_copyin_counter = 0; /* for barrier-free copyin implementation */
4756 #endif
4757 
4758   team->t.t_control_stack_top = NULL;
4759 
4760   __kmp_reinitialize_team(team, new_icvs, loc);
4761 
4762   KMP_MB();
4763   KF_TRACE(10, ("__kmp_initialize_team: exit: team=%p\n", team));
4764 }
4765 
4766 #if KMP_AFFINITY_SUPPORTED
4767 
4768 // __kmp_partition_places() is the heart of the OpenMP 4.0 affinity mechanism.
4769 // It calculates the worker + primary thread's partition based upon the parent
4770 // thread's partition, and binds each worker to a thread in their partition.
4771 // The primary thread's partition should already include its current binding.
4772 static void __kmp_partition_places(kmp_team_t *team, int update_master_only) {
4773   // Do not partition places for the hidden helper team
4774   if (KMP_HIDDEN_HELPER_TEAM(team))
4775     return;
4776   // Copy the primary thread's place partition to the team struct
4777   kmp_info_t *master_th = team->t.t_threads[0];
4778   KMP_DEBUG_ASSERT(master_th != NULL);
4779   kmp_proc_bind_t proc_bind = team->t.t_proc_bind;
4780   int first_place = master_th->th.th_first_place;
4781   int last_place = master_th->th.th_last_place;
4782   int masters_place = master_th->th.th_current_place;
4783   int num_masks = __kmp_affinity.num_masks;
4784   team->t.t_first_place = first_place;
4785   team->t.t_last_place = last_place;
4786 
4787   KA_TRACE(20, ("__kmp_partition_places: enter: proc_bind = %d T#%d(%d:0) "
4788                 "bound to place %d partition = [%d,%d]\n",
4789                 proc_bind, __kmp_gtid_from_thread(team->t.t_threads[0]),
4790                 team->t.t_id, masters_place, first_place, last_place));
4791 
4792   switch (proc_bind) {
4793 
4794   case proc_bind_default:
4795     // Serial teams might have the proc_bind policy set to proc_bind_default.
4796     // Not an issue -- we don't rebind primary thread for any proc_bind policy.
4797     KMP_DEBUG_ASSERT(team->t.t_nproc == 1);
4798     break;
4799 
4800   case proc_bind_primary: {
4801     int f;
4802     int n_th = team->t.t_nproc;
4803     for (f = 1; f < n_th; f++) {
4804       kmp_info_t *th = team->t.t_threads[f];
4805       KMP_DEBUG_ASSERT(th != NULL);
4806       th->th.th_first_place = first_place;
4807       th->th.th_last_place = last_place;
4808       th->th.th_new_place = masters_place;
4809       if (__kmp_display_affinity && masters_place != th->th.th_current_place &&
4810           team->t.t_display_affinity != 1) {
4811         team->t.t_display_affinity = 1;
4812       }
4813 
4814       KA_TRACE(100, ("__kmp_partition_places: primary: T#%d(%d:%d) place %d "
4815                      "partition = [%d,%d]\n",
4816                      __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id,
4817                      f, masters_place, first_place, last_place));
4818     }
4819   } break;
4820 
4821   case proc_bind_close: {
4822     int f;
4823     int n_th = team->t.t_nproc;
4824     int n_places;
4825     if (first_place <= last_place) {
4826       n_places = last_place - first_place + 1;
4827     } else {
4828       n_places = num_masks - first_place + last_place + 1;
4829     }
4830     if (n_th <= n_places) {
4831       int place = masters_place;
4832       for (f = 1; f < n_th; f++) {
4833         kmp_info_t *th = team->t.t_threads[f];
4834         KMP_DEBUG_ASSERT(th != NULL);
4835 
4836         if (place == last_place) {
4837           place = first_place;
4838         } else if (place == (num_masks - 1)) {
4839           place = 0;
4840         } else {
4841           place++;
4842         }
4843         th->th.th_first_place = first_place;
4844         th->th.th_last_place = last_place;
4845         th->th.th_new_place = place;
4846         if (__kmp_display_affinity && place != th->th.th_current_place &&
4847             team->t.t_display_affinity != 1) {
4848           team->t.t_display_affinity = 1;
4849         }
4850 
4851         KA_TRACE(100, ("__kmp_partition_places: close: T#%d(%d:%d) place %d "
4852                        "partition = [%d,%d]\n",
4853                        __kmp_gtid_from_thread(team->t.t_threads[f]),
4854                        team->t.t_id, f, place, first_place, last_place));
4855       }
4856     } else {
4857       int S, rem, gap, s_count;
4858       S = n_th / n_places;
4859       s_count = 0;
4860       rem = n_th - (S * n_places);
4861       gap = rem > 0 ? n_places / rem : n_places;
4862       int place = masters_place;
4863       int gap_ct = gap;
4864       for (f = 0; f < n_th; f++) {
4865         kmp_info_t *th = team->t.t_threads[f];
4866         KMP_DEBUG_ASSERT(th != NULL);
4867 
4868         th->th.th_first_place = first_place;
4869         th->th.th_last_place = last_place;
4870         th->th.th_new_place = place;
4871         if (__kmp_display_affinity && place != th->th.th_current_place &&
4872             team->t.t_display_affinity != 1) {
4873           team->t.t_display_affinity = 1;
4874         }
4875         s_count++;
4876 
4877         if ((s_count == S) && rem && (gap_ct == gap)) {
4878           // do nothing, add an extra thread to place on next iteration
4879         } else if ((s_count == S + 1) && rem && (gap_ct == gap)) {
4880           // we added an extra thread to this place; move to next place
4881           if (place == last_place) {
4882             place = first_place;
4883           } else if (place == (num_masks - 1)) {
4884             place = 0;
4885           } else {
4886             place++;
4887           }
4888           s_count = 0;
4889           gap_ct = 1;
4890           rem--;
4891         } else if (s_count == S) { // place full; don't add extra
4892           if (place == last_place) {
4893             place = first_place;
4894           } else if (place == (num_masks - 1)) {
4895             place = 0;
4896           } else {
4897             place++;
4898           }
4899           gap_ct++;
4900           s_count = 0;
4901         }
4902 
4903         KA_TRACE(100,
4904                  ("__kmp_partition_places: close: T#%d(%d:%d) place %d "
4905                   "partition = [%d,%d]\n",
4906                   __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id, f,
4907                   th->th.th_new_place, first_place, last_place));
4908       }
4909       KMP_DEBUG_ASSERT(place == masters_place);
4910     }
4911   } break;
4912 
4913   case proc_bind_spread: {
4914     int f;
4915     int n_th = team->t.t_nproc;
4916     int n_places;
4917     int thidx;
4918     if (first_place <= last_place) {
4919       n_places = last_place - first_place + 1;
4920     } else {
4921       n_places = num_masks - first_place + last_place + 1;
4922     }
4923     if (n_th <= n_places) {
4924       int place = -1;
4925 
4926       if (n_places != num_masks) {
4927         int S = n_places / n_th;
4928         int s_count, rem, gap, gap_ct;
4929 
4930         place = masters_place;
4931         rem = n_places - n_th * S;
4932         gap = rem ? n_th / rem : 1;
4933         gap_ct = gap;
4934         thidx = n_th;
4935         if (update_master_only == 1)
4936           thidx = 1;
4937         for (f = 0; f < thidx; f++) {
4938           kmp_info_t *th = team->t.t_threads[f];
4939           KMP_DEBUG_ASSERT(th != NULL);
4940 
4941           th->th.th_first_place = place;
4942           th->th.th_new_place = place;
4943           if (__kmp_display_affinity && place != th->th.th_current_place &&
4944               team->t.t_display_affinity != 1) {
4945             team->t.t_display_affinity = 1;
4946           }
4947           s_count = 1;
4948           while (s_count < S) {
4949             if (place == last_place) {
4950               place = first_place;
4951             } else if (place == (num_masks - 1)) {
4952               place = 0;
4953             } else {
4954               place++;
4955             }
4956             s_count++;
4957           }
4958           if (rem && (gap_ct == gap)) {
4959             if (place == last_place) {
4960               place = first_place;
4961             } else if (place == (num_masks - 1)) {
4962               place = 0;
4963             } else {
4964               place++;
4965             }
4966             rem--;
4967             gap_ct = 0;
4968           }
4969           th->th.th_last_place = place;
4970           gap_ct++;
4971 
4972           if (place == last_place) {
4973             place = first_place;
4974           } else if (place == (num_masks - 1)) {
4975             place = 0;
4976           } else {
4977             place++;
4978           }
4979 
4980           KA_TRACE(100,
4981                    ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
4982                     "partition = [%d,%d], num_masks: %u\n",
4983                     __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id,
4984                     f, th->th.th_new_place, th->th.th_first_place,
4985                     th->th.th_last_place, num_masks));
4986         }
4987       } else {
4988         /* Having uniform space of available computation places I can create
4989            T partitions of round(P/T) size and put threads into the first
4990            place of each partition. */
4991         double current = static_cast<double>(masters_place);
4992         double spacing =
4993             (static_cast<double>(n_places + 1) / static_cast<double>(n_th));
4994         int first, last;
4995         kmp_info_t *th;
4996 
4997         thidx = n_th + 1;
4998         if (update_master_only == 1)
4999           thidx = 1;
5000         for (f = 0; f < thidx; f++) {
5001           first = static_cast<int>(current);
5002           last = static_cast<int>(current + spacing) - 1;
5003           KMP_DEBUG_ASSERT(last >= first);
5004           if (first >= n_places) {
5005             if (masters_place) {
5006               first -= n_places;
5007               last -= n_places;
5008               if (first == (masters_place + 1)) {
5009                 KMP_DEBUG_ASSERT(f == n_th);
5010                 first--;
5011               }
5012               if (last == masters_place) {
5013                 KMP_DEBUG_ASSERT(f == (n_th - 1));
5014                 last--;
5015               }
5016             } else {
5017               KMP_DEBUG_ASSERT(f == n_th);
5018               first = 0;
5019               last = 0;
5020             }
5021           }
5022           if (last >= n_places) {
5023             last = (n_places - 1);
5024           }
5025           place = first;
5026           current += spacing;
5027           if (f < n_th) {
5028             KMP_DEBUG_ASSERT(0 <= first);
5029             KMP_DEBUG_ASSERT(n_places > first);
5030             KMP_DEBUG_ASSERT(0 <= last);
5031             KMP_DEBUG_ASSERT(n_places > last);
5032             KMP_DEBUG_ASSERT(last_place >= first_place);
5033             th = team->t.t_threads[f];
5034             KMP_DEBUG_ASSERT(th);
5035             th->th.th_first_place = first;
5036             th->th.th_new_place = place;
5037             th->th.th_last_place = last;
5038             if (__kmp_display_affinity && place != th->th.th_current_place &&
5039                 team->t.t_display_affinity != 1) {
5040               team->t.t_display_affinity = 1;
5041             }
5042             KA_TRACE(100,
5043                      ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
5044                       "partition = [%d,%d], spacing = %.4f\n",
5045                       __kmp_gtid_from_thread(team->t.t_threads[f]),
5046                       team->t.t_id, f, th->th.th_new_place,
5047                       th->th.th_first_place, th->th.th_last_place, spacing));
5048           }
5049         }
5050       }
5051       KMP_DEBUG_ASSERT(update_master_only || place == masters_place);
5052     } else {
5053       int S, rem, gap, s_count;
5054       S = n_th / n_places;
5055       s_count = 0;
5056       rem = n_th - (S * n_places);
5057       gap = rem > 0 ? n_places / rem : n_places;
5058       int place = masters_place;
5059       int gap_ct = gap;
5060       thidx = n_th;
5061       if (update_master_only == 1)
5062         thidx = 1;
5063       for (f = 0; f < thidx; f++) {
5064         kmp_info_t *th = team->t.t_threads[f];
5065         KMP_DEBUG_ASSERT(th != NULL);
5066 
5067         th->th.th_first_place = place;
5068         th->th.th_last_place = place;
5069         th->th.th_new_place = place;
5070         if (__kmp_display_affinity && place != th->th.th_current_place &&
5071             team->t.t_display_affinity != 1) {
5072           team->t.t_display_affinity = 1;
5073         }
5074         s_count++;
5075 
5076         if ((s_count == S) && rem && (gap_ct == gap)) {
5077           // do nothing, add an extra thread to place on next iteration
5078         } else if ((s_count == S + 1) && rem && (gap_ct == gap)) {
5079           // we added an extra thread to this place; move on to next place
5080           if (place == last_place) {
5081             place = first_place;
5082           } else if (place == (num_masks - 1)) {
5083             place = 0;
5084           } else {
5085             place++;
5086           }
5087           s_count = 0;
5088           gap_ct = 1;
5089           rem--;
5090         } else if (s_count == S) { // place is full; don't add extra thread
5091           if (place == last_place) {
5092             place = first_place;
5093           } else if (place == (num_masks - 1)) {
5094             place = 0;
5095           } else {
5096             place++;
5097           }
5098           gap_ct++;
5099           s_count = 0;
5100         }
5101 
5102         KA_TRACE(100, ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
5103                        "partition = [%d,%d]\n",
5104                        __kmp_gtid_from_thread(team->t.t_threads[f]),
5105                        team->t.t_id, f, th->th.th_new_place,
5106                        th->th.th_first_place, th->th.th_last_place));
5107       }
5108       KMP_DEBUG_ASSERT(update_master_only || place == masters_place);
5109     }
5110   } break;
5111 
5112   default:
5113     break;
5114   }
5115 
5116   KA_TRACE(20, ("__kmp_partition_places: exit T#%d\n", team->t.t_id));
5117 }
5118 
5119 #endif // KMP_AFFINITY_SUPPORTED
5120 
5121 /* allocate a new team data structure to use.  take one off of the free pool if
5122    available */
5123 kmp_team_t *
5124 __kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc,
5125 #if OMPT_SUPPORT
5126                     ompt_data_t ompt_parallel_data,
5127 #endif
5128                     kmp_proc_bind_t new_proc_bind,
5129                     kmp_internal_control_t *new_icvs,
5130                     int argc USE_NESTED_HOT_ARG(kmp_info_t *master)) {
5131   KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_allocate_team);
5132   int f;
5133   kmp_team_t *team;
5134   int use_hot_team = !root->r.r_active;
5135   int level = 0;
5136   int do_place_partition = 1;
5137 
5138   KA_TRACE(20, ("__kmp_allocate_team: called\n"));
5139   KMP_DEBUG_ASSERT(new_nproc >= 1 && argc >= 0);
5140   KMP_DEBUG_ASSERT(max_nproc >= new_nproc);
5141   KMP_MB();
5142 
5143 #if KMP_NESTED_HOT_TEAMS
5144   kmp_hot_team_ptr_t *hot_teams;
5145   if (master) {
5146     team = master->th.th_team;
5147     level = team->t.t_active_level;
5148     if (master->th.th_teams_microtask) { // in teams construct?
5149       if (master->th.th_teams_size.nteams > 1 &&
5150           ( // #teams > 1
5151               team->t.t_pkfn ==
5152                   (microtask_t)__kmp_teams_master || // inner fork of the teams
5153               master->th.th_teams_level <
5154                   team->t.t_level)) { // or nested parallel inside the teams
5155         ++level; // not increment if #teams==1, or for outer fork of the teams;
5156         // increment otherwise
5157       }
5158       // Do not perform the place partition if inner fork of the teams
5159       // Wait until nested parallel region encountered inside teams construct
5160       if ((master->th.th_teams_size.nteams == 1 &&
5161            master->th.th_teams_level >= team->t.t_level) ||
5162           (team->t.t_pkfn == (microtask_t)__kmp_teams_master))
5163         do_place_partition = 0;
5164     }
5165     hot_teams = master->th.th_hot_teams;
5166     if (level < __kmp_hot_teams_max_level && hot_teams &&
5167         hot_teams[level].hot_team) {
5168       // hot team has already been allocated for given level
5169       use_hot_team = 1;
5170     } else {
5171       use_hot_team = 0;
5172     }
5173   } else {
5174     // check we won't access uninitialized hot_teams, just in case
5175     KMP_DEBUG_ASSERT(new_nproc == 1);
5176   }
5177 #endif
5178   // Optimization to use a "hot" team
5179   if (use_hot_team && new_nproc > 1) {
5180     KMP_DEBUG_ASSERT(new_nproc <= max_nproc);
5181 #if KMP_NESTED_HOT_TEAMS
5182     team = hot_teams[level].hot_team;
5183 #else
5184     team = root->r.r_hot_team;
5185 #endif
5186 #if KMP_DEBUG
5187     if (__kmp_tasking_mode != tskm_immediate_exec) {
5188       KA_TRACE(20, ("__kmp_allocate_team: hot team task_team[0] = %p "
5189                     "task_team[1] = %p before reinit\n",
5190                     team->t.t_task_team[0], team->t.t_task_team[1]));
5191     }
5192 #endif
5193 
5194     if (team->t.t_nproc != new_nproc &&
5195         __kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5196       // Distributed barrier may need a resize
5197       int old_nthr = team->t.t_nproc;
5198       __kmp_resize_dist_barrier(team, old_nthr, new_nproc);
5199     }
5200 
5201     // If not doing the place partition, then reset the team's proc bind
5202     // to indicate that partitioning of all threads still needs to take place
5203     if (do_place_partition == 0)
5204       team->t.t_proc_bind = proc_bind_default;
5205     // Has the number of threads changed?
5206     /* Let's assume the most common case is that the number of threads is
5207        unchanged, and put that case first. */
5208     if (team->t.t_nproc == new_nproc) { // Check changes in number of threads
5209       KA_TRACE(20, ("__kmp_allocate_team: reusing hot team\n"));
5210       // This case can mean that omp_set_num_threads() was called and the hot
5211       // team size was already reduced, so we check the special flag
5212       if (team->t.t_size_changed == -1) {
5213         team->t.t_size_changed = 1;
5214       } else {
5215         KMP_CHECK_UPDATE(team->t.t_size_changed, 0);
5216       }
5217 
5218       // TODO???: team->t.t_max_active_levels = new_max_active_levels;
5219       kmp_r_sched_t new_sched = new_icvs->sched;
5220       // set primary thread's schedule as new run-time schedule
5221       KMP_CHECK_UPDATE(team->t.t_sched.sched, new_sched.sched);
5222 
5223       __kmp_reinitialize_team(team, new_icvs,
5224                               root->r.r_uber_thread->th.th_ident);
5225 
5226       KF_TRACE(10, ("__kmp_allocate_team2: T#%d, this_thread=%p team=%p\n", 0,
5227                     team->t.t_threads[0], team));
5228       __kmp_push_current_task_to_thread(team->t.t_threads[0], team, 0);
5229 
5230 #if KMP_AFFINITY_SUPPORTED
5231       if ((team->t.t_size_changed == 0) &&
5232           (team->t.t_proc_bind == new_proc_bind)) {
5233         if (new_proc_bind == proc_bind_spread) {
5234           if (do_place_partition) {
5235             // add flag to update only master for spread
5236             __kmp_partition_places(team, 1);
5237           }
5238         }
5239         KA_TRACE(200, ("__kmp_allocate_team: reusing hot team #%d bindings: "
5240                        "proc_bind = %d, partition = [%d,%d]\n",
5241                        team->t.t_id, new_proc_bind, team->t.t_first_place,
5242                        team->t.t_last_place));
5243       } else {
5244         if (do_place_partition) {
5245           KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5246           __kmp_partition_places(team);
5247         }
5248       }
5249 #else
5250       KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5251 #endif /* KMP_AFFINITY_SUPPORTED */
5252     } else if (team->t.t_nproc > new_nproc) {
5253       KA_TRACE(20,
5254                ("__kmp_allocate_team: decreasing hot team thread count to %d\n",
5255                 new_nproc));
5256 
5257       team->t.t_size_changed = 1;
5258       if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5259         // Barrier size already reduced earlier in this function
5260         // Activate team threads via th_used_in_team
5261         __kmp_add_threads_to_team(team, new_nproc);
5262       }
5263 #if KMP_NESTED_HOT_TEAMS
5264       if (__kmp_hot_teams_mode == 0) {
5265         // AC: saved number of threads should correspond to team's value in this
5266         // mode, can be bigger in mode 1, when hot team has threads in reserve
5267         KMP_DEBUG_ASSERT(hot_teams[level].hot_team_nth == team->t.t_nproc);
5268         hot_teams[level].hot_team_nth = new_nproc;
5269 #endif // KMP_NESTED_HOT_TEAMS
5270         /* release the extra threads we don't need any more */
5271         for (f = new_nproc; f < team->t.t_nproc; f++) {
5272           KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5273           if (__kmp_tasking_mode != tskm_immediate_exec) {
5274             // When decreasing team size, threads no longer in the team should
5275             // unref task team.
5276             team->t.t_threads[f]->th.th_task_team = NULL;
5277           }
5278           __kmp_free_thread(team->t.t_threads[f]);
5279           team->t.t_threads[f] = NULL;
5280         }
5281 #if KMP_NESTED_HOT_TEAMS
5282       } // (__kmp_hot_teams_mode == 0)
5283       else {
5284         // When keeping extra threads in team, switch threads to wait on own
5285         // b_go flag
5286         for (f = new_nproc; f < team->t.t_nproc; ++f) {
5287           KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5288           kmp_balign_t *balign = team->t.t_threads[f]->th.th_bar;
5289           for (int b = 0; b < bs_last_barrier; ++b) {
5290             if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG) {
5291               balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG;
5292             }
5293             KMP_CHECK_UPDATE(balign[b].bb.leaf_kids, 0);
5294           }
5295         }
5296       }
5297 #endif // KMP_NESTED_HOT_TEAMS
5298       team->t.t_nproc = new_nproc;
5299       // TODO???: team->t.t_max_active_levels = new_max_active_levels;
5300       KMP_CHECK_UPDATE(team->t.t_sched.sched, new_icvs->sched.sched);
5301       __kmp_reinitialize_team(team, new_icvs,
5302                               root->r.r_uber_thread->th.th_ident);
5303 
5304       // Update remaining threads
5305       for (f = 0; f < new_nproc; ++f) {
5306         team->t.t_threads[f]->th.th_team_nproc = new_nproc;
5307       }
5308 
5309       // restore the current task state of the primary thread: should be the
5310       // implicit task
5311       KF_TRACE(10, ("__kmp_allocate_team: T#%d, this_thread=%p team=%p\n", 0,
5312                     team->t.t_threads[0], team));
5313 
5314       __kmp_push_current_task_to_thread(team->t.t_threads[0], team, 0);
5315 
5316 #ifdef KMP_DEBUG
5317       for (f = 0; f < team->t.t_nproc; f++) {
5318         KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
5319                          team->t.t_threads[f]->th.th_team_nproc ==
5320                              team->t.t_nproc);
5321       }
5322 #endif
5323 
5324       if (do_place_partition) {
5325         KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5326 #if KMP_AFFINITY_SUPPORTED
5327         __kmp_partition_places(team);
5328 #endif
5329       }
5330     } else { // team->t.t_nproc < new_nproc
5331 
5332       KA_TRACE(20,
5333                ("__kmp_allocate_team: increasing hot team thread count to %d\n",
5334                 new_nproc));
5335       int old_nproc = team->t.t_nproc; // save old value and use to update only
5336       team->t.t_size_changed = 1;
5337 
5338 #if KMP_NESTED_HOT_TEAMS
5339       int avail_threads = hot_teams[level].hot_team_nth;
5340       if (new_nproc < avail_threads)
5341         avail_threads = new_nproc;
5342       kmp_info_t **other_threads = team->t.t_threads;
5343       for (f = team->t.t_nproc; f < avail_threads; ++f) {
5344         // Adjust barrier data of reserved threads (if any) of the team
5345         // Other data will be set in __kmp_initialize_info() below.
5346         int b;
5347         kmp_balign_t *balign = other_threads[f]->th.th_bar;
5348         for (b = 0; b < bs_last_barrier; ++b) {
5349           balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5350           KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
5351 #if USE_DEBUGGER
5352           balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5353 #endif
5354         }
5355       }
5356       if (hot_teams[level].hot_team_nth >= new_nproc) {
5357         // we have all needed threads in reserve, no need to allocate any
5358         // this only possible in mode 1, cannot have reserved threads in mode 0
5359         KMP_DEBUG_ASSERT(__kmp_hot_teams_mode == 1);
5360         team->t.t_nproc = new_nproc; // just get reserved threads involved
5361       } else {
5362         // We may have some threads in reserve, but not enough;
5363         // get reserved threads involved if any.
5364         team->t.t_nproc = hot_teams[level].hot_team_nth;
5365         hot_teams[level].hot_team_nth = new_nproc; // adjust hot team max size
5366 #endif // KMP_NESTED_HOT_TEAMS
5367         if (team->t.t_max_nproc < new_nproc) {
5368           /* reallocate larger arrays */
5369           __kmp_reallocate_team_arrays(team, new_nproc);
5370           __kmp_reinitialize_team(team, new_icvs, NULL);
5371         }
5372 
5373 #if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED
5374         /* Temporarily set full mask for primary thread before creation of
5375            workers. The reason is that workers inherit the affinity from the
5376            primary thread, so if a lot of workers are created on the single
5377            core quickly, they don't get a chance to set their own affinity for
5378            a long time. */
5379         kmp_affinity_raii_t new_temp_affinity{__kmp_affin_fullMask};
5380 #endif
5381 
5382         /* allocate new threads for the hot team */
5383         for (f = team->t.t_nproc; f < new_nproc; f++) {
5384           kmp_info_t *new_worker = __kmp_allocate_thread(root, team, f);
5385           KMP_DEBUG_ASSERT(new_worker);
5386           team->t.t_threads[f] = new_worker;
5387 
5388           KA_TRACE(20,
5389                    ("__kmp_allocate_team: team %d init T#%d arrived: "
5390                     "join=%llu, plain=%llu\n",
5391                     team->t.t_id, __kmp_gtid_from_tid(f, team), team->t.t_id, f,
5392                     team->t.t_bar[bs_forkjoin_barrier].b_arrived,
5393                     team->t.t_bar[bs_plain_barrier].b_arrived));
5394 
5395           { // Initialize barrier data for new threads.
5396             int b;
5397             kmp_balign_t *balign = new_worker->th.th_bar;
5398             for (b = 0; b < bs_last_barrier; ++b) {
5399               balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5400               KMP_DEBUG_ASSERT(balign[b].bb.wait_flag !=
5401                                KMP_BARRIER_PARENT_FLAG);
5402 #if USE_DEBUGGER
5403               balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5404 #endif
5405             }
5406           }
5407         }
5408 
5409 #if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED
5410         /* Restore initial primary thread's affinity mask */
5411         new_temp_affinity.restore();
5412 #endif
5413 #if KMP_NESTED_HOT_TEAMS
5414       } // end of check of t_nproc vs. new_nproc vs. hot_team_nth
5415 #endif // KMP_NESTED_HOT_TEAMS
5416       if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5417         // Barrier size already increased earlier in this function
5418         // Activate team threads via th_used_in_team
5419         __kmp_add_threads_to_team(team, new_nproc);
5420       }
5421       /* make sure everyone is syncronized */
5422       // new threads below
5423       __kmp_initialize_team(team, new_nproc, new_icvs,
5424                             root->r.r_uber_thread->th.th_ident);
5425 
5426       /* reinitialize the threads */
5427       KMP_DEBUG_ASSERT(team->t.t_nproc == new_nproc);
5428       for (f = 0; f < team->t.t_nproc; ++f)
5429         __kmp_initialize_info(team->t.t_threads[f], team, f,
5430                               __kmp_gtid_from_tid(f, team));
5431 
5432       // set th_task_state for new threads in hot team with older thread's state
5433       kmp_uint8 old_state = team->t.t_threads[old_nproc - 1]->th.th_task_state;
5434       for (f = old_nproc; f < team->t.t_nproc; ++f)
5435         team->t.t_threads[f]->th.th_task_state = old_state;
5436 
5437 #ifdef KMP_DEBUG
5438       for (f = 0; f < team->t.t_nproc; ++f) {
5439         KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
5440                          team->t.t_threads[f]->th.th_team_nproc ==
5441                              team->t.t_nproc);
5442       }
5443 #endif
5444 
5445       if (do_place_partition) {
5446         KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5447 #if KMP_AFFINITY_SUPPORTED
5448         __kmp_partition_places(team);
5449 #endif
5450       }
5451     } // Check changes in number of threads
5452 
5453     kmp_info_t *master = team->t.t_threads[0];
5454     if (master->th.th_teams_microtask) {
5455       for (f = 1; f < new_nproc; ++f) {
5456         // propagate teams construct specific info to workers
5457         kmp_info_t *thr = team->t.t_threads[f];
5458         thr->th.th_teams_microtask = master->th.th_teams_microtask;
5459         thr->th.th_teams_level = master->th.th_teams_level;
5460         thr->th.th_teams_size = master->th.th_teams_size;
5461       }
5462     }
5463 #if KMP_NESTED_HOT_TEAMS
5464     if (level) {
5465       // Sync barrier state for nested hot teams, not needed for outermost hot
5466       // team.
5467       for (f = 1; f < new_nproc; ++f) {
5468         kmp_info_t *thr = team->t.t_threads[f];
5469         int b;
5470         kmp_balign_t *balign = thr->th.th_bar;
5471         for (b = 0; b < bs_last_barrier; ++b) {
5472           balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5473           KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
5474 #if USE_DEBUGGER
5475           balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5476 #endif
5477         }
5478       }
5479     }
5480 #endif // KMP_NESTED_HOT_TEAMS
5481 
5482     /* reallocate space for arguments if necessary */
5483     __kmp_alloc_argv_entries(argc, team, TRUE);
5484     KMP_CHECK_UPDATE(team->t.t_argc, argc);
5485     // The hot team re-uses the previous task team,
5486     // if untouched during the previous release->gather phase.
5487 
5488     KF_TRACE(10, (" hot_team = %p\n", team));
5489 
5490 #if KMP_DEBUG
5491     if (__kmp_tasking_mode != tskm_immediate_exec) {
5492       KA_TRACE(20, ("__kmp_allocate_team: hot team task_team[0] = %p "
5493                     "task_team[1] = %p after reinit\n",
5494                     team->t.t_task_team[0], team->t.t_task_team[1]));
5495     }
5496 #endif
5497 
5498 #if OMPT_SUPPORT
5499     __ompt_team_assign_id(team, ompt_parallel_data);
5500 #endif
5501 
5502     KMP_MB();
5503 
5504     return team;
5505   }
5506 
5507   /* next, let's try to take one from the team pool */
5508   KMP_MB();
5509   for (team = CCAST(kmp_team_t *, __kmp_team_pool); (team);) {
5510     /* TODO: consider resizing undersized teams instead of reaping them, now
5511        that we have a resizing mechanism */
5512     if (team->t.t_max_nproc >= max_nproc) {
5513       /* take this team from the team pool */
5514       __kmp_team_pool = team->t.t_next_pool;
5515 
5516       if (max_nproc > 1 &&
5517           __kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5518         if (!team->t.b) { // Allocate barrier structure
5519           team->t.b = distributedBarrier::allocate(__kmp_dflt_team_nth_ub);
5520         }
5521       }
5522 
5523       /* setup the team for fresh use */
5524       __kmp_initialize_team(team, new_nproc, new_icvs, NULL);
5525 
5526       KA_TRACE(20, ("__kmp_allocate_team: setting task_team[0] %p and "
5527                     "task_team[1] %p to NULL\n",
5528                     &team->t.t_task_team[0], &team->t.t_task_team[1]));
5529       team->t.t_task_team[0] = NULL;
5530       team->t.t_task_team[1] = NULL;
5531 
5532       /* reallocate space for arguments if necessary */
5533       __kmp_alloc_argv_entries(argc, team, TRUE);
5534       KMP_CHECK_UPDATE(team->t.t_argc, argc);
5535 
5536       KA_TRACE(
5537           20, ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n",
5538                team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
5539       { // Initialize barrier data.
5540         int b;
5541         for (b = 0; b < bs_last_barrier; ++b) {
5542           team->t.t_bar[b].b_arrived = KMP_INIT_BARRIER_STATE;
5543 #if USE_DEBUGGER
5544           team->t.t_bar[b].b_master_arrived = 0;
5545           team->t.t_bar[b].b_team_arrived = 0;
5546 #endif
5547         }
5548       }
5549 
5550       team->t.t_proc_bind = new_proc_bind;
5551 
5552       KA_TRACE(20, ("__kmp_allocate_team: using team from pool %d.\n",
5553                     team->t.t_id));
5554 
5555 #if OMPT_SUPPORT
5556       __ompt_team_assign_id(team, ompt_parallel_data);
5557 #endif
5558 
5559       KMP_MB();
5560 
5561       return team;
5562     }
5563 
5564     /* reap team if it is too small, then loop back and check the next one */
5565     // not sure if this is wise, but, will be redone during the hot-teams
5566     // rewrite.
5567     /* TODO: Use technique to find the right size hot-team, don't reap them */
5568     team = __kmp_reap_team(team);
5569     __kmp_team_pool = team;
5570   }
5571 
5572   /* nothing available in the pool, no matter, make a new team! */
5573   KMP_MB();
5574   team = (kmp_team_t *)__kmp_allocate(sizeof(kmp_team_t));
5575 
5576   /* and set it up */
5577   team->t.t_max_nproc = max_nproc;
5578   if (max_nproc > 1 &&
5579       __kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5580     // Allocate barrier structure
5581     team->t.b = distributedBarrier::allocate(__kmp_dflt_team_nth_ub);
5582   }
5583 
5584   /* NOTE well, for some reason allocating one big buffer and dividing it up
5585      seems to really hurt performance a lot on the P4, so, let's not use this */
5586   __kmp_allocate_team_arrays(team, max_nproc);
5587 
5588   KA_TRACE(20, ("__kmp_allocate_team: making a new team\n"));
5589   __kmp_initialize_team(team, new_nproc, new_icvs, NULL);
5590 
5591   KA_TRACE(20, ("__kmp_allocate_team: setting task_team[0] %p and task_team[1] "
5592                 "%p to NULL\n",
5593                 &team->t.t_task_team[0], &team->t.t_task_team[1]));
5594   team->t.t_task_team[0] = NULL; // to be removed, as __kmp_allocate zeroes
5595   // memory, no need to duplicate
5596   team->t.t_task_team[1] = NULL; // to be removed, as __kmp_allocate zeroes
5597   // memory, no need to duplicate
5598 
5599   if (__kmp_storage_map) {
5600     __kmp_print_team_storage_map("team", team, team->t.t_id, new_nproc);
5601   }
5602 
5603   /* allocate space for arguments */
5604   __kmp_alloc_argv_entries(argc, team, FALSE);
5605   team->t.t_argc = argc;
5606 
5607   KA_TRACE(20,
5608            ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n",
5609             team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
5610   { // Initialize barrier data.
5611     int b;
5612     for (b = 0; b < bs_last_barrier; ++b) {
5613       team->t.t_bar[b].b_arrived = KMP_INIT_BARRIER_STATE;
5614 #if USE_DEBUGGER
5615       team->t.t_bar[b].b_master_arrived = 0;
5616       team->t.t_bar[b].b_team_arrived = 0;
5617 #endif
5618     }
5619   }
5620 
5621   team->t.t_proc_bind = new_proc_bind;
5622 
5623 #if OMPT_SUPPORT
5624   __ompt_team_assign_id(team, ompt_parallel_data);
5625   team->t.ompt_serialized_team_info = NULL;
5626 #endif
5627 
5628   KMP_MB();
5629 
5630   KA_TRACE(20, ("__kmp_allocate_team: done creating a new team %d.\n",
5631                 team->t.t_id));
5632 
5633   return team;
5634 }
5635 
5636 /* TODO implement hot-teams at all levels */
5637 /* TODO implement lazy thread release on demand (disband request) */
5638 
5639 /* free the team.  return it to the team pool.  release all the threads
5640  * associated with it */
5641 void __kmp_free_team(kmp_root_t *root,
5642                      kmp_team_t *team USE_NESTED_HOT_ARG(kmp_info_t *master)) {
5643   int f;
5644   KA_TRACE(20, ("__kmp_free_team: T#%d freeing team %d\n", __kmp_get_gtid(),
5645                 team->t.t_id));
5646 
5647   /* verify state */
5648   KMP_DEBUG_ASSERT(root);
5649   KMP_DEBUG_ASSERT(team);
5650   KMP_DEBUG_ASSERT(team->t.t_nproc <= team->t.t_max_nproc);
5651   KMP_DEBUG_ASSERT(team->t.t_threads);
5652 
5653   int use_hot_team = team == root->r.r_hot_team;
5654 #if KMP_NESTED_HOT_TEAMS
5655   int level;
5656   if (master) {
5657     level = team->t.t_active_level - 1;
5658     if (master->th.th_teams_microtask) { // in teams construct?
5659       if (master->th.th_teams_size.nteams > 1) {
5660         ++level; // level was not increased in teams construct for
5661         // team_of_masters
5662       }
5663       if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
5664           master->th.th_teams_level == team->t.t_level) {
5665         ++level; // level was not increased in teams construct for
5666         // team_of_workers before the parallel
5667       } // team->t.t_level will be increased inside parallel
5668     }
5669 #if KMP_DEBUG
5670     kmp_hot_team_ptr_t *hot_teams = master->th.th_hot_teams;
5671 #endif
5672     if (level < __kmp_hot_teams_max_level) {
5673       KMP_DEBUG_ASSERT(team == hot_teams[level].hot_team);
5674       use_hot_team = 1;
5675     }
5676   }
5677 #endif // KMP_NESTED_HOT_TEAMS
5678 
5679   /* team is done working */
5680   TCW_SYNC_PTR(team->t.t_pkfn,
5681                NULL); // Important for Debugging Support Library.
5682 #if KMP_OS_WINDOWS
5683   team->t.t_copyin_counter = 0; // init counter for possible reuse
5684 #endif
5685   // Do not reset pointer to parent team to NULL for hot teams.
5686 
5687   /* if we are non-hot team, release our threads */
5688   if (!use_hot_team) {
5689     if (__kmp_tasking_mode != tskm_immediate_exec) {
5690       // Wait for threads to reach reapable state
5691       for (f = 1; f < team->t.t_nproc; ++f) {
5692         KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5693         kmp_info_t *th = team->t.t_threads[f];
5694         volatile kmp_uint32 *state = &th->th.th_reap_state;
5695         while (*state != KMP_SAFE_TO_REAP) {
5696 #if KMP_OS_WINDOWS
5697           // On Windows a thread can be killed at any time, check this
5698           DWORD ecode;
5699           if (!__kmp_is_thread_alive(th, &ecode)) {
5700             *state = KMP_SAFE_TO_REAP; // reset the flag for dead thread
5701             break;
5702           }
5703 #endif
5704           // first check if thread is sleeping
5705           kmp_flag_64<> fl(&th->th.th_bar[bs_forkjoin_barrier].bb.b_go, th);
5706           if (fl.is_sleeping())
5707             fl.resume(__kmp_gtid_from_thread(th));
5708           KMP_CPU_PAUSE();
5709         }
5710       }
5711 
5712       // Delete task teams
5713       int tt_idx;
5714       for (tt_idx = 0; tt_idx < 2; ++tt_idx) {
5715         kmp_task_team_t *task_team = team->t.t_task_team[tt_idx];
5716         if (task_team != NULL) {
5717           for (f = 0; f < team->t.t_nproc; ++f) { // threads unref task teams
5718             KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5719             team->t.t_threads[f]->th.th_task_team = NULL;
5720           }
5721           KA_TRACE(
5722               20,
5723               ("__kmp_free_team: T#%d deactivating task_team %p on team %d\n",
5724                __kmp_get_gtid(), task_team, team->t.t_id));
5725 #if KMP_NESTED_HOT_TEAMS
5726           __kmp_free_task_team(master, task_team);
5727 #endif
5728           team->t.t_task_team[tt_idx] = NULL;
5729         }
5730       }
5731     }
5732 
5733     // Reset pointer to parent team only for non-hot teams.
5734     team->t.t_parent = NULL;
5735     team->t.t_level = 0;
5736     team->t.t_active_level = 0;
5737 
5738     /* free the worker threads */
5739     for (f = 1; f < team->t.t_nproc; ++f) {
5740       KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5741       if (__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5742         KMP_COMPARE_AND_STORE_ACQ32(&(team->t.t_threads[f]->th.th_used_in_team),
5743                                     1, 2);
5744       }
5745       __kmp_free_thread(team->t.t_threads[f]);
5746     }
5747 
5748     if (__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5749       if (team->t.b) {
5750         // wake up thread at old location
5751         team->t.b->go_release();
5752         if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
5753           for (f = 1; f < team->t.t_nproc; ++f) {
5754             if (team->t.b->sleep[f].sleep) {
5755               __kmp_atomic_resume_64(
5756                   team->t.t_threads[f]->th.th_info.ds.ds_gtid,
5757                   (kmp_atomic_flag_64<> *)NULL);
5758             }
5759           }
5760         }
5761         // Wait for threads to be removed from team
5762         for (int f = 1; f < team->t.t_nproc; ++f) {
5763           while (team->t.t_threads[f]->th.th_used_in_team.load() != 0)
5764             KMP_CPU_PAUSE();
5765         }
5766       }
5767     }
5768 
5769     for (f = 1; f < team->t.t_nproc; ++f) {
5770       team->t.t_threads[f] = NULL;
5771     }
5772 
5773     if (team->t.t_max_nproc > 1 &&
5774         __kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5775       distributedBarrier::deallocate(team->t.b);
5776       team->t.b = NULL;
5777     }
5778     /* put the team back in the team pool */
5779     /* TODO limit size of team pool, call reap_team if pool too large */
5780     team->t.t_next_pool = CCAST(kmp_team_t *, __kmp_team_pool);
5781     __kmp_team_pool = (volatile kmp_team_t *)team;
5782   } else { // Check if team was created for primary threads in teams construct
5783     // See if first worker is a CG root
5784     KMP_DEBUG_ASSERT(team->t.t_threads[1] &&
5785                      team->t.t_threads[1]->th.th_cg_roots);
5786     if (team->t.t_threads[1]->th.th_cg_roots->cg_root == team->t.t_threads[1]) {
5787       // Clean up the CG root nodes on workers so that this team can be re-used
5788       for (f = 1; f < team->t.t_nproc; ++f) {
5789         kmp_info_t *thr = team->t.t_threads[f];
5790         KMP_DEBUG_ASSERT(thr && thr->th.th_cg_roots &&
5791                          thr->th.th_cg_roots->cg_root == thr);
5792         // Pop current CG root off list
5793         kmp_cg_root_t *tmp = thr->th.th_cg_roots;
5794         thr->th.th_cg_roots = tmp->up;
5795         KA_TRACE(100, ("__kmp_free_team: Thread %p popping node %p and moving"
5796                        " up to node %p. cg_nthreads was %d\n",
5797                        thr, tmp, thr->th.th_cg_roots, tmp->cg_nthreads));
5798         int i = tmp->cg_nthreads--;
5799         if (i == 1) {
5800           __kmp_free(tmp); // free CG if we are the last thread in it
5801         }
5802         // Restore current task's thread_limit from CG root
5803         if (thr->th.th_cg_roots)
5804           thr->th.th_current_task->td_icvs.thread_limit =
5805               thr->th.th_cg_roots->cg_thread_limit;
5806       }
5807     }
5808   }
5809 
5810   KMP_MB();
5811 }
5812 
5813 /* reap the team.  destroy it, reclaim all its resources and free its memory */
5814 kmp_team_t *__kmp_reap_team(kmp_team_t *team) {
5815   kmp_team_t *next_pool = team->t.t_next_pool;
5816 
5817   KMP_DEBUG_ASSERT(team);
5818   KMP_DEBUG_ASSERT(team->t.t_dispatch);
5819   KMP_DEBUG_ASSERT(team->t.t_disp_buffer);
5820   KMP_DEBUG_ASSERT(team->t.t_threads);
5821   KMP_DEBUG_ASSERT(team->t.t_argv);
5822 
5823   /* TODO clean the threads that are a part of this? */
5824 
5825   /* free stuff */
5826   __kmp_free_team_arrays(team);
5827   if (team->t.t_argv != &team->t.t_inline_argv[0])
5828     __kmp_free((void *)team->t.t_argv);
5829   __kmp_free(team);
5830 
5831   KMP_MB();
5832   return next_pool;
5833 }
5834 
5835 // Free the thread.  Don't reap it, just place it on the pool of available
5836 // threads.
5837 //
5838 // Changes for Quad issue 527845: We need a predictable OMP tid <-> gtid
5839 // binding for the affinity mechanism to be useful.
5840 //
5841 // Now, we always keep the free list (__kmp_thread_pool) sorted by gtid.
5842 // However, we want to avoid a potential performance problem by always
5843 // scanning through the list to find the correct point at which to insert
5844 // the thread (potential N**2 behavior).  To do this we keep track of the
5845 // last place a thread struct was inserted (__kmp_thread_pool_insert_pt).
5846 // With single-level parallelism, threads will always be added to the tail
5847 // of the list, kept track of by __kmp_thread_pool_insert_pt.  With nested
5848 // parallelism, all bets are off and we may need to scan through the entire
5849 // free list.
5850 //
5851 // This change also has a potentially large performance benefit, for some
5852 // applications.  Previously, as threads were freed from the hot team, they
5853 // would be placed back on the free list in inverse order.  If the hot team
5854 // grew back to it's original size, then the freed thread would be placed
5855 // back on the hot team in reverse order.  This could cause bad cache
5856 // locality problems on programs where the size of the hot team regularly
5857 // grew and shrunk.
5858 //
5859 // Now, for single-level parallelism, the OMP tid is always == gtid.
5860 void __kmp_free_thread(kmp_info_t *this_th) {
5861   int gtid;
5862   kmp_info_t **scan;
5863 
5864   KA_TRACE(20, ("__kmp_free_thread: T#%d putting T#%d back on free pool.\n",
5865                 __kmp_get_gtid(), this_th->th.th_info.ds.ds_gtid));
5866 
5867   KMP_DEBUG_ASSERT(this_th);
5868 
5869   // When moving thread to pool, switch thread to wait on own b_go flag, and
5870   // uninitialized (NULL team).
5871   int b;
5872   kmp_balign_t *balign = this_th->th.th_bar;
5873   for (b = 0; b < bs_last_barrier; ++b) {
5874     if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG)
5875       balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG;
5876     balign[b].bb.team = NULL;
5877     balign[b].bb.leaf_kids = 0;
5878   }
5879   this_th->th.th_task_state = 0;
5880   this_th->th.th_reap_state = KMP_SAFE_TO_REAP;
5881 
5882   /* put thread back on the free pool */
5883   TCW_PTR(this_th->th.th_team, NULL);
5884   TCW_PTR(this_th->th.th_root, NULL);
5885   TCW_PTR(this_th->th.th_dispatch, NULL); /* NOT NEEDED */
5886 
5887   while (this_th->th.th_cg_roots) {
5888     this_th->th.th_cg_roots->cg_nthreads--;
5889     KA_TRACE(100, ("__kmp_free_thread: Thread %p decrement cg_nthreads on node"
5890                    " %p of thread  %p to %d\n",
5891                    this_th, this_th->th.th_cg_roots,
5892                    this_th->th.th_cg_roots->cg_root,
5893                    this_th->th.th_cg_roots->cg_nthreads));
5894     kmp_cg_root_t *tmp = this_th->th.th_cg_roots;
5895     if (tmp->cg_root == this_th) { // Thread is a cg_root
5896       KMP_DEBUG_ASSERT(tmp->cg_nthreads == 0);
5897       KA_TRACE(
5898           5, ("__kmp_free_thread: Thread %p freeing node %p\n", this_th, tmp));
5899       this_th->th.th_cg_roots = tmp->up;
5900       __kmp_free(tmp);
5901     } else { // Worker thread
5902       if (tmp->cg_nthreads == 0) { // last thread leaves contention group
5903         __kmp_free(tmp);
5904       }
5905       this_th->th.th_cg_roots = NULL;
5906       break;
5907     }
5908   }
5909 
5910   /* If the implicit task assigned to this thread can be used by other threads
5911    * -> multiple threads can share the data and try to free the task at
5912    * __kmp_reap_thread at exit. This duplicate use of the task data can happen
5913    * with higher probability when hot team is disabled but can occurs even when
5914    * the hot team is enabled */
5915   __kmp_free_implicit_task(this_th);
5916   this_th->th.th_current_task = NULL;
5917 
5918   // If the __kmp_thread_pool_insert_pt is already past the new insert
5919   // point, then we need to re-scan the entire list.
5920   gtid = this_th->th.th_info.ds.ds_gtid;
5921   if (__kmp_thread_pool_insert_pt != NULL) {
5922     KMP_DEBUG_ASSERT(__kmp_thread_pool != NULL);
5923     if (__kmp_thread_pool_insert_pt->th.th_info.ds.ds_gtid > gtid) {
5924       __kmp_thread_pool_insert_pt = NULL;
5925     }
5926   }
5927 
5928   // Scan down the list to find the place to insert the thread.
5929   // scan is the address of a link in the list, possibly the address of
5930   // __kmp_thread_pool itself.
5931   //
5932   // In the absence of nested parallelism, the for loop will have 0 iterations.
5933   if (__kmp_thread_pool_insert_pt != NULL) {
5934     scan = &(__kmp_thread_pool_insert_pt->th.th_next_pool);
5935   } else {
5936     scan = CCAST(kmp_info_t **, &__kmp_thread_pool);
5937   }
5938   for (; (*scan != NULL) && ((*scan)->th.th_info.ds.ds_gtid < gtid);
5939        scan = &((*scan)->th.th_next_pool))
5940     ;
5941 
5942   // Insert the new element on the list, and set __kmp_thread_pool_insert_pt
5943   // to its address.
5944   TCW_PTR(this_th->th.th_next_pool, *scan);
5945   __kmp_thread_pool_insert_pt = *scan = this_th;
5946   KMP_DEBUG_ASSERT((this_th->th.th_next_pool == NULL) ||
5947                    (this_th->th.th_info.ds.ds_gtid <
5948                     this_th->th.th_next_pool->th.th_info.ds.ds_gtid));
5949   TCW_4(this_th->th.th_in_pool, TRUE);
5950   __kmp_suspend_initialize_thread(this_th);
5951   __kmp_lock_suspend_mx(this_th);
5952   if (this_th->th.th_active == TRUE) {
5953     KMP_ATOMIC_INC(&__kmp_thread_pool_active_nth);
5954     this_th->th.th_active_in_pool = TRUE;
5955   }
5956 #if KMP_DEBUG
5957   else {
5958     KMP_DEBUG_ASSERT(this_th->th.th_active_in_pool == FALSE);
5959   }
5960 #endif
5961   __kmp_unlock_suspend_mx(this_th);
5962 
5963   TCW_4(__kmp_nth, __kmp_nth - 1);
5964 
5965 #ifdef KMP_ADJUST_BLOCKTIME
5966   /* Adjust blocktime back to user setting or default if necessary */
5967   /* Middle initialization might never have occurred                */
5968   if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
5969     KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
5970     if (__kmp_nth <= __kmp_avail_proc) {
5971       __kmp_zero_bt = FALSE;
5972     }
5973   }
5974 #endif /* KMP_ADJUST_BLOCKTIME */
5975 
5976   KMP_MB();
5977 }
5978 
5979 /* ------------------------------------------------------------------------ */
5980 
5981 void *__kmp_launch_thread(kmp_info_t *this_thr) {
5982 #if OMP_PROFILING_SUPPORT
5983   ProfileTraceFile = getenv("LIBOMPTARGET_PROFILE");
5984   // TODO: add a configuration option for time granularity
5985   if (ProfileTraceFile)
5986     llvm::timeTraceProfilerInitialize(500 /* us */, "libomptarget");
5987 #endif
5988 
5989   int gtid = this_thr->th.th_info.ds.ds_gtid;
5990   /*    void                 *stack_data;*/
5991   kmp_team_t **volatile pteam;
5992 
5993   KMP_MB();
5994   KA_TRACE(10, ("__kmp_launch_thread: T#%d start\n", gtid));
5995 
5996   if (__kmp_env_consistency_check) {
5997     this_thr->th.th_cons = __kmp_allocate_cons_stack(gtid); // ATT: Memory leak?
5998   }
5999 
6000 #if OMPD_SUPPORT
6001   if (ompd_state & OMPD_ENABLE_BP)
6002     ompd_bp_thread_begin();
6003 #endif
6004 
6005 #if OMPT_SUPPORT
6006   ompt_data_t *thread_data = nullptr;
6007   if (ompt_enabled.enabled) {
6008     thread_data = &(this_thr->th.ompt_thread_info.thread_data);
6009     *thread_data = ompt_data_none;
6010 
6011     this_thr->th.ompt_thread_info.state = ompt_state_overhead;
6012     this_thr->th.ompt_thread_info.wait_id = 0;
6013     this_thr->th.ompt_thread_info.idle_frame = OMPT_GET_FRAME_ADDRESS(0);
6014     this_thr->th.ompt_thread_info.parallel_flags = 0;
6015     if (ompt_enabled.ompt_callback_thread_begin) {
6016       ompt_callbacks.ompt_callback(ompt_callback_thread_begin)(
6017           ompt_thread_worker, thread_data);
6018     }
6019     this_thr->th.ompt_thread_info.state = ompt_state_idle;
6020   }
6021 #endif
6022 
6023   /* This is the place where threads wait for work */
6024   while (!TCR_4(__kmp_global.g.g_done)) {
6025     KMP_DEBUG_ASSERT(this_thr == __kmp_threads[gtid]);
6026     KMP_MB();
6027 
6028     /* wait for work to do */
6029     KA_TRACE(20, ("__kmp_launch_thread: T#%d waiting for work\n", gtid));
6030 
6031     /* No tid yet since not part of a team */
6032     __kmp_fork_barrier(gtid, KMP_GTID_DNE);
6033 
6034 #if OMPT_SUPPORT
6035     if (ompt_enabled.enabled) {
6036       this_thr->th.ompt_thread_info.state = ompt_state_overhead;
6037     }
6038 #endif
6039 
6040     pteam = &this_thr->th.th_team;
6041 
6042     /* have we been allocated? */
6043     if (TCR_SYNC_PTR(*pteam) && !TCR_4(__kmp_global.g.g_done)) {
6044       /* we were just woken up, so run our new task */
6045       if (TCR_SYNC_PTR((*pteam)->t.t_pkfn) != NULL) {
6046         int rc;
6047         KA_TRACE(20,
6048                  ("__kmp_launch_thread: T#%d(%d:%d) invoke microtask = %p\n",
6049                   gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid),
6050                   (*pteam)->t.t_pkfn));
6051 
6052         updateHWFPControl(*pteam);
6053 
6054 #if OMPT_SUPPORT
6055         if (ompt_enabled.enabled) {
6056           this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
6057         }
6058 #endif
6059 
6060         rc = (*pteam)->t.t_invoke(gtid);
6061         KMP_ASSERT(rc);
6062 
6063         KMP_MB();
6064         KA_TRACE(20, ("__kmp_launch_thread: T#%d(%d:%d) done microtask = %p\n",
6065                       gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid),
6066                       (*pteam)->t.t_pkfn));
6067       }
6068 #if OMPT_SUPPORT
6069       if (ompt_enabled.enabled) {
6070         /* no frame set while outside task */
6071         __ompt_get_task_info_object(0)->frame.exit_frame = ompt_data_none;
6072 
6073         this_thr->th.ompt_thread_info.state = ompt_state_overhead;
6074       }
6075 #endif
6076       /* join barrier after parallel region */
6077       __kmp_join_barrier(gtid);
6078     }
6079   }
6080 
6081 #if OMPD_SUPPORT
6082   if (ompd_state & OMPD_ENABLE_BP)
6083     ompd_bp_thread_end();
6084 #endif
6085 
6086 #if OMPT_SUPPORT
6087   if (ompt_enabled.ompt_callback_thread_end) {
6088     ompt_callbacks.ompt_callback(ompt_callback_thread_end)(thread_data);
6089   }
6090 #endif
6091 
6092   this_thr->th.th_task_team = NULL;
6093   /* run the destructors for the threadprivate data for this thread */
6094   __kmp_common_destroy_gtid(gtid);
6095 
6096   KA_TRACE(10, ("__kmp_launch_thread: T#%d done\n", gtid));
6097   KMP_MB();
6098 
6099 #if OMP_PROFILING_SUPPORT
6100   llvm::timeTraceProfilerFinishThread();
6101 #endif
6102   return this_thr;
6103 }
6104 
6105 /* ------------------------------------------------------------------------ */
6106 
6107 void __kmp_internal_end_dest(void *specific_gtid) {
6108   // Make sure no significant bits are lost
6109   int gtid;
6110   __kmp_type_convert((kmp_intptr_t)specific_gtid - 1, &gtid);
6111 
6112   KA_TRACE(30, ("__kmp_internal_end_dest: T#%d\n", gtid));
6113   /* NOTE: the gtid is stored as gitd+1 in the thread-local-storage
6114    * this is because 0 is reserved for the nothing-stored case */
6115 
6116   __kmp_internal_end_thread(gtid);
6117 }
6118 
6119 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB
6120 
6121 __attribute__((destructor)) void __kmp_internal_end_dtor(void) {
6122   __kmp_internal_end_atexit();
6123 }
6124 
6125 #endif
6126 
6127 /* [Windows] josh: when the atexit handler is called, there may still be more
6128    than one thread alive */
6129 void __kmp_internal_end_atexit(void) {
6130   KA_TRACE(30, ("__kmp_internal_end_atexit\n"));
6131   /* [Windows]
6132      josh: ideally, we want to completely shutdown the library in this atexit
6133      handler, but stat code that depends on thread specific data for gtid fails
6134      because that data becomes unavailable at some point during the shutdown, so
6135      we call __kmp_internal_end_thread instead. We should eventually remove the
6136      dependency on __kmp_get_specific_gtid in the stat code and use
6137      __kmp_internal_end_library to cleanly shutdown the library.
6138 
6139      // TODO: Can some of this comment about GVS be removed?
6140      I suspect that the offending stat code is executed when the calling thread
6141      tries to clean up a dead root thread's data structures, resulting in GVS
6142      code trying to close the GVS structures for that thread, but since the stat
6143      code uses __kmp_get_specific_gtid to get the gtid with the assumption that
6144      the calling thread is cleaning up itself instead of another thread, it get
6145      confused. This happens because allowing a thread to unregister and cleanup
6146      another thread is a recent modification for addressing an issue.
6147      Based on the current design (20050722), a thread may end up
6148      trying to unregister another thread only if thread death does not trigger
6149      the calling of __kmp_internal_end_thread.  For Linux* OS, there is the
6150      thread specific data destructor function to detect thread death. For
6151      Windows dynamic, there is DllMain(THREAD_DETACH). For Windows static, there
6152      is nothing.  Thus, the workaround is applicable only for Windows static
6153      stat library. */
6154   __kmp_internal_end_library(-1);
6155 #if KMP_OS_WINDOWS
6156   __kmp_close_console();
6157 #endif
6158 }
6159 
6160 static void __kmp_reap_thread(kmp_info_t *thread, int is_root) {
6161   // It is assumed __kmp_forkjoin_lock is acquired.
6162 
6163   int gtid;
6164 
6165   KMP_DEBUG_ASSERT(thread != NULL);
6166 
6167   gtid = thread->th.th_info.ds.ds_gtid;
6168 
6169   if (!is_root) {
6170     if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
6171       /* Assume the threads are at the fork barrier here */
6172       KA_TRACE(
6173           20, ("__kmp_reap_thread: releasing T#%d from fork barrier for reap\n",
6174                gtid));
6175       if (__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
6176         while (
6177             !KMP_COMPARE_AND_STORE_ACQ32(&(thread->th.th_used_in_team), 0, 3))
6178           KMP_CPU_PAUSE();
6179         __kmp_resume_32(gtid, (kmp_flag_32<false, false> *)NULL);
6180       } else {
6181         /* Need release fence here to prevent seg faults for tree forkjoin
6182            barrier (GEH) */
6183         kmp_flag_64<> flag(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go,
6184                            thread);
6185         __kmp_release_64(&flag);
6186       }
6187     }
6188 
6189     // Terminate OS thread.
6190     __kmp_reap_worker(thread);
6191 
6192     // The thread was killed asynchronously.  If it was actively
6193     // spinning in the thread pool, decrement the global count.
6194     //
6195     // There is a small timing hole here - if the worker thread was just waking
6196     // up after sleeping in the pool, had reset it's th_active_in_pool flag but
6197     // not decremented the global counter __kmp_thread_pool_active_nth yet, then
6198     // the global counter might not get updated.
6199     //
6200     // Currently, this can only happen as the library is unloaded,
6201     // so there are no harmful side effects.
6202     if (thread->th.th_active_in_pool) {
6203       thread->th.th_active_in_pool = FALSE;
6204       KMP_ATOMIC_DEC(&__kmp_thread_pool_active_nth);
6205       KMP_DEBUG_ASSERT(__kmp_thread_pool_active_nth >= 0);
6206     }
6207   }
6208 
6209   __kmp_free_implicit_task(thread);
6210 
6211 // Free the fast memory for tasking
6212 #if USE_FAST_MEMORY
6213   __kmp_free_fast_memory(thread);
6214 #endif /* USE_FAST_MEMORY */
6215 
6216   __kmp_suspend_uninitialize_thread(thread);
6217 
6218   KMP_DEBUG_ASSERT(__kmp_threads[gtid] == thread);
6219   TCW_SYNC_PTR(__kmp_threads[gtid], NULL);
6220 
6221   --__kmp_all_nth;
6222   // __kmp_nth was decremented when thread is added to the pool.
6223 
6224 #ifdef KMP_ADJUST_BLOCKTIME
6225   /* Adjust blocktime back to user setting or default if necessary */
6226   /* Middle initialization might never have occurred                */
6227   if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
6228     KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
6229     if (__kmp_nth <= __kmp_avail_proc) {
6230       __kmp_zero_bt = FALSE;
6231     }
6232   }
6233 #endif /* KMP_ADJUST_BLOCKTIME */
6234 
6235   /* free the memory being used */
6236   if (__kmp_env_consistency_check) {
6237     if (thread->th.th_cons) {
6238       __kmp_free_cons_stack(thread->th.th_cons);
6239       thread->th.th_cons = NULL;
6240     }
6241   }
6242 
6243   if (thread->th.th_pri_common != NULL) {
6244     __kmp_free(thread->th.th_pri_common);
6245     thread->th.th_pri_common = NULL;
6246   }
6247 
6248   if (thread->th.th_task_state_memo_stack != NULL) {
6249     __kmp_free(thread->th.th_task_state_memo_stack);
6250     thread->th.th_task_state_memo_stack = NULL;
6251   }
6252 
6253 #if KMP_USE_BGET
6254   if (thread->th.th_local.bget_data != NULL) {
6255     __kmp_finalize_bget(thread);
6256   }
6257 #endif
6258 
6259 #if KMP_AFFINITY_SUPPORTED
6260   if (thread->th.th_affin_mask != NULL) {
6261     KMP_CPU_FREE(thread->th.th_affin_mask);
6262     thread->th.th_affin_mask = NULL;
6263   }
6264 #endif /* KMP_AFFINITY_SUPPORTED */
6265 
6266 #if KMP_USE_HIER_SCHED
6267   if (thread->th.th_hier_bar_data != NULL) {
6268     __kmp_free(thread->th.th_hier_bar_data);
6269     thread->th.th_hier_bar_data = NULL;
6270   }
6271 #endif
6272 
6273   __kmp_reap_team(thread->th.th_serial_team);
6274   thread->th.th_serial_team = NULL;
6275   __kmp_free(thread);
6276 
6277   KMP_MB();
6278 
6279 } // __kmp_reap_thread
6280 
6281 static void __kmp_itthash_clean(kmp_info_t *th) {
6282 #if USE_ITT_NOTIFY
6283   if (__kmp_itt_region_domains.count > 0) {
6284     for (int i = 0; i < KMP_MAX_FRAME_DOMAINS; ++i) {
6285       kmp_itthash_entry_t *bucket = __kmp_itt_region_domains.buckets[i];
6286       while (bucket) {
6287         kmp_itthash_entry_t *next = bucket->next_in_bucket;
6288         __kmp_thread_free(th, bucket);
6289         bucket = next;
6290       }
6291     }
6292   }
6293   if (__kmp_itt_barrier_domains.count > 0) {
6294     for (int i = 0; i < KMP_MAX_FRAME_DOMAINS; ++i) {
6295       kmp_itthash_entry_t *bucket = __kmp_itt_barrier_domains.buckets[i];
6296       while (bucket) {
6297         kmp_itthash_entry_t *next = bucket->next_in_bucket;
6298         __kmp_thread_free(th, bucket);
6299         bucket = next;
6300       }
6301     }
6302   }
6303 #endif
6304 }
6305 
6306 static void __kmp_internal_end(void) {
6307   int i;
6308 
6309   /* First, unregister the library */
6310   __kmp_unregister_library();
6311 
6312 #if KMP_OS_WINDOWS
6313   /* In Win static library, we can't tell when a root actually dies, so we
6314      reclaim the data structures for any root threads that have died but not
6315      unregistered themselves, in order to shut down cleanly.
6316      In Win dynamic library we also can't tell when a thread dies.  */
6317   __kmp_reclaim_dead_roots(); // AC: moved here to always clean resources of
6318 // dead roots
6319 #endif
6320 
6321   for (i = 0; i < __kmp_threads_capacity; i++)
6322     if (__kmp_root[i])
6323       if (__kmp_root[i]->r.r_active)
6324         break;
6325   KMP_MB(); /* Flush all pending memory write invalidates.  */
6326   TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6327 
6328   if (i < __kmp_threads_capacity) {
6329 #if KMP_USE_MONITOR
6330     // 2009-09-08 (lev): Other alive roots found. Why do we kill the monitor??
6331     KMP_MB(); /* Flush all pending memory write invalidates.  */
6332 
6333     // Need to check that monitor was initialized before reaping it. If we are
6334     // called form __kmp_atfork_child (which sets __kmp_init_parallel = 0), then
6335     // __kmp_monitor will appear to contain valid data, but it is only valid in
6336     // the parent process, not the child.
6337     // New behavior (201008): instead of keying off of the flag
6338     // __kmp_init_parallel, the monitor thread creation is keyed off
6339     // of the new flag __kmp_init_monitor.
6340     __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
6341     if (TCR_4(__kmp_init_monitor)) {
6342       __kmp_reap_monitor(&__kmp_monitor);
6343       TCW_4(__kmp_init_monitor, 0);
6344     }
6345     __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
6346     KA_TRACE(10, ("__kmp_internal_end: monitor reaped\n"));
6347 #endif // KMP_USE_MONITOR
6348   } else {
6349 /* TODO move this to cleanup code */
6350 #ifdef KMP_DEBUG
6351     /* make sure that everything has properly ended */
6352     for (i = 0; i < __kmp_threads_capacity; i++) {
6353       if (__kmp_root[i]) {
6354         //                    KMP_ASSERT( ! KMP_UBER_GTID( i ) );         // AC:
6355         //                    there can be uber threads alive here
6356         KMP_ASSERT(!__kmp_root[i]->r.r_active); // TODO: can they be active?
6357       }
6358     }
6359 #endif
6360 
6361     KMP_MB();
6362 
6363     // Reap the worker threads.
6364     // This is valid for now, but be careful if threads are reaped sooner.
6365     while (__kmp_thread_pool != NULL) { // Loop thru all the thread in the pool.
6366       // Get the next thread from the pool.
6367       kmp_info_t *thread = CCAST(kmp_info_t *, __kmp_thread_pool);
6368       __kmp_thread_pool = thread->th.th_next_pool;
6369       // Reap it.
6370       KMP_DEBUG_ASSERT(thread->th.th_reap_state == KMP_SAFE_TO_REAP);
6371       thread->th.th_next_pool = NULL;
6372       thread->th.th_in_pool = FALSE;
6373       __kmp_reap_thread(thread, 0);
6374     }
6375     __kmp_thread_pool_insert_pt = NULL;
6376 
6377     // Reap teams.
6378     while (__kmp_team_pool != NULL) { // Loop thru all the teams in the pool.
6379       // Get the next team from the pool.
6380       kmp_team_t *team = CCAST(kmp_team_t *, __kmp_team_pool);
6381       __kmp_team_pool = team->t.t_next_pool;
6382       // Reap it.
6383       team->t.t_next_pool = NULL;
6384       __kmp_reap_team(team);
6385     }
6386 
6387     __kmp_reap_task_teams();
6388 
6389 #if KMP_OS_UNIX
6390     // Threads that are not reaped should not access any resources since they
6391     // are going to be deallocated soon, so the shutdown sequence should wait
6392     // until all threads either exit the final spin-waiting loop or begin
6393     // sleeping after the given blocktime.
6394     for (i = 0; i < __kmp_threads_capacity; i++) {
6395       kmp_info_t *thr = __kmp_threads[i];
6396       while (thr && KMP_ATOMIC_LD_ACQ(&thr->th.th_blocking))
6397         KMP_CPU_PAUSE();
6398     }
6399 #endif
6400 
6401     for (i = 0; i < __kmp_threads_capacity; ++i) {
6402       // TBD: Add some checking...
6403       // Something like KMP_DEBUG_ASSERT( __kmp_thread[ i ] == NULL );
6404     }
6405 
6406     /* Make sure all threadprivate destructors get run by joining with all
6407        worker threads before resetting this flag */
6408     TCW_SYNC_4(__kmp_init_common, FALSE);
6409 
6410     KA_TRACE(10, ("__kmp_internal_end: all workers reaped\n"));
6411     KMP_MB();
6412 
6413 #if KMP_USE_MONITOR
6414     // See note above: One of the possible fixes for CQ138434 / CQ140126
6415     //
6416     // FIXME: push both code fragments down and CSE them?
6417     // push them into __kmp_cleanup() ?
6418     __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
6419     if (TCR_4(__kmp_init_monitor)) {
6420       __kmp_reap_monitor(&__kmp_monitor);
6421       TCW_4(__kmp_init_monitor, 0);
6422     }
6423     __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
6424     KA_TRACE(10, ("__kmp_internal_end: monitor reaped\n"));
6425 #endif
6426   } /* else !__kmp_global.t_active */
6427   TCW_4(__kmp_init_gtid, FALSE);
6428   KMP_MB(); /* Flush all pending memory write invalidates.  */
6429 
6430   __kmp_cleanup();
6431 #if OMPT_SUPPORT
6432   ompt_fini();
6433 #endif
6434 }
6435 
6436 void __kmp_internal_end_library(int gtid_req) {
6437   /* if we have already cleaned up, don't try again, it wouldn't be pretty */
6438   /* this shouldn't be a race condition because __kmp_internal_end() is the
6439      only place to clear __kmp_serial_init */
6440   /* we'll check this later too, after we get the lock */
6441   // 2009-09-06: We do not set g_abort without setting g_done. This check looks
6442   // redundant, because the next check will work in any case.
6443   if (__kmp_global.g.g_abort) {
6444     KA_TRACE(11, ("__kmp_internal_end_library: abort, exiting\n"));
6445     /* TODO abort? */
6446     return;
6447   }
6448   if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6449     KA_TRACE(10, ("__kmp_internal_end_library: already finished\n"));
6450     return;
6451   }
6452 
6453   // If hidden helper team has been initialized, we need to deinit it
6454   if (TCR_4(__kmp_init_hidden_helper) &&
6455       !TCR_4(__kmp_hidden_helper_team_done)) {
6456     TCW_SYNC_4(__kmp_hidden_helper_team_done, TRUE);
6457     // First release the main thread to let it continue its work
6458     __kmp_hidden_helper_main_thread_release();
6459     // Wait until the hidden helper team has been destroyed
6460     __kmp_hidden_helper_threads_deinitz_wait();
6461   }
6462 
6463   KMP_MB(); /* Flush all pending memory write invalidates.  */
6464   /* find out who we are and what we should do */
6465   {
6466     int gtid = (gtid_req >= 0) ? gtid_req : __kmp_gtid_get_specific();
6467     KA_TRACE(
6468         10, ("__kmp_internal_end_library: enter T#%d  (%d)\n", gtid, gtid_req));
6469     if (gtid == KMP_GTID_SHUTDOWN) {
6470       KA_TRACE(10, ("__kmp_internal_end_library: !__kmp_init_runtime, system "
6471                     "already shutdown\n"));
6472       return;
6473     } else if (gtid == KMP_GTID_MONITOR) {
6474       KA_TRACE(10, ("__kmp_internal_end_library: monitor thread, gtid not "
6475                     "registered, or system shutdown\n"));
6476       return;
6477     } else if (gtid == KMP_GTID_DNE) {
6478       KA_TRACE(10, ("__kmp_internal_end_library: gtid not registered or system "
6479                     "shutdown\n"));
6480       /* we don't know who we are, but we may still shutdown the library */
6481     } else if (KMP_UBER_GTID(gtid)) {
6482       /* unregister ourselves as an uber thread.  gtid is no longer valid */
6483       if (__kmp_root[gtid]->r.r_active) {
6484         __kmp_global.g.g_abort = -1;
6485         TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6486         __kmp_unregister_library();
6487         KA_TRACE(10,
6488                  ("__kmp_internal_end_library: root still active, abort T#%d\n",
6489                   gtid));
6490         return;
6491       } else {
6492         __kmp_itthash_clean(__kmp_threads[gtid]);
6493         KA_TRACE(
6494             10,
6495             ("__kmp_internal_end_library: unregistering sibling T#%d\n", gtid));
6496         __kmp_unregister_root_current_thread(gtid);
6497       }
6498     } else {
6499 /* worker threads may call this function through the atexit handler, if they
6500  * call exit() */
6501 /* For now, skip the usual subsequent processing and just dump the debug buffer.
6502    TODO: do a thorough shutdown instead */
6503 #ifdef DUMP_DEBUG_ON_EXIT
6504       if (__kmp_debug_buf)
6505         __kmp_dump_debug_buffer();
6506 #endif
6507       // added unregister library call here when we switch to shm linux
6508       // if we don't, it will leave lots of files in /dev/shm
6509       // cleanup shared memory file before exiting.
6510       __kmp_unregister_library();
6511       return;
6512     }
6513   }
6514   /* synchronize the termination process */
6515   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6516 
6517   /* have we already finished */
6518   if (__kmp_global.g.g_abort) {
6519     KA_TRACE(10, ("__kmp_internal_end_library: abort, exiting\n"));
6520     /* TODO abort? */
6521     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6522     return;
6523   }
6524   if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6525     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6526     return;
6527   }
6528 
6529   /* We need this lock to enforce mutex between this reading of
6530      __kmp_threads_capacity and the writing by __kmp_register_root.
6531      Alternatively, we can use a counter of roots that is atomically updated by
6532      __kmp_get_global_thread_id_reg, __kmp_do_serial_initialize and
6533      __kmp_internal_end_*.  */
6534   __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
6535 
6536   /* now we can safely conduct the actual termination */
6537   __kmp_internal_end();
6538 
6539   __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6540   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6541 
6542   KA_TRACE(10, ("__kmp_internal_end_library: exit\n"));
6543 
6544 #ifdef DUMP_DEBUG_ON_EXIT
6545   if (__kmp_debug_buf)
6546     __kmp_dump_debug_buffer();
6547 #endif
6548 
6549 #if KMP_OS_WINDOWS
6550   __kmp_close_console();
6551 #endif
6552 
6553   __kmp_fini_allocator();
6554 
6555 } // __kmp_internal_end_library
6556 
6557 void __kmp_internal_end_thread(int gtid_req) {
6558   int i;
6559 
6560   /* if we have already cleaned up, don't try again, it wouldn't be pretty */
6561   /* this shouldn't be a race condition because __kmp_internal_end() is the
6562    * only place to clear __kmp_serial_init */
6563   /* we'll check this later too, after we get the lock */
6564   // 2009-09-06: We do not set g_abort without setting g_done. This check looks
6565   // redundant, because the next check will work in any case.
6566   if (__kmp_global.g.g_abort) {
6567     KA_TRACE(11, ("__kmp_internal_end_thread: abort, exiting\n"));
6568     /* TODO abort? */
6569     return;
6570   }
6571   if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6572     KA_TRACE(10, ("__kmp_internal_end_thread: already finished\n"));
6573     return;
6574   }
6575 
6576   // If hidden helper team has been initialized, we need to deinit it
6577   if (TCR_4(__kmp_init_hidden_helper) &&
6578       !TCR_4(__kmp_hidden_helper_team_done)) {
6579     TCW_SYNC_4(__kmp_hidden_helper_team_done, TRUE);
6580     // First release the main thread to let it continue its work
6581     __kmp_hidden_helper_main_thread_release();
6582     // Wait until the hidden helper team has been destroyed
6583     __kmp_hidden_helper_threads_deinitz_wait();
6584   }
6585 
6586   KMP_MB(); /* Flush all pending memory write invalidates.  */
6587 
6588   /* find out who we are and what we should do */
6589   {
6590     int gtid = (gtid_req >= 0) ? gtid_req : __kmp_gtid_get_specific();
6591     KA_TRACE(10,
6592              ("__kmp_internal_end_thread: enter T#%d  (%d)\n", gtid, gtid_req));
6593     if (gtid == KMP_GTID_SHUTDOWN) {
6594       KA_TRACE(10, ("__kmp_internal_end_thread: !__kmp_init_runtime, system "
6595                     "already shutdown\n"));
6596       return;
6597     } else if (gtid == KMP_GTID_MONITOR) {
6598       KA_TRACE(10, ("__kmp_internal_end_thread: monitor thread, gtid not "
6599                     "registered, or system shutdown\n"));
6600       return;
6601     } else if (gtid == KMP_GTID_DNE) {
6602       KA_TRACE(10, ("__kmp_internal_end_thread: gtid not registered or system "
6603                     "shutdown\n"));
6604       return;
6605       /* we don't know who we are */
6606     } else if (KMP_UBER_GTID(gtid)) {
6607       /* unregister ourselves as an uber thread.  gtid is no longer valid */
6608       if (__kmp_root[gtid]->r.r_active) {
6609         __kmp_global.g.g_abort = -1;
6610         TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6611         KA_TRACE(10,
6612                  ("__kmp_internal_end_thread: root still active, abort T#%d\n",
6613                   gtid));
6614         return;
6615       } else {
6616         KA_TRACE(10, ("__kmp_internal_end_thread: unregistering sibling T#%d\n",
6617                       gtid));
6618         __kmp_unregister_root_current_thread(gtid);
6619       }
6620     } else {
6621       /* just a worker thread, let's leave */
6622       KA_TRACE(10, ("__kmp_internal_end_thread: worker thread T#%d\n", gtid));
6623 
6624       if (gtid >= 0) {
6625         __kmp_threads[gtid]->th.th_task_team = NULL;
6626       }
6627 
6628       KA_TRACE(10,
6629                ("__kmp_internal_end_thread: worker thread done, exiting T#%d\n",
6630                 gtid));
6631       return;
6632     }
6633   }
6634 #if KMP_DYNAMIC_LIB
6635   if (__kmp_pause_status != kmp_hard_paused)
6636   // AC: lets not shutdown the dynamic library at the exit of uber thread,
6637   // because we will better shutdown later in the library destructor.
6638   {
6639     KA_TRACE(10, ("__kmp_internal_end_thread: exiting T#%d\n", gtid_req));
6640     return;
6641   }
6642 #endif
6643   /* synchronize the termination process */
6644   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6645 
6646   /* have we already finished */
6647   if (__kmp_global.g.g_abort) {
6648     KA_TRACE(10, ("__kmp_internal_end_thread: abort, exiting\n"));
6649     /* TODO abort? */
6650     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6651     return;
6652   }
6653   if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6654     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6655     return;
6656   }
6657 
6658   /* We need this lock to enforce mutex between this reading of
6659      __kmp_threads_capacity and the writing by __kmp_register_root.
6660      Alternatively, we can use a counter of roots that is atomically updated by
6661      __kmp_get_global_thread_id_reg, __kmp_do_serial_initialize and
6662      __kmp_internal_end_*.  */
6663 
6664   /* should we finish the run-time?  are all siblings done? */
6665   __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
6666 
6667   for (i = 0; i < __kmp_threads_capacity; ++i) {
6668     if (KMP_UBER_GTID(i)) {
6669       KA_TRACE(
6670           10,
6671           ("__kmp_internal_end_thread: remaining sibling task: gtid==%d\n", i));
6672       __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6673       __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6674       return;
6675     }
6676   }
6677 
6678   /* now we can safely conduct the actual termination */
6679 
6680   __kmp_internal_end();
6681 
6682   __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6683   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6684 
6685   KA_TRACE(10, ("__kmp_internal_end_thread: exit T#%d\n", gtid_req));
6686 
6687 #ifdef DUMP_DEBUG_ON_EXIT
6688   if (__kmp_debug_buf)
6689     __kmp_dump_debug_buffer();
6690 #endif
6691 } // __kmp_internal_end_thread
6692 
6693 // -----------------------------------------------------------------------------
6694 // Library registration stuff.
6695 
6696 static long __kmp_registration_flag = 0;
6697 // Random value used to indicate library initialization.
6698 static char *__kmp_registration_str = NULL;
6699 // Value to be saved in env var __KMP_REGISTERED_LIB_<pid>.
6700 
6701 static inline char *__kmp_reg_status_name() {
6702 /* On RHEL 3u5 if linked statically, getpid() returns different values in
6703    each thread. If registration and unregistration go in different threads
6704    (omp_misc_other_root_exit.cpp test case), the name of registered_lib_env
6705    env var can not be found, because the name will contain different pid. */
6706 // macOS* complains about name being too long with additional getuid()
6707 #if KMP_OS_UNIX && !KMP_OS_DARWIN && KMP_DYNAMIC_LIB
6708   return __kmp_str_format("__KMP_REGISTERED_LIB_%d_%d", (int)getpid(),
6709                           (int)getuid());
6710 #else
6711   return __kmp_str_format("__KMP_REGISTERED_LIB_%d", (int)getpid());
6712 #endif
6713 } // __kmp_reg_status_get
6714 
6715 #if defined(KMP_USE_SHM)
6716 // If /dev/shm is not accessible, we will create a temporary file under /tmp.
6717 char *temp_reg_status_file_name = nullptr;
6718 #endif
6719 
6720 void __kmp_register_library_startup(void) {
6721 
6722   char *name = __kmp_reg_status_name(); // Name of the environment variable.
6723   int done = 0;
6724   union {
6725     double dtime;
6726     long ltime;
6727   } time;
6728 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
6729   __kmp_initialize_system_tick();
6730 #endif
6731   __kmp_read_system_time(&time.dtime);
6732   __kmp_registration_flag = 0xCAFE0000L | (time.ltime & 0x0000FFFFL);
6733   __kmp_registration_str =
6734       __kmp_str_format("%p-%lx-%s", &__kmp_registration_flag,
6735                        __kmp_registration_flag, KMP_LIBRARY_FILE);
6736 
6737   KA_TRACE(50, ("__kmp_register_library_startup: %s=\"%s\"\n", name,
6738                 __kmp_registration_str));
6739 
6740   while (!done) {
6741 
6742     char *value = NULL; // Actual value of the environment variable.
6743 
6744 #if defined(KMP_USE_SHM)
6745     char *shm_name = __kmp_str_format("/%s", name);
6746     int shm_preexist = 0;
6747     char *data1;
6748     int fd1 = shm_open(shm_name, O_CREAT | O_EXCL | O_RDWR, 0666);
6749     if ((fd1 == -1) && (errno == EEXIST)) {
6750       // file didn't open because it already exists.
6751       // try opening existing file
6752       fd1 = shm_open(shm_name, O_RDWR, 0666);
6753       if (fd1 == -1) { // file didn't open
6754         // error out here
6755         __kmp_fatal(KMP_MSG(FunctionError, "Can't open SHM"), KMP_ERR(0),
6756                     __kmp_msg_null);
6757       } else {
6758         // able to open existing file
6759         shm_preexist = 1;
6760       }
6761     } else if (fd1 == -1) {
6762       // SHM didn't open; it was due to error other than already exists. Try to
6763       // create a temp file under /tmp.
6764       // TODO: /tmp might not always be the temporary directory. For now we will
6765       // not consider TMPDIR. If /tmp is not accessible, we simply error out.
6766       char *temp_file_name = __kmp_str_format("/tmp/%sXXXXXX", name);
6767       fd1 = mkstemp(temp_file_name);
6768       if (fd1 == -1) {
6769         // error out here.
6770         __kmp_fatal(KMP_MSG(FunctionError, "Can't open TEMP"), KMP_ERR(errno),
6771                     __kmp_msg_null);
6772       }
6773       temp_reg_status_file_name = temp_file_name;
6774     }
6775     if (shm_preexist == 0) {
6776       // we created SHM now set size
6777       if (ftruncate(fd1, SHM_SIZE) == -1) {
6778         // error occured setting size;
6779         __kmp_fatal(KMP_MSG(FunctionError, "Can't set size of SHM"),
6780                     KMP_ERR(errno), __kmp_msg_null);
6781       }
6782     }
6783     data1 =
6784         (char *)mmap(0, SHM_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED, fd1, 0);
6785     if (data1 == MAP_FAILED) {
6786       // failed to map shared memory
6787       __kmp_fatal(KMP_MSG(FunctionError, "Can't map SHM"), KMP_ERR(errno),
6788                   __kmp_msg_null);
6789     }
6790     if (shm_preexist == 0) { // set data to SHM, set value
6791       KMP_STRCPY_S(data1, SHM_SIZE, __kmp_registration_str);
6792     }
6793     // Read value from either what we just wrote or existing file.
6794     value = __kmp_str_format("%s", data1); // read value from SHM
6795     munmap(data1, SHM_SIZE);
6796     close(fd1);
6797 #else // Windows and unix with static library
6798     // Set environment variable, but do not overwrite if it is exist.
6799     __kmp_env_set(name, __kmp_registration_str, 0);
6800     // read value to see if it got set
6801     value = __kmp_env_get(name);
6802 #endif
6803 
6804     if (value != NULL && strcmp(value, __kmp_registration_str) == 0) {
6805       done = 1; // Ok, environment variable set successfully, exit the loop.
6806     } else {
6807       // Oops. Write failed. Another copy of OpenMP RTL is in memory.
6808       // Check whether it alive or dead.
6809       int neighbor = 0; // 0 -- unknown status, 1 -- alive, 2 -- dead.
6810       char *tail = value;
6811       char *flag_addr_str = NULL;
6812       char *flag_val_str = NULL;
6813       char const *file_name = NULL;
6814       __kmp_str_split(tail, '-', &flag_addr_str, &tail);
6815       __kmp_str_split(tail, '-', &flag_val_str, &tail);
6816       file_name = tail;
6817       if (tail != NULL) {
6818         unsigned long *flag_addr = 0;
6819         unsigned long flag_val = 0;
6820         KMP_SSCANF(flag_addr_str, "%p", RCAST(void **, &flag_addr));
6821         KMP_SSCANF(flag_val_str, "%lx", &flag_val);
6822         if (flag_addr != 0 && flag_val != 0 && strcmp(file_name, "") != 0) {
6823           // First, check whether environment-encoded address is mapped into
6824           // addr space.
6825           // If so, dereference it to see if it still has the right value.
6826           if (__kmp_is_address_mapped(flag_addr) && *flag_addr == flag_val) {
6827             neighbor = 1;
6828           } else {
6829             // If not, then we know the other copy of the library is no longer
6830             // running.
6831             neighbor = 2;
6832           }
6833         }
6834       }
6835       switch (neighbor) {
6836       case 0: // Cannot parse environment variable -- neighbor status unknown.
6837         // Assume it is the incompatible format of future version of the
6838         // library. Assume the other library is alive.
6839         // WARN( ... ); // TODO: Issue a warning.
6840         file_name = "unknown library";
6841         KMP_FALLTHROUGH();
6842       // Attention! Falling to the next case. That's intentional.
6843       case 1: { // Neighbor is alive.
6844         // Check it is allowed.
6845         char *duplicate_ok = __kmp_env_get("KMP_DUPLICATE_LIB_OK");
6846         if (!__kmp_str_match_true(duplicate_ok)) {
6847           // That's not allowed. Issue fatal error.
6848           __kmp_fatal(KMP_MSG(DuplicateLibrary, KMP_LIBRARY_FILE, file_name),
6849                       KMP_HNT(DuplicateLibrary), __kmp_msg_null);
6850         }
6851         KMP_INTERNAL_FREE(duplicate_ok);
6852         __kmp_duplicate_library_ok = 1;
6853         done = 1; // Exit the loop.
6854       } break;
6855       case 2: { // Neighbor is dead.
6856 
6857 #if defined(KMP_USE_SHM)
6858         // close shared memory.
6859         shm_unlink(shm_name); // this removes file in /dev/shm
6860 #else
6861         // Clear the variable and try to register library again.
6862         __kmp_env_unset(name);
6863 #endif
6864       } break;
6865       default: {
6866         KMP_DEBUG_ASSERT(0);
6867       } break;
6868       }
6869     }
6870     KMP_INTERNAL_FREE((void *)value);
6871 #if defined(KMP_USE_SHM)
6872     KMP_INTERNAL_FREE((void *)shm_name);
6873 #endif
6874   } // while
6875   KMP_INTERNAL_FREE((void *)name);
6876 
6877 } // func __kmp_register_library_startup
6878 
6879 void __kmp_unregister_library(void) {
6880 
6881   char *name = __kmp_reg_status_name();
6882   char *value = NULL;
6883 
6884 #if defined(KMP_USE_SHM)
6885   bool use_shm = true;
6886   char *shm_name = __kmp_str_format("/%s", name);
6887   int fd1 = shm_open(shm_name, O_RDONLY, 0666);
6888   if (fd1 == -1) {
6889     // File did not open. Try the temporary file.
6890     use_shm = false;
6891     KMP_DEBUG_ASSERT(temp_reg_status_file_name);
6892     fd1 = open(temp_reg_status_file_name, O_RDONLY);
6893     if (fd1 == -1) {
6894       // give it up now.
6895       return;
6896     }
6897   }
6898   char *data1 = (char *)mmap(0, SHM_SIZE, PROT_READ, MAP_SHARED, fd1, 0);
6899   if (data1 != MAP_FAILED) {
6900     value = __kmp_str_format("%s", data1); // read value from SHM
6901     munmap(data1, SHM_SIZE);
6902   }
6903   close(fd1);
6904 #else
6905   value = __kmp_env_get(name);
6906 #endif
6907 
6908   KMP_DEBUG_ASSERT(__kmp_registration_flag != 0);
6909   KMP_DEBUG_ASSERT(__kmp_registration_str != NULL);
6910   if (value != NULL && strcmp(value, __kmp_registration_str) == 0) {
6911 //  Ok, this is our variable. Delete it.
6912 #if defined(KMP_USE_SHM)
6913     if (use_shm) {
6914       shm_unlink(shm_name); // this removes file in /dev/shm
6915     } else {
6916       KMP_DEBUG_ASSERT(temp_reg_status_file_name);
6917       unlink(temp_reg_status_file_name); // this removes the temp file
6918     }
6919 #else
6920     __kmp_env_unset(name);
6921 #endif
6922   }
6923 
6924 #if defined(KMP_USE_SHM)
6925   KMP_INTERNAL_FREE(shm_name);
6926   if (!use_shm) {
6927     KMP_DEBUG_ASSERT(temp_reg_status_file_name);
6928     KMP_INTERNAL_FREE(temp_reg_status_file_name);
6929   }
6930 #endif
6931 
6932   KMP_INTERNAL_FREE(__kmp_registration_str);
6933   KMP_INTERNAL_FREE(value);
6934   KMP_INTERNAL_FREE(name);
6935 
6936   __kmp_registration_flag = 0;
6937   __kmp_registration_str = NULL;
6938 
6939 } // __kmp_unregister_library
6940 
6941 // End of Library registration stuff.
6942 // -----------------------------------------------------------------------------
6943 
6944 #if KMP_MIC_SUPPORTED
6945 
6946 static void __kmp_check_mic_type() {
6947   kmp_cpuid_t cpuid_state = {0};
6948   kmp_cpuid_t *cs_p = &cpuid_state;
6949   __kmp_x86_cpuid(1, 0, cs_p);
6950   // We don't support mic1 at the moment
6951   if ((cs_p->eax & 0xff0) == 0xB10) {
6952     __kmp_mic_type = mic2;
6953   } else if ((cs_p->eax & 0xf0ff0) == 0x50670) {
6954     __kmp_mic_type = mic3;
6955   } else {
6956     __kmp_mic_type = non_mic;
6957   }
6958 }
6959 
6960 #endif /* KMP_MIC_SUPPORTED */
6961 
6962 #if KMP_HAVE_UMWAIT
6963 static void __kmp_user_level_mwait_init() {
6964   struct kmp_cpuid buf;
6965   __kmp_x86_cpuid(7, 0, &buf);
6966   __kmp_waitpkg_enabled = ((buf.ecx >> 5) & 1);
6967   __kmp_umwait_enabled = __kmp_waitpkg_enabled && __kmp_user_level_mwait;
6968   __kmp_tpause_enabled = __kmp_waitpkg_enabled && (__kmp_tpause_state > 0);
6969   KF_TRACE(30, ("__kmp_user_level_mwait_init: __kmp_umwait_enabled = %d\n",
6970                 __kmp_umwait_enabled));
6971 }
6972 #elif KMP_HAVE_MWAIT
6973 #ifndef AT_INTELPHIUSERMWAIT
6974 // Spurious, non-existent value that should always fail to return anything.
6975 // Will be replaced with the correct value when we know that.
6976 #define AT_INTELPHIUSERMWAIT 10000
6977 #endif
6978 // getauxval() function is available in RHEL7 and SLES12. If a system with an
6979 // earlier OS is used to build the RTL, we'll use the following internal
6980 // function when the entry is not found.
6981 unsigned long getauxval(unsigned long) KMP_WEAK_ATTRIBUTE_EXTERNAL;
6982 unsigned long getauxval(unsigned long) { return 0; }
6983 
6984 static void __kmp_user_level_mwait_init() {
6985   // When getauxval() and correct value of AT_INTELPHIUSERMWAIT are available
6986   // use them to find if the user-level mwait is enabled. Otherwise, forcibly
6987   // set __kmp_mwait_enabled=TRUE on Intel MIC if the environment variable
6988   // KMP_USER_LEVEL_MWAIT was set to TRUE.
6989   if (__kmp_mic_type == mic3) {
6990     unsigned long res = getauxval(AT_INTELPHIUSERMWAIT);
6991     if ((res & 0x1) || __kmp_user_level_mwait) {
6992       __kmp_mwait_enabled = TRUE;
6993       if (__kmp_user_level_mwait) {
6994         KMP_INFORM(EnvMwaitWarn);
6995       }
6996     } else {
6997       __kmp_mwait_enabled = FALSE;
6998     }
6999   }
7000   KF_TRACE(30, ("__kmp_user_level_mwait_init: __kmp_mic_type = %d, "
7001                 "__kmp_mwait_enabled = %d\n",
7002                 __kmp_mic_type, __kmp_mwait_enabled));
7003 }
7004 #endif /* KMP_HAVE_UMWAIT */
7005 
7006 static void __kmp_do_serial_initialize(void) {
7007   int i, gtid;
7008   size_t size;
7009 
7010   KA_TRACE(10, ("__kmp_do_serial_initialize: enter\n"));
7011 
7012   KMP_DEBUG_ASSERT(sizeof(kmp_int32) == 4);
7013   KMP_DEBUG_ASSERT(sizeof(kmp_uint32) == 4);
7014   KMP_DEBUG_ASSERT(sizeof(kmp_int64) == 8);
7015   KMP_DEBUG_ASSERT(sizeof(kmp_uint64) == 8);
7016   KMP_DEBUG_ASSERT(sizeof(kmp_intptr_t) == sizeof(void *));
7017 
7018 #if OMPT_SUPPORT
7019   ompt_pre_init();
7020 #endif
7021 #if OMPD_SUPPORT
7022   __kmp_env_dump();
7023   ompd_init();
7024 #endif
7025 
7026   __kmp_validate_locks();
7027 
7028 #if ENABLE_LIBOMPTARGET
7029   /* Initialize functions from libomptarget */
7030   __kmp_init_omptarget();
7031 #endif
7032 
7033   /* Initialize internal memory allocator */
7034   __kmp_init_allocator();
7035 
7036   /* Register the library startup via an environment variable or via mapped
7037      shared memory file and check to see whether another copy of the library is
7038      already registered. Since forked child process is often terminated, we
7039      postpone the registration till middle initialization in the child */
7040   if (__kmp_need_register_serial)
7041     __kmp_register_library_startup();
7042 
7043   /* TODO reinitialization of library */
7044   if (TCR_4(__kmp_global.g.g_done)) {
7045     KA_TRACE(10, ("__kmp_do_serial_initialize: reinitialization of library\n"));
7046   }
7047 
7048   __kmp_global.g.g_abort = 0;
7049   TCW_SYNC_4(__kmp_global.g.g_done, FALSE);
7050 
7051 /* initialize the locks */
7052 #if KMP_USE_ADAPTIVE_LOCKS
7053 #if KMP_DEBUG_ADAPTIVE_LOCKS
7054   __kmp_init_speculative_stats();
7055 #endif
7056 #endif
7057 #if KMP_STATS_ENABLED
7058   __kmp_stats_init();
7059 #endif
7060   __kmp_init_lock(&__kmp_global_lock);
7061   __kmp_init_queuing_lock(&__kmp_dispatch_lock);
7062   __kmp_init_lock(&__kmp_debug_lock);
7063   __kmp_init_atomic_lock(&__kmp_atomic_lock);
7064   __kmp_init_atomic_lock(&__kmp_atomic_lock_1i);
7065   __kmp_init_atomic_lock(&__kmp_atomic_lock_2i);
7066   __kmp_init_atomic_lock(&__kmp_atomic_lock_4i);
7067   __kmp_init_atomic_lock(&__kmp_atomic_lock_4r);
7068   __kmp_init_atomic_lock(&__kmp_atomic_lock_8i);
7069   __kmp_init_atomic_lock(&__kmp_atomic_lock_8r);
7070   __kmp_init_atomic_lock(&__kmp_atomic_lock_8c);
7071   __kmp_init_atomic_lock(&__kmp_atomic_lock_10r);
7072   __kmp_init_atomic_lock(&__kmp_atomic_lock_16r);
7073   __kmp_init_atomic_lock(&__kmp_atomic_lock_16c);
7074   __kmp_init_atomic_lock(&__kmp_atomic_lock_20c);
7075   __kmp_init_atomic_lock(&__kmp_atomic_lock_32c);
7076   __kmp_init_bootstrap_lock(&__kmp_forkjoin_lock);
7077   __kmp_init_bootstrap_lock(&__kmp_exit_lock);
7078 #if KMP_USE_MONITOR
7079   __kmp_init_bootstrap_lock(&__kmp_monitor_lock);
7080 #endif
7081   __kmp_init_bootstrap_lock(&__kmp_tp_cached_lock);
7082 
7083   /* conduct initialization and initial setup of configuration */
7084 
7085   __kmp_runtime_initialize();
7086 
7087 #if KMP_MIC_SUPPORTED
7088   __kmp_check_mic_type();
7089 #endif
7090 
7091 // Some global variable initialization moved here from kmp_env_initialize()
7092 #ifdef KMP_DEBUG
7093   kmp_diag = 0;
7094 #endif
7095   __kmp_abort_delay = 0;
7096 
7097   // From __kmp_init_dflt_team_nth()
7098   /* assume the entire machine will be used */
7099   __kmp_dflt_team_nth_ub = __kmp_xproc;
7100   if (__kmp_dflt_team_nth_ub < KMP_MIN_NTH) {
7101     __kmp_dflt_team_nth_ub = KMP_MIN_NTH;
7102   }
7103   if (__kmp_dflt_team_nth_ub > __kmp_sys_max_nth) {
7104     __kmp_dflt_team_nth_ub = __kmp_sys_max_nth;
7105   }
7106   __kmp_max_nth = __kmp_sys_max_nth;
7107   __kmp_cg_max_nth = __kmp_sys_max_nth;
7108   __kmp_teams_max_nth = __kmp_xproc; // set a "reasonable" default
7109   if (__kmp_teams_max_nth > __kmp_sys_max_nth) {
7110     __kmp_teams_max_nth = __kmp_sys_max_nth;
7111   }
7112 
7113   // Three vars below moved here from __kmp_env_initialize() "KMP_BLOCKTIME"
7114   // part
7115   __kmp_dflt_blocktime = KMP_DEFAULT_BLOCKTIME;
7116 #if KMP_USE_MONITOR
7117   __kmp_monitor_wakeups =
7118       KMP_WAKEUPS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups);
7119   __kmp_bt_intervals =
7120       KMP_INTERVALS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups);
7121 #endif
7122   // From "KMP_LIBRARY" part of __kmp_env_initialize()
7123   __kmp_library = library_throughput;
7124   // From KMP_SCHEDULE initialization
7125   __kmp_static = kmp_sch_static_balanced;
7126 // AC: do not use analytical here, because it is non-monotonous
7127 //__kmp_guided = kmp_sch_guided_iterative_chunked;
7128 //__kmp_auto = kmp_sch_guided_analytical_chunked; // AC: it is the default, no
7129 // need to repeat assignment
7130 // Barrier initialization. Moved here from __kmp_env_initialize() Barrier branch
7131 // bit control and barrier method control parts
7132 #if KMP_FAST_REDUCTION_BARRIER
7133 #define kmp_reduction_barrier_gather_bb ((int)1)
7134 #define kmp_reduction_barrier_release_bb ((int)1)
7135 #define kmp_reduction_barrier_gather_pat __kmp_barrier_gather_pat_dflt
7136 #define kmp_reduction_barrier_release_pat __kmp_barrier_release_pat_dflt
7137 #endif // KMP_FAST_REDUCTION_BARRIER
7138   for (i = bs_plain_barrier; i < bs_last_barrier; i++) {
7139     __kmp_barrier_gather_branch_bits[i] = __kmp_barrier_gather_bb_dflt;
7140     __kmp_barrier_release_branch_bits[i] = __kmp_barrier_release_bb_dflt;
7141     __kmp_barrier_gather_pattern[i] = __kmp_barrier_gather_pat_dflt;
7142     __kmp_barrier_release_pattern[i] = __kmp_barrier_release_pat_dflt;
7143 #if KMP_FAST_REDUCTION_BARRIER
7144     if (i == bs_reduction_barrier) { // tested and confirmed on ALTIX only (
7145       // lin_64 ): hyper,1
7146       __kmp_barrier_gather_branch_bits[i] = kmp_reduction_barrier_gather_bb;
7147       __kmp_barrier_release_branch_bits[i] = kmp_reduction_barrier_release_bb;
7148       __kmp_barrier_gather_pattern[i] = kmp_reduction_barrier_gather_pat;
7149       __kmp_barrier_release_pattern[i] = kmp_reduction_barrier_release_pat;
7150     }
7151 #endif // KMP_FAST_REDUCTION_BARRIER
7152   }
7153 #if KMP_FAST_REDUCTION_BARRIER
7154 #undef kmp_reduction_barrier_release_pat
7155 #undef kmp_reduction_barrier_gather_pat
7156 #undef kmp_reduction_barrier_release_bb
7157 #undef kmp_reduction_barrier_gather_bb
7158 #endif // KMP_FAST_REDUCTION_BARRIER
7159 #if KMP_MIC_SUPPORTED
7160   if (__kmp_mic_type == mic2) { // KNC
7161     // AC: plane=3,2, forkjoin=2,1 are optimal for 240 threads on KNC
7162     __kmp_barrier_gather_branch_bits[bs_plain_barrier] = 3; // plain gather
7163     __kmp_barrier_release_branch_bits[bs_forkjoin_barrier] =
7164         1; // forkjoin release
7165     __kmp_barrier_gather_pattern[bs_forkjoin_barrier] = bp_hierarchical_bar;
7166     __kmp_barrier_release_pattern[bs_forkjoin_barrier] = bp_hierarchical_bar;
7167   }
7168 #if KMP_FAST_REDUCTION_BARRIER
7169   if (__kmp_mic_type == mic2) { // KNC
7170     __kmp_barrier_gather_pattern[bs_reduction_barrier] = bp_hierarchical_bar;
7171     __kmp_barrier_release_pattern[bs_reduction_barrier] = bp_hierarchical_bar;
7172   }
7173 #endif // KMP_FAST_REDUCTION_BARRIER
7174 #endif // KMP_MIC_SUPPORTED
7175 
7176 // From KMP_CHECKS initialization
7177 #ifdef KMP_DEBUG
7178   __kmp_env_checks = TRUE; /* development versions have the extra checks */
7179 #else
7180   __kmp_env_checks = FALSE; /* port versions do not have the extra checks */
7181 #endif
7182 
7183   // From "KMP_FOREIGN_THREADS_THREADPRIVATE" initialization
7184   __kmp_foreign_tp = TRUE;
7185 
7186   __kmp_global.g.g_dynamic = FALSE;
7187   __kmp_global.g.g_dynamic_mode = dynamic_default;
7188 
7189   __kmp_init_nesting_mode();
7190 
7191   __kmp_env_initialize(NULL);
7192 
7193 #if KMP_HAVE_MWAIT || KMP_HAVE_UMWAIT
7194   __kmp_user_level_mwait_init();
7195 #endif
7196 // Print all messages in message catalog for testing purposes.
7197 #ifdef KMP_DEBUG
7198   char const *val = __kmp_env_get("KMP_DUMP_CATALOG");
7199   if (__kmp_str_match_true(val)) {
7200     kmp_str_buf_t buffer;
7201     __kmp_str_buf_init(&buffer);
7202     __kmp_i18n_dump_catalog(&buffer);
7203     __kmp_printf("%s", buffer.str);
7204     __kmp_str_buf_free(&buffer);
7205   }
7206   __kmp_env_free(&val);
7207 #endif
7208 
7209   __kmp_threads_capacity =
7210       __kmp_initial_threads_capacity(__kmp_dflt_team_nth_ub);
7211   // Moved here from __kmp_env_initialize() "KMP_ALL_THREADPRIVATE" part
7212   __kmp_tp_capacity = __kmp_default_tp_capacity(
7213       __kmp_dflt_team_nth_ub, __kmp_max_nth, __kmp_allThreadsSpecified);
7214 
7215   // If the library is shut down properly, both pools must be NULL. Just in
7216   // case, set them to NULL -- some memory may leak, but subsequent code will
7217   // work even if pools are not freed.
7218   KMP_DEBUG_ASSERT(__kmp_thread_pool == NULL);
7219   KMP_DEBUG_ASSERT(__kmp_thread_pool_insert_pt == NULL);
7220   KMP_DEBUG_ASSERT(__kmp_team_pool == NULL);
7221   __kmp_thread_pool = NULL;
7222   __kmp_thread_pool_insert_pt = NULL;
7223   __kmp_team_pool = NULL;
7224 
7225   /* Allocate all of the variable sized records */
7226   /* NOTE: __kmp_threads_capacity entries are allocated, but the arrays are
7227    * expandable */
7228   /* Since allocation is cache-aligned, just add extra padding at the end */
7229   size =
7230       (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * __kmp_threads_capacity +
7231       CACHE_LINE;
7232   __kmp_threads = (kmp_info_t **)__kmp_allocate(size);
7233   __kmp_root = (kmp_root_t **)((char *)__kmp_threads +
7234                                sizeof(kmp_info_t *) * __kmp_threads_capacity);
7235 
7236   /* init thread counts */
7237   KMP_DEBUG_ASSERT(__kmp_all_nth ==
7238                    0); // Asserts fail if the library is reinitializing and
7239   KMP_DEBUG_ASSERT(__kmp_nth == 0); // something was wrong in termination.
7240   __kmp_all_nth = 0;
7241   __kmp_nth = 0;
7242 
7243   /* setup the uber master thread and hierarchy */
7244   gtid = __kmp_register_root(TRUE);
7245   KA_TRACE(10, ("__kmp_do_serial_initialize  T#%d\n", gtid));
7246   KMP_ASSERT(KMP_UBER_GTID(gtid));
7247   KMP_ASSERT(KMP_INITIAL_GTID(gtid));
7248 
7249   KMP_MB(); /* Flush all pending memory write invalidates.  */
7250 
7251   __kmp_common_initialize();
7252 
7253 #if KMP_OS_UNIX
7254   /* invoke the child fork handler */
7255   __kmp_register_atfork();
7256 #endif
7257 
7258 #if !KMP_DYNAMIC_LIB ||                                                        \
7259     ((KMP_COMPILER_ICC || KMP_COMPILER_ICX) && KMP_OS_DARWIN)
7260   {
7261     /* Invoke the exit handler when the program finishes, only for static
7262        library and macOS* dynamic. For other dynamic libraries, we already
7263        have _fini and DllMain. */
7264     int rc = atexit(__kmp_internal_end_atexit);
7265     if (rc != 0) {
7266       __kmp_fatal(KMP_MSG(FunctionError, "atexit()"), KMP_ERR(rc),
7267                   __kmp_msg_null);
7268     }
7269   }
7270 #endif
7271 
7272 #if KMP_HANDLE_SIGNALS
7273 #if KMP_OS_UNIX
7274   /* NOTE: make sure that this is called before the user installs their own
7275      signal handlers so that the user handlers are called first. this way they
7276      can return false, not call our handler, avoid terminating the library, and
7277      continue execution where they left off. */
7278   __kmp_install_signals(FALSE);
7279 #endif /* KMP_OS_UNIX */
7280 #if KMP_OS_WINDOWS
7281   __kmp_install_signals(TRUE);
7282 #endif /* KMP_OS_WINDOWS */
7283 #endif
7284 
7285   /* we have finished the serial initialization */
7286   __kmp_init_counter++;
7287 
7288   __kmp_init_serial = TRUE;
7289 
7290   if (__kmp_version) {
7291     __kmp_print_version_1();
7292   }
7293 
7294   if (__kmp_settings) {
7295     __kmp_env_print();
7296   }
7297 
7298   if (__kmp_display_env || __kmp_display_env_verbose) {
7299     __kmp_env_print_2();
7300   }
7301 
7302 #if OMPT_SUPPORT
7303   ompt_post_init();
7304 #endif
7305 
7306   KMP_MB();
7307 
7308   KA_TRACE(10, ("__kmp_do_serial_initialize: exit\n"));
7309 }
7310 
7311 void __kmp_serial_initialize(void) {
7312   if (__kmp_init_serial) {
7313     return;
7314   }
7315   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7316   if (__kmp_init_serial) {
7317     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7318     return;
7319   }
7320   __kmp_do_serial_initialize();
7321   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7322 }
7323 
7324 static void __kmp_do_middle_initialize(void) {
7325   int i, j;
7326   int prev_dflt_team_nth;
7327 
7328   if (!__kmp_init_serial) {
7329     __kmp_do_serial_initialize();
7330   }
7331 
7332   KA_TRACE(10, ("__kmp_middle_initialize: enter\n"));
7333 
7334   if (UNLIKELY(!__kmp_need_register_serial)) {
7335     // We are in a forked child process. The registration was skipped during
7336     // serial initialization in __kmp_atfork_child handler. Do it here.
7337     __kmp_register_library_startup();
7338   }
7339 
7340   // Save the previous value for the __kmp_dflt_team_nth so that
7341   // we can avoid some reinitialization if it hasn't changed.
7342   prev_dflt_team_nth = __kmp_dflt_team_nth;
7343 
7344 #if KMP_AFFINITY_SUPPORTED
7345   // __kmp_affinity_initialize() will try to set __kmp_ncores to the
7346   // number of cores on the machine.
7347   __kmp_affinity_initialize(__kmp_affinity);
7348 
7349 #endif /* KMP_AFFINITY_SUPPORTED */
7350 
7351   KMP_ASSERT(__kmp_xproc > 0);
7352   if (__kmp_avail_proc == 0) {
7353     __kmp_avail_proc = __kmp_xproc;
7354   }
7355 
7356   // If there were empty places in num_threads list (OMP_NUM_THREADS=,,2,3),
7357   // correct them now
7358   j = 0;
7359   while ((j < __kmp_nested_nth.used) && !__kmp_nested_nth.nth[j]) {
7360     __kmp_nested_nth.nth[j] = __kmp_dflt_team_nth = __kmp_dflt_team_nth_ub =
7361         __kmp_avail_proc;
7362     j++;
7363   }
7364 
7365   if (__kmp_dflt_team_nth == 0) {
7366 #ifdef KMP_DFLT_NTH_CORES
7367     // Default #threads = #cores
7368     __kmp_dflt_team_nth = __kmp_ncores;
7369     KA_TRACE(20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = "
7370                   "__kmp_ncores (%d)\n",
7371                   __kmp_dflt_team_nth));
7372 #else
7373     // Default #threads = #available OS procs
7374     __kmp_dflt_team_nth = __kmp_avail_proc;
7375     KA_TRACE(20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = "
7376                   "__kmp_avail_proc(%d)\n",
7377                   __kmp_dflt_team_nth));
7378 #endif /* KMP_DFLT_NTH_CORES */
7379   }
7380 
7381   if (__kmp_dflt_team_nth < KMP_MIN_NTH) {
7382     __kmp_dflt_team_nth = KMP_MIN_NTH;
7383   }
7384   if (__kmp_dflt_team_nth > __kmp_sys_max_nth) {
7385     __kmp_dflt_team_nth = __kmp_sys_max_nth;
7386   }
7387 
7388   if (__kmp_nesting_mode > 0)
7389     __kmp_set_nesting_mode_threads();
7390 
7391   // There's no harm in continuing if the following check fails,
7392   // but it indicates an error in the previous logic.
7393   KMP_DEBUG_ASSERT(__kmp_dflt_team_nth <= __kmp_dflt_team_nth_ub);
7394 
7395   if (__kmp_dflt_team_nth != prev_dflt_team_nth) {
7396     // Run through the __kmp_threads array and set the num threads icv for each
7397     // root thread that is currently registered with the RTL (which has not
7398     // already explicitly set its nthreads-var with a call to
7399     // omp_set_num_threads()).
7400     for (i = 0; i < __kmp_threads_capacity; i++) {
7401       kmp_info_t *thread = __kmp_threads[i];
7402       if (thread == NULL)
7403         continue;
7404       if (thread->th.th_current_task->td_icvs.nproc != 0)
7405         continue;
7406 
7407       set__nproc(__kmp_threads[i], __kmp_dflt_team_nth);
7408     }
7409   }
7410   KA_TRACE(
7411       20,
7412       ("__kmp_middle_initialize: final value for __kmp_dflt_team_nth = %d\n",
7413        __kmp_dflt_team_nth));
7414 
7415 #ifdef KMP_ADJUST_BLOCKTIME
7416   /* Adjust blocktime to zero if necessary  now that __kmp_avail_proc is set */
7417   if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
7418     KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
7419     if (__kmp_nth > __kmp_avail_proc) {
7420       __kmp_zero_bt = TRUE;
7421     }
7422   }
7423 #endif /* KMP_ADJUST_BLOCKTIME */
7424 
7425   /* we have finished middle initialization */
7426   TCW_SYNC_4(__kmp_init_middle, TRUE);
7427 
7428   KA_TRACE(10, ("__kmp_do_middle_initialize: exit\n"));
7429 }
7430 
7431 void __kmp_middle_initialize(void) {
7432   if (__kmp_init_middle) {
7433     return;
7434   }
7435   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7436   if (__kmp_init_middle) {
7437     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7438     return;
7439   }
7440   __kmp_do_middle_initialize();
7441   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7442 }
7443 
7444 void __kmp_parallel_initialize(void) {
7445   int gtid = __kmp_entry_gtid(); // this might be a new root
7446 
7447   /* synchronize parallel initialization (for sibling) */
7448   if (TCR_4(__kmp_init_parallel))
7449     return;
7450   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7451   if (TCR_4(__kmp_init_parallel)) {
7452     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7453     return;
7454   }
7455 
7456   /* TODO reinitialization after we have already shut down */
7457   if (TCR_4(__kmp_global.g.g_done)) {
7458     KA_TRACE(
7459         10,
7460         ("__kmp_parallel_initialize: attempt to init while shutting down\n"));
7461     __kmp_infinite_loop();
7462   }
7463 
7464   /* jc: The lock __kmp_initz_lock is already held, so calling
7465      __kmp_serial_initialize would cause a deadlock.  So we call
7466      __kmp_do_serial_initialize directly. */
7467   if (!__kmp_init_middle) {
7468     __kmp_do_middle_initialize();
7469   }
7470   __kmp_assign_root_init_mask();
7471   __kmp_resume_if_hard_paused();
7472 
7473   /* begin initialization */
7474   KA_TRACE(10, ("__kmp_parallel_initialize: enter\n"));
7475   KMP_ASSERT(KMP_UBER_GTID(gtid));
7476 
7477 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
7478   // Save the FP control regs.
7479   // Worker threads will set theirs to these values at thread startup.
7480   __kmp_store_x87_fpu_control_word(&__kmp_init_x87_fpu_control_word);
7481   __kmp_store_mxcsr(&__kmp_init_mxcsr);
7482   __kmp_init_mxcsr &= KMP_X86_MXCSR_MASK;
7483 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
7484 
7485 #if KMP_OS_UNIX
7486 #if KMP_HANDLE_SIGNALS
7487   /*  must be after __kmp_serial_initialize  */
7488   __kmp_install_signals(TRUE);
7489 #endif
7490 #endif
7491 
7492   __kmp_suspend_initialize();
7493 
7494 #if defined(USE_LOAD_BALANCE)
7495   if (__kmp_global.g.g_dynamic_mode == dynamic_default) {
7496     __kmp_global.g.g_dynamic_mode = dynamic_load_balance;
7497   }
7498 #else
7499   if (__kmp_global.g.g_dynamic_mode == dynamic_default) {
7500     __kmp_global.g.g_dynamic_mode = dynamic_thread_limit;
7501   }
7502 #endif
7503 
7504   if (__kmp_version) {
7505     __kmp_print_version_2();
7506   }
7507 
7508   /* we have finished parallel initialization */
7509   TCW_SYNC_4(__kmp_init_parallel, TRUE);
7510 
7511   KMP_MB();
7512   KA_TRACE(10, ("__kmp_parallel_initialize: exit\n"));
7513 
7514   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7515 }
7516 
7517 void __kmp_hidden_helper_initialize() {
7518   if (TCR_4(__kmp_init_hidden_helper))
7519     return;
7520 
7521   // __kmp_parallel_initialize is required before we initialize hidden helper
7522   if (!TCR_4(__kmp_init_parallel))
7523     __kmp_parallel_initialize();
7524 
7525   // Double check. Note that this double check should not be placed before
7526   // __kmp_parallel_initialize as it will cause dead lock.
7527   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7528   if (TCR_4(__kmp_init_hidden_helper)) {
7529     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7530     return;
7531   }
7532 
7533 #if KMP_AFFINITY_SUPPORTED
7534   // Initialize hidden helper affinity settings.
7535   // The above __kmp_parallel_initialize() will initialize
7536   // regular affinity (and topology) if not already done.
7537   if (!__kmp_hh_affinity.flags.initialized)
7538     __kmp_affinity_initialize(__kmp_hh_affinity);
7539 #endif
7540 
7541   // Set the count of hidden helper tasks to be executed to zero
7542   KMP_ATOMIC_ST_REL(&__kmp_unexecuted_hidden_helper_tasks, 0);
7543 
7544   // Set the global variable indicating that we're initializing hidden helper
7545   // team/threads
7546   TCW_SYNC_4(__kmp_init_hidden_helper_threads, TRUE);
7547 
7548   // Platform independent initialization
7549   __kmp_do_initialize_hidden_helper_threads();
7550 
7551   // Wait here for the finish of initialization of hidden helper teams
7552   __kmp_hidden_helper_threads_initz_wait();
7553 
7554   // We have finished hidden helper initialization
7555   TCW_SYNC_4(__kmp_init_hidden_helper, TRUE);
7556 
7557   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7558 }
7559 
7560 /* ------------------------------------------------------------------------ */
7561 
7562 void __kmp_run_before_invoked_task(int gtid, int tid, kmp_info_t *this_thr,
7563                                    kmp_team_t *team) {
7564   kmp_disp_t *dispatch;
7565 
7566   KMP_MB();
7567 
7568   /* none of the threads have encountered any constructs, yet. */
7569   this_thr->th.th_local.this_construct = 0;
7570 #if KMP_CACHE_MANAGE
7571   KMP_CACHE_PREFETCH(&this_thr->th.th_bar[bs_forkjoin_barrier].bb.b_arrived);
7572 #endif /* KMP_CACHE_MANAGE */
7573   dispatch = (kmp_disp_t *)TCR_PTR(this_thr->th.th_dispatch);
7574   KMP_DEBUG_ASSERT(dispatch);
7575   KMP_DEBUG_ASSERT(team->t.t_dispatch);
7576   // KMP_DEBUG_ASSERT( this_thr->th.th_dispatch == &team->t.t_dispatch[
7577   // this_thr->th.th_info.ds.ds_tid ] );
7578 
7579   dispatch->th_disp_index = 0; /* reset the dispatch buffer counter */
7580   dispatch->th_doacross_buf_idx = 0; // reset doacross dispatch buffer counter
7581   if (__kmp_env_consistency_check)
7582     __kmp_push_parallel(gtid, team->t.t_ident);
7583 
7584   KMP_MB(); /* Flush all pending memory write invalidates.  */
7585 }
7586 
7587 void __kmp_run_after_invoked_task(int gtid, int tid, kmp_info_t *this_thr,
7588                                   kmp_team_t *team) {
7589   if (__kmp_env_consistency_check)
7590     __kmp_pop_parallel(gtid, team->t.t_ident);
7591 
7592   __kmp_finish_implicit_task(this_thr);
7593 }
7594 
7595 int __kmp_invoke_task_func(int gtid) {
7596   int rc;
7597   int tid = __kmp_tid_from_gtid(gtid);
7598   kmp_info_t *this_thr = __kmp_threads[gtid];
7599   kmp_team_t *team = this_thr->th.th_team;
7600 
7601   __kmp_run_before_invoked_task(gtid, tid, this_thr, team);
7602 #if USE_ITT_BUILD
7603   if (__itt_stack_caller_create_ptr) {
7604     // inform ittnotify about entering user's code
7605     if (team->t.t_stack_id != NULL) {
7606       __kmp_itt_stack_callee_enter((__itt_caller)team->t.t_stack_id);
7607     } else {
7608       KMP_DEBUG_ASSERT(team->t.t_parent->t.t_stack_id != NULL);
7609       __kmp_itt_stack_callee_enter(
7610           (__itt_caller)team->t.t_parent->t.t_stack_id);
7611     }
7612   }
7613 #endif /* USE_ITT_BUILD */
7614 #if INCLUDE_SSC_MARKS
7615   SSC_MARK_INVOKING();
7616 #endif
7617 
7618 #if OMPT_SUPPORT
7619   void *dummy;
7620   void **exit_frame_p;
7621   ompt_data_t *my_task_data;
7622   ompt_data_t *my_parallel_data;
7623   int ompt_team_size;
7624 
7625   if (ompt_enabled.enabled) {
7626     exit_frame_p = &(team->t.t_implicit_task_taskdata[tid]
7627                          .ompt_task_info.frame.exit_frame.ptr);
7628   } else {
7629     exit_frame_p = &dummy;
7630   }
7631 
7632   my_task_data =
7633       &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data);
7634   my_parallel_data = &(team->t.ompt_team_info.parallel_data);
7635   if (ompt_enabled.ompt_callback_implicit_task) {
7636     ompt_team_size = team->t.t_nproc;
7637     ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
7638         ompt_scope_begin, my_parallel_data, my_task_data, ompt_team_size,
7639         __kmp_tid_from_gtid(gtid), ompt_task_implicit);
7640     OMPT_CUR_TASK_INFO(this_thr)->thread_num = __kmp_tid_from_gtid(gtid);
7641   }
7642 #endif
7643 
7644 #if KMP_STATS_ENABLED
7645   stats_state_e previous_state = KMP_GET_THREAD_STATE();
7646   if (previous_state == stats_state_e::TEAMS_REGION) {
7647     KMP_PUSH_PARTITIONED_TIMER(OMP_teams);
7648   } else {
7649     KMP_PUSH_PARTITIONED_TIMER(OMP_parallel);
7650   }
7651   KMP_SET_THREAD_STATE(IMPLICIT_TASK);
7652 #endif
7653 
7654   rc = __kmp_invoke_microtask((microtask_t)TCR_SYNC_PTR(team->t.t_pkfn), gtid,
7655                               tid, (int)team->t.t_argc, (void **)team->t.t_argv
7656 #if OMPT_SUPPORT
7657                               ,
7658                               exit_frame_p
7659 #endif
7660   );
7661 #if OMPT_SUPPORT
7662   *exit_frame_p = NULL;
7663   this_thr->th.ompt_thread_info.parallel_flags |= ompt_parallel_team;
7664 #endif
7665 
7666 #if KMP_STATS_ENABLED
7667   if (previous_state == stats_state_e::TEAMS_REGION) {
7668     KMP_SET_THREAD_STATE(previous_state);
7669   }
7670   KMP_POP_PARTITIONED_TIMER();
7671 #endif
7672 
7673 #if USE_ITT_BUILD
7674   if (__itt_stack_caller_create_ptr) {
7675     // inform ittnotify about leaving user's code
7676     if (team->t.t_stack_id != NULL) {
7677       __kmp_itt_stack_callee_leave((__itt_caller)team->t.t_stack_id);
7678     } else {
7679       KMP_DEBUG_ASSERT(team->t.t_parent->t.t_stack_id != NULL);
7680       __kmp_itt_stack_callee_leave(
7681           (__itt_caller)team->t.t_parent->t.t_stack_id);
7682     }
7683   }
7684 #endif /* USE_ITT_BUILD */
7685   __kmp_run_after_invoked_task(gtid, tid, this_thr, team);
7686 
7687   return rc;
7688 }
7689 
7690 void __kmp_teams_master(int gtid) {
7691   // This routine is called by all primary threads in teams construct
7692   kmp_info_t *thr = __kmp_threads[gtid];
7693   kmp_team_t *team = thr->th.th_team;
7694   ident_t *loc = team->t.t_ident;
7695   thr->th.th_set_nproc = thr->th.th_teams_size.nth;
7696   KMP_DEBUG_ASSERT(thr->th.th_teams_microtask);
7697   KMP_DEBUG_ASSERT(thr->th.th_set_nproc);
7698   KA_TRACE(20, ("__kmp_teams_master: T#%d, Tid %d, microtask %p\n", gtid,
7699                 __kmp_tid_from_gtid(gtid), thr->th.th_teams_microtask));
7700 
7701   // This thread is a new CG root.  Set up the proper variables.
7702   kmp_cg_root_t *tmp = (kmp_cg_root_t *)__kmp_allocate(sizeof(kmp_cg_root_t));
7703   tmp->cg_root = thr; // Make thr the CG root
7704   // Init to thread limit stored when league primary threads were forked
7705   tmp->cg_thread_limit = thr->th.th_current_task->td_icvs.thread_limit;
7706   tmp->cg_nthreads = 1; // Init counter to one active thread, this one
7707   KA_TRACE(100, ("__kmp_teams_master: Thread %p created node %p and init"
7708                  " cg_nthreads to 1\n",
7709                  thr, tmp));
7710   tmp->up = thr->th.th_cg_roots;
7711   thr->th.th_cg_roots = tmp;
7712 
7713 // Launch league of teams now, but not let workers execute
7714 // (they hang on fork barrier until next parallel)
7715 #if INCLUDE_SSC_MARKS
7716   SSC_MARK_FORKING();
7717 #endif
7718   __kmp_fork_call(loc, gtid, fork_context_intel, team->t.t_argc,
7719                   (microtask_t)thr->th.th_teams_microtask, // "wrapped" task
7720                   VOLATILE_CAST(launch_t) __kmp_invoke_task_func, NULL);
7721 #if INCLUDE_SSC_MARKS
7722   SSC_MARK_JOINING();
7723 #endif
7724   // If the team size was reduced from the limit, set it to the new size
7725   if (thr->th.th_team_nproc < thr->th.th_teams_size.nth)
7726     thr->th.th_teams_size.nth = thr->th.th_team_nproc;
7727   // AC: last parameter "1" eliminates join barrier which won't work because
7728   // worker threads are in a fork barrier waiting for more parallel regions
7729   __kmp_join_call(loc, gtid
7730 #if OMPT_SUPPORT
7731                   ,
7732                   fork_context_intel
7733 #endif
7734                   ,
7735                   1);
7736 }
7737 
7738 int __kmp_invoke_teams_master(int gtid) {
7739   kmp_info_t *this_thr = __kmp_threads[gtid];
7740   kmp_team_t *team = this_thr->th.th_team;
7741 #if KMP_DEBUG
7742   if (!__kmp_threads[gtid]->th.th_team->t.t_serialized)
7743     KMP_DEBUG_ASSERT((void *)__kmp_threads[gtid]->th.th_team->t.t_pkfn ==
7744                      (void *)__kmp_teams_master);
7745 #endif
7746   __kmp_run_before_invoked_task(gtid, 0, this_thr, team);
7747 #if OMPT_SUPPORT
7748   int tid = __kmp_tid_from_gtid(gtid);
7749   ompt_data_t *task_data =
7750       &team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data;
7751   ompt_data_t *parallel_data = &team->t.ompt_team_info.parallel_data;
7752   if (ompt_enabled.ompt_callback_implicit_task) {
7753     ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
7754         ompt_scope_begin, parallel_data, task_data, team->t.t_nproc, tid,
7755         ompt_task_initial);
7756     OMPT_CUR_TASK_INFO(this_thr)->thread_num = tid;
7757   }
7758 #endif
7759   __kmp_teams_master(gtid);
7760 #if OMPT_SUPPORT
7761   this_thr->th.ompt_thread_info.parallel_flags |= ompt_parallel_league;
7762 #endif
7763   __kmp_run_after_invoked_task(gtid, 0, this_thr, team);
7764   return 1;
7765 }
7766 
7767 /* this sets the requested number of threads for the next parallel region
7768    encountered by this team. since this should be enclosed in the forkjoin
7769    critical section it should avoid race conditions with asymmetrical nested
7770    parallelism */
7771 
7772 void __kmp_push_num_threads(ident_t *id, int gtid, int num_threads) {
7773   kmp_info_t *thr = __kmp_threads[gtid];
7774 
7775   if (num_threads > 0)
7776     thr->th.th_set_nproc = num_threads;
7777 }
7778 
7779 static void __kmp_push_thread_limit(kmp_info_t *thr, int num_teams,
7780                                     int num_threads) {
7781   KMP_DEBUG_ASSERT(thr);
7782   // Remember the number of threads for inner parallel regions
7783   if (!TCR_4(__kmp_init_middle))
7784     __kmp_middle_initialize(); // get internal globals calculated
7785   __kmp_assign_root_init_mask();
7786   KMP_DEBUG_ASSERT(__kmp_avail_proc);
7787   KMP_DEBUG_ASSERT(__kmp_dflt_team_nth);
7788 
7789   if (num_threads == 0) {
7790     if (__kmp_teams_thread_limit > 0) {
7791       num_threads = __kmp_teams_thread_limit;
7792     } else {
7793       num_threads = __kmp_avail_proc / num_teams;
7794     }
7795     // adjust num_threads w/o warning as it is not user setting
7796     // num_threads = min(num_threads, nthreads-var, thread-limit-var)
7797     // no thread_limit clause specified -  do not change thread-limit-var ICV
7798     if (num_threads > __kmp_dflt_team_nth) {
7799       num_threads = __kmp_dflt_team_nth; // honor nthreads-var ICV
7800     }
7801     if (num_threads > thr->th.th_current_task->td_icvs.thread_limit) {
7802       num_threads = thr->th.th_current_task->td_icvs.thread_limit;
7803     } // prevent team size to exceed thread-limit-var
7804     if (num_teams * num_threads > __kmp_teams_max_nth) {
7805       num_threads = __kmp_teams_max_nth / num_teams;
7806     }
7807     if (num_threads == 0) {
7808       num_threads = 1;
7809     }
7810   } else {
7811     if (num_threads < 0) {
7812       __kmp_msg(kmp_ms_warning, KMP_MSG(CantFormThrTeam, num_threads, 1),
7813                 __kmp_msg_null);
7814       num_threads = 1;
7815     }
7816     // This thread will be the primary thread of the league primary threads
7817     // Store new thread limit; old limit is saved in th_cg_roots list
7818     thr->th.th_current_task->td_icvs.thread_limit = num_threads;
7819     // num_threads = min(num_threads, nthreads-var)
7820     if (num_threads > __kmp_dflt_team_nth) {
7821       num_threads = __kmp_dflt_team_nth; // honor nthreads-var ICV
7822     }
7823     if (num_teams * num_threads > __kmp_teams_max_nth) {
7824       int new_threads = __kmp_teams_max_nth / num_teams;
7825       if (new_threads == 0) {
7826         new_threads = 1;
7827       }
7828       if (new_threads != num_threads) {
7829         if (!__kmp_reserve_warn) { // user asked for too many threads
7830           __kmp_reserve_warn = 1; // conflicts with KMP_TEAMS_THREAD_LIMIT
7831           __kmp_msg(kmp_ms_warning,
7832                     KMP_MSG(CantFormThrTeam, num_threads, new_threads),
7833                     KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
7834         }
7835       }
7836       num_threads = new_threads;
7837     }
7838   }
7839   thr->th.th_teams_size.nth = num_threads;
7840 }
7841 
7842 /* this sets the requested number of teams for the teams region and/or
7843    the number of threads for the next parallel region encountered  */
7844 void __kmp_push_num_teams(ident_t *id, int gtid, int num_teams,
7845                           int num_threads) {
7846   kmp_info_t *thr = __kmp_threads[gtid];
7847   if (num_teams < 0) {
7848     // OpenMP specification requires requested values to be positive,
7849     // but people can send us any value, so we'd better check
7850     __kmp_msg(kmp_ms_warning, KMP_MSG(NumTeamsNotPositive, num_teams, 1),
7851               __kmp_msg_null);
7852     num_teams = 1;
7853   }
7854   if (num_teams == 0) {
7855     if (__kmp_nteams > 0) {
7856       num_teams = __kmp_nteams;
7857     } else {
7858       num_teams = 1; // default number of teams is 1.
7859     }
7860   }
7861   if (num_teams > __kmp_teams_max_nth) { // if too many teams requested?
7862     if (!__kmp_reserve_warn) {
7863       __kmp_reserve_warn = 1;
7864       __kmp_msg(kmp_ms_warning,
7865                 KMP_MSG(CantFormThrTeam, num_teams, __kmp_teams_max_nth),
7866                 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
7867     }
7868     num_teams = __kmp_teams_max_nth;
7869   }
7870   // Set number of teams (number of threads in the outer "parallel" of the
7871   // teams)
7872   thr->th.th_set_nproc = thr->th.th_teams_size.nteams = num_teams;
7873 
7874   __kmp_push_thread_limit(thr, num_teams, num_threads);
7875 }
7876 
7877 /* This sets the requested number of teams for the teams region and/or
7878    the number of threads for the next parallel region encountered  */
7879 void __kmp_push_num_teams_51(ident_t *id, int gtid, int num_teams_lb,
7880                              int num_teams_ub, int num_threads) {
7881   kmp_info_t *thr = __kmp_threads[gtid];
7882   KMP_DEBUG_ASSERT(num_teams_lb >= 0 && num_teams_ub >= 0);
7883   KMP_DEBUG_ASSERT(num_teams_ub >= num_teams_lb);
7884   KMP_DEBUG_ASSERT(num_threads >= 0);
7885 
7886   if (num_teams_lb > num_teams_ub) {
7887     __kmp_fatal(KMP_MSG(FailedToCreateTeam, num_teams_lb, num_teams_ub),
7888                 KMP_HNT(SetNewBound, __kmp_teams_max_nth), __kmp_msg_null);
7889   }
7890 
7891   int num_teams = 1; // defalt number of teams is 1.
7892 
7893   if (num_teams_lb == 0 && num_teams_ub > 0)
7894     num_teams_lb = num_teams_ub;
7895 
7896   if (num_teams_lb == 0 && num_teams_ub == 0) { // no num_teams clause
7897     num_teams = (__kmp_nteams > 0) ? __kmp_nteams : num_teams;
7898     if (num_teams > __kmp_teams_max_nth) {
7899       if (!__kmp_reserve_warn) {
7900         __kmp_reserve_warn = 1;
7901         __kmp_msg(kmp_ms_warning,
7902                   KMP_MSG(CantFormThrTeam, num_teams, __kmp_teams_max_nth),
7903                   KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
7904       }
7905       num_teams = __kmp_teams_max_nth;
7906     }
7907   } else if (num_teams_lb == num_teams_ub) { // requires exact number of teams
7908     num_teams = num_teams_ub;
7909   } else { // num_teams_lb <= num_teams <= num_teams_ub
7910     if (num_threads <= 0) {
7911       if (num_teams_ub > __kmp_teams_max_nth) {
7912         num_teams = num_teams_lb;
7913       } else {
7914         num_teams = num_teams_ub;
7915       }
7916     } else {
7917       num_teams = (num_threads > __kmp_teams_max_nth)
7918                       ? num_teams
7919                       : __kmp_teams_max_nth / num_threads;
7920       if (num_teams < num_teams_lb) {
7921         num_teams = num_teams_lb;
7922       } else if (num_teams > num_teams_ub) {
7923         num_teams = num_teams_ub;
7924       }
7925     }
7926   }
7927   // Set number of teams (number of threads in the outer "parallel" of the
7928   // teams)
7929   thr->th.th_set_nproc = thr->th.th_teams_size.nteams = num_teams;
7930 
7931   __kmp_push_thread_limit(thr, num_teams, num_threads);
7932 }
7933 
7934 // Set the proc_bind var to use in the following parallel region.
7935 void __kmp_push_proc_bind(ident_t *id, int gtid, kmp_proc_bind_t proc_bind) {
7936   kmp_info_t *thr = __kmp_threads[gtid];
7937   thr->th.th_set_proc_bind = proc_bind;
7938 }
7939 
7940 /* Launch the worker threads into the microtask. */
7941 
7942 void __kmp_internal_fork(ident_t *id, int gtid, kmp_team_t *team) {
7943   kmp_info_t *this_thr = __kmp_threads[gtid];
7944 
7945 #ifdef KMP_DEBUG
7946   int f;
7947 #endif /* KMP_DEBUG */
7948 
7949   KMP_DEBUG_ASSERT(team);
7950   KMP_DEBUG_ASSERT(this_thr->th.th_team == team);
7951   KMP_ASSERT(KMP_MASTER_GTID(gtid));
7952   KMP_MB(); /* Flush all pending memory write invalidates.  */
7953 
7954   team->t.t_construct = 0; /* no single directives seen yet */
7955   team->t.t_ordered.dt.t_value =
7956       0; /* thread 0 enters the ordered section first */
7957 
7958   /* Reset the identifiers on the dispatch buffer */
7959   KMP_DEBUG_ASSERT(team->t.t_disp_buffer);
7960   if (team->t.t_max_nproc > 1) {
7961     int i;
7962     for (i = 0; i < __kmp_dispatch_num_buffers; ++i) {
7963       team->t.t_disp_buffer[i].buffer_index = i;
7964       team->t.t_disp_buffer[i].doacross_buf_idx = i;
7965     }
7966   } else {
7967     team->t.t_disp_buffer[0].buffer_index = 0;
7968     team->t.t_disp_buffer[0].doacross_buf_idx = 0;
7969   }
7970 
7971   KMP_MB(); /* Flush all pending memory write invalidates.  */
7972   KMP_ASSERT(this_thr->th.th_team == team);
7973 
7974 #ifdef KMP_DEBUG
7975   for (f = 0; f < team->t.t_nproc; f++) {
7976     KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
7977                      team->t.t_threads[f]->th.th_team_nproc == team->t.t_nproc);
7978   }
7979 #endif /* KMP_DEBUG */
7980 
7981   /* release the worker threads so they may begin working */
7982   __kmp_fork_barrier(gtid, 0);
7983 }
7984 
7985 void __kmp_internal_join(ident_t *id, int gtid, kmp_team_t *team) {
7986   kmp_info_t *this_thr = __kmp_threads[gtid];
7987 
7988   KMP_DEBUG_ASSERT(team);
7989   KMP_DEBUG_ASSERT(this_thr->th.th_team == team);
7990   KMP_ASSERT(KMP_MASTER_GTID(gtid));
7991   KMP_MB(); /* Flush all pending memory write invalidates.  */
7992 
7993   /* Join barrier after fork */
7994 
7995 #ifdef KMP_DEBUG
7996   if (__kmp_threads[gtid] &&
7997       __kmp_threads[gtid]->th.th_team_nproc != team->t.t_nproc) {
7998     __kmp_printf("GTID: %d, __kmp_threads[%d]=%p\n", gtid, gtid,
7999                  __kmp_threads[gtid]);
8000     __kmp_printf("__kmp_threads[%d]->th.th_team_nproc=%d, TEAM: %p, "
8001                  "team->t.t_nproc=%d\n",
8002                  gtid, __kmp_threads[gtid]->th.th_team_nproc, team,
8003                  team->t.t_nproc);
8004     __kmp_print_structure();
8005   }
8006   KMP_DEBUG_ASSERT(__kmp_threads[gtid] &&
8007                    __kmp_threads[gtid]->th.th_team_nproc == team->t.t_nproc);
8008 #endif /* KMP_DEBUG */
8009 
8010   __kmp_join_barrier(gtid); /* wait for everyone */
8011 #if OMPT_SUPPORT
8012   if (ompt_enabled.enabled &&
8013       this_thr->th.ompt_thread_info.state == ompt_state_wait_barrier_implicit) {
8014     int ds_tid = this_thr->th.th_info.ds.ds_tid;
8015     ompt_data_t *task_data = OMPT_CUR_TASK_DATA(this_thr);
8016     this_thr->th.ompt_thread_info.state = ompt_state_overhead;
8017 #if OMPT_OPTIONAL
8018     void *codeptr = NULL;
8019     if (KMP_MASTER_TID(ds_tid) &&
8020         (ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) ||
8021          ompt_callbacks.ompt_callback(ompt_callback_sync_region)))
8022       codeptr = OMPT_CUR_TEAM_INFO(this_thr)->master_return_address;
8023 
8024     if (ompt_enabled.ompt_callback_sync_region_wait) {
8025       ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
8026           ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data,
8027           codeptr);
8028     }
8029     if (ompt_enabled.ompt_callback_sync_region) {
8030       ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
8031           ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data,
8032           codeptr);
8033     }
8034 #endif
8035     if (!KMP_MASTER_TID(ds_tid) && ompt_enabled.ompt_callback_implicit_task) {
8036       ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
8037           ompt_scope_end, NULL, task_data, 0, ds_tid,
8038           ompt_task_implicit); // TODO: Can this be ompt_task_initial?
8039     }
8040   }
8041 #endif
8042 
8043   KMP_MB(); /* Flush all pending memory write invalidates.  */
8044   KMP_ASSERT(this_thr->th.th_team == team);
8045 }
8046 
8047 /* ------------------------------------------------------------------------ */
8048 
8049 #ifdef USE_LOAD_BALANCE
8050 
8051 // Return the worker threads actively spinning in the hot team, if we
8052 // are at the outermost level of parallelism.  Otherwise, return 0.
8053 static int __kmp_active_hot_team_nproc(kmp_root_t *root) {
8054   int i;
8055   int retval;
8056   kmp_team_t *hot_team;
8057 
8058   if (root->r.r_active) {
8059     return 0;
8060   }
8061   hot_team = root->r.r_hot_team;
8062   if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME) {
8063     return hot_team->t.t_nproc - 1; // Don't count primary thread
8064   }
8065 
8066   // Skip the primary thread - it is accounted for elsewhere.
8067   retval = 0;
8068   for (i = 1; i < hot_team->t.t_nproc; i++) {
8069     if (hot_team->t.t_threads[i]->th.th_active) {
8070       retval++;
8071     }
8072   }
8073   return retval;
8074 }
8075 
8076 // Perform an automatic adjustment to the number of
8077 // threads used by the next parallel region.
8078 static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc) {
8079   int retval;
8080   int pool_active;
8081   int hot_team_active;
8082   int team_curr_active;
8083   int system_active;
8084 
8085   KB_TRACE(20, ("__kmp_load_balance_nproc: called root:%p set_nproc:%d\n", root,
8086                 set_nproc));
8087   KMP_DEBUG_ASSERT(root);
8088   KMP_DEBUG_ASSERT(root->r.r_root_team->t.t_threads[0]
8089                        ->th.th_current_task->td_icvs.dynamic == TRUE);
8090   KMP_DEBUG_ASSERT(set_nproc > 1);
8091 
8092   if (set_nproc == 1) {
8093     KB_TRACE(20, ("__kmp_load_balance_nproc: serial execution.\n"));
8094     return 1;
8095   }
8096 
8097   // Threads that are active in the thread pool, active in the hot team for this
8098   // particular root (if we are at the outer par level), and the currently
8099   // executing thread (to become the primary thread) are available to add to the
8100   // new team, but are currently contributing to the system load, and must be
8101   // accounted for.
8102   pool_active = __kmp_thread_pool_active_nth;
8103   hot_team_active = __kmp_active_hot_team_nproc(root);
8104   team_curr_active = pool_active + hot_team_active + 1;
8105 
8106   // Check the system load.
8107   system_active = __kmp_get_load_balance(__kmp_avail_proc + team_curr_active);
8108   KB_TRACE(30, ("__kmp_load_balance_nproc: system active = %d pool active = %d "
8109                 "hot team active = %d\n",
8110                 system_active, pool_active, hot_team_active));
8111 
8112   if (system_active < 0) {
8113     // There was an error reading the necessary info from /proc, so use the
8114     // thread limit algorithm instead. Once we set __kmp_global.g.g_dynamic_mode
8115     // = dynamic_thread_limit, we shouldn't wind up getting back here.
8116     __kmp_global.g.g_dynamic_mode = dynamic_thread_limit;
8117     KMP_WARNING(CantLoadBalUsing, "KMP_DYNAMIC_MODE=thread limit");
8118 
8119     // Make this call behave like the thread limit algorithm.
8120     retval = __kmp_avail_proc - __kmp_nth +
8121              (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
8122     if (retval > set_nproc) {
8123       retval = set_nproc;
8124     }
8125     if (retval < KMP_MIN_NTH) {
8126       retval = KMP_MIN_NTH;
8127     }
8128 
8129     KB_TRACE(20, ("__kmp_load_balance_nproc: thread limit exit. retval:%d\n",
8130                   retval));
8131     return retval;
8132   }
8133 
8134   // There is a slight delay in the load balance algorithm in detecting new
8135   // running procs. The real system load at this instant should be at least as
8136   // large as the #active omp thread that are available to add to the team.
8137   if (system_active < team_curr_active) {
8138     system_active = team_curr_active;
8139   }
8140   retval = __kmp_avail_proc - system_active + team_curr_active;
8141   if (retval > set_nproc) {
8142     retval = set_nproc;
8143   }
8144   if (retval < KMP_MIN_NTH) {
8145     retval = KMP_MIN_NTH;
8146   }
8147 
8148   KB_TRACE(20, ("__kmp_load_balance_nproc: exit. retval:%d\n", retval));
8149   return retval;
8150 } // __kmp_load_balance_nproc()
8151 
8152 #endif /* USE_LOAD_BALANCE */
8153 
8154 /* ------------------------------------------------------------------------ */
8155 
8156 /* NOTE: this is called with the __kmp_init_lock held */
8157 void __kmp_cleanup(void) {
8158   int f;
8159 
8160   KA_TRACE(10, ("__kmp_cleanup: enter\n"));
8161 
8162   if (TCR_4(__kmp_init_parallel)) {
8163 #if KMP_HANDLE_SIGNALS
8164     __kmp_remove_signals();
8165 #endif
8166     TCW_4(__kmp_init_parallel, FALSE);
8167   }
8168 
8169   if (TCR_4(__kmp_init_middle)) {
8170 #if KMP_AFFINITY_SUPPORTED
8171     __kmp_affinity_uninitialize();
8172 #endif /* KMP_AFFINITY_SUPPORTED */
8173     __kmp_cleanup_hierarchy();
8174     TCW_4(__kmp_init_middle, FALSE);
8175   }
8176 
8177   KA_TRACE(10, ("__kmp_cleanup: go serial cleanup\n"));
8178 
8179   if (__kmp_init_serial) {
8180     __kmp_runtime_destroy();
8181     __kmp_init_serial = FALSE;
8182   }
8183 
8184   __kmp_cleanup_threadprivate_caches();
8185 
8186   for (f = 0; f < __kmp_threads_capacity; f++) {
8187     if (__kmp_root[f] != NULL) {
8188       __kmp_free(__kmp_root[f]);
8189       __kmp_root[f] = NULL;
8190     }
8191   }
8192   __kmp_free(__kmp_threads);
8193   // __kmp_threads and __kmp_root were allocated at once, as single block, so
8194   // there is no need in freeing __kmp_root.
8195   __kmp_threads = NULL;
8196   __kmp_root = NULL;
8197   __kmp_threads_capacity = 0;
8198 
8199   // Free old __kmp_threads arrays if they exist.
8200   kmp_old_threads_list_t *ptr = __kmp_old_threads_list;
8201   while (ptr) {
8202     kmp_old_threads_list_t *next = ptr->next;
8203     __kmp_free(ptr->threads);
8204     __kmp_free(ptr);
8205     ptr = next;
8206   }
8207 
8208 #if KMP_USE_DYNAMIC_LOCK
8209   __kmp_cleanup_indirect_user_locks();
8210 #else
8211   __kmp_cleanup_user_locks();
8212 #endif
8213 #if OMPD_SUPPORT
8214   if (ompd_state) {
8215     __kmp_free(ompd_env_block);
8216     ompd_env_block = NULL;
8217     ompd_env_block_size = 0;
8218   }
8219 #endif
8220 
8221 #if KMP_AFFINITY_SUPPORTED
8222   KMP_INTERNAL_FREE(CCAST(char *, __kmp_cpuinfo_file));
8223   __kmp_cpuinfo_file = NULL;
8224 #endif /* KMP_AFFINITY_SUPPORTED */
8225 
8226 #if KMP_USE_ADAPTIVE_LOCKS
8227 #if KMP_DEBUG_ADAPTIVE_LOCKS
8228   __kmp_print_speculative_stats();
8229 #endif
8230 #endif
8231   KMP_INTERNAL_FREE(__kmp_nested_nth.nth);
8232   __kmp_nested_nth.nth = NULL;
8233   __kmp_nested_nth.size = 0;
8234   __kmp_nested_nth.used = 0;
8235   KMP_INTERNAL_FREE(__kmp_nested_proc_bind.bind_types);
8236   __kmp_nested_proc_bind.bind_types = NULL;
8237   __kmp_nested_proc_bind.size = 0;
8238   __kmp_nested_proc_bind.used = 0;
8239   if (__kmp_affinity_format) {
8240     KMP_INTERNAL_FREE(__kmp_affinity_format);
8241     __kmp_affinity_format = NULL;
8242   }
8243 
8244   __kmp_i18n_catclose();
8245 
8246 #if KMP_USE_HIER_SCHED
8247   __kmp_hier_scheds.deallocate();
8248 #endif
8249 
8250 #if KMP_STATS_ENABLED
8251   __kmp_stats_fini();
8252 #endif
8253 
8254   KA_TRACE(10, ("__kmp_cleanup: exit\n"));
8255 }
8256 
8257 /* ------------------------------------------------------------------------ */
8258 
8259 int __kmp_ignore_mppbeg(void) {
8260   char *env;
8261 
8262   if ((env = getenv("KMP_IGNORE_MPPBEG")) != NULL) {
8263     if (__kmp_str_match_false(env))
8264       return FALSE;
8265   }
8266   // By default __kmpc_begin() is no-op.
8267   return TRUE;
8268 }
8269 
8270 int __kmp_ignore_mppend(void) {
8271   char *env;
8272 
8273   if ((env = getenv("KMP_IGNORE_MPPEND")) != NULL) {
8274     if (__kmp_str_match_false(env))
8275       return FALSE;
8276   }
8277   // By default __kmpc_end() is no-op.
8278   return TRUE;
8279 }
8280 
8281 void __kmp_internal_begin(void) {
8282   int gtid;
8283   kmp_root_t *root;
8284 
8285   /* this is a very important step as it will register new sibling threads
8286      and assign these new uber threads a new gtid */
8287   gtid = __kmp_entry_gtid();
8288   root = __kmp_threads[gtid]->th.th_root;
8289   KMP_ASSERT(KMP_UBER_GTID(gtid));
8290 
8291   if (root->r.r_begin)
8292     return;
8293   __kmp_acquire_lock(&root->r.r_begin_lock, gtid);
8294   if (root->r.r_begin) {
8295     __kmp_release_lock(&root->r.r_begin_lock, gtid);
8296     return;
8297   }
8298 
8299   root->r.r_begin = TRUE;
8300 
8301   __kmp_release_lock(&root->r.r_begin_lock, gtid);
8302 }
8303 
8304 /* ------------------------------------------------------------------------ */
8305 
8306 void __kmp_user_set_library(enum library_type arg) {
8307   int gtid;
8308   kmp_root_t *root;
8309   kmp_info_t *thread;
8310 
8311   /* first, make sure we are initialized so we can get our gtid */
8312 
8313   gtid = __kmp_entry_gtid();
8314   thread = __kmp_threads[gtid];
8315 
8316   root = thread->th.th_root;
8317 
8318   KA_TRACE(20, ("__kmp_user_set_library: enter T#%d, arg: %d, %d\n", gtid, arg,
8319                 library_serial));
8320   if (root->r.r_in_parallel) { /* Must be called in serial section of top-level
8321                                   thread */
8322     KMP_WARNING(SetLibraryIncorrectCall);
8323     return;
8324   }
8325 
8326   switch (arg) {
8327   case library_serial:
8328     thread->th.th_set_nproc = 0;
8329     set__nproc(thread, 1);
8330     break;
8331   case library_turnaround:
8332     thread->th.th_set_nproc = 0;
8333     set__nproc(thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth
8334                                            : __kmp_dflt_team_nth_ub);
8335     break;
8336   case library_throughput:
8337     thread->th.th_set_nproc = 0;
8338     set__nproc(thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth
8339                                            : __kmp_dflt_team_nth_ub);
8340     break;
8341   default:
8342     KMP_FATAL(UnknownLibraryType, arg);
8343   }
8344 
8345   __kmp_aux_set_library(arg);
8346 }
8347 
8348 void __kmp_aux_set_stacksize(size_t arg) {
8349   if (!__kmp_init_serial)
8350     __kmp_serial_initialize();
8351 
8352 #if KMP_OS_DARWIN
8353   if (arg & (0x1000 - 1)) {
8354     arg &= ~(0x1000 - 1);
8355     if (arg + 0x1000) /* check for overflow if we round up */
8356       arg += 0x1000;
8357   }
8358 #endif
8359   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
8360 
8361   /* only change the default stacksize before the first parallel region */
8362   if (!TCR_4(__kmp_init_parallel)) {
8363     size_t value = arg; /* argument is in bytes */
8364 
8365     if (value < __kmp_sys_min_stksize)
8366       value = __kmp_sys_min_stksize;
8367     else if (value > KMP_MAX_STKSIZE)
8368       value = KMP_MAX_STKSIZE;
8369 
8370     __kmp_stksize = value;
8371 
8372     __kmp_env_stksize = TRUE; /* was KMP_STACKSIZE specified? */
8373   }
8374 
8375   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
8376 }
8377 
8378 /* set the behaviour of the runtime library */
8379 /* TODO this can cause some odd behaviour with sibling parallelism... */
8380 void __kmp_aux_set_library(enum library_type arg) {
8381   __kmp_library = arg;
8382 
8383   switch (__kmp_library) {
8384   case library_serial: {
8385     KMP_INFORM(LibraryIsSerial);
8386   } break;
8387   case library_turnaround:
8388     if (__kmp_use_yield == 1 && !__kmp_use_yield_exp_set)
8389       __kmp_use_yield = 2; // only yield when oversubscribed
8390     break;
8391   case library_throughput:
8392     if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME)
8393       __kmp_dflt_blocktime = KMP_DEFAULT_BLOCKTIME;
8394     break;
8395   default:
8396     KMP_FATAL(UnknownLibraryType, arg);
8397   }
8398 }
8399 
8400 /* Getting team information common for all team API */
8401 // Returns NULL if not in teams construct
8402 static kmp_team_t *__kmp_aux_get_team_info(int &teams_serialized) {
8403   kmp_info_t *thr = __kmp_entry_thread();
8404   teams_serialized = 0;
8405   if (thr->th.th_teams_microtask) {
8406     kmp_team_t *team = thr->th.th_team;
8407     int tlevel = thr->th.th_teams_level; // the level of the teams construct
8408     int ii = team->t.t_level;
8409     teams_serialized = team->t.t_serialized;
8410     int level = tlevel + 1;
8411     KMP_DEBUG_ASSERT(ii >= tlevel);
8412     while (ii > level) {
8413       for (teams_serialized = team->t.t_serialized;
8414            (teams_serialized > 0) && (ii > level); teams_serialized--, ii--) {
8415       }
8416       if (team->t.t_serialized && (!teams_serialized)) {
8417         team = team->t.t_parent;
8418         continue;
8419       }
8420       if (ii > level) {
8421         team = team->t.t_parent;
8422         ii--;
8423       }
8424     }
8425     return team;
8426   }
8427   return NULL;
8428 }
8429 
8430 int __kmp_aux_get_team_num() {
8431   int serialized;
8432   kmp_team_t *team = __kmp_aux_get_team_info(serialized);
8433   if (team) {
8434     if (serialized > 1) {
8435       return 0; // teams region is serialized ( 1 team of 1 thread ).
8436     } else {
8437       return team->t.t_master_tid;
8438     }
8439   }
8440   return 0;
8441 }
8442 
8443 int __kmp_aux_get_num_teams() {
8444   int serialized;
8445   kmp_team_t *team = __kmp_aux_get_team_info(serialized);
8446   if (team) {
8447     if (serialized > 1) {
8448       return 1;
8449     } else {
8450       return team->t.t_parent->t.t_nproc;
8451     }
8452   }
8453   return 1;
8454 }
8455 
8456 /* ------------------------------------------------------------------------ */
8457 
8458 /*
8459  * Affinity Format Parser
8460  *
8461  * Field is in form of: %[[[0].]size]type
8462  * % and type are required (%% means print a literal '%')
8463  * type is either single char or long name surrounded by {},
8464  * e.g., N or {num_threads}
8465  * 0 => leading zeros
8466  * . => right justified when size is specified
8467  * by default output is left justified
8468  * size is the *minimum* field length
8469  * All other characters are printed as is
8470  *
8471  * Available field types:
8472  * L {thread_level}      - omp_get_level()
8473  * n {thread_num}        - omp_get_thread_num()
8474  * h {host}              - name of host machine
8475  * P {process_id}        - process id (integer)
8476  * T {thread_identifier} - native thread identifier (integer)
8477  * N {num_threads}       - omp_get_num_threads()
8478  * A {ancestor_tnum}     - omp_get_ancestor_thread_num(omp_get_level()-1)
8479  * a {thread_affinity}   - comma separated list of integers or integer ranges
8480  *                         (values of affinity mask)
8481  *
8482  * Implementation-specific field types can be added
8483  * If a type is unknown, print "undefined"
8484  */
8485 
8486 // Structure holding the short name, long name, and corresponding data type
8487 // for snprintf.  A table of these will represent the entire valid keyword
8488 // field types.
8489 typedef struct kmp_affinity_format_field_t {
8490   char short_name; // from spec e.g., L -> thread level
8491   const char *long_name; // from spec thread_level -> thread level
8492   char field_format; // data type for snprintf (typically 'd' or 's'
8493   // for integer or string)
8494 } kmp_affinity_format_field_t;
8495 
8496 static const kmp_affinity_format_field_t __kmp_affinity_format_table[] = {
8497 #if KMP_AFFINITY_SUPPORTED
8498     {'A', "thread_affinity", 's'},
8499 #endif
8500     {'t', "team_num", 'd'},
8501     {'T', "num_teams", 'd'},
8502     {'L', "nesting_level", 'd'},
8503     {'n', "thread_num", 'd'},
8504     {'N', "num_threads", 'd'},
8505     {'a', "ancestor_tnum", 'd'},
8506     {'H', "host", 's'},
8507     {'P', "process_id", 'd'},
8508     {'i', "native_thread_id", 'd'}};
8509 
8510 // Return the number of characters it takes to hold field
8511 static int __kmp_aux_capture_affinity_field(int gtid, const kmp_info_t *th,
8512                                             const char **ptr,
8513                                             kmp_str_buf_t *field_buffer) {
8514   int rc, format_index, field_value;
8515   const char *width_left, *width_right;
8516   bool pad_zeros, right_justify, parse_long_name, found_valid_name;
8517   static const int FORMAT_SIZE = 20;
8518   char format[FORMAT_SIZE] = {0};
8519   char absolute_short_name = 0;
8520 
8521   KMP_DEBUG_ASSERT(gtid >= 0);
8522   KMP_DEBUG_ASSERT(th);
8523   KMP_DEBUG_ASSERT(**ptr == '%');
8524   KMP_DEBUG_ASSERT(field_buffer);
8525 
8526   __kmp_str_buf_clear(field_buffer);
8527 
8528   // Skip the initial %
8529   (*ptr)++;
8530 
8531   // Check for %% first
8532   if (**ptr == '%') {
8533     __kmp_str_buf_cat(field_buffer, "%", 1);
8534     (*ptr)++; // skip over the second %
8535     return 1;
8536   }
8537 
8538   // Parse field modifiers if they are present
8539   pad_zeros = false;
8540   if (**ptr == '0') {
8541     pad_zeros = true;
8542     (*ptr)++; // skip over 0
8543   }
8544   right_justify = false;
8545   if (**ptr == '.') {
8546     right_justify = true;
8547     (*ptr)++; // skip over .
8548   }
8549   // Parse width of field: [width_left, width_right)
8550   width_left = width_right = NULL;
8551   if (**ptr >= '0' && **ptr <= '9') {
8552     width_left = *ptr;
8553     SKIP_DIGITS(*ptr);
8554     width_right = *ptr;
8555   }
8556 
8557   // Create the format for KMP_SNPRINTF based on flags parsed above
8558   format_index = 0;
8559   format[format_index++] = '%';
8560   if (!right_justify)
8561     format[format_index++] = '-';
8562   if (pad_zeros)
8563     format[format_index++] = '0';
8564   if (width_left && width_right) {
8565     int i = 0;
8566     // Only allow 8 digit number widths.
8567     // This also prevents overflowing format variable
8568     while (i < 8 && width_left < width_right) {
8569       format[format_index++] = *width_left;
8570       width_left++;
8571       i++;
8572     }
8573   }
8574 
8575   // Parse a name (long or short)
8576   // Canonicalize the name into absolute_short_name
8577   found_valid_name = false;
8578   parse_long_name = (**ptr == '{');
8579   if (parse_long_name)
8580     (*ptr)++; // skip initial left brace
8581   for (size_t i = 0; i < sizeof(__kmp_affinity_format_table) /
8582                              sizeof(__kmp_affinity_format_table[0]);
8583        ++i) {
8584     char short_name = __kmp_affinity_format_table[i].short_name;
8585     const char *long_name = __kmp_affinity_format_table[i].long_name;
8586     char field_format = __kmp_affinity_format_table[i].field_format;
8587     if (parse_long_name) {
8588       size_t length = KMP_STRLEN(long_name);
8589       if (strncmp(*ptr, long_name, length) == 0) {
8590         found_valid_name = true;
8591         (*ptr) += length; // skip the long name
8592       }
8593     } else if (**ptr == short_name) {
8594       found_valid_name = true;
8595       (*ptr)++; // skip the short name
8596     }
8597     if (found_valid_name) {
8598       format[format_index++] = field_format;
8599       format[format_index++] = '\0';
8600       absolute_short_name = short_name;
8601       break;
8602     }
8603   }
8604   if (parse_long_name) {
8605     if (**ptr != '}') {
8606       absolute_short_name = 0;
8607     } else {
8608       (*ptr)++; // skip over the right brace
8609     }
8610   }
8611 
8612   // Attempt to fill the buffer with the requested
8613   // value using snprintf within __kmp_str_buf_print()
8614   switch (absolute_short_name) {
8615   case 't':
8616     rc = __kmp_str_buf_print(field_buffer, format, __kmp_aux_get_team_num());
8617     break;
8618   case 'T':
8619     rc = __kmp_str_buf_print(field_buffer, format, __kmp_aux_get_num_teams());
8620     break;
8621   case 'L':
8622     rc = __kmp_str_buf_print(field_buffer, format, th->th.th_team->t.t_level);
8623     break;
8624   case 'n':
8625     rc = __kmp_str_buf_print(field_buffer, format, __kmp_tid_from_gtid(gtid));
8626     break;
8627   case 'H': {
8628     static const int BUFFER_SIZE = 256;
8629     char buf[BUFFER_SIZE];
8630     __kmp_expand_host_name(buf, BUFFER_SIZE);
8631     rc = __kmp_str_buf_print(field_buffer, format, buf);
8632   } break;
8633   case 'P':
8634     rc = __kmp_str_buf_print(field_buffer, format, getpid());
8635     break;
8636   case 'i':
8637     rc = __kmp_str_buf_print(field_buffer, format, __kmp_gettid());
8638     break;
8639   case 'N':
8640     rc = __kmp_str_buf_print(field_buffer, format, th->th.th_team->t.t_nproc);
8641     break;
8642   case 'a':
8643     field_value =
8644         __kmp_get_ancestor_thread_num(gtid, th->th.th_team->t.t_level - 1);
8645     rc = __kmp_str_buf_print(field_buffer, format, field_value);
8646     break;
8647 #if KMP_AFFINITY_SUPPORTED
8648   case 'A': {
8649     kmp_str_buf_t buf;
8650     __kmp_str_buf_init(&buf);
8651     __kmp_affinity_str_buf_mask(&buf, th->th.th_affin_mask);
8652     rc = __kmp_str_buf_print(field_buffer, format, buf.str);
8653     __kmp_str_buf_free(&buf);
8654   } break;
8655 #endif
8656   default:
8657     // According to spec, If an implementation does not have info for field
8658     // type, then "undefined" is printed
8659     rc = __kmp_str_buf_print(field_buffer, "%s", "undefined");
8660     // Skip the field
8661     if (parse_long_name) {
8662       SKIP_TOKEN(*ptr);
8663       if (**ptr == '}')
8664         (*ptr)++;
8665     } else {
8666       (*ptr)++;
8667     }
8668   }
8669 
8670   KMP_ASSERT(format_index <= FORMAT_SIZE);
8671   return rc;
8672 }
8673 
8674 /*
8675  * Return number of characters needed to hold the affinity string
8676  * (not including null byte character)
8677  * The resultant string is printed to buffer, which the caller can then
8678  * handle afterwards
8679  */
8680 size_t __kmp_aux_capture_affinity(int gtid, const char *format,
8681                                   kmp_str_buf_t *buffer) {
8682   const char *parse_ptr;
8683   size_t retval;
8684   const kmp_info_t *th;
8685   kmp_str_buf_t field;
8686 
8687   KMP_DEBUG_ASSERT(buffer);
8688   KMP_DEBUG_ASSERT(gtid >= 0);
8689 
8690   __kmp_str_buf_init(&field);
8691   __kmp_str_buf_clear(buffer);
8692 
8693   th = __kmp_threads[gtid];
8694   retval = 0;
8695 
8696   // If format is NULL or zero-length string, then we use
8697   // affinity-format-var ICV
8698   parse_ptr = format;
8699   if (parse_ptr == NULL || *parse_ptr == '\0') {
8700     parse_ptr = __kmp_affinity_format;
8701   }
8702   KMP_DEBUG_ASSERT(parse_ptr);
8703 
8704   while (*parse_ptr != '\0') {
8705     // Parse a field
8706     if (*parse_ptr == '%') {
8707       // Put field in the buffer
8708       int rc = __kmp_aux_capture_affinity_field(gtid, th, &parse_ptr, &field);
8709       __kmp_str_buf_catbuf(buffer, &field);
8710       retval += rc;
8711     } else {
8712       // Put literal character in buffer
8713       __kmp_str_buf_cat(buffer, parse_ptr, 1);
8714       retval++;
8715       parse_ptr++;
8716     }
8717   }
8718   __kmp_str_buf_free(&field);
8719   return retval;
8720 }
8721 
8722 // Displays the affinity string to stdout
8723 void __kmp_aux_display_affinity(int gtid, const char *format) {
8724   kmp_str_buf_t buf;
8725   __kmp_str_buf_init(&buf);
8726   __kmp_aux_capture_affinity(gtid, format, &buf);
8727   __kmp_fprintf(kmp_out, "%s" KMP_END_OF_LINE, buf.str);
8728   __kmp_str_buf_free(&buf);
8729 }
8730 
8731 /* ------------------------------------------------------------------------ */
8732 
8733 void __kmp_aux_set_blocktime(int arg, kmp_info_t *thread, int tid) {
8734   int blocktime = arg; /* argument is in milliseconds */
8735 #if KMP_USE_MONITOR
8736   int bt_intervals;
8737 #endif
8738   kmp_int8 bt_set;
8739 
8740   __kmp_save_internal_controls(thread);
8741 
8742   /* Normalize and set blocktime for the teams */
8743   if (blocktime < KMP_MIN_BLOCKTIME)
8744     blocktime = KMP_MIN_BLOCKTIME;
8745   else if (blocktime > KMP_MAX_BLOCKTIME)
8746     blocktime = KMP_MAX_BLOCKTIME;
8747 
8748   set__blocktime_team(thread->th.th_team, tid, blocktime);
8749   set__blocktime_team(thread->th.th_serial_team, 0, blocktime);
8750 
8751 #if KMP_USE_MONITOR
8752   /* Calculate and set blocktime intervals for the teams */
8753   bt_intervals = KMP_INTERVALS_FROM_BLOCKTIME(blocktime, __kmp_monitor_wakeups);
8754 
8755   set__bt_intervals_team(thread->th.th_team, tid, bt_intervals);
8756   set__bt_intervals_team(thread->th.th_serial_team, 0, bt_intervals);
8757 #endif
8758 
8759   /* Set whether blocktime has been set to "TRUE" */
8760   bt_set = TRUE;
8761 
8762   set__bt_set_team(thread->th.th_team, tid, bt_set);
8763   set__bt_set_team(thread->th.th_serial_team, 0, bt_set);
8764 #if KMP_USE_MONITOR
8765   KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d, "
8766                 "bt_intervals=%d, monitor_updates=%d\n",
8767                 __kmp_gtid_from_tid(tid, thread->th.th_team),
8768                 thread->th.th_team->t.t_id, tid, blocktime, bt_intervals,
8769                 __kmp_monitor_wakeups));
8770 #else
8771   KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d\n",
8772                 __kmp_gtid_from_tid(tid, thread->th.th_team),
8773                 thread->th.th_team->t.t_id, tid, blocktime));
8774 #endif
8775 }
8776 
8777 void __kmp_aux_set_defaults(char const *str, size_t len) {
8778   if (!__kmp_init_serial) {
8779     __kmp_serial_initialize();
8780   }
8781   __kmp_env_initialize(str);
8782 
8783   if (__kmp_settings || __kmp_display_env || __kmp_display_env_verbose) {
8784     __kmp_env_print();
8785   }
8786 } // __kmp_aux_set_defaults
8787 
8788 /* ------------------------------------------------------------------------ */
8789 /* internal fast reduction routines */
8790 
8791 PACKED_REDUCTION_METHOD_T
8792 __kmp_determine_reduction_method(
8793     ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars, size_t reduce_size,
8794     void *reduce_data, void (*reduce_func)(void *lhs_data, void *rhs_data),
8795     kmp_critical_name *lck) {
8796 
8797   // Default reduction method: critical construct ( lck != NULL, like in current
8798   // PAROPT )
8799   // If ( reduce_data!=NULL && reduce_func!=NULL ): the tree-reduction method
8800   // can be selected by RTL
8801   // If loc->flags contains KMP_IDENT_ATOMIC_REDUCE, the atomic reduce method
8802   // can be selected by RTL
8803   // Finally, it's up to OpenMP RTL to make a decision on which method to select
8804   // among generated by PAROPT.
8805 
8806   PACKED_REDUCTION_METHOD_T retval;
8807 
8808   int team_size;
8809 
8810   KMP_DEBUG_ASSERT(lck); // it would be nice to test ( lck != 0 )
8811 
8812 #define FAST_REDUCTION_ATOMIC_METHOD_GENERATED                                 \
8813   (loc &&                                                                      \
8814    ((loc->flags & (KMP_IDENT_ATOMIC_REDUCE)) == (KMP_IDENT_ATOMIC_REDUCE)))
8815 #define FAST_REDUCTION_TREE_METHOD_GENERATED ((reduce_data) && (reduce_func))
8816 
8817   retval = critical_reduce_block;
8818 
8819   // another choice of getting a team size (with 1 dynamic deference) is slower
8820   team_size = __kmp_get_team_num_threads(global_tid);
8821   if (team_size == 1) {
8822 
8823     retval = empty_reduce_block;
8824 
8825   } else {
8826 
8827     int atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;
8828 
8829 #if KMP_ARCH_X86_64 || KMP_ARCH_PPC64 || KMP_ARCH_AARCH64 ||                   \
8830     KMP_ARCH_MIPS64 || KMP_ARCH_RISCV64 || KMP_ARCH_LOONGARCH64
8831 
8832 #if KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD ||     \
8833     KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN || KMP_OS_HURD
8834 
8835     int teamsize_cutoff = 4;
8836 
8837 #if KMP_MIC_SUPPORTED
8838     if (__kmp_mic_type != non_mic) {
8839       teamsize_cutoff = 8;
8840     }
8841 #endif
8842     int tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
8843     if (tree_available) {
8844       if (team_size <= teamsize_cutoff) {
8845         if (atomic_available) {
8846           retval = atomic_reduce_block;
8847         }
8848       } else {
8849         retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER;
8850       }
8851     } else if (atomic_available) {
8852       retval = atomic_reduce_block;
8853     }
8854 #else
8855 #error "Unknown or unsupported OS"
8856 #endif // KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD ||
8857        // KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN || KMP_OS_HURD
8858 
8859 #elif KMP_ARCH_X86 || KMP_ARCH_ARM || KMP_ARCH_AARCH || KMP_ARCH_MIPS
8860 
8861 #if KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD ||     \
8862     KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_HURD
8863 
8864     // basic tuning
8865 
8866     if (atomic_available) {
8867       if (num_vars <= 2) { // && ( team_size <= 8 ) due to false-sharing ???
8868         retval = atomic_reduce_block;
8869       }
8870     } // otherwise: use critical section
8871 
8872 #elif KMP_OS_DARWIN
8873 
8874     int tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
8875     if (atomic_available && (num_vars <= 3)) {
8876       retval = atomic_reduce_block;
8877     } else if (tree_available) {
8878       if ((reduce_size > (9 * sizeof(kmp_real64))) &&
8879           (reduce_size < (2000 * sizeof(kmp_real64)))) {
8880         retval = TREE_REDUCE_BLOCK_WITH_PLAIN_BARRIER;
8881       }
8882     } // otherwise: use critical section
8883 
8884 #else
8885 #error "Unknown or unsupported OS"
8886 #endif
8887 
8888 #else
8889 #error "Unknown or unsupported architecture"
8890 #endif
8891   }
8892 
8893   // KMP_FORCE_REDUCTION
8894 
8895   // If the team is serialized (team_size == 1), ignore the forced reduction
8896   // method and stay with the unsynchronized method (empty_reduce_block)
8897   if (__kmp_force_reduction_method != reduction_method_not_defined &&
8898       team_size != 1) {
8899 
8900     PACKED_REDUCTION_METHOD_T forced_retval = critical_reduce_block;
8901 
8902     int atomic_available, tree_available;
8903 
8904     switch ((forced_retval = __kmp_force_reduction_method)) {
8905     case critical_reduce_block:
8906       KMP_ASSERT(lck); // lck should be != 0
8907       break;
8908 
8909     case atomic_reduce_block:
8910       atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;
8911       if (!atomic_available) {
8912         KMP_WARNING(RedMethodNotSupported, "atomic");
8913         forced_retval = critical_reduce_block;
8914       }
8915       break;
8916 
8917     case tree_reduce_block:
8918       tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
8919       if (!tree_available) {
8920         KMP_WARNING(RedMethodNotSupported, "tree");
8921         forced_retval = critical_reduce_block;
8922       } else {
8923 #if KMP_FAST_REDUCTION_BARRIER
8924         forced_retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER;
8925 #endif
8926       }
8927       break;
8928 
8929     default:
8930       KMP_ASSERT(0); // "unsupported method specified"
8931     }
8932 
8933     retval = forced_retval;
8934   }
8935 
8936   KA_TRACE(10, ("reduction method selected=%08x\n", retval));
8937 
8938 #undef FAST_REDUCTION_TREE_METHOD_GENERATED
8939 #undef FAST_REDUCTION_ATOMIC_METHOD_GENERATED
8940 
8941   return (retval);
8942 }
8943 // this function is for testing set/get/determine reduce method
8944 kmp_int32 __kmp_get_reduce_method(void) {
8945   return ((__kmp_entry_thread()->th.th_local.packed_reduction_method) >> 8);
8946 }
8947 
8948 // Soft pause sets up threads to ignore blocktime and just go to sleep.
8949 // Spin-wait code checks __kmp_pause_status and reacts accordingly.
8950 void __kmp_soft_pause() { __kmp_pause_status = kmp_soft_paused; }
8951 
8952 // Hard pause shuts down the runtime completely.  Resume happens naturally when
8953 // OpenMP is used subsequently.
8954 void __kmp_hard_pause() {
8955   __kmp_pause_status = kmp_hard_paused;
8956   __kmp_internal_end_thread(-1);
8957 }
8958 
8959 // Soft resume sets __kmp_pause_status, and wakes up all threads.
8960 void __kmp_resume_if_soft_paused() {
8961   if (__kmp_pause_status == kmp_soft_paused) {
8962     __kmp_pause_status = kmp_not_paused;
8963 
8964     for (int gtid = 1; gtid < __kmp_threads_capacity; ++gtid) {
8965       kmp_info_t *thread = __kmp_threads[gtid];
8966       if (thread) { // Wake it if sleeping
8967         kmp_flag_64<> fl(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go,
8968                          thread);
8969         if (fl.is_sleeping())
8970           fl.resume(gtid);
8971         else if (__kmp_try_suspend_mx(thread)) { // got suspend lock
8972           __kmp_unlock_suspend_mx(thread); // unlock it; it won't sleep
8973         } else { // thread holds the lock and may sleep soon
8974           do { // until either the thread sleeps, or we can get the lock
8975             if (fl.is_sleeping()) {
8976               fl.resume(gtid);
8977               break;
8978             } else if (__kmp_try_suspend_mx(thread)) {
8979               __kmp_unlock_suspend_mx(thread);
8980               break;
8981             }
8982           } while (1);
8983         }
8984       }
8985     }
8986   }
8987 }
8988 
8989 // This function is called via __kmpc_pause_resource. Returns 0 if successful.
8990 // TODO: add warning messages
8991 int __kmp_pause_resource(kmp_pause_status_t level) {
8992   if (level == kmp_not_paused) { // requesting resume
8993     if (__kmp_pause_status == kmp_not_paused) {
8994       // error message about runtime not being paused, so can't resume
8995       return 1;
8996     } else {
8997       KMP_DEBUG_ASSERT(__kmp_pause_status == kmp_soft_paused ||
8998                        __kmp_pause_status == kmp_hard_paused);
8999       __kmp_pause_status = kmp_not_paused;
9000       return 0;
9001     }
9002   } else if (level == kmp_soft_paused) { // requesting soft pause
9003     if (__kmp_pause_status != kmp_not_paused) {
9004       // error message about already being paused
9005       return 1;
9006     } else {
9007       __kmp_soft_pause();
9008       return 0;
9009     }
9010   } else if (level == kmp_hard_paused) { // requesting hard pause
9011     if (__kmp_pause_status != kmp_not_paused) {
9012       // error message about already being paused
9013       return 1;
9014     } else {
9015       __kmp_hard_pause();
9016       return 0;
9017     }
9018   } else {
9019     // error message about invalid level
9020     return 1;
9021   }
9022 }
9023 
9024 void __kmp_omp_display_env(int verbose) {
9025   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
9026   if (__kmp_init_serial == 0)
9027     __kmp_do_serial_initialize();
9028   __kmp_display_env_impl(!verbose, verbose);
9029   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
9030 }
9031 
9032 // The team size is changing, so distributed barrier must be modified
9033 void __kmp_resize_dist_barrier(kmp_team_t *team, int old_nthreads,
9034                                int new_nthreads) {
9035   KMP_DEBUG_ASSERT(__kmp_barrier_release_pattern[bs_forkjoin_barrier] ==
9036                    bp_dist_bar);
9037   kmp_info_t **other_threads = team->t.t_threads;
9038 
9039   // We want all the workers to stop waiting on the barrier while we adjust the
9040   // size of the team.
9041   for (int f = 1; f < old_nthreads; ++f) {
9042     KMP_DEBUG_ASSERT(other_threads[f] != NULL);
9043     // Ignore threads that are already inactive or not present in the team
9044     if (team->t.t_threads[f]->th.th_used_in_team.load() == 0) {
9045       // teams construct causes thread_limit to get passed in, and some of
9046       // those could be inactive; just ignore them
9047       continue;
9048     }
9049     // If thread is transitioning still to in_use state, wait for it
9050     if (team->t.t_threads[f]->th.th_used_in_team.load() == 3) {
9051       while (team->t.t_threads[f]->th.th_used_in_team.load() == 3)
9052         KMP_CPU_PAUSE();
9053     }
9054     // The thread should be in_use now
9055     KMP_DEBUG_ASSERT(team->t.t_threads[f]->th.th_used_in_team.load() == 1);
9056     // Transition to unused state
9057     team->t.t_threads[f]->th.th_used_in_team.store(2);
9058     KMP_DEBUG_ASSERT(team->t.t_threads[f]->th.th_used_in_team.load() == 2);
9059   }
9060   // Release all the workers
9061   team->t.b->go_release();
9062 
9063   KMP_MFENCE();
9064 
9065   // Workers should see transition status 2 and move to 0; but may need to be
9066   // woken up first
9067   int count = old_nthreads - 1;
9068   while (count > 0) {
9069     count = old_nthreads - 1;
9070     for (int f = 1; f < old_nthreads; ++f) {
9071       if (other_threads[f]->th.th_used_in_team.load() != 0) {
9072         if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) { // Wake up the workers
9073           kmp_atomic_flag_64<> *flag = (kmp_atomic_flag_64<> *)CCAST(
9074               void *, other_threads[f]->th.th_sleep_loc);
9075           __kmp_atomic_resume_64(other_threads[f]->th.th_info.ds.ds_gtid, flag);
9076         }
9077       } else {
9078         KMP_DEBUG_ASSERT(team->t.t_threads[f]->th.th_used_in_team.load() == 0);
9079         count--;
9080       }
9081     }
9082   }
9083   // Now update the barrier size
9084   team->t.b->update_num_threads(new_nthreads);
9085   team->t.b->go_reset();
9086 }
9087 
9088 void __kmp_add_threads_to_team(kmp_team_t *team, int new_nthreads) {
9089   // Add the threads back to the team
9090   KMP_DEBUG_ASSERT(team);
9091   // Threads were paused and pointed at th_used_in_team temporarily during a
9092   // resize of the team. We're going to set th_used_in_team to 3 to indicate to
9093   // the thread that it should transition itself back into the team. Then, if
9094   // blocktime isn't infinite, the thread could be sleeping, so we send a resume
9095   // to wake it up.
9096   for (int f = 1; f < new_nthreads; ++f) {
9097     KMP_DEBUG_ASSERT(team->t.t_threads[f]);
9098     KMP_COMPARE_AND_STORE_ACQ32(&(team->t.t_threads[f]->th.th_used_in_team), 0,
9099                                 3);
9100     if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) { // Wake up sleeping threads
9101       __kmp_resume_32(team->t.t_threads[f]->th.th_info.ds.ds_gtid,
9102                       (kmp_flag_32<false, false> *)NULL);
9103     }
9104   }
9105   // The threads should be transitioning to the team; when they are done, they
9106   // should have set th_used_in_team to 1. This loop forces master to wait until
9107   // all threads have moved into the team and are waiting in the barrier.
9108   int count = new_nthreads - 1;
9109   while (count > 0) {
9110     count = new_nthreads - 1;
9111     for (int f = 1; f < new_nthreads; ++f) {
9112       if (team->t.t_threads[f]->th.th_used_in_team.load() == 1) {
9113         count--;
9114       }
9115     }
9116   }
9117 }
9118 
9119 // Globals and functions for hidden helper task
9120 kmp_info_t **__kmp_hidden_helper_threads;
9121 kmp_info_t *__kmp_hidden_helper_main_thread;
9122 std::atomic<kmp_int32> __kmp_unexecuted_hidden_helper_tasks;
9123 #if KMP_OS_LINUX
9124 kmp_int32 __kmp_hidden_helper_threads_num = 8;
9125 kmp_int32 __kmp_enable_hidden_helper = TRUE;
9126 #else
9127 kmp_int32 __kmp_hidden_helper_threads_num = 0;
9128 kmp_int32 __kmp_enable_hidden_helper = FALSE;
9129 #endif
9130 
9131 namespace {
9132 std::atomic<kmp_int32> __kmp_hit_hidden_helper_threads_num;
9133 
9134 void __kmp_hidden_helper_wrapper_fn(int *gtid, int *, ...) {
9135   // This is an explicit synchronization on all hidden helper threads in case
9136   // that when a regular thread pushes a hidden helper task to one hidden
9137   // helper thread, the thread has not been awaken once since they're released
9138   // by the main thread after creating the team.
9139   KMP_ATOMIC_INC(&__kmp_hit_hidden_helper_threads_num);
9140   while (KMP_ATOMIC_LD_ACQ(&__kmp_hit_hidden_helper_threads_num) !=
9141          __kmp_hidden_helper_threads_num)
9142     ;
9143 
9144   // If main thread, then wait for signal
9145   if (__kmpc_master(nullptr, *gtid)) {
9146     // First, unset the initial state and release the initial thread
9147     TCW_4(__kmp_init_hidden_helper_threads, FALSE);
9148     __kmp_hidden_helper_initz_release();
9149     __kmp_hidden_helper_main_thread_wait();
9150     // Now wake up all worker threads
9151     for (int i = 1; i < __kmp_hit_hidden_helper_threads_num; ++i) {
9152       __kmp_hidden_helper_worker_thread_signal();
9153     }
9154   }
9155 }
9156 } // namespace
9157 
9158 void __kmp_hidden_helper_threads_initz_routine() {
9159   // Create a new root for hidden helper team/threads
9160   const int gtid = __kmp_register_root(TRUE);
9161   __kmp_hidden_helper_main_thread = __kmp_threads[gtid];
9162   __kmp_hidden_helper_threads = &__kmp_threads[gtid];
9163   __kmp_hidden_helper_main_thread->th.th_set_nproc =
9164       __kmp_hidden_helper_threads_num;
9165 
9166   KMP_ATOMIC_ST_REL(&__kmp_hit_hidden_helper_threads_num, 0);
9167 
9168   __kmpc_fork_call(nullptr, 0, __kmp_hidden_helper_wrapper_fn);
9169 
9170   // Set the initialization flag to FALSE
9171   TCW_SYNC_4(__kmp_init_hidden_helper, FALSE);
9172 
9173   __kmp_hidden_helper_threads_deinitz_release();
9174 }
9175 
9176 /* Nesting Mode:
9177    Set via KMP_NESTING_MODE, which takes an integer.
9178    Note: we skip duplicate topology levels, and skip levels with only
9179       one entity.
9180    KMP_NESTING_MODE=0 is the default, and doesn't use nesting mode.
9181    KMP_NESTING_MODE=1 sets as many nesting levels as there are distinct levels
9182       in the topology, and initializes the number of threads at each of those
9183       levels to the number of entities at each level, respectively, below the
9184       entity at the parent level.
9185    KMP_NESTING_MODE=N, where N>1, attempts to create up to N nesting levels,
9186       but starts with nesting OFF -- max-active-levels-var is 1 -- and requires
9187       the user to turn nesting on explicitly. This is an even more experimental
9188       option to this experimental feature, and may change or go away in the
9189       future.
9190 */
9191 
9192 // Allocate space to store nesting levels
9193 void __kmp_init_nesting_mode() {
9194   int levels = KMP_HW_LAST;
9195   __kmp_nesting_mode_nlevels = levels;
9196   __kmp_nesting_nth_level = (int *)KMP_INTERNAL_MALLOC(levels * sizeof(int));
9197   for (int i = 0; i < levels; ++i)
9198     __kmp_nesting_nth_level[i] = 0;
9199   if (__kmp_nested_nth.size < levels) {
9200     __kmp_nested_nth.nth =
9201         (int *)KMP_INTERNAL_REALLOC(__kmp_nested_nth.nth, levels * sizeof(int));
9202     __kmp_nested_nth.size = levels;
9203   }
9204 }
9205 
9206 // Set # threads for top levels of nesting; must be called after topology set
9207 void __kmp_set_nesting_mode_threads() {
9208   kmp_info_t *thread = __kmp_threads[__kmp_entry_gtid()];
9209 
9210   if (__kmp_nesting_mode == 1)
9211     __kmp_nesting_mode_nlevels = KMP_MAX_ACTIVE_LEVELS_LIMIT;
9212   else if (__kmp_nesting_mode > 1)
9213     __kmp_nesting_mode_nlevels = __kmp_nesting_mode;
9214 
9215   if (__kmp_topology) { // use topology info
9216     int loc, hw_level;
9217     for (loc = 0, hw_level = 0; hw_level < __kmp_topology->get_depth() &&
9218                                 loc < __kmp_nesting_mode_nlevels;
9219          loc++, hw_level++) {
9220       __kmp_nesting_nth_level[loc] = __kmp_topology->get_ratio(hw_level);
9221       if (__kmp_nesting_nth_level[loc] == 1)
9222         loc--;
9223     }
9224     // Make sure all cores are used
9225     if (__kmp_nesting_mode > 1 && loc > 1) {
9226       int core_level = __kmp_topology->get_level(KMP_HW_CORE);
9227       int num_cores = __kmp_topology->get_count(core_level);
9228       int upper_levels = 1;
9229       for (int level = 0; level < loc - 1; ++level)
9230         upper_levels *= __kmp_nesting_nth_level[level];
9231       if (upper_levels * __kmp_nesting_nth_level[loc - 1] < num_cores)
9232         __kmp_nesting_nth_level[loc - 1] =
9233             num_cores / __kmp_nesting_nth_level[loc - 2];
9234     }
9235     __kmp_nesting_mode_nlevels = loc;
9236     __kmp_nested_nth.used = __kmp_nesting_mode_nlevels;
9237   } else { // no topology info available; provide a reasonable guesstimation
9238     if (__kmp_avail_proc >= 4) {
9239       __kmp_nesting_nth_level[0] = __kmp_avail_proc / 2;
9240       __kmp_nesting_nth_level[1] = 2;
9241       __kmp_nesting_mode_nlevels = 2;
9242     } else {
9243       __kmp_nesting_nth_level[0] = __kmp_avail_proc;
9244       __kmp_nesting_mode_nlevels = 1;
9245     }
9246     __kmp_nested_nth.used = __kmp_nesting_mode_nlevels;
9247   }
9248   for (int i = 0; i < __kmp_nesting_mode_nlevels; ++i) {
9249     __kmp_nested_nth.nth[i] = __kmp_nesting_nth_level[i];
9250   }
9251   set__nproc(thread, __kmp_nesting_nth_level[0]);
9252   if (__kmp_nesting_mode > 1 && __kmp_nesting_mode_nlevels > __kmp_nesting_mode)
9253     __kmp_nesting_mode_nlevels = __kmp_nesting_mode;
9254   if (get__max_active_levels(thread) > 1) {
9255     // if max levels was set, set nesting mode levels to same
9256     __kmp_nesting_mode_nlevels = get__max_active_levels(thread);
9257   }
9258   if (__kmp_nesting_mode == 1) // turn on nesting for this case only
9259     set__max_active_levels(thread, __kmp_nesting_mode_nlevels);
9260 }
9261 
9262 // Empty symbols to export (see exports_so.txt) when feature is disabled
9263 extern "C" {
9264 #if !KMP_STATS_ENABLED
9265 void __kmp_reset_stats() {}
9266 #endif
9267 #if !USE_DEBUGGER
9268 int __kmp_omp_debug_struct_info = FALSE;
9269 int __kmp_debugging = FALSE;
9270 #endif
9271 #if !USE_ITT_BUILD || !USE_ITT_NOTIFY
9272 void __kmp_itt_fini_ittlib() {}
9273 void __kmp_itt_init_ittlib() {}
9274 #endif
9275 }
9276 
9277 // end of file
9278