xref: /freebsd/contrib/llvm-project/openmp/runtime/src/kmp_runtime.cpp (revision e64fe029e9d3ce476e77a478318e0c3cd201ff08)
1 /*
2  * kmp_runtime.cpp -- KPTS runtime support library
3  */
4 
5 //===----------------------------------------------------------------------===//
6 //
7 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8 // See https://llvm.org/LICENSE.txt for license information.
9 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "kmp.h"
14 #include "kmp_affinity.h"
15 #include "kmp_atomic.h"
16 #include "kmp_environment.h"
17 #include "kmp_error.h"
18 #include "kmp_i18n.h"
19 #include "kmp_io.h"
20 #include "kmp_itt.h"
21 #include "kmp_settings.h"
22 #include "kmp_stats.h"
23 #include "kmp_str.h"
24 #include "kmp_wait_release.h"
25 #include "kmp_wrapper_getpid.h"
26 #include "kmp_dispatch.h"
27 #if KMP_USE_HIER_SCHED
28 #include "kmp_dispatch_hier.h"
29 #endif
30 
31 #if OMPT_SUPPORT
32 #include "ompt-specific.h"
33 #endif
34 #if OMPD_SUPPORT
35 #include "ompd-specific.h"
36 #endif
37 
38 #if OMP_PROFILING_SUPPORT
39 #include "llvm/Support/TimeProfiler.h"
40 static char *ProfileTraceFile = nullptr;
41 #endif
42 
43 /* these are temporary issues to be dealt with */
44 #define KMP_USE_PRCTL 0
45 
46 #if KMP_OS_WINDOWS
47 #include <process.h>
48 #endif
49 
50 #if KMP_OS_WINDOWS
51 // windows does not need include files as it doesn't use shared memory
52 #else
53 #include <sys/mman.h>
54 #include <sys/stat.h>
55 #include <fcntl.h>
56 #define SHM_SIZE 1024
57 #endif
58 
/* Version-identification strings. Each one is KMP_VERSION_PREFIX followed by
   a short human-readable description. */

/* Only defined when the build enables KMP_GOMP_COMPAT. */
#if defined(KMP_GOMP_COMPAT)
char const __kmp_version_alt_comp[] =
    KMP_VERSION_PREFIX "alternative compiler support: yes";
#endif /* defined(KMP_GOMP_COMPAT) */

/* API version string reported by this library. */
char const __kmp_version_omp_api[] =
    KMP_VERSION_PREFIX "API version: 5.0 (201611)";

/* Only defined in KMP_DEBUG builds. */
#ifdef KMP_DEBUG
char const __kmp_version_lock[] =
    KMP_VERSION_PREFIX "lock type: run time selectable";
#endif /* KMP_DEBUG */

/* Smaller of x and y. The selected argument is evaluated twice, so do not
   pass expressions with side effects. */
#define KMP_MIN(x, y) ((x) < (y) ? (x) : (y))
73 
74 /* ------------------------------------------------------------------------ */
75 
/* Thread descriptor object that exists only in KMP_USE_MONITOR builds. */
#if KMP_USE_MONITOR
kmp_info_t __kmp_monitor;
#endif

/* Forward declarations */
/* The `static` prototypes below are file-local; their definitions are not
   visible in this part of the file. */

void __kmp_cleanup(void);

static void __kmp_initialize_info(kmp_info_t *, kmp_team_t *, int tid,
                                  int gtid);
static void __kmp_initialize_team(kmp_team_t *team, int new_nproc,
                                  kmp_internal_control_t *new_icvs,
                                  ident_t *loc);
#if KMP_AFFINITY_SUPPORTED
/* update_master_only defaults to 0 when the caller omits it. */
static void __kmp_partition_places(kmp_team_t *team,
                                   int update_master_only = 0);
#endif
static void __kmp_do_serial_initialize(void);
void __kmp_fork_barrier(int gtid, int tid);
void __kmp_join_barrier(int gtid);
void __kmp_setup_icv_copy(kmp_team_t *team, int new_nproc,
                          kmp_internal_control_t *new_icvs, ident_t *loc);

#ifdef USE_LOAD_BALANCE
static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc);
#endif

static int __kmp_expand_threads(int nNeed);
#if KMP_OS_WINDOWS
static int __kmp_unregister_root_other_thread(int gtid);
#endif
static void __kmp_reap_thread(kmp_info_t *thread, int is_root);
/* NOTE(review): presumably an insertion cursor into the thread pool list --
   confirm against the code that maintains the pool. Starts out NULL. */
kmp_info_t *__kmp_thread_pool_insert_pt = NULL;

void __kmp_resize_dist_barrier(kmp_team_t *team, int old_nthreads,
                               int new_nthreads);
void __kmp_add_threads_to_team(kmp_team_t *team, int new_nthreads);
113 
114 /* Calculate the identifier of the current thread */
115 /* fast (and somewhat portable) way to get unique identifier of executing
116    thread. Returns KMP_GTID_DNE if we haven't been assigned a gtid. */
117 int __kmp_get_global_thread_id() {
118   int i;
119   kmp_info_t **other_threads;
120   size_t stack_data;
121   char *stack_addr;
122   size_t stack_size;
123   char *stack_base;
124 
125   KA_TRACE(
126       1000,
127       ("*** __kmp_get_global_thread_id: entering, nproc=%d  all_nproc=%d\n",
128        __kmp_nth, __kmp_all_nth));
129 
130   /* JPH - to handle the case where __kmpc_end(0) is called immediately prior to
131      a parallel region, made it return KMP_GTID_DNE to force serial_initialize
132      by caller. Had to handle KMP_GTID_DNE at all call-sites, or else guarantee
133      __kmp_init_gtid for this to work. */
134 
135   if (!TCR_4(__kmp_init_gtid))
136     return KMP_GTID_DNE;
137 
138 #ifdef KMP_TDATA_GTID
139   if (TCR_4(__kmp_gtid_mode) >= 3) {
140     KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using TDATA\n"));
141     return __kmp_gtid;
142   }
143 #endif
144   if (TCR_4(__kmp_gtid_mode) >= 2) {
145     KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using keyed TLS\n"));
146     return __kmp_gtid_get_specific();
147   }
148   KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using internal alg.\n"));
149 
150   stack_addr = (char *)&stack_data;
151   other_threads = __kmp_threads;
152 
153   /* ATT: The code below is a source of potential bugs due to unsynchronized
154      access to __kmp_threads array. For example:
155      1. Current thread loads other_threads[i] to thr and checks it, it is
156         non-NULL.
157      2. Current thread is suspended by OS.
158      3. Another thread unregisters and finishes (debug versions of free()
159         may fill memory with something like 0xEF).
160      4. Current thread is resumed.
161      5. Current thread reads junk from *thr.
162      TODO: Fix it.  --ln  */
163 
164   for (i = 0; i < __kmp_threads_capacity; i++) {
165 
166     kmp_info_t *thr = (kmp_info_t *)TCR_SYNC_PTR(other_threads[i]);
167     if (!thr)
168       continue;
169 
170     stack_size = (size_t)TCR_PTR(thr->th.th_info.ds.ds_stacksize);
171     stack_base = (char *)TCR_PTR(thr->th.th_info.ds.ds_stackbase);
172 
173     /* stack grows down -- search through all of the active threads */
174 
175     if (stack_addr <= stack_base) {
176       size_t stack_diff = stack_base - stack_addr;
177 
178       if (stack_diff <= stack_size) {
179         /* The only way we can be closer than the allocated */
180         /* stack size is if we are running on this thread. */
181         KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() == i);
182         return i;
183       }
184     }
185   }
186 
187   /* get specific to try and determine our gtid */
188   KA_TRACE(1000,
189            ("*** __kmp_get_global_thread_id: internal alg. failed to find "
190             "thread, using TLS\n"));
191   i = __kmp_gtid_get_specific();
192 
193   /*fprintf( stderr, "=== %d\n", i );  */ /* GROO */
194 
195   /* if we havn't been assigned a gtid, then return code */
196   if (i < 0)
197     return i;
198 
199   /* dynamically updated stack window for uber threads to avoid get_specific
200      call */
201   if (!TCR_4(other_threads[i]->th.th_info.ds.ds_stackgrow)) {
202     KMP_FATAL(StackOverflow, i);
203   }
204 
205   stack_base = (char *)other_threads[i]->th.th_info.ds.ds_stackbase;
206   if (stack_addr > stack_base) {
207     TCW_PTR(other_threads[i]->th.th_info.ds.ds_stackbase, stack_addr);
208     TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize,
209             other_threads[i]->th.th_info.ds.ds_stacksize + stack_addr -
210                 stack_base);
211   } else {
212     TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize,
213             stack_base - stack_addr);
214   }
215 
216   /* Reprint stack bounds for ubermaster since they have been refined */
217   if (__kmp_storage_map) {
218     char *stack_end = (char *)other_threads[i]->th.th_info.ds.ds_stackbase;
219     char *stack_beg = stack_end - other_threads[i]->th.th_info.ds.ds_stacksize;
220     __kmp_print_storage_map_gtid(i, stack_beg, stack_end,
221                                  other_threads[i]->th.th_info.ds.ds_stacksize,
222                                  "th_%d stack (refinement)", i);
223   }
224   return i;
225 }
226 
227 int __kmp_get_global_thread_id_reg() {
228   int gtid;
229 
230   if (!__kmp_init_serial) {
231     gtid = KMP_GTID_DNE;
232   } else
233 #ifdef KMP_TDATA_GTID
234       if (TCR_4(__kmp_gtid_mode) >= 3) {
235     KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using TDATA\n"));
236     gtid = __kmp_gtid;
237   } else
238 #endif
239       if (TCR_4(__kmp_gtid_mode) >= 2) {
240     KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using keyed TLS\n"));
241     gtid = __kmp_gtid_get_specific();
242   } else {
243     KA_TRACE(1000,
244              ("*** __kmp_get_global_thread_id_reg: using internal alg.\n"));
245     gtid = __kmp_get_global_thread_id();
246   }
247 
248   /* we must be a new uber master sibling thread */
249   if (gtid == KMP_GTID_DNE) {
250     KA_TRACE(10,
251              ("__kmp_get_global_thread_id_reg: Encountered new root thread. "
252               "Registering a new gtid.\n"));
253     __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
254     if (!__kmp_init_serial) {
255       __kmp_do_serial_initialize();
256       gtid = __kmp_gtid_get_specific();
257     } else {
258       gtid = __kmp_register_root(FALSE);
259     }
260     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
261     /*__kmp_printf( "+++ %d\n", gtid ); */ /* GROO */
262   }
263 
264   KMP_DEBUG_ASSERT(gtid >= 0);
265 
266   return gtid;
267 }
268 
269 /* caller must hold forkjoin_lock */
270 void __kmp_check_stack_overlap(kmp_info_t *th) {
271   int f;
272   char *stack_beg = NULL;
273   char *stack_end = NULL;
274   int gtid;
275 
276   KA_TRACE(10, ("__kmp_check_stack_overlap: called\n"));
277   if (__kmp_storage_map) {
278     stack_end = (char *)th->th.th_info.ds.ds_stackbase;
279     stack_beg = stack_end - th->th.th_info.ds.ds_stacksize;
280 
281     gtid = __kmp_gtid_from_thread(th);
282 
283     if (gtid == KMP_GTID_MONITOR) {
284       __kmp_print_storage_map_gtid(
285           gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize,
286           "th_%s stack (%s)", "mon",
287           (th->th.th_info.ds.ds_stackgrow) ? "initial" : "actual");
288     } else {
289       __kmp_print_storage_map_gtid(
290           gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize,
291           "th_%d stack (%s)", gtid,
292           (th->th.th_info.ds.ds_stackgrow) ? "initial" : "actual");
293     }
294   }
295 
296   /* No point in checking ubermaster threads since they use refinement and
297    * cannot overlap */
298   gtid = __kmp_gtid_from_thread(th);
299   if (__kmp_env_checks == TRUE && !KMP_UBER_GTID(gtid)) {
300     KA_TRACE(10,
301              ("__kmp_check_stack_overlap: performing extensive checking\n"));
302     if (stack_beg == NULL) {
303       stack_end = (char *)th->th.th_info.ds.ds_stackbase;
304       stack_beg = stack_end - th->th.th_info.ds.ds_stacksize;
305     }
306 
307     for (f = 0; f < __kmp_threads_capacity; f++) {
308       kmp_info_t *f_th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[f]);
309 
310       if (f_th && f_th != th) {
311         char *other_stack_end =
312             (char *)TCR_PTR(f_th->th.th_info.ds.ds_stackbase);
313         char *other_stack_beg =
314             other_stack_end - (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize);
315         if ((stack_beg > other_stack_beg && stack_beg < other_stack_end) ||
316             (stack_end > other_stack_beg && stack_end < other_stack_end)) {
317 
318           /* Print the other stack values before the abort */
319           if (__kmp_storage_map)
320             __kmp_print_storage_map_gtid(
321                 -1, other_stack_beg, other_stack_end,
322                 (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize),
323                 "th_%d stack (overlapped)", __kmp_gtid_from_thread(f_th));
324 
325           __kmp_fatal(KMP_MSG(StackOverlap), KMP_HNT(ChangeStackLimit),
326                       __kmp_msg_null);
327         }
328       }
329     }
330   }
331   KA_TRACE(10, ("__kmp_check_stack_overlap: returning\n"));
332 }
333 
334 /* ------------------------------------------------------------------------ */
335 
336 void __kmp_infinite_loop(void) {
337   static int done = FALSE;
338 
339   while (!done) {
340     KMP_YIELD(TRUE);
341   }
342 }
343 
344 #define MAX_MESSAGE 512
345 
346 void __kmp_print_storage_map_gtid(int gtid, void *p1, void *p2, size_t size,
347                                   char const *format, ...) {
348   char buffer[MAX_MESSAGE];
349   va_list ap;
350 
351   va_start(ap, format);
352   KMP_SNPRINTF(buffer, sizeof(buffer), "OMP storage map: %p %p%8lu %s\n", p1,
353                p2, (unsigned long)size, format);
354   __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock);
355   __kmp_vprintf(kmp_err, buffer, ap);
356 #if KMP_PRINT_DATA_PLACEMENT
357   int node;
358   if (gtid >= 0) {
359     if (p1 <= p2 && (char *)p2 - (char *)p1 == size) {
360       if (__kmp_storage_map_verbose) {
361         node = __kmp_get_host_node(p1);
362         if (node < 0) /* doesn't work, so don't try this next time */
363           __kmp_storage_map_verbose = FALSE;
364         else {
365           char *last;
366           int lastNode;
367           int localProc = __kmp_get_cpu_from_gtid(gtid);
368 
369           const int page_size = KMP_GET_PAGE_SIZE();
370 
371           p1 = (void *)((size_t)p1 & ~((size_t)page_size - 1));
372           p2 = (void *)(((size_t)p2 - 1) & ~((size_t)page_size - 1));
373           if (localProc >= 0)
374             __kmp_printf_no_lock("  GTID %d localNode %d\n", gtid,
375                                  localProc >> 1);
376           else
377             __kmp_printf_no_lock("  GTID %d\n", gtid);
378 #if KMP_USE_PRCTL
379           /* The more elaborate format is disabled for now because of the prctl
380            * hanging bug. */
381           do {
382             last = p1;
383             lastNode = node;
384             /* This loop collates adjacent pages with the same host node. */
385             do {
386               (char *)p1 += page_size;
387             } while (p1 <= p2 && (node = __kmp_get_host_node(p1)) == lastNode);
388             __kmp_printf_no_lock("    %p-%p memNode %d\n", last, (char *)p1 - 1,
389                                  lastNode);
390           } while (p1 <= p2);
391 #else
392           __kmp_printf_no_lock("    %p-%p memNode %d\n", p1,
393                                (char *)p1 + (page_size - 1),
394                                __kmp_get_host_node(p1));
395           if (p1 < p2) {
396             __kmp_printf_no_lock("    %p-%p memNode %d\n", p2,
397                                  (char *)p2 + (page_size - 1),
398                                  __kmp_get_host_node(p2));
399           }
400 #endif
401         }
402       }
403     } else
404       __kmp_printf_no_lock("  %s\n", KMP_I18N_STR(StorageMapWarning));
405   }
406 #endif /* KMP_PRINT_DATA_PLACEMENT */
407   __kmp_release_bootstrap_lock(&__kmp_stdio_lock);
408 }
409 
410 void __kmp_warn(char const *format, ...) {
411   char buffer[MAX_MESSAGE];
412   va_list ap;
413 
414   if (__kmp_generate_warnings == kmp_warnings_off) {
415     return;
416   }
417 
418   va_start(ap, format);
419 
420   KMP_SNPRINTF(buffer, sizeof(buffer), "OMP warning: %s\n", format);
421   __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock);
422   __kmp_vprintf(kmp_err, buffer, ap);
423   __kmp_release_bootstrap_lock(&__kmp_stdio_lock);
424 
425   va_end(ap);
426 }
427 
428 void __kmp_abort_process() {
429   // Later threads may stall here, but that's ok because abort() will kill them.
430   __kmp_acquire_bootstrap_lock(&__kmp_exit_lock);
431 
432   if (__kmp_debug_buf) {
433     __kmp_dump_debug_buffer();
434   }
435 
436   if (KMP_OS_WINDOWS) {
437     // Let other threads know of abnormal termination and prevent deadlock
438     // if abort happened during library initialization or shutdown
439     __kmp_global.g.g_abort = SIGABRT;
440 
441     /* On Windows* OS by default abort() causes pop-up error box, which stalls
442        nightly testing. Unfortunately, we cannot reliably suppress pop-up error
443        boxes. _set_abort_behavior() works well, but this function is not
444        available in VS7 (this is not problem for DLL, but it is a problem for
445        static OpenMP RTL). SetErrorMode (and so, timelimit utility) does not
446        help, at least in some versions of MS C RTL.
447 
448        It seems following sequence is the only way to simulate abort() and
449        avoid pop-up error box. */
450     raise(SIGABRT);
451     _exit(3); // Just in case, if signal ignored, exit anyway.
452   } else {
453     __kmp_unregister_library();
454     abort();
455   }
456 
457   __kmp_infinite_loop();
458   __kmp_release_bootstrap_lock(&__kmp_exit_lock);
459 
460 } // __kmp_abort_process
461 
/* Park the calling thread: simply enters __kmp_infinite_loop(). */
void __kmp_abort_thread(void) {
  // TODO: Eliminate g_abort global variable and this function.
  // In case of abort just call abort(), it will kill all the threads.
  __kmp_infinite_loop();
} // __kmp_abort_thread
467 
468 /* Print out the storage map for the major kmp_info_t thread data structures
469    that are allocated together. */
470 
471 static void __kmp_print_thread_storage_map(kmp_info_t *thr, int gtid) {
472   __kmp_print_storage_map_gtid(gtid, thr, thr + 1, sizeof(kmp_info_t), "th_%d",
473                                gtid);
474 
475   __kmp_print_storage_map_gtid(gtid, &thr->th.th_info, &thr->th.th_team,
476                                sizeof(kmp_desc_t), "th_%d.th_info", gtid);
477 
478   __kmp_print_storage_map_gtid(gtid, &thr->th.th_local, &thr->th.th_pri_head,
479                                sizeof(kmp_local_t), "th_%d.th_local", gtid);
480 
481   __kmp_print_storage_map_gtid(
482       gtid, &thr->th.th_bar[0], &thr->th.th_bar[bs_last_barrier],
483       sizeof(kmp_balign_t) * bs_last_barrier, "th_%d.th_bar", gtid);
484 
485   __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_plain_barrier],
486                                &thr->th.th_bar[bs_plain_barrier + 1],
487                                sizeof(kmp_balign_t), "th_%d.th_bar[plain]",
488                                gtid);
489 
490   __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_forkjoin_barrier],
491                                &thr->th.th_bar[bs_forkjoin_barrier + 1],
492                                sizeof(kmp_balign_t), "th_%d.th_bar[forkjoin]",
493                                gtid);
494 
495 #if KMP_FAST_REDUCTION_BARRIER
496   __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_reduction_barrier],
497                                &thr->th.th_bar[bs_reduction_barrier + 1],
498                                sizeof(kmp_balign_t), "th_%d.th_bar[reduction]",
499                                gtid);
500 #endif // KMP_FAST_REDUCTION_BARRIER
501 }
502 
503 /* Print out the storage map for the major kmp_team_t team data structures
504    that are allocated together. */
505 
506 static void __kmp_print_team_storage_map(const char *header, kmp_team_t *team,
507                                          int team_id, int num_thr) {
508   int num_disp_buff = team->t.t_max_nproc > 1 ? __kmp_dispatch_num_buffers : 2;
509   __kmp_print_storage_map_gtid(-1, team, team + 1, sizeof(kmp_team_t), "%s_%d",
510                                header, team_id);
511 
512   __kmp_print_storage_map_gtid(-1, &team->t.t_bar[0],
513                                &team->t.t_bar[bs_last_barrier],
514                                sizeof(kmp_balign_team_t) * bs_last_barrier,
515                                "%s_%d.t_bar", header, team_id);
516 
517   __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_plain_barrier],
518                                &team->t.t_bar[bs_plain_barrier + 1],
519                                sizeof(kmp_balign_team_t), "%s_%d.t_bar[plain]",
520                                header, team_id);
521 
522   __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_forkjoin_barrier],
523                                &team->t.t_bar[bs_forkjoin_barrier + 1],
524                                sizeof(kmp_balign_team_t),
525                                "%s_%d.t_bar[forkjoin]", header, team_id);
526 
527 #if KMP_FAST_REDUCTION_BARRIER
528   __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_reduction_barrier],
529                                &team->t.t_bar[bs_reduction_barrier + 1],
530                                sizeof(kmp_balign_team_t),
531                                "%s_%d.t_bar[reduction]", header, team_id);
532 #endif // KMP_FAST_REDUCTION_BARRIER
533 
534   __kmp_print_storage_map_gtid(
535       -1, &team->t.t_dispatch[0], &team->t.t_dispatch[num_thr],
536       sizeof(kmp_disp_t) * num_thr, "%s_%d.t_dispatch", header, team_id);
537 
538   __kmp_print_storage_map_gtid(
539       -1, &team->t.t_threads[0], &team->t.t_threads[num_thr],
540       sizeof(kmp_info_t *) * num_thr, "%s_%d.t_threads", header, team_id);
541 
542   __kmp_print_storage_map_gtid(-1, &team->t.t_disp_buffer[0],
543                                &team->t.t_disp_buffer[num_disp_buff],
544                                sizeof(dispatch_shared_info_t) * num_disp_buff,
545                                "%s_%d.t_disp_buffer", header, team_id);
546 }
547 
/* Set up the allocator back ends by calling __kmp_init_memkind() and then
   __kmp_init_target_mem(). */
static void __kmp_init_allocator() {
  __kmp_init_memkind();
  __kmp_init_target_mem();
}
/* Counterpart of __kmp_init_allocator(); only calls __kmp_fini_memkind(). */
static void __kmp_fini_allocator() { __kmp_fini_memkind(); }
553 
554 /* ------------------------------------------------------------------------ */
555 
556 #if KMP_DYNAMIC_LIB
557 #if KMP_OS_WINDOWS
558 
559 BOOL WINAPI DllMain(HINSTANCE hInstDLL, DWORD fdwReason, LPVOID lpReserved) {
560   //__kmp_acquire_bootstrap_lock( &__kmp_initz_lock );
561 
562   switch (fdwReason) {
563 
564   case DLL_PROCESS_ATTACH:
565     KA_TRACE(10, ("DllMain: PROCESS_ATTACH\n"));
566 
567     return TRUE;
568 
569   case DLL_PROCESS_DETACH:
570     KA_TRACE(10, ("DllMain: PROCESS_DETACH T#%d\n", __kmp_gtid_get_specific()));
571 
572     // According to Windows* documentation for DllMain entry point:
573     // for DLL_PROCESS_DETACH, lpReserved is used for telling the difference:
574     //   lpReserved == NULL when FreeLibrary() is called,
575     //   lpReserved != NULL when the process is terminated.
576     // When FreeLibrary() is called, worker threads remain alive. So the
577     // runtime's state is consistent and executing proper shutdown is OK.
578     // When the process is terminated, worker threads have exited or been
579     // forcefully terminated by the OS and only the shutdown thread remains.
580     // This can leave the runtime in an inconsistent state.
581     // Hence, only attempt proper cleanup when FreeLibrary() is called.
582     // Otherwise, rely on OS to reclaim resources.
583     if (lpReserved == NULL)
584       __kmp_internal_end_library(__kmp_gtid_get_specific());
585 
586     return TRUE;
587 
588   case DLL_THREAD_ATTACH:
589     KA_TRACE(10, ("DllMain: THREAD_ATTACH\n"));
590 
591     /* if we want to register new siblings all the time here call
592      * __kmp_get_gtid(); */
593     return TRUE;
594 
595   case DLL_THREAD_DETACH:
596     KA_TRACE(10, ("DllMain: THREAD_DETACH T#%d\n", __kmp_gtid_get_specific()));
597 
598     __kmp_internal_end_thread(__kmp_gtid_get_specific());
599     return TRUE;
600   }
601 
602   return TRUE;
603 }
604 
605 #endif /* KMP_OS_WINDOWS */
606 #endif /* KMP_DYNAMIC_LIB */
607 
608 /* __kmp_parallel_deo -- Wait until it's our turn. */
609 void __kmp_parallel_deo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
610   int gtid = *gtid_ref;
611 #ifdef BUILD_PARALLEL_ORDERED
612   kmp_team_t *team = __kmp_team_from_gtid(gtid);
613 #endif /* BUILD_PARALLEL_ORDERED */
614 
615   if (__kmp_env_consistency_check) {
616     if (__kmp_threads[gtid]->th.th_root->r.r_active)
617 #if KMP_USE_DYNAMIC_LOCK
618       __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL, 0);
619 #else
620       __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL);
621 #endif
622   }
623 #ifdef BUILD_PARALLEL_ORDERED
624   if (!team->t.t_serialized) {
625     KMP_MB();
626     KMP_WAIT(&team->t.t_ordered.dt.t_value, __kmp_tid_from_gtid(gtid), KMP_EQ,
627              NULL);
628     KMP_MB();
629   }
630 #endif /* BUILD_PARALLEL_ORDERED */
631 }
632 
633 /* __kmp_parallel_dxo -- Signal the next task. */
634 void __kmp_parallel_dxo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
635   int gtid = *gtid_ref;
636 #ifdef BUILD_PARALLEL_ORDERED
637   int tid = __kmp_tid_from_gtid(gtid);
638   kmp_team_t *team = __kmp_team_from_gtid(gtid);
639 #endif /* BUILD_PARALLEL_ORDERED */
640 
641   if (__kmp_env_consistency_check) {
642     if (__kmp_threads[gtid]->th.th_root->r.r_active)
643       __kmp_pop_sync(gtid, ct_ordered_in_parallel, loc_ref);
644   }
645 #ifdef BUILD_PARALLEL_ORDERED
646   if (!team->t.t_serialized) {
647     KMP_MB(); /* Flush all pending memory write invalidates.  */
648 
649     /* use the tid of the next thread in this team */
650     /* TODO replace with general release procedure */
651     team->t.t_ordered.dt.t_value = ((tid + 1) % team->t.t_nproc);
652 
653     KMP_MB(); /* Flush all pending memory write invalidates.  */
654   }
655 #endif /* BUILD_PARALLEL_ORDERED */
656 }
657 
658 /* ------------------------------------------------------------------------ */
659 /* The BARRIER for a SINGLE process section is always explicit   */
660 
661 int __kmp_enter_single(int gtid, ident_t *id_ref, int push_ws) {
662   int status;
663   kmp_info_t *th;
664   kmp_team_t *team;
665 
666   if (!TCR_4(__kmp_init_parallel))
667     __kmp_parallel_initialize();
668   __kmp_resume_if_soft_paused();
669 
670   th = __kmp_threads[gtid];
671   team = th->th.th_team;
672   status = 0;
673 
674   th->th.th_ident = id_ref;
675 
676   if (team->t.t_serialized) {
677     status = 1;
678   } else {
679     kmp_int32 old_this = th->th.th_local.this_construct;
680 
681     ++th->th.th_local.this_construct;
682     /* try to set team count to thread count--success means thread got the
683        single block */
684     /* TODO: Should this be acquire or release? */
685     if (team->t.t_construct == old_this) {
686       status = __kmp_atomic_compare_store_acq(&team->t.t_construct, old_this,
687                                               th->th.th_local.this_construct);
688     }
689 #if USE_ITT_BUILD
690     if (__itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 &&
691         KMP_MASTER_GTID(gtid) && th->th.th_teams_microtask == NULL &&
692         team->t.t_active_level == 1) {
693       // Only report metadata by primary thread of active team at level 1
694       __kmp_itt_metadata_single(id_ref);
695     }
696 #endif /* USE_ITT_BUILD */
697   }
698 
699   if (__kmp_env_consistency_check) {
700     if (status && push_ws) {
701       __kmp_push_workshare(gtid, ct_psingle, id_ref);
702     } else {
703       __kmp_check_workshare(gtid, ct_psingle, id_ref);
704     }
705   }
706 #if USE_ITT_BUILD
707   if (status) {
708     __kmp_itt_single_start(gtid);
709   }
710 #endif /* USE_ITT_BUILD */
711   return status;
712 }
713 
/* Leave a SINGLE block for thread `gtid`: notifies ITT (USE_ITT_BUILD builds)
   and, when consistency checking is enabled, pops the ct_psingle workshare. */
void __kmp_exit_single(int gtid) {
#if USE_ITT_BUILD
  __kmp_itt_single_end(gtid);
#endif /* USE_ITT_BUILD */
  if (__kmp_env_consistency_check)
    __kmp_pop_workshare(gtid, ct_psingle, NULL);
}
721 
722 /* determine if we can go parallel or must use a serialized parallel region and
723  * how many threads we can use
724  * set_nproc is the number of threads requested for the team
725  * returns 0 if we should serialize or only use one thread,
726  * otherwise the number of threads to use
727  * The forkjoin lock is held by the caller. */
728 static int __kmp_reserve_threads(kmp_root_t *root, kmp_team_t *parent_team,
729                                  int master_tid, int set_nthreads,
730                                  int enter_teams) {
731   int capacity;
732   int new_nthreads;
733   KMP_DEBUG_ASSERT(__kmp_init_serial);
734   KMP_DEBUG_ASSERT(root && parent_team);
735   kmp_info_t *this_thr = parent_team->t.t_threads[master_tid];
736 
737   // If dyn-var is set, dynamically adjust the number of desired threads,
738   // according to the method specified by dynamic_mode.
739   new_nthreads = set_nthreads;
740   if (!get__dynamic_2(parent_team, master_tid)) {
741     ;
742   }
743 #ifdef USE_LOAD_BALANCE
744   else if (__kmp_global.g.g_dynamic_mode == dynamic_load_balance) {
745     new_nthreads = __kmp_load_balance_nproc(root, set_nthreads);
746     if (new_nthreads == 1) {
747       KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced "
748                     "reservation to 1 thread\n",
749                     master_tid));
750       return 1;
751     }
752     if (new_nthreads < set_nthreads) {
753       KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced "
754                     "reservation to %d threads\n",
755                     master_tid, new_nthreads));
756     }
757   }
758 #endif /* USE_LOAD_BALANCE */
759   else if (__kmp_global.g.g_dynamic_mode == dynamic_thread_limit) {
760     new_nthreads = __kmp_avail_proc - __kmp_nth +
761                    (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
762     if (new_nthreads <= 1) {
763       KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced "
764                     "reservation to 1 thread\n",
765                     master_tid));
766       return 1;
767     }
768     if (new_nthreads < set_nthreads) {
769       KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced "
770                     "reservation to %d threads\n",
771                     master_tid, new_nthreads));
772     } else {
773       new_nthreads = set_nthreads;
774     }
775   } else if (__kmp_global.g.g_dynamic_mode == dynamic_random) {
776     if (set_nthreads > 2) {
777       new_nthreads = __kmp_get_random(parent_team->t.t_threads[master_tid]);
778       new_nthreads = (new_nthreads % set_nthreads) + 1;
779       if (new_nthreads == 1) {
780         KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced "
781                       "reservation to 1 thread\n",
782                       master_tid));
783         return 1;
784       }
785       if (new_nthreads < set_nthreads) {
786         KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced "
787                       "reservation to %d threads\n",
788                       master_tid, new_nthreads));
789       }
790     }
791   } else {
792     KMP_ASSERT(0);
793   }
794 
795   // Respect KMP_ALL_THREADS/KMP_DEVICE_THREAD_LIMIT.
796   if (__kmp_nth + new_nthreads -
797           (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
798       __kmp_max_nth) {
799     int tl_nthreads = __kmp_max_nth - __kmp_nth +
800                       (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
801     if (tl_nthreads <= 0) {
802       tl_nthreads = 1;
803     }
804 
805     // If dyn-var is false, emit a 1-time warning.
806     if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
807       __kmp_reserve_warn = 1;
808       __kmp_msg(kmp_ms_warning,
809                 KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads),
810                 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
811     }
812     if (tl_nthreads == 1) {
813       KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT "
814                     "reduced reservation to 1 thread\n",
815                     master_tid));
816       return 1;
817     }
818     KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT reduced "
819                   "reservation to %d threads\n",
820                   master_tid, tl_nthreads));
821     new_nthreads = tl_nthreads;
822   }
823 
824   // Respect OMP_THREAD_LIMIT
825   int cg_nthreads = this_thr->th.th_cg_roots->cg_nthreads;
826   int max_cg_threads = this_thr->th.th_cg_roots->cg_thread_limit;
827   if (cg_nthreads + new_nthreads -
828           (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
829       max_cg_threads) {
830     int tl_nthreads = max_cg_threads - cg_nthreads +
831                       (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
832     if (tl_nthreads <= 0) {
833       tl_nthreads = 1;
834     }
835 
836     // If dyn-var is false, emit a 1-time warning.
837     if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
838       __kmp_reserve_warn = 1;
839       __kmp_msg(kmp_ms_warning,
840                 KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads),
841                 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
842     }
843     if (tl_nthreads == 1) {
844       KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT "
845                     "reduced reservation to 1 thread\n",
846                     master_tid));
847       return 1;
848     }
849     KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT reduced "
850                   "reservation to %d threads\n",
851                   master_tid, tl_nthreads));
852     new_nthreads = tl_nthreads;
853   }
854 
855   // Check if the threads array is large enough, or needs expanding.
856   // See comment in __kmp_register_root() about the adjustment if
857   // __kmp_threads[0] == NULL.
858   capacity = __kmp_threads_capacity;
859   if (TCR_PTR(__kmp_threads[0]) == NULL) {
860     --capacity;
861   }
862   // If it is not for initializing the hidden helper team, we need to take
863   // __kmp_hidden_helper_threads_num out of the capacity because it is included
864   // in __kmp_threads_capacity.
865   if (__kmp_enable_hidden_helper && !TCR_4(__kmp_init_hidden_helper_threads)) {
866     capacity -= __kmp_hidden_helper_threads_num;
867   }
868   if (__kmp_nth + new_nthreads -
869           (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
870       capacity) {
871     // Expand the threads array.
872     int slotsRequired = __kmp_nth + new_nthreads -
873                         (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) -
874                         capacity;
875     int slotsAdded = __kmp_expand_threads(slotsRequired);
876     if (slotsAdded < slotsRequired) {
877       // The threads array was not expanded enough.
878       new_nthreads -= (slotsRequired - slotsAdded);
879       KMP_ASSERT(new_nthreads >= 1);
880 
881       // If dyn-var is false, emit a 1-time warning.
882       if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
883         __kmp_reserve_warn = 1;
884         if (__kmp_tp_cached) {
885           __kmp_msg(kmp_ms_warning,
886                     KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads),
887                     KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity),
888                     KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null);
889         } else {
890           __kmp_msg(kmp_ms_warning,
891                     KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads),
892                     KMP_HNT(SystemLimitOnThreads), __kmp_msg_null);
893         }
894       }
895     }
896   }
897 
898 #ifdef KMP_DEBUG
899   if (new_nthreads == 1) {
900     KC_TRACE(10,
901              ("__kmp_reserve_threads: T#%d serializing team after reclaiming "
902               "dead roots and rechecking; requested %d threads\n",
903               __kmp_get_gtid(), set_nthreads));
904   } else {
905     KC_TRACE(10, ("__kmp_reserve_threads: T#%d allocating %d threads; requested"
906                   " %d threads\n",
907                   __kmp_get_gtid(), new_nthreads, set_nthreads));
908   }
909 #endif // KMP_DEBUG
910   return new_nthreads;
911 }
912 
913 /* Allocate threads from the thread pool and assign them to the new team. We are
914    assured that there are enough threads available, because we checked on that
915    earlier within critical section forkjoin */
916 static void __kmp_fork_team_threads(kmp_root_t *root, kmp_team_t *team,
917                                     kmp_info_t *master_th, int master_gtid,
918                                     int fork_teams_workers) {
919   int i;
920   int use_hot_team;
921 
922   KA_TRACE(10, ("__kmp_fork_team_threads: new_nprocs = %d\n", team->t.t_nproc));
923   KMP_DEBUG_ASSERT(master_gtid == __kmp_get_gtid());
924   KMP_MB();
925 
926   /* first, let's setup the primary thread */
927   master_th->th.th_info.ds.ds_tid = 0;
928   master_th->th.th_team = team;
929   master_th->th.th_team_nproc = team->t.t_nproc;
930   master_th->th.th_team_master = master_th;
931   master_th->th.th_team_serialized = FALSE;
932   master_th->th.th_dispatch = &team->t.t_dispatch[0];
933 
934 /* make sure we are not the optimized hot team */
935 #if KMP_NESTED_HOT_TEAMS
936   use_hot_team = 0;
937   kmp_hot_team_ptr_t *hot_teams = master_th->th.th_hot_teams;
938   if (hot_teams) { // hot teams array is not allocated if
939     // KMP_HOT_TEAMS_MAX_LEVEL=0
940     int level = team->t.t_active_level - 1; // index in array of hot teams
941     if (master_th->th.th_teams_microtask) { // are we inside the teams?
942       if (master_th->th.th_teams_size.nteams > 1) {
943         ++level; // level was not increased in teams construct for
944         // team_of_masters
945       }
946       if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
947           master_th->th.th_teams_level == team->t.t_level) {
948         ++level; // level was not increased in teams construct for
949         // team_of_workers before the parallel
950       } // team->t.t_level will be increased inside parallel
951     }
952     if (level < __kmp_hot_teams_max_level) {
953       if (hot_teams[level].hot_team) {
954         // hot team has already been allocated for given level
955         KMP_DEBUG_ASSERT(hot_teams[level].hot_team == team);
956         use_hot_team = 1; // the team is ready to use
957       } else {
958         use_hot_team = 0; // AC: threads are not allocated yet
959         hot_teams[level].hot_team = team; // remember new hot team
960         hot_teams[level].hot_team_nth = team->t.t_nproc;
961       }
962     } else {
963       use_hot_team = 0;
964     }
965   }
966 #else
967   use_hot_team = team == root->r.r_hot_team;
968 #endif
969   if (!use_hot_team) {
970 
971     /* install the primary thread */
972     team->t.t_threads[0] = master_th;
973     __kmp_initialize_info(master_th, team, 0, master_gtid);
974 
975     /* now, install the worker threads */
976     for (i = 1; i < team->t.t_nproc; i++) {
977 
978       /* fork or reallocate a new thread and install it in team */
979       kmp_info_t *thr = __kmp_allocate_thread(root, team, i);
980       team->t.t_threads[i] = thr;
981       KMP_DEBUG_ASSERT(thr);
982       KMP_DEBUG_ASSERT(thr->th.th_team == team);
983       /* align team and thread arrived states */
984       KA_TRACE(20, ("__kmp_fork_team_threads: T#%d(%d:%d) init arrived "
985                     "T#%d(%d:%d) join =%llu, plain=%llu\n",
986                     __kmp_gtid_from_tid(0, team), team->t.t_id, 0,
987                     __kmp_gtid_from_tid(i, team), team->t.t_id, i,
988                     team->t.t_bar[bs_forkjoin_barrier].b_arrived,
989                     team->t.t_bar[bs_plain_barrier].b_arrived));
990       thr->th.th_teams_microtask = master_th->th.th_teams_microtask;
991       thr->th.th_teams_level = master_th->th.th_teams_level;
992       thr->th.th_teams_size = master_th->th.th_teams_size;
993       { // Initialize threads' barrier data.
994         int b;
995         kmp_balign_t *balign = team->t.t_threads[i]->th.th_bar;
996         for (b = 0; b < bs_last_barrier; ++b) {
997           balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
998           KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
999 #if USE_DEBUGGER
1000           balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
1001 #endif
1002         }
1003       }
1004     }
1005 
1006 #if KMP_AFFINITY_SUPPORTED
1007     // Do not partition the places list for teams construct workers who
1008     // haven't actually been forked to do real work yet. This partitioning
1009     // will take place in the parallel region nested within the teams construct.
1010     if (!fork_teams_workers) {
1011       __kmp_partition_places(team);
1012     }
1013 #endif
1014 
1015     if (team->t.t_nproc > 1 &&
1016         __kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
1017       team->t.b->update_num_threads(team->t.t_nproc);
1018       __kmp_add_threads_to_team(team, team->t.t_nproc);
1019     }
1020   }
1021 
1022   if (__kmp_display_affinity && team->t.t_display_affinity != 1) {
1023     for (i = 0; i < team->t.t_nproc; i++) {
1024       kmp_info_t *thr = team->t.t_threads[i];
1025       if (thr->th.th_prev_num_threads != team->t.t_nproc ||
1026           thr->th.th_prev_level != team->t.t_level) {
1027         team->t.t_display_affinity = 1;
1028         break;
1029       }
1030     }
1031   }
1032 
1033   KMP_MB();
1034 }
1035 
1036 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
1037 // Propagate any changes to the floating point control registers out to the team
1038 // We try to avoid unnecessary writes to the relevant cache line in the team
1039 // structure, so we don't make changes unless they are needed.
1040 inline static void propagateFPControl(kmp_team_t *team) {
1041   if (__kmp_inherit_fp_control) {
1042     kmp_int16 x87_fpu_control_word;
1043     kmp_uint32 mxcsr;
1044 
1045     // Get primary thread's values of FPU control flags (both X87 and vector)
1046     __kmp_store_x87_fpu_control_word(&x87_fpu_control_word);
1047     __kmp_store_mxcsr(&mxcsr);
1048     mxcsr &= KMP_X86_MXCSR_MASK;
1049 
1050     // There is no point looking at t_fp_control_saved here.
1051     // If it is TRUE, we still have to update the values if they are different
1052     // from those we now have. If it is FALSE we didn't save anything yet, but
1053     // our objective is the same. We have to ensure that the values in the team
1054     // are the same as those we have.
1055     // So, this code achieves what we need whether or not t_fp_control_saved is
1056     // true. By checking whether the value needs updating we avoid unnecessary
1057     // writes that would put the cache-line into a written state, causing all
1058     // threads in the team to have to read it again.
1059     KMP_CHECK_UPDATE(team->t.t_x87_fpu_control_word, x87_fpu_control_word);
1060     KMP_CHECK_UPDATE(team->t.t_mxcsr, mxcsr);
1061     // Although we don't use this value, other code in the runtime wants to know
1062     // whether it should restore them. So we must ensure it is correct.
1063     KMP_CHECK_UPDATE(team->t.t_fp_control_saved, TRUE);
1064   } else {
1065     // Similarly here. Don't write to this cache-line in the team structure
1066     // unless we have to.
1067     KMP_CHECK_UPDATE(team->t.t_fp_control_saved, FALSE);
1068   }
1069 }
1070 
1071 // Do the opposite, setting the hardware registers to the updated values from
1072 // the team.
1073 inline static void updateHWFPControl(kmp_team_t *team) {
1074   if (__kmp_inherit_fp_control && team->t.t_fp_control_saved) {
1075     // Only reset the fp control regs if they have been changed in the team.
1076     // the parallel region that we are exiting.
1077     kmp_int16 x87_fpu_control_word;
1078     kmp_uint32 mxcsr;
1079     __kmp_store_x87_fpu_control_word(&x87_fpu_control_word);
1080     __kmp_store_mxcsr(&mxcsr);
1081     mxcsr &= KMP_X86_MXCSR_MASK;
1082 
1083     if (team->t.t_x87_fpu_control_word != x87_fpu_control_word) {
1084       __kmp_clear_x87_fpu_status_word();
1085       __kmp_load_x87_fpu_control_word(&team->t.t_x87_fpu_control_word);
1086     }
1087 
1088     if (team->t.t_mxcsr != mxcsr) {
1089       __kmp_load_mxcsr(&team->t.t_mxcsr);
1090     }
1091   }
1092 }
1093 #else
1094 #define propagateFPControl(x) ((void)0)
1095 #define updateHWFPControl(x) ((void)0)
1096 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
1097 
1098 static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team,
1099                                      int realloc); // forward declaration
1100 
1101 /* Run a parallel region that has been serialized, so runs only in a team of the
1102    single primary thread. */
1103 void __kmp_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
1104   kmp_info_t *this_thr;
1105   kmp_team_t *serial_team;
1106 
1107   KC_TRACE(10, ("__kmpc_serialized_parallel: called by T#%d\n", global_tid));
1108 
1109   /* Skip all this code for autopar serialized loops since it results in
1110      unacceptable overhead */
1111   if (loc != NULL && (loc->flags & KMP_IDENT_AUTOPAR))
1112     return;
1113 
1114   if (!TCR_4(__kmp_init_parallel))
1115     __kmp_parallel_initialize();
1116   __kmp_resume_if_soft_paused();
1117 
1118   this_thr = __kmp_threads[global_tid];
1119   serial_team = this_thr->th.th_serial_team;
1120 
1121   /* utilize the serialized team held by this thread */
1122   KMP_DEBUG_ASSERT(serial_team);
1123   KMP_MB();
1124 
1125   if (__kmp_tasking_mode != tskm_immediate_exec) {
1126     KMP_DEBUG_ASSERT(
1127         this_thr->th.th_task_team ==
1128         this_thr->th.th_team->t.t_task_team[this_thr->th.th_task_state]);
1129     KMP_DEBUG_ASSERT(serial_team->t.t_task_team[this_thr->th.th_task_state] ==
1130                      NULL);
1131     KA_TRACE(20, ("__kmpc_serialized_parallel: T#%d pushing task_team %p / "
1132                   "team %p, new task_team = NULL\n",
1133                   global_tid, this_thr->th.th_task_team, this_thr->th.th_team));
1134     this_thr->th.th_task_team = NULL;
1135   }
1136 
1137   kmp_proc_bind_t proc_bind = this_thr->th.th_set_proc_bind;
1138   if (this_thr->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
1139     proc_bind = proc_bind_false;
1140   } else if (proc_bind == proc_bind_default) {
1141     // No proc_bind clause was specified, so use the current value
1142     // of proc-bind-var for this parallel region.
1143     proc_bind = this_thr->th.th_current_task->td_icvs.proc_bind;
1144   }
1145   // Reset for next parallel region
1146   this_thr->th.th_set_proc_bind = proc_bind_default;
1147 
1148 #if OMPT_SUPPORT
1149   ompt_data_t ompt_parallel_data = ompt_data_none;
1150   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(global_tid);
1151   if (ompt_enabled.enabled &&
1152       this_thr->th.ompt_thread_info.state != ompt_state_overhead) {
1153 
1154     ompt_task_info_t *parent_task_info;
1155     parent_task_info = OMPT_CUR_TASK_INFO(this_thr);
1156 
1157     parent_task_info->frame.enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1158     if (ompt_enabled.ompt_callback_parallel_begin) {
1159       int team_size = 1;
1160 
1161       ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)(
1162           &(parent_task_info->task_data), &(parent_task_info->frame),
1163           &ompt_parallel_data, team_size,
1164           ompt_parallel_invoker_program | ompt_parallel_team, codeptr);
1165     }
1166   }
1167 #endif // OMPT_SUPPORT
1168 
1169   if (this_thr->th.th_team != serial_team) {
1170     // Nested level will be an index in the nested nthreads array
1171     int level = this_thr->th.th_team->t.t_level;
1172 
1173     if (serial_team->t.t_serialized) {
1174       /* this serial team was already used
1175          TODO increase performance by making this locks more specific */
1176       kmp_team_t *new_team;
1177 
1178       __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
1179 
1180       new_team =
1181           __kmp_allocate_team(this_thr->th.th_root, 1, 1,
1182 #if OMPT_SUPPORT
1183                               ompt_parallel_data,
1184 #endif
1185                               proc_bind, &this_thr->th.th_current_task->td_icvs,
1186                               0 USE_NESTED_HOT_ARG(NULL));
1187       __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
1188       KMP_ASSERT(new_team);
1189 
1190       /* setup new serialized team and install it */
1191       new_team->t.t_threads[0] = this_thr;
1192       new_team->t.t_parent = this_thr->th.th_team;
1193       serial_team = new_team;
1194       this_thr->th.th_serial_team = serial_team;
1195 
1196       KF_TRACE(
1197           10,
1198           ("__kmpc_serialized_parallel: T#%d allocated new serial team %p\n",
1199            global_tid, serial_team));
1200 
1201       /* TODO the above breaks the requirement that if we run out of resources,
1202          then we can still guarantee that serialized teams are ok, since we may
1203          need to allocate a new one */
1204     } else {
1205       KF_TRACE(
1206           10,
1207           ("__kmpc_serialized_parallel: T#%d reusing cached serial team %p\n",
1208            global_tid, serial_team));
1209     }
1210 
1211     /* we have to initialize this serial team */
1212     KMP_DEBUG_ASSERT(serial_team->t.t_threads);
1213     KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
1214     KMP_DEBUG_ASSERT(this_thr->th.th_team != serial_team);
1215     serial_team->t.t_ident = loc;
1216     serial_team->t.t_serialized = 1;
1217     serial_team->t.t_nproc = 1;
1218     serial_team->t.t_parent = this_thr->th.th_team;
1219     serial_team->t.t_sched.sched = this_thr->th.th_team->t.t_sched.sched;
1220     this_thr->th.th_team = serial_team;
1221     serial_team->t.t_master_tid = this_thr->th.th_info.ds.ds_tid;
1222 
1223     KF_TRACE(10, ("__kmpc_serialized_parallel: T#%d curtask=%p\n", global_tid,
1224                   this_thr->th.th_current_task));
1225     KMP_ASSERT(this_thr->th.th_current_task->td_flags.executing == 1);
1226     this_thr->th.th_current_task->td_flags.executing = 0;
1227 
1228     __kmp_push_current_task_to_thread(this_thr, serial_team, 0);
1229 
1230     /* TODO: GEH: do ICVs work for nested serialized teams? Don't we need an
1231        implicit task for each serialized task represented by
1232        team->t.t_serialized? */
1233     copy_icvs(&this_thr->th.th_current_task->td_icvs,
1234               &this_thr->th.th_current_task->td_parent->td_icvs);
1235 
1236     // Thread value exists in the nested nthreads array for the next nested
1237     // level
1238     if (__kmp_nested_nth.used && (level + 1 < __kmp_nested_nth.used)) {
1239       this_thr->th.th_current_task->td_icvs.nproc =
1240           __kmp_nested_nth.nth[level + 1];
1241     }
1242 
1243     if (__kmp_nested_proc_bind.used &&
1244         (level + 1 < __kmp_nested_proc_bind.used)) {
1245       this_thr->th.th_current_task->td_icvs.proc_bind =
1246           __kmp_nested_proc_bind.bind_types[level + 1];
1247     }
1248 
1249 #if USE_DEBUGGER
1250     serial_team->t.t_pkfn = (microtask_t)(~0); // For the debugger.
1251 #endif
1252     this_thr->th.th_info.ds.ds_tid = 0;
1253 
1254     /* set thread cache values */
1255     this_thr->th.th_team_nproc = 1;
1256     this_thr->th.th_team_master = this_thr;
1257     this_thr->th.th_team_serialized = 1;
1258 
1259     serial_team->t.t_level = serial_team->t.t_parent->t.t_level + 1;
1260     serial_team->t.t_active_level = serial_team->t.t_parent->t.t_active_level;
1261     serial_team->t.t_def_allocator = this_thr->th.th_def_allocator; // save
1262 
1263     propagateFPControl(serial_team);
1264 
1265     /* check if we need to allocate dispatch buffers stack */
1266     KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
1267     if (!serial_team->t.t_dispatch->th_disp_buffer) {
1268       serial_team->t.t_dispatch->th_disp_buffer =
1269           (dispatch_private_info_t *)__kmp_allocate(
1270               sizeof(dispatch_private_info_t));
1271     }
1272     this_thr->th.th_dispatch = serial_team->t.t_dispatch;
1273 
1274     KMP_MB();
1275 
1276   } else {
1277     /* this serialized team is already being used,
1278      * that's fine, just add another nested level */
1279     KMP_DEBUG_ASSERT(this_thr->th.th_team == serial_team);
1280     KMP_DEBUG_ASSERT(serial_team->t.t_threads);
1281     KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
1282     ++serial_team->t.t_serialized;
1283     this_thr->th.th_team_serialized = serial_team->t.t_serialized;
1284 
1285     // Nested level will be an index in the nested nthreads array
1286     int level = this_thr->th.th_team->t.t_level;
1287     // Thread value exists in the nested nthreads array for the next nested
1288     // level
1289     if (__kmp_nested_nth.used && (level + 1 < __kmp_nested_nth.used)) {
1290       this_thr->th.th_current_task->td_icvs.nproc =
1291           __kmp_nested_nth.nth[level + 1];
1292     }
1293     serial_team->t.t_level++;
1294     KF_TRACE(10, ("__kmpc_serialized_parallel: T#%d increasing nesting level "
1295                   "of serial team %p to %d\n",
1296                   global_tid, serial_team, serial_team->t.t_level));
1297 
1298     /* allocate/push dispatch buffers stack */
1299     KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
1300     {
1301       dispatch_private_info_t *disp_buffer =
1302           (dispatch_private_info_t *)__kmp_allocate(
1303               sizeof(dispatch_private_info_t));
1304       disp_buffer->next = serial_team->t.t_dispatch->th_disp_buffer;
1305       serial_team->t.t_dispatch->th_disp_buffer = disp_buffer;
1306     }
1307     this_thr->th.th_dispatch = serial_team->t.t_dispatch;
1308 
1309     KMP_MB();
1310   }
1311   KMP_CHECK_UPDATE(serial_team->t.t_cancel_request, cancel_noreq);
1312 
1313   // Perform the display affinity functionality for
1314   // serialized parallel regions
1315   if (__kmp_display_affinity) {
1316     if (this_thr->th.th_prev_level != serial_team->t.t_level ||
1317         this_thr->th.th_prev_num_threads != 1) {
1318       // NULL means use the affinity-format-var ICV
1319       __kmp_aux_display_affinity(global_tid, NULL);
1320       this_thr->th.th_prev_level = serial_team->t.t_level;
1321       this_thr->th.th_prev_num_threads = 1;
1322     }
1323   }
1324 
1325   if (__kmp_env_consistency_check)
1326     __kmp_push_parallel(global_tid, NULL);
1327 #if OMPT_SUPPORT
1328   serial_team->t.ompt_team_info.master_return_address = codeptr;
1329   if (ompt_enabled.enabled &&
1330       this_thr->th.ompt_thread_info.state != ompt_state_overhead) {
1331     OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame.ptr =
1332         OMPT_GET_FRAME_ADDRESS(0);
1333 
1334     ompt_lw_taskteam_t lw_taskteam;
1335     __ompt_lw_taskteam_init(&lw_taskteam, this_thr, global_tid,
1336                             &ompt_parallel_data, codeptr);
1337 
1338     __ompt_lw_taskteam_link(&lw_taskteam, this_thr, 1);
1339     // don't use lw_taskteam after linking. content was swaped
1340 
1341     /* OMPT implicit task begin */
1342     if (ompt_enabled.ompt_callback_implicit_task) {
1343       ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1344           ompt_scope_begin, OMPT_CUR_TEAM_DATA(this_thr),
1345           OMPT_CUR_TASK_DATA(this_thr), 1, __kmp_tid_from_gtid(global_tid),
1346           ompt_task_implicit); // TODO: Can this be ompt_task_initial?
1347       OMPT_CUR_TASK_INFO(this_thr)->thread_num =
1348           __kmp_tid_from_gtid(global_tid);
1349     }
1350 
1351     /* OMPT state */
1352     this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
1353     OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame.ptr =
1354         OMPT_GET_FRAME_ADDRESS(0);
1355   }
1356 #endif
1357 }
1358 
1359 // Test if this fork is for a team closely nested in a teams construct
1360 static inline bool __kmp_is_fork_in_teams(kmp_info_t *master_th,
1361                                           microtask_t microtask, int level,
1362                                           int teams_level, kmp_va_list ap) {
1363   return (master_th->th.th_teams_microtask && ap &&
1364           microtask != (microtask_t)__kmp_teams_master && level == teams_level);
1365 }
1366 
1367 // Test if this fork is for the teams construct, i.e. to form the outer league
1368 // of teams
1369 static inline bool __kmp_is_entering_teams(int active_level, int level,
1370                                            int teams_level, kmp_va_list ap) {
1371   return ((ap == NULL && active_level == 0) ||
1372           (ap && teams_level > 0 && teams_level == level));
1373 }
1374 
1375 // AC: This is start of parallel that is nested inside teams construct.
1376 // The team is actual (hot), all workers are ready at the fork barrier.
1377 // No lock needed to initialize the team a bit, then free workers.
1378 static inline int
1379 __kmp_fork_in_teams(ident_t *loc, int gtid, kmp_team_t *parent_team,
1380                     kmp_int32 argc, kmp_info_t *master_th, kmp_root_t *root,
1381                     enum fork_context_e call_context, microtask_t microtask,
1382                     launch_t invoker, int master_set_numthreads, int level,
1383 #if OMPT_SUPPORT
1384                     ompt_data_t ompt_parallel_data, void *return_address,
1385 #endif
1386                     kmp_va_list ap) {
1387   void **argv;
1388   int i;
1389 
1390   parent_team->t.t_ident = loc;
1391   __kmp_alloc_argv_entries(argc, parent_team, TRUE);
1392   parent_team->t.t_argc = argc;
1393   argv = (void **)parent_team->t.t_argv;
1394   for (i = argc - 1; i >= 0; --i) {
1395     *argv++ = va_arg(kmp_va_deref(ap), void *);
1396   }
1397   // Increment our nested depth levels, but not increase the serialization
1398   if (parent_team == master_th->th.th_serial_team) {
1399     // AC: we are in serialized parallel
1400     __kmpc_serialized_parallel(loc, gtid);
1401     KMP_DEBUG_ASSERT(parent_team->t.t_serialized > 1);
1402 
1403     if (call_context == fork_context_gnu) {
1404       // AC: need to decrement t_serialized for enquiry functions to work
1405       // correctly, will restore at join time
1406       parent_team->t.t_serialized--;
1407       return TRUE;
1408     }
1409 
1410 #if OMPD_SUPPORT
1411     parent_team->t.t_pkfn = microtask;
1412 #endif
1413 
1414 #if OMPT_SUPPORT
1415     void *dummy;
1416     void **exit_frame_p;
1417     ompt_data_t *implicit_task_data;
1418     ompt_lw_taskteam_t lw_taskteam;
1419 
1420     if (ompt_enabled.enabled) {
1421       __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1422                               &ompt_parallel_data, return_address);
1423       exit_frame_p = &(lw_taskteam.ompt_task_info.frame.exit_frame.ptr);
1424 
1425       __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
1426       // Don't use lw_taskteam after linking. Content was swapped.
1427 
1428       /* OMPT implicit task begin */
1429       implicit_task_data = OMPT_CUR_TASK_DATA(master_th);
1430       if (ompt_enabled.ompt_callback_implicit_task) {
1431         OMPT_CUR_TASK_INFO(master_th)->thread_num = __kmp_tid_from_gtid(gtid);
1432         ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1433             ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th), implicit_task_data,
1434             1, OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
1435       }
1436 
1437       /* OMPT state */
1438       master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1439     } else {
1440       exit_frame_p = &dummy;
1441     }
1442 #endif
1443 
1444     // AC: need to decrement t_serialized for enquiry functions to work
1445     // correctly, will restore at join time
1446     parent_team->t.t_serialized--;
1447 
1448     {
1449       KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1450       KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1451       __kmp_invoke_microtask(microtask, gtid, 0, argc, parent_team->t.t_argv
1452 #if OMPT_SUPPORT
1453                              ,
1454                              exit_frame_p
1455 #endif
1456                              );
1457     }
1458 
1459 #if OMPT_SUPPORT
1460     if (ompt_enabled.enabled) {
1461       *exit_frame_p = NULL;
1462       OMPT_CUR_TASK_INFO(master_th)->frame.exit_frame = ompt_data_none;
1463       if (ompt_enabled.ompt_callback_implicit_task) {
1464         ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1465             ompt_scope_end, NULL, implicit_task_data, 1,
1466             OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
1467       }
1468       ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
1469       __ompt_lw_taskteam_unlink(master_th);
1470       if (ompt_enabled.ompt_callback_parallel_end) {
1471         ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1472             &ompt_parallel_data, OMPT_CUR_TASK_DATA(master_th),
1473             OMPT_INVOKER(call_context) | ompt_parallel_team, return_address);
1474       }
1475       master_th->th.ompt_thread_info.state = ompt_state_overhead;
1476     }
1477 #endif
1478     return TRUE;
1479   }
1480 
1481   parent_team->t.t_pkfn = microtask;
1482   parent_team->t.t_invoke = invoker;
1483   KMP_ATOMIC_INC(&root->r.r_in_parallel);
1484   parent_team->t.t_active_level++;
1485   parent_team->t.t_level++;
1486   parent_team->t.t_def_allocator = master_th->th.th_def_allocator; // save
1487 
1488   // If the threads allocated to the team are less than the thread limit, update
1489   // the thread limit here. th_teams_size.nth is specific to this team nested
1490   // in a teams construct, the team is fully created, and we're about to do
1491   // the actual fork. Best to do this here so that the subsequent uses below
1492   // and in the join have the correct value.
1493   master_th->th.th_teams_size.nth = parent_team->t.t_nproc;
1494 
1495 #if OMPT_SUPPORT
1496   if (ompt_enabled.enabled) {
1497     ompt_lw_taskteam_t lw_taskteam;
1498     __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid, &ompt_parallel_data,
1499                             return_address);
1500     __ompt_lw_taskteam_link(&lw_taskteam, master_th, 1, true);
1501   }
1502 #endif
1503 
1504   /* Change number of threads in the team if requested */
1505   if (master_set_numthreads) { // The parallel has num_threads clause
1506     if (master_set_numthreads <= master_th->th.th_teams_size.nth) {
1507       // AC: only can reduce number of threads dynamically, can't increase
1508       kmp_info_t **other_threads = parent_team->t.t_threads;
1509       // NOTE: if using distributed barrier, we need to run this code block
1510       // even when the team size appears not to have changed from the max.
1511       int old_proc = master_th->th.th_teams_size.nth;
1512       if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
1513         __kmp_resize_dist_barrier(parent_team, old_proc, master_set_numthreads);
1514         __kmp_add_threads_to_team(parent_team, master_set_numthreads);
1515       }
1516       parent_team->t.t_nproc = master_set_numthreads;
1517       for (i = 0; i < master_set_numthreads; ++i) {
1518         other_threads[i]->th.th_team_nproc = master_set_numthreads;
1519       }
1520     }
1521     // Keep extra threads hot in the team for possible next parallels
1522     master_th->th.th_set_nproc = 0;
1523   }
1524 
1525 #if USE_DEBUGGER
1526   if (__kmp_debugging) { // Let debugger override number of threads.
1527     int nth = __kmp_omp_num_threads(loc);
1528     if (nth > 0) { // 0 means debugger doesn't want to change num threads
1529       master_set_numthreads = nth;
1530     }
1531   }
1532 #endif
1533 
1534   // Figure out the proc_bind policy for the nested parallel within teams
1535   kmp_proc_bind_t proc_bind = master_th->th.th_set_proc_bind;
1536   // proc_bind_default means don't update
1537   kmp_proc_bind_t proc_bind_icv = proc_bind_default;
1538   if (master_th->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
1539     proc_bind = proc_bind_false;
1540   } else {
1541     // No proc_bind clause specified; use current proc-bind-var
1542     if (proc_bind == proc_bind_default) {
1543       proc_bind = master_th->th.th_current_task->td_icvs.proc_bind;
1544     }
1545     /* else: The proc_bind policy was specified explicitly on parallel clause.
1546        This overrides proc-bind-var for this parallel region, but does not
1547        change proc-bind-var. */
1548     // Figure the value of proc-bind-var for the child threads.
1549     if ((level + 1 < __kmp_nested_proc_bind.used) &&
1550         (__kmp_nested_proc_bind.bind_types[level + 1] !=
1551          master_th->th.th_current_task->td_icvs.proc_bind)) {
1552       proc_bind_icv = __kmp_nested_proc_bind.bind_types[level + 1];
1553     }
1554   }
1555   KMP_CHECK_UPDATE(parent_team->t.t_proc_bind, proc_bind);
1556   // Need to change the bind-var ICV to correct value for each implicit task
1557   if (proc_bind_icv != proc_bind_default &&
1558       master_th->th.th_current_task->td_icvs.proc_bind != proc_bind_icv) {
1559     kmp_info_t **other_threads = parent_team->t.t_threads;
1560     for (i = 0; i < master_th->th.th_team_nproc; ++i) {
1561       other_threads[i]->th.th_current_task->td_icvs.proc_bind = proc_bind_icv;
1562     }
1563   }
1564   // Reset for next parallel region
1565   master_th->th.th_set_proc_bind = proc_bind_default;
1566 
1567 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1568   if (((__itt_frame_submit_v3_ptr && __itt_get_timestamp_ptr) ||
1569        KMP_ITT_DEBUG) &&
1570       __kmp_forkjoin_frames_mode == 3 &&
1571       parent_team->t.t_active_level == 1 // only report frames at level 1
1572       && master_th->th.th_teams_size.nteams == 1) {
1573     kmp_uint64 tmp_time = __itt_get_timestamp();
1574     master_th->th.th_frame_time = tmp_time;
1575     parent_team->t.t_region_time = tmp_time;
1576   }
1577   if (__itt_stack_caller_create_ptr) {
1578     KMP_DEBUG_ASSERT(parent_team->t.t_stack_id == NULL);
1579     // create new stack stitching id before entering fork barrier
1580     parent_team->t.t_stack_id = __kmp_itt_stack_caller_create();
1581   }
1582 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
1583 #if KMP_AFFINITY_SUPPORTED
1584   __kmp_partition_places(parent_team);
1585 #endif
1586 
1587   KF_TRACE(10, ("__kmp_fork_in_teams: before internal fork: root=%p, team=%p, "
1588                 "master_th=%p, gtid=%d\n",
1589                 root, parent_team, master_th, gtid));
1590   __kmp_internal_fork(loc, gtid, parent_team);
1591   KF_TRACE(10, ("__kmp_fork_in_teams: after internal fork: root=%p, team=%p, "
1592                 "master_th=%p, gtid=%d\n",
1593                 root, parent_team, master_th, gtid));
1594 
1595   if (call_context == fork_context_gnu)
1596     return TRUE;
1597 
1598   /* Invoke microtask for PRIMARY thread */
1599   KA_TRACE(20, ("__kmp_fork_in_teams: T#%d(%d:0) invoke microtask = %p\n", gtid,
1600                 parent_team->t.t_id, parent_team->t.t_pkfn));
1601 
1602   if (!parent_team->t.t_invoke(gtid)) {
1603     KMP_ASSERT2(0, "cannot invoke microtask for PRIMARY thread");
1604   }
1605   KA_TRACE(20, ("__kmp_fork_in_teams: T#%d(%d:0) done microtask = %p\n", gtid,
1606                 parent_team->t.t_id, parent_team->t.t_pkfn));
1607   KMP_MB(); /* Flush all pending memory write invalidates.  */
1608 
1609   KA_TRACE(20, ("__kmp_fork_in_teams: parallel exit T#%d\n", gtid));
1610 
1611   return TRUE;
1612 }
1613 
1614 // Create a serialized parallel region
     //
     // Calls __kmpc_serialized_parallel(loc, gtid) and then, depending on
     // call_context, runs the region body on the calling thread:
     //  - fork_context_intel, ap is null: invokes microtask with the argument
     //    vector of parent_team (t_argv).
     //  - fork_context_intel, microtask is __kmp_teams_master: stores invoker and
     //    the arguments in the thread's current team and calls invoker(gtid).
     //  - fork_context_intel, otherwise: reads argc pointers from ap into a local
     //    array and invokes microtask with them.
     //  - fork_context_gnu: nothing is invoked here; with OMPT enabled a
     //    lightweight task team is linked before returning.
     //  - any other value: asserts that call_context < fork_context_last.
     // When OMPT_SUPPORT is compiled in and ompt_enabled.enabled is set, the
     // Intel paths also raise the implicit-task and parallel-end callbacks
     // around the invocation.
     //
     // Parameters:
     //   loc           source location; forwarded to __kmpc_serialized_parallel
     //                 and, on the Intel path, stored as the serial team's t_ident
     //   gtid          thread id forwarded to the runtime calls and the microtask
     //   call_context  selects which of the paths above is taken
     //   argc          number of pointer arguments for the microtask
     //   microtask     outlined region body
     //   invoker       launch routine; used only on the __kmp_teams_master path
     //   master_th     descriptor of the calling thread
     //   parent_team   team whose t_argv supplies the arguments when ap is null
     //   ompt_parallel_data, return_address, parent_task_data
     //                 (OMPT_SUPPORT only) data handed to the OMPT callbacks
     //   ap            variadic argument list; may be null
     //
     // Returns FALSE on every return path of this function.
1615 static inline int
1616 __kmp_serial_fork_call(ident_t *loc, int gtid, enum fork_context_e call_context,
1617                        kmp_int32 argc, microtask_t microtask, launch_t invoker,
1618                        kmp_info_t *master_th, kmp_team_t *parent_team,
1619 #if OMPT_SUPPORT
1620                        ompt_data_t *ompt_parallel_data, void **return_address,
1621                        ompt_data_t **parent_task_data,
1622 #endif
1623                        kmp_va_list ap) {
1624   kmp_team_t *team;
1625   int i;
1626   void **argv;
1627 
       // Scratch storage for argc argument pointers: a variable-length array on
       // the listed Linux architectures, KMP_ALLOCA elsewhere. Only the last
       // fork_context_intel branch below reads or writes it.
1628 /* josh todo: hypothetical question: what do we do for OS X*? */
1629 #if KMP_OS_LINUX &&                                                            \
1630     (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
1631   void *args[argc];
1632 #else
1633   void **args = (void **)KMP_ALLOCA(argc * sizeof(void *));
1634 #endif /* KMP_OS_LINUX && ( KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || \
1635           KMP_ARCH_AARCH64) */
1636 
1637   KA_TRACE(
1638       20, ("__kmp_serial_fork_call: T#%d serializing parallel region\n", gtid));
1639 
       // NOTE(review): per the "revert" comments further down, this call changes
       // the serial team's t_level; its full effect is not visible in this chunk.
1640   __kmpc_serialized_parallel(loc, gtid);
1641 
1642 #if OMPD_SUPPORT
1643   master_th->th.th_serial_team->t.t_pkfn = microtask;
1644 #endif
1645 
1646   if (call_context == fork_context_intel) {
1647     /* TODO this sucks, use the compiler itself to pass args! :) */
1648     master_th->th.th_serial_team->t.t_ident = loc;
1649     if (!ap) {
           // No argument list was supplied: the microtask is run with the
           // argument vector already stored in parent_team.
1650       // revert change made in __kmpc_serialized_parallel()
1651       master_th->th.th_serial_team->t.t_level--;
1652 // Get args from parent team for teams construct
1653 
1654 #if OMPT_SUPPORT
1655       void *dummy;
1656       void **exit_frame_p;
1657       ompt_task_info_t *task_info;
1658       ompt_lw_taskteam_t lw_taskteam;
1659 
           // exit_frame_p ends up pointing either at the current task's exit-frame
           // slot (OMPT enabled) or at the throwaway local 'dummy' (OMPT
           // disabled), so the invoke call below always receives an initialized
           // pointer.
1660       if (ompt_enabled.enabled) {
1661         __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1662                                 ompt_parallel_data, *return_address);
1663 
1664         __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
1665         // don't use lw_taskteam after linking. content was swapped
1666         task_info = OMPT_CUR_TASK_INFO(master_th);
1667         exit_frame_p = &(task_info->frame.exit_frame.ptr);
1668         if (ompt_enabled.ompt_callback_implicit_task) {
               // thread_num is recorded first and then passed to the
               // implicit-task begin callback.
1669           OMPT_CUR_TASK_INFO(master_th)->thread_num = __kmp_tid_from_gtid(gtid);
1670           ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1671               ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1672               &(task_info->task_data), 1,
1673               OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
1674         }
1675 
1676         /* OMPT state */
1677         master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1678       } else {
1679         exit_frame_p = &dummy;
1680       }
1681 #endif
1682 
           // Run the region body on this thread with the parent's arguments.
1683       {
1684         KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1685         KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1686         __kmp_invoke_microtask(microtask, gtid, 0, argc, parent_team->t.t_argv
1687 #if OMPT_SUPPORT
1688                                ,
1689                                exit_frame_p
1690 #endif
1691                                );
1692       }
1693 
1694 #if OMPT_SUPPORT
           // After the body: clear the exit frame, report implicit-task end, copy
           // the current team data into the caller's ompt_parallel_data before
           // the lightweight task team is unlinked, then report parallel end.
1695       if (ompt_enabled.enabled) {
1696         *exit_frame_p = NULL;
1697         if (ompt_enabled.ompt_callback_implicit_task) {
1698           ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1699               ompt_scope_end, NULL, &(task_info->task_data), 1,
1700               OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
1701         }
1702         *ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
1703         __ompt_lw_taskteam_unlink(master_th);
1704         if (ompt_enabled.ompt_callback_parallel_end) {
1705           ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1706               ompt_parallel_data, *parent_task_data,
1707               OMPT_INVOKER(call_context) | ompt_parallel_team, *return_address);
1708         }
1709         master_th->th.ompt_thread_info.state = ompt_state_overhead;
1710       }
1711 #endif
1712     } else if (microtask == (microtask_t)__kmp_teams_master) {
           // The microtask is the teams-master routine: the arguments are stored
           // in the thread's current team (asserted in debug builds to be its
           // serial team) and the region is started through invoker, not
           // through __kmp_invoke_microtask.
1713       KMP_DEBUG_ASSERT(master_th->th.th_team == master_th->th.th_serial_team);
1714       team = master_th->th.th_team;
1715       // team->t.t_pkfn = microtask;
1716       team->t.t_invoke = invoker;
1717       __kmp_alloc_argv_entries(argc, team, TRUE);
1718       team->t.t_argc = argc;
1719       argv = (void **)team->t.t_argv;
1720       if (ap) {
             // i only counts argc iterations; the destination advances forward
             // through argv, so arguments are stored in the order va_arg yields
             // them.
1721         for (i = argc - 1; i >= 0; --i)
1722           *argv++ = va_arg(kmp_va_deref(ap), void *);
1723       } else {
1724         for (i = 0; i < argc; ++i)
1725           // Get args from parent team for teams construct
1726           argv[i] = parent_team->t.t_argv[i];
1727       }
1728       // AC: revert change made in __kmpc_serialized_parallel()
1729       //     because initial code in teams should have level=0
1730       team->t.t_level--;
1731       // AC: call special invoker for outer "parallel" of teams construct
1732       invoker(gtid);
1733 #if OMPT_SUPPORT
           // On this path only the end callbacks are raised here, tagged
           // ompt_task_initial / ompt_parallel_league.
1734       if (ompt_enabled.enabled) {
1735         ompt_task_info_t *task_info = OMPT_CUR_TASK_INFO(master_th);
1736         if (ompt_enabled.ompt_callback_implicit_task) {
1737           ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1738               ompt_scope_end, NULL, &(task_info->task_data), 0,
1739               OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_initial);
1740         }
1741         if (ompt_enabled.ompt_callback_parallel_end) {
1742           ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1743               ompt_parallel_data, *parent_task_data,
1744               OMPT_INVOKER(call_context) | ompt_parallel_league,
1745               *return_address);
1746         }
1747         master_th->th.ompt_thread_info.state = ompt_state_overhead;
1748       }
1749 #endif
1750     } else {
           // Ordinary serialized region: copy the argc pointers from ap into the
           // local 'args' array (forward order, i is only a counter) and invoke
           // the microtask with that array.
1751       argv = args;
1752       for (i = argc - 1; i >= 0; --i)
1753         *argv++ = va_arg(kmp_va_deref(ap), void *);
1754       KMP_MB();
1755 
1756 #if OMPT_SUPPORT
1757       void *dummy;
1758       void **exit_frame_p;
1759       ompt_task_info_t *task_info;
1760       ompt_lw_taskteam_t lw_taskteam;
1761       ompt_data_t *implicit_task_data;
1762 
           // Same exit_frame_p arrangement as in the !ap branch: the task's
           // exit-frame slot when OMPT is enabled, 'dummy' otherwise.
1763       if (ompt_enabled.enabled) {
1764         __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1765                                 ompt_parallel_data, *return_address);
1766         __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
1767         // don't use lw_taskteam after linking. content was swapped
1768         task_info = OMPT_CUR_TASK_INFO(master_th);
1769         exit_frame_p = &(task_info->frame.exit_frame.ptr);
1770 
1771         /* OMPT implicit task begin */
1772         implicit_task_data = OMPT_CUR_TASK_DATA(master_th);
1773         if (ompt_enabled.ompt_callback_implicit_task) {
               // Here thread_num is stored after the begin callback, whereas the
               // !ap branch stores it before; the callback itself is given
               // __kmp_tid_from_gtid(gtid) directly.
1774           ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1775               ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1776               implicit_task_data, 1, __kmp_tid_from_gtid(gtid),
1777               ompt_task_implicit);
1778           OMPT_CUR_TASK_INFO(master_th)->thread_num = __kmp_tid_from_gtid(gtid);
1779         }
1780 
1781         /* OMPT state */
1782         master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1783       } else {
1784         exit_frame_p = &dummy;
1785       }
1786 #endif
1787 
           // Run the region body on this thread with the locally gathered args.
1788       {
1789         KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1790         KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1791         __kmp_invoke_microtask(microtask, gtid, 0, argc, args
1792 #if OMPT_SUPPORT
1793                                ,
1794                                exit_frame_p
1795 #endif
1796                                );
1797       }
1798 
1799 #if OMPT_SUPPORT
           // Same teardown order as the !ap branch: clear exit frame, implicit
           // task end, copy team data out, unlink, parallel end.
1800       if (ompt_enabled.enabled) {
1801         *exit_frame_p = NULL;
1802         if (ompt_enabled.ompt_callback_implicit_task) {
1803           ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1804               ompt_scope_end, NULL, &(task_info->task_data), 1,
1805               OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
1806         }
1807 
1808         *ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
1809         __ompt_lw_taskteam_unlink(master_th);
1810         if (ompt_enabled.ompt_callback_parallel_end) {
1811           ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1812               ompt_parallel_data, *parent_task_data,
1813               OMPT_INVOKER(call_context) | ompt_parallel_team, *return_address);
1814         }
1815         master_th->th.ompt_thread_info.state = ompt_state_overhead;
1816       }
1817 #endif
1818     }
1819   } else if (call_context == fork_context_gnu) {
         // GNU entry point: no microtask is invoked in this function. With OMPT
         // enabled, a lightweight task team with an empty exit frame is linked.
         // NOTE(review): the last argument to __ompt_lw_taskteam_link is 1 here
         // but 0 on the Intel paths; its meaning is not visible in this chunk.
1820 #if OMPT_SUPPORT
1821     if (ompt_enabled.enabled) {
1822       ompt_lw_taskteam_t lwt;
1823       __ompt_lw_taskteam_init(&lwt, master_th, gtid, ompt_parallel_data,
1824                               *return_address);
1825 
1826       lwt.ompt_task_info.frame.exit_frame = ompt_data_none;
1827       __ompt_lw_taskteam_link(&lwt, master_th, 1);
1828     }
1829 // don't use lw_taskteam after linking. content was swapped
1830 #endif
1831 
1832     // we were called from GNU native code
1833     KA_TRACE(20, ("__kmp_serial_fork_call: T#%d serial exit\n", gtid));
1834     return FALSE;
1835   } else {
         // Neither Intel nor GNU: only check that the value is below
         // fork_context_last.
1836     KMP_ASSERT2(call_context < fork_context_last,
1837                 "__kmp_serial_fork_call: unknown fork_context parameter");
1838   }
1839 
1840   KA_TRACE(20, ("__kmp_serial_fork_call: T#%d serial exit\n", gtid));
1841   KMP_MB();
1842   return FALSE;
1843 }
1844 
1845 /* most of the work for a fork */
1846 /* return true if we really went parallel, false if serialized */
1847 int __kmp_fork_call(ident_t *loc, int gtid,
1848                     enum fork_context_e call_context, // Intel, GNU, ...
1849                     kmp_int32 argc, microtask_t microtask, launch_t invoker,
1850                     kmp_va_list ap) {
1851   void **argv;
1852   int i;
1853   int master_tid;
1854   int master_this_cons;
1855   kmp_team_t *team;
1856   kmp_team_t *parent_team;
1857   kmp_info_t *master_th;
1858   kmp_root_t *root;
1859   int nthreads;
1860   int master_active;
1861   int master_set_numthreads;
1862   int level;
1863   int active_level;
1864   int teams_level;
1865 #if KMP_NESTED_HOT_TEAMS
1866   kmp_hot_team_ptr_t **p_hot_teams;
1867 #endif
1868   { // KMP_TIME_BLOCK
1869     KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_fork_call);
1870     KMP_COUNT_VALUE(OMP_PARALLEL_args, argc);
1871 
1872     KA_TRACE(20, ("__kmp_fork_call: enter T#%d\n", gtid));
1873     if (__kmp_stkpadding > 0 && __kmp_root[gtid] != NULL) {
1874       /* Some systems prefer the stack for the root thread(s) to start with */
1875       /* some gap from the parent stack to prevent false sharing. */
1876       void *dummy = KMP_ALLOCA(__kmp_stkpadding);
1877       /* These 2 lines below are so this does not get optimized out */
1878       if (__kmp_stkpadding > KMP_MAX_STKPADDING)
1879         __kmp_stkpadding += (short)((kmp_int64)dummy);
1880     }
1881 
1882     /* initialize if needed */
1883     KMP_DEBUG_ASSERT(
1884         __kmp_init_serial); // AC: potentially unsafe, not in sync with shutdown
1885     if (!TCR_4(__kmp_init_parallel))
1886       __kmp_parallel_initialize();
1887     __kmp_resume_if_soft_paused();
1888 
1889     /* setup current data */
1890     // AC: potentially unsafe, not in sync with library shutdown,
1891     // __kmp_threads can be freed
1892     master_th = __kmp_threads[gtid];
1893 
1894     parent_team = master_th->th.th_team;
1895     master_tid = master_th->th.th_info.ds.ds_tid;
1896     master_this_cons = master_th->th.th_local.this_construct;
1897     root = master_th->th.th_root;
1898     master_active = root->r.r_active;
1899     master_set_numthreads = master_th->th.th_set_nproc;
1900 
1901 #if OMPT_SUPPORT
1902     ompt_data_t ompt_parallel_data = ompt_data_none;
1903     ompt_data_t *parent_task_data;
1904     ompt_frame_t *ompt_frame;
1905     void *return_address = NULL;
1906 
1907     if (ompt_enabled.enabled) {
1908       __ompt_get_task_info_internal(0, NULL, &parent_task_data, &ompt_frame,
1909                                     NULL, NULL);
1910       return_address = OMPT_LOAD_RETURN_ADDRESS(gtid);
1911     }
1912 #endif
1913 
1914     // Assign affinity to root thread if it hasn't happened yet
1915     __kmp_assign_root_init_mask();
1916 
1917     // Nested level will be an index in the nested nthreads array
1918     level = parent_team->t.t_level;
1919     // used to launch non-serial teams even if nested is not allowed
1920     active_level = parent_team->t.t_active_level;
1921     // needed to check nesting inside the teams
1922     teams_level = master_th->th.th_teams_level;
1923 #if KMP_NESTED_HOT_TEAMS
1924     p_hot_teams = &master_th->th.th_hot_teams;
1925     if (*p_hot_teams == NULL && __kmp_hot_teams_max_level > 0) {
1926       *p_hot_teams = (kmp_hot_team_ptr_t *)__kmp_allocate(
1927           sizeof(kmp_hot_team_ptr_t) * __kmp_hot_teams_max_level);
1928       (*p_hot_teams)[0].hot_team = root->r.r_hot_team;
1929       // it is either actual or not needed (when active_level > 0)
1930       (*p_hot_teams)[0].hot_team_nth = 1;
1931     }
1932 #endif
1933 
1934 #if OMPT_SUPPORT
1935     if (ompt_enabled.enabled) {
1936       if (ompt_enabled.ompt_callback_parallel_begin) {
1937         int team_size = master_set_numthreads
1938                             ? master_set_numthreads
1939                             : get__nproc_2(parent_team, master_tid);
1940         int flags = OMPT_INVOKER(call_context) |
1941                     ((microtask == (microtask_t)__kmp_teams_master)
1942                          ? ompt_parallel_league
1943                          : ompt_parallel_team);
1944         ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)(
1945             parent_task_data, ompt_frame, &ompt_parallel_data, team_size, flags,
1946             return_address);
1947       }
1948       master_th->th.ompt_thread_info.state = ompt_state_overhead;
1949     }
1950 #endif
1951 
1952     master_th->th.th_ident = loc;
1953 
1954     // Parallel closely nested in teams construct:
1955     if (__kmp_is_fork_in_teams(master_th, microtask, level, teams_level, ap)) {
1956       return __kmp_fork_in_teams(loc, gtid, parent_team, argc, master_th, root,
1957                                  call_context, microtask, invoker,
1958                                  master_set_numthreads, level,
1959 #if OMPT_SUPPORT
1960                                  ompt_parallel_data, return_address,
1961 #endif
1962                                  ap);
1963     } // End parallel closely nested in teams construct
1964 
1965 #if KMP_DEBUG
1966     if (__kmp_tasking_mode != tskm_immediate_exec) {
1967       KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
1968                        parent_team->t.t_task_team[master_th->th.th_task_state]);
1969     }
1970 #endif
1971 
1972     // Need this to happen before we determine the number of threads, not while
1973     // we are allocating the team
1974     //__kmp_push_current_task_to_thread(master_th, parent_team, 0);
1975 
1976     // Determine the number of threads
1977     int enter_teams =
1978         __kmp_is_entering_teams(active_level, level, teams_level, ap);
1979     if ((!enter_teams &&
1980          (parent_team->t.t_active_level >=
1981           master_th->th.th_current_task->td_icvs.max_active_levels)) ||
1982         (__kmp_library == library_serial)) {
1983       KC_TRACE(10, ("__kmp_fork_call: T#%d serializing team\n", gtid));
1984       nthreads = 1;
1985     } else {
1986       nthreads = master_set_numthreads
1987                      ? master_set_numthreads
1988                      // TODO: get nproc directly from current task
1989                      : get__nproc_2(parent_team, master_tid);
1990       // Check if we need to take forkjoin lock? (no need for serialized
1991       // parallel out of teams construct).
1992       if (nthreads > 1) {
1993         /* determine how many new threads we can use */
1994         __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
1995         /* AC: If we execute teams from parallel region (on host), then teams
1996            should be created but each can only have 1 thread if nesting is
1997            disabled. If teams called from serial region, then teams and their
1998            threads should be created regardless of the nesting setting. */
1999         nthreads = __kmp_reserve_threads(root, parent_team, master_tid,
2000                                          nthreads, enter_teams);
2001         if (nthreads == 1) {
2002           // Free lock for single thread execution here; for multi-thread
2003           // execution it will be freed later after team of threads created
2004           // and initialized
2005           __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2006         }
2007       }
2008     }
2009     KMP_DEBUG_ASSERT(nthreads > 0);
2010 
2011     // If we temporarily changed the set number of threads then restore it now
2012     master_th->th.th_set_nproc = 0;
2013 
2014     if (nthreads == 1) {
2015       return __kmp_serial_fork_call(loc, gtid, call_context, argc, microtask,
2016                                     invoker, master_th, parent_team,
2017 #if OMPT_SUPPORT
2018                                     &ompt_parallel_data, &return_address,
2019                                     &parent_task_data,
2020 #endif
2021                                     ap);
2022     } // if (nthreads == 1)
2023 
2024     // GEH: only modify the executing flag in the case when not serialized
2025     //      serialized case is handled in kmpc_serialized_parallel
2026     KF_TRACE(10, ("__kmp_fork_call: parent_team_aclevel=%d, master_th=%p, "
2027                   "curtask=%p, curtask_max_aclevel=%d\n",
2028                   parent_team->t.t_active_level, master_th,
2029                   master_th->th.th_current_task,
2030                   master_th->th.th_current_task->td_icvs.max_active_levels));
2031     // TODO: GEH - cannot do this assertion because root thread not set up as
2032     // executing
2033     // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 1 );
2034     master_th->th.th_current_task->td_flags.executing = 0;
2035 
2036     if (!master_th->th.th_teams_microtask || level > teams_level) {
2037       /* Increment our nested depth level */
2038       KMP_ATOMIC_INC(&root->r.r_in_parallel);
2039     }
2040 
2041     // See if we need to make a copy of the ICVs.
2042     int nthreads_icv = master_th->th.th_current_task->td_icvs.nproc;
2043     if ((level + 1 < __kmp_nested_nth.used) &&
2044         (__kmp_nested_nth.nth[level + 1] != nthreads_icv)) {
2045       nthreads_icv = __kmp_nested_nth.nth[level + 1];
2046     } else {
2047       nthreads_icv = 0; // don't update
2048     }
2049 
2050     // Figure out the proc_bind_policy for the new team.
2051     kmp_proc_bind_t proc_bind = master_th->th.th_set_proc_bind;
2052     // proc_bind_default means don't update
2053     kmp_proc_bind_t proc_bind_icv = proc_bind_default;
2054     if (master_th->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
2055       proc_bind = proc_bind_false;
2056     } else {
2057       // No proc_bind clause specified; use current proc-bind-var for this
2058       // parallel region
2059       if (proc_bind == proc_bind_default) {
2060         proc_bind = master_th->th.th_current_task->td_icvs.proc_bind;
2061       }
2062       // Have teams construct take proc_bind value from KMP_TEAMS_PROC_BIND
2063       if (master_th->th.th_teams_microtask &&
2064           microtask == (microtask_t)__kmp_teams_master) {
2065         proc_bind = __kmp_teams_proc_bind;
2066       }
2067       /* else: The proc_bind policy was specified explicitly on parallel clause.
2068          This overrides proc-bind-var for this parallel region, but does not
2069          change proc-bind-var. */
2070       // Figure the value of proc-bind-var for the child threads.
2071       if ((level + 1 < __kmp_nested_proc_bind.used) &&
2072           (__kmp_nested_proc_bind.bind_types[level + 1] !=
2073            master_th->th.th_current_task->td_icvs.proc_bind)) {
2074         // Do not modify the proc bind icv for the two teams construct forks
2075         // They just let the proc bind icv pass through
2076         if (!master_th->th.th_teams_microtask ||
2077             !(microtask == (microtask_t)__kmp_teams_master || ap == NULL))
2078           proc_bind_icv = __kmp_nested_proc_bind.bind_types[level + 1];
2079       }
2080     }
2081 
2082     // Reset for next parallel region
2083     master_th->th.th_set_proc_bind = proc_bind_default;
2084 
2085     if ((nthreads_icv > 0) || (proc_bind_icv != proc_bind_default)) {
2086       kmp_internal_control_t new_icvs;
2087       copy_icvs(&new_icvs, &master_th->th.th_current_task->td_icvs);
2088       new_icvs.next = NULL;
2089       if (nthreads_icv > 0) {
2090         new_icvs.nproc = nthreads_icv;
2091       }
2092       if (proc_bind_icv != proc_bind_default) {
2093         new_icvs.proc_bind = proc_bind_icv;
2094       }
2095 
2096       /* allocate a new parallel team */
2097       KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n"));
2098       team = __kmp_allocate_team(root, nthreads, nthreads,
2099 #if OMPT_SUPPORT
2100                                  ompt_parallel_data,
2101 #endif
2102                                  proc_bind, &new_icvs,
2103                                  argc USE_NESTED_HOT_ARG(master_th));
2104       if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar)
2105         copy_icvs((kmp_internal_control_t *)team->t.b->team_icvs, &new_icvs);
2106     } else {
2107       /* allocate a new parallel team */
2108       KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n"));
2109       team = __kmp_allocate_team(root, nthreads, nthreads,
2110 #if OMPT_SUPPORT
2111                                  ompt_parallel_data,
2112 #endif
2113                                  proc_bind,
2114                                  &master_th->th.th_current_task->td_icvs,
2115                                  argc USE_NESTED_HOT_ARG(master_th));
2116       if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar)
2117         copy_icvs((kmp_internal_control_t *)team->t.b->team_icvs,
2118                   &master_th->th.th_current_task->td_icvs);
2119     }
2120     KF_TRACE(
2121         10, ("__kmp_fork_call: after __kmp_allocate_team - team = %p\n", team));
2122 
2123     /* setup the new team */
2124     KMP_CHECK_UPDATE(team->t.t_master_tid, master_tid);
2125     KMP_CHECK_UPDATE(team->t.t_master_this_cons, master_this_cons);
2126     KMP_CHECK_UPDATE(team->t.t_ident, loc);
2127     KMP_CHECK_UPDATE(team->t.t_parent, parent_team);
2128     KMP_CHECK_UPDATE_SYNC(team->t.t_pkfn, microtask);
2129 #if OMPT_SUPPORT
2130     KMP_CHECK_UPDATE_SYNC(team->t.ompt_team_info.master_return_address,
2131                           return_address);
2132 #endif
2133     KMP_CHECK_UPDATE(team->t.t_invoke, invoker); // TODO move to root, maybe
2134     // TODO: parent_team->t.t_level == INT_MAX ???
2135     if (!master_th->th.th_teams_microtask || level > teams_level) {
2136       int new_level = parent_team->t.t_level + 1;
2137       KMP_CHECK_UPDATE(team->t.t_level, new_level);
2138       new_level = parent_team->t.t_active_level + 1;
2139       KMP_CHECK_UPDATE(team->t.t_active_level, new_level);
2140     } else {
2141       // AC: Do not increase parallel level at start of the teams construct
2142       int new_level = parent_team->t.t_level;
2143       KMP_CHECK_UPDATE(team->t.t_level, new_level);
2144       new_level = parent_team->t.t_active_level;
2145       KMP_CHECK_UPDATE(team->t.t_active_level, new_level);
2146     }
2147     kmp_r_sched_t new_sched = get__sched_2(parent_team, master_tid);
2148     // set primary thread's schedule as new run-time schedule
2149     KMP_CHECK_UPDATE(team->t.t_sched.sched, new_sched.sched);
2150 
2151     KMP_CHECK_UPDATE(team->t.t_cancel_request, cancel_noreq);
2152     KMP_CHECK_UPDATE(team->t.t_def_allocator, master_th->th.th_def_allocator);
2153 
2154     // Update the floating point rounding in the team if required.
2155     propagateFPControl(team);
2156 #if OMPD_SUPPORT
2157     if (ompd_state & OMPD_ENABLE_BP)
2158       ompd_bp_parallel_begin();
2159 #endif
2160 
2161     if (__kmp_tasking_mode != tskm_immediate_exec) {
2162       // Set primary thread's task team to team's task team. Unless this is hot
2163       // team, it should be NULL.
2164       KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
2165                        parent_team->t.t_task_team[master_th->th.th_task_state]);
2166       KA_TRACE(20, ("__kmp_fork_call: Primary T#%d pushing task_team %p / team "
2167                     "%p, new task_team %p / team %p\n",
2168                     __kmp_gtid_from_thread(master_th),
2169                     master_th->th.th_task_team, parent_team,
2170                     team->t.t_task_team[master_th->th.th_task_state], team));
2171 
2172       if (active_level || master_th->th.th_task_team) {
2173         // Take a memo of primary thread's task_state
2174         KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack);
2175         if (master_th->th.th_task_state_top >=
2176             master_th->th.th_task_state_stack_sz) { // increase size
2177           kmp_uint32 new_size = 2 * master_th->th.th_task_state_stack_sz;
2178           kmp_uint8 *old_stack, *new_stack;
2179           kmp_uint32 i;
2180           new_stack = (kmp_uint8 *)__kmp_allocate(new_size);
2181           for (i = 0; i < master_th->th.th_task_state_stack_sz; ++i) {
2182             new_stack[i] = master_th->th.th_task_state_memo_stack[i];
2183           }
2184           for (i = master_th->th.th_task_state_stack_sz; i < new_size;
2185                ++i) { // zero-init rest of stack
2186             new_stack[i] = 0;
2187           }
2188           old_stack = master_th->th.th_task_state_memo_stack;
2189           master_th->th.th_task_state_memo_stack = new_stack;
2190           master_th->th.th_task_state_stack_sz = new_size;
2191           __kmp_free(old_stack);
2192         }
2193         // Store primary thread's task_state on stack
2194         master_th->th
2195             .th_task_state_memo_stack[master_th->th.th_task_state_top] =
2196             master_th->th.th_task_state;
2197         master_th->th.th_task_state_top++;
2198 #if KMP_NESTED_HOT_TEAMS
2199         if (master_th->th.th_hot_teams &&
2200             active_level < __kmp_hot_teams_max_level &&
2201             team == master_th->th.th_hot_teams[active_level].hot_team) {
2202           // Restore primary thread's nested state if nested hot team
2203           master_th->th.th_task_state =
2204               master_th->th
2205                   .th_task_state_memo_stack[master_th->th.th_task_state_top];
2206         } else {
2207 #endif
2208           master_th->th.th_task_state = 0;
2209 #if KMP_NESTED_HOT_TEAMS
2210         }
2211 #endif
2212       }
2213 #if !KMP_NESTED_HOT_TEAMS
2214       KMP_DEBUG_ASSERT((master_th->th.th_task_team == NULL) ||
2215                        (team == root->r.r_hot_team));
2216 #endif
2217     }
2218 
2219     KA_TRACE(
2220         20,
2221         ("__kmp_fork_call: T#%d(%d:%d)->(%d:0) created a team of %d threads\n",
2222          gtid, parent_team->t.t_id, team->t.t_master_tid, team->t.t_id,
2223          team->t.t_nproc));
2224     KMP_DEBUG_ASSERT(team != root->r.r_hot_team ||
2225                      (team->t.t_master_tid == 0 &&
2226                       (team->t.t_parent == root->r.r_root_team ||
2227                        team->t.t_parent->t.t_serialized)));
2228     KMP_MB();
2229 
2230     /* now, setup the arguments */
2231     argv = (void **)team->t.t_argv;
2232     if (ap) {
2233       for (i = argc - 1; i >= 0; --i) {
2234         void *new_argv = va_arg(kmp_va_deref(ap), void *);
2235         KMP_CHECK_UPDATE(*argv, new_argv);
2236         argv++;
2237       }
2238     } else {
2239       for (i = 0; i < argc; ++i) {
2240         // Get args from parent team for teams construct
2241         KMP_CHECK_UPDATE(argv[i], team->t.t_parent->t.t_argv[i]);
2242       }
2243     }
2244 
2245     /* now actually fork the threads */
2246     KMP_CHECK_UPDATE(team->t.t_master_active, master_active);
2247     if (!root->r.r_active) // Only do assignment if it prevents cache ping-pong
2248       root->r.r_active = TRUE;
2249 
2250     __kmp_fork_team_threads(root, team, master_th, gtid, !ap);
2251     __kmp_setup_icv_copy(team, nthreads,
2252                          &master_th->th.th_current_task->td_icvs, loc);
2253 
2254 #if OMPT_SUPPORT
2255     master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
2256 #endif
2257 
2258     __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2259 
2260 #if USE_ITT_BUILD
2261     if (team->t.t_active_level == 1 // only report frames at level 1
2262         && !master_th->th.th_teams_microtask) { // not in teams construct
2263 #if USE_ITT_NOTIFY
2264       if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
2265           (__kmp_forkjoin_frames_mode == 3 ||
2266            __kmp_forkjoin_frames_mode == 1)) {
2267         kmp_uint64 tmp_time = 0;
2268         if (__itt_get_timestamp_ptr)
2269           tmp_time = __itt_get_timestamp();
2270         // Internal fork - report frame begin
2271         master_th->th.th_frame_time = tmp_time;
2272         if (__kmp_forkjoin_frames_mode == 3)
2273           team->t.t_region_time = tmp_time;
2274       } else
2275 // only one notification scheme (either "submit" or "forking/joined", not both)
2276 #endif /* USE_ITT_NOTIFY */
2277         if ((__itt_frame_begin_v3_ptr || KMP_ITT_DEBUG) &&
2278             __kmp_forkjoin_frames && !__kmp_forkjoin_frames_mode) {
2279           // Mark start of "parallel" region for Intel(R) VTune(TM) analyzer.
2280           __kmp_itt_region_forking(gtid, team->t.t_nproc, 0);
2281         }
2282     }
2283 #endif /* USE_ITT_BUILD */
2284 
2285     /* now go on and do the work */
2286     KMP_DEBUG_ASSERT(team == __kmp_threads[gtid]->th.th_team);
2287     KMP_MB();
2288     KF_TRACE(10,
2289              ("__kmp_internal_fork : root=%p, team=%p, master_th=%p, gtid=%d\n",
2290               root, team, master_th, gtid));
2291 
2292 #if USE_ITT_BUILD
2293     if (__itt_stack_caller_create_ptr) {
2294       // create new stack stitching id before entering fork barrier
2295       if (!enter_teams) {
2296         KMP_DEBUG_ASSERT(team->t.t_stack_id == NULL);
2297         team->t.t_stack_id = __kmp_itt_stack_caller_create();
2298       } else if (parent_team->t.t_serialized) {
2299         // keep stack stitching id in the serialized parent_team;
2300         // current team will be used for parallel inside the teams;
2301         // if parent_team is active, then it already keeps stack stitching id
2302         // for the league of teams
2303         KMP_DEBUG_ASSERT(parent_team->t.t_stack_id == NULL);
2304         parent_team->t.t_stack_id = __kmp_itt_stack_caller_create();
2305       }
2306     }
2307 #endif /* USE_ITT_BUILD */
2308 
2309     // AC: skip __kmp_internal_fork at teams construct, let only primary
2310     // threads execute
2311     if (ap) {
2312       __kmp_internal_fork(loc, gtid, team);
2313       KF_TRACE(10, ("__kmp_internal_fork : after : root=%p, team=%p, "
2314                     "master_th=%p, gtid=%d\n",
2315                     root, team, master_th, gtid));
2316     }
2317 
2318     if (call_context == fork_context_gnu) {
2319       KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
2320       return TRUE;
2321     }
2322 
2323     /* Invoke microtask for PRIMARY thread */
2324     KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n", gtid,
2325                   team->t.t_id, team->t.t_pkfn));
2326   } // END of timer KMP_fork_call block
2327 
2328 #if KMP_STATS_ENABLED
2329   // If beginning a teams construct, then change thread state
2330   stats_state_e previous_state = KMP_GET_THREAD_STATE();
2331   if (!ap) {
2332     KMP_SET_THREAD_STATE(stats_state_e::TEAMS_REGION);
2333   }
2334 #endif
2335 
2336   if (!team->t.t_invoke(gtid)) {
2337     KMP_ASSERT2(0, "cannot invoke microtask for PRIMARY thread");
2338   }
2339 
2340 #if KMP_STATS_ENABLED
2341   // If was beginning of a teams construct, then reset thread state
2342   if (!ap) {
2343     KMP_SET_THREAD_STATE(previous_state);
2344   }
2345 #endif
2346 
2347   KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n", gtid,
2348                 team->t.t_id, team->t.t_pkfn));
2349   KMP_MB(); /* Flush all pending memory write invalidates.  */
2350 
2351   KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
2352 #if OMPT_SUPPORT
2353   if (ompt_enabled.enabled) {
2354     master_th->th.ompt_thread_info.state = ompt_state_overhead;
2355   }
2356 #endif
2357 
2358   return TRUE;
2359 }
2360 
2361 #if OMPT_SUPPORT
2362 static inline void __kmp_join_restore_state(kmp_info_t *thread,
2363                                             kmp_team_t *team) {
2364   // restore state outside the region
2365   thread->th.ompt_thread_info.state =
2366       ((team->t.t_serialized) ? ompt_state_work_serial
2367                               : ompt_state_work_parallel);
2368 }
2369 
2370 static inline void __kmp_join_ompt(int gtid, kmp_info_t *thread,
2371                                    kmp_team_t *team, ompt_data_t *parallel_data,
2372                                    int flags, void *codeptr) {
2373   ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2374   if (ompt_enabled.ompt_callback_parallel_end) {
2375     ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
2376         parallel_data, &(task_info->task_data), flags, codeptr);
2377   }
2378 
2379   task_info->frame.enter_frame = ompt_data_none;
2380   __kmp_join_restore_state(thread, team);
2381 }
2382 #endif
2383 
2384 void __kmp_join_call(ident_t *loc, int gtid
2385 #if OMPT_SUPPORT
2386                      ,
2387                      enum fork_context_e fork_context
2388 #endif
2389                      ,
2390                      int exit_teams) {
2391   KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_join_call);
2392   kmp_team_t *team;
2393   kmp_team_t *parent_team;
2394   kmp_info_t *master_th;
2395   kmp_root_t *root;
2396   int master_active;
2397 
2398   KA_TRACE(20, ("__kmp_join_call: enter T#%d\n", gtid));
2399 
2400   /* setup current data */
2401   master_th = __kmp_threads[gtid];
2402   root = master_th->th.th_root;
2403   team = master_th->th.th_team;
2404   parent_team = team->t.t_parent;
2405 
2406   master_th->th.th_ident = loc;
2407 
2408 #if OMPT_SUPPORT
2409   void *team_microtask = (void *)team->t.t_pkfn;
2410   // For GOMP interface with serialized parallel, need the
2411   // __kmpc_end_serialized_parallel to call hooks for OMPT end-implicit-task
2412   // and end-parallel events.
2413   if (ompt_enabled.enabled &&
2414       !(team->t.t_serialized && fork_context == fork_context_gnu)) {
2415     master_th->th.ompt_thread_info.state = ompt_state_overhead;
2416   }
2417 #endif
2418 
2419 #if KMP_DEBUG
2420   if (__kmp_tasking_mode != tskm_immediate_exec && !exit_teams) {
2421     KA_TRACE(20, ("__kmp_join_call: T#%d, old team = %p old task_team = %p, "
2422                   "th_task_team = %p\n",
2423                   __kmp_gtid_from_thread(master_th), team,
2424                   team->t.t_task_team[master_th->th.th_task_state],
2425                   master_th->th.th_task_team));
2426     KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
2427                      team->t.t_task_team[master_th->th.th_task_state]);
2428   }
2429 #endif
2430 
2431   if (team->t.t_serialized) {
2432     if (master_th->th.th_teams_microtask) {
2433       // We are in teams construct
2434       int level = team->t.t_level;
2435       int tlevel = master_th->th.th_teams_level;
2436       if (level == tlevel) {
2437         // AC: we haven't incremented it earlier at start of teams construct,
2438         //     so do it here - at the end of teams construct
2439         team->t.t_level++;
2440       } else if (level == tlevel + 1) {
2441         // AC: we are exiting parallel inside teams, need to increment
2442         // serialization in order to restore it in the next call to
2443         // __kmpc_end_serialized_parallel
2444         team->t.t_serialized++;
2445       }
2446     }
2447     __kmpc_end_serialized_parallel(loc, gtid);
2448 
2449 #if OMPT_SUPPORT
2450     if (ompt_enabled.enabled) {
2451       if (fork_context == fork_context_gnu) {
2452         __ompt_lw_taskteam_unlink(master_th);
2453       }
2454       __kmp_join_restore_state(master_th, parent_team);
2455     }
2456 #endif
2457 
2458     return;
2459   }
2460 
2461   master_active = team->t.t_master_active;
2462 
2463   if (!exit_teams) {
2464     // AC: No barrier for internal teams at exit from teams construct.
2465     //     But there is barrier for external team (league).
2466     __kmp_internal_join(loc, gtid, team);
2467 #if USE_ITT_BUILD
2468     if (__itt_stack_caller_create_ptr) {
2469       KMP_DEBUG_ASSERT(team->t.t_stack_id != NULL);
2470       // destroy the stack stitching id after join barrier
2471       __kmp_itt_stack_caller_destroy((__itt_caller)team->t.t_stack_id);
2472       team->t.t_stack_id = NULL;
2473     }
2474 #endif
2475   } else {
2476     master_th->th.th_task_state =
2477         0; // AC: no tasking in teams (out of any parallel)
2478 #if USE_ITT_BUILD
2479     if (__itt_stack_caller_create_ptr && parent_team->t.t_serialized) {
2480       KMP_DEBUG_ASSERT(parent_team->t.t_stack_id != NULL);
2481       // destroy the stack stitching id on exit from the teams construct
2482       // if parent_team is active, then the id will be destroyed later on
2483       // by master of the league of teams
2484       __kmp_itt_stack_caller_destroy((__itt_caller)parent_team->t.t_stack_id);
2485       parent_team->t.t_stack_id = NULL;
2486     }
2487 #endif
2488   }
2489 
2490   KMP_MB();
2491 
2492 #if OMPT_SUPPORT
2493   ompt_data_t *parallel_data = &(team->t.ompt_team_info.parallel_data);
2494   void *codeptr = team->t.ompt_team_info.master_return_address;
2495 #endif
2496 
2497 #if USE_ITT_BUILD
2498   // Mark end of "parallel" region for Intel(R) VTune(TM) analyzer.
2499   if (team->t.t_active_level == 1 &&
2500       (!master_th->th.th_teams_microtask || /* not in teams construct */
2501        master_th->th.th_teams_size.nteams == 1)) {
2502     master_th->th.th_ident = loc;
2503     // only one notification scheme (either "submit" or "forking/joined", not
2504     // both)
2505     if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
2506         __kmp_forkjoin_frames_mode == 3)
2507       __kmp_itt_frame_submit(gtid, team->t.t_region_time,
2508                              master_th->th.th_frame_time, 0, loc,
2509                              master_th->th.th_team_nproc, 1);
2510     else if ((__itt_frame_end_v3_ptr || KMP_ITT_DEBUG) &&
2511              !__kmp_forkjoin_frames_mode && __kmp_forkjoin_frames)
2512       __kmp_itt_region_joined(gtid);
2513   } // active_level == 1
2514 #endif /* USE_ITT_BUILD */
2515 
2516 #if KMP_AFFINITY_SUPPORTED
2517   if (!exit_teams) {
2518     // Restore master thread's partition.
2519     master_th->th.th_first_place = team->t.t_first_place;
2520     master_th->th.th_last_place = team->t.t_last_place;
2521   }
2522 #endif // KMP_AFFINITY_SUPPORTED
2523 
2524   if (master_th->th.th_teams_microtask && !exit_teams &&
2525       team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
2526       team->t.t_level == master_th->th.th_teams_level + 1) {
2527 // AC: We need to leave the team structure intact at the end of parallel
2528 // inside the teams construct, so that at the next parallel same (hot) team
2529 // works, only adjust nesting levels
2530 #if OMPT_SUPPORT
2531     ompt_data_t ompt_parallel_data = ompt_data_none;
2532     if (ompt_enabled.enabled) {
2533       ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2534       if (ompt_enabled.ompt_callback_implicit_task) {
2535         int ompt_team_size = team->t.t_nproc;
2536         ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
2537             ompt_scope_end, NULL, &(task_info->task_data), ompt_team_size,
2538             OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
2539       }
2540       task_info->frame.exit_frame = ompt_data_none;
2541       task_info->task_data = ompt_data_none;
2542       ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
2543       __ompt_lw_taskteam_unlink(master_th);
2544     }
2545 #endif
2546     /* Decrement our nested depth level */
2547     team->t.t_level--;
2548     team->t.t_active_level--;
2549     KMP_ATOMIC_DEC(&root->r.r_in_parallel);
2550 
2551     // Restore number of threads in the team if needed. This code relies on
2552     // the proper adjustment of th_teams_size.nth after the fork in
2553     // __kmp_teams_master on each teams primary thread in the case that
2554     // __kmp_reserve_threads reduced it.
2555     if (master_th->th.th_team_nproc < master_th->th.th_teams_size.nth) {
2556       int old_num = master_th->th.th_team_nproc;
2557       int new_num = master_th->th.th_teams_size.nth;
2558       kmp_info_t **other_threads = team->t.t_threads;
2559       team->t.t_nproc = new_num;
2560       for (int i = 0; i < old_num; ++i) {
2561         other_threads[i]->th.th_team_nproc = new_num;
2562       }
2563       // Adjust states of non-used threads of the team
2564       for (int i = old_num; i < new_num; ++i) {
2565         // Re-initialize thread's barrier data.
2566         KMP_DEBUG_ASSERT(other_threads[i]);
2567         kmp_balign_t *balign = other_threads[i]->th.th_bar;
2568         for (int b = 0; b < bs_last_barrier; ++b) {
2569           balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
2570           KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
2571 #if USE_DEBUGGER
2572           balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
2573 #endif
2574         }
2575         if (__kmp_tasking_mode != tskm_immediate_exec) {
2576           // Synchronize thread's task state
2577           other_threads[i]->th.th_task_state = master_th->th.th_task_state;
2578         }
2579       }
2580     }
2581 
2582 #if OMPT_SUPPORT
2583     if (ompt_enabled.enabled) {
2584       __kmp_join_ompt(gtid, master_th, parent_team, &ompt_parallel_data,
2585                       OMPT_INVOKER(fork_context) | ompt_parallel_team, codeptr);
2586     }
2587 #endif
2588 
2589     return;
2590   }
2591 
2592   /* do cleanup and restore the parent team */
2593   master_th->th.th_info.ds.ds_tid = team->t.t_master_tid;
2594   master_th->th.th_local.this_construct = team->t.t_master_this_cons;
2595 
2596   master_th->th.th_dispatch = &parent_team->t.t_dispatch[team->t.t_master_tid];
2597 
2598   /* jc: The following lock has instructions with REL and ACQ semantics,
2599      separating the parallel user code called in this parallel region
2600      from the serial user code called after this function returns. */
2601   __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
2602 
2603   if (!master_th->th.th_teams_microtask ||
2604       team->t.t_level > master_th->th.th_teams_level) {
2605     /* Decrement our nested depth level */
2606     KMP_ATOMIC_DEC(&root->r.r_in_parallel);
2607   }
2608   KMP_DEBUG_ASSERT(root->r.r_in_parallel >= 0);
2609 
2610 #if OMPT_SUPPORT
2611   if (ompt_enabled.enabled) {
2612     ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2613     if (ompt_enabled.ompt_callback_implicit_task) {
2614       int flags = (team_microtask == (void *)__kmp_teams_master)
2615                       ? ompt_task_initial
2616                       : ompt_task_implicit;
2617       int ompt_team_size = (flags == ompt_task_initial) ? 0 : team->t.t_nproc;
2618       ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
2619           ompt_scope_end, NULL, &(task_info->task_data), ompt_team_size,
2620           OMPT_CUR_TASK_INFO(master_th)->thread_num, flags);
2621     }
2622     task_info->frame.exit_frame = ompt_data_none;
2623     task_info->task_data = ompt_data_none;
2624   }
2625 #endif
2626 
2627   KF_TRACE(10, ("__kmp_join_call1: T#%d, this_thread=%p team=%p\n", 0,
2628                 master_th, team));
2629   __kmp_pop_current_task_from_thread(master_th);
2630 
2631   master_th->th.th_def_allocator = team->t.t_def_allocator;
2632 
2633 #if OMPD_SUPPORT
2634   if (ompd_state & OMPD_ENABLE_BP)
2635     ompd_bp_parallel_end();
2636 #endif
2637   updateHWFPControl(team);
2638 
2639   if (root->r.r_active != master_active)
2640     root->r.r_active = master_active;
2641 
2642   __kmp_free_team(root, team USE_NESTED_HOT_ARG(
2643                             master_th)); // this will free worker threads
2644 
2645   /* this race was fun to find. make sure the following is in the critical
2646      region otherwise assertions may fail occasionally since the old team may be
2647      reallocated and the hierarchy appears inconsistent. it is actually safe to
2648      run and won't cause any bugs, but will cause those assertion failures. it's
2649      only one deref&assign so might as well put this in the critical region */
2650   master_th->th.th_team = parent_team;
2651   master_th->th.th_team_nproc = parent_team->t.t_nproc;
2652   master_th->th.th_team_master = parent_team->t.t_threads[0];
2653   master_th->th.th_team_serialized = parent_team->t.t_serialized;
2654 
2655   /* restore serialized team, if need be */
2656   if (parent_team->t.t_serialized &&
2657       parent_team != master_th->th.th_serial_team &&
2658       parent_team != root->r.r_root_team) {
2659     __kmp_free_team(root,
2660                     master_th->th.th_serial_team USE_NESTED_HOT_ARG(NULL));
2661     master_th->th.th_serial_team = parent_team;
2662   }
2663 
2664   if (__kmp_tasking_mode != tskm_immediate_exec) {
2665     if (master_th->th.th_task_state_top >
2666         0) { // Restore task state from memo stack
2667       KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack);
2668       // Remember primary thread's state if we re-use this nested hot team
2669       master_th->th.th_task_state_memo_stack[master_th->th.th_task_state_top] =
2670           master_th->th.th_task_state;
2671       --master_th->th.th_task_state_top; // pop
2672       // Now restore state at this level
2673       master_th->th.th_task_state =
2674           master_th->th
2675               .th_task_state_memo_stack[master_th->th.th_task_state_top];
2676     } else if (team != root->r.r_hot_team) {
2677       // Reset the task state of primary thread if we are not hot team because
2678       // in this case all the worker threads will be free, and their task state
2679       // will be reset. If not reset the primary's, the task state will be
2680       // inconsistent.
2681       master_th->th.th_task_state = 0;
2682     }
2683     // Copy the task team from the parent team to the primary thread
2684     master_th->th.th_task_team =
2685         parent_team->t.t_task_team[master_th->th.th_task_state];
2686     KA_TRACE(20,
2687              ("__kmp_join_call: Primary T#%d restoring task_team %p, team %p\n",
2688               __kmp_gtid_from_thread(master_th), master_th->th.th_task_team,
2689               parent_team));
2690   }
2691 
2692   // TODO: GEH - cannot do this assertion because root thread not set up as
2693   // executing
2694   // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 0 );
2695   master_th->th.th_current_task->td_flags.executing = 1;
2696 
2697   __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2698 
2699 #if KMP_AFFINITY_SUPPORTED
2700   if (master_th->th.th_team->t.t_level == 0 && __kmp_affinity.flags.reset) {
2701     __kmp_reset_root_init_mask(gtid);
2702   }
2703 #endif
2704 #if OMPT_SUPPORT
2705   int flags =
2706       OMPT_INVOKER(fork_context) |
2707       ((team_microtask == (void *)__kmp_teams_master) ? ompt_parallel_league
2708                                                       : ompt_parallel_team);
2709   if (ompt_enabled.enabled) {
2710     __kmp_join_ompt(gtid, master_th, parent_team, parallel_data, flags,
2711                     codeptr);
2712   }
2713 #endif
2714 
2715   KMP_MB();
2716   KA_TRACE(20, ("__kmp_join_call: exit T#%d\n", gtid));
2717 }
2718 
2719 /* Check whether we should push an internal control record onto the
2720    serial team stack.  If so, do it.  */
2721 void __kmp_save_internal_controls(kmp_info_t *thread) {
2722 
2723   if (thread->th.th_team != thread->th.th_serial_team) {
2724     return;
2725   }
2726   if (thread->th.th_team->t.t_serialized > 1) {
2727     int push = 0;
2728 
2729     if (thread->th.th_team->t.t_control_stack_top == NULL) {
2730       push = 1;
2731     } else {
2732       if (thread->th.th_team->t.t_control_stack_top->serial_nesting_level !=
2733           thread->th.th_team->t.t_serialized) {
2734         push = 1;
2735       }
2736     }
2737     if (push) { /* push a record on the serial team's stack */
2738       kmp_internal_control_t *control =
2739           (kmp_internal_control_t *)__kmp_allocate(
2740               sizeof(kmp_internal_control_t));
2741 
2742       copy_icvs(control, &thread->th.th_current_task->td_icvs);
2743 
2744       control->serial_nesting_level = thread->th.th_team->t.t_serialized;
2745 
2746       control->next = thread->th.th_team->t.t_control_stack_top;
2747       thread->th.th_team->t.t_control_stack_top = control;
2748     }
2749   }
2750 }
2751 
2752 /* Changes set_nproc */
2753 void __kmp_set_num_threads(int new_nth, int gtid) {
2754   kmp_info_t *thread;
2755   kmp_root_t *root;
2756 
2757   KF_TRACE(10, ("__kmp_set_num_threads: new __kmp_nth = %d\n", new_nth));
2758   KMP_DEBUG_ASSERT(__kmp_init_serial);
2759 
2760   if (new_nth < 1)
2761     new_nth = 1;
2762   else if (new_nth > __kmp_max_nth)
2763     new_nth = __kmp_max_nth;
2764 
2765   KMP_COUNT_VALUE(OMP_set_numthreads, new_nth);
2766   thread = __kmp_threads[gtid];
2767   if (thread->th.th_current_task->td_icvs.nproc == new_nth)
2768     return; // nothing to do
2769 
2770   __kmp_save_internal_controls(thread);
2771 
2772   set__nproc(thread, new_nth);
2773 
2774   // If this omp_set_num_threads() call will cause the hot team size to be
2775   // reduced (in the absence of a num_threads clause), then reduce it now,
2776   // rather than waiting for the next parallel region.
2777   root = thread->th.th_root;
2778   if (__kmp_init_parallel && (!root->r.r_active) &&
2779       (root->r.r_hot_team->t.t_nproc > new_nth)
2780 #if KMP_NESTED_HOT_TEAMS
2781       && __kmp_hot_teams_max_level && !__kmp_hot_teams_mode
2782 #endif
2783   ) {
2784     kmp_team_t *hot_team = root->r.r_hot_team;
2785     int f;
2786 
2787     __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
2788 
2789     if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
2790       __kmp_resize_dist_barrier(hot_team, hot_team->t.t_nproc, new_nth);
2791     }
2792     // Release the extra threads we don't need any more.
2793     for (f = new_nth; f < hot_team->t.t_nproc; f++) {
2794       KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL);
2795       if (__kmp_tasking_mode != tskm_immediate_exec) {
2796         // When decreasing team size, threads no longer in the team should unref
2797         // task team.
2798         hot_team->t.t_threads[f]->th.th_task_team = NULL;
2799       }
2800       __kmp_free_thread(hot_team->t.t_threads[f]);
2801       hot_team->t.t_threads[f] = NULL;
2802     }
2803     hot_team->t.t_nproc = new_nth;
2804 #if KMP_NESTED_HOT_TEAMS
2805     if (thread->th.th_hot_teams) {
2806       KMP_DEBUG_ASSERT(hot_team == thread->th.th_hot_teams[0].hot_team);
2807       thread->th.th_hot_teams[0].hot_team_nth = new_nth;
2808     }
2809 #endif
2810 
2811     if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
2812       hot_team->t.b->update_num_threads(new_nth);
2813       __kmp_add_threads_to_team(hot_team, new_nth);
2814     }
2815 
2816     __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2817 
2818     // Update the t_nproc field in the threads that are still active.
2819     for (f = 0; f < new_nth; f++) {
2820       KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL);
2821       hot_team->t.t_threads[f]->th.th_team_nproc = new_nth;
2822     }
2823     // Special flag in case omp_set_num_threads() call
2824     hot_team->t.t_size_changed = -1;
2825   }
2826 }
2827 
2828 /* Changes max_active_levels */
2829 void __kmp_set_max_active_levels(int gtid, int max_active_levels) {
2830   kmp_info_t *thread;
2831 
2832   KF_TRACE(10, ("__kmp_set_max_active_levels: new max_active_levels for thread "
2833                 "%d = (%d)\n",
2834                 gtid, max_active_levels));
2835   KMP_DEBUG_ASSERT(__kmp_init_serial);
2836 
2837   // validate max_active_levels
2838   if (max_active_levels < 0) {
2839     KMP_WARNING(ActiveLevelsNegative, max_active_levels);
2840     // We ignore this call if the user has specified a negative value.
2841     // The current setting won't be changed. The last valid setting will be
2842     // used. A warning will be issued (if warnings are allowed as controlled by
2843     // the KMP_WARNINGS env var).
2844     KF_TRACE(10, ("__kmp_set_max_active_levels: the call is ignored: new "
2845                   "max_active_levels for thread %d = (%d)\n",
2846                   gtid, max_active_levels));
2847     return;
2848   }
2849   if (max_active_levels <= KMP_MAX_ACTIVE_LEVELS_LIMIT) {
2850     // it's OK, the max_active_levels is within the valid range: [ 0;
2851     // KMP_MAX_ACTIVE_LEVELS_LIMIT ]
2852     // We allow a zero value. (implementation defined behavior)
2853   } else {
2854     KMP_WARNING(ActiveLevelsExceedLimit, max_active_levels,
2855                 KMP_MAX_ACTIVE_LEVELS_LIMIT);
2856     max_active_levels = KMP_MAX_ACTIVE_LEVELS_LIMIT;
2857     // Current upper limit is MAX_INT. (implementation defined behavior)
2858     // If the input exceeds the upper limit, we correct the input to be the
2859     // upper limit. (implementation defined behavior)
2860     // Actually, the flow should never get here until we use MAX_INT limit.
2861   }
2862   KF_TRACE(10, ("__kmp_set_max_active_levels: after validation: new "
2863                 "max_active_levels for thread %d = (%d)\n",
2864                 gtid, max_active_levels));
2865 
2866   thread = __kmp_threads[gtid];
2867 
2868   __kmp_save_internal_controls(thread);
2869 
2870   set__max_active_levels(thread, max_active_levels);
2871 }
2872 
2873 /* Gets max_active_levels */
2874 int __kmp_get_max_active_levels(int gtid) {
2875   kmp_info_t *thread;
2876 
2877   KF_TRACE(10, ("__kmp_get_max_active_levels: thread %d\n", gtid));
2878   KMP_DEBUG_ASSERT(__kmp_init_serial);
2879 
2880   thread = __kmp_threads[gtid];
2881   KMP_DEBUG_ASSERT(thread->th.th_current_task);
2882   KF_TRACE(10, ("__kmp_get_max_active_levels: thread %d, curtask=%p, "
2883                 "curtask_maxaclevel=%d\n",
2884                 gtid, thread->th.th_current_task,
2885                 thread->th.th_current_task->td_icvs.max_active_levels));
2886   return thread->th.th_current_task->td_icvs.max_active_levels;
2887 }
2888 
2889 // nteams-var per-device ICV
2890 void __kmp_set_num_teams(int num_teams) {
2891   if (num_teams > 0)
2892     __kmp_nteams = num_teams;
2893 }
2894 int __kmp_get_max_teams(void) { return __kmp_nteams; }
2895 // teams-thread-limit-var per-device ICV
2896 void __kmp_set_teams_thread_limit(int limit) {
2897   if (limit > 0)
2898     __kmp_teams_thread_limit = limit;
2899 }
2900 int __kmp_get_teams_thread_limit(void) { return __kmp_teams_thread_limit; }
2901 
2902 KMP_BUILD_ASSERT(sizeof(kmp_sched_t) == sizeof(int));
2903 KMP_BUILD_ASSERT(sizeof(enum sched_type) == sizeof(int));
2904 
2905 /* Changes def_sched_var ICV values (run-time schedule kind and chunk) */
2906 void __kmp_set_schedule(int gtid, kmp_sched_t kind, int chunk) {
2907   kmp_info_t *thread;
2908   kmp_sched_t orig_kind;
2909   //    kmp_team_t *team;
2910 
2911   KF_TRACE(10, ("__kmp_set_schedule: new schedule for thread %d = (%d, %d)\n",
2912                 gtid, (int)kind, chunk));
2913   KMP_DEBUG_ASSERT(__kmp_init_serial);
2914 
2915   // Check if the kind parameter is valid, correct if needed.
2916   // Valid parameters should fit in one of two intervals - standard or extended:
2917   //       <lower>, <valid>, <upper_std>, <lower_ext>, <valid>, <upper>
2918   // 2008-01-25: 0,  1 - 4,       5,         100,     101 - 102, 103
2919   orig_kind = kind;
2920   kind = __kmp_sched_without_mods(kind);
2921 
2922   if (kind <= kmp_sched_lower || kind >= kmp_sched_upper ||
2923       (kind <= kmp_sched_lower_ext && kind >= kmp_sched_upper_std)) {
2924     // TODO: Hint needs attention in case we change the default schedule.
2925     __kmp_msg(kmp_ms_warning, KMP_MSG(ScheduleKindOutOfRange, kind),
2926               KMP_HNT(DefaultScheduleKindUsed, "static, no chunk"),
2927               __kmp_msg_null);
2928     kind = kmp_sched_default;
2929     chunk = 0; // ignore chunk value in case of bad kind
2930   }
2931 
2932   thread = __kmp_threads[gtid];
2933 
2934   __kmp_save_internal_controls(thread);
2935 
2936   if (kind < kmp_sched_upper_std) {
2937     if (kind == kmp_sched_static && chunk < KMP_DEFAULT_CHUNK) {
2938       // differ static chunked vs. unchunked:  chunk should be invalid to
2939       // indicate unchunked schedule (which is the default)
2940       thread->th.th_current_task->td_icvs.sched.r_sched_type = kmp_sch_static;
2941     } else {
2942       thread->th.th_current_task->td_icvs.sched.r_sched_type =
2943           __kmp_sch_map[kind - kmp_sched_lower - 1];
2944     }
2945   } else {
2946     //    __kmp_sch_map[ kind - kmp_sched_lower_ext + kmp_sched_upper_std -
2947     //    kmp_sched_lower - 2 ];
2948     thread->th.th_current_task->td_icvs.sched.r_sched_type =
2949         __kmp_sch_map[kind - kmp_sched_lower_ext + kmp_sched_upper_std -
2950                       kmp_sched_lower - 2];
2951   }
2952   __kmp_sched_apply_mods_intkind(
2953       orig_kind, &(thread->th.th_current_task->td_icvs.sched.r_sched_type));
2954   if (kind == kmp_sched_auto || chunk < 1) {
2955     // ignore parameter chunk for schedule auto
2956     thread->th.th_current_task->td_icvs.sched.chunk = KMP_DEFAULT_CHUNK;
2957   } else {
2958     thread->th.th_current_task->td_icvs.sched.chunk = chunk;
2959   }
2960 }
2961 
2962 /* Gets def_sched_var ICV values */
2963 void __kmp_get_schedule(int gtid, kmp_sched_t *kind, int *chunk) {
2964   kmp_info_t *thread;
2965   enum sched_type th_type;
2966 
2967   KF_TRACE(10, ("__kmp_get_schedule: thread %d\n", gtid));
2968   KMP_DEBUG_ASSERT(__kmp_init_serial);
2969 
2970   thread = __kmp_threads[gtid];
2971 
2972   th_type = thread->th.th_current_task->td_icvs.sched.r_sched_type;
2973   switch (SCHEDULE_WITHOUT_MODIFIERS(th_type)) {
2974   case kmp_sch_static:
2975   case kmp_sch_static_greedy:
2976   case kmp_sch_static_balanced:
2977     *kind = kmp_sched_static;
2978     __kmp_sched_apply_mods_stdkind(kind, th_type);
2979     *chunk = 0; // chunk was not set, try to show this fact via zero value
2980     return;
2981   case kmp_sch_static_chunked:
2982     *kind = kmp_sched_static;
2983     break;
2984   case kmp_sch_dynamic_chunked:
2985     *kind = kmp_sched_dynamic;
2986     break;
2987   case kmp_sch_guided_chunked:
2988   case kmp_sch_guided_iterative_chunked:
2989   case kmp_sch_guided_analytical_chunked:
2990     *kind = kmp_sched_guided;
2991     break;
2992   case kmp_sch_auto:
2993     *kind = kmp_sched_auto;
2994     break;
2995   case kmp_sch_trapezoidal:
2996     *kind = kmp_sched_trapezoidal;
2997     break;
2998 #if KMP_STATIC_STEAL_ENABLED
2999   case kmp_sch_static_steal:
3000     *kind = kmp_sched_static_steal;
3001     break;
3002 #endif
3003   default:
3004     KMP_FATAL(UnknownSchedulingType, th_type);
3005   }
3006 
3007   __kmp_sched_apply_mods_stdkind(kind, th_type);
3008   *chunk = thread->th.th_current_task->td_icvs.sched.chunk;
3009 }
3010 
3011 int __kmp_get_ancestor_thread_num(int gtid, int level) {
3012 
3013   int ii, dd;
3014   kmp_team_t *team;
3015   kmp_info_t *thr;
3016 
3017   KF_TRACE(10, ("__kmp_get_ancestor_thread_num: thread %d %d\n", gtid, level));
3018   KMP_DEBUG_ASSERT(__kmp_init_serial);
3019 
3020   // validate level
3021   if (level == 0)
3022     return 0;
3023   if (level < 0)
3024     return -1;
3025   thr = __kmp_threads[gtid];
3026   team = thr->th.th_team;
3027   ii = team->t.t_level;
3028   if (level > ii)
3029     return -1;
3030 
3031   if (thr->th.th_teams_microtask) {
3032     // AC: we are in teams region where multiple nested teams have same level
3033     int tlevel = thr->th.th_teams_level; // the level of the teams construct
3034     if (level <=
3035         tlevel) { // otherwise usual algorithm works (will not touch the teams)
3036       KMP_DEBUG_ASSERT(ii >= tlevel);
3037       // AC: As we need to pass by the teams league, we need to artificially
3038       // increase ii
3039       if (ii == tlevel) {
3040         ii += 2; // three teams have same level
3041       } else {
3042         ii++; // two teams have same level
3043       }
3044     }
3045   }
3046 
3047   if (ii == level)
3048     return __kmp_tid_from_gtid(gtid);
3049 
3050   dd = team->t.t_serialized;
3051   level++;
3052   while (ii > level) {
3053     for (dd = team->t.t_serialized; (dd > 0) && (ii > level); dd--, ii--) {
3054     }
3055     if ((team->t.t_serialized) && (!dd)) {
3056       team = team->t.t_parent;
3057       continue;
3058     }
3059     if (ii > level) {
3060       team = team->t.t_parent;
3061       dd = team->t.t_serialized;
3062       ii--;
3063     }
3064   }
3065 
3066   return (dd > 1) ? (0) : (team->t.t_master_tid);
3067 }
3068 
3069 int __kmp_get_team_size(int gtid, int level) {
3070 
3071   int ii, dd;
3072   kmp_team_t *team;
3073   kmp_info_t *thr;
3074 
3075   KF_TRACE(10, ("__kmp_get_team_size: thread %d %d\n", gtid, level));
3076   KMP_DEBUG_ASSERT(__kmp_init_serial);
3077 
3078   // validate level
3079   if (level == 0)
3080     return 1;
3081   if (level < 0)
3082     return -1;
3083   thr = __kmp_threads[gtid];
3084   team = thr->th.th_team;
3085   ii = team->t.t_level;
3086   if (level > ii)
3087     return -1;
3088 
3089   if (thr->th.th_teams_microtask) {
3090     // AC: we are in teams region where multiple nested teams have same level
3091     int tlevel = thr->th.th_teams_level; // the level of the teams construct
3092     if (level <=
3093         tlevel) { // otherwise usual algorithm works (will not touch the teams)
3094       KMP_DEBUG_ASSERT(ii >= tlevel);
3095       // AC: As we need to pass by the teams league, we need to artificially
3096       // increase ii
3097       if (ii == tlevel) {
3098         ii += 2; // three teams have same level
3099       } else {
3100         ii++; // two teams have same level
3101       }
3102     }
3103   }
3104 
3105   while (ii > level) {
3106     for (dd = team->t.t_serialized; (dd > 0) && (ii > level); dd--, ii--) {
3107     }
3108     if (team->t.t_serialized && (!dd)) {
3109       team = team->t.t_parent;
3110       continue;
3111     }
3112     if (ii > level) {
3113       team = team->t.t_parent;
3114       ii--;
3115     }
3116   }
3117 
3118   return team->t.t_nproc;
3119 }
3120 
3121 kmp_r_sched_t __kmp_get_schedule_global() {
3122   // This routine created because pairs (__kmp_sched, __kmp_chunk) and
3123   // (__kmp_static, __kmp_guided) may be changed by kmp_set_defaults
3124   // independently. So one can get the updated schedule here.
3125 
3126   kmp_r_sched_t r_sched;
3127 
3128   // create schedule from 4 globals: __kmp_sched, __kmp_chunk, __kmp_static,
3129   // __kmp_guided. __kmp_sched should keep original value, so that user can set
3130   // KMP_SCHEDULE multiple times, and thus have different run-time schedules in
3131   // different roots (even in OMP 2.5)
3132   enum sched_type s = SCHEDULE_WITHOUT_MODIFIERS(__kmp_sched);
3133   enum sched_type sched_modifiers = SCHEDULE_GET_MODIFIERS(__kmp_sched);
3134   if (s == kmp_sch_static) {
3135     // replace STATIC with more detailed schedule (balanced or greedy)
3136     r_sched.r_sched_type = __kmp_static;
3137   } else if (s == kmp_sch_guided_chunked) {
3138     // replace GUIDED with more detailed schedule (iterative or analytical)
3139     r_sched.r_sched_type = __kmp_guided;
3140   } else { // (STATIC_CHUNKED), or (DYNAMIC_CHUNKED), or other
3141     r_sched.r_sched_type = __kmp_sched;
3142   }
3143   SCHEDULE_SET_MODIFIERS(r_sched.r_sched_type, sched_modifiers);
3144 
3145   if (__kmp_chunk < KMP_DEFAULT_CHUNK) {
3146     // __kmp_chunk may be wrong here (if it was not ever set)
3147     r_sched.chunk = KMP_DEFAULT_CHUNK;
3148   } else {
3149     r_sched.chunk = __kmp_chunk;
3150   }
3151 
3152   return r_sched;
3153 }
3154 
3155 /* Allocate (realloc == FALSE) * or reallocate (realloc == TRUE)
3156    at least argc number of *t_argv entries for the requested team. */
3157 static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team, int realloc) {
3158 
3159   KMP_DEBUG_ASSERT(team);
3160   if (!realloc || argc > team->t.t_max_argc) {
3161 
3162     KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: needed entries=%d, "
3163                    "current entries=%d\n",
3164                    team->t.t_id, argc, (realloc) ? team->t.t_max_argc : 0));
3165     /* if previously allocated heap space for args, free them */
3166     if (realloc && team->t.t_argv != &team->t.t_inline_argv[0])
3167       __kmp_free((void *)team->t.t_argv);
3168 
3169     if (argc <= KMP_INLINE_ARGV_ENTRIES) {
3170       /* use unused space in the cache line for arguments */
3171       team->t.t_max_argc = KMP_INLINE_ARGV_ENTRIES;
3172       KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: inline allocate %d "
3173                      "argv entries\n",
3174                      team->t.t_id, team->t.t_max_argc));
3175       team->t.t_argv = &team->t.t_inline_argv[0];
3176       if (__kmp_storage_map) {
3177         __kmp_print_storage_map_gtid(
3178             -1, &team->t.t_inline_argv[0],
3179             &team->t.t_inline_argv[KMP_INLINE_ARGV_ENTRIES],
3180             (sizeof(void *) * KMP_INLINE_ARGV_ENTRIES), "team_%d.t_inline_argv",
3181             team->t.t_id);
3182       }
3183     } else {
3184       /* allocate space for arguments in the heap */
3185       team->t.t_max_argc = (argc <= (KMP_MIN_MALLOC_ARGV_ENTRIES >> 1))
3186                                ? KMP_MIN_MALLOC_ARGV_ENTRIES
3187                                : 2 * argc;
3188       KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: dynamic allocate %d "
3189                      "argv entries\n",
3190                      team->t.t_id, team->t.t_max_argc));
3191       team->t.t_argv =
3192           (void **)__kmp_page_allocate(sizeof(void *) * team->t.t_max_argc);
3193       if (__kmp_storage_map) {
3194         __kmp_print_storage_map_gtid(-1, &team->t.t_argv[0],
3195                                      &team->t.t_argv[team->t.t_max_argc],
3196                                      sizeof(void *) * team->t.t_max_argc,
3197                                      "team_%d.t_argv", team->t.t_id);
3198       }
3199     }
3200   }
3201 }
3202 
3203 static void __kmp_allocate_team_arrays(kmp_team_t *team, int max_nth) {
3204   int i;
3205   int num_disp_buff = max_nth > 1 ? __kmp_dispatch_num_buffers : 2;
3206   team->t.t_threads =
3207       (kmp_info_t **)__kmp_allocate(sizeof(kmp_info_t *) * max_nth);
3208   team->t.t_disp_buffer = (dispatch_shared_info_t *)__kmp_allocate(
3209       sizeof(dispatch_shared_info_t) * num_disp_buff);
3210   team->t.t_dispatch =
3211       (kmp_disp_t *)__kmp_allocate(sizeof(kmp_disp_t) * max_nth);
3212   team->t.t_implicit_task_taskdata =
3213       (kmp_taskdata_t *)__kmp_allocate(sizeof(kmp_taskdata_t) * max_nth);
3214   team->t.t_max_nproc = max_nth;
3215 
3216   /* setup dispatch buffers */
3217   for (i = 0; i < num_disp_buff; ++i) {
3218     team->t.t_disp_buffer[i].buffer_index = i;
3219     team->t.t_disp_buffer[i].doacross_buf_idx = i;
3220   }
3221 }
3222 
3223 static void __kmp_free_team_arrays(kmp_team_t *team) {
3224   /* Note: this does not free the threads in t_threads (__kmp_free_threads) */
3225   int i;
3226   for (i = 0; i < team->t.t_max_nproc; ++i) {
3227     if (team->t.t_dispatch[i].th_disp_buffer != NULL) {
3228       __kmp_free(team->t.t_dispatch[i].th_disp_buffer);
3229       team->t.t_dispatch[i].th_disp_buffer = NULL;
3230     }
3231   }
3232 #if KMP_USE_HIER_SCHED
3233   __kmp_dispatch_free_hierarchies(team);
3234 #endif
3235   __kmp_free(team->t.t_threads);
3236   __kmp_free(team->t.t_disp_buffer);
3237   __kmp_free(team->t.t_dispatch);
3238   __kmp_free(team->t.t_implicit_task_taskdata);
3239   team->t.t_threads = NULL;
3240   team->t.t_disp_buffer = NULL;
3241   team->t.t_dispatch = NULL;
3242   team->t.t_implicit_task_taskdata = 0;
3243 }
3244 
3245 static void __kmp_reallocate_team_arrays(kmp_team_t *team, int max_nth) {
3246   kmp_info_t **oldThreads = team->t.t_threads;
3247 
3248   __kmp_free(team->t.t_disp_buffer);
3249   __kmp_free(team->t.t_dispatch);
3250   __kmp_free(team->t.t_implicit_task_taskdata);
3251   __kmp_allocate_team_arrays(team, max_nth);
3252 
3253   KMP_MEMCPY(team->t.t_threads, oldThreads,
3254              team->t.t_nproc * sizeof(kmp_info_t *));
3255 
3256   __kmp_free(oldThreads);
3257 }
3258 
3259 static kmp_internal_control_t __kmp_get_global_icvs(void) {
3260 
3261   kmp_r_sched_t r_sched =
3262       __kmp_get_schedule_global(); // get current state of scheduling globals
3263 
3264   KMP_DEBUG_ASSERT(__kmp_nested_proc_bind.used > 0);
3265 
3266   kmp_internal_control_t g_icvs = {
3267     0, // int serial_nesting_level; //corresponds to value of th_team_serialized
3268     (kmp_int8)__kmp_global.g.g_dynamic, // internal control for dynamic
3269     // adjustment of threads (per thread)
3270     (kmp_int8)__kmp_env_blocktime, // int bt_set; //internal control for
3271     // whether blocktime is explicitly set
3272     __kmp_dflt_blocktime, // int blocktime; //internal control for blocktime
3273 #if KMP_USE_MONITOR
3274     __kmp_bt_intervals, // int bt_intervals; //internal control for blocktime
3275 // intervals
3276 #endif
3277     __kmp_dflt_team_nth, // int nproc; //internal control for # of threads for
3278     // next parallel region (per thread)
3279     // (use a max ub on value if __kmp_parallel_initialize not called yet)
3280     __kmp_cg_max_nth, // int thread_limit;
3281     __kmp_dflt_max_active_levels, // int max_active_levels; //internal control
3282     // for max_active_levels
3283     r_sched, // kmp_r_sched_t sched; //internal control for runtime schedule
3284     // {sched,chunk} pair
3285     __kmp_nested_proc_bind.bind_types[0],
3286     __kmp_default_device,
3287     NULL // struct kmp_internal_control *next;
3288   };
3289 
3290   return g_icvs;
3291 }
3292 
3293 static kmp_internal_control_t __kmp_get_x_global_icvs(const kmp_team_t *team) {
3294 
3295   kmp_internal_control_t gx_icvs;
3296   gx_icvs.serial_nesting_level =
3297       0; // probably =team->t.t_serial like in save_inter_controls
3298   copy_icvs(&gx_icvs, &team->t.t_threads[0]->th.th_current_task->td_icvs);
3299   gx_icvs.next = NULL;
3300 
3301   return gx_icvs;
3302 }
3303 
3304 static void __kmp_initialize_root(kmp_root_t *root) {
3305   int f;
3306   kmp_team_t *root_team;
3307   kmp_team_t *hot_team;
3308   int hot_team_max_nth;
3309   kmp_r_sched_t r_sched =
3310       __kmp_get_schedule_global(); // get current state of scheduling globals
3311   kmp_internal_control_t r_icvs = __kmp_get_global_icvs();
3312   KMP_DEBUG_ASSERT(root);
3313   KMP_ASSERT(!root->r.r_begin);
3314 
3315   /* setup the root state structure */
3316   __kmp_init_lock(&root->r.r_begin_lock);
3317   root->r.r_begin = FALSE;
3318   root->r.r_active = FALSE;
3319   root->r.r_in_parallel = 0;
3320   root->r.r_blocktime = __kmp_dflt_blocktime;
3321 #if KMP_AFFINITY_SUPPORTED
3322   root->r.r_affinity_assigned = FALSE;
3323 #endif
3324 
3325   /* setup the root team for this task */
3326   /* allocate the root team structure */
3327   KF_TRACE(10, ("__kmp_initialize_root: before root_team\n"));
3328 
3329   root_team =
3330       __kmp_allocate_team(root,
3331                           1, // new_nproc
3332                           1, // max_nproc
3333 #if OMPT_SUPPORT
3334                           ompt_data_none, // root parallel id
3335 #endif
3336                           __kmp_nested_proc_bind.bind_types[0], &r_icvs,
3337                           0 // argc
3338                           USE_NESTED_HOT_ARG(NULL) // primary thread is unknown
3339       );
3340 #if USE_DEBUGGER
3341   // Non-NULL value should be assigned to make the debugger display the root
3342   // team.
3343   TCW_SYNC_PTR(root_team->t.t_pkfn, (microtask_t)(~0));
3344 #endif
3345 
3346   KF_TRACE(10, ("__kmp_initialize_root: after root_team = %p\n", root_team));
3347 
3348   root->r.r_root_team = root_team;
3349   root_team->t.t_control_stack_top = NULL;
3350 
3351   /* initialize root team */
3352   root_team->t.t_threads[0] = NULL;
3353   root_team->t.t_nproc = 1;
3354   root_team->t.t_serialized = 1;
3355   // TODO???: root_team->t.t_max_active_levels = __kmp_dflt_max_active_levels;
3356   root_team->t.t_sched.sched = r_sched.sched;
3357   KA_TRACE(
3358       20,
3359       ("__kmp_initialize_root: init root team %d arrived: join=%u, plain=%u\n",
3360        root_team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
3361 
3362   /* setup the  hot team for this task */
3363   /* allocate the hot team structure */
3364   KF_TRACE(10, ("__kmp_initialize_root: before hot_team\n"));
3365 
3366   hot_team =
3367       __kmp_allocate_team(root,
3368                           1, // new_nproc
3369                           __kmp_dflt_team_nth_ub * 2, // max_nproc
3370 #if OMPT_SUPPORT
3371                           ompt_data_none, // root parallel id
3372 #endif
3373                           __kmp_nested_proc_bind.bind_types[0], &r_icvs,
3374                           0 // argc
3375                           USE_NESTED_HOT_ARG(NULL) // primary thread is unknown
3376       );
3377   KF_TRACE(10, ("__kmp_initialize_root: after hot_team = %p\n", hot_team));
3378 
3379   root->r.r_hot_team = hot_team;
3380   root_team->t.t_control_stack_top = NULL;
3381 
3382   /* first-time initialization */
3383   hot_team->t.t_parent = root_team;
3384 
3385   /* initialize hot team */
3386   hot_team_max_nth = hot_team->t.t_max_nproc;
3387   for (f = 0; f < hot_team_max_nth; ++f) {
3388     hot_team->t.t_threads[f] = NULL;
3389   }
3390   hot_team->t.t_nproc = 1;
3391   // TODO???: hot_team->t.t_max_active_levels = __kmp_dflt_max_active_levels;
3392   hot_team->t.t_sched.sched = r_sched.sched;
3393   hot_team->t.t_size_changed = 0;
3394 }
3395 
3396 #ifdef KMP_DEBUG
3397 
3398 typedef struct kmp_team_list_item {
3399   kmp_team_p const *entry;
3400   struct kmp_team_list_item *next;
3401 } kmp_team_list_item_t;
3402 typedef kmp_team_list_item_t *kmp_team_list_t;
3403 
3404 static void __kmp_print_structure_team_accum( // Add team to list of teams.
3405     kmp_team_list_t list, // List of teams.
3406     kmp_team_p const *team // Team to add.
3407 ) {
3408 
3409   // List must terminate with item where both entry and next are NULL.
3410   // Team is added to the list only once.
3411   // List is sorted in ascending order by team id.
3412   // Team id is *not* a key.
3413 
3414   kmp_team_list_t l;
3415 
3416   KMP_DEBUG_ASSERT(list != NULL);
3417   if (team == NULL) {
3418     return;
3419   }
3420 
3421   __kmp_print_structure_team_accum(list, team->t.t_parent);
3422   __kmp_print_structure_team_accum(list, team->t.t_next_pool);
3423 
3424   // Search list for the team.
3425   l = list;
3426   while (l->next != NULL && l->entry != team) {
3427     l = l->next;
3428   }
3429   if (l->next != NULL) {
3430     return; // Team has been added before, exit.
3431   }
3432 
3433   // Team is not found. Search list again for insertion point.
3434   l = list;
3435   while (l->next != NULL && l->entry->t.t_id <= team->t.t_id) {
3436     l = l->next;
3437   }
3438 
3439   // Insert team.
3440   {
3441     kmp_team_list_item_t *item = (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC(
3442         sizeof(kmp_team_list_item_t));
3443     *item = *l;
3444     l->entry = team;
3445     l->next = item;
3446   }
3447 }
3448 
3449 static void __kmp_print_structure_team(char const *title, kmp_team_p const *team
3450 
3451 ) {
3452   __kmp_printf("%s", title);
3453   if (team != NULL) {
3454     __kmp_printf("%2x %p\n", team->t.t_id, team);
3455   } else {
3456     __kmp_printf(" - (nil)\n");
3457   }
3458 }
3459 
3460 static void __kmp_print_structure_thread(char const *title,
3461                                          kmp_info_p const *thread) {
3462   __kmp_printf("%s", title);
3463   if (thread != NULL) {
3464     __kmp_printf("%2d %p\n", thread->th.th_info.ds.ds_gtid, thread);
3465   } else {
3466     __kmp_printf(" - (nil)\n");
3467   }
3468 }
3469 
3470 void __kmp_print_structure(void) {
3471 
3472   kmp_team_list_t list;
3473 
3474   // Initialize list of teams.
3475   list =
3476       (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC(sizeof(kmp_team_list_item_t));
3477   list->entry = NULL;
3478   list->next = NULL;
3479 
3480   __kmp_printf("\n------------------------------\nGlobal Thread "
3481                "Table\n------------------------------\n");
3482   {
3483     int gtid;
3484     for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3485       __kmp_printf("%2d", gtid);
3486       if (__kmp_threads != NULL) {
3487         __kmp_printf(" %p", __kmp_threads[gtid]);
3488       }
3489       if (__kmp_root != NULL) {
3490         __kmp_printf(" %p", __kmp_root[gtid]);
3491       }
3492       __kmp_printf("\n");
3493     }
3494   }
3495 
3496   // Print out __kmp_threads array.
3497   __kmp_printf("\n------------------------------\nThreads\n--------------------"
3498                "----------\n");
3499   if (__kmp_threads != NULL) {
3500     int gtid;
3501     for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3502       kmp_info_t const *thread = __kmp_threads[gtid];
3503       if (thread != NULL) {
3504         __kmp_printf("GTID %2d %p:\n", gtid, thread);
3505         __kmp_printf("    Our Root:        %p\n", thread->th.th_root);
3506         __kmp_print_structure_team("    Our Team:     ", thread->th.th_team);
3507         __kmp_print_structure_team("    Serial Team:  ",
3508                                    thread->th.th_serial_team);
3509         __kmp_printf("    Threads:      %2d\n", thread->th.th_team_nproc);
3510         __kmp_print_structure_thread("    Primary:      ",
3511                                      thread->th.th_team_master);
3512         __kmp_printf("    Serialized?:  %2d\n", thread->th.th_team_serialized);
3513         __kmp_printf("    Set NProc:    %2d\n", thread->th.th_set_nproc);
3514         __kmp_printf("    Set Proc Bind: %2d\n", thread->th.th_set_proc_bind);
3515         __kmp_print_structure_thread("    Next in pool: ",
3516                                      thread->th.th_next_pool);
3517         __kmp_printf("\n");
3518         __kmp_print_structure_team_accum(list, thread->th.th_team);
3519         __kmp_print_structure_team_accum(list, thread->th.th_serial_team);
3520       }
3521     }
3522   } else {
3523     __kmp_printf("Threads array is not allocated.\n");
3524   }
3525 
3526   // Print out __kmp_root array.
3527   __kmp_printf("\n------------------------------\nUbers\n----------------------"
3528                "--------\n");
3529   if (__kmp_root != NULL) {
3530     int gtid;
3531     for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3532       kmp_root_t const *root = __kmp_root[gtid];
3533       if (root != NULL) {
3534         __kmp_printf("GTID %2d %p:\n", gtid, root);
3535         __kmp_print_structure_team("    Root Team:    ", root->r.r_root_team);
3536         __kmp_print_structure_team("    Hot Team:     ", root->r.r_hot_team);
3537         __kmp_print_structure_thread("    Uber Thread:  ",
3538                                      root->r.r_uber_thread);
3539         __kmp_printf("    Active?:      %2d\n", root->r.r_active);
3540         __kmp_printf("    In Parallel:  %2d\n",
3541                      KMP_ATOMIC_LD_RLX(&root->r.r_in_parallel));
3542         __kmp_printf("\n");
3543         __kmp_print_structure_team_accum(list, root->r.r_root_team);
3544         __kmp_print_structure_team_accum(list, root->r.r_hot_team);
3545       }
3546     }
3547   } else {
3548     __kmp_printf("Ubers array is not allocated.\n");
3549   }
3550 
3551   __kmp_printf("\n------------------------------\nTeams\n----------------------"
3552                "--------\n");
3553   while (list->next != NULL) {
3554     kmp_team_p const *team = list->entry;
3555     int i;
3556     __kmp_printf("Team %2x %p:\n", team->t.t_id, team);
3557     __kmp_print_structure_team("    Parent Team:      ", team->t.t_parent);
3558     __kmp_printf("    Primary TID:      %2d\n", team->t.t_master_tid);
3559     __kmp_printf("    Max threads:      %2d\n", team->t.t_max_nproc);
3560     __kmp_printf("    Levels of serial: %2d\n", team->t.t_serialized);
3561     __kmp_printf("    Number threads:   %2d\n", team->t.t_nproc);
3562     for (i = 0; i < team->t.t_nproc; ++i) {
3563       __kmp_printf("    Thread %2d:      ", i);
3564       __kmp_print_structure_thread("", team->t.t_threads[i]);
3565     }
3566     __kmp_print_structure_team("    Next in pool:     ", team->t.t_next_pool);
3567     __kmp_printf("\n");
3568     list = list->next;
3569   }
3570 
3571   // Print out __kmp_thread_pool and __kmp_team_pool.
3572   __kmp_printf("\n------------------------------\nPools\n----------------------"
3573                "--------\n");
3574   __kmp_print_structure_thread("Thread pool:          ",
3575                                CCAST(kmp_info_t *, __kmp_thread_pool));
3576   __kmp_print_structure_team("Team pool:            ",
3577                              CCAST(kmp_team_t *, __kmp_team_pool));
3578   __kmp_printf("\n");
3579 
3580   // Free team list.
3581   while (list != NULL) {
3582     kmp_team_list_item_t *item = list;
3583     list = list->next;
3584     KMP_INTERNAL_FREE(item);
3585   }
3586 }
3587 
3588 #endif
3589 
//---------------------------------------------------------------------------
//  Support for the per-thread fast random number generator.
//  Table of primes (64 entries); __kmp_init_random selects one of them as a
//  thread's multiplier.
static const unsigned __kmp_primes[] = {
    0x9e3779b1, 0xffe6cc59, 0x2109f6dd, 0x43977ab5,
    0xba5703f5, 0xb495a877, 0xe1626741, 0x79695e6b,
    0xbc98c09f, 0xd5bee2b3, 0x287488f9, 0x3af18231,
    0x9677cd4d, 0xbe3a6929, 0xadc6a877, 0xdcf0674b,
    0xbe4d6fe9, 0x5f15e201, 0x99afc3fd, 0xf3f16801,
    0xe222cfff, 0x24ba5fdb, 0x0620452d, 0x79f149e3,
    0xc8b93f49, 0x972702cd, 0xb07dd827, 0x6c97d5ed,
    0x085a3d61, 0x46eb5ea7, 0x3d9910ed, 0x2e687b5b,
    0x29609227, 0x6eb081f1, 0x0954c4e1, 0x9d114db9,
    0x542acfa9, 0xb3e6bd7b, 0x0742d917, 0xe9f3ffa7,
    0x54581edb, 0xf2480f45, 0x0bb9288f, 0xef1affc7,
    0x85fa0ca7, 0x3ccc14db, 0xe6baf34b, 0x343377f7,
    0x5ca19031, 0xe6d9293b, 0xf0a9f391, 0x5d2e980b,
    0xfc411073, 0xc3749363, 0xb892d829, 0x3549366b,
    0x629750ad, 0xb98294e5, 0x892d9483, 0xc235baf3,
    0x3d2402a3, 0x6bdef3c9, 0xbec333cd, 0x40c9520f};
3605 
3606 //---------------------------------------------------------------------------
3607 //  __kmp_get_random: Get a random number using a linear congruential method.
3608 unsigned short __kmp_get_random(kmp_info_t *thread) {
3609   unsigned x = thread->th.th_x;
3610   unsigned short r = (unsigned short)(x >> 16);
3611 
3612   thread->th.th_x = x * thread->th.th_a + 1;
3613 
3614   KA_TRACE(30, ("__kmp_get_random: THREAD: %d, RETURN: %u\n",
3615                 thread->th.th_info.ds.ds_tid, r));
3616 
3617   return r;
3618 }
3619 //--------------------------------------------------------
3620 // __kmp_init_random: Initialize a random number generator
3621 void __kmp_init_random(kmp_info_t *thread) {
3622   unsigned seed = thread->th.th_info.ds.ds_tid;
3623 
3624   thread->th.th_a =
3625       __kmp_primes[seed % (sizeof(__kmp_primes) / sizeof(__kmp_primes[0]))];
3626   thread->th.th_x = (seed + 1) * thread->th.th_a + 1;
3627   KA_TRACE(30,
3628            ("__kmp_init_random: THREAD: %u; A: %u\n", seed, thread->th.th_a));
3629 }
3630 
3631 #if KMP_OS_WINDOWS
3632 /* reclaim array entries for root threads that are already dead, returns number
3633  * reclaimed */
3634 static int __kmp_reclaim_dead_roots(void) {
3635   int i, r = 0;
3636 
3637   for (i = 0; i < __kmp_threads_capacity; ++i) {
3638     if (KMP_UBER_GTID(i) &&
3639         !__kmp_still_running((kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[i])) &&
3640         !__kmp_root[i]
3641              ->r.r_active) { // AC: reclaim only roots died in non-active state
3642       r += __kmp_unregister_root_other_thread(i);
3643     }
3644   }
3645   return r;
3646 }
3647 #endif
3648 
3649 /* This function attempts to create free entries in __kmp_threads and
3650    __kmp_root, and returns the number of free entries generated.
3651 
3652    For Windows* OS static library, the first mechanism used is to reclaim array
3653    entries for root threads that are already dead.
3654 
3655    On all platforms, expansion is attempted on the arrays __kmp_threads_ and
3656    __kmp_root, with appropriate update to __kmp_threads_capacity. Array
3657    capacity is increased by doubling with clipping to __kmp_tp_capacity, if
3658    threadprivate cache array has been created. Synchronization with
3659    __kmpc_threadprivate_cached is done using __kmp_tp_cached_lock.
3660 
3661    After any dead root reclamation, if the clipping value allows array expansion
3662    to result in the generation of a total of nNeed free slots, the function does
3663    that expansion. If not, nothing is done beyond the possible initial root
3664    thread reclamation.
3665 
3666    If any argument is negative, the behavior is undefined. */
3667 static int __kmp_expand_threads(int nNeed) {
3668   int added = 0;
3669   int minimumRequiredCapacity;
3670   int newCapacity;
3671   kmp_info_t **newThreads;
3672   kmp_root_t **newRoot;
3673 
3674   // All calls to __kmp_expand_threads should be under __kmp_forkjoin_lock, so
3675   // resizing __kmp_threads does not need additional protection if foreign
3676   // threads are present
3677 
3678 #if KMP_OS_WINDOWS && !KMP_DYNAMIC_LIB
3679   /* only for Windows static library */
3680   /* reclaim array entries for root threads that are already dead */
3681   added = __kmp_reclaim_dead_roots();
3682 
3683   if (nNeed) {
3684     nNeed -= added;
3685     if (nNeed < 0)
3686       nNeed = 0;
3687   }
3688 #endif
3689   if (nNeed <= 0)
3690     return added;
3691 
3692   // Note that __kmp_threads_capacity is not bounded by __kmp_max_nth. If
3693   // __kmp_max_nth is set to some value less than __kmp_sys_max_nth by the
3694   // user via KMP_DEVICE_THREAD_LIMIT, then __kmp_threads_capacity may become
3695   // > __kmp_max_nth in one of two ways:
3696   //
3697   // 1) The initialization thread (gtid = 0) exits.  __kmp_threads[0]
3698   //    may not be reused by another thread, so we may need to increase
3699   //    __kmp_threads_capacity to __kmp_max_nth + 1.
3700   //
3701   // 2) New foreign root(s) are encountered.  We always register new foreign
3702   //    roots. This may cause a smaller # of threads to be allocated at
3703   //    subsequent parallel regions, but the worker threads hang around (and
3704   //    eventually go to sleep) and need slots in the __kmp_threads[] array.
3705   //
3706   // Anyway, that is the reason for moving the check to see if
3707   // __kmp_max_nth was exceeded into __kmp_reserve_threads()
3708   // instead of having it performed here. -BB
3709 
3710   KMP_DEBUG_ASSERT(__kmp_sys_max_nth >= __kmp_threads_capacity);
3711 
3712   /* compute expansion headroom to check if we can expand */
3713   if (__kmp_sys_max_nth - __kmp_threads_capacity < nNeed) {
3714     /* possible expansion too small -- give up */
3715     return added;
3716   }
3717   minimumRequiredCapacity = __kmp_threads_capacity + nNeed;
3718 
3719   newCapacity = __kmp_threads_capacity;
3720   do {
3721     newCapacity = newCapacity <= (__kmp_sys_max_nth >> 1) ? (newCapacity << 1)
3722                                                           : __kmp_sys_max_nth;
3723   } while (newCapacity < minimumRequiredCapacity);
3724   newThreads = (kmp_info_t **)__kmp_allocate(
3725       (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * newCapacity + CACHE_LINE);
3726   newRoot =
3727       (kmp_root_t **)((char *)newThreads + sizeof(kmp_info_t *) * newCapacity);
3728   KMP_MEMCPY(newThreads, __kmp_threads,
3729              __kmp_threads_capacity * sizeof(kmp_info_t *));
3730   KMP_MEMCPY(newRoot, __kmp_root,
3731              __kmp_threads_capacity * sizeof(kmp_root_t *));
3732   // Put old __kmp_threads array on a list. Any ongoing references to the old
3733   // list will be valid. This list is cleaned up at library shutdown.
3734   kmp_old_threads_list_t *node =
3735       (kmp_old_threads_list_t *)__kmp_allocate(sizeof(kmp_old_threads_list_t));
3736   node->threads = __kmp_threads;
3737   node->next = __kmp_old_threads_list;
3738   __kmp_old_threads_list = node;
3739 
3740   *(kmp_info_t * *volatile *)&__kmp_threads = newThreads;
3741   *(kmp_root_t * *volatile *)&__kmp_root = newRoot;
3742   added += newCapacity - __kmp_threads_capacity;
3743   *(volatile int *)&__kmp_threads_capacity = newCapacity;
3744 
3745   if (newCapacity > __kmp_tp_capacity) {
3746     __kmp_acquire_bootstrap_lock(&__kmp_tp_cached_lock);
3747     if (__kmp_tp_cached && newCapacity > __kmp_tp_capacity) {
3748       __kmp_threadprivate_resize_cache(newCapacity);
3749     } else { // increase __kmp_tp_capacity to correspond with kmp_threads size
3750       *(volatile int *)&__kmp_tp_capacity = newCapacity;
3751     }
3752     __kmp_release_bootstrap_lock(&__kmp_tp_cached_lock);
3753   }
3754 
3755   return added;
3756 }
3757 
3758 /* Register the current thread as a root thread and obtain our gtid. We must
3759    have the __kmp_initz_lock held at this point. The argument is TRUE only if
3760    we are the thread that calls from __kmp_do_serial_initialize(). */
3761 int __kmp_register_root(int initial_thread) {
3762   kmp_info_t *root_thread;
3763   kmp_root_t *root;
3764   int gtid;
3765   int capacity;
3766   __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
3767   KA_TRACE(20, ("__kmp_register_root: entered\n"));
3768   KMP_MB();
3769 
3770   /* 2007-03-02:
3771      If initial thread did not invoke OpenMP RTL yet, and this thread is not an
3772      initial one, "__kmp_all_nth >= __kmp_threads_capacity" condition does not
3773      work as expected -- it may return false (that means there is at least one
3774      empty slot in __kmp_threads array), but it is possible the only free slot
3775      is #0, which is reserved for initial thread and so cannot be used for this
3776      one. Following code workarounds this bug.
3777 
3778      However, right solution seems to be not reserving slot #0 for initial
3779      thread because:
3780      (1) there is no magic in slot #0,
3781      (2) we cannot detect initial thread reliably (the first thread which does
3782         serial initialization may be not a real initial thread).
3783   */
3784   capacity = __kmp_threads_capacity;
3785   if (!initial_thread && TCR_PTR(__kmp_threads[0]) == NULL) {
3786     --capacity;
3787   }
3788 
3789   // If it is not for initializing the hidden helper team, we need to take
3790   // __kmp_hidden_helper_threads_num out of the capacity because it is included
3791   // in __kmp_threads_capacity.
3792   if (__kmp_enable_hidden_helper && !TCR_4(__kmp_init_hidden_helper_threads)) {
3793     capacity -= __kmp_hidden_helper_threads_num;
3794   }
3795 
3796   /* see if there are too many threads */
3797   if (__kmp_all_nth >= capacity && !__kmp_expand_threads(1)) {
3798     if (__kmp_tp_cached) {
3799       __kmp_fatal(KMP_MSG(CantRegisterNewThread),
3800                   KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity),
3801                   KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null);
3802     } else {
3803       __kmp_fatal(KMP_MSG(CantRegisterNewThread), KMP_HNT(SystemLimitOnThreads),
3804                   __kmp_msg_null);
3805     }
3806   }
3807 
3808   // When hidden helper task is enabled, __kmp_threads is organized as follows:
3809   // 0: initial thread, also a regular OpenMP thread.
3810   // [1, __kmp_hidden_helper_threads_num]: slots for hidden helper threads.
3811   // [__kmp_hidden_helper_threads_num + 1, __kmp_threads_capacity): slots for
3812   // regular OpenMP threads.
3813   if (TCR_4(__kmp_init_hidden_helper_threads)) {
3814     // Find an available thread slot for hidden helper thread. Slots for hidden
3815     // helper threads start from 1 to __kmp_hidden_helper_threads_num.
3816     for (gtid = 1; TCR_PTR(__kmp_threads[gtid]) != NULL &&
3817                    gtid <= __kmp_hidden_helper_threads_num;
3818          gtid++)
3819       ;
3820     KMP_ASSERT(gtid <= __kmp_hidden_helper_threads_num);
3821     KA_TRACE(1, ("__kmp_register_root: found slot in threads array for "
3822                  "hidden helper thread: T#%d\n",
3823                  gtid));
3824   } else {
3825     /* find an available thread slot */
3826     // Don't reassign the zero slot since we need that to only be used by
3827     // initial thread. Slots for hidden helper threads should also be skipped.
3828     if (initial_thread && TCR_PTR(__kmp_threads[0]) == NULL) {
3829       gtid = 0;
3830     } else {
3831       for (gtid = __kmp_hidden_helper_threads_num + 1;
3832            TCR_PTR(__kmp_threads[gtid]) != NULL; gtid++)
3833         ;
3834     }
3835     KA_TRACE(
3836         1, ("__kmp_register_root: found slot in threads array: T#%d\n", gtid));
3837     KMP_ASSERT(gtid < __kmp_threads_capacity);
3838   }
3839 
3840   /* update global accounting */
3841   __kmp_all_nth++;
3842   TCW_4(__kmp_nth, __kmp_nth + 1);
3843 
3844   // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search) for low
3845   // numbers of procs, and method #2 (keyed API call) for higher numbers.
3846   if (__kmp_adjust_gtid_mode) {
3847     if (__kmp_all_nth >= __kmp_tls_gtid_min) {
3848       if (TCR_4(__kmp_gtid_mode) != 2) {
3849         TCW_4(__kmp_gtid_mode, 2);
3850       }
3851     } else {
3852       if (TCR_4(__kmp_gtid_mode) != 1) {
3853         TCW_4(__kmp_gtid_mode, 1);
3854       }
3855     }
3856   }
3857 
3858 #ifdef KMP_ADJUST_BLOCKTIME
3859   /* Adjust blocktime to zero if necessary            */
3860   /* Middle initialization might not have occurred yet */
3861   if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
3862     if (__kmp_nth > __kmp_avail_proc) {
3863       __kmp_zero_bt = TRUE;
3864     }
3865   }
3866 #endif /* KMP_ADJUST_BLOCKTIME */
3867 
3868   /* setup this new hierarchy */
3869   if (!(root = __kmp_root[gtid])) {
3870     root = __kmp_root[gtid] = (kmp_root_t *)__kmp_allocate(sizeof(kmp_root_t));
3871     KMP_DEBUG_ASSERT(!root->r.r_root_team);
3872   }
3873 
3874 #if KMP_STATS_ENABLED
3875   // Initialize stats as soon as possible (right after gtid assignment).
3876   __kmp_stats_thread_ptr = __kmp_stats_list->push_back(gtid);
3877   __kmp_stats_thread_ptr->startLife();
3878   KMP_SET_THREAD_STATE(SERIAL_REGION);
3879   KMP_INIT_PARTITIONED_TIMERS(OMP_serial);
3880 #endif
3881   __kmp_initialize_root(root);
3882 
3883   /* setup new root thread structure */
3884   if (root->r.r_uber_thread) {
3885     root_thread = root->r.r_uber_thread;
3886   } else {
3887     root_thread = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t));
3888     if (__kmp_storage_map) {
3889       __kmp_print_thread_storage_map(root_thread, gtid);
3890     }
3891     root_thread->th.th_info.ds.ds_gtid = gtid;
3892 #if OMPT_SUPPORT
3893     root_thread->th.ompt_thread_info.thread_data = ompt_data_none;
3894 #endif
3895     root_thread->th.th_root = root;
3896     if (__kmp_env_consistency_check) {
3897       root_thread->th.th_cons = __kmp_allocate_cons_stack(gtid);
3898     }
3899 #if USE_FAST_MEMORY
3900     __kmp_initialize_fast_memory(root_thread);
3901 #endif /* USE_FAST_MEMORY */
3902 
3903 #if KMP_USE_BGET
3904     KMP_DEBUG_ASSERT(root_thread->th.th_local.bget_data == NULL);
3905     __kmp_initialize_bget(root_thread);
3906 #endif
3907     __kmp_init_random(root_thread); // Initialize random number generator
3908   }
3909 
3910   /* setup the serial team held in reserve by the root thread */
3911   if (!root_thread->th.th_serial_team) {
3912     kmp_internal_control_t r_icvs = __kmp_get_global_icvs();
3913     KF_TRACE(10, ("__kmp_register_root: before serial_team\n"));
3914     root_thread->th.th_serial_team = __kmp_allocate_team(
3915         root, 1, 1,
3916 #if OMPT_SUPPORT
3917         ompt_data_none, // root parallel id
3918 #endif
3919         proc_bind_default, &r_icvs, 0 USE_NESTED_HOT_ARG(NULL));
3920   }
3921   KMP_ASSERT(root_thread->th.th_serial_team);
3922   KF_TRACE(10, ("__kmp_register_root: after serial_team = %p\n",
3923                 root_thread->th.th_serial_team));
3924 
3925   /* drop root_thread into place */
3926   TCW_SYNC_PTR(__kmp_threads[gtid], root_thread);
3927 
3928   root->r.r_root_team->t.t_threads[0] = root_thread;
3929   root->r.r_hot_team->t.t_threads[0] = root_thread;
3930   root_thread->th.th_serial_team->t.t_threads[0] = root_thread;
3931   // AC: the team created in reserve, not for execution (it is unused for now).
3932   root_thread->th.th_serial_team->t.t_serialized = 0;
3933   root->r.r_uber_thread = root_thread;
3934 
3935   /* initialize the thread, get it ready to go */
3936   __kmp_initialize_info(root_thread, root->r.r_root_team, 0, gtid);
3937   TCW_4(__kmp_init_gtid, TRUE);
3938 
3939   /* prepare the primary thread for get_gtid() */
3940   __kmp_gtid_set_specific(gtid);
3941 
3942 #if USE_ITT_BUILD
3943   __kmp_itt_thread_name(gtid);
3944 #endif /* USE_ITT_BUILD */
3945 
3946 #ifdef KMP_TDATA_GTID
3947   __kmp_gtid = gtid;
3948 #endif
3949   __kmp_create_worker(gtid, root_thread, __kmp_stksize);
3950   KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() == gtid);
3951 
3952   KA_TRACE(20, ("__kmp_register_root: T#%d init T#%d(%d:%d) arrived: join=%u, "
3953                 "plain=%u\n",
3954                 gtid, __kmp_gtid_from_tid(0, root->r.r_hot_team),
3955                 root->r.r_hot_team->t.t_id, 0, KMP_INIT_BARRIER_STATE,
3956                 KMP_INIT_BARRIER_STATE));
3957   { // Initialize barrier data.
3958     int b;
3959     for (b = 0; b < bs_last_barrier; ++b) {
3960       root_thread->th.th_bar[b].bb.b_arrived = KMP_INIT_BARRIER_STATE;
3961 #if USE_DEBUGGER
3962       root_thread->th.th_bar[b].bb.b_worker_arrived = 0;
3963 #endif
3964     }
3965   }
3966   KMP_DEBUG_ASSERT(root->r.r_hot_team->t.t_bar[bs_forkjoin_barrier].b_arrived ==
3967                    KMP_INIT_BARRIER_STATE);
3968 
3969 #if KMP_AFFINITY_SUPPORTED
3970   root_thread->th.th_current_place = KMP_PLACE_UNDEFINED;
3971   root_thread->th.th_new_place = KMP_PLACE_UNDEFINED;
3972   root_thread->th.th_first_place = KMP_PLACE_UNDEFINED;
3973   root_thread->th.th_last_place = KMP_PLACE_UNDEFINED;
3974 #endif /* KMP_AFFINITY_SUPPORTED */
3975   root_thread->th.th_def_allocator = __kmp_def_allocator;
3976   root_thread->th.th_prev_level = 0;
3977   root_thread->th.th_prev_num_threads = 1;
3978 
3979   kmp_cg_root_t *tmp = (kmp_cg_root_t *)__kmp_allocate(sizeof(kmp_cg_root_t));
3980   tmp->cg_root = root_thread;
3981   tmp->cg_thread_limit = __kmp_cg_max_nth;
3982   tmp->cg_nthreads = 1;
3983   KA_TRACE(100, ("__kmp_register_root: Thread %p created node %p with"
3984                  " cg_nthreads init to 1\n",
3985                  root_thread, tmp));
3986   tmp->up = NULL;
3987   root_thread->th.th_cg_roots = tmp;
3988 
3989   __kmp_root_counter++;
3990 
3991 #if OMPT_SUPPORT
3992   if (!initial_thread && ompt_enabled.enabled) {
3993 
3994     kmp_info_t *root_thread = ompt_get_thread();
3995 
3996     ompt_set_thread_state(root_thread, ompt_state_overhead);
3997 
3998     if (ompt_enabled.ompt_callback_thread_begin) {
3999       ompt_callbacks.ompt_callback(ompt_callback_thread_begin)(
4000           ompt_thread_initial, __ompt_get_thread_data_internal());
4001     }
4002     ompt_data_t *task_data;
4003     ompt_data_t *parallel_data;
4004     __ompt_get_task_info_internal(0, NULL, &task_data, NULL, &parallel_data,
4005                                   NULL);
4006     if (ompt_enabled.ompt_callback_implicit_task) {
4007       ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
4008           ompt_scope_begin, parallel_data, task_data, 1, 1, ompt_task_initial);
4009     }
4010 
4011     ompt_set_thread_state(root_thread, ompt_state_work_serial);
4012   }
4013 #endif
4014 #if OMPD_SUPPORT
4015   if (ompd_state & OMPD_ENABLE_BP)
4016     ompd_bp_thread_begin();
4017 #endif
4018 
4019   KMP_MB();
4020   __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
4021 
4022   return gtid;
4023 }
4024 
#if KMP_NESTED_HOT_TEAMS
// Frees the hot team that thr keeps at the given nesting level, after first
// recursing into the deeper levels kept by each member of that team. Returns
// the accumulated count of threads released (a team's primary thread is not
// counted), or 0 when thr has no hot team at this level.
static int __kmp_free_hot_teams(kmp_root_t *root, kmp_info_t *thr, int level,
                                const int max_level) {
  kmp_hot_team_ptr_t *level_table = thr->th.th_hot_teams;
  if (level_table == NULL || level_table[level].hot_team == NULL) {
    return 0;
  }
  KMP_DEBUG_ASSERT(level < max_level);

  kmp_team_t *nested_team = level_table[level].hot_team;
  const int team_size = level_table[level].hot_team_nth;
  int freed = team_size - 1; // primary thread is not freed

  const bool has_deeper_level = level < max_level - 1;
  if (has_deeper_level) {
    for (int idx = 0; idx < team_size; ++idx) {
      kmp_info_t *member = nested_team->t.t_threads[idx];
      freed += __kmp_free_hot_teams(root, member, level + 1, max_level);
      // Only the workers (idx > 0) have their hot team table released here.
      if (idx > 0 && member->th.th_hot_teams != NULL) {
        __kmp_free(member->th.th_hot_teams);
        member->th.th_hot_teams = NULL;
      }
    }
  }
  __kmp_free_team(root, nested_team, NULL);
  return freed;
}
#endif
4051 
4052 // Resets a root thread and clear its root and hot teams.
4053 // Returns the number of __kmp_threads entries directly and indirectly freed.
4054 static int __kmp_reset_root(int gtid, kmp_root_t *root) {
4055   kmp_team_t *root_team = root->r.r_root_team;
4056   kmp_team_t *hot_team = root->r.r_hot_team;
4057   int n = hot_team->t.t_nproc;
4058   int i;
4059 
4060   KMP_DEBUG_ASSERT(!root->r.r_active);
4061 
4062   root->r.r_root_team = NULL;
4063   root->r.r_hot_team = NULL;
4064   // __kmp_free_team() does not free hot teams, so we have to clear r_hot_team
4065   // before call to __kmp_free_team().
4066   __kmp_free_team(root, root_team USE_NESTED_HOT_ARG(NULL));
4067 #if KMP_NESTED_HOT_TEAMS
4068   if (__kmp_hot_teams_max_level >
4069       0) { // need to free nested hot teams and their threads if any
4070     for (i = 0; i < hot_team->t.t_nproc; ++i) {
4071       kmp_info_t *th = hot_team->t.t_threads[i];
4072       if (__kmp_hot_teams_max_level > 1) {
4073         n += __kmp_free_hot_teams(root, th, 1, __kmp_hot_teams_max_level);
4074       }
4075       if (th->th.th_hot_teams) {
4076         __kmp_free(th->th.th_hot_teams);
4077         th->th.th_hot_teams = NULL;
4078       }
4079     }
4080   }
4081 #endif
4082   __kmp_free_team(root, hot_team USE_NESTED_HOT_ARG(NULL));
4083 
4084   // Before we can reap the thread, we need to make certain that all other
4085   // threads in the teams that had this root as ancestor have stopped trying to
4086   // steal tasks.
4087   if (__kmp_tasking_mode != tskm_immediate_exec) {
4088     __kmp_wait_to_unref_task_teams();
4089   }
4090 
4091 #if KMP_OS_WINDOWS
4092   /* Close Handle of root duplicated in __kmp_create_worker (tr #62919) */
4093   KA_TRACE(
4094       10, ("__kmp_reset_root: free handle, th = %p, handle = %" KMP_UINTPTR_SPEC
4095            "\n",
4096            (LPVOID) & (root->r.r_uber_thread->th),
4097            root->r.r_uber_thread->th.th_info.ds.ds_thread));
4098   __kmp_free_handle(root->r.r_uber_thread->th.th_info.ds.ds_thread);
4099 #endif /* KMP_OS_WINDOWS */
4100 
4101 #if OMPD_SUPPORT
4102   if (ompd_state & OMPD_ENABLE_BP)
4103     ompd_bp_thread_end();
4104 #endif
4105 
4106 #if OMPT_SUPPORT
4107   ompt_data_t *task_data;
4108   ompt_data_t *parallel_data;
4109   __ompt_get_task_info_internal(0, NULL, &task_data, NULL, &parallel_data,
4110                                 NULL);
4111   if (ompt_enabled.ompt_callback_implicit_task) {
4112     ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
4113         ompt_scope_end, parallel_data, task_data, 0, 1, ompt_task_initial);
4114   }
4115   if (ompt_enabled.ompt_callback_thread_end) {
4116     ompt_callbacks.ompt_callback(ompt_callback_thread_end)(
4117         &(root->r.r_uber_thread->th.ompt_thread_info.thread_data));
4118   }
4119 #endif
4120 
4121   TCW_4(__kmp_nth,
4122         __kmp_nth - 1); // __kmp_reap_thread will decrement __kmp_all_nth.
4123   i = root->r.r_uber_thread->th.th_cg_roots->cg_nthreads--;
4124   KA_TRACE(100, ("__kmp_reset_root: Thread %p decrement cg_nthreads on node %p"
4125                  " to %d\n",
4126                  root->r.r_uber_thread, root->r.r_uber_thread->th.th_cg_roots,
4127                  root->r.r_uber_thread->th.th_cg_roots->cg_nthreads));
4128   if (i == 1) {
4129     // need to free contention group structure
4130     KMP_DEBUG_ASSERT(root->r.r_uber_thread ==
4131                      root->r.r_uber_thread->th.th_cg_roots->cg_root);
4132     KMP_DEBUG_ASSERT(root->r.r_uber_thread->th.th_cg_roots->up == NULL);
4133     __kmp_free(root->r.r_uber_thread->th.th_cg_roots);
4134     root->r.r_uber_thread->th.th_cg_roots = NULL;
4135   }
4136   __kmp_reap_thread(root->r.r_uber_thread, 1);
4137 
4138   // We canot put root thread to __kmp_thread_pool, so we have to reap it
4139   // instead of freeing.
4140   root->r.r_uber_thread = NULL;
4141   /* mark root as no longer in use */
4142   root->r.r_begin = FALSE;
4143 
4144   return n;
4145 }
4146 
4147 void __kmp_unregister_root_current_thread(int gtid) {
4148   KA_TRACE(1, ("__kmp_unregister_root_current_thread: enter T#%d\n", gtid));
4149   /* this lock should be ok, since unregister_root_current_thread is never
4150      called during an abort, only during a normal close. furthermore, if you
4151      have the forkjoin lock, you should never try to get the initz lock */
4152   __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
4153   if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
4154     KC_TRACE(10, ("__kmp_unregister_root_current_thread: already finished, "
4155                   "exiting T#%d\n",
4156                   gtid));
4157     __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
4158     return;
4159   }
4160   kmp_root_t *root = __kmp_root[gtid];
4161 
4162   KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
4163   KMP_ASSERT(KMP_UBER_GTID(gtid));
4164   KMP_ASSERT(root == __kmp_threads[gtid]->th.th_root);
4165   KMP_ASSERT(root->r.r_active == FALSE);
4166 
4167   KMP_MB();
4168 
4169   kmp_info_t *thread = __kmp_threads[gtid];
4170   kmp_team_t *team = thread->th.th_team;
4171   kmp_task_team_t *task_team = thread->th.th_task_team;
4172 
4173   // we need to wait for the proxy tasks before finishing the thread
4174   if (task_team != NULL && (task_team->tt.tt_found_proxy_tasks ||
4175                             task_team->tt.tt_hidden_helper_task_encountered)) {
4176 #if OMPT_SUPPORT
4177     // the runtime is shutting down so we won't report any events
4178     thread->th.ompt_thread_info.state = ompt_state_undefined;
4179 #endif
4180     __kmp_task_team_wait(thread, team USE_ITT_BUILD_ARG(NULL));
4181   }
4182 
4183   __kmp_reset_root(gtid, root);
4184 
4185   KMP_MB();
4186   KC_TRACE(10,
4187            ("__kmp_unregister_root_current_thread: T#%d unregistered\n", gtid));
4188 
4189   __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
4190 }
4191 
#if KMP_OS_WINDOWS
/* Unregisters a root thread that is not the current thread.
   __kmp_forkjoin_lock must already be held by the caller.
   Returns the number of __kmp_threads entries freed as a result, as reported
   by __kmp_reset_root(). */
static int __kmp_unregister_root_other_thread(int gtid) {
  kmp_root_t *target_root = __kmp_root[gtid];

  KA_TRACE(1, ("__kmp_unregister_root_other_thread: enter T#%d\n", gtid));
  KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
  KMP_ASSERT(KMP_UBER_GTID(gtid));
  KMP_ASSERT(target_root == __kmp_threads[gtid]->th.th_root);
  KMP_ASSERT(target_root->r.r_active == FALSE);

  const int freed = __kmp_reset_root(gtid, target_root);
  KC_TRACE(10,
           ("__kmp_unregister_root_other_thread: T#%d unregistered\n", gtid));
  return freed;
}
#endif
4212 
#if KMP_DEBUG
// Debug helper: prints the calling thread's gtid and tid together with its
// thread structure, current team, serial team, current task, and the parent
// recorded for its implicit task in the current team.
void __kmp_task_info() {
  const kmp_int32 my_gtid = __kmp_entry_gtid();
  const kmp_int32 my_tid = __kmp_tid_from_gtid(my_gtid);
  kmp_info_t *self = __kmp_threads[my_gtid];
  kmp_team_t *serial_team = self->th.th_serial_team;
  kmp_team_t *current_team = self->th.th_team;

  __kmp_printf(
      "__kmp_task_info: gtid=%d tid=%d t_thread=%p team=%p steam=%p curtask=%p "
      "ptask=%p\n",
      my_gtid, my_tid, self, current_team, serial_team,
      self->th.th_current_task,
      current_team->t.t_implicit_task_taskdata[my_tid].td_parent);
}
#endif // KMP_DEBUG
4229 
4230 /* TODO optimize with one big memclr, take out what isn't needed, split
4231    responsibility to workers as much as possible, and delay initialization of
4232    features as much as possible  */
4233 static void __kmp_initialize_info(kmp_info_t *this_thr, kmp_team_t *team,
4234                                   int tid, int gtid) {
4235   /* this_thr->th.th_info.ds.ds_gtid is setup in
4236      kmp_allocate_thread/create_worker.
4237      this_thr->th.th_serial_team is setup in __kmp_allocate_thread */
4238   KMP_DEBUG_ASSERT(this_thr != NULL);
4239   KMP_DEBUG_ASSERT(this_thr->th.th_serial_team);
4240   KMP_DEBUG_ASSERT(team);
4241   KMP_DEBUG_ASSERT(team->t.t_threads);
4242   KMP_DEBUG_ASSERT(team->t.t_dispatch);
4243   kmp_info_t *master = team->t.t_threads[0];
4244   KMP_DEBUG_ASSERT(master);
4245   KMP_DEBUG_ASSERT(master->th.th_root);
4246 
4247   KMP_MB();
4248 
4249   TCW_SYNC_PTR(this_thr->th.th_team, team);
4250 
4251   this_thr->th.th_info.ds.ds_tid = tid;
4252   this_thr->th.th_set_nproc = 0;
4253   if (__kmp_tasking_mode != tskm_immediate_exec)
4254     // When tasking is possible, threads are not safe to reap until they are
4255     // done tasking; this will be set when tasking code is exited in wait
4256     this_thr->th.th_reap_state = KMP_NOT_SAFE_TO_REAP;
4257   else // no tasking --> always safe to reap
4258     this_thr->th.th_reap_state = KMP_SAFE_TO_REAP;
4259   this_thr->th.th_set_proc_bind = proc_bind_default;
4260 #if KMP_AFFINITY_SUPPORTED
4261   this_thr->th.th_new_place = this_thr->th.th_current_place;
4262 #endif
4263   this_thr->th.th_root = master->th.th_root;
4264 
4265   /* setup the thread's cache of the team structure */
4266   this_thr->th.th_team_nproc = team->t.t_nproc;
4267   this_thr->th.th_team_master = master;
4268   this_thr->th.th_team_serialized = team->t.t_serialized;
4269 
4270   KMP_DEBUG_ASSERT(team->t.t_implicit_task_taskdata);
4271 
4272   KF_TRACE(10, ("__kmp_initialize_info1: T#%d:%d this_thread=%p curtask=%p\n",
4273                 tid, gtid, this_thr, this_thr->th.th_current_task));
4274 
4275   __kmp_init_implicit_task(this_thr->th.th_team_master->th.th_ident, this_thr,
4276                            team, tid, TRUE);
4277 
4278   KF_TRACE(10, ("__kmp_initialize_info2: T#%d:%d this_thread=%p curtask=%p\n",
4279                 tid, gtid, this_thr, this_thr->th.th_current_task));
4280   // TODO: Initialize ICVs from parent; GEH - isn't that already done in
4281   // __kmp_initialize_team()?
4282 
4283   /* TODO no worksharing in speculative threads */
4284   this_thr->th.th_dispatch = &team->t.t_dispatch[tid];
4285 
4286   this_thr->th.th_local.this_construct = 0;
4287 
4288   if (!this_thr->th.th_pri_common) {
4289     this_thr->th.th_pri_common =
4290         (struct common_table *)__kmp_allocate(sizeof(struct common_table));
4291     if (__kmp_storage_map) {
4292       __kmp_print_storage_map_gtid(
4293           gtid, this_thr->th.th_pri_common, this_thr->th.th_pri_common + 1,
4294           sizeof(struct common_table), "th_%d.th_pri_common\n", gtid);
4295     }
4296     this_thr->th.th_pri_head = NULL;
4297   }
4298 
4299   if (this_thr != master && // Primary thread's CG root is initialized elsewhere
4300       this_thr->th.th_cg_roots != master->th.th_cg_roots) { // CG root not set
4301     // Make new thread's CG root same as primary thread's
4302     KMP_DEBUG_ASSERT(master->th.th_cg_roots);
4303     kmp_cg_root_t *tmp = this_thr->th.th_cg_roots;
4304     if (tmp) {
4305       // worker changes CG, need to check if old CG should be freed
4306       int i = tmp->cg_nthreads--;
4307       KA_TRACE(100, ("__kmp_initialize_info: Thread %p decrement cg_nthreads"
4308                      " on node %p of thread %p to %d\n",
4309                      this_thr, tmp, tmp->cg_root, tmp->cg_nthreads));
4310       if (i == 1) {
4311         __kmp_free(tmp); // last thread left CG --> free it
4312       }
4313     }
4314     this_thr->th.th_cg_roots = master->th.th_cg_roots;
4315     // Increment new thread's CG root's counter to add the new thread
4316     this_thr->th.th_cg_roots->cg_nthreads++;
4317     KA_TRACE(100, ("__kmp_initialize_info: Thread %p increment cg_nthreads on"
4318                    " node %p of thread %p to %d\n",
4319                    this_thr, this_thr->th.th_cg_roots,
4320                    this_thr->th.th_cg_roots->cg_root,
4321                    this_thr->th.th_cg_roots->cg_nthreads));
4322     this_thr->th.th_current_task->td_icvs.thread_limit =
4323         this_thr->th.th_cg_roots->cg_thread_limit;
4324   }
4325 
4326   /* Initialize dynamic dispatch */
4327   {
4328     volatile kmp_disp_t *dispatch = this_thr->th.th_dispatch;
4329     // Use team max_nproc since this will never change for the team.
4330     size_t disp_size =
4331         sizeof(dispatch_private_info_t) *
4332         (team->t.t_max_nproc == 1 ? 1 : __kmp_dispatch_num_buffers);
4333     KD_TRACE(10, ("__kmp_initialize_info: T#%d max_nproc: %d\n", gtid,
4334                   team->t.t_max_nproc));
4335     KMP_ASSERT(dispatch);
4336     KMP_DEBUG_ASSERT(team->t.t_dispatch);
4337     KMP_DEBUG_ASSERT(dispatch == &team->t.t_dispatch[tid]);
4338 
4339     dispatch->th_disp_index = 0;
4340     dispatch->th_doacross_buf_idx = 0;
4341     if (!dispatch->th_disp_buffer) {
4342       dispatch->th_disp_buffer =
4343           (dispatch_private_info_t *)__kmp_allocate(disp_size);
4344 
4345       if (__kmp_storage_map) {
4346         __kmp_print_storage_map_gtid(
4347             gtid, &dispatch->th_disp_buffer[0],
4348             &dispatch->th_disp_buffer[team->t.t_max_nproc == 1
4349                                           ? 1
4350                                           : __kmp_dispatch_num_buffers],
4351             disp_size,
4352             "th_%d.th_dispatch.th_disp_buffer "
4353             "(team_%d.t_dispatch[%d].th_disp_buffer)",
4354             gtid, team->t.t_id, gtid);
4355       }
4356     } else {
4357       memset(&dispatch->th_disp_buffer[0], '\0', disp_size);
4358     }
4359 
4360     dispatch->th_dispatch_pr_current = 0;
4361     dispatch->th_dispatch_sh_current = 0;
4362 
4363     dispatch->th_deo_fcn = 0; /* ORDERED     */
4364     dispatch->th_dxo_fcn = 0; /* END ORDERED */
4365   }
4366 
4367   this_thr->th.th_next_pool = NULL;
4368 
4369   if (!this_thr->th.th_task_state_memo_stack) {
4370     size_t i;
4371     this_thr->th.th_task_state_memo_stack =
4372         (kmp_uint8 *)__kmp_allocate(4 * sizeof(kmp_uint8));
4373     this_thr->th.th_task_state_top = 0;
4374     this_thr->th.th_task_state_stack_sz = 4;
4375     for (i = 0; i < this_thr->th.th_task_state_stack_sz;
4376          ++i) // zero init the stack
4377       this_thr->th.th_task_state_memo_stack[i] = 0;
4378   }
4379 
4380   KMP_DEBUG_ASSERT(!this_thr->th.th_spin_here);
4381   KMP_DEBUG_ASSERT(this_thr->th.th_next_waiting == 0);
4382 
4383   KMP_MB();
4384 }
4385 
/* Allocate a new thread for the requesting team. This is only called from
   within a forkjoin critical section. We first try to get an available
   thread from the thread pool. If none is available, we fork a new one,
   assuming we are able to create it. This should be assured, as the
   caller should check on this first. */
4391 kmp_info_t *__kmp_allocate_thread(kmp_root_t *root, kmp_team_t *team,
4392                                   int new_tid) {
4393   kmp_team_t *serial_team;
4394   kmp_info_t *new_thr;
4395   int new_gtid;
4396 
4397   KA_TRACE(20, ("__kmp_allocate_thread: T#%d\n", __kmp_get_gtid()));
4398   KMP_DEBUG_ASSERT(root && team);
4399 #if !KMP_NESTED_HOT_TEAMS
4400   KMP_DEBUG_ASSERT(KMP_MASTER_GTID(__kmp_get_gtid()));
4401 #endif
4402   KMP_MB();
4403 
4404   /* first, try to get one from the thread pool */
4405   if (__kmp_thread_pool) {
4406     new_thr = CCAST(kmp_info_t *, __kmp_thread_pool);
4407     __kmp_thread_pool = (volatile kmp_info_t *)new_thr->th.th_next_pool;
4408     if (new_thr == __kmp_thread_pool_insert_pt) {
4409       __kmp_thread_pool_insert_pt = NULL;
4410     }
4411     TCW_4(new_thr->th.th_in_pool, FALSE);
4412     __kmp_suspend_initialize_thread(new_thr);
4413     __kmp_lock_suspend_mx(new_thr);
4414     if (new_thr->th.th_active_in_pool == TRUE) {
4415       KMP_DEBUG_ASSERT(new_thr->th.th_active == TRUE);
4416       KMP_ATOMIC_DEC(&__kmp_thread_pool_active_nth);
4417       new_thr->th.th_active_in_pool = FALSE;
4418     }
4419     __kmp_unlock_suspend_mx(new_thr);
4420 
4421     KA_TRACE(20, ("__kmp_allocate_thread: T#%d using thread T#%d\n",
4422                   __kmp_get_gtid(), new_thr->th.th_info.ds.ds_gtid));
4423     KMP_ASSERT(!new_thr->th.th_team);
4424     KMP_DEBUG_ASSERT(__kmp_nth < __kmp_threads_capacity);
4425 
4426     /* setup the thread structure */
4427     __kmp_initialize_info(new_thr, team, new_tid,
4428                           new_thr->th.th_info.ds.ds_gtid);
4429     KMP_DEBUG_ASSERT(new_thr->th.th_serial_team);
4430 
4431     TCW_4(__kmp_nth, __kmp_nth + 1);
4432 
4433     new_thr->th.th_task_state = 0;
4434     new_thr->th.th_task_state_top = 0;
4435     new_thr->th.th_task_state_stack_sz = 4;
4436 
4437     if (__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
4438       // Make sure pool thread has transitioned to waiting on own thread struct
4439       KMP_DEBUG_ASSERT(new_thr->th.th_used_in_team.load() == 0);
4440       // Thread activated in __kmp_allocate_team when increasing team size
4441     }
4442 
4443 #ifdef KMP_ADJUST_BLOCKTIME
4444     /* Adjust blocktime back to zero if necessary */
4445     /* Middle initialization might not have occurred yet */
4446     if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
4447       if (__kmp_nth > __kmp_avail_proc) {
4448         __kmp_zero_bt = TRUE;
4449       }
4450     }
4451 #endif /* KMP_ADJUST_BLOCKTIME */
4452 
4453 #if KMP_DEBUG
4454     // If thread entered pool via __kmp_free_thread, wait_flag should !=
4455     // KMP_BARRIER_PARENT_FLAG.
4456     int b;
4457     kmp_balign_t *balign = new_thr->th.th_bar;
4458     for (b = 0; b < bs_last_barrier; ++b)
4459       KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
4460 #endif
4461 
4462     KF_TRACE(10, ("__kmp_allocate_thread: T#%d using thread %p T#%d\n",
4463                   __kmp_get_gtid(), new_thr, new_thr->th.th_info.ds.ds_gtid));
4464 
4465     KMP_MB();
4466     return new_thr;
4467   }
4468 
4469   /* no, well fork a new one */
4470   KMP_ASSERT(__kmp_nth == __kmp_all_nth);
4471   KMP_ASSERT(__kmp_all_nth < __kmp_threads_capacity);
4472 
4473 #if KMP_USE_MONITOR
4474   // If this is the first worker thread the RTL is creating, then also
4475   // launch the monitor thread.  We try to do this as early as possible.
4476   if (!TCR_4(__kmp_init_monitor)) {
4477     __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
4478     if (!TCR_4(__kmp_init_monitor)) {
4479       KF_TRACE(10, ("before __kmp_create_monitor\n"));
4480       TCW_4(__kmp_init_monitor, 1);
4481       __kmp_create_monitor(&__kmp_monitor);
4482       KF_TRACE(10, ("after __kmp_create_monitor\n"));
4483 #if KMP_OS_WINDOWS
4484       // AC: wait until monitor has started. This is a fix for CQ232808.
4485       // The reason is that if the library is loaded/unloaded in a loop with
4486       // small (parallel) work in between, then there is high probability that
4487       // monitor thread started after the library shutdown. At shutdown it is
4488       // too late to cope with the problem, because when the primary thread is
4489       // in DllMain (process detach) the monitor has no chances to start (it is
4490       // blocked), and primary thread has no means to inform the monitor that
4491       // the library has gone, because all the memory which the monitor can
4492       // access is going to be released/reset.
4493       while (TCR_4(__kmp_init_monitor) < 2) {
4494         KMP_YIELD(TRUE);
4495       }
4496       KF_TRACE(10, ("after monitor thread has started\n"));
4497 #endif
4498     }
4499     __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
4500   }
4501 #endif
4502 
4503   KMP_MB();
4504 
4505   {
4506     int new_start_gtid = TCR_4(__kmp_init_hidden_helper_threads)
4507                              ? 1
4508                              : __kmp_hidden_helper_threads_num + 1;
4509 
4510     for (new_gtid = new_start_gtid; TCR_PTR(__kmp_threads[new_gtid]) != NULL;
4511          ++new_gtid) {
4512       KMP_DEBUG_ASSERT(new_gtid < __kmp_threads_capacity);
4513     }
4514 
4515     if (TCR_4(__kmp_init_hidden_helper_threads)) {
4516       KMP_DEBUG_ASSERT(new_gtid <= __kmp_hidden_helper_threads_num);
4517     }
4518   }
4519 
4520   /* allocate space for it. */
4521   new_thr = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t));
4522 
4523   TCW_SYNC_PTR(__kmp_threads[new_gtid], new_thr);
4524 
4525 #if USE_ITT_BUILD && USE_ITT_NOTIFY && KMP_DEBUG
4526   // suppress race conditions detection on synchronization flags in debug mode
4527   // this helps to analyze library internals eliminating false positives
4528   __itt_suppress_mark_range(
4529       __itt_suppress_range, __itt_suppress_threading_errors,
4530       &new_thr->th.th_sleep_loc, sizeof(new_thr->th.th_sleep_loc));
4531   __itt_suppress_mark_range(
4532       __itt_suppress_range, __itt_suppress_threading_errors,
4533       &new_thr->th.th_reap_state, sizeof(new_thr->th.th_reap_state));
4534 #if KMP_OS_WINDOWS
4535   __itt_suppress_mark_range(
4536       __itt_suppress_range, __itt_suppress_threading_errors,
4537       &new_thr->th.th_suspend_init, sizeof(new_thr->th.th_suspend_init));
4538 #else
4539   __itt_suppress_mark_range(__itt_suppress_range,
4540                             __itt_suppress_threading_errors,
4541                             &new_thr->th.th_suspend_init_count,
4542                             sizeof(new_thr->th.th_suspend_init_count));
4543 #endif
4544   // TODO: check if we need to also suppress b_arrived flags
4545   __itt_suppress_mark_range(__itt_suppress_range,
4546                             __itt_suppress_threading_errors,
4547                             CCAST(kmp_uint64 *, &new_thr->th.th_bar[0].bb.b_go),
4548                             sizeof(new_thr->th.th_bar[0].bb.b_go));
4549   __itt_suppress_mark_range(__itt_suppress_range,
4550                             __itt_suppress_threading_errors,
4551                             CCAST(kmp_uint64 *, &new_thr->th.th_bar[1].bb.b_go),
4552                             sizeof(new_thr->th.th_bar[1].bb.b_go));
4553   __itt_suppress_mark_range(__itt_suppress_range,
4554                             __itt_suppress_threading_errors,
4555                             CCAST(kmp_uint64 *, &new_thr->th.th_bar[2].bb.b_go),
4556                             sizeof(new_thr->th.th_bar[2].bb.b_go));
4557 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY && KMP_DEBUG */
4558   if (__kmp_storage_map) {
4559     __kmp_print_thread_storage_map(new_thr, new_gtid);
4560   }
4561 
4562   // add the reserve serialized team, initialized from the team's primary thread
4563   {
4564     kmp_internal_control_t r_icvs = __kmp_get_x_global_icvs(team);
4565     KF_TRACE(10, ("__kmp_allocate_thread: before th_serial/serial_team\n"));
4566     new_thr->th.th_serial_team = serial_team =
4567         (kmp_team_t *)__kmp_allocate_team(root, 1, 1,
4568 #if OMPT_SUPPORT
4569                                           ompt_data_none, // root parallel id
4570 #endif
4571                                           proc_bind_default, &r_icvs,
4572                                           0 USE_NESTED_HOT_ARG(NULL));
4573   }
4574   KMP_ASSERT(serial_team);
4575   serial_team->t.t_serialized = 0; // AC: the team created in reserve, not for
4576   // execution (it is unused for now).
4577   serial_team->t.t_threads[0] = new_thr;
4578   KF_TRACE(10,
4579            ("__kmp_allocate_thread: after th_serial/serial_team : new_thr=%p\n",
4580             new_thr));
4581 
4582   /* setup the thread structures */
4583   __kmp_initialize_info(new_thr, team, new_tid, new_gtid);
4584 
4585 #if USE_FAST_MEMORY
4586   __kmp_initialize_fast_memory(new_thr);
4587 #endif /* USE_FAST_MEMORY */
4588 
4589 #if KMP_USE_BGET
4590   KMP_DEBUG_ASSERT(new_thr->th.th_local.bget_data == NULL);
4591   __kmp_initialize_bget(new_thr);
4592 #endif
4593 
4594   __kmp_init_random(new_thr); // Initialize random number generator
4595 
4596   /* Initialize these only once when thread is grabbed for a team allocation */
4597   KA_TRACE(20,
4598            ("__kmp_allocate_thread: T#%d init go fork=%u, plain=%u\n",
4599             __kmp_get_gtid(), KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
4600 
4601   int b;
4602   kmp_balign_t *balign = new_thr->th.th_bar;
4603   for (b = 0; b < bs_last_barrier; ++b) {
4604     balign[b].bb.b_go = KMP_INIT_BARRIER_STATE;
4605     balign[b].bb.team = NULL;
4606     balign[b].bb.wait_flag = KMP_BARRIER_NOT_WAITING;
4607     balign[b].bb.use_oncore_barrier = 0;
4608   }
4609 
4610   TCW_PTR(new_thr->th.th_sleep_loc, NULL);
4611   new_thr->th.th_sleep_loc_type = flag_unset;
4612 
4613   new_thr->th.th_spin_here = FALSE;
4614   new_thr->th.th_next_waiting = 0;
4615 #if KMP_OS_UNIX
4616   new_thr->th.th_blocking = false;
4617 #endif
4618 
4619 #if KMP_AFFINITY_SUPPORTED
4620   new_thr->th.th_current_place = KMP_PLACE_UNDEFINED;
4621   new_thr->th.th_new_place = KMP_PLACE_UNDEFINED;
4622   new_thr->th.th_first_place = KMP_PLACE_UNDEFINED;
4623   new_thr->th.th_last_place = KMP_PLACE_UNDEFINED;
4624 #endif
4625   new_thr->th.th_def_allocator = __kmp_def_allocator;
4626   new_thr->th.th_prev_level = 0;
4627   new_thr->th.th_prev_num_threads = 1;
4628 
4629   TCW_4(new_thr->th.th_in_pool, FALSE);
4630   new_thr->th.th_active_in_pool = FALSE;
4631   TCW_4(new_thr->th.th_active, TRUE);
4632 
4633   /* adjust the global counters */
4634   __kmp_all_nth++;
4635   __kmp_nth++;
4636 
4637   // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search) for low
4638   // numbers of procs, and method #2 (keyed API call) for higher numbers.
4639   if (__kmp_adjust_gtid_mode) {
4640     if (__kmp_all_nth >= __kmp_tls_gtid_min) {
4641       if (TCR_4(__kmp_gtid_mode) != 2) {
4642         TCW_4(__kmp_gtid_mode, 2);
4643       }
4644     } else {
4645       if (TCR_4(__kmp_gtid_mode) != 1) {
4646         TCW_4(__kmp_gtid_mode, 1);
4647       }
4648     }
4649   }
4650 
4651 #ifdef KMP_ADJUST_BLOCKTIME
4652   /* Adjust blocktime back to zero if necessary       */
4653   /* Middle initialization might not have occurred yet */
4654   if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
4655     if (__kmp_nth > __kmp_avail_proc) {
4656       __kmp_zero_bt = TRUE;
4657     }
4658   }
4659 #endif /* KMP_ADJUST_BLOCKTIME */
4660 
4661   /* actually fork it and create the new worker thread */
4662   KF_TRACE(
4663       10, ("__kmp_allocate_thread: before __kmp_create_worker: %p\n", new_thr));
4664   __kmp_create_worker(new_gtid, new_thr, __kmp_stksize);
4665   KF_TRACE(10,
4666            ("__kmp_allocate_thread: after __kmp_create_worker: %p\n", new_thr));
4667 
4668   KA_TRACE(20, ("__kmp_allocate_thread: T#%d forked T#%d\n", __kmp_get_gtid(),
4669                 new_gtid));
4670   KMP_MB();
4671   return new_thr;
4672 }
4673 
4674 /* Reinitialize team for reuse.
4675    The hot team code calls this case at every fork barrier, so EPCC barrier
4676    test are extremely sensitive to changes in it, esp. writes to the team
4677    struct, which cause a cache invalidation in all threads.
4678    IF YOU TOUCH THIS ROUTINE, RUN EPCC C SYNCBENCH ON A BIG-IRON MACHINE!!! */
4679 static void __kmp_reinitialize_team(kmp_team_t *team,
4680                                     kmp_internal_control_t *new_icvs,
4681                                     ident_t *loc) {
4682   KF_TRACE(10, ("__kmp_reinitialize_team: enter this_thread=%p team=%p\n",
4683                 team->t.t_threads[0], team));
4684   KMP_DEBUG_ASSERT(team && new_icvs);
4685   KMP_DEBUG_ASSERT((!TCR_4(__kmp_init_parallel)) || new_icvs->nproc);
4686   KMP_CHECK_UPDATE(team->t.t_ident, loc);
4687 
4688   KMP_CHECK_UPDATE(team->t.t_id, KMP_GEN_TEAM_ID());
4689   // Copy ICVs to the primary thread's implicit taskdata
4690   __kmp_init_implicit_task(loc, team->t.t_threads[0], team, 0, FALSE);
4691   copy_icvs(&team->t.t_implicit_task_taskdata[0].td_icvs, new_icvs);
4692 
4693   KF_TRACE(10, ("__kmp_reinitialize_team: exit this_thread=%p team=%p\n",
4694                 team->t.t_threads[0], team));
4695 }
4696 
4697 /* Initialize the team data structure.
4698    This assumes the t_threads and t_max_nproc are already set.
4699    Also, we don't touch the arguments */
4700 static void __kmp_initialize_team(kmp_team_t *team, int new_nproc,
4701                                   kmp_internal_control_t *new_icvs,
4702                                   ident_t *loc) {
4703   KF_TRACE(10, ("__kmp_initialize_team: enter: team=%p\n", team));
4704 
4705   /* verify */
4706   KMP_DEBUG_ASSERT(team);
4707   KMP_DEBUG_ASSERT(new_nproc <= team->t.t_max_nproc);
4708   KMP_DEBUG_ASSERT(team->t.t_threads);
4709   KMP_MB();
4710 
4711   team->t.t_master_tid = 0; /* not needed */
4712   /* team->t.t_master_bar;        not needed */
4713   team->t.t_serialized = new_nproc > 1 ? 0 : 1;
4714   team->t.t_nproc = new_nproc;
4715 
4716   /* team->t.t_parent     = NULL; TODO not needed & would mess up hot team */
4717   team->t.t_next_pool = NULL;
4718   /* memset( team->t.t_threads, 0, sizeof(kmp_info_t*)*new_nproc ); would mess
4719    * up hot team */
4720 
4721   TCW_SYNC_PTR(team->t.t_pkfn, NULL); /* not needed */
4722   team->t.t_invoke = NULL; /* not needed */
4723 
4724   // TODO???: team->t.t_max_active_levels       = new_max_active_levels;
4725   team->t.t_sched.sched = new_icvs->sched.sched;
4726 
4727 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
4728   team->t.t_fp_control_saved = FALSE; /* not needed */
4729   team->t.t_x87_fpu_control_word = 0; /* not needed */
4730   team->t.t_mxcsr = 0; /* not needed */
4731 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
4732 
4733   team->t.t_construct = 0;
4734 
4735   team->t.t_ordered.dt.t_value = 0;
4736   team->t.t_master_active = FALSE;
4737 
4738 #ifdef KMP_DEBUG
4739   team->t.t_copypriv_data = NULL; /* not necessary, but nice for debugging */
4740 #endif
4741 #if KMP_OS_WINDOWS
4742   team->t.t_copyin_counter = 0; /* for barrier-free copyin implementation */
4743 #endif
4744 
4745   team->t.t_control_stack_top = NULL;
4746 
4747   __kmp_reinitialize_team(team, new_icvs, loc);
4748 
4749   KMP_MB();
4750   KF_TRACE(10, ("__kmp_initialize_team: exit: team=%p\n", team));
4751 }
4752 
#if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED
/* Switch the calling thread to the full affinity mask
   (__kmp_affin_fullMask) without touching any runtime data structures.
   When old_mask is non-NULL the current system affinity is first read into
   it; a failure of that read is reported as a fatal error. Nothing happens
   if affinity is not capable. */
static void
__kmp_set_thread_affinity_mask_full_tmp(kmp_affin_mask_t *old_mask) {
  if (!KMP_AFFINITY_CAPABLE())
    return;

  if (old_mask != NULL) {
    // Capture errno right after the query, before anything can overwrite it.
    int status = __kmp_get_system_affinity(old_mask, TRUE);
    int error = errno;
    if (status != 0) {
      __kmp_fatal(KMP_MSG(ChangeThreadAffMaskError), KMP_ERR(error),
                  __kmp_msg_null);
    }
  }

  __kmp_set_system_affinity(__kmp_affin_fullMask, TRUE);
}
#endif
4771 
4772 #if KMP_AFFINITY_SUPPORTED
4773 
4774 // __kmp_partition_places() is the heart of the OpenMP 4.0 affinity mechanism.
4775 // It calculates the worker + primary thread's partition based upon the parent
4776 // thread's partition, and binds each worker to a thread in their partition.
4777 // The primary thread's partition should already include its current binding.
4778 static void __kmp_partition_places(kmp_team_t *team, int update_master_only) {
4779   // Do not partition places for the hidden helper team
4780   if (KMP_HIDDEN_HELPER_TEAM(team))
4781     return;
4782   // Copy the primary thread's place partition to the team struct
4783   kmp_info_t *master_th = team->t.t_threads[0];
4784   KMP_DEBUG_ASSERT(master_th != NULL);
4785   kmp_proc_bind_t proc_bind = team->t.t_proc_bind;
4786   int first_place = master_th->th.th_first_place;
4787   int last_place = master_th->th.th_last_place;
4788   int masters_place = master_th->th.th_current_place;
4789   int num_masks = __kmp_affinity.num_masks;
4790   team->t.t_first_place = first_place;
4791   team->t.t_last_place = last_place;
4792 
4793   KA_TRACE(20, ("__kmp_partition_places: enter: proc_bind = %d T#%d(%d:0) "
4794                 "bound to place %d partition = [%d,%d]\n",
4795                 proc_bind, __kmp_gtid_from_thread(team->t.t_threads[0]),
4796                 team->t.t_id, masters_place, first_place, last_place));
4797 
4798   switch (proc_bind) {
4799 
4800   case proc_bind_default:
4801     // Serial teams might have the proc_bind policy set to proc_bind_default.
4802     // Not an issue -- we don't rebind primary thread for any proc_bind policy.
4803     KMP_DEBUG_ASSERT(team->t.t_nproc == 1);
4804     break;
4805 
4806   case proc_bind_primary: {
4807     int f;
4808     int n_th = team->t.t_nproc;
4809     for (f = 1; f < n_th; f++) {
4810       kmp_info_t *th = team->t.t_threads[f];
4811       KMP_DEBUG_ASSERT(th != NULL);
4812       th->th.th_first_place = first_place;
4813       th->th.th_last_place = last_place;
4814       th->th.th_new_place = masters_place;
4815       if (__kmp_display_affinity && masters_place != th->th.th_current_place &&
4816           team->t.t_display_affinity != 1) {
4817         team->t.t_display_affinity = 1;
4818       }
4819 
4820       KA_TRACE(100, ("__kmp_partition_places: primary: T#%d(%d:%d) place %d "
4821                      "partition = [%d,%d]\n",
4822                      __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id,
4823                      f, masters_place, first_place, last_place));
4824     }
4825   } break;
4826 
4827   case proc_bind_close: {
4828     int f;
4829     int n_th = team->t.t_nproc;
4830     int n_places;
4831     if (first_place <= last_place) {
4832       n_places = last_place - first_place + 1;
4833     } else {
4834       n_places = num_masks - first_place + last_place + 1;
4835     }
4836     if (n_th <= n_places) {
4837       int place = masters_place;
4838       for (f = 1; f < n_th; f++) {
4839         kmp_info_t *th = team->t.t_threads[f];
4840         KMP_DEBUG_ASSERT(th != NULL);
4841 
4842         if (place == last_place) {
4843           place = first_place;
4844         } else if (place == (num_masks - 1)) {
4845           place = 0;
4846         } else {
4847           place++;
4848         }
4849         th->th.th_first_place = first_place;
4850         th->th.th_last_place = last_place;
4851         th->th.th_new_place = place;
4852         if (__kmp_display_affinity && place != th->th.th_current_place &&
4853             team->t.t_display_affinity != 1) {
4854           team->t.t_display_affinity = 1;
4855         }
4856 
4857         KA_TRACE(100, ("__kmp_partition_places: close: T#%d(%d:%d) place %d "
4858                        "partition = [%d,%d]\n",
4859                        __kmp_gtid_from_thread(team->t.t_threads[f]),
4860                        team->t.t_id, f, place, first_place, last_place));
4861       }
4862     } else {
4863       int S, rem, gap, s_count;
4864       S = n_th / n_places;
4865       s_count = 0;
4866       rem = n_th - (S * n_places);
4867       gap = rem > 0 ? n_places / rem : n_places;
4868       int place = masters_place;
4869       int gap_ct = gap;
4870       for (f = 0; f < n_th; f++) {
4871         kmp_info_t *th = team->t.t_threads[f];
4872         KMP_DEBUG_ASSERT(th != NULL);
4873 
4874         th->th.th_first_place = first_place;
4875         th->th.th_last_place = last_place;
4876         th->th.th_new_place = place;
4877         if (__kmp_display_affinity && place != th->th.th_current_place &&
4878             team->t.t_display_affinity != 1) {
4879           team->t.t_display_affinity = 1;
4880         }
4881         s_count++;
4882 
4883         if ((s_count == S) && rem && (gap_ct == gap)) {
4884           // do nothing, add an extra thread to place on next iteration
4885         } else if ((s_count == S + 1) && rem && (gap_ct == gap)) {
4886           // we added an extra thread to this place; move to next place
4887           if (place == last_place) {
4888             place = first_place;
4889           } else if (place == (num_masks - 1)) {
4890             place = 0;
4891           } else {
4892             place++;
4893           }
4894           s_count = 0;
4895           gap_ct = 1;
4896           rem--;
4897         } else if (s_count == S) { // place full; don't add extra
4898           if (place == last_place) {
4899             place = first_place;
4900           } else if (place == (num_masks - 1)) {
4901             place = 0;
4902           } else {
4903             place++;
4904           }
4905           gap_ct++;
4906           s_count = 0;
4907         }
4908 
4909         KA_TRACE(100,
4910                  ("__kmp_partition_places: close: T#%d(%d:%d) place %d "
4911                   "partition = [%d,%d]\n",
4912                   __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id, f,
4913                   th->th.th_new_place, first_place, last_place));
4914       }
4915       KMP_DEBUG_ASSERT(place == masters_place);
4916     }
4917   } break;
4918 
4919   case proc_bind_spread: {
4920     int f;
4921     int n_th = team->t.t_nproc;
4922     int n_places;
4923     int thidx;
4924     if (first_place <= last_place) {
4925       n_places = last_place - first_place + 1;
4926     } else {
4927       n_places = num_masks - first_place + last_place + 1;
4928     }
4929     if (n_th <= n_places) {
4930       int place = -1;
4931 
4932       if (n_places != num_masks) {
4933         int S = n_places / n_th;
4934         int s_count, rem, gap, gap_ct;
4935 
4936         place = masters_place;
4937         rem = n_places - n_th * S;
4938         gap = rem ? n_th / rem : 1;
4939         gap_ct = gap;
4940         thidx = n_th;
4941         if (update_master_only == 1)
4942           thidx = 1;
4943         for (f = 0; f < thidx; f++) {
4944           kmp_info_t *th = team->t.t_threads[f];
4945           KMP_DEBUG_ASSERT(th != NULL);
4946 
4947           th->th.th_first_place = place;
4948           th->th.th_new_place = place;
4949           if (__kmp_display_affinity && place != th->th.th_current_place &&
4950               team->t.t_display_affinity != 1) {
4951             team->t.t_display_affinity = 1;
4952           }
4953           s_count = 1;
4954           while (s_count < S) {
4955             if (place == last_place) {
4956               place = first_place;
4957             } else if (place == (num_masks - 1)) {
4958               place = 0;
4959             } else {
4960               place++;
4961             }
4962             s_count++;
4963           }
4964           if (rem && (gap_ct == gap)) {
4965             if (place == last_place) {
4966               place = first_place;
4967             } else if (place == (num_masks - 1)) {
4968               place = 0;
4969             } else {
4970               place++;
4971             }
4972             rem--;
4973             gap_ct = 0;
4974           }
4975           th->th.th_last_place = place;
4976           gap_ct++;
4977 
4978           if (place == last_place) {
4979             place = first_place;
4980           } else if (place == (num_masks - 1)) {
4981             place = 0;
4982           } else {
4983             place++;
4984           }
4985 
4986           KA_TRACE(100,
4987                    ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
4988                     "partition = [%d,%d], num_masks: %u\n",
4989                     __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id,
4990                     f, th->th.th_new_place, th->th.th_first_place,
4991                     th->th.th_last_place, num_masks));
4992         }
4993       } else {
4994         /* Having uniform space of available computation places I can create
4995            T partitions of round(P/T) size and put threads into the first
4996            place of each partition. */
4997         double current = static_cast<double>(masters_place);
4998         double spacing =
4999             (static_cast<double>(n_places + 1) / static_cast<double>(n_th));
5000         int first, last;
5001         kmp_info_t *th;
5002 
5003         thidx = n_th + 1;
5004         if (update_master_only == 1)
5005           thidx = 1;
5006         for (f = 0; f < thidx; f++) {
5007           first = static_cast<int>(current);
5008           last = static_cast<int>(current + spacing) - 1;
5009           KMP_DEBUG_ASSERT(last >= first);
5010           if (first >= n_places) {
5011             if (masters_place) {
5012               first -= n_places;
5013               last -= n_places;
5014               if (first == (masters_place + 1)) {
5015                 KMP_DEBUG_ASSERT(f == n_th);
5016                 first--;
5017               }
5018               if (last == masters_place) {
5019                 KMP_DEBUG_ASSERT(f == (n_th - 1));
5020                 last--;
5021               }
5022             } else {
5023               KMP_DEBUG_ASSERT(f == n_th);
5024               first = 0;
5025               last = 0;
5026             }
5027           }
5028           if (last >= n_places) {
5029             last = (n_places - 1);
5030           }
5031           place = first;
5032           current += spacing;
5033           if (f < n_th) {
5034             KMP_DEBUG_ASSERT(0 <= first);
5035             KMP_DEBUG_ASSERT(n_places > first);
5036             KMP_DEBUG_ASSERT(0 <= last);
5037             KMP_DEBUG_ASSERT(n_places > last);
5038             KMP_DEBUG_ASSERT(last_place >= first_place);
5039             th = team->t.t_threads[f];
5040             KMP_DEBUG_ASSERT(th);
5041             th->th.th_first_place = first;
5042             th->th.th_new_place = place;
5043             th->th.th_last_place = last;
5044             if (__kmp_display_affinity && place != th->th.th_current_place &&
5045                 team->t.t_display_affinity != 1) {
5046               team->t.t_display_affinity = 1;
5047             }
5048             KA_TRACE(100,
5049                      ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
5050                       "partition = [%d,%d], spacing = %.4f\n",
5051                       __kmp_gtid_from_thread(team->t.t_threads[f]),
5052                       team->t.t_id, f, th->th.th_new_place,
5053                       th->th.th_first_place, th->th.th_last_place, spacing));
5054           }
5055         }
5056       }
5057       KMP_DEBUG_ASSERT(update_master_only || place == masters_place);
5058     } else {
5059       int S, rem, gap, s_count;
5060       S = n_th / n_places;
5061       s_count = 0;
5062       rem = n_th - (S * n_places);
5063       gap = rem > 0 ? n_places / rem : n_places;
5064       int place = masters_place;
5065       int gap_ct = gap;
5066       thidx = n_th;
5067       if (update_master_only == 1)
5068         thidx = 1;
5069       for (f = 0; f < thidx; f++) {
5070         kmp_info_t *th = team->t.t_threads[f];
5071         KMP_DEBUG_ASSERT(th != NULL);
5072 
5073         th->th.th_first_place = place;
5074         th->th.th_last_place = place;
5075         th->th.th_new_place = place;
5076         if (__kmp_display_affinity && place != th->th.th_current_place &&
5077             team->t.t_display_affinity != 1) {
5078           team->t.t_display_affinity = 1;
5079         }
5080         s_count++;
5081 
5082         if ((s_count == S) && rem && (gap_ct == gap)) {
5083           // do nothing, add an extra thread to place on next iteration
5084         } else if ((s_count == S + 1) && rem && (gap_ct == gap)) {
5085           // we added an extra thread to this place; move on to next place
5086           if (place == last_place) {
5087             place = first_place;
5088           } else if (place == (num_masks - 1)) {
5089             place = 0;
5090           } else {
5091             place++;
5092           }
5093           s_count = 0;
5094           gap_ct = 1;
5095           rem--;
5096         } else if (s_count == S) { // place is full; don't add extra thread
5097           if (place == last_place) {
5098             place = first_place;
5099           } else if (place == (num_masks - 1)) {
5100             place = 0;
5101           } else {
5102             place++;
5103           }
5104           gap_ct++;
5105           s_count = 0;
5106         }
5107 
5108         KA_TRACE(100, ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
5109                        "partition = [%d,%d]\n",
5110                        __kmp_gtid_from_thread(team->t.t_threads[f]),
5111                        team->t.t_id, f, th->th.th_new_place,
5112                        th->th.th_first_place, th->th.th_last_place));
5113       }
5114       KMP_DEBUG_ASSERT(update_master_only || place == masters_place);
5115     }
5116   } break;
5117 
5118   default:
5119     break;
5120   }
5121 
5122   KA_TRACE(20, ("__kmp_partition_places: exit T#%d\n", team->t.t_id));
5123 }
5124 
5125 #endif // KMP_AFFINITY_SUPPORTED
5126 
5127 /* allocate a new team data structure to use.  take one off of the free pool if
5128    available */
5129 kmp_team_t *
5130 __kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc,
5131 #if OMPT_SUPPORT
5132                     ompt_data_t ompt_parallel_data,
5133 #endif
5134                     kmp_proc_bind_t new_proc_bind,
5135                     kmp_internal_control_t *new_icvs,
5136                     int argc USE_NESTED_HOT_ARG(kmp_info_t *master)) {
5137   KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_allocate_team);
5138   int f;
5139   kmp_team_t *team;
5140   int use_hot_team = !root->r.r_active;
5141   int level = 0;
5142   int do_place_partition = 1;
5143 
5144   KA_TRACE(20, ("__kmp_allocate_team: called\n"));
5145   KMP_DEBUG_ASSERT(new_nproc >= 1 && argc >= 0);
5146   KMP_DEBUG_ASSERT(max_nproc >= new_nproc);
5147   KMP_MB();
5148 
5149 #if KMP_NESTED_HOT_TEAMS
5150   kmp_hot_team_ptr_t *hot_teams;
5151   if (master) {
5152     team = master->th.th_team;
5153     level = team->t.t_active_level;
5154     if (master->th.th_teams_microtask) { // in teams construct?
5155       if (master->th.th_teams_size.nteams > 1 &&
5156           ( // #teams > 1
5157               team->t.t_pkfn ==
5158                   (microtask_t)__kmp_teams_master || // inner fork of the teams
5159               master->th.th_teams_level <
5160                   team->t.t_level)) { // or nested parallel inside the teams
5161         ++level; // not increment if #teams==1, or for outer fork of the teams;
5162         // increment otherwise
5163       }
5164       // Do not perform the place partition if inner fork of the teams
5165       // Wait until nested parallel region encountered inside teams construct
5166       if ((master->th.th_teams_size.nteams == 1 &&
5167            master->th.th_teams_level >= team->t.t_level) ||
5168           (team->t.t_pkfn == (microtask_t)__kmp_teams_master))
5169         do_place_partition = 0;
5170     }
5171     hot_teams = master->th.th_hot_teams;
5172     if (level < __kmp_hot_teams_max_level && hot_teams &&
5173         hot_teams[level].hot_team) {
5174       // hot team has already been allocated for given level
5175       use_hot_team = 1;
5176     } else {
5177       use_hot_team = 0;
5178     }
5179   } else {
5180     // check we won't access uninitialized hot_teams, just in case
5181     KMP_DEBUG_ASSERT(new_nproc == 1);
5182   }
5183 #endif
5184   // Optimization to use a "hot" team
5185   if (use_hot_team && new_nproc > 1) {
5186     KMP_DEBUG_ASSERT(new_nproc <= max_nproc);
5187 #if KMP_NESTED_HOT_TEAMS
5188     team = hot_teams[level].hot_team;
5189 #else
5190     team = root->r.r_hot_team;
5191 #endif
5192 #if KMP_DEBUG
5193     if (__kmp_tasking_mode != tskm_immediate_exec) {
5194       KA_TRACE(20, ("__kmp_allocate_team: hot team task_team[0] = %p "
5195                     "task_team[1] = %p before reinit\n",
5196                     team->t.t_task_team[0], team->t.t_task_team[1]));
5197     }
5198 #endif
5199 
5200     if (team->t.t_nproc != new_nproc &&
5201         __kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5202       // Distributed barrier may need a resize
5203       int old_nthr = team->t.t_nproc;
5204       __kmp_resize_dist_barrier(team, old_nthr, new_nproc);
5205     }
5206 
5207     // If not doing the place partition, then reset the team's proc bind
5208     // to indicate that partitioning of all threads still needs to take place
5209     if (do_place_partition == 0)
5210       team->t.t_proc_bind = proc_bind_default;
5211     // Has the number of threads changed?
5212     /* Let's assume the most common case is that the number of threads is
5213        unchanged, and put that case first. */
5214     if (team->t.t_nproc == new_nproc) { // Check changes in number of threads
5215       KA_TRACE(20, ("__kmp_allocate_team: reusing hot team\n"));
5216       // This case can mean that omp_set_num_threads() was called and the hot
5217       // team size was already reduced, so we check the special flag
5218       if (team->t.t_size_changed == -1) {
5219         team->t.t_size_changed = 1;
5220       } else {
5221         KMP_CHECK_UPDATE(team->t.t_size_changed, 0);
5222       }
5223 
5224       // TODO???: team->t.t_max_active_levels = new_max_active_levels;
5225       kmp_r_sched_t new_sched = new_icvs->sched;
5226       // set primary thread's schedule as new run-time schedule
5227       KMP_CHECK_UPDATE(team->t.t_sched.sched, new_sched.sched);
5228 
5229       __kmp_reinitialize_team(team, new_icvs,
5230                               root->r.r_uber_thread->th.th_ident);
5231 
5232       KF_TRACE(10, ("__kmp_allocate_team2: T#%d, this_thread=%p team=%p\n", 0,
5233                     team->t.t_threads[0], team));
5234       __kmp_push_current_task_to_thread(team->t.t_threads[0], team, 0);
5235 
5236 #if KMP_AFFINITY_SUPPORTED
5237       if ((team->t.t_size_changed == 0) &&
5238           (team->t.t_proc_bind == new_proc_bind)) {
5239         if (new_proc_bind == proc_bind_spread) {
5240           if (do_place_partition) {
5241             // add flag to update only master for spread
5242             __kmp_partition_places(team, 1);
5243           }
5244         }
5245         KA_TRACE(200, ("__kmp_allocate_team: reusing hot team #%d bindings: "
5246                        "proc_bind = %d, partition = [%d,%d]\n",
5247                        team->t.t_id, new_proc_bind, team->t.t_first_place,
5248                        team->t.t_last_place));
5249       } else {
5250         if (do_place_partition) {
5251           KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5252           __kmp_partition_places(team);
5253         }
5254       }
5255 #else
5256       KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5257 #endif /* KMP_AFFINITY_SUPPORTED */
5258     } else if (team->t.t_nproc > new_nproc) {
5259       KA_TRACE(20,
5260                ("__kmp_allocate_team: decreasing hot team thread count to %d\n",
5261                 new_nproc));
5262 
5263       team->t.t_size_changed = 1;
5264       if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5265         // Barrier size already reduced earlier in this function
5266         // Activate team threads via th_used_in_team
5267         __kmp_add_threads_to_team(team, new_nproc);
5268       }
5269 #if KMP_NESTED_HOT_TEAMS
5270       if (__kmp_hot_teams_mode == 0) {
5271         // AC: saved number of threads should correspond to team's value in this
5272         // mode, can be bigger in mode 1, when hot team has threads in reserve
5273         KMP_DEBUG_ASSERT(hot_teams[level].hot_team_nth == team->t.t_nproc);
5274         hot_teams[level].hot_team_nth = new_nproc;
5275 #endif // KMP_NESTED_HOT_TEAMS
5276         /* release the extra threads we don't need any more */
5277         for (f = new_nproc; f < team->t.t_nproc; f++) {
5278           KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5279           if (__kmp_tasking_mode != tskm_immediate_exec) {
5280             // When decreasing team size, threads no longer in the team should
5281             // unref task team.
5282             team->t.t_threads[f]->th.th_task_team = NULL;
5283           }
5284           __kmp_free_thread(team->t.t_threads[f]);
5285           team->t.t_threads[f] = NULL;
5286         }
5287 #if KMP_NESTED_HOT_TEAMS
5288       } // (__kmp_hot_teams_mode == 0)
5289       else {
5290         // When keeping extra threads in team, switch threads to wait on own
5291         // b_go flag
5292         for (f = new_nproc; f < team->t.t_nproc; ++f) {
5293           KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5294           kmp_balign_t *balign = team->t.t_threads[f]->th.th_bar;
5295           for (int b = 0; b < bs_last_barrier; ++b) {
5296             if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG) {
5297               balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG;
5298             }
5299             KMP_CHECK_UPDATE(balign[b].bb.leaf_kids, 0);
5300           }
5301         }
5302       }
5303 #endif // KMP_NESTED_HOT_TEAMS
5304       team->t.t_nproc = new_nproc;
5305       // TODO???: team->t.t_max_active_levels = new_max_active_levels;
5306       KMP_CHECK_UPDATE(team->t.t_sched.sched, new_icvs->sched.sched);
5307       __kmp_reinitialize_team(team, new_icvs,
5308                               root->r.r_uber_thread->th.th_ident);
5309 
5310       // Update remaining threads
5311       for (f = 0; f < new_nproc; ++f) {
5312         team->t.t_threads[f]->th.th_team_nproc = new_nproc;
5313       }
5314 
5315       // restore the current task state of the primary thread: should be the
5316       // implicit task
5317       KF_TRACE(10, ("__kmp_allocate_team: T#%d, this_thread=%p team=%p\n", 0,
5318                     team->t.t_threads[0], team));
5319 
5320       __kmp_push_current_task_to_thread(team->t.t_threads[0], team, 0);
5321 
5322 #ifdef KMP_DEBUG
5323       for (f = 0; f < team->t.t_nproc; f++) {
5324         KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
5325                          team->t.t_threads[f]->th.th_team_nproc ==
5326                              team->t.t_nproc);
5327       }
5328 #endif
5329 
5330       if (do_place_partition) {
5331         KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5332 #if KMP_AFFINITY_SUPPORTED
5333         __kmp_partition_places(team);
5334 #endif
5335       }
5336     } else { // team->t.t_nproc < new_nproc
5337 #if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED
5338       kmp_affin_mask_t *old_mask;
5339       if (KMP_AFFINITY_CAPABLE()) {
5340         KMP_CPU_ALLOC(old_mask);
5341       }
5342 #endif
5343 
5344       KA_TRACE(20,
5345                ("__kmp_allocate_team: increasing hot team thread count to %d\n",
5346                 new_nproc));
5347       int old_nproc = team->t.t_nproc; // save old value and use to update only
5348       team->t.t_size_changed = 1;
5349 
5350 #if KMP_NESTED_HOT_TEAMS
5351       int avail_threads = hot_teams[level].hot_team_nth;
5352       if (new_nproc < avail_threads)
5353         avail_threads = new_nproc;
5354       kmp_info_t **other_threads = team->t.t_threads;
5355       for (f = team->t.t_nproc; f < avail_threads; ++f) {
5356         // Adjust barrier data of reserved threads (if any) of the team
5357         // Other data will be set in __kmp_initialize_info() below.
5358         int b;
5359         kmp_balign_t *balign = other_threads[f]->th.th_bar;
5360         for (b = 0; b < bs_last_barrier; ++b) {
5361           balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5362           KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
5363 #if USE_DEBUGGER
5364           balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5365 #endif
5366         }
5367       }
5368       if (hot_teams[level].hot_team_nth >= new_nproc) {
5369         // we have all needed threads in reserve, no need to allocate any
5370         // this only possible in mode 1, cannot have reserved threads in mode 0
5371         KMP_DEBUG_ASSERT(__kmp_hot_teams_mode == 1);
5372         team->t.t_nproc = new_nproc; // just get reserved threads involved
5373       } else {
5374         // We may have some threads in reserve, but not enough;
5375         // get reserved threads involved if any.
5376         team->t.t_nproc = hot_teams[level].hot_team_nth;
5377         hot_teams[level].hot_team_nth = new_nproc; // adjust hot team max size
5378 #endif // KMP_NESTED_HOT_TEAMS
5379         if (team->t.t_max_nproc < new_nproc) {
5380           /* reallocate larger arrays */
5381           __kmp_reallocate_team_arrays(team, new_nproc);
5382           __kmp_reinitialize_team(team, new_icvs, NULL);
5383         }
5384 
5385 #if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED
5386         /* Temporarily set full mask for primary thread before creation of
5387            workers. The reason is that workers inherit the affinity from the
5388            primary thread, so if a lot of workers are created on the single
5389            core quickly, they don't get a chance to set their own affinity for
5390            a long time. */
5391         __kmp_set_thread_affinity_mask_full_tmp(old_mask);
5392 #endif
5393 
5394         /* allocate new threads for the hot team */
5395         for (f = team->t.t_nproc; f < new_nproc; f++) {
5396           kmp_info_t *new_worker = __kmp_allocate_thread(root, team, f);
5397           KMP_DEBUG_ASSERT(new_worker);
5398           team->t.t_threads[f] = new_worker;
5399 
5400           KA_TRACE(20,
5401                    ("__kmp_allocate_team: team %d init T#%d arrived: "
5402                     "join=%llu, plain=%llu\n",
5403                     team->t.t_id, __kmp_gtid_from_tid(f, team), team->t.t_id, f,
5404                     team->t.t_bar[bs_forkjoin_barrier].b_arrived,
5405                     team->t.t_bar[bs_plain_barrier].b_arrived));
5406 
5407           { // Initialize barrier data for new threads.
5408             int b;
5409             kmp_balign_t *balign = new_worker->th.th_bar;
5410             for (b = 0; b < bs_last_barrier; ++b) {
5411               balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5412               KMP_DEBUG_ASSERT(balign[b].bb.wait_flag !=
5413                                KMP_BARRIER_PARENT_FLAG);
5414 #if USE_DEBUGGER
5415               balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5416 #endif
5417             }
5418           }
5419         }
5420 
5421 #if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED
5422         if (KMP_AFFINITY_CAPABLE()) {
5423           /* Restore initial primary thread's affinity mask */
5424           __kmp_set_system_affinity(old_mask, TRUE);
5425           KMP_CPU_FREE(old_mask);
5426         }
5427 #endif
5428 #if KMP_NESTED_HOT_TEAMS
5429       } // end of check of t_nproc vs. new_nproc vs. hot_team_nth
5430 #endif // KMP_NESTED_HOT_TEAMS
5431       if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5432         // Barrier size already increased earlier in this function
5433         // Activate team threads via th_used_in_team
5434         __kmp_add_threads_to_team(team, new_nproc);
5435       }
5436       /* make sure everyone is synchronized */
5437       // new threads below
5438       __kmp_initialize_team(team, new_nproc, new_icvs,
5439                             root->r.r_uber_thread->th.th_ident);
5440 
5441       /* reinitialize the threads */
5442       KMP_DEBUG_ASSERT(team->t.t_nproc == new_nproc);
5443       for (f = 0; f < team->t.t_nproc; ++f)
5444         __kmp_initialize_info(team->t.t_threads[f], team, f,
5445                               __kmp_gtid_from_tid(f, team));
5446 
5447       if (level) { // set th_task_state for new threads in nested hot team
5448         // __kmp_initialize_info() no longer zeroes th_task_state, so we should
5449         // only need to set the th_task_state for the new threads. th_task_state
5450         // for primary thread will not be accurate until after this in
5451         // __kmp_fork_call(), so we look to the primary thread's memo_stack to
5452         // get the correct value.
5453         for (f = old_nproc; f < team->t.t_nproc; ++f)
5454           team->t.t_threads[f]->th.th_task_state =
5455               team->t.t_threads[0]->th.th_task_state_memo_stack[level];
5456       } else { // set th_task_state for new threads in non-nested hot team
5457         // copy primary thread's state
5458         kmp_uint8 old_state = team->t.t_threads[0]->th.th_task_state;
5459         for (f = old_nproc; f < team->t.t_nproc; ++f)
5460           team->t.t_threads[f]->th.th_task_state = old_state;
5461       }
5462 
5463 #ifdef KMP_DEBUG
5464       for (f = 0; f < team->t.t_nproc; ++f) {
5465         KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
5466                          team->t.t_threads[f]->th.th_team_nproc ==
5467                              team->t.t_nproc);
5468       }
5469 #endif
5470 
5471       if (do_place_partition) {
5472         KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5473 #if KMP_AFFINITY_SUPPORTED
5474         __kmp_partition_places(team);
5475 #endif
5476       }
5477     } // Check changes in number of threads
5478 
5479     kmp_info_t *master = team->t.t_threads[0];
5480     if (master->th.th_teams_microtask) {
5481       for (f = 1; f < new_nproc; ++f) {
5482         // propagate teams construct specific info to workers
5483         kmp_info_t *thr = team->t.t_threads[f];
5484         thr->th.th_teams_microtask = master->th.th_teams_microtask;
5485         thr->th.th_teams_level = master->th.th_teams_level;
5486         thr->th.th_teams_size = master->th.th_teams_size;
5487       }
5488     }
5489 #if KMP_NESTED_HOT_TEAMS
5490     if (level) {
5491       // Sync barrier state for nested hot teams, not needed for outermost hot
5492       // team.
5493       for (f = 1; f < new_nproc; ++f) {
5494         kmp_info_t *thr = team->t.t_threads[f];
5495         int b;
5496         kmp_balign_t *balign = thr->th.th_bar;
5497         for (b = 0; b < bs_last_barrier; ++b) {
5498           balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5499           KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
5500 #if USE_DEBUGGER
5501           balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5502 #endif
5503         }
5504       }
5505     }
5506 #endif // KMP_NESTED_HOT_TEAMS
5507 
5508     /* reallocate space for arguments if necessary */
5509     __kmp_alloc_argv_entries(argc, team, TRUE);
5510     KMP_CHECK_UPDATE(team->t.t_argc, argc);
5511     // The hot team re-uses the previous task team,
5512     // if untouched during the previous release->gather phase.
5513 
5514     KF_TRACE(10, (" hot_team = %p\n", team));
5515 
5516 #if KMP_DEBUG
5517     if (__kmp_tasking_mode != tskm_immediate_exec) {
5518       KA_TRACE(20, ("__kmp_allocate_team: hot team task_team[0] = %p "
5519                     "task_team[1] = %p after reinit\n",
5520                     team->t.t_task_team[0], team->t.t_task_team[1]));
5521     }
5522 #endif
5523 
5524 #if OMPT_SUPPORT
5525     __ompt_team_assign_id(team, ompt_parallel_data);
5526 #endif
5527 
5528     KMP_MB();
5529 
5530     return team;
5531   }
5532 
5533   /* next, let's try to take one from the team pool */
5534   KMP_MB();
5535   for (team = CCAST(kmp_team_t *, __kmp_team_pool); (team);) {
5536     /* TODO: consider resizing undersized teams instead of reaping them, now
5537        that we have a resizing mechanism */
5538     if (team->t.t_max_nproc >= max_nproc) {
5539       /* take this team from the team pool */
5540       __kmp_team_pool = team->t.t_next_pool;
5541 
5542       if (max_nproc > 1 &&
5543           __kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5544         if (!team->t.b) { // Allocate barrier structure
5545           team->t.b = distributedBarrier::allocate(__kmp_dflt_team_nth_ub);
5546         }
5547       }
5548 
5549       /* setup the team for fresh use */
5550       __kmp_initialize_team(team, new_nproc, new_icvs, NULL);
5551 
5552       KA_TRACE(20, ("__kmp_allocate_team: setting task_team[0] %p and "
5553                     "task_team[1] %p to NULL\n",
5554                     &team->t.t_task_team[0], &team->t.t_task_team[1]));
5555       team->t.t_task_team[0] = NULL;
5556       team->t.t_task_team[1] = NULL;
5557 
5558       /* reallocate space for arguments if necessary */
5559       __kmp_alloc_argv_entries(argc, team, TRUE);
5560       KMP_CHECK_UPDATE(team->t.t_argc, argc);
5561 
5562       KA_TRACE(
5563           20, ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n",
5564                team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
5565       { // Initialize barrier data.
5566         int b;
5567         for (b = 0; b < bs_last_barrier; ++b) {
5568           team->t.t_bar[b].b_arrived = KMP_INIT_BARRIER_STATE;
5569 #if USE_DEBUGGER
5570           team->t.t_bar[b].b_master_arrived = 0;
5571           team->t.t_bar[b].b_team_arrived = 0;
5572 #endif
5573         }
5574       }
5575 
5576       team->t.t_proc_bind = new_proc_bind;
5577 
5578       KA_TRACE(20, ("__kmp_allocate_team: using team from pool %d.\n",
5579                     team->t.t_id));
5580 
5581 #if OMPT_SUPPORT
5582       __ompt_team_assign_id(team, ompt_parallel_data);
5583 #endif
5584 
5585       KMP_MB();
5586 
5587       return team;
5588     }
5589 
5590     /* reap team if it is too small, then loop back and check the next one */
5591     // not sure if this is wise, but, will be redone during the hot-teams
5592     // rewrite.
5593     /* TODO: Use technique to find the right size hot-team, don't reap them */
5594     team = __kmp_reap_team(team);
5595     __kmp_team_pool = team;
5596   }
5597 
5598   /* nothing available in the pool, no matter, make a new team! */
5599   KMP_MB();
5600   team = (kmp_team_t *)__kmp_allocate(sizeof(kmp_team_t));
5601 
5602   /* and set it up */
5603   team->t.t_max_nproc = max_nproc;
5604   if (max_nproc > 1 &&
5605       __kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5606     // Allocate barrier structure
5607     team->t.b = distributedBarrier::allocate(__kmp_dflt_team_nth_ub);
5608   }
5609 
5610   /* NOTE well, for some reason allocating one big buffer and dividing it up
5611      seems to really hurt performance a lot on the P4, so, let's not use this */
5612   __kmp_allocate_team_arrays(team, max_nproc);
5613 
5614   KA_TRACE(20, ("__kmp_allocate_team: making a new team\n"));
5615   __kmp_initialize_team(team, new_nproc, new_icvs, NULL);
5616 
5617   KA_TRACE(20, ("__kmp_allocate_team: setting task_team[0] %p and task_team[1] "
5618                 "%p to NULL\n",
5619                 &team->t.t_task_team[0], &team->t.t_task_team[1]));
5620   team->t.t_task_team[0] = NULL; // to be removed, as __kmp_allocate zeroes
5621   // memory, no need to duplicate
5622   team->t.t_task_team[1] = NULL; // to be removed, as __kmp_allocate zeroes
5623   // memory, no need to duplicate
5624 
5625   if (__kmp_storage_map) {
5626     __kmp_print_team_storage_map("team", team, team->t.t_id, new_nproc);
5627   }
5628 
5629   /* allocate space for arguments */
5630   __kmp_alloc_argv_entries(argc, team, FALSE);
5631   team->t.t_argc = argc;
5632 
5633   KA_TRACE(20,
5634            ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n",
5635             team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
5636   { // Initialize barrier data.
5637     int b;
5638     for (b = 0; b < bs_last_barrier; ++b) {
5639       team->t.t_bar[b].b_arrived = KMP_INIT_BARRIER_STATE;
5640 #if USE_DEBUGGER
5641       team->t.t_bar[b].b_master_arrived = 0;
5642       team->t.t_bar[b].b_team_arrived = 0;
5643 #endif
5644     }
5645   }
5646 
5647   team->t.t_proc_bind = new_proc_bind;
5648 
5649 #if OMPT_SUPPORT
5650   __ompt_team_assign_id(team, ompt_parallel_data);
5651   team->t.ompt_serialized_team_info = NULL;
5652 #endif
5653 
5654   KMP_MB();
5655 
5656   KA_TRACE(20, ("__kmp_allocate_team: done creating a new team %d.\n",
5657                 team->t.t_id));
5658 
5659   return team;
5660 }
5661 
5662 /* TODO implement hot-teams at all levels */
5663 /* TODO implement lazy thread release on demand (disband request) */
5664 
5665 /* free the team.  return it to the team pool.  release all the threads
5666  * associated with it */
5667 void __kmp_free_team(kmp_root_t *root,
5668                      kmp_team_t *team USE_NESTED_HOT_ARG(kmp_info_t *master)) {
5669   int f;
5670   KA_TRACE(20, ("__kmp_free_team: T#%d freeing team %d\n", __kmp_get_gtid(),
5671                 team->t.t_id));
5672 
5673   /* verify state */
5674   KMP_DEBUG_ASSERT(root);
5675   KMP_DEBUG_ASSERT(team);
5676   KMP_DEBUG_ASSERT(team->t.t_nproc <= team->t.t_max_nproc);
5677   KMP_DEBUG_ASSERT(team->t.t_threads);
5678 
5679   int use_hot_team = team == root->r.r_hot_team;
5680 #if KMP_NESTED_HOT_TEAMS
5681   int level;
5682   if (master) {
5683     level = team->t.t_active_level - 1;
5684     if (master->th.th_teams_microtask) { // in teams construct?
5685       if (master->th.th_teams_size.nteams > 1) {
5686         ++level; // level was not increased in teams construct for
5687         // team_of_masters
5688       }
5689       if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
5690           master->th.th_teams_level == team->t.t_level) {
5691         ++level; // level was not increased in teams construct for
5692         // team_of_workers before the parallel
5693       } // team->t.t_level will be increased inside parallel
5694     }
5695 #if KMP_DEBUG
5696     kmp_hot_team_ptr_t *hot_teams = master->th.th_hot_teams;
5697 #endif
5698     if (level < __kmp_hot_teams_max_level) {
5699       KMP_DEBUG_ASSERT(team == hot_teams[level].hot_team);
5700       use_hot_team = 1;
5701     }
5702   }
5703 #endif // KMP_NESTED_HOT_TEAMS
5704 
5705   /* team is done working */
5706   TCW_SYNC_PTR(team->t.t_pkfn,
5707                NULL); // Important for Debugging Support Library.
5708 #if KMP_OS_WINDOWS
5709   team->t.t_copyin_counter = 0; // init counter for possible reuse
5710 #endif
5711   // Do not reset pointer to parent team to NULL for hot teams.
5712 
5713   /* if we are non-hot team, release our threads */
5714   if (!use_hot_team) {
5715     if (__kmp_tasking_mode != tskm_immediate_exec) {
5716       // Wait for threads to reach reapable state
5717       for (f = 1; f < team->t.t_nproc; ++f) {
5718         KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5719         kmp_info_t *th = team->t.t_threads[f];
5720         volatile kmp_uint32 *state = &th->th.th_reap_state;
5721         while (*state != KMP_SAFE_TO_REAP) {
5722 #if KMP_OS_WINDOWS
5723           // On Windows a thread can be killed at any time, check this
5724           DWORD ecode;
5725           if (!__kmp_is_thread_alive(th, &ecode)) {
5726             *state = KMP_SAFE_TO_REAP; // reset the flag for dead thread
5727             break;
5728           }
5729 #endif
5730           // first check if thread is sleeping
5731           kmp_flag_64<> fl(&th->th.th_bar[bs_forkjoin_barrier].bb.b_go, th);
5732           if (fl.is_sleeping())
5733             fl.resume(__kmp_gtid_from_thread(th));
5734           KMP_CPU_PAUSE();
5735         }
5736       }
5737 
5738       // Delete task teams
5739       int tt_idx;
5740       for (tt_idx = 0; tt_idx < 2; ++tt_idx) {
5741         kmp_task_team_t *task_team = team->t.t_task_team[tt_idx];
5742         if (task_team != NULL) {
5743           for (f = 0; f < team->t.t_nproc; ++f) { // threads unref task teams
5744             KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5745             team->t.t_threads[f]->th.th_task_team = NULL;
5746           }
5747           KA_TRACE(
5748               20,
5749               ("__kmp_free_team: T#%d deactivating task_team %p on team %d\n",
5750                __kmp_get_gtid(), task_team, team->t.t_id));
5751 #if KMP_NESTED_HOT_TEAMS
5752           __kmp_free_task_team(master, task_team);
5753 #endif
5754           team->t.t_task_team[tt_idx] = NULL;
5755         }
5756       }
5757     }
5758 
5759     // Reset pointer to parent team only for non-hot teams.
5760     team->t.t_parent = NULL;
5761     team->t.t_level = 0;
5762     team->t.t_active_level = 0;
5763 
5764     /* free the worker threads */
5765     for (f = 1; f < team->t.t_nproc; ++f) {
5766       KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5767       if (__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5768         KMP_COMPARE_AND_STORE_ACQ32(&(team->t.t_threads[f]->th.th_used_in_team),
5769                                     1, 2);
5770       }
5771       __kmp_free_thread(team->t.t_threads[f]);
5772     }
5773 
5774     if (__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5775       if (team->t.b) {
5776         // wake up thread at old location
5777         team->t.b->go_release();
5778         if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
5779           for (f = 1; f < team->t.t_nproc; ++f) {
5780             if (team->t.b->sleep[f].sleep) {
5781               __kmp_atomic_resume_64(
5782                   team->t.t_threads[f]->th.th_info.ds.ds_gtid,
5783                   (kmp_atomic_flag_64<> *)NULL);
5784             }
5785           }
5786         }
5787         // Wait for threads to be removed from team
5788         for (int f = 1; f < team->t.t_nproc; ++f) {
5789           while (team->t.t_threads[f]->th.th_used_in_team.load() != 0)
5790             KMP_CPU_PAUSE();
5791         }
5792       }
5793     }
5794 
5795     for (f = 1; f < team->t.t_nproc; ++f) {
5796       team->t.t_threads[f] = NULL;
5797     }
5798 
5799     if (team->t.t_max_nproc > 1 &&
5800         __kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5801       distributedBarrier::deallocate(team->t.b);
5802       team->t.b = NULL;
5803     }
5804     /* put the team back in the team pool */
5805     /* TODO limit size of team pool, call reap_team if pool too large */
5806     team->t.t_next_pool = CCAST(kmp_team_t *, __kmp_team_pool);
5807     __kmp_team_pool = (volatile kmp_team_t *)team;
5808   } else { // Check if team was created for primary threads in teams construct
5809     // See if first worker is a CG root
5810     KMP_DEBUG_ASSERT(team->t.t_threads[1] &&
5811                      team->t.t_threads[1]->th.th_cg_roots);
5812     if (team->t.t_threads[1]->th.th_cg_roots->cg_root == team->t.t_threads[1]) {
5813       // Clean up the CG root nodes on workers so that this team can be re-used
5814       for (f = 1; f < team->t.t_nproc; ++f) {
5815         kmp_info_t *thr = team->t.t_threads[f];
5816         KMP_DEBUG_ASSERT(thr && thr->th.th_cg_roots &&
5817                          thr->th.th_cg_roots->cg_root == thr);
5818         // Pop current CG root off list
5819         kmp_cg_root_t *tmp = thr->th.th_cg_roots;
5820         thr->th.th_cg_roots = tmp->up;
5821         KA_TRACE(100, ("__kmp_free_team: Thread %p popping node %p and moving"
5822                        " up to node %p. cg_nthreads was %d\n",
5823                        thr, tmp, thr->th.th_cg_roots, tmp->cg_nthreads));
5824         int i = tmp->cg_nthreads--;
5825         if (i == 1) {
5826           __kmp_free(tmp); // free CG if we are the last thread in it
5827         }
5828         // Restore current task's thread_limit from CG root
5829         if (thr->th.th_cg_roots)
5830           thr->th.th_current_task->td_icvs.thread_limit =
5831               thr->th.th_cg_roots->cg_thread_limit;
5832       }
5833     }
5834   }
5835 
5836   KMP_MB();
5837 }
5838 
5839 /* reap the team.  destroy it, reclaim all its resources and free its memory */
     //
     // team - team to destroy; must be non-NULL with its dispatch, dispatch
     //        buffer, thread and argv arrays allocated (checked by the debug
     //        assertions below).
     // Returns the team's t_next_pool link, captured before the team memory is
     // released, so a caller walking the pool can continue with the next entry.
5840 kmp_team_t *__kmp_reap_team(kmp_team_t *team) {
5841   KMP_DEBUG_ASSERT(team);
5842   KMP_DEBUG_ASSERT(team->t.t_dispatch);
5843   KMP_DEBUG_ASSERT(team->t.t_disp_buffer);
5844   KMP_DEBUG_ASSERT(team->t.t_threads);
5845   KMP_DEBUG_ASSERT(team->t.t_argv);
5846 
5847   // Read the link only after team has been validated, but before it is freed.
5848   kmp_team_t *next_pool = team->t.t_next_pool;
5849 
5850   /* TODO clean the threads that are a part of this? */
5851   /* free stuff */
5852   __kmp_free_team_arrays(team);
       // t_argv may point at the inline buffer embedded in the team itself; only
       // a separately allocated argv is freed on its own.
5853   if (team->t.t_argv != &team->t.t_inline_argv[0])
5854     __kmp_free((void *)team->t.t_argv);
5855   __kmp_free(team);
5856 
5857   KMP_MB();
5858   return next_pool;
5859 }
5860 
5861 // Free the thread.  Don't reap it, just place it on the pool of available
5862 // threads.
5863 //
5864 // Changes for Quad issue 527845: We need a predictable OMP tid <-> gtid
5865 // binding for the affinity mechanism to be useful.
5866 //
5867 // Now, we always keep the free list (__kmp_thread_pool) sorted by gtid.
5868 // However, we want to avoid a potential performance problem by always
5869 // scanning through the list to find the correct point at which to insert
5870 // the thread (potential N**2 behavior).  To do this we keep track of the
5871 // last place a thread struct was inserted (__kmp_thread_pool_insert_pt).
5872 // With single-level parallelism, threads will always be added to the tail
5873 // of the list, kept track of by __kmp_thread_pool_insert_pt.  With nested
5874 // parallelism, all bets are off and we may need to scan through the entire
5875 // free list.
5876 //
5877 // This change also has a potentially large performance benefit, for some
5878 // applications.  Previously, as threads were freed from the hot team, they
5879 // would be placed back on the free list in inverse order.  If the hot team
5880 // grew back to it's original size, then the freed thread would be placed
5881 // back on the hot team in reverse order.  This could cause bad cache
5882 // locality problems on programs where the size of the hot team regularly
5883 // grew and shrunk.
5884 //
5885 // Now, for single-level parallelism, the OMP tid is always == gtid.
5886 void __kmp_free_thread(kmp_info_t *this_th) {
5887   int gtid;
5888   kmp_info_t **scan;
5889 
5890   KA_TRACE(20, ("__kmp_free_thread: T#%d putting T#%d back on free pool.\n",
5891                 __kmp_get_gtid(), this_th->th.th_info.ds.ds_gtid));
5892 
5893   KMP_DEBUG_ASSERT(this_th);
5894 
5895   // When moving thread to pool, switch thread to wait on own b_go flag, and
5896   // uninitialized (NULL team).
5897   int b;
5898   kmp_balign_t *balign = this_th->th.th_bar;
5899   for (b = 0; b < bs_last_barrier; ++b) {
5900     if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG)
5901       balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG;
5902     balign[b].bb.team = NULL;
5903     balign[b].bb.leaf_kids = 0;
5904   }
5905   this_th->th.th_task_state = 0;
5906   this_th->th.th_reap_state = KMP_SAFE_TO_REAP;
5907 
5908   /* put thread back on the free pool */
5909   TCW_PTR(this_th->th.th_team, NULL);
5910   TCW_PTR(this_th->th.th_root, NULL);
5911   TCW_PTR(this_th->th.th_dispatch, NULL); /* NOT NEEDED */
5912 
5913   while (this_th->th.th_cg_roots) {
5914     this_th->th.th_cg_roots->cg_nthreads--;
5915     KA_TRACE(100, ("__kmp_free_thread: Thread %p decrement cg_nthreads on node"
5916                    " %p of thread  %p to %d\n",
5917                    this_th, this_th->th.th_cg_roots,
5918                    this_th->th.th_cg_roots->cg_root,
5919                    this_th->th.th_cg_roots->cg_nthreads));
5920     kmp_cg_root_t *tmp = this_th->th.th_cg_roots;
5921     if (tmp->cg_root == this_th) { // Thread is a cg_root
5922       KMP_DEBUG_ASSERT(tmp->cg_nthreads == 0);
5923       KA_TRACE(
5924           5, ("__kmp_free_thread: Thread %p freeing node %p\n", this_th, tmp));
5925       this_th->th.th_cg_roots = tmp->up;
5926       __kmp_free(tmp);
5927     } else { // Worker thread
5928       if (tmp->cg_nthreads == 0) { // last thread leaves contention group
5929         __kmp_free(tmp);
5930       }
5931       this_th->th.th_cg_roots = NULL;
5932       break;
5933     }
5934   }
5935 
5936   /* If the implicit task assigned to this thread can be used by other threads
5937    * -> multiple threads can share the data and try to free the task at
5938    * __kmp_reap_thread at exit. This duplicate use of the task data can happen
5939    * with higher probability when hot team is disabled but can occurs even when
5940    * the hot team is enabled */
5941   __kmp_free_implicit_task(this_th);
5942   this_th->th.th_current_task = NULL;
5943 
5944   // If the __kmp_thread_pool_insert_pt is already past the new insert
5945   // point, then we need to re-scan the entire list.
5946   gtid = this_th->th.th_info.ds.ds_gtid;
5947   if (__kmp_thread_pool_insert_pt != NULL) {
5948     KMP_DEBUG_ASSERT(__kmp_thread_pool != NULL);
5949     if (__kmp_thread_pool_insert_pt->th.th_info.ds.ds_gtid > gtid) {
5950       __kmp_thread_pool_insert_pt = NULL;
5951     }
5952   }
5953 
5954   // Scan down the list to find the place to insert the thread.
5955   // scan is the address of a link in the list, possibly the address of
5956   // __kmp_thread_pool itself.
5957   //
5958   // In the absence of nested parallelism, the for loop will have 0 iterations.
5959   if (__kmp_thread_pool_insert_pt != NULL) {
5960     scan = &(__kmp_thread_pool_insert_pt->th.th_next_pool);
5961   } else {
5962     scan = CCAST(kmp_info_t **, &__kmp_thread_pool);
5963   }
5964   for (; (*scan != NULL) && ((*scan)->th.th_info.ds.ds_gtid < gtid);
5965        scan = &((*scan)->th.th_next_pool))
5966     ;
5967 
5968   // Insert the new element on the list, and set __kmp_thread_pool_insert_pt
5969   // to its address.
5970   TCW_PTR(this_th->th.th_next_pool, *scan);
5971   __kmp_thread_pool_insert_pt = *scan = this_th;
5972   KMP_DEBUG_ASSERT((this_th->th.th_next_pool == NULL) ||
5973                    (this_th->th.th_info.ds.ds_gtid <
5974                     this_th->th.th_next_pool->th.th_info.ds.ds_gtid));
5975   TCW_4(this_th->th.th_in_pool, TRUE);
5976   __kmp_suspend_initialize_thread(this_th);
5977   __kmp_lock_suspend_mx(this_th);
5978   if (this_th->th.th_active == TRUE) {
5979     KMP_ATOMIC_INC(&__kmp_thread_pool_active_nth);
5980     this_th->th.th_active_in_pool = TRUE;
5981   }
5982 #if KMP_DEBUG
5983   else {
5984     KMP_DEBUG_ASSERT(this_th->th.th_active_in_pool == FALSE);
5985   }
5986 #endif
5987   __kmp_unlock_suspend_mx(this_th);
5988 
5989   TCW_4(__kmp_nth, __kmp_nth - 1);
5990 
5991 #ifdef KMP_ADJUST_BLOCKTIME
5992   /* Adjust blocktime back to user setting or default if necessary */
5993   /* Middle initialization might never have occurred                */
5994   if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
5995     KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
5996     if (__kmp_nth <= __kmp_avail_proc) {
5997       __kmp_zero_bt = FALSE;
5998     }
5999   }
6000 #endif /* KMP_ADJUST_BLOCKTIME */
6001 
6002   KMP_MB();
6003 }
6004 
6005 /* ------------------------------------------------------------------------ */
6006 
     // Main routine of a worker thread. Until __kmp_global.g.g_done is set, the
     // thread repeatedly: waits in __kmp_fork_barrier(), and, if it has been
     // attached to a team (th_team non-NULL), invokes that team's microtask via
     // t_invoke (only when t_pkfn is non-NULL) and then passes through
     // __kmp_join_barrier(). After the loop it reports thread end to OMPD/OMPT
     // (when compiled in), clears th_task_team and destroys the thread's
     // threadprivate data.
     //
     // this_thr - descriptor of the calling worker; its gtid is read from
     //            th_info.ds.ds_gtid.
     // Returns this_thr.
6007 void *__kmp_launch_thread(kmp_info_t *this_thr) {
6008 #if OMP_PROFILING_SUPPORT
       // The time-trace profiler is initialised only when the
       // LIBOMPTARGET_PROFILE environment variable is set.
6009   ProfileTraceFile = getenv("LIBOMPTARGET_PROFILE");
6010   // TODO: add a configuration option for time granularity
6011   if (ProfileTraceFile)
6012     llvm::timeTraceProfilerInitialize(500 /* us */, "libomptarget");
6013 #endif
6014 
6015   int gtid = this_thr->th.th_info.ds.ds_gtid;
6016   /*    void                 *stack_data;*/
6017   kmp_team_t **volatile pteam;
6018 
6019   KMP_MB();
6020   KA_TRACE(10, ("__kmp_launch_thread: T#%d start\n", gtid));
6021 
6022   if (__kmp_env_consistency_check) {
6023     this_thr->th.th_cons = __kmp_allocate_cons_stack(gtid); // ATT: Memory leak?
6024   }
6025 
6026 #if OMPD_SUPPORT
6027   if (ompd_state & OMPD_ENABLE_BP)
6028     ompd_bp_thread_begin();
6029 #endif
6030 
6031 #if OMPT_SUPPORT
       // thread_data stays nullptr unless ompt_enabled.enabled is set here.
6032   ompt_data_t *thread_data = nullptr;
6033   if (ompt_enabled.enabled) {
6034     thread_data = &(this_thr->th.ompt_thread_info.thread_data);
6035     *thread_data = ompt_data_none;
6036 
6037     this_thr->th.ompt_thread_info.state = ompt_state_overhead;
6038     this_thr->th.ompt_thread_info.wait_id = 0;
6039     this_thr->th.ompt_thread_info.idle_frame = OMPT_GET_FRAME_ADDRESS(0);
6040     this_thr->th.ompt_thread_info.parallel_flags = 0;
6041     if (ompt_enabled.ompt_callback_thread_begin) {
6042       ompt_callbacks.ompt_callback(ompt_callback_thread_begin)(
6043           ompt_thread_worker, thread_data);
6044     }
6045     this_thr->th.ompt_thread_info.state = ompt_state_idle;
6046   }
6047 #endif
6048 
6049   /* This is the place where threads wait for work */
6050   while (!TCR_4(__kmp_global.g.g_done)) {
6051     KMP_DEBUG_ASSERT(this_thr == __kmp_threads[gtid]);
6052     KMP_MB();
6053 
6054     /* wait for work to do */
6055     KA_TRACE(20, ("__kmp_launch_thread: T#%d waiting for work\n", gtid));
6056 
6057     /* No tid yet since not part of a team */
6058     __kmp_fork_barrier(gtid, KMP_GTID_DNE);
6059 
6060 #if OMPT_SUPPORT
6061     if (ompt_enabled.enabled) {
6062       this_thr->th.ompt_thread_info.state = ompt_state_overhead;
6063     }
6064 #endif
6065 
6066     pteam = &this_thr->th.th_team;
6067 
6068     /* have we been allocated? */
         // g_done is re-checked here because it may have been set while the
         // thread was waiting in the fork barrier above.
6069     if (TCR_SYNC_PTR(*pteam) && !TCR_4(__kmp_global.g.g_done)) {
6070       /* we were just woken up, so run our new task */
6071       if (TCR_SYNC_PTR((*pteam)->t.t_pkfn) != NULL) {
6072         int rc;
6073         KA_TRACE(20,
6074                  ("__kmp_launch_thread: T#%d(%d:%d) invoke microtask = %p\n",
6075                   gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid),
6076                   (*pteam)->t.t_pkfn));
6077 
6078         updateHWFPControl(*pteam);
6079 
6080 #if OMPT_SUPPORT
6081         if (ompt_enabled.enabled) {
6082           this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
6083         }
6084 #endif
6085 
             // A zero return from t_invoke is treated as fatal (KMP_ASSERT).
6086         rc = (*pteam)->t.t_invoke(gtid);
6087         KMP_ASSERT(rc);
6088 
6089         KMP_MB();
6090         KA_TRACE(20, ("__kmp_launch_thread: T#%d(%d:%d) done microtask = %p\n",
6091                       gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid),
6092                       (*pteam)->t.t_pkfn));
6093       }
6094 #if OMPT_SUPPORT
6095       if (ompt_enabled.enabled) {
6096         /* no frame set while outside task */
6097         __ompt_get_task_info_object(0)->frame.exit_frame = ompt_data_none;
6098 
6099         this_thr->th.ompt_thread_info.state = ompt_state_overhead;
6100       }
6101 #endif
6102       /* join barrier after parallel region */
           // Reached whenever the thread had a team, even if no microtask was
           // invoked in this iteration.
6103       __kmp_join_barrier(gtid);
6104     }
6105   }
       // NOTE(review): the value read here is discarded; the purpose of this
       // statement is not evident from this function -- confirm.
6106   TCR_SYNC_PTR((intptr_t)__kmp_global.g.g_done);
6107 
6108 #if OMPD_SUPPORT
6109   if (ompd_state & OMPD_ENABLE_BP)
6110     ompd_bp_thread_end();
6111 #endif
6112 
6113 #if OMPT_SUPPORT
       // NOTE(review): thread_data is nullptr here if ompt_enabled.enabled was
       // false at thread start; presumably the callback cannot be registered in
       // that case -- confirm.
6114   if (ompt_enabled.ompt_callback_thread_end) {
6115     ompt_callbacks.ompt_callback(ompt_callback_thread_end)(thread_data);
6116   }
6117 #endif
6118 
6119   this_thr->th.th_task_team = NULL;
6120   /* run the destructors for the threadprivate data for this thread */
6121   __kmp_common_destroy_gtid(gtid);
6122 
6123   KA_TRACE(10, ("__kmp_launch_thread: T#%d done\n", gtid));
6124   KMP_MB();
6125 
6126 #if OMP_PROFILING_SUPPORT
6127   llvm::timeTraceProfilerFinishThread();
6128 #endif
6129   return this_thr;
6130 }
6131 
6132 /* ------------------------------------------------------------------------ */
6133 
6134 void __kmp_internal_end_dest(void *specific_gtid) {
6135   // Make sure no significant bits are lost
6136   int gtid;
6137   __kmp_type_convert((kmp_intptr_t)specific_gtid - 1, &gtid);
6138 
6139   KA_TRACE(30, ("__kmp_internal_end_dest: T#%d\n", gtid));
6140   /* NOTE: the gtid is stored as gitd+1 in the thread-local-storage
6141    * this is because 0 is reserved for the nothing-stored case */
6142 
6143   __kmp_internal_end_thread(gtid);
6144 }
6145 
6146 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB
6147 
// Library destructor, compiled only when KMP_OS_UNIX && KMP_DYNAMIC_LIB.
// It carries the compiler's destructor attribute and does nothing but forward
// to the common exit-time shutdown routine __kmp_internal_end_atexit().
6148 __attribute__((destructor)) void __kmp_internal_end_dtor(void) {
6149   __kmp_internal_end_atexit();
6150 }
6151 
6152 #endif
6153 
6154 /* [Windows] josh: when the atexit handler is called, there may still be more
6155    than one thread alive */
// Exit-time shutdown hook. Requests a library shutdown through
// __kmp_internal_end_library(-1); the negative argument makes that routine
// look up the calling thread's gtid itself. On Windows the console is closed
// afterwards as well.
6156 void __kmp_internal_end_atexit(void) {
6157   KA_TRACE(30, ("__kmp_internal_end_atexit\n"));
6158   /* [Windows]
6159      josh: ideally, we want to completely shutdown the library in this atexit
6160      handler, but stat code that depends on thread specific data for gtid fails
6161      because that data becomes unavailable at some point during the shutdown, so
6162      we call __kmp_internal_end_thread instead. We should eventually remove the
6163      dependency on __kmp_get_specific_gtid in the stat code and use
6164      __kmp_internal_end_library to cleanly shutdown the library.
6165 
6166      // TODO: Can some of this comment about GVS be removed?
6167      I suspect that the offending stat code is executed when the calling thread
6168      tries to clean up a dead root thread's data structures, resulting in GVS
6169      code trying to close the GVS structures for that thread, but since the stat
6170      code uses __kmp_get_specific_gtid to get the gtid with the assumption that
6171      the calling thread is cleaning up itself instead of another thread, it gets
6172      confused. This happens because allowing a thread to unregister and clean up
6173      another thread is a recent modification for addressing an issue.
6174      Based on the current design (20050722), a thread may end up
6175      trying to unregister another thread only if thread death does not trigger
6176      the calling of __kmp_internal_end_thread.  For Linux* OS, there is the
6177      thread specific data destructor function to detect thread death. For
6178      Windows dynamic, there is DllMain(THREAD_DETACH). For Windows static, there
6179      is nothing.  Thus, the workaround is applicable only for Windows static
6180      stat library.

        NOTE(review): the historical text above talks about calling
        __kmp_internal_end_thread, yet the statement below calls
        __kmp_internal_end_library(-1). The note looks stale; confirm and trim. */
6181   __kmp_internal_end_library(-1);
6182 #if KMP_OS_WINDOWS
6183   __kmp_close_console();
6184 #endif
6185 }
6186 
// Tears down one thread descriptor and releases everything it owns.
//   thread  - descriptor to reap; must be non-NULL (debug-asserted). The
//             descriptor itself is freed at the end, so the pointer is dead
//             once this function returns.
//   is_root - nonzero for a root thread; the release-from-barrier, OS-thread
//             termination and pool-counter steps are then skipped.
// The caller is expected to hold __kmp_forkjoin_lock (not verified here).
6187 static void __kmp_reap_thread(kmp_info_t *thread, int is_root) {
6188   // It is assumed __kmp_forkjoin_lock is acquired.
6189 
6190   int gtid;
6191 
6192   KMP_DEBUG_ASSERT(thread != NULL);
6193 
6194   gtid = thread->th.th_info.ds.ds_gtid;
6195 
6196   if (!is_root) {
6197     if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
6198       /* Assume the threads are at the fork barrier here */
6199       KA_TRACE(
6200           20, ("__kmp_reap_thread: releasing T#%d from fork barrier for reap\n",
6201                gtid));
6202       if (__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
           // Distributed barrier: spin until th_used_in_team can be switched
           // from 0 to 3, then wake the thread.
           // NOTE(review): 3 presumably marks "being reaped" -- confirm against
           // the distributed-barrier code.
6203         while (
6204             !KMP_COMPARE_AND_STORE_ACQ32(&(thread->th.th_used_in_team), 0, 3))
6205           KMP_CPU_PAUSE();
6206         __kmp_resume_32(gtid, (kmp_flag_32<false, false> *)NULL);
6207       } else {
6208         /* Need release fence here to prevent seg faults for tree forkjoin
6209            barrier (GEH) */
6210         kmp_flag_64<> flag(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go,
6211                            thread);
6212         __kmp_release_64(&flag);
6213       }
6214     }
6215 
6216     // Terminate OS thread.
6217     __kmp_reap_worker(thread);
6218 
6219     // The thread was killed asynchronously.  If it was actively
6220     // spinning in the thread pool, decrement the global count.
6221     //
6222     // There is a small timing hole here - if the worker thread was just waking
6223     // up after sleeping in the pool, had reset its th_active_in_pool flag but
6224     // not decremented the global counter __kmp_thread_pool_active_nth yet, then
6225     // the global counter might not get updated.
6226     //
6227     // Currently, this can only happen as the library is unloaded,
6228     // so there are no harmful side effects.
6229     if (thread->th.th_active_in_pool) {
6230       thread->th.th_active_in_pool = FALSE;
6231       KMP_ATOMIC_DEC(&__kmp_thread_pool_active_nth);
6232       KMP_DEBUG_ASSERT(__kmp_thread_pool_active_nth >= 0);
6233     }
6234   }
6235 
     // From here on the steps apply to root and worker threads alike.
6236   __kmp_free_implicit_task(thread);
6237 
6238 // Free the fast memory for tasking
6239 #if USE_FAST_MEMORY
6240   __kmp_free_fast_memory(thread);
6241 #endif /* USE_FAST_MEMORY */
6242 
6243   __kmp_suspend_uninitialize_thread(thread);
6244 
     // Remove the descriptor from the global thread table.
6245   KMP_DEBUG_ASSERT(__kmp_threads[gtid] == thread);
6246   TCW_SYNC_PTR(__kmp_threads[gtid], NULL);
6247 
6248   --__kmp_all_nth;
6249   // __kmp_nth was decremented when thread is added to the pool.
6250 
6251 #ifdef KMP_ADJUST_BLOCKTIME
6252   /* Adjust blocktime back to user setting or default if necessary */
6253   /* Middle initialization might never have occurred                */
6254   if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
6255     KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
6256     if (__kmp_nth <= __kmp_avail_proc) {
6257       __kmp_zero_bt = FALSE;
6258     }
6259   }
6260 #endif /* KMP_ADJUST_BLOCKTIME */
6261 
6262   /* free the memory being used */
6263   if (__kmp_env_consistency_check) {
6264     if (thread->th.th_cons) {
6265       __kmp_free_cons_stack(thread->th.th_cons);
6266       thread->th.th_cons = NULL;
6267     }
6268   }
6269 
6270   if (thread->th.th_pri_common != NULL) {
6271     __kmp_free(thread->th.th_pri_common);
6272     thread->th.th_pri_common = NULL;
6273   }
6274 
6275   if (thread->th.th_task_state_memo_stack != NULL) {
6276     __kmp_free(thread->th.th_task_state_memo_stack);
6277     thread->th.th_task_state_memo_stack = NULL;
6278   }
6279 
6280 #if KMP_USE_BGET
6281   if (thread->th.th_local.bget_data != NULL) {
6282     __kmp_finalize_bget(thread);
6283   }
6284 #endif
6285 
6286 #if KMP_AFFINITY_SUPPORTED
6287   if (thread->th.th_affin_mask != NULL) {
6288     KMP_CPU_FREE(thread->th.th_affin_mask);
6289     thread->th.th_affin_mask = NULL;
6290   }
6291 #endif /* KMP_AFFINITY_SUPPORTED */
6292 
6293 #if KMP_USE_HIER_SCHED
6294   if (thread->th.th_hier_bar_data != NULL) {
6295     __kmp_free(thread->th.th_hier_bar_data);
6296     thread->th.th_hier_bar_data = NULL;
6297   }
6298 #endif
6299 
     // Last: reap the thread's serial team, then release the descriptor itself.
6300   __kmp_reap_team(thread->th.th_serial_team);
6301   thread->th.th_serial_team = NULL;
6302   __kmp_free(thread);
6303 
6304   KMP_MB();
6305 
6306 } // __kmp_reap_thread
6307 
6308 static void __kmp_itthash_clean(kmp_info_t *th) {
6309 #if USE_ITT_NOTIFY
6310   if (__kmp_itt_region_domains.count > 0) {
6311     for (int i = 0; i < KMP_MAX_FRAME_DOMAINS; ++i) {
6312       kmp_itthash_entry_t *bucket = __kmp_itt_region_domains.buckets[i];
6313       while (bucket) {
6314         kmp_itthash_entry_t *next = bucket->next_in_bucket;
6315         __kmp_thread_free(th, bucket);
6316         bucket = next;
6317       }
6318     }
6319   }
6320   if (__kmp_itt_barrier_domains.count > 0) {
6321     for (int i = 0; i < KMP_MAX_FRAME_DOMAINS; ++i) {
6322       kmp_itthash_entry_t *bucket = __kmp_itt_barrier_domains.buckets[i];
6323       while (bucket) {
6324         kmp_itthash_entry_t *next = bucket->next_in_bucket;
6325         __kmp_thread_free(th, bucket);
6326         bucket = next;
6327       }
6328     }
6329   }
6330 #endif
6331 }
6332 
// Performs the actual runtime shutdown. Both callers visible in this file
// invoke it with __kmp_initz_lock and __kmp_forkjoin_lock held.
//
// Sequence: unregister the library; on Windows reclaim dead roots; scan for a
// root that is still active; set g_done. If an active root was found, only the
// monitor thread is reaped (KMP_USE_MONITOR builds). Otherwise the pooled
// worker threads, pooled teams and task teams are reaped, the code waits (Unix)
// for threads still flagged th_blocking, __kmp_init_common is cleared and the
// monitor is reaped. In both cases __kmp_init_gtid is then cleared and
// __kmp_cleanup() runs, followed by ompt_fini() when OMPT_SUPPORT is enabled.
6333 static void __kmp_internal_end(void) {
6334   int i;
6335 
6336   /* First, unregister the library */
6337   __kmp_unregister_library();
6338 
6339 #if KMP_OS_WINDOWS
6340   /* In Win static library, we can't tell when a root actually dies, so we
6341      reclaim the data structures for any root threads that have died but not
6342      unregistered themselves, in order to shut down cleanly.
6343      In Win dynamic library we also can't tell when a thread dies.  */
6344   __kmp_reclaim_dead_roots(); // AC: moved here to always clean resources of
6345 // dead roots
6346 #endif
6347 
     // Stop at the first active root; afterwards i < __kmp_threads_capacity
     // exactly when such a root exists.
6348   for (i = 0; i < __kmp_threads_capacity; i++)
6349     if (__kmp_root[i])
6350       if (__kmp_root[i]->r.r_active)
6351         break;
6352   KMP_MB(); /* Flush all pending memory write invalidates.  */
6353   TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6354 
6355   if (i < __kmp_threads_capacity) {
6356 #if KMP_USE_MONITOR
6357     // 2009-09-08 (lev): Other alive roots found. Why do we kill the monitor??
6358     KMP_MB(); /* Flush all pending memory write invalidates.  */
6359 
6360     // Need to check that monitor was initialized before reaping it. If we are
6361     // called from __kmp_atfork_child (which sets __kmp_init_parallel = 0), then
6362     // __kmp_monitor will appear to contain valid data, but it is only valid in
6363     // the parent process, not the child.
6364     // New behavior (201008): instead of keying off of the flag
6365     // __kmp_init_parallel, the monitor thread creation is keyed off
6366     // of the new flag __kmp_init_monitor.
6367     __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
6368     if (TCR_4(__kmp_init_monitor)) {
6369       __kmp_reap_monitor(&__kmp_monitor);
6370       TCW_4(__kmp_init_monitor, 0);
6371     }
6372     __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
6373     KA_TRACE(10, ("__kmp_internal_end: monitor reaped\n"));
6374 #endif // KMP_USE_MONITOR
6375   } else {
6376 /* TODO move this to cleanup code */
6377 #ifdef KMP_DEBUG
6378     /* make sure that everything has properly ended */
6379     for (i = 0; i < __kmp_threads_capacity; i++) {
6380       if (__kmp_root[i]) {
6381         //                    KMP_ASSERT( ! KMP_UBER_GTID( i ) );         // AC:
6382         //                    there can be uber threads alive here
6383         KMP_ASSERT(!__kmp_root[i]->r.r_active); // TODO: can they be active?
6384       }
6385     }
6386 #endif
6387 
6388     KMP_MB();
6389 
6390     // Reap the worker threads.
6391     // This is valid for now, but be careful if threads are reaped sooner.
6392     while (__kmp_thread_pool != NULL) { // Loop thru all the thread in the pool.
6393       // Get the next thread from the pool.
6394       kmp_info_t *thread = CCAST(kmp_info_t *, __kmp_thread_pool);
6395       __kmp_thread_pool = thread->th.th_next_pool;
6396       // Reap it.
6397       KMP_DEBUG_ASSERT(thread->th.th_reap_state == KMP_SAFE_TO_REAP);
6398       thread->th.th_next_pool = NULL;
6399       thread->th.th_in_pool = FALSE;
6400       __kmp_reap_thread(thread, 0);
6401     }
6402     __kmp_thread_pool_insert_pt = NULL;
6403 
6404     // Reap teams.
6405     while (__kmp_team_pool != NULL) { // Loop thru all the teams in the pool.
6406       // Get the next team from the pool.
6407       kmp_team_t *team = CCAST(kmp_team_t *, __kmp_team_pool);
6408       __kmp_team_pool = team->t.t_next_pool;
6409       // Reap it.
6410       team->t.t_next_pool = NULL;
6411       __kmp_reap_team(team);
6412     }
6413 
6414     __kmp_reap_task_teams();
6415 
6416 #if KMP_OS_UNIX
6417     // Threads that are not reaped should not access any resources since they
6418     // are going to be deallocated soon, so the shutdown sequence should wait
6419     // until all threads either exit the final spin-waiting loop or begin
6420     // sleeping after the given blocktime.
6421     for (i = 0; i < __kmp_threads_capacity; i++) {
6422       kmp_info_t *thr = __kmp_threads[i];
6423       while (thr && KMP_ATOMIC_LD_ACQ(&thr->th.th_blocking))
6424         KMP_CPU_PAUSE();
6425     }
6426 #endif
6427 
       // Placeholder loop: intentionally empty until the check below is written.
6428     for (i = 0; i < __kmp_threads_capacity; ++i) {
6429       // TBD: Add some checking...
6430       // Something like KMP_DEBUG_ASSERT( __kmp_thread[ i ] == NULL );
6431     }
6432 
6433     /* Make sure all threadprivate destructors get run by joining with all
6434        worker threads before resetting this flag */
6435     TCW_SYNC_4(__kmp_init_common, FALSE);
6436 
6437     KA_TRACE(10, ("__kmp_internal_end: all workers reaped\n"));
6438     KMP_MB();
6439 
6440 #if KMP_USE_MONITOR
6441     // See note above: One of the possible fixes for CQ138434 / CQ140126
6442     //
6443     // FIXME: push both code fragments down and CSE them?
6444     // push them into __kmp_cleanup() ?
6445     __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
6446     if (TCR_4(__kmp_init_monitor)) {
6447       __kmp_reap_monitor(&__kmp_monitor);
6448       TCW_4(__kmp_init_monitor, 0);
6449     }
6450     __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
6451     KA_TRACE(10, ("__kmp_internal_end: monitor reaped\n"));
6452 #endif
6453   } /* else !__kmp_global.t_active */
6454   TCW_4(__kmp_init_gtid, FALSE);
6455   KMP_MB(); /* Flush all pending memory write invalidates.  */
6456 
6457   __kmp_cleanup();
6458 #if OMPT_SUPPORT
6459   ompt_fini();
6460 #endif
6461 }
6462 
// Shuts the whole library down on behalf of the calling thread.
//   gtid_req - global thread id of the caller when >= 0; otherwise the gtid is
//              obtained from __kmp_gtid_get_specific().
//
// Returns immediately if g_abort or g_done is already set or serial
// initialization never happened. If the hidden helper team is initialized and
// not yet done, it is released and waited for first. Then, depending on the
// caller's gtid:
//   KMP_GTID_SHUTDOWN / KMP_GTID_MONITOR - return without doing anything;
//   KMP_GTID_DNE                         - proceed with the shutdown;
//   uber (root) thread, root active      - set g_abort and g_done, unregister
//                                          the library and return;
//   uber (root) thread, root inactive    - clean the ITT hash tables,
//                                          unregister the root and proceed;
//   any other (worker) thread            - unregister the library and return.
// When proceeding, __kmp_initz_lock and __kmp_forkjoin_lock are taken, the
// abort/done checks are repeated, and __kmp_internal_end() does the work.
6463 void __kmp_internal_end_library(int gtid_req) {
6464   /* if we have already cleaned up, don't try again, it wouldn't be pretty */
6465   /* this shouldn't be a race condition because __kmp_internal_end() is the
6466      only place to clear __kmp_init_serial */
6467   /* we'll check this later too, after we get the lock */
6468   // 2009-09-06: We do not set g_abort without setting g_done. This check looks
6469   // redundant, because the next check will work in any case.
6470   if (__kmp_global.g.g_abort) {
6471     KA_TRACE(11, ("__kmp_internal_end_library: abort, exiting\n"));
6472     /* TODO abort? */
6473     return;
6474   }
6475   if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6476     KA_TRACE(10, ("__kmp_internal_end_library: already finished\n"));
6477     return;
6478   }
6479 
6480   // If hidden helper team has been initialized, we need to deinit it
6481   if (TCR_4(__kmp_init_hidden_helper) &&
6482       !TCR_4(__kmp_hidden_helper_team_done)) {
6483     TCW_SYNC_4(__kmp_hidden_helper_team_done, TRUE);
6484     // First release the main thread to let it continue its work
6485     __kmp_hidden_helper_main_thread_release();
6486     // Wait until the hidden helper team has been destroyed
6487     __kmp_hidden_helper_threads_deinitz_wait();
6488   }
6489 
6490   KMP_MB(); /* Flush all pending memory write invalidates.  */
6491   /* find out who we are and what we should do */
6492   {
6493     int gtid = (gtid_req >= 0) ? gtid_req : __kmp_gtid_get_specific();
6494     KA_TRACE(
6495         10, ("__kmp_internal_end_library: enter T#%d  (%d)\n", gtid, gtid_req));
6496     if (gtid == KMP_GTID_SHUTDOWN) {
6497       KA_TRACE(10, ("__kmp_internal_end_library: !__kmp_init_runtime, system "
6498                     "already shutdown\n"));
6499       return;
6500     } else if (gtid == KMP_GTID_MONITOR) {
6501       KA_TRACE(10, ("__kmp_internal_end_library: monitor thread, gtid not "
6502                     "registered, or system shutdown\n"));
6503       return;
6504     } else if (gtid == KMP_GTID_DNE) {
6505       KA_TRACE(10, ("__kmp_internal_end_library: gtid not registered or system "
6506                     "shutdown\n"));
6507       /* we don't know who we are, but we may still shutdown the library */
6508     } else if (KMP_UBER_GTID(gtid)) {
6509       /* unregister ourselves as an uber thread.  gtid is no longer valid */
6510       if (__kmp_root[gtid]->r.r_active) {
           // Root is still inside a parallel region: flag abort + done and bail
           // out without taking the locks or reaping anything.
6511         __kmp_global.g.g_abort = -1;
6512         TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6513         __kmp_unregister_library();
6514         KA_TRACE(10,
6515                  ("__kmp_internal_end_library: root still active, abort T#%d\n",
6516                   gtid));
6517         return;
6518       } else {
6519         __kmp_itthash_clean(__kmp_threads[gtid]);
6520         KA_TRACE(
6521             10,
6522             ("__kmp_internal_end_library: unregistering sibling T#%d\n", gtid));
6523         __kmp_unregister_root_current_thread(gtid);
6524       }
6525     } else {
6526 /* worker threads may call this function through the atexit handler, if they
6527  * call exit() */
6528 /* For now, skip the usual subsequent processing and just dump the debug buffer.
6529    TODO: do a thorough shutdown instead */
6530 #ifdef DUMP_DEBUG_ON_EXIT
6531       if (__kmp_debug_buf)
6532         __kmp_dump_debug_buffer();
6533 #endif
6534       // added unregister library call here when we switch to shm linux
6535       // if we don't, it will leave lots of files in /dev/shm
6536       // cleanup shared memory file before exiting.
6537       __kmp_unregister_library();
6538       return;
6539     }
6540   }
6541   /* synchronize the termination process */
6542   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6543 
6544   /* have we already finished */
6545   if (__kmp_global.g.g_abort) {
6546     KA_TRACE(10, ("__kmp_internal_end_library: abort, exiting\n"));
6547     /* TODO abort? */
6548     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6549     return;
6550   }
6551   if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6552     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6553     return;
6554   }
6555 
6556   /* We need this lock to enforce mutex between this reading of
6557      __kmp_threads_capacity and the writing by __kmp_register_root.
6558      Alternatively, we can use a counter of roots that is atomically updated by
6559      __kmp_get_global_thread_id_reg, __kmp_do_serial_initialize and
6560      __kmp_internal_end_*.  */
6561   __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
6562 
6563   /* now we can safely conduct the actual termination */
6564   __kmp_internal_end();
6565 
6566   __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6567   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6568 
6569   KA_TRACE(10, ("__kmp_internal_end_library: exit\n"));
6570 
6571 #ifdef DUMP_DEBUG_ON_EXIT
6572   if (__kmp_debug_buf)
6573     __kmp_dump_debug_buffer();
6574 #endif
6575 
6576 #if KMP_OS_WINDOWS
6577   __kmp_close_console();
6578 #endif
6579 
6580   __kmp_fini_allocator();
6581 
6582 } // __kmp_internal_end_library
6583 
// Per-thread shutdown entry point: cleans up after the calling thread and shuts
// the runtime down only if no other root thread remains.
//   gtid_req - global thread id of the caller when >= 0; otherwise the gtid is
//              obtained from __kmp_gtid_get_specific().
//
// Returns immediately if g_abort or g_done is already set or serial
// initialization never happened. If the hidden helper team is initialized and
// not yet done, it is released and waited for first. Then, depending on the
// caller's gtid:
//   KMP_GTID_SHUTDOWN / KMP_GTID_MONITOR / KMP_GTID_DNE - return;
//   uber (root) thread, root active   - set g_abort and g_done and return;
//   uber (root) thread, root inactive - unregister the root and continue;
//   any other (worker) thread         - clear its th_task_team and return.
// In KMP_DYNAMIC_LIB builds the function then returns unless the runtime is
// hard-paused. Otherwise it takes __kmp_initz_lock and __kmp_forkjoin_lock,
// repeats the abort/done checks, and calls __kmp_internal_end() only when no
// gtid in [0, __kmp_threads_capacity) is still an uber thread.
6584 void __kmp_internal_end_thread(int gtid_req) {
6585   int i;
6586 
6587   /* if we have already cleaned up, don't try again, it wouldn't be pretty */
6588   /* this shouldn't be a race condition because __kmp_internal_end() is the
6589    * only place to clear __kmp_init_serial */
6590   /* we'll check this later too, after we get the lock */
6591   // 2009-09-06: We do not set g_abort without setting g_done. This check looks
6592   // redundant, because the next check will work in any case.
6593   if (__kmp_global.g.g_abort) {
6594     KA_TRACE(11, ("__kmp_internal_end_thread: abort, exiting\n"));
6595     /* TODO abort? */
6596     return;
6597   }
6598   if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6599     KA_TRACE(10, ("__kmp_internal_end_thread: already finished\n"));
6600     return;
6601   }
6602 
6603   // If hidden helper team has been initialized, we need to deinit it
6604   if (TCR_4(__kmp_init_hidden_helper) &&
6605       !TCR_4(__kmp_hidden_helper_team_done)) {
6606     TCW_SYNC_4(__kmp_hidden_helper_team_done, TRUE);
6607     // First release the main thread to let it continue its work
6608     __kmp_hidden_helper_main_thread_release();
6609     // Wait until the hidden helper team has been destroyed
6610     __kmp_hidden_helper_threads_deinitz_wait();
6611   }
6612 
6613   KMP_MB(); /* Flush all pending memory write invalidates.  */
6614 
6615   /* find out who we are and what we should do */
6616   {
6617     int gtid = (gtid_req >= 0) ? gtid_req : __kmp_gtid_get_specific();
6618     KA_TRACE(10,
6619              ("__kmp_internal_end_thread: enter T#%d  (%d)\n", gtid, gtid_req));
6620     if (gtid == KMP_GTID_SHUTDOWN) {
6621       KA_TRACE(10, ("__kmp_internal_end_thread: !__kmp_init_runtime, system "
6622                     "already shutdown\n"));
6623       return;
6624     } else if (gtid == KMP_GTID_MONITOR) {
6625       KA_TRACE(10, ("__kmp_internal_end_thread: monitor thread, gtid not "
6626                     "registered, or system shutdown\n"));
6627       return;
6628     } else if (gtid == KMP_GTID_DNE) {
6629       KA_TRACE(10, ("__kmp_internal_end_thread: gtid not registered or system "
6630                     "shutdown\n"));
6631       return;
6632       /* we don't know who we are */
6633     } else if (KMP_UBER_GTID(gtid)) {
6634       /* unregister ourselves as an uber thread.  gtid is no longer valid */
6635       if (__kmp_root[gtid]->r.r_active) {
           // Root is still inside a parallel region: flag abort + done and bail
           // out. Unlike __kmp_internal_end_library, the library registration is
           // not removed on this path.
6636         __kmp_global.g.g_abort = -1;
6637         TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6638         KA_TRACE(10,
6639                  ("__kmp_internal_end_thread: root still active, abort T#%d\n",
6640                   gtid));
6641         return;
6642       } else {
6643         KA_TRACE(10, ("__kmp_internal_end_thread: unregistering sibling T#%d\n",
6644                       gtid));
6645         __kmp_unregister_root_current_thread(gtid);
6646       }
6647     } else {
6648       /* just a worker thread, let's leave */
6649       KA_TRACE(10, ("__kmp_internal_end_thread: worker thread T#%d\n", gtid));
6650 
6651       if (gtid >= 0) {
6652         __kmp_threads[gtid]->th.th_task_team = NULL;
6653       }
6654 
6655       KA_TRACE(10,
6656                ("__kmp_internal_end_thread: worker thread done, exiting T#%d\n",
6657                 gtid));
6658       return;
6659     }
6660   }
6661 #if KMP_DYNAMIC_LIB
6662   if (__kmp_pause_status != kmp_hard_paused)
6663   // AC: lets not shutdown the dynamic library at the exit of uber thread,
6664   // because we will better shutdown later in the library destructor.
6665   {
6666     KA_TRACE(10, ("__kmp_internal_end_thread: exiting T#%d\n", gtid_req));
6667     return;
6668   }
6669 #endif
6670   /* synchronize the termination process */
6671   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6672 
6673   /* have we already finished */
6674   if (__kmp_global.g.g_abort) {
6675     KA_TRACE(10, ("__kmp_internal_end_thread: abort, exiting\n"));
6676     /* TODO abort? */
6677     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6678     return;
6679   }
6680   if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6681     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6682     return;
6683   }
6684 
6685   /* We need this lock to enforce mutex between this reading of
6686      __kmp_threads_capacity and the writing by __kmp_register_root.
6687      Alternatively, we can use a counter of roots that is atomically updated by
6688      __kmp_get_global_thread_id_reg, __kmp_do_serial_initialize and
6689      __kmp_internal_end_*.  */
6690 
6691   /* should we finish the run-time?  are all siblings done? */
6692   __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
6693 
     // Any remaining uber thread means the runtime is still in use: drop both
     // locks and leave the shutdown to a later caller.
6694   for (i = 0; i < __kmp_threads_capacity; ++i) {
6695     if (KMP_UBER_GTID(i)) {
6696       KA_TRACE(
6697           10,
6698           ("__kmp_internal_end_thread: remaining sibling task: gtid==%d\n", i));
6699       __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6700       __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6701       return;
6702     }
6703   }
6704 
6705   /* now we can safely conduct the actual termination */
6706 
6707   __kmp_internal_end();
6708 
6709   __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6710   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6711 
6712   KA_TRACE(10, ("__kmp_internal_end_thread: exit T#%d\n", gtid_req));
6713 
6714 #ifdef DUMP_DEBUG_ON_EXIT
6715   if (__kmp_debug_buf)
6716     __kmp_dump_debug_buffer();
6717 #endif
6718 } // __kmp_internal_end_thread
6719 
6720 // -----------------------------------------------------------------------------
6721 // Library registration stuff.
6722 
6723 static long __kmp_registration_flag = 0;
6724 // Random value used to indicate library initialization.
// __kmp_register_library_startup() sets it to 0xCAFE0000 combined with the low
// 16 bits of the system time read at that point.
6725 static char *__kmp_registration_str = NULL;
6726 // Value to be saved in env var __KMP_REGISTERED_LIB_<pid>.
// Built by __kmp_register_library_startup() as
// "<address of the flag>-<flag value in hex>-<KMP_LIBRARY_FILE>".
6727 
6728 static inline char *__kmp_reg_status_name() {
6729 /* On RHEL 3u5 if linked statically, getpid() returns different values in
6730    each thread. If registration and unregistration go in different threads
6731    (omp_misc_other_root_exit.cpp test case), the name of registered_lib_env
6732    env var can not be found, because the name will contain different pid. */
6733 // macOS* complains about name being too long with additional getuid()
6734 #if KMP_OS_UNIX && !KMP_OS_DARWIN && KMP_DYNAMIC_LIB
6735   return __kmp_str_format("__KMP_REGISTERED_LIB_%d_%d", (int)getpid(),
6736                           (int)getuid());
6737 #else
6738   return __kmp_str_format("__KMP_REGISTERED_LIB_%d", (int)getpid());
6739 #endif
6740 } // __kmp_reg_status_get
6741 
6742 #if defined(KMP_USE_SHM)
6743 // If /dev/shm is not accessible, we will create a temporary file under /tmp.
// Stays nullptr unless __kmp_register_library_startup() had to fall back to
// mkstemp(); in that case it holds the generated file name.
6744 char *temp_reg_status_file_name = nullptr;
6745 #endif
6746 
6747 void __kmp_register_library_startup(void) {
6748 
6749   char *name = __kmp_reg_status_name(); // Name of the environment variable.
6750   int done = 0;
6751   union {
6752     double dtime;
6753     long ltime;
6754   } time;
6755 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
6756   __kmp_initialize_system_tick();
6757 #endif
6758   __kmp_read_system_time(&time.dtime);
6759   __kmp_registration_flag = 0xCAFE0000L | (time.ltime & 0x0000FFFFL);
6760   __kmp_registration_str =
6761       __kmp_str_format("%p-%lx-%s", &__kmp_registration_flag,
6762                        __kmp_registration_flag, KMP_LIBRARY_FILE);
6763 
6764   KA_TRACE(50, ("__kmp_register_library_startup: %s=\"%s\"\n", name,
6765                 __kmp_registration_str));
6766 
6767   while (!done) {
6768 
6769     char *value = NULL; // Actual value of the environment variable.
6770 
6771 #if defined(KMP_USE_SHM)
6772     char *shm_name = __kmp_str_format("/%s", name);
6773     int shm_preexist = 0;
6774     char *data1;
6775     int fd1 = shm_open(shm_name, O_CREAT | O_EXCL | O_RDWR, 0666);
6776     if ((fd1 == -1) && (errno == EEXIST)) {
6777       // file didn't open because it already exists.
6778       // try opening existing file
6779       fd1 = shm_open(shm_name, O_RDWR, 0666);
6780       if (fd1 == -1) { // file didn't open
6781         // error out here
6782         __kmp_fatal(KMP_MSG(FunctionError, "Can't open SHM"), KMP_ERR(0),
6783                     __kmp_msg_null);
6784       } else {
6785         // able to open existing file
6786         shm_preexist = 1;
6787       }
6788     } else if (fd1 == -1) {
6789       // SHM didn't open; it was due to error other than already exists. Try to
6790       // create a temp file under /tmp.
6791       // TODO: /tmp might not always be the temporary directory. For now we will
6792       // not consider TMPDIR. If /tmp is not accessible, we simply error out.
6793       char *temp_file_name = __kmp_str_format("/tmp/%sXXXXXX", name);
6794       fd1 = mkstemp(temp_file_name);
6795       if (fd1 == -1) {
6796         // error out here.
6797         __kmp_fatal(KMP_MSG(FunctionError, "Can't open TEMP"), KMP_ERR(errno),
6798                     __kmp_msg_null);
6799       }
6800       temp_reg_status_file_name = temp_file_name;
6801     }
6802     if (shm_preexist == 0) {
6803       // we created SHM now set size
6804       if (ftruncate(fd1, SHM_SIZE) == -1) {
6805         // error occured setting size;
6806         __kmp_fatal(KMP_MSG(FunctionError, "Can't set size of SHM"),
6807                     KMP_ERR(errno), __kmp_msg_null);
6808       }
6809     }
6810     data1 =
6811         (char *)mmap(0, SHM_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED, fd1, 0);
6812     if (data1 == MAP_FAILED) {
6813       // failed to map shared memory
6814       __kmp_fatal(KMP_MSG(FunctionError, "Can't map SHM"), KMP_ERR(errno),
6815                   __kmp_msg_null);
6816     }
6817     if (shm_preexist == 0) { // set data to SHM, set value
6818       KMP_STRCPY_S(data1, SHM_SIZE, __kmp_registration_str);
6819     }
6820     // Read value from either what we just wrote or existing file.
6821     value = __kmp_str_format("%s", data1); // read value from SHM
6822     munmap(data1, SHM_SIZE);
6823     close(fd1);
6824 #else // Windows and unix with static library
6825     // Set environment variable, but do not overwrite if it is exist.
6826     __kmp_env_set(name, __kmp_registration_str, 0);
6827     // read value to see if it got set
6828     value = __kmp_env_get(name);
6829 #endif
6830 
6831     if (value != NULL && strcmp(value, __kmp_registration_str) == 0) {
6832       done = 1; // Ok, environment variable set successfully, exit the loop.
6833     } else {
6834       // Oops. Write failed. Another copy of OpenMP RTL is in memory.
6835       // Check whether it alive or dead.
6836       int neighbor = 0; // 0 -- unknown status, 1 -- alive, 2 -- dead.
6837       char *tail = value;
6838       char *flag_addr_str = NULL;
6839       char *flag_val_str = NULL;
6840       char const *file_name = NULL;
6841       __kmp_str_split(tail, '-', &flag_addr_str, &tail);
6842       __kmp_str_split(tail, '-', &flag_val_str, &tail);
6843       file_name = tail;
6844       if (tail != NULL) {
6845         unsigned long *flag_addr = 0;
6846         unsigned long flag_val = 0;
6847         KMP_SSCANF(flag_addr_str, "%p", RCAST(void **, &flag_addr));
6848         KMP_SSCANF(flag_val_str, "%lx", &flag_val);
6849         if (flag_addr != 0 && flag_val != 0 && strcmp(file_name, "") != 0) {
6850           // First, check whether environment-encoded address is mapped into
6851           // addr space.
6852           // If so, dereference it to see if it still has the right value.
6853           if (__kmp_is_address_mapped(flag_addr) && *flag_addr == flag_val) {
6854             neighbor = 1;
6855           } else {
6856             // If not, then we know the other copy of the library is no longer
6857             // running.
6858             neighbor = 2;
6859           }
6860         }
6861       }
6862       switch (neighbor) {
6863       case 0: // Cannot parse environment variable -- neighbor status unknown.
6864         // Assume it is the incompatible format of future version of the
6865         // library. Assume the other library is alive.
6866         // WARN( ... ); // TODO: Issue a warning.
6867         file_name = "unknown library";
6868         KMP_FALLTHROUGH();
6869       // Attention! Falling to the next case. That's intentional.
6870       case 1: { // Neighbor is alive.
6871         // Check it is allowed.
6872         char *duplicate_ok = __kmp_env_get("KMP_DUPLICATE_LIB_OK");
6873         if (!__kmp_str_match_true(duplicate_ok)) {
6874           // That's not allowed. Issue fatal error.
6875           __kmp_fatal(KMP_MSG(DuplicateLibrary, KMP_LIBRARY_FILE, file_name),
6876                       KMP_HNT(DuplicateLibrary), __kmp_msg_null);
6877         }
6878         KMP_INTERNAL_FREE(duplicate_ok);
6879         __kmp_duplicate_library_ok = 1;
6880         done = 1; // Exit the loop.
6881       } break;
6882       case 2: { // Neighbor is dead.
6883 
6884 #if defined(KMP_USE_SHM)
6885         // close shared memory.
6886         shm_unlink(shm_name); // this removes file in /dev/shm
6887 #else
6888         // Clear the variable and try to register library again.
6889         __kmp_env_unset(name);
6890 #endif
6891       } break;
6892       default: {
6893         KMP_DEBUG_ASSERT(0);
6894       } break;
6895       }
6896     }
6897     KMP_INTERNAL_FREE((void *)value);
6898 #if defined(KMP_USE_SHM)
6899     KMP_INTERNAL_FREE((void *)shm_name);
6900 #endif
6901   } // while
6902   KMP_INTERNAL_FREE((void *)name);
6903 
6904 } // func __kmp_register_library_startup
6905 
6906 void __kmp_unregister_library(void) {
6907 
6908   char *name = __kmp_reg_status_name();
6909   char *value = NULL;
6910 
6911 #if defined(KMP_USE_SHM)
6912   bool use_shm = true;
6913   char *shm_name = __kmp_str_format("/%s", name);
6914   int fd1 = shm_open(shm_name, O_RDONLY, 0666);
6915   if (fd1 == -1) {
6916     // File did not open. Try the temporary file.
6917     use_shm = false;
6918     KMP_DEBUG_ASSERT(temp_reg_status_file_name);
6919     fd1 = open(temp_reg_status_file_name, O_RDONLY);
6920     if (fd1 == -1) {
6921       // give it up now.
6922       return;
6923     }
6924   }
6925   char *data1 = (char *)mmap(0, SHM_SIZE, PROT_READ, MAP_SHARED, fd1, 0);
6926   if (data1 != MAP_FAILED) {
6927     value = __kmp_str_format("%s", data1); // read value from SHM
6928     munmap(data1, SHM_SIZE);
6929   }
6930   close(fd1);
6931 #else
6932   value = __kmp_env_get(name);
6933 #endif
6934 
6935   KMP_DEBUG_ASSERT(__kmp_registration_flag != 0);
6936   KMP_DEBUG_ASSERT(__kmp_registration_str != NULL);
6937   if (value != NULL && strcmp(value, __kmp_registration_str) == 0) {
6938 //  Ok, this is our variable. Delete it.
6939 #if defined(KMP_USE_SHM)
6940     if (use_shm) {
6941       shm_unlink(shm_name); // this removes file in /dev/shm
6942     } else {
6943       KMP_DEBUG_ASSERT(temp_reg_status_file_name);
6944       unlink(temp_reg_status_file_name); // this removes the temp file
6945     }
6946 #else
6947     __kmp_env_unset(name);
6948 #endif
6949   }
6950 
6951 #if defined(KMP_USE_SHM)
6952   KMP_INTERNAL_FREE(shm_name);
6953   if (!use_shm) {
6954     KMP_DEBUG_ASSERT(temp_reg_status_file_name);
6955     KMP_INTERNAL_FREE(temp_reg_status_file_name);
6956   }
6957 #endif
6958 
6959   KMP_INTERNAL_FREE(__kmp_registration_str);
6960   KMP_INTERNAL_FREE(value);
6961   KMP_INTERNAL_FREE(name);
6962 
6963   __kmp_registration_flag = 0;
6964   __kmp_registration_str = NULL;
6965 
6966 } // __kmp_unregister_library
6967 
6968 // End of Library registration stuff.
6969 // -----------------------------------------------------------------------------
6970 
6971 #if KMP_MIC_SUPPORTED
6972 
6973 static void __kmp_check_mic_type() {
6974   kmp_cpuid_t cpuid_state = {0};
6975   kmp_cpuid_t *cs_p = &cpuid_state;
6976   __kmp_x86_cpuid(1, 0, cs_p);
6977   // We don't support mic1 at the moment
6978   if ((cs_p->eax & 0xff0) == 0xB10) {
6979     __kmp_mic_type = mic2;
6980   } else if ((cs_p->eax & 0xf0ff0) == 0x50670) {
6981     __kmp_mic_type = mic3;
6982   } else {
6983     __kmp_mic_type = non_mic;
6984   }
6985 }
6986 
6987 #endif /* KMP_MIC_SUPPORTED */
6988 
6989 #if KMP_HAVE_UMWAIT
6990 static void __kmp_user_level_mwait_init() {
6991   struct kmp_cpuid buf;
6992   __kmp_x86_cpuid(7, 0, &buf);
6993   __kmp_waitpkg_enabled = ((buf.ecx >> 5) & 1);
6994   __kmp_umwait_enabled = __kmp_waitpkg_enabled && __kmp_user_level_mwait;
6995   __kmp_tpause_enabled = __kmp_waitpkg_enabled && (__kmp_tpause_state > 0);
6996   KF_TRACE(30, ("__kmp_user_level_mwait_init: __kmp_umwait_enabled = %d\n",
6997                 __kmp_umwait_enabled));
6998 }
6999 #elif KMP_HAVE_MWAIT
7000 #ifndef AT_INTELPHIUSERMWAIT
7001 // Spurious, non-existent value that should always fail to return anything.
7002 // Will be replaced with the correct value when we know that.
7003 #define AT_INTELPHIUSERMWAIT 10000
7004 #endif
// getauxval() function is available in RHEL7 and SLES12. If a system with an
// earlier OS is used to build the RTL, we'll use the following internal
// function when the entry is not found.
// The declaration carries KMP_WEAK_ATTRIBUTE_EXTERNAL; the definition that
// follows is a stub that ignores its argument and always returns 0.
// NOTE(review): presumably the weak attribute lets a C-library getauxval()
// take precedence over this stub when one is available -- confirm against the
// definition of KMP_WEAK_ATTRIBUTE_EXTERNAL.
unsigned long getauxval(unsigned long) KMP_WEAK_ATTRIBUTE_EXTERNAL;
unsigned long getauxval(unsigned long) { return 0; }
7010 
7011 static void __kmp_user_level_mwait_init() {
7012   // When getauxval() and correct value of AT_INTELPHIUSERMWAIT are available
7013   // use them to find if the user-level mwait is enabled. Otherwise, forcibly
7014   // set __kmp_mwait_enabled=TRUE on Intel MIC if the environment variable
7015   // KMP_USER_LEVEL_MWAIT was set to TRUE.
7016   if (__kmp_mic_type == mic3) {
7017     unsigned long res = getauxval(AT_INTELPHIUSERMWAIT);
7018     if ((res & 0x1) || __kmp_user_level_mwait) {
7019       __kmp_mwait_enabled = TRUE;
7020       if (__kmp_user_level_mwait) {
7021         KMP_INFORM(EnvMwaitWarn);
7022       }
7023     } else {
7024       __kmp_mwait_enabled = FALSE;
7025     }
7026   }
7027   KF_TRACE(30, ("__kmp_user_level_mwait_init: __kmp_mic_type = %d, "
7028                 "__kmp_mwait_enabled = %d\n",
7029                 __kmp_mic_type, __kmp_mwait_enabled));
7030 }
7031 #endif /* KMP_HAVE_UMWAIT */
7032 
// Performs the serial stage of runtime initialization and sets
// __kmp_init_serial when done. In order, it:
//   - checks the sizes of the fixed-width integer typedefs,
//   - runs the OMPT/OMPD pre-initialization hooks (when compiled in),
//   - initializes the internal allocator and (if __kmp_need_register_serial)
//     registers the library,
//   - initializes the global, atomic and bootstrap locks,
//   - assigns default values to global settings and then calls
//     __kmp_env_initialize(),
//   - allocates the __kmp_threads / __kmp_root arrays and registers the
//     calling thread as the initial root,
//   - installs atfork / atexit / signal handlers where configured.
// This function takes no lock itself; __kmp_serial_initialize calls it while
// holding __kmp_initz_lock.
static void __kmp_do_serial_initialize(void) {
  int i, gtid;
  size_t size;

  KA_TRACE(10, ("__kmp_do_serial_initialize: enter\n"));

  // Sanity-check the sizes of the fixed-width integer typedefs.
  KMP_DEBUG_ASSERT(sizeof(kmp_int32) == 4);
  KMP_DEBUG_ASSERT(sizeof(kmp_uint32) == 4);
  KMP_DEBUG_ASSERT(sizeof(kmp_int64) == 8);
  KMP_DEBUG_ASSERT(sizeof(kmp_uint64) == 8);
  KMP_DEBUG_ASSERT(sizeof(kmp_intptr_t) == sizeof(void *));

#if OMPT_SUPPORT
  ompt_pre_init();
#endif
#if OMPD_SUPPORT
  __kmp_env_dump();
  ompd_init();
#endif

  __kmp_validate_locks();

  /* Initialize internal memory allocator */
  __kmp_init_allocator();

  /* Register the library startup via an environment variable or via mapped
     shared memory file and check to see whether another copy of the library is
     already registered. Since forked child process is often terminated, we
     postpone the registration till middle initialization in the child */
  if (__kmp_need_register_serial)
    __kmp_register_library_startup();

  /* TODO reinitialization of library */
  if (TCR_4(__kmp_global.g.g_done)) {
    KA_TRACE(10, ("__kmp_do_serial_initialize: reinitialization of library\n"));
  }

  // Clear the abort and done flags.
  __kmp_global.g.g_abort = 0;
  TCW_SYNC_4(__kmp_global.g.g_done, FALSE);

/* initialize the locks */
#if KMP_USE_ADAPTIVE_LOCKS
#if KMP_DEBUG_ADAPTIVE_LOCKS
  __kmp_init_speculative_stats();
#endif
#endif
#if KMP_STATS_ENABLED
  __kmp_stats_init();
#endif
  __kmp_init_lock(&__kmp_global_lock);
  __kmp_init_queuing_lock(&__kmp_dispatch_lock);
  __kmp_init_lock(&__kmp_debug_lock);
  __kmp_init_atomic_lock(&__kmp_atomic_lock);
  __kmp_init_atomic_lock(&__kmp_atomic_lock_1i);
  __kmp_init_atomic_lock(&__kmp_atomic_lock_2i);
  __kmp_init_atomic_lock(&__kmp_atomic_lock_4i);
  __kmp_init_atomic_lock(&__kmp_atomic_lock_4r);
  __kmp_init_atomic_lock(&__kmp_atomic_lock_8i);
  __kmp_init_atomic_lock(&__kmp_atomic_lock_8r);
  __kmp_init_atomic_lock(&__kmp_atomic_lock_8c);
  __kmp_init_atomic_lock(&__kmp_atomic_lock_10r);
  __kmp_init_atomic_lock(&__kmp_atomic_lock_16r);
  __kmp_init_atomic_lock(&__kmp_atomic_lock_16c);
  __kmp_init_atomic_lock(&__kmp_atomic_lock_20c);
  __kmp_init_atomic_lock(&__kmp_atomic_lock_32c);
  __kmp_init_bootstrap_lock(&__kmp_forkjoin_lock);
  __kmp_init_bootstrap_lock(&__kmp_exit_lock);
#if KMP_USE_MONITOR
  __kmp_init_bootstrap_lock(&__kmp_monitor_lock);
#endif
  __kmp_init_bootstrap_lock(&__kmp_tp_cached_lock);

  /* conduct initialization and initial setup of configuration */

  __kmp_runtime_initialize();

#if KMP_MIC_SUPPORTED
  __kmp_check_mic_type();
#endif

// Some global variable initialization moved here from kmp_env_initialize()
#ifdef KMP_DEBUG
  kmp_diag = 0;
#endif
  __kmp_abort_delay = 0;

  // From __kmp_init_dflt_team_nth()
  /* assume the entire machine will be used */
  // The upper bound starts at __kmp_xproc and is clamped to the range
  // [KMP_MIN_NTH, __kmp_sys_max_nth].
  __kmp_dflt_team_nth_ub = __kmp_xproc;
  if (__kmp_dflt_team_nth_ub < KMP_MIN_NTH) {
    __kmp_dflt_team_nth_ub = KMP_MIN_NTH;
  }
  if (__kmp_dflt_team_nth_ub > __kmp_sys_max_nth) {
    __kmp_dflt_team_nth_ub = __kmp_sys_max_nth;
  }
  __kmp_max_nth = __kmp_sys_max_nth;
  __kmp_cg_max_nth = __kmp_sys_max_nth;
  __kmp_teams_max_nth = __kmp_xproc; // set a "reasonable" default
  if (__kmp_teams_max_nth > __kmp_sys_max_nth) {
    __kmp_teams_max_nth = __kmp_sys_max_nth;
  }

  // Three vars below moved here from __kmp_env_initialize() "KMP_BLOCKTIME"
  // part
  __kmp_dflt_blocktime = KMP_DEFAULT_BLOCKTIME;
#if KMP_USE_MONITOR
  __kmp_monitor_wakeups =
      KMP_WAKEUPS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups);
  __kmp_bt_intervals =
      KMP_INTERVALS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups);
#endif
  // From "KMP_LIBRARY" part of __kmp_env_initialize()
  __kmp_library = library_throughput;
  // From KMP_SCHEDULE initialization
  __kmp_static = kmp_sch_static_balanced;
// AC: do not use analytical here, because it is non-monotonous
//__kmp_guided = kmp_sch_guided_iterative_chunked;
//__kmp_auto = kmp_sch_guided_analytical_chunked; // AC: it is the default, no
// need to repeat assignment
// Barrier initialization. Moved here from __kmp_env_initialize() Barrier branch
// bit control and barrier method control parts
// The kmp_reduction_barrier_* macros are local to this function: they are
// defined here and #undef'd right after the loop below.
#if KMP_FAST_REDUCTION_BARRIER
#define kmp_reduction_barrier_gather_bb ((int)1)
#define kmp_reduction_barrier_release_bb ((int)1)
#define kmp_reduction_barrier_gather_pat __kmp_barrier_gather_pat_dflt
#define kmp_reduction_barrier_release_pat __kmp_barrier_release_pat_dflt
#endif // KMP_FAST_REDUCTION_BARRIER
  // Give every barrier type the default branch bits and patterns; the
  // reduction barrier is then overridden when KMP_FAST_REDUCTION_BARRIER is
  // set.
  for (i = bs_plain_barrier; i < bs_last_barrier; i++) {
    __kmp_barrier_gather_branch_bits[i] = __kmp_barrier_gather_bb_dflt;
    __kmp_barrier_release_branch_bits[i] = __kmp_barrier_release_bb_dflt;
    __kmp_barrier_gather_pattern[i] = __kmp_barrier_gather_pat_dflt;
    __kmp_barrier_release_pattern[i] = __kmp_barrier_release_pat_dflt;
#if KMP_FAST_REDUCTION_BARRIER
    if (i == bs_reduction_barrier) { // tested and confirmed on ALTIX only (
      // lin_64 ): hyper,1
      __kmp_barrier_gather_branch_bits[i] = kmp_reduction_barrier_gather_bb;
      __kmp_barrier_release_branch_bits[i] = kmp_reduction_barrier_release_bb;
      __kmp_barrier_gather_pattern[i] = kmp_reduction_barrier_gather_pat;
      __kmp_barrier_release_pattern[i] = kmp_reduction_barrier_release_pat;
    }
#endif // KMP_FAST_REDUCTION_BARRIER
  }
#if KMP_FAST_REDUCTION_BARRIER
#undef kmp_reduction_barrier_release_pat
#undef kmp_reduction_barrier_gather_pat
#undef kmp_reduction_barrier_release_bb
#undef kmp_reduction_barrier_gather_bb
#endif // KMP_FAST_REDUCTION_BARRIER
#if KMP_MIC_SUPPORTED
  if (__kmp_mic_type == mic2) { // KNC
    // AC: plane=3,2, forkjoin=2,1 are optimal for 240 threads on KNC
    __kmp_barrier_gather_branch_bits[bs_plain_barrier] = 3; // plain gather
    __kmp_barrier_release_branch_bits[bs_forkjoin_barrier] =
        1; // forkjoin release
    __kmp_barrier_gather_pattern[bs_forkjoin_barrier] = bp_hierarchical_bar;
    __kmp_barrier_release_pattern[bs_forkjoin_barrier] = bp_hierarchical_bar;
  }
#if KMP_FAST_REDUCTION_BARRIER
  if (__kmp_mic_type == mic2) { // KNC
    __kmp_barrier_gather_pattern[bs_reduction_barrier] = bp_hierarchical_bar;
    __kmp_barrier_release_pattern[bs_reduction_barrier] = bp_hierarchical_bar;
  }
#endif // KMP_FAST_REDUCTION_BARRIER
#endif // KMP_MIC_SUPPORTED

// From KMP_CHECKS initialization
#ifdef KMP_DEBUG
  __kmp_env_checks = TRUE; /* development versions have the extra checks */
#else
  __kmp_env_checks = FALSE; /* port versions do not have the extra checks */
#endif

  // From "KMP_FOREIGN_THREADS_THREADPRIVATE" initialization
  __kmp_foreign_tp = TRUE;

  __kmp_global.g.g_dynamic = FALSE;
  __kmp_global.g.g_dynamic_mode = dynamic_default;

  __kmp_init_nesting_mode();

  // NOTE(review): presumably this is where environment settings override the
  // defaults assigned above -- confirm in __kmp_env_initialize().
  __kmp_env_initialize(NULL);

#if KMP_HAVE_MWAIT || KMP_HAVE_UMWAIT
  __kmp_user_level_mwait_init();
#endif
// Print all messages in message catalog for testing purposes.
#ifdef KMP_DEBUG
  char const *val = __kmp_env_get("KMP_DUMP_CATALOG");
  if (__kmp_str_match_true(val)) {
    kmp_str_buf_t buffer;
    __kmp_str_buf_init(&buffer);
    __kmp_i18n_dump_catalog(&buffer);
    __kmp_printf("%s", buffer.str);
    __kmp_str_buf_free(&buffer);
  }
  __kmp_env_free(&val);
#endif

  __kmp_threads_capacity =
      __kmp_initial_threads_capacity(__kmp_dflt_team_nth_ub);
  // Moved here from __kmp_env_initialize() "KMP_ALL_THREADPRIVATE" part
  __kmp_tp_capacity = __kmp_default_tp_capacity(
      __kmp_dflt_team_nth_ub, __kmp_max_nth, __kmp_allThreadsSpecified);

  // If the library is shut down properly, both pools must be NULL. Just in
  // case, set them to NULL -- some memory may leak, but subsequent code will
  // work even if pools are not freed.
  KMP_DEBUG_ASSERT(__kmp_thread_pool == NULL);
  KMP_DEBUG_ASSERT(__kmp_thread_pool_insert_pt == NULL);
  KMP_DEBUG_ASSERT(__kmp_team_pool == NULL);
  __kmp_thread_pool = NULL;
  __kmp_thread_pool_insert_pt = NULL;
  __kmp_team_pool = NULL;

  /* Allocate all of the variable sized records */
  /* NOTE: __kmp_threads_capacity entries are allocated, but the arrays are
   * expandable */
  /* Since allocation is cache-aligned, just add extra padding at the end */
  size =
      (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * __kmp_threads_capacity +
      CACHE_LINE;
  __kmp_threads = (kmp_info_t **)__kmp_allocate(size);
  // __kmp_root shares the allocation: it starts immediately after the
  // __kmp_threads_capacity thread pointers.
  __kmp_root = (kmp_root_t **)((char *)__kmp_threads +
                               sizeof(kmp_info_t *) * __kmp_threads_capacity);

  /* init thread counts */
  KMP_DEBUG_ASSERT(__kmp_all_nth ==
                   0); // Asserts fail if the library is reinitializing and
  KMP_DEBUG_ASSERT(__kmp_nth == 0); // something was wrong in termination.
  __kmp_all_nth = 0;
  __kmp_nth = 0;

  /* setup the uber master thread and hierarchy */
  gtid = __kmp_register_root(TRUE);
  KA_TRACE(10, ("__kmp_do_serial_initialize  T#%d\n", gtid));
  KMP_ASSERT(KMP_UBER_GTID(gtid));
  KMP_ASSERT(KMP_INITIAL_GTID(gtid));

  KMP_MB(); /* Flush all pending memory write invalidates.  */

  __kmp_common_initialize();

#if KMP_OS_UNIX
  /* invoke the child fork handler */
  __kmp_register_atfork();
#endif

#if !KMP_DYNAMIC_LIB ||                                                        \
    ((KMP_COMPILER_ICC || KMP_COMPILER_ICX) && KMP_OS_DARWIN)
  {
    /* Invoke the exit handler when the program finishes, only for static
       library and macOS* dynamic. For other dynamic libraries, we already
       have _fini and DllMain. */
    int rc = atexit(__kmp_internal_end_atexit);
    if (rc != 0) {
      __kmp_fatal(KMP_MSG(FunctionError, "atexit()"), KMP_ERR(rc),
                  __kmp_msg_null);
    }
  }
#endif

#if KMP_HANDLE_SIGNALS
#if KMP_OS_UNIX
  /* NOTE: make sure that this is called before the user installs their own
     signal handlers so that the user handlers are called first. this way they
     can return false, not call our handler, avoid terminating the library, and
     continue execution where they left off. */
  __kmp_install_signals(FALSE);
#endif /* KMP_OS_UNIX */
#if KMP_OS_WINDOWS
  __kmp_install_signals(TRUE);
#endif /* KMP_OS_WINDOWS */
#endif

  /* we have finished the serial initialization */
  __kmp_init_counter++;

  __kmp_init_serial = TRUE;

  if (__kmp_settings) {
    __kmp_env_print();
  }

  if (__kmp_display_env || __kmp_display_env_verbose) {
    __kmp_env_print_2();
  }

#if OMPT_SUPPORT
  ompt_post_init();
#endif

  KMP_MB();

  KA_TRACE(10, ("__kmp_do_serial_initialize: exit\n"));
}
7328 
7329 void __kmp_serial_initialize(void) {
7330   if (__kmp_init_serial) {
7331     return;
7332   }
7333   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7334   if (__kmp_init_serial) {
7335     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7336     return;
7337   }
7338   __kmp_do_serial_initialize();
7339   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7340 }
7341 
7342 static void __kmp_do_middle_initialize(void) {
7343   int i, j;
7344   int prev_dflt_team_nth;
7345 
7346   if (!__kmp_init_serial) {
7347     __kmp_do_serial_initialize();
7348   }
7349 
7350   KA_TRACE(10, ("__kmp_middle_initialize: enter\n"));
7351 
7352   if (UNLIKELY(!__kmp_need_register_serial)) {
7353     // We are in a forked child process. The registration was skipped during
7354     // serial initialization in __kmp_atfork_child handler. Do it here.
7355     __kmp_register_library_startup();
7356   }
7357 
7358   // Save the previous value for the __kmp_dflt_team_nth so that
7359   // we can avoid some reinitialization if it hasn't changed.
7360   prev_dflt_team_nth = __kmp_dflt_team_nth;
7361 
7362 #if KMP_AFFINITY_SUPPORTED
7363   // __kmp_affinity_initialize() will try to set __kmp_ncores to the
7364   // number of cores on the machine.
7365   __kmp_affinity_initialize(__kmp_affinity);
7366 
7367 #endif /* KMP_AFFINITY_SUPPORTED */
7368 
7369   KMP_ASSERT(__kmp_xproc > 0);
7370   if (__kmp_avail_proc == 0) {
7371     __kmp_avail_proc = __kmp_xproc;
7372   }
7373 
7374   // If there were empty places in num_threads list (OMP_NUM_THREADS=,,2,3),
7375   // correct them now
7376   j = 0;
7377   while ((j < __kmp_nested_nth.used) && !__kmp_nested_nth.nth[j]) {
7378     __kmp_nested_nth.nth[j] = __kmp_dflt_team_nth = __kmp_dflt_team_nth_ub =
7379         __kmp_avail_proc;
7380     j++;
7381   }
7382 
7383   if (__kmp_dflt_team_nth == 0) {
7384 #ifdef KMP_DFLT_NTH_CORES
7385     // Default #threads = #cores
7386     __kmp_dflt_team_nth = __kmp_ncores;
7387     KA_TRACE(20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = "
7388                   "__kmp_ncores (%d)\n",
7389                   __kmp_dflt_team_nth));
7390 #else
7391     // Default #threads = #available OS procs
7392     __kmp_dflt_team_nth = __kmp_avail_proc;
7393     KA_TRACE(20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = "
7394                   "__kmp_avail_proc(%d)\n",
7395                   __kmp_dflt_team_nth));
7396 #endif /* KMP_DFLT_NTH_CORES */
7397   }
7398 
7399   if (__kmp_dflt_team_nth < KMP_MIN_NTH) {
7400     __kmp_dflt_team_nth = KMP_MIN_NTH;
7401   }
7402   if (__kmp_dflt_team_nth > __kmp_sys_max_nth) {
7403     __kmp_dflt_team_nth = __kmp_sys_max_nth;
7404   }
7405 
7406   if (__kmp_nesting_mode > 0)
7407     __kmp_set_nesting_mode_threads();
7408 
7409   // There's no harm in continuing if the following check fails,
7410   // but it indicates an error in the previous logic.
7411   KMP_DEBUG_ASSERT(__kmp_dflt_team_nth <= __kmp_dflt_team_nth_ub);
7412 
7413   if (__kmp_dflt_team_nth != prev_dflt_team_nth) {
7414     // Run through the __kmp_threads array and set the num threads icv for each
7415     // root thread that is currently registered with the RTL (which has not
7416     // already explicitly set its nthreads-var with a call to
7417     // omp_set_num_threads()).
7418     for (i = 0; i < __kmp_threads_capacity; i++) {
7419       kmp_info_t *thread = __kmp_threads[i];
7420       if (thread == NULL)
7421         continue;
7422       if (thread->th.th_current_task->td_icvs.nproc != 0)
7423         continue;
7424 
7425       set__nproc(__kmp_threads[i], __kmp_dflt_team_nth);
7426     }
7427   }
7428   KA_TRACE(
7429       20,
7430       ("__kmp_middle_initialize: final value for __kmp_dflt_team_nth = %d\n",
7431        __kmp_dflt_team_nth));
7432 
7433 #ifdef KMP_ADJUST_BLOCKTIME
7434   /* Adjust blocktime to zero if necessary  now that __kmp_avail_proc is set */
7435   if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
7436     KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
7437     if (__kmp_nth > __kmp_avail_proc) {
7438       __kmp_zero_bt = TRUE;
7439     }
7440   }
7441 #endif /* KMP_ADJUST_BLOCKTIME */
7442 
7443   /* we have finished middle initialization */
7444   TCW_SYNC_4(__kmp_init_middle, TRUE);
7445 
7446   KA_TRACE(10, ("__kmp_do_middle_initialize: exit\n"));
7447 }
7448 
7449 void __kmp_middle_initialize(void) {
7450   if (__kmp_init_middle) {
7451     return;
7452   }
7453   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7454   if (__kmp_init_middle) {
7455     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7456     return;
7457   }
7458   __kmp_do_middle_initialize();
7459   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7460 }
7461 
7462 void __kmp_parallel_initialize(void) {
7463   int gtid = __kmp_entry_gtid(); // this might be a new root
7464 
7465   /* synchronize parallel initialization (for sibling) */
7466   if (TCR_4(__kmp_init_parallel))
7467     return;
7468   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7469   if (TCR_4(__kmp_init_parallel)) {
7470     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7471     return;
7472   }
7473 
7474   /* TODO reinitialization after we have already shut down */
7475   if (TCR_4(__kmp_global.g.g_done)) {
7476     KA_TRACE(
7477         10,
7478         ("__kmp_parallel_initialize: attempt to init while shutting down\n"));
7479     __kmp_infinite_loop();
7480   }
7481 
7482   /* jc: The lock __kmp_initz_lock is already held, so calling
7483      __kmp_serial_initialize would cause a deadlock.  So we call
7484      __kmp_do_serial_initialize directly. */
7485   if (!__kmp_init_middle) {
7486     __kmp_do_middle_initialize();
7487   }
7488   __kmp_assign_root_init_mask();
7489   __kmp_resume_if_hard_paused();
7490 
7491   /* begin initialization */
7492   KA_TRACE(10, ("__kmp_parallel_initialize: enter\n"));
7493   KMP_ASSERT(KMP_UBER_GTID(gtid));
7494 
7495 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
7496   // Save the FP control regs.
7497   // Worker threads will set theirs to these values at thread startup.
7498   __kmp_store_x87_fpu_control_word(&__kmp_init_x87_fpu_control_word);
7499   __kmp_store_mxcsr(&__kmp_init_mxcsr);
7500   __kmp_init_mxcsr &= KMP_X86_MXCSR_MASK;
7501 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
7502 
7503 #if KMP_OS_UNIX
7504 #if KMP_HANDLE_SIGNALS
7505   /*  must be after __kmp_serial_initialize  */
7506   __kmp_install_signals(TRUE);
7507 #endif
7508 #endif
7509 
7510   __kmp_suspend_initialize();
7511 
7512 #if defined(USE_LOAD_BALANCE)
7513   if (__kmp_global.g.g_dynamic_mode == dynamic_default) {
7514     __kmp_global.g.g_dynamic_mode = dynamic_load_balance;
7515   }
7516 #else
7517   if (__kmp_global.g.g_dynamic_mode == dynamic_default) {
7518     __kmp_global.g.g_dynamic_mode = dynamic_thread_limit;
7519   }
7520 #endif
7521 
7522   if (__kmp_version) {
7523     __kmp_print_version_2();
7524   }
7525 
7526   /* we have finished parallel initialization */
7527   TCW_SYNC_4(__kmp_init_parallel, TRUE);
7528 
7529   KMP_MB();
7530   KA_TRACE(10, ("__kmp_parallel_initialize: exit\n"));
7531 
7532   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7533 }
7534 
7535 void __kmp_hidden_helper_initialize() {
7536   if (TCR_4(__kmp_init_hidden_helper))
7537     return;
7538 
7539   // __kmp_parallel_initialize is required before we initialize hidden helper
7540   if (!TCR_4(__kmp_init_parallel))
7541     __kmp_parallel_initialize();
7542 
7543   // Double check. Note that this double check should not be placed before
7544   // __kmp_parallel_initialize as it will cause dead lock.
7545   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7546   if (TCR_4(__kmp_init_hidden_helper)) {
7547     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7548     return;
7549   }
7550 
7551 #if KMP_AFFINITY_SUPPORTED
7552   // Initialize hidden helper affinity settings.
7553   // The above __kmp_parallel_initialize() will initialize
7554   // regular affinity (and topology) if not already done.
7555   if (!__kmp_hh_affinity.flags.initialized)
7556     __kmp_affinity_initialize(__kmp_hh_affinity);
7557 #endif
7558 
7559   // Set the count of hidden helper tasks to be executed to zero
7560   KMP_ATOMIC_ST_REL(&__kmp_unexecuted_hidden_helper_tasks, 0);
7561 
7562   // Set the global variable indicating that we're initializing hidden helper
7563   // team/threads
7564   TCW_SYNC_4(__kmp_init_hidden_helper_threads, TRUE);
7565 
7566   // Platform independent initialization
7567   __kmp_do_initialize_hidden_helper_threads();
7568 
7569   // Wait here for the finish of initialization of hidden helper teams
7570   __kmp_hidden_helper_threads_initz_wait();
7571 
7572   // We have finished hidden helper initialization
7573   TCW_SYNC_4(__kmp_init_hidden_helper, TRUE);
7574 
7575   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7576 }
7577 
7578 /* ------------------------------------------------------------------------ */
7579 
7580 void __kmp_run_before_invoked_task(int gtid, int tid, kmp_info_t *this_thr,
7581                                    kmp_team_t *team) {
7582   kmp_disp_t *dispatch;
7583 
7584   KMP_MB();
7585 
7586   /* none of the threads have encountered any constructs, yet. */
7587   this_thr->th.th_local.this_construct = 0;
7588 #if KMP_CACHE_MANAGE
7589   KMP_CACHE_PREFETCH(&this_thr->th.th_bar[bs_forkjoin_barrier].bb.b_arrived);
7590 #endif /* KMP_CACHE_MANAGE */
7591   dispatch = (kmp_disp_t *)TCR_PTR(this_thr->th.th_dispatch);
7592   KMP_DEBUG_ASSERT(dispatch);
7593   KMP_DEBUG_ASSERT(team->t.t_dispatch);
7594   // KMP_DEBUG_ASSERT( this_thr->th.th_dispatch == &team->t.t_dispatch[
7595   // this_thr->th.th_info.ds.ds_tid ] );
7596 
7597   dispatch->th_disp_index = 0; /* reset the dispatch buffer counter */
7598   dispatch->th_doacross_buf_idx = 0; // reset doacross dispatch buffer counter
7599   if (__kmp_env_consistency_check)
7600     __kmp_push_parallel(gtid, team->t.t_ident);
7601 
7602   KMP_MB(); /* Flush all pending memory write invalidates.  */
7603 }
7604 
7605 void __kmp_run_after_invoked_task(int gtid, int tid, kmp_info_t *this_thr,
7606                                   kmp_team_t *team) {
7607   if (__kmp_env_consistency_check)
7608     __kmp_pop_parallel(gtid, team->t.t_ident);
7609 
7610   __kmp_finish_implicit_task(this_thr);
7611 }
7612 
7613 int __kmp_invoke_task_func(int gtid) {
7614   int rc;
7615   int tid = __kmp_tid_from_gtid(gtid);
7616   kmp_info_t *this_thr = __kmp_threads[gtid];
7617   kmp_team_t *team = this_thr->th.th_team;
7618 
7619   __kmp_run_before_invoked_task(gtid, tid, this_thr, team);
7620 #if USE_ITT_BUILD
7621   if (__itt_stack_caller_create_ptr) {
7622     // inform ittnotify about entering user's code
7623     if (team->t.t_stack_id != NULL) {
7624       __kmp_itt_stack_callee_enter((__itt_caller)team->t.t_stack_id);
7625     } else {
7626       KMP_DEBUG_ASSERT(team->t.t_parent->t.t_stack_id != NULL);
7627       __kmp_itt_stack_callee_enter(
7628           (__itt_caller)team->t.t_parent->t.t_stack_id);
7629     }
7630   }
7631 #endif /* USE_ITT_BUILD */
7632 #if INCLUDE_SSC_MARKS
7633   SSC_MARK_INVOKING();
7634 #endif
7635 
7636 #if OMPT_SUPPORT
7637   void *dummy;
7638   void **exit_frame_p;
7639   ompt_data_t *my_task_data;
7640   ompt_data_t *my_parallel_data;
7641   int ompt_team_size;
7642 
7643   if (ompt_enabled.enabled) {
7644     exit_frame_p = &(team->t.t_implicit_task_taskdata[tid]
7645                          .ompt_task_info.frame.exit_frame.ptr);
7646   } else {
7647     exit_frame_p = &dummy;
7648   }
7649 
7650   my_task_data =
7651       &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data);
7652   my_parallel_data = &(team->t.ompt_team_info.parallel_data);
7653   if (ompt_enabled.ompt_callback_implicit_task) {
7654     ompt_team_size = team->t.t_nproc;
7655     ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
7656         ompt_scope_begin, my_parallel_data, my_task_data, ompt_team_size,
7657         __kmp_tid_from_gtid(gtid), ompt_task_implicit);
7658     OMPT_CUR_TASK_INFO(this_thr)->thread_num = __kmp_tid_from_gtid(gtid);
7659   }
7660 #endif
7661 
7662 #if KMP_STATS_ENABLED
7663   stats_state_e previous_state = KMP_GET_THREAD_STATE();
7664   if (previous_state == stats_state_e::TEAMS_REGION) {
7665     KMP_PUSH_PARTITIONED_TIMER(OMP_teams);
7666   } else {
7667     KMP_PUSH_PARTITIONED_TIMER(OMP_parallel);
7668   }
7669   KMP_SET_THREAD_STATE(IMPLICIT_TASK);
7670 #endif
7671 
7672   rc = __kmp_invoke_microtask((microtask_t)TCR_SYNC_PTR(team->t.t_pkfn), gtid,
7673                               tid, (int)team->t.t_argc, (void **)team->t.t_argv
7674 #if OMPT_SUPPORT
7675                               ,
7676                               exit_frame_p
7677 #endif
7678   );
7679 #if OMPT_SUPPORT
7680   *exit_frame_p = NULL;
7681   this_thr->th.ompt_thread_info.parallel_flags |= ompt_parallel_team;
7682 #endif
7683 
7684 #if KMP_STATS_ENABLED
7685   if (previous_state == stats_state_e::TEAMS_REGION) {
7686     KMP_SET_THREAD_STATE(previous_state);
7687   }
7688   KMP_POP_PARTITIONED_TIMER();
7689 #endif
7690 
7691 #if USE_ITT_BUILD
7692   if (__itt_stack_caller_create_ptr) {
7693     // inform ittnotify about leaving user's code
7694     if (team->t.t_stack_id != NULL) {
7695       __kmp_itt_stack_callee_leave((__itt_caller)team->t.t_stack_id);
7696     } else {
7697       KMP_DEBUG_ASSERT(team->t.t_parent->t.t_stack_id != NULL);
7698       __kmp_itt_stack_callee_leave(
7699           (__itt_caller)team->t.t_parent->t.t_stack_id);
7700     }
7701   }
7702 #endif /* USE_ITT_BUILD */
7703   __kmp_run_after_invoked_task(gtid, tid, this_thr, team);
7704 
7705   return rc;
7706 }
7707 
7708 void __kmp_teams_master(int gtid) {
7709   // This routine is called by all primary threads in teams construct
7710   kmp_info_t *thr = __kmp_threads[gtid];
7711   kmp_team_t *team = thr->th.th_team;
7712   ident_t *loc = team->t.t_ident;
7713   thr->th.th_set_nproc = thr->th.th_teams_size.nth;
7714   KMP_DEBUG_ASSERT(thr->th.th_teams_microtask);
7715   KMP_DEBUG_ASSERT(thr->th.th_set_nproc);
7716   KA_TRACE(20, ("__kmp_teams_master: T#%d, Tid %d, microtask %p\n", gtid,
7717                 __kmp_tid_from_gtid(gtid), thr->th.th_teams_microtask));
7718 
7719   // This thread is a new CG root.  Set up the proper variables.
7720   kmp_cg_root_t *tmp = (kmp_cg_root_t *)__kmp_allocate(sizeof(kmp_cg_root_t));
7721   tmp->cg_root = thr; // Make thr the CG root
7722   // Init to thread limit stored when league primary threads were forked
7723   tmp->cg_thread_limit = thr->th.th_current_task->td_icvs.thread_limit;
7724   tmp->cg_nthreads = 1; // Init counter to one active thread, this one
7725   KA_TRACE(100, ("__kmp_teams_master: Thread %p created node %p and init"
7726                  " cg_nthreads to 1\n",
7727                  thr, tmp));
7728   tmp->up = thr->th.th_cg_roots;
7729   thr->th.th_cg_roots = tmp;
7730 
7731 // Launch league of teams now, but not let workers execute
7732 // (they hang on fork barrier until next parallel)
7733 #if INCLUDE_SSC_MARKS
7734   SSC_MARK_FORKING();
7735 #endif
7736   __kmp_fork_call(loc, gtid, fork_context_intel, team->t.t_argc,
7737                   (microtask_t)thr->th.th_teams_microtask, // "wrapped" task
7738                   VOLATILE_CAST(launch_t) __kmp_invoke_task_func, NULL);
7739 #if INCLUDE_SSC_MARKS
7740   SSC_MARK_JOINING();
7741 #endif
7742   // If the team size was reduced from the limit, set it to the new size
7743   if (thr->th.th_team_nproc < thr->th.th_teams_size.nth)
7744     thr->th.th_teams_size.nth = thr->th.th_team_nproc;
7745   // AC: last parameter "1" eliminates join barrier which won't work because
7746   // worker threads are in a fork barrier waiting for more parallel regions
7747   __kmp_join_call(loc, gtid
7748 #if OMPT_SUPPORT
7749                   ,
7750                   fork_context_intel
7751 #endif
7752                   ,
7753                   1);
7754 }
7755 
7756 int __kmp_invoke_teams_master(int gtid) {
7757   kmp_info_t *this_thr = __kmp_threads[gtid];
7758   kmp_team_t *team = this_thr->th.th_team;
7759 #if KMP_DEBUG
7760   if (!__kmp_threads[gtid]->th.th_team->t.t_serialized)
7761     KMP_DEBUG_ASSERT((void *)__kmp_threads[gtid]->th.th_team->t.t_pkfn ==
7762                      (void *)__kmp_teams_master);
7763 #endif
7764   __kmp_run_before_invoked_task(gtid, 0, this_thr, team);
7765 #if OMPT_SUPPORT
7766   int tid = __kmp_tid_from_gtid(gtid);
7767   ompt_data_t *task_data =
7768       &team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data;
7769   ompt_data_t *parallel_data = &team->t.ompt_team_info.parallel_data;
7770   if (ompt_enabled.ompt_callback_implicit_task) {
7771     ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
7772         ompt_scope_begin, parallel_data, task_data, team->t.t_nproc, tid,
7773         ompt_task_initial);
7774     OMPT_CUR_TASK_INFO(this_thr)->thread_num = tid;
7775   }
7776 #endif
7777   __kmp_teams_master(gtid);
7778 #if OMPT_SUPPORT
7779   this_thr->th.ompt_thread_info.parallel_flags |= ompt_parallel_league;
7780 #endif
7781   __kmp_run_after_invoked_task(gtid, 0, this_thr, team);
7782   return 1;
7783 }
7784 
7785 /* this sets the requested number of threads for the next parallel region
7786    encountered by this team. since this should be enclosed in the forkjoin
7787    critical section it should avoid race conditions with asymmetrical nested
7788    parallelism */
7789 
7790 void __kmp_push_num_threads(ident_t *id, int gtid, int num_threads) {
7791   kmp_info_t *thr = __kmp_threads[gtid];
7792 
7793   if (num_threads > 0)
7794     thr->th.th_set_nproc = num_threads;
7795 }
7796 
7797 static void __kmp_push_thread_limit(kmp_info_t *thr, int num_teams,
7798                                     int num_threads) {
7799   KMP_DEBUG_ASSERT(thr);
7800   // Remember the number of threads for inner parallel regions
7801   if (!TCR_4(__kmp_init_middle))
7802     __kmp_middle_initialize(); // get internal globals calculated
7803   __kmp_assign_root_init_mask();
7804   KMP_DEBUG_ASSERT(__kmp_avail_proc);
7805   KMP_DEBUG_ASSERT(__kmp_dflt_team_nth);
7806 
7807   if (num_threads == 0) {
7808     if (__kmp_teams_thread_limit > 0) {
7809       num_threads = __kmp_teams_thread_limit;
7810     } else {
7811       num_threads = __kmp_avail_proc / num_teams;
7812     }
7813     // adjust num_threads w/o warning as it is not user setting
7814     // num_threads = min(num_threads, nthreads-var, thread-limit-var)
7815     // no thread_limit clause specified -  do not change thread-limit-var ICV
7816     if (num_threads > __kmp_dflt_team_nth) {
7817       num_threads = __kmp_dflt_team_nth; // honor nthreads-var ICV
7818     }
7819     if (num_threads > thr->th.th_current_task->td_icvs.thread_limit) {
7820       num_threads = thr->th.th_current_task->td_icvs.thread_limit;
7821     } // prevent team size to exceed thread-limit-var
7822     if (num_teams * num_threads > __kmp_teams_max_nth) {
7823       num_threads = __kmp_teams_max_nth / num_teams;
7824     }
7825     if (num_threads == 0) {
7826       num_threads = 1;
7827     }
7828   } else {
7829     if (num_threads < 0) {
7830       __kmp_msg(kmp_ms_warning, KMP_MSG(CantFormThrTeam, num_threads, 1),
7831                 __kmp_msg_null);
7832       num_threads = 1;
7833     }
7834     // This thread will be the primary thread of the league primary threads
7835     // Store new thread limit; old limit is saved in th_cg_roots list
7836     thr->th.th_current_task->td_icvs.thread_limit = num_threads;
7837     // num_threads = min(num_threads, nthreads-var)
7838     if (num_threads > __kmp_dflt_team_nth) {
7839       num_threads = __kmp_dflt_team_nth; // honor nthreads-var ICV
7840     }
7841     if (num_teams * num_threads > __kmp_teams_max_nth) {
7842       int new_threads = __kmp_teams_max_nth / num_teams;
7843       if (new_threads == 0) {
7844         new_threads = 1;
7845       }
7846       if (new_threads != num_threads) {
7847         if (!__kmp_reserve_warn) { // user asked for too many threads
7848           __kmp_reserve_warn = 1; // conflicts with KMP_TEAMS_THREAD_LIMIT
7849           __kmp_msg(kmp_ms_warning,
7850                     KMP_MSG(CantFormThrTeam, num_threads, new_threads),
7851                     KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
7852         }
7853       }
7854       num_threads = new_threads;
7855     }
7856   }
7857   thr->th.th_teams_size.nth = num_threads;
7858 }
7859 
7860 /* this sets the requested number of teams for the teams region and/or
7861    the number of threads for the next parallel region encountered  */
7862 void __kmp_push_num_teams(ident_t *id, int gtid, int num_teams,
7863                           int num_threads) {
7864   kmp_info_t *thr = __kmp_threads[gtid];
7865   if (num_teams < 0) {
7866     // OpenMP specification requires requested values to be positive,
7867     // but people can send us any value, so we'd better check
7868     __kmp_msg(kmp_ms_warning, KMP_MSG(NumTeamsNotPositive, num_teams, 1),
7869               __kmp_msg_null);
7870     num_teams = 1;
7871   }
7872   if (num_teams == 0) {
7873     if (__kmp_nteams > 0) {
7874       num_teams = __kmp_nteams;
7875     } else {
7876       num_teams = 1; // default number of teams is 1.
7877     }
7878   }
7879   if (num_teams > __kmp_teams_max_nth) { // if too many teams requested?
7880     if (!__kmp_reserve_warn) {
7881       __kmp_reserve_warn = 1;
7882       __kmp_msg(kmp_ms_warning,
7883                 KMP_MSG(CantFormThrTeam, num_teams, __kmp_teams_max_nth),
7884                 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
7885     }
7886     num_teams = __kmp_teams_max_nth;
7887   }
7888   // Set number of teams (number of threads in the outer "parallel" of the
7889   // teams)
7890   thr->th.th_set_nproc = thr->th.th_teams_size.nteams = num_teams;
7891 
7892   __kmp_push_thread_limit(thr, num_teams, num_threads);
7893 }
7894 
7895 /* This sets the requested number of teams for the teams region and/or
7896    the number of threads for the next parallel region encountered  */
7897 void __kmp_push_num_teams_51(ident_t *id, int gtid, int num_teams_lb,
7898                              int num_teams_ub, int num_threads) {
7899   kmp_info_t *thr = __kmp_threads[gtid];
7900   KMP_DEBUG_ASSERT(num_teams_lb >= 0 && num_teams_ub >= 0);
7901   KMP_DEBUG_ASSERT(num_teams_ub >= num_teams_lb);
7902   KMP_DEBUG_ASSERT(num_threads >= 0);
7903 
7904   if (num_teams_lb > num_teams_ub) {
7905     __kmp_fatal(KMP_MSG(FailedToCreateTeam, num_teams_lb, num_teams_ub),
7906                 KMP_HNT(SetNewBound, __kmp_teams_max_nth), __kmp_msg_null);
7907   }
7908 
7909   int num_teams = 1; // defalt number of teams is 1.
7910 
7911   if (num_teams_lb == 0 && num_teams_ub > 0)
7912     num_teams_lb = num_teams_ub;
7913 
7914   if (num_teams_lb == 0 && num_teams_ub == 0) { // no num_teams clause
7915     num_teams = (__kmp_nteams > 0) ? __kmp_nteams : num_teams;
7916     if (num_teams > __kmp_teams_max_nth) {
7917       if (!__kmp_reserve_warn) {
7918         __kmp_reserve_warn = 1;
7919         __kmp_msg(kmp_ms_warning,
7920                   KMP_MSG(CantFormThrTeam, num_teams, __kmp_teams_max_nth),
7921                   KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
7922       }
7923       num_teams = __kmp_teams_max_nth;
7924     }
7925   } else if (num_teams_lb == num_teams_ub) { // requires exact number of teams
7926     num_teams = num_teams_ub;
7927   } else { // num_teams_lb <= num_teams <= num_teams_ub
7928     if (num_threads <= 0) {
7929       if (num_teams_ub > __kmp_teams_max_nth) {
7930         num_teams = num_teams_lb;
7931       } else {
7932         num_teams = num_teams_ub;
7933       }
7934     } else {
7935       num_teams = (num_threads > __kmp_teams_max_nth)
7936                       ? num_teams
7937                       : __kmp_teams_max_nth / num_threads;
7938       if (num_teams < num_teams_lb) {
7939         num_teams = num_teams_lb;
7940       } else if (num_teams > num_teams_ub) {
7941         num_teams = num_teams_ub;
7942       }
7943     }
7944   }
7945   // Set number of teams (number of threads in the outer "parallel" of the
7946   // teams)
7947   thr->th.th_set_nproc = thr->th.th_teams_size.nteams = num_teams;
7948 
7949   __kmp_push_thread_limit(thr, num_teams, num_threads);
7950 }
7951 
7952 // Set the proc_bind var to use in the following parallel region.
7953 void __kmp_push_proc_bind(ident_t *id, int gtid, kmp_proc_bind_t proc_bind) {
7954   kmp_info_t *thr = __kmp_threads[gtid];
7955   thr->th.th_set_proc_bind = proc_bind;
7956 }
7957 
7958 /* Launch the worker threads into the microtask. */
7959 
7960 void __kmp_internal_fork(ident_t *id, int gtid, kmp_team_t *team) {
7961   kmp_info_t *this_thr = __kmp_threads[gtid];
7962 
7963 #ifdef KMP_DEBUG
7964   int f;
7965 #endif /* KMP_DEBUG */
7966 
7967   KMP_DEBUG_ASSERT(team);
7968   KMP_DEBUG_ASSERT(this_thr->th.th_team == team);
7969   KMP_ASSERT(KMP_MASTER_GTID(gtid));
7970   KMP_MB(); /* Flush all pending memory write invalidates.  */
7971 
7972   team->t.t_construct = 0; /* no single directives seen yet */
7973   team->t.t_ordered.dt.t_value =
7974       0; /* thread 0 enters the ordered section first */
7975 
7976   /* Reset the identifiers on the dispatch buffer */
7977   KMP_DEBUG_ASSERT(team->t.t_disp_buffer);
7978   if (team->t.t_max_nproc > 1) {
7979     int i;
7980     for (i = 0; i < __kmp_dispatch_num_buffers; ++i) {
7981       team->t.t_disp_buffer[i].buffer_index = i;
7982       team->t.t_disp_buffer[i].doacross_buf_idx = i;
7983     }
7984   } else {
7985     team->t.t_disp_buffer[0].buffer_index = 0;
7986     team->t.t_disp_buffer[0].doacross_buf_idx = 0;
7987   }
7988 
7989   KMP_MB(); /* Flush all pending memory write invalidates.  */
7990   KMP_ASSERT(this_thr->th.th_team == team);
7991 
7992 #ifdef KMP_DEBUG
7993   for (f = 0; f < team->t.t_nproc; f++) {
7994     KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
7995                      team->t.t_threads[f]->th.th_team_nproc == team->t.t_nproc);
7996   }
7997 #endif /* KMP_DEBUG */
7998 
7999   /* release the worker threads so they may begin working */
8000   __kmp_fork_barrier(gtid, 0);
8001 }
8002 
8003 void __kmp_internal_join(ident_t *id, int gtid, kmp_team_t *team) {
8004   kmp_info_t *this_thr = __kmp_threads[gtid];
8005 
8006   KMP_DEBUG_ASSERT(team);
8007   KMP_DEBUG_ASSERT(this_thr->th.th_team == team);
8008   KMP_ASSERT(KMP_MASTER_GTID(gtid));
8009   KMP_MB(); /* Flush all pending memory write invalidates.  */
8010 
8011   /* Join barrier after fork */
8012 
8013 #ifdef KMP_DEBUG
8014   if (__kmp_threads[gtid] &&
8015       __kmp_threads[gtid]->th.th_team_nproc != team->t.t_nproc) {
8016     __kmp_printf("GTID: %d, __kmp_threads[%d]=%p\n", gtid, gtid,
8017                  __kmp_threads[gtid]);
8018     __kmp_printf("__kmp_threads[%d]->th.th_team_nproc=%d, TEAM: %p, "
8019                  "team->t.t_nproc=%d\n",
8020                  gtid, __kmp_threads[gtid]->th.th_team_nproc, team,
8021                  team->t.t_nproc);
8022     __kmp_print_structure();
8023   }
8024   KMP_DEBUG_ASSERT(__kmp_threads[gtid] &&
8025                    __kmp_threads[gtid]->th.th_team_nproc == team->t.t_nproc);
8026 #endif /* KMP_DEBUG */
8027 
8028   __kmp_join_barrier(gtid); /* wait for everyone */
8029 #if OMPT_SUPPORT
8030   if (ompt_enabled.enabled &&
8031       this_thr->th.ompt_thread_info.state == ompt_state_wait_barrier_implicit) {
8032     int ds_tid = this_thr->th.th_info.ds.ds_tid;
8033     ompt_data_t *task_data = OMPT_CUR_TASK_DATA(this_thr);
8034     this_thr->th.ompt_thread_info.state = ompt_state_overhead;
8035 #if OMPT_OPTIONAL
8036     void *codeptr = NULL;
8037     if (KMP_MASTER_TID(ds_tid) &&
8038         (ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) ||
8039          ompt_callbacks.ompt_callback(ompt_callback_sync_region)))
8040       codeptr = OMPT_CUR_TEAM_INFO(this_thr)->master_return_address;
8041 
8042     if (ompt_enabled.ompt_callback_sync_region_wait) {
8043       ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
8044           ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data,
8045           codeptr);
8046     }
8047     if (ompt_enabled.ompt_callback_sync_region) {
8048       ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
8049           ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data,
8050           codeptr);
8051     }
8052 #endif
8053     if (!KMP_MASTER_TID(ds_tid) && ompt_enabled.ompt_callback_implicit_task) {
8054       ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
8055           ompt_scope_end, NULL, task_data, 0, ds_tid,
8056           ompt_task_implicit); // TODO: Can this be ompt_task_initial?
8057     }
8058   }
8059 #endif
8060 
8061   KMP_MB(); /* Flush all pending memory write invalidates.  */
8062   KMP_ASSERT(this_thr->th.th_team == team);
8063 }
8064 
8065 /* ------------------------------------------------------------------------ */
8066 
8067 #ifdef USE_LOAD_BALANCE
8068 
8069 // Return the worker threads actively spinning in the hot team, if we
8070 // are at the outermost level of parallelism.  Otherwise, return 0.
8071 static int __kmp_active_hot_team_nproc(kmp_root_t *root) {
8072   int i;
8073   int retval;
8074   kmp_team_t *hot_team;
8075 
8076   if (root->r.r_active) {
8077     return 0;
8078   }
8079   hot_team = root->r.r_hot_team;
8080   if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME) {
8081     return hot_team->t.t_nproc - 1; // Don't count primary thread
8082   }
8083 
8084   // Skip the primary thread - it is accounted for elsewhere.
8085   retval = 0;
8086   for (i = 1; i < hot_team->t.t_nproc; i++) {
8087     if (hot_team->t.t_threads[i]->th.th_active) {
8088       retval++;
8089     }
8090   }
8091   return retval;
8092 }
8093 
8094 // Perform an automatic adjustment to the number of
8095 // threads used by the next parallel region.
8096 static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc) {
8097   int retval;
8098   int pool_active;
8099   int hot_team_active;
8100   int team_curr_active;
8101   int system_active;
8102 
8103   KB_TRACE(20, ("__kmp_load_balance_nproc: called root:%p set_nproc:%d\n", root,
8104                 set_nproc));
8105   KMP_DEBUG_ASSERT(root);
8106   KMP_DEBUG_ASSERT(root->r.r_root_team->t.t_threads[0]
8107                        ->th.th_current_task->td_icvs.dynamic == TRUE);
8108   KMP_DEBUG_ASSERT(set_nproc > 1);
8109 
8110   if (set_nproc == 1) {
8111     KB_TRACE(20, ("__kmp_load_balance_nproc: serial execution.\n"));
8112     return 1;
8113   }
8114 
8115   // Threads that are active in the thread pool, active in the hot team for this
8116   // particular root (if we are at the outer par level), and the currently
8117   // executing thread (to become the primary thread) are available to add to the
8118   // new team, but are currently contributing to the system load, and must be
8119   // accounted for.
8120   pool_active = __kmp_thread_pool_active_nth;
8121   hot_team_active = __kmp_active_hot_team_nproc(root);
8122   team_curr_active = pool_active + hot_team_active + 1;
8123 
8124   // Check the system load.
8125   system_active = __kmp_get_load_balance(__kmp_avail_proc + team_curr_active);
8126   KB_TRACE(30, ("__kmp_load_balance_nproc: system active = %d pool active = %d "
8127                 "hot team active = %d\n",
8128                 system_active, pool_active, hot_team_active));
8129 
8130   if (system_active < 0) {
8131     // There was an error reading the necessary info from /proc, so use the
8132     // thread limit algorithm instead. Once we set __kmp_global.g.g_dynamic_mode
8133     // = dynamic_thread_limit, we shouldn't wind up getting back here.
8134     __kmp_global.g.g_dynamic_mode = dynamic_thread_limit;
8135     KMP_WARNING(CantLoadBalUsing, "KMP_DYNAMIC_MODE=thread limit");
8136 
8137     // Make this call behave like the thread limit algorithm.
8138     retval = __kmp_avail_proc - __kmp_nth +
8139              (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
8140     if (retval > set_nproc) {
8141       retval = set_nproc;
8142     }
8143     if (retval < KMP_MIN_NTH) {
8144       retval = KMP_MIN_NTH;
8145     }
8146 
8147     KB_TRACE(20, ("__kmp_load_balance_nproc: thread limit exit. retval:%d\n",
8148                   retval));
8149     return retval;
8150   }
8151 
8152   // There is a slight delay in the load balance algorithm in detecting new
8153   // running procs. The real system load at this instant should be at least as
8154   // large as the #active omp thread that are available to add to the team.
8155   if (system_active < team_curr_active) {
8156     system_active = team_curr_active;
8157   }
8158   retval = __kmp_avail_proc - system_active + team_curr_active;
8159   if (retval > set_nproc) {
8160     retval = set_nproc;
8161   }
8162   if (retval < KMP_MIN_NTH) {
8163     retval = KMP_MIN_NTH;
8164   }
8165 
8166   KB_TRACE(20, ("__kmp_load_balance_nproc: exit. retval:%d\n", retval));
8167   return retval;
8168 } // __kmp_load_balance_nproc()
8169 
8170 #endif /* USE_LOAD_BALANCE */
8171 
8172 /* ------------------------------------------------------------------------ */
8173 
8174 /* NOTE: this is called with the __kmp_init_lock held */
8175 void __kmp_cleanup(void) {
8176   int f;
8177 
8178   KA_TRACE(10, ("__kmp_cleanup: enter\n"));
8179 
8180   if (TCR_4(__kmp_init_parallel)) {
8181 #if KMP_HANDLE_SIGNALS
8182     __kmp_remove_signals();
8183 #endif
8184     TCW_4(__kmp_init_parallel, FALSE);
8185   }
8186 
8187   if (TCR_4(__kmp_init_middle)) {
8188 #if KMP_AFFINITY_SUPPORTED
8189     __kmp_affinity_uninitialize();
8190 #endif /* KMP_AFFINITY_SUPPORTED */
8191     __kmp_cleanup_hierarchy();
8192     TCW_4(__kmp_init_middle, FALSE);
8193   }
8194 
8195   KA_TRACE(10, ("__kmp_cleanup: go serial cleanup\n"));
8196 
8197   if (__kmp_init_serial) {
8198     __kmp_runtime_destroy();
8199     __kmp_init_serial = FALSE;
8200   }
8201 
8202   __kmp_cleanup_threadprivate_caches();
8203 
8204   for (f = 0; f < __kmp_threads_capacity; f++) {
8205     if (__kmp_root[f] != NULL) {
8206       __kmp_free(__kmp_root[f]);
8207       __kmp_root[f] = NULL;
8208     }
8209   }
8210   __kmp_free(__kmp_threads);
8211   // __kmp_threads and __kmp_root were allocated at once, as single block, so
8212   // there is no need in freeing __kmp_root.
8213   __kmp_threads = NULL;
8214   __kmp_root = NULL;
8215   __kmp_threads_capacity = 0;
8216 
8217   // Free old __kmp_threads arrays if they exist.
8218   kmp_old_threads_list_t *ptr = __kmp_old_threads_list;
8219   while (ptr) {
8220     kmp_old_threads_list_t *next = ptr->next;
8221     __kmp_free(ptr->threads);
8222     __kmp_free(ptr);
8223     ptr = next;
8224   }
8225 
8226 #if KMP_USE_DYNAMIC_LOCK
8227   __kmp_cleanup_indirect_user_locks();
8228 #else
8229   __kmp_cleanup_user_locks();
8230 #endif
8231 #if OMPD_SUPPORT
8232   if (ompd_state) {
8233     __kmp_free(ompd_env_block);
8234     ompd_env_block = NULL;
8235     ompd_env_block_size = 0;
8236   }
8237 #endif
8238 
8239 #if KMP_AFFINITY_SUPPORTED
8240   KMP_INTERNAL_FREE(CCAST(char *, __kmp_cpuinfo_file));
8241   __kmp_cpuinfo_file = NULL;
8242 #endif /* KMP_AFFINITY_SUPPORTED */
8243 
8244 #if KMP_USE_ADAPTIVE_LOCKS
8245 #if KMP_DEBUG_ADAPTIVE_LOCKS
8246   __kmp_print_speculative_stats();
8247 #endif
8248 #endif
8249   KMP_INTERNAL_FREE(__kmp_nested_nth.nth);
8250   __kmp_nested_nth.nth = NULL;
8251   __kmp_nested_nth.size = 0;
8252   __kmp_nested_nth.used = 0;
8253   KMP_INTERNAL_FREE(__kmp_nested_proc_bind.bind_types);
8254   __kmp_nested_proc_bind.bind_types = NULL;
8255   __kmp_nested_proc_bind.size = 0;
8256   __kmp_nested_proc_bind.used = 0;
8257   if (__kmp_affinity_format) {
8258     KMP_INTERNAL_FREE(__kmp_affinity_format);
8259     __kmp_affinity_format = NULL;
8260   }
8261 
8262   __kmp_i18n_catclose();
8263 
8264 #if KMP_USE_HIER_SCHED
8265   __kmp_hier_scheds.deallocate();
8266 #endif
8267 
8268 #if KMP_STATS_ENABLED
8269   __kmp_stats_fini();
8270 #endif
8271 
8272   KA_TRACE(10, ("__kmp_cleanup: exit\n"));
8273 }
8274 
8275 /* ------------------------------------------------------------------------ */
8276 
8277 int __kmp_ignore_mppbeg(void) {
8278   char *env;
8279 
8280   if ((env = getenv("KMP_IGNORE_MPPBEG")) != NULL) {
8281     if (__kmp_str_match_false(env))
8282       return FALSE;
8283   }
8284   // By default __kmpc_begin() is no-op.
8285   return TRUE;
8286 }
8287 
8288 int __kmp_ignore_mppend(void) {
8289   char *env;
8290 
8291   if ((env = getenv("KMP_IGNORE_MPPEND")) != NULL) {
8292     if (__kmp_str_match_false(env))
8293       return FALSE;
8294   }
8295   // By default __kmpc_end() is no-op.
8296   return TRUE;
8297 }
8298 
8299 void __kmp_internal_begin(void) {
8300   int gtid;
8301   kmp_root_t *root;
8302 
8303   /* this is a very important step as it will register new sibling threads
8304      and assign these new uber threads a new gtid */
8305   gtid = __kmp_entry_gtid();
8306   root = __kmp_threads[gtid]->th.th_root;
8307   KMP_ASSERT(KMP_UBER_GTID(gtid));
8308 
8309   if (root->r.r_begin)
8310     return;
8311   __kmp_acquire_lock(&root->r.r_begin_lock, gtid);
8312   if (root->r.r_begin) {
8313     __kmp_release_lock(&root->r.r_begin_lock, gtid);
8314     return;
8315   }
8316 
8317   root->r.r_begin = TRUE;
8318 
8319   __kmp_release_lock(&root->r.r_begin_lock, gtid);
8320 }
8321 
8322 /* ------------------------------------------------------------------------ */
8323 
8324 void __kmp_user_set_library(enum library_type arg) {
8325   int gtid;
8326   kmp_root_t *root;
8327   kmp_info_t *thread;
8328 
8329   /* first, make sure we are initialized so we can get our gtid */
8330 
8331   gtid = __kmp_entry_gtid();
8332   thread = __kmp_threads[gtid];
8333 
8334   root = thread->th.th_root;
8335 
8336   KA_TRACE(20, ("__kmp_user_set_library: enter T#%d, arg: %d, %d\n", gtid, arg,
8337                 library_serial));
8338   if (root->r.r_in_parallel) { /* Must be called in serial section of top-level
8339                                   thread */
8340     KMP_WARNING(SetLibraryIncorrectCall);
8341     return;
8342   }
8343 
8344   switch (arg) {
8345   case library_serial:
8346     thread->th.th_set_nproc = 0;
8347     set__nproc(thread, 1);
8348     break;
8349   case library_turnaround:
8350     thread->th.th_set_nproc = 0;
8351     set__nproc(thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth
8352                                            : __kmp_dflt_team_nth_ub);
8353     break;
8354   case library_throughput:
8355     thread->th.th_set_nproc = 0;
8356     set__nproc(thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth
8357                                            : __kmp_dflt_team_nth_ub);
8358     break;
8359   default:
8360     KMP_FATAL(UnknownLibraryType, arg);
8361   }
8362 
8363   __kmp_aux_set_library(arg);
8364 }
8365 
8366 void __kmp_aux_set_stacksize(size_t arg) {
8367   if (!__kmp_init_serial)
8368     __kmp_serial_initialize();
8369 
8370 #if KMP_OS_DARWIN
8371   if (arg & (0x1000 - 1)) {
8372     arg &= ~(0x1000 - 1);
8373     if (arg + 0x1000) /* check for overflow if we round up */
8374       arg += 0x1000;
8375   }
8376 #endif
8377   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
8378 
8379   /* only change the default stacksize before the first parallel region */
8380   if (!TCR_4(__kmp_init_parallel)) {
8381     size_t value = arg; /* argument is in bytes */
8382 
8383     if (value < __kmp_sys_min_stksize)
8384       value = __kmp_sys_min_stksize;
8385     else if (value > KMP_MAX_STKSIZE)
8386       value = KMP_MAX_STKSIZE;
8387 
8388     __kmp_stksize = value;
8389 
8390     __kmp_env_stksize = TRUE; /* was KMP_STACKSIZE specified? */
8391   }
8392 
8393   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
8394 }
8395 
8396 /* set the behaviour of the runtime library */
8397 /* TODO this can cause some odd behaviour with sibling parallelism... */
8398 void __kmp_aux_set_library(enum library_type arg) {
8399   __kmp_library = arg;
8400 
8401   switch (__kmp_library) {
8402   case library_serial: {
8403     KMP_INFORM(LibraryIsSerial);
8404   } break;
8405   case library_turnaround:
8406     if (__kmp_use_yield == 1 && !__kmp_use_yield_exp_set)
8407       __kmp_use_yield = 2; // only yield when oversubscribed
8408     break;
8409   case library_throughput:
8410     if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME)
8411       __kmp_dflt_blocktime = KMP_DEFAULT_BLOCKTIME;
8412     break;
8413   default:
8414     KMP_FATAL(UnknownLibraryType, arg);
8415   }
8416 }
8417 
8418 /* Getting team information common for all team API */
8419 // Returns NULL if not in teams construct
8420 static kmp_team_t *__kmp_aux_get_team_info(int &teams_serialized) {
8421   kmp_info_t *thr = __kmp_entry_thread();
8422   teams_serialized = 0;
8423   if (thr->th.th_teams_microtask) {
8424     kmp_team_t *team = thr->th.th_team;
8425     int tlevel = thr->th.th_teams_level; // the level of the teams construct
8426     int ii = team->t.t_level;
8427     teams_serialized = team->t.t_serialized;
8428     int level = tlevel + 1;
8429     KMP_DEBUG_ASSERT(ii >= tlevel);
8430     while (ii > level) {
8431       for (teams_serialized = team->t.t_serialized;
8432            (teams_serialized > 0) && (ii > level); teams_serialized--, ii--) {
8433       }
8434       if (team->t.t_serialized && (!teams_serialized)) {
8435         team = team->t.t_parent;
8436         continue;
8437       }
8438       if (ii > level) {
8439         team = team->t.t_parent;
8440         ii--;
8441       }
8442     }
8443     return team;
8444   }
8445   return NULL;
8446 }
8447 
8448 int __kmp_aux_get_team_num() {
8449   int serialized;
8450   kmp_team_t *team = __kmp_aux_get_team_info(serialized);
8451   if (team) {
8452     if (serialized > 1) {
8453       return 0; // teams region is serialized ( 1 team of 1 thread ).
8454     } else {
8455       return team->t.t_master_tid;
8456     }
8457   }
8458   return 0;
8459 }
8460 
8461 int __kmp_aux_get_num_teams() {
8462   int serialized;
8463   kmp_team_t *team = __kmp_aux_get_team_info(serialized);
8464   if (team) {
8465     if (serialized > 1) {
8466       return 1;
8467     } else {
8468       return team->t.t_parent->t.t_nproc;
8469     }
8470   }
8471   return 1;
8472 }
8473 
8474 /* ------------------------------------------------------------------------ */
8475 
8476 /*
8477  * Affinity Format Parser
8478  *
8479  * Field is in form of: %[[[0].]size]type
8480  * % and type are required (%% means print a literal '%')
8481  * type is either single char or long name surrounded by {},
8482  * e.g., N or {num_threads}
8483  * 0 => leading zeros
8484  * . => right justified when size is specified
8485  * by default output is left justified
8486  * size is the *minimum* field length
8487  * All other characters are printed as is
8488  *
 * Available field types (short name, {long name}, meaning):
 * t {team_num}          - omp_get_team_num()
 * T {num_teams}         - omp_get_num_teams()
 * L {nesting_level}     - omp_get_level()
 * n {thread_num}        - omp_get_thread_num()
 * N {num_threads}       - omp_get_num_threads()
 * a {ancestor_tnum}     - omp_get_ancestor_thread_num(omp_get_level()-1)
 * H {host}              - name of host machine
 * P {process_id}        - process id (integer)
 * i {native_thread_id}  - native thread identifier (integer)
 * A {thread_affinity}   - comma separated list of integers or integer ranges
 *                         (values of affinity mask); only available when
 *                         KMP_AFFINITY_SUPPORTED is set
8499  *
8500  * Implementation-specific field types can be added
8501  * If a type is unknown, print "undefined"
8502  */
8503 
// Structure holding the short name, long name, and corresponding data type
// for snprintf.  A table of these will represent the entire valid keyword
// field types.
typedef struct kmp_affinity_format_field_t {
  char short_name; // single-character name from spec, e.g., L -> nesting level
  const char *long_name; // braced name from spec, e.g., nesting_level
  char field_format; // data type for snprintf (typically 'd' or 's'
  // for integer or string)
} kmp_affinity_format_field_t;
8513 
// All field types recognized in an affinity format string, one entry per
// type: {short name, long name, snprintf conversion character}.
// The parser scans this table from the top and stops at the first entry that
// matches.
static const kmp_affinity_format_field_t __kmp_affinity_format_table[] = {
#if KMP_AFFINITY_SUPPORTED
    // Only recognized when affinity support is compiled in.
    {'A', "thread_affinity", 's'},
#endif
    {'t', "team_num", 'd'},
    {'T', "num_teams", 'd'},
    {'L', "nesting_level", 'd'},
    {'n', "thread_num", 'd'},
    {'N', "num_threads", 'd'},
    {'a', "ancestor_tnum", 'd'},
    {'H', "host", 's'},
    {'P', "process_id", 'd'},
    {'i', "native_thread_id", 'd'}};
8527 
8528 // Return the number of characters it takes to hold field
8529 static int __kmp_aux_capture_affinity_field(int gtid, const kmp_info_t *th,
8530                                             const char **ptr,
8531                                             kmp_str_buf_t *field_buffer) {
8532   int rc, format_index, field_value;
8533   const char *width_left, *width_right;
8534   bool pad_zeros, right_justify, parse_long_name, found_valid_name;
8535   static const int FORMAT_SIZE = 20;
8536   char format[FORMAT_SIZE] = {0};
8537   char absolute_short_name = 0;
8538 
8539   KMP_DEBUG_ASSERT(gtid >= 0);
8540   KMP_DEBUG_ASSERT(th);
8541   KMP_DEBUG_ASSERT(**ptr == '%');
8542   KMP_DEBUG_ASSERT(field_buffer);
8543 
8544   __kmp_str_buf_clear(field_buffer);
8545 
8546   // Skip the initial %
8547   (*ptr)++;
8548 
8549   // Check for %% first
8550   if (**ptr == '%') {
8551     __kmp_str_buf_cat(field_buffer, "%", 1);
8552     (*ptr)++; // skip over the second %
8553     return 1;
8554   }
8555 
8556   // Parse field modifiers if they are present
8557   pad_zeros = false;
8558   if (**ptr == '0') {
8559     pad_zeros = true;
8560     (*ptr)++; // skip over 0
8561   }
8562   right_justify = false;
8563   if (**ptr == '.') {
8564     right_justify = true;
8565     (*ptr)++; // skip over .
8566   }
8567   // Parse width of field: [width_left, width_right)
8568   width_left = width_right = NULL;
8569   if (**ptr >= '0' && **ptr <= '9') {
8570     width_left = *ptr;
8571     SKIP_DIGITS(*ptr);
8572     width_right = *ptr;
8573   }
8574 
8575   // Create the format for KMP_SNPRINTF based on flags parsed above
8576   format_index = 0;
8577   format[format_index++] = '%';
8578   if (!right_justify)
8579     format[format_index++] = '-';
8580   if (pad_zeros)
8581     format[format_index++] = '0';
8582   if (width_left && width_right) {
8583     int i = 0;
8584     // Only allow 8 digit number widths.
8585     // This also prevents overflowing format variable
8586     while (i < 8 && width_left < width_right) {
8587       format[format_index++] = *width_left;
8588       width_left++;
8589       i++;
8590     }
8591   }
8592 
8593   // Parse a name (long or short)
8594   // Canonicalize the name into absolute_short_name
8595   found_valid_name = false;
8596   parse_long_name = (**ptr == '{');
8597   if (parse_long_name)
8598     (*ptr)++; // skip initial left brace
8599   for (size_t i = 0; i < sizeof(__kmp_affinity_format_table) /
8600                              sizeof(__kmp_affinity_format_table[0]);
8601        ++i) {
8602     char short_name = __kmp_affinity_format_table[i].short_name;
8603     const char *long_name = __kmp_affinity_format_table[i].long_name;
8604     char field_format = __kmp_affinity_format_table[i].field_format;
8605     if (parse_long_name) {
8606       size_t length = KMP_STRLEN(long_name);
8607       if (strncmp(*ptr, long_name, length) == 0) {
8608         found_valid_name = true;
8609         (*ptr) += length; // skip the long name
8610       }
8611     } else if (**ptr == short_name) {
8612       found_valid_name = true;
8613       (*ptr)++; // skip the short name
8614     }
8615     if (found_valid_name) {
8616       format[format_index++] = field_format;
8617       format[format_index++] = '\0';
8618       absolute_short_name = short_name;
8619       break;
8620     }
8621   }
8622   if (parse_long_name) {
8623     if (**ptr != '}') {
8624       absolute_short_name = 0;
8625     } else {
8626       (*ptr)++; // skip over the right brace
8627     }
8628   }
8629 
8630   // Attempt to fill the buffer with the requested
8631   // value using snprintf within __kmp_str_buf_print()
8632   switch (absolute_short_name) {
8633   case 't':
8634     rc = __kmp_str_buf_print(field_buffer, format, __kmp_aux_get_team_num());
8635     break;
8636   case 'T':
8637     rc = __kmp_str_buf_print(field_buffer, format, __kmp_aux_get_num_teams());
8638     break;
8639   case 'L':
8640     rc = __kmp_str_buf_print(field_buffer, format, th->th.th_team->t.t_level);
8641     break;
8642   case 'n':
8643     rc = __kmp_str_buf_print(field_buffer, format, __kmp_tid_from_gtid(gtid));
8644     break;
8645   case 'H': {
8646     static const int BUFFER_SIZE = 256;
8647     char buf[BUFFER_SIZE];
8648     __kmp_expand_host_name(buf, BUFFER_SIZE);
8649     rc = __kmp_str_buf_print(field_buffer, format, buf);
8650   } break;
8651   case 'P':
8652     rc = __kmp_str_buf_print(field_buffer, format, getpid());
8653     break;
8654   case 'i':
8655     rc = __kmp_str_buf_print(field_buffer, format, __kmp_gettid());
8656     break;
8657   case 'N':
8658     rc = __kmp_str_buf_print(field_buffer, format, th->th.th_team->t.t_nproc);
8659     break;
8660   case 'a':
8661     field_value =
8662         __kmp_get_ancestor_thread_num(gtid, th->th.th_team->t.t_level - 1);
8663     rc = __kmp_str_buf_print(field_buffer, format, field_value);
8664     break;
8665 #if KMP_AFFINITY_SUPPORTED
8666   case 'A': {
8667     kmp_str_buf_t buf;
8668     __kmp_str_buf_init(&buf);
8669     __kmp_affinity_str_buf_mask(&buf, th->th.th_affin_mask);
8670     rc = __kmp_str_buf_print(field_buffer, format, buf.str);
8671     __kmp_str_buf_free(&buf);
8672   } break;
8673 #endif
8674   default:
8675     // According to spec, If an implementation does not have info for field
8676     // type, then "undefined" is printed
8677     rc = __kmp_str_buf_print(field_buffer, "%s", "undefined");
8678     // Skip the field
8679     if (parse_long_name) {
8680       SKIP_TOKEN(*ptr);
8681       if (**ptr == '}')
8682         (*ptr)++;
8683     } else {
8684       (*ptr)++;
8685     }
8686   }
8687 
8688   KMP_ASSERT(format_index <= FORMAT_SIZE);
8689   return rc;
8690 }
8691 
8692 /*
8693  * Return number of characters needed to hold the affinity string
8694  * (not including null byte character)
8695  * The resultant string is printed to buffer, which the caller can then
8696  * handle afterwards
8697  */
8698 size_t __kmp_aux_capture_affinity(int gtid, const char *format,
8699                                   kmp_str_buf_t *buffer) {
8700   const char *parse_ptr;
8701   size_t retval;
8702   const kmp_info_t *th;
8703   kmp_str_buf_t field;
8704 
8705   KMP_DEBUG_ASSERT(buffer);
8706   KMP_DEBUG_ASSERT(gtid >= 0);
8707 
8708   __kmp_str_buf_init(&field);
8709   __kmp_str_buf_clear(buffer);
8710 
8711   th = __kmp_threads[gtid];
8712   retval = 0;
8713 
8714   // If format is NULL or zero-length string, then we use
8715   // affinity-format-var ICV
8716   parse_ptr = format;
8717   if (parse_ptr == NULL || *parse_ptr == '\0') {
8718     parse_ptr = __kmp_affinity_format;
8719   }
8720   KMP_DEBUG_ASSERT(parse_ptr);
8721 
8722   while (*parse_ptr != '\0') {
8723     // Parse a field
8724     if (*parse_ptr == '%') {
8725       // Put field in the buffer
8726       int rc = __kmp_aux_capture_affinity_field(gtid, th, &parse_ptr, &field);
8727       __kmp_str_buf_catbuf(buffer, &field);
8728       retval += rc;
8729     } else {
8730       // Put literal character in buffer
8731       __kmp_str_buf_cat(buffer, parse_ptr, 1);
8732       retval++;
8733       parse_ptr++;
8734     }
8735   }
8736   __kmp_str_buf_free(&field);
8737   return retval;
8738 }
8739 
8740 // Displays the affinity string to stdout
8741 void __kmp_aux_display_affinity(int gtid, const char *format) {
8742   kmp_str_buf_t buf;
8743   __kmp_str_buf_init(&buf);
8744   __kmp_aux_capture_affinity(gtid, format, &buf);
8745   __kmp_fprintf(kmp_out, "%s" KMP_END_OF_LINE, buf.str);
8746   __kmp_str_buf_free(&buf);
8747 }
8748 
8749 /* ------------------------------------------------------------------------ */
8750 
8751 void __kmp_aux_set_blocktime(int arg, kmp_info_t *thread, int tid) {
8752   int blocktime = arg; /* argument is in milliseconds */
8753 #if KMP_USE_MONITOR
8754   int bt_intervals;
8755 #endif
8756   kmp_int8 bt_set;
8757 
8758   __kmp_save_internal_controls(thread);
8759 
8760   /* Normalize and set blocktime for the teams */
8761   if (blocktime < KMP_MIN_BLOCKTIME)
8762     blocktime = KMP_MIN_BLOCKTIME;
8763   else if (blocktime > KMP_MAX_BLOCKTIME)
8764     blocktime = KMP_MAX_BLOCKTIME;
8765 
8766   set__blocktime_team(thread->th.th_team, tid, blocktime);
8767   set__blocktime_team(thread->th.th_serial_team, 0, blocktime);
8768 
8769 #if KMP_USE_MONITOR
8770   /* Calculate and set blocktime intervals for the teams */
8771   bt_intervals = KMP_INTERVALS_FROM_BLOCKTIME(blocktime, __kmp_monitor_wakeups);
8772 
8773   set__bt_intervals_team(thread->th.th_team, tid, bt_intervals);
8774   set__bt_intervals_team(thread->th.th_serial_team, 0, bt_intervals);
8775 #endif
8776 
8777   /* Set whether blocktime has been set to "TRUE" */
8778   bt_set = TRUE;
8779 
8780   set__bt_set_team(thread->th.th_team, tid, bt_set);
8781   set__bt_set_team(thread->th.th_serial_team, 0, bt_set);
8782 #if KMP_USE_MONITOR
8783   KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d, "
8784                 "bt_intervals=%d, monitor_updates=%d\n",
8785                 __kmp_gtid_from_tid(tid, thread->th.th_team),
8786                 thread->th.th_team->t.t_id, tid, blocktime, bt_intervals,
8787                 __kmp_monitor_wakeups));
8788 #else
8789   KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d\n",
8790                 __kmp_gtid_from_tid(tid, thread->th.th_team),
8791                 thread->th.th_team->t.t_id, tid, blocktime));
8792 #endif
8793 }
8794 
8795 void __kmp_aux_set_defaults(char const *str, size_t len) {
8796   if (!__kmp_init_serial) {
8797     __kmp_serial_initialize();
8798   }
8799   __kmp_env_initialize(str);
8800 
8801   if (__kmp_settings || __kmp_display_env || __kmp_display_env_verbose) {
8802     __kmp_env_print();
8803   }
8804 } // __kmp_aux_set_defaults
8805 
8806 /* ------------------------------------------------------------------------ */
8807 /* internal fast reduction routines */
8808 
// Choose the reduction method for a reduction construct.
//
//   loc         - source location; its flags tell whether the compiler
//                 generated the atomic variant (KMP_IDENT_ATOMIC_REDUCE)
//   global_tid  - global thread id, used to query the team size
//   num_vars    - number of reduction variables (used by the 32-bit tuning)
//   reduce_size - size of the reduction data (used by the 32-bit Darwin
//                 tuning)
//   reduce_data, reduce_func - both non-NULL means the tree variant was
//                 generated
//   lck         - critical-section name for the critical variant
//
// The selection starts from critical_reduce_block. A team of one thread gets
// empty_reduce_block. Otherwise an architecture/OS specific heuristic may
// pick the atomic or tree method. Finally, if __kmp_force_reduction_method is
// defined and the team has more than one thread, the forced method replaces
// the heuristic choice (falling back to critical, with a warning, when the
// forced variant was not generated).
PACKED_REDUCTION_METHOD_T
__kmp_determine_reduction_method(
    ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars, size_t reduce_size,
    void *reduce_data, void (*reduce_func)(void *lhs_data, void *rhs_data),
    kmp_critical_name *lck) {

  // Default reduction method: critical construct ( lck != NULL, like in current
  // PAROPT )
  // If ( reduce_data!=NULL && reduce_func!=NULL ): the tree-reduction method
  // can be selected by RTL
  // If loc->flags contains KMP_IDENT_ATOMIC_REDUCE, the atomic reduce method
  // can be selected by RTL
  // Finally, it's up to OpenMP RTL to make a decision on which method to select
  // among generated by PAROPT.

  PACKED_REDUCTION_METHOD_T retval;

  int team_size;

  KMP_DEBUG_ASSERT(loc); // it would be nice to test ( loc != 0 )
  KMP_DEBUG_ASSERT(lck); // it would be nice to test ( lck != 0 )

// Function-local helper macros (undefined again before returning): they
// evaluate to true when the corresponding code variant is available.
#define FAST_REDUCTION_ATOMIC_METHOD_GENERATED                                 \
  (loc &&                                                                      \
   ((loc->flags & (KMP_IDENT_ATOMIC_REDUCE)) == (KMP_IDENT_ATOMIC_REDUCE)))
#define FAST_REDUCTION_TREE_METHOD_GENERATED ((reduce_data) && (reduce_func))

  retval = critical_reduce_block;

  // another choice of getting a team size (with 1 dynamic dereference) is
  // slower
  team_size = __kmp_get_team_num_threads(global_tid);
  if (team_size == 1) {

    // A single thread needs no synchronization at all
    retval = empty_reduce_block;

  } else {

    int atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;

#if KMP_ARCH_X86_64 || KMP_ARCH_PPC64 || KMP_ARCH_AARCH64 ||                   \
    KMP_ARCH_MIPS64 || KMP_ARCH_RISCV64 || KMP_ARCH_LOONGARCH64

#if KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD ||     \
    KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN || KMP_OS_HURD

    // 64-bit tuning: with the tree variant available, teams larger than the
    // cutoff use the tree method, smaller ones use atomic if available
    // (otherwise stay critical); without the tree variant use atomic if
    // available.
    int teamsize_cutoff = 4;

#if KMP_MIC_SUPPORTED
    if (__kmp_mic_type != non_mic) {
      teamsize_cutoff = 8;
    }
#endif
    int tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
    if (tree_available) {
      if (team_size <= teamsize_cutoff) {
        if (atomic_available) {
          retval = atomic_reduce_block;
        }
      } else {
        retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER;
      }
    } else if (atomic_available) {
      retval = atomic_reduce_block;
    }
#else
#error "Unknown or unsupported OS"
#endif // KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD ||
       // KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN || KMP_OS_HURD

// NOTE(review): KMP_ARCH_AARCH differs from the KMP_ARCH_AARCH64 spelling
// tested above; if no such macro is defined it simply evaluates to 0 in this
// condition -- confirm whether it is intentional.
#elif KMP_ARCH_X86 || KMP_ARCH_ARM || KMP_ARCH_AARCH || KMP_ARCH_MIPS

#if KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_WINDOWS || KMP_OS_HURD

    // basic tuning

    if (atomic_available) {
      if (num_vars <= 2) { // && ( team_size <= 8 ) due to false-sharing ???
        retval = atomic_reduce_block;
      }
    } // otherwise: use critical section

#elif KMP_OS_DARWIN

    int tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
    if (atomic_available && (num_vars <= 3)) {
      retval = atomic_reduce_block;
    } else if (tree_available) {
      if ((reduce_size > (9 * sizeof(kmp_real64))) &&
          (reduce_size < (2000 * sizeof(kmp_real64)))) {
        retval = TREE_REDUCE_BLOCK_WITH_PLAIN_BARRIER;
      }
    } // otherwise: use critical section

#else
#error "Unknown or unsupported OS"
#endif

#else
#error "Unknown or unsupported architecture"
#endif
  }

  // KMP_FORCE_REDUCTION

  // If the team is serialized (team_size == 1), ignore the forced reduction
  // method and stay with the unsynchronized method (empty_reduce_block)
  if (__kmp_force_reduction_method != reduction_method_not_defined &&
      team_size != 1) {

    PACKED_REDUCTION_METHOD_T forced_retval = critical_reduce_block;

    int atomic_available, tree_available;

    // forced_retval is overwritten with the forced method right here; the
    // cases below only downgrade it when that variant was not generated
    switch ((forced_retval = __kmp_force_reduction_method)) {
    case critical_reduce_block:
      KMP_ASSERT(lck); // lck should be != 0
      break;

    case atomic_reduce_block:
      atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;
      if (!atomic_available) {
        KMP_WARNING(RedMethodNotSupported, "atomic");
        forced_retval = critical_reduce_block;
      }
      break;

    case tree_reduce_block:
      tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
      if (!tree_available) {
        KMP_WARNING(RedMethodNotSupported, "tree");
        forced_retval = critical_reduce_block;
      } else {
#if KMP_FAST_REDUCTION_BARRIER
        forced_retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER;
#endif
      }
      break;

    default:
      KMP_ASSERT(0); // "unsupported method specified"
    }

    retval = forced_retval;
  }

  KA_TRACE(10, ("reduction method selected=%08x\n", retval));

#undef FAST_REDUCTION_TREE_METHOD_GENERATED
#undef FAST_REDUCTION_ATOMIC_METHOD_GENERATED

  return (retval);
}
8961 // this function is for testing set/get/determine reduce method
8962 kmp_int32 __kmp_get_reduce_method(void) {
8963   return ((__kmp_entry_thread()->th.th_local.packed_reduction_method) >> 8);
8964 }
8965 
// Soft pause sets up threads to ignore blocktime and just go to sleep.
// Spin-wait code checks __kmp_pause_status and reacts accordingly.
// This routine only records the request by storing kmp_soft_paused in
// __kmp_pause_status; it does not put any thread to sleep itself.
void __kmp_soft_pause() { __kmp_pause_status = kmp_soft_paused; }
8969 
// Hard pause shuts down the runtime completely.  Resume happens naturally when
// OpenMP is used subsequently.
// The pause status is set to kmp_hard_paused first, then
// __kmp_internal_end_thread(-1) is called.
void __kmp_hard_pause() {
  __kmp_pause_status = kmp_hard_paused;
  __kmp_internal_end_thread(-1);
}
8976 
8977 // Soft resume sets __kmp_pause_status, and wakes up all threads.
8978 void __kmp_resume_if_soft_paused() {
8979   if (__kmp_pause_status == kmp_soft_paused) {
8980     __kmp_pause_status = kmp_not_paused;
8981 
8982     for (int gtid = 1; gtid < __kmp_threads_capacity; ++gtid) {
8983       kmp_info_t *thread = __kmp_threads[gtid];
8984       if (thread) { // Wake it if sleeping
8985         kmp_flag_64<> fl(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go,
8986                          thread);
8987         if (fl.is_sleeping())
8988           fl.resume(gtid);
8989         else if (__kmp_try_suspend_mx(thread)) { // got suspend lock
8990           __kmp_unlock_suspend_mx(thread); // unlock it; it won't sleep
8991         } else { // thread holds the lock and may sleep soon
8992           do { // until either the thread sleeps, or we can get the lock
8993             if (fl.is_sleeping()) {
8994               fl.resume(gtid);
8995               break;
8996             } else if (__kmp_try_suspend_mx(thread)) {
8997               __kmp_unlock_suspend_mx(thread);
8998               break;
8999             }
9000           } while (1);
9001         }
9002       }
9003     }
9004   }
9005 }
9006 
9007 // This function is called via __kmpc_pause_resource. Returns 0 if successful.
9008 // TODO: add warning messages
9009 int __kmp_pause_resource(kmp_pause_status_t level) {
9010   if (level == kmp_not_paused) { // requesting resume
9011     if (__kmp_pause_status == kmp_not_paused) {
9012       // error message about runtime not being paused, so can't resume
9013       return 1;
9014     } else {
9015       KMP_DEBUG_ASSERT(__kmp_pause_status == kmp_soft_paused ||
9016                        __kmp_pause_status == kmp_hard_paused);
9017       __kmp_pause_status = kmp_not_paused;
9018       return 0;
9019     }
9020   } else if (level == kmp_soft_paused) { // requesting soft pause
9021     if (__kmp_pause_status != kmp_not_paused) {
9022       // error message about already being paused
9023       return 1;
9024     } else {
9025       __kmp_soft_pause();
9026       return 0;
9027     }
9028   } else if (level == kmp_hard_paused) { // requesting hard pause
9029     if (__kmp_pause_status != kmp_not_paused) {
9030       // error message about already being paused
9031       return 1;
9032     } else {
9033       __kmp_hard_pause();
9034       return 0;
9035     }
9036   } else {
9037     // error message about invalid level
9038     return 1;
9039   }
9040 }
9041 
9042 void __kmp_omp_display_env(int verbose) {
9043   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
9044   if (__kmp_init_serial == 0)
9045     __kmp_do_serial_initialize();
9046   __kmp_display_env_impl(!verbose, verbose);
9047   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
9048 }
9049 
// The team size is changing, so distributed barrier must be modified
//   team         - team whose distributed barrier (team->t.b) is resized
//   old_nthreads - number of threads currently covered; workers 1 ..
//                  old_nthreads-1 are moved out of the team
//   new_nthreads - thread count the barrier is updated to
// Per-worker state lives in th_used_in_team; as used below: 0 = not in the
// team, 1 = in use, 2 = being moved out, 3 = still transitioning to in use.
void __kmp_resize_dist_barrier(kmp_team_t *team, int old_nthreads,
                               int new_nthreads) {
  KMP_DEBUG_ASSERT(__kmp_barrier_release_pattern[bs_forkjoin_barrier] ==
                   bp_dist_bar);
  kmp_info_t **other_threads = team->t.t_threads;

  // We want all the workers to stop waiting on the barrier while we adjust the
  // size of the team.
  for (int f = 1; f < old_nthreads; ++f) {
    KMP_DEBUG_ASSERT(other_threads[f] != NULL);
    // Ignore threads that are already inactive or not present in the team
    if (team->t.t_threads[f]->th.th_used_in_team.load() == 0) {
      // teams construct causes thread_limit to get passed in, and some of
      // those could be inactive; just ignore them
      continue;
    }
    // If thread is transitioning still to in_use state, wait for it
    if (team->t.t_threads[f]->th.th_used_in_team.load() == 3) {
      while (team->t.t_threads[f]->th.th_used_in_team.load() == 3)
        KMP_CPU_PAUSE();
    }
    // The thread should be in_use now
    KMP_DEBUG_ASSERT(team->t.t_threads[f]->th.th_used_in_team.load() == 1);
    // Transition to unused state
    team->t.t_threads[f]->th.th_used_in_team.store(2);
    KMP_DEBUG_ASSERT(team->t.t_threads[f]->th.th_used_in_team.load() == 2);
  }
  // Release all the workers
  team->t.b->go_release();

  KMP_MFENCE();

  // Workers should see transition status 2 and move to 0; but may need to be
  // woken up first
  // Each pass of the outer loop restarts the count and re-checks every
  // worker; the loop ends only when a whole pass finds all of them at 0.
  int count = old_nthreads - 1;
  while (count > 0) {
    count = old_nthreads - 1;
    for (int f = 1; f < old_nthreads; ++f) {
      if (other_threads[f]->th.th_used_in_team.load() != 0) {
        if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) { // Wake up the workers
          kmp_atomic_flag_64<> *flag = (kmp_atomic_flag_64<> *)CCAST(
              void *, other_threads[f]->th.th_sleep_loc);
          __kmp_atomic_resume_64(other_threads[f]->th.th_info.ds.ds_gtid, flag);
        }
      } else {
        KMP_DEBUG_ASSERT(team->t.t_threads[f]->th.th_used_in_team.load() == 0);
        count--;
      }
    }
  }
  // Now update the barrier size
  team->t.b->update_num_threads(new_nthreads);
  team->t.b->go_reset();
}
9105 
9106 void __kmp_add_threads_to_team(kmp_team_t *team, int new_nthreads) {
9107   // Add the threads back to the team
9108   KMP_DEBUG_ASSERT(team);
9109   // Threads were paused and pointed at th_used_in_team temporarily during a
9110   // resize of the team. We're going to set th_used_in_team to 3 to indicate to
9111   // the thread that it should transition itself back into the team. Then, if
9112   // blocktime isn't infinite, the thread could be sleeping, so we send a resume
9113   // to wake it up.
9114   for (int f = 1; f < new_nthreads; ++f) {
9115     KMP_DEBUG_ASSERT(team->t.t_threads[f]);
9116     KMP_COMPARE_AND_STORE_ACQ32(&(team->t.t_threads[f]->th.th_used_in_team), 0,
9117                                 3);
9118     if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) { // Wake up sleeping threads
9119       __kmp_resume_32(team->t.t_threads[f]->th.th_info.ds.ds_gtid,
9120                       (kmp_flag_32<false, false> *)NULL);
9121     }
9122   }
9123   // The threads should be transitioning to the team; when they are done, they
9124   // should have set th_used_in_team to 1. This loop forces master to wait until
9125   // all threads have moved into the team and are waiting in the barrier.
9126   int count = new_nthreads - 1;
9127   while (count > 0) {
9128     count = new_nthreads - 1;
9129     for (int f = 1; f < new_nthreads; ++f) {
9130       if (team->t.t_threads[f]->th.th_used_in_team.load() == 1) {
9131         count--;
9132       }
9133     }
9134   }
9135 }
9136 
// Globals and functions for hidden helper task
// Address of the hidden helper root's slot in __kmp_threads (assigned in
// __kmp_hidden_helper_threads_initz_routine)
kmp_info_t **__kmp_hidden_helper_threads;
// Thread descriptor of the hidden helper root (assigned in the same routine)
kmp_info_t *__kmp_hidden_helper_main_thread;
std::atomic<kmp_int32> __kmp_unexecuted_hidden_helper_tasks;
// Defaults: on Linux the feature is enabled with 8 hidden helper threads; on
// other systems it is disabled with 0 threads.
#if KMP_OS_LINUX
kmp_int32 __kmp_hidden_helper_threads_num = 8;
kmp_int32 __kmp_enable_hidden_helper = TRUE;
#else
kmp_int32 __kmp_hidden_helper_threads_num = 0;
kmp_int32 __kmp_enable_hidden_helper = FALSE;
#endif
9148 
9149 namespace {
9150 std::atomic<kmp_int32> __kmp_hit_hidden_helper_threads_num;
9151 
9152 void __kmp_hidden_helper_wrapper_fn(int *gtid, int *, ...) {
9153   // This is an explicit synchronization on all hidden helper threads in case
9154   // that when a regular thread pushes a hidden helper task to one hidden
9155   // helper thread, the thread has not been awaken once since they're released
9156   // by the main thread after creating the team.
9157   KMP_ATOMIC_INC(&__kmp_hit_hidden_helper_threads_num);
9158   while (KMP_ATOMIC_LD_ACQ(&__kmp_hit_hidden_helper_threads_num) !=
9159          __kmp_hidden_helper_threads_num)
9160     ;
9161 
9162   // If main thread, then wait for signal
9163   if (__kmpc_master(nullptr, *gtid)) {
9164     // First, unset the initial state and release the initial thread
9165     TCW_4(__kmp_init_hidden_helper_threads, FALSE);
9166     __kmp_hidden_helper_initz_release();
9167     __kmp_hidden_helper_main_thread_wait();
9168     // Now wake up all worker threads
9169     for (int i = 1; i < __kmp_hit_hidden_helper_threads_num; ++i) {
9170       __kmp_hidden_helper_worker_thread_signal();
9171     }
9172   }
9173 }
9174 } // namespace
9175 
9176 void __kmp_hidden_helper_threads_initz_routine() {
9177   // Create a new root for hidden helper team/threads
9178   const int gtid = __kmp_register_root(TRUE);
9179   __kmp_hidden_helper_main_thread = __kmp_threads[gtid];
9180   __kmp_hidden_helper_threads = &__kmp_threads[gtid];
9181   __kmp_hidden_helper_main_thread->th.th_set_nproc =
9182       __kmp_hidden_helper_threads_num;
9183 
9184   KMP_ATOMIC_ST_REL(&__kmp_hit_hidden_helper_threads_num, 0);
9185 
9186   __kmpc_fork_call(nullptr, 0, __kmp_hidden_helper_wrapper_fn);
9187 
9188   // Set the initialization flag to FALSE
9189   TCW_SYNC_4(__kmp_init_hidden_helper, FALSE);
9190 
9191   __kmp_hidden_helper_threads_deinitz_release();
9192 }
9193 
9194 /* Nesting Mode:
9195    Set via KMP_NESTING_MODE, which takes an integer.
9196    Note: we skip duplicate topology levels, and skip levels with only
9197       one entity.
9198    KMP_NESTING_MODE=0 is the default, and doesn't use nesting mode.
9199    KMP_NESTING_MODE=1 sets as many nesting levels as there are distinct levels
9200       in the topology, and initializes the number of threads at each of those
9201       levels to the number of entities at each level, respectively, below the
9202       entity at the parent level.
9203    KMP_NESTING_MODE=N, where N>1, attempts to create up to N nesting levels,
9204       but starts with nesting OFF -- max-active-levels-var is 1 -- and requires
9205       the user to turn nesting on explicitly. This is an even more experimental
9206       option to this experimental feature, and may change or go away in the
9207       future.
9208 */
9209 
9210 // Allocate space to store nesting levels
9211 void __kmp_init_nesting_mode() {
9212   int levels = KMP_HW_LAST;
9213   __kmp_nesting_mode_nlevels = levels;
9214   __kmp_nesting_nth_level = (int *)KMP_INTERNAL_MALLOC(levels * sizeof(int));
9215   for (int i = 0; i < levels; ++i)
9216     __kmp_nesting_nth_level[i] = 0;
9217   if (__kmp_nested_nth.size < levels) {
9218     __kmp_nested_nth.nth =
9219         (int *)KMP_INTERNAL_REALLOC(__kmp_nested_nth.nth, levels * sizeof(int));
9220     __kmp_nested_nth.size = levels;
9221   }
9222 }
9223 
// Set # threads for top levels of nesting; must be called after topology set
// Fills __kmp_nesting_nth_level[] (from the topology when available,
// otherwise from __kmp_avail_proc), copies it into __kmp_nested_nth.nth,
// sets the calling thread's nproc to the first level's count and, for
// KMP_NESTING_MODE=1 only, sets max-active-levels to the number of levels.
void __kmp_set_nesting_mode_threads() {
  kmp_info_t *thread = __kmp_threads[__kmp_entry_gtid()];

  // Upper bound on the number of levels to fill in below
  if (__kmp_nesting_mode == 1)
    __kmp_nesting_mode_nlevels = KMP_MAX_ACTIVE_LEVELS_LIMIT;
  else if (__kmp_nesting_mode > 1)
    __kmp_nesting_mode_nlevels = __kmp_nesting_mode;

  if (__kmp_topology) { // use topology info
    int loc, hw_level;
    for (loc = 0, hw_level = 0; hw_level < __kmp_topology->get_depth() &&
                                loc < __kmp_nesting_mode_nlevels;
         loc++, hw_level++) {
      __kmp_nesting_nth_level[loc] = __kmp_topology->get_ratio(hw_level);
      // A ratio of 1 adds no parallelism: step loc back so that the loop
      // increment leaves it unchanged and the next hardware level overwrites
      // this slot.
      if (__kmp_nesting_nth_level[loc] == 1)
        loc--;
    }
    // Make sure all cores are used
    if (__kmp_nesting_mode > 1 && loc > 1) {
      int core_level = __kmp_topology->get_level(KMP_HW_CORE);
      int num_cores = __kmp_topology->get_count(core_level);
      int upper_levels = 1;
      for (int level = 0; level < loc - 1; ++level)
        upper_levels *= __kmp_nesting_nth_level[level];
      // NOTE(review): the test uses the product of all upper levels, but the
      // new value divides only by level loc - 2; the two agree when loc == 2
      // -- confirm this is intended for deeper nestings.
      if (upper_levels * __kmp_nesting_nth_level[loc - 1] < num_cores)
        __kmp_nesting_nth_level[loc - 1] =
            num_cores / __kmp_nesting_nth_level[loc - 2];
    }
    __kmp_nesting_mode_nlevels = loc;
    __kmp_nested_nth.used = __kmp_nesting_mode_nlevels;
  } else { // no topology info available; provide a reasonable guesstimation
    if (__kmp_avail_proc >= 4) {
      __kmp_nesting_nth_level[0] = __kmp_avail_proc / 2;
      __kmp_nesting_nth_level[1] = 2;
      __kmp_nesting_mode_nlevels = 2;
    } else {
      __kmp_nesting_nth_level[0] = __kmp_avail_proc;
      __kmp_nesting_mode_nlevels = 1;
    }
    __kmp_nested_nth.used = __kmp_nesting_mode_nlevels;
  }
  // Publish the per-level counts as the nested num-threads list
  for (int i = 0; i < __kmp_nesting_mode_nlevels; ++i) {
    __kmp_nested_nth.nth[i] = __kmp_nesting_nth_level[i];
  }
  set__nproc(thread, __kmp_nesting_nth_level[0]);
  // Never report more levels than were requested with KMP_NESTING_MODE=N
  if (__kmp_nesting_mode > 1 && __kmp_nesting_mode_nlevels > __kmp_nesting_mode)
    __kmp_nesting_mode_nlevels = __kmp_nesting_mode;
  if (get__max_active_levels(thread) > 1) {
    // if max levels was set, set nesting mode levels to same
    __kmp_nesting_mode_nlevels = get__max_active_levels(thread);
  }
  if (__kmp_nesting_mode == 1) // turn on nesting for this case only
    set__max_active_levels(thread, __kmp_nesting_mode_nlevels);
}
9279 
// Empty symbols to export (see exports_so.txt) when feature is disabled
// Each group below is compiled only when the corresponding feature is off and
// provides do-nothing functions or FALSE-valued variables under the exported
// names.
extern "C" {
#if !KMP_STATS_ENABLED
// Statistics disabled: nothing to reset
void __kmp_reset_stats() {}
#endif
#if !USE_DEBUGGER
// Debugger support disabled
int __kmp_omp_debug_struct_info = FALSE;
int __kmp_debugging = FALSE;
#endif
#if !USE_ITT_BUILD || !USE_ITT_NOTIFY
// ITT support disabled: init/fini are no-ops
void __kmp_itt_fini_ittlib() {}
void __kmp_itt_init_ittlib() {}
#endif
}
9294 
9295 // end of file
9296