xref: /freebsd/contrib/llvm-project/openmp/runtime/src/kmp_runtime.cpp (revision da759cfa320d5076b075d15ff3f00ab3ba5634fd)
1 /*
2  * kmp_runtime.cpp -- KPTS runtime support library
3  */
4 
5 //===----------------------------------------------------------------------===//
6 //
7 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8 // See https://llvm.org/LICENSE.txt for license information.
9 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "kmp.h"
14 #include "kmp_affinity.h"
15 #include "kmp_atomic.h"
16 #include "kmp_environment.h"
17 #include "kmp_error.h"
18 #include "kmp_i18n.h"
19 #include "kmp_io.h"
20 #include "kmp_itt.h"
21 #include "kmp_settings.h"
22 #include "kmp_stats.h"
23 #include "kmp_str.h"
24 #include "kmp_wait_release.h"
25 #include "kmp_wrapper_getpid.h"
26 #include "kmp_dispatch.h"
27 #if KMP_USE_HIER_SCHED
28 #include "kmp_dispatch_hier.h"
29 #endif
30 
31 #if OMPT_SUPPORT
32 #include "ompt-specific.h"
33 #endif
34 
35 /* these are temporary issues to be dealt with */
36 #define KMP_USE_PRCTL 0
37 
38 #if KMP_OS_WINDOWS
39 #include <process.h>
40 #endif
41 
42 #include "tsan_annotations.h"
43 
44 #if defined(KMP_GOMP_COMPAT)
45 char const __kmp_version_alt_comp[] =
46     KMP_VERSION_PREFIX "alternative compiler support: yes";
47 #endif /* defined(KMP_GOMP_COMPAT) */
48 
49 char const __kmp_version_omp_api[] =
50     KMP_VERSION_PREFIX "API version: 5.0 (201611)";
51 
52 #ifdef KMP_DEBUG
53 char const __kmp_version_lock[] =
54     KMP_VERSION_PREFIX "lock type: run time selectable";
55 #endif /* KMP_DEBUG */
56 
57 #define KMP_MIN(x, y) ((x) < (y) ? (x) : (y))
58 
59 /* ------------------------------------------------------------------------ */
60 
61 #if KMP_USE_MONITOR
62 kmp_info_t __kmp_monitor;
63 #endif
64 
65 /* Forward declarations */
66 
67 void __kmp_cleanup(void);
68 
69 static void __kmp_initialize_info(kmp_info_t *, kmp_team_t *, int tid,
70                                   int gtid);
71 static void __kmp_initialize_team(kmp_team_t *team, int new_nproc,
72                                   kmp_internal_control_t *new_icvs,
73                                   ident_t *loc);
74 #if KMP_AFFINITY_SUPPORTED
75 static void __kmp_partition_places(kmp_team_t *team,
76                                    int update_master_only = 0);
77 #endif
78 static void __kmp_do_serial_initialize(void);
79 void __kmp_fork_barrier(int gtid, int tid);
80 void __kmp_join_barrier(int gtid);
81 void __kmp_setup_icv_copy(kmp_team_t *team, int new_nproc,
82                           kmp_internal_control_t *new_icvs, ident_t *loc);
83 
84 #ifdef USE_LOAD_BALANCE
85 static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc);
86 #endif
87 
88 static int __kmp_expand_threads(int nNeed);
89 #if KMP_OS_WINDOWS
90 static int __kmp_unregister_root_other_thread(int gtid);
91 #endif
92 static void __kmp_unregister_library(void); // called by __kmp_internal_end()
93 static void __kmp_reap_thread(kmp_info_t *thread, int is_root);
94 kmp_info_t *__kmp_thread_pool_insert_pt = NULL;
95 
96 /* Calculate the identifier of the current thread */
97 /* fast (and somewhat portable) way to get unique identifier of executing
98    thread. Returns KMP_GTID_DNE if we haven't been assigned a gtid. */
99 int __kmp_get_global_thread_id() {
100   int i;
101   kmp_info_t **other_threads;
102   size_t stack_data;
103   char *stack_addr;
104   size_t stack_size;
105   char *stack_base;
106 
107   KA_TRACE(
108       1000,
109       ("*** __kmp_get_global_thread_id: entering, nproc=%d  all_nproc=%d\n",
110        __kmp_nth, __kmp_all_nth));
111 
112   /* JPH - to handle the case where __kmpc_end(0) is called immediately prior to
113      a parallel region, made it return KMP_GTID_DNE to force serial_initialize
114      by caller. Had to handle KMP_GTID_DNE at all call-sites, or else guarantee
115      __kmp_init_gtid for this to work. */
116 
117   if (!TCR_4(__kmp_init_gtid))
118     return KMP_GTID_DNE;
119 
120 #ifdef KMP_TDATA_GTID
121   if (TCR_4(__kmp_gtid_mode) >= 3) {
122     KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using TDATA\n"));
123     return __kmp_gtid;
124   }
125 #endif
126   if (TCR_4(__kmp_gtid_mode) >= 2) {
127     KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using keyed TLS\n"));
128     return __kmp_gtid_get_specific();
129   }
130   KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using internal alg.\n"));
131 
132   stack_addr = (char *)&stack_data;
133   other_threads = __kmp_threads;
134 
135   /* ATT: The code below is a source of potential bugs due to unsynchronized
136      access to __kmp_threads array. For example:
137      1. Current thread loads other_threads[i] to thr and checks it, it is
138         non-NULL.
139      2. Current thread is suspended by OS.
140      3. Another thread unregisters and finishes (debug versions of free()
141         may fill memory with something like 0xEF).
142      4. Current thread is resumed.
143      5. Current thread reads junk from *thr.
144      TODO: Fix it.  --ln  */
145 
146   for (i = 0; i < __kmp_threads_capacity; i++) {
147 
148     kmp_info_t *thr = (kmp_info_t *)TCR_SYNC_PTR(other_threads[i]);
149     if (!thr)
150       continue;
151 
152     stack_size = (size_t)TCR_PTR(thr->th.th_info.ds.ds_stacksize);
153     stack_base = (char *)TCR_PTR(thr->th.th_info.ds.ds_stackbase);
154 
155     /* stack grows down -- search through all of the active threads */
156 
157     if (stack_addr <= stack_base) {
158       size_t stack_diff = stack_base - stack_addr;
159 
160       if (stack_diff <= stack_size) {
161         /* The only way we can be closer than the allocated */
162         /* stack size is if we are running on this thread. */
163         KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() == i);
164         return i;
165       }
166     }
167   }
168 
169   /* get specific to try and determine our gtid */
170   KA_TRACE(1000,
171            ("*** __kmp_get_global_thread_id: internal alg. failed to find "
172             "thread, using TLS\n"));
173   i = __kmp_gtid_get_specific();
174 
175   /*fprintf( stderr, "=== %d\n", i );  */ /* GROO */
176 
177   /* if we havn't been assigned a gtid, then return code */
178   if (i < 0)
179     return i;
180 
181   /* dynamically updated stack window for uber threads to avoid get_specific
182      call */
183   if (!TCR_4(other_threads[i]->th.th_info.ds.ds_stackgrow)) {
184     KMP_FATAL(StackOverflow, i);
185   }
186 
187   stack_base = (char *)other_threads[i]->th.th_info.ds.ds_stackbase;
188   if (stack_addr > stack_base) {
189     TCW_PTR(other_threads[i]->th.th_info.ds.ds_stackbase, stack_addr);
190     TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize,
191             other_threads[i]->th.th_info.ds.ds_stacksize + stack_addr -
192                 stack_base);
193   } else {
194     TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize,
195             stack_base - stack_addr);
196   }
197 
198   /* Reprint stack bounds for ubermaster since they have been refined */
199   if (__kmp_storage_map) {
200     char *stack_end = (char *)other_threads[i]->th.th_info.ds.ds_stackbase;
201     char *stack_beg = stack_end - other_threads[i]->th.th_info.ds.ds_stacksize;
202     __kmp_print_storage_map_gtid(i, stack_beg, stack_end,
203                                  other_threads[i]->th.th_info.ds.ds_stacksize,
204                                  "th_%d stack (refinement)", i);
205   }
206   return i;
207 }
208 
209 int __kmp_get_global_thread_id_reg() {
210   int gtid;
211 
212   if (!__kmp_init_serial) {
213     gtid = KMP_GTID_DNE;
214   } else
215 #ifdef KMP_TDATA_GTID
216       if (TCR_4(__kmp_gtid_mode) >= 3) {
217     KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using TDATA\n"));
218     gtid = __kmp_gtid;
219   } else
220 #endif
221       if (TCR_4(__kmp_gtid_mode) >= 2) {
222     KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using keyed TLS\n"));
223     gtid = __kmp_gtid_get_specific();
224   } else {
225     KA_TRACE(1000,
226              ("*** __kmp_get_global_thread_id_reg: using internal alg.\n"));
227     gtid = __kmp_get_global_thread_id();
228   }
229 
230   /* we must be a new uber master sibling thread */
231   if (gtid == KMP_GTID_DNE) {
232     KA_TRACE(10,
233              ("__kmp_get_global_thread_id_reg: Encountered new root thread. "
234               "Registering a new gtid.\n"));
235     __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
236     if (!__kmp_init_serial) {
237       __kmp_do_serial_initialize();
238       gtid = __kmp_gtid_get_specific();
239     } else {
240       gtid = __kmp_register_root(FALSE);
241     }
242     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
243     /*__kmp_printf( "+++ %d\n", gtid ); */ /* GROO */
244   }
245 
246   KMP_DEBUG_ASSERT(gtid >= 0);
247 
248   return gtid;
249 }
250 
251 /* caller must hold forkjoin_lock */
252 void __kmp_check_stack_overlap(kmp_info_t *th) {
253   int f;
254   char *stack_beg = NULL;
255   char *stack_end = NULL;
256   int gtid;
257 
258   KA_TRACE(10, ("__kmp_check_stack_overlap: called\n"));
259   if (__kmp_storage_map) {
260     stack_end = (char *)th->th.th_info.ds.ds_stackbase;
261     stack_beg = stack_end - th->th.th_info.ds.ds_stacksize;
262 
263     gtid = __kmp_gtid_from_thread(th);
264 
265     if (gtid == KMP_GTID_MONITOR) {
266       __kmp_print_storage_map_gtid(
267           gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize,
268           "th_%s stack (%s)", "mon",
269           (th->th.th_info.ds.ds_stackgrow) ? "initial" : "actual");
270     } else {
271       __kmp_print_storage_map_gtid(
272           gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize,
273           "th_%d stack (%s)", gtid,
274           (th->th.th_info.ds.ds_stackgrow) ? "initial" : "actual");
275     }
276   }
277 
278   /* No point in checking ubermaster threads since they use refinement and
279    * cannot overlap */
280   gtid = __kmp_gtid_from_thread(th);
281   if (__kmp_env_checks == TRUE && !KMP_UBER_GTID(gtid)) {
282     KA_TRACE(10,
283              ("__kmp_check_stack_overlap: performing extensive checking\n"));
284     if (stack_beg == NULL) {
285       stack_end = (char *)th->th.th_info.ds.ds_stackbase;
286       stack_beg = stack_end - th->th.th_info.ds.ds_stacksize;
287     }
288 
289     for (f = 0; f < __kmp_threads_capacity; f++) {
290       kmp_info_t *f_th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[f]);
291 
292       if (f_th && f_th != th) {
293         char *other_stack_end =
294             (char *)TCR_PTR(f_th->th.th_info.ds.ds_stackbase);
295         char *other_stack_beg =
296             other_stack_end - (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize);
297         if ((stack_beg > other_stack_beg && stack_beg < other_stack_end) ||
298             (stack_end > other_stack_beg && stack_end < other_stack_end)) {
299 
300           /* Print the other stack values before the abort */
301           if (__kmp_storage_map)
302             __kmp_print_storage_map_gtid(
303                 -1, other_stack_beg, other_stack_end,
304                 (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize),
305                 "th_%d stack (overlapped)", __kmp_gtid_from_thread(f_th));
306 
307           __kmp_fatal(KMP_MSG(StackOverlap), KMP_HNT(ChangeStackLimit),
308                       __kmp_msg_null);
309         }
310       }
311     }
312   }
313   KA_TRACE(10, ("__kmp_check_stack_overlap: returning\n"));
314 }
315 
316 /* ------------------------------------------------------------------------ */
317 
318 void __kmp_infinite_loop(void) {
319   static int done = FALSE;
320 
321   while (!done) {
322     KMP_YIELD(TRUE);
323   }
324 }
325 
326 #define MAX_MESSAGE 512
327 
328 void __kmp_print_storage_map_gtid(int gtid, void *p1, void *p2, size_t size,
329                                   char const *format, ...) {
330   char buffer[MAX_MESSAGE];
331   va_list ap;
332 
333   va_start(ap, format);
334   KMP_SNPRINTF(buffer, sizeof(buffer), "OMP storage map: %p %p%8lu %s\n", p1,
335                p2, (unsigned long)size, format);
336   __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock);
337   __kmp_vprintf(kmp_err, buffer, ap);
338 #if KMP_PRINT_DATA_PLACEMENT
339   int node;
340   if (gtid >= 0) {
341     if (p1 <= p2 && (char *)p2 - (char *)p1 == size) {
342       if (__kmp_storage_map_verbose) {
343         node = __kmp_get_host_node(p1);
344         if (node < 0) /* doesn't work, so don't try this next time */
345           __kmp_storage_map_verbose = FALSE;
346         else {
347           char *last;
348           int lastNode;
349           int localProc = __kmp_get_cpu_from_gtid(gtid);
350 
351           const int page_size = KMP_GET_PAGE_SIZE();
352 
353           p1 = (void *)((size_t)p1 & ~((size_t)page_size - 1));
354           p2 = (void *)(((size_t)p2 - 1) & ~((size_t)page_size - 1));
355           if (localProc >= 0)
356             __kmp_printf_no_lock("  GTID %d localNode %d\n", gtid,
357                                  localProc >> 1);
358           else
359             __kmp_printf_no_lock("  GTID %d\n", gtid);
360 #if KMP_USE_PRCTL
361           /* The more elaborate format is disabled for now because of the prctl
362            * hanging bug. */
363           do {
364             last = p1;
365             lastNode = node;
366             /* This loop collates adjacent pages with the same host node. */
367             do {
368               (char *)p1 += page_size;
369             } while (p1 <= p2 && (node = __kmp_get_host_node(p1)) == lastNode);
370             __kmp_printf_no_lock("    %p-%p memNode %d\n", last, (char *)p1 - 1,
371                                  lastNode);
372           } while (p1 <= p2);
373 #else
374           __kmp_printf_no_lock("    %p-%p memNode %d\n", p1,
375                                (char *)p1 + (page_size - 1),
376                                __kmp_get_host_node(p1));
377           if (p1 < p2) {
378             __kmp_printf_no_lock("    %p-%p memNode %d\n", p2,
379                                  (char *)p2 + (page_size - 1),
380                                  __kmp_get_host_node(p2));
381           }
382 #endif
383         }
384       }
385     } else
386       __kmp_printf_no_lock("  %s\n", KMP_I18N_STR(StorageMapWarning));
387   }
388 #endif /* KMP_PRINT_DATA_PLACEMENT */
389   __kmp_release_bootstrap_lock(&__kmp_stdio_lock);
390 }
391 
392 void __kmp_warn(char const *format, ...) {
393   char buffer[MAX_MESSAGE];
394   va_list ap;
395 
396   if (__kmp_generate_warnings == kmp_warnings_off) {
397     return;
398   }
399 
400   va_start(ap, format);
401 
402   KMP_SNPRINTF(buffer, sizeof(buffer), "OMP warning: %s\n", format);
403   __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock);
404   __kmp_vprintf(kmp_err, buffer, ap);
405   __kmp_release_bootstrap_lock(&__kmp_stdio_lock);
406 
407   va_end(ap);
408 }
409 
410 void __kmp_abort_process() {
411   // Later threads may stall here, but that's ok because abort() will kill them.
412   __kmp_acquire_bootstrap_lock(&__kmp_exit_lock);
413 
414   if (__kmp_debug_buf) {
415     __kmp_dump_debug_buffer();
416   }
417 
418   if (KMP_OS_WINDOWS) {
419     // Let other threads know of abnormal termination and prevent deadlock
420     // if abort happened during library initialization or shutdown
421     __kmp_global.g.g_abort = SIGABRT;
422 
423     /* On Windows* OS by default abort() causes pop-up error box, which stalls
424        nightly testing. Unfortunately, we cannot reliably suppress pop-up error
425        boxes. _set_abort_behavior() works well, but this function is not
426        available in VS7 (this is not problem for DLL, but it is a problem for
427        static OpenMP RTL). SetErrorMode (and so, timelimit utility) does not
428        help, at least in some versions of MS C RTL.
429 
430        It seems following sequence is the only way to simulate abort() and
431        avoid pop-up error box. */
432     raise(SIGABRT);
433     _exit(3); // Just in case, if signal ignored, exit anyway.
434   } else {
435     abort();
436   }
437 
438   __kmp_infinite_loop();
439   __kmp_release_bootstrap_lock(&__kmp_exit_lock);
440 
441 } // __kmp_abort_process
442 
443 void __kmp_abort_thread(void) {
444   // TODO: Eliminate g_abort global variable and this function.
445   // In case of abort just call abort(), it will kill all the threads.
446   __kmp_infinite_loop();
447 } // __kmp_abort_thread
448 
449 /* Print out the storage map for the major kmp_info_t thread data structures
450    that are allocated together. */
451 
452 static void __kmp_print_thread_storage_map(kmp_info_t *thr, int gtid) {
453   __kmp_print_storage_map_gtid(gtid, thr, thr + 1, sizeof(kmp_info_t), "th_%d",
454                                gtid);
455 
456   __kmp_print_storage_map_gtid(gtid, &thr->th.th_info, &thr->th.th_team,
457                                sizeof(kmp_desc_t), "th_%d.th_info", gtid);
458 
459   __kmp_print_storage_map_gtid(gtid, &thr->th.th_local, &thr->th.th_pri_head,
460                                sizeof(kmp_local_t), "th_%d.th_local", gtid);
461 
462   __kmp_print_storage_map_gtid(
463       gtid, &thr->th.th_bar[0], &thr->th.th_bar[bs_last_barrier],
464       sizeof(kmp_balign_t) * bs_last_barrier, "th_%d.th_bar", gtid);
465 
466   __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_plain_barrier],
467                                &thr->th.th_bar[bs_plain_barrier + 1],
468                                sizeof(kmp_balign_t), "th_%d.th_bar[plain]",
469                                gtid);
470 
471   __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_forkjoin_barrier],
472                                &thr->th.th_bar[bs_forkjoin_barrier + 1],
473                                sizeof(kmp_balign_t), "th_%d.th_bar[forkjoin]",
474                                gtid);
475 
476 #if KMP_FAST_REDUCTION_BARRIER
477   __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_reduction_barrier],
478                                &thr->th.th_bar[bs_reduction_barrier + 1],
479                                sizeof(kmp_balign_t), "th_%d.th_bar[reduction]",
480                                gtid);
481 #endif // KMP_FAST_REDUCTION_BARRIER
482 }
483 
484 /* Print out the storage map for the major kmp_team_t team data structures
485    that are allocated together. */
486 
487 static void __kmp_print_team_storage_map(const char *header, kmp_team_t *team,
488                                          int team_id, int num_thr) {
489   int num_disp_buff = team->t.t_max_nproc > 1 ? __kmp_dispatch_num_buffers : 2;
490   __kmp_print_storage_map_gtid(-1, team, team + 1, sizeof(kmp_team_t), "%s_%d",
491                                header, team_id);
492 
493   __kmp_print_storage_map_gtid(-1, &team->t.t_bar[0],
494                                &team->t.t_bar[bs_last_barrier],
495                                sizeof(kmp_balign_team_t) * bs_last_barrier,
496                                "%s_%d.t_bar", header, team_id);
497 
498   __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_plain_barrier],
499                                &team->t.t_bar[bs_plain_barrier + 1],
500                                sizeof(kmp_balign_team_t), "%s_%d.t_bar[plain]",
501                                header, team_id);
502 
503   __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_forkjoin_barrier],
504                                &team->t.t_bar[bs_forkjoin_barrier + 1],
505                                sizeof(kmp_balign_team_t),
506                                "%s_%d.t_bar[forkjoin]", header, team_id);
507 
508 #if KMP_FAST_REDUCTION_BARRIER
509   __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_reduction_barrier],
510                                &team->t.t_bar[bs_reduction_barrier + 1],
511                                sizeof(kmp_balign_team_t),
512                                "%s_%d.t_bar[reduction]", header, team_id);
513 #endif // KMP_FAST_REDUCTION_BARRIER
514 
515   __kmp_print_storage_map_gtid(
516       -1, &team->t.t_dispatch[0], &team->t.t_dispatch[num_thr],
517       sizeof(kmp_disp_t) * num_thr, "%s_%d.t_dispatch", header, team_id);
518 
519   __kmp_print_storage_map_gtid(
520       -1, &team->t.t_threads[0], &team->t.t_threads[num_thr],
521       sizeof(kmp_info_t *) * num_thr, "%s_%d.t_threads", header, team_id);
522 
523   __kmp_print_storage_map_gtid(-1, &team->t.t_disp_buffer[0],
524                                &team->t.t_disp_buffer[num_disp_buff],
525                                sizeof(dispatch_shared_info_t) * num_disp_buff,
526                                "%s_%d.t_disp_buffer", header, team_id);
527 }
528 
529 static void __kmp_init_allocator() { __kmp_init_memkind(); }
530 static void __kmp_fini_allocator() { __kmp_fini_memkind(); }
531 
532 /* ------------------------------------------------------------------------ */
533 
534 #if KMP_DYNAMIC_LIB
535 #if KMP_OS_WINDOWS
536 
537 static void __kmp_reset_lock(kmp_bootstrap_lock_t *lck) {
538   // TODO: Change to __kmp_break_bootstrap_lock().
539   __kmp_init_bootstrap_lock(lck); // make the lock released
540 }
541 
542 static void __kmp_reset_locks_on_process_detach(int gtid_req) {
543   int i;
544   int thread_count;
545 
546   // PROCESS_DETACH is expected to be called by a thread that executes
547   // ProcessExit() or FreeLibrary(). OS terminates other threads (except the one
548   // calling ProcessExit or FreeLibrary). So, it might be safe to access the
549   // __kmp_threads[] without taking the forkjoin_lock. However, in fact, some
550   // threads can be still alive here, although being about to be terminated. The
551   // threads in the array with ds_thread==0 are most suspicious. Actually, it
552   // can be not safe to access the __kmp_threads[].
553 
554   // TODO: does it make sense to check __kmp_roots[] ?
555 
556   // Let's check that there are no other alive threads registered with the OMP
557   // lib.
558   while (1) {
559     thread_count = 0;
560     for (i = 0; i < __kmp_threads_capacity; ++i) {
561       if (!__kmp_threads)
562         continue;
563       kmp_info_t *th = __kmp_threads[i];
564       if (th == NULL)
565         continue;
566       int gtid = th->th.th_info.ds.ds_gtid;
567       if (gtid == gtid_req)
568         continue;
569       if (gtid < 0)
570         continue;
571       DWORD exit_val;
572       int alive = __kmp_is_thread_alive(th, &exit_val);
573       if (alive) {
574         ++thread_count;
575       }
576     }
577     if (thread_count == 0)
578       break; // success
579   }
580 
581   // Assume that I'm alone. Now it might be safe to check and reset locks.
582   // __kmp_forkjoin_lock and __kmp_stdio_lock are expected to be reset.
583   __kmp_reset_lock(&__kmp_forkjoin_lock);
584 #ifdef KMP_DEBUG
585   __kmp_reset_lock(&__kmp_stdio_lock);
586 #endif // KMP_DEBUG
587 }
588 
589 BOOL WINAPI DllMain(HINSTANCE hInstDLL, DWORD fdwReason, LPVOID lpReserved) {
590   //__kmp_acquire_bootstrap_lock( &__kmp_initz_lock );
591 
592   switch (fdwReason) {
593 
594   case DLL_PROCESS_ATTACH:
595     KA_TRACE(10, ("DllMain: PROCESS_ATTACH\n"));
596 
597     return TRUE;
598 
599   case DLL_PROCESS_DETACH:
600     KA_TRACE(10, ("DllMain: PROCESS_DETACH T#%d\n", __kmp_gtid_get_specific()));
601 
602     if (lpReserved != NULL) {
603       // lpReserved is used for telling the difference:
604       //   lpReserved == NULL when FreeLibrary() was called,
605       //   lpReserved != NULL when the process terminates.
606       // When FreeLibrary() is called, worker threads remain alive. So they will
607       // release the forkjoin lock by themselves. When the process terminates,
608       // worker threads disappear triggering the problem of unreleased forkjoin
609       // lock as described below.
610 
611       // A worker thread can take the forkjoin lock. The problem comes up if
612       // that worker thread becomes dead before it releases the forkjoin lock.
613       // The forkjoin lock remains taken, while the thread executing
614       // DllMain()->PROCESS_DETACH->__kmp_internal_end_library() below will try
615       // to take the forkjoin lock and will always fail, so that the application
616       // will never finish [normally]. This scenario is possible if
617       // __kmpc_end() has not been executed. It looks like it's not a corner
618       // case, but common cases:
619       // - the main function was compiled by an alternative compiler;
620       // - the main function was compiled by icl but without /Qopenmp
621       //   (application with plugins);
622       // - application terminates by calling C exit(), Fortran CALL EXIT() or
623       //   Fortran STOP.
624       // - alive foreign thread prevented __kmpc_end from doing cleanup.
625       //
626       // This is a hack to work around the problem.
627       // TODO: !!! figure out something better.
628       __kmp_reset_locks_on_process_detach(__kmp_gtid_get_specific());
629     }
630 
631     __kmp_internal_end_library(__kmp_gtid_get_specific());
632 
633     return TRUE;
634 
635   case DLL_THREAD_ATTACH:
636     KA_TRACE(10, ("DllMain: THREAD_ATTACH\n"));
637 
638     /* if we want to register new siblings all the time here call
639      * __kmp_get_gtid(); */
640     return TRUE;
641 
642   case DLL_THREAD_DETACH:
643     KA_TRACE(10, ("DllMain: THREAD_DETACH T#%d\n", __kmp_gtid_get_specific()));
644 
645     __kmp_internal_end_thread(__kmp_gtid_get_specific());
646     return TRUE;
647   }
648 
649   return TRUE;
650 }
651 
652 #endif /* KMP_OS_WINDOWS */
653 #endif /* KMP_DYNAMIC_LIB */
654 
655 /* __kmp_parallel_deo -- Wait until it's our turn. */
656 void __kmp_parallel_deo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
657   int gtid = *gtid_ref;
658 #ifdef BUILD_PARALLEL_ORDERED
659   kmp_team_t *team = __kmp_team_from_gtid(gtid);
660 #endif /* BUILD_PARALLEL_ORDERED */
661 
662   if (__kmp_env_consistency_check) {
663     if (__kmp_threads[gtid]->th.th_root->r.r_active)
664 #if KMP_USE_DYNAMIC_LOCK
665       __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL, 0);
666 #else
667       __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL);
668 #endif
669   }
670 #ifdef BUILD_PARALLEL_ORDERED
671   if (!team->t.t_serialized) {
672     KMP_MB();
673     KMP_WAIT(&team->t.t_ordered.dt.t_value, __kmp_tid_from_gtid(gtid), KMP_EQ,
674              NULL);
675     KMP_MB();
676   }
677 #endif /* BUILD_PARALLEL_ORDERED */
678 }
679 
680 /* __kmp_parallel_dxo -- Signal the next task. */
681 void __kmp_parallel_dxo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
682   int gtid = *gtid_ref;
683 #ifdef BUILD_PARALLEL_ORDERED
684   int tid = __kmp_tid_from_gtid(gtid);
685   kmp_team_t *team = __kmp_team_from_gtid(gtid);
686 #endif /* BUILD_PARALLEL_ORDERED */
687 
688   if (__kmp_env_consistency_check) {
689     if (__kmp_threads[gtid]->th.th_root->r.r_active)
690       __kmp_pop_sync(gtid, ct_ordered_in_parallel, loc_ref);
691   }
692 #ifdef BUILD_PARALLEL_ORDERED
693   if (!team->t.t_serialized) {
694     KMP_MB(); /* Flush all pending memory write invalidates.  */
695 
696     /* use the tid of the next thread in this team */
697     /* TODO replace with general release procedure */
698     team->t.t_ordered.dt.t_value = ((tid + 1) % team->t.t_nproc);
699 
700     KMP_MB(); /* Flush all pending memory write invalidates.  */
701   }
702 #endif /* BUILD_PARALLEL_ORDERED */
703 }
704 
705 /* ------------------------------------------------------------------------ */
706 /* The BARRIER for a SINGLE process section is always explicit   */
707 
708 int __kmp_enter_single(int gtid, ident_t *id_ref, int push_ws) {
709   int status;
710   kmp_info_t *th;
711   kmp_team_t *team;
712 
713   if (!TCR_4(__kmp_init_parallel))
714     __kmp_parallel_initialize();
715   __kmp_resume_if_soft_paused();
716 
717   th = __kmp_threads[gtid];
718   team = th->th.th_team;
719   status = 0;
720 
721   th->th.th_ident = id_ref;
722 
723   if (team->t.t_serialized) {
724     status = 1;
725   } else {
726     kmp_int32 old_this = th->th.th_local.this_construct;
727 
728     ++th->th.th_local.this_construct;
729     /* try to set team count to thread count--success means thread got the
730        single block */
731     /* TODO: Should this be acquire or release? */
732     if (team->t.t_construct == old_this) {
733       status = __kmp_atomic_compare_store_acq(&team->t.t_construct, old_this,
734                                               th->th.th_local.this_construct);
735     }
736 #if USE_ITT_BUILD
737     if (__itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 &&
738         KMP_MASTER_GTID(gtid) && th->th.th_teams_microtask == NULL &&
739         team->t.t_active_level ==
740             1) { // Only report metadata by master of active team at level 1
741       __kmp_itt_metadata_single(id_ref);
742     }
743 #endif /* USE_ITT_BUILD */
744   }
745 
746   if (__kmp_env_consistency_check) {
747     if (status && push_ws) {
748       __kmp_push_workshare(gtid, ct_psingle, id_ref);
749     } else {
750       __kmp_check_workshare(gtid, ct_psingle, id_ref);
751     }
752   }
753 #if USE_ITT_BUILD
754   if (status) {
755     __kmp_itt_single_start(gtid);
756   }
757 #endif /* USE_ITT_BUILD */
758   return status;
759 }
760 
761 void __kmp_exit_single(int gtid) {
762 #if USE_ITT_BUILD
763   __kmp_itt_single_end(gtid);
764 #endif /* USE_ITT_BUILD */
765   if (__kmp_env_consistency_check)
766     __kmp_pop_workshare(gtid, ct_psingle, NULL);
767 }
768 
769 /* determine if we can go parallel or must use a serialized parallel region and
770  * how many threads we can use
771  * set_nproc is the number of threads requested for the team
772  * returns 0 if we should serialize or only use one thread,
773  * otherwise the number of threads to use
774  * The forkjoin lock is held by the caller. */
775 static int __kmp_reserve_threads(kmp_root_t *root, kmp_team_t *parent_team,
776                                  int master_tid, int set_nthreads,
777                                  int enter_teams) {
778   int capacity;
779   int new_nthreads;
780   KMP_DEBUG_ASSERT(__kmp_init_serial);
781   KMP_DEBUG_ASSERT(root && parent_team);
782   kmp_info_t *this_thr = parent_team->t.t_threads[master_tid];
783 
784   // If dyn-var is set, dynamically adjust the number of desired threads,
785   // according to the method specified by dynamic_mode.
786   new_nthreads = set_nthreads;
787   if (!get__dynamic_2(parent_team, master_tid)) {
788     ;
789   }
790 #ifdef USE_LOAD_BALANCE
791   else if (__kmp_global.g.g_dynamic_mode == dynamic_load_balance) {
792     new_nthreads = __kmp_load_balance_nproc(root, set_nthreads);
793     if (new_nthreads == 1) {
794       KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced "
795                     "reservation to 1 thread\n",
796                     master_tid));
797       return 1;
798     }
799     if (new_nthreads < set_nthreads) {
800       KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced "
801                     "reservation to %d threads\n",
802                     master_tid, new_nthreads));
803     }
804   }
805 #endif /* USE_LOAD_BALANCE */
806   else if (__kmp_global.g.g_dynamic_mode == dynamic_thread_limit) {
807     new_nthreads = __kmp_avail_proc - __kmp_nth +
808                    (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
809     if (new_nthreads <= 1) {
810       KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced "
811                     "reservation to 1 thread\n",
812                     master_tid));
813       return 1;
814     }
815     if (new_nthreads < set_nthreads) {
816       KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced "
817                     "reservation to %d threads\n",
818                     master_tid, new_nthreads));
819     } else {
820       new_nthreads = set_nthreads;
821     }
822   } else if (__kmp_global.g.g_dynamic_mode == dynamic_random) {
823     if (set_nthreads > 2) {
824       new_nthreads = __kmp_get_random(parent_team->t.t_threads[master_tid]);
825       new_nthreads = (new_nthreads % set_nthreads) + 1;
826       if (new_nthreads == 1) {
827         KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced "
828                       "reservation to 1 thread\n",
829                       master_tid));
830         return 1;
831       }
832       if (new_nthreads < set_nthreads) {
833         KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced "
834                       "reservation to %d threads\n",
835                       master_tid, new_nthreads));
836       }
837     }
838   } else {
839     KMP_ASSERT(0);
840   }
841 
842   // Respect KMP_ALL_THREADS/KMP_DEVICE_THREAD_LIMIT.
843   if (__kmp_nth + new_nthreads -
844           (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
845       __kmp_max_nth) {
846     int tl_nthreads = __kmp_max_nth - __kmp_nth +
847                       (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
848     if (tl_nthreads <= 0) {
849       tl_nthreads = 1;
850     }
851 
852     // If dyn-var is false, emit a 1-time warning.
853     if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
854       __kmp_reserve_warn = 1;
855       __kmp_msg(kmp_ms_warning,
856                 KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads),
857                 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
858     }
859     if (tl_nthreads == 1) {
860       KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT "
861                     "reduced reservation to 1 thread\n",
862                     master_tid));
863       return 1;
864     }
865     KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT reduced "
866                   "reservation to %d threads\n",
867                   master_tid, tl_nthreads));
868     new_nthreads = tl_nthreads;
869   }
870 
871   // Respect OMP_THREAD_LIMIT
872   int cg_nthreads = this_thr->th.th_cg_roots->cg_nthreads;
873   int max_cg_threads = this_thr->th.th_cg_roots->cg_thread_limit;
874   if (cg_nthreads + new_nthreads -
875           (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
876       max_cg_threads) {
877     int tl_nthreads = max_cg_threads - cg_nthreads +
878                       (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
879     if (tl_nthreads <= 0) {
880       tl_nthreads = 1;
881     }
882 
883     // If dyn-var is false, emit a 1-time warning.
884     if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
885       __kmp_reserve_warn = 1;
886       __kmp_msg(kmp_ms_warning,
887                 KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads),
888                 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
889     }
890     if (tl_nthreads == 1) {
891       KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT "
892                     "reduced reservation to 1 thread\n",
893                     master_tid));
894       return 1;
895     }
896     KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT reduced "
897                   "reservation to %d threads\n",
898                   master_tid, tl_nthreads));
899     new_nthreads = tl_nthreads;
900   }
901 
902   // Check if the threads array is large enough, or needs expanding.
903   // See comment in __kmp_register_root() about the adjustment if
904   // __kmp_threads[0] == NULL.
905   capacity = __kmp_threads_capacity;
906   if (TCR_PTR(__kmp_threads[0]) == NULL) {
907     --capacity;
908   }
909   if (__kmp_nth + new_nthreads -
910           (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
911       capacity) {
912     // Expand the threads array.
913     int slotsRequired = __kmp_nth + new_nthreads -
914                         (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) -
915                         capacity;
916     int slotsAdded = __kmp_expand_threads(slotsRequired);
917     if (slotsAdded < slotsRequired) {
918       // The threads array was not expanded enough.
919       new_nthreads -= (slotsRequired - slotsAdded);
920       KMP_ASSERT(new_nthreads >= 1);
921 
922       // If dyn-var is false, emit a 1-time warning.
923       if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
924         __kmp_reserve_warn = 1;
925         if (__kmp_tp_cached) {
926           __kmp_msg(kmp_ms_warning,
927                     KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads),
928                     KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity),
929                     KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null);
930         } else {
931           __kmp_msg(kmp_ms_warning,
932                     KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads),
933                     KMP_HNT(SystemLimitOnThreads), __kmp_msg_null);
934         }
935       }
936     }
937   }
938 
939 #ifdef KMP_DEBUG
940   if (new_nthreads == 1) {
941     KC_TRACE(10,
942              ("__kmp_reserve_threads: T#%d serializing team after reclaiming "
943               "dead roots and rechecking; requested %d threads\n",
944               __kmp_get_gtid(), set_nthreads));
945   } else {
946     KC_TRACE(10, ("__kmp_reserve_threads: T#%d allocating %d threads; requested"
947                   " %d threads\n",
948                   __kmp_get_gtid(), new_nthreads, set_nthreads));
949   }
950 #endif // KMP_DEBUG
951   return new_nthreads;
952 }
953 
954 /* Allocate threads from the thread pool and assign them to the new team. We are
955    assured that there are enough threads available, because we checked on that
956    earlier within critical section forkjoin */
957 static void __kmp_fork_team_threads(kmp_root_t *root, kmp_team_t *team,
958                                     kmp_info_t *master_th, int master_gtid) {
959   int i;
960   int use_hot_team;
961 
962   KA_TRACE(10, ("__kmp_fork_team_threads: new_nprocs = %d\n", team->t.t_nproc));
963   KMP_DEBUG_ASSERT(master_gtid == __kmp_get_gtid());
964   KMP_MB();
965 
966   /* first, let's setup the master thread */
967   master_th->th.th_info.ds.ds_tid = 0;
968   master_th->th.th_team = team;
969   master_th->th.th_team_nproc = team->t.t_nproc;
970   master_th->th.th_team_master = master_th;
971   master_th->th.th_team_serialized = FALSE;
972   master_th->th.th_dispatch = &team->t.t_dispatch[0];
973 
974 /* make sure we are not the optimized hot team */
975 #if KMP_NESTED_HOT_TEAMS
976   use_hot_team = 0;
977   kmp_hot_team_ptr_t *hot_teams = master_th->th.th_hot_teams;
978   if (hot_teams) { // hot teams array is not allocated if
979     // KMP_HOT_TEAMS_MAX_LEVEL=0
980     int level = team->t.t_active_level - 1; // index in array of hot teams
981     if (master_th->th.th_teams_microtask) { // are we inside the teams?
982       if (master_th->th.th_teams_size.nteams > 1) {
983         ++level; // level was not increased in teams construct for
984         // team_of_masters
985       }
986       if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
987           master_th->th.th_teams_level == team->t.t_level) {
988         ++level; // level was not increased in teams construct for
989         // team_of_workers before the parallel
990       } // team->t.t_level will be increased inside parallel
991     }
992     if (level < __kmp_hot_teams_max_level) {
993       if (hot_teams[level].hot_team) {
994         // hot team has already been allocated for given level
995         KMP_DEBUG_ASSERT(hot_teams[level].hot_team == team);
996         use_hot_team = 1; // the team is ready to use
997       } else {
998         use_hot_team = 0; // AC: threads are not allocated yet
999         hot_teams[level].hot_team = team; // remember new hot team
1000         hot_teams[level].hot_team_nth = team->t.t_nproc;
1001       }
1002     } else {
1003       use_hot_team = 0;
1004     }
1005   }
1006 #else
1007   use_hot_team = team == root->r.r_hot_team;
1008 #endif
1009   if (!use_hot_team) {
1010 
1011     /* install the master thread */
1012     team->t.t_threads[0] = master_th;
1013     __kmp_initialize_info(master_th, team, 0, master_gtid);
1014 
1015     /* now, install the worker threads */
1016     for (i = 1; i < team->t.t_nproc; i++) {
1017 
1018       /* fork or reallocate a new thread and install it in team */
1019       kmp_info_t *thr = __kmp_allocate_thread(root, team, i);
1020       team->t.t_threads[i] = thr;
1021       KMP_DEBUG_ASSERT(thr);
1022       KMP_DEBUG_ASSERT(thr->th.th_team == team);
1023       /* align team and thread arrived states */
1024       KA_TRACE(20, ("__kmp_fork_team_threads: T#%d(%d:%d) init arrived "
1025                     "T#%d(%d:%d) join =%llu, plain=%llu\n",
1026                     __kmp_gtid_from_tid(0, team), team->t.t_id, 0,
1027                     __kmp_gtid_from_tid(i, team), team->t.t_id, i,
1028                     team->t.t_bar[bs_forkjoin_barrier].b_arrived,
1029                     team->t.t_bar[bs_plain_barrier].b_arrived));
1030       thr->th.th_teams_microtask = master_th->th.th_teams_microtask;
1031       thr->th.th_teams_level = master_th->th.th_teams_level;
1032       thr->th.th_teams_size = master_th->th.th_teams_size;
1033       { // Initialize threads' barrier data.
1034         int b;
1035         kmp_balign_t *balign = team->t.t_threads[i]->th.th_bar;
1036         for (b = 0; b < bs_last_barrier; ++b) {
1037           balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
1038           KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
1039 #if USE_DEBUGGER
1040           balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
1041 #endif
1042         }
1043       }
1044     }
1045 
1046 #if KMP_AFFINITY_SUPPORTED
1047     __kmp_partition_places(team);
1048 #endif
1049   }
1050 
1051   if (__kmp_display_affinity && team->t.t_display_affinity != 1) {
1052     for (i = 0; i < team->t.t_nproc; i++) {
1053       kmp_info_t *thr = team->t.t_threads[i];
1054       if (thr->th.th_prev_num_threads != team->t.t_nproc ||
1055           thr->th.th_prev_level != team->t.t_level) {
1056         team->t.t_display_affinity = 1;
1057         break;
1058       }
1059     }
1060   }
1061 
1062   KMP_MB();
1063 }
1064 
1065 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
1066 // Propagate any changes to the floating point control registers out to the team
1067 // We try to avoid unnecessary writes to the relevant cache line in the team
1068 // structure, so we don't make changes unless they are needed.
1069 inline static void propagateFPControl(kmp_team_t *team) {
1070   if (__kmp_inherit_fp_control) {
1071     kmp_int16 x87_fpu_control_word;
1072     kmp_uint32 mxcsr;
1073 
1074     // Get master values of FPU control flags (both X87 and vector)
1075     __kmp_store_x87_fpu_control_word(&x87_fpu_control_word);
1076     __kmp_store_mxcsr(&mxcsr);
1077     mxcsr &= KMP_X86_MXCSR_MASK;
1078 
1079     // There is no point looking at t_fp_control_saved here.
1080     // If it is TRUE, we still have to update the values if they are different
1081     // from those we now have. If it is FALSE we didn't save anything yet, but
1082     // our objective is the same. We have to ensure that the values in the team
1083     // are the same as those we have.
1084     // So, this code achieves what we need whether or not t_fp_control_saved is
1085     // true. By checking whether the value needs updating we avoid unnecessary
1086     // writes that would put the cache-line into a written state, causing all
1087     // threads in the team to have to read it again.
1088     KMP_CHECK_UPDATE(team->t.t_x87_fpu_control_word, x87_fpu_control_word);
1089     KMP_CHECK_UPDATE(team->t.t_mxcsr, mxcsr);
1090     // Although we don't use this value, other code in the runtime wants to know
1091     // whether it should restore them. So we must ensure it is correct.
1092     KMP_CHECK_UPDATE(team->t.t_fp_control_saved, TRUE);
1093   } else {
1094     // Similarly here. Don't write to this cache-line in the team structure
1095     // unless we have to.
1096     KMP_CHECK_UPDATE(team->t.t_fp_control_saved, FALSE);
1097   }
1098 }
1099 
1100 // Do the opposite, setting the hardware registers to the updated values from
1101 // the team.
1102 inline static void updateHWFPControl(kmp_team_t *team) {
1103   if (__kmp_inherit_fp_control && team->t.t_fp_control_saved) {
1104     // Only reset the fp control regs if they have been changed in the team.
1105     // the parallel region that we are exiting.
1106     kmp_int16 x87_fpu_control_word;
1107     kmp_uint32 mxcsr;
1108     __kmp_store_x87_fpu_control_word(&x87_fpu_control_word);
1109     __kmp_store_mxcsr(&mxcsr);
1110     mxcsr &= KMP_X86_MXCSR_MASK;
1111 
1112     if (team->t.t_x87_fpu_control_word != x87_fpu_control_word) {
1113       __kmp_clear_x87_fpu_status_word();
1114       __kmp_load_x87_fpu_control_word(&team->t.t_x87_fpu_control_word);
1115     }
1116 
1117     if (team->t.t_mxcsr != mxcsr) {
1118       __kmp_load_mxcsr(&team->t.t_mxcsr);
1119     }
1120   }
1121 }
1122 #else
1123 #define propagateFPControl(x) ((void)0)
1124 #define updateHWFPControl(x) ((void)0)
1125 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
1126 
1127 static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team,
1128                                      int realloc); // forward declaration
1129 
1130 /* Run a parallel region that has been serialized, so runs only in a team of the
1131    single master thread. */
1132 void __kmp_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
1133   kmp_info_t *this_thr;
1134   kmp_team_t *serial_team;
1135 
1136   KC_TRACE(10, ("__kmpc_serialized_parallel: called by T#%d\n", global_tid));
1137 
1138   /* Skip all this code for autopar serialized loops since it results in
1139      unacceptable overhead */
1140   if (loc != NULL && (loc->flags & KMP_IDENT_AUTOPAR))
1141     return;
1142 
1143   if (!TCR_4(__kmp_init_parallel))
1144     __kmp_parallel_initialize();
1145   __kmp_resume_if_soft_paused();
1146 
1147   this_thr = __kmp_threads[global_tid];
1148   serial_team = this_thr->th.th_serial_team;
1149 
1150   /* utilize the serialized team held by this thread */
1151   KMP_DEBUG_ASSERT(serial_team);
1152   KMP_MB();
1153 
1154   if (__kmp_tasking_mode != tskm_immediate_exec) {
1155     KMP_DEBUG_ASSERT(
1156         this_thr->th.th_task_team ==
1157         this_thr->th.th_team->t.t_task_team[this_thr->th.th_task_state]);
1158     KMP_DEBUG_ASSERT(serial_team->t.t_task_team[this_thr->th.th_task_state] ==
1159                      NULL);
1160     KA_TRACE(20, ("__kmpc_serialized_parallel: T#%d pushing task_team %p / "
1161                   "team %p, new task_team = NULL\n",
1162                   global_tid, this_thr->th.th_task_team, this_thr->th.th_team));
1163     this_thr->th.th_task_team = NULL;
1164   }
1165 
1166   kmp_proc_bind_t proc_bind = this_thr->th.th_set_proc_bind;
1167   if (this_thr->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
1168     proc_bind = proc_bind_false;
1169   } else if (proc_bind == proc_bind_default) {
1170     // No proc_bind clause was specified, so use the current value
1171     // of proc-bind-var for this parallel region.
1172     proc_bind = this_thr->th.th_current_task->td_icvs.proc_bind;
1173   }
1174   // Reset for next parallel region
1175   this_thr->th.th_set_proc_bind = proc_bind_default;
1176 
1177 #if OMPT_SUPPORT
1178   ompt_data_t ompt_parallel_data = ompt_data_none;
1179   ompt_data_t *implicit_task_data;
1180   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(global_tid);
1181   if (ompt_enabled.enabled &&
1182       this_thr->th.ompt_thread_info.state != ompt_state_overhead) {
1183 
1184     ompt_task_info_t *parent_task_info;
1185     parent_task_info = OMPT_CUR_TASK_INFO(this_thr);
1186 
1187     parent_task_info->frame.enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1188     if (ompt_enabled.ompt_callback_parallel_begin) {
1189       int team_size = 1;
1190 
1191       ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)(
1192           &(parent_task_info->task_data), &(parent_task_info->frame),
1193           &ompt_parallel_data, team_size,
1194           ompt_parallel_invoker_program | ompt_parallel_team, codeptr);
1195     }
1196   }
1197 #endif // OMPT_SUPPORT
1198 
1199   if (this_thr->th.th_team != serial_team) {
1200     // Nested level will be an index in the nested nthreads array
1201     int level = this_thr->th.th_team->t.t_level;
1202 
1203     if (serial_team->t.t_serialized) {
1204       /* this serial team was already used
1205          TODO increase performance by making this locks more specific */
1206       kmp_team_t *new_team;
1207 
1208       __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
1209 
1210       new_team =
1211           __kmp_allocate_team(this_thr->th.th_root, 1, 1,
1212 #if OMPT_SUPPORT
1213                               ompt_parallel_data,
1214 #endif
1215                               proc_bind, &this_thr->th.th_current_task->td_icvs,
1216                               0 USE_NESTED_HOT_ARG(NULL));
1217       __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
1218       KMP_ASSERT(new_team);
1219 
1220       /* setup new serialized team and install it */
1221       new_team->t.t_threads[0] = this_thr;
1222       new_team->t.t_parent = this_thr->th.th_team;
1223       serial_team = new_team;
1224       this_thr->th.th_serial_team = serial_team;
1225 
1226       KF_TRACE(
1227           10,
1228           ("__kmpc_serialized_parallel: T#%d allocated new serial team %p\n",
1229            global_tid, serial_team));
1230 
1231       /* TODO the above breaks the requirement that if we run out of resources,
1232          then we can still guarantee that serialized teams are ok, since we may
1233          need to allocate a new one */
1234     } else {
1235       KF_TRACE(
1236           10,
1237           ("__kmpc_serialized_parallel: T#%d reusing cached serial team %p\n",
1238            global_tid, serial_team));
1239     }
1240 
1241     /* we have to initialize this serial team */
1242     KMP_DEBUG_ASSERT(serial_team->t.t_threads);
1243     KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
1244     KMP_DEBUG_ASSERT(this_thr->th.th_team != serial_team);
1245     serial_team->t.t_ident = loc;
1246     serial_team->t.t_serialized = 1;
1247     serial_team->t.t_nproc = 1;
1248     serial_team->t.t_parent = this_thr->th.th_team;
1249     serial_team->t.t_sched.sched = this_thr->th.th_team->t.t_sched.sched;
1250     this_thr->th.th_team = serial_team;
1251     serial_team->t.t_master_tid = this_thr->th.th_info.ds.ds_tid;
1252 
1253     KF_TRACE(10, ("__kmpc_serialized_parallel: T#d curtask=%p\n", global_tid,
1254                   this_thr->th.th_current_task));
1255     KMP_ASSERT(this_thr->th.th_current_task->td_flags.executing == 1);
1256     this_thr->th.th_current_task->td_flags.executing = 0;
1257 
1258     __kmp_push_current_task_to_thread(this_thr, serial_team, 0);
1259 
1260     /* TODO: GEH: do ICVs work for nested serialized teams? Don't we need an
1261        implicit task for each serialized task represented by
1262        team->t.t_serialized? */
1263     copy_icvs(&this_thr->th.th_current_task->td_icvs,
1264               &this_thr->th.th_current_task->td_parent->td_icvs);
1265 
1266     // Thread value exists in the nested nthreads array for the next nested
1267     // level
1268     if (__kmp_nested_nth.used && (level + 1 < __kmp_nested_nth.used)) {
1269       this_thr->th.th_current_task->td_icvs.nproc =
1270           __kmp_nested_nth.nth[level + 1];
1271     }
1272 
1273     if (__kmp_nested_proc_bind.used &&
1274         (level + 1 < __kmp_nested_proc_bind.used)) {
1275       this_thr->th.th_current_task->td_icvs.proc_bind =
1276           __kmp_nested_proc_bind.bind_types[level + 1];
1277     }
1278 
1279 #if USE_DEBUGGER
1280     serial_team->t.t_pkfn = (microtask_t)(~0); // For the debugger.
1281 #endif
1282     this_thr->th.th_info.ds.ds_tid = 0;
1283 
1284     /* set thread cache values */
1285     this_thr->th.th_team_nproc = 1;
1286     this_thr->th.th_team_master = this_thr;
1287     this_thr->th.th_team_serialized = 1;
1288 
1289     serial_team->t.t_level = serial_team->t.t_parent->t.t_level + 1;
1290     serial_team->t.t_active_level = serial_team->t.t_parent->t.t_active_level;
1291     serial_team->t.t_def_allocator = this_thr->th.th_def_allocator; // save
1292 
1293     propagateFPControl(serial_team);
1294 
1295     /* check if we need to allocate dispatch buffers stack */
1296     KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
1297     if (!serial_team->t.t_dispatch->th_disp_buffer) {
1298       serial_team->t.t_dispatch->th_disp_buffer =
1299           (dispatch_private_info_t *)__kmp_allocate(
1300               sizeof(dispatch_private_info_t));
1301     }
1302     this_thr->th.th_dispatch = serial_team->t.t_dispatch;
1303 
1304     KMP_MB();
1305 
1306   } else {
1307     /* this serialized team is already being used,
1308      * that's fine, just add another nested level */
1309     KMP_DEBUG_ASSERT(this_thr->th.th_team == serial_team);
1310     KMP_DEBUG_ASSERT(serial_team->t.t_threads);
1311     KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
1312     ++serial_team->t.t_serialized;
1313     this_thr->th.th_team_serialized = serial_team->t.t_serialized;
1314 
1315     // Nested level will be an index in the nested nthreads array
1316     int level = this_thr->th.th_team->t.t_level;
1317     // Thread value exists in the nested nthreads array for the next nested
1318     // level
1319     if (__kmp_nested_nth.used && (level + 1 < __kmp_nested_nth.used)) {
1320       this_thr->th.th_current_task->td_icvs.nproc =
1321           __kmp_nested_nth.nth[level + 1];
1322     }
1323     serial_team->t.t_level++;
1324     KF_TRACE(10, ("__kmpc_serialized_parallel: T#%d increasing nesting level "
1325                   "of serial team %p to %d\n",
1326                   global_tid, serial_team, serial_team->t.t_level));
1327 
1328     /* allocate/push dispatch buffers stack */
1329     KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
1330     {
1331       dispatch_private_info_t *disp_buffer =
1332           (dispatch_private_info_t *)__kmp_allocate(
1333               sizeof(dispatch_private_info_t));
1334       disp_buffer->next = serial_team->t.t_dispatch->th_disp_buffer;
1335       serial_team->t.t_dispatch->th_disp_buffer = disp_buffer;
1336     }
1337     this_thr->th.th_dispatch = serial_team->t.t_dispatch;
1338 
1339     KMP_MB();
1340   }
1341   KMP_CHECK_UPDATE(serial_team->t.t_cancel_request, cancel_noreq);
1342 
1343   // Perform the display affinity functionality for
1344   // serialized parallel regions
1345   if (__kmp_display_affinity) {
1346     if (this_thr->th.th_prev_level != serial_team->t.t_level ||
1347         this_thr->th.th_prev_num_threads != 1) {
1348       // NULL means use the affinity-format-var ICV
1349       __kmp_aux_display_affinity(global_tid, NULL);
1350       this_thr->th.th_prev_level = serial_team->t.t_level;
1351       this_thr->th.th_prev_num_threads = 1;
1352     }
1353   }
1354 
1355   if (__kmp_env_consistency_check)
1356     __kmp_push_parallel(global_tid, NULL);
1357 #if OMPT_SUPPORT
1358   serial_team->t.ompt_team_info.master_return_address = codeptr;
1359   if (ompt_enabled.enabled &&
1360       this_thr->th.ompt_thread_info.state != ompt_state_overhead) {
1361     OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1362 
1363     ompt_lw_taskteam_t lw_taskteam;
1364     __ompt_lw_taskteam_init(&lw_taskteam, this_thr, global_tid,
1365                             &ompt_parallel_data, codeptr);
1366 
1367     __ompt_lw_taskteam_link(&lw_taskteam, this_thr, 1);
1368     // don't use lw_taskteam after linking. content was swaped
1369 
1370     /* OMPT implicit task begin */
1371     implicit_task_data = OMPT_CUR_TASK_DATA(this_thr);
1372     if (ompt_enabled.ompt_callback_implicit_task) {
1373       ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1374           ompt_scope_begin, OMPT_CUR_TEAM_DATA(this_thr),
1375           OMPT_CUR_TASK_DATA(this_thr), 1, __kmp_tid_from_gtid(global_tid), ompt_task_implicit); // TODO: Can this be ompt_task_initial?
1376       OMPT_CUR_TASK_INFO(this_thr)
1377           ->thread_num = __kmp_tid_from_gtid(global_tid);
1378     }
1379 
1380     /* OMPT state */
1381     this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
1382     OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1383   }
1384 #endif
1385 }
1386 
1387 /* most of the work for a fork */
1388 /* return true if we really went parallel, false if serialized */
1389 int __kmp_fork_call(ident_t *loc, int gtid,
1390                     enum fork_context_e call_context, // Intel, GNU, ...
1391                     kmp_int32 argc, microtask_t microtask, launch_t invoker,
1392 /* TODO: revert workaround for Intel(R) 64 tracker #96 */
1393 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
1394                     va_list *ap
1395 #else
1396                     va_list ap
1397 #endif
1398                     ) {
1399   void **argv;
1400   int i;
1401   int master_tid;
1402   int master_this_cons;
1403   kmp_team_t *team;
1404   kmp_team_t *parent_team;
1405   kmp_info_t *master_th;
1406   kmp_root_t *root;
1407   int nthreads;
1408   int master_active;
1409   int master_set_numthreads;
1410   int level;
1411   int active_level;
1412   int teams_level;
1413 #if KMP_NESTED_HOT_TEAMS
1414   kmp_hot_team_ptr_t **p_hot_teams;
1415 #endif
1416   { // KMP_TIME_BLOCK
1417     KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_fork_call);
1418     KMP_COUNT_VALUE(OMP_PARALLEL_args, argc);
1419 
1420     KA_TRACE(20, ("__kmp_fork_call: enter T#%d\n", gtid));
1421     if (__kmp_stkpadding > 0 && __kmp_root[gtid] != NULL) {
1422       /* Some systems prefer the stack for the root thread(s) to start with */
1423       /* some gap from the parent stack to prevent false sharing. */
1424       void *dummy = KMP_ALLOCA(__kmp_stkpadding);
1425       /* These 2 lines below are so this does not get optimized out */
1426       if (__kmp_stkpadding > KMP_MAX_STKPADDING)
1427         __kmp_stkpadding += (short)((kmp_int64)dummy);
1428     }
1429 
1430     /* initialize if needed */
1431     KMP_DEBUG_ASSERT(
1432         __kmp_init_serial); // AC: potentially unsafe, not in sync with shutdown
1433     if (!TCR_4(__kmp_init_parallel))
1434       __kmp_parallel_initialize();
1435     __kmp_resume_if_soft_paused();
1436 
1437     /* setup current data */
1438     master_th = __kmp_threads[gtid]; // AC: potentially unsafe, not in sync with
1439     // shutdown
1440     parent_team = master_th->th.th_team;
1441     master_tid = master_th->th.th_info.ds.ds_tid;
1442     master_this_cons = master_th->th.th_local.this_construct;
1443     root = master_th->th.th_root;
1444     master_active = root->r.r_active;
1445     master_set_numthreads = master_th->th.th_set_nproc;
1446 
1447 #if OMPT_SUPPORT
1448     ompt_data_t ompt_parallel_data = ompt_data_none;
1449     ompt_data_t *parent_task_data;
1450     ompt_frame_t *ompt_frame;
1451     ompt_data_t *implicit_task_data;
1452     void *return_address = NULL;
1453 
1454     if (ompt_enabled.enabled) {
1455       __ompt_get_task_info_internal(0, NULL, &parent_task_data, &ompt_frame,
1456                                     NULL, NULL);
1457       return_address = OMPT_LOAD_RETURN_ADDRESS(gtid);
1458     }
1459 #endif
1460 
1461     // Nested level will be an index in the nested nthreads array
1462     level = parent_team->t.t_level;
1463     // used to launch non-serial teams even if nested is not allowed
1464     active_level = parent_team->t.t_active_level;
1465     // needed to check nesting inside the teams
1466     teams_level = master_th->th.th_teams_level;
1467 #if KMP_NESTED_HOT_TEAMS
1468     p_hot_teams = &master_th->th.th_hot_teams;
1469     if (*p_hot_teams == NULL && __kmp_hot_teams_max_level > 0) {
1470       *p_hot_teams = (kmp_hot_team_ptr_t *)__kmp_allocate(
1471           sizeof(kmp_hot_team_ptr_t) * __kmp_hot_teams_max_level);
1472       (*p_hot_teams)[0].hot_team = root->r.r_hot_team;
1473       // it is either actual or not needed (when active_level > 0)
1474       (*p_hot_teams)[0].hot_team_nth = 1;
1475     }
1476 #endif
1477 
1478 #if OMPT_SUPPORT
1479     if (ompt_enabled.enabled) {
1480       if (ompt_enabled.ompt_callback_parallel_begin) {
1481         int team_size = master_set_numthreads
1482                             ? master_set_numthreads
1483                             : get__nproc_2(parent_team, master_tid);
1484         int flags = OMPT_INVOKER(call_context) |
1485                     ((microtask == (microtask_t)__kmp_teams_master)
1486                          ? ompt_parallel_league
1487                          : ompt_parallel_team);
1488         ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)(
1489             parent_task_data, ompt_frame, &ompt_parallel_data, team_size, flags,
1490             return_address);
1491       }
1492       master_th->th.ompt_thread_info.state = ompt_state_overhead;
1493     }
1494 #endif
1495 
1496     master_th->th.th_ident = loc;
1497 
1498     if (master_th->th.th_teams_microtask && ap &&
1499         microtask != (microtask_t)__kmp_teams_master && level == teams_level) {
1500       // AC: This is start of parallel that is nested inside teams construct.
1501       // The team is actual (hot), all workers are ready at the fork barrier.
1502       // No lock needed to initialize the team a bit, then free workers.
1503       parent_team->t.t_ident = loc;
1504       __kmp_alloc_argv_entries(argc, parent_team, TRUE);
1505       parent_team->t.t_argc = argc;
1506       argv = (void **)parent_team->t.t_argv;
1507       for (i = argc - 1; i >= 0; --i)
1508 /* TODO: revert workaround for Intel(R) 64 tracker #96 */
1509 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
1510         *argv++ = va_arg(*ap, void *);
1511 #else
1512         *argv++ = va_arg(ap, void *);
1513 #endif
1514       // Increment our nested depth levels, but not increase the serialization
1515       if (parent_team == master_th->th.th_serial_team) {
1516         // AC: we are in serialized parallel
1517         __kmpc_serialized_parallel(loc, gtid);
1518         KMP_DEBUG_ASSERT(parent_team->t.t_serialized > 1);
1519 
1520 #if OMPT_SUPPORT
1521         void *dummy;
1522         void **exit_frame_p;
1523 
1524         ompt_lw_taskteam_t lw_taskteam;
1525 
1526         if (ompt_enabled.enabled) {
1527           __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1528                                   &ompt_parallel_data, return_address);
1529           exit_frame_p = &(lw_taskteam.ompt_task_info.frame.exit_frame.ptr);
1530 
1531           __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
1532           // don't use lw_taskteam after linking. content was swaped
1533 
1534           /* OMPT implicit task begin */
1535           implicit_task_data = OMPT_CUR_TASK_DATA(master_th);
1536           if (ompt_enabled.ompt_callback_implicit_task) {
1537             OMPT_CUR_TASK_INFO(master_th)
1538                 ->thread_num = __kmp_tid_from_gtid(gtid);
1539             ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1540                 ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1541                 implicit_task_data, 1,
1542                 OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
1543           }
1544 
1545           /* OMPT state */
1546           master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1547         } else {
1548           exit_frame_p = &dummy;
1549         }
1550 #endif
1551         // AC: need to decrement t_serialized for enquiry functions to work
1552         // correctly, will restore at join time
1553         parent_team->t.t_serialized--;
1554 
1555         {
1556           KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1557           KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1558           __kmp_invoke_microtask(microtask, gtid, 0, argc, parent_team->t.t_argv
1559 #if OMPT_SUPPORT
1560                                  ,
1561                                  exit_frame_p
1562 #endif
1563                                  );
1564         }
1565 
1566 #if OMPT_SUPPORT
1567         if (ompt_enabled.enabled) {
1568           *exit_frame_p = NULL;
1569           OMPT_CUR_TASK_INFO(master_th)->frame.exit_frame = ompt_data_none;
1570           if (ompt_enabled.ompt_callback_implicit_task) {
1571             ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1572                 ompt_scope_end, NULL, implicit_task_data, 1,
1573                 OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
1574           }
1575           ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
1576           __ompt_lw_taskteam_unlink(master_th);
1577           if (ompt_enabled.ompt_callback_parallel_end) {
1578             ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1579                 &ompt_parallel_data, OMPT_CUR_TASK_DATA(master_th),
1580                 OMPT_INVOKER(call_context) | ompt_parallel_team,
1581                 return_address);
1582           }
1583           master_th->th.ompt_thread_info.state = ompt_state_overhead;
1584         }
1585 #endif
1586         return TRUE;
1587       }
1588 
1589       parent_team->t.t_pkfn = microtask;
1590       parent_team->t.t_invoke = invoker;
1591       KMP_ATOMIC_INC(&root->r.r_in_parallel);
1592       parent_team->t.t_active_level++;
1593       parent_team->t.t_level++;
1594       parent_team->t.t_def_allocator = master_th->th.th_def_allocator; // save
1595 
1596 #if OMPT_SUPPORT
1597       if (ompt_enabled.enabled) {
1598         ompt_lw_taskteam_t lw_taskteam;
1599         __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1600                                 &ompt_parallel_data, return_address);
1601         __ompt_lw_taskteam_link(&lw_taskteam, master_th, 1, true);
1602       }
1603 #endif
1604 
1605       /* Change number of threads in the team if requested */
1606       if (master_set_numthreads) { // The parallel has num_threads clause
1607         if (master_set_numthreads < master_th->th.th_teams_size.nth) {
1608           // AC: only can reduce number of threads dynamically, can't increase
1609           kmp_info_t **other_threads = parent_team->t.t_threads;
1610           parent_team->t.t_nproc = master_set_numthreads;
1611           for (i = 0; i < master_set_numthreads; ++i) {
1612             other_threads[i]->th.th_team_nproc = master_set_numthreads;
1613           }
1614           // Keep extra threads hot in the team for possible next parallels
1615         }
1616         master_th->th.th_set_nproc = 0;
1617       }
1618 
1619 #if USE_DEBUGGER
1620       if (__kmp_debugging) { // Let debugger override number of threads.
1621         int nth = __kmp_omp_num_threads(loc);
1622         if (nth > 0) { // 0 means debugger doesn't want to change num threads
1623           master_set_numthreads = nth;
1624         }
1625       }
1626 #endif
1627 
1628       KF_TRACE(10, ("__kmp_fork_call: before internal fork: root=%p, team=%p, "
1629                     "master_th=%p, gtid=%d\n",
1630                     root, parent_team, master_th, gtid));
1631       __kmp_internal_fork(loc, gtid, parent_team);
1632       KF_TRACE(10, ("__kmp_fork_call: after internal fork: root=%p, team=%p, "
1633                     "master_th=%p, gtid=%d\n",
1634                     root, parent_team, master_th, gtid));
1635 
1636       /* Invoke microtask for MASTER thread */
1637       KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n", gtid,
1638                     parent_team->t.t_id, parent_team->t.t_pkfn));
1639 
1640       if (!parent_team->t.t_invoke(gtid)) {
1641         KMP_ASSERT2(0, "cannot invoke microtask for MASTER thread");
1642       }
1643       KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n", gtid,
1644                     parent_team->t.t_id, parent_team->t.t_pkfn));
1645       KMP_MB(); /* Flush all pending memory write invalidates.  */
1646 
1647       KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
1648 
1649       return TRUE;
1650     } // Parallel closely nested in teams construct
1651 
1652 #if KMP_DEBUG
1653     if (__kmp_tasking_mode != tskm_immediate_exec) {
1654       KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
1655                        parent_team->t.t_task_team[master_th->th.th_task_state]);
1656     }
1657 #endif
1658 
1659     if (parent_team->t.t_active_level >=
1660         master_th->th.th_current_task->td_icvs.max_active_levels) {
1661       nthreads = 1;
1662     } else {
1663       int enter_teams = ((ap == NULL && active_level == 0) ||
1664                          (ap && teams_level > 0 && teams_level == level));
1665       nthreads =
1666           master_set_numthreads
1667               ? master_set_numthreads
1668               : get__nproc_2(
1669                     parent_team,
1670                     master_tid); // TODO: get nproc directly from current task
1671 
1672       // Check if we need to take forkjoin lock? (no need for serialized
1673       // parallel out of teams construct). This code moved here from
1674       // __kmp_reserve_threads() to speedup nested serialized parallels.
1675       if (nthreads > 1) {
1676         if ((get__max_active_levels(master_th) == 1 &&
1677              (root->r.r_in_parallel && !enter_teams)) ||
1678             (__kmp_library == library_serial)) {
1679           KC_TRACE(10, ("__kmp_fork_call: T#%d serializing team; requested %d"
1680                         " threads\n",
1681                         gtid, nthreads));
1682           nthreads = 1;
1683         }
1684       }
1685       if (nthreads > 1) {
1686         /* determine how many new threads we can use */
1687         __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
1688         /* AC: If we execute teams from parallel region (on host), then teams
1689            should be created but each can only have 1 thread if nesting is
1690            disabled. If teams called from serial region, then teams and their
1691            threads should be created regardless of the nesting setting. */
1692         nthreads = __kmp_reserve_threads(root, parent_team, master_tid,
1693                                          nthreads, enter_teams);
1694         if (nthreads == 1) {
1695           // Free lock for single thread execution here; for multi-thread
1696           // execution it will be freed later after team of threads created
1697           // and initialized
1698           __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
1699         }
1700       }
1701     }
1702     KMP_DEBUG_ASSERT(nthreads > 0);
1703 
1704     // If we temporarily changed the set number of threads then restore it now
1705     master_th->th.th_set_nproc = 0;
1706 
1707     /* create a serialized parallel region? */
1708     if (nthreads == 1) {
1709 /* josh todo: hypothetical question: what do we do for OS X*? */
1710 #if KMP_OS_LINUX &&                                                            \
1711     (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
1712       void *args[argc];
1713 #else
1714       void **args = (void **)KMP_ALLOCA(argc * sizeof(void *));
1715 #endif /* KMP_OS_LINUX && ( KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || \
1716           KMP_ARCH_AARCH64) */
1717 
1718       KA_TRACE(20,
1719                ("__kmp_fork_call: T#%d serializing parallel region\n", gtid));
1720 
1721       __kmpc_serialized_parallel(loc, gtid);
1722 
1723       if (call_context == fork_context_intel) {
1724         /* TODO this sucks, use the compiler itself to pass args! :) */
1725         master_th->th.th_serial_team->t.t_ident = loc;
1726         if (!ap) {
1727           // revert change made in __kmpc_serialized_parallel()
1728           master_th->th.th_serial_team->t.t_level--;
1729 // Get args from parent team for teams construct
1730 
1731 #if OMPT_SUPPORT
1732           void *dummy;
1733           void **exit_frame_p;
1734           ompt_task_info_t *task_info;
1735 
1736           ompt_lw_taskteam_t lw_taskteam;
1737 
1738           if (ompt_enabled.enabled) {
1739             __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1740                                     &ompt_parallel_data, return_address);
1741 
1742             __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
1743             // don't use lw_taskteam after linking. content was swaped
1744 
1745             task_info = OMPT_CUR_TASK_INFO(master_th);
1746             exit_frame_p = &(task_info->frame.exit_frame.ptr);
1747             if (ompt_enabled.ompt_callback_implicit_task) {
1748               OMPT_CUR_TASK_INFO(master_th)
1749                   ->thread_num = __kmp_tid_from_gtid(gtid);
1750               ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1751                   ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1752                   &(task_info->task_data), 1,
1753                   OMPT_CUR_TASK_INFO(master_th)->thread_num,
1754                   ompt_task_implicit);
1755             }
1756 
1757             /* OMPT state */
1758             master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1759           } else {
1760             exit_frame_p = &dummy;
1761           }
1762 #endif
1763 
1764           {
1765             KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1766             KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1767             __kmp_invoke_microtask(microtask, gtid, 0, argc,
1768                                    parent_team->t.t_argv
1769 #if OMPT_SUPPORT
1770                                    ,
1771                                    exit_frame_p
1772 #endif
1773                                    );
1774           }
1775 
1776 #if OMPT_SUPPORT
1777           if (ompt_enabled.enabled) {
1778             *exit_frame_p = NULL;
1779             if (ompt_enabled.ompt_callback_implicit_task) {
1780               ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1781                   ompt_scope_end, NULL, &(task_info->task_data), 1,
1782                   OMPT_CUR_TASK_INFO(master_th)->thread_num,
1783                   ompt_task_implicit);
1784             }
1785             ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
1786             __ompt_lw_taskteam_unlink(master_th);
1787             if (ompt_enabled.ompt_callback_parallel_end) {
1788               ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1789                   &ompt_parallel_data, parent_task_data,
1790                   OMPT_INVOKER(call_context) | ompt_parallel_team,
1791                   return_address);
1792             }
1793             master_th->th.ompt_thread_info.state = ompt_state_overhead;
1794           }
1795 #endif
1796         } else if (microtask == (microtask_t)__kmp_teams_master) {
1797           KMP_DEBUG_ASSERT(master_th->th.th_team ==
1798                            master_th->th.th_serial_team);
1799           team = master_th->th.th_team;
1800           // team->t.t_pkfn = microtask;
1801           team->t.t_invoke = invoker;
1802           __kmp_alloc_argv_entries(argc, team, TRUE);
1803           team->t.t_argc = argc;
1804           argv = (void **)team->t.t_argv;
1805           if (ap) {
1806             for (i = argc - 1; i >= 0; --i)
1807 // TODO: revert workaround for Intel(R) 64 tracker #96
1808 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
1809               *argv++ = va_arg(*ap, void *);
1810 #else
1811               *argv++ = va_arg(ap, void *);
1812 #endif
1813           } else {
1814             for (i = 0; i < argc; ++i)
1815               // Get args from parent team for teams construct
1816               argv[i] = parent_team->t.t_argv[i];
1817           }
1818           // AC: revert change made in __kmpc_serialized_parallel()
1819           //     because initial code in teams should have level=0
1820           team->t.t_level--;
1821           // AC: call special invoker for outer "parallel" of teams construct
1822           invoker(gtid);
1823 #if OMPT_SUPPORT
1824           if (ompt_enabled.enabled) {
1825             ompt_task_info_t *task_info = OMPT_CUR_TASK_INFO(master_th);
1826             if (ompt_enabled.ompt_callback_implicit_task) {
1827               ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1828                   ompt_scope_end, NULL, &(task_info->task_data), 0,
1829                   OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_initial);
1830             }
1831             if (ompt_enabled.ompt_callback_parallel_end) {
1832               ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1833                   &ompt_parallel_data, parent_task_data,
1834                   OMPT_INVOKER(call_context) | ompt_parallel_league,
1835                   return_address);
1836             }
1837             master_th->th.ompt_thread_info.state = ompt_state_overhead;
1838           }
1839 #endif
1840         } else {
1841           argv = args;
1842           for (i = argc - 1; i >= 0; --i)
1843 // TODO: revert workaround for Intel(R) 64 tracker #96
1844 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
1845             *argv++ = va_arg(*ap, void *);
1846 #else
1847             *argv++ = va_arg(ap, void *);
1848 #endif
1849           KMP_MB();
1850 
1851 #if OMPT_SUPPORT
1852           void *dummy;
1853           void **exit_frame_p;
1854           ompt_task_info_t *task_info;
1855 
1856           ompt_lw_taskteam_t lw_taskteam;
1857 
1858           if (ompt_enabled.enabled) {
1859             __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1860                                     &ompt_parallel_data, return_address);
1861             __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
1862             // don't use lw_taskteam after linking. content was swaped
1863             task_info = OMPT_CUR_TASK_INFO(master_th);
1864             exit_frame_p = &(task_info->frame.exit_frame.ptr);
1865 
1866             /* OMPT implicit task begin */
1867             implicit_task_data = OMPT_CUR_TASK_DATA(master_th);
1868             if (ompt_enabled.ompt_callback_implicit_task) {
1869               ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1870                   ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1871                   implicit_task_data, 1, __kmp_tid_from_gtid(gtid),
1872                   ompt_task_implicit);
1873               OMPT_CUR_TASK_INFO(master_th)
1874                   ->thread_num = __kmp_tid_from_gtid(gtid);
1875             }
1876 
1877             /* OMPT state */
1878             master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1879           } else {
1880             exit_frame_p = &dummy;
1881           }
1882 #endif
1883 
1884           {
1885             KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1886             KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1887             __kmp_invoke_microtask(microtask, gtid, 0, argc, args
1888 #if OMPT_SUPPORT
1889                                    ,
1890                                    exit_frame_p
1891 #endif
1892                                    );
1893           }
1894 
1895 #if OMPT_SUPPORT
1896           if (ompt_enabled.enabled) {
1897             *exit_frame_p = NULL;
1898             if (ompt_enabled.ompt_callback_implicit_task) {
1899               ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1900                   ompt_scope_end, NULL, &(task_info->task_data), 1,
1901                   OMPT_CUR_TASK_INFO(master_th)->thread_num,
1902                   ompt_task_implicit);
1903             }
1904 
1905             ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
1906             __ompt_lw_taskteam_unlink(master_th);
1907             if (ompt_enabled.ompt_callback_parallel_end) {
1908               ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1909                   &ompt_parallel_data, parent_task_data,
1910                   OMPT_INVOKER(call_context) | ompt_parallel_team,
1911                   return_address);
1912             }
1913             master_th->th.ompt_thread_info.state = ompt_state_overhead;
1914           }
1915 #endif
1916         }
1917       } else if (call_context == fork_context_gnu) {
1918 #if OMPT_SUPPORT
1919         ompt_lw_taskteam_t lwt;
1920         __ompt_lw_taskteam_init(&lwt, master_th, gtid, &ompt_parallel_data,
1921                                 return_address);
1922 
1923         lwt.ompt_task_info.frame.exit_frame = ompt_data_none;
1924         __ompt_lw_taskteam_link(&lwt, master_th, 1);
1925 // don't use lw_taskteam after linking. content was swaped
1926 #endif
1927 
1928         // we were called from GNU native code
1929         KA_TRACE(20, ("__kmp_fork_call: T#%d serial exit\n", gtid));
1930         return FALSE;
1931       } else {
1932         KMP_ASSERT2(call_context < fork_context_last,
1933                     "__kmp_fork_call: unknown fork_context parameter");
1934       }
1935 
1936       KA_TRACE(20, ("__kmp_fork_call: T#%d serial exit\n", gtid));
1937       KMP_MB();
1938       return FALSE;
1939     } // if (nthreads == 1)
1940 
1941     // GEH: only modify the executing flag in the case when not serialized
1942     //      serialized case is handled in kmpc_serialized_parallel
1943     KF_TRACE(10, ("__kmp_fork_call: parent_team_aclevel=%d, master_th=%p, "
1944                   "curtask=%p, curtask_max_aclevel=%d\n",
1945                   parent_team->t.t_active_level, master_th,
1946                   master_th->th.th_current_task,
1947                   master_th->th.th_current_task->td_icvs.max_active_levels));
1948     // TODO: GEH - cannot do this assertion because root thread not set up as
1949     // executing
1950     // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 1 );
1951     master_th->th.th_current_task->td_flags.executing = 0;
1952 
1953     if (!master_th->th.th_teams_microtask || level > teams_level) {
1954       /* Increment our nested depth level */
1955       KMP_ATOMIC_INC(&root->r.r_in_parallel);
1956     }
1957 
1958     // See if we need to make a copy of the ICVs.
1959     int nthreads_icv = master_th->th.th_current_task->td_icvs.nproc;
1960     if ((level + 1 < __kmp_nested_nth.used) &&
1961         (__kmp_nested_nth.nth[level + 1] != nthreads_icv)) {
1962       nthreads_icv = __kmp_nested_nth.nth[level + 1];
1963     } else {
1964       nthreads_icv = 0; // don't update
1965     }
1966 
1967     // Figure out the proc_bind_policy for the new team.
1968     kmp_proc_bind_t proc_bind = master_th->th.th_set_proc_bind;
1969     kmp_proc_bind_t proc_bind_icv =
1970         proc_bind_default; // proc_bind_default means don't update
1971     if (master_th->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
1972       proc_bind = proc_bind_false;
1973     } else {
1974       if (proc_bind == proc_bind_default) {
1975         // No proc_bind clause specified; use current proc-bind-var for this
1976         // parallel region
1977         proc_bind = master_th->th.th_current_task->td_icvs.proc_bind;
1978       }
1979       /* else: The proc_bind policy was specified explicitly on parallel clause.
1980          This overrides proc-bind-var for this parallel region, but does not
1981          change proc-bind-var. */
1982       // Figure the value of proc-bind-var for the child threads.
1983       if ((level + 1 < __kmp_nested_proc_bind.used) &&
1984           (__kmp_nested_proc_bind.bind_types[level + 1] !=
1985            master_th->th.th_current_task->td_icvs.proc_bind)) {
1986         proc_bind_icv = __kmp_nested_proc_bind.bind_types[level + 1];
1987       }
1988     }
1989 
1990     // Reset for next parallel region
1991     master_th->th.th_set_proc_bind = proc_bind_default;
1992 
1993     if ((nthreads_icv > 0) || (proc_bind_icv != proc_bind_default)) {
1994       kmp_internal_control_t new_icvs;
1995       copy_icvs(&new_icvs, &master_th->th.th_current_task->td_icvs);
1996       new_icvs.next = NULL;
1997       if (nthreads_icv > 0) {
1998         new_icvs.nproc = nthreads_icv;
1999       }
2000       if (proc_bind_icv != proc_bind_default) {
2001         new_icvs.proc_bind = proc_bind_icv;
2002       }
2003 
2004       /* allocate a new parallel team */
2005       KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n"));
2006       team = __kmp_allocate_team(root, nthreads, nthreads,
2007 #if OMPT_SUPPORT
2008                                  ompt_parallel_data,
2009 #endif
2010                                  proc_bind, &new_icvs,
2011                                  argc USE_NESTED_HOT_ARG(master_th));
2012     } else {
2013       /* allocate a new parallel team */
2014       KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n"));
2015       team = __kmp_allocate_team(root, nthreads, nthreads,
2016 #if OMPT_SUPPORT
2017                                  ompt_parallel_data,
2018 #endif
2019                                  proc_bind,
2020                                  &master_th->th.th_current_task->td_icvs,
2021                                  argc USE_NESTED_HOT_ARG(master_th));
2022     }
2023     KF_TRACE(
2024         10, ("__kmp_fork_call: after __kmp_allocate_team - team = %p\n", team));
2025 
2026     /* setup the new team */
2027     KMP_CHECK_UPDATE(team->t.t_master_tid, master_tid);
2028     KMP_CHECK_UPDATE(team->t.t_master_this_cons, master_this_cons);
2029     KMP_CHECK_UPDATE(team->t.t_ident, loc);
2030     KMP_CHECK_UPDATE(team->t.t_parent, parent_team);
2031     KMP_CHECK_UPDATE_SYNC(team->t.t_pkfn, microtask);
2032 #if OMPT_SUPPORT
2033     KMP_CHECK_UPDATE_SYNC(team->t.ompt_team_info.master_return_address,
2034                           return_address);
2035 #endif
2036     KMP_CHECK_UPDATE(team->t.t_invoke, invoker); // TODO move to root, maybe
2037     // TODO: parent_team->t.t_level == INT_MAX ???
2038     if (!master_th->th.th_teams_microtask || level > teams_level) {
2039       int new_level = parent_team->t.t_level + 1;
2040       KMP_CHECK_UPDATE(team->t.t_level, new_level);
2041       new_level = parent_team->t.t_active_level + 1;
2042       KMP_CHECK_UPDATE(team->t.t_active_level, new_level);
2043     } else {
2044       // AC: Do not increase parallel level at start of the teams construct
2045       int new_level = parent_team->t.t_level;
2046       KMP_CHECK_UPDATE(team->t.t_level, new_level);
2047       new_level = parent_team->t.t_active_level;
2048       KMP_CHECK_UPDATE(team->t.t_active_level, new_level);
2049     }
2050     kmp_r_sched_t new_sched = get__sched_2(parent_team, master_tid);
2051     // set master's schedule as new run-time schedule
2052     KMP_CHECK_UPDATE(team->t.t_sched.sched, new_sched.sched);
2053 
2054     KMP_CHECK_UPDATE(team->t.t_cancel_request, cancel_noreq);
2055     KMP_CHECK_UPDATE(team->t.t_def_allocator, master_th->th.th_def_allocator);
2056 
2057     // Update the floating point rounding in the team if required.
2058     propagateFPControl(team);
2059 
2060     if (__kmp_tasking_mode != tskm_immediate_exec) {
2061       // Set master's task team to team's task team. Unless this is hot team, it
2062       // should be NULL.
2063       KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
2064                        parent_team->t.t_task_team[master_th->th.th_task_state]);
2065       KA_TRACE(20, ("__kmp_fork_call: Master T#%d pushing task_team %p / team "
2066                     "%p, new task_team %p / team %p\n",
2067                     __kmp_gtid_from_thread(master_th),
2068                     master_th->th.th_task_team, parent_team,
2069                     team->t.t_task_team[master_th->th.th_task_state], team));
2070 
2071       if (active_level || master_th->th.th_task_team) {
2072         // Take a memo of master's task_state
2073         KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack);
2074         if (master_th->th.th_task_state_top >=
2075             master_th->th.th_task_state_stack_sz) { // increase size
2076           kmp_uint32 new_size = 2 * master_th->th.th_task_state_stack_sz;
2077           kmp_uint8 *old_stack, *new_stack;
2078           kmp_uint32 i;
2079           new_stack = (kmp_uint8 *)__kmp_allocate(new_size);
2080           for (i = 0; i < master_th->th.th_task_state_stack_sz; ++i) {
2081             new_stack[i] = master_th->th.th_task_state_memo_stack[i];
2082           }
2083           for (i = master_th->th.th_task_state_stack_sz; i < new_size;
2084                ++i) { // zero-init rest of stack
2085             new_stack[i] = 0;
2086           }
2087           old_stack = master_th->th.th_task_state_memo_stack;
2088           master_th->th.th_task_state_memo_stack = new_stack;
2089           master_th->th.th_task_state_stack_sz = new_size;
2090           __kmp_free(old_stack);
2091         }
2092         // Store master's task_state on stack
2093         master_th->th
2094             .th_task_state_memo_stack[master_th->th.th_task_state_top] =
2095             master_th->th.th_task_state;
2096         master_th->th.th_task_state_top++;
2097 #if KMP_NESTED_HOT_TEAMS
2098         if (master_th->th.th_hot_teams &&
2099             active_level < __kmp_hot_teams_max_level &&
2100             team == master_th->th.th_hot_teams[active_level].hot_team) {
2101           // Restore master's nested state if nested hot team
2102           master_th->th.th_task_state =
2103               master_th->th
2104                   .th_task_state_memo_stack[master_th->th.th_task_state_top];
2105         } else {
2106 #endif
2107           master_th->th.th_task_state = 0;
2108 #if KMP_NESTED_HOT_TEAMS
2109         }
2110 #endif
2111       }
2112 #if !KMP_NESTED_HOT_TEAMS
2113       KMP_DEBUG_ASSERT((master_th->th.th_task_team == NULL) ||
2114                        (team == root->r.r_hot_team));
2115 #endif
2116     }
2117 
2118     KA_TRACE(
2119         20,
2120         ("__kmp_fork_call: T#%d(%d:%d)->(%d:0) created a team of %d threads\n",
2121          gtid, parent_team->t.t_id, team->t.t_master_tid, team->t.t_id,
2122          team->t.t_nproc));
2123     KMP_DEBUG_ASSERT(team != root->r.r_hot_team ||
2124                      (team->t.t_master_tid == 0 &&
2125                       (team->t.t_parent == root->r.r_root_team ||
2126                        team->t.t_parent->t.t_serialized)));
2127     KMP_MB();
2128 
2129     /* now, setup the arguments */
2130     argv = (void **)team->t.t_argv;
2131     if (ap) {
2132       for (i = argc - 1; i >= 0; --i) {
2133 // TODO: revert workaround for Intel(R) 64 tracker #96
2134 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
2135         void *new_argv = va_arg(*ap, void *);
2136 #else
2137         void *new_argv = va_arg(ap, void *);
2138 #endif
2139         KMP_CHECK_UPDATE(*argv, new_argv);
2140         argv++;
2141       }
2142     } else {
2143       for (i = 0; i < argc; ++i) {
2144         // Get args from parent team for teams construct
2145         KMP_CHECK_UPDATE(argv[i], team->t.t_parent->t.t_argv[i]);
2146       }
2147     }
2148 
2149     /* now actually fork the threads */
2150     KMP_CHECK_UPDATE(team->t.t_master_active, master_active);
2151     if (!root->r.r_active) // Only do assignment if it prevents cache ping-pong
2152       root->r.r_active = TRUE;
2153 
2154     __kmp_fork_team_threads(root, team, master_th, gtid);
2155     __kmp_setup_icv_copy(team, nthreads,
2156                          &master_th->th.th_current_task->td_icvs, loc);
2157 
2158 #if OMPT_SUPPORT
2159     master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
2160 #endif
2161 
2162     __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2163 
2164 #if USE_ITT_BUILD
2165     if (team->t.t_active_level == 1 // only report frames at level 1
2166         && !master_th->th.th_teams_microtask) { // not in teams construct
2167 #if USE_ITT_NOTIFY
2168       if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
2169           (__kmp_forkjoin_frames_mode == 3 ||
2170            __kmp_forkjoin_frames_mode == 1)) {
2171         kmp_uint64 tmp_time = 0;
2172         if (__itt_get_timestamp_ptr)
2173           tmp_time = __itt_get_timestamp();
2174         // Internal fork - report frame begin
2175         master_th->th.th_frame_time = tmp_time;
2176         if (__kmp_forkjoin_frames_mode == 3)
2177           team->t.t_region_time = tmp_time;
2178       } else
2179 // only one notification scheme (either "submit" or "forking/joined", not both)
2180 #endif /* USE_ITT_NOTIFY */
2181           if ((__itt_frame_begin_v3_ptr || KMP_ITT_DEBUG) &&
2182               __kmp_forkjoin_frames && !__kmp_forkjoin_frames_mode) {
2183         // Mark start of "parallel" region for Intel(R) VTune(TM) analyzer.
2184         __kmp_itt_region_forking(gtid, team->t.t_nproc, 0);
2185       }
2186     }
2187 #endif /* USE_ITT_BUILD */
2188 
2189     /* now go on and do the work */
2190     KMP_DEBUG_ASSERT(team == __kmp_threads[gtid]->th.th_team);
2191     KMP_MB();
2192     KF_TRACE(10,
2193              ("__kmp_internal_fork : root=%p, team=%p, master_th=%p, gtid=%d\n",
2194               root, team, master_th, gtid));
2195 
2196 #if USE_ITT_BUILD
2197     if (__itt_stack_caller_create_ptr) {
2198       team->t.t_stack_id =
2199           __kmp_itt_stack_caller_create(); // create new stack stitching id
2200       // before entering fork barrier
2201     }
2202 #endif /* USE_ITT_BUILD */
2203 
2204     // AC: skip __kmp_internal_fork at teams construct, let only master
2205     // threads execute
2206     if (ap) {
2207       __kmp_internal_fork(loc, gtid, team);
2208       KF_TRACE(10, ("__kmp_internal_fork : after : root=%p, team=%p, "
2209                     "master_th=%p, gtid=%d\n",
2210                     root, team, master_th, gtid));
2211     }
2212 
2213     if (call_context == fork_context_gnu) {
2214       KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
2215       return TRUE;
2216     }
2217 
2218     /* Invoke microtask for MASTER thread */
2219     KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n", gtid,
2220                   team->t.t_id, team->t.t_pkfn));
2221   } // END of timer KMP_fork_call block
2222 
2223 #if KMP_STATS_ENABLED
2224   // If beginning a teams construct, then change thread state
2225   stats_state_e previous_state = KMP_GET_THREAD_STATE();
2226   if (!ap) {
2227     KMP_SET_THREAD_STATE(stats_state_e::TEAMS_REGION);
2228   }
2229 #endif
2230 
2231   if (!team->t.t_invoke(gtid)) {
2232     KMP_ASSERT2(0, "cannot invoke microtask for MASTER thread");
2233   }
2234 
2235 #if KMP_STATS_ENABLED
2236   // If was beginning of a teams construct, then reset thread state
2237   if (!ap) {
2238     KMP_SET_THREAD_STATE(previous_state);
2239   }
2240 #endif
2241 
2242   KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n", gtid,
2243                 team->t.t_id, team->t.t_pkfn));
2244   KMP_MB(); /* Flush all pending memory write invalidates.  */
2245 
2246   KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
2247 
2248 #if OMPT_SUPPORT
2249   if (ompt_enabled.enabled) {
2250     master_th->th.ompt_thread_info.state = ompt_state_overhead;
2251   }
2252 #endif
2253 
2254   return TRUE;
2255 }
2256 
2257 #if OMPT_SUPPORT
2258 static inline void __kmp_join_restore_state(kmp_info_t *thread,
2259                                             kmp_team_t *team) {
2260   // restore state outside the region
2261   thread->th.ompt_thread_info.state =
2262       ((team->t.t_serialized) ? ompt_state_work_serial
2263                               : ompt_state_work_parallel);
2264 }
2265 
2266 static inline void __kmp_join_ompt(int gtid, kmp_info_t *thread,
2267                                    kmp_team_t *team, ompt_data_t *parallel_data,
2268                                    int flags, void *codeptr) {
2269   ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2270   if (ompt_enabled.ompt_callback_parallel_end) {
2271     ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
2272         parallel_data, &(task_info->task_data), flags, codeptr);
2273   }
2274 
2275   task_info->frame.enter_frame = ompt_data_none;
2276   __kmp_join_restore_state(thread, team);
2277 }
2278 #endif
2279 
/* Ends the parallel (or teams) region whose team is the current team of thread
   `gtid`, and makes that thread current in the parent team again.

   loc          - source location; stored in the master's th_ident.
   gtid         - global thread id; index into __kmp_threads.
   fork_context - (OMPT builds only) fed to OMPT_INVOKER() when building the
                  flags of the parallel-end callback.
   exit_teams   - non-zero: skip the join barrier (__kmp_internal_join) and
                  reset the master's th_task_state to 0 instead.

   There are three ways out of this function:
   1. The team is serialized: the work is delegated to
      __kmpc_end_serialized_parallel().
   2. A parallel region directly inside a teams construct ends
      (t_level == th_teams_level + 1): the team is kept and only nesting
      levels, r_in_parallel and (if needed) the thread count are adjusted.
   3. Otherwise: while holding __kmp_forkjoin_lock the team is passed to
      __kmp_free_team() and the master's team pointers, serial team, task
      state and task team are switched back to the parent team. */
2280 void __kmp_join_call(ident_t *loc, int gtid
2281 #if OMPT_SUPPORT
2282                      ,
2283                      enum fork_context_e fork_context
2284 #endif
2285                      ,
2286                      int exit_teams) {
2287   KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_join_call);
2288   kmp_team_t *team;
2289   kmp_team_t *parent_team;
2290   kmp_info_t *master_th;
2291   kmp_root_t *root;
2292   int master_active;
2293 
2294   KA_TRACE(20, ("__kmp_join_call: enter T#%d\n", gtid));
2295 
2296   /* setup current data */
2297   master_th = __kmp_threads[gtid];
2298   root = master_th->th.th_root;
2299   team = master_th->th.th_team;
2300   parent_team = team->t.t_parent;
2301 
2302   master_th->th.th_ident = loc;
2303 
2304 #if OMPT_SUPPORT
       // The microtask pointer is read now because it is still needed for the
       // OMPT flags after `team` has been handed to __kmp_free_team() below.
2305   void *team_microtask = (void *)team->t.t_pkfn;
2306   if (ompt_enabled.enabled) {
2307     master_th->th.ompt_thread_info.state = ompt_state_overhead;
2308   }
2309 #endif
2310 
2311 #if KMP_DEBUG
       // Debug-only consistency check: the master's task team must be the
       // team's task team for the master's current task state.
2312   if (__kmp_tasking_mode != tskm_immediate_exec && !exit_teams) {
2313     KA_TRACE(20, ("__kmp_join_call: T#%d, old team = %p old task_team = %p, "
2314                   "th_task_team = %p\n",
2315                   __kmp_gtid_from_thread(master_th), team,
2316                   team->t.t_task_team[master_th->th.th_task_state],
2317                   master_th->th.th_task_team));
2318     KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
2319                      team->t.t_task_team[master_th->th.th_task_state]);
2320   }
2321 #endif
2322 
       // Exit path 1: serialized team.
2323   if (team->t.t_serialized) {
2324     if (master_th->th.th_teams_microtask) {
2325       // We are in teams construct
2326       int level = team->t.t_level;
2327       int tlevel = master_th->th.th_teams_level;
2328       if (level == tlevel) {
2329         // AC: we haven't incremented it earlier at start of teams construct,
2330         //     so do it here - at the end of teams construct
2331         team->t.t_level++;
2332       } else if (level == tlevel + 1) {
2333         // AC: we are exiting parallel inside teams, need to increment
2334         // serialization in order to restore it in the next call to
2335         // __kmpc_end_serialized_parallel
2336         team->t.t_serialized++;
2337       }
2338     }
2339     __kmpc_end_serialized_parallel(loc, gtid);
2340 
2341 #if OMPT_SUPPORT
2342     if (ompt_enabled.enabled) {
2343       __kmp_join_restore_state(master_th, parent_team);
2344     }
2345 #endif
2346 
2347     return;
2348   }
2349 
       // Read before the join barrier; used near the end of exit path 3 to
       // restore root->r.r_active.
2350   master_active = team->t.t_master_active;
2351 
2352   if (!exit_teams) {
2353     // AC: No barrier for internal teams at exit from teams construct.
2354     //     But there is barrier for external team (league).
2355     __kmp_internal_join(loc, gtid, team);
2356   } else {
2357     master_th->th.th_task_state =
2358         0; // AC: no tasking in teams (out of any parallel)
2359   }
2360 
2361   KMP_MB();
2362 
2363 #if OMPT_SUPPORT
       // Captured here; both are passed to __kmp_join_ompt() at the end of
       // exit path 3, after the team has gone through __kmp_free_team().
2364   ompt_data_t *parallel_data = &(team->t.ompt_team_info.parallel_data);
2365   void *codeptr = team->t.ompt_team_info.master_return_address;
2366 #endif
2367 
2368 #if USE_ITT_BUILD
2369   if (__itt_stack_caller_create_ptr) {
2370     __kmp_itt_stack_caller_destroy(
2371         (__itt_caller)team->t
2372             .t_stack_id); // destroy the stack stitching id after join barrier
2373   }
2374 
2375   // Mark end of "parallel" region for Intel(R) VTune(TM) analyzer.
2376   if (team->t.t_active_level == 1 &&
2377       !master_th->th.th_teams_microtask) { /* not in teams construct */
2378     master_th->th.th_ident = loc;
2379     // only one notification scheme (either "submit" or "forking/joined", not
2380     // both)
2381     if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
2382         __kmp_forkjoin_frames_mode == 3)
2383       __kmp_itt_frame_submit(gtid, team->t.t_region_time,
2384                              master_th->th.th_frame_time, 0, loc,
2385                              master_th->th.th_team_nproc, 1);
2386     else if ((__itt_frame_end_v3_ptr || KMP_ITT_DEBUG) &&
2387              !__kmp_forkjoin_frames_mode && __kmp_forkjoin_frames)
2388       __kmp_itt_region_joined(gtid);
2389   } // active_level == 1
2390 #endif /* USE_ITT_BUILD */
2391 
       // Exit path 2: end of a parallel region directly inside a teams
       // construct (the microtask is not __kmp_teams_master itself).
2392   if (master_th->th.th_teams_microtask && !exit_teams &&
2393       team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
2394       team->t.t_level == master_th->th.th_teams_level + 1) {
2395 // AC: We need to leave the team structure intact at the end of parallel
2396 // inside the teams construct, so that at the next parallel same (hot) team
2397 // works, only adjust nesting levels
2398 #if OMPT_SUPPORT
2399     ompt_data_t ompt_parallel_data = ompt_data_none;
2400     if (ompt_enabled.enabled) {
2401       ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2402       if (ompt_enabled.ompt_callback_implicit_task) {
2403         int ompt_team_size = team->t.t_nproc;
2404         ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
2405             ompt_scope_end, NULL, &(task_info->task_data), ompt_team_size,
2406             OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
2407       }
2408       task_info->frame.exit_frame = ompt_data_none;
2409       task_info->task_data = ompt_data_none;
           // Copy the parallel data by value before the lightweight task team
           // is unlinked; the copy is what __kmp_join_ompt() receives below.
2410       ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
2411       __ompt_lw_taskteam_unlink(master_th);
2412     }
2413 #endif
2414     /* Decrement our nested depth level */
2415     team->t.t_level--;
2416     team->t.t_active_level--;
2417     KMP_ATOMIC_DEC(&root->r.r_in_parallel);
2418 
2419     // Restore number of threads in the team if needed. This code relies on
2420     // the proper adjustment of th_teams_size.nth after the fork in
2421     // __kmp_teams_master on each teams master in the case that
2422     // __kmp_reserve_threads reduced it.
2423     if (master_th->th.th_team_nproc < master_th->th.th_teams_size.nth) {
2424       int old_num = master_th->th.th_team_nproc;
2425       int new_num = master_th->th.th_teams_size.nth;
2426       kmp_info_t **other_threads = team->t.t_threads;
2427       team->t.t_nproc = new_num;
           // Threads [0, old_num) only need the new team size.
2428       for (int i = 0; i < old_num; ++i) {
2429         other_threads[i]->th.th_team_nproc = new_num;
2430       }
2431       // Adjust states of non-used threads of the team
2432       for (int i = old_num; i < new_num; ++i) {
2433         // Re-initialize thread's barrier data.
2434         KMP_DEBUG_ASSERT(other_threads[i]);
2435         kmp_balign_t *balign = other_threads[i]->th.th_bar;
2436         for (int b = 0; b < bs_last_barrier; ++b) {
2437           balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
2438           KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
2439 #if USE_DEBUGGER
2440           balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
2441 #endif
2442         }
2443         if (__kmp_tasking_mode != tskm_immediate_exec) {
2444           // Synchronize thread's task state
2445           other_threads[i]->th.th_task_state = master_th->th.th_task_state;
2446         }
2447       }
2448     }
2449 
2450 #if OMPT_SUPPORT
2451     if (ompt_enabled.enabled) {
2452       __kmp_join_ompt(gtid, master_th, parent_team, &ompt_parallel_data,
2453                       OMPT_INVOKER(fork_context) | ompt_parallel_team, codeptr);
2454     }
2455 #endif
2456 
2457     return;
2458   }
2459 
       // Exit path 3: release the team and return to the parent team.
2460   /* do cleanup and restore the parent team */
2461   master_th->th.th_info.ds.ds_tid = team->t.t_master_tid;
2462   master_th->th.th_local.this_construct = team->t.t_master_this_cons;
2463 
2464   master_th->th.th_dispatch = &parent_team->t.t_dispatch[team->t.t_master_tid];
2465 
2466   /* jc: The following lock has instructions with REL and ACQ semantics,
2467      separating the parallel user code called in this parallel region
2468      from the serial user code called after this function returns. */
2469   __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
2470 
2471   if (!master_th->th.th_teams_microtask ||
2472       team->t.t_level > master_th->th.th_teams_level) {
2473     /* Decrement our nested depth level */
2474     KMP_ATOMIC_DEC(&root->r.r_in_parallel);
2475   }
2476   KMP_DEBUG_ASSERT(root->r.r_in_parallel >= 0);
2477 
2478 #if OMPT_SUPPORT
2479   if (ompt_enabled.enabled) {
2480     ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2481     if (ompt_enabled.ompt_callback_implicit_task) {
           // A team whose microtask is __kmp_teams_master is reported as an
           // initial task with team size 0; any other team as an implicit
           // task with its real size.
2482       int flags = (team_microtask == (void *)__kmp_teams_master)
2483                       ? ompt_task_initial
2484                       : ompt_task_implicit;
2485       int ompt_team_size = (flags == ompt_task_initial) ? 0 : team->t.t_nproc;
2486       ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
2487           ompt_scope_end, NULL, &(task_info->task_data), ompt_team_size,
2488           OMPT_CUR_TASK_INFO(master_th)->thread_num, flags);
2489     }
2490     task_info->frame.exit_frame = ompt_data_none;
2491     task_info->task_data = ompt_data_none;
2492   }
2493 #endif
2494 
       // NOTE(review): the thread number passed to this trace is the literal
       // 0, not gtid - confirm whether that is intentional.
2495   KF_TRACE(10, ("__kmp_join_call1: T#%d, this_thread=%p team=%p\n", 0,
2496                 master_th, team));
2497   __kmp_pop_current_task_from_thread(master_th);
2498 
2499 #if KMP_AFFINITY_SUPPORTED
2500   // Restore master thread's partition.
2501   master_th->th.th_first_place = team->t.t_first_place;
2502   master_th->th.th_last_place = team->t.t_last_place;
2503 #endif // KMP_AFFINITY_SUPPORTED
2504   master_th->th.th_def_allocator = team->t.t_def_allocator;
2505 
2506   updateHWFPControl(team);
2507 
       // Put back the value of t_master_active that was read before the join.
2508   if (root->r.r_active != master_active)
2509     root->r.r_active = master_active;
2510 
2511   __kmp_free_team(root, team USE_NESTED_HOT_ARG(
2512                             master_th)); // this will free worker threads
2513 
2514   /* this race was fun to find. make sure the following is in the critical
2515      region otherwise assertions may fail occasionally since the old team may be
2516      reallocated and the hierarchy appears inconsistent. it is actually safe to
2517      run and won't cause any bugs, but will cause those assertion failures. it's
2518      only one deref&assign so might as well put this in the critical region */
2519   master_th->th.th_team = parent_team;
2520   master_th->th.th_team_nproc = parent_team->t.t_nproc;
2521   master_th->th.th_team_master = parent_team->t.t_threads[0];
2522   master_th->th.th_team_serialized = parent_team->t.t_serialized;
2523 
2524   /* restore serialized team, if need be */
2525   if (parent_team->t.t_serialized &&
2526       parent_team != master_th->th.th_serial_team &&
2527       parent_team != root->r.r_root_team) {
2528     __kmp_free_team(root,
2529                     master_th->th.th_serial_team USE_NESTED_HOT_ARG(NULL));
2530     master_th->th.th_serial_team = parent_team;
2531   }
2532 
2533   if (__kmp_tasking_mode != tskm_immediate_exec) {
2534     if (master_th->th.th_task_state_top >
2535         0) { // Restore task state from memo stack
2536       KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack);
2537       // Remember master's state if we re-use this nested hot team
2538       master_th->th.th_task_state_memo_stack[master_th->th.th_task_state_top] =
2539           master_th->th.th_task_state;
2540       --master_th->th.th_task_state_top; // pop
2541       // Now restore state at this level
2542       master_th->th.th_task_state =
2543           master_th->th
2544               .th_task_state_memo_stack[master_th->th.th_task_state_top];
2545     }
2546     // Copy the task team from the parent team to the master thread
2547     master_th->th.th_task_team =
2548         parent_team->t.t_task_team[master_th->th.th_task_state];
2549     KA_TRACE(20,
2550              ("__kmp_join_call: Master T#%d restoring task_team %p / team %p\n",
2551               __kmp_gtid_from_thread(master_th), master_th->th.th_task_team,
2552               parent_team));
2553   }
2554 
2555   // TODO: GEH - cannot do this assertion because root thread not set up as
2556   // executing
2557   // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 0 );
2558   master_th->th.th_current_task->td_flags.executing = 1;
2559 
2560   __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2561 
2562 #if OMPT_SUPPORT
       // League vs. team is decided from the microtask pointer saved at entry.
2563   int flags =
2564       OMPT_INVOKER(fork_context) |
2565       ((team_microtask == (void *)__kmp_teams_master) ? ompt_parallel_league
2566                                                       : ompt_parallel_team);
2567   if (ompt_enabled.enabled) {
2568     __kmp_join_ompt(gtid, master_th, parent_team, parallel_data, flags,
2569                     codeptr);
2570   }
2571 #endif
2572 
2573   KMP_MB();
2574   KA_TRACE(20, ("__kmp_join_call: exit T#%d\n", gtid));
2575 }
2576 
2577 /* Check whether we should push an internal control record onto the
2578    serial team stack.  If so, do it.  */
2579 void __kmp_save_internal_controls(kmp_info_t *thread) {
2580 
2581   if (thread->th.th_team != thread->th.th_serial_team) {
2582     return;
2583   }
2584   if (thread->th.th_team->t.t_serialized > 1) {
2585     int push = 0;
2586 
2587     if (thread->th.th_team->t.t_control_stack_top == NULL) {
2588       push = 1;
2589     } else {
2590       if (thread->th.th_team->t.t_control_stack_top->serial_nesting_level !=
2591           thread->th.th_team->t.t_serialized) {
2592         push = 1;
2593       }
2594     }
2595     if (push) { /* push a record on the serial team's stack */
2596       kmp_internal_control_t *control =
2597           (kmp_internal_control_t *)__kmp_allocate(
2598               sizeof(kmp_internal_control_t));
2599 
2600       copy_icvs(control, &thread->th.th_current_task->td_icvs);
2601 
2602       control->serial_nesting_level = thread->th.th_team->t.t_serialized;
2603 
2604       control->next = thread->th.th_team->t.t_control_stack_top;
2605       thread->th.th_team->t.t_control_stack_top = control;
2606     }
2607   }
2608 }
2609 
2610 /* Changes set_nproc */
2611 void __kmp_set_num_threads(int new_nth, int gtid) {
2612   kmp_info_t *thread;
2613   kmp_root_t *root;
2614 
2615   KF_TRACE(10, ("__kmp_set_num_threads: new __kmp_nth = %d\n", new_nth));
2616   KMP_DEBUG_ASSERT(__kmp_init_serial);
2617 
2618   if (new_nth < 1)
2619     new_nth = 1;
2620   else if (new_nth > __kmp_max_nth)
2621     new_nth = __kmp_max_nth;
2622 
2623   KMP_COUNT_VALUE(OMP_set_numthreads, new_nth);
2624   thread = __kmp_threads[gtid];
2625   if (thread->th.th_current_task->td_icvs.nproc == new_nth)
2626     return; // nothing to do
2627 
2628   __kmp_save_internal_controls(thread);
2629 
2630   set__nproc(thread, new_nth);
2631 
2632   // If this omp_set_num_threads() call will cause the hot team size to be
2633   // reduced (in the absence of a num_threads clause), then reduce it now,
2634   // rather than waiting for the next parallel region.
2635   root = thread->th.th_root;
2636   if (__kmp_init_parallel && (!root->r.r_active) &&
2637       (root->r.r_hot_team->t.t_nproc > new_nth)
2638 #if KMP_NESTED_HOT_TEAMS
2639       && __kmp_hot_teams_max_level && !__kmp_hot_teams_mode
2640 #endif
2641       ) {
2642     kmp_team_t *hot_team = root->r.r_hot_team;
2643     int f;
2644 
2645     __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
2646 
2647     // Release the extra threads we don't need any more.
2648     for (f = new_nth; f < hot_team->t.t_nproc; f++) {
2649       KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL);
2650       if (__kmp_tasking_mode != tskm_immediate_exec) {
2651         // When decreasing team size, threads no longer in the team should unref
2652         // task team.
2653         hot_team->t.t_threads[f]->th.th_task_team = NULL;
2654       }
2655       __kmp_free_thread(hot_team->t.t_threads[f]);
2656       hot_team->t.t_threads[f] = NULL;
2657     }
2658     hot_team->t.t_nproc = new_nth;
2659 #if KMP_NESTED_HOT_TEAMS
2660     if (thread->th.th_hot_teams) {
2661       KMP_DEBUG_ASSERT(hot_team == thread->th.th_hot_teams[0].hot_team);
2662       thread->th.th_hot_teams[0].hot_team_nth = new_nth;
2663     }
2664 #endif
2665 
2666     __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2667 
2668     // Update the t_nproc field in the threads that are still active.
2669     for (f = 0; f < new_nth; f++) {
2670       KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL);
2671       hot_team->t.t_threads[f]->th.th_team_nproc = new_nth;
2672     }
2673     // Special flag in case omp_set_num_threads() call
2674     hot_team->t.t_size_changed = -1;
2675   }
2676 }
2677 
2678 /* Changes max_active_levels */
2679 void __kmp_set_max_active_levels(int gtid, int max_active_levels) {
2680   kmp_info_t *thread;
2681 
2682   KF_TRACE(10, ("__kmp_set_max_active_levels: new max_active_levels for thread "
2683                 "%d = (%d)\n",
2684                 gtid, max_active_levels));
2685   KMP_DEBUG_ASSERT(__kmp_init_serial);
2686 
2687   // validate max_active_levels
2688   if (max_active_levels < 0) {
2689     KMP_WARNING(ActiveLevelsNegative, max_active_levels);
2690     // We ignore this call if the user has specified a negative value.
2691     // The current setting won't be changed. The last valid setting will be
2692     // used. A warning will be issued (if warnings are allowed as controlled by
2693     // the KMP_WARNINGS env var).
2694     KF_TRACE(10, ("__kmp_set_max_active_levels: the call is ignored: new "
2695                   "max_active_levels for thread %d = (%d)\n",
2696                   gtid, max_active_levels));
2697     return;
2698   }
2699   if (max_active_levels <= KMP_MAX_ACTIVE_LEVELS_LIMIT) {
2700     // it's OK, the max_active_levels is within the valid range: [ 0;
2701     // KMP_MAX_ACTIVE_LEVELS_LIMIT ]
2702     // We allow a zero value. (implementation defined behavior)
2703   } else {
2704     KMP_WARNING(ActiveLevelsExceedLimit, max_active_levels,
2705                 KMP_MAX_ACTIVE_LEVELS_LIMIT);
2706     max_active_levels = KMP_MAX_ACTIVE_LEVELS_LIMIT;
2707     // Current upper limit is MAX_INT. (implementation defined behavior)
2708     // If the input exceeds the upper limit, we correct the input to be the
2709     // upper limit. (implementation defined behavior)
2710     // Actually, the flow should never get here until we use MAX_INT limit.
2711   }
2712   KF_TRACE(10, ("__kmp_set_max_active_levels: after validation: new "
2713                 "max_active_levels for thread %d = (%d)\n",
2714                 gtid, max_active_levels));
2715 
2716   thread = __kmp_threads[gtid];
2717 
2718   __kmp_save_internal_controls(thread);
2719 
2720   set__max_active_levels(thread, max_active_levels);
2721 }
2722 
2723 /* Gets max_active_levels */
2724 int __kmp_get_max_active_levels(int gtid) {
2725   kmp_info_t *thread;
2726 
2727   KF_TRACE(10, ("__kmp_get_max_active_levels: thread %d\n", gtid));
2728   KMP_DEBUG_ASSERT(__kmp_init_serial);
2729 
2730   thread = __kmp_threads[gtid];
2731   KMP_DEBUG_ASSERT(thread->th.th_current_task);
2732   KF_TRACE(10, ("__kmp_get_max_active_levels: thread %d, curtask=%p, "
2733                 "curtask_maxaclevel=%d\n",
2734                 gtid, thread->th.th_current_task,
2735                 thread->th.th_current_task->td_icvs.max_active_levels));
2736   return thread->th.th_current_task->td_icvs.max_active_levels;
2737 }
2738 
// Require both scheduling enum types to have exactly the size of an int.
// NOTE(review): presumably needed because the schedule routines below exchange
// these values through int-sized interfaces - confirm against the callers.
2739 KMP_BUILD_ASSERT(sizeof(kmp_sched_t) == sizeof(int));
2740 KMP_BUILD_ASSERT(sizeof(enum sched_type) == sizeof(int));
2741 
2742 /* Changes def_sched_var ICV values (run-time schedule kind and chunk) */
2743 void __kmp_set_schedule(int gtid, kmp_sched_t kind, int chunk) {
2744   kmp_info_t *thread;
2745   kmp_sched_t orig_kind;
2746   //    kmp_team_t *team;
2747 
2748   KF_TRACE(10, ("__kmp_set_schedule: new schedule for thread %d = (%d, %d)\n",
2749                 gtid, (int)kind, chunk));
2750   KMP_DEBUG_ASSERT(__kmp_init_serial);
2751 
2752   // Check if the kind parameter is valid, correct if needed.
2753   // Valid parameters should fit in one of two intervals - standard or extended:
2754   //       <lower>, <valid>, <upper_std>, <lower_ext>, <valid>, <upper>
2755   // 2008-01-25: 0,  1 - 4,       5,         100,     101 - 102, 103
2756   orig_kind = kind;
2757   kind = __kmp_sched_without_mods(kind);
2758 
2759   if (kind <= kmp_sched_lower || kind >= kmp_sched_upper ||
2760       (kind <= kmp_sched_lower_ext && kind >= kmp_sched_upper_std)) {
2761     // TODO: Hint needs attention in case we change the default schedule.
2762     __kmp_msg(kmp_ms_warning, KMP_MSG(ScheduleKindOutOfRange, kind),
2763               KMP_HNT(DefaultScheduleKindUsed, "static, no chunk"),
2764               __kmp_msg_null);
2765     kind = kmp_sched_default;
2766     chunk = 0; // ignore chunk value in case of bad kind
2767   }
2768 
2769   thread = __kmp_threads[gtid];
2770 
2771   __kmp_save_internal_controls(thread);
2772 
2773   if (kind < kmp_sched_upper_std) {
2774     if (kind == kmp_sched_static && chunk < KMP_DEFAULT_CHUNK) {
2775       // differ static chunked vs. unchunked:  chunk should be invalid to
2776       // indicate unchunked schedule (which is the default)
2777       thread->th.th_current_task->td_icvs.sched.r_sched_type = kmp_sch_static;
2778     } else {
2779       thread->th.th_current_task->td_icvs.sched.r_sched_type =
2780           __kmp_sch_map[kind - kmp_sched_lower - 1];
2781     }
2782   } else {
2783     //    __kmp_sch_map[ kind - kmp_sched_lower_ext + kmp_sched_upper_std -
2784     //    kmp_sched_lower - 2 ];
2785     thread->th.th_current_task->td_icvs.sched.r_sched_type =
2786         __kmp_sch_map[kind - kmp_sched_lower_ext + kmp_sched_upper_std -
2787                       kmp_sched_lower - 2];
2788   }
2789   __kmp_sched_apply_mods_intkind(
2790       orig_kind, &(thread->th.th_current_task->td_icvs.sched.r_sched_type));
2791   if (kind == kmp_sched_auto || chunk < 1) {
2792     // ignore parameter chunk for schedule auto
2793     thread->th.th_current_task->td_icvs.sched.chunk = KMP_DEFAULT_CHUNK;
2794   } else {
2795     thread->th.th_current_task->td_icvs.sched.chunk = chunk;
2796   }
2797 }
2798 
2799 /* Gets def_sched_var ICV values */
2800 void __kmp_get_schedule(int gtid, kmp_sched_t *kind, int *chunk) {
2801   kmp_info_t *thread;
2802   enum sched_type th_type;
2803 
2804   KF_TRACE(10, ("__kmp_get_schedule: thread %d\n", gtid));
2805   KMP_DEBUG_ASSERT(__kmp_init_serial);
2806 
2807   thread = __kmp_threads[gtid];
2808 
2809   th_type = thread->th.th_current_task->td_icvs.sched.r_sched_type;
2810   switch (SCHEDULE_WITHOUT_MODIFIERS(th_type)) {
2811   case kmp_sch_static:
2812   case kmp_sch_static_greedy:
2813   case kmp_sch_static_balanced:
2814     *kind = kmp_sched_static;
2815     __kmp_sched_apply_mods_stdkind(kind, th_type);
2816     *chunk = 0; // chunk was not set, try to show this fact via zero value
2817     return;
2818   case kmp_sch_static_chunked:
2819     *kind = kmp_sched_static;
2820     break;
2821   case kmp_sch_dynamic_chunked:
2822     *kind = kmp_sched_dynamic;
2823     break;
2824   case kmp_sch_guided_chunked:
2825   case kmp_sch_guided_iterative_chunked:
2826   case kmp_sch_guided_analytical_chunked:
2827     *kind = kmp_sched_guided;
2828     break;
2829   case kmp_sch_auto:
2830     *kind = kmp_sched_auto;
2831     break;
2832   case kmp_sch_trapezoidal:
2833     *kind = kmp_sched_trapezoidal;
2834     break;
2835 #if KMP_STATIC_STEAL_ENABLED
2836   case kmp_sch_static_steal:
2837     *kind = kmp_sched_static_steal;
2838     break;
2839 #endif
2840   default:
2841     KMP_FATAL(UnknownSchedulingType, th_type);
2842   }
2843 
2844   __kmp_sched_apply_mods_stdkind(kind, th_type);
2845   *chunk = thread->th.th_current_task->td_icvs.sched.chunk;
2846 }
2847 
2848 int __kmp_get_ancestor_thread_num(int gtid, int level) {
2849 
2850   int ii, dd;
2851   kmp_team_t *team;
2852   kmp_info_t *thr;
2853 
2854   KF_TRACE(10, ("__kmp_get_ancestor_thread_num: thread %d %d\n", gtid, level));
2855   KMP_DEBUG_ASSERT(__kmp_init_serial);
2856 
2857   // validate level
2858   if (level == 0)
2859     return 0;
2860   if (level < 0)
2861     return -1;
2862   thr = __kmp_threads[gtid];
2863   team = thr->th.th_team;
2864   ii = team->t.t_level;
2865   if (level > ii)
2866     return -1;
2867 
2868   if (thr->th.th_teams_microtask) {
2869     // AC: we are in teams region where multiple nested teams have same level
2870     int tlevel = thr->th.th_teams_level; // the level of the teams construct
2871     if (level <=
2872         tlevel) { // otherwise usual algorithm works (will not touch the teams)
2873       KMP_DEBUG_ASSERT(ii >= tlevel);
2874       // AC: As we need to pass by the teams league, we need to artificially
2875       // increase ii
2876       if (ii == tlevel) {
2877         ii += 2; // three teams have same level
2878       } else {
2879         ii++; // two teams have same level
2880       }
2881     }
2882   }
2883 
2884   if (ii == level)
2885     return __kmp_tid_from_gtid(gtid);
2886 
2887   dd = team->t.t_serialized;
2888   level++;
2889   while (ii > level) {
2890     for (dd = team->t.t_serialized; (dd > 0) && (ii > level); dd--, ii--) {
2891     }
2892     if ((team->t.t_serialized) && (!dd)) {
2893       team = team->t.t_parent;
2894       continue;
2895     }
2896     if (ii > level) {
2897       team = team->t.t_parent;
2898       dd = team->t.t_serialized;
2899       ii--;
2900     }
2901   }
2902 
2903   return (dd > 1) ? (0) : (team->t.t_master_tid);
2904 }
2905 
2906 int __kmp_get_team_size(int gtid, int level) {
2907 
2908   int ii, dd;
2909   kmp_team_t *team;
2910   kmp_info_t *thr;
2911 
2912   KF_TRACE(10, ("__kmp_get_team_size: thread %d %d\n", gtid, level));
2913   KMP_DEBUG_ASSERT(__kmp_init_serial);
2914 
2915   // validate level
2916   if (level == 0)
2917     return 1;
2918   if (level < 0)
2919     return -1;
2920   thr = __kmp_threads[gtid];
2921   team = thr->th.th_team;
2922   ii = team->t.t_level;
2923   if (level > ii)
2924     return -1;
2925 
2926   if (thr->th.th_teams_microtask) {
2927     // AC: we are in teams region where multiple nested teams have same level
2928     int tlevel = thr->th.th_teams_level; // the level of the teams construct
2929     if (level <=
2930         tlevel) { // otherwise usual algorithm works (will not touch the teams)
2931       KMP_DEBUG_ASSERT(ii >= tlevel);
2932       // AC: As we need to pass by the teams league, we need to artificially
2933       // increase ii
2934       if (ii == tlevel) {
2935         ii += 2; // three teams have same level
2936       } else {
2937         ii++; // two teams have same level
2938       }
2939     }
2940   }
2941 
2942   while (ii > level) {
2943     for (dd = team->t.t_serialized; (dd > 0) && (ii > level); dd--, ii--) {
2944     }
2945     if (team->t.t_serialized && (!dd)) {
2946       team = team->t.t_parent;
2947       continue;
2948     }
2949     if (ii > level) {
2950       team = team->t.t_parent;
2951       ii--;
2952     }
2953   }
2954 
2955   return team->t.t_nproc;
2956 }
2957 
2958 kmp_r_sched_t __kmp_get_schedule_global() {
2959   // This routine created because pairs (__kmp_sched, __kmp_chunk) and
2960   // (__kmp_static, __kmp_guided) may be changed by kmp_set_defaults
2961   // independently. So one can get the updated schedule here.
2962 
2963   kmp_r_sched_t r_sched;
2964 
2965   // create schedule from 4 globals: __kmp_sched, __kmp_chunk, __kmp_static,
2966   // __kmp_guided. __kmp_sched should keep original value, so that user can set
2967   // KMP_SCHEDULE multiple times, and thus have different run-time schedules in
2968   // different roots (even in OMP 2.5)
2969   enum sched_type s = SCHEDULE_WITHOUT_MODIFIERS(__kmp_sched);
2970   enum sched_type sched_modifiers = SCHEDULE_GET_MODIFIERS(__kmp_sched);
2971   if (s == kmp_sch_static) {
2972     // replace STATIC with more detailed schedule (balanced or greedy)
2973     r_sched.r_sched_type = __kmp_static;
2974   } else if (s == kmp_sch_guided_chunked) {
2975     // replace GUIDED with more detailed schedule (iterative or analytical)
2976     r_sched.r_sched_type = __kmp_guided;
2977   } else { // (STATIC_CHUNKED), or (DYNAMIC_CHUNKED), or other
2978     r_sched.r_sched_type = __kmp_sched;
2979   }
2980   SCHEDULE_SET_MODIFIERS(r_sched.r_sched_type, sched_modifiers);
2981 
2982   if (__kmp_chunk < KMP_DEFAULT_CHUNK) {
2983     // __kmp_chunk may be wrong here (if it was not ever set)
2984     r_sched.chunk = KMP_DEFAULT_CHUNK;
2985   } else {
2986     r_sched.chunk = __kmp_chunk;
2987   }
2988 
2989   return r_sched;
2990 }
2991 
2992 /* Allocate (realloc == FALSE) * or reallocate (realloc == TRUE)
2993    at least argc number of *t_argv entries for the requested team. */
2994 static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team, int realloc) {
2995 
2996   KMP_DEBUG_ASSERT(team);
2997   if (!realloc || argc > team->t.t_max_argc) {
2998 
2999     KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: needed entries=%d, "
3000                    "current entries=%d\n",
3001                    team->t.t_id, argc, (realloc) ? team->t.t_max_argc : 0));
3002     /* if previously allocated heap space for args, free them */
3003     if (realloc && team->t.t_argv != &team->t.t_inline_argv[0])
3004       __kmp_free((void *)team->t.t_argv);
3005 
3006     if (argc <= KMP_INLINE_ARGV_ENTRIES) {
3007       /* use unused space in the cache line for arguments */
3008       team->t.t_max_argc = KMP_INLINE_ARGV_ENTRIES;
3009       KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: inline allocate %d "
3010                      "argv entries\n",
3011                      team->t.t_id, team->t.t_max_argc));
3012       team->t.t_argv = &team->t.t_inline_argv[0];
3013       if (__kmp_storage_map) {
3014         __kmp_print_storage_map_gtid(
3015             -1, &team->t.t_inline_argv[0],
3016             &team->t.t_inline_argv[KMP_INLINE_ARGV_ENTRIES],
3017             (sizeof(void *) * KMP_INLINE_ARGV_ENTRIES), "team_%d.t_inline_argv",
3018             team->t.t_id);
3019       }
3020     } else {
3021       /* allocate space for arguments in the heap */
3022       team->t.t_max_argc = (argc <= (KMP_MIN_MALLOC_ARGV_ENTRIES >> 1))
3023                                ? KMP_MIN_MALLOC_ARGV_ENTRIES
3024                                : 2 * argc;
3025       KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: dynamic allocate %d "
3026                      "argv entries\n",
3027                      team->t.t_id, team->t.t_max_argc));
3028       team->t.t_argv =
3029           (void **)__kmp_page_allocate(sizeof(void *) * team->t.t_max_argc);
3030       if (__kmp_storage_map) {
3031         __kmp_print_storage_map_gtid(-1, &team->t.t_argv[0],
3032                                      &team->t.t_argv[team->t.t_max_argc],
3033                                      sizeof(void *) * team->t.t_max_argc,
3034                                      "team_%d.t_argv", team->t.t_id);
3035       }
3036     }
3037   }
3038 }
3039 
3040 static void __kmp_allocate_team_arrays(kmp_team_t *team, int max_nth) {
3041   int i;
3042   int num_disp_buff = max_nth > 1 ? __kmp_dispatch_num_buffers : 2;
3043   team->t.t_threads =
3044       (kmp_info_t **)__kmp_allocate(sizeof(kmp_info_t *) * max_nth);
3045   team->t.t_disp_buffer = (dispatch_shared_info_t *)__kmp_allocate(
3046       sizeof(dispatch_shared_info_t) * num_disp_buff);
3047   team->t.t_dispatch =
3048       (kmp_disp_t *)__kmp_allocate(sizeof(kmp_disp_t) * max_nth);
3049   team->t.t_implicit_task_taskdata =
3050       (kmp_taskdata_t *)__kmp_allocate(sizeof(kmp_taskdata_t) * max_nth);
3051   team->t.t_max_nproc = max_nth;
3052 
3053   /* setup dispatch buffers */
3054   for (i = 0; i < num_disp_buff; ++i) {
3055     team->t.t_disp_buffer[i].buffer_index = i;
3056     team->t.t_disp_buffer[i].doacross_buf_idx = i;
3057   }
3058 }
3059 
3060 static void __kmp_free_team_arrays(kmp_team_t *team) {
3061   /* Note: this does not free the threads in t_threads (__kmp_free_threads) */
3062   int i;
3063   for (i = 0; i < team->t.t_max_nproc; ++i) {
3064     if (team->t.t_dispatch[i].th_disp_buffer != NULL) {
3065       __kmp_free(team->t.t_dispatch[i].th_disp_buffer);
3066       team->t.t_dispatch[i].th_disp_buffer = NULL;
3067     }
3068   }
3069 #if KMP_USE_HIER_SCHED
3070   __kmp_dispatch_free_hierarchies(team);
3071 #endif
3072   __kmp_free(team->t.t_threads);
3073   __kmp_free(team->t.t_disp_buffer);
3074   __kmp_free(team->t.t_dispatch);
3075   __kmp_free(team->t.t_implicit_task_taskdata);
3076   team->t.t_threads = NULL;
3077   team->t.t_disp_buffer = NULL;
3078   team->t.t_dispatch = NULL;
3079   team->t.t_implicit_task_taskdata = 0;
3080 }
3081 
3082 static void __kmp_reallocate_team_arrays(kmp_team_t *team, int max_nth) {
3083   kmp_info_t **oldThreads = team->t.t_threads;
3084 
3085   __kmp_free(team->t.t_disp_buffer);
3086   __kmp_free(team->t.t_dispatch);
3087   __kmp_free(team->t.t_implicit_task_taskdata);
3088   __kmp_allocate_team_arrays(team, max_nth);
3089 
3090   KMP_MEMCPY(team->t.t_threads, oldThreads,
3091              team->t.t_nproc * sizeof(kmp_info_t *));
3092 
3093   __kmp_free(oldThreads);
3094 }
3095 
3096 static kmp_internal_control_t __kmp_get_global_icvs(void) {
3097 
3098   kmp_r_sched_t r_sched =
3099       __kmp_get_schedule_global(); // get current state of scheduling globals
3100 
3101   KMP_DEBUG_ASSERT(__kmp_nested_proc_bind.used > 0);
3102 
3103   kmp_internal_control_t g_icvs = {
3104     0, // int serial_nesting_level; //corresponds to value of th_team_serialized
3105     (kmp_int8)__kmp_global.g.g_dynamic, // internal control for dynamic
3106     // adjustment of threads (per thread)
3107     (kmp_int8)__kmp_env_blocktime, // int bt_set; //internal control for
3108     // whether blocktime is explicitly set
3109     __kmp_dflt_blocktime, // int blocktime; //internal control for blocktime
3110 #if KMP_USE_MONITOR
3111     __kmp_bt_intervals, // int bt_intervals; //internal control for blocktime
3112 // intervals
3113 #endif
3114     __kmp_dflt_team_nth, // int nproc; //internal control for # of threads for
3115     // next parallel region (per thread)
3116     // (use a max ub on value if __kmp_parallel_initialize not called yet)
3117     __kmp_cg_max_nth, // int thread_limit;
3118     __kmp_dflt_max_active_levels, // int max_active_levels; //internal control
3119     // for max_active_levels
3120     r_sched, // kmp_r_sched_t sched; //internal control for runtime schedule
3121     // {sched,chunk} pair
3122     __kmp_nested_proc_bind.bind_types[0],
3123     __kmp_default_device,
3124     NULL // struct kmp_internal_control *next;
3125   };
3126 
3127   return g_icvs;
3128 }
3129 
3130 static kmp_internal_control_t __kmp_get_x_global_icvs(const kmp_team_t *team) {
3131 
3132   kmp_internal_control_t gx_icvs;
3133   gx_icvs.serial_nesting_level =
3134       0; // probably =team->t.t_serial like in save_inter_controls
3135   copy_icvs(&gx_icvs, &team->t.t_threads[0]->th.th_current_task->td_icvs);
3136   gx_icvs.next = NULL;
3137 
3138   return gx_icvs;
3139 }
3140 
3141 static void __kmp_initialize_root(kmp_root_t *root) {
3142   int f;
3143   kmp_team_t *root_team;
3144   kmp_team_t *hot_team;
3145   int hot_team_max_nth;
3146   kmp_r_sched_t r_sched =
3147       __kmp_get_schedule_global(); // get current state of scheduling globals
3148   kmp_internal_control_t r_icvs = __kmp_get_global_icvs();
3149   KMP_DEBUG_ASSERT(root);
3150   KMP_ASSERT(!root->r.r_begin);
3151 
3152   /* setup the root state structure */
3153   __kmp_init_lock(&root->r.r_begin_lock);
3154   root->r.r_begin = FALSE;
3155   root->r.r_active = FALSE;
3156   root->r.r_in_parallel = 0;
3157   root->r.r_blocktime = __kmp_dflt_blocktime;
3158 
3159   /* setup the root team for this task */
3160   /* allocate the root team structure */
3161   KF_TRACE(10, ("__kmp_initialize_root: before root_team\n"));
3162 
3163   root_team =
3164       __kmp_allocate_team(root,
3165                           1, // new_nproc
3166                           1, // max_nproc
3167 #if OMPT_SUPPORT
3168                           ompt_data_none, // root parallel id
3169 #endif
3170                           __kmp_nested_proc_bind.bind_types[0], &r_icvs,
3171                           0 // argc
3172                           USE_NESTED_HOT_ARG(NULL) // master thread is unknown
3173                           );
3174 #if USE_DEBUGGER
3175   // Non-NULL value should be assigned to make the debugger display the root
3176   // team.
3177   TCW_SYNC_PTR(root_team->t.t_pkfn, (microtask_t)(~0));
3178 #endif
3179 
3180   KF_TRACE(10, ("__kmp_initialize_root: after root_team = %p\n", root_team));
3181 
3182   root->r.r_root_team = root_team;
3183   root_team->t.t_control_stack_top = NULL;
3184 
3185   /* initialize root team */
3186   root_team->t.t_threads[0] = NULL;
3187   root_team->t.t_nproc = 1;
3188   root_team->t.t_serialized = 1;
3189   // TODO???: root_team->t.t_max_active_levels = __kmp_dflt_max_active_levels;
3190   root_team->t.t_sched.sched = r_sched.sched;
3191   KA_TRACE(
3192       20,
3193       ("__kmp_initialize_root: init root team %d arrived: join=%u, plain=%u\n",
3194        root_team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
3195 
3196   /* setup the  hot team for this task */
3197   /* allocate the hot team structure */
3198   KF_TRACE(10, ("__kmp_initialize_root: before hot_team\n"));
3199 
3200   hot_team =
3201       __kmp_allocate_team(root,
3202                           1, // new_nproc
3203                           __kmp_dflt_team_nth_ub * 2, // max_nproc
3204 #if OMPT_SUPPORT
3205                           ompt_data_none, // root parallel id
3206 #endif
3207                           __kmp_nested_proc_bind.bind_types[0], &r_icvs,
3208                           0 // argc
3209                           USE_NESTED_HOT_ARG(NULL) // master thread is unknown
3210                           );
3211   KF_TRACE(10, ("__kmp_initialize_root: after hot_team = %p\n", hot_team));
3212 
3213   root->r.r_hot_team = hot_team;
3214   root_team->t.t_control_stack_top = NULL;
3215 
3216   /* first-time initialization */
3217   hot_team->t.t_parent = root_team;
3218 
3219   /* initialize hot team */
3220   hot_team_max_nth = hot_team->t.t_max_nproc;
3221   for (f = 0; f < hot_team_max_nth; ++f) {
3222     hot_team->t.t_threads[f] = NULL;
3223   }
3224   hot_team->t.t_nproc = 1;
3225   // TODO???: hot_team->t.t_max_active_levels = __kmp_dflt_max_active_levels;
3226   hot_team->t.t_sched.sched = r_sched.sched;
3227   hot_team->t.t_size_changed = 0;
3228 }
3229 
3230 #ifdef KMP_DEBUG
3231 
3232 typedef struct kmp_team_list_item {
3233   kmp_team_p const *entry;
3234   struct kmp_team_list_item *next;
3235 } kmp_team_list_item_t;
3236 typedef kmp_team_list_item_t *kmp_team_list_t;
3237 
3238 static void __kmp_print_structure_team_accum( // Add team to list of teams.
3239     kmp_team_list_t list, // List of teams.
3240     kmp_team_p const *team // Team to add.
3241     ) {
3242 
3243   // List must terminate with item where both entry and next are NULL.
3244   // Team is added to the list only once.
3245   // List is sorted in ascending order by team id.
3246   // Team id is *not* a key.
3247 
3248   kmp_team_list_t l;
3249 
3250   KMP_DEBUG_ASSERT(list != NULL);
3251   if (team == NULL) {
3252     return;
3253   }
3254 
3255   __kmp_print_structure_team_accum(list, team->t.t_parent);
3256   __kmp_print_structure_team_accum(list, team->t.t_next_pool);
3257 
3258   // Search list for the team.
3259   l = list;
3260   while (l->next != NULL && l->entry != team) {
3261     l = l->next;
3262   }
3263   if (l->next != NULL) {
3264     return; // Team has been added before, exit.
3265   }
3266 
3267   // Team is not found. Search list again for insertion point.
3268   l = list;
3269   while (l->next != NULL && l->entry->t.t_id <= team->t.t_id) {
3270     l = l->next;
3271   }
3272 
3273   // Insert team.
3274   {
3275     kmp_team_list_item_t *item = (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC(
3276         sizeof(kmp_team_list_item_t));
3277     *item = *l;
3278     l->entry = team;
3279     l->next = item;
3280   }
3281 }
3282 
3283 static void __kmp_print_structure_team(char const *title, kmp_team_p const *team
3284 
3285                                        ) {
3286   __kmp_printf("%s", title);
3287   if (team != NULL) {
3288     __kmp_printf("%2x %p\n", team->t.t_id, team);
3289   } else {
3290     __kmp_printf(" - (nil)\n");
3291   }
3292 }
3293 
3294 static void __kmp_print_structure_thread(char const *title,
3295                                          kmp_info_p const *thread) {
3296   __kmp_printf("%s", title);
3297   if (thread != NULL) {
3298     __kmp_printf("%2d %p\n", thread->th.th_info.ds.ds_gtid, thread);
3299   } else {
3300     __kmp_printf(" - (nil)\n");
3301   }
3302 }
3303 
3304 void __kmp_print_structure(void) {
3305 
3306   kmp_team_list_t list;
3307 
3308   // Initialize list of teams.
3309   list =
3310       (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC(sizeof(kmp_team_list_item_t));
3311   list->entry = NULL;
3312   list->next = NULL;
3313 
3314   __kmp_printf("\n------------------------------\nGlobal Thread "
3315                "Table\n------------------------------\n");
3316   {
3317     int gtid;
3318     for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3319       __kmp_printf("%2d", gtid);
3320       if (__kmp_threads != NULL) {
3321         __kmp_printf(" %p", __kmp_threads[gtid]);
3322       }
3323       if (__kmp_root != NULL) {
3324         __kmp_printf(" %p", __kmp_root[gtid]);
3325       }
3326       __kmp_printf("\n");
3327     }
3328   }
3329 
3330   // Print out __kmp_threads array.
3331   __kmp_printf("\n------------------------------\nThreads\n--------------------"
3332                "----------\n");
3333   if (__kmp_threads != NULL) {
3334     int gtid;
3335     for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3336       kmp_info_t const *thread = __kmp_threads[gtid];
3337       if (thread != NULL) {
3338         __kmp_printf("GTID %2d %p:\n", gtid, thread);
3339         __kmp_printf("    Our Root:        %p\n", thread->th.th_root);
3340         __kmp_print_structure_team("    Our Team:     ", thread->th.th_team);
3341         __kmp_print_structure_team("    Serial Team:  ",
3342                                    thread->th.th_serial_team);
3343         __kmp_printf("    Threads:      %2d\n", thread->th.th_team_nproc);
3344         __kmp_print_structure_thread("    Master:       ",
3345                                      thread->th.th_team_master);
3346         __kmp_printf("    Serialized?:  %2d\n", thread->th.th_team_serialized);
3347         __kmp_printf("    Set NProc:    %2d\n", thread->th.th_set_nproc);
3348         __kmp_printf("    Set Proc Bind: %2d\n", thread->th.th_set_proc_bind);
3349         __kmp_print_structure_thread("    Next in pool: ",
3350                                      thread->th.th_next_pool);
3351         __kmp_printf("\n");
3352         __kmp_print_structure_team_accum(list, thread->th.th_team);
3353         __kmp_print_structure_team_accum(list, thread->th.th_serial_team);
3354       }
3355     }
3356   } else {
3357     __kmp_printf("Threads array is not allocated.\n");
3358   }
3359 
3360   // Print out __kmp_root array.
3361   __kmp_printf("\n------------------------------\nUbers\n----------------------"
3362                "--------\n");
3363   if (__kmp_root != NULL) {
3364     int gtid;
3365     for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3366       kmp_root_t const *root = __kmp_root[gtid];
3367       if (root != NULL) {
3368         __kmp_printf("GTID %2d %p:\n", gtid, root);
3369         __kmp_print_structure_team("    Root Team:    ", root->r.r_root_team);
3370         __kmp_print_structure_team("    Hot Team:     ", root->r.r_hot_team);
3371         __kmp_print_structure_thread("    Uber Thread:  ",
3372                                      root->r.r_uber_thread);
3373         __kmp_printf("    Active?:      %2d\n", root->r.r_active);
3374         __kmp_printf("    In Parallel:  %2d\n",
3375                      KMP_ATOMIC_LD_RLX(&root->r.r_in_parallel));
3376         __kmp_printf("\n");
3377         __kmp_print_structure_team_accum(list, root->r.r_root_team);
3378         __kmp_print_structure_team_accum(list, root->r.r_hot_team);
3379       }
3380     }
3381   } else {
3382     __kmp_printf("Ubers array is not allocated.\n");
3383   }
3384 
3385   __kmp_printf("\n------------------------------\nTeams\n----------------------"
3386                "--------\n");
3387   while (list->next != NULL) {
3388     kmp_team_p const *team = list->entry;
3389     int i;
3390     __kmp_printf("Team %2x %p:\n", team->t.t_id, team);
3391     __kmp_print_structure_team("    Parent Team:      ", team->t.t_parent);
3392     __kmp_printf("    Master TID:       %2d\n", team->t.t_master_tid);
3393     __kmp_printf("    Max threads:      %2d\n", team->t.t_max_nproc);
3394     __kmp_printf("    Levels of serial: %2d\n", team->t.t_serialized);
3395     __kmp_printf("    Number threads:   %2d\n", team->t.t_nproc);
3396     for (i = 0; i < team->t.t_nproc; ++i) {
3397       __kmp_printf("    Thread %2d:      ", i);
3398       __kmp_print_structure_thread("", team->t.t_threads[i]);
3399     }
3400     __kmp_print_structure_team("    Next in pool:     ", team->t.t_next_pool);
3401     __kmp_printf("\n");
3402     list = list->next;
3403   }
3404 
3405   // Print out __kmp_thread_pool and __kmp_team_pool.
3406   __kmp_printf("\n------------------------------\nPools\n----------------------"
3407                "--------\n");
3408   __kmp_print_structure_thread("Thread pool:          ",
3409                                CCAST(kmp_info_t *, __kmp_thread_pool));
3410   __kmp_print_structure_team("Team pool:            ",
3411                              CCAST(kmp_team_t *, __kmp_team_pool));
3412   __kmp_printf("\n");
3413 
3414   // Free team list.
3415   while (list != NULL) {
3416     kmp_team_list_item_t *item = list;
3417     list = list->next;
3418     KMP_INTERNAL_FREE(item);
3419   }
3420 }
3421 
3422 #endif
3423 
//---------------------------------------------------------------------------
//  Per-thread fast random number generator support.
//  Table of primes; __kmp_init_random picks one entry per thread as the
//  generator's multiplier.
static const unsigned __kmp_primes[] = {
    0x9e3779b1u, 0xffe6cc59u, 0x2109f6ddu, 0x43977ab5u,
    0xba5703f5u, 0xb495a877u, 0xe1626741u, 0x79695e6bu,
    0xbc98c09fu, 0xd5bee2b3u, 0x287488f9u, 0x3af18231u,
    0x9677cd4du, 0xbe3a6929u, 0xadc6a877u, 0xdcf0674bu,
    0xbe4d6fe9u, 0x5f15e201u, 0x99afc3fdu, 0xf3f16801u,
    0xe222cfffu, 0x24ba5fdbu, 0x0620452du, 0x79f149e3u,
    0xc8b93f49u, 0x972702cdu, 0xb07dd827u, 0x6c97d5edu,
    0x085a3d61u, 0x46eb5ea7u, 0x3d9910edu, 0x2e687b5bu,
    0x29609227u, 0x6eb081f1u, 0x0954c4e1u, 0x9d114db9u,
    0x542acfa9u, 0xb3e6bd7bu, 0x0742d917u, 0xe9f3ffa7u,
    0x54581edbu, 0xf2480f45u, 0x0bb9288fu, 0xef1affc7u,
    0x85fa0ca7u, 0x3ccc14dbu, 0xe6baf34bu, 0x343377f7u,
    0x5ca19031u, 0xe6d9293bu, 0xf0a9f391u, 0x5d2e980bu,
    0xfc411073u, 0xc3749363u, 0xb892d829u, 0x3549366bu,
    0x629750adu, 0xb98294e5u, 0x892d9483u, 0xc235baf3u,
    0x3d2402a3u, 0x6bdef3c9u, 0xbec333cdu, 0x40c9520fu};
3439 
3440 //---------------------------------------------------------------------------
3441 //  __kmp_get_random: Get a random number using a linear congruential method.
3442 unsigned short __kmp_get_random(kmp_info_t *thread) {
3443   unsigned x = thread->th.th_x;
3444   unsigned short r = x >> 16;
3445 
3446   thread->th.th_x = x * thread->th.th_a + 1;
3447 
3448   KA_TRACE(30, ("__kmp_get_random: THREAD: %d, RETURN: %u\n",
3449                 thread->th.th_info.ds.ds_tid, r));
3450 
3451   return r;
3452 }
3453 //--------------------------------------------------------
3454 // __kmp_init_random: Initialize a random number generator
3455 void __kmp_init_random(kmp_info_t *thread) {
3456   unsigned seed = thread->th.th_info.ds.ds_tid;
3457 
3458   thread->th.th_a =
3459       __kmp_primes[seed % (sizeof(__kmp_primes) / sizeof(__kmp_primes[0]))];
3460   thread->th.th_x = (seed + 1) * thread->th.th_a + 1;
3461   KA_TRACE(30,
3462            ("__kmp_init_random: THREAD: %u; A: %u\n", seed, thread->th.th_a));
3463 }
3464 
#if KMP_OS_WINDOWS
/* Unregister root threads that are no longer running and whose root is not
 * active. Returns the sum of the values returned by
 * __kmp_unregister_root_other_thread for the reclaimed roots. */
static int __kmp_reclaim_dead_roots(void) {
  int reclaimed = 0;

  for (int gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
    if (!KMP_UBER_GTID(gtid)) {
      continue; // not a root thread slot
    }
    kmp_info_t *thr = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[gtid]);
    if (__kmp_still_running(thr)) {
      continue;
    }
    // AC: reclaim only roots died in non-active state
    if (__kmp_root[gtid]->r.r_active) {
      continue;
    }
    reclaimed += __kmp_unregister_root_other_thread(gtid);
  }
  return reclaimed;
}
#endif
3482 
3483 /* This function attempts to create free entries in __kmp_threads and
3484    __kmp_root, and returns the number of free entries generated.
3485 
3486    For Windows* OS static library, the first mechanism used is to reclaim array
3487    entries for root threads that are already dead.
3488 
3489    On all platforms, expansion is attempted on the arrays __kmp_threads_ and
3490    __kmp_root, with appropriate update to __kmp_threads_capacity. Array
3491    capacity is increased by doubling with clipping to __kmp_tp_capacity, if
3492    threadprivate cache array has been created. Synchronization with
3493    __kmpc_threadprivate_cached is done using __kmp_tp_cached_lock.
3494 
3495    After any dead root reclamation, if the clipping value allows array expansion
3496    to result in the generation of a total of nNeed free slots, the function does
3497    that expansion. If not, nothing is done beyond the possible initial root
3498    thread reclamation.
3499 
3500    If any argument is negative, the behavior is undefined. */
3501 static int __kmp_expand_threads(int nNeed) {
3502   int added = 0;
3503   int minimumRequiredCapacity;
3504   int newCapacity;
3505   kmp_info_t **newThreads;
3506   kmp_root_t **newRoot;
3507 
3508 // All calls to __kmp_expand_threads should be under __kmp_forkjoin_lock, so
3509 // resizing __kmp_threads does not need additional protection if foreign
3510 // threads are present
3511 
3512 #if KMP_OS_WINDOWS && !KMP_DYNAMIC_LIB
3513   /* only for Windows static library */
3514   /* reclaim array entries for root threads that are already dead */
3515   added = __kmp_reclaim_dead_roots();
3516 
3517   if (nNeed) {
3518     nNeed -= added;
3519     if (nNeed < 0)
3520       nNeed = 0;
3521   }
3522 #endif
3523   if (nNeed <= 0)
3524     return added;
3525 
3526   // Note that __kmp_threads_capacity is not bounded by __kmp_max_nth. If
3527   // __kmp_max_nth is set to some value less than __kmp_sys_max_nth by the
3528   // user via KMP_DEVICE_THREAD_LIMIT, then __kmp_threads_capacity may become
3529   // > __kmp_max_nth in one of two ways:
3530   //
3531   // 1) The initialization thread (gtid = 0) exits.  __kmp_threads[0]
3532   //    may not be resused by another thread, so we may need to increase
3533   //    __kmp_threads_capacity to __kmp_max_nth + 1.
3534   //
3535   // 2) New foreign root(s) are encountered.  We always register new foreign
3536   //    roots. This may cause a smaller # of threads to be allocated at
3537   //    subsequent parallel regions, but the worker threads hang around (and
3538   //    eventually go to sleep) and need slots in the __kmp_threads[] array.
3539   //
3540   // Anyway, that is the reason for moving the check to see if
3541   // __kmp_max_nth was exceeded into __kmp_reserve_threads()
3542   // instead of having it performed here. -BB
3543 
3544   KMP_DEBUG_ASSERT(__kmp_sys_max_nth >= __kmp_threads_capacity);
3545 
3546   /* compute expansion headroom to check if we can expand */
3547   if (__kmp_sys_max_nth - __kmp_threads_capacity < nNeed) {
3548     /* possible expansion too small -- give up */
3549     return added;
3550   }
3551   minimumRequiredCapacity = __kmp_threads_capacity + nNeed;
3552 
3553   newCapacity = __kmp_threads_capacity;
3554   do {
3555     newCapacity = newCapacity <= (__kmp_sys_max_nth >> 1) ? (newCapacity << 1)
3556                                                           : __kmp_sys_max_nth;
3557   } while (newCapacity < minimumRequiredCapacity);
3558   newThreads = (kmp_info_t **)__kmp_allocate(
3559       (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * newCapacity + CACHE_LINE);
3560   newRoot =
3561       (kmp_root_t **)((char *)newThreads + sizeof(kmp_info_t *) * newCapacity);
3562   KMP_MEMCPY(newThreads, __kmp_threads,
3563              __kmp_threads_capacity * sizeof(kmp_info_t *));
3564   KMP_MEMCPY(newRoot, __kmp_root,
3565              __kmp_threads_capacity * sizeof(kmp_root_t *));
3566 
3567   kmp_info_t **temp_threads = __kmp_threads;
3568   *(kmp_info_t * *volatile *)&__kmp_threads = newThreads;
3569   *(kmp_root_t * *volatile *)&__kmp_root = newRoot;
3570   __kmp_free(temp_threads);
3571   added += newCapacity - __kmp_threads_capacity;
3572   *(volatile int *)&__kmp_threads_capacity = newCapacity;
3573 
3574   if (newCapacity > __kmp_tp_capacity) {
3575     __kmp_acquire_bootstrap_lock(&__kmp_tp_cached_lock);
3576     if (__kmp_tp_cached && newCapacity > __kmp_tp_capacity) {
3577       __kmp_threadprivate_resize_cache(newCapacity);
3578     } else { // increase __kmp_tp_capacity to correspond with kmp_threads size
3579       *(volatile int *)&__kmp_tp_capacity = newCapacity;
3580     }
3581     __kmp_release_bootstrap_lock(&__kmp_tp_cached_lock);
3582   }
3583 
3584   return added;
3585 }
3586 
3587 /* Register the current thread as a root thread and obtain our gtid. We must
3588    have the __kmp_initz_lock held at this point. Argument TRUE only if are the
3589    thread that calls from __kmp_do_serial_initialize() */
3590 int __kmp_register_root(int initial_thread) {
3591   kmp_info_t *root_thread;
3592   kmp_root_t *root;
3593   int gtid;
3594   int capacity;
3595   __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
3596   KA_TRACE(20, ("__kmp_register_root: entered\n"));
3597   KMP_MB();
3598 
3599   /* 2007-03-02:
3600      If initial thread did not invoke OpenMP RTL yet, and this thread is not an
3601      initial one, "__kmp_all_nth >= __kmp_threads_capacity" condition does not
3602      work as expected -- it may return false (that means there is at least one
3603      empty slot in __kmp_threads array), but it is possible the only free slot
3604      is #0, which is reserved for initial thread and so cannot be used for this
3605      one. Following code workarounds this bug.
3606 
3607      However, right solution seems to be not reserving slot #0 for initial
3608      thread because:
3609      (1) there is no magic in slot #0,
3610      (2) we cannot detect initial thread reliably (the first thread which does
3611         serial initialization may be not a real initial thread).
3612   */
3613   capacity = __kmp_threads_capacity;
3614   if (!initial_thread && TCR_PTR(__kmp_threads[0]) == NULL) {
3615     --capacity;
3616   }
3617 
3618   /* see if there are too many threads */
3619   if (__kmp_all_nth >= capacity && !__kmp_expand_threads(1)) {
3620     if (__kmp_tp_cached) {
3621       __kmp_fatal(KMP_MSG(CantRegisterNewThread),
3622                   KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity),
3623                   KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null);
3624     } else {
3625       __kmp_fatal(KMP_MSG(CantRegisterNewThread), KMP_HNT(SystemLimitOnThreads),
3626                   __kmp_msg_null);
3627     }
3628   }
3629 
3630   /* find an available thread slot */
3631   /* Don't reassign the zero slot since we need that to only be used by initial
3632      thread */
3633   for (gtid = (initial_thread ? 0 : 1); TCR_PTR(__kmp_threads[gtid]) != NULL;
3634        gtid++)
3635     ;
3636   KA_TRACE(1,
3637            ("__kmp_register_root: found slot in threads array: T#%d\n", gtid));
3638   KMP_ASSERT(gtid < __kmp_threads_capacity);
3639 
3640   /* update global accounting */
3641   __kmp_all_nth++;
3642   TCW_4(__kmp_nth, __kmp_nth + 1);
3643 
3644   // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search) for low
3645   // numbers of procs, and method #2 (keyed API call) for higher numbers.
3646   if (__kmp_adjust_gtid_mode) {
3647     if (__kmp_all_nth >= __kmp_tls_gtid_min) {
3648       if (TCR_4(__kmp_gtid_mode) != 2) {
3649         TCW_4(__kmp_gtid_mode, 2);
3650       }
3651     } else {
3652       if (TCR_4(__kmp_gtid_mode) != 1) {
3653         TCW_4(__kmp_gtid_mode, 1);
3654       }
3655     }
3656   }
3657 
3658 #ifdef KMP_ADJUST_BLOCKTIME
3659   /* Adjust blocktime to zero if necessary            */
3660   /* Middle initialization might not have occurred yet */
3661   if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
3662     if (__kmp_nth > __kmp_avail_proc) {
3663       __kmp_zero_bt = TRUE;
3664     }
3665   }
3666 #endif /* KMP_ADJUST_BLOCKTIME */
3667 
3668   /* setup this new hierarchy */
3669   if (!(root = __kmp_root[gtid])) {
3670     root = __kmp_root[gtid] = (kmp_root_t *)__kmp_allocate(sizeof(kmp_root_t));
3671     KMP_DEBUG_ASSERT(!root->r.r_root_team);
3672   }
3673 
3674 #if KMP_STATS_ENABLED
3675   // Initialize stats as soon as possible (right after gtid assignment).
3676   __kmp_stats_thread_ptr = __kmp_stats_list->push_back(gtid);
3677   __kmp_stats_thread_ptr->startLife();
3678   KMP_SET_THREAD_STATE(SERIAL_REGION);
3679   KMP_INIT_PARTITIONED_TIMERS(OMP_serial);
3680 #endif
3681   __kmp_initialize_root(root);
3682 
3683   /* setup new root thread structure */
3684   if (root->r.r_uber_thread) {
3685     root_thread = root->r.r_uber_thread;
3686   } else {
3687     root_thread = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t));
3688     if (__kmp_storage_map) {
3689       __kmp_print_thread_storage_map(root_thread, gtid);
3690     }
3691     root_thread->th.th_info.ds.ds_gtid = gtid;
3692 #if OMPT_SUPPORT
3693     root_thread->th.ompt_thread_info.thread_data = ompt_data_none;
3694 #endif
3695     root_thread->th.th_root = root;
3696     if (__kmp_env_consistency_check) {
3697       root_thread->th.th_cons = __kmp_allocate_cons_stack(gtid);
3698     }
3699 #if USE_FAST_MEMORY
3700     __kmp_initialize_fast_memory(root_thread);
3701 #endif /* USE_FAST_MEMORY */
3702 
3703 #if KMP_USE_BGET
3704     KMP_DEBUG_ASSERT(root_thread->th.th_local.bget_data == NULL);
3705     __kmp_initialize_bget(root_thread);
3706 #endif
3707     __kmp_init_random(root_thread); // Initialize random number generator
3708   }
3709 
3710   /* setup the serial team held in reserve by the root thread */
3711   if (!root_thread->th.th_serial_team) {
3712     kmp_internal_control_t r_icvs = __kmp_get_global_icvs();
3713     KF_TRACE(10, ("__kmp_register_root: before serial_team\n"));
3714     root_thread->th.th_serial_team = __kmp_allocate_team(
3715         root, 1, 1,
3716 #if OMPT_SUPPORT
3717         ompt_data_none, // root parallel id
3718 #endif
3719         proc_bind_default, &r_icvs, 0 USE_NESTED_HOT_ARG(NULL));
3720   }
3721   KMP_ASSERT(root_thread->th.th_serial_team);
3722   KF_TRACE(10, ("__kmp_register_root: after serial_team = %p\n",
3723                 root_thread->th.th_serial_team));
3724 
3725   /* drop root_thread into place */
3726   TCW_SYNC_PTR(__kmp_threads[gtid], root_thread);
3727 
3728   root->r.r_root_team->t.t_threads[0] = root_thread;
3729   root->r.r_hot_team->t.t_threads[0] = root_thread;
3730   root_thread->th.th_serial_team->t.t_threads[0] = root_thread;
3731   // AC: the team created in reserve, not for execution (it is unused for now).
3732   root_thread->th.th_serial_team->t.t_serialized = 0;
3733   root->r.r_uber_thread = root_thread;
3734 
3735   /* initialize the thread, get it ready to go */
3736   __kmp_initialize_info(root_thread, root->r.r_root_team, 0, gtid);
3737   TCW_4(__kmp_init_gtid, TRUE);
3738 
3739   /* prepare the master thread for get_gtid() */
3740   __kmp_gtid_set_specific(gtid);
3741 
3742 #if USE_ITT_BUILD
3743   __kmp_itt_thread_name(gtid);
3744 #endif /* USE_ITT_BUILD */
3745 
3746 #ifdef KMP_TDATA_GTID
3747   __kmp_gtid = gtid;
3748 #endif
3749   __kmp_create_worker(gtid, root_thread, __kmp_stksize);
3750   KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() == gtid);
3751 
3752   KA_TRACE(20, ("__kmp_register_root: T#%d init T#%d(%d:%d) arrived: join=%u, "
3753                 "plain=%u\n",
3754                 gtid, __kmp_gtid_from_tid(0, root->r.r_hot_team),
3755                 root->r.r_hot_team->t.t_id, 0, KMP_INIT_BARRIER_STATE,
3756                 KMP_INIT_BARRIER_STATE));
3757   { // Initialize barrier data.
3758     int b;
3759     for (b = 0; b < bs_last_barrier; ++b) {
3760       root_thread->th.th_bar[b].bb.b_arrived = KMP_INIT_BARRIER_STATE;
3761 #if USE_DEBUGGER
3762       root_thread->th.th_bar[b].bb.b_worker_arrived = 0;
3763 #endif
3764     }
3765   }
3766   KMP_DEBUG_ASSERT(root->r.r_hot_team->t.t_bar[bs_forkjoin_barrier].b_arrived ==
3767                    KMP_INIT_BARRIER_STATE);
3768 
3769 #if KMP_AFFINITY_SUPPORTED
3770   root_thread->th.th_current_place = KMP_PLACE_UNDEFINED;
3771   root_thread->th.th_new_place = KMP_PLACE_UNDEFINED;
3772   root_thread->th.th_first_place = KMP_PLACE_UNDEFINED;
3773   root_thread->th.th_last_place = KMP_PLACE_UNDEFINED;
3774   if (TCR_4(__kmp_init_middle)) {
3775     __kmp_affinity_set_init_mask(gtid, TRUE);
3776   }
3777 #endif /* KMP_AFFINITY_SUPPORTED */
3778   root_thread->th.th_def_allocator = __kmp_def_allocator;
3779   root_thread->th.th_prev_level = 0;
3780   root_thread->th.th_prev_num_threads = 1;
3781 
3782   kmp_cg_root_t *tmp = (kmp_cg_root_t *)__kmp_allocate(sizeof(kmp_cg_root_t));
3783   tmp->cg_root = root_thread;
3784   tmp->cg_thread_limit = __kmp_cg_max_nth;
3785   tmp->cg_nthreads = 1;
3786   KA_TRACE(100, ("__kmp_register_root: Thread %p created node %p with"
3787                  " cg_nthreads init to 1\n",
3788                  root_thread, tmp));
3789   tmp->up = NULL;
3790   root_thread->th.th_cg_roots = tmp;
3791 
3792   __kmp_root_counter++;
3793 
3794 #if OMPT_SUPPORT
3795   if (!initial_thread && ompt_enabled.enabled) {
3796 
3797     kmp_info_t *root_thread = ompt_get_thread();
3798 
3799     ompt_set_thread_state(root_thread, ompt_state_overhead);
3800 
3801     if (ompt_enabled.ompt_callback_thread_begin) {
3802       ompt_callbacks.ompt_callback(ompt_callback_thread_begin)(
3803           ompt_thread_initial, __ompt_get_thread_data_internal());
3804     }
3805     ompt_data_t *task_data;
3806     ompt_data_t *parallel_data;
3807     __ompt_get_task_info_internal(0, NULL, &task_data, NULL, &parallel_data, NULL);
3808     if (ompt_enabled.ompt_callback_implicit_task) {
3809       ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
3810           ompt_scope_begin, parallel_data, task_data, 1, 1, ompt_task_initial);
3811     }
3812 
3813     ompt_set_thread_state(root_thread, ompt_state_work_serial);
3814   }
3815 #endif
3816 
3817   KMP_MB();
3818   __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
3819 
3820   return gtid;
3821 }
3822 
3823 #if KMP_NESTED_HOT_TEAMS
3824 static int __kmp_free_hot_teams(kmp_root_t *root, kmp_info_t *thr, int level,
3825                                 const int max_level) {
3826   int i, n, nth;
3827   kmp_hot_team_ptr_t *hot_teams = thr->th.th_hot_teams;
3828   if (!hot_teams || !hot_teams[level].hot_team) {
3829     return 0;
3830   }
3831   KMP_DEBUG_ASSERT(level < max_level);
3832   kmp_team_t *team = hot_teams[level].hot_team;
3833   nth = hot_teams[level].hot_team_nth;
3834   n = nth - 1; // master is not freed
3835   if (level < max_level - 1) {
3836     for (i = 0; i < nth; ++i) {
3837       kmp_info_t *th = team->t.t_threads[i];
3838       n += __kmp_free_hot_teams(root, th, level + 1, max_level);
3839       if (i > 0 && th->th.th_hot_teams) {
3840         __kmp_free(th->th.th_hot_teams);
3841         th->th.th_hot_teams = NULL;
3842       }
3843     }
3844   }
3845   __kmp_free_team(root, team, NULL);
3846   return n;
3847 }
3848 #endif
3849 
3850 // Resets a root thread and clear its root and hot teams.
3851 // Returns the number of __kmp_threads entries directly and indirectly freed.
3852 static int __kmp_reset_root(int gtid, kmp_root_t *root) {
3853   kmp_team_t *root_team = root->r.r_root_team;
3854   kmp_team_t *hot_team = root->r.r_hot_team;
3855   int n = hot_team->t.t_nproc;
3856   int i;
3857 
3858   KMP_DEBUG_ASSERT(!root->r.r_active);
3859 
3860   root->r.r_root_team = NULL;
3861   root->r.r_hot_team = NULL;
3862   // __kmp_free_team() does not free hot teams, so we have to clear r_hot_team
3863   // before call to __kmp_free_team().
3864   __kmp_free_team(root, root_team USE_NESTED_HOT_ARG(NULL));
3865 #if KMP_NESTED_HOT_TEAMS
3866   if (__kmp_hot_teams_max_level >
3867       0) { // need to free nested hot teams and their threads if any
3868     for (i = 0; i < hot_team->t.t_nproc; ++i) {
3869       kmp_info_t *th = hot_team->t.t_threads[i];
3870       if (__kmp_hot_teams_max_level > 1) {
3871         n += __kmp_free_hot_teams(root, th, 1, __kmp_hot_teams_max_level);
3872       }
3873       if (th->th.th_hot_teams) {
3874         __kmp_free(th->th.th_hot_teams);
3875         th->th.th_hot_teams = NULL;
3876       }
3877     }
3878   }
3879 #endif
3880   __kmp_free_team(root, hot_team USE_NESTED_HOT_ARG(NULL));
3881 
3882   // Before we can reap the thread, we need to make certain that all other
3883   // threads in the teams that had this root as ancestor have stopped trying to
3884   // steal tasks.
3885   if (__kmp_tasking_mode != tskm_immediate_exec) {
3886     __kmp_wait_to_unref_task_teams();
3887   }
3888 
3889 #if KMP_OS_WINDOWS
3890   /* Close Handle of root duplicated in __kmp_create_worker (tr #62919) */
3891   KA_TRACE(
3892       10, ("__kmp_reset_root: free handle, th = %p, handle = %" KMP_UINTPTR_SPEC
3893            "\n",
3894            (LPVOID) & (root->r.r_uber_thread->th),
3895            root->r.r_uber_thread->th.th_info.ds.ds_thread));
3896   __kmp_free_handle(root->r.r_uber_thread->th.th_info.ds.ds_thread);
3897 #endif /* KMP_OS_WINDOWS */
3898 
3899 #if OMPT_SUPPORT
3900   ompt_data_t *task_data;
3901   ompt_data_t *parallel_data;
3902   __ompt_get_task_info_internal(0, NULL, &task_data, NULL, &parallel_data, NULL);
3903   if (ompt_enabled.ompt_callback_implicit_task) {
3904     ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
3905         ompt_scope_end, parallel_data, task_data, 0, 1, ompt_task_initial);
3906   }
3907   if (ompt_enabled.ompt_callback_thread_end) {
3908     ompt_callbacks.ompt_callback(ompt_callback_thread_end)(
3909         &(root->r.r_uber_thread->th.ompt_thread_info.thread_data));
3910   }
3911 #endif
3912 
3913   TCW_4(__kmp_nth,
3914         __kmp_nth - 1); // __kmp_reap_thread will decrement __kmp_all_nth.
3915   i = root->r.r_uber_thread->th.th_cg_roots->cg_nthreads--;
3916   KA_TRACE(100, ("__kmp_reset_root: Thread %p decrement cg_nthreads on node %p"
3917                  " to %d\n",
3918                  root->r.r_uber_thread, root->r.r_uber_thread->th.th_cg_roots,
3919                  root->r.r_uber_thread->th.th_cg_roots->cg_nthreads));
3920   if (i == 1) {
3921     // need to free contention group structure
3922     KMP_DEBUG_ASSERT(root->r.r_uber_thread ==
3923                      root->r.r_uber_thread->th.th_cg_roots->cg_root);
3924     KMP_DEBUG_ASSERT(root->r.r_uber_thread->th.th_cg_roots->up == NULL);
3925     __kmp_free(root->r.r_uber_thread->th.th_cg_roots);
3926     root->r.r_uber_thread->th.th_cg_roots = NULL;
3927   }
3928   __kmp_reap_thread(root->r.r_uber_thread, 1);
3929 
3930   // We canot put root thread to __kmp_thread_pool, so we have to reap it
3931   // instead of freeing.
3932   root->r.r_uber_thread = NULL;
3933   /* mark root as no longer in use */
3934   root->r.r_begin = FALSE;
3935 
3936   return n;
3937 }
3938 
3939 void __kmp_unregister_root_current_thread(int gtid) {
3940   KA_TRACE(1, ("__kmp_unregister_root_current_thread: enter T#%d\n", gtid));
3941   /* this lock should be ok, since unregister_root_current_thread is never
3942      called during an abort, only during a normal close. furthermore, if you
3943      have the forkjoin lock, you should never try to get the initz lock */
3944   __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
3945   if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
3946     KC_TRACE(10, ("__kmp_unregister_root_current_thread: already finished, "
3947                   "exiting T#%d\n",
3948                   gtid));
3949     __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
3950     return;
3951   }
3952   kmp_root_t *root = __kmp_root[gtid];
3953 
3954   KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
3955   KMP_ASSERT(KMP_UBER_GTID(gtid));
3956   KMP_ASSERT(root == __kmp_threads[gtid]->th.th_root);
3957   KMP_ASSERT(root->r.r_active == FALSE);
3958 
3959   KMP_MB();
3960 
3961   kmp_info_t *thread = __kmp_threads[gtid];
3962   kmp_team_t *team = thread->th.th_team;
3963   kmp_task_team_t *task_team = thread->th.th_task_team;
3964 
3965   // we need to wait for the proxy tasks before finishing the thread
3966   if (task_team != NULL && task_team->tt.tt_found_proxy_tasks) {
3967 #if OMPT_SUPPORT
3968     // the runtime is shutting down so we won't report any events
3969     thread->th.ompt_thread_info.state = ompt_state_undefined;
3970 #endif
3971     __kmp_task_team_wait(thread, team USE_ITT_BUILD_ARG(NULL));
3972   }
3973 
3974   __kmp_reset_root(gtid, root);
3975 
3976   /* free up this thread slot */
3977   __kmp_gtid_set_specific(KMP_GTID_DNE);
3978 #ifdef KMP_TDATA_GTID
3979   __kmp_gtid = KMP_GTID_DNE;
3980 #endif
3981 
3982   KMP_MB();
3983   KC_TRACE(10,
3984            ("__kmp_unregister_root_current_thread: T#%d unregistered\n", gtid));
3985 
3986   __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
3987 }
3988 
3989 #if KMP_OS_WINDOWS
3990 /* __kmp_forkjoin_lock must be already held
3991    Unregisters a root thread that is not the current thread.  Returns the number
3992    of __kmp_threads entries freed as a result. */
3993 static int __kmp_unregister_root_other_thread(int gtid) {
3994   kmp_root_t *root = __kmp_root[gtid];
3995   int r;
3996 
3997   KA_TRACE(1, ("__kmp_unregister_root_other_thread: enter T#%d\n", gtid));
3998   KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
3999   KMP_ASSERT(KMP_UBER_GTID(gtid));
4000   KMP_ASSERT(root == __kmp_threads[gtid]->th.th_root);
4001   KMP_ASSERT(root->r.r_active == FALSE);
4002 
4003   r = __kmp_reset_root(gtid, root);
4004   KC_TRACE(10,
4005            ("__kmp_unregister_root_other_thread: T#%d unregistered\n", gtid));
4006   return r;
4007 }
4008 #endif
4009 
4010 #if KMP_DEBUG
4011 void __kmp_task_info() {
4012 
4013   kmp_int32 gtid = __kmp_entry_gtid();
4014   kmp_int32 tid = __kmp_tid_from_gtid(gtid);
4015   kmp_info_t *this_thr = __kmp_threads[gtid];
4016   kmp_team_t *steam = this_thr->th.th_serial_team;
4017   kmp_team_t *team = this_thr->th.th_team;
4018 
4019   __kmp_printf(
4020       "__kmp_task_info: gtid=%d tid=%d t_thread=%p team=%p steam=%p curtask=%p "
4021       "ptask=%p\n",
4022       gtid, tid, this_thr, team, steam, this_thr->th.th_current_task,
4023       team->t.t_implicit_task_taskdata[tid].td_parent);
4024 }
4025 #endif // KMP_DEBUG
4026 
4027 /* TODO optimize with one big memclr, take out what isn't needed, split
4028    responsibility to workers as much as possible, and delay initialization of
4029    features as much as possible  */
4030 static void __kmp_initialize_info(kmp_info_t *this_thr, kmp_team_t *team,
4031                                   int tid, int gtid) {
4032   /* this_thr->th.th_info.ds.ds_gtid is setup in
4033      kmp_allocate_thread/create_worker.
4034      this_thr->th.th_serial_team is setup in __kmp_allocate_thread */
4035   kmp_info_t *master = team->t.t_threads[0];
4036   KMP_DEBUG_ASSERT(this_thr != NULL);
4037   KMP_DEBUG_ASSERT(this_thr->th.th_serial_team);
4038   KMP_DEBUG_ASSERT(team);
4039   KMP_DEBUG_ASSERT(team->t.t_threads);
4040   KMP_DEBUG_ASSERT(team->t.t_dispatch);
4041   KMP_DEBUG_ASSERT(master);
4042   KMP_DEBUG_ASSERT(master->th.th_root);
4043 
4044   KMP_MB();
4045 
4046   TCW_SYNC_PTR(this_thr->th.th_team, team);
4047 
4048   this_thr->th.th_info.ds.ds_tid = tid;
4049   this_thr->th.th_set_nproc = 0;
4050   if (__kmp_tasking_mode != tskm_immediate_exec)
4051     // When tasking is possible, threads are not safe to reap until they are
4052     // done tasking; this will be set when tasking code is exited in wait
4053     this_thr->th.th_reap_state = KMP_NOT_SAFE_TO_REAP;
4054   else // no tasking --> always safe to reap
4055     this_thr->th.th_reap_state = KMP_SAFE_TO_REAP;
4056   this_thr->th.th_set_proc_bind = proc_bind_default;
4057 #if KMP_AFFINITY_SUPPORTED
4058   this_thr->th.th_new_place = this_thr->th.th_current_place;
4059 #endif
4060   this_thr->th.th_root = master->th.th_root;
4061 
4062   /* setup the thread's cache of the team structure */
4063   this_thr->th.th_team_nproc = team->t.t_nproc;
4064   this_thr->th.th_team_master = master;
4065   this_thr->th.th_team_serialized = team->t.t_serialized;
4066   TCW_PTR(this_thr->th.th_sleep_loc, NULL);
4067 
4068   KMP_DEBUG_ASSERT(team->t.t_implicit_task_taskdata);
4069 
4070   KF_TRACE(10, ("__kmp_initialize_info1: T#%d:%d this_thread=%p curtask=%p\n",
4071                 tid, gtid, this_thr, this_thr->th.th_current_task));
4072 
4073   __kmp_init_implicit_task(this_thr->th.th_team_master->th.th_ident, this_thr,
4074                            team, tid, TRUE);
4075 
4076   KF_TRACE(10, ("__kmp_initialize_info2: T#%d:%d this_thread=%p curtask=%p\n",
4077                 tid, gtid, this_thr, this_thr->th.th_current_task));
4078   // TODO: Initialize ICVs from parent; GEH - isn't that already done in
4079   // __kmp_initialize_team()?
4080 
4081   /* TODO no worksharing in speculative threads */
4082   this_thr->th.th_dispatch = &team->t.t_dispatch[tid];
4083 
4084   this_thr->th.th_local.this_construct = 0;
4085 
4086   if (!this_thr->th.th_pri_common) {
4087     this_thr->th.th_pri_common =
4088         (struct common_table *)__kmp_allocate(sizeof(struct common_table));
4089     if (__kmp_storage_map) {
4090       __kmp_print_storage_map_gtid(
4091           gtid, this_thr->th.th_pri_common, this_thr->th.th_pri_common + 1,
4092           sizeof(struct common_table), "th_%d.th_pri_common\n", gtid);
4093     }
4094     this_thr->th.th_pri_head = NULL;
4095   }
4096 
4097   if (this_thr != master && // Master's CG root is initialized elsewhere
4098       this_thr->th.th_cg_roots != master->th.th_cg_roots) { // CG root not set
4099     // Make new thread's CG root same as master's
4100     KMP_DEBUG_ASSERT(master->th.th_cg_roots);
4101     kmp_cg_root_t *tmp = this_thr->th.th_cg_roots;
4102     if (tmp) {
4103       // worker changes CG, need to check if old CG should be freed
4104       int i = tmp->cg_nthreads--;
4105       KA_TRACE(100, ("__kmp_initialize_info: Thread %p decrement cg_nthreads"
4106                      " on node %p of thread %p to %d\n",
4107                      this_thr, tmp, tmp->cg_root, tmp->cg_nthreads));
4108       if (i == 1) {
4109         __kmp_free(tmp); // last thread left CG --> free it
4110       }
4111     }
4112     this_thr->th.th_cg_roots = master->th.th_cg_roots;
4113     // Increment new thread's CG root's counter to add the new thread
4114     this_thr->th.th_cg_roots->cg_nthreads++;
4115     KA_TRACE(100, ("__kmp_initialize_info: Thread %p increment cg_nthreads on"
4116                    " node %p of thread %p to %d\n",
4117                    this_thr, this_thr->th.th_cg_roots,
4118                    this_thr->th.th_cg_roots->cg_root,
4119                    this_thr->th.th_cg_roots->cg_nthreads));
4120     this_thr->th.th_current_task->td_icvs.thread_limit =
4121         this_thr->th.th_cg_roots->cg_thread_limit;
4122   }
4123 
4124   /* Initialize dynamic dispatch */
4125   {
4126     volatile kmp_disp_t *dispatch = this_thr->th.th_dispatch;
4127     // Use team max_nproc since this will never change for the team.
4128     size_t disp_size =
4129         sizeof(dispatch_private_info_t) *
4130         (team->t.t_max_nproc == 1 ? 1 : __kmp_dispatch_num_buffers);
4131     KD_TRACE(10, ("__kmp_initialize_info: T#%d max_nproc: %d\n", gtid,
4132                   team->t.t_max_nproc));
4133     KMP_ASSERT(dispatch);
4134     KMP_DEBUG_ASSERT(team->t.t_dispatch);
4135     KMP_DEBUG_ASSERT(dispatch == &team->t.t_dispatch[tid]);
4136 
4137     dispatch->th_disp_index = 0;
4138     dispatch->th_doacross_buf_idx = 0;
4139     if (!dispatch->th_disp_buffer) {
4140       dispatch->th_disp_buffer =
4141           (dispatch_private_info_t *)__kmp_allocate(disp_size);
4142 
4143       if (__kmp_storage_map) {
4144         __kmp_print_storage_map_gtid(
4145             gtid, &dispatch->th_disp_buffer[0],
4146             &dispatch->th_disp_buffer[team->t.t_max_nproc == 1
4147                                           ? 1
4148                                           : __kmp_dispatch_num_buffers],
4149             disp_size, "th_%d.th_dispatch.th_disp_buffer "
4150                        "(team_%d.t_dispatch[%d].th_disp_buffer)",
4151             gtid, team->t.t_id, gtid);
4152       }
4153     } else {
4154       memset(&dispatch->th_disp_buffer[0], '\0', disp_size);
4155     }
4156 
4157     dispatch->th_dispatch_pr_current = 0;
4158     dispatch->th_dispatch_sh_current = 0;
4159 
4160     dispatch->th_deo_fcn = 0; /* ORDERED     */
4161     dispatch->th_dxo_fcn = 0; /* END ORDERED */
4162   }
4163 
4164   this_thr->th.th_next_pool = NULL;
4165 
4166   if (!this_thr->th.th_task_state_memo_stack) {
4167     size_t i;
4168     this_thr->th.th_task_state_memo_stack =
4169         (kmp_uint8 *)__kmp_allocate(4 * sizeof(kmp_uint8));
4170     this_thr->th.th_task_state_top = 0;
4171     this_thr->th.th_task_state_stack_sz = 4;
4172     for (i = 0; i < this_thr->th.th_task_state_stack_sz;
4173          ++i) // zero init the stack
4174       this_thr->th.th_task_state_memo_stack[i] = 0;
4175   }
4176 
4177   KMP_DEBUG_ASSERT(!this_thr->th.th_spin_here);
4178   KMP_DEBUG_ASSERT(this_thr->th.th_next_waiting == 0);
4179 
4180   KMP_MB();
4181 }
4182 
4183 /* allocate a new thread for the requesting team. this is only called from
4184    within a forkjoin critical section. we will first try to get an available
4185    thread from the thread pool. if none is available, we will fork a new one
4186    assuming we are able to create a new one. this should be assured, as the
4187    caller should check on this first. */
4188 kmp_info_t *__kmp_allocate_thread(kmp_root_t *root, kmp_team_t *team,
4189                                   int new_tid) {
4190   kmp_team_t *serial_team;
4191   kmp_info_t *new_thr;
4192   int new_gtid;
4193 
4194   KA_TRACE(20, ("__kmp_allocate_thread: T#%d\n", __kmp_get_gtid()));
4195   KMP_DEBUG_ASSERT(root && team);
4196 #if !KMP_NESTED_HOT_TEAMS
4197   KMP_DEBUG_ASSERT(KMP_MASTER_GTID(__kmp_get_gtid()));
4198 #endif
4199   KMP_MB();
4200 
4201   /* first, try to get one from the thread pool */
4202   if (__kmp_thread_pool) {
4203     new_thr = CCAST(kmp_info_t *, __kmp_thread_pool);
4204     __kmp_thread_pool = (volatile kmp_info_t *)new_thr->th.th_next_pool;
4205     if (new_thr == __kmp_thread_pool_insert_pt) {
4206       __kmp_thread_pool_insert_pt = NULL;
4207     }
4208     TCW_4(new_thr->th.th_in_pool, FALSE);
4209     __kmp_suspend_initialize_thread(new_thr);
4210     __kmp_lock_suspend_mx(new_thr);
4211     if (new_thr->th.th_active_in_pool == TRUE) {
4212       KMP_DEBUG_ASSERT(new_thr->th.th_active == TRUE);
4213       KMP_ATOMIC_DEC(&__kmp_thread_pool_active_nth);
4214       new_thr->th.th_active_in_pool = FALSE;
4215     }
4216     __kmp_unlock_suspend_mx(new_thr);
4217 
4218     KA_TRACE(20, ("__kmp_allocate_thread: T#%d using thread T#%d\n",
4219                   __kmp_get_gtid(), new_thr->th.th_info.ds.ds_gtid));
4220     KMP_ASSERT(!new_thr->th.th_team);
4221     KMP_DEBUG_ASSERT(__kmp_nth < __kmp_threads_capacity);
4222 
4223     /* setup the thread structure */
4224     __kmp_initialize_info(new_thr, team, new_tid,
4225                           new_thr->th.th_info.ds.ds_gtid);
4226     KMP_DEBUG_ASSERT(new_thr->th.th_serial_team);
4227 
4228     TCW_4(__kmp_nth, __kmp_nth + 1);
4229 
4230     new_thr->th.th_task_state = 0;
4231     new_thr->th.th_task_state_top = 0;
4232     new_thr->th.th_task_state_stack_sz = 4;
4233 
4234 #ifdef KMP_ADJUST_BLOCKTIME
4235     /* Adjust blocktime back to zero if necessary */
4236     /* Middle initialization might not have occurred yet */
4237     if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
4238       if (__kmp_nth > __kmp_avail_proc) {
4239         __kmp_zero_bt = TRUE;
4240       }
4241     }
4242 #endif /* KMP_ADJUST_BLOCKTIME */
4243 
4244 #if KMP_DEBUG
4245     // If thread entered pool via __kmp_free_thread, wait_flag should !=
4246     // KMP_BARRIER_PARENT_FLAG.
4247     int b;
4248     kmp_balign_t *balign = new_thr->th.th_bar;
4249     for (b = 0; b < bs_last_barrier; ++b)
4250       KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
4251 #endif
4252 
4253     KF_TRACE(10, ("__kmp_allocate_thread: T#%d using thread %p T#%d\n",
4254                   __kmp_get_gtid(), new_thr, new_thr->th.th_info.ds.ds_gtid));
4255 
4256     KMP_MB();
4257     return new_thr;
4258   }
4259 
4260   /* no, well fork a new one */
4261   KMP_ASSERT(__kmp_nth == __kmp_all_nth);
4262   KMP_ASSERT(__kmp_all_nth < __kmp_threads_capacity);
4263 
4264 #if KMP_USE_MONITOR
4265   // If this is the first worker thread the RTL is creating, then also
4266   // launch the monitor thread.  We try to do this as early as possible.
4267   if (!TCR_4(__kmp_init_monitor)) {
4268     __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
4269     if (!TCR_4(__kmp_init_monitor)) {
4270       KF_TRACE(10, ("before __kmp_create_monitor\n"));
4271       TCW_4(__kmp_init_monitor, 1);
4272       __kmp_create_monitor(&__kmp_monitor);
4273       KF_TRACE(10, ("after __kmp_create_monitor\n"));
4274 #if KMP_OS_WINDOWS
4275       // AC: wait until monitor has started. This is a fix for CQ232808.
4276       // The reason is that if the library is loaded/unloaded in a loop with
4277       // small (parallel) work in between, then there is high probability that
4278       // monitor thread started after the library shutdown. At shutdown it is
4279       // too late to cope with the problem, because when the master is in
4280       // DllMain (process detach) the monitor has no chances to start (it is
4281       // blocked), and master has no means to inform the monitor that the
4282       // library has gone, because all the memory which the monitor can access
4283       // is going to be released/reset.
4284       while (TCR_4(__kmp_init_monitor) < 2) {
4285         KMP_YIELD(TRUE);
4286       }
4287       KF_TRACE(10, ("after monitor thread has started\n"));
4288 #endif
4289     }
4290     __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
4291   }
4292 #endif
4293 
4294   KMP_MB();
4295   for (new_gtid = 1; TCR_PTR(__kmp_threads[new_gtid]) != NULL; ++new_gtid) {
4296     KMP_DEBUG_ASSERT(new_gtid < __kmp_threads_capacity);
4297   }
4298 
4299   /* allocate space for it. */
4300   new_thr = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t));
4301 
4302   TCW_SYNC_PTR(__kmp_threads[new_gtid], new_thr);
4303 
4304   if (__kmp_storage_map) {
4305     __kmp_print_thread_storage_map(new_thr, new_gtid);
4306   }
4307 
4308   // add the reserve serialized team, initialized from the team's master thread
4309   {
4310     kmp_internal_control_t r_icvs = __kmp_get_x_global_icvs(team);
4311     KF_TRACE(10, ("__kmp_allocate_thread: before th_serial/serial_team\n"));
4312     new_thr->th.th_serial_team = serial_team =
4313         (kmp_team_t *)__kmp_allocate_team(root, 1, 1,
4314 #if OMPT_SUPPORT
4315                                           ompt_data_none, // root parallel id
4316 #endif
4317                                           proc_bind_default, &r_icvs,
4318                                           0 USE_NESTED_HOT_ARG(NULL));
4319   }
4320   KMP_ASSERT(serial_team);
4321   serial_team->t.t_serialized = 0; // AC: the team created in reserve, not for
4322   // execution (it is unused for now).
4323   serial_team->t.t_threads[0] = new_thr;
4324   KF_TRACE(10,
4325            ("__kmp_allocate_thread: after th_serial/serial_team : new_thr=%p\n",
4326             new_thr));
4327 
4328   /* setup the thread structures */
4329   __kmp_initialize_info(new_thr, team, new_tid, new_gtid);
4330 
4331 #if USE_FAST_MEMORY
4332   __kmp_initialize_fast_memory(new_thr);
4333 #endif /* USE_FAST_MEMORY */
4334 
4335 #if KMP_USE_BGET
4336   KMP_DEBUG_ASSERT(new_thr->th.th_local.bget_data == NULL);
4337   __kmp_initialize_bget(new_thr);
4338 #endif
4339 
4340   __kmp_init_random(new_thr); // Initialize random number generator
4341 
4342   /* Initialize these only once when thread is grabbed for a team allocation */
4343   KA_TRACE(20,
4344            ("__kmp_allocate_thread: T#%d init go fork=%u, plain=%u\n",
4345             __kmp_get_gtid(), KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
4346 
4347   int b;
4348   kmp_balign_t *balign = new_thr->th.th_bar;
4349   for (b = 0; b < bs_last_barrier; ++b) {
4350     balign[b].bb.b_go = KMP_INIT_BARRIER_STATE;
4351     balign[b].bb.team = NULL;
4352     balign[b].bb.wait_flag = KMP_BARRIER_NOT_WAITING;
4353     balign[b].bb.use_oncore_barrier = 0;
4354   }
4355 
4356   new_thr->th.th_spin_here = FALSE;
4357   new_thr->th.th_next_waiting = 0;
4358 #if KMP_OS_UNIX
4359   new_thr->th.th_blocking = false;
4360 #endif
4361 
4362 #if KMP_AFFINITY_SUPPORTED
4363   new_thr->th.th_current_place = KMP_PLACE_UNDEFINED;
4364   new_thr->th.th_new_place = KMP_PLACE_UNDEFINED;
4365   new_thr->th.th_first_place = KMP_PLACE_UNDEFINED;
4366   new_thr->th.th_last_place = KMP_PLACE_UNDEFINED;
4367 #endif
4368   new_thr->th.th_def_allocator = __kmp_def_allocator;
4369   new_thr->th.th_prev_level = 0;
4370   new_thr->th.th_prev_num_threads = 1;
4371 
4372   TCW_4(new_thr->th.th_in_pool, FALSE);
4373   new_thr->th.th_active_in_pool = FALSE;
4374   TCW_4(new_thr->th.th_active, TRUE);
4375 
4376   /* adjust the global counters */
4377   __kmp_all_nth++;
4378   __kmp_nth++;
4379 
4380   // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search) for low
4381   // numbers of procs, and method #2 (keyed API call) for higher numbers.
4382   if (__kmp_adjust_gtid_mode) {
4383     if (__kmp_all_nth >= __kmp_tls_gtid_min) {
4384       if (TCR_4(__kmp_gtid_mode) != 2) {
4385         TCW_4(__kmp_gtid_mode, 2);
4386       }
4387     } else {
4388       if (TCR_4(__kmp_gtid_mode) != 1) {
4389         TCW_4(__kmp_gtid_mode, 1);
4390       }
4391     }
4392   }
4393 
4394 #ifdef KMP_ADJUST_BLOCKTIME
4395   /* Adjust blocktime back to zero if necessary       */
4396   /* Middle initialization might not have occurred yet */
4397   if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
4398     if (__kmp_nth > __kmp_avail_proc) {
4399       __kmp_zero_bt = TRUE;
4400     }
4401   }
4402 #endif /* KMP_ADJUST_BLOCKTIME */
4403 
4404   /* actually fork it and create the new worker thread */
4405   KF_TRACE(
4406       10, ("__kmp_allocate_thread: before __kmp_create_worker: %p\n", new_thr));
4407   __kmp_create_worker(new_gtid, new_thr, __kmp_stksize);
4408   KF_TRACE(10,
4409            ("__kmp_allocate_thread: after __kmp_create_worker: %p\n", new_thr));
4410 
4411   KA_TRACE(20, ("__kmp_allocate_thread: T#%d forked T#%d\n", __kmp_get_gtid(),
4412                 new_gtid));
4413   KMP_MB();
4414   return new_thr;
4415 }
4416 
4417 /* Reinitialize team for reuse.
4418    The hot team code calls this case at every fork barrier, so EPCC barrier
4419    test are extremely sensitive to changes in it, esp. writes to the team
4420    struct, which cause a cache invalidation in all threads.
4421    IF YOU TOUCH THIS ROUTINE, RUN EPCC C SYNCBENCH ON A BIG-IRON MACHINE!!! */
4422 static void __kmp_reinitialize_team(kmp_team_t *team,
4423                                     kmp_internal_control_t *new_icvs,
4424                                     ident_t *loc) {
4425   KF_TRACE(10, ("__kmp_reinitialize_team: enter this_thread=%p team=%p\n",
4426                 team->t.t_threads[0], team));
4427   KMP_DEBUG_ASSERT(team && new_icvs);
4428   KMP_DEBUG_ASSERT((!TCR_4(__kmp_init_parallel)) || new_icvs->nproc);
4429   KMP_CHECK_UPDATE(team->t.t_ident, loc);
4430 
4431   KMP_CHECK_UPDATE(team->t.t_id, KMP_GEN_TEAM_ID());
4432   // Copy ICVs to the master thread's implicit taskdata
4433   __kmp_init_implicit_task(loc, team->t.t_threads[0], team, 0, FALSE);
4434   copy_icvs(&team->t.t_implicit_task_taskdata[0].td_icvs, new_icvs);
4435 
4436   KF_TRACE(10, ("__kmp_reinitialize_team: exit this_thread=%p team=%p\n",
4437                 team->t.t_threads[0], team));
4438 }
4439 
4440 /* Initialize the team data structure.
4441    This assumes the t_threads and t_max_nproc are already set.
4442    Also, we don't touch the arguments */
4443 static void __kmp_initialize_team(kmp_team_t *team, int new_nproc,
4444                                   kmp_internal_control_t *new_icvs,
4445                                   ident_t *loc) {
4446   KF_TRACE(10, ("__kmp_initialize_team: enter: team=%p\n", team));
4447 
4448   /* verify */
4449   KMP_DEBUG_ASSERT(team);
4450   KMP_DEBUG_ASSERT(new_nproc <= team->t.t_max_nproc);
4451   KMP_DEBUG_ASSERT(team->t.t_threads);
4452   KMP_MB();
4453 
4454   team->t.t_master_tid = 0; /* not needed */
4455   /* team->t.t_master_bar;        not needed */
4456   team->t.t_serialized = new_nproc > 1 ? 0 : 1;
4457   team->t.t_nproc = new_nproc;
4458 
4459   /* team->t.t_parent     = NULL; TODO not needed & would mess up hot team */
4460   team->t.t_next_pool = NULL;
4461   /* memset( team->t.t_threads, 0, sizeof(kmp_info_t*)*new_nproc ); would mess
4462    * up hot team */
4463 
4464   TCW_SYNC_PTR(team->t.t_pkfn, NULL); /* not needed */
4465   team->t.t_invoke = NULL; /* not needed */
4466 
4467   // TODO???: team->t.t_max_active_levels       = new_max_active_levels;
4468   team->t.t_sched.sched = new_icvs->sched.sched;
4469 
4470 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
4471   team->t.t_fp_control_saved = FALSE; /* not needed */
4472   team->t.t_x87_fpu_control_word = 0; /* not needed */
4473   team->t.t_mxcsr = 0; /* not needed */
4474 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
4475 
4476   team->t.t_construct = 0;
4477 
4478   team->t.t_ordered.dt.t_value = 0;
4479   team->t.t_master_active = FALSE;
4480 
4481 #ifdef KMP_DEBUG
4482   team->t.t_copypriv_data = NULL; /* not necessary, but nice for debugging */
4483 #endif
4484 #if KMP_OS_WINDOWS
4485   team->t.t_copyin_counter = 0; /* for barrier-free copyin implementation */
4486 #endif
4487 
4488   team->t.t_control_stack_top = NULL;
4489 
4490   __kmp_reinitialize_team(team, new_icvs, loc);
4491 
4492   KMP_MB();
4493   KF_TRACE(10, ("__kmp_initialize_team: exit: team=%p\n", team));
4494 }
4495 
4496 #if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED
4497 /* Sets full mask for thread and returns old mask, no changes to structures. */
4498 static void
4499 __kmp_set_thread_affinity_mask_full_tmp(kmp_affin_mask_t *old_mask) {
4500   if (KMP_AFFINITY_CAPABLE()) {
4501     int status;
4502     if (old_mask != NULL) {
4503       status = __kmp_get_system_affinity(old_mask, TRUE);
4504       int error = errno;
4505       if (status != 0) {
4506         __kmp_fatal(KMP_MSG(ChangeThreadAffMaskError), KMP_ERR(error),
4507                     __kmp_msg_null);
4508       }
4509     }
4510     __kmp_set_system_affinity(__kmp_affin_fullMask, TRUE);
4511   }
4512 }
4513 #endif
4514 
4515 #if KMP_AFFINITY_SUPPORTED
4516 
4517 // __kmp_partition_places() is the heart of the OpenMP 4.0 affinity mechanism.
// It calculates the worker + master thread's partition based upon the parent
4519 // thread's partition, and binds each worker to a thread in their partition.
4520 // The master thread's partition should already include its current binding.
4521 static void __kmp_partition_places(kmp_team_t *team, int update_master_only) {
4522   // Copy the master thread's place partion to the team struct
4523   kmp_info_t *master_th = team->t.t_threads[0];
4524   KMP_DEBUG_ASSERT(master_th != NULL);
4525   kmp_proc_bind_t proc_bind = team->t.t_proc_bind;
4526   int first_place = master_th->th.th_first_place;
4527   int last_place = master_th->th.th_last_place;
4528   int masters_place = master_th->th.th_current_place;
4529   team->t.t_first_place = first_place;
4530   team->t.t_last_place = last_place;
4531 
4532   KA_TRACE(20, ("__kmp_partition_places: enter: proc_bind = %d T#%d(%d:0) "
4533                 "bound to place %d partition = [%d,%d]\n",
4534                 proc_bind, __kmp_gtid_from_thread(team->t.t_threads[0]),
4535                 team->t.t_id, masters_place, first_place, last_place));
4536 
4537   switch (proc_bind) {
4538 
4539   case proc_bind_default:
4540     // serial teams might have the proc_bind policy set to proc_bind_default. It
4541     // doesn't matter, as we don't rebind master thread for any proc_bind policy
4542     KMP_DEBUG_ASSERT(team->t.t_nproc == 1);
4543     break;
4544 
4545   case proc_bind_master: {
4546     int f;
4547     int n_th = team->t.t_nproc;
4548     for (f = 1; f < n_th; f++) {
4549       kmp_info_t *th = team->t.t_threads[f];
4550       KMP_DEBUG_ASSERT(th != NULL);
4551       th->th.th_first_place = first_place;
4552       th->th.th_last_place = last_place;
4553       th->th.th_new_place = masters_place;
4554       if (__kmp_display_affinity && masters_place != th->th.th_current_place &&
4555           team->t.t_display_affinity != 1) {
4556         team->t.t_display_affinity = 1;
4557       }
4558 
4559       KA_TRACE(100, ("__kmp_partition_places: master: T#%d(%d:%d) place %d "
4560                      "partition = [%d,%d]\n",
4561                      __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id,
4562                      f, masters_place, first_place, last_place));
4563     }
4564   } break;
4565 
4566   case proc_bind_close: {
4567     int f;
4568     int n_th = team->t.t_nproc;
4569     int n_places;
4570     if (first_place <= last_place) {
4571       n_places = last_place - first_place + 1;
4572     } else {
4573       n_places = __kmp_affinity_num_masks - first_place + last_place + 1;
4574     }
4575     if (n_th <= n_places) {
4576       int place = masters_place;
4577       for (f = 1; f < n_th; f++) {
4578         kmp_info_t *th = team->t.t_threads[f];
4579         KMP_DEBUG_ASSERT(th != NULL);
4580 
4581         if (place == last_place) {
4582           place = first_place;
4583         } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4584           place = 0;
4585         } else {
4586           place++;
4587         }
4588         th->th.th_first_place = first_place;
4589         th->th.th_last_place = last_place;
4590         th->th.th_new_place = place;
4591         if (__kmp_display_affinity && place != th->th.th_current_place &&
4592             team->t.t_display_affinity != 1) {
4593           team->t.t_display_affinity = 1;
4594         }
4595 
4596         KA_TRACE(100, ("__kmp_partition_places: close: T#%d(%d:%d) place %d "
4597                        "partition = [%d,%d]\n",
4598                        __kmp_gtid_from_thread(team->t.t_threads[f]),
4599                        team->t.t_id, f, place, first_place, last_place));
4600       }
4601     } else {
4602       int S, rem, gap, s_count;
4603       S = n_th / n_places;
4604       s_count = 0;
4605       rem = n_th - (S * n_places);
4606       gap = rem > 0 ? n_places / rem : n_places;
4607       int place = masters_place;
4608       int gap_ct = gap;
4609       for (f = 0; f < n_th; f++) {
4610         kmp_info_t *th = team->t.t_threads[f];
4611         KMP_DEBUG_ASSERT(th != NULL);
4612 
4613         th->th.th_first_place = first_place;
4614         th->th.th_last_place = last_place;
4615         th->th.th_new_place = place;
4616         if (__kmp_display_affinity && place != th->th.th_current_place &&
4617             team->t.t_display_affinity != 1) {
4618           team->t.t_display_affinity = 1;
4619         }
4620         s_count++;
4621 
4622         if ((s_count == S) && rem && (gap_ct == gap)) {
4623           // do nothing, add an extra thread to place on next iteration
4624         } else if ((s_count == S + 1) && rem && (gap_ct == gap)) {
4625           // we added an extra thread to this place; move to next place
4626           if (place == last_place) {
4627             place = first_place;
4628           } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4629             place = 0;
4630           } else {
4631             place++;
4632           }
4633           s_count = 0;
4634           gap_ct = 1;
4635           rem--;
4636         } else if (s_count == S) { // place full; don't add extra
4637           if (place == last_place) {
4638             place = first_place;
4639           } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4640             place = 0;
4641           } else {
4642             place++;
4643           }
4644           gap_ct++;
4645           s_count = 0;
4646         }
4647 
4648         KA_TRACE(100,
4649                  ("__kmp_partition_places: close: T#%d(%d:%d) place %d "
4650                   "partition = [%d,%d]\n",
4651                   __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id, f,
4652                   th->th.th_new_place, first_place, last_place));
4653       }
4654       KMP_DEBUG_ASSERT(place == masters_place);
4655     }
4656   } break;
4657 
4658   case proc_bind_spread: {
4659     int f;
4660     int n_th = team->t.t_nproc;
4661     int n_places;
4662     int thidx;
4663     if (first_place <= last_place) {
4664       n_places = last_place - first_place + 1;
4665     } else {
4666       n_places = __kmp_affinity_num_masks - first_place + last_place + 1;
4667     }
4668     if (n_th <= n_places) {
4669       int place = -1;
4670 
4671       if (n_places != static_cast<int>(__kmp_affinity_num_masks)) {
4672         int S = n_places / n_th;
4673         int s_count, rem, gap, gap_ct;
4674 
4675         place = masters_place;
4676         rem = n_places - n_th * S;
4677         gap = rem ? n_th / rem : 1;
4678         gap_ct = gap;
4679         thidx = n_th;
4680         if (update_master_only == 1)
4681           thidx = 1;
4682         for (f = 0; f < thidx; f++) {
4683           kmp_info_t *th = team->t.t_threads[f];
4684           KMP_DEBUG_ASSERT(th != NULL);
4685 
4686           th->th.th_first_place = place;
4687           th->th.th_new_place = place;
4688           if (__kmp_display_affinity && place != th->th.th_current_place &&
4689               team->t.t_display_affinity != 1) {
4690             team->t.t_display_affinity = 1;
4691           }
4692           s_count = 1;
4693           while (s_count < S) {
4694             if (place == last_place) {
4695               place = first_place;
4696             } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4697               place = 0;
4698             } else {
4699               place++;
4700             }
4701             s_count++;
4702           }
4703           if (rem && (gap_ct == gap)) {
4704             if (place == last_place) {
4705               place = first_place;
4706             } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4707               place = 0;
4708             } else {
4709               place++;
4710             }
4711             rem--;
4712             gap_ct = 0;
4713           }
4714           th->th.th_last_place = place;
4715           gap_ct++;
4716 
4717           if (place == last_place) {
4718             place = first_place;
4719           } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4720             place = 0;
4721           } else {
4722             place++;
4723           }
4724 
4725           KA_TRACE(100,
4726                    ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
4727                     "partition = [%d,%d], __kmp_affinity_num_masks: %u\n",
4728                     __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id,
4729                     f, th->th.th_new_place, th->th.th_first_place,
4730                     th->th.th_last_place, __kmp_affinity_num_masks));
4731         }
4732       } else {
4733         /* Having uniform space of available computation places I can create
4734            T partitions of round(P/T) size and put threads into the first
4735            place of each partition. */
4736         double current = static_cast<double>(masters_place);
4737         double spacing =
4738             (static_cast<double>(n_places + 1) / static_cast<double>(n_th));
4739         int first, last;
4740         kmp_info_t *th;
4741 
4742         thidx = n_th + 1;
4743         if (update_master_only == 1)
4744           thidx = 1;
4745         for (f = 0; f < thidx; f++) {
4746           first = static_cast<int>(current);
4747           last = static_cast<int>(current + spacing) - 1;
4748           KMP_DEBUG_ASSERT(last >= first);
4749           if (first >= n_places) {
4750             if (masters_place) {
4751               first -= n_places;
4752               last -= n_places;
4753               if (first == (masters_place + 1)) {
4754                 KMP_DEBUG_ASSERT(f == n_th);
4755                 first--;
4756               }
4757               if (last == masters_place) {
4758                 KMP_DEBUG_ASSERT(f == (n_th - 1));
4759                 last--;
4760               }
4761             } else {
4762               KMP_DEBUG_ASSERT(f == n_th);
4763               first = 0;
4764               last = 0;
4765             }
4766           }
4767           if (last >= n_places) {
4768             last = (n_places - 1);
4769           }
4770           place = first;
4771           current += spacing;
4772           if (f < n_th) {
4773             KMP_DEBUG_ASSERT(0 <= first);
4774             KMP_DEBUG_ASSERT(n_places > first);
4775             KMP_DEBUG_ASSERT(0 <= last);
4776             KMP_DEBUG_ASSERT(n_places > last);
4777             KMP_DEBUG_ASSERT(last_place >= first_place);
4778             th = team->t.t_threads[f];
4779             KMP_DEBUG_ASSERT(th);
4780             th->th.th_first_place = first;
4781             th->th.th_new_place = place;
4782             th->th.th_last_place = last;
4783             if (__kmp_display_affinity && place != th->th.th_current_place &&
4784                 team->t.t_display_affinity != 1) {
4785               team->t.t_display_affinity = 1;
4786             }
4787             KA_TRACE(100,
4788                      ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
4789                       "partition = [%d,%d], spacing = %.4f\n",
4790                       __kmp_gtid_from_thread(team->t.t_threads[f]),
4791                       team->t.t_id, f, th->th.th_new_place,
4792                       th->th.th_first_place, th->th.th_last_place, spacing));
4793           }
4794         }
4795       }
4796       KMP_DEBUG_ASSERT(update_master_only || place == masters_place);
4797     } else {
4798       int S, rem, gap, s_count;
4799       S = n_th / n_places;
4800       s_count = 0;
4801       rem = n_th - (S * n_places);
4802       gap = rem > 0 ? n_places / rem : n_places;
4803       int place = masters_place;
4804       int gap_ct = gap;
4805       thidx = n_th;
4806       if (update_master_only == 1)
4807         thidx = 1;
4808       for (f = 0; f < thidx; f++) {
4809         kmp_info_t *th = team->t.t_threads[f];
4810         KMP_DEBUG_ASSERT(th != NULL);
4811 
4812         th->th.th_first_place = place;
4813         th->th.th_last_place = place;
4814         th->th.th_new_place = place;
4815         if (__kmp_display_affinity && place != th->th.th_current_place &&
4816             team->t.t_display_affinity != 1) {
4817           team->t.t_display_affinity = 1;
4818         }
4819         s_count++;
4820 
4821         if ((s_count == S) && rem && (gap_ct == gap)) {
4822           // do nothing, add an extra thread to place on next iteration
4823         } else if ((s_count == S + 1) && rem && (gap_ct == gap)) {
4824           // we added an extra thread to this place; move on to next place
4825           if (place == last_place) {
4826             place = first_place;
4827           } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4828             place = 0;
4829           } else {
4830             place++;
4831           }
4832           s_count = 0;
4833           gap_ct = 1;
4834           rem--;
4835         } else if (s_count == S) { // place is full; don't add extra thread
4836           if (place == last_place) {
4837             place = first_place;
4838           } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4839             place = 0;
4840           } else {
4841             place++;
4842           }
4843           gap_ct++;
4844           s_count = 0;
4845         }
4846 
4847         KA_TRACE(100, ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
4848                        "partition = [%d,%d]\n",
4849                        __kmp_gtid_from_thread(team->t.t_threads[f]),
4850                        team->t.t_id, f, th->th.th_new_place,
4851                        th->th.th_first_place, th->th.th_last_place));
4852       }
4853       KMP_DEBUG_ASSERT(update_master_only || place == masters_place);
4854     }
4855   } break;
4856 
4857   default:
4858     break;
4859   }
4860 
4861   KA_TRACE(20, ("__kmp_partition_places: exit T#%d\n", team->t.t_id));
4862 }
4863 
4864 #endif // KMP_AFFINITY_SUPPORTED
4865 
4866 /* allocate a new team data structure to use.  take one off of the free pool if
4867    available */
4868 kmp_team_t *
4869 __kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc,
4870 #if OMPT_SUPPORT
4871                     ompt_data_t ompt_parallel_data,
4872 #endif
4873                     kmp_proc_bind_t new_proc_bind,
4874                     kmp_internal_control_t *new_icvs,
4875                     int argc USE_NESTED_HOT_ARG(kmp_info_t *master)) {
4876   KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_allocate_team);
4877   int f;
4878   kmp_team_t *team;
4879   int use_hot_team = !root->r.r_active;
4880   int level = 0;
4881 
4882   KA_TRACE(20, ("__kmp_allocate_team: called\n"));
4883   KMP_DEBUG_ASSERT(new_nproc >= 1 && argc >= 0);
4884   KMP_DEBUG_ASSERT(max_nproc >= new_nproc);
4885   KMP_MB();
4886 
4887 #if KMP_NESTED_HOT_TEAMS
4888   kmp_hot_team_ptr_t *hot_teams;
4889   if (master) {
4890     team = master->th.th_team;
4891     level = team->t.t_active_level;
4892     if (master->th.th_teams_microtask) { // in teams construct?
4893       if (master->th.th_teams_size.nteams > 1 &&
4894           ( // #teams > 1
4895               team->t.t_pkfn ==
4896                   (microtask_t)__kmp_teams_master || // inner fork of the teams
4897               master->th.th_teams_level <
4898                   team->t.t_level)) { // or nested parallel inside the teams
4899         ++level; // not increment if #teams==1, or for outer fork of the teams;
4900         // increment otherwise
4901       }
4902     }
4903     hot_teams = master->th.th_hot_teams;
4904     if (level < __kmp_hot_teams_max_level && hot_teams &&
4905         hot_teams[level]
4906             .hot_team) { // hot team has already been allocated for given level
4907       use_hot_team = 1;
4908     } else {
4909       use_hot_team = 0;
4910     }
4911   }
4912 #endif
4913   // Optimization to use a "hot" team
4914   if (use_hot_team && new_nproc > 1) {
4915     KMP_DEBUG_ASSERT(new_nproc <= max_nproc);
4916 #if KMP_NESTED_HOT_TEAMS
4917     team = hot_teams[level].hot_team;
4918 #else
4919     team = root->r.r_hot_team;
4920 #endif
4921 #if KMP_DEBUG
4922     if (__kmp_tasking_mode != tskm_immediate_exec) {
4923       KA_TRACE(20, ("__kmp_allocate_team: hot team task_team[0] = %p "
4924                     "task_team[1] = %p before reinit\n",
4925                     team->t.t_task_team[0], team->t.t_task_team[1]));
4926     }
4927 #endif
4928 
4929     // Has the number of threads changed?
4930     /* Let's assume the most common case is that the number of threads is
4931        unchanged, and put that case first. */
4932     if (team->t.t_nproc == new_nproc) { // Check changes in number of threads
4933       KA_TRACE(20, ("__kmp_allocate_team: reusing hot team\n"));
4934       // This case can mean that omp_set_num_threads() was called and the hot
4935       // team size was already reduced, so we check the special flag
4936       if (team->t.t_size_changed == -1) {
4937         team->t.t_size_changed = 1;
4938       } else {
4939         KMP_CHECK_UPDATE(team->t.t_size_changed, 0);
4940       }
4941 
4942       // TODO???: team->t.t_max_active_levels = new_max_active_levels;
4943       kmp_r_sched_t new_sched = new_icvs->sched;
4944       // set master's schedule as new run-time schedule
4945       KMP_CHECK_UPDATE(team->t.t_sched.sched, new_sched.sched);
4946 
4947       __kmp_reinitialize_team(team, new_icvs,
4948                               root->r.r_uber_thread->th.th_ident);
4949 
4950       KF_TRACE(10, ("__kmp_allocate_team2: T#%d, this_thread=%p team=%p\n", 0,
4951                     team->t.t_threads[0], team));
4952       __kmp_push_current_task_to_thread(team->t.t_threads[0], team, 0);
4953 
4954 #if KMP_AFFINITY_SUPPORTED
4955       if ((team->t.t_size_changed == 0) &&
4956           (team->t.t_proc_bind == new_proc_bind)) {
4957         if (new_proc_bind == proc_bind_spread) {
4958           __kmp_partition_places(
4959               team, 1); // add flag to update only master for spread
4960         }
4961         KA_TRACE(200, ("__kmp_allocate_team: reusing hot team #%d bindings: "
4962                        "proc_bind = %d, partition = [%d,%d]\n",
4963                        team->t.t_id, new_proc_bind, team->t.t_first_place,
4964                        team->t.t_last_place));
4965       } else {
4966         KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
4967         __kmp_partition_places(team);
4968       }
4969 #else
4970       KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
4971 #endif /* KMP_AFFINITY_SUPPORTED */
4972     } else if (team->t.t_nproc > new_nproc) {
4973       KA_TRACE(20,
4974                ("__kmp_allocate_team: decreasing hot team thread count to %d\n",
4975                 new_nproc));
4976 
4977       team->t.t_size_changed = 1;
4978 #if KMP_NESTED_HOT_TEAMS
4979       if (__kmp_hot_teams_mode == 0) {
4980         // AC: saved number of threads should correspond to team's value in this
4981         // mode, can be bigger in mode 1, when hot team has threads in reserve
4982         KMP_DEBUG_ASSERT(hot_teams[level].hot_team_nth == team->t.t_nproc);
4983         hot_teams[level].hot_team_nth = new_nproc;
4984 #endif // KMP_NESTED_HOT_TEAMS
4985         /* release the extra threads we don't need any more */
4986         for (f = new_nproc; f < team->t.t_nproc; f++) {
4987           KMP_DEBUG_ASSERT(team->t.t_threads[f]);
4988           if (__kmp_tasking_mode != tskm_immediate_exec) {
4989             // When decreasing team size, threads no longer in the team should
4990             // unref task team.
4991             team->t.t_threads[f]->th.th_task_team = NULL;
4992           }
4993           __kmp_free_thread(team->t.t_threads[f]);
4994           team->t.t_threads[f] = NULL;
4995         }
4996 #if KMP_NESTED_HOT_TEAMS
4997       } // (__kmp_hot_teams_mode == 0)
4998       else {
4999         // When keeping extra threads in team, switch threads to wait on own
5000         // b_go flag
5001         for (f = new_nproc; f < team->t.t_nproc; ++f) {
5002           KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5003           kmp_balign_t *balign = team->t.t_threads[f]->th.th_bar;
5004           for (int b = 0; b < bs_last_barrier; ++b) {
5005             if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG) {
5006               balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG;
5007             }
5008             KMP_CHECK_UPDATE(balign[b].bb.leaf_kids, 0);
5009           }
5010         }
5011       }
5012 #endif // KMP_NESTED_HOT_TEAMS
5013       team->t.t_nproc = new_nproc;
5014       // TODO???: team->t.t_max_active_levels = new_max_active_levels;
5015       KMP_CHECK_UPDATE(team->t.t_sched.sched, new_icvs->sched.sched);
5016       __kmp_reinitialize_team(team, new_icvs,
5017                               root->r.r_uber_thread->th.th_ident);
5018 
5019       // Update remaining threads
5020       for (f = 0; f < new_nproc; ++f) {
5021         team->t.t_threads[f]->th.th_team_nproc = new_nproc;
5022       }
5023 
5024       // restore the current task state of the master thread: should be the
5025       // implicit task
5026       KF_TRACE(10, ("__kmp_allocate_team: T#%d, this_thread=%p team=%p\n", 0,
5027                     team->t.t_threads[0], team));
5028 
5029       __kmp_push_current_task_to_thread(team->t.t_threads[0], team, 0);
5030 
5031 #ifdef KMP_DEBUG
5032       for (f = 0; f < team->t.t_nproc; f++) {
5033         KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
5034                          team->t.t_threads[f]->th.th_team_nproc ==
5035                              team->t.t_nproc);
5036       }
5037 #endif
5038 
5039       KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5040 #if KMP_AFFINITY_SUPPORTED
5041       __kmp_partition_places(team);
5042 #endif
5043     } else { // team->t.t_nproc < new_nproc
5044 #if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED
5045       kmp_affin_mask_t *old_mask;
5046       if (KMP_AFFINITY_CAPABLE()) {
5047         KMP_CPU_ALLOC(old_mask);
5048       }
5049 #endif
5050 
5051       KA_TRACE(20,
5052                ("__kmp_allocate_team: increasing hot team thread count to %d\n",
5053                 new_nproc));
5054 
5055       team->t.t_size_changed = 1;
5056 
5057 #if KMP_NESTED_HOT_TEAMS
5058       int avail_threads = hot_teams[level].hot_team_nth;
5059       if (new_nproc < avail_threads)
5060         avail_threads = new_nproc;
5061       kmp_info_t **other_threads = team->t.t_threads;
5062       for (f = team->t.t_nproc; f < avail_threads; ++f) {
5063         // Adjust barrier data of reserved threads (if any) of the team
5064         // Other data will be set in __kmp_initialize_info() below.
5065         int b;
5066         kmp_balign_t *balign = other_threads[f]->th.th_bar;
5067         for (b = 0; b < bs_last_barrier; ++b) {
5068           balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5069           KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
5070 #if USE_DEBUGGER
5071           balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5072 #endif
5073         }
5074       }
5075       if (hot_teams[level].hot_team_nth >= new_nproc) {
5076         // we have all needed threads in reserve, no need to allocate any
5077         // this only possible in mode 1, cannot have reserved threads in mode 0
5078         KMP_DEBUG_ASSERT(__kmp_hot_teams_mode == 1);
5079         team->t.t_nproc = new_nproc; // just get reserved threads involved
5080       } else {
5081         // we may have some threads in reserve, but not enough
5082         team->t.t_nproc =
5083             hot_teams[level]
5084                 .hot_team_nth; // get reserved threads involved if any
5085         hot_teams[level].hot_team_nth = new_nproc; // adjust hot team max size
5086 #endif // KMP_NESTED_HOT_TEAMS
5087         if (team->t.t_max_nproc < new_nproc) {
5088           /* reallocate larger arrays */
5089           __kmp_reallocate_team_arrays(team, new_nproc);
5090           __kmp_reinitialize_team(team, new_icvs, NULL);
5091         }
5092 
5093 #if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED
5094         /* Temporarily set full mask for master thread before creation of
5095            workers. The reason is that workers inherit the affinity from master,
5096            so if a lot of workers are created on the single core quickly, they
5097            don't get a chance to set their own affinity for a long time. */
5098         __kmp_set_thread_affinity_mask_full_tmp(old_mask);
5099 #endif
5100 
5101         /* allocate new threads for the hot team */
5102         for (f = team->t.t_nproc; f < new_nproc; f++) {
5103           kmp_info_t *new_worker = __kmp_allocate_thread(root, team, f);
5104           KMP_DEBUG_ASSERT(new_worker);
5105           team->t.t_threads[f] = new_worker;
5106 
5107           KA_TRACE(20,
5108                    ("__kmp_allocate_team: team %d init T#%d arrived: "
5109                     "join=%llu, plain=%llu\n",
5110                     team->t.t_id, __kmp_gtid_from_tid(f, team), team->t.t_id, f,
5111                     team->t.t_bar[bs_forkjoin_barrier].b_arrived,
5112                     team->t.t_bar[bs_plain_barrier].b_arrived));
5113 
5114           { // Initialize barrier data for new threads.
5115             int b;
5116             kmp_balign_t *balign = new_worker->th.th_bar;
5117             for (b = 0; b < bs_last_barrier; ++b) {
5118               balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5119               KMP_DEBUG_ASSERT(balign[b].bb.wait_flag !=
5120                                KMP_BARRIER_PARENT_FLAG);
5121 #if USE_DEBUGGER
5122               balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5123 #endif
5124             }
5125           }
5126         }
5127 
5128 #if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED
5129         if (KMP_AFFINITY_CAPABLE()) {
5130           /* Restore initial master thread's affinity mask */
5131           __kmp_set_system_affinity(old_mask, TRUE);
5132           KMP_CPU_FREE(old_mask);
5133         }
5134 #endif
5135 #if KMP_NESTED_HOT_TEAMS
5136       } // end of check of t_nproc vs. new_nproc vs. hot_team_nth
5137 #endif // KMP_NESTED_HOT_TEAMS
5138       /* make sure everyone is syncronized */
5139       int old_nproc = team->t.t_nproc; // save old value and use to update only
5140       // new threads below
5141       __kmp_initialize_team(team, new_nproc, new_icvs,
5142                             root->r.r_uber_thread->th.th_ident);
5143 
5144       /* reinitialize the threads */
5145       KMP_DEBUG_ASSERT(team->t.t_nproc == new_nproc);
5146       for (f = 0; f < team->t.t_nproc; ++f)
5147         __kmp_initialize_info(team->t.t_threads[f], team, f,
5148                               __kmp_gtid_from_tid(f, team));
5149 
5150       if (level) { // set th_task_state for new threads in nested hot team
5151         // __kmp_initialize_info() no longer zeroes th_task_state, so we should
5152         // only need to set the th_task_state for the new threads. th_task_state
5153         // for master thread will not be accurate until after this in
5154         // __kmp_fork_call(), so we look to the master's memo_stack to get the
5155         // correct value.
5156         for (f = old_nproc; f < team->t.t_nproc; ++f)
5157           team->t.t_threads[f]->th.th_task_state =
5158               team->t.t_threads[0]->th.th_task_state_memo_stack[level];
5159       } else { // set th_task_state for new threads in non-nested hot team
5160         int old_state =
5161             team->t.t_threads[0]->th.th_task_state; // copy master's state
5162         for (f = old_nproc; f < team->t.t_nproc; ++f)
5163           team->t.t_threads[f]->th.th_task_state = old_state;
5164       }
5165 
5166 #ifdef KMP_DEBUG
5167       for (f = 0; f < team->t.t_nproc; ++f) {
5168         KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
5169                          team->t.t_threads[f]->th.th_team_nproc ==
5170                              team->t.t_nproc);
5171       }
5172 #endif
5173 
5174       KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5175 #if KMP_AFFINITY_SUPPORTED
5176       __kmp_partition_places(team);
5177 #endif
5178     } // Check changes in number of threads
5179 
5180     kmp_info_t *master = team->t.t_threads[0];
5181     if (master->th.th_teams_microtask) {
5182       for (f = 1; f < new_nproc; ++f) {
5183         // propagate teams construct specific info to workers
5184         kmp_info_t *thr = team->t.t_threads[f];
5185         thr->th.th_teams_microtask = master->th.th_teams_microtask;
5186         thr->th.th_teams_level = master->th.th_teams_level;
5187         thr->th.th_teams_size = master->th.th_teams_size;
5188       }
5189     }
5190 #if KMP_NESTED_HOT_TEAMS
5191     if (level) {
5192       // Sync barrier state for nested hot teams, not needed for outermost hot
5193       // team.
5194       for (f = 1; f < new_nproc; ++f) {
5195         kmp_info_t *thr = team->t.t_threads[f];
5196         int b;
5197         kmp_balign_t *balign = thr->th.th_bar;
5198         for (b = 0; b < bs_last_barrier; ++b) {
5199           balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5200           KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
5201 #if USE_DEBUGGER
5202           balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5203 #endif
5204         }
5205       }
5206     }
5207 #endif // KMP_NESTED_HOT_TEAMS
5208 
5209     /* reallocate space for arguments if necessary */
5210     __kmp_alloc_argv_entries(argc, team, TRUE);
5211     KMP_CHECK_UPDATE(team->t.t_argc, argc);
5212     // The hot team re-uses the previous task team,
5213     // if untouched during the previous release->gather phase.
5214 
5215     KF_TRACE(10, (" hot_team = %p\n", team));
5216 
5217 #if KMP_DEBUG
5218     if (__kmp_tasking_mode != tskm_immediate_exec) {
5219       KA_TRACE(20, ("__kmp_allocate_team: hot team task_team[0] = %p "
5220                     "task_team[1] = %p after reinit\n",
5221                     team->t.t_task_team[0], team->t.t_task_team[1]));
5222     }
5223 #endif
5224 
5225 #if OMPT_SUPPORT
5226     __ompt_team_assign_id(team, ompt_parallel_data);
5227 #endif
5228 
5229     KMP_MB();
5230 
5231     return team;
5232   }
5233 
5234   /* next, let's try to take one from the team pool */
5235   KMP_MB();
5236   for (team = CCAST(kmp_team_t *, __kmp_team_pool); (team);) {
5237     /* TODO: consider resizing undersized teams instead of reaping them, now
5238        that we have a resizing mechanism */
5239     if (team->t.t_max_nproc >= max_nproc) {
5240       /* take this team from the team pool */
5241       __kmp_team_pool = team->t.t_next_pool;
5242 
5243       /* setup the team for fresh use */
5244       __kmp_initialize_team(team, new_nproc, new_icvs, NULL);
5245 
5246       KA_TRACE(20, ("__kmp_allocate_team: setting task_team[0] %p and "
5247                     "task_team[1] %p to NULL\n",
5248                     &team->t.t_task_team[0], &team->t.t_task_team[1]));
5249       team->t.t_task_team[0] = NULL;
5250       team->t.t_task_team[1] = NULL;
5251 
5252       /* reallocate space for arguments if necessary */
5253       __kmp_alloc_argv_entries(argc, team, TRUE);
5254       KMP_CHECK_UPDATE(team->t.t_argc, argc);
5255 
5256       KA_TRACE(
5257           20, ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n",
5258                team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
5259       { // Initialize barrier data.
5260         int b;
5261         for (b = 0; b < bs_last_barrier; ++b) {
5262           team->t.t_bar[b].b_arrived = KMP_INIT_BARRIER_STATE;
5263 #if USE_DEBUGGER
5264           team->t.t_bar[b].b_master_arrived = 0;
5265           team->t.t_bar[b].b_team_arrived = 0;
5266 #endif
5267         }
5268       }
5269 
5270       team->t.t_proc_bind = new_proc_bind;
5271 
5272       KA_TRACE(20, ("__kmp_allocate_team: using team from pool %d.\n",
5273                     team->t.t_id));
5274 
5275 #if OMPT_SUPPORT
5276       __ompt_team_assign_id(team, ompt_parallel_data);
5277 #endif
5278 
5279       KMP_MB();
5280 
5281       return team;
5282     }
5283 
5284     /* reap team if it is too small, then loop back and check the next one */
5285     // not sure if this is wise, but, will be redone during the hot-teams
5286     // rewrite.
5287     /* TODO: Use technique to find the right size hot-team, don't reap them */
5288     team = __kmp_reap_team(team);
5289     __kmp_team_pool = team;
5290   }
5291 
5292   /* nothing available in the pool, no matter, make a new team! */
5293   KMP_MB();
5294   team = (kmp_team_t *)__kmp_allocate(sizeof(kmp_team_t));
5295 
5296   /* and set it up */
5297   team->t.t_max_nproc = max_nproc;
5298   /* NOTE well, for some reason allocating one big buffer and dividing it up
5299      seems to really hurt performance a lot on the P4, so, let's not use this */
5300   __kmp_allocate_team_arrays(team, max_nproc);
5301 
5302   KA_TRACE(20, ("__kmp_allocate_team: making a new team\n"));
5303   __kmp_initialize_team(team, new_nproc, new_icvs, NULL);
5304 
5305   KA_TRACE(20, ("__kmp_allocate_team: setting task_team[0] %p and task_team[1] "
5306                 "%p to NULL\n",
5307                 &team->t.t_task_team[0], &team->t.t_task_team[1]));
5308   team->t.t_task_team[0] = NULL; // to be removed, as __kmp_allocate zeroes
5309   // memory, no need to duplicate
5310   team->t.t_task_team[1] = NULL; // to be removed, as __kmp_allocate zeroes
5311   // memory, no need to duplicate
5312 
5313   if (__kmp_storage_map) {
5314     __kmp_print_team_storage_map("team", team, team->t.t_id, new_nproc);
5315   }
5316 
5317   /* allocate space for arguments */
5318   __kmp_alloc_argv_entries(argc, team, FALSE);
5319   team->t.t_argc = argc;
5320 
5321   KA_TRACE(20,
5322            ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n",
5323             team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
5324   { // Initialize barrier data.
5325     int b;
5326     for (b = 0; b < bs_last_barrier; ++b) {
5327       team->t.t_bar[b].b_arrived = KMP_INIT_BARRIER_STATE;
5328 #if USE_DEBUGGER
5329       team->t.t_bar[b].b_master_arrived = 0;
5330       team->t.t_bar[b].b_team_arrived = 0;
5331 #endif
5332     }
5333   }
5334 
5335   team->t.t_proc_bind = new_proc_bind;
5336 
5337 #if OMPT_SUPPORT
5338   __ompt_team_assign_id(team, ompt_parallel_data);
5339   team->t.ompt_serialized_team_info = NULL;
5340 #endif
5341 
5342   KMP_MB();
5343 
5344   KA_TRACE(20, ("__kmp_allocate_team: done creating a new team %d.\n",
5345                 team->t.t_id));
5346 
5347   return team;
5348 }
5349 
5350 /* TODO implement hot-teams at all levels */
5351 /* TODO implement lazy thread release on demand (disband request) */
5352 
5353 /* free the team.  return it to the team pool.  release all the threads
5354  * associated with it */
5355 void __kmp_free_team(kmp_root_t *root,
5356                      kmp_team_t *team USE_NESTED_HOT_ARG(kmp_info_t *master)) {
5357   int f;
5358   KA_TRACE(20, ("__kmp_free_team: T#%d freeing team %d\n", __kmp_get_gtid(),
5359                 team->t.t_id));
5360 
5361   /* verify state */
5362   KMP_DEBUG_ASSERT(root);
5363   KMP_DEBUG_ASSERT(team);
5364   KMP_DEBUG_ASSERT(team->t.t_nproc <= team->t.t_max_nproc);
5365   KMP_DEBUG_ASSERT(team->t.t_threads);
5366 
5367   int use_hot_team = team == root->r.r_hot_team;
5368 #if KMP_NESTED_HOT_TEAMS
5369   int level;
5370   kmp_hot_team_ptr_t *hot_teams;
5371   if (master) {
5372     level = team->t.t_active_level - 1;
5373     if (master->th.th_teams_microtask) { // in teams construct?
5374       if (master->th.th_teams_size.nteams > 1) {
5375         ++level; // level was not increased in teams construct for
5376         // team_of_masters
5377       }
5378       if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
5379           master->th.th_teams_level == team->t.t_level) {
5380         ++level; // level was not increased in teams construct for
5381         // team_of_workers before the parallel
5382       } // team->t.t_level will be increased inside parallel
5383     }
5384     hot_teams = master->th.th_hot_teams;
5385     if (level < __kmp_hot_teams_max_level) {
5386       KMP_DEBUG_ASSERT(team == hot_teams[level].hot_team);
5387       use_hot_team = 1;
5388     }
5389   }
5390 #endif // KMP_NESTED_HOT_TEAMS
5391 
5392   /* team is done working */
5393   TCW_SYNC_PTR(team->t.t_pkfn,
5394                NULL); // Important for Debugging Support Library.
5395 #if KMP_OS_WINDOWS
5396   team->t.t_copyin_counter = 0; // init counter for possible reuse
5397 #endif
5398   // Do not reset pointer to parent team to NULL for hot teams.
5399 
5400   /* if we are non-hot team, release our threads */
5401   if (!use_hot_team) {
5402     if (__kmp_tasking_mode != tskm_immediate_exec) {
5403       // Wait for threads to reach reapable state
5404       for (f = 1; f < team->t.t_nproc; ++f) {
5405         KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5406         kmp_info_t *th = team->t.t_threads[f];
5407         volatile kmp_uint32 *state = &th->th.th_reap_state;
5408         while (*state != KMP_SAFE_TO_REAP) {
5409 #if KMP_OS_WINDOWS
5410           // On Windows a thread can be killed at any time, check this
5411           DWORD ecode;
5412           if (!__kmp_is_thread_alive(th, &ecode)) {
5413             *state = KMP_SAFE_TO_REAP; // reset the flag for dead thread
5414             break;
5415           }
5416 #endif
5417           // first check if thread is sleeping
5418           kmp_flag_64 fl(&th->th.th_bar[bs_forkjoin_barrier].bb.b_go, th);
5419           if (fl.is_sleeping())
5420             fl.resume(__kmp_gtid_from_thread(th));
5421           KMP_CPU_PAUSE();
5422         }
5423       }
5424 
5425       // Delete task teams
5426       int tt_idx;
5427       for (tt_idx = 0; tt_idx < 2; ++tt_idx) {
5428         kmp_task_team_t *task_team = team->t.t_task_team[tt_idx];
5429         if (task_team != NULL) {
5430           for (f = 0; f < team->t.t_nproc; ++f) { // threads unref task teams
5431             KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5432             team->t.t_threads[f]->th.th_task_team = NULL;
5433           }
5434           KA_TRACE(
5435               20,
5436               ("__kmp_free_team: T#%d deactivating task_team %p on team %d\n",
5437                __kmp_get_gtid(), task_team, team->t.t_id));
5438 #if KMP_NESTED_HOT_TEAMS
5439           __kmp_free_task_team(master, task_team);
5440 #endif
5441           team->t.t_task_team[tt_idx] = NULL;
5442         }
5443       }
5444     }
5445 
5446     // Reset pointer to parent team only for non-hot teams.
5447     team->t.t_parent = NULL;
5448     team->t.t_level = 0;
5449     team->t.t_active_level = 0;
5450 
5451     /* free the worker threads */
5452     for (f = 1; f < team->t.t_nproc; ++f) {
5453       KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5454       __kmp_free_thread(team->t.t_threads[f]);
5455       team->t.t_threads[f] = NULL;
5456     }
5457 
5458     /* put the team back in the team pool */
5459     /* TODO limit size of team pool, call reap_team if pool too large */
5460     team->t.t_next_pool = CCAST(kmp_team_t *, __kmp_team_pool);
5461     __kmp_team_pool = (volatile kmp_team_t *)team;
5462   } else { // Check if team was created for the masters in a teams construct
5463     // See if first worker is a CG root
5464     KMP_DEBUG_ASSERT(team->t.t_threads[1] &&
5465                      team->t.t_threads[1]->th.th_cg_roots);
5466     if (team->t.t_threads[1]->th.th_cg_roots->cg_root == team->t.t_threads[1]) {
5467       // Clean up the CG root nodes on workers so that this team can be re-used
5468       for (f = 1; f < team->t.t_nproc; ++f) {
5469         kmp_info_t *thr = team->t.t_threads[f];
5470         KMP_DEBUG_ASSERT(thr && thr->th.th_cg_roots &&
5471                          thr->th.th_cg_roots->cg_root == thr);
5472         // Pop current CG root off list
5473         kmp_cg_root_t *tmp = thr->th.th_cg_roots;
5474         thr->th.th_cg_roots = tmp->up;
5475         KA_TRACE(100, ("__kmp_free_team: Thread %p popping node %p and moving"
5476                        " up to node %p. cg_nthreads was %d\n",
5477                        thr, tmp, thr->th.th_cg_roots, tmp->cg_nthreads));
5478         int i = tmp->cg_nthreads--;
5479         if (i == 1) {
5480           __kmp_free(tmp); // free CG if we are the last thread in it
5481         }
5482         // Restore current task's thread_limit from CG root
5483         if (thr->th.th_cg_roots)
5484           thr->th.th_current_task->td_icvs.thread_limit =
5485               thr->th.th_cg_roots->cg_thread_limit;
5486       }
5487     }
5488   }
5489 
5490   KMP_MB();
5491 }
5492 
5493 /* reap the team.  destroy it, reclaim all its resources and free its memory */
5494 kmp_team_t *__kmp_reap_team(kmp_team_t *team) {
5495   kmp_team_t *next_pool = team->t.t_next_pool;
5496 
5497   KMP_DEBUG_ASSERT(team);
5498   KMP_DEBUG_ASSERT(team->t.t_dispatch);
5499   KMP_DEBUG_ASSERT(team->t.t_disp_buffer);
5500   KMP_DEBUG_ASSERT(team->t.t_threads);
5501   KMP_DEBUG_ASSERT(team->t.t_argv);
5502 
5503   /* TODO clean the threads that are a part of this? */
5504 
5505   /* free stuff */
5506   __kmp_free_team_arrays(team);
5507   if (team->t.t_argv != &team->t.t_inline_argv[0])
5508     __kmp_free((void *)team->t.t_argv);
5509   __kmp_free(team);
5510 
5511   KMP_MB();
5512   return next_pool;
5513 }
5514 
5515 // Free the thread.  Don't reap it, just place it on the pool of available
5516 // threads.
5517 //
5518 // Changes for Quad issue 527845: We need a predictable OMP tid <-> gtid
5519 // binding for the affinity mechanism to be useful.
5520 //
5521 // Now, we always keep the free list (__kmp_thread_pool) sorted by gtid.
5522 // However, we want to avoid a potential performance problem by always
5523 // scanning through the list to find the correct point at which to insert
5524 // the thread (potential N**2 behavior).  To do this we keep track of the
5525 // last place a thread struct was inserted (__kmp_thread_pool_insert_pt).
5526 // With single-level parallelism, threads will always be added to the tail
5527 // of the list, kept track of by __kmp_thread_pool_insert_pt.  With nested
5528 // parallelism, all bets are off and we may need to scan through the entire
5529 // free list.
5530 //
5531 // This change also has a potentially large performance benefit, for some
5532 // applications.  Previously, as threads were freed from the hot team, they
5533 // would be placed back on the free list in inverse order.  If the hot team
5534 // grew back to it's original size, then the freed thread would be placed
5535 // back on the hot team in reverse order.  This could cause bad cache
5536 // locality problems on programs where the size of the hot team regularly
5537 // grew and shrunk.
5538 //
5539 // Now, for single-level parallelism, the OMP tid is alway == gtid.
5540 void __kmp_free_thread(kmp_info_t *this_th) {
5541   int gtid;
5542   kmp_info_t **scan;
5543 
5544   KA_TRACE(20, ("__kmp_free_thread: T#%d putting T#%d back on free pool.\n",
5545                 __kmp_get_gtid(), this_th->th.th_info.ds.ds_gtid));
5546 
5547   KMP_DEBUG_ASSERT(this_th);
5548 
5549   // When moving thread to pool, switch thread to wait on own b_go flag, and
5550   // uninitialized (NULL team).
5551   int b;
5552   kmp_balign_t *balign = this_th->th.th_bar;
5553   for (b = 0; b < bs_last_barrier; ++b) {
5554     if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG)
5555       balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG;
5556     balign[b].bb.team = NULL;
5557     balign[b].bb.leaf_kids = 0;
5558   }
5559   this_th->th.th_task_state = 0;
5560   this_th->th.th_reap_state = KMP_SAFE_TO_REAP;
5561 
5562   /* put thread back on the free pool */
5563   TCW_PTR(this_th->th.th_team, NULL);
5564   TCW_PTR(this_th->th.th_root, NULL);
5565   TCW_PTR(this_th->th.th_dispatch, NULL); /* NOT NEEDED */
5566 
5567   while (this_th->th.th_cg_roots) {
5568     this_th->th.th_cg_roots->cg_nthreads--;
5569     KA_TRACE(100, ("__kmp_free_thread: Thread %p decrement cg_nthreads on node"
5570                    " %p of thread  %p to %d\n",
5571                    this_th, this_th->th.th_cg_roots,
5572                    this_th->th.th_cg_roots->cg_root,
5573                    this_th->th.th_cg_roots->cg_nthreads));
5574     kmp_cg_root_t *tmp = this_th->th.th_cg_roots;
5575     if (tmp->cg_root == this_th) { // Thread is a cg_root
5576       KMP_DEBUG_ASSERT(tmp->cg_nthreads == 0);
5577       KA_TRACE(
5578           5, ("__kmp_free_thread: Thread %p freeing node %p\n", this_th, tmp));
5579       this_th->th.th_cg_roots = tmp->up;
5580       __kmp_free(tmp);
5581     } else { // Worker thread
5582       if (tmp->cg_nthreads == 0) { // last thread leaves contention group
5583         __kmp_free(tmp);
5584       }
5585       this_th->th.th_cg_roots = NULL;
5586       break;
5587     }
5588   }
5589 
5590   /* If the implicit task assigned to this thread can be used by other threads
5591    * -> multiple threads can share the data and try to free the task at
5592    * __kmp_reap_thread at exit. This duplicate use of the task data can happen
5593    * with higher probability when hot team is disabled but can occurs even when
5594    * the hot team is enabled */
5595   __kmp_free_implicit_task(this_th);
5596   this_th->th.th_current_task = NULL;
5597 
5598   // If the __kmp_thread_pool_insert_pt is already past the new insert
5599   // point, then we need to re-scan the entire list.
5600   gtid = this_th->th.th_info.ds.ds_gtid;
5601   if (__kmp_thread_pool_insert_pt != NULL) {
5602     KMP_DEBUG_ASSERT(__kmp_thread_pool != NULL);
5603     if (__kmp_thread_pool_insert_pt->th.th_info.ds.ds_gtid > gtid) {
5604       __kmp_thread_pool_insert_pt = NULL;
5605     }
5606   }
5607 
5608   // Scan down the list to find the place to insert the thread.
5609   // scan is the address of a link in the list, possibly the address of
5610   // __kmp_thread_pool itself.
5611   //
5612   // In the absence of nested parallism, the for loop will have 0 iterations.
5613   if (__kmp_thread_pool_insert_pt != NULL) {
5614     scan = &(__kmp_thread_pool_insert_pt->th.th_next_pool);
5615   } else {
5616     scan = CCAST(kmp_info_t **, &__kmp_thread_pool);
5617   }
5618   for (; (*scan != NULL) && ((*scan)->th.th_info.ds.ds_gtid < gtid);
5619        scan = &((*scan)->th.th_next_pool))
5620     ;
5621 
5622   // Insert the new element on the list, and set __kmp_thread_pool_insert_pt
5623   // to its address.
5624   TCW_PTR(this_th->th.th_next_pool, *scan);
5625   __kmp_thread_pool_insert_pt = *scan = this_th;
5626   KMP_DEBUG_ASSERT((this_th->th.th_next_pool == NULL) ||
5627                    (this_th->th.th_info.ds.ds_gtid <
5628                     this_th->th.th_next_pool->th.th_info.ds.ds_gtid));
5629   TCW_4(this_th->th.th_in_pool, TRUE);
5630   __kmp_suspend_initialize_thread(this_th);
5631   __kmp_lock_suspend_mx(this_th);
5632   if (this_th->th.th_active == TRUE) {
5633     KMP_ATOMIC_INC(&__kmp_thread_pool_active_nth);
5634     this_th->th.th_active_in_pool = TRUE;
5635   }
5636 #if KMP_DEBUG
5637   else {
5638     KMP_DEBUG_ASSERT(this_th->th.th_active_in_pool == FALSE);
5639   }
5640 #endif
5641   __kmp_unlock_suspend_mx(this_th);
5642 
5643   TCW_4(__kmp_nth, __kmp_nth - 1);
5644 
5645 #ifdef KMP_ADJUST_BLOCKTIME
5646   /* Adjust blocktime back to user setting or default if necessary */
5647   /* Middle initialization might never have occurred                */
5648   if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
5649     KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
5650     if (__kmp_nth <= __kmp_avail_proc) {
5651       __kmp_zero_bt = FALSE;
5652     }
5653   }
5654 #endif /* KMP_ADJUST_BLOCKTIME */
5655 
5656   KMP_MB();
5657 }
5658 
5659 /* ------------------------------------------------------------------------ */
5660 
5661 void *__kmp_launch_thread(kmp_info_t *this_thr) {
5662   int gtid = this_thr->th.th_info.ds.ds_gtid;
5663   /*    void                 *stack_data;*/
5664   kmp_team_t **volatile pteam;
5665 
5666   KMP_MB();
5667   KA_TRACE(10, ("__kmp_launch_thread: T#%d start\n", gtid));
5668 
5669   if (__kmp_env_consistency_check) {
5670     this_thr->th.th_cons = __kmp_allocate_cons_stack(gtid); // ATT: Memory leak?
5671   }
5672 
5673 #if OMPT_SUPPORT
5674   ompt_data_t *thread_data;
5675   if (ompt_enabled.enabled) {
5676     thread_data = &(this_thr->th.ompt_thread_info.thread_data);
5677     *thread_data = ompt_data_none;
5678 
5679     this_thr->th.ompt_thread_info.state = ompt_state_overhead;
5680     this_thr->th.ompt_thread_info.wait_id = 0;
5681     this_thr->th.ompt_thread_info.idle_frame = OMPT_GET_FRAME_ADDRESS(0);
5682     this_thr->th.ompt_thread_info.parallel_flags = 0;
5683     if (ompt_enabled.ompt_callback_thread_begin) {
5684       ompt_callbacks.ompt_callback(ompt_callback_thread_begin)(
5685           ompt_thread_worker, thread_data);
5686     }
5687     this_thr->th.ompt_thread_info.state = ompt_state_idle;
5688   }
5689 #endif
5690 
5691   /* This is the place where threads wait for work */
5692   while (!TCR_4(__kmp_global.g.g_done)) {
5693     KMP_DEBUG_ASSERT(this_thr == __kmp_threads[gtid]);
5694     KMP_MB();
5695 
5696     /* wait for work to do */
5697     KA_TRACE(20, ("__kmp_launch_thread: T#%d waiting for work\n", gtid));
5698 
5699     /* No tid yet since not part of a team */
5700     __kmp_fork_barrier(gtid, KMP_GTID_DNE);
5701 
5702 #if OMPT_SUPPORT
5703     if (ompt_enabled.enabled) {
5704       this_thr->th.ompt_thread_info.state = ompt_state_overhead;
5705     }
5706 #endif
5707 
5708     pteam = &this_thr->th.th_team;
5709 
5710     /* have we been allocated? */
5711     if (TCR_SYNC_PTR(*pteam) && !TCR_4(__kmp_global.g.g_done)) {
5712       /* we were just woken up, so run our new task */
5713       if (TCR_SYNC_PTR((*pteam)->t.t_pkfn) != NULL) {
5714         int rc;
5715         KA_TRACE(20,
5716                  ("__kmp_launch_thread: T#%d(%d:%d) invoke microtask = %p\n",
5717                   gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid),
5718                   (*pteam)->t.t_pkfn));
5719 
5720         updateHWFPControl(*pteam);
5721 
5722 #if OMPT_SUPPORT
5723         if (ompt_enabled.enabled) {
5724           this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
5725         }
5726 #endif
5727 
5728         rc = (*pteam)->t.t_invoke(gtid);
5729         KMP_ASSERT(rc);
5730 
5731         KMP_MB();
5732         KA_TRACE(20, ("__kmp_launch_thread: T#%d(%d:%d) done microtask = %p\n",
5733                       gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid),
5734                       (*pteam)->t.t_pkfn));
5735       }
5736 #if OMPT_SUPPORT
5737       if (ompt_enabled.enabled) {
5738         /* no frame set while outside task */
5739         __ompt_get_task_info_object(0)->frame.exit_frame = ompt_data_none;
5740 
5741         this_thr->th.ompt_thread_info.state = ompt_state_overhead;
5742       }
5743 #endif
5744       /* join barrier after parallel region */
5745       __kmp_join_barrier(gtid);
5746     }
5747   }
5748   TCR_SYNC_PTR((intptr_t)__kmp_global.g.g_done);
5749 
5750 #if OMPT_SUPPORT
5751   if (ompt_enabled.ompt_callback_thread_end) {
5752     ompt_callbacks.ompt_callback(ompt_callback_thread_end)(thread_data);
5753   }
5754 #endif
5755 
5756   this_thr->th.th_task_team = NULL;
5757   /* run the destructors for the threadprivate data for this thread */
5758   __kmp_common_destroy_gtid(gtid);
5759 
5760   KA_TRACE(10, ("__kmp_launch_thread: T#%d done\n", gtid));
5761   KMP_MB();
5762   return this_thr;
5763 }
5764 
5765 /* ------------------------------------------------------------------------ */
5766 
5767 void __kmp_internal_end_dest(void *specific_gtid) {
5768 #if KMP_COMPILER_ICC
5769 #pragma warning(push)
5770 #pragma warning(disable : 810) // conversion from "void *" to "int" may lose
5771 // significant bits
5772 #endif
5773   // Make sure no significant bits are lost
5774   int gtid = (kmp_intptr_t)specific_gtid - 1;
5775 #if KMP_COMPILER_ICC
5776 #pragma warning(pop)
5777 #endif
5778 
5779   KA_TRACE(30, ("__kmp_internal_end_dest: T#%d\n", gtid));
5780   /* NOTE: the gtid is stored as gitd+1 in the thread-local-storage
5781    * this is because 0 is reserved for the nothing-stored case */
5782 
5783   /* josh: One reason for setting the gtid specific data even when it is being
5784      destroyed by pthread is to allow gtid lookup through thread specific data
5785      (__kmp_gtid_get_specific).  Some of the code, especially stat code,
5786      that gets executed in the call to __kmp_internal_end_thread, actually
5787      gets the gtid through the thread specific data.  Setting it here seems
5788      rather inelegant and perhaps wrong, but allows __kmp_internal_end_thread
5789      to run smoothly.
5790      todo: get rid of this after we remove the dependence on
5791      __kmp_gtid_get_specific  */
5792   if (gtid >= 0 && KMP_UBER_GTID(gtid))
5793     __kmp_gtid_set_specific(gtid);
5794 #ifdef KMP_TDATA_GTID
5795   __kmp_gtid = gtid;
5796 #endif
5797   __kmp_internal_end_thread(gtid);
5798 }
5799 
5800 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB
5801 
5802 __attribute__((destructor)) void __kmp_internal_end_dtor(void) {
5803   __kmp_internal_end_atexit();
5804 }
5805 
5806 #endif
5807 
5808 /* [Windows] josh: when the atexit handler is called, there may still be more
5809    than one thread alive */
5810 void __kmp_internal_end_atexit(void) {
5811   KA_TRACE(30, ("__kmp_internal_end_atexit\n"));
5812   /* [Windows]
5813      josh: ideally, we want to completely shutdown the library in this atexit
5814      handler, but stat code that depends on thread specific data for gtid fails
5815      because that data becomes unavailable at some point during the shutdown, so
5816      we call __kmp_internal_end_thread instead. We should eventually remove the
5817      dependency on __kmp_get_specific_gtid in the stat code and use
5818      __kmp_internal_end_library to cleanly shutdown the library.
5819 
5820      // TODO: Can some of this comment about GVS be removed?
5821      I suspect that the offending stat code is executed when the calling thread
5822      tries to clean up a dead root thread's data structures, resulting in GVS
5823      code trying to close the GVS structures for that thread, but since the stat
5824      code uses __kmp_get_specific_gtid to get the gtid with the assumption that
5825      the calling thread is cleaning up itself instead of another thread, it get
5826      confused. This happens because allowing a thread to unregister and cleanup
5827      another thread is a recent modification for addressing an issue.
5828      Based on the current design (20050722), a thread may end up
5829      trying to unregister another thread only if thread death does not trigger
5830      the calling of __kmp_internal_end_thread.  For Linux* OS, there is the
5831      thread specific data destructor function to detect thread death. For
5832      Windows dynamic, there is DllMain(THREAD_DETACH). For Windows static, there
5833      is nothing.  Thus, the workaround is applicable only for Windows static
5834      stat library. */
5835   __kmp_internal_end_library(-1);
5836 #if KMP_OS_WINDOWS
5837   __kmp_close_console();
5838 #endif
5839 }
5840 
5841 static void __kmp_reap_thread(kmp_info_t *thread, int is_root) {
5842   // It is assumed __kmp_forkjoin_lock is acquired.
5843 
5844   int gtid;
5845 
5846   KMP_DEBUG_ASSERT(thread != NULL);
5847 
5848   gtid = thread->th.th_info.ds.ds_gtid;
5849 
5850   if (!is_root) {
5851     if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
5852       /* Assume the threads are at the fork barrier here */
5853       KA_TRACE(
5854           20, ("__kmp_reap_thread: releasing T#%d from fork barrier for reap\n",
5855                gtid));
5856       /* Need release fence here to prevent seg faults for tree forkjoin barrier
5857        * (GEH) */
5858       ANNOTATE_HAPPENS_BEFORE(thread);
5859       kmp_flag_64 flag(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go, thread);
5860       __kmp_release_64(&flag);
5861     }
5862 
5863     // Terminate OS thread.
5864     __kmp_reap_worker(thread);
5865 
5866     // The thread was killed asynchronously.  If it was actively
5867     // spinning in the thread pool, decrement the global count.
5868     //
5869     // There is a small timing hole here - if the worker thread was just waking
5870     // up after sleeping in the pool, had reset it's th_active_in_pool flag but
5871     // not decremented the global counter __kmp_thread_pool_active_nth yet, then
5872     // the global counter might not get updated.
5873     //
5874     // Currently, this can only happen as the library is unloaded,
5875     // so there are no harmful side effects.
5876     if (thread->th.th_active_in_pool) {
5877       thread->th.th_active_in_pool = FALSE;
5878       KMP_ATOMIC_DEC(&__kmp_thread_pool_active_nth);
5879       KMP_DEBUG_ASSERT(__kmp_thread_pool_active_nth >= 0);
5880     }
5881   }
5882 
5883   __kmp_free_implicit_task(thread);
5884 
5885 // Free the fast memory for tasking
5886 #if USE_FAST_MEMORY
5887   __kmp_free_fast_memory(thread);
5888 #endif /* USE_FAST_MEMORY */
5889 
5890   __kmp_suspend_uninitialize_thread(thread);
5891 
5892   KMP_DEBUG_ASSERT(__kmp_threads[gtid] == thread);
5893   TCW_SYNC_PTR(__kmp_threads[gtid], NULL);
5894 
5895   --__kmp_all_nth;
5896 // __kmp_nth was decremented when thread is added to the pool.
5897 
5898 #ifdef KMP_ADJUST_BLOCKTIME
5899   /* Adjust blocktime back to user setting or default if necessary */
5900   /* Middle initialization might never have occurred                */
5901   if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
5902     KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
5903     if (__kmp_nth <= __kmp_avail_proc) {
5904       __kmp_zero_bt = FALSE;
5905     }
5906   }
5907 #endif /* KMP_ADJUST_BLOCKTIME */
5908 
5909   /* free the memory being used */
5910   if (__kmp_env_consistency_check) {
5911     if (thread->th.th_cons) {
5912       __kmp_free_cons_stack(thread->th.th_cons);
5913       thread->th.th_cons = NULL;
5914     }
5915   }
5916 
5917   if (thread->th.th_pri_common != NULL) {
5918     __kmp_free(thread->th.th_pri_common);
5919     thread->th.th_pri_common = NULL;
5920   }
5921 
5922   if (thread->th.th_task_state_memo_stack != NULL) {
5923     __kmp_free(thread->th.th_task_state_memo_stack);
5924     thread->th.th_task_state_memo_stack = NULL;
5925   }
5926 
5927 #if KMP_USE_BGET
5928   if (thread->th.th_local.bget_data != NULL) {
5929     __kmp_finalize_bget(thread);
5930   }
5931 #endif
5932 
5933 #if KMP_AFFINITY_SUPPORTED
5934   if (thread->th.th_affin_mask != NULL) {
5935     KMP_CPU_FREE(thread->th.th_affin_mask);
5936     thread->th.th_affin_mask = NULL;
5937   }
5938 #endif /* KMP_AFFINITY_SUPPORTED */
5939 
5940 #if KMP_USE_HIER_SCHED
5941   if (thread->th.th_hier_bar_data != NULL) {
5942     __kmp_free(thread->th.th_hier_bar_data);
5943     thread->th.th_hier_bar_data = NULL;
5944   }
5945 #endif
5946 
5947   __kmp_reap_team(thread->th.th_serial_team);
5948   thread->th.th_serial_team = NULL;
5949   __kmp_free(thread);
5950 
5951   KMP_MB();
5952 
5953 } // __kmp_reap_thread
5954 
5955 static void __kmp_internal_end(void) {
5956   int i;
5957 
5958   /* First, unregister the library */
5959   __kmp_unregister_library();
5960 
5961 #if KMP_OS_WINDOWS
5962   /* In Win static library, we can't tell when a root actually dies, so we
5963      reclaim the data structures for any root threads that have died but not
5964      unregistered themselves, in order to shut down cleanly.
5965      In Win dynamic library we also can't tell when a thread dies.  */
5966   __kmp_reclaim_dead_roots(); // AC: moved here to always clean resources of
5967 // dead roots
5968 #endif
5969 
5970   for (i = 0; i < __kmp_threads_capacity; i++)
5971     if (__kmp_root[i])
5972       if (__kmp_root[i]->r.r_active)
5973         break;
5974   KMP_MB(); /* Flush all pending memory write invalidates.  */
5975   TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
5976 
5977   if (i < __kmp_threads_capacity) {
5978 #if KMP_USE_MONITOR
5979     // 2009-09-08 (lev): Other alive roots found. Why do we kill the monitor??
5980     KMP_MB(); /* Flush all pending memory write invalidates.  */
5981 
5982     // Need to check that monitor was initialized before reaping it. If we are
5983     // called form __kmp_atfork_child (which sets __kmp_init_parallel = 0), then
5984     // __kmp_monitor will appear to contain valid data, but it is only valid in
5985     // the parent process, not the child.
5986     // New behavior (201008): instead of keying off of the flag
5987     // __kmp_init_parallel, the monitor thread creation is keyed off
5988     // of the new flag __kmp_init_monitor.
5989     __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
5990     if (TCR_4(__kmp_init_monitor)) {
5991       __kmp_reap_monitor(&__kmp_monitor);
5992       TCW_4(__kmp_init_monitor, 0);
5993     }
5994     __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
5995     KA_TRACE(10, ("__kmp_internal_end: monitor reaped\n"));
5996 #endif // KMP_USE_MONITOR
5997   } else {
5998 /* TODO move this to cleanup code */
5999 #ifdef KMP_DEBUG
6000     /* make sure that everything has properly ended */
6001     for (i = 0; i < __kmp_threads_capacity; i++) {
6002       if (__kmp_root[i]) {
6003         //                    KMP_ASSERT( ! KMP_UBER_GTID( i ) );         // AC:
6004         //                    there can be uber threads alive here
6005         KMP_ASSERT(!__kmp_root[i]->r.r_active); // TODO: can they be active?
6006       }
6007     }
6008 #endif
6009 
6010     KMP_MB();
6011 
6012     // Reap the worker threads.
6013     // This is valid for now, but be careful if threads are reaped sooner.
6014     while (__kmp_thread_pool != NULL) { // Loop thru all the thread in the pool.
6015       // Get the next thread from the pool.
6016       kmp_info_t *thread = CCAST(kmp_info_t *, __kmp_thread_pool);
6017       __kmp_thread_pool = thread->th.th_next_pool;
6018       // Reap it.
6019       KMP_DEBUG_ASSERT(thread->th.th_reap_state == KMP_SAFE_TO_REAP);
6020       thread->th.th_next_pool = NULL;
6021       thread->th.th_in_pool = FALSE;
6022       __kmp_reap_thread(thread, 0);
6023     }
6024     __kmp_thread_pool_insert_pt = NULL;
6025 
6026     // Reap teams.
6027     while (__kmp_team_pool != NULL) { // Loop thru all the teams in the pool.
6028       // Get the next team from the pool.
6029       kmp_team_t *team = CCAST(kmp_team_t *, __kmp_team_pool);
6030       __kmp_team_pool = team->t.t_next_pool;
6031       // Reap it.
6032       team->t.t_next_pool = NULL;
6033       __kmp_reap_team(team);
6034     }
6035 
6036     __kmp_reap_task_teams();
6037 
6038 #if KMP_OS_UNIX
6039     // Threads that are not reaped should not access any resources since they
6040     // are going to be deallocated soon, so the shutdown sequence should wait
6041     // until all threads either exit the final spin-waiting loop or begin
6042     // sleeping after the given blocktime.
6043     for (i = 0; i < __kmp_threads_capacity; i++) {
6044       kmp_info_t *thr = __kmp_threads[i];
6045       while (thr && KMP_ATOMIC_LD_ACQ(&thr->th.th_blocking))
6046         KMP_CPU_PAUSE();
6047     }
6048 #endif
6049 
6050     for (i = 0; i < __kmp_threads_capacity; ++i) {
6051       // TBD: Add some checking...
6052       // Something like KMP_DEBUG_ASSERT( __kmp_thread[ i ] == NULL );
6053     }
6054 
6055     /* Make sure all threadprivate destructors get run by joining with all
6056        worker threads before resetting this flag */
6057     TCW_SYNC_4(__kmp_init_common, FALSE);
6058 
6059     KA_TRACE(10, ("__kmp_internal_end: all workers reaped\n"));
6060     KMP_MB();
6061 
6062 #if KMP_USE_MONITOR
6063     // See note above: One of the possible fixes for CQ138434 / CQ140126
6064     //
6065     // FIXME: push both code fragments down and CSE them?
6066     // push them into __kmp_cleanup() ?
6067     __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
6068     if (TCR_4(__kmp_init_monitor)) {
6069       __kmp_reap_monitor(&__kmp_monitor);
6070       TCW_4(__kmp_init_monitor, 0);
6071     }
6072     __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
6073     KA_TRACE(10, ("__kmp_internal_end: monitor reaped\n"));
6074 #endif
6075   } /* else !__kmp_global.t_active */
6076   TCW_4(__kmp_init_gtid, FALSE);
6077   KMP_MB(); /* Flush all pending memory write invalidates.  */
6078 
6079   __kmp_cleanup();
6080 #if OMPT_SUPPORT
6081   ompt_fini();
6082 #endif
6083 }
6084 
6085 void __kmp_internal_end_library(int gtid_req) {
6086   /* if we have already cleaned up, don't try again, it wouldn't be pretty */
6087   /* this shouldn't be a race condition because __kmp_internal_end() is the
6088      only place to clear __kmp_serial_init */
6089   /* we'll check this later too, after we get the lock */
6090   // 2009-09-06: We do not set g_abort without setting g_done. This check looks
6091   // redundaant, because the next check will work in any case.
6092   if (__kmp_global.g.g_abort) {
6093     KA_TRACE(11, ("__kmp_internal_end_library: abort, exiting\n"));
6094     /* TODO abort? */
6095     return;
6096   }
6097   if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6098     KA_TRACE(10, ("__kmp_internal_end_library: already finished\n"));
6099     return;
6100   }
6101 
6102   KMP_MB(); /* Flush all pending memory write invalidates.  */
6103 
6104   /* find out who we are and what we should do */
6105   {
6106     int gtid = (gtid_req >= 0) ? gtid_req : __kmp_gtid_get_specific();
6107     KA_TRACE(
6108         10, ("__kmp_internal_end_library: enter T#%d  (%d)\n", gtid, gtid_req));
6109     if (gtid == KMP_GTID_SHUTDOWN) {
6110       KA_TRACE(10, ("__kmp_internal_end_library: !__kmp_init_runtime, system "
6111                     "already shutdown\n"));
6112       return;
6113     } else if (gtid == KMP_GTID_MONITOR) {
6114       KA_TRACE(10, ("__kmp_internal_end_library: monitor thread, gtid not "
6115                     "registered, or system shutdown\n"));
6116       return;
6117     } else if (gtid == KMP_GTID_DNE) {
6118       KA_TRACE(10, ("__kmp_internal_end_library: gtid not registered or system "
6119                     "shutdown\n"));
6120       /* we don't know who we are, but we may still shutdown the library */
6121     } else if (KMP_UBER_GTID(gtid)) {
6122       /* unregister ourselves as an uber thread.  gtid is no longer valid */
6123       if (__kmp_root[gtid]->r.r_active) {
6124         __kmp_global.g.g_abort = -1;
6125         TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6126         KA_TRACE(10,
6127                  ("__kmp_internal_end_library: root still active, abort T#%d\n",
6128                   gtid));
6129         return;
6130       } else {
6131         KA_TRACE(
6132             10,
6133             ("__kmp_internal_end_library: unregistering sibling T#%d\n", gtid));
6134         __kmp_unregister_root_current_thread(gtid);
6135       }
6136     } else {
6137 /* worker threads may call this function through the atexit handler, if they
6138  * call exit() */
6139 /* For now, skip the usual subsequent processing and just dump the debug buffer.
6140    TODO: do a thorough shutdown instead */
6141 #ifdef DUMP_DEBUG_ON_EXIT
6142       if (__kmp_debug_buf)
6143         __kmp_dump_debug_buffer();
6144 #endif
6145       return;
6146     }
6147   }
6148   /* synchronize the termination process */
6149   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6150 
6151   /* have we already finished */
6152   if (__kmp_global.g.g_abort) {
6153     KA_TRACE(10, ("__kmp_internal_end_library: abort, exiting\n"));
6154     /* TODO abort? */
6155     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6156     return;
6157   }
6158   if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6159     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6160     return;
6161   }
6162 
6163   /* We need this lock to enforce mutex between this reading of
6164      __kmp_threads_capacity and the writing by __kmp_register_root.
6165      Alternatively, we can use a counter of roots that is atomically updated by
6166      __kmp_get_global_thread_id_reg, __kmp_do_serial_initialize and
6167      __kmp_internal_end_*.  */
6168   __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
6169 
6170   /* now we can safely conduct the actual termination */
6171   __kmp_internal_end();
6172 
6173   __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6174   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6175 
6176   KA_TRACE(10, ("__kmp_internal_end_library: exit\n"));
6177 
6178 #ifdef DUMP_DEBUG_ON_EXIT
6179   if (__kmp_debug_buf)
6180     __kmp_dump_debug_buffer();
6181 #endif
6182 
6183 #if KMP_OS_WINDOWS
6184   __kmp_close_console();
6185 #endif
6186 
6187   __kmp_fini_allocator();
6188 
6189 } // __kmp_internal_end_library
6190 
6191 void __kmp_internal_end_thread(int gtid_req) {
6192   int i;
6193 
6194   /* if we have already cleaned up, don't try again, it wouldn't be pretty */
6195   /* this shouldn't be a race condition because __kmp_internal_end() is the
6196    * only place to clear __kmp_serial_init */
6197   /* we'll check this later too, after we get the lock */
6198   // 2009-09-06: We do not set g_abort without setting g_done. This check looks
6199   // redundant, because the next check will work in any case.
6200   if (__kmp_global.g.g_abort) {
6201     KA_TRACE(11, ("__kmp_internal_end_thread: abort, exiting\n"));
6202     /* TODO abort? */
6203     return;
6204   }
6205   if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6206     KA_TRACE(10, ("__kmp_internal_end_thread: already finished\n"));
6207     return;
6208   }
6209 
6210   KMP_MB(); /* Flush all pending memory write invalidates.  */
6211 
6212   /* find out who we are and what we should do */
6213   {
6214     int gtid = (gtid_req >= 0) ? gtid_req : __kmp_gtid_get_specific();
6215     KA_TRACE(10,
6216              ("__kmp_internal_end_thread: enter T#%d  (%d)\n", gtid, gtid_req));
6217     if (gtid == KMP_GTID_SHUTDOWN) {
6218       KA_TRACE(10, ("__kmp_internal_end_thread: !__kmp_init_runtime, system "
6219                     "already shutdown\n"));
6220       return;
6221     } else if (gtid == KMP_GTID_MONITOR) {
6222       KA_TRACE(10, ("__kmp_internal_end_thread: monitor thread, gtid not "
6223                     "registered, or system shutdown\n"));
6224       return;
6225     } else if (gtid == KMP_GTID_DNE) {
6226       KA_TRACE(10, ("__kmp_internal_end_thread: gtid not registered or system "
6227                     "shutdown\n"));
6228       return;
6229       /* we don't know who we are */
6230     } else if (KMP_UBER_GTID(gtid)) {
6231       /* unregister ourselves as an uber thread.  gtid is no longer valid */
6232       if (__kmp_root[gtid]->r.r_active) {
6233         __kmp_global.g.g_abort = -1;
6234         TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6235         KA_TRACE(10,
6236                  ("__kmp_internal_end_thread: root still active, abort T#%d\n",
6237                   gtid));
6238         return;
6239       } else {
6240         KA_TRACE(10, ("__kmp_internal_end_thread: unregistering sibling T#%d\n",
6241                       gtid));
6242         __kmp_unregister_root_current_thread(gtid);
6243       }
6244     } else {
6245       /* just a worker thread, let's leave */
6246       KA_TRACE(10, ("__kmp_internal_end_thread: worker thread T#%d\n", gtid));
6247 
6248       if (gtid >= 0) {
6249         __kmp_threads[gtid]->th.th_task_team = NULL;
6250       }
6251 
6252       KA_TRACE(10,
6253                ("__kmp_internal_end_thread: worker thread done, exiting T#%d\n",
6254                 gtid));
6255       return;
6256     }
6257   }
6258 #if KMP_DYNAMIC_LIB
6259   if (__kmp_pause_status != kmp_hard_paused)
6260   // AC: lets not shutdown the dynamic library at the exit of uber thread,
6261   // because we will better shutdown later in the library destructor.
6262   {
6263     KA_TRACE(10, ("__kmp_internal_end_thread: exiting T#%d\n", gtid_req));
6264     return;
6265   }
6266 #endif
6267   /* synchronize the termination process */
6268   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6269 
6270   /* have we already finished */
6271   if (__kmp_global.g.g_abort) {
6272     KA_TRACE(10, ("__kmp_internal_end_thread: abort, exiting\n"));
6273     /* TODO abort? */
6274     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6275     return;
6276   }
6277   if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6278     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6279     return;
6280   }
6281 
6282   /* We need this lock to enforce mutex between this reading of
6283      __kmp_threads_capacity and the writing by __kmp_register_root.
6284      Alternatively, we can use a counter of roots that is atomically updated by
6285      __kmp_get_global_thread_id_reg, __kmp_do_serial_initialize and
6286      __kmp_internal_end_*.  */
6287 
6288   /* should we finish the run-time?  are all siblings done? */
6289   __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
6290 
6291   for (i = 0; i < __kmp_threads_capacity; ++i) {
6292     if (KMP_UBER_GTID(i)) {
6293       KA_TRACE(
6294           10,
6295           ("__kmp_internal_end_thread: remaining sibling task: gtid==%d\n", i));
6296       __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6297       __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6298       return;
6299     }
6300   }
6301 
6302   /* now we can safely conduct the actual termination */
6303 
6304   __kmp_internal_end();
6305 
6306   __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6307   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6308 
6309   KA_TRACE(10, ("__kmp_internal_end_thread: exit T#%d\n", gtid_req));
6310 
6311 #ifdef DUMP_DEBUG_ON_EXIT
6312   if (__kmp_debug_buf)
6313     __kmp_dump_debug_buffer();
6314 #endif
6315 } // __kmp_internal_end_thread
6316 
6317 // -----------------------------------------------------------------------------
6318 // Library registration stuff.
6319 
// Random value used to indicate library initialization. Assigned in
// __kmp_register_library_startup() and reset to 0 in
// __kmp_unregister_library().
static long __kmp_registration_flag = 0;
// Value to be saved in env var __KMP_REGISTERED_LIB_<pid>. Built in
// __kmp_register_library_startup() and freed in __kmp_unregister_library().
static char *__kmp_registration_str = NULL;
6324 
6325 static inline char *__kmp_reg_status_name() {
6326   /* On RHEL 3u5 if linked statically, getpid() returns different values in
6327      each thread. If registration and unregistration go in different threads
6328      (omp_misc_other_root_exit.cpp test case), the name of registered_lib_env
6329      env var can not be found, because the name will contain different pid. */
6330   return __kmp_str_format("__KMP_REGISTERED_LIB_%d", (int)getpid());
6331 } // __kmp_reg_status_get
6332 
6333 void __kmp_register_library_startup(void) {
6334 
6335   char *name = __kmp_reg_status_name(); // Name of the environment variable.
6336   int done = 0;
6337   union {
6338     double dtime;
6339     long ltime;
6340   } time;
6341 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
6342   __kmp_initialize_system_tick();
6343 #endif
6344   __kmp_read_system_time(&time.dtime);
6345   __kmp_registration_flag = 0xCAFE0000L | (time.ltime & 0x0000FFFFL);
6346   __kmp_registration_str =
6347       __kmp_str_format("%p-%lx-%s", &__kmp_registration_flag,
6348                        __kmp_registration_flag, KMP_LIBRARY_FILE);
6349 
6350   KA_TRACE(50, ("__kmp_register_library_startup: %s=\"%s\"\n", name,
6351                 __kmp_registration_str));
6352 
6353   while (!done) {
6354 
6355     char *value = NULL; // Actual value of the environment variable.
6356 
6357     // Set environment variable, but do not overwrite if it is exist.
6358     __kmp_env_set(name, __kmp_registration_str, 0);
6359     // Check the variable is written.
6360     value = __kmp_env_get(name);
6361     if (value != NULL && strcmp(value, __kmp_registration_str) == 0) {
6362 
6363       done = 1; // Ok, environment variable set successfully, exit the loop.
6364 
6365     } else {
6366 
6367       // Oops. Write failed. Another copy of OpenMP RTL is in memory.
6368       // Check whether it alive or dead.
6369       int neighbor = 0; // 0 -- unknown status, 1 -- alive, 2 -- dead.
6370       char *tail = value;
6371       char *flag_addr_str = NULL;
6372       char *flag_val_str = NULL;
6373       char const *file_name = NULL;
6374       __kmp_str_split(tail, '-', &flag_addr_str, &tail);
6375       __kmp_str_split(tail, '-', &flag_val_str, &tail);
6376       file_name = tail;
6377       if (tail != NULL) {
6378         long *flag_addr = 0;
6379         long flag_val = 0;
6380         KMP_SSCANF(flag_addr_str, "%p", RCAST(void**, &flag_addr));
6381         KMP_SSCANF(flag_val_str, "%lx", &flag_val);
6382         if (flag_addr != 0 && flag_val != 0 && strcmp(file_name, "") != 0) {
6383           // First, check whether environment-encoded address is mapped into
6384           // addr space.
6385           // If so, dereference it to see if it still has the right value.
6386           if (__kmp_is_address_mapped(flag_addr) && *flag_addr == flag_val) {
6387             neighbor = 1;
6388           } else {
6389             // If not, then we know the other copy of the library is no longer
6390             // running.
6391             neighbor = 2;
6392           }
6393         }
6394       }
6395       switch (neighbor) {
6396       case 0: // Cannot parse environment variable -- neighbor status unknown.
6397         // Assume it is the incompatible format of future version of the
6398         // library. Assume the other library is alive.
6399         // WARN( ... ); // TODO: Issue a warning.
6400         file_name = "unknown library";
6401         KMP_FALLTHROUGH();
6402       // Attention! Falling to the next case. That's intentional.
6403       case 1: { // Neighbor is alive.
6404         // Check it is allowed.
6405         char *duplicate_ok = __kmp_env_get("KMP_DUPLICATE_LIB_OK");
6406         if (!__kmp_str_match_true(duplicate_ok)) {
6407           // That's not allowed. Issue fatal error.
6408           __kmp_fatal(KMP_MSG(DuplicateLibrary, KMP_LIBRARY_FILE, file_name),
6409                       KMP_HNT(DuplicateLibrary), __kmp_msg_null);
6410         }
6411         KMP_INTERNAL_FREE(duplicate_ok);
6412         __kmp_duplicate_library_ok = 1;
6413         done = 1; // Exit the loop.
6414       } break;
6415       case 2: { // Neighbor is dead.
6416         // Clear the variable and try to register library again.
6417         __kmp_env_unset(name);
6418       } break;
6419       default: { KMP_DEBUG_ASSERT(0); } break;
6420       }
6421     }
6422     KMP_INTERNAL_FREE((void *)value);
6423   }
6424   KMP_INTERNAL_FREE((void *)name);
6425 
6426 } // func __kmp_register_library_startup
6427 
6428 void __kmp_unregister_library(void) {
6429 
6430   char *name = __kmp_reg_status_name();
6431   char *value = __kmp_env_get(name);
6432 
6433   KMP_DEBUG_ASSERT(__kmp_registration_flag != 0);
6434   KMP_DEBUG_ASSERT(__kmp_registration_str != NULL);
6435   if (value != NULL && strcmp(value, __kmp_registration_str) == 0) {
6436     // Ok, this is our variable. Delete it.
6437     __kmp_env_unset(name);
6438   }
6439 
6440   KMP_INTERNAL_FREE(__kmp_registration_str);
6441   KMP_INTERNAL_FREE(value);
6442   KMP_INTERNAL_FREE(name);
6443 
6444   __kmp_registration_flag = 0;
6445   __kmp_registration_str = NULL;
6446 
6447 } // __kmp_unregister_library
6448 
6449 // End of Library registration stuff.
6450 // -----------------------------------------------------------------------------
6451 
6452 #if KMP_MIC_SUPPORTED
6453 
6454 static void __kmp_check_mic_type() {
6455   kmp_cpuid_t cpuid_state = {0};
6456   kmp_cpuid_t *cs_p = &cpuid_state;
6457   __kmp_x86_cpuid(1, 0, cs_p);
6458   // We don't support mic1 at the moment
6459   if ((cs_p->eax & 0xff0) == 0xB10) {
6460     __kmp_mic_type = mic2;
6461   } else if ((cs_p->eax & 0xf0ff0) == 0x50670) {
6462     __kmp_mic_type = mic3;
6463   } else {
6464     __kmp_mic_type = non_mic;
6465   }
6466 }
6467 
6468 #endif /* KMP_MIC_SUPPORTED */
6469 
/* Performs the serial (first) stage of runtime initialization and sets
   __kmp_init_serial when done. In order it: initializes the allocator,
   registers the library through the environment, resets the abort/done
   flags, initializes the global locks, runs __kmp_runtime_initialize(), sets
   default values for thread limits, blocktime, scheduling and barriers,
   reads the environment via __kmp_env_initialize(), allocates the
   __kmp_threads / __kmp_root arrays, registers the calling thread as the
   initial root, and installs the atfork / atexit / signal handlers where
   configured.
   This function takes no lock itself; the visible caller
   __kmp_serial_initialize() holds __kmp_initz_lock around the call. */
static void __kmp_do_serial_initialize(void) {
  int i, gtid;
  int size;

  KA_TRACE(10, ("__kmp_do_serial_initialize: enter\n"));

  /* Sanity-check the sizes of the fixed-width integer types. */
  KMP_DEBUG_ASSERT(sizeof(kmp_int32) == 4);
  KMP_DEBUG_ASSERT(sizeof(kmp_uint32) == 4);
  KMP_DEBUG_ASSERT(sizeof(kmp_int64) == 8);
  KMP_DEBUG_ASSERT(sizeof(kmp_uint64) == 8);
  KMP_DEBUG_ASSERT(sizeof(kmp_intptr_t) == sizeof(void *));

#if OMPT_SUPPORT
  ompt_pre_init();
#endif

  __kmp_validate_locks();

  /* Initialize internal memory allocator */
  __kmp_init_allocator();

  /* Register the library startup via an environment variable and check to see
     whether another copy of the library is already registered. */

  __kmp_register_library_startup();

  /* TODO reinitialization of library */
  if (TCR_4(__kmp_global.g.g_done)) {
    KA_TRACE(10, ("__kmp_do_serial_initialize: reinitialization of library\n"));
  }

  /* Clear the shutdown state flags. */
  __kmp_global.g.g_abort = 0;
  TCW_SYNC_4(__kmp_global.g.g_done, FALSE);

/* initialize the locks */
#if KMP_USE_ADAPTIVE_LOCKS
#if KMP_DEBUG_ADAPTIVE_LOCKS
  __kmp_init_speculative_stats();
#endif
#endif
#if KMP_STATS_ENABLED
  __kmp_stats_init();
#endif
  __kmp_init_lock(&__kmp_global_lock);
  __kmp_init_queuing_lock(&__kmp_dispatch_lock);
  __kmp_init_lock(&__kmp_debug_lock);
  __kmp_init_atomic_lock(&__kmp_atomic_lock);
  __kmp_init_atomic_lock(&__kmp_atomic_lock_1i);
  __kmp_init_atomic_lock(&__kmp_atomic_lock_2i);
  __kmp_init_atomic_lock(&__kmp_atomic_lock_4i);
  __kmp_init_atomic_lock(&__kmp_atomic_lock_4r);
  __kmp_init_atomic_lock(&__kmp_atomic_lock_8i);
  __kmp_init_atomic_lock(&__kmp_atomic_lock_8r);
  __kmp_init_atomic_lock(&__kmp_atomic_lock_8c);
  __kmp_init_atomic_lock(&__kmp_atomic_lock_10r);
  __kmp_init_atomic_lock(&__kmp_atomic_lock_16r);
  __kmp_init_atomic_lock(&__kmp_atomic_lock_16c);
  __kmp_init_atomic_lock(&__kmp_atomic_lock_20c);
  __kmp_init_atomic_lock(&__kmp_atomic_lock_32c);
  __kmp_init_bootstrap_lock(&__kmp_forkjoin_lock);
  __kmp_init_bootstrap_lock(&__kmp_exit_lock);
#if KMP_USE_MONITOR
  __kmp_init_bootstrap_lock(&__kmp_monitor_lock);
#endif
  __kmp_init_bootstrap_lock(&__kmp_tp_cached_lock);

  /* conduct initialization and initial setup of configuration */

  __kmp_runtime_initialize();

#if KMP_MIC_SUPPORTED
  __kmp_check_mic_type();
#endif

// Some global variable initialization moved here from kmp_env_initialize()
#ifdef KMP_DEBUG
  kmp_diag = 0;
#endif
  __kmp_abort_delay = 0;

  // From __kmp_init_dflt_team_nth()
  /* assume the entire machine will be used */
  /* The upper bound starts at __kmp_xproc and is clamped to the range
     [KMP_MIN_NTH, __kmp_sys_max_nth]. */
  __kmp_dflt_team_nth_ub = __kmp_xproc;
  if (__kmp_dflt_team_nth_ub < KMP_MIN_NTH) {
    __kmp_dflt_team_nth_ub = KMP_MIN_NTH;
  }
  if (__kmp_dflt_team_nth_ub > __kmp_sys_max_nth) {
    __kmp_dflt_team_nth_ub = __kmp_sys_max_nth;
  }
  __kmp_max_nth = __kmp_sys_max_nth;
  __kmp_cg_max_nth = __kmp_sys_max_nth;
  __kmp_teams_max_nth = __kmp_xproc; // set a "reasonable" default
  if (__kmp_teams_max_nth > __kmp_sys_max_nth) {
    __kmp_teams_max_nth = __kmp_sys_max_nth;
  }

  // Three vars below moved here from __kmp_env_initialize() "KMP_BLOCKTIME"
  // part
  __kmp_dflt_blocktime = KMP_DEFAULT_BLOCKTIME;
#if KMP_USE_MONITOR
  __kmp_monitor_wakeups =
      KMP_WAKEUPS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups);
  __kmp_bt_intervals =
      KMP_INTERVALS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups);
#endif
  // From "KMP_LIBRARY" part of __kmp_env_initialize()
  __kmp_library = library_throughput;
  // From KMP_SCHEDULE initialization
  __kmp_static = kmp_sch_static_balanced;
// AC: do not use analytical here, because it is non-monotonous
//__kmp_guided = kmp_sch_guided_iterative_chunked;
//__kmp_auto = kmp_sch_guided_analytical_chunked; // AC: it is the default, no
// need to repeat assignment
// Barrier initialization. Moved here from __kmp_env_initialize() Barrier branch
// bit control and barrier method control parts
// The macros below are local to this function: they name the settings used
// for the reduction barrier and are #undef'ed right after the loop.
#if KMP_FAST_REDUCTION_BARRIER
#define kmp_reduction_barrier_gather_bb ((int)1)
#define kmp_reduction_barrier_release_bb ((int)1)
#define kmp_reduction_barrier_gather_pat bp_hyper_bar
#define kmp_reduction_barrier_release_pat bp_hyper_bar
#endif // KMP_FAST_REDUCTION_BARRIER
  // Give every barrier type the default branch bits and pattern; the
  // reduction barrier is then overridden when KMP_FAST_REDUCTION_BARRIER is on.
  for (i = bs_plain_barrier; i < bs_last_barrier; i++) {
    __kmp_barrier_gather_branch_bits[i] = __kmp_barrier_gather_bb_dflt;
    __kmp_barrier_release_branch_bits[i] = __kmp_barrier_release_bb_dflt;
    __kmp_barrier_gather_pattern[i] = __kmp_barrier_gather_pat_dflt;
    __kmp_barrier_release_pattern[i] = __kmp_barrier_release_pat_dflt;
#if KMP_FAST_REDUCTION_BARRIER
    if (i == bs_reduction_barrier) { // tested and confirmed on ALTIX only (
      // lin_64 ): hyper,1
      __kmp_barrier_gather_branch_bits[i] = kmp_reduction_barrier_gather_bb;
      __kmp_barrier_release_branch_bits[i] = kmp_reduction_barrier_release_bb;
      __kmp_barrier_gather_pattern[i] = kmp_reduction_barrier_gather_pat;
      __kmp_barrier_release_pattern[i] = kmp_reduction_barrier_release_pat;
    }
#endif // KMP_FAST_REDUCTION_BARRIER
  }
#if KMP_FAST_REDUCTION_BARRIER
#undef kmp_reduction_barrier_release_pat
#undef kmp_reduction_barrier_gather_pat
#undef kmp_reduction_barrier_release_bb
#undef kmp_reduction_barrier_gather_bb
#endif // KMP_FAST_REDUCTION_BARRIER
#if KMP_MIC_SUPPORTED
  if (__kmp_mic_type == mic2) { // KNC
    // AC: plane=3,2, forkjoin=2,1 are optimal for 240 threads on KNC
    __kmp_barrier_gather_branch_bits[bs_plain_barrier] = 3; // plain gather
    __kmp_barrier_release_branch_bits[bs_forkjoin_barrier] =
        1; // forkjoin release
    __kmp_barrier_gather_pattern[bs_forkjoin_barrier] = bp_hierarchical_bar;
    __kmp_barrier_release_pattern[bs_forkjoin_barrier] = bp_hierarchical_bar;
  }
#if KMP_FAST_REDUCTION_BARRIER
  if (__kmp_mic_type == mic2) { // KNC
    __kmp_barrier_gather_pattern[bs_reduction_barrier] = bp_hierarchical_bar;
    __kmp_barrier_release_pattern[bs_reduction_barrier] = bp_hierarchical_bar;
  }
#endif // KMP_FAST_REDUCTION_BARRIER
#endif // KMP_MIC_SUPPORTED

// From KMP_CHECKS initialization
#ifdef KMP_DEBUG
  __kmp_env_checks = TRUE; /* development versions have the extra checks */
#else
  __kmp_env_checks = FALSE; /* port versions do not have the extra checks */
#endif

  // From "KMP_FOREIGN_THREADS_THREADPRIVATE" initialization
  __kmp_foreign_tp = TRUE;

  __kmp_global.g.g_dynamic = FALSE;
  __kmp_global.g.g_dynamic_mode = dynamic_default;

  /* Runs after the defaults above have been assigned. */
  __kmp_env_initialize(NULL);

// Print all messages in message catalog for testing purposes.
#ifdef KMP_DEBUG
  char const *val = __kmp_env_get("KMP_DUMP_CATALOG");
  if (__kmp_str_match_true(val)) {
    kmp_str_buf_t buffer;
    __kmp_str_buf_init(&buffer);
    __kmp_i18n_dump_catalog(&buffer);
    __kmp_printf("%s", buffer.str);
    __kmp_str_buf_free(&buffer);
  }
  __kmp_env_free(&val);
#endif

  __kmp_threads_capacity =
      __kmp_initial_threads_capacity(__kmp_dflt_team_nth_ub);
  // Moved here from __kmp_env_initialize() "KMP_ALL_THREADPRIVATE" part
  __kmp_tp_capacity = __kmp_default_tp_capacity(
      __kmp_dflt_team_nth_ub, __kmp_max_nth, __kmp_allThreadsSpecified);

  // If the library is shut down properly, both pools must be NULL. Just in
  // case, set them to NULL -- some memory may leak, but subsequent code will
  // work even if pools are not freed.
  KMP_DEBUG_ASSERT(__kmp_thread_pool == NULL);
  KMP_DEBUG_ASSERT(__kmp_thread_pool_insert_pt == NULL);
  KMP_DEBUG_ASSERT(__kmp_team_pool == NULL);
  __kmp_thread_pool = NULL;
  __kmp_thread_pool_insert_pt = NULL;
  __kmp_team_pool = NULL;

  /* Allocate all of the variable sized records */
  /* NOTE: __kmp_threads_capacity entries are allocated, but the arrays are
   * expandable */
  /* Since allocation is cache-aligned, just add extra padding at the end */
  /* A single allocation holds both arrays: __kmp_threads comes first and
     __kmp_root starts right after its __kmp_threads_capacity entries. */
  size =
      (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * __kmp_threads_capacity +
      CACHE_LINE;
  __kmp_threads = (kmp_info_t **)__kmp_allocate(size);
  __kmp_root = (kmp_root_t **)((char *)__kmp_threads +
                               sizeof(kmp_info_t *) * __kmp_threads_capacity);

  /* init thread counts */
  KMP_DEBUG_ASSERT(__kmp_all_nth ==
                   0); // Asserts fail if the library is reinitializing and
  KMP_DEBUG_ASSERT(__kmp_nth == 0); // something was wrong in termination.
  __kmp_all_nth = 0;
  __kmp_nth = 0;

  /* setup the uber master thread and hierarchy */
  gtid = __kmp_register_root(TRUE);
  KA_TRACE(10, ("__kmp_do_serial_initialize  T#%d\n", gtid));
  KMP_ASSERT(KMP_UBER_GTID(gtid));
  KMP_ASSERT(KMP_INITIAL_GTID(gtid));

  KMP_MB(); /* Flush all pending memory write invalidates.  */

  __kmp_common_initialize();

#if KMP_OS_UNIX
  /* invoke the child fork handler */
  __kmp_register_atfork();
#endif

#if !KMP_DYNAMIC_LIB
  {
    /* Invoke the exit handler when the program finishes, only for static
       library. For dynamic library, we already have _fini and DllMain. */
    int rc = atexit(__kmp_internal_end_atexit);
    if (rc != 0) {
      __kmp_fatal(KMP_MSG(FunctionError, "atexit()"), KMP_ERR(rc),
                  __kmp_msg_null);
    }
  }
#endif

#if KMP_HANDLE_SIGNALS
#if KMP_OS_UNIX
  /* NOTE: make sure that this is called before the user installs their own
     signal handlers so that the user handlers are called first. this way they
     can return false, not call our handler, avoid terminating the library, and
     continue execution where they left off. */
  __kmp_install_signals(FALSE);
#endif /* KMP_OS_UNIX */
#if KMP_OS_WINDOWS
  __kmp_install_signals(TRUE);
#endif /* KMP_OS_WINDOWS */
#endif

  /* we have finished the serial initialization */
  __kmp_init_counter++;

  __kmp_init_serial = TRUE;

  /* Optional printing of the settings, controlled by the flags tested here. */
  if (__kmp_settings) {
    __kmp_env_print();
  }

  if (__kmp_display_env || __kmp_display_env_verbose) {
    __kmp_env_print_2();
  }

#if OMPT_SUPPORT
  ompt_post_init();
#endif

  KMP_MB();

  KA_TRACE(10, ("__kmp_do_serial_initialize: exit\n"));
}
6752 
6753 void __kmp_serial_initialize(void) {
6754   if (__kmp_init_serial) {
6755     return;
6756   }
6757   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6758   if (__kmp_init_serial) {
6759     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6760     return;
6761   }
6762   __kmp_do_serial_initialize();
6763   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6764 }
6765 
6766 static void __kmp_do_middle_initialize(void) {
6767   int i, j;
6768   int prev_dflt_team_nth;
6769 
6770   if (!__kmp_init_serial) {
6771     __kmp_do_serial_initialize();
6772   }
6773 
6774   KA_TRACE(10, ("__kmp_middle_initialize: enter\n"));
6775 
6776   // Save the previous value for the __kmp_dflt_team_nth so that
6777   // we can avoid some reinitialization if it hasn't changed.
6778   prev_dflt_team_nth = __kmp_dflt_team_nth;
6779 
6780 #if KMP_AFFINITY_SUPPORTED
6781   // __kmp_affinity_initialize() will try to set __kmp_ncores to the
6782   // number of cores on the machine.
6783   __kmp_affinity_initialize();
6784 
6785   // Run through the __kmp_threads array and set the affinity mask
6786   // for each root thread that is currently registered with the RTL.
6787   for (i = 0; i < __kmp_threads_capacity; i++) {
6788     if (TCR_PTR(__kmp_threads[i]) != NULL) {
6789       __kmp_affinity_set_init_mask(i, TRUE);
6790     }
6791   }
6792 #endif /* KMP_AFFINITY_SUPPORTED */
6793 
6794   KMP_ASSERT(__kmp_xproc > 0);
6795   if (__kmp_avail_proc == 0) {
6796     __kmp_avail_proc = __kmp_xproc;
6797   }
6798 
6799   // If there were empty places in num_threads list (OMP_NUM_THREADS=,,2,3),
6800   // correct them now
6801   j = 0;
6802   while ((j < __kmp_nested_nth.used) && !__kmp_nested_nth.nth[j]) {
6803     __kmp_nested_nth.nth[j] = __kmp_dflt_team_nth = __kmp_dflt_team_nth_ub =
6804         __kmp_avail_proc;
6805     j++;
6806   }
6807 
6808   if (__kmp_dflt_team_nth == 0) {
6809 #ifdef KMP_DFLT_NTH_CORES
6810     // Default #threads = #cores
6811     __kmp_dflt_team_nth = __kmp_ncores;
6812     KA_TRACE(20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = "
6813                   "__kmp_ncores (%d)\n",
6814                   __kmp_dflt_team_nth));
6815 #else
6816     // Default #threads = #available OS procs
6817     __kmp_dflt_team_nth = __kmp_avail_proc;
6818     KA_TRACE(20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = "
6819                   "__kmp_avail_proc(%d)\n",
6820                   __kmp_dflt_team_nth));
6821 #endif /* KMP_DFLT_NTH_CORES */
6822   }
6823 
6824   if (__kmp_dflt_team_nth < KMP_MIN_NTH) {
6825     __kmp_dflt_team_nth = KMP_MIN_NTH;
6826   }
6827   if (__kmp_dflt_team_nth > __kmp_sys_max_nth) {
6828     __kmp_dflt_team_nth = __kmp_sys_max_nth;
6829   }
6830 
6831   // There's no harm in continuing if the following check fails,
6832   // but it indicates an error in the previous logic.
6833   KMP_DEBUG_ASSERT(__kmp_dflt_team_nth <= __kmp_dflt_team_nth_ub);
6834 
6835   if (__kmp_dflt_team_nth != prev_dflt_team_nth) {
6836     // Run through the __kmp_threads array and set the num threads icv for each
6837     // root thread that is currently registered with the RTL (which has not
6838     // already explicitly set its nthreads-var with a call to
6839     // omp_set_num_threads()).
6840     for (i = 0; i < __kmp_threads_capacity; i++) {
6841       kmp_info_t *thread = __kmp_threads[i];
6842       if (thread == NULL)
6843         continue;
6844       if (thread->th.th_current_task->td_icvs.nproc != 0)
6845         continue;
6846 
6847       set__nproc(__kmp_threads[i], __kmp_dflt_team_nth);
6848     }
6849   }
6850   KA_TRACE(
6851       20,
6852       ("__kmp_middle_initialize: final value for __kmp_dflt_team_nth = %d\n",
6853        __kmp_dflt_team_nth));
6854 
6855 #ifdef KMP_ADJUST_BLOCKTIME
6856   /* Adjust blocktime to zero if necessary  now that __kmp_avail_proc is set */
6857   if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
6858     KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
6859     if (__kmp_nth > __kmp_avail_proc) {
6860       __kmp_zero_bt = TRUE;
6861     }
6862   }
6863 #endif /* KMP_ADJUST_BLOCKTIME */
6864 
6865   /* we have finished middle initialization */
6866   TCW_SYNC_4(__kmp_init_middle, TRUE);
6867 
6868   KA_TRACE(10, ("__kmp_do_middle_initialize: exit\n"));
6869 }
6870 
6871 void __kmp_middle_initialize(void) {
6872   if (__kmp_init_middle) {
6873     return;
6874   }
6875   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6876   if (__kmp_init_middle) {
6877     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6878     return;
6879   }
6880   __kmp_do_middle_initialize();
6881   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6882 }
6883 
6884 void __kmp_parallel_initialize(void) {
6885   int gtid = __kmp_entry_gtid(); // this might be a new root
6886 
6887   /* synchronize parallel initialization (for sibling) */
6888   if (TCR_4(__kmp_init_parallel))
6889     return;
6890   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6891   if (TCR_4(__kmp_init_parallel)) {
6892     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6893     return;
6894   }
6895 
6896   /* TODO reinitialization after we have already shut down */
6897   if (TCR_4(__kmp_global.g.g_done)) {
6898     KA_TRACE(
6899         10,
6900         ("__kmp_parallel_initialize: attempt to init while shutting down\n"));
6901     __kmp_infinite_loop();
6902   }
6903 
6904   /* jc: The lock __kmp_initz_lock is already held, so calling
6905      __kmp_serial_initialize would cause a deadlock.  So we call
6906      __kmp_do_serial_initialize directly. */
6907   if (!__kmp_init_middle) {
6908     __kmp_do_middle_initialize();
6909   }
6910   __kmp_resume_if_hard_paused();
6911 
6912   /* begin initialization */
6913   KA_TRACE(10, ("__kmp_parallel_initialize: enter\n"));
6914   KMP_ASSERT(KMP_UBER_GTID(gtid));
6915 
6916 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
6917   // Save the FP control regs.
6918   // Worker threads will set theirs to these values at thread startup.
6919   __kmp_store_x87_fpu_control_word(&__kmp_init_x87_fpu_control_word);
6920   __kmp_store_mxcsr(&__kmp_init_mxcsr);
6921   __kmp_init_mxcsr &= KMP_X86_MXCSR_MASK;
6922 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
6923 
6924 #if KMP_OS_UNIX
6925 #if KMP_HANDLE_SIGNALS
6926   /*  must be after __kmp_serial_initialize  */
6927   __kmp_install_signals(TRUE);
6928 #endif
6929 #endif
6930 
6931   __kmp_suspend_initialize();
6932 
6933 #if defined(USE_LOAD_BALANCE)
6934   if (__kmp_global.g.g_dynamic_mode == dynamic_default) {
6935     __kmp_global.g.g_dynamic_mode = dynamic_load_balance;
6936   }
6937 #else
6938   if (__kmp_global.g.g_dynamic_mode == dynamic_default) {
6939     __kmp_global.g.g_dynamic_mode = dynamic_thread_limit;
6940   }
6941 #endif
6942 
6943   if (__kmp_version) {
6944     __kmp_print_version_2();
6945   }
6946 
6947   /* we have finished parallel initialization */
6948   TCW_SYNC_4(__kmp_init_parallel, TRUE);
6949 
6950   KMP_MB();
6951   KA_TRACE(10, ("__kmp_parallel_initialize: exit\n"));
6952 
6953   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6954 }
6955 
6956 /* ------------------------------------------------------------------------ */
6957 
6958 void __kmp_run_before_invoked_task(int gtid, int tid, kmp_info_t *this_thr,
6959                                    kmp_team_t *team) {
6960   kmp_disp_t *dispatch;
6961 
6962   KMP_MB();
6963 
6964   /* none of the threads have encountered any constructs, yet. */
6965   this_thr->th.th_local.this_construct = 0;
6966 #if KMP_CACHE_MANAGE
6967   KMP_CACHE_PREFETCH(&this_thr->th.th_bar[bs_forkjoin_barrier].bb.b_arrived);
6968 #endif /* KMP_CACHE_MANAGE */
6969   dispatch = (kmp_disp_t *)TCR_PTR(this_thr->th.th_dispatch);
6970   KMP_DEBUG_ASSERT(dispatch);
6971   KMP_DEBUG_ASSERT(team->t.t_dispatch);
6972   // KMP_DEBUG_ASSERT( this_thr->th.th_dispatch == &team->t.t_dispatch[
6973   // this_thr->th.th_info.ds.ds_tid ] );
6974 
6975   dispatch->th_disp_index = 0; /* reset the dispatch buffer counter */
6976   dispatch->th_doacross_buf_idx = 0; // reset doacross dispatch buffer counter
6977   if (__kmp_env_consistency_check)
6978     __kmp_push_parallel(gtid, team->t.t_ident);
6979 
6980   KMP_MB(); /* Flush all pending memory write invalidates.  */
6981 }
6982 
6983 void __kmp_run_after_invoked_task(int gtid, int tid, kmp_info_t *this_thr,
6984                                   kmp_team_t *team) {
6985   if (__kmp_env_consistency_check)
6986     __kmp_pop_parallel(gtid, team->t.t_ident);
6987 
6988   __kmp_finish_implicit_task(this_thr);
6989 }
6990 
6991 int __kmp_invoke_task_func(int gtid) {
6992   int rc;
6993   int tid = __kmp_tid_from_gtid(gtid);
6994   kmp_info_t *this_thr = __kmp_threads[gtid];
6995   kmp_team_t *team = this_thr->th.th_team;
6996 
6997   __kmp_run_before_invoked_task(gtid, tid, this_thr, team);
6998 #if USE_ITT_BUILD
6999   if (__itt_stack_caller_create_ptr) {
7000     __kmp_itt_stack_callee_enter(
7001         (__itt_caller)
7002             team->t.t_stack_id); // inform ittnotify about entering user's code
7003   }
7004 #endif /* USE_ITT_BUILD */
7005 #if INCLUDE_SSC_MARKS
7006   SSC_MARK_INVOKING();
7007 #endif
7008 
7009 #if OMPT_SUPPORT
7010   void *dummy;
7011   void **exit_frame_p;
7012   ompt_data_t *my_task_data;
7013   ompt_data_t *my_parallel_data;
7014   int ompt_team_size;
7015 
7016   if (ompt_enabled.enabled) {
7017     exit_frame_p = &(
7018         team->t.t_implicit_task_taskdata[tid].ompt_task_info.frame.exit_frame.ptr);
7019   } else {
7020     exit_frame_p = &dummy;
7021   }
7022 
7023   my_task_data =
7024       &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data);
7025   my_parallel_data = &(team->t.ompt_team_info.parallel_data);
7026   if (ompt_enabled.ompt_callback_implicit_task) {
7027     ompt_team_size = team->t.t_nproc;
7028     ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
7029         ompt_scope_begin, my_parallel_data, my_task_data, ompt_team_size,
7030         __kmp_tid_from_gtid(gtid), ompt_task_implicit);
7031     OMPT_CUR_TASK_INFO(this_thr)->thread_num = __kmp_tid_from_gtid(gtid);
7032   }
7033 #endif
7034 
7035 #if KMP_STATS_ENABLED
7036   stats_state_e previous_state = KMP_GET_THREAD_STATE();
7037   if (previous_state == stats_state_e::TEAMS_REGION) {
7038     KMP_PUSH_PARTITIONED_TIMER(OMP_teams);
7039   } else {
7040     KMP_PUSH_PARTITIONED_TIMER(OMP_parallel);
7041   }
7042   KMP_SET_THREAD_STATE(IMPLICIT_TASK);
7043 #endif
7044 
7045   rc = __kmp_invoke_microtask((microtask_t)TCR_SYNC_PTR(team->t.t_pkfn), gtid,
7046                               tid, (int)team->t.t_argc, (void **)team->t.t_argv
7047 #if OMPT_SUPPORT
7048                               ,
7049                               exit_frame_p
7050 #endif
7051                               );
7052 #if OMPT_SUPPORT
7053   *exit_frame_p = NULL;
7054    this_thr->th.ompt_thread_info.parallel_flags |= ompt_parallel_team;
7055 #endif
7056 
7057 #if KMP_STATS_ENABLED
7058   if (previous_state == stats_state_e::TEAMS_REGION) {
7059     KMP_SET_THREAD_STATE(previous_state);
7060   }
7061   KMP_POP_PARTITIONED_TIMER();
7062 #endif
7063 
7064 #if USE_ITT_BUILD
7065   if (__itt_stack_caller_create_ptr) {
7066     __kmp_itt_stack_callee_leave(
7067         (__itt_caller)
7068             team->t.t_stack_id); // inform ittnotify about leaving user's code
7069   }
7070 #endif /* USE_ITT_BUILD */
7071   __kmp_run_after_invoked_task(gtid, tid, this_thr, team);
7072 
7073   return rc;
7074 }
7075 
7076 void __kmp_teams_master(int gtid) {
7077   // This routine is called by all master threads in teams construct
7078   kmp_info_t *thr = __kmp_threads[gtid];
7079   kmp_team_t *team = thr->th.th_team;
7080   ident_t *loc = team->t.t_ident;
7081   thr->th.th_set_nproc = thr->th.th_teams_size.nth;
7082   KMP_DEBUG_ASSERT(thr->th.th_teams_microtask);
7083   KMP_DEBUG_ASSERT(thr->th.th_set_nproc);
7084   KA_TRACE(20, ("__kmp_teams_master: T#%d, Tid %d, microtask %p\n", gtid,
7085                 __kmp_tid_from_gtid(gtid), thr->th.th_teams_microtask));
7086 
7087   // This thread is a new CG root.  Set up the proper variables.
7088   kmp_cg_root_t *tmp = (kmp_cg_root_t *)__kmp_allocate(sizeof(kmp_cg_root_t));
7089   tmp->cg_root = thr; // Make thr the CG root
7090   // Init to thread limit that was stored when league masters were forked
7091   tmp->cg_thread_limit = thr->th.th_current_task->td_icvs.thread_limit;
7092   tmp->cg_nthreads = 1; // Init counter to one active thread, this one
7093   KA_TRACE(100, ("__kmp_teams_master: Thread %p created node %p and init"
7094                  " cg_nthreads to 1\n",
7095                  thr, tmp));
7096   tmp->up = thr->th.th_cg_roots;
7097   thr->th.th_cg_roots = tmp;
7098 
7099 // Launch league of teams now, but not let workers execute
7100 // (they hang on fork barrier until next parallel)
7101 #if INCLUDE_SSC_MARKS
7102   SSC_MARK_FORKING();
7103 #endif
7104   __kmp_fork_call(loc, gtid, fork_context_intel, team->t.t_argc,
7105                   (microtask_t)thr->th.th_teams_microtask, // "wrapped" task
7106                   VOLATILE_CAST(launch_t) __kmp_invoke_task_func, NULL);
7107 #if INCLUDE_SSC_MARKS
7108   SSC_MARK_JOINING();
7109 #endif
7110   // If the team size was reduced from the limit, set it to the new size
7111   if (thr->th.th_team_nproc < thr->th.th_teams_size.nth)
7112     thr->th.th_teams_size.nth = thr->th.th_team_nproc;
7113   // AC: last parameter "1" eliminates join barrier which won't work because
7114   // worker threads are in a fork barrier waiting for more parallel regions
7115   __kmp_join_call(loc, gtid
7116 #if OMPT_SUPPORT
7117                   ,
7118                   fork_context_intel
7119 #endif
7120                   ,
7121                   1);
7122 }
7123 
7124 int __kmp_invoke_teams_master(int gtid) {
7125   kmp_info_t *this_thr = __kmp_threads[gtid];
7126   kmp_team_t *team = this_thr->th.th_team;
7127 #if KMP_DEBUG
7128   if (!__kmp_threads[gtid]->th.th_team->t.t_serialized)
7129     KMP_DEBUG_ASSERT((void *)__kmp_threads[gtid]->th.th_team->t.t_pkfn ==
7130                      (void *)__kmp_teams_master);
7131 #endif
7132   __kmp_run_before_invoked_task(gtid, 0, this_thr, team);
7133 #if OMPT_SUPPORT
7134   int tid = __kmp_tid_from_gtid(gtid);
7135   ompt_data_t *task_data =
7136       &team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data;
7137   ompt_data_t *parallel_data = &team->t.ompt_team_info.parallel_data;
7138   if (ompt_enabled.ompt_callback_implicit_task) {
7139     ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
7140         ompt_scope_begin, parallel_data, task_data, team->t.t_nproc, tid,
7141         ompt_task_initial);
7142     OMPT_CUR_TASK_INFO(this_thr)->thread_num = tid;
7143   }
7144 #endif
7145   __kmp_teams_master(gtid);
7146 #if OMPT_SUPPORT
7147   this_thr->th.ompt_thread_info.parallel_flags |= ompt_parallel_league;
7148 #endif
7149   __kmp_run_after_invoked_task(gtid, 0, this_thr, team);
7150   return 1;
7151 }
7152 
7153 /* this sets the requested number of threads for the next parallel region
7154    encountered by this team. since this should be enclosed in the forkjoin
7155    critical section it should avoid race conditions with asymmetrical nested
7156    parallelism */
7157 
7158 void __kmp_push_num_threads(ident_t *id, int gtid, int num_threads) {
7159   kmp_info_t *thr = __kmp_threads[gtid];
7160 
7161   if (num_threads > 0)
7162     thr->th.th_set_nproc = num_threads;
7163 }
7164 
7165 /* this sets the requested number of teams for the teams region and/or
7166    the number of threads for the next parallel region encountered  */
7167 void __kmp_push_num_teams(ident_t *id, int gtid, int num_teams,
7168                           int num_threads) {
7169   kmp_info_t *thr = __kmp_threads[gtid];
7170   KMP_DEBUG_ASSERT(num_teams >= 0);
7171   KMP_DEBUG_ASSERT(num_threads >= 0);
7172 
7173   if (num_teams == 0)
7174     num_teams = 1; // default number of teams is 1.
7175   if (num_teams > __kmp_teams_max_nth) { // if too many teams requested?
7176     if (!__kmp_reserve_warn) {
7177       __kmp_reserve_warn = 1;
7178       __kmp_msg(kmp_ms_warning,
7179                 KMP_MSG(CantFormThrTeam, num_teams, __kmp_teams_max_nth),
7180                 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
7181     }
7182     num_teams = __kmp_teams_max_nth;
7183   }
7184   // Set number of teams (number of threads in the outer "parallel" of the
7185   // teams)
7186   thr->th.th_set_nproc = thr->th.th_teams_size.nteams = num_teams;
7187 
7188   // Remember the number of threads for inner parallel regions
7189   if (!TCR_4(__kmp_init_middle))
7190     __kmp_middle_initialize(); // get internal globals calculated
7191   KMP_DEBUG_ASSERT(__kmp_avail_proc);
7192   KMP_DEBUG_ASSERT(__kmp_dflt_team_nth);
7193   if (num_threads == 0) {
7194     num_threads = __kmp_avail_proc / num_teams;
7195     // adjust num_threads w/o warning as it is not user setting
7196     // num_threads = min(num_threads, nthreads-var, thread-limit-var)
7197     // no thread_limit clause specified -  do not change thread-limit-var ICV
7198     if (num_threads > __kmp_dflt_team_nth) {
7199       num_threads = __kmp_dflt_team_nth; // honor nthreads-var ICV
7200     }
7201     if (num_threads > thr->th.th_current_task->td_icvs.thread_limit) {
7202       num_threads = thr->th.th_current_task->td_icvs.thread_limit;
7203     } // prevent team size to exceed thread-limit-var
7204     if (num_teams * num_threads > __kmp_teams_max_nth) {
7205       num_threads = __kmp_teams_max_nth / num_teams;
7206     }
7207   } else {
7208     // This thread will be the master of the league masters
7209     // Store new thread limit; old limit is saved in th_cg_roots list
7210     thr->th.th_current_task->td_icvs.thread_limit = num_threads;
7211     // num_threads = min(num_threads, nthreads-var)
7212     if (num_threads > __kmp_dflt_team_nth) {
7213       num_threads = __kmp_dflt_team_nth; // honor nthreads-var ICV
7214     }
7215     if (num_teams * num_threads > __kmp_teams_max_nth) {
7216       int new_threads = __kmp_teams_max_nth / num_teams;
7217       if (!__kmp_reserve_warn) { // user asked for too many threads
7218         __kmp_reserve_warn = 1; // conflicts with KMP_TEAMS_THREAD_LIMIT
7219         __kmp_msg(kmp_ms_warning,
7220                   KMP_MSG(CantFormThrTeam, num_threads, new_threads),
7221                   KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
7222       }
7223       num_threads = new_threads;
7224     }
7225   }
7226   thr->th.th_teams_size.nth = num_threads;
7227 }
7228 
7229 // Set the proc_bind var to use in the following parallel region.
7230 void __kmp_push_proc_bind(ident_t *id, int gtid, kmp_proc_bind_t proc_bind) {
7231   kmp_info_t *thr = __kmp_threads[gtid];
7232   thr->th.th_set_proc_bind = proc_bind;
7233 }
7234 
7235 /* Launch the worker threads into the microtask. */
7236 
7237 void __kmp_internal_fork(ident_t *id, int gtid, kmp_team_t *team) {
7238   kmp_info_t *this_thr = __kmp_threads[gtid];
7239 
7240 #ifdef KMP_DEBUG
7241   int f;
7242 #endif /* KMP_DEBUG */
7243 
7244   KMP_DEBUG_ASSERT(team);
7245   KMP_DEBUG_ASSERT(this_thr->th.th_team == team);
7246   KMP_ASSERT(KMP_MASTER_GTID(gtid));
7247   KMP_MB(); /* Flush all pending memory write invalidates.  */
7248 
7249   team->t.t_construct = 0; /* no single directives seen yet */
7250   team->t.t_ordered.dt.t_value =
7251       0; /* thread 0 enters the ordered section first */
7252 
7253   /* Reset the identifiers on the dispatch buffer */
7254   KMP_DEBUG_ASSERT(team->t.t_disp_buffer);
7255   if (team->t.t_max_nproc > 1) {
7256     int i;
7257     for (i = 0; i < __kmp_dispatch_num_buffers; ++i) {
7258       team->t.t_disp_buffer[i].buffer_index = i;
7259       team->t.t_disp_buffer[i].doacross_buf_idx = i;
7260     }
7261   } else {
7262     team->t.t_disp_buffer[0].buffer_index = 0;
7263     team->t.t_disp_buffer[0].doacross_buf_idx = 0;
7264   }
7265 
7266   KMP_MB(); /* Flush all pending memory write invalidates.  */
7267   KMP_ASSERT(this_thr->th.th_team == team);
7268 
7269 #ifdef KMP_DEBUG
7270   for (f = 0; f < team->t.t_nproc; f++) {
7271     KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
7272                      team->t.t_threads[f]->th.th_team_nproc == team->t.t_nproc);
7273   }
7274 #endif /* KMP_DEBUG */
7275 
7276   /* release the worker threads so they may begin working */
7277   __kmp_fork_barrier(gtid, 0);
7278 }
7279 
7280 void __kmp_internal_join(ident_t *id, int gtid, kmp_team_t *team) {
7281   kmp_info_t *this_thr = __kmp_threads[gtid];
7282 
7283   KMP_DEBUG_ASSERT(team);
7284   KMP_DEBUG_ASSERT(this_thr->th.th_team == team);
7285   KMP_ASSERT(KMP_MASTER_GTID(gtid));
7286   KMP_MB(); /* Flush all pending memory write invalidates.  */
7287 
7288 /* Join barrier after fork */
7289 
7290 #ifdef KMP_DEBUG
7291   if (__kmp_threads[gtid] &&
7292       __kmp_threads[gtid]->th.th_team_nproc != team->t.t_nproc) {
7293     __kmp_printf("GTID: %d, __kmp_threads[%d]=%p\n", gtid, gtid,
7294                  __kmp_threads[gtid]);
7295     __kmp_printf("__kmp_threads[%d]->th.th_team_nproc=%d, TEAM: %p, "
7296                  "team->t.t_nproc=%d\n",
7297                  gtid, __kmp_threads[gtid]->th.th_team_nproc, team,
7298                  team->t.t_nproc);
7299     __kmp_print_structure();
7300   }
7301   KMP_DEBUG_ASSERT(__kmp_threads[gtid] &&
7302                    __kmp_threads[gtid]->th.th_team_nproc == team->t.t_nproc);
7303 #endif /* KMP_DEBUG */
7304 
7305   __kmp_join_barrier(gtid); /* wait for everyone */
7306 #if OMPT_SUPPORT
7307   if (ompt_enabled.enabled &&
7308       this_thr->th.ompt_thread_info.state == ompt_state_wait_barrier_implicit) {
7309     int ds_tid = this_thr->th.th_info.ds.ds_tid;
7310     ompt_data_t *task_data = OMPT_CUR_TASK_DATA(this_thr);
7311     this_thr->th.ompt_thread_info.state = ompt_state_overhead;
7312 #if OMPT_OPTIONAL
7313     void *codeptr = NULL;
7314     if (KMP_MASTER_TID(ds_tid) &&
7315         (ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) ||
7316          ompt_callbacks.ompt_callback(ompt_callback_sync_region)))
7317       codeptr = OMPT_CUR_TEAM_INFO(this_thr)->master_return_address;
7318 
7319     if (ompt_enabled.ompt_callback_sync_region_wait) {
7320       ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
7321           ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data,
7322           codeptr);
7323     }
7324     if (ompt_enabled.ompt_callback_sync_region) {
7325       ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
7326           ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data,
7327           codeptr);
7328     }
7329 #endif
7330     if (!KMP_MASTER_TID(ds_tid) && ompt_enabled.ompt_callback_implicit_task) {
7331       ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
7332           ompt_scope_end, NULL, task_data, 0, ds_tid, ompt_task_implicit); // TODO: Can this be ompt_task_initial?
7333     }
7334   }
7335 #endif
7336 
7337   KMP_MB(); /* Flush all pending memory write invalidates.  */
7338   KMP_ASSERT(this_thr->th.th_team == team);
7339 }
7340 
7341 /* ------------------------------------------------------------------------ */
7342 
7343 #ifdef USE_LOAD_BALANCE
7344 
7345 // Return the worker threads actively spinning in the hot team, if we
7346 // are at the outermost level of parallelism.  Otherwise, return 0.
7347 static int __kmp_active_hot_team_nproc(kmp_root_t *root) {
7348   int i;
7349   int retval;
7350   kmp_team_t *hot_team;
7351 
7352   if (root->r.r_active) {
7353     return 0;
7354   }
7355   hot_team = root->r.r_hot_team;
7356   if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME) {
7357     return hot_team->t.t_nproc - 1; // Don't count master thread
7358   }
7359 
7360   // Skip the master thread - it is accounted for elsewhere.
7361   retval = 0;
7362   for (i = 1; i < hot_team->t.t_nproc; i++) {
7363     if (hot_team->t.t_threads[i]->th.th_active) {
7364       retval++;
7365     }
7366   }
7367   return retval;
7368 }
7369 
7370 // Perform an automatic adjustment to the number of
7371 // threads used by the next parallel region.
7372 static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc) {
7373   int retval;
7374   int pool_active;
7375   int hot_team_active;
7376   int team_curr_active;
7377   int system_active;
7378 
7379   KB_TRACE(20, ("__kmp_load_balance_nproc: called root:%p set_nproc:%d\n", root,
7380                 set_nproc));
7381   KMP_DEBUG_ASSERT(root);
7382   KMP_DEBUG_ASSERT(root->r.r_root_team->t.t_threads[0]
7383                        ->th.th_current_task->td_icvs.dynamic == TRUE);
7384   KMP_DEBUG_ASSERT(set_nproc > 1);
7385 
7386   if (set_nproc == 1) {
7387     KB_TRACE(20, ("__kmp_load_balance_nproc: serial execution.\n"));
7388     return 1;
7389   }
7390 
7391   // Threads that are active in the thread pool, active in the hot team for this
7392   // particular root (if we are at the outer par level), and the currently
7393   // executing thread (to become the master) are available to add to the new
7394   // team, but are currently contributing to the system load, and must be
7395   // accounted for.
7396   pool_active = __kmp_thread_pool_active_nth;
7397   hot_team_active = __kmp_active_hot_team_nproc(root);
7398   team_curr_active = pool_active + hot_team_active + 1;
7399 
7400   // Check the system load.
7401   system_active = __kmp_get_load_balance(__kmp_avail_proc + team_curr_active);
7402   KB_TRACE(30, ("__kmp_load_balance_nproc: system active = %d pool active = %d "
7403                 "hot team active = %d\n",
7404                 system_active, pool_active, hot_team_active));
7405 
7406   if (system_active < 0) {
7407     // There was an error reading the necessary info from /proc, so use the
7408     // thread limit algorithm instead. Once we set __kmp_global.g.g_dynamic_mode
7409     // = dynamic_thread_limit, we shouldn't wind up getting back here.
7410     __kmp_global.g.g_dynamic_mode = dynamic_thread_limit;
7411     KMP_WARNING(CantLoadBalUsing, "KMP_DYNAMIC_MODE=thread limit");
7412 
7413     // Make this call behave like the thread limit algorithm.
7414     retval = __kmp_avail_proc - __kmp_nth +
7415              (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
7416     if (retval > set_nproc) {
7417       retval = set_nproc;
7418     }
7419     if (retval < KMP_MIN_NTH) {
7420       retval = KMP_MIN_NTH;
7421     }
7422 
7423     KB_TRACE(20, ("__kmp_load_balance_nproc: thread limit exit. retval:%d\n",
7424                   retval));
7425     return retval;
7426   }
7427 
7428   // There is a slight delay in the load balance algorithm in detecting new
7429   // running procs. The real system load at this instant should be at least as
7430   // large as the #active omp thread that are available to add to the team.
7431   if (system_active < team_curr_active) {
7432     system_active = team_curr_active;
7433   }
7434   retval = __kmp_avail_proc - system_active + team_curr_active;
7435   if (retval > set_nproc) {
7436     retval = set_nproc;
7437   }
7438   if (retval < KMP_MIN_NTH) {
7439     retval = KMP_MIN_NTH;
7440   }
7441 
7442   KB_TRACE(20, ("__kmp_load_balance_nproc: exit. retval:%d\n", retval));
7443   return retval;
7444 } // __kmp_load_balance_nproc()
7445 
7446 #endif /* USE_LOAD_BALANCE */
7447 
7448 /* ------------------------------------------------------------------------ */
7449 
7450 /* NOTE: this is called with the __kmp_init_lock held */
7451 void __kmp_cleanup(void) {
7452   int f;
7453 
7454   KA_TRACE(10, ("__kmp_cleanup: enter\n"));
7455 
7456   if (TCR_4(__kmp_init_parallel)) {
7457 #if KMP_HANDLE_SIGNALS
7458     __kmp_remove_signals();
7459 #endif
7460     TCW_4(__kmp_init_parallel, FALSE);
7461   }
7462 
7463   if (TCR_4(__kmp_init_middle)) {
7464 #if KMP_AFFINITY_SUPPORTED
7465     __kmp_affinity_uninitialize();
7466 #endif /* KMP_AFFINITY_SUPPORTED */
7467     __kmp_cleanup_hierarchy();
7468     TCW_4(__kmp_init_middle, FALSE);
7469   }
7470 
7471   KA_TRACE(10, ("__kmp_cleanup: go serial cleanup\n"));
7472 
7473   if (__kmp_init_serial) {
7474     __kmp_runtime_destroy();
7475     __kmp_init_serial = FALSE;
7476   }
7477 
7478   __kmp_cleanup_threadprivate_caches();
7479 
7480   for (f = 0; f < __kmp_threads_capacity; f++) {
7481     if (__kmp_root[f] != NULL) {
7482       __kmp_free(__kmp_root[f]);
7483       __kmp_root[f] = NULL;
7484     }
7485   }
7486   __kmp_free(__kmp_threads);
7487   // __kmp_threads and __kmp_root were allocated at once, as single block, so
7488   // there is no need in freeing __kmp_root.
7489   __kmp_threads = NULL;
7490   __kmp_root = NULL;
7491   __kmp_threads_capacity = 0;
7492 
7493 #if KMP_USE_DYNAMIC_LOCK
7494   __kmp_cleanup_indirect_user_locks();
7495 #else
7496   __kmp_cleanup_user_locks();
7497 #endif
7498 
7499 #if KMP_AFFINITY_SUPPORTED
7500   KMP_INTERNAL_FREE(CCAST(char *, __kmp_cpuinfo_file));
7501   __kmp_cpuinfo_file = NULL;
7502 #endif /* KMP_AFFINITY_SUPPORTED */
7503 
7504 #if KMP_USE_ADAPTIVE_LOCKS
7505 #if KMP_DEBUG_ADAPTIVE_LOCKS
7506   __kmp_print_speculative_stats();
7507 #endif
7508 #endif
7509   KMP_INTERNAL_FREE(__kmp_nested_nth.nth);
7510   __kmp_nested_nth.nth = NULL;
7511   __kmp_nested_nth.size = 0;
7512   __kmp_nested_nth.used = 0;
7513   KMP_INTERNAL_FREE(__kmp_nested_proc_bind.bind_types);
7514   __kmp_nested_proc_bind.bind_types = NULL;
7515   __kmp_nested_proc_bind.size = 0;
7516   __kmp_nested_proc_bind.used = 0;
7517   if (__kmp_affinity_format) {
7518     KMP_INTERNAL_FREE(__kmp_affinity_format);
7519     __kmp_affinity_format = NULL;
7520   }
7521 
7522   __kmp_i18n_catclose();
7523 
7524 #if KMP_USE_HIER_SCHED
7525   __kmp_hier_scheds.deallocate();
7526 #endif
7527 
7528 #if KMP_STATS_ENABLED
7529   __kmp_stats_fini();
7530 #endif
7531 
7532   KA_TRACE(10, ("__kmp_cleanup: exit\n"));
7533 }
7534 
7535 /* ------------------------------------------------------------------------ */
7536 
7537 int __kmp_ignore_mppbeg(void) {
7538   char *env;
7539 
7540   if ((env = getenv("KMP_IGNORE_MPPBEG")) != NULL) {
7541     if (__kmp_str_match_false(env))
7542       return FALSE;
7543   }
7544   // By default __kmpc_begin() is no-op.
7545   return TRUE;
7546 }
7547 
7548 int __kmp_ignore_mppend(void) {
7549   char *env;
7550 
7551   if ((env = getenv("KMP_IGNORE_MPPEND")) != NULL) {
7552     if (__kmp_str_match_false(env))
7553       return FALSE;
7554   }
7555   // By default __kmpc_end() is no-op.
7556   return TRUE;
7557 }
7558 
7559 void __kmp_internal_begin(void) {
7560   int gtid;
7561   kmp_root_t *root;
7562 
7563   /* this is a very important step as it will register new sibling threads
7564      and assign these new uber threads a new gtid */
7565   gtid = __kmp_entry_gtid();
7566   root = __kmp_threads[gtid]->th.th_root;
7567   KMP_ASSERT(KMP_UBER_GTID(gtid));
7568 
7569   if (root->r.r_begin)
7570     return;
7571   __kmp_acquire_lock(&root->r.r_begin_lock, gtid);
7572   if (root->r.r_begin) {
7573     __kmp_release_lock(&root->r.r_begin_lock, gtid);
7574     return;
7575   }
7576 
7577   root->r.r_begin = TRUE;
7578 
7579   __kmp_release_lock(&root->r.r_begin_lock, gtid);
7580 }
7581 
7582 /* ------------------------------------------------------------------------ */
7583 
7584 void __kmp_user_set_library(enum library_type arg) {
7585   int gtid;
7586   kmp_root_t *root;
7587   kmp_info_t *thread;
7588 
7589   /* first, make sure we are initialized so we can get our gtid */
7590 
7591   gtid = __kmp_entry_gtid();
7592   thread = __kmp_threads[gtid];
7593 
7594   root = thread->th.th_root;
7595 
7596   KA_TRACE(20, ("__kmp_user_set_library: enter T#%d, arg: %d, %d\n", gtid, arg,
7597                 library_serial));
7598   if (root->r.r_in_parallel) { /* Must be called in serial section of top-level
7599                                   thread */
7600     KMP_WARNING(SetLibraryIncorrectCall);
7601     return;
7602   }
7603 
7604   switch (arg) {
7605   case library_serial:
7606     thread->th.th_set_nproc = 0;
7607     set__nproc(thread, 1);
7608     break;
7609   case library_turnaround:
7610     thread->th.th_set_nproc = 0;
7611     set__nproc(thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth
7612                                            : __kmp_dflt_team_nth_ub);
7613     break;
7614   case library_throughput:
7615     thread->th.th_set_nproc = 0;
7616     set__nproc(thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth
7617                                            : __kmp_dflt_team_nth_ub);
7618     break;
7619   default:
7620     KMP_FATAL(UnknownLibraryType, arg);
7621   }
7622 
7623   __kmp_aux_set_library(arg);
7624 }
7625 
7626 void __kmp_aux_set_stacksize(size_t arg) {
7627   if (!__kmp_init_serial)
7628     __kmp_serial_initialize();
7629 
7630 #if KMP_OS_DARWIN
7631   if (arg & (0x1000 - 1)) {
7632     arg &= ~(0x1000 - 1);
7633     if (arg + 0x1000) /* check for overflow if we round up */
7634       arg += 0x1000;
7635   }
7636 #endif
7637   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7638 
7639   /* only change the default stacksize before the first parallel region */
7640   if (!TCR_4(__kmp_init_parallel)) {
7641     size_t value = arg; /* argument is in bytes */
7642 
7643     if (value < __kmp_sys_min_stksize)
7644       value = __kmp_sys_min_stksize;
7645     else if (value > KMP_MAX_STKSIZE)
7646       value = KMP_MAX_STKSIZE;
7647 
7648     __kmp_stksize = value;
7649 
7650     __kmp_env_stksize = TRUE; /* was KMP_STACKSIZE specified? */
7651   }
7652 
7653   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7654 }
7655 
7656 /* set the behaviour of the runtime library */
7657 /* TODO this can cause some odd behaviour with sibling parallelism... */
7658 void __kmp_aux_set_library(enum library_type arg) {
7659   __kmp_library = arg;
7660 
7661   switch (__kmp_library) {
7662   case library_serial: {
7663     KMP_INFORM(LibraryIsSerial);
7664   } break;
7665   case library_turnaround:
7666     if (__kmp_use_yield == 1 && !__kmp_use_yield_exp_set)
7667       __kmp_use_yield = 2; // only yield when oversubscribed
7668     break;
7669   case library_throughput:
7670     if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME)
7671       __kmp_dflt_blocktime = 200;
7672     break;
7673   default:
7674     KMP_FATAL(UnknownLibraryType, arg);
7675   }
7676 }
7677 
7678 /* Getting team information common for all team API */
7679 // Returns NULL if not in teams construct
7680 static kmp_team_t *__kmp_aux_get_team_info(int &teams_serialized) {
7681   kmp_info_t *thr = __kmp_entry_thread();
7682   teams_serialized = 0;
7683   if (thr->th.th_teams_microtask) {
7684     kmp_team_t *team = thr->th.th_team;
7685     int tlevel = thr->th.th_teams_level; // the level of the teams construct
7686     int ii = team->t.t_level;
7687     teams_serialized = team->t.t_serialized;
7688     int level = tlevel + 1;
7689     KMP_DEBUG_ASSERT(ii >= tlevel);
7690     while (ii > level) {
7691       for (teams_serialized = team->t.t_serialized;
7692            (teams_serialized > 0) && (ii > level); teams_serialized--, ii--) {
7693       }
7694       if (team->t.t_serialized && (!teams_serialized)) {
7695         team = team->t.t_parent;
7696         continue;
7697       }
7698       if (ii > level) {
7699         team = team->t.t_parent;
7700         ii--;
7701       }
7702     }
7703     return team;
7704   }
7705   return NULL;
7706 }
7707 
7708 int __kmp_aux_get_team_num() {
7709   int serialized;
7710   kmp_team_t *team = __kmp_aux_get_team_info(serialized);
7711   if (team) {
7712     if (serialized > 1) {
7713       return 0; // teams region is serialized ( 1 team of 1 thread ).
7714     } else {
7715       return team->t.t_master_tid;
7716     }
7717   }
7718   return 0;
7719 }
7720 
7721 int __kmp_aux_get_num_teams() {
7722   int serialized;
7723   kmp_team_t *team = __kmp_aux_get_team_info(serialized);
7724   if (team) {
7725     if (serialized > 1) {
7726       return 1;
7727     } else {
7728       return team->t.t_parent->t.t_nproc;
7729     }
7730   }
7731   return 1;
7732 }
7733 
7734 /* ------------------------------------------------------------------------ */
7735 
/*
 * Affinity Format Parser
 *
 * Field is in form of: %[[[0].]size]type
 * % and type are required (%% means print a literal '%')
 * type is either single char or long name surrounded by {},
 * e.g., N or {num_threads}
 * 0 => leading zeros
 * . => right justified when size is specified
 * by default output is left justified
 * size is the *minimum* field length
 * All other characters are printed as is
 *
 * Available field types:
 * t {team_num}          - omp_get_team_num()
 * T {num_teams}         - omp_get_num_teams()
 * L {nesting_level}     - omp_get_level()
 * n {thread_num}        - omp_get_thread_num()
 * N {num_threads}       - omp_get_num_threads()
 * a {ancestor_tnum}     - omp_get_ancestor_thread_num(omp_get_level()-1)
 * H {host}              - name of host machine
 * P {process_id}        - process id (integer)
 * i {native_thread_id}  - native thread identifier (integer)
 * A {thread_affinity}   - comma separated list of integers or integer ranges
 *                         (values of affinity mask)
 *
 * Implementation-specific field types can be added
 * If a type is unknown, print "undefined"
*/
7763 
// One entry of the affinity-format keyword table: the single-character field
// name, the equivalent long name (written inside {} in a format string), and
// the conversion character used when the field is printed via snprintf.
typedef struct kmp_affinity_format_field_t {
  // Short field name, e.g. 'L' for the nesting level.
  char short_name;
  // Long field name, e.g. "nesting_level".
  const char *long_name;
  // snprintf conversion for the value (typically 'd' for integer or 's' for
  // string).
  char field_format;
} kmp_affinity_format_field_t;
7773 
7774 static const kmp_affinity_format_field_t __kmp_affinity_format_table[] = {
7775 #if KMP_AFFINITY_SUPPORTED
7776     {'A', "thread_affinity", 's'},
7777 #endif
7778     {'t', "team_num", 'd'},
7779     {'T', "num_teams", 'd'},
7780     {'L', "nesting_level", 'd'},
7781     {'n', "thread_num", 'd'},
7782     {'N', "num_threads", 'd'},
7783     {'a', "ancestor_tnum", 'd'},
7784     {'H', "host", 's'},
7785     {'P', "process_id", 'd'},
7786     {'i', "native_thread_id", 'd'}};
7787 
7788 // Return the number of characters it takes to hold field
7789 static int __kmp_aux_capture_affinity_field(int gtid, const kmp_info_t *th,
7790                                             const char **ptr,
7791                                             kmp_str_buf_t *field_buffer) {
7792   int rc, format_index, field_value;
7793   const char *width_left, *width_right;
7794   bool pad_zeros, right_justify, parse_long_name, found_valid_name;
7795   static const int FORMAT_SIZE = 20;
7796   char format[FORMAT_SIZE] = {0};
7797   char absolute_short_name = 0;
7798 
7799   KMP_DEBUG_ASSERT(gtid >= 0);
7800   KMP_DEBUG_ASSERT(th);
7801   KMP_DEBUG_ASSERT(**ptr == '%');
7802   KMP_DEBUG_ASSERT(field_buffer);
7803 
7804   __kmp_str_buf_clear(field_buffer);
7805 
7806   // Skip the initial %
7807   (*ptr)++;
7808 
7809   // Check for %% first
7810   if (**ptr == '%') {
7811     __kmp_str_buf_cat(field_buffer, "%", 1);
7812     (*ptr)++; // skip over the second %
7813     return 1;
7814   }
7815 
7816   // Parse field modifiers if they are present
7817   pad_zeros = false;
7818   if (**ptr == '0') {
7819     pad_zeros = true;
7820     (*ptr)++; // skip over 0
7821   }
7822   right_justify = false;
7823   if (**ptr == '.') {
7824     right_justify = true;
7825     (*ptr)++; // skip over .
7826   }
7827   // Parse width of field: [width_left, width_right)
7828   width_left = width_right = NULL;
7829   if (**ptr >= '0' && **ptr <= '9') {
7830     width_left = *ptr;
7831     SKIP_DIGITS(*ptr);
7832     width_right = *ptr;
7833   }
7834 
7835   // Create the format for KMP_SNPRINTF based on flags parsed above
7836   format_index = 0;
7837   format[format_index++] = '%';
7838   if (!right_justify)
7839     format[format_index++] = '-';
7840   if (pad_zeros)
7841     format[format_index++] = '0';
7842   if (width_left && width_right) {
7843     int i = 0;
7844     // Only allow 8 digit number widths.
7845     // This also prevents overflowing format variable
7846     while (i < 8 && width_left < width_right) {
7847       format[format_index++] = *width_left;
7848       width_left++;
7849       i++;
7850     }
7851   }
7852 
7853   // Parse a name (long or short)
7854   // Canonicalize the name into absolute_short_name
7855   found_valid_name = false;
7856   parse_long_name = (**ptr == '{');
7857   if (parse_long_name)
7858     (*ptr)++; // skip initial left brace
7859   for (size_t i = 0; i < sizeof(__kmp_affinity_format_table) /
7860                              sizeof(__kmp_affinity_format_table[0]);
7861        ++i) {
7862     char short_name = __kmp_affinity_format_table[i].short_name;
7863     const char *long_name = __kmp_affinity_format_table[i].long_name;
7864     char field_format = __kmp_affinity_format_table[i].field_format;
7865     if (parse_long_name) {
7866       int length = KMP_STRLEN(long_name);
7867       if (strncmp(*ptr, long_name, length) == 0) {
7868         found_valid_name = true;
7869         (*ptr) += length; // skip the long name
7870       }
7871     } else if (**ptr == short_name) {
7872       found_valid_name = true;
7873       (*ptr)++; // skip the short name
7874     }
7875     if (found_valid_name) {
7876       format[format_index++] = field_format;
7877       format[format_index++] = '\0';
7878       absolute_short_name = short_name;
7879       break;
7880     }
7881   }
7882   if (parse_long_name) {
7883     if (**ptr != '}') {
7884       absolute_short_name = 0;
7885     } else {
7886       (*ptr)++; // skip over the right brace
7887     }
7888   }
7889 
7890   // Attempt to fill the buffer with the requested
7891   // value using snprintf within __kmp_str_buf_print()
7892   switch (absolute_short_name) {
7893   case 't':
7894     rc = __kmp_str_buf_print(field_buffer, format, __kmp_aux_get_team_num());
7895     break;
7896   case 'T':
7897     rc = __kmp_str_buf_print(field_buffer, format, __kmp_aux_get_num_teams());
7898     break;
7899   case 'L':
7900     rc = __kmp_str_buf_print(field_buffer, format, th->th.th_team->t.t_level);
7901     break;
7902   case 'n':
7903     rc = __kmp_str_buf_print(field_buffer, format, __kmp_tid_from_gtid(gtid));
7904     break;
7905   case 'H': {
7906     static const int BUFFER_SIZE = 256;
7907     char buf[BUFFER_SIZE];
7908     __kmp_expand_host_name(buf, BUFFER_SIZE);
7909     rc = __kmp_str_buf_print(field_buffer, format, buf);
7910   } break;
7911   case 'P':
7912     rc = __kmp_str_buf_print(field_buffer, format, getpid());
7913     break;
7914   case 'i':
7915     rc = __kmp_str_buf_print(field_buffer, format, __kmp_gettid());
7916     break;
7917   case 'N':
7918     rc = __kmp_str_buf_print(field_buffer, format, th->th.th_team->t.t_nproc);
7919     break;
7920   case 'a':
7921     field_value =
7922         __kmp_get_ancestor_thread_num(gtid, th->th.th_team->t.t_level - 1);
7923     rc = __kmp_str_buf_print(field_buffer, format, field_value);
7924     break;
7925 #if KMP_AFFINITY_SUPPORTED
7926   case 'A': {
7927     kmp_str_buf_t buf;
7928     __kmp_str_buf_init(&buf);
7929     __kmp_affinity_str_buf_mask(&buf, th->th.th_affin_mask);
7930     rc = __kmp_str_buf_print(field_buffer, format, buf.str);
7931     __kmp_str_buf_free(&buf);
7932   } break;
7933 #endif
7934   default:
7935     // According to spec, If an implementation does not have info for field
7936     // type, then "undefined" is printed
7937     rc = __kmp_str_buf_print(field_buffer, "%s", "undefined");
7938     // Skip the field
7939     if (parse_long_name) {
7940       SKIP_TOKEN(*ptr);
7941       if (**ptr == '}')
7942         (*ptr)++;
7943     } else {
7944       (*ptr)++;
7945     }
7946   }
7947 
7948   KMP_ASSERT(format_index <= FORMAT_SIZE);
7949   return rc;
7950 }
7951 
7952 /*
7953  * Return number of characters needed to hold the affinity string
7954  * (not including null byte character)
7955  * The resultant string is printed to buffer, which the caller can then
7956  * handle afterwards
7957 */
7958 size_t __kmp_aux_capture_affinity(int gtid, const char *format,
7959                                   kmp_str_buf_t *buffer) {
7960   const char *parse_ptr;
7961   size_t retval;
7962   const kmp_info_t *th;
7963   kmp_str_buf_t field;
7964 
7965   KMP_DEBUG_ASSERT(buffer);
7966   KMP_DEBUG_ASSERT(gtid >= 0);
7967 
7968   __kmp_str_buf_init(&field);
7969   __kmp_str_buf_clear(buffer);
7970 
7971   th = __kmp_threads[gtid];
7972   retval = 0;
7973 
7974   // If format is NULL or zero-length string, then we use
7975   // affinity-format-var ICV
7976   parse_ptr = format;
7977   if (parse_ptr == NULL || *parse_ptr == '\0') {
7978     parse_ptr = __kmp_affinity_format;
7979   }
7980   KMP_DEBUG_ASSERT(parse_ptr);
7981 
7982   while (*parse_ptr != '\0') {
7983     // Parse a field
7984     if (*parse_ptr == '%') {
7985       // Put field in the buffer
7986       int rc = __kmp_aux_capture_affinity_field(gtid, th, &parse_ptr, &field);
7987       __kmp_str_buf_catbuf(buffer, &field);
7988       retval += rc;
7989     } else {
7990       // Put literal character in buffer
7991       __kmp_str_buf_cat(buffer, parse_ptr, 1);
7992       retval++;
7993       parse_ptr++;
7994     }
7995   }
7996   __kmp_str_buf_free(&field);
7997   return retval;
7998 }
7999 
8000 // Displays the affinity string to stdout
8001 void __kmp_aux_display_affinity(int gtid, const char *format) {
8002   kmp_str_buf_t buf;
8003   __kmp_str_buf_init(&buf);
8004   __kmp_aux_capture_affinity(gtid, format, &buf);
8005   __kmp_fprintf(kmp_out, "%s" KMP_END_OF_LINE, buf.str);
8006   __kmp_str_buf_free(&buf);
8007 }
8008 
8009 /* ------------------------------------------------------------------------ */
8010 
8011 void __kmp_aux_set_blocktime(int arg, kmp_info_t *thread, int tid) {
8012   int blocktime = arg; /* argument is in milliseconds */
8013 #if KMP_USE_MONITOR
8014   int bt_intervals;
8015 #endif
8016   int bt_set;
8017 
8018   __kmp_save_internal_controls(thread);
8019 
8020   /* Normalize and set blocktime for the teams */
8021   if (blocktime < KMP_MIN_BLOCKTIME)
8022     blocktime = KMP_MIN_BLOCKTIME;
8023   else if (blocktime > KMP_MAX_BLOCKTIME)
8024     blocktime = KMP_MAX_BLOCKTIME;
8025 
8026   set__blocktime_team(thread->th.th_team, tid, blocktime);
8027   set__blocktime_team(thread->th.th_serial_team, 0, blocktime);
8028 
8029 #if KMP_USE_MONITOR
8030   /* Calculate and set blocktime intervals for the teams */
8031   bt_intervals = KMP_INTERVALS_FROM_BLOCKTIME(blocktime, __kmp_monitor_wakeups);
8032 
8033   set__bt_intervals_team(thread->th.th_team, tid, bt_intervals);
8034   set__bt_intervals_team(thread->th.th_serial_team, 0, bt_intervals);
8035 #endif
8036 
8037   /* Set whether blocktime has been set to "TRUE" */
8038   bt_set = TRUE;
8039 
8040   set__bt_set_team(thread->th.th_team, tid, bt_set);
8041   set__bt_set_team(thread->th.th_serial_team, 0, bt_set);
8042 #if KMP_USE_MONITOR
8043   KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d, "
8044                 "bt_intervals=%d, monitor_updates=%d\n",
8045                 __kmp_gtid_from_tid(tid, thread->th.th_team),
8046                 thread->th.th_team->t.t_id, tid, blocktime, bt_intervals,
8047                 __kmp_monitor_wakeups));
8048 #else
8049   KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d\n",
8050                 __kmp_gtid_from_tid(tid, thread->th.th_team),
8051                 thread->th.th_team->t.t_id, tid, blocktime));
8052 #endif
8053 }
8054 
8055 void __kmp_aux_set_defaults(char const *str, int len) {
8056   if (!__kmp_init_serial) {
8057     __kmp_serial_initialize();
8058   }
8059   __kmp_env_initialize(str);
8060 
8061   if (__kmp_settings || __kmp_display_env || __kmp_display_env_verbose) {
8062     __kmp_env_print();
8063   }
8064 } // __kmp_aux_set_defaults
8065 
8066 /* ------------------------------------------------------------------------ */
8067 /* internal fast reduction routines */
8068 
8069 PACKED_REDUCTION_METHOD_T
8070 __kmp_determine_reduction_method(
8071     ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars, size_t reduce_size,
8072     void *reduce_data, void (*reduce_func)(void *lhs_data, void *rhs_data),
8073     kmp_critical_name *lck) {
8074 
8075   // Default reduction method: critical construct ( lck != NULL, like in current
8076   // PAROPT )
8077   // If ( reduce_data!=NULL && reduce_func!=NULL ): the tree-reduction method
8078   // can be selected by RTL
8079   // If loc->flags contains KMP_IDENT_ATOMIC_REDUCE, the atomic reduce method
8080   // can be selected by RTL
8081   // Finally, it's up to OpenMP RTL to make a decision on which method to select
8082   // among generated by PAROPT.
8083 
8084   PACKED_REDUCTION_METHOD_T retval;
8085 
8086   int team_size;
8087 
8088   KMP_DEBUG_ASSERT(loc); // it would be nice to test ( loc != 0 )
8089   KMP_DEBUG_ASSERT(lck); // it would be nice to test ( lck != 0 )
8090 
8091 #define FAST_REDUCTION_ATOMIC_METHOD_GENERATED                                 \
8092   ((loc->flags & (KMP_IDENT_ATOMIC_REDUCE)) == (KMP_IDENT_ATOMIC_REDUCE))
8093 #define FAST_REDUCTION_TREE_METHOD_GENERATED ((reduce_data) && (reduce_func))
8094 
8095   retval = critical_reduce_block;
8096 
8097   // another choice of getting a team size (with 1 dynamic deference) is slower
8098   team_size = __kmp_get_team_num_threads(global_tid);
8099   if (team_size == 1) {
8100 
8101     retval = empty_reduce_block;
8102 
8103   } else {
8104 
8105     int atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;
8106 
8107 #if KMP_ARCH_X86_64 || KMP_ARCH_PPC64 || KMP_ARCH_AARCH64 ||                   \
8108     KMP_ARCH_MIPS64 || KMP_ARCH_RISCV64
8109 
8110 #if KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD ||     \
8111     KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN || KMP_OS_HURD
8112 
8113     int teamsize_cutoff = 4;
8114 
8115 #if KMP_MIC_SUPPORTED
8116     if (__kmp_mic_type != non_mic) {
8117       teamsize_cutoff = 8;
8118     }
8119 #endif
8120     int tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
8121     if (tree_available) {
8122       if (team_size <= teamsize_cutoff) {
8123         if (atomic_available) {
8124           retval = atomic_reduce_block;
8125         }
8126       } else {
8127         retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER;
8128       }
8129     } else if (atomic_available) {
8130       retval = atomic_reduce_block;
8131     }
8132 #else
8133 #error "Unknown or unsupported OS"
8134 #endif // KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD ||
8135        // KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN || KMP_OS_HURD
8136 
8137 #elif KMP_ARCH_X86 || KMP_ARCH_ARM || KMP_ARCH_AARCH || KMP_ARCH_MIPS
8138 
8139 #if KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_WINDOWS || KMP_OS_HURD
8140 
8141     // basic tuning
8142 
8143     if (atomic_available) {
8144       if (num_vars <= 2) { // && ( team_size <= 8 ) due to false-sharing ???
8145         retval = atomic_reduce_block;
8146       }
8147     } // otherwise: use critical section
8148 
8149 #elif KMP_OS_DARWIN
8150 
8151     int tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
8152     if (atomic_available && (num_vars <= 3)) {
8153       retval = atomic_reduce_block;
8154     } else if (tree_available) {
8155       if ((reduce_size > (9 * sizeof(kmp_real64))) &&
8156           (reduce_size < (2000 * sizeof(kmp_real64)))) {
8157         retval = TREE_REDUCE_BLOCK_WITH_PLAIN_BARRIER;
8158       }
8159     } // otherwise: use critical section
8160 
8161 #else
8162 #error "Unknown or unsupported OS"
8163 #endif
8164 
8165 #else
8166 #error "Unknown or unsupported architecture"
8167 #endif
8168   }
8169 
8170   // KMP_FORCE_REDUCTION
8171 
8172   // If the team is serialized (team_size == 1), ignore the forced reduction
8173   // method and stay with the unsynchronized method (empty_reduce_block)
8174   if (__kmp_force_reduction_method != reduction_method_not_defined &&
8175       team_size != 1) {
8176 
8177     PACKED_REDUCTION_METHOD_T forced_retval = critical_reduce_block;
8178 
8179     int atomic_available, tree_available;
8180 
8181     switch ((forced_retval = __kmp_force_reduction_method)) {
8182     case critical_reduce_block:
8183       KMP_ASSERT(lck); // lck should be != 0
8184       break;
8185 
8186     case atomic_reduce_block:
8187       atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;
8188       if (!atomic_available) {
8189         KMP_WARNING(RedMethodNotSupported, "atomic");
8190         forced_retval = critical_reduce_block;
8191       }
8192       break;
8193 
8194     case tree_reduce_block:
8195       tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
8196       if (!tree_available) {
8197         KMP_WARNING(RedMethodNotSupported, "tree");
8198         forced_retval = critical_reduce_block;
8199       } else {
8200 #if KMP_FAST_REDUCTION_BARRIER
8201         forced_retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER;
8202 #endif
8203       }
8204       break;
8205 
8206     default:
8207       KMP_ASSERT(0); // "unsupported method specified"
8208     }
8209 
8210     retval = forced_retval;
8211   }
8212 
8213   KA_TRACE(10, ("reduction method selected=%08x\n", retval));
8214 
8215 #undef FAST_REDUCTION_TREE_METHOD_GENERATED
8216 #undef FAST_REDUCTION_ATOMIC_METHOD_GENERATED
8217 
8218   return (retval);
8219 }
8220 
8221 // this function is for testing set/get/determine reduce method
8222 kmp_int32 __kmp_get_reduce_method(void) {
8223   return ((__kmp_entry_thread()->th.th_local.packed_reduction_method) >> 8);
8224 }
8225 
8226 // Soft pause sets up threads to ignore blocktime and just go to sleep.
8227 // Spin-wait code checks __kmp_pause_status and reacts accordingly.
8228 void __kmp_soft_pause() { __kmp_pause_status = kmp_soft_paused; }
8229 
8230 // Hard pause shuts down the runtime completely.  Resume happens naturally when
8231 // OpenMP is used subsequently.
8232 void __kmp_hard_pause() {
8233   __kmp_pause_status = kmp_hard_paused;
8234   __kmp_internal_end_thread(-1);
8235 }
8236 
8237 // Soft resume sets __kmp_pause_status, and wakes up all threads.
8238 void __kmp_resume_if_soft_paused() {
8239   if (__kmp_pause_status == kmp_soft_paused) {
8240     __kmp_pause_status = kmp_not_paused;
8241 
8242     for (int gtid = 1; gtid < __kmp_threads_capacity; ++gtid) {
8243       kmp_info_t *thread = __kmp_threads[gtid];
8244       if (thread) { // Wake it if sleeping
8245         kmp_flag_64 fl(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go, thread);
8246         if (fl.is_sleeping())
8247           fl.resume(gtid);
8248         else if (__kmp_try_suspend_mx(thread)) { // got suspend lock
8249           __kmp_unlock_suspend_mx(thread); // unlock it; it won't sleep
8250         } else { // thread holds the lock and may sleep soon
8251           do { // until either the thread sleeps, or we can get the lock
8252             if (fl.is_sleeping()) {
8253               fl.resume(gtid);
8254               break;
8255             } else if (__kmp_try_suspend_mx(thread)) {
8256               __kmp_unlock_suspend_mx(thread);
8257               break;
8258             }
8259           } while (1);
8260         }
8261       }
8262     }
8263   }
8264 }
8265 
8266 // This function is called via __kmpc_pause_resource. Returns 0 if successful.
8267 // TODO: add warning messages
8268 int __kmp_pause_resource(kmp_pause_status_t level) {
8269   if (level == kmp_not_paused) { // requesting resume
8270     if (__kmp_pause_status == kmp_not_paused) {
8271       // error message about runtime not being paused, so can't resume
8272       return 1;
8273     } else {
8274       KMP_DEBUG_ASSERT(__kmp_pause_status == kmp_soft_paused ||
8275                        __kmp_pause_status == kmp_hard_paused);
8276       __kmp_pause_status = kmp_not_paused;
8277       return 0;
8278     }
8279   } else if (level == kmp_soft_paused) { // requesting soft pause
8280     if (__kmp_pause_status != kmp_not_paused) {
8281       // error message about already being paused
8282       return 1;
8283     } else {
8284       __kmp_soft_pause();
8285       return 0;
8286     }
8287   } else if (level == kmp_hard_paused) { // requesting hard pause
8288     if (__kmp_pause_status != kmp_not_paused) {
8289       // error message about already being paused
8290       return 1;
8291     } else {
8292       __kmp_hard_pause();
8293       return 0;
8294     }
8295   } else {
8296     // error message about invalid level
8297     return 1;
8298   }
8299 }
8300