xref: /freebsd/contrib/llvm-project/openmp/runtime/src/kmp_runtime.cpp (revision 4b50c451720d8b427757a6da1dd2bb4c52cd9e35)
1 /*
2  * kmp_runtime.cpp -- KPTS runtime support library
3  */
4 
5 //===----------------------------------------------------------------------===//
6 //
7 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8 // See https://llvm.org/LICENSE.txt for license information.
9 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "kmp.h"
14 #include "kmp_affinity.h"
15 #include "kmp_atomic.h"
16 #include "kmp_environment.h"
17 #include "kmp_error.h"
18 #include "kmp_i18n.h"
19 #include "kmp_io.h"
20 #include "kmp_itt.h"
21 #include "kmp_settings.h"
22 #include "kmp_stats.h"
23 #include "kmp_str.h"
24 #include "kmp_wait_release.h"
25 #include "kmp_wrapper_getpid.h"
26 #include "kmp_dispatch.h"
27 #if KMP_USE_HIER_SCHED
28 #include "kmp_dispatch_hier.h"
29 #endif
30 
31 #if OMPT_SUPPORT
32 #include "ompt-specific.h"
33 #endif
34 
35 /* these are temporary issues to be dealt with */
36 #define KMP_USE_PRCTL 0
37 
38 #if KMP_OS_WINDOWS
39 #include <process.h>
40 #endif
41 
42 #include "tsan_annotations.h"
43 
/* Version/feature identification strings. Each one is KMP_VERSION_PREFIX
   followed by a human-readable description. */

/* Only defined when the library is built with KMP_GOMP_COMPAT. */
#if defined(KMP_GOMP_COMPAT)
char const __kmp_version_alt_comp[] =
    KMP_VERSION_PREFIX "alternative compiler support: yes";
#endif /* defined(KMP_GOMP_COMPAT) */

/* OpenMP API version string reported by this runtime. */
char const __kmp_version_omp_api[] =
    KMP_VERSION_PREFIX "API version: 5.0 (201611)";

/* Only defined in KMP_DEBUG builds. */
#ifdef KMP_DEBUG
char const __kmp_version_lock[] =
    KMP_VERSION_PREFIX "lock type: run time selectable";
#endif /* KMP_DEBUG */

/* Smaller of x and y. The selected argument is evaluated twice, so do not
   pass expressions with side effects. */
#define KMP_MIN(x, y) ((x) < (y) ? (x) : (y))
58 
59 /* ------------------------------------------------------------------------ */
60 
61 #if KMP_USE_MONITOR
62 kmp_info_t __kmp_monitor;
63 #endif
64 
65 /* Forward declarations */
66 
67 void __kmp_cleanup(void);
68 
69 static void __kmp_initialize_info(kmp_info_t *, kmp_team_t *, int tid,
70                                   int gtid);
71 static void __kmp_initialize_team(kmp_team_t *team, int new_nproc,
72                                   kmp_internal_control_t *new_icvs,
73                                   ident_t *loc);
74 #if KMP_AFFINITY_SUPPORTED
75 static void __kmp_partition_places(kmp_team_t *team,
76                                    int update_master_only = 0);
77 #endif
78 static void __kmp_do_serial_initialize(void);
79 void __kmp_fork_barrier(int gtid, int tid);
80 void __kmp_join_barrier(int gtid);
81 void __kmp_setup_icv_copy(kmp_team_t *team, int new_nproc,
82                           kmp_internal_control_t *new_icvs, ident_t *loc);
83 
84 #ifdef USE_LOAD_BALANCE
85 static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc);
86 #endif
87 
88 static int __kmp_expand_threads(int nNeed);
89 #if KMP_OS_WINDOWS
90 static int __kmp_unregister_root_other_thread(int gtid);
91 #endif
92 static void __kmp_unregister_library(void); // called by __kmp_internal_end()
93 static void __kmp_reap_thread(kmp_info_t *thread, int is_root);
94 kmp_info_t *__kmp_thread_pool_insert_pt = NULL;
95 
/* Calculate the identifier of the current thread */
/* fast (and somewhat portable) way to get unique identifier of executing
   thread. Returns KMP_GTID_DNE if we haven't been assigned a gtid. */
/* Lookup order:
     1. If __kmp_init_gtid is not set, return KMP_GTID_DNE immediately.
     2. With KMP_TDATA_GTID and __kmp_gtid_mode >= 3, return __kmp_gtid.
     3. With __kmp_gtid_mode >= 2, return __kmp_gtid_get_specific().
     4. Otherwise scan __kmp_threads[] for an entry whose recorded stack
        window contains the address of a local variable of this call.
     5. If the scan fails, fall back to __kmp_gtid_get_specific(); a negative
        result is returned unchanged, otherwise the recorded stack window of
        that thread is refined to include the current stack address. */
int __kmp_get_global_thread_id() {
  int i;
  kmp_info_t **other_threads;
  size_t stack_data; /* only its address is used, as a probe into our stack */
  char *stack_addr;
  size_t stack_size;
  char *stack_base;

  KA_TRACE(
      1000,
      ("*** __kmp_get_global_thread_id: entering, nproc=%d  all_nproc=%d\n",
       __kmp_nth, __kmp_all_nth));

  /* JPH - to handle the case where __kmpc_end(0) is called immediately prior to
     a parallel region, made it return KMP_GTID_DNE to force serial_initialize
     by caller. Had to handle KMP_GTID_DNE at all call-sites, or else guarantee
     __kmp_init_gtid for this to work. */

  if (!TCR_4(__kmp_init_gtid))
    return KMP_GTID_DNE;

#ifdef KMP_TDATA_GTID
  if (TCR_4(__kmp_gtid_mode) >= 3) {
    KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using TDATA\n"));
    return __kmp_gtid;
  }
#endif
  if (TCR_4(__kmp_gtid_mode) >= 2) {
    KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using keyed TLS\n"));
    return __kmp_gtid_get_specific();
  }
  KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using internal alg.\n"));

  /* The address of a local variable identifies the stack we are running on. */
  stack_addr = (char *)&stack_data;
  other_threads = __kmp_threads;

  /* ATT: The code below is a source of potential bugs due to unsynchronized
     access to __kmp_threads array. For example:
     1. Current thread loads other_threads[i] to thr and checks it, it is
        non-NULL.
     2. Current thread is suspended by OS.
     3. Another thread unregisters and finishes (debug versions of free()
        may fill memory with something like 0xEF).
     4. Current thread is resumed.
     5. Current thread reads junk from *thr.
     TODO: Fix it.  --ln  */

  for (i = 0; i < __kmp_threads_capacity; i++) {

    kmp_info_t *thr = (kmp_info_t *)TCR_SYNC_PTR(other_threads[i]);
    if (!thr)
      continue; /* empty slot */

    stack_size = (size_t)TCR_PTR(thr->th.th_info.ds.ds_stacksize);
    stack_base = (char *)TCR_PTR(thr->th.th_info.ds.ds_stackbase);

    /* stack grows down -- search through all of the active threads */

    if (stack_addr <= stack_base) {
      size_t stack_diff = stack_base - stack_addr;

      if (stack_diff <= stack_size) {
        /* The only way we can be closer than the allocated */
        /* stack size is if we are running on this thread. */
        KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() == i);
        return i;
      }
    }
  }

  /* get specific to try and determine our gtid */
  KA_TRACE(1000,
           ("*** __kmp_get_global_thread_id: internal alg. failed to find "
            "thread, using TLS\n"));
  i = __kmp_gtid_get_specific();

  /*fprintf( stderr, "=== %d\n", i );  */ /* GROO */

  /* if we haven't been assigned a gtid, then return code */
  if (i < 0)
    return i;

  /* dynamically updated stack window for uber threads to avoid get_specific
     call */
  /* A thread whose recorded window is not allowed to grow (ds_stackgrow is
     zero) but whose current address lies outside that window is reported as a
     stack overflow. */
  if (!TCR_4(other_threads[i]->th.th_info.ds.ds_stackgrow)) {
    KMP_FATAL(StackOverflow, i);
  }

  stack_base = (char *)other_threads[i]->th.th_info.ds.ds_stackbase;
  if (stack_addr > stack_base) {
    /* Current address is above the recorded base: raise the base to it and
       enlarge the size by the same distance. */
    TCW_PTR(other_threads[i]->th.th_info.ds.ds_stackbase, stack_addr);
    TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize,
            other_threads[i]->th.th_info.ds.ds_stacksize + stack_addr -
                stack_base);
  } else {
    /* Current address is at or below the recorded base: the size becomes the
       distance from the base down to the current address. */
    TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize,
            stack_base - stack_addr);
  }

  /* Reprint stack bounds for ubermaster since they have been refined */
  if (__kmp_storage_map) {
    char *stack_end = (char *)other_threads[i]->th.th_info.ds.ds_stackbase;
    char *stack_beg = stack_end - other_threads[i]->th.th_info.ds.ds_stacksize;
    __kmp_print_storage_map_gtid(i, stack_beg, stack_end,
                                 other_threads[i]->th.th_info.ds.ds_stacksize,
                                 "th_%d stack (refinement)", i);
  }
  return i;
}
208 
209 int __kmp_get_global_thread_id_reg() {
210   int gtid;
211 
212   if (!__kmp_init_serial) {
213     gtid = KMP_GTID_DNE;
214   } else
215 #ifdef KMP_TDATA_GTID
216       if (TCR_4(__kmp_gtid_mode) >= 3) {
217     KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using TDATA\n"));
218     gtid = __kmp_gtid;
219   } else
220 #endif
221       if (TCR_4(__kmp_gtid_mode) >= 2) {
222     KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using keyed TLS\n"));
223     gtid = __kmp_gtid_get_specific();
224   } else {
225     KA_TRACE(1000,
226              ("*** __kmp_get_global_thread_id_reg: using internal alg.\n"));
227     gtid = __kmp_get_global_thread_id();
228   }
229 
230   /* we must be a new uber master sibling thread */
231   if (gtid == KMP_GTID_DNE) {
232     KA_TRACE(10,
233              ("__kmp_get_global_thread_id_reg: Encountered new root thread. "
234               "Registering a new gtid.\n"));
235     __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
236     if (!__kmp_init_serial) {
237       __kmp_do_serial_initialize();
238       gtid = __kmp_gtid_get_specific();
239     } else {
240       gtid = __kmp_register_root(FALSE);
241     }
242     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
243     /*__kmp_printf( "+++ %d\n", gtid ); */ /* GROO */
244   }
245 
246   KMP_DEBUG_ASSERT(gtid >= 0);
247 
248   return gtid;
249 }
250 
251 /* caller must hold forkjoin_lock */
252 void __kmp_check_stack_overlap(kmp_info_t *th) {
253   int f;
254   char *stack_beg = NULL;
255   char *stack_end = NULL;
256   int gtid;
257 
258   KA_TRACE(10, ("__kmp_check_stack_overlap: called\n"));
259   if (__kmp_storage_map) {
260     stack_end = (char *)th->th.th_info.ds.ds_stackbase;
261     stack_beg = stack_end - th->th.th_info.ds.ds_stacksize;
262 
263     gtid = __kmp_gtid_from_thread(th);
264 
265     if (gtid == KMP_GTID_MONITOR) {
266       __kmp_print_storage_map_gtid(
267           gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize,
268           "th_%s stack (%s)", "mon",
269           (th->th.th_info.ds.ds_stackgrow) ? "initial" : "actual");
270     } else {
271       __kmp_print_storage_map_gtid(
272           gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize,
273           "th_%d stack (%s)", gtid,
274           (th->th.th_info.ds.ds_stackgrow) ? "initial" : "actual");
275     }
276   }
277 
278   /* No point in checking ubermaster threads since they use refinement and
279    * cannot overlap */
280   gtid = __kmp_gtid_from_thread(th);
281   if (__kmp_env_checks == TRUE && !KMP_UBER_GTID(gtid)) {
282     KA_TRACE(10,
283              ("__kmp_check_stack_overlap: performing extensive checking\n"));
284     if (stack_beg == NULL) {
285       stack_end = (char *)th->th.th_info.ds.ds_stackbase;
286       stack_beg = stack_end - th->th.th_info.ds.ds_stacksize;
287     }
288 
289     for (f = 0; f < __kmp_threads_capacity; f++) {
290       kmp_info_t *f_th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[f]);
291 
292       if (f_th && f_th != th) {
293         char *other_stack_end =
294             (char *)TCR_PTR(f_th->th.th_info.ds.ds_stackbase);
295         char *other_stack_beg =
296             other_stack_end - (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize);
297         if ((stack_beg > other_stack_beg && stack_beg < other_stack_end) ||
298             (stack_end > other_stack_beg && stack_end < other_stack_end)) {
299 
300           /* Print the other stack values before the abort */
301           if (__kmp_storage_map)
302             __kmp_print_storage_map_gtid(
303                 -1, other_stack_beg, other_stack_end,
304                 (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize),
305                 "th_%d stack (overlapped)", __kmp_gtid_from_thread(f_th));
306 
307           __kmp_fatal(KMP_MSG(StackOverlap), KMP_HNT(ChangeStackLimit),
308                       __kmp_msg_null);
309         }
310       }
311     }
312   }
313   KA_TRACE(10, ("__kmp_check_stack_overlap: returning\n"));
314 }
315 
316 /* ------------------------------------------------------------------------ */
317 
318 void __kmp_infinite_loop(void) {
319   static int done = FALSE;
320 
321   while (!done) {
322     KMP_YIELD(TRUE);
323   }
324 }
325 
326 #define MAX_MESSAGE 512
327 
328 void __kmp_print_storage_map_gtid(int gtid, void *p1, void *p2, size_t size,
329                                   char const *format, ...) {
330   char buffer[MAX_MESSAGE];
331   va_list ap;
332 
333   va_start(ap, format);
334   KMP_SNPRINTF(buffer, sizeof(buffer), "OMP storage map: %p %p%8lu %s\n", p1,
335                p2, (unsigned long)size, format);
336   __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock);
337   __kmp_vprintf(kmp_err, buffer, ap);
338 #if KMP_PRINT_DATA_PLACEMENT
339   int node;
340   if (gtid >= 0) {
341     if (p1 <= p2 && (char *)p2 - (char *)p1 == size) {
342       if (__kmp_storage_map_verbose) {
343         node = __kmp_get_host_node(p1);
344         if (node < 0) /* doesn't work, so don't try this next time */
345           __kmp_storage_map_verbose = FALSE;
346         else {
347           char *last;
348           int lastNode;
349           int localProc = __kmp_get_cpu_from_gtid(gtid);
350 
351           const int page_size = KMP_GET_PAGE_SIZE();
352 
353           p1 = (void *)((size_t)p1 & ~((size_t)page_size - 1));
354           p2 = (void *)(((size_t)p2 - 1) & ~((size_t)page_size - 1));
355           if (localProc >= 0)
356             __kmp_printf_no_lock("  GTID %d localNode %d\n", gtid,
357                                  localProc >> 1);
358           else
359             __kmp_printf_no_lock("  GTID %d\n", gtid);
360 #if KMP_USE_PRCTL
361           /* The more elaborate format is disabled for now because of the prctl
362            * hanging bug. */
363           do {
364             last = p1;
365             lastNode = node;
366             /* This loop collates adjacent pages with the same host node. */
367             do {
368               (char *)p1 += page_size;
369             } while (p1 <= p2 && (node = __kmp_get_host_node(p1)) == lastNode);
370             __kmp_printf_no_lock("    %p-%p memNode %d\n", last, (char *)p1 - 1,
371                                  lastNode);
372           } while (p1 <= p2);
373 #else
374           __kmp_printf_no_lock("    %p-%p memNode %d\n", p1,
375                                (char *)p1 + (page_size - 1),
376                                __kmp_get_host_node(p1));
377           if (p1 < p2) {
378             __kmp_printf_no_lock("    %p-%p memNode %d\n", p2,
379                                  (char *)p2 + (page_size - 1),
380                                  __kmp_get_host_node(p2));
381           }
382 #endif
383         }
384       }
385     } else
386       __kmp_printf_no_lock("  %s\n", KMP_I18N_STR(StorageMapWarning));
387   }
388 #endif /* KMP_PRINT_DATA_PLACEMENT */
389   __kmp_release_bootstrap_lock(&__kmp_stdio_lock);
390 }
391 
392 void __kmp_warn(char const *format, ...) {
393   char buffer[MAX_MESSAGE];
394   va_list ap;
395 
396   if (__kmp_generate_warnings == kmp_warnings_off) {
397     return;
398   }
399 
400   va_start(ap, format);
401 
402   KMP_SNPRINTF(buffer, sizeof(buffer), "OMP warning: %s\n", format);
403   __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock);
404   __kmp_vprintf(kmp_err, buffer, ap);
405   __kmp_release_bootstrap_lock(&__kmp_stdio_lock);
406 
407   va_end(ap);
408 }
409 
410 void __kmp_abort_process() {
411   // Later threads may stall here, but that's ok because abort() will kill them.
412   __kmp_acquire_bootstrap_lock(&__kmp_exit_lock);
413 
414   if (__kmp_debug_buf) {
415     __kmp_dump_debug_buffer();
416   }
417 
418   if (KMP_OS_WINDOWS) {
419     // Let other threads know of abnormal termination and prevent deadlock
420     // if abort happened during library initialization or shutdown
421     __kmp_global.g.g_abort = SIGABRT;
422 
423     /* On Windows* OS by default abort() causes pop-up error box, which stalls
424        nightly testing. Unfortunately, we cannot reliably suppress pop-up error
425        boxes. _set_abort_behavior() works well, but this function is not
426        available in VS7 (this is not problem for DLL, but it is a problem for
427        static OpenMP RTL). SetErrorMode (and so, timelimit utility) does not
428        help, at least in some versions of MS C RTL.
429 
430        It seems following sequence is the only way to simulate abort() and
431        avoid pop-up error box. */
432     raise(SIGABRT);
433     _exit(3); // Just in case, if signal ignored, exit anyway.
434   } else {
435     abort();
436   }
437 
438   __kmp_infinite_loop();
439   __kmp_release_bootstrap_lock(&__kmp_exit_lock);
440 
441 } // __kmp_abort_process
442 
/* Park the calling thread: this simply enters __kmp_infinite_loop(). */
void __kmp_abort_thread(void) {
  // TODO: Eliminate g_abort global variable and this function.
  // In case of abort just call abort(), it will kill all the threads.
  __kmp_infinite_loop();
} // __kmp_abort_thread
448 
449 /* Print out the storage map for the major kmp_info_t thread data structures
450    that are allocated together. */
451 
452 static void __kmp_print_thread_storage_map(kmp_info_t *thr, int gtid) {
453   __kmp_print_storage_map_gtid(gtid, thr, thr + 1, sizeof(kmp_info_t), "th_%d",
454                                gtid);
455 
456   __kmp_print_storage_map_gtid(gtid, &thr->th.th_info, &thr->th.th_team,
457                                sizeof(kmp_desc_t), "th_%d.th_info", gtid);
458 
459   __kmp_print_storage_map_gtid(gtid, &thr->th.th_local, &thr->th.th_pri_head,
460                                sizeof(kmp_local_t), "th_%d.th_local", gtid);
461 
462   __kmp_print_storage_map_gtid(
463       gtid, &thr->th.th_bar[0], &thr->th.th_bar[bs_last_barrier],
464       sizeof(kmp_balign_t) * bs_last_barrier, "th_%d.th_bar", gtid);
465 
466   __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_plain_barrier],
467                                &thr->th.th_bar[bs_plain_barrier + 1],
468                                sizeof(kmp_balign_t), "th_%d.th_bar[plain]",
469                                gtid);
470 
471   __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_forkjoin_barrier],
472                                &thr->th.th_bar[bs_forkjoin_barrier + 1],
473                                sizeof(kmp_balign_t), "th_%d.th_bar[forkjoin]",
474                                gtid);
475 
476 #if KMP_FAST_REDUCTION_BARRIER
477   __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_reduction_barrier],
478                                &thr->th.th_bar[bs_reduction_barrier + 1],
479                                sizeof(kmp_balign_t), "th_%d.th_bar[reduction]",
480                                gtid);
481 #endif // KMP_FAST_REDUCTION_BARRIER
482 }
483 
484 /* Print out the storage map for the major kmp_team_t team data structures
485    that are allocated together. */
486 
487 static void __kmp_print_team_storage_map(const char *header, kmp_team_t *team,
488                                          int team_id, int num_thr) {
489   int num_disp_buff = team->t.t_max_nproc > 1 ? __kmp_dispatch_num_buffers : 2;
490   __kmp_print_storage_map_gtid(-1, team, team + 1, sizeof(kmp_team_t), "%s_%d",
491                                header, team_id);
492 
493   __kmp_print_storage_map_gtid(-1, &team->t.t_bar[0],
494                                &team->t.t_bar[bs_last_barrier],
495                                sizeof(kmp_balign_team_t) * bs_last_barrier,
496                                "%s_%d.t_bar", header, team_id);
497 
498   __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_plain_barrier],
499                                &team->t.t_bar[bs_plain_barrier + 1],
500                                sizeof(kmp_balign_team_t), "%s_%d.t_bar[plain]",
501                                header, team_id);
502 
503   __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_forkjoin_barrier],
504                                &team->t.t_bar[bs_forkjoin_barrier + 1],
505                                sizeof(kmp_balign_team_t),
506                                "%s_%d.t_bar[forkjoin]", header, team_id);
507 
508 #if KMP_FAST_REDUCTION_BARRIER
509   __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_reduction_barrier],
510                                &team->t.t_bar[bs_reduction_barrier + 1],
511                                sizeof(kmp_balign_team_t),
512                                "%s_%d.t_bar[reduction]", header, team_id);
513 #endif // KMP_FAST_REDUCTION_BARRIER
514 
515   __kmp_print_storage_map_gtid(
516       -1, &team->t.t_dispatch[0], &team->t.t_dispatch[num_thr],
517       sizeof(kmp_disp_t) * num_thr, "%s_%d.t_dispatch", header, team_id);
518 
519   __kmp_print_storage_map_gtid(
520       -1, &team->t.t_threads[0], &team->t.t_threads[num_thr],
521       sizeof(kmp_info_t *) * num_thr, "%s_%d.t_threads", header, team_id);
522 
523   __kmp_print_storage_map_gtid(-1, &team->t.t_disp_buffer[0],
524                                &team->t.t_disp_buffer[num_disp_buff],
525                                sizeof(dispatch_shared_info_t) * num_disp_buff,
526                                "%s_%d.t_disp_buffer", header, team_id);
527 }
528 
/* Allocator start-up hook: forwards to __kmp_init_memkind(). */
static void __kmp_init_allocator() { __kmp_init_memkind(); }
/* Allocator shutdown hook: forwards to __kmp_fini_memkind(). */
static void __kmp_fini_allocator() { __kmp_fini_memkind(); }
531 
532 /* ------------------------------------------------------------------------ */
533 
534 #if KMP_DYNAMIC_LIB
535 #if KMP_OS_WINDOWS
536 
/* Force a bootstrap lock into the released state by re-initializing it,
   regardless of who (if anyone) currently holds it. */
static void __kmp_reset_lock(kmp_bootstrap_lock_t *lck) {
  // TODO: Change to __kmp_break_bootstrap_lock().
  __kmp_init_bootstrap_lock(lck); // make the lock released
}
541 
542 static void __kmp_reset_locks_on_process_detach(int gtid_req) {
543   int i;
544   int thread_count;
545 
546   // PROCESS_DETACH is expected to be called by a thread that executes
547   // ProcessExit() or FreeLibrary(). OS terminates other threads (except the one
548   // calling ProcessExit or FreeLibrary). So, it might be safe to access the
549   // __kmp_threads[] without taking the forkjoin_lock. However, in fact, some
550   // threads can be still alive here, although being about to be terminated. The
551   // threads in the array with ds_thread==0 are most suspicious. Actually, it
552   // can be not safe to access the __kmp_threads[].
553 
554   // TODO: does it make sense to check __kmp_roots[] ?
555 
556   // Let's check that there are no other alive threads registered with the OMP
557   // lib.
558   while (1) {
559     thread_count = 0;
560     for (i = 0; i < __kmp_threads_capacity; ++i) {
561       if (!__kmp_threads)
562         continue;
563       kmp_info_t *th = __kmp_threads[i];
564       if (th == NULL)
565         continue;
566       int gtid = th->th.th_info.ds.ds_gtid;
567       if (gtid == gtid_req)
568         continue;
569       if (gtid < 0)
570         continue;
571       DWORD exit_val;
572       int alive = __kmp_is_thread_alive(th, &exit_val);
573       if (alive) {
574         ++thread_count;
575       }
576     }
577     if (thread_count == 0)
578       break; // success
579   }
580 
581   // Assume that I'm alone. Now it might be safe to check and reset locks.
582   // __kmp_forkjoin_lock and __kmp_stdio_lock are expected to be reset.
583   __kmp_reset_lock(&__kmp_forkjoin_lock);
584 #ifdef KMP_DEBUG
585   __kmp_reset_lock(&__kmp_stdio_lock);
586 #endif // KMP_DEBUG
587 }
588 
589 BOOL WINAPI DllMain(HINSTANCE hInstDLL, DWORD fdwReason, LPVOID lpReserved) {
590   //__kmp_acquire_bootstrap_lock( &__kmp_initz_lock );
591 
592   switch (fdwReason) {
593 
594   case DLL_PROCESS_ATTACH:
595     KA_TRACE(10, ("DllMain: PROCESS_ATTACH\n"));
596 
597     return TRUE;
598 
599   case DLL_PROCESS_DETACH:
600     KA_TRACE(10, ("DllMain: PROCESS_DETACH T#%d\n", __kmp_gtid_get_specific()));
601 
602     if (lpReserved != NULL) {
603       // lpReserved is used for telling the difference:
604       //   lpReserved == NULL when FreeLibrary() was called,
605       //   lpReserved != NULL when the process terminates.
606       // When FreeLibrary() is called, worker threads remain alive. So they will
607       // release the forkjoin lock by themselves. When the process terminates,
608       // worker threads disappear triggering the problem of unreleased forkjoin
609       // lock as described below.
610 
611       // A worker thread can take the forkjoin lock. The problem comes up if
612       // that worker thread becomes dead before it releases the forkjoin lock.
613       // The forkjoin lock remains taken, while the thread executing
614       // DllMain()->PROCESS_DETACH->__kmp_internal_end_library() below will try
615       // to take the forkjoin lock and will always fail, so that the application
616       // will never finish [normally]. This scenario is possible if
617       // __kmpc_end() has not been executed. It looks like it's not a corner
618       // case, but common cases:
619       // - the main function was compiled by an alternative compiler;
620       // - the main function was compiled by icl but without /Qopenmp
621       //   (application with plugins);
622       // - application terminates by calling C exit(), Fortran CALL EXIT() or
623       //   Fortran STOP.
624       // - alive foreign thread prevented __kmpc_end from doing cleanup.
625       //
626       // This is a hack to work around the problem.
627       // TODO: !!! figure out something better.
628       __kmp_reset_locks_on_process_detach(__kmp_gtid_get_specific());
629     }
630 
631     __kmp_internal_end_library(__kmp_gtid_get_specific());
632 
633     return TRUE;
634 
635   case DLL_THREAD_ATTACH:
636     KA_TRACE(10, ("DllMain: THREAD_ATTACH\n"));
637 
638     /* if we want to register new siblings all the time here call
639      * __kmp_get_gtid(); */
640     return TRUE;
641 
642   case DLL_THREAD_DETACH:
643     KA_TRACE(10, ("DllMain: THREAD_DETACH T#%d\n", __kmp_gtid_get_specific()));
644 
645     __kmp_internal_end_thread(__kmp_gtid_get_specific());
646     return TRUE;
647   }
648 
649   return TRUE;
650 }
651 
652 #endif /* KMP_OS_WINDOWS */
653 #endif /* KMP_DYNAMIC_LIB */
654 
655 /* __kmp_parallel_deo -- Wait until it's our turn. */
656 void __kmp_parallel_deo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
657   int gtid = *gtid_ref;
658 #ifdef BUILD_PARALLEL_ORDERED
659   kmp_team_t *team = __kmp_team_from_gtid(gtid);
660 #endif /* BUILD_PARALLEL_ORDERED */
661 
662   if (__kmp_env_consistency_check) {
663     if (__kmp_threads[gtid]->th.th_root->r.r_active)
664 #if KMP_USE_DYNAMIC_LOCK
665       __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL, 0);
666 #else
667       __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL);
668 #endif
669   }
670 #ifdef BUILD_PARALLEL_ORDERED
671   if (!team->t.t_serialized) {
672     KMP_MB();
673     KMP_WAIT(&team->t.t_ordered.dt.t_value, __kmp_tid_from_gtid(gtid), KMP_EQ,
674              NULL);
675     KMP_MB();
676   }
677 #endif /* BUILD_PARALLEL_ORDERED */
678 }
679 
680 /* __kmp_parallel_dxo -- Signal the next task. */
681 void __kmp_parallel_dxo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
682   int gtid = *gtid_ref;
683 #ifdef BUILD_PARALLEL_ORDERED
684   int tid = __kmp_tid_from_gtid(gtid);
685   kmp_team_t *team = __kmp_team_from_gtid(gtid);
686 #endif /* BUILD_PARALLEL_ORDERED */
687 
688   if (__kmp_env_consistency_check) {
689     if (__kmp_threads[gtid]->th.th_root->r.r_active)
690       __kmp_pop_sync(gtid, ct_ordered_in_parallel, loc_ref);
691   }
692 #ifdef BUILD_PARALLEL_ORDERED
693   if (!team->t.t_serialized) {
694     KMP_MB(); /* Flush all pending memory write invalidates.  */
695 
696     /* use the tid of the next thread in this team */
697     /* TODO replace with general release procedure */
698     team->t.t_ordered.dt.t_value = ((tid + 1) % team->t.t_nproc);
699 
700     KMP_MB(); /* Flush all pending memory write invalidates.  */
701   }
702 #endif /* BUILD_PARALLEL_ORDERED */
703 }
704 
705 /* ------------------------------------------------------------------------ */
706 /* The BARRIER for a SINGLE process section is always explicit   */
707 
708 int __kmp_enter_single(int gtid, ident_t *id_ref, int push_ws) {
709   int status;
710   kmp_info_t *th;
711   kmp_team_t *team;
712 
713   if (!TCR_4(__kmp_init_parallel))
714     __kmp_parallel_initialize();
715   __kmp_resume_if_soft_paused();
716 
717   th = __kmp_threads[gtid];
718   team = th->th.th_team;
719   status = 0;
720 
721   th->th.th_ident = id_ref;
722 
723   if (team->t.t_serialized) {
724     status = 1;
725   } else {
726     kmp_int32 old_this = th->th.th_local.this_construct;
727 
728     ++th->th.th_local.this_construct;
729     /* try to set team count to thread count--success means thread got the
730        single block */
731     /* TODO: Should this be acquire or release? */
732     if (team->t.t_construct == old_this) {
733       status = __kmp_atomic_compare_store_acq(&team->t.t_construct, old_this,
734                                               th->th.th_local.this_construct);
735     }
736 #if USE_ITT_BUILD
737     if (__itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 &&
738         KMP_MASTER_GTID(gtid) && th->th.th_teams_microtask == NULL &&
739         team->t.t_active_level ==
740             1) { // Only report metadata by master of active team at level 1
741       __kmp_itt_metadata_single(id_ref);
742     }
743 #endif /* USE_ITT_BUILD */
744   }
745 
746   if (__kmp_env_consistency_check) {
747     if (status && push_ws) {
748       __kmp_push_workshare(gtid, ct_psingle, id_ref);
749     } else {
750       __kmp_check_workshare(gtid, ct_psingle, id_ref);
751     }
752   }
753 #if USE_ITT_BUILD
754   if (status) {
755     __kmp_itt_single_start(gtid);
756   }
757 #endif /* USE_ITT_BUILD */
758   return status;
759 }
760 
/* Leave a SINGLE block for thread gtid: notify ITT (when built with
   USE_ITT_BUILD) and, if consistency checking is enabled, pop the ct_psingle
   workshare. */
void __kmp_exit_single(int gtid) {
#if USE_ITT_BUILD
  __kmp_itt_single_end(gtid);
#endif /* USE_ITT_BUILD */
  if (__kmp_env_consistency_check)
    __kmp_pop_workshare(gtid, ct_psingle, NULL);
}
768 
769 /* determine if we can go parallel or must use a serialized parallel region and
770  * how many threads we can use
771  * set_nproc is the number of threads requested for the team
772  * returns 0 if we should serialize or only use one thread,
773  * otherwise the number of threads to use
774  * The forkjoin lock is held by the caller. */
775 static int __kmp_reserve_threads(kmp_root_t *root, kmp_team_t *parent_team,
776                                  int master_tid, int set_nthreads,
777                                  int enter_teams) {
778   int capacity;
779   int new_nthreads;
780   KMP_DEBUG_ASSERT(__kmp_init_serial);
781   KMP_DEBUG_ASSERT(root && parent_team);
782   kmp_info_t *this_thr = parent_team->t.t_threads[master_tid];
783 
784   // If dyn-var is set, dynamically adjust the number of desired threads,
785   // according to the method specified by dynamic_mode.
786   new_nthreads = set_nthreads;
787   if (!get__dynamic_2(parent_team, master_tid)) {
788     ;
789   }
790 #ifdef USE_LOAD_BALANCE
791   else if (__kmp_global.g.g_dynamic_mode == dynamic_load_balance) {
792     new_nthreads = __kmp_load_balance_nproc(root, set_nthreads);
793     if (new_nthreads == 1) {
794       KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced "
795                     "reservation to 1 thread\n",
796                     master_tid));
797       return 1;
798     }
799     if (new_nthreads < set_nthreads) {
800       KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced "
801                     "reservation to %d threads\n",
802                     master_tid, new_nthreads));
803     }
804   }
805 #endif /* USE_LOAD_BALANCE */
806   else if (__kmp_global.g.g_dynamic_mode == dynamic_thread_limit) {
807     new_nthreads = __kmp_avail_proc - __kmp_nth +
808                    (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
809     if (new_nthreads <= 1) {
810       KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced "
811                     "reservation to 1 thread\n",
812                     master_tid));
813       return 1;
814     }
815     if (new_nthreads < set_nthreads) {
816       KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced "
817                     "reservation to %d threads\n",
818                     master_tid, new_nthreads));
819     } else {
820       new_nthreads = set_nthreads;
821     }
822   } else if (__kmp_global.g.g_dynamic_mode == dynamic_random) {
823     if (set_nthreads > 2) {
824       new_nthreads = __kmp_get_random(parent_team->t.t_threads[master_tid]);
825       new_nthreads = (new_nthreads % set_nthreads) + 1;
826       if (new_nthreads == 1) {
827         KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced "
828                       "reservation to 1 thread\n",
829                       master_tid));
830         return 1;
831       }
832       if (new_nthreads < set_nthreads) {
833         KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced "
834                       "reservation to %d threads\n",
835                       master_tid, new_nthreads));
836       }
837     }
838   } else {
839     KMP_ASSERT(0);
840   }
841 
842   // Respect KMP_ALL_THREADS/KMP_DEVICE_THREAD_LIMIT.
843   if (__kmp_nth + new_nthreads -
844           (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
845       __kmp_max_nth) {
846     int tl_nthreads = __kmp_max_nth - __kmp_nth +
847                       (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
848     if (tl_nthreads <= 0) {
849       tl_nthreads = 1;
850     }
851 
852     // If dyn-var is false, emit a 1-time warning.
853     if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
854       __kmp_reserve_warn = 1;
855       __kmp_msg(kmp_ms_warning,
856                 KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads),
857                 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
858     }
859     if (tl_nthreads == 1) {
860       KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT "
861                     "reduced reservation to 1 thread\n",
862                     master_tid));
863       return 1;
864     }
865     KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT reduced "
866                   "reservation to %d threads\n",
867                   master_tid, tl_nthreads));
868     new_nthreads = tl_nthreads;
869   }
870 
871   // Respect OMP_THREAD_LIMIT
872   int cg_nthreads = this_thr->th.th_cg_roots->cg_nthreads;
873   int max_cg_threads = this_thr->th.th_cg_roots->cg_thread_limit;
874   if (cg_nthreads + new_nthreads -
875           (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
876       max_cg_threads) {
877     int tl_nthreads = max_cg_threads - cg_nthreads +
878                       (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
879     if (tl_nthreads <= 0) {
880       tl_nthreads = 1;
881     }
882 
883     // If dyn-var is false, emit a 1-time warning.
884     if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
885       __kmp_reserve_warn = 1;
886       __kmp_msg(kmp_ms_warning,
887                 KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads),
888                 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
889     }
890     if (tl_nthreads == 1) {
891       KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT "
892                     "reduced reservation to 1 thread\n",
893                     master_tid));
894       return 1;
895     }
896     KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT reduced "
897                   "reservation to %d threads\n",
898                   master_tid, tl_nthreads));
899     new_nthreads = tl_nthreads;
900   }
901 
902   // Check if the threads array is large enough, or needs expanding.
903   // See comment in __kmp_register_root() about the adjustment if
904   // __kmp_threads[0] == NULL.
905   capacity = __kmp_threads_capacity;
906   if (TCR_PTR(__kmp_threads[0]) == NULL) {
907     --capacity;
908   }
909   if (__kmp_nth + new_nthreads -
910           (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
911       capacity) {
912     // Expand the threads array.
913     int slotsRequired = __kmp_nth + new_nthreads -
914                         (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) -
915                         capacity;
916     int slotsAdded = __kmp_expand_threads(slotsRequired);
917     if (slotsAdded < slotsRequired) {
918       // The threads array was not expanded enough.
919       new_nthreads -= (slotsRequired - slotsAdded);
920       KMP_ASSERT(new_nthreads >= 1);
921 
922       // If dyn-var is false, emit a 1-time warning.
923       if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
924         __kmp_reserve_warn = 1;
925         if (__kmp_tp_cached) {
926           __kmp_msg(kmp_ms_warning,
927                     KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads),
928                     KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity),
929                     KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null);
930         } else {
931           __kmp_msg(kmp_ms_warning,
932                     KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads),
933                     KMP_HNT(SystemLimitOnThreads), __kmp_msg_null);
934         }
935       }
936     }
937   }
938 
939 #ifdef KMP_DEBUG
940   if (new_nthreads == 1) {
941     KC_TRACE(10,
942              ("__kmp_reserve_threads: T#%d serializing team after reclaiming "
943               "dead roots and rechecking; requested %d threads\n",
944               __kmp_get_gtid(), set_nthreads));
945   } else {
946     KC_TRACE(10, ("__kmp_reserve_threads: T#%d allocating %d threads; requested"
947                   " %d threads\n",
948                   __kmp_get_gtid(), new_nthreads, set_nthreads));
949   }
950 #endif // KMP_DEBUG
951   return new_nthreads;
952 }
953 
954 /* Allocate threads from the thread pool and assign them to the new team. We are
955    assured that there are enough threads available, because we checked on that
956    earlier within critical section forkjoin */
957 static void __kmp_fork_team_threads(kmp_root_t *root, kmp_team_t *team,
958                                     kmp_info_t *master_th, int master_gtid) {
959   int i;
960   int use_hot_team;
961 
962   KA_TRACE(10, ("__kmp_fork_team_threads: new_nprocs = %d\n", team->t.t_nproc));
963   KMP_DEBUG_ASSERT(master_gtid == __kmp_get_gtid());
964   KMP_MB();
965 
966   /* first, let's setup the master thread */
967   master_th->th.th_info.ds.ds_tid = 0;
968   master_th->th.th_team = team;
969   master_th->th.th_team_nproc = team->t.t_nproc;
970   master_th->th.th_team_master = master_th;
971   master_th->th.th_team_serialized = FALSE;
972   master_th->th.th_dispatch = &team->t.t_dispatch[0];
973 
974 /* make sure we are not the optimized hot team */
975 #if KMP_NESTED_HOT_TEAMS
976   use_hot_team = 0;
977   kmp_hot_team_ptr_t *hot_teams = master_th->th.th_hot_teams;
978   if (hot_teams) { // hot teams array is not allocated if
979     // KMP_HOT_TEAMS_MAX_LEVEL=0
980     int level = team->t.t_active_level - 1; // index in array of hot teams
981     if (master_th->th.th_teams_microtask) { // are we inside the teams?
982       if (master_th->th.th_teams_size.nteams > 1) {
983         ++level; // level was not increased in teams construct for
984         // team_of_masters
985       }
986       if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
987           master_th->th.th_teams_level == team->t.t_level) {
988         ++level; // level was not increased in teams construct for
989         // team_of_workers before the parallel
990       } // team->t.t_level will be increased inside parallel
991     }
992     if (level < __kmp_hot_teams_max_level) {
993       if (hot_teams[level].hot_team) {
994         // hot team has already been allocated for given level
995         KMP_DEBUG_ASSERT(hot_teams[level].hot_team == team);
996         use_hot_team = 1; // the team is ready to use
997       } else {
998         use_hot_team = 0; // AC: threads are not allocated yet
999         hot_teams[level].hot_team = team; // remember new hot team
1000         hot_teams[level].hot_team_nth = team->t.t_nproc;
1001       }
1002     } else {
1003       use_hot_team = 0;
1004     }
1005   }
1006 #else
1007   use_hot_team = team == root->r.r_hot_team;
1008 #endif
1009   if (!use_hot_team) {
1010 
1011     /* install the master thread */
1012     team->t.t_threads[0] = master_th;
1013     __kmp_initialize_info(master_th, team, 0, master_gtid);
1014 
1015     /* now, install the worker threads */
1016     for (i = 1; i < team->t.t_nproc; i++) {
1017 
1018       /* fork or reallocate a new thread and install it in team */
1019       kmp_info_t *thr = __kmp_allocate_thread(root, team, i);
1020       team->t.t_threads[i] = thr;
1021       KMP_DEBUG_ASSERT(thr);
1022       KMP_DEBUG_ASSERT(thr->th.th_team == team);
1023       /* align team and thread arrived states */
1024       KA_TRACE(20, ("__kmp_fork_team_threads: T#%d(%d:%d) init arrived "
1025                     "T#%d(%d:%d) join =%llu, plain=%llu\n",
1026                     __kmp_gtid_from_tid(0, team), team->t.t_id, 0,
1027                     __kmp_gtid_from_tid(i, team), team->t.t_id, i,
1028                     team->t.t_bar[bs_forkjoin_barrier].b_arrived,
1029                     team->t.t_bar[bs_plain_barrier].b_arrived));
1030       thr->th.th_teams_microtask = master_th->th.th_teams_microtask;
1031       thr->th.th_teams_level = master_th->th.th_teams_level;
1032       thr->th.th_teams_size = master_th->th.th_teams_size;
1033       { // Initialize threads' barrier data.
1034         int b;
1035         kmp_balign_t *balign = team->t.t_threads[i]->th.th_bar;
1036         for (b = 0; b < bs_last_barrier; ++b) {
1037           balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
1038           KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
1039 #if USE_DEBUGGER
1040           balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
1041 #endif
1042         }
1043       }
1044     }
1045 
1046 #if KMP_AFFINITY_SUPPORTED
1047     __kmp_partition_places(team);
1048 #endif
1049   }
1050 
1051   if (__kmp_display_affinity && team->t.t_display_affinity != 1) {
1052     for (i = 0; i < team->t.t_nproc; i++) {
1053       kmp_info_t *thr = team->t.t_threads[i];
1054       if (thr->th.th_prev_num_threads != team->t.t_nproc ||
1055           thr->th.th_prev_level != team->t.t_level) {
1056         team->t.t_display_affinity = 1;
1057         break;
1058       }
1059     }
1060   }
1061 
1062   KMP_MB();
1063 }
1064 
1065 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
1066 // Propagate any changes to the floating point control registers out to the team
1067 // We try to avoid unnecessary writes to the relevant cache line in the team
1068 // structure, so we don't make changes unless they are needed.
1069 inline static void propagateFPControl(kmp_team_t *team) {
1070   if (__kmp_inherit_fp_control) {
1071     kmp_int16 x87_fpu_control_word;
1072     kmp_uint32 mxcsr;
1073 
1074     // Get master values of FPU control flags (both X87 and vector)
1075     __kmp_store_x87_fpu_control_word(&x87_fpu_control_word);
1076     __kmp_store_mxcsr(&mxcsr);
1077     mxcsr &= KMP_X86_MXCSR_MASK;
1078 
1079     // There is no point looking at t_fp_control_saved here.
1080     // If it is TRUE, we still have to update the values if they are different
1081     // from those we now have. If it is FALSE we didn't save anything yet, but
1082     // our objective is the same. We have to ensure that the values in the team
1083     // are the same as those we have.
1084     // So, this code achieves what we need whether or not t_fp_control_saved is
1085     // true. By checking whether the value needs updating we avoid unnecessary
1086     // writes that would put the cache-line into a written state, causing all
1087     // threads in the team to have to read it again.
1088     KMP_CHECK_UPDATE(team->t.t_x87_fpu_control_word, x87_fpu_control_word);
1089     KMP_CHECK_UPDATE(team->t.t_mxcsr, mxcsr);
1090     // Although we don't use this value, other code in the runtime wants to know
1091     // whether it should restore them. So we must ensure it is correct.
1092     KMP_CHECK_UPDATE(team->t.t_fp_control_saved, TRUE);
1093   } else {
1094     // Similarly here. Don't write to this cache-line in the team structure
1095     // unless we have to.
1096     KMP_CHECK_UPDATE(team->t.t_fp_control_saved, FALSE);
1097   }
1098 }
1099 
1100 // Do the opposite, setting the hardware registers to the updated values from
1101 // the team.
1102 inline static void updateHWFPControl(kmp_team_t *team) {
1103   if (__kmp_inherit_fp_control && team->t.t_fp_control_saved) {
1104     // Only reset the fp control regs if they have been changed in the team.
1105     // the parallel region that we are exiting.
1106     kmp_int16 x87_fpu_control_word;
1107     kmp_uint32 mxcsr;
1108     __kmp_store_x87_fpu_control_word(&x87_fpu_control_word);
1109     __kmp_store_mxcsr(&mxcsr);
1110     mxcsr &= KMP_X86_MXCSR_MASK;
1111 
1112     if (team->t.t_x87_fpu_control_word != x87_fpu_control_word) {
1113       __kmp_clear_x87_fpu_status_word();
1114       __kmp_load_x87_fpu_control_word(&team->t.t_x87_fpu_control_word);
1115     }
1116 
1117     if (team->t.t_mxcsr != mxcsr) {
1118       __kmp_load_mxcsr(&team->t.t_mxcsr);
1119     }
1120   }
1121 }
1122 #else
1123 #define propagateFPControl(x) ((void)0)
1124 #define updateHWFPControl(x) ((void)0)
1125 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
1126 
1127 static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team,
1128                                      int realloc); // forward declaration
1129 
1130 /* Run a parallel region that has been serialized, so runs only in a team of the
1131    single master thread. */
1132 void __kmp_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
1133   kmp_info_t *this_thr;
1134   kmp_team_t *serial_team;
1135 
1136   KC_TRACE(10, ("__kmpc_serialized_parallel: called by T#%d\n", global_tid));
1137 
1138   /* Skip all this code for autopar serialized loops since it results in
1139      unacceptable overhead */
1140   if (loc != NULL && (loc->flags & KMP_IDENT_AUTOPAR))
1141     return;
1142 
1143   if (!TCR_4(__kmp_init_parallel))
1144     __kmp_parallel_initialize();
1145   __kmp_resume_if_soft_paused();
1146 
1147   this_thr = __kmp_threads[global_tid];
1148   serial_team = this_thr->th.th_serial_team;
1149 
1150   /* utilize the serialized team held by this thread */
1151   KMP_DEBUG_ASSERT(serial_team);
1152   KMP_MB();
1153 
1154   if (__kmp_tasking_mode != tskm_immediate_exec) {
1155     KMP_DEBUG_ASSERT(
1156         this_thr->th.th_task_team ==
1157         this_thr->th.th_team->t.t_task_team[this_thr->th.th_task_state]);
1158     KMP_DEBUG_ASSERT(serial_team->t.t_task_team[this_thr->th.th_task_state] ==
1159                      NULL);
1160     KA_TRACE(20, ("__kmpc_serialized_parallel: T#%d pushing task_team %p / "
1161                   "team %p, new task_team = NULL\n",
1162                   global_tid, this_thr->th.th_task_team, this_thr->th.th_team));
1163     this_thr->th.th_task_team = NULL;
1164   }
1165 
1166   kmp_proc_bind_t proc_bind = this_thr->th.th_set_proc_bind;
1167   if (this_thr->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
1168     proc_bind = proc_bind_false;
1169   } else if (proc_bind == proc_bind_default) {
1170     // No proc_bind clause was specified, so use the current value
1171     // of proc-bind-var for this parallel region.
1172     proc_bind = this_thr->th.th_current_task->td_icvs.proc_bind;
1173   }
1174   // Reset for next parallel region
1175   this_thr->th.th_set_proc_bind = proc_bind_default;
1176 
1177 #if OMPT_SUPPORT
1178   ompt_data_t ompt_parallel_data = ompt_data_none;
1179   ompt_data_t *implicit_task_data;
1180   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(global_tid);
1181   if (ompt_enabled.enabled &&
1182       this_thr->th.ompt_thread_info.state != ompt_state_overhead) {
1183 
1184     ompt_task_info_t *parent_task_info;
1185     parent_task_info = OMPT_CUR_TASK_INFO(this_thr);
1186 
1187     parent_task_info->frame.enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1188     if (ompt_enabled.ompt_callback_parallel_begin) {
1189       int team_size = 1;
1190 
1191       ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)(
1192           &(parent_task_info->task_data), &(parent_task_info->frame),
1193           &ompt_parallel_data, team_size, ompt_parallel_invoker_program,
1194           codeptr);
1195     }
1196   }
1197 #endif // OMPT_SUPPORT
1198 
1199   if (this_thr->th.th_team != serial_team) {
1200     // Nested level will be an index in the nested nthreads array
1201     int level = this_thr->th.th_team->t.t_level;
1202 
1203     if (serial_team->t.t_serialized) {
1204       /* this serial team was already used
1205          TODO increase performance by making this locks more specific */
1206       kmp_team_t *new_team;
1207 
1208       __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
1209 
1210       new_team =
1211           __kmp_allocate_team(this_thr->th.th_root, 1, 1,
1212 #if OMPT_SUPPORT
1213                               ompt_parallel_data,
1214 #endif
1215                               proc_bind, &this_thr->th.th_current_task->td_icvs,
1216                               0 USE_NESTED_HOT_ARG(NULL));
1217       __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
1218       KMP_ASSERT(new_team);
1219 
1220       /* setup new serialized team and install it */
1221       new_team->t.t_threads[0] = this_thr;
1222       new_team->t.t_parent = this_thr->th.th_team;
1223       serial_team = new_team;
1224       this_thr->th.th_serial_team = serial_team;
1225 
1226       KF_TRACE(
1227           10,
1228           ("__kmpc_serialized_parallel: T#%d allocated new serial team %p\n",
1229            global_tid, serial_team));
1230 
1231       /* TODO the above breaks the requirement that if we run out of resources,
1232          then we can still guarantee that serialized teams are ok, since we may
1233          need to allocate a new one */
1234     } else {
1235       KF_TRACE(
1236           10,
1237           ("__kmpc_serialized_parallel: T#%d reusing cached serial team %p\n",
1238            global_tid, serial_team));
1239     }
1240 
1241     /* we have to initialize this serial team */
1242     KMP_DEBUG_ASSERT(serial_team->t.t_threads);
1243     KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
1244     KMP_DEBUG_ASSERT(this_thr->th.th_team != serial_team);
1245     serial_team->t.t_ident = loc;
1246     serial_team->t.t_serialized = 1;
1247     serial_team->t.t_nproc = 1;
1248     serial_team->t.t_parent = this_thr->th.th_team;
1249     serial_team->t.t_sched.sched = this_thr->th.th_team->t.t_sched.sched;
1250     this_thr->th.th_team = serial_team;
1251     serial_team->t.t_master_tid = this_thr->th.th_info.ds.ds_tid;
1252 
1253     KF_TRACE(10, ("__kmpc_serialized_parallel: T#d curtask=%p\n", global_tid,
1254                   this_thr->th.th_current_task));
1255     KMP_ASSERT(this_thr->th.th_current_task->td_flags.executing == 1);
1256     this_thr->th.th_current_task->td_flags.executing = 0;
1257 
1258     __kmp_push_current_task_to_thread(this_thr, serial_team, 0);
1259 
1260     /* TODO: GEH: do ICVs work for nested serialized teams? Don't we need an
1261        implicit task for each serialized task represented by
1262        team->t.t_serialized? */
1263     copy_icvs(&this_thr->th.th_current_task->td_icvs,
1264               &this_thr->th.th_current_task->td_parent->td_icvs);
1265 
1266     // Thread value exists in the nested nthreads array for the next nested
1267     // level
1268     if (__kmp_nested_nth.used && (level + 1 < __kmp_nested_nth.used)) {
1269       this_thr->th.th_current_task->td_icvs.nproc =
1270           __kmp_nested_nth.nth[level + 1];
1271     }
1272 
1273     if (__kmp_nested_proc_bind.used &&
1274         (level + 1 < __kmp_nested_proc_bind.used)) {
1275       this_thr->th.th_current_task->td_icvs.proc_bind =
1276           __kmp_nested_proc_bind.bind_types[level + 1];
1277     }
1278 
1279 #if USE_DEBUGGER
1280     serial_team->t.t_pkfn = (microtask_t)(~0); // For the debugger.
1281 #endif
1282     this_thr->th.th_info.ds.ds_tid = 0;
1283 
1284     /* set thread cache values */
1285     this_thr->th.th_team_nproc = 1;
1286     this_thr->th.th_team_master = this_thr;
1287     this_thr->th.th_team_serialized = 1;
1288 
1289     serial_team->t.t_level = serial_team->t.t_parent->t.t_level + 1;
1290     serial_team->t.t_active_level = serial_team->t.t_parent->t.t_active_level;
1291     serial_team->t.t_def_allocator = this_thr->th.th_def_allocator; // save
1292 
1293     propagateFPControl(serial_team);
1294 
1295     /* check if we need to allocate dispatch buffers stack */
1296     KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
1297     if (!serial_team->t.t_dispatch->th_disp_buffer) {
1298       serial_team->t.t_dispatch->th_disp_buffer =
1299           (dispatch_private_info_t *)__kmp_allocate(
1300               sizeof(dispatch_private_info_t));
1301     }
1302     this_thr->th.th_dispatch = serial_team->t.t_dispatch;
1303 
1304     KMP_MB();
1305 
1306   } else {
1307     /* this serialized team is already being used,
1308      * that's fine, just add another nested level */
1309     KMP_DEBUG_ASSERT(this_thr->th.th_team == serial_team);
1310     KMP_DEBUG_ASSERT(serial_team->t.t_threads);
1311     KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
1312     ++serial_team->t.t_serialized;
1313     this_thr->th.th_team_serialized = serial_team->t.t_serialized;
1314 
1315     // Nested level will be an index in the nested nthreads array
1316     int level = this_thr->th.th_team->t.t_level;
1317     // Thread value exists in the nested nthreads array for the next nested
1318     // level
1319     if (__kmp_nested_nth.used && (level + 1 < __kmp_nested_nth.used)) {
1320       this_thr->th.th_current_task->td_icvs.nproc =
1321           __kmp_nested_nth.nth[level + 1];
1322     }
1323     serial_team->t.t_level++;
1324     KF_TRACE(10, ("__kmpc_serialized_parallel: T#%d increasing nesting level "
1325                   "of serial team %p to %d\n",
1326                   global_tid, serial_team, serial_team->t.t_level));
1327 
1328     /* allocate/push dispatch buffers stack */
1329     KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
1330     {
1331       dispatch_private_info_t *disp_buffer =
1332           (dispatch_private_info_t *)__kmp_allocate(
1333               sizeof(dispatch_private_info_t));
1334       disp_buffer->next = serial_team->t.t_dispatch->th_disp_buffer;
1335       serial_team->t.t_dispatch->th_disp_buffer = disp_buffer;
1336     }
1337     this_thr->th.th_dispatch = serial_team->t.t_dispatch;
1338 
1339     KMP_MB();
1340   }
1341   KMP_CHECK_UPDATE(serial_team->t.t_cancel_request, cancel_noreq);
1342 
1343   // Perform the display affinity functionality for
1344   // serialized parallel regions
1345   if (__kmp_display_affinity) {
1346     if (this_thr->th.th_prev_level != serial_team->t.t_level ||
1347         this_thr->th.th_prev_num_threads != 1) {
1348       // NULL means use the affinity-format-var ICV
1349       __kmp_aux_display_affinity(global_tid, NULL);
1350       this_thr->th.th_prev_level = serial_team->t.t_level;
1351       this_thr->th.th_prev_num_threads = 1;
1352     }
1353   }
1354 
1355   if (__kmp_env_consistency_check)
1356     __kmp_push_parallel(global_tid, NULL);
1357 #if OMPT_SUPPORT
1358   serial_team->t.ompt_team_info.master_return_address = codeptr;
1359   if (ompt_enabled.enabled &&
1360       this_thr->th.ompt_thread_info.state != ompt_state_overhead) {
1361     OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1362 
1363     ompt_lw_taskteam_t lw_taskteam;
1364     __ompt_lw_taskteam_init(&lw_taskteam, this_thr, global_tid,
1365                             &ompt_parallel_data, codeptr);
1366 
1367     __ompt_lw_taskteam_link(&lw_taskteam, this_thr, 1);
1368     // don't use lw_taskteam after linking. content was swaped
1369 
1370     /* OMPT implicit task begin */
1371     implicit_task_data = OMPT_CUR_TASK_DATA(this_thr);
1372     if (ompt_enabled.ompt_callback_implicit_task) {
1373       ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1374           ompt_scope_begin, OMPT_CUR_TEAM_DATA(this_thr),
1375           OMPT_CUR_TASK_DATA(this_thr), 1, __kmp_tid_from_gtid(global_tid), ompt_task_implicit); // TODO: Can this be ompt_task_initial?
1376       OMPT_CUR_TASK_INFO(this_thr)
1377           ->thread_num = __kmp_tid_from_gtid(global_tid);
1378     }
1379 
1380     /* OMPT state */
1381     this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
1382     OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1383   }
1384 #endif
1385 }
1386 
1387 /* most of the work for a fork */
1388 /* return true if we really went parallel, false if serialized */
1389 int __kmp_fork_call(ident_t *loc, int gtid,
1390                     enum fork_context_e call_context, // Intel, GNU, ...
1391                     kmp_int32 argc, microtask_t microtask, launch_t invoker,
1392 /* TODO: revert workaround for Intel(R) 64 tracker #96 */
1393 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
1394                     va_list *ap
1395 #else
1396                     va_list ap
1397 #endif
1398                     ) {
1399   void **argv;
1400   int i;
1401   int master_tid;
1402   int master_this_cons;
1403   kmp_team_t *team;
1404   kmp_team_t *parent_team;
1405   kmp_info_t *master_th;
1406   kmp_root_t *root;
1407   int nthreads;
1408   int master_active;
1409   int master_set_numthreads;
1410   int level;
1411   int active_level;
1412   int teams_level;
1413 #if KMP_NESTED_HOT_TEAMS
1414   kmp_hot_team_ptr_t **p_hot_teams;
1415 #endif
1416   { // KMP_TIME_BLOCK
1417     KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_fork_call);
1418     KMP_COUNT_VALUE(OMP_PARALLEL_args, argc);
1419 
1420     KA_TRACE(20, ("__kmp_fork_call: enter T#%d\n", gtid));
1421     if (__kmp_stkpadding > 0 && __kmp_root[gtid] != NULL) {
1422       /* Some systems prefer the stack for the root thread(s) to start with */
1423       /* some gap from the parent stack to prevent false sharing. */
1424       void *dummy = KMP_ALLOCA(__kmp_stkpadding);
1425       /* These 2 lines below are so this does not get optimized out */
1426       if (__kmp_stkpadding > KMP_MAX_STKPADDING)
1427         __kmp_stkpadding += (short)((kmp_int64)dummy);
1428     }
1429 
1430     /* initialize if needed */
1431     KMP_DEBUG_ASSERT(
1432         __kmp_init_serial); // AC: potentially unsafe, not in sync with shutdown
1433     if (!TCR_4(__kmp_init_parallel))
1434       __kmp_parallel_initialize();
1435     __kmp_resume_if_soft_paused();
1436 
1437     /* setup current data */
1438     master_th = __kmp_threads[gtid]; // AC: potentially unsafe, not in sync with
1439     // shutdown
1440     parent_team = master_th->th.th_team;
1441     master_tid = master_th->th.th_info.ds.ds_tid;
1442     master_this_cons = master_th->th.th_local.this_construct;
1443     root = master_th->th.th_root;
1444     master_active = root->r.r_active;
1445     master_set_numthreads = master_th->th.th_set_nproc;
1446 
1447 #if OMPT_SUPPORT
1448     ompt_data_t ompt_parallel_data = ompt_data_none;
1449     ompt_data_t *parent_task_data;
1450     ompt_frame_t *ompt_frame;
1451     ompt_data_t *implicit_task_data;
1452     void *return_address = NULL;
1453 
1454     if (ompt_enabled.enabled) {
1455       __ompt_get_task_info_internal(0, NULL, &parent_task_data, &ompt_frame,
1456                                     NULL, NULL);
1457       return_address = OMPT_LOAD_RETURN_ADDRESS(gtid);
1458     }
1459 #endif
1460 
1461     // Nested level will be an index in the nested nthreads array
1462     level = parent_team->t.t_level;
1463     // used to launch non-serial teams even if nested is not allowed
1464     active_level = parent_team->t.t_active_level;
1465     // needed to check nesting inside the teams
1466     teams_level = master_th->th.th_teams_level;
1467 #if KMP_NESTED_HOT_TEAMS
1468     p_hot_teams = &master_th->th.th_hot_teams;
1469     if (*p_hot_teams == NULL && __kmp_hot_teams_max_level > 0) {
1470       *p_hot_teams = (kmp_hot_team_ptr_t *)__kmp_allocate(
1471           sizeof(kmp_hot_team_ptr_t) * __kmp_hot_teams_max_level);
1472       (*p_hot_teams)[0].hot_team = root->r.r_hot_team;
1473       // it is either actual or not needed (when active_level > 0)
1474       (*p_hot_teams)[0].hot_team_nth = 1;
1475     }
1476 #endif
1477 
1478 #if OMPT_SUPPORT
1479     if (ompt_enabled.enabled) {
1480       if (ompt_enabled.ompt_callback_parallel_begin) {
1481         int team_size = master_set_numthreads
1482                             ? master_set_numthreads
1483                             : get__nproc_2(parent_team, master_tid);
1484         ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)(
1485             parent_task_data, ompt_frame, &ompt_parallel_data, team_size,
1486             OMPT_INVOKER(call_context), return_address);
1487       }
1488       master_th->th.ompt_thread_info.state = ompt_state_overhead;
1489     }
1490 #endif
1491 
1492     master_th->th.th_ident = loc;
1493 
1494     if (master_th->th.th_teams_microtask && ap &&
1495         microtask != (microtask_t)__kmp_teams_master && level == teams_level) {
1496       // AC: This is start of parallel that is nested inside teams construct.
1497       // The team is actual (hot), all workers are ready at the fork barrier.
1498       // No lock needed to initialize the team a bit, then free workers.
1499       parent_team->t.t_ident = loc;
1500       __kmp_alloc_argv_entries(argc, parent_team, TRUE);
1501       parent_team->t.t_argc = argc;
1502       argv = (void **)parent_team->t.t_argv;
1503       for (i = argc - 1; i >= 0; --i)
1504 /* TODO: revert workaround for Intel(R) 64 tracker #96 */
1505 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
1506         *argv++ = va_arg(*ap, void *);
1507 #else
1508         *argv++ = va_arg(ap, void *);
1509 #endif
1510       // Increment our nested depth levels, but not increase the serialization
1511       if (parent_team == master_th->th.th_serial_team) {
1512         // AC: we are in serialized parallel
1513         __kmpc_serialized_parallel(loc, gtid);
1514         KMP_DEBUG_ASSERT(parent_team->t.t_serialized > 1);
1515         // AC: need this in order enquiry functions work
1516         // correctly, will restore at join time
1517         parent_team->t.t_serialized--;
1518 #if OMPT_SUPPORT
1519         void *dummy;
1520         void **exit_runtime_p;
1521 
1522         ompt_lw_taskteam_t lw_taskteam;
1523 
1524         if (ompt_enabled.enabled) {
1525           __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1526                                   &ompt_parallel_data, return_address);
1527           exit_runtime_p = &(lw_taskteam.ompt_task_info.frame.exit_frame.ptr);
1528 
1529           __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
1530           // don't use lw_taskteam after linking. content was swaped
1531 
1532           /* OMPT implicit task begin */
1533           implicit_task_data = OMPT_CUR_TASK_DATA(master_th);
1534           if (ompt_enabled.ompt_callback_implicit_task) {
1535             ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1536                 ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1537                 implicit_task_data, 1, __kmp_tid_from_gtid(gtid), ompt_task_implicit); // TODO: Can this be ompt_task_initial?
1538             OMPT_CUR_TASK_INFO(master_th)
1539                 ->thread_num = __kmp_tid_from_gtid(gtid);
1540           }
1541 
1542           /* OMPT state */
1543           master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1544         } else {
1545           exit_runtime_p = &dummy;
1546         }
1547 #endif
1548 
1549         {
1550           KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1551           KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1552           __kmp_invoke_microtask(microtask, gtid, 0, argc, parent_team->t.t_argv
1553 #if OMPT_SUPPORT
1554                                  ,
1555                                  exit_runtime_p
1556 #endif
1557                                  );
1558         }
1559 
1560 #if OMPT_SUPPORT
1561         *exit_runtime_p = NULL;
1562         if (ompt_enabled.enabled) {
1563           OMPT_CUR_TASK_INFO(master_th)->frame.exit_frame = ompt_data_none;
1564           if (ompt_enabled.ompt_callback_implicit_task) {
1565             ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1566                 ompt_scope_end, NULL, implicit_task_data, 1,
1567                 OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit); // TODO: Can this be ompt_task_initial?
1568           }
1569           __ompt_lw_taskteam_unlink(master_th);
1570 
1571           if (ompt_enabled.ompt_callback_parallel_end) {
1572             ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1573                 OMPT_CUR_TEAM_DATA(master_th), OMPT_CUR_TASK_DATA(master_th),
1574                 OMPT_INVOKER(call_context), return_address);
1575           }
1576           master_th->th.ompt_thread_info.state = ompt_state_overhead;
1577         }
1578 #endif
1579         return TRUE;
1580       }
1581 
1582       parent_team->t.t_pkfn = microtask;
1583       parent_team->t.t_invoke = invoker;
1584       KMP_ATOMIC_INC(&root->r.r_in_parallel);
1585       parent_team->t.t_active_level++;
1586       parent_team->t.t_level++;
1587       parent_team->t.t_def_allocator = master_th->th.th_def_allocator; // save
1588 
1589       /* Change number of threads in the team if requested */
1590       if (master_set_numthreads) { // The parallel has num_threads clause
1591         if (master_set_numthreads < master_th->th.th_teams_size.nth) {
1592           // AC: only can reduce number of threads dynamically, can't increase
1593           kmp_info_t **other_threads = parent_team->t.t_threads;
1594           parent_team->t.t_nproc = master_set_numthreads;
1595           for (i = 0; i < master_set_numthreads; ++i) {
1596             other_threads[i]->th.th_team_nproc = master_set_numthreads;
1597           }
1598           // Keep extra threads hot in the team for possible next parallels
1599         }
1600         master_th->th.th_set_nproc = 0;
1601       }
1602 
1603 #if USE_DEBUGGER
1604       if (__kmp_debugging) { // Let debugger override number of threads.
1605         int nth = __kmp_omp_num_threads(loc);
1606         if (nth > 0) { // 0 means debugger doesn't want to change num threads
1607           master_set_numthreads = nth;
1608         }
1609       }
1610 #endif
1611 
1612       KF_TRACE(10, ("__kmp_fork_call: before internal fork: root=%p, team=%p, "
1613                     "master_th=%p, gtid=%d\n",
1614                     root, parent_team, master_th, gtid));
1615       __kmp_internal_fork(loc, gtid, parent_team);
1616       KF_TRACE(10, ("__kmp_fork_call: after internal fork: root=%p, team=%p, "
1617                     "master_th=%p, gtid=%d\n",
1618                     root, parent_team, master_th, gtid));
1619 
1620       /* Invoke microtask for MASTER thread */
1621       KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n", gtid,
1622                     parent_team->t.t_id, parent_team->t.t_pkfn));
1623 
1624       if (!parent_team->t.t_invoke(gtid)) {
1625         KMP_ASSERT2(0, "cannot invoke microtask for MASTER thread");
1626       }
1627       KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n", gtid,
1628                     parent_team->t.t_id, parent_team->t.t_pkfn));
1629       KMP_MB(); /* Flush all pending memory write invalidates.  */
1630 
1631       KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
1632 
1633       return TRUE;
1634     } // Parallel closely nested in teams construct
1635 
1636 #if KMP_DEBUG
1637     if (__kmp_tasking_mode != tskm_immediate_exec) {
1638       KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
1639                        parent_team->t.t_task_team[master_th->th.th_task_state]);
1640     }
1641 #endif
1642 
1643     if (parent_team->t.t_active_level >=
1644         master_th->th.th_current_task->td_icvs.max_active_levels) {
1645       nthreads = 1;
1646     } else {
1647       int enter_teams = ((ap == NULL && active_level == 0) ||
1648                          (ap && teams_level > 0 && teams_level == level));
1649       nthreads =
1650           master_set_numthreads
1651               ? master_set_numthreads
1652               : get__nproc_2(
1653                     parent_team,
1654                     master_tid); // TODO: get nproc directly from current task
1655 
1656       // Check if we need to take forkjoin lock? (no need for serialized
1657       // parallel out of teams construct). This code moved here from
1658       // __kmp_reserve_threads() to speedup nested serialized parallels.
1659       if (nthreads > 1) {
1660         if ((get__max_active_levels(master_th) == 1 &&
1661              (root->r.r_in_parallel && !enter_teams)) ||
1662             (__kmp_library == library_serial)) {
1663           KC_TRACE(10, ("__kmp_fork_call: T#%d serializing team; requested %d"
1664                         " threads\n",
1665                         gtid, nthreads));
1666           nthreads = 1;
1667         }
1668       }
1669       if (nthreads > 1) {
1670         /* determine how many new threads we can use */
1671         __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
1672         /* AC: If we execute teams from parallel region (on host), then teams
1673            should be created but each can only have 1 thread if nesting is
1674            disabled. If teams called from serial region, then teams and their
1675            threads should be created regardless of the nesting setting. */
1676         nthreads = __kmp_reserve_threads(root, parent_team, master_tid,
1677                                          nthreads, enter_teams);
1678         if (nthreads == 1) {
1679           // Free lock for single thread execution here; for multi-thread
1680           // execution it will be freed later after team of threads created
1681           // and initialized
1682           __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
1683         }
1684       }
1685     }
1686     KMP_DEBUG_ASSERT(nthreads > 0);
1687 
1688     // If we temporarily changed the set number of threads then restore it now
1689     master_th->th.th_set_nproc = 0;
1690 
1691     /* create a serialized parallel region? */
1692     if (nthreads == 1) {
1693 /* josh todo: hypothetical question: what do we do for OS X*? */
1694 #if KMP_OS_LINUX &&                                                            \
1695     (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
1696       void *args[argc];
1697 #else
1698       void **args = (void **)KMP_ALLOCA(argc * sizeof(void *));
1699 #endif /* KMP_OS_LINUX && ( KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || \
1700           KMP_ARCH_AARCH64) */
1701 
1702       KA_TRACE(20,
1703                ("__kmp_fork_call: T#%d serializing parallel region\n", gtid));
1704 
1705       __kmpc_serialized_parallel(loc, gtid);
1706 
1707       if (call_context == fork_context_intel) {
1708         /* TODO this sucks, use the compiler itself to pass args! :) */
1709         master_th->th.th_serial_team->t.t_ident = loc;
1710         if (!ap) {
1711           // revert change made in __kmpc_serialized_parallel()
1712           master_th->th.th_serial_team->t.t_level--;
1713 // Get args from parent team for teams construct
1714 
1715 #if OMPT_SUPPORT
1716           void *dummy;
1717           void **exit_runtime_p;
1718           ompt_task_info_t *task_info;
1719 
1720           ompt_lw_taskteam_t lw_taskteam;
1721 
1722           if (ompt_enabled.enabled) {
1723             __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1724                                     &ompt_parallel_data, return_address);
1725 
1726             __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
1727             // don't use lw_taskteam after linking. content was swapped
1728 
1729             task_info = OMPT_CUR_TASK_INFO(master_th);
1730             exit_runtime_p = &(task_info->frame.exit_frame.ptr);
1731             if (ompt_enabled.ompt_callback_implicit_task) {
1732               ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1733                   ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1734                   &(task_info->task_data), 1, __kmp_tid_from_gtid(gtid), ompt_task_implicit); // TODO: Can this be ompt_task_initial?
1735               OMPT_CUR_TASK_INFO(master_th)
1736                   ->thread_num = __kmp_tid_from_gtid(gtid);
1737             }
1738 
1739             /* OMPT state */
1740             master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1741           } else {
1742             exit_runtime_p = &dummy;
1743           }
1744 #endif
1745 
1746           {
1747             KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1748             KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1749             __kmp_invoke_microtask(microtask, gtid, 0, argc,
1750                                    parent_team->t.t_argv
1751 #if OMPT_SUPPORT
1752                                    ,
1753                                    exit_runtime_p
1754 #endif
1755                                    );
1756           }
1757 
1758 #if OMPT_SUPPORT
1759           if (ompt_enabled.enabled) {
1760             exit_runtime_p = NULL;
1761             if (ompt_enabled.ompt_callback_implicit_task) {
1762               ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1763                   ompt_scope_end, NULL, &(task_info->task_data), 1,
1764                   OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit); // TODO: Can this be ompt_task_initial?
1765             }
1766 
1767             __ompt_lw_taskteam_unlink(master_th);
1768             if (ompt_enabled.ompt_callback_parallel_end) {
1769               ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1770                   OMPT_CUR_TEAM_DATA(master_th), parent_task_data,
1771                   OMPT_INVOKER(call_context), return_address);
1772             }
1773             master_th->th.ompt_thread_info.state = ompt_state_overhead;
1774           }
1775 #endif
1776         } else if (microtask == (microtask_t)__kmp_teams_master) {
1777           KMP_DEBUG_ASSERT(master_th->th.th_team ==
1778                            master_th->th.th_serial_team);
1779           team = master_th->th.th_team;
1780           // team->t.t_pkfn = microtask;
1781           team->t.t_invoke = invoker;
1782           __kmp_alloc_argv_entries(argc, team, TRUE);
1783           team->t.t_argc = argc;
1784           argv = (void **)team->t.t_argv;
1785           if (ap) {
1786             for (i = argc - 1; i >= 0; --i)
1787 // TODO: revert workaround for Intel(R) 64 tracker #96
1788 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
1789               *argv++ = va_arg(*ap, void *);
1790 #else
1791               *argv++ = va_arg(ap, void *);
1792 #endif
1793           } else {
1794             for (i = 0; i < argc; ++i)
1795               // Get args from parent team for teams construct
1796               argv[i] = parent_team->t.t_argv[i];
1797           }
1798           // AC: revert change made in __kmpc_serialized_parallel()
1799           //     because initial code in teams should have level=0
1800           team->t.t_level--;
1801           // AC: call special invoker for outer "parallel" of teams construct
1802           invoker(gtid);
1803         } else {
1804           argv = args;
1805           for (i = argc - 1; i >= 0; --i)
1806 // TODO: revert workaround for Intel(R) 64 tracker #96
1807 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
1808             *argv++ = va_arg(*ap, void *);
1809 #else
1810             *argv++ = va_arg(ap, void *);
1811 #endif
1812           KMP_MB();
1813 
1814 #if OMPT_SUPPORT
1815           void *dummy;
1816           void **exit_runtime_p;
1817           ompt_task_info_t *task_info;
1818 
1819           ompt_lw_taskteam_t lw_taskteam;
1820 
1821           if (ompt_enabled.enabled) {
1822             __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1823                                     &ompt_parallel_data, return_address);
1824             __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
1825             // don't use lw_taskteam after linking. content was swapped
1826             task_info = OMPT_CUR_TASK_INFO(master_th);
1827             exit_runtime_p = &(task_info->frame.exit_frame.ptr);
1828 
1829             /* OMPT implicit task begin */
1830             implicit_task_data = OMPT_CUR_TASK_DATA(master_th);
1831             if (ompt_enabled.ompt_callback_implicit_task) {
1832               ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1833                   ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1834                   implicit_task_data, 1, __kmp_tid_from_gtid(gtid), ompt_task_implicit); // TODO: Can this be ompt_task_initial?
1835               OMPT_CUR_TASK_INFO(master_th)
1836                   ->thread_num = __kmp_tid_from_gtid(gtid);
1837             }
1838 
1839             /* OMPT state */
1840             master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1841           } else {
1842             exit_runtime_p = &dummy;
1843           }
1844 #endif
1845 
1846           {
1847             KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1848             KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1849             __kmp_invoke_microtask(microtask, gtid, 0, argc, args
1850 #if OMPT_SUPPORT
1851                                    ,
1852                                    exit_runtime_p
1853 #endif
1854                                    );
1855           }
1856 
1857 #if OMPT_SUPPORT
1858           if (ompt_enabled.enabled) {
1859             *exit_runtime_p = NULL;
1860             if (ompt_enabled.ompt_callback_implicit_task) {
1861               ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1862                   ompt_scope_end, NULL, &(task_info->task_data), 1,
1863                   OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit); // TODO: Can this be ompt_task_initial?
1864             }
1865 
1866             ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
1867             __ompt_lw_taskteam_unlink(master_th);
1868             if (ompt_enabled.ompt_callback_parallel_end) {
1869               ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1870                   &ompt_parallel_data, parent_task_data,
1871                   OMPT_INVOKER(call_context), return_address);
1872             }
1873             master_th->th.ompt_thread_info.state = ompt_state_overhead;
1874           }
1875 #endif
1876         }
1877       } else if (call_context == fork_context_gnu) {
1878 #if OMPT_SUPPORT
1879         ompt_lw_taskteam_t lwt;
1880         __ompt_lw_taskteam_init(&lwt, master_th, gtid, &ompt_parallel_data,
1881                                 return_address);
1882 
1883         lwt.ompt_task_info.frame.exit_frame = ompt_data_none;
1884         __ompt_lw_taskteam_link(&lwt, master_th, 1);
1885 // don't use lw_taskteam after linking. content was swapped
1886 #endif
1887 
1888         // we were called from GNU native code
1889         KA_TRACE(20, ("__kmp_fork_call: T#%d serial exit\n", gtid));
1890         return FALSE;
1891       } else {
1892         KMP_ASSERT2(call_context < fork_context_last,
1893                     "__kmp_fork_call: unknown fork_context parameter");
1894       }
1895 
1896       KA_TRACE(20, ("__kmp_fork_call: T#%d serial exit\n", gtid));
1897       KMP_MB();
1898       return FALSE;
1899     } // if (nthreads == 1)
1900 
1901     // GEH: only modify the executing flag in the case when not serialized
1902     //      serialized case is handled in kmpc_serialized_parallel
1903     KF_TRACE(10, ("__kmp_fork_call: parent_team_aclevel=%d, master_th=%p, "
1904                   "curtask=%p, curtask_max_aclevel=%d\n",
1905                   parent_team->t.t_active_level, master_th,
1906                   master_th->th.th_current_task,
1907                   master_th->th.th_current_task->td_icvs.max_active_levels));
1908     // TODO: GEH - cannot do this assertion because root thread not set up as
1909     // executing
1910     // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 1 );
1911     master_th->th.th_current_task->td_flags.executing = 0;
1912 
1913     if (!master_th->th.th_teams_microtask || level > teams_level) {
1914       /* Increment our nested depth level */
1915       KMP_ATOMIC_INC(&root->r.r_in_parallel);
1916     }
1917 
1918     // See if we need to make a copy of the ICVs.
1919     int nthreads_icv = master_th->th.th_current_task->td_icvs.nproc;
1920     if ((level + 1 < __kmp_nested_nth.used) &&
1921         (__kmp_nested_nth.nth[level + 1] != nthreads_icv)) {
1922       nthreads_icv = __kmp_nested_nth.nth[level + 1];
1923     } else {
1924       nthreads_icv = 0; // don't update
1925     }
1926 
1927     // Figure out the proc_bind_policy for the new team.
1928     kmp_proc_bind_t proc_bind = master_th->th.th_set_proc_bind;
1929     kmp_proc_bind_t proc_bind_icv =
1930         proc_bind_default; // proc_bind_default means don't update
1931     if (master_th->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
1932       proc_bind = proc_bind_false;
1933     } else {
1934       if (proc_bind == proc_bind_default) {
1935         // No proc_bind clause specified; use current proc-bind-var for this
1936         // parallel region
1937         proc_bind = master_th->th.th_current_task->td_icvs.proc_bind;
1938       }
1939       /* else: The proc_bind policy was specified explicitly on parallel clause.
1940          This overrides proc-bind-var for this parallel region, but does not
1941          change proc-bind-var. */
1942       // Figure the value of proc-bind-var for the child threads.
1943       if ((level + 1 < __kmp_nested_proc_bind.used) &&
1944           (__kmp_nested_proc_bind.bind_types[level + 1] !=
1945            master_th->th.th_current_task->td_icvs.proc_bind)) {
1946         proc_bind_icv = __kmp_nested_proc_bind.bind_types[level + 1];
1947       }
1948     }
1949 
1950     // Reset for next parallel region
1951     master_th->th.th_set_proc_bind = proc_bind_default;
1952 
1953     if ((nthreads_icv > 0) || (proc_bind_icv != proc_bind_default)) {
1954       kmp_internal_control_t new_icvs;
1955       copy_icvs(&new_icvs, &master_th->th.th_current_task->td_icvs);
1956       new_icvs.next = NULL;
1957       if (nthreads_icv > 0) {
1958         new_icvs.nproc = nthreads_icv;
1959       }
1960       if (proc_bind_icv != proc_bind_default) {
1961         new_icvs.proc_bind = proc_bind_icv;
1962       }
1963 
1964       /* allocate a new parallel team */
1965       KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n"));
1966       team = __kmp_allocate_team(root, nthreads, nthreads,
1967 #if OMPT_SUPPORT
1968                                  ompt_parallel_data,
1969 #endif
1970                                  proc_bind, &new_icvs,
1971                                  argc USE_NESTED_HOT_ARG(master_th));
1972     } else {
1973       /* allocate a new parallel team */
1974       KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n"));
1975       team = __kmp_allocate_team(root, nthreads, nthreads,
1976 #if OMPT_SUPPORT
1977                                  ompt_parallel_data,
1978 #endif
1979                                  proc_bind,
1980                                  &master_th->th.th_current_task->td_icvs,
1981                                  argc USE_NESTED_HOT_ARG(master_th));
1982     }
1983     KF_TRACE(
1984         10, ("__kmp_fork_call: after __kmp_allocate_team - team = %p\n", team));
1985 
1986     /* setup the new team */
1987     KMP_CHECK_UPDATE(team->t.t_master_tid, master_tid);
1988     KMP_CHECK_UPDATE(team->t.t_master_this_cons, master_this_cons);
1989     KMP_CHECK_UPDATE(team->t.t_ident, loc);
1990     KMP_CHECK_UPDATE(team->t.t_parent, parent_team);
1991     KMP_CHECK_UPDATE_SYNC(team->t.t_pkfn, microtask);
1992 #if OMPT_SUPPORT
1993     KMP_CHECK_UPDATE_SYNC(team->t.ompt_team_info.master_return_address,
1994                           return_address);
1995 #endif
1996     KMP_CHECK_UPDATE(team->t.t_invoke, invoker); // TODO move to root, maybe
1997     // TODO: parent_team->t.t_level == INT_MAX ???
1998     if (!master_th->th.th_teams_microtask || level > teams_level) {
1999       int new_level = parent_team->t.t_level + 1;
2000       KMP_CHECK_UPDATE(team->t.t_level, new_level);
2001       new_level = parent_team->t.t_active_level + 1;
2002       KMP_CHECK_UPDATE(team->t.t_active_level, new_level);
2003     } else {
2004       // AC: Do not increase parallel level at start of the teams construct
2005       int new_level = parent_team->t.t_level;
2006       KMP_CHECK_UPDATE(team->t.t_level, new_level);
2007       new_level = parent_team->t.t_active_level;
2008       KMP_CHECK_UPDATE(team->t.t_active_level, new_level);
2009     }
2010     kmp_r_sched_t new_sched = get__sched_2(parent_team, master_tid);
2011     // set master's schedule as new run-time schedule
2012     KMP_CHECK_UPDATE(team->t.t_sched.sched, new_sched.sched);
2013 
2014     KMP_CHECK_UPDATE(team->t.t_cancel_request, cancel_noreq);
2015     KMP_CHECK_UPDATE(team->t.t_def_allocator, master_th->th.th_def_allocator);
2016 
2017     // Update the floating point rounding in the team if required.
2018     propagateFPControl(team);
2019 
2020     if (__kmp_tasking_mode != tskm_immediate_exec) {
2021       // Set master's task team to team's task team. Unless this is hot team, it
2022       // should be NULL.
2023       KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
2024                        parent_team->t.t_task_team[master_th->th.th_task_state]);
2025       KA_TRACE(20, ("__kmp_fork_call: Master T#%d pushing task_team %p / team "
2026                     "%p, new task_team %p / team %p\n",
2027                     __kmp_gtid_from_thread(master_th),
2028                     master_th->th.th_task_team, parent_team,
2029                     team->t.t_task_team[master_th->th.th_task_state], team));
2030 
2031       if (active_level || master_th->th.th_task_team) {
2032         // Take a memo of master's task_state
2033         KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack);
2034         if (master_th->th.th_task_state_top >=
2035             master_th->th.th_task_state_stack_sz) { // increase size
2036           kmp_uint32 new_size = 2 * master_th->th.th_task_state_stack_sz;
2037           kmp_uint8 *old_stack, *new_stack;
2038           kmp_uint32 i;
2039           new_stack = (kmp_uint8 *)__kmp_allocate(new_size);
2040           for (i = 0; i < master_th->th.th_task_state_stack_sz; ++i) {
2041             new_stack[i] = master_th->th.th_task_state_memo_stack[i];
2042           }
2043           for (i = master_th->th.th_task_state_stack_sz; i < new_size;
2044                ++i) { // zero-init rest of stack
2045             new_stack[i] = 0;
2046           }
2047           old_stack = master_th->th.th_task_state_memo_stack;
2048           master_th->th.th_task_state_memo_stack = new_stack;
2049           master_th->th.th_task_state_stack_sz = new_size;
2050           __kmp_free(old_stack);
2051         }
2052         // Store master's task_state on stack
2053         master_th->th
2054             .th_task_state_memo_stack[master_th->th.th_task_state_top] =
2055             master_th->th.th_task_state;
2056         master_th->th.th_task_state_top++;
2057 #if KMP_NESTED_HOT_TEAMS
2058         if (master_th->th.th_hot_teams &&
2059             active_level < __kmp_hot_teams_max_level &&
2060             team == master_th->th.th_hot_teams[active_level].hot_team) {
2061           // Restore master's nested state if nested hot team
2062           master_th->th.th_task_state =
2063               master_th->th
2064                   .th_task_state_memo_stack[master_th->th.th_task_state_top];
2065         } else {
2066 #endif
2067           master_th->th.th_task_state = 0;
2068 #if KMP_NESTED_HOT_TEAMS
2069         }
2070 #endif
2071       }
2072 #if !KMP_NESTED_HOT_TEAMS
2073       KMP_DEBUG_ASSERT((master_th->th.th_task_team == NULL) ||
2074                        (team == root->r.r_hot_team));
2075 #endif
2076     }
2077 
2078     KA_TRACE(
2079         20,
2080         ("__kmp_fork_call: T#%d(%d:%d)->(%d:0) created a team of %d threads\n",
2081          gtid, parent_team->t.t_id, team->t.t_master_tid, team->t.t_id,
2082          team->t.t_nproc));
2083     KMP_DEBUG_ASSERT(team != root->r.r_hot_team ||
2084                      (team->t.t_master_tid == 0 &&
2085                       (team->t.t_parent == root->r.r_root_team ||
2086                        team->t.t_parent->t.t_serialized)));
2087     KMP_MB();
2088 
2089     /* now, setup the arguments */
2090     argv = (void **)team->t.t_argv;
2091     if (ap) {
2092       for (i = argc - 1; i >= 0; --i) {
2093 // TODO: revert workaround for Intel(R) 64 tracker #96
2094 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
2095         void *new_argv = va_arg(*ap, void *);
2096 #else
2097         void *new_argv = va_arg(ap, void *);
2098 #endif
2099         KMP_CHECK_UPDATE(*argv, new_argv);
2100         argv++;
2101       }
2102     } else {
2103       for (i = 0; i < argc; ++i) {
2104         // Get args from parent team for teams construct
2105         KMP_CHECK_UPDATE(argv[i], team->t.t_parent->t.t_argv[i]);
2106       }
2107     }
2108 
2109     /* now actually fork the threads */
2110     KMP_CHECK_UPDATE(team->t.t_master_active, master_active);
2111     if (!root->r.r_active) // Only do assignment if it prevents cache ping-pong
2112       root->r.r_active = TRUE;
2113 
2114     __kmp_fork_team_threads(root, team, master_th, gtid);
2115     __kmp_setup_icv_copy(team, nthreads,
2116                          &master_th->th.th_current_task->td_icvs, loc);
2117 
2118 #if OMPT_SUPPORT
2119     master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
2120 #endif
2121 
2122     __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2123 
2124 #if USE_ITT_BUILD
2125     if (team->t.t_active_level == 1 // only report frames at level 1
2126         && !master_th->th.th_teams_microtask) { // not in teams construct
2127 #if USE_ITT_NOTIFY
2128       if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
2129           (__kmp_forkjoin_frames_mode == 3 ||
2130            __kmp_forkjoin_frames_mode == 1)) {
2131         kmp_uint64 tmp_time = 0;
2132         if (__itt_get_timestamp_ptr)
2133           tmp_time = __itt_get_timestamp();
2134         // Internal fork - report frame begin
2135         master_th->th.th_frame_time = tmp_time;
2136         if (__kmp_forkjoin_frames_mode == 3)
2137           team->t.t_region_time = tmp_time;
2138       } else
2139 // only one notification scheme (either "submit" or "forking/joined", not both)
2140 #endif /* USE_ITT_NOTIFY */
2141           if ((__itt_frame_begin_v3_ptr || KMP_ITT_DEBUG) &&
2142               __kmp_forkjoin_frames && !__kmp_forkjoin_frames_mode) {
2143         // Mark start of "parallel" region for Intel(R) VTune(TM) analyzer.
2144         __kmp_itt_region_forking(gtid, team->t.t_nproc, 0);
2145       }
2146     }
2147 #endif /* USE_ITT_BUILD */
2148 
2149     /* now go on and do the work */
2150     KMP_DEBUG_ASSERT(team == __kmp_threads[gtid]->th.th_team);
2151     KMP_MB();
2152     KF_TRACE(10,
2153              ("__kmp_internal_fork : root=%p, team=%p, master_th=%p, gtid=%d\n",
2154               root, team, master_th, gtid));
2155 
2156 #if USE_ITT_BUILD
2157     if (__itt_stack_caller_create_ptr) {
2158       team->t.t_stack_id =
2159           __kmp_itt_stack_caller_create(); // create new stack stitching id
2160       // before entering fork barrier
2161     }
2162 #endif /* USE_ITT_BUILD */
2163 
2164     // AC: skip __kmp_internal_fork at teams construct, let only master
2165     // threads execute
2166     if (ap) {
2167       __kmp_internal_fork(loc, gtid, team);
2168       KF_TRACE(10, ("__kmp_internal_fork : after : root=%p, team=%p, "
2169                     "master_th=%p, gtid=%d\n",
2170                     root, team, master_th, gtid));
2171     }
2172 
2173     if (call_context == fork_context_gnu) {
2174       KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
2175       return TRUE;
2176     }
2177 
2178     /* Invoke microtask for MASTER thread */
2179     KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n", gtid,
2180                   team->t.t_id, team->t.t_pkfn));
2181   } // END of timer KMP_fork_call block
2182 
2183 #if KMP_STATS_ENABLED
2184   // If beginning a teams construct, then change thread state
2185   stats_state_e previous_state = KMP_GET_THREAD_STATE();
2186   if (!ap) {
2187     KMP_SET_THREAD_STATE(stats_state_e::TEAMS_REGION);
2188   }
2189 #endif
2190 
2191   if (!team->t.t_invoke(gtid)) {
2192     KMP_ASSERT2(0, "cannot invoke microtask for MASTER thread");
2193   }
2194 
2195 #if KMP_STATS_ENABLED
2196   // If was beginning of a teams construct, then reset thread state
2197   if (!ap) {
2198     KMP_SET_THREAD_STATE(previous_state);
2199   }
2200 #endif
2201 
2202   KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n", gtid,
2203                 team->t.t_id, team->t.t_pkfn));
2204   KMP_MB(); /* Flush all pending memory write invalidates.  */
2205 
2206   KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
2207 
2208 #if OMPT_SUPPORT
2209   if (ompt_enabled.enabled) {
2210     master_th->th.ompt_thread_info.state = ompt_state_overhead;
2211   }
2212 #endif
2213 
2214   return TRUE;
2215 }
2216 
2217 #if OMPT_SUPPORT
2218 static inline void __kmp_join_restore_state(kmp_info_t *thread,
2219                                             kmp_team_t *team) {
2220   // restore state outside the region
2221   thread->th.ompt_thread_info.state =
2222       ((team->t.t_serialized) ? ompt_state_work_serial
2223                               : ompt_state_work_parallel);
2224 }
2225 
2226 static inline void __kmp_join_ompt(int gtid, kmp_info_t *thread,
2227                                    kmp_team_t *team, ompt_data_t *parallel_data,
2228                                    fork_context_e fork_context, void *codeptr) {
2229   ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2230   if (ompt_enabled.ompt_callback_parallel_end) {
2231     ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
2232         parallel_data, &(task_info->task_data), OMPT_INVOKER(fork_context),
2233         codeptr);
2234   }
2235 
2236   task_info->frame.enter_frame = ompt_data_none;
2237   __kmp_join_restore_state(thread, team);
2238 }
2239 #endif
2240 
// __kmp_join_call: finish the region run by the team that thread `gtid` is
// currently attached to (master_th->th.th_team).
//
// Parameters:
//   loc          - source location; stored into master_th->th.th_ident and
//                  forwarded to the join / serialized-end helpers.
//   gtid         - global thread id; used as the index into __kmp_threads.
//   fork_context - (OMPT_SUPPORT builds only) forwarded to __kmp_join_ompt.
//   exit_teams   - when non-zero, __kmp_internal_join is skipped and
//                  th_task_state is reset to 0 instead.
//
// The function has three exits:
//   1. team->t.t_serialized is set: delegate to
//      __kmpc_end_serialized_parallel and return.
//   2. A parallel region one level inside a teams construct: only the nesting
//      counters (and, if it was reduced, the team size) are restored; the team
//      structure is kept.
//   3. Otherwise: while holding __kmp_forkjoin_lock the team is handed to
//      __kmp_free_team and the thread's team-related fields are re-pointed at
//      parent_team.
2241 void __kmp_join_call(ident_t *loc, int gtid
2242 #if OMPT_SUPPORT
2243                      ,
2244                      enum fork_context_e fork_context
2245 #endif
2246                      ,
2247                      int exit_teams) {
2248   KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_join_call);
2249   kmp_team_t *team;
2250   kmp_team_t *parent_team;
2251   kmp_info_t *master_th;
2252   kmp_root_t *root;
2253   int master_active;
2254 
2255   KA_TRACE(20, ("__kmp_join_call: enter T#%d\n", gtid));
2256 
2257   /* setup current data */
2258   master_th = __kmp_threads[gtid];
2259   root = master_th->th.th_root;
2260   team = master_th->th.th_team;
2261   parent_team = team->t.t_parent;
2262 
2263   master_th->th.th_ident = loc;
2264 
2265 #if OMPT_SUPPORT
2266   if (ompt_enabled.enabled) {
2267     master_th->th.ompt_thread_info.state = ompt_state_overhead;
2268   }
2269 #endif
2270 
2271 #if KMP_DEBUG
       // Debug-only consistency check: the thread's task team must be the one
       // the team holds for the thread's current task state.
2272   if (__kmp_tasking_mode != tskm_immediate_exec && !exit_teams) {
2273     KA_TRACE(20, ("__kmp_join_call: T#%d, old team = %p old task_team = %p, "
2274                   "th_task_team = %p\n",
2275                   __kmp_gtid_from_thread(master_th), team,
2276                   team->t.t_task_team[master_th->th.th_task_state],
2277                   master_th->th.th_task_team));
2278     KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
2279                      team->t.t_task_team[master_th->th.th_task_state]);
2280   }
2281 #endif
2282 
       // Exit 1: the team is serialized.
2283   if (team->t.t_serialized) {
2284     if (master_th->th.th_teams_microtask) {
2285       // We are in teams construct
2286       int level = team->t.t_level;
2287       int tlevel = master_th->th.th_teams_level;
2288       if (level == tlevel) {
2289         // AC: we haven't incremented it earlier at start of teams construct,
2290         //     so do it here - at the end of teams construct
2291         team->t.t_level++;
2292       } else if (level == tlevel + 1) {
2293         // AC: we are exiting parallel inside teams, need to increment
2294         // serialization in order to restore it in the next call to
2295         // __kmpc_end_serialized_parallel
2296         team->t.t_serialized++;
2297       }
2298     }
2299     __kmpc_end_serialized_parallel(loc, gtid);
2300 
2301 #if OMPT_SUPPORT
2302     if (ompt_enabled.enabled) {
2303       __kmp_join_restore_state(master_th, parent_team);
2304     }
2305 #endif
2306 
2307     return;
2308   }
2309 
       // Saved now; written back to root->r.r_active further down, before the
       // team is passed to __kmp_free_team.
2310   master_active = team->t.t_master_active;
2311 
2312   if (!exit_teams) {
2313     // AC: No barrier for internal teams at exit from teams construct.
2314     //     But there is barrier for external team (league).
2315     __kmp_internal_join(loc, gtid, team);
2316   } else {
2317     master_th->th.th_task_state =
2318         0; // AC: no tasking in teams (out of any parallel)
2319   }
2320 
2321   KMP_MB();
2322 
2323 #if OMPT_SUPPORT
       // Taken from the team being joined; handed to __kmp_join_ompt on both
       // remaining exits below.
2324   ompt_data_t *parallel_data = &(team->t.ompt_team_info.parallel_data);
2325   void *codeptr = team->t.ompt_team_info.master_return_address;
2326 #endif
2327 
2328 #if USE_ITT_BUILD
2329   if (__itt_stack_caller_create_ptr) {
2330     __kmp_itt_stack_caller_destroy(
2331         (__itt_caller)team->t
2332             .t_stack_id); // destroy the stack stitching id after join barrier
2333   }
2334 
2335   // Mark end of "parallel" region for Intel(R) VTune(TM) analyzer.
2336   if (team->t.t_active_level == 1 &&
2337       !master_th->th.th_teams_microtask) { /* not in teams construct */
2338     master_th->th.th_ident = loc;
2339     // only one notification scheme (either "submit" or "forking/joined", not
2340     // both)
2341     if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
2342         __kmp_forkjoin_frames_mode == 3)
2343       __kmp_itt_frame_submit(gtid, team->t.t_region_time,
2344                              master_th->th.th_frame_time, 0, loc,
2345                              master_th->th.th_team_nproc, 1);
2346     else if ((__itt_frame_end_v3_ptr || KMP_ITT_DEBUG) &&
2347              !__kmp_forkjoin_frames_mode && __kmp_forkjoin_frames)
2348       __kmp_itt_region_joined(gtid);
2349   } // active_level == 1
2350 #endif /* USE_ITT_BUILD */
2351 
       // Exit 2: parallel region one level below a teams construct (and not the
       // teams master microtask itself, and not leaving the teams construct).
2352   if (master_th->th.th_teams_microtask && !exit_teams &&
2353       team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
2354       team->t.t_level == master_th->th.th_teams_level + 1) {
2355     // AC: We need to leave the team structure intact at the end of parallel
2356     // inside the teams construct, so that at the next parallel same (hot) team
2357     // works, only adjust nesting levels
2358 
2359     /* Decrement our nested depth level */
2360     team->t.t_level--;
2361     team->t.t_active_level--;
2362     KMP_ATOMIC_DEC(&root->r.r_in_parallel);
2363 
2364     // Restore number of threads in the team if needed. This code relies on
2365     // the proper adjustment of th_teams_size.nth after the fork in
2366     // __kmp_teams_master on each teams master in the case that
2367     // __kmp_reserve_threads reduced it.
2368     if (master_th->th.th_team_nproc < master_th->th.th_teams_size.nth) {
2369       int old_num = master_th->th.th_team_nproc;
2370       int new_num = master_th->th.th_teams_size.nth;
2371       kmp_info_t **other_threads = team->t.t_threads;
2372       team->t.t_nproc = new_num;
           // Threads [0, old_num) only need the new team size.
2373       for (int i = 0; i < old_num; ++i) {
2374         other_threads[i]->th.th_team_nproc = new_num;
2375       }
2376       // Adjust states of non-used threads of the team
2377       for (int i = old_num; i < new_num; ++i) {
2378         // Re-initialize thread's barrier data.
2379         KMP_DEBUG_ASSERT(other_threads[i]);
2380         kmp_balign_t *balign = other_threads[i]->th.th_bar;
2381         for (int b = 0; b < bs_last_barrier; ++b) {
2382           balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
2383           KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
2384 #if USE_DEBUGGER
2385           balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
2386 #endif
2387         }
2388         if (__kmp_tasking_mode != tskm_immediate_exec) {
2389           // Synchronize thread's task state
2390           other_threads[i]->th.th_task_state = master_th->th.th_task_state;
2391         }
2392       }
2393     }
2394 
2395 #if OMPT_SUPPORT
2396     if (ompt_enabled.enabled) {
2397       __kmp_join_ompt(gtid, master_th, parent_team, parallel_data, fork_context,
2398                       codeptr);
2399     }
2400 #endif
2401 
2402     return;
2403   }
2404 
       // Exit 3: regular join.
2405   /* do cleanup and restore the parent team */
2406   master_th->th.th_info.ds.ds_tid = team->t.t_master_tid;
2407   master_th->th.th_local.this_construct = team->t.t_master_this_cons;
2408 
2409   master_th->th.th_dispatch = &parent_team->t.t_dispatch[team->t.t_master_tid];
2410 
2411   /* jc: The following lock has instructions with REL and ACQ semantics,
2412      separating the parallel user code called in this parallel region
2413      from the serial user code called after this function returns. */
2414   __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
2415 
2416   if (!master_th->th.th_teams_microtask ||
2417       team->t.t_level > master_th->th.th_teams_level) {
2418     /* Decrement our nested depth level */
2419     KMP_ATOMIC_DEC(&root->r.r_in_parallel);
2420   }
2421   KMP_DEBUG_ASSERT(root->r.r_in_parallel >= 0);
2422 
2423 #if OMPT_SUPPORT
2424   if (ompt_enabled.enabled) {
2425     ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2426     if (ompt_enabled.ompt_callback_implicit_task) {
2427       int ompt_team_size = team->t.t_nproc;
2428       ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
2429           ompt_scope_end, NULL, &(task_info->task_data), ompt_team_size,
2430           OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit); // TODO: Can this be ompt_task_initial?
2431     }
2432 
2433     task_info->frame.exit_frame = ompt_data_none;
2434     task_info->task_data = ompt_data_none;
2435   }
2436 #endif
2437 
       // NOTE(review): the T#%d argument below is the literal 0, not gtid --
       // confirm whether that is intended.
2438   KF_TRACE(10, ("__kmp_join_call1: T#%d, this_thread=%p team=%p\n", 0,
2439                 master_th, team));
2440   __kmp_pop_current_task_from_thread(master_th);
2441 
2442 #if KMP_AFFINITY_SUPPORTED
2443   // Restore master thread's partition.
2444   master_th->th.th_first_place = team->t.t_first_place;
2445   master_th->th.th_last_place = team->t.t_last_place;
2446 #endif // KMP_AFFINITY_SUPPORTED
2447   master_th->th.th_def_allocator = team->t.t_def_allocator;
2448 
2449   updateHWFPControl(team);
2450 
       // Write back the t_master_active value saved before the join.
2451   if (root->r.r_active != master_active)
2452     root->r.r_active = master_active;
2453 
2454   __kmp_free_team(root, team USE_NESTED_HOT_ARG(
2455                             master_th)); // this will free worker threads
2456 
2457   /* this race was fun to find. make sure the following is in the critical
2458      region otherwise assertions may fail occasionally since the old team may be
2459      reallocated and the hierarchy appears inconsistent. it is actually safe to
2460      run and won't cause any bugs, but will cause those assertion failures. it's
2461      only one deref&assign so might as well put this in the critical region */
2462   master_th->th.th_team = parent_team;
2463   master_th->th.th_team_nproc = parent_team->t.t_nproc;
2464   master_th->th.th_team_master = parent_team->t.t_threads[0];
2465   master_th->th.th_team_serialized = parent_team->t.t_serialized;
2466 
2467   /* restore serialized team, if need be */
2468   if (parent_team->t.t_serialized &&
2469       parent_team != master_th->th.th_serial_team &&
2470       parent_team != root->r.r_root_team) {
2471     __kmp_free_team(root,
2472                     master_th->th.th_serial_team USE_NESTED_HOT_ARG(NULL));
2473     master_th->th.th_serial_team = parent_team;
2474   }
2475 
2476   if (__kmp_tasking_mode != tskm_immediate_exec) {
2477     if (master_th->th.th_task_state_top >
2478         0) { // Restore task state from memo stack
2479       KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack);
2480       // Remember master's state if we re-use this nested hot team
2481       master_th->th.th_task_state_memo_stack[master_th->th.th_task_state_top] =
2482           master_th->th.th_task_state;
2483       --master_th->th.th_task_state_top; // pop
2484       // Now restore state at this level
2485       master_th->th.th_task_state =
2486           master_th->th
2487               .th_task_state_memo_stack[master_th->th.th_task_state_top];
2488     }
2489     // Copy the task team from the parent team to the master thread
2490     master_th->th.th_task_team =
2491         parent_team->t.t_task_team[master_th->th.th_task_state];
2492     KA_TRACE(20,
2493              ("__kmp_join_call: Master T#%d restoring task_team %p / team %p\n",
2494               __kmp_gtid_from_thread(master_th), master_th->th.th_task_team,
2495               parent_team));
2496   }
2497 
2498   // TODO: GEH - cannot do this assertion because root thread not set up as
2499   // executing
2500   // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 0 );
2501   master_th->th.th_current_task->td_flags.executing = 1;
2502 
2503   __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2504 
2505 #if OMPT_SUPPORT
2506   if (ompt_enabled.enabled) {
2507     __kmp_join_ompt(gtid, master_th, parent_team, parallel_data, fork_context,
2508                     codeptr);
2509   }
2510 #endif
2511 
2512   KMP_MB();
2513   KA_TRACE(20, ("__kmp_join_call: exit T#%d\n", gtid));
2514 }
2515 
2516 /* Check whether we should push an internal control record onto the
2517    serial team stack.  If so, do it.  */
2518 void __kmp_save_internal_controls(kmp_info_t *thread) {
2519 
2520   if (thread->th.th_team != thread->th.th_serial_team) {
2521     return;
2522   }
2523   if (thread->th.th_team->t.t_serialized > 1) {
2524     int push = 0;
2525 
2526     if (thread->th.th_team->t.t_control_stack_top == NULL) {
2527       push = 1;
2528     } else {
2529       if (thread->th.th_team->t.t_control_stack_top->serial_nesting_level !=
2530           thread->th.th_team->t.t_serialized) {
2531         push = 1;
2532       }
2533     }
2534     if (push) { /* push a record on the serial team's stack */
2535       kmp_internal_control_t *control =
2536           (kmp_internal_control_t *)__kmp_allocate(
2537               sizeof(kmp_internal_control_t));
2538 
2539       copy_icvs(control, &thread->th.th_current_task->td_icvs);
2540 
2541       control->serial_nesting_level = thread->th.th_team->t.t_serialized;
2542 
2543       control->next = thread->th.th_team->t.t_control_stack_top;
2544       thread->th.th_team->t.t_control_stack_top = control;
2545     }
2546   }
2547 }
2548 
2549 /* Changes set_nproc */
2550 void __kmp_set_num_threads(int new_nth, int gtid) {
2551   kmp_info_t *thread;
2552   kmp_root_t *root;
2553 
2554   KF_TRACE(10, ("__kmp_set_num_threads: new __kmp_nth = %d\n", new_nth));
2555   KMP_DEBUG_ASSERT(__kmp_init_serial);
2556 
2557   if (new_nth < 1)
2558     new_nth = 1;
2559   else if (new_nth > __kmp_max_nth)
2560     new_nth = __kmp_max_nth;
2561 
2562   KMP_COUNT_VALUE(OMP_set_numthreads, new_nth);
2563   thread = __kmp_threads[gtid];
2564   if (thread->th.th_current_task->td_icvs.nproc == new_nth)
2565     return; // nothing to do
2566 
2567   __kmp_save_internal_controls(thread);
2568 
2569   set__nproc(thread, new_nth);
2570 
2571   // If this omp_set_num_threads() call will cause the hot team size to be
2572   // reduced (in the absence of a num_threads clause), then reduce it now,
2573   // rather than waiting for the next parallel region.
2574   root = thread->th.th_root;
2575   if (__kmp_init_parallel && (!root->r.r_active) &&
2576       (root->r.r_hot_team->t.t_nproc > new_nth)
2577 #if KMP_NESTED_HOT_TEAMS
2578       && __kmp_hot_teams_max_level && !__kmp_hot_teams_mode
2579 #endif
2580       ) {
2581     kmp_team_t *hot_team = root->r.r_hot_team;
2582     int f;
2583 
2584     __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
2585 
2586     // Release the extra threads we don't need any more.
2587     for (f = new_nth; f < hot_team->t.t_nproc; f++) {
2588       KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL);
2589       if (__kmp_tasking_mode != tskm_immediate_exec) {
2590         // When decreasing team size, threads no longer in the team should unref
2591         // task team.
2592         hot_team->t.t_threads[f]->th.th_task_team = NULL;
2593       }
2594       __kmp_free_thread(hot_team->t.t_threads[f]);
2595       hot_team->t.t_threads[f] = NULL;
2596     }
2597     hot_team->t.t_nproc = new_nth;
2598 #if KMP_NESTED_HOT_TEAMS
2599     if (thread->th.th_hot_teams) {
2600       KMP_DEBUG_ASSERT(hot_team == thread->th.th_hot_teams[0].hot_team);
2601       thread->th.th_hot_teams[0].hot_team_nth = new_nth;
2602     }
2603 #endif
2604 
2605     __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2606 
2607     // Update the t_nproc field in the threads that are still active.
2608     for (f = 0; f < new_nth; f++) {
2609       KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL);
2610       hot_team->t.t_threads[f]->th.th_team_nproc = new_nth;
2611     }
2612     // Special flag in case omp_set_num_threads() call
2613     hot_team->t.t_size_changed = -1;
2614   }
2615 }
2616 
2617 /* Changes max_active_levels */
2618 void __kmp_set_max_active_levels(int gtid, int max_active_levels) {
2619   kmp_info_t *thread;
2620 
2621   KF_TRACE(10, ("__kmp_set_max_active_levels: new max_active_levels for thread "
2622                 "%d = (%d)\n",
2623                 gtid, max_active_levels));
2624   KMP_DEBUG_ASSERT(__kmp_init_serial);
2625 
2626   // validate max_active_levels
2627   if (max_active_levels < 0) {
2628     KMP_WARNING(ActiveLevelsNegative, max_active_levels);
2629     // We ignore this call if the user has specified a negative value.
2630     // The current setting won't be changed. The last valid setting will be
2631     // used. A warning will be issued (if warnings are allowed as controlled by
2632     // the KMP_WARNINGS env var).
2633     KF_TRACE(10, ("__kmp_set_max_active_levels: the call is ignored: new "
2634                   "max_active_levels for thread %d = (%d)\n",
2635                   gtid, max_active_levels));
2636     return;
2637   }
2638   if (max_active_levels <= KMP_MAX_ACTIVE_LEVELS_LIMIT) {
2639     // it's OK, the max_active_levels is within the valid range: [ 0;
2640     // KMP_MAX_ACTIVE_LEVELS_LIMIT ]
2641     // We allow a zero value. (implementation defined behavior)
2642   } else {
2643     KMP_WARNING(ActiveLevelsExceedLimit, max_active_levels,
2644                 KMP_MAX_ACTIVE_LEVELS_LIMIT);
2645     max_active_levels = KMP_MAX_ACTIVE_LEVELS_LIMIT;
2646     // Current upper limit is MAX_INT. (implementation defined behavior)
2647     // If the input exceeds the upper limit, we correct the input to be the
2648     // upper limit. (implementation defined behavior)
2649     // Actually, the flow should never get here until we use MAX_INT limit.
2650   }
2651   KF_TRACE(10, ("__kmp_set_max_active_levels: after validation: new "
2652                 "max_active_levels for thread %d = (%d)\n",
2653                 gtid, max_active_levels));
2654 
2655   thread = __kmp_threads[gtid];
2656 
2657   __kmp_save_internal_controls(thread);
2658 
2659   set__max_active_levels(thread, max_active_levels);
2660 }
2661 
2662 /* Gets max_active_levels */
2663 int __kmp_get_max_active_levels(int gtid) {
2664   kmp_info_t *thread;
2665 
2666   KF_TRACE(10, ("__kmp_get_max_active_levels: thread %d\n", gtid));
2667   KMP_DEBUG_ASSERT(__kmp_init_serial);
2668 
2669   thread = __kmp_threads[gtid];
2670   KMP_DEBUG_ASSERT(thread->th.th_current_task);
2671   KF_TRACE(10, ("__kmp_get_max_active_levels: thread %d, curtask=%p, "
2672                 "curtask_maxaclevel=%d\n",
2673                 gtid, thread->th.th_current_task,
2674                 thread->th.th_current_task->td_icvs.max_active_levels));
2675   return thread->th.th_current_task->td_icvs.max_active_levels;
2676 }
2677 
// Build-time checks (going by the macro name): kmp_sched_t and
// enum sched_type must each have the same size as int.
2678 KMP_BUILD_ASSERT(sizeof(kmp_sched_t) == sizeof(int));
2679 KMP_BUILD_ASSERT(sizeof(enum sched_type) == sizeof(int));
2680 
2681 /* Changes def_sched_var ICV values (run-time schedule kind and chunk) */
2682 void __kmp_set_schedule(int gtid, kmp_sched_t kind, int chunk) {
2683   kmp_info_t *thread;
2684   kmp_sched_t orig_kind;
2685   //    kmp_team_t *team;
2686 
2687   KF_TRACE(10, ("__kmp_set_schedule: new schedule for thread %d = (%d, %d)\n",
2688                 gtid, (int)kind, chunk));
2689   KMP_DEBUG_ASSERT(__kmp_init_serial);
2690 
2691   // Check if the kind parameter is valid, correct if needed.
2692   // Valid parameters should fit in one of two intervals - standard or extended:
2693   //       <lower>, <valid>, <upper_std>, <lower_ext>, <valid>, <upper>
2694   // 2008-01-25: 0,  1 - 4,       5,         100,     101 - 102, 103
2695   orig_kind = kind;
2696   kind = __kmp_sched_without_mods(kind);
2697 
2698   if (kind <= kmp_sched_lower || kind >= kmp_sched_upper ||
2699       (kind <= kmp_sched_lower_ext && kind >= kmp_sched_upper_std)) {
2700     // TODO: Hint needs attention in case we change the default schedule.
2701     __kmp_msg(kmp_ms_warning, KMP_MSG(ScheduleKindOutOfRange, kind),
2702               KMP_HNT(DefaultScheduleKindUsed, "static, no chunk"),
2703               __kmp_msg_null);
2704     kind = kmp_sched_default;
2705     chunk = 0; // ignore chunk value in case of bad kind
2706   }
2707 
2708   thread = __kmp_threads[gtid];
2709 
2710   __kmp_save_internal_controls(thread);
2711 
2712   if (kind < kmp_sched_upper_std) {
2713     if (kind == kmp_sched_static && chunk < KMP_DEFAULT_CHUNK) {
2714       // differ static chunked vs. unchunked:  chunk should be invalid to
2715       // indicate unchunked schedule (which is the default)
2716       thread->th.th_current_task->td_icvs.sched.r_sched_type = kmp_sch_static;
2717     } else {
2718       thread->th.th_current_task->td_icvs.sched.r_sched_type =
2719           __kmp_sch_map[kind - kmp_sched_lower - 1];
2720     }
2721   } else {
2722     //    __kmp_sch_map[ kind - kmp_sched_lower_ext + kmp_sched_upper_std -
2723     //    kmp_sched_lower - 2 ];
2724     thread->th.th_current_task->td_icvs.sched.r_sched_type =
2725         __kmp_sch_map[kind - kmp_sched_lower_ext + kmp_sched_upper_std -
2726                       kmp_sched_lower - 2];
2727   }
2728   __kmp_sched_apply_mods_intkind(
2729       orig_kind, &(thread->th.th_current_task->td_icvs.sched.r_sched_type));
2730   if (kind == kmp_sched_auto || chunk < 1) {
2731     // ignore parameter chunk for schedule auto
2732     thread->th.th_current_task->td_icvs.sched.chunk = KMP_DEFAULT_CHUNK;
2733   } else {
2734     thread->th.th_current_task->td_icvs.sched.chunk = chunk;
2735   }
2736 }
2737 
2738 /* Gets def_sched_var ICV values */
2739 void __kmp_get_schedule(int gtid, kmp_sched_t *kind, int *chunk) {
2740   kmp_info_t *thread;
2741   enum sched_type th_type;
2742 
2743   KF_TRACE(10, ("__kmp_get_schedule: thread %d\n", gtid));
2744   KMP_DEBUG_ASSERT(__kmp_init_serial);
2745 
2746   thread = __kmp_threads[gtid];
2747 
2748   th_type = thread->th.th_current_task->td_icvs.sched.r_sched_type;
2749   switch (SCHEDULE_WITHOUT_MODIFIERS(th_type)) {
2750   case kmp_sch_static:
2751   case kmp_sch_static_greedy:
2752   case kmp_sch_static_balanced:
2753     *kind = kmp_sched_static;
2754     __kmp_sched_apply_mods_stdkind(kind, th_type);
2755     *chunk = 0; // chunk was not set, try to show this fact via zero value
2756     return;
2757   case kmp_sch_static_chunked:
2758     *kind = kmp_sched_static;
2759     break;
2760   case kmp_sch_dynamic_chunked:
2761     *kind = kmp_sched_dynamic;
2762     break;
2763   case kmp_sch_guided_chunked:
2764   case kmp_sch_guided_iterative_chunked:
2765   case kmp_sch_guided_analytical_chunked:
2766     *kind = kmp_sched_guided;
2767     break;
2768   case kmp_sch_auto:
2769     *kind = kmp_sched_auto;
2770     break;
2771   case kmp_sch_trapezoidal:
2772     *kind = kmp_sched_trapezoidal;
2773     break;
2774 #if KMP_STATIC_STEAL_ENABLED
2775   case kmp_sch_static_steal:
2776     *kind = kmp_sched_static_steal;
2777     break;
2778 #endif
2779   default:
2780     KMP_FATAL(UnknownSchedulingType, th_type);
2781   }
2782 
2783   __kmp_sched_apply_mods_stdkind(kind, th_type);
2784   *chunk = thread->th.th_current_task->td_icvs.sched.chunk;
2785 }
2786 
2787 int __kmp_get_ancestor_thread_num(int gtid, int level) {
2788 
2789   int ii, dd;
2790   kmp_team_t *team;
2791   kmp_info_t *thr;
2792 
2793   KF_TRACE(10, ("__kmp_get_ancestor_thread_num: thread %d %d\n", gtid, level));
2794   KMP_DEBUG_ASSERT(__kmp_init_serial);
2795 
2796   // validate level
2797   if (level == 0)
2798     return 0;
2799   if (level < 0)
2800     return -1;
2801   thr = __kmp_threads[gtid];
2802   team = thr->th.th_team;
2803   ii = team->t.t_level;
2804   if (level > ii)
2805     return -1;
2806 
2807   if (thr->th.th_teams_microtask) {
2808     // AC: we are in teams region where multiple nested teams have same level
2809     int tlevel = thr->th.th_teams_level; // the level of the teams construct
2810     if (level <=
2811         tlevel) { // otherwise usual algorithm works (will not touch the teams)
2812       KMP_DEBUG_ASSERT(ii >= tlevel);
2813       // AC: As we need to pass by the teams league, we need to artificially
2814       // increase ii
2815       if (ii == tlevel) {
2816         ii += 2; // three teams have same level
2817       } else {
2818         ii++; // two teams have same level
2819       }
2820     }
2821   }
2822 
2823   if (ii == level)
2824     return __kmp_tid_from_gtid(gtid);
2825 
2826   dd = team->t.t_serialized;
2827   level++;
2828   while (ii > level) {
2829     for (dd = team->t.t_serialized; (dd > 0) && (ii > level); dd--, ii--) {
2830     }
2831     if ((team->t.t_serialized) && (!dd)) {
2832       team = team->t.t_parent;
2833       continue;
2834     }
2835     if (ii > level) {
2836       team = team->t.t_parent;
2837       dd = team->t.t_serialized;
2838       ii--;
2839     }
2840   }
2841 
2842   return (dd > 1) ? (0) : (team->t.t_master_tid);
2843 }
2844 
2845 int __kmp_get_team_size(int gtid, int level) {
2846 
2847   int ii, dd;
2848   kmp_team_t *team;
2849   kmp_info_t *thr;
2850 
2851   KF_TRACE(10, ("__kmp_get_team_size: thread %d %d\n", gtid, level));
2852   KMP_DEBUG_ASSERT(__kmp_init_serial);
2853 
2854   // validate level
2855   if (level == 0)
2856     return 1;
2857   if (level < 0)
2858     return -1;
2859   thr = __kmp_threads[gtid];
2860   team = thr->th.th_team;
2861   ii = team->t.t_level;
2862   if (level > ii)
2863     return -1;
2864 
2865   if (thr->th.th_teams_microtask) {
2866     // AC: we are in teams region where multiple nested teams have same level
2867     int tlevel = thr->th.th_teams_level; // the level of the teams construct
2868     if (level <=
2869         tlevel) { // otherwise usual algorithm works (will not touch the teams)
2870       KMP_DEBUG_ASSERT(ii >= tlevel);
2871       // AC: As we need to pass by the teams league, we need to artificially
2872       // increase ii
2873       if (ii == tlevel) {
2874         ii += 2; // three teams have same level
2875       } else {
2876         ii++; // two teams have same level
2877       }
2878     }
2879   }
2880 
2881   while (ii > level) {
2882     for (dd = team->t.t_serialized; (dd > 0) && (ii > level); dd--, ii--) {
2883     }
2884     if (team->t.t_serialized && (!dd)) {
2885       team = team->t.t_parent;
2886       continue;
2887     }
2888     if (ii > level) {
2889       team = team->t.t_parent;
2890       ii--;
2891     }
2892   }
2893 
2894   return team->t.t_nproc;
2895 }
2896 
2897 kmp_r_sched_t __kmp_get_schedule_global() {
2898   // This routine created because pairs (__kmp_sched, __kmp_chunk) and
2899   // (__kmp_static, __kmp_guided) may be changed by kmp_set_defaults
2900   // independently. So one can get the updated schedule here.
2901 
2902   kmp_r_sched_t r_sched;
2903 
2904   // create schedule from 4 globals: __kmp_sched, __kmp_chunk, __kmp_static,
2905   // __kmp_guided. __kmp_sched should keep original value, so that user can set
2906   // KMP_SCHEDULE multiple times, and thus have different run-time schedules in
2907   // different roots (even in OMP 2.5)
2908   enum sched_type s = SCHEDULE_WITHOUT_MODIFIERS(__kmp_sched);
2909   enum sched_type sched_modifiers = SCHEDULE_GET_MODIFIERS(__kmp_sched);
2910   if (s == kmp_sch_static) {
2911     // replace STATIC with more detailed schedule (balanced or greedy)
2912     r_sched.r_sched_type = __kmp_static;
2913   } else if (s == kmp_sch_guided_chunked) {
2914     // replace GUIDED with more detailed schedule (iterative or analytical)
2915     r_sched.r_sched_type = __kmp_guided;
2916   } else { // (STATIC_CHUNKED), or (DYNAMIC_CHUNKED), or other
2917     r_sched.r_sched_type = __kmp_sched;
2918   }
2919   SCHEDULE_SET_MODIFIERS(r_sched.r_sched_type, sched_modifiers);
2920 
2921   if (__kmp_chunk < KMP_DEFAULT_CHUNK) {
2922     // __kmp_chunk may be wrong here (if it was not ever set)
2923     r_sched.chunk = KMP_DEFAULT_CHUNK;
2924   } else {
2925     r_sched.chunk = __kmp_chunk;
2926   }
2927 
2928   return r_sched;
2929 }
2930 
2931 /* Allocate (realloc == FALSE) * or reallocate (realloc == TRUE)
2932    at least argc number of *t_argv entries for the requested team. */
2933 static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team, int realloc) {
2934 
2935   KMP_DEBUG_ASSERT(team);
2936   if (!realloc || argc > team->t.t_max_argc) {
2937 
2938     KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: needed entries=%d, "
2939                    "current entries=%d\n",
2940                    team->t.t_id, argc, (realloc) ? team->t.t_max_argc : 0));
2941     /* if previously allocated heap space for args, free them */
2942     if (realloc && team->t.t_argv != &team->t.t_inline_argv[0])
2943       __kmp_free((void *)team->t.t_argv);
2944 
2945     if (argc <= KMP_INLINE_ARGV_ENTRIES) {
2946       /* use unused space in the cache line for arguments */
2947       team->t.t_max_argc = KMP_INLINE_ARGV_ENTRIES;
2948       KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: inline allocate %d "
2949                      "argv entries\n",
2950                      team->t.t_id, team->t.t_max_argc));
2951       team->t.t_argv = &team->t.t_inline_argv[0];
2952       if (__kmp_storage_map) {
2953         __kmp_print_storage_map_gtid(
2954             -1, &team->t.t_inline_argv[0],
2955             &team->t.t_inline_argv[KMP_INLINE_ARGV_ENTRIES],
2956             (sizeof(void *) * KMP_INLINE_ARGV_ENTRIES), "team_%d.t_inline_argv",
2957             team->t.t_id);
2958       }
2959     } else {
2960       /* allocate space for arguments in the heap */
2961       team->t.t_max_argc = (argc <= (KMP_MIN_MALLOC_ARGV_ENTRIES >> 1))
2962                                ? KMP_MIN_MALLOC_ARGV_ENTRIES
2963                                : 2 * argc;
2964       KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: dynamic allocate %d "
2965                      "argv entries\n",
2966                      team->t.t_id, team->t.t_max_argc));
2967       team->t.t_argv =
2968           (void **)__kmp_page_allocate(sizeof(void *) * team->t.t_max_argc);
2969       if (__kmp_storage_map) {
2970         __kmp_print_storage_map_gtid(-1, &team->t.t_argv[0],
2971                                      &team->t.t_argv[team->t.t_max_argc],
2972                                      sizeof(void *) * team->t.t_max_argc,
2973                                      "team_%d.t_argv", team->t.t_id);
2974       }
2975     }
2976   }
2977 }
2978 
2979 static void __kmp_allocate_team_arrays(kmp_team_t *team, int max_nth) {
2980   int i;
2981   int num_disp_buff = max_nth > 1 ? __kmp_dispatch_num_buffers : 2;
2982   team->t.t_threads =
2983       (kmp_info_t **)__kmp_allocate(sizeof(kmp_info_t *) * max_nth);
2984   team->t.t_disp_buffer = (dispatch_shared_info_t *)__kmp_allocate(
2985       sizeof(dispatch_shared_info_t) * num_disp_buff);
2986   team->t.t_dispatch =
2987       (kmp_disp_t *)__kmp_allocate(sizeof(kmp_disp_t) * max_nth);
2988   team->t.t_implicit_task_taskdata =
2989       (kmp_taskdata_t *)__kmp_allocate(sizeof(kmp_taskdata_t) * max_nth);
2990   team->t.t_max_nproc = max_nth;
2991 
2992   /* setup dispatch buffers */
2993   for (i = 0; i < num_disp_buff; ++i) {
2994     team->t.t_disp_buffer[i].buffer_index = i;
2995     team->t.t_disp_buffer[i].doacross_buf_idx = i;
2996   }
2997 }
2998 
2999 static void __kmp_free_team_arrays(kmp_team_t *team) {
3000   /* Note: this does not free the threads in t_threads (__kmp_free_threads) */
3001   int i;
3002   for (i = 0; i < team->t.t_max_nproc; ++i) {
3003     if (team->t.t_dispatch[i].th_disp_buffer != NULL) {
3004       __kmp_free(team->t.t_dispatch[i].th_disp_buffer);
3005       team->t.t_dispatch[i].th_disp_buffer = NULL;
3006     }
3007   }
3008 #if KMP_USE_HIER_SCHED
3009   __kmp_dispatch_free_hierarchies(team);
3010 #endif
3011   __kmp_free(team->t.t_threads);
3012   __kmp_free(team->t.t_disp_buffer);
3013   __kmp_free(team->t.t_dispatch);
3014   __kmp_free(team->t.t_implicit_task_taskdata);
3015   team->t.t_threads = NULL;
3016   team->t.t_disp_buffer = NULL;
3017   team->t.t_dispatch = NULL;
3018   team->t.t_implicit_task_taskdata = 0;
3019 }
3020 
3021 static void __kmp_reallocate_team_arrays(kmp_team_t *team, int max_nth) {
3022   kmp_info_t **oldThreads = team->t.t_threads;
3023 
3024   __kmp_free(team->t.t_disp_buffer);
3025   __kmp_free(team->t.t_dispatch);
3026   __kmp_free(team->t.t_implicit_task_taskdata);
3027   __kmp_allocate_team_arrays(team, max_nth);
3028 
3029   KMP_MEMCPY(team->t.t_threads, oldThreads,
3030              team->t.t_nproc * sizeof(kmp_info_t *));
3031 
3032   __kmp_free(oldThreads);
3033 }
3034 
// Build and return a kmp_internal_control_t filled from the current global
// settings. The initializer is positional, so the order of the values below
// must match the field order of kmp_internal_control_t (declared elsewhere);
// note that the bt_intervals entry is only present when KMP_USE_MONITOR is
// set. The schedule entry comes from __kmp_get_schedule_global().
3035 static kmp_internal_control_t __kmp_get_global_icvs(void) {
3036 
3037   kmp_r_sched_t r_sched =
3038       __kmp_get_schedule_global(); // get current state of scheduling globals
3039 
3040   KMP_DEBUG_ASSERT(__kmp_nested_proc_bind.used > 0);
3041 
3042   kmp_internal_control_t g_icvs = {
3043     0, // int serial_nesting_level; //corresponds to value of th_team_serialized
3044     (kmp_int8)__kmp_global.g.g_dynamic, // internal control for dynamic
3045     // adjustment of threads (per thread)
3046     (kmp_int8)__kmp_env_blocktime, // int bt_set; //internal control for
3047     // whether blocktime is explicitly set
3048     __kmp_dflt_blocktime, // int blocktime; //internal control for blocktime
3049 #if KMP_USE_MONITOR
3050     __kmp_bt_intervals, // int bt_intervals; //internal control for blocktime
3051 // intervals
3052 #endif
3053     __kmp_dflt_team_nth, // int nproc; //internal control for # of threads for
3054     // next parallel region (per thread)
3055     // (use a max ub on value if __kmp_parallel_initialize not called yet)
3056     __kmp_cg_max_nth, // int thread_limit;
3057     __kmp_dflt_max_active_levels, // int max_active_levels; //internal control
3058     // for max_active_levels
3059     r_sched, // kmp_r_sched_t sched; //internal control for runtime schedule
3060     // {sched,chunk} pair
3061     __kmp_nested_proc_bind.bind_types[0], // first entry of the nested
          // proc-bind list (asserted non-empty above); presumably the proc-bind
          // control -- confirm against the struct declaration
3062     __kmp_default_device, // presumably the default-device control -- confirm
3063     NULL // struct kmp_internal_control *next;
3064   };
3065 
3066   return g_icvs;
3067 }
3068 
3069 static kmp_internal_control_t __kmp_get_x_global_icvs(const kmp_team_t *team) {
3070 
3071   kmp_internal_control_t gx_icvs;
3072   gx_icvs.serial_nesting_level =
3073       0; // probably =team->t.t_serial like in save_inter_controls
3074   copy_icvs(&gx_icvs, &team->t.t_threads[0]->th.th_current_task->td_icvs);
3075   gx_icvs.next = NULL;
3076 
3077   return gx_icvs;
3078 }
3079 
3080 static void __kmp_initialize_root(kmp_root_t *root) {
3081   int f;
3082   kmp_team_t *root_team;
3083   kmp_team_t *hot_team;
3084   int hot_team_max_nth;
3085   kmp_r_sched_t r_sched =
3086       __kmp_get_schedule_global(); // get current state of scheduling globals
3087   kmp_internal_control_t r_icvs = __kmp_get_global_icvs();
3088   KMP_DEBUG_ASSERT(root);
3089   KMP_ASSERT(!root->r.r_begin);
3090 
3091   /* setup the root state structure */
3092   __kmp_init_lock(&root->r.r_begin_lock);
3093   root->r.r_begin = FALSE;
3094   root->r.r_active = FALSE;
3095   root->r.r_in_parallel = 0;
3096   root->r.r_blocktime = __kmp_dflt_blocktime;
3097 
3098   /* setup the root team for this task */
3099   /* allocate the root team structure */
3100   KF_TRACE(10, ("__kmp_initialize_root: before root_team\n"));
3101 
3102   root_team =
3103       __kmp_allocate_team(root,
3104                           1, // new_nproc
3105                           1, // max_nproc
3106 #if OMPT_SUPPORT
3107                           ompt_data_none, // root parallel id
3108 #endif
3109                           __kmp_nested_proc_bind.bind_types[0], &r_icvs,
3110                           0 // argc
3111                           USE_NESTED_HOT_ARG(NULL) // master thread is unknown
3112                           );
3113 #if USE_DEBUGGER
3114   // Non-NULL value should be assigned to make the debugger display the root
3115   // team.
3116   TCW_SYNC_PTR(root_team->t.t_pkfn, (microtask_t)(~0));
3117 #endif
3118 
3119   KF_TRACE(10, ("__kmp_initialize_root: after root_team = %p\n", root_team));
3120 
3121   root->r.r_root_team = root_team;
3122   root_team->t.t_control_stack_top = NULL;
3123 
3124   /* initialize root team */
3125   root_team->t.t_threads[0] = NULL;
3126   root_team->t.t_nproc = 1;
3127   root_team->t.t_serialized = 1;
3128   // TODO???: root_team->t.t_max_active_levels = __kmp_dflt_max_active_levels;
3129   root_team->t.t_sched.sched = r_sched.sched;
3130   KA_TRACE(
3131       20,
3132       ("__kmp_initialize_root: init root team %d arrived: join=%u, plain=%u\n",
3133        root_team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
3134 
3135   /* setup the  hot team for this task */
3136   /* allocate the hot team structure */
3137   KF_TRACE(10, ("__kmp_initialize_root: before hot_team\n"));
3138 
3139   hot_team =
3140       __kmp_allocate_team(root,
3141                           1, // new_nproc
3142                           __kmp_dflt_team_nth_ub * 2, // max_nproc
3143 #if OMPT_SUPPORT
3144                           ompt_data_none, // root parallel id
3145 #endif
3146                           __kmp_nested_proc_bind.bind_types[0], &r_icvs,
3147                           0 // argc
3148                           USE_NESTED_HOT_ARG(NULL) // master thread is unknown
3149                           );
3150   KF_TRACE(10, ("__kmp_initialize_root: after hot_team = %p\n", hot_team));
3151 
3152   root->r.r_hot_team = hot_team;
3153   root_team->t.t_control_stack_top = NULL;
3154 
3155   /* first-time initialization */
3156   hot_team->t.t_parent = root_team;
3157 
3158   /* initialize hot team */
3159   hot_team_max_nth = hot_team->t.t_max_nproc;
3160   for (f = 0; f < hot_team_max_nth; ++f) {
3161     hot_team->t.t_threads[f] = NULL;
3162   }
3163   hot_team->t.t_nproc = 1;
3164   // TODO???: hot_team->t.t_max_active_levels = __kmp_dflt_max_active_levels;
3165   hot_team->t.t_sched.sched = r_sched.sched;
3166   hot_team->t.t_size_changed = 0;
3167 }
3168 
3169 #ifdef KMP_DEBUG
3170 
3171 typedef struct kmp_team_list_item {
3172   kmp_team_p const *entry;
3173   struct kmp_team_list_item *next;
3174 } kmp_team_list_item_t;
3175 typedef kmp_team_list_item_t *kmp_team_list_t;
3176 
3177 static void __kmp_print_structure_team_accum( // Add team to list of teams.
3178     kmp_team_list_t list, // List of teams.
3179     kmp_team_p const *team // Team to add.
3180     ) {
3181 
3182   // List must terminate with item where both entry and next are NULL.
3183   // Team is added to the list only once.
3184   // List is sorted in ascending order by team id.
3185   // Team id is *not* a key.
3186 
3187   kmp_team_list_t l;
3188 
3189   KMP_DEBUG_ASSERT(list != NULL);
3190   if (team == NULL) {
3191     return;
3192   }
3193 
3194   __kmp_print_structure_team_accum(list, team->t.t_parent);
3195   __kmp_print_structure_team_accum(list, team->t.t_next_pool);
3196 
3197   // Search list for the team.
3198   l = list;
3199   while (l->next != NULL && l->entry != team) {
3200     l = l->next;
3201   }
3202   if (l->next != NULL) {
3203     return; // Team has been added before, exit.
3204   }
3205 
3206   // Team is not found. Search list again for insertion point.
3207   l = list;
3208   while (l->next != NULL && l->entry->t.t_id <= team->t.t_id) {
3209     l = l->next;
3210   }
3211 
3212   // Insert team.
3213   {
3214     kmp_team_list_item_t *item = (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC(
3215         sizeof(kmp_team_list_item_t));
3216     *item = *l;
3217     l->entry = team;
3218     l->next = item;
3219   }
3220 }
3221 
3222 static void __kmp_print_structure_team(char const *title, kmp_team_p const *team
3223 
3224                                        ) {
3225   __kmp_printf("%s", title);
3226   if (team != NULL) {
3227     __kmp_printf("%2x %p\n", team->t.t_id, team);
3228   } else {
3229     __kmp_printf(" - (nil)\n");
3230   }
3231 }
3232 
3233 static void __kmp_print_structure_thread(char const *title,
3234                                          kmp_info_p const *thread) {
3235   __kmp_printf("%s", title);
3236   if (thread != NULL) {
3237     __kmp_printf("%2d %p\n", thread->th.th_info.ds.ds_gtid, thread);
3238   } else {
3239     __kmp_printf(" - (nil)\n");
3240   }
3241 }
3242 
3243 void __kmp_print_structure(void) {
3244 
3245   kmp_team_list_t list;
3246 
3247   // Initialize list of teams.
3248   list =
3249       (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC(sizeof(kmp_team_list_item_t));
3250   list->entry = NULL;
3251   list->next = NULL;
3252 
3253   __kmp_printf("\n------------------------------\nGlobal Thread "
3254                "Table\n------------------------------\n");
3255   {
3256     int gtid;
3257     for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3258       __kmp_printf("%2d", gtid);
3259       if (__kmp_threads != NULL) {
3260         __kmp_printf(" %p", __kmp_threads[gtid]);
3261       }
3262       if (__kmp_root != NULL) {
3263         __kmp_printf(" %p", __kmp_root[gtid]);
3264       }
3265       __kmp_printf("\n");
3266     }
3267   }
3268 
3269   // Print out __kmp_threads array.
3270   __kmp_printf("\n------------------------------\nThreads\n--------------------"
3271                "----------\n");
3272   if (__kmp_threads != NULL) {
3273     int gtid;
3274     for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3275       kmp_info_t const *thread = __kmp_threads[gtid];
3276       if (thread != NULL) {
3277         __kmp_printf("GTID %2d %p:\n", gtid, thread);
3278         __kmp_printf("    Our Root:        %p\n", thread->th.th_root);
3279         __kmp_print_structure_team("    Our Team:     ", thread->th.th_team);
3280         __kmp_print_structure_team("    Serial Team:  ",
3281                                    thread->th.th_serial_team);
3282         __kmp_printf("    Threads:      %2d\n", thread->th.th_team_nproc);
3283         __kmp_print_structure_thread("    Master:       ",
3284                                      thread->th.th_team_master);
3285         __kmp_printf("    Serialized?:  %2d\n", thread->th.th_team_serialized);
3286         __kmp_printf("    Set NProc:    %2d\n", thread->th.th_set_nproc);
3287         __kmp_printf("    Set Proc Bind: %2d\n", thread->th.th_set_proc_bind);
3288         __kmp_print_structure_thread("    Next in pool: ",
3289                                      thread->th.th_next_pool);
3290         __kmp_printf("\n");
3291         __kmp_print_structure_team_accum(list, thread->th.th_team);
3292         __kmp_print_structure_team_accum(list, thread->th.th_serial_team);
3293       }
3294     }
3295   } else {
3296     __kmp_printf("Threads array is not allocated.\n");
3297   }
3298 
3299   // Print out __kmp_root array.
3300   __kmp_printf("\n------------------------------\nUbers\n----------------------"
3301                "--------\n");
3302   if (__kmp_root != NULL) {
3303     int gtid;
3304     for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3305       kmp_root_t const *root = __kmp_root[gtid];
3306       if (root != NULL) {
3307         __kmp_printf("GTID %2d %p:\n", gtid, root);
3308         __kmp_print_structure_team("    Root Team:    ", root->r.r_root_team);
3309         __kmp_print_structure_team("    Hot Team:     ", root->r.r_hot_team);
3310         __kmp_print_structure_thread("    Uber Thread:  ",
3311                                      root->r.r_uber_thread);
3312         __kmp_printf("    Active?:      %2d\n", root->r.r_active);
3313         __kmp_printf("    In Parallel:  %2d\n",
3314                      KMP_ATOMIC_LD_RLX(&root->r.r_in_parallel));
3315         __kmp_printf("\n");
3316         __kmp_print_structure_team_accum(list, root->r.r_root_team);
3317         __kmp_print_structure_team_accum(list, root->r.r_hot_team);
3318       }
3319     }
3320   } else {
3321     __kmp_printf("Ubers array is not allocated.\n");
3322   }
3323 
3324   __kmp_printf("\n------------------------------\nTeams\n----------------------"
3325                "--------\n");
3326   while (list->next != NULL) {
3327     kmp_team_p const *team = list->entry;
3328     int i;
3329     __kmp_printf("Team %2x %p:\n", team->t.t_id, team);
3330     __kmp_print_structure_team("    Parent Team:      ", team->t.t_parent);
3331     __kmp_printf("    Master TID:       %2d\n", team->t.t_master_tid);
3332     __kmp_printf("    Max threads:      %2d\n", team->t.t_max_nproc);
3333     __kmp_printf("    Levels of serial: %2d\n", team->t.t_serialized);
3334     __kmp_printf("    Number threads:   %2d\n", team->t.t_nproc);
3335     for (i = 0; i < team->t.t_nproc; ++i) {
3336       __kmp_printf("    Thread %2d:      ", i);
3337       __kmp_print_structure_thread("", team->t.t_threads[i]);
3338     }
3339     __kmp_print_structure_team("    Next in pool:     ", team->t.t_next_pool);
3340     __kmp_printf("\n");
3341     list = list->next;
3342   }
3343 
3344   // Print out __kmp_thread_pool and __kmp_team_pool.
3345   __kmp_printf("\n------------------------------\nPools\n----------------------"
3346                "--------\n");
3347   __kmp_print_structure_thread("Thread pool:          ",
3348                                CCAST(kmp_info_t *, __kmp_thread_pool));
3349   __kmp_print_structure_team("Team pool:            ",
3350                              CCAST(kmp_team_t *, __kmp_team_pool));
3351   __kmp_printf("\n");
3352 
3353   // Free team list.
3354   while (list != NULL) {
3355     kmp_team_list_item_t *item = list;
3356     list = list->next;
3357     KMP_INTERNAL_FREE(item);
3358   }
3359 }
3360 
3361 #endif
3362 
3363 //---------------------------------------------------------------------------
3364 //  Stuff for per-thread fast random number generator
3365 //  Table of primes
// Multiplier candidates for the per-thread linear congruential generator.
// The table holds 64 entries; __kmp_init_random() indexes it with the
// thread's tid modulo the table length and stores the chosen value in th_a,
// so the entry count may change freely without touching the users.
3366 static const unsigned __kmp_primes[] = {
3367     0x9e3779b1, 0xffe6cc59, 0x2109f6dd, 0x43977ab5, 0xba5703f5, 0xb495a877,
3368     0xe1626741, 0x79695e6b, 0xbc98c09f, 0xd5bee2b3, 0x287488f9, 0x3af18231,
3369     0x9677cd4d, 0xbe3a6929, 0xadc6a877, 0xdcf0674b, 0xbe4d6fe9, 0x5f15e201,
3370     0x99afc3fd, 0xf3f16801, 0xe222cfff, 0x24ba5fdb, 0x0620452d, 0x79f149e3,
3371     0xc8b93f49, 0x972702cd, 0xb07dd827, 0x6c97d5ed, 0x085a3d61, 0x46eb5ea7,
3372     0x3d9910ed, 0x2e687b5b, 0x29609227, 0x6eb081f1, 0x0954c4e1, 0x9d114db9,
3373     0x542acfa9, 0xb3e6bd7b, 0x0742d917, 0xe9f3ffa7, 0x54581edb, 0xf2480f45,
3374     0x0bb9288f, 0xef1affc7, 0x85fa0ca7, 0x3ccc14db, 0xe6baf34b, 0x343377f7,
3375     0x5ca19031, 0xe6d9293b, 0xf0a9f391, 0x5d2e980b, 0xfc411073, 0xc3749363,
3376     0xb892d829, 0x3549366b, 0x629750ad, 0xb98294e5, 0x892d9483, 0xc235baf3,
3377     0x3d2402a3, 0x6bdef3c9, 0xbec333cd, 0x40c9520f};
3378 
3379 //---------------------------------------------------------------------------
3380 //  __kmp_get_random: Get a random number using a linear congruential method.
3381 unsigned short __kmp_get_random(kmp_info_t *thread) {
3382   unsigned x = thread->th.th_x;
3383   unsigned short r = x >> 16;
3384 
3385   thread->th.th_x = x * thread->th.th_a + 1;
3386 
3387   KA_TRACE(30, ("__kmp_get_random: THREAD: %d, RETURN: %u\n",
3388                 thread->th.th_info.ds.ds_tid, r));
3389 
3390   return r;
3391 }
3392 //--------------------------------------------------------
3393 // __kmp_init_random: Initialize a random number generator
3394 void __kmp_init_random(kmp_info_t *thread) {
3395   unsigned seed = thread->th.th_info.ds.ds_tid;
3396 
3397   thread->th.th_a =
3398       __kmp_primes[seed % (sizeof(__kmp_primes) / sizeof(__kmp_primes[0]))];
3399   thread->th.th_x = (seed + 1) * thread->th.th_a + 1;
3400   KA_TRACE(30,
3401            ("__kmp_init_random: THREAD: %u; A: %u\n", seed, thread->th.th_a));
3402 }
3403 
3404 #if KMP_OS_WINDOWS
3405 /* reclaim array entries for root threads that are already dead, returns number
3406  * reclaimed */
3407 static int __kmp_reclaim_dead_roots(void) {
3408   int i, r = 0;
3409 
3410   for (i = 0; i < __kmp_threads_capacity; ++i) {
3411     if (KMP_UBER_GTID(i) &&
3412         !__kmp_still_running((kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[i])) &&
3413         !__kmp_root[i]
3414              ->r.r_active) { // AC: reclaim only roots died in non-active state
3415       r += __kmp_unregister_root_other_thread(i);
3416     }
3417   }
3418   return r;
3419 }
3420 #endif
3421 
3422 /* This function attempts to create free entries in __kmp_threads and
3423    __kmp_root, and returns the number of free entries generated.
3424 
3425    For Windows* OS static library, the first mechanism used is to reclaim array
3426    entries for root threads that are already dead.
3427 
3428    On all platforms, expansion is attempted on the arrays __kmp_threads and
3429    __kmp_root, with appropriate update to __kmp_threads_capacity. Array
3430    capacity is increased by doubling with clipping to __kmp_sys_max_nth; if
3431    the threadprivate cache array has been created, it is resized to match.
3432    Synchronization with __kmpc_threadprivate_cached is done using __kmp_tp_cached_lock.
3433 
3434    After any dead root reclamation, if the clipping value allows array expansion
3435    to result in the generation of a total of nNeed free slots, the function does
3436    that expansion. If not, nothing is done beyond the possible initial root
3437    thread reclamation.
3438 
3439    If any argument is negative, the behavior is undefined. */
// Try to make at least nNeed more slots available in __kmp_threads and
// __kmp_root.
//
// Returns the number of slots gained: entries reclaimed from dead roots
// (Windows static library only) plus the capacity added by growing the
// arrays. When the arrays cannot be grown by nNeed without exceeding
// __kmp_sys_max_nth, nothing is grown and only the reclaimed count is
// returned (0 on other configurations).
3440 static int __kmp_expand_threads(int nNeed) {
3441   int added = 0;
3442   int minimumRequiredCapacity;
3443   int newCapacity;
3444   kmp_info_t **newThreads;
3445   kmp_root_t **newRoot;
3446 
3447 // All calls to __kmp_expand_threads should be under __kmp_forkjoin_lock, so
3448 // resizing __kmp_threads does not need additional protection if foreign
3449 // threads are present
3450 
3451 #if KMP_OS_WINDOWS && !KMP_DYNAMIC_LIB
3452   /* only for Windows static library */
3453   /* reclaim array entries for root threads that are already dead */
3454   added = __kmp_reclaim_dead_roots();
3455 
3456   if (nNeed) {
3457     nNeed -= added;
3458     if (nNeed < 0)
3459       nNeed = 0;
3460   }
3461 #endif
  // Nothing (more) is required: report whatever reclamation produced.
3462   if (nNeed <= 0)
3463     return added;
3464 
3465   // Note that __kmp_threads_capacity is not bounded by __kmp_max_nth. If
3466   // __kmp_max_nth is set to some value less than __kmp_sys_max_nth by the
3467   // user via KMP_DEVICE_THREAD_LIMIT, then __kmp_threads_capacity may become
3468   // > __kmp_max_nth in one of two ways:
3469   //
3470   // 1) The initialization thread (gtid = 0) exits.  __kmp_threads[0]
3471   //    may not be reused by another thread, so we may need to increase
3472   //    __kmp_threads_capacity to __kmp_max_nth + 1.
3473   //
3474   // 2) New foreign root(s) are encountered.  We always register new foreign
3475   //    roots. This may cause a smaller # of threads to be allocated at
3476   //    subsequent parallel regions, but the worker threads hang around (and
3477   //    eventually go to sleep) and need slots in the __kmp_threads[] array.
3478   //
3479   // Anyway, that is the reason for moving the check to see if
3480   // __kmp_max_nth was exceeded into __kmp_reserve_threads()
3481   // instead of having it performed here. -BB
3482 
3483   KMP_DEBUG_ASSERT(__kmp_sys_max_nth >= __kmp_threads_capacity);
3484 
3485   /* compute expansion headroom to check if we can expand */
3486   if (__kmp_sys_max_nth - __kmp_threads_capacity < nNeed) {
3487     /* possible expansion too small -- give up */
3488     return added;
3489   }
3490   minimumRequiredCapacity = __kmp_threads_capacity + nNeed;
3491 
  // Double the capacity until the requirement is met. Once doubling would
  // exceed __kmp_sys_max_nth, jump straight to that limit; the headroom check
  // above guarantees the limit itself is large enough, so the loop ends.
3492   newCapacity = __kmp_threads_capacity;
3493   do {
3494     newCapacity = newCapacity <= (__kmp_sys_max_nth >> 1) ? (newCapacity << 1)
3495                                                           : __kmp_sys_max_nth;
3496   } while (newCapacity < minimumRequiredCapacity);
  // Both tables share a single allocation: the kmp_info_t* slots come first
  // and the kmp_root_t* slots start right after them.
3497   newThreads = (kmp_info_t **)__kmp_allocate(
3498       (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * newCapacity + CACHE_LINE);
3499   newRoot =
3500       (kmp_root_t **)((char *)newThreads + sizeof(kmp_info_t *) * newCapacity);
  // Only the first __kmp_threads_capacity entries of each table are copied;
  // NOTE(review): the new tail entries presumably rely on __kmp_allocate
  // returning zeroed memory — confirm.
3501   KMP_MEMCPY(newThreads, __kmp_threads,
3502              __kmp_threads_capacity * sizeof(kmp_info_t *));
3503   KMP_MEMCPY(newRoot, __kmp_root,
3504              __kmp_threads_capacity * sizeof(kmp_root_t *));
3505 
  // Publish the new tables through volatile stores, release the previous
  // block, and only then raise the recorded capacity. Just the old
  // __kmp_threads pointer is freed; NOTE(review): presumably the old
  // __kmp_root lived in that same allocation, as the new one does — confirm.
3506   kmp_info_t **temp_threads = __kmp_threads;
3507   *(kmp_info_t * *volatile *)&__kmp_threads = newThreads;
3508   *(kmp_root_t * *volatile *)&__kmp_root = newRoot;
3509   __kmp_free(temp_threads);
3510   added += newCapacity - __kmp_threads_capacity;
3511   *(volatile int *)&__kmp_threads_capacity = newCapacity;
3512 
  // Keep the threadprivate capacity in step with the thread table. The first
  // test is an unlocked pre-check; it is repeated under __kmp_tp_cached_lock.
3513   if (newCapacity > __kmp_tp_capacity) {
3514     __kmp_acquire_bootstrap_lock(&__kmp_tp_cached_lock);
3515     if (__kmp_tp_cached && newCapacity > __kmp_tp_capacity) {
3516       __kmp_threadprivate_resize_cache(newCapacity);
3517     } else { // increase __kmp_tp_capacity to correspond with kmp_threads size
3518       *(volatile int *)&__kmp_tp_capacity = newCapacity;
3519     }
3520     __kmp_release_bootstrap_lock(&__kmp_tp_cached_lock);
3521   }
3522 
3523   return added;
3524 }
3525 
3526 /* Register the current thread as a root thread and obtain our gtid. We must
3527    have the __kmp_initz_lock held at this point. The argument is TRUE only if
3528    we are the thread that calls from __kmp_do_serial_initialize() */
// Register the calling thread as a root thread.
//
// initial_thread: non-zero when the caller may take slot 0 of __kmp_threads;
//                 any other caller is given a slot >= 1.
// Returns the gtid (index into __kmp_threads / __kmp_root) assigned to the
// caller. If no slot is free and the tables cannot be expanded, the function
// reports a fatal error through __kmp_fatal.
//
// The whole body runs with __kmp_forkjoin_lock held (acquired on entry,
// released just before returning).
3529 int __kmp_register_root(int initial_thread) {
3530   kmp_info_t *root_thread;
3531   kmp_root_t *root;
3532   int gtid;
3533   int capacity;
3534   __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
3535   KA_TRACE(20, ("__kmp_register_root: entered\n"));
3536   KMP_MB();
3537 
3538   /* 2007-03-02:
3539      If initial thread did not invoke OpenMP RTL yet, and this thread is not an
3540      initial one, "__kmp_all_nth >= __kmp_threads_capacity" condition does not
3541      work as expected -- it may return false (that means there is at least one
3542      empty slot in __kmp_threads array), but it is possible the only free slot
3543      is #0, which is reserved for initial thread and so cannot be used for this
3544      one. Following code workarounds this bug.
3545 
3546      However, right solution seems to be not reserving slot #0 for initial
3547      thread because:
3548      (1) there is no magic in slot #0,
3549      (2) we cannot detect initial thread reliably (the first thread which does
3550         serial initialization may be not a real initial thread).
3551   */
3552   capacity = __kmp_threads_capacity;
3553   if (!initial_thread && TCR_PTR(__kmp_threads[0]) == NULL) {
3554     --capacity;
3555   }
3556 
3557   /* see if there are too many threads */
3558   if (__kmp_all_nth >= capacity && !__kmp_expand_threads(1)) {
3559     if (__kmp_tp_cached) {
3560       __kmp_fatal(KMP_MSG(CantRegisterNewThread),
3561                   KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity),
3562                   KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null);
3563     } else {
3564       __kmp_fatal(KMP_MSG(CantRegisterNewThread), KMP_HNT(SystemLimitOnThreads),
3565                   __kmp_msg_null);
3566     }
3567   }
3568 
3569   /* find an available thread slot */
3570   /* Don't reassign the zero slot since we need that to only be used by initial
3571      thread */
  // The scan itself has no upper bound; it relies on the capacity check above
  // and the result is only validated afterwards by the KMP_ASSERT.
3572   for (gtid = (initial_thread ? 0 : 1); TCR_PTR(__kmp_threads[gtid]) != NULL;
3573        gtid++)
3574     ;
3575   KA_TRACE(1,
3576            ("__kmp_register_root: found slot in threads array: T#%d\n", gtid));
3577   KMP_ASSERT(gtid < __kmp_threads_capacity);
3578 
3579   /* update global accounting */
3580   __kmp_all_nth++;
3581   TCW_4(__kmp_nth, __kmp_nth + 1);
3582 
3583   // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search) for low
3584   // numbers of procs, and method #2 (keyed API call) for higher numbers.
3585   if (__kmp_adjust_gtid_mode) {
3586     if (__kmp_all_nth >= __kmp_tls_gtid_min) {
3587       if (TCR_4(__kmp_gtid_mode) != 2) {
3588         TCW_4(__kmp_gtid_mode, 2);
3589       }
3590     } else {
3591       if (TCR_4(__kmp_gtid_mode) != 1) {
3592         TCW_4(__kmp_gtid_mode, 1);
3593       }
3594     }
3595   }
3596 
3597 #ifdef KMP_ADJUST_BLOCKTIME
3598   /* Adjust blocktime to zero if necessary            */
3599   /* Middle initialization might not have occurred yet */
3600   if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
3601     if (__kmp_nth > __kmp_avail_proc) {
3602       __kmp_zero_bt = TRUE;
3603     }
3604   }
3605 #endif /* KMP_ADJUST_BLOCKTIME */
3606 
3607   /* setup this new hierarchy */
  // Reuse a root descriptor left in this slot, otherwise allocate one. A
  // freshly allocated root is expected to have no root team yet.
3608   if (!(root = __kmp_root[gtid])) {
3609     root = __kmp_root[gtid] = (kmp_root_t *)__kmp_allocate(sizeof(kmp_root_t));
3610     KMP_DEBUG_ASSERT(!root->r.r_root_team);
3611   }
3612 
3613 #if KMP_STATS_ENABLED
3614   // Initialize stats as soon as possible (right after gtid assignment).
3615   __kmp_stats_thread_ptr = __kmp_stats_list->push_back(gtid);
3616   __kmp_stats_thread_ptr->startLife();
3617   KMP_SET_THREAD_STATE(SERIAL_REGION);
3618   KMP_INIT_PARTITIONED_TIMERS(OMP_serial);
3619 #endif
3620   __kmp_initialize_root(root);
3621 
3622   /* setup new root thread structure */
  // An uber thread already attached to the root is reused as is; otherwise a
  // new kmp_info_t is allocated and given its one-time initialization.
3623   if (root->r.r_uber_thread) {
3624     root_thread = root->r.r_uber_thread;
3625   } else {
3626     root_thread = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t));
3627     if (__kmp_storage_map) {
3628       __kmp_print_thread_storage_map(root_thread, gtid);
3629     }
3630     root_thread->th.th_info.ds.ds_gtid = gtid;
3631 #if OMPT_SUPPORT
3632     root_thread->th.ompt_thread_info.thread_data = ompt_data_none;
3633 #endif
3634     root_thread->th.th_root = root;
3635     if (__kmp_env_consistency_check) {
3636       root_thread->th.th_cons = __kmp_allocate_cons_stack(gtid);
3637     }
3638 #if USE_FAST_MEMORY
3639     __kmp_initialize_fast_memory(root_thread);
3640 #endif /* USE_FAST_MEMORY */
3641 
3642 #if KMP_USE_BGET
3643     KMP_DEBUG_ASSERT(root_thread->th.th_local.bget_data == NULL);
3644     __kmp_initialize_bget(root_thread);
3645 #endif
3646     __kmp_init_random(root_thread); // Initialize random number generator
3647   }
3648 
3649   /* setup the serial team held in reserve by the root thread */
3650   if (!root_thread->th.th_serial_team) {
3651     kmp_internal_control_t r_icvs = __kmp_get_global_icvs();
3652     KF_TRACE(10, ("__kmp_register_root: before serial_team\n"));
3653     root_thread->th.th_serial_team = __kmp_allocate_team(
3654         root, 1, 1,
3655 #if OMPT_SUPPORT
3656         ompt_data_none, // root parallel id
3657 #endif
3658         proc_bind_default, &r_icvs, 0 USE_NESTED_HOT_ARG(NULL));
3659   }
3660   KMP_ASSERT(root_thread->th.th_serial_team);
3661   KF_TRACE(10, ("__kmp_register_root: after serial_team = %p\n",
3662                 root_thread->th.th_serial_team));
3663 
3664   /* drop root_thread into place */
3665   TCW_SYNC_PTR(__kmp_threads[gtid], root_thread);
3666 
  // The root thread occupies slot 0 of its root, hot and serial teams.
3667   root->r.r_root_team->t.t_threads[0] = root_thread;
3668   root->r.r_hot_team->t.t_threads[0] = root_thread;
3669   root_thread->th.th_serial_team->t.t_threads[0] = root_thread;
3670   // AC: the team created in reserve, not for execution (it is unused for now).
3671   root_thread->th.th_serial_team->t.t_serialized = 0;
3672   root->r.r_uber_thread = root_thread;
3673 
3674   /* initialize the thread, get it ready to go */
3675   __kmp_initialize_info(root_thread, root->r.r_root_team, 0, gtid);
3676   TCW_4(__kmp_init_gtid, TRUE);
3677 
3678   /* prepare the master thread for get_gtid() */
3679   __kmp_gtid_set_specific(gtid);
3680 
3681 #if USE_ITT_BUILD
3682   __kmp_itt_thread_name(gtid);
3683 #endif /* USE_ITT_BUILD */
3684 
3685 #ifdef KMP_TDATA_GTID
3686   __kmp_gtid = gtid;
3687 #endif
3688   __kmp_create_worker(gtid, root_thread, __kmp_stksize);
3689   KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() == gtid);
3690 
3691   KA_TRACE(20, ("__kmp_register_root: T#%d init T#%d(%d:%d) arrived: join=%u, "
3692                 "plain=%u\n",
3693                 gtid, __kmp_gtid_from_tid(0, root->r.r_hot_team),
3694                 root->r.r_hot_team->t.t_id, 0, KMP_INIT_BARRIER_STATE,
3695                 KMP_INIT_BARRIER_STATE));
3696   { // Initialize barrier data.
3697     int b;
3698     for (b = 0; b < bs_last_barrier; ++b) {
3699       root_thread->th.th_bar[b].bb.b_arrived = KMP_INIT_BARRIER_STATE;
3700 #if USE_DEBUGGER
3701       root_thread->th.th_bar[b].bb.b_worker_arrived = 0;
3702 #endif
3703     }
3704   }
3705   KMP_DEBUG_ASSERT(root->r.r_hot_team->t.t_bar[bs_forkjoin_barrier].b_arrived ==
3706                    KMP_INIT_BARRIER_STATE);
3707 
3708 #if KMP_AFFINITY_SUPPORTED
3709   root_thread->th.th_current_place = KMP_PLACE_UNDEFINED;
3710   root_thread->th.th_new_place = KMP_PLACE_UNDEFINED;
3711   root_thread->th.th_first_place = KMP_PLACE_UNDEFINED;
3712   root_thread->th.th_last_place = KMP_PLACE_UNDEFINED;
3713   if (TCR_4(__kmp_init_middle)) {
3714     __kmp_affinity_set_init_mask(gtid, TRUE);
3715   }
3716 #endif /* KMP_AFFINITY_SUPPORTED */
3717   root_thread->th.th_def_allocator = __kmp_def_allocator;
3718   root_thread->th.th_prev_level = 0;
3719   root_thread->th.th_prev_num_threads = 1;
3720 
  // Give the root thread its own contention-group node: it is the group's
  // root, counts as the single member so far, and has no parent node.
3721   kmp_cg_root_t *tmp = (kmp_cg_root_t *)__kmp_allocate(sizeof(kmp_cg_root_t));
3722   tmp->cg_root = root_thread;
3723   tmp->cg_thread_limit = __kmp_cg_max_nth;
3724   tmp->cg_nthreads = 1;
3725   KA_TRACE(100, ("__kmp_register_root: Thread %p created node %p with"
3726                  " cg_nthreads init to 1\n",
3727                  root_thread, tmp));
3728   tmp->up = NULL;
3729   root_thread->th.th_cg_roots = tmp;
3730 
3731   __kmp_root_counter++;
3732 
3733 #if OMPT_SUPPORT
  // Tool callbacks are issued here only for roots other than the initial
  // thread, and only when OMPT is enabled.
3734   if (!initial_thread && ompt_enabled.enabled) {
3735 
    // NOTE: this declaration shadows the function-level root_thread for the
    // rest of this scope.
3736     kmp_info_t *root_thread = ompt_get_thread();
3737 
3738     ompt_set_thread_state(root_thread, ompt_state_overhead);
3739 
3740     if (ompt_enabled.ompt_callback_thread_begin) {
3741       ompt_callbacks.ompt_callback(ompt_callback_thread_begin)(
3742           ompt_thread_initial, __ompt_get_thread_data_internal());
3743     }
3744     ompt_data_t *task_data;
3745     ompt_data_t *parallel_data;
3746     __ompt_get_task_info_internal(0, NULL, &task_data, NULL, &parallel_data, NULL);
3747     if (ompt_enabled.ompt_callback_implicit_task) {
3748       ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
3749           ompt_scope_begin, parallel_data, task_data, 1, 1, ompt_task_initial);
3750     }
3751 
3752     ompt_set_thread_state(root_thread, ompt_state_work_serial);
3753   }
3754 #endif
3755 
3756   KMP_MB();
3757   __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
3758 
3759   return gtid;
3760 }
3761 
3762 #if KMP_NESTED_HOT_TEAMS
3763 static int __kmp_free_hot_teams(kmp_root_t *root, kmp_info_t *thr, int level,
3764                                 const int max_level) {
3765   int i, n, nth;
3766   kmp_hot_team_ptr_t *hot_teams = thr->th.th_hot_teams;
3767   if (!hot_teams || !hot_teams[level].hot_team) {
3768     return 0;
3769   }
3770   KMP_DEBUG_ASSERT(level < max_level);
3771   kmp_team_t *team = hot_teams[level].hot_team;
3772   nth = hot_teams[level].hot_team_nth;
3773   n = nth - 1; // master is not freed
3774   if (level < max_level - 1) {
3775     for (i = 0; i < nth; ++i) {
3776       kmp_info_t *th = team->t.t_threads[i];
3777       n += __kmp_free_hot_teams(root, th, level + 1, max_level);
3778       if (i > 0 && th->th.th_hot_teams) {
3779         __kmp_free(th->th.th_hot_teams);
3780         th->th.th_hot_teams = NULL;
3781       }
3782     }
3783   }
3784   __kmp_free_team(root, team, NULL);
3785   return n;
3786 }
3787 #endif
3788 
3789 // Resets a root thread and clear its root and hot teams.
3790 // Returns the number of __kmp_threads entries directly and indirectly freed.
3791 static int __kmp_reset_root(int gtid, kmp_root_t *root) {
3792   kmp_team_t *root_team = root->r.r_root_team;
3793   kmp_team_t *hot_team = root->r.r_hot_team;
3794   int n = hot_team->t.t_nproc;
3795   int i;
3796 
3797   KMP_DEBUG_ASSERT(!root->r.r_active);
3798 
3799   root->r.r_root_team = NULL;
3800   root->r.r_hot_team = NULL;
3801   // __kmp_free_team() does not free hot teams, so we have to clear r_hot_team
3802   // before call to __kmp_free_team().
3803   __kmp_free_team(root, root_team USE_NESTED_HOT_ARG(NULL));
3804 #if KMP_NESTED_HOT_TEAMS
3805   if (__kmp_hot_teams_max_level >
3806       0) { // need to free nested hot teams and their threads if any
3807     for (i = 0; i < hot_team->t.t_nproc; ++i) {
3808       kmp_info_t *th = hot_team->t.t_threads[i];
3809       if (__kmp_hot_teams_max_level > 1) {
3810         n += __kmp_free_hot_teams(root, th, 1, __kmp_hot_teams_max_level);
3811       }
3812       if (th->th.th_hot_teams) {
3813         __kmp_free(th->th.th_hot_teams);
3814         th->th.th_hot_teams = NULL;
3815       }
3816     }
3817   }
3818 #endif
3819   __kmp_free_team(root, hot_team USE_NESTED_HOT_ARG(NULL));
3820 
3821   // Before we can reap the thread, we need to make certain that all other
3822   // threads in the teams that had this root as ancestor have stopped trying to
3823   // steal tasks.
3824   if (__kmp_tasking_mode != tskm_immediate_exec) {
3825     __kmp_wait_to_unref_task_teams();
3826   }
3827 
3828 #if KMP_OS_WINDOWS
3829   /* Close Handle of root duplicated in __kmp_create_worker (tr #62919) */
3830   KA_TRACE(
3831       10, ("__kmp_reset_root: free handle, th = %p, handle = %" KMP_UINTPTR_SPEC
3832            "\n",
3833            (LPVOID) & (root->r.r_uber_thread->th),
3834            root->r.r_uber_thread->th.th_info.ds.ds_thread));
3835   __kmp_free_handle(root->r.r_uber_thread->th.th_info.ds.ds_thread);
3836 #endif /* KMP_OS_WINDOWS */
3837 
3838 #if OMPT_SUPPORT
3839   ompt_data_t *task_data;
3840   ompt_data_t *parallel_data;
3841   __ompt_get_task_info_internal(0, NULL, &task_data, NULL, &parallel_data, NULL);
3842   if (ompt_enabled.ompt_callback_implicit_task) {
3843     ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
3844         ompt_scope_end, parallel_data, task_data, 0, 1, ompt_task_initial);
3845   }
3846   if (ompt_enabled.ompt_callback_thread_end) {
3847     ompt_callbacks.ompt_callback(ompt_callback_thread_end)(
3848         &(root->r.r_uber_thread->th.ompt_thread_info.thread_data));
3849   }
3850 #endif
3851 
3852   TCW_4(__kmp_nth,
3853         __kmp_nth - 1); // __kmp_reap_thread will decrement __kmp_all_nth.
3854   i = root->r.r_uber_thread->th.th_cg_roots->cg_nthreads--;
3855   KA_TRACE(100, ("__kmp_reset_root: Thread %p decrement cg_nthreads on node %p"
3856                  " to %d\n",
3857                  root->r.r_uber_thread, root->r.r_uber_thread->th.th_cg_roots,
3858                  root->r.r_uber_thread->th.th_cg_roots->cg_nthreads));
3859   if (i == 1) {
3860     // need to free contention group structure
3861     KMP_DEBUG_ASSERT(root->r.r_uber_thread ==
3862                      root->r.r_uber_thread->th.th_cg_roots->cg_root);
3863     KMP_DEBUG_ASSERT(root->r.r_uber_thread->th.th_cg_roots->up == NULL);
3864     __kmp_free(root->r.r_uber_thread->th.th_cg_roots);
3865     root->r.r_uber_thread->th.th_cg_roots = NULL;
3866   }
3867   __kmp_reap_thread(root->r.r_uber_thread, 1);
3868 
3869   // We canot put root thread to __kmp_thread_pool, so we have to reap it istead
3870   // of freeing.
3871   root->r.r_uber_thread = NULL;
3872   /* mark root as no longer in use */
3873   root->r.r_begin = FALSE;
3874 
3875   return n;
3876 }
3877 
3878 void __kmp_unregister_root_current_thread(int gtid) {
3879   KA_TRACE(1, ("__kmp_unregister_root_current_thread: enter T#%d\n", gtid));
3880   /* this lock should be ok, since unregister_root_current_thread is never
3881      called during an abort, only during a normal close. furthermore, if you
3882      have the forkjoin lock, you should never try to get the initz lock */
3883   __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
3884   if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
3885     KC_TRACE(10, ("__kmp_unregister_root_current_thread: already finished, "
3886                   "exiting T#%d\n",
3887                   gtid));
3888     __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
3889     return;
3890   }
3891   kmp_root_t *root = __kmp_root[gtid];
3892 
3893   KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
3894   KMP_ASSERT(KMP_UBER_GTID(gtid));
3895   KMP_ASSERT(root == __kmp_threads[gtid]->th.th_root);
3896   KMP_ASSERT(root->r.r_active == FALSE);
3897 
3898   KMP_MB();
3899 
3900   kmp_info_t *thread = __kmp_threads[gtid];
3901   kmp_team_t *team = thread->th.th_team;
3902   kmp_task_team_t *task_team = thread->th.th_task_team;
3903 
3904   // we need to wait for the proxy tasks before finishing the thread
3905   if (task_team != NULL && task_team->tt.tt_found_proxy_tasks) {
3906 #if OMPT_SUPPORT
3907     // the runtime is shutting down so we won't report any events
3908     thread->th.ompt_thread_info.state = ompt_state_undefined;
3909 #endif
3910     __kmp_task_team_wait(thread, team USE_ITT_BUILD_ARG(NULL));
3911   }
3912 
3913   __kmp_reset_root(gtid, root);
3914 
3915   /* free up this thread slot */
3916   __kmp_gtid_set_specific(KMP_GTID_DNE);
3917 #ifdef KMP_TDATA_GTID
3918   __kmp_gtid = KMP_GTID_DNE;
3919 #endif
3920 
3921   KMP_MB();
3922   KC_TRACE(10,
3923            ("__kmp_unregister_root_current_thread: T#%d unregistered\n", gtid));
3924 
3925   __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
3926 }
3927 
#if KMP_OS_WINDOWS
/* Unregisters a root thread that is not the current thread.
   __kmp_forkjoin_lock must already be held by the caller.
   Returns the number of __kmp_threads entries freed as a result. */
static int __kmp_unregister_root_other_thread(int gtid) {
  kmp_root_t *root = __kmp_root[gtid];

  KA_TRACE(1, ("__kmp_unregister_root_other_thread: enter T#%d\n", gtid));
  KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
  KMP_ASSERT(KMP_UBER_GTID(gtid));
  KMP_ASSERT(root == __kmp_threads[gtid]->th.th_root);
  KMP_ASSERT(root->r.r_active == FALSE);

  // The count reported by __kmp_reset_root is passed straight to the caller.
  const int freed = __kmp_reset_root(gtid, root);
  KC_TRACE(10,
           ("__kmp_unregister_root_other_thread: T#%d unregistered\n", gtid));
  return freed;
}
#endif
3948 
#if KMP_DEBUG
// Debug helper: prints, for the calling thread, its gtid and tid, its thread
// structure, its current and serial teams, its current task and the parent of
// its implicit task in the current team.
void __kmp_task_info() {
  const kmp_int32 gtid = __kmp_entry_gtid();
  const kmp_int32 tid = __kmp_tid_from_gtid(gtid);
  kmp_info_t *self = __kmp_threads[gtid];
  kmp_team_t *serial_team = self->th.th_serial_team;
  kmp_team_t *cur_team = self->th.th_team;

  __kmp_printf(
      "__kmp_task_info: gtid=%d tid=%d t_thread=%p team=%p steam=%p curtask=%p "
      "ptask=%p\n",
      gtid, tid, self, cur_team, serial_team, self->th.th_current_task,
      cur_team->t.t_implicit_task_taskdata[tid].td_parent);
}
#endif // KMP_DEBUG
3965 
3966 /* TODO optimize with one big memclr, take out what isn't needed, split
3967    responsibility to workers as much as possible, and delay initialization of
3968    features as much as possible  */
3969 static void __kmp_initialize_info(kmp_info_t *this_thr, kmp_team_t *team,
3970                                   int tid, int gtid) {
3971   /* this_thr->th.th_info.ds.ds_gtid is setup in
3972      kmp_allocate_thread/create_worker.
3973      this_thr->th.th_serial_team is setup in __kmp_allocate_thread */
3974   kmp_info_t *master = team->t.t_threads[0];
3975   KMP_DEBUG_ASSERT(this_thr != NULL);
3976   KMP_DEBUG_ASSERT(this_thr->th.th_serial_team);
3977   KMP_DEBUG_ASSERT(team);
3978   KMP_DEBUG_ASSERT(team->t.t_threads);
3979   KMP_DEBUG_ASSERT(team->t.t_dispatch);
3980   KMP_DEBUG_ASSERT(master);
3981   KMP_DEBUG_ASSERT(master->th.th_root);
3982 
3983   KMP_MB();
3984 
3985   TCW_SYNC_PTR(this_thr->th.th_team, team);
3986 
3987   this_thr->th.th_info.ds.ds_tid = tid;
3988   this_thr->th.th_set_nproc = 0;
3989   if (__kmp_tasking_mode != tskm_immediate_exec)
3990     // When tasking is possible, threads are not safe to reap until they are
3991     // done tasking; this will be set when tasking code is exited in wait
3992     this_thr->th.th_reap_state = KMP_NOT_SAFE_TO_REAP;
3993   else // no tasking --> always safe to reap
3994     this_thr->th.th_reap_state = KMP_SAFE_TO_REAP;
3995   this_thr->th.th_set_proc_bind = proc_bind_default;
3996 #if KMP_AFFINITY_SUPPORTED
3997   this_thr->th.th_new_place = this_thr->th.th_current_place;
3998 #endif
3999   this_thr->th.th_root = master->th.th_root;
4000 
4001   /* setup the thread's cache of the team structure */
4002   this_thr->th.th_team_nproc = team->t.t_nproc;
4003   this_thr->th.th_team_master = master;
4004   this_thr->th.th_team_serialized = team->t.t_serialized;
4005   TCW_PTR(this_thr->th.th_sleep_loc, NULL);
4006 
4007   KMP_DEBUG_ASSERT(team->t.t_implicit_task_taskdata);
4008 
4009   KF_TRACE(10, ("__kmp_initialize_info1: T#%d:%d this_thread=%p curtask=%p\n",
4010                 tid, gtid, this_thr, this_thr->th.th_current_task));
4011 
4012   __kmp_init_implicit_task(this_thr->th.th_team_master->th.th_ident, this_thr,
4013                            team, tid, TRUE);
4014 
4015   KF_TRACE(10, ("__kmp_initialize_info2: T#%d:%d this_thread=%p curtask=%p\n",
4016                 tid, gtid, this_thr, this_thr->th.th_current_task));
4017   // TODO: Initialize ICVs from parent; GEH - isn't that already done in
4018   // __kmp_initialize_team()?
4019 
4020   /* TODO no worksharing in speculative threads */
4021   this_thr->th.th_dispatch = &team->t.t_dispatch[tid];
4022 
4023   this_thr->th.th_local.this_construct = 0;
4024 
4025   if (!this_thr->th.th_pri_common) {
4026     this_thr->th.th_pri_common =
4027         (struct common_table *)__kmp_allocate(sizeof(struct common_table));
4028     if (__kmp_storage_map) {
4029       __kmp_print_storage_map_gtid(
4030           gtid, this_thr->th.th_pri_common, this_thr->th.th_pri_common + 1,
4031           sizeof(struct common_table), "th_%d.th_pri_common\n", gtid);
4032     }
4033     this_thr->th.th_pri_head = NULL;
4034   }
4035 
4036   if (this_thr != master && // Master's CG root is initialized elsewhere
4037       this_thr->th.th_cg_roots != master->th.th_cg_roots) { // CG root not set
4038     // Make new thread's CG root same as master's
4039     KMP_DEBUG_ASSERT(master->th.th_cg_roots);
4040     kmp_cg_root_t *tmp = this_thr->th.th_cg_roots;
4041     if (tmp) {
4042       // worker changes CG, need to check if old CG should be freed
4043       int i = tmp->cg_nthreads--;
4044       KA_TRACE(100, ("__kmp_initialize_info: Thread %p decrement cg_nthreads"
4045                      " on node %p of thread %p to %d\n",
4046                      this_thr, tmp, tmp->cg_root, tmp->cg_nthreads));
4047       if (i == 1) {
4048         __kmp_free(tmp); // last thread left CG --> free it
4049       }
4050     }
4051     this_thr->th.th_cg_roots = master->th.th_cg_roots;
4052     // Increment new thread's CG root's counter to add the new thread
4053     this_thr->th.th_cg_roots->cg_nthreads++;
4054     KA_TRACE(100, ("__kmp_initialize_info: Thread %p increment cg_nthreads on"
4055                    " node %p of thread %p to %d\n",
4056                    this_thr, this_thr->th.th_cg_roots,
4057                    this_thr->th.th_cg_roots->cg_root,
4058                    this_thr->th.th_cg_roots->cg_nthreads));
4059     this_thr->th.th_current_task->td_icvs.thread_limit =
4060         this_thr->th.th_cg_roots->cg_thread_limit;
4061   }
4062 
4063   /* Initialize dynamic dispatch */
4064   {
4065     volatile kmp_disp_t *dispatch = this_thr->th.th_dispatch;
4066     // Use team max_nproc since this will never change for the team.
4067     size_t disp_size =
4068         sizeof(dispatch_private_info_t) *
4069         (team->t.t_max_nproc == 1 ? 1 : __kmp_dispatch_num_buffers);
4070     KD_TRACE(10, ("__kmp_initialize_info: T#%d max_nproc: %d\n", gtid,
4071                   team->t.t_max_nproc));
4072     KMP_ASSERT(dispatch);
4073     KMP_DEBUG_ASSERT(team->t.t_dispatch);
4074     KMP_DEBUG_ASSERT(dispatch == &team->t.t_dispatch[tid]);
4075 
4076     dispatch->th_disp_index = 0;
4077     dispatch->th_doacross_buf_idx = 0;
4078     if (!dispatch->th_disp_buffer) {
4079       dispatch->th_disp_buffer =
4080           (dispatch_private_info_t *)__kmp_allocate(disp_size);
4081 
4082       if (__kmp_storage_map) {
4083         __kmp_print_storage_map_gtid(
4084             gtid, &dispatch->th_disp_buffer[0],
4085             &dispatch->th_disp_buffer[team->t.t_max_nproc == 1
4086                                           ? 1
4087                                           : __kmp_dispatch_num_buffers],
4088             disp_size, "th_%d.th_dispatch.th_disp_buffer "
4089                        "(team_%d.t_dispatch[%d].th_disp_buffer)",
4090             gtid, team->t.t_id, gtid);
4091       }
4092     } else {
4093       memset(&dispatch->th_disp_buffer[0], '\0', disp_size);
4094     }
4095 
4096     dispatch->th_dispatch_pr_current = 0;
4097     dispatch->th_dispatch_sh_current = 0;
4098 
4099     dispatch->th_deo_fcn = 0; /* ORDERED     */
4100     dispatch->th_dxo_fcn = 0; /* END ORDERED */
4101   }
4102 
4103   this_thr->th.th_next_pool = NULL;
4104 
4105   if (!this_thr->th.th_task_state_memo_stack) {
4106     size_t i;
4107     this_thr->th.th_task_state_memo_stack =
4108         (kmp_uint8 *)__kmp_allocate(4 * sizeof(kmp_uint8));
4109     this_thr->th.th_task_state_top = 0;
4110     this_thr->th.th_task_state_stack_sz = 4;
4111     for (i = 0; i < this_thr->th.th_task_state_stack_sz;
4112          ++i) // zero init the stack
4113       this_thr->th.th_task_state_memo_stack[i] = 0;
4114   }
4115 
4116   KMP_DEBUG_ASSERT(!this_thr->th.th_spin_here);
4117   KMP_DEBUG_ASSERT(this_thr->th.th_next_waiting == 0);
4118 
4119   KMP_MB();
4120 }
4121 
4122 /* allocate a new thread for the requesting team. this is only called from
4123    within a forkjoin critical section. we will first try to get an available
4124    thread from the thread pool. if none is available, we will fork a new one
4125    assuming we are able to create a new one. this should be assured, as the
4126    caller should check on this first. */
4127 kmp_info_t *__kmp_allocate_thread(kmp_root_t *root, kmp_team_t *team,
4128                                   int new_tid) {
4129   kmp_team_t *serial_team;
4130   kmp_info_t *new_thr;
4131   int new_gtid;
4132 
4133   KA_TRACE(20, ("__kmp_allocate_thread: T#%d\n", __kmp_get_gtid()));
4134   KMP_DEBUG_ASSERT(root && team);
4135 #if !KMP_NESTED_HOT_TEAMS
4136   KMP_DEBUG_ASSERT(KMP_MASTER_GTID(__kmp_get_gtid()));
4137 #endif
4138   KMP_MB();
4139 
4140   /* first, try to get one from the thread pool */
4141   if (__kmp_thread_pool) {
4142     new_thr = CCAST(kmp_info_t *, __kmp_thread_pool);
4143     __kmp_thread_pool = (volatile kmp_info_t *)new_thr->th.th_next_pool;
4144     if (new_thr == __kmp_thread_pool_insert_pt) {
4145       __kmp_thread_pool_insert_pt = NULL;
4146     }
4147     TCW_4(new_thr->th.th_in_pool, FALSE);
4148     __kmp_suspend_initialize_thread(new_thr);
4149     __kmp_lock_suspend_mx(new_thr);
4150     if (new_thr->th.th_active_in_pool == TRUE) {
4151       KMP_DEBUG_ASSERT(new_thr->th.th_active == TRUE);
4152       KMP_ATOMIC_DEC(&__kmp_thread_pool_active_nth);
4153       new_thr->th.th_active_in_pool = FALSE;
4154     }
4155     __kmp_unlock_suspend_mx(new_thr);
4156 
4157     KA_TRACE(20, ("__kmp_allocate_thread: T#%d using thread T#%d\n",
4158                   __kmp_get_gtid(), new_thr->th.th_info.ds.ds_gtid));
4159     KMP_ASSERT(!new_thr->th.th_team);
4160     KMP_DEBUG_ASSERT(__kmp_nth < __kmp_threads_capacity);
4161 
4162     /* setup the thread structure */
4163     __kmp_initialize_info(new_thr, team, new_tid,
4164                           new_thr->th.th_info.ds.ds_gtid);
4165     KMP_DEBUG_ASSERT(new_thr->th.th_serial_team);
4166 
4167     TCW_4(__kmp_nth, __kmp_nth + 1);
4168 
4169     new_thr->th.th_task_state = 0;
4170     new_thr->th.th_task_state_top = 0;
4171     new_thr->th.th_task_state_stack_sz = 4;
4172 
4173 #ifdef KMP_ADJUST_BLOCKTIME
4174     /* Adjust blocktime back to zero if necessary */
4175     /* Middle initialization might not have occurred yet */
4176     if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
4177       if (__kmp_nth > __kmp_avail_proc) {
4178         __kmp_zero_bt = TRUE;
4179       }
4180     }
4181 #endif /* KMP_ADJUST_BLOCKTIME */
4182 
4183 #if KMP_DEBUG
4184     // If thread entered pool via __kmp_free_thread, wait_flag should !=
4185     // KMP_BARRIER_PARENT_FLAG.
4186     int b;
4187     kmp_balign_t *balign = new_thr->th.th_bar;
4188     for (b = 0; b < bs_last_barrier; ++b)
4189       KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
4190 #endif
4191 
4192     KF_TRACE(10, ("__kmp_allocate_thread: T#%d using thread %p T#%d\n",
4193                   __kmp_get_gtid(), new_thr, new_thr->th.th_info.ds.ds_gtid));
4194 
4195     KMP_MB();
4196     return new_thr;
4197   }
4198 
4199   /* no, well fork a new one */
4200   KMP_ASSERT(__kmp_nth == __kmp_all_nth);
4201   KMP_ASSERT(__kmp_all_nth < __kmp_threads_capacity);
4202 
4203 #if KMP_USE_MONITOR
4204   // If this is the first worker thread the RTL is creating, then also
4205   // launch the monitor thread.  We try to do this as early as possible.
4206   if (!TCR_4(__kmp_init_monitor)) {
4207     __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
4208     if (!TCR_4(__kmp_init_monitor)) {
4209       KF_TRACE(10, ("before __kmp_create_monitor\n"));
4210       TCW_4(__kmp_init_monitor, 1);
4211       __kmp_create_monitor(&__kmp_monitor);
4212       KF_TRACE(10, ("after __kmp_create_monitor\n"));
4213 #if KMP_OS_WINDOWS
4214       // AC: wait until monitor has started. This is a fix for CQ232808.
4215       // The reason is that if the library is loaded/unloaded in a loop with
4216       // small (parallel) work in between, then there is high probability that
4217       // monitor thread started after the library shutdown. At shutdown it is
4218       // too late to cope with the problem, because when the master is in
4219       // DllMain (process detach) the monitor has no chances to start (it is
4220       // blocked), and master has no means to inform the monitor that the
4221       // library has gone, because all the memory which the monitor can access
4222       // is going to be released/reset.
4223       while (TCR_4(__kmp_init_monitor) < 2) {
4224         KMP_YIELD(TRUE);
4225       }
4226       KF_TRACE(10, ("after monitor thread has started\n"));
4227 #endif
4228     }
4229     __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
4230   }
4231 #endif
4232 
4233   KMP_MB();
4234   for (new_gtid = 1; TCR_PTR(__kmp_threads[new_gtid]) != NULL; ++new_gtid) {
4235     KMP_DEBUG_ASSERT(new_gtid < __kmp_threads_capacity);
4236   }
4237 
4238   /* allocate space for it. */
4239   new_thr = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t));
4240 
4241   TCW_SYNC_PTR(__kmp_threads[new_gtid], new_thr);
4242 
4243   if (__kmp_storage_map) {
4244     __kmp_print_thread_storage_map(new_thr, new_gtid);
4245   }
4246 
4247   // add the reserve serialized team, initialized from the team's master thread
4248   {
4249     kmp_internal_control_t r_icvs = __kmp_get_x_global_icvs(team);
4250     KF_TRACE(10, ("__kmp_allocate_thread: before th_serial/serial_team\n"));
4251     new_thr->th.th_serial_team = serial_team =
4252         (kmp_team_t *)__kmp_allocate_team(root, 1, 1,
4253 #if OMPT_SUPPORT
4254                                           ompt_data_none, // root parallel id
4255 #endif
4256                                           proc_bind_default, &r_icvs,
4257                                           0 USE_NESTED_HOT_ARG(NULL));
4258   }
4259   KMP_ASSERT(serial_team);
4260   serial_team->t.t_serialized = 0; // AC: the team created in reserve, not for
4261   // execution (it is unused for now).
4262   serial_team->t.t_threads[0] = new_thr;
4263   KF_TRACE(10,
4264            ("__kmp_allocate_thread: after th_serial/serial_team : new_thr=%p\n",
4265             new_thr));
4266 
4267   /* setup the thread structures */
4268   __kmp_initialize_info(new_thr, team, new_tid, new_gtid);
4269 
4270 #if USE_FAST_MEMORY
4271   __kmp_initialize_fast_memory(new_thr);
4272 #endif /* USE_FAST_MEMORY */
4273 
4274 #if KMP_USE_BGET
4275   KMP_DEBUG_ASSERT(new_thr->th.th_local.bget_data == NULL);
4276   __kmp_initialize_bget(new_thr);
4277 #endif
4278 
4279   __kmp_init_random(new_thr); // Initialize random number generator
4280 
4281   /* Initialize these only once when thread is grabbed for a team allocation */
4282   KA_TRACE(20,
4283            ("__kmp_allocate_thread: T#%d init go fork=%u, plain=%u\n",
4284             __kmp_get_gtid(), KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
4285 
4286   int b;
4287   kmp_balign_t *balign = new_thr->th.th_bar;
4288   for (b = 0; b < bs_last_barrier; ++b) {
4289     balign[b].bb.b_go = KMP_INIT_BARRIER_STATE;
4290     balign[b].bb.team = NULL;
4291     balign[b].bb.wait_flag = KMP_BARRIER_NOT_WAITING;
4292     balign[b].bb.use_oncore_barrier = 0;
4293   }
4294 
4295   new_thr->th.th_spin_here = FALSE;
4296   new_thr->th.th_next_waiting = 0;
4297 #if KMP_OS_UNIX
4298   new_thr->th.th_blocking = false;
4299 #endif
4300 
4301 #if KMP_AFFINITY_SUPPORTED
4302   new_thr->th.th_current_place = KMP_PLACE_UNDEFINED;
4303   new_thr->th.th_new_place = KMP_PLACE_UNDEFINED;
4304   new_thr->th.th_first_place = KMP_PLACE_UNDEFINED;
4305   new_thr->th.th_last_place = KMP_PLACE_UNDEFINED;
4306 #endif
4307   new_thr->th.th_def_allocator = __kmp_def_allocator;
4308   new_thr->th.th_prev_level = 0;
4309   new_thr->th.th_prev_num_threads = 1;
4310 
4311   TCW_4(new_thr->th.th_in_pool, FALSE);
4312   new_thr->th.th_active_in_pool = FALSE;
4313   TCW_4(new_thr->th.th_active, TRUE);
4314 
4315   /* adjust the global counters */
4316   __kmp_all_nth++;
4317   __kmp_nth++;
4318 
4319   // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search) for low
4320   // numbers of procs, and method #2 (keyed API call) for higher numbers.
4321   if (__kmp_adjust_gtid_mode) {
4322     if (__kmp_all_nth >= __kmp_tls_gtid_min) {
4323       if (TCR_4(__kmp_gtid_mode) != 2) {
4324         TCW_4(__kmp_gtid_mode, 2);
4325       }
4326     } else {
4327       if (TCR_4(__kmp_gtid_mode) != 1) {
4328         TCW_4(__kmp_gtid_mode, 1);
4329       }
4330     }
4331   }
4332 
4333 #ifdef KMP_ADJUST_BLOCKTIME
4334   /* Adjust blocktime back to zero if necessary       */
4335   /* Middle initialization might not have occurred yet */
4336   if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
4337     if (__kmp_nth > __kmp_avail_proc) {
4338       __kmp_zero_bt = TRUE;
4339     }
4340   }
4341 #endif /* KMP_ADJUST_BLOCKTIME */
4342 
4343   /* actually fork it and create the new worker thread */
4344   KF_TRACE(
4345       10, ("__kmp_allocate_thread: before __kmp_create_worker: %p\n", new_thr));
4346   __kmp_create_worker(new_gtid, new_thr, __kmp_stksize);
4347   KF_TRACE(10,
4348            ("__kmp_allocate_thread: after __kmp_create_worker: %p\n", new_thr));
4349 
4350   KA_TRACE(20, ("__kmp_allocate_thread: T#%d forked T#%d\n", __kmp_get_gtid(),
4351                 new_gtid));
4352   KMP_MB();
4353   return new_thr;
4354 }
4355 
4356 /* Reinitialize team for reuse.
4357    The hot team code calls this case at every fork barrier, so EPCC barrier
4358    test are extremely sensitive to changes in it, esp. writes to the team
4359    struct, which cause a cache invalidation in all threads.
4360    IF YOU TOUCH THIS ROUTINE, RUN EPCC C SYNCBENCH ON A BIG-IRON MACHINE!!! */
4361 static void __kmp_reinitialize_team(kmp_team_t *team,
4362                                     kmp_internal_control_t *new_icvs,
4363                                     ident_t *loc) {
4364   KF_TRACE(10, ("__kmp_reinitialize_team: enter this_thread=%p team=%p\n",
4365                 team->t.t_threads[0], team));
4366   KMP_DEBUG_ASSERT(team && new_icvs);
4367   KMP_DEBUG_ASSERT((!TCR_4(__kmp_init_parallel)) || new_icvs->nproc);
4368   KMP_CHECK_UPDATE(team->t.t_ident, loc);
4369 
4370   KMP_CHECK_UPDATE(team->t.t_id, KMP_GEN_TEAM_ID());
4371   // Copy ICVs to the master thread's implicit taskdata
4372   __kmp_init_implicit_task(loc, team->t.t_threads[0], team, 0, FALSE);
4373   copy_icvs(&team->t.t_implicit_task_taskdata[0].td_icvs, new_icvs);
4374 
4375   KF_TRACE(10, ("__kmp_reinitialize_team: exit this_thread=%p team=%p\n",
4376                 team->t.t_threads[0], team));
4377 }
4378 
4379 /* Initialize the team data structure.
4380    This assumes the t_threads and t_max_nproc are already set.
4381    Also, we don't touch the arguments */
4382 static void __kmp_initialize_team(kmp_team_t *team, int new_nproc,
4383                                   kmp_internal_control_t *new_icvs,
4384                                   ident_t *loc) {
4385   KF_TRACE(10, ("__kmp_initialize_team: enter: team=%p\n", team));
4386 
4387   /* verify */
4388   KMP_DEBUG_ASSERT(team);
4389   KMP_DEBUG_ASSERT(new_nproc <= team->t.t_max_nproc);
4390   KMP_DEBUG_ASSERT(team->t.t_threads);
4391   KMP_MB();
4392 
4393   team->t.t_master_tid = 0; /* not needed */
4394   /* team->t.t_master_bar;        not needed */
4395   team->t.t_serialized = new_nproc > 1 ? 0 : 1;
4396   team->t.t_nproc = new_nproc;
4397 
4398   /* team->t.t_parent     = NULL; TODO not needed & would mess up hot team */
4399   team->t.t_next_pool = NULL;
4400   /* memset( team->t.t_threads, 0, sizeof(kmp_info_t*)*new_nproc ); would mess
4401    * up hot team */
4402 
4403   TCW_SYNC_PTR(team->t.t_pkfn, NULL); /* not needed */
4404   team->t.t_invoke = NULL; /* not needed */
4405 
4406   // TODO???: team->t.t_max_active_levels       = new_max_active_levels;
4407   team->t.t_sched.sched = new_icvs->sched.sched;
4408 
4409 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
4410   team->t.t_fp_control_saved = FALSE; /* not needed */
4411   team->t.t_x87_fpu_control_word = 0; /* not needed */
4412   team->t.t_mxcsr = 0; /* not needed */
4413 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
4414 
4415   team->t.t_construct = 0;
4416 
4417   team->t.t_ordered.dt.t_value = 0;
4418   team->t.t_master_active = FALSE;
4419 
4420 #ifdef KMP_DEBUG
4421   team->t.t_copypriv_data = NULL; /* not necessary, but nice for debugging */
4422 #endif
4423 #if KMP_OS_WINDOWS
4424   team->t.t_copyin_counter = 0; /* for barrier-free copyin implementation */
4425 #endif
4426 
4427   team->t.t_control_stack_top = NULL;
4428 
4429   __kmp_reinitialize_team(team, new_icvs, loc);
4430 
4431   KMP_MB();
4432   KF_TRACE(10, ("__kmp_initialize_team: exit: team=%p\n", team));
4433 }
4434 
#if KMP_OS_LINUX && KMP_AFFINITY_SUPPORTED
/* Sets the full mask (__kmp_affin_fullMask) for the thread. When old_mask is
   not NULL, the previous system affinity is fetched into it first, and a
   failure of that query is fatal. No runtime structures are changed. Nothing
   is done unless KMP_AFFINITY_CAPABLE(). */
static void
__kmp_set_thread_affinity_mask_full_tmp(kmp_affin_mask_t *old_mask) {
  if (!KMP_AFFINITY_CAPABLE()) {
    return;
  }
  if (old_mask != NULL) {
    const int status = __kmp_get_system_affinity(old_mask, TRUE);
    const int error = errno; // captured right after the call
    if (status != 0) {
      __kmp_fatal(KMP_MSG(ChangeThreadAffMaskError), KMP_ERR(error),
                  __kmp_msg_null);
    }
  }
  __kmp_set_system_affinity(__kmp_affin_fullMask, TRUE);
}
#endif
4453 
4454 #if KMP_AFFINITY_SUPPORTED
4455 
// __kmp_partition_places() is the heart of the OpenMP 4.0 affinity mechanism.
// It calculates the worker + master thread's partition based upon the parent
// thread's partition, and binds each worker to a place in its partition.
// The master thread's partition should already include its current binding.
4460 static void __kmp_partition_places(kmp_team_t *team, int update_master_only) {
4461   // Copy the master thread's place partion to the team struct
4462   kmp_info_t *master_th = team->t.t_threads[0];
4463   KMP_DEBUG_ASSERT(master_th != NULL);
4464   kmp_proc_bind_t proc_bind = team->t.t_proc_bind;
4465   int first_place = master_th->th.th_first_place;
4466   int last_place = master_th->th.th_last_place;
4467   int masters_place = master_th->th.th_current_place;
4468   team->t.t_first_place = first_place;
4469   team->t.t_last_place = last_place;
4470 
4471   KA_TRACE(20, ("__kmp_partition_places: enter: proc_bind = %d T#%d(%d:0) "
4472                 "bound to place %d partition = [%d,%d]\n",
4473                 proc_bind, __kmp_gtid_from_thread(team->t.t_threads[0]),
4474                 team->t.t_id, masters_place, first_place, last_place));
4475 
4476   switch (proc_bind) {
4477 
4478   case proc_bind_default:
4479     // serial teams might have the proc_bind policy set to proc_bind_default. It
4480     // doesn't matter, as we don't rebind master thread for any proc_bind policy
4481     KMP_DEBUG_ASSERT(team->t.t_nproc == 1);
4482     break;
4483 
4484   case proc_bind_master: {
4485     int f;
4486     int n_th = team->t.t_nproc;
4487     for (f = 1; f < n_th; f++) {
4488       kmp_info_t *th = team->t.t_threads[f];
4489       KMP_DEBUG_ASSERT(th != NULL);
4490       th->th.th_first_place = first_place;
4491       th->th.th_last_place = last_place;
4492       th->th.th_new_place = masters_place;
4493       if (__kmp_display_affinity && masters_place != th->th.th_current_place &&
4494           team->t.t_display_affinity != 1) {
4495         team->t.t_display_affinity = 1;
4496       }
4497 
4498       KA_TRACE(100, ("__kmp_partition_places: master: T#%d(%d:%d) place %d "
4499                      "partition = [%d,%d]\n",
4500                      __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id,
4501                      f, masters_place, first_place, last_place));
4502     }
4503   } break;
4504 
4505   case proc_bind_close: {
4506     int f;
4507     int n_th = team->t.t_nproc;
4508     int n_places;
4509     if (first_place <= last_place) {
4510       n_places = last_place - first_place + 1;
4511     } else {
4512       n_places = __kmp_affinity_num_masks - first_place + last_place + 1;
4513     }
4514     if (n_th <= n_places) {
4515       int place = masters_place;
4516       for (f = 1; f < n_th; f++) {
4517         kmp_info_t *th = team->t.t_threads[f];
4518         KMP_DEBUG_ASSERT(th != NULL);
4519 
4520         if (place == last_place) {
4521           place = first_place;
4522         } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4523           place = 0;
4524         } else {
4525           place++;
4526         }
4527         th->th.th_first_place = first_place;
4528         th->th.th_last_place = last_place;
4529         th->th.th_new_place = place;
4530         if (__kmp_display_affinity && place != th->th.th_current_place &&
4531             team->t.t_display_affinity != 1) {
4532           team->t.t_display_affinity = 1;
4533         }
4534 
4535         KA_TRACE(100, ("__kmp_partition_places: close: T#%d(%d:%d) place %d "
4536                        "partition = [%d,%d]\n",
4537                        __kmp_gtid_from_thread(team->t.t_threads[f]),
4538                        team->t.t_id, f, place, first_place, last_place));
4539       }
4540     } else {
4541       int S, rem, gap, s_count;
4542       S = n_th / n_places;
4543       s_count = 0;
4544       rem = n_th - (S * n_places);
4545       gap = rem > 0 ? n_places / rem : n_places;
4546       int place = masters_place;
4547       int gap_ct = gap;
4548       for (f = 0; f < n_th; f++) {
4549         kmp_info_t *th = team->t.t_threads[f];
4550         KMP_DEBUG_ASSERT(th != NULL);
4551 
4552         th->th.th_first_place = first_place;
4553         th->th.th_last_place = last_place;
4554         th->th.th_new_place = place;
4555         if (__kmp_display_affinity && place != th->th.th_current_place &&
4556             team->t.t_display_affinity != 1) {
4557           team->t.t_display_affinity = 1;
4558         }
4559         s_count++;
4560 
4561         if ((s_count == S) && rem && (gap_ct == gap)) {
4562           // do nothing, add an extra thread to place on next iteration
4563         } else if ((s_count == S + 1) && rem && (gap_ct == gap)) {
4564           // we added an extra thread to this place; move to next place
4565           if (place == last_place) {
4566             place = first_place;
4567           } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4568             place = 0;
4569           } else {
4570             place++;
4571           }
4572           s_count = 0;
4573           gap_ct = 1;
4574           rem--;
4575         } else if (s_count == S) { // place full; don't add extra
4576           if (place == last_place) {
4577             place = first_place;
4578           } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4579             place = 0;
4580           } else {
4581             place++;
4582           }
4583           gap_ct++;
4584           s_count = 0;
4585         }
4586 
4587         KA_TRACE(100,
4588                  ("__kmp_partition_places: close: T#%d(%d:%d) place %d "
4589                   "partition = [%d,%d]\n",
4590                   __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id, f,
4591                   th->th.th_new_place, first_place, last_place));
4592       }
4593       KMP_DEBUG_ASSERT(place == masters_place);
4594     }
4595   } break;
4596 
4597   case proc_bind_spread: {
4598     int f;
4599     int n_th = team->t.t_nproc;
4600     int n_places;
4601     int thidx;
4602     if (first_place <= last_place) {
4603       n_places = last_place - first_place + 1;
4604     } else {
4605       n_places = __kmp_affinity_num_masks - first_place + last_place + 1;
4606     }
4607     if (n_th <= n_places) {
4608       int place = -1;
4609 
4610       if (n_places != static_cast<int>(__kmp_affinity_num_masks)) {
4611         int S = n_places / n_th;
4612         int s_count, rem, gap, gap_ct;
4613 
4614         place = masters_place;
4615         rem = n_places - n_th * S;
4616         gap = rem ? n_th / rem : 1;
4617         gap_ct = gap;
4618         thidx = n_th;
4619         if (update_master_only == 1)
4620           thidx = 1;
4621         for (f = 0; f < thidx; f++) {
4622           kmp_info_t *th = team->t.t_threads[f];
4623           KMP_DEBUG_ASSERT(th != NULL);
4624 
4625           th->th.th_first_place = place;
4626           th->th.th_new_place = place;
4627           if (__kmp_display_affinity && place != th->th.th_current_place &&
4628               team->t.t_display_affinity != 1) {
4629             team->t.t_display_affinity = 1;
4630           }
4631           s_count = 1;
4632           while (s_count < S) {
4633             if (place == last_place) {
4634               place = first_place;
4635             } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4636               place = 0;
4637             } else {
4638               place++;
4639             }
4640             s_count++;
4641           }
4642           if (rem && (gap_ct == gap)) {
4643             if (place == last_place) {
4644               place = first_place;
4645             } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4646               place = 0;
4647             } else {
4648               place++;
4649             }
4650             rem--;
4651             gap_ct = 0;
4652           }
4653           th->th.th_last_place = place;
4654           gap_ct++;
4655 
4656           if (place == last_place) {
4657             place = first_place;
4658           } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4659             place = 0;
4660           } else {
4661             place++;
4662           }
4663 
4664           KA_TRACE(100,
4665                    ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
4666                     "partition = [%d,%d], __kmp_affinity_num_masks: %u\n",
4667                     __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id,
4668                     f, th->th.th_new_place, th->th.th_first_place,
4669                     th->th.th_last_place, __kmp_affinity_num_masks));
4670         }
4671       } else {
4672         /* Having uniform space of available computation places I can create
4673            T partitions of round(P/T) size and put threads into the first
4674            place of each partition. */
4675         double current = static_cast<double>(masters_place);
4676         double spacing =
4677             (static_cast<double>(n_places + 1) / static_cast<double>(n_th));
4678         int first, last;
4679         kmp_info_t *th;
4680 
4681         thidx = n_th + 1;
4682         if (update_master_only == 1)
4683           thidx = 1;
4684         for (f = 0; f < thidx; f++) {
4685           first = static_cast<int>(current);
4686           last = static_cast<int>(current + spacing) - 1;
4687           KMP_DEBUG_ASSERT(last >= first);
4688           if (first >= n_places) {
4689             if (masters_place) {
4690               first -= n_places;
4691               last -= n_places;
4692               if (first == (masters_place + 1)) {
4693                 KMP_DEBUG_ASSERT(f == n_th);
4694                 first--;
4695               }
4696               if (last == masters_place) {
4697                 KMP_DEBUG_ASSERT(f == (n_th - 1));
4698                 last--;
4699               }
4700             } else {
4701               KMP_DEBUG_ASSERT(f == n_th);
4702               first = 0;
4703               last = 0;
4704             }
4705           }
4706           if (last >= n_places) {
4707             last = (n_places - 1);
4708           }
4709           place = first;
4710           current += spacing;
4711           if (f < n_th) {
4712             KMP_DEBUG_ASSERT(0 <= first);
4713             KMP_DEBUG_ASSERT(n_places > first);
4714             KMP_DEBUG_ASSERT(0 <= last);
4715             KMP_DEBUG_ASSERT(n_places > last);
4716             KMP_DEBUG_ASSERT(last_place >= first_place);
4717             th = team->t.t_threads[f];
4718             KMP_DEBUG_ASSERT(th);
4719             th->th.th_first_place = first;
4720             th->th.th_new_place = place;
4721             th->th.th_last_place = last;
4722             if (__kmp_display_affinity && place != th->th.th_current_place &&
4723                 team->t.t_display_affinity != 1) {
4724               team->t.t_display_affinity = 1;
4725             }
4726             KA_TRACE(100,
4727                      ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
4728                       "partition = [%d,%d], spacing = %.4f\n",
4729                       __kmp_gtid_from_thread(team->t.t_threads[f]),
4730                       team->t.t_id, f, th->th.th_new_place,
4731                       th->th.th_first_place, th->th.th_last_place, spacing));
4732           }
4733         }
4734       }
4735       KMP_DEBUG_ASSERT(update_master_only || place == masters_place);
4736     } else {
4737       int S, rem, gap, s_count;
4738       S = n_th / n_places;
4739       s_count = 0;
4740       rem = n_th - (S * n_places);
4741       gap = rem > 0 ? n_places / rem : n_places;
4742       int place = masters_place;
4743       int gap_ct = gap;
4744       thidx = n_th;
4745       if (update_master_only == 1)
4746         thidx = 1;
4747       for (f = 0; f < thidx; f++) {
4748         kmp_info_t *th = team->t.t_threads[f];
4749         KMP_DEBUG_ASSERT(th != NULL);
4750 
4751         th->th.th_first_place = place;
4752         th->th.th_last_place = place;
4753         th->th.th_new_place = place;
4754         if (__kmp_display_affinity && place != th->th.th_current_place &&
4755             team->t.t_display_affinity != 1) {
4756           team->t.t_display_affinity = 1;
4757         }
4758         s_count++;
4759 
4760         if ((s_count == S) && rem && (gap_ct == gap)) {
4761           // do nothing, add an extra thread to place on next iteration
4762         } else if ((s_count == S + 1) && rem && (gap_ct == gap)) {
4763           // we added an extra thread to this place; move on to next place
4764           if (place == last_place) {
4765             place = first_place;
4766           } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4767             place = 0;
4768           } else {
4769             place++;
4770           }
4771           s_count = 0;
4772           gap_ct = 1;
4773           rem--;
4774         } else if (s_count == S) { // place is full; don't add extra thread
4775           if (place == last_place) {
4776             place = first_place;
4777           } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4778             place = 0;
4779           } else {
4780             place++;
4781           }
4782           gap_ct++;
4783           s_count = 0;
4784         }
4785 
4786         KA_TRACE(100, ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
4787                        "partition = [%d,%d]\n",
4788                        __kmp_gtid_from_thread(team->t.t_threads[f]),
4789                        team->t.t_id, f, th->th.th_new_place,
4790                        th->th.th_first_place, th->th.th_last_place));
4791       }
4792       KMP_DEBUG_ASSERT(update_master_only || place == masters_place);
4793     }
4794   } break;
4795 
4796   default:
4797     break;
4798   }
4799 
4800   KA_TRACE(20, ("__kmp_partition_places: exit T#%d\n", team->t.t_id));
4801 }
4802 
4803 #endif // KMP_AFFINITY_SUPPORTED
4804 
4805 /* allocate a new team data structure to use.  take one off of the free pool if
4806    available */
4807 kmp_team_t *
4808 __kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc,
4809 #if OMPT_SUPPORT
4810                     ompt_data_t ompt_parallel_data,
4811 #endif
4812                     kmp_proc_bind_t new_proc_bind,
4813                     kmp_internal_control_t *new_icvs,
4814                     int argc USE_NESTED_HOT_ARG(kmp_info_t *master)) {
4815   KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_allocate_team);
4816   int f;
4817   kmp_team_t *team;
4818   int use_hot_team = !root->r.r_active;
4819   int level = 0;
4820 
4821   KA_TRACE(20, ("__kmp_allocate_team: called\n"));
4822   KMP_DEBUG_ASSERT(new_nproc >= 1 && argc >= 0);
4823   KMP_DEBUG_ASSERT(max_nproc >= new_nproc);
4824   KMP_MB();
4825 
4826 #if KMP_NESTED_HOT_TEAMS
4827   kmp_hot_team_ptr_t *hot_teams;
4828   if (master) {
4829     team = master->th.th_team;
4830     level = team->t.t_active_level;
4831     if (master->th.th_teams_microtask) { // in teams construct?
4832       if (master->th.th_teams_size.nteams > 1 &&
4833           ( // #teams > 1
4834               team->t.t_pkfn ==
4835                   (microtask_t)__kmp_teams_master || // inner fork of the teams
4836               master->th.th_teams_level <
4837                   team->t.t_level)) { // or nested parallel inside the teams
4838         ++level; // not increment if #teams==1, or for outer fork of the teams;
4839         // increment otherwise
4840       }
4841     }
4842     hot_teams = master->th.th_hot_teams;
4843     if (level < __kmp_hot_teams_max_level && hot_teams &&
4844         hot_teams[level]
4845             .hot_team) { // hot team has already been allocated for given level
4846       use_hot_team = 1;
4847     } else {
4848       use_hot_team = 0;
4849     }
4850   }
4851 #endif
4852   // Optimization to use a "hot" team
4853   if (use_hot_team && new_nproc > 1) {
4854     KMP_DEBUG_ASSERT(new_nproc <= max_nproc);
4855 #if KMP_NESTED_HOT_TEAMS
4856     team = hot_teams[level].hot_team;
4857 #else
4858     team = root->r.r_hot_team;
4859 #endif
4860 #if KMP_DEBUG
4861     if (__kmp_tasking_mode != tskm_immediate_exec) {
4862       KA_TRACE(20, ("__kmp_allocate_team: hot team task_team[0] = %p "
4863                     "task_team[1] = %p before reinit\n",
4864                     team->t.t_task_team[0], team->t.t_task_team[1]));
4865     }
4866 #endif
4867 
4868     // Has the number of threads changed?
4869     /* Let's assume the most common case is that the number of threads is
4870        unchanged, and put that case first. */
4871     if (team->t.t_nproc == new_nproc) { // Check changes in number of threads
4872       KA_TRACE(20, ("__kmp_allocate_team: reusing hot team\n"));
4873       // This case can mean that omp_set_num_threads() was called and the hot
4874       // team size was already reduced, so we check the special flag
4875       if (team->t.t_size_changed == -1) {
4876         team->t.t_size_changed = 1;
4877       } else {
4878         KMP_CHECK_UPDATE(team->t.t_size_changed, 0);
4879       }
4880 
4881       // TODO???: team->t.t_max_active_levels = new_max_active_levels;
4882       kmp_r_sched_t new_sched = new_icvs->sched;
4883       // set master's schedule as new run-time schedule
4884       KMP_CHECK_UPDATE(team->t.t_sched.sched, new_sched.sched);
4885 
4886       __kmp_reinitialize_team(team, new_icvs,
4887                               root->r.r_uber_thread->th.th_ident);
4888 
4889       KF_TRACE(10, ("__kmp_allocate_team2: T#%d, this_thread=%p team=%p\n", 0,
4890                     team->t.t_threads[0], team));
4891       __kmp_push_current_task_to_thread(team->t.t_threads[0], team, 0);
4892 
4893 #if KMP_AFFINITY_SUPPORTED
4894       if ((team->t.t_size_changed == 0) &&
4895           (team->t.t_proc_bind == new_proc_bind)) {
4896         if (new_proc_bind == proc_bind_spread) {
4897           __kmp_partition_places(
4898               team, 1); // add flag to update only master for spread
4899         }
4900         KA_TRACE(200, ("__kmp_allocate_team: reusing hot team #%d bindings: "
4901                        "proc_bind = %d, partition = [%d,%d]\n",
4902                        team->t.t_id, new_proc_bind, team->t.t_first_place,
4903                        team->t.t_last_place));
4904       } else {
4905         KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
4906         __kmp_partition_places(team);
4907       }
4908 #else
4909       KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
4910 #endif /* KMP_AFFINITY_SUPPORTED */
4911     } else if (team->t.t_nproc > new_nproc) {
4912       KA_TRACE(20,
4913                ("__kmp_allocate_team: decreasing hot team thread count to %d\n",
4914                 new_nproc));
4915 
4916       team->t.t_size_changed = 1;
4917 #if KMP_NESTED_HOT_TEAMS
4918       if (__kmp_hot_teams_mode == 0) {
4919         // AC: saved number of threads should correspond to team's value in this
4920         // mode, can be bigger in mode 1, when hot team has threads in reserve
4921         KMP_DEBUG_ASSERT(hot_teams[level].hot_team_nth == team->t.t_nproc);
4922         hot_teams[level].hot_team_nth = new_nproc;
4923 #endif // KMP_NESTED_HOT_TEAMS
4924         /* release the extra threads we don't need any more */
4925         for (f = new_nproc; f < team->t.t_nproc; f++) {
4926           KMP_DEBUG_ASSERT(team->t.t_threads[f]);
4927           if (__kmp_tasking_mode != tskm_immediate_exec) {
4928             // When decreasing team size, threads no longer in the team should
4929             // unref task team.
4930             team->t.t_threads[f]->th.th_task_team = NULL;
4931           }
4932           __kmp_free_thread(team->t.t_threads[f]);
4933           team->t.t_threads[f] = NULL;
4934         }
4935 #if KMP_NESTED_HOT_TEAMS
4936       } // (__kmp_hot_teams_mode == 0)
4937       else {
4938         // When keeping extra threads in team, switch threads to wait on own
4939         // b_go flag
4940         for (f = new_nproc; f < team->t.t_nproc; ++f) {
4941           KMP_DEBUG_ASSERT(team->t.t_threads[f]);
4942           kmp_balign_t *balign = team->t.t_threads[f]->th.th_bar;
4943           for (int b = 0; b < bs_last_barrier; ++b) {
4944             if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG) {
4945               balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG;
4946             }
4947             KMP_CHECK_UPDATE(balign[b].bb.leaf_kids, 0);
4948           }
4949         }
4950       }
4951 #endif // KMP_NESTED_HOT_TEAMS
4952       team->t.t_nproc = new_nproc;
4953       // TODO???: team->t.t_max_active_levels = new_max_active_levels;
4954       KMP_CHECK_UPDATE(team->t.t_sched.sched, new_icvs->sched.sched);
4955       __kmp_reinitialize_team(team, new_icvs,
4956                               root->r.r_uber_thread->th.th_ident);
4957 
4958       // Update remaining threads
4959       for (f = 0; f < new_nproc; ++f) {
4960         team->t.t_threads[f]->th.th_team_nproc = new_nproc;
4961       }
4962 
4963       // restore the current task state of the master thread: should be the
4964       // implicit task
4965       KF_TRACE(10, ("__kmp_allocate_team: T#%d, this_thread=%p team=%p\n", 0,
4966                     team->t.t_threads[0], team));
4967 
4968       __kmp_push_current_task_to_thread(team->t.t_threads[0], team, 0);
4969 
4970 #ifdef KMP_DEBUG
4971       for (f = 0; f < team->t.t_nproc; f++) {
4972         KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
4973                          team->t.t_threads[f]->th.th_team_nproc ==
4974                              team->t.t_nproc);
4975       }
4976 #endif
4977 
4978       KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
4979 #if KMP_AFFINITY_SUPPORTED
4980       __kmp_partition_places(team);
4981 #endif
4982     } else { // team->t.t_nproc < new_nproc
4983 #if KMP_OS_LINUX && KMP_AFFINITY_SUPPORTED
4984       kmp_affin_mask_t *old_mask;
4985       if (KMP_AFFINITY_CAPABLE()) {
4986         KMP_CPU_ALLOC(old_mask);
4987       }
4988 #endif
4989 
4990       KA_TRACE(20,
4991                ("__kmp_allocate_team: increasing hot team thread count to %d\n",
4992                 new_nproc));
4993 
4994       team->t.t_size_changed = 1;
4995 
4996 #if KMP_NESTED_HOT_TEAMS
4997       int avail_threads = hot_teams[level].hot_team_nth;
4998       if (new_nproc < avail_threads)
4999         avail_threads = new_nproc;
5000       kmp_info_t **other_threads = team->t.t_threads;
5001       for (f = team->t.t_nproc; f < avail_threads; ++f) {
5002         // Adjust barrier data of reserved threads (if any) of the team
5003         // Other data will be set in __kmp_initialize_info() below.
5004         int b;
5005         kmp_balign_t *balign = other_threads[f]->th.th_bar;
5006         for (b = 0; b < bs_last_barrier; ++b) {
5007           balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5008           KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
5009 #if USE_DEBUGGER
5010           balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5011 #endif
5012         }
5013       }
5014       if (hot_teams[level].hot_team_nth >= new_nproc) {
5015         // we have all needed threads in reserve, no need to allocate any
5016         // this only possible in mode 1, cannot have reserved threads in mode 0
5017         KMP_DEBUG_ASSERT(__kmp_hot_teams_mode == 1);
5018         team->t.t_nproc = new_nproc; // just get reserved threads involved
5019       } else {
5020         // we may have some threads in reserve, but not enough
5021         team->t.t_nproc =
5022             hot_teams[level]
5023                 .hot_team_nth; // get reserved threads involved if any
5024         hot_teams[level].hot_team_nth = new_nproc; // adjust hot team max size
5025 #endif // KMP_NESTED_HOT_TEAMS
5026         if (team->t.t_max_nproc < new_nproc) {
5027           /* reallocate larger arrays */
5028           __kmp_reallocate_team_arrays(team, new_nproc);
5029           __kmp_reinitialize_team(team, new_icvs, NULL);
5030         }
5031 
5032 #if KMP_OS_LINUX && KMP_AFFINITY_SUPPORTED
5033         /* Temporarily set full mask for master thread before creation of
5034            workers. The reason is that workers inherit the affinity from master,
5035            so if a lot of workers are created on the single core quickly, they
5036            don't get a chance to set their own affinity for a long time. */
5037         __kmp_set_thread_affinity_mask_full_tmp(old_mask);
5038 #endif
5039 
5040         /* allocate new threads for the hot team */
5041         for (f = team->t.t_nproc; f < new_nproc; f++) {
5042           kmp_info_t *new_worker = __kmp_allocate_thread(root, team, f);
5043           KMP_DEBUG_ASSERT(new_worker);
5044           team->t.t_threads[f] = new_worker;
5045 
5046           KA_TRACE(20,
5047                    ("__kmp_allocate_team: team %d init T#%d arrived: "
5048                     "join=%llu, plain=%llu\n",
5049                     team->t.t_id, __kmp_gtid_from_tid(f, team), team->t.t_id, f,
5050                     team->t.t_bar[bs_forkjoin_barrier].b_arrived,
5051                     team->t.t_bar[bs_plain_barrier].b_arrived));
5052 
5053           { // Initialize barrier data for new threads.
5054             int b;
5055             kmp_balign_t *balign = new_worker->th.th_bar;
5056             for (b = 0; b < bs_last_barrier; ++b) {
5057               balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5058               KMP_DEBUG_ASSERT(balign[b].bb.wait_flag !=
5059                                KMP_BARRIER_PARENT_FLAG);
5060 #if USE_DEBUGGER
5061               balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5062 #endif
5063             }
5064           }
5065         }
5066 
5067 #if KMP_OS_LINUX && KMP_AFFINITY_SUPPORTED
5068         if (KMP_AFFINITY_CAPABLE()) {
5069           /* Restore initial master thread's affinity mask */
5070           __kmp_set_system_affinity(old_mask, TRUE);
5071           KMP_CPU_FREE(old_mask);
5072         }
5073 #endif
5074 #if KMP_NESTED_HOT_TEAMS
5075       } // end of check of t_nproc vs. new_nproc vs. hot_team_nth
5076 #endif // KMP_NESTED_HOT_TEAMS
5077       /* make sure everyone is syncronized */
5078       int old_nproc = team->t.t_nproc; // save old value and use to update only
5079       // new threads below
5080       __kmp_initialize_team(team, new_nproc, new_icvs,
5081                             root->r.r_uber_thread->th.th_ident);
5082 
5083       /* reinitialize the threads */
5084       KMP_DEBUG_ASSERT(team->t.t_nproc == new_nproc);
5085       for (f = 0; f < team->t.t_nproc; ++f)
5086         __kmp_initialize_info(team->t.t_threads[f], team, f,
5087                               __kmp_gtid_from_tid(f, team));
5088 
5089       if (level) { // set th_task_state for new threads in nested hot team
5090         // __kmp_initialize_info() no longer zeroes th_task_state, so we should
5091         // only need to set the th_task_state for the new threads. th_task_state
5092         // for master thread will not be accurate until after this in
5093         // __kmp_fork_call(), so we look to the master's memo_stack to get the
5094         // correct value.
5095         for (f = old_nproc; f < team->t.t_nproc; ++f)
5096           team->t.t_threads[f]->th.th_task_state =
5097               team->t.t_threads[0]->th.th_task_state_memo_stack[level];
5098       } else { // set th_task_state for new threads in non-nested hot team
5099         int old_state =
5100             team->t.t_threads[0]->th.th_task_state; // copy master's state
5101         for (f = old_nproc; f < team->t.t_nproc; ++f)
5102           team->t.t_threads[f]->th.th_task_state = old_state;
5103       }
5104 
5105 #ifdef KMP_DEBUG
5106       for (f = 0; f < team->t.t_nproc; ++f) {
5107         KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
5108                          team->t.t_threads[f]->th.th_team_nproc ==
5109                              team->t.t_nproc);
5110       }
5111 #endif
5112 
5113       KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5114 #if KMP_AFFINITY_SUPPORTED
5115       __kmp_partition_places(team);
5116 #endif
5117     } // Check changes in number of threads
5118 
5119     kmp_info_t *master = team->t.t_threads[0];
5120     if (master->th.th_teams_microtask) {
5121       for (f = 1; f < new_nproc; ++f) {
5122         // propagate teams construct specific info to workers
5123         kmp_info_t *thr = team->t.t_threads[f];
5124         thr->th.th_teams_microtask = master->th.th_teams_microtask;
5125         thr->th.th_teams_level = master->th.th_teams_level;
5126         thr->th.th_teams_size = master->th.th_teams_size;
5127       }
5128     }
5129 #if KMP_NESTED_HOT_TEAMS
5130     if (level) {
5131       // Sync barrier state for nested hot teams, not needed for outermost hot
5132       // team.
5133       for (f = 1; f < new_nproc; ++f) {
5134         kmp_info_t *thr = team->t.t_threads[f];
5135         int b;
5136         kmp_balign_t *balign = thr->th.th_bar;
5137         for (b = 0; b < bs_last_barrier; ++b) {
5138           balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5139           KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
5140 #if USE_DEBUGGER
5141           balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5142 #endif
5143         }
5144       }
5145     }
5146 #endif // KMP_NESTED_HOT_TEAMS
5147 
5148     /* reallocate space for arguments if necessary */
5149     __kmp_alloc_argv_entries(argc, team, TRUE);
5150     KMP_CHECK_UPDATE(team->t.t_argc, argc);
5151     // The hot team re-uses the previous task team,
5152     // if untouched during the previous release->gather phase.
5153 
5154     KF_TRACE(10, (" hot_team = %p\n", team));
5155 
5156 #if KMP_DEBUG
5157     if (__kmp_tasking_mode != tskm_immediate_exec) {
5158       KA_TRACE(20, ("__kmp_allocate_team: hot team task_team[0] = %p "
5159                     "task_team[1] = %p after reinit\n",
5160                     team->t.t_task_team[0], team->t.t_task_team[1]));
5161     }
5162 #endif
5163 
5164 #if OMPT_SUPPORT
5165     __ompt_team_assign_id(team, ompt_parallel_data);
5166 #endif
5167 
5168     KMP_MB();
5169 
5170     return team;
5171   }
5172 
5173   /* next, let's try to take one from the team pool */
5174   KMP_MB();
5175   for (team = CCAST(kmp_team_t *, __kmp_team_pool); (team);) {
5176     /* TODO: consider resizing undersized teams instead of reaping them, now
5177        that we have a resizing mechanism */
5178     if (team->t.t_max_nproc >= max_nproc) {
5179       /* take this team from the team pool */
5180       __kmp_team_pool = team->t.t_next_pool;
5181 
5182       /* setup the team for fresh use */
5183       __kmp_initialize_team(team, new_nproc, new_icvs, NULL);
5184 
5185       KA_TRACE(20, ("__kmp_allocate_team: setting task_team[0] %p and "
5186                     "task_team[1] %p to NULL\n",
5187                     &team->t.t_task_team[0], &team->t.t_task_team[1]));
5188       team->t.t_task_team[0] = NULL;
5189       team->t.t_task_team[1] = NULL;
5190 
5191       /* reallocate space for arguments if necessary */
5192       __kmp_alloc_argv_entries(argc, team, TRUE);
5193       KMP_CHECK_UPDATE(team->t.t_argc, argc);
5194 
5195       KA_TRACE(
5196           20, ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n",
5197                team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
5198       { // Initialize barrier data.
5199         int b;
5200         for (b = 0; b < bs_last_barrier; ++b) {
5201           team->t.t_bar[b].b_arrived = KMP_INIT_BARRIER_STATE;
5202 #if USE_DEBUGGER
5203           team->t.t_bar[b].b_master_arrived = 0;
5204           team->t.t_bar[b].b_team_arrived = 0;
5205 #endif
5206         }
5207       }
5208 
5209       team->t.t_proc_bind = new_proc_bind;
5210 
5211       KA_TRACE(20, ("__kmp_allocate_team: using team from pool %d.\n",
5212                     team->t.t_id));
5213 
5214 #if OMPT_SUPPORT
5215       __ompt_team_assign_id(team, ompt_parallel_data);
5216 #endif
5217 
5218       KMP_MB();
5219 
5220       return team;
5221     }
5222 
5223     /* reap team if it is too small, then loop back and check the next one */
5224     // not sure if this is wise, but, will be redone during the hot-teams
5225     // rewrite.
5226     /* TODO: Use technique to find the right size hot-team, don't reap them */
5227     team = __kmp_reap_team(team);
5228     __kmp_team_pool = team;
5229   }
5230 
5231   /* nothing available in the pool, no matter, make a new team! */
5232   KMP_MB();
5233   team = (kmp_team_t *)__kmp_allocate(sizeof(kmp_team_t));
5234 
5235   /* and set it up */
5236   team->t.t_max_nproc = max_nproc;
5237   /* NOTE well, for some reason allocating one big buffer and dividing it up
5238      seems to really hurt performance a lot on the P4, so, let's not use this */
5239   __kmp_allocate_team_arrays(team, max_nproc);
5240 
5241   KA_TRACE(20, ("__kmp_allocate_team: making a new team\n"));
5242   __kmp_initialize_team(team, new_nproc, new_icvs, NULL);
5243 
5244   KA_TRACE(20, ("__kmp_allocate_team: setting task_team[0] %p and task_team[1] "
5245                 "%p to NULL\n",
5246                 &team->t.t_task_team[0], &team->t.t_task_team[1]));
5247   team->t.t_task_team[0] = NULL; // to be removed, as __kmp_allocate zeroes
5248   // memory, no need to duplicate
5249   team->t.t_task_team[1] = NULL; // to be removed, as __kmp_allocate zeroes
5250   // memory, no need to duplicate
5251 
5252   if (__kmp_storage_map) {
5253     __kmp_print_team_storage_map("team", team, team->t.t_id, new_nproc);
5254   }
5255 
5256   /* allocate space for arguments */
5257   __kmp_alloc_argv_entries(argc, team, FALSE);
5258   team->t.t_argc = argc;
5259 
5260   KA_TRACE(20,
5261            ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n",
5262             team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
5263   { // Initialize barrier data.
5264     int b;
5265     for (b = 0; b < bs_last_barrier; ++b) {
5266       team->t.t_bar[b].b_arrived = KMP_INIT_BARRIER_STATE;
5267 #if USE_DEBUGGER
5268       team->t.t_bar[b].b_master_arrived = 0;
5269       team->t.t_bar[b].b_team_arrived = 0;
5270 #endif
5271     }
5272   }
5273 
5274   team->t.t_proc_bind = new_proc_bind;
5275 
5276 #if OMPT_SUPPORT
5277   __ompt_team_assign_id(team, ompt_parallel_data);
5278   team->t.ompt_serialized_team_info = NULL;
5279 #endif
5280 
5281   KMP_MB();
5282 
5283   KA_TRACE(20, ("__kmp_allocate_team: done creating a new team %d.\n",
5284                 team->t.t_id));
5285 
5286   return team;
5287 }
5288 
5289 /* TODO implement hot-teams at all levels */
5290 /* TODO implement lazy thread release on demand (disband request) */
5291 
5292 /* free the team.  return it to the team pool.  release all the threads
5293  * associated with it */
5294 void __kmp_free_team(kmp_root_t *root,
5295                      kmp_team_t *team USE_NESTED_HOT_ARG(kmp_info_t *master)) {
5296   int f;
5297   KA_TRACE(20, ("__kmp_free_team: T#%d freeing team %d\n", __kmp_get_gtid(),
5298                 team->t.t_id));
5299 
5300   /* verify state */
5301   KMP_DEBUG_ASSERT(root);
5302   KMP_DEBUG_ASSERT(team);
5303   KMP_DEBUG_ASSERT(team->t.t_nproc <= team->t.t_max_nproc);
5304   KMP_DEBUG_ASSERT(team->t.t_threads);
5305 
5306   int use_hot_team = team == root->r.r_hot_team;
5307 #if KMP_NESTED_HOT_TEAMS
5308   int level;
5309   kmp_hot_team_ptr_t *hot_teams;
5310   if (master) {
5311     level = team->t.t_active_level - 1;
5312     if (master->th.th_teams_microtask) { // in teams construct?
5313       if (master->th.th_teams_size.nteams > 1) {
5314         ++level; // level was not increased in teams construct for
5315         // team_of_masters
5316       }
5317       if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
5318           master->th.th_teams_level == team->t.t_level) {
5319         ++level; // level was not increased in teams construct for
5320         // team_of_workers before the parallel
5321       } // team->t.t_level will be increased inside parallel
5322     }
5323     hot_teams = master->th.th_hot_teams;
5324     if (level < __kmp_hot_teams_max_level) {
5325       KMP_DEBUG_ASSERT(team == hot_teams[level].hot_team);
5326       use_hot_team = 1;
5327     }
5328   }
5329 #endif // KMP_NESTED_HOT_TEAMS
5330 
5331   /* team is done working */
5332   TCW_SYNC_PTR(team->t.t_pkfn,
5333                NULL); // Important for Debugging Support Library.
5334 #if KMP_OS_WINDOWS
5335   team->t.t_copyin_counter = 0; // init counter for possible reuse
5336 #endif
5337   // Do not reset pointer to parent team to NULL for hot teams.
5338 
5339   /* if we are non-hot team, release our threads */
5340   if (!use_hot_team) {
5341     if (__kmp_tasking_mode != tskm_immediate_exec) {
5342       // Wait for threads to reach reapable state
5343       for (f = 1; f < team->t.t_nproc; ++f) {
5344         KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5345         kmp_info_t *th = team->t.t_threads[f];
5346         volatile kmp_uint32 *state = &th->th.th_reap_state;
5347         while (*state != KMP_SAFE_TO_REAP) {
5348 #if KMP_OS_WINDOWS
5349           // On Windows a thread can be killed at any time, check this
5350           DWORD ecode;
5351           if (!__kmp_is_thread_alive(th, &ecode)) {
5352             *state = KMP_SAFE_TO_REAP; // reset the flag for dead thread
5353             break;
5354           }
5355 #endif
5356           // first check if thread is sleeping
5357           kmp_flag_64 fl(&th->th.th_bar[bs_forkjoin_barrier].bb.b_go, th);
5358           if (fl.is_sleeping())
5359             fl.resume(__kmp_gtid_from_thread(th));
5360           KMP_CPU_PAUSE();
5361         }
5362       }
5363 
5364       // Delete task teams
5365       int tt_idx;
5366       for (tt_idx = 0; tt_idx < 2; ++tt_idx) {
5367         kmp_task_team_t *task_team = team->t.t_task_team[tt_idx];
5368         if (task_team != NULL) {
5369           for (f = 0; f < team->t.t_nproc; ++f) { // threads unref task teams
5370             KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5371             team->t.t_threads[f]->th.th_task_team = NULL;
5372           }
5373           KA_TRACE(
5374               20,
5375               ("__kmp_free_team: T#%d deactivating task_team %p on team %d\n",
5376                __kmp_get_gtid(), task_team, team->t.t_id));
5377 #if KMP_NESTED_HOT_TEAMS
5378           __kmp_free_task_team(master, task_team);
5379 #endif
5380           team->t.t_task_team[tt_idx] = NULL;
5381         }
5382       }
5383     }
5384 
5385     // Reset pointer to parent team only for non-hot teams.
5386     team->t.t_parent = NULL;
5387     team->t.t_level = 0;
5388     team->t.t_active_level = 0;
5389 
5390     /* free the worker threads */
5391     for (f = 1; f < team->t.t_nproc; ++f) {
5392       KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5393       __kmp_free_thread(team->t.t_threads[f]);
5394       team->t.t_threads[f] = NULL;
5395     }
5396 
5397     /* put the team back in the team pool */
5398     /* TODO limit size of team pool, call reap_team if pool too large */
5399     team->t.t_next_pool = CCAST(kmp_team_t *, __kmp_team_pool);
5400     __kmp_team_pool = (volatile kmp_team_t *)team;
5401   } else { // Check if team was created for the masters in a teams construct
5402     // See if first worker is a CG root
5403     KMP_DEBUG_ASSERT(team->t.t_threads[1] &&
5404                      team->t.t_threads[1]->th.th_cg_roots);
5405     if (team->t.t_threads[1]->th.th_cg_roots->cg_root == team->t.t_threads[1]) {
5406       // Clean up the CG root nodes on workers so that this team can be re-used
5407       for (f = 1; f < team->t.t_nproc; ++f) {
5408         kmp_info_t *thr = team->t.t_threads[f];
5409         KMP_DEBUG_ASSERT(thr && thr->th.th_cg_roots &&
5410                          thr->th.th_cg_roots->cg_root == thr);
5411         // Pop current CG root off list
5412         kmp_cg_root_t *tmp = thr->th.th_cg_roots;
5413         thr->th.th_cg_roots = tmp->up;
5414         KA_TRACE(100, ("__kmp_free_team: Thread %p popping node %p and moving"
5415                        " up to node %p. cg_nthreads was %d\n",
5416                        thr, tmp, thr->th.th_cg_roots, tmp->cg_nthreads));
5417         int i = tmp->cg_nthreads--;
5418         if (i == 1) {
5419           __kmp_free(tmp); // free CG if we are the last thread in it
5420         }
5421         // Restore current task's thread_limit from CG root
5422         if (thr->th.th_cg_roots)
5423           thr->th.th_current_task->td_icvs.thread_limit =
5424               thr->th.th_cg_roots->cg_thread_limit;
5425       }
5426     }
5427   }
5428 
5429   KMP_MB();
5430 }
5431 
5432 /* reap the team.  destroy it, reclaim all its resources and free its memory */
5433 kmp_team_t *__kmp_reap_team(kmp_team_t *team) {
5434   kmp_team_t *next_pool = team->t.t_next_pool;
5435 
5436   KMP_DEBUG_ASSERT(team);
5437   KMP_DEBUG_ASSERT(team->t.t_dispatch);
5438   KMP_DEBUG_ASSERT(team->t.t_disp_buffer);
5439   KMP_DEBUG_ASSERT(team->t.t_threads);
5440   KMP_DEBUG_ASSERT(team->t.t_argv);
5441 
5442   /* TODO clean the threads that are a part of this? */
5443 
5444   /* free stuff */
5445   __kmp_free_team_arrays(team);
5446   if (team->t.t_argv != &team->t.t_inline_argv[0])
5447     __kmp_free((void *)team->t.t_argv);
5448   __kmp_free(team);
5449 
5450   KMP_MB();
5451   return next_pool;
5452 }
5453 
5454 // Free the thread.  Don't reap it, just place it on the pool of available
5455 // threads.
5456 //
5457 // Changes for Quad issue 527845: We need a predictable OMP tid <-> gtid
5458 // binding for the affinity mechanism to be useful.
5459 //
5460 // Now, we always keep the free list (__kmp_thread_pool) sorted by gtid.
5461 // However, we want to avoid a potential performance problem by always
5462 // scanning through the list to find the correct point at which to insert
5463 // the thread (potential N**2 behavior).  To do this we keep track of the
5464 // last place a thread struct was inserted (__kmp_thread_pool_insert_pt).
5465 // With single-level parallelism, threads will always be added to the tail
5466 // of the list, kept track of by __kmp_thread_pool_insert_pt.  With nested
5467 // parallelism, all bets are off and we may need to scan through the entire
5468 // free list.
5469 //
5470 // This change also has a potentially large performance benefit, for some
5471 // applications.  Previously, as threads were freed from the hot team, they
5472 // would be placed back on the free list in inverse order.  If the hot team
5473 // grew back to it's original size, then the freed thread would be placed
5474 // back on the hot team in reverse order.  This could cause bad cache
5475 // locality problems on programs where the size of the hot team regularly
5476 // grew and shrunk.
5477 //
5478 // Now, for single-level parallelism, the OMP tid is alway == gtid.
5479 void __kmp_free_thread(kmp_info_t *this_th) {
5480   int gtid;
5481   kmp_info_t **scan;
5482 
5483   KA_TRACE(20, ("__kmp_free_thread: T#%d putting T#%d back on free pool.\n",
5484                 __kmp_get_gtid(), this_th->th.th_info.ds.ds_gtid));
5485 
5486   KMP_DEBUG_ASSERT(this_th);
5487 
5488   // When moving thread to pool, switch thread to wait on own b_go flag, and
5489   // uninitialized (NULL team).
5490   int b;
5491   kmp_balign_t *balign = this_th->th.th_bar;
5492   for (b = 0; b < bs_last_barrier; ++b) {
5493     if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG)
5494       balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG;
5495     balign[b].bb.team = NULL;
5496     balign[b].bb.leaf_kids = 0;
5497   }
5498   this_th->th.th_task_state = 0;
5499   this_th->th.th_reap_state = KMP_SAFE_TO_REAP;
5500 
5501   /* put thread back on the free pool */
5502   TCW_PTR(this_th->th.th_team, NULL);
5503   TCW_PTR(this_th->th.th_root, NULL);
5504   TCW_PTR(this_th->th.th_dispatch, NULL); /* NOT NEEDED */
5505 
5506   while (this_th->th.th_cg_roots) {
5507     this_th->th.th_cg_roots->cg_nthreads--;
5508     KA_TRACE(100, ("__kmp_free_thread: Thread %p decrement cg_nthreads on node"
5509                    " %p of thread  %p to %d\n",
5510                    this_th, this_th->th.th_cg_roots,
5511                    this_th->th.th_cg_roots->cg_root,
5512                    this_th->th.th_cg_roots->cg_nthreads));
5513     kmp_cg_root_t *tmp = this_th->th.th_cg_roots;
5514     if (tmp->cg_root == this_th) { // Thread is a cg_root
5515       KMP_DEBUG_ASSERT(tmp->cg_nthreads == 0);
5516       KA_TRACE(
5517           5, ("__kmp_free_thread: Thread %p freeing node %p\n", this_th, tmp));
5518       this_th->th.th_cg_roots = tmp->up;
5519       __kmp_free(tmp);
5520     } else { // Worker thread
5521       if (tmp->cg_nthreads == 0) { // last thread leaves contention group
5522         __kmp_free(tmp);
5523       }
5524       this_th->th.th_cg_roots = NULL;
5525       break;
5526     }
5527   }
5528 
5529   /* If the implicit task assigned to this thread can be used by other threads
5530    * -> multiple threads can share the data and try to free the task at
5531    * __kmp_reap_thread at exit. This duplicate use of the task data can happen
5532    * with higher probability when hot team is disabled but can occurs even when
5533    * the hot team is enabled */
5534   __kmp_free_implicit_task(this_th);
5535   this_th->th.th_current_task = NULL;
5536 
5537   // If the __kmp_thread_pool_insert_pt is already past the new insert
5538   // point, then we need to re-scan the entire list.
5539   gtid = this_th->th.th_info.ds.ds_gtid;
5540   if (__kmp_thread_pool_insert_pt != NULL) {
5541     KMP_DEBUG_ASSERT(__kmp_thread_pool != NULL);
5542     if (__kmp_thread_pool_insert_pt->th.th_info.ds.ds_gtid > gtid) {
5543       __kmp_thread_pool_insert_pt = NULL;
5544     }
5545   }
5546 
5547   // Scan down the list to find the place to insert the thread.
5548   // scan is the address of a link in the list, possibly the address of
5549   // __kmp_thread_pool itself.
5550   //
5551   // In the absence of nested parallism, the for loop will have 0 iterations.
5552   if (__kmp_thread_pool_insert_pt != NULL) {
5553     scan = &(__kmp_thread_pool_insert_pt->th.th_next_pool);
5554   } else {
5555     scan = CCAST(kmp_info_t **, &__kmp_thread_pool);
5556   }
5557   for (; (*scan != NULL) && ((*scan)->th.th_info.ds.ds_gtid < gtid);
5558        scan = &((*scan)->th.th_next_pool))
5559     ;
5560 
5561   // Insert the new element on the list, and set __kmp_thread_pool_insert_pt
5562   // to its address.
5563   TCW_PTR(this_th->th.th_next_pool, *scan);
5564   __kmp_thread_pool_insert_pt = *scan = this_th;
5565   KMP_DEBUG_ASSERT((this_th->th.th_next_pool == NULL) ||
5566                    (this_th->th.th_info.ds.ds_gtid <
5567                     this_th->th.th_next_pool->th.th_info.ds.ds_gtid));
5568   TCW_4(this_th->th.th_in_pool, TRUE);
5569   __kmp_suspend_initialize_thread(this_th);
5570   __kmp_lock_suspend_mx(this_th);
5571   if (this_th->th.th_active == TRUE) {
5572     KMP_ATOMIC_INC(&__kmp_thread_pool_active_nth);
5573     this_th->th.th_active_in_pool = TRUE;
5574   }
5575 #if KMP_DEBUG
5576   else {
5577     KMP_DEBUG_ASSERT(this_th->th.th_active_in_pool == FALSE);
5578   }
5579 #endif
5580   __kmp_unlock_suspend_mx(this_th);
5581 
5582   TCW_4(__kmp_nth, __kmp_nth - 1);
5583 
5584 #ifdef KMP_ADJUST_BLOCKTIME
5585   /* Adjust blocktime back to user setting or default if necessary */
5586   /* Middle initialization might never have occurred                */
5587   if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
5588     KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
5589     if (__kmp_nth <= __kmp_avail_proc) {
5590       __kmp_zero_bt = FALSE;
5591     }
5592   }
5593 #endif /* KMP_ADJUST_BLOCKTIME */
5594 
5595   KMP_MB();
5596 }
5597 
5598 /* ------------------------------------------------------------------------ */
5599 
5600 void *__kmp_launch_thread(kmp_info_t *this_thr) {
5601   int gtid = this_thr->th.th_info.ds.ds_gtid;
5602   /*    void                 *stack_data;*/
5603   kmp_team_t *(*volatile pteam);
5604 
5605   KMP_MB();
5606   KA_TRACE(10, ("__kmp_launch_thread: T#%d start\n", gtid));
5607 
5608   if (__kmp_env_consistency_check) {
5609     this_thr->th.th_cons = __kmp_allocate_cons_stack(gtid); // ATT: Memory leak?
5610   }
5611 
5612 #if OMPT_SUPPORT
5613   ompt_data_t *thread_data;
5614   if (ompt_enabled.enabled) {
5615     thread_data = &(this_thr->th.ompt_thread_info.thread_data);
5616     *thread_data = ompt_data_none;
5617 
5618     this_thr->th.ompt_thread_info.state = ompt_state_overhead;
5619     this_thr->th.ompt_thread_info.wait_id = 0;
5620     this_thr->th.ompt_thread_info.idle_frame = OMPT_GET_FRAME_ADDRESS(0);
5621     if (ompt_enabled.ompt_callback_thread_begin) {
5622       ompt_callbacks.ompt_callback(ompt_callback_thread_begin)(
5623           ompt_thread_worker, thread_data);
5624     }
5625   }
5626 #endif
5627 
5628 #if OMPT_SUPPORT
5629   if (ompt_enabled.enabled) {
5630     this_thr->th.ompt_thread_info.state = ompt_state_idle;
5631   }
5632 #endif
5633   /* This is the place where threads wait for work */
5634   while (!TCR_4(__kmp_global.g.g_done)) {
5635     KMP_DEBUG_ASSERT(this_thr == __kmp_threads[gtid]);
5636     KMP_MB();
5637 
5638     /* wait for work to do */
5639     KA_TRACE(20, ("__kmp_launch_thread: T#%d waiting for work\n", gtid));
5640 
5641     /* No tid yet since not part of a team */
5642     __kmp_fork_barrier(gtid, KMP_GTID_DNE);
5643 
5644 #if OMPT_SUPPORT
5645     if (ompt_enabled.enabled) {
5646       this_thr->th.ompt_thread_info.state = ompt_state_overhead;
5647     }
5648 #endif
5649 
5650     pteam = (kmp_team_t * (*))(&this_thr->th.th_team);
5651 
5652     /* have we been allocated? */
5653     if (TCR_SYNC_PTR(*pteam) && !TCR_4(__kmp_global.g.g_done)) {
5654       /* we were just woken up, so run our new task */
5655       if (TCR_SYNC_PTR((*pteam)->t.t_pkfn) != NULL) {
5656         int rc;
5657         KA_TRACE(20,
5658                  ("__kmp_launch_thread: T#%d(%d:%d) invoke microtask = %p\n",
5659                   gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid),
5660                   (*pteam)->t.t_pkfn));
5661 
5662         updateHWFPControl(*pteam);
5663 
5664 #if OMPT_SUPPORT
5665         if (ompt_enabled.enabled) {
5666           this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
5667         }
5668 #endif
5669 
5670         rc = (*pteam)->t.t_invoke(gtid);
5671         KMP_ASSERT(rc);
5672 
5673         KMP_MB();
5674         KA_TRACE(20, ("__kmp_launch_thread: T#%d(%d:%d) done microtask = %p\n",
5675                       gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid),
5676                       (*pteam)->t.t_pkfn));
5677       }
5678 #if OMPT_SUPPORT
5679       if (ompt_enabled.enabled) {
5680         /* no frame set while outside task */
5681         __ompt_get_task_info_object(0)->frame.exit_frame = ompt_data_none;
5682 
5683         this_thr->th.ompt_thread_info.state = ompt_state_overhead;
5684       }
5685 #endif
5686       /* join barrier after parallel region */
5687       __kmp_join_barrier(gtid);
5688     }
5689   }
5690   TCR_SYNC_PTR((intptr_t)__kmp_global.g.g_done);
5691 
5692 #if OMPT_SUPPORT
5693   if (ompt_enabled.ompt_callback_thread_end) {
5694     ompt_callbacks.ompt_callback(ompt_callback_thread_end)(thread_data);
5695   }
5696 #endif
5697 
5698   this_thr->th.th_task_team = NULL;
5699   /* run the destructors for the threadprivate data for this thread */
5700   __kmp_common_destroy_gtid(gtid);
5701 
5702   KA_TRACE(10, ("__kmp_launch_thread: T#%d done\n", gtid));
5703   KMP_MB();
5704   return this_thr;
5705 }
5706 
5707 /* ------------------------------------------------------------------------ */
5708 
5709 void __kmp_internal_end_dest(void *specific_gtid) {
5710 #if KMP_COMPILER_ICC
5711 #pragma warning(push)
5712 #pragma warning(disable : 810) // conversion from "void *" to "int" may lose
5713 // significant bits
5714 #endif
5715   // Make sure no significant bits are lost
5716   int gtid = (kmp_intptr_t)specific_gtid - 1;
5717 #if KMP_COMPILER_ICC
5718 #pragma warning(pop)
5719 #endif
5720 
5721   KA_TRACE(30, ("__kmp_internal_end_dest: T#%d\n", gtid));
5722   /* NOTE: the gtid is stored as gitd+1 in the thread-local-storage
5723    * this is because 0 is reserved for the nothing-stored case */
5724 
5725   /* josh: One reason for setting the gtid specific data even when it is being
5726      destroyed by pthread is to allow gtid lookup through thread specific data
5727      (__kmp_gtid_get_specific).  Some of the code, especially stat code,
5728      that gets executed in the call to __kmp_internal_end_thread, actually
5729      gets the gtid through the thread specific data.  Setting it here seems
5730      rather inelegant and perhaps wrong, but allows __kmp_internal_end_thread
5731      to run smoothly.
5732      todo: get rid of this after we remove the dependence on
5733      __kmp_gtid_get_specific  */
5734   if (gtid >= 0 && KMP_UBER_GTID(gtid))
5735     __kmp_gtid_set_specific(gtid);
5736 #ifdef KMP_TDATA_GTID
5737   __kmp_gtid = gtid;
5738 #endif
5739   __kmp_internal_end_thread(gtid);
5740 }
5741 
5742 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB
5743 
5744 // 2009-09-08 (lev): It looks the destructor does not work. In simple test cases
5745 // destructors work perfectly, but in real libomp.so I have no evidence it is
5746 // ever called. However, -fini linker option in makefile.mk works fine.
5747 
5748 __attribute__((destructor)) void __kmp_internal_end_dtor(void) {
5749   __kmp_internal_end_atexit();
5750 }
5751 
5752 void __kmp_internal_end_fini(void) { __kmp_internal_end_atexit(); }
5753 
5754 #endif
5755 
/* Shutdown handler: shuts the library down via __kmp_internal_end_library(-1)
   and, on Windows, closes the console afterwards.

   [Windows] josh: when the atexit handler is called, there may still be more
   than one thread alive */
void __kmp_internal_end_atexit(void) {
  KA_TRACE(30, ("__kmp_internal_end_atexit\n"));
  /* [Windows]
     josh: ideally, we want to completely shutdown the library in this atexit
     handler, but stat code that depends on thread specific data for gtid fails
     because that data becomes unavailable at some point during the shutdown, so
     we call __kmp_internal_end_thread instead. We should eventually remove the
     dependency on __kmp_get_specific_gtid in the stat code and use
     __kmp_internal_end_library to cleanly shutdown the library.

     NOTE(review): the paragraph above says __kmp_internal_end_thread is
     called, but the code below calls __kmp_internal_end_library(-1); the
     comment appears to predate the current code -- confirm and update.

     // TODO: Can some of this comment about GVS be removed?
     I suspect that the offending stat code is executed when the calling thread
     tries to clean up a dead root thread's data structures, resulting in GVS
     code trying to close the GVS structures for that thread, but since the stat
     code uses __kmp_get_specific_gtid to get the gtid with the assumption that
     the calling thread is cleaning up itself instead of another thread, it gets
     confused. This happens because allowing a thread to unregister and cleanup
     another thread is a recent modification for addressing an issue.
     Based on the current design (20050722), a thread may end up
     trying to unregister another thread only if thread death does not trigger
     the calling of __kmp_internal_end_thread.  For Linux* OS, there is the
     thread specific data destructor function to detect thread death. For
     Windows dynamic, there is DllMain(THREAD_DETACH). For Windows static, there
     is nothing.  Thus, the workaround is applicable only for Windows static
     stat library. */
  // A negative gtid_req makes __kmp_internal_end_library look the gtid up
  // itself through __kmp_gtid_get_specific().
  __kmp_internal_end_library(-1);
#if KMP_OS_WINDOWS
  __kmp_close_console();
#endif
}
5788 
5789 static void __kmp_reap_thread(kmp_info_t *thread, int is_root) {
5790   // It is assumed __kmp_forkjoin_lock is acquired.
5791 
5792   int gtid;
5793 
5794   KMP_DEBUG_ASSERT(thread != NULL);
5795 
5796   gtid = thread->th.th_info.ds.ds_gtid;
5797 
5798   if (!is_root) {
5799     if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
5800       /* Assume the threads are at the fork barrier here */
5801       KA_TRACE(
5802           20, ("__kmp_reap_thread: releasing T#%d from fork barrier for reap\n",
5803                gtid));
5804       /* Need release fence here to prevent seg faults for tree forkjoin barrier
5805        * (GEH) */
5806       ANNOTATE_HAPPENS_BEFORE(thread);
5807       kmp_flag_64 flag(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go, thread);
5808       __kmp_release_64(&flag);
5809     }
5810 
5811     // Terminate OS thread.
5812     __kmp_reap_worker(thread);
5813 
5814     // The thread was killed asynchronously.  If it was actively
5815     // spinning in the thread pool, decrement the global count.
5816     //
5817     // There is a small timing hole here - if the worker thread was just waking
5818     // up after sleeping in the pool, had reset it's th_active_in_pool flag but
5819     // not decremented the global counter __kmp_thread_pool_active_nth yet, then
5820     // the global counter might not get updated.
5821     //
5822     // Currently, this can only happen as the library is unloaded,
5823     // so there are no harmful side effects.
5824     if (thread->th.th_active_in_pool) {
5825       thread->th.th_active_in_pool = FALSE;
5826       KMP_ATOMIC_DEC(&__kmp_thread_pool_active_nth);
5827       KMP_DEBUG_ASSERT(__kmp_thread_pool_active_nth >= 0);
5828     }
5829   }
5830 
5831   __kmp_free_implicit_task(thread);
5832 
5833 // Free the fast memory for tasking
5834 #if USE_FAST_MEMORY
5835   __kmp_free_fast_memory(thread);
5836 #endif /* USE_FAST_MEMORY */
5837 
5838   __kmp_suspend_uninitialize_thread(thread);
5839 
5840   KMP_DEBUG_ASSERT(__kmp_threads[gtid] == thread);
5841   TCW_SYNC_PTR(__kmp_threads[gtid], NULL);
5842 
5843   --__kmp_all_nth;
5844 // __kmp_nth was decremented when thread is added to the pool.
5845 
5846 #ifdef KMP_ADJUST_BLOCKTIME
5847   /* Adjust blocktime back to user setting or default if necessary */
5848   /* Middle initialization might never have occurred                */
5849   if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
5850     KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
5851     if (__kmp_nth <= __kmp_avail_proc) {
5852       __kmp_zero_bt = FALSE;
5853     }
5854   }
5855 #endif /* KMP_ADJUST_BLOCKTIME */
5856 
5857   /* free the memory being used */
5858   if (__kmp_env_consistency_check) {
5859     if (thread->th.th_cons) {
5860       __kmp_free_cons_stack(thread->th.th_cons);
5861       thread->th.th_cons = NULL;
5862     }
5863   }
5864 
5865   if (thread->th.th_pri_common != NULL) {
5866     __kmp_free(thread->th.th_pri_common);
5867     thread->th.th_pri_common = NULL;
5868   }
5869 
5870   if (thread->th.th_task_state_memo_stack != NULL) {
5871     __kmp_free(thread->th.th_task_state_memo_stack);
5872     thread->th.th_task_state_memo_stack = NULL;
5873   }
5874 
5875 #if KMP_USE_BGET
5876   if (thread->th.th_local.bget_data != NULL) {
5877     __kmp_finalize_bget(thread);
5878   }
5879 #endif
5880 
5881 #if KMP_AFFINITY_SUPPORTED
5882   if (thread->th.th_affin_mask != NULL) {
5883     KMP_CPU_FREE(thread->th.th_affin_mask);
5884     thread->th.th_affin_mask = NULL;
5885   }
5886 #endif /* KMP_AFFINITY_SUPPORTED */
5887 
5888 #if KMP_USE_HIER_SCHED
5889   if (thread->th.th_hier_bar_data != NULL) {
5890     __kmp_free(thread->th.th_hier_bar_data);
5891     thread->th.th_hier_bar_data = NULL;
5892   }
5893 #endif
5894 
5895   __kmp_reap_team(thread->th.th_serial_team);
5896   thread->th.th_serial_team = NULL;
5897   __kmp_free(thread);
5898 
5899   KMP_MB();
5900 
5901 } // __kmp_reap_thread
5902 
5903 static void __kmp_internal_end(void) {
5904   int i;
5905 
5906   /* First, unregister the library */
5907   __kmp_unregister_library();
5908 
5909 #if KMP_OS_WINDOWS
5910   /* In Win static library, we can't tell when a root actually dies, so we
5911      reclaim the data structures for any root threads that have died but not
5912      unregistered themselves, in order to shut down cleanly.
5913      In Win dynamic library we also can't tell when a thread dies.  */
5914   __kmp_reclaim_dead_roots(); // AC: moved here to always clean resources of
5915 // dead roots
5916 #endif
5917 
5918   for (i = 0; i < __kmp_threads_capacity; i++)
5919     if (__kmp_root[i])
5920       if (__kmp_root[i]->r.r_active)
5921         break;
5922   KMP_MB(); /* Flush all pending memory write invalidates.  */
5923   TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
5924 
5925   if (i < __kmp_threads_capacity) {
5926 #if KMP_USE_MONITOR
5927     // 2009-09-08 (lev): Other alive roots found. Why do we kill the monitor??
5928     KMP_MB(); /* Flush all pending memory write invalidates.  */
5929 
5930     // Need to check that monitor was initialized before reaping it. If we are
5931     // called form __kmp_atfork_child (which sets __kmp_init_parallel = 0), then
5932     // __kmp_monitor will appear to contain valid data, but it is only valid in
5933     // the parent process, not the child.
5934     // New behavior (201008): instead of keying off of the flag
5935     // __kmp_init_parallel, the monitor thread creation is keyed off
5936     // of the new flag __kmp_init_monitor.
5937     __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
5938     if (TCR_4(__kmp_init_monitor)) {
5939       __kmp_reap_monitor(&__kmp_monitor);
5940       TCW_4(__kmp_init_monitor, 0);
5941     }
5942     __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
5943     KA_TRACE(10, ("__kmp_internal_end: monitor reaped\n"));
5944 #endif // KMP_USE_MONITOR
5945   } else {
5946 /* TODO move this to cleanup code */
5947 #ifdef KMP_DEBUG
5948     /* make sure that everything has properly ended */
5949     for (i = 0; i < __kmp_threads_capacity; i++) {
5950       if (__kmp_root[i]) {
5951         //                    KMP_ASSERT( ! KMP_UBER_GTID( i ) );         // AC:
5952         //                    there can be uber threads alive here
5953         KMP_ASSERT(!__kmp_root[i]->r.r_active); // TODO: can they be active?
5954       }
5955     }
5956 #endif
5957 
5958     KMP_MB();
5959 
5960     // Reap the worker threads.
5961     // This is valid for now, but be careful if threads are reaped sooner.
5962     while (__kmp_thread_pool != NULL) { // Loop thru all the thread in the pool.
5963       // Get the next thread from the pool.
5964       kmp_info_t *thread = CCAST(kmp_info_t *, __kmp_thread_pool);
5965       __kmp_thread_pool = thread->th.th_next_pool;
5966       // Reap it.
5967       KMP_DEBUG_ASSERT(thread->th.th_reap_state == KMP_SAFE_TO_REAP);
5968       thread->th.th_next_pool = NULL;
5969       thread->th.th_in_pool = FALSE;
5970       __kmp_reap_thread(thread, 0);
5971     }
5972     __kmp_thread_pool_insert_pt = NULL;
5973 
5974     // Reap teams.
5975     while (__kmp_team_pool != NULL) { // Loop thru all the teams in the pool.
5976       // Get the next team from the pool.
5977       kmp_team_t *team = CCAST(kmp_team_t *, __kmp_team_pool);
5978       __kmp_team_pool = team->t.t_next_pool;
5979       // Reap it.
5980       team->t.t_next_pool = NULL;
5981       __kmp_reap_team(team);
5982     }
5983 
5984     __kmp_reap_task_teams();
5985 
5986 #if KMP_OS_UNIX
5987     // Threads that are not reaped should not access any resources since they
5988     // are going to be deallocated soon, so the shutdown sequence should wait
5989     // until all threads either exit the final spin-waiting loop or begin
5990     // sleeping after the given blocktime.
5991     for (i = 0; i < __kmp_threads_capacity; i++) {
5992       kmp_info_t *thr = __kmp_threads[i];
5993       while (thr && KMP_ATOMIC_LD_ACQ(&thr->th.th_blocking))
5994         KMP_CPU_PAUSE();
5995     }
5996 #endif
5997 
5998     for (i = 0; i < __kmp_threads_capacity; ++i) {
5999       // TBD: Add some checking...
6000       // Something like KMP_DEBUG_ASSERT( __kmp_thread[ i ] == NULL );
6001     }
6002 
6003     /* Make sure all threadprivate destructors get run by joining with all
6004        worker threads before resetting this flag */
6005     TCW_SYNC_4(__kmp_init_common, FALSE);
6006 
6007     KA_TRACE(10, ("__kmp_internal_end: all workers reaped\n"));
6008     KMP_MB();
6009 
6010 #if KMP_USE_MONITOR
6011     // See note above: One of the possible fixes for CQ138434 / CQ140126
6012     //
6013     // FIXME: push both code fragments down and CSE them?
6014     // push them into __kmp_cleanup() ?
6015     __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
6016     if (TCR_4(__kmp_init_monitor)) {
6017       __kmp_reap_monitor(&__kmp_monitor);
6018       TCW_4(__kmp_init_monitor, 0);
6019     }
6020     __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
6021     KA_TRACE(10, ("__kmp_internal_end: monitor reaped\n"));
6022 #endif
6023   } /* else !__kmp_global.t_active */
6024   TCW_4(__kmp_init_gtid, FALSE);
6025   KMP_MB(); /* Flush all pending memory write invalidates.  */
6026 
6027   __kmp_cleanup();
6028 #if OMPT_SUPPORT
6029   ompt_fini();
6030 #endif
6031 }
6032 
6033 void __kmp_internal_end_library(int gtid_req) {
6034   /* if we have already cleaned up, don't try again, it wouldn't be pretty */
6035   /* this shouldn't be a race condition because __kmp_internal_end() is the
6036      only place to clear __kmp_serial_init */
6037   /* we'll check this later too, after we get the lock */
6038   // 2009-09-06: We do not set g_abort without setting g_done. This check looks
6039   // redundaant, because the next check will work in any case.
6040   if (__kmp_global.g.g_abort) {
6041     KA_TRACE(11, ("__kmp_internal_end_library: abort, exiting\n"));
6042     /* TODO abort? */
6043     return;
6044   }
6045   if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6046     KA_TRACE(10, ("__kmp_internal_end_library: already finished\n"));
6047     return;
6048   }
6049 
6050   KMP_MB(); /* Flush all pending memory write invalidates.  */
6051 
6052   /* find out who we are and what we should do */
6053   {
6054     int gtid = (gtid_req >= 0) ? gtid_req : __kmp_gtid_get_specific();
6055     KA_TRACE(
6056         10, ("__kmp_internal_end_library: enter T#%d  (%d)\n", gtid, gtid_req));
6057     if (gtid == KMP_GTID_SHUTDOWN) {
6058       KA_TRACE(10, ("__kmp_internal_end_library: !__kmp_init_runtime, system "
6059                     "already shutdown\n"));
6060       return;
6061     } else if (gtid == KMP_GTID_MONITOR) {
6062       KA_TRACE(10, ("__kmp_internal_end_library: monitor thread, gtid not "
6063                     "registered, or system shutdown\n"));
6064       return;
6065     } else if (gtid == KMP_GTID_DNE) {
6066       KA_TRACE(10, ("__kmp_internal_end_library: gtid not registered or system "
6067                     "shutdown\n"));
6068       /* we don't know who we are, but we may still shutdown the library */
6069     } else if (KMP_UBER_GTID(gtid)) {
6070       /* unregister ourselves as an uber thread.  gtid is no longer valid */
6071       if (__kmp_root[gtid]->r.r_active) {
6072         __kmp_global.g.g_abort = -1;
6073         TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6074         KA_TRACE(10,
6075                  ("__kmp_internal_end_library: root still active, abort T#%d\n",
6076                   gtid));
6077         return;
6078       } else {
6079         KA_TRACE(
6080             10,
6081             ("__kmp_internal_end_library: unregistering sibling T#%d\n", gtid));
6082         __kmp_unregister_root_current_thread(gtid);
6083       }
6084     } else {
6085 /* worker threads may call this function through the atexit handler, if they
6086  * call exit() */
6087 /* For now, skip the usual subsequent processing and just dump the debug buffer.
6088    TODO: do a thorough shutdown instead */
6089 #ifdef DUMP_DEBUG_ON_EXIT
6090       if (__kmp_debug_buf)
6091         __kmp_dump_debug_buffer();
6092 #endif
6093       return;
6094     }
6095   }
6096   /* synchronize the termination process */
6097   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6098 
6099   /* have we already finished */
6100   if (__kmp_global.g.g_abort) {
6101     KA_TRACE(10, ("__kmp_internal_end_library: abort, exiting\n"));
6102     /* TODO abort? */
6103     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6104     return;
6105   }
6106   if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6107     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6108     return;
6109   }
6110 
6111   /* We need this lock to enforce mutex between this reading of
6112      __kmp_threads_capacity and the writing by __kmp_register_root.
6113      Alternatively, we can use a counter of roots that is atomically updated by
6114      __kmp_get_global_thread_id_reg, __kmp_do_serial_initialize and
6115      __kmp_internal_end_*.  */
6116   __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
6117 
6118   /* now we can safely conduct the actual termination */
6119   __kmp_internal_end();
6120 
6121   __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6122   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6123 
6124   KA_TRACE(10, ("__kmp_internal_end_library: exit\n"));
6125 
6126 #ifdef DUMP_DEBUG_ON_EXIT
6127   if (__kmp_debug_buf)
6128     __kmp_dump_debug_buffer();
6129 #endif
6130 
6131 #if KMP_OS_WINDOWS
6132   __kmp_close_console();
6133 #endif
6134 
6135   __kmp_fini_allocator();
6136 
6137 } // __kmp_internal_end_library
6138 
6139 void __kmp_internal_end_thread(int gtid_req) {
6140   int i;
6141 
6142   /* if we have already cleaned up, don't try again, it wouldn't be pretty */
6143   /* this shouldn't be a race condition because __kmp_internal_end() is the
6144    * only place to clear __kmp_serial_init */
6145   /* we'll check this later too, after we get the lock */
6146   // 2009-09-06: We do not set g_abort without setting g_done. This check looks
6147   // redundant, because the next check will work in any case.
6148   if (__kmp_global.g.g_abort) {
6149     KA_TRACE(11, ("__kmp_internal_end_thread: abort, exiting\n"));
6150     /* TODO abort? */
6151     return;
6152   }
6153   if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6154     KA_TRACE(10, ("__kmp_internal_end_thread: already finished\n"));
6155     return;
6156   }
6157 
6158   KMP_MB(); /* Flush all pending memory write invalidates.  */
6159 
6160   /* find out who we are and what we should do */
6161   {
6162     int gtid = (gtid_req >= 0) ? gtid_req : __kmp_gtid_get_specific();
6163     KA_TRACE(10,
6164              ("__kmp_internal_end_thread: enter T#%d  (%d)\n", gtid, gtid_req));
6165     if (gtid == KMP_GTID_SHUTDOWN) {
6166       KA_TRACE(10, ("__kmp_internal_end_thread: !__kmp_init_runtime, system "
6167                     "already shutdown\n"));
6168       return;
6169     } else if (gtid == KMP_GTID_MONITOR) {
6170       KA_TRACE(10, ("__kmp_internal_end_thread: monitor thread, gtid not "
6171                     "registered, or system shutdown\n"));
6172       return;
6173     } else if (gtid == KMP_GTID_DNE) {
6174       KA_TRACE(10, ("__kmp_internal_end_thread: gtid not registered or system "
6175                     "shutdown\n"));
6176       return;
6177       /* we don't know who we are */
6178     } else if (KMP_UBER_GTID(gtid)) {
6179       /* unregister ourselves as an uber thread.  gtid is no longer valid */
6180       if (__kmp_root[gtid]->r.r_active) {
6181         __kmp_global.g.g_abort = -1;
6182         TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6183         KA_TRACE(10,
6184                  ("__kmp_internal_end_thread: root still active, abort T#%d\n",
6185                   gtid));
6186         return;
6187       } else {
6188         KA_TRACE(10, ("__kmp_internal_end_thread: unregistering sibling T#%d\n",
6189                       gtid));
6190         __kmp_unregister_root_current_thread(gtid);
6191       }
6192     } else {
6193       /* just a worker thread, let's leave */
6194       KA_TRACE(10, ("__kmp_internal_end_thread: worker thread T#%d\n", gtid));
6195 
6196       if (gtid >= 0) {
6197         __kmp_threads[gtid]->th.th_task_team = NULL;
6198       }
6199 
6200       KA_TRACE(10,
6201                ("__kmp_internal_end_thread: worker thread done, exiting T#%d\n",
6202                 gtid));
6203       return;
6204     }
6205   }
6206 #if KMP_DYNAMIC_LIB
6207   if (__kmp_pause_status != kmp_hard_paused)
6208   // AC: lets not shutdown the dynamic library at the exit of uber thread,
6209   // because we will better shutdown later in the library destructor.
6210   {
6211     KA_TRACE(10, ("__kmp_internal_end_thread: exiting T#%d\n", gtid_req));
6212     return;
6213   }
6214 #endif
6215   /* synchronize the termination process */
6216   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6217 
6218   /* have we already finished */
6219   if (__kmp_global.g.g_abort) {
6220     KA_TRACE(10, ("__kmp_internal_end_thread: abort, exiting\n"));
6221     /* TODO abort? */
6222     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6223     return;
6224   }
6225   if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6226     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6227     return;
6228   }
6229 
6230   /* We need this lock to enforce mutex between this reading of
6231      __kmp_threads_capacity and the writing by __kmp_register_root.
6232      Alternatively, we can use a counter of roots that is atomically updated by
6233      __kmp_get_global_thread_id_reg, __kmp_do_serial_initialize and
6234      __kmp_internal_end_*.  */
6235 
6236   /* should we finish the run-time?  are all siblings done? */
6237   __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
6238 
6239   for (i = 0; i < __kmp_threads_capacity; ++i) {
6240     if (KMP_UBER_GTID(i)) {
6241       KA_TRACE(
6242           10,
6243           ("__kmp_internal_end_thread: remaining sibling task: gtid==%d\n", i));
6244       __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6245       __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6246       return;
6247     }
6248   }
6249 
6250   /* now we can safely conduct the actual termination */
6251 
6252   __kmp_internal_end();
6253 
6254   __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6255   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6256 
6257   KA_TRACE(10, ("__kmp_internal_end_thread: exit T#%d\n", gtid_req));
6258 
6259 #ifdef DUMP_DEBUG_ON_EXIT
6260   if (__kmp_debug_buf)
6261     __kmp_dump_debug_buffer();
6262 #endif
6263 } // __kmp_internal_end_thread
6264 
6265 // -----------------------------------------------------------------------------
6266 // Library registration stuff.
6267 
6268 static long __kmp_registration_flag = 0;
6269 // Random value used to indicate library initialization.
6270 static char *__kmp_registration_str = NULL;
6271 // Value to be saved in env var __KMP_REGISTERED_LIB_<pid>.
6272 
6273 static inline char *__kmp_reg_status_name() {
6274   /* On RHEL 3u5 if linked statically, getpid() returns different values in
6275      each thread. If registration and unregistration go in different threads
6276      (omp_misc_other_root_exit.cpp test case), the name of registered_lib_env
6277      env var can not be found, because the name will contain different pid. */
6278   return __kmp_str_format("__KMP_REGISTERED_LIB_%d", (int)getpid());
6279 } // __kmp_reg_status_get
6280 
6281 void __kmp_register_library_startup(void) {
6282 
6283   char *name = __kmp_reg_status_name(); // Name of the environment variable.
6284   int done = 0;
6285   union {
6286     double dtime;
6287     long ltime;
6288   } time;
6289 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
6290   __kmp_initialize_system_tick();
6291 #endif
6292   __kmp_read_system_time(&time.dtime);
6293   __kmp_registration_flag = 0xCAFE0000L | (time.ltime & 0x0000FFFFL);
6294   __kmp_registration_str =
6295       __kmp_str_format("%p-%lx-%s", &__kmp_registration_flag,
6296                        __kmp_registration_flag, KMP_LIBRARY_FILE);
6297 
6298   KA_TRACE(50, ("__kmp_register_library_startup: %s=\"%s\"\n", name,
6299                 __kmp_registration_str));
6300 
6301   while (!done) {
6302 
6303     char *value = NULL; // Actual value of the environment variable.
6304 
6305     // Set environment variable, but do not overwrite if it is exist.
6306     __kmp_env_set(name, __kmp_registration_str, 0);
6307     // Check the variable is written.
6308     value = __kmp_env_get(name);
6309     if (value != NULL && strcmp(value, __kmp_registration_str) == 0) {
6310 
6311       done = 1; // Ok, environment variable set successfully, exit the loop.
6312 
6313     } else {
6314 
6315       // Oops. Write failed. Another copy of OpenMP RTL is in memory.
6316       // Check whether it alive or dead.
6317       int neighbor = 0; // 0 -- unknown status, 1 -- alive, 2 -- dead.
6318       char *tail = value;
6319       char *flag_addr_str = NULL;
6320       char *flag_val_str = NULL;
6321       char const *file_name = NULL;
6322       __kmp_str_split(tail, '-', &flag_addr_str, &tail);
6323       __kmp_str_split(tail, '-', &flag_val_str, &tail);
6324       file_name = tail;
6325       if (tail != NULL) {
6326         long *flag_addr = 0;
6327         long flag_val = 0;
6328         KMP_SSCANF(flag_addr_str, "%p", RCAST(void**, &flag_addr));
6329         KMP_SSCANF(flag_val_str, "%lx", &flag_val);
6330         if (flag_addr != 0 && flag_val != 0 && strcmp(file_name, "") != 0) {
6331           // First, check whether environment-encoded address is mapped into
6332           // addr space.
6333           // If so, dereference it to see if it still has the right value.
6334           if (__kmp_is_address_mapped(flag_addr) && *flag_addr == flag_val) {
6335             neighbor = 1;
6336           } else {
6337             // If not, then we know the other copy of the library is no longer
6338             // running.
6339             neighbor = 2;
6340           }
6341         }
6342       }
6343       switch (neighbor) {
6344       case 0: // Cannot parse environment variable -- neighbor status unknown.
6345         // Assume it is the incompatible format of future version of the
6346         // library. Assume the other library is alive.
6347         // WARN( ... ); // TODO: Issue a warning.
6348         file_name = "unknown library";
6349         KMP_FALLTHROUGH();
6350       // Attention! Falling to the next case. That's intentional.
6351       case 1: { // Neighbor is alive.
6352         // Check it is allowed.
6353         char *duplicate_ok = __kmp_env_get("KMP_DUPLICATE_LIB_OK");
6354         if (!__kmp_str_match_true(duplicate_ok)) {
6355           // That's not allowed. Issue fatal error.
6356           __kmp_fatal(KMP_MSG(DuplicateLibrary, KMP_LIBRARY_FILE, file_name),
6357                       KMP_HNT(DuplicateLibrary), __kmp_msg_null);
6358         }
6359         KMP_INTERNAL_FREE(duplicate_ok);
6360         __kmp_duplicate_library_ok = 1;
6361         done = 1; // Exit the loop.
6362       } break;
6363       case 2: { // Neighbor is dead.
6364         // Clear the variable and try to register library again.
6365         __kmp_env_unset(name);
6366       } break;
6367       default: { KMP_DEBUG_ASSERT(0); } break;
6368       }
6369     }
6370     KMP_INTERNAL_FREE((void *)value);
6371   }
6372   KMP_INTERNAL_FREE((void *)name);
6373 
6374 } // func __kmp_register_library_startup
6375 
6376 void __kmp_unregister_library(void) {
6377 
6378   char *name = __kmp_reg_status_name();
6379   char *value = __kmp_env_get(name);
6380 
6381   KMP_DEBUG_ASSERT(__kmp_registration_flag != 0);
6382   KMP_DEBUG_ASSERT(__kmp_registration_str != NULL);
6383   if (value != NULL && strcmp(value, __kmp_registration_str) == 0) {
6384     // Ok, this is our variable. Delete it.
6385     __kmp_env_unset(name);
6386   }
6387 
6388   KMP_INTERNAL_FREE(__kmp_registration_str);
6389   KMP_INTERNAL_FREE(value);
6390   KMP_INTERNAL_FREE(name);
6391 
6392   __kmp_registration_flag = 0;
6393   __kmp_registration_str = NULL;
6394 
6395 } // __kmp_unregister_library
6396 
6397 // End of Library registration stuff.
6398 // -----------------------------------------------------------------------------
6399 
6400 #if KMP_MIC_SUPPORTED
6401 
6402 static void __kmp_check_mic_type() {
6403   kmp_cpuid_t cpuid_state = {0};
6404   kmp_cpuid_t *cs_p = &cpuid_state;
6405   __kmp_x86_cpuid(1, 0, cs_p);
6406   // We don't support mic1 at the moment
6407   if ((cs_p->eax & 0xff0) == 0xB10) {
6408     __kmp_mic_type = mic2;
6409   } else if ((cs_p->eax & 0xf0ff0) == 0x50670) {
6410     __kmp_mic_type = mic3;
6411   } else {
6412     __kmp_mic_type = non_mic;
6413   }
6414 }
6415 
6416 #endif /* KMP_MIC_SUPPORTED */
6417 
6418 static void __kmp_do_serial_initialize(void) {
6419   int i, gtid;
6420   int size;
6421 
6422   KA_TRACE(10, ("__kmp_do_serial_initialize: enter\n"));
6423 
6424   KMP_DEBUG_ASSERT(sizeof(kmp_int32) == 4);
6425   KMP_DEBUG_ASSERT(sizeof(kmp_uint32) == 4);
6426   KMP_DEBUG_ASSERT(sizeof(kmp_int64) == 8);
6427   KMP_DEBUG_ASSERT(sizeof(kmp_uint64) == 8);
6428   KMP_DEBUG_ASSERT(sizeof(kmp_intptr_t) == sizeof(void *));
6429 
6430 #if OMPT_SUPPORT
6431   ompt_pre_init();
6432 #endif
6433 
6434   __kmp_validate_locks();
6435 
6436   /* Initialize internal memory allocator */
6437   __kmp_init_allocator();
6438 
6439   /* Register the library startup via an environment variable and check to see
6440      whether another copy of the library is already registered. */
6441 
6442   __kmp_register_library_startup();
6443 
6444   /* TODO reinitialization of library */
6445   if (TCR_4(__kmp_global.g.g_done)) {
6446     KA_TRACE(10, ("__kmp_do_serial_initialize: reinitialization of library\n"));
6447   }
6448 
6449   __kmp_global.g.g_abort = 0;
6450   TCW_SYNC_4(__kmp_global.g.g_done, FALSE);
6451 
6452 /* initialize the locks */
6453 #if KMP_USE_ADAPTIVE_LOCKS
6454 #if KMP_DEBUG_ADAPTIVE_LOCKS
6455   __kmp_init_speculative_stats();
6456 #endif
6457 #endif
6458 #if KMP_STATS_ENABLED
6459   __kmp_stats_init();
6460 #endif
6461   __kmp_init_lock(&__kmp_global_lock);
6462   __kmp_init_queuing_lock(&__kmp_dispatch_lock);
6463   __kmp_init_lock(&__kmp_debug_lock);
6464   __kmp_init_atomic_lock(&__kmp_atomic_lock);
6465   __kmp_init_atomic_lock(&__kmp_atomic_lock_1i);
6466   __kmp_init_atomic_lock(&__kmp_atomic_lock_2i);
6467   __kmp_init_atomic_lock(&__kmp_atomic_lock_4i);
6468   __kmp_init_atomic_lock(&__kmp_atomic_lock_4r);
6469   __kmp_init_atomic_lock(&__kmp_atomic_lock_8i);
6470   __kmp_init_atomic_lock(&__kmp_atomic_lock_8r);
6471   __kmp_init_atomic_lock(&__kmp_atomic_lock_8c);
6472   __kmp_init_atomic_lock(&__kmp_atomic_lock_10r);
6473   __kmp_init_atomic_lock(&__kmp_atomic_lock_16r);
6474   __kmp_init_atomic_lock(&__kmp_atomic_lock_16c);
6475   __kmp_init_atomic_lock(&__kmp_atomic_lock_20c);
6476   __kmp_init_atomic_lock(&__kmp_atomic_lock_32c);
6477   __kmp_init_bootstrap_lock(&__kmp_forkjoin_lock);
6478   __kmp_init_bootstrap_lock(&__kmp_exit_lock);
6479 #if KMP_USE_MONITOR
6480   __kmp_init_bootstrap_lock(&__kmp_monitor_lock);
6481 #endif
6482   __kmp_init_bootstrap_lock(&__kmp_tp_cached_lock);
6483 
6484   /* conduct initialization and initial setup of configuration */
6485 
6486   __kmp_runtime_initialize();
6487 
6488 #if KMP_MIC_SUPPORTED
6489   __kmp_check_mic_type();
6490 #endif
6491 
6492 // Some global variable initialization moved here from kmp_env_initialize()
6493 #ifdef KMP_DEBUG
6494   kmp_diag = 0;
6495 #endif
6496   __kmp_abort_delay = 0;
6497 
6498   // From __kmp_init_dflt_team_nth()
6499   /* assume the entire machine will be used */
6500   __kmp_dflt_team_nth_ub = __kmp_xproc;
6501   if (__kmp_dflt_team_nth_ub < KMP_MIN_NTH) {
6502     __kmp_dflt_team_nth_ub = KMP_MIN_NTH;
6503   }
6504   if (__kmp_dflt_team_nth_ub > __kmp_sys_max_nth) {
6505     __kmp_dflt_team_nth_ub = __kmp_sys_max_nth;
6506   }
6507   __kmp_max_nth = __kmp_sys_max_nth;
6508   __kmp_cg_max_nth = __kmp_sys_max_nth;
6509   __kmp_teams_max_nth = __kmp_xproc; // set a "reasonable" default
6510   if (__kmp_teams_max_nth > __kmp_sys_max_nth) {
6511     __kmp_teams_max_nth = __kmp_sys_max_nth;
6512   }
6513 
6514   // Three vars below moved here from __kmp_env_initialize() "KMP_BLOCKTIME"
6515   // part
6516   __kmp_dflt_blocktime = KMP_DEFAULT_BLOCKTIME;
6517 #if KMP_USE_MONITOR
6518   __kmp_monitor_wakeups =
6519       KMP_WAKEUPS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups);
6520   __kmp_bt_intervals =
6521       KMP_INTERVALS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups);
6522 #endif
6523   // From "KMP_LIBRARY" part of __kmp_env_initialize()
6524   __kmp_library = library_throughput;
6525   // From KMP_SCHEDULE initialization
6526   __kmp_static = kmp_sch_static_balanced;
6527 // AC: do not use analytical here, because it is non-monotonous
6528 //__kmp_guided = kmp_sch_guided_iterative_chunked;
6529 //__kmp_auto = kmp_sch_guided_analytical_chunked; // AC: it is the default, no
6530 // need to repeat assignment
6531 // Barrier initialization. Moved here from __kmp_env_initialize() Barrier branch
6532 // bit control and barrier method control parts
6533 #if KMP_FAST_REDUCTION_BARRIER
6534 #define kmp_reduction_barrier_gather_bb ((int)1)
6535 #define kmp_reduction_barrier_release_bb ((int)1)
6536 #define kmp_reduction_barrier_gather_pat bp_hyper_bar
6537 #define kmp_reduction_barrier_release_pat bp_hyper_bar
6538 #endif // KMP_FAST_REDUCTION_BARRIER
6539   for (i = bs_plain_barrier; i < bs_last_barrier; i++) {
6540     __kmp_barrier_gather_branch_bits[i] = __kmp_barrier_gather_bb_dflt;
6541     __kmp_barrier_release_branch_bits[i] = __kmp_barrier_release_bb_dflt;
6542     __kmp_barrier_gather_pattern[i] = __kmp_barrier_gather_pat_dflt;
6543     __kmp_barrier_release_pattern[i] = __kmp_barrier_release_pat_dflt;
6544 #if KMP_FAST_REDUCTION_BARRIER
6545     if (i == bs_reduction_barrier) { // tested and confirmed on ALTIX only (
6546       // lin_64 ): hyper,1
6547       __kmp_barrier_gather_branch_bits[i] = kmp_reduction_barrier_gather_bb;
6548       __kmp_barrier_release_branch_bits[i] = kmp_reduction_barrier_release_bb;
6549       __kmp_barrier_gather_pattern[i] = kmp_reduction_barrier_gather_pat;
6550       __kmp_barrier_release_pattern[i] = kmp_reduction_barrier_release_pat;
6551     }
6552 #endif // KMP_FAST_REDUCTION_BARRIER
6553   }
6554 #if KMP_FAST_REDUCTION_BARRIER
6555 #undef kmp_reduction_barrier_release_pat
6556 #undef kmp_reduction_barrier_gather_pat
6557 #undef kmp_reduction_barrier_release_bb
6558 #undef kmp_reduction_barrier_gather_bb
6559 #endif // KMP_FAST_REDUCTION_BARRIER
6560 #if KMP_MIC_SUPPORTED
6561   if (__kmp_mic_type == mic2) { // KNC
6562     // AC: plane=3,2, forkjoin=2,1 are optimal for 240 threads on KNC
6563     __kmp_barrier_gather_branch_bits[bs_plain_barrier] = 3; // plain gather
6564     __kmp_barrier_release_branch_bits[bs_forkjoin_barrier] =
6565         1; // forkjoin release
6566     __kmp_barrier_gather_pattern[bs_forkjoin_barrier] = bp_hierarchical_bar;
6567     __kmp_barrier_release_pattern[bs_forkjoin_barrier] = bp_hierarchical_bar;
6568   }
6569 #if KMP_FAST_REDUCTION_BARRIER
6570   if (__kmp_mic_type == mic2) { // KNC
6571     __kmp_barrier_gather_pattern[bs_reduction_barrier] = bp_hierarchical_bar;
6572     __kmp_barrier_release_pattern[bs_reduction_barrier] = bp_hierarchical_bar;
6573   }
6574 #endif // KMP_FAST_REDUCTION_BARRIER
6575 #endif // KMP_MIC_SUPPORTED
6576 
6577 // From KMP_CHECKS initialization
6578 #ifdef KMP_DEBUG
6579   __kmp_env_checks = TRUE; /* development versions have the extra checks */
6580 #else
6581   __kmp_env_checks = FALSE; /* port versions do not have the extra checks */
6582 #endif
6583 
6584   // From "KMP_FOREIGN_THREADS_THREADPRIVATE" initialization
6585   __kmp_foreign_tp = TRUE;
6586 
6587   __kmp_global.g.g_dynamic = FALSE;
6588   __kmp_global.g.g_dynamic_mode = dynamic_default;
6589 
6590   __kmp_env_initialize(NULL);
6591 
6592 // Print all messages in message catalog for testing purposes.
6593 #ifdef KMP_DEBUG
6594   char const *val = __kmp_env_get("KMP_DUMP_CATALOG");
6595   if (__kmp_str_match_true(val)) {
6596     kmp_str_buf_t buffer;
6597     __kmp_str_buf_init(&buffer);
6598     __kmp_i18n_dump_catalog(&buffer);
6599     __kmp_printf("%s", buffer.str);
6600     __kmp_str_buf_free(&buffer);
6601   }
6602   __kmp_env_free(&val);
6603 #endif
6604 
6605   __kmp_threads_capacity =
6606       __kmp_initial_threads_capacity(__kmp_dflt_team_nth_ub);
6607   // Moved here from __kmp_env_initialize() "KMP_ALL_THREADPRIVATE" part
6608   __kmp_tp_capacity = __kmp_default_tp_capacity(
6609       __kmp_dflt_team_nth_ub, __kmp_max_nth, __kmp_allThreadsSpecified);
6610 
6611   // If the library is shut down properly, both pools must be NULL. Just in
6612   // case, set them to NULL -- some memory may leak, but subsequent code will
6613   // work even if pools are not freed.
6614   KMP_DEBUG_ASSERT(__kmp_thread_pool == NULL);
6615   KMP_DEBUG_ASSERT(__kmp_thread_pool_insert_pt == NULL);
6616   KMP_DEBUG_ASSERT(__kmp_team_pool == NULL);
6617   __kmp_thread_pool = NULL;
6618   __kmp_thread_pool_insert_pt = NULL;
6619   __kmp_team_pool = NULL;
6620 
6621   /* Allocate all of the variable sized records */
6622   /* NOTE: __kmp_threads_capacity entries are allocated, but the arrays are
6623    * expandable */
6624   /* Since allocation is cache-aligned, just add extra padding at the end */
6625   size =
6626       (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * __kmp_threads_capacity +
6627       CACHE_LINE;
6628   __kmp_threads = (kmp_info_t **)__kmp_allocate(size);
6629   __kmp_root = (kmp_root_t **)((char *)__kmp_threads +
6630                                sizeof(kmp_info_t *) * __kmp_threads_capacity);
6631 
6632   /* init thread counts */
6633   KMP_DEBUG_ASSERT(__kmp_all_nth ==
6634                    0); // Asserts fail if the library is reinitializing and
6635   KMP_DEBUG_ASSERT(__kmp_nth == 0); // something was wrong in termination.
6636   __kmp_all_nth = 0;
6637   __kmp_nth = 0;
6638 
6639   /* setup the uber master thread and hierarchy */
6640   gtid = __kmp_register_root(TRUE);
6641   KA_TRACE(10, ("__kmp_do_serial_initialize  T#%d\n", gtid));
6642   KMP_ASSERT(KMP_UBER_GTID(gtid));
6643   KMP_ASSERT(KMP_INITIAL_GTID(gtid));
6644 
6645   KMP_MB(); /* Flush all pending memory write invalidates.  */
6646 
6647   __kmp_common_initialize();
6648 
6649 #if KMP_OS_UNIX
6650   /* invoke the child fork handler */
6651   __kmp_register_atfork();
6652 #endif
6653 
6654 #if !KMP_DYNAMIC_LIB
6655   {
6656     /* Invoke the exit handler when the program finishes, only for static
6657        library. For dynamic library, we already have _fini and DllMain. */
6658     int rc = atexit(__kmp_internal_end_atexit);
6659     if (rc != 0) {
6660       __kmp_fatal(KMP_MSG(FunctionError, "atexit()"), KMP_ERR(rc),
6661                   __kmp_msg_null);
6662     }
6663   }
6664 #endif
6665 
6666 #if KMP_HANDLE_SIGNALS
6667 #if KMP_OS_UNIX
6668   /* NOTE: make sure that this is called before the user installs their own
6669      signal handlers so that the user handlers are called first. this way they
6670      can return false, not call our handler, avoid terminating the library, and
6671      continue execution where they left off. */
6672   __kmp_install_signals(FALSE);
6673 #endif /* KMP_OS_UNIX */
6674 #if KMP_OS_WINDOWS
6675   __kmp_install_signals(TRUE);
6676 #endif /* KMP_OS_WINDOWS */
6677 #endif
6678 
6679   /* we have finished the serial initialization */
6680   __kmp_init_counter++;
6681 
6682   __kmp_init_serial = TRUE;
6683 
6684   if (__kmp_settings) {
6685     __kmp_env_print();
6686   }
6687 
6688   if (__kmp_display_env || __kmp_display_env_verbose) {
6689     __kmp_env_print_2();
6690   }
6691 
6692 #if OMPT_SUPPORT
6693   ompt_post_init();
6694 #endif
6695 
6696   KMP_MB();
6697 
6698   KA_TRACE(10, ("__kmp_do_serial_initialize: exit\n"));
6699 }
6700 
6701 void __kmp_serial_initialize(void) {
6702   if (__kmp_init_serial) {
6703     return;
6704   }
6705   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6706   if (__kmp_init_serial) {
6707     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6708     return;
6709   }
6710   __kmp_do_serial_initialize();
6711   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6712 }
6713 
6714 static void __kmp_do_middle_initialize(void) {
6715   int i, j;
6716   int prev_dflt_team_nth;
6717 
6718   if (!__kmp_init_serial) {
6719     __kmp_do_serial_initialize();
6720   }
6721 
6722   KA_TRACE(10, ("__kmp_middle_initialize: enter\n"));
6723 
6724   // Save the previous value for the __kmp_dflt_team_nth so that
6725   // we can avoid some reinitialization if it hasn't changed.
6726   prev_dflt_team_nth = __kmp_dflt_team_nth;
6727 
6728 #if KMP_AFFINITY_SUPPORTED
6729   // __kmp_affinity_initialize() will try to set __kmp_ncores to the
6730   // number of cores on the machine.
6731   __kmp_affinity_initialize();
6732 
6733   // Run through the __kmp_threads array and set the affinity mask
6734   // for each root thread that is currently registered with the RTL.
6735   for (i = 0; i < __kmp_threads_capacity; i++) {
6736     if (TCR_PTR(__kmp_threads[i]) != NULL) {
6737       __kmp_affinity_set_init_mask(i, TRUE);
6738     }
6739   }
6740 #endif /* KMP_AFFINITY_SUPPORTED */
6741 
6742   KMP_ASSERT(__kmp_xproc > 0);
6743   if (__kmp_avail_proc == 0) {
6744     __kmp_avail_proc = __kmp_xproc;
6745   }
6746 
6747   // If there were empty places in num_threads list (OMP_NUM_THREADS=,,2,3),
6748   // correct them now
6749   j = 0;
6750   while ((j < __kmp_nested_nth.used) && !__kmp_nested_nth.nth[j]) {
6751     __kmp_nested_nth.nth[j] = __kmp_dflt_team_nth = __kmp_dflt_team_nth_ub =
6752         __kmp_avail_proc;
6753     j++;
6754   }
6755 
6756   if (__kmp_dflt_team_nth == 0) {
6757 #ifdef KMP_DFLT_NTH_CORES
6758     // Default #threads = #cores
6759     __kmp_dflt_team_nth = __kmp_ncores;
6760     KA_TRACE(20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = "
6761                   "__kmp_ncores (%d)\n",
6762                   __kmp_dflt_team_nth));
6763 #else
6764     // Default #threads = #available OS procs
6765     __kmp_dflt_team_nth = __kmp_avail_proc;
6766     KA_TRACE(20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = "
6767                   "__kmp_avail_proc(%d)\n",
6768                   __kmp_dflt_team_nth));
6769 #endif /* KMP_DFLT_NTH_CORES */
6770   }
6771 
6772   if (__kmp_dflt_team_nth < KMP_MIN_NTH) {
6773     __kmp_dflt_team_nth = KMP_MIN_NTH;
6774   }
6775   if (__kmp_dflt_team_nth > __kmp_sys_max_nth) {
6776     __kmp_dflt_team_nth = __kmp_sys_max_nth;
6777   }
6778 
6779   // There's no harm in continuing if the following check fails,
6780   // but it indicates an error in the previous logic.
6781   KMP_DEBUG_ASSERT(__kmp_dflt_team_nth <= __kmp_dflt_team_nth_ub);
6782 
6783   if (__kmp_dflt_team_nth != prev_dflt_team_nth) {
6784     // Run through the __kmp_threads array and set the num threads icv for each
6785     // root thread that is currently registered with the RTL (which has not
6786     // already explicitly set its nthreads-var with a call to
6787     // omp_set_num_threads()).
6788     for (i = 0; i < __kmp_threads_capacity; i++) {
6789       kmp_info_t *thread = __kmp_threads[i];
6790       if (thread == NULL)
6791         continue;
6792       if (thread->th.th_current_task->td_icvs.nproc != 0)
6793         continue;
6794 
6795       set__nproc(__kmp_threads[i], __kmp_dflt_team_nth);
6796     }
6797   }
6798   KA_TRACE(
6799       20,
6800       ("__kmp_middle_initialize: final value for __kmp_dflt_team_nth = %d\n",
6801        __kmp_dflt_team_nth));
6802 
6803 #ifdef KMP_ADJUST_BLOCKTIME
6804   /* Adjust blocktime to zero if necessary  now that __kmp_avail_proc is set */
6805   if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
6806     KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
6807     if (__kmp_nth > __kmp_avail_proc) {
6808       __kmp_zero_bt = TRUE;
6809     }
6810   }
6811 #endif /* KMP_ADJUST_BLOCKTIME */
6812 
6813   /* we have finished middle initialization */
6814   TCW_SYNC_4(__kmp_init_middle, TRUE);
6815 
6816   KA_TRACE(10, ("__kmp_do_middle_initialize: exit\n"));
6817 }
6818 
6819 void __kmp_middle_initialize(void) {
6820   if (__kmp_init_middle) {
6821     return;
6822   }
6823   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6824   if (__kmp_init_middle) {
6825     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6826     return;
6827   }
6828   __kmp_do_middle_initialize();
6829   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6830 }
6831 
6832 void __kmp_parallel_initialize(void) {
6833   int gtid = __kmp_entry_gtid(); // this might be a new root
6834 
6835   /* synchronize parallel initialization (for sibling) */
6836   if (TCR_4(__kmp_init_parallel))
6837     return;
6838   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6839   if (TCR_4(__kmp_init_parallel)) {
6840     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6841     return;
6842   }
6843 
6844   /* TODO reinitialization after we have already shut down */
6845   if (TCR_4(__kmp_global.g.g_done)) {
6846     KA_TRACE(
6847         10,
6848         ("__kmp_parallel_initialize: attempt to init while shutting down\n"));
6849     __kmp_infinite_loop();
6850   }
6851 
6852   /* jc: The lock __kmp_initz_lock is already held, so calling
6853      __kmp_serial_initialize would cause a deadlock.  So we call
6854      __kmp_do_serial_initialize directly. */
6855   if (!__kmp_init_middle) {
6856     __kmp_do_middle_initialize();
6857   }
6858   __kmp_resume_if_hard_paused();
6859 
6860   /* begin initialization */
6861   KA_TRACE(10, ("__kmp_parallel_initialize: enter\n"));
6862   KMP_ASSERT(KMP_UBER_GTID(gtid));
6863 
6864 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
6865   // Save the FP control regs.
6866   // Worker threads will set theirs to these values at thread startup.
6867   __kmp_store_x87_fpu_control_word(&__kmp_init_x87_fpu_control_word);
6868   __kmp_store_mxcsr(&__kmp_init_mxcsr);
6869   __kmp_init_mxcsr &= KMP_X86_MXCSR_MASK;
6870 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
6871 
6872 #if KMP_OS_UNIX
6873 #if KMP_HANDLE_SIGNALS
6874   /*  must be after __kmp_serial_initialize  */
6875   __kmp_install_signals(TRUE);
6876 #endif
6877 #endif
6878 
6879   __kmp_suspend_initialize();
6880 
6881 #if defined(USE_LOAD_BALANCE)
6882   if (__kmp_global.g.g_dynamic_mode == dynamic_default) {
6883     __kmp_global.g.g_dynamic_mode = dynamic_load_balance;
6884   }
6885 #else
6886   if (__kmp_global.g.g_dynamic_mode == dynamic_default) {
6887     __kmp_global.g.g_dynamic_mode = dynamic_thread_limit;
6888   }
6889 #endif
6890 
6891   if (__kmp_version) {
6892     __kmp_print_version_2();
6893   }
6894 
6895   /* we have finished parallel initialization */
6896   TCW_SYNC_4(__kmp_init_parallel, TRUE);
6897 
6898   KMP_MB();
6899   KA_TRACE(10, ("__kmp_parallel_initialize: exit\n"));
6900 
6901   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6902 }
6903 
6904 /* ------------------------------------------------------------------------ */
6905 
6906 void __kmp_run_before_invoked_task(int gtid, int tid, kmp_info_t *this_thr,
6907                                    kmp_team_t *team) {
6908   kmp_disp_t *dispatch;
6909 
6910   KMP_MB();
6911 
6912   /* none of the threads have encountered any constructs, yet. */
6913   this_thr->th.th_local.this_construct = 0;
6914 #if KMP_CACHE_MANAGE
6915   KMP_CACHE_PREFETCH(&this_thr->th.th_bar[bs_forkjoin_barrier].bb.b_arrived);
6916 #endif /* KMP_CACHE_MANAGE */
6917   dispatch = (kmp_disp_t *)TCR_PTR(this_thr->th.th_dispatch);
6918   KMP_DEBUG_ASSERT(dispatch);
6919   KMP_DEBUG_ASSERT(team->t.t_dispatch);
6920   // KMP_DEBUG_ASSERT( this_thr->th.th_dispatch == &team->t.t_dispatch[
6921   // this_thr->th.th_info.ds.ds_tid ] );
6922 
6923   dispatch->th_disp_index = 0; /* reset the dispatch buffer counter */
6924   dispatch->th_doacross_buf_idx = 0; // reset doacross dispatch buffer counter
6925   if (__kmp_env_consistency_check)
6926     __kmp_push_parallel(gtid, team->t.t_ident);
6927 
6928   KMP_MB(); /* Flush all pending memory write invalidates.  */
6929 }
6930 
6931 void __kmp_run_after_invoked_task(int gtid, int tid, kmp_info_t *this_thr,
6932                                   kmp_team_t *team) {
6933   if (__kmp_env_consistency_check)
6934     __kmp_pop_parallel(gtid, team->t.t_ident);
6935 
6936   __kmp_finish_implicit_task(this_thr);
6937 }
6938 
6939 int __kmp_invoke_task_func(int gtid) {
6940   int rc;
6941   int tid = __kmp_tid_from_gtid(gtid);
6942   kmp_info_t *this_thr = __kmp_threads[gtid];
6943   kmp_team_t *team = this_thr->th.th_team;
6944 
6945   __kmp_run_before_invoked_task(gtid, tid, this_thr, team);
6946 #if USE_ITT_BUILD
6947   if (__itt_stack_caller_create_ptr) {
6948     __kmp_itt_stack_callee_enter(
6949         (__itt_caller)
6950             team->t.t_stack_id); // inform ittnotify about entering user's code
6951   }
6952 #endif /* USE_ITT_BUILD */
6953 #if INCLUDE_SSC_MARKS
6954   SSC_MARK_INVOKING();
6955 #endif
6956 
6957 #if OMPT_SUPPORT
6958   void *dummy;
6959   void **exit_runtime_p;
6960   ompt_data_t *my_task_data;
6961   ompt_data_t *my_parallel_data;
6962   int ompt_team_size;
6963 
6964   if (ompt_enabled.enabled) {
6965     exit_runtime_p = &(
6966         team->t.t_implicit_task_taskdata[tid].ompt_task_info.frame.exit_frame.ptr);
6967   } else {
6968     exit_runtime_p = &dummy;
6969   }
6970 
6971   my_task_data =
6972       &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data);
6973   my_parallel_data = &(team->t.ompt_team_info.parallel_data);
6974   if (ompt_enabled.ompt_callback_implicit_task) {
6975     ompt_team_size = team->t.t_nproc;
6976     ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
6977         ompt_scope_begin, my_parallel_data, my_task_data, ompt_team_size,
6978         __kmp_tid_from_gtid(gtid), ompt_task_implicit); // TODO: Can this be ompt_task_initial?
6979     OMPT_CUR_TASK_INFO(this_thr)->thread_num = __kmp_tid_from_gtid(gtid);
6980   }
6981 #endif
6982 
6983 #if KMP_STATS_ENABLED
6984   stats_state_e previous_state = KMP_GET_THREAD_STATE();
6985   if (previous_state == stats_state_e::TEAMS_REGION) {
6986     KMP_PUSH_PARTITIONED_TIMER(OMP_teams);
6987   } else {
6988     KMP_PUSH_PARTITIONED_TIMER(OMP_parallel);
6989   }
6990   KMP_SET_THREAD_STATE(IMPLICIT_TASK);
6991 #endif
6992 
6993   rc = __kmp_invoke_microtask((microtask_t)TCR_SYNC_PTR(team->t.t_pkfn), gtid,
6994                               tid, (int)team->t.t_argc, (void **)team->t.t_argv
6995 #if OMPT_SUPPORT
6996                               ,
6997                               exit_runtime_p
6998 #endif
6999                               );
7000 #if OMPT_SUPPORT
7001   *exit_runtime_p = NULL;
7002 #endif
7003 
7004 #if KMP_STATS_ENABLED
7005   if (previous_state == stats_state_e::TEAMS_REGION) {
7006     KMP_SET_THREAD_STATE(previous_state);
7007   }
7008   KMP_POP_PARTITIONED_TIMER();
7009 #endif
7010 
7011 #if USE_ITT_BUILD
7012   if (__itt_stack_caller_create_ptr) {
7013     __kmp_itt_stack_callee_leave(
7014         (__itt_caller)
7015             team->t.t_stack_id); // inform ittnotify about leaving user's code
7016   }
7017 #endif /* USE_ITT_BUILD */
7018   __kmp_run_after_invoked_task(gtid, tid, this_thr, team);
7019 
7020   return rc;
7021 }
7022 
7023 void __kmp_teams_master(int gtid) {
7024   // This routine is called by all master threads in teams construct
7025   kmp_info_t *thr = __kmp_threads[gtid];
7026   kmp_team_t *team = thr->th.th_team;
7027   ident_t *loc = team->t.t_ident;
7028   thr->th.th_set_nproc = thr->th.th_teams_size.nth;
7029   KMP_DEBUG_ASSERT(thr->th.th_teams_microtask);
7030   KMP_DEBUG_ASSERT(thr->th.th_set_nproc);
7031   KA_TRACE(20, ("__kmp_teams_master: T#%d, Tid %d, microtask %p\n", gtid,
7032                 __kmp_tid_from_gtid(gtid), thr->th.th_teams_microtask));
7033 
7034   // This thread is a new CG root.  Set up the proper variables.
7035   kmp_cg_root_t *tmp = (kmp_cg_root_t *)__kmp_allocate(sizeof(kmp_cg_root_t));
7036   tmp->cg_root = thr; // Make thr the CG root
7037   // Init to thread limit that was stored when league masters were forked
7038   tmp->cg_thread_limit = thr->th.th_current_task->td_icvs.thread_limit;
7039   tmp->cg_nthreads = 1; // Init counter to one active thread, this one
7040   KA_TRACE(100, ("__kmp_teams_master: Thread %p created node %p and init"
7041                  " cg_nthreads to 1\n",
7042                  thr, tmp));
7043   tmp->up = thr->th.th_cg_roots;
7044   thr->th.th_cg_roots = tmp;
7045 
7046 // Launch league of teams now, but not let workers execute
7047 // (they hang on fork barrier until next parallel)
7048 #if INCLUDE_SSC_MARKS
7049   SSC_MARK_FORKING();
7050 #endif
7051   __kmp_fork_call(loc, gtid, fork_context_intel, team->t.t_argc,
7052                   (microtask_t)thr->th.th_teams_microtask, // "wrapped" task
7053                   VOLATILE_CAST(launch_t) __kmp_invoke_task_func, NULL);
7054 #if INCLUDE_SSC_MARKS
7055   SSC_MARK_JOINING();
7056 #endif
7057   // If the team size was reduced from the limit, set it to the new size
7058   if (thr->th.th_team_nproc < thr->th.th_teams_size.nth)
7059     thr->th.th_teams_size.nth = thr->th.th_team_nproc;
7060   // AC: last parameter "1" eliminates join barrier which won't work because
7061   // worker threads are in a fork barrier waiting for more parallel regions
7062   __kmp_join_call(loc, gtid
7063 #if OMPT_SUPPORT
7064                   ,
7065                   fork_context_intel
7066 #endif
7067                   ,
7068                   1);
7069 }
7070 
7071 int __kmp_invoke_teams_master(int gtid) {
7072   kmp_info_t *this_thr = __kmp_threads[gtid];
7073   kmp_team_t *team = this_thr->th.th_team;
7074 #if KMP_DEBUG
7075   if (!__kmp_threads[gtid]->th.th_team->t.t_serialized)
7076     KMP_DEBUG_ASSERT((void *)__kmp_threads[gtid]->th.th_team->t.t_pkfn ==
7077                      (void *)__kmp_teams_master);
7078 #endif
7079   __kmp_run_before_invoked_task(gtid, 0, this_thr, team);
7080   __kmp_teams_master(gtid);
7081   __kmp_run_after_invoked_task(gtid, 0, this_thr, team);
7082   return 1;
7083 }
7084 
7085 /* this sets the requested number of threads for the next parallel region
7086    encountered by this team. since this should be enclosed in the forkjoin
7087    critical section it should avoid race conditions with assymmetrical nested
7088    parallelism */
7089 
7090 void __kmp_push_num_threads(ident_t *id, int gtid, int num_threads) {
7091   kmp_info_t *thr = __kmp_threads[gtid];
7092 
7093   if (num_threads > 0)
7094     thr->th.th_set_nproc = num_threads;
7095 }
7096 
7097 /* this sets the requested number of teams for the teams region and/or
7098    the number of threads for the next parallel region encountered  */
7099 void __kmp_push_num_teams(ident_t *id, int gtid, int num_teams,
7100                           int num_threads) {
7101   kmp_info_t *thr = __kmp_threads[gtid];
7102   KMP_DEBUG_ASSERT(num_teams >= 0);
7103   KMP_DEBUG_ASSERT(num_threads >= 0);
7104 
7105   if (num_teams == 0)
7106     num_teams = 1; // default number of teams is 1.
7107   if (num_teams > __kmp_teams_max_nth) { // if too many teams requested?
7108     if (!__kmp_reserve_warn) {
7109       __kmp_reserve_warn = 1;
7110       __kmp_msg(kmp_ms_warning,
7111                 KMP_MSG(CantFormThrTeam, num_teams, __kmp_teams_max_nth),
7112                 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
7113     }
7114     num_teams = __kmp_teams_max_nth;
7115   }
7116   // Set number of teams (number of threads in the outer "parallel" of the
7117   // teams)
7118   thr->th.th_set_nproc = thr->th.th_teams_size.nteams = num_teams;
7119 
7120   // Remember the number of threads for inner parallel regions
7121   if (num_threads == 0) {
7122     if (!TCR_4(__kmp_init_middle))
7123       __kmp_middle_initialize(); // get __kmp_avail_proc calculated
7124     num_threads = __kmp_avail_proc / num_teams;
7125     if (num_teams * num_threads > __kmp_teams_max_nth) {
7126       // adjust num_threads w/o warning as it is not user setting
7127       num_threads = __kmp_teams_max_nth / num_teams;
7128     }
7129   } else {
7130     // This thread will be the master of the league masters
7131     // Store new thread limit; old limit is saved in th_cg_roots list
7132     thr->th.th_current_task->td_icvs.thread_limit = num_threads;
7133 
7134     if (num_teams * num_threads > __kmp_teams_max_nth) {
7135       int new_threads = __kmp_teams_max_nth / num_teams;
7136       if (!__kmp_reserve_warn) { // user asked for too many threads
7137         __kmp_reserve_warn = 1; // conflicts with KMP_TEAMS_THREAD_LIMIT
7138         __kmp_msg(kmp_ms_warning,
7139                   KMP_MSG(CantFormThrTeam, num_threads, new_threads),
7140                   KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
7141       }
7142       num_threads = new_threads;
7143     }
7144   }
7145   thr->th.th_teams_size.nth = num_threads;
7146 }
7147 
7148 // Set the proc_bind var to use in the following parallel region.
7149 void __kmp_push_proc_bind(ident_t *id, int gtid, kmp_proc_bind_t proc_bind) {
7150   kmp_info_t *thr = __kmp_threads[gtid];
7151   thr->th.th_set_proc_bind = proc_bind;
7152 }
7153 
7154 /* Launch the worker threads into the microtask. */
7155 
7156 void __kmp_internal_fork(ident_t *id, int gtid, kmp_team_t *team) {
7157   kmp_info_t *this_thr = __kmp_threads[gtid];
7158 
7159 #ifdef KMP_DEBUG
7160   int f;
7161 #endif /* KMP_DEBUG */
7162 
7163   KMP_DEBUG_ASSERT(team);
7164   KMP_DEBUG_ASSERT(this_thr->th.th_team == team);
7165   KMP_ASSERT(KMP_MASTER_GTID(gtid));
7166   KMP_MB(); /* Flush all pending memory write invalidates.  */
7167 
7168   team->t.t_construct = 0; /* no single directives seen yet */
7169   team->t.t_ordered.dt.t_value =
7170       0; /* thread 0 enters the ordered section first */
7171 
7172   /* Reset the identifiers on the dispatch buffer */
7173   KMP_DEBUG_ASSERT(team->t.t_disp_buffer);
7174   if (team->t.t_max_nproc > 1) {
7175     int i;
7176     for (i = 0; i < __kmp_dispatch_num_buffers; ++i) {
7177       team->t.t_disp_buffer[i].buffer_index = i;
7178       team->t.t_disp_buffer[i].doacross_buf_idx = i;
7179     }
7180   } else {
7181     team->t.t_disp_buffer[0].buffer_index = 0;
7182     team->t.t_disp_buffer[0].doacross_buf_idx = 0;
7183   }
7184 
7185   KMP_MB(); /* Flush all pending memory write invalidates.  */
7186   KMP_ASSERT(this_thr->th.th_team == team);
7187 
7188 #ifdef KMP_DEBUG
7189   for (f = 0; f < team->t.t_nproc; f++) {
7190     KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
7191                      team->t.t_threads[f]->th.th_team_nproc == team->t.t_nproc);
7192   }
7193 #endif /* KMP_DEBUG */
7194 
7195   /* release the worker threads so they may begin working */
7196   __kmp_fork_barrier(gtid, 0);
7197 }
7198 
7199 void __kmp_internal_join(ident_t *id, int gtid, kmp_team_t *team) {
7200   kmp_info_t *this_thr = __kmp_threads[gtid];
7201 
7202   KMP_DEBUG_ASSERT(team);
7203   KMP_DEBUG_ASSERT(this_thr->th.th_team == team);
7204   KMP_ASSERT(KMP_MASTER_GTID(gtid));
7205   KMP_MB(); /* Flush all pending memory write invalidates.  */
7206 
7207 /* Join barrier after fork */
7208 
7209 #ifdef KMP_DEBUG
7210   if (__kmp_threads[gtid] &&
7211       __kmp_threads[gtid]->th.th_team_nproc != team->t.t_nproc) {
7212     __kmp_printf("GTID: %d, __kmp_threads[%d]=%p\n", gtid, gtid,
7213                  __kmp_threads[gtid]);
7214     __kmp_printf("__kmp_threads[%d]->th.th_team_nproc=%d, TEAM: %p, "
7215                  "team->t.t_nproc=%d\n",
7216                  gtid, __kmp_threads[gtid]->th.th_team_nproc, team,
7217                  team->t.t_nproc);
7218     __kmp_print_structure();
7219   }
7220   KMP_DEBUG_ASSERT(__kmp_threads[gtid] &&
7221                    __kmp_threads[gtid]->th.th_team_nproc == team->t.t_nproc);
7222 #endif /* KMP_DEBUG */
7223 
7224   __kmp_join_barrier(gtid); /* wait for everyone */
7225 #if OMPT_SUPPORT
7226   if (ompt_enabled.enabled &&
7227       this_thr->th.ompt_thread_info.state == ompt_state_wait_barrier_implicit) {
7228     int ds_tid = this_thr->th.th_info.ds.ds_tid;
7229     ompt_data_t *task_data = OMPT_CUR_TASK_DATA(this_thr);
7230     this_thr->th.ompt_thread_info.state = ompt_state_overhead;
7231 #if OMPT_OPTIONAL
7232     void *codeptr = NULL;
7233     if (KMP_MASTER_TID(ds_tid) &&
7234         (ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) ||
7235          ompt_callbacks.ompt_callback(ompt_callback_sync_region)))
7236       codeptr = OMPT_CUR_TEAM_INFO(this_thr)->master_return_address;
7237 
7238     if (ompt_enabled.ompt_callback_sync_region_wait) {
7239       ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
7240           ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data,
7241           codeptr);
7242     }
7243     if (ompt_enabled.ompt_callback_sync_region) {
7244       ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
7245           ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data,
7246           codeptr);
7247     }
7248 #endif
7249     if (!KMP_MASTER_TID(ds_tid) && ompt_enabled.ompt_callback_implicit_task) {
7250       ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
7251           ompt_scope_end, NULL, task_data, 0, ds_tid, ompt_task_implicit); // TODO: Can this be ompt_task_initial?
7252     }
7253   }
7254 #endif
7255 
7256   KMP_MB(); /* Flush all pending memory write invalidates.  */
7257   KMP_ASSERT(this_thr->th.th_team == team);
7258 }
7259 
7260 /* ------------------------------------------------------------------------ */
7261 
7262 #ifdef USE_LOAD_BALANCE
7263 
7264 // Return the worker threads actively spinning in the hot team, if we
7265 // are at the outermost level of parallelism.  Otherwise, return 0.
7266 static int __kmp_active_hot_team_nproc(kmp_root_t *root) {
7267   int i;
7268   int retval;
7269   kmp_team_t *hot_team;
7270 
7271   if (root->r.r_active) {
7272     return 0;
7273   }
7274   hot_team = root->r.r_hot_team;
7275   if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME) {
7276     return hot_team->t.t_nproc - 1; // Don't count master thread
7277   }
7278 
7279   // Skip the master thread - it is accounted for elsewhere.
7280   retval = 0;
7281   for (i = 1; i < hot_team->t.t_nproc; i++) {
7282     if (hot_team->t.t_threads[i]->th.th_active) {
7283       retval++;
7284     }
7285   }
7286   return retval;
7287 }
7288 
7289 // Perform an automatic adjustment to the number of
7290 // threads used by the next parallel region.
7291 static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc) {
7292   int retval;
7293   int pool_active;
7294   int hot_team_active;
7295   int team_curr_active;
7296   int system_active;
7297 
7298   KB_TRACE(20, ("__kmp_load_balance_nproc: called root:%p set_nproc:%d\n", root,
7299                 set_nproc));
7300   KMP_DEBUG_ASSERT(root);
7301   KMP_DEBUG_ASSERT(root->r.r_root_team->t.t_threads[0]
7302                        ->th.th_current_task->td_icvs.dynamic == TRUE);
7303   KMP_DEBUG_ASSERT(set_nproc > 1);
7304 
7305   if (set_nproc == 1) {
7306     KB_TRACE(20, ("__kmp_load_balance_nproc: serial execution.\n"));
7307     return 1;
7308   }
7309 
7310   // Threads that are active in the thread pool, active in the hot team for this
7311   // particular root (if we are at the outer par level), and the currently
7312   // executing thread (to become the master) are available to add to the new
7313   // team, but are currently contributing to the system load, and must be
7314   // accounted for.
7315   pool_active = __kmp_thread_pool_active_nth;
7316   hot_team_active = __kmp_active_hot_team_nproc(root);
7317   team_curr_active = pool_active + hot_team_active + 1;
7318 
7319   // Check the system load.
7320   system_active = __kmp_get_load_balance(__kmp_avail_proc + team_curr_active);
7321   KB_TRACE(30, ("__kmp_load_balance_nproc: system active = %d pool active = %d "
7322                 "hot team active = %d\n",
7323                 system_active, pool_active, hot_team_active));
7324 
7325   if (system_active < 0) {
7326     // There was an error reading the necessary info from /proc, so use the
7327     // thread limit algorithm instead. Once we set __kmp_global.g.g_dynamic_mode
7328     // = dynamic_thread_limit, we shouldn't wind up getting back here.
7329     __kmp_global.g.g_dynamic_mode = dynamic_thread_limit;
7330     KMP_WARNING(CantLoadBalUsing, "KMP_DYNAMIC_MODE=thread limit");
7331 
7332     // Make this call behave like the thread limit algorithm.
7333     retval = __kmp_avail_proc - __kmp_nth +
7334              (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
7335     if (retval > set_nproc) {
7336       retval = set_nproc;
7337     }
7338     if (retval < KMP_MIN_NTH) {
7339       retval = KMP_MIN_NTH;
7340     }
7341 
7342     KB_TRACE(20, ("__kmp_load_balance_nproc: thread limit exit. retval:%d\n",
7343                   retval));
7344     return retval;
7345   }
7346 
7347   // There is a slight delay in the load balance algorithm in detecting new
7348   // running procs. The real system load at this instant should be at least as
7349   // large as the #active omp thread that are available to add to the team.
7350   if (system_active < team_curr_active) {
7351     system_active = team_curr_active;
7352   }
7353   retval = __kmp_avail_proc - system_active + team_curr_active;
7354   if (retval > set_nproc) {
7355     retval = set_nproc;
7356   }
7357   if (retval < KMP_MIN_NTH) {
7358     retval = KMP_MIN_NTH;
7359   }
7360 
7361   KB_TRACE(20, ("__kmp_load_balance_nproc: exit. retval:%d\n", retval));
7362   return retval;
7363 } // __kmp_load_balance_nproc()
7364 
7365 #endif /* USE_LOAD_BALANCE */
7366 
7367 /* ------------------------------------------------------------------------ */
7368 
7369 /* NOTE: this is called with the __kmp_init_lock held */
7370 void __kmp_cleanup(void) {
7371   int f;
7372 
7373   KA_TRACE(10, ("__kmp_cleanup: enter\n"));
7374 
7375   if (TCR_4(__kmp_init_parallel)) {
7376 #if KMP_HANDLE_SIGNALS
7377     __kmp_remove_signals();
7378 #endif
7379     TCW_4(__kmp_init_parallel, FALSE);
7380   }
7381 
7382   if (TCR_4(__kmp_init_middle)) {
7383 #if KMP_AFFINITY_SUPPORTED
7384     __kmp_affinity_uninitialize();
7385 #endif /* KMP_AFFINITY_SUPPORTED */
7386     __kmp_cleanup_hierarchy();
7387     TCW_4(__kmp_init_middle, FALSE);
7388   }
7389 
7390   KA_TRACE(10, ("__kmp_cleanup: go serial cleanup\n"));
7391 
7392   if (__kmp_init_serial) {
7393     __kmp_runtime_destroy();
7394     __kmp_init_serial = FALSE;
7395   }
7396 
7397   __kmp_cleanup_threadprivate_caches();
7398 
7399   for (f = 0; f < __kmp_threads_capacity; f++) {
7400     if (__kmp_root[f] != NULL) {
7401       __kmp_free(__kmp_root[f]);
7402       __kmp_root[f] = NULL;
7403     }
7404   }
7405   __kmp_free(__kmp_threads);
7406   // __kmp_threads and __kmp_root were allocated at once, as single block, so
7407   // there is no need in freeing __kmp_root.
7408   __kmp_threads = NULL;
7409   __kmp_root = NULL;
7410   __kmp_threads_capacity = 0;
7411 
7412 #if KMP_USE_DYNAMIC_LOCK
7413   __kmp_cleanup_indirect_user_locks();
7414 #else
7415   __kmp_cleanup_user_locks();
7416 #endif
7417 
7418 #if KMP_AFFINITY_SUPPORTED
7419   KMP_INTERNAL_FREE(CCAST(char *, __kmp_cpuinfo_file));
7420   __kmp_cpuinfo_file = NULL;
7421 #endif /* KMP_AFFINITY_SUPPORTED */
7422 
7423 #if KMP_USE_ADAPTIVE_LOCKS
7424 #if KMP_DEBUG_ADAPTIVE_LOCKS
7425   __kmp_print_speculative_stats();
7426 #endif
7427 #endif
7428   KMP_INTERNAL_FREE(__kmp_nested_nth.nth);
7429   __kmp_nested_nth.nth = NULL;
7430   __kmp_nested_nth.size = 0;
7431   __kmp_nested_nth.used = 0;
7432   KMP_INTERNAL_FREE(__kmp_nested_proc_bind.bind_types);
7433   __kmp_nested_proc_bind.bind_types = NULL;
7434   __kmp_nested_proc_bind.size = 0;
7435   __kmp_nested_proc_bind.used = 0;
7436   if (__kmp_affinity_format) {
7437     KMP_INTERNAL_FREE(__kmp_affinity_format);
7438     __kmp_affinity_format = NULL;
7439   }
7440 
7441   __kmp_i18n_catclose();
7442 
7443 #if KMP_USE_HIER_SCHED
7444   __kmp_hier_scheds.deallocate();
7445 #endif
7446 
7447 #if KMP_STATS_ENABLED
7448   __kmp_stats_fini();
7449 #endif
7450 
7451   KA_TRACE(10, ("__kmp_cleanup: exit\n"));
7452 }
7453 
7454 /* ------------------------------------------------------------------------ */
7455 
7456 int __kmp_ignore_mppbeg(void) {
7457   char *env;
7458 
7459   if ((env = getenv("KMP_IGNORE_MPPBEG")) != NULL) {
7460     if (__kmp_str_match_false(env))
7461       return FALSE;
7462   }
7463   // By default __kmpc_begin() is no-op.
7464   return TRUE;
7465 }
7466 
7467 int __kmp_ignore_mppend(void) {
7468   char *env;
7469 
7470   if ((env = getenv("KMP_IGNORE_MPPEND")) != NULL) {
7471     if (__kmp_str_match_false(env))
7472       return FALSE;
7473   }
7474   // By default __kmpc_end() is no-op.
7475   return TRUE;
7476 }
7477 
7478 void __kmp_internal_begin(void) {
7479   int gtid;
7480   kmp_root_t *root;
7481 
7482   /* this is a very important step as it will register new sibling threads
7483      and assign these new uber threads a new gtid */
7484   gtid = __kmp_entry_gtid();
7485   root = __kmp_threads[gtid]->th.th_root;
7486   KMP_ASSERT(KMP_UBER_GTID(gtid));
7487 
7488   if (root->r.r_begin)
7489     return;
7490   __kmp_acquire_lock(&root->r.r_begin_lock, gtid);
7491   if (root->r.r_begin) {
7492     __kmp_release_lock(&root->r.r_begin_lock, gtid);
7493     return;
7494   }
7495 
7496   root->r.r_begin = TRUE;
7497 
7498   __kmp_release_lock(&root->r.r_begin_lock, gtid);
7499 }
7500 
7501 /* ------------------------------------------------------------------------ */
7502 
7503 void __kmp_user_set_library(enum library_type arg) {
7504   int gtid;
7505   kmp_root_t *root;
7506   kmp_info_t *thread;
7507 
7508   /* first, make sure we are initialized so we can get our gtid */
7509 
7510   gtid = __kmp_entry_gtid();
7511   thread = __kmp_threads[gtid];
7512 
7513   root = thread->th.th_root;
7514 
7515   KA_TRACE(20, ("__kmp_user_set_library: enter T#%d, arg: %d, %d\n", gtid, arg,
7516                 library_serial));
7517   if (root->r.r_in_parallel) { /* Must be called in serial section of top-level
7518                                   thread */
7519     KMP_WARNING(SetLibraryIncorrectCall);
7520     return;
7521   }
7522 
7523   switch (arg) {
7524   case library_serial:
7525     thread->th.th_set_nproc = 0;
7526     set__nproc(thread, 1);
7527     break;
7528   case library_turnaround:
7529     thread->th.th_set_nproc = 0;
7530     set__nproc(thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth
7531                                            : __kmp_dflt_team_nth_ub);
7532     break;
7533   case library_throughput:
7534     thread->th.th_set_nproc = 0;
7535     set__nproc(thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth
7536                                            : __kmp_dflt_team_nth_ub);
7537     break;
7538   default:
7539     KMP_FATAL(UnknownLibraryType, arg);
7540   }
7541 
7542   __kmp_aux_set_library(arg);
7543 }
7544 
7545 void __kmp_aux_set_stacksize(size_t arg) {
7546   if (!__kmp_init_serial)
7547     __kmp_serial_initialize();
7548 
7549 #if KMP_OS_DARWIN
7550   if (arg & (0x1000 - 1)) {
7551     arg &= ~(0x1000 - 1);
7552     if (arg + 0x1000) /* check for overflow if we round up */
7553       arg += 0x1000;
7554   }
7555 #endif
7556   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7557 
7558   /* only change the default stacksize before the first parallel region */
7559   if (!TCR_4(__kmp_init_parallel)) {
7560     size_t value = arg; /* argument is in bytes */
7561 
7562     if (value < __kmp_sys_min_stksize)
7563       value = __kmp_sys_min_stksize;
7564     else if (value > KMP_MAX_STKSIZE)
7565       value = KMP_MAX_STKSIZE;
7566 
7567     __kmp_stksize = value;
7568 
7569     __kmp_env_stksize = TRUE; /* was KMP_STACKSIZE specified? */
7570   }
7571 
7572   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7573 }
7574 
7575 /* set the behaviour of the runtime library */
7576 /* TODO this can cause some odd behaviour with sibling parallelism... */
7577 void __kmp_aux_set_library(enum library_type arg) {
7578   __kmp_library = arg;
7579 
7580   switch (__kmp_library) {
7581   case library_serial: {
7582     KMP_INFORM(LibraryIsSerial);
7583   } break;
7584   case library_turnaround:
7585     if (__kmp_use_yield == 1 && !__kmp_use_yield_exp_set)
7586       __kmp_use_yield = 2; // only yield when oversubscribed
7587     break;
7588   case library_throughput:
7589     if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME)
7590       __kmp_dflt_blocktime = 200;
7591     break;
7592   default:
7593     KMP_FATAL(UnknownLibraryType, arg);
7594   }
7595 }
7596 
7597 /* Getting team information common for all team API */
7598 // Returns NULL if not in teams construct
7599 static kmp_team_t *__kmp_aux_get_team_info(int &teams_serialized) {
7600   kmp_info_t *thr = __kmp_entry_thread();
7601   teams_serialized = 0;
7602   if (thr->th.th_teams_microtask) {
7603     kmp_team_t *team = thr->th.th_team;
7604     int tlevel = thr->th.th_teams_level; // the level of the teams construct
7605     int ii = team->t.t_level;
7606     teams_serialized = team->t.t_serialized;
7607     int level = tlevel + 1;
7608     KMP_DEBUG_ASSERT(ii >= tlevel);
7609     while (ii > level) {
7610       for (teams_serialized = team->t.t_serialized;
7611            (teams_serialized > 0) && (ii > level); teams_serialized--, ii--) {
7612       }
7613       if (team->t.t_serialized && (!teams_serialized)) {
7614         team = team->t.t_parent;
7615         continue;
7616       }
7617       if (ii > level) {
7618         team = team->t.t_parent;
7619         ii--;
7620       }
7621     }
7622     return team;
7623   }
7624   return NULL;
7625 }
7626 
7627 int __kmp_aux_get_team_num() {
7628   int serialized;
7629   kmp_team_t *team = __kmp_aux_get_team_info(serialized);
7630   if (team) {
7631     if (serialized > 1) {
7632       return 0; // teams region is serialized ( 1 team of 1 thread ).
7633     } else {
7634       return team->t.t_master_tid;
7635     }
7636   }
7637   return 0;
7638 }
7639 
7640 int __kmp_aux_get_num_teams() {
7641   int serialized;
7642   kmp_team_t *team = __kmp_aux_get_team_info(serialized);
7643   if (team) {
7644     if (serialized > 1) {
7645       return 1;
7646     } else {
7647       return team->t.t_parent->t.t_nproc;
7648     }
7649   }
7650   return 1;
7651 }
7652 
7653 /* ------------------------------------------------------------------------ */
7654 
7655 /*
7656  * Affinity Format Parser
7657  *
7658  * Field is in form of: %[[[0].]size]type
7659  * % and type are required (%% means print a literal '%')
7660  * type is either single char or long name surrounded by {},
7661  * e.g., N or {num_threads}
7662  * 0 => leading zeros
7663  * . => right justified when size is specified
7664  * by default output is left justified
7665  * size is the *minimum* field length
7666  * All other characters are printed as is
7667  *
 * Available field types:
 * t {team_num}          - omp_get_team_num()
 * T {num_teams}         - omp_get_num_teams()
 * L {nesting_level}     - omp_get_level()
 * n {thread_num}        - omp_get_thread_num()
 * N {num_threads}       - omp_get_num_threads()
 * a {ancestor_tnum}     - omp_get_ancestor_thread_num(omp_get_level()-1)
 * H {host}              - name of host machine
 * P {process_id}        - process id (integer)
 * i {native_thread_id}  - native thread identifier (integer)
 * A {thread_affinity}   - comma separated list of integers or integer ranges
 *                         (values of affinity mask)
7678  *
7679  * Implementation-specific field types can be added
7680  * If a type is unknown, print "undefined"
7681 */
7682 
// One entry of the affinity-format keyword table: the short (single
// character) name of a field, its long name, and the conversion character
// handed to snprintf when the field is printed.
struct kmp_affinity_format_field_t {
  // from spec e.g., L -> thread level
  char short_name;
  // from spec thread_level -> thread level
  const char *long_name;
  // data type for snprintf (typically 'd' or 's' for integer or string)
  char field_format;
};
typedef struct kmp_affinity_format_field_t kmp_affinity_format_field_t;
7692 
7693 static const kmp_affinity_format_field_t __kmp_affinity_format_table[] = {
7694 #if KMP_AFFINITY_SUPPORTED
7695     {'A', "thread_affinity", 's'},
7696 #endif
7697     {'t', "team_num", 'd'},
7698     {'T', "num_teams", 'd'},
7699     {'L', "nesting_level", 'd'},
7700     {'n', "thread_num", 'd'},
7701     {'N', "num_threads", 'd'},
7702     {'a', "ancestor_tnum", 'd'},
7703     {'H', "host", 's'},
7704     {'P', "process_id", 'd'},
7705     {'i', "native_thread_id", 'd'}};
7706 
7707 // Return the number of characters it takes to hold field
7708 static int __kmp_aux_capture_affinity_field(int gtid, const kmp_info_t *th,
7709                                             const char **ptr,
7710                                             kmp_str_buf_t *field_buffer) {
7711   int rc, format_index, field_value;
7712   const char *width_left, *width_right;
7713   bool pad_zeros, right_justify, parse_long_name, found_valid_name;
7714   static const int FORMAT_SIZE = 20;
7715   char format[FORMAT_SIZE] = {0};
7716   char absolute_short_name = 0;
7717 
7718   KMP_DEBUG_ASSERT(gtid >= 0);
7719   KMP_DEBUG_ASSERT(th);
7720   KMP_DEBUG_ASSERT(**ptr == '%');
7721   KMP_DEBUG_ASSERT(field_buffer);
7722 
7723   __kmp_str_buf_clear(field_buffer);
7724 
7725   // Skip the initial %
7726   (*ptr)++;
7727 
7728   // Check for %% first
7729   if (**ptr == '%') {
7730     __kmp_str_buf_cat(field_buffer, "%", 1);
7731     (*ptr)++; // skip over the second %
7732     return 1;
7733   }
7734 
7735   // Parse field modifiers if they are present
7736   pad_zeros = false;
7737   if (**ptr == '0') {
7738     pad_zeros = true;
7739     (*ptr)++; // skip over 0
7740   }
7741   right_justify = false;
7742   if (**ptr == '.') {
7743     right_justify = true;
7744     (*ptr)++; // skip over .
7745   }
7746   // Parse width of field: [width_left, width_right)
7747   width_left = width_right = NULL;
7748   if (**ptr >= '0' && **ptr <= '9') {
7749     width_left = *ptr;
7750     SKIP_DIGITS(*ptr);
7751     width_right = *ptr;
7752   }
7753 
7754   // Create the format for KMP_SNPRINTF based on flags parsed above
7755   format_index = 0;
7756   format[format_index++] = '%';
7757   if (!right_justify)
7758     format[format_index++] = '-';
7759   if (pad_zeros)
7760     format[format_index++] = '0';
7761   if (width_left && width_right) {
7762     int i = 0;
7763     // Only allow 8 digit number widths.
7764     // This also prevents overflowing format variable
7765     while (i < 8 && width_left < width_right) {
7766       format[format_index++] = *width_left;
7767       width_left++;
7768       i++;
7769     }
7770   }
7771 
7772   // Parse a name (long or short)
7773   // Canonicalize the name into absolute_short_name
7774   found_valid_name = false;
7775   parse_long_name = (**ptr == '{');
7776   if (parse_long_name)
7777     (*ptr)++; // skip initial left brace
7778   for (size_t i = 0; i < sizeof(__kmp_affinity_format_table) /
7779                              sizeof(__kmp_affinity_format_table[0]);
7780        ++i) {
7781     char short_name = __kmp_affinity_format_table[i].short_name;
7782     const char *long_name = __kmp_affinity_format_table[i].long_name;
7783     char field_format = __kmp_affinity_format_table[i].field_format;
7784     if (parse_long_name) {
7785       int length = KMP_STRLEN(long_name);
7786       if (strncmp(*ptr, long_name, length) == 0) {
7787         found_valid_name = true;
7788         (*ptr) += length; // skip the long name
7789       }
7790     } else if (**ptr == short_name) {
7791       found_valid_name = true;
7792       (*ptr)++; // skip the short name
7793     }
7794     if (found_valid_name) {
7795       format[format_index++] = field_format;
7796       format[format_index++] = '\0';
7797       absolute_short_name = short_name;
7798       break;
7799     }
7800   }
7801   if (parse_long_name) {
7802     if (**ptr != '}') {
7803       absolute_short_name = 0;
7804     } else {
7805       (*ptr)++; // skip over the right brace
7806     }
7807   }
7808 
7809   // Attempt to fill the buffer with the requested
7810   // value using snprintf within __kmp_str_buf_print()
7811   switch (absolute_short_name) {
7812   case 't':
7813     rc = __kmp_str_buf_print(field_buffer, format, __kmp_aux_get_team_num());
7814     break;
7815   case 'T':
7816     rc = __kmp_str_buf_print(field_buffer, format, __kmp_aux_get_num_teams());
7817     break;
7818   case 'L':
7819     rc = __kmp_str_buf_print(field_buffer, format, th->th.th_team->t.t_level);
7820     break;
7821   case 'n':
7822     rc = __kmp_str_buf_print(field_buffer, format, __kmp_tid_from_gtid(gtid));
7823     break;
7824   case 'H': {
7825     static const int BUFFER_SIZE = 256;
7826     char buf[BUFFER_SIZE];
7827     __kmp_expand_host_name(buf, BUFFER_SIZE);
7828     rc = __kmp_str_buf_print(field_buffer, format, buf);
7829   } break;
7830   case 'P':
7831     rc = __kmp_str_buf_print(field_buffer, format, getpid());
7832     break;
7833   case 'i':
7834     rc = __kmp_str_buf_print(field_buffer, format, __kmp_gettid());
7835     break;
7836   case 'N':
7837     rc = __kmp_str_buf_print(field_buffer, format, th->th.th_team->t.t_nproc);
7838     break;
7839   case 'a':
7840     field_value =
7841         __kmp_get_ancestor_thread_num(gtid, th->th.th_team->t.t_level - 1);
7842     rc = __kmp_str_buf_print(field_buffer, format, field_value);
7843     break;
7844 #if KMP_AFFINITY_SUPPORTED
7845   case 'A': {
7846     kmp_str_buf_t buf;
7847     __kmp_str_buf_init(&buf);
7848     __kmp_affinity_str_buf_mask(&buf, th->th.th_affin_mask);
7849     rc = __kmp_str_buf_print(field_buffer, format, buf.str);
7850     __kmp_str_buf_free(&buf);
7851   } break;
7852 #endif
7853   default:
7854     // According to spec, If an implementation does not have info for field
7855     // type, then "undefined" is printed
7856     rc = __kmp_str_buf_print(field_buffer, "%s", "undefined");
7857     // Skip the field
7858     if (parse_long_name) {
7859       SKIP_TOKEN(*ptr);
7860       if (**ptr == '}')
7861         (*ptr)++;
7862     } else {
7863       (*ptr)++;
7864     }
7865   }
7866 
7867   KMP_ASSERT(format_index <= FORMAT_SIZE);
7868   return rc;
7869 }
7870 
7871 /*
7872  * Return number of characters needed to hold the affinity string
7873  * (not including null byte character)
7874  * The resultant string is printed to buffer, which the caller can then
7875  * handle afterwards
7876 */
7877 size_t __kmp_aux_capture_affinity(int gtid, const char *format,
7878                                   kmp_str_buf_t *buffer) {
7879   const char *parse_ptr;
7880   size_t retval;
7881   const kmp_info_t *th;
7882   kmp_str_buf_t field;
7883 
7884   KMP_DEBUG_ASSERT(buffer);
7885   KMP_DEBUG_ASSERT(gtid >= 0);
7886 
7887   __kmp_str_buf_init(&field);
7888   __kmp_str_buf_clear(buffer);
7889 
7890   th = __kmp_threads[gtid];
7891   retval = 0;
7892 
7893   // If format is NULL or zero-length string, then we use
7894   // affinity-format-var ICV
7895   parse_ptr = format;
7896   if (parse_ptr == NULL || *parse_ptr == '\0') {
7897     parse_ptr = __kmp_affinity_format;
7898   }
7899   KMP_DEBUG_ASSERT(parse_ptr);
7900 
7901   while (*parse_ptr != '\0') {
7902     // Parse a field
7903     if (*parse_ptr == '%') {
7904       // Put field in the buffer
7905       int rc = __kmp_aux_capture_affinity_field(gtid, th, &parse_ptr, &field);
7906       __kmp_str_buf_catbuf(buffer, &field);
7907       retval += rc;
7908     } else {
7909       // Put literal character in buffer
7910       __kmp_str_buf_cat(buffer, parse_ptr, 1);
7911       retval++;
7912       parse_ptr++;
7913     }
7914   }
7915   __kmp_str_buf_free(&field);
7916   return retval;
7917 }
7918 
7919 // Displays the affinity string to stdout
7920 void __kmp_aux_display_affinity(int gtid, const char *format) {
7921   kmp_str_buf_t buf;
7922   __kmp_str_buf_init(&buf);
7923   __kmp_aux_capture_affinity(gtid, format, &buf);
7924   __kmp_fprintf(kmp_out, "%s" KMP_END_OF_LINE, buf.str);
7925   __kmp_str_buf_free(&buf);
7926 }
7927 
7928 /* ------------------------------------------------------------------------ */
7929 
7930 void __kmp_aux_set_blocktime(int arg, kmp_info_t *thread, int tid) {
7931   int blocktime = arg; /* argument is in milliseconds */
7932 #if KMP_USE_MONITOR
7933   int bt_intervals;
7934 #endif
7935   int bt_set;
7936 
7937   __kmp_save_internal_controls(thread);
7938 
7939   /* Normalize and set blocktime for the teams */
7940   if (blocktime < KMP_MIN_BLOCKTIME)
7941     blocktime = KMP_MIN_BLOCKTIME;
7942   else if (blocktime > KMP_MAX_BLOCKTIME)
7943     blocktime = KMP_MAX_BLOCKTIME;
7944 
7945   set__blocktime_team(thread->th.th_team, tid, blocktime);
7946   set__blocktime_team(thread->th.th_serial_team, 0, blocktime);
7947 
7948 #if KMP_USE_MONITOR
7949   /* Calculate and set blocktime intervals for the teams */
7950   bt_intervals = KMP_INTERVALS_FROM_BLOCKTIME(blocktime, __kmp_monitor_wakeups);
7951 
7952   set__bt_intervals_team(thread->th.th_team, tid, bt_intervals);
7953   set__bt_intervals_team(thread->th.th_serial_team, 0, bt_intervals);
7954 #endif
7955 
7956   /* Set whether blocktime has been set to "TRUE" */
7957   bt_set = TRUE;
7958 
7959   set__bt_set_team(thread->th.th_team, tid, bt_set);
7960   set__bt_set_team(thread->th.th_serial_team, 0, bt_set);
7961 #if KMP_USE_MONITOR
7962   KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d, "
7963                 "bt_intervals=%d, monitor_updates=%d\n",
7964                 __kmp_gtid_from_tid(tid, thread->th.th_team),
7965                 thread->th.th_team->t.t_id, tid, blocktime, bt_intervals,
7966                 __kmp_monitor_wakeups));
7967 #else
7968   KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d\n",
7969                 __kmp_gtid_from_tid(tid, thread->th.th_team),
7970                 thread->th.th_team->t.t_id, tid, blocktime));
7971 #endif
7972 }
7973 
7974 void __kmp_aux_set_defaults(char const *str, int len) {
7975   if (!__kmp_init_serial) {
7976     __kmp_serial_initialize();
7977   }
7978   __kmp_env_initialize(str);
7979 
7980   if (__kmp_settings || __kmp_display_env || __kmp_display_env_verbose) {
7981     __kmp_env_print();
7982   }
7983 } // __kmp_aux_set_defaults
7984 
7985 /* ------------------------------------------------------------------------ */
7986 /* internal fast reduction routines */
7987 
7988 PACKED_REDUCTION_METHOD_T
7989 __kmp_determine_reduction_method(
7990     ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars, size_t reduce_size,
7991     void *reduce_data, void (*reduce_func)(void *lhs_data, void *rhs_data),
7992     kmp_critical_name *lck) {
7993 
7994   // Default reduction method: critical construct ( lck != NULL, like in current
7995   // PAROPT )
7996   // If ( reduce_data!=NULL && reduce_func!=NULL ): the tree-reduction method
7997   // can be selected by RTL
7998   // If loc->flags contains KMP_IDENT_ATOMIC_REDUCE, the atomic reduce method
7999   // can be selected by RTL
8000   // Finally, it's up to OpenMP RTL to make a decision on which method to select
8001   // among generated by PAROPT.
8002 
8003   PACKED_REDUCTION_METHOD_T retval;
8004 
8005   int team_size;
8006 
8007   KMP_DEBUG_ASSERT(loc); // it would be nice to test ( loc != 0 )
8008   KMP_DEBUG_ASSERT(lck); // it would be nice to test ( lck != 0 )
8009 
8010 #define FAST_REDUCTION_ATOMIC_METHOD_GENERATED                                 \
8011   ((loc->flags & (KMP_IDENT_ATOMIC_REDUCE)) == (KMP_IDENT_ATOMIC_REDUCE))
8012 #define FAST_REDUCTION_TREE_METHOD_GENERATED ((reduce_data) && (reduce_func))
8013 
8014   retval = critical_reduce_block;
8015 
8016   // another choice of getting a team size (with 1 dynamic deference) is slower
8017   team_size = __kmp_get_team_num_threads(global_tid);
8018   if (team_size == 1) {
8019 
8020     retval = empty_reduce_block;
8021 
8022   } else {
8023 
8024     int atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;
8025 
8026 #if KMP_ARCH_X86_64 || KMP_ARCH_PPC64 || KMP_ARCH_AARCH64 || KMP_ARCH_MIPS64
8027 
8028 #if KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD ||     \
8029     KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN || KMP_OS_HURD
8030 
8031     int teamsize_cutoff = 4;
8032 
8033 #if KMP_MIC_SUPPORTED
8034     if (__kmp_mic_type != non_mic) {
8035       teamsize_cutoff = 8;
8036     }
8037 #endif
8038     int tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
8039     if (tree_available) {
8040       if (team_size <= teamsize_cutoff) {
8041         if (atomic_available) {
8042           retval = atomic_reduce_block;
8043         }
8044       } else {
8045         retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER;
8046       }
8047     } else if (atomic_available) {
8048       retval = atomic_reduce_block;
8049     }
8050 #else
8051 #error "Unknown or unsupported OS"
8052 #endif // KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD ||
8053        // KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN || KMP_OS_HURD
8054 
8055 #elif KMP_ARCH_X86 || KMP_ARCH_ARM || KMP_ARCH_AARCH || KMP_ARCH_MIPS
8056 
8057 #if KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_WINDOWS || KMP_OS_HURD
8058 
8059     // basic tuning
8060 
8061     if (atomic_available) {
8062       if (num_vars <= 2) { // && ( team_size <= 8 ) due to false-sharing ???
8063         retval = atomic_reduce_block;
8064       }
8065     } // otherwise: use critical section
8066 
8067 #elif KMP_OS_DARWIN
8068 
8069     int tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
8070     if (atomic_available && (num_vars <= 3)) {
8071       retval = atomic_reduce_block;
8072     } else if (tree_available) {
8073       if ((reduce_size > (9 * sizeof(kmp_real64))) &&
8074           (reduce_size < (2000 * sizeof(kmp_real64)))) {
8075         retval = TREE_REDUCE_BLOCK_WITH_PLAIN_BARRIER;
8076       }
8077     } // otherwise: use critical section
8078 
8079 #else
8080 #error "Unknown or unsupported OS"
8081 #endif
8082 
8083 #else
8084 #error "Unknown or unsupported architecture"
8085 #endif
8086   }
8087 
8088   // KMP_FORCE_REDUCTION
8089 
8090   // If the team is serialized (team_size == 1), ignore the forced reduction
8091   // method and stay with the unsynchronized method (empty_reduce_block)
8092   if (__kmp_force_reduction_method != reduction_method_not_defined &&
8093       team_size != 1) {
8094 
8095     PACKED_REDUCTION_METHOD_T forced_retval = critical_reduce_block;
8096 
8097     int atomic_available, tree_available;
8098 
8099     switch ((forced_retval = __kmp_force_reduction_method)) {
8100     case critical_reduce_block:
8101       KMP_ASSERT(lck); // lck should be != 0
8102       break;
8103 
8104     case atomic_reduce_block:
8105       atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;
8106       if (!atomic_available) {
8107         KMP_WARNING(RedMethodNotSupported, "atomic");
8108         forced_retval = critical_reduce_block;
8109       }
8110       break;
8111 
8112     case tree_reduce_block:
8113       tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
8114       if (!tree_available) {
8115         KMP_WARNING(RedMethodNotSupported, "tree");
8116         forced_retval = critical_reduce_block;
8117       } else {
8118 #if KMP_FAST_REDUCTION_BARRIER
8119         forced_retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER;
8120 #endif
8121       }
8122       break;
8123 
8124     default:
8125       KMP_ASSERT(0); // "unsupported method specified"
8126     }
8127 
8128     retval = forced_retval;
8129   }
8130 
8131   KA_TRACE(10, ("reduction method selected=%08x\n", retval));
8132 
8133 #undef FAST_REDUCTION_TREE_METHOD_GENERATED
8134 #undef FAST_REDUCTION_ATOMIC_METHOD_GENERATED
8135 
8136   return (retval);
8137 }
8138 
8139 // this function is for testing set/get/determine reduce method
8140 kmp_int32 __kmp_get_reduce_method(void) {
8141   return ((__kmp_entry_thread()->th.th_local.packed_reduction_method) >> 8);
8142 }
8143 
8144 // Soft pause sets up threads to ignore blocktime and just go to sleep.
8145 // Spin-wait code checks __kmp_pause_status and reacts accordingly.
8146 void __kmp_soft_pause() { __kmp_pause_status = kmp_soft_paused; }
8147 
8148 // Hard pause shuts down the runtime completely.  Resume happens naturally when
8149 // OpenMP is used subsequently.
8150 void __kmp_hard_pause() {
8151   __kmp_pause_status = kmp_hard_paused;
8152   __kmp_internal_end_thread(-1);
8153 }
8154 
8155 // Soft resume sets __kmp_pause_status, and wakes up all threads.
8156 void __kmp_resume_if_soft_paused() {
8157   if (__kmp_pause_status == kmp_soft_paused) {
8158     __kmp_pause_status = kmp_not_paused;
8159 
8160     for (int gtid = 1; gtid < __kmp_threads_capacity; ++gtid) {
8161       kmp_info_t *thread = __kmp_threads[gtid];
8162       if (thread) { // Wake it if sleeping
8163         kmp_flag_64 fl(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go, thread);
8164         if (fl.is_sleeping())
8165           fl.resume(gtid);
8166         else if (__kmp_try_suspend_mx(thread)) { // got suspend lock
8167           __kmp_unlock_suspend_mx(thread); // unlock it; it won't sleep
8168         } else { // thread holds the lock and may sleep soon
8169           do { // until either the thread sleeps, or we can get the lock
8170             if (fl.is_sleeping()) {
8171               fl.resume(gtid);
8172               break;
8173             } else if (__kmp_try_suspend_mx(thread)) {
8174               __kmp_unlock_suspend_mx(thread);
8175               break;
8176             }
8177           } while (1);
8178         }
8179       }
8180     }
8181   }
8182 }
8183 
8184 // This function is called via __kmpc_pause_resource. Returns 0 if successful.
8185 // TODO: add warning messages
8186 int __kmp_pause_resource(kmp_pause_status_t level) {
8187   if (level == kmp_not_paused) { // requesting resume
8188     if (__kmp_pause_status == kmp_not_paused) {
8189       // error message about runtime not being paused, so can't resume
8190       return 1;
8191     } else {
8192       KMP_DEBUG_ASSERT(__kmp_pause_status == kmp_soft_paused ||
8193                        __kmp_pause_status == kmp_hard_paused);
8194       __kmp_pause_status = kmp_not_paused;
8195       return 0;
8196     }
8197   } else if (level == kmp_soft_paused) { // requesting soft pause
8198     if (__kmp_pause_status != kmp_not_paused) {
8199       // error message about already being paused
8200       return 1;
8201     } else {
8202       __kmp_soft_pause();
8203       return 0;
8204     }
8205   } else if (level == kmp_hard_paused) { // requesting hard pause
8206     if (__kmp_pause_status != kmp_not_paused) {
8207       // error message about already being paused
8208       return 1;
8209     } else {
8210       __kmp_hard_pause();
8211       return 0;
8212     }
8213   } else {
8214     // error message about invalid level
8215     return 1;
8216   }
8217 }
8218