xref: /freebsd/contrib/llvm-project/openmp/runtime/src/kmp_runtime.cpp (revision bc5304a006238115291e7568583632889dffbab9)
1 /*
2  * kmp_runtime.cpp -- KPTS runtime support library
3  */
4 
5 //===----------------------------------------------------------------------===//
6 //
7 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8 // See https://llvm.org/LICENSE.txt for license information.
9 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "kmp.h"
14 #include "kmp_affinity.h"
15 #include "kmp_atomic.h"
16 #include "kmp_environment.h"
17 #include "kmp_error.h"
18 #include "kmp_i18n.h"
19 #include "kmp_io.h"
20 #include "kmp_itt.h"
21 #include "kmp_settings.h"
22 #include "kmp_stats.h"
23 #include "kmp_str.h"
24 #include "kmp_wait_release.h"
25 #include "kmp_wrapper_getpid.h"
26 #include "kmp_dispatch.h"
27 #if KMP_USE_HIER_SCHED
28 #include "kmp_dispatch_hier.h"
29 #endif
30 
31 #if OMPT_SUPPORT
32 #include "ompt-specific.h"
33 #endif
34 
35 #if OMP_PROFILING_SUPPORT
36 #include "llvm/Support/TimeProfiler.h"
37 static char *ProfileTraceFile = nullptr;
38 #endif
39 
40 /* these are temporary issues to be dealt with */
41 #define KMP_USE_PRCTL 0
42 
43 #if KMP_OS_WINDOWS
44 #include <process.h>
45 #endif
46 
47 #include "tsan_annotations.h"
48 
49 #if KMP_OS_WINDOWS
50 // windows does not need include files as it doesn't use shared memory
51 #else
52 #include <sys/mman.h>
53 #include <sys/stat.h>
54 #include <fcntl.h>
55 #define SHM_SIZE 1024
56 #endif
57 
58 #if defined(KMP_GOMP_COMPAT)
59 char const __kmp_version_alt_comp[] =
60     KMP_VERSION_PREFIX "alternative compiler support: yes";
61 #endif /* defined(KMP_GOMP_COMPAT) */
62 
63 char const __kmp_version_omp_api[] =
64     KMP_VERSION_PREFIX "API version: 5.0 (201611)";
65 
66 #ifdef KMP_DEBUG
67 char const __kmp_version_lock[] =
68     KMP_VERSION_PREFIX "lock type: run time selectable";
69 #endif /* KMP_DEBUG */
70 
71 #define KMP_MIN(x, y) ((x) < (y) ? (x) : (y))
72 
73 /* ------------------------------------------------------------------------ */
74 
75 #if KMP_USE_MONITOR
76 kmp_info_t __kmp_monitor;
77 #endif
78 
79 /* Forward declarations */
80 
81 void __kmp_cleanup(void);
82 
83 static void __kmp_initialize_info(kmp_info_t *, kmp_team_t *, int tid,
84                                   int gtid);
85 static void __kmp_initialize_team(kmp_team_t *team, int new_nproc,
86                                   kmp_internal_control_t *new_icvs,
87                                   ident_t *loc);
88 #if KMP_AFFINITY_SUPPORTED
89 static void __kmp_partition_places(kmp_team_t *team,
90                                    int update_master_only = 0);
91 #endif
92 static void __kmp_do_serial_initialize(void);
93 void __kmp_fork_barrier(int gtid, int tid);
94 void __kmp_join_barrier(int gtid);
95 void __kmp_setup_icv_copy(kmp_team_t *team, int new_nproc,
96                           kmp_internal_control_t *new_icvs, ident_t *loc);
97 
98 #ifdef USE_LOAD_BALANCE
99 static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc);
100 #endif
101 
102 static int __kmp_expand_threads(int nNeed);
103 #if KMP_OS_WINDOWS
104 static int __kmp_unregister_root_other_thread(int gtid);
105 #endif
106 static void __kmp_reap_thread(kmp_info_t *thread, int is_root);
107 kmp_info_t *__kmp_thread_pool_insert_pt = NULL;
108 
109 /* Calculate the identifier of the current thread */
110 /* fast (and somewhat portable) way to get unique identifier of executing
111    thread. Returns KMP_GTID_DNE if we haven't been assigned a gtid. */
112 int __kmp_get_global_thread_id() {
113   int i;
114   kmp_info_t **other_threads;
115   size_t stack_data;
116   char *stack_addr;
117   size_t stack_size;
118   char *stack_base;
119 
120   KA_TRACE(
121       1000,
122       ("*** __kmp_get_global_thread_id: entering, nproc=%d  all_nproc=%d\n",
123        __kmp_nth, __kmp_all_nth));
124 
125   /* JPH - to handle the case where __kmpc_end(0) is called immediately prior to
126      a parallel region, made it return KMP_GTID_DNE to force serial_initialize
127      by caller. Had to handle KMP_GTID_DNE at all call-sites, or else guarantee
128      __kmp_init_gtid for this to work. */
129 
130   if (!TCR_4(__kmp_init_gtid))
131     return KMP_GTID_DNE;
132 
133 #ifdef KMP_TDATA_GTID
134   if (TCR_4(__kmp_gtid_mode) >= 3) {
135     KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using TDATA\n"));
136     return __kmp_gtid;
137   }
138 #endif
139   if (TCR_4(__kmp_gtid_mode) >= 2) {
140     KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using keyed TLS\n"));
141     return __kmp_gtid_get_specific();
142   }
143   KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using internal alg.\n"));
144 
145   stack_addr = (char *)&stack_data;
146   other_threads = __kmp_threads;
147 
148   /* ATT: The code below is a source of potential bugs due to unsynchronized
149      access to __kmp_threads array. For example:
150      1. Current thread loads other_threads[i] to thr and checks it, it is
151         non-NULL.
152      2. Current thread is suspended by OS.
153      3. Another thread unregisters and finishes (debug versions of free()
154         may fill memory with something like 0xEF).
155      4. Current thread is resumed.
156      5. Current thread reads junk from *thr.
157      TODO: Fix it.  --ln  */
158 
159   for (i = 0; i < __kmp_threads_capacity; i++) {
160 
161     kmp_info_t *thr = (kmp_info_t *)TCR_SYNC_PTR(other_threads[i]);
162     if (!thr)
163       continue;
164 
165     stack_size = (size_t)TCR_PTR(thr->th.th_info.ds.ds_stacksize);
166     stack_base = (char *)TCR_PTR(thr->th.th_info.ds.ds_stackbase);
167 
168     /* stack grows down -- search through all of the active threads */
169 
170     if (stack_addr <= stack_base) {
171       size_t stack_diff = stack_base - stack_addr;
172 
173       if (stack_diff <= stack_size) {
174         /* The only way we can be closer than the allocated */
175         /* stack size is if we are running on this thread. */
176         KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() == i);
177         return i;
178       }
179     }
180   }
181 
182   /* get specific to try and determine our gtid */
183   KA_TRACE(1000,
184            ("*** __kmp_get_global_thread_id: internal alg. failed to find "
185             "thread, using TLS\n"));
186   i = __kmp_gtid_get_specific();
187 
188   /*fprintf( stderr, "=== %d\n", i );  */ /* GROO */
189 
190   /* if we havn't been assigned a gtid, then return code */
191   if (i < 0)
192     return i;
193 
194   /* dynamically updated stack window for uber threads to avoid get_specific
195      call */
196   if (!TCR_4(other_threads[i]->th.th_info.ds.ds_stackgrow)) {
197     KMP_FATAL(StackOverflow, i);
198   }
199 
200   stack_base = (char *)other_threads[i]->th.th_info.ds.ds_stackbase;
201   if (stack_addr > stack_base) {
202     TCW_PTR(other_threads[i]->th.th_info.ds.ds_stackbase, stack_addr);
203     TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize,
204             other_threads[i]->th.th_info.ds.ds_stacksize + stack_addr -
205                 stack_base);
206   } else {
207     TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize,
208             stack_base - stack_addr);
209   }
210 
211   /* Reprint stack bounds for ubermaster since they have been refined */
212   if (__kmp_storage_map) {
213     char *stack_end = (char *)other_threads[i]->th.th_info.ds.ds_stackbase;
214     char *stack_beg = stack_end - other_threads[i]->th.th_info.ds.ds_stacksize;
215     __kmp_print_storage_map_gtid(i, stack_beg, stack_end,
216                                  other_threads[i]->th.th_info.ds.ds_stacksize,
217                                  "th_%d stack (refinement)", i);
218   }
219   return i;
220 }
221 
222 int __kmp_get_global_thread_id_reg() {
223   int gtid;
224 
225   if (!__kmp_init_serial) {
226     gtid = KMP_GTID_DNE;
227   } else
228 #ifdef KMP_TDATA_GTID
229       if (TCR_4(__kmp_gtid_mode) >= 3) {
230     KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using TDATA\n"));
231     gtid = __kmp_gtid;
232   } else
233 #endif
234       if (TCR_4(__kmp_gtid_mode) >= 2) {
235     KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using keyed TLS\n"));
236     gtid = __kmp_gtid_get_specific();
237   } else {
238     KA_TRACE(1000,
239              ("*** __kmp_get_global_thread_id_reg: using internal alg.\n"));
240     gtid = __kmp_get_global_thread_id();
241   }
242 
243   /* we must be a new uber master sibling thread */
244   if (gtid == KMP_GTID_DNE) {
245     KA_TRACE(10,
246              ("__kmp_get_global_thread_id_reg: Encountered new root thread. "
247               "Registering a new gtid.\n"));
248     __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
249     if (!__kmp_init_serial) {
250       __kmp_do_serial_initialize();
251       gtid = __kmp_gtid_get_specific();
252     } else {
253       gtid = __kmp_register_root(FALSE);
254     }
255     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
256     /*__kmp_printf( "+++ %d\n", gtid ); */ /* GROO */
257   }
258 
259   KMP_DEBUG_ASSERT(gtid >= 0);
260 
261   return gtid;
262 }
263 
264 /* caller must hold forkjoin_lock */
265 void __kmp_check_stack_overlap(kmp_info_t *th) {
266   int f;
267   char *stack_beg = NULL;
268   char *stack_end = NULL;
269   int gtid;
270 
271   KA_TRACE(10, ("__kmp_check_stack_overlap: called\n"));
272   if (__kmp_storage_map) {
273     stack_end = (char *)th->th.th_info.ds.ds_stackbase;
274     stack_beg = stack_end - th->th.th_info.ds.ds_stacksize;
275 
276     gtid = __kmp_gtid_from_thread(th);
277 
278     if (gtid == KMP_GTID_MONITOR) {
279       __kmp_print_storage_map_gtid(
280           gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize,
281           "th_%s stack (%s)", "mon",
282           (th->th.th_info.ds.ds_stackgrow) ? "initial" : "actual");
283     } else {
284       __kmp_print_storage_map_gtid(
285           gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize,
286           "th_%d stack (%s)", gtid,
287           (th->th.th_info.ds.ds_stackgrow) ? "initial" : "actual");
288     }
289   }
290 
291   /* No point in checking ubermaster threads since they use refinement and
292    * cannot overlap */
293   gtid = __kmp_gtid_from_thread(th);
294   if (__kmp_env_checks == TRUE && !KMP_UBER_GTID(gtid)) {
295     KA_TRACE(10,
296              ("__kmp_check_stack_overlap: performing extensive checking\n"));
297     if (stack_beg == NULL) {
298       stack_end = (char *)th->th.th_info.ds.ds_stackbase;
299       stack_beg = stack_end - th->th.th_info.ds.ds_stacksize;
300     }
301 
302     for (f = 0; f < __kmp_threads_capacity; f++) {
303       kmp_info_t *f_th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[f]);
304 
305       if (f_th && f_th != th) {
306         char *other_stack_end =
307             (char *)TCR_PTR(f_th->th.th_info.ds.ds_stackbase);
308         char *other_stack_beg =
309             other_stack_end - (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize);
310         if ((stack_beg > other_stack_beg && stack_beg < other_stack_end) ||
311             (stack_end > other_stack_beg && stack_end < other_stack_end)) {
312 
313           /* Print the other stack values before the abort */
314           if (__kmp_storage_map)
315             __kmp_print_storage_map_gtid(
316                 -1, other_stack_beg, other_stack_end,
317                 (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize),
318                 "th_%d stack (overlapped)", __kmp_gtid_from_thread(f_th));
319 
320           __kmp_fatal(KMP_MSG(StackOverlap), KMP_HNT(ChangeStackLimit),
321                       __kmp_msg_null);
322         }
323       }
324     }
325   }
326   KA_TRACE(10, ("__kmp_check_stack_overlap: returning\n"));
327 }
328 
329 /* ------------------------------------------------------------------------ */
330 
331 void __kmp_infinite_loop(void) {
332   static int done = FALSE;
333 
334   while (!done) {
335     KMP_YIELD(TRUE);
336   }
337 }
338 
339 #define MAX_MESSAGE 512
340 
341 void __kmp_print_storage_map_gtid(int gtid, void *p1, void *p2, size_t size,
342                                   char const *format, ...) {
343   char buffer[MAX_MESSAGE];
344   va_list ap;
345 
346   va_start(ap, format);
347   KMP_SNPRINTF(buffer, sizeof(buffer), "OMP storage map: %p %p%8lu %s\n", p1,
348                p2, (unsigned long)size, format);
349   __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock);
350   __kmp_vprintf(kmp_err, buffer, ap);
351 #if KMP_PRINT_DATA_PLACEMENT
352   int node;
353   if (gtid >= 0) {
354     if (p1 <= p2 && (char *)p2 - (char *)p1 == size) {
355       if (__kmp_storage_map_verbose) {
356         node = __kmp_get_host_node(p1);
357         if (node < 0) /* doesn't work, so don't try this next time */
358           __kmp_storage_map_verbose = FALSE;
359         else {
360           char *last;
361           int lastNode;
362           int localProc = __kmp_get_cpu_from_gtid(gtid);
363 
364           const int page_size = KMP_GET_PAGE_SIZE();
365 
366           p1 = (void *)((size_t)p1 & ~((size_t)page_size - 1));
367           p2 = (void *)(((size_t)p2 - 1) & ~((size_t)page_size - 1));
368           if (localProc >= 0)
369             __kmp_printf_no_lock("  GTID %d localNode %d\n", gtid,
370                                  localProc >> 1);
371           else
372             __kmp_printf_no_lock("  GTID %d\n", gtid);
373 #if KMP_USE_PRCTL
374           /* The more elaborate format is disabled for now because of the prctl
375            * hanging bug. */
376           do {
377             last = p1;
378             lastNode = node;
379             /* This loop collates adjacent pages with the same host node. */
380             do {
381               (char *)p1 += page_size;
382             } while (p1 <= p2 && (node = __kmp_get_host_node(p1)) == lastNode);
383             __kmp_printf_no_lock("    %p-%p memNode %d\n", last, (char *)p1 - 1,
384                                  lastNode);
385           } while (p1 <= p2);
386 #else
387           __kmp_printf_no_lock("    %p-%p memNode %d\n", p1,
388                                (char *)p1 + (page_size - 1),
389                                __kmp_get_host_node(p1));
390           if (p1 < p2) {
391             __kmp_printf_no_lock("    %p-%p memNode %d\n", p2,
392                                  (char *)p2 + (page_size - 1),
393                                  __kmp_get_host_node(p2));
394           }
395 #endif
396         }
397       }
398     } else
399       __kmp_printf_no_lock("  %s\n", KMP_I18N_STR(StorageMapWarning));
400   }
401 #endif /* KMP_PRINT_DATA_PLACEMENT */
402   __kmp_release_bootstrap_lock(&__kmp_stdio_lock);
403 }
404 
405 void __kmp_warn(char const *format, ...) {
406   char buffer[MAX_MESSAGE];
407   va_list ap;
408 
409   if (__kmp_generate_warnings == kmp_warnings_off) {
410     return;
411   }
412 
413   va_start(ap, format);
414 
415   KMP_SNPRINTF(buffer, sizeof(buffer), "OMP warning: %s\n", format);
416   __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock);
417   __kmp_vprintf(kmp_err, buffer, ap);
418   __kmp_release_bootstrap_lock(&__kmp_stdio_lock);
419 
420   va_end(ap);
421 }
422 
423 void __kmp_abort_process() {
424   // Later threads may stall here, but that's ok because abort() will kill them.
425   __kmp_acquire_bootstrap_lock(&__kmp_exit_lock);
426 
427   if (__kmp_debug_buf) {
428     __kmp_dump_debug_buffer();
429   }
430 
431   if (KMP_OS_WINDOWS) {
432     // Let other threads know of abnormal termination and prevent deadlock
433     // if abort happened during library initialization or shutdown
434     __kmp_global.g.g_abort = SIGABRT;
435 
436     /* On Windows* OS by default abort() causes pop-up error box, which stalls
437        nightly testing. Unfortunately, we cannot reliably suppress pop-up error
438        boxes. _set_abort_behavior() works well, but this function is not
439        available in VS7 (this is not problem for DLL, but it is a problem for
440        static OpenMP RTL). SetErrorMode (and so, timelimit utility) does not
441        help, at least in some versions of MS C RTL.
442 
443        It seems following sequence is the only way to simulate abort() and
444        avoid pop-up error box. */
445     raise(SIGABRT);
446     _exit(3); // Just in case, if signal ignored, exit anyway.
447   } else {
448     __kmp_unregister_library();
449     abort();
450   }
451 
452   __kmp_infinite_loop();
453   __kmp_release_bootstrap_lock(&__kmp_exit_lock);
454 
455 } // __kmp_abort_process
456 
457 void __kmp_abort_thread(void) {
458   // TODO: Eliminate g_abort global variable and this function.
459   // In case of abort just call abort(), it will kill all the threads.
460   __kmp_infinite_loop();
461 } // __kmp_abort_thread
462 
463 /* Print out the storage map for the major kmp_info_t thread data structures
464    that are allocated together. */
465 
466 static void __kmp_print_thread_storage_map(kmp_info_t *thr, int gtid) {
467   __kmp_print_storage_map_gtid(gtid, thr, thr + 1, sizeof(kmp_info_t), "th_%d",
468                                gtid);
469 
470   __kmp_print_storage_map_gtid(gtid, &thr->th.th_info, &thr->th.th_team,
471                                sizeof(kmp_desc_t), "th_%d.th_info", gtid);
472 
473   __kmp_print_storage_map_gtid(gtid, &thr->th.th_local, &thr->th.th_pri_head,
474                                sizeof(kmp_local_t), "th_%d.th_local", gtid);
475 
476   __kmp_print_storage_map_gtid(
477       gtid, &thr->th.th_bar[0], &thr->th.th_bar[bs_last_barrier],
478       sizeof(kmp_balign_t) * bs_last_barrier, "th_%d.th_bar", gtid);
479 
480   __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_plain_barrier],
481                                &thr->th.th_bar[bs_plain_barrier + 1],
482                                sizeof(kmp_balign_t), "th_%d.th_bar[plain]",
483                                gtid);
484 
485   __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_forkjoin_barrier],
486                                &thr->th.th_bar[bs_forkjoin_barrier + 1],
487                                sizeof(kmp_balign_t), "th_%d.th_bar[forkjoin]",
488                                gtid);
489 
490 #if KMP_FAST_REDUCTION_BARRIER
491   __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_reduction_barrier],
492                                &thr->th.th_bar[bs_reduction_barrier + 1],
493                                sizeof(kmp_balign_t), "th_%d.th_bar[reduction]",
494                                gtid);
495 #endif // KMP_FAST_REDUCTION_BARRIER
496 }
497 
498 /* Print out the storage map for the major kmp_team_t team data structures
499    that are allocated together. */
500 
501 static void __kmp_print_team_storage_map(const char *header, kmp_team_t *team,
502                                          int team_id, int num_thr) {
503   int num_disp_buff = team->t.t_max_nproc > 1 ? __kmp_dispatch_num_buffers : 2;
504   __kmp_print_storage_map_gtid(-1, team, team + 1, sizeof(kmp_team_t), "%s_%d",
505                                header, team_id);
506 
507   __kmp_print_storage_map_gtid(-1, &team->t.t_bar[0],
508                                &team->t.t_bar[bs_last_barrier],
509                                sizeof(kmp_balign_team_t) * bs_last_barrier,
510                                "%s_%d.t_bar", header, team_id);
511 
512   __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_plain_barrier],
513                                &team->t.t_bar[bs_plain_barrier + 1],
514                                sizeof(kmp_balign_team_t), "%s_%d.t_bar[plain]",
515                                header, team_id);
516 
517   __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_forkjoin_barrier],
518                                &team->t.t_bar[bs_forkjoin_barrier + 1],
519                                sizeof(kmp_balign_team_t),
520                                "%s_%d.t_bar[forkjoin]", header, team_id);
521 
522 #if KMP_FAST_REDUCTION_BARRIER
523   __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_reduction_barrier],
524                                &team->t.t_bar[bs_reduction_barrier + 1],
525                                sizeof(kmp_balign_team_t),
526                                "%s_%d.t_bar[reduction]", header, team_id);
527 #endif // KMP_FAST_REDUCTION_BARRIER
528 
529   __kmp_print_storage_map_gtid(
530       -1, &team->t.t_dispatch[0], &team->t.t_dispatch[num_thr],
531       sizeof(kmp_disp_t) * num_thr, "%s_%d.t_dispatch", header, team_id);
532 
533   __kmp_print_storage_map_gtid(
534       -1, &team->t.t_threads[0], &team->t.t_threads[num_thr],
535       sizeof(kmp_info_t *) * num_thr, "%s_%d.t_threads", header, team_id);
536 
537   __kmp_print_storage_map_gtid(-1, &team->t.t_disp_buffer[0],
538                                &team->t.t_disp_buffer[num_disp_buff],
539                                sizeof(dispatch_shared_info_t) * num_disp_buff,
540                                "%s_%d.t_disp_buffer", header, team_id);
541 }
542 
543 static void __kmp_init_allocator() { __kmp_init_memkind(); }
544 static void __kmp_fini_allocator() { __kmp_fini_memkind(); }
545 
546 /* ------------------------------------------------------------------------ */
547 
548 #if KMP_DYNAMIC_LIB
549 #if KMP_OS_WINDOWS
550 
551 static void __kmp_reset_lock(kmp_bootstrap_lock_t *lck) {
552   // TODO: Change to __kmp_break_bootstrap_lock().
553   __kmp_init_bootstrap_lock(lck); // make the lock released
554 }
555 
556 static void __kmp_reset_locks_on_process_detach(int gtid_req) {
557   int i;
558   int thread_count;
559 
560   // PROCESS_DETACH is expected to be called by a thread that executes
561   // ProcessExit() or FreeLibrary(). OS terminates other threads (except the one
562   // calling ProcessExit or FreeLibrary). So, it might be safe to access the
563   // __kmp_threads[] without taking the forkjoin_lock. However, in fact, some
564   // threads can be still alive here, although being about to be terminated. The
565   // threads in the array with ds_thread==0 are most suspicious. Actually, it
566   // can be not safe to access the __kmp_threads[].
567 
568   // TODO: does it make sense to check __kmp_roots[] ?
569 
570   // Let's check that there are no other alive threads registered with the OMP
571   // lib.
572   while (1) {
573     thread_count = 0;
574     for (i = 0; i < __kmp_threads_capacity; ++i) {
575       if (!__kmp_threads)
576         continue;
577       kmp_info_t *th = __kmp_threads[i];
578       if (th == NULL)
579         continue;
580       int gtid = th->th.th_info.ds.ds_gtid;
581       if (gtid == gtid_req)
582         continue;
583       if (gtid < 0)
584         continue;
585       DWORD exit_val;
586       int alive = __kmp_is_thread_alive(th, &exit_val);
587       if (alive) {
588         ++thread_count;
589       }
590     }
591     if (thread_count == 0)
592       break; // success
593   }
594 
595   // Assume that I'm alone. Now it might be safe to check and reset locks.
596   // __kmp_forkjoin_lock and __kmp_stdio_lock are expected to be reset.
597   __kmp_reset_lock(&__kmp_forkjoin_lock);
598 #ifdef KMP_DEBUG
599   __kmp_reset_lock(&__kmp_stdio_lock);
600 #endif // KMP_DEBUG
601 }
602 
603 BOOL WINAPI DllMain(HINSTANCE hInstDLL, DWORD fdwReason, LPVOID lpReserved) {
604   //__kmp_acquire_bootstrap_lock( &__kmp_initz_lock );
605 
606   switch (fdwReason) {
607 
608   case DLL_PROCESS_ATTACH:
609     KA_TRACE(10, ("DllMain: PROCESS_ATTACH\n"));
610 
611     return TRUE;
612 
613   case DLL_PROCESS_DETACH:
614     KA_TRACE(10, ("DllMain: PROCESS_DETACH T#%d\n", __kmp_gtid_get_specific()));
615 
616     if (lpReserved != NULL) {
617       // lpReserved is used for telling the difference:
618       //   lpReserved == NULL when FreeLibrary() was called,
619       //   lpReserved != NULL when the process terminates.
620       // When FreeLibrary() is called, worker threads remain alive. So they will
621       // release the forkjoin lock by themselves. When the process terminates,
622       // worker threads disappear triggering the problem of unreleased forkjoin
623       // lock as described below.
624 
625       // A worker thread can take the forkjoin lock. The problem comes up if
626       // that worker thread becomes dead before it releases the forkjoin lock.
627       // The forkjoin lock remains taken, while the thread executing
628       // DllMain()->PROCESS_DETACH->__kmp_internal_end_library() below will try
629       // to take the forkjoin lock and will always fail, so that the application
630       // will never finish [normally]. This scenario is possible if
631       // __kmpc_end() has not been executed. It looks like it's not a corner
632       // case, but common cases:
633       // - the main function was compiled by an alternative compiler;
634       // - the main function was compiled by icl but without /Qopenmp
635       //   (application with plugins);
636       // - application terminates by calling C exit(), Fortran CALL EXIT() or
637       //   Fortran STOP.
638       // - alive foreign thread prevented __kmpc_end from doing cleanup.
639       //
640       // This is a hack to work around the problem.
641       // TODO: !!! figure out something better.
642       __kmp_reset_locks_on_process_detach(__kmp_gtid_get_specific());
643     }
644 
645     __kmp_internal_end_library(__kmp_gtid_get_specific());
646 
647     return TRUE;
648 
649   case DLL_THREAD_ATTACH:
650     KA_TRACE(10, ("DllMain: THREAD_ATTACH\n"));
651 
652     /* if we want to register new siblings all the time here call
653      * __kmp_get_gtid(); */
654     return TRUE;
655 
656   case DLL_THREAD_DETACH:
657     KA_TRACE(10, ("DllMain: THREAD_DETACH T#%d\n", __kmp_gtid_get_specific()));
658 
659     __kmp_internal_end_thread(__kmp_gtid_get_specific());
660     return TRUE;
661   }
662 
663   return TRUE;
664 }
665 
666 #endif /* KMP_OS_WINDOWS */
667 #endif /* KMP_DYNAMIC_LIB */
668 
669 /* __kmp_parallel_deo -- Wait until it's our turn. */
670 void __kmp_parallel_deo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
671   int gtid = *gtid_ref;
672 #ifdef BUILD_PARALLEL_ORDERED
673   kmp_team_t *team = __kmp_team_from_gtid(gtid);
674 #endif /* BUILD_PARALLEL_ORDERED */
675 
676   if (__kmp_env_consistency_check) {
677     if (__kmp_threads[gtid]->th.th_root->r.r_active)
678 #if KMP_USE_DYNAMIC_LOCK
679       __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL, 0);
680 #else
681       __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL);
682 #endif
683   }
684 #ifdef BUILD_PARALLEL_ORDERED
685   if (!team->t.t_serialized) {
686     KMP_MB();
687     KMP_WAIT(&team->t.t_ordered.dt.t_value, __kmp_tid_from_gtid(gtid), KMP_EQ,
688              NULL);
689     KMP_MB();
690   }
691 #endif /* BUILD_PARALLEL_ORDERED */
692 }
693 
694 /* __kmp_parallel_dxo -- Signal the next task. */
695 void __kmp_parallel_dxo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
696   int gtid = *gtid_ref;
697 #ifdef BUILD_PARALLEL_ORDERED
698   int tid = __kmp_tid_from_gtid(gtid);
699   kmp_team_t *team = __kmp_team_from_gtid(gtid);
700 #endif /* BUILD_PARALLEL_ORDERED */
701 
702   if (__kmp_env_consistency_check) {
703     if (__kmp_threads[gtid]->th.th_root->r.r_active)
704       __kmp_pop_sync(gtid, ct_ordered_in_parallel, loc_ref);
705   }
706 #ifdef BUILD_PARALLEL_ORDERED
707   if (!team->t.t_serialized) {
708     KMP_MB(); /* Flush all pending memory write invalidates.  */
709 
710     /* use the tid of the next thread in this team */
711     /* TODO replace with general release procedure */
712     team->t.t_ordered.dt.t_value = ((tid + 1) % team->t.t_nproc);
713 
714     KMP_MB(); /* Flush all pending memory write invalidates.  */
715   }
716 #endif /* BUILD_PARALLEL_ORDERED */
717 }
718 
719 /* ------------------------------------------------------------------------ */
720 /* The BARRIER for a SINGLE process section is always explicit   */
721 
722 int __kmp_enter_single(int gtid, ident_t *id_ref, int push_ws) {
723   int status;
724   kmp_info_t *th;
725   kmp_team_t *team;
726 
727   if (!TCR_4(__kmp_init_parallel))
728     __kmp_parallel_initialize();
729   __kmp_resume_if_soft_paused();
730 
731   th = __kmp_threads[gtid];
732   team = th->th.th_team;
733   status = 0;
734 
735   th->th.th_ident = id_ref;
736 
737   if (team->t.t_serialized) {
738     status = 1;
739   } else {
740     kmp_int32 old_this = th->th.th_local.this_construct;
741 
742     ++th->th.th_local.this_construct;
743     /* try to set team count to thread count--success means thread got the
744        single block */
745     /* TODO: Should this be acquire or release? */
746     if (team->t.t_construct == old_this) {
747       status = __kmp_atomic_compare_store_acq(&team->t.t_construct, old_this,
748                                               th->th.th_local.this_construct);
749     }
750 #if USE_ITT_BUILD
751     if (__itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 &&
752         KMP_MASTER_GTID(gtid) && th->th.th_teams_microtask == NULL &&
753         team->t.t_active_level ==
754             1) { // Only report metadata by master of active team at level 1
755       __kmp_itt_metadata_single(id_ref);
756     }
757 #endif /* USE_ITT_BUILD */
758   }
759 
760   if (__kmp_env_consistency_check) {
761     if (status && push_ws) {
762       __kmp_push_workshare(gtid, ct_psingle, id_ref);
763     } else {
764       __kmp_check_workshare(gtid, ct_psingle, id_ref);
765     }
766   }
767 #if USE_ITT_BUILD
768   if (status) {
769     __kmp_itt_single_start(gtid);
770   }
771 #endif /* USE_ITT_BUILD */
772   return status;
773 }
774 
775 void __kmp_exit_single(int gtid) {
776 #if USE_ITT_BUILD
777   __kmp_itt_single_end(gtid);
778 #endif /* USE_ITT_BUILD */
779   if (__kmp_env_consistency_check)
780     __kmp_pop_workshare(gtid, ct_psingle, NULL);
781 }
782 
783 /* determine if we can go parallel or must use a serialized parallel region and
784  * how many threads we can use
785  * set_nproc is the number of threads requested for the team
786  * returns 0 if we should serialize or only use one thread,
787  * otherwise the number of threads to use
788  * The forkjoin lock is held by the caller. */
789 static int __kmp_reserve_threads(kmp_root_t *root, kmp_team_t *parent_team,
790                                  int master_tid, int set_nthreads,
791                                  int enter_teams) {
792   int capacity;
793   int new_nthreads;
794   KMP_DEBUG_ASSERT(__kmp_init_serial);
795   KMP_DEBUG_ASSERT(root && parent_team);
796   kmp_info_t *this_thr = parent_team->t.t_threads[master_tid];
797 
798   // If dyn-var is set, dynamically adjust the number of desired threads,
799   // according to the method specified by dynamic_mode.
800   new_nthreads = set_nthreads;
801   if (!get__dynamic_2(parent_team, master_tid)) {
802     ;
803   }
804 #ifdef USE_LOAD_BALANCE
805   else if (__kmp_global.g.g_dynamic_mode == dynamic_load_balance) {
806     new_nthreads = __kmp_load_balance_nproc(root, set_nthreads);
807     if (new_nthreads == 1) {
808       KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced "
809                     "reservation to 1 thread\n",
810                     master_tid));
811       return 1;
812     }
813     if (new_nthreads < set_nthreads) {
814       KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced "
815                     "reservation to %d threads\n",
816                     master_tid, new_nthreads));
817     }
818   }
819 #endif /* USE_LOAD_BALANCE */
820   else if (__kmp_global.g.g_dynamic_mode == dynamic_thread_limit) {
821     new_nthreads = __kmp_avail_proc - __kmp_nth +
822                    (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
823     if (new_nthreads <= 1) {
824       KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced "
825                     "reservation to 1 thread\n",
826                     master_tid));
827       return 1;
828     }
829     if (new_nthreads < set_nthreads) {
830       KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced "
831                     "reservation to %d threads\n",
832                     master_tid, new_nthreads));
833     } else {
834       new_nthreads = set_nthreads;
835     }
836   } else if (__kmp_global.g.g_dynamic_mode == dynamic_random) {
837     if (set_nthreads > 2) {
838       new_nthreads = __kmp_get_random(parent_team->t.t_threads[master_tid]);
839       new_nthreads = (new_nthreads % set_nthreads) + 1;
840       if (new_nthreads == 1) {
841         KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced "
842                       "reservation to 1 thread\n",
843                       master_tid));
844         return 1;
845       }
846       if (new_nthreads < set_nthreads) {
847         KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced "
848                       "reservation to %d threads\n",
849                       master_tid, new_nthreads));
850       }
851     }
852   } else {
853     KMP_ASSERT(0);
854   }
855 
856   // Respect KMP_ALL_THREADS/KMP_DEVICE_THREAD_LIMIT.
857   if (__kmp_nth + new_nthreads -
858           (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
859       __kmp_max_nth) {
860     int tl_nthreads = __kmp_max_nth - __kmp_nth +
861                       (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
862     if (tl_nthreads <= 0) {
863       tl_nthreads = 1;
864     }
865 
866     // If dyn-var is false, emit a 1-time warning.
867     if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
868       __kmp_reserve_warn = 1;
869       __kmp_msg(kmp_ms_warning,
870                 KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads),
871                 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
872     }
873     if (tl_nthreads == 1) {
874       KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT "
875                     "reduced reservation to 1 thread\n",
876                     master_tid));
877       return 1;
878     }
879     KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT reduced "
880                   "reservation to %d threads\n",
881                   master_tid, tl_nthreads));
882     new_nthreads = tl_nthreads;
883   }
884 
885   // Respect OMP_THREAD_LIMIT
886   int cg_nthreads = this_thr->th.th_cg_roots->cg_nthreads;
887   int max_cg_threads = this_thr->th.th_cg_roots->cg_thread_limit;
888   if (cg_nthreads + new_nthreads -
889           (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
890       max_cg_threads) {
891     int tl_nthreads = max_cg_threads - cg_nthreads +
892                       (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
893     if (tl_nthreads <= 0) {
894       tl_nthreads = 1;
895     }
896 
897     // If dyn-var is false, emit a 1-time warning.
898     if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
899       __kmp_reserve_warn = 1;
900       __kmp_msg(kmp_ms_warning,
901                 KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads),
902                 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
903     }
904     if (tl_nthreads == 1) {
905       KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT "
906                     "reduced reservation to 1 thread\n",
907                     master_tid));
908       return 1;
909     }
910     KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT reduced "
911                   "reservation to %d threads\n",
912                   master_tid, tl_nthreads));
913     new_nthreads = tl_nthreads;
914   }
915 
916   // Check if the threads array is large enough, or needs expanding.
917   // See comment in __kmp_register_root() about the adjustment if
918   // __kmp_threads[0] == NULL.
919   capacity = __kmp_threads_capacity;
920   if (TCR_PTR(__kmp_threads[0]) == NULL) {
921     --capacity;
922   }
923   // If it is not for initializing the hidden helper team, we need to take
924   // __kmp_hidden_helper_threads_num out of the capacity because it is included
925   // in __kmp_threads_capacity.
926   if (__kmp_enable_hidden_helper && !TCR_4(__kmp_init_hidden_helper_threads)) {
927     capacity -= __kmp_hidden_helper_threads_num;
928   }
929   if (__kmp_nth + new_nthreads -
930           (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
931       capacity) {
932     // Expand the threads array.
933     int slotsRequired = __kmp_nth + new_nthreads -
934                         (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) -
935                         capacity;
936     int slotsAdded = __kmp_expand_threads(slotsRequired);
937     if (slotsAdded < slotsRequired) {
938       // The threads array was not expanded enough.
939       new_nthreads -= (slotsRequired - slotsAdded);
940       KMP_ASSERT(new_nthreads >= 1);
941 
942       // If dyn-var is false, emit a 1-time warning.
943       if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
944         __kmp_reserve_warn = 1;
945         if (__kmp_tp_cached) {
946           __kmp_msg(kmp_ms_warning,
947                     KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads),
948                     KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity),
949                     KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null);
950         } else {
951           __kmp_msg(kmp_ms_warning,
952                     KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads),
953                     KMP_HNT(SystemLimitOnThreads), __kmp_msg_null);
954         }
955       }
956     }
957   }
958 
959 #ifdef KMP_DEBUG
960   if (new_nthreads == 1) {
961     KC_TRACE(10,
962              ("__kmp_reserve_threads: T#%d serializing team after reclaiming "
963               "dead roots and rechecking; requested %d threads\n",
964               __kmp_get_gtid(), set_nthreads));
965   } else {
966     KC_TRACE(10, ("__kmp_reserve_threads: T#%d allocating %d threads; requested"
967                   " %d threads\n",
968                   __kmp_get_gtid(), new_nthreads, set_nthreads));
969   }
970 #endif // KMP_DEBUG
971   return new_nthreads;
972 }
973 
974 /* Allocate threads from the thread pool and assign them to the new team. We are
975    assured that there are enough threads available, because we checked on that
976    earlier within critical section forkjoin */
977 static void __kmp_fork_team_threads(kmp_root_t *root, kmp_team_t *team,
978                                     kmp_info_t *master_th, int master_gtid) {
979   int i;
980   int use_hot_team;
981 
982   KA_TRACE(10, ("__kmp_fork_team_threads: new_nprocs = %d\n", team->t.t_nproc));
983   KMP_DEBUG_ASSERT(master_gtid == __kmp_get_gtid());
984   KMP_MB();
985 
986   /* first, let's setup the master thread */
987   master_th->th.th_info.ds.ds_tid = 0;
988   master_th->th.th_team = team;
989   master_th->th.th_team_nproc = team->t.t_nproc;
990   master_th->th.th_team_master = master_th;
991   master_th->th.th_team_serialized = FALSE;
992   master_th->th.th_dispatch = &team->t.t_dispatch[0];
993 
994 /* make sure we are not the optimized hot team */
995 #if KMP_NESTED_HOT_TEAMS
996   use_hot_team = 0;
997   kmp_hot_team_ptr_t *hot_teams = master_th->th.th_hot_teams;
998   if (hot_teams) { // hot teams array is not allocated if
999     // KMP_HOT_TEAMS_MAX_LEVEL=0
1000     int level = team->t.t_active_level - 1; // index in array of hot teams
1001     if (master_th->th.th_teams_microtask) { // are we inside the teams?
1002       if (master_th->th.th_teams_size.nteams > 1) {
1003         ++level; // level was not increased in teams construct for
1004         // team_of_masters
1005       }
1006       if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
1007           master_th->th.th_teams_level == team->t.t_level) {
1008         ++level; // level was not increased in teams construct for
1009         // team_of_workers before the parallel
1010       } // team->t.t_level will be increased inside parallel
1011     }
1012     if (level < __kmp_hot_teams_max_level) {
1013       if (hot_teams[level].hot_team) {
1014         // hot team has already been allocated for given level
1015         KMP_DEBUG_ASSERT(hot_teams[level].hot_team == team);
1016         use_hot_team = 1; // the team is ready to use
1017       } else {
1018         use_hot_team = 0; // AC: threads are not allocated yet
1019         hot_teams[level].hot_team = team; // remember new hot team
1020         hot_teams[level].hot_team_nth = team->t.t_nproc;
1021       }
1022     } else {
1023       use_hot_team = 0;
1024     }
1025   }
1026 #else
1027   use_hot_team = team == root->r.r_hot_team;
1028 #endif
1029   if (!use_hot_team) {
1030 
1031     /* install the master thread */
1032     team->t.t_threads[0] = master_th;
1033     __kmp_initialize_info(master_th, team, 0, master_gtid);
1034 
1035     /* now, install the worker threads */
1036     for (i = 1; i < team->t.t_nproc; i++) {
1037 
1038       /* fork or reallocate a new thread and install it in team */
1039       kmp_info_t *thr = __kmp_allocate_thread(root, team, i);
1040       team->t.t_threads[i] = thr;
1041       KMP_DEBUG_ASSERT(thr);
1042       KMP_DEBUG_ASSERT(thr->th.th_team == team);
1043       /* align team and thread arrived states */
1044       KA_TRACE(20, ("__kmp_fork_team_threads: T#%d(%d:%d) init arrived "
1045                     "T#%d(%d:%d) join =%llu, plain=%llu\n",
1046                     __kmp_gtid_from_tid(0, team), team->t.t_id, 0,
1047                     __kmp_gtid_from_tid(i, team), team->t.t_id, i,
1048                     team->t.t_bar[bs_forkjoin_barrier].b_arrived,
1049                     team->t.t_bar[bs_plain_barrier].b_arrived));
1050       thr->th.th_teams_microtask = master_th->th.th_teams_microtask;
1051       thr->th.th_teams_level = master_th->th.th_teams_level;
1052       thr->th.th_teams_size = master_th->th.th_teams_size;
1053       { // Initialize threads' barrier data.
1054         int b;
1055         kmp_balign_t *balign = team->t.t_threads[i]->th.th_bar;
1056         for (b = 0; b < bs_last_barrier; ++b) {
1057           balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
1058           KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
1059 #if USE_DEBUGGER
1060           balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
1061 #endif
1062         }
1063       }
1064     }
1065 
1066 #if KMP_AFFINITY_SUPPORTED
1067     __kmp_partition_places(team);
1068 #endif
1069   }
1070 
1071   if (__kmp_display_affinity && team->t.t_display_affinity != 1) {
1072     for (i = 0; i < team->t.t_nproc; i++) {
1073       kmp_info_t *thr = team->t.t_threads[i];
1074       if (thr->th.th_prev_num_threads != team->t.t_nproc ||
1075           thr->th.th_prev_level != team->t.t_level) {
1076         team->t.t_display_affinity = 1;
1077         break;
1078       }
1079     }
1080   }
1081 
1082   KMP_MB();
1083 }
1084 
1085 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
1086 // Propagate any changes to the floating point control registers out to the team
1087 // We try to avoid unnecessary writes to the relevant cache line in the team
1088 // structure, so we don't make changes unless they are needed.
1089 inline static void propagateFPControl(kmp_team_t *team) {
1090   if (__kmp_inherit_fp_control) {
1091     kmp_int16 x87_fpu_control_word;
1092     kmp_uint32 mxcsr;
1093 
1094     // Get master values of FPU control flags (both X87 and vector)
1095     __kmp_store_x87_fpu_control_word(&x87_fpu_control_word);
1096     __kmp_store_mxcsr(&mxcsr);
1097     mxcsr &= KMP_X86_MXCSR_MASK;
1098 
1099     // There is no point looking at t_fp_control_saved here.
1100     // If it is TRUE, we still have to update the values if they are different
1101     // from those we now have. If it is FALSE we didn't save anything yet, but
1102     // our objective is the same. We have to ensure that the values in the team
1103     // are the same as those we have.
1104     // So, this code achieves what we need whether or not t_fp_control_saved is
1105     // true. By checking whether the value needs updating we avoid unnecessary
1106     // writes that would put the cache-line into a written state, causing all
1107     // threads in the team to have to read it again.
1108     KMP_CHECK_UPDATE(team->t.t_x87_fpu_control_word, x87_fpu_control_word);
1109     KMP_CHECK_UPDATE(team->t.t_mxcsr, mxcsr);
1110     // Although we don't use this value, other code in the runtime wants to know
1111     // whether it should restore them. So we must ensure it is correct.
1112     KMP_CHECK_UPDATE(team->t.t_fp_control_saved, TRUE);
1113   } else {
1114     // Similarly here. Don't write to this cache-line in the team structure
1115     // unless we have to.
1116     KMP_CHECK_UPDATE(team->t.t_fp_control_saved, FALSE);
1117   }
1118 }
1119 
1120 // Do the opposite, setting the hardware registers to the updated values from
1121 // the team.
1122 inline static void updateHWFPControl(kmp_team_t *team) {
1123   if (__kmp_inherit_fp_control && team->t.t_fp_control_saved) {
1124     // Only reset the fp control regs if they have been changed in the team.
1125     // the parallel region that we are exiting.
1126     kmp_int16 x87_fpu_control_word;
1127     kmp_uint32 mxcsr;
1128     __kmp_store_x87_fpu_control_word(&x87_fpu_control_word);
1129     __kmp_store_mxcsr(&mxcsr);
1130     mxcsr &= KMP_X86_MXCSR_MASK;
1131 
1132     if (team->t.t_x87_fpu_control_word != x87_fpu_control_word) {
1133       __kmp_clear_x87_fpu_status_word();
1134       __kmp_load_x87_fpu_control_word(&team->t.t_x87_fpu_control_word);
1135     }
1136 
1137     if (team->t.t_mxcsr != mxcsr) {
1138       __kmp_load_mxcsr(&team->t.t_mxcsr);
1139     }
1140   }
1141 }
1142 #else
1143 #define propagateFPControl(x) ((void)0)
1144 #define updateHWFPControl(x) ((void)0)
1145 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
1146 
1147 static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team,
1148                                      int realloc); // forward declaration
1149 
1150 /* Run a parallel region that has been serialized, so runs only in a team of the
1151    single master thread. */
1152 void __kmp_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
1153   kmp_info_t *this_thr;
1154   kmp_team_t *serial_team;
1155 
1156   KC_TRACE(10, ("__kmpc_serialized_parallel: called by T#%d\n", global_tid));
1157 
1158   /* Skip all this code for autopar serialized loops since it results in
1159      unacceptable overhead */
1160   if (loc != NULL && (loc->flags & KMP_IDENT_AUTOPAR))
1161     return;
1162 
1163   if (!TCR_4(__kmp_init_parallel))
1164     __kmp_parallel_initialize();
1165   __kmp_resume_if_soft_paused();
1166 
1167   this_thr = __kmp_threads[global_tid];
1168   serial_team = this_thr->th.th_serial_team;
1169 
1170   /* utilize the serialized team held by this thread */
1171   KMP_DEBUG_ASSERT(serial_team);
1172   KMP_MB();
1173 
1174   if (__kmp_tasking_mode != tskm_immediate_exec) {
1175     KMP_DEBUG_ASSERT(
1176         this_thr->th.th_task_team ==
1177         this_thr->th.th_team->t.t_task_team[this_thr->th.th_task_state]);
1178     KMP_DEBUG_ASSERT(serial_team->t.t_task_team[this_thr->th.th_task_state] ==
1179                      NULL);
1180     KA_TRACE(20, ("__kmpc_serialized_parallel: T#%d pushing task_team %p / "
1181                   "team %p, new task_team = NULL\n",
1182                   global_tid, this_thr->th.th_task_team, this_thr->th.th_team));
1183     this_thr->th.th_task_team = NULL;
1184   }
1185 
1186   kmp_proc_bind_t proc_bind = this_thr->th.th_set_proc_bind;
1187   if (this_thr->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
1188     proc_bind = proc_bind_false;
1189   } else if (proc_bind == proc_bind_default) {
1190     // No proc_bind clause was specified, so use the current value
1191     // of proc-bind-var for this parallel region.
1192     proc_bind = this_thr->th.th_current_task->td_icvs.proc_bind;
1193   }
1194   // Reset for next parallel region
1195   this_thr->th.th_set_proc_bind = proc_bind_default;
1196 
1197 #if OMPT_SUPPORT
1198   ompt_data_t ompt_parallel_data = ompt_data_none;
1199   ompt_data_t *implicit_task_data;
1200   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(global_tid);
1201   if (ompt_enabled.enabled &&
1202       this_thr->th.ompt_thread_info.state != ompt_state_overhead) {
1203 
1204     ompt_task_info_t *parent_task_info;
1205     parent_task_info = OMPT_CUR_TASK_INFO(this_thr);
1206 
1207     parent_task_info->frame.enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1208     if (ompt_enabled.ompt_callback_parallel_begin) {
1209       int team_size = 1;
1210 
1211       ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)(
1212           &(parent_task_info->task_data), &(parent_task_info->frame),
1213           &ompt_parallel_data, team_size,
1214           ompt_parallel_invoker_program | ompt_parallel_team, codeptr);
1215     }
1216   }
1217 #endif // OMPT_SUPPORT
1218 
1219   if (this_thr->th.th_team != serial_team) {
1220     // Nested level will be an index in the nested nthreads array
1221     int level = this_thr->th.th_team->t.t_level;
1222 
1223     if (serial_team->t.t_serialized) {
1224       /* this serial team was already used
1225          TODO increase performance by making this locks more specific */
1226       kmp_team_t *new_team;
1227 
1228       __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
1229 
1230       new_team =
1231           __kmp_allocate_team(this_thr->th.th_root, 1, 1,
1232 #if OMPT_SUPPORT
1233                               ompt_parallel_data,
1234 #endif
1235                               proc_bind, &this_thr->th.th_current_task->td_icvs,
1236                               0 USE_NESTED_HOT_ARG(NULL));
1237       __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
1238       KMP_ASSERT(new_team);
1239 
1240       /* setup new serialized team and install it */
1241       new_team->t.t_threads[0] = this_thr;
1242       new_team->t.t_parent = this_thr->th.th_team;
1243       serial_team = new_team;
1244       this_thr->th.th_serial_team = serial_team;
1245 
1246       KF_TRACE(
1247           10,
1248           ("__kmpc_serialized_parallel: T#%d allocated new serial team %p\n",
1249            global_tid, serial_team));
1250 
1251       /* TODO the above breaks the requirement that if we run out of resources,
1252          then we can still guarantee that serialized teams are ok, since we may
1253          need to allocate a new one */
1254     } else {
1255       KF_TRACE(
1256           10,
1257           ("__kmpc_serialized_parallel: T#%d reusing cached serial team %p\n",
1258            global_tid, serial_team));
1259     }
1260 
1261     /* we have to initialize this serial team */
1262     KMP_DEBUG_ASSERT(serial_team->t.t_threads);
1263     KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
1264     KMP_DEBUG_ASSERT(this_thr->th.th_team != serial_team);
1265     serial_team->t.t_ident = loc;
1266     serial_team->t.t_serialized = 1;
1267     serial_team->t.t_nproc = 1;
1268     serial_team->t.t_parent = this_thr->th.th_team;
1269     serial_team->t.t_sched.sched = this_thr->th.th_team->t.t_sched.sched;
1270     this_thr->th.th_team = serial_team;
1271     serial_team->t.t_master_tid = this_thr->th.th_info.ds.ds_tid;
1272 
1273     KF_TRACE(10, ("__kmpc_serialized_parallel: T#d curtask=%p\n", global_tid,
1274                   this_thr->th.th_current_task));
1275     KMP_ASSERT(this_thr->th.th_current_task->td_flags.executing == 1);
1276     this_thr->th.th_current_task->td_flags.executing = 0;
1277 
1278     __kmp_push_current_task_to_thread(this_thr, serial_team, 0);
1279 
1280     /* TODO: GEH: do ICVs work for nested serialized teams? Don't we need an
1281        implicit task for each serialized task represented by
1282        team->t.t_serialized? */
1283     copy_icvs(&this_thr->th.th_current_task->td_icvs,
1284               &this_thr->th.th_current_task->td_parent->td_icvs);
1285 
1286     // Thread value exists in the nested nthreads array for the next nested
1287     // level
1288     if (__kmp_nested_nth.used && (level + 1 < __kmp_nested_nth.used)) {
1289       this_thr->th.th_current_task->td_icvs.nproc =
1290           __kmp_nested_nth.nth[level + 1];
1291     }
1292 
1293     if (__kmp_nested_proc_bind.used &&
1294         (level + 1 < __kmp_nested_proc_bind.used)) {
1295       this_thr->th.th_current_task->td_icvs.proc_bind =
1296           __kmp_nested_proc_bind.bind_types[level + 1];
1297     }
1298 
1299 #if USE_DEBUGGER
1300     serial_team->t.t_pkfn = (microtask_t)(~0); // For the debugger.
1301 #endif
1302     this_thr->th.th_info.ds.ds_tid = 0;
1303 
1304     /* set thread cache values */
1305     this_thr->th.th_team_nproc = 1;
1306     this_thr->th.th_team_master = this_thr;
1307     this_thr->th.th_team_serialized = 1;
1308 
1309     serial_team->t.t_level = serial_team->t.t_parent->t.t_level + 1;
1310     serial_team->t.t_active_level = serial_team->t.t_parent->t.t_active_level;
1311     serial_team->t.t_def_allocator = this_thr->th.th_def_allocator; // save
1312 
1313     propagateFPControl(serial_team);
1314 
1315     /* check if we need to allocate dispatch buffers stack */
1316     KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
1317     if (!serial_team->t.t_dispatch->th_disp_buffer) {
1318       serial_team->t.t_dispatch->th_disp_buffer =
1319           (dispatch_private_info_t *)__kmp_allocate(
1320               sizeof(dispatch_private_info_t));
1321     }
1322     this_thr->th.th_dispatch = serial_team->t.t_dispatch;
1323 
1324     KMP_MB();
1325 
1326   } else {
1327     /* this serialized team is already being used,
1328      * that's fine, just add another nested level */
1329     KMP_DEBUG_ASSERT(this_thr->th.th_team == serial_team);
1330     KMP_DEBUG_ASSERT(serial_team->t.t_threads);
1331     KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
1332     ++serial_team->t.t_serialized;
1333     this_thr->th.th_team_serialized = serial_team->t.t_serialized;
1334 
1335     // Nested level will be an index in the nested nthreads array
1336     int level = this_thr->th.th_team->t.t_level;
1337     // Thread value exists in the nested nthreads array for the next nested
1338     // level
1339     if (__kmp_nested_nth.used && (level + 1 < __kmp_nested_nth.used)) {
1340       this_thr->th.th_current_task->td_icvs.nproc =
1341           __kmp_nested_nth.nth[level + 1];
1342     }
1343     serial_team->t.t_level++;
1344     KF_TRACE(10, ("__kmpc_serialized_parallel: T#%d increasing nesting level "
1345                   "of serial team %p to %d\n",
1346                   global_tid, serial_team, serial_team->t.t_level));
1347 
1348     /* allocate/push dispatch buffers stack */
1349     KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
1350     {
1351       dispatch_private_info_t *disp_buffer =
1352           (dispatch_private_info_t *)__kmp_allocate(
1353               sizeof(dispatch_private_info_t));
1354       disp_buffer->next = serial_team->t.t_dispatch->th_disp_buffer;
1355       serial_team->t.t_dispatch->th_disp_buffer = disp_buffer;
1356     }
1357     this_thr->th.th_dispatch = serial_team->t.t_dispatch;
1358 
1359     KMP_MB();
1360   }
1361   KMP_CHECK_UPDATE(serial_team->t.t_cancel_request, cancel_noreq);
1362 
1363   // Perform the display affinity functionality for
1364   // serialized parallel regions
1365   if (__kmp_display_affinity) {
1366     if (this_thr->th.th_prev_level != serial_team->t.t_level ||
1367         this_thr->th.th_prev_num_threads != 1) {
1368       // NULL means use the affinity-format-var ICV
1369       __kmp_aux_display_affinity(global_tid, NULL);
1370       this_thr->th.th_prev_level = serial_team->t.t_level;
1371       this_thr->th.th_prev_num_threads = 1;
1372     }
1373   }
1374 
1375   if (__kmp_env_consistency_check)
1376     __kmp_push_parallel(global_tid, NULL);
1377 #if OMPT_SUPPORT
1378   serial_team->t.ompt_team_info.master_return_address = codeptr;
1379   if (ompt_enabled.enabled &&
1380       this_thr->th.ompt_thread_info.state != ompt_state_overhead) {
1381     OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1382 
1383     ompt_lw_taskteam_t lw_taskteam;
1384     __ompt_lw_taskteam_init(&lw_taskteam, this_thr, global_tid,
1385                             &ompt_parallel_data, codeptr);
1386 
1387     __ompt_lw_taskteam_link(&lw_taskteam, this_thr, 1);
1388     // don't use lw_taskteam after linking. content was swaped
1389 
1390     /* OMPT implicit task begin */
1391     implicit_task_data = OMPT_CUR_TASK_DATA(this_thr);
1392     if (ompt_enabled.ompt_callback_implicit_task) {
1393       ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1394           ompt_scope_begin, OMPT_CUR_TEAM_DATA(this_thr),
1395           OMPT_CUR_TASK_DATA(this_thr), 1, __kmp_tid_from_gtid(global_tid), ompt_task_implicit); // TODO: Can this be ompt_task_initial?
1396       OMPT_CUR_TASK_INFO(this_thr)
1397           ->thread_num = __kmp_tid_from_gtid(global_tid);
1398     }
1399 
1400     /* OMPT state */
1401     this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
1402     OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1403   }
1404 #endif
1405 }
1406 
1407 /* most of the work for a fork */
1408 /* return true if we really went parallel, false if serialized */
1409 int __kmp_fork_call(ident_t *loc, int gtid,
1410                     enum fork_context_e call_context, // Intel, GNU, ...
1411                     kmp_int32 argc, microtask_t microtask, launch_t invoker,
1412                     kmp_va_list ap) {
1413   void **argv;
1414   int i;
1415   int master_tid;
1416   int master_this_cons;
1417   kmp_team_t *team;
1418   kmp_team_t *parent_team;
1419   kmp_info_t *master_th;
1420   kmp_root_t *root;
1421   int nthreads;
1422   int master_active;
1423   int master_set_numthreads;
1424   int level;
1425   int active_level;
1426   int teams_level;
1427 #if KMP_NESTED_HOT_TEAMS
1428   kmp_hot_team_ptr_t **p_hot_teams;
1429 #endif
1430   { // KMP_TIME_BLOCK
1431     KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_fork_call);
1432     KMP_COUNT_VALUE(OMP_PARALLEL_args, argc);
1433 
1434     KA_TRACE(20, ("__kmp_fork_call: enter T#%d\n", gtid));
1435     if (__kmp_stkpadding > 0 && __kmp_root[gtid] != NULL) {
1436       /* Some systems prefer the stack for the root thread(s) to start with */
1437       /* some gap from the parent stack to prevent false sharing. */
1438       void *dummy = KMP_ALLOCA(__kmp_stkpadding);
1439       /* These 2 lines below are so this does not get optimized out */
1440       if (__kmp_stkpadding > KMP_MAX_STKPADDING)
1441         __kmp_stkpadding += (short)((kmp_int64)dummy);
1442     }
1443 
1444     /* initialize if needed */
1445     KMP_DEBUG_ASSERT(
1446         __kmp_init_serial); // AC: potentially unsafe, not in sync with shutdown
1447     if (!TCR_4(__kmp_init_parallel))
1448       __kmp_parallel_initialize();
1449     __kmp_resume_if_soft_paused();
1450 
1451     /* setup current data */
1452     master_th = __kmp_threads[gtid]; // AC: potentially unsafe, not in sync with
1453     // shutdown
1454     parent_team = master_th->th.th_team;
1455     master_tid = master_th->th.th_info.ds.ds_tid;
1456     master_this_cons = master_th->th.th_local.this_construct;
1457     root = master_th->th.th_root;
1458     master_active = root->r.r_active;
1459     master_set_numthreads = master_th->th.th_set_nproc;
1460 
1461 #if OMPT_SUPPORT
1462     ompt_data_t ompt_parallel_data = ompt_data_none;
1463     ompt_data_t *parent_task_data;
1464     ompt_frame_t *ompt_frame;
1465     ompt_data_t *implicit_task_data;
1466     void *return_address = NULL;
1467 
1468     if (ompt_enabled.enabled) {
1469       __ompt_get_task_info_internal(0, NULL, &parent_task_data, &ompt_frame,
1470                                     NULL, NULL);
1471       return_address = OMPT_LOAD_RETURN_ADDRESS(gtid);
1472     }
1473 #endif
1474 
1475     // Nested level will be an index in the nested nthreads array
1476     level = parent_team->t.t_level;
1477     // used to launch non-serial teams even if nested is not allowed
1478     active_level = parent_team->t.t_active_level;
1479     // needed to check nesting inside the teams
1480     teams_level = master_th->th.th_teams_level;
1481 #if KMP_NESTED_HOT_TEAMS
1482     p_hot_teams = &master_th->th.th_hot_teams;
1483     if (*p_hot_teams == NULL && __kmp_hot_teams_max_level > 0) {
1484       *p_hot_teams = (kmp_hot_team_ptr_t *)__kmp_allocate(
1485           sizeof(kmp_hot_team_ptr_t) * __kmp_hot_teams_max_level);
1486       (*p_hot_teams)[0].hot_team = root->r.r_hot_team;
1487       // it is either actual or not needed (when active_level > 0)
1488       (*p_hot_teams)[0].hot_team_nth = 1;
1489     }
1490 #endif
1491 
1492 #if OMPT_SUPPORT
1493     if (ompt_enabled.enabled) {
1494       if (ompt_enabled.ompt_callback_parallel_begin) {
1495         int team_size = master_set_numthreads
1496                             ? master_set_numthreads
1497                             : get__nproc_2(parent_team, master_tid);
1498         int flags = OMPT_INVOKER(call_context) |
1499                     ((microtask == (microtask_t)__kmp_teams_master)
1500                          ? ompt_parallel_league
1501                          : ompt_parallel_team);
1502         ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)(
1503             parent_task_data, ompt_frame, &ompt_parallel_data, team_size, flags,
1504             return_address);
1505       }
1506       master_th->th.ompt_thread_info.state = ompt_state_overhead;
1507     }
1508 #endif
1509 
1510     master_th->th.th_ident = loc;
1511 
1512     if (master_th->th.th_teams_microtask && ap &&
1513         microtask != (microtask_t)__kmp_teams_master && level == teams_level) {
1514       // AC: This is start of parallel that is nested inside teams construct.
1515       // The team is actual (hot), all workers are ready at the fork barrier.
1516       // No lock needed to initialize the team a bit, then free workers.
1517       parent_team->t.t_ident = loc;
1518       __kmp_alloc_argv_entries(argc, parent_team, TRUE);
1519       parent_team->t.t_argc = argc;
1520       argv = (void **)parent_team->t.t_argv;
1521       for (i = argc - 1; i >= 0; --i)
1522         *argv++ = va_arg(kmp_va_deref(ap), void *);
1523       // Increment our nested depth levels, but not increase the serialization
1524       if (parent_team == master_th->th.th_serial_team) {
1525         // AC: we are in serialized parallel
1526         __kmpc_serialized_parallel(loc, gtid);
1527         KMP_DEBUG_ASSERT(parent_team->t.t_serialized > 1);
1528 
1529         if (call_context == fork_context_gnu) {
1530           // AC: need to decrement t_serialized for enquiry functions to work
1531           // correctly, will restore at join time
1532           parent_team->t.t_serialized--;
1533           return TRUE;
1534         }
1535 
1536 #if OMPT_SUPPORT
1537         void *dummy;
1538         void **exit_frame_p;
1539 
1540         ompt_lw_taskteam_t lw_taskteam;
1541 
1542         if (ompt_enabled.enabled) {
1543           __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1544                                   &ompt_parallel_data, return_address);
1545           exit_frame_p = &(lw_taskteam.ompt_task_info.frame.exit_frame.ptr);
1546 
1547           __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
1548           // don't use lw_taskteam after linking. content was swaped
1549 
1550           /* OMPT implicit task begin */
1551           implicit_task_data = OMPT_CUR_TASK_DATA(master_th);
1552           if (ompt_enabled.ompt_callback_implicit_task) {
1553             OMPT_CUR_TASK_INFO(master_th)
1554                 ->thread_num = __kmp_tid_from_gtid(gtid);
1555             ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1556                 ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1557                 implicit_task_data, 1,
1558                 OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
1559           }
1560 
1561           /* OMPT state */
1562           master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1563         } else {
1564           exit_frame_p = &dummy;
1565         }
1566 #endif
1567         // AC: need to decrement t_serialized for enquiry functions to work
1568         // correctly, will restore at join time
1569         parent_team->t.t_serialized--;
1570 
1571         {
1572           KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1573           KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1574           __kmp_invoke_microtask(microtask, gtid, 0, argc, parent_team->t.t_argv
1575 #if OMPT_SUPPORT
1576                                  ,
1577                                  exit_frame_p
1578 #endif
1579                                  );
1580         }
1581 
1582 #if OMPT_SUPPORT
1583         if (ompt_enabled.enabled) {
1584           *exit_frame_p = NULL;
1585           OMPT_CUR_TASK_INFO(master_th)->frame.exit_frame = ompt_data_none;
1586           if (ompt_enabled.ompt_callback_implicit_task) {
1587             ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1588                 ompt_scope_end, NULL, implicit_task_data, 1,
1589                 OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
1590           }
1591           ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
1592           __ompt_lw_taskteam_unlink(master_th);
1593           if (ompt_enabled.ompt_callback_parallel_end) {
1594             ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1595                 &ompt_parallel_data, OMPT_CUR_TASK_DATA(master_th),
1596                 OMPT_INVOKER(call_context) | ompt_parallel_team,
1597                 return_address);
1598           }
1599           master_th->th.ompt_thread_info.state = ompt_state_overhead;
1600         }
1601 #endif
1602         return TRUE;
1603       }
1604 
1605       parent_team->t.t_pkfn = microtask;
1606       parent_team->t.t_invoke = invoker;
1607       KMP_ATOMIC_INC(&root->r.r_in_parallel);
1608       parent_team->t.t_active_level++;
1609       parent_team->t.t_level++;
1610       parent_team->t.t_def_allocator = master_th->th.th_def_allocator; // save
1611 
1612 #if OMPT_SUPPORT
1613       if (ompt_enabled.enabled) {
1614         ompt_lw_taskteam_t lw_taskteam;
1615         __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1616                                 &ompt_parallel_data, return_address);
1617         __ompt_lw_taskteam_link(&lw_taskteam, master_th, 1, true);
1618       }
1619 #endif
1620 
1621       /* Change number of threads in the team if requested */
1622       if (master_set_numthreads) { // The parallel has num_threads clause
1623         if (master_set_numthreads < master_th->th.th_teams_size.nth) {
1624           // AC: only can reduce number of threads dynamically, can't increase
1625           kmp_info_t **other_threads = parent_team->t.t_threads;
1626           parent_team->t.t_nproc = master_set_numthreads;
1627           for (i = 0; i < master_set_numthreads; ++i) {
1628             other_threads[i]->th.th_team_nproc = master_set_numthreads;
1629           }
1630           // Keep extra threads hot in the team for possible next parallels
1631         }
1632         master_th->th.th_set_nproc = 0;
1633       }
1634 
1635 #if USE_DEBUGGER
1636       if (__kmp_debugging) { // Let debugger override number of threads.
1637         int nth = __kmp_omp_num_threads(loc);
1638         if (nth > 0) { // 0 means debugger doesn't want to change num threads
1639           master_set_numthreads = nth;
1640         }
1641       }
1642 #endif
1643 
1644 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1645       if (((__itt_frame_submit_v3_ptr && __itt_get_timestamp_ptr) ||
1646            KMP_ITT_DEBUG) &&
1647           __kmp_forkjoin_frames_mode == 3 &&
1648           parent_team->t.t_active_level == 1 // only report frames at level 1
1649           && master_th->th.th_teams_size.nteams == 1) {
1650         kmp_uint64 tmp_time = __itt_get_timestamp();
1651         master_th->th.th_frame_time = tmp_time;
1652         parent_team->t.t_region_time = tmp_time;
1653       }
1654       if (__itt_stack_caller_create_ptr) {
1655         // create new stack stitching id before entering fork barrier
1656         parent_team->t.t_stack_id = __kmp_itt_stack_caller_create();
1657       }
1658 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
1659 
1660       KF_TRACE(10, ("__kmp_fork_call: before internal fork: root=%p, team=%p, "
1661                     "master_th=%p, gtid=%d\n",
1662                     root, parent_team, master_th, gtid));
1663       __kmp_internal_fork(loc, gtid, parent_team);
1664       KF_TRACE(10, ("__kmp_fork_call: after internal fork: root=%p, team=%p, "
1665                     "master_th=%p, gtid=%d\n",
1666                     root, parent_team, master_th, gtid));
1667 
1668       if (call_context == fork_context_gnu)
1669         return TRUE;
1670 
1671       /* Invoke microtask for MASTER thread */
1672       KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n", gtid,
1673                     parent_team->t.t_id, parent_team->t.t_pkfn));
1674 
1675       if (!parent_team->t.t_invoke(gtid)) {
1676         KMP_ASSERT2(0, "cannot invoke microtask for MASTER thread");
1677       }
1678       KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n", gtid,
1679                     parent_team->t.t_id, parent_team->t.t_pkfn));
1680       KMP_MB(); /* Flush all pending memory write invalidates.  */
1681 
1682       KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
1683 
1684       return TRUE;
1685     } // Parallel closely nested in teams construct
1686 
1687 #if KMP_DEBUG
1688     if (__kmp_tasking_mode != tskm_immediate_exec) {
1689       KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
1690                        parent_team->t.t_task_team[master_th->th.th_task_state]);
1691     }
1692 #endif
1693 
1694     if (parent_team->t.t_active_level >=
1695         master_th->th.th_current_task->td_icvs.max_active_levels) {
1696       nthreads = 1;
1697     } else {
1698       int enter_teams = ((ap == NULL && active_level == 0) ||
1699                          (ap && teams_level > 0 && teams_level == level));
1700       nthreads =
1701           master_set_numthreads
1702               ? master_set_numthreads
1703               : get__nproc_2(
1704                     parent_team,
1705                     master_tid); // TODO: get nproc directly from current task
1706 
1707       // Check if we need to take forkjoin lock? (no need for serialized
1708       // parallel out of teams construct). This code moved here from
1709       // __kmp_reserve_threads() to speedup nested serialized parallels.
1710       if (nthreads > 1) {
1711         if ((get__max_active_levels(master_th) == 1 &&
1712              (root->r.r_in_parallel && !enter_teams)) ||
1713             (__kmp_library == library_serial)) {
1714           KC_TRACE(10, ("__kmp_fork_call: T#%d serializing team; requested %d"
1715                         " threads\n",
1716                         gtid, nthreads));
1717           nthreads = 1;
1718         }
1719       }
1720       if (nthreads > 1) {
1721         /* determine how many new threads we can use */
1722         __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
1723         /* AC: If we execute teams from parallel region (on host), then teams
1724            should be created but each can only have 1 thread if nesting is
1725            disabled. If teams called from serial region, then teams and their
1726            threads should be created regardless of the nesting setting. */
1727         nthreads = __kmp_reserve_threads(root, parent_team, master_tid,
1728                                          nthreads, enter_teams);
1729         if (nthreads == 1) {
1730           // Free lock for single thread execution here; for multi-thread
1731           // execution it will be freed later after team of threads created
1732           // and initialized
1733           __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
1734         }
1735       }
1736     }
1737     KMP_DEBUG_ASSERT(nthreads > 0);
1738 
1739     // If we temporarily changed the set number of threads then restore it now
1740     master_th->th.th_set_nproc = 0;
1741 
1742     /* create a serialized parallel region? */
1743     if (nthreads == 1) {
1744 /* josh todo: hypothetical question: what do we do for OS X*? */
1745 #if KMP_OS_LINUX &&                                                            \
1746     (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
1747       void *args[argc];
1748 #else
1749       void **args = (void **)KMP_ALLOCA(argc * sizeof(void *));
1750 #endif /* KMP_OS_LINUX && ( KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || \
1751           KMP_ARCH_AARCH64) */
1752 
1753       KA_TRACE(20,
1754                ("__kmp_fork_call: T#%d serializing parallel region\n", gtid));
1755 
1756       __kmpc_serialized_parallel(loc, gtid);
1757 
1758       if (call_context == fork_context_intel) {
1759         /* TODO this sucks, use the compiler itself to pass args! :) */
1760         master_th->th.th_serial_team->t.t_ident = loc;
1761         if (!ap) {
1762           // revert change made in __kmpc_serialized_parallel()
1763           master_th->th.th_serial_team->t.t_level--;
1764 // Get args from parent team for teams construct
1765 
1766 #if OMPT_SUPPORT
1767           void *dummy;
1768           void **exit_frame_p;
1769           ompt_task_info_t *task_info;
1770 
1771           ompt_lw_taskteam_t lw_taskteam;
1772 
1773           if (ompt_enabled.enabled) {
1774             __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1775                                     &ompt_parallel_data, return_address);
1776 
1777             __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
1778             // don't use lw_taskteam after linking. content was swaped
1779 
1780             task_info = OMPT_CUR_TASK_INFO(master_th);
1781             exit_frame_p = &(task_info->frame.exit_frame.ptr);
1782             if (ompt_enabled.ompt_callback_implicit_task) {
1783               OMPT_CUR_TASK_INFO(master_th)
1784                   ->thread_num = __kmp_tid_from_gtid(gtid);
1785               ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1786                   ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1787                   &(task_info->task_data), 1,
1788                   OMPT_CUR_TASK_INFO(master_th)->thread_num,
1789                   ompt_task_implicit);
1790             }
1791 
1792             /* OMPT state */
1793             master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1794           } else {
1795             exit_frame_p = &dummy;
1796           }
1797 #endif
1798 
1799           {
1800             KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1801             KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1802             __kmp_invoke_microtask(microtask, gtid, 0, argc,
1803                                    parent_team->t.t_argv
1804 #if OMPT_SUPPORT
1805                                    ,
1806                                    exit_frame_p
1807 #endif
1808                                    );
1809           }
1810 
1811 #if OMPT_SUPPORT
1812           if (ompt_enabled.enabled) {
1813             *exit_frame_p = NULL;
1814             if (ompt_enabled.ompt_callback_implicit_task) {
1815               ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1816                   ompt_scope_end, NULL, &(task_info->task_data), 1,
1817                   OMPT_CUR_TASK_INFO(master_th)->thread_num,
1818                   ompt_task_implicit);
1819             }
1820             ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
1821             __ompt_lw_taskteam_unlink(master_th);
1822             if (ompt_enabled.ompt_callback_parallel_end) {
1823               ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1824                   &ompt_parallel_data, parent_task_data,
1825                   OMPT_INVOKER(call_context) | ompt_parallel_team,
1826                   return_address);
1827             }
1828             master_th->th.ompt_thread_info.state = ompt_state_overhead;
1829           }
1830 #endif
1831         } else if (microtask == (microtask_t)__kmp_teams_master) {
1832           KMP_DEBUG_ASSERT(master_th->th.th_team ==
1833                            master_th->th.th_serial_team);
1834           team = master_th->th.th_team;
1835           // team->t.t_pkfn = microtask;
1836           team->t.t_invoke = invoker;
1837           __kmp_alloc_argv_entries(argc, team, TRUE);
1838           team->t.t_argc = argc;
1839           argv = (void **)team->t.t_argv;
1840           if (ap) {
1841             for (i = argc - 1; i >= 0; --i)
1842               *argv++ = va_arg(kmp_va_deref(ap), void *);
1843           } else {
1844             for (i = 0; i < argc; ++i)
1845               // Get args from parent team for teams construct
1846               argv[i] = parent_team->t.t_argv[i];
1847           }
1848           // AC: revert change made in __kmpc_serialized_parallel()
1849           //     because initial code in teams should have level=0
1850           team->t.t_level--;
1851           // AC: call special invoker for outer "parallel" of teams construct
1852           invoker(gtid);
1853 #if OMPT_SUPPORT
1854           if (ompt_enabled.enabled) {
1855             ompt_task_info_t *task_info = OMPT_CUR_TASK_INFO(master_th);
1856             if (ompt_enabled.ompt_callback_implicit_task) {
1857               ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1858                   ompt_scope_end, NULL, &(task_info->task_data), 0,
1859                   OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_initial);
1860             }
1861             if (ompt_enabled.ompt_callback_parallel_end) {
1862               ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1863                   &ompt_parallel_data, parent_task_data,
1864                   OMPT_INVOKER(call_context) | ompt_parallel_league,
1865                   return_address);
1866             }
1867             master_th->th.ompt_thread_info.state = ompt_state_overhead;
1868           }
1869 #endif
1870         } else {
1871           argv = args;
1872           for (i = argc - 1; i >= 0; --i)
1873             *argv++ = va_arg(kmp_va_deref(ap), void *);
1874           KMP_MB();
1875 
1876 #if OMPT_SUPPORT
1877           void *dummy;
1878           void **exit_frame_p;
1879           ompt_task_info_t *task_info;
1880 
1881           ompt_lw_taskteam_t lw_taskteam;
1882 
1883           if (ompt_enabled.enabled) {
1884             __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1885                                     &ompt_parallel_data, return_address);
1886             __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
1887             // don't use lw_taskteam after linking. content was swaped
1888             task_info = OMPT_CUR_TASK_INFO(master_th);
1889             exit_frame_p = &(task_info->frame.exit_frame.ptr);
1890 
1891             /* OMPT implicit task begin */
1892             implicit_task_data = OMPT_CUR_TASK_DATA(master_th);
1893             if (ompt_enabled.ompt_callback_implicit_task) {
1894               ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1895                   ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1896                   implicit_task_data, 1, __kmp_tid_from_gtid(gtid),
1897                   ompt_task_implicit);
1898               OMPT_CUR_TASK_INFO(master_th)
1899                   ->thread_num = __kmp_tid_from_gtid(gtid);
1900             }
1901 
1902             /* OMPT state */
1903             master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1904           } else {
1905             exit_frame_p = &dummy;
1906           }
1907 #endif
1908 
1909           {
1910             KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1911             KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1912             __kmp_invoke_microtask(microtask, gtid, 0, argc, args
1913 #if OMPT_SUPPORT
1914                                    ,
1915                                    exit_frame_p
1916 #endif
1917                                    );
1918           }
1919 
1920 #if OMPT_SUPPORT
1921           if (ompt_enabled.enabled) {
1922             *exit_frame_p = NULL;
1923             if (ompt_enabled.ompt_callback_implicit_task) {
1924               ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1925                   ompt_scope_end, NULL, &(task_info->task_data), 1,
1926                   OMPT_CUR_TASK_INFO(master_th)->thread_num,
1927                   ompt_task_implicit);
1928             }
1929 
1930             ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
1931             __ompt_lw_taskteam_unlink(master_th);
1932             if (ompt_enabled.ompt_callback_parallel_end) {
1933               ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1934                   &ompt_parallel_data, parent_task_data,
1935                   OMPT_INVOKER(call_context) | ompt_parallel_team,
1936                   return_address);
1937             }
1938             master_th->th.ompt_thread_info.state = ompt_state_overhead;
1939           }
1940 #endif
1941         }
1942       } else if (call_context == fork_context_gnu) {
1943 #if OMPT_SUPPORT
1944         ompt_lw_taskteam_t lwt;
1945         __ompt_lw_taskteam_init(&lwt, master_th, gtid, &ompt_parallel_data,
1946                                 return_address);
1947 
1948         lwt.ompt_task_info.frame.exit_frame = ompt_data_none;
1949         __ompt_lw_taskteam_link(&lwt, master_th, 1);
1950 // don't use lw_taskteam after linking. content was swaped
1951 #endif
1952 
1953         // we were called from GNU native code
1954         KA_TRACE(20, ("__kmp_fork_call: T#%d serial exit\n", gtid));
1955         return FALSE;
1956       } else {
1957         KMP_ASSERT2(call_context < fork_context_last,
1958                     "__kmp_fork_call: unknown fork_context parameter");
1959       }
1960 
1961       KA_TRACE(20, ("__kmp_fork_call: T#%d serial exit\n", gtid));
1962       KMP_MB();
1963       return FALSE;
1964     } // if (nthreads == 1)
1965 
1966     // GEH: only modify the executing flag in the case when not serialized
1967     //      serialized case is handled in kmpc_serialized_parallel
1968     KF_TRACE(10, ("__kmp_fork_call: parent_team_aclevel=%d, master_th=%p, "
1969                   "curtask=%p, curtask_max_aclevel=%d\n",
1970                   parent_team->t.t_active_level, master_th,
1971                   master_th->th.th_current_task,
1972                   master_th->th.th_current_task->td_icvs.max_active_levels));
1973     // TODO: GEH - cannot do this assertion because root thread not set up as
1974     // executing
1975     // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 1 );
1976     master_th->th.th_current_task->td_flags.executing = 0;
1977 
1978     if (!master_th->th.th_teams_microtask || level > teams_level) {
1979       /* Increment our nested depth level */
1980       KMP_ATOMIC_INC(&root->r.r_in_parallel);
1981     }
1982 
1983     // See if we need to make a copy of the ICVs.
1984     int nthreads_icv = master_th->th.th_current_task->td_icvs.nproc;
1985     if ((level + 1 < __kmp_nested_nth.used) &&
1986         (__kmp_nested_nth.nth[level + 1] != nthreads_icv)) {
1987       nthreads_icv = __kmp_nested_nth.nth[level + 1];
1988     } else {
1989       nthreads_icv = 0; // don't update
1990     }
1991 
1992     // Figure out the proc_bind_policy for the new team.
1993     kmp_proc_bind_t proc_bind = master_th->th.th_set_proc_bind;
1994     kmp_proc_bind_t proc_bind_icv =
1995         proc_bind_default; // proc_bind_default means don't update
1996     if (master_th->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
1997       proc_bind = proc_bind_false;
1998     } else {
1999       if (proc_bind == proc_bind_default) {
2000         // No proc_bind clause specified; use current proc-bind-var for this
2001         // parallel region
2002         proc_bind = master_th->th.th_current_task->td_icvs.proc_bind;
2003       }
2004       /* else: The proc_bind policy was specified explicitly on parallel clause.
2005          This overrides proc-bind-var for this parallel region, but does not
2006          change proc-bind-var. */
2007       // Figure the value of proc-bind-var for the child threads.
2008       if ((level + 1 < __kmp_nested_proc_bind.used) &&
2009           (__kmp_nested_proc_bind.bind_types[level + 1] !=
2010            master_th->th.th_current_task->td_icvs.proc_bind)) {
2011         proc_bind_icv = __kmp_nested_proc_bind.bind_types[level + 1];
2012       }
2013     }
2014 
2015     // Reset for next parallel region
2016     master_th->th.th_set_proc_bind = proc_bind_default;
2017 
2018     if ((nthreads_icv > 0) || (proc_bind_icv != proc_bind_default)) {
2019       kmp_internal_control_t new_icvs;
2020       copy_icvs(&new_icvs, &master_th->th.th_current_task->td_icvs);
2021       new_icvs.next = NULL;
2022       if (nthreads_icv > 0) {
2023         new_icvs.nproc = nthreads_icv;
2024       }
2025       if (proc_bind_icv != proc_bind_default) {
2026         new_icvs.proc_bind = proc_bind_icv;
2027       }
2028 
2029       /* allocate a new parallel team */
2030       KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n"));
2031       team = __kmp_allocate_team(root, nthreads, nthreads,
2032 #if OMPT_SUPPORT
2033                                  ompt_parallel_data,
2034 #endif
2035                                  proc_bind, &new_icvs,
2036                                  argc USE_NESTED_HOT_ARG(master_th));
2037     } else {
2038       /* allocate a new parallel team */
2039       KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n"));
2040       team = __kmp_allocate_team(root, nthreads, nthreads,
2041 #if OMPT_SUPPORT
2042                                  ompt_parallel_data,
2043 #endif
2044                                  proc_bind,
2045                                  &master_th->th.th_current_task->td_icvs,
2046                                  argc USE_NESTED_HOT_ARG(master_th));
2047     }
2048     KF_TRACE(
2049         10, ("__kmp_fork_call: after __kmp_allocate_team - team = %p\n", team));
2050 
2051     /* setup the new team */
2052     KMP_CHECK_UPDATE(team->t.t_master_tid, master_tid);
2053     KMP_CHECK_UPDATE(team->t.t_master_this_cons, master_this_cons);
2054     KMP_CHECK_UPDATE(team->t.t_ident, loc);
2055     KMP_CHECK_UPDATE(team->t.t_parent, parent_team);
2056     KMP_CHECK_UPDATE_SYNC(team->t.t_pkfn, microtask);
2057 #if OMPT_SUPPORT
2058     KMP_CHECK_UPDATE_SYNC(team->t.ompt_team_info.master_return_address,
2059                           return_address);
2060 #endif
2061     KMP_CHECK_UPDATE(team->t.t_invoke, invoker); // TODO move to root, maybe
2062     // TODO: parent_team->t.t_level == INT_MAX ???
2063     if (!master_th->th.th_teams_microtask || level > teams_level) {
2064       int new_level = parent_team->t.t_level + 1;
2065       KMP_CHECK_UPDATE(team->t.t_level, new_level);
2066       new_level = parent_team->t.t_active_level + 1;
2067       KMP_CHECK_UPDATE(team->t.t_active_level, new_level);
2068     } else {
2069       // AC: Do not increase parallel level at start of the teams construct
2070       int new_level = parent_team->t.t_level;
2071       KMP_CHECK_UPDATE(team->t.t_level, new_level);
2072       new_level = parent_team->t.t_active_level;
2073       KMP_CHECK_UPDATE(team->t.t_active_level, new_level);
2074     }
2075     kmp_r_sched_t new_sched = get__sched_2(parent_team, master_tid);
2076     // set master's schedule as new run-time schedule
2077     KMP_CHECK_UPDATE(team->t.t_sched.sched, new_sched.sched);
2078 
2079     KMP_CHECK_UPDATE(team->t.t_cancel_request, cancel_noreq);
2080     KMP_CHECK_UPDATE(team->t.t_def_allocator, master_th->th.th_def_allocator);
2081 
2082     // Update the floating point rounding in the team if required.
2083     propagateFPControl(team);
2084 
2085     if (__kmp_tasking_mode != tskm_immediate_exec) {
2086       // Set master's task team to team's task team. Unless this is hot team, it
2087       // should be NULL.
2088       KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
2089                        parent_team->t.t_task_team[master_th->th.th_task_state]);
2090       KA_TRACE(20, ("__kmp_fork_call: Master T#%d pushing task_team %p / team "
2091                     "%p, new task_team %p / team %p\n",
2092                     __kmp_gtid_from_thread(master_th),
2093                     master_th->th.th_task_team, parent_team,
2094                     team->t.t_task_team[master_th->th.th_task_state], team));
2095 
2096       if (active_level || master_th->th.th_task_team) {
2097         // Take a memo of master's task_state
2098         KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack);
2099         if (master_th->th.th_task_state_top >=
2100             master_th->th.th_task_state_stack_sz) { // increase size
2101           kmp_uint32 new_size = 2 * master_th->th.th_task_state_stack_sz;
2102           kmp_uint8 *old_stack, *new_stack;
2103           kmp_uint32 i;
2104           new_stack = (kmp_uint8 *)__kmp_allocate(new_size);
2105           for (i = 0; i < master_th->th.th_task_state_stack_sz; ++i) {
2106             new_stack[i] = master_th->th.th_task_state_memo_stack[i];
2107           }
2108           for (i = master_th->th.th_task_state_stack_sz; i < new_size;
2109                ++i) { // zero-init rest of stack
2110             new_stack[i] = 0;
2111           }
2112           old_stack = master_th->th.th_task_state_memo_stack;
2113           master_th->th.th_task_state_memo_stack = new_stack;
2114           master_th->th.th_task_state_stack_sz = new_size;
2115           __kmp_free(old_stack);
2116         }
2117         // Store master's task_state on stack
2118         master_th->th
2119             .th_task_state_memo_stack[master_th->th.th_task_state_top] =
2120             master_th->th.th_task_state;
2121         master_th->th.th_task_state_top++;
2122 #if KMP_NESTED_HOT_TEAMS
2123         if (master_th->th.th_hot_teams &&
2124             active_level < __kmp_hot_teams_max_level &&
2125             team == master_th->th.th_hot_teams[active_level].hot_team) {
2126           // Restore master's nested state if nested hot team
2127           master_th->th.th_task_state =
2128               master_th->th
2129                   .th_task_state_memo_stack[master_th->th.th_task_state_top];
2130         } else {
2131 #endif
2132           master_th->th.th_task_state = 0;
2133 #if KMP_NESTED_HOT_TEAMS
2134         }
2135 #endif
2136       }
2137 #if !KMP_NESTED_HOT_TEAMS
2138       KMP_DEBUG_ASSERT((master_th->th.th_task_team == NULL) ||
2139                        (team == root->r.r_hot_team));
2140 #endif
2141     }
2142 
2143     KA_TRACE(
2144         20,
2145         ("__kmp_fork_call: T#%d(%d:%d)->(%d:0) created a team of %d threads\n",
2146          gtid, parent_team->t.t_id, team->t.t_master_tid, team->t.t_id,
2147          team->t.t_nproc));
2148     KMP_DEBUG_ASSERT(team != root->r.r_hot_team ||
2149                      (team->t.t_master_tid == 0 &&
2150                       (team->t.t_parent == root->r.r_root_team ||
2151                        team->t.t_parent->t.t_serialized)));
2152     KMP_MB();
2153 
2154     /* now, setup the arguments */
2155     argv = (void **)team->t.t_argv;
2156     if (ap) {
2157       for (i = argc - 1; i >= 0; --i) {
2158         void *new_argv = va_arg(kmp_va_deref(ap), void *);
2159         KMP_CHECK_UPDATE(*argv, new_argv);
2160         argv++;
2161       }
2162     } else {
2163       for (i = 0; i < argc; ++i) {
2164         // Get args from parent team for teams construct
2165         KMP_CHECK_UPDATE(argv[i], team->t.t_parent->t.t_argv[i]);
2166       }
2167     }
2168 
2169     /* now actually fork the threads */
2170     KMP_CHECK_UPDATE(team->t.t_master_active, master_active);
2171     if (!root->r.r_active) // Only do assignment if it prevents cache ping-pong
2172       root->r.r_active = TRUE;
2173 
2174     __kmp_fork_team_threads(root, team, master_th, gtid);
2175     __kmp_setup_icv_copy(team, nthreads,
2176                          &master_th->th.th_current_task->td_icvs, loc);
2177 
2178 #if OMPT_SUPPORT
2179     master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
2180 #endif
2181 
2182     __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2183 
2184 #if USE_ITT_BUILD
2185     if (team->t.t_active_level == 1 // only report frames at level 1
2186         && !master_th->th.th_teams_microtask) { // not in teams construct
2187 #if USE_ITT_NOTIFY
2188       if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
2189           (__kmp_forkjoin_frames_mode == 3 ||
2190            __kmp_forkjoin_frames_mode == 1)) {
2191         kmp_uint64 tmp_time = 0;
2192         if (__itt_get_timestamp_ptr)
2193           tmp_time = __itt_get_timestamp();
2194         // Internal fork - report frame begin
2195         master_th->th.th_frame_time = tmp_time;
2196         if (__kmp_forkjoin_frames_mode == 3)
2197           team->t.t_region_time = tmp_time;
2198       } else
2199 // only one notification scheme (either "submit" or "forking/joined", not both)
2200 #endif /* USE_ITT_NOTIFY */
2201           if ((__itt_frame_begin_v3_ptr || KMP_ITT_DEBUG) &&
2202               __kmp_forkjoin_frames && !__kmp_forkjoin_frames_mode) {
2203         // Mark start of "parallel" region for Intel(R) VTune(TM) analyzer.
2204         __kmp_itt_region_forking(gtid, team->t.t_nproc, 0);
2205       }
2206     }
2207 #endif /* USE_ITT_BUILD */
2208 
2209     /* now go on and do the work */
2210     KMP_DEBUG_ASSERT(team == __kmp_threads[gtid]->th.th_team);
2211     KMP_MB();
2212     KF_TRACE(10,
2213              ("__kmp_internal_fork : root=%p, team=%p, master_th=%p, gtid=%d\n",
2214               root, team, master_th, gtid));
2215 
2216 #if USE_ITT_BUILD
2217     if (__itt_stack_caller_create_ptr) {
2218       team->t.t_stack_id =
2219           __kmp_itt_stack_caller_create(); // create new stack stitching id
2220       // before entering fork barrier
2221     }
2222 #endif /* USE_ITT_BUILD */
2223 
2224     // AC: skip __kmp_internal_fork at teams construct, let only master
2225     // threads execute
2226     if (ap) {
2227       __kmp_internal_fork(loc, gtid, team);
2228       KF_TRACE(10, ("__kmp_internal_fork : after : root=%p, team=%p, "
2229                     "master_th=%p, gtid=%d\n",
2230                     root, team, master_th, gtid));
2231     }
2232 
2233     if (call_context == fork_context_gnu) {
2234       KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
2235       return TRUE;
2236     }
2237 
2238     /* Invoke microtask for MASTER thread */
2239     KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n", gtid,
2240                   team->t.t_id, team->t.t_pkfn));
2241   } // END of timer KMP_fork_call block
2242 
2243 #if KMP_STATS_ENABLED
2244   // If beginning a teams construct, then change thread state
2245   stats_state_e previous_state = KMP_GET_THREAD_STATE();
2246   if (!ap) {
2247     KMP_SET_THREAD_STATE(stats_state_e::TEAMS_REGION);
2248   }
2249 #endif
2250 
2251   if (!team->t.t_invoke(gtid)) {
2252     KMP_ASSERT2(0, "cannot invoke microtask for MASTER thread");
2253   }
2254 
2255 #if KMP_STATS_ENABLED
2256   // If was beginning of a teams construct, then reset thread state
2257   if (!ap) {
2258     KMP_SET_THREAD_STATE(previous_state);
2259   }
2260 #endif
2261 
2262   KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n", gtid,
2263                 team->t.t_id, team->t.t_pkfn));
2264   KMP_MB(); /* Flush all pending memory write invalidates.  */
2265 
2266   KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
2267 
2268 #if OMPT_SUPPORT
2269   if (ompt_enabled.enabled) {
2270     master_th->th.ompt_thread_info.state = ompt_state_overhead;
2271   }
2272 #endif
2273 
2274   return TRUE;
2275 }
2276 
2277 #if OMPT_SUPPORT
2278 static inline void __kmp_join_restore_state(kmp_info_t *thread,
2279                                             kmp_team_t *team) {
2280   // restore state outside the region
2281   thread->th.ompt_thread_info.state =
2282       ((team->t.t_serialized) ? ompt_state_work_serial
2283                               : ompt_state_work_parallel);
2284 }
2285 
2286 static inline void __kmp_join_ompt(int gtid, kmp_info_t *thread,
2287                                    kmp_team_t *team, ompt_data_t *parallel_data,
2288                                    int flags, void *codeptr) {
2289   ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2290   if (ompt_enabled.ompt_callback_parallel_end) {
2291     ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
2292         parallel_data, &(task_info->task_data), flags, codeptr);
2293   }
2294 
2295   task_info->frame.enter_frame = ompt_data_none;
2296   __kmp_join_restore_state(thread, team);
2297 }
2298 #endif
2299 
2300 void __kmp_join_call(ident_t *loc, int gtid
2301 #if OMPT_SUPPORT
2302                      ,
2303                      enum fork_context_e fork_context
2304 #endif
2305                      ,
2306                      int exit_teams) {
2307   KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_join_call);
2308   kmp_team_t *team;
2309   kmp_team_t *parent_team;
2310   kmp_info_t *master_th;
2311   kmp_root_t *root;
2312   int master_active;
2313 
2314   KA_TRACE(20, ("__kmp_join_call: enter T#%d\n", gtid));
2315 
2316   /* setup current data */
2317   master_th = __kmp_threads[gtid];
2318   root = master_th->th.th_root;
2319   team = master_th->th.th_team;
2320   parent_team = team->t.t_parent;
2321 
2322   master_th->th.th_ident = loc;
2323 
2324 #if OMPT_SUPPORT
2325   void *team_microtask = (void *)team->t.t_pkfn;
2326   // For GOMP interface with serialized parallel, need the
2327   // __kmpc_end_serialized_parallel to call hooks for OMPT end-implicit-task
2328   // and end-parallel events.
2329   if (ompt_enabled.enabled &&
2330       !(team->t.t_serialized && fork_context == fork_context_gnu)) {
2331     master_th->th.ompt_thread_info.state = ompt_state_overhead;
2332   }
2333 #endif
2334 
2335 #if KMP_DEBUG
2336   if (__kmp_tasking_mode != tskm_immediate_exec && !exit_teams) {
2337     KA_TRACE(20, ("__kmp_join_call: T#%d, old team = %p old task_team = %p, "
2338                   "th_task_team = %p\n",
2339                   __kmp_gtid_from_thread(master_th), team,
2340                   team->t.t_task_team[master_th->th.th_task_state],
2341                   master_th->th.th_task_team));
2342     KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
2343                      team->t.t_task_team[master_th->th.th_task_state]);
2344   }
2345 #endif
2346 
2347   if (team->t.t_serialized) {
2348     if (master_th->th.th_teams_microtask) {
2349       // We are in teams construct
2350       int level = team->t.t_level;
2351       int tlevel = master_th->th.th_teams_level;
2352       if (level == tlevel) {
2353         // AC: we haven't incremented it earlier at start of teams construct,
2354         //     so do it here - at the end of teams construct
2355         team->t.t_level++;
2356       } else if (level == tlevel + 1) {
2357         // AC: we are exiting parallel inside teams, need to increment
2358         // serialization in order to restore it in the next call to
2359         // __kmpc_end_serialized_parallel
2360         team->t.t_serialized++;
2361       }
2362     }
2363     __kmpc_end_serialized_parallel(loc, gtid);
2364 
2365 #if OMPT_SUPPORT
2366     if (ompt_enabled.enabled) {
2367       __kmp_join_restore_state(master_th, parent_team);
2368     }
2369 #endif
2370 
2371     return;
2372   }
2373 
2374   master_active = team->t.t_master_active;
2375 
2376   if (!exit_teams) {
2377     // AC: No barrier for internal teams at exit from teams construct.
2378     //     But there is barrier for external team (league).
2379     __kmp_internal_join(loc, gtid, team);
2380   } else {
2381     master_th->th.th_task_state =
2382         0; // AC: no tasking in teams (out of any parallel)
2383   }
2384 
2385   KMP_MB();
2386 
2387 #if OMPT_SUPPORT
2388   ompt_data_t *parallel_data = &(team->t.ompt_team_info.parallel_data);
2389   void *codeptr = team->t.ompt_team_info.master_return_address;
2390 #endif
2391 
2392 #if USE_ITT_BUILD
2393   if (__itt_stack_caller_create_ptr) {
2394     // destroy the stack stitching id after join barrier
2395     __kmp_itt_stack_caller_destroy((__itt_caller)team->t.t_stack_id);
2396   }
2397   // Mark end of "parallel" region for Intel(R) VTune(TM) analyzer.
2398   if (team->t.t_active_level == 1 &&
2399       (!master_th->th.th_teams_microtask || /* not in teams construct */
2400        master_th->th.th_teams_size.nteams == 1)) {
2401     master_th->th.th_ident = loc;
2402     // only one notification scheme (either "submit" or "forking/joined", not
2403     // both)
2404     if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
2405         __kmp_forkjoin_frames_mode == 3)
2406       __kmp_itt_frame_submit(gtid, team->t.t_region_time,
2407                              master_th->th.th_frame_time, 0, loc,
2408                              master_th->th.th_team_nproc, 1);
2409     else if ((__itt_frame_end_v3_ptr || KMP_ITT_DEBUG) &&
2410              !__kmp_forkjoin_frames_mode && __kmp_forkjoin_frames)
2411       __kmp_itt_region_joined(gtid);
2412   } // active_level == 1
2413 #endif /* USE_ITT_BUILD */
2414 
2415   if (master_th->th.th_teams_microtask && !exit_teams &&
2416       team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
2417       team->t.t_level == master_th->th.th_teams_level + 1) {
2418 // AC: We need to leave the team structure intact at the end of parallel
2419 // inside the teams construct, so that at the next parallel same (hot) team
2420 // works, only adjust nesting levels
2421 #if OMPT_SUPPORT
2422     ompt_data_t ompt_parallel_data = ompt_data_none;
2423     if (ompt_enabled.enabled) {
2424       ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2425       if (ompt_enabled.ompt_callback_implicit_task) {
2426         int ompt_team_size = team->t.t_nproc;
2427         ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
2428             ompt_scope_end, NULL, &(task_info->task_data), ompt_team_size,
2429             OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
2430       }
2431       task_info->frame.exit_frame = ompt_data_none;
2432       task_info->task_data = ompt_data_none;
2433       ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
2434       __ompt_lw_taskteam_unlink(master_th);
2435     }
2436 #endif
2437     /* Decrement our nested depth level */
2438     team->t.t_level--;
2439     team->t.t_active_level--;
2440     KMP_ATOMIC_DEC(&root->r.r_in_parallel);
2441 
2442     // Restore number of threads in the team if needed. This code relies on
2443     // the proper adjustment of th_teams_size.nth after the fork in
2444     // __kmp_teams_master on each teams master in the case that
2445     // __kmp_reserve_threads reduced it.
2446     if (master_th->th.th_team_nproc < master_th->th.th_teams_size.nth) {
2447       int old_num = master_th->th.th_team_nproc;
2448       int new_num = master_th->th.th_teams_size.nth;
2449       kmp_info_t **other_threads = team->t.t_threads;
2450       team->t.t_nproc = new_num;
2451       for (int i = 0; i < old_num; ++i) {
2452         other_threads[i]->th.th_team_nproc = new_num;
2453       }
2454       // Adjust states of non-used threads of the team
2455       for (int i = old_num; i < new_num; ++i) {
2456         // Re-initialize thread's barrier data.
2457         KMP_DEBUG_ASSERT(other_threads[i]);
2458         kmp_balign_t *balign = other_threads[i]->th.th_bar;
2459         for (int b = 0; b < bs_last_barrier; ++b) {
2460           balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
2461           KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
2462 #if USE_DEBUGGER
2463           balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
2464 #endif
2465         }
2466         if (__kmp_tasking_mode != tskm_immediate_exec) {
2467           // Synchronize thread's task state
2468           other_threads[i]->th.th_task_state = master_th->th.th_task_state;
2469         }
2470       }
2471     }
2472 
2473 #if OMPT_SUPPORT
2474     if (ompt_enabled.enabled) {
2475       __kmp_join_ompt(gtid, master_th, parent_team, &ompt_parallel_data,
2476                       OMPT_INVOKER(fork_context) | ompt_parallel_team, codeptr);
2477     }
2478 #endif
2479 
2480     return;
2481   }
2482 
2483   /* do cleanup and restore the parent team */
2484   master_th->th.th_info.ds.ds_tid = team->t.t_master_tid;
2485   master_th->th.th_local.this_construct = team->t.t_master_this_cons;
2486 
2487   master_th->th.th_dispatch = &parent_team->t.t_dispatch[team->t.t_master_tid];
2488 
2489   /* jc: The following lock has instructions with REL and ACQ semantics,
2490      separating the parallel user code called in this parallel region
2491      from the serial user code called after this function returns. */
2492   __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
2493 
2494   if (!master_th->th.th_teams_microtask ||
2495       team->t.t_level > master_th->th.th_teams_level) {
2496     /* Decrement our nested depth level */
2497     KMP_ATOMIC_DEC(&root->r.r_in_parallel);
2498   }
2499   KMP_DEBUG_ASSERT(root->r.r_in_parallel >= 0);
2500 
2501 #if OMPT_SUPPORT
2502   if (ompt_enabled.enabled) {
2503     ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2504     if (ompt_enabled.ompt_callback_implicit_task) {
2505       int flags = (team_microtask == (void *)__kmp_teams_master)
2506                       ? ompt_task_initial
2507                       : ompt_task_implicit;
2508       int ompt_team_size = (flags == ompt_task_initial) ? 0 : team->t.t_nproc;
2509       ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
2510           ompt_scope_end, NULL, &(task_info->task_data), ompt_team_size,
2511           OMPT_CUR_TASK_INFO(master_th)->thread_num, flags);
2512     }
2513     task_info->frame.exit_frame = ompt_data_none;
2514     task_info->task_data = ompt_data_none;
2515   }
2516 #endif
2517 
2518   KF_TRACE(10, ("__kmp_join_call1: T#%d, this_thread=%p team=%p\n", 0,
2519                 master_th, team));
2520   __kmp_pop_current_task_from_thread(master_th);
2521 
2522 #if KMP_AFFINITY_SUPPORTED
2523   // Restore master thread's partition.
2524   master_th->th.th_first_place = team->t.t_first_place;
2525   master_th->th.th_last_place = team->t.t_last_place;
2526 #endif // KMP_AFFINITY_SUPPORTED
2527   master_th->th.th_def_allocator = team->t.t_def_allocator;
2528 
2529   updateHWFPControl(team);
2530 
2531   if (root->r.r_active != master_active)
2532     root->r.r_active = master_active;
2533 
2534   __kmp_free_team(root, team USE_NESTED_HOT_ARG(
2535                             master_th)); // this will free worker threads
2536 
2537   /* this race was fun to find. make sure the following is in the critical
2538      region otherwise assertions may fail occasionally since the old team may be
2539      reallocated and the hierarchy appears inconsistent. it is actually safe to
2540      run and won't cause any bugs, but will cause those assertion failures. it's
2541      only one deref&assign so might as well put this in the critical region */
2542   master_th->th.th_team = parent_team;
2543   master_th->th.th_team_nproc = parent_team->t.t_nproc;
2544   master_th->th.th_team_master = parent_team->t.t_threads[0];
2545   master_th->th.th_team_serialized = parent_team->t.t_serialized;
2546 
2547   /* restore serialized team, if need be */
2548   if (parent_team->t.t_serialized &&
2549       parent_team != master_th->th.th_serial_team &&
2550       parent_team != root->r.r_root_team) {
2551     __kmp_free_team(root,
2552                     master_th->th.th_serial_team USE_NESTED_HOT_ARG(NULL));
2553     master_th->th.th_serial_team = parent_team;
2554   }
2555 
2556   if (__kmp_tasking_mode != tskm_immediate_exec) {
2557     if (master_th->th.th_task_state_top >
2558         0) { // Restore task state from memo stack
2559       KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack);
2560       // Remember master's state if we re-use this nested hot team
2561       master_th->th.th_task_state_memo_stack[master_th->th.th_task_state_top] =
2562           master_th->th.th_task_state;
2563       --master_th->th.th_task_state_top; // pop
2564       // Now restore state at this level
2565       master_th->th.th_task_state =
2566           master_th->th
2567               .th_task_state_memo_stack[master_th->th.th_task_state_top];
2568     }
2569     // Copy the task team from the parent team to the master thread
2570     master_th->th.th_task_team =
2571         parent_team->t.t_task_team[master_th->th.th_task_state];
2572     KA_TRACE(20,
2573              ("__kmp_join_call: Master T#%d restoring task_team %p / team %p\n",
2574               __kmp_gtid_from_thread(master_th), master_th->th.th_task_team,
2575               parent_team));
2576   }
2577 
2578   // TODO: GEH - cannot do this assertion because root thread not set up as
2579   // executing
2580   // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 0 );
2581   master_th->th.th_current_task->td_flags.executing = 1;
2582 
2583   __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2584 
2585 #if OMPT_SUPPORT
2586   int flags =
2587       OMPT_INVOKER(fork_context) |
2588       ((team_microtask == (void *)__kmp_teams_master) ? ompt_parallel_league
2589                                                       : ompt_parallel_team);
2590   if (ompt_enabled.enabled) {
2591     __kmp_join_ompt(gtid, master_th, parent_team, parallel_data, flags,
2592                     codeptr);
2593   }
2594 #endif
2595 
2596   KMP_MB();
2597   KA_TRACE(20, ("__kmp_join_call: exit T#%d\n", gtid));
2598 }
2599 
2600 /* Check whether we should push an internal control record onto the
2601    serial team stack.  If so, do it.  */
2602 void __kmp_save_internal_controls(kmp_info_t *thread) {
2603 
2604   if (thread->th.th_team != thread->th.th_serial_team) {
2605     return;
2606   }
2607   if (thread->th.th_team->t.t_serialized > 1) {
2608     int push = 0;
2609 
2610     if (thread->th.th_team->t.t_control_stack_top == NULL) {
2611       push = 1;
2612     } else {
2613       if (thread->th.th_team->t.t_control_stack_top->serial_nesting_level !=
2614           thread->th.th_team->t.t_serialized) {
2615         push = 1;
2616       }
2617     }
2618     if (push) { /* push a record on the serial team's stack */
2619       kmp_internal_control_t *control =
2620           (kmp_internal_control_t *)__kmp_allocate(
2621               sizeof(kmp_internal_control_t));
2622 
2623       copy_icvs(control, &thread->th.th_current_task->td_icvs);
2624 
2625       control->serial_nesting_level = thread->th.th_team->t.t_serialized;
2626 
2627       control->next = thread->th.th_team->t.t_control_stack_top;
2628       thread->th.th_team->t.t_control_stack_top = control;
2629     }
2630   }
2631 }
2632 
2633 /* Changes set_nproc */
2634 void __kmp_set_num_threads(int new_nth, int gtid) {
2635   kmp_info_t *thread;
2636   kmp_root_t *root;
2637 
2638   KF_TRACE(10, ("__kmp_set_num_threads: new __kmp_nth = %d\n", new_nth));
2639   KMP_DEBUG_ASSERT(__kmp_init_serial);
2640 
2641   if (new_nth < 1)
2642     new_nth = 1;
2643   else if (new_nth > __kmp_max_nth)
2644     new_nth = __kmp_max_nth;
2645 
2646   KMP_COUNT_VALUE(OMP_set_numthreads, new_nth);
2647   thread = __kmp_threads[gtid];
2648   if (thread->th.th_current_task->td_icvs.nproc == new_nth)
2649     return; // nothing to do
2650 
2651   __kmp_save_internal_controls(thread);
2652 
2653   set__nproc(thread, new_nth);
2654 
2655   // If this omp_set_num_threads() call will cause the hot team size to be
2656   // reduced (in the absence of a num_threads clause), then reduce it now,
2657   // rather than waiting for the next parallel region.
2658   root = thread->th.th_root;
2659   if (__kmp_init_parallel && (!root->r.r_active) &&
2660       (root->r.r_hot_team->t.t_nproc > new_nth)
2661 #if KMP_NESTED_HOT_TEAMS
2662       && __kmp_hot_teams_max_level && !__kmp_hot_teams_mode
2663 #endif
2664       ) {
2665     kmp_team_t *hot_team = root->r.r_hot_team;
2666     int f;
2667 
2668     __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
2669 
2670     // Release the extra threads we don't need any more.
2671     for (f = new_nth; f < hot_team->t.t_nproc; f++) {
2672       KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL);
2673       if (__kmp_tasking_mode != tskm_immediate_exec) {
2674         // When decreasing team size, threads no longer in the team should unref
2675         // task team.
2676         hot_team->t.t_threads[f]->th.th_task_team = NULL;
2677       }
2678       __kmp_free_thread(hot_team->t.t_threads[f]);
2679       hot_team->t.t_threads[f] = NULL;
2680     }
2681     hot_team->t.t_nproc = new_nth;
2682 #if KMP_NESTED_HOT_TEAMS
2683     if (thread->th.th_hot_teams) {
2684       KMP_DEBUG_ASSERT(hot_team == thread->th.th_hot_teams[0].hot_team);
2685       thread->th.th_hot_teams[0].hot_team_nth = new_nth;
2686     }
2687 #endif
2688 
2689     __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2690 
2691     // Update the t_nproc field in the threads that are still active.
2692     for (f = 0; f < new_nth; f++) {
2693       KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL);
2694       hot_team->t.t_threads[f]->th.th_team_nproc = new_nth;
2695     }
2696     // Special flag in case omp_set_num_threads() call
2697     hot_team->t.t_size_changed = -1;
2698   }
2699 }
2700 
2701 /* Changes max_active_levels */
2702 void __kmp_set_max_active_levels(int gtid, int max_active_levels) {
2703   kmp_info_t *thread;
2704 
2705   KF_TRACE(10, ("__kmp_set_max_active_levels: new max_active_levels for thread "
2706                 "%d = (%d)\n",
2707                 gtid, max_active_levels));
2708   KMP_DEBUG_ASSERT(__kmp_init_serial);
2709 
2710   // validate max_active_levels
2711   if (max_active_levels < 0) {
2712     KMP_WARNING(ActiveLevelsNegative, max_active_levels);
2713     // We ignore this call if the user has specified a negative value.
2714     // The current setting won't be changed. The last valid setting will be
2715     // used. A warning will be issued (if warnings are allowed as controlled by
2716     // the KMP_WARNINGS env var).
2717     KF_TRACE(10, ("__kmp_set_max_active_levels: the call is ignored: new "
2718                   "max_active_levels for thread %d = (%d)\n",
2719                   gtid, max_active_levels));
2720     return;
2721   }
2722   if (max_active_levels <= KMP_MAX_ACTIVE_LEVELS_LIMIT) {
2723     // it's OK, the max_active_levels is within the valid range: [ 0;
2724     // KMP_MAX_ACTIVE_LEVELS_LIMIT ]
2725     // We allow a zero value. (implementation defined behavior)
2726   } else {
2727     KMP_WARNING(ActiveLevelsExceedLimit, max_active_levels,
2728                 KMP_MAX_ACTIVE_LEVELS_LIMIT);
2729     max_active_levels = KMP_MAX_ACTIVE_LEVELS_LIMIT;
2730     // Current upper limit is MAX_INT. (implementation defined behavior)
2731     // If the input exceeds the upper limit, we correct the input to be the
2732     // upper limit. (implementation defined behavior)
2733     // Actually, the flow should never get here until we use MAX_INT limit.
2734   }
2735   KF_TRACE(10, ("__kmp_set_max_active_levels: after validation: new "
2736                 "max_active_levels for thread %d = (%d)\n",
2737                 gtid, max_active_levels));
2738 
2739   thread = __kmp_threads[gtid];
2740 
2741   __kmp_save_internal_controls(thread);
2742 
2743   set__max_active_levels(thread, max_active_levels);
2744 }
2745 
2746 /* Gets max_active_levels */
2747 int __kmp_get_max_active_levels(int gtid) {
2748   kmp_info_t *thread;
2749 
2750   KF_TRACE(10, ("__kmp_get_max_active_levels: thread %d\n", gtid));
2751   KMP_DEBUG_ASSERT(__kmp_init_serial);
2752 
2753   thread = __kmp_threads[gtid];
2754   KMP_DEBUG_ASSERT(thread->th.th_current_task);
2755   KF_TRACE(10, ("__kmp_get_max_active_levels: thread %d, curtask=%p, "
2756                 "curtask_maxaclevel=%d\n",
2757                 gtid, thread->th.th_current_task,
2758                 thread->th.th_current_task->td_icvs.max_active_levels));
2759   return thread->th.th_current_task->td_icvs.max_active_levels;
2760 }
2761 
2762 KMP_BUILD_ASSERT(sizeof(kmp_sched_t) == sizeof(int));
2763 KMP_BUILD_ASSERT(sizeof(enum sched_type) == sizeof(int));
2764 
2765 /* Changes def_sched_var ICV values (run-time schedule kind and chunk) */
2766 void __kmp_set_schedule(int gtid, kmp_sched_t kind, int chunk) {
2767   kmp_info_t *thread;
2768   kmp_sched_t orig_kind;
2769   //    kmp_team_t *team;
2770 
2771   KF_TRACE(10, ("__kmp_set_schedule: new schedule for thread %d = (%d, %d)\n",
2772                 gtid, (int)kind, chunk));
2773   KMP_DEBUG_ASSERT(__kmp_init_serial);
2774 
2775   // Check if the kind parameter is valid, correct if needed.
2776   // Valid parameters should fit in one of two intervals - standard or extended:
2777   //       <lower>, <valid>, <upper_std>, <lower_ext>, <valid>, <upper>
2778   // 2008-01-25: 0,  1 - 4,       5,         100,     101 - 102, 103
2779   orig_kind = kind;
2780   kind = __kmp_sched_without_mods(kind);
2781 
2782   if (kind <= kmp_sched_lower || kind >= kmp_sched_upper ||
2783       (kind <= kmp_sched_lower_ext && kind >= kmp_sched_upper_std)) {
2784     // TODO: Hint needs attention in case we change the default schedule.
2785     __kmp_msg(kmp_ms_warning, KMP_MSG(ScheduleKindOutOfRange, kind),
2786               KMP_HNT(DefaultScheduleKindUsed, "static, no chunk"),
2787               __kmp_msg_null);
2788     kind = kmp_sched_default;
2789     chunk = 0; // ignore chunk value in case of bad kind
2790   }
2791 
2792   thread = __kmp_threads[gtid];
2793 
2794   __kmp_save_internal_controls(thread);
2795 
2796   if (kind < kmp_sched_upper_std) {
2797     if (kind == kmp_sched_static && chunk < KMP_DEFAULT_CHUNK) {
2798       // differ static chunked vs. unchunked:  chunk should be invalid to
2799       // indicate unchunked schedule (which is the default)
2800       thread->th.th_current_task->td_icvs.sched.r_sched_type = kmp_sch_static;
2801     } else {
2802       thread->th.th_current_task->td_icvs.sched.r_sched_type =
2803           __kmp_sch_map[kind - kmp_sched_lower - 1];
2804     }
2805   } else {
2806     //    __kmp_sch_map[ kind - kmp_sched_lower_ext + kmp_sched_upper_std -
2807     //    kmp_sched_lower - 2 ];
2808     thread->th.th_current_task->td_icvs.sched.r_sched_type =
2809         __kmp_sch_map[kind - kmp_sched_lower_ext + kmp_sched_upper_std -
2810                       kmp_sched_lower - 2];
2811   }
2812   __kmp_sched_apply_mods_intkind(
2813       orig_kind, &(thread->th.th_current_task->td_icvs.sched.r_sched_type));
2814   if (kind == kmp_sched_auto || chunk < 1) {
2815     // ignore parameter chunk for schedule auto
2816     thread->th.th_current_task->td_icvs.sched.chunk = KMP_DEFAULT_CHUNK;
2817   } else {
2818     thread->th.th_current_task->td_icvs.sched.chunk = chunk;
2819   }
2820 }
2821 
2822 /* Gets def_sched_var ICV values */
2823 void __kmp_get_schedule(int gtid, kmp_sched_t *kind, int *chunk) {
2824   kmp_info_t *thread;
2825   enum sched_type th_type;
2826 
2827   KF_TRACE(10, ("__kmp_get_schedule: thread %d\n", gtid));
2828   KMP_DEBUG_ASSERT(__kmp_init_serial);
2829 
2830   thread = __kmp_threads[gtid];
2831 
2832   th_type = thread->th.th_current_task->td_icvs.sched.r_sched_type;
2833   switch (SCHEDULE_WITHOUT_MODIFIERS(th_type)) {
2834   case kmp_sch_static:
2835   case kmp_sch_static_greedy:
2836   case kmp_sch_static_balanced:
2837     *kind = kmp_sched_static;
2838     __kmp_sched_apply_mods_stdkind(kind, th_type);
2839     *chunk = 0; // chunk was not set, try to show this fact via zero value
2840     return;
2841   case kmp_sch_static_chunked:
2842     *kind = kmp_sched_static;
2843     break;
2844   case kmp_sch_dynamic_chunked:
2845     *kind = kmp_sched_dynamic;
2846     break;
2847   case kmp_sch_guided_chunked:
2848   case kmp_sch_guided_iterative_chunked:
2849   case kmp_sch_guided_analytical_chunked:
2850     *kind = kmp_sched_guided;
2851     break;
2852   case kmp_sch_auto:
2853     *kind = kmp_sched_auto;
2854     break;
2855   case kmp_sch_trapezoidal:
2856     *kind = kmp_sched_trapezoidal;
2857     break;
2858 #if KMP_STATIC_STEAL_ENABLED
2859   case kmp_sch_static_steal:
2860     *kind = kmp_sched_static_steal;
2861     break;
2862 #endif
2863   default:
2864     KMP_FATAL(UnknownSchedulingType, th_type);
2865   }
2866 
2867   __kmp_sched_apply_mods_stdkind(kind, th_type);
2868   *chunk = thread->th.th_current_task->td_icvs.sched.chunk;
2869 }
2870 
2871 int __kmp_get_ancestor_thread_num(int gtid, int level) {
2872 
2873   int ii, dd;
2874   kmp_team_t *team;
2875   kmp_info_t *thr;
2876 
2877   KF_TRACE(10, ("__kmp_get_ancestor_thread_num: thread %d %d\n", gtid, level));
2878   KMP_DEBUG_ASSERT(__kmp_init_serial);
2879 
2880   // validate level
2881   if (level == 0)
2882     return 0;
2883   if (level < 0)
2884     return -1;
2885   thr = __kmp_threads[gtid];
2886   team = thr->th.th_team;
2887   ii = team->t.t_level;
2888   if (level > ii)
2889     return -1;
2890 
2891   if (thr->th.th_teams_microtask) {
2892     // AC: we are in teams region where multiple nested teams have same level
2893     int tlevel = thr->th.th_teams_level; // the level of the teams construct
2894     if (level <=
2895         tlevel) { // otherwise usual algorithm works (will not touch the teams)
2896       KMP_DEBUG_ASSERT(ii >= tlevel);
2897       // AC: As we need to pass by the teams league, we need to artificially
2898       // increase ii
2899       if (ii == tlevel) {
2900         ii += 2; // three teams have same level
2901       } else {
2902         ii++; // two teams have same level
2903       }
2904     }
2905   }
2906 
2907   if (ii == level)
2908     return __kmp_tid_from_gtid(gtid);
2909 
2910   dd = team->t.t_serialized;
2911   level++;
2912   while (ii > level) {
2913     for (dd = team->t.t_serialized; (dd > 0) && (ii > level); dd--, ii--) {
2914     }
2915     if ((team->t.t_serialized) && (!dd)) {
2916       team = team->t.t_parent;
2917       continue;
2918     }
2919     if (ii > level) {
2920       team = team->t.t_parent;
2921       dd = team->t.t_serialized;
2922       ii--;
2923     }
2924   }
2925 
2926   return (dd > 1) ? (0) : (team->t.t_master_tid);
2927 }
2928 
2929 int __kmp_get_team_size(int gtid, int level) {
2930 
2931   int ii, dd;
2932   kmp_team_t *team;
2933   kmp_info_t *thr;
2934 
2935   KF_TRACE(10, ("__kmp_get_team_size: thread %d %d\n", gtid, level));
2936   KMP_DEBUG_ASSERT(__kmp_init_serial);
2937 
2938   // validate level
2939   if (level == 0)
2940     return 1;
2941   if (level < 0)
2942     return -1;
2943   thr = __kmp_threads[gtid];
2944   team = thr->th.th_team;
2945   ii = team->t.t_level;
2946   if (level > ii)
2947     return -1;
2948 
2949   if (thr->th.th_teams_microtask) {
2950     // AC: we are in teams region where multiple nested teams have same level
2951     int tlevel = thr->th.th_teams_level; // the level of the teams construct
2952     if (level <=
2953         tlevel) { // otherwise usual algorithm works (will not touch the teams)
2954       KMP_DEBUG_ASSERT(ii >= tlevel);
2955       // AC: As we need to pass by the teams league, we need to artificially
2956       // increase ii
2957       if (ii == tlevel) {
2958         ii += 2; // three teams have same level
2959       } else {
2960         ii++; // two teams have same level
2961       }
2962     }
2963   }
2964 
2965   while (ii > level) {
2966     for (dd = team->t.t_serialized; (dd > 0) && (ii > level); dd--, ii--) {
2967     }
2968     if (team->t.t_serialized && (!dd)) {
2969       team = team->t.t_parent;
2970       continue;
2971     }
2972     if (ii > level) {
2973       team = team->t.t_parent;
2974       ii--;
2975     }
2976   }
2977 
2978   return team->t.t_nproc;
2979 }
2980 
2981 kmp_r_sched_t __kmp_get_schedule_global() {
2982   // This routine created because pairs (__kmp_sched, __kmp_chunk) and
2983   // (__kmp_static, __kmp_guided) may be changed by kmp_set_defaults
2984   // independently. So one can get the updated schedule here.
2985 
2986   kmp_r_sched_t r_sched;
2987 
2988   // create schedule from 4 globals: __kmp_sched, __kmp_chunk, __kmp_static,
2989   // __kmp_guided. __kmp_sched should keep original value, so that user can set
2990   // KMP_SCHEDULE multiple times, and thus have different run-time schedules in
2991   // different roots (even in OMP 2.5)
2992   enum sched_type s = SCHEDULE_WITHOUT_MODIFIERS(__kmp_sched);
2993   enum sched_type sched_modifiers = SCHEDULE_GET_MODIFIERS(__kmp_sched);
2994   if (s == kmp_sch_static) {
2995     // replace STATIC with more detailed schedule (balanced or greedy)
2996     r_sched.r_sched_type = __kmp_static;
2997   } else if (s == kmp_sch_guided_chunked) {
2998     // replace GUIDED with more detailed schedule (iterative or analytical)
2999     r_sched.r_sched_type = __kmp_guided;
3000   } else { // (STATIC_CHUNKED), or (DYNAMIC_CHUNKED), or other
3001     r_sched.r_sched_type = __kmp_sched;
3002   }
3003   SCHEDULE_SET_MODIFIERS(r_sched.r_sched_type, sched_modifiers);
3004 
3005   if (__kmp_chunk < KMP_DEFAULT_CHUNK) {
3006     // __kmp_chunk may be wrong here (if it was not ever set)
3007     r_sched.chunk = KMP_DEFAULT_CHUNK;
3008   } else {
3009     r_sched.chunk = __kmp_chunk;
3010   }
3011 
3012   return r_sched;
3013 }
3014 
3015 /* Allocate (realloc == FALSE) * or reallocate (realloc == TRUE)
3016    at least argc number of *t_argv entries for the requested team. */
3017 static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team, int realloc) {
3018 
3019   KMP_DEBUG_ASSERT(team);
3020   if (!realloc || argc > team->t.t_max_argc) {
3021 
3022     KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: needed entries=%d, "
3023                    "current entries=%d\n",
3024                    team->t.t_id, argc, (realloc) ? team->t.t_max_argc : 0));
3025     /* if previously allocated heap space for args, free them */
3026     if (realloc && team->t.t_argv != &team->t.t_inline_argv[0])
3027       __kmp_free((void *)team->t.t_argv);
3028 
3029     if (argc <= KMP_INLINE_ARGV_ENTRIES) {
3030       /* use unused space in the cache line for arguments */
3031       team->t.t_max_argc = KMP_INLINE_ARGV_ENTRIES;
3032       KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: inline allocate %d "
3033                      "argv entries\n",
3034                      team->t.t_id, team->t.t_max_argc));
3035       team->t.t_argv = &team->t.t_inline_argv[0];
3036       if (__kmp_storage_map) {
3037         __kmp_print_storage_map_gtid(
3038             -1, &team->t.t_inline_argv[0],
3039             &team->t.t_inline_argv[KMP_INLINE_ARGV_ENTRIES],
3040             (sizeof(void *) * KMP_INLINE_ARGV_ENTRIES), "team_%d.t_inline_argv",
3041             team->t.t_id);
3042       }
3043     } else {
3044       /* allocate space for arguments in the heap */
3045       team->t.t_max_argc = (argc <= (KMP_MIN_MALLOC_ARGV_ENTRIES >> 1))
3046                                ? KMP_MIN_MALLOC_ARGV_ENTRIES
3047                                : 2 * argc;
3048       KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: dynamic allocate %d "
3049                      "argv entries\n",
3050                      team->t.t_id, team->t.t_max_argc));
3051       team->t.t_argv =
3052           (void **)__kmp_page_allocate(sizeof(void *) * team->t.t_max_argc);
3053       if (__kmp_storage_map) {
3054         __kmp_print_storage_map_gtid(-1, &team->t.t_argv[0],
3055                                      &team->t.t_argv[team->t.t_max_argc],
3056                                      sizeof(void *) * team->t.t_max_argc,
3057                                      "team_%d.t_argv", team->t.t_id);
3058       }
3059     }
3060   }
3061 }
3062 
3063 static void __kmp_allocate_team_arrays(kmp_team_t *team, int max_nth) {
3064   int i;
3065   int num_disp_buff = max_nth > 1 ? __kmp_dispatch_num_buffers : 2;
3066   team->t.t_threads =
3067       (kmp_info_t **)__kmp_allocate(sizeof(kmp_info_t *) * max_nth);
3068   team->t.t_disp_buffer = (dispatch_shared_info_t *)__kmp_allocate(
3069       sizeof(dispatch_shared_info_t) * num_disp_buff);
3070   team->t.t_dispatch =
3071       (kmp_disp_t *)__kmp_allocate(sizeof(kmp_disp_t) * max_nth);
3072   team->t.t_implicit_task_taskdata =
3073       (kmp_taskdata_t *)__kmp_allocate(sizeof(kmp_taskdata_t) * max_nth);
3074   team->t.t_max_nproc = max_nth;
3075 
3076   /* setup dispatch buffers */
3077   for (i = 0; i < num_disp_buff; ++i) {
3078     team->t.t_disp_buffer[i].buffer_index = i;
3079     team->t.t_disp_buffer[i].doacross_buf_idx = i;
3080   }
3081 }
3082 
3083 static void __kmp_free_team_arrays(kmp_team_t *team) {
3084   /* Note: this does not free the threads in t_threads (__kmp_free_threads) */
3085   int i;
3086   for (i = 0; i < team->t.t_max_nproc; ++i) {
3087     if (team->t.t_dispatch[i].th_disp_buffer != NULL) {
3088       __kmp_free(team->t.t_dispatch[i].th_disp_buffer);
3089       team->t.t_dispatch[i].th_disp_buffer = NULL;
3090     }
3091   }
3092 #if KMP_USE_HIER_SCHED
3093   __kmp_dispatch_free_hierarchies(team);
3094 #endif
3095   __kmp_free(team->t.t_threads);
3096   __kmp_free(team->t.t_disp_buffer);
3097   __kmp_free(team->t.t_dispatch);
3098   __kmp_free(team->t.t_implicit_task_taskdata);
3099   team->t.t_threads = NULL;
3100   team->t.t_disp_buffer = NULL;
3101   team->t.t_dispatch = NULL;
3102   team->t.t_implicit_task_taskdata = 0;
3103 }
3104 
3105 static void __kmp_reallocate_team_arrays(kmp_team_t *team, int max_nth) {
3106   kmp_info_t **oldThreads = team->t.t_threads;
3107 
3108   __kmp_free(team->t.t_disp_buffer);
3109   __kmp_free(team->t.t_dispatch);
3110   __kmp_free(team->t.t_implicit_task_taskdata);
3111   __kmp_allocate_team_arrays(team, max_nth);
3112 
3113   KMP_MEMCPY(team->t.t_threads, oldThreads,
3114              team->t.t_nproc * sizeof(kmp_info_t *));
3115 
3116   __kmp_free(oldThreads);
3117 }
3118 
3119 static kmp_internal_control_t __kmp_get_global_icvs(void) {
3120 
3121   kmp_r_sched_t r_sched =
3122       __kmp_get_schedule_global(); // get current state of scheduling globals
3123 
3124   KMP_DEBUG_ASSERT(__kmp_nested_proc_bind.used > 0);
3125 
3126   kmp_internal_control_t g_icvs = {
3127     0, // int serial_nesting_level; //corresponds to value of th_team_serialized
3128     (kmp_int8)__kmp_global.g.g_dynamic, // internal control for dynamic
3129     // adjustment of threads (per thread)
3130     (kmp_int8)__kmp_env_blocktime, // int bt_set; //internal control for
3131     // whether blocktime is explicitly set
3132     __kmp_dflt_blocktime, // int blocktime; //internal control for blocktime
3133 #if KMP_USE_MONITOR
3134     __kmp_bt_intervals, // int bt_intervals; //internal control for blocktime
3135 // intervals
3136 #endif
3137     __kmp_dflt_team_nth, // int nproc; //internal control for # of threads for
3138     // next parallel region (per thread)
3139     // (use a max ub on value if __kmp_parallel_initialize not called yet)
3140     __kmp_cg_max_nth, // int thread_limit;
3141     __kmp_dflt_max_active_levels, // int max_active_levels; //internal control
3142     // for max_active_levels
3143     r_sched, // kmp_r_sched_t sched; //internal control for runtime schedule
3144     // {sched,chunk} pair
3145     __kmp_nested_proc_bind.bind_types[0],
3146     __kmp_default_device,
3147     NULL // struct kmp_internal_control *next;
3148   };
3149 
3150   return g_icvs;
3151 }
3152 
3153 static kmp_internal_control_t __kmp_get_x_global_icvs(const kmp_team_t *team) {
3154 
3155   kmp_internal_control_t gx_icvs;
3156   gx_icvs.serial_nesting_level =
3157       0; // probably =team->t.t_serial like in save_inter_controls
3158   copy_icvs(&gx_icvs, &team->t.t_threads[0]->th.th_current_task->td_icvs);
3159   gx_icvs.next = NULL;
3160 
3161   return gx_icvs;
3162 }
3163 
3164 static void __kmp_initialize_root(kmp_root_t *root) {
3165   int f;
3166   kmp_team_t *root_team;
3167   kmp_team_t *hot_team;
3168   int hot_team_max_nth;
3169   kmp_r_sched_t r_sched =
3170       __kmp_get_schedule_global(); // get current state of scheduling globals
3171   kmp_internal_control_t r_icvs = __kmp_get_global_icvs();
3172   KMP_DEBUG_ASSERT(root);
3173   KMP_ASSERT(!root->r.r_begin);
3174 
3175   /* setup the root state structure */
3176   __kmp_init_lock(&root->r.r_begin_lock);
3177   root->r.r_begin = FALSE;
3178   root->r.r_active = FALSE;
3179   root->r.r_in_parallel = 0;
3180   root->r.r_blocktime = __kmp_dflt_blocktime;
3181 
3182   /* setup the root team for this task */
3183   /* allocate the root team structure */
3184   KF_TRACE(10, ("__kmp_initialize_root: before root_team\n"));
3185 
3186   root_team =
3187       __kmp_allocate_team(root,
3188                           1, // new_nproc
3189                           1, // max_nproc
3190 #if OMPT_SUPPORT
3191                           ompt_data_none, // root parallel id
3192 #endif
3193                           __kmp_nested_proc_bind.bind_types[0], &r_icvs,
3194                           0 // argc
3195                           USE_NESTED_HOT_ARG(NULL) // master thread is unknown
3196                           );
3197 #if USE_DEBUGGER
3198   // Non-NULL value should be assigned to make the debugger display the root
3199   // team.
3200   TCW_SYNC_PTR(root_team->t.t_pkfn, (microtask_t)(~0));
3201 #endif
3202 
3203   KF_TRACE(10, ("__kmp_initialize_root: after root_team = %p\n", root_team));
3204 
3205   root->r.r_root_team = root_team;
3206   root_team->t.t_control_stack_top = NULL;
3207 
3208   /* initialize root team */
3209   root_team->t.t_threads[0] = NULL;
3210   root_team->t.t_nproc = 1;
3211   root_team->t.t_serialized = 1;
3212   // TODO???: root_team->t.t_max_active_levels = __kmp_dflt_max_active_levels;
3213   root_team->t.t_sched.sched = r_sched.sched;
3214   KA_TRACE(
3215       20,
3216       ("__kmp_initialize_root: init root team %d arrived: join=%u, plain=%u\n",
3217        root_team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
3218 
3219   /* setup the  hot team for this task */
3220   /* allocate the hot team structure */
3221   KF_TRACE(10, ("__kmp_initialize_root: before hot_team\n"));
3222 
3223   hot_team =
3224       __kmp_allocate_team(root,
3225                           1, // new_nproc
3226                           __kmp_dflt_team_nth_ub * 2, // max_nproc
3227 #if OMPT_SUPPORT
3228                           ompt_data_none, // root parallel id
3229 #endif
3230                           __kmp_nested_proc_bind.bind_types[0], &r_icvs,
3231                           0 // argc
3232                           USE_NESTED_HOT_ARG(NULL) // master thread is unknown
3233                           );
3234   KF_TRACE(10, ("__kmp_initialize_root: after hot_team = %p\n", hot_team));
3235 
3236   root->r.r_hot_team = hot_team;
3237   root_team->t.t_control_stack_top = NULL;
3238 
3239   /* first-time initialization */
3240   hot_team->t.t_parent = root_team;
3241 
3242   /* initialize hot team */
3243   hot_team_max_nth = hot_team->t.t_max_nproc;
3244   for (f = 0; f < hot_team_max_nth; ++f) {
3245     hot_team->t.t_threads[f] = NULL;
3246   }
3247   hot_team->t.t_nproc = 1;
3248   // TODO???: hot_team->t.t_max_active_levels = __kmp_dflt_max_active_levels;
3249   hot_team->t.t_sched.sched = r_sched.sched;
3250   hot_team->t.t_size_changed = 0;
3251 }
3252 
3253 #ifdef KMP_DEBUG
3254 
3255 typedef struct kmp_team_list_item {
3256   kmp_team_p const *entry;
3257   struct kmp_team_list_item *next;
3258 } kmp_team_list_item_t;
3259 typedef kmp_team_list_item_t *kmp_team_list_t;
3260 
3261 static void __kmp_print_structure_team_accum( // Add team to list of teams.
3262     kmp_team_list_t list, // List of teams.
3263     kmp_team_p const *team // Team to add.
3264     ) {
3265 
3266   // List must terminate with item where both entry and next are NULL.
3267   // Team is added to the list only once.
3268   // List is sorted in ascending order by team id.
3269   // Team id is *not* a key.
3270 
3271   kmp_team_list_t l;
3272 
3273   KMP_DEBUG_ASSERT(list != NULL);
3274   if (team == NULL) {
3275     return;
3276   }
3277 
3278   __kmp_print_structure_team_accum(list, team->t.t_parent);
3279   __kmp_print_structure_team_accum(list, team->t.t_next_pool);
3280 
3281   // Search list for the team.
3282   l = list;
3283   while (l->next != NULL && l->entry != team) {
3284     l = l->next;
3285   }
3286   if (l->next != NULL) {
3287     return; // Team has been added before, exit.
3288   }
3289 
3290   // Team is not found. Search list again for insertion point.
3291   l = list;
3292   while (l->next != NULL && l->entry->t.t_id <= team->t.t_id) {
3293     l = l->next;
3294   }
3295 
3296   // Insert team.
3297   {
3298     kmp_team_list_item_t *item = (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC(
3299         sizeof(kmp_team_list_item_t));
3300     *item = *l;
3301     l->entry = team;
3302     l->next = item;
3303   }
3304 }
3305 
3306 static void __kmp_print_structure_team(char const *title, kmp_team_p const *team
3307 
3308                                        ) {
3309   __kmp_printf("%s", title);
3310   if (team != NULL) {
3311     __kmp_printf("%2x %p\n", team->t.t_id, team);
3312   } else {
3313     __kmp_printf(" - (nil)\n");
3314   }
3315 }
3316 
3317 static void __kmp_print_structure_thread(char const *title,
3318                                          kmp_info_p const *thread) {
3319   __kmp_printf("%s", title);
3320   if (thread != NULL) {
3321     __kmp_printf("%2d %p\n", thread->th.th_info.ds.ds_gtid, thread);
3322   } else {
3323     __kmp_printf(" - (nil)\n");
3324   }
3325 }
3326 
3327 void __kmp_print_structure(void) {
3328 
3329   kmp_team_list_t list;
3330 
3331   // Initialize list of teams.
3332   list =
3333       (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC(sizeof(kmp_team_list_item_t));
3334   list->entry = NULL;
3335   list->next = NULL;
3336 
3337   __kmp_printf("\n------------------------------\nGlobal Thread "
3338                "Table\n------------------------------\n");
3339   {
3340     int gtid;
3341     for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3342       __kmp_printf("%2d", gtid);
3343       if (__kmp_threads != NULL) {
3344         __kmp_printf(" %p", __kmp_threads[gtid]);
3345       }
3346       if (__kmp_root != NULL) {
3347         __kmp_printf(" %p", __kmp_root[gtid]);
3348       }
3349       __kmp_printf("\n");
3350     }
3351   }
3352 
3353   // Print out __kmp_threads array.
3354   __kmp_printf("\n------------------------------\nThreads\n--------------------"
3355                "----------\n");
3356   if (__kmp_threads != NULL) {
3357     int gtid;
3358     for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3359       kmp_info_t const *thread = __kmp_threads[gtid];
3360       if (thread != NULL) {
3361         __kmp_printf("GTID %2d %p:\n", gtid, thread);
3362         __kmp_printf("    Our Root:        %p\n", thread->th.th_root);
3363         __kmp_print_structure_team("    Our Team:     ", thread->th.th_team);
3364         __kmp_print_structure_team("    Serial Team:  ",
3365                                    thread->th.th_serial_team);
3366         __kmp_printf("    Threads:      %2d\n", thread->th.th_team_nproc);
3367         __kmp_print_structure_thread("    Master:       ",
3368                                      thread->th.th_team_master);
3369         __kmp_printf("    Serialized?:  %2d\n", thread->th.th_team_serialized);
3370         __kmp_printf("    Set NProc:    %2d\n", thread->th.th_set_nproc);
3371         __kmp_printf("    Set Proc Bind: %2d\n", thread->th.th_set_proc_bind);
3372         __kmp_print_structure_thread("    Next in pool: ",
3373                                      thread->th.th_next_pool);
3374         __kmp_printf("\n");
3375         __kmp_print_structure_team_accum(list, thread->th.th_team);
3376         __kmp_print_structure_team_accum(list, thread->th.th_serial_team);
3377       }
3378     }
3379   } else {
3380     __kmp_printf("Threads array is not allocated.\n");
3381   }
3382 
3383   // Print out __kmp_root array.
3384   __kmp_printf("\n------------------------------\nUbers\n----------------------"
3385                "--------\n");
3386   if (__kmp_root != NULL) {
3387     int gtid;
3388     for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3389       kmp_root_t const *root = __kmp_root[gtid];
3390       if (root != NULL) {
3391         __kmp_printf("GTID %2d %p:\n", gtid, root);
3392         __kmp_print_structure_team("    Root Team:    ", root->r.r_root_team);
3393         __kmp_print_structure_team("    Hot Team:     ", root->r.r_hot_team);
3394         __kmp_print_structure_thread("    Uber Thread:  ",
3395                                      root->r.r_uber_thread);
3396         __kmp_printf("    Active?:      %2d\n", root->r.r_active);
3397         __kmp_printf("    In Parallel:  %2d\n",
3398                      KMP_ATOMIC_LD_RLX(&root->r.r_in_parallel));
3399         __kmp_printf("\n");
3400         __kmp_print_structure_team_accum(list, root->r.r_root_team);
3401         __kmp_print_structure_team_accum(list, root->r.r_hot_team);
3402       }
3403     }
3404   } else {
3405     __kmp_printf("Ubers array is not allocated.\n");
3406   }
3407 
3408   __kmp_printf("\n------------------------------\nTeams\n----------------------"
3409                "--------\n");
3410   while (list->next != NULL) {
3411     kmp_team_p const *team = list->entry;
3412     int i;
3413     __kmp_printf("Team %2x %p:\n", team->t.t_id, team);
3414     __kmp_print_structure_team("    Parent Team:      ", team->t.t_parent);
3415     __kmp_printf("    Master TID:       %2d\n", team->t.t_master_tid);
3416     __kmp_printf("    Max threads:      %2d\n", team->t.t_max_nproc);
3417     __kmp_printf("    Levels of serial: %2d\n", team->t.t_serialized);
3418     __kmp_printf("    Number threads:   %2d\n", team->t.t_nproc);
3419     for (i = 0; i < team->t.t_nproc; ++i) {
3420       __kmp_printf("    Thread %2d:      ", i);
3421       __kmp_print_structure_thread("", team->t.t_threads[i]);
3422     }
3423     __kmp_print_structure_team("    Next in pool:     ", team->t.t_next_pool);
3424     __kmp_printf("\n");
3425     list = list->next;
3426   }
3427 
3428   // Print out __kmp_thread_pool and __kmp_team_pool.
3429   __kmp_printf("\n------------------------------\nPools\n----------------------"
3430                "--------\n");
3431   __kmp_print_structure_thread("Thread pool:          ",
3432                                CCAST(kmp_info_t *, __kmp_thread_pool));
3433   __kmp_print_structure_team("Team pool:            ",
3434                              CCAST(kmp_team_t *, __kmp_team_pool));
3435   __kmp_printf("\n");
3436 
3437   // Free team list.
3438   while (list != NULL) {
3439     kmp_team_list_item_t *item = list;
3440     list = list->next;
3441     KMP_INTERNAL_FREE(item);
3442   }
3443 }
3444 
3445 #endif
3446 
//---------------------------------------------------------------------------
//  Support data for the per-thread fast random number generator.
//  Table of primes: __kmp_init_random picks one entry (indexed by thread id
//  modulo the table length) as the thread's multiplier.
static const unsigned __kmp_primes[] = {
    0x9e3779b1, 0xffe6cc59, 0x2109f6dd, 0x43977ab5,
    0xba5703f5, 0xb495a877, 0xe1626741, 0x79695e6b,
    0xbc98c09f, 0xd5bee2b3, 0x287488f9, 0x3af18231,
    0x9677cd4d, 0xbe3a6929, 0xadc6a877, 0xdcf0674b,
    0xbe4d6fe9, 0x5f15e201, 0x99afc3fd, 0xf3f16801,
    0xe222cfff, 0x24ba5fdb, 0x0620452d, 0x79f149e3,
    0xc8b93f49, 0x972702cd, 0xb07dd827, 0x6c97d5ed,
    0x085a3d61, 0x46eb5ea7, 0x3d9910ed, 0x2e687b5b,
    0x29609227, 0x6eb081f1, 0x0954c4e1, 0x9d114db9,
    0x542acfa9, 0xb3e6bd7b, 0x0742d917, 0xe9f3ffa7,
    0x54581edb, 0xf2480f45, 0x0bb9288f, 0xef1affc7,
    0x85fa0ca7, 0x3ccc14db, 0xe6baf34b, 0x343377f7,
    0x5ca19031, 0xe6d9293b, 0xf0a9f391, 0x5d2e980b,
    0xfc411073, 0xc3749363, 0xb892d829, 0x3549366b,
    0x629750ad, 0xb98294e5, 0x892d9483, 0xc235baf3,
    0x3d2402a3, 0x6bdef3c9, 0xbec333cd, 0x40c9520f};
3462 
3463 //---------------------------------------------------------------------------
3464 //  __kmp_get_random: Get a random number using a linear congruential method.
3465 unsigned short __kmp_get_random(kmp_info_t *thread) {
3466   unsigned x = thread->th.th_x;
3467   unsigned short r = (unsigned short)(x >> 16);
3468 
3469   thread->th.th_x = x * thread->th.th_a + 1;
3470 
3471   KA_TRACE(30, ("__kmp_get_random: THREAD: %d, RETURN: %u\n",
3472                 thread->th.th_info.ds.ds_tid, r));
3473 
3474   return r;
3475 }
3476 //--------------------------------------------------------
3477 // __kmp_init_random: Initialize a random number generator
3478 void __kmp_init_random(kmp_info_t *thread) {
3479   unsigned seed = thread->th.th_info.ds.ds_tid;
3480 
3481   thread->th.th_a =
3482       __kmp_primes[seed % (sizeof(__kmp_primes) / sizeof(__kmp_primes[0]))];
3483   thread->th.th_x = (seed + 1) * thread->th.th_a + 1;
3484   KA_TRACE(30,
3485            ("__kmp_init_random: THREAD: %u; A: %u\n", seed, thread->th.th_a));
3486 }
3487 
3488 #if KMP_OS_WINDOWS
3489 /* reclaim array entries for root threads that are already dead, returns number
3490  * reclaimed */
3491 static int __kmp_reclaim_dead_roots(void) {
3492   int i, r = 0;
3493 
3494   for (i = 0; i < __kmp_threads_capacity; ++i) {
3495     if (KMP_UBER_GTID(i) &&
3496         !__kmp_still_running((kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[i])) &&
3497         !__kmp_root[i]
3498              ->r.r_active) { // AC: reclaim only roots died in non-active state
3499       r += __kmp_unregister_root_other_thread(i);
3500     }
3501   }
3502   return r;
3503 }
3504 #endif
3505 
3506 /* This function attempts to create free entries in __kmp_threads and
3507    __kmp_root, and returns the number of free entries generated.
3508 
3509    For Windows* OS static library, the first mechanism used is to reclaim array
3510    entries for root threads that are already dead.
3511 
3512    On all platforms, expansion is attempted on the arrays __kmp_threads_ and
3513    __kmp_root, with appropriate update to __kmp_threads_capacity. Array
3514    capacity is increased by doubling with clipping to __kmp_tp_capacity, if
3515    threadprivate cache array has been created. Synchronization with
3516    __kmpc_threadprivate_cached is done using __kmp_tp_cached_lock.
3517 
3518    After any dead root reclamation, if the clipping value allows array expansion
3519    to result in the generation of a total of nNeed free slots, the function does
3520    that expansion. If not, nothing is done beyond the possible initial root
3521    thread reclamation.
3522 
3523    If any argument is negative, the behavior is undefined. */
3524 static int __kmp_expand_threads(int nNeed) {
3525   int added = 0;
3526   int minimumRequiredCapacity;
3527   int newCapacity;
3528   kmp_info_t **newThreads;
3529   kmp_root_t **newRoot;
3530 
3531 // All calls to __kmp_expand_threads should be under __kmp_forkjoin_lock, so
3532 // resizing __kmp_threads does not need additional protection if foreign
3533 // threads are present
3534 
3535 #if KMP_OS_WINDOWS && !KMP_DYNAMIC_LIB
3536   /* only for Windows static library */
3537   /* reclaim array entries for root threads that are already dead */
3538   added = __kmp_reclaim_dead_roots();
3539 
3540   if (nNeed) {
3541     nNeed -= added;
3542     if (nNeed < 0)
3543       nNeed = 0;
3544   }
3545 #endif
3546   if (nNeed <= 0)
3547     return added;
3548 
3549   // Note that __kmp_threads_capacity is not bounded by __kmp_max_nth. If
3550   // __kmp_max_nth is set to some value less than __kmp_sys_max_nth by the
3551   // user via KMP_DEVICE_THREAD_LIMIT, then __kmp_threads_capacity may become
3552   // > __kmp_max_nth in one of two ways:
3553   //
3554   // 1) The initialization thread (gtid = 0) exits.  __kmp_threads[0]
3555   //    may not be reused by another thread, so we may need to increase
3556   //    __kmp_threads_capacity to __kmp_max_nth + 1.
3557   //
3558   // 2) New foreign root(s) are encountered.  We always register new foreign
3559   //    roots. This may cause a smaller # of threads to be allocated at
3560   //    subsequent parallel regions, but the worker threads hang around (and
3561   //    eventually go to sleep) and need slots in the __kmp_threads[] array.
3562   //
3563   // Anyway, that is the reason for moving the check to see if
3564   // __kmp_max_nth was exceeded into __kmp_reserve_threads()
3565   // instead of having it performed here. -BB
3566 
3567   KMP_DEBUG_ASSERT(__kmp_sys_max_nth >= __kmp_threads_capacity);
3568 
3569   /* compute expansion headroom to check if we can expand */
3570   if (__kmp_sys_max_nth - __kmp_threads_capacity < nNeed) {
3571     /* possible expansion too small -- give up */
3572     return added;
3573   }
3574   minimumRequiredCapacity = __kmp_threads_capacity + nNeed;
3575 
3576   newCapacity = __kmp_threads_capacity;
3577   do {
3578     newCapacity = newCapacity <= (__kmp_sys_max_nth >> 1) ? (newCapacity << 1)
3579                                                           : __kmp_sys_max_nth;
3580   } while (newCapacity < minimumRequiredCapacity);
3581   newThreads = (kmp_info_t **)__kmp_allocate(
3582       (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * newCapacity + CACHE_LINE);
3583   newRoot =
3584       (kmp_root_t **)((char *)newThreads + sizeof(kmp_info_t *) * newCapacity);
3585   KMP_MEMCPY(newThreads, __kmp_threads,
3586              __kmp_threads_capacity * sizeof(kmp_info_t *));
3587   KMP_MEMCPY(newRoot, __kmp_root,
3588              __kmp_threads_capacity * sizeof(kmp_root_t *));
3589 
3590   kmp_info_t **temp_threads = __kmp_threads;
3591   *(kmp_info_t * *volatile *)&__kmp_threads = newThreads;
3592   *(kmp_root_t * *volatile *)&__kmp_root = newRoot;
3593   __kmp_free(temp_threads);
3594   added += newCapacity - __kmp_threads_capacity;
3595   *(volatile int *)&__kmp_threads_capacity = newCapacity;
3596 
3597   if (newCapacity > __kmp_tp_capacity) {
3598     __kmp_acquire_bootstrap_lock(&__kmp_tp_cached_lock);
3599     if (__kmp_tp_cached && newCapacity > __kmp_tp_capacity) {
3600       __kmp_threadprivate_resize_cache(newCapacity);
3601     } else { // increase __kmp_tp_capacity to correspond with kmp_threads size
3602       *(volatile int *)&__kmp_tp_capacity = newCapacity;
3603     }
3604     __kmp_release_bootstrap_lock(&__kmp_tp_cached_lock);
3605   }
3606 
3607   return added;
3608 }
3609 
3610 /* Register the current thread as a root thread and obtain our gtid. We must
3611    have the __kmp_initz_lock held at this point. Argument TRUE only if are the
3612    thread that calls from __kmp_do_serial_initialize() */
3613 int __kmp_register_root(int initial_thread) {
3614   kmp_info_t *root_thread;
3615   kmp_root_t *root;
3616   int gtid;
3617   int capacity;
3618   __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
3619   KA_TRACE(20, ("__kmp_register_root: entered\n"));
3620   KMP_MB();
3621 
3622   /* 2007-03-02:
3623      If initial thread did not invoke OpenMP RTL yet, and this thread is not an
3624      initial one, "__kmp_all_nth >= __kmp_threads_capacity" condition does not
3625      work as expected -- it may return false (that means there is at least one
3626      empty slot in __kmp_threads array), but it is possible the only free slot
3627      is #0, which is reserved for initial thread and so cannot be used for this
3628      one. Following code workarounds this bug.
3629 
3630      However, right solution seems to be not reserving slot #0 for initial
3631      thread because:
3632      (1) there is no magic in slot #0,
3633      (2) we cannot detect initial thread reliably (the first thread which does
3634         serial initialization may be not a real initial thread).
3635   */
3636   capacity = __kmp_threads_capacity;
3637   if (!initial_thread && TCR_PTR(__kmp_threads[0]) == NULL) {
3638     --capacity;
3639   }
3640 
3641   // If it is not for initializing the hidden helper team, we need to take
3642   // __kmp_hidden_helper_threads_num out of the capacity because it is included
3643   // in __kmp_threads_capacity.
3644   if (__kmp_enable_hidden_helper && !TCR_4(__kmp_init_hidden_helper_threads)) {
3645     capacity -= __kmp_hidden_helper_threads_num;
3646   }
3647 
3648   /* see if there are too many threads */
3649   if (__kmp_all_nth >= capacity && !__kmp_expand_threads(1)) {
3650     if (__kmp_tp_cached) {
3651       __kmp_fatal(KMP_MSG(CantRegisterNewThread),
3652                   KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity),
3653                   KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null);
3654     } else {
3655       __kmp_fatal(KMP_MSG(CantRegisterNewThread), KMP_HNT(SystemLimitOnThreads),
3656                   __kmp_msg_null);
3657     }
3658   }
3659 
3660   // When hidden helper task is enabled, __kmp_threads is organized as follows:
3661   // 0: initial thread, also a regular OpenMP thread.
3662   // [1, __kmp_hidden_helper_threads_num]: slots for hidden helper threads.
3663   // [__kmp_hidden_helper_threads_num + 1, __kmp_threads_capacity): slots for
3664   // regular OpenMP threads.
3665   if (TCR_4(__kmp_init_hidden_helper_threads)) {
3666     // Find an available thread slot for hidden helper thread. Slots for hidden
3667     // helper threads start from 1 to __kmp_hidden_helper_threads_num.
3668     for (gtid = 1; TCR_PTR(__kmp_threads[gtid]) != NULL &&
3669                    gtid <= __kmp_hidden_helper_threads_num;
3670          gtid++)
3671       ;
3672     KMP_ASSERT(gtid <= __kmp_hidden_helper_threads_num);
3673     KA_TRACE(1, ("__kmp_register_root: found slot in threads array for "
3674                  "hidden helper thread: T#%d\n",
3675                  gtid));
3676   } else {
3677     /* find an available thread slot */
3678     // Don't reassign the zero slot since we need that to only be used by
3679     // initial thread. Slots for hidden helper threads should also be skipped.
3680     if (initial_thread && TCR_PTR(__kmp_threads[0]) == NULL) {
3681       gtid = 0;
3682     } else {
3683       for (gtid = __kmp_hidden_helper_threads_num + 1;
3684            TCR_PTR(__kmp_threads[gtid]) != NULL; gtid++)
3685         ;
3686     }
3687     KA_TRACE(
3688         1, ("__kmp_register_root: found slot in threads array: T#%d\n", gtid));
3689     KMP_ASSERT(gtid < __kmp_threads_capacity);
3690   }
3691 
3692   /* update global accounting */
3693   __kmp_all_nth++;
3694   TCW_4(__kmp_nth, __kmp_nth + 1);
3695 
3696   // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search) for low
3697   // numbers of procs, and method #2 (keyed API call) for higher numbers.
3698   if (__kmp_adjust_gtid_mode) {
3699     if (__kmp_all_nth >= __kmp_tls_gtid_min) {
3700       if (TCR_4(__kmp_gtid_mode) != 2) {
3701         TCW_4(__kmp_gtid_mode, 2);
3702       }
3703     } else {
3704       if (TCR_4(__kmp_gtid_mode) != 1) {
3705         TCW_4(__kmp_gtid_mode, 1);
3706       }
3707     }
3708   }
3709 
3710 #ifdef KMP_ADJUST_BLOCKTIME
3711   /* Adjust blocktime to zero if necessary            */
3712   /* Middle initialization might not have occurred yet */
3713   if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
3714     if (__kmp_nth > __kmp_avail_proc) {
3715       __kmp_zero_bt = TRUE;
3716     }
3717   }
3718 #endif /* KMP_ADJUST_BLOCKTIME */
3719 
3720   /* setup this new hierarchy */
3721   if (!(root = __kmp_root[gtid])) {
3722     root = __kmp_root[gtid] = (kmp_root_t *)__kmp_allocate(sizeof(kmp_root_t));
3723     KMP_DEBUG_ASSERT(!root->r.r_root_team);
3724   }
3725 
3726 #if KMP_STATS_ENABLED
3727   // Initialize stats as soon as possible (right after gtid assignment).
3728   __kmp_stats_thread_ptr = __kmp_stats_list->push_back(gtid);
3729   __kmp_stats_thread_ptr->startLife();
3730   KMP_SET_THREAD_STATE(SERIAL_REGION);
3731   KMP_INIT_PARTITIONED_TIMERS(OMP_serial);
3732 #endif
3733   __kmp_initialize_root(root);
3734 
3735   /* setup new root thread structure */
3736   if (root->r.r_uber_thread) {
3737     root_thread = root->r.r_uber_thread;
3738   } else {
3739     root_thread = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t));
3740     if (__kmp_storage_map) {
3741       __kmp_print_thread_storage_map(root_thread, gtid);
3742     }
3743     root_thread->th.th_info.ds.ds_gtid = gtid;
3744 #if OMPT_SUPPORT
3745     root_thread->th.ompt_thread_info.thread_data = ompt_data_none;
3746 #endif
3747     root_thread->th.th_root = root;
3748     if (__kmp_env_consistency_check) {
3749       root_thread->th.th_cons = __kmp_allocate_cons_stack(gtid);
3750     }
3751 #if USE_FAST_MEMORY
3752     __kmp_initialize_fast_memory(root_thread);
3753 #endif /* USE_FAST_MEMORY */
3754 
3755 #if KMP_USE_BGET
3756     KMP_DEBUG_ASSERT(root_thread->th.th_local.bget_data == NULL);
3757     __kmp_initialize_bget(root_thread);
3758 #endif
3759     __kmp_init_random(root_thread); // Initialize random number generator
3760   }
3761 
3762   /* setup the serial team held in reserve by the root thread */
3763   if (!root_thread->th.th_serial_team) {
3764     kmp_internal_control_t r_icvs = __kmp_get_global_icvs();
3765     KF_TRACE(10, ("__kmp_register_root: before serial_team\n"));
3766     root_thread->th.th_serial_team = __kmp_allocate_team(
3767         root, 1, 1,
3768 #if OMPT_SUPPORT
3769         ompt_data_none, // root parallel id
3770 #endif
3771         proc_bind_default, &r_icvs, 0 USE_NESTED_HOT_ARG(NULL));
3772   }
3773   KMP_ASSERT(root_thread->th.th_serial_team);
3774   KF_TRACE(10, ("__kmp_register_root: after serial_team = %p\n",
3775                 root_thread->th.th_serial_team));
3776 
3777   /* drop root_thread into place */
3778   TCW_SYNC_PTR(__kmp_threads[gtid], root_thread);
3779 
3780   root->r.r_root_team->t.t_threads[0] = root_thread;
3781   root->r.r_hot_team->t.t_threads[0] = root_thread;
3782   root_thread->th.th_serial_team->t.t_threads[0] = root_thread;
3783   // AC: the team created in reserve, not for execution (it is unused for now).
3784   root_thread->th.th_serial_team->t.t_serialized = 0;
3785   root->r.r_uber_thread = root_thread;
3786 
3787   /* initialize the thread, get it ready to go */
3788   __kmp_initialize_info(root_thread, root->r.r_root_team, 0, gtid);
3789   TCW_4(__kmp_init_gtid, TRUE);
3790 
3791   /* prepare the master thread for get_gtid() */
3792   __kmp_gtid_set_specific(gtid);
3793 
3794 #if USE_ITT_BUILD
3795   __kmp_itt_thread_name(gtid);
3796 #endif /* USE_ITT_BUILD */
3797 
3798 #ifdef KMP_TDATA_GTID
3799   __kmp_gtid = gtid;
3800 #endif
3801   __kmp_create_worker(gtid, root_thread, __kmp_stksize);
3802   KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() == gtid);
3803 
3804   KA_TRACE(20, ("__kmp_register_root: T#%d init T#%d(%d:%d) arrived: join=%u, "
3805                 "plain=%u\n",
3806                 gtid, __kmp_gtid_from_tid(0, root->r.r_hot_team),
3807                 root->r.r_hot_team->t.t_id, 0, KMP_INIT_BARRIER_STATE,
3808                 KMP_INIT_BARRIER_STATE));
3809   { // Initialize barrier data.
3810     int b;
3811     for (b = 0; b < bs_last_barrier; ++b) {
3812       root_thread->th.th_bar[b].bb.b_arrived = KMP_INIT_BARRIER_STATE;
3813 #if USE_DEBUGGER
3814       root_thread->th.th_bar[b].bb.b_worker_arrived = 0;
3815 #endif
3816     }
3817   }
3818   KMP_DEBUG_ASSERT(root->r.r_hot_team->t.t_bar[bs_forkjoin_barrier].b_arrived ==
3819                    KMP_INIT_BARRIER_STATE);
3820 
3821 #if KMP_AFFINITY_SUPPORTED
3822   root_thread->th.th_current_place = KMP_PLACE_UNDEFINED;
3823   root_thread->th.th_new_place = KMP_PLACE_UNDEFINED;
3824   root_thread->th.th_first_place = KMP_PLACE_UNDEFINED;
3825   root_thread->th.th_last_place = KMP_PLACE_UNDEFINED;
3826   if (TCR_4(__kmp_init_middle)) {
3827     __kmp_affinity_set_init_mask(gtid, TRUE);
3828   }
3829 #endif /* KMP_AFFINITY_SUPPORTED */
3830   root_thread->th.th_def_allocator = __kmp_def_allocator;
3831   root_thread->th.th_prev_level = 0;
3832   root_thread->th.th_prev_num_threads = 1;
3833 
3834   kmp_cg_root_t *tmp = (kmp_cg_root_t *)__kmp_allocate(sizeof(kmp_cg_root_t));
3835   tmp->cg_root = root_thread;
3836   tmp->cg_thread_limit = __kmp_cg_max_nth;
3837   tmp->cg_nthreads = 1;
3838   KA_TRACE(100, ("__kmp_register_root: Thread %p created node %p with"
3839                  " cg_nthreads init to 1\n",
3840                  root_thread, tmp));
3841   tmp->up = NULL;
3842   root_thread->th.th_cg_roots = tmp;
3843 
3844   __kmp_root_counter++;
3845 
3846 #if OMPT_SUPPORT
3847   if (!initial_thread && ompt_enabled.enabled) {
3848 
3849     kmp_info_t *root_thread = ompt_get_thread();
3850 
3851     ompt_set_thread_state(root_thread, ompt_state_overhead);
3852 
3853     if (ompt_enabled.ompt_callback_thread_begin) {
3854       ompt_callbacks.ompt_callback(ompt_callback_thread_begin)(
3855           ompt_thread_initial, __ompt_get_thread_data_internal());
3856     }
3857     ompt_data_t *task_data;
3858     ompt_data_t *parallel_data;
3859     __ompt_get_task_info_internal(0, NULL, &task_data, NULL, &parallel_data, NULL);
3860     if (ompt_enabled.ompt_callback_implicit_task) {
3861       ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
3862           ompt_scope_begin, parallel_data, task_data, 1, 1, ompt_task_initial);
3863     }
3864 
3865     ompt_set_thread_state(root_thread, ompt_state_work_serial);
3866   }
3867 #endif
3868 
3869   KMP_MB();
3870   __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
3871 
3872   return gtid;
3873 }
3874 
3875 #if KMP_NESTED_HOT_TEAMS
// Frees the hot team that `thr` holds at nesting `level` and, recursively, the
// hot teams held at deeper levels by the threads of that team.
// Returns 0 if `thr` has no hot-team array or no hot team at `level`;
// otherwise the team's hot_team_nth - 1 (its master is not counted) plus the
// counts returned by the recursive calls.
3876 static int __kmp_free_hot_teams(kmp_root_t *root, kmp_info_t *thr, int level,
3877                                 const int max_level) {
3878   int i, n, nth;
3879   kmp_hot_team_ptr_t *hot_teams = thr->th.th_hot_teams;
3880   if (!hot_teams || !hot_teams[level].hot_team) {
3881     return 0;
3882   }
       // NOTE(review): this check runs after hot_teams[level] was already read
       // above -- confirm whether it should come first.
3883   KMP_DEBUG_ASSERT(level < max_level);
3884   kmp_team_t *team = hot_teams[level].hot_team;
3885   nth = hot_teams[level].hot_team_nth;
3886   n = nth - 1; // master is not freed
3887   if (level < max_level - 1) {
         // Descend into every thread of this team (including index 0); the
         // hot-team array itself is released here only for indices > 0.
3888     for (i = 0; i < nth; ++i) {
3889       kmp_info_t *th = team->t.t_threads[i];
3890       n += __kmp_free_hot_teams(root, th, level + 1, max_level);
3891       if (i > 0 && th->th.th_hot_teams) {
3892         __kmp_free(th->th.th_hot_teams);
3893         th->th.th_hot_teams = NULL;
3894       }
3895     }
3896   }
3897   __kmp_free_team(root, team, NULL);
3898   return n;
3899 }
3900 #endif
3901 
3902 // Resets a root thread and clears its root and hot teams.
3903 // Returns the number of __kmp_threads entries directly and indirectly freed.
// The returned count starts at the hot team's t_nproc and, when nested hot
// teams are enabled, adds whatever __kmp_free_hot_teams() reports.
// `gtid` is not referenced in this body. The function asserts (debug builds)
// that the root is not active, and ends by reaping the root's uber thread and
// clearing r_uber_thread / r_begin.
3904 static int __kmp_reset_root(int gtid, kmp_root_t *root) {
3905   kmp_team_t *root_team = root->r.r_root_team;
3906   kmp_team_t *hot_team = root->r.r_hot_team;
3907   int n = hot_team->t.t_nproc;
3908   int i;
3909 
3910   KMP_DEBUG_ASSERT(!root->r.r_active);
3911 
3912   root->r.r_root_team = NULL;
3913   root->r.r_hot_team = NULL;
3914   // __kmp_free_team() does not free hot teams, so we have to clear r_hot_team
3915   // before call to __kmp_free_team().
3916   __kmp_free_team(root, root_team USE_NESTED_HOT_ARG(NULL));
3917 #if KMP_NESTED_HOT_TEAMS
3918   if (__kmp_hot_teams_max_level >
3919       0) { // need to free nested hot teams and their threads if any
3920     for (i = 0; i < hot_team->t.t_nproc; ++i) {
3921       kmp_info_t *th = hot_team->t.t_threads[i];
3922       if (__kmp_hot_teams_max_level > 1) {
3923         n += __kmp_free_hot_teams(root, th, 1, __kmp_hot_teams_max_level);
3924       }
3925       if (th->th.th_hot_teams) {
3926         __kmp_free(th->th.th_hot_teams);
3927         th->th.th_hot_teams = NULL;
3928       }
3929     }
3930   }
3931 #endif
3932   __kmp_free_team(root, hot_team USE_NESTED_HOT_ARG(NULL));
3933 
3934   // Before we can reap the thread, we need to make certain that all other
3935   // threads in the teams that had this root as ancestor have stopped trying to
3936   // steal tasks.
3937   if (__kmp_tasking_mode != tskm_immediate_exec) {
3938     __kmp_wait_to_unref_task_teams();
3939   }
3940 
3941 #if KMP_OS_WINDOWS
3942   /* Close Handle of root duplicated in __kmp_create_worker (tr #62919) */
3943   KA_TRACE(
3944       10, ("__kmp_reset_root: free handle, th = %p, handle = %" KMP_UINTPTR_SPEC
3945            "\n",
3946            (LPVOID) & (root->r.r_uber_thread->th),
3947            root->r.r_uber_thread->th.th_info.ds.ds_thread));
3948   __kmp_free_handle(root->r.r_uber_thread->th.th_info.ds.ds_thread);
3949 #endif /* KMP_OS_WINDOWS */
3950 
3951 #if OMPT_SUPPORT
       // Report the end of the initial implicit task, then the end of the
       // thread, to any registered OMPT callbacks.
3952   ompt_data_t *task_data;
3953   ompt_data_t *parallel_data;
3954   __ompt_get_task_info_internal(0, NULL, &task_data, NULL, &parallel_data, NULL);
3955   if (ompt_enabled.ompt_callback_implicit_task) {
3956     ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
3957         ompt_scope_end, parallel_data, task_data, 0, 1, ompt_task_initial);
3958   }
3959   if (ompt_enabled.ompt_callback_thread_end) {
3960     ompt_callbacks.ompt_callback(ompt_callback_thread_end)(
3961         &(root->r.r_uber_thread->th.ompt_thread_info.thread_data));
3962   }
3963 #endif
3964 
3965   TCW_4(__kmp_nth,
3966         __kmp_nth - 1); // __kmp_reap_thread will decrement __kmp_all_nth.
       // Post-decrement: `i` receives the count *before* this thread left the
       // contention group, so i == 1 means it was the last member.
3967   i = root->r.r_uber_thread->th.th_cg_roots->cg_nthreads--;
3968   KA_TRACE(100, ("__kmp_reset_root: Thread %p decrement cg_nthreads on node %p"
3969                  " to %d\n",
3970                  root->r.r_uber_thread, root->r.r_uber_thread->th.th_cg_roots,
3971                  root->r.r_uber_thread->th.th_cg_roots->cg_nthreads));
3972   if (i == 1) {
3973     // need to free contention group structure
3974     KMP_DEBUG_ASSERT(root->r.r_uber_thread ==
3975                      root->r.r_uber_thread->th.th_cg_roots->cg_root);
3976     KMP_DEBUG_ASSERT(root->r.r_uber_thread->th.th_cg_roots->up == NULL);
3977     __kmp_free(root->r.r_uber_thread->th.th_cg_roots);
3978     root->r.r_uber_thread->th.th_cg_roots = NULL;
3979   }
3980   __kmp_reap_thread(root->r.r_uber_thread, 1);
3981 
3982   // We cannot put root thread to __kmp_thread_pool, so we have to reap it
3983   // instead of freeing.
3984   root->r.r_uber_thread = NULL;
3985   /* mark root as no longer in use */
3986   root->r.r_begin = FALSE;
3987 
3988   return n;
3989 }
3990 
// Unregisters the root identified by `gtid` on behalf of the calling thread.
// Takes __kmp_forkjoin_lock for the whole operation and releases it on both
// exit paths. Returns without doing anything when the library is already
// marked done (g_done) or serial initialization has not happened. Otherwise it
// waits for outstanding proxy tasks (if the thread's task team recorded any)
// and then calls __kmp_reset_root(); that call's return value is discarded.
3991 void __kmp_unregister_root_current_thread(int gtid) {
3992   KA_TRACE(1, ("__kmp_unregister_root_current_thread: enter T#%d\n", gtid));
3993   /* this lock should be ok, since unregister_root_current_thread is never
3994      called during an abort, only during a normal close. furthermore, if you
3995      have the forkjoin lock, you should never try to get the initz lock */
3996   __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
3997   if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
3998     KC_TRACE(10, ("__kmp_unregister_root_current_thread: already finished, "
3999                   "exiting T#%d\n",
4000                   gtid));
4001     __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
4002     return;
4003   }
4004   kmp_root_t *root = __kmp_root[gtid];
4005 
       // Sanity checks: gtid must name a live uber (root) thread whose root
       // is the one looked up above and is not currently active.
4006   KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
4007   KMP_ASSERT(KMP_UBER_GTID(gtid));
4008   KMP_ASSERT(root == __kmp_threads[gtid]->th.th_root);
4009   KMP_ASSERT(root->r.r_active == FALSE);
4010 
4011   KMP_MB();
4012 
4013   kmp_info_t *thread = __kmp_threads[gtid];
4014   kmp_team_t *team = thread->th.th_team;
4015   kmp_task_team_t *task_team = thread->th.th_task_team;
4016 
4017   // we need to wait for the proxy tasks before finishing the thread
4018   if (task_team != NULL && task_team->tt.tt_found_proxy_tasks) {
4019 #if OMPT_SUPPORT
4020     // the runtime is shutting down so we won't report any events
4021     thread->th.ompt_thread_info.state = ompt_state_undefined;
4022 #endif
4023     __kmp_task_team_wait(thread, team USE_ITT_BUILD_ARG(NULL));
4024   }
4025 
4026   __kmp_reset_root(gtid, root);
4027 
4028   KMP_MB();
4029   KC_TRACE(10,
4030            ("__kmp_unregister_root_current_thread: T#%d unregistered\n", gtid));
4031 
4032   __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
4033 }
4034 
4035 #if KMP_OS_WINDOWS
4036 /* __kmp_forkjoin_lock must be already held
4037    Unregisters a root thread that is not the current thread.  Returns the number
4038    of __kmp_threads entries freed as a result. */
// Unlike __kmp_unregister_root_current_thread(), this variant takes no lock
// itself and does not wait on the task team; it validates the root and
// forwards the result of __kmp_reset_root().
4039 static int __kmp_unregister_root_other_thread(int gtid) {
4040   kmp_root_t *root = __kmp_root[gtid];
4041   int r;
4042 
4043   KA_TRACE(1, ("__kmp_unregister_root_other_thread: enter T#%d\n", gtid));
       // gtid must name a live uber (root) thread whose root is inactive.
4044   KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
4045   KMP_ASSERT(KMP_UBER_GTID(gtid));
4046   KMP_ASSERT(root == __kmp_threads[gtid]->th.th_root);
4047   KMP_ASSERT(root->r.r_active == FALSE);
4048 
4049   r = __kmp_reset_root(gtid, root);
4050   KC_TRACE(10,
4051            ("__kmp_unregister_root_other_thread: T#%d unregistered\n", gtid));
4052   return r;
4053 }
4054 #endif
4055 
4056 #if KMP_DEBUG
// Debug-build helper: prints, for the calling thread, its gtid and tid, its
// thread descriptor, current team, reserve serial team, current task, and the
// td_parent recorded in the team's implicit task slot for this tid.
4057 void __kmp_task_info() {
4058 
4059   kmp_int32 gtid = __kmp_entry_gtid();
4060   kmp_int32 tid = __kmp_tid_from_gtid(gtid);
4061   kmp_info_t *this_thr = __kmp_threads[gtid];
4062   kmp_team_t *steam = this_thr->th.th_serial_team;
4063   kmp_team_t *team = this_thr->th.th_team;
4064 
4065   __kmp_printf(
4066       "__kmp_task_info: gtid=%d tid=%d t_thread=%p team=%p steam=%p curtask=%p "
4067       "ptask=%p\n",
4068       gtid, tid, this_thr, team, steam, this_thr->th.th_current_task,
4069       team->t.t_implicit_task_taskdata[tid].td_parent);
4070 }
4071 #endif // KMP_DEBUG
4072 
4073 /* TODO optimize with one big memclr, take out what isn't needed, split
4074    responsibility to workers as much as possible, and delay initialization of
4075    features as much as possible  */
// Binds thread descriptor `this_thr` to `team` as team-local thread `tid`
// (global id `gtid`): records the team and cached team fields, initializes the
// implicit task, attaches the thread to the master's contention group, and
// lazily allocates the per-thread threadprivate table, dispatch buffers and
// task-state memo stack the first time they are found unset.
4076 static void __kmp_initialize_info(kmp_info_t *this_thr, kmp_team_t *team,
4077                                   int tid, int gtid) {
4078   /* this_thr->th.th_info.ds.ds_gtid is setup in
4079      kmp_allocate_thread/create_worker.
4080      this_thr->th.th_serial_team is setup in __kmp_allocate_thread */
       // NOTE(review): `team` is dereferenced here, before the
       // KMP_DEBUG_ASSERT(team) below -- confirm the ordering is intended.
4081   kmp_info_t *master = team->t.t_threads[0];
4082   KMP_DEBUG_ASSERT(this_thr != NULL);
4083   KMP_DEBUG_ASSERT(this_thr->th.th_serial_team);
4084   KMP_DEBUG_ASSERT(team);
4085   KMP_DEBUG_ASSERT(team->t.t_threads);
4086   KMP_DEBUG_ASSERT(team->t.t_dispatch);
4087   KMP_DEBUG_ASSERT(master);
4088   KMP_DEBUG_ASSERT(master->th.th_root);
4089 
4090   KMP_MB();
4091 
4092   TCW_SYNC_PTR(this_thr->th.th_team, team);
4093 
4094   this_thr->th.th_info.ds.ds_tid = tid;
4095   this_thr->th.th_set_nproc = 0;
4096   if (__kmp_tasking_mode != tskm_immediate_exec)
4097     // When tasking is possible, threads are not safe to reap until they are
4098     // done tasking; this will be set when tasking code is exited in wait
4099     this_thr->th.th_reap_state = KMP_NOT_SAFE_TO_REAP;
4100   else // no tasking --> always safe to reap
4101     this_thr->th.th_reap_state = KMP_SAFE_TO_REAP;
4102   this_thr->th.th_set_proc_bind = proc_bind_default;
4103 #if KMP_AFFINITY_SUPPORTED
4104   this_thr->th.th_new_place = this_thr->th.th_current_place;
4105 #endif
4106   this_thr->th.th_root = master->th.th_root;
4107 
4108   /* setup the thread's cache of the team structure */
4109   this_thr->th.th_team_nproc = team->t.t_nproc;
4110   this_thr->th.th_team_master = master;
4111   this_thr->th.th_team_serialized = team->t.t_serialized;
4112   TCW_PTR(this_thr->th.th_sleep_loc, NULL);
4113 
4114   KMP_DEBUG_ASSERT(team->t.t_implicit_task_taskdata);
4115 
4116   KF_TRACE(10, ("__kmp_initialize_info1: T#%d:%d this_thread=%p curtask=%p\n",
4117                 tid, gtid, this_thr, this_thr->th.th_current_task));
4118 
4119   __kmp_init_implicit_task(this_thr->th.th_team_master->th.th_ident, this_thr,
4120                            team, tid, TRUE);
4121 
4122   KF_TRACE(10, ("__kmp_initialize_info2: T#%d:%d this_thread=%p curtask=%p\n",
4123                 tid, gtid, this_thr, this_thr->th.th_current_task));
4124   // TODO: Initialize ICVs from parent; GEH - isn't that already done in
4125   // __kmp_initialize_team()?
4126 
4127   /* TODO no worksharing in speculative threads */
4128   this_thr->th.th_dispatch = &team->t.t_dispatch[tid];
4129 
4130   this_thr->th.th_local.this_construct = 0;
4131 
       // Allocate the threadprivate common table only on first use of this
       // descriptor; an existing table is kept as is.
4132   if (!this_thr->th.th_pri_common) {
4133     this_thr->th.th_pri_common =
4134         (struct common_table *)__kmp_allocate(sizeof(struct common_table));
4135     if (__kmp_storage_map) {
4136       __kmp_print_storage_map_gtid(
4137           gtid, this_thr->th.th_pri_common, this_thr->th.th_pri_common + 1,
4138           sizeof(struct common_table), "th_%d.th_pri_common\n", gtid);
4139     }
4140     this_thr->th.th_pri_head = NULL;
4141   }
4142 
4143   if (this_thr != master && // Master's CG root is initialized elsewhere
4144       this_thr->th.th_cg_roots != master->th.th_cg_roots) { // CG root not set
4145     // Make new thread's CG root same as master's
4146     KMP_DEBUG_ASSERT(master->th.th_cg_roots);
4147     kmp_cg_root_t *tmp = this_thr->th.th_cg_roots;
4148     if (tmp) {
4149       // worker changes CG, need to check if old CG should be freed
           // (post-decrement: `i` is the member count before this thread left)
4150       int i = tmp->cg_nthreads--;
4151       KA_TRACE(100, ("__kmp_initialize_info: Thread %p decrement cg_nthreads"
4152                      " on node %p of thread %p to %d\n",
4153                      this_thr, tmp, tmp->cg_root, tmp->cg_nthreads));
4154       if (i == 1) {
4155         __kmp_free(tmp); // last thread left CG --> free it
4156       }
4157     }
4158     this_thr->th.th_cg_roots = master->th.th_cg_roots;
4159     // Increment new thread's CG root's counter to add the new thread
4160     this_thr->th.th_cg_roots->cg_nthreads++;
4161     KA_TRACE(100, ("__kmp_initialize_info: Thread %p increment cg_nthreads on"
4162                    " node %p of thread %p to %d\n",
4163                    this_thr, this_thr->th.th_cg_roots,
4164                    this_thr->th.th_cg_roots->cg_root,
4165                    this_thr->th.th_cg_roots->cg_nthreads));
         // The thread's current task inherits the group's thread limit.
4166     this_thr->th.th_current_task->td_icvs.thread_limit =
4167         this_thr->th.th_cg_roots->cg_thread_limit;
4168   }
4169 
4170   /* Initialize dynamic dispatch */
4171   {
4172     volatile kmp_disp_t *dispatch = this_thr->th.th_dispatch;
4173     // Use team max_nproc since this will never change for the team.
         // One buffer for a team that can only ever hold a single thread,
         // otherwise __kmp_dispatch_num_buffers buffers.
4174     size_t disp_size =
4175         sizeof(dispatch_private_info_t) *
4176         (team->t.t_max_nproc == 1 ? 1 : __kmp_dispatch_num_buffers);
4177     KD_TRACE(10, ("__kmp_initialize_info: T#%d max_nproc: %d\n", gtid,
4178                   team->t.t_max_nproc));
4179     KMP_ASSERT(dispatch);
4180     KMP_DEBUG_ASSERT(team->t.t_dispatch);
4181     KMP_DEBUG_ASSERT(dispatch == &team->t.t_dispatch[tid]);
4182 
4183     dispatch->th_disp_index = 0;
4184     dispatch->th_doacross_buf_idx = 0;
         // First use: allocate the buffers. Reuse: clear the existing ones.
4185     if (!dispatch->th_disp_buffer) {
4186       dispatch->th_disp_buffer =
4187           (dispatch_private_info_t *)__kmp_allocate(disp_size);
4188 
           // NOTE(review): the label's "t_dispatch[%d]" is filled with gtid,
           // while t_dispatch is indexed by tid above -- confirm which is
           // intended.
4189       if (__kmp_storage_map) {
4190         __kmp_print_storage_map_gtid(
4191             gtid, &dispatch->th_disp_buffer[0],
4192             &dispatch->th_disp_buffer[team->t.t_max_nproc == 1
4193                                           ? 1
4194                                           : __kmp_dispatch_num_buffers],
4195             disp_size, "th_%d.th_dispatch.th_disp_buffer "
4196                        "(team_%d.t_dispatch[%d].th_disp_buffer)",
4197             gtid, team->t.t_id, gtid);
4198       }
4199     } else {
4200       memset(&dispatch->th_disp_buffer[0], '\0', disp_size);
4201     }
4202 
4203     dispatch->th_dispatch_pr_current = 0;
4204     dispatch->th_dispatch_sh_current = 0;
4205 
4206     dispatch->th_deo_fcn = 0; /* ORDERED     */
4207     dispatch->th_dxo_fcn = 0; /* END ORDERED */
4208   }
4209 
4210   this_thr->th.th_next_pool = NULL;
4211 
       // Lazily create a 4-entry, zero-filled task-state memo stack.
4212   if (!this_thr->th.th_task_state_memo_stack) {
4213     size_t i;
4214     this_thr->th.th_task_state_memo_stack =
4215         (kmp_uint8 *)__kmp_allocate(4 * sizeof(kmp_uint8));
4216     this_thr->th.th_task_state_top = 0;
4217     this_thr->th.th_task_state_stack_sz = 4;
4218     for (i = 0; i < this_thr->th.th_task_state_stack_sz;
4219          ++i) // zero init the stack
4220       this_thr->th.th_task_state_memo_stack[i] = 0;
4221   }
4222 
4223   KMP_DEBUG_ASSERT(!this_thr->th.th_spin_here);
4224   KMP_DEBUG_ASSERT(this_thr->th.th_next_waiting == 0);
4225 
4226   KMP_MB();
4227 }
4228 
4229 /* allocate a new thread for the requesting team. this is only called from
4230    within a forkjoin critical section. we will first try to get an available
4231    thread from the thread pool. if none is available, we will fork a new one
4232    assuming we are able to create a new one. this should be assured, as the
4233    caller should check on this first. */
// Returns the thread descriptor that will serve as team-local thread `new_tid`
// of `team`. Two paths:
//   1. Pool path: pop the head of __kmp_thread_pool, re-initialize it for the
//      team, bump __kmp_nth and return.
//   2. Fork path: pick the first free slot in __kmp_threads, allocate and
//      publish a new kmp_info_t, give it a reserve serial team, initialize it,
//      bump __kmp_all_nth/__kmp_nth and start the worker with
//      __kmp_create_worker().
4234 kmp_info_t *__kmp_allocate_thread(kmp_root_t *root, kmp_team_t *team,
4235                                   int new_tid) {
4236   kmp_team_t *serial_team;
4237   kmp_info_t *new_thr;
4238   int new_gtid;
4239 
4240   KA_TRACE(20, ("__kmp_allocate_thread: T#%d\n", __kmp_get_gtid()));
4241   KMP_DEBUG_ASSERT(root && team);
4242 #if !KMP_NESTED_HOT_TEAMS
4243   KMP_DEBUG_ASSERT(KMP_MASTER_GTID(__kmp_get_gtid()));
4244 #endif
4245   KMP_MB();
4246 
4247   /* first, try to get one from the thread pool */
4248   if (__kmp_thread_pool) {
         // Unlink the pool head; drop the insertion hint if it pointed at it.
4249     new_thr = CCAST(kmp_info_t *, __kmp_thread_pool);
4250     __kmp_thread_pool = (volatile kmp_info_t *)new_thr->th.th_next_pool;
4251     if (new_thr == __kmp_thread_pool_insert_pt) {
4252       __kmp_thread_pool_insert_pt = NULL;
4253     }
4254     TCW_4(new_thr->th.th_in_pool, FALSE);
         // Under the thread's suspend mutex: if it was counted as active in
         // the pool, remove it from __kmp_thread_pool_active_nth.
4255     __kmp_suspend_initialize_thread(new_thr);
4256     __kmp_lock_suspend_mx(new_thr);
4257     if (new_thr->th.th_active_in_pool == TRUE) {
4258       KMP_DEBUG_ASSERT(new_thr->th.th_active == TRUE);
4259       KMP_ATOMIC_DEC(&__kmp_thread_pool_active_nth);
4260       new_thr->th.th_active_in_pool = FALSE;
4261     }
4262     __kmp_unlock_suspend_mx(new_thr);
4263 
4264     KA_TRACE(20, ("__kmp_allocate_thread: T#%d using thread T#%d\n",
4265                   __kmp_get_gtid(), new_thr->th.th_info.ds.ds_gtid));
4266     KMP_ASSERT(!new_thr->th.th_team);
4267     KMP_DEBUG_ASSERT(__kmp_nth < __kmp_threads_capacity);
4268 
4269     /* setup the thread structure */
4270     __kmp_initialize_info(new_thr, team, new_tid,
4271                           new_thr->th.th_info.ds.ds_gtid);
4272     KMP_DEBUG_ASSERT(new_thr->th.th_serial_team);
4273 
4274     TCW_4(__kmp_nth, __kmp_nth + 1);
4275 
4276     new_thr->th.th_task_state = 0;
4277     new_thr->th.th_task_state_top = 0;
4278     new_thr->th.th_task_state_stack_sz = 4;
4279 
4280 #ifdef KMP_ADJUST_BLOCKTIME
4281     /* Adjust blocktime back to zero if necessary */
4282     /* Middle initialization might not have occurred yet */
4283     if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
4284       if (__kmp_nth > __kmp_avail_proc) {
4285         __kmp_zero_bt = TRUE;
4286       }
4287     }
4288 #endif /* KMP_ADJUST_BLOCKTIME */
4289 
4290 #if KMP_DEBUG
4291     // If thread entered pool via __kmp_free_thread, wait_flag should !=
4292     // KMP_BARRIER_PARENT_FLAG.
4293     int b;
4294     kmp_balign_t *balign = new_thr->th.th_bar;
4295     for (b = 0; b < bs_last_barrier; ++b)
4296       KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
4297 #endif
4298 
4299     KF_TRACE(10, ("__kmp_allocate_thread: T#%d using thread %p T#%d\n",
4300                   __kmp_get_gtid(), new_thr, new_thr->th.th_info.ds.ds_gtid));
4301 
4302     KMP_MB();
4303     return new_thr;
4304   }
4305 
4306   /* no, well fork a new one */
4307   KMP_ASSERT(__kmp_nth == __kmp_all_nth);
4308   KMP_ASSERT(__kmp_all_nth < __kmp_threads_capacity);
4309 
4310 #if KMP_USE_MONITOR
4311   // If this is the first worker thread the RTL is creating, then also
4312   // launch the monitor thread.  We try to do this as early as possible.
       // The flag is tested again after taking __kmp_monitor_lock so that
       // only one caller creates the monitor.
4313   if (!TCR_4(__kmp_init_monitor)) {
4314     __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
4315     if (!TCR_4(__kmp_init_monitor)) {
4316       KF_TRACE(10, ("before __kmp_create_monitor\n"));
4317       TCW_4(__kmp_init_monitor, 1);
4318       __kmp_create_monitor(&__kmp_monitor);
4319       KF_TRACE(10, ("after __kmp_create_monitor\n"));
4320 #if KMP_OS_WINDOWS
4321       // AC: wait until monitor has started. This is a fix for CQ232808.
4322       // The reason is that if the library is loaded/unloaded in a loop with
4323       // small (parallel) work in between, then there is high probability that
4324       // monitor thread started after the library shutdown. At shutdown it is
4325       // too late to cope with the problem, because when the master is in
4326       // DllMain (process detach) the monitor has no chances to start (it is
4327       // blocked), and master has no means to inform the monitor that the
4328       // library has gone, because all the memory which the monitor can access
4329       // is going to be released/reset.
4330       while (TCR_4(__kmp_init_monitor) < 2) {
4331         KMP_YIELD(TRUE);
4332       }
4333       KF_TRACE(10, ("after monitor thread has started\n"));
4334 #endif
4335     }
4336     __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
4337   }
4338 #endif
4339 
4340   KMP_MB();
4341 
4342   {
         // Choose where the search for a free gtid starts: slot 1 while
         // __kmp_init_hidden_helper_threads is set, otherwise the first slot
         // past the hidden-helper range.
4343     int new_start_gtid = TCR_4(__kmp_init_hidden_helper_threads)
4344                              ? 1
4345                              : __kmp_hidden_helper_threads_num + 1;
4346 
         // Linear scan for the first NULL entry in __kmp_threads.
4347     for (new_gtid = new_start_gtid; TCR_PTR(__kmp_threads[new_gtid]) != NULL;
4348          ++new_gtid) {
4349       KMP_DEBUG_ASSERT(new_gtid < __kmp_threads_capacity);
4350     }
4351 
4352     if (TCR_4(__kmp_init_hidden_helper_threads)) {
4353       KMP_DEBUG_ASSERT(new_gtid <= __kmp_hidden_helper_threads_num);
4354     }
4355   }
4356 
4357   /* allocate space for it. */
4358   new_thr = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t));
4359 
       // Publish the descriptor in the global table at the chosen gtid.
4360   TCW_SYNC_PTR(__kmp_threads[new_gtid], new_thr);
4361 
4362 #if USE_ITT_BUILD && USE_ITT_NOTIFY && KMP_DEBUG
4363   // suppress race conditions detection on synchronization flags in debug mode
4364   // this helps to analyze library internals eliminating false positives
4365   __itt_suppress_mark_range(
4366       __itt_suppress_range, __itt_suppress_threading_errors,
4367       &new_thr->th.th_sleep_loc, sizeof(new_thr->th.th_sleep_loc));
4368   __itt_suppress_mark_range(
4369       __itt_suppress_range, __itt_suppress_threading_errors,
4370       &new_thr->th.th_reap_state, sizeof(new_thr->th.th_reap_state));
4371 #if KMP_OS_WINDOWS
4372   __itt_suppress_mark_range(
4373       __itt_suppress_range, __itt_suppress_threading_errors,
4374       &new_thr->th.th_suspend_init, sizeof(new_thr->th.th_suspend_init));
4375 #else
4376   __itt_suppress_mark_range(__itt_suppress_range,
4377                             __itt_suppress_threading_errors,
4378                             &new_thr->th.th_suspend_init_count,
4379                             sizeof(new_thr->th.th_suspend_init_count));
4380 #endif
4381   // TODO: check if we need to also suppress b_arrived flags
4382   __itt_suppress_mark_range(__itt_suppress_range,
4383                             __itt_suppress_threading_errors,
4384                             CCAST(kmp_uint64 *, &new_thr->th.th_bar[0].bb.b_go),
4385                             sizeof(new_thr->th.th_bar[0].bb.b_go));
4386   __itt_suppress_mark_range(__itt_suppress_range,
4387                             __itt_suppress_threading_errors,
4388                             CCAST(kmp_uint64 *, &new_thr->th.th_bar[1].bb.b_go),
4389                             sizeof(new_thr->th.th_bar[1].bb.b_go));
4390   __itt_suppress_mark_range(__itt_suppress_range,
4391                             __itt_suppress_threading_errors,
4392                             CCAST(kmp_uint64 *, &new_thr->th.th_bar[2].bb.b_go),
4393                             sizeof(new_thr->th.th_bar[2].bb.b_go));
4394 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY && KMP_DEBUG */
4395   if (__kmp_storage_map) {
4396     __kmp_print_thread_storage_map(new_thr, new_gtid);
4397   }
4398 
4399   // add the reserve serialized team, initialized from the team's master thread
4400   {
4401     kmp_internal_control_t r_icvs = __kmp_get_x_global_icvs(team);
4402     KF_TRACE(10, ("__kmp_allocate_thread: before th_serial/serial_team\n"));
4403     new_thr->th.th_serial_team = serial_team =
4404         (kmp_team_t *)__kmp_allocate_team(root, 1, 1,
4405 #if OMPT_SUPPORT
4406                                           ompt_data_none, // root parallel id
4407 #endif
4408                                           proc_bind_default, &r_icvs,
4409                                           0 USE_NESTED_HOT_ARG(NULL));
4410   }
4411   KMP_ASSERT(serial_team);
4412   serial_team->t.t_serialized = 0; // AC: the team created in reserve, not for
4413   // execution (it is unused for now).
4414   serial_team->t.t_threads[0] = new_thr;
4415   KF_TRACE(10,
4416            ("__kmp_allocate_thread: after th_serial/serial_team : new_thr=%p\n",
4417             new_thr));
4418 
4419   /* setup the thread structures */
4420   __kmp_initialize_info(new_thr, team, new_tid, new_gtid);
4421 
4422 #if USE_FAST_MEMORY
4423   __kmp_initialize_fast_memory(new_thr);
4424 #endif /* USE_FAST_MEMORY */
4425 
4426 #if KMP_USE_BGET
4427   KMP_DEBUG_ASSERT(new_thr->th.th_local.bget_data == NULL);
4428   __kmp_initialize_bget(new_thr);
4429 #endif
4430 
4431   __kmp_init_random(new_thr); // Initialize random number generator
4432 
4433   /* Initialize these only once when thread is grabbed for a team allocation */
4434   KA_TRACE(20,
4435            ("__kmp_allocate_thread: T#%d init go fork=%u, plain=%u\n",
4436             __kmp_get_gtid(), KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
4437 
       // Put every barrier type's per-thread state into its initial,
       // not-waiting configuration.
4438   int b;
4439   kmp_balign_t *balign = new_thr->th.th_bar;
4440   for (b = 0; b < bs_last_barrier; ++b) {
4441     balign[b].bb.b_go = KMP_INIT_BARRIER_STATE;
4442     balign[b].bb.team = NULL;
4443     balign[b].bb.wait_flag = KMP_BARRIER_NOT_WAITING;
4444     balign[b].bb.use_oncore_barrier = 0;
4445   }
4446 
4447   new_thr->th.th_spin_here = FALSE;
4448   new_thr->th.th_next_waiting = 0;
4449 #if KMP_OS_UNIX
4450   new_thr->th.th_blocking = false;
4451 #endif
4452 
4453 #if KMP_AFFINITY_SUPPORTED
4454   new_thr->th.th_current_place = KMP_PLACE_UNDEFINED;
4455   new_thr->th.th_new_place = KMP_PLACE_UNDEFINED;
4456   new_thr->th.th_first_place = KMP_PLACE_UNDEFINED;
4457   new_thr->th.th_last_place = KMP_PLACE_UNDEFINED;
4458 #endif
4459   new_thr->th.th_def_allocator = __kmp_def_allocator;
4460   new_thr->th.th_prev_level = 0;
4461   new_thr->th.th_prev_num_threads = 1;
4462 
4463   TCW_4(new_thr->th.th_in_pool, FALSE);
4464   new_thr->th.th_active_in_pool = FALSE;
4465   TCW_4(new_thr->th.th_active, TRUE);
4466 
4467   /* adjust the global counters */
4468   __kmp_all_nth++;
4469   __kmp_nth++;
4470 
4471   // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search) for low
4472   // numbers of procs, and method #2 (keyed API call) for higher numbers.
4473   if (__kmp_adjust_gtid_mode) {
4474     if (__kmp_all_nth >= __kmp_tls_gtid_min) {
4475       if (TCR_4(__kmp_gtid_mode) != 2) {
4476         TCW_4(__kmp_gtid_mode, 2);
4477       }
4478     } else {
4479       if (TCR_4(__kmp_gtid_mode) != 1) {
4480         TCW_4(__kmp_gtid_mode, 1);
4481       }
4482     }
4483   }
4484 
4485 #ifdef KMP_ADJUST_BLOCKTIME
4486   /* Adjust blocktime back to zero if necessary       */
4487   /* Middle initialization might not have occurred yet */
4488   if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
4489     if (__kmp_nth > __kmp_avail_proc) {
4490       __kmp_zero_bt = TRUE;
4491     }
4492   }
4493 #endif /* KMP_ADJUST_BLOCKTIME */
4494 
4495   /* actually fork it and create the new worker thread */
4496   KF_TRACE(
4497       10, ("__kmp_allocate_thread: before __kmp_create_worker: %p\n", new_thr));
4498   __kmp_create_worker(new_gtid, new_thr, __kmp_stksize);
4499   KF_TRACE(10,
4500            ("__kmp_allocate_thread: after __kmp_create_worker: %p\n", new_thr));
4501 
4502   KA_TRACE(20, ("__kmp_allocate_thread: T#%d forked T#%d\n", __kmp_get_gtid(),
4503                 new_gtid));
4504   KMP_MB();
4505   return new_thr;
4506 }
4507 
4508 /* Reinitialize team for reuse.
4509    The hot team code calls this case at every fork barrier, so EPCC barrier
4510    test are extremely sensitive to changes in it, esp. writes to the team
4511    struct, which cause a cache invalidation in all threads.
4512    IF YOU TOUCH THIS ROUTINE, RUN EPCC C SYNCBENCH ON A BIG-IRON MACHINE!!! */
// Updates the team's source location (`loc`) and team id, re-initializes the
// implicit task of thread 0, and copies `new_icvs` into that implicit task's
// ICVs.
4513 static void __kmp_reinitialize_team(kmp_team_t *team,
4514                                     kmp_internal_control_t *new_icvs,
4515                                     ident_t *loc) {
       // NOTE(review): the trace below reads team->t.t_threads[0] before the
       // KMP_DEBUG_ASSERT(team && new_icvs) that follows it.
4516   KF_TRACE(10, ("__kmp_reinitialize_team: enter this_thread=%p team=%p\n",
4517                 team->t.t_threads[0], team));
4518   KMP_DEBUG_ASSERT(team && new_icvs);
4519   KMP_DEBUG_ASSERT((!TCR_4(__kmp_init_parallel)) || new_icvs->nproc);
       // presumably KMP_CHECK_UPDATE stores only when the value differs, to
       // limit writes to the shared team struct -- confirm against the macro.
4520   KMP_CHECK_UPDATE(team->t.t_ident, loc);
4521 
4522   KMP_CHECK_UPDATE(team->t.t_id, KMP_GEN_TEAM_ID());
4523   // Copy ICVs to the master thread's implicit taskdata
4524   __kmp_init_implicit_task(loc, team->t.t_threads[0], team, 0, FALSE);
4525   copy_icvs(&team->t.t_implicit_task_taskdata[0].td_icvs, new_icvs);
4526 
4527   KF_TRACE(10, ("__kmp_reinitialize_team: exit this_thread=%p team=%p\n",
4528                 team->t.t_threads[0], team));
4529 }
4530 
4531 /* Initialize the team data structure.
4532    This assumes the t_threads and t_max_nproc are already set.
4533    Also, we don't touch the arguments */
// Resets the per-region fields of `team` for a region of `new_nproc` threads
// (which must not exceed t_max_nproc), takes the schedule from `new_icvs`, and
// finishes by calling __kmp_reinitialize_team() with the same ICVs and `loc`.
// t_parent and the contents of t_threads are deliberately left untouched (see
// the inline notes about hot teams).
4534 static void __kmp_initialize_team(kmp_team_t *team, int new_nproc,
4535                                   kmp_internal_control_t *new_icvs,
4536                                   ident_t *loc) {
4537   KF_TRACE(10, ("__kmp_initialize_team: enter: team=%p\n", team));
4538 
4539   /* verify */
4540   KMP_DEBUG_ASSERT(team);
4541   KMP_DEBUG_ASSERT(new_nproc <= team->t.t_max_nproc);
4542   KMP_DEBUG_ASSERT(team->t.t_threads);
4543   KMP_MB();
4544 
4545   team->t.t_master_tid = 0; /* not needed */
4546   /* team->t.t_master_bar;        not needed */
       // A team of at most one thread starts out marked as serialized.
4547   team->t.t_serialized = new_nproc > 1 ? 0 : 1;
4548   team->t.t_nproc = new_nproc;
4549 
4550   /* team->t.t_parent     = NULL; TODO not needed & would mess up hot team */
4551   team->t.t_next_pool = NULL;
4552   /* memset( team->t.t_threads, 0, sizeof(kmp_info_t*)*new_nproc ); would mess
4553    * up hot team */
4554 
4555   TCW_SYNC_PTR(team->t.t_pkfn, NULL); /* not needed */
4556   team->t.t_invoke = NULL; /* not needed */
4557 
4558   // TODO???: team->t.t_max_active_levels       = new_max_active_levels;
4559   team->t.t_sched.sched = new_icvs->sched.sched;
4560 
4561 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
4562   team->t.t_fp_control_saved = FALSE; /* not needed */
4563   team->t.t_x87_fpu_control_word = 0; /* not needed */
4564   team->t.t_mxcsr = 0; /* not needed */
4565 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
4566 
4567   team->t.t_construct = 0;
4568 
4569   team->t.t_ordered.dt.t_value = 0;
4570   team->t.t_master_active = FALSE;
4571 
4572 #ifdef KMP_DEBUG
4573   team->t.t_copypriv_data = NULL; /* not necessary, but nice for debugging */
4574 #endif
4575 #if KMP_OS_WINDOWS
4576   team->t.t_copyin_counter = 0; /* for barrier-free copyin implementation */
4577 #endif
4578 
4579   team->t.t_control_stack_top = NULL;
4580 
4581   __kmp_reinitialize_team(team, new_icvs, loc);
4582 
4583   KMP_MB();
4584   KF_TRACE(10, ("__kmp_initialize_team: exit: team=%p\n", team));
4585 }
4586 
4587 #if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED
4588 /* Sets full mask for thread and returns old mask, no changes to structures. */
4589 static void
4590 __kmp_set_thread_affinity_mask_full_tmp(kmp_affin_mask_t *old_mask) {
4591   if (KMP_AFFINITY_CAPABLE()) {
4592     int status;
4593     if (old_mask != NULL) {
4594       status = __kmp_get_system_affinity(old_mask, TRUE);
4595       int error = errno;
4596       if (status != 0) {
4597         __kmp_fatal(KMP_MSG(ChangeThreadAffMaskError), KMP_ERR(error),
4598                     __kmp_msg_null);
4599       }
4600     }
4601     __kmp_set_system_affinity(__kmp_affin_fullMask, TRUE);
4602   }
4603 }
4604 #endif
4605 
4606 #if KMP_AFFINITY_SUPPORTED
4607 
4608 // __kmp_partition_places() is the heart of the OpenMP 4.0 affinity mechanism.
4609 // It calculates the worker + master thread's partition based upon the parent
4610 // thread's partition, and binds each worker to a thread in their partition.
4611 // The master thread's partition should already include its current binding.
4612 static void __kmp_partition_places(kmp_team_t *team, int update_master_only) {
4613   // Copy the master thread's place partition to the team struct
4614   kmp_info_t *master_th = team->t.t_threads[0];
4615   KMP_DEBUG_ASSERT(master_th != NULL);
4616   kmp_proc_bind_t proc_bind = team->t.t_proc_bind;
4617   int first_place = master_th->th.th_first_place;
4618   int last_place = master_th->th.th_last_place;
4619   int masters_place = master_th->th.th_current_place;
4620   team->t.t_first_place = first_place;
4621   team->t.t_last_place = last_place;
4622 
4623   KA_TRACE(20, ("__kmp_partition_places: enter: proc_bind = %d T#%d(%d:0) "
4624                 "bound to place %d partition = [%d,%d]\n",
4625                 proc_bind, __kmp_gtid_from_thread(team->t.t_threads[0]),
4626                 team->t.t_id, masters_place, first_place, last_place));
4627 
4628   switch (proc_bind) {
4629 
4630   case proc_bind_default:
4631     // serial teams might have the proc_bind policy set to proc_bind_default. It
4632     // doesn't matter, as we don't rebind master thread for any proc_bind policy
4633     KMP_DEBUG_ASSERT(team->t.t_nproc == 1);
4634     break;
4635 
4636   case proc_bind_master: {
4637     int f;
4638     int n_th = team->t.t_nproc;
4639     for (f = 1; f < n_th; f++) {
4640       kmp_info_t *th = team->t.t_threads[f];
4641       KMP_DEBUG_ASSERT(th != NULL);
4642       th->th.th_first_place = first_place;
4643       th->th.th_last_place = last_place;
4644       th->th.th_new_place = masters_place;
4645       if (__kmp_display_affinity && masters_place != th->th.th_current_place &&
4646           team->t.t_display_affinity != 1) {
4647         team->t.t_display_affinity = 1;
4648       }
4649 
4650       KA_TRACE(100, ("__kmp_partition_places: master: T#%d(%d:%d) place %d "
4651                      "partition = [%d,%d]\n",
4652                      __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id,
4653                      f, masters_place, first_place, last_place));
4654     }
4655   } break;
4656 
4657   case proc_bind_close: {
4658     int f;
4659     int n_th = team->t.t_nproc;
4660     int n_places;
4661     if (first_place <= last_place) {
4662       n_places = last_place - first_place + 1;
4663     } else {
4664       n_places = __kmp_affinity_num_masks - first_place + last_place + 1;
4665     }
4666     if (n_th <= n_places) {
4667       int place = masters_place;
4668       for (f = 1; f < n_th; f++) {
4669         kmp_info_t *th = team->t.t_threads[f];
4670         KMP_DEBUG_ASSERT(th != NULL);
4671 
4672         if (place == last_place) {
4673           place = first_place;
4674         } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4675           place = 0;
4676         } else {
4677           place++;
4678         }
4679         th->th.th_first_place = first_place;
4680         th->th.th_last_place = last_place;
4681         th->th.th_new_place = place;
4682         if (__kmp_display_affinity && place != th->th.th_current_place &&
4683             team->t.t_display_affinity != 1) {
4684           team->t.t_display_affinity = 1;
4685         }
4686 
4687         KA_TRACE(100, ("__kmp_partition_places: close: T#%d(%d:%d) place %d "
4688                        "partition = [%d,%d]\n",
4689                        __kmp_gtid_from_thread(team->t.t_threads[f]),
4690                        team->t.t_id, f, place, first_place, last_place));
4691       }
4692     } else {
4693       int S, rem, gap, s_count;
4694       S = n_th / n_places;
4695       s_count = 0;
4696       rem = n_th - (S * n_places);
4697       gap = rem > 0 ? n_places / rem : n_places;
4698       int place = masters_place;
4699       int gap_ct = gap;
4700       for (f = 0; f < n_th; f++) {
4701         kmp_info_t *th = team->t.t_threads[f];
4702         KMP_DEBUG_ASSERT(th != NULL);
4703 
4704         th->th.th_first_place = first_place;
4705         th->th.th_last_place = last_place;
4706         th->th.th_new_place = place;
4707         if (__kmp_display_affinity && place != th->th.th_current_place &&
4708             team->t.t_display_affinity != 1) {
4709           team->t.t_display_affinity = 1;
4710         }
4711         s_count++;
4712 
4713         if ((s_count == S) && rem && (gap_ct == gap)) {
4714           // do nothing, add an extra thread to place on next iteration
4715         } else if ((s_count == S + 1) && rem && (gap_ct == gap)) {
4716           // we added an extra thread to this place; move to next place
4717           if (place == last_place) {
4718             place = first_place;
4719           } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4720             place = 0;
4721           } else {
4722             place++;
4723           }
4724           s_count = 0;
4725           gap_ct = 1;
4726           rem--;
4727         } else if (s_count == S) { // place full; don't add extra
4728           if (place == last_place) {
4729             place = first_place;
4730           } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4731             place = 0;
4732           } else {
4733             place++;
4734           }
4735           gap_ct++;
4736           s_count = 0;
4737         }
4738 
4739         KA_TRACE(100,
4740                  ("__kmp_partition_places: close: T#%d(%d:%d) place %d "
4741                   "partition = [%d,%d]\n",
4742                   __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id, f,
4743                   th->th.th_new_place, first_place, last_place));
4744       }
4745       KMP_DEBUG_ASSERT(place == masters_place);
4746     }
4747   } break;
4748 
4749   case proc_bind_spread: {
4750     int f;
4751     int n_th = team->t.t_nproc;
4752     int n_places;
4753     int thidx;
4754     if (first_place <= last_place) {
4755       n_places = last_place - first_place + 1;
4756     } else {
4757       n_places = __kmp_affinity_num_masks - first_place + last_place + 1;
4758     }
4759     if (n_th <= n_places) {
4760       int place = -1;
4761 
4762       if (n_places != static_cast<int>(__kmp_affinity_num_masks)) {
4763         int S = n_places / n_th;
4764         int s_count, rem, gap, gap_ct;
4765 
4766         place = masters_place;
4767         rem = n_places - n_th * S;
4768         gap = rem ? n_th / rem : 1;
4769         gap_ct = gap;
4770         thidx = n_th;
4771         if (update_master_only == 1)
4772           thidx = 1;
4773         for (f = 0; f < thidx; f++) {
4774           kmp_info_t *th = team->t.t_threads[f];
4775           KMP_DEBUG_ASSERT(th != NULL);
4776 
4777           th->th.th_first_place = place;
4778           th->th.th_new_place = place;
4779           if (__kmp_display_affinity && place != th->th.th_current_place &&
4780               team->t.t_display_affinity != 1) {
4781             team->t.t_display_affinity = 1;
4782           }
4783           s_count = 1;
4784           while (s_count < S) {
4785             if (place == last_place) {
4786               place = first_place;
4787             } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4788               place = 0;
4789             } else {
4790               place++;
4791             }
4792             s_count++;
4793           }
4794           if (rem && (gap_ct == gap)) {
4795             if (place == last_place) {
4796               place = first_place;
4797             } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4798               place = 0;
4799             } else {
4800               place++;
4801             }
4802             rem--;
4803             gap_ct = 0;
4804           }
4805           th->th.th_last_place = place;
4806           gap_ct++;
4807 
4808           if (place == last_place) {
4809             place = first_place;
4810           } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4811             place = 0;
4812           } else {
4813             place++;
4814           }
4815 
4816           KA_TRACE(100,
4817                    ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
4818                     "partition = [%d,%d], __kmp_affinity_num_masks: %u\n",
4819                     __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id,
4820                     f, th->th.th_new_place, th->th.th_first_place,
4821                     th->th.th_last_place, __kmp_affinity_num_masks));
4822         }
4823       } else {
4824         /* Having uniform space of available computation places I can create
4825            T partitions of round(P/T) size and put threads into the first
4826            place of each partition. */
4827         double current = static_cast<double>(masters_place);
4828         double spacing =
4829             (static_cast<double>(n_places + 1) / static_cast<double>(n_th));
4830         int first, last;
4831         kmp_info_t *th;
4832 
4833         thidx = n_th + 1;
4834         if (update_master_only == 1)
4835           thidx = 1;
4836         for (f = 0; f < thidx; f++) {
4837           first = static_cast<int>(current);
4838           last = static_cast<int>(current + spacing) - 1;
4839           KMP_DEBUG_ASSERT(last >= first);
4840           if (first >= n_places) {
4841             if (masters_place) {
4842               first -= n_places;
4843               last -= n_places;
4844               if (first == (masters_place + 1)) {
4845                 KMP_DEBUG_ASSERT(f == n_th);
4846                 first--;
4847               }
4848               if (last == masters_place) {
4849                 KMP_DEBUG_ASSERT(f == (n_th - 1));
4850                 last--;
4851               }
4852             } else {
4853               KMP_DEBUG_ASSERT(f == n_th);
4854               first = 0;
4855               last = 0;
4856             }
4857           }
4858           if (last >= n_places) {
4859             last = (n_places - 1);
4860           }
4861           place = first;
4862           current += spacing;
4863           if (f < n_th) {
4864             KMP_DEBUG_ASSERT(0 <= first);
4865             KMP_DEBUG_ASSERT(n_places > first);
4866             KMP_DEBUG_ASSERT(0 <= last);
4867             KMP_DEBUG_ASSERT(n_places > last);
4868             KMP_DEBUG_ASSERT(last_place >= first_place);
4869             th = team->t.t_threads[f];
4870             KMP_DEBUG_ASSERT(th);
4871             th->th.th_first_place = first;
4872             th->th.th_new_place = place;
4873             th->th.th_last_place = last;
4874             if (__kmp_display_affinity && place != th->th.th_current_place &&
4875                 team->t.t_display_affinity != 1) {
4876               team->t.t_display_affinity = 1;
4877             }
4878             KA_TRACE(100,
4879                      ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
4880                       "partition = [%d,%d], spacing = %.4f\n",
4881                       __kmp_gtid_from_thread(team->t.t_threads[f]),
4882                       team->t.t_id, f, th->th.th_new_place,
4883                       th->th.th_first_place, th->th.th_last_place, spacing));
4884           }
4885         }
4886       }
4887       KMP_DEBUG_ASSERT(update_master_only || place == masters_place);
4888     } else {
4889       int S, rem, gap, s_count;
4890       S = n_th / n_places;
4891       s_count = 0;
4892       rem = n_th - (S * n_places);
4893       gap = rem > 0 ? n_places / rem : n_places;
4894       int place = masters_place;
4895       int gap_ct = gap;
4896       thidx = n_th;
4897       if (update_master_only == 1)
4898         thidx = 1;
4899       for (f = 0; f < thidx; f++) {
4900         kmp_info_t *th = team->t.t_threads[f];
4901         KMP_DEBUG_ASSERT(th != NULL);
4902 
4903         th->th.th_first_place = place;
4904         th->th.th_last_place = place;
4905         th->th.th_new_place = place;
4906         if (__kmp_display_affinity && place != th->th.th_current_place &&
4907             team->t.t_display_affinity != 1) {
4908           team->t.t_display_affinity = 1;
4909         }
4910         s_count++;
4911 
4912         if ((s_count == S) && rem && (gap_ct == gap)) {
4913           // do nothing, add an extra thread to place on next iteration
4914         } else if ((s_count == S + 1) && rem && (gap_ct == gap)) {
4915           // we added an extra thread to this place; move on to next place
4916           if (place == last_place) {
4917             place = first_place;
4918           } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4919             place = 0;
4920           } else {
4921             place++;
4922           }
4923           s_count = 0;
4924           gap_ct = 1;
4925           rem--;
4926         } else if (s_count == S) { // place is full; don't add extra thread
4927           if (place == last_place) {
4928             place = first_place;
4929           } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4930             place = 0;
4931           } else {
4932             place++;
4933           }
4934           gap_ct++;
4935           s_count = 0;
4936         }
4937 
4938         KA_TRACE(100, ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
4939                        "partition = [%d,%d]\n",
4940                        __kmp_gtid_from_thread(team->t.t_threads[f]),
4941                        team->t.t_id, f, th->th.th_new_place,
4942                        th->th.th_first_place, th->th.th_last_place));
4943       }
4944       KMP_DEBUG_ASSERT(update_master_only || place == masters_place);
4945     }
4946   } break;
4947 
4948   default:
4949     break;
4950   }
4951 
4952   KA_TRACE(20, ("__kmp_partition_places: exit T#%d\n", team->t.t_id));
4953 }
4954 
4955 #endif // KMP_AFFINITY_SUPPORTED
4956 
4957 /* allocate a new team data structure to use.  take one off of the free pool if
4958    available */
4959 kmp_team_t *
4960 __kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc,
4961 #if OMPT_SUPPORT
4962                     ompt_data_t ompt_parallel_data,
4963 #endif
4964                     kmp_proc_bind_t new_proc_bind,
4965                     kmp_internal_control_t *new_icvs,
4966                     int argc USE_NESTED_HOT_ARG(kmp_info_t *master)) {
4967   KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_allocate_team);
4968   int f;
4969   kmp_team_t *team;
4970   int use_hot_team = !root->r.r_active;
4971   int level = 0;
4972 
4973   KA_TRACE(20, ("__kmp_allocate_team: called\n"));
4974   KMP_DEBUG_ASSERT(new_nproc >= 1 && argc >= 0);
4975   KMP_DEBUG_ASSERT(max_nproc >= new_nproc);
4976   KMP_MB();
4977 
4978 #if KMP_NESTED_HOT_TEAMS
4979   kmp_hot_team_ptr_t *hot_teams;
4980   if (master) {
4981     team = master->th.th_team;
4982     level = team->t.t_active_level;
4983     if (master->th.th_teams_microtask) { // in teams construct?
4984       if (master->th.th_teams_size.nteams > 1 &&
4985           ( // #teams > 1
4986               team->t.t_pkfn ==
4987                   (microtask_t)__kmp_teams_master || // inner fork of the teams
4988               master->th.th_teams_level <
4989                   team->t.t_level)) { // or nested parallel inside the teams
4990         ++level; // not increment if #teams==1, or for outer fork of the teams;
4991         // increment otherwise
4992       }
4993     }
4994     hot_teams = master->th.th_hot_teams;
4995     if (level < __kmp_hot_teams_max_level && hot_teams &&
4996         hot_teams[level].hot_team) {
4997       // hot team has already been allocated for given level
4998       use_hot_team = 1;
4999     } else {
5000       use_hot_team = 0;
5001     }
5002   } else {
5003     // check we won't access uninitialized hot_teams, just in case
5004     KMP_DEBUG_ASSERT(new_nproc == 1);
5005   }
5006 #endif
5007   // Optimization to use a "hot" team
5008   if (use_hot_team && new_nproc > 1) {
5009     KMP_DEBUG_ASSERT(new_nproc <= max_nproc);
5010 #if KMP_NESTED_HOT_TEAMS
5011     team = hot_teams[level].hot_team;
5012 #else
5013     team = root->r.r_hot_team;
5014 #endif
5015 #if KMP_DEBUG
5016     if (__kmp_tasking_mode != tskm_immediate_exec) {
5017       KA_TRACE(20, ("__kmp_allocate_team: hot team task_team[0] = %p "
5018                     "task_team[1] = %p before reinit\n",
5019                     team->t.t_task_team[0], team->t.t_task_team[1]));
5020     }
5021 #endif
5022 
5023     // Has the number of threads changed?
5024     /* Let's assume the most common case is that the number of threads is
5025        unchanged, and put that case first. */
5026     if (team->t.t_nproc == new_nproc) { // Check changes in number of threads
5027       KA_TRACE(20, ("__kmp_allocate_team: reusing hot team\n"));
5028       // This case can mean that omp_set_num_threads() was called and the hot
5029       // team size was already reduced, so we check the special flag
5030       if (team->t.t_size_changed == -1) {
5031         team->t.t_size_changed = 1;
5032       } else {
5033         KMP_CHECK_UPDATE(team->t.t_size_changed, 0);
5034       }
5035 
5036       // TODO???: team->t.t_max_active_levels = new_max_active_levels;
5037       kmp_r_sched_t new_sched = new_icvs->sched;
5038       // set master's schedule as new run-time schedule
5039       KMP_CHECK_UPDATE(team->t.t_sched.sched, new_sched.sched);
5040 
5041       __kmp_reinitialize_team(team, new_icvs,
5042                               root->r.r_uber_thread->th.th_ident);
5043 
5044       KF_TRACE(10, ("__kmp_allocate_team2: T#%d, this_thread=%p team=%p\n", 0,
5045                     team->t.t_threads[0], team));
5046       __kmp_push_current_task_to_thread(team->t.t_threads[0], team, 0);
5047 
5048 #if KMP_AFFINITY_SUPPORTED
5049       if ((team->t.t_size_changed == 0) &&
5050           (team->t.t_proc_bind == new_proc_bind)) {
5051         if (new_proc_bind == proc_bind_spread) {
5052           __kmp_partition_places(
5053               team, 1); // add flag to update only master for spread
5054         }
5055         KA_TRACE(200, ("__kmp_allocate_team: reusing hot team #%d bindings: "
5056                        "proc_bind = %d, partition = [%d,%d]\n",
5057                        team->t.t_id, new_proc_bind, team->t.t_first_place,
5058                        team->t.t_last_place));
5059       } else {
5060         KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5061         __kmp_partition_places(team);
5062       }
5063 #else
5064       KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5065 #endif /* KMP_AFFINITY_SUPPORTED */
5066     } else if (team->t.t_nproc > new_nproc) {
5067       KA_TRACE(20,
5068                ("__kmp_allocate_team: decreasing hot team thread count to %d\n",
5069                 new_nproc));
5070 
5071       team->t.t_size_changed = 1;
5072 #if KMP_NESTED_HOT_TEAMS
5073       if (__kmp_hot_teams_mode == 0) {
5074         // AC: saved number of threads should correspond to team's value in this
5075         // mode, can be bigger in mode 1, when hot team has threads in reserve
5076         KMP_DEBUG_ASSERT(hot_teams[level].hot_team_nth == team->t.t_nproc);
5077         hot_teams[level].hot_team_nth = new_nproc;
5078 #endif // KMP_NESTED_HOT_TEAMS
5079         /* release the extra threads we don't need any more */
5080         for (f = new_nproc; f < team->t.t_nproc; f++) {
5081           KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5082           if (__kmp_tasking_mode != tskm_immediate_exec) {
5083             // When decreasing team size, threads no longer in the team should
5084             // unref task team.
5085             team->t.t_threads[f]->th.th_task_team = NULL;
5086           }
5087           __kmp_free_thread(team->t.t_threads[f]);
5088           team->t.t_threads[f] = NULL;
5089         }
5090 #if KMP_NESTED_HOT_TEAMS
5091       } // (__kmp_hot_teams_mode == 0)
5092       else {
5093         // When keeping extra threads in team, switch threads to wait on own
5094         // b_go flag
5095         for (f = new_nproc; f < team->t.t_nproc; ++f) {
5096           KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5097           kmp_balign_t *balign = team->t.t_threads[f]->th.th_bar;
5098           for (int b = 0; b < bs_last_barrier; ++b) {
5099             if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG) {
5100               balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG;
5101             }
5102             KMP_CHECK_UPDATE(balign[b].bb.leaf_kids, 0);
5103           }
5104         }
5105       }
5106 #endif // KMP_NESTED_HOT_TEAMS
5107       team->t.t_nproc = new_nproc;
5108       // TODO???: team->t.t_max_active_levels = new_max_active_levels;
5109       KMP_CHECK_UPDATE(team->t.t_sched.sched, new_icvs->sched.sched);
5110       __kmp_reinitialize_team(team, new_icvs,
5111                               root->r.r_uber_thread->th.th_ident);
5112 
5113       // Update remaining threads
5114       for (f = 0; f < new_nproc; ++f) {
5115         team->t.t_threads[f]->th.th_team_nproc = new_nproc;
5116       }
5117 
5118       // restore the current task state of the master thread: should be the
5119       // implicit task
5120       KF_TRACE(10, ("__kmp_allocate_team: T#%d, this_thread=%p team=%p\n", 0,
5121                     team->t.t_threads[0], team));
5122 
5123       __kmp_push_current_task_to_thread(team->t.t_threads[0], team, 0);
5124 
5125 #ifdef KMP_DEBUG
5126       for (f = 0; f < team->t.t_nproc; f++) {
5127         KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
5128                          team->t.t_threads[f]->th.th_team_nproc ==
5129                              team->t.t_nproc);
5130       }
5131 #endif
5132 
5133       KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5134 #if KMP_AFFINITY_SUPPORTED
5135       __kmp_partition_places(team);
5136 #endif
5137     } else { // team->t.t_nproc < new_nproc
5138 #if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED
5139       kmp_affin_mask_t *old_mask;
5140       if (KMP_AFFINITY_CAPABLE()) {
5141         KMP_CPU_ALLOC(old_mask);
5142       }
5143 #endif
5144 
5145       KA_TRACE(20,
5146                ("__kmp_allocate_team: increasing hot team thread count to %d\n",
5147                 new_nproc));
5148 
5149       team->t.t_size_changed = 1;
5150 
5151 #if KMP_NESTED_HOT_TEAMS
5152       int avail_threads = hot_teams[level].hot_team_nth;
5153       if (new_nproc < avail_threads)
5154         avail_threads = new_nproc;
5155       kmp_info_t **other_threads = team->t.t_threads;
5156       for (f = team->t.t_nproc; f < avail_threads; ++f) {
5157         // Adjust barrier data of reserved threads (if any) of the team
5158         // Other data will be set in __kmp_initialize_info() below.
5159         int b;
5160         kmp_balign_t *balign = other_threads[f]->th.th_bar;
5161         for (b = 0; b < bs_last_barrier; ++b) {
5162           balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5163           KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
5164 #if USE_DEBUGGER
5165           balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5166 #endif
5167         }
5168       }
5169       if (hot_teams[level].hot_team_nth >= new_nproc) {
5170         // we have all needed threads in reserve, no need to allocate any
5171         // this only possible in mode 1, cannot have reserved threads in mode 0
5172         KMP_DEBUG_ASSERT(__kmp_hot_teams_mode == 1);
5173         team->t.t_nproc = new_nproc; // just get reserved threads involved
5174       } else {
5175         // we may have some threads in reserve, but not enough
5176         team->t.t_nproc =
5177             hot_teams[level]
5178                 .hot_team_nth; // get reserved threads involved if any
5179         hot_teams[level].hot_team_nth = new_nproc; // adjust hot team max size
5180 #endif // KMP_NESTED_HOT_TEAMS
5181         if (team->t.t_max_nproc < new_nproc) {
5182           /* reallocate larger arrays */
5183           __kmp_reallocate_team_arrays(team, new_nproc);
5184           __kmp_reinitialize_team(team, new_icvs, NULL);
5185         }
5186 
5187 #if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED
5188         /* Temporarily set full mask for master thread before creation of
5189            workers. The reason is that workers inherit the affinity from master,
5190            so if a lot of workers are created on the single core quickly, they
5191            don't get a chance to set their own affinity for a long time. */
5192         __kmp_set_thread_affinity_mask_full_tmp(old_mask);
5193 #endif
5194 
5195         /* allocate new threads for the hot team */
5196         for (f = team->t.t_nproc; f < new_nproc; f++) {
5197           kmp_info_t *new_worker = __kmp_allocate_thread(root, team, f);
5198           KMP_DEBUG_ASSERT(new_worker);
5199           team->t.t_threads[f] = new_worker;
5200 
5201           KA_TRACE(20,
5202                    ("__kmp_allocate_team: team %d init T#%d arrived: "
5203                     "join=%llu, plain=%llu\n",
5204                     team->t.t_id, __kmp_gtid_from_tid(f, team), team->t.t_id, f,
5205                     team->t.t_bar[bs_forkjoin_barrier].b_arrived,
5206                     team->t.t_bar[bs_plain_barrier].b_arrived));
5207 
5208           { // Initialize barrier data for new threads.
5209             int b;
5210             kmp_balign_t *balign = new_worker->th.th_bar;
5211             for (b = 0; b < bs_last_barrier; ++b) {
5212               balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5213               KMP_DEBUG_ASSERT(balign[b].bb.wait_flag !=
5214                                KMP_BARRIER_PARENT_FLAG);
5215 #if USE_DEBUGGER
5216               balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5217 #endif
5218             }
5219           }
5220         }
5221 
5222 #if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED
5223         if (KMP_AFFINITY_CAPABLE()) {
5224           /* Restore initial master thread's affinity mask */
5225           __kmp_set_system_affinity(old_mask, TRUE);
5226           KMP_CPU_FREE(old_mask);
5227         }
5228 #endif
5229 #if KMP_NESTED_HOT_TEAMS
5230       } // end of check of t_nproc vs. new_nproc vs. hot_team_nth
5231 #endif // KMP_NESTED_HOT_TEAMS
5232       /* make sure everyone is syncronized */
5233       int old_nproc = team->t.t_nproc; // save old value and use to update only
5234       // new threads below
5235       __kmp_initialize_team(team, new_nproc, new_icvs,
5236                             root->r.r_uber_thread->th.th_ident);
5237 
5238       /* reinitialize the threads */
5239       KMP_DEBUG_ASSERT(team->t.t_nproc == new_nproc);
5240       for (f = 0; f < team->t.t_nproc; ++f)
5241         __kmp_initialize_info(team->t.t_threads[f], team, f,
5242                               __kmp_gtid_from_tid(f, team));
5243 
5244       if (level) { // set th_task_state for new threads in nested hot team
5245         // __kmp_initialize_info() no longer zeroes th_task_state, so we should
5246         // only need to set the th_task_state for the new threads. th_task_state
5247         // for master thread will not be accurate until after this in
5248         // __kmp_fork_call(), so we look to the master's memo_stack to get the
5249         // correct value.
5250         for (f = old_nproc; f < team->t.t_nproc; ++f)
5251           team->t.t_threads[f]->th.th_task_state =
5252               team->t.t_threads[0]->th.th_task_state_memo_stack[level];
5253       } else { // set th_task_state for new threads in non-nested hot team
5254         kmp_uint8 old_state =
5255             team->t.t_threads[0]->th.th_task_state; // copy master's state
5256         for (f = old_nproc; f < team->t.t_nproc; ++f)
5257           team->t.t_threads[f]->th.th_task_state = old_state;
5258       }
5259 
5260 #ifdef KMP_DEBUG
5261       for (f = 0; f < team->t.t_nproc; ++f) {
5262         KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
5263                          team->t.t_threads[f]->th.th_team_nproc ==
5264                              team->t.t_nproc);
5265       }
5266 #endif
5267 
5268       KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5269 #if KMP_AFFINITY_SUPPORTED
5270       __kmp_partition_places(team);
5271 #endif
5272     } // Check changes in number of threads
5273 
5274     kmp_info_t *master = team->t.t_threads[0];
5275     if (master->th.th_teams_microtask) {
5276       for (f = 1; f < new_nproc; ++f) {
5277         // propagate teams construct specific info to workers
5278         kmp_info_t *thr = team->t.t_threads[f];
5279         thr->th.th_teams_microtask = master->th.th_teams_microtask;
5280         thr->th.th_teams_level = master->th.th_teams_level;
5281         thr->th.th_teams_size = master->th.th_teams_size;
5282       }
5283     }
5284 #if KMP_NESTED_HOT_TEAMS
5285     if (level) {
5286       // Sync barrier state for nested hot teams, not needed for outermost hot
5287       // team.
5288       for (f = 1; f < new_nproc; ++f) {
5289         kmp_info_t *thr = team->t.t_threads[f];
5290         int b;
5291         kmp_balign_t *balign = thr->th.th_bar;
5292         for (b = 0; b < bs_last_barrier; ++b) {
5293           balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5294           KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
5295 #if USE_DEBUGGER
5296           balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5297 #endif
5298         }
5299       }
5300     }
5301 #endif // KMP_NESTED_HOT_TEAMS
5302 
5303     /* reallocate space for arguments if necessary */
5304     __kmp_alloc_argv_entries(argc, team, TRUE);
5305     KMP_CHECK_UPDATE(team->t.t_argc, argc);
5306     // The hot team re-uses the previous task team,
5307     // if untouched during the previous release->gather phase.
5308 
5309     KF_TRACE(10, (" hot_team = %p\n", team));
5310 
5311 #if KMP_DEBUG
5312     if (__kmp_tasking_mode != tskm_immediate_exec) {
5313       KA_TRACE(20, ("__kmp_allocate_team: hot team task_team[0] = %p "
5314                     "task_team[1] = %p after reinit\n",
5315                     team->t.t_task_team[0], team->t.t_task_team[1]));
5316     }
5317 #endif
5318 
5319 #if OMPT_SUPPORT
5320     __ompt_team_assign_id(team, ompt_parallel_data);
5321 #endif
5322 
5323     KMP_MB();
5324 
5325     return team;
5326   }
5327 
5328   /* next, let's try to take one from the team pool */
5329   KMP_MB();
5330   for (team = CCAST(kmp_team_t *, __kmp_team_pool); (team);) {
5331     /* TODO: consider resizing undersized teams instead of reaping them, now
5332        that we have a resizing mechanism */
5333     if (team->t.t_max_nproc >= max_nproc) {
5334       /* take this team from the team pool */
5335       __kmp_team_pool = team->t.t_next_pool;
5336 
5337       /* setup the team for fresh use */
5338       __kmp_initialize_team(team, new_nproc, new_icvs, NULL);
5339 
5340       KA_TRACE(20, ("__kmp_allocate_team: setting task_team[0] %p and "
5341                     "task_team[1] %p to NULL\n",
5342                     &team->t.t_task_team[0], &team->t.t_task_team[1]));
5343       team->t.t_task_team[0] = NULL;
5344       team->t.t_task_team[1] = NULL;
5345 
5346       /* reallocate space for arguments if necessary */
5347       __kmp_alloc_argv_entries(argc, team, TRUE);
5348       KMP_CHECK_UPDATE(team->t.t_argc, argc);
5349 
5350       KA_TRACE(
5351           20, ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n",
5352                team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
5353       { // Initialize barrier data.
5354         int b;
5355         for (b = 0; b < bs_last_barrier; ++b) {
5356           team->t.t_bar[b].b_arrived = KMP_INIT_BARRIER_STATE;
5357 #if USE_DEBUGGER
5358           team->t.t_bar[b].b_master_arrived = 0;
5359           team->t.t_bar[b].b_team_arrived = 0;
5360 #endif
5361         }
5362       }
5363 
5364       team->t.t_proc_bind = new_proc_bind;
5365 
5366       KA_TRACE(20, ("__kmp_allocate_team: using team from pool %d.\n",
5367                     team->t.t_id));
5368 
5369 #if OMPT_SUPPORT
5370       __ompt_team_assign_id(team, ompt_parallel_data);
5371 #endif
5372 
5373       KMP_MB();
5374 
5375       return team;
5376     }
5377 
5378     /* reap team if it is too small, then loop back and check the next one */
5379     // not sure if this is wise, but, will be redone during the hot-teams
5380     // rewrite.
5381     /* TODO: Use technique to find the right size hot-team, don't reap them */
5382     team = __kmp_reap_team(team);
5383     __kmp_team_pool = team;
5384   }
5385 
5386   /* nothing available in the pool, no matter, make a new team! */
5387   KMP_MB();
5388   team = (kmp_team_t *)__kmp_allocate(sizeof(kmp_team_t));
5389 
5390   /* and set it up */
5391   team->t.t_max_nproc = max_nproc;
5392   /* NOTE well, for some reason allocating one big buffer and dividing it up
5393      seems to really hurt performance a lot on the P4, so, let's not use this */
5394   __kmp_allocate_team_arrays(team, max_nproc);
5395 
5396   KA_TRACE(20, ("__kmp_allocate_team: making a new team\n"));
5397   __kmp_initialize_team(team, new_nproc, new_icvs, NULL);
5398 
5399   KA_TRACE(20, ("__kmp_allocate_team: setting task_team[0] %p and task_team[1] "
5400                 "%p to NULL\n",
5401                 &team->t.t_task_team[0], &team->t.t_task_team[1]));
5402   team->t.t_task_team[0] = NULL; // to be removed, as __kmp_allocate zeroes
5403   // memory, no need to duplicate
5404   team->t.t_task_team[1] = NULL; // to be removed, as __kmp_allocate zeroes
5405   // memory, no need to duplicate
5406 
5407   if (__kmp_storage_map) {
5408     __kmp_print_team_storage_map("team", team, team->t.t_id, new_nproc);
5409   }
5410 
5411   /* allocate space for arguments */
5412   __kmp_alloc_argv_entries(argc, team, FALSE);
5413   team->t.t_argc = argc;
5414 
5415   KA_TRACE(20,
5416            ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n",
5417             team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
5418   { // Initialize barrier data.
5419     int b;
5420     for (b = 0; b < bs_last_barrier; ++b) {
5421       team->t.t_bar[b].b_arrived = KMP_INIT_BARRIER_STATE;
5422 #if USE_DEBUGGER
5423       team->t.t_bar[b].b_master_arrived = 0;
5424       team->t.t_bar[b].b_team_arrived = 0;
5425 #endif
5426     }
5427   }
5428 
5429   team->t.t_proc_bind = new_proc_bind;
5430 
5431 #if OMPT_SUPPORT
5432   __ompt_team_assign_id(team, ompt_parallel_data);
5433   team->t.ompt_serialized_team_info = NULL;
5434 #endif
5435 
5436   KMP_MB();
5437 
5438   KA_TRACE(20, ("__kmp_allocate_team: done creating a new team %d.\n",
5439                 team->t.t_id));
5440 
5441   return team;
5442 }
5443 
5444 /* TODO implement hot-teams at all levels */
5445 /* TODO implement lazy thread release on demand (disband request) */
5446 
5447 /* free the team.  return it to the team pool.  release all the threads
5448  * associated with it */
5449 void __kmp_free_team(kmp_root_t *root,
5450                      kmp_team_t *team USE_NESTED_HOT_ARG(kmp_info_t *master)) {
5451   int f;
5452   KA_TRACE(20, ("__kmp_free_team: T#%d freeing team %d\n", __kmp_get_gtid(),
5453                 team->t.t_id));
5454 
5455   /* verify state */
5456   KMP_DEBUG_ASSERT(root);
5457   KMP_DEBUG_ASSERT(team);
5458   KMP_DEBUG_ASSERT(team->t.t_nproc <= team->t.t_max_nproc);
5459   KMP_DEBUG_ASSERT(team->t.t_threads);
5460 
5461   int use_hot_team = team == root->r.r_hot_team;
5462 #if KMP_NESTED_HOT_TEAMS
5463   int level;
5464   kmp_hot_team_ptr_t *hot_teams;
5465   if (master) {
5466     level = team->t.t_active_level - 1;
5467     if (master->th.th_teams_microtask) { // in teams construct?
5468       if (master->th.th_teams_size.nteams > 1) {
5469         ++level; // level was not increased in teams construct for
5470         // team_of_masters
5471       }
5472       if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
5473           master->th.th_teams_level == team->t.t_level) {
5474         ++level; // level was not increased in teams construct for
5475         // team_of_workers before the parallel
5476       } // team->t.t_level will be increased inside parallel
5477     }
5478     hot_teams = master->th.th_hot_teams;
5479     if (level < __kmp_hot_teams_max_level) {
5480       KMP_DEBUG_ASSERT(team == hot_teams[level].hot_team);
5481       use_hot_team = 1;
5482     }
5483   }
5484 #endif // KMP_NESTED_HOT_TEAMS
5485 
5486   /* team is done working */
5487   TCW_SYNC_PTR(team->t.t_pkfn,
5488                NULL); // Important for Debugging Support Library.
5489 #if KMP_OS_WINDOWS
5490   team->t.t_copyin_counter = 0; // init counter for possible reuse
5491 #endif
5492   // Do not reset pointer to parent team to NULL for hot teams.
5493 
5494   /* if we are non-hot team, release our threads */
5495   if (!use_hot_team) {
5496     if (__kmp_tasking_mode != tskm_immediate_exec) {
5497       // Wait for threads to reach reapable state
5498       for (f = 1; f < team->t.t_nproc; ++f) {
5499         KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5500         kmp_info_t *th = team->t.t_threads[f];
5501         volatile kmp_uint32 *state = &th->th.th_reap_state;
5502         while (*state != KMP_SAFE_TO_REAP) {
5503 #if KMP_OS_WINDOWS
5504           // On Windows a thread can be killed at any time, check this
5505           DWORD ecode;
5506           if (!__kmp_is_thread_alive(th, &ecode)) {
5507             *state = KMP_SAFE_TO_REAP; // reset the flag for dead thread
5508             break;
5509           }
5510 #endif
5511           // first check if thread is sleeping
5512           kmp_flag_64<> fl(&th->th.th_bar[bs_forkjoin_barrier].bb.b_go, th);
5513           if (fl.is_sleeping())
5514             fl.resume(__kmp_gtid_from_thread(th));
5515           KMP_CPU_PAUSE();
5516         }
5517       }
5518 
5519       // Delete task teams
5520       int tt_idx;
5521       for (tt_idx = 0; tt_idx < 2; ++tt_idx) {
5522         kmp_task_team_t *task_team = team->t.t_task_team[tt_idx];
5523         if (task_team != NULL) {
5524           for (f = 0; f < team->t.t_nproc; ++f) { // threads unref task teams
5525             KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5526             team->t.t_threads[f]->th.th_task_team = NULL;
5527           }
5528           KA_TRACE(
5529               20,
5530               ("__kmp_free_team: T#%d deactivating task_team %p on team %d\n",
5531                __kmp_get_gtid(), task_team, team->t.t_id));
5532 #if KMP_NESTED_HOT_TEAMS
5533           __kmp_free_task_team(master, task_team);
5534 #endif
5535           team->t.t_task_team[tt_idx] = NULL;
5536         }
5537       }
5538     }
5539 
5540     // Reset pointer to parent team only for non-hot teams.
5541     team->t.t_parent = NULL;
5542     team->t.t_level = 0;
5543     team->t.t_active_level = 0;
5544 
5545     /* free the worker threads */
5546     for (f = 1; f < team->t.t_nproc; ++f) {
5547       KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5548       __kmp_free_thread(team->t.t_threads[f]);
5549       team->t.t_threads[f] = NULL;
5550     }
5551 
5552     /* put the team back in the team pool */
5553     /* TODO limit size of team pool, call reap_team if pool too large */
5554     team->t.t_next_pool = CCAST(kmp_team_t *, __kmp_team_pool);
5555     __kmp_team_pool = (volatile kmp_team_t *)team;
5556   } else { // Check if team was created for the masters in a teams construct
5557     // See if first worker is a CG root
5558     KMP_DEBUG_ASSERT(team->t.t_threads[1] &&
5559                      team->t.t_threads[1]->th.th_cg_roots);
5560     if (team->t.t_threads[1]->th.th_cg_roots->cg_root == team->t.t_threads[1]) {
5561       // Clean up the CG root nodes on workers so that this team can be re-used
5562       for (f = 1; f < team->t.t_nproc; ++f) {
5563         kmp_info_t *thr = team->t.t_threads[f];
5564         KMP_DEBUG_ASSERT(thr && thr->th.th_cg_roots &&
5565                          thr->th.th_cg_roots->cg_root == thr);
5566         // Pop current CG root off list
5567         kmp_cg_root_t *tmp = thr->th.th_cg_roots;
5568         thr->th.th_cg_roots = tmp->up;
5569         KA_TRACE(100, ("__kmp_free_team: Thread %p popping node %p and moving"
5570                        " up to node %p. cg_nthreads was %d\n",
5571                        thr, tmp, thr->th.th_cg_roots, tmp->cg_nthreads));
5572         int i = tmp->cg_nthreads--;
5573         if (i == 1) {
5574           __kmp_free(tmp); // free CG if we are the last thread in it
5575         }
5576         // Restore current task's thread_limit from CG root
5577         if (thr->th.th_cg_roots)
5578           thr->th.th_current_task->td_icvs.thread_limit =
5579               thr->th.th_cg_roots->cg_thread_limit;
5580       }
5581     }
5582   }
5583 
5584   KMP_MB();
5585 }
5586 
5587 /* reap the team.  destroy it, reclaim all its resources and free its memory */
5588 kmp_team_t *__kmp_reap_team(kmp_team_t *team) {
5589   kmp_team_t *next_pool = team->t.t_next_pool;
5590 
5591   KMP_DEBUG_ASSERT(team);
5592   KMP_DEBUG_ASSERT(team->t.t_dispatch);
5593   KMP_DEBUG_ASSERT(team->t.t_disp_buffer);
5594   KMP_DEBUG_ASSERT(team->t.t_threads);
5595   KMP_DEBUG_ASSERT(team->t.t_argv);
5596 
5597   /* TODO clean the threads that are a part of this? */
5598 
5599   /* free stuff */
5600   __kmp_free_team_arrays(team);
5601   if (team->t.t_argv != &team->t.t_inline_argv[0])
5602     __kmp_free((void *)team->t.t_argv);
5603   __kmp_free(team);
5604 
5605   KMP_MB();
5606   return next_pool;
5607 }
5608 
5609 // Free the thread.  Don't reap it, just place it on the pool of available
5610 // threads.
5611 //
5612 // Changes for Quad issue 527845: We need a predictable OMP tid <-> gtid
5613 // binding for the affinity mechanism to be useful.
5614 //
5615 // Now, we always keep the free list (__kmp_thread_pool) sorted by gtid.
5616 // However, we want to avoid a potential performance problem by always
5617 // scanning through the list to find the correct point at which to insert
5618 // the thread (potential N**2 behavior).  To do this we keep track of the
5619 // last place a thread struct was inserted (__kmp_thread_pool_insert_pt).
5620 // With single-level parallelism, threads will always be added to the tail
5621 // of the list, kept track of by __kmp_thread_pool_insert_pt.  With nested
5622 // parallelism, all bets are off and we may need to scan through the entire
5623 // free list.
5624 //
5625 // This change also has a potentially large performance benefit, for some
5626 // applications.  Previously, as threads were freed from the hot team, they
5627 // would be placed back on the free list in inverse order.  If the hot team
5628 // grew back to it's original size, then the freed thread would be placed
5629 // back on the hot team in reverse order.  This could cause bad cache
5630 // locality problems on programs where the size of the hot team regularly
5631 // grew and shrunk.
5632 //
5633 // Now, for single-level parallelism, the OMP tid is always == gtid.
5634 void __kmp_free_thread(kmp_info_t *this_th) {
5635   int gtid;
5636   kmp_info_t **scan;
5637 
5638   KA_TRACE(20, ("__kmp_free_thread: T#%d putting T#%d back on free pool.\n",
5639                 __kmp_get_gtid(), this_th->th.th_info.ds.ds_gtid));
5640 
5641   KMP_DEBUG_ASSERT(this_th);
5642 
5643   // When moving thread to pool, switch thread to wait on own b_go flag, and
5644   // uninitialized (NULL team).
5645   int b;
5646   kmp_balign_t *balign = this_th->th.th_bar;
5647   for (b = 0; b < bs_last_barrier; ++b) {
5648     if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG)
5649       balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG;
5650     balign[b].bb.team = NULL;
5651     balign[b].bb.leaf_kids = 0;
5652   }
5653   this_th->th.th_task_state = 0;
5654   this_th->th.th_reap_state = KMP_SAFE_TO_REAP;
5655 
5656   /* put thread back on the free pool */
5657   TCW_PTR(this_th->th.th_team, NULL);
5658   TCW_PTR(this_th->th.th_root, NULL);
5659   TCW_PTR(this_th->th.th_dispatch, NULL); /* NOT NEEDED */
5660 
5661   while (this_th->th.th_cg_roots) {
5662     this_th->th.th_cg_roots->cg_nthreads--;
5663     KA_TRACE(100, ("__kmp_free_thread: Thread %p decrement cg_nthreads on node"
5664                    " %p of thread  %p to %d\n",
5665                    this_th, this_th->th.th_cg_roots,
5666                    this_th->th.th_cg_roots->cg_root,
5667                    this_th->th.th_cg_roots->cg_nthreads));
5668     kmp_cg_root_t *tmp = this_th->th.th_cg_roots;
5669     if (tmp->cg_root == this_th) { // Thread is a cg_root
5670       KMP_DEBUG_ASSERT(tmp->cg_nthreads == 0);
5671       KA_TRACE(
5672           5, ("__kmp_free_thread: Thread %p freeing node %p\n", this_th, tmp));
5673       this_th->th.th_cg_roots = tmp->up;
5674       __kmp_free(tmp);
5675     } else { // Worker thread
5676       if (tmp->cg_nthreads == 0) { // last thread leaves contention group
5677         __kmp_free(tmp);
5678       }
5679       this_th->th.th_cg_roots = NULL;
5680       break;
5681     }
5682   }
5683 
5684   /* If the implicit task assigned to this thread can be used by other threads
5685    * -> multiple threads can share the data and try to free the task at
5686    * __kmp_reap_thread at exit. This duplicate use of the task data can happen
5687    * with higher probability when hot team is disabled but can occurs even when
5688    * the hot team is enabled */
5689   __kmp_free_implicit_task(this_th);
5690   this_th->th.th_current_task = NULL;
5691 
5692   // If the __kmp_thread_pool_insert_pt is already past the new insert
5693   // point, then we need to re-scan the entire list.
5694   gtid = this_th->th.th_info.ds.ds_gtid;
5695   if (__kmp_thread_pool_insert_pt != NULL) {
5696     KMP_DEBUG_ASSERT(__kmp_thread_pool != NULL);
5697     if (__kmp_thread_pool_insert_pt->th.th_info.ds.ds_gtid > gtid) {
5698       __kmp_thread_pool_insert_pt = NULL;
5699     }
5700   }
5701 
5702   // Scan down the list to find the place to insert the thread.
5703   // scan is the address of a link in the list, possibly the address of
5704   // __kmp_thread_pool itself.
5705   //
5706   // In the absence of nested parallelism, the for loop will have 0 iterations.
5707   if (__kmp_thread_pool_insert_pt != NULL) {
5708     scan = &(__kmp_thread_pool_insert_pt->th.th_next_pool);
5709   } else {
5710     scan = CCAST(kmp_info_t **, &__kmp_thread_pool);
5711   }
5712   for (; (*scan != NULL) && ((*scan)->th.th_info.ds.ds_gtid < gtid);
5713        scan = &((*scan)->th.th_next_pool))
5714     ;
5715 
5716   // Insert the new element on the list, and set __kmp_thread_pool_insert_pt
5717   // to its address.
5718   TCW_PTR(this_th->th.th_next_pool, *scan);
5719   __kmp_thread_pool_insert_pt = *scan = this_th;
5720   KMP_DEBUG_ASSERT((this_th->th.th_next_pool == NULL) ||
5721                    (this_th->th.th_info.ds.ds_gtid <
5722                     this_th->th.th_next_pool->th.th_info.ds.ds_gtid));
5723   TCW_4(this_th->th.th_in_pool, TRUE);
5724   __kmp_suspend_initialize_thread(this_th);
5725   __kmp_lock_suspend_mx(this_th);
5726   if (this_th->th.th_active == TRUE) {
5727     KMP_ATOMIC_INC(&__kmp_thread_pool_active_nth);
5728     this_th->th.th_active_in_pool = TRUE;
5729   }
5730 #if KMP_DEBUG
5731   else {
5732     KMP_DEBUG_ASSERT(this_th->th.th_active_in_pool == FALSE);
5733   }
5734 #endif
5735   __kmp_unlock_suspend_mx(this_th);
5736 
5737   TCW_4(__kmp_nth, __kmp_nth - 1);
5738 
5739 #ifdef KMP_ADJUST_BLOCKTIME
5740   /* Adjust blocktime back to user setting or default if necessary */
5741   /* Middle initialization might never have occurred                */
5742   if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
5743     KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
5744     if (__kmp_nth <= __kmp_avail_proc) {
5745       __kmp_zero_bt = FALSE;
5746     }
5747   }
5748 #endif /* KMP_ADJUST_BLOCKTIME */
5749 
5750   KMP_MB();
5751 }
5752 
5753 /* ------------------------------------------------------------------------ */
5754 
5755 void *__kmp_launch_thread(kmp_info_t *this_thr) {
5756 #if OMP_PROFILING_SUPPORT
5757   ProfileTraceFile = getenv("LIBOMPTARGET_PROFILE");
5758   // TODO: add a configuration option for time granularity
5759   if (ProfileTraceFile)
5760     llvm::timeTraceProfilerInitialize(500 /* us */, "libomptarget");
5761 #endif
5762 
5763   int gtid = this_thr->th.th_info.ds.ds_gtid;
5764   /*    void                 *stack_data;*/
5765   kmp_team_t **volatile pteam;
5766 
5767   KMP_MB();
5768   KA_TRACE(10, ("__kmp_launch_thread: T#%d start\n", gtid));
5769 
5770   if (__kmp_env_consistency_check) {
5771     this_thr->th.th_cons = __kmp_allocate_cons_stack(gtid); // ATT: Memory leak?
5772   }
5773 
5774 #if OMPT_SUPPORT
5775   ompt_data_t *thread_data;
5776   if (ompt_enabled.enabled) {
5777     thread_data = &(this_thr->th.ompt_thread_info.thread_data);
5778     *thread_data = ompt_data_none;
5779 
5780     this_thr->th.ompt_thread_info.state = ompt_state_overhead;
5781     this_thr->th.ompt_thread_info.wait_id = 0;
5782     this_thr->th.ompt_thread_info.idle_frame = OMPT_GET_FRAME_ADDRESS(0);
5783     this_thr->th.ompt_thread_info.parallel_flags = 0;
5784     if (ompt_enabled.ompt_callback_thread_begin) {
5785       ompt_callbacks.ompt_callback(ompt_callback_thread_begin)(
5786           ompt_thread_worker, thread_data);
5787     }
5788     this_thr->th.ompt_thread_info.state = ompt_state_idle;
5789   }
5790 #endif
5791 
5792   /* This is the place where threads wait for work */
5793   while (!TCR_4(__kmp_global.g.g_done)) {
5794     KMP_DEBUG_ASSERT(this_thr == __kmp_threads[gtid]);
5795     KMP_MB();
5796 
5797     /* wait for work to do */
5798     KA_TRACE(20, ("__kmp_launch_thread: T#%d waiting for work\n", gtid));
5799 
5800     /* No tid yet since not part of a team */
5801     __kmp_fork_barrier(gtid, KMP_GTID_DNE);
5802 
5803 #if OMPT_SUPPORT
5804     if (ompt_enabled.enabled) {
5805       this_thr->th.ompt_thread_info.state = ompt_state_overhead;
5806     }
5807 #endif
5808 
5809     pteam = &this_thr->th.th_team;
5810 
5811     /* have we been allocated? */
5812     if (TCR_SYNC_PTR(*pteam) && !TCR_4(__kmp_global.g.g_done)) {
5813       /* we were just woken up, so run our new task */
5814       if (TCR_SYNC_PTR((*pteam)->t.t_pkfn) != NULL) {
5815         int rc;
5816         KA_TRACE(20,
5817                  ("__kmp_launch_thread: T#%d(%d:%d) invoke microtask = %p\n",
5818                   gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid),
5819                   (*pteam)->t.t_pkfn));
5820 
5821         updateHWFPControl(*pteam);
5822 
5823 #if OMPT_SUPPORT
5824         if (ompt_enabled.enabled) {
5825           this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
5826         }
5827 #endif
5828 
5829         rc = (*pteam)->t.t_invoke(gtid);
5830         KMP_ASSERT(rc);
5831 
5832         KMP_MB();
5833         KA_TRACE(20, ("__kmp_launch_thread: T#%d(%d:%d) done microtask = %p\n",
5834                       gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid),
5835                       (*pteam)->t.t_pkfn));
5836       }
5837 #if OMPT_SUPPORT
5838       if (ompt_enabled.enabled) {
5839         /* no frame set while outside task */
5840         __ompt_get_task_info_object(0)->frame.exit_frame = ompt_data_none;
5841 
5842         this_thr->th.ompt_thread_info.state = ompt_state_overhead;
5843       }
5844 #endif
5845       /* join barrier after parallel region */
5846       __kmp_join_barrier(gtid);
5847     }
5848   }
5849   TCR_SYNC_PTR((intptr_t)__kmp_global.g.g_done);
5850 
5851 #if OMPT_SUPPORT
5852   if (ompt_enabled.ompt_callback_thread_end) {
5853     ompt_callbacks.ompt_callback(ompt_callback_thread_end)(thread_data);
5854   }
5855 #endif
5856 
5857   this_thr->th.th_task_team = NULL;
5858   /* run the destructors for the threadprivate data for this thread */
5859   __kmp_common_destroy_gtid(gtid);
5860 
5861   KA_TRACE(10, ("__kmp_launch_thread: T#%d done\n", gtid));
5862   KMP_MB();
5863 
5864 #if OMP_PROFILING_SUPPORT
5865   llvm::timeTraceProfilerFinishThread();
5866 #endif
5867   return this_thr;
5868 }
5869 
5870 /* ------------------------------------------------------------------------ */
5871 
5872 void __kmp_internal_end_dest(void *specific_gtid) {
5873   // Make sure no significant bits are lost
5874   int gtid;
5875   __kmp_type_convert((kmp_intptr_t)specific_gtid - 1, &gtid);
5876 
5877   KA_TRACE(30, ("__kmp_internal_end_dest: T#%d\n", gtid));
5878   /* NOTE: the gtid is stored as gitd+1 in the thread-local-storage
5879    * this is because 0 is reserved for the nothing-stored case */
5880 
5881   __kmp_internal_end_thread(gtid);
5882 }
5883 
5884 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB
5885 
5886 __attribute__((destructor)) void __kmp_internal_end_dtor(void) {
5887   __kmp_internal_end_atexit();
5888 }
5889 
5890 #endif
5891 
5892 /* [Windows] josh: when the atexit handler is called, there may still be more
5893    than one thread alive */
5894 void __kmp_internal_end_atexit(void) {
5895   KA_TRACE(30, ("__kmp_internal_end_atexit\n"));
5896   /* [Windows]
5897      josh: ideally, we want to completely shutdown the library in this atexit
5898      handler, but stat code that depends on thread specific data for gtid fails
5899      because that data becomes unavailable at some point during the shutdown, so
5900      we call __kmp_internal_end_thread instead. We should eventually remove the
5901      dependency on __kmp_get_specific_gtid in the stat code and use
5902      __kmp_internal_end_library to cleanly shutdown the library.
5903 
5904      // TODO: Can some of this comment about GVS be removed?
5905      I suspect that the offending stat code is executed when the calling thread
5906      tries to clean up a dead root thread's data structures, resulting in GVS
5907      code trying to close the GVS structures for that thread, but since the stat
5908      code uses __kmp_get_specific_gtid to get the gtid with the assumption that
5909      the calling thread is cleaning up itself instead of another thread, it get
5910      confused. This happens because allowing a thread to unregister and cleanup
5911      another thread is a recent modification for addressing an issue.
5912      Based on the current design (20050722), a thread may end up
5913      trying to unregister another thread only if thread death does not trigger
5914      the calling of __kmp_internal_end_thread.  For Linux* OS, there is the
5915      thread specific data destructor function to detect thread death. For
5916      Windows dynamic, there is DllMain(THREAD_DETACH). For Windows static, there
5917      is nothing.  Thus, the workaround is applicable only for Windows static
5918      stat library. */
5919   __kmp_internal_end_library(-1);
5920 #if KMP_OS_WINDOWS
5921   __kmp_close_console();
5922 #endif
5923 }
5924 
5925 static void __kmp_reap_thread(kmp_info_t *thread, int is_root) {
5926   // It is assumed __kmp_forkjoin_lock is acquired.
5927 
5928   int gtid;
5929 
5930   KMP_DEBUG_ASSERT(thread != NULL);
5931 
5932   gtid = thread->th.th_info.ds.ds_gtid;
5933 
5934   if (!is_root) {
5935     if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
5936       /* Assume the threads are at the fork barrier here */
5937       KA_TRACE(
5938           20, ("__kmp_reap_thread: releasing T#%d from fork barrier for reap\n",
5939                gtid));
5940       /* Need release fence here to prevent seg faults for tree forkjoin barrier
5941        * (GEH) */
5942       ANNOTATE_HAPPENS_BEFORE(thread);
5943       kmp_flag_64<> flag(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go,
5944                          thread);
5945       __kmp_release_64(&flag);
5946     }
5947 
5948     // Terminate OS thread.
5949     __kmp_reap_worker(thread);
5950 
5951     // The thread was killed asynchronously.  If it was actively
5952     // spinning in the thread pool, decrement the global count.
5953     //
5954     // There is a small timing hole here - if the worker thread was just waking
5955     // up after sleeping in the pool, had reset it's th_active_in_pool flag but
5956     // not decremented the global counter __kmp_thread_pool_active_nth yet, then
5957     // the global counter might not get updated.
5958     //
5959     // Currently, this can only happen as the library is unloaded,
5960     // so there are no harmful side effects.
5961     if (thread->th.th_active_in_pool) {
5962       thread->th.th_active_in_pool = FALSE;
5963       KMP_ATOMIC_DEC(&__kmp_thread_pool_active_nth);
5964       KMP_DEBUG_ASSERT(__kmp_thread_pool_active_nth >= 0);
5965     }
5966   }
5967 
5968   __kmp_free_implicit_task(thread);
5969 
5970 // Free the fast memory for tasking
5971 #if USE_FAST_MEMORY
5972   __kmp_free_fast_memory(thread);
5973 #endif /* USE_FAST_MEMORY */
5974 
5975   __kmp_suspend_uninitialize_thread(thread);
5976 
5977   KMP_DEBUG_ASSERT(__kmp_threads[gtid] == thread);
5978   TCW_SYNC_PTR(__kmp_threads[gtid], NULL);
5979 
5980   --__kmp_all_nth;
5981 // __kmp_nth was decremented when thread is added to the pool.
5982 
5983 #ifdef KMP_ADJUST_BLOCKTIME
5984   /* Adjust blocktime back to user setting or default if necessary */
5985   /* Middle initialization might never have occurred                */
5986   if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
5987     KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
5988     if (__kmp_nth <= __kmp_avail_proc) {
5989       __kmp_zero_bt = FALSE;
5990     }
5991   }
5992 #endif /* KMP_ADJUST_BLOCKTIME */
5993 
5994   /* free the memory being used */
5995   if (__kmp_env_consistency_check) {
5996     if (thread->th.th_cons) {
5997       __kmp_free_cons_stack(thread->th.th_cons);
5998       thread->th.th_cons = NULL;
5999     }
6000   }
6001 
6002   if (thread->th.th_pri_common != NULL) {
6003     __kmp_free(thread->th.th_pri_common);
6004     thread->th.th_pri_common = NULL;
6005   }
6006 
6007   if (thread->th.th_task_state_memo_stack != NULL) {
6008     __kmp_free(thread->th.th_task_state_memo_stack);
6009     thread->th.th_task_state_memo_stack = NULL;
6010   }
6011 
6012 #if KMP_USE_BGET
6013   if (thread->th.th_local.bget_data != NULL) {
6014     __kmp_finalize_bget(thread);
6015   }
6016 #endif
6017 
6018 #if KMP_AFFINITY_SUPPORTED
6019   if (thread->th.th_affin_mask != NULL) {
6020     KMP_CPU_FREE(thread->th.th_affin_mask);
6021     thread->th.th_affin_mask = NULL;
6022   }
6023 #endif /* KMP_AFFINITY_SUPPORTED */
6024 
6025 #if KMP_USE_HIER_SCHED
6026   if (thread->th.th_hier_bar_data != NULL) {
6027     __kmp_free(thread->th.th_hier_bar_data);
6028     thread->th.th_hier_bar_data = NULL;
6029   }
6030 #endif
6031 
6032   __kmp_reap_team(thread->th.th_serial_team);
6033   thread->th.th_serial_team = NULL;
6034   __kmp_free(thread);
6035 
6036   KMP_MB();
6037 
6038 } // __kmp_reap_thread
6039 
6040 static void __kmp_internal_end(void) {
6041   int i;
6042 
6043   /* First, unregister the library */
6044   __kmp_unregister_library();
6045 
6046 #if KMP_OS_WINDOWS
6047   /* In Win static library, we can't tell when a root actually dies, so we
6048      reclaim the data structures for any root threads that have died but not
6049      unregistered themselves, in order to shut down cleanly.
6050      In Win dynamic library we also can't tell when a thread dies.  */
6051   __kmp_reclaim_dead_roots(); // AC: moved here to always clean resources of
6052 // dead roots
6053 #endif
6054 
6055   for (i = 0; i < __kmp_threads_capacity; i++)
6056     if (__kmp_root[i])
6057       if (__kmp_root[i]->r.r_active)
6058         break;
6059   KMP_MB(); /* Flush all pending memory write invalidates.  */
6060   TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6061 
6062   if (i < __kmp_threads_capacity) {
6063 #if KMP_USE_MONITOR
6064     // 2009-09-08 (lev): Other alive roots found. Why do we kill the monitor??
6065     KMP_MB(); /* Flush all pending memory write invalidates.  */
6066 
6067     // Need to check that monitor was initialized before reaping it. If we are
6068     // called form __kmp_atfork_child (which sets __kmp_init_parallel = 0), then
6069     // __kmp_monitor will appear to contain valid data, but it is only valid in
6070     // the parent process, not the child.
6071     // New behavior (201008): instead of keying off of the flag
6072     // __kmp_init_parallel, the monitor thread creation is keyed off
6073     // of the new flag __kmp_init_monitor.
6074     __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
6075     if (TCR_4(__kmp_init_monitor)) {
6076       __kmp_reap_monitor(&__kmp_monitor);
6077       TCW_4(__kmp_init_monitor, 0);
6078     }
6079     __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
6080     KA_TRACE(10, ("__kmp_internal_end: monitor reaped\n"));
6081 #endif // KMP_USE_MONITOR
6082   } else {
6083 /* TODO move this to cleanup code */
6084 #ifdef KMP_DEBUG
6085     /* make sure that everything has properly ended */
6086     for (i = 0; i < __kmp_threads_capacity; i++) {
6087       if (__kmp_root[i]) {
6088         //                    KMP_ASSERT( ! KMP_UBER_GTID( i ) );         // AC:
6089         //                    there can be uber threads alive here
6090         KMP_ASSERT(!__kmp_root[i]->r.r_active); // TODO: can they be active?
6091       }
6092     }
6093 #endif
6094 
6095     KMP_MB();
6096 
6097     // Reap the worker threads.
6098     // This is valid for now, but be careful if threads are reaped sooner.
6099     while (__kmp_thread_pool != NULL) { // Loop thru all the thread in the pool.
6100       // Get the next thread from the pool.
6101       kmp_info_t *thread = CCAST(kmp_info_t *, __kmp_thread_pool);
6102       __kmp_thread_pool = thread->th.th_next_pool;
6103       // Reap it.
6104       KMP_DEBUG_ASSERT(thread->th.th_reap_state == KMP_SAFE_TO_REAP);
6105       thread->th.th_next_pool = NULL;
6106       thread->th.th_in_pool = FALSE;
6107       __kmp_reap_thread(thread, 0);
6108     }
6109     __kmp_thread_pool_insert_pt = NULL;
6110 
6111     // Reap teams.
6112     while (__kmp_team_pool != NULL) { // Loop thru all the teams in the pool.
6113       // Get the next team from the pool.
6114       kmp_team_t *team = CCAST(kmp_team_t *, __kmp_team_pool);
6115       __kmp_team_pool = team->t.t_next_pool;
6116       // Reap it.
6117       team->t.t_next_pool = NULL;
6118       __kmp_reap_team(team);
6119     }
6120 
6121     __kmp_reap_task_teams();
6122 
6123 #if KMP_OS_UNIX
6124     // Threads that are not reaped should not access any resources since they
6125     // are going to be deallocated soon, so the shutdown sequence should wait
6126     // until all threads either exit the final spin-waiting loop or begin
6127     // sleeping after the given blocktime.
6128     for (i = 0; i < __kmp_threads_capacity; i++) {
6129       kmp_info_t *thr = __kmp_threads[i];
6130       while (thr && KMP_ATOMIC_LD_ACQ(&thr->th.th_blocking))
6131         KMP_CPU_PAUSE();
6132     }
6133 #endif
6134 
6135     for (i = 0; i < __kmp_threads_capacity; ++i) {
6136       // TBD: Add some checking...
6137       // Something like KMP_DEBUG_ASSERT( __kmp_thread[ i ] == NULL );
6138     }
6139 
6140     /* Make sure all threadprivate destructors get run by joining with all
6141        worker threads before resetting this flag */
6142     TCW_SYNC_4(__kmp_init_common, FALSE);
6143 
6144     KA_TRACE(10, ("__kmp_internal_end: all workers reaped\n"));
6145     KMP_MB();
6146 
6147 #if KMP_USE_MONITOR
6148     // See note above: One of the possible fixes for CQ138434 / CQ140126
6149     //
6150     // FIXME: push both code fragments down and CSE them?
6151     // push them into __kmp_cleanup() ?
6152     __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
6153     if (TCR_4(__kmp_init_monitor)) {
6154       __kmp_reap_monitor(&__kmp_monitor);
6155       TCW_4(__kmp_init_monitor, 0);
6156     }
6157     __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
6158     KA_TRACE(10, ("__kmp_internal_end: monitor reaped\n"));
6159 #endif
6160   } /* else !__kmp_global.t_active */
6161   TCW_4(__kmp_init_gtid, FALSE);
6162   KMP_MB(); /* Flush all pending memory write invalidates.  */
6163 
6164   __kmp_cleanup();
6165 #if OMPT_SUPPORT
6166   ompt_fini();
6167 #endif
6168 }
6169 
6170 void __kmp_internal_end_library(int gtid_req) {
6171   /* if we have already cleaned up, don't try again, it wouldn't be pretty */
6172   /* this shouldn't be a race condition because __kmp_internal_end() is the
6173      only place to clear __kmp_serial_init */
6174   /* we'll check this later too, after we get the lock */
6175   // 2009-09-06: We do not set g_abort without setting g_done. This check looks
6176   // redundant, because the next check will work in any case.
6177   if (__kmp_global.g.g_abort) {
6178     KA_TRACE(11, ("__kmp_internal_end_library: abort, exiting\n"));
6179     /* TODO abort? */
6180     return;
6181   }
6182   if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6183     KA_TRACE(10, ("__kmp_internal_end_library: already finished\n"));
6184     return;
6185   }
6186 
6187   KMP_MB(); /* Flush all pending memory write invalidates.  */
6188   /* find out who we are and what we should do */
6189   {
6190     int gtid = (gtid_req >= 0) ? gtid_req : __kmp_gtid_get_specific();
6191     KA_TRACE(
6192         10, ("__kmp_internal_end_library: enter T#%d  (%d)\n", gtid, gtid_req));
6193     if (gtid == KMP_GTID_SHUTDOWN) {
6194       KA_TRACE(10, ("__kmp_internal_end_library: !__kmp_init_runtime, system "
6195                     "already shutdown\n"));
6196       return;
6197     } else if (gtid == KMP_GTID_MONITOR) {
6198       KA_TRACE(10, ("__kmp_internal_end_library: monitor thread, gtid not "
6199                     "registered, or system shutdown\n"));
6200       return;
6201     } else if (gtid == KMP_GTID_DNE) {
6202       KA_TRACE(10, ("__kmp_internal_end_library: gtid not registered or system "
6203                     "shutdown\n"));
6204       /* we don't know who we are, but we may still shutdown the library */
6205     } else if (KMP_UBER_GTID(gtid)) {
6206       /* unregister ourselves as an uber thread.  gtid is no longer valid */
6207       if (__kmp_root[gtid]->r.r_active) {
6208         __kmp_global.g.g_abort = -1;
6209         TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6210         __kmp_unregister_library();
6211         KA_TRACE(10,
6212                  ("__kmp_internal_end_library: root still active, abort T#%d\n",
6213                   gtid));
6214         return;
6215       } else {
6216         KA_TRACE(
6217             10,
6218             ("__kmp_internal_end_library: unregistering sibling T#%d\n", gtid));
6219         __kmp_unregister_root_current_thread(gtid);
6220       }
6221     } else {
6222 /* worker threads may call this function through the atexit handler, if they
6223  * call exit() */
6224 /* For now, skip the usual subsequent processing and just dump the debug buffer.
6225    TODO: do a thorough shutdown instead */
6226 #ifdef DUMP_DEBUG_ON_EXIT
6227       if (__kmp_debug_buf)
6228         __kmp_dump_debug_buffer();
6229 #endif
6230       // added unregister library call here when we switch to shm linux
6231       // if we don't, it will leave lots of files in /dev/shm
6232       // cleanup shared memory file before exiting.
6233       __kmp_unregister_library();
6234       return;
6235     }
6236   }
6237   /* synchronize the termination process */
6238   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6239 
6240   /* have we already finished */
6241   if (__kmp_global.g.g_abort) {
6242     KA_TRACE(10, ("__kmp_internal_end_library: abort, exiting\n"));
6243     /* TODO abort? */
6244     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6245     return;
6246   }
6247   if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6248     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6249     return;
6250   }
6251 
6252   /* We need this lock to enforce mutex between this reading of
6253      __kmp_threads_capacity and the writing by __kmp_register_root.
6254      Alternatively, we can use a counter of roots that is atomically updated by
6255      __kmp_get_global_thread_id_reg, __kmp_do_serial_initialize and
6256      __kmp_internal_end_*.  */
6257   __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
6258 
6259   /* now we can safely conduct the actual termination */
6260   __kmp_internal_end();
6261 
6262   __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6263   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6264 
6265   KA_TRACE(10, ("__kmp_internal_end_library: exit\n"));
6266 
6267 #ifdef DUMP_DEBUG_ON_EXIT
6268   if (__kmp_debug_buf)
6269     __kmp_dump_debug_buffer();
6270 #endif
6271 
6272 #if KMP_OS_WINDOWS
6273   __kmp_close_console();
6274 #endif
6275 
6276   __kmp_fini_allocator();
6277 
6278 } // __kmp_internal_end_library
6279 
6280 void __kmp_internal_end_thread(int gtid_req) {
6281   int i;
6282 
6283   /* if we have already cleaned up, don't try again, it wouldn't be pretty */
6284   /* this shouldn't be a race condition because __kmp_internal_end() is the
6285    * only place to clear __kmp_serial_init */
6286   /* we'll check this later too, after we get the lock */
6287   // 2009-09-06: We do not set g_abort without setting g_done. This check looks
6288   // redundant, because the next check will work in any case.
6289   if (__kmp_global.g.g_abort) {
6290     KA_TRACE(11, ("__kmp_internal_end_thread: abort, exiting\n"));
6291     /* TODO abort? */
6292     return;
6293   }
6294   if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6295     KA_TRACE(10, ("__kmp_internal_end_thread: already finished\n"));
6296     return;
6297   }
6298 
6299   // If hidden helper team has been initialized, we need to deinit it
6300   if (TCR_4(__kmp_init_hidden_helper)) {
6301     TCW_SYNC_4(__kmp_hidden_helper_team_done, TRUE);
6302     // First release the main thread to let it continue its work
6303     __kmp_hidden_helper_main_thread_release();
6304     // Wait until the hidden helper team has been destroyed
6305     __kmp_hidden_helper_threads_deinitz_wait();
6306   }
6307 
6308   KMP_MB(); /* Flush all pending memory write invalidates.  */
6309 
6310   /* find out who we are and what we should do */
6311   {
6312     int gtid = (gtid_req >= 0) ? gtid_req : __kmp_gtid_get_specific();
6313     KA_TRACE(10,
6314              ("__kmp_internal_end_thread: enter T#%d  (%d)\n", gtid, gtid_req));
6315     if (gtid == KMP_GTID_SHUTDOWN) {
6316       KA_TRACE(10, ("__kmp_internal_end_thread: !__kmp_init_runtime, system "
6317                     "already shutdown\n"));
6318       return;
6319     } else if (gtid == KMP_GTID_MONITOR) {
6320       KA_TRACE(10, ("__kmp_internal_end_thread: monitor thread, gtid not "
6321                     "registered, or system shutdown\n"));
6322       return;
6323     } else if (gtid == KMP_GTID_DNE) {
6324       KA_TRACE(10, ("__kmp_internal_end_thread: gtid not registered or system "
6325                     "shutdown\n"));
6326       return;
6327       /* we don't know who we are */
6328     } else if (KMP_UBER_GTID(gtid)) {
6329       /* unregister ourselves as an uber thread.  gtid is no longer valid */
6330       if (__kmp_root[gtid]->r.r_active) {
6331         __kmp_global.g.g_abort = -1;
6332         TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6333         KA_TRACE(10,
6334                  ("__kmp_internal_end_thread: root still active, abort T#%d\n",
6335                   gtid));
6336         return;
6337       } else {
6338         KA_TRACE(10, ("__kmp_internal_end_thread: unregistering sibling T#%d\n",
6339                       gtid));
6340         __kmp_unregister_root_current_thread(gtid);
6341       }
6342     } else {
6343       /* just a worker thread, let's leave */
6344       KA_TRACE(10, ("__kmp_internal_end_thread: worker thread T#%d\n", gtid));
6345 
6346       if (gtid >= 0) {
6347         __kmp_threads[gtid]->th.th_task_team = NULL;
6348       }
6349 
6350       KA_TRACE(10,
6351                ("__kmp_internal_end_thread: worker thread done, exiting T#%d\n",
6352                 gtid));
6353       return;
6354     }
6355   }
6356 #if KMP_DYNAMIC_LIB
6357   if (__kmp_pause_status != kmp_hard_paused)
6358   // AC: lets not shutdown the dynamic library at the exit of uber thread,
6359   // because we will better shutdown later in the library destructor.
6360   {
6361     KA_TRACE(10, ("__kmp_internal_end_thread: exiting T#%d\n", gtid_req));
6362     return;
6363   }
6364 #endif
6365   /* synchronize the termination process */
6366   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6367 
6368   /* have we already finished */
6369   if (__kmp_global.g.g_abort) {
6370     KA_TRACE(10, ("__kmp_internal_end_thread: abort, exiting\n"));
6371     /* TODO abort? */
6372     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6373     return;
6374   }
6375   if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6376     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6377     return;
6378   }
6379 
6380   /* We need this lock to enforce mutex between this reading of
6381      __kmp_threads_capacity and the writing by __kmp_register_root.
6382      Alternatively, we can use a counter of roots that is atomically updated by
6383      __kmp_get_global_thread_id_reg, __kmp_do_serial_initialize and
6384      __kmp_internal_end_*.  */
6385 
6386   /* should we finish the run-time?  are all siblings done? */
6387   __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
6388 
6389   for (i = 0; i < __kmp_threads_capacity; ++i) {
6390     if (KMP_UBER_GTID(i)) {
6391       KA_TRACE(
6392           10,
6393           ("__kmp_internal_end_thread: remaining sibling task: gtid==%d\n", i));
6394       __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6395       __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6396       return;
6397     }
6398   }
6399 
6400   /* now we can safely conduct the actual termination */
6401 
6402   __kmp_internal_end();
6403 
6404   __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6405   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6406 
6407   KA_TRACE(10, ("__kmp_internal_end_thread: exit T#%d\n", gtid_req));
6408 
6409 #ifdef DUMP_DEBUG_ON_EXIT
6410   if (__kmp_debug_buf)
6411     __kmp_dump_debug_buffer();
6412 #endif
6413 } // __kmp_internal_end_thread
6414 
6415 // -----------------------------------------------------------------------------
6416 // Library registration stuff.
6417 
// Random value used to indicate library initialization. It is assigned in
// __kmp_register_library_startup() (0xCAFE0000 combined with the low 16 bits
// of the system time) and reset to 0 by __kmp_unregister_library().
static long __kmp_registration_flag = 0;
// Value to be saved in env var __KMP_REGISTERED_LIB_<pid> (or in the shared
// memory object of the same name). It is formatted as "%p-%lx-%s" from the
// address of __kmp_registration_flag, its value, and KMP_LIBRARY_FILE, and is
// freed and reset to NULL by __kmp_unregister_library().
static char *__kmp_registration_str = NULL;
6422 
6423 static inline char *__kmp_reg_status_name() {
6424 /* On RHEL 3u5 if linked statically, getpid() returns different values in
6425    each thread. If registration and unregistration go in different threads
6426    (omp_misc_other_root_exit.cpp test case), the name of registered_lib_env
6427    env var can not be found, because the name will contain different pid. */
6428 // macOS* complains about name being too long with additional getuid()
6429 #if KMP_OS_UNIX && !KMP_OS_DARWIN && KMP_DYNAMIC_LIB
6430   return __kmp_str_format("__KMP_REGISTERED_LIB_%d_%d", (int)getpid(),
6431                           (int)getuid());
6432 #else
6433   return __kmp_str_format("__KMP_REGISTERED_LIB_%d", (int)getpid());
6434 #endif
6435 } // __kmp_reg_status_get
6436 
6437 void __kmp_register_library_startup(void) {
6438 
6439   char *name = __kmp_reg_status_name(); // Name of the environment variable.
6440   int done = 0;
6441   union {
6442     double dtime;
6443     long ltime;
6444   } time;
6445 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
6446   __kmp_initialize_system_tick();
6447 #endif
6448   __kmp_read_system_time(&time.dtime);
6449   __kmp_registration_flag = 0xCAFE0000L | (time.ltime & 0x0000FFFFL);
6450   __kmp_registration_str =
6451       __kmp_str_format("%p-%lx-%s", &__kmp_registration_flag,
6452                        __kmp_registration_flag, KMP_LIBRARY_FILE);
6453 
6454   KA_TRACE(50, ("__kmp_register_library_startup: %s=\"%s\"\n", name,
6455                 __kmp_registration_str));
6456 
6457   while (!done) {
6458 
6459     char *value = NULL; // Actual value of the environment variable.
6460 
6461 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB // shared memory is with dynamic library
6462     char *shm_name = __kmp_str_format("/%s", name);
6463     int shm_preexist = 0;
6464     char *data1;
6465     int fd1 = shm_open(shm_name, O_CREAT | O_EXCL | O_RDWR, 0666);
6466     if ((fd1 == -1) && (errno == EEXIST)) {
6467       // file didn't open because it already exists.
6468       // try opening existing file
6469       fd1 = shm_open(shm_name, O_RDWR, 0666);
6470       if (fd1 == -1) { // file didn't open
6471         // error out here
6472         __kmp_fatal(KMP_MSG(FunctionError, "Can't open SHM"), KMP_ERR(0),
6473                     __kmp_msg_null);
6474       } else {
6475         // able to open existing file
6476         shm_preexist = 1;
6477       }
6478     } else if (fd1 == -1) { // SHM didn't open; it was due to error other than
6479       // already exists.
6480       // error out here.
6481       __kmp_fatal(KMP_MSG(FunctionError, "Can't open SHM2"), KMP_ERR(errno),
6482                   __kmp_msg_null);
6483     }
6484     if (shm_preexist == 0) {
6485       // we created SHM now set size
6486       if (ftruncate(fd1, SHM_SIZE) == -1) {
6487         // error occured setting size;
6488         __kmp_fatal(KMP_MSG(FunctionError, "Can't set size of SHM"),
6489                     KMP_ERR(errno), __kmp_msg_null);
6490       }
6491     }
6492     data1 =
6493         (char *)mmap(0, SHM_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED, fd1, 0);
6494     if (data1 == MAP_FAILED) {
6495       // failed to map shared memory
6496       __kmp_fatal(KMP_MSG(FunctionError, "Can't map SHM"), KMP_ERR(errno),
6497                   __kmp_msg_null);
6498     }
6499     if (shm_preexist == 0) { // set data to SHM, set value
6500       KMP_STRCPY_S(data1, SHM_SIZE, __kmp_registration_str);
6501     }
6502     // Read value from either what we just wrote or existing file.
6503     value = __kmp_str_format("%s", data1); // read value from SHM
6504     munmap(data1, SHM_SIZE);
6505     close(fd1);
6506 #else // Windows and unix with static library
6507     // Set environment variable, but do not overwrite if it is exist.
6508     __kmp_env_set(name, __kmp_registration_str, 0);
6509     // read value to see if it got set
6510     value = __kmp_env_get(name);
6511 #endif
6512 
6513     if (value != NULL && strcmp(value, __kmp_registration_str) == 0) {
6514       done = 1; // Ok, environment variable set successfully, exit the loop.
6515     } else {
6516       // Oops. Write failed. Another copy of OpenMP RTL is in memory.
6517       // Check whether it alive or dead.
6518       int neighbor = 0; // 0 -- unknown status, 1 -- alive, 2 -- dead.
6519       char *tail = value;
6520       char *flag_addr_str = NULL;
6521       char *flag_val_str = NULL;
6522       char const *file_name = NULL;
6523       __kmp_str_split(tail, '-', &flag_addr_str, &tail);
6524       __kmp_str_split(tail, '-', &flag_val_str, &tail);
6525       file_name = tail;
6526       if (tail != NULL) {
6527         long *flag_addr = 0;
6528         long flag_val = 0;
6529         KMP_SSCANF(flag_addr_str, "%p", RCAST(void**, &flag_addr));
6530         KMP_SSCANF(flag_val_str, "%lx", &flag_val);
6531         if (flag_addr != 0 && flag_val != 0 && strcmp(file_name, "") != 0) {
6532           // First, check whether environment-encoded address is mapped into
6533           // addr space.
6534           // If so, dereference it to see if it still has the right value.
6535           if (__kmp_is_address_mapped(flag_addr) && *flag_addr == flag_val) {
6536             neighbor = 1;
6537           } else {
6538             // If not, then we know the other copy of the library is no longer
6539             // running.
6540             neighbor = 2;
6541           }
6542         }
6543       }
6544       switch (neighbor) {
6545       case 0: // Cannot parse environment variable -- neighbor status unknown.
6546         // Assume it is the incompatible format of future version of the
6547         // library. Assume the other library is alive.
6548         // WARN( ... ); // TODO: Issue a warning.
6549         file_name = "unknown library";
6550         KMP_FALLTHROUGH();
6551       // Attention! Falling to the next case. That's intentional.
6552       case 1: { // Neighbor is alive.
6553         // Check it is allowed.
6554         char *duplicate_ok = __kmp_env_get("KMP_DUPLICATE_LIB_OK");
6555         if (!__kmp_str_match_true(duplicate_ok)) {
6556           // That's not allowed. Issue fatal error.
6557           __kmp_fatal(KMP_MSG(DuplicateLibrary, KMP_LIBRARY_FILE, file_name),
6558                       KMP_HNT(DuplicateLibrary), __kmp_msg_null);
6559         }
6560         KMP_INTERNAL_FREE(duplicate_ok);
6561         __kmp_duplicate_library_ok = 1;
6562         done = 1; // Exit the loop.
6563       } break;
6564       case 2: { // Neighbor is dead.
6565 
6566 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB // shared memory is with dynamic library
6567         // close shared memory.
6568         shm_unlink(shm_name); // this removes file in /dev/shm
6569 #else
6570         // Clear the variable and try to register library again.
6571         __kmp_env_unset(name);
6572 #endif
6573       } break;
6574       default: { KMP_DEBUG_ASSERT(0); } break;
6575       }
6576     }
6577     KMP_INTERNAL_FREE((void *)value);
6578 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB // shared memory is with dynamic library
6579     KMP_INTERNAL_FREE((void *)shm_name);
6580 #endif
6581   } // while
6582   KMP_INTERNAL_FREE((void *)name);
6583 
6584 } // func __kmp_register_library_startup
6585 
6586 void __kmp_unregister_library(void) {
6587 
6588   char *name = __kmp_reg_status_name();
6589   char *value = NULL;
6590 
6591 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB // shared memory is with dynamic library
6592   char *shm_name = __kmp_str_format("/%s", name);
6593   int fd1 = shm_open(shm_name, O_RDONLY, 0666);
6594   if (fd1 == -1) {
6595     // file did not open. return.
6596     return;
6597   }
6598   char *data1 = (char *)mmap(0, SHM_SIZE, PROT_READ, MAP_SHARED, fd1, 0);
6599   if (data1 != MAP_FAILED) {
6600     value = __kmp_str_format("%s", data1); // read value from SHM
6601     munmap(data1, SHM_SIZE);
6602   }
6603   close(fd1);
6604 #else
6605   value = __kmp_env_get(name);
6606 #endif
6607 
6608   KMP_DEBUG_ASSERT(__kmp_registration_flag != 0);
6609   KMP_DEBUG_ASSERT(__kmp_registration_str != NULL);
6610   if (value != NULL && strcmp(value, __kmp_registration_str) == 0) {
6611 //  Ok, this is our variable. Delete it.
6612 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB // shared memory is with dynamic library
6613     shm_unlink(shm_name); // this removes file in /dev/shm
6614 #else
6615     __kmp_env_unset(name);
6616 #endif
6617   }
6618 
6619 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB // shared memory is with dynamic library
6620   KMP_INTERNAL_FREE(shm_name);
6621 #endif
6622 
6623   KMP_INTERNAL_FREE(__kmp_registration_str);
6624   KMP_INTERNAL_FREE(value);
6625   KMP_INTERNAL_FREE(name);
6626 
6627   __kmp_registration_flag = 0;
6628   __kmp_registration_str = NULL;
6629 
6630 } // __kmp_unregister_library
6631 
6632 // End of Library registration stuff.
6633 // -----------------------------------------------------------------------------
6634 
6635 #if KMP_MIC_SUPPORTED
6636 
6637 static void __kmp_check_mic_type() {
6638   kmp_cpuid_t cpuid_state = {0};
6639   kmp_cpuid_t *cs_p = &cpuid_state;
6640   __kmp_x86_cpuid(1, 0, cs_p);
6641   // We don't support mic1 at the moment
6642   if ((cs_p->eax & 0xff0) == 0xB10) {
6643     __kmp_mic_type = mic2;
6644   } else if ((cs_p->eax & 0xf0ff0) == 0x50670) {
6645     __kmp_mic_type = mic3;
6646   } else {
6647     __kmp_mic_type = non_mic;
6648   }
6649 }
6650 
6651 #endif /* KMP_MIC_SUPPORTED */
6652 
6653 #if KMP_HAVE_UMWAIT
6654 static void __kmp_user_level_mwait_init() {
6655   struct kmp_cpuid buf;
6656   __kmp_x86_cpuid(7, 0, &buf);
6657   __kmp_umwait_enabled = ((buf.ecx >> 5) & 1) && __kmp_user_level_mwait;
6658   KF_TRACE(30, ("__kmp_user_level_mwait_init: __kmp_umwait_enabled = %d\n",
6659                 __kmp_umwait_enabled));
6660 }
6661 #elif KMP_HAVE_MWAIT
#ifndef AT_INTELPHIUSERMWAIT
// Placeholder used only when the build environment does not define
// AT_INTELPHIUSERMWAIT: a spurious, non-existent value that should always
// fail to return anything. It will be replaced with the correct value once
// that is known.
#define AT_INTELPHIUSERMWAIT 10000
#endif
// getauxval() function is available in RHEL7 and SLES12. If a system with an
// earlier OS is used to build the RTL, we'll use the following internal
// function when the entry is not found. The declaration carries
// KMP_WEAK_ATTRIBUTE_EXTERNAL; the fallback definition ignores its argument
// and always returns 0.
unsigned long getauxval(unsigned long) KMP_WEAK_ATTRIBUTE_EXTERNAL;
unsigned long getauxval(unsigned long) { return 0; }
6672 
6673 static void __kmp_user_level_mwait_init() {
6674   // When getauxval() and correct value of AT_INTELPHIUSERMWAIT are available
6675   // use them to find if the user-level mwait is enabled. Otherwise, forcibly
6676   // set __kmp_mwait_enabled=TRUE on Intel MIC if the environment variable
6677   // KMP_USER_LEVEL_MWAIT was set to TRUE.
6678   if (__kmp_mic_type == mic3) {
6679     unsigned long res = getauxval(AT_INTELPHIUSERMWAIT);
6680     if ((res & 0x1) || __kmp_user_level_mwait) {
6681       __kmp_mwait_enabled = TRUE;
6682       if (__kmp_user_level_mwait) {
6683         KMP_INFORM(EnvMwaitWarn);
6684       }
6685     } else {
6686       __kmp_mwait_enabled = FALSE;
6687     }
6688   }
6689   KF_TRACE(30, ("__kmp_user_level_mwait_init: __kmp_mic_type = %d, "
6690                 "__kmp_mwait_enabled = %d\n",
6691                 __kmp_mic_type, __kmp_mwait_enabled));
6692 }
6693 #endif /* KMP_HAVE_UMWAIT */
6694 
6695 static void __kmp_do_serial_initialize(void) {
6696   int i, gtid;
6697   size_t size;
6698 
6699   KA_TRACE(10, ("__kmp_do_serial_initialize: enter\n"));
6700 
6701   KMP_DEBUG_ASSERT(sizeof(kmp_int32) == 4);
6702   KMP_DEBUG_ASSERT(sizeof(kmp_uint32) == 4);
6703   KMP_DEBUG_ASSERT(sizeof(kmp_int64) == 8);
6704   KMP_DEBUG_ASSERT(sizeof(kmp_uint64) == 8);
6705   KMP_DEBUG_ASSERT(sizeof(kmp_intptr_t) == sizeof(void *));
6706 
6707 #if OMPT_SUPPORT
6708   ompt_pre_init();
6709 #endif
6710 
6711   __kmp_validate_locks();
6712 
6713   /* Initialize internal memory allocator */
6714   __kmp_init_allocator();
6715 
6716   /* Register the library startup via an environment variable and check to see
6717      whether another copy of the library is already registered. */
6718 
6719   __kmp_register_library_startup();
6720 
6721   /* TODO reinitialization of library */
6722   if (TCR_4(__kmp_global.g.g_done)) {
6723     KA_TRACE(10, ("__kmp_do_serial_initialize: reinitialization of library\n"));
6724   }
6725 
6726   __kmp_global.g.g_abort = 0;
6727   TCW_SYNC_4(__kmp_global.g.g_done, FALSE);
6728 
6729 /* initialize the locks */
6730 #if KMP_USE_ADAPTIVE_LOCKS
6731 #if KMP_DEBUG_ADAPTIVE_LOCKS
6732   __kmp_init_speculative_stats();
6733 #endif
6734 #endif
6735 #if KMP_STATS_ENABLED
6736   __kmp_stats_init();
6737 #endif
6738   __kmp_init_lock(&__kmp_global_lock);
6739   __kmp_init_queuing_lock(&__kmp_dispatch_lock);
6740   __kmp_init_lock(&__kmp_debug_lock);
6741   __kmp_init_atomic_lock(&__kmp_atomic_lock);
6742   __kmp_init_atomic_lock(&__kmp_atomic_lock_1i);
6743   __kmp_init_atomic_lock(&__kmp_atomic_lock_2i);
6744   __kmp_init_atomic_lock(&__kmp_atomic_lock_4i);
6745   __kmp_init_atomic_lock(&__kmp_atomic_lock_4r);
6746   __kmp_init_atomic_lock(&__kmp_atomic_lock_8i);
6747   __kmp_init_atomic_lock(&__kmp_atomic_lock_8r);
6748   __kmp_init_atomic_lock(&__kmp_atomic_lock_8c);
6749   __kmp_init_atomic_lock(&__kmp_atomic_lock_10r);
6750   __kmp_init_atomic_lock(&__kmp_atomic_lock_16r);
6751   __kmp_init_atomic_lock(&__kmp_atomic_lock_16c);
6752   __kmp_init_atomic_lock(&__kmp_atomic_lock_20c);
6753   __kmp_init_atomic_lock(&__kmp_atomic_lock_32c);
6754   __kmp_init_bootstrap_lock(&__kmp_forkjoin_lock);
6755   __kmp_init_bootstrap_lock(&__kmp_exit_lock);
6756 #if KMP_USE_MONITOR
6757   __kmp_init_bootstrap_lock(&__kmp_monitor_lock);
6758 #endif
6759   __kmp_init_bootstrap_lock(&__kmp_tp_cached_lock);
6760 
6761   /* conduct initialization and initial setup of configuration */
6762 
6763   __kmp_runtime_initialize();
6764 
6765 #if KMP_MIC_SUPPORTED
6766   __kmp_check_mic_type();
6767 #endif
6768 
6769 // Some global variable initialization moved here from kmp_env_initialize()
6770 #ifdef KMP_DEBUG
6771   kmp_diag = 0;
6772 #endif
6773   __kmp_abort_delay = 0;
6774 
6775   // From __kmp_init_dflt_team_nth()
6776   /* assume the entire machine will be used */
6777   __kmp_dflt_team_nth_ub = __kmp_xproc;
6778   if (__kmp_dflt_team_nth_ub < KMP_MIN_NTH) {
6779     __kmp_dflt_team_nth_ub = KMP_MIN_NTH;
6780   }
6781   if (__kmp_dflt_team_nth_ub > __kmp_sys_max_nth) {
6782     __kmp_dflt_team_nth_ub = __kmp_sys_max_nth;
6783   }
6784   __kmp_max_nth = __kmp_sys_max_nth;
6785   __kmp_cg_max_nth = __kmp_sys_max_nth;
6786   __kmp_teams_max_nth = __kmp_xproc; // set a "reasonable" default
6787   if (__kmp_teams_max_nth > __kmp_sys_max_nth) {
6788     __kmp_teams_max_nth = __kmp_sys_max_nth;
6789   }
6790 
6791   // Three vars below moved here from __kmp_env_initialize() "KMP_BLOCKTIME"
6792   // part
6793   __kmp_dflt_blocktime = KMP_DEFAULT_BLOCKTIME;
6794 #if KMP_USE_MONITOR
6795   __kmp_monitor_wakeups =
6796       KMP_WAKEUPS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups);
6797   __kmp_bt_intervals =
6798       KMP_INTERVALS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups);
6799 #endif
6800   // From "KMP_LIBRARY" part of __kmp_env_initialize()
6801   __kmp_library = library_throughput;
6802   // From KMP_SCHEDULE initialization
6803   __kmp_static = kmp_sch_static_balanced;
6804 // AC: do not use analytical here, because it is non-monotonous
6805 //__kmp_guided = kmp_sch_guided_iterative_chunked;
6806 //__kmp_auto = kmp_sch_guided_analytical_chunked; // AC: it is the default, no
6807 // need to repeat assignment
6808 // Barrier initialization. Moved here from __kmp_env_initialize() Barrier branch
6809 // bit control and barrier method control parts
6810 #if KMP_FAST_REDUCTION_BARRIER
6811 #define kmp_reduction_barrier_gather_bb ((int)1)
6812 #define kmp_reduction_barrier_release_bb ((int)1)
6813 #define kmp_reduction_barrier_gather_pat bp_hyper_bar
6814 #define kmp_reduction_barrier_release_pat bp_hyper_bar
6815 #endif // KMP_FAST_REDUCTION_BARRIER
6816   for (i = bs_plain_barrier; i < bs_last_barrier; i++) {
6817     __kmp_barrier_gather_branch_bits[i] = __kmp_barrier_gather_bb_dflt;
6818     __kmp_barrier_release_branch_bits[i] = __kmp_barrier_release_bb_dflt;
6819     __kmp_barrier_gather_pattern[i] = __kmp_barrier_gather_pat_dflt;
6820     __kmp_barrier_release_pattern[i] = __kmp_barrier_release_pat_dflt;
6821 #if KMP_FAST_REDUCTION_BARRIER
6822     if (i == bs_reduction_barrier) { // tested and confirmed on ALTIX only (
6823       // lin_64 ): hyper,1
6824       __kmp_barrier_gather_branch_bits[i] = kmp_reduction_barrier_gather_bb;
6825       __kmp_barrier_release_branch_bits[i] = kmp_reduction_barrier_release_bb;
6826       __kmp_barrier_gather_pattern[i] = kmp_reduction_barrier_gather_pat;
6827       __kmp_barrier_release_pattern[i] = kmp_reduction_barrier_release_pat;
6828     }
6829 #endif // KMP_FAST_REDUCTION_BARRIER
6830   }
6831 #if KMP_FAST_REDUCTION_BARRIER
6832 #undef kmp_reduction_barrier_release_pat
6833 #undef kmp_reduction_barrier_gather_pat
6834 #undef kmp_reduction_barrier_release_bb
6835 #undef kmp_reduction_barrier_gather_bb
6836 #endif // KMP_FAST_REDUCTION_BARRIER
6837 #if KMP_MIC_SUPPORTED
6838   if (__kmp_mic_type == mic2) { // KNC
6839     // AC: plane=3,2, forkjoin=2,1 are optimal for 240 threads on KNC
6840     __kmp_barrier_gather_branch_bits[bs_plain_barrier] = 3; // plain gather
6841     __kmp_barrier_release_branch_bits[bs_forkjoin_barrier] =
6842         1; // forkjoin release
6843     __kmp_barrier_gather_pattern[bs_forkjoin_barrier] = bp_hierarchical_bar;
6844     __kmp_barrier_release_pattern[bs_forkjoin_barrier] = bp_hierarchical_bar;
6845   }
6846 #if KMP_FAST_REDUCTION_BARRIER
6847   if (__kmp_mic_type == mic2) { // KNC
6848     __kmp_barrier_gather_pattern[bs_reduction_barrier] = bp_hierarchical_bar;
6849     __kmp_barrier_release_pattern[bs_reduction_barrier] = bp_hierarchical_bar;
6850   }
6851 #endif // KMP_FAST_REDUCTION_BARRIER
6852 #endif // KMP_MIC_SUPPORTED
6853 
6854 // From KMP_CHECKS initialization
6855 #ifdef KMP_DEBUG
6856   __kmp_env_checks = TRUE; /* development versions have the extra checks */
6857 #else
6858   __kmp_env_checks = FALSE; /* port versions do not have the extra checks */
6859 #endif
6860 
6861   // From "KMP_FOREIGN_THREADS_THREADPRIVATE" initialization
6862   __kmp_foreign_tp = TRUE;
6863 
6864   __kmp_global.g.g_dynamic = FALSE;
6865   __kmp_global.g.g_dynamic_mode = dynamic_default;
6866 
6867   __kmp_env_initialize(NULL);
6868 
6869 #if KMP_HAVE_MWAIT || KMP_HAVE_UMWAIT
6870   __kmp_user_level_mwait_init();
6871 #endif
6872 // Print all messages in message catalog for testing purposes.
6873 #ifdef KMP_DEBUG
6874   char const *val = __kmp_env_get("KMP_DUMP_CATALOG");
6875   if (__kmp_str_match_true(val)) {
6876     kmp_str_buf_t buffer;
6877     __kmp_str_buf_init(&buffer);
6878     __kmp_i18n_dump_catalog(&buffer);
6879     __kmp_printf("%s", buffer.str);
6880     __kmp_str_buf_free(&buffer);
6881   }
6882   __kmp_env_free(&val);
6883 #endif
6884 
6885   __kmp_threads_capacity =
6886       __kmp_initial_threads_capacity(__kmp_dflt_team_nth_ub);
6887   // Moved here from __kmp_env_initialize() "KMP_ALL_THREADPRIVATE" part
6888   __kmp_tp_capacity = __kmp_default_tp_capacity(
6889       __kmp_dflt_team_nth_ub, __kmp_max_nth, __kmp_allThreadsSpecified);
6890 
6891   // If the library is shut down properly, both pools must be NULL. Just in
6892   // case, set them to NULL -- some memory may leak, but subsequent code will
6893   // work even if pools are not freed.
6894   KMP_DEBUG_ASSERT(__kmp_thread_pool == NULL);
6895   KMP_DEBUG_ASSERT(__kmp_thread_pool_insert_pt == NULL);
6896   KMP_DEBUG_ASSERT(__kmp_team_pool == NULL);
6897   __kmp_thread_pool = NULL;
6898   __kmp_thread_pool_insert_pt = NULL;
6899   __kmp_team_pool = NULL;
6900 
6901   /* Allocate all of the variable sized records */
6902   /* NOTE: __kmp_threads_capacity entries are allocated, but the arrays are
6903    * expandable */
6904   /* Since allocation is cache-aligned, just add extra padding at the end */
6905   size =
6906       (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * __kmp_threads_capacity +
6907       CACHE_LINE;
6908   __kmp_threads = (kmp_info_t **)__kmp_allocate(size);
6909   __kmp_root = (kmp_root_t **)((char *)__kmp_threads +
6910                                sizeof(kmp_info_t *) * __kmp_threads_capacity);
6911 
6912   /* init thread counts */
6913   KMP_DEBUG_ASSERT(__kmp_all_nth ==
6914                    0); // Asserts fail if the library is reinitializing and
6915   KMP_DEBUG_ASSERT(__kmp_nth == 0); // something was wrong in termination.
6916   __kmp_all_nth = 0;
6917   __kmp_nth = 0;
6918 
6919   /* setup the uber master thread and hierarchy */
6920   gtid = __kmp_register_root(TRUE);
6921   KA_TRACE(10, ("__kmp_do_serial_initialize  T#%d\n", gtid));
6922   KMP_ASSERT(KMP_UBER_GTID(gtid));
6923   KMP_ASSERT(KMP_INITIAL_GTID(gtid));
6924 
6925   KMP_MB(); /* Flush all pending memory write invalidates.  */
6926 
6927   __kmp_common_initialize();
6928 
6929 #if KMP_OS_UNIX
6930   /* invoke the child fork handler */
6931   __kmp_register_atfork();
6932 #endif
6933 
6934 #if !KMP_DYNAMIC_LIB
6935   {
6936     /* Invoke the exit handler when the program finishes, only for static
6937        library. For dynamic library, we already have _fini and DllMain. */
6938     int rc = atexit(__kmp_internal_end_atexit);
6939     if (rc != 0) {
6940       __kmp_fatal(KMP_MSG(FunctionError, "atexit()"), KMP_ERR(rc),
6941                   __kmp_msg_null);
6942     }
6943   }
6944 #endif
6945 
6946 #if KMP_HANDLE_SIGNALS
6947 #if KMP_OS_UNIX
6948   /* NOTE: make sure that this is called before the user installs their own
6949      signal handlers so that the user handlers are called first. this way they
6950      can return false, not call our handler, avoid terminating the library, and
6951      continue execution where they left off. */
6952   __kmp_install_signals(FALSE);
6953 #endif /* KMP_OS_UNIX */
6954 #if KMP_OS_WINDOWS
6955   __kmp_install_signals(TRUE);
6956 #endif /* KMP_OS_WINDOWS */
6957 #endif
6958 
6959   /* we have finished the serial initialization */
6960   __kmp_init_counter++;
6961 
6962   __kmp_init_serial = TRUE;
6963 
6964   if (__kmp_settings) {
6965     __kmp_env_print();
6966   }
6967 
6968   if (__kmp_display_env || __kmp_display_env_verbose) {
6969     __kmp_env_print_2();
6970   }
6971 
6972 #if OMPT_SUPPORT
6973   ompt_post_init();
6974 #endif
6975 
6976   KMP_MB();
6977 
6978   KA_TRACE(10, ("__kmp_do_serial_initialize: exit\n"));
6979 }
6980 
6981 void __kmp_serial_initialize(void) {
6982   if (__kmp_init_serial) {
6983     return;
6984   }
6985   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6986   if (__kmp_init_serial) {
6987     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6988     return;
6989   }
6990   __kmp_do_serial_initialize();
6991   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6992 }
6993 
6994 static void __kmp_do_middle_initialize(void) {
6995   int i, j;
6996   int prev_dflt_team_nth;
6997 
6998   if (!__kmp_init_serial) {
6999     __kmp_do_serial_initialize();
7000   }
7001 
7002   KA_TRACE(10, ("__kmp_middle_initialize: enter\n"));
7003 
7004   // Save the previous value for the __kmp_dflt_team_nth so that
7005   // we can avoid some reinitialization if it hasn't changed.
7006   prev_dflt_team_nth = __kmp_dflt_team_nth;
7007 
7008 #if KMP_AFFINITY_SUPPORTED
7009   // __kmp_affinity_initialize() will try to set __kmp_ncores to the
7010   // number of cores on the machine.
7011   __kmp_affinity_initialize();
7012 
7013   // Run through the __kmp_threads array and set the affinity mask
7014   // for each root thread that is currently registered with the RTL.
7015   for (i = 0; i < __kmp_threads_capacity; i++) {
7016     if (TCR_PTR(__kmp_threads[i]) != NULL) {
7017       __kmp_affinity_set_init_mask(i, TRUE);
7018     }
7019   }
7020 #endif /* KMP_AFFINITY_SUPPORTED */
7021 
7022   KMP_ASSERT(__kmp_xproc > 0);
7023   if (__kmp_avail_proc == 0) {
7024     __kmp_avail_proc = __kmp_xproc;
7025   }
7026 
7027   // If there were empty places in num_threads list (OMP_NUM_THREADS=,,2,3),
7028   // correct them now
7029   j = 0;
7030   while ((j < __kmp_nested_nth.used) && !__kmp_nested_nth.nth[j]) {
7031     __kmp_nested_nth.nth[j] = __kmp_dflt_team_nth = __kmp_dflt_team_nth_ub =
7032         __kmp_avail_proc;
7033     j++;
7034   }
7035 
7036   if (__kmp_dflt_team_nth == 0) {
7037 #ifdef KMP_DFLT_NTH_CORES
7038     // Default #threads = #cores
7039     __kmp_dflt_team_nth = __kmp_ncores;
7040     KA_TRACE(20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = "
7041                   "__kmp_ncores (%d)\n",
7042                   __kmp_dflt_team_nth));
7043 #else
7044     // Default #threads = #available OS procs
7045     __kmp_dflt_team_nth = __kmp_avail_proc;
7046     KA_TRACE(20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = "
7047                   "__kmp_avail_proc(%d)\n",
7048                   __kmp_dflt_team_nth));
7049 #endif /* KMP_DFLT_NTH_CORES */
7050   }
7051 
7052   if (__kmp_dflt_team_nth < KMP_MIN_NTH) {
7053     __kmp_dflt_team_nth = KMP_MIN_NTH;
7054   }
7055   if (__kmp_dflt_team_nth > __kmp_sys_max_nth) {
7056     __kmp_dflt_team_nth = __kmp_sys_max_nth;
7057   }
7058 
7059   // There's no harm in continuing if the following check fails,
7060   // but it indicates an error in the previous logic.
7061   KMP_DEBUG_ASSERT(__kmp_dflt_team_nth <= __kmp_dflt_team_nth_ub);
7062 
7063   if (__kmp_dflt_team_nth != prev_dflt_team_nth) {
7064     // Run through the __kmp_threads array and set the num threads icv for each
7065     // root thread that is currently registered with the RTL (which has not
7066     // already explicitly set its nthreads-var with a call to
7067     // omp_set_num_threads()).
7068     for (i = 0; i < __kmp_threads_capacity; i++) {
7069       kmp_info_t *thread = __kmp_threads[i];
7070       if (thread == NULL)
7071         continue;
7072       if (thread->th.th_current_task->td_icvs.nproc != 0)
7073         continue;
7074 
7075       set__nproc(__kmp_threads[i], __kmp_dflt_team_nth);
7076     }
7077   }
7078   KA_TRACE(
7079       20,
7080       ("__kmp_middle_initialize: final value for __kmp_dflt_team_nth = %d\n",
7081        __kmp_dflt_team_nth));
7082 
7083 #ifdef KMP_ADJUST_BLOCKTIME
7084   /* Adjust blocktime to zero if necessary  now that __kmp_avail_proc is set */
7085   if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
7086     KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
7087     if (__kmp_nth > __kmp_avail_proc) {
7088       __kmp_zero_bt = TRUE;
7089     }
7090   }
7091 #endif /* KMP_ADJUST_BLOCKTIME */
7092 
7093   /* we have finished middle initialization */
7094   TCW_SYNC_4(__kmp_init_middle, TRUE);
7095 
7096   KA_TRACE(10, ("__kmp_do_middle_initialize: exit\n"));
7097 }
7098 
7099 void __kmp_middle_initialize(void) {
7100   if (__kmp_init_middle) {
7101     return;
7102   }
7103   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7104   if (__kmp_init_middle) {
7105     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7106     return;
7107   }
7108   __kmp_do_middle_initialize();
7109   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7110 }
7111 
7112 void __kmp_parallel_initialize(void) {
7113   int gtid = __kmp_entry_gtid(); // this might be a new root
7114 
7115   /* synchronize parallel initialization (for sibling) */
7116   if (TCR_4(__kmp_init_parallel))
7117     return;
7118   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7119   if (TCR_4(__kmp_init_parallel)) {
7120     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7121     return;
7122   }
7123 
7124   /* TODO reinitialization after we have already shut down */
7125   if (TCR_4(__kmp_global.g.g_done)) {
7126     KA_TRACE(
7127         10,
7128         ("__kmp_parallel_initialize: attempt to init while shutting down\n"));
7129     __kmp_infinite_loop();
7130   }
7131 
7132   /* jc: The lock __kmp_initz_lock is already held, so calling
7133      __kmp_serial_initialize would cause a deadlock.  So we call
7134      __kmp_do_serial_initialize directly. */
7135   if (!__kmp_init_middle) {
7136     __kmp_do_middle_initialize();
7137   }
7138   __kmp_resume_if_hard_paused();
7139 
7140   /* begin initialization */
7141   KA_TRACE(10, ("__kmp_parallel_initialize: enter\n"));
7142   KMP_ASSERT(KMP_UBER_GTID(gtid));
7143 
7144 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
7145   // Save the FP control regs.
7146   // Worker threads will set theirs to these values at thread startup.
7147   __kmp_store_x87_fpu_control_word(&__kmp_init_x87_fpu_control_word);
7148   __kmp_store_mxcsr(&__kmp_init_mxcsr);
7149   __kmp_init_mxcsr &= KMP_X86_MXCSR_MASK;
7150 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
7151 
7152 #if KMP_OS_UNIX
7153 #if KMP_HANDLE_SIGNALS
7154   /*  must be after __kmp_serial_initialize  */
7155   __kmp_install_signals(TRUE);
7156 #endif
7157 #endif
7158 
7159   __kmp_suspend_initialize();
7160 
7161 #if defined(USE_LOAD_BALANCE)
7162   if (__kmp_global.g.g_dynamic_mode == dynamic_default) {
7163     __kmp_global.g.g_dynamic_mode = dynamic_load_balance;
7164   }
7165 #else
7166   if (__kmp_global.g.g_dynamic_mode == dynamic_default) {
7167     __kmp_global.g.g_dynamic_mode = dynamic_thread_limit;
7168   }
7169 #endif
7170 
7171   if (__kmp_version) {
7172     __kmp_print_version_2();
7173   }
7174 
7175   /* we have finished parallel initialization */
7176   TCW_SYNC_4(__kmp_init_parallel, TRUE);
7177 
7178   KMP_MB();
7179   KA_TRACE(10, ("__kmp_parallel_initialize: exit\n"));
7180 
7181   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7182 }
7183 
7184 void __kmp_hidden_helper_initialize() {
7185   if (TCR_4(__kmp_init_hidden_helper))
7186     return;
7187 
7188   // __kmp_parallel_initialize is required before we initialize hidden helper
7189   if (!TCR_4(__kmp_init_parallel))
7190     __kmp_parallel_initialize();
7191 
7192   // Double check. Note that this double check should not be placed before
7193   // __kmp_parallel_initialize as it will cause dead lock.
7194   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7195   if (TCR_4(__kmp_init_hidden_helper)) {
7196     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7197     return;
7198   }
7199 
7200   // Set the count of hidden helper tasks to be executed to zero
7201   KMP_ATOMIC_ST_REL(&__kmp_unexecuted_hidden_helper_tasks, 0);
7202 
7203   // Set the global variable indicating that we're initializing hidden helper
7204   // team/threads
7205   TCW_SYNC_4(__kmp_init_hidden_helper_threads, TRUE);
7206 
7207   // Platform independent initialization
7208   __kmp_do_initialize_hidden_helper_threads();
7209 
7210   // Wait here for the finish of initialization of hidden helper teams
7211   __kmp_hidden_helper_threads_initz_wait();
7212 
7213   // We have finished hidden helper initialization
7214   TCW_SYNC_4(__kmp_init_hidden_helper, TRUE);
7215 
7216   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7217 }
7218 
7219 /* ------------------------------------------------------------------------ */
7220 
7221 void __kmp_run_before_invoked_task(int gtid, int tid, kmp_info_t *this_thr,
7222                                    kmp_team_t *team) {
7223   kmp_disp_t *dispatch;
7224 
7225   KMP_MB();
7226 
7227   /* none of the threads have encountered any constructs, yet. */
7228   this_thr->th.th_local.this_construct = 0;
7229 #if KMP_CACHE_MANAGE
7230   KMP_CACHE_PREFETCH(&this_thr->th.th_bar[bs_forkjoin_barrier].bb.b_arrived);
7231 #endif /* KMP_CACHE_MANAGE */
7232   dispatch = (kmp_disp_t *)TCR_PTR(this_thr->th.th_dispatch);
7233   KMP_DEBUG_ASSERT(dispatch);
7234   KMP_DEBUG_ASSERT(team->t.t_dispatch);
7235   // KMP_DEBUG_ASSERT( this_thr->th.th_dispatch == &team->t.t_dispatch[
7236   // this_thr->th.th_info.ds.ds_tid ] );
7237 
7238   dispatch->th_disp_index = 0; /* reset the dispatch buffer counter */
7239   dispatch->th_doacross_buf_idx = 0; // reset doacross dispatch buffer counter
7240   if (__kmp_env_consistency_check)
7241     __kmp_push_parallel(gtid, team->t.t_ident);
7242 
7243   KMP_MB(); /* Flush all pending memory write invalidates.  */
7244 }
7245 
7246 void __kmp_run_after_invoked_task(int gtid, int tid, kmp_info_t *this_thr,
7247                                   kmp_team_t *team) {
7248   if (__kmp_env_consistency_check)
7249     __kmp_pop_parallel(gtid, team->t.t_ident);
7250 
7251   __kmp_finish_implicit_task(this_thr);
7252 }
7253 
7254 int __kmp_invoke_task_func(int gtid) {
7255   int rc;
7256   int tid = __kmp_tid_from_gtid(gtid);
7257   kmp_info_t *this_thr = __kmp_threads[gtid];
7258   kmp_team_t *team = this_thr->th.th_team;
7259 
7260   __kmp_run_before_invoked_task(gtid, tid, this_thr, team);
7261 #if USE_ITT_BUILD
7262   if (__itt_stack_caller_create_ptr) {
7263     __kmp_itt_stack_callee_enter(
7264         (__itt_caller)
7265             team->t.t_stack_id); // inform ittnotify about entering user's code
7266   }
7267 #endif /* USE_ITT_BUILD */
7268 #if INCLUDE_SSC_MARKS
7269   SSC_MARK_INVOKING();
7270 #endif
7271 
7272 #if OMPT_SUPPORT
7273   void *dummy;
7274   void **exit_frame_p;
7275   ompt_data_t *my_task_data;
7276   ompt_data_t *my_parallel_data;
7277   int ompt_team_size;
7278 
7279   if (ompt_enabled.enabled) {
7280     exit_frame_p = &(
7281         team->t.t_implicit_task_taskdata[tid].ompt_task_info.frame.exit_frame.ptr);
7282   } else {
7283     exit_frame_p = &dummy;
7284   }
7285 
7286   my_task_data =
7287       &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data);
7288   my_parallel_data = &(team->t.ompt_team_info.parallel_data);
7289   if (ompt_enabled.ompt_callback_implicit_task) {
7290     ompt_team_size = team->t.t_nproc;
7291     ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
7292         ompt_scope_begin, my_parallel_data, my_task_data, ompt_team_size,
7293         __kmp_tid_from_gtid(gtid), ompt_task_implicit);
7294     OMPT_CUR_TASK_INFO(this_thr)->thread_num = __kmp_tid_from_gtid(gtid);
7295   }
7296 #endif
7297 
7298 #if KMP_STATS_ENABLED
7299   stats_state_e previous_state = KMP_GET_THREAD_STATE();
7300   if (previous_state == stats_state_e::TEAMS_REGION) {
7301     KMP_PUSH_PARTITIONED_TIMER(OMP_teams);
7302   } else {
7303     KMP_PUSH_PARTITIONED_TIMER(OMP_parallel);
7304   }
7305   KMP_SET_THREAD_STATE(IMPLICIT_TASK);
7306 #endif
7307 
7308   rc = __kmp_invoke_microtask((microtask_t)TCR_SYNC_PTR(team->t.t_pkfn), gtid,
7309                               tid, (int)team->t.t_argc, (void **)team->t.t_argv
7310 #if OMPT_SUPPORT
7311                               ,
7312                               exit_frame_p
7313 #endif
7314                               );
7315 #if OMPT_SUPPORT
7316   *exit_frame_p = NULL;
7317    this_thr->th.ompt_thread_info.parallel_flags |= ompt_parallel_team;
7318 #endif
7319 
7320 #if KMP_STATS_ENABLED
7321   if (previous_state == stats_state_e::TEAMS_REGION) {
7322     KMP_SET_THREAD_STATE(previous_state);
7323   }
7324   KMP_POP_PARTITIONED_TIMER();
7325 #endif
7326 
7327 #if USE_ITT_BUILD
7328   if (__itt_stack_caller_create_ptr) {
7329     __kmp_itt_stack_callee_leave(
7330         (__itt_caller)
7331             team->t.t_stack_id); // inform ittnotify about leaving user's code
7332   }
7333 #endif /* USE_ITT_BUILD */
7334   __kmp_run_after_invoked_task(gtid, tid, this_thr, team);
7335 
7336   return rc;
7337 }
7338 
7339 void __kmp_teams_master(int gtid) {
7340   // This routine is called by all master threads in teams construct
7341   kmp_info_t *thr = __kmp_threads[gtid];
7342   kmp_team_t *team = thr->th.th_team;
7343   ident_t *loc = team->t.t_ident;
7344   thr->th.th_set_nproc = thr->th.th_teams_size.nth;
7345   KMP_DEBUG_ASSERT(thr->th.th_teams_microtask);
7346   KMP_DEBUG_ASSERT(thr->th.th_set_nproc);
7347   KA_TRACE(20, ("__kmp_teams_master: T#%d, Tid %d, microtask %p\n", gtid,
7348                 __kmp_tid_from_gtid(gtid), thr->th.th_teams_microtask));
7349 
7350   // This thread is a new CG root.  Set up the proper variables.
7351   kmp_cg_root_t *tmp = (kmp_cg_root_t *)__kmp_allocate(sizeof(kmp_cg_root_t));
7352   tmp->cg_root = thr; // Make thr the CG root
7353   // Init to thread limit that was stored when league masters were forked
7354   tmp->cg_thread_limit = thr->th.th_current_task->td_icvs.thread_limit;
7355   tmp->cg_nthreads = 1; // Init counter to one active thread, this one
7356   KA_TRACE(100, ("__kmp_teams_master: Thread %p created node %p and init"
7357                  " cg_nthreads to 1\n",
7358                  thr, tmp));
7359   tmp->up = thr->th.th_cg_roots;
7360   thr->th.th_cg_roots = tmp;
7361 
7362 // Launch league of teams now, but not let workers execute
7363 // (they hang on fork barrier until next parallel)
7364 #if INCLUDE_SSC_MARKS
7365   SSC_MARK_FORKING();
7366 #endif
7367   __kmp_fork_call(loc, gtid, fork_context_intel, team->t.t_argc,
7368                   (microtask_t)thr->th.th_teams_microtask, // "wrapped" task
7369                   VOLATILE_CAST(launch_t) __kmp_invoke_task_func, NULL);
7370 #if INCLUDE_SSC_MARKS
7371   SSC_MARK_JOINING();
7372 #endif
7373   // If the team size was reduced from the limit, set it to the new size
7374   if (thr->th.th_team_nproc < thr->th.th_teams_size.nth)
7375     thr->th.th_teams_size.nth = thr->th.th_team_nproc;
7376   // AC: last parameter "1" eliminates join barrier which won't work because
7377   // worker threads are in a fork barrier waiting for more parallel regions
7378   __kmp_join_call(loc, gtid
7379 #if OMPT_SUPPORT
7380                   ,
7381                   fork_context_intel
7382 #endif
7383                   ,
7384                   1);
7385 }
7386 
7387 int __kmp_invoke_teams_master(int gtid) {
7388   kmp_info_t *this_thr = __kmp_threads[gtid];
7389   kmp_team_t *team = this_thr->th.th_team;
7390 #if KMP_DEBUG
7391   if (!__kmp_threads[gtid]->th.th_team->t.t_serialized)
7392     KMP_DEBUG_ASSERT((void *)__kmp_threads[gtid]->th.th_team->t.t_pkfn ==
7393                      (void *)__kmp_teams_master);
7394 #endif
7395   __kmp_run_before_invoked_task(gtid, 0, this_thr, team);
7396 #if OMPT_SUPPORT
7397   int tid = __kmp_tid_from_gtid(gtid);
7398   ompt_data_t *task_data =
7399       &team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data;
7400   ompt_data_t *parallel_data = &team->t.ompt_team_info.parallel_data;
7401   if (ompt_enabled.ompt_callback_implicit_task) {
7402     ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
7403         ompt_scope_begin, parallel_data, task_data, team->t.t_nproc, tid,
7404         ompt_task_initial);
7405     OMPT_CUR_TASK_INFO(this_thr)->thread_num = tid;
7406   }
7407 #endif
7408   __kmp_teams_master(gtid);
7409 #if OMPT_SUPPORT
7410   this_thr->th.ompt_thread_info.parallel_flags |= ompt_parallel_league;
7411 #endif
7412   __kmp_run_after_invoked_task(gtid, 0, this_thr, team);
7413   return 1;
7414 }
7415 
7416 /* this sets the requested number of threads for the next parallel region
7417    encountered by this team. since this should be enclosed in the forkjoin
7418    critical section it should avoid race conditions with asymmetrical nested
7419    parallelism */
7420 
7421 void __kmp_push_num_threads(ident_t *id, int gtid, int num_threads) {
7422   kmp_info_t *thr = __kmp_threads[gtid];
7423 
7424   if (num_threads > 0)
7425     thr->th.th_set_nproc = num_threads;
7426 }
7427 
7428 /* this sets the requested number of teams for the teams region and/or
7429    the number of threads for the next parallel region encountered  */
7430 void __kmp_push_num_teams(ident_t *id, int gtid, int num_teams,
7431                           int num_threads) {
7432   kmp_info_t *thr = __kmp_threads[gtid];
7433   KMP_DEBUG_ASSERT(num_teams >= 0);
7434   KMP_DEBUG_ASSERT(num_threads >= 0);
7435 
7436   if (num_teams == 0)
7437     num_teams = 1; // default number of teams is 1.
7438   if (num_teams > __kmp_teams_max_nth) { // if too many teams requested?
7439     if (!__kmp_reserve_warn) {
7440       __kmp_reserve_warn = 1;
7441       __kmp_msg(kmp_ms_warning,
7442                 KMP_MSG(CantFormThrTeam, num_teams, __kmp_teams_max_nth),
7443                 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
7444     }
7445     num_teams = __kmp_teams_max_nth;
7446   }
7447   // Set number of teams (number of threads in the outer "parallel" of the
7448   // teams)
7449   thr->th.th_set_nproc = thr->th.th_teams_size.nteams = num_teams;
7450 
7451   // Remember the number of threads for inner parallel regions
7452   if (!TCR_4(__kmp_init_middle))
7453     __kmp_middle_initialize(); // get internal globals calculated
7454   KMP_DEBUG_ASSERT(__kmp_avail_proc);
7455   KMP_DEBUG_ASSERT(__kmp_dflt_team_nth);
7456   if (num_threads == 0) {
7457     num_threads = __kmp_avail_proc / num_teams;
7458     // adjust num_threads w/o warning as it is not user setting
7459     // num_threads = min(num_threads, nthreads-var, thread-limit-var)
7460     // no thread_limit clause specified -  do not change thread-limit-var ICV
7461     if (num_threads > __kmp_dflt_team_nth) {
7462       num_threads = __kmp_dflt_team_nth; // honor nthreads-var ICV
7463     }
7464     if (num_threads > thr->th.th_current_task->td_icvs.thread_limit) {
7465       num_threads = thr->th.th_current_task->td_icvs.thread_limit;
7466     } // prevent team size to exceed thread-limit-var
7467     if (num_teams * num_threads > __kmp_teams_max_nth) {
7468       num_threads = __kmp_teams_max_nth / num_teams;
7469     }
7470   } else {
7471     // This thread will be the master of the league masters
7472     // Store new thread limit; old limit is saved in th_cg_roots list
7473     thr->th.th_current_task->td_icvs.thread_limit = num_threads;
7474     // num_threads = min(num_threads, nthreads-var)
7475     if (num_threads > __kmp_dflt_team_nth) {
7476       num_threads = __kmp_dflt_team_nth; // honor nthreads-var ICV
7477     }
7478     if (num_teams * num_threads > __kmp_teams_max_nth) {
7479       int new_threads = __kmp_teams_max_nth / num_teams;
7480       if (!__kmp_reserve_warn) { // user asked for too many threads
7481         __kmp_reserve_warn = 1; // conflicts with KMP_TEAMS_THREAD_LIMIT
7482         __kmp_msg(kmp_ms_warning,
7483                   KMP_MSG(CantFormThrTeam, num_threads, new_threads),
7484                   KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
7485       }
7486       num_threads = new_threads;
7487     }
7488   }
7489   thr->th.th_teams_size.nth = num_threads;
7490 }
7491 
7492 // Set the proc_bind var to use in the following parallel region.
7493 void __kmp_push_proc_bind(ident_t *id, int gtid, kmp_proc_bind_t proc_bind) {
7494   kmp_info_t *thr = __kmp_threads[gtid];
7495   thr->th.th_set_proc_bind = proc_bind;
7496 }
7497 
7498 /* Launch the worker threads into the microtask. */
7499 
7500 void __kmp_internal_fork(ident_t *id, int gtid, kmp_team_t *team) {
7501   kmp_info_t *this_thr = __kmp_threads[gtid];
7502 
7503 #ifdef KMP_DEBUG
7504   int f;
7505 #endif /* KMP_DEBUG */
7506 
7507   KMP_DEBUG_ASSERT(team);
7508   KMP_DEBUG_ASSERT(this_thr->th.th_team == team);
7509   KMP_ASSERT(KMP_MASTER_GTID(gtid));
7510   KMP_MB(); /* Flush all pending memory write invalidates.  */
7511 
7512   team->t.t_construct = 0; /* no single directives seen yet */
7513   team->t.t_ordered.dt.t_value =
7514       0; /* thread 0 enters the ordered section first */
7515 
7516   /* Reset the identifiers on the dispatch buffer */
7517   KMP_DEBUG_ASSERT(team->t.t_disp_buffer);
7518   if (team->t.t_max_nproc > 1) {
7519     int i;
7520     for (i = 0; i < __kmp_dispatch_num_buffers; ++i) {
7521       team->t.t_disp_buffer[i].buffer_index = i;
7522       team->t.t_disp_buffer[i].doacross_buf_idx = i;
7523     }
7524   } else {
7525     team->t.t_disp_buffer[0].buffer_index = 0;
7526     team->t.t_disp_buffer[0].doacross_buf_idx = 0;
7527   }
7528 
7529   KMP_MB(); /* Flush all pending memory write invalidates.  */
7530   KMP_ASSERT(this_thr->th.th_team == team);
7531 
7532 #ifdef KMP_DEBUG
7533   for (f = 0; f < team->t.t_nproc; f++) {
7534     KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
7535                      team->t.t_threads[f]->th.th_team_nproc == team->t.t_nproc);
7536   }
7537 #endif /* KMP_DEBUG */
7538 
7539   /* release the worker threads so they may begin working */
7540   __kmp_fork_barrier(gtid, 0);
7541 }
7542 
7543 void __kmp_internal_join(ident_t *id, int gtid, kmp_team_t *team) {
7544   kmp_info_t *this_thr = __kmp_threads[gtid];
7545 
7546   KMP_DEBUG_ASSERT(team);
7547   KMP_DEBUG_ASSERT(this_thr->th.th_team == team);
7548   KMP_ASSERT(KMP_MASTER_GTID(gtid));
7549   KMP_MB(); /* Flush all pending memory write invalidates.  */
7550 
7551 /* Join barrier after fork */
7552 
7553 #ifdef KMP_DEBUG
7554   if (__kmp_threads[gtid] &&
7555       __kmp_threads[gtid]->th.th_team_nproc != team->t.t_nproc) {
7556     __kmp_printf("GTID: %d, __kmp_threads[%d]=%p\n", gtid, gtid,
7557                  __kmp_threads[gtid]);
7558     __kmp_printf("__kmp_threads[%d]->th.th_team_nproc=%d, TEAM: %p, "
7559                  "team->t.t_nproc=%d\n",
7560                  gtid, __kmp_threads[gtid]->th.th_team_nproc, team,
7561                  team->t.t_nproc);
7562     __kmp_print_structure();
7563   }
7564   KMP_DEBUG_ASSERT(__kmp_threads[gtid] &&
7565                    __kmp_threads[gtid]->th.th_team_nproc == team->t.t_nproc);
7566 #endif /* KMP_DEBUG */
7567 
7568   __kmp_join_barrier(gtid); /* wait for everyone */
7569 #if OMPT_SUPPORT
7570   if (ompt_enabled.enabled &&
7571       this_thr->th.ompt_thread_info.state == ompt_state_wait_barrier_implicit) {
7572     int ds_tid = this_thr->th.th_info.ds.ds_tid;
7573     ompt_data_t *task_data = OMPT_CUR_TASK_DATA(this_thr);
7574     this_thr->th.ompt_thread_info.state = ompt_state_overhead;
7575 #if OMPT_OPTIONAL
7576     void *codeptr = NULL;
7577     if (KMP_MASTER_TID(ds_tid) &&
7578         (ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) ||
7579          ompt_callbacks.ompt_callback(ompt_callback_sync_region)))
7580       codeptr = OMPT_CUR_TEAM_INFO(this_thr)->master_return_address;
7581 
7582     if (ompt_enabled.ompt_callback_sync_region_wait) {
7583       ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
7584           ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data,
7585           codeptr);
7586     }
7587     if (ompt_enabled.ompt_callback_sync_region) {
7588       ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
7589           ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data,
7590           codeptr);
7591     }
7592 #endif
7593     if (!KMP_MASTER_TID(ds_tid) && ompt_enabled.ompt_callback_implicit_task) {
7594       ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
7595           ompt_scope_end, NULL, task_data, 0, ds_tid, ompt_task_implicit); // TODO: Can this be ompt_task_initial?
7596     }
7597   }
7598 #endif
7599 
7600   KMP_MB(); /* Flush all pending memory write invalidates.  */
7601   KMP_ASSERT(this_thr->th.th_team == team);
7602 }
7603 
7604 /* ------------------------------------------------------------------------ */
7605 
7606 #ifdef USE_LOAD_BALANCE
7607 
7608 // Return the worker threads actively spinning in the hot team, if we
7609 // are at the outermost level of parallelism.  Otherwise, return 0.
7610 static int __kmp_active_hot_team_nproc(kmp_root_t *root) {
7611   int i;
7612   int retval;
7613   kmp_team_t *hot_team;
7614 
7615   if (root->r.r_active) {
7616     return 0;
7617   }
7618   hot_team = root->r.r_hot_team;
7619   if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME) {
7620     return hot_team->t.t_nproc - 1; // Don't count master thread
7621   }
7622 
7623   // Skip the master thread - it is accounted for elsewhere.
7624   retval = 0;
7625   for (i = 1; i < hot_team->t.t_nproc; i++) {
7626     if (hot_team->t.t_threads[i]->th.th_active) {
7627       retval++;
7628     }
7629   }
7630   return retval;
7631 }
7632 
7633 // Perform an automatic adjustment to the number of
7634 // threads used by the next parallel region.
7635 static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc) {
7636   int retval;
7637   int pool_active;
7638   int hot_team_active;
7639   int team_curr_active;
7640   int system_active;
7641 
7642   KB_TRACE(20, ("__kmp_load_balance_nproc: called root:%p set_nproc:%d\n", root,
7643                 set_nproc));
7644   KMP_DEBUG_ASSERT(root);
7645   KMP_DEBUG_ASSERT(root->r.r_root_team->t.t_threads[0]
7646                        ->th.th_current_task->td_icvs.dynamic == TRUE);
7647   KMP_DEBUG_ASSERT(set_nproc > 1);
7648 
7649   if (set_nproc == 1) {
7650     KB_TRACE(20, ("__kmp_load_balance_nproc: serial execution.\n"));
7651     return 1;
7652   }
7653 
7654   // Threads that are active in the thread pool, active in the hot team for this
7655   // particular root (if we are at the outer par level), and the currently
7656   // executing thread (to become the master) are available to add to the new
7657   // team, but are currently contributing to the system load, and must be
7658   // accounted for.
7659   pool_active = __kmp_thread_pool_active_nth;
7660   hot_team_active = __kmp_active_hot_team_nproc(root);
7661   team_curr_active = pool_active + hot_team_active + 1;
7662 
7663   // Check the system load.
7664   system_active = __kmp_get_load_balance(__kmp_avail_proc + team_curr_active);
7665   KB_TRACE(30, ("__kmp_load_balance_nproc: system active = %d pool active = %d "
7666                 "hot team active = %d\n",
7667                 system_active, pool_active, hot_team_active));
7668 
7669   if (system_active < 0) {
7670     // There was an error reading the necessary info from /proc, so use the
7671     // thread limit algorithm instead. Once we set __kmp_global.g.g_dynamic_mode
7672     // = dynamic_thread_limit, we shouldn't wind up getting back here.
7673     __kmp_global.g.g_dynamic_mode = dynamic_thread_limit;
7674     KMP_WARNING(CantLoadBalUsing, "KMP_DYNAMIC_MODE=thread limit");
7675 
7676     // Make this call behave like the thread limit algorithm.
7677     retval = __kmp_avail_proc - __kmp_nth +
7678              (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
7679     if (retval > set_nproc) {
7680       retval = set_nproc;
7681     }
7682     if (retval < KMP_MIN_NTH) {
7683       retval = KMP_MIN_NTH;
7684     }
7685 
7686     KB_TRACE(20, ("__kmp_load_balance_nproc: thread limit exit. retval:%d\n",
7687                   retval));
7688     return retval;
7689   }
7690 
7691   // There is a slight delay in the load balance algorithm in detecting new
7692   // running procs. The real system load at this instant should be at least as
7693   // large as the #active omp thread that are available to add to the team.
7694   if (system_active < team_curr_active) {
7695     system_active = team_curr_active;
7696   }
7697   retval = __kmp_avail_proc - system_active + team_curr_active;
7698   if (retval > set_nproc) {
7699     retval = set_nproc;
7700   }
7701   if (retval < KMP_MIN_NTH) {
7702     retval = KMP_MIN_NTH;
7703   }
7704 
7705   KB_TRACE(20, ("__kmp_load_balance_nproc: exit. retval:%d\n", retval));
7706   return retval;
7707 } // __kmp_load_balance_nproc()
7708 
7709 #endif /* USE_LOAD_BALANCE */
7710 
7711 /* ------------------------------------------------------------------------ */
7712 
7713 /* NOTE: this is called with the __kmp_init_lock held */
7714 void __kmp_cleanup(void) {
7715   int f;
7716 
7717   KA_TRACE(10, ("__kmp_cleanup: enter\n"));
7718 
7719   if (TCR_4(__kmp_init_parallel)) {
7720 #if KMP_HANDLE_SIGNALS
7721     __kmp_remove_signals();
7722 #endif
7723     TCW_4(__kmp_init_parallel, FALSE);
7724   }
7725 
7726   if (TCR_4(__kmp_init_middle)) {
7727 #if KMP_AFFINITY_SUPPORTED
7728     __kmp_affinity_uninitialize();
7729 #endif /* KMP_AFFINITY_SUPPORTED */
7730     __kmp_cleanup_hierarchy();
7731     TCW_4(__kmp_init_middle, FALSE);
7732   }
7733 
7734   KA_TRACE(10, ("__kmp_cleanup: go serial cleanup\n"));
7735 
7736   if (__kmp_init_serial) {
7737     __kmp_runtime_destroy();
7738     __kmp_init_serial = FALSE;
7739   }
7740 
7741   __kmp_cleanup_threadprivate_caches();
7742 
7743   for (f = 0; f < __kmp_threads_capacity; f++) {
7744     if (__kmp_root[f] != NULL) {
7745       __kmp_free(__kmp_root[f]);
7746       __kmp_root[f] = NULL;
7747     }
7748   }
7749   __kmp_free(__kmp_threads);
7750   // __kmp_threads and __kmp_root were allocated at once, as single block, so
7751   // there is no need in freeing __kmp_root.
7752   __kmp_threads = NULL;
7753   __kmp_root = NULL;
7754   __kmp_threads_capacity = 0;
7755 
7756 #if KMP_USE_DYNAMIC_LOCK
7757   __kmp_cleanup_indirect_user_locks();
7758 #else
7759   __kmp_cleanup_user_locks();
7760 #endif
7761 
7762 #if KMP_AFFINITY_SUPPORTED
7763   KMP_INTERNAL_FREE(CCAST(char *, __kmp_cpuinfo_file));
7764   __kmp_cpuinfo_file = NULL;
7765 #endif /* KMP_AFFINITY_SUPPORTED */
7766 
7767 #if KMP_USE_ADAPTIVE_LOCKS
7768 #if KMP_DEBUG_ADAPTIVE_LOCKS
7769   __kmp_print_speculative_stats();
7770 #endif
7771 #endif
7772   KMP_INTERNAL_FREE(__kmp_nested_nth.nth);
7773   __kmp_nested_nth.nth = NULL;
7774   __kmp_nested_nth.size = 0;
7775   __kmp_nested_nth.used = 0;
7776   KMP_INTERNAL_FREE(__kmp_nested_proc_bind.bind_types);
7777   __kmp_nested_proc_bind.bind_types = NULL;
7778   __kmp_nested_proc_bind.size = 0;
7779   __kmp_nested_proc_bind.used = 0;
7780   if (__kmp_affinity_format) {
7781     KMP_INTERNAL_FREE(__kmp_affinity_format);
7782     __kmp_affinity_format = NULL;
7783   }
7784 
7785   __kmp_i18n_catclose();
7786 
7787 #if KMP_USE_HIER_SCHED
7788   __kmp_hier_scheds.deallocate();
7789 #endif
7790 
7791 #if KMP_STATS_ENABLED
7792   __kmp_stats_fini();
7793 #endif
7794 
7795   KA_TRACE(10, ("__kmp_cleanup: exit\n"));
7796 }
7797 
7798 /* ------------------------------------------------------------------------ */
7799 
7800 int __kmp_ignore_mppbeg(void) {
7801   char *env;
7802 
7803   if ((env = getenv("KMP_IGNORE_MPPBEG")) != NULL) {
7804     if (__kmp_str_match_false(env))
7805       return FALSE;
7806   }
7807   // By default __kmpc_begin() is no-op.
7808   return TRUE;
7809 }
7810 
7811 int __kmp_ignore_mppend(void) {
7812   char *env;
7813 
7814   if ((env = getenv("KMP_IGNORE_MPPEND")) != NULL) {
7815     if (__kmp_str_match_false(env))
7816       return FALSE;
7817   }
7818   // By default __kmpc_end() is no-op.
7819   return TRUE;
7820 }
7821 
7822 void __kmp_internal_begin(void) {
7823   int gtid;
7824   kmp_root_t *root;
7825 
7826   /* this is a very important step as it will register new sibling threads
7827      and assign these new uber threads a new gtid */
7828   gtid = __kmp_entry_gtid();
7829   root = __kmp_threads[gtid]->th.th_root;
7830   KMP_ASSERT(KMP_UBER_GTID(gtid));
7831 
7832   if (root->r.r_begin)
7833     return;
7834   __kmp_acquire_lock(&root->r.r_begin_lock, gtid);
7835   if (root->r.r_begin) {
7836     __kmp_release_lock(&root->r.r_begin_lock, gtid);
7837     return;
7838   }
7839 
7840   root->r.r_begin = TRUE;
7841 
7842   __kmp_release_lock(&root->r.r_begin_lock, gtid);
7843 }
7844 
7845 /* ------------------------------------------------------------------------ */
7846 
7847 void __kmp_user_set_library(enum library_type arg) {
7848   int gtid;
7849   kmp_root_t *root;
7850   kmp_info_t *thread;
7851 
7852   /* first, make sure we are initialized so we can get our gtid */
7853 
7854   gtid = __kmp_entry_gtid();
7855   thread = __kmp_threads[gtid];
7856 
7857   root = thread->th.th_root;
7858 
7859   KA_TRACE(20, ("__kmp_user_set_library: enter T#%d, arg: %d, %d\n", gtid, arg,
7860                 library_serial));
7861   if (root->r.r_in_parallel) { /* Must be called in serial section of top-level
7862                                   thread */
7863     KMP_WARNING(SetLibraryIncorrectCall);
7864     return;
7865   }
7866 
7867   switch (arg) {
7868   case library_serial:
7869     thread->th.th_set_nproc = 0;
7870     set__nproc(thread, 1);
7871     break;
7872   case library_turnaround:
7873     thread->th.th_set_nproc = 0;
7874     set__nproc(thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth
7875                                            : __kmp_dflt_team_nth_ub);
7876     break;
7877   case library_throughput:
7878     thread->th.th_set_nproc = 0;
7879     set__nproc(thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth
7880                                            : __kmp_dflt_team_nth_ub);
7881     break;
7882   default:
7883     KMP_FATAL(UnknownLibraryType, arg);
7884   }
7885 
7886   __kmp_aux_set_library(arg);
7887 }
7888 
7889 void __kmp_aux_set_stacksize(size_t arg) {
7890   if (!__kmp_init_serial)
7891     __kmp_serial_initialize();
7892 
7893 #if KMP_OS_DARWIN
7894   if (arg & (0x1000 - 1)) {
7895     arg &= ~(0x1000 - 1);
7896     if (arg + 0x1000) /* check for overflow if we round up */
7897       arg += 0x1000;
7898   }
7899 #endif
7900   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7901 
7902   /* only change the default stacksize before the first parallel region */
7903   if (!TCR_4(__kmp_init_parallel)) {
7904     size_t value = arg; /* argument is in bytes */
7905 
7906     if (value < __kmp_sys_min_stksize)
7907       value = __kmp_sys_min_stksize;
7908     else if (value > KMP_MAX_STKSIZE)
7909       value = KMP_MAX_STKSIZE;
7910 
7911     __kmp_stksize = value;
7912 
7913     __kmp_env_stksize = TRUE; /* was KMP_STACKSIZE specified? */
7914   }
7915 
7916   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7917 }
7918 
7919 /* set the behaviour of the runtime library */
7920 /* TODO this can cause some odd behaviour with sibling parallelism... */
7921 void __kmp_aux_set_library(enum library_type arg) {
7922   __kmp_library = arg;
7923 
7924   switch (__kmp_library) {
7925   case library_serial: {
7926     KMP_INFORM(LibraryIsSerial);
7927   } break;
7928   case library_turnaround:
7929     if (__kmp_use_yield == 1 && !__kmp_use_yield_exp_set)
7930       __kmp_use_yield = 2; // only yield when oversubscribed
7931     break;
7932   case library_throughput:
7933     if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME)
7934       __kmp_dflt_blocktime = 200;
7935     break;
7936   default:
7937     KMP_FATAL(UnknownLibraryType, arg);
7938   }
7939 }
7940 
7941 /* Getting team information common for all team API */
7942 // Returns NULL if not in teams construct
7943 static kmp_team_t *__kmp_aux_get_team_info(int &teams_serialized) {
7944   kmp_info_t *thr = __kmp_entry_thread();
7945   teams_serialized = 0;
7946   if (thr->th.th_teams_microtask) {
7947     kmp_team_t *team = thr->th.th_team;
7948     int tlevel = thr->th.th_teams_level; // the level of the teams construct
7949     int ii = team->t.t_level;
7950     teams_serialized = team->t.t_serialized;
7951     int level = tlevel + 1;
7952     KMP_DEBUG_ASSERT(ii >= tlevel);
7953     while (ii > level) {
7954       for (teams_serialized = team->t.t_serialized;
7955            (teams_serialized > 0) && (ii > level); teams_serialized--, ii--) {
7956       }
7957       if (team->t.t_serialized && (!teams_serialized)) {
7958         team = team->t.t_parent;
7959         continue;
7960       }
7961       if (ii > level) {
7962         team = team->t.t_parent;
7963         ii--;
7964       }
7965     }
7966     return team;
7967   }
7968   return NULL;
7969 }
7970 
7971 int __kmp_aux_get_team_num() {
7972   int serialized;
7973   kmp_team_t *team = __kmp_aux_get_team_info(serialized);
7974   if (team) {
7975     if (serialized > 1) {
7976       return 0; // teams region is serialized ( 1 team of 1 thread ).
7977     } else {
7978       return team->t.t_master_tid;
7979     }
7980   }
7981   return 0;
7982 }
7983 
7984 int __kmp_aux_get_num_teams() {
7985   int serialized;
7986   kmp_team_t *team = __kmp_aux_get_team_info(serialized);
7987   if (team) {
7988     if (serialized > 1) {
7989       return 1;
7990     } else {
7991       return team->t.t_parent->t.t_nproc;
7992     }
7993   }
7994   return 1;
7995 }
7996 
7997 /* ------------------------------------------------------------------------ */
7998 
7999 /*
8000  * Affinity Format Parser
8001  *
8002  * Field is in form of: %[[[0].]size]type
8003  * % and type are required (%% means print a literal '%')
8004  * type is either single char or long name surrounded by {},
8005  * e.g., N or {num_threads}
8006  * 0 => leading zeros
8007  * . => right justified when size is specified
8008  * by default output is left justified
8009  * size is the *minimum* field length
8010  * All other characters are printed as is
8011  *
 * Available field types (short name, {long name}):
 * t {team_num}          - omp_get_team_num()
 * T {num_teams}         - omp_get_num_teams()
 * L {nesting_level}     - omp_get_level()
 * n {thread_num}        - omp_get_thread_num()
 * N {num_threads}       - omp_get_num_threads()
 * a {ancestor_tnum}     - omp_get_ancestor_thread_num(omp_get_level()-1)
 * H {host}              - name of host machine
 * P {process_id}        - process id (integer)
 * i {native_thread_id}  - native thread identifier (integer)
 * A {thread_affinity}   - comma separated list of integers or integer ranges
 *                         (values of affinity mask); only available when
 *                         affinity is supported
 *
8023  * Implementation-specific field types can be added
8024  * If a type is unknown, print "undefined"
8025 */
8026 
// One entry of the affinity-format keyword table: the single-character name,
// the long name accepted inside braces, and the conversion character used
// when the value is formatted.
typedef struct kmp_affinity_format_field_t {
  char short_name; // short field name, e.g. 'L'
  const char *long_name; // long field name, e.g. "nesting_level"
  char field_format; // snprintf conversion: 'd' (integer) or 's' (string)
} kmp_affinity_format_field_t;

// All recognized field types. The parser scans this table in order and stops
// at the first entry whose name matches.
static const kmp_affinity_format_field_t __kmp_affinity_format_table[] = {
#if KMP_AFFINITY_SUPPORTED
    {'A', "thread_affinity", 's'},
#endif
    {'t', "team_num", 'd'},
    {'T', "num_teams", 'd'},
    {'L', "nesting_level", 'd'},
    {'n', "thread_num", 'd'},
    {'N', "num_threads", 'd'},
    {'a', "ancestor_tnum", 'd'},
    {'H', "host", 's'},
    {'P', "process_id", 'd'},
    {'i', "native_thread_id", 'd'},
};
8050 
8051 // Return the number of characters it takes to hold field
8052 static int __kmp_aux_capture_affinity_field(int gtid, const kmp_info_t *th,
8053                                             const char **ptr,
8054                                             kmp_str_buf_t *field_buffer) {
8055   int rc, format_index, field_value;
8056   const char *width_left, *width_right;
8057   bool pad_zeros, right_justify, parse_long_name, found_valid_name;
8058   static const int FORMAT_SIZE = 20;
8059   char format[FORMAT_SIZE] = {0};
8060   char absolute_short_name = 0;
8061 
8062   KMP_DEBUG_ASSERT(gtid >= 0);
8063   KMP_DEBUG_ASSERT(th);
8064   KMP_DEBUG_ASSERT(**ptr == '%');
8065   KMP_DEBUG_ASSERT(field_buffer);
8066 
8067   __kmp_str_buf_clear(field_buffer);
8068 
8069   // Skip the initial %
8070   (*ptr)++;
8071 
8072   // Check for %% first
8073   if (**ptr == '%') {
8074     __kmp_str_buf_cat(field_buffer, "%", 1);
8075     (*ptr)++; // skip over the second %
8076     return 1;
8077   }
8078 
8079   // Parse field modifiers if they are present
8080   pad_zeros = false;
8081   if (**ptr == '0') {
8082     pad_zeros = true;
8083     (*ptr)++; // skip over 0
8084   }
8085   right_justify = false;
8086   if (**ptr == '.') {
8087     right_justify = true;
8088     (*ptr)++; // skip over .
8089   }
8090   // Parse width of field: [width_left, width_right)
8091   width_left = width_right = NULL;
8092   if (**ptr >= '0' && **ptr <= '9') {
8093     width_left = *ptr;
8094     SKIP_DIGITS(*ptr);
8095     width_right = *ptr;
8096   }
8097 
8098   // Create the format for KMP_SNPRINTF based on flags parsed above
8099   format_index = 0;
8100   format[format_index++] = '%';
8101   if (!right_justify)
8102     format[format_index++] = '-';
8103   if (pad_zeros)
8104     format[format_index++] = '0';
8105   if (width_left && width_right) {
8106     int i = 0;
8107     // Only allow 8 digit number widths.
8108     // This also prevents overflowing format variable
8109     while (i < 8 && width_left < width_right) {
8110       format[format_index++] = *width_left;
8111       width_left++;
8112       i++;
8113     }
8114   }
8115 
8116   // Parse a name (long or short)
8117   // Canonicalize the name into absolute_short_name
8118   found_valid_name = false;
8119   parse_long_name = (**ptr == '{');
8120   if (parse_long_name)
8121     (*ptr)++; // skip initial left brace
8122   for (size_t i = 0; i < sizeof(__kmp_affinity_format_table) /
8123                              sizeof(__kmp_affinity_format_table[0]);
8124        ++i) {
8125     char short_name = __kmp_affinity_format_table[i].short_name;
8126     const char *long_name = __kmp_affinity_format_table[i].long_name;
8127     char field_format = __kmp_affinity_format_table[i].field_format;
8128     if (parse_long_name) {
8129       size_t length = KMP_STRLEN(long_name);
8130       if (strncmp(*ptr, long_name, length) == 0) {
8131         found_valid_name = true;
8132         (*ptr) += length; // skip the long name
8133       }
8134     } else if (**ptr == short_name) {
8135       found_valid_name = true;
8136       (*ptr)++; // skip the short name
8137     }
8138     if (found_valid_name) {
8139       format[format_index++] = field_format;
8140       format[format_index++] = '\0';
8141       absolute_short_name = short_name;
8142       break;
8143     }
8144   }
8145   if (parse_long_name) {
8146     if (**ptr != '}') {
8147       absolute_short_name = 0;
8148     } else {
8149       (*ptr)++; // skip over the right brace
8150     }
8151   }
8152 
8153   // Attempt to fill the buffer with the requested
8154   // value using snprintf within __kmp_str_buf_print()
8155   switch (absolute_short_name) {
8156   case 't':
8157     rc = __kmp_str_buf_print(field_buffer, format, __kmp_aux_get_team_num());
8158     break;
8159   case 'T':
8160     rc = __kmp_str_buf_print(field_buffer, format, __kmp_aux_get_num_teams());
8161     break;
8162   case 'L':
8163     rc = __kmp_str_buf_print(field_buffer, format, th->th.th_team->t.t_level);
8164     break;
8165   case 'n':
8166     rc = __kmp_str_buf_print(field_buffer, format, __kmp_tid_from_gtid(gtid));
8167     break;
8168   case 'H': {
8169     static const int BUFFER_SIZE = 256;
8170     char buf[BUFFER_SIZE];
8171     __kmp_expand_host_name(buf, BUFFER_SIZE);
8172     rc = __kmp_str_buf_print(field_buffer, format, buf);
8173   } break;
8174   case 'P':
8175     rc = __kmp_str_buf_print(field_buffer, format, getpid());
8176     break;
8177   case 'i':
8178     rc = __kmp_str_buf_print(field_buffer, format, __kmp_gettid());
8179     break;
8180   case 'N':
8181     rc = __kmp_str_buf_print(field_buffer, format, th->th.th_team->t.t_nproc);
8182     break;
8183   case 'a':
8184     field_value =
8185         __kmp_get_ancestor_thread_num(gtid, th->th.th_team->t.t_level - 1);
8186     rc = __kmp_str_buf_print(field_buffer, format, field_value);
8187     break;
8188 #if KMP_AFFINITY_SUPPORTED
8189   case 'A': {
8190     kmp_str_buf_t buf;
8191     __kmp_str_buf_init(&buf);
8192     __kmp_affinity_str_buf_mask(&buf, th->th.th_affin_mask);
8193     rc = __kmp_str_buf_print(field_buffer, format, buf.str);
8194     __kmp_str_buf_free(&buf);
8195   } break;
8196 #endif
8197   default:
8198     // According to spec, If an implementation does not have info for field
8199     // type, then "undefined" is printed
8200     rc = __kmp_str_buf_print(field_buffer, "%s", "undefined");
8201     // Skip the field
8202     if (parse_long_name) {
8203       SKIP_TOKEN(*ptr);
8204       if (**ptr == '}')
8205         (*ptr)++;
8206     } else {
8207       (*ptr)++;
8208     }
8209   }
8210 
8211   KMP_ASSERT(format_index <= FORMAT_SIZE);
8212   return rc;
8213 }
8214 
8215 /*
8216  * Return number of characters needed to hold the affinity string
8217  * (not including null byte character)
8218  * The resultant string is printed to buffer, which the caller can then
8219  * handle afterwards
8220 */
8221 size_t __kmp_aux_capture_affinity(int gtid, const char *format,
8222                                   kmp_str_buf_t *buffer) {
8223   const char *parse_ptr;
8224   size_t retval;
8225   const kmp_info_t *th;
8226   kmp_str_buf_t field;
8227 
8228   KMP_DEBUG_ASSERT(buffer);
8229   KMP_DEBUG_ASSERT(gtid >= 0);
8230 
8231   __kmp_str_buf_init(&field);
8232   __kmp_str_buf_clear(buffer);
8233 
8234   th = __kmp_threads[gtid];
8235   retval = 0;
8236 
8237   // If format is NULL or zero-length string, then we use
8238   // affinity-format-var ICV
8239   parse_ptr = format;
8240   if (parse_ptr == NULL || *parse_ptr == '\0') {
8241     parse_ptr = __kmp_affinity_format;
8242   }
8243   KMP_DEBUG_ASSERT(parse_ptr);
8244 
8245   while (*parse_ptr != '\0') {
8246     // Parse a field
8247     if (*parse_ptr == '%') {
8248       // Put field in the buffer
8249       int rc = __kmp_aux_capture_affinity_field(gtid, th, &parse_ptr, &field);
8250       __kmp_str_buf_catbuf(buffer, &field);
8251       retval += rc;
8252     } else {
8253       // Put literal character in buffer
8254       __kmp_str_buf_cat(buffer, parse_ptr, 1);
8255       retval++;
8256       parse_ptr++;
8257     }
8258   }
8259   __kmp_str_buf_free(&field);
8260   return retval;
8261 }
8262 
8263 // Displays the affinity string to stdout
8264 void __kmp_aux_display_affinity(int gtid, const char *format) {
8265   kmp_str_buf_t buf;
8266   __kmp_str_buf_init(&buf);
8267   __kmp_aux_capture_affinity(gtid, format, &buf);
8268   __kmp_fprintf(kmp_out, "%s" KMP_END_OF_LINE, buf.str);
8269   __kmp_str_buf_free(&buf);
8270 }
8271 
8272 /* ------------------------------------------------------------------------ */
8273 
8274 void __kmp_aux_set_blocktime(int arg, kmp_info_t *thread, int tid) {
8275   int blocktime = arg; /* argument is in milliseconds */
8276 #if KMP_USE_MONITOR
8277   int bt_intervals;
8278 #endif
8279   kmp_int8 bt_set;
8280 
8281   __kmp_save_internal_controls(thread);
8282 
8283   /* Normalize and set blocktime for the teams */
8284   if (blocktime < KMP_MIN_BLOCKTIME)
8285     blocktime = KMP_MIN_BLOCKTIME;
8286   else if (blocktime > KMP_MAX_BLOCKTIME)
8287     blocktime = KMP_MAX_BLOCKTIME;
8288 
8289   set__blocktime_team(thread->th.th_team, tid, blocktime);
8290   set__blocktime_team(thread->th.th_serial_team, 0, blocktime);
8291 
8292 #if KMP_USE_MONITOR
8293   /* Calculate and set blocktime intervals for the teams */
8294   bt_intervals = KMP_INTERVALS_FROM_BLOCKTIME(blocktime, __kmp_monitor_wakeups);
8295 
8296   set__bt_intervals_team(thread->th.th_team, tid, bt_intervals);
8297   set__bt_intervals_team(thread->th.th_serial_team, 0, bt_intervals);
8298 #endif
8299 
8300   /* Set whether blocktime has been set to "TRUE" */
8301   bt_set = TRUE;
8302 
8303   set__bt_set_team(thread->th.th_team, tid, bt_set);
8304   set__bt_set_team(thread->th.th_serial_team, 0, bt_set);
8305 #if KMP_USE_MONITOR
8306   KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d, "
8307                 "bt_intervals=%d, monitor_updates=%d\n",
8308                 __kmp_gtid_from_tid(tid, thread->th.th_team),
8309                 thread->th.th_team->t.t_id, tid, blocktime, bt_intervals,
8310                 __kmp_monitor_wakeups));
8311 #else
8312   KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d\n",
8313                 __kmp_gtid_from_tid(tid, thread->th.th_team),
8314                 thread->th.th_team->t.t_id, tid, blocktime));
8315 #endif
8316 }
8317 
8318 void __kmp_aux_set_defaults(char const *str, size_t len) {
8319   if (!__kmp_init_serial) {
8320     __kmp_serial_initialize();
8321   }
8322   __kmp_env_initialize(str);
8323 
8324   if (__kmp_settings || __kmp_display_env || __kmp_display_env_verbose) {
8325     __kmp_env_print();
8326   }
8327 } // __kmp_aux_set_defaults
8328 
8329 /* ------------------------------------------------------------------------ */
8330 /* internal fast reduction routines */
8331 
8332 PACKED_REDUCTION_METHOD_T
8333 __kmp_determine_reduction_method(
8334     ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars, size_t reduce_size,
8335     void *reduce_data, void (*reduce_func)(void *lhs_data, void *rhs_data),
8336     kmp_critical_name *lck) {
8337 
8338   // Default reduction method: critical construct ( lck != NULL, like in current
8339   // PAROPT )
8340   // If ( reduce_data!=NULL && reduce_func!=NULL ): the tree-reduction method
8341   // can be selected by RTL
8342   // If loc->flags contains KMP_IDENT_ATOMIC_REDUCE, the atomic reduce method
8343   // can be selected by RTL
8344   // Finally, it's up to OpenMP RTL to make a decision on which method to select
8345   // among generated by PAROPT.
8346 
8347   PACKED_REDUCTION_METHOD_T retval;
8348 
8349   int team_size;
8350 
8351   KMP_DEBUG_ASSERT(loc); // it would be nice to test ( loc != 0 )
8352   KMP_DEBUG_ASSERT(lck); // it would be nice to test ( lck != 0 )
8353 
8354 #define FAST_REDUCTION_ATOMIC_METHOD_GENERATED                                 \
8355   ((loc->flags & (KMP_IDENT_ATOMIC_REDUCE)) == (KMP_IDENT_ATOMIC_REDUCE))
8356 #define FAST_REDUCTION_TREE_METHOD_GENERATED ((reduce_data) && (reduce_func))
8357 
8358   retval = critical_reduce_block;
8359 
8360   // another choice of getting a team size (with 1 dynamic deference) is slower
8361   team_size = __kmp_get_team_num_threads(global_tid);
8362   if (team_size == 1) {
8363 
8364     retval = empty_reduce_block;
8365 
8366   } else {
8367 
8368     int atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;
8369 
8370 #if KMP_ARCH_X86_64 || KMP_ARCH_PPC64 || KMP_ARCH_AARCH64 ||                   \
8371     KMP_ARCH_MIPS64 || KMP_ARCH_RISCV64
8372 
8373 #if KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD ||     \
8374     KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN || KMP_OS_HURD
8375 
8376     int teamsize_cutoff = 4;
8377 
8378 #if KMP_MIC_SUPPORTED
8379     if (__kmp_mic_type != non_mic) {
8380       teamsize_cutoff = 8;
8381     }
8382 #endif
8383     int tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
8384     if (tree_available) {
8385       if (team_size <= teamsize_cutoff) {
8386         if (atomic_available) {
8387           retval = atomic_reduce_block;
8388         }
8389       } else {
8390         retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER;
8391       }
8392     } else if (atomic_available) {
8393       retval = atomic_reduce_block;
8394     }
8395 #else
8396 #error "Unknown or unsupported OS"
8397 #endif // KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD ||
8398        // KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN || KMP_OS_HURD
8399 
8400 #elif KMP_ARCH_X86 || KMP_ARCH_ARM || KMP_ARCH_AARCH || KMP_ARCH_MIPS
8401 
8402 #if KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_WINDOWS || KMP_OS_HURD
8403 
8404     // basic tuning
8405 
8406     if (atomic_available) {
8407       if (num_vars <= 2) { // && ( team_size <= 8 ) due to false-sharing ???
8408         retval = atomic_reduce_block;
8409       }
8410     } // otherwise: use critical section
8411 
8412 #elif KMP_OS_DARWIN
8413 
8414     int tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
8415     if (atomic_available && (num_vars <= 3)) {
8416       retval = atomic_reduce_block;
8417     } else if (tree_available) {
8418       if ((reduce_size > (9 * sizeof(kmp_real64))) &&
8419           (reduce_size < (2000 * sizeof(kmp_real64)))) {
8420         retval = TREE_REDUCE_BLOCK_WITH_PLAIN_BARRIER;
8421       }
8422     } // otherwise: use critical section
8423 
8424 #else
8425 #error "Unknown or unsupported OS"
8426 #endif
8427 
8428 #else
8429 #error "Unknown or unsupported architecture"
8430 #endif
8431   }
8432 
8433   // KMP_FORCE_REDUCTION
8434 
8435   // If the team is serialized (team_size == 1), ignore the forced reduction
8436   // method and stay with the unsynchronized method (empty_reduce_block)
8437   if (__kmp_force_reduction_method != reduction_method_not_defined &&
8438       team_size != 1) {
8439 
8440     PACKED_REDUCTION_METHOD_T forced_retval = critical_reduce_block;
8441 
8442     int atomic_available, tree_available;
8443 
8444     switch ((forced_retval = __kmp_force_reduction_method)) {
8445     case critical_reduce_block:
8446       KMP_ASSERT(lck); // lck should be != 0
8447       break;
8448 
8449     case atomic_reduce_block:
8450       atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;
8451       if (!atomic_available) {
8452         KMP_WARNING(RedMethodNotSupported, "atomic");
8453         forced_retval = critical_reduce_block;
8454       }
8455       break;
8456 
8457     case tree_reduce_block:
8458       tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
8459       if (!tree_available) {
8460         KMP_WARNING(RedMethodNotSupported, "tree");
8461         forced_retval = critical_reduce_block;
8462       } else {
8463 #if KMP_FAST_REDUCTION_BARRIER
8464         forced_retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER;
8465 #endif
8466       }
8467       break;
8468 
8469     default:
8470       KMP_ASSERT(0); // "unsupported method specified"
8471     }
8472 
8473     retval = forced_retval;
8474   }
8475 
8476   KA_TRACE(10, ("reduction method selected=%08x\n", retval));
8477 
8478 #undef FAST_REDUCTION_TREE_METHOD_GENERATED
8479 #undef FAST_REDUCTION_ATOMIC_METHOD_GENERATED
8480 
8481   return (retval);
8482 }
8483 // this function is for testing set/get/determine reduce method
8484 kmp_int32 __kmp_get_reduce_method(void) {
8485   return ((__kmp_entry_thread()->th.th_local.packed_reduction_method) >> 8);
8486 }
8487 
8488 // Soft pause sets up threads to ignore blocktime and just go to sleep.
8489 // Spin-wait code checks __kmp_pause_status and reacts accordingly.
8490 void __kmp_soft_pause() { __kmp_pause_status = kmp_soft_paused; }
8491 
8492 // Hard pause shuts down the runtime completely.  Resume happens naturally when
8493 // OpenMP is used subsequently.
8494 void __kmp_hard_pause() {
8495   __kmp_pause_status = kmp_hard_paused;
8496   __kmp_internal_end_thread(-1);
8497 }
8498 
8499 // Soft resume sets __kmp_pause_status, and wakes up all threads.
8500 void __kmp_resume_if_soft_paused() {
8501   if (__kmp_pause_status == kmp_soft_paused) {
8502     __kmp_pause_status = kmp_not_paused;
8503 
8504     for (int gtid = 1; gtid < __kmp_threads_capacity; ++gtid) {
8505       kmp_info_t *thread = __kmp_threads[gtid];
8506       if (thread) { // Wake it if sleeping
8507         kmp_flag_64<> fl(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go,
8508                          thread);
8509         if (fl.is_sleeping())
8510           fl.resume(gtid);
8511         else if (__kmp_try_suspend_mx(thread)) { // got suspend lock
8512           __kmp_unlock_suspend_mx(thread); // unlock it; it won't sleep
8513         } else { // thread holds the lock and may sleep soon
8514           do { // until either the thread sleeps, or we can get the lock
8515             if (fl.is_sleeping()) {
8516               fl.resume(gtid);
8517               break;
8518             } else if (__kmp_try_suspend_mx(thread)) {
8519               __kmp_unlock_suspend_mx(thread);
8520               break;
8521             }
8522           } while (1);
8523         }
8524       }
8525     }
8526   }
8527 }
8528 
8529 // This function is called via __kmpc_pause_resource. Returns 0 if successful.
8530 // TODO: add warning messages
8531 int __kmp_pause_resource(kmp_pause_status_t level) {
8532   if (level == kmp_not_paused) { // requesting resume
8533     if (__kmp_pause_status == kmp_not_paused) {
8534       // error message about runtime not being paused, so can't resume
8535       return 1;
8536     } else {
8537       KMP_DEBUG_ASSERT(__kmp_pause_status == kmp_soft_paused ||
8538                        __kmp_pause_status == kmp_hard_paused);
8539       __kmp_pause_status = kmp_not_paused;
8540       return 0;
8541     }
8542   } else if (level == kmp_soft_paused) { // requesting soft pause
8543     if (__kmp_pause_status != kmp_not_paused) {
8544       // error message about already being paused
8545       return 1;
8546     } else {
8547       __kmp_soft_pause();
8548       return 0;
8549     }
8550   } else if (level == kmp_hard_paused) { // requesting hard pause
8551     if (__kmp_pause_status != kmp_not_paused) {
8552       // error message about already being paused
8553       return 1;
8554     } else {
8555       __kmp_hard_pause();
8556       return 0;
8557     }
8558   } else {
8559     // error message about invalid level
8560     return 1;
8561   }
8562 }
8563 
8564 void __kmp_omp_display_env(int verbose) {
8565   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
8566   if (__kmp_init_serial == 0)
8567     __kmp_do_serial_initialize();
8568   __kmp_display_env_impl(!verbose, verbose);
8569   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
8570 }
8571 
8572 // Globals and functions for hidden helper task
8573 kmp_info_t **__kmp_hidden_helper_threads;
8574 kmp_info_t *__kmp_hidden_helper_main_thread;
8575 kmp_int32 __kmp_hidden_helper_threads_num = 8;
8576 std::atomic<kmp_int32> __kmp_unexecuted_hidden_helper_tasks;
8577 #if KMP_OS_LINUX
8578 kmp_int32 __kmp_enable_hidden_helper = TRUE;
8579 #else
8580 kmp_int32 __kmp_enable_hidden_helper = FALSE;
8581 #endif
8582 
8583 namespace {
8584 std::atomic<kmp_int32> __kmp_hit_hidden_helper_threads_num;
8585 
8586 void __kmp_hidden_helper_wrapper_fn(int *gtid, int *, ...) {
8587   // This is an explicit synchronization on all hidden helper threads in case
8588   // that when a regular thread pushes a hidden helper task to one hidden
8589   // helper thread, the thread has not been awaken once since they're released
8590   // by the main thread after creating the team.
8591   KMP_ATOMIC_INC(&__kmp_hit_hidden_helper_threads_num);
8592   while (KMP_ATOMIC_LD_ACQ(&__kmp_hit_hidden_helper_threads_num) !=
8593          __kmp_hidden_helper_threads_num)
8594     ;
8595 
8596   // If main thread, then wait for signal
8597   if (__kmpc_master(nullptr, *gtid)) {
8598     // First, unset the initial state and release the initial thread
8599     TCW_4(__kmp_init_hidden_helper_threads, FALSE);
8600     __kmp_hidden_helper_initz_release();
8601     __kmp_hidden_helper_main_thread_wait();
8602     // Now wake up all worker threads
8603     for (int i = 1; i < __kmp_hit_hidden_helper_threads_num; ++i) {
8604       __kmp_hidden_helper_worker_thread_signal();
8605     }
8606   }
8607 }
8608 } // namespace
8609 
8610 void __kmp_hidden_helper_threads_initz_routine() {
8611   // Create a new root for hidden helper team/threads
8612   const int gtid = __kmp_register_root(TRUE);
8613   __kmp_hidden_helper_main_thread = __kmp_threads[gtid];
8614   __kmp_hidden_helper_threads = &__kmp_threads[gtid];
8615   __kmp_hidden_helper_main_thread->th.th_set_nproc =
8616       __kmp_hidden_helper_threads_num;
8617 
8618   KMP_ATOMIC_ST_REL(&__kmp_hit_hidden_helper_threads_num, 0);
8619 
8620   __kmpc_fork_call(nullptr, 0, __kmp_hidden_helper_wrapper_fn);
8621 
8622   // Set the initialization flag to FALSE
8623   TCW_SYNC_4(__kmp_init_hidden_helper, FALSE);
8624 
8625   __kmp_hidden_helper_threads_deinitz_release();
8626 }
8627