xref: /freebsd/contrib/llvm-project/openmp/runtime/src/kmp_runtime.cpp (revision b64c5a0ace59af62eff52bfe110a521dc73c937b)
1 /*
2  * kmp_runtime.cpp -- KPTS runtime support library
3  */
4 
5 //===----------------------------------------------------------------------===//
6 //
7 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8 // See https://llvm.org/LICENSE.txt for license information.
9 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "kmp.h"
14 #include "kmp_affinity.h"
15 #include "kmp_atomic.h"
16 #include "kmp_environment.h"
17 #include "kmp_error.h"
18 #include "kmp_i18n.h"
19 #include "kmp_io.h"
20 #include "kmp_itt.h"
21 #include "kmp_settings.h"
22 #include "kmp_stats.h"
23 #include "kmp_str.h"
24 #include "kmp_wait_release.h"
25 #include "kmp_wrapper_getpid.h"
26 #include "kmp_dispatch.h"
27 #include "kmp_utils.h"
28 #if KMP_USE_HIER_SCHED
29 #include "kmp_dispatch_hier.h"
30 #endif
31 
32 #if OMPT_SUPPORT
33 #include "ompt-specific.h"
34 #endif
35 #if OMPD_SUPPORT
36 #include "ompd-specific.h"
37 #endif
38 
39 #if OMP_PROFILING_SUPPORT
40 #include "llvm/Support/TimeProfiler.h"
41 static char *ProfileTraceFile = nullptr;
42 #endif
43 
44 /* these are temporary issues to be dealt with */
45 #define KMP_USE_PRCTL 0
46 
47 #if KMP_OS_WINDOWS
48 #include <process.h>
49 #endif
50 
51 #ifndef KMP_USE_SHM
52 // Windows and WASI do not need these include files as they don't use shared
53 // memory.
54 #else
55 #include <sys/mman.h>
56 #include <sys/stat.h>
57 #include <fcntl.h>
58 #define SHM_SIZE 1024
59 #endif
60 
61 #if defined(KMP_GOMP_COMPAT)
62 char const __kmp_version_alt_comp[] =
63     KMP_VERSION_PREFIX "alternative compiler support: yes";
64 #endif /* defined(KMP_GOMP_COMPAT) */
65 
66 char const __kmp_version_omp_api[] =
67     KMP_VERSION_PREFIX "API version: 5.0 (201611)";
68 
69 #ifdef KMP_DEBUG
70 char const __kmp_version_lock[] =
71     KMP_VERSION_PREFIX "lock type: run time selectable";
72 #endif /* KMP_DEBUG */
73 
74 #define KMP_MIN(x, y) ((x) < (y) ? (x) : (y))
75 
76 /* ------------------------------------------------------------------------ */
77 
78 #if KMP_USE_MONITOR
79 kmp_info_t __kmp_monitor;
80 #endif
81 
82 /* Forward declarations */
83 
84 void __kmp_cleanup(void);
85 
86 static void __kmp_initialize_info(kmp_info_t *, kmp_team_t *, int tid,
87                                   int gtid);
88 static void __kmp_initialize_team(kmp_team_t *team, int new_nproc,
89                                   kmp_internal_control_t *new_icvs,
90                                   ident_t *loc);
91 #if KMP_AFFINITY_SUPPORTED
92 static void __kmp_partition_places(kmp_team_t *team,
93                                    int update_master_only = 0);
94 #endif
95 static void __kmp_do_serial_initialize(void);
96 void __kmp_fork_barrier(int gtid, int tid);
97 void __kmp_join_barrier(int gtid);
98 void __kmp_setup_icv_copy(kmp_team_t *team, int new_nproc,
99                           kmp_internal_control_t *new_icvs, ident_t *loc);
100 
101 #ifdef USE_LOAD_BALANCE
102 static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc);
103 #endif
104 
105 static int __kmp_expand_threads(int nNeed);
106 #if KMP_OS_WINDOWS
107 static int __kmp_unregister_root_other_thread(int gtid);
108 #endif
109 static void __kmp_reap_thread(kmp_info_t *thread, int is_root);
110 kmp_info_t *__kmp_thread_pool_insert_pt = NULL;
111 
112 void __kmp_resize_dist_barrier(kmp_team_t *team, int old_nthreads,
113                                int new_nthreads);
114 void __kmp_add_threads_to_team(kmp_team_t *team, int new_nthreads);
115 
116 static kmp_nested_nthreads_t *__kmp_override_nested_nth(kmp_info_t *thr,
117                                                         int level) {
118   kmp_nested_nthreads_t *new_nested_nth =
119       (kmp_nested_nthreads_t *)KMP_INTERNAL_MALLOC(
120           sizeof(kmp_nested_nthreads_t));
121   int new_size = level + thr->th.th_set_nested_nth_sz;
122   new_nested_nth->nth = (int *)KMP_INTERNAL_MALLOC(new_size * sizeof(int));
123   for (int i = 0; i < level + 1; ++i)
124     new_nested_nth->nth[i] = 0;
125   for (int i = level + 1, j = 1; i < new_size; ++i, ++j)
126     new_nested_nth->nth[i] = thr->th.th_set_nested_nth[j];
127   new_nested_nth->size = new_nested_nth->used = new_size;
128   return new_nested_nth;
129 }
130 
131 /* Calculate the identifier of the current thread */
132 /* fast (and somewhat portable) way to get unique identifier of executing
133    thread. Returns KMP_GTID_DNE if we haven't been assigned a gtid. */
134 int __kmp_get_global_thread_id() {
135   int i;
136   kmp_info_t **other_threads;
137   size_t stack_data;
138   char *stack_addr;
139   size_t stack_size;
140   char *stack_base;
141 
142   KA_TRACE(
143       1000,
144       ("*** __kmp_get_global_thread_id: entering, nproc=%d  all_nproc=%d\n",
145        __kmp_nth, __kmp_all_nth));
146 
147   /* JPH - to handle the case where __kmpc_end(0) is called immediately prior to
148      a parallel region, made it return KMP_GTID_DNE to force serial_initialize
149      by caller. Had to handle KMP_GTID_DNE at all call-sites, or else guarantee
150      __kmp_init_gtid for this to work. */
151 
152   if (!TCR_4(__kmp_init_gtid))
153     return KMP_GTID_DNE;
154 
155 #ifdef KMP_TDATA_GTID
156   if (TCR_4(__kmp_gtid_mode) >= 3) {
157     KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using TDATA\n"));
158     return __kmp_gtid;
159   }
160 #endif
161   if (TCR_4(__kmp_gtid_mode) >= 2) {
162     KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using keyed TLS\n"));
163     return __kmp_gtid_get_specific();
164   }
165   KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using internal alg.\n"));
166 
167   stack_addr = (char *)&stack_data;
168   other_threads = __kmp_threads;
169 
170   /* ATT: The code below is a source of potential bugs due to unsynchronized
171      access to __kmp_threads array. For example:
172      1. Current thread loads other_threads[i] to thr and checks it, it is
173         non-NULL.
174      2. Current thread is suspended by OS.
175      3. Another thread unregisters and finishes (debug versions of free()
176         may fill memory with something like 0xEF).
177      4. Current thread is resumed.
178      5. Current thread reads junk from *thr.
179      TODO: Fix it.  --ln  */
180 
181   for (i = 0; i < __kmp_threads_capacity; i++) {
182 
183     kmp_info_t *thr = (kmp_info_t *)TCR_SYNC_PTR(other_threads[i]);
184     if (!thr)
185       continue;
186 
187     stack_size = (size_t)TCR_PTR(thr->th.th_info.ds.ds_stacksize);
188     stack_base = (char *)TCR_PTR(thr->th.th_info.ds.ds_stackbase);
189 
190     /* stack grows down -- search through all of the active threads */
191 
192     if (stack_addr <= stack_base) {
193       size_t stack_diff = stack_base - stack_addr;
194 
195       if (stack_diff <= stack_size) {
196         /* The only way we can be closer than the allocated */
197         /* stack size is if we are running on this thread. */
198         // __kmp_gtid_get_specific can return negative value because this
199         // function can be called by thread destructor. However, before the
200         // thread destructor is called, the value of the corresponding
201         // thread-specific data will be reset to NULL.
202         KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() < 0 ||
203                          __kmp_gtid_get_specific() == i);
204         return i;
205       }
206     }
207   }
208 
209   /* get specific to try and determine our gtid */
210   KA_TRACE(1000,
211            ("*** __kmp_get_global_thread_id: internal alg. failed to find "
212             "thread, using TLS\n"));
213   i = __kmp_gtid_get_specific();
214 
215   /*fprintf( stderr, "=== %d\n", i );  */ /* GROO */
216 
217   /* if we havn't been assigned a gtid, then return code */
218   if (i < 0)
219     return i;
220 
221   // other_threads[i] can be nullptr at this point because the corresponding
222   // thread could have already been destructed. It can happen when this function
223   // is called in end library routine.
224   if (!TCR_SYNC_PTR(other_threads[i]))
225     return i;
226 
227   /* dynamically updated stack window for uber threads to avoid get_specific
228      call */
229   if (!TCR_4(other_threads[i]->th.th_info.ds.ds_stackgrow)) {
230     KMP_FATAL(StackOverflow, i);
231   }
232 
233   stack_base = (char *)other_threads[i]->th.th_info.ds.ds_stackbase;
234   if (stack_addr > stack_base) {
235     TCW_PTR(other_threads[i]->th.th_info.ds.ds_stackbase, stack_addr);
236     TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize,
237             other_threads[i]->th.th_info.ds.ds_stacksize + stack_addr -
238                 stack_base);
239   } else {
240     TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize,
241             stack_base - stack_addr);
242   }
243 
244   /* Reprint stack bounds for ubermaster since they have been refined */
245   if (__kmp_storage_map) {
246     char *stack_end = (char *)other_threads[i]->th.th_info.ds.ds_stackbase;
247     char *stack_beg = stack_end - other_threads[i]->th.th_info.ds.ds_stacksize;
248     __kmp_print_storage_map_gtid(i, stack_beg, stack_end,
249                                  other_threads[i]->th.th_info.ds.ds_stacksize,
250                                  "th_%d stack (refinement)", i);
251   }
252   return i;
253 }
254 
255 int __kmp_get_global_thread_id_reg() {
256   int gtid;
257 
258   if (!__kmp_init_serial) {
259     gtid = KMP_GTID_DNE;
260   } else
261 #ifdef KMP_TDATA_GTID
262       if (TCR_4(__kmp_gtid_mode) >= 3) {
263     KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using TDATA\n"));
264     gtid = __kmp_gtid;
265   } else
266 #endif
267       if (TCR_4(__kmp_gtid_mode) >= 2) {
268     KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using keyed TLS\n"));
269     gtid = __kmp_gtid_get_specific();
270   } else {
271     KA_TRACE(1000,
272              ("*** __kmp_get_global_thread_id_reg: using internal alg.\n"));
273     gtid = __kmp_get_global_thread_id();
274   }
275 
276   /* we must be a new uber master sibling thread */
277   if (gtid == KMP_GTID_DNE) {
278     KA_TRACE(10,
279              ("__kmp_get_global_thread_id_reg: Encountered new root thread. "
280               "Registering a new gtid.\n"));
281     __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
282     if (!__kmp_init_serial) {
283       __kmp_do_serial_initialize();
284       gtid = __kmp_gtid_get_specific();
285     } else {
286       gtid = __kmp_register_root(FALSE);
287     }
288     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
289     /*__kmp_printf( "+++ %d\n", gtid ); */ /* GROO */
290   }
291 
292   KMP_DEBUG_ASSERT(gtid >= 0);
293 
294   return gtid;
295 }
296 
297 /* caller must hold forkjoin_lock */
298 void __kmp_check_stack_overlap(kmp_info_t *th) {
299   int f;
300   char *stack_beg = NULL;
301   char *stack_end = NULL;
302   int gtid;
303 
304   KA_TRACE(10, ("__kmp_check_stack_overlap: called\n"));
305   if (__kmp_storage_map) {
306     stack_end = (char *)th->th.th_info.ds.ds_stackbase;
307     stack_beg = stack_end - th->th.th_info.ds.ds_stacksize;
308 
309     gtid = __kmp_gtid_from_thread(th);
310 
311     if (gtid == KMP_GTID_MONITOR) {
312       __kmp_print_storage_map_gtid(
313           gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize,
314           "th_%s stack (%s)", "mon",
315           (th->th.th_info.ds.ds_stackgrow) ? "initial" : "actual");
316     } else {
317       __kmp_print_storage_map_gtid(
318           gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize,
319           "th_%d stack (%s)", gtid,
320           (th->th.th_info.ds.ds_stackgrow) ? "initial" : "actual");
321     }
322   }
323 
324   /* No point in checking ubermaster threads since they use refinement and
325    * cannot overlap */
326   gtid = __kmp_gtid_from_thread(th);
327   if (__kmp_env_checks == TRUE && !KMP_UBER_GTID(gtid)) {
328     KA_TRACE(10,
329              ("__kmp_check_stack_overlap: performing extensive checking\n"));
330     if (stack_beg == NULL) {
331       stack_end = (char *)th->th.th_info.ds.ds_stackbase;
332       stack_beg = stack_end - th->th.th_info.ds.ds_stacksize;
333     }
334 
335     for (f = 0; f < __kmp_threads_capacity; f++) {
336       kmp_info_t *f_th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[f]);
337 
338       if (f_th && f_th != th) {
339         char *other_stack_end =
340             (char *)TCR_PTR(f_th->th.th_info.ds.ds_stackbase);
341         char *other_stack_beg =
342             other_stack_end - (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize);
343         if ((stack_beg > other_stack_beg && stack_beg < other_stack_end) ||
344             (stack_end > other_stack_beg && stack_end < other_stack_end)) {
345 
346           /* Print the other stack values before the abort */
347           if (__kmp_storage_map)
348             __kmp_print_storage_map_gtid(
349                 -1, other_stack_beg, other_stack_end,
350                 (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize),
351                 "th_%d stack (overlapped)", __kmp_gtid_from_thread(f_th));
352 
353           __kmp_fatal(KMP_MSG(StackOverlap), KMP_HNT(ChangeStackLimit),
354                       __kmp_msg_null);
355         }
356       }
357     }
358   }
359   KA_TRACE(10, ("__kmp_check_stack_overlap: returning\n"));
360 }
361 
362 /* ------------------------------------------------------------------------ */
363 
364 void __kmp_infinite_loop(void) {
365   static int done = FALSE;
366 
367   while (!done) {
368     KMP_YIELD(TRUE);
369   }
370 }
371 
372 #define MAX_MESSAGE 512
373 
374 void __kmp_print_storage_map_gtid(int gtid, void *p1, void *p2, size_t size,
375                                   char const *format, ...) {
376   char buffer[MAX_MESSAGE];
377   va_list ap;
378 
379   va_start(ap, format);
380   KMP_SNPRINTF(buffer, sizeof(buffer), "OMP storage map: %p %p%8lu %s\n", p1,
381                p2, (unsigned long)size, format);
382   __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock);
383   __kmp_vprintf(kmp_err, buffer, ap);
384 #if KMP_PRINT_DATA_PLACEMENT
385   int node;
386   if (gtid >= 0) {
387     if (p1 <= p2 && (char *)p2 - (char *)p1 == size) {
388       if (__kmp_storage_map_verbose) {
389         node = __kmp_get_host_node(p1);
390         if (node < 0) /* doesn't work, so don't try this next time */
391           __kmp_storage_map_verbose = FALSE;
392         else {
393           char *last;
394           int lastNode;
395           int localProc = __kmp_get_cpu_from_gtid(gtid);
396 
397           const int page_size = KMP_GET_PAGE_SIZE();
398 
399           p1 = (void *)((size_t)p1 & ~((size_t)page_size - 1));
400           p2 = (void *)(((size_t)p2 - 1) & ~((size_t)page_size - 1));
401           if (localProc >= 0)
402             __kmp_printf_no_lock("  GTID %d localNode %d\n", gtid,
403                                  localProc >> 1);
404           else
405             __kmp_printf_no_lock("  GTID %d\n", gtid);
406 #if KMP_USE_PRCTL
407           /* The more elaborate format is disabled for now because of the prctl
408            * hanging bug. */
409           do {
410             last = p1;
411             lastNode = node;
412             /* This loop collates adjacent pages with the same host node. */
413             do {
414               (char *)p1 += page_size;
415             } while (p1 <= p2 && (node = __kmp_get_host_node(p1)) == lastNode);
416             __kmp_printf_no_lock("    %p-%p memNode %d\n", last, (char *)p1 - 1,
417                                  lastNode);
418           } while (p1 <= p2);
419 #else
420           __kmp_printf_no_lock("    %p-%p memNode %d\n", p1,
421                                (char *)p1 + (page_size - 1),
422                                __kmp_get_host_node(p1));
423           if (p1 < p2) {
424             __kmp_printf_no_lock("    %p-%p memNode %d\n", p2,
425                                  (char *)p2 + (page_size - 1),
426                                  __kmp_get_host_node(p2));
427           }
428 #endif
429         }
430       }
431     } else
432       __kmp_printf_no_lock("  %s\n", KMP_I18N_STR(StorageMapWarning));
433   }
434 #endif /* KMP_PRINT_DATA_PLACEMENT */
435   __kmp_release_bootstrap_lock(&__kmp_stdio_lock);
436 
437   va_end(ap);
438 }
439 
440 void __kmp_warn(char const *format, ...) {
441   char buffer[MAX_MESSAGE];
442   va_list ap;
443 
444   if (__kmp_generate_warnings == kmp_warnings_off) {
445     return;
446   }
447 
448   va_start(ap, format);
449 
450   KMP_SNPRINTF(buffer, sizeof(buffer), "OMP warning: %s\n", format);
451   __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock);
452   __kmp_vprintf(kmp_err, buffer, ap);
453   __kmp_release_bootstrap_lock(&__kmp_stdio_lock);
454 
455   va_end(ap);
456 }
457 
458 void __kmp_abort_process() {
459   // Later threads may stall here, but that's ok because abort() will kill them.
460   __kmp_acquire_bootstrap_lock(&__kmp_exit_lock);
461 
462   if (__kmp_debug_buf) {
463     __kmp_dump_debug_buffer();
464   }
465 
466 #if KMP_OS_WINDOWS
467   // Let other threads know of abnormal termination and prevent deadlock
468   // if abort happened during library initialization or shutdown
469   __kmp_global.g.g_abort = SIGABRT;
470 
471   /* On Windows* OS by default abort() causes pop-up error box, which stalls
472      nightly testing. Unfortunately, we cannot reliably suppress pop-up error
473      boxes. _set_abort_behavior() works well, but this function is not
474      available in VS7 (this is not problem for DLL, but it is a problem for
475      static OpenMP RTL). SetErrorMode (and so, timelimit utility) does not
476      help, at least in some versions of MS C RTL.
477 
478      It seems following sequence is the only way to simulate abort() and
479      avoid pop-up error box. */
480   raise(SIGABRT);
481   _exit(3); // Just in case, if signal ignored, exit anyway.
482 #else
483   __kmp_unregister_library();
484   abort();
485 #endif
486 
487   __kmp_infinite_loop();
488   __kmp_release_bootstrap_lock(&__kmp_exit_lock);
489 
490 } // __kmp_abort_process
491 
492 void __kmp_abort_thread(void) {
493   // TODO: Eliminate g_abort global variable and this function.
494   // In case of abort just call abort(), it will kill all the threads.
495   __kmp_infinite_loop();
496 } // __kmp_abort_thread
497 
498 /* Print out the storage map for the major kmp_info_t thread data structures
499    that are allocated together. */
500 
501 static void __kmp_print_thread_storage_map(kmp_info_t *thr, int gtid) {
502   __kmp_print_storage_map_gtid(gtid, thr, thr + 1, sizeof(kmp_info_t), "th_%d",
503                                gtid);
504 
505   __kmp_print_storage_map_gtid(gtid, &thr->th.th_info, &thr->th.th_team,
506                                sizeof(kmp_desc_t), "th_%d.th_info", gtid);
507 
508   __kmp_print_storage_map_gtid(gtid, &thr->th.th_local, &thr->th.th_pri_head,
509                                sizeof(kmp_local_t), "th_%d.th_local", gtid);
510 
511   __kmp_print_storage_map_gtid(
512       gtid, &thr->th.th_bar[0], &thr->th.th_bar[bs_last_barrier],
513       sizeof(kmp_balign_t) * bs_last_barrier, "th_%d.th_bar", gtid);
514 
515   __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_plain_barrier],
516                                &thr->th.th_bar[bs_plain_barrier + 1],
517                                sizeof(kmp_balign_t), "th_%d.th_bar[plain]",
518                                gtid);
519 
520   __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_forkjoin_barrier],
521                                &thr->th.th_bar[bs_forkjoin_barrier + 1],
522                                sizeof(kmp_balign_t), "th_%d.th_bar[forkjoin]",
523                                gtid);
524 
525 #if KMP_FAST_REDUCTION_BARRIER
526   __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_reduction_barrier],
527                                &thr->th.th_bar[bs_reduction_barrier + 1],
528                                sizeof(kmp_balign_t), "th_%d.th_bar[reduction]",
529                                gtid);
530 #endif // KMP_FAST_REDUCTION_BARRIER
531 }
532 
533 /* Print out the storage map for the major kmp_team_t team data structures
534    that are allocated together. */
535 
536 static void __kmp_print_team_storage_map(const char *header, kmp_team_t *team,
537                                          int team_id, int num_thr) {
538   int num_disp_buff = team->t.t_max_nproc > 1 ? __kmp_dispatch_num_buffers : 2;
539   __kmp_print_storage_map_gtid(-1, team, team + 1, sizeof(kmp_team_t), "%s_%d",
540                                header, team_id);
541 
542   __kmp_print_storage_map_gtid(-1, &team->t.t_bar[0],
543                                &team->t.t_bar[bs_last_barrier],
544                                sizeof(kmp_balign_team_t) * bs_last_barrier,
545                                "%s_%d.t_bar", header, team_id);
546 
547   __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_plain_barrier],
548                                &team->t.t_bar[bs_plain_barrier + 1],
549                                sizeof(kmp_balign_team_t), "%s_%d.t_bar[plain]",
550                                header, team_id);
551 
552   __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_forkjoin_barrier],
553                                &team->t.t_bar[bs_forkjoin_barrier + 1],
554                                sizeof(kmp_balign_team_t),
555                                "%s_%d.t_bar[forkjoin]", header, team_id);
556 
557 #if KMP_FAST_REDUCTION_BARRIER
558   __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_reduction_barrier],
559                                &team->t.t_bar[bs_reduction_barrier + 1],
560                                sizeof(kmp_balign_team_t),
561                                "%s_%d.t_bar[reduction]", header, team_id);
562 #endif // KMP_FAST_REDUCTION_BARRIER
563 
564   __kmp_print_storage_map_gtid(
565       -1, &team->t.t_dispatch[0], &team->t.t_dispatch[num_thr],
566       sizeof(kmp_disp_t) * num_thr, "%s_%d.t_dispatch", header, team_id);
567 
568   __kmp_print_storage_map_gtid(
569       -1, &team->t.t_threads[0], &team->t.t_threads[num_thr],
570       sizeof(kmp_info_t *) * num_thr, "%s_%d.t_threads", header, team_id);
571 
572   __kmp_print_storage_map_gtid(-1, &team->t.t_disp_buffer[0],
573                                &team->t.t_disp_buffer[num_disp_buff],
574                                sizeof(dispatch_shared_info_t) * num_disp_buff,
575                                "%s_%d.t_disp_buffer", header, team_id);
576 }
577 
578 static void __kmp_init_allocator() {
579   __kmp_init_memkind();
580   __kmp_init_target_mem();
581 }
582 static void __kmp_fini_allocator() { __kmp_fini_memkind(); }
583 
584 /* ------------------------------------------------------------------------ */
585 
586 #if ENABLE_LIBOMPTARGET
587 static void __kmp_init_omptarget() {
588   __kmp_init_target_task();
589 }
590 #endif
591 
592 /* ------------------------------------------------------------------------ */
593 
594 #if KMP_DYNAMIC_LIB
595 #if KMP_OS_WINDOWS
596 
597 BOOL WINAPI DllMain(HINSTANCE hInstDLL, DWORD fdwReason, LPVOID lpReserved) {
598   //__kmp_acquire_bootstrap_lock( &__kmp_initz_lock );
599 
600   switch (fdwReason) {
601 
602   case DLL_PROCESS_ATTACH:
603     KA_TRACE(10, ("DllMain: PROCESS_ATTACH\n"));
604 
605     return TRUE;
606 
607   case DLL_PROCESS_DETACH:
608     KA_TRACE(10, ("DllMain: PROCESS_DETACH T#%d\n", __kmp_gtid_get_specific()));
609 
610     // According to Windows* documentation for DllMain entry point:
611     // for DLL_PROCESS_DETACH, lpReserved is used for telling the difference:
612     //   lpReserved == NULL when FreeLibrary() is called,
613     //   lpReserved != NULL when the process is terminated.
614     // When FreeLibrary() is called, worker threads remain alive. So the
615     // runtime's state is consistent and executing proper shutdown is OK.
616     // When the process is terminated, worker threads have exited or been
617     // forcefully terminated by the OS and only the shutdown thread remains.
618     // This can leave the runtime in an inconsistent state.
619     // Hence, only attempt proper cleanup when FreeLibrary() is called.
620     // Otherwise, rely on OS to reclaim resources.
621     if (lpReserved == NULL)
622       __kmp_internal_end_library(__kmp_gtid_get_specific());
623 
624     return TRUE;
625 
626   case DLL_THREAD_ATTACH:
627     KA_TRACE(10, ("DllMain: THREAD_ATTACH\n"));
628 
629     /* if we want to register new siblings all the time here call
630      * __kmp_get_gtid(); */
631     return TRUE;
632 
633   case DLL_THREAD_DETACH:
634     KA_TRACE(10, ("DllMain: THREAD_DETACH T#%d\n", __kmp_gtid_get_specific()));
635 
636     __kmp_internal_end_thread(__kmp_gtid_get_specific());
637     return TRUE;
638   }
639 
640   return TRUE;
641 }
642 
643 #endif /* KMP_OS_WINDOWS */
644 #endif /* KMP_DYNAMIC_LIB */
645 
646 /* __kmp_parallel_deo -- Wait until it's our turn. */
647 void __kmp_parallel_deo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
648   int gtid = *gtid_ref;
649 #ifdef BUILD_PARALLEL_ORDERED
650   kmp_team_t *team = __kmp_team_from_gtid(gtid);
651 #endif /* BUILD_PARALLEL_ORDERED */
652 
653   if (__kmp_env_consistency_check) {
654     if (__kmp_threads[gtid]->th.th_root->r.r_active)
655 #if KMP_USE_DYNAMIC_LOCK
656       __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL, 0);
657 #else
658       __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL);
659 #endif
660   }
661 #ifdef BUILD_PARALLEL_ORDERED
662   if (!team->t.t_serialized) {
663     KMP_MB();
664     KMP_WAIT(&team->t.t_ordered.dt.t_value, __kmp_tid_from_gtid(gtid), KMP_EQ,
665              NULL);
666     KMP_MB();
667   }
668 #endif /* BUILD_PARALLEL_ORDERED */
669 }
670 
671 /* __kmp_parallel_dxo -- Signal the next task. */
672 void __kmp_parallel_dxo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
673   int gtid = *gtid_ref;
674 #ifdef BUILD_PARALLEL_ORDERED
675   int tid = __kmp_tid_from_gtid(gtid);
676   kmp_team_t *team = __kmp_team_from_gtid(gtid);
677 #endif /* BUILD_PARALLEL_ORDERED */
678 
679   if (__kmp_env_consistency_check) {
680     if (__kmp_threads[gtid]->th.th_root->r.r_active)
681       __kmp_pop_sync(gtid, ct_ordered_in_parallel, loc_ref);
682   }
683 #ifdef BUILD_PARALLEL_ORDERED
684   if (!team->t.t_serialized) {
685     KMP_MB(); /* Flush all pending memory write invalidates.  */
686 
687     /* use the tid of the next thread in this team */
688     /* TODO replace with general release procedure */
689     team->t.t_ordered.dt.t_value = ((tid + 1) % team->t.t_nproc);
690 
691     KMP_MB(); /* Flush all pending memory write invalidates.  */
692   }
693 #endif /* BUILD_PARALLEL_ORDERED */
694 }
695 
696 /* ------------------------------------------------------------------------ */
697 /* The BARRIER for a SINGLE process section is always explicit   */
698 
699 int __kmp_enter_single(int gtid, ident_t *id_ref, int push_ws) {
700   int status;
701   kmp_info_t *th;
702   kmp_team_t *team;
703 
704   if (!TCR_4(__kmp_init_parallel))
705     __kmp_parallel_initialize();
706   __kmp_resume_if_soft_paused();
707 
708   th = __kmp_threads[gtid];
709   team = th->th.th_team;
710   status = 0;
711 
712   th->th.th_ident = id_ref;
713 
714   if (team->t.t_serialized) {
715     status = 1;
716   } else {
717     kmp_int32 old_this = th->th.th_local.this_construct;
718 
719     ++th->th.th_local.this_construct;
720     /* try to set team count to thread count--success means thread got the
721        single block */
722     /* TODO: Should this be acquire or release? */
723     if (team->t.t_construct == old_this) {
724       status = __kmp_atomic_compare_store_acq(&team->t.t_construct, old_this,
725                                               th->th.th_local.this_construct);
726     }
727 #if USE_ITT_BUILD
728     if (__itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 &&
729         KMP_MASTER_GTID(gtid) && th->th.th_teams_microtask == NULL &&
730         team->t.t_active_level == 1) {
731       // Only report metadata by primary thread of active team at level 1
732       __kmp_itt_metadata_single(id_ref);
733     }
734 #endif /* USE_ITT_BUILD */
735   }
736 
737   if (__kmp_env_consistency_check) {
738     if (status && push_ws) {
739       __kmp_push_workshare(gtid, ct_psingle, id_ref);
740     } else {
741       __kmp_check_workshare(gtid, ct_psingle, id_ref);
742     }
743   }
744 #if USE_ITT_BUILD
745   if (status) {
746     __kmp_itt_single_start(gtid);
747   }
748 #endif /* USE_ITT_BUILD */
749   return status;
750 }
751 
752 void __kmp_exit_single(int gtid) {
753 #if USE_ITT_BUILD
754   __kmp_itt_single_end(gtid);
755 #endif /* USE_ITT_BUILD */
756   if (__kmp_env_consistency_check)
757     __kmp_pop_workshare(gtid, ct_psingle, NULL);
758 }
759 
760 /* determine if we can go parallel or must use a serialized parallel region and
761  * how many threads we can use
762  * set_nproc is the number of threads requested for the team
763  * returns 0 if we should serialize or only use one thread,
764  * otherwise the number of threads to use
765  * The forkjoin lock is held by the caller. */
766 static int __kmp_reserve_threads(kmp_root_t *root, kmp_team_t *parent_team,
767                                  int master_tid, int set_nthreads,
768                                  int enter_teams) {
769   int capacity;
770   int new_nthreads;
771   KMP_DEBUG_ASSERT(__kmp_init_serial);
772   KMP_DEBUG_ASSERT(root && parent_team);
773   kmp_info_t *this_thr = parent_team->t.t_threads[master_tid];
774 
775   // If dyn-var is set, dynamically adjust the number of desired threads,
776   // according to the method specified by dynamic_mode.
777   new_nthreads = set_nthreads;
778   if (!get__dynamic_2(parent_team, master_tid)) {
779     ;
780   }
781 #ifdef USE_LOAD_BALANCE
782   else if (__kmp_global.g.g_dynamic_mode == dynamic_load_balance) {
783     new_nthreads = __kmp_load_balance_nproc(root, set_nthreads);
784     if (new_nthreads == 1) {
785       KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced "
786                     "reservation to 1 thread\n",
787                     master_tid));
788       return 1;
789     }
790     if (new_nthreads < set_nthreads) {
791       KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced "
792                     "reservation to %d threads\n",
793                     master_tid, new_nthreads));
794     }
795   }
796 #endif /* USE_LOAD_BALANCE */
797   else if (__kmp_global.g.g_dynamic_mode == dynamic_thread_limit) {
798     new_nthreads = __kmp_avail_proc - __kmp_nth +
799                    (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
800     if (new_nthreads <= 1) {
801       KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced "
802                     "reservation to 1 thread\n",
803                     master_tid));
804       return 1;
805     }
806     if (new_nthreads < set_nthreads) {
807       KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced "
808                     "reservation to %d threads\n",
809                     master_tid, new_nthreads));
810     } else {
811       new_nthreads = set_nthreads;
812     }
813   } else if (__kmp_global.g.g_dynamic_mode == dynamic_random) {
814     if (set_nthreads > 2) {
815       new_nthreads = __kmp_get_random(parent_team->t.t_threads[master_tid]);
816       new_nthreads = (new_nthreads % set_nthreads) + 1;
817       if (new_nthreads == 1) {
818         KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced "
819                       "reservation to 1 thread\n",
820                       master_tid));
821         return 1;
822       }
823       if (new_nthreads < set_nthreads) {
824         KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced "
825                       "reservation to %d threads\n",
826                       master_tid, new_nthreads));
827       }
828     }
829   } else {
830     KMP_ASSERT(0);
831   }
832 
833   // Respect KMP_ALL_THREADS/KMP_DEVICE_THREAD_LIMIT.
834   if (__kmp_nth + new_nthreads -
835           (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
836       __kmp_max_nth) {
837     int tl_nthreads = __kmp_max_nth - __kmp_nth +
838                       (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
839     if (tl_nthreads <= 0) {
840       tl_nthreads = 1;
841     }
842 
843     // If dyn-var is false, emit a 1-time warning.
844     if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
845       __kmp_reserve_warn = 1;
846       __kmp_msg(kmp_ms_warning,
847                 KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads),
848                 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
849     }
850     if (tl_nthreads == 1) {
851       KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT "
852                     "reduced reservation to 1 thread\n",
853                     master_tid));
854       return 1;
855     }
856     KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT reduced "
857                   "reservation to %d threads\n",
858                   master_tid, tl_nthreads));
859     new_nthreads = tl_nthreads;
860   }
861 
862   // Respect OMP_THREAD_LIMIT
863   int cg_nthreads = this_thr->th.th_cg_roots->cg_nthreads;
864   int max_cg_threads = this_thr->th.th_cg_roots->cg_thread_limit;
865   if (cg_nthreads + new_nthreads -
866           (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
867       max_cg_threads) {
868     int tl_nthreads = max_cg_threads - cg_nthreads +
869                       (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
870     if (tl_nthreads <= 0) {
871       tl_nthreads = 1;
872     }
873 
874     // If dyn-var is false, emit a 1-time warning.
875     if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
876       __kmp_reserve_warn = 1;
877       __kmp_msg(kmp_ms_warning,
878                 KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads),
879                 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
880     }
881     if (tl_nthreads == 1) {
882       KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT "
883                     "reduced reservation to 1 thread\n",
884                     master_tid));
885       return 1;
886     }
887     KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT reduced "
888                   "reservation to %d threads\n",
889                   master_tid, tl_nthreads));
890     new_nthreads = tl_nthreads;
891   }
892 
893   // Check if the threads array is large enough, or needs expanding.
894   // See comment in __kmp_register_root() about the adjustment if
895   // __kmp_threads[0] == NULL.
896   capacity = __kmp_threads_capacity;
897   if (TCR_PTR(__kmp_threads[0]) == NULL) {
898     --capacity;
899   }
900   // If it is not for initializing the hidden helper team, we need to take
901   // __kmp_hidden_helper_threads_num out of the capacity because it is included
902   // in __kmp_threads_capacity.
903   if (__kmp_enable_hidden_helper && !TCR_4(__kmp_init_hidden_helper_threads)) {
904     capacity -= __kmp_hidden_helper_threads_num;
905   }
906   if (__kmp_nth + new_nthreads -
907           (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
908       capacity) {
909     // Expand the threads array.
910     int slotsRequired = __kmp_nth + new_nthreads -
911                         (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) -
912                         capacity;
913     int slotsAdded = __kmp_expand_threads(slotsRequired);
914     if (slotsAdded < slotsRequired) {
915       // The threads array was not expanded enough.
916       new_nthreads -= (slotsRequired - slotsAdded);
917       KMP_ASSERT(new_nthreads >= 1);
918 
919       // If dyn-var is false, emit a 1-time warning.
920       if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
921         __kmp_reserve_warn = 1;
922         if (__kmp_tp_cached) {
923           __kmp_msg(kmp_ms_warning,
924                     KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads),
925                     KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity),
926                     KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null);
927         } else {
928           __kmp_msg(kmp_ms_warning,
929                     KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads),
930                     KMP_HNT(SystemLimitOnThreads), __kmp_msg_null);
931         }
932       }
933     }
934   }
935 
936 #ifdef KMP_DEBUG
937   if (new_nthreads == 1) {
938     KC_TRACE(10,
939              ("__kmp_reserve_threads: T#%d serializing team after reclaiming "
940               "dead roots and rechecking; requested %d threads\n",
941               __kmp_get_gtid(), set_nthreads));
942   } else {
943     KC_TRACE(10, ("__kmp_reserve_threads: T#%d allocating %d threads; requested"
944                   " %d threads\n",
945                   __kmp_get_gtid(), new_nthreads, set_nthreads));
946   }
947 #endif // KMP_DEBUG
948 
949   if (this_thr->th.th_nt_strict && new_nthreads < set_nthreads) {
950     __kmpc_error(this_thr->th.th_nt_loc, this_thr->th.th_nt_sev,
951                  this_thr->th.th_nt_msg);
952   }
953   return new_nthreads;
954 }
955 
956 /* Allocate threads from the thread pool and assign them to the new team. We are
957    assured that there are enough threads available, because we checked on that
958    earlier within critical section forkjoin.

   The calling (primary) thread is first re-pointed at the new team as tid 0.
   Unless the team is recognized as an already populated hot team, the primary
   thread is then installed in slot 0, every worker slot 1..t_nproc-1 is filled
   through __kmp_allocate_thread(), and each worker's per-barrier b_arrived
   value is seeded from the team's. Afterwards the primary thread's task state
   is recorded on the team and reset, and the team is flagged for an affinity
   display if any member last ran with a different team size or nesting level.

   root               - root handed to __kmp_allocate_thread(); in builds
                        without KMP_NESTED_HOT_TEAMS its r_hot_team decides
                        whether `team` counts as the hot team.
   team               - team being populated; team->t.t_nproc is its size.
   master_th          - the forking thread, which becomes tid 0 of `team`.
   master_gtid        - global thread id of master_th; debug-asserted to be
                        the id of the calling thread.
   fork_teams_workers - when nonzero, the places list is not partitioned here
                        (see the affinity comment in the body). */
959 static void __kmp_fork_team_threads(kmp_root_t *root, kmp_team_t *team,
960                                     kmp_info_t *master_th, int master_gtid,
961                                     int fork_teams_workers) {
962   int i;
963   int use_hot_team;
964 
965   KA_TRACE(10, ("__kmp_fork_team_threads: new_nprocs = %d\n", team->t.t_nproc));
966   KMP_DEBUG_ASSERT(master_gtid == __kmp_get_gtid());
967   KMP_MB();
968 
969   /* first, let's setup the primary thread */
970   master_th->th.th_info.ds.ds_tid = 0;
971   master_th->th.th_team = team;
972   master_th->th.th_team_nproc = team->t.t_nproc;
973   master_th->th.th_team_master = master_th;
974   master_th->th.th_team_serialized = FALSE;
975   master_th->th.th_dispatch = &team->t.t_dispatch[0];
976 
977 /* make sure we are not the optimized hot team */
978 #if KMP_NESTED_HOT_TEAMS
    // Default; this value is kept when the thread has no hot-teams array.
979   use_hot_team = 0;
980   kmp_hot_team_ptr_t *hot_teams = master_th->th.th_hot_teams;
981   if (hot_teams) { // hot teams array is not allocated if
982     // KMP_HOT_TEAMS_MAX_LEVEL=0
983     int level = team->t.t_active_level - 1; // index in array of hot teams
984     if (master_th->th.th_teams_microtask) { // are we inside the teams?
985       if (master_th->th.th_teams_size.nteams > 1) {
986         ++level; // level was not increased in teams construct for
987         // team_of_masters
988       }
989       if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
990           master_th->th.th_teams_level == team->t.t_level) {
991         ++level; // level was not increased in teams construct for
992         // team_of_workers before the parallel
993       } // team->t.t_level will be increased inside parallel
994     }
995     if (level < __kmp_hot_teams_max_level) {
996       if (hot_teams[level].hot_team) {
997         // hot team has already been allocated for given level
998         KMP_DEBUG_ASSERT(hot_teams[level].hot_team == team);
999         use_hot_team = 1; // the team is ready to use
1000       } else {
1001         use_hot_team = 0; // AC: threads are not allocated yet
1002         hot_teams[level].hot_team = team; // remember new hot team
1003         hot_teams[level].hot_team_nth = team->t.t_nproc;
1004       }
1005     } else {
1006       use_hot_team = 0;
1007     }
1008   }
1009 #else
    // Without nested hot teams only the root's hot team qualifies.
1010   use_hot_team = team == root->r.r_hot_team;
1011 #endif
1012   if (!use_hot_team) {
1013 
1014     /* install the primary thread */
1015     team->t.t_threads[0] = master_th;
1016     __kmp_initialize_info(master_th, team, 0, master_gtid);
1017 
1018     /* now, install the worker threads */
1019     for (i = 1; i < team->t.t_nproc; i++) {
1020 
1021       /* fork or reallocate a new thread and install it in team */
1022       kmp_info_t *thr = __kmp_allocate_thread(root, team, i);
1023       team->t.t_threads[i] = thr;
1024       KMP_DEBUG_ASSERT(thr);
1025       KMP_DEBUG_ASSERT(thr->th.th_team == team);
1026       /* align team and thread arrived states */
1027       KA_TRACE(20, ("__kmp_fork_team_threads: T#%d(%d:%d) init arrived "
1028                     "T#%d(%d:%d) join =%llu, plain=%llu\n",
1029                     __kmp_gtid_from_tid(0, team), team->t.t_id, 0,
1030                     __kmp_gtid_from_tid(i, team), team->t.t_id, i,
1031                     team->t.t_bar[bs_forkjoin_barrier].b_arrived,
1032                     team->t.t_bar[bs_plain_barrier].b_arrived));
      // The worker inherits the primary thread's teams-construct bookkeeping.
1033       thr->th.th_teams_microtask = master_th->th.th_teams_microtask;
1034       thr->th.th_teams_level = master_th->th.th_teams_level;
1035       thr->th.th_teams_size = master_th->th.th_teams_size;
1036       { // Initialize threads' barrier data.
1037         int b;
1038         kmp_balign_t *balign = team->t.t_threads[i]->th.th_bar;
1039         for (b = 0; b < bs_last_barrier; ++b) {
1040           balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
1041           KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
1042 #if USE_DEBUGGER
1043           balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
1044 #endif
1045         }
1046       }
1047     }
1048 
1049 #if KMP_AFFINITY_SUPPORTED
1050     // Do not partition the places list for teams construct workers who
1051     // haven't actually been forked to do real work yet. This partitioning
1052     // will take place in the parallel region nested within the teams construct.
1053     if (!fork_teams_workers) {
1054       __kmp_partition_places(team);
1055     }
1056 #endif
1057 
    // Extra bookkeeping that is only performed when the fork/join gather
    // pattern is bp_dist_bar and the team has more than one thread.
1058     if (team->t.t_nproc > 1 &&
1059         __kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
1060       team->t.b->update_num_threads(team->t.t_nproc);
1061       __kmp_add_threads_to_team(team, team->t.t_nproc);
1062     }
1063   }
1064 
1065   // Take care of primary thread's task state
1066   if (__kmp_tasking_mode != tskm_immediate_exec) {
1067     if (use_hot_team) {
1068       KMP_DEBUG_ASSERT_TASKTEAM_INVARIANT(team->t.t_parent, master_th);
1069       KA_TRACE(
1070           20,
1071           ("__kmp_fork_team_threads: Primary T#%d pushing task_team %p / team "
1072            "%p, new task_team %p / team %p\n",
1073            __kmp_gtid_from_thread(master_th), master_th->th.th_task_team,
1074            team->t.t_parent, team->t.t_task_team[master_th->th.th_task_state],
1075            team));
1076 
1077       // Store primary thread's current task state on new team
1078       KMP_CHECK_UPDATE(team->t.t_primary_task_state,
1079                        master_th->th.th_task_state);
1080 
1081       // Restore primary thread's task state to hot team's state
1082       // by using thread 1's task state
1083       if (team->t.t_nproc > 1) {
1084         KMP_DEBUG_ASSERT(team->t.t_threads[1]->th.th_task_state == 0 ||
1085                          team->t.t_threads[1]->th.th_task_state == 1);
1086         KMP_CHECK_UPDATE(master_th->th.th_task_state,
1087                          team->t.t_threads[1]->th.th_task_state);
1088       } else {
        // No worker to copy the state from in a one-thread team.
1089         master_th->th.th_task_state = 0;
1090       }
1091     } else {
1092       // Store primary thread's current task_state on new team
1093       KMP_CHECK_UPDATE(team->t.t_primary_task_state,
1094                        master_th->th.th_task_state);
1095       // Are not using hot team, so set task state to 0.
1096       master_th->th.th_task_state = 0;
1097     }
1098   }
1099 
  // Flag the team for an affinity display as soon as one member is found whose
  // recorded previous team size or nesting level differs from this team's.
1100   if (__kmp_display_affinity && team->t.t_display_affinity != 1) {
1101     for (i = 0; i < team->t.t_nproc; i++) {
1102       kmp_info_t *thr = team->t.t_threads[i];
1103       if (thr->th.th_prev_num_threads != team->t.t_nproc ||
1104           thr->th.th_prev_level != team->t.t_level) {
1105         team->t.t_display_affinity = 1;
1106         break;
1107       }
1108     }
1109   }
1110 
1111   KMP_MB();
1112 }
1113 
1114 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
1115 // Propagate any changes to the floating point control registers out to the team
1116 // We try to avoid unnecessary writes to the relevant cache line in the team
1117 // structure, so we don't make changes unless they are needed.
1118 inline static void propagateFPControl(kmp_team_t *team) {
1119   if (__kmp_inherit_fp_control) {
1120     kmp_int16 x87_fpu_control_word;
1121     kmp_uint32 mxcsr;
1122 
1123     // Get primary thread's values of FPU control flags (both X87 and vector)
1124     __kmp_store_x87_fpu_control_word(&x87_fpu_control_word);
1125     __kmp_store_mxcsr(&mxcsr);
1126     mxcsr &= KMP_X86_MXCSR_MASK;
1127 
1128     // There is no point looking at t_fp_control_saved here.
1129     // If it is TRUE, we still have to update the values if they are different
1130     // from those we now have. If it is FALSE we didn't save anything yet, but
1131     // our objective is the same. We have to ensure that the values in the team
1132     // are the same as those we have.
1133     // So, this code achieves what we need whether or not t_fp_control_saved is
1134     // true. By checking whether the value needs updating we avoid unnecessary
1135     // writes that would put the cache-line into a written state, causing all
1136     // threads in the team to have to read it again.
1137     KMP_CHECK_UPDATE(team->t.t_x87_fpu_control_word, x87_fpu_control_word);
1138     KMP_CHECK_UPDATE(team->t.t_mxcsr, mxcsr);
1139     // Although we don't use this value, other code in the runtime wants to know
1140     // whether it should restore them. So we must ensure it is correct.
1141     KMP_CHECK_UPDATE(team->t.t_fp_control_saved, TRUE);
1142   } else {
1143     // Similarly here. Don't write to this cache-line in the team structure
1144     // unless we have to.
1145     KMP_CHECK_UPDATE(team->t.t_fp_control_saved, FALSE);
1146   }
1147 }
1148 
1149 // Do the opposite, setting the hardware registers to the updated values from
1150 // the team.
1151 inline static void updateHWFPControl(kmp_team_t *team) {
1152   if (__kmp_inherit_fp_control && team->t.t_fp_control_saved) {
1153     // Only reset the fp control regs if they have been changed in the team.
1154     // the parallel region that we are exiting.
1155     kmp_int16 x87_fpu_control_word;
1156     kmp_uint32 mxcsr;
1157     __kmp_store_x87_fpu_control_word(&x87_fpu_control_word);
1158     __kmp_store_mxcsr(&mxcsr);
1159     mxcsr &= KMP_X86_MXCSR_MASK;
1160 
1161     if (team->t.t_x87_fpu_control_word != x87_fpu_control_word) {
1162       __kmp_clear_x87_fpu_status_word();
1163       __kmp_load_x87_fpu_control_word(&team->t.t_x87_fpu_control_word);
1164     }
1165 
1166     if (team->t.t_mxcsr != mxcsr) {
1167       __kmp_load_mxcsr(&team->t.t_mxcsr);
1168     }
1169   }
1170 }
1171 #else
// Fallback for builds where neither KMP_ARCH_X86 nor KMP_ARCH_X86_64 is set:
// both FP-control hooks expand to a no-op expression, so call sites need no
// conditional compilation of their own.
1172 #define propagateFPControl(x) ((void)0)
1173 #define updateHWFPControl(x) ((void)0)
1174 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
1175 
1176 static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team,
1177                                      int realloc); // forward declaration
1178 
1179 /* Run a parallel region that has been serialized, so runs only in a team of the
1180    single primary thread.

   Two situations are distinguished:
   - The thread is not currently executing on its serial team. The cached
     serial team is (re)initialized and made the thread's current team; if that
     cached team is still marked as serialized, a fresh one-thread team is
     allocated under __kmp_forkjoin_lock and cached instead.
   - The thread is already executing on its serial team. Only the nesting is
     recorded: t_serialized and t_level are incremented and a new dispatch
     buffer and task-team node are pushed.
   In both cases the team's cancel request is reset, and affinity display,
   consistency checking and OMPT implicit-task reporting follow when enabled.

   loc        - source location of the region; may be NULL. If its flags
                contain KMP_IDENT_AUTOPAR the function returns immediately.
   global_tid - index of the calling thread in __kmp_threads. */
1181 void __kmp_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
1182   kmp_info_t *this_thr;
1183   kmp_team_t *serial_team;
1184 
  // NOTE(review): the trace messages in this function spell the name as
  // "__kmpc_serialized_parallel".
1185   KC_TRACE(10, ("__kmpc_serialized_parallel: called by T#%d\n", global_tid));
1186 
1187   /* Skip all this code for autopar serialized loops since it results in
1188      unacceptable overhead */
1189   if (loc != NULL && (loc->flags & KMP_IDENT_AUTOPAR))
1190     return;
1191 
1192   if (!TCR_4(__kmp_init_parallel))
1193     __kmp_parallel_initialize();
1194   __kmp_resume_if_soft_paused();
1195 
1196   this_thr = __kmp_threads[global_tid];
1197   serial_team = this_thr->th.th_serial_team;
1198 
1199   /* utilize the serialized team held by this thread */
1200   KMP_DEBUG_ASSERT(serial_team);
1201   KMP_MB();
1202 
  // Pick the proc_bind value to use: the current task's ICV wins when it is
  // proc_bind_false; otherwise the per-region request is used, falling back to
  // the ICV when no request was made.
1203   kmp_proc_bind_t proc_bind = this_thr->th.th_set_proc_bind;
1204   if (this_thr->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
1205     proc_bind = proc_bind_false;
1206   } else if (proc_bind == proc_bind_default) {
1207     // No proc_bind clause was specified, so use the current value
1208     // of proc-bind-var for this parallel region.
1209     proc_bind = this_thr->th.th_current_task->td_icvs.proc_bind;
1210   }
1211   // Reset for next parallel region
1212   this_thr->th.th_set_proc_bind = proc_bind_default;
1213 
1214   // Reset num_threads for next parallel region
1215   this_thr->th.th_set_nproc = 0;
1216 
1217 #if OMPT_SUPPORT
1218   ompt_data_t ompt_parallel_data = ompt_data_none;
1219   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(global_tid);
1220   if (ompt_enabled.enabled &&
1221       this_thr->th.ompt_thread_info.state != ompt_state_overhead) {
1222 
1223     ompt_task_info_t *parent_task_info;
1224     parent_task_info = OMPT_CUR_TASK_INFO(this_thr);
1225 
1226     parent_task_info->frame.enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1227     if (ompt_enabled.ompt_callback_parallel_begin) {
      // A serialized region is always reported with a team size of one.
1228       int team_size = 1;
1229 
1230       ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)(
1231           &(parent_task_info->task_data), &(parent_task_info->frame),
1232           &ompt_parallel_data, team_size,
1233           ompt_parallel_invoker_program | ompt_parallel_team, codeptr);
1234     }
1235   }
1236 #endif // OMPT_SUPPORT
1237 
1238   if (this_thr->th.th_team != serial_team) {
1239     // Nested level will be an index in the nested nthreads array
    // (captured here, before th_team is switched to the serial team below)
1240     int level = this_thr->th.th_team->t.t_level;
1241 
1242     if (serial_team->t.t_serialized) {
1243       /* this serial team was already used
1244          TODO increase performance by making these locks more specific */
1245       kmp_team_t *new_team;
1246 
1247       __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
1248 
1249       new_team =
1250           __kmp_allocate_team(this_thr->th.th_root, 1, 1,
1251 #if OMPT_SUPPORT
1252                               ompt_parallel_data,
1253 #endif
1254                               proc_bind, &this_thr->th.th_current_task->td_icvs,
1255                               0 USE_NESTED_HOT_ARG(NULL));
1256       __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
1257       KMP_ASSERT(new_team);
1258 
1259       /* setup new serialized team and install it */
1260       new_team->t.t_threads[0] = this_thr;
1261       new_team->t.t_parent = this_thr->th.th_team;
1262       serial_team = new_team;
1263       this_thr->th.th_serial_team = serial_team;
1264 
1265       KF_TRACE(
1266           10,
1267           ("__kmpc_serialized_parallel: T#%d allocated new serial team %p\n",
1268            global_tid, serial_team));
1269 
1270       /* TODO the above breaks the requirement that if we run out of resources,
1271          then we can still guarantee that serialized teams are ok, since we may
1272          need to allocate a new one */
1273     } else {
1274       KF_TRACE(
1275           10,
1276           ("__kmpc_serialized_parallel: T#%d reusing cached serial team %p\n",
1277            global_tid, serial_team));
1278     }
1279 
1280     /* we have to initialize this serial team */
1281     KMP_DEBUG_ASSERT(serial_team->t.t_threads);
1282     KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
1283     KMP_DEBUG_ASSERT(this_thr->th.th_team != serial_team);
1284     serial_team->t.t_ident = loc;
1285     serial_team->t.t_serialized = 1;
1286     serial_team->t.t_nproc = 1;
1287     serial_team->t.t_parent = this_thr->th.th_team;
    // Inherit the parent team's nested-nthreads list if it has one, otherwise
    // point at the global list.
1288     if (this_thr->th.th_team->t.t_nested_nth)
1289       serial_team->t.t_nested_nth = this_thr->th.th_team->t.t_nested_nth;
1290     else
1291       serial_team->t.t_nested_nth = &__kmp_nested_nth;
1292     // Save previous team's task state on serial team structure
1293     serial_team->t.t_primary_task_state = this_thr->th.th_task_state;
1294     serial_team->t.t_sched.sched = this_thr->th.th_team->t.t_sched.sched;
    // From here on this_thr->th.th_team refers to the serial team.
1295     this_thr->th.th_team = serial_team;
1296     serial_team->t.t_master_tid = this_thr->th.th_info.ds.ds_tid;
1297 
1298     KF_TRACE(10, ("__kmpc_serialized_parallel: T#%d curtask=%p\n", global_tid,
1299                   this_thr->th.th_current_task));
1300     KMP_ASSERT(this_thr->th.th_current_task->td_flags.executing == 1);
1301     this_thr->th.th_current_task->td_flags.executing = 0;
1302 
1303     __kmp_push_current_task_to_thread(this_thr, serial_team, 0);
1304 
1305     /* TODO: GEH: do ICVs work for nested serialized teams? Don't we need an
1306        implicit task for each serialized task represented by
1307        team->t.t_serialized? */
1308     copy_icvs(&this_thr->th.th_current_task->td_icvs,
1309               &this_thr->th.th_current_task->td_parent->td_icvs);
1310 
1311     // Thread value exists in the nested nthreads array for the next nested
1312     // level
1313     kmp_nested_nthreads_t *nested_nth = &__kmp_nested_nth;
1314     if (this_thr->th.th_team->t.t_nested_nth)
1315       nested_nth = this_thr->th.th_team->t.t_nested_nth;
1316     if (nested_nth->used && (level + 1 < nested_nth->used)) {
1317       this_thr->th.th_current_task->td_icvs.nproc = nested_nth->nth[level + 1];
1318     }
1319 
    // Same idea for proc_bind: take the entry for the next nesting level when
    // the global list has one.
1320     if (__kmp_nested_proc_bind.used &&
1321         (level + 1 < __kmp_nested_proc_bind.used)) {
1322       this_thr->th.th_current_task->td_icvs.proc_bind =
1323           __kmp_nested_proc_bind.bind_types[level + 1];
1324     }
1325 
1326 #if USE_DEBUGGER
1327     serial_team->t.t_pkfn = (microtask_t)(~0); // For the debugger.
1328 #endif
1329     this_thr->th.th_info.ds.ds_tid = 0;
1330 
1331     /* set thread cache values */
1332     this_thr->th.th_team_nproc = 1;
1333     this_thr->th.th_team_master = this_thr;
1334     this_thr->th.th_team_serialized = 1;
1335     this_thr->th.th_task_team = NULL;
1336     this_thr->th.th_task_state = 0;
1337 
    // One level deeper than the parent; the active level is copied unchanged.
1338     serial_team->t.t_level = serial_team->t.t_parent->t.t_level + 1;
1339     serial_team->t.t_active_level = serial_team->t.t_parent->t.t_active_level;
1340     serial_team->t.t_def_allocator = this_thr->th.th_def_allocator; // save
1341 
1342     propagateFPControl(serial_team);
1343 
1344     /* check if we need to allocate dispatch buffers stack */
1345     KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
1346     if (!serial_team->t.t_dispatch->th_disp_buffer) {
1347       serial_team->t.t_dispatch->th_disp_buffer =
1348           (dispatch_private_info_t *)__kmp_allocate(
1349               sizeof(dispatch_private_info_t));
1350     }
1351     this_thr->th.th_dispatch = serial_team->t.t_dispatch;
1352 
1353     KMP_MB();
1354 
1355   } else {
1356     /* this serialized team is already being used,
1357      * that's fine, just add another nested level */
1358     KMP_DEBUG_ASSERT(this_thr->th.th_team == serial_team);
1359     KMP_DEBUG_ASSERT(serial_team->t.t_threads);
1360     KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
1361     ++serial_team->t.t_serialized;
1362     this_thr->th.th_team_serialized = serial_team->t.t_serialized;
1363 
1364     // Nested level will be an index in the nested nthreads array
    // (read before t_level is incremented further down)
1365     int level = this_thr->th.th_team->t.t_level;
1366     // Thread value exists in the nested nthreads array for the next nested
1367     // level
1368 
1369     kmp_nested_nthreads_t *nested_nth = &__kmp_nested_nth;
1370     if (serial_team->t.t_nested_nth)
1371       nested_nth = serial_team->t.t_nested_nth;
1372     if (nested_nth->used && (level + 1 < nested_nth->used)) {
1373       this_thr->th.th_current_task->td_icvs.nproc = nested_nth->nth[level + 1];
1374     }
1375 
1376     serial_team->t.t_level++;
1377     KF_TRACE(10, ("__kmpc_serialized_parallel: T#%d increasing nesting level "
1378                   "of serial team %p to %d\n",
1379                   global_tid, serial_team, serial_team->t.t_level));
1380 
1381     /* allocate/push dispatch buffers stack */
1382     KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
1383     {
      // The new buffer becomes the head of the list; the previous head is
      // reachable through its next pointer.
1384       dispatch_private_info_t *disp_buffer =
1385           (dispatch_private_info_t *)__kmp_allocate(
1386               sizeof(dispatch_private_info_t));
1387       disp_buffer->next = serial_team->t.t_dispatch->th_disp_buffer;
1388       serial_team->t.t_dispatch->th_disp_buffer = disp_buffer;
1389     }
1390     this_thr->th.th_dispatch = serial_team->t.t_dispatch;
1391 
1392     /* allocate/push task team stack */
1393     __kmp_push_task_team_node(this_thr, serial_team);
1394 
1395     KMP_MB();
1396   }
  // Clear any cancellation request recorded on the team.
1397   KMP_CHECK_UPDATE(serial_team->t.t_cancel_request, cancel_noreq);
1398 
1399   // Perform the display affinity functionality for
1400   // serialized parallel regions
1401   if (__kmp_display_affinity) {
    // Display only when the level or team size differs from what this thread
    // recorded last time, then remember the new values.
1402     if (this_thr->th.th_prev_level != serial_team->t.t_level ||
1403         this_thr->th.th_prev_num_threads != 1) {
1404       // NULL means use the affinity-format-var ICV
1405       __kmp_aux_display_affinity(global_tid, NULL);
1406       this_thr->th.th_prev_level = serial_team->t.t_level;
1407       this_thr->th.th_prev_num_threads = 1;
1408     }
1409   }
1410 
1411   if (__kmp_env_consistency_check)
1412     __kmp_push_parallel(global_tid, NULL);
1413 #if OMPT_SUPPORT
1414   serial_team->t.ompt_team_info.master_return_address = codeptr;
1415   if (ompt_enabled.enabled &&
1416       this_thr->th.ompt_thread_info.state != ompt_state_overhead) {
1417     OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame.ptr =
1418         OMPT_GET_FRAME_ADDRESS(0);
1419 
1420     ompt_lw_taskteam_t lw_taskteam;
1421     __ompt_lw_taskteam_init(&lw_taskteam, this_thr, global_tid,
1422                             &ompt_parallel_data, codeptr);
1423 
1424     __ompt_lw_taskteam_link(&lw_taskteam, this_thr, 1);
1425     // Don't use lw_taskteam after linking. Content was swapped.
1426 
1427     /* OMPT implicit task begin */
1428     if (ompt_enabled.ompt_callback_implicit_task) {
1429       ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1430           ompt_scope_begin, OMPT_CUR_TEAM_DATA(this_thr),
1431           OMPT_CUR_TASK_DATA(this_thr), 1, __kmp_tid_from_gtid(global_tid),
1432           ompt_task_implicit); // TODO: Can this be ompt_task_initial?
1433       OMPT_CUR_TASK_INFO(this_thr)->thread_num =
1434           __kmp_tid_from_gtid(global_tid);
1435     }
1436 
1437     /* OMPT state */
1438     this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
    // NOTE(review): exit_frame.ptr is assigned a second time here, now on the
    // task info that is current after the lightweight task team was linked.
1439     OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame.ptr =
1440         OMPT_GET_FRAME_ADDRESS(0);
1441   }
1442 #endif
1443 }
1444 
1445 // Test if this fork is for a team closely nested in a teams construct
1446 static inline bool __kmp_is_fork_in_teams(kmp_info_t *master_th,
1447                                           microtask_t microtask, int level,
1448                                           int teams_level, kmp_va_list ap) {
1449   return (master_th->th.th_teams_microtask && ap &&
1450           microtask != (microtask_t)__kmp_teams_master && level == teams_level);
1451 }
1452 
1453 // Test if this fork is for the teams construct, i.e. to form the outer league
1454 // of teams
1455 static inline bool __kmp_is_entering_teams(int active_level, int level,
1456                                            int teams_level, kmp_va_list ap) {
1457   return ((ap == NULL && active_level == 0) ||
1458           (ap && teams_level > 0 && teams_level == level));
1459 }
1460 
1461 // AC: This is start of parallel that is nested inside teams construct.
1462 // The team is actual (hot), all workers are ready at the fork barrier.
1463 // No lock needed to initialize the team a bit, then free workers.
1464 static inline int
1465 __kmp_fork_in_teams(ident_t *loc, int gtid, kmp_team_t *parent_team,
1466                     kmp_int32 argc, kmp_info_t *master_th, kmp_root_t *root,
1467                     enum fork_context_e call_context, microtask_t microtask,
1468                     launch_t invoker, int master_set_numthreads, int level,
1469 #if OMPT_SUPPORT
1470                     ompt_data_t ompt_parallel_data, void *return_address,
1471 #endif
1472                     kmp_va_list ap) {
1473   void **argv;
1474   int i;
1475 
1476   parent_team->t.t_ident = loc;
1477   __kmp_alloc_argv_entries(argc, parent_team, TRUE);
1478   parent_team->t.t_argc = argc;
1479   argv = (void **)parent_team->t.t_argv;
1480   for (i = argc - 1; i >= 0; --i) {
1481     *argv++ = va_arg(kmp_va_deref(ap), void *);
1482   }
1483   // Increment our nested depth levels, but not increase the serialization
1484   if (parent_team == master_th->th.th_serial_team) {
1485     // AC: we are in serialized parallel
1486     __kmpc_serialized_parallel(loc, gtid);
1487     KMP_DEBUG_ASSERT(parent_team->t.t_serialized > 1);
1488 
1489     if (call_context == fork_context_gnu) {
1490       // AC: need to decrement t_serialized for enquiry functions to work
1491       // correctly, will restore at join time
1492       parent_team->t.t_serialized--;
1493       return TRUE;
1494     }
1495 
1496 #if OMPD_SUPPORT
1497     parent_team->t.t_pkfn = microtask;
1498 #endif
1499 
1500 #if OMPT_SUPPORT
1501     void *dummy;
1502     void **exit_frame_p;
1503     ompt_data_t *implicit_task_data;
1504     ompt_lw_taskteam_t lw_taskteam;
1505 
1506     if (ompt_enabled.enabled) {
1507       __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1508                               &ompt_parallel_data, return_address);
1509       exit_frame_p = &(lw_taskteam.ompt_task_info.frame.exit_frame.ptr);
1510 
1511       __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
1512       // Don't use lw_taskteam after linking. Content was swapped.
1513 
1514       /* OMPT implicit task begin */
1515       implicit_task_data = OMPT_CUR_TASK_DATA(master_th);
1516       if (ompt_enabled.ompt_callback_implicit_task) {
1517         OMPT_CUR_TASK_INFO(master_th)->thread_num = __kmp_tid_from_gtid(gtid);
1518         ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1519             ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th), implicit_task_data,
1520             1, OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
1521       }
1522 
1523       /* OMPT state */
1524       master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1525     } else {
1526       exit_frame_p = &dummy;
1527     }
1528 #endif
1529 
1530     // AC: need to decrement t_serialized for enquiry functions to work
1531     // correctly, will restore at join time
1532     parent_team->t.t_serialized--;
1533 
1534     {
1535       KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1536       KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1537       __kmp_invoke_microtask(microtask, gtid, 0, argc, parent_team->t.t_argv
1538 #if OMPT_SUPPORT
1539                              ,
1540                              exit_frame_p
1541 #endif
1542                              );
1543     }
1544 
1545 #if OMPT_SUPPORT
1546     if (ompt_enabled.enabled) {
1547       *exit_frame_p = NULL;
1548       OMPT_CUR_TASK_INFO(master_th)->frame.exit_frame = ompt_data_none;
1549       if (ompt_enabled.ompt_callback_implicit_task) {
1550         ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1551             ompt_scope_end, NULL, implicit_task_data, 1,
1552             OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
1553       }
1554       ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
1555       __ompt_lw_taskteam_unlink(master_th);
1556       if (ompt_enabled.ompt_callback_parallel_end) {
1557         ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1558             &ompt_parallel_data, OMPT_CUR_TASK_DATA(master_th),
1559             OMPT_INVOKER(call_context) | ompt_parallel_team, return_address);
1560       }
1561       master_th->th.ompt_thread_info.state = ompt_state_overhead;
1562     }
1563 #endif
1564     return TRUE;
1565   }
1566 
1567   parent_team->t.t_pkfn = microtask;
1568   parent_team->t.t_invoke = invoker;
1569   KMP_ATOMIC_INC(&root->r.r_in_parallel);
1570   parent_team->t.t_active_level++;
1571   parent_team->t.t_level++;
1572   parent_team->t.t_def_allocator = master_th->th.th_def_allocator; // save
1573 
1574   // If the threads allocated to the team are less than the thread limit, update
1575   // the thread limit here. th_teams_size.nth is specific to this team nested
1576   // in a teams construct, the team is fully created, and we're about to do
1577   // the actual fork. Best to do this here so that the subsequent uses below
1578   // and in the join have the correct value.
1579   master_th->th.th_teams_size.nth = parent_team->t.t_nproc;
1580 
1581 #if OMPT_SUPPORT
1582   if (ompt_enabled.enabled) {
1583     ompt_lw_taskteam_t lw_taskteam;
1584     __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid, &ompt_parallel_data,
1585                             return_address);
1586     __ompt_lw_taskteam_link(&lw_taskteam, master_th, 1, true);
1587   }
1588 #endif
1589 
1590   /* Change number of threads in the team if requested */
1591   if (master_set_numthreads) { // The parallel has num_threads clause
1592     if (master_set_numthreads <= master_th->th.th_teams_size.nth) {
1593       // AC: only can reduce number of threads dynamically, can't increase
1594       kmp_info_t **other_threads = parent_team->t.t_threads;
1595       // NOTE: if using distributed barrier, we need to run this code block
1596       // even when the team size appears not to have changed from the max.
1597       int old_proc = master_th->th.th_teams_size.nth;
1598       if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
1599         __kmp_resize_dist_barrier(parent_team, old_proc, master_set_numthreads);
1600         __kmp_add_threads_to_team(parent_team, master_set_numthreads);
1601       }
1602       parent_team->t.t_nproc = master_set_numthreads;
1603       for (i = 0; i < master_set_numthreads; ++i) {
1604         other_threads[i]->th.th_team_nproc = master_set_numthreads;
1605       }
1606     }
1607     // Keep extra threads hot in the team for possible next parallels
1608     master_th->th.th_set_nproc = 0;
1609   }
1610 
1611 #if USE_DEBUGGER
1612   if (__kmp_debugging) { // Let debugger override number of threads.
1613     int nth = __kmp_omp_num_threads(loc);
1614     if (nth > 0) { // 0 means debugger doesn't want to change num threads
1615       master_set_numthreads = nth;
1616     }
1617   }
1618 #endif
1619 
1620   // Figure out the proc_bind policy for the nested parallel within teams
1621   kmp_proc_bind_t proc_bind = master_th->th.th_set_proc_bind;
1622   // proc_bind_default means don't update
1623   kmp_proc_bind_t proc_bind_icv = proc_bind_default;
1624   if (master_th->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
1625     proc_bind = proc_bind_false;
1626   } else {
1627     // No proc_bind clause specified; use current proc-bind-var
1628     if (proc_bind == proc_bind_default) {
1629       proc_bind = master_th->th.th_current_task->td_icvs.proc_bind;
1630     }
1631     /* else: The proc_bind policy was specified explicitly on parallel clause.
1632        This overrides proc-bind-var for this parallel region, but does not
1633        change proc-bind-var. */
1634     // Figure the value of proc-bind-var for the child threads.
1635     if ((level + 1 < __kmp_nested_proc_bind.used) &&
1636         (__kmp_nested_proc_bind.bind_types[level + 1] !=
1637          master_th->th.th_current_task->td_icvs.proc_bind)) {
1638       proc_bind_icv = __kmp_nested_proc_bind.bind_types[level + 1];
1639     }
1640   }
1641   KMP_CHECK_UPDATE(parent_team->t.t_proc_bind, proc_bind);
1642   // Need to change the bind-var ICV to correct value for each implicit task
1643   if (proc_bind_icv != proc_bind_default &&
1644       master_th->th.th_current_task->td_icvs.proc_bind != proc_bind_icv) {
1645     kmp_info_t **other_threads = parent_team->t.t_threads;
1646     for (i = 0; i < master_th->th.th_team_nproc; ++i) {
1647       other_threads[i]->th.th_current_task->td_icvs.proc_bind = proc_bind_icv;
1648     }
1649   }
1650   // Reset for next parallel region
1651   master_th->th.th_set_proc_bind = proc_bind_default;
1652 
1653 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1654   if (((__itt_frame_submit_v3_ptr && __itt_get_timestamp_ptr) ||
1655        KMP_ITT_DEBUG) &&
1656       __kmp_forkjoin_frames_mode == 3 &&
1657       parent_team->t.t_active_level == 1 // only report frames at level 1
1658       && master_th->th.th_teams_size.nteams == 1) {
1659     kmp_uint64 tmp_time = __itt_get_timestamp();
1660     master_th->th.th_frame_time = tmp_time;
1661     parent_team->t.t_region_time = tmp_time;
1662   }
1663   if (__itt_stack_caller_create_ptr) {
1664     KMP_DEBUG_ASSERT(parent_team->t.t_stack_id == NULL);
1665     // create new stack stitching id before entering fork barrier
1666     parent_team->t.t_stack_id = __kmp_itt_stack_caller_create();
1667   }
1668 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
1669 #if KMP_AFFINITY_SUPPORTED
1670   __kmp_partition_places(parent_team);
1671 #endif
1672 
1673   KF_TRACE(10, ("__kmp_fork_in_teams: before internal fork: root=%p, team=%p, "
1674                 "master_th=%p, gtid=%d\n",
1675                 root, parent_team, master_th, gtid));
1676   __kmp_internal_fork(loc, gtid, parent_team);
1677   KF_TRACE(10, ("__kmp_fork_in_teams: after internal fork: root=%p, team=%p, "
1678                 "master_th=%p, gtid=%d\n",
1679                 root, parent_team, master_th, gtid));
1680 
1681   if (call_context == fork_context_gnu)
1682     return TRUE;
1683 
1684   /* Invoke microtask for PRIMARY thread */
1685   KA_TRACE(20, ("__kmp_fork_in_teams: T#%d(%d:0) invoke microtask = %p\n", gtid,
1686                 parent_team->t.t_id, parent_team->t.t_pkfn));
1687 
1688   if (!parent_team->t.t_invoke(gtid)) {
1689     KMP_ASSERT2(0, "cannot invoke microtask for PRIMARY thread");
1690   }
1691   KA_TRACE(20, ("__kmp_fork_in_teams: T#%d(%d:0) done microtask = %p\n", gtid,
1692                 parent_team->t.t_id, parent_team->t.t_pkfn));
1693   KMP_MB(); /* Flush all pending memory write invalidates.  */
1694 
1695   KA_TRACE(20, ("__kmp_fork_in_teams: parallel exit T#%d\n", gtid));
1696 
1697   return TRUE;
1698 }
1699 
1700 // Create a serialized parallel region
     //
     // Executes the region on the calling thread only. After calling
     // __kmpc_serialized_parallel(loc, gtid), the work is dispatched on
     // call_context:
     //  - fork_context_intel: the microtask (or, for __kmp_teams_master, the
     //    invoker) is run right here before returning. Three sub-cases are
     //    distinguished by `ap` and `microtask`; see the branches below.
     //  - fork_context_gnu: nothing is invoked here; only OMPT bookkeeping is
     //    done before returning.
     //  - any other value: only asserted to be below fork_context_last.
     //
     // Parameters:
     //   loc, gtid     - forwarded to __kmpc_serialized_parallel(); gtid is also
     //                   handed to the microtask / invoker.
     //   call_context  - selects the branch described above.
     //   argc          - number of microtask arguments.
     //   microtask     - outlined region body; compared with __kmp_teams_master
     //                   to detect the teams case.
     //   invoker       - used only in the __kmp_teams_master branch.
     //   master_th     - calling thread; fields of its th_serial_team are
     //                   written here.
     //   parent_team   - source of t_argv when `ap` is null.
     //   ompt_parallel_data, return_address, parent_task_data
     //                 - (OMPT_SUPPORT builds only) passed by pointer and
     //                   dereferenced for the OMPT callbacks. In two of the
     //                   intel branches *ompt_parallel_data is overwritten with
     //                   the current team data just before the lightweight task
     //                   team is unlinked.
     //   ap            - variadic microtask arguments; a null `ap` means the
     //                   arguments come from parent_team->t.t_argv instead.
     //
     // Returns FALSE on every path.
1701 static inline int
1702 __kmp_serial_fork_call(ident_t *loc, int gtid, enum fork_context_e call_context,
1703                        kmp_int32 argc, microtask_t microtask, launch_t invoker,
1704                        kmp_info_t *master_th, kmp_team_t *parent_team,
1705 #if OMPT_SUPPORT
1706                        ompt_data_t *ompt_parallel_data, void **return_address,
1707                        ompt_data_t **parent_task_data,
1708 #endif
1709                        kmp_va_list ap) {
1710   kmp_team_t *team;
1711   int i;
1712   void **argv;
1713 
       // Scratch storage for argc pointers. It is only filled and used by the
       // last fork_context_intel branch (plain serialized parallel), but it is
       // declared at function scope under either preprocessor variant.
1714 /* josh todo: hypothetical question: what do we do for OS X*? */
1715 #if KMP_OS_LINUX &&                                                            \
1716     (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
1717   SimpleVLA<void *> args(argc);
1718 #else
1719   void **args = (void **)KMP_ALLOCA(argc * sizeof(void *));
1720 #endif /* KMP_OS_LINUX && ( KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || \
1721           KMP_ARCH_AARCH64) */
1722 
1723   KA_TRACE(
1724       20, ("__kmp_serial_fork_call: T#%d serializing parallel region\n", gtid));
1725 
1726   __kmpc_serialized_parallel(loc, gtid);
1727 
       // OMPD builds additionally record the microtask on the serial team.
1728 #if OMPD_SUPPORT
1729   master_th->th.th_serial_team->t.t_pkfn = microtask;
1730 #endif
1731 
1732   if (call_context == fork_context_intel) {
1733     /* TODO this sucks, use the compiler itself to pass args! :) */
1734     master_th->th.th_serial_team->t.t_ident = loc;
         // Case 1: no va_list. The microtask is invoked with the parent team's
         // argument vector.
1735     if (!ap) {
1736       // revert change made in __kmpc_serialized_parallel()
1737       master_th->th.th_serial_team->t.t_level--;
1738 // Get args from parent team for teams construct
1739 
1740 #if OMPT_SUPPORT
           // dummy is the fallback target of exit_frame_p when OMPT is disabled.
           // task_info is assigned only when ompt_enabled.enabled is set and is
           // read only under that same check further down.
1741       void *dummy;
1742       void **exit_frame_p;
1743       ompt_task_info_t *task_info;
1744       ompt_lw_taskteam_t lw_taskteam;
1745 
1746       if (ompt_enabled.enabled) {
1747         __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1748                                 ompt_parallel_data, *return_address);
1749 
1750         __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
1751         // don't use lw_taskteam after linking. content was swapped
1752         task_info = OMPT_CUR_TASK_INFO(master_th);
1753         exit_frame_p = &(task_info->frame.exit_frame.ptr);
1754         if (ompt_enabled.ompt_callback_implicit_task) {
               // thread_num is stored first; the callback is then passed the
               // stored value.
1755           OMPT_CUR_TASK_INFO(master_th)->thread_num = __kmp_tid_from_gtid(gtid);
1756           ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1757               ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1758               &(task_info->task_data), 1,
1759               OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
1760         }
1761 
1762         /* OMPT state */
1763         master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1764       } else {
1765         exit_frame_p = &dummy;
1766       }
1767 #endif
1768 
           // Run the region body on this thread. The braces scope the two
           // KMP_*_BLOCK macros to the invocation only.
1769       {
1770         KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1771         KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1772         __kmp_invoke_microtask(microtask, gtid, 0, argc, parent_team->t.t_argv
1773 #if OMPT_SUPPORT
1774                                ,
1775                                exit_frame_p
1776 #endif
1777                                );
1778       }
1779 
1780 #if OMPT_SUPPORT
           // Tear-down order: clear the exit frame, report implicit-task end,
           // copy the team data out to the caller, unlink the lightweight task
           // team, then report parallel end.
1781       if (ompt_enabled.enabled) {
1782         *exit_frame_p = NULL;
1783         if (ompt_enabled.ompt_callback_implicit_task) {
1784           ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1785               ompt_scope_end, NULL, &(task_info->task_data), 1,
1786               OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
1787         }
1788         *ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
1789         __ompt_lw_taskteam_unlink(master_th);
1790         if (ompt_enabled.ompt_callback_parallel_end) {
1791           ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1792               ompt_parallel_data, *parent_task_data,
1793               OMPT_INVOKER(call_context) | ompt_parallel_team, *return_address);
1794         }
1795         master_th->th.ompt_thread_info.state = ompt_state_overhead;
1796       }
1797 #endif
         // Case 2: the microtask is __kmp_teams_master. Arguments are copied
         // into the (serial) team's own t_argv and the supplied invoker is
         // called instead of __kmp_invoke_microtask.
1798     } else if (microtask == (microtask_t)__kmp_teams_master) {
1799       KMP_DEBUG_ASSERT(master_th->th.th_team == master_th->th.th_serial_team);
1800       team = master_th->th.th_team;
1801       // team->t.t_pkfn = microtask;
1802       team->t.t_invoke = invoker;
1803       __kmp_alloc_argv_entries(argc, team, TRUE);
1804       team->t.t_argc = argc;
1805       argv = (void **)team->t.t_argv;
           // The index counts down, but argv advances forward, so the argc
           // arguments are stored in the order va_arg yields them.
1806       for (i = argc - 1; i >= 0; --i)
1807         *argv++ = va_arg(kmp_va_deref(ap), void *);
1808       // AC: revert change made in __kmpc_serialized_parallel()
1809       //     because initial code in teams should have level=0
1810       team->t.t_level--;
1811       // AC: call special invoker for outer "parallel" of teams construct
1812       invoker(gtid);
1813 #if OMPT_SUPPORT
           // Only end-of-region callbacks are issued in this branch, and no
           // lightweight task team is linked or unlinked here. The
           // implicit-task end passes 0 and ompt_task_initial where the other
           // branches pass 1 and ompt_task_implicit; parallel end is flagged
           // ompt_parallel_league.
           // NOTE(review): the matching begin callback is presumably raised on
           // the invoker path - confirm.
1814       if (ompt_enabled.enabled) {
1815         ompt_task_info_t *task_info = OMPT_CUR_TASK_INFO(master_th);
1816         if (ompt_enabled.ompt_callback_implicit_task) {
1817           ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1818               ompt_scope_end, NULL, &(task_info->task_data), 0,
1819               OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_initial);
1820         }
1821         if (ompt_enabled.ompt_callback_parallel_end) {
1822           ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1823               ompt_parallel_data, *parent_task_data,
1824               OMPT_INVOKER(call_context) | ompt_parallel_league,
1825               *return_address);
1826         }
1827         master_th->th.ompt_thread_info.state = ompt_state_overhead;
1828       }
1829 #endif
         // Case 3: ordinary serialized parallel. Arguments are copied from the
         // va_list into the local `args` scratch array and the microtask is
         // invoked with it.
1830     } else {
1831       argv = args;
1832       for (i = argc - 1; i >= 0; --i)
1833         *argv++ = va_arg(kmp_va_deref(ap), void *);
1834       KMP_MB();
1835 
1836 #if OMPT_SUPPORT
           // Same roles as in case 1. implicit_task_data is likewise assigned
           // only when ompt_enabled.enabled is set.
1837       void *dummy;
1838       void **exit_frame_p;
1839       ompt_task_info_t *task_info;
1840       ompt_lw_taskteam_t lw_taskteam;
1841       ompt_data_t *implicit_task_data;
1842 
1843       if (ompt_enabled.enabled) {
1844         __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1845                                 ompt_parallel_data, *return_address);
1846         __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
1847         // don't use lw_taskteam after linking. content was swapped
1848         task_info = OMPT_CUR_TASK_INFO(master_th);
1849         exit_frame_p = &(task_info->frame.exit_frame.ptr);
1850 
1851         /* OMPT implicit task begin */
1852         implicit_task_data = OMPT_CUR_TASK_DATA(master_th);
1853         if (ompt_enabled.ompt_callback_implicit_task) {
               // Unlike case 1, the callback is passed
               // __kmp_tid_from_gtid(gtid) directly and thread_num is stored
               // only after the callback returns.
1854           ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1855               ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1856               implicit_task_data, 1, __kmp_tid_from_gtid(gtid),
1857               ompt_task_implicit);
1858           OMPT_CUR_TASK_INFO(master_th)->thread_num = __kmp_tid_from_gtid(gtid);
1859         }
1860 
1861         /* OMPT state */
1862         master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1863       } else {
1864         exit_frame_p = &dummy;
1865       }
1866 #endif
1867 
           // Run the region body on this thread with the locally gathered
           // arguments.
1868       {
1869         KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1870         KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1871         __kmp_invoke_microtask(microtask, gtid, 0, argc, args
1872 #if OMPT_SUPPORT
1873                                ,
1874                                exit_frame_p
1875 #endif
1876                                );
1877       }
1878 
1879 #if OMPT_SUPPORT
           // Same tear-down order as case 1.
1880       if (ompt_enabled.enabled) {
1881         *exit_frame_p = NULL;
1882         if (ompt_enabled.ompt_callback_implicit_task) {
1883           ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1884               ompt_scope_end, NULL, &(task_info->task_data), 1,
1885               OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
1886         }
1887 
1888         *ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
1889         __ompt_lw_taskteam_unlink(master_th);
1890         if (ompt_enabled.ompt_callback_parallel_end) {
1891           ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1892               ompt_parallel_data, *parent_task_data,
1893               OMPT_INVOKER(call_context) | ompt_parallel_team, *return_address);
1894         }
1895         master_th->th.ompt_thread_info.state = ompt_state_overhead;
1896       }
1897 #endif
1898     }
       // GNU entry: the microtask is not invoked here and no end callbacks are
       // raised. Only a lightweight task team is initialised and linked before
       // returning.
1899   } else if (call_context == fork_context_gnu) {
1900 #if OMPT_SUPPORT
1901     if (ompt_enabled.enabled) {
1902       ompt_lw_taskteam_t lwt;
1903       __ompt_lw_taskteam_init(&lwt, master_th, gtid, ompt_parallel_data,
1904                               *return_address);
1905 
1906       lwt.ompt_task_info.frame.exit_frame = ompt_data_none;
           // NOTE(review): the third argument is 1 here but 0 in the intel
           // branches. lwt goes out of scope right after this call, so 1
           // presumably asks the link routine to keep its own copy - confirm
           // against __ompt_lw_taskteam_link.
1907       __ompt_lw_taskteam_link(&lwt, master_th, 1);
1908     }
1909 // don't use lw_taskteam after linking. content was swapped
1910 #endif
1911 
1912     // we were called from GNU native code
1913     KA_TRACE(20, ("__kmp_serial_fork_call: T#%d serial exit\n", gtid));
1914     return FALSE;
       // Any other context: nothing is invoked; the value is only checked to
       // lie below fork_context_last.
1915   } else {
1916     KMP_ASSERT2(call_context < fork_context_last,
1917                 "__kmp_serial_fork_call: unknown fork_context parameter");
1918   }
1919 
1920   KA_TRACE(20, ("__kmp_serial_fork_call: T#%d serial exit\n", gtid));
1921   KMP_MB();
1922   return FALSE;
1923 }
1924 
1925 /* most of the work for a fork */
1926 /* return true if we really went parallel, false if serialized */
1927 int __kmp_fork_call(ident_t *loc, int gtid,
1928                     enum fork_context_e call_context, // Intel, GNU, ...
1929                     kmp_int32 argc, microtask_t microtask, launch_t invoker,
1930                     kmp_va_list ap) {
1931   void **argv;
1932   int i;
1933   int master_tid;
1934   int master_this_cons;
1935   kmp_team_t *team;
1936   kmp_team_t *parent_team;
1937   kmp_info_t *master_th;
1938   kmp_root_t *root;
1939   int nthreads;
1940   int master_active;
1941   int master_set_numthreads;
1942   int task_thread_limit = 0;
1943   int level;
1944   int active_level;
1945   int teams_level;
1946 #if KMP_NESTED_HOT_TEAMS
1947   kmp_hot_team_ptr_t **p_hot_teams;
1948 #endif
1949   { // KMP_TIME_BLOCK
1950     KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_fork_call);
1951     KMP_COUNT_VALUE(OMP_PARALLEL_args, argc);
1952 
1953     KA_TRACE(20, ("__kmp_fork_call: enter T#%d\n", gtid));
1954     if (__kmp_stkpadding > 0 && __kmp_root[gtid] != NULL) {
1955       /* Some systems prefer the stack for the root thread(s) to start with */
1956       /* some gap from the parent stack to prevent false sharing. */
1957       void *dummy = KMP_ALLOCA(__kmp_stkpadding);
1958       /* These 2 lines below are so this does not get optimized out */
1959       if (__kmp_stkpadding > KMP_MAX_STKPADDING)
1960         __kmp_stkpadding += (short)((kmp_int64)dummy);
1961     }
1962 
1963     /* initialize if needed */
1964     KMP_DEBUG_ASSERT(
1965         __kmp_init_serial); // AC: potentially unsafe, not in sync with shutdown
1966     if (!TCR_4(__kmp_init_parallel))
1967       __kmp_parallel_initialize();
1968     __kmp_resume_if_soft_paused();
1969 
1970     /* setup current data */
1971     // AC: potentially unsafe, not in sync with library shutdown,
1972     // __kmp_threads can be freed
1973     master_th = __kmp_threads[gtid];
1974 
1975     parent_team = master_th->th.th_team;
1976     master_tid = master_th->th.th_info.ds.ds_tid;
1977     master_this_cons = master_th->th.th_local.this_construct;
1978     root = master_th->th.th_root;
1979     master_active = root->r.r_active;
1980     master_set_numthreads = master_th->th.th_set_nproc;
1981     task_thread_limit =
1982         master_th->th.th_current_task->td_icvs.task_thread_limit;
1983 
1984 #if OMPT_SUPPORT
1985     ompt_data_t ompt_parallel_data = ompt_data_none;
1986     ompt_data_t *parent_task_data;
1987     ompt_frame_t *ompt_frame;
1988     void *return_address = NULL;
1989 
1990     if (ompt_enabled.enabled) {
1991       __ompt_get_task_info_internal(0, NULL, &parent_task_data, &ompt_frame,
1992                                     NULL, NULL);
1993       return_address = OMPT_LOAD_RETURN_ADDRESS(gtid);
1994     }
1995 #endif
1996 
1997     // Assign affinity to root thread if it hasn't happened yet
1998     __kmp_assign_root_init_mask();
1999 
2000     // Nested level will be an index in the nested nthreads array
2001     level = parent_team->t.t_level;
2002     // used to launch non-serial teams even if nested is not allowed
2003     active_level = parent_team->t.t_active_level;
2004     // needed to check nesting inside the teams
2005     teams_level = master_th->th.th_teams_level;
2006 #if KMP_NESTED_HOT_TEAMS
2007     p_hot_teams = &master_th->th.th_hot_teams;
2008     if (*p_hot_teams == NULL && __kmp_hot_teams_max_level > 0) {
2009       *p_hot_teams = (kmp_hot_team_ptr_t *)__kmp_allocate(
2010           sizeof(kmp_hot_team_ptr_t) * __kmp_hot_teams_max_level);
2011       (*p_hot_teams)[0].hot_team = root->r.r_hot_team;
2012       // it is either actual or not needed (when active_level > 0)
2013       (*p_hot_teams)[0].hot_team_nth = 1;
2014     }
2015 #endif
2016 
2017 #if OMPT_SUPPORT
2018     if (ompt_enabled.enabled) {
2019       if (ompt_enabled.ompt_callback_parallel_begin) {
2020         int team_size = master_set_numthreads
2021                             ? master_set_numthreads
2022                             : get__nproc_2(parent_team, master_tid);
2023         int flags = OMPT_INVOKER(call_context) |
2024                     ((microtask == (microtask_t)__kmp_teams_master)
2025                          ? ompt_parallel_league
2026                          : ompt_parallel_team);
2027         ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)(
2028             parent_task_data, ompt_frame, &ompt_parallel_data, team_size, flags,
2029             return_address);
2030       }
2031       master_th->th.ompt_thread_info.state = ompt_state_overhead;
2032     }
2033 #endif
2034 
2035     master_th->th.th_ident = loc;
2036 
2037     // Parallel closely nested in teams construct:
2038     if (__kmp_is_fork_in_teams(master_th, microtask, level, teams_level, ap)) {
2039       return __kmp_fork_in_teams(loc, gtid, parent_team, argc, master_th, root,
2040                                  call_context, microtask, invoker,
2041                                  master_set_numthreads, level,
2042 #if OMPT_SUPPORT
2043                                  ompt_parallel_data, return_address,
2044 #endif
2045                                  ap);
2046     } // End parallel closely nested in teams construct
2047 
2048     // Need this to happen before we determine the number of threads, not while
2049     // we are allocating the team
2050     //__kmp_push_current_task_to_thread(master_th, parent_team, 0);
2051 
2052     KMP_DEBUG_ASSERT_TASKTEAM_INVARIANT(parent_team, master_th);
2053 
2054     // Determine the number of threads
2055     int enter_teams =
2056         __kmp_is_entering_teams(active_level, level, teams_level, ap);
2057     if ((!enter_teams &&
2058          (parent_team->t.t_active_level >=
2059           master_th->th.th_current_task->td_icvs.max_active_levels)) ||
2060         (__kmp_library == library_serial)) {
2061       KC_TRACE(10, ("__kmp_fork_call: T#%d serializing team\n", gtid));
2062       nthreads = 1;
2063     } else {
2064       nthreads = master_set_numthreads
2065                      ? master_set_numthreads
2066                      // TODO: get nproc directly from current task
2067                      : get__nproc_2(parent_team, master_tid);
2068       // Use the thread_limit set for the current target task if exists, else go
2069       // with the deduced nthreads
2070       nthreads = task_thread_limit > 0 && task_thread_limit < nthreads
2071                      ? task_thread_limit
2072                      : nthreads;
2073       // Check if we need to take forkjoin lock? (no need for serialized
2074       // parallel out of teams construct).
2075       if (nthreads > 1) {
2076         /* determine how many new threads we can use */
2077         __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
2078         /* AC: If we execute teams from parallel region (on host), then teams
2079            should be created but each can only have 1 thread if nesting is
2080            disabled. If teams called from serial region, then teams and their
2081            threads should be created regardless of the nesting setting. */
2082         nthreads = __kmp_reserve_threads(root, parent_team, master_tid,
2083                                          nthreads, enter_teams);
2084         if (nthreads == 1) {
2085           // Free lock for single thread execution here; for multi-thread
2086           // execution it will be freed later after team of threads created
2087           // and initialized
2088           __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2089         }
2090       }
2091     }
2092     KMP_DEBUG_ASSERT(nthreads > 0);
2093 
2094     // If we temporarily changed the set number of threads then restore it now
2095     master_th->th.th_set_nproc = 0;
2096 
2097     if (nthreads == 1) {
2098       return __kmp_serial_fork_call(loc, gtid, call_context, argc, microtask,
2099                                     invoker, master_th, parent_team,
2100 #if OMPT_SUPPORT
2101                                     &ompt_parallel_data, &return_address,
2102                                     &parent_task_data,
2103 #endif
2104                                     ap);
2105     } // if (nthreads == 1)
2106 
2107     // GEH: only modify the executing flag in the case when not serialized
2108     //      serialized case is handled in kmpc_serialized_parallel
2109     KF_TRACE(10, ("__kmp_fork_call: parent_team_aclevel=%d, master_th=%p, "
2110                   "curtask=%p, curtask_max_aclevel=%d\n",
2111                   parent_team->t.t_active_level, master_th,
2112                   master_th->th.th_current_task,
2113                   master_th->th.th_current_task->td_icvs.max_active_levels));
2114     // TODO: GEH - cannot do this assertion because root thread not set up as
2115     // executing
2116     // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 1 );
2117     master_th->th.th_current_task->td_flags.executing = 0;
2118 
2119     if (!master_th->th.th_teams_microtask || level > teams_level) {
2120       /* Increment our nested depth level */
2121       KMP_ATOMIC_INC(&root->r.r_in_parallel);
2122     }
2123 
2124     // See if we need to make a copy of the ICVs.
2125     int nthreads_icv = master_th->th.th_current_task->td_icvs.nproc;
2126     kmp_nested_nthreads_t *nested_nth = NULL;
2127     if (!master_th->th.th_set_nested_nth &&
2128         (level + 1 < parent_team->t.t_nested_nth->used) &&
2129         (parent_team->t.t_nested_nth->nth[level + 1] != nthreads_icv)) {
2130       nthreads_icv = parent_team->t.t_nested_nth->nth[level + 1];
2131     } else if (master_th->th.th_set_nested_nth) {
2132       nested_nth = __kmp_override_nested_nth(master_th, level);
2133       if ((level + 1 < nested_nth->used) &&
2134           (nested_nth->nth[level + 1] != nthreads_icv))
2135         nthreads_icv = nested_nth->nth[level + 1];
2136       else
2137         nthreads_icv = 0; // don't update
2138     } else {
2139       nthreads_icv = 0; // don't update
2140     }
2141 
2142     // Figure out the proc_bind_policy for the new team.
2143     kmp_proc_bind_t proc_bind = master_th->th.th_set_proc_bind;
2144     // proc_bind_default means don't update
2145     kmp_proc_bind_t proc_bind_icv = proc_bind_default;
2146     if (master_th->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
2147       proc_bind = proc_bind_false;
2148     } else {
2149       // No proc_bind clause specified; use current proc-bind-var for this
2150       // parallel region
2151       if (proc_bind == proc_bind_default) {
2152         proc_bind = master_th->th.th_current_task->td_icvs.proc_bind;
2153       }
2154       // Have teams construct take proc_bind value from KMP_TEAMS_PROC_BIND
2155       if (master_th->th.th_teams_microtask &&
2156           microtask == (microtask_t)__kmp_teams_master) {
2157         proc_bind = __kmp_teams_proc_bind;
2158       }
2159       /* else: The proc_bind policy was specified explicitly on parallel clause.
2160          This overrides proc-bind-var for this parallel region, but does not
2161          change proc-bind-var. */
2162       // Figure the value of proc-bind-var for the child threads.
2163       if ((level + 1 < __kmp_nested_proc_bind.used) &&
2164           (__kmp_nested_proc_bind.bind_types[level + 1] !=
2165            master_th->th.th_current_task->td_icvs.proc_bind)) {
2166         // Do not modify the proc bind icv for the two teams construct forks
2167         // They just let the proc bind icv pass through
2168         if (!master_th->th.th_teams_microtask ||
2169             !(microtask == (microtask_t)__kmp_teams_master || ap == NULL))
2170           proc_bind_icv = __kmp_nested_proc_bind.bind_types[level + 1];
2171       }
2172     }
2173 
2174     // Reset for next parallel region
2175     master_th->th.th_set_proc_bind = proc_bind_default;
2176 
2177     if ((nthreads_icv > 0) || (proc_bind_icv != proc_bind_default)) {
2178       kmp_internal_control_t new_icvs;
2179       copy_icvs(&new_icvs, &master_th->th.th_current_task->td_icvs);
2180       new_icvs.next = NULL;
2181       if (nthreads_icv > 0) {
2182         new_icvs.nproc = nthreads_icv;
2183       }
2184       if (proc_bind_icv != proc_bind_default) {
2185         new_icvs.proc_bind = proc_bind_icv;
2186       }
2187 
2188       /* allocate a new parallel team */
2189       KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n"));
2190       team = __kmp_allocate_team(root, nthreads, nthreads,
2191 #if OMPT_SUPPORT
2192                                  ompt_parallel_data,
2193 #endif
2194                                  proc_bind, &new_icvs,
2195                                  argc USE_NESTED_HOT_ARG(master_th));
2196       if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar)
2197         copy_icvs((kmp_internal_control_t *)team->t.b->team_icvs, &new_icvs);
2198     } else {
2199       /* allocate a new parallel team */
2200       KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n"));
2201       team = __kmp_allocate_team(root, nthreads, nthreads,
2202 #if OMPT_SUPPORT
2203                                  ompt_parallel_data,
2204 #endif
2205                                  proc_bind,
2206                                  &master_th->th.th_current_task->td_icvs,
2207                                  argc USE_NESTED_HOT_ARG(master_th));
2208       if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar)
2209         copy_icvs((kmp_internal_control_t *)team->t.b->team_icvs,
2210                   &master_th->th.th_current_task->td_icvs);
2211     }
2212     KF_TRACE(
2213         10, ("__kmp_fork_call: after __kmp_allocate_team - team = %p\n", team));
2214 
2215     /* setup the new team */
2216     KMP_CHECK_UPDATE(team->t.t_master_tid, master_tid);
2217     KMP_CHECK_UPDATE(team->t.t_master_this_cons, master_this_cons);
2218     KMP_CHECK_UPDATE(team->t.t_ident, loc);
2219     KMP_CHECK_UPDATE(team->t.t_parent, parent_team);
2220     KMP_CHECK_UPDATE_SYNC(team->t.t_pkfn, microtask);
2221 #if OMPT_SUPPORT
2222     KMP_CHECK_UPDATE_SYNC(team->t.ompt_team_info.master_return_address,
2223                           return_address);
2224 #endif
2225     KMP_CHECK_UPDATE(team->t.t_invoke, invoker); // TODO move to root, maybe
2226     // TODO: parent_team->t.t_level == INT_MAX ???
2227     if (!master_th->th.th_teams_microtask || level > teams_level) {
2228       int new_level = parent_team->t.t_level + 1;
2229       KMP_CHECK_UPDATE(team->t.t_level, new_level);
2230       new_level = parent_team->t.t_active_level + 1;
2231       KMP_CHECK_UPDATE(team->t.t_active_level, new_level);
2232     } else {
2233       // AC: Do not increase parallel level at start of the teams construct
2234       int new_level = parent_team->t.t_level;
2235       KMP_CHECK_UPDATE(team->t.t_level, new_level);
2236       new_level = parent_team->t.t_active_level;
2237       KMP_CHECK_UPDATE(team->t.t_active_level, new_level);
2238     }
2239     kmp_r_sched_t new_sched = get__sched_2(parent_team, master_tid);
2240     // set primary thread's schedule as new run-time schedule
2241     KMP_CHECK_UPDATE(team->t.t_sched.sched, new_sched.sched);
2242 
2243     KMP_CHECK_UPDATE(team->t.t_cancel_request, cancel_noreq);
2244     KMP_CHECK_UPDATE(team->t.t_def_allocator, master_th->th.th_def_allocator);
2245 
2246     // Check if hot team has potentially outdated list, and if so, free it
2247     if (team->t.t_nested_nth &&
2248         team->t.t_nested_nth != parent_team->t.t_nested_nth) {
2249       KMP_INTERNAL_FREE(team->t.t_nested_nth->nth);
2250       KMP_INTERNAL_FREE(team->t.t_nested_nth);
2251       team->t.t_nested_nth = NULL;
2252     }
2253     team->t.t_nested_nth = parent_team->t.t_nested_nth;
2254     if (master_th->th.th_set_nested_nth) {
2255       if (!nested_nth)
2256         nested_nth = __kmp_override_nested_nth(master_th, level);
2257       team->t.t_nested_nth = nested_nth;
2258       KMP_INTERNAL_FREE(master_th->th.th_set_nested_nth);
2259       master_th->th.th_set_nested_nth = NULL;
2260       master_th->th.th_set_nested_nth_sz = 0;
2261       master_th->th.th_nt_strict = false;
2262     }
2263 
2264     // Update the floating point rounding in the team if required.
2265     propagateFPControl(team);
2266 #if OMPD_SUPPORT
2267     if (ompd_state & OMPD_ENABLE_BP)
2268       ompd_bp_parallel_begin();
2269 #endif
2270 
2271     KA_TRACE(
2272         20,
2273         ("__kmp_fork_call: T#%d(%d:%d)->(%d:0) created a team of %d threads\n",
2274          gtid, parent_team->t.t_id, team->t.t_master_tid, team->t.t_id,
2275          team->t.t_nproc));
2276     KMP_DEBUG_ASSERT(team != root->r.r_hot_team ||
2277                      (team->t.t_master_tid == 0 &&
2278                       (team->t.t_parent == root->r.r_root_team ||
2279                        team->t.t_parent->t.t_serialized)));
2280     KMP_MB();
2281 
2282     /* now, setup the arguments */
2283     argv = (void **)team->t.t_argv;
2284     if (ap) {
2285       for (i = argc - 1; i >= 0; --i) {
2286         void *new_argv = va_arg(kmp_va_deref(ap), void *);
2287         KMP_CHECK_UPDATE(*argv, new_argv);
2288         argv++;
2289       }
2290     } else {
2291       for (i = 0; i < argc; ++i) {
2292         // Get args from parent team for teams construct
2293         KMP_CHECK_UPDATE(argv[i], team->t.t_parent->t.t_argv[i]);
2294       }
2295     }
2296 
2297     /* now actually fork the threads */
2298     KMP_CHECK_UPDATE(team->t.t_master_active, master_active);
2299     if (!root->r.r_active) // Only do assignment if it prevents cache ping-pong
2300       root->r.r_active = TRUE;
2301 
2302     __kmp_fork_team_threads(root, team, master_th, gtid, !ap);
2303     __kmp_setup_icv_copy(team, nthreads,
2304                          &master_th->th.th_current_task->td_icvs, loc);
2305 
2306 #if OMPT_SUPPORT
2307     master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
2308 #endif
2309 
2310     __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2311 
2312 #if USE_ITT_BUILD
2313     if (team->t.t_active_level == 1 // only report frames at level 1
2314         && !master_th->th.th_teams_microtask) { // not in teams construct
2315 #if USE_ITT_NOTIFY
2316       if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
2317           (__kmp_forkjoin_frames_mode == 3 ||
2318            __kmp_forkjoin_frames_mode == 1)) {
2319         kmp_uint64 tmp_time = 0;
2320         if (__itt_get_timestamp_ptr)
2321           tmp_time = __itt_get_timestamp();
2322         // Internal fork - report frame begin
2323         master_th->th.th_frame_time = tmp_time;
2324         if (__kmp_forkjoin_frames_mode == 3)
2325           team->t.t_region_time = tmp_time;
2326       } else
2327 // only one notification scheme (either "submit" or "forking/joined", not both)
2328 #endif /* USE_ITT_NOTIFY */
2329         if ((__itt_frame_begin_v3_ptr || KMP_ITT_DEBUG) &&
2330             __kmp_forkjoin_frames && !__kmp_forkjoin_frames_mode) {
2331           // Mark start of "parallel" region for Intel(R) VTune(TM) analyzer.
2332           __kmp_itt_region_forking(gtid, team->t.t_nproc, 0);
2333         }
2334     }
2335 #endif /* USE_ITT_BUILD */
2336 
2337     /* now go on and do the work */
2338     KMP_DEBUG_ASSERT(team == __kmp_threads[gtid]->th.th_team);
2339     KMP_MB();
2340     KF_TRACE(10,
2341              ("__kmp_internal_fork : root=%p, team=%p, master_th=%p, gtid=%d\n",
2342               root, team, master_th, gtid));
2343 
2344 #if USE_ITT_BUILD
2345     if (__itt_stack_caller_create_ptr) {
2346       // create new stack stitching id before entering fork barrier
2347       if (!enter_teams) {
2348         KMP_DEBUG_ASSERT(team->t.t_stack_id == NULL);
2349         team->t.t_stack_id = __kmp_itt_stack_caller_create();
2350       } else if (parent_team->t.t_serialized) {
2351         // keep stack stitching id in the serialized parent_team;
2352         // current team will be used for parallel inside the teams;
2353         // if parent_team is active, then it already keeps stack stitching id
2354         // for the league of teams
2355         KMP_DEBUG_ASSERT(parent_team->t.t_stack_id == NULL);
2356         parent_team->t.t_stack_id = __kmp_itt_stack_caller_create();
2357       }
2358     }
2359 #endif /* USE_ITT_BUILD */
2360 
2361     // AC: skip __kmp_internal_fork at teams construct, let only primary
2362     // threads execute
2363     if (ap) {
2364       __kmp_internal_fork(loc, gtid, team);
2365       KF_TRACE(10, ("__kmp_internal_fork : after : root=%p, team=%p, "
2366                     "master_th=%p, gtid=%d\n",
2367                     root, team, master_th, gtid));
2368     }
2369 
2370     if (call_context == fork_context_gnu) {
2371       KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
2372       return TRUE;
2373     }
2374 
2375     /* Invoke microtask for PRIMARY thread */
2376     KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n", gtid,
2377                   team->t.t_id, team->t.t_pkfn));
2378   } // END of timer KMP_fork_call block
2379 
2380 #if KMP_STATS_ENABLED
2381   // If beginning a teams construct, then change thread state
2382   stats_state_e previous_state = KMP_GET_THREAD_STATE();
2383   if (!ap) {
2384     KMP_SET_THREAD_STATE(stats_state_e::TEAMS_REGION);
2385   }
2386 #endif
2387 
2388   if (!team->t.t_invoke(gtid)) {
2389     KMP_ASSERT2(0, "cannot invoke microtask for PRIMARY thread");
2390   }
2391 
2392 #if KMP_STATS_ENABLED
2393   // If was beginning of a teams construct, then reset thread state
2394   if (!ap) {
2395     KMP_SET_THREAD_STATE(previous_state);
2396   }
2397 #endif
2398 
2399   KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n", gtid,
2400                 team->t.t_id, team->t.t_pkfn));
2401   KMP_MB(); /* Flush all pending memory write invalidates.  */
2402 
2403   KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
2404 #if OMPT_SUPPORT
2405   if (ompt_enabled.enabled) {
2406     master_th->th.ompt_thread_info.state = ompt_state_overhead;
2407   }
2408 #endif
2409 
2410   return TRUE;
2411 }
2412 
2413 #if OMPT_SUPPORT
2414 static inline void __kmp_join_restore_state(kmp_info_t *thread,
2415                                             kmp_team_t *team) {
2416   // restore state outside the region
2417   thread->th.ompt_thread_info.state =
2418       ((team->t.t_serialized) ? ompt_state_work_serial
2419                               : ompt_state_work_parallel);
2420 }
2421 
2422 static inline void __kmp_join_ompt(int gtid, kmp_info_t *thread,
2423                                    kmp_team_t *team, ompt_data_t *parallel_data,
2424                                    int flags, void *codeptr) {
2425   ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2426   if (ompt_enabled.ompt_callback_parallel_end) {
2427     ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
2428         parallel_data, &(task_info->task_data), flags, codeptr);
2429   }
2430 
2431   task_info->frame.enter_frame = ompt_data_none;
2432   __kmp_join_restore_state(thread, team);
2433 }
2434 #endif
2435 
/* Join half of a fork/join pair, run by the thread whose global id is gtid.
 *
 * loc          - source location; stored in th_ident and forwarded to callees.
 * gtid         - index of the calling thread in __kmp_threads.
 * fork_context - (OMPT builds only) feeds OMPT_INVOKER() and the
 *                fork_context_gnu checks.
 * exit_teams   - when non-zero the join barrier (__kmp_internal_join) and the
 *                affinity-partition restore are skipped and th_task_state is
 *                reset to 0.
 *
 * There are three ways out of this function:
 *  1. team->t.t_serialized is set: delegate to __kmpc_end_serialized_parallel
 *     and return.
 *  2. a parallel region one level below a teams construct (and !exit_teams):
 *     the team is kept, only nesting levels / thread counts are adjusted.
 *  3. otherwise: under __kmp_forkjoin_lock the team is handed to
 *     __kmp_free_team and the thread is re-attached to parent_team.
 */
2436 void __kmp_join_call(ident_t *loc, int gtid
2437 #if OMPT_SUPPORT
2438                      ,
2439                      enum fork_context_e fork_context
2440 #endif
2441                      ,
2442                      int exit_teams) {
2443   KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_join_call);
2444   kmp_team_t *team;
2445   kmp_team_t *parent_team;
2446   kmp_info_t *master_th;
2447   kmp_root_t *root;
2448   int master_active;
2449 
2450   KA_TRACE(20, ("__kmp_join_call: enter T#%d\n", gtid));
2451 
2452   /* setup current data */
2453   master_th = __kmp_threads[gtid];
2454   root = master_th->th.th_root;
2455   team = master_th->th.th_team;
2456   parent_team = team->t.t_parent;
2457 
2458   master_th->th.th_ident = loc;
2459 
2460 #if OMPT_SUPPORT
       // Remember the microtask now; it is still consulted after the team has
       // been passed to __kmp_free_team.
2461   void *team_microtask = (void *)team->t.t_pkfn;
2462   // For GOMP interface with serialized parallel, need the
2463   // __kmpc_end_serialized_parallel to call hooks for OMPT end-implicit-task
2464   // and end-parallel events.
2465   if (ompt_enabled.enabled &&
2466       !(team->t.t_serialized && fork_context == fork_context_gnu)) {
2467     master_th->th.ompt_thread_info.state = ompt_state_overhead;
2468   }
2469 #endif
2470 
2471 #if KMP_DEBUG
2472   if (__kmp_tasking_mode != tskm_immediate_exec && !exit_teams) {
2473     KA_TRACE(20, ("__kmp_join_call: T#%d, old team = %p old task_team = %p, "
2474                   "th_task_team = %p\n",
2475                   __kmp_gtid_from_thread(master_th), team,
2476                   team->t.t_task_team[master_th->th.th_task_state],
2477                   master_th->th.th_task_team));
2478     KMP_DEBUG_ASSERT_TASKTEAM_INVARIANT(team, master_th);
2479   }
2480 #endif
2481 
       // Exit path 1: the team is serialized.
2482   if (team->t.t_serialized) {
2483     if (master_th->th.th_teams_microtask) {
2484       // We are in teams construct
2485       int level = team->t.t_level;
2486       int tlevel = master_th->th.th_teams_level;
2487       if (level == tlevel) {
2488         // AC: we haven't incremented it earlier at start of teams construct,
2489         //     so do it here - at the end of teams construct
2490         team->t.t_level++;
2491       } else if (level == tlevel + 1) {
2492         // AC: we are exiting parallel inside teams, need to increment
2493         // serialization in order to restore it in the next call to
2494         // __kmpc_end_serialized_parallel
2495         team->t.t_serialized++;
2496       }
2497     }
2498     __kmpc_end_serialized_parallel(loc, gtid);
2499 
2500 #if OMPT_SUPPORT
2501     if (ompt_enabled.enabled) {
2502       if (fork_context == fork_context_gnu) {
2503         __ompt_lw_taskteam_unlink(master_th);
2504       }
2505       __kmp_join_restore_state(master_th, parent_team);
2506     }
2507 #endif
2508 
2509     return;
2510   }
2511 
       // Saved so root->r.r_active can be set back to it during cleanup below.
2512   master_active = team->t.t_master_active;
2513 
2514   if (!exit_teams) {
2515     // AC: No barrier for internal teams at exit from teams construct.
2516     //     But there is barrier for external team (league).
2517     __kmp_internal_join(loc, gtid, team);
2518 #if USE_ITT_BUILD
2519     if (__itt_stack_caller_create_ptr) {
2520       KMP_DEBUG_ASSERT(team->t.t_stack_id != NULL);
2521       // destroy the stack stitching id after join barrier
2522       __kmp_itt_stack_caller_destroy((__itt_caller)team->t.t_stack_id);
2523       team->t.t_stack_id = NULL;
2524     }
2525 #endif
2526   } else {
2527     master_th->th.th_task_state =
2528         0; // AC: no tasking in teams (out of any parallel)
2529 #if USE_ITT_BUILD
2530     if (__itt_stack_caller_create_ptr && parent_team->t.t_serialized) {
2531       KMP_DEBUG_ASSERT(parent_team->t.t_stack_id != NULL);
2532       // destroy the stack stitching id on exit from the teams construct
2533       // if parent_team is active, then the id will be destroyed later on
2534       // by master of the league of teams
2535       __kmp_itt_stack_caller_destroy((__itt_caller)parent_team->t.t_stack_id);
2536       parent_team->t.t_stack_id = NULL;
2537     }
2538 #endif
2539   }
2540 
2541   KMP_MB();
2542 
2543 #if OMPT_SUPPORT
       // Taken from the team here; consumed by the __kmp_join_ompt calls below
       // (the last of which runs after __kmp_free_team(root, team)).
2544   ompt_data_t *parallel_data = &(team->t.ompt_team_info.parallel_data);
2545   void *codeptr = team->t.ompt_team_info.master_return_address;
2546 #endif
2547 
2548 #if USE_ITT_BUILD
2549   // Mark end of "parallel" region for Intel(R) VTune(TM) analyzer.
2550   if (team->t.t_active_level == 1 &&
2551       (!master_th->th.th_teams_microtask || /* not in teams construct */
2552        master_th->th.th_teams_size.nteams == 1)) {
2553     master_th->th.th_ident = loc;
2554     // only one notification scheme (either "submit" or "forking/joined", not
2555     // both)
2556     if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
2557         __kmp_forkjoin_frames_mode == 3)
2558       __kmp_itt_frame_submit(gtid, team->t.t_region_time,
2559                              master_th->th.th_frame_time, 0, loc,
2560                              master_th->th.th_team_nproc, 1);
2561     else if ((__itt_frame_end_v3_ptr || KMP_ITT_DEBUG) &&
2562              !__kmp_forkjoin_frames_mode && __kmp_forkjoin_frames)
2563       __kmp_itt_region_joined(gtid);
2564   } // active_level == 1
2565 #endif /* USE_ITT_BUILD */
2566 
2567 #if KMP_AFFINITY_SUPPORTED
2568   if (!exit_teams) {
2569     // Restore master thread's partition.
2570     master_th->th.th_first_place = team->t.t_first_place;
2571     master_th->th.th_last_place = team->t.t_last_place;
2572   }
2573 #endif // KMP_AFFINITY_SUPPORTED
2574 
       // Exit path 2: parallel region one level below the teams construct.
2575   if (master_th->th.th_teams_microtask && !exit_teams &&
2576       team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
2577       team->t.t_level == master_th->th.th_teams_level + 1) {
2578 // AC: We need to leave the team structure intact at the end of parallel
2579 // inside the teams construct, so that at the next parallel same (hot) team
2580 // works, only adjust nesting levels
2581 #if OMPT_SUPPORT
2582     ompt_data_t ompt_parallel_data = ompt_data_none;
2583     if (ompt_enabled.enabled) {
2584       ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2585       if (ompt_enabled.ompt_callback_implicit_task) {
2586         int ompt_team_size = team->t.t_nproc;
2587         ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
2588             ompt_scope_end, NULL, &(task_info->task_data), ompt_team_size,
2589             OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
2590       }
2591       task_info->frame.exit_frame = ompt_data_none;
2592       task_info->task_data = ompt_data_none;
2593       ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
2594       __ompt_lw_taskteam_unlink(master_th);
2595     }
2596 #endif
2597     /* Decrement our nested depth level */
2598     team->t.t_level--;
2599     team->t.t_active_level--;
2600     KMP_ATOMIC_DEC(&root->r.r_in_parallel);
2601 
2602     // Restore number of threads in the team if needed. This code relies on
2603     // the proper adjustment of th_teams_size.nth after the fork in
2604     // __kmp_teams_master on each teams primary thread in the case that
2605     // __kmp_reserve_threads reduced it.
2606     if (master_th->th.th_team_nproc < master_th->th.th_teams_size.nth) {
2607       int old_num = master_th->th.th_team_nproc;
2608       int new_num = master_th->th.th_teams_size.nth;
2609       kmp_info_t **other_threads = team->t.t_threads;
2610       team->t.t_nproc = new_num;
2611       for (int i = 0; i < old_num; ++i) {
2612         other_threads[i]->th.th_team_nproc = new_num;
2613       }
2614       // Adjust states of non-used threads of the team
2615       for (int i = old_num; i < new_num; ++i) {
2616         // Re-initialize thread's barrier data.
2617         KMP_DEBUG_ASSERT(other_threads[i]);
2618         kmp_balign_t *balign = other_threads[i]->th.th_bar;
2619         for (int b = 0; b < bs_last_barrier; ++b) {
2620           balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
2621           KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
2622 #if USE_DEBUGGER
2623           balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
2624 #endif
2625         }
2626         if (__kmp_tasking_mode != tskm_immediate_exec) {
2627           // Synchronize thread's task state
2628           other_threads[i]->th.th_task_state = master_th->th.th_task_state;
2629         }
2630       }
2631     }
2632 
2633 #if OMPT_SUPPORT
2634     if (ompt_enabled.enabled) {
2635       __kmp_join_ompt(gtid, master_th, parent_team, &ompt_parallel_data,
2636                       OMPT_INVOKER(fork_context) | ompt_parallel_team, codeptr);
2637     }
2638 #endif
2639 
2640     return;
2641   }
2642 
       // Exit path 3: regular join - release the team, go back to the parent.
2643   /* do cleanup and restore the parent team */
2644   master_th->th.th_info.ds.ds_tid = team->t.t_master_tid;
2645   master_th->th.th_local.this_construct = team->t.t_master_this_cons;
2646 
2647   master_th->th.th_dispatch = &parent_team->t.t_dispatch[team->t.t_master_tid];
2648 
2649   /* jc: The following lock has instructions with REL and ACQ semantics,
2650      separating the parallel user code called in this parallel region
2651      from the serial user code called after this function returns. */
2652   __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
2653 
2654   if (!master_th->th.th_teams_microtask ||
2655       team->t.t_level > master_th->th.th_teams_level) {
2656     /* Decrement our nested depth level */
2657     KMP_ATOMIC_DEC(&root->r.r_in_parallel);
2658   }
2659   KMP_DEBUG_ASSERT(root->r.r_in_parallel >= 0);
2660 
2661 #if OMPT_SUPPORT
2662   if (ompt_enabled.enabled) {
2663     ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2664     if (ompt_enabled.ompt_callback_implicit_task) {
2665       int flags = (team_microtask == (void *)__kmp_teams_master)
2666                       ? ompt_task_initial
2667                       : ompt_task_implicit;
2668       int ompt_team_size = (flags == ompt_task_initial) ? 0 : team->t.t_nproc;
2669       ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
2670           ompt_scope_end, NULL, &(task_info->task_data), ompt_team_size,
2671           OMPT_CUR_TASK_INFO(master_th)->thread_num, flags);
2672     }
2673     task_info->frame.exit_frame = ompt_data_none;
2674     task_info->task_data = ompt_data_none;
2675   }
2676 #endif
2677 
2678   KF_TRACE(10, ("__kmp_join_call1: T#%d, this_thread=%p team=%p\n", 0,
2679                 master_th, team));
2680   __kmp_pop_current_task_from_thread(master_th);
2681 
2682   master_th->th.th_def_allocator = team->t.t_def_allocator;
2683 
2684 #if OMPD_SUPPORT
2685   if (ompd_state & OMPD_ENABLE_BP)
2686     ompd_bp_parallel_end();
2687 #endif
2688   updateHWFPControl(team);
2689 
2690   if (root->r.r_active != master_active)
2691     root->r.r_active = master_active;
2692 
2693   __kmp_free_team(root, team USE_NESTED_HOT_ARG(
2694                             master_th)); // this will free worker threads
2695 
2696   /* this race was fun to find. make sure the following is in the critical
2697      region otherwise assertions may fail occasionally since the old team may be
2698      reallocated and the hierarchy appears inconsistent. it is actually safe to
2699      run and won't cause any bugs, but will cause those assertion failures. it's
2700      only one deref&assign so might as well put this in the critical region */
2701   master_th->th.th_team = parent_team;
2702   master_th->th.th_team_nproc = parent_team->t.t_nproc;
2703   master_th->th.th_team_master = parent_team->t.t_threads[0];
2704   master_th->th.th_team_serialized = parent_team->t.t_serialized;
2705 
2706   /* restore serialized team, if need be */
2707   if (parent_team->t.t_serialized &&
2708       parent_team != master_th->th.th_serial_team &&
2709       parent_team != root->r.r_root_team) {
2710     __kmp_free_team(root,
2711                     master_th->th.th_serial_team USE_NESTED_HOT_ARG(NULL));
2712     master_th->th.th_serial_team = parent_team;
2713   }
2714 
2715   if (__kmp_tasking_mode != tskm_immediate_exec) {
2716     // Restore primary thread's task state from team structure
2717     KMP_DEBUG_ASSERT(team->t.t_primary_task_state == 0 ||
2718                      team->t.t_primary_task_state == 1);
2719     master_th->th.th_task_state = (kmp_uint8)team->t.t_primary_task_state;
2720 
2721     // Copy the task team from the parent team to the primary thread
2722     master_th->th.th_task_team =
2723         parent_team->t.t_task_team[master_th->th.th_task_state];
2724     KA_TRACE(20,
2725              ("__kmp_join_call: Primary T#%d restoring task_team %p, team %p\n",
2726               __kmp_gtid_from_thread(master_th), master_th->th.th_task_team,
2727               parent_team));
2728   }
2729 
2730   // TODO: GEH - cannot do this assertion because root thread not set up as
2731   // executing
2732   // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 0 );
2733   master_th->th.th_current_task->td_flags.executing = 1;
2734 
2735   __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2736 
2737 #if KMP_AFFINITY_SUPPORTED
2738   if (master_th->th.th_team->t.t_level == 0 && __kmp_affinity.flags.reset) {
2739     __kmp_reset_root_init_mask(gtid);
2740   }
2741 #endif
2742 #if OMPT_SUPPORT
2743   int flags =
2744       OMPT_INVOKER(fork_context) |
2745       ((team_microtask == (void *)__kmp_teams_master) ? ompt_parallel_league
2746                                                       : ompt_parallel_team);
2747   if (ompt_enabled.enabled) {
2748     __kmp_join_ompt(gtid, master_th, parent_team, parallel_data, flags,
2749                     codeptr);
2750   }
2751 #endif
2752 
2753   KMP_MB();
2754   KA_TRACE(20, ("__kmp_join_call: exit T#%d\n", gtid));
2755 }
2756 
2757 /* Check whether we should push an internal control record onto the
2758    serial team stack.  If so, do it.  */
2759 void __kmp_save_internal_controls(kmp_info_t *thread) {
2760 
2761   if (thread->th.th_team != thread->th.th_serial_team) {
2762     return;
2763   }
2764   if (thread->th.th_team->t.t_serialized > 1) {
2765     int push = 0;
2766 
2767     if (thread->th.th_team->t.t_control_stack_top == NULL) {
2768       push = 1;
2769     } else {
2770       if (thread->th.th_team->t.t_control_stack_top->serial_nesting_level !=
2771           thread->th.th_team->t.t_serialized) {
2772         push = 1;
2773       }
2774     }
2775     if (push) { /* push a record on the serial team's stack */
2776       kmp_internal_control_t *control =
2777           (kmp_internal_control_t *)__kmp_allocate(
2778               sizeof(kmp_internal_control_t));
2779 
2780       copy_icvs(control, &thread->th.th_current_task->td_icvs);
2781 
2782       control->serial_nesting_level = thread->th.th_team->t.t_serialized;
2783 
2784       control->next = thread->th.th_team->t.t_control_stack_top;
2785       thread->th.th_team->t.t_control_stack_top = control;
2786     }
2787   }
2788 }
2789 
2790 /* Changes set_nproc */
2791 void __kmp_set_num_threads(int new_nth, int gtid) {
2792   kmp_info_t *thread;
2793   kmp_root_t *root;
2794 
2795   KF_TRACE(10, ("__kmp_set_num_threads: new __kmp_nth = %d\n", new_nth));
2796   KMP_DEBUG_ASSERT(__kmp_init_serial);
2797 
2798   if (new_nth < 1)
2799     new_nth = 1;
2800   else if (new_nth > __kmp_max_nth)
2801     new_nth = __kmp_max_nth;
2802 
2803   KMP_COUNT_VALUE(OMP_set_numthreads, new_nth);
2804   thread = __kmp_threads[gtid];
2805   if (thread->th.th_current_task->td_icvs.nproc == new_nth)
2806     return; // nothing to do
2807 
2808   __kmp_save_internal_controls(thread);
2809 
2810   set__nproc(thread, new_nth);
2811 
2812   // If this omp_set_num_threads() call will cause the hot team size to be
2813   // reduced (in the absence of a num_threads clause), then reduce it now,
2814   // rather than waiting for the next parallel region.
2815   root = thread->th.th_root;
2816   if (__kmp_init_parallel && (!root->r.r_active) &&
2817       (root->r.r_hot_team->t.t_nproc > new_nth)
2818 #if KMP_NESTED_HOT_TEAMS
2819       && __kmp_hot_teams_max_level && !__kmp_hot_teams_mode
2820 #endif
2821   ) {
2822     kmp_team_t *hot_team = root->r.r_hot_team;
2823     int f;
2824 
2825     __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
2826 
2827     if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
2828       __kmp_resize_dist_barrier(hot_team, hot_team->t.t_nproc, new_nth);
2829     }
2830     // Release the extra threads we don't need any more.
2831     for (f = new_nth; f < hot_team->t.t_nproc; f++) {
2832       KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL);
2833       if (__kmp_tasking_mode != tskm_immediate_exec) {
2834         // When decreasing team size, threads no longer in the team should unref
2835         // task team.
2836         hot_team->t.t_threads[f]->th.th_task_team = NULL;
2837       }
2838       __kmp_free_thread(hot_team->t.t_threads[f]);
2839       hot_team->t.t_threads[f] = NULL;
2840     }
2841     hot_team->t.t_nproc = new_nth;
2842 #if KMP_NESTED_HOT_TEAMS
2843     if (thread->th.th_hot_teams) {
2844       KMP_DEBUG_ASSERT(hot_team == thread->th.th_hot_teams[0].hot_team);
2845       thread->th.th_hot_teams[0].hot_team_nth = new_nth;
2846     }
2847 #endif
2848 
2849     if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
2850       hot_team->t.b->update_num_threads(new_nth);
2851       __kmp_add_threads_to_team(hot_team, new_nth);
2852     }
2853 
2854     __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2855 
2856     // Update the t_nproc field in the threads that are still active.
2857     for (f = 0; f < new_nth; f++) {
2858       KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL);
2859       hot_team->t.t_threads[f]->th.th_team_nproc = new_nth;
2860     }
2861     // Special flag in case omp_set_num_threads() call
2862     hot_team->t.t_size_changed = -1;
2863   }
2864 }
2865 
2866 /* Changes max_active_levels */
2867 void __kmp_set_max_active_levels(int gtid, int max_active_levels) {
2868   kmp_info_t *thread;
2869 
2870   KF_TRACE(10, ("__kmp_set_max_active_levels: new max_active_levels for thread "
2871                 "%d = (%d)\n",
2872                 gtid, max_active_levels));
2873   KMP_DEBUG_ASSERT(__kmp_init_serial);
2874 
2875   // validate max_active_levels
2876   if (max_active_levels < 0) {
2877     KMP_WARNING(ActiveLevelsNegative, max_active_levels);
2878     // We ignore this call if the user has specified a negative value.
2879     // The current setting won't be changed. The last valid setting will be
2880     // used. A warning will be issued (if warnings are allowed as controlled by
2881     // the KMP_WARNINGS env var).
2882     KF_TRACE(10, ("__kmp_set_max_active_levels: the call is ignored: new "
2883                   "max_active_levels for thread %d = (%d)\n",
2884                   gtid, max_active_levels));
2885     return;
2886   }
2887   if (max_active_levels <= KMP_MAX_ACTIVE_LEVELS_LIMIT) {
2888     // it's OK, the max_active_levels is within the valid range: [ 0;
2889     // KMP_MAX_ACTIVE_LEVELS_LIMIT ]
2890     // We allow a zero value. (implementation defined behavior)
2891   } else {
2892     KMP_WARNING(ActiveLevelsExceedLimit, max_active_levels,
2893                 KMP_MAX_ACTIVE_LEVELS_LIMIT);
2894     max_active_levels = KMP_MAX_ACTIVE_LEVELS_LIMIT;
2895     // Current upper limit is MAX_INT. (implementation defined behavior)
2896     // If the input exceeds the upper limit, we correct the input to be the
2897     // upper limit. (implementation defined behavior)
2898     // Actually, the flow should never get here until we use MAX_INT limit.
2899   }
2900   KF_TRACE(10, ("__kmp_set_max_active_levels: after validation: new "
2901                 "max_active_levels for thread %d = (%d)\n",
2902                 gtid, max_active_levels));
2903 
2904   thread = __kmp_threads[gtid];
2905 
2906   __kmp_save_internal_controls(thread);
2907 
2908   set__max_active_levels(thread, max_active_levels);
2909 }
2910 
2911 /* Gets max_active_levels */
2912 int __kmp_get_max_active_levels(int gtid) {
2913   kmp_info_t *thread;
2914 
2915   KF_TRACE(10, ("__kmp_get_max_active_levels: thread %d\n", gtid));
2916   KMP_DEBUG_ASSERT(__kmp_init_serial);
2917 
2918   thread = __kmp_threads[gtid];
2919   KMP_DEBUG_ASSERT(thread->th.th_current_task);
2920   KF_TRACE(10, ("__kmp_get_max_active_levels: thread %d, curtask=%p, "
2921                 "curtask_maxaclevel=%d\n",
2922                 gtid, thread->th.th_current_task,
2923                 thread->th.th_current_task->td_icvs.max_active_levels));
2924   return thread->th.th_current_task->td_icvs.max_active_levels;
2925 }
2926 
2927 // nteams-var per-device ICV
2928 void __kmp_set_num_teams(int num_teams) {
2929   if (num_teams > 0)
2930     __kmp_nteams = num_teams;
2931 }
2932 int __kmp_get_max_teams(void) { return __kmp_nteams; }
2933 // teams-thread-limit-var per-device ICV
2934 void __kmp_set_teams_thread_limit(int limit) {
2935   if (limit > 0)
2936     __kmp_teams_thread_limit = limit;
2937 }
2938 int __kmp_get_teams_thread_limit(void) { return __kmp_teams_thread_limit; }
2939 
// Size checks: both the public schedule kind type (kmp_sched_t) and the
// internal one (enum sched_type) must be exactly as wide as int.
// NOTE(review): KMP_BUILD_ASSERT is presumably evaluated at compile time -
// confirm against its definition.
2940 KMP_BUILD_ASSERT(sizeof(kmp_sched_t) == sizeof(int));
2941 KMP_BUILD_ASSERT(sizeof(enum sched_type) == sizeof(int));
2942 
2943 /* Changes def_sched_var ICV values (run-time schedule kind and chunk) */
2944 void __kmp_set_schedule(int gtid, kmp_sched_t kind, int chunk) {
2945   kmp_info_t *thread;
2946   kmp_sched_t orig_kind;
2947   //    kmp_team_t *team;
2948 
2949   KF_TRACE(10, ("__kmp_set_schedule: new schedule for thread %d = (%d, %d)\n",
2950                 gtid, (int)kind, chunk));
2951   KMP_DEBUG_ASSERT(__kmp_init_serial);
2952 
2953   // Check if the kind parameter is valid, correct if needed.
2954   // Valid parameters should fit in one of two intervals - standard or extended:
2955   //       <lower>, <valid>, <upper_std>, <lower_ext>, <valid>, <upper>
2956   // 2008-01-25: 0,  1 - 4,       5,         100,     101 - 102, 103
2957   orig_kind = kind;
2958   kind = __kmp_sched_without_mods(kind);
2959 
2960   if (kind <= kmp_sched_lower || kind >= kmp_sched_upper ||
2961       (kind <= kmp_sched_lower_ext && kind >= kmp_sched_upper_std)) {
2962     // TODO: Hint needs attention in case we change the default schedule.
2963     __kmp_msg(kmp_ms_warning, KMP_MSG(ScheduleKindOutOfRange, kind),
2964               KMP_HNT(DefaultScheduleKindUsed, "static, no chunk"),
2965               __kmp_msg_null);
2966     kind = kmp_sched_default;
2967     chunk = 0; // ignore chunk value in case of bad kind
2968   }
2969 
2970   thread = __kmp_threads[gtid];
2971 
2972   __kmp_save_internal_controls(thread);
2973 
2974   if (kind < kmp_sched_upper_std) {
2975     if (kind == kmp_sched_static && chunk < KMP_DEFAULT_CHUNK) {
2976       // differ static chunked vs. unchunked:  chunk should be invalid to
2977       // indicate unchunked schedule (which is the default)
2978       thread->th.th_current_task->td_icvs.sched.r_sched_type = kmp_sch_static;
2979     } else {
2980       thread->th.th_current_task->td_icvs.sched.r_sched_type =
2981           __kmp_sch_map[kind - kmp_sched_lower - 1];
2982     }
2983   } else {
2984     //    __kmp_sch_map[ kind - kmp_sched_lower_ext + kmp_sched_upper_std -
2985     //    kmp_sched_lower - 2 ];
2986     thread->th.th_current_task->td_icvs.sched.r_sched_type =
2987         __kmp_sch_map[kind - kmp_sched_lower_ext + kmp_sched_upper_std -
2988                       kmp_sched_lower - 2];
2989   }
2990   __kmp_sched_apply_mods_intkind(
2991       orig_kind, &(thread->th.th_current_task->td_icvs.sched.r_sched_type));
2992   if (kind == kmp_sched_auto || chunk < 1) {
2993     // ignore parameter chunk for schedule auto
2994     thread->th.th_current_task->td_icvs.sched.chunk = KMP_DEFAULT_CHUNK;
2995   } else {
2996     thread->th.th_current_task->td_icvs.sched.chunk = chunk;
2997   }
2998 }
2999 
3000 /* Gets def_sched_var ICV values */
3001 void __kmp_get_schedule(int gtid, kmp_sched_t *kind, int *chunk) {
3002   kmp_info_t *thread;
3003   enum sched_type th_type;
3004 
3005   KF_TRACE(10, ("__kmp_get_schedule: thread %d\n", gtid));
3006   KMP_DEBUG_ASSERT(__kmp_init_serial);
3007 
3008   thread = __kmp_threads[gtid];
3009 
3010   th_type = thread->th.th_current_task->td_icvs.sched.r_sched_type;
3011   switch (SCHEDULE_WITHOUT_MODIFIERS(th_type)) {
3012   case kmp_sch_static:
3013   case kmp_sch_static_greedy:
3014   case kmp_sch_static_balanced:
3015     *kind = kmp_sched_static;
3016     __kmp_sched_apply_mods_stdkind(kind, th_type);
3017     *chunk = 0; // chunk was not set, try to show this fact via zero value
3018     return;
3019   case kmp_sch_static_chunked:
3020     *kind = kmp_sched_static;
3021     break;
3022   case kmp_sch_dynamic_chunked:
3023     *kind = kmp_sched_dynamic;
3024     break;
3025   case kmp_sch_guided_chunked:
3026   case kmp_sch_guided_iterative_chunked:
3027   case kmp_sch_guided_analytical_chunked:
3028     *kind = kmp_sched_guided;
3029     break;
3030   case kmp_sch_auto:
3031     *kind = kmp_sched_auto;
3032     break;
3033   case kmp_sch_trapezoidal:
3034     *kind = kmp_sched_trapezoidal;
3035     break;
3036 #if KMP_STATIC_STEAL_ENABLED
3037   case kmp_sch_static_steal:
3038     *kind = kmp_sched_static_steal;
3039     break;
3040 #endif
3041   default:
3042     KMP_FATAL(UnknownSchedulingType, th_type);
3043   }
3044 
3045   __kmp_sched_apply_mods_stdkind(kind, th_type);
3046   *chunk = thread->th.th_current_task->td_icvs.sched.chunk;
3047 }
3048 
3049 int __kmp_get_ancestor_thread_num(int gtid, int level) {
3050 
3051   int ii, dd;
3052   kmp_team_t *team;
3053   kmp_info_t *thr;
3054 
3055   KF_TRACE(10, ("__kmp_get_ancestor_thread_num: thread %d %d\n", gtid, level));
3056   KMP_DEBUG_ASSERT(__kmp_init_serial);
3057 
3058   // validate level
3059   if (level == 0)
3060     return 0;
3061   if (level < 0)
3062     return -1;
3063   thr = __kmp_threads[gtid];
3064   team = thr->th.th_team;
3065   ii = team->t.t_level;
3066   if (level > ii)
3067     return -1;
3068 
3069   if (thr->th.th_teams_microtask) {
3070     // AC: we are in teams region where multiple nested teams have same level
3071     int tlevel = thr->th.th_teams_level; // the level of the teams construct
3072     if (level <=
3073         tlevel) { // otherwise usual algorithm works (will not touch the teams)
3074       KMP_DEBUG_ASSERT(ii >= tlevel);
3075       // AC: As we need to pass by the teams league, we need to artificially
3076       // increase ii
3077       if (ii == tlevel) {
3078         ii += 2; // three teams have same level
3079       } else {
3080         ii++; // two teams have same level
3081       }
3082     }
3083   }
3084 
3085   if (ii == level)
3086     return __kmp_tid_from_gtid(gtid);
3087 
3088   dd = team->t.t_serialized;
3089   level++;
3090   while (ii > level) {
3091     for (dd = team->t.t_serialized; (dd > 0) && (ii > level); dd--, ii--) {
3092     }
3093     if ((team->t.t_serialized) && (!dd)) {
3094       team = team->t.t_parent;
3095       continue;
3096     }
3097     if (ii > level) {
3098       team = team->t.t_parent;
3099       dd = team->t.t_serialized;
3100       ii--;
3101     }
3102   }
3103 
3104   return (dd > 1) ? (0) : (team->t.t_master_tid);
3105 }
3106 
3107 int __kmp_get_team_size(int gtid, int level) {
3108 
3109   int ii, dd;
3110   kmp_team_t *team;
3111   kmp_info_t *thr;
3112 
3113   KF_TRACE(10, ("__kmp_get_team_size: thread %d %d\n", gtid, level));
3114   KMP_DEBUG_ASSERT(__kmp_init_serial);
3115 
3116   // validate level
3117   if (level == 0)
3118     return 1;
3119   if (level < 0)
3120     return -1;
3121   thr = __kmp_threads[gtid];
3122   team = thr->th.th_team;
3123   ii = team->t.t_level;
3124   if (level > ii)
3125     return -1;
3126 
3127   if (thr->th.th_teams_microtask) {
3128     // AC: we are in teams region where multiple nested teams have same level
3129     int tlevel = thr->th.th_teams_level; // the level of the teams construct
3130     if (level <=
3131         tlevel) { // otherwise usual algorithm works (will not touch the teams)
3132       KMP_DEBUG_ASSERT(ii >= tlevel);
3133       // AC: As we need to pass by the teams league, we need to artificially
3134       // increase ii
3135       if (ii == tlevel) {
3136         ii += 2; // three teams have same level
3137       } else {
3138         ii++; // two teams have same level
3139       }
3140     }
3141   }
3142 
3143   while (ii > level) {
3144     for (dd = team->t.t_serialized; (dd > 0) && (ii > level); dd--, ii--) {
3145     }
3146     if (team->t.t_serialized && (!dd)) {
3147       team = team->t.t_parent;
3148       continue;
3149     }
3150     if (ii > level) {
3151       team = team->t.t_parent;
3152       ii--;
3153     }
3154   }
3155 
3156   return team->t.t_nproc;
3157 }
3158 
3159 kmp_r_sched_t __kmp_get_schedule_global() {
3160   // This routine created because pairs (__kmp_sched, __kmp_chunk) and
3161   // (__kmp_static, __kmp_guided) may be changed by kmp_set_defaults
3162   // independently. So one can get the updated schedule here.
3163 
3164   kmp_r_sched_t r_sched;
3165 
3166   // create schedule from 4 globals: __kmp_sched, __kmp_chunk, __kmp_static,
3167   // __kmp_guided. __kmp_sched should keep original value, so that user can set
3168   // KMP_SCHEDULE multiple times, and thus have different run-time schedules in
3169   // different roots (even in OMP 2.5)
3170   enum sched_type s = SCHEDULE_WITHOUT_MODIFIERS(__kmp_sched);
3171   enum sched_type sched_modifiers = SCHEDULE_GET_MODIFIERS(__kmp_sched);
3172   if (s == kmp_sch_static) {
3173     // replace STATIC with more detailed schedule (balanced or greedy)
3174     r_sched.r_sched_type = __kmp_static;
3175   } else if (s == kmp_sch_guided_chunked) {
3176     // replace GUIDED with more detailed schedule (iterative or analytical)
3177     r_sched.r_sched_type = __kmp_guided;
3178   } else { // (STATIC_CHUNKED), or (DYNAMIC_CHUNKED), or other
3179     r_sched.r_sched_type = __kmp_sched;
3180   }
3181   SCHEDULE_SET_MODIFIERS(r_sched.r_sched_type, sched_modifiers);
3182 
3183   if (__kmp_chunk < KMP_DEFAULT_CHUNK) {
3184     // __kmp_chunk may be wrong here (if it was not ever set)
3185     r_sched.chunk = KMP_DEFAULT_CHUNK;
3186   } else {
3187     r_sched.chunk = __kmp_chunk;
3188   }
3189 
3190   return r_sched;
3191 }
3192 
3193 /* Allocate (realloc == FALSE) * or reallocate (realloc == TRUE)
3194    at least argc number of *t_argv entries for the requested team. */
3195 static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team, int realloc) {
3196 
3197   KMP_DEBUG_ASSERT(team);
3198   if (!realloc || argc > team->t.t_max_argc) {
3199 
3200     KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: needed entries=%d, "
3201                    "current entries=%d\n",
3202                    team->t.t_id, argc, (realloc) ? team->t.t_max_argc : 0));
3203     /* if previously allocated heap space for args, free them */
3204     if (realloc && team->t.t_argv != &team->t.t_inline_argv[0])
3205       __kmp_free((void *)team->t.t_argv);
3206 
3207     if (argc <= KMP_INLINE_ARGV_ENTRIES) {
3208       /* use unused space in the cache line for arguments */
3209       team->t.t_max_argc = KMP_INLINE_ARGV_ENTRIES;
3210       KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: inline allocate %d "
3211                      "argv entries\n",
3212                      team->t.t_id, team->t.t_max_argc));
3213       team->t.t_argv = &team->t.t_inline_argv[0];
3214       if (__kmp_storage_map) {
3215         __kmp_print_storage_map_gtid(
3216             -1, &team->t.t_inline_argv[0],
3217             &team->t.t_inline_argv[KMP_INLINE_ARGV_ENTRIES],
3218             (sizeof(void *) * KMP_INLINE_ARGV_ENTRIES), "team_%d.t_inline_argv",
3219             team->t.t_id);
3220       }
3221     } else {
3222       /* allocate space for arguments in the heap */
3223       team->t.t_max_argc = (argc <= (KMP_MIN_MALLOC_ARGV_ENTRIES >> 1))
3224                                ? KMP_MIN_MALLOC_ARGV_ENTRIES
3225                                : 2 * argc;
3226       KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: dynamic allocate %d "
3227                      "argv entries\n",
3228                      team->t.t_id, team->t.t_max_argc));
3229       team->t.t_argv =
3230           (void **)__kmp_page_allocate(sizeof(void *) * team->t.t_max_argc);
3231       if (__kmp_storage_map) {
3232         __kmp_print_storage_map_gtid(-1, &team->t.t_argv[0],
3233                                      &team->t.t_argv[team->t.t_max_argc],
3234                                      sizeof(void *) * team->t.t_max_argc,
3235                                      "team_%d.t_argv", team->t.t_id);
3236       }
3237     }
3238   }
3239 }
3240 
3241 static void __kmp_allocate_team_arrays(kmp_team_t *team, int max_nth) {
3242   int i;
3243   int num_disp_buff = max_nth > 1 ? __kmp_dispatch_num_buffers : 2;
3244   team->t.t_threads =
3245       (kmp_info_t **)__kmp_allocate(sizeof(kmp_info_t *) * max_nth);
3246   team->t.t_disp_buffer = (dispatch_shared_info_t *)__kmp_allocate(
3247       sizeof(dispatch_shared_info_t) * num_disp_buff);
3248   team->t.t_dispatch =
3249       (kmp_disp_t *)__kmp_allocate(sizeof(kmp_disp_t) * max_nth);
3250   team->t.t_implicit_task_taskdata =
3251       (kmp_taskdata_t *)__kmp_allocate(sizeof(kmp_taskdata_t) * max_nth);
3252   team->t.t_max_nproc = max_nth;
3253 
3254   /* setup dispatch buffers */
3255   for (i = 0; i < num_disp_buff; ++i) {
3256     team->t.t_disp_buffer[i].buffer_index = i;
3257     team->t.t_disp_buffer[i].doacross_buf_idx = i;
3258   }
3259 }
3260 
3261 static void __kmp_free_team_arrays(kmp_team_t *team) {
3262   /* Note: this does not free the threads in t_threads (__kmp_free_threads) */
3263   int i;
3264   for (i = 0; i < team->t.t_max_nproc; ++i) {
3265     if (team->t.t_dispatch[i].th_disp_buffer != NULL) {
3266       __kmp_free(team->t.t_dispatch[i].th_disp_buffer);
3267       team->t.t_dispatch[i].th_disp_buffer = NULL;
3268     }
3269   }
3270 #if KMP_USE_HIER_SCHED
3271   __kmp_dispatch_free_hierarchies(team);
3272 #endif
3273   __kmp_free(team->t.t_threads);
3274   __kmp_free(team->t.t_disp_buffer);
3275   __kmp_free(team->t.t_dispatch);
3276   __kmp_free(team->t.t_implicit_task_taskdata);
3277   team->t.t_threads = NULL;
3278   team->t.t_disp_buffer = NULL;
3279   team->t.t_dispatch = NULL;
3280   team->t.t_implicit_task_taskdata = 0;
3281 }
3282 
3283 static void __kmp_reallocate_team_arrays(kmp_team_t *team, int max_nth) {
3284   kmp_info_t **oldThreads = team->t.t_threads;
3285 
3286   __kmp_free(team->t.t_disp_buffer);
3287   __kmp_free(team->t.t_dispatch);
3288   __kmp_free(team->t.t_implicit_task_taskdata);
3289   __kmp_allocate_team_arrays(team, max_nth);
3290 
3291   KMP_MEMCPY(team->t.t_threads, oldThreads,
3292              team->t.t_nproc * sizeof(kmp_info_t *));
3293 
3294   __kmp_free(oldThreads);
3295 }
3296 
3297 static kmp_internal_control_t __kmp_get_global_icvs(void) {
3298 
3299   kmp_r_sched_t r_sched =
3300       __kmp_get_schedule_global(); // get current state of scheduling globals
3301 
3302   KMP_DEBUG_ASSERT(__kmp_nested_proc_bind.used > 0);
3303 
3304   kmp_internal_control_t g_icvs = {
3305     0, // int serial_nesting_level; //corresponds to value of th_team_serialized
3306     (kmp_int8)__kmp_global.g.g_dynamic, // internal control for dynamic
3307     // adjustment of threads (per thread)
3308     (kmp_int8)__kmp_env_blocktime, // int bt_set; //internal control for
3309     // whether blocktime is explicitly set
3310     __kmp_dflt_blocktime, // int blocktime; //internal control for blocktime
3311 #if KMP_USE_MONITOR
3312     __kmp_bt_intervals, // int bt_intervals; //internal control for blocktime
3313 // intervals
3314 #endif
3315     __kmp_dflt_team_nth, // int nproc; //internal control for # of threads for
3316     // next parallel region (per thread)
3317     // (use a max ub on value if __kmp_parallel_initialize not called yet)
3318     __kmp_cg_max_nth, // int thread_limit;
3319     __kmp_task_max_nth, // int task_thread_limit; // to set the thread_limit
3320     // on task. This is used in the case of target thread_limit
3321     __kmp_dflt_max_active_levels, // int max_active_levels; //internal control
3322     // for max_active_levels
3323     r_sched, // kmp_r_sched_t sched; //internal control for runtime schedule
3324     // {sched,chunk} pair
3325     __kmp_nested_proc_bind.bind_types[0],
3326     __kmp_default_device,
3327     NULL // struct kmp_internal_control *next;
3328   };
3329 
3330   return g_icvs;
3331 }
3332 
3333 static kmp_internal_control_t __kmp_get_x_global_icvs(const kmp_team_t *team) {
3334 
3335   kmp_internal_control_t gx_icvs;
3336   gx_icvs.serial_nesting_level =
3337       0; // probably =team->t.t_serial like in save_inter_controls
3338   copy_icvs(&gx_icvs, &team->t.t_threads[0]->th.th_current_task->td_icvs);
3339   gx_icvs.next = NULL;
3340 
3341   return gx_icvs;
3342 }
3343 
3344 static void __kmp_initialize_root(kmp_root_t *root) {
3345   int f;
3346   kmp_team_t *root_team;
3347   kmp_team_t *hot_team;
3348   int hot_team_max_nth;
3349   kmp_r_sched_t r_sched =
3350       __kmp_get_schedule_global(); // get current state of scheduling globals
3351   kmp_internal_control_t r_icvs = __kmp_get_global_icvs();
3352   KMP_DEBUG_ASSERT(root);
3353   KMP_ASSERT(!root->r.r_begin);
3354 
3355   /* setup the root state structure */
3356   __kmp_init_lock(&root->r.r_begin_lock);
3357   root->r.r_begin = FALSE;
3358   root->r.r_active = FALSE;
3359   root->r.r_in_parallel = 0;
3360   root->r.r_blocktime = __kmp_dflt_blocktime;
3361 #if KMP_AFFINITY_SUPPORTED
3362   root->r.r_affinity_assigned = FALSE;
3363 #endif
3364 
3365   /* setup the root team for this task */
3366   /* allocate the root team structure */
3367   KF_TRACE(10, ("__kmp_initialize_root: before root_team\n"));
3368 
3369   root_team =
3370       __kmp_allocate_team(root,
3371                           1, // new_nproc
3372                           1, // max_nproc
3373 #if OMPT_SUPPORT
3374                           ompt_data_none, // root parallel id
3375 #endif
3376                           __kmp_nested_proc_bind.bind_types[0], &r_icvs,
3377                           0 // argc
3378                           USE_NESTED_HOT_ARG(NULL) // primary thread is unknown
3379       );
3380 #if USE_DEBUGGER
3381   // Non-NULL value should be assigned to make the debugger display the root
3382   // team.
3383   TCW_SYNC_PTR(root_team->t.t_pkfn, (microtask_t)(~0));
3384 #endif
3385 
3386   KF_TRACE(10, ("__kmp_initialize_root: after root_team = %p\n", root_team));
3387 
3388   root->r.r_root_team = root_team;
3389   root_team->t.t_control_stack_top = NULL;
3390 
3391   /* initialize root team */
3392   root_team->t.t_threads[0] = NULL;
3393   root_team->t.t_nproc = 1;
3394   root_team->t.t_serialized = 1;
3395   // TODO???: root_team->t.t_max_active_levels = __kmp_dflt_max_active_levels;
3396   root_team->t.t_sched.sched = r_sched.sched;
3397   root_team->t.t_nested_nth = &__kmp_nested_nth;
3398   KA_TRACE(
3399       20,
3400       ("__kmp_initialize_root: init root team %d arrived: join=%u, plain=%u\n",
3401        root_team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
3402 
3403   /* setup the  hot team for this task */
3404   /* allocate the hot team structure */
3405   KF_TRACE(10, ("__kmp_initialize_root: before hot_team\n"));
3406 
3407   hot_team =
3408       __kmp_allocate_team(root,
3409                           1, // new_nproc
3410                           __kmp_dflt_team_nth_ub * 2, // max_nproc
3411 #if OMPT_SUPPORT
3412                           ompt_data_none, // root parallel id
3413 #endif
3414                           __kmp_nested_proc_bind.bind_types[0], &r_icvs,
3415                           0 // argc
3416                           USE_NESTED_HOT_ARG(NULL) // primary thread is unknown
3417       );
3418   KF_TRACE(10, ("__kmp_initialize_root: after hot_team = %p\n", hot_team));
3419 
3420   root->r.r_hot_team = hot_team;
3421   root_team->t.t_control_stack_top = NULL;
3422 
3423   /* first-time initialization */
3424   hot_team->t.t_parent = root_team;
3425 
3426   /* initialize hot team */
3427   hot_team_max_nth = hot_team->t.t_max_nproc;
3428   for (f = 0; f < hot_team_max_nth; ++f) {
3429     hot_team->t.t_threads[f] = NULL;
3430   }
3431   hot_team->t.t_nproc = 1;
3432   // TODO???: hot_team->t.t_max_active_levels = __kmp_dflt_max_active_levels;
3433   hot_team->t.t_sched.sched = r_sched.sched;
3434   hot_team->t.t_size_changed = 0;
3435   hot_team->t.t_nested_nth = &__kmp_nested_nth;
3436 }
3437 
3438 #ifdef KMP_DEBUG
3439 
// Node of the singly linked list of teams collected by
// __kmp_print_structure_team_accum and printed by __kmp_print_structure.
// The list ends with a terminator node whose entry and next are both NULL.
typedef struct kmp_team_list_item {
  kmp_team_p const *entry; // Team held by this node; NULL in the terminator.
  struct kmp_team_list_item *next; // Following node; NULL in the terminator.
} kmp_team_list_item_t;
// A list is handled through a pointer to its first node.
typedef kmp_team_list_item_t *kmp_team_list_t;
3445 
3446 static void __kmp_print_structure_team_accum( // Add team to list of teams.
3447     kmp_team_list_t list, // List of teams.
3448     kmp_team_p const *team // Team to add.
3449 ) {
3450 
3451   // List must terminate with item where both entry and next are NULL.
3452   // Team is added to the list only once.
3453   // List is sorted in ascending order by team id.
3454   // Team id is *not* a key.
3455 
3456   kmp_team_list_t l;
3457 
3458   KMP_DEBUG_ASSERT(list != NULL);
3459   if (team == NULL) {
3460     return;
3461   }
3462 
3463   __kmp_print_structure_team_accum(list, team->t.t_parent);
3464   __kmp_print_structure_team_accum(list, team->t.t_next_pool);
3465 
3466   // Search list for the team.
3467   l = list;
3468   while (l->next != NULL && l->entry != team) {
3469     l = l->next;
3470   }
3471   if (l->next != NULL) {
3472     return; // Team has been added before, exit.
3473   }
3474 
3475   // Team is not found. Search list again for insertion point.
3476   l = list;
3477   while (l->next != NULL && l->entry->t.t_id <= team->t.t_id) {
3478     l = l->next;
3479   }
3480 
3481   // Insert team.
3482   {
3483     kmp_team_list_item_t *item = (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC(
3484         sizeof(kmp_team_list_item_t));
3485     *item = *l;
3486     l->entry = team;
3487     l->next = item;
3488   }
3489 }
3490 
3491 static void __kmp_print_structure_team(char const *title, kmp_team_p const *team
3492 
3493 ) {
3494   __kmp_printf("%s", title);
3495   if (team != NULL) {
3496     __kmp_printf("%2x %p\n", team->t.t_id, team);
3497   } else {
3498     __kmp_printf(" - (nil)\n");
3499   }
3500 }
3501 
3502 static void __kmp_print_structure_thread(char const *title,
3503                                          kmp_info_p const *thread) {
3504   __kmp_printf("%s", title);
3505   if (thread != NULL) {
3506     __kmp_printf("%2d %p\n", thread->th.th_info.ds.ds_gtid, thread);
3507   } else {
3508     __kmp_printf(" - (nil)\n");
3509   }
3510 }
3511 
3512 void __kmp_print_structure(void) {
3513 
3514   kmp_team_list_t list;
3515 
3516   // Initialize list of teams.
3517   list =
3518       (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC(sizeof(kmp_team_list_item_t));
3519   list->entry = NULL;
3520   list->next = NULL;
3521 
3522   __kmp_printf("\n------------------------------\nGlobal Thread "
3523                "Table\n------------------------------\n");
3524   {
3525     int gtid;
3526     for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3527       __kmp_printf("%2d", gtid);
3528       if (__kmp_threads != NULL) {
3529         __kmp_printf(" %p", __kmp_threads[gtid]);
3530       }
3531       if (__kmp_root != NULL) {
3532         __kmp_printf(" %p", __kmp_root[gtid]);
3533       }
3534       __kmp_printf("\n");
3535     }
3536   }
3537 
3538   // Print out __kmp_threads array.
3539   __kmp_printf("\n------------------------------\nThreads\n--------------------"
3540                "----------\n");
3541   if (__kmp_threads != NULL) {
3542     int gtid;
3543     for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3544       kmp_info_t const *thread = __kmp_threads[gtid];
3545       if (thread != NULL) {
3546         __kmp_printf("GTID %2d %p:\n", gtid, thread);
3547         __kmp_printf("    Our Root:        %p\n", thread->th.th_root);
3548         __kmp_print_structure_team("    Our Team:     ", thread->th.th_team);
3549         __kmp_print_structure_team("    Serial Team:  ",
3550                                    thread->th.th_serial_team);
3551         __kmp_printf("    Threads:      %2d\n", thread->th.th_team_nproc);
3552         __kmp_print_structure_thread("    Primary:      ",
3553                                      thread->th.th_team_master);
3554         __kmp_printf("    Serialized?:  %2d\n", thread->th.th_team_serialized);
3555         __kmp_printf("    Set NProc:    %2d\n", thread->th.th_set_nproc);
3556         __kmp_printf("    Set Proc Bind: %2d\n", thread->th.th_set_proc_bind);
3557         __kmp_print_structure_thread("    Next in pool: ",
3558                                      thread->th.th_next_pool);
3559         __kmp_printf("\n");
3560         __kmp_print_structure_team_accum(list, thread->th.th_team);
3561         __kmp_print_structure_team_accum(list, thread->th.th_serial_team);
3562       }
3563     }
3564   } else {
3565     __kmp_printf("Threads array is not allocated.\n");
3566   }
3567 
3568   // Print out __kmp_root array.
3569   __kmp_printf("\n------------------------------\nUbers\n----------------------"
3570                "--------\n");
3571   if (__kmp_root != NULL) {
3572     int gtid;
3573     for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3574       kmp_root_t const *root = __kmp_root[gtid];
3575       if (root != NULL) {
3576         __kmp_printf("GTID %2d %p:\n", gtid, root);
3577         __kmp_print_structure_team("    Root Team:    ", root->r.r_root_team);
3578         __kmp_print_structure_team("    Hot Team:     ", root->r.r_hot_team);
3579         __kmp_print_structure_thread("    Uber Thread:  ",
3580                                      root->r.r_uber_thread);
3581         __kmp_printf("    Active?:      %2d\n", root->r.r_active);
3582         __kmp_printf("    In Parallel:  %2d\n",
3583                      KMP_ATOMIC_LD_RLX(&root->r.r_in_parallel));
3584         __kmp_printf("\n");
3585         __kmp_print_structure_team_accum(list, root->r.r_root_team);
3586         __kmp_print_structure_team_accum(list, root->r.r_hot_team);
3587       }
3588     }
3589   } else {
3590     __kmp_printf("Ubers array is not allocated.\n");
3591   }
3592 
3593   __kmp_printf("\n------------------------------\nTeams\n----------------------"
3594                "--------\n");
3595   while (list->next != NULL) {
3596     kmp_team_p const *team = list->entry;
3597     int i;
3598     __kmp_printf("Team %2x %p:\n", team->t.t_id, team);
3599     __kmp_print_structure_team("    Parent Team:      ", team->t.t_parent);
3600     __kmp_printf("    Primary TID:      %2d\n", team->t.t_master_tid);
3601     __kmp_printf("    Max threads:      %2d\n", team->t.t_max_nproc);
3602     __kmp_printf("    Levels of serial: %2d\n", team->t.t_serialized);
3603     __kmp_printf("    Number threads:   %2d\n", team->t.t_nproc);
3604     for (i = 0; i < team->t.t_nproc; ++i) {
3605       __kmp_printf("    Thread %2d:      ", i);
3606       __kmp_print_structure_thread("", team->t.t_threads[i]);
3607     }
3608     __kmp_print_structure_team("    Next in pool:     ", team->t.t_next_pool);
3609     __kmp_printf("\n");
3610     list = list->next;
3611   }
3612 
3613   // Print out __kmp_thread_pool and __kmp_team_pool.
3614   __kmp_printf("\n------------------------------\nPools\n----------------------"
3615                "--------\n");
3616   __kmp_print_structure_thread("Thread pool:          ",
3617                                CCAST(kmp_info_t *, __kmp_thread_pool));
3618   __kmp_print_structure_team("Team pool:            ",
3619                              CCAST(kmp_team_t *, __kmp_team_pool));
3620   __kmp_printf("\n");
3621 
3622   // Free team list.
3623   while (list != NULL) {
3624     kmp_team_list_item_t *item = list;
3625     list = list->next;
3626     KMP_INTERNAL_FREE(item);
3627   }
3628 }
3629 
3630 #endif
3631 
//---------------------------------------------------------------------------
//  Stuff for per-thread fast random number generator
//  Table of primes. __kmp_init_random picks one entry, indexed by the
//  thread's tid modulo the table size, as that thread's multiplier (th_a).
static const unsigned __kmp_primes[] = {
    0x9e3779b1, 0xffe6cc59, 0x2109f6dd, 0x43977ab5, 0xba5703f5, 0xb495a877,
    0xe1626741, 0x79695e6b, 0xbc98c09f, 0xd5bee2b3, 0x287488f9, 0x3af18231,
    0x9677cd4d, 0xbe3a6929, 0xadc6a877, 0xdcf0674b, 0xbe4d6fe9, 0x5f15e201,
    0x99afc3fd, 0xf3f16801, 0xe222cfff, 0x24ba5fdb, 0x0620452d, 0x79f149e3,
    0xc8b93f49, 0x972702cd, 0xb07dd827, 0x6c97d5ed, 0x085a3d61, 0x46eb5ea7,
    0x3d9910ed, 0x2e687b5b, 0x29609227, 0x6eb081f1, 0x0954c4e1, 0x9d114db9,
    0x542acfa9, 0xb3e6bd7b, 0x0742d917, 0xe9f3ffa7, 0x54581edb, 0xf2480f45,
    0x0bb9288f, 0xef1affc7, 0x85fa0ca7, 0x3ccc14db, 0xe6baf34b, 0x343377f7,
    0x5ca19031, 0xe6d9293b, 0xf0a9f391, 0x5d2e980b, 0xfc411073, 0xc3749363,
    0xb892d829, 0x3549366b, 0x629750ad, 0xb98294e5, 0x892d9483, 0xc235baf3,
    0x3d2402a3, 0x6bdef3c9, 0xbec333cd, 0x40c9520f};
3647 
3648 //---------------------------------------------------------------------------
3649 //  __kmp_get_random: Get a random number using a linear congruential method.
3650 unsigned short __kmp_get_random(kmp_info_t *thread) {
3651   unsigned x = thread->th.th_x;
3652   unsigned short r = (unsigned short)(x >> 16);
3653 
3654   thread->th.th_x = x * thread->th.th_a + 1;
3655 
3656   KA_TRACE(30, ("__kmp_get_random: THREAD: %d, RETURN: %u\n",
3657                 thread->th.th_info.ds.ds_tid, r));
3658 
3659   return r;
3660 }
3661 //--------------------------------------------------------
3662 // __kmp_init_random: Initialize a random number generator
3663 void __kmp_init_random(kmp_info_t *thread) {
3664   unsigned seed = thread->th.th_info.ds.ds_tid;
3665 
3666   thread->th.th_a =
3667       __kmp_primes[seed % (sizeof(__kmp_primes) / sizeof(__kmp_primes[0]))];
3668   thread->th.th_x = (seed + 1) * thread->th.th_a + 1;
3669   KA_TRACE(30,
3670            ("__kmp_init_random: THREAD: %u; A: %u\n", seed, thread->th.th_a));
3671 }
3672 
3673 #if KMP_OS_WINDOWS
3674 /* reclaim array entries for root threads that are already dead, returns number
3675  * reclaimed */
3676 static int __kmp_reclaim_dead_roots(void) {
3677   int i, r = 0;
3678 
3679   for (i = 0; i < __kmp_threads_capacity; ++i) {
3680     if (KMP_UBER_GTID(i) &&
3681         !__kmp_still_running((kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[i])) &&
3682         !__kmp_root[i]
3683              ->r.r_active) { // AC: reclaim only roots died in non-active state
3684       r += __kmp_unregister_root_other_thread(i);
3685     }
3686   }
3687   return r;
3688 }
3689 #endif
3690 
3691 /* This function attempts to create free entries in __kmp_threads and
3692    __kmp_root, and returns the number of free entries generated.
3693 
3694    For Windows* OS static library, the first mechanism used is to reclaim array
3695    entries for root threads that are already dead.
3696 
3697    On all platforms, expansion is attempted on the arrays __kmp_threads_ and
3698    __kmp_root, with appropriate update to __kmp_threads_capacity. Array
3699    capacity is increased by doubling with clipping to __kmp_tp_capacity, if
3700    threadprivate cache array has been created. Synchronization with
3701    __kmpc_threadprivate_cached is done using __kmp_tp_cached_lock.
3702 
3703    After any dead root reclamation, if the clipping value allows array expansion
3704    to result in the generation of a total of nNeed free slots, the function does
3705    that expansion. If not, nothing is done beyond the possible initial root
3706    thread reclamation.
3707 
3708    If any argument is negative, the behavior is undefined. */
3709 static int __kmp_expand_threads(int nNeed) {
3710   int added = 0;
3711   int minimumRequiredCapacity;
3712   int newCapacity;
3713   kmp_info_t **newThreads;
3714   kmp_root_t **newRoot;
3715 
3716   // All calls to __kmp_expand_threads should be under __kmp_forkjoin_lock, so
3717   // resizing __kmp_threads does not need additional protection if foreign
3718   // threads are present
3719 
3720 #if KMP_OS_WINDOWS && !KMP_DYNAMIC_LIB
3721   /* only for Windows static library */
3722   /* reclaim array entries for root threads that are already dead */
3723   added = __kmp_reclaim_dead_roots();
3724 
3725   if (nNeed) {
3726     nNeed -= added;
3727     if (nNeed < 0)
3728       nNeed = 0;
3729   }
3730 #endif
3731   if (nNeed <= 0)
3732     return added;
3733 
3734   // Note that __kmp_threads_capacity is not bounded by __kmp_max_nth. If
3735   // __kmp_max_nth is set to some value less than __kmp_sys_max_nth by the
3736   // user via KMP_DEVICE_THREAD_LIMIT, then __kmp_threads_capacity may become
3737   // > __kmp_max_nth in one of two ways:
3738   //
3739   // 1) The initialization thread (gtid = 0) exits.  __kmp_threads[0]
3740   //    may not be reused by another thread, so we may need to increase
3741   //    __kmp_threads_capacity to __kmp_max_nth + 1.
3742   //
3743   // 2) New foreign root(s) are encountered.  We always register new foreign
3744   //    roots. This may cause a smaller # of threads to be allocated at
3745   //    subsequent parallel regions, but the worker threads hang around (and
3746   //    eventually go to sleep) and need slots in the __kmp_threads[] array.
3747   //
3748   // Anyway, that is the reason for moving the check to see if
3749   // __kmp_max_nth was exceeded into __kmp_reserve_threads()
3750   // instead of having it performed here. -BB
3751 
3752   KMP_DEBUG_ASSERT(__kmp_sys_max_nth >= __kmp_threads_capacity);
3753 
3754   /* compute expansion headroom to check if we can expand */
3755   if (__kmp_sys_max_nth - __kmp_threads_capacity < nNeed) {
3756     /* possible expansion too small -- give up */
3757     return added;
3758   }
3759   minimumRequiredCapacity = __kmp_threads_capacity + nNeed;
3760 
3761   newCapacity = __kmp_threads_capacity;
3762   do {
3763     newCapacity = newCapacity <= (__kmp_sys_max_nth >> 1) ? (newCapacity << 1)
3764                                                           : __kmp_sys_max_nth;
3765   } while (newCapacity < minimumRequiredCapacity);
3766   newThreads = (kmp_info_t **)__kmp_allocate(
3767       (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * newCapacity + CACHE_LINE);
3768   newRoot =
3769       (kmp_root_t **)((char *)newThreads + sizeof(kmp_info_t *) * newCapacity);
3770   KMP_MEMCPY(newThreads, __kmp_threads,
3771              __kmp_threads_capacity * sizeof(kmp_info_t *));
3772   KMP_MEMCPY(newRoot, __kmp_root,
3773              __kmp_threads_capacity * sizeof(kmp_root_t *));
3774   // Put old __kmp_threads array on a list. Any ongoing references to the old
3775   // list will be valid. This list is cleaned up at library shutdown.
3776   kmp_old_threads_list_t *node =
3777       (kmp_old_threads_list_t *)__kmp_allocate(sizeof(kmp_old_threads_list_t));
3778   node->threads = __kmp_threads;
3779   node->next = __kmp_old_threads_list;
3780   __kmp_old_threads_list = node;
3781 
3782   *(kmp_info_t * *volatile *)&__kmp_threads = newThreads;
3783   *(kmp_root_t * *volatile *)&__kmp_root = newRoot;
3784   added += newCapacity - __kmp_threads_capacity;
3785   *(volatile int *)&__kmp_threads_capacity = newCapacity;
3786 
3787   if (newCapacity > __kmp_tp_capacity) {
3788     __kmp_acquire_bootstrap_lock(&__kmp_tp_cached_lock);
3789     if (__kmp_tp_cached && newCapacity > __kmp_tp_capacity) {
3790       __kmp_threadprivate_resize_cache(newCapacity);
3791     } else { // increase __kmp_tp_capacity to correspond with kmp_threads size
3792       *(volatile int *)&__kmp_tp_capacity = newCapacity;
3793     }
3794     __kmp_release_bootstrap_lock(&__kmp_tp_cached_lock);
3795   }
3796 
3797   return added;
3798 }
3799 
3800 /* Register the current thread as a root thread and obtain our gtid. We must
3801    have the __kmp_initz_lock held at this point. Argument TRUE only if are the
3802    thread that calls from __kmp_do_serial_initialize() */
3803 int __kmp_register_root(int initial_thread) {
3804   kmp_info_t *root_thread;
3805   kmp_root_t *root;
3806   int gtid;
3807   int capacity;
3808   __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
3809   KA_TRACE(20, ("__kmp_register_root: entered\n"));
3810   KMP_MB();
3811 
3812   /* 2007-03-02:
3813      If initial thread did not invoke OpenMP RTL yet, and this thread is not an
3814      initial one, "__kmp_all_nth >= __kmp_threads_capacity" condition does not
3815      work as expected -- it may return false (that means there is at least one
3816      empty slot in __kmp_threads array), but it is possible the only free slot
3817      is #0, which is reserved for initial thread and so cannot be used for this
3818      one. Following code workarounds this bug.
3819 
3820      However, right solution seems to be not reserving slot #0 for initial
3821      thread because:
3822      (1) there is no magic in slot #0,
3823      (2) we cannot detect initial thread reliably (the first thread which does
3824         serial initialization may be not a real initial thread).
3825   */
3826   capacity = __kmp_threads_capacity;
3827   if (!initial_thread && TCR_PTR(__kmp_threads[0]) == NULL) {
3828     --capacity;
3829   }
3830 
3831   // If it is not for initializing the hidden helper team, we need to take
3832   // __kmp_hidden_helper_threads_num out of the capacity because it is included
3833   // in __kmp_threads_capacity.
3834   if (__kmp_enable_hidden_helper && !TCR_4(__kmp_init_hidden_helper_threads)) {
3835     capacity -= __kmp_hidden_helper_threads_num;
3836   }
3837 
3838   /* see if there are too many threads */
3839   if (__kmp_all_nth >= capacity && !__kmp_expand_threads(1)) {
3840     if (__kmp_tp_cached) {
3841       __kmp_fatal(KMP_MSG(CantRegisterNewThread),
3842                   KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity),
3843                   KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null);
3844     } else {
3845       __kmp_fatal(KMP_MSG(CantRegisterNewThread), KMP_HNT(SystemLimitOnThreads),
3846                   __kmp_msg_null);
3847     }
3848   }
3849 
3850   // When hidden helper task is enabled, __kmp_threads is organized as follows:
3851   // 0: initial thread, also a regular OpenMP thread.
3852   // [1, __kmp_hidden_helper_threads_num]: slots for hidden helper threads.
3853   // [__kmp_hidden_helper_threads_num + 1, __kmp_threads_capacity): slots for
3854   // regular OpenMP threads.
3855   if (TCR_4(__kmp_init_hidden_helper_threads)) {
3856     // Find an available thread slot for hidden helper thread. Slots for hidden
3857     // helper threads start from 1 to __kmp_hidden_helper_threads_num.
3858     for (gtid = 1; TCR_PTR(__kmp_threads[gtid]) != NULL &&
3859                    gtid <= __kmp_hidden_helper_threads_num;
3860          gtid++)
3861       ;
3862     KMP_ASSERT(gtid <= __kmp_hidden_helper_threads_num);
3863     KA_TRACE(1, ("__kmp_register_root: found slot in threads array for "
3864                  "hidden helper thread: T#%d\n",
3865                  gtid));
3866   } else {
3867     /* find an available thread slot */
3868     // Don't reassign the zero slot since we need that to only be used by
3869     // initial thread. Slots for hidden helper threads should also be skipped.
3870     if (initial_thread && TCR_PTR(__kmp_threads[0]) == NULL) {
3871       gtid = 0;
3872     } else {
3873       for (gtid = __kmp_hidden_helper_threads_num + 1;
3874            TCR_PTR(__kmp_threads[gtid]) != NULL; gtid++)
3875         ;
3876     }
3877     KA_TRACE(
3878         1, ("__kmp_register_root: found slot in threads array: T#%d\n", gtid));
3879     KMP_ASSERT(gtid < __kmp_threads_capacity);
3880   }
3881 
3882   /* update global accounting */
3883   __kmp_all_nth++;
3884   TCW_4(__kmp_nth, __kmp_nth + 1);
3885 
3886   // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search) for low
3887   // numbers of procs, and method #2 (keyed API call) for higher numbers.
3888   if (__kmp_adjust_gtid_mode) {
3889     if (__kmp_all_nth >= __kmp_tls_gtid_min) {
3890       if (TCR_4(__kmp_gtid_mode) != 2) {
3891         TCW_4(__kmp_gtid_mode, 2);
3892       }
3893     } else {
3894       if (TCR_4(__kmp_gtid_mode) != 1) {
3895         TCW_4(__kmp_gtid_mode, 1);
3896       }
3897     }
3898   }
3899 
3900 #ifdef KMP_ADJUST_BLOCKTIME
3901   /* Adjust blocktime to zero if necessary            */
3902   /* Middle initialization might not have occurred yet */
3903   if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
3904     if (__kmp_nth > __kmp_avail_proc) {
3905       __kmp_zero_bt = TRUE;
3906     }
3907   }
3908 #endif /* KMP_ADJUST_BLOCKTIME */
3909 
3910   /* setup this new hierarchy */
3911   if (!(root = __kmp_root[gtid])) {
3912     root = __kmp_root[gtid] = (kmp_root_t *)__kmp_allocate(sizeof(kmp_root_t));
3913     KMP_DEBUG_ASSERT(!root->r.r_root_team);
3914   }
3915 
3916 #if KMP_STATS_ENABLED
3917   // Initialize stats as soon as possible (right after gtid assignment).
3918   __kmp_stats_thread_ptr = __kmp_stats_list->push_back(gtid);
3919   __kmp_stats_thread_ptr->startLife();
3920   KMP_SET_THREAD_STATE(SERIAL_REGION);
3921   KMP_INIT_PARTITIONED_TIMERS(OMP_serial);
3922 #endif
3923   __kmp_initialize_root(root);
3924 
3925   /* setup new root thread structure */
3926   if (root->r.r_uber_thread) {
3927     root_thread = root->r.r_uber_thread;
3928   } else {
3929     root_thread = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t));
3930     if (__kmp_storage_map) {
3931       __kmp_print_thread_storage_map(root_thread, gtid);
3932     }
3933     root_thread->th.th_info.ds.ds_gtid = gtid;
3934 #if OMPT_SUPPORT
3935     root_thread->th.ompt_thread_info.thread_data = ompt_data_none;
3936 #endif
3937     root_thread->th.th_root = root;
3938     if (__kmp_env_consistency_check) {
3939       root_thread->th.th_cons = __kmp_allocate_cons_stack(gtid);
3940     }
3941 #if USE_FAST_MEMORY
3942     __kmp_initialize_fast_memory(root_thread);
3943 #endif /* USE_FAST_MEMORY */
3944 
3945 #if KMP_USE_BGET
3946     KMP_DEBUG_ASSERT(root_thread->th.th_local.bget_data == NULL);
3947     __kmp_initialize_bget(root_thread);
3948 #endif
3949     __kmp_init_random(root_thread); // Initialize random number generator
3950   }
3951 
3952   /* setup the serial team held in reserve by the root thread */
3953   if (!root_thread->th.th_serial_team) {
3954     kmp_internal_control_t r_icvs = __kmp_get_global_icvs();
3955     KF_TRACE(10, ("__kmp_register_root: before serial_team\n"));
3956     root_thread->th.th_serial_team = __kmp_allocate_team(
3957         root, 1, 1,
3958 #if OMPT_SUPPORT
3959         ompt_data_none, // root parallel id
3960 #endif
3961         proc_bind_default, &r_icvs, 0 USE_NESTED_HOT_ARG(NULL));
3962   }
3963   KMP_ASSERT(root_thread->th.th_serial_team);
3964   KF_TRACE(10, ("__kmp_register_root: after serial_team = %p\n",
3965                 root_thread->th.th_serial_team));
3966 
3967   /* drop root_thread into place */
3968   TCW_SYNC_PTR(__kmp_threads[gtid], root_thread);
3969 
3970   root->r.r_root_team->t.t_threads[0] = root_thread;
3971   root->r.r_hot_team->t.t_threads[0] = root_thread;
3972   root_thread->th.th_serial_team->t.t_threads[0] = root_thread;
3973   // AC: the team created in reserve, not for execution (it is unused for now).
3974   root_thread->th.th_serial_team->t.t_serialized = 0;
3975   root->r.r_uber_thread = root_thread;
3976 
3977   /* initialize the thread, get it ready to go */
3978   __kmp_initialize_info(root_thread, root->r.r_root_team, 0, gtid);
3979   TCW_4(__kmp_init_gtid, TRUE);
3980 
3981   /* prepare the primary thread for get_gtid() */
3982   __kmp_gtid_set_specific(gtid);
3983 
3984 #if USE_ITT_BUILD
3985   __kmp_itt_thread_name(gtid);
3986 #endif /* USE_ITT_BUILD */
3987 
3988 #ifdef KMP_TDATA_GTID
3989   __kmp_gtid = gtid;
3990 #endif
3991   __kmp_create_worker(gtid, root_thread, __kmp_stksize);
3992   KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() == gtid);
3993 
3994   KA_TRACE(20, ("__kmp_register_root: T#%d init T#%d(%d:%d) arrived: join=%u, "
3995                 "plain=%u\n",
3996                 gtid, __kmp_gtid_from_tid(0, root->r.r_hot_team),
3997                 root->r.r_hot_team->t.t_id, 0, KMP_INIT_BARRIER_STATE,
3998                 KMP_INIT_BARRIER_STATE));
3999   { // Initialize barrier data.
4000     int b;
4001     for (b = 0; b < bs_last_barrier; ++b) {
4002       root_thread->th.th_bar[b].bb.b_arrived = KMP_INIT_BARRIER_STATE;
4003 #if USE_DEBUGGER
4004       root_thread->th.th_bar[b].bb.b_worker_arrived = 0;
4005 #endif
4006     }
4007   }
4008   KMP_DEBUG_ASSERT(root->r.r_hot_team->t.t_bar[bs_forkjoin_barrier].b_arrived ==
4009                    KMP_INIT_BARRIER_STATE);
4010 
4011 #if KMP_AFFINITY_SUPPORTED
4012   root_thread->th.th_current_place = KMP_PLACE_UNDEFINED;
4013   root_thread->th.th_new_place = KMP_PLACE_UNDEFINED;
4014   root_thread->th.th_first_place = KMP_PLACE_UNDEFINED;
4015   root_thread->th.th_last_place = KMP_PLACE_UNDEFINED;
4016 #endif /* KMP_AFFINITY_SUPPORTED */
4017   root_thread->th.th_def_allocator = __kmp_def_allocator;
4018   root_thread->th.th_prev_level = 0;
4019   root_thread->th.th_prev_num_threads = 1;
4020 
4021   kmp_cg_root_t *tmp = (kmp_cg_root_t *)__kmp_allocate(sizeof(kmp_cg_root_t));
4022   tmp->cg_root = root_thread;
4023   tmp->cg_thread_limit = __kmp_cg_max_nth;
4024   tmp->cg_nthreads = 1;
4025   KA_TRACE(100, ("__kmp_register_root: Thread %p created node %p with"
4026                  " cg_nthreads init to 1\n",
4027                  root_thread, tmp));
4028   tmp->up = NULL;
4029   root_thread->th.th_cg_roots = tmp;
4030 
4031   __kmp_root_counter++;
4032 
4033 #if OMPT_SUPPORT
4034   if (ompt_enabled.enabled) {
4035 
4036     kmp_info_t *root_thread = ompt_get_thread();
4037 
4038     ompt_set_thread_state(root_thread, ompt_state_overhead);
4039 
4040     if (ompt_enabled.ompt_callback_thread_begin) {
4041       ompt_callbacks.ompt_callback(ompt_callback_thread_begin)(
4042           ompt_thread_initial, __ompt_get_thread_data_internal());
4043     }
4044     ompt_data_t *task_data;
4045     ompt_data_t *parallel_data;
4046     __ompt_get_task_info_internal(0, NULL, &task_data, NULL, &parallel_data,
4047                                   NULL);
4048     if (ompt_enabled.ompt_callback_implicit_task) {
4049       ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
4050           ompt_scope_begin, parallel_data, task_data, 1, 1, ompt_task_initial);
4051     }
4052 
4053     ompt_set_thread_state(root_thread, ompt_state_work_serial);
4054   }
4055 #endif
4056 #if OMPD_SUPPORT
4057   if (ompd_state & OMPD_ENABLE_BP)
4058     ompd_bp_thread_begin();
4059 #endif
4060 
4061   KMP_MB();
4062   __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
4063 
4064   return gtid;
4065 }
4066 
#if KMP_NESTED_HOT_TEAMS
// Recursively releases the hot team that thread `thr` keeps for nesting
// `level`, together with the hot teams its members keep for deeper levels
// (below `max_level`).
// Returns the number of threads accounted as freed: each released team
// contributes its size minus one, because its primary thread is not freed.
static int __kmp_free_hot_teams(kmp_root_t *root, kmp_info_t *thr, int level,
                                const int max_level) {
  kmp_hot_team_ptr_t *const levels = thr->th.th_hot_teams;
  if (!levels || !levels[level].hot_team)
    return 0; // nothing is recorded for this level

  KMP_DEBUG_ASSERT(level < max_level);

  kmp_team_t *const team = levels[level].hot_team;
  const int team_size = levels[level].hot_team_nth;
  int freed = team_size - 1; // primary thread is not freed

  if (level < max_level - 1) {
    // Descend into every member first; afterwards drop the hot-team array
    // owned by each worker (tid > 0). The array of member #0 is not released
    // in this loop.
    for (int tid = 0; tid < team_size; ++tid) {
      kmp_info_t *member = team->t.t_threads[tid];
      freed += __kmp_free_hot_teams(root, member, level + 1, max_level);
      if (tid > 0 && member->th.th_hot_teams) {
        __kmp_free(member->th.th_hot_teams);
        member->th.th_hot_teams = NULL;
      }
    }
  }

  __kmp_free_team(root, team, NULL);
  return freed;
}
#endif
4093 
4094 // Resets a root thread and clear its root and hot teams.
4095 // Returns the number of __kmp_threads entries directly and indirectly freed.
4096 static int __kmp_reset_root(int gtid, kmp_root_t *root) {
4097   kmp_team_t *root_team = root->r.r_root_team;
4098   kmp_team_t *hot_team = root->r.r_hot_team;
4099   int n = hot_team->t.t_nproc;
4100   int i;
4101 
4102   KMP_DEBUG_ASSERT(!root->r.r_active);
4103 
4104   root->r.r_root_team = NULL;
4105   root->r.r_hot_team = NULL;
4106   // __kmp_free_team() does not free hot teams, so we have to clear r_hot_team
4107   // before call to __kmp_free_team().
4108   __kmp_free_team(root, root_team USE_NESTED_HOT_ARG(NULL));
4109 #if KMP_NESTED_HOT_TEAMS
4110   if (__kmp_hot_teams_max_level >
4111       0) { // need to free nested hot teams and their threads if any
4112     for (i = 0; i < hot_team->t.t_nproc; ++i) {
4113       kmp_info_t *th = hot_team->t.t_threads[i];
4114       if (__kmp_hot_teams_max_level > 1) {
4115         n += __kmp_free_hot_teams(root, th, 1, __kmp_hot_teams_max_level);
4116       }
4117       if (th->th.th_hot_teams) {
4118         __kmp_free(th->th.th_hot_teams);
4119         th->th.th_hot_teams = NULL;
4120       }
4121     }
4122   }
4123 #endif
4124   __kmp_free_team(root, hot_team USE_NESTED_HOT_ARG(NULL));
4125 
4126   // Before we can reap the thread, we need to make certain that all other
4127   // threads in the teams that had this root as ancestor have stopped trying to
4128   // steal tasks.
4129   if (__kmp_tasking_mode != tskm_immediate_exec) {
4130     __kmp_wait_to_unref_task_teams();
4131   }
4132 
4133 #if KMP_OS_WINDOWS
4134   /* Close Handle of root duplicated in __kmp_create_worker (tr #62919) */
4135   KA_TRACE(
4136       10, ("__kmp_reset_root: free handle, th = %p, handle = %" KMP_UINTPTR_SPEC
4137            "\n",
4138            (LPVOID) & (root->r.r_uber_thread->th),
4139            root->r.r_uber_thread->th.th_info.ds.ds_thread));
4140   __kmp_free_handle(root->r.r_uber_thread->th.th_info.ds.ds_thread);
4141 #endif /* KMP_OS_WINDOWS */
4142 
4143 #if OMPD_SUPPORT
4144   if (ompd_state & OMPD_ENABLE_BP)
4145     ompd_bp_thread_end();
4146 #endif
4147 
4148 #if OMPT_SUPPORT
4149   ompt_data_t *task_data;
4150   ompt_data_t *parallel_data;
4151   __ompt_get_task_info_internal(0, NULL, &task_data, NULL, &parallel_data,
4152                                 NULL);
4153   if (ompt_enabled.ompt_callback_implicit_task) {
4154     ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
4155         ompt_scope_end, parallel_data, task_data, 0, 1, ompt_task_initial);
4156   }
4157   if (ompt_enabled.ompt_callback_thread_end) {
4158     ompt_callbacks.ompt_callback(ompt_callback_thread_end)(
4159         &(root->r.r_uber_thread->th.ompt_thread_info.thread_data));
4160   }
4161 #endif
4162 
4163   TCW_4(__kmp_nth,
4164         __kmp_nth - 1); // __kmp_reap_thread will decrement __kmp_all_nth.
4165   i = root->r.r_uber_thread->th.th_cg_roots->cg_nthreads--;
4166   KA_TRACE(100, ("__kmp_reset_root: Thread %p decrement cg_nthreads on node %p"
4167                  " to %d\n",
4168                  root->r.r_uber_thread, root->r.r_uber_thread->th.th_cg_roots,
4169                  root->r.r_uber_thread->th.th_cg_roots->cg_nthreads));
4170   if (i == 1) {
4171     // need to free contention group structure
4172     KMP_DEBUG_ASSERT(root->r.r_uber_thread ==
4173                      root->r.r_uber_thread->th.th_cg_roots->cg_root);
4174     KMP_DEBUG_ASSERT(root->r.r_uber_thread->th.th_cg_roots->up == NULL);
4175     __kmp_free(root->r.r_uber_thread->th.th_cg_roots);
4176     root->r.r_uber_thread->th.th_cg_roots = NULL;
4177   }
4178   __kmp_reap_thread(root->r.r_uber_thread, 1);
4179 
4180   // We canot put root thread to __kmp_thread_pool, so we have to reap it
4181   // instead of freeing.
4182   root->r.r_uber_thread = NULL;
4183   /* mark root as no longer in use */
4184   root->r.r_begin = FALSE;
4185 
4186   return n;
4187 }
4188 
4189 void __kmp_unregister_root_current_thread(int gtid) {
4190   KA_TRACE(1, ("__kmp_unregister_root_current_thread: enter T#%d\n", gtid));
4191   /* this lock should be ok, since unregister_root_current_thread is never
4192      called during an abort, only during a normal close. furthermore, if you
4193      have the forkjoin lock, you should never try to get the initz lock */
4194   __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
4195   if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
4196     KC_TRACE(10, ("__kmp_unregister_root_current_thread: already finished, "
4197                   "exiting T#%d\n",
4198                   gtid));
4199     __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
4200     return;
4201   }
4202   kmp_root_t *root = __kmp_root[gtid];
4203 
4204   KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
4205   KMP_ASSERT(KMP_UBER_GTID(gtid));
4206   KMP_ASSERT(root == __kmp_threads[gtid]->th.th_root);
4207   KMP_ASSERT(root->r.r_active == FALSE);
4208 
4209   KMP_MB();
4210 
4211   kmp_info_t *thread = __kmp_threads[gtid];
4212   kmp_team_t *team = thread->th.th_team;
4213   kmp_task_team_t *task_team = thread->th.th_task_team;
4214 
4215   // we need to wait for the proxy tasks before finishing the thread
4216   if (task_team != NULL && (task_team->tt.tt_found_proxy_tasks ||
4217                             task_team->tt.tt_hidden_helper_task_encountered)) {
4218 #if OMPT_SUPPORT
4219     // the runtime is shutting down so we won't report any events
4220     thread->th.ompt_thread_info.state = ompt_state_undefined;
4221 #endif
4222     __kmp_task_team_wait(thread, team USE_ITT_BUILD_ARG(NULL));
4223   }
4224 
4225   __kmp_reset_root(gtid, root);
4226 
4227   KMP_MB();
4228   KC_TRACE(10,
4229            ("__kmp_unregister_root_current_thread: T#%d unregistered\n", gtid));
4230 
4231   __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
4232 }
4233 
#if KMP_OS_WINDOWS
/* Unregisters a root thread other than the calling one.
   The caller must already hold __kmp_forkjoin_lock.
   Returns the number of __kmp_threads entries freed, as reported by
   __kmp_reset_root(). */
static int __kmp_unregister_root_other_thread(int gtid) {
  kmp_root_t *root = __kmp_root[gtid];

  KA_TRACE(1, ("__kmp_unregister_root_other_thread: enter T#%d\n", gtid));

  // The slot must hold an inactive root (uber) thread that belongs to `root`.
  KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
  KMP_ASSERT(KMP_UBER_GTID(gtid));
  KMP_ASSERT(root == __kmp_threads[gtid]->th.th_root);
  KMP_ASSERT(root->r.r_active == FALSE);

  const int freed = __kmp_reset_root(gtid, root);
  KC_TRACE(10,
           ("__kmp_unregister_root_other_thread: T#%d unregistered\n", gtid));
  return freed;
}
#endif
4254 
#if KMP_DEBUG
// Debug-only helper: prints the calling thread's gtid and tid, its thread
// structure, current team, serial team, current task, and the parent of its
// implicit task in the current team.
void __kmp_task_info() {
  const kmp_int32 gtid = __kmp_entry_gtid();
  const kmp_int32 tid = __kmp_tid_from_gtid(gtid);
  kmp_info_t *const self = __kmp_threads[gtid];
  kmp_team_t *const serial_team = self->th.th_serial_team;
  kmp_team_t *const current_team = self->th.th_team;

  __kmp_printf(
      "__kmp_task_info: gtid=%d tid=%d t_thread=%p team=%p steam=%p curtask=%p "
      "ptask=%p\n",
      gtid, tid, self, current_team, serial_team, self->th.th_current_task,
      current_team->t.t_implicit_task_taskdata[tid].td_parent);
}
#endif // KMP_DEBUG
4271 
4272 /* TODO optimize with one big memclr, take out what isn't needed, split
4273    responsibility to workers as much as possible, and delay initialization of
4274    features as much as possible  */
4275 static void __kmp_initialize_info(kmp_info_t *this_thr, kmp_team_t *team,
4276                                   int tid, int gtid) {
4277   /* this_thr->th.th_info.ds.ds_gtid is setup in
4278      kmp_allocate_thread/create_worker.
4279      this_thr->th.th_serial_team is setup in __kmp_allocate_thread */
4280   KMP_DEBUG_ASSERT(this_thr != NULL);
4281   KMP_DEBUG_ASSERT(this_thr->th.th_serial_team);
4282   KMP_DEBUG_ASSERT(team);
4283   KMP_DEBUG_ASSERT(team->t.t_threads);
4284   KMP_DEBUG_ASSERT(team->t.t_dispatch);
4285   kmp_info_t *master = team->t.t_threads[0];
4286   KMP_DEBUG_ASSERT(master);
4287   KMP_DEBUG_ASSERT(master->th.th_root);
4288 
4289   KMP_MB();
4290 
4291   TCW_SYNC_PTR(this_thr->th.th_team, team);
4292 
4293   this_thr->th.th_info.ds.ds_tid = tid;
4294   this_thr->th.th_set_nproc = 0;
4295   if (__kmp_tasking_mode != tskm_immediate_exec)
4296     // When tasking is possible, threads are not safe to reap until they are
4297     // done tasking; this will be set when tasking code is exited in wait
4298     this_thr->th.th_reap_state = KMP_NOT_SAFE_TO_REAP;
4299   else // no tasking --> always safe to reap
4300     this_thr->th.th_reap_state = KMP_SAFE_TO_REAP;
4301   this_thr->th.th_set_proc_bind = proc_bind_default;
4302 
4303 #if KMP_AFFINITY_SUPPORTED
4304   this_thr->th.th_new_place = this_thr->th.th_current_place;
4305 #endif
4306   this_thr->th.th_root = master->th.th_root;
4307 
4308   /* setup the thread's cache of the team structure */
4309   this_thr->th.th_team_nproc = team->t.t_nproc;
4310   this_thr->th.th_team_master = master;
4311   this_thr->th.th_team_serialized = team->t.t_serialized;
4312 
4313   KMP_DEBUG_ASSERT(team->t.t_implicit_task_taskdata);
4314 
4315   KF_TRACE(10, ("__kmp_initialize_info1: T#%d:%d this_thread=%p curtask=%p\n",
4316                 tid, gtid, this_thr, this_thr->th.th_current_task));
4317 
4318   __kmp_init_implicit_task(this_thr->th.th_team_master->th.th_ident, this_thr,
4319                            team, tid, TRUE);
4320 
4321   KF_TRACE(10, ("__kmp_initialize_info2: T#%d:%d this_thread=%p curtask=%p\n",
4322                 tid, gtid, this_thr, this_thr->th.th_current_task));
4323   // TODO: Initialize ICVs from parent; GEH - isn't that already done in
4324   // __kmp_initialize_team()?
4325 
4326   /* TODO no worksharing in speculative threads */
4327   this_thr->th.th_dispatch = &team->t.t_dispatch[tid];
4328 
4329   this_thr->th.th_local.this_construct = 0;
4330 
4331   if (!this_thr->th.th_pri_common) {
4332     this_thr->th.th_pri_common =
4333         (struct common_table *)__kmp_allocate(sizeof(struct common_table));
4334     if (__kmp_storage_map) {
4335       __kmp_print_storage_map_gtid(
4336           gtid, this_thr->th.th_pri_common, this_thr->th.th_pri_common + 1,
4337           sizeof(struct common_table), "th_%d.th_pri_common\n", gtid);
4338     }
4339     this_thr->th.th_pri_head = NULL;
4340   }
4341 
4342   if (this_thr != master && // Primary thread's CG root is initialized elsewhere
4343       this_thr->th.th_cg_roots != master->th.th_cg_roots) { // CG root not set
4344     // Make new thread's CG root same as primary thread's
4345     KMP_DEBUG_ASSERT(master->th.th_cg_roots);
4346     kmp_cg_root_t *tmp = this_thr->th.th_cg_roots;
4347     if (tmp) {
4348       // worker changes CG, need to check if old CG should be freed
4349       int i = tmp->cg_nthreads--;
4350       KA_TRACE(100, ("__kmp_initialize_info: Thread %p decrement cg_nthreads"
4351                      " on node %p of thread %p to %d\n",
4352                      this_thr, tmp, tmp->cg_root, tmp->cg_nthreads));
4353       if (i == 1) {
4354         __kmp_free(tmp); // last thread left CG --> free it
4355       }
4356     }
4357     this_thr->th.th_cg_roots = master->th.th_cg_roots;
4358     // Increment new thread's CG root's counter to add the new thread
4359     this_thr->th.th_cg_roots->cg_nthreads++;
4360     KA_TRACE(100, ("__kmp_initialize_info: Thread %p increment cg_nthreads on"
4361                    " node %p of thread %p to %d\n",
4362                    this_thr, this_thr->th.th_cg_roots,
4363                    this_thr->th.th_cg_roots->cg_root,
4364                    this_thr->th.th_cg_roots->cg_nthreads));
4365     this_thr->th.th_current_task->td_icvs.thread_limit =
4366         this_thr->th.th_cg_roots->cg_thread_limit;
4367   }
4368 
4369   /* Initialize dynamic dispatch */
4370   {
4371     volatile kmp_disp_t *dispatch = this_thr->th.th_dispatch;
4372     // Use team max_nproc since this will never change for the team.
4373     size_t disp_size =
4374         sizeof(dispatch_private_info_t) *
4375         (team->t.t_max_nproc == 1 ? 1 : __kmp_dispatch_num_buffers);
4376     KD_TRACE(10, ("__kmp_initialize_info: T#%d max_nproc: %d\n", gtid,
4377                   team->t.t_max_nproc));
4378     KMP_ASSERT(dispatch);
4379     KMP_DEBUG_ASSERT(team->t.t_dispatch);
4380     KMP_DEBUG_ASSERT(dispatch == &team->t.t_dispatch[tid]);
4381 
4382     dispatch->th_disp_index = 0;
4383     dispatch->th_doacross_buf_idx = 0;
4384     if (!dispatch->th_disp_buffer) {
4385       dispatch->th_disp_buffer =
4386           (dispatch_private_info_t *)__kmp_allocate(disp_size);
4387 
4388       if (__kmp_storage_map) {
4389         __kmp_print_storage_map_gtid(
4390             gtid, &dispatch->th_disp_buffer[0],
4391             &dispatch->th_disp_buffer[team->t.t_max_nproc == 1
4392                                           ? 1
4393                                           : __kmp_dispatch_num_buffers],
4394             disp_size,
4395             "th_%d.th_dispatch.th_disp_buffer "
4396             "(team_%d.t_dispatch[%d].th_disp_buffer)",
4397             gtid, team->t.t_id, gtid);
4398       }
4399     } else {
4400       memset(&dispatch->th_disp_buffer[0], '\0', disp_size);
4401     }
4402 
4403     dispatch->th_dispatch_pr_current = 0;
4404     dispatch->th_dispatch_sh_current = 0;
4405 
4406     dispatch->th_deo_fcn = 0; /* ORDERED     */
4407     dispatch->th_dxo_fcn = 0; /* END ORDERED */
4408   }
4409 
4410   this_thr->th.th_next_pool = NULL;
4411 
4412   KMP_DEBUG_ASSERT(!this_thr->th.th_spin_here);
4413   KMP_DEBUG_ASSERT(this_thr->th.th_next_waiting == 0);
4414 
4415   KMP_MB();
4416 }
4417 
4418 /* allocate a new thread for the requesting team. this is only called from
4419    within a forkjoin critical section. we will first try to get an available
4420    thread from the thread pool. if none is available, we will fork a new one
4421    assuming we are able to create a new one. this should be assured, as the
4422    caller should check on this first. */
4423 kmp_info_t *__kmp_allocate_thread(kmp_root_t *root, kmp_team_t *team,
4424                                   int new_tid) {
4425   kmp_team_t *serial_team;
4426   kmp_info_t *new_thr;
4427   int new_gtid;
4428 
4429   KA_TRACE(20, ("__kmp_allocate_thread: T#%d\n", __kmp_get_gtid()));
4430   KMP_DEBUG_ASSERT(root && team);
4431 #if !KMP_NESTED_HOT_TEAMS
4432   KMP_DEBUG_ASSERT(KMP_MASTER_GTID(__kmp_get_gtid()));
4433 #endif
4434   KMP_MB();
4435 
4436   /* first, try to get one from the thread pool unless allocating thread is
4437    * the main hidden helper thread. The hidden helper team should always
4438    * allocate new OS threads. */
4439   if (__kmp_thread_pool && !KMP_HIDDEN_HELPER_TEAM(team)) {
4440     new_thr = CCAST(kmp_info_t *, __kmp_thread_pool);
4441     __kmp_thread_pool = (volatile kmp_info_t *)new_thr->th.th_next_pool;
4442     if (new_thr == __kmp_thread_pool_insert_pt) {
4443       __kmp_thread_pool_insert_pt = NULL;
4444     }
4445     TCW_4(new_thr->th.th_in_pool, FALSE);
4446     __kmp_suspend_initialize_thread(new_thr);
4447     __kmp_lock_suspend_mx(new_thr);
4448     if (new_thr->th.th_active_in_pool == TRUE) {
4449       KMP_DEBUG_ASSERT(new_thr->th.th_active == TRUE);
4450       KMP_ATOMIC_DEC(&__kmp_thread_pool_active_nth);
4451       new_thr->th.th_active_in_pool = FALSE;
4452     }
4453     __kmp_unlock_suspend_mx(new_thr);
4454 
4455     KA_TRACE(20, ("__kmp_allocate_thread: T#%d using thread T#%d\n",
4456                   __kmp_get_gtid(), new_thr->th.th_info.ds.ds_gtid));
4457     KMP_ASSERT(!new_thr->th.th_team);
4458     KMP_DEBUG_ASSERT(__kmp_nth < __kmp_threads_capacity);
4459 
4460     /* setup the thread structure */
4461     __kmp_initialize_info(new_thr, team, new_tid,
4462                           new_thr->th.th_info.ds.ds_gtid);
4463     KMP_DEBUG_ASSERT(new_thr->th.th_serial_team);
4464 
4465     TCW_4(__kmp_nth, __kmp_nth + 1);
4466 
4467     new_thr->th.th_task_state = 0;
4468 
4469     if (__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
4470       // Make sure pool thread has transitioned to waiting on own thread struct
4471       KMP_DEBUG_ASSERT(new_thr->th.th_used_in_team.load() == 0);
4472       // Thread activated in __kmp_allocate_team when increasing team size
4473     }
4474 
4475 #ifdef KMP_ADJUST_BLOCKTIME
4476     /* Adjust blocktime back to zero if necessary */
4477     /* Middle initialization might not have occurred yet */
4478     if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
4479       if (__kmp_nth > __kmp_avail_proc) {
4480         __kmp_zero_bt = TRUE;
4481       }
4482     }
4483 #endif /* KMP_ADJUST_BLOCKTIME */
4484 
4485 #if KMP_DEBUG
4486     // If thread entered pool via __kmp_free_thread, wait_flag should !=
4487     // KMP_BARRIER_PARENT_FLAG.
4488     int b;
4489     kmp_balign_t *balign = new_thr->th.th_bar;
4490     for (b = 0; b < bs_last_barrier; ++b)
4491       KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
4492 #endif
4493 
4494     KF_TRACE(10, ("__kmp_allocate_thread: T#%d using thread %p T#%d\n",
4495                   __kmp_get_gtid(), new_thr, new_thr->th.th_info.ds.ds_gtid));
4496 
4497     KMP_MB();
4498     return new_thr;
4499   }
4500 
4501   /* no, well fork a new one */
4502   KMP_ASSERT(KMP_HIDDEN_HELPER_TEAM(team) || __kmp_nth == __kmp_all_nth);
4503   KMP_ASSERT(__kmp_all_nth < __kmp_threads_capacity);
4504 
4505 #if KMP_USE_MONITOR
4506   // If this is the first worker thread the RTL is creating, then also
4507   // launch the monitor thread.  We try to do this as early as possible.
4508   if (!TCR_4(__kmp_init_monitor)) {
4509     __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
4510     if (!TCR_4(__kmp_init_monitor)) {
4511       KF_TRACE(10, ("before __kmp_create_monitor\n"));
4512       TCW_4(__kmp_init_monitor, 1);
4513       __kmp_create_monitor(&__kmp_monitor);
4514       KF_TRACE(10, ("after __kmp_create_monitor\n"));
4515 #if KMP_OS_WINDOWS
4516       // AC: wait until monitor has started. This is a fix for CQ232808.
4517       // The reason is that if the library is loaded/unloaded in a loop with
4518       // small (parallel) work in between, then there is high probability that
4519       // monitor thread started after the library shutdown. At shutdown it is
4520       // too late to cope with the problem, because when the primary thread is
4521       // in DllMain (process detach) the monitor has no chances to start (it is
4522       // blocked), and primary thread has no means to inform the monitor that
4523       // the library has gone, because all the memory which the monitor can
4524       // access is going to be released/reset.
4525       while (TCR_4(__kmp_init_monitor) < 2) {
4526         KMP_YIELD(TRUE);
4527       }
4528       KF_TRACE(10, ("after monitor thread has started\n"));
4529 #endif
4530     }
4531     __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
4532   }
4533 #endif
4534 
4535   KMP_MB();
4536 
4537   {
4538     int new_start_gtid = TCR_4(__kmp_init_hidden_helper_threads)
4539                              ? 1
4540                              : __kmp_hidden_helper_threads_num + 1;
4541 
4542     for (new_gtid = new_start_gtid; TCR_PTR(__kmp_threads[new_gtid]) != NULL;
4543          ++new_gtid) {
4544       KMP_DEBUG_ASSERT(new_gtid < __kmp_threads_capacity);
4545     }
4546 
4547     if (TCR_4(__kmp_init_hidden_helper_threads)) {
4548       KMP_DEBUG_ASSERT(new_gtid <= __kmp_hidden_helper_threads_num);
4549     }
4550   }
4551 
4552   /* allocate space for it. */
4553   new_thr = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t));
4554 
4555   new_thr->th.th_nt_strict = false;
4556   new_thr->th.th_nt_loc = NULL;
4557   new_thr->th.th_nt_sev = severity_fatal;
4558   new_thr->th.th_nt_msg = NULL;
4559 
4560   TCW_SYNC_PTR(__kmp_threads[new_gtid], new_thr);
4561 
4562 #if USE_ITT_BUILD && USE_ITT_NOTIFY && KMP_DEBUG
4563   // suppress race conditions detection on synchronization flags in debug mode
4564   // this helps to analyze library internals eliminating false positives
4565   __itt_suppress_mark_range(
4566       __itt_suppress_range, __itt_suppress_threading_errors,
4567       &new_thr->th.th_sleep_loc, sizeof(new_thr->th.th_sleep_loc));
4568   __itt_suppress_mark_range(
4569       __itt_suppress_range, __itt_suppress_threading_errors,
4570       &new_thr->th.th_reap_state, sizeof(new_thr->th.th_reap_state));
4571 #if KMP_OS_WINDOWS
4572   __itt_suppress_mark_range(
4573       __itt_suppress_range, __itt_suppress_threading_errors,
4574       &new_thr->th.th_suspend_init, sizeof(new_thr->th.th_suspend_init));
4575 #else
4576   __itt_suppress_mark_range(__itt_suppress_range,
4577                             __itt_suppress_threading_errors,
4578                             &new_thr->th.th_suspend_init_count,
4579                             sizeof(new_thr->th.th_suspend_init_count));
4580 #endif
4581   // TODO: check if we need to also suppress b_arrived flags
4582   __itt_suppress_mark_range(__itt_suppress_range,
4583                             __itt_suppress_threading_errors,
4584                             CCAST(kmp_uint64 *, &new_thr->th.th_bar[0].bb.b_go),
4585                             sizeof(new_thr->th.th_bar[0].bb.b_go));
4586   __itt_suppress_mark_range(__itt_suppress_range,
4587                             __itt_suppress_threading_errors,
4588                             CCAST(kmp_uint64 *, &new_thr->th.th_bar[1].bb.b_go),
4589                             sizeof(new_thr->th.th_bar[1].bb.b_go));
4590   __itt_suppress_mark_range(__itt_suppress_range,
4591                             __itt_suppress_threading_errors,
4592                             CCAST(kmp_uint64 *, &new_thr->th.th_bar[2].bb.b_go),
4593                             sizeof(new_thr->th.th_bar[2].bb.b_go));
4594 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY && KMP_DEBUG */
4595   if (__kmp_storage_map) {
4596     __kmp_print_thread_storage_map(new_thr, new_gtid);
4597   }
4598 
4599   // add the reserve serialized team, initialized from the team's primary thread
4600   {
4601     kmp_internal_control_t r_icvs = __kmp_get_x_global_icvs(team);
4602     KF_TRACE(10, ("__kmp_allocate_thread: before th_serial/serial_team\n"));
4603     new_thr->th.th_serial_team = serial_team =
4604         (kmp_team_t *)__kmp_allocate_team(root, 1, 1,
4605 #if OMPT_SUPPORT
4606                                           ompt_data_none, // root parallel id
4607 #endif
4608                                           proc_bind_default, &r_icvs,
4609                                           0 USE_NESTED_HOT_ARG(NULL));
4610   }
4611   KMP_ASSERT(serial_team);
4612   serial_team->t.t_serialized = 0; // AC: the team created in reserve, not for
4613   // execution (it is unused for now).
4614   serial_team->t.t_threads[0] = new_thr;
4615   KF_TRACE(10,
4616            ("__kmp_allocate_thread: after th_serial/serial_team : new_thr=%p\n",
4617             new_thr));
4618 
4619   /* setup the thread structures */
4620   __kmp_initialize_info(new_thr, team, new_tid, new_gtid);
4621 
4622 #if USE_FAST_MEMORY
4623   __kmp_initialize_fast_memory(new_thr);
4624 #endif /* USE_FAST_MEMORY */
4625 
4626 #if KMP_USE_BGET
4627   KMP_DEBUG_ASSERT(new_thr->th.th_local.bget_data == NULL);
4628   __kmp_initialize_bget(new_thr);
4629 #endif
4630 
4631   __kmp_init_random(new_thr); // Initialize random number generator
4632 
4633   /* Initialize these only once when thread is grabbed for a team allocation */
4634   KA_TRACE(20,
4635            ("__kmp_allocate_thread: T#%d init go fork=%u, plain=%u\n",
4636             __kmp_get_gtid(), KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
4637 
4638   int b;
4639   kmp_balign_t *balign = new_thr->th.th_bar;
4640   for (b = 0; b < bs_last_barrier; ++b) {
4641     balign[b].bb.b_go = KMP_INIT_BARRIER_STATE;
4642     balign[b].bb.team = NULL;
4643     balign[b].bb.wait_flag = KMP_BARRIER_NOT_WAITING;
4644     balign[b].bb.use_oncore_barrier = 0;
4645   }
4646 
4647   TCW_PTR(new_thr->th.th_sleep_loc, NULL);
4648   new_thr->th.th_sleep_loc_type = flag_unset;
4649 
4650   new_thr->th.th_spin_here = FALSE;
4651   new_thr->th.th_next_waiting = 0;
4652 #if KMP_OS_UNIX
4653   new_thr->th.th_blocking = false;
4654 #endif
4655 
4656 #if KMP_AFFINITY_SUPPORTED
4657   new_thr->th.th_current_place = KMP_PLACE_UNDEFINED;
4658   new_thr->th.th_new_place = KMP_PLACE_UNDEFINED;
4659   new_thr->th.th_first_place = KMP_PLACE_UNDEFINED;
4660   new_thr->th.th_last_place = KMP_PLACE_UNDEFINED;
4661 #endif
4662   new_thr->th.th_def_allocator = __kmp_def_allocator;
4663   new_thr->th.th_prev_level = 0;
4664   new_thr->th.th_prev_num_threads = 1;
4665 
4666   TCW_4(new_thr->th.th_in_pool, FALSE);
4667   new_thr->th.th_active_in_pool = FALSE;
4668   TCW_4(new_thr->th.th_active, TRUE);
4669 
4670   new_thr->th.th_set_nested_nth = NULL;
4671   new_thr->th.th_set_nested_nth_sz = 0;
4672 
4673   /* adjust the global counters */
4674   __kmp_all_nth++;
4675   __kmp_nth++;
4676 
4677   // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search) for low
4678   // numbers of procs, and method #2 (keyed API call) for higher numbers.
4679   if (__kmp_adjust_gtid_mode) {
4680     if (__kmp_all_nth >= __kmp_tls_gtid_min) {
4681       if (TCR_4(__kmp_gtid_mode) != 2) {
4682         TCW_4(__kmp_gtid_mode, 2);
4683       }
4684     } else {
4685       if (TCR_4(__kmp_gtid_mode) != 1) {
4686         TCW_4(__kmp_gtid_mode, 1);
4687       }
4688     }
4689   }
4690 
4691 #ifdef KMP_ADJUST_BLOCKTIME
4692   /* Adjust blocktime back to zero if necessary       */
4693   /* Middle initialization might not have occurred yet */
4694   if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
4695     if (__kmp_nth > __kmp_avail_proc) {
4696       __kmp_zero_bt = TRUE;
4697     }
4698   }
4699 #endif /* KMP_ADJUST_BLOCKTIME */
4700 
4701 #if KMP_AFFINITY_SUPPORTED
4702   // Set the affinity and topology information for new thread
4703   __kmp_affinity_set_init_mask(new_gtid, /*isa_root=*/FALSE);
4704 #endif
4705 
4706   /* actually fork it and create the new worker thread */
4707   KF_TRACE(
4708       10, ("__kmp_allocate_thread: before __kmp_create_worker: %p\n", new_thr));
4709   __kmp_create_worker(new_gtid, new_thr, __kmp_stksize);
4710   KF_TRACE(10,
4711            ("__kmp_allocate_thread: after __kmp_create_worker: %p\n", new_thr));
4712 
4713   KA_TRACE(20, ("__kmp_allocate_thread: T#%d forked T#%d\n", __kmp_get_gtid(),
4714                 new_gtid));
4715   KMP_MB();
4716   return new_thr;
4717 }
4718 
4719 /* Reinitialize team for reuse.
4720    The hot team code calls this case at every fork barrier, so EPCC barrier
4721    test are extremely sensitive to changes in it, esp. writes to the team
4722    struct, which cause a cache invalidation in all threads.
4723    IF YOU TOUCH THIS ROUTINE, RUN EPCC C SYNCBENCH ON A BIG-IRON MACHINE!!! */
4724 static void __kmp_reinitialize_team(kmp_team_t *team,
4725                                     kmp_internal_control_t *new_icvs,
4726                                     ident_t *loc) {
4727   KF_TRACE(10, ("__kmp_reinitialize_team: enter this_thread=%p team=%p\n",
4728                 team->t.t_threads[0], team));
4729   KMP_DEBUG_ASSERT(team && new_icvs);
4730   KMP_DEBUG_ASSERT((!TCR_4(__kmp_init_parallel)) || new_icvs->nproc);
4731   KMP_CHECK_UPDATE(team->t.t_ident, loc);
4732 
4733   KMP_CHECK_UPDATE(team->t.t_id, KMP_GEN_TEAM_ID());
4734   // Copy ICVs to the primary thread's implicit taskdata
4735   __kmp_init_implicit_task(loc, team->t.t_threads[0], team, 0, FALSE);
4736   copy_icvs(&team->t.t_implicit_task_taskdata[0].td_icvs, new_icvs);
4737 
4738   KF_TRACE(10, ("__kmp_reinitialize_team: exit this_thread=%p team=%p\n",
4739                 team->t.t_threads[0], team));
4740 }
4741 
4742 /* Initialize the team data structure.
4743    This assumes the t_threads and t_max_nproc are already set.
4744    Also, we don't touch the arguments */
4745 static void __kmp_initialize_team(kmp_team_t *team, int new_nproc,
4746                                   kmp_internal_control_t *new_icvs,
4747                                   ident_t *loc) {
4748   KF_TRACE(10, ("__kmp_initialize_team: enter: team=%p\n", team));
4749 
4750   /* verify */
4751   KMP_DEBUG_ASSERT(team);
4752   KMP_DEBUG_ASSERT(new_nproc <= team->t.t_max_nproc);
4753   KMP_DEBUG_ASSERT(team->t.t_threads);
4754   KMP_MB();
4755 
4756   team->t.t_master_tid = 0; /* not needed */
4757   /* team->t.t_master_bar;        not needed */
4758   team->t.t_serialized = new_nproc > 1 ? 0 : 1;
4759   team->t.t_nproc = new_nproc;
4760 
4761   /* team->t.t_parent     = NULL; TODO not needed & would mess up hot team */
4762   team->t.t_next_pool = NULL;
4763   /* memset( team->t.t_threads, 0, sizeof(kmp_info_t*)*new_nproc ); would mess
4764    * up hot team */
4765 
4766   TCW_SYNC_PTR(team->t.t_pkfn, NULL); /* not needed */
4767   team->t.t_invoke = NULL; /* not needed */
4768 
4769   // TODO???: team->t.t_max_active_levels       = new_max_active_levels;
4770   team->t.t_sched.sched = new_icvs->sched.sched;
4771 
4772 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
4773   team->t.t_fp_control_saved = FALSE; /* not needed */
4774   team->t.t_x87_fpu_control_word = 0; /* not needed */
4775   team->t.t_mxcsr = 0; /* not needed */
4776 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
4777 
4778   team->t.t_construct = 0;
4779 
4780   team->t.t_ordered.dt.t_value = 0;
4781   team->t.t_master_active = FALSE;
4782 
4783 #ifdef KMP_DEBUG
4784   team->t.t_copypriv_data = NULL; /* not necessary, but nice for debugging */
4785 #endif
4786 #if KMP_OS_WINDOWS
4787   team->t.t_copyin_counter = 0; /* for barrier-free copyin implementation */
4788 #endif
4789 
4790   team->t.t_control_stack_top = NULL;
4791 
4792   __kmp_reinitialize_team(team, new_icvs, loc);
4793 
4794   KMP_MB();
4795   KF_TRACE(10, ("__kmp_initialize_team: exit: team=%p\n", team));
4796 }
4797 
4798 #if KMP_AFFINITY_SUPPORTED
4799 static inline void __kmp_set_thread_place(kmp_team_t *team, kmp_info_t *th,
4800                                           int first, int last, int newp) {
4801   th->th.th_first_place = first;
4802   th->th.th_last_place = last;
4803   th->th.th_new_place = newp;
4804   if (newp != th->th.th_current_place) {
4805     if (__kmp_display_affinity && team->t.t_display_affinity != 1)
4806       team->t.t_display_affinity = 1;
4807     // Copy topology information associated with the new place
4808     th->th.th_topology_ids = __kmp_affinity.ids[th->th.th_new_place];
4809     th->th.th_topology_attrs = __kmp_affinity.attrs[th->th.th_new_place];
4810   }
4811 }
4812 
4813 // __kmp_partition_places() is the heart of the OpenMP 4.0 affinity mechanism.
4814 // It calculates the worker + primary thread's partition based upon the parent
4815 // thread's partition, and binds each worker to a thread in their partition.
4816 // The primary thread's partition should already include its current binding.
4817 static void __kmp_partition_places(kmp_team_t *team, int update_master_only) {
4818   // Do not partition places for the hidden helper team
4819   if (KMP_HIDDEN_HELPER_TEAM(team))
4820     return;
4821   // Copy the primary thread's place partition to the team struct
4822   kmp_info_t *master_th = team->t.t_threads[0];
4823   KMP_DEBUG_ASSERT(master_th != NULL);
4824   kmp_proc_bind_t proc_bind = team->t.t_proc_bind;
4825   int first_place = master_th->th.th_first_place;
4826   int last_place = master_th->th.th_last_place;
4827   int masters_place = master_th->th.th_current_place;
4828   int num_masks = __kmp_affinity.num_masks;
4829   team->t.t_first_place = first_place;
4830   team->t.t_last_place = last_place;
4831 
4832   KA_TRACE(20, ("__kmp_partition_places: enter: proc_bind = %d T#%d(%d:0) "
4833                 "bound to place %d partition = [%d,%d]\n",
4834                 proc_bind, __kmp_gtid_from_thread(team->t.t_threads[0]),
4835                 team->t.t_id, masters_place, first_place, last_place));
4836 
4837   switch (proc_bind) {
4838 
4839   case proc_bind_default:
4840     // Serial teams might have the proc_bind policy set to proc_bind_default.
4841     // Not an issue -- we don't rebind primary thread for any proc_bind policy.
4842     KMP_DEBUG_ASSERT(team->t.t_nproc == 1);
4843     break;
4844 
4845   case proc_bind_primary: {
4846     int f;
4847     int n_th = team->t.t_nproc;
4848     for (f = 1; f < n_th; f++) {
4849       kmp_info_t *th = team->t.t_threads[f];
4850       KMP_DEBUG_ASSERT(th != NULL);
4851       __kmp_set_thread_place(team, th, first_place, last_place, masters_place);
4852 
4853       KA_TRACE(100, ("__kmp_partition_places: primary: T#%d(%d:%d) place %d "
4854                      "partition = [%d,%d]\n",
4855                      __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id,
4856                      f, masters_place, first_place, last_place));
4857     }
4858   } break;
4859 
4860   case proc_bind_close: {
4861     int f;
4862     int n_th = team->t.t_nproc;
4863     int n_places;
4864     if (first_place <= last_place) {
4865       n_places = last_place - first_place + 1;
4866     } else {
4867       n_places = num_masks - first_place + last_place + 1;
4868     }
4869     if (n_th <= n_places) {
4870       int place = masters_place;
4871       for (f = 1; f < n_th; f++) {
4872         kmp_info_t *th = team->t.t_threads[f];
4873         KMP_DEBUG_ASSERT(th != NULL);
4874 
4875         if (place == last_place) {
4876           place = first_place;
4877         } else if (place == (num_masks - 1)) {
4878           place = 0;
4879         } else {
4880           place++;
4881         }
4882         __kmp_set_thread_place(team, th, first_place, last_place, place);
4883 
4884         KA_TRACE(100, ("__kmp_partition_places: close: T#%d(%d:%d) place %d "
4885                        "partition = [%d,%d]\n",
4886                        __kmp_gtid_from_thread(team->t.t_threads[f]),
4887                        team->t.t_id, f, place, first_place, last_place));
4888       }
4889     } else {
4890       int S, rem, gap, s_count;
4891       S = n_th / n_places;
4892       s_count = 0;
4893       rem = n_th - (S * n_places);
4894       gap = rem > 0 ? n_places / rem : n_places;
4895       int place = masters_place;
4896       int gap_ct = gap;
4897       for (f = 0; f < n_th; f++) {
4898         kmp_info_t *th = team->t.t_threads[f];
4899         KMP_DEBUG_ASSERT(th != NULL);
4900 
4901         __kmp_set_thread_place(team, th, first_place, last_place, place);
4902         s_count++;
4903 
4904         if ((s_count == S) && rem && (gap_ct == gap)) {
4905           // do nothing, add an extra thread to place on next iteration
4906         } else if ((s_count == S + 1) && rem && (gap_ct == gap)) {
4907           // we added an extra thread to this place; move to next place
4908           if (place == last_place) {
4909             place = first_place;
4910           } else if (place == (num_masks - 1)) {
4911             place = 0;
4912           } else {
4913             place++;
4914           }
4915           s_count = 0;
4916           gap_ct = 1;
4917           rem--;
4918         } else if (s_count == S) { // place full; don't add extra
4919           if (place == last_place) {
4920             place = first_place;
4921           } else if (place == (num_masks - 1)) {
4922             place = 0;
4923           } else {
4924             place++;
4925           }
4926           gap_ct++;
4927           s_count = 0;
4928         }
4929 
4930         KA_TRACE(100,
4931                  ("__kmp_partition_places: close: T#%d(%d:%d) place %d "
4932                   "partition = [%d,%d]\n",
4933                   __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id, f,
4934                   th->th.th_new_place, first_place, last_place));
4935       }
4936       KMP_DEBUG_ASSERT(place == masters_place);
4937     }
4938   } break;
4939 
4940   case proc_bind_spread: {
4941     int f;
4942     int n_th = team->t.t_nproc;
4943     int n_places;
4944     int thidx;
4945     if (first_place <= last_place) {
4946       n_places = last_place - first_place + 1;
4947     } else {
4948       n_places = num_masks - first_place + last_place + 1;
4949     }
4950     if (n_th <= n_places) {
4951       int place = -1;
4952 
4953       if (n_places != num_masks) {
4954         int S = n_places / n_th;
4955         int s_count, rem, gap, gap_ct;
4956 
4957         place = masters_place;
4958         rem = n_places - n_th * S;
4959         gap = rem ? n_th / rem : 1;
4960         gap_ct = gap;
4961         thidx = n_th;
4962         if (update_master_only == 1)
4963           thidx = 1;
4964         for (f = 0; f < thidx; f++) {
4965           kmp_info_t *th = team->t.t_threads[f];
4966           KMP_DEBUG_ASSERT(th != NULL);
4967 
4968           int fplace = place, nplace = place;
4969           s_count = 1;
4970           while (s_count < S) {
4971             if (place == last_place) {
4972               place = first_place;
4973             } else if (place == (num_masks - 1)) {
4974               place = 0;
4975             } else {
4976               place++;
4977             }
4978             s_count++;
4979           }
4980           if (rem && (gap_ct == gap)) {
4981             if (place == last_place) {
4982               place = first_place;
4983             } else if (place == (num_masks - 1)) {
4984               place = 0;
4985             } else {
4986               place++;
4987             }
4988             rem--;
4989             gap_ct = 0;
4990           }
4991           __kmp_set_thread_place(team, th, fplace, place, nplace);
4992           gap_ct++;
4993 
4994           if (place == last_place) {
4995             place = first_place;
4996           } else if (place == (num_masks - 1)) {
4997             place = 0;
4998           } else {
4999             place++;
5000           }
5001 
5002           KA_TRACE(100,
5003                    ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
5004                     "partition = [%d,%d], num_masks: %u\n",
5005                     __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id,
5006                     f, th->th.th_new_place, th->th.th_first_place,
5007                     th->th.th_last_place, num_masks));
5008         }
5009       } else {
5010         /* Having uniform space of available computation places I can create
5011            T partitions of round(P/T) size and put threads into the first
5012            place of each partition. */
5013         double current = static_cast<double>(masters_place);
5014         double spacing =
5015             (static_cast<double>(n_places + 1) / static_cast<double>(n_th));
5016         int first, last;
5017         kmp_info_t *th;
5018 
5019         thidx = n_th + 1;
5020         if (update_master_only == 1)
5021           thidx = 1;
5022         for (f = 0; f < thidx; f++) {
5023           first = static_cast<int>(current);
5024           last = static_cast<int>(current + spacing) - 1;
5025           KMP_DEBUG_ASSERT(last >= first);
5026           if (first >= n_places) {
5027             if (masters_place) {
5028               first -= n_places;
5029               last -= n_places;
5030               if (first == (masters_place + 1)) {
5031                 KMP_DEBUG_ASSERT(f == n_th);
5032                 first--;
5033               }
5034               if (last == masters_place) {
5035                 KMP_DEBUG_ASSERT(f == (n_th - 1));
5036                 last--;
5037               }
5038             } else {
5039               KMP_DEBUG_ASSERT(f == n_th);
5040               first = 0;
5041               last = 0;
5042             }
5043           }
5044           if (last >= n_places) {
5045             last = (n_places - 1);
5046           }
5047           place = first;
5048           current += spacing;
5049           if (f < n_th) {
5050             KMP_DEBUG_ASSERT(0 <= first);
5051             KMP_DEBUG_ASSERT(n_places > first);
5052             KMP_DEBUG_ASSERT(0 <= last);
5053             KMP_DEBUG_ASSERT(n_places > last);
5054             KMP_DEBUG_ASSERT(last_place >= first_place);
5055             th = team->t.t_threads[f];
5056             KMP_DEBUG_ASSERT(th);
5057             __kmp_set_thread_place(team, th, first, last, place);
5058             KA_TRACE(100,
5059                      ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
5060                       "partition = [%d,%d], spacing = %.4f\n",
5061                       __kmp_gtid_from_thread(team->t.t_threads[f]),
5062                       team->t.t_id, f, th->th.th_new_place,
5063                       th->th.th_first_place, th->th.th_last_place, spacing));
5064           }
5065         }
5066       }
5067       KMP_DEBUG_ASSERT(update_master_only || place == masters_place);
5068     } else {
5069       int S, rem, gap, s_count;
5070       S = n_th / n_places;
5071       s_count = 0;
5072       rem = n_th - (S * n_places);
5073       gap = rem > 0 ? n_places / rem : n_places;
5074       int place = masters_place;
5075       int gap_ct = gap;
5076       thidx = n_th;
5077       if (update_master_only == 1)
5078         thidx = 1;
5079       for (f = 0; f < thidx; f++) {
5080         kmp_info_t *th = team->t.t_threads[f];
5081         KMP_DEBUG_ASSERT(th != NULL);
5082 
5083         __kmp_set_thread_place(team, th, place, place, place);
5084         s_count++;
5085 
5086         if ((s_count == S) && rem && (gap_ct == gap)) {
5087           // do nothing, add an extra thread to place on next iteration
5088         } else if ((s_count == S + 1) && rem && (gap_ct == gap)) {
5089           // we added an extra thread to this place; move on to next place
5090           if (place == last_place) {
5091             place = first_place;
5092           } else if (place == (num_masks - 1)) {
5093             place = 0;
5094           } else {
5095             place++;
5096           }
5097           s_count = 0;
5098           gap_ct = 1;
5099           rem--;
5100         } else if (s_count == S) { // place is full; don't add extra thread
5101           if (place == last_place) {
5102             place = first_place;
5103           } else if (place == (num_masks - 1)) {
5104             place = 0;
5105           } else {
5106             place++;
5107           }
5108           gap_ct++;
5109           s_count = 0;
5110         }
5111 
5112         KA_TRACE(100, ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
5113                        "partition = [%d,%d]\n",
5114                        __kmp_gtid_from_thread(team->t.t_threads[f]),
5115                        team->t.t_id, f, th->th.th_new_place,
5116                        th->th.th_first_place, th->th.th_last_place));
5117       }
5118       KMP_DEBUG_ASSERT(update_master_only || place == masters_place);
5119     }
5120   } break;
5121 
5122   default:
5123     break;
5124   }
5125 
5126   KA_TRACE(20, ("__kmp_partition_places: exit T#%d\n", team->t.t_id));
5127 }
5128 
5129 #endif // KMP_AFFINITY_SUPPORTED
5130 
5131 /* allocate a new team data structure to use.  take one off of the free pool if
5132    available */
5133 kmp_team_t *
5134 __kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc,
5135 #if OMPT_SUPPORT
5136                     ompt_data_t ompt_parallel_data,
5137 #endif
5138                     kmp_proc_bind_t new_proc_bind,
5139                     kmp_internal_control_t *new_icvs,
5140                     int argc USE_NESTED_HOT_ARG(kmp_info_t *master)) {
5141   KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_allocate_team);
5142   int f;
5143   kmp_team_t *team;
5144   int use_hot_team = !root->r.r_active;
5145   int level = 0;
5146   int do_place_partition = 1;
5147 
5148   KA_TRACE(20, ("__kmp_allocate_team: called\n"));
5149   KMP_DEBUG_ASSERT(new_nproc >= 1 && argc >= 0);
5150   KMP_DEBUG_ASSERT(max_nproc >= new_nproc);
5151   KMP_MB();
5152 
5153 #if KMP_NESTED_HOT_TEAMS
5154   kmp_hot_team_ptr_t *hot_teams;
5155   if (master) {
5156     team = master->th.th_team;
5157     level = team->t.t_active_level;
5158     if (master->th.th_teams_microtask) { // in teams construct?
5159       if (master->th.th_teams_size.nteams > 1 &&
5160           ( // #teams > 1
5161               team->t.t_pkfn ==
5162                   (microtask_t)__kmp_teams_master || // inner fork of the teams
5163               master->th.th_teams_level <
5164                   team->t.t_level)) { // or nested parallel inside the teams
5165         ++level; // not increment if #teams==1, or for outer fork of the teams;
5166         // increment otherwise
5167       }
5168       // Do not perform the place partition if inner fork of the teams
5169       // Wait until nested parallel region encountered inside teams construct
5170       if ((master->th.th_teams_size.nteams == 1 &&
5171            master->th.th_teams_level >= team->t.t_level) ||
5172           (team->t.t_pkfn == (microtask_t)__kmp_teams_master))
5173         do_place_partition = 0;
5174     }
5175     hot_teams = master->th.th_hot_teams;
5176     if (level < __kmp_hot_teams_max_level && hot_teams &&
5177         hot_teams[level].hot_team) {
5178       // hot team has already been allocated for given level
5179       use_hot_team = 1;
5180     } else {
5181       use_hot_team = 0;
5182     }
5183   } else {
5184     // check we won't access uninitialized hot_teams, just in case
5185     KMP_DEBUG_ASSERT(new_nproc == 1);
5186   }
5187 #endif
5188   // Optimization to use a "hot" team
5189   if (use_hot_team && new_nproc > 1) {
5190     KMP_DEBUG_ASSERT(new_nproc <= max_nproc);
5191 #if KMP_NESTED_HOT_TEAMS
5192     team = hot_teams[level].hot_team;
5193 #else
5194     team = root->r.r_hot_team;
5195 #endif
5196 #if KMP_DEBUG
5197     if (__kmp_tasking_mode != tskm_immediate_exec) {
5198       KA_TRACE(20, ("__kmp_allocate_team: hot team task_team[0] = %p "
5199                     "task_team[1] = %p before reinit\n",
5200                     team->t.t_task_team[0], team->t.t_task_team[1]));
5201     }
5202 #endif
5203 
5204     if (team->t.t_nproc != new_nproc &&
5205         __kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5206       // Distributed barrier may need a resize
5207       int old_nthr = team->t.t_nproc;
5208       __kmp_resize_dist_barrier(team, old_nthr, new_nproc);
5209     }
5210 
5211     // If not doing the place partition, then reset the team's proc bind
5212     // to indicate that partitioning of all threads still needs to take place
5213     if (do_place_partition == 0)
5214       team->t.t_proc_bind = proc_bind_default;
5215     // Has the number of threads changed?
5216     /* Let's assume the most common case is that the number of threads is
5217        unchanged, and put that case first. */
5218     if (team->t.t_nproc == new_nproc) { // Check changes in number of threads
5219       KA_TRACE(20, ("__kmp_allocate_team: reusing hot team\n"));
5220       // This case can mean that omp_set_num_threads() was called and the hot
5221       // team size was already reduced, so we check the special flag
5222       if (team->t.t_size_changed == -1) {
5223         team->t.t_size_changed = 1;
5224       } else {
5225         KMP_CHECK_UPDATE(team->t.t_size_changed, 0);
5226       }
5227 
5228       // TODO???: team->t.t_max_active_levels = new_max_active_levels;
5229       kmp_r_sched_t new_sched = new_icvs->sched;
5230       // set primary thread's schedule as new run-time schedule
5231       KMP_CHECK_UPDATE(team->t.t_sched.sched, new_sched.sched);
5232 
5233       __kmp_reinitialize_team(team, new_icvs,
5234                               root->r.r_uber_thread->th.th_ident);
5235 
5236       KF_TRACE(10, ("__kmp_allocate_team2: T#%d, this_thread=%p team=%p\n", 0,
5237                     team->t.t_threads[0], team));
5238       __kmp_push_current_task_to_thread(team->t.t_threads[0], team, 0);
5239 
5240 #if KMP_AFFINITY_SUPPORTED
5241       if ((team->t.t_size_changed == 0) &&
5242           (team->t.t_proc_bind == new_proc_bind)) {
5243         if (new_proc_bind == proc_bind_spread) {
5244           if (do_place_partition) {
5245             // add flag to update only master for spread
5246             __kmp_partition_places(team, 1);
5247           }
5248         }
5249         KA_TRACE(200, ("__kmp_allocate_team: reusing hot team #%d bindings: "
5250                        "proc_bind = %d, partition = [%d,%d]\n",
5251                        team->t.t_id, new_proc_bind, team->t.t_first_place,
5252                        team->t.t_last_place));
5253       } else {
5254         if (do_place_partition) {
5255           KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5256           __kmp_partition_places(team);
5257         }
5258       }
5259 #else
5260       KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5261 #endif /* KMP_AFFINITY_SUPPORTED */
5262     } else if (team->t.t_nproc > new_nproc) {
5263       KA_TRACE(20,
5264                ("__kmp_allocate_team: decreasing hot team thread count to %d\n",
5265                 new_nproc));
5266 
5267       team->t.t_size_changed = 1;
5268       if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5269         // Barrier size already reduced earlier in this function
5270         // Activate team threads via th_used_in_team
5271         __kmp_add_threads_to_team(team, new_nproc);
5272       }
5273       // When decreasing team size, threads no longer in the team should
5274       // unref task team.
5275       if (__kmp_tasking_mode != tskm_immediate_exec) {
5276         for (f = new_nproc; f < team->t.t_nproc; f++) {
5277           kmp_info_t *th = team->t.t_threads[f];
5278           KMP_DEBUG_ASSERT(th);
5279           th->th.th_task_team = NULL;
5280         }
5281       }
5282 #if KMP_NESTED_HOT_TEAMS
5283       if (__kmp_hot_teams_mode == 0) {
5284         // AC: saved number of threads should correspond to team's value in this
5285         // mode, can be bigger in mode 1, when hot team has threads in reserve
5286         KMP_DEBUG_ASSERT(hot_teams[level].hot_team_nth == team->t.t_nproc);
5287         hot_teams[level].hot_team_nth = new_nproc;
5288 #endif // KMP_NESTED_HOT_TEAMS
5289         /* release the extra threads we don't need any more */
5290         for (f = new_nproc; f < team->t.t_nproc; f++) {
5291           KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5292           __kmp_free_thread(team->t.t_threads[f]);
5293           team->t.t_threads[f] = NULL;
5294         }
5295 #if KMP_NESTED_HOT_TEAMS
5296       } // (__kmp_hot_teams_mode == 0)
5297       else {
5298         // When keeping extra threads in team, switch threads to wait on own
5299         // b_go flag
5300         for (f = new_nproc; f < team->t.t_nproc; ++f) {
5301           KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5302           kmp_balign_t *balign = team->t.t_threads[f]->th.th_bar;
5303           for (int b = 0; b < bs_last_barrier; ++b) {
5304             if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG) {
5305               balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG;
5306             }
5307             KMP_CHECK_UPDATE(balign[b].bb.leaf_kids, 0);
5308           }
5309         }
5310       }
5311 #endif // KMP_NESTED_HOT_TEAMS
5312       team->t.t_nproc = new_nproc;
5313       // TODO???: team->t.t_max_active_levels = new_max_active_levels;
5314       KMP_CHECK_UPDATE(team->t.t_sched.sched, new_icvs->sched.sched);
5315       __kmp_reinitialize_team(team, new_icvs,
5316                               root->r.r_uber_thread->th.th_ident);
5317 
5318       // Update remaining threads
5319       for (f = 0; f < new_nproc; ++f) {
5320         team->t.t_threads[f]->th.th_team_nproc = new_nproc;
5321       }
5322 
5323       // restore the current task state of the primary thread: should be the
5324       // implicit task
5325       KF_TRACE(10, ("__kmp_allocate_team: T#%d, this_thread=%p team=%p\n", 0,
5326                     team->t.t_threads[0], team));
5327 
5328       __kmp_push_current_task_to_thread(team->t.t_threads[0], team, 0);
5329 
5330 #ifdef KMP_DEBUG
5331       for (f = 0; f < team->t.t_nproc; f++) {
5332         KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
5333                          team->t.t_threads[f]->th.th_team_nproc ==
5334                              team->t.t_nproc);
5335       }
5336 #endif
5337 
5338       if (do_place_partition) {
5339         KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5340 #if KMP_AFFINITY_SUPPORTED
5341         __kmp_partition_places(team);
5342 #endif
5343       }
5344     } else { // team->t.t_nproc < new_nproc
5345 
5346       KA_TRACE(20,
5347                ("__kmp_allocate_team: increasing hot team thread count to %d\n",
5348                 new_nproc));
5349       int old_nproc = team->t.t_nproc; // save old value and use to update only
5350       team->t.t_size_changed = 1;
5351 
5352 #if KMP_NESTED_HOT_TEAMS
5353       int avail_threads = hot_teams[level].hot_team_nth;
5354       if (new_nproc < avail_threads)
5355         avail_threads = new_nproc;
5356       kmp_info_t **other_threads = team->t.t_threads;
5357       for (f = team->t.t_nproc; f < avail_threads; ++f) {
5358         // Adjust barrier data of reserved threads (if any) of the team
5359         // Other data will be set in __kmp_initialize_info() below.
5360         int b;
5361         kmp_balign_t *balign = other_threads[f]->th.th_bar;
5362         for (b = 0; b < bs_last_barrier; ++b) {
5363           balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5364           KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
5365 #if USE_DEBUGGER
5366           balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5367 #endif
5368         }
5369       }
5370       if (hot_teams[level].hot_team_nth >= new_nproc) {
5371         // we have all needed threads in reserve, no need to allocate any
5372         // this only possible in mode 1, cannot have reserved threads in mode 0
5373         KMP_DEBUG_ASSERT(__kmp_hot_teams_mode == 1);
5374         team->t.t_nproc = new_nproc; // just get reserved threads involved
5375       } else {
5376         // We may have some threads in reserve, but not enough;
5377         // get reserved threads involved if any.
5378         team->t.t_nproc = hot_teams[level].hot_team_nth;
5379         hot_teams[level].hot_team_nth = new_nproc; // adjust hot team max size
5380 #endif // KMP_NESTED_HOT_TEAMS
5381         if (team->t.t_max_nproc < new_nproc) {
5382           /* reallocate larger arrays */
5383           __kmp_reallocate_team_arrays(team, new_nproc);
5384           __kmp_reinitialize_team(team, new_icvs, NULL);
5385         }
5386 
5387 #if (KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_NETBSD || KMP_OS_DRAGONFLY) &&   \
5388     KMP_AFFINITY_SUPPORTED
5389         /* Temporarily set full mask for primary thread before creation of
5390            workers. The reason is that workers inherit the affinity from the
5391            primary thread, so if a lot of workers are created on the single
5392            core quickly, they don't get a chance to set their own affinity for
5393            a long time. */
5394         kmp_affinity_raii_t new_temp_affinity{__kmp_affin_fullMask};
5395 #endif
5396 
5397         /* allocate new threads for the hot team */
5398         for (f = team->t.t_nproc; f < new_nproc; f++) {
5399           kmp_info_t *new_worker = __kmp_allocate_thread(root, team, f);
5400           KMP_DEBUG_ASSERT(new_worker);
5401           team->t.t_threads[f] = new_worker;
5402 
5403           KA_TRACE(20,
5404                    ("__kmp_allocate_team: team %d init T#%d arrived: "
5405                     "join=%llu, plain=%llu\n",
5406                     team->t.t_id, __kmp_gtid_from_tid(f, team), team->t.t_id, f,
5407                     team->t.t_bar[bs_forkjoin_barrier].b_arrived,
5408                     team->t.t_bar[bs_plain_barrier].b_arrived));
5409 
5410           { // Initialize barrier data for new threads.
5411             int b;
5412             kmp_balign_t *balign = new_worker->th.th_bar;
5413             for (b = 0; b < bs_last_barrier; ++b) {
5414               balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5415               KMP_DEBUG_ASSERT(balign[b].bb.wait_flag !=
5416                                KMP_BARRIER_PARENT_FLAG);
5417 #if USE_DEBUGGER
5418               balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5419 #endif
5420             }
5421           }
5422         }
5423 
5424 #if (KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_NETBSD || KMP_OS_DRAGONFLY) &&   \
5425     KMP_AFFINITY_SUPPORTED
5426         /* Restore initial primary thread's affinity mask */
5427         new_temp_affinity.restore();
5428 #endif
5429 #if KMP_NESTED_HOT_TEAMS
5430       } // end of check of t_nproc vs. new_nproc vs. hot_team_nth
5431 #endif // KMP_NESTED_HOT_TEAMS
5432       if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5433         // Barrier size already increased earlier in this function
5434         // Activate team threads via th_used_in_team
5435         __kmp_add_threads_to_team(team, new_nproc);
5436       }
5437       /* make sure everyone is syncronized */
5438       // new threads below
5439       __kmp_initialize_team(team, new_nproc, new_icvs,
5440                             root->r.r_uber_thread->th.th_ident);
5441 
5442       /* reinitialize the threads */
5443       KMP_DEBUG_ASSERT(team->t.t_nproc == new_nproc);
5444       for (f = 0; f < team->t.t_nproc; ++f)
5445         __kmp_initialize_info(team->t.t_threads[f], team, f,
5446                               __kmp_gtid_from_tid(f, team));
5447 
5448       // set th_task_state for new threads in hot team with older thread's state
5449       kmp_uint8 old_state = team->t.t_threads[old_nproc - 1]->th.th_task_state;
5450       for (f = old_nproc; f < team->t.t_nproc; ++f)
5451         team->t.t_threads[f]->th.th_task_state = old_state;
5452 
5453 #ifdef KMP_DEBUG
5454       for (f = 0; f < team->t.t_nproc; ++f) {
5455         KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
5456                          team->t.t_threads[f]->th.th_team_nproc ==
5457                              team->t.t_nproc);
5458       }
5459 #endif
5460 
5461       if (do_place_partition) {
5462         KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5463 #if KMP_AFFINITY_SUPPORTED
5464         __kmp_partition_places(team);
5465 #endif
5466       }
5467     } // Check changes in number of threads
5468 
5469     if (master->th.th_teams_microtask) {
5470       for (f = 1; f < new_nproc; ++f) {
5471         // propagate teams construct specific info to workers
5472         kmp_info_t *thr = team->t.t_threads[f];
5473         thr->th.th_teams_microtask = master->th.th_teams_microtask;
5474         thr->th.th_teams_level = master->th.th_teams_level;
5475         thr->th.th_teams_size = master->th.th_teams_size;
5476       }
5477     }
5478 #if KMP_NESTED_HOT_TEAMS
5479     if (level) {
5480       // Sync barrier state for nested hot teams, not needed for outermost hot
5481       // team.
5482       for (f = 1; f < new_nproc; ++f) {
5483         kmp_info_t *thr = team->t.t_threads[f];
5484         int b;
5485         kmp_balign_t *balign = thr->th.th_bar;
5486         for (b = 0; b < bs_last_barrier; ++b) {
5487           balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5488           KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
5489 #if USE_DEBUGGER
5490           balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5491 #endif
5492         }
5493       }
5494     }
5495 #endif // KMP_NESTED_HOT_TEAMS
5496 
5497     /* reallocate space for arguments if necessary */
5498     __kmp_alloc_argv_entries(argc, team, TRUE);
5499     KMP_CHECK_UPDATE(team->t.t_argc, argc);
5500     // The hot team re-uses the previous task team,
5501     // if untouched during the previous release->gather phase.
5502 
5503     KF_TRACE(10, (" hot_team = %p\n", team));
5504 
5505 #if KMP_DEBUG
5506     if (__kmp_tasking_mode != tskm_immediate_exec) {
5507       KA_TRACE(20, ("__kmp_allocate_team: hot team task_team[0] = %p "
5508                     "task_team[1] = %p after reinit\n",
5509                     team->t.t_task_team[0], team->t.t_task_team[1]));
5510     }
5511 #endif
5512 
5513 #if OMPT_SUPPORT
5514     __ompt_team_assign_id(team, ompt_parallel_data);
5515 #endif
5516 
5517     KMP_MB();
5518 
5519     return team;
5520   }
5521 
5522   /* next, let's try to take one from the team pool */
5523   KMP_MB();
5524   for (team = CCAST(kmp_team_t *, __kmp_team_pool); (team);) {
5525     /* TODO: consider resizing undersized teams instead of reaping them, now
5526        that we have a resizing mechanism */
5527     if (team->t.t_max_nproc >= max_nproc) {
5528       /* take this team from the team pool */
5529       __kmp_team_pool = team->t.t_next_pool;
5530 
5531       if (max_nproc > 1 &&
5532           __kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5533         if (!team->t.b) { // Allocate barrier structure
5534           team->t.b = distributedBarrier::allocate(__kmp_dflt_team_nth_ub);
5535         }
5536       }
5537 
5538       /* setup the team for fresh use */
5539       __kmp_initialize_team(team, new_nproc, new_icvs, NULL);
5540 
5541       KA_TRACE(20, ("__kmp_allocate_team: setting task_team[0] %p and "
5542                     "task_team[1] %p to NULL\n",
5543                     &team->t.t_task_team[0], &team->t.t_task_team[1]));
5544       team->t.t_task_team[0] = NULL;
5545       team->t.t_task_team[1] = NULL;
5546 
5547       /* reallocate space for arguments if necessary */
5548       __kmp_alloc_argv_entries(argc, team, TRUE);
5549       KMP_CHECK_UPDATE(team->t.t_argc, argc);
5550 
5551       KA_TRACE(
5552           20, ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n",
5553                team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
5554       { // Initialize barrier data.
5555         int b;
5556         for (b = 0; b < bs_last_barrier; ++b) {
5557           team->t.t_bar[b].b_arrived = KMP_INIT_BARRIER_STATE;
5558 #if USE_DEBUGGER
5559           team->t.t_bar[b].b_master_arrived = 0;
5560           team->t.t_bar[b].b_team_arrived = 0;
5561 #endif
5562         }
5563       }
5564 
5565       team->t.t_proc_bind = new_proc_bind;
5566 
5567       KA_TRACE(20, ("__kmp_allocate_team: using team from pool %d.\n",
5568                     team->t.t_id));
5569 
5570 #if OMPT_SUPPORT
5571       __ompt_team_assign_id(team, ompt_parallel_data);
5572 #endif
5573 
5574       team->t.t_nested_nth = NULL;
5575 
5576       KMP_MB();
5577 
5578       return team;
5579     }
5580 
5581     /* reap team if it is too small, then loop back and check the next one */
5582     // not sure if this is wise, but, will be redone during the hot-teams
5583     // rewrite.
5584     /* TODO: Use technique to find the right size hot-team, don't reap them */
5585     team = __kmp_reap_team(team);
5586     __kmp_team_pool = team;
5587   }
5588 
5589   /* nothing available in the pool, no matter, make a new team! */
5590   KMP_MB();
5591   team = (kmp_team_t *)__kmp_allocate(sizeof(kmp_team_t));
5592 
5593   /* and set it up */
5594   team->t.t_max_nproc = max_nproc;
5595   if (max_nproc > 1 &&
5596       __kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5597     // Allocate barrier structure
5598     team->t.b = distributedBarrier::allocate(__kmp_dflt_team_nth_ub);
5599   }
5600 
5601   /* NOTE well, for some reason allocating one big buffer and dividing it up
5602      seems to really hurt performance a lot on the P4, so, let's not use this */
5603   __kmp_allocate_team_arrays(team, max_nproc);
5604 
5605   KA_TRACE(20, ("__kmp_allocate_team: making a new team\n"));
5606   __kmp_initialize_team(team, new_nproc, new_icvs, NULL);
5607 
5608   KA_TRACE(20, ("__kmp_allocate_team: setting task_team[0] %p and task_team[1] "
5609                 "%p to NULL\n",
5610                 &team->t.t_task_team[0], &team->t.t_task_team[1]));
5611   team->t.t_task_team[0] = NULL; // to be removed, as __kmp_allocate zeroes
5612   // memory, no need to duplicate
5613   team->t.t_task_team[1] = NULL; // to be removed, as __kmp_allocate zeroes
5614   // memory, no need to duplicate
5615 
5616   if (__kmp_storage_map) {
5617     __kmp_print_team_storage_map("team", team, team->t.t_id, new_nproc);
5618   }
5619 
5620   /* allocate space for arguments */
5621   __kmp_alloc_argv_entries(argc, team, FALSE);
5622   team->t.t_argc = argc;
5623 
5624   KA_TRACE(20,
5625            ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n",
5626             team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
5627   { // Initialize barrier data.
5628     int b;
5629     for (b = 0; b < bs_last_barrier; ++b) {
5630       team->t.t_bar[b].b_arrived = KMP_INIT_BARRIER_STATE;
5631 #if USE_DEBUGGER
5632       team->t.t_bar[b].b_master_arrived = 0;
5633       team->t.t_bar[b].b_team_arrived = 0;
5634 #endif
5635     }
5636   }
5637 
5638   team->t.t_proc_bind = new_proc_bind;
5639 
5640 #if OMPT_SUPPORT
5641   __ompt_team_assign_id(team, ompt_parallel_data);
5642   team->t.ompt_serialized_team_info = NULL;
5643 #endif
5644 
5645   KMP_MB();
5646 
5647   team->t.t_nested_nth = NULL;
5648 
5649   KA_TRACE(20, ("__kmp_allocate_team: done creating a new team %d.\n",
5650                 team->t.t_id));
5651 
5652   return team;
5653 }
5654 
5655 /* TODO implement hot-teams at all levels */
5656 /* TODO implement lazy thread release on demand (disband request) */
5657 
5658 /* free the team.  return it to the team pool.  release all the threads
5659  * associated with it */
5660 void __kmp_free_team(kmp_root_t *root,
5661                      kmp_team_t *team USE_NESTED_HOT_ARG(kmp_info_t *master)) {
5662   int f;
5663   KA_TRACE(20, ("__kmp_free_team: T#%d freeing team %d\n", __kmp_get_gtid(),
5664                 team->t.t_id));
5665 
5666   /* verify state */
5667   KMP_DEBUG_ASSERT(root);
5668   KMP_DEBUG_ASSERT(team);
5669   KMP_DEBUG_ASSERT(team->t.t_nproc <= team->t.t_max_nproc);
5670   KMP_DEBUG_ASSERT(team->t.t_threads);
5671 
5672   int use_hot_team = team == root->r.r_hot_team;
5673 #if KMP_NESTED_HOT_TEAMS
5674   int level;
5675   if (master) {
5676     level = team->t.t_active_level - 1;
5677     if (master->th.th_teams_microtask) { // in teams construct?
5678       if (master->th.th_teams_size.nteams > 1) {
5679         ++level; // level was not increased in teams construct for
5680         // team_of_masters
5681       }
5682       if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
5683           master->th.th_teams_level == team->t.t_level) {
5684         ++level; // level was not increased in teams construct for
5685         // team_of_workers before the parallel
5686       } // team->t.t_level will be increased inside parallel
5687     }
5688 #if KMP_DEBUG
5689     kmp_hot_team_ptr_t *hot_teams = master->th.th_hot_teams;
5690 #endif
5691     if (level < __kmp_hot_teams_max_level) {
5692       KMP_DEBUG_ASSERT(team == hot_teams[level].hot_team);
5693       use_hot_team = 1;
5694     }
5695   }
5696 #endif // KMP_NESTED_HOT_TEAMS
5697 
5698   /* team is done working */
5699   TCW_SYNC_PTR(team->t.t_pkfn,
5700                NULL); // Important for Debugging Support Library.
5701 #if KMP_OS_WINDOWS
5702   team->t.t_copyin_counter = 0; // init counter for possible reuse
5703 #endif
5704   // Do not reset pointer to parent team to NULL for hot teams.
5705 
5706   /* if we are non-hot team, release our threads */
5707   if (!use_hot_team) {
5708     if (__kmp_tasking_mode != tskm_immediate_exec) {
5709       // Wait for threads to reach reapable state
5710       for (f = 1; f < team->t.t_nproc; ++f) {
5711         KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5712         kmp_info_t *th = team->t.t_threads[f];
5713         volatile kmp_uint32 *state = &th->th.th_reap_state;
5714         while (*state != KMP_SAFE_TO_REAP) {
5715 #if KMP_OS_WINDOWS
5716           // On Windows a thread can be killed at any time, check this
5717           DWORD ecode;
5718           if (!__kmp_is_thread_alive(th, &ecode)) {
5719             *state = KMP_SAFE_TO_REAP; // reset the flag for dead thread
5720             break;
5721           }
5722 #endif
5723           // first check if thread is sleeping
5724           if (th->th.th_sleep_loc)
5725             __kmp_null_resume_wrapper(th);
5726           KMP_CPU_PAUSE();
5727         }
5728       }
5729 
5730       // Delete task teams
5731       int tt_idx;
5732       for (tt_idx = 0; tt_idx < 2; ++tt_idx) {
5733         kmp_task_team_t *task_team = team->t.t_task_team[tt_idx];
5734         if (task_team != NULL) {
5735           for (f = 0; f < team->t.t_nproc; ++f) { // threads unref task teams
5736             KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5737             team->t.t_threads[f]->th.th_task_team = NULL;
5738           }
5739           KA_TRACE(
5740               20,
5741               ("__kmp_free_team: T#%d deactivating task_team %p on team %d\n",
5742                __kmp_get_gtid(), task_team, team->t.t_id));
5743 #if KMP_NESTED_HOT_TEAMS
5744           __kmp_free_task_team(master, task_team);
5745 #endif
5746           team->t.t_task_team[tt_idx] = NULL;
5747         }
5748       }
5749     }
5750 
5751     // Before clearing parent pointer, check if nested_nth list should be freed
5752     if (team->t.t_nested_nth && team->t.t_nested_nth != &__kmp_nested_nth &&
5753         team->t.t_nested_nth != team->t.t_parent->t.t_nested_nth) {
5754       KMP_INTERNAL_FREE(team->t.t_nested_nth->nth);
5755       KMP_INTERNAL_FREE(team->t.t_nested_nth);
5756     }
5757     team->t.t_nested_nth = NULL;
5758 
5759     // Reset pointer to parent team only for non-hot teams.
5760     team->t.t_parent = NULL;
5761     team->t.t_level = 0;
5762     team->t.t_active_level = 0;
5763 
5764     /* free the worker threads */
5765     for (f = 1; f < team->t.t_nproc; ++f) {
5766       KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5767       if (__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5768         KMP_COMPARE_AND_STORE_ACQ32(&(team->t.t_threads[f]->th.th_used_in_team),
5769                                     1, 2);
5770       }
5771       __kmp_free_thread(team->t.t_threads[f]);
5772     }
5773 
5774     if (__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5775       if (team->t.b) {
5776         // wake up thread at old location
5777         team->t.b->go_release();
5778         if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
5779           for (f = 1; f < team->t.t_nproc; ++f) {
5780             if (team->t.b->sleep[f].sleep) {
5781               __kmp_atomic_resume_64(
5782                   team->t.t_threads[f]->th.th_info.ds.ds_gtid,
5783                   (kmp_atomic_flag_64<> *)NULL);
5784             }
5785           }
5786         }
5787         // Wait for threads to be removed from team
5788         for (int f = 1; f < team->t.t_nproc; ++f) {
5789           while (team->t.t_threads[f]->th.th_used_in_team.load() != 0)
5790             KMP_CPU_PAUSE();
5791         }
5792       }
5793     }
5794 
5795     for (f = 1; f < team->t.t_nproc; ++f) {
5796       team->t.t_threads[f] = NULL;
5797     }
5798 
5799     if (team->t.t_max_nproc > 1 &&
5800         __kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5801       distributedBarrier::deallocate(team->t.b);
5802       team->t.b = NULL;
5803     }
5804     /* put the team back in the team pool */
5805     /* TODO limit size of team pool, call reap_team if pool too large */
5806     team->t.t_next_pool = CCAST(kmp_team_t *, __kmp_team_pool);
5807     __kmp_team_pool = (volatile kmp_team_t *)team;
5808   } else { // Check if team was created for primary threads in teams construct
5809     // See if first worker is a CG root
5810     KMP_DEBUG_ASSERT(team->t.t_threads[1] &&
5811                      team->t.t_threads[1]->th.th_cg_roots);
5812     if (team->t.t_threads[1]->th.th_cg_roots->cg_root == team->t.t_threads[1]) {
5813       // Clean up the CG root nodes on workers so that this team can be re-used
5814       for (f = 1; f < team->t.t_nproc; ++f) {
5815         kmp_info_t *thr = team->t.t_threads[f];
5816         KMP_DEBUG_ASSERT(thr && thr->th.th_cg_roots &&
5817                          thr->th.th_cg_roots->cg_root == thr);
5818         // Pop current CG root off list
5819         kmp_cg_root_t *tmp = thr->th.th_cg_roots;
5820         thr->th.th_cg_roots = tmp->up;
5821         KA_TRACE(100, ("__kmp_free_team: Thread %p popping node %p and moving"
5822                        " up to node %p. cg_nthreads was %d\n",
5823                        thr, tmp, thr->th.th_cg_roots, tmp->cg_nthreads));
5824         int i = tmp->cg_nthreads--;
5825         if (i == 1) {
5826           __kmp_free(tmp); // free CG if we are the last thread in it
5827         }
5828         // Restore current task's thread_limit from CG root
5829         if (thr->th.th_cg_roots)
5830           thr->th.th_current_task->td_icvs.thread_limit =
5831               thr->th.th_cg_roots->cg_thread_limit;
5832       }
5833     }
5834   }
5835 
5836   KMP_MB();
5837 }
5838 
5839 /* reap the team.  destroy it, reclaim all its resources and free its memory */
5840 kmp_team_t *__kmp_reap_team(kmp_team_t *team) {
5841   kmp_team_t *next_pool = team->t.t_next_pool;
5842 
5843   KMP_DEBUG_ASSERT(team);
5844   KMP_DEBUG_ASSERT(team->t.t_dispatch);
5845   KMP_DEBUG_ASSERT(team->t.t_disp_buffer);
5846   KMP_DEBUG_ASSERT(team->t.t_threads);
5847   KMP_DEBUG_ASSERT(team->t.t_argv);
5848 
5849   /* TODO clean the threads that are a part of this? */
5850 
5851   /* free stuff */
5852   __kmp_free_team_arrays(team);
5853   if (team->t.t_argv != &team->t.t_inline_argv[0])
5854     __kmp_free((void *)team->t.t_argv);
5855   __kmp_free(team);
5856 
5857   KMP_MB();
5858   return next_pool;
5859 }
5860 
5861 // Free the thread.  Don't reap it, just place it on the pool of available
5862 // threads.
5863 //
5864 // Changes for Quad issue 527845: We need a predictable OMP tid <-> gtid
5865 // binding for the affinity mechanism to be useful.
5866 //
5867 // Now, we always keep the free list (__kmp_thread_pool) sorted by gtid.
5868 // However, we want to avoid a potential performance problem by always
5869 // scanning through the list to find the correct point at which to insert
5870 // the thread (potential N**2 behavior).  To do this we keep track of the
5871 // last place a thread struct was inserted (__kmp_thread_pool_insert_pt).
5872 // With single-level parallelism, threads will always be added to the tail
5873 // of the list, kept track of by __kmp_thread_pool_insert_pt.  With nested
5874 // parallelism, all bets are off and we may need to scan through the entire
5875 // free list.
5876 //
5877 // This change also has a potentially large performance benefit, for some
5878 // applications.  Previously, as threads were freed from the hot team, they
5879 // would be placed back on the free list in inverse order.  If the hot team
5880 // grew back to it's original size, then the freed thread would be placed
5881 // back on the hot team in reverse order.  This could cause bad cache
5882 // locality problems on programs where the size of the hot team regularly
5883 // grew and shrunk.
5884 //
5885 // Now, for single-level parallelism, the OMP tid is always == gtid.
5886 void __kmp_free_thread(kmp_info_t *this_th) {
5887   int gtid;
5888   kmp_info_t **scan;
5889 
5890   KA_TRACE(20, ("__kmp_free_thread: T#%d putting T#%d back on free pool.\n",
5891                 __kmp_get_gtid(), this_th->th.th_info.ds.ds_gtid));
5892 
5893   KMP_DEBUG_ASSERT(this_th);
5894 
5895   // When moving thread to pool, switch thread to wait on own b_go flag, and
5896   // uninitialized (NULL team).
5897   int b;
5898   kmp_balign_t *balign = this_th->th.th_bar;
5899   for (b = 0; b < bs_last_barrier; ++b) {
5900     if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG)
5901       balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG;
5902     balign[b].bb.team = NULL;
5903     balign[b].bb.leaf_kids = 0;
5904   }
5905   this_th->th.th_task_state = 0;
5906   this_th->th.th_reap_state = KMP_SAFE_TO_REAP;
5907 
5908   /* put thread back on the free pool */
5909   TCW_PTR(this_th->th.th_team, NULL);
5910   TCW_PTR(this_th->th.th_root, NULL);
5911   TCW_PTR(this_th->th.th_dispatch, NULL); /* NOT NEEDED */
5912 
5913   while (this_th->th.th_cg_roots) {
5914     this_th->th.th_cg_roots->cg_nthreads--;
5915     KA_TRACE(100, ("__kmp_free_thread: Thread %p decrement cg_nthreads on node"
5916                    " %p of thread  %p to %d\n",
5917                    this_th, this_th->th.th_cg_roots,
5918                    this_th->th.th_cg_roots->cg_root,
5919                    this_th->th.th_cg_roots->cg_nthreads));
5920     kmp_cg_root_t *tmp = this_th->th.th_cg_roots;
5921     if (tmp->cg_root == this_th) { // Thread is a cg_root
5922       KMP_DEBUG_ASSERT(tmp->cg_nthreads == 0);
5923       KA_TRACE(
5924           5, ("__kmp_free_thread: Thread %p freeing node %p\n", this_th, tmp));
5925       this_th->th.th_cg_roots = tmp->up;
5926       __kmp_free(tmp);
5927     } else { // Worker thread
5928       if (tmp->cg_nthreads == 0) { // last thread leaves contention group
5929         __kmp_free(tmp);
5930       }
5931       this_th->th.th_cg_roots = NULL;
5932       break;
5933     }
5934   }
5935 
5936   /* If the implicit task assigned to this thread can be used by other threads
5937    * -> multiple threads can share the data and try to free the task at
5938    * __kmp_reap_thread at exit. This duplicate use of the task data can happen
5939    * with higher probability when hot team is disabled but can occurs even when
5940    * the hot team is enabled */
5941   __kmp_free_implicit_task(this_th);
5942   this_th->th.th_current_task = NULL;
5943 
5944   // If the __kmp_thread_pool_insert_pt is already past the new insert
5945   // point, then we need to re-scan the entire list.
5946   gtid = this_th->th.th_info.ds.ds_gtid;
5947   if (__kmp_thread_pool_insert_pt != NULL) {
5948     KMP_DEBUG_ASSERT(__kmp_thread_pool != NULL);
5949     if (__kmp_thread_pool_insert_pt->th.th_info.ds.ds_gtid > gtid) {
5950       __kmp_thread_pool_insert_pt = NULL;
5951     }
5952   }
5953 
5954   // Scan down the list to find the place to insert the thread.
5955   // scan is the address of a link in the list, possibly the address of
5956   // __kmp_thread_pool itself.
5957   //
5958   // In the absence of nested parallelism, the for loop will have 0 iterations.
5959   if (__kmp_thread_pool_insert_pt != NULL) {
5960     scan = &(__kmp_thread_pool_insert_pt->th.th_next_pool);
5961   } else {
5962     scan = CCAST(kmp_info_t **, &__kmp_thread_pool);
5963   }
5964   for (; (*scan != NULL) && ((*scan)->th.th_info.ds.ds_gtid < gtid);
5965        scan = &((*scan)->th.th_next_pool))
5966     ;
5967 
5968   // Insert the new element on the list, and set __kmp_thread_pool_insert_pt
5969   // to its address.
5970   TCW_PTR(this_th->th.th_next_pool, *scan);
5971   __kmp_thread_pool_insert_pt = *scan = this_th;
5972   KMP_DEBUG_ASSERT((this_th->th.th_next_pool == NULL) ||
5973                    (this_th->th.th_info.ds.ds_gtid <
5974                     this_th->th.th_next_pool->th.th_info.ds.ds_gtid));
5975   TCW_4(this_th->th.th_in_pool, TRUE);
5976   __kmp_suspend_initialize_thread(this_th);
5977   __kmp_lock_suspend_mx(this_th);
5978   if (this_th->th.th_active == TRUE) {
5979     KMP_ATOMIC_INC(&__kmp_thread_pool_active_nth);
5980     this_th->th.th_active_in_pool = TRUE;
5981   }
5982 #if KMP_DEBUG
5983   else {
5984     KMP_DEBUG_ASSERT(this_th->th.th_active_in_pool == FALSE);
5985   }
5986 #endif
5987   __kmp_unlock_suspend_mx(this_th);
5988 
5989   TCW_4(__kmp_nth, __kmp_nth - 1);
5990 
5991 #ifdef KMP_ADJUST_BLOCKTIME
5992   /* Adjust blocktime back to user setting or default if necessary */
5993   /* Middle initialization might never have occurred                */
5994   if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
5995     KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
5996     if (__kmp_nth <= __kmp_avail_proc) {
5997       __kmp_zero_bt = FALSE;
5998     }
5999   }
6000 #endif /* KMP_ADJUST_BLOCKTIME */
6001 
6002   KMP_MB();
6003 }
6004 
6005 /* ------------------------------------------------------------------------ */
6006 
6007 void *__kmp_launch_thread(kmp_info_t *this_thr) {
6008 #if OMP_PROFILING_SUPPORT
6009   ProfileTraceFile = getenv("LIBOMPTARGET_PROFILE");
6010   // TODO: add a configuration option for time granularity
6011   if (ProfileTraceFile)
6012     llvm::timeTraceProfilerInitialize(500 /* us */, "libomptarget");
6013 #endif
6014 
6015   int gtid = this_thr->th.th_info.ds.ds_gtid;
6016   /*    void                 *stack_data;*/
6017   kmp_team_t **volatile pteam;
6018 
6019   KMP_MB();
6020   KA_TRACE(10, ("__kmp_launch_thread: T#%d start\n", gtid));
6021 
6022   if (__kmp_env_consistency_check) {
6023     this_thr->th.th_cons = __kmp_allocate_cons_stack(gtid); // ATT: Memory leak?
6024   }
6025 
6026 #if OMPD_SUPPORT
6027   if (ompd_state & OMPD_ENABLE_BP)
6028     ompd_bp_thread_begin();
6029 #endif
6030 
6031 #if OMPT_SUPPORT
6032   ompt_data_t *thread_data = nullptr;
6033   if (ompt_enabled.enabled) {
6034     thread_data = &(this_thr->th.ompt_thread_info.thread_data);
6035     *thread_data = ompt_data_none;
6036 
6037     this_thr->th.ompt_thread_info.state = ompt_state_overhead;
6038     this_thr->th.ompt_thread_info.wait_id = 0;
6039     this_thr->th.ompt_thread_info.idle_frame = OMPT_GET_FRAME_ADDRESS(0);
6040     this_thr->th.ompt_thread_info.parallel_flags = 0;
6041     if (ompt_enabled.ompt_callback_thread_begin) {
6042       ompt_callbacks.ompt_callback(ompt_callback_thread_begin)(
6043           ompt_thread_worker, thread_data);
6044     }
6045     this_thr->th.ompt_thread_info.state = ompt_state_idle;
6046   }
6047 #endif
6048 
6049   /* This is the place where threads wait for work */
6050   while (!TCR_4(__kmp_global.g.g_done)) {
6051     KMP_DEBUG_ASSERT(this_thr == __kmp_threads[gtid]);
6052     KMP_MB();
6053 
6054     /* wait for work to do */
6055     KA_TRACE(20, ("__kmp_launch_thread: T#%d waiting for work\n", gtid));
6056 
6057     /* No tid yet since not part of a team */
6058     __kmp_fork_barrier(gtid, KMP_GTID_DNE);
6059 
6060 #if OMPT_SUPPORT
6061     if (ompt_enabled.enabled) {
6062       this_thr->th.ompt_thread_info.state = ompt_state_overhead;
6063     }
6064 #endif
6065 
6066     pteam = &this_thr->th.th_team;
6067 
6068     /* have we been allocated? */
6069     if (TCR_SYNC_PTR(*pteam) && !TCR_4(__kmp_global.g.g_done)) {
6070       /* we were just woken up, so run our new task */
6071       if (TCR_SYNC_PTR((*pteam)->t.t_pkfn) != NULL) {
6072         int rc;
6073         KA_TRACE(20,
6074                  ("__kmp_launch_thread: T#%d(%d:%d) invoke microtask = %p\n",
6075                   gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid),
6076                   (*pteam)->t.t_pkfn));
6077 
6078         updateHWFPControl(*pteam);
6079 
6080 #if OMPT_SUPPORT
6081         if (ompt_enabled.enabled) {
6082           this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
6083         }
6084 #endif
6085 
6086         rc = (*pteam)->t.t_invoke(gtid);
6087         KMP_ASSERT(rc);
6088 
6089         KMP_MB();
6090         KA_TRACE(20, ("__kmp_launch_thread: T#%d(%d:%d) done microtask = %p\n",
6091                       gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid),
6092                       (*pteam)->t.t_pkfn));
6093       }
6094 #if OMPT_SUPPORT
6095       if (ompt_enabled.enabled) {
6096         /* no frame set while outside task */
6097         __ompt_get_task_info_object(0)->frame.exit_frame = ompt_data_none;
6098 
6099         this_thr->th.ompt_thread_info.state = ompt_state_overhead;
6100       }
6101 #endif
6102       /* join barrier after parallel region */
6103       __kmp_join_barrier(gtid);
6104     }
6105   }
6106 
6107 #if OMPD_SUPPORT
6108   if (ompd_state & OMPD_ENABLE_BP)
6109     ompd_bp_thread_end();
6110 #endif
6111 
6112 #if OMPT_SUPPORT
6113   if (ompt_enabled.ompt_callback_thread_end) {
6114     ompt_callbacks.ompt_callback(ompt_callback_thread_end)(thread_data);
6115   }
6116 #endif
6117 
6118   this_thr->th.th_task_team = NULL;
6119   /* run the destructors for the threadprivate data for this thread */
6120   __kmp_common_destroy_gtid(gtid);
6121 
6122   KA_TRACE(10, ("__kmp_launch_thread: T#%d done\n", gtid));
6123   KMP_MB();
6124 
6125 #if OMP_PROFILING_SUPPORT
6126   llvm::timeTraceProfilerFinishThread();
6127 #endif
6128   return this_thr;
6129 }
6130 
6131 /* ------------------------------------------------------------------------ */
6132 
6133 void __kmp_internal_end_dest(void *specific_gtid) {
6134   // Make sure no significant bits are lost
6135   int gtid;
6136   __kmp_type_convert((kmp_intptr_t)specific_gtid - 1, &gtid);
6137 
6138   KA_TRACE(30, ("__kmp_internal_end_dest: T#%d\n", gtid));
6139   /* NOTE: the gtid is stored as gitd+1 in the thread-local-storage
6140    * this is because 0 is reserved for the nothing-stored case */
6141 
6142   __kmp_internal_end_thread(gtid);
6143 }
6144 
6145 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB
6146 
6147 __attribute__((destructor)) void __kmp_internal_end_dtor(void) {
6148   __kmp_internal_end_atexit();
6149 }
6150 
6151 #endif
6152 
6153 /* [Windows] josh: when the atexit handler is called, there may still be more
6154    than one thread alive */
6155 void __kmp_internal_end_atexit(void) {
6156   KA_TRACE(30, ("__kmp_internal_end_atexit\n"));
6157   /* [Windows]
6158      josh: ideally, we want to completely shutdown the library in this atexit
6159      handler, but stat code that depends on thread specific data for gtid fails
6160      because that data becomes unavailable at some point during the shutdown, so
6161      we call __kmp_internal_end_thread instead. We should eventually remove the
6162      dependency on __kmp_get_specific_gtid in the stat code and use
6163      __kmp_internal_end_library to cleanly shutdown the library.
6164 
6165      // TODO: Can some of this comment about GVS be removed?
6166      I suspect that the offending stat code is executed when the calling thread
6167      tries to clean up a dead root thread's data structures, resulting in GVS
6168      code trying to close the GVS structures for that thread, but since the stat
6169      code uses __kmp_get_specific_gtid to get the gtid with the assumption that
6170      the calling thread is cleaning up itself instead of another thread, it get
6171      confused. This happens because allowing a thread to unregister and cleanup
6172      another thread is a recent modification for addressing an issue.
6173      Based on the current design (20050722), a thread may end up
6174      trying to unregister another thread only if thread death does not trigger
6175      the calling of __kmp_internal_end_thread.  For Linux* OS, there is the
6176      thread specific data destructor function to detect thread death. For
6177      Windows dynamic, there is DllMain(THREAD_DETACH). For Windows static, there
6178      is nothing.  Thus, the workaround is applicable only for Windows static
6179      stat library. */
6180   __kmp_internal_end_library(-1);
6181 #if KMP_OS_WINDOWS
6182   __kmp_close_console();
6183 #endif
6184 }
6185 
6186 static void __kmp_reap_thread(kmp_info_t *thread, int is_root) {
6187   // It is assumed __kmp_forkjoin_lock is acquired.
6188 
6189   int gtid;
6190 
6191   KMP_DEBUG_ASSERT(thread != NULL);
6192 
6193   gtid = thread->th.th_info.ds.ds_gtid;
6194 
6195   if (!is_root) {
6196     if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
6197       /* Assume the threads are at the fork barrier here */
6198       KA_TRACE(
6199           20, ("__kmp_reap_thread: releasing T#%d from fork barrier for reap\n",
6200                gtid));
6201       if (__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
6202         while (
6203             !KMP_COMPARE_AND_STORE_ACQ32(&(thread->th.th_used_in_team), 0, 3))
6204           KMP_CPU_PAUSE();
6205         __kmp_resume_32(gtid, (kmp_flag_32<false, false> *)NULL);
6206       } else {
6207         /* Need release fence here to prevent seg faults for tree forkjoin
6208            barrier (GEH) */
6209         kmp_flag_64<> flag(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go,
6210                            thread);
6211         __kmp_release_64(&flag);
6212       }
6213     }
6214 
6215     // Terminate OS thread.
6216     __kmp_reap_worker(thread);
6217 
6218     // The thread was killed asynchronously.  If it was actively
6219     // spinning in the thread pool, decrement the global count.
6220     //
6221     // There is a small timing hole here - if the worker thread was just waking
6222     // up after sleeping in the pool, had reset it's th_active_in_pool flag but
6223     // not decremented the global counter __kmp_thread_pool_active_nth yet, then
6224     // the global counter might not get updated.
6225     //
6226     // Currently, this can only happen as the library is unloaded,
6227     // so there are no harmful side effects.
6228     if (thread->th.th_active_in_pool) {
6229       thread->th.th_active_in_pool = FALSE;
6230       KMP_ATOMIC_DEC(&__kmp_thread_pool_active_nth);
6231       KMP_DEBUG_ASSERT(__kmp_thread_pool_active_nth >= 0);
6232     }
6233   }
6234 
6235   __kmp_free_implicit_task(thread);
6236 
6237 // Free the fast memory for tasking
6238 #if USE_FAST_MEMORY
6239   __kmp_free_fast_memory(thread);
6240 #endif /* USE_FAST_MEMORY */
6241 
6242   __kmp_suspend_uninitialize_thread(thread);
6243 
6244   KMP_DEBUG_ASSERT(__kmp_threads[gtid] == thread);
6245   TCW_SYNC_PTR(__kmp_threads[gtid], NULL);
6246 
6247   --__kmp_all_nth;
6248   // __kmp_nth was decremented when thread is added to the pool.
6249 
6250 #ifdef KMP_ADJUST_BLOCKTIME
6251   /* Adjust blocktime back to user setting or default if necessary */
6252   /* Middle initialization might never have occurred                */
6253   if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
6254     KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
6255     if (__kmp_nth <= __kmp_avail_proc) {
6256       __kmp_zero_bt = FALSE;
6257     }
6258   }
6259 #endif /* KMP_ADJUST_BLOCKTIME */
6260 
6261   /* free the memory being used */
6262   if (__kmp_env_consistency_check) {
6263     if (thread->th.th_cons) {
6264       __kmp_free_cons_stack(thread->th.th_cons);
6265       thread->th.th_cons = NULL;
6266     }
6267   }
6268 
6269   if (thread->th.th_pri_common != NULL) {
6270     __kmp_free(thread->th.th_pri_common);
6271     thread->th.th_pri_common = NULL;
6272   }
6273 
6274 #if KMP_USE_BGET
6275   if (thread->th.th_local.bget_data != NULL) {
6276     __kmp_finalize_bget(thread);
6277   }
6278 #endif
6279 
6280 #if KMP_AFFINITY_SUPPORTED
6281   if (thread->th.th_affin_mask != NULL) {
6282     KMP_CPU_FREE(thread->th.th_affin_mask);
6283     thread->th.th_affin_mask = NULL;
6284   }
6285 #endif /* KMP_AFFINITY_SUPPORTED */
6286 
6287 #if KMP_USE_HIER_SCHED
6288   if (thread->th.th_hier_bar_data != NULL) {
6289     __kmp_free(thread->th.th_hier_bar_data);
6290     thread->th.th_hier_bar_data = NULL;
6291   }
6292 #endif
6293 
6294   __kmp_reap_team(thread->th.th_serial_team);
6295   thread->th.th_serial_team = NULL;
6296   __kmp_free(thread);
6297 
6298   KMP_MB();
6299 
6300 } // __kmp_reap_thread
6301 
6302 static void __kmp_itthash_clean(kmp_info_t *th) {
6303 #if USE_ITT_NOTIFY
6304   if (__kmp_itt_region_domains.count > 0) {
6305     for (int i = 0; i < KMP_MAX_FRAME_DOMAINS; ++i) {
6306       kmp_itthash_entry_t *bucket = __kmp_itt_region_domains.buckets[i];
6307       while (bucket) {
6308         kmp_itthash_entry_t *next = bucket->next_in_bucket;
6309         __kmp_thread_free(th, bucket);
6310         bucket = next;
6311       }
6312     }
6313   }
6314   if (__kmp_itt_barrier_domains.count > 0) {
6315     for (int i = 0; i < KMP_MAX_FRAME_DOMAINS; ++i) {
6316       kmp_itthash_entry_t *bucket = __kmp_itt_barrier_domains.buckets[i];
6317       while (bucket) {
6318         kmp_itthash_entry_t *next = bucket->next_in_bucket;
6319         __kmp_thread_free(th, bucket);
6320         bucket = next;
6321       }
6322     }
6323   }
6324 #endif
6325 }
6326 
6327 static void __kmp_internal_end(void) {
6328   int i;
6329 
6330   /* First, unregister the library */
6331   __kmp_unregister_library();
6332 
6333 #if KMP_OS_WINDOWS
6334   /* In Win static library, we can't tell when a root actually dies, so we
6335      reclaim the data structures for any root threads that have died but not
6336      unregistered themselves, in order to shut down cleanly.
6337      In Win dynamic library we also can't tell when a thread dies.  */
6338   __kmp_reclaim_dead_roots(); // AC: moved here to always clean resources of
6339 // dead roots
6340 #endif
6341 
6342   for (i = 0; i < __kmp_threads_capacity; i++)
6343     if (__kmp_root[i])
6344       if (__kmp_root[i]->r.r_active)
6345         break;
6346   KMP_MB(); /* Flush all pending memory write invalidates.  */
6347   TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6348 
6349   if (i < __kmp_threads_capacity) {
6350 #if KMP_USE_MONITOR
6351     // 2009-09-08 (lev): Other alive roots found. Why do we kill the monitor??
6352     KMP_MB(); /* Flush all pending memory write invalidates.  */
6353 
6354     // Need to check that monitor was initialized before reaping it. If we are
6355     // called form __kmp_atfork_child (which sets __kmp_init_parallel = 0), then
6356     // __kmp_monitor will appear to contain valid data, but it is only valid in
6357     // the parent process, not the child.
6358     // New behavior (201008): instead of keying off of the flag
6359     // __kmp_init_parallel, the monitor thread creation is keyed off
6360     // of the new flag __kmp_init_monitor.
6361     __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
6362     if (TCR_4(__kmp_init_monitor)) {
6363       __kmp_reap_monitor(&__kmp_monitor);
6364       TCW_4(__kmp_init_monitor, 0);
6365     }
6366     __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
6367     KA_TRACE(10, ("__kmp_internal_end: monitor reaped\n"));
6368 #endif // KMP_USE_MONITOR
6369   } else {
6370 /* TODO move this to cleanup code */
6371 #ifdef KMP_DEBUG
6372     /* make sure that everything has properly ended */
6373     for (i = 0; i < __kmp_threads_capacity; i++) {
6374       if (__kmp_root[i]) {
6375         //                    KMP_ASSERT( ! KMP_UBER_GTID( i ) );         // AC:
6376         //                    there can be uber threads alive here
6377         KMP_ASSERT(!__kmp_root[i]->r.r_active); // TODO: can they be active?
6378       }
6379     }
6380 #endif
6381 
6382     KMP_MB();
6383 
6384     // Reap the worker threads.
6385     // This is valid for now, but be careful if threads are reaped sooner.
6386     while (__kmp_thread_pool != NULL) { // Loop thru all the thread in the pool.
6387       // Get the next thread from the pool.
6388       kmp_info_t *thread = CCAST(kmp_info_t *, __kmp_thread_pool);
6389       __kmp_thread_pool = thread->th.th_next_pool;
6390       // Reap it.
6391       KMP_DEBUG_ASSERT(thread->th.th_reap_state == KMP_SAFE_TO_REAP);
6392       thread->th.th_next_pool = NULL;
6393       thread->th.th_in_pool = FALSE;
6394       __kmp_reap_thread(thread, 0);
6395     }
6396     __kmp_thread_pool_insert_pt = NULL;
6397 
6398     // Reap teams.
6399     while (__kmp_team_pool != NULL) { // Loop thru all the teams in the pool.
6400       // Get the next team from the pool.
6401       kmp_team_t *team = CCAST(kmp_team_t *, __kmp_team_pool);
6402       __kmp_team_pool = team->t.t_next_pool;
6403       // Reap it.
6404       team->t.t_next_pool = NULL;
6405       __kmp_reap_team(team);
6406     }
6407 
6408     __kmp_reap_task_teams();
6409 
6410 #if KMP_OS_UNIX
6411     // Threads that are not reaped should not access any resources since they
6412     // are going to be deallocated soon, so the shutdown sequence should wait
6413     // until all threads either exit the final spin-waiting loop or begin
6414     // sleeping after the given blocktime.
6415     for (i = 0; i < __kmp_threads_capacity; i++) {
6416       kmp_info_t *thr = __kmp_threads[i];
6417       while (thr && KMP_ATOMIC_LD_ACQ(&thr->th.th_blocking))
6418         KMP_CPU_PAUSE();
6419     }
6420 #endif
6421 
6422     for (i = 0; i < __kmp_threads_capacity; ++i) {
6423       // TBD: Add some checking...
6424       // Something like KMP_DEBUG_ASSERT( __kmp_thread[ i ] == NULL );
6425     }
6426 
6427     /* Make sure all threadprivate destructors get run by joining with all
6428        worker threads before resetting this flag */
6429     TCW_SYNC_4(__kmp_init_common, FALSE);
6430 
6431     KA_TRACE(10, ("__kmp_internal_end: all workers reaped\n"));
6432     KMP_MB();
6433 
6434 #if KMP_USE_MONITOR
6435     // See note above: One of the possible fixes for CQ138434 / CQ140126
6436     //
6437     // FIXME: push both code fragments down and CSE them?
6438     // push them into __kmp_cleanup() ?
6439     __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
6440     if (TCR_4(__kmp_init_monitor)) {
6441       __kmp_reap_monitor(&__kmp_monitor);
6442       TCW_4(__kmp_init_monitor, 0);
6443     }
6444     __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
6445     KA_TRACE(10, ("__kmp_internal_end: monitor reaped\n"));
6446 #endif
6447   } /* else !__kmp_global.t_active */
6448   TCW_4(__kmp_init_gtid, FALSE);
6449   KMP_MB(); /* Flush all pending memory write invalidates.  */
6450 
6451   __kmp_cleanup();
6452 #if OMPT_SUPPORT
6453   ompt_fini();
6454 #endif
6455 }
6456 
6457 void __kmp_internal_end_library(int gtid_req) {
6458   /* if we have already cleaned up, don't try again, it wouldn't be pretty */
6459   /* this shouldn't be a race condition because __kmp_internal_end() is the
6460      only place to clear __kmp_serial_init */
6461   /* we'll check this later too, after we get the lock */
6462   // 2009-09-06: We do not set g_abort without setting g_done. This check looks
6463   // redundant, because the next check will work in any case.
6464   if (__kmp_global.g.g_abort) {
6465     KA_TRACE(11, ("__kmp_internal_end_library: abort, exiting\n"));
6466     /* TODO abort? */
6467     return;
6468   }
6469   if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6470     KA_TRACE(10, ("__kmp_internal_end_library: already finished\n"));
6471     return;
6472   }
6473 
6474   // If hidden helper team has been initialized, we need to deinit it
6475   if (TCR_4(__kmp_init_hidden_helper) &&
6476       !TCR_4(__kmp_hidden_helper_team_done)) {
6477     TCW_SYNC_4(__kmp_hidden_helper_team_done, TRUE);
6478     // First release the main thread to let it continue its work
6479     __kmp_hidden_helper_main_thread_release();
6480     // Wait until the hidden helper team has been destroyed
6481     __kmp_hidden_helper_threads_deinitz_wait();
6482   }
6483 
6484   KMP_MB(); /* Flush all pending memory write invalidates.  */
6485   /* find out who we are and what we should do */
6486   {
6487     int gtid = (gtid_req >= 0) ? gtid_req : __kmp_gtid_get_specific();
6488     KA_TRACE(
6489         10, ("__kmp_internal_end_library: enter T#%d  (%d)\n", gtid, gtid_req));
6490     if (gtid == KMP_GTID_SHUTDOWN) {
6491       KA_TRACE(10, ("__kmp_internal_end_library: !__kmp_init_runtime, system "
6492                     "already shutdown\n"));
6493       return;
6494     } else if (gtid == KMP_GTID_MONITOR) {
6495       KA_TRACE(10, ("__kmp_internal_end_library: monitor thread, gtid not "
6496                     "registered, or system shutdown\n"));
6497       return;
6498     } else if (gtid == KMP_GTID_DNE) {
6499       KA_TRACE(10, ("__kmp_internal_end_library: gtid not registered or system "
6500                     "shutdown\n"));
6501       /* we don't know who we are, but we may still shutdown the library */
6502     } else if (KMP_UBER_GTID(gtid)) {
6503       /* unregister ourselves as an uber thread.  gtid is no longer valid */
6504       if (__kmp_root[gtid]->r.r_active) {
6505         __kmp_global.g.g_abort = -1;
6506         TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6507         __kmp_unregister_library();
6508         KA_TRACE(10,
6509                  ("__kmp_internal_end_library: root still active, abort T#%d\n",
6510                   gtid));
6511         return;
6512       } else {
6513         __kmp_itthash_clean(__kmp_threads[gtid]);
6514         KA_TRACE(
6515             10,
6516             ("__kmp_internal_end_library: unregistering sibling T#%d\n", gtid));
6517         __kmp_unregister_root_current_thread(gtid);
6518       }
6519     } else {
6520 /* worker threads may call this function through the atexit handler, if they
6521  * call exit() */
6522 /* For now, skip the usual subsequent processing and just dump the debug buffer.
6523    TODO: do a thorough shutdown instead */
6524 #ifdef DUMP_DEBUG_ON_EXIT
6525       if (__kmp_debug_buf)
6526         __kmp_dump_debug_buffer();
6527 #endif
6528       // added unregister library call here when we switch to shm linux
6529       // if we don't, it will leave lots of files in /dev/shm
6530       // cleanup shared memory file before exiting.
6531       __kmp_unregister_library();
6532       return;
6533     }
6534   }
6535   /* synchronize the termination process */
6536   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6537 
6538   /* have we already finished */
6539   if (__kmp_global.g.g_abort) {
6540     KA_TRACE(10, ("__kmp_internal_end_library: abort, exiting\n"));
6541     /* TODO abort? */
6542     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6543     return;
6544   }
6545   if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6546     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6547     return;
6548   }
6549 
6550   /* We need this lock to enforce mutex between this reading of
6551      __kmp_threads_capacity and the writing by __kmp_register_root.
6552      Alternatively, we can use a counter of roots that is atomically updated by
6553      __kmp_get_global_thread_id_reg, __kmp_do_serial_initialize and
6554      __kmp_internal_end_*.  */
6555   __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
6556 
6557   /* now we can safely conduct the actual termination */
6558   __kmp_internal_end();
6559 
6560   __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6561   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6562 
6563   KA_TRACE(10, ("__kmp_internal_end_library: exit\n"));
6564 
6565 #ifdef DUMP_DEBUG_ON_EXIT
6566   if (__kmp_debug_buf)
6567     __kmp_dump_debug_buffer();
6568 #endif
6569 
6570 #if KMP_OS_WINDOWS
6571   __kmp_close_console();
6572 #endif
6573 
6574   __kmp_fini_allocator();
6575 
6576 } // __kmp_internal_end_library
6577 
6578 void __kmp_internal_end_thread(int gtid_req) {
6579   int i;
6580 
6581   /* if we have already cleaned up, don't try again, it wouldn't be pretty */
6582   /* this shouldn't be a race condition because __kmp_internal_end() is the
6583    * only place to clear __kmp_serial_init */
6584   /* we'll check this later too, after we get the lock */
6585   // 2009-09-06: We do not set g_abort without setting g_done. This check looks
6586   // redundant, because the next check will work in any case.
6587   if (__kmp_global.g.g_abort) {
6588     KA_TRACE(11, ("__kmp_internal_end_thread: abort, exiting\n"));
6589     /* TODO abort? */
6590     return;
6591   }
6592   if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6593     KA_TRACE(10, ("__kmp_internal_end_thread: already finished\n"));
6594     return;
6595   }
6596 
6597   // If hidden helper team has been initialized, we need to deinit it
6598   if (TCR_4(__kmp_init_hidden_helper) &&
6599       !TCR_4(__kmp_hidden_helper_team_done)) {
6600     TCW_SYNC_4(__kmp_hidden_helper_team_done, TRUE);
6601     // First release the main thread to let it continue its work
6602     __kmp_hidden_helper_main_thread_release();
6603     // Wait until the hidden helper team has been destroyed
6604     __kmp_hidden_helper_threads_deinitz_wait();
6605   }
6606 
6607   KMP_MB(); /* Flush all pending memory write invalidates.  */
6608 
6609   /* find out who we are and what we should do */
6610   {
6611     int gtid = (gtid_req >= 0) ? gtid_req : __kmp_gtid_get_specific();
6612     KA_TRACE(10,
6613              ("__kmp_internal_end_thread: enter T#%d  (%d)\n", gtid, gtid_req));
6614     if (gtid == KMP_GTID_SHUTDOWN) {
6615       KA_TRACE(10, ("__kmp_internal_end_thread: !__kmp_init_runtime, system "
6616                     "already shutdown\n"));
6617       return;
6618     } else if (gtid == KMP_GTID_MONITOR) {
6619       KA_TRACE(10, ("__kmp_internal_end_thread: monitor thread, gtid not "
6620                     "registered, or system shutdown\n"));
6621       return;
6622     } else if (gtid == KMP_GTID_DNE) {
6623       KA_TRACE(10, ("__kmp_internal_end_thread: gtid not registered or system "
6624                     "shutdown\n"));
6625       return;
6626       /* we don't know who we are */
6627     } else if (KMP_UBER_GTID(gtid)) {
6628       /* unregister ourselves as an uber thread.  gtid is no longer valid */
6629       if (__kmp_root[gtid]->r.r_active) {
6630         __kmp_global.g.g_abort = -1;
6631         TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6632         KA_TRACE(10,
6633                  ("__kmp_internal_end_thread: root still active, abort T#%d\n",
6634                   gtid));
6635         return;
6636       } else {
6637         KA_TRACE(10, ("__kmp_internal_end_thread: unregistering sibling T#%d\n",
6638                       gtid));
6639         __kmp_unregister_root_current_thread(gtid);
6640       }
6641     } else {
6642       /* just a worker thread, let's leave */
6643       KA_TRACE(10, ("__kmp_internal_end_thread: worker thread T#%d\n", gtid));
6644 
6645       if (gtid >= 0) {
6646         __kmp_threads[gtid]->th.th_task_team = NULL;
6647       }
6648 
6649       KA_TRACE(10,
6650                ("__kmp_internal_end_thread: worker thread done, exiting T#%d\n",
6651                 gtid));
6652       return;
6653     }
6654   }
6655 #if KMP_DYNAMIC_LIB
6656   if (__kmp_pause_status != kmp_hard_paused)
6657   // AC: lets not shutdown the dynamic library at the exit of uber thread,
6658   // because we will better shutdown later in the library destructor.
6659   {
6660     KA_TRACE(10, ("__kmp_internal_end_thread: exiting T#%d\n", gtid_req));
6661     return;
6662   }
6663 #endif
6664   /* synchronize the termination process */
6665   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6666 
6667   /* have we already finished */
6668   if (__kmp_global.g.g_abort) {
6669     KA_TRACE(10, ("__kmp_internal_end_thread: abort, exiting\n"));
6670     /* TODO abort? */
6671     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6672     return;
6673   }
6674   if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6675     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6676     return;
6677   }
6678 
6679   /* We need this lock to enforce mutex between this reading of
6680      __kmp_threads_capacity and the writing by __kmp_register_root.
6681      Alternatively, we can use a counter of roots that is atomically updated by
6682      __kmp_get_global_thread_id_reg, __kmp_do_serial_initialize and
6683      __kmp_internal_end_*.  */
6684 
6685   /* should we finish the run-time?  are all siblings done? */
6686   __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
6687 
6688   for (i = 0; i < __kmp_threads_capacity; ++i) {
6689     if (KMP_UBER_GTID(i)) {
6690       KA_TRACE(
6691           10,
6692           ("__kmp_internal_end_thread: remaining sibling task: gtid==%d\n", i));
6693       __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6694       __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6695       return;
6696     }
6697   }
6698 
6699   /* now we can safely conduct the actual termination */
6700 
6701   __kmp_internal_end();
6702 
6703   __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6704   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6705 
6706   KA_TRACE(10, ("__kmp_internal_end_thread: exit T#%d\n", gtid_req));
6707 
6708 #ifdef DUMP_DEBUG_ON_EXIT
6709   if (__kmp_debug_buf)
6710     __kmp_dump_debug_buffer();
6711 #endif
6712 } // __kmp_internal_end_thread
6713 
6714 // -----------------------------------------------------------------------------
6715 // Library registration stuff.
6716 
// Random value used to indicate library initialization.
// __kmp_register_library_startup() sets it to 0xCAFE0000 combined with the low
// 16 bits of the system time; 0 until then.
static long __kmp_registration_flag = 0;

// Value to be saved in env var __KMP_REGISTERED_LIB_<pid>.
// Built by __kmp_register_library_startup() in the form
// "<address of flag>-<flag value>-<library file>"; NULL until then.
static char *__kmp_registration_str = NULL;
6721 
6722 static inline char *__kmp_reg_status_name() {
6723 /* On RHEL 3u5 if linked statically, getpid() returns different values in
6724    each thread. If registration and unregistration go in different threads
6725    (omp_misc_other_root_exit.cpp test case), the name of registered_lib_env
6726    env var can not be found, because the name will contain different pid. */
6727 // macOS* complains about name being too long with additional getuid()
6728 #if KMP_OS_UNIX && !KMP_OS_DARWIN && KMP_DYNAMIC_LIB
6729   return __kmp_str_format("__KMP_REGISTERED_LIB_%d_%d", (int)getpid(),
6730                           (int)getuid());
6731 #else
6732   return __kmp_str_format("__KMP_REGISTERED_LIB_%d", (int)getpid());
6733 #endif
6734 } // __kmp_reg_status_get
6735 
#ifdef KMP_USE_SHM
// True while the registration record can be kept in a POSIX shared-memory
// object; assigned by __kmp_register_library_startup().
bool __kmp_shm_available = false;
// True while the registration record can be kept in a file under /tmp
// instead; only probed when shared memory is not usable.
bool __kmp_tmp_available = false;
// If /dev/shm is not accessible, we will create a temporary file under /tmp.
// This holds the path of that file ("/tmp/<registration name>").
char *temp_reg_status_file_name = nullptr;
#endif // KMP_USE_SHM
6742 
6743 void __kmp_register_library_startup(void) {
6744 
6745   char *name = __kmp_reg_status_name(); // Name of the environment variable.
6746   int done = 0;
6747   union {
6748     double dtime;
6749     long ltime;
6750   } time;
6751 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
6752   __kmp_initialize_system_tick();
6753 #endif
6754   __kmp_read_system_time(&time.dtime);
6755   __kmp_registration_flag = 0xCAFE0000L | (time.ltime & 0x0000FFFFL);
6756   __kmp_registration_str =
6757       __kmp_str_format("%p-%lx-%s", &__kmp_registration_flag,
6758                        __kmp_registration_flag, KMP_LIBRARY_FILE);
6759 
6760   KA_TRACE(50, ("__kmp_register_library_startup: %s=\"%s\"\n", name,
6761                 __kmp_registration_str));
6762 
6763   while (!done) {
6764 
6765     char *value = NULL; // Actual value of the environment variable.
6766 
6767 #if defined(KMP_USE_SHM)
6768     char *shm_name = nullptr;
6769     char *data1 = nullptr;
6770     __kmp_shm_available = __kmp_detect_shm();
6771     if (__kmp_shm_available) {
6772       int fd1 = -1;
6773       shm_name = __kmp_str_format("/%s", name);
6774       int shm_preexist = 0;
6775       fd1 = shm_open(shm_name, O_CREAT | O_EXCL | O_RDWR, 0600);
6776       if ((fd1 == -1) && (errno == EEXIST)) {
6777         // file didn't open because it already exists.
6778         // try opening existing file
6779         fd1 = shm_open(shm_name, O_RDWR, 0600);
6780         if (fd1 == -1) { // file didn't open
6781           KMP_WARNING(FunctionError, "Can't open SHM");
6782           __kmp_shm_available = false;
6783         } else { // able to open existing file
6784           shm_preexist = 1;
6785         }
6786       }
6787       if (__kmp_shm_available && shm_preexist == 0) { // SHM created, set size
6788         if (ftruncate(fd1, SHM_SIZE) == -1) { // error occured setting size;
6789           KMP_WARNING(FunctionError, "Can't set size of SHM");
6790           __kmp_shm_available = false;
6791         }
6792       }
6793       if (__kmp_shm_available) { // SHM exists, now map it
6794         data1 = (char *)mmap(0, SHM_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED,
6795                              fd1, 0);
6796         if (data1 == MAP_FAILED) { // failed to map shared memory
6797           KMP_WARNING(FunctionError, "Can't map SHM");
6798           __kmp_shm_available = false;
6799         }
6800       }
6801       if (__kmp_shm_available) { // SHM mapped
6802         if (shm_preexist == 0) { // set data to SHM, set value
6803           KMP_STRCPY_S(data1, SHM_SIZE, __kmp_registration_str);
6804         }
6805         // Read value from either what we just wrote or existing file.
6806         value = __kmp_str_format("%s", data1); // read value from SHM
6807         munmap(data1, SHM_SIZE);
6808       }
6809       if (fd1 != -1)
6810         close(fd1);
6811     }
6812     if (!__kmp_shm_available)
6813       __kmp_tmp_available = __kmp_detect_tmp();
6814     if (!__kmp_shm_available && __kmp_tmp_available) {
6815       // SHM failed to work due to an error other than that the file already
6816       // exists. Try to create a temp file under /tmp.
6817       // If /tmp isn't accessible, fall back to using environment variable.
6818       // TODO: /tmp might not always be the temporary directory. For now we will
6819       // not consider TMPDIR.
6820       int fd1 = -1;
6821       temp_reg_status_file_name = __kmp_str_format("/tmp/%s", name);
6822       int tmp_preexist = 0;
6823       fd1 = open(temp_reg_status_file_name, O_CREAT | O_EXCL | O_RDWR, 0600);
6824       if ((fd1 == -1) && (errno == EEXIST)) {
6825         // file didn't open because it already exists.
6826         // try opening existing file
6827         fd1 = open(temp_reg_status_file_name, O_RDWR, 0600);
6828         if (fd1 == -1) { // file didn't open if (fd1 == -1) {
6829           KMP_WARNING(FunctionError, "Can't open TEMP");
6830           __kmp_tmp_available = false;
6831         } else {
6832           tmp_preexist = 1;
6833         }
6834       }
6835       if (__kmp_tmp_available && tmp_preexist == 0) {
6836         // we created /tmp file now set size
6837         if (ftruncate(fd1, SHM_SIZE) == -1) { // error occured setting size;
6838           KMP_WARNING(FunctionError, "Can't set size of /tmp file");
6839           __kmp_tmp_available = false;
6840         }
6841       }
6842       if (__kmp_tmp_available) {
6843         data1 = (char *)mmap(0, SHM_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED,
6844                              fd1, 0);
6845         if (data1 == MAP_FAILED) { // failed to map /tmp
6846           KMP_WARNING(FunctionError, "Can't map /tmp");
6847           __kmp_tmp_available = false;
6848         }
6849       }
6850       if (__kmp_tmp_available) {
6851         if (tmp_preexist == 0) { // set data to TMP, set value
6852           KMP_STRCPY_S(data1, SHM_SIZE, __kmp_registration_str);
6853         }
6854         // Read value from either what we just wrote or existing file.
6855         value = __kmp_str_format("%s", data1); // read value from SHM
6856         munmap(data1, SHM_SIZE);
6857       }
6858       if (fd1 != -1)
6859         close(fd1);
6860     }
6861     if (!__kmp_shm_available && !__kmp_tmp_available) {
6862       // no /dev/shm and no /tmp -- fall back to environment variable
6863       // Set environment variable, but do not overwrite if it exists.
6864       __kmp_env_set(name, __kmp_registration_str, 0);
6865       // read value to see if it got set
6866       value = __kmp_env_get(name);
6867     }
6868 #else // Windows and unix with static library
6869     // Set environment variable, but do not overwrite if it exists.
6870     __kmp_env_set(name, __kmp_registration_str, 0);
6871     // read value to see if it got set
6872     value = __kmp_env_get(name);
6873 #endif
6874 
6875     if (value != NULL && strcmp(value, __kmp_registration_str) == 0) {
6876       done = 1; // Ok, environment variable set successfully, exit the loop.
6877     } else {
6878       // Oops. Write failed. Another copy of OpenMP RTL is in memory.
6879       // Check whether it alive or dead.
6880       int neighbor = 0; // 0 -- unknown status, 1 -- alive, 2 -- dead.
6881       char *tail = value;
6882       char *flag_addr_str = NULL;
6883       char *flag_val_str = NULL;
6884       char const *file_name = NULL;
6885       __kmp_str_split(tail, '-', &flag_addr_str, &tail);
6886       __kmp_str_split(tail, '-', &flag_val_str, &tail);
6887       file_name = tail;
6888       if (tail != NULL) {
6889         unsigned long *flag_addr = 0;
6890         unsigned long flag_val = 0;
6891         KMP_SSCANF(flag_addr_str, "%p", RCAST(void **, &flag_addr));
6892         KMP_SSCANF(flag_val_str, "%lx", &flag_val);
6893         if (flag_addr != 0 && flag_val != 0 && strcmp(file_name, "") != 0) {
6894           // First, check whether environment-encoded address is mapped into
6895           // addr space.
6896           // If so, dereference it to see if it still has the right value.
6897           if (__kmp_is_address_mapped(flag_addr) && *flag_addr == flag_val) {
6898             neighbor = 1;
6899           } else {
6900             // If not, then we know the other copy of the library is no longer
6901             // running.
6902             neighbor = 2;
6903           }
6904         }
6905       }
6906       switch (neighbor) {
6907       case 0: // Cannot parse environment variable -- neighbor status unknown.
6908         // Assume it is the incompatible format of future version of the
6909         // library. Assume the other library is alive.
6910         // WARN( ... ); // TODO: Issue a warning.
6911         file_name = "unknown library";
6912         KMP_FALLTHROUGH();
6913       // Attention! Falling to the next case. That's intentional.
6914       case 1: { // Neighbor is alive.
6915         // Check it is allowed.
6916         char *duplicate_ok = __kmp_env_get("KMP_DUPLICATE_LIB_OK");
6917         if (!__kmp_str_match_true(duplicate_ok)) {
6918           // That's not allowed. Issue fatal error.
6919           __kmp_fatal(KMP_MSG(DuplicateLibrary, KMP_LIBRARY_FILE, file_name),
6920                       KMP_HNT(DuplicateLibrary), __kmp_msg_null);
6921         }
6922         KMP_INTERNAL_FREE(duplicate_ok);
6923         __kmp_duplicate_library_ok = 1;
6924         done = 1; // Exit the loop.
6925       } break;
6926       case 2: { // Neighbor is dead.
6927 
6928 #if defined(KMP_USE_SHM)
6929         if (__kmp_shm_available) { // close shared memory.
6930           shm_unlink(shm_name); // this removes file in /dev/shm
6931         } else if (__kmp_tmp_available) {
6932           unlink(temp_reg_status_file_name); // this removes the temp file
6933         } else {
6934           // Clear the variable and try to register library again.
6935           __kmp_env_unset(name);
6936         }
6937 #else
6938         // Clear the variable and try to register library again.
6939         __kmp_env_unset(name);
6940 #endif
6941       } break;
6942       default: {
6943         KMP_DEBUG_ASSERT(0);
6944       } break;
6945       }
6946     }
6947     KMP_INTERNAL_FREE((void *)value);
6948 #if defined(KMP_USE_SHM)
6949     if (shm_name)
6950       KMP_INTERNAL_FREE((void *)shm_name);
6951 #endif
6952   } // while
6953   KMP_INTERNAL_FREE((void *)name);
6954 
6955 } // func __kmp_register_library_startup
6956 
6957 void __kmp_unregister_library(void) {
6958 
6959   char *name = __kmp_reg_status_name();
6960   char *value = NULL;
6961 
6962 #if defined(KMP_USE_SHM)
6963   char *shm_name = nullptr;
6964   int fd1;
6965   if (__kmp_shm_available) {
6966     shm_name = __kmp_str_format("/%s", name);
6967     fd1 = shm_open(shm_name, O_RDONLY, 0600);
6968     if (fd1 != -1) { // File opened successfully
6969       char *data1 = (char *)mmap(0, SHM_SIZE, PROT_READ, MAP_SHARED, fd1, 0);
6970       if (data1 != MAP_FAILED) {
6971         value = __kmp_str_format("%s", data1); // read value from SHM
6972         munmap(data1, SHM_SIZE);
6973       }
6974       close(fd1);
6975     }
6976   } else if (__kmp_tmp_available) { // try /tmp
6977     fd1 = open(temp_reg_status_file_name, O_RDONLY);
6978     if (fd1 != -1) { // File opened successfully
6979       char *data1 = (char *)mmap(0, SHM_SIZE, PROT_READ, MAP_SHARED, fd1, 0);
6980       if (data1 != MAP_FAILED) {
6981         value = __kmp_str_format("%s", data1); // read value from /tmp
6982         munmap(data1, SHM_SIZE);
6983       }
6984       close(fd1);
6985     }
6986   } else { // fall back to envirable
6987     value = __kmp_env_get(name);
6988   }
6989 #else
6990   value = __kmp_env_get(name);
6991 #endif
6992 
6993   KMP_DEBUG_ASSERT(__kmp_registration_flag != 0);
6994   KMP_DEBUG_ASSERT(__kmp_registration_str != NULL);
6995   if (value != NULL && strcmp(value, __kmp_registration_str) == 0) {
6996 //  Ok, this is our variable. Delete it.
6997 #if defined(KMP_USE_SHM)
6998     if (__kmp_shm_available) {
6999       shm_unlink(shm_name); // this removes file in /dev/shm
7000     } else if (__kmp_tmp_available) {
7001       unlink(temp_reg_status_file_name); // this removes the temp file
7002     } else {
7003       __kmp_env_unset(name);
7004     }
7005 #else
7006     __kmp_env_unset(name);
7007 #endif
7008   }
7009 
7010 #if defined(KMP_USE_SHM)
7011   if (shm_name)
7012     KMP_INTERNAL_FREE(shm_name);
7013   if (temp_reg_status_file_name)
7014     KMP_INTERNAL_FREE(temp_reg_status_file_name);
7015 #endif
7016 
7017   KMP_INTERNAL_FREE(__kmp_registration_str);
7018   KMP_INTERNAL_FREE(value);
7019   KMP_INTERNAL_FREE(name);
7020 
7021   __kmp_registration_flag = 0;
7022   __kmp_registration_str = NULL;
7023 
7024 } // __kmp_unregister_library
7025 
7026 // End of Library registration stuff.
7027 // -----------------------------------------------------------------------------
7028 
7029 #if KMP_MIC_SUPPORTED
7030 
7031 static void __kmp_check_mic_type() {
7032   kmp_cpuid_t cpuid_state = {0};
7033   kmp_cpuid_t *cs_p = &cpuid_state;
7034   __kmp_x86_cpuid(1, 0, cs_p);
7035   // We don't support mic1 at the moment
7036   if ((cs_p->eax & 0xff0) == 0xB10) {
7037     __kmp_mic_type = mic2;
7038   } else if ((cs_p->eax & 0xf0ff0) == 0x50670) {
7039     __kmp_mic_type = mic3;
7040   } else {
7041     __kmp_mic_type = non_mic;
7042   }
7043 }
7044 
7045 #endif /* KMP_MIC_SUPPORTED */
7046 
7047 #if KMP_HAVE_UMWAIT
7048 static void __kmp_user_level_mwait_init() {
7049   struct kmp_cpuid buf;
7050   __kmp_x86_cpuid(7, 0, &buf);
7051   __kmp_waitpkg_enabled = ((buf.ecx >> 5) & 1);
7052   __kmp_umwait_enabled = __kmp_waitpkg_enabled && __kmp_user_level_mwait;
7053   __kmp_tpause_enabled = __kmp_waitpkg_enabled && (__kmp_tpause_state > 0);
7054   KF_TRACE(30, ("__kmp_user_level_mwait_init: __kmp_umwait_enabled = %d\n",
7055                 __kmp_umwait_enabled));
7056 }
7057 #elif KMP_HAVE_MWAIT
7058 #ifndef AT_INTELPHIUSERMWAIT
7059 // Spurious, non-existent value that should always fail to return anything.
7060 // Will be replaced with the correct value when we know that.
7061 #define AT_INTELPHIUSERMWAIT 10000
7062 #endif
// The getauxval() function is available in RHEL7 and SLES12. If the RTL is
// built on a system with an earlier OS, the internal fallback below is used
// when the entry is not found. The fallback ignores its argument and always
// returns 0.
// The declaration carries KMP_WEAK_ATTRIBUTE_EXTERNAL (presumably weak
// linkage, so that a real getauxval() takes precedence -- confirm against the
// macro definition).
unsigned long getauxval(unsigned long) KMP_WEAK_ATTRIBUTE_EXTERNAL;
unsigned long getauxval(unsigned long) { return 0; }
7068 
7069 static void __kmp_user_level_mwait_init() {
7070   // When getauxval() and correct value of AT_INTELPHIUSERMWAIT are available
7071   // use them to find if the user-level mwait is enabled. Otherwise, forcibly
7072   // set __kmp_mwait_enabled=TRUE on Intel MIC if the environment variable
7073   // KMP_USER_LEVEL_MWAIT was set to TRUE.
7074   if (__kmp_mic_type == mic3) {
7075     unsigned long res = getauxval(AT_INTELPHIUSERMWAIT);
7076     if ((res & 0x1) || __kmp_user_level_mwait) {
7077       __kmp_mwait_enabled = TRUE;
7078       if (__kmp_user_level_mwait) {
7079         KMP_INFORM(EnvMwaitWarn);
7080       }
7081     } else {
7082       __kmp_mwait_enabled = FALSE;
7083     }
7084   }
7085   KF_TRACE(30, ("__kmp_user_level_mwait_init: __kmp_mic_type = %d, "
7086                 "__kmp_mwait_enabled = %d\n",
7087                 __kmp_mic_type, __kmp_mwait_enabled));
7088 }
7089 #endif /* KMP_HAVE_UMWAIT */
7090 
// Performs the serial phase of runtime initialization. In order, the visible
// steps are: tool (OMPT/OMPD) pre-initialization, lock validation, allocator
// setup, library registration, initialization of the global locks,
// assignment of default values to global settings, environment processing
// via __kmp_env_initialize(), allocation of the __kmp_threads/__kmp_root
// arrays, registration of the initial root thread, and installation of the
// atfork/atexit/signal handlers. On completion it increments
// __kmp_init_counter and sets __kmp_init_serial to TRUE.
//
// This function takes no lock itself; __kmp_serial_initialize() wraps the call
// with __kmp_initz_lock.
static void __kmp_do_serial_initialize(void) {
  int i, gtid;
  size_t size;

  KA_TRACE(10, ("__kmp_do_serial_initialize: enter\n"));

  // Sanity checks on the widths of the fixed-size integer typedefs.
  KMP_DEBUG_ASSERT(sizeof(kmp_int32) == 4);
  KMP_DEBUG_ASSERT(sizeof(kmp_uint32) == 4);
  KMP_DEBUG_ASSERT(sizeof(kmp_int64) == 8);
  KMP_DEBUG_ASSERT(sizeof(kmp_uint64) == 8);
  KMP_DEBUG_ASSERT(sizeof(kmp_intptr_t) == sizeof(void *));

#if OMPT_SUPPORT
  ompt_pre_init();
#endif
#if OMPD_SUPPORT
  __kmp_env_dump();
  ompd_init();
#endif

  __kmp_validate_locks();

#if ENABLE_LIBOMPTARGET
  /* Initialize functions from libomptarget */
  __kmp_init_omptarget();
#endif

  /* Initialize internal memory allocator */
  __kmp_init_allocator();

  /* Register the library startup via an environment variable or via mapped
     shared memory file and check to see whether another copy of the library is
     already registered. Since forked child process is often terminated, we
     postpone the registration till middle initialization in the child */
  if (__kmp_need_register_serial)
    __kmp_register_library_startup();

  /* TODO reinitialization of library */
  if (TCR_4(__kmp_global.g.g_done)) {
    KA_TRACE(10, ("__kmp_do_serial_initialize: reinitialization of library\n"));
  }

  // Clear the abort and done indicators.
  __kmp_global.g.g_abort = 0;
  TCW_SYNC_4(__kmp_global.g.g_done, FALSE);

/* initialize the locks */
#if KMP_USE_ADAPTIVE_LOCKS
#if KMP_DEBUG_ADAPTIVE_LOCKS
  __kmp_init_speculative_stats();
#endif
#endif
#if KMP_STATS_ENABLED
  __kmp_stats_init();
#endif
  __kmp_init_lock(&__kmp_global_lock);
  __kmp_init_queuing_lock(&__kmp_dispatch_lock);
  __kmp_init_lock(&__kmp_debug_lock);
  __kmp_init_atomic_lock(&__kmp_atomic_lock);
  __kmp_init_atomic_lock(&__kmp_atomic_lock_1i);
  __kmp_init_atomic_lock(&__kmp_atomic_lock_2i);
  __kmp_init_atomic_lock(&__kmp_atomic_lock_4i);
  __kmp_init_atomic_lock(&__kmp_atomic_lock_4r);
  __kmp_init_atomic_lock(&__kmp_atomic_lock_8i);
  __kmp_init_atomic_lock(&__kmp_atomic_lock_8r);
  __kmp_init_atomic_lock(&__kmp_atomic_lock_8c);
  __kmp_init_atomic_lock(&__kmp_atomic_lock_10r);
  __kmp_init_atomic_lock(&__kmp_atomic_lock_16r);
  __kmp_init_atomic_lock(&__kmp_atomic_lock_16c);
  __kmp_init_atomic_lock(&__kmp_atomic_lock_20c);
  __kmp_init_atomic_lock(&__kmp_atomic_lock_32c);
  __kmp_init_bootstrap_lock(&__kmp_forkjoin_lock);
  __kmp_init_bootstrap_lock(&__kmp_exit_lock);
#if KMP_USE_MONITOR
  __kmp_init_bootstrap_lock(&__kmp_monitor_lock);
#endif
  __kmp_init_bootstrap_lock(&__kmp_tp_cached_lock);

  /* conduct initialization and initial setup of configuration */

  __kmp_runtime_initialize();

#if KMP_MIC_SUPPORTED
  __kmp_check_mic_type();
#endif

// Some global variable initialization moved here from kmp_env_initialize()
#ifdef KMP_DEBUG
  kmp_diag = 0;
#endif
  __kmp_abort_delay = 0;

  // From __kmp_init_dflt_team_nth()
  /* assume the entire machine will be used */
  // The upper bound starts at __kmp_xproc and is then clamped to the range
  // [KMP_MIN_NTH, __kmp_sys_max_nth].
  __kmp_dflt_team_nth_ub = __kmp_xproc;
  if (__kmp_dflt_team_nth_ub < KMP_MIN_NTH) {
    __kmp_dflt_team_nth_ub = KMP_MIN_NTH;
  }
  if (__kmp_dflt_team_nth_ub > __kmp_sys_max_nth) {
    __kmp_dflt_team_nth_ub = __kmp_sys_max_nth;
  }
  __kmp_max_nth = __kmp_sys_max_nth;
  __kmp_cg_max_nth = __kmp_sys_max_nth;
  __kmp_teams_max_nth = __kmp_xproc; // set a "reasonable" default
  if (__kmp_teams_max_nth > __kmp_sys_max_nth) {
    __kmp_teams_max_nth = __kmp_sys_max_nth;
  }

  // Three vars below moved here from __kmp_env_initialize() "KMP_BLOCKTIME"
  // part
  __kmp_dflt_blocktime = KMP_DEFAULT_BLOCKTIME;
#if KMP_USE_MONITOR
  __kmp_monitor_wakeups =
      KMP_WAKEUPS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups);
  __kmp_bt_intervals =
      KMP_INTERVALS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups);
#endif
  // From "KMP_LIBRARY" part of __kmp_env_initialize()
  __kmp_library = library_throughput;
  // From KMP_SCHEDULE initialization
  __kmp_static = kmp_sch_static_balanced;
// AC: do not use analytical here, because it is non-monotonous
//__kmp_guided = kmp_sch_guided_iterative_chunked;
//__kmp_auto = kmp_sch_guided_analytical_chunked; // AC: it is the default, no
// need to repeat assignment
// Barrier initialization. Moved here from __kmp_env_initialize() Barrier branch
// bit control and barrier method control parts
// The helper macros below exist only for the loop that follows and are
// #undef'ed right after it.
#if KMP_FAST_REDUCTION_BARRIER
#define kmp_reduction_barrier_gather_bb ((int)1)
#define kmp_reduction_barrier_release_bb ((int)1)
#define kmp_reduction_barrier_gather_pat __kmp_barrier_gather_pat_dflt
#define kmp_reduction_barrier_release_pat __kmp_barrier_release_pat_dflt
#endif // KMP_FAST_REDUCTION_BARRIER
  // Give every barrier type the default branch bits and patterns; the
  // reduction barrier then gets its own values when fast reduction barriers
  // are enabled.
  for (i = bs_plain_barrier; i < bs_last_barrier; i++) {
    __kmp_barrier_gather_branch_bits[i] = __kmp_barrier_gather_bb_dflt;
    __kmp_barrier_release_branch_bits[i] = __kmp_barrier_release_bb_dflt;
    __kmp_barrier_gather_pattern[i] = __kmp_barrier_gather_pat_dflt;
    __kmp_barrier_release_pattern[i] = __kmp_barrier_release_pat_dflt;
#if KMP_FAST_REDUCTION_BARRIER
    if (i == bs_reduction_barrier) { // tested and confirmed on ALTIX only (
      // lin_64 ): hyper,1
      __kmp_barrier_gather_branch_bits[i] = kmp_reduction_barrier_gather_bb;
      __kmp_barrier_release_branch_bits[i] = kmp_reduction_barrier_release_bb;
      __kmp_barrier_gather_pattern[i] = kmp_reduction_barrier_gather_pat;
      __kmp_barrier_release_pattern[i] = kmp_reduction_barrier_release_pat;
    }
#endif // KMP_FAST_REDUCTION_BARRIER
  }
#if KMP_FAST_REDUCTION_BARRIER
#undef kmp_reduction_barrier_release_pat
#undef kmp_reduction_barrier_gather_pat
#undef kmp_reduction_barrier_release_bb
#undef kmp_reduction_barrier_gather_bb
#endif // KMP_FAST_REDUCTION_BARRIER
#if KMP_MIC_SUPPORTED
  if (__kmp_mic_type == mic2) { // KNC
    // AC: plane=3,2, forkjoin=2,1 are optimal for 240 threads on KNC
    __kmp_barrier_gather_branch_bits[bs_plain_barrier] = 3; // plain gather
    __kmp_barrier_release_branch_bits[bs_forkjoin_barrier] =
        1; // forkjoin release
    __kmp_barrier_gather_pattern[bs_forkjoin_barrier] = bp_hierarchical_bar;
    __kmp_barrier_release_pattern[bs_forkjoin_barrier] = bp_hierarchical_bar;
  }
#if KMP_FAST_REDUCTION_BARRIER
  if (__kmp_mic_type == mic2) { // KNC
    __kmp_barrier_gather_pattern[bs_reduction_barrier] = bp_hierarchical_bar;
    __kmp_barrier_release_pattern[bs_reduction_barrier] = bp_hierarchical_bar;
  }
#endif // KMP_FAST_REDUCTION_BARRIER
#endif // KMP_MIC_SUPPORTED

// From KMP_CHECKS initialization
#ifdef KMP_DEBUG
  __kmp_env_checks = TRUE; /* development versions have the extra checks */
#else
  __kmp_env_checks = FALSE; /* port versions do not have the extra checks */
#endif

  // From "KMP_FOREIGN_THREADS_THREADPRIVATE" initialization
  __kmp_foreign_tp = TRUE;

  __kmp_global.g.g_dynamic = FALSE;
  __kmp_global.g.g_dynamic_mode = dynamic_default;

  __kmp_init_nesting_mode();

  // Runs after all of the default assignments above have been made.
  __kmp_env_initialize(NULL);

#if KMP_HAVE_MWAIT || KMP_HAVE_UMWAIT
  __kmp_user_level_mwait_init();
#endif
// Print all messages in message catalog for testing purposes.
#ifdef KMP_DEBUG
  char const *val = __kmp_env_get("KMP_DUMP_CATALOG");
  if (__kmp_str_match_true(val)) {
    kmp_str_buf_t buffer;
    __kmp_str_buf_init(&buffer);
    __kmp_i18n_dump_catalog(&buffer);
    __kmp_printf("%s", buffer.str);
    __kmp_str_buf_free(&buffer);
  }
  __kmp_env_free(&val);
#endif

  __kmp_threads_capacity =
      __kmp_initial_threads_capacity(__kmp_dflt_team_nth_ub);
  // Moved here from __kmp_env_initialize() "KMP_ALL_THREADPRIVATE" part
  __kmp_tp_capacity = __kmp_default_tp_capacity(
      __kmp_dflt_team_nth_ub, __kmp_max_nth, __kmp_allThreadsSpecified);

  // If the library is shut down properly, both pools must be NULL. Just in
  // case, set them to NULL -- some memory may leak, but subsequent code will
  // work even if pools are not freed.
  KMP_DEBUG_ASSERT(__kmp_thread_pool == NULL);
  KMP_DEBUG_ASSERT(__kmp_thread_pool_insert_pt == NULL);
  KMP_DEBUG_ASSERT(__kmp_team_pool == NULL);
  __kmp_thread_pool = NULL;
  __kmp_thread_pool_insert_pt = NULL;
  __kmp_team_pool = NULL;

  /* Allocate all of the variable sized records */
  /* NOTE: __kmp_threads_capacity entries are allocated, but the arrays are
   * expandable */
  /* Since allocation is cache-aligned, just add extra padding at the end */
  // A single allocation holds both arrays: __kmp_threads comes first and
  // __kmp_root starts immediately after its __kmp_threads_capacity entries.
  size =
      (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * __kmp_threads_capacity +
      CACHE_LINE;
  __kmp_threads = (kmp_info_t **)__kmp_allocate(size);
  __kmp_root = (kmp_root_t **)((char *)__kmp_threads +
                               sizeof(kmp_info_t *) * __kmp_threads_capacity);

  /* init thread counts */
  KMP_DEBUG_ASSERT(__kmp_all_nth ==
                   0); // Asserts fail if the library is reinitializing and
  KMP_DEBUG_ASSERT(__kmp_nth == 0); // something was wrong in termination.
  __kmp_all_nth = 0;
  __kmp_nth = 0;

  /* setup the uber master thread and hierarchy */
  gtid = __kmp_register_root(TRUE);
  KA_TRACE(10, ("__kmp_do_serial_initialize  T#%d\n", gtid));
  KMP_ASSERT(KMP_UBER_GTID(gtid));
  KMP_ASSERT(KMP_INITIAL_GTID(gtid));

  KMP_MB(); /* Flush all pending memory write invalidates.  */

  __kmp_common_initialize();

#if KMP_OS_UNIX
  /* invoke the child fork handler */
  __kmp_register_atfork();
#endif

#if !KMP_DYNAMIC_LIB ||                                                        \
    ((KMP_COMPILER_ICC || KMP_COMPILER_ICX) && KMP_OS_DARWIN)
  {
    /* Invoke the exit handler when the program finishes, only for static
       library and macOS* dynamic. For other dynamic libraries, we already
       have _fini and DllMain. */
    int rc = atexit(__kmp_internal_end_atexit);
    if (rc != 0) {
      __kmp_fatal(KMP_MSG(FunctionError, "atexit()"), KMP_ERR(rc),
                  __kmp_msg_null);
    }
  }
#endif

#if KMP_HANDLE_SIGNALS
#if KMP_OS_UNIX
  /* NOTE: make sure that this is called before the user installs their own
     signal handlers so that the user handlers are called first. this way they
     can return false, not call our handler, avoid terminating the library, and
     continue execution where they left off. */
  __kmp_install_signals(FALSE);
#endif /* KMP_OS_UNIX */
#if KMP_OS_WINDOWS
  __kmp_install_signals(TRUE);
#endif /* KMP_OS_WINDOWS */
#endif

  /* we have finished the serial initialization */
  __kmp_init_counter++;

  __kmp_init_serial = TRUE;

  // Optional informational output, controlled by the settings that were
  // established above.
  if (__kmp_version) {
    __kmp_print_version_1();
  }

  if (__kmp_settings) {
    __kmp_env_print();
  }

  if (__kmp_display_env || __kmp_display_env_verbose) {
    __kmp_env_print_2();
  }

#if OMPT_SUPPORT
  ompt_post_init();
#endif

  KMP_MB();

  KA_TRACE(10, ("__kmp_do_serial_initialize: exit\n"));
}
7395 
7396 void __kmp_serial_initialize(void) {
7397   if (__kmp_init_serial) {
7398     return;
7399   }
7400   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7401   if (__kmp_init_serial) {
7402     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7403     return;
7404   }
7405   __kmp_do_serial_initialize();
7406   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7407 }
7408 
7409 static void __kmp_do_middle_initialize(void) {
7410   int i, j;
7411   int prev_dflt_team_nth;
7412 
7413   if (!__kmp_init_serial) {
7414     __kmp_do_serial_initialize();
7415   }
7416 
7417   KA_TRACE(10, ("__kmp_middle_initialize: enter\n"));
7418 
7419   if (UNLIKELY(!__kmp_need_register_serial)) {
7420     // We are in a forked child process. The registration was skipped during
7421     // serial initialization in __kmp_atfork_child handler. Do it here.
7422     __kmp_register_library_startup();
7423   }
7424 
7425   // Save the previous value for the __kmp_dflt_team_nth so that
7426   // we can avoid some reinitialization if it hasn't changed.
7427   prev_dflt_team_nth = __kmp_dflt_team_nth;
7428 
7429 #if KMP_AFFINITY_SUPPORTED
7430   // __kmp_affinity_initialize() will try to set __kmp_ncores to the
7431   // number of cores on the machine.
7432   __kmp_affinity_initialize(__kmp_affinity);
7433 
7434 #endif /* KMP_AFFINITY_SUPPORTED */
7435 
7436   KMP_ASSERT(__kmp_xproc > 0);
7437   if (__kmp_avail_proc == 0) {
7438     __kmp_avail_proc = __kmp_xproc;
7439   }
7440 
7441   // If there were empty places in num_threads list (OMP_NUM_THREADS=,,2,3),
7442   // correct them now
7443   j = 0;
7444   while ((j < __kmp_nested_nth.used) && !__kmp_nested_nth.nth[j]) {
7445     __kmp_nested_nth.nth[j] = __kmp_dflt_team_nth = __kmp_dflt_team_nth_ub =
7446         __kmp_avail_proc;
7447     j++;
7448   }
7449 
7450   if (__kmp_dflt_team_nth == 0) {
7451 #ifdef KMP_DFLT_NTH_CORES
7452     // Default #threads = #cores
7453     __kmp_dflt_team_nth = __kmp_ncores;
7454     KA_TRACE(20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = "
7455                   "__kmp_ncores (%d)\n",
7456                   __kmp_dflt_team_nth));
7457 #else
7458     // Default #threads = #available OS procs
7459     __kmp_dflt_team_nth = __kmp_avail_proc;
7460     KA_TRACE(20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = "
7461                   "__kmp_avail_proc(%d)\n",
7462                   __kmp_dflt_team_nth));
7463 #endif /* KMP_DFLT_NTH_CORES */
7464   }
7465 
7466   if (__kmp_dflt_team_nth < KMP_MIN_NTH) {
7467     __kmp_dflt_team_nth = KMP_MIN_NTH;
7468   }
7469   if (__kmp_dflt_team_nth > __kmp_sys_max_nth) {
7470     __kmp_dflt_team_nth = __kmp_sys_max_nth;
7471   }
7472 
7473   if (__kmp_nesting_mode > 0)
7474     __kmp_set_nesting_mode_threads();
7475 
7476   // There's no harm in continuing if the following check fails,
7477   // but it indicates an error in the previous logic.
7478   KMP_DEBUG_ASSERT(__kmp_dflt_team_nth <= __kmp_dflt_team_nth_ub);
7479 
7480   if (__kmp_dflt_team_nth != prev_dflt_team_nth) {
7481     // Run through the __kmp_threads array and set the num threads icv for each
7482     // root thread that is currently registered with the RTL (which has not
7483     // already explicitly set its nthreads-var with a call to
7484     // omp_set_num_threads()).
7485     for (i = 0; i < __kmp_threads_capacity; i++) {
7486       kmp_info_t *thread = __kmp_threads[i];
7487       if (thread == NULL)
7488         continue;
7489       if (thread->th.th_current_task->td_icvs.nproc != 0)
7490         continue;
7491 
7492       set__nproc(__kmp_threads[i], __kmp_dflt_team_nth);
7493     }
7494   }
7495   KA_TRACE(
7496       20,
7497       ("__kmp_middle_initialize: final value for __kmp_dflt_team_nth = %d\n",
7498        __kmp_dflt_team_nth));
7499 
7500 #ifdef KMP_ADJUST_BLOCKTIME
7501   /* Adjust blocktime to zero if necessary  now that __kmp_avail_proc is set */
7502   if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
7503     KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
7504     if (__kmp_nth > __kmp_avail_proc) {
7505       __kmp_zero_bt = TRUE;
7506     }
7507   }
7508 #endif /* KMP_ADJUST_BLOCKTIME */
7509 
7510   /* we have finished middle initialization */
7511   TCW_SYNC_4(__kmp_init_middle, TRUE);
7512 
7513   KA_TRACE(10, ("__kmp_do_middle_initialize: exit\n"));
7514 }
7515 
7516 void __kmp_middle_initialize(void) {
7517   if (__kmp_init_middle) {
7518     return;
7519   }
7520   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7521   if (__kmp_init_middle) {
7522     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7523     return;
7524   }
7525   __kmp_do_middle_initialize();
7526   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7527 }
7528 
7529 void __kmp_parallel_initialize(void) {
7530   int gtid = __kmp_entry_gtid(); // this might be a new root
7531 
7532   /* synchronize parallel initialization (for sibling) */
7533   if (TCR_4(__kmp_init_parallel))
7534     return;
7535   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7536   if (TCR_4(__kmp_init_parallel)) {
7537     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7538     return;
7539   }
7540 
7541   /* TODO reinitialization after we have already shut down */
7542   if (TCR_4(__kmp_global.g.g_done)) {
7543     KA_TRACE(
7544         10,
7545         ("__kmp_parallel_initialize: attempt to init while shutting down\n"));
7546     __kmp_infinite_loop();
7547   }
7548 
7549   /* jc: The lock __kmp_initz_lock is already held, so calling
7550      __kmp_serial_initialize would cause a deadlock.  So we call
7551      __kmp_do_serial_initialize directly. */
7552   if (!__kmp_init_middle) {
7553     __kmp_do_middle_initialize();
7554   }
7555   __kmp_assign_root_init_mask();
7556   __kmp_resume_if_hard_paused();
7557 
7558   /* begin initialization */
7559   KA_TRACE(10, ("__kmp_parallel_initialize: enter\n"));
7560   KMP_ASSERT(KMP_UBER_GTID(gtid));
7561 
7562 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
7563   // Save the FP control regs.
7564   // Worker threads will set theirs to these values at thread startup.
7565   __kmp_store_x87_fpu_control_word(&__kmp_init_x87_fpu_control_word);
7566   __kmp_store_mxcsr(&__kmp_init_mxcsr);
7567   __kmp_init_mxcsr &= KMP_X86_MXCSR_MASK;
7568 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
7569 
7570 #if KMP_OS_UNIX
7571 #if KMP_HANDLE_SIGNALS
7572   /*  must be after __kmp_serial_initialize  */
7573   __kmp_install_signals(TRUE);
7574 #endif
7575 #endif
7576 
7577   __kmp_suspend_initialize();
7578 
7579 #if defined(USE_LOAD_BALANCE)
7580   if (__kmp_global.g.g_dynamic_mode == dynamic_default) {
7581     __kmp_global.g.g_dynamic_mode = dynamic_load_balance;
7582   }
7583 #else
7584   if (__kmp_global.g.g_dynamic_mode == dynamic_default) {
7585     __kmp_global.g.g_dynamic_mode = dynamic_thread_limit;
7586   }
7587 #endif
7588 
7589   if (__kmp_version) {
7590     __kmp_print_version_2();
7591   }
7592 
7593   /* we have finished parallel initialization */
7594   TCW_SYNC_4(__kmp_init_parallel, TRUE);
7595 
7596   KMP_MB();
7597   KA_TRACE(10, ("__kmp_parallel_initialize: exit\n"));
7598 
7599   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7600 }
7601 
7602 void __kmp_hidden_helper_initialize() {
7603   if (TCR_4(__kmp_init_hidden_helper))
7604     return;
7605 
7606   // __kmp_parallel_initialize is required before we initialize hidden helper
7607   if (!TCR_4(__kmp_init_parallel))
7608     __kmp_parallel_initialize();
7609 
7610   // Double check. Note that this double check should not be placed before
7611   // __kmp_parallel_initialize as it will cause dead lock.
7612   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7613   if (TCR_4(__kmp_init_hidden_helper)) {
7614     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7615     return;
7616   }
7617 
7618 #if KMP_AFFINITY_SUPPORTED
7619   // Initialize hidden helper affinity settings.
7620   // The above __kmp_parallel_initialize() will initialize
7621   // regular affinity (and topology) if not already done.
7622   if (!__kmp_hh_affinity.flags.initialized)
7623     __kmp_affinity_initialize(__kmp_hh_affinity);
7624 #endif
7625 
7626   // Set the count of hidden helper tasks to be executed to zero
7627   KMP_ATOMIC_ST_REL(&__kmp_unexecuted_hidden_helper_tasks, 0);
7628 
7629   // Set the global variable indicating that we're initializing hidden helper
7630   // team/threads
7631   TCW_SYNC_4(__kmp_init_hidden_helper_threads, TRUE);
7632 
7633   // Platform independent initialization
7634   __kmp_do_initialize_hidden_helper_threads();
7635 
7636   // Wait here for the finish of initialization of hidden helper teams
7637   __kmp_hidden_helper_threads_initz_wait();
7638 
7639   // We have finished hidden helper initialization
7640   TCW_SYNC_4(__kmp_init_hidden_helper, TRUE);
7641 
7642   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7643 }
7644 
7645 /* ------------------------------------------------------------------------ */
7646 
7647 void __kmp_run_before_invoked_task(int gtid, int tid, kmp_info_t *this_thr,
7648                                    kmp_team_t *team) {
7649   kmp_disp_t *dispatch;
7650 
7651   KMP_MB();
7652 
7653   /* none of the threads have encountered any constructs, yet. */
7654   this_thr->th.th_local.this_construct = 0;
7655 #if KMP_CACHE_MANAGE
7656   KMP_CACHE_PREFETCH(&this_thr->th.th_bar[bs_forkjoin_barrier].bb.b_arrived);
7657 #endif /* KMP_CACHE_MANAGE */
7658   dispatch = (kmp_disp_t *)TCR_PTR(this_thr->th.th_dispatch);
7659   KMP_DEBUG_ASSERT(dispatch);
7660   KMP_DEBUG_ASSERT(team->t.t_dispatch);
7661   // KMP_DEBUG_ASSERT( this_thr->th.th_dispatch == &team->t.t_dispatch[
7662   // this_thr->th.th_info.ds.ds_tid ] );
7663 
7664   dispatch->th_disp_index = 0; /* reset the dispatch buffer counter */
7665   dispatch->th_doacross_buf_idx = 0; // reset doacross dispatch buffer counter
7666   if (__kmp_env_consistency_check)
7667     __kmp_push_parallel(gtid, team->t.t_ident);
7668 
7669   KMP_MB(); /* Flush all pending memory write invalidates.  */
7670 }
7671 
7672 void __kmp_run_after_invoked_task(int gtid, int tid, kmp_info_t *this_thr,
7673                                   kmp_team_t *team) {
7674   if (__kmp_env_consistency_check)
7675     __kmp_pop_parallel(gtid, team->t.t_ident);
7676 
7677   __kmp_finish_implicit_task(this_thr);
7678 }
7679 
7680 int __kmp_invoke_task_func(int gtid) {
7681   int rc;
7682   int tid = __kmp_tid_from_gtid(gtid);
7683   kmp_info_t *this_thr = __kmp_threads[gtid];
7684   kmp_team_t *team = this_thr->th.th_team;
7685 
7686   __kmp_run_before_invoked_task(gtid, tid, this_thr, team);
7687 #if USE_ITT_BUILD
7688   if (__itt_stack_caller_create_ptr) {
7689     // inform ittnotify about entering user's code
7690     if (team->t.t_stack_id != NULL) {
7691       __kmp_itt_stack_callee_enter((__itt_caller)team->t.t_stack_id);
7692     } else {
7693       KMP_DEBUG_ASSERT(team->t.t_parent->t.t_stack_id != NULL);
7694       __kmp_itt_stack_callee_enter(
7695           (__itt_caller)team->t.t_parent->t.t_stack_id);
7696     }
7697   }
7698 #endif /* USE_ITT_BUILD */
7699 #if INCLUDE_SSC_MARKS
7700   SSC_MARK_INVOKING();
7701 #endif
7702 
7703 #if OMPT_SUPPORT
7704   void *dummy;
7705   void **exit_frame_p;
7706   ompt_data_t *my_task_data;
7707   ompt_data_t *my_parallel_data;
7708   int ompt_team_size;
7709 
7710   if (ompt_enabled.enabled) {
7711     exit_frame_p = &(team->t.t_implicit_task_taskdata[tid]
7712                          .ompt_task_info.frame.exit_frame.ptr);
7713   } else {
7714     exit_frame_p = &dummy;
7715   }
7716 
7717   my_task_data =
7718       &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data);
7719   my_parallel_data = &(team->t.ompt_team_info.parallel_data);
7720   if (ompt_enabled.ompt_callback_implicit_task) {
7721     ompt_team_size = team->t.t_nproc;
7722     ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
7723         ompt_scope_begin, my_parallel_data, my_task_data, ompt_team_size,
7724         __kmp_tid_from_gtid(gtid), ompt_task_implicit);
7725     OMPT_CUR_TASK_INFO(this_thr)->thread_num = __kmp_tid_from_gtid(gtid);
7726   }
7727 #endif
7728 
7729 #if KMP_STATS_ENABLED
7730   stats_state_e previous_state = KMP_GET_THREAD_STATE();
7731   if (previous_state == stats_state_e::TEAMS_REGION) {
7732     KMP_PUSH_PARTITIONED_TIMER(OMP_teams);
7733   } else {
7734     KMP_PUSH_PARTITIONED_TIMER(OMP_parallel);
7735   }
7736   KMP_SET_THREAD_STATE(IMPLICIT_TASK);
7737 #endif
7738 
7739   rc = __kmp_invoke_microtask((microtask_t)TCR_SYNC_PTR(team->t.t_pkfn), gtid,
7740                               tid, (int)team->t.t_argc, (void **)team->t.t_argv
7741 #if OMPT_SUPPORT
7742                               ,
7743                               exit_frame_p
7744 #endif
7745   );
7746 #if OMPT_SUPPORT
7747   *exit_frame_p = NULL;
7748   this_thr->th.ompt_thread_info.parallel_flags = ompt_parallel_team;
7749 #endif
7750 
7751 #if KMP_STATS_ENABLED
7752   if (previous_state == stats_state_e::TEAMS_REGION) {
7753     KMP_SET_THREAD_STATE(previous_state);
7754   }
7755   KMP_POP_PARTITIONED_TIMER();
7756 #endif
7757 
7758 #if USE_ITT_BUILD
7759   if (__itt_stack_caller_create_ptr) {
7760     // inform ittnotify about leaving user's code
7761     if (team->t.t_stack_id != NULL) {
7762       __kmp_itt_stack_callee_leave((__itt_caller)team->t.t_stack_id);
7763     } else {
7764       KMP_DEBUG_ASSERT(team->t.t_parent->t.t_stack_id != NULL);
7765       __kmp_itt_stack_callee_leave(
7766           (__itt_caller)team->t.t_parent->t.t_stack_id);
7767     }
7768   }
7769 #endif /* USE_ITT_BUILD */
7770   __kmp_run_after_invoked_task(gtid, tid, this_thr, team);
7771 
7772   return rc;
7773 }
7774 
7775 void __kmp_teams_master(int gtid) {
7776   // This routine is called by all primary threads in teams construct
7777   kmp_info_t *thr = __kmp_threads[gtid];
7778   kmp_team_t *team = thr->th.th_team;
7779   ident_t *loc = team->t.t_ident;
7780   thr->th.th_set_nproc = thr->th.th_teams_size.nth;
7781   KMP_DEBUG_ASSERT(thr->th.th_teams_microtask);
7782   KMP_DEBUG_ASSERT(thr->th.th_set_nproc);
7783   KA_TRACE(20, ("__kmp_teams_master: T#%d, Tid %d, microtask %p\n", gtid,
7784                 __kmp_tid_from_gtid(gtid), thr->th.th_teams_microtask));
7785 
7786   // This thread is a new CG root.  Set up the proper variables.
7787   kmp_cg_root_t *tmp = (kmp_cg_root_t *)__kmp_allocate(sizeof(kmp_cg_root_t));
7788   tmp->cg_root = thr; // Make thr the CG root
7789   // Init to thread limit stored when league primary threads were forked
7790   tmp->cg_thread_limit = thr->th.th_current_task->td_icvs.thread_limit;
7791   tmp->cg_nthreads = 1; // Init counter to one active thread, this one
7792   KA_TRACE(100, ("__kmp_teams_master: Thread %p created node %p and init"
7793                  " cg_nthreads to 1\n",
7794                  thr, tmp));
7795   tmp->up = thr->th.th_cg_roots;
7796   thr->th.th_cg_roots = tmp;
7797 
7798 // Launch league of teams now, but not let workers execute
7799 // (they hang on fork barrier until next parallel)
7800 #if INCLUDE_SSC_MARKS
7801   SSC_MARK_FORKING();
7802 #endif
7803   __kmp_fork_call(loc, gtid, fork_context_intel, team->t.t_argc,
7804                   (microtask_t)thr->th.th_teams_microtask, // "wrapped" task
7805                   VOLATILE_CAST(launch_t) __kmp_invoke_task_func, NULL);
7806 #if INCLUDE_SSC_MARKS
7807   SSC_MARK_JOINING();
7808 #endif
7809   // If the team size was reduced from the limit, set it to the new size
7810   if (thr->th.th_team_nproc < thr->th.th_teams_size.nth)
7811     thr->th.th_teams_size.nth = thr->th.th_team_nproc;
7812   // AC: last parameter "1" eliminates join barrier which won't work because
7813   // worker threads are in a fork barrier waiting for more parallel regions
7814   __kmp_join_call(loc, gtid
7815 #if OMPT_SUPPORT
7816                   ,
7817                   fork_context_intel
7818 #endif
7819                   ,
7820                   1);
7821 }
7822 
7823 int __kmp_invoke_teams_master(int gtid) {
7824   kmp_info_t *this_thr = __kmp_threads[gtid];
7825   kmp_team_t *team = this_thr->th.th_team;
7826 #if KMP_DEBUG
7827   if (!__kmp_threads[gtid]->th.th_team->t.t_serialized)
7828     KMP_DEBUG_ASSERT((void *)__kmp_threads[gtid]->th.th_team->t.t_pkfn ==
7829                      (void *)__kmp_teams_master);
7830 #endif
7831   __kmp_run_before_invoked_task(gtid, 0, this_thr, team);
7832 #if OMPT_SUPPORT
7833   int tid = __kmp_tid_from_gtid(gtid);
7834   ompt_data_t *task_data =
7835       &team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data;
7836   ompt_data_t *parallel_data = &team->t.ompt_team_info.parallel_data;
7837   if (ompt_enabled.ompt_callback_implicit_task) {
7838     ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
7839         ompt_scope_begin, parallel_data, task_data, team->t.t_nproc, tid,
7840         ompt_task_initial);
7841     OMPT_CUR_TASK_INFO(this_thr)->thread_num = tid;
7842   }
7843 #endif
7844   __kmp_teams_master(gtid);
7845 #if OMPT_SUPPORT
7846   this_thr->th.ompt_thread_info.parallel_flags = ompt_parallel_league;
7847 #endif
7848   __kmp_run_after_invoked_task(gtid, 0, this_thr, team);
7849   return 1;
7850 }
7851 
7852 /* this sets the requested number of threads for the next parallel region
7853    encountered by this team. since this should be enclosed in the forkjoin
7854    critical section it should avoid race conditions with asymmetrical nested
7855    parallelism */
7856 void __kmp_push_num_threads(ident_t *id, int gtid, int num_threads) {
7857   kmp_info_t *thr = __kmp_threads[gtid];
7858 
7859   if (num_threads > 0)
7860     thr->th.th_set_nproc = num_threads;
7861 }
7862 
7863 void __kmp_push_num_threads_list(ident_t *id, int gtid, kmp_uint32 list_length,
7864                                  int *num_threads_list) {
7865   kmp_info_t *thr = __kmp_threads[gtid];
7866 
7867   KMP_DEBUG_ASSERT(list_length > 1);
7868 
7869   if (num_threads_list[0] > 0)
7870     thr->th.th_set_nproc = num_threads_list[0];
7871   thr->th.th_set_nested_nth =
7872       (int *)KMP_INTERNAL_MALLOC(list_length * sizeof(int));
7873   for (kmp_uint32 i = 0; i < list_length; ++i)
7874     thr->th.th_set_nested_nth[i] = num_threads_list[i];
7875   thr->th.th_set_nested_nth_sz = list_length;
7876 }
7877 
7878 void __kmp_set_strict_num_threads(ident_t *loc, int gtid, int sev,
7879                                   const char *msg) {
7880   kmp_info_t *thr = __kmp_threads[gtid];
7881   thr->th.th_nt_strict = true;
7882   thr->th.th_nt_loc = loc;
7883   // if sev is unset make fatal
7884   if (sev == severity_warning)
7885     thr->th.th_nt_sev = sev;
7886   else
7887     thr->th.th_nt_sev = severity_fatal;
7888   // if msg is unset, use an appropriate message
7889   if (msg)
7890     thr->th.th_nt_msg = msg;
7891   else
7892     thr->th.th_nt_msg = "Cannot form team with number of threads specified by "
7893                         "strict num_threads clause.";
7894 }
7895 
7896 static void __kmp_push_thread_limit(kmp_info_t *thr, int num_teams,
7897                                     int num_threads) {
7898   KMP_DEBUG_ASSERT(thr);
7899   // Remember the number of threads for inner parallel regions
7900   if (!TCR_4(__kmp_init_middle))
7901     __kmp_middle_initialize(); // get internal globals calculated
7902   __kmp_assign_root_init_mask();
7903   KMP_DEBUG_ASSERT(__kmp_avail_proc);
7904   KMP_DEBUG_ASSERT(__kmp_dflt_team_nth);
7905 
7906   if (num_threads == 0) {
7907     if (__kmp_teams_thread_limit > 0) {
7908       num_threads = __kmp_teams_thread_limit;
7909     } else {
7910       num_threads = __kmp_avail_proc / num_teams;
7911     }
7912     // adjust num_threads w/o warning as it is not user setting
7913     // num_threads = min(num_threads, nthreads-var, thread-limit-var)
7914     // no thread_limit clause specified -  do not change thread-limit-var ICV
7915     if (num_threads > __kmp_dflt_team_nth) {
7916       num_threads = __kmp_dflt_team_nth; // honor nthreads-var ICV
7917     }
7918     if (num_threads > thr->th.th_current_task->td_icvs.thread_limit) {
7919       num_threads = thr->th.th_current_task->td_icvs.thread_limit;
7920     } // prevent team size to exceed thread-limit-var
7921     if (num_teams * num_threads > __kmp_teams_max_nth) {
7922       num_threads = __kmp_teams_max_nth / num_teams;
7923     }
7924     if (num_threads == 0) {
7925       num_threads = 1;
7926     }
7927   } else {
7928     if (num_threads < 0) {
7929       __kmp_msg(kmp_ms_warning, KMP_MSG(CantFormThrTeam, num_threads, 1),
7930                 __kmp_msg_null);
7931       num_threads = 1;
7932     }
7933     // This thread will be the primary thread of the league primary threads
7934     // Store new thread limit; old limit is saved in th_cg_roots list
7935     thr->th.th_current_task->td_icvs.thread_limit = num_threads;
7936     // num_threads = min(num_threads, nthreads-var)
7937     if (num_threads > __kmp_dflt_team_nth) {
7938       num_threads = __kmp_dflt_team_nth; // honor nthreads-var ICV
7939     }
7940     if (num_teams * num_threads > __kmp_teams_max_nth) {
7941       int new_threads = __kmp_teams_max_nth / num_teams;
7942       if (new_threads == 0) {
7943         new_threads = 1;
7944       }
7945       if (new_threads != num_threads) {
7946         if (!__kmp_reserve_warn) { // user asked for too many threads
7947           __kmp_reserve_warn = 1; // conflicts with KMP_TEAMS_THREAD_LIMIT
7948           __kmp_msg(kmp_ms_warning,
7949                     KMP_MSG(CantFormThrTeam, num_threads, new_threads),
7950                     KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
7951         }
7952       }
7953       num_threads = new_threads;
7954     }
7955   }
7956   thr->th.th_teams_size.nth = num_threads;
7957 }
7958 
7959 /* this sets the requested number of teams for the teams region and/or
7960    the number of threads for the next parallel region encountered  */
7961 void __kmp_push_num_teams(ident_t *id, int gtid, int num_teams,
7962                           int num_threads) {
7963   kmp_info_t *thr = __kmp_threads[gtid];
7964   if (num_teams < 0) {
7965     // OpenMP specification requires requested values to be positive,
7966     // but people can send us any value, so we'd better check
7967     __kmp_msg(kmp_ms_warning, KMP_MSG(NumTeamsNotPositive, num_teams, 1),
7968               __kmp_msg_null);
7969     num_teams = 1;
7970   }
7971   if (num_teams == 0) {
7972     if (__kmp_nteams > 0) {
7973       num_teams = __kmp_nteams;
7974     } else {
7975       num_teams = 1; // default number of teams is 1.
7976     }
7977   }
7978   if (num_teams > __kmp_teams_max_nth) { // if too many teams requested?
7979     if (!__kmp_reserve_warn) {
7980       __kmp_reserve_warn = 1;
7981       __kmp_msg(kmp_ms_warning,
7982                 KMP_MSG(CantFormThrTeam, num_teams, __kmp_teams_max_nth),
7983                 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
7984     }
7985     num_teams = __kmp_teams_max_nth;
7986   }
7987   // Set number of teams (number of threads in the outer "parallel" of the
7988   // teams)
7989   thr->th.th_set_nproc = thr->th.th_teams_size.nteams = num_teams;
7990 
7991   __kmp_push_thread_limit(thr, num_teams, num_threads);
7992 }
7993 
7994 /* This sets the requested number of teams for the teams region and/or
7995    the number of threads for the next parallel region encountered  */
7996 void __kmp_push_num_teams_51(ident_t *id, int gtid, int num_teams_lb,
7997                              int num_teams_ub, int num_threads) {
7998   kmp_info_t *thr = __kmp_threads[gtid];
7999   KMP_DEBUG_ASSERT(num_teams_lb >= 0 && num_teams_ub >= 0);
8000   KMP_DEBUG_ASSERT(num_teams_ub >= num_teams_lb);
8001   KMP_DEBUG_ASSERT(num_threads >= 0);
8002 
8003   if (num_teams_lb > num_teams_ub) {
8004     __kmp_fatal(KMP_MSG(FailedToCreateTeam, num_teams_lb, num_teams_ub),
8005                 KMP_HNT(SetNewBound, __kmp_teams_max_nth), __kmp_msg_null);
8006   }
8007 
8008   int num_teams = 1; // defalt number of teams is 1.
8009 
8010   if (num_teams_lb == 0 && num_teams_ub > 0)
8011     num_teams_lb = num_teams_ub;
8012 
8013   if (num_teams_lb == 0 && num_teams_ub == 0) { // no num_teams clause
8014     num_teams = (__kmp_nteams > 0) ? __kmp_nteams : num_teams;
8015     if (num_teams > __kmp_teams_max_nth) {
8016       if (!__kmp_reserve_warn) {
8017         __kmp_reserve_warn = 1;
8018         __kmp_msg(kmp_ms_warning,
8019                   KMP_MSG(CantFormThrTeam, num_teams, __kmp_teams_max_nth),
8020                   KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
8021       }
8022       num_teams = __kmp_teams_max_nth;
8023     }
8024   } else if (num_teams_lb == num_teams_ub) { // requires exact number of teams
8025     num_teams = num_teams_ub;
8026   } else { // num_teams_lb <= num_teams <= num_teams_ub
8027     if (num_threads <= 0) {
8028       if (num_teams_ub > __kmp_teams_max_nth) {
8029         num_teams = num_teams_lb;
8030       } else {
8031         num_teams = num_teams_ub;
8032       }
8033     } else {
8034       num_teams = (num_threads > __kmp_teams_max_nth)
8035                       ? num_teams
8036                       : __kmp_teams_max_nth / num_threads;
8037       if (num_teams < num_teams_lb) {
8038         num_teams = num_teams_lb;
8039       } else if (num_teams > num_teams_ub) {
8040         num_teams = num_teams_ub;
8041       }
8042     }
8043   }
8044   // Set number of teams (number of threads in the outer "parallel" of the
8045   // teams)
8046   thr->th.th_set_nproc = thr->th.th_teams_size.nteams = num_teams;
8047 
8048   __kmp_push_thread_limit(thr, num_teams, num_threads);
8049 }
8050 
8051 // Set the proc_bind var to use in the following parallel region.
8052 void __kmp_push_proc_bind(ident_t *id, int gtid, kmp_proc_bind_t proc_bind) {
8053   kmp_info_t *thr = __kmp_threads[gtid];
8054   thr->th.th_set_proc_bind = proc_bind;
8055 }
8056 
8057 /* Launch the worker threads into the microtask. */
8058 
8059 void __kmp_internal_fork(ident_t *id, int gtid, kmp_team_t *team) {
8060   kmp_info_t *this_thr = __kmp_threads[gtid];
8061 
8062 #ifdef KMP_DEBUG
8063   int f;
8064 #endif /* KMP_DEBUG */
8065 
8066   KMP_DEBUG_ASSERT(team);
8067   KMP_DEBUG_ASSERT(this_thr->th.th_team == team);
8068   KMP_ASSERT(KMP_MASTER_GTID(gtid));
8069   KMP_MB(); /* Flush all pending memory write invalidates.  */
8070 
8071   team->t.t_construct = 0; /* no single directives seen yet */
8072   team->t.t_ordered.dt.t_value =
8073       0; /* thread 0 enters the ordered section first */
8074 
8075   /* Reset the identifiers on the dispatch buffer */
8076   KMP_DEBUG_ASSERT(team->t.t_disp_buffer);
8077   if (team->t.t_max_nproc > 1) {
8078     int i;
8079     for (i = 0; i < __kmp_dispatch_num_buffers; ++i) {
8080       team->t.t_disp_buffer[i].buffer_index = i;
8081       team->t.t_disp_buffer[i].doacross_buf_idx = i;
8082     }
8083   } else {
8084     team->t.t_disp_buffer[0].buffer_index = 0;
8085     team->t.t_disp_buffer[0].doacross_buf_idx = 0;
8086   }
8087 
8088   KMP_MB(); /* Flush all pending memory write invalidates.  */
8089   KMP_ASSERT(this_thr->th.th_team == team);
8090 
8091 #ifdef KMP_DEBUG
8092   for (f = 0; f < team->t.t_nproc; f++) {
8093     KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
8094                      team->t.t_threads[f]->th.th_team_nproc == team->t.t_nproc);
8095   }
8096 #endif /* KMP_DEBUG */
8097 
8098   /* release the worker threads so they may begin working */
8099   __kmp_fork_barrier(gtid, 0);
8100 }
8101 
8102 void __kmp_internal_join(ident_t *id, int gtid, kmp_team_t *team) {
8103   kmp_info_t *this_thr = __kmp_threads[gtid];
8104 
8105   KMP_DEBUG_ASSERT(team);
8106   KMP_DEBUG_ASSERT(this_thr->th.th_team == team);
8107   KMP_ASSERT(KMP_MASTER_GTID(gtid));
8108   KMP_MB(); /* Flush all pending memory write invalidates.  */
8109 
8110   /* Join barrier after fork */
8111 
8112 #ifdef KMP_DEBUG
8113   if (__kmp_threads[gtid] &&
8114       __kmp_threads[gtid]->th.th_team_nproc != team->t.t_nproc) {
8115     __kmp_printf("GTID: %d, __kmp_threads[%d]=%p\n", gtid, gtid,
8116                  __kmp_threads[gtid]);
8117     __kmp_printf("__kmp_threads[%d]->th.th_team_nproc=%d, TEAM: %p, "
8118                  "team->t.t_nproc=%d\n",
8119                  gtid, __kmp_threads[gtid]->th.th_team_nproc, team,
8120                  team->t.t_nproc);
8121     __kmp_print_structure();
8122   }
8123   KMP_DEBUG_ASSERT(__kmp_threads[gtid] &&
8124                    __kmp_threads[gtid]->th.th_team_nproc == team->t.t_nproc);
8125 #endif /* KMP_DEBUG */
8126 
8127   __kmp_join_barrier(gtid); /* wait for everyone */
8128 #if OMPT_SUPPORT
8129   ompt_state_t ompt_state = this_thr->th.ompt_thread_info.state;
8130   if (ompt_enabled.enabled &&
8131       (ompt_state == ompt_state_wait_barrier_teams ||
8132        ompt_state == ompt_state_wait_barrier_implicit_parallel)) {
8133     int ds_tid = this_thr->th.th_info.ds.ds_tid;
8134     ompt_data_t *task_data = OMPT_CUR_TASK_DATA(this_thr);
8135     this_thr->th.ompt_thread_info.state = ompt_state_overhead;
8136 #if OMPT_OPTIONAL
8137     void *codeptr = NULL;
8138     if (KMP_MASTER_TID(ds_tid) &&
8139         (ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) ||
8140          ompt_callbacks.ompt_callback(ompt_callback_sync_region)))
8141       codeptr = OMPT_CUR_TEAM_INFO(this_thr)->master_return_address;
8142 
8143     ompt_sync_region_t sync_kind = ompt_sync_region_barrier_implicit_parallel;
8144     if (this_thr->th.ompt_thread_info.parallel_flags & ompt_parallel_league)
8145       sync_kind = ompt_sync_region_barrier_teams;
8146     if (ompt_enabled.ompt_callback_sync_region_wait) {
8147       ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
8148           sync_kind, ompt_scope_end, NULL, task_data, codeptr);
8149     }
8150     if (ompt_enabled.ompt_callback_sync_region) {
8151       ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
8152           sync_kind, ompt_scope_end, NULL, task_data, codeptr);
8153     }
8154 #endif
8155     if (!KMP_MASTER_TID(ds_tid) && ompt_enabled.ompt_callback_implicit_task) {
8156       ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
8157           ompt_scope_end, NULL, task_data, 0, ds_tid,
8158           ompt_task_implicit); // TODO: Can this be ompt_task_initial?
8159     }
8160   }
8161 #endif
8162 
8163   KMP_MB(); /* Flush all pending memory write invalidates.  */
8164   KMP_ASSERT(this_thr->th.th_team == team);
8165 }
8166 
8167 /* ------------------------------------------------------------------------ */
8168 
8169 #ifdef USE_LOAD_BALANCE
8170 
8171 // Return the worker threads actively spinning in the hot team, if we
8172 // are at the outermost level of parallelism.  Otherwise, return 0.
8173 static int __kmp_active_hot_team_nproc(kmp_root_t *root) {
8174   int i;
8175   int retval;
8176   kmp_team_t *hot_team;
8177 
8178   if (root->r.r_active) {
8179     return 0;
8180   }
8181   hot_team = root->r.r_hot_team;
8182   if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME) {
8183     return hot_team->t.t_nproc - 1; // Don't count primary thread
8184   }
8185 
8186   // Skip the primary thread - it is accounted for elsewhere.
8187   retval = 0;
8188   for (i = 1; i < hot_team->t.t_nproc; i++) {
8189     if (hot_team->t.t_threads[i]->th.th_active) {
8190       retval++;
8191     }
8192   }
8193   return retval;
8194 }
8195 
8196 // Perform an automatic adjustment to the number of
8197 // threads used by the next parallel region.
8198 static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc) {
8199   int retval;
8200   int pool_active;
8201   int hot_team_active;
8202   int team_curr_active;
8203   int system_active;
8204 
8205   KB_TRACE(20, ("__kmp_load_balance_nproc: called root:%p set_nproc:%d\n", root,
8206                 set_nproc));
8207   KMP_DEBUG_ASSERT(root);
8208   KMP_DEBUG_ASSERT(root->r.r_root_team->t.t_threads[0]
8209                        ->th.th_current_task->td_icvs.dynamic == TRUE);
8210   KMP_DEBUG_ASSERT(set_nproc > 1);
8211 
8212   if (set_nproc == 1) {
8213     KB_TRACE(20, ("__kmp_load_balance_nproc: serial execution.\n"));
8214     return 1;
8215   }
8216 
8217   // Threads that are active in the thread pool, active in the hot team for this
8218   // particular root (if we are at the outer par level), and the currently
8219   // executing thread (to become the primary thread) are available to add to the
8220   // new team, but are currently contributing to the system load, and must be
8221   // accounted for.
8222   pool_active = __kmp_thread_pool_active_nth;
8223   hot_team_active = __kmp_active_hot_team_nproc(root);
8224   team_curr_active = pool_active + hot_team_active + 1;
8225 
8226   // Check the system load.
8227   system_active = __kmp_get_load_balance(__kmp_avail_proc + team_curr_active);
8228   KB_TRACE(30, ("__kmp_load_balance_nproc: system active = %d pool active = %d "
8229                 "hot team active = %d\n",
8230                 system_active, pool_active, hot_team_active));
8231 
8232   if (system_active < 0) {
8233     // There was an error reading the necessary info from /proc, so use the
8234     // thread limit algorithm instead. Once we set __kmp_global.g.g_dynamic_mode
8235     // = dynamic_thread_limit, we shouldn't wind up getting back here.
8236     __kmp_global.g.g_dynamic_mode = dynamic_thread_limit;
8237     KMP_WARNING(CantLoadBalUsing, "KMP_DYNAMIC_MODE=thread limit");
8238 
8239     // Make this call behave like the thread limit algorithm.
8240     retval = __kmp_avail_proc - __kmp_nth +
8241              (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
8242     if (retval > set_nproc) {
8243       retval = set_nproc;
8244     }
8245     if (retval < KMP_MIN_NTH) {
8246       retval = KMP_MIN_NTH;
8247     }
8248 
8249     KB_TRACE(20, ("__kmp_load_balance_nproc: thread limit exit. retval:%d\n",
8250                   retval));
8251     return retval;
8252   }
8253 
8254   // There is a slight delay in the load balance algorithm in detecting new
8255   // running procs. The real system load at this instant should be at least as
8256   // large as the #active omp thread that are available to add to the team.
8257   if (system_active < team_curr_active) {
8258     system_active = team_curr_active;
8259   }
8260   retval = __kmp_avail_proc - system_active + team_curr_active;
8261   if (retval > set_nproc) {
8262     retval = set_nproc;
8263   }
8264   if (retval < KMP_MIN_NTH) {
8265     retval = KMP_MIN_NTH;
8266   }
8267 
8268   KB_TRACE(20, ("__kmp_load_balance_nproc: exit. retval:%d\n", retval));
8269   return retval;
8270 } // __kmp_load_balance_nproc()
8271 
8272 #endif /* USE_LOAD_BALANCE */
8273 
8274 /* ------------------------------------------------------------------------ */
8275 
8276 /* NOTE: this is called with the __kmp_init_lock held */
8277 void __kmp_cleanup(void) {
8278   int f;
8279 
8280   KA_TRACE(10, ("__kmp_cleanup: enter\n"));
8281 
8282   if (TCR_4(__kmp_init_parallel)) {
8283 #if KMP_HANDLE_SIGNALS
8284     __kmp_remove_signals();
8285 #endif
8286     TCW_4(__kmp_init_parallel, FALSE);
8287   }
8288 
8289   if (TCR_4(__kmp_init_middle)) {
8290 #if KMP_AFFINITY_SUPPORTED
8291     __kmp_affinity_uninitialize();
8292 #endif /* KMP_AFFINITY_SUPPORTED */
8293     __kmp_cleanup_hierarchy();
8294     TCW_4(__kmp_init_middle, FALSE);
8295   }
8296 
8297   KA_TRACE(10, ("__kmp_cleanup: go serial cleanup\n"));
8298 
8299   if (__kmp_init_serial) {
8300     __kmp_runtime_destroy();
8301     __kmp_init_serial = FALSE;
8302   }
8303 
8304   __kmp_cleanup_threadprivate_caches();
8305 
8306   for (f = 0; f < __kmp_threads_capacity; f++) {
8307     if (__kmp_root[f] != NULL) {
8308       __kmp_free(__kmp_root[f]);
8309       __kmp_root[f] = NULL;
8310     }
8311   }
8312   __kmp_free(__kmp_threads);
8313   // __kmp_threads and __kmp_root were allocated at once, as single block, so
8314   // there is no need in freeing __kmp_root.
8315   __kmp_threads = NULL;
8316   __kmp_root = NULL;
8317   __kmp_threads_capacity = 0;
8318 
8319   // Free old __kmp_threads arrays if they exist.
8320   kmp_old_threads_list_t *ptr = __kmp_old_threads_list;
8321   while (ptr) {
8322     kmp_old_threads_list_t *next = ptr->next;
8323     __kmp_free(ptr->threads);
8324     __kmp_free(ptr);
8325     ptr = next;
8326   }
8327 
8328 #if KMP_USE_DYNAMIC_LOCK
8329   __kmp_cleanup_indirect_user_locks();
8330 #else
8331   __kmp_cleanup_user_locks();
8332 #endif
8333 #if OMPD_SUPPORT
8334   if (ompd_state) {
8335     __kmp_free(ompd_env_block);
8336     ompd_env_block = NULL;
8337     ompd_env_block_size = 0;
8338   }
8339 #endif
8340 
8341 #if KMP_AFFINITY_SUPPORTED
8342   KMP_INTERNAL_FREE(CCAST(char *, __kmp_cpuinfo_file));
8343   __kmp_cpuinfo_file = NULL;
8344 #endif /* KMP_AFFINITY_SUPPORTED */
8345 
8346 #if KMP_USE_ADAPTIVE_LOCKS
8347 #if KMP_DEBUG_ADAPTIVE_LOCKS
8348   __kmp_print_speculative_stats();
8349 #endif
8350 #endif
8351   KMP_INTERNAL_FREE(__kmp_nested_nth.nth);
8352   __kmp_nested_nth.nth = NULL;
8353   __kmp_nested_nth.size = 0;
8354   __kmp_nested_nth.used = 0;
8355 
8356   KMP_INTERNAL_FREE(__kmp_nested_proc_bind.bind_types);
8357   __kmp_nested_proc_bind.bind_types = NULL;
8358   __kmp_nested_proc_bind.size = 0;
8359   __kmp_nested_proc_bind.used = 0;
8360   if (__kmp_affinity_format) {
8361     KMP_INTERNAL_FREE(__kmp_affinity_format);
8362     __kmp_affinity_format = NULL;
8363   }
8364 
8365   __kmp_i18n_catclose();
8366 
8367 #if KMP_USE_HIER_SCHED
8368   __kmp_hier_scheds.deallocate();
8369 #endif
8370 
8371 #if KMP_STATS_ENABLED
8372   __kmp_stats_fini();
8373 #endif
8374 
8375   KA_TRACE(10, ("__kmp_cleanup: exit\n"));
8376 }
8377 
8378 /* ------------------------------------------------------------------------ */
8379 
8380 int __kmp_ignore_mppbeg(void) {
8381   char *env;
8382 
8383   if ((env = getenv("KMP_IGNORE_MPPBEG")) != NULL) {
8384     if (__kmp_str_match_false(env))
8385       return FALSE;
8386   }
8387   // By default __kmpc_begin() is no-op.
8388   return TRUE;
8389 }
8390 
8391 int __kmp_ignore_mppend(void) {
8392   char *env;
8393 
8394   if ((env = getenv("KMP_IGNORE_MPPEND")) != NULL) {
8395     if (__kmp_str_match_false(env))
8396       return FALSE;
8397   }
8398   // By default __kmpc_end() is no-op.
8399   return TRUE;
8400 }
8401 
8402 void __kmp_internal_begin(void) {
8403   int gtid;
8404   kmp_root_t *root;
8405 
8406   /* this is a very important step as it will register new sibling threads
8407      and assign these new uber threads a new gtid */
8408   gtid = __kmp_entry_gtid();
8409   root = __kmp_threads[gtid]->th.th_root;
8410   KMP_ASSERT(KMP_UBER_GTID(gtid));
8411 
8412   if (root->r.r_begin)
8413     return;
8414   __kmp_acquire_lock(&root->r.r_begin_lock, gtid);
8415   if (root->r.r_begin) {
8416     __kmp_release_lock(&root->r.r_begin_lock, gtid);
8417     return;
8418   }
8419 
8420   root->r.r_begin = TRUE;
8421 
8422   __kmp_release_lock(&root->r.r_begin_lock, gtid);
8423 }
8424 
8425 /* ------------------------------------------------------------------------ */
8426 
8427 void __kmp_user_set_library(enum library_type arg) {
8428   int gtid;
8429   kmp_root_t *root;
8430   kmp_info_t *thread;
8431 
8432   /* first, make sure we are initialized so we can get our gtid */
8433 
8434   gtid = __kmp_entry_gtid();
8435   thread = __kmp_threads[gtid];
8436 
8437   root = thread->th.th_root;
8438 
8439   KA_TRACE(20, ("__kmp_user_set_library: enter T#%d, arg: %d, %d\n", gtid, arg,
8440                 library_serial));
8441   if (root->r.r_in_parallel) { /* Must be called in serial section of top-level
8442                                   thread */
8443     KMP_WARNING(SetLibraryIncorrectCall);
8444     return;
8445   }
8446 
8447   switch (arg) {
8448   case library_serial:
8449     thread->th.th_set_nproc = 0;
8450     set__nproc(thread, 1);
8451     break;
8452   case library_turnaround:
8453     thread->th.th_set_nproc = 0;
8454     set__nproc(thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth
8455                                            : __kmp_dflt_team_nth_ub);
8456     break;
8457   case library_throughput:
8458     thread->th.th_set_nproc = 0;
8459     set__nproc(thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth
8460                                            : __kmp_dflt_team_nth_ub);
8461     break;
8462   default:
8463     KMP_FATAL(UnknownLibraryType, arg);
8464   }
8465 
8466   __kmp_aux_set_library(arg);
8467 }
8468 
8469 void __kmp_aux_set_stacksize(size_t arg) {
8470   if (!__kmp_init_serial)
8471     __kmp_serial_initialize();
8472 
8473 #if KMP_OS_DARWIN
8474   if (arg & (0x1000 - 1)) {
8475     arg &= ~(0x1000 - 1);
8476     if (arg + 0x1000) /* check for overflow if we round up */
8477       arg += 0x1000;
8478   }
8479 #endif
8480   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
8481 
8482   /* only change the default stacksize before the first parallel region */
8483   if (!TCR_4(__kmp_init_parallel)) {
8484     size_t value = arg; /* argument is in bytes */
8485 
8486     if (value < __kmp_sys_min_stksize)
8487       value = __kmp_sys_min_stksize;
8488     else if (value > KMP_MAX_STKSIZE)
8489       value = KMP_MAX_STKSIZE;
8490 
8491     __kmp_stksize = value;
8492 
8493     __kmp_env_stksize = TRUE; /* was KMP_STACKSIZE specified? */
8494   }
8495 
8496   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
8497 }
8498 
8499 /* set the behaviour of the runtime library */
8500 /* TODO this can cause some odd behaviour with sibling parallelism... */
8501 void __kmp_aux_set_library(enum library_type arg) {
8502   __kmp_library = arg;
8503 
8504   switch (__kmp_library) {
8505   case library_serial: {
8506     KMP_INFORM(LibraryIsSerial);
8507   } break;
8508   case library_turnaround:
8509     if (__kmp_use_yield == 1 && !__kmp_use_yield_exp_set)
8510       __kmp_use_yield = 2; // only yield when oversubscribed
8511     break;
8512   case library_throughput:
8513     if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME)
8514       __kmp_dflt_blocktime = KMP_DEFAULT_BLOCKTIME;
8515     break;
8516   default:
8517     KMP_FATAL(UnknownLibraryType, arg);
8518   }
8519 }
8520 
8521 /* Getting team information common for all team API */
8522 // Returns NULL if not in teams construct
8523 static kmp_team_t *__kmp_aux_get_team_info(int &teams_serialized) {
8524   kmp_info_t *thr = __kmp_entry_thread();
8525   teams_serialized = 0;
8526   if (thr->th.th_teams_microtask) {
8527     kmp_team_t *team = thr->th.th_team;
8528     int tlevel = thr->th.th_teams_level; // the level of the teams construct
8529     int ii = team->t.t_level;
8530     teams_serialized = team->t.t_serialized;
8531     int level = tlevel + 1;
8532     KMP_DEBUG_ASSERT(ii >= tlevel);
8533     while (ii > level) {
8534       for (teams_serialized = team->t.t_serialized;
8535            (teams_serialized > 0) && (ii > level); teams_serialized--, ii--) {
8536       }
8537       if (team->t.t_serialized && (!teams_serialized)) {
8538         team = team->t.t_parent;
8539         continue;
8540       }
8541       if (ii > level) {
8542         team = team->t.t_parent;
8543         ii--;
8544       }
8545     }
8546     return team;
8547   }
8548   return NULL;
8549 }
8550 
8551 int __kmp_aux_get_team_num() {
8552   int serialized;
8553   kmp_team_t *team = __kmp_aux_get_team_info(serialized);
8554   if (team) {
8555     if (serialized > 1) {
8556       return 0; // teams region is serialized ( 1 team of 1 thread ).
8557     } else {
8558       return team->t.t_master_tid;
8559     }
8560   }
8561   return 0;
8562 }
8563 
8564 int __kmp_aux_get_num_teams() {
8565   int serialized;
8566   kmp_team_t *team = __kmp_aux_get_team_info(serialized);
8567   if (team) {
8568     if (serialized > 1) {
8569       return 1;
8570     } else {
8571       return team->t.t_parent->t.t_nproc;
8572     }
8573   }
8574   return 1;
8575 }
8576 
8577 /* ------------------------------------------------------------------------ */
8578 
8579 /*
8580  * Affinity Format Parser
8581  *
8582  * Field is in form of: %[[[0].]size]type
8583  * % and type are required (%% means print a literal '%')
8584  * type is either single char or long name surrounded by {},
8585  * e.g., N or {num_threads}
8586  * 0 => leading zeros
8587  * . => right justified when size is specified
8588  * by default output is left justified
8589  * size is the *minimum* field length
8590  * All other characters are printed as is
8591  *
 * Available field types (short name, {long name}):
 * t {team_num}          - omp_get_team_num()
 * T {num_teams}         - omp_get_num_teams()
 * L {nesting_level}     - omp_get_level()
 * n {thread_num}        - omp_get_thread_num()
 * N {num_threads}       - omp_get_num_threads()
 * a {ancestor_tnum}     - omp_get_ancestor_thread_num(omp_get_level()-1)
 * H {host}              - name of host machine
 * P {process_id}        - process id (integer)
 * i {native_thread_id}  - native thread identifier (integer)
 * A {thread_affinity}   - comma separated list of integers or integer ranges
 *                         (values of affinity mask); only available when
 *                         affinity is supported
8602  *
8603  * Implementation-specific field types can be added
8604  * If a type is unknown, print "undefined"
8605  */
8606 
// One entry of the affinity-format keyword table: the short (single
// character) name, the long name used inside {}, and the printf-style
// conversion character used to print the value. A table of these represents
// the entire set of valid keyword field types.
typedef struct kmp_affinity_format_field_t {
  char short_name; // from spec e.g., L -> nesting level
  const char *long_name; // from spec e.g., nesting_level -> nesting level
  char field_format; // conversion character for snprintf (typically 'd' for
  // an integer or 's' for a string)
} kmp_affinity_format_field_t;
8616 
// Table of every recognized affinity-format field: {short name, long name,
// printf conversion}. The parser scans it linearly and stops at the first
// match. The thread_affinity entry exists only when affinity is supported.
static const kmp_affinity_format_field_t __kmp_affinity_format_table[] = {
#if KMP_AFFINITY_SUPPORTED
    {'A', "thread_affinity", 's'},
#endif
    {'t', "team_num", 'd'},
    {'T', "num_teams", 'd'},
    {'L', "nesting_level", 'd'},
    {'n', "thread_num", 'd'},
    {'N', "num_threads", 'd'},
    {'a', "ancestor_tnum", 'd'},
    {'H', "host", 's'},
    {'P', "process_id", 'd'},
    {'i', "native_thread_id", 'd'}};
8630 
8631 // Return the number of characters it takes to hold field
8632 static int __kmp_aux_capture_affinity_field(int gtid, const kmp_info_t *th,
8633                                             const char **ptr,
8634                                             kmp_str_buf_t *field_buffer) {
8635   int rc, format_index, field_value;
8636   const char *width_left, *width_right;
8637   bool pad_zeros, right_justify, parse_long_name, found_valid_name;
8638   static const int FORMAT_SIZE = 20;
8639   char format[FORMAT_SIZE] = {0};
8640   char absolute_short_name = 0;
8641 
8642   KMP_DEBUG_ASSERT(gtid >= 0);
8643   KMP_DEBUG_ASSERT(th);
8644   KMP_DEBUG_ASSERT(**ptr == '%');
8645   KMP_DEBUG_ASSERT(field_buffer);
8646 
8647   __kmp_str_buf_clear(field_buffer);
8648 
8649   // Skip the initial %
8650   (*ptr)++;
8651 
8652   // Check for %% first
8653   if (**ptr == '%') {
8654     __kmp_str_buf_cat(field_buffer, "%", 1);
8655     (*ptr)++; // skip over the second %
8656     return 1;
8657   }
8658 
8659   // Parse field modifiers if they are present
8660   pad_zeros = false;
8661   if (**ptr == '0') {
8662     pad_zeros = true;
8663     (*ptr)++; // skip over 0
8664   }
8665   right_justify = false;
8666   if (**ptr == '.') {
8667     right_justify = true;
8668     (*ptr)++; // skip over .
8669   }
8670   // Parse width of field: [width_left, width_right)
8671   width_left = width_right = NULL;
8672   if (**ptr >= '0' && **ptr <= '9') {
8673     width_left = *ptr;
8674     SKIP_DIGITS(*ptr);
8675     width_right = *ptr;
8676   }
8677 
8678   // Create the format for KMP_SNPRINTF based on flags parsed above
8679   format_index = 0;
8680   format[format_index++] = '%';
8681   if (!right_justify)
8682     format[format_index++] = '-';
8683   if (pad_zeros)
8684     format[format_index++] = '0';
8685   if (width_left && width_right) {
8686     int i = 0;
8687     // Only allow 8 digit number widths.
8688     // This also prevents overflowing format variable
8689     while (i < 8 && width_left < width_right) {
8690       format[format_index++] = *width_left;
8691       width_left++;
8692       i++;
8693     }
8694   }
8695 
8696   // Parse a name (long or short)
8697   // Canonicalize the name into absolute_short_name
8698   found_valid_name = false;
8699   parse_long_name = (**ptr == '{');
8700   if (parse_long_name)
8701     (*ptr)++; // skip initial left brace
8702   for (size_t i = 0; i < sizeof(__kmp_affinity_format_table) /
8703                              sizeof(__kmp_affinity_format_table[0]);
8704        ++i) {
8705     char short_name = __kmp_affinity_format_table[i].short_name;
8706     const char *long_name = __kmp_affinity_format_table[i].long_name;
8707     char field_format = __kmp_affinity_format_table[i].field_format;
8708     if (parse_long_name) {
8709       size_t length = KMP_STRLEN(long_name);
8710       if (strncmp(*ptr, long_name, length) == 0) {
8711         found_valid_name = true;
8712         (*ptr) += length; // skip the long name
8713       }
8714     } else if (**ptr == short_name) {
8715       found_valid_name = true;
8716       (*ptr)++; // skip the short name
8717     }
8718     if (found_valid_name) {
8719       format[format_index++] = field_format;
8720       format[format_index++] = '\0';
8721       absolute_short_name = short_name;
8722       break;
8723     }
8724   }
8725   if (parse_long_name) {
8726     if (**ptr != '}') {
8727       absolute_short_name = 0;
8728     } else {
8729       (*ptr)++; // skip over the right brace
8730     }
8731   }
8732 
8733   // Attempt to fill the buffer with the requested
8734   // value using snprintf within __kmp_str_buf_print()
8735   switch (absolute_short_name) {
8736   case 't':
8737     rc = __kmp_str_buf_print(field_buffer, format, __kmp_aux_get_team_num());
8738     break;
8739   case 'T':
8740     rc = __kmp_str_buf_print(field_buffer, format, __kmp_aux_get_num_teams());
8741     break;
8742   case 'L':
8743     rc = __kmp_str_buf_print(field_buffer, format, th->th.th_team->t.t_level);
8744     break;
8745   case 'n':
8746     rc = __kmp_str_buf_print(field_buffer, format, __kmp_tid_from_gtid(gtid));
8747     break;
8748   case 'H': {
8749     static const int BUFFER_SIZE = 256;
8750     char buf[BUFFER_SIZE];
8751     __kmp_expand_host_name(buf, BUFFER_SIZE);
8752     rc = __kmp_str_buf_print(field_buffer, format, buf);
8753   } break;
8754   case 'P':
8755     rc = __kmp_str_buf_print(field_buffer, format, getpid());
8756     break;
8757   case 'i':
8758     rc = __kmp_str_buf_print(field_buffer, format, __kmp_gettid());
8759     break;
8760   case 'N':
8761     rc = __kmp_str_buf_print(field_buffer, format, th->th.th_team->t.t_nproc);
8762     break;
8763   case 'a':
8764     field_value =
8765         __kmp_get_ancestor_thread_num(gtid, th->th.th_team->t.t_level - 1);
8766     rc = __kmp_str_buf_print(field_buffer, format, field_value);
8767     break;
8768 #if KMP_AFFINITY_SUPPORTED
8769   case 'A': {
8770     kmp_str_buf_t buf;
8771     __kmp_str_buf_init(&buf);
8772     __kmp_affinity_str_buf_mask(&buf, th->th.th_affin_mask);
8773     rc = __kmp_str_buf_print(field_buffer, format, buf.str);
8774     __kmp_str_buf_free(&buf);
8775   } break;
8776 #endif
8777   default:
8778     // According to spec, If an implementation does not have info for field
8779     // type, then "undefined" is printed
8780     rc = __kmp_str_buf_print(field_buffer, "%s", "undefined");
8781     // Skip the field
8782     if (parse_long_name) {
8783       SKIP_TOKEN(*ptr);
8784       if (**ptr == '}')
8785         (*ptr)++;
8786     } else {
8787       (*ptr)++;
8788     }
8789   }
8790 
8791   KMP_ASSERT(format_index <= FORMAT_SIZE);
8792   return rc;
8793 }
8794 
8795 /*
8796  * Return number of characters needed to hold the affinity string
8797  * (not including null byte character)
8798  * The resultant string is printed to buffer, which the caller can then
8799  * handle afterwards
8800  */
8801 size_t __kmp_aux_capture_affinity(int gtid, const char *format,
8802                                   kmp_str_buf_t *buffer) {
8803   const char *parse_ptr;
8804   size_t retval;
8805   const kmp_info_t *th;
8806   kmp_str_buf_t field;
8807 
8808   KMP_DEBUG_ASSERT(buffer);
8809   KMP_DEBUG_ASSERT(gtid >= 0);
8810 
8811   __kmp_str_buf_init(&field);
8812   __kmp_str_buf_clear(buffer);
8813 
8814   th = __kmp_threads[gtid];
8815   retval = 0;
8816 
8817   // If format is NULL or zero-length string, then we use
8818   // affinity-format-var ICV
8819   parse_ptr = format;
8820   if (parse_ptr == NULL || *parse_ptr == '\0') {
8821     parse_ptr = __kmp_affinity_format;
8822   }
8823   KMP_DEBUG_ASSERT(parse_ptr);
8824 
8825   while (*parse_ptr != '\0') {
8826     // Parse a field
8827     if (*parse_ptr == '%') {
8828       // Put field in the buffer
8829       int rc = __kmp_aux_capture_affinity_field(gtid, th, &parse_ptr, &field);
8830       __kmp_str_buf_catbuf(buffer, &field);
8831       retval += rc;
8832     } else {
8833       // Put literal character in buffer
8834       __kmp_str_buf_cat(buffer, parse_ptr, 1);
8835       retval++;
8836       parse_ptr++;
8837     }
8838   }
8839   __kmp_str_buf_free(&field);
8840   return retval;
8841 }
8842 
8843 // Displays the affinity string to stdout
8844 void __kmp_aux_display_affinity(int gtid, const char *format) {
8845   kmp_str_buf_t buf;
8846   __kmp_str_buf_init(&buf);
8847   __kmp_aux_capture_affinity(gtid, format, &buf);
8848   __kmp_fprintf(kmp_out, "%s" KMP_END_OF_LINE, buf.str);
8849   __kmp_str_buf_free(&buf);
8850 }
8851 
8852 /* ------------------------------------------------------------------------ */
8853 void __kmp_aux_set_blocktime(int arg, kmp_info_t *thread, int tid) {
8854   int blocktime = arg; /* argument is in microseconds */
8855 #if KMP_USE_MONITOR
8856   int bt_intervals;
8857 #endif
8858   kmp_int8 bt_set;
8859 
8860   __kmp_save_internal_controls(thread);
8861 
8862   /* Normalize and set blocktime for the teams */
8863   if (blocktime < KMP_MIN_BLOCKTIME)
8864     blocktime = KMP_MIN_BLOCKTIME;
8865   else if (blocktime > KMP_MAX_BLOCKTIME)
8866     blocktime = KMP_MAX_BLOCKTIME;
8867 
8868   set__blocktime_team(thread->th.th_team, tid, blocktime);
8869   set__blocktime_team(thread->th.th_serial_team, 0, blocktime);
8870 
8871 #if KMP_USE_MONITOR
8872   /* Calculate and set blocktime intervals for the teams */
8873   bt_intervals = KMP_INTERVALS_FROM_BLOCKTIME(blocktime, __kmp_monitor_wakeups);
8874 
8875   set__bt_intervals_team(thread->th.th_team, tid, bt_intervals);
8876   set__bt_intervals_team(thread->th.th_serial_team, 0, bt_intervals);
8877 #endif
8878 
8879   /* Set whether blocktime has been set to "TRUE" */
8880   bt_set = TRUE;
8881 
8882   set__bt_set_team(thread->th.th_team, tid, bt_set);
8883   set__bt_set_team(thread->th.th_serial_team, 0, bt_set);
8884 #if KMP_USE_MONITOR
8885   KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d, "
8886                 "bt_intervals=%d, monitor_updates=%d\n",
8887                 __kmp_gtid_from_tid(tid, thread->th.th_team),
8888                 thread->th.th_team->t.t_id, tid, blocktime, bt_intervals,
8889                 __kmp_monitor_wakeups));
8890 #else
8891   KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d\n",
8892                 __kmp_gtid_from_tid(tid, thread->th.th_team),
8893                 thread->th.th_team->t.t_id, tid, blocktime));
8894 #endif
8895 }
8896 
8897 void __kmp_aux_set_defaults(char const *str, size_t len) {
8898   if (!__kmp_init_serial) {
8899     __kmp_serial_initialize();
8900   }
8901   __kmp_env_initialize(str);
8902 
8903   if (__kmp_settings || __kmp_display_env || __kmp_display_env_verbose) {
8904     __kmp_env_print();
8905   }
8906 } // __kmp_aux_set_defaults
8907 
8908 /* ------------------------------------------------------------------------ */
8909 /* internal fast reduction routines */
8910 
8911 PACKED_REDUCTION_METHOD_T
8912 __kmp_determine_reduction_method(
8913     ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars, size_t reduce_size,
8914     void *reduce_data, void (*reduce_func)(void *lhs_data, void *rhs_data),
8915     kmp_critical_name *lck) {
8916 
8917   // Default reduction method: critical construct ( lck != NULL, like in current
8918   // PAROPT )
8919   // If ( reduce_data!=NULL && reduce_func!=NULL ): the tree-reduction method
8920   // can be selected by RTL
8921   // If loc->flags contains KMP_IDENT_ATOMIC_REDUCE, the atomic reduce method
8922   // can be selected by RTL
8923   // Finally, it's up to OpenMP RTL to make a decision on which method to select
8924   // among generated by PAROPT.
8925 
8926   PACKED_REDUCTION_METHOD_T retval;
8927 
8928   int team_size;
8929 
8930   KMP_DEBUG_ASSERT(lck); // it would be nice to test ( lck != 0 )
8931 
8932 #define FAST_REDUCTION_ATOMIC_METHOD_GENERATED                                 \
8933   (loc &&                                                                      \
8934    ((loc->flags & (KMP_IDENT_ATOMIC_REDUCE)) == (KMP_IDENT_ATOMIC_REDUCE)))
8935 #define FAST_REDUCTION_TREE_METHOD_GENERATED ((reduce_data) && (reduce_func))
8936 
8937   retval = critical_reduce_block;
8938 
8939   // another choice of getting a team size (with 1 dynamic deference) is slower
8940   team_size = __kmp_get_team_num_threads(global_tid);
8941   if (team_size == 1) {
8942 
8943     retval = empty_reduce_block;
8944 
8945   } else {
8946 
8947     int atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;
8948 
8949 #if KMP_ARCH_X86_64 || KMP_ARCH_PPC64 || KMP_ARCH_AARCH64 ||                   \
8950     KMP_ARCH_MIPS64 || KMP_ARCH_RISCV64 || KMP_ARCH_LOONGARCH64 ||             \
8951     KMP_ARCH_VE || KMP_ARCH_S390X || KMP_ARCH_WASM
8952 
8953 #if KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD ||     \
8954     KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN || KMP_OS_HURD ||        \
8955     KMP_OS_SOLARIS || KMP_OS_WASI || KMP_OS_AIX
8956 
8957     int teamsize_cutoff = 4;
8958 
8959 #if KMP_MIC_SUPPORTED
8960     if (__kmp_mic_type != non_mic) {
8961       teamsize_cutoff = 8;
8962     }
8963 #endif
8964     int tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
8965     if (tree_available) {
8966       if (team_size <= teamsize_cutoff) {
8967         if (atomic_available) {
8968           retval = atomic_reduce_block;
8969         }
8970       } else {
8971         retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER;
8972       }
8973     } else if (atomic_available) {
8974       retval = atomic_reduce_block;
8975     }
8976 #else
8977 #error "Unknown or unsupported OS"
8978 #endif // KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD ||
8979        // KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN || KMP_OS_HURD ||
8980        // KMP_OS_SOLARIS || KMP_OS_WASI || KMP_OS_AIX
8981 
8982 #elif KMP_ARCH_X86 || KMP_ARCH_ARM || KMP_ARCH_AARCH || KMP_ARCH_MIPS ||       \
8983     KMP_ARCH_WASM || KMP_ARCH_PPC || KMP_ARCH_AARCH64_32
8984 
8985 #if KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD ||     \
8986     KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_HURD || KMP_OS_SOLARIS ||       \
8987     KMP_OS_WASI || KMP_OS_AIX
8988 
8989     // basic tuning
8990 
8991     if (atomic_available) {
8992       if (num_vars <= 2) { // && ( team_size <= 8 ) due to false-sharing ???
8993         retval = atomic_reduce_block;
8994       }
8995     } // otherwise: use critical section
8996 
8997 #elif KMP_OS_DARWIN
8998 
8999     int tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
9000     if (atomic_available && (num_vars <= 3)) {
9001       retval = atomic_reduce_block;
9002     } else if (tree_available) {
9003       if ((reduce_size > (9 * sizeof(kmp_real64))) &&
9004           (reduce_size < (2000 * sizeof(kmp_real64)))) {
9005         retval = TREE_REDUCE_BLOCK_WITH_PLAIN_BARRIER;
9006       }
9007     } // otherwise: use critical section
9008 
9009 #else
9010 #error "Unknown or unsupported OS"
9011 #endif
9012 
9013 #else
9014 #error "Unknown or unsupported architecture"
9015 #endif
9016   }
9017 
9018   // KMP_FORCE_REDUCTION
9019 
9020   // If the team is serialized (team_size == 1), ignore the forced reduction
9021   // method and stay with the unsynchronized method (empty_reduce_block)
9022   if (__kmp_force_reduction_method != reduction_method_not_defined &&
9023       team_size != 1) {
9024 
9025     PACKED_REDUCTION_METHOD_T forced_retval = critical_reduce_block;
9026 
9027     int atomic_available, tree_available;
9028 
9029     switch ((forced_retval = __kmp_force_reduction_method)) {
9030     case critical_reduce_block:
9031       KMP_ASSERT(lck); // lck should be != 0
9032       break;
9033 
9034     case atomic_reduce_block:
9035       atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;
9036       if (!atomic_available) {
9037         KMP_WARNING(RedMethodNotSupported, "atomic");
9038         forced_retval = critical_reduce_block;
9039       }
9040       break;
9041 
9042     case tree_reduce_block:
9043       tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
9044       if (!tree_available) {
9045         KMP_WARNING(RedMethodNotSupported, "tree");
9046         forced_retval = critical_reduce_block;
9047       } else {
9048 #if KMP_FAST_REDUCTION_BARRIER
9049         forced_retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER;
9050 #endif
9051       }
9052       break;
9053 
9054     default:
9055       KMP_ASSERT(0); // "unsupported method specified"
9056     }
9057 
9058     retval = forced_retval;
9059   }
9060 
9061   KA_TRACE(10, ("reduction method selected=%08x\n", retval));
9062 
9063 #undef FAST_REDUCTION_TREE_METHOD_GENERATED
9064 #undef FAST_REDUCTION_ATOMIC_METHOD_GENERATED
9065 
9066   return (retval);
9067 }
9068 // this function is for testing set/get/determine reduce method
9069 kmp_int32 __kmp_get_reduce_method(void) {
9070   return ((__kmp_entry_thread()->th.th_local.packed_reduction_method) >> 8);
9071 }
9072 
9073 // Soft pause sets up threads to ignore blocktime and just go to sleep.
9074 // Spin-wait code checks __kmp_pause_status and reacts accordingly.
9075 void __kmp_soft_pause() { __kmp_pause_status = kmp_soft_paused; }
9076 
9077 // Hard pause shuts down the runtime completely.  Resume happens naturally when
9078 // OpenMP is used subsequently.
9079 void __kmp_hard_pause() {
9080   __kmp_pause_status = kmp_hard_paused;
9081   __kmp_internal_end_thread(-1);
9082 }
9083 
9084 // Soft resume sets __kmp_pause_status, and wakes up all threads.
9085 void __kmp_resume_if_soft_paused() {
9086   if (__kmp_pause_status == kmp_soft_paused) {
9087     __kmp_pause_status = kmp_not_paused;
9088 
9089     for (int gtid = 1; gtid < __kmp_threads_capacity; ++gtid) {
9090       kmp_info_t *thread = __kmp_threads[gtid];
9091       if (thread) { // Wake it if sleeping
9092         kmp_flag_64<> fl(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go,
9093                          thread);
9094         if (fl.is_sleeping())
9095           fl.resume(gtid);
9096         else if (__kmp_try_suspend_mx(thread)) { // got suspend lock
9097           __kmp_unlock_suspend_mx(thread); // unlock it; it won't sleep
9098         } else { // thread holds the lock and may sleep soon
9099           do { // until either the thread sleeps, or we can get the lock
9100             if (fl.is_sleeping()) {
9101               fl.resume(gtid);
9102               break;
9103             } else if (__kmp_try_suspend_mx(thread)) {
9104               __kmp_unlock_suspend_mx(thread);
9105               break;
9106             }
9107           } while (1);
9108         }
9109       }
9110     }
9111   }
9112 }
9113 
9114 // This function is called via __kmpc_pause_resource. Returns 0 if successful.
9115 // TODO: add warning messages
9116 int __kmp_pause_resource(kmp_pause_status_t level) {
9117   if (level == kmp_not_paused) { // requesting resume
9118     if (__kmp_pause_status == kmp_not_paused) {
9119       // error message about runtime not being paused, so can't resume
9120       return 1;
9121     } else {
9122       KMP_DEBUG_ASSERT(__kmp_pause_status == kmp_soft_paused ||
9123                        __kmp_pause_status == kmp_hard_paused);
9124       __kmp_pause_status = kmp_not_paused;
9125       return 0;
9126     }
9127   } else if (level == kmp_soft_paused) { // requesting soft pause
9128     if (__kmp_pause_status != kmp_not_paused) {
9129       // error message about already being paused
9130       return 1;
9131     } else {
9132       __kmp_soft_pause();
9133       return 0;
9134     }
9135   } else if (level == kmp_hard_paused) { // requesting hard pause
9136     if (__kmp_pause_status != kmp_not_paused) {
9137       // error message about already being paused
9138       return 1;
9139     } else {
9140       __kmp_hard_pause();
9141       return 0;
9142     }
9143   } else {
9144     // error message about invalid level
9145     return 1;
9146   }
9147 }
9148 
9149 void __kmp_omp_display_env(int verbose) {
9150   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
9151   if (__kmp_init_serial == 0)
9152     __kmp_do_serial_initialize();
9153   __kmp_display_env_impl(!verbose, verbose);
9154   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
9155 }
9156 
// The team size is changing, so distributed barrier must be modified.
// Moves every worker (tids 1 .. old_nthreads-1) out of the team, waits until
// all of them have acknowledged it, then resizes the barrier to new_nthreads.
// Values of th_used_in_team as used in this function:
//   0 - not used in the team, 1 - in use,
//   2 - asked to leave (set here, the worker moves it to 0),
//   3 - still transitioning to in use.
void __kmp_resize_dist_barrier(kmp_team_t *team, int old_nthreads,
                               int new_nthreads) {
  KMP_DEBUG_ASSERT(__kmp_barrier_release_pattern[bs_forkjoin_barrier] ==
                   bp_dist_bar);
  kmp_info_t **other_threads = team->t.t_threads;

  // We want all the workers to stop waiting on the barrier while we adjust the
  // size of the team.
  for (int f = 1; f < old_nthreads; ++f) {
    KMP_DEBUG_ASSERT(other_threads[f] != NULL);
    // Ignore threads that are already inactive or not present in the team
    if (team->t.t_threads[f]->th.th_used_in_team.load() == 0) {
      // teams construct causes thread_limit to get passed in, and some of
      // those could be inactive; just ignore them
      continue;
    }
    // If thread is transitioning still to in_use state, wait for it
    if (team->t.t_threads[f]->th.th_used_in_team.load() == 3) {
      // Busy-wait; the worker itself is expected to leave state 3
      while (team->t.t_threads[f]->th.th_used_in_team.load() == 3)
        KMP_CPU_PAUSE();
    }
    // The thread should be in_use now
    KMP_DEBUG_ASSERT(team->t.t_threads[f]->th.th_used_in_team.load() == 1);
    // Transition to unused state
    team->t.t_threads[f]->th.th_used_in_team.store(2);
    KMP_DEBUG_ASSERT(team->t.t_threads[f]->th.th_used_in_team.load() == 2);
  }
  // Release all the workers
  team->t.b->go_release();

  KMP_MFENCE();

  // Workers should see transition status 2 and move to 0; but may need to be
  // woken up first
  int count = old_nthreads - 1;
  // Each pass recounts from scratch: the loop ends only after a full pass in
  // which every worker was observed in state 0.
  while (count > 0) {
    count = old_nthreads - 1;
    for (int f = 1; f < old_nthreads; ++f) {
      if (other_threads[f]->th.th_used_in_team.load() != 0) {
        // Not yet out of the team; with a finite blocktime it may be
        // sleeping, so send it a resume
        if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) { // Wake up the workers
          kmp_atomic_flag_64<> *flag = (kmp_atomic_flag_64<> *)CCAST(
              void *, other_threads[f]->th.th_sleep_loc);
          __kmp_atomic_resume_64(other_threads[f]->th.th_info.ds.ds_gtid, flag);
        }
      } else {
        KMP_DEBUG_ASSERT(team->t.t_threads[f]->th.th_used_in_team.load() == 0);
        count--;
      }
    }
  }
  // Now update the barrier size
  team->t.b->update_num_threads(new_nthreads);
  team->t.b->go_reset();
}
9212 
9213 void __kmp_add_threads_to_team(kmp_team_t *team, int new_nthreads) {
9214   // Add the threads back to the team
9215   KMP_DEBUG_ASSERT(team);
9216   // Threads were paused and pointed at th_used_in_team temporarily during a
9217   // resize of the team. We're going to set th_used_in_team to 3 to indicate to
9218   // the thread that it should transition itself back into the team. Then, if
9219   // blocktime isn't infinite, the thread could be sleeping, so we send a resume
9220   // to wake it up.
9221   for (int f = 1; f < new_nthreads; ++f) {
9222     KMP_DEBUG_ASSERT(team->t.t_threads[f]);
9223     KMP_COMPARE_AND_STORE_ACQ32(&(team->t.t_threads[f]->th.th_used_in_team), 0,
9224                                 3);
9225     if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) { // Wake up sleeping threads
9226       __kmp_resume_32(team->t.t_threads[f]->th.th_info.ds.ds_gtid,
9227                       (kmp_flag_32<false, false> *)NULL);
9228     }
9229   }
9230   // The threads should be transitioning to the team; when they are done, they
9231   // should have set th_used_in_team to 1. This loop forces master to wait until
9232   // all threads have moved into the team and are waiting in the barrier.
9233   int count = new_nthreads - 1;
9234   while (count > 0) {
9235     count = new_nthreads - 1;
9236     for (int f = 1; f < new_nthreads; ++f) {
9237       if (team->t.t_threads[f]->th.th_used_in_team.load() == 1) {
9238         count--;
9239       }
9240     }
9241   }
9242 }
9243 
// Globals and functions for hidden helper task

// Address of the hidden helper main thread's slot in __kmp_threads (assigned
// by __kmp_hidden_helper_threads_initz_routine)
kmp_info_t **__kmp_hidden_helper_threads;
// Root thread registered for the hidden helper team (assigned by
// __kmp_hidden_helper_threads_initz_routine)
kmp_info_t *__kmp_hidden_helper_main_thread;
// NOTE(review): presumably the number of hidden helper tasks not yet
// executed; it is not referenced in this part of the file -- confirm
std::atomic<kmp_int32> __kmp_unexecuted_hidden_helper_tasks;
// Defaults: on Linux the feature is enabled with 8 hidden helper threads; on
// every other OS it is disabled with 0 threads.
#if KMP_OS_LINUX
kmp_int32 __kmp_hidden_helper_threads_num = 8;
kmp_int32 __kmp_enable_hidden_helper = TRUE;
#else
kmp_int32 __kmp_hidden_helper_threads_num = 0;
kmp_int32 __kmp_enable_hidden_helper = FALSE;
#endif
9255 
9256 namespace {
9257 std::atomic<kmp_int32> __kmp_hit_hidden_helper_threads_num;
9258 
9259 void __kmp_hidden_helper_wrapper_fn(int *gtid, int *, ...) {
9260   // This is an explicit synchronization on all hidden helper threads in case
9261   // that when a regular thread pushes a hidden helper task to one hidden
9262   // helper thread, the thread has not been awaken once since they're released
9263   // by the main thread after creating the team.
9264   KMP_ATOMIC_INC(&__kmp_hit_hidden_helper_threads_num);
9265   while (KMP_ATOMIC_LD_ACQ(&__kmp_hit_hidden_helper_threads_num) !=
9266          __kmp_hidden_helper_threads_num)
9267     ;
9268 
9269   // If main thread, then wait for signal
9270   if (__kmpc_master(nullptr, *gtid)) {
9271     // First, unset the initial state and release the initial thread
9272     TCW_4(__kmp_init_hidden_helper_threads, FALSE);
9273     __kmp_hidden_helper_initz_release();
9274     __kmp_hidden_helper_main_thread_wait();
9275     // Now wake up all worker threads
9276     for (int i = 1; i < __kmp_hit_hidden_helper_threads_num; ++i) {
9277       __kmp_hidden_helper_worker_thread_signal();
9278     }
9279   }
9280 }
9281 } // namespace
9282 
9283 void __kmp_hidden_helper_threads_initz_routine() {
9284   // Create a new root for hidden helper team/threads
9285   const int gtid = __kmp_register_root(TRUE);
9286   __kmp_hidden_helper_main_thread = __kmp_threads[gtid];
9287   __kmp_hidden_helper_threads = &__kmp_threads[gtid];
9288   __kmp_hidden_helper_main_thread->th.th_set_nproc =
9289       __kmp_hidden_helper_threads_num;
9290 
9291   KMP_ATOMIC_ST_REL(&__kmp_hit_hidden_helper_threads_num, 0);
9292 
9293   __kmpc_fork_call(nullptr, 0, __kmp_hidden_helper_wrapper_fn);
9294 
9295   // Set the initialization flag to FALSE
9296   TCW_SYNC_4(__kmp_init_hidden_helper, FALSE);
9297 
9298   __kmp_hidden_helper_threads_deinitz_release();
9299 }
9300 
9301 /* Nesting Mode:
9302    Set via KMP_NESTING_MODE, which takes an integer.
9303    Note: we skip duplicate topology levels, and skip levels with only
9304       one entity.
9305    KMP_NESTING_MODE=0 is the default, and doesn't use nesting mode.
9306    KMP_NESTING_MODE=1 sets as many nesting levels as there are distinct levels
9307       in the topology, and initializes the number of threads at each of those
9308       levels to the number of entities at each level, respectively, below the
9309       entity at the parent level.
9310    KMP_NESTING_MODE=N, where N>1, attempts to create up to N nesting levels,
9311       but starts with nesting OFF -- max-active-levels-var is 1 -- and requires
9312       the user to turn nesting on explicitly. This is an even more experimental
9313       option to this experimental feature, and may change or go away in the
9314       future.
9315 */
9316 
9317 // Allocate space to store nesting levels
9318 void __kmp_init_nesting_mode() {
9319   int levels = KMP_HW_LAST;
9320   __kmp_nesting_mode_nlevels = levels;
9321   __kmp_nesting_nth_level = (int *)KMP_INTERNAL_MALLOC(levels * sizeof(int));
9322   for (int i = 0; i < levels; ++i)
9323     __kmp_nesting_nth_level[i] = 0;
9324   if (__kmp_nested_nth.size < levels) {
9325     __kmp_nested_nth.nth =
9326         (int *)KMP_INTERNAL_REALLOC(__kmp_nested_nth.nth, levels * sizeof(int));
9327     __kmp_nested_nth.size = levels;
9328   }
9329 }
9330 
9331 // Set # threads for top levels of nesting; must be called after topology set
9332 void __kmp_set_nesting_mode_threads() {
9333   kmp_info_t *thread = __kmp_threads[__kmp_entry_gtid()];
9334 
9335   if (__kmp_nesting_mode == 1)
9336     __kmp_nesting_mode_nlevels = KMP_MAX_ACTIVE_LEVELS_LIMIT;
9337   else if (__kmp_nesting_mode > 1)
9338     __kmp_nesting_mode_nlevels = __kmp_nesting_mode;
9339 
9340   if (__kmp_topology) { // use topology info
9341     int loc, hw_level;
9342     for (loc = 0, hw_level = 0; hw_level < __kmp_topology->get_depth() &&
9343                                 loc < __kmp_nesting_mode_nlevels;
9344          loc++, hw_level++) {
9345       __kmp_nesting_nth_level[loc] = __kmp_topology->get_ratio(hw_level);
9346       if (__kmp_nesting_nth_level[loc] == 1)
9347         loc--;
9348     }
9349     // Make sure all cores are used
9350     if (__kmp_nesting_mode > 1 && loc > 1) {
9351       int core_level = __kmp_topology->get_level(KMP_HW_CORE);
9352       int num_cores = __kmp_topology->get_count(core_level);
9353       int upper_levels = 1;
9354       for (int level = 0; level < loc - 1; ++level)
9355         upper_levels *= __kmp_nesting_nth_level[level];
9356       if (upper_levels * __kmp_nesting_nth_level[loc - 1] < num_cores)
9357         __kmp_nesting_nth_level[loc - 1] =
9358             num_cores / __kmp_nesting_nth_level[loc - 2];
9359     }
9360     __kmp_nesting_mode_nlevels = loc;
9361     __kmp_nested_nth.used = __kmp_nesting_mode_nlevels;
9362   } else { // no topology info available; provide a reasonable guesstimation
9363     if (__kmp_avail_proc >= 4) {
9364       __kmp_nesting_nth_level[0] = __kmp_avail_proc / 2;
9365       __kmp_nesting_nth_level[1] = 2;
9366       __kmp_nesting_mode_nlevels = 2;
9367     } else {
9368       __kmp_nesting_nth_level[0] = __kmp_avail_proc;
9369       __kmp_nesting_mode_nlevels = 1;
9370     }
9371     __kmp_nested_nth.used = __kmp_nesting_mode_nlevels;
9372   }
9373   for (int i = 0; i < __kmp_nesting_mode_nlevels; ++i) {
9374     __kmp_nested_nth.nth[i] = __kmp_nesting_nth_level[i];
9375   }
9376   set__nproc(thread, __kmp_nesting_nth_level[0]);
9377   if (__kmp_nesting_mode > 1 && __kmp_nesting_mode_nlevels > __kmp_nesting_mode)
9378     __kmp_nesting_mode_nlevels = __kmp_nesting_mode;
9379   if (get__max_active_levels(thread) > 1) {
9380     // if max levels was set, set nesting mode levels to same
9381     __kmp_nesting_mode_nlevels = get__max_active_levels(thread);
9382   }
9383   if (__kmp_nesting_mode == 1) // turn on nesting for this case only
9384     set__max_active_levels(thread, __kmp_nesting_mode_nlevels);
9385 }
9386 
// Empty symbols to export (see exports_so.txt) when feature is disabled.
// Each group below is compiled only when the corresponding feature is off,
// providing a do-nothing function or a FALSE-valued variable under the
// exported name.
extern "C" {
#if !KMP_STATS_ENABLED
// Statistics disabled: no-op
void __kmp_reset_stats() {}
#endif
#if !USE_DEBUGGER
// Debugger support disabled: both variables are defined as FALSE
int __kmp_omp_debug_struct_info = FALSE;
int __kmp_debugging = FALSE;
#endif
#if !USE_ITT_BUILD || !USE_ITT_NOTIFY
// ITT build or ITT notify disabled: no-ops
void __kmp_itt_fini_ittlib() {}
void __kmp_itt_init_ittlib() {}
#endif
}
9401 
9402 // end of file
9403