xref: /freebsd/contrib/llvm-project/openmp/runtime/src/kmp_runtime.cpp (revision 0fca6ea1d4eea4c934cfff25ac9ee8ad6fe95583)
1  /*
2   * kmp_runtime.cpp -- KPTS runtime support library
3   */
4  
5  //===----------------------------------------------------------------------===//
6  //
7  // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8  // See https://llvm.org/LICENSE.txt for license information.
9  // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10  //
11  //===----------------------------------------------------------------------===//
12  
13  #include "kmp.h"
14  #include "kmp_affinity.h"
15  #include "kmp_atomic.h"
16  #include "kmp_environment.h"
17  #include "kmp_error.h"
18  #include "kmp_i18n.h"
19  #include "kmp_io.h"
20  #include "kmp_itt.h"
21  #include "kmp_settings.h"
22  #include "kmp_stats.h"
23  #include "kmp_str.h"
24  #include "kmp_wait_release.h"
25  #include "kmp_wrapper_getpid.h"
26  #include "kmp_dispatch.h"
27  #include "kmp_utils.h"
28  #if KMP_USE_HIER_SCHED
29  #include "kmp_dispatch_hier.h"
30  #endif
31  
32  #if OMPT_SUPPORT
33  #include "ompt-specific.h"
34  #endif
35  #if OMPD_SUPPORT
36  #include "ompd-specific.h"
37  #endif
38  
39  #if OMP_PROFILING_SUPPORT
40  #include "llvm/Support/TimeProfiler.h"
41  static char *ProfileTraceFile = nullptr;
42  #endif
43  
44  /* these are temporary issues to be dealt with */
45  #define KMP_USE_PRCTL 0
46  
47  #if KMP_OS_WINDOWS
48  #include <process.h>
49  #endif
50  
51  #ifndef KMP_USE_SHM
52  // Windows and WASI do not need these include files as they don't use shared
53  // memory.
54  #else
55  #include <sys/mman.h>
56  #include <sys/stat.h>
57  #include <fcntl.h>
58  #define SHM_SIZE 1024
59  #endif
60  
61  #if defined(KMP_GOMP_COMPAT)
62  char const __kmp_version_alt_comp[] =
63      KMP_VERSION_PREFIX "alternative compiler support: yes";
64  #endif /* defined(KMP_GOMP_COMPAT) */
65  
66  char const __kmp_version_omp_api[] =
67      KMP_VERSION_PREFIX "API version: 5.0 (201611)";
68  
69  #ifdef KMP_DEBUG
70  char const __kmp_version_lock[] =
71      KMP_VERSION_PREFIX "lock type: run time selectable";
72  #endif /* KMP_DEBUG */
73  
74  #define KMP_MIN(x, y) ((x) < (y) ? (x) : (y))
75  
76  /* ------------------------------------------------------------------------ */
77  
78  #if KMP_USE_MONITOR
79  kmp_info_t __kmp_monitor;
80  #endif
81  
82  /* Forward declarations */
83  
84  void __kmp_cleanup(void);
85  
86  static void __kmp_initialize_info(kmp_info_t *, kmp_team_t *, int tid,
87                                    int gtid);
88  static void __kmp_initialize_team(kmp_team_t *team, int new_nproc,
89                                    kmp_internal_control_t *new_icvs,
90                                    ident_t *loc);
91  #if KMP_AFFINITY_SUPPORTED
92  static void __kmp_partition_places(kmp_team_t *team,
93                                     int update_master_only = 0);
94  #endif
95  static void __kmp_do_serial_initialize(void);
96  void __kmp_fork_barrier(int gtid, int tid);
97  void __kmp_join_barrier(int gtid);
98  void __kmp_setup_icv_copy(kmp_team_t *team, int new_nproc,
99                            kmp_internal_control_t *new_icvs, ident_t *loc);
100  
101  #ifdef USE_LOAD_BALANCE
102  static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc);
103  #endif
104  
105  static int __kmp_expand_threads(int nNeed);
106  #if KMP_OS_WINDOWS
107  static int __kmp_unregister_root_other_thread(int gtid);
108  #endif
109  static void __kmp_reap_thread(kmp_info_t *thread, int is_root);
110  kmp_info_t *__kmp_thread_pool_insert_pt = NULL;
111  
112  void __kmp_resize_dist_barrier(kmp_team_t *team, int old_nthreads,
113                                 int new_nthreads);
114  void __kmp_add_threads_to_team(kmp_team_t *team, int new_nthreads);
115  
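/* Build a nesting-level-indexed nthreads list for a thread from its
   th_set_nested_nth values: entries for levels 0..level are zeroed (meaning
   "use the default"), and the thread's list (skipping element 0) is copied in
   starting at level+1. The caller owns the returned allocation. */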
116  static kmp_nested_nthreads_t *__kmp_override_nested_nth(kmp_info_t *thr,
117                                                          int level) {
118    kmp_nested_nthreads_t *new_nested_nth =
119        (kmp_nested_nthreads_t *)KMP_INTERNAL_MALLOC(
120            sizeof(kmp_nested_nthreads_t));
121    int new_size = level + thr->th.th_set_nested_nth_sz;
122    new_nested_nth->nth = (int *)KMP_INTERNAL_MALLOC(new_size * sizeof(int));
123    for (int i = 0; i < level + 1; ++i)
124      new_nested_nth->nth[i] = 0;
125    for (int i = level + 1, j = 1; i < new_size; ++i, ++j)
126      new_nested_nth->nth[i] = thr->th.th_set_nested_nth[j];
127    new_nested_nth->size = new_nested_nth->used = new_size;
128    return new_nested_nth;
129  }
130  
131  /* Calculate the identifier of the current thread */
132  /* fast (and somewhat portable) way to get unique identifier of executing
133     thread. Returns KMP_GTID_DNE if we haven't been assigned a gtid. */
134  int __kmp_get_global_thread_id() {
135    int i;
136    kmp_info_t **other_threads;
137    size_t stack_data;
138    char *stack_addr;
139    size_t stack_size;
140    char *stack_base;
141  
142    KA_TRACE(
143        1000,
144        ("*** __kmp_get_global_thread_id: entering, nproc=%d  all_nproc=%d\n",
145         __kmp_nth, __kmp_all_nth));
146  
147    /* JPH - to handle the case where __kmpc_end(0) is called immediately prior to
148       a parallel region, made it return KMP_GTID_DNE to force serial_initialize
149       by caller. Had to handle KMP_GTID_DNE at all call-sites, or else guarantee
150       __kmp_init_gtid for this to work. */
151  
152    if (!TCR_4(__kmp_init_gtid))
153      return KMP_GTID_DNE;
154  
155  #ifdef KMP_TDATA_GTID
156    if (TCR_4(__kmp_gtid_mode) >= 3) {
157      KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using TDATA\n"));
158      return __kmp_gtid;
159    }
160  #endif
161    if (TCR_4(__kmp_gtid_mode) >= 2) {
162      KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using keyed TLS\n"));
163      return __kmp_gtid_get_specific();
164    }
165    KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using internal alg.\n"));
166  
167    stack_addr = (char *)&stack_data;
168    other_threads = __kmp_threads;
169  
170    /* ATT: The code below is a source of potential bugs due to unsynchronized
171       access to __kmp_threads array. For example:
172       1. Current thread loads other_threads[i] to thr and checks it, it is
173          non-NULL.
174       2. Current thread is suspended by OS.
175       3. Another thread unregisters and finishes (debug versions of free()
176          may fill memory with something like 0xEF).
177       4. Current thread is resumed.
178       5. Current thread reads junk from *thr.
179       TODO: Fix it.  --ln  */
180  
181    for (i = 0; i < __kmp_threads_capacity; i++) {
182  
183      kmp_info_t *thr = (kmp_info_t *)TCR_SYNC_PTR(other_threads[i]);
184      if (!thr)
185        continue;
186  
187      stack_size = (size_t)TCR_PTR(thr->th.th_info.ds.ds_stacksize);
188      stack_base = (char *)TCR_PTR(thr->th.th_info.ds.ds_stackbase);
189  
190      /* stack grows down -- search through all of the active threads */
191  
192      if (stack_addr <= stack_base) {
193        size_t stack_diff = stack_base - stack_addr;
194  
195        if (stack_diff <= stack_size) {
196          /* The only way we can be closer than the allocated */
197          /* stack size is if we are running on this thread. */
198          // __kmp_gtid_get_specific can return negative value because this
199          // function can be called by thread destructor. However, before the
200          // thread destructor is called, the value of the corresponding
201          // thread-specific data will be reset to NULL.
202          KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() < 0 ||
203                           __kmp_gtid_get_specific() == i);
204          return i;
205        }
206      }
207    }
208  
209    /* get specific to try and determine our gtid */
210    KA_TRACE(1000,
211             ("*** __kmp_get_global_thread_id: internal alg. failed to find "
212              "thread, using TLS\n"));
213    i = __kmp_gtid_get_specific();
214  
215    /*fprintf( stderr, "=== %d\n", i );  */ /* GROO */
216  
217    /* if we haven't been assigned a gtid, then return the (negative) code */
218    if (i < 0)
219      return i;
220  
221    // other_threads[i] can be nullptr at this point because the corresponding
222    // thread could have already been destructed. This can happen when this
223    // function is called from the library shutdown routine.
224    if (!TCR_SYNC_PTR(other_threads[i]))
225      return i;
226  
227    /* dynamically updated stack window for uber threads to avoid get_specific
228       call */
229    if (!TCR_4(other_threads[i]->th.th_info.ds.ds_stackgrow)) {
230      KMP_FATAL(StackOverflow, i);
231    }
232  
233    stack_base = (char *)other_threads[i]->th.th_info.ds.ds_stackbase;
234    if (stack_addr > stack_base) {
235      TCW_PTR(other_threads[i]->th.th_info.ds.ds_stackbase, stack_addr);
236      TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize,
237              other_threads[i]->th.th_info.ds.ds_stacksize + stack_addr -
238                  stack_base);
239    } else {
240      TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize,
241              stack_base - stack_addr);
242    }
243  
244    /* Reprint stack bounds for ubermaster since they have been refined */
245    if (__kmp_storage_map) {
246      char *stack_end = (char *)other_threads[i]->th.th_info.ds.ds_stackbase;
247      char *stack_beg = stack_end - other_threads[i]->th.th_info.ds.ds_stacksize;
248      __kmp_print_storage_map_gtid(i, stack_beg, stack_end,
249                                   other_threads[i]->th.th_info.ds.ds_stacksize,
250                                   "th_%d stack (refinement)", i);
251    }
252    return i;
253  }
254  
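/* Like __kmp_get_global_thread_id(), but if no gtid has been assigned yet,
   register the calling thread as a new root (running serial initialization
   first if needed) so that a valid gtid is always returned. */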
255  int __kmp_get_global_thread_id_reg() {
256    int gtid;
257  
258    if (!__kmp_init_serial) {
259      gtid = KMP_GTID_DNE;
260    } else
261  #ifdef KMP_TDATA_GTID
262        if (TCR_4(__kmp_gtid_mode) >= 3) {
263      KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using TDATA\n"));
264      gtid = __kmp_gtid;
265    } else
266  #endif
267        if (TCR_4(__kmp_gtid_mode) >= 2) {
268      KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using keyed TLS\n"));
269      gtid = __kmp_gtid_get_specific();
270    } else {
271      KA_TRACE(1000,
272               ("*** __kmp_get_global_thread_id_reg: using internal alg.\n"));
273      gtid = __kmp_get_global_thread_id();
274    }
275  
276    /* we must be a new uber master sibling thread */
277    if (gtid == KMP_GTID_DNE) {
278      KA_TRACE(10,
279               ("__kmp_get_global_thread_id_reg: Encountered new root thread. "
280                "Registering a new gtid.\n"));
281      __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
282      if (!__kmp_init_serial) {
283        __kmp_do_serial_initialize();
284        gtid = __kmp_gtid_get_specific();
285      } else {
286        gtid = __kmp_register_root(FALSE);
287      }
288      __kmp_release_bootstrap_lock(&__kmp_initz_lock);
289      /*__kmp_printf( "+++ %d\n", gtid ); */ /* GROO */
290    }
291  
292    KMP_DEBUG_ASSERT(gtid >= 0);
293  
294    return gtid;
295  }
296  
297  /* caller must hold forkjoin_lock */
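/* Compare this thread's stack extent against every other registered thread and
   issue a fatal StackOverlap error if the regions intersect. The full check is
   only performed when __kmp_env_checks is enabled and the thread is not an
   uber thread. */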
298  void __kmp_check_stack_overlap(kmp_info_t *th) {
299    int f;
300    char *stack_beg = NULL;
301    char *stack_end = NULL;
302    int gtid;
303  
304    KA_TRACE(10, ("__kmp_check_stack_overlap: called\n"));
305    if (__kmp_storage_map) {
306      stack_end = (char *)th->th.th_info.ds.ds_stackbase;
307      stack_beg = stack_end - th->th.th_info.ds.ds_stacksize;
308  
309      gtid = __kmp_gtid_from_thread(th);
310  
311      if (gtid == KMP_GTID_MONITOR) {
312        __kmp_print_storage_map_gtid(
313            gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize,
314            "th_%s stack (%s)", "mon",
315            (th->th.th_info.ds.ds_stackgrow) ? "initial" : "actual");
316      } else {
317        __kmp_print_storage_map_gtid(
318            gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize,
319            "th_%d stack (%s)", gtid,
320            (th->th.th_info.ds.ds_stackgrow) ? "initial" : "actual");
321      }
322    }
323  
324    /* No point in checking ubermaster threads since they use refinement and
325     * cannot overlap */
326    gtid = __kmp_gtid_from_thread(th);
327    if (__kmp_env_checks == TRUE && !KMP_UBER_GTID(gtid)) {
328      KA_TRACE(10,
329               ("__kmp_check_stack_overlap: performing extensive checking\n"));
330      if (stack_beg == NULL) {
331        stack_end = (char *)th->th.th_info.ds.ds_stackbase;
332        stack_beg = stack_end - th->th.th_info.ds.ds_stacksize;
333      }
334  
335      for (f = 0; f < __kmp_threads_capacity; f++) {
336        kmp_info_t *f_th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[f]);
337  
338        if (f_th && f_th != th) {
339          char *other_stack_end =
340              (char *)TCR_PTR(f_th->th.th_info.ds.ds_stackbase);
341          char *other_stack_beg =
342              other_stack_end - (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize);
343          if ((stack_beg > other_stack_beg && stack_beg < other_stack_end) ||
344              (stack_end > other_stack_beg && stack_end < other_stack_end)) {
345  
346            /* Print the other stack values before the abort */
347            if (__kmp_storage_map)
348              __kmp_print_storage_map_gtid(
349                  -1, other_stack_beg, other_stack_end,
350                  (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize),
351                  "th_%d stack (overlapped)", __kmp_gtid_from_thread(f_th));
352  
353            __kmp_fatal(KMP_MSG(StackOverlap), KMP_HNT(ChangeStackLimit),
354                        __kmp_msg_null);
355          }
356        }
357      }
358    }
359    KA_TRACE(10, ("__kmp_check_stack_overlap: returning\n"));
360  }
361  
362  /* ------------------------------------------------------------------------ */
363  
364  void __kmp_infinite_loop(void) {
365    static int done = FALSE;
366  
367    while (!done) {
368      KMP_YIELD(TRUE);
369    }
370  }
371  
372  #define MAX_MESSAGE 512
373  
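/* Format and print one "OMP storage map:" line describing the object in
   [p1, p2) to the runtime's error stream; when KMP_PRINT_DATA_PLACEMENT is
   enabled, also report the host memory node(s) backing those pages. */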
374  void __kmp_print_storage_map_gtid(int gtid, void *p1, void *p2, size_t size,
375                                    char const *format, ...) {
376    char buffer[MAX_MESSAGE];
377    va_list ap;
378  
379    va_start(ap, format);
380    KMP_SNPRINTF(buffer, sizeof(buffer), "OMP storage map: %p %p%8lu %s\n", p1,
381                 p2, (unsigned long)size, format);
382    __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock);
383    __kmp_vprintf(kmp_err, buffer, ap);
384  #if KMP_PRINT_DATA_PLACEMENT
385    int node;
386    if (gtid >= 0) {
387      if (p1 <= p2 && (char *)p2 - (char *)p1 == size) {
388        if (__kmp_storage_map_verbose) {
389          node = __kmp_get_host_node(p1);
390          if (node < 0) /* doesn't work, so don't try this next time */
391            __kmp_storage_map_verbose = FALSE;
392          else {
393            char *last;
394            int lastNode;
395            int localProc = __kmp_get_cpu_from_gtid(gtid);
396  
397            const int page_size = KMP_GET_PAGE_SIZE();
398  
399            p1 = (void *)((size_t)p1 & ~((size_t)page_size - 1));
400            p2 = (void *)(((size_t)p2 - 1) & ~((size_t)page_size - 1));
401            if (localProc >= 0)
402              __kmp_printf_no_lock("  GTID %d localNode %d\n", gtid,
403                                   localProc >> 1);
404            else
405              __kmp_printf_no_lock("  GTID %d\n", gtid);
406  #if KMP_USE_PRCTL
407            /* The more elaborate format is disabled for now because of the prctl
408             * hanging bug. */
409            do {
410              last = p1;
411              lastNode = node;
412              /* This loop collates adjacent pages with the same host node. */
413              do {
414                (char *)p1 += page_size;
415              } while (p1 <= p2 && (node = __kmp_get_host_node(p1)) == lastNode);
416              __kmp_printf_no_lock("    %p-%p memNode %d\n", last, (char *)p1 - 1,
417                                   lastNode);
418            } while (p1 <= p2);
419  #else
420            __kmp_printf_no_lock("    %p-%p memNode %d\n", p1,
421                                 (char *)p1 + (page_size - 1),
422                                 __kmp_get_host_node(p1));
423            if (p1 < p2) {
424              __kmp_printf_no_lock("    %p-%p memNode %d\n", p2,
425                                   (char *)p2 + (page_size - 1),
426                                   __kmp_get_host_node(p2));
427            }
428  #endif
429          }
430        }
431      } else
432        __kmp_printf_no_lock("  %s\n", KMP_I18N_STR(StorageMapWarning));
433    }
434  #endif /* KMP_PRINT_DATA_PLACEMENT */
435    __kmp_release_bootstrap_lock(&__kmp_stdio_lock);
436  
437    va_end(ap);
438  }
439  
440  void __kmp_warn(char const *format, ...) {
441    char buffer[MAX_MESSAGE];
442    va_list ap;
443  
444    if (__kmp_generate_warnings == kmp_warnings_off) {
445      return;
446    }
447  
448    va_start(ap, format);
449  
450    KMP_SNPRINTF(buffer, sizeof(buffer), "OMP warning: %s\n", format);
451    __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock);
452    __kmp_vprintf(kmp_err, buffer, ap);
453    __kmp_release_bootstrap_lock(&__kmp_stdio_lock);
454  
455    va_end(ap);
456  }
457  
458  void __kmp_abort_process() {
459    // Later threads may stall here, but that's ok because abort() will kill them.
460    __kmp_acquire_bootstrap_lock(&__kmp_exit_lock);
461  
462    if (__kmp_debug_buf) {
463      __kmp_dump_debug_buffer();
464    }
465  
466  #if KMP_OS_WINDOWS
467    // Let other threads know of abnormal termination and prevent deadlock
468    // if abort happened during library initialization or shutdown
469    __kmp_global.g.g_abort = SIGABRT;
470  
471    /* On Windows* OS by default abort() causes pop-up error box, which stalls
472       nightly testing. Unfortunately, we cannot reliably suppress pop-up error
473       boxes. _set_abort_behavior() works well, but this function is not
474       available in VS7 (this is not problem for DLL, but it is a problem for
475       static OpenMP RTL). SetErrorMode (and so, timelimit utility) does not
476       help, at least in some versions of MS C RTL.
477  
478       It seems following sequence is the only way to simulate abort() and
479       avoid pop-up error box. */
480    raise(SIGABRT);
481    _exit(3); // Just in case, if signal ignored, exit anyway.
482  #else
483    __kmp_unregister_library();
484    abort();
485  #endif
486  
487    __kmp_infinite_loop();
488    __kmp_release_bootstrap_lock(&__kmp_exit_lock);
489  
490  } // __kmp_abort_process
491  
492  void __kmp_abort_thread(void) {
493    // TODO: Eliminate g_abort global variable and this function.
494    // In case of abort just call abort(), it will kill all the threads.
495    __kmp_infinite_loop();
496  } // __kmp_abort_thread
497  
498  /* Print out the storage map for the major kmp_info_t thread data structures
499     that are allocated together. */
500  
501  static void __kmp_print_thread_storage_map(kmp_info_t *thr, int gtid) {
502    __kmp_print_storage_map_gtid(gtid, thr, thr + 1, sizeof(kmp_info_t), "th_%d",
503                                 gtid);
504  
505    __kmp_print_storage_map_gtid(gtid, &thr->th.th_info, &thr->th.th_team,
506                                 sizeof(kmp_desc_t), "th_%d.th_info", gtid);
507  
508    __kmp_print_storage_map_gtid(gtid, &thr->th.th_local, &thr->th.th_pri_head,
509                                 sizeof(kmp_local_t), "th_%d.th_local", gtid);
510  
511    __kmp_print_storage_map_gtid(
512        gtid, &thr->th.th_bar[0], &thr->th.th_bar[bs_last_barrier],
513        sizeof(kmp_balign_t) * bs_last_barrier, "th_%d.th_bar", gtid);
514  
515    __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_plain_barrier],
516                                 &thr->th.th_bar[bs_plain_barrier + 1],
517                                 sizeof(kmp_balign_t), "th_%d.th_bar[plain]",
518                                 gtid);
519  
520    __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_forkjoin_barrier],
521                                 &thr->th.th_bar[bs_forkjoin_barrier + 1],
522                                 sizeof(kmp_balign_t), "th_%d.th_bar[forkjoin]",
523                                 gtid);
524  
525  #if KMP_FAST_REDUCTION_BARRIER
526    __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_reduction_barrier],
527                                 &thr->th.th_bar[bs_reduction_barrier + 1],
528                                 sizeof(kmp_balign_t), "th_%d.th_bar[reduction]",
529                                 gtid);
530  #endif // KMP_FAST_REDUCTION_BARRIER
531  }
532  
533  /* Print out the storage map for the major kmp_team_t team data structures
534     that are allocated together. */
535  
536  static void __kmp_print_team_storage_map(const char *header, kmp_team_t *team,
537                                           int team_id, int num_thr) {
538    int num_disp_buff = team->t.t_max_nproc > 1 ? __kmp_dispatch_num_buffers : 2;
539    __kmp_print_storage_map_gtid(-1, team, team + 1, sizeof(kmp_team_t), "%s_%d",
540                                 header, team_id);
541  
542    __kmp_print_storage_map_gtid(-1, &team->t.t_bar[0],
543                                 &team->t.t_bar[bs_last_barrier],
544                                 sizeof(kmp_balign_team_t) * bs_last_barrier,
545                                 "%s_%d.t_bar", header, team_id);
546  
547    __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_plain_barrier],
548                                 &team->t.t_bar[bs_plain_barrier + 1],
549                                 sizeof(kmp_balign_team_t), "%s_%d.t_bar[plain]",
550                                 header, team_id);
551  
552    __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_forkjoin_barrier],
553                                 &team->t.t_bar[bs_forkjoin_barrier + 1],
554                                 sizeof(kmp_balign_team_t),
555                                 "%s_%d.t_bar[forkjoin]", header, team_id);
556  
557  #if KMP_FAST_REDUCTION_BARRIER
558    __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_reduction_barrier],
559                                 &team->t.t_bar[bs_reduction_barrier + 1],
560                                 sizeof(kmp_balign_team_t),
561                                 "%s_%d.t_bar[reduction]", header, team_id);
562  #endif // KMP_FAST_REDUCTION_BARRIER
563  
564    __kmp_print_storage_map_gtid(
565        -1, &team->t.t_dispatch[0], &team->t.t_dispatch[num_thr],
566        sizeof(kmp_disp_t) * num_thr, "%s_%d.t_dispatch", header, team_id);
567  
568    __kmp_print_storage_map_gtid(
569        -1, &team->t.t_threads[0], &team->t.t_threads[num_thr],
570        sizeof(kmp_info_t *) * num_thr, "%s_%d.t_threads", header, team_id);
571  
572    __kmp_print_storage_map_gtid(-1, &team->t.t_disp_buffer[0],
573                                 &team->t.t_disp_buffer[num_disp_buff],
574                                 sizeof(dispatch_shared_info_t) * num_disp_buff,
575                                 "%s_%d.t_disp_buffer", header, team_id);
576  }
577  
578  static void __kmp_init_allocator() {
579    __kmp_init_memkind();
580    __kmp_init_target_mem();
581  }
582  static void __kmp_fini_allocator() { __kmp_fini_memkind(); }
583  
584  /* ------------------------------------------------------------------------ */
585  
586  #if ENABLE_LIBOMPTARGET
587  static void __kmp_init_omptarget() {
588    __kmp_init_target_task();
589  }
590  #endif
591  
592  /* ------------------------------------------------------------------------ */
593  
594  #if KMP_DYNAMIC_LIB
595  #if KMP_OS_WINDOWS
596  
597  BOOL WINAPI DllMain(HINSTANCE hInstDLL, DWORD fdwReason, LPVOID lpReserved) {
598    //__kmp_acquire_bootstrap_lock( &__kmp_initz_lock );
599  
600    switch (fdwReason) {
601  
602    case DLL_PROCESS_ATTACH:
603      KA_TRACE(10, ("DllMain: PROCESS_ATTACH\n"));
604  
605      return TRUE;
606  
607    case DLL_PROCESS_DETACH:
608      KA_TRACE(10, ("DllMain: PROCESS_DETACH T#%d\n", __kmp_gtid_get_specific()));
609  
610      // According to Windows* documentation for DllMain entry point:
611      // for DLL_PROCESS_DETACH, lpReserved is used for telling the difference:
612      //   lpReserved == NULL when FreeLibrary() is called,
613      //   lpReserved != NULL when the process is terminated.
614      // When FreeLibrary() is called, worker threads remain alive. So the
615      // runtime's state is consistent and executing proper shutdown is OK.
616      // When the process is terminated, worker threads have exited or been
617      // forcefully terminated by the OS and only the shutdown thread remains.
618      // This can leave the runtime in an inconsistent state.
619      // Hence, only attempt proper cleanup when FreeLibrary() is called.
620      // Otherwise, rely on OS to reclaim resources.
621      if (lpReserved == NULL)
622        __kmp_internal_end_library(__kmp_gtid_get_specific());
623  
624      return TRUE;
625  
626    case DLL_THREAD_ATTACH:
627      KA_TRACE(10, ("DllMain: THREAD_ATTACH\n"));
628  
629      /* if we want to register new siblings all the time here call
630       * __kmp_get_gtid(); */
631      return TRUE;
632  
633    case DLL_THREAD_DETACH:
634      KA_TRACE(10, ("DllMain: THREAD_DETACH T#%d\n", __kmp_gtid_get_specific()));
635  
636      __kmp_internal_end_thread(__kmp_gtid_get_specific());
637      return TRUE;
638    }
639  
640    return TRUE;
641  }
642  
643  #endif /* KMP_OS_WINDOWS */
644  #endif /* KMP_DYNAMIC_LIB */
645  
646  /* __kmp_parallel_deo -- Wait until it's our turn. */
647  void __kmp_parallel_deo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
648    int gtid = *gtid_ref;
649  #ifdef BUILD_PARALLEL_ORDERED
650    kmp_team_t *team = __kmp_team_from_gtid(gtid);
651  #endif /* BUILD_PARALLEL_ORDERED */
652  
653    if (__kmp_env_consistency_check) {
654      if (__kmp_threads[gtid]->th.th_root->r.r_active)
655  #if KMP_USE_DYNAMIC_LOCK
656        __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL, 0);
657  #else
658        __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL);
659  #endif
660    }
661  #ifdef BUILD_PARALLEL_ORDERED
662    if (!team->t.t_serialized) {
663      KMP_MB();
664      KMP_WAIT(&team->t.t_ordered.dt.t_value, __kmp_tid_from_gtid(gtid), KMP_EQ,
665               NULL);
666      KMP_MB();
667    }
668  #endif /* BUILD_PARALLEL_ORDERED */
669  }
670  
671  /* __kmp_parallel_dxo -- Signal the next task. */
672  void __kmp_parallel_dxo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
673    int gtid = *gtid_ref;
674  #ifdef BUILD_PARALLEL_ORDERED
675    int tid = __kmp_tid_from_gtid(gtid);
676    kmp_team_t *team = __kmp_team_from_gtid(gtid);
677  #endif /* BUILD_PARALLEL_ORDERED */
678  
679    if (__kmp_env_consistency_check) {
680      if (__kmp_threads[gtid]->th.th_root->r.r_active)
681        __kmp_pop_sync(gtid, ct_ordered_in_parallel, loc_ref);
682    }
683  #ifdef BUILD_PARALLEL_ORDERED
684    if (!team->t.t_serialized) {
685      KMP_MB(); /* Flush all pending memory write invalidates.  */
686  
687      /* use the tid of the next thread in this team */
688      /* TODO replace with general release procedure */
689      team->t.t_ordered.dt.t_value = ((tid + 1) % team->t.t_nproc);
690  
691      KMP_MB(); /* Flush all pending memory write invalidates.  */
692    }
693  #endif /* BUILD_PARALLEL_ORDERED */
694  }
695  
696  /* ------------------------------------------------------------------------ */
697  /* The BARRIER for a SINGLE process section is always explicit   */
698  
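/* Returns nonzero if the calling thread should execute the SINGLE block (the
   team is serialized or this thread won the race on team->t.t_construct), and
   zero otherwise. */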
699  int __kmp_enter_single(int gtid, ident_t *id_ref, int push_ws) {
700    int status;
701    kmp_info_t *th;
702    kmp_team_t *team;
703  
704    if (!TCR_4(__kmp_init_parallel))
705      __kmp_parallel_initialize();
706    __kmp_resume_if_soft_paused();
707  
708    th = __kmp_threads[gtid];
709    team = th->th.th_team;
710    status = 0;
711  
712    th->th.th_ident = id_ref;
713  
714    if (team->t.t_serialized) {
715      status = 1;
716    } else {
717      kmp_int32 old_this = th->th.th_local.this_construct;
718  
719      ++th->th.th_local.this_construct;
720      /* try to set team count to thread count--success means thread got the
721         single block */
722      /* TODO: Should this be acquire or release? */
723      if (team->t.t_construct == old_this) {
724        status = __kmp_atomic_compare_store_acq(&team->t.t_construct, old_this,
725                                                th->th.th_local.this_construct);
726      }
727  #if USE_ITT_BUILD
728      if (__itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 &&
729          KMP_MASTER_GTID(gtid) && th->th.th_teams_microtask == NULL &&
730          team->t.t_active_level == 1) {
731        // Only report metadata by primary thread of active team at level 1
732        __kmp_itt_metadata_single(id_ref);
733      }
734  #endif /* USE_ITT_BUILD */
735    }
736  
737    if (__kmp_env_consistency_check) {
738      if (status && push_ws) {
739        __kmp_push_workshare(gtid, ct_psingle, id_ref);
740      } else {
741        __kmp_check_workshare(gtid, ct_psingle, id_ref);
742      }
743    }
744  #if USE_ITT_BUILD
745    if (status) {
746      __kmp_itt_single_start(gtid);
747    }
748  #endif /* USE_ITT_BUILD */
749    return status;
750  }
751  
752  void __kmp_exit_single(int gtid) {
753  #if USE_ITT_BUILD
754    __kmp_itt_single_end(gtid);
755  #endif /* USE_ITT_BUILD */
756    if (__kmp_env_consistency_check)
757      __kmp_pop_workshare(gtid, ct_psingle, NULL);
758  }
759  
760  /* determine if we can go parallel or must use a serialized parallel region and
761   * how many threads we can use
762   * set_nproc is the number of threads requested for the team
763   * returns 0 if we should serialize or only use one thread,
764   * otherwise the number of threads to use
765   * The forkjoin lock is held by the caller. */
766  static int __kmp_reserve_threads(kmp_root_t *root, kmp_team_t *parent_team,
767                                   int master_tid, int set_nthreads,
768                                   int enter_teams) {
769    int capacity;
770    int new_nthreads;
771    KMP_DEBUG_ASSERT(__kmp_init_serial);
772    KMP_DEBUG_ASSERT(root && parent_team);
773    kmp_info_t *this_thr = parent_team->t.t_threads[master_tid];
774  
775    // If dyn-var is set, dynamically adjust the number of desired threads,
776    // according to the method specified by dynamic_mode.
777    new_nthreads = set_nthreads;
778    if (!get__dynamic_2(parent_team, master_tid)) {
779      ;
780    }
781  #ifdef USE_LOAD_BALANCE
782    else if (__kmp_global.g.g_dynamic_mode == dynamic_load_balance) {
783      new_nthreads = __kmp_load_balance_nproc(root, set_nthreads);
784      if (new_nthreads == 1) {
785        KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced "
786                      "reservation to 1 thread\n",
787                      master_tid));
788        return 1;
789      }
790      if (new_nthreads < set_nthreads) {
791        KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced "
792                      "reservation to %d threads\n",
793                      master_tid, new_nthreads));
794      }
795    }
796  #endif /* USE_LOAD_BALANCE */
797    else if (__kmp_global.g.g_dynamic_mode == dynamic_thread_limit) {
798      new_nthreads = __kmp_avail_proc - __kmp_nth +
799                     (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
800      if (new_nthreads <= 1) {
801        KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced "
802                      "reservation to 1 thread\n",
803                      master_tid));
804        return 1;
805      }
806      if (new_nthreads < set_nthreads) {
807        KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced "
808                      "reservation to %d threads\n",
809                      master_tid, new_nthreads));
810      } else {
811        new_nthreads = set_nthreads;
812      }
813    } else if (__kmp_global.g.g_dynamic_mode == dynamic_random) {
814      if (set_nthreads > 2) {
815        new_nthreads = __kmp_get_random(parent_team->t.t_threads[master_tid]);
816        new_nthreads = (new_nthreads % set_nthreads) + 1;
817        if (new_nthreads == 1) {
818          KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced "
819                        "reservation to 1 thread\n",
820                        master_tid));
821          return 1;
822        }
823        if (new_nthreads < set_nthreads) {
824          KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced "
825                        "reservation to %d threads\n",
826                        master_tid, new_nthreads));
827        }
828      }
829    } else {
830      KMP_ASSERT(0);
831    }
832  
833    // Respect KMP_ALL_THREADS/KMP_DEVICE_THREAD_LIMIT.
834    if (__kmp_nth + new_nthreads -
835            (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
836        __kmp_max_nth) {
837      int tl_nthreads = __kmp_max_nth - __kmp_nth +
838                        (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
839      if (tl_nthreads <= 0) {
840        tl_nthreads = 1;
841      }
842  
843      // If dyn-var is false, emit a 1-time warning.
844      if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
845        __kmp_reserve_warn = 1;
846        __kmp_msg(kmp_ms_warning,
847                  KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads),
848                  KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
849      }
850      if (tl_nthreads == 1) {
851        KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT "
852                      "reduced reservation to 1 thread\n",
853                      master_tid));
854        return 1;
855      }
856      KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT reduced "
857                    "reservation to %d threads\n",
858                    master_tid, tl_nthreads));
859      new_nthreads = tl_nthreads;
860    }
861  
862    // Respect OMP_THREAD_LIMIT
863    int cg_nthreads = this_thr->th.th_cg_roots->cg_nthreads;
864    int max_cg_threads = this_thr->th.th_cg_roots->cg_thread_limit;
865    if (cg_nthreads + new_nthreads -
866            (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
867        max_cg_threads) {
868      int tl_nthreads = max_cg_threads - cg_nthreads +
869                        (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
870      if (tl_nthreads <= 0) {
871        tl_nthreads = 1;
872      }
873  
874      // If dyn-var is false, emit a 1-time warning.
875      if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
876        __kmp_reserve_warn = 1;
877        __kmp_msg(kmp_ms_warning,
878                  KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads),
879                  KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
880      }
881      if (tl_nthreads == 1) {
882        KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT "
883                      "reduced reservation to 1 thread\n",
884                      master_tid));
885        return 1;
886      }
887      KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT reduced "
888                    "reservation to %d threads\n",
889                    master_tid, tl_nthreads));
890      new_nthreads = tl_nthreads;
891    }
892  
893    // Check if the threads array is large enough, or needs expanding.
894    // See comment in __kmp_register_root() about the adjustment if
895    // __kmp_threads[0] == NULL.
896    capacity = __kmp_threads_capacity;
897    if (TCR_PTR(__kmp_threads[0]) == NULL) {
898      --capacity;
899    }
900    // If it is not for initializing the hidden helper team, we need to take
901    // __kmp_hidden_helper_threads_num out of the capacity because it is included
902    // in __kmp_threads_capacity.
903    if (__kmp_enable_hidden_helper && !TCR_4(__kmp_init_hidden_helper_threads)) {
904      capacity -= __kmp_hidden_helper_threads_num;
905    }
906    if (__kmp_nth + new_nthreads -
907            (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
908        capacity) {
909      // Expand the threads array.
910      int slotsRequired = __kmp_nth + new_nthreads -
911                          (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) -
912                          capacity;
913      int slotsAdded = __kmp_expand_threads(slotsRequired);
914      if (slotsAdded < slotsRequired) {
915        // The threads array was not expanded enough.
916        new_nthreads -= (slotsRequired - slotsAdded);
917        KMP_ASSERT(new_nthreads >= 1);
918  
919        // If dyn-var is false, emit a 1-time warning.
920        if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
921          __kmp_reserve_warn = 1;
922          if (__kmp_tp_cached) {
923            __kmp_msg(kmp_ms_warning,
924                      KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads),
925                      KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity),
926                      KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null);
927          } else {
928            __kmp_msg(kmp_ms_warning,
929                      KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads),
930                      KMP_HNT(SystemLimitOnThreads), __kmp_msg_null);
931          }
932        }
933      }
934    }
935  
936  #ifdef KMP_DEBUG
937    if (new_nthreads == 1) {
938      KC_TRACE(10,
939               ("__kmp_reserve_threads: T#%d serializing team after reclaiming "
940                "dead roots and rechecking; requested %d threads\n",
941                __kmp_get_gtid(), set_nthreads));
942    } else {
943      KC_TRACE(10, ("__kmp_reserve_threads: T#%d allocating %d threads; requested"
944                    " %d threads\n",
945                    __kmp_get_gtid(), new_nthreads, set_nthreads));
946    }
947  #endif // KMP_DEBUG
948  
949    if (this_thr->th.th_nt_strict && new_nthreads < set_nthreads) {
950      __kmpc_error(this_thr->th.th_nt_loc, this_thr->th.th_nt_sev,
951                   this_thr->th.th_nt_msg);
952    }
953    return new_nthreads;
954  }
955  
956  /* Allocate threads from the thread pool and assign them to the new team. We are
957     assured that there are enough threads available, because we checked on that
958     earlier within the forkjoin critical section */
959  static void __kmp_fork_team_threads(kmp_root_t *root, kmp_team_t *team,
960                                      kmp_info_t *master_th, int master_gtid,
961                                      int fork_teams_workers) {
962    int i;
963    int use_hot_team;
964  
965    KA_TRACE(10, ("__kmp_fork_team_threads: new_nprocs = %d\n", team->t.t_nproc));
966    KMP_DEBUG_ASSERT(master_gtid == __kmp_get_gtid());
967    KMP_MB();
968  
969    /* first, let's setup the primary thread */
970    master_th->th.th_info.ds.ds_tid = 0;
971    master_th->th.th_team = team;
972    master_th->th.th_team_nproc = team->t.t_nproc;
973    master_th->th.th_team_master = master_th;
974    master_th->th.th_team_serialized = FALSE;
975    master_th->th.th_dispatch = &team->t.t_dispatch[0];
976  
977  /* make sure we are not the optimized hot team */
978  #if KMP_NESTED_HOT_TEAMS
979    use_hot_team = 0;
980    kmp_hot_team_ptr_t *hot_teams = master_th->th.th_hot_teams;
981    if (hot_teams) { // hot teams array is not allocated if
982      // KMP_HOT_TEAMS_MAX_LEVEL=0
983      int level = team->t.t_active_level - 1; // index in array of hot teams
984      if (master_th->th.th_teams_microtask) { // are we inside the teams?
985        if (master_th->th.th_teams_size.nteams > 1) {
986          ++level; // level was not increased in teams construct for
987          // team_of_masters
988        }
989        if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
990            master_th->th.th_teams_level == team->t.t_level) {
991          ++level; // level was not increased in teams construct for
992          // team_of_workers before the parallel
993        } // team->t.t_level will be increased inside parallel
994      }
995      if (level < __kmp_hot_teams_max_level) {
996        if (hot_teams[level].hot_team) {
997          // hot team has already been allocated for given level
998          KMP_DEBUG_ASSERT(hot_teams[level].hot_team == team);
999          use_hot_team = 1; // the team is ready to use
1000        } else {
1001          use_hot_team = 0; // AC: threads are not allocated yet
1002          hot_teams[level].hot_team = team; // remember new hot team
1003          hot_teams[level].hot_team_nth = team->t.t_nproc;
1004        }
1005      } else {
1006        use_hot_team = 0;
1007      }
1008    }
1009  #else
1010    use_hot_team = team == root->r.r_hot_team;
1011  #endif
1012    if (!use_hot_team) {
1013  
1014      /* install the primary thread */
1015      team->t.t_threads[0] = master_th;
1016      __kmp_initialize_info(master_th, team, 0, master_gtid);
1017  
1018      /* now, install the worker threads */
1019      for (i = 1; i < team->t.t_nproc; i++) {
1020  
1021        /* fork or reallocate a new thread and install it in team */
1022        kmp_info_t *thr = __kmp_allocate_thread(root, team, i);
1023        team->t.t_threads[i] = thr;
1024        KMP_DEBUG_ASSERT(thr);
1025        KMP_DEBUG_ASSERT(thr->th.th_team == team);
1026        /* align team and thread arrived states */
1027        KA_TRACE(20, ("__kmp_fork_team_threads: T#%d(%d:%d) init arrived "
1028                      "T#%d(%d:%d) join =%llu, plain=%llu\n",
1029                      __kmp_gtid_from_tid(0, team), team->t.t_id, 0,
1030                      __kmp_gtid_from_tid(i, team), team->t.t_id, i,
1031                      team->t.t_bar[bs_forkjoin_barrier].b_arrived,
1032                      team->t.t_bar[bs_plain_barrier].b_arrived));
1033        thr->th.th_teams_microtask = master_th->th.th_teams_microtask;
1034        thr->th.th_teams_level = master_th->th.th_teams_level;
1035        thr->th.th_teams_size = master_th->th.th_teams_size;
1036        { // Initialize threads' barrier data.
1037          int b;
1038          kmp_balign_t *balign = team->t.t_threads[i]->th.th_bar;
1039          for (b = 0; b < bs_last_barrier; ++b) {
1040            balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
1041            KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
1042  #if USE_DEBUGGER
1043            balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
1044  #endif
1045          }
1046        }
1047      }
1048  
1049  #if KMP_AFFINITY_SUPPORTED
1050      // Do not partition the places list for teams construct workers who
1051      // haven't actually been forked to do real work yet. This partitioning
1052      // will take place in the parallel region nested within the teams construct.
1053      if (!fork_teams_workers) {
1054        __kmp_partition_places(team);
1055      }
1056  #endif
1057  
1058      if (team->t.t_nproc > 1 &&
1059          __kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
1060        team->t.b->update_num_threads(team->t.t_nproc);
1061        __kmp_add_threads_to_team(team, team->t.t_nproc);
1062      }
1063    }
1064  
1065    // Take care of primary thread's task state
1066    if (__kmp_tasking_mode != tskm_immediate_exec) {
1067      if (use_hot_team) {
1068        KMP_DEBUG_ASSERT_TASKTEAM_INVARIANT(team->t.t_parent, master_th);
1069        KA_TRACE(
1070            20,
1071            ("__kmp_fork_team_threads: Primary T#%d pushing task_team %p / team "
1072             "%p, new task_team %p / team %p\n",
1073             __kmp_gtid_from_thread(master_th), master_th->th.th_task_team,
1074             team->t.t_parent, team->t.t_task_team[master_th->th.th_task_state],
1075             team));
1076  
1077        // Store primary thread's current task state on new team
1078        KMP_CHECK_UPDATE(team->t.t_primary_task_state,
1079                         master_th->th.th_task_state);
1080  
1081        // Restore primary thread's task state to hot team's state
1082        // by using thread 1's task state
1083        if (team->t.t_nproc > 1) {
1084          KMP_DEBUG_ASSERT(team->t.t_threads[1]->th.th_task_state == 0 ||
1085                           team->t.t_threads[1]->th.th_task_state == 1);
1086          KMP_CHECK_UPDATE(master_th->th.th_task_state,
1087                           team->t.t_threads[1]->th.th_task_state);
1088        } else {
1089          master_th->th.th_task_state = 0;
1090        }
1091      } else {
1092        // Store primary thread's current task_state on new team
1093        KMP_CHECK_UPDATE(team->t.t_primary_task_state,
1094                         master_th->th.th_task_state);
1095        // Not using a hot team, so set the task state to 0.
1096        master_th->th.th_task_state = 0;
1097      }
1098    }
1099  
1100    if (__kmp_display_affinity && team->t.t_display_affinity != 1) {
1101      for (i = 0; i < team->t.t_nproc; i++) {
1102        kmp_info_t *thr = team->t.t_threads[i];
1103        if (thr->th.th_prev_num_threads != team->t.t_nproc ||
1104            thr->th.th_prev_level != team->t.t_level) {
1105          team->t.t_display_affinity = 1;
1106          break;
1107        }
1108      }
1109    }
1110  
1111    KMP_MB();
1112  }
1113  
1114  #if KMP_ARCH_X86 || KMP_ARCH_X86_64
1115  // Propagate any changes to the floating point control registers out to the team
1116  // We try to avoid unnecessary writes to the relevant cache line in the team
1117  // structure, so we don't make changes unless they are needed.
1118  inline static void propagateFPControl(kmp_team_t *team) {
1119    if (__kmp_inherit_fp_control) {
1120      kmp_int16 x87_fpu_control_word;
1121      kmp_uint32 mxcsr;
1122  
1123      // Get primary thread's values of FPU control flags (both X87 and vector)
1124      __kmp_store_x87_fpu_control_word(&x87_fpu_control_word);
1125      __kmp_store_mxcsr(&mxcsr);
1126      mxcsr &= KMP_X86_MXCSR_MASK;
1127  
1128      // There is no point looking at t_fp_control_saved here.
1129      // If it is TRUE, we still have to update the values if they are different
1130      // from those we now have. If it is FALSE we didn't save anything yet, but
1131      // our objective is the same. We have to ensure that the values in the team
1132      // are the same as those we have.
1133      // So, this code achieves what we need whether or not t_fp_control_saved is
1134      // true. By checking whether the value needs updating we avoid unnecessary
1135      // writes that would put the cache-line into a written state, causing all
1136      // threads in the team to have to read it again.
1137      KMP_CHECK_UPDATE(team->t.t_x87_fpu_control_word, x87_fpu_control_word);
1138      KMP_CHECK_UPDATE(team->t.t_mxcsr, mxcsr);
1139      // Although we don't use this value, other code in the runtime wants to know
1140      // whether it should restore them. So we must ensure it is correct.
1141      KMP_CHECK_UPDATE(team->t.t_fp_control_saved, TRUE);
1142    } else {
1143      // Similarly here. Don't write to this cache-line in the team structure
1144      // unless we have to.
1145      KMP_CHECK_UPDATE(team->t.t_fp_control_saved, FALSE);
1146    }
1147  }
1148  
1149  // Do the opposite, setting the hardware registers to the updated values from
1150  // the team.
1151  inline static void updateHWFPControl(kmp_team_t *team) {
1152    if (__kmp_inherit_fp_control && team->t.t_fp_control_saved) {
1153    // Only reset the fp control regs if they have been changed in the team,
1154    // i.e. in the parallel region that we are exiting.
1155      kmp_int16 x87_fpu_control_word;
1156      kmp_uint32 mxcsr;
1157      __kmp_store_x87_fpu_control_word(&x87_fpu_control_word);
1158      __kmp_store_mxcsr(&mxcsr);
1159      mxcsr &= KMP_X86_MXCSR_MASK;
1160  
1161      if (team->t.t_x87_fpu_control_word != x87_fpu_control_word) {
1162        __kmp_clear_x87_fpu_status_word();
1163        __kmp_load_x87_fpu_control_word(&team->t.t_x87_fpu_control_word);
1164      }
1165  
1166      if (team->t.t_mxcsr != mxcsr) {
1167        __kmp_load_mxcsr(&team->t.t_mxcsr);
1168      }
1169    }
1170  }
1171  #else
1172  #define propagateFPControl(x) ((void)0)
1173  #define updateHWFPControl(x) ((void)0)
1174  #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
1175  
1176  static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team,
1177                                       int realloc); // forward declaration
1178  
1179  /* Run a parallel region that has been serialized, so runs only in a team of the
1180     single primary thread. */
1181  void __kmp_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
1182    kmp_info_t *this_thr;
1183    kmp_team_t *serial_team;
1184  
1185    KC_TRACE(10, ("__kmpc_serialized_parallel: called by T#%d\n", global_tid));
1186  
1187    /* Skip all this code for autopar serialized loops since it results in
1188       unacceptable overhead */
1189    if (loc != NULL && (loc->flags & KMP_IDENT_AUTOPAR))
1190      return;
1191  
1192    if (!TCR_4(__kmp_init_parallel))
1193      __kmp_parallel_initialize();
1194    __kmp_resume_if_soft_paused();
1195  
1196    this_thr = __kmp_threads[global_tid];
1197    serial_team = this_thr->th.th_serial_team;
1198  
1199    /* utilize the serialized team held by this thread */
1200    KMP_DEBUG_ASSERT(serial_team);
1201    KMP_MB();
1202  
1203    kmp_proc_bind_t proc_bind = this_thr->th.th_set_proc_bind;
1204    if (this_thr->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
1205      proc_bind = proc_bind_false;
1206    } else if (proc_bind == proc_bind_default) {
1207      // No proc_bind clause was specified, so use the current value
1208      // of proc-bind-var for this parallel region.
1209      proc_bind = this_thr->th.th_current_task->td_icvs.proc_bind;
1210    }
1211    // Reset for next parallel region
1212    this_thr->th.th_set_proc_bind = proc_bind_default;
1213  
1214    // Reset num_threads for next parallel region
1215    this_thr->th.th_set_nproc = 0;
1216  
1217  #if OMPT_SUPPORT
1218    ompt_data_t ompt_parallel_data = ompt_data_none;
1219    void *codeptr = OMPT_LOAD_RETURN_ADDRESS(global_tid);
1220    if (ompt_enabled.enabled &&
1221        this_thr->th.ompt_thread_info.state != ompt_state_overhead) {
1222  
1223      ompt_task_info_t *parent_task_info;
1224      parent_task_info = OMPT_CUR_TASK_INFO(this_thr);
1225  
1226      parent_task_info->frame.enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1227      if (ompt_enabled.ompt_callback_parallel_begin) {
1228        int team_size = 1;
1229  
1230        ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)(
1231            &(parent_task_info->task_data), &(parent_task_info->frame),
1232            &ompt_parallel_data, team_size,
1233            ompt_parallel_invoker_program | ompt_parallel_team, codeptr);
1234      }
1235    }
1236  #endif // OMPT_SUPPORT
1237  
1238    if (this_thr->th.th_team != serial_team) {
1239      // Nested level will be an index in the nested nthreads array
1240      int level = this_thr->th.th_team->t.t_level;
1241  
1242      if (serial_team->t.t_serialized) {
1243        /* this serial team was already used
1244         TODO increase performance by making these locks more specific */
1245        kmp_team_t *new_team;
1246  
1247        __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
1248  
1249        new_team =
1250            __kmp_allocate_team(this_thr->th.th_root, 1, 1,
1251  #if OMPT_SUPPORT
1252                                ompt_parallel_data,
1253  #endif
1254                                proc_bind, &this_thr->th.th_current_task->td_icvs,
1255                                0 USE_NESTED_HOT_ARG(NULL));
1256        __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
1257        KMP_ASSERT(new_team);
1258  
1259        /* setup new serialized team and install it */
1260        new_team->t.t_threads[0] = this_thr;
1261        new_team->t.t_parent = this_thr->th.th_team;
1262        serial_team = new_team;
1263        this_thr->th.th_serial_team = serial_team;
1264  
1265        KF_TRACE(
1266            10,
1267            ("__kmpc_serialized_parallel: T#%d allocated new serial team %p\n",
1268             global_tid, serial_team));
1269  
1270        /* TODO the above breaks the requirement that if we run out of resources,
1271           then we can still guarantee that serialized teams are ok, since we may
1272           need to allocate a new one */
1273      } else {
1274        KF_TRACE(
1275            10,
1276            ("__kmpc_serialized_parallel: T#%d reusing cached serial team %p\n",
1277             global_tid, serial_team));
1278      }
1279  
1280      /* we have to initialize this serial team */
1281      KMP_DEBUG_ASSERT(serial_team->t.t_threads);
1282      KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
1283      KMP_DEBUG_ASSERT(this_thr->th.th_team != serial_team);
1284      serial_team->t.t_ident = loc;
1285      serial_team->t.t_serialized = 1;
1286      serial_team->t.t_nproc = 1;
1287      serial_team->t.t_parent = this_thr->th.th_team;
1288      if (this_thr->th.th_team->t.t_nested_nth)
1289        serial_team->t.t_nested_nth = this_thr->th.th_team->t.t_nested_nth;
1290      else
1291        serial_team->t.t_nested_nth = &__kmp_nested_nth;
1292      // Save previous team's task state on serial team structure
1293      serial_team->t.t_primary_task_state = this_thr->th.th_task_state;
1294      serial_team->t.t_sched.sched = this_thr->th.th_team->t.t_sched.sched;
1295      this_thr->th.th_team = serial_team;
1296      serial_team->t.t_master_tid = this_thr->th.th_info.ds.ds_tid;
1297  
1298      KF_TRACE(10, ("__kmpc_serialized_parallel: T#%d curtask=%p\n", global_tid,
1299                    this_thr->th.th_current_task));
1300      KMP_ASSERT(this_thr->th.th_current_task->td_flags.executing == 1);
1301      this_thr->th.th_current_task->td_flags.executing = 0;
1302  
1303      __kmp_push_current_task_to_thread(this_thr, serial_team, 0);
1304  
1305      /* TODO: GEH: do ICVs work for nested serialized teams? Don't we need an
1306         implicit task for each serialized task represented by
1307         team->t.t_serialized? */
1308      copy_icvs(&this_thr->th.th_current_task->td_icvs,
1309                &this_thr->th.th_current_task->td_parent->td_icvs);
1310  
1311      // Thread value exists in the nested nthreads array for the next nested
1312      // level
1313      kmp_nested_nthreads_t *nested_nth = &__kmp_nested_nth;
1314      if (this_thr->th.th_team->t.t_nested_nth)
1315        nested_nth = this_thr->th.th_team->t.t_nested_nth;
1316      if (nested_nth->used && (level + 1 < nested_nth->used)) {
1317        this_thr->th.th_current_task->td_icvs.nproc = nested_nth->nth[level + 1];
1318      }
1319  
1320      if (__kmp_nested_proc_bind.used &&
1321          (level + 1 < __kmp_nested_proc_bind.used)) {
1322        this_thr->th.th_current_task->td_icvs.proc_bind =
1323            __kmp_nested_proc_bind.bind_types[level + 1];
1324      }
1325  
1326  #if USE_DEBUGGER
1327      serial_team->t.t_pkfn = (microtask_t)(~0); // For the debugger.
1328  #endif
1329      this_thr->th.th_info.ds.ds_tid = 0;
1330  
1331      /* set thread cache values */
1332      this_thr->th.th_team_nproc = 1;
1333      this_thr->th.th_team_master = this_thr;
1334      this_thr->th.th_team_serialized = 1;
1335      this_thr->th.th_task_team = NULL;
1336      this_thr->th.th_task_state = 0;
1337  
1338      serial_team->t.t_level = serial_team->t.t_parent->t.t_level + 1;
1339      serial_team->t.t_active_level = serial_team->t.t_parent->t.t_active_level;
1340      serial_team->t.t_def_allocator = this_thr->th.th_def_allocator; // save
1341  
1342      propagateFPControl(serial_team);
1343  
1344      /* check if we need to allocate dispatch buffers stack */
1345      KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
1346      if (!serial_team->t.t_dispatch->th_disp_buffer) {
1347        serial_team->t.t_dispatch->th_disp_buffer =
1348            (dispatch_private_info_t *)__kmp_allocate(
1349                sizeof(dispatch_private_info_t));
1350      }
1351      this_thr->th.th_dispatch = serial_team->t.t_dispatch;
1352  
1353      KMP_MB();
1354  
1355    } else {
1356      /* this serialized team is already being used,
1357       * that's fine, just add another nested level */
1358      KMP_DEBUG_ASSERT(this_thr->th.th_team == serial_team);
1359      KMP_DEBUG_ASSERT(serial_team->t.t_threads);
1360      KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
1361      ++serial_team->t.t_serialized;
1362      this_thr->th.th_team_serialized = serial_team->t.t_serialized;
1363  
1364      // Nested level will be an index in the nested nthreads array
1365      int level = this_thr->th.th_team->t.t_level;
1366      // Thread value exists in the nested nthreads array for the next nested
1367      // level
1368  
1369      kmp_nested_nthreads_t *nested_nth = &__kmp_nested_nth;
1370      if (serial_team->t.t_nested_nth)
1371        nested_nth = serial_team->t.t_nested_nth;
1372      if (nested_nth->used && (level + 1 < nested_nth->used)) {
1373        this_thr->th.th_current_task->td_icvs.nproc = nested_nth->nth[level + 1];
1374      }
1375  
1376      serial_team->t.t_level++;
1377      KF_TRACE(10, ("__kmpc_serialized_parallel: T#%d increasing nesting level "
1378                    "of serial team %p to %d\n",
1379                    global_tid, serial_team, serial_team->t.t_level));
1380  
1381      /* allocate/push dispatch buffers stack */
1382      KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
1383      {
1384        dispatch_private_info_t *disp_buffer =
1385            (dispatch_private_info_t *)__kmp_allocate(
1386                sizeof(dispatch_private_info_t));
1387        disp_buffer->next = serial_team->t.t_dispatch->th_disp_buffer;
1388        serial_team->t.t_dispatch->th_disp_buffer = disp_buffer;
1389      }
1390      this_thr->th.th_dispatch = serial_team->t.t_dispatch;
1391  
1392      /* allocate/push task team stack */
1393      __kmp_push_task_team_node(this_thr, serial_team);
1394  
1395      KMP_MB();
1396    }
1397    KMP_CHECK_UPDATE(serial_team->t.t_cancel_request, cancel_noreq);
1398  
1399    // Perform the display affinity functionality for
1400    // serialized parallel regions
1401    if (__kmp_display_affinity) {
1402      if (this_thr->th.th_prev_level != serial_team->t.t_level ||
1403          this_thr->th.th_prev_num_threads != 1) {
1404        // NULL means use the affinity-format-var ICV
1405        __kmp_aux_display_affinity(global_tid, NULL);
1406        this_thr->th.th_prev_level = serial_team->t.t_level;
1407        this_thr->th.th_prev_num_threads = 1;
1408      }
1409    }
1410  
1411    if (__kmp_env_consistency_check)
1412      __kmp_push_parallel(global_tid, NULL);
1413  #if OMPT_SUPPORT
1414    serial_team->t.ompt_team_info.master_return_address = codeptr;
1415    if (ompt_enabled.enabled &&
1416        this_thr->th.ompt_thread_info.state != ompt_state_overhead) {
1417      OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame.ptr =
1418          OMPT_GET_FRAME_ADDRESS(0);
1419  
1420      ompt_lw_taskteam_t lw_taskteam;
1421      __ompt_lw_taskteam_init(&lw_taskteam, this_thr, global_tid,
1422                              &ompt_parallel_data, codeptr);
1423  
1424      __ompt_lw_taskteam_link(&lw_taskteam, this_thr, 1);
1425      // don't use lw_taskteam after linking. content was swaped
1426  
1427      /* OMPT implicit task begin */
1428      if (ompt_enabled.ompt_callback_implicit_task) {
1429        ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1430            ompt_scope_begin, OMPT_CUR_TEAM_DATA(this_thr),
1431            OMPT_CUR_TASK_DATA(this_thr), 1, __kmp_tid_from_gtid(global_tid),
1432            ompt_task_implicit); // TODO: Can this be ompt_task_initial?
1433        OMPT_CUR_TASK_INFO(this_thr)->thread_num =
1434            __kmp_tid_from_gtid(global_tid);
1435      }
1436  
1437      /* OMPT state */
1438      this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
1439      OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame.ptr =
1440          OMPT_GET_FRAME_ADDRESS(0);
1441    }
1442  #endif
1443  }
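// Illustrative sketch (hypothetical user code, not part of this file): the
// serialized path handled above is what a nested parallel region typically
// reaches when nesting is inactive (e.g. max-active-levels left at 1) or when
// only one thread is requested.  work() is a made-up user function.
//
//   #pragma omp parallel num_threads(4)
//   {
//     #pragma omp parallel num_threads(2) // inner region runs serialized on
//     { work(omp_get_thread_num()); }     // this thread's serial team
//   }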
1444  
1445  // Test if this fork is for a team closely nested in a teams construct
1446  static inline bool __kmp_is_fork_in_teams(kmp_info_t *master_th,
1447                                            microtask_t microtask, int level,
1448                                            int teams_level, kmp_va_list ap) {
1449    return (master_th->th.th_teams_microtask && ap &&
1450            microtask != (microtask_t)__kmp_teams_master && level == teams_level);
1451  }
1452  
1453  // Test if this fork is for the teams construct, i.e. to form the outer league
1454  // of teams
1455  static inline bool __kmp_is_entering_teams(int active_level, int level,
1456                                             int teams_level, kmp_va_list ap) {
1457    return ((ap == NULL && active_level == 0) ||
1458            (ap && teams_level > 0 && teams_level == level));
1459  }
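// Illustrative sketch (hypothetical user code, not part of this file): for a
// nest like the one below, the fork that forms the outer league of teams is
// the one __kmp_is_entering_teams() is meant to recognize, while the fork for
// the closely nested "parallel" is recognized by __kmp_is_fork_in_teams() and
// routed to __kmp_fork_in_teams() below.
//
//   #pragma omp teams num_teams(2)
//   #pragma omp parallel
//   { work(); }   // work() is a made-up user function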
1460  
1461  // AC: This is start of parallel that is nested inside teams construct.
1462  // The team is actual (hot), all workers are ready at the fork barrier.
1463  // No lock needed to initialize the team a bit, then free workers.
1464  static inline int
1465  __kmp_fork_in_teams(ident_t *loc, int gtid, kmp_team_t *parent_team,
1466                      kmp_int32 argc, kmp_info_t *master_th, kmp_root_t *root,
1467                      enum fork_context_e call_context, microtask_t microtask,
1468                      launch_t invoker, int master_set_numthreads, int level,
1469  #if OMPT_SUPPORT
1470                      ompt_data_t ompt_parallel_data, void *return_address,
1471  #endif
1472                      kmp_va_list ap) {
1473    void **argv;
1474    int i;
1475  
1476    parent_team->t.t_ident = loc;
1477    __kmp_alloc_argv_entries(argc, parent_team, TRUE);
1478    parent_team->t.t_argc = argc;
1479    argv = (void **)parent_team->t.t_argv;
1480    for (i = argc - 1; i >= 0; --i) {
1481      *argv++ = va_arg(kmp_va_deref(ap), void *);
1482    }
1483    // Increment our nested depth levels, but do not increase the serialization
1484    if (parent_team == master_th->th.th_serial_team) {
1485      // AC: we are in serialized parallel
1486      __kmpc_serialized_parallel(loc, gtid);
1487      KMP_DEBUG_ASSERT(parent_team->t.t_serialized > 1);
1488  
1489      if (call_context == fork_context_gnu) {
1490        // AC: need to decrement t_serialized for enquiry functions to work
1491        // correctly, will restore at join time
1492        parent_team->t.t_serialized--;
1493        return TRUE;
1494      }
1495  
1496  #if OMPD_SUPPORT
1497      parent_team->t.t_pkfn = microtask;
1498  #endif
1499  
1500  #if OMPT_SUPPORT
1501      void *dummy;
1502      void **exit_frame_p;
1503      ompt_data_t *implicit_task_data;
1504      ompt_lw_taskteam_t lw_taskteam;
1505  
1506      if (ompt_enabled.enabled) {
1507        __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1508                                &ompt_parallel_data, return_address);
1509        exit_frame_p = &(lw_taskteam.ompt_task_info.frame.exit_frame.ptr);
1510  
1511        __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
1512        // Don't use lw_taskteam after linking. Content was swapped.
1513  
1514        /* OMPT implicit task begin */
1515        implicit_task_data = OMPT_CUR_TASK_DATA(master_th);
1516        if (ompt_enabled.ompt_callback_implicit_task) {
1517          OMPT_CUR_TASK_INFO(master_th)->thread_num = __kmp_tid_from_gtid(gtid);
1518          ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1519              ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th), implicit_task_data,
1520              1, OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
1521        }
1522  
1523        /* OMPT state */
1524        master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1525      } else {
1526        exit_frame_p = &dummy;
1527      }
1528  #endif
1529  
1530      // AC: need to decrement t_serialized for enquiry functions to work
1531      // correctly, will restore at join time
1532      parent_team->t.t_serialized--;
1533  
1534      {
1535        KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1536        KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1537        __kmp_invoke_microtask(microtask, gtid, 0, argc, parent_team->t.t_argv
1538  #if OMPT_SUPPORT
1539                               ,
1540                               exit_frame_p
1541  #endif
1542                               );
1543      }
1544  
1545  #if OMPT_SUPPORT
1546      if (ompt_enabled.enabled) {
1547        *exit_frame_p = NULL;
1548        OMPT_CUR_TASK_INFO(master_th)->frame.exit_frame = ompt_data_none;
1549        if (ompt_enabled.ompt_callback_implicit_task) {
1550          ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1551              ompt_scope_end, NULL, implicit_task_data, 1,
1552              OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
1553        }
1554        ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
1555        __ompt_lw_taskteam_unlink(master_th);
1556        if (ompt_enabled.ompt_callback_parallel_end) {
1557          ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1558              &ompt_parallel_data, OMPT_CUR_TASK_DATA(master_th),
1559              OMPT_INVOKER(call_context) | ompt_parallel_team, return_address);
1560        }
1561        master_th->th.ompt_thread_info.state = ompt_state_overhead;
1562      }
1563  #endif
1564      return TRUE;
1565    }
1566  
1567    parent_team->t.t_pkfn = microtask;
1568    parent_team->t.t_invoke = invoker;
1569    KMP_ATOMIC_INC(&root->r.r_in_parallel);
1570    parent_team->t.t_active_level++;
1571    parent_team->t.t_level++;
1572    parent_team->t.t_def_allocator = master_th->th.th_def_allocator; // save
1573  
1574    // If the threads allocated to the team are less than the thread limit, update
1575    // the thread limit here. th_teams_size.nth is specific to this team nested
1576    // in a teams construct, the team is fully created, and we're about to do
1577    // the actual fork. Best to do this here so that the subsequent uses below
1578    // and in the join have the correct value.
1579    master_th->th.th_teams_size.nth = parent_team->t.t_nproc;
1580  
1581  #if OMPT_SUPPORT
1582    if (ompt_enabled.enabled) {
1583      ompt_lw_taskteam_t lw_taskteam;
1584      __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid, &ompt_parallel_data,
1585                              return_address);
1586      __ompt_lw_taskteam_link(&lw_taskteam, master_th, 1, true);
1587    }
1588  #endif
1589  
1590    /* Change number of threads in the team if requested */
1591    if (master_set_numthreads) { // The parallel has num_threads clause
1592      if (master_set_numthreads <= master_th->th.th_teams_size.nth) {
1593        // AC: can only reduce the number of threads dynamically, cannot increase
1594        kmp_info_t **other_threads = parent_team->t.t_threads;
1595        // NOTE: if using distributed barrier, we need to run this code block
1596        // even when the team size appears not to have changed from the max.
1597        int old_proc = master_th->th.th_teams_size.nth;
1598        if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
1599          __kmp_resize_dist_barrier(parent_team, old_proc, master_set_numthreads);
1600          __kmp_add_threads_to_team(parent_team, master_set_numthreads);
1601        }
1602        parent_team->t.t_nproc = master_set_numthreads;
1603        for (i = 0; i < master_set_numthreads; ++i) {
1604          other_threads[i]->th.th_team_nproc = master_set_numthreads;
1605        }
1606      }
1607      // Keep extra threads hot in the team for possible next parallels
1608      master_th->th.th_set_nproc = 0;
1609    }
1610  
1611  #if USE_DEBUGGER
1612    if (__kmp_debugging) { // Let debugger override number of threads.
1613      int nth = __kmp_omp_num_threads(loc);
1614      if (nth > 0) { // 0 means debugger doesn't want to change num threads
1615        master_set_numthreads = nth;
1616      }
1617    }
1618  #endif
1619  
1620    // Figure out the proc_bind policy for the nested parallel within teams
1621    kmp_proc_bind_t proc_bind = master_th->th.th_set_proc_bind;
1622    // proc_bind_default means don't update
1623    kmp_proc_bind_t proc_bind_icv = proc_bind_default;
1624    if (master_th->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
1625      proc_bind = proc_bind_false;
1626    } else {
1627      // No proc_bind clause specified; use current proc-bind-var
1628      if (proc_bind == proc_bind_default) {
1629        proc_bind = master_th->th.th_current_task->td_icvs.proc_bind;
1630      }
1631      /* else: The proc_bind policy was specified explicitly on parallel clause.
1632         This overrides proc-bind-var for this parallel region, but does not
1633         change proc-bind-var. */
1634      // Figure the value of proc-bind-var for the child threads.
1635      if ((level + 1 < __kmp_nested_proc_bind.used) &&
1636          (__kmp_nested_proc_bind.bind_types[level + 1] !=
1637           master_th->th.th_current_task->td_icvs.proc_bind)) {
1638        proc_bind_icv = __kmp_nested_proc_bind.bind_types[level + 1];
1639      }
1640    }
1641    KMP_CHECK_UPDATE(parent_team->t.t_proc_bind, proc_bind);
1642    // Need to change the bind-var ICV to correct value for each implicit task
1643    if (proc_bind_icv != proc_bind_default &&
1644        master_th->th.th_current_task->td_icvs.proc_bind != proc_bind_icv) {
1645      kmp_info_t **other_threads = parent_team->t.t_threads;
1646      for (i = 0; i < master_th->th.th_team_nproc; ++i) {
1647        other_threads[i]->th.th_current_task->td_icvs.proc_bind = proc_bind_icv;
1648      }
1649    }
1650    // Reset for next parallel region
1651    master_th->th.th_set_proc_bind = proc_bind_default;
1652  
1653  #if USE_ITT_BUILD && USE_ITT_NOTIFY
1654    if (((__itt_frame_submit_v3_ptr && __itt_get_timestamp_ptr) ||
1655         KMP_ITT_DEBUG) &&
1656        __kmp_forkjoin_frames_mode == 3 &&
1657        parent_team->t.t_active_level == 1 // only report frames at level 1
1658        && master_th->th.th_teams_size.nteams == 1) {
1659      kmp_uint64 tmp_time = __itt_get_timestamp();
1660      master_th->th.th_frame_time = tmp_time;
1661      parent_team->t.t_region_time = tmp_time;
1662    }
1663    if (__itt_stack_caller_create_ptr) {
1664      KMP_DEBUG_ASSERT(parent_team->t.t_stack_id == NULL);
1665      // create new stack stitching id before entering fork barrier
1666      parent_team->t.t_stack_id = __kmp_itt_stack_caller_create();
1667    }
1668  #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
1669  #if KMP_AFFINITY_SUPPORTED
1670    __kmp_partition_places(parent_team);
1671  #endif
1672  
1673    KF_TRACE(10, ("__kmp_fork_in_teams: before internal fork: root=%p, team=%p, "
1674                  "master_th=%p, gtid=%d\n",
1675                  root, parent_team, master_th, gtid));
1676    __kmp_internal_fork(loc, gtid, parent_team);
1677    KF_TRACE(10, ("__kmp_fork_in_teams: after internal fork: root=%p, team=%p, "
1678                  "master_th=%p, gtid=%d\n",
1679                  root, parent_team, master_th, gtid));
1680  
1681    if (call_context == fork_context_gnu)
1682      return TRUE;
1683  
1684    /* Invoke microtask for PRIMARY thread */
1685    KA_TRACE(20, ("__kmp_fork_in_teams: T#%d(%d:0) invoke microtask = %p\n", gtid,
1686                  parent_team->t.t_id, parent_team->t.t_pkfn));
1687  
1688    if (!parent_team->t.t_invoke(gtid)) {
1689      KMP_ASSERT2(0, "cannot invoke microtask for PRIMARY thread");
1690    }
1691    KA_TRACE(20, ("__kmp_fork_in_teams: T#%d(%d:0) done microtask = %p\n", gtid,
1692                  parent_team->t.t_id, parent_team->t.t_pkfn));
1693    KMP_MB(); /* Flush all pending memory write invalidates.  */
1694  
1695    KA_TRACE(20, ("__kmp_fork_in_teams: parallel exit T#%d\n", gtid));
1696  
1697    return TRUE;
1698  }
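// Illustrative sketch (hypothetical user code, not part of this file, and
// assuming 8 threads were actually created per team): inside a teams
// construct the num_threads clause handled above can only shrink the team
// sized at teams creation; a request larger than th_teams_size.nth leaves the
// team size unchanged.
//
//   #pragma omp teams num_teams(2) thread_limit(8)
//   #pragma omp parallel num_threads(4)  // team shrinks from 8 to 4 threads
//   { work(); }                          // work() is a made-up user function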
1699  
1700  // Create a serialized parallel region
1701  static inline int
1702  __kmp_serial_fork_call(ident_t *loc, int gtid, enum fork_context_e call_context,
1703                         kmp_int32 argc, microtask_t microtask, launch_t invoker,
1704                         kmp_info_t *master_th, kmp_team_t *parent_team,
1705  #if OMPT_SUPPORT
1706                         ompt_data_t *ompt_parallel_data, void **return_address,
1707                         ompt_data_t **parent_task_data,
1708  #endif
1709                         kmp_va_list ap) {
1710    kmp_team_t *team;
1711    int i;
1712    void **argv;
1713  
1714  /* josh todo: hypothetical question: what do we do for OS X*? */
1715  #if KMP_OS_LINUX &&                                                            \
1716      (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
1717    SimpleVLA<void *> args(argc);
1718  #else
1719    void **args = (void **)KMP_ALLOCA(argc * sizeof(void *));
1720  #endif /* KMP_OS_LINUX && ( KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || \
1721            KMP_ARCH_AARCH64) */
1722  
1723    KA_TRACE(
1724        20, ("__kmp_serial_fork_call: T#%d serializing parallel region\n", gtid));
1725  
1726    __kmpc_serialized_parallel(loc, gtid);
1727  
1728  #if OMPD_SUPPORT
1729    master_th->th.th_serial_team->t.t_pkfn = microtask;
1730  #endif
1731  
1732    if (call_context == fork_context_intel) {
1733      /* TODO this sucks, use the compiler itself to pass args! :) */
1734      master_th->th.th_serial_team->t.t_ident = loc;
1735      if (!ap) {
1736        // revert change made in __kmpc_serialized_parallel()
1737        master_th->th.th_serial_team->t.t_level--;
1738  // Get args from parent team for teams construct
1739  
1740  #if OMPT_SUPPORT
1741        void *dummy;
1742        void **exit_frame_p;
1743        ompt_task_info_t *task_info;
1744        ompt_lw_taskteam_t lw_taskteam;
1745  
1746        if (ompt_enabled.enabled) {
1747          __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1748                                  ompt_parallel_data, *return_address);
1749  
1750          __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
1751          // Don't use lw_taskteam after linking. Content was swapped.
1752          task_info = OMPT_CUR_TASK_INFO(master_th);
1753          exit_frame_p = &(task_info->frame.exit_frame.ptr);
1754          if (ompt_enabled.ompt_callback_implicit_task) {
1755            OMPT_CUR_TASK_INFO(master_th)->thread_num = __kmp_tid_from_gtid(gtid);
1756            ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1757                ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1758                &(task_info->task_data), 1,
1759                OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
1760          }
1761  
1762          /* OMPT state */
1763          master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1764        } else {
1765          exit_frame_p = &dummy;
1766        }
1767  #endif
1768  
1769        {
1770          KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1771          KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1772          __kmp_invoke_microtask(microtask, gtid, 0, argc, parent_team->t.t_argv
1773  #if OMPT_SUPPORT
1774                                 ,
1775                                 exit_frame_p
1776  #endif
1777                                 );
1778        }
1779  
1780  #if OMPT_SUPPORT
1781        if (ompt_enabled.enabled) {
1782          *exit_frame_p = NULL;
1783          if (ompt_enabled.ompt_callback_implicit_task) {
1784            ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1785                ompt_scope_end, NULL, &(task_info->task_data), 1,
1786                OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
1787          }
1788          *ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
1789          __ompt_lw_taskteam_unlink(master_th);
1790          if (ompt_enabled.ompt_callback_parallel_end) {
1791            ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1792                ompt_parallel_data, *parent_task_data,
1793                OMPT_INVOKER(call_context) | ompt_parallel_team, *return_address);
1794          }
1795          master_th->th.ompt_thread_info.state = ompt_state_overhead;
1796        }
1797  #endif
1798      } else if (microtask == (microtask_t)__kmp_teams_master) {
1799        KMP_DEBUG_ASSERT(master_th->th.th_team == master_th->th.th_serial_team);
1800        team = master_th->th.th_team;
1801        // team->t.t_pkfn = microtask;
1802        team->t.t_invoke = invoker;
1803        __kmp_alloc_argv_entries(argc, team, TRUE);
1804        team->t.t_argc = argc;
1805        argv = (void **)team->t.t_argv;
1806        for (i = argc - 1; i >= 0; --i)
1807          *argv++ = va_arg(kmp_va_deref(ap), void *);
1808        // AC: revert change made in __kmpc_serialized_parallel()
1809        //     because initial code in teams should have level=0
1810        team->t.t_level--;
1811        // AC: call special invoker for outer "parallel" of teams construct
1812        invoker(gtid);
1813  #if OMPT_SUPPORT
1814        if (ompt_enabled.enabled) {
1815          ompt_task_info_t *task_info = OMPT_CUR_TASK_INFO(master_th);
1816          if (ompt_enabled.ompt_callback_implicit_task) {
1817            ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1818                ompt_scope_end, NULL, &(task_info->task_data), 0,
1819                OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_initial);
1820          }
1821          if (ompt_enabled.ompt_callback_parallel_end) {
1822            ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1823                ompt_parallel_data, *parent_task_data,
1824                OMPT_INVOKER(call_context) | ompt_parallel_league,
1825                *return_address);
1826          }
1827          master_th->th.ompt_thread_info.state = ompt_state_overhead;
1828        }
1829  #endif
1830      } else {
1831        argv = args;
1832        for (i = argc - 1; i >= 0; --i)
1833          *argv++ = va_arg(kmp_va_deref(ap), void *);
1834        KMP_MB();
1835  
1836  #if OMPT_SUPPORT
1837        void *dummy;
1838        void **exit_frame_p;
1839        ompt_task_info_t *task_info;
1840        ompt_lw_taskteam_t lw_taskteam;
1841        ompt_data_t *implicit_task_data;
1842  
1843        if (ompt_enabled.enabled) {
1844          __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1845                                  ompt_parallel_data, *return_address);
1846          __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
1847          // Don't use lw_taskteam after linking. Content was swapped.
1848          task_info = OMPT_CUR_TASK_INFO(master_th);
1849          exit_frame_p = &(task_info->frame.exit_frame.ptr);
1850  
1851          /* OMPT implicit task begin */
1852          implicit_task_data = OMPT_CUR_TASK_DATA(master_th);
1853          if (ompt_enabled.ompt_callback_implicit_task) {
1854            ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1855                ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1856                implicit_task_data, 1, __kmp_tid_from_gtid(gtid),
1857                ompt_task_implicit);
1858            OMPT_CUR_TASK_INFO(master_th)->thread_num = __kmp_tid_from_gtid(gtid);
1859          }
1860  
1861          /* OMPT state */
1862          master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1863        } else {
1864          exit_frame_p = &dummy;
1865        }
1866  #endif
1867  
1868        {
1869          KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1870          KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1871          __kmp_invoke_microtask(microtask, gtid, 0, argc, args
1872  #if OMPT_SUPPORT
1873                                 ,
1874                                 exit_frame_p
1875  #endif
1876                                 );
1877        }
1878  
1879  #if OMPT_SUPPORT
1880        if (ompt_enabled.enabled) {
1881          *exit_frame_p = NULL;
1882          if (ompt_enabled.ompt_callback_implicit_task) {
1883            ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1884                ompt_scope_end, NULL, &(task_info->task_data), 1,
1885                OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
1886          }
1887  
1888          *ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
1889          __ompt_lw_taskteam_unlink(master_th);
1890          if (ompt_enabled.ompt_callback_parallel_end) {
1891            ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1892                ompt_parallel_data, *parent_task_data,
1893                OMPT_INVOKER(call_context) | ompt_parallel_team, *return_address);
1894          }
1895          master_th->th.ompt_thread_info.state = ompt_state_overhead;
1896        }
1897  #endif
1898      }
1899    } else if (call_context == fork_context_gnu) {
1900  #if OMPT_SUPPORT
1901      if (ompt_enabled.enabled) {
1902        ompt_lw_taskteam_t lwt;
1903        __ompt_lw_taskteam_init(&lwt, master_th, gtid, ompt_parallel_data,
1904                                *return_address);
1905  
1906        lwt.ompt_task_info.frame.exit_frame = ompt_data_none;
1907        __ompt_lw_taskteam_link(&lwt, master_th, 1);
1908      }
1909  // Don't use lw_taskteam after linking. Content was swapped.
1910  #endif
1911  
1912      // we were called from GNU native code
1913      KA_TRACE(20, ("__kmp_serial_fork_call: T#%d serial exit\n", gtid));
1914      return FALSE;
1915    } else {
1916      KMP_ASSERT2(call_context < fork_context_last,
1917                  "__kmp_serial_fork_call: unknown fork_context parameter");
1918    }
1919  
1920    KA_TRACE(20, ("__kmp_serial_fork_call: T#%d serial exit\n", gtid));
1921    KMP_MB();
1922    return FALSE;
1923  }
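// Rough control-flow summary of __kmp_serial_fork_call() above (comment-only
// sketch; see the function body for the full behavior):
//
//   __kmpc_serialized_parallel(loc, gtid);
//   if (call_context == fork_context_intel) {
//     if (!ap)                                   // teams: reuse parent argv
//       __kmp_invoke_microtask(..., parent_team->t.t_argv, ...);
//     else if (microtask == (microtask_t)__kmp_teams_master)
//       invoker(gtid);                           // outer "parallel" of teams
//     else                                       // plain serialized parallel
//       __kmp_invoke_microtask(..., args, ...);  // varargs copied into args
//   } else if (call_context == fork_context_gnu) {
//     return FALSE;  // GNU-compat caller runs the serialized body itself
//   }
//   return FALSE;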
1924  
1925  /* most of the work for a fork */
1926  /* return true if we really went parallel, false if serialized */
1927  int __kmp_fork_call(ident_t *loc, int gtid,
1928                      enum fork_context_e call_context, // Intel, GNU, ...
1929                      kmp_int32 argc, microtask_t microtask, launch_t invoker,
1930                      kmp_va_list ap) {
1931    void **argv;
1932    int i;
1933    int master_tid;
1934    int master_this_cons;
1935    kmp_team_t *team;
1936    kmp_team_t *parent_team;
1937    kmp_info_t *master_th;
1938    kmp_root_t *root;
1939    int nthreads;
1940    int master_active;
1941    int master_set_numthreads;
1942    int task_thread_limit = 0;
1943    int level;
1944    int active_level;
1945    int teams_level;
1946  #if KMP_NESTED_HOT_TEAMS
1947    kmp_hot_team_ptr_t **p_hot_teams;
1948  #endif
1949    { // KMP_TIME_BLOCK
1950      KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_fork_call);
1951      KMP_COUNT_VALUE(OMP_PARALLEL_args, argc);
1952  
1953      KA_TRACE(20, ("__kmp_fork_call: enter T#%d\n", gtid));
1954      if (__kmp_stkpadding > 0 && __kmp_root[gtid] != NULL) {
1955        /* Some systems prefer the stack for the root thread(s) to start with */
1956        /* some gap from the parent stack to prevent false sharing. */
1957        void *dummy = KMP_ALLOCA(__kmp_stkpadding);
1958        /* These 2 lines below are so this does not get optimized out */
1959        if (__kmp_stkpadding > KMP_MAX_STKPADDING)
1960          __kmp_stkpadding += (short)((kmp_int64)dummy);
1961      }
1962  
1963      /* initialize if needed */
1964      KMP_DEBUG_ASSERT(
1965          __kmp_init_serial); // AC: potentially unsafe, not in sync with shutdown
1966      if (!TCR_4(__kmp_init_parallel))
1967        __kmp_parallel_initialize();
1968      __kmp_resume_if_soft_paused();
1969  
1970      /* setup current data */
1971      // AC: potentially unsafe, not in sync with library shutdown,
1972      // __kmp_threads can be freed
1973      master_th = __kmp_threads[gtid];
1974  
1975      parent_team = master_th->th.th_team;
1976      master_tid = master_th->th.th_info.ds.ds_tid;
1977      master_this_cons = master_th->th.th_local.this_construct;
1978      root = master_th->th.th_root;
1979      master_active = root->r.r_active;
1980      master_set_numthreads = master_th->th.th_set_nproc;
1981      task_thread_limit =
1982          master_th->th.th_current_task->td_icvs.task_thread_limit;
1983  
1984  #if OMPT_SUPPORT
1985      ompt_data_t ompt_parallel_data = ompt_data_none;
1986      ompt_data_t *parent_task_data;
1987      ompt_frame_t *ompt_frame;
1988      void *return_address = NULL;
1989  
1990      if (ompt_enabled.enabled) {
1991        __ompt_get_task_info_internal(0, NULL, &parent_task_data, &ompt_frame,
1992                                      NULL, NULL);
1993        return_address = OMPT_LOAD_RETURN_ADDRESS(gtid);
1994      }
1995  #endif
1996  
1997      // Assign affinity to root thread if it hasn't happened yet
1998      __kmp_assign_root_init_mask();
1999  
2000      // Nested level will be an index in the nested nthreads array
2001      level = parent_team->t.t_level;
2002      // used to launch non-serial teams even if nested is not allowed
2003      active_level = parent_team->t.t_active_level;
2004      // needed to check nesting inside the teams
2005      teams_level = master_th->th.th_teams_level;
2006  #if KMP_NESTED_HOT_TEAMS
2007      p_hot_teams = &master_th->th.th_hot_teams;
2008      if (*p_hot_teams == NULL && __kmp_hot_teams_max_level > 0) {
2009        *p_hot_teams = (kmp_hot_team_ptr_t *)__kmp_allocate(
2010            sizeof(kmp_hot_team_ptr_t) * __kmp_hot_teams_max_level);
2011        (*p_hot_teams)[0].hot_team = root->r.r_hot_team;
2012        // it is either actual or not needed (when active_level > 0)
2013        (*p_hot_teams)[0].hot_team_nth = 1;
2014      }
2015  #endif
2016  
2017  #if OMPT_SUPPORT
2018      if (ompt_enabled.enabled) {
2019        if (ompt_enabled.ompt_callback_parallel_begin) {
2020          int team_size = master_set_numthreads
2021                              ? master_set_numthreads
2022                              : get__nproc_2(parent_team, master_tid);
2023          int flags = OMPT_INVOKER(call_context) |
2024                      ((microtask == (microtask_t)__kmp_teams_master)
2025                           ? ompt_parallel_league
2026                           : ompt_parallel_team);
2027          ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)(
2028              parent_task_data, ompt_frame, &ompt_parallel_data, team_size, flags,
2029              return_address);
2030        }
2031        master_th->th.ompt_thread_info.state = ompt_state_overhead;
2032      }
2033  #endif
2034  
2035      master_th->th.th_ident = loc;
2036  
2037      // Parallel closely nested in teams construct:
2038      if (__kmp_is_fork_in_teams(master_th, microtask, level, teams_level, ap)) {
2039        return __kmp_fork_in_teams(loc, gtid, parent_team, argc, master_th, root,
2040                                   call_context, microtask, invoker,
2041                                   master_set_numthreads, level,
2042  #if OMPT_SUPPORT
2043                                   ompt_parallel_data, return_address,
2044  #endif
2045                                   ap);
2046      } // End parallel closely nested in teams construct
2047  
2048      // Need this to happen before we determine the number of threads, not while
2049      // we are allocating the team
2050      //__kmp_push_current_task_to_thread(master_th, parent_team, 0);
2051  
2052      KMP_DEBUG_ASSERT_TASKTEAM_INVARIANT(parent_team, master_th);
2053  
2054      // Determine the number of threads
2055      int enter_teams =
2056          __kmp_is_entering_teams(active_level, level, teams_level, ap);
2057      if ((!enter_teams &&
2058           (parent_team->t.t_active_level >=
2059            master_th->th.th_current_task->td_icvs.max_active_levels)) ||
2060          (__kmp_library == library_serial)) {
2061        KC_TRACE(10, ("__kmp_fork_call: T#%d serializing team\n", gtid));
2062        nthreads = 1;
2063      } else {
2064        nthreads = master_set_numthreads
2065                       ? master_set_numthreads
2066                       // TODO: get nproc directly from current task
2067                       : get__nproc_2(parent_team, master_tid);
2068      // Use the thread_limit set for the current target task if one exists;
2069      // otherwise go with the deduced nthreads
2070        nthreads = task_thread_limit > 0 && task_thread_limit < nthreads
2071                       ? task_thread_limit
2072                       : nthreads;
2073      // Check whether we need to take the forkjoin lock (no need for a
2074      // serialized parallel outside of a teams construct).
2075        if (nthreads > 1) {
2076          /* determine how many new threads we can use */
2077          __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
2078          /* AC: If we execute teams from parallel region (on host), then teams
2079             should be created but each can only have 1 thread if nesting is
2080             disabled. If teams called from serial region, then teams and their
2081             threads should be created regardless of the nesting setting. */
2082          nthreads = __kmp_reserve_threads(root, parent_team, master_tid,
2083                                           nthreads, enter_teams);
2084          if (nthreads == 1) {
2085            // Free lock for single thread execution here; for multi-thread
2086            // execution it will be freed later after team of threads created
2087            // and initialized
2088            __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2089          }
2090        }
2091      }
2092      KMP_DEBUG_ASSERT(nthreads > 0);
2093  
2094      // If we temporarily changed the set number of threads then restore it now
2095      master_th->th.th_set_nproc = 0;
2096  
2097      if (nthreads == 1) {
2098        return __kmp_serial_fork_call(loc, gtid, call_context, argc, microtask,
2099                                      invoker, master_th, parent_team,
2100  #if OMPT_SUPPORT
2101                                      &ompt_parallel_data, &return_address,
2102                                      &parent_task_data,
2103  #endif
2104                                      ap);
2105      } // if (nthreads == 1)
2106  
2107      // GEH: only modify the executing flag in the case when not serialized
2108      //      serialized case is handled in kmpc_serialized_parallel
2109      KF_TRACE(10, ("__kmp_fork_call: parent_team_aclevel=%d, master_th=%p, "
2110                    "curtask=%p, curtask_max_aclevel=%d\n",
2111                    parent_team->t.t_active_level, master_th,
2112                    master_th->th.th_current_task,
2113                    master_th->th.th_current_task->td_icvs.max_active_levels));
2114      // TODO: GEH - cannot do this assertion because root thread not set up as
2115      // executing
2116      // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 1 );
2117      master_th->th.th_current_task->td_flags.executing = 0;
2118  
2119      if (!master_th->th.th_teams_microtask || level > teams_level) {
2120        /* Increment our nested depth level */
2121        KMP_ATOMIC_INC(&root->r.r_in_parallel);
2122      }
2123  
2124      // See if we need to make a copy of the ICVs.
2125      int nthreads_icv = master_th->th.th_current_task->td_icvs.nproc;
2126      kmp_nested_nthreads_t *nested_nth = NULL;
2127      if (!master_th->th.th_set_nested_nth &&
2128          (level + 1 < parent_team->t.t_nested_nth->used) &&
2129          (parent_team->t.t_nested_nth->nth[level + 1] != nthreads_icv)) {
2130        nthreads_icv = parent_team->t.t_nested_nth->nth[level + 1];
2131      } else if (master_th->th.th_set_nested_nth) {
2132        nested_nth = __kmp_override_nested_nth(master_th, level);
2133        if ((level + 1 < nested_nth->used) &&
2134            (nested_nth->nth[level + 1] != nthreads_icv))
2135          nthreads_icv = nested_nth->nth[level + 1];
2136        else
2137          nthreads_icv = 0; // don't update
2138      } else {
2139        nthreads_icv = 0; // don't update
2140      }
2141  
2142      // Figure out the proc_bind_policy for the new team.
2143      kmp_proc_bind_t proc_bind = master_th->th.th_set_proc_bind;
2144      // proc_bind_default means don't update
2145      kmp_proc_bind_t proc_bind_icv = proc_bind_default;
2146      if (master_th->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
2147        proc_bind = proc_bind_false;
2148      } else {
2149        // No proc_bind clause specified; use current proc-bind-var for this
2150        // parallel region
2151        if (proc_bind == proc_bind_default) {
2152          proc_bind = master_th->th.th_current_task->td_icvs.proc_bind;
2153        }
2154        // Have teams construct take proc_bind value from KMP_TEAMS_PROC_BIND
2155        if (master_th->th.th_teams_microtask &&
2156            microtask == (microtask_t)__kmp_teams_master) {
2157          proc_bind = __kmp_teams_proc_bind;
2158        }
2159        /* else: The proc_bind policy was specified explicitly on parallel clause.
2160           This overrides proc-bind-var for this parallel region, but does not
2161           change proc-bind-var. */
2162        // Figure the value of proc-bind-var for the child threads.
2163        if ((level + 1 < __kmp_nested_proc_bind.used) &&
2164            (__kmp_nested_proc_bind.bind_types[level + 1] !=
2165             master_th->th.th_current_task->td_icvs.proc_bind)) {
2166          // Do not modify the proc bind icv for the two teams construct forks
2167          // They just let the proc bind icv pass through
2168          if (!master_th->th.th_teams_microtask ||
2169              !(microtask == (microtask_t)__kmp_teams_master || ap == NULL))
2170            proc_bind_icv = __kmp_nested_proc_bind.bind_types[level + 1];
2171        }
2172      }
2173  
2174      // Reset for next parallel region
2175      master_th->th.th_set_proc_bind = proc_bind_default;
2176  
2177      if ((nthreads_icv > 0) || (proc_bind_icv != proc_bind_default)) {
2178        kmp_internal_control_t new_icvs;
2179        copy_icvs(&new_icvs, &master_th->th.th_current_task->td_icvs);
2180        new_icvs.next = NULL;
2181        if (nthreads_icv > 0) {
2182          new_icvs.nproc = nthreads_icv;
2183        }
2184        if (proc_bind_icv != proc_bind_default) {
2185          new_icvs.proc_bind = proc_bind_icv;
2186        }
2187  
2188        /* allocate a new parallel team */
2189        KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n"));
2190        team = __kmp_allocate_team(root, nthreads, nthreads,
2191  #if OMPT_SUPPORT
2192                                   ompt_parallel_data,
2193  #endif
2194                                   proc_bind, &new_icvs,
2195                                   argc USE_NESTED_HOT_ARG(master_th));
2196        if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar)
2197          copy_icvs((kmp_internal_control_t *)team->t.b->team_icvs, &new_icvs);
2198      } else {
2199        /* allocate a new parallel team */
2200        KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n"));
2201        team = __kmp_allocate_team(root, nthreads, nthreads,
2202  #if OMPT_SUPPORT
2203                                   ompt_parallel_data,
2204  #endif
2205                                   proc_bind,
2206                                   &master_th->th.th_current_task->td_icvs,
2207                                   argc USE_NESTED_HOT_ARG(master_th));
2208        if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar)
2209          copy_icvs((kmp_internal_control_t *)team->t.b->team_icvs,
2210                    &master_th->th.th_current_task->td_icvs);
2211      }
2212      KF_TRACE(
2213          10, ("__kmp_fork_call: after __kmp_allocate_team - team = %p\n", team));
2214  
2215      /* setup the new team */
2216      KMP_CHECK_UPDATE(team->t.t_master_tid, master_tid);
2217      KMP_CHECK_UPDATE(team->t.t_master_this_cons, master_this_cons);
2218      KMP_CHECK_UPDATE(team->t.t_ident, loc);
2219      KMP_CHECK_UPDATE(team->t.t_parent, parent_team);
2220      KMP_CHECK_UPDATE_SYNC(team->t.t_pkfn, microtask);
2221  #if OMPT_SUPPORT
2222      KMP_CHECK_UPDATE_SYNC(team->t.ompt_team_info.master_return_address,
2223                            return_address);
2224  #endif
2225      KMP_CHECK_UPDATE(team->t.t_invoke, invoker); // TODO move to root, maybe
2226      // TODO: parent_team->t.t_level == INT_MAX ???
2227      if (!master_th->th.th_teams_microtask || level > teams_level) {
2228        int new_level = parent_team->t.t_level + 1;
2229        KMP_CHECK_UPDATE(team->t.t_level, new_level);
2230        new_level = parent_team->t.t_active_level + 1;
2231        KMP_CHECK_UPDATE(team->t.t_active_level, new_level);
2232      } else {
2233        // AC: Do not increase parallel level at start of the teams construct
2234        int new_level = parent_team->t.t_level;
2235        KMP_CHECK_UPDATE(team->t.t_level, new_level);
2236        new_level = parent_team->t.t_active_level;
2237        KMP_CHECK_UPDATE(team->t.t_active_level, new_level);
2238      }
2239      kmp_r_sched_t new_sched = get__sched_2(parent_team, master_tid);
2240      // set primary thread's schedule as new run-time schedule
2241      KMP_CHECK_UPDATE(team->t.t_sched.sched, new_sched.sched);
2242  
2243      KMP_CHECK_UPDATE(team->t.t_cancel_request, cancel_noreq);
2244      KMP_CHECK_UPDATE(team->t.t_def_allocator, master_th->th.th_def_allocator);
2245  
2246      // Check if hot team has potentially outdated list, and if so, free it
2247      if (team->t.t_nested_nth &&
2248          team->t.t_nested_nth != parent_team->t.t_nested_nth) {
2249        KMP_INTERNAL_FREE(team->t.t_nested_nth->nth);
2250        KMP_INTERNAL_FREE(team->t.t_nested_nth);
2251        team->t.t_nested_nth = NULL;
2252      }
2253      team->t.t_nested_nth = parent_team->t.t_nested_nth;
2254      if (master_th->th.th_set_nested_nth) {
2255        if (!nested_nth)
2256          nested_nth = __kmp_override_nested_nth(master_th, level);
2257        team->t.t_nested_nth = nested_nth;
2258        KMP_INTERNAL_FREE(master_th->th.th_set_nested_nth);
2259        master_th->th.th_set_nested_nth = NULL;
2260        master_th->th.th_set_nested_nth_sz = 0;
2261        master_th->th.th_nt_strict = false;
2262      }
2263  
2264      // Update the floating point rounding in the team if required.
2265      propagateFPControl(team);
2266  #if OMPD_SUPPORT
2267      if (ompd_state & OMPD_ENABLE_BP)
2268        ompd_bp_parallel_begin();
2269  #endif
2270  
2271      KA_TRACE(
2272          20,
2273          ("__kmp_fork_call: T#%d(%d:%d)->(%d:0) created a team of %d threads\n",
2274           gtid, parent_team->t.t_id, team->t.t_master_tid, team->t.t_id,
2275           team->t.t_nproc));
2276      KMP_DEBUG_ASSERT(team != root->r.r_hot_team ||
2277                       (team->t.t_master_tid == 0 &&
2278                        (team->t.t_parent == root->r.r_root_team ||
2279                         team->t.t_parent->t.t_serialized)));
2280      KMP_MB();
2281  
2282      /* now, setup the arguments */
2283      argv = (void **)team->t.t_argv;
2284      if (ap) {
2285        for (i = argc - 1; i >= 0; --i) {
2286          void *new_argv = va_arg(kmp_va_deref(ap), void *);
2287          KMP_CHECK_UPDATE(*argv, new_argv);
2288          argv++;
2289        }
2290      } else {
2291        for (i = 0; i < argc; ++i) {
2292          // Get args from parent team for teams construct
2293          KMP_CHECK_UPDATE(argv[i], team->t.t_parent->t.t_argv[i]);
2294        }
2295      }
2296  
2297      /* now actually fork the threads */
2298      KMP_CHECK_UPDATE(team->t.t_master_active, master_active);
2299      if (!root->r.r_active) // Only do assignment if it prevents cache ping-pong
2300        root->r.r_active = TRUE;
2301  
2302      __kmp_fork_team_threads(root, team, master_th, gtid, !ap);
2303      __kmp_setup_icv_copy(team, nthreads,
2304                           &master_th->th.th_current_task->td_icvs, loc);
2305  
2306  #if OMPT_SUPPORT
2307      master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
2308  #endif
2309  
2310      __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2311  
2312  #if USE_ITT_BUILD
2313      if (team->t.t_active_level == 1 // only report frames at level 1
2314          && !master_th->th.th_teams_microtask) { // not in teams construct
2315  #if USE_ITT_NOTIFY
2316        if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
2317            (__kmp_forkjoin_frames_mode == 3 ||
2318             __kmp_forkjoin_frames_mode == 1)) {
2319          kmp_uint64 tmp_time = 0;
2320          if (__itt_get_timestamp_ptr)
2321            tmp_time = __itt_get_timestamp();
2322          // Internal fork - report frame begin
2323          master_th->th.th_frame_time = tmp_time;
2324          if (__kmp_forkjoin_frames_mode == 3)
2325            team->t.t_region_time = tmp_time;
2326        } else
2327  // only one notification scheme (either "submit" or "forking/joined", not both)
2328  #endif /* USE_ITT_NOTIFY */
2329          if ((__itt_frame_begin_v3_ptr || KMP_ITT_DEBUG) &&
2330              __kmp_forkjoin_frames && !__kmp_forkjoin_frames_mode) {
2331            // Mark start of "parallel" region for Intel(R) VTune(TM) analyzer.
2332            __kmp_itt_region_forking(gtid, team->t.t_nproc, 0);
2333          }
2334      }
2335  #endif /* USE_ITT_BUILD */
2336  
2337      /* now go on and do the work */
2338      KMP_DEBUG_ASSERT(team == __kmp_threads[gtid]->th.th_team);
2339      KMP_MB();
2340      KF_TRACE(10,
2341               ("__kmp_internal_fork : root=%p, team=%p, master_th=%p, gtid=%d\n",
2342                root, team, master_th, gtid));
2343  
2344  #if USE_ITT_BUILD
2345      if (__itt_stack_caller_create_ptr) {
2346        // create new stack stitching id before entering fork barrier
2347        if (!enter_teams) {
2348          KMP_DEBUG_ASSERT(team->t.t_stack_id == NULL);
2349          team->t.t_stack_id = __kmp_itt_stack_caller_create();
2350        } else if (parent_team->t.t_serialized) {
2351          // keep stack stitching id in the serialized parent_team;
2352          // current team will be used for parallel inside the teams;
2353          // if parent_team is active, then it already keeps stack stitching id
2354          // for the league of teams
2355          KMP_DEBUG_ASSERT(parent_team->t.t_stack_id == NULL);
2356          parent_team->t.t_stack_id = __kmp_itt_stack_caller_create();
2357        }
2358      }
2359  #endif /* USE_ITT_BUILD */
2360  
2361      // AC: skip __kmp_internal_fork at teams construct, let only primary
2362      // threads execute
2363      if (ap) {
2364        __kmp_internal_fork(loc, gtid, team);
2365        KF_TRACE(10, ("__kmp_internal_fork : after : root=%p, team=%p, "
2366                      "master_th=%p, gtid=%d\n",
2367                      root, team, master_th, gtid));
2368      }
2369  
2370      if (call_context == fork_context_gnu) {
2371        KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
2372        return TRUE;
2373      }
2374  
2375      /* Invoke microtask for PRIMARY thread */
2376      KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n", gtid,
2377                    team->t.t_id, team->t.t_pkfn));
2378    } // END of timer KMP_fork_call block
2379  
2380  #if KMP_STATS_ENABLED
2381    // If beginning a teams construct, then change thread state
2382    stats_state_e previous_state = KMP_GET_THREAD_STATE();
2383    if (!ap) {
2384      KMP_SET_THREAD_STATE(stats_state_e::TEAMS_REGION);
2385    }
2386  #endif
2387  
2388    if (!team->t.t_invoke(gtid)) {
2389      KMP_ASSERT2(0, "cannot invoke microtask for PRIMARY thread");
2390    }
2391  
2392  #if KMP_STATS_ENABLED
2393    // If was beginning of a teams construct, then reset thread state
2394    if (!ap) {
2395      KMP_SET_THREAD_STATE(previous_state);
2396    }
2397  #endif
2398  
2399    KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n", gtid,
2400                  team->t.t_id, team->t.t_pkfn));
2401    KMP_MB(); /* Flush all pending memory write invalidates.  */
2402  
2403    KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
2404  #if OMPT_SUPPORT
2405    if (ompt_enabled.enabled) {
2406      master_th->th.ompt_thread_info.state = ompt_state_overhead;
2407    }
2408  #endif
2409  
2410    return TRUE;
2411  }
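// Illustrative sketch (hypothetical user code and assumed typical compiler
// lowering, not part of this file): a region such as
//
//   #pragma omp parallel num_threads(4)
//   { work(omp_get_thread_num()); }   // work() is a made-up user function
//
// is outlined into a microtask by the compiler and usually reaches
// __kmp_fork_call() through the __kmpc_fork_call() entry point; the primary
// thread then runs its copy of the microtask via team->t.t_invoke(gtid) as
// seen above, while the workers are released at the fork barrier.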
2412  
2413  #if OMPT_SUPPORT
2414  static inline void __kmp_join_restore_state(kmp_info_t *thread,
2415                                              kmp_team_t *team) {
2416    // restore state outside the region
2417    thread->th.ompt_thread_info.state =
2418        ((team->t.t_serialized) ? ompt_state_work_serial
2419                                : ompt_state_work_parallel);
2420  }
2421  
2422  static inline void __kmp_join_ompt(int gtid, kmp_info_t *thread,
2423                                     kmp_team_t *team, ompt_data_t *parallel_data,
2424                                     int flags, void *codeptr) {
2425    ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2426    if (ompt_enabled.ompt_callback_parallel_end) {
2427      ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
2428          parallel_data, &(task_info->task_data), flags, codeptr);
2429    }
2430  
2431    task_info->frame.enter_frame = ompt_data_none;
2432    __kmp_join_restore_state(thread, team);
2433  }
2434  #endif
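// Note: these helpers mirror the fork side. __kmp_fork_call() raises
// ompt_callback_parallel_begin before the region runs, and __kmp_join_ompt()
// raises the matching ompt_callback_parallel_end before restoring the
// thread's OMPT state through __kmp_join_restore_state().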
2435  
2436  void __kmp_join_call(ident_t *loc, int gtid
2437  #if OMPT_SUPPORT
2438                       ,
2439                       enum fork_context_e fork_context
2440  #endif
2441                       ,
2442                       int exit_teams) {
2443    KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_join_call);
2444    kmp_team_t *team;
2445    kmp_team_t *parent_team;
2446    kmp_info_t *master_th;
2447    kmp_root_t *root;
2448    int master_active;
2449  
2450    KA_TRACE(20, ("__kmp_join_call: enter T#%d\n", gtid));
2451  
2452    /* setup current data */
2453    master_th = __kmp_threads[gtid];
2454    root = master_th->th.th_root;
2455    team = master_th->th.th_team;
2456    parent_team = team->t.t_parent;
2457  
2458    master_th->th.th_ident = loc;
2459  
2460  #if OMPT_SUPPORT
2461    void *team_microtask = (void *)team->t.t_pkfn;
2462    // For GOMP interface with serialized parallel, need the
2463    // __kmpc_end_serialized_parallel to call hooks for OMPT end-implicit-task
2464    // and end-parallel events.
2465    if (ompt_enabled.enabled &&
2466        !(team->t.t_serialized && fork_context == fork_context_gnu)) {
2467      master_th->th.ompt_thread_info.state = ompt_state_overhead;
2468    }
2469  #endif
2470  
2471  #if KMP_DEBUG
2472    if (__kmp_tasking_mode != tskm_immediate_exec && !exit_teams) {
2473      KA_TRACE(20, ("__kmp_join_call: T#%d, old team = %p old task_team = %p, "
2474                    "th_task_team = %p\n",
2475                    __kmp_gtid_from_thread(master_th), team,
2476                    team->t.t_task_team[master_th->th.th_task_state],
2477                    master_th->th.th_task_team));
2478      KMP_DEBUG_ASSERT_TASKTEAM_INVARIANT(team, master_th);
2479    }
2480  #endif
2481  
2482    if (team->t.t_serialized) {
2483      if (master_th->th.th_teams_microtask) {
2484        // We are in teams construct
2485        int level = team->t.t_level;
2486        int tlevel = master_th->th.th_teams_level;
2487        if (level == tlevel) {
2488          // AC: we haven't incremented it earlier at start of teams construct,
2489          //     so do it here - at the end of teams construct
2490          team->t.t_level++;
2491        } else if (level == tlevel + 1) {
2492          // AC: we are exiting parallel inside teams, need to increment
2493          // serialization in order to restore it in the next call to
2494          // __kmpc_end_serialized_parallel
2495          team->t.t_serialized++;
2496        }
2497      }
2498      __kmpc_end_serialized_parallel(loc, gtid);
2499  
2500  #if OMPT_SUPPORT
2501      if (ompt_enabled.enabled) {
2502        if (fork_context == fork_context_gnu) {
2503          __ompt_lw_taskteam_unlink(master_th);
2504        }
2505        __kmp_join_restore_state(master_th, parent_team);
2506      }
2507  #endif
2508  
2509      return;
2510    }
2511  
2512    master_active = team->t.t_master_active;
2513  
2514    if (!exit_teams) {
2515      // AC: No barrier for internal teams at exit from teams construct.
2516      //     But there is barrier for external team (league).
2517      __kmp_internal_join(loc, gtid, team);
2518  #if USE_ITT_BUILD
2519      if (__itt_stack_caller_create_ptr) {
2520        KMP_DEBUG_ASSERT(team->t.t_stack_id != NULL);
2521        // destroy the stack stitching id after join barrier
2522        __kmp_itt_stack_caller_destroy((__itt_caller)team->t.t_stack_id);
2523        team->t.t_stack_id = NULL;
2524      }
2525  #endif
2526    } else {
2527      master_th->th.th_task_state =
2528          0; // AC: no tasking in teams (out of any parallel)
2529  #if USE_ITT_BUILD
2530      if (__itt_stack_caller_create_ptr && parent_team->t.t_serialized) {
2531        KMP_DEBUG_ASSERT(parent_team->t.t_stack_id != NULL);
2532        // destroy the stack stitching id on exit from the teams construct
2533        // if parent_team is active, then the id will be destroyed later on
2534        // by master of the league of teams
2535        __kmp_itt_stack_caller_destroy((__itt_caller)parent_team->t.t_stack_id);
2536        parent_team->t.t_stack_id = NULL;
2537      }
2538  #endif
2539    }
2540  
2541    KMP_MB();
2542  
2543  #if OMPT_SUPPORT
2544    ompt_data_t *parallel_data = &(team->t.ompt_team_info.parallel_data);
2545    void *codeptr = team->t.ompt_team_info.master_return_address;
2546  #endif
2547  
2548  #if USE_ITT_BUILD
2549    // Mark end of "parallel" region for Intel(R) VTune(TM) analyzer.
2550    if (team->t.t_active_level == 1 &&
2551        (!master_th->th.th_teams_microtask || /* not in teams construct */
2552         master_th->th.th_teams_size.nteams == 1)) {
2553      master_th->th.th_ident = loc;
2554      // only one notification scheme (either "submit" or "forking/joined", not
2555      // both)
2556      if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
2557          __kmp_forkjoin_frames_mode == 3)
2558        __kmp_itt_frame_submit(gtid, team->t.t_region_time,
2559                               master_th->th.th_frame_time, 0, loc,
2560                               master_th->th.th_team_nproc, 1);
2561      else if ((__itt_frame_end_v3_ptr || KMP_ITT_DEBUG) &&
2562               !__kmp_forkjoin_frames_mode && __kmp_forkjoin_frames)
2563        __kmp_itt_region_joined(gtid);
2564    } // active_level == 1
2565  #endif /* USE_ITT_BUILD */
2566  
2567  #if KMP_AFFINITY_SUPPORTED
2568    if (!exit_teams) {
2569      // Restore master thread's partition.
2570      master_th->th.th_first_place = team->t.t_first_place;
2571      master_th->th.th_last_place = team->t.t_last_place;
2572    }
2573  #endif // KMP_AFFINITY_SUPPORTED
2574  
2575    if (master_th->th.th_teams_microtask && !exit_teams &&
2576        team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
2577        team->t.t_level == master_th->th.th_teams_level + 1) {
2578  // AC: We need to leave the team structure intact at the end of a parallel
2579  // region inside the teams construct, so that the same (hot) team is reused by
2580  // the next parallel region; only the nesting levels are adjusted.
2581  #if OMPT_SUPPORT
2582      ompt_data_t ompt_parallel_data = ompt_data_none;
2583      if (ompt_enabled.enabled) {
2584        ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2585        if (ompt_enabled.ompt_callback_implicit_task) {
2586          int ompt_team_size = team->t.t_nproc;
2587          ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
2588              ompt_scope_end, NULL, &(task_info->task_data), ompt_team_size,
2589              OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
2590        }
2591        task_info->frame.exit_frame = ompt_data_none;
2592        task_info->task_data = ompt_data_none;
2593        ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
2594        __ompt_lw_taskteam_unlink(master_th);
2595      }
2596  #endif
2597      /* Decrement our nested depth level */
2598      team->t.t_level--;
2599      team->t.t_active_level--;
2600      KMP_ATOMIC_DEC(&root->r.r_in_parallel);
2601  
2602      // Restore number of threads in the team if needed. This code relies on
2603      // the proper adjustment of th_teams_size.nth after the fork in
2604      // __kmp_teams_master on each teams primary thread in the case that
2605      // __kmp_reserve_threads reduced it.
2606      if (master_th->th.th_team_nproc < master_th->th.th_teams_size.nth) {
2607        int old_num = master_th->th.th_team_nproc;
2608        int new_num = master_th->th.th_teams_size.nth;
2609        kmp_info_t **other_threads = team->t.t_threads;
2610        team->t.t_nproc = new_num;
2611        for (int i = 0; i < old_num; ++i) {
2612          other_threads[i]->th.th_team_nproc = new_num;
2613        }
2614        // Adjust states of the unused threads of the team
2615        for (int i = old_num; i < new_num; ++i) {
2616          // Re-initialize thread's barrier data.
2617          KMP_DEBUG_ASSERT(other_threads[i]);
2618          kmp_balign_t *balign = other_threads[i]->th.th_bar;
2619          for (int b = 0; b < bs_last_barrier; ++b) {
2620            balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
2621            KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
2622  #if USE_DEBUGGER
2623            balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
2624  #endif
2625          }
2626          if (__kmp_tasking_mode != tskm_immediate_exec) {
2627            // Synchronize thread's task state
2628            other_threads[i]->th.th_task_state = master_th->th.th_task_state;
2629          }
2630        }
2631      }
2632  
2633  #if OMPT_SUPPORT
2634      if (ompt_enabled.enabled) {
2635        __kmp_join_ompt(gtid, master_th, parent_team, &ompt_parallel_data,
2636                        OMPT_INVOKER(fork_context) | ompt_parallel_team, codeptr);
2637      }
2638  #endif
2639  
2640      return;
2641    }
2642  
2643    /* do cleanup and restore the parent team */
2644    master_th->th.th_info.ds.ds_tid = team->t.t_master_tid;
2645    master_th->th.th_local.this_construct = team->t.t_master_this_cons;
2646  
2647    master_th->th.th_dispatch = &parent_team->t.t_dispatch[team->t.t_master_tid];
2648  
2649    /* jc: The following lock has instructions with REL and ACQ semantics,
2650       separating the parallel user code called in this parallel region
2651       from the serial user code called after this function returns. */
2652    __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
2653  
2654    if (!master_th->th.th_teams_microtask ||
2655        team->t.t_level > master_th->th.th_teams_level) {
2656      /* Decrement our nested depth level */
2657      KMP_ATOMIC_DEC(&root->r.r_in_parallel);
2658    }
2659    KMP_DEBUG_ASSERT(root->r.r_in_parallel >= 0);
2660  
2661  #if OMPT_SUPPORT
2662    if (ompt_enabled.enabled) {
2663      ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2664      if (ompt_enabled.ompt_callback_implicit_task) {
2665        int flags = (team_microtask == (void *)__kmp_teams_master)
2666                        ? ompt_task_initial
2667                        : ompt_task_implicit;
2668        int ompt_team_size = (flags == ompt_task_initial) ? 0 : team->t.t_nproc;
2669        ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
2670            ompt_scope_end, NULL, &(task_info->task_data), ompt_team_size,
2671            OMPT_CUR_TASK_INFO(master_th)->thread_num, flags);
2672      }
2673      task_info->frame.exit_frame = ompt_data_none;
2674      task_info->task_data = ompt_data_none;
2675    }
2676  #endif
2677  
2678    KF_TRACE(10, ("__kmp_join_call1: T#%d, this_thread=%p team=%p\n", 0,
2679                  master_th, team));
2680    __kmp_pop_current_task_from_thread(master_th);
2681  
2682    master_th->th.th_def_allocator = team->t.t_def_allocator;
2683  
2684  #if OMPD_SUPPORT
2685    if (ompd_state & OMPD_ENABLE_BP)
2686      ompd_bp_parallel_end();
2687  #endif
2688    updateHWFPControl(team);
2689  
2690    if (root->r.r_active != master_active)
2691      root->r.r_active = master_active;
2692  
2693    __kmp_free_team(root, team USE_NESTED_HOT_ARG(
2694                              master_th)); // this will free worker threads
2695  
2696    /* This race was tricky to find. Keep the following inside the critical
2697       region; otherwise assertions may fail occasionally because the old team may
2698       be reallocated and the hierarchy appears inconsistent. It is actually safe
2699       to run and won't cause any bugs, only those assertion failures. It is just
2700       one dereference and assignment, so it might as well stay in the region. */
2701    master_th->th.th_team = parent_team;
2702    master_th->th.th_team_nproc = parent_team->t.t_nproc;
2703    master_th->th.th_team_master = parent_team->t.t_threads[0];
2704    master_th->th.th_team_serialized = parent_team->t.t_serialized;
2705  
2706    /* restore serialized team, if need be */
2707    if (parent_team->t.t_serialized &&
2708        parent_team != master_th->th.th_serial_team &&
2709        parent_team != root->r.r_root_team) {
2710      __kmp_free_team(root,
2711                      master_th->th.th_serial_team USE_NESTED_HOT_ARG(NULL));
2712      master_th->th.th_serial_team = parent_team;
2713    }
2714  
2715    if (__kmp_tasking_mode != tskm_immediate_exec) {
2716      // Restore primary thread's task state from team structure
2717      KMP_DEBUG_ASSERT(team->t.t_primary_task_state == 0 ||
2718                       team->t.t_primary_task_state == 1);
2719      master_th->th.th_task_state = (kmp_uint8)team->t.t_primary_task_state;
2720  
2721      // Copy the task team from the parent team to the primary thread
2722      master_th->th.th_task_team =
2723          parent_team->t.t_task_team[master_th->th.th_task_state];
2724      KA_TRACE(20,
2725               ("__kmp_join_call: Primary T#%d restoring task_team %p, team %p\n",
2726                __kmp_gtid_from_thread(master_th), master_th->th.th_task_team,
2727                parent_team));
2728    }
2729  
2730    // TODO: GEH - cannot do this assertion because root thread not set up as
2731    // executing
2732    // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 0 );
2733    master_th->th.th_current_task->td_flags.executing = 1;
2734  
2735    __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2736  
2737  #if KMP_AFFINITY_SUPPORTED
2738    if (master_th->th.th_team->t.t_level == 0 && __kmp_affinity.flags.reset) {
2739      __kmp_reset_root_init_mask(gtid);
2740    }
2741  #endif
2742  #if OMPT_SUPPORT
2743    int flags =
2744        OMPT_INVOKER(fork_context) |
2745        ((team_microtask == (void *)__kmp_teams_master) ? ompt_parallel_league
2746                                                        : ompt_parallel_team);
2747    if (ompt_enabled.enabled) {
2748      __kmp_join_ompt(gtid, master_th, parent_team, parallel_data, flags,
2749                      codeptr);
2750    }
2751  #endif
2752  
2753    KMP_MB();
2754    KA_TRACE(20, ("__kmp_join_call: exit T#%d\n", gtid));
2755  }
2756  
2757  /* Check whether we should push an internal control record onto the
2758     serial team stack.  If so, do it.  */
2759  void __kmp_save_internal_controls(kmp_info_t *thread) {
2760  
2761    if (thread->th.th_team != thread->th.th_serial_team) {
2762      return;
2763    }
2764    if (thread->th.th_team->t.t_serialized > 1) {
2765      int push = 0;
2766  
2767      if (thread->th.th_team->t.t_control_stack_top == NULL) {
2768        push = 1;
2769      } else {
2770        if (thread->th.th_team->t.t_control_stack_top->serial_nesting_level !=
2771            thread->th.th_team->t.t_serialized) {
2772          push = 1;
2773        }
2774      }
2775      if (push) { /* push a record on the serial team's stack */
2776        kmp_internal_control_t *control =
2777            (kmp_internal_control_t *)__kmp_allocate(
2778                sizeof(kmp_internal_control_t));
2779  
2780        copy_icvs(control, &thread->th.th_current_task->td_icvs);
2781  
2782        control->serial_nesting_level = thread->th.th_team->t.t_serialized;
2783  
2784        control->next = thread->th.th_team->t.t_control_stack_top;
2785        thread->th.th_team->t.t_control_stack_top = control;
2786      }
2787    }
2788  }
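/* Illustrative sketch, not part of the runtime: the function above pushes a
   new kmp_internal_control_t snapshot only when the thread is running on its
   serial team, the serialized nesting depth is greater than one, and the top
   of the stack does not already describe the current nesting level. A
   simplified, self-contained model of that decision (hypothetical types, not
   the real kmp structures) might look like this: */

// Hypothetical stand-in for a saved-ICV record on the serial team's stack.
struct control_record_sketch {
  int serial_nesting_level;    // nesting depth this snapshot belongs to
  control_record_sketch *next; // older snapshots
};

// Returns true when a new snapshot should be pushed, mirroring the decision
// made in __kmp_save_internal_controls.
static bool should_push_sketch(const control_record_sketch *stack_top,
                               int serialized) {
  if (serialized <= 1)
    return false; // not nested inside a serialized region: nothing to save
  // Push at most once per nesting level: only if no snapshot exists for the
  // current level yet.
  return stack_top == nullptr ||
         stack_top->serial_nesting_level != serialized;
}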
2789  
2790  /* Changes set_nproc */
2791  void __kmp_set_num_threads(int new_nth, int gtid) {
2792    kmp_info_t *thread;
2793    kmp_root_t *root;
2794  
2795    KF_TRACE(10, ("__kmp_set_num_threads: new __kmp_nth = %d\n", new_nth));
2796    KMP_DEBUG_ASSERT(__kmp_init_serial);
2797  
2798    if (new_nth < 1)
2799      new_nth = 1;
2800    else if (new_nth > __kmp_max_nth)
2801      new_nth = __kmp_max_nth;
2802  
2803    KMP_COUNT_VALUE(OMP_set_numthreads, new_nth);
2804    thread = __kmp_threads[gtid];
2805    if (thread->th.th_current_task->td_icvs.nproc == new_nth)
2806      return; // nothing to do
2807  
2808    __kmp_save_internal_controls(thread);
2809  
2810    set__nproc(thread, new_nth);
2811  
2812    // If this omp_set_num_threads() call will cause the hot team size to be
2813    // reduced (in the absence of a num_threads clause), then reduce it now,
2814    // rather than waiting for the next parallel region.
2815    root = thread->th.th_root;
2816    if (__kmp_init_parallel && (!root->r.r_active) &&
2817        (root->r.r_hot_team->t.t_nproc > new_nth)
2818  #if KMP_NESTED_HOT_TEAMS
2819        && __kmp_hot_teams_max_level && !__kmp_hot_teams_mode
2820  #endif
2821    ) {
2822      kmp_team_t *hot_team = root->r.r_hot_team;
2823      int f;
2824  
2825      __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
2826  
2827      if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
2828        __kmp_resize_dist_barrier(hot_team, hot_team->t.t_nproc, new_nth);
2829      }
2830      // Release the extra threads we don't need any more.
2831      for (f = new_nth; f < hot_team->t.t_nproc; f++) {
2832        KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL);
2833        if (__kmp_tasking_mode != tskm_immediate_exec) {
2834          // When decreasing team size, threads no longer in the team should unref
2835          // task team.
2836          hot_team->t.t_threads[f]->th.th_task_team = NULL;
2837        }
2838        __kmp_free_thread(hot_team->t.t_threads[f]);
2839        hot_team->t.t_threads[f] = NULL;
2840      }
2841      hot_team->t.t_nproc = new_nth;
2842  #if KMP_NESTED_HOT_TEAMS
2843      if (thread->th.th_hot_teams) {
2844        KMP_DEBUG_ASSERT(hot_team == thread->th.th_hot_teams[0].hot_team);
2845        thread->th.th_hot_teams[0].hot_team_nth = new_nth;
2846      }
2847  #endif
2848  
2849      if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
2850        hot_team->t.b->update_num_threads(new_nth);
2851        __kmp_add_threads_to_team(hot_team, new_nth);
2852      }
2853  
2854      __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2855  
2856      // Update the t_nproc field in the threads that are still active.
2857      for (f = 0; f < new_nth; f++) {
2858        KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL);
2859        hot_team->t.t_threads[f]->th.th_team_nproc = new_nth;
2860      }
2861      // Special flag marking that this size change came from an omp_set_num_threads() call
2862      hot_team->t.t_size_changed = -1;
2863    }
2864  }
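/* Illustrative sketch, not part of the runtime: __kmp_set_num_threads first
   clamps the requested value into [1, __kmp_max_nth] before storing it in the
   nproc ICV, and only then considers shrinking the hot team. A minimal
   standalone version of the clamping step, with a max_nth parameter standing
   in for the global __kmp_max_nth: */

// Hypothetical helper mirroring the clamp at the top of __kmp_set_num_threads.
static int clamp_num_threads_sketch(int requested, int max_nth) {
  if (requested < 1)
    return 1; // requests below 1 are raised to 1
  if (requested > max_nth)
    return max_nth; // requests above the device limit are capped
  return requested;
}
// e.g. clamp_num_threads_sketch(0, 1024) == 1
//      clamp_num_threads_sketch(4096, 1024) == 1024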
2865  
2866  /* Changes max_active_levels */
2867  void __kmp_set_max_active_levels(int gtid, int max_active_levels) {
2868    kmp_info_t *thread;
2869  
2870    KF_TRACE(10, ("__kmp_set_max_active_levels: new max_active_levels for thread "
2871                  "%d = (%d)\n",
2872                  gtid, max_active_levels));
2873    KMP_DEBUG_ASSERT(__kmp_init_serial);
2874  
2875    // validate max_active_levels
2876    if (max_active_levels < 0) {
2877      KMP_WARNING(ActiveLevelsNegative, max_active_levels);
2878      // We ignore this call if the user has specified a negative value.
2879      // The current setting won't be changed. The last valid setting will be
2880      // used. A warning will be issued (if warnings are allowed as controlled by
2881      // the KMP_WARNINGS env var).
2882      KF_TRACE(10, ("__kmp_set_max_active_levels: the call is ignored: new "
2883                    "max_active_levels for thread %d = (%d)\n",
2884                    gtid, max_active_levels));
2885      return;
2886    }
2887    if (max_active_levels <= KMP_MAX_ACTIVE_LEVELS_LIMIT) {
2888      // it's OK, the max_active_levels is within the valid range: [ 0;
2889      // KMP_MAX_ACTIVE_LEVELS_LIMIT ]
2890      // We allow a zero value. (implementation defined behavior)
2891    } else {
2892      KMP_WARNING(ActiveLevelsExceedLimit, max_active_levels,
2893                  KMP_MAX_ACTIVE_LEVELS_LIMIT);
2894      max_active_levels = KMP_MAX_ACTIVE_LEVELS_LIMIT;
2895      // Current upper limit is MAX_INT. (implementation defined behavior)
2896      // If the input exceeds the upper limit, we correct the input to be the
2897      // upper limit. (implementation defined behavior)
2898      // Actually, control should never reach here as long as the limit is MAX_INT.
2899    }
2900    KF_TRACE(10, ("__kmp_set_max_active_levels: after validation: new "
2901                  "max_active_levels for thread %d = (%d)\n",
2902                  gtid, max_active_levels));
2903  
2904    thread = __kmp_threads[gtid];
2905  
2906    __kmp_save_internal_controls(thread);
2907  
2908    set__max_active_levels(thread, max_active_levels);
2909  }
2910  
2911  /* Gets max_active_levels */
2912  int __kmp_get_max_active_levels(int gtid) {
2913    kmp_info_t *thread;
2914  
2915    KF_TRACE(10, ("__kmp_get_max_active_levels: thread %d\n", gtid));
2916    KMP_DEBUG_ASSERT(__kmp_init_serial);
2917  
2918    thread = __kmp_threads[gtid];
2919    KMP_DEBUG_ASSERT(thread->th.th_current_task);
2920    KF_TRACE(10, ("__kmp_get_max_active_levels: thread %d, curtask=%p, "
2921                  "curtask_maxaclevel=%d\n",
2922                  gtid, thread->th.th_current_task,
2923                  thread->th.th_current_task->td_icvs.max_active_levels));
2924    return thread->th.th_current_task->td_icvs.max_active_levels;
2925  }
2926  
2927  // nteams-var per-device ICV
2928  void __kmp_set_num_teams(int num_teams) {
2929    if (num_teams > 0)
2930      __kmp_nteams = num_teams;
2931  }
2932  int __kmp_get_max_teams(void) { return __kmp_nteams; }
2933  // teams-thread-limit-var per-device ICV
2934  void __kmp_set_teams_thread_limit(int limit) {
2935    if (limit > 0)
2936      __kmp_teams_thread_limit = limit;
2937  }
2938  int __kmp_get_teams_thread_limit(void) { return __kmp_teams_thread_limit; }
2939  
2940  KMP_BUILD_ASSERT(sizeof(kmp_sched_t) == sizeof(int));
2941  KMP_BUILD_ASSERT(sizeof(enum sched_type) == sizeof(int));
2942  
2943  /* Changes def_sched_var ICV values (run-time schedule kind and chunk) */
2944  void __kmp_set_schedule(int gtid, kmp_sched_t kind, int chunk) {
2945    kmp_info_t *thread;
2946    kmp_sched_t orig_kind;
2947    //    kmp_team_t *team;
2948  
2949    KF_TRACE(10, ("__kmp_set_schedule: new schedule for thread %d = (%d, %d)\n",
2950                  gtid, (int)kind, chunk));
2951    KMP_DEBUG_ASSERT(__kmp_init_serial);
2952  
2953    // Check if the kind parameter is valid, correct if needed.
2954    // Valid parameters should fit in one of two intervals - standard or extended:
2955    //       <lower>, <valid>, <upper_std>, <lower_ext>, <valid>, <upper>
2956    // 2008-01-25: 0,  1 - 4,       5,         100,     101 - 102, 103
2957    orig_kind = kind;
2958    kind = __kmp_sched_without_mods(kind);
2959  
2960    if (kind <= kmp_sched_lower || kind >= kmp_sched_upper ||
2961        (kind <= kmp_sched_lower_ext && kind >= kmp_sched_upper_std)) {
2962      // TODO: Hint needs attention in case we change the default schedule.
2963      __kmp_msg(kmp_ms_warning, KMP_MSG(ScheduleKindOutOfRange, kind),
2964                KMP_HNT(DefaultScheduleKindUsed, "static, no chunk"),
2965                __kmp_msg_null);
2966      kind = kmp_sched_default;
2967      chunk = 0; // ignore chunk value in case of bad kind
2968    }
2969  
2970    thread = __kmp_threads[gtid];
2971  
2972    __kmp_save_internal_controls(thread);
2973  
2974    if (kind < kmp_sched_upper_std) {
2975      if (kind == kmp_sched_static && chunk < KMP_DEFAULT_CHUNK) {
2976        // differentiate static chunked vs. unchunked: chunk should be invalid to
2977        // indicate an unchunked schedule (which is the default)
2978        thread->th.th_current_task->td_icvs.sched.r_sched_type = kmp_sch_static;
2979      } else {
2980        thread->th.th_current_task->td_icvs.sched.r_sched_type =
2981            __kmp_sch_map[kind - kmp_sched_lower - 1];
2982      }
2983    } else {
2984      //    __kmp_sch_map[ kind - kmp_sched_lower_ext + kmp_sched_upper_std -
2985      //    kmp_sched_lower - 2 ];
2986      thread->th.th_current_task->td_icvs.sched.r_sched_type =
2987          __kmp_sch_map[kind - kmp_sched_lower_ext + kmp_sched_upper_std -
2988                        kmp_sched_lower - 2];
2989    }
2990    __kmp_sched_apply_mods_intkind(
2991        orig_kind, &(thread->th.th_current_task->td_icvs.sched.r_sched_type));
2992    if (kind == kmp_sched_auto || chunk < 1) {
2993      // ignore parameter chunk for schedule auto
2994      thread->th.th_current_task->td_icvs.sched.chunk = KMP_DEFAULT_CHUNK;
2995    } else {
2996      thread->th.th_current_task->td_icvs.sched.chunk = chunk;
2997    }
2998  }
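/* Illustrative sketch, not part of the runtime: the range check near the top
   of __kmp_set_schedule accepts a kind only if it lies strictly inside the
   standard interval (kmp_sched_lower, kmp_sched_upper_std) or strictly inside
   the extended interval (kmp_sched_lower_ext, kmp_sched_upper). A standalone
   restatement of that predicate, with the interval bounds passed in as
   parameters: */

// Hypothetical mirror of the validity test in __kmp_set_schedule.
static bool schedule_kind_in_range_sketch(int kind, int lower, int upper_std,
                                          int lower_ext, int upper) {
  bool in_std = (kind > lower) && (kind < upper_std); // standard kinds
  bool in_ext = (kind > lower_ext) && (kind < upper); // extended kinds
  return in_std || in_ext;
}
// With the 2008-01-25 values quoted above (0, 5, 100, 103), kinds 1-4 and
// 101-102 are accepted; everything else falls back to kmp_sched_default.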
2999  
3000  /* Gets def_sched_var ICV values */
3001  void __kmp_get_schedule(int gtid, kmp_sched_t *kind, int *chunk) {
3002    kmp_info_t *thread;
3003    enum sched_type th_type;
3004  
3005    KF_TRACE(10, ("__kmp_get_schedule: thread %d\n", gtid));
3006    KMP_DEBUG_ASSERT(__kmp_init_serial);
3007  
3008    thread = __kmp_threads[gtid];
3009  
3010    th_type = thread->th.th_current_task->td_icvs.sched.r_sched_type;
3011    switch (SCHEDULE_WITHOUT_MODIFIERS(th_type)) {
3012    case kmp_sch_static:
3013    case kmp_sch_static_greedy:
3014    case kmp_sch_static_balanced:
3015      *kind = kmp_sched_static;
3016      __kmp_sched_apply_mods_stdkind(kind, th_type);
3017      *chunk = 0; // chunk was not set, try to show this fact via zero value
3018      return;
3019    case kmp_sch_static_chunked:
3020      *kind = kmp_sched_static;
3021      break;
3022    case kmp_sch_dynamic_chunked:
3023      *kind = kmp_sched_dynamic;
3024      break;
3025    case kmp_sch_guided_chunked:
3026    case kmp_sch_guided_iterative_chunked:
3027    case kmp_sch_guided_analytical_chunked:
3028      *kind = kmp_sched_guided;
3029      break;
3030    case kmp_sch_auto:
3031      *kind = kmp_sched_auto;
3032      break;
3033    case kmp_sch_trapezoidal:
3034      *kind = kmp_sched_trapezoidal;
3035      break;
3036  #if KMP_STATIC_STEAL_ENABLED
3037    case kmp_sch_static_steal:
3038      *kind = kmp_sched_static_steal;
3039      break;
3040  #endif
3041    default:
3042      KMP_FATAL(UnknownSchedulingType, th_type);
3043    }
3044  
3045    __kmp_sched_apply_mods_stdkind(kind, th_type);
3046    *chunk = thread->th.th_current_task->td_icvs.sched.chunk;
3047  }
3048  
3049  int __kmp_get_ancestor_thread_num(int gtid, int level) {
3050  
3051    int ii, dd;
3052    kmp_team_t *team;
3053    kmp_info_t *thr;
3054  
3055    KF_TRACE(10, ("__kmp_get_ancestor_thread_num: thread %d %d\n", gtid, level));
3056    KMP_DEBUG_ASSERT(__kmp_init_serial);
3057  
3058    // validate level
3059    if (level == 0)
3060      return 0;
3061    if (level < 0)
3062      return -1;
3063    thr = __kmp_threads[gtid];
3064    team = thr->th.th_team;
3065    ii = team->t.t_level;
3066    if (level > ii)
3067      return -1;
3068  
3069    if (thr->th.th_teams_microtask) {
3070      // AC: we are in a teams region where multiple nested teams have the same level
3071      int tlevel = thr->th.th_teams_level; // the level of the teams construct
3072      if (level <=
3073          tlevel) { // otherwise usual algorithm works (will not touch the teams)
3074        KMP_DEBUG_ASSERT(ii >= tlevel);
3075        // AC: As we need to pass by the teams league, we need to artificially
3076        // increase ii
3077        if (ii == tlevel) {
3078          ii += 2; // three teams have same level
3079        } else {
3080          ii++; // two teams have same level
3081        }
3082      }
3083    }
3084  
3085    if (ii == level)
3086      return __kmp_tid_from_gtid(gtid);
3087  
3088    dd = team->t.t_serialized;
3089    level++;
3090    while (ii > level) {
3091      for (dd = team->t.t_serialized; (dd > 0) && (ii > level); dd--, ii--) {
3092      }
3093      if ((team->t.t_serialized) && (!dd)) {
3094        team = team->t.t_parent;
3095        continue;
3096      }
3097      if (ii > level) {
3098        team = team->t.t_parent;
3099        dd = team->t.t_serialized;
3100        ii--;
3101      }
3102    }
3103  
3104    return (dd > 1) ? (0) : (team->t.t_master_tid);
3105  }
3106  
3107  int __kmp_get_team_size(int gtid, int level) {
3108  
3109    int ii, dd;
3110    kmp_team_t *team;
3111    kmp_info_t *thr;
3112  
3113    KF_TRACE(10, ("__kmp_get_team_size: thread %d %d\n", gtid, level));
3114    KMP_DEBUG_ASSERT(__kmp_init_serial);
3115  
3116    // validate level
3117    if (level == 0)
3118      return 1;
3119    if (level < 0)
3120      return -1;
3121    thr = __kmp_threads[gtid];
3122    team = thr->th.th_team;
3123    ii = team->t.t_level;
3124    if (level > ii)
3125      return -1;
3126  
3127    if (thr->th.th_teams_microtask) {
3128      // AC: we are in a teams region where multiple nested teams have the same level
3129      int tlevel = thr->th.th_teams_level; // the level of the teams construct
3130      if (level <=
3131          tlevel) { // otherwise usual algorithm works (will not touch the teams)
3132        KMP_DEBUG_ASSERT(ii >= tlevel);
3133        // AC: As we need to pass by the teams league, we need to artificially
3134        // increase ii
3135        if (ii == tlevel) {
3136          ii += 2; // three teams have same level
3137        } else {
3138          ii++; // two teams have same level
3139        }
3140      }
3141    }
3142  
3143    while (ii > level) {
3144      for (dd = team->t.t_serialized; (dd > 0) && (ii > level); dd--, ii--) {
3145      }
3146      if (team->t.t_serialized && (!dd)) {
3147        team = team->t.t_parent;
3148        continue;
3149      }
3150      if (ii > level) {
3151        team = team->t.t_parent;
3152        ii--;
3153      }
3154    }
3155  
3156    return team->t.t_nproc;
3157  }
3158  
3159  kmp_r_sched_t __kmp_get_schedule_global() {
3160    // This routine was created because the pairs (__kmp_sched, __kmp_chunk) and
3161    // (__kmp_static, __kmp_guided) may be changed by kmp_set_defaults
3162    // independently, so one can get the updated schedule here.
3163  
3164    kmp_r_sched_t r_sched;
3165  
3166    // create schedule from 4 globals: __kmp_sched, __kmp_chunk, __kmp_static,
3167    // __kmp_guided. __kmp_sched should keep original value, so that user can set
3168    // KMP_SCHEDULE multiple times, and thus have different run-time schedules in
3169    // different roots (even in OMP 2.5)
3170    enum sched_type s = SCHEDULE_WITHOUT_MODIFIERS(__kmp_sched);
3171    enum sched_type sched_modifiers = SCHEDULE_GET_MODIFIERS(__kmp_sched);
3172    if (s == kmp_sch_static) {
3173      // replace STATIC with more detailed schedule (balanced or greedy)
3174      r_sched.r_sched_type = __kmp_static;
3175    } else if (s == kmp_sch_guided_chunked) {
3176      // replace GUIDED with more detailed schedule (iterative or analytical)
3177      r_sched.r_sched_type = __kmp_guided;
3178    } else { // (STATIC_CHUNKED), or (DYNAMIC_CHUNKED), or other
3179      r_sched.r_sched_type = __kmp_sched;
3180    }
3181    SCHEDULE_SET_MODIFIERS(r_sched.r_sched_type, sched_modifiers);
3182  
3183    if (__kmp_chunk < KMP_DEFAULT_CHUNK) {
3184      // __kmp_chunk may be wrong here (if it was not ever set)
3185      r_sched.chunk = KMP_DEFAULT_CHUNK;
3186    } else {
3187      r_sched.chunk = __kmp_chunk;
3188    }
3189  
3190    return r_sched;
3191  }
3192  
3193  /* Allocate (realloc == FALSE) or reallocate (realloc == TRUE)
3194     at least argc *t_argv entries for the requested team. */
3195  static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team, int realloc) {
3196  
3197    KMP_DEBUG_ASSERT(team);
3198    if (!realloc || argc > team->t.t_max_argc) {
3199  
3200      KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: needed entries=%d, "
3201                     "current entries=%d\n",
3202                     team->t.t_id, argc, (realloc) ? team->t.t_max_argc : 0));
3203      /* if previously allocated heap space for args, free them */
3204      if (realloc && team->t.t_argv != &team->t.t_inline_argv[0])
3205        __kmp_free((void *)team->t.t_argv);
3206  
3207      if (argc <= KMP_INLINE_ARGV_ENTRIES) {
3208        /* use unused space in the cache line for arguments */
3209        team->t.t_max_argc = KMP_INLINE_ARGV_ENTRIES;
3210        KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: inline allocate %d "
3211                       "argv entries\n",
3212                       team->t.t_id, team->t.t_max_argc));
3213        team->t.t_argv = &team->t.t_inline_argv[0];
3214        if (__kmp_storage_map) {
3215          __kmp_print_storage_map_gtid(
3216              -1, &team->t.t_inline_argv[0],
3217              &team->t.t_inline_argv[KMP_INLINE_ARGV_ENTRIES],
3218              (sizeof(void *) * KMP_INLINE_ARGV_ENTRIES), "team_%d.t_inline_argv",
3219              team->t.t_id);
3220        }
3221      } else {
3222        /* allocate space for arguments in the heap */
3223        team->t.t_max_argc = (argc <= (KMP_MIN_MALLOC_ARGV_ENTRIES >> 1))
3224                                 ? KMP_MIN_MALLOC_ARGV_ENTRIES
3225                                 : 2 * argc;
3226        KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: dynamic allocate %d "
3227                       "argv entries\n",
3228                       team->t.t_id, team->t.t_max_argc));
3229        team->t.t_argv =
3230            (void **)__kmp_page_allocate(sizeof(void *) * team->t.t_max_argc);
3231        if (__kmp_storage_map) {
3232          __kmp_print_storage_map_gtid(-1, &team->t.t_argv[0],
3233                                       &team->t.t_argv[team->t.t_max_argc],
3234                                       sizeof(void *) * team->t.t_max_argc,
3235                                       "team_%d.t_argv", team->t.t_id);
3236        }
3237      }
3238    }
3239  }
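/* Illustrative sketch, not part of the runtime: the sizing policy above keeps
   small argument lists in the team's inline (cache-line) storage and sizes
   heap allocations from a floor value, doubling the request for larger argc.
   A simplified standalone model of how t_max_argc is chosen; the two
   constants are hypothetical stand-ins for KMP_INLINE_ARGV_ENTRIES and
   KMP_MIN_MALLOC_ARGV_ENTRIES, whose real values differ by configuration: */

static const int kInlineArgvEntriesSketch = 4;      // assumed value
static const int kMinMallocArgvEntriesSketch = 100; // assumed value

// Mirrors the capacity choice made in __kmp_alloc_argv_entries.
static int choose_argv_capacity_sketch(int argc) {
  if (argc <= kInlineArgvEntriesSketch)
    return kInlineArgvEntriesSketch; // reuse spare space in the cache line
  // Heap path: never go below the floor, otherwise double the request.
  return (argc <= (kMinMallocArgvEntriesSketch >> 1))
             ? kMinMallocArgvEntriesSketch
             : 2 * argc;
}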
3240  
3241  static void __kmp_allocate_team_arrays(kmp_team_t *team, int max_nth) {
3242    int i;
3243    int num_disp_buff = max_nth > 1 ? __kmp_dispatch_num_buffers : 2;
3244    team->t.t_threads =
3245        (kmp_info_t **)__kmp_allocate(sizeof(kmp_info_t *) * max_nth);
3246    team->t.t_disp_buffer = (dispatch_shared_info_t *)__kmp_allocate(
3247        sizeof(dispatch_shared_info_t) * num_disp_buff);
3248    team->t.t_dispatch =
3249        (kmp_disp_t *)__kmp_allocate(sizeof(kmp_disp_t) * max_nth);
3250    team->t.t_implicit_task_taskdata =
3251        (kmp_taskdata_t *)__kmp_allocate(sizeof(kmp_taskdata_t) * max_nth);
3252    team->t.t_max_nproc = max_nth;
3253  
3254    /* setup dispatch buffers */
3255    for (i = 0; i < num_disp_buff; ++i) {
3256      team->t.t_disp_buffer[i].buffer_index = i;
3257      team->t.t_disp_buffer[i].doacross_buf_idx = i;
3258    }
3259  }
3260  
3261  static void __kmp_free_team_arrays(kmp_team_t *team) {
3262    /* Note: this does not free the threads in t_threads (__kmp_free_threads) */
3263    int i;
3264    for (i = 0; i < team->t.t_max_nproc; ++i) {
3265      if (team->t.t_dispatch[i].th_disp_buffer != NULL) {
3266        __kmp_free(team->t.t_dispatch[i].th_disp_buffer);
3267        team->t.t_dispatch[i].th_disp_buffer = NULL;
3268      }
3269    }
3270  #if KMP_USE_HIER_SCHED
3271    __kmp_dispatch_free_hierarchies(team);
3272  #endif
3273    __kmp_free(team->t.t_threads);
3274    __kmp_free(team->t.t_disp_buffer);
3275    __kmp_free(team->t.t_dispatch);
3276    __kmp_free(team->t.t_implicit_task_taskdata);
3277    team->t.t_threads = NULL;
3278    team->t.t_disp_buffer = NULL;
3279    team->t.t_dispatch = NULL;
3280    team->t.t_implicit_task_taskdata = 0;
3281  }
3282  
3283  static void __kmp_reallocate_team_arrays(kmp_team_t *team, int max_nth) {
3284    kmp_info_t **oldThreads = team->t.t_threads;
3285  
3286    __kmp_free(team->t.t_disp_buffer);
3287    __kmp_free(team->t.t_dispatch);
3288    __kmp_free(team->t.t_implicit_task_taskdata);
3289    __kmp_allocate_team_arrays(team, max_nth);
3290  
3291    KMP_MEMCPY(team->t.t_threads, oldThreads,
3292               team->t.t_nproc * sizeof(kmp_info_t *));
3293  
3294    __kmp_free(oldThreads);
3295  }
3296  
3297  static kmp_internal_control_t __kmp_get_global_icvs(void) {
3298  
3299    kmp_r_sched_t r_sched =
3300        __kmp_get_schedule_global(); // get current state of scheduling globals
3301  
3302    KMP_DEBUG_ASSERT(__kmp_nested_proc_bind.used > 0);
3303  
3304    kmp_internal_control_t g_icvs = {
3305      0, // int serial_nesting_level; //corresponds to value of th_team_serialized
3306      (kmp_int8)__kmp_global.g.g_dynamic, // internal control for dynamic
3307      // adjustment of threads (per thread)
3308      (kmp_int8)__kmp_env_blocktime, // int bt_set; //internal control for
3309      // whether blocktime is explicitly set
3310      __kmp_dflt_blocktime, // int blocktime; //internal control for blocktime
3311  #if KMP_USE_MONITOR
3312      __kmp_bt_intervals, // int bt_intervals; //internal control for blocktime
3313  // intervals
3314  #endif
3315      __kmp_dflt_team_nth, // int nproc; //internal control for # of threads for
3316      // next parallel region (per thread)
3317      // (use a max ub on value if __kmp_parallel_initialize not called yet)
3318      __kmp_cg_max_nth, // int thread_limit;
3319      __kmp_task_max_nth, // int task_thread_limit; // to set the thread_limit
3320      // on task. This is used in the case of target thread_limit
3321      __kmp_dflt_max_active_levels, // int max_active_levels; //internal control
3322      // for max_active_levels
3323      r_sched, // kmp_r_sched_t sched; //internal control for runtime schedule
3324      // {sched,chunk} pair
3325      __kmp_nested_proc_bind.bind_types[0],
3326      __kmp_default_device,
3327      NULL // struct kmp_internal_control *next;
3328    };
3329  
3330    return g_icvs;
3331  }
3332  
3333  static kmp_internal_control_t __kmp_get_x_global_icvs(const kmp_team_t *team) {
3334  
3335    kmp_internal_control_t gx_icvs;
3336    gx_icvs.serial_nesting_level =
3337        0; // probably =team->t.t_serial like in save_inter_controls
3338        0; // probably = team->t.t_serialized, as in __kmp_save_internal_controls
3339    gx_icvs.next = NULL;
3340  
3341    return gx_icvs;
3342  }
3343  
3344  static void __kmp_initialize_root(kmp_root_t *root) {
3345    int f;
3346    kmp_team_t *root_team;
3347    kmp_team_t *hot_team;
3348    int hot_team_max_nth;
3349    kmp_r_sched_t r_sched =
3350        __kmp_get_schedule_global(); // get current state of scheduling globals
3351    kmp_internal_control_t r_icvs = __kmp_get_global_icvs();
3352    KMP_DEBUG_ASSERT(root);
3353    KMP_ASSERT(!root->r.r_begin);
3354  
3355    /* setup the root state structure */
3356    __kmp_init_lock(&root->r.r_begin_lock);
3357    root->r.r_begin = FALSE;
3358    root->r.r_active = FALSE;
3359    root->r.r_in_parallel = 0;
3360    root->r.r_blocktime = __kmp_dflt_blocktime;
3361  #if KMP_AFFINITY_SUPPORTED
3362    root->r.r_affinity_assigned = FALSE;
3363  #endif
3364  
3365    /* setup the root team for this task */
3366    /* allocate the root team structure */
3367    KF_TRACE(10, ("__kmp_initialize_root: before root_team\n"));
3368  
3369    root_team =
3370        __kmp_allocate_team(root,
3371                            1, // new_nproc
3372                            1, // max_nproc
3373  #if OMPT_SUPPORT
3374                            ompt_data_none, // root parallel id
3375  #endif
3376                            __kmp_nested_proc_bind.bind_types[0], &r_icvs,
3377                            0 // argc
3378                            USE_NESTED_HOT_ARG(NULL) // primary thread is unknown
3379        );
3380  #if USE_DEBUGGER
3381    // Non-NULL value should be assigned to make the debugger display the root
3382    // team.
3383    TCW_SYNC_PTR(root_team->t.t_pkfn, (microtask_t)(~0));
3384  #endif
3385  
3386    KF_TRACE(10, ("__kmp_initialize_root: after root_team = %p\n", root_team));
3387  
3388    root->r.r_root_team = root_team;
3389    root_team->t.t_control_stack_top = NULL;
3390  
3391    /* initialize root team */
3392    root_team->t.t_threads[0] = NULL;
3393    root_team->t.t_nproc = 1;
3394    root_team->t.t_serialized = 1;
3395    // TODO???: root_team->t.t_max_active_levels = __kmp_dflt_max_active_levels;
3396    root_team->t.t_sched.sched = r_sched.sched;
3397    root_team->t.t_nested_nth = &__kmp_nested_nth;
3398    KA_TRACE(
3399        20,
3400        ("__kmp_initialize_root: init root team %d arrived: join=%u, plain=%u\n",
3401         root_team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
3402  
3403    /* setup the  hot team for this task */
3404    /* allocate the hot team structure */
3405    KF_TRACE(10, ("__kmp_initialize_root: before hot_team\n"));
3406  
3407    hot_team =
3408        __kmp_allocate_team(root,
3409                            1, // new_nproc
3410                            __kmp_dflt_team_nth_ub * 2, // max_nproc
3411  #if OMPT_SUPPORT
3412                            ompt_data_none, // root parallel id
3413  #endif
3414                            __kmp_nested_proc_bind.bind_types[0], &r_icvs,
3415                            0 // argc
3416                            USE_NESTED_HOT_ARG(NULL) // primary thread is unknown
3417        );
3418    KF_TRACE(10, ("__kmp_initialize_root: after hot_team = %p\n", hot_team));
3419  
3420    root->r.r_hot_team = hot_team;
3421    root_team->t.t_control_stack_top = NULL;
3422  
3423    /* first-time initialization */
3424    hot_team->t.t_parent = root_team;
3425  
3426    /* initialize hot team */
3427    hot_team_max_nth = hot_team->t.t_max_nproc;
3428    for (f = 0; f < hot_team_max_nth; ++f) {
3429      hot_team->t.t_threads[f] = NULL;
3430    }
3431    hot_team->t.t_nproc = 1;
3432    // TODO???: hot_team->t.t_max_active_levels = __kmp_dflt_max_active_levels;
3433    hot_team->t.t_sched.sched = r_sched.sched;
3434    hot_team->t.t_size_changed = 0;
3435    hot_team->t.t_nested_nth = &__kmp_nested_nth;
3436  }
3437  
3438  #ifdef KMP_DEBUG
3439  
3440  typedef struct kmp_team_list_item {
3441    kmp_team_p const *entry;
3442    struct kmp_team_list_item *next;
3443  } kmp_team_list_item_t;
3444  typedef kmp_team_list_item_t *kmp_team_list_t;
3445  
3446  static void __kmp_print_structure_team_accum( // Add team to list of teams.
3447      kmp_team_list_t list, // List of teams.
3448      kmp_team_p const *team // Team to add.
3449  ) {
3450  
3451    // List must terminate with item where both entry and next are NULL.
3452    // Team is added to the list only once.
3453    // List is sorted in ascending order by team id.
3454    // Team id is *not* a key.
3455  
3456    kmp_team_list_t l;
3457  
3458    KMP_DEBUG_ASSERT(list != NULL);
3459    if (team == NULL) {
3460      return;
3461    }
3462  
3463    __kmp_print_structure_team_accum(list, team->t.t_parent);
3464    __kmp_print_structure_team_accum(list, team->t.t_next_pool);
3465  
3466    // Search list for the team.
3467    l = list;
3468    while (l->next != NULL && l->entry != team) {
3469      l = l->next;
3470    }
3471    if (l->next != NULL) {
3472      return; // Team has been added before, exit.
3473    }
3474  
3475    // Team is not found. Search list again for insertion point.
3476    l = list;
3477    while (l->next != NULL && l->entry->t.t_id <= team->t.t_id) {
3478      l = l->next;
3479    }
3480  
3481    // Insert team.
3482    {
3483      kmp_team_list_item_t *item = (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC(
3484          sizeof(kmp_team_list_item_t));
3485      *item = *l;
3486      l->entry = team;
3487      l->next = item;
3488    }
3489  }
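/* Illustrative sketch, not part of the runtime: the accumulator above keeps a
   singly linked list that always ends in a sentinel item (entry == NULL,
   next == NULL), inserts each team at most once, and keeps the list sorted by
   team id. Insertion copies the current node into a freshly allocated one, so
   no "previous" pointer is needed. A simplified standalone version over plain
   non-negative ids (the sentinel carries id -1): */

// Hypothetical node mirroring kmp_team_list_item_t, storing a plain id.
struct id_item_sketch {
  int id; // -1 marks the sentinel
  id_item_sketch *next;
};

// Insert a non-negative id into an ascending, sentinel-terminated list,
// skipping duplicates.
static void sorted_insert_sketch(id_item_sketch *list, int id) {
  id_item_sketch *l = list;
  while (l->next != nullptr && l->id != id)
    l = l->next;
  if (l->next != nullptr)
    return; // id already present
  // Find the insertion point: the first node whose id is greater than 'id'.
  l = list;
  while (l->next != nullptr && l->id <= id)
    l = l->next;
  id_item_sketch *item = new id_item_sketch(*l); // old node shifts down a slot
  l->id = id;     // the current node now holds the new id
  l->next = item;
}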
3490  
3491  static void __kmp_print_structure_team(char const *title, kmp_team_p const *team
3492  
3493  ) {
3494    __kmp_printf("%s", title);
3495    if (team != NULL) {
3496      __kmp_printf("%2x %p\n", team->t.t_id, team);
3497    } else {
3498      __kmp_printf(" - (nil)\n");
3499    }
3500  }
3501  
3502  static void __kmp_print_structure_thread(char const *title,
3503                                           kmp_info_p const *thread) {
3504    __kmp_printf("%s", title);
3505    if (thread != NULL) {
3506      __kmp_printf("%2d %p\n", thread->th.th_info.ds.ds_gtid, thread);
3507    } else {
3508      __kmp_printf(" - (nil)\n");
3509    }
3510  }
3511  
3512  void __kmp_print_structure(void) {
3513  
3514    kmp_team_list_t list;
3515  
3516    // Initialize list of teams.
3517    list =
3518        (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC(sizeof(kmp_team_list_item_t));
3519    list->entry = NULL;
3520    list->next = NULL;
3521  
3522    __kmp_printf("\n------------------------------\nGlobal Thread "
3523                 "Table\n------------------------------\n");
3524    {
3525      int gtid;
3526      for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3527        __kmp_printf("%2d", gtid);
3528        if (__kmp_threads != NULL) {
3529          __kmp_printf(" %p", __kmp_threads[gtid]);
3530        }
3531        if (__kmp_root != NULL) {
3532          __kmp_printf(" %p", __kmp_root[gtid]);
3533        }
3534        __kmp_printf("\n");
3535      }
3536    }
3537  
3538    // Print out __kmp_threads array.
3539    __kmp_printf("\n------------------------------\nThreads\n--------------------"
3540                 "----------\n");
3541    if (__kmp_threads != NULL) {
3542      int gtid;
3543      for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3544        kmp_info_t const *thread = __kmp_threads[gtid];
3545        if (thread != NULL) {
3546          __kmp_printf("GTID %2d %p:\n", gtid, thread);
3547          __kmp_printf("    Our Root:        %p\n", thread->th.th_root);
3548          __kmp_print_structure_team("    Our Team:     ", thread->th.th_team);
3549          __kmp_print_structure_team("    Serial Team:  ",
3550                                     thread->th.th_serial_team);
3551          __kmp_printf("    Threads:      %2d\n", thread->th.th_team_nproc);
3552          __kmp_print_structure_thread("    Primary:      ",
3553                                       thread->th.th_team_master);
3554          __kmp_printf("    Serialized?:  %2d\n", thread->th.th_team_serialized);
3555          __kmp_printf("    Set NProc:    %2d\n", thread->th.th_set_nproc);
3556          __kmp_printf("    Set Proc Bind: %2d\n", thread->th.th_set_proc_bind);
3557          __kmp_print_structure_thread("    Next in pool: ",
3558                                       thread->th.th_next_pool);
3559          __kmp_printf("\n");
3560          __kmp_print_structure_team_accum(list, thread->th.th_team);
3561          __kmp_print_structure_team_accum(list, thread->th.th_serial_team);
3562        }
3563      }
3564    } else {
3565      __kmp_printf("Threads array is not allocated.\n");
3566    }
3567  
3568    // Print out __kmp_root array.
3569    __kmp_printf("\n------------------------------\nUbers\n----------------------"
3570                 "--------\n");
3571    if (__kmp_root != NULL) {
3572      int gtid;
3573      for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3574        kmp_root_t const *root = __kmp_root[gtid];
3575        if (root != NULL) {
3576          __kmp_printf("GTID %2d %p:\n", gtid, root);
3577          __kmp_print_structure_team("    Root Team:    ", root->r.r_root_team);
3578          __kmp_print_structure_team("    Hot Team:     ", root->r.r_hot_team);
3579          __kmp_print_structure_thread("    Uber Thread:  ",
3580                                       root->r.r_uber_thread);
3581          __kmp_printf("    Active?:      %2d\n", root->r.r_active);
3582          __kmp_printf("    In Parallel:  %2d\n",
3583                       KMP_ATOMIC_LD_RLX(&root->r.r_in_parallel));
3584          __kmp_printf("\n");
3585          __kmp_print_structure_team_accum(list, root->r.r_root_team);
3586          __kmp_print_structure_team_accum(list, root->r.r_hot_team);
3587        }
3588      }
3589    } else {
3590      __kmp_printf("Ubers array is not allocated.\n");
3591    }
3592  
3593    __kmp_printf("\n------------------------------\nTeams\n----------------------"
3594                 "--------\n");
3595    while (list->next != NULL) {
3596      kmp_team_p const *team = list->entry;
3597      int i;
3598      __kmp_printf("Team %2x %p:\n", team->t.t_id, team);
3599      __kmp_print_structure_team("    Parent Team:      ", team->t.t_parent);
3600      __kmp_printf("    Primary TID:      %2d\n", team->t.t_master_tid);
3601      __kmp_printf("    Max threads:      %2d\n", team->t.t_max_nproc);
3602      __kmp_printf("    Levels of serial: %2d\n", team->t.t_serialized);
3603      __kmp_printf("    Number threads:   %2d\n", team->t.t_nproc);
3604      for (i = 0; i < team->t.t_nproc; ++i) {
3605        __kmp_printf("    Thread %2d:      ", i);
3606        __kmp_print_structure_thread("", team->t.t_threads[i]);
3607      }
3608      __kmp_print_structure_team("    Next in pool:     ", team->t.t_next_pool);
3609      __kmp_printf("\n");
3610      list = list->next;
3611    }
3612  
3613    // Print out __kmp_thread_pool and __kmp_team_pool.
3614    __kmp_printf("\n------------------------------\nPools\n----------------------"
3615                 "--------\n");
3616    __kmp_print_structure_thread("Thread pool:          ",
3617                                 CCAST(kmp_info_t *, __kmp_thread_pool));
3618    __kmp_print_structure_team("Team pool:            ",
3619                               CCAST(kmp_team_t *, __kmp_team_pool));
3620    __kmp_printf("\n");
3621  
3622    // Free team list.
3623    while (list != NULL) {
3624      kmp_team_list_item_t *item = list;
3625      list = list->next;
3626      KMP_INTERNAL_FREE(item);
3627    }
3628  }
3629  
3630  #endif
3631  
3632  //---------------------------------------------------------------------------
3633  //  Stuff for per-thread fast random number generator
3634  //  Table of primes
3635  static const unsigned __kmp_primes[] = {
3636      0x9e3779b1, 0xffe6cc59, 0x2109f6dd, 0x43977ab5, 0xba5703f5, 0xb495a877,
3637      0xe1626741, 0x79695e6b, 0xbc98c09f, 0xd5bee2b3, 0x287488f9, 0x3af18231,
3638      0x9677cd4d, 0xbe3a6929, 0xadc6a877, 0xdcf0674b, 0xbe4d6fe9, 0x5f15e201,
3639      0x99afc3fd, 0xf3f16801, 0xe222cfff, 0x24ba5fdb, 0x0620452d, 0x79f149e3,
3640      0xc8b93f49, 0x972702cd, 0xb07dd827, 0x6c97d5ed, 0x085a3d61, 0x46eb5ea7,
3641      0x3d9910ed, 0x2e687b5b, 0x29609227, 0x6eb081f1, 0x0954c4e1, 0x9d114db9,
3642      0x542acfa9, 0xb3e6bd7b, 0x0742d917, 0xe9f3ffa7, 0x54581edb, 0xf2480f45,
3643      0x0bb9288f, 0xef1affc7, 0x85fa0ca7, 0x3ccc14db, 0xe6baf34b, 0x343377f7,
3644      0x5ca19031, 0xe6d9293b, 0xf0a9f391, 0x5d2e980b, 0xfc411073, 0xc3749363,
3645      0xb892d829, 0x3549366b, 0x629750ad, 0xb98294e5, 0x892d9483, 0xc235baf3,
3646      0x3d2402a3, 0x6bdef3c9, 0xbec333cd, 0x40c9520f};
3647  
3648  //---------------------------------------------------------------------------
3649  //  __kmp_get_random: Get a random number using a linear congruential method.
3650  unsigned short __kmp_get_random(kmp_info_t *thread) {
3651    unsigned x = thread->th.th_x;
3652    unsigned short r = (unsigned short)(x >> 16);
3653  
3654    thread->th.th_x = x * thread->th.th_a + 1;
3655  
3656    KA_TRACE(30, ("__kmp_get_random: THREAD: %d, RETURN: %u\n",
3657                  thread->th.th_info.ds.ds_tid, r));
3658  
3659    return r;
3660  }
3661  //--------------------------------------------------------
3662  // __kmp_init_random: Initialize a random number generator
3663  void __kmp_init_random(kmp_info_t *thread) {
3664    unsigned seed = thread->th.th_info.ds.ds_tid;
3665  
3666    thread->th.th_a =
3667        __kmp_primes[seed % (sizeof(__kmp_primes) / sizeof(__kmp_primes[0]))];
3668    thread->th.th_x = (seed + 1) * thread->th.th_a + 1;
3669    KA_TRACE(30,
3670             ("__kmp_init_random: THREAD: %u; A: %u\n", seed, thread->th.th_a));
3671  }
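/* Illustrative sketch, not part of the runtime: the generator above is a
   32-bit linear congruential generator x_{n+1} = a * x_n + 1, where the
   per-thread multiplier a is a prime chosen from __kmp_primes by thread id,
   and each call hands back the upper 16 bits of the current state. A
   self-contained model using hypothetical names: */

// Hypothetical per-thread state mirroring th_a / th_x above.
struct fast_rng_sketch {
  unsigned a; // per-thread multiplier (a prime in the real runtime)
  unsigned x; // current 32-bit state
};

static void rng_init_sketch(fast_rng_sketch *r, unsigned seed,
                            unsigned prime) {
  r->a = prime;
  r->x = (seed + 1) * r->a + 1; // same seeding shape as __kmp_init_random
}

static unsigned short rng_next_sketch(fast_rng_sketch *r) {
  unsigned short out = (unsigned short)(r->x >> 16); // report the high bits
  r->x = r->x * r->a + 1;                            // advance the LCG state
  return out;
}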
3672  
3673  #if KMP_OS_WINDOWS
3674  /* reclaim array entries for root threads that are already dead, returns number
3675   * reclaimed */
3676  static int __kmp_reclaim_dead_roots(void) {
3677    int i, r = 0;
3678  
3679    for (i = 0; i < __kmp_threads_capacity; ++i) {
3680      if (KMP_UBER_GTID(i) &&
3681          !__kmp_still_running((kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[i])) &&
3682          !__kmp_root[i]
3683               ->r.r_active) { // AC: reclaim only roots died in non-active state
3684        r += __kmp_unregister_root_other_thread(i);
3685      }
3686    }
3687    return r;
3688  }
3689  #endif
3690  
3691  /* This function attempts to create free entries in __kmp_threads and
3692     __kmp_root, and returns the number of free entries generated.
3693  
3694     For Windows* OS static library, the first mechanism used is to reclaim array
3695     entries for root threads that are already dead.
3696  
3697     On all platforms, expansion is attempted on the arrays __kmp_threads and
3698     __kmp_root, with appropriate update to __kmp_threads_capacity. Array
3699     capacity is increased by doubling with clipping to __kmp_tp_capacity, if
3700     threadprivate cache array has been created. Synchronization with
3701     __kmpc_threadprivate_cached is done using __kmp_tp_cached_lock.
3702  
3703     After any dead root reclamation, if the clipping value allows array expansion
3704     to result in the generation of a total of nNeed free slots, the function does
3705     that expansion. If not, nothing is done beyond the possible initial root
3706     thread reclamation.
3707  
3708     If any argument is negative, the behavior is undefined. */
3709  static int __kmp_expand_threads(int nNeed) {
3710    int added = 0;
3711    int minimumRequiredCapacity;
3712    int newCapacity;
3713    kmp_info_t **newThreads;
3714    kmp_root_t **newRoot;
3715  
3716    // All calls to __kmp_expand_threads should be under __kmp_forkjoin_lock, so
3717    // resizing __kmp_threads does not need additional protection if foreign
3718    // threads are present
3719  
3720  #if KMP_OS_WINDOWS && !KMP_DYNAMIC_LIB
3721    /* only for Windows static library */
3722    /* reclaim array entries for root threads that are already dead */
3723    added = __kmp_reclaim_dead_roots();
3724  
3725    if (nNeed) {
3726      nNeed -= added;
3727      if (nNeed < 0)
3728        nNeed = 0;
3729    }
3730  #endif
3731    if (nNeed <= 0)
3732      return added;
3733  
3734    // Note that __kmp_threads_capacity is not bounded by __kmp_max_nth. If
3735    // __kmp_max_nth is set to some value less than __kmp_sys_max_nth by the
3736    // user via KMP_DEVICE_THREAD_LIMIT, then __kmp_threads_capacity may become
3737    // > __kmp_max_nth in one of two ways:
3738    //
3739    // 1) The initialization thread (gtid = 0) exits.  __kmp_threads[0]
3740    //    may not be reused by another thread, so we may need to increase
3741    //    __kmp_threads_capacity to __kmp_max_nth + 1.
3742    //
3743    // 2) New foreign root(s) are encountered.  We always register new foreign
3744    //    roots. This may cause a smaller # of threads to be allocated at
3745    //    subsequent parallel regions, but the worker threads hang around (and
3746    //    eventually go to sleep) and need slots in the __kmp_threads[] array.
3747    //
3748    // Anyway, that is the reason for moving the check to see if
3749    // __kmp_max_nth was exceeded into __kmp_reserve_threads()
3750    // instead of having it performed here. -BB
3751  
3752    KMP_DEBUG_ASSERT(__kmp_sys_max_nth >= __kmp_threads_capacity);
3753  
3754    /* compute expansion headroom to check if we can expand */
3755    if (__kmp_sys_max_nth - __kmp_threads_capacity < nNeed) {
3756      /* possible expansion too small -- give up */
3757      return added;
3758    }
3759    minimumRequiredCapacity = __kmp_threads_capacity + nNeed;
3760  
3761    newCapacity = __kmp_threads_capacity;
3762    do {
3763      newCapacity = newCapacity <= (__kmp_sys_max_nth >> 1) ? (newCapacity << 1)
3764                                                            : __kmp_sys_max_nth;
3765    } while (newCapacity < minimumRequiredCapacity);
3766    newThreads = (kmp_info_t **)__kmp_allocate(
3767        (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * newCapacity + CACHE_LINE);
3768    newRoot =
3769        (kmp_root_t **)((char *)newThreads + sizeof(kmp_info_t *) * newCapacity);
3770    KMP_MEMCPY(newThreads, __kmp_threads,
3771               __kmp_threads_capacity * sizeof(kmp_info_t *));
3772    KMP_MEMCPY(newRoot, __kmp_root,
3773               __kmp_threads_capacity * sizeof(kmp_root_t *));
3774    // Put old __kmp_threads array on a list. Any ongoing references to the old
3775    // list will be valid. This list is cleaned up at library shutdown.
3776    kmp_old_threads_list_t *node =
3777        (kmp_old_threads_list_t *)__kmp_allocate(sizeof(kmp_old_threads_list_t));
3778    node->threads = __kmp_threads;
3779    node->next = __kmp_old_threads_list;
3780    __kmp_old_threads_list = node;
3781  
3782    *(kmp_info_t * *volatile *)&__kmp_threads = newThreads;
3783    *(kmp_root_t * *volatile *)&__kmp_root = newRoot;
3784    added += newCapacity - __kmp_threads_capacity;
3785    *(volatile int *)&__kmp_threads_capacity = newCapacity;
3786  
3787    if (newCapacity > __kmp_tp_capacity) {
3788      __kmp_acquire_bootstrap_lock(&__kmp_tp_cached_lock);
3789      if (__kmp_tp_cached && newCapacity > __kmp_tp_capacity) {
3790        __kmp_threadprivate_resize_cache(newCapacity);
3791      } else { // increase __kmp_tp_capacity to correspond with kmp_threads size
3792        *(volatile int *)&__kmp_tp_capacity = newCapacity;
3793      }
3794      __kmp_release_bootstrap_lock(&__kmp_tp_cached_lock);
3795    }
3796  
3797    return added;
3798  }
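/* Illustrative sketch, not part of the runtime: the growth step above doubles
   the current capacity until it covers the requested headroom, clipping at
   the system-wide maximum, and gives up early if even the maximum cannot
   provide enough free slots. A standalone model of that calculation, with
   sys_max_nth passed in rather than read from __kmp_sys_max_nth (assumes
   capacity >= 1): */

// Hypothetical helper mirroring the capacity-doubling loop in
// __kmp_expand_threads; returns the new capacity, or the old one unchanged if
// the request cannot be satisfied within sys_max_nth.
static int grow_capacity_sketch(int capacity, int needed, int sys_max_nth) {
  if (sys_max_nth - capacity < needed)
    return capacity; // not enough headroom even at the maximum: give up
  int minimum_required = capacity + needed;
  int new_capacity = capacity;
  do {
    new_capacity = (new_capacity <= (sys_max_nth >> 1)) ? (new_capacity << 1)
                                                        : sys_max_nth;
  } while (new_capacity < minimum_required);
  return new_capacity;
}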
3799  
3800  /* Register the current thread as a root thread and obtain our gtid. We must
3801     have the __kmp_initz_lock held at this point. Argument TRUE only if are the
3802     thread that calls from __kmp_do_serial_initialize() */
3803  int __kmp_register_root(int initial_thread) {
3804    kmp_info_t *root_thread;
3805    kmp_root_t *root;
3806    int gtid;
3807    int capacity;
3808    __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
3809    KA_TRACE(20, ("__kmp_register_root: entered\n"));
3810    KMP_MB();
3811  
3812    /* 2007-03-02:
3813       If the initial thread has not invoked the OpenMP RTL yet, and this thread
3814       is not an initial one, the "__kmp_all_nth >= __kmp_threads_capacity"
3815       condition does not work as expected -- it may return false (meaning there
3816       is at least one empty slot in the __kmp_threads array), but it is possible
3817       that the only free slot is #0, which is reserved for the initial thread and
3818       so cannot be used for this one. The following code works around this bug.
3819  
3820       However, the right solution seems to be not reserving slot #0 for the
3821       initial thread, because:
3822       (1) there is no magic in slot #0,
3823       (2) we cannot detect the initial thread reliably (the first thread that
3824          performs serial initialization may not be a real initial thread).
3825    */
3826    capacity = __kmp_threads_capacity;
3827    if (!initial_thread && TCR_PTR(__kmp_threads[0]) == NULL) {
3828      --capacity;
3829    }
3830  
3831    // If it is not for initializing the hidden helper team, we need to take
3832    // __kmp_hidden_helper_threads_num out of the capacity because it is included
3833    // in __kmp_threads_capacity.
3834    if (__kmp_enable_hidden_helper && !TCR_4(__kmp_init_hidden_helper_threads)) {
3835      capacity -= __kmp_hidden_helper_threads_num;
3836    }
3837  
3838    /* see if there are too many threads */
3839    if (__kmp_all_nth >= capacity && !__kmp_expand_threads(1)) {
3840      if (__kmp_tp_cached) {
3841        __kmp_fatal(KMP_MSG(CantRegisterNewThread),
3842                    KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity),
3843                    KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null);
3844      } else {
3845        __kmp_fatal(KMP_MSG(CantRegisterNewThread), KMP_HNT(SystemLimitOnThreads),
3846                    __kmp_msg_null);
3847      }
3848    }
3849  
3850    // When hidden helper task is enabled, __kmp_threads is organized as follows:
3851    // 0: initial thread, also a regular OpenMP thread.
3852    // [1, __kmp_hidden_helper_threads_num]: slots for hidden helper threads.
3853    // [__kmp_hidden_helper_threads_num + 1, __kmp_threads_capacity): slots for
3854    // regular OpenMP threads.
3855    if (TCR_4(__kmp_init_hidden_helper_threads)) {
3856      // Find an available thread slot for hidden helper thread. Slots for hidden
3857      // helper threads start from 1 to __kmp_hidden_helper_threads_num.
3858      for (gtid = 1; TCR_PTR(__kmp_threads[gtid]) != NULL &&
3859                     gtid <= __kmp_hidden_helper_threads_num;
3860           gtid++)
3861        ;
3862      KMP_ASSERT(gtid <= __kmp_hidden_helper_threads_num);
3863      KA_TRACE(1, ("__kmp_register_root: found slot in threads array for "
3864                   "hidden helper thread: T#%d\n",
3865                   gtid));
3866    } else {
3867      /* find an available thread slot */
3868      // Don't reassign the zero slot since we need that to only be used by
3869      // initial thread. Slots for hidden helper threads should also be skipped.
3870      if (initial_thread && TCR_PTR(__kmp_threads[0]) == NULL) {
3871        gtid = 0;
3872      } else {
3873        for (gtid = __kmp_hidden_helper_threads_num + 1;
3874             TCR_PTR(__kmp_threads[gtid]) != NULL; gtid++)
3875          ;
3876      }
3877      KA_TRACE(
3878          1, ("__kmp_register_root: found slot in threads array: T#%d\n", gtid));
3879      KMP_ASSERT(gtid < __kmp_threads_capacity);
3880    }
3881  
3882    /* update global accounting */
3883    __kmp_all_nth++;
3884    TCW_4(__kmp_nth, __kmp_nth + 1);
3885  
3886    // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search) for low
3887    // numbers of procs, and method #2 (keyed API call) for higher numbers.
3888    if (__kmp_adjust_gtid_mode) {
3889      if (__kmp_all_nth >= __kmp_tls_gtid_min) {
3890        if (TCR_4(__kmp_gtid_mode) != 2) {
3891          TCW_4(__kmp_gtid_mode, 2);
3892        }
3893      } else {
3894        if (TCR_4(__kmp_gtid_mode) != 1) {
3895          TCW_4(__kmp_gtid_mode, 1);
3896        }
3897      }
3898    }
3899  
3900  #ifdef KMP_ADJUST_BLOCKTIME
3901    /* Adjust blocktime to zero if necessary            */
3902    /* Middle initialization might not have occurred yet */
3903    if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
3904      if (__kmp_nth > __kmp_avail_proc) {
3905        __kmp_zero_bt = TRUE;
3906      }
3907    }
3908  #endif /* KMP_ADJUST_BLOCKTIME */
3909  
3910    /* setup this new hierarchy */
3911    if (!(root = __kmp_root[gtid])) {
3912      root = __kmp_root[gtid] = (kmp_root_t *)__kmp_allocate(sizeof(kmp_root_t));
3913      KMP_DEBUG_ASSERT(!root->r.r_root_team);
3914    }
3915  
3916  #if KMP_STATS_ENABLED
3917    // Initialize stats as soon as possible (right after gtid assignment).
3918    __kmp_stats_thread_ptr = __kmp_stats_list->push_back(gtid);
3919    __kmp_stats_thread_ptr->startLife();
3920    KMP_SET_THREAD_STATE(SERIAL_REGION);
3921    KMP_INIT_PARTITIONED_TIMERS(OMP_serial);
3922  #endif
3923    __kmp_initialize_root(root);
3924  
3925    /* setup new root thread structure */
3926    if (root->r.r_uber_thread) {
3927      root_thread = root->r.r_uber_thread;
3928    } else {
3929      root_thread = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t));
3930      if (__kmp_storage_map) {
3931        __kmp_print_thread_storage_map(root_thread, gtid);
3932      }
3933      root_thread->th.th_info.ds.ds_gtid = gtid;
3934  #if OMPT_SUPPORT
3935      root_thread->th.ompt_thread_info.thread_data = ompt_data_none;
3936  #endif
3937      root_thread->th.th_root = root;
3938      if (__kmp_env_consistency_check) {
3939        root_thread->th.th_cons = __kmp_allocate_cons_stack(gtid);
3940      }
3941  #if USE_FAST_MEMORY
3942      __kmp_initialize_fast_memory(root_thread);
3943  #endif /* USE_FAST_MEMORY */
3944  
3945  #if KMP_USE_BGET
3946      KMP_DEBUG_ASSERT(root_thread->th.th_local.bget_data == NULL);
3947      __kmp_initialize_bget(root_thread);
3948  #endif
3949      __kmp_init_random(root_thread); // Initialize random number generator
3950    }
3951  
3952    /* setup the serial team held in reserve by the root thread */
3953    if (!root_thread->th.th_serial_team) {
3954      kmp_internal_control_t r_icvs = __kmp_get_global_icvs();
3955      KF_TRACE(10, ("__kmp_register_root: before serial_team\n"));
3956      root_thread->th.th_serial_team = __kmp_allocate_team(
3957          root, 1, 1,
3958  #if OMPT_SUPPORT
3959          ompt_data_none, // root parallel id
3960  #endif
3961          proc_bind_default, &r_icvs, 0 USE_NESTED_HOT_ARG(NULL));
3962    }
3963    KMP_ASSERT(root_thread->th.th_serial_team);
3964    KF_TRACE(10, ("__kmp_register_root: after serial_team = %p\n",
3965                  root_thread->th.th_serial_team));
3966  
3967    /* drop root_thread into place */
3968    TCW_SYNC_PTR(__kmp_threads[gtid], root_thread);
3969  
3970    root->r.r_root_team->t.t_threads[0] = root_thread;
3971    root->r.r_hot_team->t.t_threads[0] = root_thread;
3972    root_thread->th.th_serial_team->t.t_threads[0] = root_thread;
3973    // AC: the team created in reserve, not for execution (it is unused for now).
3974    root_thread->th.th_serial_team->t.t_serialized = 0;
3975    root->r.r_uber_thread = root_thread;
3976  
3977    /* initialize the thread, get it ready to go */
3978    __kmp_initialize_info(root_thread, root->r.r_root_team, 0, gtid);
3979    TCW_4(__kmp_init_gtid, TRUE);
3980  
3981    /* prepare the primary thread for get_gtid() */
3982    __kmp_gtid_set_specific(gtid);
3983  
3984  #if USE_ITT_BUILD
3985    __kmp_itt_thread_name(gtid);
3986  #endif /* USE_ITT_BUILD */
3987  
3988  #ifdef KMP_TDATA_GTID
3989    __kmp_gtid = gtid;
3990  #endif
3991    __kmp_create_worker(gtid, root_thread, __kmp_stksize);
3992    KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() == gtid);
3993  
3994    KA_TRACE(20, ("__kmp_register_root: T#%d init T#%d(%d:%d) arrived: join=%u, "
3995                  "plain=%u\n",
3996                  gtid, __kmp_gtid_from_tid(0, root->r.r_hot_team),
3997                  root->r.r_hot_team->t.t_id, 0, KMP_INIT_BARRIER_STATE,
3998                  KMP_INIT_BARRIER_STATE));
3999    { // Initialize barrier data.
4000      int b;
4001      for (b = 0; b < bs_last_barrier; ++b) {
4002        root_thread->th.th_bar[b].bb.b_arrived = KMP_INIT_BARRIER_STATE;
4003  #if USE_DEBUGGER
4004        root_thread->th.th_bar[b].bb.b_worker_arrived = 0;
4005  #endif
4006      }
4007    }
4008    KMP_DEBUG_ASSERT(root->r.r_hot_team->t.t_bar[bs_forkjoin_barrier].b_arrived ==
4009                     KMP_INIT_BARRIER_STATE);
4010  
4011  #if KMP_AFFINITY_SUPPORTED
4012    root_thread->th.th_current_place = KMP_PLACE_UNDEFINED;
4013    root_thread->th.th_new_place = KMP_PLACE_UNDEFINED;
4014    root_thread->th.th_first_place = KMP_PLACE_UNDEFINED;
4015    root_thread->th.th_last_place = KMP_PLACE_UNDEFINED;
4016  #endif /* KMP_AFFINITY_SUPPORTED */
4017    root_thread->th.th_def_allocator = __kmp_def_allocator;
4018    root_thread->th.th_prev_level = 0;
4019    root_thread->th.th_prev_num_threads = 1;
4020  
4021    kmp_cg_root_t *tmp = (kmp_cg_root_t *)__kmp_allocate(sizeof(kmp_cg_root_t));
4022    tmp->cg_root = root_thread;
4023    tmp->cg_thread_limit = __kmp_cg_max_nth;
4024    tmp->cg_nthreads = 1;
4025    KA_TRACE(100, ("__kmp_register_root: Thread %p created node %p with"
4026                   " cg_nthreads init to 1\n",
4027                   root_thread, tmp));
4028    tmp->up = NULL;
4029    root_thread->th.th_cg_roots = tmp;
4030  
4031    __kmp_root_counter++;
4032  
4033  #if OMPT_SUPPORT
4034    if (ompt_enabled.enabled) {
4035  
4036      kmp_info_t *root_thread = ompt_get_thread();
4037  
4038      ompt_set_thread_state(root_thread, ompt_state_overhead);
4039  
4040      if (ompt_enabled.ompt_callback_thread_begin) {
4041        ompt_callbacks.ompt_callback(ompt_callback_thread_begin)(
4042            ompt_thread_initial, __ompt_get_thread_data_internal());
4043      }
4044      ompt_data_t *task_data;
4045      ompt_data_t *parallel_data;
4046      __ompt_get_task_info_internal(0, NULL, &task_data, NULL, &parallel_data,
4047                                    NULL);
4048      if (ompt_enabled.ompt_callback_implicit_task) {
4049        ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
4050            ompt_scope_begin, parallel_data, task_data, 1, 1, ompt_task_initial);
4051      }
4052  
4053      ompt_set_thread_state(root_thread, ompt_state_work_serial);
4054    }
4055  #endif
4056  #if OMPD_SUPPORT
4057    if (ompd_state & OMPD_ENABLE_BP)
4058      ompd_bp_thread_begin();
4059  #endif
4060  
4061    KMP_MB();
4062    __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
4063  
4064    return gtid;
4065  }
4066  
4067  #if KMP_NESTED_HOT_TEAMS
4068  static int __kmp_free_hot_teams(kmp_root_t *root, kmp_info_t *thr, int level,
4069                                  const int max_level) {
4070    int i, n, nth;
4071    kmp_hot_team_ptr_t *hot_teams = thr->th.th_hot_teams;
4072    if (!hot_teams || !hot_teams[level].hot_team) {
4073      return 0;
4074    }
4075    KMP_DEBUG_ASSERT(level < max_level);
4076    kmp_team_t *team = hot_teams[level].hot_team;
4077    nth = hot_teams[level].hot_team_nth;
4078    n = nth - 1; // primary thread is not freed
4079    if (level < max_level - 1) {
4080      for (i = 0; i < nth; ++i) {
4081        kmp_info_t *th = team->t.t_threads[i];
4082        n += __kmp_free_hot_teams(root, th, level + 1, max_level);
4083        if (i > 0 && th->th.th_hot_teams) {
4084          __kmp_free(th->th.th_hot_teams);
4085          th->th.th_hot_teams = NULL;
4086        }
4087      }
4088    }
4089    __kmp_free_team(root, team, NULL);
4090    return n;
4091  }
4092  #endif
4093  
4094  // Resets a root thread and clears its root and hot teams.
4095  // Returns the number of __kmp_threads entries directly and indirectly freed.
4096  static int __kmp_reset_root(int gtid, kmp_root_t *root) {
4097    kmp_team_t *root_team = root->r.r_root_team;
4098    kmp_team_t *hot_team = root->r.r_hot_team;
4099    int n = hot_team->t.t_nproc;
4100    int i;
4101  
4102    KMP_DEBUG_ASSERT(!root->r.r_active);
4103  
4104    root->r.r_root_team = NULL;
4105    root->r.r_hot_team = NULL;
4106    // __kmp_free_team() does not free hot teams, so we have to clear r_hot_team
4107    // before call to __kmp_free_team().
4108    __kmp_free_team(root, root_team USE_NESTED_HOT_ARG(NULL));
4109  #if KMP_NESTED_HOT_TEAMS
4110    if (__kmp_hot_teams_max_level >
4111        0) { // need to free nested hot teams and their threads if any
4112      for (i = 0; i < hot_team->t.t_nproc; ++i) {
4113        kmp_info_t *th = hot_team->t.t_threads[i];
4114        if (__kmp_hot_teams_max_level > 1) {
4115          n += __kmp_free_hot_teams(root, th, 1, __kmp_hot_teams_max_level);
4116        }
4117        if (th->th.th_hot_teams) {
4118          __kmp_free(th->th.th_hot_teams);
4119          th->th.th_hot_teams = NULL;
4120        }
4121      }
4122    }
4123  #endif
4124    __kmp_free_team(root, hot_team USE_NESTED_HOT_ARG(NULL));
4125  
4126    // Before we can reap the thread, we need to make certain that all other
4127    // threads in the teams that had this root as ancestor have stopped trying to
4128    // steal tasks.
4129    if (__kmp_tasking_mode != tskm_immediate_exec) {
4130      __kmp_wait_to_unref_task_teams();
4131    }
4132  
4133  #if KMP_OS_WINDOWS
4134    /* Close Handle of root duplicated in __kmp_create_worker (tr #62919) */
4135    KA_TRACE(
4136        10, ("__kmp_reset_root: free handle, th = %p, handle = %" KMP_UINTPTR_SPEC
4137             "\n",
4138             (LPVOID) & (root->r.r_uber_thread->th),
4139             root->r.r_uber_thread->th.th_info.ds.ds_thread));
4140    __kmp_free_handle(root->r.r_uber_thread->th.th_info.ds.ds_thread);
4141  #endif /* KMP_OS_WINDOWS */
4142  
4143  #if OMPD_SUPPORT
4144    if (ompd_state & OMPD_ENABLE_BP)
4145      ompd_bp_thread_end();
4146  #endif
4147  
4148  #if OMPT_SUPPORT
4149    ompt_data_t *task_data;
4150    ompt_data_t *parallel_data;
4151    __ompt_get_task_info_internal(0, NULL, &task_data, NULL, &parallel_data,
4152                                  NULL);
4153    if (ompt_enabled.ompt_callback_implicit_task) {
4154      ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
4155          ompt_scope_end, parallel_data, task_data, 0, 1, ompt_task_initial);
4156    }
4157    if (ompt_enabled.ompt_callback_thread_end) {
4158      ompt_callbacks.ompt_callback(ompt_callback_thread_end)(
4159          &(root->r.r_uber_thread->th.ompt_thread_info.thread_data));
4160    }
4161  #endif
4162  
4163    TCW_4(__kmp_nth,
4164          __kmp_nth - 1); // __kmp_reap_thread will decrement __kmp_all_nth.
4165    i = root->r.r_uber_thread->th.th_cg_roots->cg_nthreads--;
4166    KA_TRACE(100, ("__kmp_reset_root: Thread %p decrement cg_nthreads on node %p"
4167                   " to %d\n",
4168                   root->r.r_uber_thread, root->r.r_uber_thread->th.th_cg_roots,
4169                   root->r.r_uber_thread->th.th_cg_roots->cg_nthreads));
4170    if (i == 1) {
4171      // need to free contention group structure
4172      KMP_DEBUG_ASSERT(root->r.r_uber_thread ==
4173                       root->r.r_uber_thread->th.th_cg_roots->cg_root);
4174      KMP_DEBUG_ASSERT(root->r.r_uber_thread->th.th_cg_roots->up == NULL);
4175      __kmp_free(root->r.r_uber_thread->th.th_cg_roots);
4176      root->r.r_uber_thread->th.th_cg_roots = NULL;
4177    }
4178    __kmp_reap_thread(root->r.r_uber_thread, 1);
4179  
4180    // We cannot put the root thread into __kmp_thread_pool, so we have to reap
4181    // it instead of freeing it.
4182    root->r.r_uber_thread = NULL;
4183    /* mark root as no longer in use */
4184    root->r.r_begin = FALSE;
4185  
4186    return n;
4187  }
4188  
4189  void __kmp_unregister_root_current_thread(int gtid) {
4190    KA_TRACE(1, ("__kmp_unregister_root_current_thread: enter T#%d\n", gtid));
4191    /* This lock should be OK, since unregister_root_current_thread is never
4192       called during an abort, only during a normal close. Furthermore, if you
4193       hold the forkjoin lock, you should never try to get the initz lock. */
4194    __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
4195    if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
4196      KC_TRACE(10, ("__kmp_unregister_root_current_thread: already finished, "
4197                    "exiting T#%d\n",
4198                    gtid));
4199      __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
4200      return;
4201    }
4202    kmp_root_t *root = __kmp_root[gtid];
4203  
4204    KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
4205    KMP_ASSERT(KMP_UBER_GTID(gtid));
4206    KMP_ASSERT(root == __kmp_threads[gtid]->th.th_root);
4207    KMP_ASSERT(root->r.r_active == FALSE);
4208  
4209    KMP_MB();
4210  
4211    kmp_info_t *thread = __kmp_threads[gtid];
4212    kmp_team_t *team = thread->th.th_team;
4213    kmp_task_team_t *task_team = thread->th.th_task_team;
4214  
4215    // we need to wait for the proxy tasks before finishing the thread
4216    if (task_team != NULL && (task_team->tt.tt_found_proxy_tasks ||
4217                              task_team->tt.tt_hidden_helper_task_encountered)) {
4218  #if OMPT_SUPPORT
4219      // the runtime is shutting down so we won't report any events
4220      thread->th.ompt_thread_info.state = ompt_state_undefined;
4221  #endif
4222      __kmp_task_team_wait(thread, team USE_ITT_BUILD_ARG(NULL));
4223    }
4224  
4225    __kmp_reset_root(gtid, root);
4226  
4227    KMP_MB();
4228    KC_TRACE(10,
4229             ("__kmp_unregister_root_current_thread: T#%d unregistered\n", gtid));
4230  
4231    __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
4232  }
4233  
4234  #if KMP_OS_WINDOWS
4235  /* __kmp_forkjoin_lock must already be held.
4236     Unregisters a root thread that is not the current thread.  Returns the number
4237     of __kmp_threads entries freed as a result. */
4238  static int __kmp_unregister_root_other_thread(int gtid) {
4239    kmp_root_t *root = __kmp_root[gtid];
4240    int r;
4241  
4242    KA_TRACE(1, ("__kmp_unregister_root_other_thread: enter T#%d\n", gtid));
4243    KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
4244    KMP_ASSERT(KMP_UBER_GTID(gtid));
4245    KMP_ASSERT(root == __kmp_threads[gtid]->th.th_root);
4246    KMP_ASSERT(root->r.r_active == FALSE);
4247  
4248    r = __kmp_reset_root(gtid, root);
4249    KC_TRACE(10,
4250             ("__kmp_unregister_root_other_thread: T#%d unregistered\n", gtid));
4251    return r;
4252  }
4253  #endif
4254  
4255  #if KMP_DEBUG
4256  void __kmp_task_info() {
4257  
4258    kmp_int32 gtid = __kmp_entry_gtid();
4259    kmp_int32 tid = __kmp_tid_from_gtid(gtid);
4260    kmp_info_t *this_thr = __kmp_threads[gtid];
4261    kmp_team_t *steam = this_thr->th.th_serial_team;
4262    kmp_team_t *team = this_thr->th.th_team;
4263  
4264    __kmp_printf(
4265        "__kmp_task_info: gtid=%d tid=%d t_thread=%p team=%p steam=%p curtask=%p "
4266        "ptask=%p\n",
4267        gtid, tid, this_thr, team, steam, this_thr->th.th_current_task,
4268        team->t.t_implicit_task_taskdata[tid].td_parent);
4269  }
4270  #endif // KMP_DEBUG
4271  
4272  /* TODO optimize with one big memclr, take out what isn't needed, split
4273     responsibility to workers as much as possible, and delay initialization of
4274     features as much as possible  */
4275  static void __kmp_initialize_info(kmp_info_t *this_thr, kmp_team_t *team,
4276                                    int tid, int gtid) {
4277    /* this_thr->th.th_info.ds.ds_gtid is setup in
4278       kmp_allocate_thread/create_worker.
4279       this_thr->th.th_serial_team is setup in __kmp_allocate_thread */
4280    KMP_DEBUG_ASSERT(this_thr != NULL);
4281    KMP_DEBUG_ASSERT(this_thr->th.th_serial_team);
4282    KMP_DEBUG_ASSERT(team);
4283    KMP_DEBUG_ASSERT(team->t.t_threads);
4284    KMP_DEBUG_ASSERT(team->t.t_dispatch);
4285    kmp_info_t *master = team->t.t_threads[0];
4286    KMP_DEBUG_ASSERT(master);
4287    KMP_DEBUG_ASSERT(master->th.th_root);
4288  
4289    KMP_MB();
4290  
4291    TCW_SYNC_PTR(this_thr->th.th_team, team);
4292  
4293    this_thr->th.th_info.ds.ds_tid = tid;
4294    this_thr->th.th_set_nproc = 0;
4295    if (__kmp_tasking_mode != tskm_immediate_exec)
4296      // When tasking is possible, threads are not safe to reap until they are
4297      // done tasking; this will be set when tasking code is exited in wait
4298      this_thr->th.th_reap_state = KMP_NOT_SAFE_TO_REAP;
4299    else // no tasking --> always safe to reap
4300      this_thr->th.th_reap_state = KMP_SAFE_TO_REAP;
4301    this_thr->th.th_set_proc_bind = proc_bind_default;
4302  
4303  #if KMP_AFFINITY_SUPPORTED
4304    this_thr->th.th_new_place = this_thr->th.th_current_place;
4305  #endif
4306    this_thr->th.th_root = master->th.th_root;
4307  
4308    /* setup the thread's cache of the team structure */
4309    this_thr->th.th_team_nproc = team->t.t_nproc;
4310    this_thr->th.th_team_master = master;
4311    this_thr->th.th_team_serialized = team->t.t_serialized;
4312  
4313    KMP_DEBUG_ASSERT(team->t.t_implicit_task_taskdata);
4314  
4315    KF_TRACE(10, ("__kmp_initialize_info1: T#%d:%d this_thread=%p curtask=%p\n",
4316                  tid, gtid, this_thr, this_thr->th.th_current_task));
4317  
4318    __kmp_init_implicit_task(this_thr->th.th_team_master->th.th_ident, this_thr,
4319                             team, tid, TRUE);
4320  
4321    KF_TRACE(10, ("__kmp_initialize_info2: T#%d:%d this_thread=%p curtask=%p\n",
4322                  tid, gtid, this_thr, this_thr->th.th_current_task));
4323    // TODO: Initialize ICVs from parent; GEH - isn't that already done in
4324    // __kmp_initialize_team()?
4325  
4326    /* TODO no worksharing in speculative threads */
4327    this_thr->th.th_dispatch = &team->t.t_dispatch[tid];
4328  
4329    this_thr->th.th_local.this_construct = 0;
4330  
4331    if (!this_thr->th.th_pri_common) {
4332      this_thr->th.th_pri_common =
4333          (struct common_table *)__kmp_allocate(sizeof(struct common_table));
4334      if (__kmp_storage_map) {
4335        __kmp_print_storage_map_gtid(
4336            gtid, this_thr->th.th_pri_common, this_thr->th.th_pri_common + 1,
4337            sizeof(struct common_table), "th_%d.th_pri_common\n", gtid);
4338      }
4339      this_thr->th.th_pri_head = NULL;
4340    }
4341  
4342    if (this_thr != master && // Primary thread's CG root is initialized elsewhere
4343        this_thr->th.th_cg_roots != master->th.th_cg_roots) { // CG root not set
4344      // Make new thread's CG root same as primary thread's
4345      KMP_DEBUG_ASSERT(master->th.th_cg_roots);
4346      kmp_cg_root_t *tmp = this_thr->th.th_cg_roots;
4347      if (tmp) {
4348        // worker changes CG, need to check if old CG should be freed
4349        int i = tmp->cg_nthreads--;
4350        KA_TRACE(100, ("__kmp_initialize_info: Thread %p decrement cg_nthreads"
4351                       " on node %p of thread %p to %d\n",
4352                       this_thr, tmp, tmp->cg_root, tmp->cg_nthreads));
4353        if (i == 1) {
4354          __kmp_free(tmp); // last thread left CG --> free it
4355        }
4356      }
4357      this_thr->th.th_cg_roots = master->th.th_cg_roots;
4358      // Increment new thread's CG root's counter to add the new thread
4359      this_thr->th.th_cg_roots->cg_nthreads++;
4360      KA_TRACE(100, ("__kmp_initialize_info: Thread %p increment cg_nthreads on"
4361                     " node %p of thread %p to %d\n",
4362                     this_thr, this_thr->th.th_cg_roots,
4363                     this_thr->th.th_cg_roots->cg_root,
4364                     this_thr->th.th_cg_roots->cg_nthreads));
4365      this_thr->th.th_current_task->td_icvs.thread_limit =
4366          this_thr->th.th_cg_roots->cg_thread_limit;
4367    }
4368  
4369    /* Initialize dynamic dispatch */
4370    {
4371      volatile kmp_disp_t *dispatch = this_thr->th.th_dispatch;
4372      // Use team max_nproc since this will never change for the team.
4373      size_t disp_size =
4374          sizeof(dispatch_private_info_t) *
4375          (team->t.t_max_nproc == 1 ? 1 : __kmp_dispatch_num_buffers);
4376      KD_TRACE(10, ("__kmp_initialize_info: T#%d max_nproc: %d\n", gtid,
4377                    team->t.t_max_nproc));
4378      KMP_ASSERT(dispatch);
4379      KMP_DEBUG_ASSERT(team->t.t_dispatch);
4380      KMP_DEBUG_ASSERT(dispatch == &team->t.t_dispatch[tid]);
4381  
4382      dispatch->th_disp_index = 0;
4383      dispatch->th_doacross_buf_idx = 0;
4384      if (!dispatch->th_disp_buffer) {
4385        dispatch->th_disp_buffer =
4386            (dispatch_private_info_t *)__kmp_allocate(disp_size);
4387  
4388        if (__kmp_storage_map) {
4389          __kmp_print_storage_map_gtid(
4390              gtid, &dispatch->th_disp_buffer[0],
4391              &dispatch->th_disp_buffer[team->t.t_max_nproc == 1
4392                                            ? 1
4393                                            : __kmp_dispatch_num_buffers],
4394              disp_size,
4395              "th_%d.th_dispatch.th_disp_buffer "
4396              "(team_%d.t_dispatch[%d].th_disp_buffer)",
4397              gtid, team->t.t_id, gtid);
4398        }
4399      } else {
4400        memset(&dispatch->th_disp_buffer[0], '\0', disp_size);
4401      }
4402  
4403      dispatch->th_dispatch_pr_current = 0;
4404      dispatch->th_dispatch_sh_current = 0;
4405  
4406      dispatch->th_deo_fcn = 0; /* ORDERED     */
4407      dispatch->th_dxo_fcn = 0; /* END ORDERED */
4408    }
4409  
4410    this_thr->th.th_next_pool = NULL;
4411  
4412    KMP_DEBUG_ASSERT(!this_thr->th.th_spin_here);
4413    KMP_DEBUG_ASSERT(this_thr->th.th_next_waiting == 0);
4414  
4415    KMP_MB();
4416  }
4417  
4418  /* Allocate a new thread for the requesting team. This is only called from
4419     within a forkjoin critical section. We first try to get an available
4420     thread from the thread pool; if none is available, we fork a new one,
4421     assuming we are able to create one. This should be assured, as the
4422     caller is expected to have checked for that first. */
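/* Editorial sketch of the expected call shape (assumed from the contract
   described above; the caller must already have verified capacity, e.g. via
   __kmp_expand_threads()):

     kmp_info_t *thr = __kmp_allocate_thread(root, team, f); // tid f in team
     team->t.t_threads[f] = thr; // caller installs the thread into its slot
*/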
4423  kmp_info_t *__kmp_allocate_thread(kmp_root_t *root, kmp_team_t *team,
4424                                    int new_tid) {
4425    kmp_team_t *serial_team;
4426    kmp_info_t *new_thr;
4427    int new_gtid;
4428  
4429    KA_TRACE(20, ("__kmp_allocate_thread: T#%d\n", __kmp_get_gtid()));
4430    KMP_DEBUG_ASSERT(root && team);
4431  #if !KMP_NESTED_HOT_TEAMS
4432    KMP_DEBUG_ASSERT(KMP_MASTER_GTID(__kmp_get_gtid()));
4433  #endif
4434    KMP_MB();
4435  
4436    /* first, try to get one from the thread pool unless allocating thread is
4437     * the main hidden helper thread. The hidden helper team should always
4438     * allocate new OS threads. */
4439    if (__kmp_thread_pool && !KMP_HIDDEN_HELPER_TEAM(team)) {
4440      new_thr = CCAST(kmp_info_t *, __kmp_thread_pool);
4441      __kmp_thread_pool = (volatile kmp_info_t *)new_thr->th.th_next_pool;
4442      if (new_thr == __kmp_thread_pool_insert_pt) {
4443        __kmp_thread_pool_insert_pt = NULL;
4444      }
4445      TCW_4(new_thr->th.th_in_pool, FALSE);
4446      __kmp_suspend_initialize_thread(new_thr);
4447      __kmp_lock_suspend_mx(new_thr);
4448      if (new_thr->th.th_active_in_pool == TRUE) {
4449        KMP_DEBUG_ASSERT(new_thr->th.th_active == TRUE);
4450        KMP_ATOMIC_DEC(&__kmp_thread_pool_active_nth);
4451        new_thr->th.th_active_in_pool = FALSE;
4452      }
4453      __kmp_unlock_suspend_mx(new_thr);
4454  
4455      KA_TRACE(20, ("__kmp_allocate_thread: T#%d using thread T#%d\n",
4456                    __kmp_get_gtid(), new_thr->th.th_info.ds.ds_gtid));
4457      KMP_ASSERT(!new_thr->th.th_team);
4458      KMP_DEBUG_ASSERT(__kmp_nth < __kmp_threads_capacity);
4459  
4460      /* setup the thread structure */
4461      __kmp_initialize_info(new_thr, team, new_tid,
4462                            new_thr->th.th_info.ds.ds_gtid);
4463      KMP_DEBUG_ASSERT(new_thr->th.th_serial_team);
4464  
4465      TCW_4(__kmp_nth, __kmp_nth + 1);
4466  
4467      new_thr->th.th_task_state = 0;
4468  
4469      if (__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
4470        // Make sure pool thread has transitioned to waiting on own thread struct
4471        KMP_DEBUG_ASSERT(new_thr->th.th_used_in_team.load() == 0);
4472        // Thread activated in __kmp_allocate_team when increasing team size
4473      }
4474  
4475  #ifdef KMP_ADJUST_BLOCKTIME
4476      /* Adjust blocktime back to zero if necessary */
4477      /* Middle initialization might not have occurred yet */
4478      if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
4479        if (__kmp_nth > __kmp_avail_proc) {
4480          __kmp_zero_bt = TRUE;
4481        }
4482      }
4483  #endif /* KMP_ADJUST_BLOCKTIME */
4484  
4485  #if KMP_DEBUG
4486      // If the thread entered the pool via __kmp_free_thread, wait_flag should
4487      // not equal KMP_BARRIER_PARENT_FLAG.
4488      int b;
4489      kmp_balign_t *balign = new_thr->th.th_bar;
4490      for (b = 0; b < bs_last_barrier; ++b)
4491        KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
4492  #endif
4493  
4494      KF_TRACE(10, ("__kmp_allocate_thread: T#%d using thread %p T#%d\n",
4495                    __kmp_get_gtid(), new_thr, new_thr->th.th_info.ds.ds_gtid));
4496  
4497      KMP_MB();
4498      return new_thr;
4499    }
4500  
4501    /* no, we'll fork a new one */
4502    KMP_ASSERT(KMP_HIDDEN_HELPER_TEAM(team) || __kmp_nth == __kmp_all_nth);
4503    KMP_ASSERT(__kmp_all_nth < __kmp_threads_capacity);
4504  
4505  #if KMP_USE_MONITOR
4506    // If this is the first worker thread the RTL is creating, then also
4507    // launch the monitor thread.  We try to do this as early as possible.
4508    if (!TCR_4(__kmp_init_monitor)) {
4509      __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
4510      if (!TCR_4(__kmp_init_monitor)) {
4511        KF_TRACE(10, ("before __kmp_create_monitor\n"));
4512        TCW_4(__kmp_init_monitor, 1);
4513        __kmp_create_monitor(&__kmp_monitor);
4514        KF_TRACE(10, ("after __kmp_create_monitor\n"));
4515  #if KMP_OS_WINDOWS
4516        // AC: wait until the monitor has started. This is a fix for CQ232808.
4517        // The reason is that if the library is loaded/unloaded in a loop with
4518        // small (parallel) work in between, then there is a high probability
4519        // that the monitor thread starts only after the library shutdown. At
4520        // shutdown it is too late to cope with the problem, because when the
4521        // primary thread is in DllMain (process detach) the monitor has no
4522        // chance to start (it is blocked), and the primary thread has no means
4523        // to inform the monitor that the library has gone, because all the
4524        // memory the monitor can access is going to be released/reset.
4525        while (TCR_4(__kmp_init_monitor) < 2) {
4526          KMP_YIELD(TRUE);
4527        }
4528        KF_TRACE(10, ("after monitor thread has started\n"));
4529  #endif
4530      }
4531      __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
4532    }
4533  #endif
4534  
4535    KMP_MB();
4536  
4537    {
4538      int new_start_gtid = TCR_4(__kmp_init_hidden_helper_threads)
4539                               ? 1
4540                               : __kmp_hidden_helper_threads_num + 1;
4541  
4542      for (new_gtid = new_start_gtid; TCR_PTR(__kmp_threads[new_gtid]) != NULL;
4543           ++new_gtid) {
4544        KMP_DEBUG_ASSERT(new_gtid < __kmp_threads_capacity);
4545      }
4546  
4547      if (TCR_4(__kmp_init_hidden_helper_threads)) {
4548        KMP_DEBUG_ASSERT(new_gtid <= __kmp_hidden_helper_threads_num);
4549      }
4550    }
4551  
4552    /* allocate space for it. */
4553    new_thr = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t));
4554  
4555    new_thr->th.th_nt_strict = false;
4556    new_thr->th.th_nt_loc = NULL;
4557    new_thr->th.th_nt_sev = severity_fatal;
4558    new_thr->th.th_nt_msg = NULL;
4559  
4560    TCW_SYNC_PTR(__kmp_threads[new_gtid], new_thr);
4561  
4562  #if USE_ITT_BUILD && USE_ITT_NOTIFY && KMP_DEBUG
4563    // Suppress race condition detection on synchronization flags in debug mode;
4564    // this helps to analyze library internals by eliminating false positives.
4565    __itt_suppress_mark_range(
4566        __itt_suppress_range, __itt_suppress_threading_errors,
4567        &new_thr->th.th_sleep_loc, sizeof(new_thr->th.th_sleep_loc));
4568    __itt_suppress_mark_range(
4569        __itt_suppress_range, __itt_suppress_threading_errors,
4570        &new_thr->th.th_reap_state, sizeof(new_thr->th.th_reap_state));
4571  #if KMP_OS_WINDOWS
4572    __itt_suppress_mark_range(
4573        __itt_suppress_range, __itt_suppress_threading_errors,
4574        &new_thr->th.th_suspend_init, sizeof(new_thr->th.th_suspend_init));
4575  #else
4576    __itt_suppress_mark_range(__itt_suppress_range,
4577                              __itt_suppress_threading_errors,
4578                              &new_thr->th.th_suspend_init_count,
4579                              sizeof(new_thr->th.th_suspend_init_count));
4580  #endif
4581    // TODO: check if we need to also suppress b_arrived flags
4582    __itt_suppress_mark_range(__itt_suppress_range,
4583                              __itt_suppress_threading_errors,
4584                              CCAST(kmp_uint64 *, &new_thr->th.th_bar[0].bb.b_go),
4585                              sizeof(new_thr->th.th_bar[0].bb.b_go));
4586    __itt_suppress_mark_range(__itt_suppress_range,
4587                              __itt_suppress_threading_errors,
4588                              CCAST(kmp_uint64 *, &new_thr->th.th_bar[1].bb.b_go),
4589                              sizeof(new_thr->th.th_bar[1].bb.b_go));
4590    __itt_suppress_mark_range(__itt_suppress_range,
4591                              __itt_suppress_threading_errors,
4592                              CCAST(kmp_uint64 *, &new_thr->th.th_bar[2].bb.b_go),
4593                              sizeof(new_thr->th.th_bar[2].bb.b_go));
4594  #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY && KMP_DEBUG */
4595    if (__kmp_storage_map) {
4596      __kmp_print_thread_storage_map(new_thr, new_gtid);
4597    }
4598  
4599    // add the reserve serialized team, initialized from the team's primary thread
4600    {
4601      kmp_internal_control_t r_icvs = __kmp_get_x_global_icvs(team);
4602      KF_TRACE(10, ("__kmp_allocate_thread: before th_serial/serial_team\n"));
4603      new_thr->th.th_serial_team = serial_team =
4604          (kmp_team_t *)__kmp_allocate_team(root, 1, 1,
4605  #if OMPT_SUPPORT
4606                                            ompt_data_none, // root parallel id
4607  #endif
4608                                            proc_bind_default, &r_icvs,
4609                                            0 USE_NESTED_HOT_ARG(NULL));
4610    }
4611    KMP_ASSERT(serial_team);
4612    serial_team->t.t_serialized = 0; // AC: the team created in reserve, not for
4613    // execution (it is unused for now).
4614    serial_team->t.t_threads[0] = new_thr;
4615    KF_TRACE(10,
4616             ("__kmp_allocate_thread: after th_serial/serial_team : new_thr=%p\n",
4617              new_thr));
4618  
4619    /* setup the thread structures */
4620    __kmp_initialize_info(new_thr, team, new_tid, new_gtid);
4621  
4622  #if USE_FAST_MEMORY
4623    __kmp_initialize_fast_memory(new_thr);
4624  #endif /* USE_FAST_MEMORY */
4625  
4626  #if KMP_USE_BGET
4627    KMP_DEBUG_ASSERT(new_thr->th.th_local.bget_data == NULL);
4628    __kmp_initialize_bget(new_thr);
4629  #endif
4630  
4631    __kmp_init_random(new_thr); // Initialize random number generator
4632  
4633    /* Initialize these only once when thread is grabbed for a team allocation */
4634    KA_TRACE(20,
4635             ("__kmp_allocate_thread: T#%d init go fork=%u, plain=%u\n",
4636              __kmp_get_gtid(), KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
4637  
4638    int b;
4639    kmp_balign_t *balign = new_thr->th.th_bar;
4640    for (b = 0; b < bs_last_barrier; ++b) {
4641      balign[b].bb.b_go = KMP_INIT_BARRIER_STATE;
4642      balign[b].bb.team = NULL;
4643      balign[b].bb.wait_flag = KMP_BARRIER_NOT_WAITING;
4644      balign[b].bb.use_oncore_barrier = 0;
4645    }
4646  
4647    TCW_PTR(new_thr->th.th_sleep_loc, NULL);
4648    new_thr->th.th_sleep_loc_type = flag_unset;
4649  
4650    new_thr->th.th_spin_here = FALSE;
4651    new_thr->th.th_next_waiting = 0;
4652  #if KMP_OS_UNIX
4653    new_thr->th.th_blocking = false;
4654  #endif
4655  
4656  #if KMP_AFFINITY_SUPPORTED
4657    new_thr->th.th_current_place = KMP_PLACE_UNDEFINED;
4658    new_thr->th.th_new_place = KMP_PLACE_UNDEFINED;
4659    new_thr->th.th_first_place = KMP_PLACE_UNDEFINED;
4660    new_thr->th.th_last_place = KMP_PLACE_UNDEFINED;
4661  #endif
4662    new_thr->th.th_def_allocator = __kmp_def_allocator;
4663    new_thr->th.th_prev_level = 0;
4664    new_thr->th.th_prev_num_threads = 1;
4665  
4666    TCW_4(new_thr->th.th_in_pool, FALSE);
4667    new_thr->th.th_active_in_pool = FALSE;
4668    TCW_4(new_thr->th.th_active, TRUE);
4669  
4670    new_thr->th.th_set_nested_nth = NULL;
4671    new_thr->th.th_set_nested_nth_sz = 0;
4672  
4673    /* adjust the global counters */
4674    __kmp_all_nth++;
4675    __kmp_nth++;
4676  
4677    // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search) for low
4678    // numbers of procs, and method #2 (keyed API call) for higher numbers.
4679    if (__kmp_adjust_gtid_mode) {
4680      if (__kmp_all_nth >= __kmp_tls_gtid_min) {
4681        if (TCR_4(__kmp_gtid_mode) != 2) {
4682          TCW_4(__kmp_gtid_mode, 2);
4683        }
4684      } else {
4685        if (TCR_4(__kmp_gtid_mode) != 1) {
4686          TCW_4(__kmp_gtid_mode, 1);
4687        }
4688      }
4689    }
4690  
4691  #ifdef KMP_ADJUST_BLOCKTIME
4692    /* Adjust blocktime back to zero if necessary       */
4693    /* Middle initialization might not have occurred yet */
4694    if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
4695      if (__kmp_nth > __kmp_avail_proc) {
4696        __kmp_zero_bt = TRUE;
4697      }
4698    }
4699  #endif /* KMP_ADJUST_BLOCKTIME */
4700  
4701  #if KMP_AFFINITY_SUPPORTED
4702    // Set the affinity and topology information for new thread
4703    __kmp_affinity_set_init_mask(new_gtid, /*isa_root=*/FALSE);
4704  #endif
4705  
4706    /* actually fork it and create the new worker thread */
4707    KF_TRACE(
4708        10, ("__kmp_allocate_thread: before __kmp_create_worker: %p\n", new_thr));
4709    __kmp_create_worker(new_gtid, new_thr, __kmp_stksize);
4710    KF_TRACE(10,
4711             ("__kmp_allocate_thread: after __kmp_create_worker: %p\n", new_thr));
4712  
4713    KA_TRACE(20, ("__kmp_allocate_thread: T#%d forked T#%d\n", __kmp_get_gtid(),
4714                  new_gtid));
4715    KMP_MB();
4716    return new_thr;
4717  }
4718  
4719  /* Reinitialize team for reuse.
4720     The hot team code calls this at every fork barrier, so the EPCC barrier
4721     tests are extremely sensitive to changes in it, especially writes to the
4722     team struct, which cause a cache invalidation in all threads.
4723     IF YOU TOUCH THIS ROUTINE, RUN EPCC C SYNCBENCH ON A BIG-IRON MACHINE!!! */
4724  static void __kmp_reinitialize_team(kmp_team_t *team,
4725                                      kmp_internal_control_t *new_icvs,
4726                                      ident_t *loc) {
4727    KF_TRACE(10, ("__kmp_reinitialize_team: enter this_thread=%p team=%p\n",
4728                  team->t.t_threads[0], team));
4729    KMP_DEBUG_ASSERT(team && new_icvs);
4730    KMP_DEBUG_ASSERT((!TCR_4(__kmp_init_parallel)) || new_icvs->nproc);
4731    KMP_CHECK_UPDATE(team->t.t_ident, loc);
4732  
4733    KMP_CHECK_UPDATE(team->t.t_id, KMP_GEN_TEAM_ID());
4734    // Copy ICVs to the primary thread's implicit taskdata
4735    __kmp_init_implicit_task(loc, team->t.t_threads[0], team, 0, FALSE);
4736    copy_icvs(&team->t.t_implicit_task_taskdata[0].td_icvs, new_icvs);
4737  
4738    KF_TRACE(10, ("__kmp_reinitialize_team: exit this_thread=%p team=%p\n",
4739                  team->t.t_threads[0], team));
4740  }
4741  
4742  /* Initialize the team data structure.
4743     This assumes the t_threads and t_max_nproc are already set.
4744     Also, we don't touch the arguments */
4745  static void __kmp_initialize_team(kmp_team_t *team, int new_nproc,
4746                                    kmp_internal_control_t *new_icvs,
4747                                    ident_t *loc) {
4748    KF_TRACE(10, ("__kmp_initialize_team: enter: team=%p\n", team));
4749  
4750    /* verify */
4751    KMP_DEBUG_ASSERT(team);
4752    KMP_DEBUG_ASSERT(new_nproc <= team->t.t_max_nproc);
4753    KMP_DEBUG_ASSERT(team->t.t_threads);
4754    KMP_MB();
4755  
4756    team->t.t_master_tid = 0; /* not needed */
4757    /* team->t.t_master_bar;        not needed */
4758    team->t.t_serialized = new_nproc > 1 ? 0 : 1;
4759    team->t.t_nproc = new_nproc;
4760  
4761    /* team->t.t_parent     = NULL; TODO not needed & would mess up hot team */
4762    team->t.t_next_pool = NULL;
4763    /* memset( team->t.t_threads, 0, sizeof(kmp_info_t*)*new_nproc ); would mess
4764     * up hot team */
4765  
4766    TCW_SYNC_PTR(team->t.t_pkfn, NULL); /* not needed */
4767    team->t.t_invoke = NULL; /* not needed */
4768  
4769    // TODO???: team->t.t_max_active_levels       = new_max_active_levels;
4770    team->t.t_sched.sched = new_icvs->sched.sched;
4771  
4772  #if KMP_ARCH_X86 || KMP_ARCH_X86_64
4773    team->t.t_fp_control_saved = FALSE; /* not needed */
4774    team->t.t_x87_fpu_control_word = 0; /* not needed */
4775    team->t.t_mxcsr = 0; /* not needed */
4776  #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
4777  
4778    team->t.t_construct = 0;
4779  
4780    team->t.t_ordered.dt.t_value = 0;
4781    team->t.t_master_active = FALSE;
4782  
4783  #ifdef KMP_DEBUG
4784    team->t.t_copypriv_data = NULL; /* not necessary, but nice for debugging */
4785  #endif
4786  #if KMP_OS_WINDOWS
4787    team->t.t_copyin_counter = 0; /* for barrier-free copyin implementation */
4788  #endif
4789  
4790    team->t.t_control_stack_top = NULL;
4791  
4792    __kmp_reinitialize_team(team, new_icvs, loc);
4793  
4794    KMP_MB();
4795    KF_TRACE(10, ("__kmp_initialize_team: exit: team=%p\n", team));
4796  }
4797  
4798  #if KMP_AFFINITY_SUPPORTED
4799  static inline void __kmp_set_thread_place(kmp_team_t *team, kmp_info_t *th,
4800                                            int first, int last, int newp) {
4801    th->th.th_first_place = first;
4802    th->th.th_last_place = last;
4803    th->th.th_new_place = newp;
4804    if (newp != th->th.th_current_place) {
4805      if (__kmp_display_affinity && team->t.t_display_affinity != 1)
4806        team->t.t_display_affinity = 1;
4807      // Copy topology information associated with the new place
4808      th->th.th_topology_ids = __kmp_affinity.ids[th->th.th_new_place];
4809      th->th.th_topology_attrs = __kmp_affinity.attrs[th->th.th_new_place];
4810    }
4811  }
4812  
4813  // __kmp_partition_places() is the heart of the OpenMP 4.0 affinity mechanism.
4814  // It calculates the worker + primary thread's partition based upon the parent
4815  // thread's partition, and binds each worker to a place in its partition.
4816  // The primary thread's partition should already include its current binding.
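// Editorial illustration (place numbers assumed; the partition is assumed not
// to cover all places): with a parent partition of places [2,5] and a team of
// 4 threads whose primary thread sits on place 3, proc_bind_close leaves the
// primary on 3 and binds the workers to places 4, 5 and 2 (wrapping within the
// partition), while proc_bind_spread instead gives each thread its own
// single-place sub-partition [3,3], [4,4], [5,5], [2,2].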
4817  static void __kmp_partition_places(kmp_team_t *team, int update_master_only) {
4818    // Do not partition places for the hidden helper team
4819    if (KMP_HIDDEN_HELPER_TEAM(team))
4820      return;
4821    // Copy the primary thread's place partition to the team struct
4822    kmp_info_t *master_th = team->t.t_threads[0];
4823    KMP_DEBUG_ASSERT(master_th != NULL);
4824    kmp_proc_bind_t proc_bind = team->t.t_proc_bind;
4825    int first_place = master_th->th.th_first_place;
4826    int last_place = master_th->th.th_last_place;
4827    int masters_place = master_th->th.th_current_place;
4828    int num_masks = __kmp_affinity.num_masks;
4829    team->t.t_first_place = first_place;
4830    team->t.t_last_place = last_place;
4831  
4832    KA_TRACE(20, ("__kmp_partition_places: enter: proc_bind = %d T#%d(%d:0) "
4833                  "bound to place %d partition = [%d,%d]\n",
4834                  proc_bind, __kmp_gtid_from_thread(team->t.t_threads[0]),
4835                  team->t.t_id, masters_place, first_place, last_place));
4836  
4837    switch (proc_bind) {
4838  
4839    case proc_bind_default:
4840      // Serial teams might have the proc_bind policy set to proc_bind_default.
4841      // Not an issue -- we don't rebind primary thread for any proc_bind policy.
4842      KMP_DEBUG_ASSERT(team->t.t_nproc == 1);
4843      break;
4844  
4845    case proc_bind_primary: {
4846      int f;
4847      int n_th = team->t.t_nproc;
4848      for (f = 1; f < n_th; f++) {
4849        kmp_info_t *th = team->t.t_threads[f];
4850        KMP_DEBUG_ASSERT(th != NULL);
4851        __kmp_set_thread_place(team, th, first_place, last_place, masters_place);
4852  
4853        KA_TRACE(100, ("__kmp_partition_places: primary: T#%d(%d:%d) place %d "
4854                       "partition = [%d,%d]\n",
4855                       __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id,
4856                       f, masters_place, first_place, last_place));
4857      }
4858    } break;
4859  
4860    case proc_bind_close: {
4861      int f;
4862      int n_th = team->t.t_nproc;
4863      int n_places;
4864      if (first_place <= last_place) {
4865        n_places = last_place - first_place + 1;
4866      } else {
4867        n_places = num_masks - first_place + last_place + 1;
4868      }
4869      if (n_th <= n_places) {
4870        int place = masters_place;
4871        for (f = 1; f < n_th; f++) {
4872          kmp_info_t *th = team->t.t_threads[f];
4873          KMP_DEBUG_ASSERT(th != NULL);
4874  
4875          if (place == last_place) {
4876            place = first_place;
4877          } else if (place == (num_masks - 1)) {
4878            place = 0;
4879          } else {
4880            place++;
4881          }
4882          __kmp_set_thread_place(team, th, first_place, last_place, place);
4883  
4884          KA_TRACE(100, ("__kmp_partition_places: close: T#%d(%d:%d) place %d "
4885                         "partition = [%d,%d]\n",
4886                         __kmp_gtid_from_thread(team->t.t_threads[f]),
4887                         team->t.t_id, f, place, first_place, last_place));
4888        }
4889      } else {
4890        int S, rem, gap, s_count;
4891        S = n_th / n_places;
4892        s_count = 0;
4893        rem = n_th - (S * n_places);
4894        gap = rem > 0 ? n_places / rem : n_places;
4895        int place = masters_place;
4896        int gap_ct = gap;
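      // Editorial worked example (values assumed): with n_th = 10 and
      // n_places = 4, S = 2, rem = 2 and gap = 2, so the loop below puts
      // 3, 2, 3, 2 threads on consecutive places starting at masters_place;
      // the two leftover threads land on every gap-th place, and `place`
      // wraps back to masters_place by the time the loop finishes (see the
      // assertion after the loop).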
4897        for (f = 0; f < n_th; f++) {
4898          kmp_info_t *th = team->t.t_threads[f];
4899          KMP_DEBUG_ASSERT(th != NULL);
4900  
4901          __kmp_set_thread_place(team, th, first_place, last_place, place);
4902          s_count++;
4903  
4904          if ((s_count == S) && rem && (gap_ct == gap)) {
4905            // do nothing, add an extra thread to place on next iteration
4906          } else if ((s_count == S + 1) && rem && (gap_ct == gap)) {
4907            // we added an extra thread to this place; move to next place
4908            if (place == last_place) {
4909              place = first_place;
4910            } else if (place == (num_masks - 1)) {
4911              place = 0;
4912            } else {
4913              place++;
4914            }
4915            s_count = 0;
4916            gap_ct = 1;
4917            rem--;
4918          } else if (s_count == S) { // place full; don't add extra
4919            if (place == last_place) {
4920              place = first_place;
4921            } else if (place == (num_masks - 1)) {
4922              place = 0;
4923            } else {
4924              place++;
4925            }
4926            gap_ct++;
4927            s_count = 0;
4928          }
4929  
4930          KA_TRACE(100,
4931                   ("__kmp_partition_places: close: T#%d(%d:%d) place %d "
4932                    "partition = [%d,%d]\n",
4933                    __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id, f,
4934                    th->th.th_new_place, first_place, last_place));
4935        }
4936        KMP_DEBUG_ASSERT(place == masters_place);
4937      }
4938    } break;
4939  
4940    case proc_bind_spread: {
4941      int f;
4942      int n_th = team->t.t_nproc;
4943      int n_places;
4944      int thidx;
4945      if (first_place <= last_place) {
4946        n_places = last_place - first_place + 1;
4947      } else {
4948        n_places = num_masks - first_place + last_place + 1;
4949      }
4950      if (n_th <= n_places) {
4951        int place = -1;
4952  
4953        if (n_places != num_masks) {
4954          int S = n_places / n_th;
4955          int s_count, rem, gap, gap_ct;
4956  
4957          place = masters_place;
4958          rem = n_places - n_th * S;
4959          gap = rem ? n_th / rem : 1;
4960          gap_ct = gap;
4961          thidx = n_th;
4962          if (update_master_only == 1)
4963            thidx = 1;
4964          for (f = 0; f < thidx; f++) {
4965            kmp_info_t *th = team->t.t_threads[f];
4966            KMP_DEBUG_ASSERT(th != NULL);
4967  
4968            int fplace = place, nplace = place;
4969            s_count = 1;
4970            while (s_count < S) {
4971              if (place == last_place) {
4972                place = first_place;
4973              } else if (place == (num_masks - 1)) {
4974                place = 0;
4975              } else {
4976                place++;
4977              }
4978              s_count++;
4979            }
4980            if (rem && (gap_ct == gap)) {
4981              if (place == last_place) {
4982                place = first_place;
4983              } else if (place == (num_masks - 1)) {
4984                place = 0;
4985              } else {
4986                place++;
4987              }
4988              rem--;
4989              gap_ct = 0;
4990            }
4991            __kmp_set_thread_place(team, th, fplace, place, nplace);
4992            gap_ct++;
4993  
4994            if (place == last_place) {
4995              place = first_place;
4996            } else if (place == (num_masks - 1)) {
4997              place = 0;
4998            } else {
4999              place++;
5000            }
5001  
5002            KA_TRACE(100,
5003                     ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
5004                      "partition = [%d,%d], num_masks: %u\n",
5005                      __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id,
5006                      f, th->th.th_new_place, th->th.th_first_place,
5007                      th->th.th_last_place, num_masks));
5008          }
5009        } else {
5010          /* Having a uniform space of available computation places, we can
5011             create T partitions of roughly P/T places each and put each thread
5012             into the first place of its partition. */
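          /* Editorial worked example (values assumed): with n_places ==
             num_masks == 8, n_th == 3 and the primary thread on place 0,
             spacing == (8 + 1) / 3 == 3.0, so the loop below hands out the
             partitions [0,2], [3,5] and [6,7] and binds each thread to the
             first place of its partition. */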
5013          double current = static_cast<double>(masters_place);
5014          double spacing =
5015              (static_cast<double>(n_places + 1) / static_cast<double>(n_th));
5016          int first, last;
5017          kmp_info_t *th;
5018  
5019          thidx = n_th + 1;
5020          if (update_master_only == 1)
5021            thidx = 1;
5022          for (f = 0; f < thidx; f++) {
5023            first = static_cast<int>(current);
5024            last = static_cast<int>(current + spacing) - 1;
5025            KMP_DEBUG_ASSERT(last >= first);
5026            if (first >= n_places) {
5027              if (masters_place) {
5028                first -= n_places;
5029                last -= n_places;
5030                if (first == (masters_place + 1)) {
5031                  KMP_DEBUG_ASSERT(f == n_th);
5032                  first--;
5033                }
5034                if (last == masters_place) {
5035                  KMP_DEBUG_ASSERT(f == (n_th - 1));
5036                  last--;
5037                }
5038              } else {
5039                KMP_DEBUG_ASSERT(f == n_th);
5040                first = 0;
5041                last = 0;
5042              }
5043            }
5044            if (last >= n_places) {
5045              last = (n_places - 1);
5046            }
5047            place = first;
5048            current += spacing;
5049            if (f < n_th) {
5050              KMP_DEBUG_ASSERT(0 <= first);
5051              KMP_DEBUG_ASSERT(n_places > first);
5052              KMP_DEBUG_ASSERT(0 <= last);
5053              KMP_DEBUG_ASSERT(n_places > last);
5054              KMP_DEBUG_ASSERT(last_place >= first_place);
5055              th = team->t.t_threads[f];
5056              KMP_DEBUG_ASSERT(th);
5057              __kmp_set_thread_place(team, th, first, last, place);
5058              KA_TRACE(100,
5059                       ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
5060                        "partition = [%d,%d], spacing = %.4f\n",
5061                        __kmp_gtid_from_thread(team->t.t_threads[f]),
5062                        team->t.t_id, f, th->th.th_new_place,
5063                        th->th.th_first_place, th->th.th_last_place, spacing));
5064            }
5065          }
5066        }
5067        KMP_DEBUG_ASSERT(update_master_only || place == masters_place);
5068      } else {
5069        int S, rem, gap, s_count;
5070        S = n_th / n_places;
5071        s_count = 0;
5072        rem = n_th - (S * n_places);
5073        gap = rem > 0 ? n_places / rem : n_places;
5074        int place = masters_place;
5075        int gap_ct = gap;
5076        thidx = n_th;
5077        if (update_master_only == 1)
5078          thidx = 1;
5079        for (f = 0; f < thidx; f++) {
5080          kmp_info_t *th = team->t.t_threads[f];
5081          KMP_DEBUG_ASSERT(th != NULL);
5082  
5083          __kmp_set_thread_place(team, th, place, place, place);
5084          s_count++;
5085  
5086          if ((s_count == S) && rem && (gap_ct == gap)) {
5087            // do nothing, add an extra thread to place on next iteration
5088          } else if ((s_count == S + 1) && rem && (gap_ct == gap)) {
5089            // we added an extra thread to this place; move on to next place
5090            if (place == last_place) {
5091              place = first_place;
5092            } else if (place == (num_masks - 1)) {
5093              place = 0;
5094            } else {
5095              place++;
5096            }
5097            s_count = 0;
5098            gap_ct = 1;
5099            rem--;
5100          } else if (s_count == S) { // place is full; don't add extra thread
5101            if (place == last_place) {
5102              place = first_place;
5103            } else if (place == (num_masks - 1)) {
5104              place = 0;
5105            } else {
5106              place++;
5107            }
5108            gap_ct++;
5109            s_count = 0;
5110          }
5111  
5112          KA_TRACE(100, ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
5113                         "partition = [%d,%d]\n",
5114                         __kmp_gtid_from_thread(team->t.t_threads[f]),
5115                         team->t.t_id, f, th->th.th_new_place,
5116                         th->th.th_first_place, th->th.th_last_place));
5117        }
5118        KMP_DEBUG_ASSERT(update_master_only || place == masters_place);
5119      }
5120    } break;
5121  
5122    default:
5123      break;
5124    }
5125  
5126    KA_TRACE(20, ("__kmp_partition_places: exit T#%d\n", team->t.t_id));
5127  }
5128  
5129  #endif // KMP_AFFINITY_SUPPORTED
5130  
5131  /* Allocate a new team data structure to use. Take one off the free pool if
5132     available. */
5133  kmp_team_t *
5134  __kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc,
5135  #if OMPT_SUPPORT
5136                      ompt_data_t ompt_parallel_data,
5137  #endif
5138                      kmp_proc_bind_t new_proc_bind,
5139                      kmp_internal_control_t *new_icvs,
5140                      int argc USE_NESTED_HOT_ARG(kmp_info_t *master)) {
5141    KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_allocate_team);
5142    int f;
5143    kmp_team_t *team;
5144    int use_hot_team = !root->r.r_active;
5145    int level = 0;
5146    int do_place_partition = 1;
5147  
5148    KA_TRACE(20, ("__kmp_allocate_team: called\n"));
5149    KMP_DEBUG_ASSERT(new_nproc >= 1 && argc >= 0);
5150    KMP_DEBUG_ASSERT(max_nproc >= new_nproc);
5151    KMP_MB();
5152  
5153  #if KMP_NESTED_HOT_TEAMS
5154    kmp_hot_team_ptr_t *hot_teams;
5155    if (master) {
5156      team = master->th.th_team;
5157      level = team->t.t_active_level;
5158      if (master->th.th_teams_microtask) { // in teams construct?
5159        if (master->th.th_teams_size.nteams > 1 &&
5160            ( // #teams > 1
5161                team->t.t_pkfn ==
5162                    (microtask_t)__kmp_teams_master || // inner fork of the teams
5163                master->th.th_teams_level <
5164                    team->t.t_level)) { // or nested parallel inside the teams
5165          ++level; // don't increment if #teams==1, or for the outer fork of the
5166          // teams; increment otherwise
5167        }
5168        // Do not perform the place partition if inner fork of the teams
5169        // Wait until nested parallel region encountered inside teams construct
5170        if ((master->th.th_teams_size.nteams == 1 &&
5171             master->th.th_teams_level >= team->t.t_level) ||
5172            (team->t.t_pkfn == (microtask_t)__kmp_teams_master))
5173          do_place_partition = 0;
5174      }
5175      hot_teams = master->th.th_hot_teams;
5176      if (level < __kmp_hot_teams_max_level && hot_teams &&
5177          hot_teams[level].hot_team) {
5178        // hot team has already been allocated for given level
5179        use_hot_team = 1;
5180      } else {
5181        use_hot_team = 0;
5182      }
5183    } else {
5184      // check we won't access uninitialized hot_teams, just in case
5185      KMP_DEBUG_ASSERT(new_nproc == 1);
5186    }
5187  #endif
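  // Illustrative note (assumed user-side sketch, not code from this runtime):
  // the level bump above is what gives a nested parallel region inside a teams
  // construct its own nested hot-team slot, roughly as in:
  //
  //   #pragma omp teams num_teams(4)        // outer fork of the teams: no bump
  //   {
  //     #pragma omp parallel num_threads(2) // nested parallel inside teams:
  //     work();                             //   level is bumped, so a separate
  //   }                                     //   hot_teams[level] entry is used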
5188    // Optimization to use a "hot" team
5189    if (use_hot_team && new_nproc > 1) {
5190      KMP_DEBUG_ASSERT(new_nproc <= max_nproc);
5191  #if KMP_NESTED_HOT_TEAMS
5192      team = hot_teams[level].hot_team;
5193  #else
5194      team = root->r.r_hot_team;
5195  #endif
5196  #if KMP_DEBUG
5197      if (__kmp_tasking_mode != tskm_immediate_exec) {
5198        KA_TRACE(20, ("__kmp_allocate_team: hot team task_team[0] = %p "
5199                      "task_team[1] = %p before reinit\n",
5200                      team->t.t_task_team[0], team->t.t_task_team[1]));
5201      }
5202  #endif
5203  
5204      if (team->t.t_nproc != new_nproc &&
5205          __kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5206        // Distributed barrier may need a resize
5207        int old_nthr = team->t.t_nproc;
5208        __kmp_resize_dist_barrier(team, old_nthr, new_nproc);
5209      }
5210  
5211      // If not doing the place partition, then reset the team's proc bind
5212      // to indicate that partitioning of all threads still needs to take place
5213      if (do_place_partition == 0)
5214        team->t.t_proc_bind = proc_bind_default;
5215      // Has the number of threads changed?
5216      /* Let's assume the most common case is that the number of threads is
5217         unchanged, and put that case first. */
5218      if (team->t.t_nproc == new_nproc) { // Check changes in number of threads
5219        KA_TRACE(20, ("__kmp_allocate_team: reusing hot team\n"));
5220        // This case can mean that omp_set_num_threads() was called and the hot
5221        // team size was already reduced, so we check the special flag
5222        if (team->t.t_size_changed == -1) {
5223          team->t.t_size_changed = 1;
5224        } else {
5225          KMP_CHECK_UPDATE(team->t.t_size_changed, 0);
5226        }
5227  
5228        // TODO???: team->t.t_max_active_levels = new_max_active_levels;
5229        kmp_r_sched_t new_sched = new_icvs->sched;
5230        // set primary thread's schedule as new run-time schedule
5231        KMP_CHECK_UPDATE(team->t.t_sched.sched, new_sched.sched);
5232  
5233        __kmp_reinitialize_team(team, new_icvs,
5234                                root->r.r_uber_thread->th.th_ident);
5235  
5236        KF_TRACE(10, ("__kmp_allocate_team2: T#%d, this_thread=%p team=%p\n", 0,
5237                      team->t.t_threads[0], team));
5238        __kmp_push_current_task_to_thread(team->t.t_threads[0], team, 0);
5239  
5240  #if KMP_AFFINITY_SUPPORTED
5241        if ((team->t.t_size_changed == 0) &&
5242            (team->t.t_proc_bind == new_proc_bind)) {
5243          if (new_proc_bind == proc_bind_spread) {
5244            if (do_place_partition) {
5245              // add flag to update only master for spread
5246              __kmp_partition_places(team, 1);
5247            }
5248          }
5249          KA_TRACE(200, ("__kmp_allocate_team: reusing hot team #%d bindings: "
5250                         "proc_bind = %d, partition = [%d,%d]\n",
5251                         team->t.t_id, new_proc_bind, team->t.t_first_place,
5252                         team->t.t_last_place));
5253        } else {
5254          if (do_place_partition) {
5255            KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5256            __kmp_partition_places(team);
5257          }
5258        }
5259  #else
5260        KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5261  #endif /* KMP_AFFINITY_SUPPORTED */
5262      } else if (team->t.t_nproc > new_nproc) {
5263        KA_TRACE(20,
5264                 ("__kmp_allocate_team: decreasing hot team thread count to %d\n",
5265                  new_nproc));
5266  
5267        team->t.t_size_changed = 1;
5268        if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5269          // Barrier size already reduced earlier in this function
5270          // Activate team threads via th_used_in_team
5271          __kmp_add_threads_to_team(team, new_nproc);
5272        }
5273        // When decreasing team size, threads no longer in the team should
5274        // unref task team.
5275        if (__kmp_tasking_mode != tskm_immediate_exec) {
5276          for (f = new_nproc; f < team->t.t_nproc; f++) {
5277            kmp_info_t *th = team->t.t_threads[f];
5278            KMP_DEBUG_ASSERT(th);
5279            th->th.th_task_team = NULL;
5280          }
5281        }
5282  #if KMP_NESTED_HOT_TEAMS
5283        if (__kmp_hot_teams_mode == 0) {
5284          // AC: saved number of threads should correspond to team's value in this
5285          // mode, can be bigger in mode 1, when hot team has threads in reserve
5286          KMP_DEBUG_ASSERT(hot_teams[level].hot_team_nth == team->t.t_nproc);
5287          hot_teams[level].hot_team_nth = new_nproc;
5288  #endif // KMP_NESTED_HOT_TEAMS
5289          /* release the extra threads we don't need any more */
5290          for (f = new_nproc; f < team->t.t_nproc; f++) {
5291            KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5292            __kmp_free_thread(team->t.t_threads[f]);
5293            team->t.t_threads[f] = NULL;
5294          }
5295  #if KMP_NESTED_HOT_TEAMS
5296        } // (__kmp_hot_teams_mode == 0)
5297        else {
5298          // When keeping extra threads in team, switch threads to wait on own
5299          // b_go flag
5300          for (f = new_nproc; f < team->t.t_nproc; ++f) {
5301            KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5302            kmp_balign_t *balign = team->t.t_threads[f]->th.th_bar;
5303            for (int b = 0; b < bs_last_barrier; ++b) {
5304              if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG) {
5305                balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG;
5306              }
5307              KMP_CHECK_UPDATE(balign[b].bb.leaf_kids, 0);
5308            }
5309          }
5310        }
5311  #endif // KMP_NESTED_HOT_TEAMS
5312        team->t.t_nproc = new_nproc;
5313        // TODO???: team->t.t_max_active_levels = new_max_active_levels;
5314        KMP_CHECK_UPDATE(team->t.t_sched.sched, new_icvs->sched.sched);
5315        __kmp_reinitialize_team(team, new_icvs,
5316                                root->r.r_uber_thread->th.th_ident);
5317  
5318        // Update remaining threads
5319        for (f = 0; f < new_nproc; ++f) {
5320          team->t.t_threads[f]->th.th_team_nproc = new_nproc;
5321        }
5322  
5323        // restore the current task state of the primary thread: should be the
5324        // implicit task
5325        KF_TRACE(10, ("__kmp_allocate_team: T#%d, this_thread=%p team=%p\n", 0,
5326                      team->t.t_threads[0], team));
5327  
5328        __kmp_push_current_task_to_thread(team->t.t_threads[0], team, 0);
5329  
5330  #ifdef KMP_DEBUG
5331        for (f = 0; f < team->t.t_nproc; f++) {
5332          KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
5333                           team->t.t_threads[f]->th.th_team_nproc ==
5334                               team->t.t_nproc);
5335        }
5336  #endif
5337  
5338        if (do_place_partition) {
5339          KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5340  #if KMP_AFFINITY_SUPPORTED
5341          __kmp_partition_places(team);
5342  #endif
5343        }
5344      } else { // team->t.t_nproc < new_nproc
5345  
5346        KA_TRACE(20,
5347                 ("__kmp_allocate_team: increasing hot team thread count to %d\n",
5348                  new_nproc));
5349        int old_nproc = team->t.t_nproc; // save old value and use to update only
5350        team->t.t_size_changed = 1;
5351  
5352  #if KMP_NESTED_HOT_TEAMS
5353        int avail_threads = hot_teams[level].hot_team_nth;
5354        if (new_nproc < avail_threads)
5355          avail_threads = new_nproc;
5356        kmp_info_t **other_threads = team->t.t_threads;
5357        for (f = team->t.t_nproc; f < avail_threads; ++f) {
5358          // Adjust barrier data of reserved threads (if any) of the team
5359          // Other data will be set in __kmp_initialize_info() below.
5360          int b;
5361          kmp_balign_t *balign = other_threads[f]->th.th_bar;
5362          for (b = 0; b < bs_last_barrier; ++b) {
5363            balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5364            KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
5365  #if USE_DEBUGGER
5366            balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5367  #endif
5368          }
5369        }
5370        if (hot_teams[level].hot_team_nth >= new_nproc) {
5371          // we have all needed threads in reserve, no need to allocate any
5372          // this is only possible in mode 1; mode 0 cannot have reserved threads
5373          KMP_DEBUG_ASSERT(__kmp_hot_teams_mode == 1);
5374          team->t.t_nproc = new_nproc; // just get reserved threads involved
5375        } else {
5376          // We may have some threads in reserve, but not enough;
5377          // get reserved threads involved if any.
5378          team->t.t_nproc = hot_teams[level].hot_team_nth;
5379          hot_teams[level].hot_team_nth = new_nproc; // adjust hot team max size
5380  #endif // KMP_NESTED_HOT_TEAMS
5381          if (team->t.t_max_nproc < new_nproc) {
5382            /* reallocate larger arrays */
5383            __kmp_reallocate_team_arrays(team, new_nproc);
5384            __kmp_reinitialize_team(team, new_icvs, NULL);
5385          }
5386  
5387  #if (KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_NETBSD || KMP_OS_DRAGONFLY) &&   \
5388      KMP_AFFINITY_SUPPORTED
5389          /* Temporarily set full mask for primary thread before creation of
5390             workers. The reason is that workers inherit the affinity from the
5391             primary thread, so if a lot of workers are created on the single
5392             core quickly, they don't get a chance to set their own affinity for
5393             a long time. */
5394          kmp_affinity_raii_t new_temp_affinity{__kmp_affin_fullMask};
5395  #endif
5396  
5397          /* allocate new threads for the hot team */
5398          for (f = team->t.t_nproc; f < new_nproc; f++) {
5399            kmp_info_t *new_worker = __kmp_allocate_thread(root, team, f);
5400            KMP_DEBUG_ASSERT(new_worker);
5401            team->t.t_threads[f] = new_worker;
5402  
5403            KA_TRACE(20,
5404                     ("__kmp_allocate_team: team %d init T#%d arrived: "
5405                      "join=%llu, plain=%llu\n",
5406                      team->t.t_id, __kmp_gtid_from_tid(f, team), team->t.t_id, f,
5407                      team->t.t_bar[bs_forkjoin_barrier].b_arrived,
5408                      team->t.t_bar[bs_plain_barrier].b_arrived));
5409  
5410            { // Initialize barrier data for new threads.
5411              int b;
5412              kmp_balign_t *balign = new_worker->th.th_bar;
5413              for (b = 0; b < bs_last_barrier; ++b) {
5414                balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5415                KMP_DEBUG_ASSERT(balign[b].bb.wait_flag !=
5416                                 KMP_BARRIER_PARENT_FLAG);
5417  #if USE_DEBUGGER
5418                balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5419  #endif
5420              }
5421            }
5422          }
5423  
5424  #if (KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_NETBSD || KMP_OS_DRAGONFLY) &&   \
5425      KMP_AFFINITY_SUPPORTED
5426          /* Restore initial primary thread's affinity mask */
5427          new_temp_affinity.restore();
5428  #endif
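        // Minimal sketch of the save/restore idea used above (assumed, generic
        // C++; the real kmp_affinity_raii_t interface may differ):
        //
        //   struct scoped_full_mask {
        //     mask_t saved;                          // hypothetical mask type
        //     explicit scoped_full_mask(mask_t full)
        //         : saved(get_thread_mask()) { set_thread_mask(full); }
        //     void restore() { set_thread_mask(saved); }
        //   };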
5429  #if KMP_NESTED_HOT_TEAMS
5430        } // end of check of t_nproc vs. new_nproc vs. hot_team_nth
5431  #endif // KMP_NESTED_HOT_TEAMS
5432        if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5433          // Barrier size already increased earlier in this function
5434          // Activate team threads via th_used_in_team
5435          __kmp_add_threads_to_team(team, new_nproc);
5436        }
5437        /* make sure everyone is synchronized */
5438        // new threads below
5439        __kmp_initialize_team(team, new_nproc, new_icvs,
5440                              root->r.r_uber_thread->th.th_ident);
5441  
5442        /* reinitialize the threads */
5443        KMP_DEBUG_ASSERT(team->t.t_nproc == new_nproc);
5444        for (f = 0; f < team->t.t_nproc; ++f)
5445          __kmp_initialize_info(team->t.t_threads[f], team, f,
5446                                __kmp_gtid_from_tid(f, team));
5447  
5448        // set th_task_state for new threads in hot team with older thread's state
5449        kmp_uint8 old_state = team->t.t_threads[old_nproc - 1]->th.th_task_state;
5450        for (f = old_nproc; f < team->t.t_nproc; ++f)
5451          team->t.t_threads[f]->th.th_task_state = old_state;
5452  
5453  #ifdef KMP_DEBUG
5454        for (f = 0; f < team->t.t_nproc; ++f) {
5455          KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
5456                           team->t.t_threads[f]->th.th_team_nproc ==
5457                               team->t.t_nproc);
5458        }
5459  #endif
5460  
5461        if (do_place_partition) {
5462          KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5463  #if KMP_AFFINITY_SUPPORTED
5464          __kmp_partition_places(team);
5465  #endif
5466        }
5467      } // Check changes in number of threads
5468  
5469      if (master->th.th_teams_microtask) {
5470        for (f = 1; f < new_nproc; ++f) {
5471          // propagate teams construct specific info to workers
5472          kmp_info_t *thr = team->t.t_threads[f];
5473          thr->th.th_teams_microtask = master->th.th_teams_microtask;
5474          thr->th.th_teams_level = master->th.th_teams_level;
5475          thr->th.th_teams_size = master->th.th_teams_size;
5476        }
5477      }
5478  #if KMP_NESTED_HOT_TEAMS
5479      if (level) {
5480        // Sync barrier state for nested hot teams, not needed for outermost hot
5481        // team.
5482        for (f = 1; f < new_nproc; ++f) {
5483          kmp_info_t *thr = team->t.t_threads[f];
5484          int b;
5485          kmp_balign_t *balign = thr->th.th_bar;
5486          for (b = 0; b < bs_last_barrier; ++b) {
5487            balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5488            KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
5489  #if USE_DEBUGGER
5490            balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5491  #endif
5492          }
5493        }
5494      }
5495  #endif // KMP_NESTED_HOT_TEAMS
5496  
5497      /* reallocate space for arguments if necessary */
5498      __kmp_alloc_argv_entries(argc, team, TRUE);
5499      KMP_CHECK_UPDATE(team->t.t_argc, argc);
5500      // The hot team re-uses the previous task team,
5501      // if untouched during the previous release->gather phase.
5502  
5503      KF_TRACE(10, (" hot_team = %p\n", team));
5504  
5505  #if KMP_DEBUG
5506      if (__kmp_tasking_mode != tskm_immediate_exec) {
5507        KA_TRACE(20, ("__kmp_allocate_team: hot team task_team[0] = %p "
5508                      "task_team[1] = %p after reinit\n",
5509                      team->t.t_task_team[0], team->t.t_task_team[1]));
5510      }
5511  #endif
5512  
5513  #if OMPT_SUPPORT
5514      __ompt_team_assign_id(team, ompt_parallel_data);
5515  #endif
5516  
5517      KMP_MB();
5518  
5519      return team;
5520    }
5521  
5522    /* next, let's try to take one from the team pool */
5523    KMP_MB();
5524    for (team = CCAST(kmp_team_t *, __kmp_team_pool); (team);) {
5525      /* TODO: consider resizing undersized teams instead of reaping them, now
5526         that we have a resizing mechanism */
5527      if (team->t.t_max_nproc >= max_nproc) {
5528        /* take this team from the team pool */
5529        __kmp_team_pool = team->t.t_next_pool;
5530  
5531        if (max_nproc > 1 &&
5532            __kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5533          if (!team->t.b) { // Allocate barrier structure
5534            team->t.b = distributedBarrier::allocate(__kmp_dflt_team_nth_ub);
5535          }
5536        }
5537  
5538        /* setup the team for fresh use */
5539        __kmp_initialize_team(team, new_nproc, new_icvs, NULL);
5540  
5541        KA_TRACE(20, ("__kmp_allocate_team: setting task_team[0] %p and "
5542                      "task_team[1] %p to NULL\n",
5543                      &team->t.t_task_team[0], &team->t.t_task_team[1]));
5544        team->t.t_task_team[0] = NULL;
5545        team->t.t_task_team[1] = NULL;
5546  
5547        /* reallocate space for arguments if necessary */
5548        __kmp_alloc_argv_entries(argc, team, TRUE);
5549        KMP_CHECK_UPDATE(team->t.t_argc, argc);
5550  
5551        KA_TRACE(
5552            20, ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n",
5553                 team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
5554        { // Initialize barrier data.
5555          int b;
5556          for (b = 0; b < bs_last_barrier; ++b) {
5557            team->t.t_bar[b].b_arrived = KMP_INIT_BARRIER_STATE;
5558  #if USE_DEBUGGER
5559            team->t.t_bar[b].b_master_arrived = 0;
5560            team->t.t_bar[b].b_team_arrived = 0;
5561  #endif
5562          }
5563        }
5564  
5565        team->t.t_proc_bind = new_proc_bind;
5566  
5567        KA_TRACE(20, ("__kmp_allocate_team: using team from pool %d.\n",
5568                      team->t.t_id));
5569  
5570  #if OMPT_SUPPORT
5571        __ompt_team_assign_id(team, ompt_parallel_data);
5572  #endif
5573  
5574        team->t.t_nested_nth = NULL;
5575  
5576        KMP_MB();
5577  
5578        return team;
5579      }
5580  
5581      /* reap team if it is too small, then loop back and check the next one */
5582      // not sure if this is wise, but, will be redone during the hot-teams
5583      // not sure if this is wise, but it will be redone during the hot-teams
5584      /* TODO: Use technique to find the right size hot-team, don't reap them */
5585      team = __kmp_reap_team(team);
5586      __kmp_team_pool = team;
5587    }
5588  
5589    /* nothing available in the pool, no matter, make a new team! */
5590    KMP_MB();
5591    team = (kmp_team_t *)__kmp_allocate(sizeof(kmp_team_t));
5592  
5593    /* and set it up */
5594    team->t.t_max_nproc = max_nproc;
5595    if (max_nproc > 1 &&
5596        __kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5597      // Allocate barrier structure
5598      team->t.b = distributedBarrier::allocate(__kmp_dflt_team_nth_ub);
5599    }
5600  
5601    /* NOTE well, for some reason allocating one big buffer and dividing it up
5602       seems to really hurt performance a lot on the P4, so let's not use this */
5603    __kmp_allocate_team_arrays(team, max_nproc);
5604  
5605    KA_TRACE(20, ("__kmp_allocate_team: making a new team\n"));
5606    __kmp_initialize_team(team, new_nproc, new_icvs, NULL);
5607  
5608    KA_TRACE(20, ("__kmp_allocate_team: setting task_team[0] %p and task_team[1] "
5609                  "%p to NULL\n",
5610                  &team->t.t_task_team[0], &team->t.t_task_team[1]));
5611    team->t.t_task_team[0] = NULL; // to be removed, as __kmp_allocate zeroes
5612    // memory, no need to duplicate
5613    team->t.t_task_team[1] = NULL; // to be removed, as __kmp_allocate zeroes
5614    // memory, no need to duplicate
5615  
5616    if (__kmp_storage_map) {
5617      __kmp_print_team_storage_map("team", team, team->t.t_id, new_nproc);
5618    }
5619  
5620    /* allocate space for arguments */
5621    __kmp_alloc_argv_entries(argc, team, FALSE);
5622    team->t.t_argc = argc;
5623  
5624    KA_TRACE(20,
5625             ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n",
5626              team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
5627    { // Initialize barrier data.
5628      int b;
5629      for (b = 0; b < bs_last_barrier; ++b) {
5630        team->t.t_bar[b].b_arrived = KMP_INIT_BARRIER_STATE;
5631  #if USE_DEBUGGER
5632        team->t.t_bar[b].b_master_arrived = 0;
5633        team->t.t_bar[b].b_team_arrived = 0;
5634  #endif
5635      }
5636    }
5637  
5638    team->t.t_proc_bind = new_proc_bind;
5639  
5640  #if OMPT_SUPPORT
5641    __ompt_team_assign_id(team, ompt_parallel_data);
5642    team->t.ompt_serialized_team_info = NULL;
5643  #endif
5644  
5645    KMP_MB();
5646  
5647    team->t.t_nested_nth = NULL;
5648  
5649    KA_TRACE(20, ("__kmp_allocate_team: done creating a new team %d.\n",
5650                  team->t.t_id));
5651  
5652    return team;
5653  }
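
// Illustrative note (assumed user-side sketch, not code from this file): the
// three hot-team branches above map onto how an application varies the size of
// consecutive parallel regions, e.g.:
//
//   #pragma omp parallel num_threads(8) // builds/reuses the hot team at size 8
//   work();
//   #pragma omp parallel num_threads(4) // t_nproc > new_nproc: shrink branch
//   work();
//   #pragma omp parallel num_threads(8) // t_nproc < new_nproc: grow branch
//   work();
//   #pragma omp parallel num_threads(8) // unchanged size: cheapest reuse path
//   work();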
5654  
5655  /* TODO implement hot-teams at all levels */
5656  /* TODO implement lazy thread release on demand (disband request) */
5657  
5658  /* free the team.  return it to the team pool.  release all the threads
5659   * associated with it */
5660  void __kmp_free_team(kmp_root_t *root,
5661                       kmp_team_t *team USE_NESTED_HOT_ARG(kmp_info_t *master)) {
5662    int f;
5663    KA_TRACE(20, ("__kmp_free_team: T#%d freeing team %d\n", __kmp_get_gtid(),
5664                  team->t.t_id));
5665  
5666    /* verify state */
5667    KMP_DEBUG_ASSERT(root);
5668    KMP_DEBUG_ASSERT(team);
5669    KMP_DEBUG_ASSERT(team->t.t_nproc <= team->t.t_max_nproc);
5670    KMP_DEBUG_ASSERT(team->t.t_threads);
5671  
5672    int use_hot_team = team == root->r.r_hot_team;
5673  #if KMP_NESTED_HOT_TEAMS
5674    int level;
5675    if (master) {
5676      level = team->t.t_active_level - 1;
5677      if (master->th.th_teams_microtask) { // in teams construct?
5678        if (master->th.th_teams_size.nteams > 1) {
5679          ++level; // level was not increased in teams construct for
5680          // team_of_masters
5681        }
5682        if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
5683            master->th.th_teams_level == team->t.t_level) {
5684          ++level; // level was not increased in teams construct for
5685          // team_of_workers before the parallel
5686        } // team->t.t_level will be increased inside parallel
5687      }
5688  #if KMP_DEBUG
5689      kmp_hot_team_ptr_t *hot_teams = master->th.th_hot_teams;
5690  #endif
5691      if (level < __kmp_hot_teams_max_level) {
5692        KMP_DEBUG_ASSERT(team == hot_teams[level].hot_team);
5693        use_hot_team = 1;
5694      }
5695    }
5696  #endif // KMP_NESTED_HOT_TEAMS
5697  
5698    /* team is done working */
5699    TCW_SYNC_PTR(team->t.t_pkfn,
5700                 NULL); // Important for Debugging Support Library.
5701  #if KMP_OS_WINDOWS
5702    team->t.t_copyin_counter = 0; // init counter for possible reuse
5703  #endif
5704    // Do not reset pointer to parent team to NULL for hot teams.
5705  
5706    /* if we are non-hot team, release our threads */
5707    if (!use_hot_team) {
5708      if (__kmp_tasking_mode != tskm_immediate_exec) {
5709        // Wait for threads to reach reapable state
5710        for (f = 1; f < team->t.t_nproc; ++f) {
5711          KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5712          kmp_info_t *th = team->t.t_threads[f];
5713          volatile kmp_uint32 *state = &th->th.th_reap_state;
5714          while (*state != KMP_SAFE_TO_REAP) {
5715  #if KMP_OS_WINDOWS
5716            // On Windows a thread can be killed at any time, check this
5717            DWORD ecode;
5718            if (!__kmp_is_thread_alive(th, &ecode)) {
5719              *state = KMP_SAFE_TO_REAP; // reset the flag for dead thread
5720              break;
5721            }
5722  #endif
5723            // first check if thread is sleeping
5724            if (th->th.th_sleep_loc)
5725              __kmp_null_resume_wrapper(th);
5726            KMP_CPU_PAUSE();
5727          }
5728        }
5729  
5730        // Delete task teams
5731        int tt_idx;
5732        for (tt_idx = 0; tt_idx < 2; ++tt_idx) {
5733          kmp_task_team_t *task_team = team->t.t_task_team[tt_idx];
5734          if (task_team != NULL) {
5735            for (f = 0; f < team->t.t_nproc; ++f) { // threads unref task teams
5736              KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5737              team->t.t_threads[f]->th.th_task_team = NULL;
5738            }
5739            KA_TRACE(
5740                20,
5741                ("__kmp_free_team: T#%d deactivating task_team %p on team %d\n",
5742                 __kmp_get_gtid(), task_team, team->t.t_id));
5743  #if KMP_NESTED_HOT_TEAMS
5744            __kmp_free_task_team(master, task_team);
5745  #endif
5746            team->t.t_task_team[tt_idx] = NULL;
5747          }
5748        }
5749      }
5750  
5751      // Before clearing parent pointer, check if nested_nth list should be freed
5752      if (team->t.t_nested_nth && team->t.t_nested_nth != &__kmp_nested_nth &&
5753          team->t.t_nested_nth != team->t.t_parent->t.t_nested_nth) {
5754        KMP_INTERNAL_FREE(team->t.t_nested_nth->nth);
5755        KMP_INTERNAL_FREE(team->t.t_nested_nth);
5756      }
5757      team->t.t_nested_nth = NULL;
5758  
5759      // Reset pointer to parent team only for non-hot teams.
5760      team->t.t_parent = NULL;
5761      team->t.t_level = 0;
5762      team->t.t_active_level = 0;
5763  
5764      /* free the worker threads */
5765      for (f = 1; f < team->t.t_nproc; ++f) {
5766        KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5767        if (__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5768          KMP_COMPARE_AND_STORE_ACQ32(&(team->t.t_threads[f]->th.th_used_in_team),
5769                                      1, 2);
5770        }
5771        __kmp_free_thread(team->t.t_threads[f]);
5772      }
5773  
5774      if (__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5775        if (team->t.b) {
5776          // wake up thread at old location
5777          team->t.b->go_release();
5778          if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
5779            for (f = 1; f < team->t.t_nproc; ++f) {
5780              if (team->t.b->sleep[f].sleep) {
5781                __kmp_atomic_resume_64(
5782                    team->t.t_threads[f]->th.th_info.ds.ds_gtid,
5783                    (kmp_atomic_flag_64<> *)NULL);
5784              }
5785            }
5786          }
5787          // Wait for threads to be removed from team
5788          for (int f = 1; f < team->t.t_nproc; ++f) {
5789            while (team->t.t_threads[f]->th.th_used_in_team.load() != 0)
5790              KMP_CPU_PAUSE();
5791          }
5792        }
5793      }
5794  
5795      for (f = 1; f < team->t.t_nproc; ++f) {
5796        team->t.t_threads[f] = NULL;
5797      }
5798  
5799      if (team->t.t_max_nproc > 1 &&
5800          __kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5801        distributedBarrier::deallocate(team->t.b);
5802        team->t.b = NULL;
5803      }
5804      /* put the team back in the team pool */
5805      /* TODO limit size of team pool, call reap_team if pool too large */
5806      team->t.t_next_pool = CCAST(kmp_team_t *, __kmp_team_pool);
5807      __kmp_team_pool = (volatile kmp_team_t *)team;
5808    } else { // Check if team was created for primary threads in teams construct
5809      // See if first worker is a CG root
5810      KMP_DEBUG_ASSERT(team->t.t_threads[1] &&
5811                       team->t.t_threads[1]->th.th_cg_roots);
5812      if (team->t.t_threads[1]->th.th_cg_roots->cg_root == team->t.t_threads[1]) {
5813        // Clean up the CG root nodes on workers so that this team can be re-used
5814        for (f = 1; f < team->t.t_nproc; ++f) {
5815          kmp_info_t *thr = team->t.t_threads[f];
5816          KMP_DEBUG_ASSERT(thr && thr->th.th_cg_roots &&
5817                           thr->th.th_cg_roots->cg_root == thr);
5818          // Pop current CG root off list
5819          kmp_cg_root_t *tmp = thr->th.th_cg_roots;
5820          thr->th.th_cg_roots = tmp->up;
5821          KA_TRACE(100, ("__kmp_free_team: Thread %p popping node %p and moving"
5822                         " up to node %p. cg_nthreads was %d\n",
5823                         thr, tmp, thr->th.th_cg_roots, tmp->cg_nthreads));
5824          int i = tmp->cg_nthreads--;
5825          if (i == 1) {
5826            __kmp_free(tmp); // free CG if we are the last thread in it
5827          }
5828          // Restore current task's thread_limit from CG root
5829          if (thr->th.th_cg_roots)
5830            thr->th.th_current_task->td_icvs.thread_limit =
5831                thr->th.th_cg_roots->cg_thread_limit;
5832        }
5833      }
5834    }
5835  
5836    KMP_MB();
5837  }
5838  
5839  /* reap the team.  destroy it, reclaim all its resources and free its memory */
5840  kmp_team_t *__kmp_reap_team(kmp_team_t *team) {
5841    kmp_team_t *next_pool = team->t.t_next_pool;
5842  
5843    KMP_DEBUG_ASSERT(team);
5844    KMP_DEBUG_ASSERT(team->t.t_dispatch);
5845    KMP_DEBUG_ASSERT(team->t.t_disp_buffer);
5846    KMP_DEBUG_ASSERT(team->t.t_threads);
5847    KMP_DEBUG_ASSERT(team->t.t_argv);
5848  
5849    /* TODO clean the threads that are a part of this? */
5850  
5851    /* free stuff */
5852    __kmp_free_team_arrays(team);
5853    if (team->t.t_argv != &team->t.t_inline_argv[0])
5854      __kmp_free((void *)team->t.t_argv);
5855    __kmp_free(team);
5856  
5857    KMP_MB();
5858    return next_pool;
5859  }
5860  
5861  // Free the thread.  Don't reap it, just place it on the pool of available
5862  // threads.
5863  //
5864  // Changes for Quad issue 527845: We need a predictable OMP tid <-> gtid
5865  // binding for the affinity mechanism to be useful.
5866  //
5867  // Now, we always keep the free list (__kmp_thread_pool) sorted by gtid.
5868  // However, we want to avoid a potential performance problem by always
5869  // scanning through the list to find the correct point at which to insert
5870  // the thread (potential N**2 behavior).  To do this we keep track of the
5871  // last place a thread struct was inserted (__kmp_thread_pool_insert_pt).
5872  // With single-level parallelism, threads will always be added to the tail
5873  // of the list, kept track of by __kmp_thread_pool_insert_pt.  With nested
5874  // parallelism, all bets are off and we may need to scan through the entire
5875  // free list.
5876  //
5877  // This change also has a potentially large performance benefit, for some
5878  // applications.  Previously, as threads were freed from the hot team, they
5879  // would be placed back on the free list in inverse order.  If the hot team
5880  // grew back to it's original size, then the freed thread would be placed
5881  // grew back to its original size, then the freed threads would be placed
5882  // locality problems on programs where the size of the hot team regularly
5883  // grew and shrunk.
5884  //
5885  // Now, for single-level parallelism, the OMP tid is always == gtid.
5886  void __kmp_free_thread(kmp_info_t *this_th) {
5887    int gtid;
5888    kmp_info_t **scan;
5889  
5890    KA_TRACE(20, ("__kmp_free_thread: T#%d putting T#%d back on free pool.\n",
5891                  __kmp_get_gtid(), this_th->th.th_info.ds.ds_gtid));
5892  
5893    KMP_DEBUG_ASSERT(this_th);
5894  
5895    // When moving thread to pool, switch thread to wait on own b_go flag, and
5896    // uninitialized (NULL team).
5897    int b;
5898    kmp_balign_t *balign = this_th->th.th_bar;
5899    for (b = 0; b < bs_last_barrier; ++b) {
5900      if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG)
5901        balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG;
5902      balign[b].bb.team = NULL;
5903      balign[b].bb.leaf_kids = 0;
5904    }
5905    this_th->th.th_task_state = 0;
5906    this_th->th.th_reap_state = KMP_SAFE_TO_REAP;
5907  
5908    /* put thread back on the free pool */
5909    TCW_PTR(this_th->th.th_team, NULL);
5910    TCW_PTR(this_th->th.th_root, NULL);
5911    TCW_PTR(this_th->th.th_dispatch, NULL); /* NOT NEEDED */
5912  
5913    while (this_th->th.th_cg_roots) {
5914      this_th->th.th_cg_roots->cg_nthreads--;
5915      KA_TRACE(100, ("__kmp_free_thread: Thread %p decrement cg_nthreads on node"
5916                     " %p of thread  %p to %d\n",
5917                     this_th, this_th->th.th_cg_roots,
5918                     this_th->th.th_cg_roots->cg_root,
5919                     this_th->th.th_cg_roots->cg_nthreads));
5920      kmp_cg_root_t *tmp = this_th->th.th_cg_roots;
5921      if (tmp->cg_root == this_th) { // Thread is a cg_root
5922        KMP_DEBUG_ASSERT(tmp->cg_nthreads == 0);
5923        KA_TRACE(
5924            5, ("__kmp_free_thread: Thread %p freeing node %p\n", this_th, tmp));
5925        this_th->th.th_cg_roots = tmp->up;
5926        __kmp_free(tmp);
5927      } else { // Worker thread
5928        if (tmp->cg_nthreads == 0) { // last thread leaves contention group
5929          __kmp_free(tmp);
5930        }
5931        this_th->th.th_cg_roots = NULL;
5932        break;
5933      }
5934    }
5935  
5936    /* If the implicit task assigned to this thread can be used by other threads,
5937     * multiple threads can share the data and try to free the task at
5938     * __kmp_reap_thread at exit. This duplicate use of the task data happens
5939     * with higher probability when the hot team is disabled, but it can occur
5940     * even when the hot team is enabled. */
5941    __kmp_free_implicit_task(this_th);
5942    this_th->th.th_current_task = NULL;
5943  
5944    // If the __kmp_thread_pool_insert_pt is already past the new insert
5945    // point, then we need to re-scan the entire list.
5946    gtid = this_th->th.th_info.ds.ds_gtid;
5947    if (__kmp_thread_pool_insert_pt != NULL) {
5948      KMP_DEBUG_ASSERT(__kmp_thread_pool != NULL);
5949      if (__kmp_thread_pool_insert_pt->th.th_info.ds.ds_gtid > gtid) {
5950        __kmp_thread_pool_insert_pt = NULL;
5951      }
5952    }
5953  
5954    // Scan down the list to find the place to insert the thread.
5955    // scan is the address of a link in the list, possibly the address of
5956    // __kmp_thread_pool itself.
5957    //
5958    // In the absence of nested parallelism, the for loop will have 0 iterations.
5959    if (__kmp_thread_pool_insert_pt != NULL) {
5960      scan = &(__kmp_thread_pool_insert_pt->th.th_next_pool);
5961    } else {
5962      scan = CCAST(kmp_info_t **, &__kmp_thread_pool);
5963    }
5964    for (; (*scan != NULL) && ((*scan)->th.th_info.ds.ds_gtid < gtid);
5965         scan = &((*scan)->th.th_next_pool))
5966      ;
5967  
5968    // Insert the new element on the list, and set __kmp_thread_pool_insert_pt
5969    // to its address.
5970    TCW_PTR(this_th->th.th_next_pool, *scan);
5971    __kmp_thread_pool_insert_pt = *scan = this_th;
5972    KMP_DEBUG_ASSERT((this_th->th.th_next_pool == NULL) ||
5973                     (this_th->th.th_info.ds.ds_gtid <
5974                      this_th->th.th_next_pool->th.th_info.ds.ds_gtid));
5975    TCW_4(this_th->th.th_in_pool, TRUE);
5976    __kmp_suspend_initialize_thread(this_th);
5977    __kmp_lock_suspend_mx(this_th);
5978    if (this_th->th.th_active == TRUE) {
5979      KMP_ATOMIC_INC(&__kmp_thread_pool_active_nth);
5980      this_th->th.th_active_in_pool = TRUE;
5981    }
5982  #if KMP_DEBUG
5983    else {
5984      KMP_DEBUG_ASSERT(this_th->th.th_active_in_pool == FALSE);
5985    }
5986  #endif
5987    __kmp_unlock_suspend_mx(this_th);
5988  
5989    TCW_4(__kmp_nth, __kmp_nth - 1);
5990  
5991  #ifdef KMP_ADJUST_BLOCKTIME
5992    /* Adjust blocktime back to user setting or default if necessary */
5993    /* Middle initialization might never have occurred                */
5994    if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
5995      KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
5996      if (__kmp_nth <= __kmp_avail_proc) {
5997        __kmp_zero_bt = FALSE;
5998      }
5999    }
6000  #endif /* KMP_ADJUST_BLOCKTIME */
6001  
6002    KMP_MB();
6003  }
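
// Minimal sketch of the sorted-insert-with-cached-insertion-point idea used in
// __kmp_free_thread above (assumed, simplified types, not the runtime's own
// data structures): keep a singly linked pool ordered by key and remember the
// last insertion point so that monotonically increasing keys append in O(1)
// instead of rescanning the whole list.
//
//   struct node { int key; node *next; };
//   static node *pool = nullptr;      // head of the sorted pool
//   static node *insert_pt = nullptr; // last place a node was inserted
//
//   static void pool_insert(node *n) {
//     if (insert_pt && insert_pt->key > n->key)
//       insert_pt = nullptr;                // cached point is past us; rescan
//     node **scan = insert_pt ? &insert_pt->next : &pool;
//     while (*scan && (*scan)->key < n->key)
//       scan = &(*scan)->next;              // walk to the first larger key
//     n->next = *scan;                      // splice in, keeping the order
//     insert_pt = *scan = n;                // remember where we inserted
//   }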
6004  
6005  /* ------------------------------------------------------------------------ */
6006  
6007  void *__kmp_launch_thread(kmp_info_t *this_thr) {
6008  #if OMP_PROFILING_SUPPORT
6009    ProfileTraceFile = getenv("LIBOMPTARGET_PROFILE");
6010    // TODO: add a configuration option for time granularity
6011    if (ProfileTraceFile)
6012      llvm::timeTraceProfilerInitialize(500 /* us */, "libomptarget");
6013  #endif
6014  
6015    int gtid = this_thr->th.th_info.ds.ds_gtid;
6016    /*    void                 *stack_data;*/
6017    kmp_team_t **volatile pteam;
6018  
6019    KMP_MB();
6020    KA_TRACE(10, ("__kmp_launch_thread: T#%d start\n", gtid));
6021  
6022    if (__kmp_env_consistency_check) {
6023      this_thr->th.th_cons = __kmp_allocate_cons_stack(gtid); // ATT: Memory leak?
6024    }
6025  
6026  #if OMPD_SUPPORT
6027    if (ompd_state & OMPD_ENABLE_BP)
6028      ompd_bp_thread_begin();
6029  #endif
6030  
6031  #if OMPT_SUPPORT
6032    ompt_data_t *thread_data = nullptr;
6033    if (ompt_enabled.enabled) {
6034      thread_data = &(this_thr->th.ompt_thread_info.thread_data);
6035      *thread_data = ompt_data_none;
6036  
6037      this_thr->th.ompt_thread_info.state = ompt_state_overhead;
6038      this_thr->th.ompt_thread_info.wait_id = 0;
6039      this_thr->th.ompt_thread_info.idle_frame = OMPT_GET_FRAME_ADDRESS(0);
6040      this_thr->th.ompt_thread_info.parallel_flags = 0;
6041      if (ompt_enabled.ompt_callback_thread_begin) {
6042        ompt_callbacks.ompt_callback(ompt_callback_thread_begin)(
6043            ompt_thread_worker, thread_data);
6044      }
6045      this_thr->th.ompt_thread_info.state = ompt_state_idle;
6046    }
6047  #endif
6048  
6049    /* This is the place where threads wait for work */
6050    while (!TCR_4(__kmp_global.g.g_done)) {
6051      KMP_DEBUG_ASSERT(this_thr == __kmp_threads[gtid]);
6052      KMP_MB();
6053  
6054      /* wait for work to do */
6055      KA_TRACE(20, ("__kmp_launch_thread: T#%d waiting for work\n", gtid));
6056  
6057      /* No tid yet since not part of a team */
6058      __kmp_fork_barrier(gtid, KMP_GTID_DNE);
6059  
6060  #if OMPT_SUPPORT
6061      if (ompt_enabled.enabled) {
6062        this_thr->th.ompt_thread_info.state = ompt_state_overhead;
6063      }
6064  #endif
6065  
6066      pteam = &this_thr->th.th_team;
6067  
6068      /* have we been allocated? */
6069      if (TCR_SYNC_PTR(*pteam) && !TCR_4(__kmp_global.g.g_done)) {
6070        /* we were just woken up, so run our new task */
6071        if (TCR_SYNC_PTR((*pteam)->t.t_pkfn) != NULL) {
6072          int rc;
6073          KA_TRACE(20,
6074                   ("__kmp_launch_thread: T#%d(%d:%d) invoke microtask = %p\n",
6075                    gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid),
6076                    (*pteam)->t.t_pkfn));
6077  
6078          updateHWFPControl(*pteam);
6079  
6080  #if OMPT_SUPPORT
6081          if (ompt_enabled.enabled) {
6082            this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
6083          }
6084  #endif
6085  
6086          rc = (*pteam)->t.t_invoke(gtid);
6087          KMP_ASSERT(rc);
6088  
6089          KMP_MB();
6090          KA_TRACE(20, ("__kmp_launch_thread: T#%d(%d:%d) done microtask = %p\n",
6091                        gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid),
6092                        (*pteam)->t.t_pkfn));
6093        }
6094  #if OMPT_SUPPORT
6095        if (ompt_enabled.enabled) {
6096          /* no frame set while outside task */
6097          __ompt_get_task_info_object(0)->frame.exit_frame = ompt_data_none;
6098  
6099          this_thr->th.ompt_thread_info.state = ompt_state_overhead;
6100        }
6101  #endif
6102        /* join barrier after parallel region */
6103        __kmp_join_barrier(gtid);
6104      }
6105    }
6106  
6107  #if OMPD_SUPPORT
6108    if (ompd_state & OMPD_ENABLE_BP)
6109      ompd_bp_thread_end();
6110  #endif
6111  
6112  #if OMPT_SUPPORT
6113    if (ompt_enabled.ompt_callback_thread_end) {
6114      ompt_callbacks.ompt_callback(ompt_callback_thread_end)(thread_data);
6115    }
6116  #endif
6117  
6118    this_thr->th.th_task_team = NULL;
6119    /* run the destructors for the threadprivate data for this thread */
6120    __kmp_common_destroy_gtid(gtid);
6121  
6122    KA_TRACE(10, ("__kmp_launch_thread: T#%d done\n", gtid));
6123    KMP_MB();
6124  
6125  #if OMP_PROFILING_SUPPORT
6126    llvm::timeTraceProfilerFinishThread();
6127  #endif
6128    return this_thr;
6129  }
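
// Rough shape of the worker loop in __kmp_launch_thread above, for orientation
// only (hand-written pseudocode, not an alternative implementation):
//
//   while (!shutdown) {
//     wait_at_fork_barrier();     // parked until the primary releases workers
//     if (assigned_to_a_team) {
//       if (team_has_microtask)
//         run_microtask();        // (*pteam)->t.t_invoke(gtid)
//       wait_at_join_barrier();   // rejoin before the next parallel region
//     }
//   }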
6130  
6131  /* ------------------------------------------------------------------------ */
6132  
6133  void __kmp_internal_end_dest(void *specific_gtid) {
6134    // Make sure no significant bits are lost
6135    int gtid;
6136    __kmp_type_convert((kmp_intptr_t)specific_gtid - 1, &gtid);
6137  
6138    KA_TRACE(30, ("__kmp_internal_end_dest: T#%d\n", gtid));
6139    /* NOTE: the gtid is stored as gtid+1 in the thread-local-storage
6140     * this is because 0 is reserved for the nothing-stored case */
6141  
6142    __kmp_internal_end_thread(gtid);
6143  }
6144  
6145  #if KMP_OS_UNIX && KMP_DYNAMIC_LIB
6146  
6147  __attribute__((destructor)) void __kmp_internal_end_dtor(void) {
6148    __kmp_internal_end_atexit();
6149  }
6150  
6151  #endif
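
// Background sketch (general compiler facility, not specific to this file): a
// function marked __attribute__((destructor)) in a shared library runs when
// the library is unloaded or the process exits, which is how the runtime gets
// a shutdown hook without the application calling anything explicitly:
//
//   __attribute__((destructor)) static void on_unload(void) {
//     /* release library-wide resources here */
//   }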
6152  
6153  /* [Windows] josh: when the atexit handler is called, there may still be more
6154     than one thread alive */
6155  void __kmp_internal_end_atexit(void) {
6156    KA_TRACE(30, ("__kmp_internal_end_atexit\n"));
6157    /* [Windows]
6158       josh: ideally, we want to completely shutdown the library in this atexit
6159       handler, but stat code that depends on thread specific data for gtid fails
6160       because that data becomes unavailable at some point during the shutdown, so
6161       we call __kmp_internal_end_thread instead. We should eventually remove the
6162       dependency on __kmp_get_specific_gtid in the stat code and use
6163       __kmp_internal_end_library to cleanly shutdown the library.
6164  
6165       // TODO: Can some of this comment about GVS be removed?
6166       I suspect that the offending stat code is executed when the calling thread
6167       tries to clean up a dead root thread's data structures, resulting in GVS
6168       code trying to close the GVS structures for that thread, but since the stat
6169       code uses __kmp_get_specific_gtid to get the gtid with the assumption that
6170       the calling thread is cleaning up itself instead of another thread, it gets
6171       confused. This happens because allowing a thread to unregister and clean up
6172       another thread is a recent modification for addressing an issue.
6173       Based on the current design (20050722), a thread may end up
6174       trying to unregister another thread only if thread death does not trigger
6175       the calling of __kmp_internal_end_thread.  For Linux* OS, there is the
6176       thread specific data destructor function to detect thread death. For
6177       Windows dynamic, there is DllMain(THREAD_DETACH). For Windows static, there
6178       is nothing.  Thus, the workaround is applicable only for Windows static
6179       stat library. */
6180    __kmp_internal_end_library(-1);
6181  #if KMP_OS_WINDOWS
6182    __kmp_close_console();
6183  #endif
6184  }
6185  
6186  static void __kmp_reap_thread(kmp_info_t *thread, int is_root) {
6187    // It is assumed __kmp_forkjoin_lock is acquired.
6188  
6189    int gtid;
6190  
6191    KMP_DEBUG_ASSERT(thread != NULL);
6192  
6193    gtid = thread->th.th_info.ds.ds_gtid;
6194  
6195    if (!is_root) {
6196      if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
6197        /* Assume the threads are at the fork barrier here */
6198        KA_TRACE(
6199            20, ("__kmp_reap_thread: releasing T#%d from fork barrier for reap\n",
6200                 gtid));
6201        if (__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
6202          while (
6203              !KMP_COMPARE_AND_STORE_ACQ32(&(thread->th.th_used_in_team), 0, 3))
6204            KMP_CPU_PAUSE();
6205          __kmp_resume_32(gtid, (kmp_flag_32<false, false> *)NULL);
6206        } else {
6207          /* Need release fence here to prevent seg faults for tree forkjoin
6208             barrier (GEH) */
6209          kmp_flag_64<> flag(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go,
6210                             thread);
6211          __kmp_release_64(&flag);
6212        }
6213      }
6214  
6215      // Terminate OS thread.
6216      __kmp_reap_worker(thread);
6217  
6218      // The thread was killed asynchronously.  If it was actively
6219      // spinning in the thread pool, decrement the global count.
6220      //
6221      // There is a small timing hole here - if the worker thread was just waking
6222      // up after sleeping in the pool, had reset its th_active_in_pool flag but
6223      // not decremented the global counter __kmp_thread_pool_active_nth yet, then
6224      // the global counter might not get updated.
6225      //
6226      // Currently, this can only happen as the library is unloaded,
6227      // so there are no harmful side effects.
6228      if (thread->th.th_active_in_pool) {
6229        thread->th.th_active_in_pool = FALSE;
6230        KMP_ATOMIC_DEC(&__kmp_thread_pool_active_nth);
6231        KMP_DEBUG_ASSERT(__kmp_thread_pool_active_nth >= 0);
6232      }
6233    }
6234  
6235    __kmp_free_implicit_task(thread);
6236  
6237  // Free the fast memory for tasking
6238  #if USE_FAST_MEMORY
6239    __kmp_free_fast_memory(thread);
6240  #endif /* USE_FAST_MEMORY */
6241  
6242    __kmp_suspend_uninitialize_thread(thread);
6243  
6244    KMP_DEBUG_ASSERT(__kmp_threads[gtid] == thread);
6245    TCW_SYNC_PTR(__kmp_threads[gtid], NULL);
6246  
6247    --__kmp_all_nth;
6248    // __kmp_nth was decremented when thread is added to the pool.
6249  
6250  #ifdef KMP_ADJUST_BLOCKTIME
6251    /* Adjust blocktime back to user setting or default if necessary */
6252    /* Middle initialization might never have occurred                */
6253    if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
6254      KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
6255      if (__kmp_nth <= __kmp_avail_proc) {
6256        __kmp_zero_bt = FALSE;
6257      }
6258    }
6259  #endif /* KMP_ADJUST_BLOCKTIME */
6260  
6261    /* free the memory being used */
6262    if (__kmp_env_consistency_check) {
6263      if (thread->th.th_cons) {
6264        __kmp_free_cons_stack(thread->th.th_cons);
6265        thread->th.th_cons = NULL;
6266      }
6267    }
6268  
6269    if (thread->th.th_pri_common != NULL) {
6270      __kmp_free(thread->th.th_pri_common);
6271      thread->th.th_pri_common = NULL;
6272    }
6273  
6274  #if KMP_USE_BGET
6275    if (thread->th.th_local.bget_data != NULL) {
6276      __kmp_finalize_bget(thread);
6277    }
6278  #endif
6279  
6280  #if KMP_AFFINITY_SUPPORTED
6281    if (thread->th.th_affin_mask != NULL) {
6282      KMP_CPU_FREE(thread->th.th_affin_mask);
6283      thread->th.th_affin_mask = NULL;
6284    }
6285  #endif /* KMP_AFFINITY_SUPPORTED */
6286  
6287  #if KMP_USE_HIER_SCHED
6288    if (thread->th.th_hier_bar_data != NULL) {
6289      __kmp_free(thread->th.th_hier_bar_data);
6290      thread->th.th_hier_bar_data = NULL;
6291    }
6292  #endif
6293  
6294    __kmp_reap_team(thread->th.th_serial_team);
6295    thread->th.th_serial_team = NULL;
6296    __kmp_free(thread);
6297  
6298    KMP_MB();
6299  
6300  } // __kmp_reap_thread
6301  
6302  static void __kmp_itthash_clean(kmp_info_t *th) {
6303  #if USE_ITT_NOTIFY
6304    if (__kmp_itt_region_domains.count > 0) {
6305      for (int i = 0; i < KMP_MAX_FRAME_DOMAINS; ++i) {
6306        kmp_itthash_entry_t *bucket = __kmp_itt_region_domains.buckets[i];
6307        while (bucket) {
6308          kmp_itthash_entry_t *next = bucket->next_in_bucket;
6309          __kmp_thread_free(th, bucket);
6310          bucket = next;
6311        }
6312      }
6313    }
6314    if (__kmp_itt_barrier_domains.count > 0) {
6315      for (int i = 0; i < KMP_MAX_FRAME_DOMAINS; ++i) {
6316        kmp_itthash_entry_t *bucket = __kmp_itt_barrier_domains.buckets[i];
6317        while (bucket) {
6318          kmp_itthash_entry_t *next = bucket->next_in_bucket;
6319          __kmp_thread_free(th, bucket);
6320          bucket = next;
6321        }
6322      }
6323    }
6324  #endif
6325  }
6326  
6327  static void __kmp_internal_end(void) {
6328    int i;
6329  
6330    /* First, unregister the library */
6331    __kmp_unregister_library();
6332  
6333  #if KMP_OS_WINDOWS
6334    /* In Win static library, we can't tell when a root actually dies, so we
6335       reclaim the data structures for any root threads that have died but not
6336       unregistered themselves, in order to shut down cleanly.
6337       In Win dynamic library we also can't tell when a thread dies.  */
6338    __kmp_reclaim_dead_roots(); // AC: moved here to always clean resources of
6339  // dead roots
6340  #endif
6341  
6342    for (i = 0; i < __kmp_threads_capacity; i++)
6343      if (__kmp_root[i])
6344        if (__kmp_root[i]->r.r_active)
6345          break;
6346    KMP_MB(); /* Flush all pending memory write invalidates.  */
6347    TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6348  
6349    if (i < __kmp_threads_capacity) {
6350  #if KMP_USE_MONITOR
6351      // 2009-09-08 (lev): Other alive roots found. Why do we kill the monitor??
6352      KMP_MB(); /* Flush all pending memory write invalidates.  */
6353  
6354      // Need to check that monitor was initialized before reaping it. If we are
6355      // called from __kmp_atfork_child (which sets __kmp_init_parallel = 0), then
6356      // __kmp_monitor will appear to contain valid data, but it is only valid in
6357      // the parent process, not the child.
6358      // New behavior (201008): instead of keying off of the flag
6359      // __kmp_init_parallel, the monitor thread creation is keyed off
6360      // of the new flag __kmp_init_monitor.
6361      __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
6362      if (TCR_4(__kmp_init_monitor)) {
6363        __kmp_reap_monitor(&__kmp_monitor);
6364        TCW_4(__kmp_init_monitor, 0);
6365      }
6366      __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
6367      KA_TRACE(10, ("__kmp_internal_end: monitor reaped\n"));
6368  #endif // KMP_USE_MONITOR
6369    } else {
6370  /* TODO move this to cleanup code */
6371  #ifdef KMP_DEBUG
6372      /* make sure that everything has properly ended */
6373      for (i = 0; i < __kmp_threads_capacity; i++) {
6374        if (__kmp_root[i]) {
6375          //                    KMP_ASSERT( ! KMP_UBER_GTID( i ) );         // AC:
6376          //                    there can be uber threads alive here
6377          KMP_ASSERT(!__kmp_root[i]->r.r_active); // TODO: can they be active?
6378        }
6379      }
6380  #endif
6381  
6382      KMP_MB();
6383  
6384      // Reap the worker threads.
6385      // This is valid for now, but be careful if threads are reaped sooner.
6386      while (__kmp_thread_pool != NULL) { // Loop thru all the threads in the pool.
6387        // Get the next thread from the pool.
6388        kmp_info_t *thread = CCAST(kmp_info_t *, __kmp_thread_pool);
6389        __kmp_thread_pool = thread->th.th_next_pool;
6390        // Reap it.
6391        KMP_DEBUG_ASSERT(thread->th.th_reap_state == KMP_SAFE_TO_REAP);
6392        thread->th.th_next_pool = NULL;
6393        thread->th.th_in_pool = FALSE;
6394        __kmp_reap_thread(thread, 0);
6395      }
6396      __kmp_thread_pool_insert_pt = NULL;
6397  
6398      // Reap teams.
6399      while (__kmp_team_pool != NULL) { // Loop thru all the teams in the pool.
6400        // Get the next team from the pool.
6401        kmp_team_t *team = CCAST(kmp_team_t *, __kmp_team_pool);
6402        __kmp_team_pool = team->t.t_next_pool;
6403        // Reap it.
6404        team->t.t_next_pool = NULL;
6405        __kmp_reap_team(team);
6406      }
6407  
6408      __kmp_reap_task_teams();
6409  
6410  #if KMP_OS_UNIX
6411      // Threads that are not reaped should not access any resources since they
6412      // are going to be deallocated soon, so the shutdown sequence should wait
6413      // until all threads either exit the final spin-waiting loop or begin
6414      // sleeping after the given blocktime.
6415      for (i = 0; i < __kmp_threads_capacity; i++) {
6416        kmp_info_t *thr = __kmp_threads[i];
6417        while (thr && KMP_ATOMIC_LD_ACQ(&thr->th.th_blocking))
6418          KMP_CPU_PAUSE();
6419      }
6420  #endif
6421  
6422      for (i = 0; i < __kmp_threads_capacity; ++i) {
6423        // TBD: Add some checking...
6424        // Something like KMP_DEBUG_ASSERT( __kmp_thread[ i ] == NULL );
6425      }
6426  
6427      /* Make sure all threadprivate destructors get run by joining with all
6428         worker threads before resetting this flag */
6429      TCW_SYNC_4(__kmp_init_common, FALSE);
6430  
6431      KA_TRACE(10, ("__kmp_internal_end: all workers reaped\n"));
6432      KMP_MB();
6433  
6434  #if KMP_USE_MONITOR
6435      // See note above: One of the possible fixes for CQ138434 / CQ140126
6436      //
6437      // FIXME: push both code fragments down and CSE them?
6438      // push them into __kmp_cleanup() ?
6439      __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
6440      if (TCR_4(__kmp_init_monitor)) {
6441        __kmp_reap_monitor(&__kmp_monitor);
6442        TCW_4(__kmp_init_monitor, 0);
6443      }
6444      __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
6445      KA_TRACE(10, ("__kmp_internal_end: monitor reaped\n"));
6446  #endif
6447    } /* else !__kmp_global.t_active */
6448    TCW_4(__kmp_init_gtid, FALSE);
6449    KMP_MB(); /* Flush all pending memory write invalidates.  */
6450  
6451    __kmp_cleanup();
6452  #if OMPT_SUPPORT
6453    ompt_fini();
6454  #endif
6455  }
6456  
6457  void __kmp_internal_end_library(int gtid_req) {
6458    /* if we have already cleaned up, don't try again, it wouldn't be pretty */
6459    /* this shouldn't be a race condition because __kmp_internal_end() is the
6460       only place to clear __kmp_serial_init */
6461    /* we'll check this later too, after we get the lock */
6462    // 2009-09-06: We do not set g_abort without setting g_done. This check looks
6463    // redundant, because the next check will work in any case.
6464    if (__kmp_global.g.g_abort) {
6465      KA_TRACE(11, ("__kmp_internal_end_library: abort, exiting\n"));
6466      /* TODO abort? */
6467      return;
6468    }
6469    if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6470      KA_TRACE(10, ("__kmp_internal_end_library: already finished\n"));
6471      return;
6472    }
6473  
6474    // If hidden helper team has been initialized, we need to deinit it
6475    if (TCR_4(__kmp_init_hidden_helper) &&
6476        !TCR_4(__kmp_hidden_helper_team_done)) {
6477      TCW_SYNC_4(__kmp_hidden_helper_team_done, TRUE);
6478      // First release the main thread to let it continue its work
6479      __kmp_hidden_helper_main_thread_release();
6480      // Wait until the hidden helper team has been destroyed
6481      __kmp_hidden_helper_threads_deinitz_wait();
6482    }
6483  
6484    KMP_MB(); /* Flush all pending memory write invalidates.  */
6485    /* find out who we are and what we should do */
6486    {
6487      int gtid = (gtid_req >= 0) ? gtid_req : __kmp_gtid_get_specific();
6488      KA_TRACE(
6489          10, ("__kmp_internal_end_library: enter T#%d  (%d)\n", gtid, gtid_req));
6490      if (gtid == KMP_GTID_SHUTDOWN) {
6491        KA_TRACE(10, ("__kmp_internal_end_library: !__kmp_init_runtime, system "
6492                      "already shutdown\n"));
6493        return;
6494      } else if (gtid == KMP_GTID_MONITOR) {
6495        KA_TRACE(10, ("__kmp_internal_end_library: monitor thread, gtid not "
6496                      "registered, or system shutdown\n"));
6497        return;
6498      } else if (gtid == KMP_GTID_DNE) {
6499        KA_TRACE(10, ("__kmp_internal_end_library: gtid not registered or system "
6500                      "shutdown\n"));
6501        /* we don't know who we are, but we may still shutdown the library */
6502      } else if (KMP_UBER_GTID(gtid)) {
6503        /* unregister ourselves as an uber thread.  gtid is no longer valid */
6504        if (__kmp_root[gtid]->r.r_active) {
6505          __kmp_global.g.g_abort = -1;
6506          TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6507          __kmp_unregister_library();
6508          KA_TRACE(10,
6509                   ("__kmp_internal_end_library: root still active, abort T#%d\n",
6510                    gtid));
6511          return;
6512        } else {
6513          __kmp_itthash_clean(__kmp_threads[gtid]);
6514          KA_TRACE(
6515              10,
6516              ("__kmp_internal_end_library: unregistering sibling T#%d\n", gtid));
6517          __kmp_unregister_root_current_thread(gtid);
6518        }
6519      } else {
6520  /* worker threads may call this function through the atexit handler, if they
6521   * call exit() */
6522  /* For now, skip the usual subsequent processing and just dump the debug buffer.
6523     TODO: do a thorough shutdown instead */
6524  #ifdef DUMP_DEBUG_ON_EXIT
6525        if (__kmp_debug_buf)
6526          __kmp_dump_debug_buffer();
6527  #endif
6528        // We added an unregister-library call here when we switched to shared
6529        // memory on Linux; without it, many files would be left behind in
6530        // /dev/shm. Clean up the shared memory file before exiting.
6531        __kmp_unregister_library();
6532        return;
6533      }
6534    }
6535    /* synchronize the termination process */
6536    __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6537  
6538    /* have we already finished */
6539    if (__kmp_global.g.g_abort) {
6540      KA_TRACE(10, ("__kmp_internal_end_library: abort, exiting\n"));
6541      /* TODO abort? */
6542      __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6543      return;
6544    }
6545    if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6546      __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6547      return;
6548    }
6549  
6550    /* We need this lock to enforce mutual exclusion between this reading of
6551       __kmp_threads_capacity and the writing by __kmp_register_root.
6552       Alternatively, we can use a counter of roots that is atomically updated by
6553       __kmp_get_global_thread_id_reg, __kmp_do_serial_initialize and
6554       __kmp_internal_end_*.  */
6555    __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
6556  
6557    /* now we can safely conduct the actual termination */
6558    __kmp_internal_end();
6559  
6560    __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6561    __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6562  
6563    KA_TRACE(10, ("__kmp_internal_end_library: exit\n"));
6564  
6565  #ifdef DUMP_DEBUG_ON_EXIT
6566    if (__kmp_debug_buf)
6567      __kmp_dump_debug_buffer();
6568  #endif
6569  
6570  #if KMP_OS_WINDOWS
6571    __kmp_close_console();
6572  #endif
6573  
6574    __kmp_fini_allocator();
6575  
6576  } // __kmp_internal_end_library
6577  
6578  void __kmp_internal_end_thread(int gtid_req) {
6579    int i;
6580  
6581    /* if we have already cleaned up, don't try again, it wouldn't be pretty */
6582    /* this shouldn't be a race condition because __kmp_internal_end() is the
6583     * only place to clear __kmp_init_serial */
6584    /* we'll check this later too, after we get the lock */
6585    // 2009-09-06: We do not set g_abort without setting g_done. This check looks
6586    // redundant, because the next check will work in any case.
6587    if (__kmp_global.g.g_abort) {
6588      KA_TRACE(11, ("__kmp_internal_end_thread: abort, exiting\n"));
6589      /* TODO abort? */
6590      return;
6591    }
6592    if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6593      KA_TRACE(10, ("__kmp_internal_end_thread: already finished\n"));
6594      return;
6595    }
6596  
6597    // If hidden helper team has been initialized, we need to deinit it
6598    if (TCR_4(__kmp_init_hidden_helper) &&
6599        !TCR_4(__kmp_hidden_helper_team_done)) {
6600      TCW_SYNC_4(__kmp_hidden_helper_team_done, TRUE);
6601      // First release the main thread to let it continue its work
6602      __kmp_hidden_helper_main_thread_release();
6603      // Wait until the hidden helper team has been destroyed
6604      __kmp_hidden_helper_threads_deinitz_wait();
6605    }
6606  
6607    KMP_MB(); /* Flush all pending memory write invalidates.  */
6608  
6609    /* find out who we are and what we should do */
6610    {
6611      int gtid = (gtid_req >= 0) ? gtid_req : __kmp_gtid_get_specific();
6612      KA_TRACE(10,
6613               ("__kmp_internal_end_thread: enter T#%d  (%d)\n", gtid, gtid_req));
6614      if (gtid == KMP_GTID_SHUTDOWN) {
6615        KA_TRACE(10, ("__kmp_internal_end_thread: !__kmp_init_runtime, system "
6616                      "already shutdown\n"));
6617        return;
6618      } else if (gtid == KMP_GTID_MONITOR) {
6619        KA_TRACE(10, ("__kmp_internal_end_thread: monitor thread, gtid not "
6620                      "registered, or system shutdown\n"));
6621        return;
6622      } else if (gtid == KMP_GTID_DNE) {
6623        KA_TRACE(10, ("__kmp_internal_end_thread: gtid not registered or system "
6624                      "shutdown\n"));
6625        return;
6626        /* we don't know who we are */
6627      } else if (KMP_UBER_GTID(gtid)) {
6628        /* unregister ourselves as an uber thread.  gtid is no longer valid */
6629        if (__kmp_root[gtid]->r.r_active) {
6630          __kmp_global.g.g_abort = -1;
6631          TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6632          KA_TRACE(10,
6633                   ("__kmp_internal_end_thread: root still active, abort T#%d\n",
6634                    gtid));
6635          return;
6636        } else {
6637          KA_TRACE(10, ("__kmp_internal_end_thread: unregistering sibling T#%d\n",
6638                        gtid));
6639          __kmp_unregister_root_current_thread(gtid);
6640        }
6641      } else {
6642        /* just a worker thread, let's leave */
6643        KA_TRACE(10, ("__kmp_internal_end_thread: worker thread T#%d\n", gtid));
6644  
6645        if (gtid >= 0) {
6646          __kmp_threads[gtid]->th.th_task_team = NULL;
6647        }
6648  
6649        KA_TRACE(10,
6650                 ("__kmp_internal_end_thread: worker thread done, exiting T#%d\n",
6651                  gtid));
6652        return;
6653      }
6654    }
6655  #if KMP_DYNAMIC_LIB
6656    if (__kmp_pause_status != kmp_hard_paused)
6657    // AC: let's not shut down the dynamic library at the exit of an uber
6658    // thread; it is better to shut down later, in the library destructor.
6659    {
6660      KA_TRACE(10, ("__kmp_internal_end_thread: exiting T#%d\n", gtid_req));
6661      return;
6662    }
6663  #endif
6664    /* synchronize the termination process */
6665    __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6666  
6667    /* have we already finished */
6668    if (__kmp_global.g.g_abort) {
6669      KA_TRACE(10, ("__kmp_internal_end_thread: abort, exiting\n"));
6670      /* TODO abort? */
6671      __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6672      return;
6673    }
6674    if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6675      __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6676      return;
6677    }
6678  
6679    /* We need this lock to enforce mutual exclusion between this reading of
6680       __kmp_threads_capacity and the writing by __kmp_register_root.
6681       Alternatively, we can use a counter of roots that is atomically updated by
6682       __kmp_get_global_thread_id_reg, __kmp_do_serial_initialize and
6683       __kmp_internal_end_*.  */
6684  
6685    /* should we finish the run-time?  are all siblings done? */
6686    __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
6687  
6688    for (i = 0; i < __kmp_threads_capacity; ++i) {
6689      if (KMP_UBER_GTID(i)) {
6690        KA_TRACE(
6691            10,
6692            ("__kmp_internal_end_thread: remaining sibling task: gtid==%d\n", i));
6693        __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6694        __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6695        return;
6696      }
6697    }
6698  
6699    /* now we can safely conduct the actual termination */
6700  
6701    __kmp_internal_end();
6702  
6703    __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6704    __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6705  
6706    KA_TRACE(10, ("__kmp_internal_end_thread: exit T#%d\n", gtid_req));
6707  
6708  #ifdef DUMP_DEBUG_ON_EXIT
6709    if (__kmp_debug_buf)
6710      __kmp_dump_debug_buffer();
6711  #endif
6712  } // __kmp_internal_end_thread
6713  
6714  // -----------------------------------------------------------------------------
6715  // Library registration stuff.
6716  
6717  static long __kmp_registration_flag = 0;
6718  // Random value used to indicate library initialization.
6719  static char *__kmp_registration_str = NULL;
6720  // Value to be saved in env var __KMP_REGISTERED_LIB_<pid>.
6721  
6722  static inline char *__kmp_reg_status_name() {
6723  /* On RHEL 3u5 if linked statically, getpid() returns different values in
6724     each thread. If registration and unregistration go in different threads
6725     (omp_misc_other_root_exit.cpp test case), the name of registered_lib_env
6726     env var can not be found, because the name will contain different pid. */
6727  // macOS* complains about name being too long with additional getuid()
6728  #if KMP_OS_UNIX && !KMP_OS_DARWIN && KMP_DYNAMIC_LIB
6729    return __kmp_str_format("__KMP_REGISTERED_LIB_%d_%d", (int)getpid(),
6730                            (int)getuid());
6731  #else
6732    return __kmp_str_format("__KMP_REGISTERED_LIB_%d", (int)getpid());
6733  #endif
6734  } // __kmp_reg_status_name
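// Editor's note (illustrative only; the pid/uid values are hypothetical): with
// a dynamically linked library on Linux, pid 12345 and uid 1000, the routine
// above would produce "__KMP_REGISTERED_LIB_12345_1000"; in the other
// configurations it would produce "__KMP_REGISTERED_LIB_12345".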
6735  
6736  #if defined(KMP_USE_SHM)
6737  bool __kmp_shm_available = false;
6738  bool __kmp_tmp_available = false;
6739  // If /dev/shm is not accessible, we will create a temporary file under /tmp.
6740  char *temp_reg_status_file_name = nullptr;
6741  #endif
6742  
6743  void __kmp_register_library_startup(void) {
6744  
6745    char *name = __kmp_reg_status_name(); // Name of the environment variable.
6746    int done = 0;
6747    union {
6748      double dtime;
6749      long ltime;
6750    } time;
6751  #if KMP_ARCH_X86 || KMP_ARCH_X86_64
6752    __kmp_initialize_system_tick();
6753  #endif
6754    __kmp_read_system_time(&time.dtime);
6755    __kmp_registration_flag = 0xCAFE0000L | (time.ltime & 0x0000FFFFL);
6756    __kmp_registration_str =
6757        __kmp_str_format("%p-%lx-%s", &__kmp_registration_flag,
6758                         __kmp_registration_flag, KMP_LIBRARY_FILE);
6759  
6760    KA_TRACE(50, ("__kmp_register_library_startup: %s=\"%s\"\n", name,
6761                  __kmp_registration_str));
6762  
6763    while (!done) {
6764  
6765      char *value = NULL; // Actual value of the environment variable.
6766  
6767  #if defined(KMP_USE_SHM)
6768      char *shm_name = nullptr;
6769      char *data1 = nullptr;
6770      __kmp_shm_available = __kmp_detect_shm();
6771      if (__kmp_shm_available) {
6772        int fd1 = -1;
6773        shm_name = __kmp_str_format("/%s", name);
6774        int shm_preexist = 0;
6775        fd1 = shm_open(shm_name, O_CREAT | O_EXCL | O_RDWR, 0600);
6776        if ((fd1 == -1) && (errno == EEXIST)) {
6777          // file didn't open because it already exists.
6778          // try opening existing file
6779          fd1 = shm_open(shm_name, O_RDWR, 0600);
6780          if (fd1 == -1) { // file didn't open
6781            KMP_WARNING(FunctionError, "Can't open SHM");
6782            __kmp_shm_available = false;
6783          } else { // able to open existing file
6784            shm_preexist = 1;
6785          }
6786        }
6787        if (__kmp_shm_available && shm_preexist == 0) { // SHM created, set size
6788        if (ftruncate(fd1, SHM_SIZE) == -1) { // error occurred setting size
6789            KMP_WARNING(FunctionError, "Can't set size of SHM");
6790            __kmp_shm_available = false;
6791          }
6792        }
6793        if (__kmp_shm_available) { // SHM exists, now map it
6794          data1 = (char *)mmap(0, SHM_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED,
6795                               fd1, 0);
6796          if (data1 == MAP_FAILED) { // failed to map shared memory
6797            KMP_WARNING(FunctionError, "Can't map SHM");
6798            __kmp_shm_available = false;
6799          }
6800        }
6801        if (__kmp_shm_available) { // SHM mapped
6802          if (shm_preexist == 0) { // set data to SHM, set value
6803            KMP_STRCPY_S(data1, SHM_SIZE, __kmp_registration_str);
6804          }
6805          // Read value from either what we just wrote or existing file.
6806          value = __kmp_str_format("%s", data1); // read value from SHM
6807          munmap(data1, SHM_SIZE);
6808        }
6809        if (fd1 != -1)
6810          close(fd1);
6811      }
6812      if (!__kmp_shm_available)
6813        __kmp_tmp_available = __kmp_detect_tmp();
6814      if (!__kmp_shm_available && __kmp_tmp_available) {
6815        // SHM failed to work due to an error other than that the file already
6816        // exists. Try to create a temp file under /tmp.
6817        // If /tmp isn't accessible, fall back to using environment variable.
6818        // TODO: /tmp might not always be the temporary directory. For now we will
6819        // not consider TMPDIR.
6820        int fd1 = -1;
6821        temp_reg_status_file_name = __kmp_str_format("/tmp/%s", name);
6822        int tmp_preexist = 0;
6823        fd1 = open(temp_reg_status_file_name, O_CREAT | O_EXCL | O_RDWR, 0600);
6824        if ((fd1 == -1) && (errno == EEXIST)) {
6825          // file didn't open because it already exists.
6826          // try opening existing file
6827          fd1 = open(temp_reg_status_file_name, O_RDWR, 0600);
6828          if (fd1 == -1) { // file didn't open
6829            KMP_WARNING(FunctionError, "Can't open TEMP");
6830            __kmp_tmp_available = false;
6831          } else {
6832            tmp_preexist = 1;
6833          }
6834        }
6835        if (__kmp_tmp_available && tmp_preexist == 0) {
6836          // we created the /tmp file, now set its size
6837          if (ftruncate(fd1, SHM_SIZE) == -1) { // error occurred setting size
6838            KMP_WARNING(FunctionError, "Can't set size of /tmp file");
6839            __kmp_tmp_available = false;
6840          }
6841        }
6842        if (__kmp_tmp_available) {
6843          data1 = (char *)mmap(0, SHM_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED,
6844                               fd1, 0);
6845          if (data1 == MAP_FAILED) { // failed to map /tmp
6846            KMP_WARNING(FunctionError, "Can't map /tmp");
6847            __kmp_tmp_available = false;
6848          }
6849        }
6850        if (__kmp_tmp_available) {
6851          if (tmp_preexist == 0) { // set data to TMP, set value
6852            KMP_STRCPY_S(data1, SHM_SIZE, __kmp_registration_str);
6853          }
6854          // Read value from either what we just wrote or existing file.
6855          value = __kmp_str_format("%s", data1); // read value from temp file
6856          munmap(data1, SHM_SIZE);
6857        }
6858        if (fd1 != -1)
6859          close(fd1);
6860      }
6861      if (!__kmp_shm_available && !__kmp_tmp_available) {
6862        // no /dev/shm and no /tmp -- fall back to environment variable
6863        // Set environment variable, but do not overwrite if it exists.
6864        __kmp_env_set(name, __kmp_registration_str, 0);
6865        // read value to see if it got set
6866        value = __kmp_env_get(name);
6867      }
6868  #else // Windows and unix with static library
6869      // Set environment variable, but do not overwrite if it exists.
6870      __kmp_env_set(name, __kmp_registration_str, 0);
6871      // read value to see if it got set
6872      value = __kmp_env_get(name);
6873  #endif
6874  
6875      if (value != NULL && strcmp(value, __kmp_registration_str) == 0) {
6876        done = 1; // Ok, environment variable set successfully, exit the loop.
6877      } else {
6878        // Oops. Write failed. Another copy of OpenMP RTL is in memory.
6879        // Check whether it is alive or dead.
6880        int neighbor = 0; // 0 -- unknown status, 1 -- alive, 2 -- dead.
6881        char *tail = value;
6882        char *flag_addr_str = NULL;
6883        char *flag_val_str = NULL;
6884        char const *file_name = NULL;
6885        __kmp_str_split(tail, '-', &flag_addr_str, &tail);
6886        __kmp_str_split(tail, '-', &flag_val_str, &tail);
6887        file_name = tail;
6888        if (tail != NULL) {
6889          unsigned long *flag_addr = 0;
6890          unsigned long flag_val = 0;
6891          KMP_SSCANF(flag_addr_str, "%p", RCAST(void **, &flag_addr));
6892          KMP_SSCANF(flag_val_str, "%lx", &flag_val);
6893          if (flag_addr != 0 && flag_val != 0 && strcmp(file_name, "") != 0) {
6894            // First, check whether environment-encoded address is mapped into
6895            // addr space.
6896            // If so, dereference it to see if it still has the right value.
6897            if (__kmp_is_address_mapped(flag_addr) && *flag_addr == flag_val) {
6898              neighbor = 1;
6899            } else {
6900              // If not, then we know the other copy of the library is no longer
6901              // running.
6902              neighbor = 2;
6903            }
6904          }
6905        }
6906        switch (neighbor) {
6907        case 0: // Cannot parse environment variable -- neighbor status unknown.
6908          // Assume it is the incompatible format of a future version of the
6909          // library. Assume the other library is alive.
6910          // WARN( ... ); // TODO: Issue a warning.
6911          file_name = "unknown library";
6912          KMP_FALLTHROUGH();
6913        // Attention! Falling through to the next case. That's intentional.
6914        case 1: { // Neighbor is alive.
6915          // Check it is allowed.
6916          char *duplicate_ok = __kmp_env_get("KMP_DUPLICATE_LIB_OK");
6917          if (!__kmp_str_match_true(duplicate_ok)) {
6918            // That's not allowed. Issue fatal error.
6919            __kmp_fatal(KMP_MSG(DuplicateLibrary, KMP_LIBRARY_FILE, file_name),
6920                        KMP_HNT(DuplicateLibrary), __kmp_msg_null);
6921          }
6922          KMP_INTERNAL_FREE(duplicate_ok);
6923          __kmp_duplicate_library_ok = 1;
6924          done = 1; // Exit the loop.
6925        } break;
6926        case 2: { // Neighbor is dead.
6927  
6928  #if defined(KMP_USE_SHM)
6929          if (__kmp_shm_available) { // close shared memory.
6930            shm_unlink(shm_name); // this removes file in /dev/shm
6931          } else if (__kmp_tmp_available) {
6932            unlink(temp_reg_status_file_name); // this removes the temp file
6933          } else {
6934            // Clear the variable and try to register library again.
6935            __kmp_env_unset(name);
6936          }
6937  #else
6938          // Clear the variable and try to register library again.
6939          __kmp_env_unset(name);
6940  #endif
6941        } break;
6942        default: {
6943          KMP_DEBUG_ASSERT(0);
6944        } break;
6945        }
6946      }
6947      KMP_INTERNAL_FREE((void *)value);
6948  #if defined(KMP_USE_SHM)
6949      if (shm_name)
6950        KMP_INTERNAL_FREE((void *)shm_name);
6951  #endif
6952    } // while
6953    KMP_INTERNAL_FREE((void *)name);
6954  
6955  } // func __kmp_register_library_startup
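// Editor's sketch (illustrative only, compiled out; all values hypothetical):
// the registration value written above has the form "%p-%lx-%s", e.g.
// "0x7f12e4c0a040-cafe1234-libomp.so". A second copy of the runtime that finds
// such a value decides whether the writer is still alive roughly as below
// (the helper name is invented for illustration):
#if 0
static int __example_probe_neighbor(char *flag_addr_str, char *flag_val_str) {
  unsigned long *flag_addr = 0;
  unsigned long flag_val = 0;
  KMP_SSCANF(flag_addr_str, "%p", RCAST(void **, &flag_addr));
  KMP_SSCANF(flag_val_str, "%lx", &flag_val);
  // Alive iff the address is still mapped and still holds the writer's value.
  return __kmp_is_address_mapped(flag_addr) && *flag_addr == flag_val;
}
#endif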
6956  
6957  void __kmp_unregister_library(void) {
6958  
6959    char *name = __kmp_reg_status_name();
6960    char *value = NULL;
6961  
6962  #if defined(KMP_USE_SHM)
6963    char *shm_name = nullptr;
6964    int fd1;
6965    if (__kmp_shm_available) {
6966      shm_name = __kmp_str_format("/%s", name);
6967      fd1 = shm_open(shm_name, O_RDONLY, 0600);
6968      if (fd1 != -1) { // File opened successfully
6969        char *data1 = (char *)mmap(0, SHM_SIZE, PROT_READ, MAP_SHARED, fd1, 0);
6970        if (data1 != MAP_FAILED) {
6971          value = __kmp_str_format("%s", data1); // read value from SHM
6972          munmap(data1, SHM_SIZE);
6973        }
6974        close(fd1);
6975      }
6976    } else if (__kmp_tmp_available) { // try /tmp
6977      fd1 = open(temp_reg_status_file_name, O_RDONLY);
6978      if (fd1 != -1) { // File opened successfully
6979        char *data1 = (char *)mmap(0, SHM_SIZE, PROT_READ, MAP_SHARED, fd1, 0);
6980        if (data1 != MAP_FAILED) {
6981          value = __kmp_str_format("%s", data1); // read value from /tmp
6982          munmap(data1, SHM_SIZE);
6983        }
6984        close(fd1);
6985      }
6986    } else { // fall back to environment variable
6987      value = __kmp_env_get(name);
6988    }
6989  #else
6990    value = __kmp_env_get(name);
6991  #endif
6992  
6993    KMP_DEBUG_ASSERT(__kmp_registration_flag != 0);
6994    KMP_DEBUG_ASSERT(__kmp_registration_str != NULL);
6995    if (value != NULL && strcmp(value, __kmp_registration_str) == 0) {
6996  //  Ok, this is our variable. Delete it.
6997  #if defined(KMP_USE_SHM)
6998      if (__kmp_shm_available) {
6999        shm_unlink(shm_name); // this removes file in /dev/shm
7000      } else if (__kmp_tmp_available) {
7001        unlink(temp_reg_status_file_name); // this removes the temp file
7002      } else {
7003        __kmp_env_unset(name);
7004      }
7005  #else
7006      __kmp_env_unset(name);
7007  #endif
7008    }
7009  
7010  #if defined(KMP_USE_SHM)
7011    if (shm_name)
7012      KMP_INTERNAL_FREE(shm_name);
7013    if (temp_reg_status_file_name)
7014      KMP_INTERNAL_FREE(temp_reg_status_file_name);
7015  #endif
7016  
7017    KMP_INTERNAL_FREE(__kmp_registration_str);
7018    KMP_INTERNAL_FREE(value);
7019    KMP_INTERNAL_FREE(name);
7020  
7021    __kmp_registration_flag = 0;
7022    __kmp_registration_str = NULL;
7023  
7024  } // __kmp_unregister_library
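// Editor's note: registration normally happens once in
// __kmp_do_serial_initialize() (or, for a forked child that skipped it there,
// in __kmp_do_middle_initialize()), and __kmp_unregister_library() undoes it
// at library shutdown so no stale /dev/shm, /tmp, or environment entries are
// left behind.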
7025  
7026  // End of Library registration stuff.
7027  // -----------------------------------------------------------------------------
7028  
7029  #if KMP_MIC_SUPPORTED
7030  
7031  static void __kmp_check_mic_type() {
7032    kmp_cpuid_t cpuid_state = {0};
7033    kmp_cpuid_t *cs_p = &cpuid_state;
7034    __kmp_x86_cpuid(1, 0, cs_p);
7035    // We don't support mic1 at the moment
7036    if ((cs_p->eax & 0xff0) == 0xB10) {
7037      __kmp_mic_type = mic2;
7038    } else if ((cs_p->eax & 0xf0ff0) == 0x50670) {
7039      __kmp_mic_type = mic3;
7040    } else {
7041      __kmp_mic_type = non_mic;
7042    }
7043  }
7044  
7045  #endif /* KMP_MIC_SUPPORTED */
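// Editor's note on the check above: __kmp_x86_cpuid(1, 0, ...) returns the
// processor signature in EAX (stepping in bits 3:0, model in 7:4, family in
// 11:8, extended model in 19:16); the two masks compare that signature against
// the known Xeon Phi generations, leaving __kmp_mic_type == non_mic otherwise.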
7046  
7047  #if KMP_HAVE_UMWAIT
7048  static void __kmp_user_level_mwait_init() {
7049    struct kmp_cpuid buf;
7050    __kmp_x86_cpuid(7, 0, &buf);
7051    __kmp_waitpkg_enabled = ((buf.ecx >> 5) & 1);
7052    __kmp_umwait_enabled = __kmp_waitpkg_enabled && __kmp_user_level_mwait;
7053    __kmp_tpause_enabled = __kmp_waitpkg_enabled && (__kmp_tpause_state > 0);
7054    KF_TRACE(30, ("__kmp_user_level_mwait_init: __kmp_umwait_enabled = %d\n",
7055                  __kmp_umwait_enabled));
7056  }
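// Editor's note: the bit tested above, CPUID.(EAX=7,ECX=0):ECX[5], is the
// WAITPKG feature flag advertising the user-level umonitor/umwait/tpause
// instructions whose use is gated by __kmp_umwait_enabled and
// __kmp_tpause_enabled.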
7057  #elif KMP_HAVE_MWAIT
7058  #ifndef AT_INTELPHIUSERMWAIT
7059  // Spurious, non-existent value that should always fail to return anything.
7060  // Will be replaced with the correct value when we know that.
7061  #define AT_INTELPHIUSERMWAIT 10000
7062  #endif
7063  // The getauxval() function is available in RHEL7 and SLES12. If a system
7064  // with an earlier OS is used to build the RTL, we'll use the following
7065  // internal function when the entry is not found.
7066  unsigned long getauxval(unsigned long) KMP_WEAK_ATTRIBUTE_EXTERNAL;
7067  unsigned long getauxval(unsigned long) { return 0; }
7068  
7069  static void __kmp_user_level_mwait_init() {
7070    // When getauxval() and correct value of AT_INTELPHIUSERMWAIT are available
7071    // use them to find if the user-level mwait is enabled. Otherwise, forcibly
7072    // set __kmp_mwait_enabled=TRUE on Intel MIC if the environment variable
7073    // KMP_USER_LEVEL_MWAIT was set to TRUE.
7074    if (__kmp_mic_type == mic3) {
7075      unsigned long res = getauxval(AT_INTELPHIUSERMWAIT);
7076      if ((res & 0x1) || __kmp_user_level_mwait) {
7077        __kmp_mwait_enabled = TRUE;
7078        if (__kmp_user_level_mwait) {
7079          KMP_INFORM(EnvMwaitWarn);
7080        }
7081      } else {
7082        __kmp_mwait_enabled = FALSE;
7083      }
7084    }
7085    KF_TRACE(30, ("__kmp_user_level_mwait_init: __kmp_mic_type = %d, "
7086                  "__kmp_mwait_enabled = %d\n",
7087                  __kmp_mic_type, __kmp_mwait_enabled));
7088  }
7089  #endif /* KMP_HAVE_UMWAIT */
7090  
7091  static void __kmp_do_serial_initialize(void) {
7092    int i, gtid;
7093    size_t size;
7094  
7095    KA_TRACE(10, ("__kmp_do_serial_initialize: enter\n"));
7096  
7097    KMP_DEBUG_ASSERT(sizeof(kmp_int32) == 4);
7098    KMP_DEBUG_ASSERT(sizeof(kmp_uint32) == 4);
7099    KMP_DEBUG_ASSERT(sizeof(kmp_int64) == 8);
7100    KMP_DEBUG_ASSERT(sizeof(kmp_uint64) == 8);
7101    KMP_DEBUG_ASSERT(sizeof(kmp_intptr_t) == sizeof(void *));
7102  
7103  #if OMPT_SUPPORT
7104    ompt_pre_init();
7105  #endif
7106  #if OMPD_SUPPORT
7107    __kmp_env_dump();
7108    ompd_init();
7109  #endif
7110  
7111    __kmp_validate_locks();
7112  
7113  #if ENABLE_LIBOMPTARGET
7114    /* Initialize functions from libomptarget */
7115    __kmp_init_omptarget();
7116  #endif
7117  
7118    /* Initialize internal memory allocator */
7119    __kmp_init_allocator();
7120  
7121    /* Register the library startup via an environment variable or via mapped
7122       shared memory file and check to see whether another copy of the library is
7123       already registered. Since a forked child process is often terminated, we
7124       postpone the registration until middle initialization in the child. */
7125    if (__kmp_need_register_serial)
7126      __kmp_register_library_startup();
7127  
7128    /* TODO reinitialization of library */
7129    if (TCR_4(__kmp_global.g.g_done)) {
7130      KA_TRACE(10, ("__kmp_do_serial_initialize: reinitialization of library\n"));
7131    }
7132  
7133    __kmp_global.g.g_abort = 0;
7134    TCW_SYNC_4(__kmp_global.g.g_done, FALSE);
7135  
7136  /* initialize the locks */
7137  #if KMP_USE_ADAPTIVE_LOCKS
7138  #if KMP_DEBUG_ADAPTIVE_LOCKS
7139    __kmp_init_speculative_stats();
7140  #endif
7141  #endif
7142  #if KMP_STATS_ENABLED
7143    __kmp_stats_init();
7144  #endif
7145    __kmp_init_lock(&__kmp_global_lock);
7146    __kmp_init_queuing_lock(&__kmp_dispatch_lock);
7147    __kmp_init_lock(&__kmp_debug_lock);
7148    __kmp_init_atomic_lock(&__kmp_atomic_lock);
7149    __kmp_init_atomic_lock(&__kmp_atomic_lock_1i);
7150    __kmp_init_atomic_lock(&__kmp_atomic_lock_2i);
7151    __kmp_init_atomic_lock(&__kmp_atomic_lock_4i);
7152    __kmp_init_atomic_lock(&__kmp_atomic_lock_4r);
7153    __kmp_init_atomic_lock(&__kmp_atomic_lock_8i);
7154    __kmp_init_atomic_lock(&__kmp_atomic_lock_8r);
7155    __kmp_init_atomic_lock(&__kmp_atomic_lock_8c);
7156    __kmp_init_atomic_lock(&__kmp_atomic_lock_10r);
7157    __kmp_init_atomic_lock(&__kmp_atomic_lock_16r);
7158    __kmp_init_atomic_lock(&__kmp_atomic_lock_16c);
7159    __kmp_init_atomic_lock(&__kmp_atomic_lock_20c);
7160    __kmp_init_atomic_lock(&__kmp_atomic_lock_32c);
7161    __kmp_init_bootstrap_lock(&__kmp_forkjoin_lock);
7162    __kmp_init_bootstrap_lock(&__kmp_exit_lock);
7163  #if KMP_USE_MONITOR
7164    __kmp_init_bootstrap_lock(&__kmp_monitor_lock);
7165  #endif
7166    __kmp_init_bootstrap_lock(&__kmp_tp_cached_lock);
7167  
7168    /* conduct initialization and initial setup of configuration */
7169  
7170    __kmp_runtime_initialize();
7171  
7172  #if KMP_MIC_SUPPORTED
7173    __kmp_check_mic_type();
7174  #endif
7175  
7176  // Some global variable initialization moved here from kmp_env_initialize()
7177  #ifdef KMP_DEBUG
7178    kmp_diag = 0;
7179  #endif
7180    __kmp_abort_delay = 0;
7181  
7182    // From __kmp_init_dflt_team_nth()
7183    /* assume the entire machine will be used */
7184    __kmp_dflt_team_nth_ub = __kmp_xproc;
7185    if (__kmp_dflt_team_nth_ub < KMP_MIN_NTH) {
7186      __kmp_dflt_team_nth_ub = KMP_MIN_NTH;
7187    }
7188    if (__kmp_dflt_team_nth_ub > __kmp_sys_max_nth) {
7189      __kmp_dflt_team_nth_ub = __kmp_sys_max_nth;
7190    }
7191    __kmp_max_nth = __kmp_sys_max_nth;
7192    __kmp_cg_max_nth = __kmp_sys_max_nth;
7193    __kmp_teams_max_nth = __kmp_xproc; // set a "reasonable" default
7194    if (__kmp_teams_max_nth > __kmp_sys_max_nth) {
7195      __kmp_teams_max_nth = __kmp_sys_max_nth;
7196    }
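  // Editor's example (hypothetical numbers): on a 16-processor machine with
  // __kmp_sys_max_nth well above 16, the code above leaves
  // __kmp_dflt_team_nth_ub == 16, __kmp_max_nth == __kmp_cg_max_nth ==
  // __kmp_sys_max_nth, and __kmp_teams_max_nth == 16.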
7197  
7198    // Three vars below moved here from __kmp_env_initialize() "KMP_BLOCKTIME"
7199    // part
7200    __kmp_dflt_blocktime = KMP_DEFAULT_BLOCKTIME;
7201  #if KMP_USE_MONITOR
7202    __kmp_monitor_wakeups =
7203        KMP_WAKEUPS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups);
7204    __kmp_bt_intervals =
7205        KMP_INTERVALS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups);
7206  #endif
7207    // From "KMP_LIBRARY" part of __kmp_env_initialize()
7208    __kmp_library = library_throughput;
7209    // From KMP_SCHEDULE initialization
7210    __kmp_static = kmp_sch_static_balanced;
7211  // AC: do not use analytical here, because it is non-monotonic
7212  //__kmp_guided = kmp_sch_guided_iterative_chunked;
7213  //__kmp_auto = kmp_sch_guided_analytical_chunked; // AC: it is the default, no
7214  // need to repeat assignment
7215  // Barrier initialization. Moved here from __kmp_env_initialize() Barrier branch
7216  // bit control and barrier method control parts
7217  #if KMP_FAST_REDUCTION_BARRIER
7218  #define kmp_reduction_barrier_gather_bb ((int)1)
7219  #define kmp_reduction_barrier_release_bb ((int)1)
7220  #define kmp_reduction_barrier_gather_pat __kmp_barrier_gather_pat_dflt
7221  #define kmp_reduction_barrier_release_pat __kmp_barrier_release_pat_dflt
7222  #endif // KMP_FAST_REDUCTION_BARRIER
7223    for (i = bs_plain_barrier; i < bs_last_barrier; i++) {
7224      __kmp_barrier_gather_branch_bits[i] = __kmp_barrier_gather_bb_dflt;
7225      __kmp_barrier_release_branch_bits[i] = __kmp_barrier_release_bb_dflt;
7226      __kmp_barrier_gather_pattern[i] = __kmp_barrier_gather_pat_dflt;
7227      __kmp_barrier_release_pattern[i] = __kmp_barrier_release_pat_dflt;
7228  #if KMP_FAST_REDUCTION_BARRIER
7229      if (i == bs_reduction_barrier) {
7230        // Tested and confirmed on ALTIX only (lin_64): hyper,1
7231        __kmp_barrier_gather_branch_bits[i] = kmp_reduction_barrier_gather_bb;
7232        __kmp_barrier_release_branch_bits[i] = kmp_reduction_barrier_release_bb;
7233        __kmp_barrier_gather_pattern[i] = kmp_reduction_barrier_gather_pat;
7234        __kmp_barrier_release_pattern[i] = kmp_reduction_barrier_release_pat;
7235      }
7236  #endif // KMP_FAST_REDUCTION_BARRIER
7237    }
7238  #if KMP_FAST_REDUCTION_BARRIER
7239  #undef kmp_reduction_barrier_release_pat
7240  #undef kmp_reduction_barrier_gather_pat
7241  #undef kmp_reduction_barrier_release_bb
7242  #undef kmp_reduction_barrier_gather_bb
7243  #endif // KMP_FAST_REDUCTION_BARRIER
7244  #if KMP_MIC_SUPPORTED
7245    if (__kmp_mic_type == mic2) { // KNC
7246      // AC: plane=3,2, forkjoin=2,1 are optimal for 240 threads on KNC
7247      __kmp_barrier_gather_branch_bits[bs_plain_barrier] = 3; // plain gather
7248      __kmp_barrier_release_branch_bits[bs_forkjoin_barrier] =
7249          1; // forkjoin release
7250      __kmp_barrier_gather_pattern[bs_forkjoin_barrier] = bp_hierarchical_bar;
7251      __kmp_barrier_release_pattern[bs_forkjoin_barrier] = bp_hierarchical_bar;
7252    }
7253  #if KMP_FAST_REDUCTION_BARRIER
7254    if (__kmp_mic_type == mic2) { // KNC
7255      __kmp_barrier_gather_pattern[bs_reduction_barrier] = bp_hierarchical_bar;
7256      __kmp_barrier_release_pattern[bs_reduction_barrier] = bp_hierarchical_bar;
7257    }
7258  #endif // KMP_FAST_REDUCTION_BARRIER
7259  #endif // KMP_MIC_SUPPORTED
7260  
7261  // From KMP_CHECKS initialization
7262  #ifdef KMP_DEBUG
7263    __kmp_env_checks = TRUE; /* development versions have the extra checks */
7264  #else
7265    __kmp_env_checks = FALSE; /* port versions do not have the extra checks */
7266  #endif
7267  
7268    // From "KMP_FOREIGN_THREADS_THREADPRIVATE" initialization
7269    __kmp_foreign_tp = TRUE;
7270  
7271    __kmp_global.g.g_dynamic = FALSE;
7272    __kmp_global.g.g_dynamic_mode = dynamic_default;
7273  
7274    __kmp_init_nesting_mode();
7275  
7276    __kmp_env_initialize(NULL);
7277  
7278  #if KMP_HAVE_MWAIT || KMP_HAVE_UMWAIT
7279    __kmp_user_level_mwait_init();
7280  #endif
7281  // Print all messages in message catalog for testing purposes.
7282  #ifdef KMP_DEBUG
7283    char const *val = __kmp_env_get("KMP_DUMP_CATALOG");
7284    if (__kmp_str_match_true(val)) {
7285      kmp_str_buf_t buffer;
7286      __kmp_str_buf_init(&buffer);
7287      __kmp_i18n_dump_catalog(&buffer);
7288      __kmp_printf("%s", buffer.str);
7289      __kmp_str_buf_free(&buffer);
7290    }
7291    __kmp_env_free(&val);
7292  #endif
7293  
7294    __kmp_threads_capacity =
7295        __kmp_initial_threads_capacity(__kmp_dflt_team_nth_ub);
7296    // Moved here from __kmp_env_initialize() "KMP_ALL_THREADPRIVATE" part
7297    __kmp_tp_capacity = __kmp_default_tp_capacity(
7298        __kmp_dflt_team_nth_ub, __kmp_max_nth, __kmp_allThreadsSpecified);
7299  
7300    // If the library is shut down properly, both pools must be NULL. Just in
7301    // case, set them to NULL -- some memory may leak, but subsequent code will
7302    // work even if pools are not freed.
7303    KMP_DEBUG_ASSERT(__kmp_thread_pool == NULL);
7304    KMP_DEBUG_ASSERT(__kmp_thread_pool_insert_pt == NULL);
7305    KMP_DEBUG_ASSERT(__kmp_team_pool == NULL);
7306    __kmp_thread_pool = NULL;
7307    __kmp_thread_pool_insert_pt = NULL;
7308    __kmp_team_pool = NULL;
7309  
7310    /* Allocate all of the variable sized records */
7311    /* NOTE: __kmp_threads_capacity entries are allocated, but the arrays are
7312     * expandable */
7313    /* Since allocation is cache-aligned, just add extra padding at the end */
7314    size =
7315        (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * __kmp_threads_capacity +
7316        CACHE_LINE;
7317    __kmp_threads = (kmp_info_t **)__kmp_allocate(size);
7318    __kmp_root = (kmp_root_t **)((char *)__kmp_threads +
7319                                 sizeof(kmp_info_t *) * __kmp_threads_capacity);
7320  
7321    /* init thread counts */
7322    KMP_DEBUG_ASSERT(__kmp_all_nth ==
7323                     0); // Asserts fail if the library is reinitializing and
7324    KMP_DEBUG_ASSERT(__kmp_nth == 0); // something was wrong in termination.
7325    __kmp_all_nth = 0;
7326    __kmp_nth = 0;
7327  
7328    /* setup the uber master thread and hierarchy */
7329    gtid = __kmp_register_root(TRUE);
7330    KA_TRACE(10, ("__kmp_do_serial_initialize  T#%d\n", gtid));
7331    KMP_ASSERT(KMP_UBER_GTID(gtid));
7332    KMP_ASSERT(KMP_INITIAL_GTID(gtid));
7333  
7334    KMP_MB(); /* Flush all pending memory write invalidates.  */
7335  
7336    __kmp_common_initialize();
7337  
7338  #if KMP_OS_UNIX
7339    /* invoke the child fork handler */
7340    __kmp_register_atfork();
7341  #endif
7342  
7343  #if !KMP_DYNAMIC_LIB ||                                                        \
7344      ((KMP_COMPILER_ICC || KMP_COMPILER_ICX) && KMP_OS_DARWIN)
7345    {
7346      /* Invoke the exit handler when the program finishes, only for static
7347         library and macOS* dynamic. For other dynamic libraries, we already
7348         have _fini and DllMain. */
7349      int rc = atexit(__kmp_internal_end_atexit);
7350      if (rc != 0) {
7351        __kmp_fatal(KMP_MSG(FunctionError, "atexit()"), KMP_ERR(rc),
7352                    __kmp_msg_null);
7353      }
7354    }
7355  #endif
7356  
7357  #if KMP_HANDLE_SIGNALS
7358  #if KMP_OS_UNIX
7359    /* NOTE: make sure that this is called before the user installs their own
7360       signal handlers so that the user handlers are called first. This way they
7361       can return false, not call our handler, avoid terminating the library, and
7362       continue execution where they left off. */
7363    __kmp_install_signals(FALSE);
7364  #endif /* KMP_OS_UNIX */
7365  #if KMP_OS_WINDOWS
7366    __kmp_install_signals(TRUE);
7367  #endif /* KMP_OS_WINDOWS */
7368  #endif
7369  
7370    /* we have finished the serial initialization */
7371    __kmp_init_counter++;
7372  
7373    __kmp_init_serial = TRUE;
7374  
7375    if (__kmp_version) {
7376      __kmp_print_version_1();
7377    }
7378  
7379    if (__kmp_settings) {
7380      __kmp_env_print();
7381    }
7382  
7383    if (__kmp_display_env || __kmp_display_env_verbose) {
7384      __kmp_env_print_2();
7385    }
7386  
7387  #if OMPT_SUPPORT
7388    ompt_post_init();
7389  #endif
7390  
7391    KMP_MB();
7392  
7393    KA_TRACE(10, ("__kmp_do_serial_initialize: exit\n"));
7394  }
7395  
7396  void __kmp_serial_initialize(void) {
7397    if (__kmp_init_serial) {
7398      return;
7399    }
7400    __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7401    if (__kmp_init_serial) {
7402      __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7403      return;
7404    }
7405    __kmp_do_serial_initialize();
7406    __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7407  }
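// Editor's note: __kmp_serial_initialize above, and __kmp_middle_initialize,
// __kmp_parallel_initialize and __kmp_hidden_helper_initialize below, all use
// the same double-checked pattern: an unlocked fast-path test of the init
// flag, then __kmp_initz_lock, then a re-test under the lock, so the one-time
// work in the corresponding __kmp_do_*_initialize routine runs exactly once.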
7408  
7409  static void __kmp_do_middle_initialize(void) {
7410    int i, j;
7411    int prev_dflt_team_nth;
7412  
7413    if (!__kmp_init_serial) {
7414      __kmp_do_serial_initialize();
7415    }
7416  
7417    KA_TRACE(10, ("__kmp_middle_initialize: enter\n"));
7418  
7419    if (UNLIKELY(!__kmp_need_register_serial)) {
7420      // We are in a forked child process. The registration was skipped during
7421      // serial initialization in the __kmp_atfork_child handler. Do it here.
7422      __kmp_register_library_startup();
7423    }
7424  
7425    // Save the previous value for the __kmp_dflt_team_nth so that
7426    // we can avoid some reinitialization if it hasn't changed.
7427    prev_dflt_team_nth = __kmp_dflt_team_nth;
7428  
7429  #if KMP_AFFINITY_SUPPORTED
7430    // __kmp_affinity_initialize() will try to set __kmp_ncores to the
7431    // number of cores on the machine.
7432    __kmp_affinity_initialize(__kmp_affinity);
7433  
7434  #endif /* KMP_AFFINITY_SUPPORTED */
7435  
7436    KMP_ASSERT(__kmp_xproc > 0);
7437    if (__kmp_avail_proc == 0) {
7438      __kmp_avail_proc = __kmp_xproc;
7439    }
7440  
7441    // If there were empty places in num_threads list (OMP_NUM_THREADS=,,2,3),
7442    // correct them now
7443    j = 0;
7444    while ((j < __kmp_nested_nth.used) && !__kmp_nested_nth.nth[j]) {
7445      __kmp_nested_nth.nth[j] = __kmp_dflt_team_nth = __kmp_dflt_team_nth_ub =
7446          __kmp_avail_proc;
7447      j++;
7448    }
7449  
7450    if (__kmp_dflt_team_nth == 0) {
7451  #ifdef KMP_DFLT_NTH_CORES
7452      // Default #threads = #cores
7453      __kmp_dflt_team_nth = __kmp_ncores;
7454      KA_TRACE(20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = "
7455                    "__kmp_ncores (%d)\n",
7456                    __kmp_dflt_team_nth));
7457  #else
7458      // Default #threads = #available OS procs
7459      __kmp_dflt_team_nth = __kmp_avail_proc;
7460      KA_TRACE(20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = "
7461                    "__kmp_avail_proc(%d)\n",
7462                    __kmp_dflt_team_nth));
7463  #endif /* KMP_DFLT_NTH_CORES */
7464    }
7465  
7466    if (__kmp_dflt_team_nth < KMP_MIN_NTH) {
7467      __kmp_dflt_team_nth = KMP_MIN_NTH;
7468    }
7469    if (__kmp_dflt_team_nth > __kmp_sys_max_nth) {
7470      __kmp_dflt_team_nth = __kmp_sys_max_nth;
7471    }
7472  
7473    if (__kmp_nesting_mode > 0)
7474      __kmp_set_nesting_mode_threads();
7475  
7476    // There's no harm in continuing if the following check fails,
7477    // but it indicates an error in the previous logic.
7478    KMP_DEBUG_ASSERT(__kmp_dflt_team_nth <= __kmp_dflt_team_nth_ub);
7479  
7480    if (__kmp_dflt_team_nth != prev_dflt_team_nth) {
7481      // Run through the __kmp_threads array and set the num threads icv for each
7482      // root thread that is currently registered with the RTL (which has not
7483      // already explicitly set its nthreads-var with a call to
7484      // omp_set_num_threads()).
7485      for (i = 0; i < __kmp_threads_capacity; i++) {
7486        kmp_info_t *thread = __kmp_threads[i];
7487        if (thread == NULL)
7488          continue;
7489        if (thread->th.th_current_task->td_icvs.nproc != 0)
7490          continue;
7491  
7492        set__nproc(__kmp_threads[i], __kmp_dflt_team_nth);
7493      }
7494    }
7495    KA_TRACE(
7496        20,
7497        ("__kmp_middle_initialize: final value for __kmp_dflt_team_nth = %d\n",
7498         __kmp_dflt_team_nth));
7499  
7500  #ifdef KMP_ADJUST_BLOCKTIME
7501    /* Adjust blocktime to zero if necessary  now that __kmp_avail_proc is set */
7502    if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
7503      KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
7504      if (__kmp_nth > __kmp_avail_proc) {
7505        __kmp_zero_bt = TRUE;
7506      }
7507    }
7508  #endif /* KMP_ADJUST_BLOCKTIME */
7509  
7510    /* we have finished middle initialization */
7511    TCW_SYNC_4(__kmp_init_middle, TRUE);
7512  
7513    KA_TRACE(10, ("__kmp_do_middle_initialize: exit\n"));
7514  }
7515  
7516  void __kmp_middle_initialize(void) {
7517    if (__kmp_init_middle) {
7518      return;
7519    }
7520    __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7521    if (__kmp_init_middle) {
7522      __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7523      return;
7524    }
7525    __kmp_do_middle_initialize();
7526    __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7527  }
7528  
7529  void __kmp_parallel_initialize(void) {
7530    int gtid = __kmp_entry_gtid(); // this might be a new root
7531  
7532    /* synchronize parallel initialization (for sibling) */
7533    if (TCR_4(__kmp_init_parallel))
7534      return;
7535    __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7536    if (TCR_4(__kmp_init_parallel)) {
7537      __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7538      return;
7539    }
7540  
7541    /* TODO reinitialization after we have already shut down */
7542    if (TCR_4(__kmp_global.g.g_done)) {
7543      KA_TRACE(
7544          10,
7545          ("__kmp_parallel_initialize: attempt to init while shutting down\n"));
7546      __kmp_infinite_loop();
7547    }
7548  
7549    /* jc: The lock __kmp_initz_lock is already held, so calling
7550       __kmp_serial_initialize would cause a deadlock.  So we call
7551       __kmp_do_serial_initialize directly. */
7552    if (!__kmp_init_middle) {
7553      __kmp_do_middle_initialize();
7554    }
7555    __kmp_assign_root_init_mask();
7556    __kmp_resume_if_hard_paused();
7557  
7558    /* begin initialization */
7559    KA_TRACE(10, ("__kmp_parallel_initialize: enter\n"));
7560    KMP_ASSERT(KMP_UBER_GTID(gtid));
7561  
7562  #if KMP_ARCH_X86 || KMP_ARCH_X86_64
7563    // Save the FP control regs.
7564    // Worker threads will set theirs to these values at thread startup.
7565    __kmp_store_x87_fpu_control_word(&__kmp_init_x87_fpu_control_word);
7566    __kmp_store_mxcsr(&__kmp_init_mxcsr);
7567    __kmp_init_mxcsr &= KMP_X86_MXCSR_MASK;
7568  #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
7569  
7570  #if KMP_OS_UNIX
7571  #if KMP_HANDLE_SIGNALS
7572    /*  must be after __kmp_serial_initialize  */
7573    __kmp_install_signals(TRUE);
7574  #endif
7575  #endif
7576  
7577    __kmp_suspend_initialize();
7578  
7579  #if defined(USE_LOAD_BALANCE)
7580    if (__kmp_global.g.g_dynamic_mode == dynamic_default) {
7581      __kmp_global.g.g_dynamic_mode = dynamic_load_balance;
7582    }
7583  #else
7584    if (__kmp_global.g.g_dynamic_mode == dynamic_default) {
7585      __kmp_global.g.g_dynamic_mode = dynamic_thread_limit;
7586    }
7587  #endif
7588  
7589    if (__kmp_version) {
7590      __kmp_print_version_2();
7591    }
7592  
7593    /* we have finished parallel initialization */
7594    TCW_SYNC_4(__kmp_init_parallel, TRUE);
7595  
7596    KMP_MB();
7597    KA_TRACE(10, ("__kmp_parallel_initialize: exit\n"));
7598  
7599    __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7600  }
7601  
7602  void __kmp_hidden_helper_initialize() {
7603    if (TCR_4(__kmp_init_hidden_helper))
7604      return;
7605  
7606    // __kmp_parallel_initialize is required before we initialize hidden helper
7607    if (!TCR_4(__kmp_init_parallel))
7608      __kmp_parallel_initialize();
7609  
7610    // Double check. Note that this double check should not be placed before
7611    // __kmp_parallel_initialize as it will cause dead lock.
7612    // __kmp_parallel_initialize as it will cause deadlock.
7613    if (TCR_4(__kmp_init_hidden_helper)) {
7614      __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7615      return;
7616    }
7617  
7618  #if KMP_AFFINITY_SUPPORTED
7619    // Initialize hidden helper affinity settings.
7620    // The above __kmp_parallel_initialize() will initialize
7621    // regular affinity (and topology) if not already done.
7622    if (!__kmp_hh_affinity.flags.initialized)
7623      __kmp_affinity_initialize(__kmp_hh_affinity);
7624  #endif
7625  
7626    // Set the count of hidden helper tasks to be executed to zero
7627    KMP_ATOMIC_ST_REL(&__kmp_unexecuted_hidden_helper_tasks, 0);
7628  
7629    // Set the global variable indicating that we're initializing hidden helper
7630    // team/threads
7631    TCW_SYNC_4(__kmp_init_hidden_helper_threads, TRUE);
7632  
7633    // Platform independent initialization
7634    __kmp_do_initialize_hidden_helper_threads();
7635  
7636    // Wait here for the finish of initialization of hidden helper teams
7637    __kmp_hidden_helper_threads_initz_wait();
7638  
7639    // We have finished hidden helper initialization
7640    TCW_SYNC_4(__kmp_init_hidden_helper, TRUE);
7641  
7642    __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7643  }
7644  
7645  /* ------------------------------------------------------------------------ */
7646  
7647  void __kmp_run_before_invoked_task(int gtid, int tid, kmp_info_t *this_thr,
7648                                     kmp_team_t *team) {
7649    kmp_disp_t *dispatch;
7650  
7651    KMP_MB();
7652  
7653    /* none of the threads have encountered any constructs, yet. */
7654    this_thr->th.th_local.this_construct = 0;
7655  #if KMP_CACHE_MANAGE
7656    KMP_CACHE_PREFETCH(&this_thr->th.th_bar[bs_forkjoin_barrier].bb.b_arrived);
7657  #endif /* KMP_CACHE_MANAGE */
7658    dispatch = (kmp_disp_t *)TCR_PTR(this_thr->th.th_dispatch);
7659    KMP_DEBUG_ASSERT(dispatch);
7660    KMP_DEBUG_ASSERT(team->t.t_dispatch);
7661    // KMP_DEBUG_ASSERT( this_thr->th.th_dispatch == &team->t.t_dispatch[
7662    // this_thr->th.th_info.ds.ds_tid ] );
7663  
7664    dispatch->th_disp_index = 0; /* reset the dispatch buffer counter */
7665    dispatch->th_doacross_buf_idx = 0; // reset doacross dispatch buffer counter
7666    if (__kmp_env_consistency_check)
7667      __kmp_push_parallel(gtid, team->t.t_ident);
7668  
7669    KMP_MB(); /* Flush all pending memory write invalidates.  */
7670  }
7671  
7672  void __kmp_run_after_invoked_task(int gtid, int tid, kmp_info_t *this_thr,
7673                                    kmp_team_t *team) {
7674    if (__kmp_env_consistency_check)
7675      __kmp_pop_parallel(gtid, team->t.t_ident);
7676  
7677    __kmp_finish_implicit_task(this_thr);
7678  }
7679  
7680  int __kmp_invoke_task_func(int gtid) {
7681    int rc;
7682    int tid = __kmp_tid_from_gtid(gtid);
7683    kmp_info_t *this_thr = __kmp_threads[gtid];
7684    kmp_team_t *team = this_thr->th.th_team;
7685  
7686    __kmp_run_before_invoked_task(gtid, tid, this_thr, team);
7687  #if USE_ITT_BUILD
7688    if (__itt_stack_caller_create_ptr) {
7689      // inform ittnotify about entering user's code
7690      if (team->t.t_stack_id != NULL) {
7691        __kmp_itt_stack_callee_enter((__itt_caller)team->t.t_stack_id);
7692      } else {
7693        KMP_DEBUG_ASSERT(team->t.t_parent->t.t_stack_id != NULL);
7694        __kmp_itt_stack_callee_enter(
7695            (__itt_caller)team->t.t_parent->t.t_stack_id);
7696      }
7697    }
7698  #endif /* USE_ITT_BUILD */
7699  #if INCLUDE_SSC_MARKS
7700    SSC_MARK_INVOKING();
7701  #endif
7702  
7703  #if OMPT_SUPPORT
7704    void *dummy;
7705    void **exit_frame_p;
7706    ompt_data_t *my_task_data;
7707    ompt_data_t *my_parallel_data;
7708    int ompt_team_size;
7709  
7710    if (ompt_enabled.enabled) {
7711      exit_frame_p = &(team->t.t_implicit_task_taskdata[tid]
7712                           .ompt_task_info.frame.exit_frame.ptr);
7713    } else {
7714      exit_frame_p = &dummy;
7715    }
7716  
7717    my_task_data =
7718        &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data);
7719    my_parallel_data = &(team->t.ompt_team_info.parallel_data);
7720    if (ompt_enabled.ompt_callback_implicit_task) {
7721      ompt_team_size = team->t.t_nproc;
7722      ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
7723          ompt_scope_begin, my_parallel_data, my_task_data, ompt_team_size,
7724          __kmp_tid_from_gtid(gtid), ompt_task_implicit);
7725      OMPT_CUR_TASK_INFO(this_thr)->thread_num = __kmp_tid_from_gtid(gtid);
7726    }
7727  #endif
7728  
7729  #if KMP_STATS_ENABLED
7730    stats_state_e previous_state = KMP_GET_THREAD_STATE();
7731    if (previous_state == stats_state_e::TEAMS_REGION) {
7732      KMP_PUSH_PARTITIONED_TIMER(OMP_teams);
7733    } else {
7734      KMP_PUSH_PARTITIONED_TIMER(OMP_parallel);
7735    }
7736    KMP_SET_THREAD_STATE(IMPLICIT_TASK);
7737  #endif
7738  
7739    rc = __kmp_invoke_microtask((microtask_t)TCR_SYNC_PTR(team->t.t_pkfn), gtid,
7740                                tid, (int)team->t.t_argc, (void **)team->t.t_argv
7741  #if OMPT_SUPPORT
7742                                ,
7743                                exit_frame_p
7744  #endif
7745    );
7746  #if OMPT_SUPPORT
7747    *exit_frame_p = NULL;
7748    this_thr->th.ompt_thread_info.parallel_flags = ompt_parallel_team;
7749  #endif
7750  
7751  #if KMP_STATS_ENABLED
7752    if (previous_state == stats_state_e::TEAMS_REGION) {
7753      KMP_SET_THREAD_STATE(previous_state);
7754    }
7755    KMP_POP_PARTITIONED_TIMER();
7756  #endif
7757  
7758  #if USE_ITT_BUILD
7759    if (__itt_stack_caller_create_ptr) {
7760      // inform ittnotify about leaving user's code
7761      if (team->t.t_stack_id != NULL) {
7762        __kmp_itt_stack_callee_leave((__itt_caller)team->t.t_stack_id);
7763      } else {
7764        KMP_DEBUG_ASSERT(team->t.t_parent->t.t_stack_id != NULL);
7765        __kmp_itt_stack_callee_leave(
7766            (__itt_caller)team->t.t_parent->t.t_stack_id);
7767      }
7768    }
7769  #endif /* USE_ITT_BUILD */
7770    __kmp_run_after_invoked_task(gtid, tid, this_thr, team);
7771  
7772    return rc;
7773  }
7774  
7775  void __kmp_teams_master(int gtid) {
7776    // This routine is called by all primary threads in teams construct
7777    kmp_info_t *thr = __kmp_threads[gtid];
7778    kmp_team_t *team = thr->th.th_team;
7779    ident_t *loc = team->t.t_ident;
7780    thr->th.th_set_nproc = thr->th.th_teams_size.nth;
7781    KMP_DEBUG_ASSERT(thr->th.th_teams_microtask);
7782    KMP_DEBUG_ASSERT(thr->th.th_set_nproc);
7783    KA_TRACE(20, ("__kmp_teams_master: T#%d, Tid %d, microtask %p\n", gtid,
7784                  __kmp_tid_from_gtid(gtid), thr->th.th_teams_microtask));
7785  
7786    // This thread is a new CG root.  Set up the proper variables.
7787    kmp_cg_root_t *tmp = (kmp_cg_root_t *)__kmp_allocate(sizeof(kmp_cg_root_t));
7788    tmp->cg_root = thr; // Make thr the CG root
7789    // Init to thread limit stored when league primary threads were forked
7790    tmp->cg_thread_limit = thr->th.th_current_task->td_icvs.thread_limit;
7791    tmp->cg_nthreads = 1; // Init counter to one active thread, this one
7792    KA_TRACE(100, ("__kmp_teams_master: Thread %p created node %p and init"
7793                   " cg_nthreads to 1\n",
7794                   thr, tmp));
7795    tmp->up = thr->th.th_cg_roots;
7796    thr->th.th_cg_roots = tmp;
7797  
7798  // Launch the league of teams now, but do not let workers execute
7799  // (they wait on the fork barrier until the next parallel region)
7800  #if INCLUDE_SSC_MARKS
7801    SSC_MARK_FORKING();
7802  #endif
7803    __kmp_fork_call(loc, gtid, fork_context_intel, team->t.t_argc,
7804                    (microtask_t)thr->th.th_teams_microtask, // "wrapped" task
7805                    VOLATILE_CAST(launch_t) __kmp_invoke_task_func, NULL);
7806  #if INCLUDE_SSC_MARKS
7807    SSC_MARK_JOINING();
7808  #endif
7809    // If the team size was reduced from the limit, set it to the new size
7810    if (thr->th.th_team_nproc < thr->th.th_teams_size.nth)
7811      thr->th.th_teams_size.nth = thr->th.th_team_nproc;
7812    // AC: last parameter "1" eliminates join barrier which won't work because
7813    // worker threads are in a fork barrier waiting for more parallel regions
7814    __kmp_join_call(loc, gtid
7815  #if OMPT_SUPPORT
7816                    ,
7817                    fork_context_intel
7818  #endif
7819                    ,
7820                    1);
7821  }
7822  
7823  int __kmp_invoke_teams_master(int gtid) {
7824    kmp_info_t *this_thr = __kmp_threads[gtid];
7825    kmp_team_t *team = this_thr->th.th_team;
7826  #if KMP_DEBUG
7827    if (!__kmp_threads[gtid]->th.th_team->t.t_serialized)
7828      KMP_DEBUG_ASSERT((void *)__kmp_threads[gtid]->th.th_team->t.t_pkfn ==
7829                       (void *)__kmp_teams_master);
7830  #endif
7831    __kmp_run_before_invoked_task(gtid, 0, this_thr, team);
7832  #if OMPT_SUPPORT
7833    int tid = __kmp_tid_from_gtid(gtid);
7834    ompt_data_t *task_data =
7835        &team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data;
7836    ompt_data_t *parallel_data = &team->t.ompt_team_info.parallel_data;
7837    if (ompt_enabled.ompt_callback_implicit_task) {
7838      ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
7839          ompt_scope_begin, parallel_data, task_data, team->t.t_nproc, tid,
7840          ompt_task_initial);
7841      OMPT_CUR_TASK_INFO(this_thr)->thread_num = tid;
7842    }
7843  #endif
7844    __kmp_teams_master(gtid);
7845  #if OMPT_SUPPORT
7846    this_thr->th.ompt_thread_info.parallel_flags = ompt_parallel_league;
7847  #endif
7848    __kmp_run_after_invoked_task(gtid, 0, this_thr, team);
7849    return 1;
7850  }
7851  
7852  /* this sets the requested number of threads for the next parallel region
7853     encountered by this team. since this should be enclosed in the forkjoin
7854     critical section it should avoid race conditions with asymmetrical nested
7855     parallelism */
7856  void __kmp_push_num_threads(ident_t *id, int gtid, int num_threads) {
7857    kmp_info_t *thr = __kmp_threads[gtid];
7858  
7859    if (num_threads > 0)
7860      thr->th.th_set_nproc = num_threads;
7861  }
7862  
7863  void __kmp_push_num_threads_list(ident_t *id, int gtid, kmp_uint32 list_length,
7864                                   int *num_threads_list) {
7865    kmp_info_t *thr = __kmp_threads[gtid];
7866  
7867    KMP_DEBUG_ASSERT(list_length > 1);
7868  
7869    if (num_threads_list[0] > 0)
7870      thr->th.th_set_nproc = num_threads_list[0];
7871    thr->th.th_set_nested_nth =
7872        (int *)KMP_INTERNAL_MALLOC(list_length * sizeof(int));
7873    for (kmp_uint32 i = 0; i < list_length; ++i)
7874      thr->th.th_set_nested_nth[i] = num_threads_list[i];
7875    thr->th.th_set_nested_nth_sz = list_length;
7876  }
7877  
7878  void __kmp_set_strict_num_threads(ident_t *loc, int gtid, int sev,
7879                                    const char *msg) {
7880    kmp_info_t *thr = __kmp_threads[gtid];
7881    thr->th.th_nt_strict = true;
7882    thr->th.th_nt_loc = loc;
7883    // if sev is unset make fatal
7884    if (sev == severity_warning)
7885      thr->th.th_nt_sev = sev;
7886    else
7887      thr->th.th_nt_sev = severity_fatal;
7888    // if msg is unset, use an appropriate message
7889    if (msg)
7890      thr->th.th_nt_msg = msg;
7891    else
7892      thr->th.th_nt_msg = "Cannot form team with number of threads specified by "
7893                          "strict num_threads clause.";
7894  }
7895  
7896  static void __kmp_push_thread_limit(kmp_info_t *thr, int num_teams,
7897                                      int num_threads) {
7898    KMP_DEBUG_ASSERT(thr);
7899    // Remember the number of threads for inner parallel regions
7900    if (!TCR_4(__kmp_init_middle))
7901      __kmp_middle_initialize(); // get internal globals calculated
7902    __kmp_assign_root_init_mask();
7903    KMP_DEBUG_ASSERT(__kmp_avail_proc);
7904    KMP_DEBUG_ASSERT(__kmp_dflt_team_nth);
7905  
7906    if (num_threads == 0) {
7907      if (__kmp_teams_thread_limit > 0) {
7908        num_threads = __kmp_teams_thread_limit;
7909      } else {
7910        num_threads = __kmp_avail_proc / num_teams;
7911      }
7912      // adjust num_threads w/o warning as it is not a user setting
7913      // num_threads = min(num_threads, nthreads-var, thread-limit-var)
7914      // no thread_limit clause specified -  do not change thread-limit-var ICV
7915      if (num_threads > __kmp_dflt_team_nth) {
7916        num_threads = __kmp_dflt_team_nth; // honor nthreads-var ICV
7917      }
7918      if (num_threads > thr->th.th_current_task->td_icvs.thread_limit) {
7919        num_threads = thr->th.th_current_task->td_icvs.thread_limit;
7920      } // prevent team size from exceeding thread-limit-var
7921      if (num_teams * num_threads > __kmp_teams_max_nth) {
7922        num_threads = __kmp_teams_max_nth / num_teams;
7923      }
7924      if (num_threads == 0) {
7925        num_threads = 1;
7926      }
7927    } else {
7928      if (num_threads < 0) {
7929        __kmp_msg(kmp_ms_warning, KMP_MSG(CantFormThrTeam, num_threads, 1),
7930                  __kmp_msg_null);
7931        num_threads = 1;
7932      }
7933      // This thread will be the primary thread of the league's primary threads
7934      // Store new thread limit; old limit is saved in th_cg_roots list
7935      thr->th.th_current_task->td_icvs.thread_limit = num_threads;
7936      // num_threads = min(num_threads, nthreads-var)
7937      if (num_threads > __kmp_dflt_team_nth) {
7938        num_threads = __kmp_dflt_team_nth; // honor nthreads-var ICV
7939      }
7940      if (num_teams * num_threads > __kmp_teams_max_nth) {
7941        int new_threads = __kmp_teams_max_nth / num_teams;
7942        if (new_threads == 0) {
7943          new_threads = 1;
7944        }
7945        if (new_threads != num_threads) {
7946          if (!__kmp_reserve_warn) { // user asked for too many threads
7947            __kmp_reserve_warn = 1; // conflicts with KMP_TEAMS_THREAD_LIMIT
7948            __kmp_msg(kmp_ms_warning,
7949                      KMP_MSG(CantFormThrTeam, num_threads, new_threads),
7950                      KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
7951          }
7952        }
7953        num_threads = new_threads;
7954      }
7955    }
7956    thr->th.th_teams_size.nth = num_threads;
7957  }
7958  
7959  /* this sets the requested number of teams for the teams region and/or
7960     the number of threads for the next parallel region encountered  */
7961  void __kmp_push_num_teams(ident_t *id, int gtid, int num_teams,
7962                            int num_threads) {
7963    kmp_info_t *thr = __kmp_threads[gtid];
7964    if (num_teams < 0) {
7965      // OpenMP specification requires requested values to be positive,
7966      // but people can send us any value, so we'd better check
7967      __kmp_msg(kmp_ms_warning, KMP_MSG(NumTeamsNotPositive, num_teams, 1),
7968                __kmp_msg_null);
7969      num_teams = 1;
7970    }
7971    if (num_teams == 0) {
7972      if (__kmp_nteams > 0) {
7973        num_teams = __kmp_nteams;
7974      } else {
7975        num_teams = 1; // default number of teams is 1.
7976      }
7977    }
7978    if (num_teams > __kmp_teams_max_nth) { // too many teams requested?
7979      if (!__kmp_reserve_warn) {
7980        __kmp_reserve_warn = 1;
7981        __kmp_msg(kmp_ms_warning,
7982                  KMP_MSG(CantFormThrTeam, num_teams, __kmp_teams_max_nth),
7983                  KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
7984      }
7985      num_teams = __kmp_teams_max_nth;
7986    }
7987    // Set number of teams (number of threads in the outer "parallel" of the
7988    // teams)
7989    thr->th.th_set_nproc = thr->th.th_teams_size.nteams = num_teams;
7990  
7991    __kmp_push_thread_limit(thr, num_teams, num_threads);
7992  }
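
      // A hedged, illustrative sketch (not part of this file): for
      //   #pragma omp teams num_teams(8) thread_limit(4)
      // the compiler is expected to reach the routine above through the
      // __kmpc_push_num_teams() entry point before forking the league,
      // e.g. (loc, gtid, and outlined_fn are placeholders):
      //   __kmpc_push_num_teams(&loc, gtid, 8, 4); // num_teams=8, limit=4
      //   __kmpc_fork_teams(&loc, 0, outlined_fn);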
7993  
7994  /* This sets the requested number of teams for the teams region and/or
7995     the number of threads for the next parallel region encountered  */
7996  void __kmp_push_num_teams_51(ident_t *id, int gtid, int num_teams_lb,
7997                               int num_teams_ub, int num_threads) {
7998    kmp_info_t *thr = __kmp_threads[gtid];
7999    KMP_DEBUG_ASSERT(num_teams_lb >= 0 && num_teams_ub >= 0);
8000    KMP_DEBUG_ASSERT(num_teams_ub >= num_teams_lb);
8001    KMP_DEBUG_ASSERT(num_threads >= 0);
8002  
8003    if (num_teams_lb > num_teams_ub) {
8004      __kmp_fatal(KMP_MSG(FailedToCreateTeam, num_teams_lb, num_teams_ub),
8005                  KMP_HNT(SetNewBound, __kmp_teams_max_nth), __kmp_msg_null);
8006    }
8007  
8008    int num_teams = 1; // default number of teams is 1.
8009  
8010    if (num_teams_lb == 0 && num_teams_ub > 0)
8011      num_teams_lb = num_teams_ub;
8012  
8013    if (num_teams_lb == 0 && num_teams_ub == 0) { // no num_teams clause
8014      num_teams = (__kmp_nteams > 0) ? __kmp_nteams : num_teams;
8015      if (num_teams > __kmp_teams_max_nth) {
8016        if (!__kmp_reserve_warn) {
8017          __kmp_reserve_warn = 1;
8018          __kmp_msg(kmp_ms_warning,
8019                    KMP_MSG(CantFormThrTeam, num_teams, __kmp_teams_max_nth),
8020                    KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
8021        }
8022        num_teams = __kmp_teams_max_nth;
8023      }
8024    } else if (num_teams_lb == num_teams_ub) { // requires exact number of teams
8025      num_teams = num_teams_ub;
8026    } else { // num_teams_lb <= num_teams <= num_teams_ub
8027      if (num_threads <= 0) {
8028        if (num_teams_ub > __kmp_teams_max_nth) {
8029          num_teams = num_teams_lb;
8030        } else {
8031          num_teams = num_teams_ub;
8032        }
8033      } else {
8034        num_teams = (num_threads > __kmp_teams_max_nth)
8035                        ? num_teams
8036                        : __kmp_teams_max_nth / num_threads;
8037        if (num_teams < num_teams_lb) {
8038          num_teams = num_teams_lb;
8039        } else if (num_teams > num_teams_ub) {
8040          num_teams = num_teams_ub;
8041        }
8042      }
8043    }
8044    // Set number of teams (number of threads in the outer "parallel" of the
8045    // teams)
8046    thr->th.th_set_nproc = thr->th.th_teams_size.nteams = num_teams;
8047  
8048    __kmp_push_thread_limit(thr, num_teams, num_threads);
8049  }
8050  
8051  // Set the proc_bind var to use in the following parallel region.
8052  void __kmp_push_proc_bind(ident_t *id, int gtid, kmp_proc_bind_t proc_bind) {
8053    kmp_info_t *thr = __kmp_threads[gtid];
8054    thr->th.th_set_proc_bind = proc_bind;
8055  }
8056  
8057  /* Launch the worker threads into the microtask. */
8058  
8059  void __kmp_internal_fork(ident_t *id, int gtid, kmp_team_t *team) {
8060    kmp_info_t *this_thr = __kmp_threads[gtid];
8061  
8062  #ifdef KMP_DEBUG
8063    int f;
8064  #endif /* KMP_DEBUG */
8065  
8066    KMP_DEBUG_ASSERT(team);
8067    KMP_DEBUG_ASSERT(this_thr->th.th_team == team);
8068    KMP_ASSERT(KMP_MASTER_GTID(gtid));
8069    KMP_MB(); /* Flush all pending memory write invalidates.  */
8070  
8071    team->t.t_construct = 0; /* no single directives seen yet */
8072    team->t.t_ordered.dt.t_value =
8073        0; /* thread 0 enters the ordered section first */
8074  
8075    /* Reset the identifiers on the dispatch buffer */
8076    KMP_DEBUG_ASSERT(team->t.t_disp_buffer);
8077    if (team->t.t_max_nproc > 1) {
8078      int i;
8079      for (i = 0; i < __kmp_dispatch_num_buffers; ++i) {
8080        team->t.t_disp_buffer[i].buffer_index = i;
8081        team->t.t_disp_buffer[i].doacross_buf_idx = i;
8082      }
8083    } else {
8084      team->t.t_disp_buffer[0].buffer_index = 0;
8085      team->t.t_disp_buffer[0].doacross_buf_idx = 0;
8086    }
8087  
8088    KMP_MB(); /* Flush all pending memory write invalidates.  */
8089    KMP_ASSERT(this_thr->th.th_team == team);
8090  
8091  #ifdef KMP_DEBUG
8092    for (f = 0; f < team->t.t_nproc; f++) {
8093      KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
8094                       team->t.t_threads[f]->th.th_team_nproc == team->t.t_nproc);
8095    }
8096  #endif /* KMP_DEBUG */
8097  
8098    /* release the worker threads so they may begin working */
8099    __kmp_fork_barrier(gtid, 0);
8100  }
8101  
8102  void __kmp_internal_join(ident_t *id, int gtid, kmp_team_t *team) {
8103    kmp_info_t *this_thr = __kmp_threads[gtid];
8104  
8105    KMP_DEBUG_ASSERT(team);
8106    KMP_DEBUG_ASSERT(this_thr->th.th_team == team);
8107    KMP_ASSERT(KMP_MASTER_GTID(gtid));
8108    KMP_MB(); /* Flush all pending memory write invalidates.  */
8109  
8110    /* Join barrier after fork */
8111  
8112  #ifdef KMP_DEBUG
8113    if (__kmp_threads[gtid] &&
8114        __kmp_threads[gtid]->th.th_team_nproc != team->t.t_nproc) {
8115      __kmp_printf("GTID: %d, __kmp_threads[%d]=%p\n", gtid, gtid,
8116                   __kmp_threads[gtid]);
8117      __kmp_printf("__kmp_threads[%d]->th.th_team_nproc=%d, TEAM: %p, "
8118                   "team->t.t_nproc=%d\n",
8119                   gtid, __kmp_threads[gtid]->th.th_team_nproc, team,
8120                   team->t.t_nproc);
8121      __kmp_print_structure();
8122    }
8123    KMP_DEBUG_ASSERT(__kmp_threads[gtid] &&
8124                     __kmp_threads[gtid]->th.th_team_nproc == team->t.t_nproc);
8125  #endif /* KMP_DEBUG */
8126  
8127    __kmp_join_barrier(gtid); /* wait for everyone */
8128  #if OMPT_SUPPORT
8129    ompt_state_t ompt_state = this_thr->th.ompt_thread_info.state;
8130    if (ompt_enabled.enabled &&
8131        (ompt_state == ompt_state_wait_barrier_teams ||
8132         ompt_state == ompt_state_wait_barrier_implicit_parallel)) {
8133      int ds_tid = this_thr->th.th_info.ds.ds_tid;
8134      ompt_data_t *task_data = OMPT_CUR_TASK_DATA(this_thr);
8135      this_thr->th.ompt_thread_info.state = ompt_state_overhead;
8136  #if OMPT_OPTIONAL
8137      void *codeptr = NULL;
8138      if (KMP_MASTER_TID(ds_tid) &&
8139          (ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) ||
8140           ompt_callbacks.ompt_callback(ompt_callback_sync_region)))
8141        codeptr = OMPT_CUR_TEAM_INFO(this_thr)->master_return_address;
8142  
8143      ompt_sync_region_t sync_kind = ompt_sync_region_barrier_implicit_parallel;
8144      if (this_thr->th.ompt_thread_info.parallel_flags & ompt_parallel_league)
8145        sync_kind = ompt_sync_region_barrier_teams;
8146      if (ompt_enabled.ompt_callback_sync_region_wait) {
8147        ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
8148            sync_kind, ompt_scope_end, NULL, task_data, codeptr);
8149      }
8150      if (ompt_enabled.ompt_callback_sync_region) {
8151        ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
8152            sync_kind, ompt_scope_end, NULL, task_data, codeptr);
8153      }
8154  #endif
8155      if (!KMP_MASTER_TID(ds_tid) && ompt_enabled.ompt_callback_implicit_task) {
8156        ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
8157            ompt_scope_end, NULL, task_data, 0, ds_tid,
8158            ompt_task_implicit); // TODO: Can this be ompt_task_initial?
8159      }
8160    }
8161  #endif
8162  
8163    KMP_MB(); /* Flush all pending memory write invalidates.  */
8164    KMP_ASSERT(this_thr->th.th_team == team);
8165  }
8166  
8167  /* ------------------------------------------------------------------------ */
8168  
8169  #ifdef USE_LOAD_BALANCE
8170  
8171  // Return the number of worker threads actively spinning in the hot team, if
8172  // we are at the outermost level of parallelism.  Otherwise, return 0.
8173  static int __kmp_active_hot_team_nproc(kmp_root_t *root) {
8174    int i;
8175    int retval;
8176    kmp_team_t *hot_team;
8177  
8178    if (root->r.r_active) {
8179      return 0;
8180    }
8181    hot_team = root->r.r_hot_team;
8182    if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME) {
8183      return hot_team->t.t_nproc - 1; // Don't count primary thread
8184    }
8185  
8186    // Skip the primary thread - it is accounted for elsewhere.
8187    retval = 0;
8188    for (i = 1; i < hot_team->t.t_nproc; i++) {
8189      if (hot_team->t.t_threads[i]->th.th_active) {
8190        retval++;
8191      }
8192    }
8193    return retval;
8194  }
8195  
8196  // Perform an automatic adjustment to the number of
8197  // threads used by the next parallel region.
8198  static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc) {
8199    int retval;
8200    int pool_active;
8201    int hot_team_active;
8202    int team_curr_active;
8203    int system_active;
8204  
8205    KB_TRACE(20, ("__kmp_load_balance_nproc: called root:%p set_nproc:%d\n", root,
8206                  set_nproc));
8207    KMP_DEBUG_ASSERT(root);
8208    KMP_DEBUG_ASSERT(root->r.r_root_team->t.t_threads[0]
8209                         ->th.th_current_task->td_icvs.dynamic == TRUE);
8210    KMP_DEBUG_ASSERT(set_nproc > 1);
8211  
8212    if (set_nproc == 1) {
8213      KB_TRACE(20, ("__kmp_load_balance_nproc: serial execution.\n"));
8214      return 1;
8215    }
8216  
8217    // Threads that are active in the thread pool, active in the hot team for this
8218    // particular root (if we are at the outer par level), and the currently
8219    // executing thread (to become the primary thread) are available to add to the
8220    // new team, but are currently contributing to the system load, and must be
8221    // accounted for.
8222    pool_active = __kmp_thread_pool_active_nth;
8223    hot_team_active = __kmp_active_hot_team_nproc(root);
8224    team_curr_active = pool_active + hot_team_active + 1;
8225  
8226    // Check the system load.
8227    system_active = __kmp_get_load_balance(__kmp_avail_proc + team_curr_active);
8228    KB_TRACE(30, ("__kmp_load_balance_nproc: system active = %d pool active = %d "
8229                  "hot team active = %d\n",
8230                  system_active, pool_active, hot_team_active));
8231  
8232    if (system_active < 0) {
8233      // There was an error reading the necessary info from /proc, so use the
8234      // thread limit algorithm instead. Once we set __kmp_global.g.g_dynamic_mode
8235      // = dynamic_thread_limit, we shouldn't wind up getting back here.
8236      __kmp_global.g.g_dynamic_mode = dynamic_thread_limit;
8237      KMP_WARNING(CantLoadBalUsing, "KMP_DYNAMIC_MODE=thread limit");
8238  
8239      // Make this call behave like the thread limit algorithm.
8240      retval = __kmp_avail_proc - __kmp_nth +
8241               (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
8242      if (retval > set_nproc) {
8243        retval = set_nproc;
8244      }
8245      if (retval < KMP_MIN_NTH) {
8246        retval = KMP_MIN_NTH;
8247      }
8248  
8249      KB_TRACE(20, ("__kmp_load_balance_nproc: thread limit exit. retval:%d\n",
8250                    retval));
8251      return retval;
8252    }
8253  
8254    // There is a slight delay in the load balance algorithm in detecting new
8255    // running procs. The real system load at this instant should be at least as
8256    // large as the number of active OMP threads available to add to the team.
8257    if (system_active < team_curr_active) {
8258      system_active = team_curr_active;
8259    }
8260    retval = __kmp_avail_proc - system_active + team_curr_active;
8261    if (retval > set_nproc) {
8262      retval = set_nproc;
8263    }
8264    if (retval < KMP_MIN_NTH) {
8265      retval = KMP_MIN_NTH;
8266    }
8267  
8268    KB_TRACE(20, ("__kmp_load_balance_nproc: exit. retval:%d\n", retval));
8269    return retval;
8270  } // __kmp_load_balance_nproc()
8271  
8272  #endif /* USE_LOAD_BALANCE */
8273  
8274  /* ------------------------------------------------------------------------ */
8275  
8276  /* NOTE: this is called with the __kmp_init_lock held */
8277  void __kmp_cleanup(void) {
8278    int f;
8279  
8280    KA_TRACE(10, ("__kmp_cleanup: enter\n"));
8281  
8282    if (TCR_4(__kmp_init_parallel)) {
8283  #if KMP_HANDLE_SIGNALS
8284      __kmp_remove_signals();
8285  #endif
8286      TCW_4(__kmp_init_parallel, FALSE);
8287    }
8288  
8289    if (TCR_4(__kmp_init_middle)) {
8290  #if KMP_AFFINITY_SUPPORTED
8291      __kmp_affinity_uninitialize();
8292  #endif /* KMP_AFFINITY_SUPPORTED */
8293      __kmp_cleanup_hierarchy();
8294      TCW_4(__kmp_init_middle, FALSE);
8295    }
8296  
8297    KA_TRACE(10, ("__kmp_cleanup: go serial cleanup\n"));
8298  
8299    if (__kmp_init_serial) {
8300      __kmp_runtime_destroy();
8301      __kmp_init_serial = FALSE;
8302    }
8303  
8304    __kmp_cleanup_threadprivate_caches();
8305  
8306    for (f = 0; f < __kmp_threads_capacity; f++) {
8307      if (__kmp_root[f] != NULL) {
8308        __kmp_free(__kmp_root[f]);
8309        __kmp_root[f] = NULL;
8310      }
8311    }
8312    __kmp_free(__kmp_threads);
8313    // __kmp_threads and __kmp_root were allocated at once, as a single block, so
8314    // there is no need to free __kmp_root separately.
8315    __kmp_threads = NULL;
8316    __kmp_root = NULL;
8317    __kmp_threads_capacity = 0;
8318  
8319    // Free old __kmp_threads arrays if they exist.
8320    kmp_old_threads_list_t *ptr = __kmp_old_threads_list;
8321    while (ptr) {
8322      kmp_old_threads_list_t *next = ptr->next;
8323      __kmp_free(ptr->threads);
8324      __kmp_free(ptr);
8325      ptr = next;
8326    }
8327  
8328  #if KMP_USE_DYNAMIC_LOCK
8329    __kmp_cleanup_indirect_user_locks();
8330  #else
8331    __kmp_cleanup_user_locks();
8332  #endif
8333  #if OMPD_SUPPORT
8334    if (ompd_state) {
8335      __kmp_free(ompd_env_block);
8336      ompd_env_block = NULL;
8337      ompd_env_block_size = 0;
8338    }
8339  #endif
8340  
8341  #if KMP_AFFINITY_SUPPORTED
8342    KMP_INTERNAL_FREE(CCAST(char *, __kmp_cpuinfo_file));
8343    __kmp_cpuinfo_file = NULL;
8344  #endif /* KMP_AFFINITY_SUPPORTED */
8345  
8346  #if KMP_USE_ADAPTIVE_LOCKS
8347  #if KMP_DEBUG_ADAPTIVE_LOCKS
8348    __kmp_print_speculative_stats();
8349  #endif
8350  #endif
8351    KMP_INTERNAL_FREE(__kmp_nested_nth.nth);
8352    __kmp_nested_nth.nth = NULL;
8353    __kmp_nested_nth.size = 0;
8354    __kmp_nested_nth.used = 0;
8355  
8356    KMP_INTERNAL_FREE(__kmp_nested_proc_bind.bind_types);
8357    __kmp_nested_proc_bind.bind_types = NULL;
8358    __kmp_nested_proc_bind.size = 0;
8359    __kmp_nested_proc_bind.used = 0;
8360    if (__kmp_affinity_format) {
8361      KMP_INTERNAL_FREE(__kmp_affinity_format);
8362      __kmp_affinity_format = NULL;
8363    }
8364  
8365    __kmp_i18n_catclose();
8366  
8367  #if KMP_USE_HIER_SCHED
8368    __kmp_hier_scheds.deallocate();
8369  #endif
8370  
8371  #if KMP_STATS_ENABLED
8372    __kmp_stats_fini();
8373  #endif
8374  
8375    KA_TRACE(10, ("__kmp_cleanup: exit\n"));
8376  }
8377  
8378  /* ------------------------------------------------------------------------ */
8379  
8380  int __kmp_ignore_mppbeg(void) {
8381    char *env;
8382  
8383    if ((env = getenv("KMP_IGNORE_MPPBEG")) != NULL) {
8384      if (__kmp_str_match_false(env))
8385        return FALSE;
8386    }
8387    // By default __kmpc_begin() is no-op.
8388    return TRUE;
8389  }
8390  
8391  int __kmp_ignore_mppend(void) {
8392    char *env;
8393  
8394    if ((env = getenv("KMP_IGNORE_MPPEND")) != NULL) {
8395      if (__kmp_str_match_false(env))
8396        return FALSE;
8397    }
8398    // By default __kmpc_end() is no-op.
8399    return TRUE;
8400  }
8401  
8402  void __kmp_internal_begin(void) {
8403    int gtid;
8404    kmp_root_t *root;
8405  
8406    /* this is a very important step as it will register new sibling threads
8407       and assign these new uber threads a new gtid */
8408    gtid = __kmp_entry_gtid();
8409    root = __kmp_threads[gtid]->th.th_root;
8410    KMP_ASSERT(KMP_UBER_GTID(gtid));
8411  
8412    if (root->r.r_begin)
8413      return;
8414    __kmp_acquire_lock(&root->r.r_begin_lock, gtid);
8415    if (root->r.r_begin) {
8416      __kmp_release_lock(&root->r.r_begin_lock, gtid);
8417      return;
8418    }
8419  
8420    root->r.r_begin = TRUE;
8421  
8422    __kmp_release_lock(&root->r.r_begin_lock, gtid);
8423  }
8424  
8425  /* ------------------------------------------------------------------------ */
8426  
8427  void __kmp_user_set_library(enum library_type arg) {
8428    int gtid;
8429    kmp_root_t *root;
8430    kmp_info_t *thread;
8431  
8432    /* first, make sure we are initialized so we can get our gtid */
8433  
8434    gtid = __kmp_entry_gtid();
8435    thread = __kmp_threads[gtid];
8436  
8437    root = thread->th.th_root;
8438  
8439    KA_TRACE(20, ("__kmp_user_set_library: enter T#%d, arg: %d, %d\n", gtid, arg,
8440                  library_serial));
8441    if (root->r.r_in_parallel) { /* Must be called in serial section of top-level
8442                                    thread */
8443      KMP_WARNING(SetLibraryIncorrectCall);
8444      return;
8445    }
8446  
8447    switch (arg) {
8448    case library_serial:
8449      thread->th.th_set_nproc = 0;
8450      set__nproc(thread, 1);
8451      break;
8452    case library_turnaround:
8453      thread->th.th_set_nproc = 0;
8454      set__nproc(thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth
8455                                             : __kmp_dflt_team_nth_ub);
8456      break;
8457    case library_throughput:
8458      thread->th.th_set_nproc = 0;
8459      set__nproc(thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth
8460                                             : __kmp_dflt_team_nth_ub);
8461      break;
8462    default:
8463      KMP_FATAL(UnknownLibraryType, arg);
8464    }
8465  
8466    __kmp_aux_set_library(arg);
8467  }
8468  
8469  void __kmp_aux_set_stacksize(size_t arg) {
8470    if (!__kmp_init_serial)
8471      __kmp_serial_initialize();
8472  
8473  #if KMP_OS_DARWIN
8474    if (arg & (0x1000 - 1)) {
8475      arg &= ~(0x1000 - 1);
8476      if (arg + 0x1000) /* check for overflow if we round up */
8477        arg += 0x1000;
8478    }
8479  #endif
8480    __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
8481  
8482    /* only change the default stacksize before the first parallel region */
8483    if (!TCR_4(__kmp_init_parallel)) {
8484      size_t value = arg; /* argument is in bytes */
8485  
8486      if (value < __kmp_sys_min_stksize)
8487        value = __kmp_sys_min_stksize;
8488      else if (value > KMP_MAX_STKSIZE)
8489        value = KMP_MAX_STKSIZE;
8490  
8491      __kmp_stksize = value;
8492  
8493      __kmp_env_stksize = TRUE; /* was KMP_STACKSIZE specified? */
8494    }
8495  
8496    __kmp_release_bootstrap_lock(&__kmp_initz_lock);
8497  }
8498  
8499  /* set the behaviour of the runtime library */
8500  /* TODO this can cause some odd behaviour with sibling parallelism... */
8501  void __kmp_aux_set_library(enum library_type arg) {
8502    __kmp_library = arg;
8503  
8504    switch (__kmp_library) {
8505    case library_serial: {
8506      KMP_INFORM(LibraryIsSerial);
8507    } break;
8508    case library_turnaround:
8509      if (__kmp_use_yield == 1 && !__kmp_use_yield_exp_set)
8510        __kmp_use_yield = 2; // only yield when oversubscribed
8511      break;
8512    case library_throughput:
8513      if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME)
8514        __kmp_dflt_blocktime = KMP_DEFAULT_BLOCKTIME;
8515      break;
8516    default:
8517      KMP_FATAL(UnknownLibraryType, arg);
8518    }
8519  }
8520  
8521  /* Getting team information common for all team API */
8522  // Returns NULL if not in teams construct
8523  static kmp_team_t *__kmp_aux_get_team_info(int &teams_serialized) {
8524    kmp_info_t *thr = __kmp_entry_thread();
8525    teams_serialized = 0;
8526    if (thr->th.th_teams_microtask) {
8527      kmp_team_t *team = thr->th.th_team;
8528      int tlevel = thr->th.th_teams_level; // the level of the teams construct
8529      int ii = team->t.t_level;
8530      teams_serialized = team->t.t_serialized;
8531      int level = tlevel + 1;
8532      KMP_DEBUG_ASSERT(ii >= tlevel);
8533      while (ii > level) {
8534        for (teams_serialized = team->t.t_serialized;
8535             (teams_serialized > 0) && (ii > level); teams_serialized--, ii--) {
8536        }
8537        if (team->t.t_serialized && (!teams_serialized)) {
8538          team = team->t.t_parent;
8539          continue;
8540        }
8541        if (ii > level) {
8542          team = team->t.t_parent;
8543          ii--;
8544        }
8545      }
8546      return team;
8547    }
8548    return NULL;
8549  }
8550  
8551  int __kmp_aux_get_team_num() {
8552    int serialized;
8553    kmp_team_t *team = __kmp_aux_get_team_info(serialized);
8554    if (team) {
8555      if (serialized > 1) {
8556        return 0; // teams region is serialized ( 1 team of 1 thread ).
8557      } else {
8558        return team->t.t_master_tid;
8559      }
8560    }
8561    return 0;
8562  }
8563  
8564  int __kmp_aux_get_num_teams() {
8565    int serialized;
8566    kmp_team_t *team = __kmp_aux_get_team_info(serialized);
8567    if (team) {
8568      if (serialized > 1) {
8569        return 1;
8570      } else {
8571        return team->t.t_parent->t.t_nproc;
8572      }
8573    }
8574    return 1;
8575  }
8576  
8577  /* ------------------------------------------------------------------------ */
8578  
8579  /*
8580   * Affinity Format Parser
8581   *
8582   * Field is in form of: %[[[0].]size]type
8583   * % and type are required (%% means print a literal '%')
8584   * type is either single char or long name surrounded by {},
8585   * e.g., N or {num_threads}
8586   * 0 => leading zeros
8587   * . => right justified when size is specified
8588   * by default output is left justified
8589   * size is the *minimum* field length
8590   * All other characters are printed as is
8591   *
8592   * Available field types:
8593   * L {thread_level}      - omp_get_level()
8594   * n {thread_num}        - omp_get_thread_num()
8595   * h {host}              - name of host machine
8596   * P {process_id}        - process id (integer)
8597   * T {thread_identifier} - native thread identifier (integer)
8598   * N {num_threads}       - omp_get_num_threads()
8599   * A {ancestor_tnum}     - omp_get_ancestor_thread_num(omp_get_level()-1)
8600   * a {thread_affinity}   - comma separated list of integers or integer ranges
8601   *                         (values of affinity mask)
8602   *
8603   * Implementation-specific field types can be added
8604   * If a type is unknown, print "undefined"
8605   */
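
      // Illustrative example of the grammar above (hypothetical values; the
      // short names follow __kmp_affinity_format_table below):
      //   format: "OMP: host=%H pid=%P tid=%0.4n mask=%A"
      //   output: "OMP: host=node01 pid=4242 tid=0002 mask=2,3"
      // where %0.4n zero-pads and right-justifies the thread number to a
      // minimum width of 4, and %A expands the affinity mask as a
      // comma-separated list.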
8606  
8607  // Structure holding the short name, long name, and corresponding data type
8608  // for snprintf.  A table of these will represent the entire valid keyword
8609  // field types.
8610  typedef struct kmp_affinity_format_field_t {
8611    char short_name; // from spec e.g., L -> thread level
8612    const char *long_name; // from spec thread_level -> thread level
8613    char field_format; // data type for snprintf (typically 'd' or 's'
8614    // for integer or string)
8615  } kmp_affinity_format_field_t;
8616  
8617  static const kmp_affinity_format_field_t __kmp_affinity_format_table[] = {
8618  #if KMP_AFFINITY_SUPPORTED
8619      {'A', "thread_affinity", 's'},
8620  #endif
8621      {'t', "team_num", 'd'},
8622      {'T', "num_teams", 'd'},
8623      {'L', "nesting_level", 'd'},
8624      {'n', "thread_num", 'd'},
8625      {'N', "num_threads", 'd'},
8626      {'a', "ancestor_tnum", 'd'},
8627      {'H', "host", 's'},
8628      {'P', "process_id", 'd'},
8629      {'i', "native_thread_id", 'd'}};
8630  
8631  // Return the number of characters it takes to hold field
8632  static int __kmp_aux_capture_affinity_field(int gtid, const kmp_info_t *th,
8633                                              const char **ptr,
8634                                              kmp_str_buf_t *field_buffer) {
8635    int rc, format_index, field_value;
8636    const char *width_left, *width_right;
8637    bool pad_zeros, right_justify, parse_long_name, found_valid_name;
8638    static const int FORMAT_SIZE = 20;
8639    char format[FORMAT_SIZE] = {0};
8640    char absolute_short_name = 0;
8641  
8642    KMP_DEBUG_ASSERT(gtid >= 0);
8643    KMP_DEBUG_ASSERT(th);
8644    KMP_DEBUG_ASSERT(**ptr == '%');
8645    KMP_DEBUG_ASSERT(field_buffer);
8646  
8647    __kmp_str_buf_clear(field_buffer);
8648  
8649    // Skip the initial %
8650    (*ptr)++;
8651  
8652    // Check for %% first
8653    if (**ptr == '%') {
8654      __kmp_str_buf_cat(field_buffer, "%", 1);
8655      (*ptr)++; // skip over the second %
8656      return 1;
8657    }
8658  
8659    // Parse field modifiers if they are present
8660    pad_zeros = false;
8661    if (**ptr == '0') {
8662      pad_zeros = true;
8663      (*ptr)++; // skip over 0
8664    }
8665    right_justify = false;
8666    if (**ptr == '.') {
8667      right_justify = true;
8668      (*ptr)++; // skip over .
8669    }
8670    // Parse width of field: [width_left, width_right)
8671    width_left = width_right = NULL;
8672    if (**ptr >= '0' && **ptr <= '9') {
8673      width_left = *ptr;
8674      SKIP_DIGITS(*ptr);
8675      width_right = *ptr;
8676    }
8677  
8678    // Create the format for KMP_SNPRINTF based on flags parsed above
8679    format_index = 0;
8680    format[format_index++] = '%';
8681    if (!right_justify)
8682      format[format_index++] = '-';
8683    if (pad_zeros)
8684      format[format_index++] = '0';
8685    if (width_left && width_right) {
8686      int i = 0;
8687      // Only allow 8 digit number widths.
8688      // This also prevents overflowing format variable
8689      while (i < 8 && width_left < width_right) {
8690        format[format_index++] = *width_left;
8691        width_left++;
8692        i++;
8693      }
8694    }
8695  
8696    // Parse a name (long or short)
8697    // Canonicalize the name into absolute_short_name
8698    found_valid_name = false;
8699    parse_long_name = (**ptr == '{');
8700    if (parse_long_name)
8701      (*ptr)++; // skip initial left brace
8702    for (size_t i = 0; i < sizeof(__kmp_affinity_format_table) /
8703                               sizeof(__kmp_affinity_format_table[0]);
8704         ++i) {
8705      char short_name = __kmp_affinity_format_table[i].short_name;
8706      const char *long_name = __kmp_affinity_format_table[i].long_name;
8707      char field_format = __kmp_affinity_format_table[i].field_format;
8708      if (parse_long_name) {
8709        size_t length = KMP_STRLEN(long_name);
8710        if (strncmp(*ptr, long_name, length) == 0) {
8711          found_valid_name = true;
8712          (*ptr) += length; // skip the long name
8713        }
8714      } else if (**ptr == short_name) {
8715        found_valid_name = true;
8716        (*ptr)++; // skip the short name
8717      }
8718      if (found_valid_name) {
8719        format[format_index++] = field_format;
8720        format[format_index++] = '\0';
8721        absolute_short_name = short_name;
8722        break;
8723      }
8724    }
8725    if (parse_long_name) {
8726      if (**ptr != '}') {
8727        absolute_short_name = 0;
8728      } else {
8729        (*ptr)++; // skip over the right brace
8730      }
8731    }
8732  
8733    // Attempt to fill the buffer with the requested
8734    // value using snprintf within __kmp_str_buf_print()
8735    switch (absolute_short_name) {
8736    case 't':
8737      rc = __kmp_str_buf_print(field_buffer, format, __kmp_aux_get_team_num());
8738      break;
8739    case 'T':
8740      rc = __kmp_str_buf_print(field_buffer, format, __kmp_aux_get_num_teams());
8741      break;
8742    case 'L':
8743      rc = __kmp_str_buf_print(field_buffer, format, th->th.th_team->t.t_level);
8744      break;
8745    case 'n':
8746      rc = __kmp_str_buf_print(field_buffer, format, __kmp_tid_from_gtid(gtid));
8747      break;
8748    case 'H': {
8749      static const int BUFFER_SIZE = 256;
8750      char buf[BUFFER_SIZE];
8751      __kmp_expand_host_name(buf, BUFFER_SIZE);
8752      rc = __kmp_str_buf_print(field_buffer, format, buf);
8753    } break;
8754    case 'P':
8755      rc = __kmp_str_buf_print(field_buffer, format, getpid());
8756      break;
8757    case 'i':
8758      rc = __kmp_str_buf_print(field_buffer, format, __kmp_gettid());
8759      break;
8760    case 'N':
8761      rc = __kmp_str_buf_print(field_buffer, format, th->th.th_team->t.t_nproc);
8762      break;
8763    case 'a':
8764      field_value =
8765          __kmp_get_ancestor_thread_num(gtid, th->th.th_team->t.t_level - 1);
8766      rc = __kmp_str_buf_print(field_buffer, format, field_value);
8767      break;
8768  #if KMP_AFFINITY_SUPPORTED
8769    case 'A': {
8770      kmp_str_buf_t buf;
8771      __kmp_str_buf_init(&buf);
8772      __kmp_affinity_str_buf_mask(&buf, th->th.th_affin_mask);
8773      rc = __kmp_str_buf_print(field_buffer, format, buf.str);
8774      __kmp_str_buf_free(&buf);
8775    } break;
8776  #endif
8777    default:
8778      // According to the spec, if an implementation does not have info for a field
8779      // type, then "undefined" is printed
8780      rc = __kmp_str_buf_print(field_buffer, "%s", "undefined");
8781      // Skip the field
8782      if (parse_long_name) {
8783        SKIP_TOKEN(*ptr);
8784        if (**ptr == '}')
8785          (*ptr)++;
8786      } else {
8787        (*ptr)++;
8788      }
8789    }
8790  
8791    KMP_ASSERT(format_index <= FORMAT_SIZE);
8792    return rc;
8793  }
8794  
8795  /*
8796   * Return number of characters needed to hold the affinity string
8797   * (not including null byte character)
8798   * The resultant string is printed to buffer, which the caller can then
8799   * handle afterwards
8800   */
8801  size_t __kmp_aux_capture_affinity(int gtid, const char *format,
8802                                    kmp_str_buf_t *buffer) {
8803    const char *parse_ptr;
8804    size_t retval;
8805    const kmp_info_t *th;
8806    kmp_str_buf_t field;
8807  
8808    KMP_DEBUG_ASSERT(buffer);
8809    KMP_DEBUG_ASSERT(gtid >= 0);
8810  
8811    __kmp_str_buf_init(&field);
8812    __kmp_str_buf_clear(buffer);
8813  
8814    th = __kmp_threads[gtid];
8815    retval = 0;
8816  
8817    // If format is NULL or zero-length string, then we use
8818    // affinity-format-var ICV
8819    parse_ptr = format;
8820    if (parse_ptr == NULL || *parse_ptr == '\0') {
8821      parse_ptr = __kmp_affinity_format;
8822    }
8823    KMP_DEBUG_ASSERT(parse_ptr);
8824  
8825    while (*parse_ptr != '\0') {
8826      // Parse a field
8827      if (*parse_ptr == '%') {
8828        // Put field in the buffer
8829        int rc = __kmp_aux_capture_affinity_field(gtid, th, &parse_ptr, &field);
8830        __kmp_str_buf_catbuf(buffer, &field);
8831        retval += rc;
8832      } else {
8833        // Put literal character in buffer
8834        __kmp_str_buf_cat(buffer, parse_ptr, 1);
8835        retval++;
8836        parse_ptr++;
8837      }
8838    }
8839    __kmp_str_buf_free(&field);
8840    return retval;
8841  }
8842  
8843  // Displays the affinity string to stdout
8844  void __kmp_aux_display_affinity(int gtid, const char *format) {
8845    kmp_str_buf_t buf;
8846    __kmp_str_buf_init(&buf);
8847    __kmp_aux_capture_affinity(gtid, format, &buf);
8848    __kmp_fprintf(kmp_out, "%s" KMP_END_OF_LINE, buf.str);
8849    __kmp_str_buf_free(&buf);
8850  }
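
      // A minimal usage sketch, assuming the standard OpenMP 5.0 API entry
      // points that ultimately reach the two routines above; illustrative
      // only, not compiled as part of this file:
      //
      //   #include <omp.h>
      //   #include <stdio.h>
      //   int main(void) {
      //   #pragma omp parallel
      //     {
      //       char buf[256];
      //       // Render this thread's affinity with an explicit format string.
      //       omp_capture_affinity(buf, sizeof(buf), "host=%H tid=%0.4n mask=%A");
      //       printf("%s\n", buf);
      //       // Or print one line per thread using the affinity-format-var ICV.
      //       omp_display_affinity(NULL);
      //     }
      //     return 0;
      //   }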
8851  
8852  /* ------------------------------------------------------------------------ */
8853  void __kmp_aux_set_blocktime(int arg, kmp_info_t *thread, int tid) {
8854    int blocktime = arg; /* argument is in microseconds */
8855  #if KMP_USE_MONITOR
8856    int bt_intervals;
8857  #endif
8858    kmp_int8 bt_set;
8859  
8860    __kmp_save_internal_controls(thread);
8861  
8862    /* Normalize and set blocktime for the teams */
8863    if (blocktime < KMP_MIN_BLOCKTIME)
8864      blocktime = KMP_MIN_BLOCKTIME;
8865    else if (blocktime > KMP_MAX_BLOCKTIME)
8866      blocktime = KMP_MAX_BLOCKTIME;
8867  
8868    set__blocktime_team(thread->th.th_team, tid, blocktime);
8869    set__blocktime_team(thread->th.th_serial_team, 0, blocktime);
8870  
8871  #if KMP_USE_MONITOR
8872    /* Calculate and set blocktime intervals for the teams */
8873    bt_intervals = KMP_INTERVALS_FROM_BLOCKTIME(blocktime, __kmp_monitor_wakeups);
8874  
8875    set__bt_intervals_team(thread->th.th_team, tid, bt_intervals);
8876    set__bt_intervals_team(thread->th.th_serial_team, 0, bt_intervals);
8877  #endif
8878  
8879    /* Set whether blocktime has been set to "TRUE" */
8880    bt_set = TRUE;
8881  
8882    set__bt_set_team(thread->th.th_team, tid, bt_set);
8883    set__bt_set_team(thread->th.th_serial_team, 0, bt_set);
8884  #if KMP_USE_MONITOR
8885    KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d, "
8886                  "bt_intervals=%d, monitor_updates=%d\n",
8887                  __kmp_gtid_from_tid(tid, thread->th.th_team),
8888                  thread->th.th_team->t.t_id, tid, blocktime, bt_intervals,
8889                  __kmp_monitor_wakeups));
8890  #else
8891    KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d\n",
8892                  __kmp_gtid_from_tid(tid, thread->th.th_team),
8893                  thread->th.th_team->t.t_id, tid, blocktime));
8894  #endif
8895  }
8896  
8897  void __kmp_aux_set_defaults(char const *str, size_t len) {
8898    if (!__kmp_init_serial) {
8899      __kmp_serial_initialize();
8900    }
8901    __kmp_env_initialize(str);
8902  
8903    if (__kmp_settings || __kmp_display_env || __kmp_display_env_verbose) {
8904      __kmp_env_print();
8905    }
8906  } // __kmp_aux_set_defaults
8907  
8908  /* ------------------------------------------------------------------------ */
8909  /* internal fast reduction routines */
8910  
8911  PACKED_REDUCTION_METHOD_T
8912  __kmp_determine_reduction_method(
8913      ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars, size_t reduce_size,
8914      void *reduce_data, void (*reduce_func)(void *lhs_data, void *rhs_data),
8915      kmp_critical_name *lck) {
8916  
8917    // Default reduction method: critical construct ( lck != NULL, like in current
8918    // PAROPT )
8919    // If ( reduce_data!=NULL && reduce_func!=NULL ): the tree-reduction method
8920    // can be selected by RTL
8921    // If loc->flags contains KMP_IDENT_ATOMIC_REDUCE, the atomic reduce method
8922    // can be selected by RTL
8923    // Finally, it's up to OpenMP RTL to make a decision on which method to select
8924    // among generated by PAROPT.
8925  
8926    PACKED_REDUCTION_METHOD_T retval;
8927  
8928    int team_size;
8929  
8930    KMP_DEBUG_ASSERT(lck); // it would be nice to test ( lck != 0 )
8931  
8932  #define FAST_REDUCTION_ATOMIC_METHOD_GENERATED                                 \
8933    (loc &&                                                                      \
8934     ((loc->flags & (KMP_IDENT_ATOMIC_REDUCE)) == (KMP_IDENT_ATOMIC_REDUCE)))
8935  #define FAST_REDUCTION_TREE_METHOD_GENERATED ((reduce_data) && (reduce_func))
8936  
8937    retval = critical_reduce_block;
8938  
8939    // another choice of getting a team size (with 1 dynamic dereference) is slower
8940    team_size = __kmp_get_team_num_threads(global_tid);
8941    if (team_size == 1) {
8942  
8943      retval = empty_reduce_block;
8944  
8945    } else {
8946  
8947      int atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;
8948  
8949  #if KMP_ARCH_X86_64 || KMP_ARCH_PPC64 || KMP_ARCH_AARCH64 ||                   \
8950      KMP_ARCH_MIPS64 || KMP_ARCH_RISCV64 || KMP_ARCH_LOONGARCH64 ||             \
8951      KMP_ARCH_VE || KMP_ARCH_S390X || KMP_ARCH_WASM
8952  
8953  #if KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD ||     \
8954      KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN || KMP_OS_HURD ||        \
8955      KMP_OS_SOLARIS || KMP_OS_WASI || KMP_OS_AIX
8956  
8957      int teamsize_cutoff = 4;
8958  
8959  #if KMP_MIC_SUPPORTED
8960      if (__kmp_mic_type != non_mic) {
8961        teamsize_cutoff = 8;
8962      }
8963  #endif
8964      int tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
8965      if (tree_available) {
8966        if (team_size <= teamsize_cutoff) {
8967          if (atomic_available) {
8968            retval = atomic_reduce_block;
8969          }
8970        } else {
8971          retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER;
8972        }
8973      } else if (atomic_available) {
8974        retval = atomic_reduce_block;
8975      }
8976  #else
8977  #error "Unknown or unsupported OS"
8978  #endif // KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD ||
8979         // KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN || KMP_OS_HURD ||
8980         // KMP_OS_SOLARIS || KMP_OS_WASI || KMP_OS_AIX
8981  
8982  #elif KMP_ARCH_X86 || KMP_ARCH_ARM || KMP_ARCH_AARCH || KMP_ARCH_MIPS ||       \
8983      KMP_ARCH_WASM || KMP_ARCH_PPC || KMP_ARCH_AARCH64_32
8984  
8985  #if KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD ||     \
8986      KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_HURD || KMP_OS_SOLARIS ||       \
8987      KMP_OS_WASI || KMP_OS_AIX
8988  
8989      // basic tuning
8990  
8991      if (atomic_available) {
8992        if (num_vars <= 2) { // && ( team_size <= 8 ) due to false-sharing ???
8993          retval = atomic_reduce_block;
8994        }
8995      } // otherwise: use critical section
8996  
8997  #elif KMP_OS_DARWIN
8998  
8999      int tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
9000      if (atomic_available && (num_vars <= 3)) {
9001        retval = atomic_reduce_block;
9002      } else if (tree_available) {
9003        if ((reduce_size > (9 * sizeof(kmp_real64))) &&
9004            (reduce_size < (2000 * sizeof(kmp_real64)))) {
9005          retval = TREE_REDUCE_BLOCK_WITH_PLAIN_BARRIER;
9006        }
9007      } // otherwise: use critical section
9008  
9009  #else
9010  #error "Unknown or unsupported OS"
9011  #endif
9012  
9013  #else
9014  #error "Unknown or unsupported architecture"
9015  #endif
9016    }
9017  
9018    // KMP_FORCE_REDUCTION
9019  
9020    // If the team is serialized (team_size == 1), ignore the forced reduction
9021    // method and stay with the unsynchronized method (empty_reduce_block)
9022    if (__kmp_force_reduction_method != reduction_method_not_defined &&
9023        team_size != 1) {
9024  
9025      PACKED_REDUCTION_METHOD_T forced_retval = critical_reduce_block;
9026  
9027      int atomic_available, tree_available;
9028  
9029      switch ((forced_retval = __kmp_force_reduction_method)) {
9030      case critical_reduce_block:
9031        KMP_ASSERT(lck); // lck should be != 0
9032        break;
9033  
9034      case atomic_reduce_block:
9035        atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;
9036        if (!atomic_available) {
9037          KMP_WARNING(RedMethodNotSupported, "atomic");
9038          forced_retval = critical_reduce_block;
9039        }
9040        break;
9041  
9042      case tree_reduce_block:
9043        tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
9044        if (!tree_available) {
9045          KMP_WARNING(RedMethodNotSupported, "tree");
9046          forced_retval = critical_reduce_block;
9047        } else {
9048  #if KMP_FAST_REDUCTION_BARRIER
9049          forced_retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER;
9050  #endif
9051        }
9052        break;
9053  
9054      default:
9055        KMP_ASSERT(0); // "unsupported method specified"
9056      }
9057  
9058      retval = forced_retval;
9059    }
9060  
9061    KA_TRACE(10, ("reduction method selected=%08x\n", retval));
9062  
9063  #undef FAST_REDUCTION_TREE_METHOD_GENERATED
9064  #undef FAST_REDUCTION_ATOMIC_METHOD_GENERATED
9065  
9066    return (retval);
9067  }
9068  // this function is for testing set/get/determine reduce method
9069  kmp_int32 __kmp_get_reduce_method(void) {
9070    return ((__kmp_entry_thread()->th.th_local.packed_reduction_method) >> 8);
9071  }
9072  
9073  // Soft pause sets up threads to ignore blocktime and just go to sleep.
9074  // Spin-wait code checks __kmp_pause_status and reacts accordingly.
9075  void __kmp_soft_pause() { __kmp_pause_status = kmp_soft_paused; }
9076  
9077  // Hard pause shuts down the runtime completely.  Resume happens naturally when
9078  // OpenMP is used subsequently.
9079  void __kmp_hard_pause() {
9080    __kmp_pause_status = kmp_hard_paused;
9081    __kmp_internal_end_thread(-1);
9082  }
9083  
9084  // Soft resume sets __kmp_pause_status, and wakes up all threads.
9085  void __kmp_resume_if_soft_paused() {
9086    if (__kmp_pause_status == kmp_soft_paused) {
9087      __kmp_pause_status = kmp_not_paused;
9088  
9089      for (int gtid = 1; gtid < __kmp_threads_capacity; ++gtid) {
9090        kmp_info_t *thread = __kmp_threads[gtid];
9091        if (thread) { // Wake it if sleeping
9092          kmp_flag_64<> fl(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go,
9093                           thread);
9094          if (fl.is_sleeping())
9095            fl.resume(gtid);
9096          else if (__kmp_try_suspend_mx(thread)) { // got suspend lock
9097            __kmp_unlock_suspend_mx(thread); // unlock it; it won't sleep
9098          } else { // thread holds the lock and may sleep soon
9099            do { // until either the thread sleeps, or we can get the lock
9100              if (fl.is_sleeping()) {
9101                fl.resume(gtid);
9102                break;
9103              } else if (__kmp_try_suspend_mx(thread)) {
9104                __kmp_unlock_suspend_mx(thread);
9105                break;
9106              }
9107            } while (1);
9108          }
9109        }
9110      }
9111    }
9112  }
9113  
9114  // This function is called via __kmpc_pause_resource. Returns 0 if successful.
9115  // TODO: add warning messages
9116  int __kmp_pause_resource(kmp_pause_status_t level) {
9117    if (level == kmp_not_paused) { // requesting resume
9118      if (__kmp_pause_status == kmp_not_paused) {
9119        // error message about runtime not being paused, so can't resume
9120        return 1;
9121      } else {
9122        KMP_DEBUG_ASSERT(__kmp_pause_status == kmp_soft_paused ||
9123                         __kmp_pause_status == kmp_hard_paused);
9124        __kmp_pause_status = kmp_not_paused;
9125        return 0;
9126      }
9127    } else if (level == kmp_soft_paused) { // requesting soft pause
9128      if (__kmp_pause_status != kmp_not_paused) {
9129        // error message about already being paused
9130        return 1;
9131      } else {
9132        __kmp_soft_pause();
9133        return 0;
9134      }
9135    } else if (level == kmp_hard_paused) { // requesting hard pause
9136      if (__kmp_pause_status != kmp_not_paused) {
9137        // error message about already being paused
9138        return 1;
9139      } else {
9140        __kmp_hard_pause();
9141        return 0;
9142      }
9143    } else {
9144      // error message about invalid level
9145      return 1;
9146    }
9147  }
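
      // A hedged usage sketch of the user-facing path (illustrative only):
      // the OpenMP 5.0 omp_pause_resource()/omp_pause_resource_all() entry
      // points are expected to land in __kmp_pause_resource() above, e.g.
      //
      //   #include <omp.h>
      //   // Returns 0 on success, nonzero if the request was rejected.
      //   if (omp_pause_resource_all(omp_pause_soft) == 0) {
      //     // Runtime threads go to sleep; the next parallel region resumes
      //     // them transparently.
      //   }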
9148  
9149  void __kmp_omp_display_env(int verbose) {
9150    __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
9151    if (__kmp_init_serial == 0)
9152      __kmp_do_serial_initialize();
9153    __kmp_display_env_impl(!verbose, verbose);
9154    __kmp_release_bootstrap_lock(&__kmp_initz_lock);
9155  }
9156  
9157  // The team size is changing, so distributed barrier must be modified
9158  void __kmp_resize_dist_barrier(kmp_team_t *team, int old_nthreads,
9159                                 int new_nthreads) {
9160    KMP_DEBUG_ASSERT(__kmp_barrier_release_pattern[bs_forkjoin_barrier] ==
9161                     bp_dist_bar);
9162    kmp_info_t **other_threads = team->t.t_threads;
9163  
9164    // We want all the workers to stop waiting on the barrier while we adjust the
9165    // size of the team.
9166    for (int f = 1; f < old_nthreads; ++f) {
9167      KMP_DEBUG_ASSERT(other_threads[f] != NULL);
9168      // Ignore threads that are already inactive or not present in the team
9169      if (team->t.t_threads[f]->th.th_used_in_team.load() == 0) {
9170        // teams construct causes thread_limit to get passed in, and some of
9171        // those could be inactive; just ignore them
9172        continue;
9173      }
9174      // If thread is transitioning still to in_use state, wait for it
9175      if (team->t.t_threads[f]->th.th_used_in_team.load() == 3) {
9176        while (team->t.t_threads[f]->th.th_used_in_team.load() == 3)
9177          KMP_CPU_PAUSE();
9178      }
9179      // The thread should be in_use now
9180      KMP_DEBUG_ASSERT(team->t.t_threads[f]->th.th_used_in_team.load() == 1);
9181      // Transition to unused state
9182      team->t.t_threads[f]->th.th_used_in_team.store(2);
9183      KMP_DEBUG_ASSERT(team->t.t_threads[f]->th.th_used_in_team.load() == 2);
9184    }
9185    // Release all the workers
9186    team->t.b->go_release();
9187  
9188    KMP_MFENCE();
9189  
9190    // Workers should see transition status 2 and move to 0; but may need to be
9191    // woken up first
9192    int count = old_nthreads - 1;
9193    while (count > 0) {
9194      count = old_nthreads - 1;
9195      for (int f = 1; f < old_nthreads; ++f) {
9196        if (other_threads[f]->th.th_used_in_team.load() != 0) {
9197          if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) { // Wake up the workers
9198            kmp_atomic_flag_64<> *flag = (kmp_atomic_flag_64<> *)CCAST(
9199                void *, other_threads[f]->th.th_sleep_loc);
9200            __kmp_atomic_resume_64(other_threads[f]->th.th_info.ds.ds_gtid, flag);
9201          }
9202        } else {
9203          KMP_DEBUG_ASSERT(team->t.t_threads[f]->th.th_used_in_team.load() == 0);
9204          count--;
9205        }
9206      }
9207    }
9208    // Now update the barrier size
9209    team->t.b->update_num_threads(new_nthreads);
9210    team->t.b->go_reset();
9211  }
9212  
9213  void __kmp_add_threads_to_team(kmp_team_t *team, int new_nthreads) {
9214    // Add the threads back to the team
9215    KMP_DEBUG_ASSERT(team);
9216    // Threads were paused and pointed at th_used_in_team temporarily during a
9217    // resize of the team. We're going to set th_used_in_team to 3 to indicate to
9218    // the thread that it should transition itself back into the team. Then, if
9219    // blocktime isn't infinite, the thread could be sleeping, so we send a resume
9220    // to wake it up.
9221    for (int f = 1; f < new_nthreads; ++f) {
9222      KMP_DEBUG_ASSERT(team->t.t_threads[f]);
9223      KMP_COMPARE_AND_STORE_ACQ32(&(team->t.t_threads[f]->th.th_used_in_team), 0,
9224                                  3);
9225      if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) { // Wake up sleeping threads
9226        __kmp_resume_32(team->t.t_threads[f]->th.th_info.ds.ds_gtid,
9227                        (kmp_flag_32<false, false> *)NULL);
9228      }
9229    }
9230    // The threads should be transitioning to the team; when they are done, they
9231    // should have set th_used_in_team to 1. This loop forces the master to wait
9232    // until all threads have moved into the team and are waiting in the barrier.
9233    int count = new_nthreads - 1;
9234    while (count > 0) {
9235      count = new_nthreads - 1;
9236      for (int f = 1; f < new_nthreads; ++f) {
9237        if (team->t.t_threads[f]->th.th_used_in_team.load() == 1) {
9238          count--;
9239        }
9240      }
9241    }
9242  }
9243  
9244  // Globals and functions for hidden helper task
9245  kmp_info_t **__kmp_hidden_helper_threads;
9246  kmp_info_t *__kmp_hidden_helper_main_thread;
9247  std::atomic<kmp_int32> __kmp_unexecuted_hidden_helper_tasks;
9248  #if KMP_OS_LINUX
9249  kmp_int32 __kmp_hidden_helper_threads_num = 8;
9250  kmp_int32 __kmp_enable_hidden_helper = TRUE;
9251  #else
9252  kmp_int32 __kmp_hidden_helper_threads_num = 0;
9253  kmp_int32 __kmp_enable_hidden_helper = FALSE;
9254  #endif
9255  
9256  namespace {
9257  std::atomic<kmp_int32> __kmp_hit_hidden_helper_threads_num;
9258  
9259  void __kmp_hidden_helper_wrapper_fn(int *gtid, int *, ...) {
9260    // This is an explicit synchronization on all hidden helper threads, in case
9261    // a regular thread pushes a hidden helper task to a hidden helper thread
9262    // that has not yet been awakened since being released by the main thread
9263    // after creating the team.
9264    KMP_ATOMIC_INC(&__kmp_hit_hidden_helper_threads_num);
9265    while (KMP_ATOMIC_LD_ACQ(&__kmp_hit_hidden_helper_threads_num) !=
9266           __kmp_hidden_helper_threads_num)
9267      ;
9268  
9269    // If main thread, then wait for signal
9270    if (__kmpc_master(nullptr, *gtid)) {
9271      // First, unset the initial state and release the initial thread
9272      TCW_4(__kmp_init_hidden_helper_threads, FALSE);
9273      __kmp_hidden_helper_initz_release();
9274      __kmp_hidden_helper_main_thread_wait();
9275      // Now wake up all worker threads
9276      for (int i = 1; i < __kmp_hit_hidden_helper_threads_num; ++i) {
9277        __kmp_hidden_helper_worker_thread_signal();
9278      }
9279    }
9280  }
9281  } // namespace
9282  
9283  void __kmp_hidden_helper_threads_initz_routine() {
9284    // Create a new root for hidden helper team/threads
9285    const int gtid = __kmp_register_root(TRUE);
9286    __kmp_hidden_helper_main_thread = __kmp_threads[gtid];
9287    __kmp_hidden_helper_threads = &__kmp_threads[gtid];
9288    __kmp_hidden_helper_main_thread->th.th_set_nproc =
9289        __kmp_hidden_helper_threads_num;
9290  
9291    KMP_ATOMIC_ST_REL(&__kmp_hit_hidden_helper_threads_num, 0);
9292  
9293    __kmpc_fork_call(nullptr, 0, __kmp_hidden_helper_wrapper_fn);
9294  
9295    // Set the initialization flag to FALSE
9296    TCW_SYNC_4(__kmp_init_hidden_helper, FALSE);
9297  
9298    __kmp_hidden_helper_threads_deinitz_release();
9299  }
9300  
9301  /* Nesting Mode:
9302     Set via KMP_NESTING_MODE, which takes an integer.
9303     Note: we skip duplicate topology levels, and skip levels with only
9304        one entity.
9305     KMP_NESTING_MODE=0 is the default, and doesn't use nesting mode.
9306     KMP_NESTING_MODE=1 sets as many nesting levels as there are distinct levels
9307        in the topology, and initializes the number of threads at each of those
9308        levels to the number of entities at each level, respectively, below the
9309        entity at the parent level.
9310     KMP_NESTING_MODE=N, where N>1, attempts to create up to N nesting levels,
9311        but starts with nesting OFF -- max-active-levels-var is 1 -- and requires
9312        the user to turn nesting on explicitly. This is an even more experimental
9313        variant of an already experimental feature, and may change or go away in the
9314        future.
9315  */
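
/* Example (hypothetical machine with 2 sockets, 8 cores per socket, and 2
   hardware threads per core): KMP_NESTING_MODE=1 creates three nesting levels
   with the per-level thread counts 2, 8, and 2 -- one level per distinct
   topology layer, each sized to the number of entities of that layer under a
   single entity of the layer above. */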
9316  
9317  // Allocate space to store nesting levels
9318  void __kmp_init_nesting_mode() {
9319    int levels = KMP_HW_LAST;
9320    __kmp_nesting_mode_nlevels = levels;
9321    __kmp_nesting_nth_level = (int *)KMP_INTERNAL_MALLOC(levels * sizeof(int));
9322    for (int i = 0; i < levels; ++i)
9323      __kmp_nesting_nth_level[i] = 0;
9324    if (__kmp_nested_nth.size < levels) {
9325      __kmp_nested_nth.nth =
9326          (int *)KMP_INTERNAL_REALLOC(__kmp_nested_nth.nth, levels * sizeof(int));
9327      __kmp_nested_nth.size = levels;
9328    }
9329  }
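
// Note: KMP_HW_LAST is the count of hardware topology layer kinds known to the
// runtime, so both arrays above are sized for the maximum possible depth; the
// number of levels actually used is trimmed later in
// __kmp_set_nesting_mode_threads().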
9330  
9331  // Set # threads for top levels of nesting; must be called after topology is set
9332  void __kmp_set_nesting_mode_threads() {
9333    kmp_info_t *thread = __kmp_threads[__kmp_entry_gtid()];
9334  
9335    if (__kmp_nesting_mode == 1)
9336      __kmp_nesting_mode_nlevels = KMP_MAX_ACTIVE_LEVELS_LIMIT;
9337    else if (__kmp_nesting_mode > 1)
9338      __kmp_nesting_mode_nlevels = __kmp_nesting_mode;
9339  
9340    if (__kmp_topology) { // use topology info
9341      int loc, hw_level;
9342      for (loc = 0, hw_level = 0; hw_level < __kmp_topology->get_depth() &&
9343                                  loc < __kmp_nesting_mode_nlevels;
9344           loc++, hw_level++) {
9345        __kmp_nesting_nth_level[loc] = __kmp_topology->get_ratio(hw_level);
9346        if (__kmp_nesting_nth_level[loc] == 1)
9347          loc--;
9348      }
9349      // Make sure all cores are used
9350      if (__kmp_nesting_mode > 1 && loc > 1) {
9351        int core_level = __kmp_topology->get_level(KMP_HW_CORE);
9352        int num_cores = __kmp_topology->get_count(core_level);
9353        int upper_levels = 1;
9354        for (int level = 0; level < loc - 1; ++level)
9355          upper_levels *= __kmp_nesting_nth_level[level];
9356        if (upper_levels * __kmp_nesting_nth_level[loc - 1] < num_cores)
9357          __kmp_nesting_nth_level[loc - 1] =
9358              num_cores / __kmp_nesting_nth_level[loc - 2];
9359      }
9360      __kmp_nesting_mode_nlevels = loc;
9361      __kmp_nested_nth.used = __kmp_nesting_mode_nlevels;
9362    } else { // no topology info available; provide a reasonable guesstimation
9363      if (__kmp_avail_proc >= 4) {
9364        __kmp_nesting_nth_level[0] = __kmp_avail_proc / 2;
9365        __kmp_nesting_nth_level[1] = 2;
9366        __kmp_nesting_mode_nlevels = 2;
9367      } else {
9368        __kmp_nesting_nth_level[0] = __kmp_avail_proc;
9369        __kmp_nesting_mode_nlevels = 1;
9370      }
9371      __kmp_nested_nth.used = __kmp_nesting_mode_nlevels;
9372    }
9373    for (int i = 0; i < __kmp_nesting_mode_nlevels; ++i) {
9374      __kmp_nested_nth.nth[i] = __kmp_nesting_nth_level[i];
9375    }
9376    set__nproc(thread, __kmp_nesting_nth_level[0]);
9377    if (__kmp_nesting_mode > 1 && __kmp_nesting_mode_nlevels > __kmp_nesting_mode)
9378      __kmp_nesting_mode_nlevels = __kmp_nesting_mode;
9379    if (get__max_active_levels(thread) > 1) {
9380      // if max levels was set, set nesting mode levels to same
9381      __kmp_nesting_mode_nlevels = get__max_active_levels(thread);
9382    }
9383    if (__kmp_nesting_mode == 1) // turn on nesting for this case only
9384      set__max_active_levels(thread, __kmp_nesting_mode_nlevels);
9385  }
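
// Worked example of the core-coverage adjustment above (hypothetical numbers,
// KMP_NESTING_MODE > 1): with num_cores == 16 and nesting levels {2, 4}
// (loc == 2), upper_levels becomes 2, and since 2 * 4 == 8 < 16 the last level
// is rescaled to num_cores / __kmp_nesting_nth_level[0] == 16 / 2 == 8, giving
// {2, 8} so that the product of the levels covers all cores.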
9386  
9387  // Empty symbols to export (see exports_so.txt) when feature is disabled
9388  extern "C" {
9389  #if !KMP_STATS_ENABLED
9390  void __kmp_reset_stats() {}
9391  #endif
9392  #if !USE_DEBUGGER
9393  int __kmp_omp_debug_struct_info = FALSE;
9394  int __kmp_debugging = FALSE;
9395  #endif
9396  #if !USE_ITT_BUILD || !USE_ITT_NOTIFY
9397  void __kmp_itt_fini_ittlib() {}
9398  void __kmp_itt_init_ittlib() {}
9399  #endif
9400  }
9401  
9402  // end of file
9403