1 /*
2 * kmp_runtime.cpp -- KPTS runtime support library
3 */
4
5 //===----------------------------------------------------------------------===//
6 //
7 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8 // See https://llvm.org/LICENSE.txt for license information.
9 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10 //
11 //===----------------------------------------------------------------------===//
12
13 #include "kmp.h"
14 #include "kmp_affinity.h"
15 #include "kmp_atomic.h"
16 #include "kmp_environment.h"
17 #include "kmp_error.h"
18 #include "kmp_i18n.h"
19 #include "kmp_io.h"
20 #include "kmp_itt.h"
21 #include "kmp_settings.h"
22 #include "kmp_stats.h"
23 #include "kmp_str.h"
24 #include "kmp_wait_release.h"
25 #include "kmp_wrapper_getpid.h"
26 #include "kmp_dispatch.h"
27 #include "kmp_utils.h"
28 #if KMP_USE_HIER_SCHED
29 #include "kmp_dispatch_hier.h"
30 #endif
31
32 #if OMPT_SUPPORT
33 #include "ompt-specific.h"
34 #endif
35 #if OMPD_SUPPORT
36 #include "ompd-specific.h"
37 #endif
38
39 #if OMP_PROFILING_SUPPORT
40 #include "llvm/Support/TimeProfiler.h"
41 static char *ProfileTraceFile = nullptr;
42 #endif
43
44 /* these are temporary issues to be dealt with */
45 #define KMP_USE_PRCTL 0
46
47 #if KMP_OS_WINDOWS
48 #include <process.h>
49 #endif
50
51 #ifndef KMP_USE_SHM
52 // Windows and WASI do not need these include files as they don't use shared
53 // memory.
54 #else
55 #include <sys/mman.h>
56 #include <sys/stat.h>
57 #include <fcntl.h>
58 #define SHM_SIZE 1024
59 #endif
60
61 #if defined(KMP_GOMP_COMPAT)
62 char const __kmp_version_alt_comp[] =
63 KMP_VERSION_PREFIX "alternative compiler support: yes";
64 #endif /* defined(KMP_GOMP_COMPAT) */
65
66 char const __kmp_version_omp_api[] =
67 KMP_VERSION_PREFIX "API version: 5.0 (201611)";
68
69 #ifdef KMP_DEBUG
70 char const __kmp_version_lock[] =
71 KMP_VERSION_PREFIX "lock type: run time selectable";
72 #endif /* KMP_DEBUG */
73
74 #define KMP_MIN(x, y) ((x) < (y) ? (x) : (y))
75
76 /* ------------------------------------------------------------------------ */
77
78 #if KMP_USE_MONITOR
79 kmp_info_t __kmp_monitor;
80 #endif
81
82 /* Forward declarations */
83
84 void __kmp_cleanup(void);
85
86 static void __kmp_initialize_info(kmp_info_t *, kmp_team_t *, int tid,
87 int gtid);
88 static void __kmp_initialize_team(kmp_team_t *team, int new_nproc,
89 kmp_internal_control_t *new_icvs,
90 ident_t *loc);
91 #if KMP_AFFINITY_SUPPORTED
92 static void __kmp_partition_places(kmp_team_t *team,
93 int update_master_only = 0);
94 #endif
95 static void __kmp_do_serial_initialize(void);
96 void __kmp_fork_barrier(int gtid, int tid);
97 void __kmp_join_barrier(int gtid);
98 void __kmp_setup_icv_copy(kmp_team_t *team, int new_nproc,
99 kmp_internal_control_t *new_icvs, ident_t *loc);
100
101 #ifdef USE_LOAD_BALANCE
102 static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc);
103 #endif
104
105 static int __kmp_expand_threads(int nNeed);
106 #if KMP_OS_WINDOWS
107 static int __kmp_unregister_root_other_thread(int gtid);
108 #endif
109 static void __kmp_reap_thread(kmp_info_t *thread, int is_root);
110 kmp_info_t *__kmp_thread_pool_insert_pt = NULL;
111
112 void __kmp_resize_dist_barrier(kmp_team_t *team, int old_nthreads,
113 int new_nthreads);
114 void __kmp_add_threads_to_team(kmp_team_t *team, int new_nthreads);
115
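// Build a merged nested-nthreads list for a thread that has th_set_nested_nth
// values: entries for levels 0..level are zeroed (those levels are already
// decided), and the remaining entries are copied from th_set_nested_nth
// (starting at index 1). The result is freshly allocated with
// KMP_INTERNAL_MALLOC; both size and used are set to the combined length.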
116 static kmp_nested_nthreads_t *__kmp_override_nested_nth(kmp_info_t *thr,
117 int level) {
118 kmp_nested_nthreads_t *new_nested_nth =
119 (kmp_nested_nthreads_t *)KMP_INTERNAL_MALLOC(
120 sizeof(kmp_nested_nthreads_t));
121 int new_size = level + thr->th.th_set_nested_nth_sz;
122 new_nested_nth->nth = (int *)KMP_INTERNAL_MALLOC(new_size * sizeof(int));
123 for (int i = 0; i < level + 1; ++i)
124 new_nested_nth->nth[i] = 0;
125 for (int i = level + 1, j = 1; i < new_size; ++i, ++j)
126 new_nested_nth->nth[i] = thr->th.th_set_nested_nth[j];
127 new_nested_nth->size = new_nested_nth->used = new_size;
128 return new_nested_nth;
129 }
130
131 /* Calculate the identifier of the current thread */
132 /* fast (and somewhat portable) way to get unique identifier of executing
133 thread. Returns KMP_GTID_DNE if we haven't been assigned a gtid. */
134 int __kmp_get_global_thread_id() {
135 int i;
136 kmp_info_t **other_threads;
137 size_t stack_data;
138 char *stack_addr;
139 size_t stack_size;
140 char *stack_base;
141
142 KA_TRACE(
143 1000,
144 ("*** __kmp_get_global_thread_id: entering, nproc=%d all_nproc=%d\n",
145 __kmp_nth, __kmp_all_nth));
146
147 /* JPH - To handle the case where __kmpc_end(0) is called immediately prior to
148 a parallel region, this returns KMP_GTID_DNE to force serial_initialize by the
149 caller. Callers must handle KMP_GTID_DNE at all call sites, or else guarantee
150 __kmp_init_gtid for this to work. */
151
152 if (!TCR_4(__kmp_init_gtid))
153 return KMP_GTID_DNE;
154
155 #ifdef KMP_TDATA_GTID
156 if (TCR_4(__kmp_gtid_mode) >= 3) {
157 KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using TDATA\n"));
158 return __kmp_gtid;
159 }
160 #endif
161 if (TCR_4(__kmp_gtid_mode) >= 2) {
162 KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using keyed TLS\n"));
163 return __kmp_gtid_get_specific();
164 }
165 KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using internal alg.\n"));
166
167 stack_addr = (char *)&stack_data;
168 other_threads = __kmp_threads;
169
170 /* ATT: The code below is a source of potential bugs due to unsynchronized
171 access to __kmp_threads array. For example:
172 1. Current thread loads other_threads[i] to thr and checks it, it is
173 non-NULL.
174 2. Current thread is suspended by OS.
175 3. Another thread unregisters and finishes (debug versions of free()
176 may fill memory with something like 0xEF).
177 4. Current thread is resumed.
178 5. Current thread reads junk from *thr.
179 TODO: Fix it. --ln */
180
181 for (i = 0; i < __kmp_threads_capacity; i++) {
182
183 kmp_info_t *thr = (kmp_info_t *)TCR_SYNC_PTR(other_threads[i]);
184 if (!thr)
185 continue;
186
187 stack_size = (size_t)TCR_PTR(thr->th.th_info.ds.ds_stacksize);
188 stack_base = (char *)TCR_PTR(thr->th.th_info.ds.ds_stackbase);
189
190 /* stack grows down -- search through all of the active threads */
191
192 if (stack_addr <= stack_base) {
193 size_t stack_diff = stack_base - stack_addr;
194
195 if (stack_diff <= stack_size) {
196 /* The only way we can be closer than the allocated */
197 /* stack size is if we are running on this thread. */
198 // __kmp_gtid_get_specific can return negative value because this
199 // function can be called by thread destructor. However, before the
200 // thread destructor is called, the value of the corresponding
201 // thread-specific data will be reset to NULL.
202 KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() < 0 ||
203 __kmp_gtid_get_specific() == i);
204 return i;
205 }
206 }
207 }
208
209 /* use __kmp_gtid_get_specific to try to determine our gtid */
210 KA_TRACE(1000,
211 ("*** __kmp_get_global_thread_id: internal alg. failed to find "
212 "thread, using TLS\n"));
213 i = __kmp_gtid_get_specific();
214
215 /*fprintf( stderr, "=== %d\n", i ); */ /* GROO */
216
217 /* if we haven't been assigned a gtid, then return the code */
218 if (i < 0)
219 return i;
220
221 // other_threads[i] can be nullptr at this point because the corresponding
222 // thread could already have been destroyed. This can happen when this function
223 // is called from the library shutdown routine.
224 if (!TCR_SYNC_PTR(other_threads[i]))
225 return i;
226
227 /* dynamically updated stack window for uber threads to avoid get_specific
228 call */
229 if (!TCR_4(other_threads[i]->th.th_info.ds.ds_stackgrow)) {
230 KMP_FATAL(StackOverflow, i);
231 }
232
233 stack_base = (char *)other_threads[i]->th.th_info.ds.ds_stackbase;
234 if (stack_addr > stack_base) {
235 TCW_PTR(other_threads[i]->th.th_info.ds.ds_stackbase, stack_addr);
236 TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize,
237 other_threads[i]->th.th_info.ds.ds_stacksize + stack_addr -
238 stack_base);
239 } else {
240 TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize,
241 stack_base - stack_addr);
242 }
243
244 /* Reprint stack bounds for ubermaster since they have been refined */
245 if (__kmp_storage_map) {
246 char *stack_end = (char *)other_threads[i]->th.th_info.ds.ds_stackbase;
247 char *stack_beg = stack_end - other_threads[i]->th.th_info.ds.ds_stacksize;
248 __kmp_print_storage_map_gtid(i, stack_beg, stack_end,
249 other_threads[i]->th.th_info.ds.ds_stacksize,
250 "th_%d stack (refinement)", i);
251 }
252 return i;
253 }
254
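/* Same as __kmp_get_global_thread_id(), but never returns KMP_GTID_DNE: if no
   gtid has been assigned yet, perform serial initialization if needed and/or
   register the calling thread as a new root so that a valid gtid can be
   returned. */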
255 int __kmp_get_global_thread_id_reg() {
256 int gtid;
257
258 if (!__kmp_init_serial) {
259 gtid = KMP_GTID_DNE;
260 } else
261 #ifdef KMP_TDATA_GTID
262 if (TCR_4(__kmp_gtid_mode) >= 3) {
263 KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using TDATA\n"));
264 gtid = __kmp_gtid;
265 } else
266 #endif
267 if (TCR_4(__kmp_gtid_mode) >= 2) {
268 KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using keyed TLS\n"));
269 gtid = __kmp_gtid_get_specific();
270 } else {
271 KA_TRACE(1000,
272 ("*** __kmp_get_global_thread_id_reg: using internal alg.\n"));
273 gtid = __kmp_get_global_thread_id();
274 }
275
276 /* we must be a new uber master sibling thread */
277 if (gtid == KMP_GTID_DNE) {
278 KA_TRACE(10,
279 ("__kmp_get_global_thread_id_reg: Encountered new root thread. "
280 "Registering a new gtid.\n"));
281 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
282 if (!__kmp_init_serial) {
283 __kmp_do_serial_initialize();
284 gtid = __kmp_gtid_get_specific();
285 } else {
286 gtid = __kmp_register_root(FALSE);
287 }
288 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
289 /*__kmp_printf( "+++ %d\n", gtid ); */ /* GROO */
290 }
291
292 KMP_DEBUG_ASSERT(gtid >= 0);
293
294 return gtid;
295 }
296
297 /* caller must hold forkjoin_lock */
298 void __kmp_check_stack_overlap(kmp_info_t *th) {
299 int f;
300 char *stack_beg = NULL;
301 char *stack_end = NULL;
302 int gtid;
303
304 KA_TRACE(10, ("__kmp_check_stack_overlap: called\n"));
305 if (__kmp_storage_map) {
306 stack_end = (char *)th->th.th_info.ds.ds_stackbase;
307 stack_beg = stack_end - th->th.th_info.ds.ds_stacksize;
308
309 gtid = __kmp_gtid_from_thread(th);
310
311 if (gtid == KMP_GTID_MONITOR) {
312 __kmp_print_storage_map_gtid(
313 gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize,
314 "th_%s stack (%s)", "mon",
315 (th->th.th_info.ds.ds_stackgrow) ? "initial" : "actual");
316 } else {
317 __kmp_print_storage_map_gtid(
318 gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize,
319 "th_%d stack (%s)", gtid,
320 (th->th.th_info.ds.ds_stackgrow) ? "initial" : "actual");
321 }
322 }
323
324 /* No point in checking ubermaster threads since they use refinement and
325 * cannot overlap */
326 gtid = __kmp_gtid_from_thread(th);
327 if (__kmp_env_checks == TRUE && !KMP_UBER_GTID(gtid)) {
328 KA_TRACE(10,
329 ("__kmp_check_stack_overlap: performing extensive checking\n"));
330 if (stack_beg == NULL) {
331 stack_end = (char *)th->th.th_info.ds.ds_stackbase;
332 stack_beg = stack_end - th->th.th_info.ds.ds_stacksize;
333 }
334
335 for (f = 0; f < __kmp_threads_capacity; f++) {
336 kmp_info_t *f_th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[f]);
337
338 if (f_th && f_th != th) {
339 char *other_stack_end =
340 (char *)TCR_PTR(f_th->th.th_info.ds.ds_stackbase);
341 char *other_stack_beg =
342 other_stack_end - (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize);
343 if ((stack_beg > other_stack_beg && stack_beg < other_stack_end) ||
344 (stack_end > other_stack_beg && stack_end < other_stack_end)) {
345
346 /* Print the other stack values before the abort */
347 if (__kmp_storage_map)
348 __kmp_print_storage_map_gtid(
349 -1, other_stack_beg, other_stack_end,
350 (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize),
351 "th_%d stack (overlapped)", __kmp_gtid_from_thread(f_th));
352
353 __kmp_fatal(KMP_MSG(StackOverlap), KMP_HNT(ChangeStackLimit),
354 __kmp_msg_null);
355 }
356 }
357 }
358 }
359 KA_TRACE(10, ("__kmp_check_stack_overlap: returning\n"));
360 }
361
362 /* ------------------------------------------------------------------------ */
363
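// Park the calling thread in a yield loop; used on abnormal-termination paths
// (see __kmp_abort_process / __kmp_abort_thread).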
364 void __kmp_infinite_loop(void) {
365 static int done = FALSE;
366
367 while (!done) {
368 KMP_YIELD(TRUE);
369 }
370 }
371
372 #define MAX_MESSAGE 512
373
374 void __kmp_print_storage_map_gtid(int gtid, void *p1, void *p2, size_t size,
375 char const *format, ...) {
376 char buffer[MAX_MESSAGE];
377 va_list ap;
378
379 va_start(ap, format);
380 KMP_SNPRINTF(buffer, sizeof(buffer), "OMP storage map: %p %p%8lu %s\n", p1,
381 p2, (unsigned long)size, format);
382 __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock);
383 __kmp_vprintf(kmp_err, buffer, ap);
384 #if KMP_PRINT_DATA_PLACEMENT
385 int node;
386 if (gtid >= 0) {
387 if (p1 <= p2 && (char *)p2 - (char *)p1 == size) {
388 if (__kmp_storage_map_verbose) {
389 node = __kmp_get_host_node(p1);
390 if (node < 0) /* doesn't work, so don't try this next time */
391 __kmp_storage_map_verbose = FALSE;
392 else {
393 char *last;
394 int lastNode;
395 int localProc = __kmp_get_cpu_from_gtid(gtid);
396
397 const int page_size = KMP_GET_PAGE_SIZE();
398
399 p1 = (void *)((size_t)p1 & ~((size_t)page_size - 1));
400 p2 = (void *)(((size_t)p2 - 1) & ~((size_t)page_size - 1));
401 if (localProc >= 0)
402 __kmp_printf_no_lock(" GTID %d localNode %d\n", gtid,
403 localProc >> 1);
404 else
405 __kmp_printf_no_lock(" GTID %d\n", gtid);
406 #if KMP_USE_PRCTL
407 /* The more elaborate format is disabled for now because of the prctl
408 * hanging bug. */
409 do {
410 last = p1;
411 lastNode = node;
412 /* This loop collates adjacent pages with the same host node. */
413 do {
414 (char *)p1 += page_size;
415 } while (p1 <= p2 && (node = __kmp_get_host_node(p1)) == lastNode);
416 __kmp_printf_no_lock(" %p-%p memNode %d\n", last, (char *)p1 - 1,
417 lastNode);
418 } while (p1 <= p2);
419 #else
420 __kmp_printf_no_lock(" %p-%p memNode %d\n", p1,
421 (char *)p1 + (page_size - 1),
422 __kmp_get_host_node(p1));
423 if (p1 < p2) {
424 __kmp_printf_no_lock(" %p-%p memNode %d\n", p2,
425 (char *)p2 + (page_size - 1),
426 __kmp_get_host_node(p2));
427 }
428 #endif
429 }
430 }
431 } else
432 __kmp_printf_no_lock(" %s\n", KMP_I18N_STR(StorageMapWarning));
433 }
434 #endif /* KMP_PRINT_DATA_PLACEMENT */
435 __kmp_release_bootstrap_lock(&__kmp_stdio_lock);
436
437 va_end(ap);
438 }
439
440 void __kmp_warn(char const *format, ...) {
441 char buffer[MAX_MESSAGE];
442 va_list ap;
443
444 if (__kmp_generate_warnings == kmp_warnings_off) {
445 return;
446 }
447
448 va_start(ap, format);
449
450 KMP_SNPRINTF(buffer, sizeof(buffer), "OMP warning: %s\n", format);
451 __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock);
452 __kmp_vprintf(kmp_err, buffer, ap);
453 __kmp_release_bootstrap_lock(&__kmp_stdio_lock);
454
455 va_end(ap);
456 }
457
458 void __kmp_abort_process() {
459 // Later threads may stall here, but that's ok because abort() will kill them.
460 __kmp_acquire_bootstrap_lock(&__kmp_exit_lock);
461
462 if (__kmp_debug_buf) {
463 __kmp_dump_debug_buffer();
464 }
465
466 #if KMP_OS_WINDOWS
467 // Let other threads know of abnormal termination and prevent deadlock
468 // if abort happened during library initialization or shutdown
469 __kmp_global.g.g_abort = SIGABRT;
470
471 /* On Windows* OS by default abort() causes pop-up error box, which stalls
472 nightly testing. Unfortunately, we cannot reliably suppress pop-up error
473 boxes. _set_abort_behavior() works well, but this function is not
474 available in VS7 (this is not a problem for the DLL, but it is a problem
475 for the static OpenMP RTL). SetErrorMode (and so the timelimit utility) does not
476 help, at least in some versions of MS C RTL.
477
478 It seems the following sequence is the only way to simulate abort() and
479 avoid pop-up error box. */
480 raise(SIGABRT);
481 _exit(3); // Just in case, if signal ignored, exit anyway.
482 #else
483 __kmp_unregister_library();
484 abort();
485 #endif
486
487 __kmp_infinite_loop();
488 __kmp_release_bootstrap_lock(&__kmp_exit_lock);
489
490 } // __kmp_abort_process
491
492 void __kmp_abort_thread(void) {
493 // TODO: Eliminate g_abort global variable and this function.
494 // In case of abort just call abort(), it will kill all the threads.
495 __kmp_infinite_loop();
496 } // __kmp_abort_thread
497
498 /* Print out the storage map for the major kmp_info_t thread data structures
499 that are allocated together. */
500
501 static void __kmp_print_thread_storage_map(kmp_info_t *thr, int gtid) {
502 __kmp_print_storage_map_gtid(gtid, thr, thr + 1, sizeof(kmp_info_t), "th_%d",
503 gtid);
504
505 __kmp_print_storage_map_gtid(gtid, &thr->th.th_info, &thr->th.th_team,
506 sizeof(kmp_desc_t), "th_%d.th_info", gtid);
507
508 __kmp_print_storage_map_gtid(gtid, &thr->th.th_local, &thr->th.th_pri_head,
509 sizeof(kmp_local_t), "th_%d.th_local", gtid);
510
511 __kmp_print_storage_map_gtid(
512 gtid, &thr->th.th_bar[0], &thr->th.th_bar[bs_last_barrier],
513 sizeof(kmp_balign_t) * bs_last_barrier, "th_%d.th_bar", gtid);
514
515 __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_plain_barrier],
516 &thr->th.th_bar[bs_plain_barrier + 1],
517 sizeof(kmp_balign_t), "th_%d.th_bar[plain]",
518 gtid);
519
520 __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_forkjoin_barrier],
521 &thr->th.th_bar[bs_forkjoin_barrier + 1],
522 sizeof(kmp_balign_t), "th_%d.th_bar[forkjoin]",
523 gtid);
524
525 #if KMP_FAST_REDUCTION_BARRIER
526 __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_reduction_barrier],
527 &thr->th.th_bar[bs_reduction_barrier + 1],
528 sizeof(kmp_balign_t), "th_%d.th_bar[reduction]",
529 gtid);
530 #endif // KMP_FAST_REDUCTION_BARRIER
531 }
532
533 /* Print out the storage map for the major kmp_team_t team data structures
534 that are allocated together. */
535
536 static void __kmp_print_team_storage_map(const char *header, kmp_team_t *team,
537 int team_id, int num_thr) {
538 int num_disp_buff = team->t.t_max_nproc > 1 ? __kmp_dispatch_num_buffers : 2;
539 __kmp_print_storage_map_gtid(-1, team, team + 1, sizeof(kmp_team_t), "%s_%d",
540 header, team_id);
541
542 __kmp_print_storage_map_gtid(-1, &team->t.t_bar[0],
543 &team->t.t_bar[bs_last_barrier],
544 sizeof(kmp_balign_team_t) * bs_last_barrier,
545 "%s_%d.t_bar", header, team_id);
546
547 __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_plain_barrier],
548 &team->t.t_bar[bs_plain_barrier + 1],
549 sizeof(kmp_balign_team_t), "%s_%d.t_bar[plain]",
550 header, team_id);
551
552 __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_forkjoin_barrier],
553 &team->t.t_bar[bs_forkjoin_barrier + 1],
554 sizeof(kmp_balign_team_t),
555 "%s_%d.t_bar[forkjoin]", header, team_id);
556
557 #if KMP_FAST_REDUCTION_BARRIER
558 __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_reduction_barrier],
559 &team->t.t_bar[bs_reduction_barrier + 1],
560 sizeof(kmp_balign_team_t),
561 "%s_%d.t_bar[reduction]", header, team_id);
562 #endif // KMP_FAST_REDUCTION_BARRIER
563
564 __kmp_print_storage_map_gtid(
565 -1, &team->t.t_dispatch[0], &team->t.t_dispatch[num_thr],
566 sizeof(kmp_disp_t) * num_thr, "%s_%d.t_dispatch", header, team_id);
567
568 __kmp_print_storage_map_gtid(
569 -1, &team->t.t_threads[0], &team->t.t_threads[num_thr],
570 sizeof(kmp_info_t *) * num_thr, "%s_%d.t_threads", header, team_id);
571
572 __kmp_print_storage_map_gtid(-1, &team->t.t_disp_buffer[0],
573 &team->t.t_disp_buffer[num_disp_buff],
574 sizeof(dispatch_shared_info_t) * num_disp_buff,
575 "%s_%d.t_disp_buffer", header, team_id);
576 }
577
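// Initialize the allocator layers used by the runtime (memkind and OpenMP
// target memory support).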
578 static void __kmp_init_allocator() {
579 __kmp_init_memkind();
580 __kmp_init_target_mem();
581 }
582 static void __kmp_fini_allocator() { __kmp_fini_memkind(); }
583
584 /* ------------------------------------------------------------------------ */
585
586 #if ENABLE_LIBOMPTARGET
587 static void __kmp_init_omptarget() {
588 __kmp_init_target_task();
589 }
590 #endif
591
592 /* ------------------------------------------------------------------------ */
593
594 #if KMP_DYNAMIC_LIB
595 #if KMP_OS_WINDOWS
596
597 BOOL WINAPI DllMain(HINSTANCE hInstDLL, DWORD fdwReason, LPVOID lpReserved) {
598 //__kmp_acquire_bootstrap_lock( &__kmp_initz_lock );
599
600 switch (fdwReason) {
601
602 case DLL_PROCESS_ATTACH:
603 KA_TRACE(10, ("DllMain: PROCESS_ATTACH\n"));
604
605 return TRUE;
606
607 case DLL_PROCESS_DETACH:
608 KA_TRACE(10, ("DllMain: PROCESS_DETACH T#%d\n", __kmp_gtid_get_specific()));
609
610 // According to Windows* documentation for DllMain entry point:
611 // for DLL_PROCESS_DETACH, lpReserved is used for telling the difference:
612 // lpReserved == NULL when FreeLibrary() is called,
613 // lpReserved != NULL when the process is terminated.
614 // When FreeLibrary() is called, worker threads remain alive. So the
615 // runtime's state is consistent and executing proper shutdown is OK.
616 // When the process is terminated, worker threads have exited or been
617 // forcefully terminated by the OS and only the shutdown thread remains.
618 // This can leave the runtime in an inconsistent state.
619 // Hence, only attempt proper cleanup when FreeLibrary() is called.
620 // Otherwise, rely on OS to reclaim resources.
621 if (lpReserved == NULL)
622 __kmp_internal_end_library(__kmp_gtid_get_specific());
623
624 return TRUE;
625
626 case DLL_THREAD_ATTACH:
627 KA_TRACE(10, ("DllMain: THREAD_ATTACH\n"));
628
629 /* if we want to register new siblings all the time, call
630 * __kmp_get_gtid() here */
631 return TRUE;
632
633 case DLL_THREAD_DETACH:
634 KA_TRACE(10, ("DllMain: THREAD_DETACH T#%d\n", __kmp_gtid_get_specific()));
635
636 __kmp_internal_end_thread(__kmp_gtid_get_specific());
637 return TRUE;
638 }
639
640 return TRUE;
641 }
642
643 #endif /* KMP_OS_WINDOWS */
644 #endif /* KMP_DYNAMIC_LIB */
645
646 /* __kmp_parallel_deo -- Wait until it's our turn. */
647 void __kmp_parallel_deo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
648 int gtid = *gtid_ref;
649 #ifdef BUILD_PARALLEL_ORDERED
650 kmp_team_t *team = __kmp_team_from_gtid(gtid);
651 #endif /* BUILD_PARALLEL_ORDERED */
652
653 if (__kmp_env_consistency_check) {
654 if (__kmp_threads[gtid]->th.th_root->r.r_active)
655 #if KMP_USE_DYNAMIC_LOCK
656 __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL, 0);
657 #else
658 __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL);
659 #endif
660 }
661 #ifdef BUILD_PARALLEL_ORDERED
662 if (!team->t.t_serialized) {
663 KMP_MB();
664 KMP_WAIT(&team->t.t_ordered.dt.t_value, __kmp_tid_from_gtid(gtid), KMP_EQ,
665 NULL);
666 KMP_MB();
667 }
668 #endif /* BUILD_PARALLEL_ORDERED */
669 }
670
671 /* __kmp_parallel_dxo -- Signal the next task. */
672 void __kmp_parallel_dxo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
673 int gtid = *gtid_ref;
674 #ifdef BUILD_PARALLEL_ORDERED
675 int tid = __kmp_tid_from_gtid(gtid);
676 kmp_team_t *team = __kmp_team_from_gtid(gtid);
677 #endif /* BUILD_PARALLEL_ORDERED */
678
679 if (__kmp_env_consistency_check) {
680 if (__kmp_threads[gtid]->th.th_root->r.r_active)
681 __kmp_pop_sync(gtid, ct_ordered_in_parallel, loc_ref);
682 }
683 #ifdef BUILD_PARALLEL_ORDERED
684 if (!team->t.t_serialized) {
685 KMP_MB(); /* Flush all pending memory write invalidates. */
686
687 /* use the tid of the next thread in this team */
688 /* TODO replace with general release procedure */
689 team->t.t_ordered.dt.t_value = ((tid + 1) % team->t.t_nproc);
690
691 KMP_MB(); /* Flush all pending memory write invalidates. */
692 }
693 #endif /* BUILD_PARALLEL_ORDERED */
694 }
695
696 /* ------------------------------------------------------------------------ */
697 /* The BARRIER for a SINGLE process section is always explicit */
698
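/* Returns nonzero if the calling thread should execute the SINGLE block:
   always true in a serialized team, otherwise decided by an atomic
   compare-and-store on the team's t_construct counter. */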
699 int __kmp_enter_single(int gtid, ident_t *id_ref, int push_ws) {
700 int status;
701 kmp_info_t *th;
702 kmp_team_t *team;
703
704 if (!TCR_4(__kmp_init_parallel))
705 __kmp_parallel_initialize();
706 __kmp_resume_if_soft_paused();
707
708 th = __kmp_threads[gtid];
709 team = th->th.th_team;
710 status = 0;
711
712 th->th.th_ident = id_ref;
713
714 if (team->t.t_serialized) {
715 status = 1;
716 } else {
717 kmp_int32 old_this = th->th.th_local.this_construct;
718
719 ++th->th.th_local.this_construct;
720 /* try to set team count to thread count--success means thread got the
721 single block */
722 /* TODO: Should this be acquire or release? */
723 if (team->t.t_construct == old_this) {
724 status = __kmp_atomic_compare_store_acq(&team->t.t_construct, old_this,
725 th->th.th_local.this_construct);
726 }
727 #if USE_ITT_BUILD
728 if (__itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 &&
729 KMP_MASTER_GTID(gtid) && th->th.th_teams_microtask == NULL &&
730 team->t.t_active_level == 1) {
731 // Only report metadata by primary thread of active team at level 1
732 __kmp_itt_metadata_single(id_ref);
733 }
734 #endif /* USE_ITT_BUILD */
735 }
736
737 if (__kmp_env_consistency_check) {
738 if (status && push_ws) {
739 __kmp_push_workshare(gtid, ct_psingle, id_ref);
740 } else {
741 __kmp_check_workshare(gtid, ct_psingle, id_ref);
742 }
743 }
744 #if USE_ITT_BUILD
745 if (status) {
746 __kmp_itt_single_start(gtid);
747 }
748 #endif /* USE_ITT_BUILD */
749 return status;
750 }
751
752 void __kmp_exit_single(int gtid) {
753 #if USE_ITT_BUILD
754 __kmp_itt_single_end(gtid);
755 #endif /* USE_ITT_BUILD */
756 if (__kmp_env_consistency_check)
757 __kmp_pop_workshare(gtid, ct_psingle, NULL);
758 }
759
760 /* determine if we can go parallel or must use a serialized parallel region and
761 * how many threads we can use
762 * set_nthreads is the number of threads requested for the team
763 * returns 1 if we should serialize or use only one thread,
764 * otherwise the number of threads to use
765 * The forkjoin lock is held by the caller. */
766 static int __kmp_reserve_threads(kmp_root_t *root, kmp_team_t *parent_team,
767 int master_tid, int set_nthreads,
768 int enter_teams) {
769 int capacity;
770 int new_nthreads;
771 KMP_DEBUG_ASSERT(__kmp_init_serial);
772 KMP_DEBUG_ASSERT(root && parent_team);
773 kmp_info_t *this_thr = parent_team->t.t_threads[master_tid];
774
775 // If dyn-var is set, dynamically adjust the number of desired threads,
776 // according to the method specified by dynamic_mode.
777 new_nthreads = set_nthreads;
778 if (!get__dynamic_2(parent_team, master_tid)) {
779 ;
780 }
781 #ifdef USE_LOAD_BALANCE
782 else if (__kmp_global.g.g_dynamic_mode == dynamic_load_balance) {
783 new_nthreads = __kmp_load_balance_nproc(root, set_nthreads);
784 if (new_nthreads == 1) {
785 KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced "
786 "reservation to 1 thread\n",
787 master_tid));
788 return 1;
789 }
790 if (new_nthreads < set_nthreads) {
791 KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced "
792 "reservation to %d threads\n",
793 master_tid, new_nthreads));
794 }
795 }
796 #endif /* USE_LOAD_BALANCE */
797 else if (__kmp_global.g.g_dynamic_mode == dynamic_thread_limit) {
798 new_nthreads = __kmp_avail_proc - __kmp_nth +
799 (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
800 if (new_nthreads <= 1) {
801 KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced "
802 "reservation to 1 thread\n",
803 master_tid));
804 return 1;
805 }
806 if (new_nthreads < set_nthreads) {
807 KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced "
808 "reservation to %d threads\n",
809 master_tid, new_nthreads));
810 } else {
811 new_nthreads = set_nthreads;
812 }
813 } else if (__kmp_global.g.g_dynamic_mode == dynamic_random) {
814 if (set_nthreads > 2) {
815 new_nthreads = __kmp_get_random(parent_team->t.t_threads[master_tid]);
816 new_nthreads = (new_nthreads % set_nthreads) + 1;
817 if (new_nthreads == 1) {
818 KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced "
819 "reservation to 1 thread\n",
820 master_tid));
821 return 1;
822 }
823 if (new_nthreads < set_nthreads) {
824 KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced "
825 "reservation to %d threads\n",
826 master_tid, new_nthreads));
827 }
828 }
829 } else {
830 KMP_ASSERT(0);
831 }
832
833 // Respect KMP_ALL_THREADS/KMP_DEVICE_THREAD_LIMIT.
834 if (__kmp_nth + new_nthreads -
835 (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
836 __kmp_max_nth) {
837 int tl_nthreads = __kmp_max_nth - __kmp_nth +
838 (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
839 if (tl_nthreads <= 0) {
840 tl_nthreads = 1;
841 }
842
843 // If dyn-var is false, emit a 1-time warning.
844 if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
845 __kmp_reserve_warn = 1;
846 __kmp_msg(kmp_ms_warning,
847 KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads),
848 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
849 }
850 if (tl_nthreads == 1) {
851 KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT "
852 "reduced reservation to 1 thread\n",
853 master_tid));
854 return 1;
855 }
856 KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT reduced "
857 "reservation to %d threads\n",
858 master_tid, tl_nthreads));
859 new_nthreads = tl_nthreads;
860 }
861
862 // Respect OMP_THREAD_LIMIT
863 int cg_nthreads = this_thr->th.th_cg_roots->cg_nthreads;
864 int max_cg_threads = this_thr->th.th_cg_roots->cg_thread_limit;
865 if (cg_nthreads + new_nthreads -
866 (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
867 max_cg_threads) {
868 int tl_nthreads = max_cg_threads - cg_nthreads +
869 (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
870 if (tl_nthreads <= 0) {
871 tl_nthreads = 1;
872 }
873
874 // If dyn-var is false, emit a 1-time warning.
875 if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
876 __kmp_reserve_warn = 1;
877 __kmp_msg(kmp_ms_warning,
878 KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads),
879 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
880 }
881 if (tl_nthreads == 1) {
882 KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT "
883 "reduced reservation to 1 thread\n",
884 master_tid));
885 return 1;
886 }
887 KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT reduced "
888 "reservation to %d threads\n",
889 master_tid, tl_nthreads));
890 new_nthreads = tl_nthreads;
891 }
892
893 // Check if the threads array is large enough, or needs expanding.
894 // See comment in __kmp_register_root() about the adjustment if
895 // __kmp_threads[0] == NULL.
896 capacity = __kmp_threads_capacity;
897 if (TCR_PTR(__kmp_threads[0]) == NULL) {
898 --capacity;
899 }
900 // If it is not for initializing the hidden helper team, we need to take
901 // __kmp_hidden_helper_threads_num out of the capacity because it is included
902 // in __kmp_threads_capacity.
903 if (__kmp_enable_hidden_helper && !TCR_4(__kmp_init_hidden_helper_threads)) {
904 capacity -= __kmp_hidden_helper_threads_num;
905 }
906 if (__kmp_nth + new_nthreads -
907 (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
908 capacity) {
909 // Expand the threads array.
910 int slotsRequired = __kmp_nth + new_nthreads -
911 (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) -
912 capacity;
913 int slotsAdded = __kmp_expand_threads(slotsRequired);
914 if (slotsAdded < slotsRequired) {
915 // The threads array was not expanded enough.
916 new_nthreads -= (slotsRequired - slotsAdded);
917 KMP_ASSERT(new_nthreads >= 1);
918
919 // If dyn-var is false, emit a 1-time warning.
920 if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
921 __kmp_reserve_warn = 1;
922 if (__kmp_tp_cached) {
923 __kmp_msg(kmp_ms_warning,
924 KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads),
925 KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity),
926 KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null);
927 } else {
928 __kmp_msg(kmp_ms_warning,
929 KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads),
930 KMP_HNT(SystemLimitOnThreads), __kmp_msg_null);
931 }
932 }
933 }
934 }
935
936 #ifdef KMP_DEBUG
937 if (new_nthreads == 1) {
938 KC_TRACE(10,
939 ("__kmp_reserve_threads: T#%d serializing team after reclaiming "
940 "dead roots and rechecking; requested %d threads\n",
941 __kmp_get_gtid(), set_nthreads));
942 } else {
943 KC_TRACE(10, ("__kmp_reserve_threads: T#%d allocating %d threads; requested"
944 " %d threads\n",
945 __kmp_get_gtid(), new_nthreads, set_nthreads));
946 }
947 #endif // KMP_DEBUG
948
949 if (this_thr->th.th_nt_strict && new_nthreads < set_nthreads) {
950 __kmpc_error(this_thr->th.th_nt_loc, this_thr->th.th_nt_sev,
951 this_thr->th.th_nt_msg);
952 }
953 return new_nthreads;
954 }
955
956 /* Allocate threads from the thread pool and assign them to the new team. We are
957 assured that there are enough threads available, because we checked on that
958 earlier within the forkjoin critical section. */
959 static void __kmp_fork_team_threads(kmp_root_t *root, kmp_team_t *team,
960 kmp_info_t *master_th, int master_gtid,
961 int fork_teams_workers) {
962 int i;
963 int use_hot_team;
964
965 KA_TRACE(10, ("__kmp_fork_team_threads: new_nprocs = %d\n", team->t.t_nproc));
966 KMP_DEBUG_ASSERT(master_gtid == __kmp_get_gtid());
967 KMP_MB();
968
969 /* first, let's setup the primary thread */
970 master_th->th.th_info.ds.ds_tid = 0;
971 master_th->th.th_team = team;
972 master_th->th.th_team_nproc = team->t.t_nproc;
973 master_th->th.th_team_master = master_th;
974 master_th->th.th_team_serialized = FALSE;
975 master_th->th.th_dispatch = &team->t.t_dispatch[0];
976
977 /* make sure we are not the optimized hot team */
978 #if KMP_NESTED_HOT_TEAMS
979 use_hot_team = 0;
980 kmp_hot_team_ptr_t *hot_teams = master_th->th.th_hot_teams;
981 if (hot_teams) { // hot teams array is not allocated if
982 // KMP_HOT_TEAMS_MAX_LEVEL=0
983 int level = team->t.t_active_level - 1; // index in array of hot teams
984 if (master_th->th.th_teams_microtask) { // are we inside the teams?
985 if (master_th->th.th_teams_size.nteams > 1) {
986 ++level; // level was not increased in teams construct for
987 // team_of_masters
988 }
989 if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
990 master_th->th.th_teams_level == team->t.t_level) {
991 ++level; // level was not increased in teams construct for
992 // team_of_workers before the parallel
993 } // team->t.t_level will be increased inside parallel
994 }
995 if (level < __kmp_hot_teams_max_level) {
996 if (hot_teams[level].hot_team) {
997 // hot team has already been allocated for given level
998 KMP_DEBUG_ASSERT(hot_teams[level].hot_team == team);
999 use_hot_team = 1; // the team is ready to use
1000 } else {
1001 use_hot_team = 0; // AC: threads are not allocated yet
1002 hot_teams[level].hot_team = team; // remember new hot team
1003 hot_teams[level].hot_team_nth = team->t.t_nproc;
1004 }
1005 } else {
1006 use_hot_team = 0;
1007 }
1008 }
1009 #else
1010 use_hot_team = team == root->r.r_hot_team;
1011 #endif
1012 if (!use_hot_team) {
1013
1014 /* install the primary thread */
1015 team->t.t_threads[0] = master_th;
1016 __kmp_initialize_info(master_th, team, 0, master_gtid);
1017
1018 /* now, install the worker threads */
1019 for (i = 1; i < team->t.t_nproc; i++) {
1020
1021 /* fork or reallocate a new thread and install it in team */
1022 kmp_info_t *thr = __kmp_allocate_thread(root, team, i);
1023 team->t.t_threads[i] = thr;
1024 KMP_DEBUG_ASSERT(thr);
1025 KMP_DEBUG_ASSERT(thr->th.th_team == team);
1026 /* align team and thread arrived states */
1027 KA_TRACE(20, ("__kmp_fork_team_threads: T#%d(%d:%d) init arrived "
1028 "T#%d(%d:%d) join =%llu, plain=%llu\n",
1029 __kmp_gtid_from_tid(0, team), team->t.t_id, 0,
1030 __kmp_gtid_from_tid(i, team), team->t.t_id, i,
1031 team->t.t_bar[bs_forkjoin_barrier].b_arrived,
1032 team->t.t_bar[bs_plain_barrier].b_arrived));
1033 thr->th.th_teams_microtask = master_th->th.th_teams_microtask;
1034 thr->th.th_teams_level = master_th->th.th_teams_level;
1035 thr->th.th_teams_size = master_th->th.th_teams_size;
1036 { // Initialize threads' barrier data.
1037 int b;
1038 kmp_balign_t *balign = team->t.t_threads[i]->th.th_bar;
1039 for (b = 0; b < bs_last_barrier; ++b) {
1040 balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
1041 KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
1042 #if USE_DEBUGGER
1043 balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
1044 #endif
1045 }
1046 }
1047 }
1048
1049 #if KMP_AFFINITY_SUPPORTED
1050 // Do not partition the places list for teams construct workers who
1051 // haven't actually been forked to do real work yet. This partitioning
1052 // will take place in the parallel region nested within the teams construct.
1053 if (!fork_teams_workers) {
1054 __kmp_partition_places(team);
1055 }
1056 #endif
1057
1058 if (team->t.t_nproc > 1 &&
1059 __kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
1060 team->t.b->update_num_threads(team->t.t_nproc);
1061 __kmp_add_threads_to_team(team, team->t.t_nproc);
1062 }
1063 }
1064
1065 // Take care of primary thread's task state
1066 if (__kmp_tasking_mode != tskm_immediate_exec) {
1067 if (use_hot_team) {
1068 KMP_DEBUG_ASSERT_TASKTEAM_INVARIANT(team->t.t_parent, master_th);
1069 KA_TRACE(
1070 20,
1071 ("__kmp_fork_team_threads: Primary T#%d pushing task_team %p / team "
1072 "%p, new task_team %p / team %p\n",
1073 __kmp_gtid_from_thread(master_th), master_th->th.th_task_team,
1074 team->t.t_parent, team->t.t_task_team[master_th->th.th_task_state],
1075 team));
1076
1077 // Store primary thread's current task state on new team
1078 KMP_CHECK_UPDATE(team->t.t_primary_task_state,
1079 master_th->th.th_task_state);
1080
1081 // Restore primary thread's task state to hot team's state
1082 // by using thread 1's task state
1083 if (team->t.t_nproc > 1) {
1084 KMP_DEBUG_ASSERT(team->t.t_threads[1]->th.th_task_state == 0 ||
1085 team->t.t_threads[1]->th.th_task_state == 1);
1086 KMP_CHECK_UPDATE(master_th->th.th_task_state,
1087 team->t.t_threads[1]->th.th_task_state);
1088 } else {
1089 master_th->th.th_task_state = 0;
1090 }
1091 } else {
1092 // Store primary thread's current task_state on new team
1093 KMP_CHECK_UPDATE(team->t.t_primary_task_state,
1094 master_th->th.th_task_state);
1095 // Are not using hot team, so set task state to 0.
1096 master_th->th.th_task_state = 0;
1097 }
1098 }
1099
1100 if (__kmp_display_affinity && team->t.t_display_affinity != 1) {
1101 for (i = 0; i < team->t.t_nproc; i++) {
1102 kmp_info_t *thr = team->t.t_threads[i];
1103 if (thr->th.th_prev_num_threads != team->t.t_nproc ||
1104 thr->th.th_prev_level != team->t.t_level) {
1105 team->t.t_display_affinity = 1;
1106 break;
1107 }
1108 }
1109 }
1110
1111 KMP_MB();
1112 }
1113
1114 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
1115 // Propagate any changes to the floating point control registers out to the team.
1116 // We try to avoid unnecessary writes to the relevant cache line in the team
1117 // structure, so we don't make changes unless they are needed.
1118 inline static void propagateFPControl(kmp_team_t *team) {
1119 if (__kmp_inherit_fp_control) {
1120 kmp_int16 x87_fpu_control_word;
1121 kmp_uint32 mxcsr;
1122
1123 // Get primary thread's values of FPU control flags (both X87 and vector)
1124 __kmp_store_x87_fpu_control_word(&x87_fpu_control_word);
1125 __kmp_store_mxcsr(&mxcsr);
1126 mxcsr &= KMP_X86_MXCSR_MASK;
1127
1128 // There is no point looking at t_fp_control_saved here.
1129 // If it is TRUE, we still have to update the values if they are different
1130 // from those we now have. If it is FALSE we didn't save anything yet, but
1131 // our objective is the same. We have to ensure that the values in the team
1132 // are the same as those we have.
1133 // So, this code achieves what we need whether or not t_fp_control_saved is
1134 // true. By checking whether the value needs updating we avoid unnecessary
1135 // writes that would put the cache-line into a written state, causing all
1136 // threads in the team to have to read it again.
1137 KMP_CHECK_UPDATE(team->t.t_x87_fpu_control_word, x87_fpu_control_word);
1138 KMP_CHECK_UPDATE(team->t.t_mxcsr, mxcsr);
1139 // Although we don't use this value, other code in the runtime wants to know
1140 // whether it should restore them. So we must ensure it is correct.
1141 KMP_CHECK_UPDATE(team->t.t_fp_control_saved, TRUE);
1142 } else {
1143 // Similarly here. Don't write to this cache-line in the team structure
1144 // unless we have to.
1145 KMP_CHECK_UPDATE(team->t.t_fp_control_saved, FALSE);
1146 }
1147 }
1148
1149 // Do the opposite, setting the hardware registers to the updated values from
1150 // the team.
1151 inline static void updateHWFPControl(kmp_team_t *team) {
1152 if (__kmp_inherit_fp_control && team->t.t_fp_control_saved) {
1153 // Only reset the fp control regs if they have been changed in the team
1154 // during the parallel region that we are exiting.
1155 kmp_int16 x87_fpu_control_word;
1156 kmp_uint32 mxcsr;
1157 __kmp_store_x87_fpu_control_word(&x87_fpu_control_word);
1158 __kmp_store_mxcsr(&mxcsr);
1159 mxcsr &= KMP_X86_MXCSR_MASK;
1160
1161 if (team->t.t_x87_fpu_control_word != x87_fpu_control_word) {
1162 __kmp_clear_x87_fpu_status_word();
1163 __kmp_load_x87_fpu_control_word(&team->t.t_x87_fpu_control_word);
1164 }
1165
1166 if (team->t.t_mxcsr != mxcsr) {
1167 __kmp_load_mxcsr(&team->t.t_mxcsr);
1168 }
1169 }
1170 }
1171 #else
1172 #define propagateFPControl(x) ((void)0)
1173 #define updateHWFPControl(x) ((void)0)
1174 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
1175
1176 static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team,
1177 int realloc); // forward declaration
1178
1179 /* Run a parallel region that has been serialized, so runs only in a team of the
1180 single primary thread. */
1181 void __kmp_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
1182 kmp_info_t *this_thr;
1183 kmp_team_t *serial_team;
1184
1185 KC_TRACE(10, ("__kmpc_serialized_parallel: called by T#%d\n", global_tid));
1186
1187 /* Skip all this code for autopar serialized loops since it results in
1188 unacceptable overhead */
1189 if (loc != NULL && (loc->flags & KMP_IDENT_AUTOPAR))
1190 return;
1191
1192 if (!TCR_4(__kmp_init_parallel))
1193 __kmp_parallel_initialize();
1194 __kmp_resume_if_soft_paused();
1195
1196 this_thr = __kmp_threads[global_tid];
1197 serial_team = this_thr->th.th_serial_team;
1198
1199 /* utilize the serialized team held by this thread */
1200 KMP_DEBUG_ASSERT(serial_team);
1201 KMP_MB();
1202
1203 kmp_proc_bind_t proc_bind = this_thr->th.th_set_proc_bind;
1204 if (this_thr->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
1205 proc_bind = proc_bind_false;
1206 } else if (proc_bind == proc_bind_default) {
1207 // No proc_bind clause was specified, so use the current value
1208 // of proc-bind-var for this parallel region.
1209 proc_bind = this_thr->th.th_current_task->td_icvs.proc_bind;
1210 }
1211 // Reset for next parallel region
1212 this_thr->th.th_set_proc_bind = proc_bind_default;
1213
1214 // Reset num_threads for next parallel region
1215 this_thr->th.th_set_nproc = 0;
1216
1217 #if OMPT_SUPPORT
1218 ompt_data_t ompt_parallel_data = ompt_data_none;
1219 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(global_tid);
1220 if (ompt_enabled.enabled &&
1221 this_thr->th.ompt_thread_info.state != ompt_state_overhead) {
1222
1223 ompt_task_info_t *parent_task_info;
1224 parent_task_info = OMPT_CUR_TASK_INFO(this_thr);
1225
1226 parent_task_info->frame.enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1227 if (ompt_enabled.ompt_callback_parallel_begin) {
1228 int team_size = 1;
1229
1230 ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)(
1231 &(parent_task_info->task_data), &(parent_task_info->frame),
1232 &ompt_parallel_data, team_size,
1233 ompt_parallel_invoker_program | ompt_parallel_team, codeptr);
1234 }
1235 }
1236 #endif // OMPT_SUPPORT
1237
1238 if (this_thr->th.th_team != serial_team) {
1239 // Nested level will be an index in the nested nthreads array
1240 int level = this_thr->th.th_team->t.t_level;
1241
1242 if (serial_team->t.t_serialized) {
1243 /* this serial team was already used
1244 TODO: increase performance by making these locks more specific */
1245 kmp_team_t *new_team;
1246
1247 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
1248
1249 new_team =
1250 __kmp_allocate_team(this_thr->th.th_root, 1, 1,
1251 #if OMPT_SUPPORT
1252 ompt_parallel_data,
1253 #endif
1254 proc_bind, &this_thr->th.th_current_task->td_icvs,
1255 0 USE_NESTED_HOT_ARG(NULL));
1256 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
1257 KMP_ASSERT(new_team);
1258
1259 /* setup new serialized team and install it */
1260 new_team->t.t_threads[0] = this_thr;
1261 new_team->t.t_parent = this_thr->th.th_team;
1262 serial_team = new_team;
1263 this_thr->th.th_serial_team = serial_team;
1264
1265 KF_TRACE(
1266 10,
1267 ("__kmpc_serialized_parallel: T#%d allocated new serial team %p\n",
1268 global_tid, serial_team));
1269
1270 /* TODO the above breaks the requirement that if we run out of resources,
1271 then we can still guarantee that serialized teams are ok, since we may
1272 need to allocate a new one */
1273 } else {
1274 KF_TRACE(
1275 10,
1276 ("__kmpc_serialized_parallel: T#%d reusing cached serial team %p\n",
1277 global_tid, serial_team));
1278 }
1279
1280 /* we have to initialize this serial team */
1281 KMP_DEBUG_ASSERT(serial_team->t.t_threads);
1282 KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
1283 KMP_DEBUG_ASSERT(this_thr->th.th_team != serial_team);
1284 serial_team->t.t_ident = loc;
1285 serial_team->t.t_serialized = 1;
1286 serial_team->t.t_nproc = 1;
1287 serial_team->t.t_parent = this_thr->th.th_team;
1288 if (this_thr->th.th_team->t.t_nested_nth)
1289 serial_team->t.t_nested_nth = this_thr->th.th_team->t.t_nested_nth;
1290 else
1291 serial_team->t.t_nested_nth = &__kmp_nested_nth;
1292 // Save previous team's task state on serial team structure
1293 serial_team->t.t_primary_task_state = this_thr->th.th_task_state;
1294 serial_team->t.t_sched.sched = this_thr->th.th_team->t.t_sched.sched;
1295 this_thr->th.th_team = serial_team;
1296 serial_team->t.t_master_tid = this_thr->th.th_info.ds.ds_tid;
1297
1298 KF_TRACE(10, ("__kmpc_serialized_parallel: T#%d curtask=%p\n", global_tid,
1299 this_thr->th.th_current_task));
1300 KMP_ASSERT(this_thr->th.th_current_task->td_flags.executing == 1);
1301 this_thr->th.th_current_task->td_flags.executing = 0;
1302
1303 __kmp_push_current_task_to_thread(this_thr, serial_team, 0);
1304
1305 /* TODO: GEH: do ICVs work for nested serialized teams? Don't we need an
1306 implicit task for each serialized task represented by
1307 team->t.t_serialized? */
1308 copy_icvs(&this_thr->th.th_current_task->td_icvs,
1309 &this_thr->th.th_current_task->td_parent->td_icvs);
1310
1311 // Thread value exists in the nested nthreads array for the next nested
1312 // level
1313 kmp_nested_nthreads_t *nested_nth = &__kmp_nested_nth;
1314 if (this_thr->th.th_team->t.t_nested_nth)
1315 nested_nth = this_thr->th.th_team->t.t_nested_nth;
1316 if (nested_nth->used && (level + 1 < nested_nth->used)) {
1317 this_thr->th.th_current_task->td_icvs.nproc = nested_nth->nth[level + 1];
1318 }
1319
1320 if (__kmp_nested_proc_bind.used &&
1321 (level + 1 < __kmp_nested_proc_bind.used)) {
1322 this_thr->th.th_current_task->td_icvs.proc_bind =
1323 __kmp_nested_proc_bind.bind_types[level + 1];
1324 }
1325
1326 #if USE_DEBUGGER
1327 serial_team->t.t_pkfn = (microtask_t)(~0); // For the debugger.
1328 #endif
1329 this_thr->th.th_info.ds.ds_tid = 0;
1330
1331 /* set thread cache values */
1332 this_thr->th.th_team_nproc = 1;
1333 this_thr->th.th_team_master = this_thr;
1334 this_thr->th.th_team_serialized = 1;
1335 this_thr->th.th_task_team = NULL;
1336 this_thr->th.th_task_state = 0;
1337
1338 serial_team->t.t_level = serial_team->t.t_parent->t.t_level + 1;
1339 serial_team->t.t_active_level = serial_team->t.t_parent->t.t_active_level;
1340 serial_team->t.t_def_allocator = this_thr->th.th_def_allocator; // save
1341
1342 propagateFPControl(serial_team);
1343
1344 /* check if we need to allocate dispatch buffers stack */
1345 KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
1346 if (!serial_team->t.t_dispatch->th_disp_buffer) {
1347 serial_team->t.t_dispatch->th_disp_buffer =
1348 (dispatch_private_info_t *)__kmp_allocate(
1349 sizeof(dispatch_private_info_t));
1350 }
1351 this_thr->th.th_dispatch = serial_team->t.t_dispatch;
1352
1353 KMP_MB();
1354
1355 } else {
1356 /* this serialized team is already being used,
1357 * that's fine, just add another nested level */
1358 KMP_DEBUG_ASSERT(this_thr->th.th_team == serial_team);
1359 KMP_DEBUG_ASSERT(serial_team->t.t_threads);
1360 KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
1361 ++serial_team->t.t_serialized;
1362 this_thr->th.th_team_serialized = serial_team->t.t_serialized;
1363
1364 // Nested level will be an index in the nested nthreads array
1365 int level = this_thr->th.th_team->t.t_level;
1366 // Thread value exists in the nested nthreads array for the next nested
1367 // level
1368
1369 kmp_nested_nthreads_t *nested_nth = &__kmp_nested_nth;
1370 if (serial_team->t.t_nested_nth)
1371 nested_nth = serial_team->t.t_nested_nth;
1372 if (nested_nth->used && (level + 1 < nested_nth->used)) {
1373 this_thr->th.th_current_task->td_icvs.nproc = nested_nth->nth[level + 1];
1374 }
1375
1376 serial_team->t.t_level++;
1377 KF_TRACE(10, ("__kmpc_serialized_parallel: T#%d increasing nesting level "
1378 "of serial team %p to %d\n",
1379 global_tid, serial_team, serial_team->t.t_level));
1380
1381 /* allocate/push dispatch buffers stack */
1382 KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
1383 {
1384 dispatch_private_info_t *disp_buffer =
1385 (dispatch_private_info_t *)__kmp_allocate(
1386 sizeof(dispatch_private_info_t));
1387 disp_buffer->next = serial_team->t.t_dispatch->th_disp_buffer;
1388 serial_team->t.t_dispatch->th_disp_buffer = disp_buffer;
1389 }
1390 this_thr->th.th_dispatch = serial_team->t.t_dispatch;
1391
1392 /* allocate/push task team stack */
1393 __kmp_push_task_team_node(this_thr, serial_team);
1394
1395 KMP_MB();
1396 }
1397 KMP_CHECK_UPDATE(serial_team->t.t_cancel_request, cancel_noreq);
1398
1399 // Perform the display affinity functionality for
1400 // serialized parallel regions
1401 if (__kmp_display_affinity) {
1402 if (this_thr->th.th_prev_level != serial_team->t.t_level ||
1403 this_thr->th.th_prev_num_threads != 1) {
1404 // NULL means use the affinity-format-var ICV
1405 __kmp_aux_display_affinity(global_tid, NULL);
1406 this_thr->th.th_prev_level = serial_team->t.t_level;
1407 this_thr->th.th_prev_num_threads = 1;
1408 }
1409 }
1410
1411 if (__kmp_env_consistency_check)
1412 __kmp_push_parallel(global_tid, NULL);
1413 #if OMPT_SUPPORT
1414 serial_team->t.ompt_team_info.master_return_address = codeptr;
1415 if (ompt_enabled.enabled &&
1416 this_thr->th.ompt_thread_info.state != ompt_state_overhead) {
1417 OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame.ptr =
1418 OMPT_GET_FRAME_ADDRESS(0);
1419
1420 ompt_lw_taskteam_t lw_taskteam;
1421 __ompt_lw_taskteam_init(&lw_taskteam, this_thr, global_tid,
1422 &ompt_parallel_data, codeptr);
1423
1424 __ompt_lw_taskteam_link(&lw_taskteam, this_thr, 1);
1425 // don't use lw_taskteam after linking. Content was swapped.
1426
1427 /* OMPT implicit task begin */
1428 if (ompt_enabled.ompt_callback_implicit_task) {
1429 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1430 ompt_scope_begin, OMPT_CUR_TEAM_DATA(this_thr),
1431 OMPT_CUR_TASK_DATA(this_thr), 1, __kmp_tid_from_gtid(global_tid),
1432 ompt_task_implicit); // TODO: Can this be ompt_task_initial?
1433 OMPT_CUR_TASK_INFO(this_thr)->thread_num =
1434 __kmp_tid_from_gtid(global_tid);
1435 }
1436
1437 /* OMPT state */
1438 this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
1439 OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame.ptr =
1440 OMPT_GET_FRAME_ADDRESS(0);
1441 }
1442 #endif
1443 }
1444
1445 // Test if this fork is for a team closely nested in a teams construct
1446 static inline bool __kmp_is_fork_in_teams(kmp_info_t *master_th,
1447 microtask_t microtask, int level,
1448 int teams_level, kmp_va_list ap) {
1449 return (master_th->th.th_teams_microtask && ap &&
1450 microtask != (microtask_t)__kmp_teams_master && level == teams_level);
1451 }
1452
1453 // Test if this fork is for the teams construct, i.e. to form the outer league
1454 // of teams
1455 static inline bool __kmp_is_entering_teams(int active_level, int level,
1456 int teams_level, kmp_va_list ap) {
1457 return ((ap == NULL && active_level == 0) ||
1458 (ap && teams_level > 0 && teams_level == level));
1459 }
1460
1461 // AC: This is the start of a parallel region nested inside a teams construct.
1462 // The team is actual (hot), all workers are ready at the fork barrier.
1463 // No lock needed to initialize the team a bit, then free workers.
1464 static inline int
1465 __kmp_fork_in_teams(ident_t *loc, int gtid, kmp_team_t *parent_team,
1466 kmp_int32 argc, kmp_info_t *master_th, kmp_root_t *root,
1467 enum fork_context_e call_context, microtask_t microtask,
1468 launch_t invoker, int master_set_numthreads, int level,
1469 #if OMPT_SUPPORT
1470 ompt_data_t ompt_parallel_data, void *return_address,
1471 #endif
1472 kmp_va_list ap) {
1473 void **argv;
1474 int i;
1475
1476 parent_team->t.t_ident = loc;
1477 __kmp_alloc_argv_entries(argc, parent_team, TRUE);
1478 parent_team->t.t_argc = argc;
1479 argv = (void **)parent_team->t.t_argv;
1480 for (i = argc - 1; i >= 0; --i) {
1481 *argv++ = va_arg(kmp_va_deref(ap), void *);
1482 }
1483 // Increment our nested depth levels, but do not increase the serialization.
1484 if (parent_team == master_th->th.th_serial_team) {
1485 // AC: we are in serialized parallel
1486 __kmpc_serialized_parallel(loc, gtid);
1487 KMP_DEBUG_ASSERT(parent_team->t.t_serialized > 1);
1488
1489 if (call_context == fork_context_gnu) {
1490 // AC: need to decrement t_serialized for enquiry functions to work
1491 // correctly, will restore at join time
1492 parent_team->t.t_serialized--;
1493 return TRUE;
1494 }
1495
1496 #if OMPD_SUPPORT
1497 parent_team->t.t_pkfn = microtask;
1498 #endif
1499
1500 #if OMPT_SUPPORT
1501 void *dummy;
1502 void **exit_frame_p;
1503 ompt_data_t *implicit_task_data;
1504 ompt_lw_taskteam_t lw_taskteam;
1505
1506 if (ompt_enabled.enabled) {
1507 __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1508 &ompt_parallel_data, return_address);
1509 exit_frame_p = &(lw_taskteam.ompt_task_info.frame.exit_frame.ptr);
1510
1511 __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
1512 // Don't use lw_taskteam after linking. Content was swapped.
1513
1514 /* OMPT implicit task begin */
1515 implicit_task_data = OMPT_CUR_TASK_DATA(master_th);
1516 if (ompt_enabled.ompt_callback_implicit_task) {
1517 OMPT_CUR_TASK_INFO(master_th)->thread_num = __kmp_tid_from_gtid(gtid);
1518 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1519 ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th), implicit_task_data,
1520 1, OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
1521 }
1522
1523 /* OMPT state */
1524 master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1525 } else {
1526 exit_frame_p = &dummy;
1527 }
1528 #endif
1529
1530 // AC: need to decrement t_serialized for enquiry functions to work
1531 // correctly, will restore at join time
1532 parent_team->t.t_serialized--;
1533
1534 {
1535 KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1536 KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1537 __kmp_invoke_microtask(microtask, gtid, 0, argc, parent_team->t.t_argv
1538 #if OMPT_SUPPORT
1539 ,
1540 exit_frame_p
1541 #endif
1542 );
1543 }
1544
1545 #if OMPT_SUPPORT
1546 if (ompt_enabled.enabled) {
1547 *exit_frame_p = NULL;
1548 OMPT_CUR_TASK_INFO(master_th)->frame.exit_frame = ompt_data_none;
1549 if (ompt_enabled.ompt_callback_implicit_task) {
1550 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1551 ompt_scope_end, NULL, implicit_task_data, 1,
1552 OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
1553 }
1554 ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
1555 __ompt_lw_taskteam_unlink(master_th);
1556 if (ompt_enabled.ompt_callback_parallel_end) {
1557 ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1558 &ompt_parallel_data, OMPT_CUR_TASK_DATA(master_th),
1559 OMPT_INVOKER(call_context) | ompt_parallel_team, return_address);
1560 }
1561 master_th->th.ompt_thread_info.state = ompt_state_overhead;
1562 }
1563 #endif
1564 return TRUE;
1565 }
1566
1567 parent_team->t.t_pkfn = microtask;
1568 parent_team->t.t_invoke = invoker;
1569 KMP_ATOMIC_INC(&root->r.r_in_parallel);
1570 parent_team->t.t_active_level++;
1571 parent_team->t.t_level++;
1572 parent_team->t.t_def_allocator = master_th->th.th_def_allocator; // save
1573
1574   // If the threads allocated to the team are fewer than the thread limit,
1575   // update the thread limit here. th_teams_size.nth is specific to this team,
1576   // which is nested in a teams construct; the team is fully created, and we're
1577   // about to do the actual fork. Best to do this here so that the subsequent
1578   // uses below and in the join have the correct value.
1579 master_th->th.th_teams_size.nth = parent_team->t.t_nproc;
1580
1581 #if OMPT_SUPPORT
1582 if (ompt_enabled.enabled) {
1583 ompt_lw_taskteam_t lw_taskteam;
1584 __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid, &ompt_parallel_data,
1585 return_address);
1586 __ompt_lw_taskteam_link(&lw_taskteam, master_th, 1, true);
1587 }
1588 #endif
1589
1590 /* Change number of threads in the team if requested */
1591 if (master_set_numthreads) { // The parallel has num_threads clause
1592 if (master_set_numthreads <= master_th->th.th_teams_size.nth) {
1593 // AC: only can reduce number of threads dynamically, can't increase
1594 kmp_info_t **other_threads = parent_team->t.t_threads;
1595 // NOTE: if using distributed barrier, we need to run this code block
1596 // even when the team size appears not to have changed from the max.
1597 int old_proc = master_th->th.th_teams_size.nth;
1598 if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
1599 __kmp_resize_dist_barrier(parent_team, old_proc, master_set_numthreads);
1600 __kmp_add_threads_to_team(parent_team, master_set_numthreads);
1601 }
1602 parent_team->t.t_nproc = master_set_numthreads;
1603 for (i = 0; i < master_set_numthreads; ++i) {
1604 other_threads[i]->th.th_team_nproc = master_set_numthreads;
1605 }
1606 }
1607 // Keep extra threads hot in the team for possible next parallels
1608 master_th->th.th_set_nproc = 0;
1609 }
1610
1611 #if USE_DEBUGGER
1612 if (__kmp_debugging) { // Let debugger override number of threads.
1613 int nth = __kmp_omp_num_threads(loc);
1614 if (nth > 0) { // 0 means debugger doesn't want to change num threads
1615 master_set_numthreads = nth;
1616 }
1617 }
1618 #endif
1619
1620 // Figure out the proc_bind policy for the nested parallel within teams
1621 kmp_proc_bind_t proc_bind = master_th->th.th_set_proc_bind;
1622 // proc_bind_default means don't update
1623 kmp_proc_bind_t proc_bind_icv = proc_bind_default;
1624 if (master_th->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
1625 proc_bind = proc_bind_false;
1626 } else {
1627 // No proc_bind clause specified; use current proc-bind-var
1628 if (proc_bind == proc_bind_default) {
1629 proc_bind = master_th->th.th_current_task->td_icvs.proc_bind;
1630 }
1631 /* else: The proc_bind policy was specified explicitly on parallel clause.
1632 This overrides proc-bind-var for this parallel region, but does not
1633 change proc-bind-var. */
1634 // Figure the value of proc-bind-var for the child threads.
1635 if ((level + 1 < __kmp_nested_proc_bind.used) &&
1636 (__kmp_nested_proc_bind.bind_types[level + 1] !=
1637 master_th->th.th_current_task->td_icvs.proc_bind)) {
1638 proc_bind_icv = __kmp_nested_proc_bind.bind_types[level + 1];
1639 }
1640 }
1641 KMP_CHECK_UPDATE(parent_team->t.t_proc_bind, proc_bind);
1642 // Need to change the bind-var ICV to correct value for each implicit task
1643 if (proc_bind_icv != proc_bind_default &&
1644 master_th->th.th_current_task->td_icvs.proc_bind != proc_bind_icv) {
1645 kmp_info_t **other_threads = parent_team->t.t_threads;
1646 for (i = 0; i < master_th->th.th_team_nproc; ++i) {
1647 other_threads[i]->th.th_current_task->td_icvs.proc_bind = proc_bind_icv;
1648 }
1649 }
1650 // Reset for next parallel region
1651 master_th->th.th_set_proc_bind = proc_bind_default;
1652
1653 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1654 if (((__itt_frame_submit_v3_ptr && __itt_get_timestamp_ptr) ||
1655 KMP_ITT_DEBUG) &&
1656 __kmp_forkjoin_frames_mode == 3 &&
1657 parent_team->t.t_active_level == 1 // only report frames at level 1
1658 && master_th->th.th_teams_size.nteams == 1) {
1659 kmp_uint64 tmp_time = __itt_get_timestamp();
1660 master_th->th.th_frame_time = tmp_time;
1661 parent_team->t.t_region_time = tmp_time;
1662 }
1663 if (__itt_stack_caller_create_ptr) {
1664 KMP_DEBUG_ASSERT(parent_team->t.t_stack_id == NULL);
1665 // create new stack stitching id before entering fork barrier
1666 parent_team->t.t_stack_id = __kmp_itt_stack_caller_create();
1667 }
1668 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
1669 #if KMP_AFFINITY_SUPPORTED
1670 __kmp_partition_places(parent_team);
1671 #endif
1672
1673 KF_TRACE(10, ("__kmp_fork_in_teams: before internal fork: root=%p, team=%p, "
1674 "master_th=%p, gtid=%d\n",
1675 root, parent_team, master_th, gtid));
1676 __kmp_internal_fork(loc, gtid, parent_team);
1677 KF_TRACE(10, ("__kmp_fork_in_teams: after internal fork: root=%p, team=%p, "
1678 "master_th=%p, gtid=%d\n",
1679 root, parent_team, master_th, gtid));
1680
1681 if (call_context == fork_context_gnu)
1682 return TRUE;
1683
1684 /* Invoke microtask for PRIMARY thread */
1685 KA_TRACE(20, ("__kmp_fork_in_teams: T#%d(%d:0) invoke microtask = %p\n", gtid,
1686 parent_team->t.t_id, parent_team->t.t_pkfn));
1687
1688 if (!parent_team->t.t_invoke(gtid)) {
1689 KMP_ASSERT2(0, "cannot invoke microtask for PRIMARY thread");
1690 }
1691 KA_TRACE(20, ("__kmp_fork_in_teams: T#%d(%d:0) done microtask = %p\n", gtid,
1692 parent_team->t.t_id, parent_team->t.t_pkfn));
1693 KMP_MB(); /* Flush all pending memory write invalidates. */
1694
1695 KA_TRACE(20, ("__kmp_fork_in_teams: parallel exit T#%d\n", gtid));
1696
1697 return TRUE;
1698 }
1699
1700 // Create a serialized parallel region
1701 static inline int
1702 __kmp_serial_fork_call(ident_t *loc, int gtid, enum fork_context_e call_context,
1703 kmp_int32 argc, microtask_t microtask, launch_t invoker,
1704 kmp_info_t *master_th, kmp_team_t *parent_team,
1705 #if OMPT_SUPPORT
1706 ompt_data_t *ompt_parallel_data, void **return_address,
1707 ompt_data_t **parent_task_data,
1708 #endif
1709 kmp_va_list ap) {
1710 kmp_team_t *team;
1711 int i;
1712 void **argv;
1713
1714 /* josh todo: hypothetical question: what do we do for OS X*? */
1715 #if KMP_OS_LINUX && \
1716 (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
1717 SimpleVLA<void *> args(argc);
1718 #else
1719 void **args = (void **)KMP_ALLOCA(argc * sizeof(void *));
1720 #endif /* KMP_OS_LINUX && ( KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || \
1721 KMP_ARCH_AARCH64) */
1722
1723 KA_TRACE(
1724 20, ("__kmp_serial_fork_call: T#%d serializing parallel region\n", gtid));
1725
1726 __kmpc_serialized_parallel(loc, gtid);
1727
1728 #if OMPD_SUPPORT
1729 master_th->th.th_serial_team->t.t_pkfn = microtask;
1730 #endif
1731
1732 if (call_context == fork_context_intel) {
1733 /* TODO this sucks, use the compiler itself to pass args! :) */
1734 master_th->th.th_serial_team->t.t_ident = loc;
1735 if (!ap) {
1736 // revert change made in __kmpc_serialized_parallel()
1737 master_th->th.th_serial_team->t.t_level--;
1738 // Get args from parent team for teams construct
1739
1740 #if OMPT_SUPPORT
1741 void *dummy;
1742 void **exit_frame_p;
1743 ompt_task_info_t *task_info;
1744 ompt_lw_taskteam_t lw_taskteam;
1745
1746 if (ompt_enabled.enabled) {
1747 __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1748 ompt_parallel_data, *return_address);
1749
1750 __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
1751       // Don't use lw_taskteam after linking. Content was swapped.
1752 task_info = OMPT_CUR_TASK_INFO(master_th);
1753 exit_frame_p = &(task_info->frame.exit_frame.ptr);
1754 if (ompt_enabled.ompt_callback_implicit_task) {
1755 OMPT_CUR_TASK_INFO(master_th)->thread_num = __kmp_tid_from_gtid(gtid);
1756 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1757 ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1758 &(task_info->task_data), 1,
1759 OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
1760 }
1761
1762 /* OMPT state */
1763 master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1764 } else {
1765 exit_frame_p = &dummy;
1766 }
1767 #endif
1768
1769 {
1770 KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1771 KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1772 __kmp_invoke_microtask(microtask, gtid, 0, argc, parent_team->t.t_argv
1773 #if OMPT_SUPPORT
1774 ,
1775 exit_frame_p
1776 #endif
1777 );
1778 }
1779
1780 #if OMPT_SUPPORT
1781 if (ompt_enabled.enabled) {
1782 *exit_frame_p = NULL;
1783 if (ompt_enabled.ompt_callback_implicit_task) {
1784 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1785 ompt_scope_end, NULL, &(task_info->task_data), 1,
1786 OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
1787 }
1788 *ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
1789 __ompt_lw_taskteam_unlink(master_th);
1790 if (ompt_enabled.ompt_callback_parallel_end) {
1791 ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1792 ompt_parallel_data, *parent_task_data,
1793 OMPT_INVOKER(call_context) | ompt_parallel_team, *return_address);
1794 }
1795 master_th->th.ompt_thread_info.state = ompt_state_overhead;
1796 }
1797 #endif
1798 } else if (microtask == (microtask_t)__kmp_teams_master) {
1799 KMP_DEBUG_ASSERT(master_th->th.th_team == master_th->th.th_serial_team);
1800 team = master_th->th.th_team;
1801 // team->t.t_pkfn = microtask;
1802 team->t.t_invoke = invoker;
1803 __kmp_alloc_argv_entries(argc, team, TRUE);
1804 team->t.t_argc = argc;
1805 argv = (void **)team->t.t_argv;
1806 for (i = argc - 1; i >= 0; --i)
1807 *argv++ = va_arg(kmp_va_deref(ap), void *);
1808 // AC: revert change made in __kmpc_serialized_parallel()
1809 // because initial code in teams should have level=0
1810 team->t.t_level--;
1811 // AC: call special invoker for outer "parallel" of teams construct
1812 invoker(gtid);
1813 #if OMPT_SUPPORT
1814 if (ompt_enabled.enabled) {
1815 ompt_task_info_t *task_info = OMPT_CUR_TASK_INFO(master_th);
1816 if (ompt_enabled.ompt_callback_implicit_task) {
1817 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1818 ompt_scope_end, NULL, &(task_info->task_data), 0,
1819 OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_initial);
1820 }
1821 if (ompt_enabled.ompt_callback_parallel_end) {
1822 ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1823 ompt_parallel_data, *parent_task_data,
1824 OMPT_INVOKER(call_context) | ompt_parallel_league,
1825 *return_address);
1826 }
1827 master_th->th.ompt_thread_info.state = ompt_state_overhead;
1828 }
1829 #endif
1830 } else {
1831 argv = args;
1832 for (i = argc - 1; i >= 0; --i)
1833 *argv++ = va_arg(kmp_va_deref(ap), void *);
1834 KMP_MB();
1835
1836 #if OMPT_SUPPORT
1837 void *dummy;
1838 void **exit_frame_p;
1839 ompt_task_info_t *task_info;
1840 ompt_lw_taskteam_t lw_taskteam;
1841 ompt_data_t *implicit_task_data;
1842
1843 if (ompt_enabled.enabled) {
1844 __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1845 ompt_parallel_data, *return_address);
1846 __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
1847         // Don't use lw_taskteam after linking. Content was swapped.
1848 task_info = OMPT_CUR_TASK_INFO(master_th);
1849 exit_frame_p = &(task_info->frame.exit_frame.ptr);
1850
1851 /* OMPT implicit task begin */
1852 implicit_task_data = OMPT_CUR_TASK_DATA(master_th);
1853 if (ompt_enabled.ompt_callback_implicit_task) {
1854 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1855 ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1856 implicit_task_data, 1, __kmp_tid_from_gtid(gtid),
1857 ompt_task_implicit);
1858 OMPT_CUR_TASK_INFO(master_th)->thread_num = __kmp_tid_from_gtid(gtid);
1859 }
1860
1861 /* OMPT state */
1862 master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1863 } else {
1864 exit_frame_p = &dummy;
1865 }
1866 #endif
1867
1868 {
1869 KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1870 KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1871 __kmp_invoke_microtask(microtask, gtid, 0, argc, args
1872 #if OMPT_SUPPORT
1873 ,
1874 exit_frame_p
1875 #endif
1876 );
1877 }
1878
1879 #if OMPT_SUPPORT
1880 if (ompt_enabled.enabled) {
1881 *exit_frame_p = NULL;
1882 if (ompt_enabled.ompt_callback_implicit_task) {
1883 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1884 ompt_scope_end, NULL, &(task_info->task_data), 1,
1885 OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
1886 }
1887
1888 *ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
1889 __ompt_lw_taskteam_unlink(master_th);
1890 if (ompt_enabled.ompt_callback_parallel_end) {
1891 ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1892 ompt_parallel_data, *parent_task_data,
1893 OMPT_INVOKER(call_context) | ompt_parallel_team, *return_address);
1894 }
1895 master_th->th.ompt_thread_info.state = ompt_state_overhead;
1896 }
1897 #endif
1898 }
1899 } else if (call_context == fork_context_gnu) {
1900 #if OMPT_SUPPORT
1901 if (ompt_enabled.enabled) {
1902 ompt_lw_taskteam_t lwt;
1903 __ompt_lw_taskteam_init(&lwt, master_th, gtid, ompt_parallel_data,
1904 *return_address);
1905
1906 lwt.ompt_task_info.frame.exit_frame = ompt_data_none;
1907 __ompt_lw_taskteam_link(&lwt, master_th, 1);
1908 }
1909     // Don't use lw_taskteam after linking. Content was swapped.
1910 #endif
1911
1912 // we were called from GNU native code
1913 KA_TRACE(20, ("__kmp_serial_fork_call: T#%d serial exit\n", gtid));
1914 return FALSE;
1915 } else {
1916 KMP_ASSERT2(call_context < fork_context_last,
1917 "__kmp_serial_fork_call: unknown fork_context parameter");
1918 }
1919
1920 KA_TRACE(20, ("__kmp_serial_fork_call: T#%d serial exit\n", gtid));
1921 KMP_MB();
1922 return FALSE;
1923 }
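// Editor's note (annotation, not upstream code): every path above returns
// FALSE, telling __kmp_fork_call that no real team was forked. For
// fork_context_intel the microtask runs inline: with no varargs list the
// arguments come from the parent team (parallel nested in teams); a
// __kmp_teams_master microtask is instead routed through the special invoker
// on the serial team; otherwise the varargs are copied locally and invoked.
// For fork_context_gnu only the OMPT lightweight task team is linked and the
// GNU caller runs the outlined body itself.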
1924
1925 /* most of the work for a fork */
1926 /* return true if we really went parallel, false if serialized */
1927 int __kmp_fork_call(ident_t *loc, int gtid,
1928 enum fork_context_e call_context, // Intel, GNU, ...
1929 kmp_int32 argc, microtask_t microtask, launch_t invoker,
1930 kmp_va_list ap) {
1931 void **argv;
1932 int i;
1933 int master_tid;
1934 int master_this_cons;
1935 kmp_team_t *team;
1936 kmp_team_t *parent_team;
1937 kmp_info_t *master_th;
1938 kmp_root_t *root;
1939 int nthreads;
1940 int master_active;
1941 int master_set_numthreads;
1942 int task_thread_limit = 0;
1943 int level;
1944 int active_level;
1945 int teams_level;
1946 #if KMP_NESTED_HOT_TEAMS
1947 kmp_hot_team_ptr_t **p_hot_teams;
1948 #endif
1949 { // KMP_TIME_BLOCK
1950 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_fork_call);
1951 KMP_COUNT_VALUE(OMP_PARALLEL_args, argc);
1952
1953 KA_TRACE(20, ("__kmp_fork_call: enter T#%d\n", gtid));
1954 if (__kmp_stkpadding > 0 && __kmp_root[gtid] != NULL) {
1955 /* Some systems prefer the stack for the root thread(s) to start with */
1956 /* some gap from the parent stack to prevent false sharing. */
1957 void *dummy = KMP_ALLOCA(__kmp_stkpadding);
1958 /* These 2 lines below are so this does not get optimized out */
1959 if (__kmp_stkpadding > KMP_MAX_STKPADDING)
1960 __kmp_stkpadding += (short)((kmp_int64)dummy);
1961 }
1962
1963 /* initialize if needed */
1964 KMP_DEBUG_ASSERT(
1965 __kmp_init_serial); // AC: potentially unsafe, not in sync with shutdown
1966 if (!TCR_4(__kmp_init_parallel))
1967 __kmp_parallel_initialize();
1968 __kmp_resume_if_soft_paused();
1969
1970 /* setup current data */
1971 // AC: potentially unsafe, not in sync with library shutdown,
1972 // __kmp_threads can be freed
1973 master_th = __kmp_threads[gtid];
1974
1975 parent_team = master_th->th.th_team;
1976 master_tid = master_th->th.th_info.ds.ds_tid;
1977 master_this_cons = master_th->th.th_local.this_construct;
1978 root = master_th->th.th_root;
1979 master_active = root->r.r_active;
1980 master_set_numthreads = master_th->th.th_set_nproc;
1981 task_thread_limit =
1982 master_th->th.th_current_task->td_icvs.task_thread_limit;
1983
1984 #if OMPT_SUPPORT
1985 ompt_data_t ompt_parallel_data = ompt_data_none;
1986 ompt_data_t *parent_task_data;
1987 ompt_frame_t *ompt_frame;
1988 void *return_address = NULL;
1989
1990 if (ompt_enabled.enabled) {
1991 __ompt_get_task_info_internal(0, NULL, &parent_task_data, &ompt_frame,
1992 NULL, NULL);
1993 return_address = OMPT_LOAD_RETURN_ADDRESS(gtid);
1994 }
1995 #endif
1996
1997 // Assign affinity to root thread if it hasn't happened yet
1998 __kmp_assign_root_init_mask();
1999
2000 // Nested level will be an index in the nested nthreads array
2001 level = parent_team->t.t_level;
2002     // used to launch non-serial teams even if nesting is not allowed
2003 active_level = parent_team->t.t_active_level;
2004 // needed to check nesting inside the teams
2005 teams_level = master_th->th.th_teams_level;
2006 #if KMP_NESTED_HOT_TEAMS
2007 p_hot_teams = &master_th->th.th_hot_teams;
2008 if (*p_hot_teams == NULL && __kmp_hot_teams_max_level > 0) {
2009 *p_hot_teams = (kmp_hot_team_ptr_t *)__kmp_allocate(
2010 sizeof(kmp_hot_team_ptr_t) * __kmp_hot_teams_max_level);
2011 (*p_hot_teams)[0].hot_team = root->r.r_hot_team;
2012 // it is either actual or not needed (when active_level > 0)
2013 (*p_hot_teams)[0].hot_team_nth = 1;
2014 }
2015 #endif
2016
2017 #if OMPT_SUPPORT
2018 if (ompt_enabled.enabled) {
2019 if (ompt_enabled.ompt_callback_parallel_begin) {
2020 int team_size = master_set_numthreads
2021 ? master_set_numthreads
2022 : get__nproc_2(parent_team, master_tid);
2023 int flags = OMPT_INVOKER(call_context) |
2024 ((microtask == (microtask_t)__kmp_teams_master)
2025 ? ompt_parallel_league
2026 : ompt_parallel_team);
2027 ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)(
2028 parent_task_data, ompt_frame, &ompt_parallel_data, team_size, flags,
2029 return_address);
2030 }
2031 master_th->th.ompt_thread_info.state = ompt_state_overhead;
2032 }
2033 #endif
2034
2035 master_th->th.th_ident = loc;
2036
2037 // Parallel closely nested in teams construct:
2038 if (__kmp_is_fork_in_teams(master_th, microtask, level, teams_level, ap)) {
2039 return __kmp_fork_in_teams(loc, gtid, parent_team, argc, master_th, root,
2040 call_context, microtask, invoker,
2041 master_set_numthreads, level,
2042 #if OMPT_SUPPORT
2043 ompt_parallel_data, return_address,
2044 #endif
2045 ap);
2046 } // End parallel closely nested in teams construct
2047
2048 // Need this to happen before we determine the number of threads, not while
2049 // we are allocating the team
2050 //__kmp_push_current_task_to_thread(master_th, parent_team, 0);
2051
2052 KMP_DEBUG_ASSERT_TASKTEAM_INVARIANT(parent_team, master_th);
2053
2054 // Determine the number of threads
2055 int enter_teams =
2056 __kmp_is_entering_teams(active_level, level, teams_level, ap);
2057 if ((!enter_teams &&
2058 (parent_team->t.t_active_level >=
2059 master_th->th.th_current_task->td_icvs.max_active_levels)) ||
2060 (__kmp_library == library_serial)) {
2061 KC_TRACE(10, ("__kmp_fork_call: T#%d serializing team\n", gtid));
2062 nthreads = 1;
2063 } else {
2064 nthreads = master_set_numthreads
2065 ? master_set_numthreads
2066 // TODO: get nproc directly from current task
2067 : get__nproc_2(parent_team, master_tid);
2068       // Use the thread_limit set for the current target task if it exists,
2069       // else go with the deduced nthreads
2070 nthreads = task_thread_limit > 0 && task_thread_limit < nthreads
2071 ? task_thread_limit
2072 : nthreads;
2073       // Check whether we need to take the forkjoin lock (no need for a
2074       // serialized parallel outside of a teams construct).
2075 if (nthreads > 1) {
2076 /* determine how many new threads we can use */
2077 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
2078 /* AC: If we execute teams from parallel region (on host), then teams
2079 should be created but each can only have 1 thread if nesting is
2080 disabled. If teams called from serial region, then teams and their
2081 threads should be created regardless of the nesting setting. */
2082 nthreads = __kmp_reserve_threads(root, parent_team, master_tid,
2083 nthreads, enter_teams);
2084 if (nthreads == 1) {
2085 // Free lock for single thread execution here; for multi-thread
2086 // execution it will be freed later after team of threads created
2087 // and initialized
2088 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2089 }
2090 }
2091 }
2092 KMP_DEBUG_ASSERT(nthreads > 0);
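    // Editor's note (annotation, not upstream code): at this point nthreads
    // reflects, in order: forced serialization (nesting exhausted per
    // max_active_levels while not entering teams, or the library is in serial
    // mode); otherwise the num_threads clause value or the nproc ICV, capped
    // by the target task's thread_limit, then trimmed by __kmp_reserve_threads
    // under the forkjoin lock. When nthreads > 1 the lock is still held here
    // and is released only after the new team has been set up below.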
2093
2094 // If we temporarily changed the set number of threads then restore it now
2095 master_th->th.th_set_nproc = 0;
2096
2097 if (nthreads == 1) {
2098 return __kmp_serial_fork_call(loc, gtid, call_context, argc, microtask,
2099 invoker, master_th, parent_team,
2100 #if OMPT_SUPPORT
2101 &ompt_parallel_data, &return_address,
2102 &parent_task_data,
2103 #endif
2104 ap);
2105 } // if (nthreads == 1)
2106
2107 // GEH: only modify the executing flag in the case when not serialized
2108 // serialized case is handled in kmpc_serialized_parallel
2109 KF_TRACE(10, ("__kmp_fork_call: parent_team_aclevel=%d, master_th=%p, "
2110 "curtask=%p, curtask_max_aclevel=%d\n",
2111 parent_team->t.t_active_level, master_th,
2112 master_th->th.th_current_task,
2113 master_th->th.th_current_task->td_icvs.max_active_levels));
2114 // TODO: GEH - cannot do this assertion because root thread not set up as
2115 // executing
2116 // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 1 );
2117 master_th->th.th_current_task->td_flags.executing = 0;
2118
2119 if (!master_th->th.th_teams_microtask || level > teams_level) {
2120 /* Increment our nested depth level */
2121 KMP_ATOMIC_INC(&root->r.r_in_parallel);
2122 }
2123
2124 // See if we need to make a copy of the ICVs.
2125 int nthreads_icv = master_th->th.th_current_task->td_icvs.nproc;
2126 kmp_nested_nthreads_t *nested_nth = NULL;
2127 if (!master_th->th.th_set_nested_nth &&
2128 (level + 1 < parent_team->t.t_nested_nth->used) &&
2129 (parent_team->t.t_nested_nth->nth[level + 1] != nthreads_icv)) {
2130 nthreads_icv = parent_team->t.t_nested_nth->nth[level + 1];
2131 } else if (master_th->th.th_set_nested_nth) {
2132 nested_nth = __kmp_override_nested_nth(master_th, level);
2133 if ((level + 1 < nested_nth->used) &&
2134 (nested_nth->nth[level + 1] != nthreads_icv))
2135 nthreads_icv = nested_nth->nth[level + 1];
2136 else
2137 nthreads_icv = 0; // don't update
2138 } else {
2139 nthreads_icv = 0; // don't update
2140 }
2141
2142 // Figure out the proc_bind_policy for the new team.
2143 kmp_proc_bind_t proc_bind = master_th->th.th_set_proc_bind;
2144 // proc_bind_default means don't update
2145 kmp_proc_bind_t proc_bind_icv = proc_bind_default;
2146 if (master_th->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
2147 proc_bind = proc_bind_false;
2148 } else {
2149 // No proc_bind clause specified; use current proc-bind-var for this
2150 // parallel region
2151 if (proc_bind == proc_bind_default) {
2152 proc_bind = master_th->th.th_current_task->td_icvs.proc_bind;
2153 }
2154 // Have teams construct take proc_bind value from KMP_TEAMS_PROC_BIND
2155 if (master_th->th.th_teams_microtask &&
2156 microtask == (microtask_t)__kmp_teams_master) {
2157 proc_bind = __kmp_teams_proc_bind;
2158 }
2159 /* else: The proc_bind policy was specified explicitly on parallel clause.
2160 This overrides proc-bind-var for this parallel region, but does not
2161 change proc-bind-var. */
2162 // Figure the value of proc-bind-var for the child threads.
2163 if ((level + 1 < __kmp_nested_proc_bind.used) &&
2164 (__kmp_nested_proc_bind.bind_types[level + 1] !=
2165 master_th->th.th_current_task->td_icvs.proc_bind)) {
2166 // Do not modify the proc bind icv for the two teams construct forks
2167 // They just let the proc bind icv pass through
2168 if (!master_th->th.th_teams_microtask ||
2169 !(microtask == (microtask_t)__kmp_teams_master || ap == NULL))
2170 proc_bind_icv = __kmp_nested_proc_bind.bind_types[level + 1];
2171 }
2172 }
2173
2174 // Reset for next parallel region
2175 master_th->th.th_set_proc_bind = proc_bind_default;
2176
2177 if ((nthreads_icv > 0) || (proc_bind_icv != proc_bind_default)) {
2178 kmp_internal_control_t new_icvs;
2179 copy_icvs(&new_icvs, &master_th->th.th_current_task->td_icvs);
2180 new_icvs.next = NULL;
2181 if (nthreads_icv > 0) {
2182 new_icvs.nproc = nthreads_icv;
2183 }
2184 if (proc_bind_icv != proc_bind_default) {
2185 new_icvs.proc_bind = proc_bind_icv;
2186 }
2187
2188 /* allocate a new parallel team */
2189 KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n"));
2190 team = __kmp_allocate_team(root, nthreads, nthreads,
2191 #if OMPT_SUPPORT
2192 ompt_parallel_data,
2193 #endif
2194 proc_bind, &new_icvs,
2195 argc USE_NESTED_HOT_ARG(master_th));
2196 if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar)
2197 copy_icvs((kmp_internal_control_t *)team->t.b->team_icvs, &new_icvs);
2198 } else {
2199 /* allocate a new parallel team */
2200 KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n"));
2201 team = __kmp_allocate_team(root, nthreads, nthreads,
2202 #if OMPT_SUPPORT
2203 ompt_parallel_data,
2204 #endif
2205 proc_bind,
2206 &master_th->th.th_current_task->td_icvs,
2207 argc USE_NESTED_HOT_ARG(master_th));
2208 if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar)
2209 copy_icvs((kmp_internal_control_t *)team->t.b->team_icvs,
2210 &master_th->th.th_current_task->td_icvs);
2211 }
2212 KF_TRACE(
2213 10, ("__kmp_fork_call: after __kmp_allocate_team - team = %p\n", team));
2214
2215 /* setup the new team */
2216 KMP_CHECK_UPDATE(team->t.t_master_tid, master_tid);
2217 KMP_CHECK_UPDATE(team->t.t_master_this_cons, master_this_cons);
2218 KMP_CHECK_UPDATE(team->t.t_ident, loc);
2219 KMP_CHECK_UPDATE(team->t.t_parent, parent_team);
2220 KMP_CHECK_UPDATE_SYNC(team->t.t_pkfn, microtask);
2221 #if OMPT_SUPPORT
2222 KMP_CHECK_UPDATE_SYNC(team->t.ompt_team_info.master_return_address,
2223 return_address);
2224 #endif
2225 KMP_CHECK_UPDATE(team->t.t_invoke, invoker); // TODO move to root, maybe
2226 // TODO: parent_team->t.t_level == INT_MAX ???
2227 if (!master_th->th.th_teams_microtask || level > teams_level) {
2228 int new_level = parent_team->t.t_level + 1;
2229 KMP_CHECK_UPDATE(team->t.t_level, new_level);
2230 new_level = parent_team->t.t_active_level + 1;
2231 KMP_CHECK_UPDATE(team->t.t_active_level, new_level);
2232 } else {
2233 // AC: Do not increase parallel level at start of the teams construct
2234 int new_level = parent_team->t.t_level;
2235 KMP_CHECK_UPDATE(team->t.t_level, new_level);
2236 new_level = parent_team->t.t_active_level;
2237 KMP_CHECK_UPDATE(team->t.t_active_level, new_level);
2238 }
2239 kmp_r_sched_t new_sched = get__sched_2(parent_team, master_tid);
2240 // set primary thread's schedule as new run-time schedule
2241 KMP_CHECK_UPDATE(team->t.t_sched.sched, new_sched.sched);
2242
2243 KMP_CHECK_UPDATE(team->t.t_cancel_request, cancel_noreq);
2244 KMP_CHECK_UPDATE(team->t.t_def_allocator, master_th->th.th_def_allocator);
2245
2246 // Check if hot team has potentially outdated list, and if so, free it
2247 if (team->t.t_nested_nth &&
2248 team->t.t_nested_nth != parent_team->t.t_nested_nth) {
2249 KMP_INTERNAL_FREE(team->t.t_nested_nth->nth);
2250 KMP_INTERNAL_FREE(team->t.t_nested_nth);
2251 team->t.t_nested_nth = NULL;
2252 }
2253 team->t.t_nested_nth = parent_team->t.t_nested_nth;
2254 if (master_th->th.th_set_nested_nth) {
2255 if (!nested_nth)
2256 nested_nth = __kmp_override_nested_nth(master_th, level);
2257 team->t.t_nested_nth = nested_nth;
2258 KMP_INTERNAL_FREE(master_th->th.th_set_nested_nth);
2259 master_th->th.th_set_nested_nth = NULL;
2260 master_th->th.th_set_nested_nth_sz = 0;
2261 master_th->th.th_nt_strict = false;
2262 }
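    // Editor's note (annotation, not upstream code): after the block above the
    // team either shares the parent team's nested-nth list or owns the
    // override list built from this thread's th_set_nested_nth request, which
    // is consumed (freed and cleared) here; a stale list left on a reused hot
    // team was freed earlier if it differed from the parent's.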
2263
2264 // Update the floating point rounding in the team if required.
2265 propagateFPControl(team);
2266 #if OMPD_SUPPORT
2267 if (ompd_state & OMPD_ENABLE_BP)
2268 ompd_bp_parallel_begin();
2269 #endif
2270
2271 KA_TRACE(
2272 20,
2273 ("__kmp_fork_call: T#%d(%d:%d)->(%d:0) created a team of %d threads\n",
2274 gtid, parent_team->t.t_id, team->t.t_master_tid, team->t.t_id,
2275 team->t.t_nproc));
2276 KMP_DEBUG_ASSERT(team != root->r.r_hot_team ||
2277 (team->t.t_master_tid == 0 &&
2278 (team->t.t_parent == root->r.r_root_team ||
2279 team->t.t_parent->t.t_serialized)));
2280 KMP_MB();
2281
2282 /* now, setup the arguments */
2283 argv = (void **)team->t.t_argv;
2284 if (ap) {
2285 for (i = argc - 1; i >= 0; --i) {
2286 void *new_argv = va_arg(kmp_va_deref(ap), void *);
2287 KMP_CHECK_UPDATE(*argv, new_argv);
2288 argv++;
2289 }
2290 } else {
2291 for (i = 0; i < argc; ++i) {
2292 // Get args from parent team for teams construct
2293 KMP_CHECK_UPDATE(argv[i], team->t.t_parent->t.t_argv[i]);
2294 }
2295 }
2296
2297 /* now actually fork the threads */
2298 KMP_CHECK_UPDATE(team->t.t_master_active, master_active);
2299 if (!root->r.r_active) // Only do assignment if it prevents cache ping-pong
2300 root->r.r_active = TRUE;
2301
2302 __kmp_fork_team_threads(root, team, master_th, gtid, !ap);
2303 __kmp_setup_icv_copy(team, nthreads,
2304 &master_th->th.th_current_task->td_icvs, loc);
2305
2306 #if OMPT_SUPPORT
2307 master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
2308 #endif
2309
2310 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2311
2312 #if USE_ITT_BUILD
2313 if (team->t.t_active_level == 1 // only report frames at level 1
2314 && !master_th->th.th_teams_microtask) { // not in teams construct
2315 #if USE_ITT_NOTIFY
2316 if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
2317 (__kmp_forkjoin_frames_mode == 3 ||
2318 __kmp_forkjoin_frames_mode == 1)) {
2319 kmp_uint64 tmp_time = 0;
2320 if (__itt_get_timestamp_ptr)
2321 tmp_time = __itt_get_timestamp();
2322 // Internal fork - report frame begin
2323 master_th->th.th_frame_time = tmp_time;
2324 if (__kmp_forkjoin_frames_mode == 3)
2325 team->t.t_region_time = tmp_time;
2326 } else
2327 // only one notification scheme (either "submit" or "forking/joined", not both)
2328 #endif /* USE_ITT_NOTIFY */
2329 if ((__itt_frame_begin_v3_ptr || KMP_ITT_DEBUG) &&
2330 __kmp_forkjoin_frames && !__kmp_forkjoin_frames_mode) {
2331 // Mark start of "parallel" region for Intel(R) VTune(TM) analyzer.
2332 __kmp_itt_region_forking(gtid, team->t.t_nproc, 0);
2333 }
2334 }
2335 #endif /* USE_ITT_BUILD */
2336
2337 /* now go on and do the work */
2338 KMP_DEBUG_ASSERT(team == __kmp_threads[gtid]->th.th_team);
2339 KMP_MB();
2340 KF_TRACE(10,
2341 ("__kmp_internal_fork : root=%p, team=%p, master_th=%p, gtid=%d\n",
2342 root, team, master_th, gtid));
2343
2344 #if USE_ITT_BUILD
2345 if (__itt_stack_caller_create_ptr) {
2346 // create new stack stitching id before entering fork barrier
2347 if (!enter_teams) {
2348 KMP_DEBUG_ASSERT(team->t.t_stack_id == NULL);
2349 team->t.t_stack_id = __kmp_itt_stack_caller_create();
2350 } else if (parent_team->t.t_serialized) {
2351 // keep stack stitching id in the serialized parent_team;
2352 // current team will be used for parallel inside the teams;
2353 // if parent_team is active, then it already keeps stack stitching id
2354 // for the league of teams
2355 KMP_DEBUG_ASSERT(parent_team->t.t_stack_id == NULL);
2356 parent_team->t.t_stack_id = __kmp_itt_stack_caller_create();
2357 }
2358 }
2359 #endif /* USE_ITT_BUILD */
2360
2361 // AC: skip __kmp_internal_fork at teams construct, let only primary
2362 // threads execute
2363 if (ap) {
2364 __kmp_internal_fork(loc, gtid, team);
2365 KF_TRACE(10, ("__kmp_internal_fork : after : root=%p, team=%p, "
2366 "master_th=%p, gtid=%d\n",
2367 root, team, master_th, gtid));
2368 }
2369
2370 if (call_context == fork_context_gnu) {
2371 KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
2372 return TRUE;
2373 }
2374
2375 /* Invoke microtask for PRIMARY thread */
2376 KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n", gtid,
2377 team->t.t_id, team->t.t_pkfn));
2378 } // END of timer KMP_fork_call block
2379
2380 #if KMP_STATS_ENABLED
2381 // If beginning a teams construct, then change thread state
2382 stats_state_e previous_state = KMP_GET_THREAD_STATE();
2383 if (!ap) {
2384 KMP_SET_THREAD_STATE(stats_state_e::TEAMS_REGION);
2385 }
2386 #endif
2387
2388 if (!team->t.t_invoke(gtid)) {
2389 KMP_ASSERT2(0, "cannot invoke microtask for PRIMARY thread");
2390 }
2391
2392 #if KMP_STATS_ENABLED
2393 // If was beginning of a teams construct, then reset thread state
2394 if (!ap) {
2395 KMP_SET_THREAD_STATE(previous_state);
2396 }
2397 #endif
2398
2399 KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n", gtid,
2400 team->t.t_id, team->t.t_pkfn));
2401 KMP_MB(); /* Flush all pending memory write invalidates. */
2402
2403 KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
2404 #if OMPT_SUPPORT
2405 if (ompt_enabled.enabled) {
2406 master_th->th.ompt_thread_info.state = ompt_state_overhead;
2407 }
2408 #endif
2409
2410 return TRUE;
2411 }
2412
2413 #if OMPT_SUPPORT
2414 static inline void __kmp_join_restore_state(kmp_info_t *thread,
2415 kmp_team_t *team) {
2416 // restore state outside the region
2417 thread->th.ompt_thread_info.state =
2418 ((team->t.t_serialized) ? ompt_state_work_serial
2419 : ompt_state_work_parallel);
2420 }
2421
2422 static inline void __kmp_join_ompt(int gtid, kmp_info_t *thread,
2423 kmp_team_t *team, ompt_data_t *parallel_data,
2424 int flags, void *codeptr) {
2425 ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2426 if (ompt_enabled.ompt_callback_parallel_end) {
2427 ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
2428 parallel_data, &(task_info->task_data), flags, codeptr);
2429 }
2430
2431 task_info->frame.enter_frame = ompt_data_none;
2432 __kmp_join_restore_state(thread, team);
2433 }
2434 #endif
2435
2436 void __kmp_join_call(ident_t *loc, int gtid
2437 #if OMPT_SUPPORT
2438 ,
2439 enum fork_context_e fork_context
2440 #endif
2441 ,
2442 int exit_teams) {
2443 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_join_call);
2444 kmp_team_t *team;
2445 kmp_team_t *parent_team;
2446 kmp_info_t *master_th;
2447 kmp_root_t *root;
2448 int master_active;
2449
2450 KA_TRACE(20, ("__kmp_join_call: enter T#%d\n", gtid));
2451
2452 /* setup current data */
2453 master_th = __kmp_threads[gtid];
2454 root = master_th->th.th_root;
2455 team = master_th->th.th_team;
2456 parent_team = team->t.t_parent;
2457
2458 master_th->th.th_ident = loc;
2459
2460 #if OMPT_SUPPORT
2461 void *team_microtask = (void *)team->t.t_pkfn;
2462 // For GOMP interface with serialized parallel, need the
2463 // __kmpc_end_serialized_parallel to call hooks for OMPT end-implicit-task
2464 // and end-parallel events.
2465 if (ompt_enabled.enabled &&
2466 !(team->t.t_serialized && fork_context == fork_context_gnu)) {
2467 master_th->th.ompt_thread_info.state = ompt_state_overhead;
2468 }
2469 #endif
2470
2471 #if KMP_DEBUG
2472 if (__kmp_tasking_mode != tskm_immediate_exec && !exit_teams) {
2473 KA_TRACE(20, ("__kmp_join_call: T#%d, old team = %p old task_team = %p, "
2474 "th_task_team = %p\n",
2475 __kmp_gtid_from_thread(master_th), team,
2476 team->t.t_task_team[master_th->th.th_task_state],
2477 master_th->th.th_task_team));
2478 KMP_DEBUG_ASSERT_TASKTEAM_INVARIANT(team, master_th);
2479 }
2480 #endif
2481
2482 if (team->t.t_serialized) {
2483 if (master_th->th.th_teams_microtask) {
2484 // We are in teams construct
2485 int level = team->t.t_level;
2486 int tlevel = master_th->th.th_teams_level;
2487 if (level == tlevel) {
2488 // AC: we haven't incremented it earlier at start of teams construct,
2489 // so do it here - at the end of teams construct
2490 team->t.t_level++;
2491 } else if (level == tlevel + 1) {
2492 // AC: we are exiting parallel inside teams, need to increment
2493 // serialization in order to restore it in the next call to
2494 // __kmpc_end_serialized_parallel
2495 team->t.t_serialized++;
2496 }
2497 }
2498 __kmpc_end_serialized_parallel(loc, gtid);
2499
2500 #if OMPT_SUPPORT
2501 if (ompt_enabled.enabled) {
2502 if (fork_context == fork_context_gnu) {
2503 __ompt_lw_taskteam_unlink(master_th);
2504 }
2505 __kmp_join_restore_state(master_th, parent_team);
2506 }
2507 #endif
2508
2509 return;
2510 }
2511
2512 master_active = team->t.t_master_active;
2513
2514 if (!exit_teams) {
2515 // AC: No barrier for internal teams at exit from teams construct.
2516 // But there is barrier for external team (league).
2517 __kmp_internal_join(loc, gtid, team);
2518 #if USE_ITT_BUILD
2519 if (__itt_stack_caller_create_ptr) {
2520 KMP_DEBUG_ASSERT(team->t.t_stack_id != NULL);
2521 // destroy the stack stitching id after join barrier
2522 __kmp_itt_stack_caller_destroy((__itt_caller)team->t.t_stack_id);
2523 team->t.t_stack_id = NULL;
2524 }
2525 #endif
2526 } else {
2527 master_th->th.th_task_state =
2528 0; // AC: no tasking in teams (out of any parallel)
2529 #if USE_ITT_BUILD
2530 if (__itt_stack_caller_create_ptr && parent_team->t.t_serialized) {
2531 KMP_DEBUG_ASSERT(parent_team->t.t_stack_id != NULL);
2532 // destroy the stack stitching id on exit from the teams construct
2533 // if parent_team is active, then the id will be destroyed later on
2534 // by master of the league of teams
2535 __kmp_itt_stack_caller_destroy((__itt_caller)parent_team->t.t_stack_id);
2536 parent_team->t.t_stack_id = NULL;
2537 }
2538 #endif
2539 }
2540
2541 KMP_MB();
2542
2543 #if OMPT_SUPPORT
2544 ompt_data_t *parallel_data = &(team->t.ompt_team_info.parallel_data);
2545 void *codeptr = team->t.ompt_team_info.master_return_address;
2546 #endif
2547
2548 #if USE_ITT_BUILD
2549 // Mark end of "parallel" region for Intel(R) VTune(TM) analyzer.
2550 if (team->t.t_active_level == 1 &&
2551 (!master_th->th.th_teams_microtask || /* not in teams construct */
2552 master_th->th.th_teams_size.nteams == 1)) {
2553 master_th->th.th_ident = loc;
2554 // only one notification scheme (either "submit" or "forking/joined", not
2555 // both)
2556 if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
2557 __kmp_forkjoin_frames_mode == 3)
2558 __kmp_itt_frame_submit(gtid, team->t.t_region_time,
2559 master_th->th.th_frame_time, 0, loc,
2560 master_th->th.th_team_nproc, 1);
2561 else if ((__itt_frame_end_v3_ptr || KMP_ITT_DEBUG) &&
2562 !__kmp_forkjoin_frames_mode && __kmp_forkjoin_frames)
2563 __kmp_itt_region_joined(gtid);
2564 } // active_level == 1
2565 #endif /* USE_ITT_BUILD */
2566
2567 #if KMP_AFFINITY_SUPPORTED
2568 if (!exit_teams) {
2569 // Restore master thread's partition.
2570 master_th->th.th_first_place = team->t.t_first_place;
2571 master_th->th.th_last_place = team->t.t_last_place;
2572 }
2573 #endif // KMP_AFFINITY_SUPPORTED
2574
2575 if (master_th->th.th_teams_microtask && !exit_teams &&
2576 team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
2577 team->t.t_level == master_th->th.th_teams_level + 1) {
2578 // AC: We need to leave the team structure intact at the end of parallel
2579 // inside the teams construct, so that at the next parallel same (hot) team
2580 // works, only adjust nesting levels
2581 #if OMPT_SUPPORT
2582 ompt_data_t ompt_parallel_data = ompt_data_none;
2583 if (ompt_enabled.enabled) {
2584 ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2585 if (ompt_enabled.ompt_callback_implicit_task) {
2586 int ompt_team_size = team->t.t_nproc;
2587 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
2588 ompt_scope_end, NULL, &(task_info->task_data), ompt_team_size,
2589 OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
2590 }
2591 task_info->frame.exit_frame = ompt_data_none;
2592 task_info->task_data = ompt_data_none;
2593 ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
2594 __ompt_lw_taskteam_unlink(master_th);
2595 }
2596 #endif
2597 /* Decrement our nested depth level */
2598 team->t.t_level--;
2599 team->t.t_active_level--;
2600 KMP_ATOMIC_DEC(&root->r.r_in_parallel);
2601
2602 // Restore number of threads in the team if needed. This code relies on
2603 // the proper adjustment of th_teams_size.nth after the fork in
2604 // __kmp_teams_master on each teams primary thread in the case that
2605 // __kmp_reserve_threads reduced it.
2606 if (master_th->th.th_team_nproc < master_th->th.th_teams_size.nth) {
2607 int old_num = master_th->th.th_team_nproc;
2608 int new_num = master_th->th.th_teams_size.nth;
2609 kmp_info_t **other_threads = team->t.t_threads;
2610 team->t.t_nproc = new_num;
2611 for (int i = 0; i < old_num; ++i) {
2612 other_threads[i]->th.th_team_nproc = new_num;
2613 }
2614 // Adjust states of non-used threads of the team
2615 for (int i = old_num; i < new_num; ++i) {
2616 // Re-initialize thread's barrier data.
2617 KMP_DEBUG_ASSERT(other_threads[i]);
2618 kmp_balign_t *balign = other_threads[i]->th.th_bar;
2619 for (int b = 0; b < bs_last_barrier; ++b) {
2620 balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
2621 KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
2622 #if USE_DEBUGGER
2623 balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
2624 #endif
2625 }
2626 if (__kmp_tasking_mode != tskm_immediate_exec) {
2627 // Synchronize thread's task state
2628 other_threads[i]->th.th_task_state = master_th->th.th_task_state;
2629 }
2630 }
2631 }
2632
2633 #if OMPT_SUPPORT
2634 if (ompt_enabled.enabled) {
2635 __kmp_join_ompt(gtid, master_th, parent_team, &ompt_parallel_data,
2636 OMPT_INVOKER(fork_context) | ompt_parallel_team, codeptr);
2637 }
2638 #endif
2639
2640 return;
2641 }
2642
2643 /* do cleanup and restore the parent team */
2644 master_th->th.th_info.ds.ds_tid = team->t.t_master_tid;
2645 master_th->th.th_local.this_construct = team->t.t_master_this_cons;
2646
2647 master_th->th.th_dispatch = &parent_team->t.t_dispatch[team->t.t_master_tid];
2648
2649 /* jc: The following lock has instructions with REL and ACQ semantics,
2650 separating the parallel user code called in this parallel region
2651 from the serial user code called after this function returns. */
2652 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
2653
2654 if (!master_th->th.th_teams_microtask ||
2655 team->t.t_level > master_th->th.th_teams_level) {
2656 /* Decrement our nested depth level */
2657 KMP_ATOMIC_DEC(&root->r.r_in_parallel);
2658 }
2659 KMP_DEBUG_ASSERT(root->r.r_in_parallel >= 0);
2660
2661 #if OMPT_SUPPORT
2662 if (ompt_enabled.enabled) {
2663 ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2664 if (ompt_enabled.ompt_callback_implicit_task) {
2665 int flags = (team_microtask == (void *)__kmp_teams_master)
2666 ? ompt_task_initial
2667 : ompt_task_implicit;
2668 int ompt_team_size = (flags == ompt_task_initial) ? 0 : team->t.t_nproc;
2669 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
2670 ompt_scope_end, NULL, &(task_info->task_data), ompt_team_size,
2671 OMPT_CUR_TASK_INFO(master_th)->thread_num, flags);
2672 }
2673 task_info->frame.exit_frame = ompt_data_none;
2674 task_info->task_data = ompt_data_none;
2675 }
2676 #endif
2677
2678 KF_TRACE(10, ("__kmp_join_call1: T#%d, this_thread=%p team=%p\n", 0,
2679 master_th, team));
2680 __kmp_pop_current_task_from_thread(master_th);
2681
2682 master_th->th.th_def_allocator = team->t.t_def_allocator;
2683
2684 #if OMPD_SUPPORT
2685 if (ompd_state & OMPD_ENABLE_BP)
2686 ompd_bp_parallel_end();
2687 #endif
2688 updateHWFPControl(team);
2689
2690 if (root->r.r_active != master_active)
2691 root->r.r_active = master_active;
2692
2693 __kmp_free_team(root, team USE_NESTED_HOT_ARG(
2694 master_th)); // this will free worker threads
2695
2696 /* this race was fun to find. make sure the following is in the critical
2697 region otherwise assertions may fail occasionally since the old team may be
2698 reallocated and the hierarchy appears inconsistent. it is actually safe to
2699 run and won't cause any bugs, but will cause those assertion failures. it's
2700 only one deref&assign so might as well put this in the critical region */
2701 master_th->th.th_team = parent_team;
2702 master_th->th.th_team_nproc = parent_team->t.t_nproc;
2703 master_th->th.th_team_master = parent_team->t.t_threads[0];
2704 master_th->th.th_team_serialized = parent_team->t.t_serialized;
2705
2706 /* restore serialized team, if need be */
2707 if (parent_team->t.t_serialized &&
2708 parent_team != master_th->th.th_serial_team &&
2709 parent_team != root->r.r_root_team) {
2710 __kmp_free_team(root,
2711 master_th->th.th_serial_team USE_NESTED_HOT_ARG(NULL));
2712 master_th->th.th_serial_team = parent_team;
2713 }
2714
2715 if (__kmp_tasking_mode != tskm_immediate_exec) {
2716 // Restore primary thread's task state from team structure
2717 KMP_DEBUG_ASSERT(team->t.t_primary_task_state == 0 ||
2718 team->t.t_primary_task_state == 1);
2719 master_th->th.th_task_state = (kmp_uint8)team->t.t_primary_task_state;
2720
2721 // Copy the task team from the parent team to the primary thread
2722 master_th->th.th_task_team =
2723 parent_team->t.t_task_team[master_th->th.th_task_state];
2724 KA_TRACE(20,
2725 ("__kmp_join_call: Primary T#%d restoring task_team %p, team %p\n",
2726 __kmp_gtid_from_thread(master_th), master_th->th.th_task_team,
2727 parent_team));
2728 }
2729
2730 // TODO: GEH - cannot do this assertion because root thread not set up as
2731 // executing
2732 // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 0 );
2733 master_th->th.th_current_task->td_flags.executing = 1;
2734
2735 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2736
2737 #if KMP_AFFINITY_SUPPORTED
2738 if (master_th->th.th_team->t.t_level == 0 && __kmp_affinity.flags.reset) {
2739 __kmp_reset_root_init_mask(gtid);
2740 }
2741 #endif
2742 #if OMPT_SUPPORT
2743 int flags =
2744 OMPT_INVOKER(fork_context) |
2745 ((team_microtask == (void *)__kmp_teams_master) ? ompt_parallel_league
2746 : ompt_parallel_team);
2747 if (ompt_enabled.enabled) {
2748 __kmp_join_ompt(gtid, master_th, parent_team, parallel_data, flags,
2749 codeptr);
2750 }
2751 #endif
2752
2753 KMP_MB();
2754 KA_TRACE(20, ("__kmp_join_call: exit T#%d\n", gtid));
2755 }
2756
2757 /* Check whether we should push an internal control record onto the
2758 serial team stack. If so, do it. */
2759 void __kmp_save_internal_controls(kmp_info_t *thread) {
2760
2761 if (thread->th.th_team != thread->th.th_serial_team) {
2762 return;
2763 }
2764 if (thread->th.th_team->t.t_serialized > 1) {
2765 int push = 0;
2766
2767 if (thread->th.th_team->t.t_control_stack_top == NULL) {
2768 push = 1;
2769 } else {
2770 if (thread->th.th_team->t.t_control_stack_top->serial_nesting_level !=
2771 thread->th.th_team->t.t_serialized) {
2772 push = 1;
2773 }
2774 }
2775 if (push) { /* push a record on the serial team's stack */
2776 kmp_internal_control_t *control =
2777 (kmp_internal_control_t *)__kmp_allocate(
2778 sizeof(kmp_internal_control_t));
2779
2780 copy_icvs(control, &thread->th.th_current_task->td_icvs);
2781
2782 control->serial_nesting_level = thread->th.th_team->t.t_serialized;
2783
2784 control->next = thread->th.th_team->t.t_control_stack_top;
2785 thread->th.th_team->t.t_control_stack_top = control;
2786 }
2787 }
2788 }
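/* Editor's note (illustrative sketch, not upstream code): a record is pushed
   only when the current team is the thread's serial team, more than one
   serialized level is active, and the stack top is not already at this
   nesting level. Assuming the usual user-level entry points, a situation that
   exercises this path looks roughly like:

     #include <omp.h>
     void nested_serialized(void) {
       #pragma omp parallel num_threads(1)     // serialized region
       {
         #pragma omp parallel num_threads(1)   // nested, serialized again
         {
           omp_set_num_threads(8); // ICV change recorded here so it can be
         }                         // undone when the serialized region ends
       }
     }
*/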
2789
2790 /* Changes set_nproc */
2791 void __kmp_set_num_threads(int new_nth, int gtid) {
2792 kmp_info_t *thread;
2793 kmp_root_t *root;
2794
2795 KF_TRACE(10, ("__kmp_set_num_threads: new __kmp_nth = %d\n", new_nth));
2796 KMP_DEBUG_ASSERT(__kmp_init_serial);
2797
2798 if (new_nth < 1)
2799 new_nth = 1;
2800 else if (new_nth > __kmp_max_nth)
2801 new_nth = __kmp_max_nth;
2802
2803 KMP_COUNT_VALUE(OMP_set_numthreads, new_nth);
2804 thread = __kmp_threads[gtid];
2805 if (thread->th.th_current_task->td_icvs.nproc == new_nth)
2806 return; // nothing to do
2807
2808 __kmp_save_internal_controls(thread);
2809
2810 set__nproc(thread, new_nth);
2811
2812 // If this omp_set_num_threads() call will cause the hot team size to be
2813 // reduced (in the absence of a num_threads clause), then reduce it now,
2814 // rather than waiting for the next parallel region.
2815 root = thread->th.th_root;
2816 if (__kmp_init_parallel && (!root->r.r_active) &&
2817 (root->r.r_hot_team->t.t_nproc > new_nth)
2818 #if KMP_NESTED_HOT_TEAMS
2819 && __kmp_hot_teams_max_level && !__kmp_hot_teams_mode
2820 #endif
2821 ) {
2822 kmp_team_t *hot_team = root->r.r_hot_team;
2823 int f;
2824
2825 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
2826
2827 if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
2828 __kmp_resize_dist_barrier(hot_team, hot_team->t.t_nproc, new_nth);
2829 }
2830 // Release the extra threads we don't need any more.
2831 for (f = new_nth; f < hot_team->t.t_nproc; f++) {
2832 KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL);
2833 if (__kmp_tasking_mode != tskm_immediate_exec) {
2834 // When decreasing team size, threads no longer in the team should unref
2835 // task team.
2836 hot_team->t.t_threads[f]->th.th_task_team = NULL;
2837 }
2838 __kmp_free_thread(hot_team->t.t_threads[f]);
2839 hot_team->t.t_threads[f] = NULL;
2840 }
2841 hot_team->t.t_nproc = new_nth;
2842 #if KMP_NESTED_HOT_TEAMS
2843 if (thread->th.th_hot_teams) {
2844 KMP_DEBUG_ASSERT(hot_team == thread->th.th_hot_teams[0].hot_team);
2845 thread->th.th_hot_teams[0].hot_team_nth = new_nth;
2846 }
2847 #endif
2848
2849 if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
2850 hot_team->t.b->update_num_threads(new_nth);
2851 __kmp_add_threads_to_team(hot_team, new_nth);
2852 }
2853
2854 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2855
2856 // Update the t_nproc field in the threads that are still active.
2857 for (f = 0; f < new_nth; f++) {
2858 KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL);
2859 hot_team->t.t_threads[f]->th.th_team_nproc = new_nth;
2860 }
2861 // Special flag in case omp_set_num_threads() call
2862 hot_team->t.t_size_changed = -1;
2863 }
2864 }
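/* Editor's note (illustrative sketch, not upstream code): this routine backs
   the user-level omp_set_num_threads() call (the mapping through the
   Fortran/C entry shims is assumed, not shown in this file). Per the code
   above, the value is clamped to [1, __kmp_max_nth]; if the root is idle and
   the hot team currently holds more threads than requested, the extra threads
   are released immediately rather than at the next fork.

     #include <omp.h>
     int main(void) {
       omp_set_num_threads(2);   // update nproc ICV for subsequent forks
       #pragma omp parallel
       {
         // at most 2 threads here (subject to other limits)
       }
       return 0;
     }
*/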
2865
2866 /* Changes max_active_levels */
2867 void __kmp_set_max_active_levels(int gtid, int max_active_levels) {
2868 kmp_info_t *thread;
2869
2870 KF_TRACE(10, ("__kmp_set_max_active_levels: new max_active_levels for thread "
2871 "%d = (%d)\n",
2872 gtid, max_active_levels));
2873 KMP_DEBUG_ASSERT(__kmp_init_serial);
2874
2875 // validate max_active_levels
2876 if (max_active_levels < 0) {
2877 KMP_WARNING(ActiveLevelsNegative, max_active_levels);
2878 // We ignore this call if the user has specified a negative value.
2879 // The current setting won't be changed. The last valid setting will be
2880 // used. A warning will be issued (if warnings are allowed as controlled by
2881 // the KMP_WARNINGS env var).
2882 KF_TRACE(10, ("__kmp_set_max_active_levels: the call is ignored: new "
2883 "max_active_levels for thread %d = (%d)\n",
2884 gtid, max_active_levels));
2885 return;
2886 }
2887 if (max_active_levels <= KMP_MAX_ACTIVE_LEVELS_LIMIT) {
2888 // it's OK, the max_active_levels is within the valid range: [ 0;
2889 // KMP_MAX_ACTIVE_LEVELS_LIMIT ]
2890 // We allow a zero value. (implementation defined behavior)
2891 } else {
2892 KMP_WARNING(ActiveLevelsExceedLimit, max_active_levels,
2893 KMP_MAX_ACTIVE_LEVELS_LIMIT);
2894 max_active_levels = KMP_MAX_ACTIVE_LEVELS_LIMIT;
2895 // Current upper limit is MAX_INT. (implementation defined behavior)
2896 // If the input exceeds the upper limit, we correct the input to be the
2897 // upper limit. (implementation defined behavior)
2898 // Actually, the flow should never get here until we use MAX_INT limit.
2899 }
2900 KF_TRACE(10, ("__kmp_set_max_active_levels: after validation: new "
2901 "max_active_levels for thread %d = (%d)\n",
2902 gtid, max_active_levels));
2903
2904 thread = __kmp_threads[gtid];
2905
2906 __kmp_save_internal_controls(thread);
2907
2908 set__max_active_levels(thread, max_active_levels);
2909 }
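/* Editor's note (illustrative sketch, not upstream code): this is the setter
   behind omp_set_max_active_levels() (mapping via the entry shims is
   assumed). Per the validation above, negative values are ignored with a
   warning and values above KMP_MAX_ACTIVE_LEVELS_LIMIT are clamped.

     #include <omp.h>
     void enable_one_nested_level(void) {
       omp_set_max_active_levels(2); // two active levels: one nested parallel
       #pragma omp parallel
       #pragma omp parallel          // can still be active with the above
       { }
     }
*/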
2910
2911 /* Gets max_active_levels */
2912 int __kmp_get_max_active_levels(int gtid) {
2913 kmp_info_t *thread;
2914
2915 KF_TRACE(10, ("__kmp_get_max_active_levels: thread %d\n", gtid));
2916 KMP_DEBUG_ASSERT(__kmp_init_serial);
2917
2918 thread = __kmp_threads[gtid];
2919 KMP_DEBUG_ASSERT(thread->th.th_current_task);
2920 KF_TRACE(10, ("__kmp_get_max_active_levels: thread %d, curtask=%p, "
2921 "curtask_maxaclevel=%d\n",
2922 gtid, thread->th.th_current_task,
2923 thread->th.th_current_task->td_icvs.max_active_levels));
2924 return thread->th.th_current_task->td_icvs.max_active_levels;
2925 }
2926
2927 // nteams-var per-device ICV
2928 void __kmp_set_num_teams(int num_teams) {
2929 if (num_teams > 0)
2930 __kmp_nteams = num_teams;
2931 }
2932 int __kmp_get_max_teams(void) { return __kmp_nteams; }
2933 // teams-thread-limit-var per-device ICV
2934 void __kmp_set_teams_thread_limit(int limit) {
2935 if (limit > 0)
2936 __kmp_teams_thread_limit = limit;
2937 }
2938 int __kmp_get_teams_thread_limit(void) { return __kmp_teams_thread_limit; }
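/* Editor's note (illustrative sketch, not upstream code): these routines
   maintain the per-device nteams-var and teams-thread-limit-var ICVs;
   non-positive inputs are silently ignored. They are presumably reached via
   the corresponding OpenMP 5.1 API routines:

     #include <omp.h>
     void configure_teams(void) {
       omp_set_num_teams(4);
       omp_set_teams_thread_limit(8);
       #pragma omp teams
       {
         // up to 4 teams, each limited to 8 threads (subject to other limits)
       }
     }
*/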
2939
2940 KMP_BUILD_ASSERT(sizeof(kmp_sched_t) == sizeof(int));
2941 KMP_BUILD_ASSERT(sizeof(enum sched_type) == sizeof(int));
2942
2943 /* Changes def_sched_var ICV values (run-time schedule kind and chunk) */
2944 void __kmp_set_schedule(int gtid, kmp_sched_t kind, int chunk) {
2945 kmp_info_t *thread;
2946 kmp_sched_t orig_kind;
2947 // kmp_team_t *team;
2948
2949 KF_TRACE(10, ("__kmp_set_schedule: new schedule for thread %d = (%d, %d)\n",
2950 gtid, (int)kind, chunk));
2951 KMP_DEBUG_ASSERT(__kmp_init_serial);
2952
2953 // Check if the kind parameter is valid, correct if needed.
2954 // Valid parameters should fit in one of two intervals - standard or extended:
2955 // <lower>, <valid>, <upper_std>, <lower_ext>, <valid>, <upper>
2956 // 2008-01-25: 0, 1 - 4, 5, 100, 101 - 102, 103
2957 orig_kind = kind;
2958 kind = __kmp_sched_without_mods(kind);
2959
2960 if (kind <= kmp_sched_lower || kind >= kmp_sched_upper ||
2961 (kind <= kmp_sched_lower_ext && kind >= kmp_sched_upper_std)) {
2962 // TODO: Hint needs attention in case we change the default schedule.
2963 __kmp_msg(kmp_ms_warning, KMP_MSG(ScheduleKindOutOfRange, kind),
2964 KMP_HNT(DefaultScheduleKindUsed, "static, no chunk"),
2965 __kmp_msg_null);
2966 kind = kmp_sched_default;
2967 chunk = 0; // ignore chunk value in case of bad kind
2968 }
2969
2970 thread = __kmp_threads[gtid];
2971
2972 __kmp_save_internal_controls(thread);
2973
2974 if (kind < kmp_sched_upper_std) {
2975 if (kind == kmp_sched_static && chunk < KMP_DEFAULT_CHUNK) {
2976       // distinguish static chunked vs. unchunked: the chunk should be invalid
2977       // to indicate an unchunked schedule (which is the default)
2978 thread->th.th_current_task->td_icvs.sched.r_sched_type = kmp_sch_static;
2979 } else {
2980 thread->th.th_current_task->td_icvs.sched.r_sched_type =
2981 __kmp_sch_map[kind - kmp_sched_lower - 1];
2982 }
2983 } else {
2984 // __kmp_sch_map[ kind - kmp_sched_lower_ext + kmp_sched_upper_std -
2985 // kmp_sched_lower - 2 ];
2986 thread->th.th_current_task->td_icvs.sched.r_sched_type =
2987 __kmp_sch_map[kind - kmp_sched_lower_ext + kmp_sched_upper_std -
2988 kmp_sched_lower - 2];
2989 }
2990 __kmp_sched_apply_mods_intkind(
2991 orig_kind, &(thread->th.th_current_task->td_icvs.sched.r_sched_type));
2992 if (kind == kmp_sched_auto || chunk < 1) {
2993 // ignore parameter chunk for schedule auto
2994 thread->th.th_current_task->td_icvs.sched.chunk = KMP_DEFAULT_CHUNK;
2995 } else {
2996 thread->th.th_current_task->td_icvs.sched.chunk = chunk;
2997 }
2998 }
2999
3000 /* Gets def_sched_var ICV values */
3001 void __kmp_get_schedule(int gtid, kmp_sched_t *kind, int *chunk) {
3002 kmp_info_t *thread;
3003 enum sched_type th_type;
3004
3005 KF_TRACE(10, ("__kmp_get_schedule: thread %d\n", gtid));
3006 KMP_DEBUG_ASSERT(__kmp_init_serial);
3007
3008 thread = __kmp_threads[gtid];
3009
3010 th_type = thread->th.th_current_task->td_icvs.sched.r_sched_type;
3011 switch (SCHEDULE_WITHOUT_MODIFIERS(th_type)) {
3012 case kmp_sch_static:
3013 case kmp_sch_static_greedy:
3014 case kmp_sch_static_balanced:
3015 *kind = kmp_sched_static;
3016 __kmp_sched_apply_mods_stdkind(kind, th_type);
3017 *chunk = 0; // chunk was not set, try to show this fact via zero value
3018 return;
3019 case kmp_sch_static_chunked:
3020 *kind = kmp_sched_static;
3021 break;
3022 case kmp_sch_dynamic_chunked:
3023 *kind = kmp_sched_dynamic;
3024 break;
3025 case kmp_sch_guided_chunked:
3026 case kmp_sch_guided_iterative_chunked:
3027 case kmp_sch_guided_analytical_chunked:
3028 *kind = kmp_sched_guided;
3029 break;
3030 case kmp_sch_auto:
3031 *kind = kmp_sched_auto;
3032 break;
3033 case kmp_sch_trapezoidal:
3034 *kind = kmp_sched_trapezoidal;
3035 break;
3036 #if KMP_STATIC_STEAL_ENABLED
3037 case kmp_sch_static_steal:
3038 *kind = kmp_sched_static_steal;
3039 break;
3040 #endif
3041 default:
3042 KMP_FATAL(UnknownSchedulingType, th_type);
3043 }
3044
3045 __kmp_sched_apply_mods_stdkind(kind, th_type);
3046 *chunk = thread->th.th_current_task->td_icvs.sched.chunk;
3047 }
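// Illustrative round trip through the two routines above, as seen from user
// code (assumption: omp_set_schedule()/omp_get_schedule() map 1:1 onto the
// standard kmp_sched_t kinds):
//   omp_set_schedule(omp_sched_guided, 8);
//   omp_sched_t k; int c;
//   omp_get_schedule(&k, &c); // expected: k == omp_sched_guided, c == 8
// A chunk of 0 reported by __kmp_get_schedule means "static, unchunked".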
3048
3049 int __kmp_get_ancestor_thread_num(int gtid, int level) {
3050
3051 int ii, dd;
3052 kmp_team_t *team;
3053 kmp_info_t *thr;
3054
3055 KF_TRACE(10, ("__kmp_get_ancestor_thread_num: thread %d %d\n", gtid, level));
3056 KMP_DEBUG_ASSERT(__kmp_init_serial);
3057
3058 // validate level
3059 if (level == 0)
3060 return 0;
3061 if (level < 0)
3062 return -1;
3063 thr = __kmp_threads[gtid];
3064 team = thr->th.th_team;
3065 ii = team->t.t_level;
3066 if (level > ii)
3067 return -1;
3068
3069 if (thr->th.th_teams_microtask) {
3070     // AC: we are in a teams region where multiple nested teams share a level
3071 int tlevel = thr->th.th_teams_level; // the level of the teams construct
3072 if (level <=
3073 tlevel) { // otherwise usual algorithm works (will not touch the teams)
3074 KMP_DEBUG_ASSERT(ii >= tlevel);
3075 // AC: As we need to pass by the teams league, we need to artificially
3076 // increase ii
3077 if (ii == tlevel) {
3078 ii += 2; // three teams have same level
3079 } else {
3080 ii++; // two teams have same level
3081 }
3082 }
3083 }
3084
3085 if (ii == level)
3086 return __kmp_tid_from_gtid(gtid);
3087
3088 dd = team->t.t_serialized;
3089 level++;
3090 while (ii > level) {
3091 for (dd = team->t.t_serialized; (dd > 0) && (ii > level); dd--, ii--) {
3092 }
3093 if ((team->t.t_serialized) && (!dd)) {
3094 team = team->t.t_parent;
3095 continue;
3096 }
3097 if (ii > level) {
3098 team = team->t.t_parent;
3099 dd = team->t.t_serialized;
3100 ii--;
3101 }
3102 }
3103
3104 return (dd > 1) ? (0) : (team->t.t_master_tid);
3105 }
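// Example (illustrative; this routine is assumed to be reached via
// omp_get_ancestor_thread_num()): with two active nested parallel regions, a
// worker at nesting level 2 asking for level 1 gets the thread id its ancestor
// had in the level-1 team; level equal to the current level returns the
// caller's own tid, level 0 always returns 0, and out-of-range levels return
// -1, matching the early-exit checks above.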
3106
3107 int __kmp_get_team_size(int gtid, int level) {
3108
3109 int ii, dd;
3110 kmp_team_t *team;
3111 kmp_info_t *thr;
3112
3113 KF_TRACE(10, ("__kmp_get_team_size: thread %d %d\n", gtid, level));
3114 KMP_DEBUG_ASSERT(__kmp_init_serial);
3115
3116 // validate level
3117 if (level == 0)
3118 return 1;
3119 if (level < 0)
3120 return -1;
3121 thr = __kmp_threads[gtid];
3122 team = thr->th.th_team;
3123 ii = team->t.t_level;
3124 if (level > ii)
3125 return -1;
3126
3127 if (thr->th.th_teams_microtask) {
3128     // AC: we are in a teams region where multiple nested teams share a level
3129 int tlevel = thr->th.th_teams_level; // the level of the teams construct
3130 if (level <=
3131 tlevel) { // otherwise usual algorithm works (will not touch the teams)
3132 KMP_DEBUG_ASSERT(ii >= tlevel);
3133 // AC: As we need to pass by the teams league, we need to artificially
3134 // increase ii
3135 if (ii == tlevel) {
3136 ii += 2; // three teams have same level
3137 } else {
3138 ii++; // two teams have same level
3139 }
3140 }
3141 }
3142
3143 while (ii > level) {
3144 for (dd = team->t.t_serialized; (dd > 0) && (ii > level); dd--, ii--) {
3145 }
3146 if (team->t.t_serialized && (!dd)) {
3147 team = team->t.t_parent;
3148 continue;
3149 }
3150 if (ii > level) {
3151 team = team->t.t_parent;
3152 ii--;
3153 }
3154 }
3155
3156 return team->t.t_nproc;
3157 }
3158
3159 kmp_r_sched_t __kmp_get_schedule_global() {
3160   // This routine was created because the pairs (__kmp_sched, __kmp_chunk) and
3161   // (__kmp_static, __kmp_guided) may be changed by kmp_set_defaults
3162   // independently, so the updated schedule can be obtained here.
3163
3164 kmp_r_sched_t r_sched;
3165
3166   // Create the schedule from 4 globals: __kmp_sched, __kmp_chunk, __kmp_static,
3167   // __kmp_guided. __kmp_sched should keep its original value, so that the user
3168   // can set KMP_SCHEDULE multiple times and thus have different run-time
3169   // schedules in different roots (even in OMP 2.5)
3170 enum sched_type s = SCHEDULE_WITHOUT_MODIFIERS(__kmp_sched);
3171 enum sched_type sched_modifiers = SCHEDULE_GET_MODIFIERS(__kmp_sched);
3172 if (s == kmp_sch_static) {
3173 // replace STATIC with more detailed schedule (balanced or greedy)
3174 r_sched.r_sched_type = __kmp_static;
3175 } else if (s == kmp_sch_guided_chunked) {
3176 // replace GUIDED with more detailed schedule (iterative or analytical)
3177 r_sched.r_sched_type = __kmp_guided;
3178 } else { // (STATIC_CHUNKED), or (DYNAMIC_CHUNKED), or other
3179 r_sched.r_sched_type = __kmp_sched;
3180 }
3181 SCHEDULE_SET_MODIFIERS(r_sched.r_sched_type, sched_modifiers);
3182
3183 if (__kmp_chunk < KMP_DEFAULT_CHUNK) {
3184 // __kmp_chunk may be wrong here (if it was not ever set)
3185 r_sched.chunk = KMP_DEFAULT_CHUNK;
3186 } else {
3187 r_sched.chunk = __kmp_chunk;
3188 }
3189
3190 return r_sched;
3191 }
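// Worked example (illustrative): with OMP_SCHEDULE="guided,4" the settings
// code is assumed to leave __kmp_sched == kmp_sch_guided_chunked and
// __kmp_chunk == 4; this routine then returns { __kmp_guided, 4 }, i.e. the
// currently selected detailed guided variant with the user's chunk size.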
3192
3193 /* Allocate (realloc == FALSE) or reallocate (realloc == TRUE)
3194    at least argc *t_argv entries for the requested team. */
3195 static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team, int realloc) {
3196
3197 KMP_DEBUG_ASSERT(team);
3198 if (!realloc || argc > team->t.t_max_argc) {
3199
3200 KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: needed entries=%d, "
3201 "current entries=%d\n",
3202 team->t.t_id, argc, (realloc) ? team->t.t_max_argc : 0));
3203 /* if previously allocated heap space for args, free them */
3204 if (realloc && team->t.t_argv != &team->t.t_inline_argv[0])
3205 __kmp_free((void *)team->t.t_argv);
3206
3207 if (argc <= KMP_INLINE_ARGV_ENTRIES) {
3208 /* use unused space in the cache line for arguments */
3209 team->t.t_max_argc = KMP_INLINE_ARGV_ENTRIES;
3210 KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: inline allocate %d "
3211 "argv entries\n",
3212 team->t.t_id, team->t.t_max_argc));
3213 team->t.t_argv = &team->t.t_inline_argv[0];
3214 if (__kmp_storage_map) {
3215 __kmp_print_storage_map_gtid(
3216 -1, &team->t.t_inline_argv[0],
3217 &team->t.t_inline_argv[KMP_INLINE_ARGV_ENTRIES],
3218 (sizeof(void *) * KMP_INLINE_ARGV_ENTRIES), "team_%d.t_inline_argv",
3219 team->t.t_id);
3220 }
3221 } else {
3222 /* allocate space for arguments in the heap */
3223 team->t.t_max_argc = (argc <= (KMP_MIN_MALLOC_ARGV_ENTRIES >> 1))
3224 ? KMP_MIN_MALLOC_ARGV_ENTRIES
3225 : 2 * argc;
3226 KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: dynamic allocate %d "
3227 "argv entries\n",
3228 team->t.t_id, team->t.t_max_argc));
3229 team->t.t_argv =
3230 (void **)__kmp_page_allocate(sizeof(void *) * team->t.t_max_argc);
3231 if (__kmp_storage_map) {
3232 __kmp_print_storage_map_gtid(-1, &team->t.t_argv[0],
3233 &team->t.t_argv[team->t.t_max_argc],
3234 sizeof(void *) * team->t.t_max_argc,
3235 "team_%d.t_argv", team->t.t_id);
3236 }
3237 }
3238 }
3239 }
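// Growth policy sketch: if argc fits in the KMP_INLINE_ARGV_ENTRIES slots
// carved out of the team structure, no heap allocation happens at all;
// otherwise the heap buffer is sized to KMP_MIN_MALLOC_ARGV_ENTRIES for small
// requests and to 2 * argc otherwise, so repeated regions with slowly growing
// argc do not reallocate on every call.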
3240
3241 static void __kmp_allocate_team_arrays(kmp_team_t *team, int max_nth) {
3242 int i;
3243 int num_disp_buff = max_nth > 1 ? __kmp_dispatch_num_buffers : 2;
3244 team->t.t_threads =
3245 (kmp_info_t **)__kmp_allocate(sizeof(kmp_info_t *) * max_nth);
3246 team->t.t_disp_buffer = (dispatch_shared_info_t *)__kmp_allocate(
3247 sizeof(dispatch_shared_info_t) * num_disp_buff);
3248 team->t.t_dispatch =
3249 (kmp_disp_t *)__kmp_allocate(sizeof(kmp_disp_t) * max_nth);
3250 team->t.t_implicit_task_taskdata =
3251 (kmp_taskdata_t *)__kmp_allocate(sizeof(kmp_taskdata_t) * max_nth);
3252 team->t.t_max_nproc = max_nth;
3253
3254 /* setup dispatch buffers */
3255 for (i = 0; i < num_disp_buff; ++i) {
3256 team->t.t_disp_buffer[i].buffer_index = i;
3257 team->t.t_disp_buffer[i].doacross_buf_idx = i;
3258 }
3259 }
3260
3261 static void __kmp_free_team_arrays(kmp_team_t *team) {
3262 /* Note: this does not free the threads in t_threads (__kmp_free_threads) */
3263 int i;
3264 for (i = 0; i < team->t.t_max_nproc; ++i) {
3265 if (team->t.t_dispatch[i].th_disp_buffer != NULL) {
3266 __kmp_free(team->t.t_dispatch[i].th_disp_buffer);
3267 team->t.t_dispatch[i].th_disp_buffer = NULL;
3268 }
3269 }
3270 #if KMP_USE_HIER_SCHED
3271 __kmp_dispatch_free_hierarchies(team);
3272 #endif
3273 __kmp_free(team->t.t_threads);
3274 __kmp_free(team->t.t_disp_buffer);
3275 __kmp_free(team->t.t_dispatch);
3276 __kmp_free(team->t.t_implicit_task_taskdata);
3277 team->t.t_threads = NULL;
3278 team->t.t_disp_buffer = NULL;
3279 team->t.t_dispatch = NULL;
3280 team->t.t_implicit_task_taskdata = 0;
3281 }
3282
3283 static void __kmp_reallocate_team_arrays(kmp_team_t *team, int max_nth) {
3284 kmp_info_t **oldThreads = team->t.t_threads;
3285
3286 __kmp_free(team->t.t_disp_buffer);
3287 __kmp_free(team->t.t_dispatch);
3288 __kmp_free(team->t.t_implicit_task_taskdata);
3289 __kmp_allocate_team_arrays(team, max_nth);
3290
3291 KMP_MEMCPY(team->t.t_threads, oldThreads,
3292 team->t.t_nproc * sizeof(kmp_info_t *));
3293
3294 __kmp_free(oldThreads);
3295 }
3296
3297 static kmp_internal_control_t __kmp_get_global_icvs(void) {
3298
3299 kmp_r_sched_t r_sched =
3300 __kmp_get_schedule_global(); // get current state of scheduling globals
3301
3302 KMP_DEBUG_ASSERT(__kmp_nested_proc_bind.used > 0);
3303
3304 kmp_internal_control_t g_icvs = {
3305 0, // int serial_nesting_level; //corresponds to value of th_team_serialized
3306 (kmp_int8)__kmp_global.g.g_dynamic, // internal control for dynamic
3307 // adjustment of threads (per thread)
3308 (kmp_int8)__kmp_env_blocktime, // int bt_set; //internal control for
3309 // whether blocktime is explicitly set
3310 __kmp_dflt_blocktime, // int blocktime; //internal control for blocktime
3311 #if KMP_USE_MONITOR
3312 __kmp_bt_intervals, // int bt_intervals; //internal control for blocktime
3313 // intervals
3314 #endif
3315 __kmp_dflt_team_nth, // int nproc; //internal control for # of threads for
3316 // next parallel region (per thread)
3317 // (use a max ub on value if __kmp_parallel_initialize not called yet)
3318 __kmp_cg_max_nth, // int thread_limit;
3319 __kmp_task_max_nth, // int task_thread_limit; // to set the thread_limit
3320 // on task. This is used in the case of target thread_limit
3321 __kmp_dflt_max_active_levels, // int max_active_levels; //internal control
3322 // for max_active_levels
3323 r_sched, // kmp_r_sched_t sched; //internal control for runtime schedule
3324 // {sched,chunk} pair
3325 __kmp_nested_proc_bind.bind_types[0],
3326 __kmp_default_device,
3327 NULL // struct kmp_internal_control *next;
3328 };
3329
3330 return g_icvs;
3331 }
3332
3333 static kmp_internal_control_t __kmp_get_x_global_icvs(const kmp_team_t *team) {
3334
3335 kmp_internal_control_t gx_icvs;
3336 gx_icvs.serial_nesting_level =
3337 0; // probably =team->t.t_serial like in save_inter_controls
3338 copy_icvs(&gx_icvs, &team->t.t_threads[0]->th.th_current_task->td_icvs);
3339 gx_icvs.next = NULL;
3340
3341 return gx_icvs;
3342 }
3343
3344 static void __kmp_initialize_root(kmp_root_t *root) {
3345 int f;
3346 kmp_team_t *root_team;
3347 kmp_team_t *hot_team;
3348 int hot_team_max_nth;
3349 kmp_r_sched_t r_sched =
3350 __kmp_get_schedule_global(); // get current state of scheduling globals
3351 kmp_internal_control_t r_icvs = __kmp_get_global_icvs();
3352 KMP_DEBUG_ASSERT(root);
3353 KMP_ASSERT(!root->r.r_begin);
3354
3355 /* setup the root state structure */
3356 __kmp_init_lock(&root->r.r_begin_lock);
3357 root->r.r_begin = FALSE;
3358 root->r.r_active = FALSE;
3359 root->r.r_in_parallel = 0;
3360 root->r.r_blocktime = __kmp_dflt_blocktime;
3361 #if KMP_AFFINITY_SUPPORTED
3362 root->r.r_affinity_assigned = FALSE;
3363 #endif
3364
3365 /* setup the root team for this task */
3366 /* allocate the root team structure */
3367 KF_TRACE(10, ("__kmp_initialize_root: before root_team\n"));
3368
3369 root_team =
3370 __kmp_allocate_team(root,
3371 1, // new_nproc
3372 1, // max_nproc
3373 #if OMPT_SUPPORT
3374 ompt_data_none, // root parallel id
3375 #endif
3376 __kmp_nested_proc_bind.bind_types[0], &r_icvs,
3377 0 // argc
3378 USE_NESTED_HOT_ARG(NULL) // primary thread is unknown
3379 );
3380 #if USE_DEBUGGER
3381 // Non-NULL value should be assigned to make the debugger display the root
3382 // team.
3383 TCW_SYNC_PTR(root_team->t.t_pkfn, (microtask_t)(~0));
3384 #endif
3385
3386 KF_TRACE(10, ("__kmp_initialize_root: after root_team = %p\n", root_team));
3387
3388 root->r.r_root_team = root_team;
3389 root_team->t.t_control_stack_top = NULL;
3390
3391 /* initialize root team */
3392 root_team->t.t_threads[0] = NULL;
3393 root_team->t.t_nproc = 1;
3394 root_team->t.t_serialized = 1;
3395 // TODO???: root_team->t.t_max_active_levels = __kmp_dflt_max_active_levels;
3396 root_team->t.t_sched.sched = r_sched.sched;
3397 root_team->t.t_nested_nth = &__kmp_nested_nth;
3398 KA_TRACE(
3399 20,
3400 ("__kmp_initialize_root: init root team %d arrived: join=%u, plain=%u\n",
3401 root_team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
3402
3403 /* setup the hot team for this task */
3404 /* allocate the hot team structure */
3405 KF_TRACE(10, ("__kmp_initialize_root: before hot_team\n"));
3406
3407 hot_team =
3408 __kmp_allocate_team(root,
3409 1, // new_nproc
3410 __kmp_dflt_team_nth_ub * 2, // max_nproc
3411 #if OMPT_SUPPORT
3412 ompt_data_none, // root parallel id
3413 #endif
3414 __kmp_nested_proc_bind.bind_types[0], &r_icvs,
3415 0 // argc
3416 USE_NESTED_HOT_ARG(NULL) // primary thread is unknown
3417 );
3418 KF_TRACE(10, ("__kmp_initialize_root: after hot_team = %p\n", hot_team));
3419
3420 root->r.r_hot_team = hot_team;
3421 root_team->t.t_control_stack_top = NULL;
3422
3423 /* first-time initialization */
3424 hot_team->t.t_parent = root_team;
3425
3426 /* initialize hot team */
3427 hot_team_max_nth = hot_team->t.t_max_nproc;
3428 for (f = 0; f < hot_team_max_nth; ++f) {
3429 hot_team->t.t_threads[f] = NULL;
3430 }
3431 hot_team->t.t_nproc = 1;
3432 // TODO???: hot_team->t.t_max_active_levels = __kmp_dflt_max_active_levels;
3433 hot_team->t.t_sched.sched = r_sched.sched;
3434 hot_team->t.t_size_changed = 0;
3435 hot_team->t.t_nested_nth = &__kmp_nested_nth;
3436 }
3437
3438 #ifdef KMP_DEBUG
3439
3440 typedef struct kmp_team_list_item {
3441 kmp_team_p const *entry;
3442 struct kmp_team_list_item *next;
3443 } kmp_team_list_item_t;
3444 typedef kmp_team_list_item_t *kmp_team_list_t;
3445
3446 static void __kmp_print_structure_team_accum( // Add team to list of teams.
3447 kmp_team_list_t list, // List of teams.
3448 kmp_team_p const *team // Team to add.
3449 ) {
3450
3451 // List must terminate with item where both entry and next are NULL.
3452 // Team is added to the list only once.
3453 // List is sorted in ascending order by team id.
3454 // Team id is *not* a key.
3455
3456 kmp_team_list_t l;
3457
3458 KMP_DEBUG_ASSERT(list != NULL);
3459 if (team == NULL) {
3460 return;
3461 }
3462
3463 __kmp_print_structure_team_accum(list, team->t.t_parent);
3464 __kmp_print_structure_team_accum(list, team->t.t_next_pool);
3465
3466 // Search list for the team.
3467 l = list;
3468 while (l->next != NULL && l->entry != team) {
3469 l = l->next;
3470 }
3471 if (l->next != NULL) {
3472 return; // Team has been added before, exit.
3473 }
3474
3475 // Team is not found. Search list again for insertion point.
3476 l = list;
3477 while (l->next != NULL && l->entry->t.t_id <= team->t.t_id) {
3478 l = l->next;
3479 }
3480
3481 // Insert team.
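  // Insert before node 'l' without tracking a predecessor: copy 'l' into a
  // freshly malloc'ed node, then overwrite 'l' in place with the new entry and
  // link it to the copy (classic singly-linked-list insert-before trick).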
3482 {
3483 kmp_team_list_item_t *item = (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC(
3484 sizeof(kmp_team_list_item_t));
3485 *item = *l;
3486 l->entry = team;
3487 l->next = item;
3488 }
3489 }
3490
3491 static void __kmp_print_structure_team(char const *title,
3492                                        kmp_team_p const *team) {
3494 __kmp_printf("%s", title);
3495 if (team != NULL) {
3496 __kmp_printf("%2x %p\n", team->t.t_id, team);
3497 } else {
3498 __kmp_printf(" - (nil)\n");
3499 }
3500 }
3501
3502 static void __kmp_print_structure_thread(char const *title,
3503 kmp_info_p const *thread) {
3504 __kmp_printf("%s", title);
3505 if (thread != NULL) {
3506 __kmp_printf("%2d %p\n", thread->th.th_info.ds.ds_gtid, thread);
3507 } else {
3508 __kmp_printf(" - (nil)\n");
3509 }
3510 }
3511
3512 void __kmp_print_structure(void) {
3513
3514 kmp_team_list_t list;
3515
3516 // Initialize list of teams.
3517 list =
3518 (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC(sizeof(kmp_team_list_item_t));
3519 list->entry = NULL;
3520 list->next = NULL;
3521
3522 __kmp_printf("\n------------------------------\nGlobal Thread "
3523 "Table\n------------------------------\n");
3524 {
3525 int gtid;
3526 for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3527 __kmp_printf("%2d", gtid);
3528 if (__kmp_threads != NULL) {
3529 __kmp_printf(" %p", __kmp_threads[gtid]);
3530 }
3531 if (__kmp_root != NULL) {
3532 __kmp_printf(" %p", __kmp_root[gtid]);
3533 }
3534 __kmp_printf("\n");
3535 }
3536 }
3537
3538 // Print out __kmp_threads array.
3539 __kmp_printf("\n------------------------------\nThreads\n--------------------"
3540 "----------\n");
3541 if (__kmp_threads != NULL) {
3542 int gtid;
3543 for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3544 kmp_info_t const *thread = __kmp_threads[gtid];
3545 if (thread != NULL) {
3546 __kmp_printf("GTID %2d %p:\n", gtid, thread);
3547 __kmp_printf(" Our Root: %p\n", thread->th.th_root);
3548 __kmp_print_structure_team(" Our Team: ", thread->th.th_team);
3549 __kmp_print_structure_team(" Serial Team: ",
3550 thread->th.th_serial_team);
3551 __kmp_printf(" Threads: %2d\n", thread->th.th_team_nproc);
3552 __kmp_print_structure_thread(" Primary: ",
3553 thread->th.th_team_master);
3554 __kmp_printf(" Serialized?: %2d\n", thread->th.th_team_serialized);
3555 __kmp_printf(" Set NProc: %2d\n", thread->th.th_set_nproc);
3556 __kmp_printf(" Set Proc Bind: %2d\n", thread->th.th_set_proc_bind);
3557 __kmp_print_structure_thread(" Next in pool: ",
3558 thread->th.th_next_pool);
3559 __kmp_printf("\n");
3560 __kmp_print_structure_team_accum(list, thread->th.th_team);
3561 __kmp_print_structure_team_accum(list, thread->th.th_serial_team);
3562 }
3563 }
3564 } else {
3565 __kmp_printf("Threads array is not allocated.\n");
3566 }
3567
3568 // Print out __kmp_root array.
3569 __kmp_printf("\n------------------------------\nUbers\n----------------------"
3570 "--------\n");
3571 if (__kmp_root != NULL) {
3572 int gtid;
3573 for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3574 kmp_root_t const *root = __kmp_root[gtid];
3575 if (root != NULL) {
3576 __kmp_printf("GTID %2d %p:\n", gtid, root);
3577 __kmp_print_structure_team(" Root Team: ", root->r.r_root_team);
3578 __kmp_print_structure_team(" Hot Team: ", root->r.r_hot_team);
3579 __kmp_print_structure_thread(" Uber Thread: ",
3580 root->r.r_uber_thread);
3581 __kmp_printf(" Active?: %2d\n", root->r.r_active);
3582 __kmp_printf(" In Parallel: %2d\n",
3583 KMP_ATOMIC_LD_RLX(&root->r.r_in_parallel));
3584 __kmp_printf("\n");
3585 __kmp_print_structure_team_accum(list, root->r.r_root_team);
3586 __kmp_print_structure_team_accum(list, root->r.r_hot_team);
3587 }
3588 }
3589 } else {
3590 __kmp_printf("Ubers array is not allocated.\n");
3591 }
3592
3593 __kmp_printf("\n------------------------------\nTeams\n----------------------"
3594 "--------\n");
3595 while (list->next != NULL) {
3596 kmp_team_p const *team = list->entry;
3597 int i;
3598 __kmp_printf("Team %2x %p:\n", team->t.t_id, team);
3599 __kmp_print_structure_team(" Parent Team: ", team->t.t_parent);
3600 __kmp_printf(" Primary TID: %2d\n", team->t.t_master_tid);
3601 __kmp_printf(" Max threads: %2d\n", team->t.t_max_nproc);
3602 __kmp_printf(" Levels of serial: %2d\n", team->t.t_serialized);
3603 __kmp_printf(" Number threads: %2d\n", team->t.t_nproc);
3604 for (i = 0; i < team->t.t_nproc; ++i) {
3605 __kmp_printf(" Thread %2d: ", i);
3606 __kmp_print_structure_thread("", team->t.t_threads[i]);
3607 }
3608 __kmp_print_structure_team(" Next in pool: ", team->t.t_next_pool);
3609 __kmp_printf("\n");
3610 list = list->next;
3611 }
3612
3613 // Print out __kmp_thread_pool and __kmp_team_pool.
3614 __kmp_printf("\n------------------------------\nPools\n----------------------"
3615 "--------\n");
3616 __kmp_print_structure_thread("Thread pool: ",
3617 CCAST(kmp_info_t *, __kmp_thread_pool));
3618 __kmp_print_structure_team("Team pool: ",
3619 CCAST(kmp_team_t *, __kmp_team_pool));
3620 __kmp_printf("\n");
3621
3622 // Free team list.
3623 while (list != NULL) {
3624 kmp_team_list_item_t *item = list;
3625 list = list->next;
3626 KMP_INTERNAL_FREE(item);
3627 }
3628 }
3629
3630 #endif
3631
3632 //---------------------------------------------------------------------------
3633 // Stuff for per-thread fast random number generator
3634 // Table of primes
3635 static const unsigned __kmp_primes[] = {
3636 0x9e3779b1, 0xffe6cc59, 0x2109f6dd, 0x43977ab5, 0xba5703f5, 0xb495a877,
3637 0xe1626741, 0x79695e6b, 0xbc98c09f, 0xd5bee2b3, 0x287488f9, 0x3af18231,
3638 0x9677cd4d, 0xbe3a6929, 0xadc6a877, 0xdcf0674b, 0xbe4d6fe9, 0x5f15e201,
3639 0x99afc3fd, 0xf3f16801, 0xe222cfff, 0x24ba5fdb, 0x0620452d, 0x79f149e3,
3640 0xc8b93f49, 0x972702cd, 0xb07dd827, 0x6c97d5ed, 0x085a3d61, 0x46eb5ea7,
3641 0x3d9910ed, 0x2e687b5b, 0x29609227, 0x6eb081f1, 0x0954c4e1, 0x9d114db9,
3642 0x542acfa9, 0xb3e6bd7b, 0x0742d917, 0xe9f3ffa7, 0x54581edb, 0xf2480f45,
3643 0x0bb9288f, 0xef1affc7, 0x85fa0ca7, 0x3ccc14db, 0xe6baf34b, 0x343377f7,
3644 0x5ca19031, 0xe6d9293b, 0xf0a9f391, 0x5d2e980b, 0xfc411073, 0xc3749363,
3645 0xb892d829, 0x3549366b, 0x629750ad, 0xb98294e5, 0x892d9483, 0xc235baf3,
3646 0x3d2402a3, 0x6bdef3c9, 0xbec333cd, 0x40c9520f};
3647
3648 //---------------------------------------------------------------------------
3649 // __kmp_get_random: Get a random number using a linear congruential method.
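// The recurrence is x_{n+1} = a * x_n + 1 (mod 2^32), with the per-thread
// multiplier 'a' drawn from the prime table above; only the upper 16 bits of
// the state are returned because the low-order bits of an LCG have short
// periods.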
3650 unsigned short __kmp_get_random(kmp_info_t *thread) {
3651 unsigned x = thread->th.th_x;
3652 unsigned short r = (unsigned short)(x >> 16);
3653
3654 thread->th.th_x = x * thread->th.th_a + 1;
3655
3656 KA_TRACE(30, ("__kmp_get_random: THREAD: %d, RETURN: %u\n",
3657 thread->th.th_info.ds.ds_tid, r));
3658
3659 return r;
3660 }
3661 //--------------------------------------------------------
3662 // __kmp_init_random: Initialize a random number generator
3663 void __kmp_init_random(kmp_info_t *thread) {
3664 unsigned seed = thread->th.th_info.ds.ds_tid;
3665
3666 thread->th.th_a =
3667 __kmp_primes[seed % (sizeof(__kmp_primes) / sizeof(__kmp_primes[0]))];
3668 thread->th.th_x = (seed + 1) * thread->th.th_a + 1;
3669 KA_TRACE(30,
3670 ("__kmp_init_random: THREAD: %u; A: %u\n", seed, thread->th.th_a));
3671 }
3672
3673 #if KMP_OS_WINDOWS
3674 /* reclaim array entries for root threads that are already dead, returns number
3675 * reclaimed */
3676 static int __kmp_reclaim_dead_roots(void) {
3677 int i, r = 0;
3678
3679 for (i = 0; i < __kmp_threads_capacity; ++i) {
3680 if (KMP_UBER_GTID(i) &&
3681 !__kmp_still_running((kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[i])) &&
3682 !__kmp_root[i]
3683 ->r.r_active) { // AC: reclaim only roots died in non-active state
3684 r += __kmp_unregister_root_other_thread(i);
3685 }
3686 }
3687 return r;
3688 }
3689 #endif
3690
3691 /* This function attempts to create free entries in __kmp_threads and
3692 __kmp_root, and returns the number of free entries generated.
3693
3694 For Windows* OS static library, the first mechanism used is to reclaim array
3695 entries for root threads that are already dead.
3696
3697 On all platforms, expansion is attempted on the arrays __kmp_threads_ and
3698 __kmp_root, with appropriate update to __kmp_threads_capacity. Array
3699 capacity is increased by doubling with clipping to __kmp_tp_capacity, if
3700 threadprivate cache array has been created. Synchronization with
3701 __kmpc_threadprivate_cached is done using __kmp_tp_cached_lock.
3702
3703 After any dead root reclamation, if the clipping value allows array expansion
3704 to result in the generation of a total of nNeed free slots, the function does
3705 that expansion. If not, nothing is done beyond the possible initial root
3706 thread reclamation.
3707
3708 If any argument is negative, the behavior is undefined. */
3709 static int __kmp_expand_threads(int nNeed) {
3710 int added = 0;
3711 int minimumRequiredCapacity;
3712 int newCapacity;
3713 kmp_info_t **newThreads;
3714 kmp_root_t **newRoot;
3715
3716 // All calls to __kmp_expand_threads should be under __kmp_forkjoin_lock, so
3717 // resizing __kmp_threads does not need additional protection if foreign
3718 // threads are present
3719
3720 #if KMP_OS_WINDOWS && !KMP_DYNAMIC_LIB
3721 /* only for Windows static library */
3722 /* reclaim array entries for root threads that are already dead */
3723 added = __kmp_reclaim_dead_roots();
3724
3725 if (nNeed) {
3726 nNeed -= added;
3727 if (nNeed < 0)
3728 nNeed = 0;
3729 }
3730 #endif
3731 if (nNeed <= 0)
3732 return added;
3733
3734 // Note that __kmp_threads_capacity is not bounded by __kmp_max_nth. If
3735 // __kmp_max_nth is set to some value less than __kmp_sys_max_nth by the
3736 // user via KMP_DEVICE_THREAD_LIMIT, then __kmp_threads_capacity may become
3737 // > __kmp_max_nth in one of two ways:
3738 //
3739 // 1) The initialization thread (gtid = 0) exits. __kmp_threads[0]
3740 // may not be reused by another thread, so we may need to increase
3741 // __kmp_threads_capacity to __kmp_max_nth + 1.
3742 //
3743 // 2) New foreign root(s) are encountered. We always register new foreign
3744 // roots. This may cause a smaller # of threads to be allocated at
3745 // subsequent parallel regions, but the worker threads hang around (and
3746 // eventually go to sleep) and need slots in the __kmp_threads[] array.
3747 //
3748 // Anyway, that is the reason for moving the check to see if
3749 // __kmp_max_nth was exceeded into __kmp_reserve_threads()
3750 // instead of having it performed here. -BB
3751
3752 KMP_DEBUG_ASSERT(__kmp_sys_max_nth >= __kmp_threads_capacity);
3753
3754 /* compute expansion headroom to check if we can expand */
3755 if (__kmp_sys_max_nth - __kmp_threads_capacity < nNeed) {
3756 /* possible expansion too small -- give up */
3757 return added;
3758 }
3759 minimumRequiredCapacity = __kmp_threads_capacity + nNeed;
3760
3761 newCapacity = __kmp_threads_capacity;
3762 do {
3763 newCapacity = newCapacity <= (__kmp_sys_max_nth >> 1) ? (newCapacity << 1)
3764 : __kmp_sys_max_nth;
3765 } while (newCapacity < minimumRequiredCapacity);
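  // Worked example (illustrative): with __kmp_threads_capacity == 64 and
  // nNeed == 200, minimumRequiredCapacity is 264 and the loop above doubles
  // 64 -> 128 -> 256 -> 512, so 512 slots are allocated (unless the doubling
  // is clipped to __kmp_sys_max_nth first).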
3766 newThreads = (kmp_info_t **)__kmp_allocate(
3767 (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * newCapacity + CACHE_LINE);
3768 newRoot =
3769 (kmp_root_t **)((char *)newThreads + sizeof(kmp_info_t *) * newCapacity);
3770 KMP_MEMCPY(newThreads, __kmp_threads,
3771 __kmp_threads_capacity * sizeof(kmp_info_t *));
3772 KMP_MEMCPY(newRoot, __kmp_root,
3773 __kmp_threads_capacity * sizeof(kmp_root_t *));
3774   // Put the old __kmp_threads array on a list. Any ongoing references to the
3775   // old array remain valid. This list is cleaned up at library shutdown.
3776 kmp_old_threads_list_t *node =
3777 (kmp_old_threads_list_t *)__kmp_allocate(sizeof(kmp_old_threads_list_t));
3778 node->threads = __kmp_threads;
3779 node->next = __kmp_old_threads_list;
3780 __kmp_old_threads_list = node;
3781
3782 *(kmp_info_t * *volatile *)&__kmp_threads = newThreads;
3783 *(kmp_root_t * *volatile *)&__kmp_root = newRoot;
3784 added += newCapacity - __kmp_threads_capacity;
3785 *(volatile int *)&__kmp_threads_capacity = newCapacity;
3786
3787 if (newCapacity > __kmp_tp_capacity) {
3788 __kmp_acquire_bootstrap_lock(&__kmp_tp_cached_lock);
3789 if (__kmp_tp_cached && newCapacity > __kmp_tp_capacity) {
3790 __kmp_threadprivate_resize_cache(newCapacity);
3791 } else { // increase __kmp_tp_capacity to correspond with kmp_threads size
3792 *(volatile int *)&__kmp_tp_capacity = newCapacity;
3793 }
3794 __kmp_release_bootstrap_lock(&__kmp_tp_cached_lock);
3795 }
3796
3797 return added;
3798 }
3799
3800 /* Register the current thread as a root thread and obtain our gtid. We must
3801    have the __kmp_initz_lock held at this point. Argument TRUE only if we are
3802    the thread that calls from __kmp_do_serial_initialize() */
3803 int __kmp_register_root(int initial_thread) {
3804 kmp_info_t *root_thread;
3805 kmp_root_t *root;
3806 int gtid;
3807 int capacity;
3808 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
3809 KA_TRACE(20, ("__kmp_register_root: entered\n"));
3810 KMP_MB();
3811
3812 /* 2007-03-02:
3813    If the initial thread has not invoked the OpenMP RTL yet, and this thread
3814    is not an initial one, the "__kmp_all_nth >= __kmp_threads_capacity"
3815    condition does not work as expected -- it may return false (meaning there
3816    is at least one empty slot in the __kmp_threads array), while the only
3817    free slot may be #0, which is reserved for the initial thread and so
3818    cannot be used for this one. The following code works around this bug.
3819
3820    However, the right solution seems to be not reserving slot #0 for the
3821    initial thread because:
3822    (1) there is no magic in slot #0,
3823    (2) we cannot detect the initial thread reliably (the first thread that
3824    performs serial initialization may not be a real initial thread).
3825 */
3826 capacity = __kmp_threads_capacity;
3827 if (!initial_thread && TCR_PTR(__kmp_threads[0]) == NULL) {
3828 --capacity;
3829 }
3830
3831 // If it is not for initializing the hidden helper team, we need to take
3832 // __kmp_hidden_helper_threads_num out of the capacity because it is included
3833 // in __kmp_threads_capacity.
3834 if (__kmp_enable_hidden_helper && !TCR_4(__kmp_init_hidden_helper_threads)) {
3835 capacity -= __kmp_hidden_helper_threads_num;
3836 }
3837
3838 /* see if there are too many threads */
3839 if (__kmp_all_nth >= capacity && !__kmp_expand_threads(1)) {
3840 if (__kmp_tp_cached) {
3841 __kmp_fatal(KMP_MSG(CantRegisterNewThread),
3842 KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity),
3843 KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null);
3844 } else {
3845 __kmp_fatal(KMP_MSG(CantRegisterNewThread), KMP_HNT(SystemLimitOnThreads),
3846 __kmp_msg_null);
3847 }
3848 }
3849
3850 // When hidden helper task is enabled, __kmp_threads is organized as follows:
3851 // 0: initial thread, also a regular OpenMP thread.
3852 // [1, __kmp_hidden_helper_threads_num]: slots for hidden helper threads.
3853 // [__kmp_hidden_helper_threads_num + 1, __kmp_threads_capacity): slots for
3854 // regular OpenMP threads.
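  // For example (illustrative; the value of __kmp_hidden_helper_threads_num is
  // configurable): with 8 hidden helper threads, gtid 0 is the initial thread,
  // gtids 1..8 are hidden helper slots, and regular roots/workers are
  // registered starting at gtid 9.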
3855 if (TCR_4(__kmp_init_hidden_helper_threads)) {
3856 // Find an available thread slot for hidden helper thread. Slots for hidden
3857 // helper threads start from 1 to __kmp_hidden_helper_threads_num.
3858 for (gtid = 1; TCR_PTR(__kmp_threads[gtid]) != NULL &&
3859 gtid <= __kmp_hidden_helper_threads_num;
3860 gtid++)
3861 ;
3862 KMP_ASSERT(gtid <= __kmp_hidden_helper_threads_num);
3863 KA_TRACE(1, ("__kmp_register_root: found slot in threads array for "
3864 "hidden helper thread: T#%d\n",
3865 gtid));
3866 } else {
3867 /* find an available thread slot */
3868     // Don't reassign the zero slot since we need that to be used only by the
3869     // initial thread. Slots for hidden helper threads should also be skipped.
3870 if (initial_thread && TCR_PTR(__kmp_threads[0]) == NULL) {
3871 gtid = 0;
3872 } else {
3873 for (gtid = __kmp_hidden_helper_threads_num + 1;
3874 TCR_PTR(__kmp_threads[gtid]) != NULL; gtid++)
3875 ;
3876 }
3877 KA_TRACE(
3878 1, ("__kmp_register_root: found slot in threads array: T#%d\n", gtid));
3879 KMP_ASSERT(gtid < __kmp_threads_capacity);
3880 }
3881
3882 /* update global accounting */
3883 __kmp_all_nth++;
3884 TCW_4(__kmp_nth, __kmp_nth + 1);
3885
3886 // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search) for low
3887 // numbers of procs, and method #2 (keyed API call) for higher numbers.
3888 if (__kmp_adjust_gtid_mode) {
3889 if (__kmp_all_nth >= __kmp_tls_gtid_min) {
3890 if (TCR_4(__kmp_gtid_mode) != 2) {
3891 TCW_4(__kmp_gtid_mode, 2);
3892 }
3893 } else {
3894 if (TCR_4(__kmp_gtid_mode) != 1) {
3895 TCW_4(__kmp_gtid_mode, 1);
3896 }
3897 }
3898 }
3899
3900 #ifdef KMP_ADJUST_BLOCKTIME
3901 /* Adjust blocktime to zero if necessary */
3902 /* Middle initialization might not have occurred yet */
3903 if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
3904 if (__kmp_nth > __kmp_avail_proc) {
3905 __kmp_zero_bt = TRUE;
3906 }
3907 }
3908 #endif /* KMP_ADJUST_BLOCKTIME */
3909
3910 /* setup this new hierarchy */
3911 if (!(root = __kmp_root[gtid])) {
3912 root = __kmp_root[gtid] = (kmp_root_t *)__kmp_allocate(sizeof(kmp_root_t));
3913 KMP_DEBUG_ASSERT(!root->r.r_root_team);
3914 }
3915
3916 #if KMP_STATS_ENABLED
3917 // Initialize stats as soon as possible (right after gtid assignment).
3918 __kmp_stats_thread_ptr = __kmp_stats_list->push_back(gtid);
3919 __kmp_stats_thread_ptr->startLife();
3920 KMP_SET_THREAD_STATE(SERIAL_REGION);
3921 KMP_INIT_PARTITIONED_TIMERS(OMP_serial);
3922 #endif
3923 __kmp_initialize_root(root);
3924
3925 /* setup new root thread structure */
3926 if (root->r.r_uber_thread) {
3927 root_thread = root->r.r_uber_thread;
3928 } else {
3929 root_thread = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t));
3930 if (__kmp_storage_map) {
3931 __kmp_print_thread_storage_map(root_thread, gtid);
3932 }
3933 root_thread->th.th_info.ds.ds_gtid = gtid;
3934 #if OMPT_SUPPORT
3935 root_thread->th.ompt_thread_info.thread_data = ompt_data_none;
3936 #endif
3937 root_thread->th.th_root = root;
3938 if (__kmp_env_consistency_check) {
3939 root_thread->th.th_cons = __kmp_allocate_cons_stack(gtid);
3940 }
3941 #if USE_FAST_MEMORY
3942 __kmp_initialize_fast_memory(root_thread);
3943 #endif /* USE_FAST_MEMORY */
3944
3945 #if KMP_USE_BGET
3946 KMP_DEBUG_ASSERT(root_thread->th.th_local.bget_data == NULL);
3947 __kmp_initialize_bget(root_thread);
3948 #endif
3949 __kmp_init_random(root_thread); // Initialize random number generator
3950 }
3951
3952 /* setup the serial team held in reserve by the root thread */
3953 if (!root_thread->th.th_serial_team) {
3954 kmp_internal_control_t r_icvs = __kmp_get_global_icvs();
3955 KF_TRACE(10, ("__kmp_register_root: before serial_team\n"));
3956 root_thread->th.th_serial_team = __kmp_allocate_team(
3957 root, 1, 1,
3958 #if OMPT_SUPPORT
3959 ompt_data_none, // root parallel id
3960 #endif
3961 proc_bind_default, &r_icvs, 0 USE_NESTED_HOT_ARG(NULL));
3962 }
3963 KMP_ASSERT(root_thread->th.th_serial_team);
3964 KF_TRACE(10, ("__kmp_register_root: after serial_team = %p\n",
3965 root_thread->th.th_serial_team));
3966
3967 /* drop root_thread into place */
3968 TCW_SYNC_PTR(__kmp_threads[gtid], root_thread);
3969
3970 root->r.r_root_team->t.t_threads[0] = root_thread;
3971 root->r.r_hot_team->t.t_threads[0] = root_thread;
3972 root_thread->th.th_serial_team->t.t_threads[0] = root_thread;
3973 // AC: the team created in reserve, not for execution (it is unused for now).
3974 root_thread->th.th_serial_team->t.t_serialized = 0;
3975 root->r.r_uber_thread = root_thread;
3976
3977 /* initialize the thread, get it ready to go */
3978 __kmp_initialize_info(root_thread, root->r.r_root_team, 0, gtid);
3979 TCW_4(__kmp_init_gtid, TRUE);
3980
3981 /* prepare the primary thread for get_gtid() */
3982 __kmp_gtid_set_specific(gtid);
3983
3984 #if USE_ITT_BUILD
3985 __kmp_itt_thread_name(gtid);
3986 #endif /* USE_ITT_BUILD */
3987
3988 #ifdef KMP_TDATA_GTID
3989 __kmp_gtid = gtid;
3990 #endif
3991 __kmp_create_worker(gtid, root_thread, __kmp_stksize);
3992 KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() == gtid);
3993
3994 KA_TRACE(20, ("__kmp_register_root: T#%d init T#%d(%d:%d) arrived: join=%u, "
3995 "plain=%u\n",
3996 gtid, __kmp_gtid_from_tid(0, root->r.r_hot_team),
3997 root->r.r_hot_team->t.t_id, 0, KMP_INIT_BARRIER_STATE,
3998 KMP_INIT_BARRIER_STATE));
3999 { // Initialize barrier data.
4000 int b;
4001 for (b = 0; b < bs_last_barrier; ++b) {
4002 root_thread->th.th_bar[b].bb.b_arrived = KMP_INIT_BARRIER_STATE;
4003 #if USE_DEBUGGER
4004 root_thread->th.th_bar[b].bb.b_worker_arrived = 0;
4005 #endif
4006 }
4007 }
4008 KMP_DEBUG_ASSERT(root->r.r_hot_team->t.t_bar[bs_forkjoin_barrier].b_arrived ==
4009 KMP_INIT_BARRIER_STATE);
4010
4011 #if KMP_AFFINITY_SUPPORTED
4012 root_thread->th.th_current_place = KMP_PLACE_UNDEFINED;
4013 root_thread->th.th_new_place = KMP_PLACE_UNDEFINED;
4014 root_thread->th.th_first_place = KMP_PLACE_UNDEFINED;
4015 root_thread->th.th_last_place = KMP_PLACE_UNDEFINED;
4016 #endif /* KMP_AFFINITY_SUPPORTED */
4017 root_thread->th.th_def_allocator = __kmp_def_allocator;
4018 root_thread->th.th_prev_level = 0;
4019 root_thread->th.th_prev_num_threads = 1;
4020
4021 kmp_cg_root_t *tmp = (kmp_cg_root_t *)__kmp_allocate(sizeof(kmp_cg_root_t));
4022 tmp->cg_root = root_thread;
4023 tmp->cg_thread_limit = __kmp_cg_max_nth;
4024 tmp->cg_nthreads = 1;
4025 KA_TRACE(100, ("__kmp_register_root: Thread %p created node %p with"
4026 " cg_nthreads init to 1\n",
4027 root_thread, tmp));
4028 tmp->up = NULL;
4029 root_thread->th.th_cg_roots = tmp;
4030
4031 __kmp_root_counter++;
4032
4033 #if OMPT_SUPPORT
4034 if (ompt_enabled.enabled) {
4035
4036 kmp_info_t *root_thread = ompt_get_thread();
4037
4038 ompt_set_thread_state(root_thread, ompt_state_overhead);
4039
4040 if (ompt_enabled.ompt_callback_thread_begin) {
4041 ompt_callbacks.ompt_callback(ompt_callback_thread_begin)(
4042 ompt_thread_initial, __ompt_get_thread_data_internal());
4043 }
4044 ompt_data_t *task_data;
4045 ompt_data_t *parallel_data;
4046     __ompt_get_task_info_internal(0, NULL, &task_data, NULL, &parallel_data,
4047 NULL);
4048 if (ompt_enabled.ompt_callback_implicit_task) {
4049 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
4050 ompt_scope_begin, parallel_data, task_data, 1, 1, ompt_task_initial);
4051 }
4052
4053 ompt_set_thread_state(root_thread, ompt_state_work_serial);
4054 }
4055 #endif
4056 #if OMPD_SUPPORT
4057 if (ompd_state & OMPD_ENABLE_BP)
4058 ompd_bp_thread_begin();
4059 #endif
4060
4061 KMP_MB();
4062 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
4063
4064 return gtid;
4065 }
4066
4067 #if KMP_NESTED_HOT_TEAMS
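// Recursively releases the nested hot teams hanging off 'thr' at 'level' and
// below (down to max_level - 1), returning the number of thread slots freed;
// the primary thread of each hot team is not counted since it is not freed
// here.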
4068 static int __kmp_free_hot_teams(kmp_root_t *root, kmp_info_t *thr, int level,
4069 const int max_level) {
4070 int i, n, nth;
4071 kmp_hot_team_ptr_t *hot_teams = thr->th.th_hot_teams;
4072 if (!hot_teams || !hot_teams[level].hot_team) {
4073 return 0;
4074 }
4075 KMP_DEBUG_ASSERT(level < max_level);
4076 kmp_team_t *team = hot_teams[level].hot_team;
4077 nth = hot_teams[level].hot_team_nth;
4078 n = nth - 1; // primary thread is not freed
4079 if (level < max_level - 1) {
4080 for (i = 0; i < nth; ++i) {
4081 kmp_info_t *th = team->t.t_threads[i];
4082 n += __kmp_free_hot_teams(root, th, level + 1, max_level);
4083 if (i > 0 && th->th.th_hot_teams) {
4084 __kmp_free(th->th.th_hot_teams);
4085 th->th.th_hot_teams = NULL;
4086 }
4087 }
4088 }
4089 __kmp_free_team(root, team, NULL);
4090 return n;
4091 }
4092 #endif
4093
4094 // Resets a root thread and clears its root and hot teams.
4095 // Returns the number of __kmp_threads entries directly and indirectly freed.
4096 static int __kmp_reset_root(int gtid, kmp_root_t *root) {
4097 kmp_team_t *root_team = root->r.r_root_team;
4098 kmp_team_t *hot_team = root->r.r_hot_team;
4099 int n = hot_team->t.t_nproc;
4100 int i;
4101
4102 KMP_DEBUG_ASSERT(!root->r.r_active);
4103
4104 root->r.r_root_team = NULL;
4105 root->r.r_hot_team = NULL;
4106 // __kmp_free_team() does not free hot teams, so we have to clear r_hot_team
4107 // before call to __kmp_free_team().
4108 __kmp_free_team(root, root_team USE_NESTED_HOT_ARG(NULL));
4109 #if KMP_NESTED_HOT_TEAMS
4110 if (__kmp_hot_teams_max_level >
4111 0) { // need to free nested hot teams and their threads if any
4112 for (i = 0; i < hot_team->t.t_nproc; ++i) {
4113 kmp_info_t *th = hot_team->t.t_threads[i];
4114 if (__kmp_hot_teams_max_level > 1) {
4115 n += __kmp_free_hot_teams(root, th, 1, __kmp_hot_teams_max_level);
4116 }
4117 if (th->th.th_hot_teams) {
4118 __kmp_free(th->th.th_hot_teams);
4119 th->th.th_hot_teams = NULL;
4120 }
4121 }
4122 }
4123 #endif
4124 __kmp_free_team(root, hot_team USE_NESTED_HOT_ARG(NULL));
4125
4126 // Before we can reap the thread, we need to make certain that all other
4127 // threads in the teams that had this root as ancestor have stopped trying to
4128 // steal tasks.
4129 if (__kmp_tasking_mode != tskm_immediate_exec) {
4130 __kmp_wait_to_unref_task_teams();
4131 }
4132
4133 #if KMP_OS_WINDOWS
4134 /* Close Handle of root duplicated in __kmp_create_worker (tr #62919) */
4135 KA_TRACE(
4136 10, ("__kmp_reset_root: free handle, th = %p, handle = %" KMP_UINTPTR_SPEC
4137 "\n",
4138 (LPVOID) & (root->r.r_uber_thread->th),
4139 root->r.r_uber_thread->th.th_info.ds.ds_thread));
4140 __kmp_free_handle(root->r.r_uber_thread->th.th_info.ds.ds_thread);
4141 #endif /* KMP_OS_WINDOWS */
4142
4143 #if OMPD_SUPPORT
4144 if (ompd_state & OMPD_ENABLE_BP)
4145 ompd_bp_thread_end();
4146 #endif
4147
4148 #if OMPT_SUPPORT
4149 ompt_data_t *task_data;
4150 ompt_data_t *parallel_data;
4151   __ompt_get_task_info_internal(0, NULL, &task_data, NULL, &parallel_data,
4152 NULL);
4153 if (ompt_enabled.ompt_callback_implicit_task) {
4154 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
4155 ompt_scope_end, parallel_data, task_data, 0, 1, ompt_task_initial);
4156 }
4157 if (ompt_enabled.ompt_callback_thread_end) {
4158 ompt_callbacks.ompt_callback(ompt_callback_thread_end)(
4159 &(root->r.r_uber_thread->th.ompt_thread_info.thread_data));
4160 }
4161 #endif
4162
4163 TCW_4(__kmp_nth,
4164 __kmp_nth - 1); // __kmp_reap_thread will decrement __kmp_all_nth.
4165 i = root->r.r_uber_thread->th.th_cg_roots->cg_nthreads--;
4166 KA_TRACE(100, ("__kmp_reset_root: Thread %p decrement cg_nthreads on node %p"
4167 " to %d\n",
4168 root->r.r_uber_thread, root->r.r_uber_thread->th.th_cg_roots,
4169 root->r.r_uber_thread->th.th_cg_roots->cg_nthreads));
4170 if (i == 1) {
4171 // need to free contention group structure
4172 KMP_DEBUG_ASSERT(root->r.r_uber_thread ==
4173 root->r.r_uber_thread->th.th_cg_roots->cg_root);
4174 KMP_DEBUG_ASSERT(root->r.r_uber_thread->th.th_cg_roots->up == NULL);
4175 __kmp_free(root->r.r_uber_thread->th.th_cg_roots);
4176 root->r.r_uber_thread->th.th_cg_roots = NULL;
4177 }
4178 __kmp_reap_thread(root->r.r_uber_thread, 1);
4179
4180   // We cannot put the root thread into __kmp_thread_pool, so we have to reap
4181   // it instead of freeing it.
4182 root->r.r_uber_thread = NULL;
4183 /* mark root as no longer in use */
4184 root->r.r_begin = FALSE;
4185
4186 return n;
4187 }
4188
4189 void __kmp_unregister_root_current_thread(int gtid) {
4190 KA_TRACE(1, ("__kmp_unregister_root_current_thread: enter T#%d\n", gtid));
4191 /* this lock should be ok, since unregister_root_current_thread is never
4192 called during an abort, only during a normal close. furthermore, if you
4193 have the forkjoin lock, you should never try to get the initz lock */
4194 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
4195 if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
4196 KC_TRACE(10, ("__kmp_unregister_root_current_thread: already finished, "
4197 "exiting T#%d\n",
4198 gtid));
4199 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
4200 return;
4201 }
4202 kmp_root_t *root = __kmp_root[gtid];
4203
4204 KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
4205 KMP_ASSERT(KMP_UBER_GTID(gtid));
4206 KMP_ASSERT(root == __kmp_threads[gtid]->th.th_root);
4207 KMP_ASSERT(root->r.r_active == FALSE);
4208
4209 KMP_MB();
4210
4211 kmp_info_t *thread = __kmp_threads[gtid];
4212 kmp_team_t *team = thread->th.th_team;
4213 kmp_task_team_t *task_team = thread->th.th_task_team;
4214
4215 // we need to wait for the proxy tasks before finishing the thread
4216 if (task_team != NULL && (task_team->tt.tt_found_proxy_tasks ||
4217 task_team->tt.tt_hidden_helper_task_encountered)) {
4218 #if OMPT_SUPPORT
4219 // the runtime is shutting down so we won't report any events
4220 thread->th.ompt_thread_info.state = ompt_state_undefined;
4221 #endif
4222 __kmp_task_team_wait(thread, team USE_ITT_BUILD_ARG(NULL));
4223 }
4224
4225 __kmp_reset_root(gtid, root);
4226
4227 KMP_MB();
4228 KC_TRACE(10,
4229 ("__kmp_unregister_root_current_thread: T#%d unregistered\n", gtid));
4230
4231 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
4232 }
4233
4234 #if KMP_OS_WINDOWS
4235 /* __kmp_forkjoin_lock must be already held
4236 Unregisters a root thread that is not the current thread. Returns the number
4237 of __kmp_threads entries freed as a result. */
4238 static int __kmp_unregister_root_other_thread(int gtid) {
4239 kmp_root_t *root = __kmp_root[gtid];
4240 int r;
4241
4242 KA_TRACE(1, ("__kmp_unregister_root_other_thread: enter T#%d\n", gtid));
4243 KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
4244 KMP_ASSERT(KMP_UBER_GTID(gtid));
4245 KMP_ASSERT(root == __kmp_threads[gtid]->th.th_root);
4246 KMP_ASSERT(root->r.r_active == FALSE);
4247
4248 r = __kmp_reset_root(gtid, root);
4249 KC_TRACE(10,
4250 ("__kmp_unregister_root_other_thread: T#%d unregistered\n", gtid));
4251 return r;
4252 }
4253 #endif
4254
4255 #if KMP_DEBUG
4256 void __kmp_task_info() {
4257
4258 kmp_int32 gtid = __kmp_entry_gtid();
4259 kmp_int32 tid = __kmp_tid_from_gtid(gtid);
4260 kmp_info_t *this_thr = __kmp_threads[gtid];
4261 kmp_team_t *steam = this_thr->th.th_serial_team;
4262 kmp_team_t *team = this_thr->th.th_team;
4263
4264 __kmp_printf(
4265 "__kmp_task_info: gtid=%d tid=%d t_thread=%p team=%p steam=%p curtask=%p "
4266 "ptask=%p\n",
4267 gtid, tid, this_thr, team, steam, this_thr->th.th_current_task,
4268 team->t.t_implicit_task_taskdata[tid].td_parent);
4269 }
4270 #endif // KMP_DEBUG
4271
4272 /* TODO optimize with one big memclr, take out what isn't needed, split
4273 responsibility to workers as much as possible, and delay initialization of
4274 features as much as possible */
4275 static void __kmp_initialize_info(kmp_info_t *this_thr, kmp_team_t *team,
4276 int tid, int gtid) {
4277 /* this_thr->th.th_info.ds.ds_gtid is setup in
4278 kmp_allocate_thread/create_worker.
4279 this_thr->th.th_serial_team is setup in __kmp_allocate_thread */
4280 KMP_DEBUG_ASSERT(this_thr != NULL);
4281 KMP_DEBUG_ASSERT(this_thr->th.th_serial_team);
4282 KMP_DEBUG_ASSERT(team);
4283 KMP_DEBUG_ASSERT(team->t.t_threads);
4284 KMP_DEBUG_ASSERT(team->t.t_dispatch);
4285 kmp_info_t *master = team->t.t_threads[0];
4286 KMP_DEBUG_ASSERT(master);
4287 KMP_DEBUG_ASSERT(master->th.th_root);
4288
4289 KMP_MB();
4290
4291 TCW_SYNC_PTR(this_thr->th.th_team, team);
4292
4293 this_thr->th.th_info.ds.ds_tid = tid;
4294 this_thr->th.th_set_nproc = 0;
4295 if (__kmp_tasking_mode != tskm_immediate_exec)
4296 // When tasking is possible, threads are not safe to reap until they are
4297 // done tasking; this will be set when tasking code is exited in wait
4298 this_thr->th.th_reap_state = KMP_NOT_SAFE_TO_REAP;
4299 else // no tasking --> always safe to reap
4300 this_thr->th.th_reap_state = KMP_SAFE_TO_REAP;
4301 this_thr->th.th_set_proc_bind = proc_bind_default;
4302
4303 #if KMP_AFFINITY_SUPPORTED
4304 this_thr->th.th_new_place = this_thr->th.th_current_place;
4305 #endif
4306 this_thr->th.th_root = master->th.th_root;
4307
4308 /* setup the thread's cache of the team structure */
4309 this_thr->th.th_team_nproc = team->t.t_nproc;
4310 this_thr->th.th_team_master = master;
4311 this_thr->th.th_team_serialized = team->t.t_serialized;
4312
4313 KMP_DEBUG_ASSERT(team->t.t_implicit_task_taskdata);
4314
4315 KF_TRACE(10, ("__kmp_initialize_info1: T#%d:%d this_thread=%p curtask=%p\n",
4316 tid, gtid, this_thr, this_thr->th.th_current_task));
4317
4318 __kmp_init_implicit_task(this_thr->th.th_team_master->th.th_ident, this_thr,
4319 team, tid, TRUE);
4320
4321 KF_TRACE(10, ("__kmp_initialize_info2: T#%d:%d this_thread=%p curtask=%p\n",
4322 tid, gtid, this_thr, this_thr->th.th_current_task));
4323 // TODO: Initialize ICVs from parent; GEH - isn't that already done in
4324 // __kmp_initialize_team()?
4325
4326 /* TODO no worksharing in speculative threads */
4327 this_thr->th.th_dispatch = &team->t.t_dispatch[tid];
4328
4329 this_thr->th.th_local.this_construct = 0;
4330
4331 if (!this_thr->th.th_pri_common) {
4332 this_thr->th.th_pri_common =
4333 (struct common_table *)__kmp_allocate(sizeof(struct common_table));
4334 if (__kmp_storage_map) {
4335 __kmp_print_storage_map_gtid(
4336 gtid, this_thr->th.th_pri_common, this_thr->th.th_pri_common + 1,
4337 sizeof(struct common_table), "th_%d.th_pri_common\n", gtid);
4338 }
4339 this_thr->th.th_pri_head = NULL;
4340 }
4341
4342 if (this_thr != master && // Primary thread's CG root is initialized elsewhere
4343 this_thr->th.th_cg_roots != master->th.th_cg_roots) { // CG root not set
4344 // Make new thread's CG root same as primary thread's
4345 KMP_DEBUG_ASSERT(master->th.th_cg_roots);
4346 kmp_cg_root_t *tmp = this_thr->th.th_cg_roots;
4347 if (tmp) {
4348 // worker changes CG, need to check if old CG should be freed
4349 int i = tmp->cg_nthreads--;
4350 KA_TRACE(100, ("__kmp_initialize_info: Thread %p decrement cg_nthreads"
4351 " on node %p of thread %p to %d\n",
4352 this_thr, tmp, tmp->cg_root, tmp->cg_nthreads));
4353 if (i == 1) {
4354 __kmp_free(tmp); // last thread left CG --> free it
4355 }
4356 }
4357 this_thr->th.th_cg_roots = master->th.th_cg_roots;
4358 // Increment new thread's CG root's counter to add the new thread
4359 this_thr->th.th_cg_roots->cg_nthreads++;
4360 KA_TRACE(100, ("__kmp_initialize_info: Thread %p increment cg_nthreads on"
4361 " node %p of thread %p to %d\n",
4362 this_thr, this_thr->th.th_cg_roots,
4363 this_thr->th.th_cg_roots->cg_root,
4364 this_thr->th.th_cg_roots->cg_nthreads));
4365 this_thr->th.th_current_task->td_icvs.thread_limit =
4366 this_thr->th.th_cg_roots->cg_thread_limit;
4367 }
4368
4369 /* Initialize dynamic dispatch */
4370 {
4371 volatile kmp_disp_t *dispatch = this_thr->th.th_dispatch;
4372 // Use team max_nproc since this will never change for the team.
4373 size_t disp_size =
4374 sizeof(dispatch_private_info_t) *
4375 (team->t.t_max_nproc == 1 ? 1 : __kmp_dispatch_num_buffers);
4376 KD_TRACE(10, ("__kmp_initialize_info: T#%d max_nproc: %d\n", gtid,
4377 team->t.t_max_nproc));
4378 KMP_ASSERT(dispatch);
4379 KMP_DEBUG_ASSERT(team->t.t_dispatch);
4380 KMP_DEBUG_ASSERT(dispatch == &team->t.t_dispatch[tid]);
4381
4382 dispatch->th_disp_index = 0;
4383 dispatch->th_doacross_buf_idx = 0;
4384 if (!dispatch->th_disp_buffer) {
4385 dispatch->th_disp_buffer =
4386 (dispatch_private_info_t *)__kmp_allocate(disp_size);
4387
4388 if (__kmp_storage_map) {
4389 __kmp_print_storage_map_gtid(
4390 gtid, &dispatch->th_disp_buffer[0],
4391 &dispatch->th_disp_buffer[team->t.t_max_nproc == 1
4392 ? 1
4393 : __kmp_dispatch_num_buffers],
4394 disp_size,
4395 "th_%d.th_dispatch.th_disp_buffer "
4396 "(team_%d.t_dispatch[%d].th_disp_buffer)",
4397 gtid, team->t.t_id, gtid);
4398 }
4399 } else {
4400 memset(&dispatch->th_disp_buffer[0], '\0', disp_size);
4401 }
4402
4403 dispatch->th_dispatch_pr_current = 0;
4404 dispatch->th_dispatch_sh_current = 0;
4405
4406 dispatch->th_deo_fcn = 0; /* ORDERED */
4407 dispatch->th_dxo_fcn = 0; /* END ORDERED */
4408 }
4409
4410 this_thr->th.th_next_pool = NULL;
4411
4412 KMP_DEBUG_ASSERT(!this_thr->th.th_spin_here);
4413 KMP_DEBUG_ASSERT(this_thr->th.th_next_waiting == 0);
4414
4415 KMP_MB();
4416 }
4417
4418 /* Allocate a new thread for the requesting team. This is only called from
4419    within a forkjoin critical section. We first try to get an available thread
4420    from the thread pool; if none is available, we fork a new one. Creating a
4421    new thread should always succeed here, since the caller is expected to have
4422    checked that there is room first. */
4423 kmp_info_t *__kmp_allocate_thread(kmp_root_t *root, kmp_team_t *team,
4424 int new_tid) {
4425 kmp_team_t *serial_team;
4426 kmp_info_t *new_thr;
4427 int new_gtid;
4428
4429 KA_TRACE(20, ("__kmp_allocate_thread: T#%d\n", __kmp_get_gtid()));
4430 KMP_DEBUG_ASSERT(root && team);
4431 #if !KMP_NESTED_HOT_TEAMS
4432 KMP_DEBUG_ASSERT(KMP_MASTER_GTID(__kmp_get_gtid()));
4433 #endif
4434 KMP_MB();
4435
4436   /* First, try to get one from the thread pool unless the allocating thread
4437    * is the main hidden helper thread. The hidden helper team should always
4438    * allocate new OS threads. */
4439 if (__kmp_thread_pool && !KMP_HIDDEN_HELPER_TEAM(team)) {
4440 new_thr = CCAST(kmp_info_t *, __kmp_thread_pool);
4441 __kmp_thread_pool = (volatile kmp_info_t *)new_thr->th.th_next_pool;
4442 if (new_thr == __kmp_thread_pool_insert_pt) {
4443 __kmp_thread_pool_insert_pt = NULL;
4444 }
4445 TCW_4(new_thr->th.th_in_pool, FALSE);
4446 __kmp_suspend_initialize_thread(new_thr);
4447 __kmp_lock_suspend_mx(new_thr);
4448 if (new_thr->th.th_active_in_pool == TRUE) {
4449 KMP_DEBUG_ASSERT(new_thr->th.th_active == TRUE);
4450 KMP_ATOMIC_DEC(&__kmp_thread_pool_active_nth);
4451 new_thr->th.th_active_in_pool = FALSE;
4452 }
4453 __kmp_unlock_suspend_mx(new_thr);
4454
4455 KA_TRACE(20, ("__kmp_allocate_thread: T#%d using thread T#%d\n",
4456 __kmp_get_gtid(), new_thr->th.th_info.ds.ds_gtid));
4457 KMP_ASSERT(!new_thr->th.th_team);
4458 KMP_DEBUG_ASSERT(__kmp_nth < __kmp_threads_capacity);
4459
4460 /* setup the thread structure */
4461 __kmp_initialize_info(new_thr, team, new_tid,
4462 new_thr->th.th_info.ds.ds_gtid);
4463 KMP_DEBUG_ASSERT(new_thr->th.th_serial_team);
4464
4465 TCW_4(__kmp_nth, __kmp_nth + 1);
4466
4467 new_thr->th.th_task_state = 0;
4468
4469 if (__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
4470 // Make sure pool thread has transitioned to waiting on own thread struct
4471 KMP_DEBUG_ASSERT(new_thr->th.th_used_in_team.load() == 0);
4472 // Thread activated in __kmp_allocate_team when increasing team size
4473 }
4474
4475 #ifdef KMP_ADJUST_BLOCKTIME
4476 /* Adjust blocktime back to zero if necessary */
4477 /* Middle initialization might not have occurred yet */
4478 if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
4479 if (__kmp_nth > __kmp_avail_proc) {
4480 __kmp_zero_bt = TRUE;
4481 }
4482 }
4483 #endif /* KMP_ADJUST_BLOCKTIME */
4484
4485 #if KMP_DEBUG
4486 // If thread entered pool via __kmp_free_thread, wait_flag should !=
4487 // KMP_BARRIER_PARENT_FLAG.
4488 int b;
4489 kmp_balign_t *balign = new_thr->th.th_bar;
4490 for (b = 0; b < bs_last_barrier; ++b)
4491 KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
4492 #endif
4493
4494 KF_TRACE(10, ("__kmp_allocate_thread: T#%d using thread %p T#%d\n",
4495 __kmp_get_gtid(), new_thr, new_thr->th.th_info.ds.ds_gtid));
4496
4497 KMP_MB();
4498 return new_thr;
4499 }
4500
4501 /* no pooled thread was taken, so fork a new one */
4502 KMP_ASSERT(KMP_HIDDEN_HELPER_TEAM(team) || __kmp_nth == __kmp_all_nth);
4503 KMP_ASSERT(__kmp_all_nth < __kmp_threads_capacity);
4504
4505 #if KMP_USE_MONITOR
4506 // If this is the first worker thread the RTL is creating, then also
4507 // launch the monitor thread. We try to do this as early as possible.
4508 if (!TCR_4(__kmp_init_monitor)) {
4509 __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
4510 if (!TCR_4(__kmp_init_monitor)) {
4511 KF_TRACE(10, ("before __kmp_create_monitor\n"));
4512 TCW_4(__kmp_init_monitor, 1);
4513 __kmp_create_monitor(&__kmp_monitor);
4514 KF_TRACE(10, ("after __kmp_create_monitor\n"));
4515 #if KMP_OS_WINDOWS
4516 // AC: wait until monitor has started. This is a fix for CQ232808.
4517 // The reason is that if the library is loaded/unloaded in a loop with
4518 // small (parallel) work in between, then there is a high probability that
4519 // the monitor thread starts only after the library has shut down. At
4520 // shutdown it is too late to cope with the problem, because when the
4521 // primary thread is in DllMain (process detach) the monitor has no chance
4522 // to start (it is blocked), and the primary thread has no means to inform
4523 // the monitor that the library is gone, because all the memory the monitor
4524 // could access is about to be released/reset.
4525 while (TCR_4(__kmp_init_monitor) < 2) {
4526 KMP_YIELD(TRUE);
4527 }
4528 KF_TRACE(10, ("after monitor thread has started\n"));
4529 #endif
4530 }
4531 __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
4532 }
4533 #endif
4534
4535 KMP_MB();
4536
4537 {
4538 int new_start_gtid = TCR_4(__kmp_init_hidden_helper_threads)
4539 ? 1
4540 : __kmp_hidden_helper_threads_num + 1;
4541
4542 for (new_gtid = new_start_gtid; TCR_PTR(__kmp_threads[new_gtid]) != NULL;
4543 ++new_gtid) {
4544 KMP_DEBUG_ASSERT(new_gtid < __kmp_threads_capacity);
4545 }
4546
4547 if (TCR_4(__kmp_init_hidden_helper_threads)) {
4548 KMP_DEBUG_ASSERT(new_gtid <= __kmp_hidden_helper_threads_num);
4549 }
4550 }
4551
4552 /* allocate space for it. */
4553 new_thr = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t));
4554
4555 new_thr->th.th_nt_strict = false;
4556 new_thr->th.th_nt_loc = NULL;
4557 new_thr->th.th_nt_sev = severity_fatal;
4558 new_thr->th.th_nt_msg = NULL;
4559
4560 TCW_SYNC_PTR(__kmp_threads[new_gtid], new_thr);
4561
4562 #if USE_ITT_BUILD && USE_ITT_NOTIFY && KMP_DEBUG
4563 // suppress race-condition detection on synchronization flags in debug mode;
4564 // this helps analyze library internals by eliminating false positives
4565 __itt_suppress_mark_range(
4566 __itt_suppress_range, __itt_suppress_threading_errors,
4567 &new_thr->th.th_sleep_loc, sizeof(new_thr->th.th_sleep_loc));
4568 __itt_suppress_mark_range(
4569 __itt_suppress_range, __itt_suppress_threading_errors,
4570 &new_thr->th.th_reap_state, sizeof(new_thr->th.th_reap_state));
4571 #if KMP_OS_WINDOWS
4572 __itt_suppress_mark_range(
4573 __itt_suppress_range, __itt_suppress_threading_errors,
4574 &new_thr->th.th_suspend_init, sizeof(new_thr->th.th_suspend_init));
4575 #else
4576 __itt_suppress_mark_range(__itt_suppress_range,
4577 __itt_suppress_threading_errors,
4578 &new_thr->th.th_suspend_init_count,
4579 sizeof(new_thr->th.th_suspend_init_count));
4580 #endif
4581 // TODO: check if we need to also suppress b_arrived flags
4582 __itt_suppress_mark_range(__itt_suppress_range,
4583 __itt_suppress_threading_errors,
4584 CCAST(kmp_uint64 *, &new_thr->th.th_bar[0].bb.b_go),
4585 sizeof(new_thr->th.th_bar[0].bb.b_go));
4586 __itt_suppress_mark_range(__itt_suppress_range,
4587 __itt_suppress_threading_errors,
4588 CCAST(kmp_uint64 *, &new_thr->th.th_bar[1].bb.b_go),
4589 sizeof(new_thr->th.th_bar[1].bb.b_go));
4590 __itt_suppress_mark_range(__itt_suppress_range,
4591 __itt_suppress_threading_errors,
4592 CCAST(kmp_uint64 *, &new_thr->th.th_bar[2].bb.b_go),
4593 sizeof(new_thr->th.th_bar[2].bb.b_go));
4594 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY && KMP_DEBUG */
4595 if (__kmp_storage_map) {
4596 __kmp_print_thread_storage_map(new_thr, new_gtid);
4597 }
4598
4599 // add the reserve serialized team, initialized from the team's primary thread
4600 {
4601 kmp_internal_control_t r_icvs = __kmp_get_x_global_icvs(team);
4602 KF_TRACE(10, ("__kmp_allocate_thread: before th_serial/serial_team\n"));
4603 new_thr->th.th_serial_team = serial_team =
4604 (kmp_team_t *)__kmp_allocate_team(root, 1, 1,
4605 #if OMPT_SUPPORT
4606 ompt_data_none, // root parallel id
4607 #endif
4608 proc_bind_default, &r_icvs,
4609 0 USE_NESTED_HOT_ARG(NULL));
4610 }
4611 KMP_ASSERT(serial_team);
4612 serial_team->t.t_serialized = 0; // AC: the team created in reserve, not for
4613 // execution (it is unused for now).
4614 serial_team->t.t_threads[0] = new_thr;
4615 KF_TRACE(10,
4616 ("__kmp_allocate_thread: after th_serial/serial_team : new_thr=%p\n",
4617 new_thr));
4618
4619 /* setup the thread structures */
4620 __kmp_initialize_info(new_thr, team, new_tid, new_gtid);
4621
4622 #if USE_FAST_MEMORY
4623 __kmp_initialize_fast_memory(new_thr);
4624 #endif /* USE_FAST_MEMORY */
4625
4626 #if KMP_USE_BGET
4627 KMP_DEBUG_ASSERT(new_thr->th.th_local.bget_data == NULL);
4628 __kmp_initialize_bget(new_thr);
4629 #endif
4630
4631 __kmp_init_random(new_thr); // Initialize random number generator
4632
4633 /* Initialize these only once when thread is grabbed for a team allocation */
4634 KA_TRACE(20,
4635 ("__kmp_allocate_thread: T#%d init go fork=%u, plain=%u\n",
4636 __kmp_get_gtid(), KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
4637
4638 int b;
4639 kmp_balign_t *balign = new_thr->th.th_bar;
4640 for (b = 0; b < bs_last_barrier; ++b) {
4641 balign[b].bb.b_go = KMP_INIT_BARRIER_STATE;
4642 balign[b].bb.team = NULL;
4643 balign[b].bb.wait_flag = KMP_BARRIER_NOT_WAITING;
4644 balign[b].bb.use_oncore_barrier = 0;
4645 }
4646
4647 TCW_PTR(new_thr->th.th_sleep_loc, NULL);
4648 new_thr->th.th_sleep_loc_type = flag_unset;
4649
4650 new_thr->th.th_spin_here = FALSE;
4651 new_thr->th.th_next_waiting = 0;
4652 #if KMP_OS_UNIX
4653 new_thr->th.th_blocking = false;
4654 #endif
4655
4656 #if KMP_AFFINITY_SUPPORTED
4657 new_thr->th.th_current_place = KMP_PLACE_UNDEFINED;
4658 new_thr->th.th_new_place = KMP_PLACE_UNDEFINED;
4659 new_thr->th.th_first_place = KMP_PLACE_UNDEFINED;
4660 new_thr->th.th_last_place = KMP_PLACE_UNDEFINED;
4661 #endif
4662 new_thr->th.th_def_allocator = __kmp_def_allocator;
4663 new_thr->th.th_prev_level = 0;
4664 new_thr->th.th_prev_num_threads = 1;
4665
4666 TCW_4(new_thr->th.th_in_pool, FALSE);
4667 new_thr->th.th_active_in_pool = FALSE;
4668 TCW_4(new_thr->th.th_active, TRUE);
4669
4670 new_thr->th.th_set_nested_nth = NULL;
4671 new_thr->th.th_set_nested_nth_sz = 0;
4672
4673 /* adjust the global counters */
4674 __kmp_all_nth++;
4675 __kmp_nth++;
4676
4677 // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search) for low
4678 // numbers of procs, and method #2 (keyed API call) for higher numbers.
4679 if (__kmp_adjust_gtid_mode) {
4680 if (__kmp_all_nth >= __kmp_tls_gtid_min) {
4681 if (TCR_4(__kmp_gtid_mode) != 2) {
4682 TCW_4(__kmp_gtid_mode, 2);
4683 }
4684 } else {
4685 if (TCR_4(__kmp_gtid_mode) != 1) {
4686 TCW_4(__kmp_gtid_mode, 1);
4687 }
4688 }
4689 }
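/* Sketch (hypothetical threshold): if __kmp_tls_gtid_min were 20, then the
   creation of the 20th thread would switch __kmp_gtid_mode from 1
   (stack-pointer search) to 2 (keyed TLS API), matching the comment above. */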
4690
4691 #ifdef KMP_ADJUST_BLOCKTIME
4692 /* Adjust blocktime back to zero if necessary */
4693 /* Middle initialization might not have occurred yet */
4694 if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
4695 if (__kmp_nth > __kmp_avail_proc) {
4696 __kmp_zero_bt = TRUE;
4697 }
4698 }
4699 #endif /* KMP_ADJUST_BLOCKTIME */
4700
4701 #if KMP_AFFINITY_SUPPORTED
4702 // Set the affinity and topology information for new thread
4703 __kmp_affinity_set_init_mask(new_gtid, /*isa_root=*/FALSE);
4704 #endif
4705
4706 /* actually fork it and create the new worker thread */
4707 KF_TRACE(
4708 10, ("__kmp_allocate_thread: before __kmp_create_worker: %p\n", new_thr));
4709 __kmp_create_worker(new_gtid, new_thr, __kmp_stksize);
4710 KF_TRACE(10,
4711 ("__kmp_allocate_thread: after __kmp_create_worker: %p\n", new_thr));
4712
4713 KA_TRACE(20, ("__kmp_allocate_thread: T#%d forked T#%d\n", __kmp_get_gtid(),
4714 new_gtid));
4715 KMP_MB();
4716 return new_thr;
4717 }
4718
4719 /* Reinitialize team for reuse.
4720 The hot team code calls this routine at every fork barrier, so EPCC barrier
4721 tests are extremely sensitive to changes in it, esp. writes to the team
4722 struct, which cause a cache invalidation in all threads.
4723 IF YOU TOUCH THIS ROUTINE, RUN EPCC C SYNCBENCH ON A BIG-IRON MACHINE!!! */
4724 static void __kmp_reinitialize_team(kmp_team_t *team,
4725 kmp_internal_control_t *new_icvs,
4726 ident_t *loc) {
4727 KF_TRACE(10, ("__kmp_reinitialize_team: enter this_thread=%p team=%p\n",
4728 team->t.t_threads[0], team));
4729 KMP_DEBUG_ASSERT(team && new_icvs);
4730 KMP_DEBUG_ASSERT((!TCR_4(__kmp_init_parallel)) || new_icvs->nproc);
4731 KMP_CHECK_UPDATE(team->t.t_ident, loc);
4732
4733 KMP_CHECK_UPDATE(team->t.t_id, KMP_GEN_TEAM_ID());
4734 // Copy ICVs to the primary thread's implicit taskdata
4735 __kmp_init_implicit_task(loc, team->t.t_threads[0], team, 0, FALSE);
4736 copy_icvs(&team->t.t_implicit_task_taskdata[0].td_icvs, new_icvs);
4737
4738 KF_TRACE(10, ("__kmp_reinitialize_team: exit this_thread=%p team=%p\n",
4739 team->t.t_threads[0], team));
4740 }
4741
4742 /* Initialize the team data structure.
4743 This assumes the t_threads and t_max_nproc are already set.
4744 Also, we don't touch the arguments */
4745 static void __kmp_initialize_team(kmp_team_t *team, int new_nproc,
4746 kmp_internal_control_t *new_icvs,
4747 ident_t *loc) {
4748 KF_TRACE(10, ("__kmp_initialize_team: enter: team=%p\n", team));
4749
4750 /* verify */
4751 KMP_DEBUG_ASSERT(team);
4752 KMP_DEBUG_ASSERT(new_nproc <= team->t.t_max_nproc);
4753 KMP_DEBUG_ASSERT(team->t.t_threads);
4754 KMP_MB();
4755
4756 team->t.t_master_tid = 0; /* not needed */
4757 /* team->t.t_master_bar; not needed */
4758 team->t.t_serialized = new_nproc > 1 ? 0 : 1;
4759 team->t.t_nproc = new_nproc;
4760
4761 /* team->t.t_parent = NULL; TODO not needed & would mess up hot team */
4762 team->t.t_next_pool = NULL;
4763 /* memset( team->t.t_threads, 0, sizeof(kmp_info_t*)*new_nproc ); would mess
4764 * up hot team */
4765
4766 TCW_SYNC_PTR(team->t.t_pkfn, NULL); /* not needed */
4767 team->t.t_invoke = NULL; /* not needed */
4768
4769 // TODO???: team->t.t_max_active_levels = new_max_active_levels;
4770 team->t.t_sched.sched = new_icvs->sched.sched;
4771
4772 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
4773 team->t.t_fp_control_saved = FALSE; /* not needed */
4774 team->t.t_x87_fpu_control_word = 0; /* not needed */
4775 team->t.t_mxcsr = 0; /* not needed */
4776 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
4777
4778 team->t.t_construct = 0;
4779
4780 team->t.t_ordered.dt.t_value = 0;
4781 team->t.t_master_active = FALSE;
4782
4783 #ifdef KMP_DEBUG
4784 team->t.t_copypriv_data = NULL; /* not necessary, but nice for debugging */
4785 #endif
4786 #if KMP_OS_WINDOWS
4787 team->t.t_copyin_counter = 0; /* for barrier-free copyin implementation */
4788 #endif
4789
4790 team->t.t_control_stack_top = NULL;
4791
4792 __kmp_reinitialize_team(team, new_icvs, loc);
4793
4794 KMP_MB();
4795 KF_TRACE(10, ("__kmp_initialize_team: exit: team=%p\n", team));
4796 }
4797
4798 #if KMP_AFFINITY_SUPPORTED
4799 static inline void __kmp_set_thread_place(kmp_team_t *team, kmp_info_t *th,
4800 int first, int last, int newp) {
4801 th->th.th_first_place = first;
4802 th->th.th_last_place = last;
4803 th->th.th_new_place = newp;
4804 if (newp != th->th.th_current_place) {
4805 if (__kmp_display_affinity && team->t.t_display_affinity != 1)
4806 team->t.t_display_affinity = 1;
4807 // Copy topology information associated with the new place
4808 th->th.th_topology_ids = __kmp_affinity.ids[th->th.th_new_place];
4809 th->th.th_topology_attrs = __kmp_affinity.attrs[th->th.th_new_place];
4810 }
4811 }
4812
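/* In rough terms, __kmp_partition_places() below behaves as follows, given a
   primary thread bound to place p within its partition [first,last]:
   - proc_bind_primary: every worker gets the same partition and is bound to p;
   - proc_bind_close:   workers are bound to consecutive places after p,
                        wrapping around within the partition (or, with more
                        threads than places, spread as evenly as possible
                        over the places);
   - proc_bind_spread:  the partition is subdivided into per-thread
                        sub-partitions and each thread is bound to the first
                        place of its own sub-partition.
   The wrap-around place advance used throughout is equivalent to this
   stand-alone sketch (hypothetical helper, not part of the runtime): */
#if 0
static int demo_next_place(int place, int first_place, int last_place,
                           int num_masks) {
  if (place == last_place)
    return first_place; // wrap within the partition
  if (place == num_masks - 1)
    return 0; // wrap around the global place list
  return place + 1;
}
#endif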
4813 // __kmp_partition_places() is the heart of the OpenMP 4.0 affinity mechanism.
4814 // It calculates the worker + primary thread's partition based upon the parent
4815 // thread's partition, and binds each worker to a place in its partition.
4816 // The primary thread's partition should already include its current binding.
4817 static void __kmp_partition_places(kmp_team_t *team, int update_master_only) {
4818 // Do not partition places for the hidden helper team
4819 if (KMP_HIDDEN_HELPER_TEAM(team))
4820 return;
4821 // Copy the primary thread's place partition to the team struct
4822 kmp_info_t *master_th = team->t.t_threads[0];
4823 KMP_DEBUG_ASSERT(master_th != NULL);
4824 kmp_proc_bind_t proc_bind = team->t.t_proc_bind;
4825 int first_place = master_th->th.th_first_place;
4826 int last_place = master_th->th.th_last_place;
4827 int masters_place = master_th->th.th_current_place;
4828 int num_masks = __kmp_affinity.num_masks;
4829 team->t.t_first_place = first_place;
4830 team->t.t_last_place = last_place;
4831
4832 KA_TRACE(20, ("__kmp_partition_places: enter: proc_bind = %d T#%d(%d:0) "
4833 "bound to place %d partition = [%d,%d]\n",
4834 proc_bind, __kmp_gtid_from_thread(team->t.t_threads[0]),
4835 team->t.t_id, masters_place, first_place, last_place));
4836
4837 switch (proc_bind) {
4838
4839 case proc_bind_default:
4840 // Serial teams might have the proc_bind policy set to proc_bind_default.
4841 // Not an issue -- we don't rebind primary thread for any proc_bind policy.
4842 KMP_DEBUG_ASSERT(team->t.t_nproc == 1);
4843 break;
4844
4845 case proc_bind_primary: {
4846 int f;
4847 int n_th = team->t.t_nproc;
4848 for (f = 1; f < n_th; f++) {
4849 kmp_info_t *th = team->t.t_threads[f];
4850 KMP_DEBUG_ASSERT(th != NULL);
4851 __kmp_set_thread_place(team, th, first_place, last_place, masters_place);
4852
4853 KA_TRACE(100, ("__kmp_partition_places: primary: T#%d(%d:%d) place %d "
4854 "partition = [%d,%d]\n",
4855 __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id,
4856 f, masters_place, first_place, last_place));
4857 }
4858 } break;
4859
4860 case proc_bind_close: {
4861 int f;
4862 int n_th = team->t.t_nproc;
4863 int n_places;
4864 if (first_place <= last_place) {
4865 n_places = last_place - first_place + 1;
4866 } else {
4867 n_places = num_masks - first_place + last_place + 1;
4868 }
4869 if (n_th <= n_places) {
4870 int place = masters_place;
4871 for (f = 1; f < n_th; f++) {
4872 kmp_info_t *th = team->t.t_threads[f];
4873 KMP_DEBUG_ASSERT(th != NULL);
4874
4875 if (place == last_place) {
4876 place = first_place;
4877 } else if (place == (num_masks - 1)) {
4878 place = 0;
4879 } else {
4880 place++;
4881 }
4882 __kmp_set_thread_place(team, th, first_place, last_place, place);
4883
4884 KA_TRACE(100, ("__kmp_partition_places: close: T#%d(%d:%d) place %d "
4885 "partition = [%d,%d]\n",
4886 __kmp_gtid_from_thread(team->t.t_threads[f]),
4887 team->t.t_id, f, place, first_place, last_place));
4888 }
4889 } else {
4890 int S, rem, gap, s_count;
4891 S = n_th / n_places;
4892 s_count = 0;
4893 rem = n_th - (S * n_places);
4894 gap = rem > 0 ? n_places / rem : n_places;
4895 int place = masters_place;
4896 int gap_ct = gap;
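/* For example, with n_th = 10 threads over n_places = 4 places starting at the
   primary thread's place: S = 2, rem = 2, gap = 2, so the primary's place and
   every second place after it receive S + 1 = 3 threads, the other places
   receive S = 2, and the loop finishes with 'place' wrapped back to the
   primary's place, as the assertion below checks. */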
4897 for (f = 0; f < n_th; f++) {
4898 kmp_info_t *th = team->t.t_threads[f];
4899 KMP_DEBUG_ASSERT(th != NULL);
4900
4901 __kmp_set_thread_place(team, th, first_place, last_place, place);
4902 s_count++;
4903
4904 if ((s_count == S) && rem && (gap_ct == gap)) {
4905 // do nothing, add an extra thread to place on next iteration
4906 } else if ((s_count == S + 1) && rem && (gap_ct == gap)) {
4907 // we added an extra thread to this place; move to next place
4908 if (place == last_place) {
4909 place = first_place;
4910 } else if (place == (num_masks - 1)) {
4911 place = 0;
4912 } else {
4913 place++;
4914 }
4915 s_count = 0;
4916 gap_ct = 1;
4917 rem--;
4918 } else if (s_count == S) { // place full; don't add extra
4919 if (place == last_place) {
4920 place = first_place;
4921 } else if (place == (num_masks - 1)) {
4922 place = 0;
4923 } else {
4924 place++;
4925 }
4926 gap_ct++;
4927 s_count = 0;
4928 }
4929
4930 KA_TRACE(100,
4931 ("__kmp_partition_places: close: T#%d(%d:%d) place %d "
4932 "partition = [%d,%d]\n",
4933 __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id, f,
4934 th->th.th_new_place, first_place, last_place));
4935 }
4936 KMP_DEBUG_ASSERT(place == masters_place);
4937 }
4938 } break;
4939
4940 case proc_bind_spread: {
4941 int f;
4942 int n_th = team->t.t_nproc;
4943 int n_places;
4944 int thidx;
4945 if (first_place <= last_place) {
4946 n_places = last_place - first_place + 1;
4947 } else {
4948 n_places = num_masks - first_place + last_place + 1;
4949 }
4950 if (n_th <= n_places) {
4951 int place = -1;
4952
4953 if (n_places != num_masks) {
4954 int S = n_places / n_th;
4955 int s_count, rem, gap, gap_ct;
4956
4957 place = masters_place;
4958 rem = n_places - n_th * S;
4959 gap = rem ? n_th / rem : 1;
4960 gap_ct = gap;
4961 thidx = n_th;
4962 if (update_master_only == 1)
4963 thidx = 1;
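/* For example (hypothetical numbers): with a partition of n_places = 8 places,
   n_th = 3 and the primary thread on the partition's first place, S = 2,
   rem = 2 and gap = 1, so the loop below assigns the sub-partitions [0,2],
   [3,5] and [6,7] and binds the threads to places 0, 3 and 6, i.e. to the
   first place of their own sub-partition. */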
4964 for (f = 0; f < thidx; f++) {
4965 kmp_info_t *th = team->t.t_threads[f];
4966 KMP_DEBUG_ASSERT(th != NULL);
4967
4968 int fplace = place, nplace = place;
4969 s_count = 1;
4970 while (s_count < S) {
4971 if (place == last_place) {
4972 place = first_place;
4973 } else if (place == (num_masks - 1)) {
4974 place = 0;
4975 } else {
4976 place++;
4977 }
4978 s_count++;
4979 }
4980 if (rem && (gap_ct == gap)) {
4981 if (place == last_place) {
4982 place = first_place;
4983 } else if (place == (num_masks - 1)) {
4984 place = 0;
4985 } else {
4986 place++;
4987 }
4988 rem--;
4989 gap_ct = 0;
4990 }
4991 __kmp_set_thread_place(team, th, fplace, place, nplace);
4992 gap_ct++;
4993
4994 if (place == last_place) {
4995 place = first_place;
4996 } else if (place == (num_masks - 1)) {
4997 place = 0;
4998 } else {
4999 place++;
5000 }
5001
5002 KA_TRACE(100,
5003 ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
5004 "partition = [%d,%d], num_masks: %u\n",
5005 __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id,
5006 f, th->th.th_new_place, th->th.th_first_place,
5007 th->th.th_last_place, num_masks));
5008 }
5009 } else {
5010 /* Given a uniform space of available computation places, we can create
5011 T partitions of round(P/T) size and put threads into the first
5012 place of each partition. */
5013 double current = static_cast<double>(masters_place);
5014 double spacing =
5015 (static_cast<double>(n_places + 1) / static_cast<double>(n_th));
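/* For example (hypothetical numbers): with n_places = 8, n_th = 4 and the
   primary thread on place 0, spacing = 9/4 = 2.25 and the loop below produces
   the partitions [0,1], [2,3], [4,5] and [6,7], binding each thread to the
   first place of its partition. */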
5016 int first, last;
5017 kmp_info_t *th;
5018
5019 thidx = n_th + 1;
5020 if (update_master_only == 1)
5021 thidx = 1;
5022 for (f = 0; f < thidx; f++) {
5023 first = static_cast<int>(current);
5024 last = static_cast<int>(current + spacing) - 1;
5025 KMP_DEBUG_ASSERT(last >= first);
5026 if (first >= n_places) {
5027 if (masters_place) {
5028 first -= n_places;
5029 last -= n_places;
5030 if (first == (masters_place + 1)) {
5031 KMP_DEBUG_ASSERT(f == n_th);
5032 first--;
5033 }
5034 if (last == masters_place) {
5035 KMP_DEBUG_ASSERT(f == (n_th - 1));
5036 last--;
5037 }
5038 } else {
5039 KMP_DEBUG_ASSERT(f == n_th);
5040 first = 0;
5041 last = 0;
5042 }
5043 }
5044 if (last >= n_places) {
5045 last = (n_places - 1);
5046 }
5047 place = first;
5048 current += spacing;
5049 if (f < n_th) {
5050 KMP_DEBUG_ASSERT(0 <= first);
5051 KMP_DEBUG_ASSERT(n_places > first);
5052 KMP_DEBUG_ASSERT(0 <= last);
5053 KMP_DEBUG_ASSERT(n_places > last);
5054 KMP_DEBUG_ASSERT(last_place >= first_place);
5055 th = team->t.t_threads[f];
5056 KMP_DEBUG_ASSERT(th);
5057 __kmp_set_thread_place(team, th, first, last, place);
5058 KA_TRACE(100,
5059 ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
5060 "partition = [%d,%d], spacing = %.4f\n",
5061 __kmp_gtid_from_thread(team->t.t_threads[f]),
5062 team->t.t_id, f, th->th.th_new_place,
5063 th->th.th_first_place, th->th.th_last_place, spacing));
5064 }
5065 }
5066 }
5067 KMP_DEBUG_ASSERT(update_master_only || place == masters_place);
5068 } else {
5069 int S, rem, gap, s_count;
5070 S = n_th / n_places;
5071 s_count = 0;
5072 rem = n_th - (S * n_places);
5073 gap = rem > 0 ? n_places / rem : n_places;
5074 int place = masters_place;
5075 int gap_ct = gap;
5076 thidx = n_th;
5077 if (update_master_only == 1)
5078 thidx = 1;
5079 for (f = 0; f < thidx; f++) {
5080 kmp_info_t *th = team->t.t_threads[f];
5081 KMP_DEBUG_ASSERT(th != NULL);
5082
5083 __kmp_set_thread_place(team, th, place, place, place);
5084 s_count++;
5085
5086 if ((s_count == S) && rem && (gap_ct == gap)) {
5087 // do nothing, add an extra thread to place on next iteration
5088 } else if ((s_count == S + 1) && rem && (gap_ct == gap)) {
5089 // we added an extra thread to this place; move on to next place
5090 if (place == last_place) {
5091 place = first_place;
5092 } else if (place == (num_masks - 1)) {
5093 place = 0;
5094 } else {
5095 place++;
5096 }
5097 s_count = 0;
5098 gap_ct = 1;
5099 rem--;
5100 } else if (s_count == S) { // place is full; don't add extra thread
5101 if (place == last_place) {
5102 place = first_place;
5103 } else if (place == (num_masks - 1)) {
5104 place = 0;
5105 } else {
5106 place++;
5107 }
5108 gap_ct++;
5109 s_count = 0;
5110 }
5111
5112 KA_TRACE(100, ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
5113 "partition = [%d,%d]\n",
5114 __kmp_gtid_from_thread(team->t.t_threads[f]),
5115 team->t.t_id, f, th->th.th_new_place,
5116 th->th.th_first_place, th->th.th_last_place));
5117 }
5118 KMP_DEBUG_ASSERT(update_master_only || place == masters_place);
5119 }
5120 } break;
5121
5122 default:
5123 break;
5124 }
5125
5126 KA_TRACE(20, ("__kmp_partition_places: exit T#%d\n", team->t.t_id));
5127 }
5128
5129 #endif // KMP_AFFINITY_SUPPORTED
5130
5131 /* allocate a new team data structure to use. take one off of the free pool if
5132 available */
5133 kmp_team_t *
5134 __kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc,
5135 #if OMPT_SUPPORT
5136 ompt_data_t ompt_parallel_data,
5137 #endif
5138 kmp_proc_bind_t new_proc_bind,
5139 kmp_internal_control_t *new_icvs,
5140 int argc USE_NESTED_HOT_ARG(kmp_info_t *master)) {
5141 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_allocate_team);
5142 int f;
5143 kmp_team_t *team;
5144 int use_hot_team = !root->r.r_active;
5145 int level = 0;
5146 int do_place_partition = 1;
5147
5148 KA_TRACE(20, ("__kmp_allocate_team: called\n"));
5149 KMP_DEBUG_ASSERT(new_nproc >= 1 && argc >= 0);
5150 KMP_DEBUG_ASSERT(max_nproc >= new_nproc);
5151 KMP_MB();
5152
5153 #if KMP_NESTED_HOT_TEAMS
5154 kmp_hot_team_ptr_t *hot_teams;
5155 if (master) {
5156 team = master->th.th_team;
5157 level = team->t.t_active_level;
5158 if (master->th.th_teams_microtask) { // in teams construct?
5159 if (master->th.th_teams_size.nteams > 1 &&
5160 ( // #teams > 1
5161 team->t.t_pkfn ==
5162 (microtask_t)__kmp_teams_master || // inner fork of the teams
5163 master->th.th_teams_level <
5164 team->t.t_level)) { // or nested parallel inside the teams
5165 ++level; // do not increment if #teams==1 or for the outer fork of the teams;
5166 // increment otherwise
5167 }
5168 // Do not perform the place partition if inner fork of the teams
5169 // Wait until nested parallel region encountered inside teams construct
5170 if ((master->th.th_teams_size.nteams == 1 &&
5171 master->th.th_teams_level >= team->t.t_level) ||
5172 (team->t.t_pkfn == (microtask_t)__kmp_teams_master))
5173 do_place_partition = 0;
5174 }
5175 hot_teams = master->th.th_hot_teams;
5176 if (level < __kmp_hot_teams_max_level && hot_teams &&
5177 hot_teams[level].hot_team) {
5178 // hot team has already been allocated for given level
5179 use_hot_team = 1;
5180 } else {
5181 use_hot_team = 0;
5182 }
5183 } else {
5184 // check we won't access uninitialized hot_teams, just in case
5185 KMP_DEBUG_ASSERT(new_nproc == 1);
5186 }
5187 #endif
5188 // Optimization to use a "hot" team
5189 if (use_hot_team && new_nproc > 1) {
5190 KMP_DEBUG_ASSERT(new_nproc <= max_nproc);
5191 #if KMP_NESTED_HOT_TEAMS
5192 team = hot_teams[level].hot_team;
5193 #else
5194 team = root->r.r_hot_team;
5195 #endif
5196 #if KMP_DEBUG
5197 if (__kmp_tasking_mode != tskm_immediate_exec) {
5198 KA_TRACE(20, ("__kmp_allocate_team: hot team task_team[0] = %p "
5199 "task_team[1] = %p before reinit\n",
5200 team->t.t_task_team[0], team->t.t_task_team[1]));
5201 }
5202 #endif
5203
5204 if (team->t.t_nproc != new_nproc &&
5205 __kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5206 // Distributed barrier may need a resize
5207 int old_nthr = team->t.t_nproc;
5208 __kmp_resize_dist_barrier(team, old_nthr, new_nproc);
5209 }
5210
5211 // If not doing the place partition, then reset the team's proc bind
5212 // to indicate that partitioning of all threads still needs to take place
5213 if (do_place_partition == 0)
5214 team->t.t_proc_bind = proc_bind_default;
5215 // Has the number of threads changed?
5216 /* Let's assume the most common case is that the number of threads is
5217 unchanged, and put that case first. */
5218 if (team->t.t_nproc == new_nproc) { // Check changes in number of threads
5219 KA_TRACE(20, ("__kmp_allocate_team: reusing hot team\n"));
5220 // This case can mean that omp_set_num_threads() was called and the hot
5221 // team size was already reduced, so we check the special flag
5222 if (team->t.t_size_changed == -1) {
5223 team->t.t_size_changed = 1;
5224 } else {
5225 KMP_CHECK_UPDATE(team->t.t_size_changed, 0);
5226 }
5227
5228 // TODO???: team->t.t_max_active_levels = new_max_active_levels;
5229 kmp_r_sched_t new_sched = new_icvs->sched;
5230 // set primary thread's schedule as new run-time schedule
5231 KMP_CHECK_UPDATE(team->t.t_sched.sched, new_sched.sched);
5232
5233 __kmp_reinitialize_team(team, new_icvs,
5234 root->r.r_uber_thread->th.th_ident);
5235
5236 KF_TRACE(10, ("__kmp_allocate_team2: T#%d, this_thread=%p team=%p\n", 0,
5237 team->t.t_threads[0], team));
5238 __kmp_push_current_task_to_thread(team->t.t_threads[0], team, 0);
5239
5240 #if KMP_AFFINITY_SUPPORTED
5241 if ((team->t.t_size_changed == 0) &&
5242 (team->t.t_proc_bind == new_proc_bind)) {
5243 if (new_proc_bind == proc_bind_spread) {
5244 if (do_place_partition) {
5245 // add flag to update only master for spread
5246 __kmp_partition_places(team, 1);
5247 }
5248 }
5249 KA_TRACE(200, ("__kmp_allocate_team: reusing hot team #%d bindings: "
5250 "proc_bind = %d, partition = [%d,%d]\n",
5251 team->t.t_id, new_proc_bind, team->t.t_first_place,
5252 team->t.t_last_place));
5253 } else {
5254 if (do_place_partition) {
5255 KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5256 __kmp_partition_places(team);
5257 }
5258 }
5259 #else
5260 KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5261 #endif /* KMP_AFFINITY_SUPPORTED */
5262 } else if (team->t.t_nproc > new_nproc) {
5263 KA_TRACE(20,
5264 ("__kmp_allocate_team: decreasing hot team thread count to %d\n",
5265 new_nproc));
5266
5267 team->t.t_size_changed = 1;
5268 if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5269 // Barrier size already reduced earlier in this function
5270 // Activate team threads via th_used_in_team
5271 __kmp_add_threads_to_team(team, new_nproc);
5272 }
5273 // When decreasing team size, threads no longer in the team should
5274 // unref task team.
5275 if (__kmp_tasking_mode != tskm_immediate_exec) {
5276 for (f = new_nproc; f < team->t.t_nproc; f++) {
5277 kmp_info_t *th = team->t.t_threads[f];
5278 KMP_DEBUG_ASSERT(th);
5279 th->th.th_task_team = NULL;
5280 }
5281 }
5282 #if KMP_NESTED_HOT_TEAMS
5283 if (__kmp_hot_teams_mode == 0) {
5284 // AC: the saved number of threads should correspond to the team's value in
5285 // this mode; it can be bigger in mode 1, when the hot team has threads in reserve
5286 KMP_DEBUG_ASSERT(hot_teams[level].hot_team_nth == team->t.t_nproc);
5287 hot_teams[level].hot_team_nth = new_nproc;
5288 #endif // KMP_NESTED_HOT_TEAMS
5289 /* release the extra threads we don't need any more */
5290 for (f = new_nproc; f < team->t.t_nproc; f++) {
5291 KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5292 __kmp_free_thread(team->t.t_threads[f]);
5293 team->t.t_threads[f] = NULL;
5294 }
5295 #if KMP_NESTED_HOT_TEAMS
5296 } // (__kmp_hot_teams_mode == 0)
5297 else {
5298 // When keeping extra threads in team, switch threads to wait on own
5299 // b_go flag
5300 for (f = new_nproc; f < team->t.t_nproc; ++f) {
5301 KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5302 kmp_balign_t *balign = team->t.t_threads[f]->th.th_bar;
5303 for (int b = 0; b < bs_last_barrier; ++b) {
5304 if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG) {
5305 balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG;
5306 }
5307 KMP_CHECK_UPDATE(balign[b].bb.leaf_kids, 0);
5308 }
5309 }
5310 }
5311 #endif // KMP_NESTED_HOT_TEAMS
5312 team->t.t_nproc = new_nproc;
5313 // TODO???: team->t.t_max_active_levels = new_max_active_levels;
5314 KMP_CHECK_UPDATE(team->t.t_sched.sched, new_icvs->sched.sched);
5315 __kmp_reinitialize_team(team, new_icvs,
5316 root->r.r_uber_thread->th.th_ident);
5317
5318 // Update remaining threads
5319 for (f = 0; f < new_nproc; ++f) {
5320 team->t.t_threads[f]->th.th_team_nproc = new_nproc;
5321 }
5322
5323 // restore the current task state of the primary thread: should be the
5324 // implicit task
5325 KF_TRACE(10, ("__kmp_allocate_team: T#%d, this_thread=%p team=%p\n", 0,
5326 team->t.t_threads[0], team));
5327
5328 __kmp_push_current_task_to_thread(team->t.t_threads[0], team, 0);
5329
5330 #ifdef KMP_DEBUG
5331 for (f = 0; f < team->t.t_nproc; f++) {
5332 KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
5333 team->t.t_threads[f]->th.th_team_nproc ==
5334 team->t.t_nproc);
5335 }
5336 #endif
5337
5338 if (do_place_partition) {
5339 KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5340 #if KMP_AFFINITY_SUPPORTED
5341 __kmp_partition_places(team);
5342 #endif
5343 }
5344 } else { // team->t.t_nproc < new_nproc
5345
5346 KA_TRACE(20,
5347 ("__kmp_allocate_team: increasing hot team thread count to %d\n",
5348 new_nproc));
5349 int old_nproc = team->t.t_nproc; // save the old value; used below to update only the new threads
5350 team->t.t_size_changed = 1;
5351
5352 #if KMP_NESTED_HOT_TEAMS
5353 int avail_threads = hot_teams[level].hot_team_nth;
5354 if (new_nproc < avail_threads)
5355 avail_threads = new_nproc;
5356 kmp_info_t **other_threads = team->t.t_threads;
5357 for (f = team->t.t_nproc; f < avail_threads; ++f) {
5358 // Adjust barrier data of reserved threads (if any) of the team
5359 // Other data will be set in __kmp_initialize_info() below.
5360 int b;
5361 kmp_balign_t *balign = other_threads[f]->th.th_bar;
5362 for (b = 0; b < bs_last_barrier; ++b) {
5363 balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5364 KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
5365 #if USE_DEBUGGER
5366 balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5367 #endif
5368 }
5369 }
5370 if (hot_teams[level].hot_team_nth >= new_nproc) {
5371 // we have all needed threads in reserve, no need to allocate any
5372 // this is only possible in mode 1; there cannot be reserved threads in mode 0
5373 KMP_DEBUG_ASSERT(__kmp_hot_teams_mode == 1);
5374 team->t.t_nproc = new_nproc; // just get reserved threads involved
5375 } else {
5376 // We may have some threads in reserve, but not enough;
5377 // get reserved threads involved if any.
5378 team->t.t_nproc = hot_teams[level].hot_team_nth;
5379 hot_teams[level].hot_team_nth = new_nproc; // adjust hot team max size
5380 #endif // KMP_NESTED_HOT_TEAMS
5381 if (team->t.t_max_nproc < new_nproc) {
5382 /* reallocate larger arrays */
5383 __kmp_reallocate_team_arrays(team, new_nproc);
5384 __kmp_reinitialize_team(team, new_icvs, NULL);
5385 }
5386
5387 #if (KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_NETBSD || KMP_OS_DRAGONFLY) && \
5388 KMP_AFFINITY_SUPPORTED
5389 /* Temporarily set full mask for primary thread before creation of
5390 workers. The reason is that workers inherit the affinity from the
5391 primary thread, so if a lot of workers are created on the single
5392 core quickly, they don't get a chance to set their own affinity for
5393 a long time. */
5394 kmp_affinity_raii_t new_temp_affinity{__kmp_affin_fullMask};
5395 #endif
5396
5397 /* allocate new threads for the hot team */
5398 for (f = team->t.t_nproc; f < new_nproc; f++) {
5399 kmp_info_t *new_worker = __kmp_allocate_thread(root, team, f);
5400 KMP_DEBUG_ASSERT(new_worker);
5401 team->t.t_threads[f] = new_worker;
5402
5403 KA_TRACE(20,
5404 ("__kmp_allocate_team: team %d init T#%d arrived: "
5405 "join=%llu, plain=%llu\n",
5406 team->t.t_id, __kmp_gtid_from_tid(f, team), team->t.t_id, f,
5407 team->t.t_bar[bs_forkjoin_barrier].b_arrived,
5408 team->t.t_bar[bs_plain_barrier].b_arrived));
5409
5410 { // Initialize barrier data for new threads.
5411 int b;
5412 kmp_balign_t *balign = new_worker->th.th_bar;
5413 for (b = 0; b < bs_last_barrier; ++b) {
5414 balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5415 KMP_DEBUG_ASSERT(balign[b].bb.wait_flag !=
5416 KMP_BARRIER_PARENT_FLAG);
5417 #if USE_DEBUGGER
5418 balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5419 #endif
5420 }
5421 }
5422 }
5423
5424 #if (KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_NETBSD || KMP_OS_DRAGONFLY) && \
5425 KMP_AFFINITY_SUPPORTED
5426 /* Restore initial primary thread's affinity mask */
5427 new_temp_affinity.restore();
5428 #endif
5429 #if KMP_NESTED_HOT_TEAMS
5430 } // end of check of t_nproc vs. new_nproc vs. hot_team_nth
5431 #endif // KMP_NESTED_HOT_TEAMS
5432 if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5433 // Barrier size already increased earlier in this function
5434 // Activate team threads via th_used_in_team
5435 __kmp_add_threads_to_team(team, new_nproc);
5436 }
5437 /* make sure everyone is synchronized */
5438 // new threads below
5439 __kmp_initialize_team(team, new_nproc, new_icvs,
5440 root->r.r_uber_thread->th.th_ident);
5441
5442 /* reinitialize the threads */
5443 KMP_DEBUG_ASSERT(team->t.t_nproc == new_nproc);
5444 for (f = 0; f < team->t.t_nproc; ++f)
5445 __kmp_initialize_info(team->t.t_threads[f], team, f,
5446 __kmp_gtid_from_tid(f, team));
5447
5448 // set th_task_state for new threads in hot team with older thread's state
5449 kmp_uint8 old_state = team->t.t_threads[old_nproc - 1]->th.th_task_state;
5450 for (f = old_nproc; f < team->t.t_nproc; ++f)
5451 team->t.t_threads[f]->th.th_task_state = old_state;
5452
5453 #ifdef KMP_DEBUG
5454 for (f = 0; f < team->t.t_nproc; ++f) {
5455 KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
5456 team->t.t_threads[f]->th.th_team_nproc ==
5457 team->t.t_nproc);
5458 }
5459 #endif
5460
5461 if (do_place_partition) {
5462 KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5463 #if KMP_AFFINITY_SUPPORTED
5464 __kmp_partition_places(team);
5465 #endif
5466 }
5467 } // Check changes in number of threads
5468
5469 if (master->th.th_teams_microtask) {
5470 for (f = 1; f < new_nproc; ++f) {
5471 // propagate teams construct specific info to workers
5472 kmp_info_t *thr = team->t.t_threads[f];
5473 thr->th.th_teams_microtask = master->th.th_teams_microtask;
5474 thr->th.th_teams_level = master->th.th_teams_level;
5475 thr->th.th_teams_size = master->th.th_teams_size;
5476 }
5477 }
5478 #if KMP_NESTED_HOT_TEAMS
5479 if (level) {
5480 // Sync barrier state for nested hot teams, not needed for outermost hot
5481 // team.
5482 for (f = 1; f < new_nproc; ++f) {
5483 kmp_info_t *thr = team->t.t_threads[f];
5484 int b;
5485 kmp_balign_t *balign = thr->th.th_bar;
5486 for (b = 0; b < bs_last_barrier; ++b) {
5487 balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5488 KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
5489 #if USE_DEBUGGER
5490 balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5491 #endif
5492 }
5493 }
5494 }
5495 #endif // KMP_NESTED_HOT_TEAMS
5496
5497 /* reallocate space for arguments if necessary */
5498 __kmp_alloc_argv_entries(argc, team, TRUE);
5499 KMP_CHECK_UPDATE(team->t.t_argc, argc);
5500 // The hot team re-uses the previous task team,
5501 // if untouched during the previous release->gather phase.
5502
5503 KF_TRACE(10, (" hot_team = %p\n", team));
5504
5505 #if KMP_DEBUG
5506 if (__kmp_tasking_mode != tskm_immediate_exec) {
5507 KA_TRACE(20, ("__kmp_allocate_team: hot team task_team[0] = %p "
5508 "task_team[1] = %p after reinit\n",
5509 team->t.t_task_team[0], team->t.t_task_team[1]));
5510 }
5511 #endif
5512
5513 #if OMPT_SUPPORT
5514 __ompt_team_assign_id(team, ompt_parallel_data);
5515 #endif
5516
5517 KMP_MB();
5518
5519 return team;
5520 }
5521
5522 /* next, let's try to take one from the team pool */
5523 KMP_MB();
5524 for (team = CCAST(kmp_team_t *, __kmp_team_pool); (team);) {
5525 /* TODO: consider resizing undersized teams instead of reaping them, now
5526 that we have a resizing mechanism */
5527 if (team->t.t_max_nproc >= max_nproc) {
5528 /* take this team from the team pool */
5529 __kmp_team_pool = team->t.t_next_pool;
5530
5531 if (max_nproc > 1 &&
5532 __kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5533 if (!team->t.b) { // Allocate barrier structure
5534 team->t.b = distributedBarrier::allocate(__kmp_dflt_team_nth_ub);
5535 }
5536 }
5537
5538 /* setup the team for fresh use */
5539 __kmp_initialize_team(team, new_nproc, new_icvs, NULL);
5540
5541 KA_TRACE(20, ("__kmp_allocate_team: setting task_team[0] %p and "
5542 "task_team[1] %p to NULL\n",
5543 &team->t.t_task_team[0], &team->t.t_task_team[1]));
5544 team->t.t_task_team[0] = NULL;
5545 team->t.t_task_team[1] = NULL;
5546
5547 /* reallocate space for arguments if necessary */
5548 __kmp_alloc_argv_entries(argc, team, TRUE);
5549 KMP_CHECK_UPDATE(team->t.t_argc, argc);
5550
5551 KA_TRACE(
5552 20, ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n",
5553 team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
5554 { // Initialize barrier data.
5555 int b;
5556 for (b = 0; b < bs_last_barrier; ++b) {
5557 team->t.t_bar[b].b_arrived = KMP_INIT_BARRIER_STATE;
5558 #if USE_DEBUGGER
5559 team->t.t_bar[b].b_master_arrived = 0;
5560 team->t.t_bar[b].b_team_arrived = 0;
5561 #endif
5562 }
5563 }
5564
5565 team->t.t_proc_bind = new_proc_bind;
5566
5567 KA_TRACE(20, ("__kmp_allocate_team: using team from pool %d.\n",
5568 team->t.t_id));
5569
5570 #if OMPT_SUPPORT
5571 __ompt_team_assign_id(team, ompt_parallel_data);
5572 #endif
5573
5574 team->t.t_nested_nth = NULL;
5575
5576 KMP_MB();
5577
5578 return team;
5579 }
5580
5581 /* reap team if it is too small, then loop back and check the next one */
5582 // not sure if this is wise, but, will be redone during the hot-teams
5583 // rewrite.
5584 /* TODO: Use technique to find the right size hot-team, don't reap them */
5585 team = __kmp_reap_team(team);
5586 __kmp_team_pool = team;
5587 }
5588
5589 /* nothing available in the pool, no matter, make a new team! */
5590 KMP_MB();
5591 team = (kmp_team_t *)__kmp_allocate(sizeof(kmp_team_t));
5592
5593 /* and set it up */
5594 team->t.t_max_nproc = max_nproc;
5595 if (max_nproc > 1 &&
5596 __kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5597 // Allocate barrier structure
5598 team->t.b = distributedBarrier::allocate(__kmp_dflt_team_nth_ub);
5599 }
5600
5601 /* NOTE well, for some reason allocating one big buffer and dividing it up
5602 seems to really hurt performance a lot on the P4, so, let's not use this */
5603 __kmp_allocate_team_arrays(team, max_nproc);
5604
5605 KA_TRACE(20, ("__kmp_allocate_team: making a new team\n"));
5606 __kmp_initialize_team(team, new_nproc, new_icvs, NULL);
5607
5608 KA_TRACE(20, ("__kmp_allocate_team: setting task_team[0] %p and task_team[1] "
5609 "%p to NULL\n",
5610 &team->t.t_task_team[0], &team->t.t_task_team[1]));
5611 team->t.t_task_team[0] = NULL; // to be removed, as __kmp_allocate zeroes
5612 // memory, no need to duplicate
5613 team->t.t_task_team[1] = NULL; // to be removed, as __kmp_allocate zeroes
5614 // memory, no need to duplicate
5615
5616 if (__kmp_storage_map) {
5617 __kmp_print_team_storage_map("team", team, team->t.t_id, new_nproc);
5618 }
5619
5620 /* allocate space for arguments */
5621 __kmp_alloc_argv_entries(argc, team, FALSE);
5622 team->t.t_argc = argc;
5623
5624 KA_TRACE(20,
5625 ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n",
5626 team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
5627 { // Initialize barrier data.
5628 int b;
5629 for (b = 0; b < bs_last_barrier; ++b) {
5630 team->t.t_bar[b].b_arrived = KMP_INIT_BARRIER_STATE;
5631 #if USE_DEBUGGER
5632 team->t.t_bar[b].b_master_arrived = 0;
5633 team->t.t_bar[b].b_team_arrived = 0;
5634 #endif
5635 }
5636 }
5637
5638 team->t.t_proc_bind = new_proc_bind;
5639
5640 #if OMPT_SUPPORT
5641 __ompt_team_assign_id(team, ompt_parallel_data);
5642 team->t.ompt_serialized_team_info = NULL;
5643 #endif
5644
5645 KMP_MB();
5646
5647 team->t.t_nested_nth = NULL;
5648
5649 KA_TRACE(20, ("__kmp_allocate_team: done creating a new team %d.\n",
5650 team->t.t_id));
5651
5652 return team;
5653 }
5654
5655 /* TODO implement hot-teams at all levels */
5656 /* TODO implement lazy thread release on demand (disband request) */
5657
5658 /* free the team. return it to the team pool. release all the threads
5659 * associated with it */
5660 void __kmp_free_team(kmp_root_t *root,
5661 kmp_team_t *team USE_NESTED_HOT_ARG(kmp_info_t *master)) {
5662 int f;
5663 KA_TRACE(20, ("__kmp_free_team: T#%d freeing team %d\n", __kmp_get_gtid(),
5664 team->t.t_id));
5665
5666 /* verify state */
5667 KMP_DEBUG_ASSERT(root);
5668 KMP_DEBUG_ASSERT(team);
5669 KMP_DEBUG_ASSERT(team->t.t_nproc <= team->t.t_max_nproc);
5670 KMP_DEBUG_ASSERT(team->t.t_threads);
5671
5672 int use_hot_team = team == root->r.r_hot_team;
5673 #if KMP_NESTED_HOT_TEAMS
5674 int level;
5675 if (master) {
5676 level = team->t.t_active_level - 1;
5677 if (master->th.th_teams_microtask) { // in teams construct?
5678 if (master->th.th_teams_size.nteams > 1) {
5679 ++level; // level was not increased in teams construct for
5680 // team_of_masters
5681 }
5682 if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
5683 master->th.th_teams_level == team->t.t_level) {
5684 ++level; // level was not increased in teams construct for
5685 // team_of_workers before the parallel
5686 } // team->t.t_level will be increased inside parallel
5687 }
5688 #if KMP_DEBUG
5689 kmp_hot_team_ptr_t *hot_teams = master->th.th_hot_teams;
5690 #endif
5691 if (level < __kmp_hot_teams_max_level) {
5692 KMP_DEBUG_ASSERT(team == hot_teams[level].hot_team);
5693 use_hot_team = 1;
5694 }
5695 }
5696 #endif // KMP_NESTED_HOT_TEAMS
5697
5698 /* team is done working */
5699 TCW_SYNC_PTR(team->t.t_pkfn,
5700 NULL); // Important for Debugging Support Library.
5701 #if KMP_OS_WINDOWS
5702 team->t.t_copyin_counter = 0; // init counter for possible reuse
5703 #endif
5704 // Do not reset pointer to parent team to NULL for hot teams.
5705
5706 /* if we are non-hot team, release our threads */
5707 if (!use_hot_team) {
5708 if (__kmp_tasking_mode != tskm_immediate_exec) {
5709 // Wait for threads to reach reapable state
5710 for (f = 1; f < team->t.t_nproc; ++f) {
5711 KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5712 kmp_info_t *th = team->t.t_threads[f];
5713 volatile kmp_uint32 *state = &th->th.th_reap_state;
5714 while (*state != KMP_SAFE_TO_REAP) {
5715 #if KMP_OS_WINDOWS
5716 // On Windows a thread can be killed at any time, check this
5717 DWORD ecode;
5718 if (!__kmp_is_thread_alive(th, &ecode)) {
5719 *state = KMP_SAFE_TO_REAP; // reset the flag for dead thread
5720 break;
5721 }
5722 #endif
5723 // first check if thread is sleeping
5724 if (th->th.th_sleep_loc)
5725 __kmp_null_resume_wrapper(th);
5726 KMP_CPU_PAUSE();
5727 }
5728 }
5729
5730 // Delete task teams
5731 int tt_idx;
5732 for (tt_idx = 0; tt_idx < 2; ++tt_idx) {
5733 kmp_task_team_t *task_team = team->t.t_task_team[tt_idx];
5734 if (task_team != NULL) {
5735 for (f = 0; f < team->t.t_nproc; ++f) { // threads unref task teams
5736 KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5737 team->t.t_threads[f]->th.th_task_team = NULL;
5738 }
5739 KA_TRACE(
5740 20,
5741 ("__kmp_free_team: T#%d deactivating task_team %p on team %d\n",
5742 __kmp_get_gtid(), task_team, team->t.t_id));
5743 #if KMP_NESTED_HOT_TEAMS
5744 __kmp_free_task_team(master, task_team);
5745 #endif
5746 team->t.t_task_team[tt_idx] = NULL;
5747 }
5748 }
5749 }
5750
5751 // Before clearing parent pointer, check if nested_nth list should be freed
5752 if (team->t.t_nested_nth && team->t.t_nested_nth != &__kmp_nested_nth &&
5753 team->t.t_nested_nth != team->t.t_parent->t.t_nested_nth) {
5754 KMP_INTERNAL_FREE(team->t.t_nested_nth->nth);
5755 KMP_INTERNAL_FREE(team->t.t_nested_nth);
5756 }
5757 team->t.t_nested_nth = NULL;
5758
5759 // Reset pointer to parent team only for non-hot teams.
5760 team->t.t_parent = NULL;
5761 team->t.t_level = 0;
5762 team->t.t_active_level = 0;
5763
5764 /* free the worker threads */
5765 for (f = 1; f < team->t.t_nproc; ++f) {
5766 KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5767 if (__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5768 KMP_COMPARE_AND_STORE_ACQ32(&(team->t.t_threads[f]->th.th_used_in_team),
5769 1, 2);
5770 }
5771 __kmp_free_thread(team->t.t_threads[f]);
5772 }
5773
5774 if (__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5775 if (team->t.b) {
5776 // wake up thread at old location
5777 team->t.b->go_release();
5778 if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
5779 for (f = 1; f < team->t.t_nproc; ++f) {
5780 if (team->t.b->sleep[f].sleep) {
5781 __kmp_atomic_resume_64(
5782 team->t.t_threads[f]->th.th_info.ds.ds_gtid,
5783 (kmp_atomic_flag_64<> *)NULL);
5784 }
5785 }
5786 }
5787 // Wait for threads to be removed from team
5788 for (int f = 1; f < team->t.t_nproc; ++f) {
5789 while (team->t.t_threads[f]->th.th_used_in_team.load() != 0)
5790 KMP_CPU_PAUSE();
5791 }
5792 }
5793 }
5794
5795 for (f = 1; f < team->t.t_nproc; ++f) {
5796 team->t.t_threads[f] = NULL;
5797 }
5798
5799 if (team->t.t_max_nproc > 1 &&
5800 __kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5801 distributedBarrier::deallocate(team->t.b);
5802 team->t.b = NULL;
5803 }
5804 /* put the team back in the team pool */
5805 /* TODO limit size of team pool, call reap_team if pool too large */
5806 team->t.t_next_pool = CCAST(kmp_team_t *, __kmp_team_pool);
5807 __kmp_team_pool = (volatile kmp_team_t *)team;
5808 } else { // Check if team was created for primary threads in teams construct
5809 // See if first worker is a CG root
5810 KMP_DEBUG_ASSERT(team->t.t_threads[1] &&
5811 team->t.t_threads[1]->th.th_cg_roots);
5812 if (team->t.t_threads[1]->th.th_cg_roots->cg_root == team->t.t_threads[1]) {
5813 // Clean up the CG root nodes on workers so that this team can be re-used
5814 for (f = 1; f < team->t.t_nproc; ++f) {
5815 kmp_info_t *thr = team->t.t_threads[f];
5816 KMP_DEBUG_ASSERT(thr && thr->th.th_cg_roots &&
5817 thr->th.th_cg_roots->cg_root == thr);
5818 // Pop current CG root off list
5819 kmp_cg_root_t *tmp = thr->th.th_cg_roots;
5820 thr->th.th_cg_roots = tmp->up;
5821 KA_TRACE(100, ("__kmp_free_team: Thread %p popping node %p and moving"
5822 " up to node %p. cg_nthreads was %d\n",
5823 thr, tmp, thr->th.th_cg_roots, tmp->cg_nthreads));
5824 int i = tmp->cg_nthreads--;
5825 if (i == 1) {
5826 __kmp_free(tmp); // free CG if we are the last thread in it
5827 }
5828 // Restore current task's thread_limit from CG root
5829 if (thr->th.th_cg_roots)
5830 thr->th.th_current_task->td_icvs.thread_limit =
5831 thr->th.th_cg_roots->cg_thread_limit;
5832 }
5833 }
5834 }
5835
5836 KMP_MB();
5837 }
5838
5839 /* reap the team. destroy it, reclaim all its resources and free its memory */
5840 kmp_team_t *__kmp_reap_team(kmp_team_t *team) {
5841 kmp_team_t *next_pool = team->t.t_next_pool;
5842
5843 KMP_DEBUG_ASSERT(team);
5844 KMP_DEBUG_ASSERT(team->t.t_dispatch);
5845 KMP_DEBUG_ASSERT(team->t.t_disp_buffer);
5846 KMP_DEBUG_ASSERT(team->t.t_threads);
5847 KMP_DEBUG_ASSERT(team->t.t_argv);
5848
5849 /* TODO clean the threads that are a part of this? */
5850
5851 /* free stuff */
5852 __kmp_free_team_arrays(team);
5853 if (team->t.t_argv != &team->t.t_inline_argv[0])
5854 __kmp_free((void *)team->t.t_argv);
5855 __kmp_free(team);
5856
5857 KMP_MB();
5858 return next_pool;
5859 }
5860
5861 // Free the thread. Don't reap it, just place it on the pool of available
5862 // threads.
5863 //
5864 // Changes for Quad issue 527845: We need a predictable OMP tid <-> gtid
5865 // binding for the affinity mechanism to be useful.
5866 //
5867 // Now, we always keep the free list (__kmp_thread_pool) sorted by gtid.
5868 // However, we want to avoid a potential performance problem by always
5869 // scanning through the list to find the correct point at which to insert
5870 // the thread (potential N**2 behavior). To do this we keep track of the
5871 // last place a thread struct was inserted (__kmp_thread_pool_insert_pt).
5872 // With single-level parallelism, threads will always be added to the tail
5873 // of the list, kept track of by __kmp_thread_pool_insert_pt. With nested
5874 // parallelism, all bets are off and we may need to scan through the entire
5875 // free list.
5876 //
5877 // This change also has a potentially large performance benefit, for some
5878 // applications. Previously, as threads were freed from the hot team, they
5879 // would be placed back on the free list in inverse order. If the hot team
5880 // grew back to its original size, then the freed thread would be placed
5881 // back on the hot team in reverse order. This could cause bad cache
5882 // locality problems on programs where the size of the hot team regularly
5883 // grew and shrunk.
5884 //
5885 // Now, for single-level parallelism, the OMP tid is always == gtid.
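/* For example, if the pool currently holds the threads with gtids {2, 5, 9}
   and T#7 is freed, the scan below inserts it between T#5 and T#9 and leaves
   __kmp_thread_pool_insert_pt pointing at T#7, so a subsequent free of T#8
   starts scanning from there rather than from the head of the list. The
   sorted insert is equivalent to this stand-alone sketch (hypothetical types
   and names, not part of the runtime): */
#if 0
struct demo_node {
  int gtid;
  demo_node *next;
};
static demo_node *demo_pool = nullptr;      // stands in for __kmp_thread_pool
static demo_node *demo_insert_pt = nullptr; // memoized last insert point

static void demo_insert_sorted(demo_node *n) {
  // If the memoized insert point is already past the new node, rescan from
  // the head of the list.
  if (demo_insert_pt && demo_insert_pt->gtid > n->gtid)
    demo_insert_pt = nullptr;
  demo_node **scan = demo_insert_pt ? &demo_insert_pt->next : &demo_pool;
  while (*scan && (*scan)->gtid < n->gtid)
    scan = &(*scan)->next;
  n->next = *scan;
  *scan = n;
  demo_insert_pt = n; // remember where we inserted for the next call
}
#endif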
5886 void __kmp_free_thread(kmp_info_t *this_th) {
5887 int gtid;
5888 kmp_info_t **scan;
5889
5890 KA_TRACE(20, ("__kmp_free_thread: T#%d putting T#%d back on free pool.\n",
5891 __kmp_get_gtid(), this_th->th.th_info.ds.ds_gtid));
5892
5893 KMP_DEBUG_ASSERT(this_th);
5894
5895 // When moving the thread to the pool, switch it to wait on its own b_go flag
5896 // and on an uninitialized (NULL) team.
5897 int b;
5898 kmp_balign_t *balign = this_th->th.th_bar;
5899 for (b = 0; b < bs_last_barrier; ++b) {
5900 if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG)
5901 balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG;
5902 balign[b].bb.team = NULL;
5903 balign[b].bb.leaf_kids = 0;
5904 }
5905 this_th->th.th_task_state = 0;
5906 this_th->th.th_reap_state = KMP_SAFE_TO_REAP;
5907
5908 /* put thread back on the free pool */
5909 TCW_PTR(this_th->th.th_team, NULL);
5910 TCW_PTR(this_th->th.th_root, NULL);
5911 TCW_PTR(this_th->th.th_dispatch, NULL); /* NOT NEEDED */
5912
5913 while (this_th->th.th_cg_roots) {
5914 this_th->th.th_cg_roots->cg_nthreads--;
5915 KA_TRACE(100, ("__kmp_free_thread: Thread %p decrement cg_nthreads on node"
5916 " %p of thread %p to %d\n",
5917 this_th, this_th->th.th_cg_roots,
5918 this_th->th.th_cg_roots->cg_root,
5919 this_th->th.th_cg_roots->cg_nthreads));
5920 kmp_cg_root_t *tmp = this_th->th.th_cg_roots;
5921 if (tmp->cg_root == this_th) { // Thread is a cg_root
5922 KMP_DEBUG_ASSERT(tmp->cg_nthreads == 0);
5923 KA_TRACE(
5924 5, ("__kmp_free_thread: Thread %p freeing node %p\n", this_th, tmp));
5925 this_th->th.th_cg_roots = tmp->up;
5926 __kmp_free(tmp);
5927 } else { // Worker thread
5928 if (tmp->cg_nthreads == 0) { // last thread leaves contention group
5929 __kmp_free(tmp);
5930 }
5931 this_th->th.th_cg_roots = NULL;
5932 break;
5933 }
5934 }
5935
5936 /* If the implicit task assigned to this thread can be used by other threads,
5937 * multiple threads can share the data and try to free the task in
5938 * __kmp_reap_thread at exit. This duplicate use of the task data is more
5939 * likely when the hot team is disabled, but it can occur even when the hot
5940 * team is enabled. */
5941 __kmp_free_implicit_task(this_th);
5942 this_th->th.th_current_task = NULL;
5943
5944 // If the __kmp_thread_pool_insert_pt is already past the new insert
5945 // point, then we need to re-scan the entire list.
5946 gtid = this_th->th.th_info.ds.ds_gtid;
5947 if (__kmp_thread_pool_insert_pt != NULL) {
5948 KMP_DEBUG_ASSERT(__kmp_thread_pool != NULL);
5949 if (__kmp_thread_pool_insert_pt->th.th_info.ds.ds_gtid > gtid) {
5950 __kmp_thread_pool_insert_pt = NULL;
5951 }
5952 }
5953
5954 // Scan down the list to find the place to insert the thread.
5955 // scan is the address of a link in the list, possibly the address of
5956 // __kmp_thread_pool itself.
5957 //
5958 // In the absence of nested parallelism, the for loop will have 0 iterations.
5959 if (__kmp_thread_pool_insert_pt != NULL) {
5960 scan = &(__kmp_thread_pool_insert_pt->th.th_next_pool);
5961 } else {
5962 scan = CCAST(kmp_info_t **, &__kmp_thread_pool);
5963 }
5964 for (; (*scan != NULL) && ((*scan)->th.th_info.ds.ds_gtid < gtid);
5965 scan = &((*scan)->th.th_next_pool))
5966 ;
5967
5968 // Insert the new element on the list, and set __kmp_thread_pool_insert_pt
5969 // to its address.
5970 TCW_PTR(this_th->th.th_next_pool, *scan);
5971 __kmp_thread_pool_insert_pt = *scan = this_th;
5972 KMP_DEBUG_ASSERT((this_th->th.th_next_pool == NULL) ||
5973 (this_th->th.th_info.ds.ds_gtid <
5974 this_th->th.th_next_pool->th.th_info.ds.ds_gtid));
5975 TCW_4(this_th->th.th_in_pool, TRUE);
5976 __kmp_suspend_initialize_thread(this_th);
5977 __kmp_lock_suspend_mx(this_th);
5978 if (this_th->th.th_active == TRUE) {
5979 KMP_ATOMIC_INC(&__kmp_thread_pool_active_nth);
5980 this_th->th.th_active_in_pool = TRUE;
5981 }
5982 #if KMP_DEBUG
5983 else {
5984 KMP_DEBUG_ASSERT(this_th->th.th_active_in_pool == FALSE);
5985 }
5986 #endif
5987 __kmp_unlock_suspend_mx(this_th);
5988
5989 TCW_4(__kmp_nth, __kmp_nth - 1);
5990
5991 #ifdef KMP_ADJUST_BLOCKTIME
5992 /* Adjust blocktime back to user setting or default if necessary */
5993 /* Middle initialization might never have occurred */
5994 if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
5995 KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
5996 if (__kmp_nth <= __kmp_avail_proc) {
5997 __kmp_zero_bt = FALSE;
5998 }
5999 }
6000 #endif /* KMP_ADJUST_BLOCKTIME */
6001
6002 KMP_MB();
6003 }
6004
6005 /* ------------------------------------------------------------------------ */
6006
6007 void *__kmp_launch_thread(kmp_info_t *this_thr) {
6008 #if OMP_PROFILING_SUPPORT
6009 ProfileTraceFile = getenv("LIBOMPTARGET_PROFILE");
6010 // TODO: add a configuration option for time granularity
6011 if (ProfileTraceFile)
6012 llvm::timeTraceProfilerInitialize(500 /* us */, "libomptarget");
6013 #endif
6014
6015 int gtid = this_thr->th.th_info.ds.ds_gtid;
6016 /* void *stack_data;*/
6017 kmp_team_t **volatile pteam;
6018
6019 KMP_MB();
6020 KA_TRACE(10, ("__kmp_launch_thread: T#%d start\n", gtid));
6021
6022 if (__kmp_env_consistency_check) {
6023 this_thr->th.th_cons = __kmp_allocate_cons_stack(gtid); // ATT: Memory leak?
6024 }
6025
6026 #if OMPD_SUPPORT
6027 if (ompd_state & OMPD_ENABLE_BP)
6028 ompd_bp_thread_begin();
6029 #endif
6030
6031 #if OMPT_SUPPORT
6032 ompt_data_t *thread_data = nullptr;
6033 if (ompt_enabled.enabled) {
6034 thread_data = &(this_thr->th.ompt_thread_info.thread_data);
6035 *thread_data = ompt_data_none;
6036
6037 this_thr->th.ompt_thread_info.state = ompt_state_overhead;
6038 this_thr->th.ompt_thread_info.wait_id = 0;
6039 this_thr->th.ompt_thread_info.idle_frame = OMPT_GET_FRAME_ADDRESS(0);
6040 this_thr->th.ompt_thread_info.parallel_flags = 0;
6041 if (ompt_enabled.ompt_callback_thread_begin) {
6042 ompt_callbacks.ompt_callback(ompt_callback_thread_begin)(
6043 ompt_thread_worker, thread_data);
6044 }
6045 this_thr->th.ompt_thread_info.state = ompt_state_idle;
6046 }
6047 #endif
6048
6049 /* This is the place where threads wait for work */
6050 while (!TCR_4(__kmp_global.g.g_done)) {
6051 KMP_DEBUG_ASSERT(this_thr == __kmp_threads[gtid]);
6052 KMP_MB();
6053
6054 /* wait for work to do */
6055 KA_TRACE(20, ("__kmp_launch_thread: T#%d waiting for work\n", gtid));
6056
6057 /* No tid yet since not part of a team */
6058 __kmp_fork_barrier(gtid, KMP_GTID_DNE);
6059
6060 #if OMPT_SUPPORT
6061 if (ompt_enabled.enabled) {
6062 this_thr->th.ompt_thread_info.state = ompt_state_overhead;
6063 }
6064 #endif
6065
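    // Re-read the team pointer; the primary thread sets up th_team before
    // releasing workers from the fork barrier.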
6066 pteam = &this_thr->th.th_team;
6067
6068 /* have we been allocated? */
6069 if (TCR_SYNC_PTR(*pteam) && !TCR_4(__kmp_global.g.g_done)) {
6070 /* we were just woken up, so run our new task */
6071 if (TCR_SYNC_PTR((*pteam)->t.t_pkfn) != NULL) {
6072 int rc;
6073 KA_TRACE(20,
6074 ("__kmp_launch_thread: T#%d(%d:%d) invoke microtask = %p\n",
6075 gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid),
6076 (*pteam)->t.t_pkfn));
6077
6078 updateHWFPControl(*pteam);
6079
6080 #if OMPT_SUPPORT
6081 if (ompt_enabled.enabled) {
6082 this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
6083 }
6084 #endif
6085
6086 rc = (*pteam)->t.t_invoke(gtid);
6087 KMP_ASSERT(rc);
6088
6089 KMP_MB();
6090 KA_TRACE(20, ("__kmp_launch_thread: T#%d(%d:%d) done microtask = %p\n",
6091 gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid),
6092 (*pteam)->t.t_pkfn));
6093 }
6094 #if OMPT_SUPPORT
6095 if (ompt_enabled.enabled) {
6096 /* no frame set while outside task */
6097 __ompt_get_task_info_object(0)->frame.exit_frame = ompt_data_none;
6098
6099 this_thr->th.ompt_thread_info.state = ompt_state_overhead;
6100 }
6101 #endif
6102 /* join barrier after parallel region */
6103 __kmp_join_barrier(gtid);
6104 }
6105 }
6106
6107 #if OMPD_SUPPORT
6108 if (ompd_state & OMPD_ENABLE_BP)
6109 ompd_bp_thread_end();
6110 #endif
6111
6112 #if OMPT_SUPPORT
6113 if (ompt_enabled.ompt_callback_thread_end) {
6114 ompt_callbacks.ompt_callback(ompt_callback_thread_end)(thread_data);
6115 }
6116 #endif
6117
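  // Drop the task team reference before running the threadprivate destructors.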
6118 this_thr->th.th_task_team = NULL;
6119 /* run the destructors for the threadprivate data for this thread */
6120 __kmp_common_destroy_gtid(gtid);
6121
6122 KA_TRACE(10, ("__kmp_launch_thread: T#%d done\n", gtid));
6123 KMP_MB();
6124
6125 #if OMP_PROFILING_SUPPORT
6126 llvm::timeTraceProfilerFinishThread();
6127 #endif
6128 return this_thr;
6129 }
6130
6131 /* ------------------------------------------------------------------------ */
6132
6133 void __kmp_internal_end_dest(void *specific_gtid) {
6134 // Make sure no significant bits are lost
6135 int gtid;
6136   __kmp_type_convert((kmp_intptr_t)specific_gtid - 1, &gtid);
6137
6138 KA_TRACE(30, ("__kmp_internal_end_dest: T#%d\n", gtid));
6139   /* NOTE: the gtid is stored as gtid+1 in the thread-local-storage
6140 * this is because 0 is reserved for the nothing-stored case */
6141
6142 __kmp_internal_end_thread(gtid);
6143 }
6144
6145 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB
6146
6147 __attribute__((destructor)) void __kmp_internal_end_dtor(void) {
6148 __kmp_internal_end_atexit();
6149 }
6150
6151 #endif
6152
6153 /* [Windows] josh: when the atexit handler is called, there may still be more
6154 than one thread alive */
6155 void __kmp_internal_end_atexit(void) {
6156 KA_TRACE(30, ("__kmp_internal_end_atexit\n"));
6157 /* [Windows]
6158 josh: ideally, we want to completely shutdown the library in this atexit
6159 handler, but stat code that depends on thread specific data for gtid fails
6160 because that data becomes unavailable at some point during the shutdown, so
6161 we call __kmp_internal_end_thread instead. We should eventually remove the
6162 dependency on __kmp_get_specific_gtid in the stat code and use
6163 __kmp_internal_end_library to cleanly shutdown the library.
6164
6165 // TODO: Can some of this comment about GVS be removed?
6166 I suspect that the offending stat code is executed when the calling thread
6167 tries to clean up a dead root thread's data structures, resulting in GVS
6168 code trying to close the GVS structures for that thread, but since the stat
6169 code uses __kmp_get_specific_gtid to get the gtid with the assumption that
6170      the calling thread is cleaning up itself instead of another thread, it gets
6171 confused. This happens because allowing a thread to unregister and cleanup
6172 another thread is a recent modification for addressing an issue.
6173 Based on the current design (20050722), a thread may end up
6174 trying to unregister another thread only if thread death does not trigger
6175 the calling of __kmp_internal_end_thread. For Linux* OS, there is the
6176 thread specific data destructor function to detect thread death. For
6177 Windows dynamic, there is DllMain(THREAD_DETACH). For Windows static, there
6178 is nothing. Thus, the workaround is applicable only for Windows static
6179 stat library. */
6180 __kmp_internal_end_library(-1);
6181 #if KMP_OS_WINDOWS
6182 __kmp_close_console();
6183 #endif
6184 }
6185
6186 static void __kmp_reap_thread(kmp_info_t *thread, int is_root) {
6187 // It is assumed __kmp_forkjoin_lock is acquired.
6188
6189 int gtid;
6190
6191 KMP_DEBUG_ASSERT(thread != NULL);
6192
6193 gtid = thread->th.th_info.ds.ds_gtid;
6194
6195 if (!is_root) {
6196 if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
6197 /* Assume the threads are at the fork barrier here */
6198 KA_TRACE(
6199 20, ("__kmp_reap_thread: releasing T#%d from fork barrier for reap\n",
6200 gtid));
6201 if (__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
6202 while (
6203 !KMP_COMPARE_AND_STORE_ACQ32(&(thread->th.th_used_in_team), 0, 3))
6204 KMP_CPU_PAUSE();
6205 __kmp_resume_32(gtid, (kmp_flag_32<false, false> *)NULL);
6206 } else {
6207 /* Need release fence here to prevent seg faults for tree forkjoin
6208 barrier (GEH) */
6209 kmp_flag_64<> flag(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go,
6210 thread);
6211 __kmp_release_64(&flag);
6212 }
6213 }
6214
6215 // Terminate OS thread.
6216 __kmp_reap_worker(thread);
6217
6218 // The thread was killed asynchronously. If it was actively
6219 // spinning in the thread pool, decrement the global count.
6220 //
6221 // There is a small timing hole here - if the worker thread was just waking
6222     // up after sleeping in the pool, had reset its th_active_in_pool flag but
6223 // not decremented the global counter __kmp_thread_pool_active_nth yet, then
6224 // the global counter might not get updated.
6225 //
6226 // Currently, this can only happen as the library is unloaded,
6227 // so there are no harmful side effects.
6228 if (thread->th.th_active_in_pool) {
6229 thread->th.th_active_in_pool = FALSE;
6230 KMP_ATOMIC_DEC(&__kmp_thread_pool_active_nth);
6231 KMP_DEBUG_ASSERT(__kmp_thread_pool_active_nth >= 0);
6232 }
6233 }
6234
6235 __kmp_free_implicit_task(thread);
6236
6237 // Free the fast memory for tasking
6238 #if USE_FAST_MEMORY
6239 __kmp_free_fast_memory(thread);
6240 #endif /* USE_FAST_MEMORY */
6241
6242 __kmp_suspend_uninitialize_thread(thread);
6243
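  // Clear this thread's slot in the global registry so its gtid can be reused
  // by a later thread.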
6244 KMP_DEBUG_ASSERT(__kmp_threads[gtid] == thread);
6245 TCW_SYNC_PTR(__kmp_threads[gtid], NULL);
6246
6247 --__kmp_all_nth;
6248 // __kmp_nth was decremented when thread is added to the pool.
6249
6250 #ifdef KMP_ADJUST_BLOCKTIME
6251 /* Adjust blocktime back to user setting or default if necessary */
6252 /* Middle initialization might never have occurred */
6253 if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
6254 KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
6255 if (__kmp_nth <= __kmp_avail_proc) {
6256 __kmp_zero_bt = FALSE;
6257 }
6258 }
6259 #endif /* KMP_ADJUST_BLOCKTIME */
6260
6261 /* free the memory being used */
6262 if (__kmp_env_consistency_check) {
6263 if (thread->th.th_cons) {
6264 __kmp_free_cons_stack(thread->th.th_cons);
6265 thread->th.th_cons = NULL;
6266 }
6267 }
6268
6269 if (thread->th.th_pri_common != NULL) {
6270 __kmp_free(thread->th.th_pri_common);
6271 thread->th.th_pri_common = NULL;
6272 }
6273
6274 #if KMP_USE_BGET
6275 if (thread->th.th_local.bget_data != NULL) {
6276 __kmp_finalize_bget(thread);
6277 }
6278 #endif
6279
6280 #if KMP_AFFINITY_SUPPORTED
6281 if (thread->th.th_affin_mask != NULL) {
6282 KMP_CPU_FREE(thread->th.th_affin_mask);
6283 thread->th.th_affin_mask = NULL;
6284 }
6285 #endif /* KMP_AFFINITY_SUPPORTED */
6286
6287 #if KMP_USE_HIER_SCHED
6288 if (thread->th.th_hier_bar_data != NULL) {
6289 __kmp_free(thread->th.th_hier_bar_data);
6290 thread->th.th_hier_bar_data = NULL;
6291 }
6292 #endif
6293
6294 __kmp_reap_team(thread->th.th_serial_team);
6295 thread->th.th_serial_team = NULL;
6296 __kmp_free(thread);
6297
6298 KMP_MB();
6299
6300 } // __kmp_reap_thread
6301
6302 static void __kmp_itthash_clean(kmp_info_t *th) {
6303 #if USE_ITT_NOTIFY
6304 if (__kmp_itt_region_domains.count > 0) {
6305 for (int i = 0; i < KMP_MAX_FRAME_DOMAINS; ++i) {
6306 kmp_itthash_entry_t *bucket = __kmp_itt_region_domains.buckets[i];
6307 while (bucket) {
6308 kmp_itthash_entry_t *next = bucket->next_in_bucket;
6309 __kmp_thread_free(th, bucket);
6310 bucket = next;
6311 }
6312 }
6313 }
6314 if (__kmp_itt_barrier_domains.count > 0) {
6315 for (int i = 0; i < KMP_MAX_FRAME_DOMAINS; ++i) {
6316 kmp_itthash_entry_t *bucket = __kmp_itt_barrier_domains.buckets[i];
6317 while (bucket) {
6318 kmp_itthash_entry_t *next = bucket->next_in_bucket;
6319 __kmp_thread_free(th, bucket);
6320 bucket = next;
6321 }
6322 }
6323 }
6324 #endif
6325 }
6326
6327 static void __kmp_internal_end(void) {
6328 int i;
6329
6330 /* First, unregister the library */
6331 __kmp_unregister_library();
6332
6333 #if KMP_OS_WINDOWS
6334 /* In Win static library, we can't tell when a root actually dies, so we
6335 reclaim the data structures for any root threads that have died but not
6336 unregistered themselves, in order to shut down cleanly.
6337 In Win dynamic library we also can't tell when a thread dies. */
6338 __kmp_reclaim_dead_roots(); // AC: moved here to always clean resources of
6339 // dead roots
6340 #endif
6341
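  // Scan for a root that is still active; if one is found (i stops short of
  // capacity), only a limited shutdown is performed below.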
6342 for (i = 0; i < __kmp_threads_capacity; i++)
6343 if (__kmp_root[i])
6344 if (__kmp_root[i]->r.r_active)
6345 break;
6346 KMP_MB(); /* Flush all pending memory write invalidates. */
6347 TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6348
6349 if (i < __kmp_threads_capacity) {
6350 #if KMP_USE_MONITOR
6351 // 2009-09-08 (lev): Other alive roots found. Why do we kill the monitor??
6352 KMP_MB(); /* Flush all pending memory write invalidates. */
6353
6354 // Need to check that monitor was initialized before reaping it. If we are
6355     // called from __kmp_atfork_child (which sets __kmp_init_parallel = 0), then
6356 // __kmp_monitor will appear to contain valid data, but it is only valid in
6357 // the parent process, not the child.
6358 // New behavior (201008): instead of keying off of the flag
6359 // __kmp_init_parallel, the monitor thread creation is keyed off
6360 // of the new flag __kmp_init_monitor.
6361 __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
6362 if (TCR_4(__kmp_init_monitor)) {
6363 __kmp_reap_monitor(&__kmp_monitor);
6364 TCW_4(__kmp_init_monitor, 0);
6365 }
6366 __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
6367 KA_TRACE(10, ("__kmp_internal_end: monitor reaped\n"));
6368 #endif // KMP_USE_MONITOR
6369 } else {
6370 /* TODO move this to cleanup code */
6371 #ifdef KMP_DEBUG
6372 /* make sure that everything has properly ended */
6373 for (i = 0; i < __kmp_threads_capacity; i++) {
6374 if (__kmp_root[i]) {
6375 // KMP_ASSERT( ! KMP_UBER_GTID( i ) ); // AC:
6376 // there can be uber threads alive here
6377 KMP_ASSERT(!__kmp_root[i]->r.r_active); // TODO: can they be active?
6378 }
6379 }
6380 #endif
6381
6382 KMP_MB();
6383
6384 // Reap the worker threads.
6385 // This is valid for now, but be careful if threads are reaped sooner.
6386     while (__kmp_thread_pool != NULL) { // Loop thru all the threads in the pool.
6387 // Get the next thread from the pool.
6388 kmp_info_t *thread = CCAST(kmp_info_t *, __kmp_thread_pool);
6389 __kmp_thread_pool = thread->th.th_next_pool;
6390 // Reap it.
6391 KMP_DEBUG_ASSERT(thread->th.th_reap_state == KMP_SAFE_TO_REAP);
6392 thread->th.th_next_pool = NULL;
6393 thread->th.th_in_pool = FALSE;
6394 __kmp_reap_thread(thread, 0);
6395 }
6396 __kmp_thread_pool_insert_pt = NULL;
6397
6398 // Reap teams.
6399 while (__kmp_team_pool != NULL) { // Loop thru all the teams in the pool.
6400 // Get the next team from the pool.
6401 kmp_team_t *team = CCAST(kmp_team_t *, __kmp_team_pool);
6402 __kmp_team_pool = team->t.t_next_pool;
6403 // Reap it.
6404 team->t.t_next_pool = NULL;
6405 __kmp_reap_team(team);
6406 }
6407
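    // Release the task team structures accumulated on the free list.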
6408 __kmp_reap_task_teams();
6409
6410 #if KMP_OS_UNIX
6411 // Threads that are not reaped should not access any resources since they
6412 // are going to be deallocated soon, so the shutdown sequence should wait
6413 // until all threads either exit the final spin-waiting loop or begin
6414 // sleeping after the given blocktime.
6415 for (i = 0; i < __kmp_threads_capacity; i++) {
6416 kmp_info_t *thr = __kmp_threads[i];
6417 while (thr && KMP_ATOMIC_LD_ACQ(&thr->th.th_blocking))
6418 KMP_CPU_PAUSE();
6419 }
6420 #endif
6421
6422 for (i = 0; i < __kmp_threads_capacity; ++i) {
6423 // TBD: Add some checking...
6424 // Something like KMP_DEBUG_ASSERT( __kmp_thread[ i ] == NULL );
6425 }
6426
6427 /* Make sure all threadprivate destructors get run by joining with all
6428 worker threads before resetting this flag */
6429 TCW_SYNC_4(__kmp_init_common, FALSE);
6430
6431 KA_TRACE(10, ("__kmp_internal_end: all workers reaped\n"));
6432 KMP_MB();
6433
6434 #if KMP_USE_MONITOR
6435 // See note above: One of the possible fixes for CQ138434 / CQ140126
6436 //
6437 // FIXME: push both code fragments down and CSE them?
6438 // push them into __kmp_cleanup() ?
6439 __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
6440 if (TCR_4(__kmp_init_monitor)) {
6441 __kmp_reap_monitor(&__kmp_monitor);
6442 TCW_4(__kmp_init_monitor, 0);
6443 }
6444 __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
6445 KA_TRACE(10, ("__kmp_internal_end: monitor reaped\n"));
6446 #endif
6447 } /* else !__kmp_global.t_active */
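  // Past this point, gtid lookups through thread-local storage are no longer
  // considered valid.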
6448 TCW_4(__kmp_init_gtid, FALSE);
6449 KMP_MB(); /* Flush all pending memory write invalidates. */
6450
6451 __kmp_cleanup();
6452 #if OMPT_SUPPORT
6453 ompt_fini();
6454 #endif
6455 }
6456
6457 void __kmp_internal_end_library(int gtid_req) {
6458 /* if we have already cleaned up, don't try again, it wouldn't be pretty */
6459 /* this shouldn't be a race condition because __kmp_internal_end() is the
6460 only place to clear __kmp_serial_init */
6461 /* we'll check this later too, after we get the lock */
6462 // 2009-09-06: We do not set g_abort without setting g_done. This check looks
6463 // redundant, because the next check will work in any case.
6464 if (__kmp_global.g.g_abort) {
6465 KA_TRACE(11, ("__kmp_internal_end_library: abort, exiting\n"));
6466 /* TODO abort? */
6467 return;
6468 }
6469 if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6470 KA_TRACE(10, ("__kmp_internal_end_library: already finished\n"));
6471 return;
6472 }
6473
6474 // If hidden helper team has been initialized, we need to deinit it
6475 if (TCR_4(__kmp_init_hidden_helper) &&
6476 !TCR_4(__kmp_hidden_helper_team_done)) {
6477 TCW_SYNC_4(__kmp_hidden_helper_team_done, TRUE);
6478 // First release the main thread to let it continue its work
6479 __kmp_hidden_helper_main_thread_release();
6480 // Wait until the hidden helper team has been destroyed
6481 __kmp_hidden_helper_threads_deinitz_wait();
6482 }
6483
6484 KMP_MB(); /* Flush all pending memory write invalidates. */
6485 /* find out who we are and what we should do */
6486 {
6487 int gtid = (gtid_req >= 0) ? gtid_req : __kmp_gtid_get_specific();
6488 KA_TRACE(
6489 10, ("__kmp_internal_end_library: enter T#%d (%d)\n", gtid, gtid_req));
6490 if (gtid == KMP_GTID_SHUTDOWN) {
6491 KA_TRACE(10, ("__kmp_internal_end_library: !__kmp_init_runtime, system "
6492 "already shutdown\n"));
6493 return;
6494 } else if (gtid == KMP_GTID_MONITOR) {
6495 KA_TRACE(10, ("__kmp_internal_end_library: monitor thread, gtid not "
6496 "registered, or system shutdown\n"));
6497 return;
6498 } else if (gtid == KMP_GTID_DNE) {
6499 KA_TRACE(10, ("__kmp_internal_end_library: gtid not registered or system "
6500 "shutdown\n"));
6501 /* we don't know who we are, but we may still shutdown the library */
6502 } else if (KMP_UBER_GTID(gtid)) {
6503 /* unregister ourselves as an uber thread. gtid is no longer valid */
6504 if (__kmp_root[gtid]->r.r_active) {
6505 __kmp_global.g.g_abort = -1;
6506 TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6507 __kmp_unregister_library();
6508 KA_TRACE(10,
6509 ("__kmp_internal_end_library: root still active, abort T#%d\n",
6510 gtid));
6511 return;
6512 } else {
6513 __kmp_itthash_clean(__kmp_threads[gtid]);
6514 KA_TRACE(
6515 10,
6516 ("__kmp_internal_end_library: unregistering sibling T#%d\n", gtid));
6517 __kmp_unregister_root_current_thread(gtid);
6518 }
6519 } else {
6520 /* worker threads may call this function through the atexit handler, if they
6521 * call exit() */
6522 /* For now, skip the usual subsequent processing and just dump the debug buffer.
6523 TODO: do a thorough shutdown instead */
6524 #ifdef DUMP_DEBUG_ON_EXIT
6525 if (__kmp_debug_buf)
6526 __kmp_dump_debug_buffer();
6527 #endif
6528 // added unregister library call here when we switch to shm linux
6529 // if we don't, it will leave lots of files in /dev/shm
6530 // cleanup shared memory file before exiting.
6531 __kmp_unregister_library();
6532 return;
6533 }
6534 }
6535 /* synchronize the termination process */
6536 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6537
6538 /* have we already finished */
6539 if (__kmp_global.g.g_abort) {
6540 KA_TRACE(10, ("__kmp_internal_end_library: abort, exiting\n"));
6541 /* TODO abort? */
6542 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6543 return;
6544 }
6545 if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6546 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6547 return;
6548 }
6549
6550 /* We need this lock to enforce mutex between this reading of
6551 __kmp_threads_capacity and the writing by __kmp_register_root.
6552 Alternatively, we can use a counter of roots that is atomically updated by
6553 __kmp_get_global_thread_id_reg, __kmp_do_serial_initialize and
6554 __kmp_internal_end_*. */
6555 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
6556
6557 /* now we can safely conduct the actual termination */
6558 __kmp_internal_end();
6559
6560 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6561 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6562
6563 KA_TRACE(10, ("__kmp_internal_end_library: exit\n"));
6564
6565 #ifdef DUMP_DEBUG_ON_EXIT
6566 if (__kmp_debug_buf)
6567 __kmp_dump_debug_buffer();
6568 #endif
6569
6570 #if KMP_OS_WINDOWS
6571 __kmp_close_console();
6572 #endif
6573
6574 __kmp_fini_allocator();
6575
6576 } // __kmp_internal_end_library
6577
6578 void __kmp_internal_end_thread(int gtid_req) {
6579 int i;
6580
6581 /* if we have already cleaned up, don't try again, it wouldn't be pretty */
6582 /* this shouldn't be a race condition because __kmp_internal_end() is the
6583 * only place to clear __kmp_serial_init */
6584 /* we'll check this later too, after we get the lock */
6585 // 2009-09-06: We do not set g_abort without setting g_done. This check looks
6586 // redundant, because the next check will work in any case.
6587 if (__kmp_global.g.g_abort) {
6588 KA_TRACE(11, ("__kmp_internal_end_thread: abort, exiting\n"));
6589 /* TODO abort? */
6590 return;
6591 }
6592 if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6593 KA_TRACE(10, ("__kmp_internal_end_thread: already finished\n"));
6594 return;
6595 }
6596
6597 // If hidden helper team has been initialized, we need to deinit it
6598 if (TCR_4(__kmp_init_hidden_helper) &&
6599 !TCR_4(__kmp_hidden_helper_team_done)) {
6600 TCW_SYNC_4(__kmp_hidden_helper_team_done, TRUE);
6601 // First release the main thread to let it continue its work
6602 __kmp_hidden_helper_main_thread_release();
6603 // Wait until the hidden helper team has been destroyed
6604 __kmp_hidden_helper_threads_deinitz_wait();
6605 }
6606
6607 KMP_MB(); /* Flush all pending memory write invalidates. */
6608
6609 /* find out who we are and what we should do */
6610 {
6611 int gtid = (gtid_req >= 0) ? gtid_req : __kmp_gtid_get_specific();
6612 KA_TRACE(10,
6613 ("__kmp_internal_end_thread: enter T#%d (%d)\n", gtid, gtid_req));
6614 if (gtid == KMP_GTID_SHUTDOWN) {
6615 KA_TRACE(10, ("__kmp_internal_end_thread: !__kmp_init_runtime, system "
6616 "already shutdown\n"));
6617 return;
6618 } else if (gtid == KMP_GTID_MONITOR) {
6619 KA_TRACE(10, ("__kmp_internal_end_thread: monitor thread, gtid not "
6620 "registered, or system shutdown\n"));
6621 return;
6622 } else if (gtid == KMP_GTID_DNE) {
6623 KA_TRACE(10, ("__kmp_internal_end_thread: gtid not registered or system "
6624 "shutdown\n"));
6625 return;
6626 /* we don't know who we are */
6627 } else if (KMP_UBER_GTID(gtid)) {
6628 /* unregister ourselves as an uber thread. gtid is no longer valid */
6629 if (__kmp_root[gtid]->r.r_active) {
6630 __kmp_global.g.g_abort = -1;
6631 TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6632 KA_TRACE(10,
6633 ("__kmp_internal_end_thread: root still active, abort T#%d\n",
6634 gtid));
6635 return;
6636 } else {
6637 KA_TRACE(10, ("__kmp_internal_end_thread: unregistering sibling T#%d\n",
6638 gtid));
6639 __kmp_unregister_root_current_thread(gtid);
6640 }
6641 } else {
6642 /* just a worker thread, let's leave */
6643 KA_TRACE(10, ("__kmp_internal_end_thread: worker thread T#%d\n", gtid));
6644
6645 if (gtid >= 0) {
6646 __kmp_threads[gtid]->th.th_task_team = NULL;
6647 }
6648
6649 KA_TRACE(10,
6650 ("__kmp_internal_end_thread: worker thread done, exiting T#%d\n",
6651 gtid));
6652 return;
6653 }
6654 }
6655 #if KMP_DYNAMIC_LIB
6656 if (__kmp_pause_status != kmp_hard_paused)
6657   // AC: let's not shut down the dynamic library at the exit of an uber
6658   // thread; it is better to shut down later, in the library destructor.
6659 {
6660 KA_TRACE(10, ("__kmp_internal_end_thread: exiting T#%d\n", gtid_req));
6661 return;
6662 }
6663 #endif
6664 /* synchronize the termination process */
6665 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6666
6667 /* have we already finished */
6668 if (__kmp_global.g.g_abort) {
6669 KA_TRACE(10, ("__kmp_internal_end_thread: abort, exiting\n"));
6670 /* TODO abort? */
6671 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6672 return;
6673 }
6674 if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6675 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6676 return;
6677 }
6678
6679 /* We need this lock to enforce mutex between this reading of
6680 __kmp_threads_capacity and the writing by __kmp_register_root.
6681 Alternatively, we can use a counter of roots that is atomically updated by
6682 __kmp_get_global_thread_id_reg, __kmp_do_serial_initialize and
6683 __kmp_internal_end_*. */
6684
6685 /* should we finish the run-time? are all siblings done? */
6686 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
6687
6688 for (i = 0; i < __kmp_threads_capacity; ++i) {
6689 if (KMP_UBER_GTID(i)) {
6690 KA_TRACE(
6691 10,
6692 ("__kmp_internal_end_thread: remaining sibling task: gtid==%d\n", i));
6693 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6694 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6695 return;
6696 }
6697 }
6698
6699 /* now we can safely conduct the actual termination */
6700
6701 __kmp_internal_end();
6702
6703 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6704 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6705
6706 KA_TRACE(10, ("__kmp_internal_end_thread: exit T#%d\n", gtid_req));
6707
6708 #ifdef DUMP_DEBUG_ON_EXIT
6709 if (__kmp_debug_buf)
6710 __kmp_dump_debug_buffer();
6711 #endif
6712 } // __kmp_internal_end_thread
6713
6714 // -----------------------------------------------------------------------------
6715 // Library registration stuff.
6716
6717 static long __kmp_registration_flag = 0;
6718 // Random value used to indicate library initialization.
6719 static char *__kmp_registration_str = NULL;
6720 // Value to be saved in env var __KMP_REGISTERED_LIB_<pid>.
6721
6722 static inline char *__kmp_reg_status_name() {
6723 /* On RHEL 3u5 if linked statically, getpid() returns different values in
6724 each thread. If registration and unregistration go in different threads
6725 (omp_misc_other_root_exit.cpp test case), the name of registered_lib_env
6726      env var cannot be found, because the name will contain a different pid. */
6727 // macOS* complains about name being too long with additional getuid()
6728 #if KMP_OS_UNIX && !KMP_OS_DARWIN && KMP_DYNAMIC_LIB
6729 return __kmp_str_format("__KMP_REGISTERED_LIB_%d_%d", (int)getpid(),
6730 (int)getuid());
6731 #else
6732 return __kmp_str_format("__KMP_REGISTERED_LIB_%d", (int)getpid());
6733 #endif
6734 } // __kmp_reg_status_name
6735
6736 #if defined(KMP_USE_SHM)
6737 bool __kmp_shm_available = false;
6738 bool __kmp_tmp_available = false;
6739 // If /dev/shm is not accessible, we will create a temporary file under /tmp.
6740 char *temp_reg_status_file_name = nullptr;
6741 #endif
6742
6743 void __kmp_register_library_startup(void) {
6744
6745 char *name = __kmp_reg_status_name(); // Name of the environment variable.
6746 int done = 0;
6747 union {
6748 double dtime;
6749 long ltime;
6750 } time;
6751 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
6752 __kmp_initialize_system_tick();
6753 #endif
6754 __kmp_read_system_time(&time.dtime);
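  // Mix the low 16 bits of the current time into a recognizable 0xCAFE-prefixed
  // flag so each run of the library gets a distinct registration signature.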
6755 __kmp_registration_flag = 0xCAFE0000L | (time.ltime & 0x0000FFFFL);
6756 __kmp_registration_str =
6757 __kmp_str_format("%p-%lx-%s", &__kmp_registration_flag,
6758 __kmp_registration_flag, KMP_LIBRARY_FILE);
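  // The resulting string has the form "<flag address>-<flag value in hex>-<library file>";
  // the neighbor check below splits it on '-' to decide whether another copy of
  // the runtime is still alive.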
6759
6760 KA_TRACE(50, ("__kmp_register_library_startup: %s=\"%s\"\n", name,
6761 __kmp_registration_str));
6762
6763 while (!done) {
6764
6765 char *value = NULL; // Actual value of the environment variable.
6766
6767 #if defined(KMP_USE_SHM)
6768 char *shm_name = nullptr;
6769 char *data1 = nullptr;
6770 __kmp_shm_available = __kmp_detect_shm();
6771 if (__kmp_shm_available) {
6772 int fd1 = -1;
6773 shm_name = __kmp_str_format("/%s", name);
6774 int shm_preexist = 0;
6775 fd1 = shm_open(shm_name, O_CREAT | O_EXCL | O_RDWR, 0600);
6776 if ((fd1 == -1) && (errno == EEXIST)) {
6777 // file didn't open because it already exists.
6778 // try opening existing file
6779 fd1 = shm_open(shm_name, O_RDWR, 0600);
6780 if (fd1 == -1) { // file didn't open
6781 KMP_WARNING(FunctionError, "Can't open SHM");
6782 __kmp_shm_available = false;
6783 } else { // able to open existing file
6784 shm_preexist = 1;
6785 }
6786 }
6787 if (__kmp_shm_available && shm_preexist == 0) { // SHM created, set size
6788         if (ftruncate(fd1, SHM_SIZE) == -1) { // error occurred setting size
6789 KMP_WARNING(FunctionError, "Can't set size of SHM");
6790 __kmp_shm_available = false;
6791 }
6792 }
6793 if (__kmp_shm_available) { // SHM exists, now map it
6794 data1 = (char *)mmap(0, SHM_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED,
6795 fd1, 0);
6796 if (data1 == MAP_FAILED) { // failed to map shared memory
6797 KMP_WARNING(FunctionError, "Can't map SHM");
6798 __kmp_shm_available = false;
6799 }
6800 }
6801 if (__kmp_shm_available) { // SHM mapped
6802 if (shm_preexist == 0) { // set data to SHM, set value
6803 KMP_STRCPY_S(data1, SHM_SIZE, __kmp_registration_str);
6804 }
6805 // Read value from either what we just wrote or existing file.
6806 value = __kmp_str_format("%s", data1); // read value from SHM
6807 munmap(data1, SHM_SIZE);
6808 }
6809 if (fd1 != -1)
6810 close(fd1);
6811 }
6812 if (!__kmp_shm_available)
6813 __kmp_tmp_available = __kmp_detect_tmp();
6814 if (!__kmp_shm_available && __kmp_tmp_available) {
6815 // SHM failed to work due to an error other than that the file already
6816 // exists. Try to create a temp file under /tmp.
6817 // If /tmp isn't accessible, fall back to using environment variable.
6818 // TODO: /tmp might not always be the temporary directory. For now we will
6819 // not consider TMPDIR.
6820 int fd1 = -1;
6821 temp_reg_status_file_name = __kmp_str_format("/tmp/%s", name);
6822 int tmp_preexist = 0;
6823 fd1 = open(temp_reg_status_file_name, O_CREAT | O_EXCL | O_RDWR, 0600);
6824 if ((fd1 == -1) && (errno == EEXIST)) {
6825 // file didn't open because it already exists.
6826 // try opening existing file
6827 fd1 = open(temp_reg_status_file_name, O_RDWR, 0600);
6828         if (fd1 == -1) { // file didn't open
6829 KMP_WARNING(FunctionError, "Can't open TEMP");
6830 __kmp_tmp_available = false;
6831 } else {
6832 tmp_preexist = 1;
6833 }
6834 }
6835 if (__kmp_tmp_available && tmp_preexist == 0) {
6836 // we created /tmp file now set size
6837         if (ftruncate(fd1, SHM_SIZE) == -1) { // error occurred setting size
6838 KMP_WARNING(FunctionError, "Can't set size of /tmp file");
6839 __kmp_tmp_available = false;
6840 }
6841 }
6842 if (__kmp_tmp_available) {
6843 data1 = (char *)mmap(0, SHM_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED,
6844 fd1, 0);
6845 if (data1 == MAP_FAILED) { // failed to map /tmp
6846 KMP_WARNING(FunctionError, "Can't map /tmp");
6847 __kmp_tmp_available = false;
6848 }
6849 }
6850 if (__kmp_tmp_available) {
6851 if (tmp_preexist == 0) { // set data to TMP, set value
6852 KMP_STRCPY_S(data1, SHM_SIZE, __kmp_registration_str);
6853 }
6854 // Read value from either what we just wrote or existing file.
6855         value = __kmp_str_format("%s", data1); // read value from the /tmp file
6856 munmap(data1, SHM_SIZE);
6857 }
6858 if (fd1 != -1)
6859 close(fd1);
6860 }
6861 if (!__kmp_shm_available && !__kmp_tmp_available) {
6862 // no /dev/shm and no /tmp -- fall back to environment variable
6863 // Set environment variable, but do not overwrite if it exists.
6864 __kmp_env_set(name, __kmp_registration_str, 0);
6865 // read value to see if it got set
6866 value = __kmp_env_get(name);
6867 }
6868 #else // Windows and unix with static library
6869 // Set environment variable, but do not overwrite if it exists.
6870 __kmp_env_set(name, __kmp_registration_str, 0);
6871 // read value to see if it got set
6872 value = __kmp_env_get(name);
6873 #endif
6874
6875 if (value != NULL && strcmp(value, __kmp_registration_str) == 0) {
6876 done = 1; // Ok, environment variable set successfully, exit the loop.
6877 } else {
6878 // Oops. Write failed. Another copy of OpenMP RTL is in memory.
6879       // Check whether it is alive or dead.
6880 int neighbor = 0; // 0 -- unknown status, 1 -- alive, 2 -- dead.
6881 char *tail = value;
6882 char *flag_addr_str = NULL;
6883 char *flag_val_str = NULL;
6884 char const *file_name = NULL;
6885 __kmp_str_split(tail, '-', &flag_addr_str, &tail);
6886 __kmp_str_split(tail, '-', &flag_val_str, &tail);
6887 file_name = tail;
6888 if (tail != NULL) {
6889 unsigned long *flag_addr = 0;
6890 unsigned long flag_val = 0;
6891 KMP_SSCANF(flag_addr_str, "%p", RCAST(void **, &flag_addr));
6892 KMP_SSCANF(flag_val_str, "%lx", &flag_val);
6893 if (flag_addr != 0 && flag_val != 0 && strcmp(file_name, "") != 0) {
6894 // First, check whether environment-encoded address is mapped into
6895 // addr space.
6896 // If so, dereference it to see if it still has the right value.
6897 if (__kmp_is_address_mapped(flag_addr) && *flag_addr == flag_val) {
6898 neighbor = 1;
6899 } else {
6900 // If not, then we know the other copy of the library is no longer
6901 // running.
6902 neighbor = 2;
6903 }
6904 }
6905 }
6906 switch (neighbor) {
6907 case 0: // Cannot parse environment variable -- neighbor status unknown.
6908         // Assume it is the incompatible format of a future version of the
6909 // library. Assume the other library is alive.
6910 // WARN( ... ); // TODO: Issue a warning.
6911 file_name = "unknown library";
6912 KMP_FALLTHROUGH();
6913       // Attention! Falling through to the next case. That's intentional.
6914 case 1: { // Neighbor is alive.
6915 // Check it is allowed.
6916 char *duplicate_ok = __kmp_env_get("KMP_DUPLICATE_LIB_OK");
6917 if (!__kmp_str_match_true(duplicate_ok)) {
6918 // That's not allowed. Issue fatal error.
6919 __kmp_fatal(KMP_MSG(DuplicateLibrary, KMP_LIBRARY_FILE, file_name),
6920 KMP_HNT(DuplicateLibrary), __kmp_msg_null);
6921 }
6922 KMP_INTERNAL_FREE(duplicate_ok);
6923 __kmp_duplicate_library_ok = 1;
6924 done = 1; // Exit the loop.
6925 } break;
6926 case 2: { // Neighbor is dead.
6927
6928 #if defined(KMP_USE_SHM)
6929 if (__kmp_shm_available) { // close shared memory.
6930 shm_unlink(shm_name); // this removes file in /dev/shm
6931 } else if (__kmp_tmp_available) {
6932 unlink(temp_reg_status_file_name); // this removes the temp file
6933 } else {
6934 // Clear the variable and try to register library again.
6935 __kmp_env_unset(name);
6936 }
6937 #else
6938 // Clear the variable and try to register library again.
6939 __kmp_env_unset(name);
6940 #endif
6941 } break;
6942 default: {
6943 KMP_DEBUG_ASSERT(0);
6944 } break;
6945 }
6946 }
6947 KMP_INTERNAL_FREE((void *)value);
6948 #if defined(KMP_USE_SHM)
6949 if (shm_name)
6950 KMP_INTERNAL_FREE((void *)shm_name);
6951 #endif
6952 } // while
6953 KMP_INTERNAL_FREE((void *)name);
6954
6955 } // func __kmp_register_library_startup
6956
6957 void __kmp_unregister_library(void) {
6958
6959 char *name = __kmp_reg_status_name();
6960 char *value = NULL;
6961
6962 #if defined(KMP_USE_SHM)
6963 char *shm_name = nullptr;
6964 int fd1;
6965 if (__kmp_shm_available) {
6966 shm_name = __kmp_str_format("/%s", name);
6967 fd1 = shm_open(shm_name, O_RDONLY, 0600);
6968 if (fd1 != -1) { // File opened successfully
6969 char *data1 = (char *)mmap(0, SHM_SIZE, PROT_READ, MAP_SHARED, fd1, 0);
6970 if (data1 != MAP_FAILED) {
6971 value = __kmp_str_format("%s", data1); // read value from SHM
6972 munmap(data1, SHM_SIZE);
6973 }
6974 close(fd1);
6975 }
6976 } else if (__kmp_tmp_available) { // try /tmp
6977 fd1 = open(temp_reg_status_file_name, O_RDONLY);
6978 if (fd1 != -1) { // File opened successfully
6979 char *data1 = (char *)mmap(0, SHM_SIZE, PROT_READ, MAP_SHARED, fd1, 0);
6980 if (data1 != MAP_FAILED) {
6981 value = __kmp_str_format("%s", data1); // read value from /tmp
6982 munmap(data1, SHM_SIZE);
6983 }
6984 close(fd1);
6985 }
6986   } else { // fall back to environment variable
6987 value = __kmp_env_get(name);
6988 }
6989 #else
6990 value = __kmp_env_get(name);
6991 #endif
6992
6993 KMP_DEBUG_ASSERT(__kmp_registration_flag != 0);
6994 KMP_DEBUG_ASSERT(__kmp_registration_str != NULL);
6995 if (value != NULL && strcmp(value, __kmp_registration_str) == 0) {
6996 // Ok, this is our variable. Delete it.
6997 #if defined(KMP_USE_SHM)
6998 if (__kmp_shm_available) {
6999 shm_unlink(shm_name); // this removes file in /dev/shm
7000 } else if (__kmp_tmp_available) {
7001 unlink(temp_reg_status_file_name); // this removes the temp file
7002 } else {
7003 __kmp_env_unset(name);
7004 }
7005 #else
7006 __kmp_env_unset(name);
7007 #endif
7008 }
7009
7010 #if defined(KMP_USE_SHM)
7011 if (shm_name)
7012 KMP_INTERNAL_FREE(shm_name);
7013 if (temp_reg_status_file_name)
7014 KMP_INTERNAL_FREE(temp_reg_status_file_name);
7015 #endif
7016
7017 KMP_INTERNAL_FREE(__kmp_registration_str);
7018 KMP_INTERNAL_FREE(value);
7019 KMP_INTERNAL_FREE(name);
7020
7021 __kmp_registration_flag = 0;
7022 __kmp_registration_str = NULL;
7023
7024 } // __kmp_unregister_library
7025
7026 // End of Library registration stuff.
7027 // -----------------------------------------------------------------------------
7028
7029 #if KMP_MIC_SUPPORTED
7030
7031 static void __kmp_check_mic_type() {
7032 kmp_cpuid_t cpuid_state = {0};
7033 kmp_cpuid_t *cs_p = &cpuid_state;
7034 __kmp_x86_cpuid(1, 0, cs_p);
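  // CPUID leaf 1 EAX encodes family/model/stepping; the masks below pick out
  // the KNC (mic2) and KNL (mic3) signatures.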
7035 // We don't support mic1 at the moment
7036 if ((cs_p->eax & 0xff0) == 0xB10) {
7037 __kmp_mic_type = mic2;
7038 } else if ((cs_p->eax & 0xf0ff0) == 0x50670) {
7039 __kmp_mic_type = mic3;
7040 } else {
7041 __kmp_mic_type = non_mic;
7042 }
7043 }
7044
7045 #endif /* KMP_MIC_SUPPORTED */
7046
7047 #if KMP_HAVE_UMWAIT
7048 static void __kmp_user_level_mwait_init() {
7049 struct kmp_cpuid buf;
7050 __kmp_x86_cpuid(7, 0, &buf);
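  // CPUID leaf 7, sub-leaf 0: ECX bit 5 reports WAITPKG (umwait/tpause) support.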
7051 __kmp_waitpkg_enabled = ((buf.ecx >> 5) & 1);
7052 __kmp_umwait_enabled = __kmp_waitpkg_enabled && __kmp_user_level_mwait;
7053 __kmp_tpause_enabled = __kmp_waitpkg_enabled && (__kmp_tpause_state > 0);
7054 KF_TRACE(30, ("__kmp_user_level_mwait_init: __kmp_umwait_enabled = %d\n",
7055 __kmp_umwait_enabled));
7056 }
7057 #elif KMP_HAVE_MWAIT
7058 #ifndef AT_INTELPHIUSERMWAIT
7059 // Spurious, non-existent value that should always fail to return anything.
7060 // Will be replaced with the correct value when we know that.
7061 #define AT_INTELPHIUSERMWAIT 10000
7062 #endif
7063 // The getauxval() function is available in RHEL7 and SLES12. If a system with an
7064 // earlier OS is used to build the RTL, we'll use the following internal
7065 // function when the entry is not found.
7066 unsigned long getauxval(unsigned long) KMP_WEAK_ATTRIBUTE_EXTERNAL;
7067 unsigned long getauxval(unsigned long) { return 0; }
7068
7069 static void __kmp_user_level_mwait_init() {
7070 // When getauxval() and correct value of AT_INTELPHIUSERMWAIT are available
7071 // use them to find if the user-level mwait is enabled. Otherwise, forcibly
7072 // set __kmp_mwait_enabled=TRUE on Intel MIC if the environment variable
7073 // KMP_USER_LEVEL_MWAIT was set to TRUE.
7074 if (__kmp_mic_type == mic3) {
7075 unsigned long res = getauxval(AT_INTELPHIUSERMWAIT);
7076 if ((res & 0x1) || __kmp_user_level_mwait) {
7077 __kmp_mwait_enabled = TRUE;
7078 if (__kmp_user_level_mwait) {
7079 KMP_INFORM(EnvMwaitWarn);
7080 }
7081 } else {
7082 __kmp_mwait_enabled = FALSE;
7083 }
7084 }
7085 KF_TRACE(30, ("__kmp_user_level_mwait_init: __kmp_mic_type = %d, "
7086 "__kmp_mwait_enabled = %d\n",
7087 __kmp_mic_type, __kmp_mwait_enabled));
7088 }
7089 #endif /* KMP_HAVE_UMWAIT */
7090
7091 static void __kmp_do_serial_initialize(void) {
7092 int i, gtid;
7093 size_t size;
7094
7095 KA_TRACE(10, ("__kmp_do_serial_initialize: enter\n"));
7096
7097 KMP_DEBUG_ASSERT(sizeof(kmp_int32) == 4);
7098 KMP_DEBUG_ASSERT(sizeof(kmp_uint32) == 4);
7099 KMP_DEBUG_ASSERT(sizeof(kmp_int64) == 8);
7100 KMP_DEBUG_ASSERT(sizeof(kmp_uint64) == 8);
7101 KMP_DEBUG_ASSERT(sizeof(kmp_intptr_t) == sizeof(void *));
7102
7103 #if OMPT_SUPPORT
7104 ompt_pre_init();
7105 #endif
7106 #if OMPD_SUPPORT
7107 __kmp_env_dump();
7108 ompd_init();
7109 #endif
7110
7111 __kmp_validate_locks();
7112
7113 #if ENABLE_LIBOMPTARGET
7114 /* Initialize functions from libomptarget */
7115 __kmp_init_omptarget();
7116 #endif
7117
7118 /* Initialize internal memory allocator */
7119 __kmp_init_allocator();
7120
7121 /* Register the library startup via an environment variable or via mapped
7122 shared memory file and check to see whether another copy of the library is
7123      already registered. Since a forked child process is often terminated, we
7124      postpone the registration until middle initialization in the child. */
7125 if (__kmp_need_register_serial)
7126 __kmp_register_library_startup();
7127
7128 /* TODO reinitialization of library */
7129 if (TCR_4(__kmp_global.g.g_done)) {
7130 KA_TRACE(10, ("__kmp_do_serial_initialize: reinitialization of library\n"));
7131 }
7132
7133 __kmp_global.g.g_abort = 0;
7134 TCW_SYNC_4(__kmp_global.g.g_done, FALSE);
7135
7136 /* initialize the locks */
7137 #if KMP_USE_ADAPTIVE_LOCKS
7138 #if KMP_DEBUG_ADAPTIVE_LOCKS
7139 __kmp_init_speculative_stats();
7140 #endif
7141 #endif
7142 #if KMP_STATS_ENABLED
7143 __kmp_stats_init();
7144 #endif
7145 __kmp_init_lock(&__kmp_global_lock);
7146 __kmp_init_queuing_lock(&__kmp_dispatch_lock);
7147 __kmp_init_lock(&__kmp_debug_lock);
7148 __kmp_init_atomic_lock(&__kmp_atomic_lock);
7149 __kmp_init_atomic_lock(&__kmp_atomic_lock_1i);
7150 __kmp_init_atomic_lock(&__kmp_atomic_lock_2i);
7151 __kmp_init_atomic_lock(&__kmp_atomic_lock_4i);
7152 __kmp_init_atomic_lock(&__kmp_atomic_lock_4r);
7153 __kmp_init_atomic_lock(&__kmp_atomic_lock_8i);
7154 __kmp_init_atomic_lock(&__kmp_atomic_lock_8r);
7155 __kmp_init_atomic_lock(&__kmp_atomic_lock_8c);
7156 __kmp_init_atomic_lock(&__kmp_atomic_lock_10r);
7157 __kmp_init_atomic_lock(&__kmp_atomic_lock_16r);
7158 __kmp_init_atomic_lock(&__kmp_atomic_lock_16c);
7159 __kmp_init_atomic_lock(&__kmp_atomic_lock_20c);
7160 __kmp_init_atomic_lock(&__kmp_atomic_lock_32c);
7161 __kmp_init_bootstrap_lock(&__kmp_forkjoin_lock);
7162 __kmp_init_bootstrap_lock(&__kmp_exit_lock);
7163 #if KMP_USE_MONITOR
7164 __kmp_init_bootstrap_lock(&__kmp_monitor_lock);
7165 #endif
7166 __kmp_init_bootstrap_lock(&__kmp_tp_cached_lock);
7167
7168 /* conduct initialization and initial setup of configuration */
7169
7170 __kmp_runtime_initialize();
7171
7172 #if KMP_MIC_SUPPORTED
7173 __kmp_check_mic_type();
7174 #endif
7175
7176 // Some global variable initialization moved here from kmp_env_initialize()
7177 #ifdef KMP_DEBUG
7178 kmp_diag = 0;
7179 #endif
7180 __kmp_abort_delay = 0;
7181
7182 // From __kmp_init_dflt_team_nth()
7183 /* assume the entire machine will be used */
7184 __kmp_dflt_team_nth_ub = __kmp_xproc;
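  // Clamp the default upper bound to the [KMP_MIN_NTH, __kmp_sys_max_nth] range.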
7185 if (__kmp_dflt_team_nth_ub < KMP_MIN_NTH) {
7186 __kmp_dflt_team_nth_ub = KMP_MIN_NTH;
7187 }
7188 if (__kmp_dflt_team_nth_ub > __kmp_sys_max_nth) {
7189 __kmp_dflt_team_nth_ub = __kmp_sys_max_nth;
7190 }
7191 __kmp_max_nth = __kmp_sys_max_nth;
7192 __kmp_cg_max_nth = __kmp_sys_max_nth;
7193 __kmp_teams_max_nth = __kmp_xproc; // set a "reasonable" default
7194 if (__kmp_teams_max_nth > __kmp_sys_max_nth) {
7195 __kmp_teams_max_nth = __kmp_sys_max_nth;
7196 }
7197
7198 // Three vars below moved here from __kmp_env_initialize() "KMP_BLOCKTIME"
7199 // part
7200 __kmp_dflt_blocktime = KMP_DEFAULT_BLOCKTIME;
7201 #if KMP_USE_MONITOR
7202 __kmp_monitor_wakeups =
7203 KMP_WAKEUPS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups);
7204 __kmp_bt_intervals =
7205 KMP_INTERVALS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups);
7206 #endif
7207 // From "KMP_LIBRARY" part of __kmp_env_initialize()
7208 __kmp_library = library_throughput;
7209 // From KMP_SCHEDULE initialization
7210 __kmp_static = kmp_sch_static_balanced;
7211 // AC: do not use analytical here, because it is non-monotonous
7212 //__kmp_guided = kmp_sch_guided_iterative_chunked;
7213 //__kmp_auto = kmp_sch_guided_analytical_chunked; // AC: it is the default, no
7214 // need to repeat assignment
7215 // Barrier initialization. Moved here from the __kmp_env_initialize() barrier
7216 // branch bit control and barrier method control parts.
7217 #if KMP_FAST_REDUCTION_BARRIER
7218 #define kmp_reduction_barrier_gather_bb ((int)1)
7219 #define kmp_reduction_barrier_release_bb ((int)1)
7220 #define kmp_reduction_barrier_gather_pat __kmp_barrier_gather_pat_dflt
7221 #define kmp_reduction_barrier_release_pat __kmp_barrier_release_pat_dflt
7222 #endif // KMP_FAST_REDUCTION_BARRIER
7223 for (i = bs_plain_barrier; i < bs_last_barrier; i++) {
7224 __kmp_barrier_gather_branch_bits[i] = __kmp_barrier_gather_bb_dflt;
7225 __kmp_barrier_release_branch_bits[i] = __kmp_barrier_release_bb_dflt;
7226 __kmp_barrier_gather_pattern[i] = __kmp_barrier_gather_pat_dflt;
7227 __kmp_barrier_release_pattern[i] = __kmp_barrier_release_pat_dflt;
7228 #if KMP_FAST_REDUCTION_BARRIER
7229 if (i == bs_reduction_barrier) { // tested and confirmed on ALTIX only (
7230 // lin_64 ): hyper,1
7231 __kmp_barrier_gather_branch_bits[i] = kmp_reduction_barrier_gather_bb;
7232 __kmp_barrier_release_branch_bits[i] = kmp_reduction_barrier_release_bb;
7233 __kmp_barrier_gather_pattern[i] = kmp_reduction_barrier_gather_pat;
7234 __kmp_barrier_release_pattern[i] = kmp_reduction_barrier_release_pat;
7235 }
7236 #endif // KMP_FAST_REDUCTION_BARRIER
7237 }
7238 #if KMP_FAST_REDUCTION_BARRIER
7239 #undef kmp_reduction_barrier_release_pat
7240 #undef kmp_reduction_barrier_gather_pat
7241 #undef kmp_reduction_barrier_release_bb
7242 #undef kmp_reduction_barrier_gather_bb
7243 #endif // KMP_FAST_REDUCTION_BARRIER
7244 #if KMP_MIC_SUPPORTED
7245 if (__kmp_mic_type == mic2) { // KNC
7246 // AC: plane=3,2, forkjoin=2,1 are optimal for 240 threads on KNC
7247 __kmp_barrier_gather_branch_bits[bs_plain_barrier] = 3; // plain gather
7248 __kmp_barrier_release_branch_bits[bs_forkjoin_barrier] =
7249 1; // forkjoin release
7250 __kmp_barrier_gather_pattern[bs_forkjoin_barrier] = bp_hierarchical_bar;
7251 __kmp_barrier_release_pattern[bs_forkjoin_barrier] = bp_hierarchical_bar;
7252 }
7253 #if KMP_FAST_REDUCTION_BARRIER
7254 if (__kmp_mic_type == mic2) { // KNC
7255 __kmp_barrier_gather_pattern[bs_reduction_barrier] = bp_hierarchical_bar;
7256 __kmp_barrier_release_pattern[bs_reduction_barrier] = bp_hierarchical_bar;
7257 }
7258 #endif // KMP_FAST_REDUCTION_BARRIER
7259 #endif // KMP_MIC_SUPPORTED
7260
7261 // From KMP_CHECKS initialization
7262 #ifdef KMP_DEBUG
7263 __kmp_env_checks = TRUE; /* development versions have the extra checks */
7264 #else
7265 __kmp_env_checks = FALSE; /* port versions do not have the extra checks */
7266 #endif
7267
7268 // From "KMP_FOREIGN_THREADS_THREADPRIVATE" initialization
7269 __kmp_foreign_tp = TRUE;
7270
7271 __kmp_global.g.g_dynamic = FALSE;
7272 __kmp_global.g.g_dynamic_mode = dynamic_default;
7273
7274 __kmp_init_nesting_mode();
7275
7276 __kmp_env_initialize(NULL);
7277
7278 #if KMP_HAVE_MWAIT || KMP_HAVE_UMWAIT
7279 __kmp_user_level_mwait_init();
7280 #endif
7281 // Print all messages in message catalog for testing purposes.
7282 #ifdef KMP_DEBUG
7283 char const *val = __kmp_env_get("KMP_DUMP_CATALOG");
7284 if (__kmp_str_match_true(val)) {
7285 kmp_str_buf_t buffer;
7286 __kmp_str_buf_init(&buffer);
7287 __kmp_i18n_dump_catalog(&buffer);
7288 __kmp_printf("%s", buffer.str);
7289 __kmp_str_buf_free(&buffer);
7290 }
7291 __kmp_env_free(&val);
7292 #endif
7293
7294 __kmp_threads_capacity =
7295 __kmp_initial_threads_capacity(__kmp_dflt_team_nth_ub);
7296 // Moved here from __kmp_env_initialize() "KMP_ALL_THREADPRIVATE" part
7297 __kmp_tp_capacity = __kmp_default_tp_capacity(
7298 __kmp_dflt_team_nth_ub, __kmp_max_nth, __kmp_allThreadsSpecified);
7299
7300 // If the library is shut down properly, both pools must be NULL. Just in
7301 // case, set them to NULL -- some memory may leak, but subsequent code will
7302 // work even if pools are not freed.
7303 KMP_DEBUG_ASSERT(__kmp_thread_pool == NULL);
7304 KMP_DEBUG_ASSERT(__kmp_thread_pool_insert_pt == NULL);
7305 KMP_DEBUG_ASSERT(__kmp_team_pool == NULL);
7306 __kmp_thread_pool = NULL;
7307 __kmp_thread_pool_insert_pt = NULL;
7308 __kmp_team_pool = NULL;
7309
7310 /* Allocate all of the variable sized records */
7311 /* NOTE: __kmp_threads_capacity entries are allocated, but the arrays are
7312 * expandable */
7313 /* Since allocation is cache-aligned, just add extra padding at the end */
7314 size =
7315 (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * __kmp_threads_capacity +
7316 CACHE_LINE;
7317 __kmp_threads = (kmp_info_t **)__kmp_allocate(size);
7318 __kmp_root = (kmp_root_t **)((char *)__kmp_threads +
7319 sizeof(kmp_info_t *) * __kmp_threads_capacity);
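  // __kmp_threads and __kmp_root share one cache-aligned allocation;
  // __kmp_root begins immediately after the __kmp_threads_capacity thread
  // pointers.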
7320
7321 /* init thread counts */
7322 KMP_DEBUG_ASSERT(__kmp_all_nth ==
7323 0); // Asserts fail if the library is reinitializing and
7324 KMP_DEBUG_ASSERT(__kmp_nth == 0); // something was wrong in termination.
7325 __kmp_all_nth = 0;
7326 __kmp_nth = 0;
7327
7328 /* setup the uber master thread and hierarchy */
7329 gtid = __kmp_register_root(TRUE);
7330 KA_TRACE(10, ("__kmp_do_serial_initialize T#%d\n", gtid));
7331 KMP_ASSERT(KMP_UBER_GTID(gtid));
7332 KMP_ASSERT(KMP_INITIAL_GTID(gtid));
7333
7334 KMP_MB(); /* Flush all pending memory write invalidates. */
7335
7336 __kmp_common_initialize();
7337
7338 #if KMP_OS_UNIX
7339 /* invoke the child fork handler */
7340 __kmp_register_atfork();
7341 #endif
7342
7343 #if !KMP_DYNAMIC_LIB || \
7344 ((KMP_COMPILER_ICC || KMP_COMPILER_ICX) && KMP_OS_DARWIN)
7345 {
7346 /* Invoke the exit handler when the program finishes, only for static
7347 library and macOS* dynamic. For other dynamic libraries, we already
7348 have _fini and DllMain. */
7349 int rc = atexit(__kmp_internal_end_atexit);
7350 if (rc != 0) {
7351 __kmp_fatal(KMP_MSG(FunctionError, "atexit()"), KMP_ERR(rc),
7352 __kmp_msg_null);
7353 }
7354 }
7355 #endif
7356
7357 #if KMP_HANDLE_SIGNALS
7358 #if KMP_OS_UNIX
7359 /* NOTE: make sure that this is called before the user installs their own
7360      signal handlers so that the user handlers are called first. This way they
7361 can return false, not call our handler, avoid terminating the library, and
7362 continue execution where they left off. */
7363 __kmp_install_signals(FALSE);
7364 #endif /* KMP_OS_UNIX */
7365 #if KMP_OS_WINDOWS
7366 __kmp_install_signals(TRUE);
7367 #endif /* KMP_OS_WINDOWS */
7368 #endif
7369
7370 /* we have finished the serial initialization */
7371 __kmp_init_counter++;
7372
7373 __kmp_init_serial = TRUE;
7374
7375 if (__kmp_version) {
7376 __kmp_print_version_1();
7377 }
7378
7379 if (__kmp_settings) {
7380 __kmp_env_print();
7381 }
7382
7383 if (__kmp_display_env || __kmp_display_env_verbose) {
7384 __kmp_env_print_2();
7385 }
7386
7387 #if OMPT_SUPPORT
7388 ompt_post_init();
7389 #endif
7390
7391 KMP_MB();
7392
7393 KA_TRACE(10, ("__kmp_do_serial_initialize: exit\n"));
7394 }
7395
7396 void __kmp_serial_initialize(void) {
7397 if (__kmp_init_serial) {
7398 return;
7399 }
7400 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7401 if (__kmp_init_serial) {
7402 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7403 return;
7404 }
7405 __kmp_do_serial_initialize();
7406 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7407 }
7408
7409 static void __kmp_do_middle_initialize(void) {
7410 int i, j;
7411 int prev_dflt_team_nth;
7412
7413 if (!__kmp_init_serial) {
7414 __kmp_do_serial_initialize();
7415 }
7416
7417 KA_TRACE(10, ("__kmp_middle_initialize: enter\n"));
7418
7419 if (UNLIKELY(!__kmp_need_register_serial)) {
7420 // We are in a forked child process. The registration was skipped during
7421 // serial initialization in __kmp_atfork_child handler. Do it here.
7422 __kmp_register_library_startup();
7423 }
7424
7425 // Save the previous value for the __kmp_dflt_team_nth so that
7426 // we can avoid some reinitialization if it hasn't changed.
7427 prev_dflt_team_nth = __kmp_dflt_team_nth;
7428
7429 #if KMP_AFFINITY_SUPPORTED
7430 // __kmp_affinity_initialize() will try to set __kmp_ncores to the
7431 // number of cores on the machine.
7432 __kmp_affinity_initialize(__kmp_affinity);
7433
7434 #endif /* KMP_AFFINITY_SUPPORTED */
7435
7436 KMP_ASSERT(__kmp_xproc > 0);
7437 if (__kmp_avail_proc == 0) {
7438 __kmp_avail_proc = __kmp_xproc;
7439 }
7440
7441 // If there were empty places in num_threads list (OMP_NUM_THREADS=,,2,3),
7442 // correct them now
7443 j = 0;
7444 while ((j < __kmp_nested_nth.used) && !__kmp_nested_nth.nth[j]) {
7445 __kmp_nested_nth.nth[j] = __kmp_dflt_team_nth = __kmp_dflt_team_nth_ub =
7446 __kmp_avail_proc;
7447 j++;
7448 }
7449
7450 if (__kmp_dflt_team_nth == 0) {
7451 #ifdef KMP_DFLT_NTH_CORES
7452 // Default #threads = #cores
7453 __kmp_dflt_team_nth = __kmp_ncores;
7454 KA_TRACE(20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = "
7455 "__kmp_ncores (%d)\n",
7456 __kmp_dflt_team_nth));
7457 #else
7458 // Default #threads = #available OS procs
7459 __kmp_dflt_team_nth = __kmp_avail_proc;
7460 KA_TRACE(20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = "
7461 "__kmp_avail_proc(%d)\n",
7462 __kmp_dflt_team_nth));
7463 #endif /* KMP_DFLT_NTH_CORES */
7464 }
7465
7466 if (__kmp_dflt_team_nth < KMP_MIN_NTH) {
7467 __kmp_dflt_team_nth = KMP_MIN_NTH;
7468 }
7469 if (__kmp_dflt_team_nth > __kmp_sys_max_nth) {
7470 __kmp_dflt_team_nth = __kmp_sys_max_nth;
7471 }
7472
7473 if (__kmp_nesting_mode > 0)
7474 __kmp_set_nesting_mode_threads();
7475
7476 // There's no harm in continuing if the following check fails,
7477 // but it indicates an error in the previous logic.
7478 KMP_DEBUG_ASSERT(__kmp_dflt_team_nth <= __kmp_dflt_team_nth_ub);
7479
7480 if (__kmp_dflt_team_nth != prev_dflt_team_nth) {
7481 // Run through the __kmp_threads array and set the num threads icv for each
7482 // root thread that is currently registered with the RTL (which has not
7483 // already explicitly set its nthreads-var with a call to
7484 // omp_set_num_threads()).
7485 for (i = 0; i < __kmp_threads_capacity; i++) {
7486 kmp_info_t *thread = __kmp_threads[i];
7487 if (thread == NULL)
7488 continue;
7489 if (thread->th.th_current_task->td_icvs.nproc != 0)
7490 continue;
7491
7492 set__nproc(__kmp_threads[i], __kmp_dflt_team_nth);
7493 }
7494 }
7495 KA_TRACE(
7496 20,
7497 ("__kmp_middle_initialize: final value for __kmp_dflt_team_nth = %d\n",
7498 __kmp_dflt_team_nth));
7499
7500 #ifdef KMP_ADJUST_BLOCKTIME
7501 /* Adjust blocktime to zero if necessary now that __kmp_avail_proc is set */
7502 if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
7503 KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
7504 if (__kmp_nth > __kmp_avail_proc) {
7505 __kmp_zero_bt = TRUE;
7506 }
7507 }
7508 #endif /* KMP_ADJUST_BLOCKTIME */
7509
7510 /* we have finished middle initialization */
7511 TCW_SYNC_4(__kmp_init_middle, TRUE);
7512
7513 KA_TRACE(10, ("__kmp_do_middle_initialize: exit\n"));
7514 }
7515
7516 void __kmp_middle_initialize(void) {
7517 if (__kmp_init_middle) {
7518 return;
7519 }
7520 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7521 if (__kmp_init_middle) {
7522 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7523 return;
7524 }
7525 __kmp_do_middle_initialize();
7526 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7527 }
7528
7529 void __kmp_parallel_initialize(void) {
7530 int gtid = __kmp_entry_gtid(); // this might be a new root
7531
7532 /* synchronize parallel initialization (for sibling) */
7533 if (TCR_4(__kmp_init_parallel))
7534 return;
7535 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7536 if (TCR_4(__kmp_init_parallel)) {
7537 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7538 return;
7539 }
7540
7541 /* TODO reinitialization after we have already shut down */
7542 if (TCR_4(__kmp_global.g.g_done)) {
7543 KA_TRACE(
7544 10,
7545 ("__kmp_parallel_initialize: attempt to init while shutting down\n"));
7546 __kmp_infinite_loop();
7547 }
7548
7549 /* jc: The lock __kmp_initz_lock is already held, so calling
7550 __kmp_serial_initialize would cause a deadlock. So we call
7551 __kmp_do_serial_initialize directly. */
7552 if (!__kmp_init_middle) {
7553 __kmp_do_middle_initialize();
7554 }
7555 __kmp_assign_root_init_mask();
7556 __kmp_resume_if_hard_paused();
7557
7558 /* begin initialization */
7559 KA_TRACE(10, ("__kmp_parallel_initialize: enter\n"));
7560 KMP_ASSERT(KMP_UBER_GTID(gtid));
7561
7562 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
7563 // Save the FP control regs.
7564 // Worker threads will set theirs to these values at thread startup.
7565 __kmp_store_x87_fpu_control_word(&__kmp_init_x87_fpu_control_word);
7566 __kmp_store_mxcsr(&__kmp_init_mxcsr);
7567 __kmp_init_mxcsr &= KMP_X86_MXCSR_MASK;
7568 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
7569
7570 #if KMP_OS_UNIX
7571 #if KMP_HANDLE_SIGNALS
7572 /* must be after __kmp_serial_initialize */
7573 __kmp_install_signals(TRUE);
7574 #endif
7575 #endif
7576
7577 __kmp_suspend_initialize();
7578
7579 #if defined(USE_LOAD_BALANCE)
7580 if (__kmp_global.g.g_dynamic_mode == dynamic_default) {
7581 __kmp_global.g.g_dynamic_mode = dynamic_load_balance;
7582 }
7583 #else
7584 if (__kmp_global.g.g_dynamic_mode == dynamic_default) {
7585 __kmp_global.g.g_dynamic_mode = dynamic_thread_limit;
7586 }
7587 #endif
7588
7589 if (__kmp_version) {
7590 __kmp_print_version_2();
7591 }
7592
7593 /* we have finished parallel initialization */
7594 TCW_SYNC_4(__kmp_init_parallel, TRUE);
7595
7596 KMP_MB();
7597 KA_TRACE(10, ("__kmp_parallel_initialize: exit\n"));
7598
7599 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7600 }
7601
7602 void __kmp_hidden_helper_initialize() {
7603 if (TCR_4(__kmp_init_hidden_helper))
7604 return;
7605
7606 // __kmp_parallel_initialize is required before we initialize hidden helper
7607 if (!TCR_4(__kmp_init_parallel))
7608 __kmp_parallel_initialize();
7609
7610 // Double check. Note that this double check should not be placed before
7611   // __kmp_parallel_initialize as it will cause a deadlock.
7612 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7613 if (TCR_4(__kmp_init_hidden_helper)) {
7614 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7615 return;
7616 }
7617
7618 #if KMP_AFFINITY_SUPPORTED
7619 // Initialize hidden helper affinity settings.
7620 // The above __kmp_parallel_initialize() will initialize
7621 // regular affinity (and topology) if not already done.
7622 if (!__kmp_hh_affinity.flags.initialized)
7623 __kmp_affinity_initialize(__kmp_hh_affinity);
7624 #endif
7625
7626 // Set the count of hidden helper tasks to be executed to zero
7627 KMP_ATOMIC_ST_REL(&__kmp_unexecuted_hidden_helper_tasks, 0);
7628
7629 // Set the global variable indicating that we're initializing hidden helper
7630 // team/threads
7631 TCW_SYNC_4(__kmp_init_hidden_helper_threads, TRUE);
7632
7633 // Platform independent initialization
7634 __kmp_do_initialize_hidden_helper_threads();
7635
7636 // Wait here for the finish of initialization of hidden helper teams
7637 __kmp_hidden_helper_threads_initz_wait();
7638
7639 // We have finished hidden helper initialization
7640 TCW_SYNC_4(__kmp_init_hidden_helper, TRUE);
7641
7642 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7643 }
7644
7645 /* ------------------------------------------------------------------------ */
7646
7647 void __kmp_run_before_invoked_task(int gtid, int tid, kmp_info_t *this_thr,
7648 kmp_team_t *team) {
7649 kmp_disp_t *dispatch;
7650
7651 KMP_MB();
7652
7653 /* none of the threads have encountered any constructs, yet. */
7654 this_thr->th.th_local.this_construct = 0;
7655 #if KMP_CACHE_MANAGE
7656 KMP_CACHE_PREFETCH(&this_thr->th.th_bar[bs_forkjoin_barrier].bb.b_arrived);
7657 #endif /* KMP_CACHE_MANAGE */
7658 dispatch = (kmp_disp_t *)TCR_PTR(this_thr->th.th_dispatch);
7659 KMP_DEBUG_ASSERT(dispatch);
7660 KMP_DEBUG_ASSERT(team->t.t_dispatch);
7661 // KMP_DEBUG_ASSERT( this_thr->th.th_dispatch == &team->t.t_dispatch[
7662 // this_thr->th.th_info.ds.ds_tid ] );
7663
7664 dispatch->th_disp_index = 0; /* reset the dispatch buffer counter */
7665 dispatch->th_doacross_buf_idx = 0; // reset doacross dispatch buffer counter
7666 if (__kmp_env_consistency_check)
7667 __kmp_push_parallel(gtid, team->t.t_ident);
7668
7669 KMP_MB(); /* Flush all pending memory write invalidates. */
7670 }
7671
7672 void __kmp_run_after_invoked_task(int gtid, int tid, kmp_info_t *this_thr,
7673 kmp_team_t *team) {
7674 if (__kmp_env_consistency_check)
7675 __kmp_pop_parallel(gtid, team->t.t_ident);
7676
7677 __kmp_finish_implicit_task(this_thr);
7678 }
7679
7680 int __kmp_invoke_task_func(int gtid) {
7681 int rc;
7682 int tid = __kmp_tid_from_gtid(gtid);
7683 kmp_info_t *this_thr = __kmp_threads[gtid];
7684 kmp_team_t *team = this_thr->th.th_team;
7685
7686 __kmp_run_before_invoked_task(gtid, tid, this_thr, team);
7687 #if USE_ITT_BUILD
7688 if (__itt_stack_caller_create_ptr) {
7689 // inform ittnotify about entering user's code
7690 if (team->t.t_stack_id != NULL) {
7691 __kmp_itt_stack_callee_enter((__itt_caller)team->t.t_stack_id);
7692 } else {
7693 KMP_DEBUG_ASSERT(team->t.t_parent->t.t_stack_id != NULL);
7694 __kmp_itt_stack_callee_enter(
7695 (__itt_caller)team->t.t_parent->t.t_stack_id);
7696 }
7697 }
7698 #endif /* USE_ITT_BUILD */
7699 #if INCLUDE_SSC_MARKS
7700 SSC_MARK_INVOKING();
7701 #endif
7702
7703 #if OMPT_SUPPORT
7704 void *dummy;
7705 void **exit_frame_p;
7706 ompt_data_t *my_task_data;
7707 ompt_data_t *my_parallel_data;
7708 int ompt_team_size;
7709
7710 if (ompt_enabled.enabled) {
7711 exit_frame_p = &(team->t.t_implicit_task_taskdata[tid]
7712 .ompt_task_info.frame.exit_frame.ptr);
7713 } else {
7714 exit_frame_p = &dummy;
7715 }
7716
7717 my_task_data =
7718 &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data);
7719 my_parallel_data = &(team->t.ompt_team_info.parallel_data);
7720 if (ompt_enabled.ompt_callback_implicit_task) {
7721 ompt_team_size = team->t.t_nproc;
7722 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
7723 ompt_scope_begin, my_parallel_data, my_task_data, ompt_team_size,
7724 __kmp_tid_from_gtid(gtid), ompt_task_implicit);
7725 OMPT_CUR_TASK_INFO(this_thr)->thread_num = __kmp_tid_from_gtid(gtid);
7726 }
7727 #endif
7728
7729 #if KMP_STATS_ENABLED
7730 stats_state_e previous_state = KMP_GET_THREAD_STATE();
7731 if (previous_state == stats_state_e::TEAMS_REGION) {
7732 KMP_PUSH_PARTITIONED_TIMER(OMP_teams);
7733 } else {
7734 KMP_PUSH_PARTITIONED_TIMER(OMP_parallel);
7735 }
7736 KMP_SET_THREAD_STATE(IMPLICIT_TASK);
7737 #endif
7738
7739 rc = __kmp_invoke_microtask((microtask_t)TCR_SYNC_PTR(team->t.t_pkfn), gtid,
7740 tid, (int)team->t.t_argc, (void **)team->t.t_argv
7741 #if OMPT_SUPPORT
7742 ,
7743 exit_frame_p
7744 #endif
7745 );
7746 #if OMPT_SUPPORT
7747 *exit_frame_p = NULL;
7748 this_thr->th.ompt_thread_info.parallel_flags = ompt_parallel_team;
7749 #endif
7750
7751 #if KMP_STATS_ENABLED
7752 if (previous_state == stats_state_e::TEAMS_REGION) {
7753 KMP_SET_THREAD_STATE(previous_state);
7754 }
7755 KMP_POP_PARTITIONED_TIMER();
7756 #endif
7757
7758 #if USE_ITT_BUILD
7759 if (__itt_stack_caller_create_ptr) {
7760 // inform ittnotify about leaving user's code
7761 if (team->t.t_stack_id != NULL) {
7762 __kmp_itt_stack_callee_leave((__itt_caller)team->t.t_stack_id);
7763 } else {
7764 KMP_DEBUG_ASSERT(team->t.t_parent->t.t_stack_id != NULL);
7765 __kmp_itt_stack_callee_leave(
7766 (__itt_caller)team->t.t_parent->t.t_stack_id);
7767 }
7768 }
7769 #endif /* USE_ITT_BUILD */
7770 __kmp_run_after_invoked_task(gtid, tid, this_thr, team);
7771
7772 return rc;
7773 }
7774
7775 void __kmp_teams_master(int gtid) {
7776 // This routine is called by all primary threads in teams construct
7777 kmp_info_t *thr = __kmp_threads[gtid];
7778 kmp_team_t *team = thr->th.th_team;
7779 ident_t *loc = team->t.t_ident;
7780 thr->th.th_set_nproc = thr->th.th_teams_size.nth;
7781 KMP_DEBUG_ASSERT(thr->th.th_teams_microtask);
7782 KMP_DEBUG_ASSERT(thr->th.th_set_nproc);
7783 KA_TRACE(20, ("__kmp_teams_master: T#%d, Tid %d, microtask %p\n", gtid,
7784 __kmp_tid_from_gtid(gtid), thr->th.th_teams_microtask));
7785
7786 // This thread is a new CG root. Set up the proper variables.
7787 kmp_cg_root_t *tmp = (kmp_cg_root_t *)__kmp_allocate(sizeof(kmp_cg_root_t));
7788 tmp->cg_root = thr; // Make thr the CG root
7789 // Init to thread limit stored when league primary threads were forked
7790 tmp->cg_thread_limit = thr->th.th_current_task->td_icvs.thread_limit;
7791 tmp->cg_nthreads = 1; // Init counter to one active thread, this one
7792 KA_TRACE(100, ("__kmp_teams_master: Thread %p created node %p and init"
7793 " cg_nthreads to 1\n",
7794 thr, tmp));
7795 tmp->up = thr->th.th_cg_roots;
7796 thr->th.th_cg_roots = tmp;
7797
7798   // Launch the league of teams now, but do not let workers execute
7799 // (they hang on fork barrier until next parallel)
7800 #if INCLUDE_SSC_MARKS
7801 SSC_MARK_FORKING();
7802 #endif
7803 __kmp_fork_call(loc, gtid, fork_context_intel, team->t.t_argc,
7804 (microtask_t)thr->th.th_teams_microtask, // "wrapped" task
7805 VOLATILE_CAST(launch_t) __kmp_invoke_task_func, NULL);
7806 #if INCLUDE_SSC_MARKS
7807 SSC_MARK_JOINING();
7808 #endif
7809 // If the team size was reduced from the limit, set it to the new size
7810 if (thr->th.th_team_nproc < thr->th.th_teams_size.nth)
7811 thr->th.th_teams_size.nth = thr->th.th_team_nproc;
7812 // AC: last parameter "1" eliminates join barrier which won't work because
7813 // worker threads are in a fork barrier waiting for more parallel regions
7814 __kmp_join_call(loc, gtid
7815 #if OMPT_SUPPORT
7816 ,
7817 fork_context_intel
7818 #endif
7819 ,
7820 1);
7821 }
7822
7823 int __kmp_invoke_teams_master(int gtid) {
7824 kmp_info_t *this_thr = __kmp_threads[gtid];
7825 kmp_team_t *team = this_thr->th.th_team;
7826 #if KMP_DEBUG
7827 if (!__kmp_threads[gtid]->th.th_team->t.t_serialized)
7828 KMP_DEBUG_ASSERT((void *)__kmp_threads[gtid]->th.th_team->t.t_pkfn ==
7829 (void *)__kmp_teams_master);
7830 #endif
7831 __kmp_run_before_invoked_task(gtid, 0, this_thr, team);
7832 #if OMPT_SUPPORT
7833 int tid = __kmp_tid_from_gtid(gtid);
7834 ompt_data_t *task_data =
7835 &team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data;
7836 ompt_data_t *parallel_data = &team->t.ompt_team_info.parallel_data;
7837 if (ompt_enabled.ompt_callback_implicit_task) {
7838 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
7839 ompt_scope_begin, parallel_data, task_data, team->t.t_nproc, tid,
7840 ompt_task_initial);
7841 OMPT_CUR_TASK_INFO(this_thr)->thread_num = tid;
7842 }
7843 #endif
7844 __kmp_teams_master(gtid);
7845 #if OMPT_SUPPORT
7846 this_thr->th.ompt_thread_info.parallel_flags = ompt_parallel_league;
7847 #endif
7848 __kmp_run_after_invoked_task(gtid, 0, this_thr, team);
7849 return 1;
7850 }
7851
7852 /* this sets the requested number of threads for the next parallel region
7853 encountered by this team. since this should be enclosed in the forkjoin
7854 critical section it should avoid race conditions with asymmetrical nested
7855 parallelism */
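/* Illustrative example (not from the original source): for a directive such
   as `#pragma omp parallel num_threads(4)`, compilers typically emit a call
   to the __kmpc_push_num_threads entry point, which forwards the requested
   value here so that th_set_nproc is 4 when the next team is forked. */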
7856 void __kmp_push_num_threads(ident_t *id, int gtid, int num_threads) {
7857 kmp_info_t *thr = __kmp_threads[gtid];
7858
7859 if (num_threads > 0)
7860 thr->th.th_set_nproc = num_threads;
7861 }
7862
7863 void __kmp_push_num_threads_list(ident_t *id, int gtid, kmp_uint32 list_length,
7864 int *num_threads_list) {
7865 kmp_info_t *thr = __kmp_threads[gtid];
7866
7867 KMP_DEBUG_ASSERT(list_length > 1);
7868
7869 if (num_threads_list[0] > 0)
7870 thr->th.th_set_nproc = num_threads_list[0];
7871 thr->th.th_set_nested_nth =
7872 (int *)KMP_INTERNAL_MALLOC(list_length * sizeof(int));
7873 for (kmp_uint32 i = 0; i < list_length; ++i)
7874 thr->th.th_set_nested_nth[i] = num_threads_list[i];
7875 thr->th.th_set_nested_nth_sz = list_length;
7876 }
7877
7878 void __kmp_set_strict_num_threads(ident_t *loc, int gtid, int sev,
7879 const char *msg) {
7880 kmp_info_t *thr = __kmp_threads[gtid];
7881 thr->th.th_nt_strict = true;
7882 thr->th.th_nt_loc = loc;
7883 // if sev is unset make fatal
7884 if (sev == severity_warning)
7885 thr->th.th_nt_sev = sev;
7886 else
7887 thr->th.th_nt_sev = severity_fatal;
7888 // if msg is unset, use an appropriate message
7889 if (msg)
7890 thr->th.th_nt_msg = msg;
7891 else
7892 thr->th.th_nt_msg = "Cannot form team with number of threads specified by "
7893 "strict num_threads clause.";
7894 }
7895
7896 static void __kmp_push_thread_limit(kmp_info_t *thr, int num_teams,
7897 int num_threads) {
7898 KMP_DEBUG_ASSERT(thr);
7899 // Remember the number of threads for inner parallel regions
7900 if (!TCR_4(__kmp_init_middle))
7901 __kmp_middle_initialize(); // get internal globals calculated
7902 __kmp_assign_root_init_mask();
7903 KMP_DEBUG_ASSERT(__kmp_avail_proc);
7904 KMP_DEBUG_ASSERT(__kmp_dflt_team_nth);
7905
7906 if (num_threads == 0) {
7907 if (__kmp_teams_thread_limit > 0) {
7908 num_threads = __kmp_teams_thread_limit;
7909 } else {
7910 num_threads = __kmp_avail_proc / num_teams;
7911 }
7912 // adjust num_threads w/o warning as it is not user setting
7913 // num_threads = min(num_threads, nthreads-var, thread-limit-var)
7914 // no thread_limit clause specified - do not change thread-limit-var ICV
7915 if (num_threads > __kmp_dflt_team_nth) {
7916 num_threads = __kmp_dflt_team_nth; // honor nthreads-var ICV
7917 }
7918 if (num_threads > thr->th.th_current_task->td_icvs.thread_limit) {
7919 num_threads = thr->th.th_current_task->td_icvs.thread_limit;
7920     } // prevent team size from exceeding thread-limit-var
7921 if (num_teams * num_threads > __kmp_teams_max_nth) {
7922 num_threads = __kmp_teams_max_nth / num_teams;
7923 }
7924 if (num_threads == 0) {
7925 num_threads = 1;
7926 }
7927 } else {
7928 if (num_threads < 0) {
7929 __kmp_msg(kmp_ms_warning, KMP_MSG(CantFormThrTeam, num_threads, 1),
7930 __kmp_msg_null);
7931 num_threads = 1;
7932 }
7933     // This thread will be the primary thread of the league's primary threads
7934 // Store new thread limit; old limit is saved in th_cg_roots list
7935 thr->th.th_current_task->td_icvs.thread_limit = num_threads;
7936 // num_threads = min(num_threads, nthreads-var)
7937 if (num_threads > __kmp_dflt_team_nth) {
7938 num_threads = __kmp_dflt_team_nth; // honor nthreads-var ICV
7939 }
7940 if (num_teams * num_threads > __kmp_teams_max_nth) {
7941 int new_threads = __kmp_teams_max_nth / num_teams;
7942 if (new_threads == 0) {
7943 new_threads = 1;
7944 }
7945 if (new_threads != num_threads) {
7946 if (!__kmp_reserve_warn) { // user asked for too many threads
7947 __kmp_reserve_warn = 1; // conflicts with KMP_TEAMS_THREAD_LIMIT
7948 __kmp_msg(kmp_ms_warning,
7949 KMP_MSG(CantFormThrTeam, num_threads, new_threads),
7950 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
7951 }
7952 }
7953 num_threads = new_threads;
7954 }
7955 }
7956 thr->th.th_teams_size.nth = num_threads;
7957 }
7958
7959 /* this sets the requested number of teams for the teams region and/or
7960 the number of threads for the next parallel region encountered */
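/* Illustrative example (not from the original source): for a construct such
   as `#pragma omp teams num_teams(4) thread_limit(8)`, the compiler-emitted
   __kmpc_push_num_teams call typically lands here with num_teams == 4 and
   num_threads == 8; __kmp_push_thread_limit then clamps the per-team size
   against nthreads-var, thread-limit-var and __kmp_teams_max_nth. */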
7961 void __kmp_push_num_teams(ident_t *id, int gtid, int num_teams,
7962 int num_threads) {
7963 kmp_info_t *thr = __kmp_threads[gtid];
7964 if (num_teams < 0) {
7965 // OpenMP specification requires requested values to be positive,
7966 // but people can send us any value, so we'd better check
7967 __kmp_msg(kmp_ms_warning, KMP_MSG(NumTeamsNotPositive, num_teams, 1),
7968 __kmp_msg_null);
7969 num_teams = 1;
7970 }
7971 if (num_teams == 0) {
7972 if (__kmp_nteams > 0) {
7973 num_teams = __kmp_nteams;
7974 } else {
7975 num_teams = 1; // default number of teams is 1.
7976 }
7977 }
7978 if (num_teams > __kmp_teams_max_nth) { // if too many teams requested?
7979 if (!__kmp_reserve_warn) {
7980 __kmp_reserve_warn = 1;
7981 __kmp_msg(kmp_ms_warning,
7982 KMP_MSG(CantFormThrTeam, num_teams, __kmp_teams_max_nth),
7983 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
7984 }
7985 num_teams = __kmp_teams_max_nth;
7986 }
7987 // Set number of teams (number of threads in the outer "parallel" of the
7988 // teams)
7989 thr->th.th_set_nproc = thr->th.th_teams_size.nteams = num_teams;
7990
7991 __kmp_push_thread_limit(thr, num_teams, num_threads);
7992 }
7993
7994 /* This sets the requested number of teams for the teams region and/or
7995 the number of threads for the next parallel region encountered */
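/* Illustrative example (not from the original source): the OpenMP 5.1 form
   `num_teams(2:8)` typically reaches this routine (via __kmpc_push_num_teams_51)
   with num_teams_lb == 2 and num_teams_ub == 8. With no thread_limit given
   (num_threads <= 0) and the upper bound within __kmp_teams_max_nth, the upper
   bound of 8 teams is chosen; otherwise the request falls back toward the
   lower bound. */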
7996 void __kmp_push_num_teams_51(ident_t *id, int gtid, int num_teams_lb,
7997 int num_teams_ub, int num_threads) {
7998 kmp_info_t *thr = __kmp_threads[gtid];
7999 KMP_DEBUG_ASSERT(num_teams_lb >= 0 && num_teams_ub >= 0);
8000 KMP_DEBUG_ASSERT(num_teams_ub >= num_teams_lb);
8001 KMP_DEBUG_ASSERT(num_threads >= 0);
8002
8003 if (num_teams_lb > num_teams_ub) {
8004 __kmp_fatal(KMP_MSG(FailedToCreateTeam, num_teams_lb, num_teams_ub),
8005 KMP_HNT(SetNewBound, __kmp_teams_max_nth), __kmp_msg_null);
8006 }
8007
8008   int num_teams = 1; // default number of teams is 1.
8009
8010 if (num_teams_lb == 0 && num_teams_ub > 0)
8011 num_teams_lb = num_teams_ub;
8012
8013 if (num_teams_lb == 0 && num_teams_ub == 0) { // no num_teams clause
8014 num_teams = (__kmp_nteams > 0) ? __kmp_nteams : num_teams;
8015 if (num_teams > __kmp_teams_max_nth) {
8016 if (!__kmp_reserve_warn) {
8017 __kmp_reserve_warn = 1;
8018 __kmp_msg(kmp_ms_warning,
8019 KMP_MSG(CantFormThrTeam, num_teams, __kmp_teams_max_nth),
8020 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
8021 }
8022 num_teams = __kmp_teams_max_nth;
8023 }
8024 } else if (num_teams_lb == num_teams_ub) { // requires exact number of teams
8025 num_teams = num_teams_ub;
8026 } else { // num_teams_lb <= num_teams <= num_teams_ub
8027 if (num_threads <= 0) {
8028 if (num_teams_ub > __kmp_teams_max_nth) {
8029 num_teams = num_teams_lb;
8030 } else {
8031 num_teams = num_teams_ub;
8032 }
8033 } else {
8034 num_teams = (num_threads > __kmp_teams_max_nth)
8035 ? num_teams
8036 : __kmp_teams_max_nth / num_threads;
8037 if (num_teams < num_teams_lb) {
8038 num_teams = num_teams_lb;
8039 } else if (num_teams > num_teams_ub) {
8040 num_teams = num_teams_ub;
8041 }
8042 }
8043 }
8044 // Set number of teams (number of threads in the outer "parallel" of the
8045 // teams)
8046 thr->th.th_set_nproc = thr->th.th_teams_size.nteams = num_teams;
8047
8048 __kmp_push_thread_limit(thr, num_teams, num_threads);
8049 }
8050
8051 // Set the proc_bind var to use in the following parallel region.
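// Illustrative example (not from the original source): a `proc_bind(close)`
// clause typically arrives here as proc_bind_close via the
// __kmpc_push_proc_bind entry point and is consumed when the next team is
// formed.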
8052 void __kmp_push_proc_bind(ident_t *id, int gtid, kmp_proc_bind_t proc_bind) {
8053 kmp_info_t *thr = __kmp_threads[gtid];
8054 thr->th.th_set_proc_bind = proc_bind;
8055 }
8056
8057 /* Launch the worker threads into the microtask. */
8058
8059 void __kmp_internal_fork(ident_t *id, int gtid, kmp_team_t *team) {
8060 kmp_info_t *this_thr = __kmp_threads[gtid];
8061
8062 #ifdef KMP_DEBUG
8063 int f;
8064 #endif /* KMP_DEBUG */
8065
8066 KMP_DEBUG_ASSERT(team);
8067 KMP_DEBUG_ASSERT(this_thr->th.th_team == team);
8068 KMP_ASSERT(KMP_MASTER_GTID(gtid));
8069 KMP_MB(); /* Flush all pending memory write invalidates. */
8070
8071 team->t.t_construct = 0; /* no single directives seen yet */
8072 team->t.t_ordered.dt.t_value =
8073 0; /* thread 0 enters the ordered section first */
8074
8075 /* Reset the identifiers on the dispatch buffer */
8076 KMP_DEBUG_ASSERT(team->t.t_disp_buffer);
8077 if (team->t.t_max_nproc > 1) {
8078 int i;
8079 for (i = 0; i < __kmp_dispatch_num_buffers; ++i) {
8080 team->t.t_disp_buffer[i].buffer_index = i;
8081 team->t.t_disp_buffer[i].doacross_buf_idx = i;
8082 }
8083 } else {
8084 team->t.t_disp_buffer[0].buffer_index = 0;
8085 team->t.t_disp_buffer[0].doacross_buf_idx = 0;
8086 }
8087
8088 KMP_MB(); /* Flush all pending memory write invalidates. */
8089 KMP_ASSERT(this_thr->th.th_team == team);
8090
8091 #ifdef KMP_DEBUG
8092 for (f = 0; f < team->t.t_nproc; f++) {
8093 KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
8094 team->t.t_threads[f]->th.th_team_nproc == team->t.t_nproc);
8095 }
8096 #endif /* KMP_DEBUG */
8097
8098 /* release the worker threads so they may begin working */
8099 __kmp_fork_barrier(gtid, 0);
8100 }
8101
8102 void __kmp_internal_join(ident_t *id, int gtid, kmp_team_t *team) {
8103 kmp_info_t *this_thr = __kmp_threads[gtid];
8104
8105 KMP_DEBUG_ASSERT(team);
8106 KMP_DEBUG_ASSERT(this_thr->th.th_team == team);
8107 KMP_ASSERT(KMP_MASTER_GTID(gtid));
8108 KMP_MB(); /* Flush all pending memory write invalidates. */
8109
8110 /* Join barrier after fork */
8111
8112 #ifdef KMP_DEBUG
8113 if (__kmp_threads[gtid] &&
8114 __kmp_threads[gtid]->th.th_team_nproc != team->t.t_nproc) {
8115 __kmp_printf("GTID: %d, __kmp_threads[%d]=%p\n", gtid, gtid,
8116 __kmp_threads[gtid]);
8117 __kmp_printf("__kmp_threads[%d]->th.th_team_nproc=%d, TEAM: %p, "
8118 "team->t.t_nproc=%d\n",
8119 gtid, __kmp_threads[gtid]->th.th_team_nproc, team,
8120 team->t.t_nproc);
8121 __kmp_print_structure();
8122 }
8123 KMP_DEBUG_ASSERT(__kmp_threads[gtid] &&
8124 __kmp_threads[gtid]->th.th_team_nproc == team->t.t_nproc);
8125 #endif /* KMP_DEBUG */
8126
8127 __kmp_join_barrier(gtid); /* wait for everyone */
8128 #if OMPT_SUPPORT
8129 ompt_state_t ompt_state = this_thr->th.ompt_thread_info.state;
8130 if (ompt_enabled.enabled &&
8131 (ompt_state == ompt_state_wait_barrier_teams ||
8132 ompt_state == ompt_state_wait_barrier_implicit_parallel)) {
8133 int ds_tid = this_thr->th.th_info.ds.ds_tid;
8134 ompt_data_t *task_data = OMPT_CUR_TASK_DATA(this_thr);
8135 this_thr->th.ompt_thread_info.state = ompt_state_overhead;
8136 #if OMPT_OPTIONAL
8137 void *codeptr = NULL;
8138 if (KMP_MASTER_TID(ds_tid) &&
8139 (ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) ||
8140 ompt_callbacks.ompt_callback(ompt_callback_sync_region)))
8141 codeptr = OMPT_CUR_TEAM_INFO(this_thr)->master_return_address;
8142
8143 ompt_sync_region_t sync_kind = ompt_sync_region_barrier_implicit_parallel;
8144 if (this_thr->th.ompt_thread_info.parallel_flags & ompt_parallel_league)
8145 sync_kind = ompt_sync_region_barrier_teams;
8146 if (ompt_enabled.ompt_callback_sync_region_wait) {
8147 ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
8148 sync_kind, ompt_scope_end, NULL, task_data, codeptr);
8149 }
8150 if (ompt_enabled.ompt_callback_sync_region) {
8151 ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
8152 sync_kind, ompt_scope_end, NULL, task_data, codeptr);
8153 }
8154 #endif
8155 if (!KMP_MASTER_TID(ds_tid) && ompt_enabled.ompt_callback_implicit_task) {
8156 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
8157 ompt_scope_end, NULL, task_data, 0, ds_tid,
8158 ompt_task_implicit); // TODO: Can this be ompt_task_initial?
8159 }
8160 }
8161 #endif
8162
8163 KMP_MB(); /* Flush all pending memory write invalidates. */
8164 KMP_ASSERT(this_thr->th.th_team == team);
8165 }
8166
8167 /* ------------------------------------------------------------------------ */
8168
8169 #ifdef USE_LOAD_BALANCE
8170
8171 // Return the number of worker threads actively spinning in the hot team,
8172 // if we are at the outermost level of parallelism. Otherwise, return 0.
8173 static int __kmp_active_hot_team_nproc(kmp_root_t *root) {
8174 int i;
8175 int retval;
8176 kmp_team_t *hot_team;
8177
8178 if (root->r.r_active) {
8179 return 0;
8180 }
8181 hot_team = root->r.r_hot_team;
8182 if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME) {
8183 return hot_team->t.t_nproc - 1; // Don't count primary thread
8184 }
8185
8186 // Skip the primary thread - it is accounted for elsewhere.
8187 retval = 0;
8188 for (i = 1; i < hot_team->t.t_nproc; i++) {
8189 if (hot_team->t.t_threads[i]->th.th_active) {
8190 retval++;
8191 }
8192 }
8193 return retval;
8194 }
8195
8196 // Perform an automatic adjustment to the number of
8197 // threads used by the next parallel region.
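// Worked example (illustrative, not from the original source): with
// __kmp_avail_proc = 8, pool_active = 1 and hot_team_active = 1, the code
// below computes team_curr_active = 3; if the measured system_active is 5,
// then retval = 8 - 5 + 3 = 6, which is then clamped into
// [KMP_MIN_NTH, set_nproc].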
8198 static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc) {
8199 int retval;
8200 int pool_active;
8201 int hot_team_active;
8202 int team_curr_active;
8203 int system_active;
8204
8205 KB_TRACE(20, ("__kmp_load_balance_nproc: called root:%p set_nproc:%d\n", root,
8206 set_nproc));
8207 KMP_DEBUG_ASSERT(root);
8208 KMP_DEBUG_ASSERT(root->r.r_root_team->t.t_threads[0]
8209 ->th.th_current_task->td_icvs.dynamic == TRUE);
8210 KMP_DEBUG_ASSERT(set_nproc > 1);
8211
8212 if (set_nproc == 1) {
8213 KB_TRACE(20, ("__kmp_load_balance_nproc: serial execution.\n"));
8214 return 1;
8215 }
8216
8217 // Threads that are active in the thread pool, active in the hot team for this
8218 // particular root (if we are at the outer par level), and the currently
8219 // executing thread (to become the primary thread) are available to add to the
8220 // new team, but are currently contributing to the system load, and must be
8221 // accounted for.
8222 pool_active = __kmp_thread_pool_active_nth;
8223 hot_team_active = __kmp_active_hot_team_nproc(root);
8224 team_curr_active = pool_active + hot_team_active + 1;
8225
8226 // Check the system load.
8227 system_active = __kmp_get_load_balance(__kmp_avail_proc + team_curr_active);
8228 KB_TRACE(30, ("__kmp_load_balance_nproc: system active = %d pool active = %d "
8229 "hot team active = %d\n",
8230 system_active, pool_active, hot_team_active));
8231
8232 if (system_active < 0) {
8233 // There was an error reading the necessary info from /proc, so use the
8234 // thread limit algorithm instead. Once we set __kmp_global.g.g_dynamic_mode
8235 // = dynamic_thread_limit, we shouldn't wind up getting back here.
8236 __kmp_global.g.g_dynamic_mode = dynamic_thread_limit;
8237 KMP_WARNING(CantLoadBalUsing, "KMP_DYNAMIC_MODE=thread limit");
8238
8239 // Make this call behave like the thread limit algorithm.
8240 retval = __kmp_avail_proc - __kmp_nth +
8241 (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
8242 if (retval > set_nproc) {
8243 retval = set_nproc;
8244 }
8245 if (retval < KMP_MIN_NTH) {
8246 retval = KMP_MIN_NTH;
8247 }
8248
8249 KB_TRACE(20, ("__kmp_load_balance_nproc: thread limit exit. retval:%d\n",
8250 retval));
8251 return retval;
8252 }
8253
8254 // There is a slight delay in the load balance algorithm in detecting new
8255 // running procs. The real system load at this instant should be at least as
8256   // large as the number of active OMP threads available to add to the team.
8257 if (system_active < team_curr_active) {
8258 system_active = team_curr_active;
8259 }
8260 retval = __kmp_avail_proc - system_active + team_curr_active;
8261 if (retval > set_nproc) {
8262 retval = set_nproc;
8263 }
8264 if (retval < KMP_MIN_NTH) {
8265 retval = KMP_MIN_NTH;
8266 }
8267
8268 KB_TRACE(20, ("__kmp_load_balance_nproc: exit. retval:%d\n", retval));
8269 return retval;
8270 } // __kmp_load_balance_nproc()
8271
8272 #endif /* USE_LOAD_BALANCE */
8273
8274 /* ------------------------------------------------------------------------ */
8275
8276 /* NOTE: this is called with the __kmp_init_lock held */
8277 void __kmp_cleanup(void) {
8278 int f;
8279
8280 KA_TRACE(10, ("__kmp_cleanup: enter\n"));
8281
8282 if (TCR_4(__kmp_init_parallel)) {
8283 #if KMP_HANDLE_SIGNALS
8284 __kmp_remove_signals();
8285 #endif
8286 TCW_4(__kmp_init_parallel, FALSE);
8287 }
8288
8289 if (TCR_4(__kmp_init_middle)) {
8290 #if KMP_AFFINITY_SUPPORTED
8291 __kmp_affinity_uninitialize();
8292 #endif /* KMP_AFFINITY_SUPPORTED */
8293 __kmp_cleanup_hierarchy();
8294 TCW_4(__kmp_init_middle, FALSE);
8295 }
8296
8297 KA_TRACE(10, ("__kmp_cleanup: go serial cleanup\n"));
8298
8299 if (__kmp_init_serial) {
8300 __kmp_runtime_destroy();
8301 __kmp_init_serial = FALSE;
8302 }
8303
8304 __kmp_cleanup_threadprivate_caches();
8305
8306 for (f = 0; f < __kmp_threads_capacity; f++) {
8307 if (__kmp_root[f] != NULL) {
8308 __kmp_free(__kmp_root[f]);
8309 __kmp_root[f] = NULL;
8310 }
8311 }
8312 __kmp_free(__kmp_threads);
8313   // __kmp_threads and __kmp_root were allocated at once as a single block, so
8314   // there is no need to free __kmp_root separately.
8315 __kmp_threads = NULL;
8316 __kmp_root = NULL;
8317 __kmp_threads_capacity = 0;
8318
8319 // Free old __kmp_threads arrays if they exist.
8320 kmp_old_threads_list_t *ptr = __kmp_old_threads_list;
8321 while (ptr) {
8322 kmp_old_threads_list_t *next = ptr->next;
8323 __kmp_free(ptr->threads);
8324 __kmp_free(ptr);
8325 ptr = next;
8326 }
8327
8328 #if KMP_USE_DYNAMIC_LOCK
8329 __kmp_cleanup_indirect_user_locks();
8330 #else
8331 __kmp_cleanup_user_locks();
8332 #endif
8333 #if OMPD_SUPPORT
8334 if (ompd_state) {
8335 __kmp_free(ompd_env_block);
8336 ompd_env_block = NULL;
8337 ompd_env_block_size = 0;
8338 }
8339 #endif
8340
8341 #if KMP_AFFINITY_SUPPORTED
8342 KMP_INTERNAL_FREE(CCAST(char *, __kmp_cpuinfo_file));
8343 __kmp_cpuinfo_file = NULL;
8344 #endif /* KMP_AFFINITY_SUPPORTED */
8345
8346 #if KMP_USE_ADAPTIVE_LOCKS
8347 #if KMP_DEBUG_ADAPTIVE_LOCKS
8348 __kmp_print_speculative_stats();
8349 #endif
8350 #endif
8351 KMP_INTERNAL_FREE(__kmp_nested_nth.nth);
8352 __kmp_nested_nth.nth = NULL;
8353 __kmp_nested_nth.size = 0;
8354 __kmp_nested_nth.used = 0;
8355
8356 KMP_INTERNAL_FREE(__kmp_nested_proc_bind.bind_types);
8357 __kmp_nested_proc_bind.bind_types = NULL;
8358 __kmp_nested_proc_bind.size = 0;
8359 __kmp_nested_proc_bind.used = 0;
8360 if (__kmp_affinity_format) {
8361 KMP_INTERNAL_FREE(__kmp_affinity_format);
8362 __kmp_affinity_format = NULL;
8363 }
8364
8365 __kmp_i18n_catclose();
8366
8367 #if KMP_USE_HIER_SCHED
8368 __kmp_hier_scheds.deallocate();
8369 #endif
8370
8371 #if KMP_STATS_ENABLED
8372 __kmp_stats_fini();
8373 #endif
8374
8375 KA_TRACE(10, ("__kmp_cleanup: exit\n"));
8376 }
8377
8378 /* ------------------------------------------------------------------------ */
8379
8380 int __kmp_ignore_mppbeg(void) {
8381 char *env;
8382
8383 if ((env = getenv("KMP_IGNORE_MPPBEG")) != NULL) {
8384 if (__kmp_str_match_false(env))
8385 return FALSE;
8386 }
8387 // By default __kmpc_begin() is no-op.
8388 return TRUE;
8389 }
8390
8391 int __kmp_ignore_mppend(void) {
8392 char *env;
8393
8394 if ((env = getenv("KMP_IGNORE_MPPEND")) != NULL) {
8395 if (__kmp_str_match_false(env))
8396 return FALSE;
8397 }
8398 // By default __kmpc_end() is no-op.
8399 return TRUE;
8400 }
8401
8402 void __kmp_internal_begin(void) {
8403 int gtid;
8404 kmp_root_t *root;
8405
8406 /* this is a very important step as it will register new sibling threads
8407 and assign these new uber threads a new gtid */
8408 gtid = __kmp_entry_gtid();
8409 root = __kmp_threads[gtid]->th.th_root;
8410 KMP_ASSERT(KMP_UBER_GTID(gtid));
8411
8412 if (root->r.r_begin)
8413 return;
8414 __kmp_acquire_lock(&root->r.r_begin_lock, gtid);
8415 if (root->r.r_begin) {
8416 __kmp_release_lock(&root->r.r_begin_lock, gtid);
8417 return;
8418 }
8419
8420 root->r.r_begin = TRUE;
8421
8422 __kmp_release_lock(&root->r.r_begin_lock, gtid);
8423 }
8424
8425 /* ------------------------------------------------------------------------ */
8426
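/* Illustrative note (not from the original source): the library_serial,
   library_turnaround and library_throughput values correspond to the
   KMP_LIBRARY=serial|turnaround|throughput setting and are typically reached
   from the kmp_set_library() family of entry points once the runtime is
   initialized. */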
8427 void __kmp_user_set_library(enum library_type arg) {
8428 int gtid;
8429 kmp_root_t *root;
8430 kmp_info_t *thread;
8431
8432 /* first, make sure we are initialized so we can get our gtid */
8433
8434 gtid = __kmp_entry_gtid();
8435 thread = __kmp_threads[gtid];
8436
8437 root = thread->th.th_root;
8438
8439 KA_TRACE(20, ("__kmp_user_set_library: enter T#%d, arg: %d, %d\n", gtid, arg,
8440 library_serial));
8441 if (root->r.r_in_parallel) { /* Must be called in serial section of top-level
8442 thread */
8443 KMP_WARNING(SetLibraryIncorrectCall);
8444 return;
8445 }
8446
8447 switch (arg) {
8448 case library_serial:
8449 thread->th.th_set_nproc = 0;
8450 set__nproc(thread, 1);
8451 break;
8452 case library_turnaround:
8453 thread->th.th_set_nproc = 0;
8454 set__nproc(thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth
8455 : __kmp_dflt_team_nth_ub);
8456 break;
8457 case library_throughput:
8458 thread->th.th_set_nproc = 0;
8459 set__nproc(thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth
8460 : __kmp_dflt_team_nth_ub);
8461 break;
8462 default:
8463 KMP_FATAL(UnknownLibraryType, arg);
8464 }
8465
8466 __kmp_aux_set_library(arg);
8467 }
8468
8469 void __kmp_aux_set_stacksize(size_t arg) {
8470 if (!__kmp_init_serial)
8471 __kmp_serial_initialize();
8472
8473 #if KMP_OS_DARWIN
8474 if (arg & (0x1000 - 1)) {
8475 arg &= ~(0x1000 - 1);
8476 if (arg + 0x1000) /* check for overflow if we round up */
8477 arg += 0x1000;
8478 }
8479 #endif
8480 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
8481
8482 /* only change the default stacksize before the first parallel region */
8483 if (!TCR_4(__kmp_init_parallel)) {
8484 size_t value = arg; /* argument is in bytes */
8485
8486 if (value < __kmp_sys_min_stksize)
8487 value = __kmp_sys_min_stksize;
8488 else if (value > KMP_MAX_STKSIZE)
8489 value = KMP_MAX_STKSIZE;
8490
8491 __kmp_stksize = value;
8492
8493 __kmp_env_stksize = TRUE; /* was KMP_STACKSIZE specified? */
8494 }
8495
8496 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
8497 }
8498
8499 /* set the behaviour of the runtime library */
8500 /* TODO this can cause some odd behaviour with sibling parallelism... */
8501 void __kmp_aux_set_library(enum library_type arg) {
8502 __kmp_library = arg;
8503
8504 switch (__kmp_library) {
8505 case library_serial: {
8506 KMP_INFORM(LibraryIsSerial);
8507 } break;
8508 case library_turnaround:
8509 if (__kmp_use_yield == 1 && !__kmp_use_yield_exp_set)
8510 __kmp_use_yield = 2; // only yield when oversubscribed
8511 break;
8512 case library_throughput:
8513 if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME)
8514 __kmp_dflt_blocktime = KMP_DEFAULT_BLOCKTIME;
8515 break;
8516 default:
8517 KMP_FATAL(UnknownLibraryType, arg);
8518 }
8519 }
8520
8521 /* Getting team information common for all team API */
8522 // Returns NULL if not in teams construct
8523 static kmp_team_t *__kmp_aux_get_team_info(int &teams_serialized) {
8524 kmp_info_t *thr = __kmp_entry_thread();
8525 teams_serialized = 0;
8526 if (thr->th.th_teams_microtask) {
8527 kmp_team_t *team = thr->th.th_team;
8528 int tlevel = thr->th.th_teams_level; // the level of the teams construct
8529 int ii = team->t.t_level;
8530 teams_serialized = team->t.t_serialized;
8531 int level = tlevel + 1;
8532 KMP_DEBUG_ASSERT(ii >= tlevel);
8533 while (ii > level) {
8534 for (teams_serialized = team->t.t_serialized;
8535 (teams_serialized > 0) && (ii > level); teams_serialized--, ii--) {
8536 }
8537 if (team->t.t_serialized && (!teams_serialized)) {
8538 team = team->t.t_parent;
8539 continue;
8540 }
8541 if (ii > level) {
8542 team = team->t.t_parent;
8543 ii--;
8544 }
8545 }
8546 return team;
8547 }
8548 return NULL;
8549 }
8550
8551 int __kmp_aux_get_team_num() {
8552 int serialized;
8553 kmp_team_t *team = __kmp_aux_get_team_info(serialized);
8554 if (team) {
8555 if (serialized > 1) {
8556 return 0; // teams region is serialized ( 1 team of 1 thread ).
8557 } else {
8558 return team->t.t_master_tid;
8559 }
8560 }
8561 return 0;
8562 }
8563
8564 int __kmp_aux_get_num_teams() {
8565 int serialized;
8566 kmp_team_t *team = __kmp_aux_get_team_info(serialized);
8567 if (team) {
8568 if (serialized > 1) {
8569 return 1;
8570 } else {
8571 return team->t.t_parent->t.t_nproc;
8572 }
8573 }
8574 return 1;
8575 }
8576
8577 /* ------------------------------------------------------------------------ */
8578
8579 /*
8580 * Affinity Format Parser
8581 *
8582 * Field is in form of: %[[[0].]size]type
8583 * % and type are required (%% means print a literal '%')
8584 * type is either single char or long name surrounded by {},
8585 * e.g., N or {num_threads}
8586 * 0 => leading zeros
8587 * . => right justified when size is specified
8588 * by default output is left justified
8589 * size is the *minimum* field length
8590 * All other characters are printed as is
8591 *
8592  * Available field types (matching __kmp_affinity_format_table below):
8593  * L {nesting_level} - omp_get_level()
8594  * n {thread_num} - omp_get_thread_num(), N {num_threads} - omp_get_num_threads()
8595  * t {team_num} - omp_get_team_num(), T {num_teams} - omp_get_num_teams()
8596  * H {host} - name of host machine
8597  * P {process_id} - process id (integer)
8598  * i {native_thread_id} - native thread identifier (integer)
8599  * a {ancestor_tnum} - omp_get_ancestor_thread_num(omp_get_level()-1)
8600  * A {thread_affinity} - comma separated list of integers or integer ranges
8601  * (values of affinity mask)
8602 *
8603 * Implementation-specific field types can be added
8604 * If a type is unknown, print "undefined"
8605 */
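/*
 * Illustrative example (not from the original source): with the format
 * string "host=%H tid=%0.4n of %N", a thread might print something like
 * "host=node01 tid=0002 of 8"; %H expands to the host name, %0.4n to the
 * zero-padded, right-justified thread number with minimum width 4, and %N
 * to the team size (the host name and values shown are hypothetical).
 */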
8606
8607 // Structure holding the short name, long name, and corresponding data type
8608 // for snprintf. A table of these will represent the entire valid keyword
8609 // field types.
8610 typedef struct kmp_affinity_format_field_t {
8611 char short_name; // from spec e.g., L -> thread level
8612 const char *long_name; // from spec thread_level -> thread level
8613 char field_format; // data type for snprintf (typically 'd' or 's'
8614 // for integer or string)
8615 } kmp_affinity_format_field_t;
8616
8617 static const kmp_affinity_format_field_t __kmp_affinity_format_table[] = {
8618 #if KMP_AFFINITY_SUPPORTED
8619 {'A', "thread_affinity", 's'},
8620 #endif
8621 {'t', "team_num", 'd'},
8622 {'T', "num_teams", 'd'},
8623 {'L', "nesting_level", 'd'},
8624 {'n', "thread_num", 'd'},
8625 {'N', "num_threads", 'd'},
8626 {'a', "ancestor_tnum", 'd'},
8627 {'H', "host", 's'},
8628 {'P', "process_id", 'd'},
8629 {'i', "native_thread_id", 'd'}};
8630
8631 // Return the number of characters it takes to hold field
8632 static int __kmp_aux_capture_affinity_field(int gtid, const kmp_info_t *th,
8633 const char **ptr,
8634 kmp_str_buf_t *field_buffer) {
8635 int rc, format_index, field_value;
8636 const char *width_left, *width_right;
8637 bool pad_zeros, right_justify, parse_long_name, found_valid_name;
8638 static const int FORMAT_SIZE = 20;
8639 char format[FORMAT_SIZE] = {0};
8640 char absolute_short_name = 0;
8641
8642 KMP_DEBUG_ASSERT(gtid >= 0);
8643 KMP_DEBUG_ASSERT(th);
8644 KMP_DEBUG_ASSERT(**ptr == '%');
8645 KMP_DEBUG_ASSERT(field_buffer);
8646
8647 __kmp_str_buf_clear(field_buffer);
8648
8649 // Skip the initial %
8650 (*ptr)++;
8651
8652 // Check for %% first
8653 if (**ptr == '%') {
8654 __kmp_str_buf_cat(field_buffer, "%", 1);
8655 (*ptr)++; // skip over the second %
8656 return 1;
8657 }
8658
8659 // Parse field modifiers if they are present
8660 pad_zeros = false;
8661 if (**ptr == '0') {
8662 pad_zeros = true;
8663 (*ptr)++; // skip over 0
8664 }
8665 right_justify = false;
8666 if (**ptr == '.') {
8667 right_justify = true;
8668 (*ptr)++; // skip over .
8669 }
8670 // Parse width of field: [width_left, width_right)
8671 width_left = width_right = NULL;
8672 if (**ptr >= '0' && **ptr <= '9') {
8673 width_left = *ptr;
8674 SKIP_DIGITS(*ptr);
8675 width_right = *ptr;
8676 }
8677
8678 // Create the format for KMP_SNPRINTF based on flags parsed above
8679 format_index = 0;
8680 format[format_index++] = '%';
8681 if (!right_justify)
8682 format[format_index++] = '-';
8683 if (pad_zeros)
8684 format[format_index++] = '0';
8685 if (width_left && width_right) {
8686 int i = 0;
8687 // Only allow 8 digit number widths.
8688 // This also prevents overflowing format variable
8689 while (i < 8 && width_left < width_right) {
8690 format[format_index++] = *width_left;
8691 width_left++;
8692 i++;
8693 }
8694 }
8695
8696 // Parse a name (long or short)
8697 // Canonicalize the name into absolute_short_name
8698 found_valid_name = false;
8699 parse_long_name = (**ptr == '{');
8700 if (parse_long_name)
8701 (*ptr)++; // skip initial left brace
8702 for (size_t i = 0; i < sizeof(__kmp_affinity_format_table) /
8703 sizeof(__kmp_affinity_format_table[0]);
8704 ++i) {
8705 char short_name = __kmp_affinity_format_table[i].short_name;
8706 const char *long_name = __kmp_affinity_format_table[i].long_name;
8707 char field_format = __kmp_affinity_format_table[i].field_format;
8708 if (parse_long_name) {
8709 size_t length = KMP_STRLEN(long_name);
8710 if (strncmp(*ptr, long_name, length) == 0) {
8711 found_valid_name = true;
8712 (*ptr) += length; // skip the long name
8713 }
8714 } else if (**ptr == short_name) {
8715 found_valid_name = true;
8716 (*ptr)++; // skip the short name
8717 }
8718 if (found_valid_name) {
8719 format[format_index++] = field_format;
8720 format[format_index++] = '\0';
8721 absolute_short_name = short_name;
8722 break;
8723 }
8724 }
8725 if (parse_long_name) {
8726 if (**ptr != '}') {
8727 absolute_short_name = 0;
8728 } else {
8729 (*ptr)++; // skip over the right brace
8730 }
8731 }
8732
8733 // Attempt to fill the buffer with the requested
8734 // value using snprintf within __kmp_str_buf_print()
8735 switch (absolute_short_name) {
8736 case 't':
8737 rc = __kmp_str_buf_print(field_buffer, format, __kmp_aux_get_team_num());
8738 break;
8739 case 'T':
8740 rc = __kmp_str_buf_print(field_buffer, format, __kmp_aux_get_num_teams());
8741 break;
8742 case 'L':
8743 rc = __kmp_str_buf_print(field_buffer, format, th->th.th_team->t.t_level);
8744 break;
8745 case 'n':
8746 rc = __kmp_str_buf_print(field_buffer, format, __kmp_tid_from_gtid(gtid));
8747 break;
8748 case 'H': {
8749 static const int BUFFER_SIZE = 256;
8750 char buf[BUFFER_SIZE];
8751 __kmp_expand_host_name(buf, BUFFER_SIZE);
8752 rc = __kmp_str_buf_print(field_buffer, format, buf);
8753 } break;
8754 case 'P':
8755 rc = __kmp_str_buf_print(field_buffer, format, getpid());
8756 break;
8757 case 'i':
8758 rc = __kmp_str_buf_print(field_buffer, format, __kmp_gettid());
8759 break;
8760 case 'N':
8761 rc = __kmp_str_buf_print(field_buffer, format, th->th.th_team->t.t_nproc);
8762 break;
8763 case 'a':
8764 field_value =
8765 __kmp_get_ancestor_thread_num(gtid, th->th.th_team->t.t_level - 1);
8766 rc = __kmp_str_buf_print(field_buffer, format, field_value);
8767 break;
8768 #if KMP_AFFINITY_SUPPORTED
8769 case 'A': {
8770 kmp_str_buf_t buf;
8771 __kmp_str_buf_init(&buf);
8772 __kmp_affinity_str_buf_mask(&buf, th->th.th_affin_mask);
8773 rc = __kmp_str_buf_print(field_buffer, format, buf.str);
8774 __kmp_str_buf_free(&buf);
8775 } break;
8776 #endif
8777 default:
8778     // According to the spec, if an implementation does not have info for the
8779     // field type, then "undefined" is printed
8780 rc = __kmp_str_buf_print(field_buffer, "%s", "undefined");
8781 // Skip the field
8782 if (parse_long_name) {
8783 SKIP_TOKEN(*ptr);
8784 if (**ptr == '}')
8785 (*ptr)++;
8786 } else {
8787 (*ptr)++;
8788 }
8789 }
8790
8791 KMP_ASSERT(format_index <= FORMAT_SIZE);
8792 return rc;
8793 }
8794
8795 /*
8796 * Return number of characters needed to hold the affinity string
8797 * (not including null byte character)
8798 * The resultant string is printed to buffer, which the caller can then
8799 * handle afterwards
8800 */
8801 size_t __kmp_aux_capture_affinity(int gtid, const char *format,
8802 kmp_str_buf_t *buffer) {
8803 const char *parse_ptr;
8804 size_t retval;
8805 const kmp_info_t *th;
8806 kmp_str_buf_t field;
8807
8808 KMP_DEBUG_ASSERT(buffer);
8809 KMP_DEBUG_ASSERT(gtid >= 0);
8810
8811 __kmp_str_buf_init(&field);
8812 __kmp_str_buf_clear(buffer);
8813
8814 th = __kmp_threads[gtid];
8815 retval = 0;
8816
8817 // If format is NULL or zero-length string, then we use
8818 // affinity-format-var ICV
8819 parse_ptr = format;
8820 if (parse_ptr == NULL || *parse_ptr == '\0') {
8821 parse_ptr = __kmp_affinity_format;
8822 }
8823 KMP_DEBUG_ASSERT(parse_ptr);
8824
8825 while (*parse_ptr != '\0') {
8826 // Parse a field
8827 if (*parse_ptr == '%') {
8828 // Put field in the buffer
8829 int rc = __kmp_aux_capture_affinity_field(gtid, th, &parse_ptr, &field);
8830 __kmp_str_buf_catbuf(buffer, &field);
8831 retval += rc;
8832 } else {
8833 // Put literal character in buffer
8834 __kmp_str_buf_cat(buffer, parse_ptr, 1);
8835 retval++;
8836 parse_ptr++;
8837 }
8838 }
8839 __kmp_str_buf_free(&field);
8840 return retval;
8841 }
8842
8843 // Displays the affinity string to stdout
8844 void __kmp_aux_display_affinity(int gtid, const char *format) {
8845 kmp_str_buf_t buf;
8846 __kmp_str_buf_init(&buf);
8847 __kmp_aux_capture_affinity(gtid, format, &buf);
8848 __kmp_fprintf(kmp_out, "%s" KMP_END_OF_LINE, buf.str);
8849 __kmp_str_buf_free(&buf);
8850 }
8851
8852 /* ------------------------------------------------------------------------ */
8853 void __kmp_aux_set_blocktime(int arg, kmp_info_t *thread, int tid) {
8854 int blocktime = arg; /* argument is in microseconds */
8855 #if KMP_USE_MONITOR
8856 int bt_intervals;
8857 #endif
8858 kmp_int8 bt_set;
8859
8860 __kmp_save_internal_controls(thread);
8861
8862 /* Normalize and set blocktime for the teams */
8863 if (blocktime < KMP_MIN_BLOCKTIME)
8864 blocktime = KMP_MIN_BLOCKTIME;
8865 else if (blocktime > KMP_MAX_BLOCKTIME)
8866 blocktime = KMP_MAX_BLOCKTIME;
8867
8868 set__blocktime_team(thread->th.th_team, tid, blocktime);
8869 set__blocktime_team(thread->th.th_serial_team, 0, blocktime);
8870
8871 #if KMP_USE_MONITOR
8872 /* Calculate and set blocktime intervals for the teams */
8873 bt_intervals = KMP_INTERVALS_FROM_BLOCKTIME(blocktime, __kmp_monitor_wakeups);
8874
8875 set__bt_intervals_team(thread->th.th_team, tid, bt_intervals);
8876 set__bt_intervals_team(thread->th.th_serial_team, 0, bt_intervals);
8877 #endif
8878
8879 /* Set whether blocktime has been set to "TRUE" */
8880 bt_set = TRUE;
8881
8882 set__bt_set_team(thread->th.th_team, tid, bt_set);
8883 set__bt_set_team(thread->th.th_serial_team, 0, bt_set);
8884 #if KMP_USE_MONITOR
8885 KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d, "
8886 "bt_intervals=%d, monitor_updates=%d\n",
8887 __kmp_gtid_from_tid(tid, thread->th.th_team),
8888 thread->th.th_team->t.t_id, tid, blocktime, bt_intervals,
8889 __kmp_monitor_wakeups));
8890 #else
8891 KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d\n",
8892 __kmp_gtid_from_tid(tid, thread->th.th_team),
8893 thread->th.th_team->t.t_id, tid, blocktime));
8894 #endif
8895 }
8896
8897 void __kmp_aux_set_defaults(char const *str, size_t len) {
8898 if (!__kmp_init_serial) {
8899 __kmp_serial_initialize();
8900 }
8901 __kmp_env_initialize(str);
8902
8903 if (__kmp_settings || __kmp_display_env || __kmp_display_env_verbose) {
8904 __kmp_env_print();
8905 }
8906 } // __kmp_aux_set_defaults
8907
8908 /* ------------------------------------------------------------------------ */
8909 /* internal fast reduction routines */
8910
8911 PACKED_REDUCTION_METHOD_T
8912 __kmp_determine_reduction_method(
8913 ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars, size_t reduce_size,
8914 void *reduce_data, void (*reduce_func)(void *lhs_data, void *rhs_data),
8915 kmp_critical_name *lck) {
8916
8917 // Default reduction method: critical construct ( lck != NULL, like in current
8918 // PAROPT )
8919 // If ( reduce_data!=NULL && reduce_func!=NULL ): the tree-reduction method
8920 // can be selected by RTL
8921 // If loc->flags contains KMP_IDENT_ATOMIC_REDUCE, the atomic reduce method
8922 // can be selected by RTL
8923 // Finally, it's up to OpenMP RTL to make a decision on which method to select
8924 // among generated by PAROPT.
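  // Illustrative example (not from the original source): on a 64-bit Linux
  // target where the compiler generated both the atomic and the tree method,
  // a team of 16 threads exceeds the teamsize_cutoff below, so
  // TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER is selected; a team of 4 or
  // fewer threads with the atomic method available selects
  // atomic_reduce_block instead.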
8925
8926 PACKED_REDUCTION_METHOD_T retval;
8927
8928 int team_size;
8929
8930 KMP_DEBUG_ASSERT(lck); // it would be nice to test ( lck != 0 )
8931
8932 #define FAST_REDUCTION_ATOMIC_METHOD_GENERATED \
8933 (loc && \
8934 ((loc->flags & (KMP_IDENT_ATOMIC_REDUCE)) == (KMP_IDENT_ATOMIC_REDUCE)))
8935 #define FAST_REDUCTION_TREE_METHOD_GENERATED ((reduce_data) && (reduce_func))
8936
8937 retval = critical_reduce_block;
8938
8939   // another choice of getting a team size (with 1 dynamic dereference) is slower
8940 team_size = __kmp_get_team_num_threads(global_tid);
8941 if (team_size == 1) {
8942
8943 retval = empty_reduce_block;
8944
8945 } else {
8946
8947 int atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;
8948
8949 #if KMP_ARCH_X86_64 || KMP_ARCH_PPC64 || KMP_ARCH_AARCH64 || \
8950 KMP_ARCH_MIPS64 || KMP_ARCH_RISCV64 || KMP_ARCH_LOONGARCH64 || \
8951 KMP_ARCH_VE || KMP_ARCH_S390X || KMP_ARCH_WASM
8952
8953 #if KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD || \
8954 KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN || KMP_OS_HURD || \
8955 KMP_OS_SOLARIS || KMP_OS_WASI || KMP_OS_AIX
8956
8957 int teamsize_cutoff = 4;
8958
8959 #if KMP_MIC_SUPPORTED
8960 if (__kmp_mic_type != non_mic) {
8961 teamsize_cutoff = 8;
8962 }
8963 #endif
8964 int tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
8965 if (tree_available) {
8966 if (team_size <= teamsize_cutoff) {
8967 if (atomic_available) {
8968 retval = atomic_reduce_block;
8969 }
8970 } else {
8971 retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER;
8972 }
8973 } else if (atomic_available) {
8974 retval = atomic_reduce_block;
8975 }
8976 #else
8977 #error "Unknown or unsupported OS"
8978 #endif // KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD ||
8979 // KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN || KMP_OS_HURD ||
8980 // KMP_OS_SOLARIS || KMP_OS_WASI || KMP_OS_AIX
8981
8982 #elif KMP_ARCH_X86 || KMP_ARCH_ARM || KMP_ARCH_AARCH || KMP_ARCH_MIPS || \
8983 KMP_ARCH_WASM || KMP_ARCH_PPC || KMP_ARCH_AARCH64_32
8984
8985 #if KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD || \
8986 KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_HURD || KMP_OS_SOLARIS || \
8987 KMP_OS_WASI || KMP_OS_AIX
8988
8989 // basic tuning
8990
8991 if (atomic_available) {
8992 if (num_vars <= 2) { // && ( team_size <= 8 ) due to false-sharing ???
8993 retval = atomic_reduce_block;
8994 }
8995 } // otherwise: use critical section
8996
8997 #elif KMP_OS_DARWIN
8998
8999 int tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
9000 if (atomic_available && (num_vars <= 3)) {
9001 retval = atomic_reduce_block;
9002 } else if (tree_available) {
9003 if ((reduce_size > (9 * sizeof(kmp_real64))) &&
9004 (reduce_size < (2000 * sizeof(kmp_real64)))) {
9005 retval = TREE_REDUCE_BLOCK_WITH_PLAIN_BARRIER;
9006 }
9007 } // otherwise: use critical section
9008
9009 #else
9010 #error "Unknown or unsupported OS"
9011 #endif
9012
9013 #else
9014 #error "Unknown or unsupported architecture"
9015 #endif
9016 }
9017
9018 // KMP_FORCE_REDUCTION
9019
9020 // If the team is serialized (team_size == 1), ignore the forced reduction
9021 // method and stay with the unsynchronized method (empty_reduce_block)
9022 if (__kmp_force_reduction_method != reduction_method_not_defined &&
9023 team_size != 1) {
9024
9025 PACKED_REDUCTION_METHOD_T forced_retval = critical_reduce_block;
9026
9027 int atomic_available, tree_available;
9028
9029 switch ((forced_retval = __kmp_force_reduction_method)) {
9030 case critical_reduce_block:
9031 KMP_ASSERT(lck); // lck should be != 0
9032 break;
9033
9034 case atomic_reduce_block:
9035 atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;
9036 if (!atomic_available) {
9037 KMP_WARNING(RedMethodNotSupported, "atomic");
9038 forced_retval = critical_reduce_block;
9039 }
9040 break;
9041
9042 case tree_reduce_block:
9043 tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
9044 if (!tree_available) {
9045 KMP_WARNING(RedMethodNotSupported, "tree");
9046 forced_retval = critical_reduce_block;
9047 } else {
9048 #if KMP_FAST_REDUCTION_BARRIER
9049 forced_retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER;
9050 #endif
9051 }
9052 break;
9053
9054 default:
9055 KMP_ASSERT(0); // "unsupported method specified"
9056 }
9057
9058 retval = forced_retval;
9059 }
9060
9061 KA_TRACE(10, ("reduction method selected=%08x\n", retval));
9062
9063 #undef FAST_REDUCTION_TREE_METHOD_GENERATED
9064 #undef FAST_REDUCTION_ATOMIC_METHOD_GENERATED
9065
9066 return (retval);
9067 }
9068 // this function is for testing set/get/determine reduce method
9069 kmp_int32 __kmp_get_reduce_method(void) {
9070 return ((__kmp_entry_thread()->th.th_local.packed_reduction_method) >> 8);
9071 }
9072
9073 // Soft pause sets up threads to ignore blocktime and just go to sleep.
9074 // Spin-wait code checks __kmp_pause_status and reacts accordingly.
9075 void __kmp_soft_pause() { __kmp_pause_status = kmp_soft_paused; }
9076
9077 // Hard pause shuts down the runtime completely. Resume happens naturally when
9078 // OpenMP is used subsequently.
9079 void __kmp_hard_pause() {
9080 __kmp_pause_status = kmp_hard_paused;
9081 __kmp_internal_end_thread(-1);
9082 }
9083
9084 // Soft resume sets __kmp_pause_status, and wakes up all threads.
9085 void __kmp_resume_if_soft_paused() {
9086 if (__kmp_pause_status == kmp_soft_paused) {
9087 __kmp_pause_status = kmp_not_paused;
9088
9089 for (int gtid = 1; gtid < __kmp_threads_capacity; ++gtid) {
9090 kmp_info_t *thread = __kmp_threads[gtid];
9091 if (thread) { // Wake it if sleeping
9092 kmp_flag_64<> fl(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go,
9093 thread);
9094 if (fl.is_sleeping())
9095 fl.resume(gtid);
9096 else if (__kmp_try_suspend_mx(thread)) { // got suspend lock
9097 __kmp_unlock_suspend_mx(thread); // unlock it; it won't sleep
9098 } else { // thread holds the lock and may sleep soon
9099 do { // until either the thread sleeps, or we can get the lock
9100 if (fl.is_sleeping()) {
9101 fl.resume(gtid);
9102 break;
9103 } else if (__kmp_try_suspend_mx(thread)) {
9104 __kmp_unlock_suspend_mx(thread);
9105 break;
9106 }
9107 } while (1);
9108 }
9109 }
9110 }
9111 }
9112 }
9113
9114 // This function is called via __kmpc_pause_resource. Returns 0 if successful.
9115 // TODO: add warning messages
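// Illustrative mapping (not from the original source): a user call such as
// omp_pause_resource_all(omp_pause_soft) typically reaches this routine via
// __kmpc_pause_resource with level == kmp_soft_paused, omp_pause_hard maps
// to kmp_hard_paused, and a resume request arrives as kmp_not_paused.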
9116 int __kmp_pause_resource(kmp_pause_status_t level) {
9117 if (level == kmp_not_paused) { // requesting resume
9118 if (__kmp_pause_status == kmp_not_paused) {
9119 // error message about runtime not being paused, so can't resume
9120 return 1;
9121 } else {
9122 KMP_DEBUG_ASSERT(__kmp_pause_status == kmp_soft_paused ||
9123 __kmp_pause_status == kmp_hard_paused);
9124 __kmp_pause_status = kmp_not_paused;
9125 return 0;
9126 }
9127 } else if (level == kmp_soft_paused) { // requesting soft pause
9128 if (__kmp_pause_status != kmp_not_paused) {
9129 // error message about already being paused
9130 return 1;
9131 } else {
9132 __kmp_soft_pause();
9133 return 0;
9134 }
9135 } else if (level == kmp_hard_paused) { // requesting hard pause
9136 if (__kmp_pause_status != kmp_not_paused) {
9137 // error message about already being paused
9138 return 1;
9139 } else {
9140 __kmp_hard_pause();
9141 return 0;
9142 }
9143 } else {
9144 // error message about invalid level
9145 return 1;
9146 }
9147 }
9148
9149 void __kmp_omp_display_env(int verbose) {
9150 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
9151 if (__kmp_init_serial == 0)
9152 __kmp_do_serial_initialize();
9153 __kmp_display_env_impl(!verbose, verbose);
9154 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
9155 }
9156
9157 // The team size is changing, so distributed barrier must be modified
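// Illustrative summary (inferred from the code below, not from the original
// source): during a resize, th_used_in_team acts as a small state machine:
//   0 = not part of the team, 1 = in use by the team,
//   2 = marked by the primary thread to transition out of the team,
//   3 = marked to transition back into the team
//       (see __kmp_add_threads_to_team below).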
9158 void __kmp_resize_dist_barrier(kmp_team_t *team, int old_nthreads,
9159 int new_nthreads) {
9160 KMP_DEBUG_ASSERT(__kmp_barrier_release_pattern[bs_forkjoin_barrier] ==
9161 bp_dist_bar);
9162 kmp_info_t **other_threads = team->t.t_threads;
9163
9164 // We want all the workers to stop waiting on the barrier while we adjust the
9165 // size of the team.
9166 for (int f = 1; f < old_nthreads; ++f) {
9167 KMP_DEBUG_ASSERT(other_threads[f] != NULL);
9168 // Ignore threads that are already inactive or not present in the team
9169 if (team->t.t_threads[f]->th.th_used_in_team.load() == 0) {
9170 // teams construct causes thread_limit to get passed in, and some of
9171 // those could be inactive; just ignore them
9172 continue;
9173 }
9174 // If thread is transitioning still to in_use state, wait for it
9175 if (team->t.t_threads[f]->th.th_used_in_team.load() == 3) {
9176 while (team->t.t_threads[f]->th.th_used_in_team.load() == 3)
9177 KMP_CPU_PAUSE();
9178 }
9179 // The thread should be in_use now
9180 KMP_DEBUG_ASSERT(team->t.t_threads[f]->th.th_used_in_team.load() == 1);
9181 // Transition to unused state
9182 team->t.t_threads[f]->th.th_used_in_team.store(2);
9183 KMP_DEBUG_ASSERT(team->t.t_threads[f]->th.th_used_in_team.load() == 2);
9184 }
9185 // Release all the workers
9186 team->t.b->go_release();
9187
9188 KMP_MFENCE();
9189
9190 // Workers should see transition status 2 and move to 0; but may need to be
9191 // woken up first
9192 int count = old_nthreads - 1;
9193 while (count > 0) {
9194 count = old_nthreads - 1;
9195 for (int f = 1; f < old_nthreads; ++f) {
9196 if (other_threads[f]->th.th_used_in_team.load() != 0) {
9197 if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) { // Wake up the workers
9198 kmp_atomic_flag_64<> *flag = (kmp_atomic_flag_64<> *)CCAST(
9199 void *, other_threads[f]->th.th_sleep_loc);
9200 __kmp_atomic_resume_64(other_threads[f]->th.th_info.ds.ds_gtid, flag);
9201 }
9202 } else {
9203 KMP_DEBUG_ASSERT(team->t.t_threads[f]->th.th_used_in_team.load() == 0);
9204 count--;
9205 }
9206 }
9207 }
9208 // Now update the barrier size
9209 team->t.b->update_num_threads(new_nthreads);
9210 team->t.b->go_reset();
9211 }

void __kmp_add_threads_to_team(kmp_team_t *team, int new_nthreads) {
  // Add the threads back to the team
  KMP_DEBUG_ASSERT(team);
  // Threads were paused and pointed at th_used_in_team temporarily during a
  // resize of the team. We're going to set th_used_in_team to 3 to indicate to
  // the thread that it should transition itself back into the team. Then, if
  // blocktime isn't infinite, the thread could be sleeping, so we send a resume
  // to wake it up.
  for (int f = 1; f < new_nthreads; ++f) {
    KMP_DEBUG_ASSERT(team->t.t_threads[f]);
    KMP_COMPARE_AND_STORE_ACQ32(&(team->t.t_threads[f]->th.th_used_in_team), 0,
                                3);
    if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) { // Wake up sleeping threads
      __kmp_resume_32(team->t.t_threads[f]->th.th_info.ds.ds_gtid,
                      (kmp_flag_32<false, false> *)NULL);
    }
  }
  // The threads should be transitioning to the team; when they are done, they
  // should have set th_used_in_team to 1. This loop forces the primary thread
  // to wait until all threads have moved into the team and are waiting in the
  // barrier.
  int count = new_nthreads - 1;
  while (count > 0) {
    count = new_nthreads - 1;
    for (int f = 1; f < new_nthreads; ++f) {
      if (team->t.t_threads[f]->th.th_used_in_team.load() == 1) {
        count--;
      }
    }
  }
}
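
// Note (summary, not a contract): when the distributed barrier is in use and a
// team is resized, callers in this file are expected to pair the two routines
// above -- first __kmp_resize_dist_barrier() to move the existing workers out
// and resize the barrier structure, then __kmp_add_threads_to_team() once the
// new thread array is in place -- so that no worker spins on a barrier whose
// size is changing underneath it.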

// Globals and functions for hidden helper task
kmp_info_t **__kmp_hidden_helper_threads;
kmp_info_t *__kmp_hidden_helper_main_thread;
std::atomic<kmp_int32> __kmp_unexecuted_hidden_helper_tasks;
#if KMP_OS_LINUX
kmp_int32 __kmp_hidden_helper_threads_num = 8;
kmp_int32 __kmp_enable_hidden_helper = TRUE;
#else
kmp_int32 __kmp_hidden_helper_threads_num = 0;
kmp_int32 __kmp_enable_hidden_helper = FALSE;
#endif
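
// Illustrative sketch (not part of the runtime): hidden helper threads exist
// to execute hidden helper tasks, such as those generated for asynchronous
// offloading, so that they make progress independently of the encountering
// team, e.g.:
//
//   // The deferred target region below may be run by a hidden helper thread.
//   #pragma omp target nowait map(to : a[0 : n]) map(from : b[0 : n])
//   { /* ... */ }
//   #pragma omp taskwait
//
// The defaults above can be overridden through the settings machinery in
// kmp_settings.cpp.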

namespace {
std::atomic<kmp_int32> __kmp_hit_hidden_helper_threads_num;

void __kmp_hidden_helper_wrapper_fn(int *gtid, int *, ...) {
  // This is an explicit synchronization of all hidden helper threads: a
  // regular thread may push a hidden helper task to a hidden helper thread
  // that has not been awakened even once since being released by the main
  // thread after team creation.
  KMP_ATOMIC_INC(&__kmp_hit_hidden_helper_threads_num);
  while (KMP_ATOMIC_LD_ACQ(&__kmp_hit_hidden_helper_threads_num) !=
         __kmp_hidden_helper_threads_num)
    ;

  // If main thread, then wait for signal
  if (__kmpc_master(nullptr, *gtid)) {
    // First, unset the initial state and release the initial thread
    TCW_4(__kmp_init_hidden_helper_threads, FALSE);
    __kmp_hidden_helper_initz_release();
    __kmp_hidden_helper_main_thread_wait();
    // Now wake up all worker threads
    for (int i = 1; i < __kmp_hit_hidden_helper_threads_num; ++i) {
      __kmp_hidden_helper_worker_thread_signal();
    }
  }
}
} // namespace

void __kmp_hidden_helper_threads_initz_routine() {
  // Create a new root for hidden helper team/threads
  const int gtid = __kmp_register_root(TRUE);
  __kmp_hidden_helper_main_thread = __kmp_threads[gtid];
  __kmp_hidden_helper_threads = &__kmp_threads[gtid];
  __kmp_hidden_helper_main_thread->th.th_set_nproc =
      __kmp_hidden_helper_threads_num;

  KMP_ATOMIC_ST_REL(&__kmp_hit_hidden_helper_threads_num, 0);

  __kmpc_fork_call(nullptr, 0, __kmp_hidden_helper_wrapper_fn);

  // Set the initialization flag to FALSE
  TCW_SYNC_4(__kmp_init_hidden_helper, FALSE);

  __kmp_hidden_helper_threads_deinitz_release();
}

/* Nesting Mode:
   Set via KMP_NESTING_MODE, which takes an integer.
   Note: we skip duplicate topology levels, and skip levels with only
   one entity.
   KMP_NESTING_MODE=0 is the default and does not use nesting mode.
   KMP_NESTING_MODE=1 sets as many nesting levels as there are distinct levels
   in the topology, and initializes the number of threads at each of those
   levels to the number of entities at each level, respectively, below the
   entity at the parent level.
   KMP_NESTING_MODE=N, where N>1, attempts to create up to N nesting levels,
   but starts with nesting OFF -- max-active-levels-var is 1 -- and requires
   the user to turn nesting on explicitly. This is an even more experimental
   variant of an experimental feature, and may change or go away in the
   future.
*/
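
// Illustrative example (assumption, not normative): with KMP_NESTING_MODE=1 on
// a machine whose topology exposes distinct socket and core levels, a doubly
// nested region such as
//
//   #pragma omp parallel        // one thread per socket
//   {
//     #pragma omp parallel      // one thread per core within each socket
//     { /* ... */ }
//   }
//
// picks up its thread counts from the topology without the user setting
// OMP_NUM_THREADS or OMP_MAX_ACTIVE_LEVELS.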

// Allocate space to store nesting levels
void __kmp_init_nesting_mode() {
  int levels = KMP_HW_LAST;
  __kmp_nesting_mode_nlevels = levels;
  __kmp_nesting_nth_level = (int *)KMP_INTERNAL_MALLOC(levels * sizeof(int));
  for (int i = 0; i < levels; ++i)
    __kmp_nesting_nth_level[i] = 0;
  if (__kmp_nested_nth.size < levels) {
    __kmp_nested_nth.nth =
        (int *)KMP_INTERNAL_REALLOC(__kmp_nested_nth.nth, levels * sizeof(int));
    __kmp_nested_nth.size = levels;
  }
}

// Set # threads for top levels of nesting; must be called after topology set
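// Worked example (hypothetical machine, added for clarity): on a 2-socket
// system with 8 cores per socket and 2 HW threads per core, the topology loop
// below yields __kmp_nesting_nth_level = {2, 8, 2} and three nesting levels;
// a level whose ratio is 1 (e.g. one HW thread per core) would instead be
// skipped via the loc-- adjustment.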
void __kmp_set_nesting_mode_threads() {
  kmp_info_t *thread = __kmp_threads[__kmp_entry_gtid()];

  if (__kmp_nesting_mode == 1)
    __kmp_nesting_mode_nlevels = KMP_MAX_ACTIVE_LEVELS_LIMIT;
  else if (__kmp_nesting_mode > 1)
    __kmp_nesting_mode_nlevels = __kmp_nesting_mode;

  if (__kmp_topology) { // use topology info
    int loc, hw_level;
    for (loc = 0, hw_level = 0; hw_level < __kmp_topology->get_depth() &&
                                loc < __kmp_nesting_mode_nlevels;
         loc++, hw_level++) {
      __kmp_nesting_nth_level[loc] = __kmp_topology->get_ratio(hw_level);
      if (__kmp_nesting_nth_level[loc] == 1)
        loc--;
    }
    // Make sure all cores are used
    if (__kmp_nesting_mode > 1 && loc > 1) {
      int core_level = __kmp_topology->get_level(KMP_HW_CORE);
      int num_cores = __kmp_topology->get_count(core_level);
      int upper_levels = 1;
      for (int level = 0; level < loc - 1; ++level)
        upper_levels *= __kmp_nesting_nth_level[level];
      if (upper_levels * __kmp_nesting_nth_level[loc - 1] < num_cores)
        __kmp_nesting_nth_level[loc - 1] =
            num_cores / __kmp_nesting_nth_level[loc - 2];
    }
    __kmp_nesting_mode_nlevels = loc;
    __kmp_nested_nth.used = __kmp_nesting_mode_nlevels;
  } else { // no topology info available; provide a reasonable estimate
    if (__kmp_avail_proc >= 4) {
      __kmp_nesting_nth_level[0] = __kmp_avail_proc / 2;
      __kmp_nesting_nth_level[1] = 2;
      __kmp_nesting_mode_nlevels = 2;
    } else {
      __kmp_nesting_nth_level[0] = __kmp_avail_proc;
      __kmp_nesting_mode_nlevels = 1;
    }
    __kmp_nested_nth.used = __kmp_nesting_mode_nlevels;
  }
  for (int i = 0; i < __kmp_nesting_mode_nlevels; ++i) {
    __kmp_nested_nth.nth[i] = __kmp_nesting_nth_level[i];
  }
  set__nproc(thread, __kmp_nesting_nth_level[0]);
  if (__kmp_nesting_mode > 1 && __kmp_nesting_mode_nlevels > __kmp_nesting_mode)
    __kmp_nesting_mode_nlevels = __kmp_nesting_mode;
  if (get__max_active_levels(thread) > 1) {
    // if max levels was set, set nesting mode levels to same
    __kmp_nesting_mode_nlevels = get__max_active_levels(thread);
  }
  if (__kmp_nesting_mode == 1) // turn on nesting for this case only
    set__max_active_levels(thread, __kmp_nesting_mode_nlevels);
}

// Empty symbols to export (see exports_so.txt) when feature is disabled
extern "C" {
#if !KMP_STATS_ENABLED
void __kmp_reset_stats() {}
#endif
#if !USE_DEBUGGER
int __kmp_omp_debug_struct_info = FALSE;
int __kmp_debugging = FALSE;
#endif
#if !USE_ITT_BUILD || !USE_ITT_NOTIFY
void __kmp_itt_fini_ittlib() {}
void __kmp_itt_init_ittlib() {}
#endif
}

// end of file