1 /* 2 * kmp_runtime.cpp -- KPTS runtime support library 3 */ 4 5 //===----------------------------------------------------------------------===// 6 // 7 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 8 // See https://llvm.org/LICENSE.txt for license information. 9 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 10 // 11 //===----------------------------------------------------------------------===// 12 13 #include "kmp.h" 14 #include "kmp_affinity.h" 15 #include "kmp_atomic.h" 16 #include "kmp_environment.h" 17 #include "kmp_error.h" 18 #include "kmp_i18n.h" 19 #include "kmp_io.h" 20 #include "kmp_itt.h" 21 #include "kmp_settings.h" 22 #include "kmp_stats.h" 23 #include "kmp_str.h" 24 #include "kmp_wait_release.h" 25 #include "kmp_wrapper_getpid.h" 26 #include "kmp_dispatch.h" 27 #if KMP_USE_HIER_SCHED 28 #include "kmp_dispatch_hier.h" 29 #endif 30 31 #if OMPT_SUPPORT 32 #include "ompt-specific.h" 33 #endif 34 #if OMPD_SUPPORT 35 #include "ompd-specific.h" 36 #endif 37 38 #if OMP_PROFILING_SUPPORT 39 #include "llvm/Support/TimeProfiler.h" 40 static char *ProfileTraceFile = nullptr; 41 #endif 42 43 /* these are temporary issues to be dealt with */ 44 #define KMP_USE_PRCTL 0 45 46 #if KMP_OS_WINDOWS 47 #include <process.h> 48 #endif 49 50 #if KMP_OS_WINDOWS 51 // windows does not need include files as it doesn't use shared memory 52 #else 53 #include <sys/mman.h> 54 #include <sys/stat.h> 55 #include <fcntl.h> 56 #define SHM_SIZE 1024 57 #endif 58 59 #if defined(KMP_GOMP_COMPAT) 60 char const __kmp_version_alt_comp[] = 61 KMP_VERSION_PREFIX "alternative compiler support: yes"; 62 #endif /* defined(KMP_GOMP_COMPAT) */ 63 64 char const __kmp_version_omp_api[] = 65 KMP_VERSION_PREFIX "API version: 5.0 (201611)"; 66 67 #ifdef KMP_DEBUG 68 char const __kmp_version_lock[] = 69 KMP_VERSION_PREFIX "lock type: run time selectable"; 70 #endif /* KMP_DEBUG */ 71 72 #define KMP_MIN(x, y) ((x) < (y) ? (x) : (y)) 73 74 /* ------------------------------------------------------------------------ */ 75 76 #if KMP_USE_MONITOR 77 kmp_info_t __kmp_monitor; 78 #endif 79 80 /* Forward declarations */ 81 82 void __kmp_cleanup(void); 83 84 static void __kmp_initialize_info(kmp_info_t *, kmp_team_t *, int tid, 85 int gtid); 86 static void __kmp_initialize_team(kmp_team_t *team, int new_nproc, 87 kmp_internal_control_t *new_icvs, 88 ident_t *loc); 89 #if KMP_AFFINITY_SUPPORTED 90 static void __kmp_partition_places(kmp_team_t *team, 91 int update_master_only = 0); 92 #endif 93 static void __kmp_do_serial_initialize(void); 94 void __kmp_fork_barrier(int gtid, int tid); 95 void __kmp_join_barrier(int gtid); 96 void __kmp_setup_icv_copy(kmp_team_t *team, int new_nproc, 97 kmp_internal_control_t *new_icvs, ident_t *loc); 98 99 #ifdef USE_LOAD_BALANCE 100 static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc); 101 #endif 102 103 static int __kmp_expand_threads(int nNeed); 104 #if KMP_OS_WINDOWS 105 static int __kmp_unregister_root_other_thread(int gtid); 106 #endif 107 static void __kmp_reap_thread(kmp_info_t *thread, int is_root); 108 kmp_info_t *__kmp_thread_pool_insert_pt = NULL; 109 110 void __kmp_resize_dist_barrier(kmp_team_t *team, int old_nthreads, 111 int new_nthreads); 112 void __kmp_add_threads_to_team(kmp_team_t *team, int new_nthreads); 113 114 /* Calculate the identifier of the current thread */ 115 /* fast (and somewhat portable) way to get unique identifier of executing 116 thread. Returns KMP_GTID_DNE if we haven't been assigned a gtid. 
*/ 117 int __kmp_get_global_thread_id() { 118 int i; 119 kmp_info_t **other_threads; 120 size_t stack_data; 121 char *stack_addr; 122 size_t stack_size; 123 char *stack_base; 124 125 KA_TRACE( 126 1000, 127 ("*** __kmp_get_global_thread_id: entering, nproc=%d all_nproc=%d\n", 128 __kmp_nth, __kmp_all_nth)); 129 130 /* JPH - to handle the case where __kmpc_end(0) is called immediately prior to 131 a parallel region, made it return KMP_GTID_DNE to force serial_initialize 132 by caller. Had to handle KMP_GTID_DNE at all call-sites, or else guarantee 133 __kmp_init_gtid for this to work. */ 134 135 if (!TCR_4(__kmp_init_gtid)) 136 return KMP_GTID_DNE; 137 138 #ifdef KMP_TDATA_GTID 139 if (TCR_4(__kmp_gtid_mode) >= 3) { 140 KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using TDATA\n")); 141 return __kmp_gtid; 142 } 143 #endif 144 if (TCR_4(__kmp_gtid_mode) >= 2) { 145 KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using keyed TLS\n")); 146 return __kmp_gtid_get_specific(); 147 } 148 KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using internal alg.\n")); 149 150 stack_addr = (char *)&stack_data; 151 other_threads = __kmp_threads; 152 153 /* ATT: The code below is a source of potential bugs due to unsynchronized 154 access to __kmp_threads array. For example: 155 1. Current thread loads other_threads[i] to thr and checks it, it is 156 non-NULL. 157 2. Current thread is suspended by OS. 158 3. Another thread unregisters and finishes (debug versions of free() 159 may fill memory with something like 0xEF). 160 4. Current thread is resumed. 161 5. Current thread reads junk from *thr. 162 TODO: Fix it. --ln */ 163 164 for (i = 0; i < __kmp_threads_capacity; i++) { 165 166 kmp_info_t *thr = (kmp_info_t *)TCR_SYNC_PTR(other_threads[i]); 167 if (!thr) 168 continue; 169 170 stack_size = (size_t)TCR_PTR(thr->th.th_info.ds.ds_stacksize); 171 stack_base = (char *)TCR_PTR(thr->th.th_info.ds.ds_stackbase); 172 173 /* stack grows down -- search through all of the active threads */ 174 175 if (stack_addr <= stack_base) { 176 size_t stack_diff = stack_base - stack_addr; 177 178 if (stack_diff <= stack_size) { 179 /* The only way we can be closer than the allocated */ 180 /* stack size is if we are running on this thread. */ 181 KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() == i); 182 return i; 183 } 184 } 185 } 186 187 /* get specific to try and determine our gtid */ 188 KA_TRACE(1000, 189 ("*** __kmp_get_global_thread_id: internal alg. 
failed to find " 190 "thread, using TLS\n")); 191 i = __kmp_gtid_get_specific(); 192 193 /*fprintf( stderr, "=== %d\n", i ); */ /* GROO */ 194 195 /* if we havn't been assigned a gtid, then return code */ 196 if (i < 0) 197 return i; 198 199 /* dynamically updated stack window for uber threads to avoid get_specific 200 call */ 201 if (!TCR_4(other_threads[i]->th.th_info.ds.ds_stackgrow)) { 202 KMP_FATAL(StackOverflow, i); 203 } 204 205 stack_base = (char *)other_threads[i]->th.th_info.ds.ds_stackbase; 206 if (stack_addr > stack_base) { 207 TCW_PTR(other_threads[i]->th.th_info.ds.ds_stackbase, stack_addr); 208 TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize, 209 other_threads[i]->th.th_info.ds.ds_stacksize + stack_addr - 210 stack_base); 211 } else { 212 TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize, 213 stack_base - stack_addr); 214 } 215 216 /* Reprint stack bounds for ubermaster since they have been refined */ 217 if (__kmp_storage_map) { 218 char *stack_end = (char *)other_threads[i]->th.th_info.ds.ds_stackbase; 219 char *stack_beg = stack_end - other_threads[i]->th.th_info.ds.ds_stacksize; 220 __kmp_print_storage_map_gtid(i, stack_beg, stack_end, 221 other_threads[i]->th.th_info.ds.ds_stacksize, 222 "th_%d stack (refinement)", i); 223 } 224 return i; 225 } 226 227 int __kmp_get_global_thread_id_reg() { 228 int gtid; 229 230 if (!__kmp_init_serial) { 231 gtid = KMP_GTID_DNE; 232 } else 233 #ifdef KMP_TDATA_GTID 234 if (TCR_4(__kmp_gtid_mode) >= 3) { 235 KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using TDATA\n")); 236 gtid = __kmp_gtid; 237 } else 238 #endif 239 if (TCR_4(__kmp_gtid_mode) >= 2) { 240 KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using keyed TLS\n")); 241 gtid = __kmp_gtid_get_specific(); 242 } else { 243 KA_TRACE(1000, 244 ("*** __kmp_get_global_thread_id_reg: using internal alg.\n")); 245 gtid = __kmp_get_global_thread_id(); 246 } 247 248 /* we must be a new uber master sibling thread */ 249 if (gtid == KMP_GTID_DNE) { 250 KA_TRACE(10, 251 ("__kmp_get_global_thread_id_reg: Encountered new root thread. " 252 "Registering a new gtid.\n")); 253 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock); 254 if (!__kmp_init_serial) { 255 __kmp_do_serial_initialize(); 256 gtid = __kmp_gtid_get_specific(); 257 } else { 258 gtid = __kmp_register_root(FALSE); 259 } 260 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 261 /*__kmp_printf( "+++ %d\n", gtid ); */ /* GROO */ 262 } 263 264 KMP_DEBUG_ASSERT(gtid >= 0); 265 266 return gtid; 267 } 268 269 /* caller must hold forkjoin_lock */ 270 void __kmp_check_stack_overlap(kmp_info_t *th) { 271 int f; 272 char *stack_beg = NULL; 273 char *stack_end = NULL; 274 int gtid; 275 276 KA_TRACE(10, ("__kmp_check_stack_overlap: called\n")); 277 if (__kmp_storage_map) { 278 stack_end = (char *)th->th.th_info.ds.ds_stackbase; 279 stack_beg = stack_end - th->th.th_info.ds.ds_stacksize; 280 281 gtid = __kmp_gtid_from_thread(th); 282 283 if (gtid == KMP_GTID_MONITOR) { 284 __kmp_print_storage_map_gtid( 285 gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize, 286 "th_%s stack (%s)", "mon", 287 (th->th.th_info.ds.ds_stackgrow) ? "initial" : "actual"); 288 } else { 289 __kmp_print_storage_map_gtid( 290 gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize, 291 "th_%d stack (%s)", gtid, 292 (th->th.th_info.ds.ds_stackgrow) ? 
"initial" : "actual"); 293 } 294 } 295 296 /* No point in checking ubermaster threads since they use refinement and 297 * cannot overlap */ 298 gtid = __kmp_gtid_from_thread(th); 299 if (__kmp_env_checks == TRUE && !KMP_UBER_GTID(gtid)) { 300 KA_TRACE(10, 301 ("__kmp_check_stack_overlap: performing extensive checking\n")); 302 if (stack_beg == NULL) { 303 stack_end = (char *)th->th.th_info.ds.ds_stackbase; 304 stack_beg = stack_end - th->th.th_info.ds.ds_stacksize; 305 } 306 307 for (f = 0; f < __kmp_threads_capacity; f++) { 308 kmp_info_t *f_th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[f]); 309 310 if (f_th && f_th != th) { 311 char *other_stack_end = 312 (char *)TCR_PTR(f_th->th.th_info.ds.ds_stackbase); 313 char *other_stack_beg = 314 other_stack_end - (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize); 315 if ((stack_beg > other_stack_beg && stack_beg < other_stack_end) || 316 (stack_end > other_stack_beg && stack_end < other_stack_end)) { 317 318 /* Print the other stack values before the abort */ 319 if (__kmp_storage_map) 320 __kmp_print_storage_map_gtid( 321 -1, other_stack_beg, other_stack_end, 322 (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize), 323 "th_%d stack (overlapped)", __kmp_gtid_from_thread(f_th)); 324 325 __kmp_fatal(KMP_MSG(StackOverlap), KMP_HNT(ChangeStackLimit), 326 __kmp_msg_null); 327 } 328 } 329 } 330 } 331 KA_TRACE(10, ("__kmp_check_stack_overlap: returning\n")); 332 } 333 334 /* ------------------------------------------------------------------------ */ 335 336 void __kmp_infinite_loop(void) { 337 static int done = FALSE; 338 339 while (!done) { 340 KMP_YIELD(TRUE); 341 } 342 } 343 344 #define MAX_MESSAGE 512 345 346 void __kmp_print_storage_map_gtid(int gtid, void *p1, void *p2, size_t size, 347 char const *format, ...) { 348 char buffer[MAX_MESSAGE]; 349 va_list ap; 350 351 va_start(ap, format); 352 KMP_SNPRINTF(buffer, sizeof(buffer), "OMP storage map: %p %p%8lu %s\n", p1, 353 p2, (unsigned long)size, format); 354 __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock); 355 __kmp_vprintf(kmp_err, buffer, ap); 356 #if KMP_PRINT_DATA_PLACEMENT 357 int node; 358 if (gtid >= 0) { 359 if (p1 <= p2 && (char *)p2 - (char *)p1 == size) { 360 if (__kmp_storage_map_verbose) { 361 node = __kmp_get_host_node(p1); 362 if (node < 0) /* doesn't work, so don't try this next time */ 363 __kmp_storage_map_verbose = FALSE; 364 else { 365 char *last; 366 int lastNode; 367 int localProc = __kmp_get_cpu_from_gtid(gtid); 368 369 const int page_size = KMP_GET_PAGE_SIZE(); 370 371 p1 = (void *)((size_t)p1 & ~((size_t)page_size - 1)); 372 p2 = (void *)(((size_t)p2 - 1) & ~((size_t)page_size - 1)); 373 if (localProc >= 0) 374 __kmp_printf_no_lock(" GTID %d localNode %d\n", gtid, 375 localProc >> 1); 376 else 377 __kmp_printf_no_lock(" GTID %d\n", gtid); 378 #if KMP_USE_PRCTL 379 /* The more elaborate format is disabled for now because of the prctl 380 * hanging bug. */ 381 do { 382 last = p1; 383 lastNode = node; 384 /* This loop collates adjacent pages with the same host node. 
*/ 385 do { 386 (char *)p1 += page_size; 387 } while (p1 <= p2 && (node = __kmp_get_host_node(p1)) == lastNode); 388 __kmp_printf_no_lock(" %p-%p memNode %d\n", last, (char *)p1 - 1, 389 lastNode); 390 } while (p1 <= p2); 391 #else 392 __kmp_printf_no_lock(" %p-%p memNode %d\n", p1, 393 (char *)p1 + (page_size - 1), 394 __kmp_get_host_node(p1)); 395 if (p1 < p2) { 396 __kmp_printf_no_lock(" %p-%p memNode %d\n", p2, 397 (char *)p2 + (page_size - 1), 398 __kmp_get_host_node(p2)); 399 } 400 #endif 401 } 402 } 403 } else 404 __kmp_printf_no_lock(" %s\n", KMP_I18N_STR(StorageMapWarning)); 405 } 406 #endif /* KMP_PRINT_DATA_PLACEMENT */ 407 __kmp_release_bootstrap_lock(&__kmp_stdio_lock); 408 } 409 410 void __kmp_warn(char const *format, ...) { 411 char buffer[MAX_MESSAGE]; 412 va_list ap; 413 414 if (__kmp_generate_warnings == kmp_warnings_off) { 415 return; 416 } 417 418 va_start(ap, format); 419 420 KMP_SNPRINTF(buffer, sizeof(buffer), "OMP warning: %s\n", format); 421 __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock); 422 __kmp_vprintf(kmp_err, buffer, ap); 423 __kmp_release_bootstrap_lock(&__kmp_stdio_lock); 424 425 va_end(ap); 426 } 427 428 void __kmp_abort_process() { 429 // Later threads may stall here, but that's ok because abort() will kill them. 430 __kmp_acquire_bootstrap_lock(&__kmp_exit_lock); 431 432 if (__kmp_debug_buf) { 433 __kmp_dump_debug_buffer(); 434 } 435 436 if (KMP_OS_WINDOWS) { 437 // Let other threads know of abnormal termination and prevent deadlock 438 // if abort happened during library initialization or shutdown 439 __kmp_global.g.g_abort = SIGABRT; 440 441 /* On Windows* OS by default abort() causes pop-up error box, which stalls 442 nightly testing. Unfortunately, we cannot reliably suppress pop-up error 443 boxes. _set_abort_behavior() works well, but this function is not 444 available in VS7 (this is not problem for DLL, but it is a problem for 445 static OpenMP RTL). SetErrorMode (and so, timelimit utility) does not 446 help, at least in some versions of MS C RTL. 447 448 It seems following sequence is the only way to simulate abort() and 449 avoid pop-up error box. */ 450 raise(SIGABRT); 451 _exit(3); // Just in case, if signal ignored, exit anyway. 452 } else { 453 __kmp_unregister_library(); 454 abort(); 455 } 456 457 __kmp_infinite_loop(); 458 __kmp_release_bootstrap_lock(&__kmp_exit_lock); 459 460 } // __kmp_abort_process 461 462 void __kmp_abort_thread(void) { 463 // TODO: Eliminate g_abort global variable and this function. 464 // In case of abort just call abort(), it will kill all the threads. 465 __kmp_infinite_loop(); 466 } // __kmp_abort_thread 467 468 /* Print out the storage map for the major kmp_info_t thread data structures 469 that are allocated together. 
*/ 470 471 static void __kmp_print_thread_storage_map(kmp_info_t *thr, int gtid) { 472 __kmp_print_storage_map_gtid(gtid, thr, thr + 1, sizeof(kmp_info_t), "th_%d", 473 gtid); 474 475 __kmp_print_storage_map_gtid(gtid, &thr->th.th_info, &thr->th.th_team, 476 sizeof(kmp_desc_t), "th_%d.th_info", gtid); 477 478 __kmp_print_storage_map_gtid(gtid, &thr->th.th_local, &thr->th.th_pri_head, 479 sizeof(kmp_local_t), "th_%d.th_local", gtid); 480 481 __kmp_print_storage_map_gtid( 482 gtid, &thr->th.th_bar[0], &thr->th.th_bar[bs_last_barrier], 483 sizeof(kmp_balign_t) * bs_last_barrier, "th_%d.th_bar", gtid); 484 485 __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_plain_barrier], 486 &thr->th.th_bar[bs_plain_barrier + 1], 487 sizeof(kmp_balign_t), "th_%d.th_bar[plain]", 488 gtid); 489 490 __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_forkjoin_barrier], 491 &thr->th.th_bar[bs_forkjoin_barrier + 1], 492 sizeof(kmp_balign_t), "th_%d.th_bar[forkjoin]", 493 gtid); 494 495 #if KMP_FAST_REDUCTION_BARRIER 496 __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_reduction_barrier], 497 &thr->th.th_bar[bs_reduction_barrier + 1], 498 sizeof(kmp_balign_t), "th_%d.th_bar[reduction]", 499 gtid); 500 #endif // KMP_FAST_REDUCTION_BARRIER 501 } 502 503 /* Print out the storage map for the major kmp_team_t team data structures 504 that are allocated together. */ 505 506 static void __kmp_print_team_storage_map(const char *header, kmp_team_t *team, 507 int team_id, int num_thr) { 508 int num_disp_buff = team->t.t_max_nproc > 1 ? __kmp_dispatch_num_buffers : 2; 509 __kmp_print_storage_map_gtid(-1, team, team + 1, sizeof(kmp_team_t), "%s_%d", 510 header, team_id); 511 512 __kmp_print_storage_map_gtid(-1, &team->t.t_bar[0], 513 &team->t.t_bar[bs_last_barrier], 514 sizeof(kmp_balign_team_t) * bs_last_barrier, 515 "%s_%d.t_bar", header, team_id); 516 517 __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_plain_barrier], 518 &team->t.t_bar[bs_plain_barrier + 1], 519 sizeof(kmp_balign_team_t), "%s_%d.t_bar[plain]", 520 header, team_id); 521 522 __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_forkjoin_barrier], 523 &team->t.t_bar[bs_forkjoin_barrier + 1], 524 sizeof(kmp_balign_team_t), 525 "%s_%d.t_bar[forkjoin]", header, team_id); 526 527 #if KMP_FAST_REDUCTION_BARRIER 528 __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_reduction_barrier], 529 &team->t.t_bar[bs_reduction_barrier + 1], 530 sizeof(kmp_balign_team_t), 531 "%s_%d.t_bar[reduction]", header, team_id); 532 #endif // KMP_FAST_REDUCTION_BARRIER 533 534 __kmp_print_storage_map_gtid( 535 -1, &team->t.t_dispatch[0], &team->t.t_dispatch[num_thr], 536 sizeof(kmp_disp_t) * num_thr, "%s_%d.t_dispatch", header, team_id); 537 538 __kmp_print_storage_map_gtid( 539 -1, &team->t.t_threads[0], &team->t.t_threads[num_thr], 540 sizeof(kmp_info_t *) * num_thr, "%s_%d.t_threads", header, team_id); 541 542 __kmp_print_storage_map_gtid(-1, &team->t.t_disp_buffer[0], 543 &team->t.t_disp_buffer[num_disp_buff], 544 sizeof(dispatch_shared_info_t) * num_disp_buff, 545 "%s_%d.t_disp_buffer", header, team_id); 546 } 547 548 static void __kmp_init_allocator() { 549 __kmp_init_memkind(); 550 __kmp_init_target_mem(); 551 } 552 static void __kmp_fini_allocator() { __kmp_fini_memkind(); } 553 554 /* ------------------------------------------------------------------------ */ 555 556 #if KMP_DYNAMIC_LIB 557 #if KMP_OS_WINDOWS 558 559 BOOL WINAPI DllMain(HINSTANCE hInstDLL, DWORD fdwReason, LPVOID lpReserved) { 560 //__kmp_acquire_bootstrap_lock( &__kmp_initz_lock ); 561 562 
switch (fdwReason) { 563 564 case DLL_PROCESS_ATTACH: 565 KA_TRACE(10, ("DllMain: PROCESS_ATTACH\n")); 566 567 return TRUE; 568 569 case DLL_PROCESS_DETACH: 570 KA_TRACE(10, ("DllMain: PROCESS_DETACH T#%d\n", __kmp_gtid_get_specific())); 571 572 // According to Windows* documentation for DllMain entry point: 573 // for DLL_PROCESS_DETACH, lpReserved is used for telling the difference: 574 // lpReserved == NULL when FreeLibrary() is called, 575 // lpReserved != NULL when the process is terminated. 576 // When FreeLibrary() is called, worker threads remain alive. So the 577 // runtime's state is consistent and executing proper shutdown is OK. 578 // When the process is terminated, worker threads have exited or been 579 // forcefully terminated by the OS and only the shutdown thread remains. 580 // This can leave the runtime in an inconsistent state. 581 // Hence, only attempt proper cleanup when FreeLibrary() is called. 582 // Otherwise, rely on OS to reclaim resources. 583 if (lpReserved == NULL) 584 __kmp_internal_end_library(__kmp_gtid_get_specific()); 585 586 return TRUE; 587 588 case DLL_THREAD_ATTACH: 589 KA_TRACE(10, ("DllMain: THREAD_ATTACH\n")); 590 591 /* if we want to register new siblings all the time here call 592 * __kmp_get_gtid(); */ 593 return TRUE; 594 595 case DLL_THREAD_DETACH: 596 KA_TRACE(10, ("DllMain: THREAD_DETACH T#%d\n", __kmp_gtid_get_specific())); 597 598 __kmp_internal_end_thread(__kmp_gtid_get_specific()); 599 return TRUE; 600 } 601 602 return TRUE; 603 } 604 605 #endif /* KMP_OS_WINDOWS */ 606 #endif /* KMP_DYNAMIC_LIB */ 607 608 /* __kmp_parallel_deo -- Wait until it's our turn. */ 609 void __kmp_parallel_deo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) { 610 int gtid = *gtid_ref; 611 #ifdef BUILD_PARALLEL_ORDERED 612 kmp_team_t *team = __kmp_team_from_gtid(gtid); 613 #endif /* BUILD_PARALLEL_ORDERED */ 614 615 if (__kmp_env_consistency_check) { 616 if (__kmp_threads[gtid]->th.th_root->r.r_active) 617 #if KMP_USE_DYNAMIC_LOCK 618 __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL, 0); 619 #else 620 __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL); 621 #endif 622 } 623 #ifdef BUILD_PARALLEL_ORDERED 624 if (!team->t.t_serialized) { 625 KMP_MB(); 626 KMP_WAIT(&team->t.t_ordered.dt.t_value, __kmp_tid_from_gtid(gtid), KMP_EQ, 627 NULL); 628 KMP_MB(); 629 } 630 #endif /* BUILD_PARALLEL_ORDERED */ 631 } 632 633 /* __kmp_parallel_dxo -- Signal the next task. */ 634 void __kmp_parallel_dxo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) { 635 int gtid = *gtid_ref; 636 #ifdef BUILD_PARALLEL_ORDERED 637 int tid = __kmp_tid_from_gtid(gtid); 638 kmp_team_t *team = __kmp_team_from_gtid(gtid); 639 #endif /* BUILD_PARALLEL_ORDERED */ 640 641 if (__kmp_env_consistency_check) { 642 if (__kmp_threads[gtid]->th.th_root->r.r_active) 643 __kmp_pop_sync(gtid, ct_ordered_in_parallel, loc_ref); 644 } 645 #ifdef BUILD_PARALLEL_ORDERED 646 if (!team->t.t_serialized) { 647 KMP_MB(); /* Flush all pending memory write invalidates. */ 648 649 /* use the tid of the next thread in this team */ 650 /* TODO replace with general release procedure */ 651 team->t.t_ordered.dt.t_value = ((tid + 1) % team->t.t_nproc); 652 653 KMP_MB(); /* Flush all pending memory write invalidates. 
*/ 654 } 655 #endif /* BUILD_PARALLEL_ORDERED */ 656 } 657 658 /* ------------------------------------------------------------------------ */ 659 /* The BARRIER for a SINGLE process section is always explicit */ 660 661 int __kmp_enter_single(int gtid, ident_t *id_ref, int push_ws) { 662 int status; 663 kmp_info_t *th; 664 kmp_team_t *team; 665 666 if (!TCR_4(__kmp_init_parallel)) 667 __kmp_parallel_initialize(); 668 __kmp_resume_if_soft_paused(); 669 670 th = __kmp_threads[gtid]; 671 team = th->th.th_team; 672 status = 0; 673 674 th->th.th_ident = id_ref; 675 676 if (team->t.t_serialized) { 677 status = 1; 678 } else { 679 kmp_int32 old_this = th->th.th_local.this_construct; 680 681 ++th->th.th_local.this_construct; 682 /* try to set team count to thread count--success means thread got the 683 single block */ 684 /* TODO: Should this be acquire or release? */ 685 if (team->t.t_construct == old_this) { 686 status = __kmp_atomic_compare_store_acq(&team->t.t_construct, old_this, 687 th->th.th_local.this_construct); 688 } 689 #if USE_ITT_BUILD 690 if (__itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 && 691 KMP_MASTER_GTID(gtid) && th->th.th_teams_microtask == NULL && 692 team->t.t_active_level == 1) { 693 // Only report metadata by primary thread of active team at level 1 694 __kmp_itt_metadata_single(id_ref); 695 } 696 #endif /* USE_ITT_BUILD */ 697 } 698 699 if (__kmp_env_consistency_check) { 700 if (status && push_ws) { 701 __kmp_push_workshare(gtid, ct_psingle, id_ref); 702 } else { 703 __kmp_check_workshare(gtid, ct_psingle, id_ref); 704 } 705 } 706 #if USE_ITT_BUILD 707 if (status) { 708 __kmp_itt_single_start(gtid); 709 } 710 #endif /* USE_ITT_BUILD */ 711 return status; 712 } 713 714 void __kmp_exit_single(int gtid) { 715 #if USE_ITT_BUILD 716 __kmp_itt_single_end(gtid); 717 #endif /* USE_ITT_BUILD */ 718 if (__kmp_env_consistency_check) 719 __kmp_pop_workshare(gtid, ct_psingle, NULL); 720 } 721 722 /* determine if we can go parallel or must use a serialized parallel region and 723 * how many threads we can use 724 * set_nproc is the number of threads requested for the team 725 * returns 0 if we should serialize or only use one thread, 726 * otherwise the number of threads to use 727 * The forkjoin lock is held by the caller. */ 728 static int __kmp_reserve_threads(kmp_root_t *root, kmp_team_t *parent_team, 729 int master_tid, int set_nthreads, 730 int enter_teams) { 731 int capacity; 732 int new_nthreads; 733 KMP_DEBUG_ASSERT(__kmp_init_serial); 734 KMP_DEBUG_ASSERT(root && parent_team); 735 kmp_info_t *this_thr = parent_team->t.t_threads[master_tid]; 736 737 // If dyn-var is set, dynamically adjust the number of desired threads, 738 // according to the method specified by dynamic_mode. 739 new_nthreads = set_nthreads; 740 if (!get__dynamic_2(parent_team, master_tid)) { 741 ; 742 } 743 #ifdef USE_LOAD_BALANCE 744 else if (__kmp_global.g.g_dynamic_mode == dynamic_load_balance) { 745 new_nthreads = __kmp_load_balance_nproc(root, set_nthreads); 746 if (new_nthreads == 1) { 747 KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced " 748 "reservation to 1 thread\n", 749 master_tid)); 750 return 1; 751 } 752 if (new_nthreads < set_nthreads) { 753 KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced " 754 "reservation to %d threads\n", 755 master_tid, new_nthreads)); 756 } 757 } 758 #endif /* USE_LOAD_BALANCE */ 759 else if (__kmp_global.g.g_dynamic_mode == dynamic_thread_limit) { 760 new_nthreads = __kmp_avail_proc - __kmp_nth + 761 (root->r.r_active ? 
1 : root->r.r_hot_team->t.t_nproc); 762 if (new_nthreads <= 1) { 763 KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced " 764 "reservation to 1 thread\n", 765 master_tid)); 766 return 1; 767 } 768 if (new_nthreads < set_nthreads) { 769 KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced " 770 "reservation to %d threads\n", 771 master_tid, new_nthreads)); 772 } else { 773 new_nthreads = set_nthreads; 774 } 775 } else if (__kmp_global.g.g_dynamic_mode == dynamic_random) { 776 if (set_nthreads > 2) { 777 new_nthreads = __kmp_get_random(parent_team->t.t_threads[master_tid]); 778 new_nthreads = (new_nthreads % set_nthreads) + 1; 779 if (new_nthreads == 1) { 780 KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced " 781 "reservation to 1 thread\n", 782 master_tid)); 783 return 1; 784 } 785 if (new_nthreads < set_nthreads) { 786 KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced " 787 "reservation to %d threads\n", 788 master_tid, new_nthreads)); 789 } 790 } 791 } else { 792 KMP_ASSERT(0); 793 } 794 795 // Respect KMP_ALL_THREADS/KMP_DEVICE_THREAD_LIMIT. 796 if (__kmp_nth + new_nthreads - 797 (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) > 798 __kmp_max_nth) { 799 int tl_nthreads = __kmp_max_nth - __kmp_nth + 800 (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc); 801 if (tl_nthreads <= 0) { 802 tl_nthreads = 1; 803 } 804 805 // If dyn-var is false, emit a 1-time warning. 806 if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) { 807 __kmp_reserve_warn = 1; 808 __kmp_msg(kmp_ms_warning, 809 KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads), 810 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null); 811 } 812 if (tl_nthreads == 1) { 813 KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT " 814 "reduced reservation to 1 thread\n", 815 master_tid)); 816 return 1; 817 } 818 KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT reduced " 819 "reservation to %d threads\n", 820 master_tid, tl_nthreads)); 821 new_nthreads = tl_nthreads; 822 } 823 824 // Respect OMP_THREAD_LIMIT 825 int cg_nthreads = this_thr->th.th_cg_roots->cg_nthreads; 826 int max_cg_threads = this_thr->th.th_cg_roots->cg_thread_limit; 827 if (cg_nthreads + new_nthreads - 828 (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) > 829 max_cg_threads) { 830 int tl_nthreads = max_cg_threads - cg_nthreads + 831 (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc); 832 if (tl_nthreads <= 0) { 833 tl_nthreads = 1; 834 } 835 836 // If dyn-var is false, emit a 1-time warning. 837 if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) { 838 __kmp_reserve_warn = 1; 839 __kmp_msg(kmp_ms_warning, 840 KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads), 841 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null); 842 } 843 if (tl_nthreads == 1) { 844 KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT " 845 "reduced reservation to 1 thread\n", 846 master_tid)); 847 return 1; 848 } 849 KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT reduced " 850 "reservation to %d threads\n", 851 master_tid, tl_nthreads)); 852 new_nthreads = tl_nthreads; 853 } 854 855 // Check if the threads array is large enough, or needs expanding. 856 // See comment in __kmp_register_root() about the adjustment if 857 // __kmp_threads[0] == NULL. 
858 capacity = __kmp_threads_capacity; 859 if (TCR_PTR(__kmp_threads[0]) == NULL) { 860 --capacity; 861 } 862 // If it is not for initializing the hidden helper team, we need to take 863 // __kmp_hidden_helper_threads_num out of the capacity because it is included 864 // in __kmp_threads_capacity. 865 if (__kmp_enable_hidden_helper && !TCR_4(__kmp_init_hidden_helper_threads)) { 866 capacity -= __kmp_hidden_helper_threads_num; 867 } 868 if (__kmp_nth + new_nthreads - 869 (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) > 870 capacity) { 871 // Expand the threads array. 872 int slotsRequired = __kmp_nth + new_nthreads - 873 (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) - 874 capacity; 875 int slotsAdded = __kmp_expand_threads(slotsRequired); 876 if (slotsAdded < slotsRequired) { 877 // The threads array was not expanded enough. 878 new_nthreads -= (slotsRequired - slotsAdded); 879 KMP_ASSERT(new_nthreads >= 1); 880 881 // If dyn-var is false, emit a 1-time warning. 882 if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) { 883 __kmp_reserve_warn = 1; 884 if (__kmp_tp_cached) { 885 __kmp_msg(kmp_ms_warning, 886 KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads), 887 KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity), 888 KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null); 889 } else { 890 __kmp_msg(kmp_ms_warning, 891 KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads), 892 KMP_HNT(SystemLimitOnThreads), __kmp_msg_null); 893 } 894 } 895 } 896 } 897 898 #ifdef KMP_DEBUG 899 if (new_nthreads == 1) { 900 KC_TRACE(10, 901 ("__kmp_reserve_threads: T#%d serializing team after reclaiming " 902 "dead roots and rechecking; requested %d threads\n", 903 __kmp_get_gtid(), set_nthreads)); 904 } else { 905 KC_TRACE(10, ("__kmp_reserve_threads: T#%d allocating %d threads; requested" 906 " %d threads\n", 907 __kmp_get_gtid(), new_nthreads, set_nthreads)); 908 } 909 #endif // KMP_DEBUG 910 return new_nthreads; 911 } 912 913 /* Allocate threads from the thread pool and assign them to the new team. We are 914 assured that there are enough threads available, because we checked on that 915 earlier within critical section forkjoin */ 916 static void __kmp_fork_team_threads(kmp_root_t *root, kmp_team_t *team, 917 kmp_info_t *master_th, int master_gtid, 918 int fork_teams_workers) { 919 int i; 920 int use_hot_team; 921 922 KA_TRACE(10, ("__kmp_fork_team_threads: new_nprocs = %d\n", team->t.t_nproc)); 923 KMP_DEBUG_ASSERT(master_gtid == __kmp_get_gtid()); 924 KMP_MB(); 925 926 /* first, let's setup the primary thread */ 927 master_th->th.th_info.ds.ds_tid = 0; 928 master_th->th.th_team = team; 929 master_th->th.th_team_nproc = team->t.t_nproc; 930 master_th->th.th_team_master = master_th; 931 master_th->th.th_team_serialized = FALSE; 932 master_th->th.th_dispatch = &team->t.t_dispatch[0]; 933 934 /* make sure we are not the optimized hot team */ 935 #if KMP_NESTED_HOT_TEAMS 936 use_hot_team = 0; 937 kmp_hot_team_ptr_t *hot_teams = master_th->th.th_hot_teams; 938 if (hot_teams) { // hot teams array is not allocated if 939 // KMP_HOT_TEAMS_MAX_LEVEL=0 940 int level = team->t.t_active_level - 1; // index in array of hot teams 941 if (master_th->th.th_teams_microtask) { // are we inside the teams? 
942 if (master_th->th.th_teams_size.nteams > 1) { 943 ++level; // level was not increased in teams construct for 944 // team_of_masters 945 } 946 if (team->t.t_pkfn != (microtask_t)__kmp_teams_master && 947 master_th->th.th_teams_level == team->t.t_level) { 948 ++level; // level was not increased in teams construct for 949 // team_of_workers before the parallel 950 } // team->t.t_level will be increased inside parallel 951 } 952 if (level < __kmp_hot_teams_max_level) { 953 if (hot_teams[level].hot_team) { 954 // hot team has already been allocated for given level 955 KMP_DEBUG_ASSERT(hot_teams[level].hot_team == team); 956 use_hot_team = 1; // the team is ready to use 957 } else { 958 use_hot_team = 0; // AC: threads are not allocated yet 959 hot_teams[level].hot_team = team; // remember new hot team 960 hot_teams[level].hot_team_nth = team->t.t_nproc; 961 } 962 } else { 963 use_hot_team = 0; 964 } 965 } 966 #else 967 use_hot_team = team == root->r.r_hot_team; 968 #endif 969 if (!use_hot_team) { 970 971 /* install the primary thread */ 972 team->t.t_threads[0] = master_th; 973 __kmp_initialize_info(master_th, team, 0, master_gtid); 974 975 /* now, install the worker threads */ 976 for (i = 1; i < team->t.t_nproc; i++) { 977 978 /* fork or reallocate a new thread and install it in team */ 979 kmp_info_t *thr = __kmp_allocate_thread(root, team, i); 980 team->t.t_threads[i] = thr; 981 KMP_DEBUG_ASSERT(thr); 982 KMP_DEBUG_ASSERT(thr->th.th_team == team); 983 /* align team and thread arrived states */ 984 KA_TRACE(20, ("__kmp_fork_team_threads: T#%d(%d:%d) init arrived " 985 "T#%d(%d:%d) join =%llu, plain=%llu\n", 986 __kmp_gtid_from_tid(0, team), team->t.t_id, 0, 987 __kmp_gtid_from_tid(i, team), team->t.t_id, i, 988 team->t.t_bar[bs_forkjoin_barrier].b_arrived, 989 team->t.t_bar[bs_plain_barrier].b_arrived)); 990 thr->th.th_teams_microtask = master_th->th.th_teams_microtask; 991 thr->th.th_teams_level = master_th->th.th_teams_level; 992 thr->th.th_teams_size = master_th->th.th_teams_size; 993 { // Initialize threads' barrier data. 994 int b; 995 kmp_balign_t *balign = team->t.t_threads[i]->th.th_bar; 996 for (b = 0; b < bs_last_barrier; ++b) { 997 balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived; 998 KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG); 999 #if USE_DEBUGGER 1000 balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived; 1001 #endif 1002 } 1003 } 1004 } 1005 1006 #if KMP_AFFINITY_SUPPORTED 1007 // Do not partition the places list for teams construct workers who 1008 // haven't actually been forked to do real work yet. This partitioning 1009 // will take place in the parallel region nested within the teams construct. 
1010 if (!fork_teams_workers) { 1011 __kmp_partition_places(team); 1012 } 1013 #endif 1014 1015 if (team->t.t_nproc > 1 && 1016 __kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) { 1017 team->t.b->update_num_threads(team->t.t_nproc); 1018 __kmp_add_threads_to_team(team, team->t.t_nproc); 1019 } 1020 } 1021 1022 if (__kmp_display_affinity && team->t.t_display_affinity != 1) { 1023 for (i = 0; i < team->t.t_nproc; i++) { 1024 kmp_info_t *thr = team->t.t_threads[i]; 1025 if (thr->th.th_prev_num_threads != team->t.t_nproc || 1026 thr->th.th_prev_level != team->t.t_level) { 1027 team->t.t_display_affinity = 1; 1028 break; 1029 } 1030 } 1031 } 1032 1033 KMP_MB(); 1034 } 1035 1036 #if KMP_ARCH_X86 || KMP_ARCH_X86_64 1037 // Propagate any changes to the floating point control registers out to the team 1038 // We try to avoid unnecessary writes to the relevant cache line in the team 1039 // structure, so we don't make changes unless they are needed. 1040 inline static void propagateFPControl(kmp_team_t *team) { 1041 if (__kmp_inherit_fp_control) { 1042 kmp_int16 x87_fpu_control_word; 1043 kmp_uint32 mxcsr; 1044 1045 // Get primary thread's values of FPU control flags (both X87 and vector) 1046 __kmp_store_x87_fpu_control_word(&x87_fpu_control_word); 1047 __kmp_store_mxcsr(&mxcsr); 1048 mxcsr &= KMP_X86_MXCSR_MASK; 1049 1050 // There is no point looking at t_fp_control_saved here. 1051 // If it is TRUE, we still have to update the values if they are different 1052 // from those we now have. If it is FALSE we didn't save anything yet, but 1053 // our objective is the same. We have to ensure that the values in the team 1054 // are the same as those we have. 1055 // So, this code achieves what we need whether or not t_fp_control_saved is 1056 // true. By checking whether the value needs updating we avoid unnecessary 1057 // writes that would put the cache-line into a written state, causing all 1058 // threads in the team to have to read it again. 1059 KMP_CHECK_UPDATE(team->t.t_x87_fpu_control_word, x87_fpu_control_word); 1060 KMP_CHECK_UPDATE(team->t.t_mxcsr, mxcsr); 1061 // Although we don't use this value, other code in the runtime wants to know 1062 // whether it should restore them. So we must ensure it is correct. 1063 KMP_CHECK_UPDATE(team->t.t_fp_control_saved, TRUE); 1064 } else { 1065 // Similarly here. Don't write to this cache-line in the team structure 1066 // unless we have to. 1067 KMP_CHECK_UPDATE(team->t.t_fp_control_saved, FALSE); 1068 } 1069 } 1070 1071 // Do the opposite, setting the hardware registers to the updated values from 1072 // the team. 1073 inline static void updateHWFPControl(kmp_team_t *team) { 1074 if (__kmp_inherit_fp_control && team->t.t_fp_control_saved) { 1075 // Only reset the fp control regs if they have been changed in the team. 1076 // the parallel region that we are exiting. 
1077 kmp_int16 x87_fpu_control_word; 1078 kmp_uint32 mxcsr; 1079 __kmp_store_x87_fpu_control_word(&x87_fpu_control_word); 1080 __kmp_store_mxcsr(&mxcsr); 1081 mxcsr &= KMP_X86_MXCSR_MASK; 1082 1083 if (team->t.t_x87_fpu_control_word != x87_fpu_control_word) { 1084 __kmp_clear_x87_fpu_status_word(); 1085 __kmp_load_x87_fpu_control_word(&team->t.t_x87_fpu_control_word); 1086 } 1087 1088 if (team->t.t_mxcsr != mxcsr) { 1089 __kmp_load_mxcsr(&team->t.t_mxcsr); 1090 } 1091 } 1092 } 1093 #else 1094 #define propagateFPControl(x) ((void)0) 1095 #define updateHWFPControl(x) ((void)0) 1096 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */ 1097 1098 static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team, 1099 int realloc); // forward declaration 1100 1101 /* Run a parallel region that has been serialized, so runs only in a team of the 1102 single primary thread. */ 1103 void __kmp_serialized_parallel(ident_t *loc, kmp_int32 global_tid) { 1104 kmp_info_t *this_thr; 1105 kmp_team_t *serial_team; 1106 1107 KC_TRACE(10, ("__kmpc_serialized_parallel: called by T#%d\n", global_tid)); 1108 1109 /* Skip all this code for autopar serialized loops since it results in 1110 unacceptable overhead */ 1111 if (loc != NULL && (loc->flags & KMP_IDENT_AUTOPAR)) 1112 return; 1113 1114 if (!TCR_4(__kmp_init_parallel)) 1115 __kmp_parallel_initialize(); 1116 __kmp_resume_if_soft_paused(); 1117 1118 this_thr = __kmp_threads[global_tid]; 1119 serial_team = this_thr->th.th_serial_team; 1120 1121 /* utilize the serialized team held by this thread */ 1122 KMP_DEBUG_ASSERT(serial_team); 1123 KMP_MB(); 1124 1125 if (__kmp_tasking_mode != tskm_immediate_exec) { 1126 KMP_DEBUG_ASSERT( 1127 this_thr->th.th_task_team == 1128 this_thr->th.th_team->t.t_task_team[this_thr->th.th_task_state]); 1129 KMP_DEBUG_ASSERT(serial_team->t.t_task_team[this_thr->th.th_task_state] == 1130 NULL); 1131 KA_TRACE(20, ("__kmpc_serialized_parallel: T#%d pushing task_team %p / " 1132 "team %p, new task_team = NULL\n", 1133 global_tid, this_thr->th.th_task_team, this_thr->th.th_team)); 1134 this_thr->th.th_task_team = NULL; 1135 } 1136 1137 kmp_proc_bind_t proc_bind = this_thr->th.th_set_proc_bind; 1138 if (this_thr->th.th_current_task->td_icvs.proc_bind == proc_bind_false) { 1139 proc_bind = proc_bind_false; 1140 } else if (proc_bind == proc_bind_default) { 1141 // No proc_bind clause was specified, so use the current value 1142 // of proc-bind-var for this parallel region. 
1143 proc_bind = this_thr->th.th_current_task->td_icvs.proc_bind; 1144 } 1145 // Reset for next parallel region 1146 this_thr->th.th_set_proc_bind = proc_bind_default; 1147 1148 #if OMPT_SUPPORT 1149 ompt_data_t ompt_parallel_data = ompt_data_none; 1150 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(global_tid); 1151 if (ompt_enabled.enabled && 1152 this_thr->th.ompt_thread_info.state != ompt_state_overhead) { 1153 1154 ompt_task_info_t *parent_task_info; 1155 parent_task_info = OMPT_CUR_TASK_INFO(this_thr); 1156 1157 parent_task_info->frame.enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0); 1158 if (ompt_enabled.ompt_callback_parallel_begin) { 1159 int team_size = 1; 1160 1161 ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)( 1162 &(parent_task_info->task_data), &(parent_task_info->frame), 1163 &ompt_parallel_data, team_size, 1164 ompt_parallel_invoker_program | ompt_parallel_team, codeptr); 1165 } 1166 } 1167 #endif // OMPT_SUPPORT 1168 1169 if (this_thr->th.th_team != serial_team) { 1170 // Nested level will be an index in the nested nthreads array 1171 int level = this_thr->th.th_team->t.t_level; 1172 1173 if (serial_team->t.t_serialized) { 1174 /* this serial team was already used 1175 TODO increase performance by making this locks more specific */ 1176 kmp_team_t *new_team; 1177 1178 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock); 1179 1180 new_team = 1181 __kmp_allocate_team(this_thr->th.th_root, 1, 1, 1182 #if OMPT_SUPPORT 1183 ompt_parallel_data, 1184 #endif 1185 proc_bind, &this_thr->th.th_current_task->td_icvs, 1186 0 USE_NESTED_HOT_ARG(NULL)); 1187 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); 1188 KMP_ASSERT(new_team); 1189 1190 /* setup new serialized team and install it */ 1191 new_team->t.t_threads[0] = this_thr; 1192 new_team->t.t_parent = this_thr->th.th_team; 1193 serial_team = new_team; 1194 this_thr->th.th_serial_team = serial_team; 1195 1196 KF_TRACE( 1197 10, 1198 ("__kmpc_serialized_parallel: T#%d allocated new serial team %p\n", 1199 global_tid, serial_team)); 1200 1201 /* TODO the above breaks the requirement that if we run out of resources, 1202 then we can still guarantee that serialized teams are ok, since we may 1203 need to allocate a new one */ 1204 } else { 1205 KF_TRACE( 1206 10, 1207 ("__kmpc_serialized_parallel: T#%d reusing cached serial team %p\n", 1208 global_tid, serial_team)); 1209 } 1210 1211 /* we have to initialize this serial team */ 1212 KMP_DEBUG_ASSERT(serial_team->t.t_threads); 1213 KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr); 1214 KMP_DEBUG_ASSERT(this_thr->th.th_team != serial_team); 1215 serial_team->t.t_ident = loc; 1216 serial_team->t.t_serialized = 1; 1217 serial_team->t.t_nproc = 1; 1218 serial_team->t.t_parent = this_thr->th.th_team; 1219 serial_team->t.t_sched.sched = this_thr->th.th_team->t.t_sched.sched; 1220 this_thr->th.th_team = serial_team; 1221 serial_team->t.t_master_tid = this_thr->th.th_info.ds.ds_tid; 1222 1223 KF_TRACE(10, ("__kmpc_serialized_parallel: T#%d curtask=%p\n", global_tid, 1224 this_thr->th.th_current_task)); 1225 KMP_ASSERT(this_thr->th.th_current_task->td_flags.executing == 1); 1226 this_thr->th.th_current_task->td_flags.executing = 0; 1227 1228 __kmp_push_current_task_to_thread(this_thr, serial_team, 0); 1229 1230 /* TODO: GEH: do ICVs work for nested serialized teams? Don't we need an 1231 implicit task for each serialized task represented by 1232 team->t.t_serialized? 
*/ 1233 copy_icvs(&this_thr->th.th_current_task->td_icvs, 1234 &this_thr->th.th_current_task->td_parent->td_icvs); 1235 1236 // Thread value exists in the nested nthreads array for the next nested 1237 // level 1238 if (__kmp_nested_nth.used && (level + 1 < __kmp_nested_nth.used)) { 1239 this_thr->th.th_current_task->td_icvs.nproc = 1240 __kmp_nested_nth.nth[level + 1]; 1241 } 1242 1243 if (__kmp_nested_proc_bind.used && 1244 (level + 1 < __kmp_nested_proc_bind.used)) { 1245 this_thr->th.th_current_task->td_icvs.proc_bind = 1246 __kmp_nested_proc_bind.bind_types[level + 1]; 1247 } 1248 1249 #if USE_DEBUGGER 1250 serial_team->t.t_pkfn = (microtask_t)(~0); // For the debugger. 1251 #endif 1252 this_thr->th.th_info.ds.ds_tid = 0; 1253 1254 /* set thread cache values */ 1255 this_thr->th.th_team_nproc = 1; 1256 this_thr->th.th_team_master = this_thr; 1257 this_thr->th.th_team_serialized = 1; 1258 1259 serial_team->t.t_level = serial_team->t.t_parent->t.t_level + 1; 1260 serial_team->t.t_active_level = serial_team->t.t_parent->t.t_active_level; 1261 serial_team->t.t_def_allocator = this_thr->th.th_def_allocator; // save 1262 1263 propagateFPControl(serial_team); 1264 1265 /* check if we need to allocate dispatch buffers stack */ 1266 KMP_DEBUG_ASSERT(serial_team->t.t_dispatch); 1267 if (!serial_team->t.t_dispatch->th_disp_buffer) { 1268 serial_team->t.t_dispatch->th_disp_buffer = 1269 (dispatch_private_info_t *)__kmp_allocate( 1270 sizeof(dispatch_private_info_t)); 1271 } 1272 this_thr->th.th_dispatch = serial_team->t.t_dispatch; 1273 1274 KMP_MB(); 1275 1276 } else { 1277 /* this serialized team is already being used, 1278 * that's fine, just add another nested level */ 1279 KMP_DEBUG_ASSERT(this_thr->th.th_team == serial_team); 1280 KMP_DEBUG_ASSERT(serial_team->t.t_threads); 1281 KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr); 1282 ++serial_team->t.t_serialized; 1283 this_thr->th.th_team_serialized = serial_team->t.t_serialized; 1284 1285 // Nested level will be an index in the nested nthreads array 1286 int level = this_thr->th.th_team->t.t_level; 1287 // Thread value exists in the nested nthreads array for the next nested 1288 // level 1289 if (__kmp_nested_nth.used && (level + 1 < __kmp_nested_nth.used)) { 1290 this_thr->th.th_current_task->td_icvs.nproc = 1291 __kmp_nested_nth.nth[level + 1]; 1292 } 1293 serial_team->t.t_level++; 1294 KF_TRACE(10, ("__kmpc_serialized_parallel: T#%d increasing nesting level " 1295 "of serial team %p to %d\n", 1296 global_tid, serial_team, serial_team->t.t_level)); 1297 1298 /* allocate/push dispatch buffers stack */ 1299 KMP_DEBUG_ASSERT(serial_team->t.t_dispatch); 1300 { 1301 dispatch_private_info_t *disp_buffer = 1302 (dispatch_private_info_t *)__kmp_allocate( 1303 sizeof(dispatch_private_info_t)); 1304 disp_buffer->next = serial_team->t.t_dispatch->th_disp_buffer; 1305 serial_team->t.t_dispatch->th_disp_buffer = disp_buffer; 1306 } 1307 this_thr->th.th_dispatch = serial_team->t.t_dispatch; 1308 1309 KMP_MB(); 1310 } 1311 KMP_CHECK_UPDATE(serial_team->t.t_cancel_request, cancel_noreq); 1312 1313 // Perform the display affinity functionality for 1314 // serialized parallel regions 1315 if (__kmp_display_affinity) { 1316 if (this_thr->th.th_prev_level != serial_team->t.t_level || 1317 this_thr->th.th_prev_num_threads != 1) { 1318 // NULL means use the affinity-format-var ICV 1319 __kmp_aux_display_affinity(global_tid, NULL); 1320 this_thr->th.th_prev_level = serial_team->t.t_level; 1321 this_thr->th.th_prev_num_threads = 1; 1322 } 1323 } 1324 
1325 if (__kmp_env_consistency_check) 1326 __kmp_push_parallel(global_tid, NULL); 1327 #if OMPT_SUPPORT 1328 serial_team->t.ompt_team_info.master_return_address = codeptr; 1329 if (ompt_enabled.enabled && 1330 this_thr->th.ompt_thread_info.state != ompt_state_overhead) { 1331 OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame.ptr = 1332 OMPT_GET_FRAME_ADDRESS(0); 1333 1334 ompt_lw_taskteam_t lw_taskteam; 1335 __ompt_lw_taskteam_init(&lw_taskteam, this_thr, global_tid, 1336 &ompt_parallel_data, codeptr); 1337 1338 __ompt_lw_taskteam_link(&lw_taskteam, this_thr, 1); 1339 // don't use lw_taskteam after linking. content was swaped 1340 1341 /* OMPT implicit task begin */ 1342 if (ompt_enabled.ompt_callback_implicit_task) { 1343 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 1344 ompt_scope_begin, OMPT_CUR_TEAM_DATA(this_thr), 1345 OMPT_CUR_TASK_DATA(this_thr), 1, __kmp_tid_from_gtid(global_tid), 1346 ompt_task_implicit); // TODO: Can this be ompt_task_initial? 1347 OMPT_CUR_TASK_INFO(this_thr)->thread_num = 1348 __kmp_tid_from_gtid(global_tid); 1349 } 1350 1351 /* OMPT state */ 1352 this_thr->th.ompt_thread_info.state = ompt_state_work_parallel; 1353 OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame.ptr = 1354 OMPT_GET_FRAME_ADDRESS(0); 1355 } 1356 #endif 1357 } 1358 1359 // Test if this fork is for a team closely nested in a teams construct 1360 static inline bool __kmp_is_fork_in_teams(kmp_info_t *master_th, 1361 microtask_t microtask, int level, 1362 int teams_level, kmp_va_list ap) { 1363 return (master_th->th.th_teams_microtask && ap && 1364 microtask != (microtask_t)__kmp_teams_master && level == teams_level); 1365 } 1366 1367 // Test if this fork is for the teams construct, i.e. to form the outer league 1368 // of teams 1369 static inline bool __kmp_is_entering_teams(int active_level, int level, 1370 int teams_level, kmp_va_list ap) { 1371 return ((ap == NULL && active_level == 0) || 1372 (ap && teams_level > 0 && teams_level == level)); 1373 } 1374 1375 // AC: This is start of parallel that is nested inside teams construct. 1376 // The team is actual (hot), all workers are ready at the fork barrier. 1377 // No lock needed to initialize the team a bit, then free workers. 
1378 static inline int 1379 __kmp_fork_in_teams(ident_t *loc, int gtid, kmp_team_t *parent_team, 1380 kmp_int32 argc, kmp_info_t *master_th, kmp_root_t *root, 1381 enum fork_context_e call_context, microtask_t microtask, 1382 launch_t invoker, int master_set_numthreads, int level, 1383 #if OMPT_SUPPORT 1384 ompt_data_t ompt_parallel_data, void *return_address, 1385 #endif 1386 kmp_va_list ap) { 1387 void **argv; 1388 int i; 1389 1390 parent_team->t.t_ident = loc; 1391 __kmp_alloc_argv_entries(argc, parent_team, TRUE); 1392 parent_team->t.t_argc = argc; 1393 argv = (void **)parent_team->t.t_argv; 1394 for (i = argc - 1; i >= 0; --i) { 1395 *argv++ = va_arg(kmp_va_deref(ap), void *); 1396 } 1397 // Increment our nested depth levels, but not increase the serialization 1398 if (parent_team == master_th->th.th_serial_team) { 1399 // AC: we are in serialized parallel 1400 __kmpc_serialized_parallel(loc, gtid); 1401 KMP_DEBUG_ASSERT(parent_team->t.t_serialized > 1); 1402 1403 if (call_context == fork_context_gnu) { 1404 // AC: need to decrement t_serialized for enquiry functions to work 1405 // correctly, will restore at join time 1406 parent_team->t.t_serialized--; 1407 return TRUE; 1408 } 1409 1410 #if OMPD_SUPPORT 1411 parent_team->t.t_pkfn = microtask; 1412 #endif 1413 1414 #if OMPT_SUPPORT 1415 void *dummy; 1416 void **exit_frame_p; 1417 ompt_data_t *implicit_task_data; 1418 ompt_lw_taskteam_t lw_taskteam; 1419 1420 if (ompt_enabled.enabled) { 1421 __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid, 1422 &ompt_parallel_data, return_address); 1423 exit_frame_p = &(lw_taskteam.ompt_task_info.frame.exit_frame.ptr); 1424 1425 __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0); 1426 // Don't use lw_taskteam after linking. Content was swapped. 1427 1428 /* OMPT implicit task begin */ 1429 implicit_task_data = OMPT_CUR_TASK_DATA(master_th); 1430 if (ompt_enabled.ompt_callback_implicit_task) { 1431 OMPT_CUR_TASK_INFO(master_th)->thread_num = __kmp_tid_from_gtid(gtid); 1432 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 1433 ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th), implicit_task_data, 1434 1, OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit); 1435 } 1436 1437 /* OMPT state */ 1438 master_th->th.ompt_thread_info.state = ompt_state_work_parallel; 1439 } else { 1440 exit_frame_p = &dummy; 1441 } 1442 #endif 1443 1444 // AC: need to decrement t_serialized for enquiry functions to work 1445 // correctly, will restore at join time 1446 parent_team->t.t_serialized--; 1447 1448 { 1449 KMP_TIME_PARTITIONED_BLOCK(OMP_parallel); 1450 KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK); 1451 __kmp_invoke_microtask(microtask, gtid, 0, argc, parent_team->t.t_argv 1452 #if OMPT_SUPPORT 1453 , 1454 exit_frame_p 1455 #endif 1456 ); 1457 } 1458 1459 #if OMPT_SUPPORT 1460 if (ompt_enabled.enabled) { 1461 *exit_frame_p = NULL; 1462 OMPT_CUR_TASK_INFO(master_th)->frame.exit_frame = ompt_data_none; 1463 if (ompt_enabled.ompt_callback_implicit_task) { 1464 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 1465 ompt_scope_end, NULL, implicit_task_data, 1, 1466 OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit); 1467 } 1468 ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th); 1469 __ompt_lw_taskteam_unlink(master_th); 1470 if (ompt_enabled.ompt_callback_parallel_end) { 1471 ompt_callbacks.ompt_callback(ompt_callback_parallel_end)( 1472 &ompt_parallel_data, OMPT_CUR_TASK_DATA(master_th), 1473 OMPT_INVOKER(call_context) | ompt_parallel_team, return_address); 1474 } 1475 
master_th->th.ompt_thread_info.state = ompt_state_overhead; 1476 } 1477 #endif 1478 return TRUE; 1479 } 1480 1481 parent_team->t.t_pkfn = microtask; 1482 parent_team->t.t_invoke = invoker; 1483 KMP_ATOMIC_INC(&root->r.r_in_parallel); 1484 parent_team->t.t_active_level++; 1485 parent_team->t.t_level++; 1486 parent_team->t.t_def_allocator = master_th->th.th_def_allocator; // save 1487 1488 // If the threads allocated to the team are less than the thread limit, update 1489 // the thread limit here. th_teams_size.nth is specific to this team nested 1490 // in a teams construct, the team is fully created, and we're about to do 1491 // the actual fork. Best to do this here so that the subsequent uses below 1492 // and in the join have the correct value. 1493 master_th->th.th_teams_size.nth = parent_team->t.t_nproc; 1494 1495 #if OMPT_SUPPORT 1496 if (ompt_enabled.enabled) { 1497 ompt_lw_taskteam_t lw_taskteam; 1498 __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid, &ompt_parallel_data, 1499 return_address); 1500 __ompt_lw_taskteam_link(&lw_taskteam, master_th, 1, true); 1501 } 1502 #endif 1503 1504 /* Change number of threads in the team if requested */ 1505 if (master_set_numthreads) { // The parallel has num_threads clause 1506 if (master_set_numthreads <= master_th->th.th_teams_size.nth) { 1507 // AC: only can reduce number of threads dynamically, can't increase 1508 kmp_info_t **other_threads = parent_team->t.t_threads; 1509 // NOTE: if using distributed barrier, we need to run this code block 1510 // even when the team size appears not to have changed from the max. 1511 int old_proc = master_th->th.th_teams_size.nth; 1512 if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) { 1513 __kmp_resize_dist_barrier(parent_team, old_proc, master_set_numthreads); 1514 __kmp_add_threads_to_team(parent_team, master_set_numthreads); 1515 } 1516 parent_team->t.t_nproc = master_set_numthreads; 1517 for (i = 0; i < master_set_numthreads; ++i) { 1518 other_threads[i]->th.th_team_nproc = master_set_numthreads; 1519 } 1520 } 1521 // Keep extra threads hot in the team for possible next parallels 1522 master_th->th.th_set_nproc = 0; 1523 } 1524 1525 #if USE_DEBUGGER 1526 if (__kmp_debugging) { // Let debugger override number of threads. 1527 int nth = __kmp_omp_num_threads(loc); 1528 if (nth > 0) { // 0 means debugger doesn't want to change num threads 1529 master_set_numthreads = nth; 1530 } 1531 } 1532 #endif 1533 1534 // Figure out the proc_bind policy for the nested parallel within teams 1535 kmp_proc_bind_t proc_bind = master_th->th.th_set_proc_bind; 1536 // proc_bind_default means don't update 1537 kmp_proc_bind_t proc_bind_icv = proc_bind_default; 1538 if (master_th->th.th_current_task->td_icvs.proc_bind == proc_bind_false) { 1539 proc_bind = proc_bind_false; 1540 } else { 1541 // No proc_bind clause specified; use current proc-bind-var 1542 if (proc_bind == proc_bind_default) { 1543 proc_bind = master_th->th.th_current_task->td_icvs.proc_bind; 1544 } 1545 /* else: The proc_bind policy was specified explicitly on parallel clause. 1546 This overrides proc-bind-var for this parallel region, but does not 1547 change proc-bind-var. */ 1548 // Figure the value of proc-bind-var for the child threads. 
1549 if ((level + 1 < __kmp_nested_proc_bind.used) && 1550 (__kmp_nested_proc_bind.bind_types[level + 1] != 1551 master_th->th.th_current_task->td_icvs.proc_bind)) { 1552 proc_bind_icv = __kmp_nested_proc_bind.bind_types[level + 1]; 1553 } 1554 } 1555 KMP_CHECK_UPDATE(parent_team->t.t_proc_bind, proc_bind); 1556 // Need to change the bind-var ICV to correct value for each implicit task 1557 if (proc_bind_icv != proc_bind_default && 1558 master_th->th.th_current_task->td_icvs.proc_bind != proc_bind_icv) { 1559 kmp_info_t **other_threads = parent_team->t.t_threads; 1560 for (i = 0; i < master_th->th.th_team_nproc; ++i) { 1561 other_threads[i]->th.th_current_task->td_icvs.proc_bind = proc_bind_icv; 1562 } 1563 } 1564 // Reset for next parallel region 1565 master_th->th.th_set_proc_bind = proc_bind_default; 1566 1567 #if USE_ITT_BUILD && USE_ITT_NOTIFY 1568 if (((__itt_frame_submit_v3_ptr && __itt_get_timestamp_ptr) || 1569 KMP_ITT_DEBUG) && 1570 __kmp_forkjoin_frames_mode == 3 && 1571 parent_team->t.t_active_level == 1 // only report frames at level 1 1572 && master_th->th.th_teams_size.nteams == 1) { 1573 kmp_uint64 tmp_time = __itt_get_timestamp(); 1574 master_th->th.th_frame_time = tmp_time; 1575 parent_team->t.t_region_time = tmp_time; 1576 } 1577 if (__itt_stack_caller_create_ptr) { 1578 KMP_DEBUG_ASSERT(parent_team->t.t_stack_id == NULL); 1579 // create new stack stitching id before entering fork barrier 1580 parent_team->t.t_stack_id = __kmp_itt_stack_caller_create(); 1581 } 1582 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */ 1583 #if KMP_AFFINITY_SUPPORTED 1584 __kmp_partition_places(parent_team); 1585 #endif 1586 1587 KF_TRACE(10, ("__kmp_fork_in_teams: before internal fork: root=%p, team=%p, " 1588 "master_th=%p, gtid=%d\n", 1589 root, parent_team, master_th, gtid)); 1590 __kmp_internal_fork(loc, gtid, parent_team); 1591 KF_TRACE(10, ("__kmp_fork_in_teams: after internal fork: root=%p, team=%p, " 1592 "master_th=%p, gtid=%d\n", 1593 root, parent_team, master_th, gtid)); 1594 1595 if (call_context == fork_context_gnu) 1596 return TRUE; 1597 1598 /* Invoke microtask for PRIMARY thread */ 1599 KA_TRACE(20, ("__kmp_fork_in_teams: T#%d(%d:0) invoke microtask = %p\n", gtid, 1600 parent_team->t.t_id, parent_team->t.t_pkfn)); 1601 1602 if (!parent_team->t.t_invoke(gtid)) { 1603 KMP_ASSERT2(0, "cannot invoke microtask for PRIMARY thread"); 1604 } 1605 KA_TRACE(20, ("__kmp_fork_in_teams: T#%d(%d:0) done microtask = %p\n", gtid, 1606 parent_team->t.t_id, parent_team->t.t_pkfn)); 1607 KMP_MB(); /* Flush all pending memory write invalidates. */ 1608 1609 KA_TRACE(20, ("__kmp_fork_in_teams: parallel exit T#%d\n", gtid)); 1610 1611 return TRUE; 1612 } 1613 1614 // Create a serialized parallel region 1615 static inline int 1616 __kmp_serial_fork_call(ident_t *loc, int gtid, enum fork_context_e call_context, 1617 kmp_int32 argc, microtask_t microtask, launch_t invoker, 1618 kmp_info_t *master_th, kmp_team_t *parent_team, 1619 #if OMPT_SUPPORT 1620 ompt_data_t *ompt_parallel_data, void **return_address, 1621 ompt_data_t **parent_task_data, 1622 #endif 1623 kmp_va_list ap) { 1624 kmp_team_t *team; 1625 int i; 1626 void **argv; 1627 1628 /* josh todo: hypothetical question: what do we do for OS X*? 
*/ 1629 #if KMP_OS_LINUX && \ 1630 (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) 1631 void *args[argc]; 1632 #else 1633 void **args = (void **)KMP_ALLOCA(argc * sizeof(void *)); 1634 #endif /* KMP_OS_LINUX && ( KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || \ 1635 KMP_ARCH_AARCH64) */ 1636 1637 KA_TRACE( 1638 20, ("__kmp_serial_fork_call: T#%d serializing parallel region\n", gtid)); 1639 1640 __kmpc_serialized_parallel(loc, gtid); 1641 1642 #if OMPD_SUPPORT 1643 master_th->th.th_serial_team->t.t_pkfn = microtask; 1644 #endif 1645 1646 if (call_context == fork_context_intel) { 1647 /* TODO this sucks, use the compiler itself to pass args! :) */ 1648 master_th->th.th_serial_team->t.t_ident = loc; 1649 if (!ap) { 1650 // revert change made in __kmpc_serialized_parallel() 1651 master_th->th.th_serial_team->t.t_level--; 1652 // Get args from parent team for teams construct 1653 1654 #if OMPT_SUPPORT 1655 void *dummy; 1656 void **exit_frame_p; 1657 ompt_task_info_t *task_info; 1658 ompt_lw_taskteam_t lw_taskteam; 1659 1660 if (ompt_enabled.enabled) { 1661 __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid, 1662 ompt_parallel_data, *return_address); 1663 1664 __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0); 1665 // don't use lw_taskteam after linking. content was swaped 1666 task_info = OMPT_CUR_TASK_INFO(master_th); 1667 exit_frame_p = &(task_info->frame.exit_frame.ptr); 1668 if (ompt_enabled.ompt_callback_implicit_task) { 1669 OMPT_CUR_TASK_INFO(master_th)->thread_num = __kmp_tid_from_gtid(gtid); 1670 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 1671 ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th), 1672 &(task_info->task_data), 1, 1673 OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit); 1674 } 1675 1676 /* OMPT state */ 1677 master_th->th.ompt_thread_info.state = ompt_state_work_parallel; 1678 } else { 1679 exit_frame_p = &dummy; 1680 } 1681 #endif 1682 1683 { 1684 KMP_TIME_PARTITIONED_BLOCK(OMP_parallel); 1685 KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK); 1686 __kmp_invoke_microtask(microtask, gtid, 0, argc, parent_team->t.t_argv 1687 #if OMPT_SUPPORT 1688 , 1689 exit_frame_p 1690 #endif 1691 ); 1692 } 1693 1694 #if OMPT_SUPPORT 1695 if (ompt_enabled.enabled) { 1696 *exit_frame_p = NULL; 1697 if (ompt_enabled.ompt_callback_implicit_task) { 1698 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 1699 ompt_scope_end, NULL, &(task_info->task_data), 1, 1700 OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit); 1701 } 1702 *ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th); 1703 __ompt_lw_taskteam_unlink(master_th); 1704 if (ompt_enabled.ompt_callback_parallel_end) { 1705 ompt_callbacks.ompt_callback(ompt_callback_parallel_end)( 1706 ompt_parallel_data, *parent_task_data, 1707 OMPT_INVOKER(call_context) | ompt_parallel_team, *return_address); 1708 } 1709 master_th->th.ompt_thread_info.state = ompt_state_overhead; 1710 } 1711 #endif 1712 } else if (microtask == (microtask_t)__kmp_teams_master) { 1713 KMP_DEBUG_ASSERT(master_th->th.th_team == master_th->th.th_serial_team); 1714 team = master_th->th.th_team; 1715 // team->t.t_pkfn = microtask; 1716 team->t.t_invoke = invoker; 1717 __kmp_alloc_argv_entries(argc, team, TRUE); 1718 team->t.t_argc = argc; 1719 argv = (void **)team->t.t_argv; 1720 if (ap) { 1721 for (i = argc - 1; i >= 0; --i) 1722 *argv++ = va_arg(kmp_va_deref(ap), void *); 1723 } else { 1724 for (i = 0; i < argc; ++i) 1725 // Get args from parent team for teams construct 1726 argv[i] = parent_team->t.t_argv[i]; 
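#if 0
      // Illustrative sketch (hypothetical user code, not part of the runtime)
      // of where these argv entries come from: the compiler outlines the
      // parallel body into a microtask and passes the addresses of the shared
      // variables as the trailing arguments of the fork entry point, which is
      // what arrives here through ap / t_argv.
      void user_code(void) {
        int x = 0;
        // roughly lowered to: __kmpc_fork_call(&loc, /*argc=*/1, outlined, &x)
        #pragma omp parallel shared(x)
        {
          #pragma omp atomic
          x += 1; // the outlined body receives &x via the argv array
        }
      }
#endif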
1727 } 1728 // AC: revert change made in __kmpc_serialized_parallel() 1729 // because initial code in teams should have level=0 1730 team->t.t_level--; 1731 // AC: call special invoker for outer "parallel" of teams construct 1732 invoker(gtid); 1733 #if OMPT_SUPPORT 1734 if (ompt_enabled.enabled) { 1735 ompt_task_info_t *task_info = OMPT_CUR_TASK_INFO(master_th); 1736 if (ompt_enabled.ompt_callback_implicit_task) { 1737 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 1738 ompt_scope_end, NULL, &(task_info->task_data), 0, 1739 OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_initial); 1740 } 1741 if (ompt_enabled.ompt_callback_parallel_end) { 1742 ompt_callbacks.ompt_callback(ompt_callback_parallel_end)( 1743 ompt_parallel_data, *parent_task_data, 1744 OMPT_INVOKER(call_context) | ompt_parallel_league, 1745 *return_address); 1746 } 1747 master_th->th.ompt_thread_info.state = ompt_state_overhead; 1748 } 1749 #endif 1750 } else { 1751 argv = args; 1752 for (i = argc - 1; i >= 0; --i) 1753 *argv++ = va_arg(kmp_va_deref(ap), void *); 1754 KMP_MB(); 1755 1756 #if OMPT_SUPPORT 1757 void *dummy; 1758 void **exit_frame_p; 1759 ompt_task_info_t *task_info; 1760 ompt_lw_taskteam_t lw_taskteam; 1761 ompt_data_t *implicit_task_data; 1762 1763 if (ompt_enabled.enabled) { 1764 __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid, 1765 ompt_parallel_data, *return_address); 1766 __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0); 1767 // don't use lw_taskteam after linking. content was swaped 1768 task_info = OMPT_CUR_TASK_INFO(master_th); 1769 exit_frame_p = &(task_info->frame.exit_frame.ptr); 1770 1771 /* OMPT implicit task begin */ 1772 implicit_task_data = OMPT_CUR_TASK_DATA(master_th); 1773 if (ompt_enabled.ompt_callback_implicit_task) { 1774 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 1775 ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th), 1776 implicit_task_data, 1, __kmp_tid_from_gtid(gtid), 1777 ompt_task_implicit); 1778 OMPT_CUR_TASK_INFO(master_th)->thread_num = __kmp_tid_from_gtid(gtid); 1779 } 1780 1781 /* OMPT state */ 1782 master_th->th.ompt_thread_info.state = ompt_state_work_parallel; 1783 } else { 1784 exit_frame_p = &dummy; 1785 } 1786 #endif 1787 1788 { 1789 KMP_TIME_PARTITIONED_BLOCK(OMP_parallel); 1790 KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK); 1791 __kmp_invoke_microtask(microtask, gtid, 0, argc, args 1792 #if OMPT_SUPPORT 1793 , 1794 exit_frame_p 1795 #endif 1796 ); 1797 } 1798 1799 #if OMPT_SUPPORT 1800 if (ompt_enabled.enabled) { 1801 *exit_frame_p = NULL; 1802 if (ompt_enabled.ompt_callback_implicit_task) { 1803 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 1804 ompt_scope_end, NULL, &(task_info->task_data), 1, 1805 OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit); 1806 } 1807 1808 *ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th); 1809 __ompt_lw_taskteam_unlink(master_th); 1810 if (ompt_enabled.ompt_callback_parallel_end) { 1811 ompt_callbacks.ompt_callback(ompt_callback_parallel_end)( 1812 ompt_parallel_data, *parent_task_data, 1813 OMPT_INVOKER(call_context) | ompt_parallel_team, *return_address); 1814 } 1815 master_th->th.ompt_thread_info.state = ompt_state_overhead; 1816 } 1817 #endif 1818 } 1819 } else if (call_context == fork_context_gnu) { 1820 #if OMPT_SUPPORT 1821 if (ompt_enabled.enabled) { 1822 ompt_lw_taskteam_t lwt; 1823 __ompt_lw_taskteam_init(&lwt, master_th, gtid, ompt_parallel_data, 1824 *return_address); 1825 1826 lwt.ompt_task_info.frame.exit_frame = ompt_data_none; 1827 __ompt_lw_taskteam_link(&lwt, 
master_th, 1); 1828 } 1829 // don't use lw_taskteam after linking. content was swaped 1830 #endif 1831 1832 // we were called from GNU native code 1833 KA_TRACE(20, ("__kmp_serial_fork_call: T#%d serial exit\n", gtid)); 1834 return FALSE; 1835 } else { 1836 KMP_ASSERT2(call_context < fork_context_last, 1837 "__kmp_serial_fork_call: unknown fork_context parameter"); 1838 } 1839 1840 KA_TRACE(20, ("__kmp_serial_fork_call: T#%d serial exit\n", gtid)); 1841 KMP_MB(); 1842 return FALSE; 1843 } 1844 1845 /* most of the work for a fork */ 1846 /* return true if we really went parallel, false if serialized */ 1847 int __kmp_fork_call(ident_t *loc, int gtid, 1848 enum fork_context_e call_context, // Intel, GNU, ... 1849 kmp_int32 argc, microtask_t microtask, launch_t invoker, 1850 kmp_va_list ap) { 1851 void **argv; 1852 int i; 1853 int master_tid; 1854 int master_this_cons; 1855 kmp_team_t *team; 1856 kmp_team_t *parent_team; 1857 kmp_info_t *master_th; 1858 kmp_root_t *root; 1859 int nthreads; 1860 int master_active; 1861 int master_set_numthreads; 1862 int level; 1863 int active_level; 1864 int teams_level; 1865 #if KMP_NESTED_HOT_TEAMS 1866 kmp_hot_team_ptr_t **p_hot_teams; 1867 #endif 1868 { // KMP_TIME_BLOCK 1869 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_fork_call); 1870 KMP_COUNT_VALUE(OMP_PARALLEL_args, argc); 1871 1872 KA_TRACE(20, ("__kmp_fork_call: enter T#%d\n", gtid)); 1873 if (__kmp_stkpadding > 0 && __kmp_root[gtid] != NULL) { 1874 /* Some systems prefer the stack for the root thread(s) to start with */ 1875 /* some gap from the parent stack to prevent false sharing. */ 1876 void *dummy = KMP_ALLOCA(__kmp_stkpadding); 1877 /* These 2 lines below are so this does not get optimized out */ 1878 if (__kmp_stkpadding > KMP_MAX_STKPADDING) 1879 __kmp_stkpadding += (short)((kmp_int64)dummy); 1880 } 1881 1882 /* initialize if needed */ 1883 KMP_DEBUG_ASSERT( 1884 __kmp_init_serial); // AC: potentially unsafe, not in sync with shutdown 1885 if (!TCR_4(__kmp_init_parallel)) 1886 __kmp_parallel_initialize(); 1887 __kmp_resume_if_soft_paused(); 1888 1889 /* setup current data */ 1890 // AC: potentially unsafe, not in sync with library shutdown, 1891 // __kmp_threads can be freed 1892 master_th = __kmp_threads[gtid]; 1893 1894 parent_team = master_th->th.th_team; 1895 master_tid = master_th->th.th_info.ds.ds_tid; 1896 master_this_cons = master_th->th.th_local.this_construct; 1897 root = master_th->th.th_root; 1898 master_active = root->r.r_active; 1899 master_set_numthreads = master_th->th.th_set_nproc; 1900 1901 #if OMPT_SUPPORT 1902 ompt_data_t ompt_parallel_data = ompt_data_none; 1903 ompt_data_t *parent_task_data; 1904 ompt_frame_t *ompt_frame; 1905 void *return_address = NULL; 1906 1907 if (ompt_enabled.enabled) { 1908 __ompt_get_task_info_internal(0, NULL, &parent_task_data, &ompt_frame, 1909 NULL, NULL); 1910 return_address = OMPT_LOAD_RETURN_ADDRESS(gtid); 1911 } 1912 #endif 1913 1914 // Assign affinity to root thread if it hasn't happened yet 1915 __kmp_assign_root_init_mask(); 1916 1917 // Nested level will be an index in the nested nthreads array 1918 level = parent_team->t.t_level; 1919 // used to launch non-serial teams even if nested is not allowed 1920 active_level = parent_team->t.t_active_level; 1921 // needed to check nesting inside the teams 1922 teams_level = master_th->th.th_teams_level; 1923 #if KMP_NESTED_HOT_TEAMS 1924 p_hot_teams = &master_th->th.th_hot_teams; 1925 if (*p_hot_teams == NULL && __kmp_hot_teams_max_level > 0) { 1926 *p_hot_teams = (kmp_hot_team_ptr_t 
*)__kmp_allocate( 1927 sizeof(kmp_hot_team_ptr_t) * __kmp_hot_teams_max_level); 1928 (*p_hot_teams)[0].hot_team = root->r.r_hot_team; 1929 // it is either actual or not needed (when active_level > 0) 1930 (*p_hot_teams)[0].hot_team_nth = 1; 1931 } 1932 #endif 1933 1934 #if OMPT_SUPPORT 1935 if (ompt_enabled.enabled) { 1936 if (ompt_enabled.ompt_callback_parallel_begin) { 1937 int team_size = master_set_numthreads 1938 ? master_set_numthreads 1939 : get__nproc_2(parent_team, master_tid); 1940 int flags = OMPT_INVOKER(call_context) | 1941 ((microtask == (microtask_t)__kmp_teams_master) 1942 ? ompt_parallel_league 1943 : ompt_parallel_team); 1944 ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)( 1945 parent_task_data, ompt_frame, &ompt_parallel_data, team_size, flags, 1946 return_address); 1947 } 1948 master_th->th.ompt_thread_info.state = ompt_state_overhead; 1949 } 1950 #endif 1951 1952 master_th->th.th_ident = loc; 1953 1954 // Parallel closely nested in teams construct: 1955 if (__kmp_is_fork_in_teams(master_th, microtask, level, teams_level, ap)) { 1956 return __kmp_fork_in_teams(loc, gtid, parent_team, argc, master_th, root, 1957 call_context, microtask, invoker, 1958 master_set_numthreads, level, 1959 #if OMPT_SUPPORT 1960 ompt_parallel_data, return_address, 1961 #endif 1962 ap); 1963 } // End parallel closely nested in teams construct 1964 1965 #if KMP_DEBUG 1966 if (__kmp_tasking_mode != tskm_immediate_exec) { 1967 KMP_DEBUG_ASSERT(master_th->th.th_task_team == 1968 parent_team->t.t_task_team[master_th->th.th_task_state]); 1969 } 1970 #endif 1971 1972 // Need this to happen before we determine the number of threads, not while 1973 // we are allocating the team 1974 //__kmp_push_current_task_to_thread(master_th, parent_team, 0); 1975 1976 // Determine the number of threads 1977 int enter_teams = 1978 __kmp_is_entering_teams(active_level, level, teams_level, ap); 1979 if ((!enter_teams && 1980 (parent_team->t.t_active_level >= 1981 master_th->th.th_current_task->td_icvs.max_active_levels)) || 1982 (__kmp_library == library_serial)) { 1983 KC_TRACE(10, ("__kmp_fork_call: T#%d serializing team\n", gtid)); 1984 nthreads = 1; 1985 } else { 1986 nthreads = master_set_numthreads 1987 ? master_set_numthreads 1988 // TODO: get nproc directly from current task 1989 : get__nproc_2(parent_team, master_tid); 1990 // Check if we need to take forkjoin lock? (no need for serialized 1991 // parallel out of teams construct). 1992 if (nthreads > 1) { 1993 /* determine how many new threads we can use */ 1994 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock); 1995 /* AC: If we execute teams from parallel region (on host), then teams 1996 should be created but each can only have 1 thread if nesting is 1997 disabled. If teams called from serial region, then teams and their 1998 threads should be created regardless of the nesting setting. 
*/ 1999 nthreads = __kmp_reserve_threads(root, parent_team, master_tid, 2000 nthreads, enter_teams); 2001 if (nthreads == 1) { 2002 // Free lock for single thread execution here; for multi-thread 2003 // execution it will be freed later after team of threads created 2004 // and initialized 2005 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); 2006 } 2007 } 2008 } 2009 KMP_DEBUG_ASSERT(nthreads > 0); 2010 2011 // If we temporarily changed the set number of threads then restore it now 2012 master_th->th.th_set_nproc = 0; 2013 2014 if (nthreads == 1) { 2015 return __kmp_serial_fork_call(loc, gtid, call_context, argc, microtask, 2016 invoker, master_th, parent_team, 2017 #if OMPT_SUPPORT 2018 &ompt_parallel_data, &return_address, 2019 &parent_task_data, 2020 #endif 2021 ap); 2022 } // if (nthreads == 1) 2023 2024 // GEH: only modify the executing flag in the case when not serialized 2025 // serialized case is handled in kmpc_serialized_parallel 2026 KF_TRACE(10, ("__kmp_fork_call: parent_team_aclevel=%d, master_th=%p, " 2027 "curtask=%p, curtask_max_aclevel=%d\n", 2028 parent_team->t.t_active_level, master_th, 2029 master_th->th.th_current_task, 2030 master_th->th.th_current_task->td_icvs.max_active_levels)); 2031 // TODO: GEH - cannot do this assertion because root thread not set up as 2032 // executing 2033 // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 1 ); 2034 master_th->th.th_current_task->td_flags.executing = 0; 2035 2036 if (!master_th->th.th_teams_microtask || level > teams_level) { 2037 /* Increment our nested depth level */ 2038 KMP_ATOMIC_INC(&root->r.r_in_parallel); 2039 } 2040 2041 // See if we need to make a copy of the ICVs. 2042 int nthreads_icv = master_th->th.th_current_task->td_icvs.nproc; 2043 if ((level + 1 < __kmp_nested_nth.used) && 2044 (__kmp_nested_nth.nth[level + 1] != nthreads_icv)) { 2045 nthreads_icv = __kmp_nested_nth.nth[level + 1]; 2046 } else { 2047 nthreads_icv = 0; // don't update 2048 } 2049 2050 // Figure out the proc_bind_policy for the new team. 2051 kmp_proc_bind_t proc_bind = master_th->th.th_set_proc_bind; 2052 // proc_bind_default means don't update 2053 kmp_proc_bind_t proc_bind_icv = proc_bind_default; 2054 if (master_th->th.th_current_task->td_icvs.proc_bind == proc_bind_false) { 2055 proc_bind = proc_bind_false; 2056 } else { 2057 // No proc_bind clause specified; use current proc-bind-var for this 2058 // parallel region 2059 if (proc_bind == proc_bind_default) { 2060 proc_bind = master_th->th.th_current_task->td_icvs.proc_bind; 2061 } 2062 // Have teams construct take proc_bind value from KMP_TEAMS_PROC_BIND 2063 if (master_th->th.th_teams_microtask && 2064 microtask == (microtask_t)__kmp_teams_master) { 2065 proc_bind = __kmp_teams_proc_bind; 2066 } 2067 /* else: The proc_bind policy was specified explicitly on parallel clause. 2068 This overrides proc-bind-var for this parallel region, but does not 2069 change proc-bind-var. */ 2070 // Figure the value of proc-bind-var for the child threads. 
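    // For example (illustrative values): OMP_NUM_THREADS="8,4" gives the outer
    // parallel 8 threads via the nproc ICV and fills __kmp_nested_nth.nth[] =
    // {8, 4} with .used = 2, so the nthreads_icv check above propagates
    // nth[1] == 4 as the nproc ICV of the new team, i.e. the default width of
    // any parallel region nested one level deeper.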
2071 if ((level + 1 < __kmp_nested_proc_bind.used) && 2072 (__kmp_nested_proc_bind.bind_types[level + 1] != 2073 master_th->th.th_current_task->td_icvs.proc_bind)) { 2074 // Do not modify the proc bind icv for the two teams construct forks 2075 // They just let the proc bind icv pass through 2076 if (!master_th->th.th_teams_microtask || 2077 !(microtask == (microtask_t)__kmp_teams_master || ap == NULL)) 2078 proc_bind_icv = __kmp_nested_proc_bind.bind_types[level + 1]; 2079 } 2080 } 2081 2082 // Reset for next parallel region 2083 master_th->th.th_set_proc_bind = proc_bind_default; 2084 2085 if ((nthreads_icv > 0) || (proc_bind_icv != proc_bind_default)) { 2086 kmp_internal_control_t new_icvs; 2087 copy_icvs(&new_icvs, &master_th->th.th_current_task->td_icvs); 2088 new_icvs.next = NULL; 2089 if (nthreads_icv > 0) { 2090 new_icvs.nproc = nthreads_icv; 2091 } 2092 if (proc_bind_icv != proc_bind_default) { 2093 new_icvs.proc_bind = proc_bind_icv; 2094 } 2095 2096 /* allocate a new parallel team */ 2097 KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n")); 2098 team = __kmp_allocate_team(root, nthreads, nthreads, 2099 #if OMPT_SUPPORT 2100 ompt_parallel_data, 2101 #endif 2102 proc_bind, &new_icvs, 2103 argc USE_NESTED_HOT_ARG(master_th)); 2104 if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) 2105 copy_icvs((kmp_internal_control_t *)team->t.b->team_icvs, &new_icvs); 2106 } else { 2107 /* allocate a new parallel team */ 2108 KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n")); 2109 team = __kmp_allocate_team(root, nthreads, nthreads, 2110 #if OMPT_SUPPORT 2111 ompt_parallel_data, 2112 #endif 2113 proc_bind, 2114 &master_th->th.th_current_task->td_icvs, 2115 argc USE_NESTED_HOT_ARG(master_th)); 2116 if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) 2117 copy_icvs((kmp_internal_control_t *)team->t.b->team_icvs, 2118 &master_th->th.th_current_task->td_icvs); 2119 } 2120 KF_TRACE( 2121 10, ("__kmp_fork_call: after __kmp_allocate_team - team = %p\n", team)); 2122 2123 /* setup the new team */ 2124 KMP_CHECK_UPDATE(team->t.t_master_tid, master_tid); 2125 KMP_CHECK_UPDATE(team->t.t_master_this_cons, master_this_cons); 2126 KMP_CHECK_UPDATE(team->t.t_ident, loc); 2127 KMP_CHECK_UPDATE(team->t.t_parent, parent_team); 2128 KMP_CHECK_UPDATE_SYNC(team->t.t_pkfn, microtask); 2129 #if OMPT_SUPPORT 2130 KMP_CHECK_UPDATE_SYNC(team->t.ompt_team_info.master_return_address, 2131 return_address); 2132 #endif 2133 KMP_CHECK_UPDATE(team->t.t_invoke, invoker); // TODO move to root, maybe 2134 // TODO: parent_team->t.t_level == INT_MAX ??? 
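    // The team setup below relies on the KMP_CHECK_UPDATE idiom: a field is
    // stored only when its value actually changes, so re-using a hot team does
    // not needlessly dirty cache lines shared with the workers. A minimal
    // sketch of the idea (illustrative macro, not the runtime's definition):
#if 0
#define EXAMPLE_CHECK_UPDATE(dst, val)                                         \
  do {                                                                         \
    if ((dst) != (val))                                                        \
      (dst) = (val);                                                           \
  } while (0)
#endif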
2135 if (!master_th->th.th_teams_microtask || level > teams_level) { 2136 int new_level = parent_team->t.t_level + 1; 2137 KMP_CHECK_UPDATE(team->t.t_level, new_level); 2138 new_level = parent_team->t.t_active_level + 1; 2139 KMP_CHECK_UPDATE(team->t.t_active_level, new_level); 2140 } else { 2141 // AC: Do not increase parallel level at start of the teams construct 2142 int new_level = parent_team->t.t_level; 2143 KMP_CHECK_UPDATE(team->t.t_level, new_level); 2144 new_level = parent_team->t.t_active_level; 2145 KMP_CHECK_UPDATE(team->t.t_active_level, new_level); 2146 } 2147 kmp_r_sched_t new_sched = get__sched_2(parent_team, master_tid); 2148 // set primary thread's schedule as new run-time schedule 2149 KMP_CHECK_UPDATE(team->t.t_sched.sched, new_sched.sched); 2150 2151 KMP_CHECK_UPDATE(team->t.t_cancel_request, cancel_noreq); 2152 KMP_CHECK_UPDATE(team->t.t_def_allocator, master_th->th.th_def_allocator); 2153 2154 // Update the floating point rounding in the team if required. 2155 propagateFPControl(team); 2156 #if OMPD_SUPPORT 2157 if (ompd_state & OMPD_ENABLE_BP) 2158 ompd_bp_parallel_begin(); 2159 #endif 2160 2161 if (__kmp_tasking_mode != tskm_immediate_exec) { 2162 // Set primary thread's task team to team's task team. Unless this is hot 2163 // team, it should be NULL. 2164 KMP_DEBUG_ASSERT(master_th->th.th_task_team == 2165 parent_team->t.t_task_team[master_th->th.th_task_state]); 2166 KA_TRACE(20, ("__kmp_fork_call: Primary T#%d pushing task_team %p / team " 2167 "%p, new task_team %p / team %p\n", 2168 __kmp_gtid_from_thread(master_th), 2169 master_th->th.th_task_team, parent_team, 2170 team->t.t_task_team[master_th->th.th_task_state], team)); 2171 2172 if (active_level || master_th->th.th_task_team) { 2173 // Take a memo of primary thread's task_state 2174 KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack); 2175 if (master_th->th.th_task_state_top >= 2176 master_th->th.th_task_state_stack_sz) { // increase size 2177 kmp_uint32 new_size = 2 * master_th->th.th_task_state_stack_sz; 2178 kmp_uint8 *old_stack, *new_stack; 2179 kmp_uint32 i; 2180 new_stack = (kmp_uint8 *)__kmp_allocate(new_size); 2181 for (i = 0; i < master_th->th.th_task_state_stack_sz; ++i) { 2182 new_stack[i] = master_th->th.th_task_state_memo_stack[i]; 2183 } 2184 for (i = master_th->th.th_task_state_stack_sz; i < new_size; 2185 ++i) { // zero-init rest of stack 2186 new_stack[i] = 0; 2187 } 2188 old_stack = master_th->th.th_task_state_memo_stack; 2189 master_th->th.th_task_state_memo_stack = new_stack; 2190 master_th->th.th_task_state_stack_sz = new_size; 2191 __kmp_free(old_stack); 2192 } 2193 // Store primary thread's task_state on stack 2194 master_th->th 2195 .th_task_state_memo_stack[master_th->th.th_task_state_top] = 2196 master_th->th.th_task_state; 2197 master_th->th.th_task_state_top++; 2198 #if KMP_NESTED_HOT_TEAMS 2199 if (master_th->th.th_hot_teams && 2200 active_level < __kmp_hot_teams_max_level && 2201 team == master_th->th.th_hot_teams[active_level].hot_team) { 2202 // Restore primary thread's nested state if nested hot team 2203 master_th->th.th_task_state = 2204 master_th->th 2205 .th_task_state_memo_stack[master_th->th.th_task_state_top]; 2206 } else { 2207 #endif 2208 master_th->th.th_task_state = 0; 2209 #if KMP_NESTED_HOT_TEAMS 2210 } 2211 #endif 2212 } 2213 #if !KMP_NESTED_HOT_TEAMS 2214 KMP_DEBUG_ASSERT((master_th->th.th_task_team == NULL) || 2215 (team == root->r.r_hot_team)); 2216 #endif 2217 } 2218 2219 KA_TRACE( 2220 20, 2221 ("__kmp_fork_call: T#%d(%d:%d)->(%d:0) created a team 
of %d threads\n", 2222 gtid, parent_team->t.t_id, team->t.t_master_tid, team->t.t_id, 2223 team->t.t_nproc)); 2224 KMP_DEBUG_ASSERT(team != root->r.r_hot_team || 2225 (team->t.t_master_tid == 0 && 2226 (team->t.t_parent == root->r.r_root_team || 2227 team->t.t_parent->t.t_serialized))); 2228 KMP_MB(); 2229 2230 /* now, setup the arguments */ 2231 argv = (void **)team->t.t_argv; 2232 if (ap) { 2233 for (i = argc - 1; i >= 0; --i) { 2234 void *new_argv = va_arg(kmp_va_deref(ap), void *); 2235 KMP_CHECK_UPDATE(*argv, new_argv); 2236 argv++; 2237 } 2238 } else { 2239 for (i = 0; i < argc; ++i) { 2240 // Get args from parent team for teams construct 2241 KMP_CHECK_UPDATE(argv[i], team->t.t_parent->t.t_argv[i]); 2242 } 2243 } 2244 2245 /* now actually fork the threads */ 2246 KMP_CHECK_UPDATE(team->t.t_master_active, master_active); 2247 if (!root->r.r_active) // Only do assignment if it prevents cache ping-pong 2248 root->r.r_active = TRUE; 2249 2250 __kmp_fork_team_threads(root, team, master_th, gtid, !ap); 2251 __kmp_setup_icv_copy(team, nthreads, 2252 &master_th->th.th_current_task->td_icvs, loc); 2253 2254 #if OMPT_SUPPORT 2255 master_th->th.ompt_thread_info.state = ompt_state_work_parallel; 2256 #endif 2257 2258 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); 2259 2260 #if USE_ITT_BUILD 2261 if (team->t.t_active_level == 1 // only report frames at level 1 2262 && !master_th->th.th_teams_microtask) { // not in teams construct 2263 #if USE_ITT_NOTIFY 2264 if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) && 2265 (__kmp_forkjoin_frames_mode == 3 || 2266 __kmp_forkjoin_frames_mode == 1)) { 2267 kmp_uint64 tmp_time = 0; 2268 if (__itt_get_timestamp_ptr) 2269 tmp_time = __itt_get_timestamp(); 2270 // Internal fork - report frame begin 2271 master_th->th.th_frame_time = tmp_time; 2272 if (__kmp_forkjoin_frames_mode == 3) 2273 team->t.t_region_time = tmp_time; 2274 } else 2275 // only one notification scheme (either "submit" or "forking/joined", not both) 2276 #endif /* USE_ITT_NOTIFY */ 2277 if ((__itt_frame_begin_v3_ptr || KMP_ITT_DEBUG) && 2278 __kmp_forkjoin_frames && !__kmp_forkjoin_frames_mode) { 2279 // Mark start of "parallel" region for Intel(R) VTune(TM) analyzer. 
2280 __kmp_itt_region_forking(gtid, team->t.t_nproc, 0); 2281 } 2282 } 2283 #endif /* USE_ITT_BUILD */ 2284 2285 /* now go on and do the work */ 2286 KMP_DEBUG_ASSERT(team == __kmp_threads[gtid]->th.th_team); 2287 KMP_MB(); 2288 KF_TRACE(10, 2289 ("__kmp_internal_fork : root=%p, team=%p, master_th=%p, gtid=%d\n", 2290 root, team, master_th, gtid)); 2291 2292 #if USE_ITT_BUILD 2293 if (__itt_stack_caller_create_ptr) { 2294 // create new stack stitching id before entering fork barrier 2295 if (!enter_teams) { 2296 KMP_DEBUG_ASSERT(team->t.t_stack_id == NULL); 2297 team->t.t_stack_id = __kmp_itt_stack_caller_create(); 2298 } else if (parent_team->t.t_serialized) { 2299 // keep stack stitching id in the serialized parent_team; 2300 // current team will be used for parallel inside the teams; 2301 // if parent_team is active, then it already keeps stack stitching id 2302 // for the league of teams 2303 KMP_DEBUG_ASSERT(parent_team->t.t_stack_id == NULL); 2304 parent_team->t.t_stack_id = __kmp_itt_stack_caller_create(); 2305 } 2306 } 2307 #endif /* USE_ITT_BUILD */ 2308 2309 // AC: skip __kmp_internal_fork at teams construct, let only primary 2310 // threads execute 2311 if (ap) { 2312 __kmp_internal_fork(loc, gtid, team); 2313 KF_TRACE(10, ("__kmp_internal_fork : after : root=%p, team=%p, " 2314 "master_th=%p, gtid=%d\n", 2315 root, team, master_th, gtid)); 2316 } 2317 2318 if (call_context == fork_context_gnu) { 2319 KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid)); 2320 return TRUE; 2321 } 2322 2323 /* Invoke microtask for PRIMARY thread */ 2324 KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n", gtid, 2325 team->t.t_id, team->t.t_pkfn)); 2326 } // END of timer KMP_fork_call block 2327 2328 #if KMP_STATS_ENABLED 2329 // If beginning a teams construct, then change thread state 2330 stats_state_e previous_state = KMP_GET_THREAD_STATE(); 2331 if (!ap) { 2332 KMP_SET_THREAD_STATE(stats_state_e::TEAMS_REGION); 2333 } 2334 #endif 2335 2336 if (!team->t.t_invoke(gtid)) { 2337 KMP_ASSERT2(0, "cannot invoke microtask for PRIMARY thread"); 2338 } 2339 2340 #if KMP_STATS_ENABLED 2341 // If was beginning of a teams construct, then reset thread state 2342 if (!ap) { 2343 KMP_SET_THREAD_STATE(previous_state); 2344 } 2345 #endif 2346 2347 KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n", gtid, 2348 team->t.t_id, team->t.t_pkfn)); 2349 KMP_MB(); /* Flush all pending memory write invalidates. */ 2350 2351 KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid)); 2352 #if OMPT_SUPPORT 2353 if (ompt_enabled.enabled) { 2354 master_th->th.ompt_thread_info.state = ompt_state_overhead; 2355 } 2356 #endif 2357 2358 return TRUE; 2359 } 2360 2361 #if OMPT_SUPPORT 2362 static inline void __kmp_join_restore_state(kmp_info_t *thread, 2363 kmp_team_t *team) { 2364 // restore state outside the region 2365 thread->th.ompt_thread_info.state = 2366 ((team->t.t_serialized) ? 
ompt_state_work_serial 2367 : ompt_state_work_parallel); 2368 } 2369 2370 static inline void __kmp_join_ompt(int gtid, kmp_info_t *thread, 2371 kmp_team_t *team, ompt_data_t *parallel_data, 2372 int flags, void *codeptr) { 2373 ompt_task_info_t *task_info = __ompt_get_task_info_object(0); 2374 if (ompt_enabled.ompt_callback_parallel_end) { 2375 ompt_callbacks.ompt_callback(ompt_callback_parallel_end)( 2376 parallel_data, &(task_info->task_data), flags, codeptr); 2377 } 2378 2379 task_info->frame.enter_frame = ompt_data_none; 2380 __kmp_join_restore_state(thread, team); 2381 } 2382 #endif 2383 2384 void __kmp_join_call(ident_t *loc, int gtid 2385 #if OMPT_SUPPORT 2386 , 2387 enum fork_context_e fork_context 2388 #endif 2389 , 2390 int exit_teams) { 2391 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_join_call); 2392 kmp_team_t *team; 2393 kmp_team_t *parent_team; 2394 kmp_info_t *master_th; 2395 kmp_root_t *root; 2396 int master_active; 2397 2398 KA_TRACE(20, ("__kmp_join_call: enter T#%d\n", gtid)); 2399 2400 /* setup current data */ 2401 master_th = __kmp_threads[gtid]; 2402 root = master_th->th.th_root; 2403 team = master_th->th.th_team; 2404 parent_team = team->t.t_parent; 2405 2406 master_th->th.th_ident = loc; 2407 2408 #if OMPT_SUPPORT 2409 void *team_microtask = (void *)team->t.t_pkfn; 2410 // For GOMP interface with serialized parallel, need the 2411 // __kmpc_end_serialized_parallel to call hooks for OMPT end-implicit-task 2412 // and end-parallel events. 2413 if (ompt_enabled.enabled && 2414 !(team->t.t_serialized && fork_context == fork_context_gnu)) { 2415 master_th->th.ompt_thread_info.state = ompt_state_overhead; 2416 } 2417 #endif 2418 2419 #if KMP_DEBUG 2420 if (__kmp_tasking_mode != tskm_immediate_exec && !exit_teams) { 2421 KA_TRACE(20, ("__kmp_join_call: T#%d, old team = %p old task_team = %p, " 2422 "th_task_team = %p\n", 2423 __kmp_gtid_from_thread(master_th), team, 2424 team->t.t_task_team[master_th->th.th_task_state], 2425 master_th->th.th_task_team)); 2426 KMP_DEBUG_ASSERT(master_th->th.th_task_team == 2427 team->t.t_task_team[master_th->th.th_task_state]); 2428 } 2429 #endif 2430 2431 if (team->t.t_serialized) { 2432 if (master_th->th.th_teams_microtask) { 2433 // We are in teams construct 2434 int level = team->t.t_level; 2435 int tlevel = master_th->th.th_teams_level; 2436 if (level == tlevel) { 2437 // AC: we haven't incremented it earlier at start of teams construct, 2438 // so do it here - at the end of teams construct 2439 team->t.t_level++; 2440 } else if (level == tlevel + 1) { 2441 // AC: we are exiting parallel inside teams, need to increment 2442 // serialization in order to restore it in the next call to 2443 // __kmpc_end_serialized_parallel 2444 team->t.t_serialized++; 2445 } 2446 } 2447 __kmpc_end_serialized_parallel(loc, gtid); 2448 2449 #if OMPT_SUPPORT 2450 if (ompt_enabled.enabled) { 2451 if (fork_context == fork_context_gnu) { 2452 __ompt_lw_taskteam_unlink(master_th); 2453 } 2454 __kmp_join_restore_state(master_th, parent_team); 2455 } 2456 #endif 2457 2458 return; 2459 } 2460 2461 master_active = team->t.t_master_active; 2462 2463 if (!exit_teams) { 2464 // AC: No barrier for internal teams at exit from teams construct. 2465 // But there is barrier for external team (league). 
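#if 0
    // Illustrative user code (hypothetical) for the two join paths handled by
    // this if/else: a "parallel" nested in "teams" joins through
    // __kmp_internal_join below, while the exit from the "teams" construct
    // itself arrives with exit_teams != 0 and skips the per-team join barrier.
    void example(void) {
      #pragma omp teams num_teams(2)
      {
        #pragma omp parallel
        { /* join handled on the !exit_teams path */ }
      } /* teams exit: handled on the exit_teams path */
    }
#endif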
2466 __kmp_internal_join(loc, gtid, team); 2467 #if USE_ITT_BUILD 2468 if (__itt_stack_caller_create_ptr) { 2469 KMP_DEBUG_ASSERT(team->t.t_stack_id != NULL); 2470 // destroy the stack stitching id after join barrier 2471 __kmp_itt_stack_caller_destroy((__itt_caller)team->t.t_stack_id); 2472 team->t.t_stack_id = NULL; 2473 } 2474 #endif 2475 } else { 2476 master_th->th.th_task_state = 2477 0; // AC: no tasking in teams (out of any parallel) 2478 #if USE_ITT_BUILD 2479 if (__itt_stack_caller_create_ptr && parent_team->t.t_serialized) { 2480 KMP_DEBUG_ASSERT(parent_team->t.t_stack_id != NULL); 2481 // destroy the stack stitching id on exit from the teams construct 2482 // if parent_team is active, then the id will be destroyed later on 2483 // by master of the league of teams 2484 __kmp_itt_stack_caller_destroy((__itt_caller)parent_team->t.t_stack_id); 2485 parent_team->t.t_stack_id = NULL; 2486 } 2487 #endif 2488 } 2489 2490 KMP_MB(); 2491 2492 #if OMPT_SUPPORT 2493 ompt_data_t *parallel_data = &(team->t.ompt_team_info.parallel_data); 2494 void *codeptr = team->t.ompt_team_info.master_return_address; 2495 #endif 2496 2497 #if USE_ITT_BUILD 2498 // Mark end of "parallel" region for Intel(R) VTune(TM) analyzer. 2499 if (team->t.t_active_level == 1 && 2500 (!master_th->th.th_teams_microtask || /* not in teams construct */ 2501 master_th->th.th_teams_size.nteams == 1)) { 2502 master_th->th.th_ident = loc; 2503 // only one notification scheme (either "submit" or "forking/joined", not 2504 // both) 2505 if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) && 2506 __kmp_forkjoin_frames_mode == 3) 2507 __kmp_itt_frame_submit(gtid, team->t.t_region_time, 2508 master_th->th.th_frame_time, 0, loc, 2509 master_th->th.th_team_nproc, 1); 2510 else if ((__itt_frame_end_v3_ptr || KMP_ITT_DEBUG) && 2511 !__kmp_forkjoin_frames_mode && __kmp_forkjoin_frames) 2512 __kmp_itt_region_joined(gtid); 2513 } // active_level == 1 2514 #endif /* USE_ITT_BUILD */ 2515 2516 #if KMP_AFFINITY_SUPPORTED 2517 if (!exit_teams) { 2518 // Restore master thread's partition. 2519 master_th->th.th_first_place = team->t.t_first_place; 2520 master_th->th.th_last_place = team->t.t_last_place; 2521 } 2522 #endif // KMP_AFFINITY_SUPPORTED 2523 2524 if (master_th->th.th_teams_microtask && !exit_teams && 2525 team->t.t_pkfn != (microtask_t)__kmp_teams_master && 2526 team->t.t_level == master_th->th.th_teams_level + 1) { 2527 // AC: We need to leave the team structure intact at the end of parallel 2528 // inside the teams construct, so that at the next parallel same (hot) team 2529 // works, only adjust nesting levels 2530 #if OMPT_SUPPORT 2531 ompt_data_t ompt_parallel_data = ompt_data_none; 2532 if (ompt_enabled.enabled) { 2533 ompt_task_info_t *task_info = __ompt_get_task_info_object(0); 2534 if (ompt_enabled.ompt_callback_implicit_task) { 2535 int ompt_team_size = team->t.t_nproc; 2536 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 2537 ompt_scope_end, NULL, &(task_info->task_data), ompt_team_size, 2538 OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit); 2539 } 2540 task_info->frame.exit_frame = ompt_data_none; 2541 task_info->task_data = ompt_data_none; 2542 ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th); 2543 __ompt_lw_taskteam_unlink(master_th); 2544 } 2545 #endif 2546 /* Decrement our nested depth level */ 2547 team->t.t_level--; 2548 team->t.t_active_level--; 2549 KMP_ATOMIC_DEC(&root->r.r_in_parallel); 2550 2551 // Restore number of threads in the team if needed. 
This code relies on 2552 // the proper adjustment of th_teams_size.nth after the fork in 2553 // __kmp_teams_master on each teams primary thread in the case that 2554 // __kmp_reserve_threads reduced it. 2555 if (master_th->th.th_team_nproc < master_th->th.th_teams_size.nth) { 2556 int old_num = master_th->th.th_team_nproc; 2557 int new_num = master_th->th.th_teams_size.nth; 2558 kmp_info_t **other_threads = team->t.t_threads; 2559 team->t.t_nproc = new_num; 2560 for (int i = 0; i < old_num; ++i) { 2561 other_threads[i]->th.th_team_nproc = new_num; 2562 } 2563 // Adjust states of non-used threads of the team 2564 for (int i = old_num; i < new_num; ++i) { 2565 // Re-initialize thread's barrier data. 2566 KMP_DEBUG_ASSERT(other_threads[i]); 2567 kmp_balign_t *balign = other_threads[i]->th.th_bar; 2568 for (int b = 0; b < bs_last_barrier; ++b) { 2569 balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived; 2570 KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG); 2571 #if USE_DEBUGGER 2572 balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived; 2573 #endif 2574 } 2575 if (__kmp_tasking_mode != tskm_immediate_exec) { 2576 // Synchronize thread's task state 2577 other_threads[i]->th.th_task_state = master_th->th.th_task_state; 2578 } 2579 } 2580 } 2581 2582 #if OMPT_SUPPORT 2583 if (ompt_enabled.enabled) { 2584 __kmp_join_ompt(gtid, master_th, parent_team, &ompt_parallel_data, 2585 OMPT_INVOKER(fork_context) | ompt_parallel_team, codeptr); 2586 } 2587 #endif 2588 2589 return; 2590 } 2591 2592 /* do cleanup and restore the parent team */ 2593 master_th->th.th_info.ds.ds_tid = team->t.t_master_tid; 2594 master_th->th.th_local.this_construct = team->t.t_master_this_cons; 2595 2596 master_th->th.th_dispatch = &parent_team->t.t_dispatch[team->t.t_master_tid]; 2597 2598 /* jc: The following lock has instructions with REL and ACQ semantics, 2599 separating the parallel user code called in this parallel region 2600 from the serial user code called after this function returns. */ 2601 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock); 2602 2603 if (!master_th->th.th_teams_microtask || 2604 team->t.t_level > master_th->th.th_teams_level) { 2605 /* Decrement our nested depth level */ 2606 KMP_ATOMIC_DEC(&root->r.r_in_parallel); 2607 } 2608 KMP_DEBUG_ASSERT(root->r.r_in_parallel >= 0); 2609 2610 #if OMPT_SUPPORT 2611 if (ompt_enabled.enabled) { 2612 ompt_task_info_t *task_info = __ompt_get_task_info_object(0); 2613 if (ompt_enabled.ompt_callback_implicit_task) { 2614 int flags = (team_microtask == (void *)__kmp_teams_master) 2615 ? ompt_task_initial 2616 : ompt_task_implicit; 2617 int ompt_team_size = (flags == ompt_task_initial) ? 
0 : team->t.t_nproc; 2618 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 2619 ompt_scope_end, NULL, &(task_info->task_data), ompt_team_size, 2620 OMPT_CUR_TASK_INFO(master_th)->thread_num, flags); 2621 } 2622 task_info->frame.exit_frame = ompt_data_none; 2623 task_info->task_data = ompt_data_none; 2624 } 2625 #endif 2626 2627 KF_TRACE(10, ("__kmp_join_call1: T#%d, this_thread=%p team=%p\n", 0, 2628 master_th, team)); 2629 __kmp_pop_current_task_from_thread(master_th); 2630 2631 master_th->th.th_def_allocator = team->t.t_def_allocator; 2632 2633 #if OMPD_SUPPORT 2634 if (ompd_state & OMPD_ENABLE_BP) 2635 ompd_bp_parallel_end(); 2636 #endif 2637 updateHWFPControl(team); 2638 2639 if (root->r.r_active != master_active) 2640 root->r.r_active = master_active; 2641 2642 __kmp_free_team(root, team USE_NESTED_HOT_ARG( 2643 master_th)); // this will free worker threads 2644 2645 /* this race was fun to find. make sure the following is in the critical 2646 region otherwise assertions may fail occasionally since the old team may be 2647 reallocated and the hierarchy appears inconsistent. it is actually safe to 2648 run and won't cause any bugs, but will cause those assertion failures. it's 2649 only one deref&assign so might as well put this in the critical region */ 2650 master_th->th.th_team = parent_team; 2651 master_th->th.th_team_nproc = parent_team->t.t_nproc; 2652 master_th->th.th_team_master = parent_team->t.t_threads[0]; 2653 master_th->th.th_team_serialized = parent_team->t.t_serialized; 2654 2655 /* restore serialized team, if need be */ 2656 if (parent_team->t.t_serialized && 2657 parent_team != master_th->th.th_serial_team && 2658 parent_team != root->r.r_root_team) { 2659 __kmp_free_team(root, 2660 master_th->th.th_serial_team USE_NESTED_HOT_ARG(NULL)); 2661 master_th->th.th_serial_team = parent_team; 2662 } 2663 2664 if (__kmp_tasking_mode != tskm_immediate_exec) { 2665 if (master_th->th.th_task_state_top > 2666 0) { // Restore task state from memo stack 2667 KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack); 2668 // Remember primary thread's state if we re-use this nested hot team 2669 master_th->th.th_task_state_memo_stack[master_th->th.th_task_state_top] = 2670 master_th->th.th_task_state; 2671 --master_th->th.th_task_state_top; // pop 2672 // Now restore state at this level 2673 master_th->th.th_task_state = 2674 master_th->th 2675 .th_task_state_memo_stack[master_th->th.th_task_state_top]; 2676 } else if (team != root->r.r_hot_team) { 2677 // Reset the task state of primary thread if we are not hot team because 2678 // in this case all the worker threads will be free, and their task state 2679 // will be reset. If not reset the primary's, the task state will be 2680 // inconsistent. 
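    // (Memo-stack protocol, in short: __kmp_fork_call pushes the primary
    // thread's th_task_state before starting a nested team, the branch above
    // pops and restores it on the way out, and this branch simply resets the
    // state when nothing is stacked and the team is not the hot team.)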
2681 master_th->th.th_task_state = 0; 2682 } 2683 // Copy the task team from the parent team to the primary thread 2684 master_th->th.th_task_team = 2685 parent_team->t.t_task_team[master_th->th.th_task_state]; 2686 KA_TRACE(20, 2687 ("__kmp_join_call: Primary T#%d restoring task_team %p, team %p\n", 2688 __kmp_gtid_from_thread(master_th), master_th->th.th_task_team, 2689 parent_team)); 2690 } 2691 2692 // TODO: GEH - cannot do this assertion because root thread not set up as 2693 // executing 2694 // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 0 ); 2695 master_th->th.th_current_task->td_flags.executing = 1; 2696 2697 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); 2698 2699 #if KMP_AFFINITY_SUPPORTED 2700 if (master_th->th.th_team->t.t_level == 0 && __kmp_affinity.flags.reset) { 2701 __kmp_reset_root_init_mask(gtid); 2702 } 2703 #endif 2704 #if OMPT_SUPPORT 2705 int flags = 2706 OMPT_INVOKER(fork_context) | 2707 ((team_microtask == (void *)__kmp_teams_master) ? ompt_parallel_league 2708 : ompt_parallel_team); 2709 if (ompt_enabled.enabled) { 2710 __kmp_join_ompt(gtid, master_th, parent_team, parallel_data, flags, 2711 codeptr); 2712 } 2713 #endif 2714 2715 KMP_MB(); 2716 KA_TRACE(20, ("__kmp_join_call: exit T#%d\n", gtid)); 2717 } 2718 2719 /* Check whether we should push an internal control record onto the 2720 serial team stack. If so, do it. */ 2721 void __kmp_save_internal_controls(kmp_info_t *thread) { 2722 2723 if (thread->th.th_team != thread->th.th_serial_team) { 2724 return; 2725 } 2726 if (thread->th.th_team->t.t_serialized > 1) { 2727 int push = 0; 2728 2729 if (thread->th.th_team->t.t_control_stack_top == NULL) { 2730 push = 1; 2731 } else { 2732 if (thread->th.th_team->t.t_control_stack_top->serial_nesting_level != 2733 thread->th.th_team->t.t_serialized) { 2734 push = 1; 2735 } 2736 } 2737 if (push) { /* push a record on the serial team's stack */ 2738 kmp_internal_control_t *control = 2739 (kmp_internal_control_t *)__kmp_allocate( 2740 sizeof(kmp_internal_control_t)); 2741 2742 copy_icvs(control, &thread->th.th_current_task->td_icvs); 2743 2744 control->serial_nesting_level = thread->th.th_team->t.t_serialized; 2745 2746 control->next = thread->th.th_team->t.t_control_stack_top; 2747 thread->th.th_team->t.t_control_stack_top = control; 2748 } 2749 } 2750 } 2751 2752 /* Changes set_nproc */ 2753 void __kmp_set_num_threads(int new_nth, int gtid) { 2754 kmp_info_t *thread; 2755 kmp_root_t *root; 2756 2757 KF_TRACE(10, ("__kmp_set_num_threads: new __kmp_nth = %d\n", new_nth)); 2758 KMP_DEBUG_ASSERT(__kmp_init_serial); 2759 2760 if (new_nth < 1) 2761 new_nth = 1; 2762 else if (new_nth > __kmp_max_nth) 2763 new_nth = __kmp_max_nth; 2764 2765 KMP_COUNT_VALUE(OMP_set_numthreads, new_nth); 2766 thread = __kmp_threads[gtid]; 2767 if (thread->th.th_current_task->td_icvs.nproc == new_nth) 2768 return; // nothing to do 2769 2770 __kmp_save_internal_controls(thread); 2771 2772 set__nproc(thread, new_nth); 2773 2774 // If this omp_set_num_threads() call will cause the hot team size to be 2775 // reduced (in the absence of a num_threads clause), then reduce it now, 2776 // rather than waiting for the next parallel region. 
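#if 0
  // Illustrative user code (hypothetical) for the path below: shrinking the
  // thread count between parallel regions trims the hot team inside this call
  // rather than at the next fork.
  #include <omp.h>
  void example(void) {
    #pragma omp parallel
    { /* first region runs with the current default team size */ }
    omp_set_num_threads(2); // hot team reduced here, in __kmp_set_num_threads
    #pragma omp parallel
    { /* next region re-uses the (now smaller) hot team */ }
  }
#endif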
2777 root = thread->th.th_root; 2778 if (__kmp_init_parallel && (!root->r.r_active) && 2779 (root->r.r_hot_team->t.t_nproc > new_nth) 2780 #if KMP_NESTED_HOT_TEAMS 2781 && __kmp_hot_teams_max_level && !__kmp_hot_teams_mode 2782 #endif 2783 ) { 2784 kmp_team_t *hot_team = root->r.r_hot_team; 2785 int f; 2786 2787 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock); 2788 2789 if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) { 2790 __kmp_resize_dist_barrier(hot_team, hot_team->t.t_nproc, new_nth); 2791 } 2792 // Release the extra threads we don't need any more. 2793 for (f = new_nth; f < hot_team->t.t_nproc; f++) { 2794 KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL); 2795 if (__kmp_tasking_mode != tskm_immediate_exec) { 2796 // When decreasing team size, threads no longer in the team should unref 2797 // task team. 2798 hot_team->t.t_threads[f]->th.th_task_team = NULL; 2799 } 2800 __kmp_free_thread(hot_team->t.t_threads[f]); 2801 hot_team->t.t_threads[f] = NULL; 2802 } 2803 hot_team->t.t_nproc = new_nth; 2804 #if KMP_NESTED_HOT_TEAMS 2805 if (thread->th.th_hot_teams) { 2806 KMP_DEBUG_ASSERT(hot_team == thread->th.th_hot_teams[0].hot_team); 2807 thread->th.th_hot_teams[0].hot_team_nth = new_nth; 2808 } 2809 #endif 2810 2811 if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) { 2812 hot_team->t.b->update_num_threads(new_nth); 2813 __kmp_add_threads_to_team(hot_team, new_nth); 2814 } 2815 2816 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); 2817 2818 // Update the t_nproc field in the threads that are still active. 2819 for (f = 0; f < new_nth; f++) { 2820 KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL); 2821 hot_team->t.t_threads[f]->th.th_team_nproc = new_nth; 2822 } 2823 // Special flag in case omp_set_num_threads() call 2824 hot_team->t.t_size_changed = -1; 2825 } 2826 } 2827 2828 /* Changes max_active_levels */ 2829 void __kmp_set_max_active_levels(int gtid, int max_active_levels) { 2830 kmp_info_t *thread; 2831 2832 KF_TRACE(10, ("__kmp_set_max_active_levels: new max_active_levels for thread " 2833 "%d = (%d)\n", 2834 gtid, max_active_levels)); 2835 KMP_DEBUG_ASSERT(__kmp_init_serial); 2836 2837 // validate max_active_levels 2838 if (max_active_levels < 0) { 2839 KMP_WARNING(ActiveLevelsNegative, max_active_levels); 2840 // We ignore this call if the user has specified a negative value. 2841 // The current setting won't be changed. The last valid setting will be 2842 // used. A warning will be issued (if warnings are allowed as controlled by 2843 // the KMP_WARNINGS env var). 2844 KF_TRACE(10, ("__kmp_set_max_active_levels: the call is ignored: new " 2845 "max_active_levels for thread %d = (%d)\n", 2846 gtid, max_active_levels)); 2847 return; 2848 } 2849 if (max_active_levels <= KMP_MAX_ACTIVE_LEVELS_LIMIT) { 2850 // it's OK, the max_active_levels is within the valid range: [ 0; 2851 // KMP_MAX_ACTIVE_LEVELS_LIMIT ] 2852 // We allow a zero value. (implementation defined behavior) 2853 } else { 2854 KMP_WARNING(ActiveLevelsExceedLimit, max_active_levels, 2855 KMP_MAX_ACTIVE_LEVELS_LIMIT); 2856 max_active_levels = KMP_MAX_ACTIVE_LEVELS_LIMIT; 2857 // Current upper limit is MAX_INT. (implementation defined behavior) 2858 // If the input exceeds the upper limit, we correct the input to be the 2859 // upper limit. (implementation defined behavior) 2860 // Actually, the flow should never get here until we use MAX_INT limit. 
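#if 0
  // Illustrative summary (hypothetical user code) of the validation above:
  // negative values are ignored with a warning and the previous setting is
  // kept; values above KMP_MAX_ACTIVE_LEVELS_LIMIT are clamped; zero is
  // accepted as well (implementation defined behavior).
  #include <omp.h>
  void example(void) {
    omp_set_max_active_levels(2);  // accepted: at most two active levels
    omp_set_max_active_levels(-1); // ignored: the value 2 stays in effect
  }
#endif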
2861 } 2862 KF_TRACE(10, ("__kmp_set_max_active_levels: after validation: new " 2863 "max_active_levels for thread %d = (%d)\n", 2864 gtid, max_active_levels)); 2865 2866 thread = __kmp_threads[gtid]; 2867 2868 __kmp_save_internal_controls(thread); 2869 2870 set__max_active_levels(thread, max_active_levels); 2871 } 2872 2873 /* Gets max_active_levels */ 2874 int __kmp_get_max_active_levels(int gtid) { 2875 kmp_info_t *thread; 2876 2877 KF_TRACE(10, ("__kmp_get_max_active_levels: thread %d\n", gtid)); 2878 KMP_DEBUG_ASSERT(__kmp_init_serial); 2879 2880 thread = __kmp_threads[gtid]; 2881 KMP_DEBUG_ASSERT(thread->th.th_current_task); 2882 KF_TRACE(10, ("__kmp_get_max_active_levels: thread %d, curtask=%p, " 2883 "curtask_maxaclevel=%d\n", 2884 gtid, thread->th.th_current_task, 2885 thread->th.th_current_task->td_icvs.max_active_levels)); 2886 return thread->th.th_current_task->td_icvs.max_active_levels; 2887 } 2888 2889 // nteams-var per-device ICV 2890 void __kmp_set_num_teams(int num_teams) { 2891 if (num_teams > 0) 2892 __kmp_nteams = num_teams; 2893 } 2894 int __kmp_get_max_teams(void) { return __kmp_nteams; } 2895 // teams-thread-limit-var per-device ICV 2896 void __kmp_set_teams_thread_limit(int limit) { 2897 if (limit > 0) 2898 __kmp_teams_thread_limit = limit; 2899 } 2900 int __kmp_get_teams_thread_limit(void) { return __kmp_teams_thread_limit; } 2901 2902 KMP_BUILD_ASSERT(sizeof(kmp_sched_t) == sizeof(int)); 2903 KMP_BUILD_ASSERT(sizeof(enum sched_type) == sizeof(int)); 2904 2905 /* Changes def_sched_var ICV values (run-time schedule kind and chunk) */ 2906 void __kmp_set_schedule(int gtid, kmp_sched_t kind, int chunk) { 2907 kmp_info_t *thread; 2908 kmp_sched_t orig_kind; 2909 // kmp_team_t *team; 2910 2911 KF_TRACE(10, ("__kmp_set_schedule: new schedule for thread %d = (%d, %d)\n", 2912 gtid, (int)kind, chunk)); 2913 KMP_DEBUG_ASSERT(__kmp_init_serial); 2914 2915 // Check if the kind parameter is valid, correct if needed. 2916 // Valid parameters should fit in one of two intervals - standard or extended: 2917 // <lower>, <valid>, <upper_std>, <lower_ext>, <valid>, <upper> 2918 // 2008-01-25: 0, 1 - 4, 5, 100, 101 - 102, 103 2919 orig_kind = kind; 2920 kind = __kmp_sched_without_mods(kind); 2921 2922 if (kind <= kmp_sched_lower || kind >= kmp_sched_upper || 2923 (kind <= kmp_sched_lower_ext && kind >= kmp_sched_upper_std)) { 2924 // TODO: Hint needs attention in case we change the default schedule. 2925 __kmp_msg(kmp_ms_warning, KMP_MSG(ScheduleKindOutOfRange, kind), 2926 KMP_HNT(DefaultScheduleKindUsed, "static, no chunk"), 2927 __kmp_msg_null); 2928 kind = kmp_sched_default; 2929 chunk = 0; // ignore chunk value in case of bad kind 2930 } 2931 2932 thread = __kmp_threads[gtid]; 2933 2934 __kmp_save_internal_controls(thread); 2935 2936 if (kind < kmp_sched_upper_std) { 2937 if (kind == kmp_sched_static && chunk < KMP_DEFAULT_CHUNK) { 2938 // differ static chunked vs. 
unchunked: chunk should be invalid to 2939 // indicate unchunked schedule (which is the default) 2940 thread->th.th_current_task->td_icvs.sched.r_sched_type = kmp_sch_static; 2941 } else { 2942 thread->th.th_current_task->td_icvs.sched.r_sched_type = 2943 __kmp_sch_map[kind - kmp_sched_lower - 1]; 2944 } 2945 } else { 2946 // __kmp_sch_map[ kind - kmp_sched_lower_ext + kmp_sched_upper_std - 2947 // kmp_sched_lower - 2 ]; 2948 thread->th.th_current_task->td_icvs.sched.r_sched_type = 2949 __kmp_sch_map[kind - kmp_sched_lower_ext + kmp_sched_upper_std - 2950 kmp_sched_lower - 2]; 2951 } 2952 __kmp_sched_apply_mods_intkind( 2953 orig_kind, &(thread->th.th_current_task->td_icvs.sched.r_sched_type)); 2954 if (kind == kmp_sched_auto || chunk < 1) { 2955 // ignore parameter chunk for schedule auto 2956 thread->th.th_current_task->td_icvs.sched.chunk = KMP_DEFAULT_CHUNK; 2957 } else { 2958 thread->th.th_current_task->td_icvs.sched.chunk = chunk; 2959 } 2960 } 2961 2962 /* Gets def_sched_var ICV values */ 2963 void __kmp_get_schedule(int gtid, kmp_sched_t *kind, int *chunk) { 2964 kmp_info_t *thread; 2965 enum sched_type th_type; 2966 2967 KF_TRACE(10, ("__kmp_get_schedule: thread %d\n", gtid)); 2968 KMP_DEBUG_ASSERT(__kmp_init_serial); 2969 2970 thread = __kmp_threads[gtid]; 2971 2972 th_type = thread->th.th_current_task->td_icvs.sched.r_sched_type; 2973 switch (SCHEDULE_WITHOUT_MODIFIERS(th_type)) { 2974 case kmp_sch_static: 2975 case kmp_sch_static_greedy: 2976 case kmp_sch_static_balanced: 2977 *kind = kmp_sched_static; 2978 __kmp_sched_apply_mods_stdkind(kind, th_type); 2979 *chunk = 0; // chunk was not set, try to show this fact via zero value 2980 return; 2981 case kmp_sch_static_chunked: 2982 *kind = kmp_sched_static; 2983 break; 2984 case kmp_sch_dynamic_chunked: 2985 *kind = kmp_sched_dynamic; 2986 break; 2987 case kmp_sch_guided_chunked: 2988 case kmp_sch_guided_iterative_chunked: 2989 case kmp_sch_guided_analytical_chunked: 2990 *kind = kmp_sched_guided; 2991 break; 2992 case kmp_sch_auto: 2993 *kind = kmp_sched_auto; 2994 break; 2995 case kmp_sch_trapezoidal: 2996 *kind = kmp_sched_trapezoidal; 2997 break; 2998 #if KMP_STATIC_STEAL_ENABLED 2999 case kmp_sch_static_steal: 3000 *kind = kmp_sched_static_steal; 3001 break; 3002 #endif 3003 default: 3004 KMP_FATAL(UnknownSchedulingType, th_type); 3005 } 3006 3007 __kmp_sched_apply_mods_stdkind(kind, th_type); 3008 *chunk = thread->th.th_current_task->td_icvs.sched.chunk; 3009 } 3010 3011 int __kmp_get_ancestor_thread_num(int gtid, int level) { 3012 3013 int ii, dd; 3014 kmp_team_t *team; 3015 kmp_info_t *thr; 3016 3017 KF_TRACE(10, ("__kmp_get_ancestor_thread_num: thread %d %d\n", gtid, level)); 3018 KMP_DEBUG_ASSERT(__kmp_init_serial); 3019 3020 // validate level 3021 if (level == 0) 3022 return 0; 3023 if (level < 0) 3024 return -1; 3025 thr = __kmp_threads[gtid]; 3026 team = thr->th.th_team; 3027 ii = team->t.t_level; 3028 if (level > ii) 3029 return -1; 3030 3031 if (thr->th.th_teams_microtask) { 3032 // AC: we are in teams region where multiple nested teams have same level 3033 int tlevel = thr->th.th_teams_level; // the level of the teams construct 3034 if (level <= 3035 tlevel) { // otherwise usual algorithm works (will not touch the teams) 3036 KMP_DEBUG_ASSERT(ii >= tlevel); 3037 // AC: As we need to pass by the teams league, we need to artificially 3038 // increase ii 3039 if (ii == tlevel) { 3040 ii += 2; // three teams have same level 3041 } else { 3042 ii++; // two teams have same level 3043 } 3044 } 3045 } 3046 3047 if (ii == 
level) 3048 return __kmp_tid_from_gtid(gtid); 3049 3050 dd = team->t.t_serialized; 3051 level++; 3052 while (ii > level) { 3053 for (dd = team->t.t_serialized; (dd > 0) && (ii > level); dd--, ii--) { 3054 } 3055 if ((team->t.t_serialized) && (!dd)) { 3056 team = team->t.t_parent; 3057 continue; 3058 } 3059 if (ii > level) { 3060 team = team->t.t_parent; 3061 dd = team->t.t_serialized; 3062 ii--; 3063 } 3064 } 3065 3066 return (dd > 1) ? (0) : (team->t.t_master_tid); 3067 } 3068 3069 int __kmp_get_team_size(int gtid, int level) { 3070 3071 int ii, dd; 3072 kmp_team_t *team; 3073 kmp_info_t *thr; 3074 3075 KF_TRACE(10, ("__kmp_get_team_size: thread %d %d\n", gtid, level)); 3076 KMP_DEBUG_ASSERT(__kmp_init_serial); 3077 3078 // validate level 3079 if (level == 0) 3080 return 1; 3081 if (level < 0) 3082 return -1; 3083 thr = __kmp_threads[gtid]; 3084 team = thr->th.th_team; 3085 ii = team->t.t_level; 3086 if (level > ii) 3087 return -1; 3088 3089 if (thr->th.th_teams_microtask) { 3090 // AC: we are in teams region where multiple nested teams have same level 3091 int tlevel = thr->th.th_teams_level; // the level of the teams construct 3092 if (level <= 3093 tlevel) { // otherwise usual algorithm works (will not touch the teams) 3094 KMP_DEBUG_ASSERT(ii >= tlevel); 3095 // AC: As we need to pass by the teams league, we need to artificially 3096 // increase ii 3097 if (ii == tlevel) { 3098 ii += 2; // three teams have same level 3099 } else { 3100 ii++; // two teams have same level 3101 } 3102 } 3103 } 3104 3105 while (ii > level) { 3106 for (dd = team->t.t_serialized; (dd > 0) && (ii > level); dd--, ii--) { 3107 } 3108 if (team->t.t_serialized && (!dd)) { 3109 team = team->t.t_parent; 3110 continue; 3111 } 3112 if (ii > level) { 3113 team = team->t.t_parent; 3114 ii--; 3115 } 3116 } 3117 3118 return team->t.t_nproc; 3119 } 3120 3121 kmp_r_sched_t __kmp_get_schedule_global() { 3122 // This routine created because pairs (__kmp_sched, __kmp_chunk) and 3123 // (__kmp_static, __kmp_guided) may be changed by kmp_set_defaults 3124 // independently. So one can get the updated schedule here. 3125 3126 kmp_r_sched_t r_sched; 3127 3128 // create schedule from 4 globals: __kmp_sched, __kmp_chunk, __kmp_static, 3129 // __kmp_guided. __kmp_sched should keep original value, so that user can set 3130 // KMP_SCHEDULE multiple times, and thus have different run-time schedules in 3131 // different roots (even in OMP 2.5) 3132 enum sched_type s = SCHEDULE_WITHOUT_MODIFIERS(__kmp_sched); 3133 enum sched_type sched_modifiers = SCHEDULE_GET_MODIFIERS(__kmp_sched); 3134 if (s == kmp_sch_static) { 3135 // replace STATIC with more detailed schedule (balanced or greedy) 3136 r_sched.r_sched_type = __kmp_static; 3137 } else if (s == kmp_sch_guided_chunked) { 3138 // replace GUIDED with more detailed schedule (iterative or analytical) 3139 r_sched.r_sched_type = __kmp_guided; 3140 } else { // (STATIC_CHUNKED), or (DYNAMIC_CHUNKED), or other 3141 r_sched.r_sched_type = __kmp_sched; 3142 } 3143 SCHEDULE_SET_MODIFIERS(r_sched.r_sched_type, sched_modifiers); 3144 3145 if (__kmp_chunk < KMP_DEFAULT_CHUNK) { 3146 // __kmp_chunk may be wrong here (if it was not ever set) 3147 r_sched.chunk = KMP_DEFAULT_CHUNK; 3148 } else { 3149 r_sched.chunk = __kmp_chunk; 3150 } 3151 3152 return r_sched; 3153 } 3154 3155 /* Allocate (realloc == FALSE) * or reallocate (realloc == TRUE) 3156 at least argc number of *t_argv entries for the requested team. 
*/ 3157 static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team, int realloc) { 3158 3159 KMP_DEBUG_ASSERT(team); 3160 if (!realloc || argc > team->t.t_max_argc) { 3161 3162 KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: needed entries=%d, " 3163 "current entries=%d\n", 3164 team->t.t_id, argc, (realloc) ? team->t.t_max_argc : 0)); 3165 /* if previously allocated heap space for args, free them */ 3166 if (realloc && team->t.t_argv != &team->t.t_inline_argv[0]) 3167 __kmp_free((void *)team->t.t_argv); 3168 3169 if (argc <= KMP_INLINE_ARGV_ENTRIES) { 3170 /* use unused space in the cache line for arguments */ 3171 team->t.t_max_argc = KMP_INLINE_ARGV_ENTRIES; 3172 KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: inline allocate %d " 3173 "argv entries\n", 3174 team->t.t_id, team->t.t_max_argc)); 3175 team->t.t_argv = &team->t.t_inline_argv[0]; 3176 if (__kmp_storage_map) { 3177 __kmp_print_storage_map_gtid( 3178 -1, &team->t.t_inline_argv[0], 3179 &team->t.t_inline_argv[KMP_INLINE_ARGV_ENTRIES], 3180 (sizeof(void *) * KMP_INLINE_ARGV_ENTRIES), "team_%d.t_inline_argv", 3181 team->t.t_id); 3182 } 3183 } else { 3184 /* allocate space for arguments in the heap */ 3185 team->t.t_max_argc = (argc <= (KMP_MIN_MALLOC_ARGV_ENTRIES >> 1)) 3186 ? KMP_MIN_MALLOC_ARGV_ENTRIES 3187 : 2 * argc; 3188 KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: dynamic allocate %d " 3189 "argv entries\n", 3190 team->t.t_id, team->t.t_max_argc)); 3191 team->t.t_argv = 3192 (void **)__kmp_page_allocate(sizeof(void *) * team->t.t_max_argc); 3193 if (__kmp_storage_map) { 3194 __kmp_print_storage_map_gtid(-1, &team->t.t_argv[0], 3195 &team->t.t_argv[team->t.t_max_argc], 3196 sizeof(void *) * team->t.t_max_argc, 3197 "team_%d.t_argv", team->t.t_id); 3198 } 3199 } 3200 } 3201 } 3202 3203 static void __kmp_allocate_team_arrays(kmp_team_t *team, int max_nth) { 3204 int i; 3205 int num_disp_buff = max_nth > 1 ? 
__kmp_dispatch_num_buffers : 2; 3206 team->t.t_threads = 3207 (kmp_info_t **)__kmp_allocate(sizeof(kmp_info_t *) * max_nth); 3208 team->t.t_disp_buffer = (dispatch_shared_info_t *)__kmp_allocate( 3209 sizeof(dispatch_shared_info_t) * num_disp_buff); 3210 team->t.t_dispatch = 3211 (kmp_disp_t *)__kmp_allocate(sizeof(kmp_disp_t) * max_nth); 3212 team->t.t_implicit_task_taskdata = 3213 (kmp_taskdata_t *)__kmp_allocate(sizeof(kmp_taskdata_t) * max_nth); 3214 team->t.t_max_nproc = max_nth; 3215 3216 /* setup dispatch buffers */ 3217 for (i = 0; i < num_disp_buff; ++i) { 3218 team->t.t_disp_buffer[i].buffer_index = i; 3219 team->t.t_disp_buffer[i].doacross_buf_idx = i; 3220 } 3221 } 3222 3223 static void __kmp_free_team_arrays(kmp_team_t *team) { 3224 /* Note: this does not free the threads in t_threads (__kmp_free_threads) */ 3225 int i; 3226 for (i = 0; i < team->t.t_max_nproc; ++i) { 3227 if (team->t.t_dispatch[i].th_disp_buffer != NULL) { 3228 __kmp_free(team->t.t_dispatch[i].th_disp_buffer); 3229 team->t.t_dispatch[i].th_disp_buffer = NULL; 3230 } 3231 } 3232 #if KMP_USE_HIER_SCHED 3233 __kmp_dispatch_free_hierarchies(team); 3234 #endif 3235 __kmp_free(team->t.t_threads); 3236 __kmp_free(team->t.t_disp_buffer); 3237 __kmp_free(team->t.t_dispatch); 3238 __kmp_free(team->t.t_implicit_task_taskdata); 3239 team->t.t_threads = NULL; 3240 team->t.t_disp_buffer = NULL; 3241 team->t.t_dispatch = NULL; 3242 team->t.t_implicit_task_taskdata = 0; 3243 } 3244 3245 static void __kmp_reallocate_team_arrays(kmp_team_t *team, int max_nth) { 3246 kmp_info_t **oldThreads = team->t.t_threads; 3247 3248 __kmp_free(team->t.t_disp_buffer); 3249 __kmp_free(team->t.t_dispatch); 3250 __kmp_free(team->t.t_implicit_task_taskdata); 3251 __kmp_allocate_team_arrays(team, max_nth); 3252 3253 KMP_MEMCPY(team->t.t_threads, oldThreads, 3254 team->t.t_nproc * sizeof(kmp_info_t *)); 3255 3256 __kmp_free(oldThreads); 3257 } 3258 3259 static kmp_internal_control_t __kmp_get_global_icvs(void) { 3260 3261 kmp_r_sched_t r_sched = 3262 __kmp_get_schedule_global(); // get current state of scheduling globals 3263 3264 KMP_DEBUG_ASSERT(__kmp_nested_proc_bind.used > 0); 3265 3266 kmp_internal_control_t g_icvs = { 3267 0, // int serial_nesting_level; //corresponds to value of th_team_serialized 3268 (kmp_int8)__kmp_global.g.g_dynamic, // internal control for dynamic 3269 // adjustment of threads (per thread) 3270 (kmp_int8)__kmp_env_blocktime, // int bt_set; //internal control for 3271 // whether blocktime is explicitly set 3272 __kmp_dflt_blocktime, // int blocktime; //internal control for blocktime 3273 #if KMP_USE_MONITOR 3274 __kmp_bt_intervals, // int bt_intervals; //internal control for blocktime 3275 // intervals 3276 #endif 3277 __kmp_dflt_team_nth, // int nproc; //internal control for # of threads for 3278 // next parallel region (per thread) 3279 // (use a max ub on value if __kmp_parallel_initialize not called yet) 3280 __kmp_cg_max_nth, // int thread_limit; 3281 __kmp_dflt_max_active_levels, // int max_active_levels; //internal control 3282 // for max_active_levels 3283 r_sched, // kmp_r_sched_t sched; //internal control for runtime schedule 3284 // {sched,chunk} pair 3285 __kmp_nested_proc_bind.bind_types[0], 3286 __kmp_default_device, 3287 NULL // struct kmp_internal_control *next; 3288 }; 3289 3290 return g_icvs; 3291 } 3292 3293 static kmp_internal_control_t __kmp_get_x_global_icvs(const kmp_team_t *team) { 3294 3295 kmp_internal_control_t gx_icvs; 3296 gx_icvs.serial_nesting_level = 3297 0; // probably =team->t.t_serial 
like in save_inter_controls 3298 copy_icvs(&gx_icvs, &team->t.t_threads[0]->th.th_current_task->td_icvs); 3299 gx_icvs.next = NULL; 3300 3301 return gx_icvs; 3302 } 3303 3304 static void __kmp_initialize_root(kmp_root_t *root) { 3305 int f; 3306 kmp_team_t *root_team; 3307 kmp_team_t *hot_team; 3308 int hot_team_max_nth; 3309 kmp_r_sched_t r_sched = 3310 __kmp_get_schedule_global(); // get current state of scheduling globals 3311 kmp_internal_control_t r_icvs = __kmp_get_global_icvs(); 3312 KMP_DEBUG_ASSERT(root); 3313 KMP_ASSERT(!root->r.r_begin); 3314 3315 /* setup the root state structure */ 3316 __kmp_init_lock(&root->r.r_begin_lock); 3317 root->r.r_begin = FALSE; 3318 root->r.r_active = FALSE; 3319 root->r.r_in_parallel = 0; 3320 root->r.r_blocktime = __kmp_dflt_blocktime; 3321 #if KMP_AFFINITY_SUPPORTED 3322 root->r.r_affinity_assigned = FALSE; 3323 #endif 3324 3325 /* setup the root team for this task */ 3326 /* allocate the root team structure */ 3327 KF_TRACE(10, ("__kmp_initialize_root: before root_team\n")); 3328 3329 root_team = 3330 __kmp_allocate_team(root, 3331 1, // new_nproc 3332 1, // max_nproc 3333 #if OMPT_SUPPORT 3334 ompt_data_none, // root parallel id 3335 #endif 3336 __kmp_nested_proc_bind.bind_types[0], &r_icvs, 3337 0 // argc 3338 USE_NESTED_HOT_ARG(NULL) // primary thread is unknown 3339 ); 3340 #if USE_DEBUGGER 3341 // Non-NULL value should be assigned to make the debugger display the root 3342 // team. 3343 TCW_SYNC_PTR(root_team->t.t_pkfn, (microtask_t)(~0)); 3344 #endif 3345 3346 KF_TRACE(10, ("__kmp_initialize_root: after root_team = %p\n", root_team)); 3347 3348 root->r.r_root_team = root_team; 3349 root_team->t.t_control_stack_top = NULL; 3350 3351 /* initialize root team */ 3352 root_team->t.t_threads[0] = NULL; 3353 root_team->t.t_nproc = 1; 3354 root_team->t.t_serialized = 1; 3355 // TODO???: root_team->t.t_max_active_levels = __kmp_dflt_max_active_levels; 3356 root_team->t.t_sched.sched = r_sched.sched; 3357 KA_TRACE( 3358 20, 3359 ("__kmp_initialize_root: init root team %d arrived: join=%u, plain=%u\n", 3360 root_team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE)); 3361 3362 /* setup the hot team for this task */ 3363 /* allocate the hot team structure */ 3364 KF_TRACE(10, ("__kmp_initialize_root: before hot_team\n")); 3365 3366 hot_team = 3367 __kmp_allocate_team(root, 3368 1, // new_nproc 3369 __kmp_dflt_team_nth_ub * 2, // max_nproc 3370 #if OMPT_SUPPORT 3371 ompt_data_none, // root parallel id 3372 #endif 3373 __kmp_nested_proc_bind.bind_types[0], &r_icvs, 3374 0 // argc 3375 USE_NESTED_HOT_ARG(NULL) // primary thread is unknown 3376 ); 3377 KF_TRACE(10, ("__kmp_initialize_root: after hot_team = %p\n", hot_team)); 3378 3379 root->r.r_hot_team = hot_team; 3380 root_team->t.t_control_stack_top = NULL; 3381 3382 /* first-time initialization */ 3383 hot_team->t.t_parent = root_team; 3384 3385 /* initialize hot team */ 3386 hot_team_max_nth = hot_team->t.t_max_nproc; 3387 for (f = 0; f < hot_team_max_nth; ++f) { 3388 hot_team->t.t_threads[f] = NULL; 3389 } 3390 hot_team->t.t_nproc = 1; 3391 // TODO???: hot_team->t.t_max_active_levels = __kmp_dflt_max_active_levels; 3392 hot_team->t.t_sched.sched = r_sched.sched; 3393 hot_team->t.t_size_changed = 0; 3394 } 3395 3396 #ifdef KMP_DEBUG 3397 3398 typedef struct kmp_team_list_item { 3399 kmp_team_p const *entry; 3400 struct kmp_team_list_item *next; 3401 } kmp_team_list_item_t; 3402 typedef kmp_team_list_item_t *kmp_team_list_t; 3403 3404 static void __kmp_print_structure_team_accum( // Add team 
to list of teams. 3405 kmp_team_list_t list, // List of teams. 3406 kmp_team_p const *team // Team to add. 3407 ) { 3408 3409 // List must terminate with item where both entry and next are NULL. 3410 // Team is added to the list only once. 3411 // List is sorted in ascending order by team id. 3412 // Team id is *not* a key. 3413 3414 kmp_team_list_t l; 3415 3416 KMP_DEBUG_ASSERT(list != NULL); 3417 if (team == NULL) { 3418 return; 3419 } 3420 3421 __kmp_print_structure_team_accum(list, team->t.t_parent); 3422 __kmp_print_structure_team_accum(list, team->t.t_next_pool); 3423 3424 // Search list for the team. 3425 l = list; 3426 while (l->next != NULL && l->entry != team) { 3427 l = l->next; 3428 } 3429 if (l->next != NULL) { 3430 return; // Team has been added before, exit. 3431 } 3432 3433 // Team is not found. Search list again for insertion point. 3434 l = list; 3435 while (l->next != NULL && l->entry->t.t_id <= team->t.t_id) { 3436 l = l->next; 3437 } 3438 3439 // Insert team. 3440 { 3441 kmp_team_list_item_t *item = (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC( 3442 sizeof(kmp_team_list_item_t)); 3443 *item = *l; 3444 l->entry = team; 3445 l->next = item; 3446 } 3447 } 3448 3449 static void __kmp_print_structure_team(char const *title, kmp_team_p const *team 3450 3451 ) { 3452 __kmp_printf("%s", title); 3453 if (team != NULL) { 3454 __kmp_printf("%2x %p\n", team->t.t_id, team); 3455 } else { 3456 __kmp_printf(" - (nil)\n"); 3457 } 3458 } 3459 3460 static void __kmp_print_structure_thread(char const *title, 3461 kmp_info_p const *thread) { 3462 __kmp_printf("%s", title); 3463 if (thread != NULL) { 3464 __kmp_printf("%2d %p\n", thread->th.th_info.ds.ds_gtid, thread); 3465 } else { 3466 __kmp_printf(" - (nil)\n"); 3467 } 3468 } 3469 3470 void __kmp_print_structure(void) { 3471 3472 kmp_team_list_t list; 3473 3474 // Initialize list of teams. 3475 list = 3476 (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC(sizeof(kmp_team_list_item_t)); 3477 list->entry = NULL; 3478 list->next = NULL; 3479 3480 __kmp_printf("\n------------------------------\nGlobal Thread " 3481 "Table\n------------------------------\n"); 3482 { 3483 int gtid; 3484 for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) { 3485 __kmp_printf("%2d", gtid); 3486 if (__kmp_threads != NULL) { 3487 __kmp_printf(" %p", __kmp_threads[gtid]); 3488 } 3489 if (__kmp_root != NULL) { 3490 __kmp_printf(" %p", __kmp_root[gtid]); 3491 } 3492 __kmp_printf("\n"); 3493 } 3494 } 3495 3496 // Print out __kmp_threads array. 
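// Note: while dumping each live thread below, this routine also feeds
// __kmp_print_structure_team_accum() with th_team and th_serial_team, so every
// team reachable from a registered thread ends up, once and sorted by t_id, in
// the list printed later under the "Teams" banner.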
3497 __kmp_printf("\n------------------------------\nThreads\n--------------------" 3498 "----------\n"); 3499 if (__kmp_threads != NULL) { 3500 int gtid; 3501 for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) { 3502 kmp_info_t const *thread = __kmp_threads[gtid]; 3503 if (thread != NULL) { 3504 __kmp_printf("GTID %2d %p:\n", gtid, thread); 3505 __kmp_printf(" Our Root: %p\n", thread->th.th_root); 3506 __kmp_print_structure_team(" Our Team: ", thread->th.th_team); 3507 __kmp_print_structure_team(" Serial Team: ", 3508 thread->th.th_serial_team); 3509 __kmp_printf(" Threads: %2d\n", thread->th.th_team_nproc); 3510 __kmp_print_structure_thread(" Primary: ", 3511 thread->th.th_team_master); 3512 __kmp_printf(" Serialized?: %2d\n", thread->th.th_team_serialized); 3513 __kmp_printf(" Set NProc: %2d\n", thread->th.th_set_nproc); 3514 __kmp_printf(" Set Proc Bind: %2d\n", thread->th.th_set_proc_bind); 3515 __kmp_print_structure_thread(" Next in pool: ", 3516 thread->th.th_next_pool); 3517 __kmp_printf("\n"); 3518 __kmp_print_structure_team_accum(list, thread->th.th_team); 3519 __kmp_print_structure_team_accum(list, thread->th.th_serial_team); 3520 } 3521 } 3522 } else { 3523 __kmp_printf("Threads array is not allocated.\n"); 3524 } 3525 3526 // Print out __kmp_root array. 3527 __kmp_printf("\n------------------------------\nUbers\n----------------------" 3528 "--------\n"); 3529 if (__kmp_root != NULL) { 3530 int gtid; 3531 for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) { 3532 kmp_root_t const *root = __kmp_root[gtid]; 3533 if (root != NULL) { 3534 __kmp_printf("GTID %2d %p:\n", gtid, root); 3535 __kmp_print_structure_team(" Root Team: ", root->r.r_root_team); 3536 __kmp_print_structure_team(" Hot Team: ", root->r.r_hot_team); 3537 __kmp_print_structure_thread(" Uber Thread: ", 3538 root->r.r_uber_thread); 3539 __kmp_printf(" Active?: %2d\n", root->r.r_active); 3540 __kmp_printf(" In Parallel: %2d\n", 3541 KMP_ATOMIC_LD_RLX(&root->r.r_in_parallel)); 3542 __kmp_printf("\n"); 3543 __kmp_print_structure_team_accum(list, root->r.r_root_team); 3544 __kmp_print_structure_team_accum(list, root->r.r_hot_team); 3545 } 3546 } 3547 } else { 3548 __kmp_printf("Ubers array is not allocated.\n"); 3549 } 3550 3551 __kmp_printf("\n------------------------------\nTeams\n----------------------" 3552 "--------\n"); 3553 while (list->next != NULL) { 3554 kmp_team_p const *team = list->entry; 3555 int i; 3556 __kmp_printf("Team %2x %p:\n", team->t.t_id, team); 3557 __kmp_print_structure_team(" Parent Team: ", team->t.t_parent); 3558 __kmp_printf(" Primary TID: %2d\n", team->t.t_master_tid); 3559 __kmp_printf(" Max threads: %2d\n", team->t.t_max_nproc); 3560 __kmp_printf(" Levels of serial: %2d\n", team->t.t_serialized); 3561 __kmp_printf(" Number threads: %2d\n", team->t.t_nproc); 3562 for (i = 0; i < team->t.t_nproc; ++i) { 3563 __kmp_printf(" Thread %2d: ", i); 3564 __kmp_print_structure_thread("", team->t.t_threads[i]); 3565 } 3566 __kmp_print_structure_team(" Next in pool: ", team->t.t_next_pool); 3567 __kmp_printf("\n"); 3568 list = list->next; 3569 } 3570 3571 // Print out __kmp_thread_pool and __kmp_team_pool. 3572 __kmp_printf("\n------------------------------\nPools\n----------------------" 3573 "--------\n"); 3574 __kmp_print_structure_thread("Thread pool: ", 3575 CCAST(kmp_info_t *, __kmp_thread_pool)); 3576 __kmp_print_structure_team("Team pool: ", 3577 CCAST(kmp_team_t *, __kmp_team_pool)); 3578 __kmp_printf("\n"); 3579 3580 // Free team list. 
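// (The loop below also releases the terminating sentinel item, whose entry and
//  next are both NULL, since it runs until list itself becomes NULL.)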
3581 while (list != NULL) { 3582 kmp_team_list_item_t *item = list; 3583 list = list->next; 3584 KMP_INTERNAL_FREE(item); 3585 } 3586 } 3587 3588 #endif 3589 3590 //--------------------------------------------------------------------------- 3591 // Stuff for per-thread fast random number generator 3592 // Table of primes 3593 static const unsigned __kmp_primes[] = { 3594 0x9e3779b1, 0xffe6cc59, 0x2109f6dd, 0x43977ab5, 0xba5703f5, 0xb495a877, 3595 0xe1626741, 0x79695e6b, 0xbc98c09f, 0xd5bee2b3, 0x287488f9, 0x3af18231, 3596 0x9677cd4d, 0xbe3a6929, 0xadc6a877, 0xdcf0674b, 0xbe4d6fe9, 0x5f15e201, 3597 0x99afc3fd, 0xf3f16801, 0xe222cfff, 0x24ba5fdb, 0x0620452d, 0x79f149e3, 3598 0xc8b93f49, 0x972702cd, 0xb07dd827, 0x6c97d5ed, 0x085a3d61, 0x46eb5ea7, 3599 0x3d9910ed, 0x2e687b5b, 0x29609227, 0x6eb081f1, 0x0954c4e1, 0x9d114db9, 3600 0x542acfa9, 0xb3e6bd7b, 0x0742d917, 0xe9f3ffa7, 0x54581edb, 0xf2480f45, 3601 0x0bb9288f, 0xef1affc7, 0x85fa0ca7, 0x3ccc14db, 0xe6baf34b, 0x343377f7, 3602 0x5ca19031, 0xe6d9293b, 0xf0a9f391, 0x5d2e980b, 0xfc411073, 0xc3749363, 3603 0xb892d829, 0x3549366b, 0x629750ad, 0xb98294e5, 0x892d9483, 0xc235baf3, 3604 0x3d2402a3, 0x6bdef3c9, 0xbec333cd, 0x40c9520f}; 3605 3606 //--------------------------------------------------------------------------- 3607 // __kmp_get_random: Get a random number using a linear congruential method. 3608 unsigned short __kmp_get_random(kmp_info_t *thread) { 3609 unsigned x = thread->th.th_x; 3610 unsigned short r = (unsigned short)(x >> 16); 3611 3612 thread->th.th_x = x * thread->th.th_a + 1; 3613 3614 KA_TRACE(30, ("__kmp_get_random: THREAD: %d, RETURN: %u\n", 3615 thread->th.th_info.ds.ds_tid, r)); 3616 3617 return r; 3618 } 3619 //-------------------------------------------------------- 3620 // __kmp_init_random: Initialize a random number generator 3621 void __kmp_init_random(kmp_info_t *thread) { 3622 unsigned seed = thread->th.th_info.ds.ds_tid; 3623 3624 thread->th.th_a = 3625 __kmp_primes[seed % (sizeof(__kmp_primes) / sizeof(__kmp_primes[0]))]; 3626 thread->th.th_x = (seed + 1) * thread->th.th_a + 1; 3627 KA_TRACE(30, 3628 ("__kmp_init_random: THREAD: %u; A: %u\n", seed, thread->th.th_a)); 3629 } 3630 3631 #if KMP_OS_WINDOWS 3632 /* reclaim array entries for root threads that are already dead, returns number 3633 * reclaimed */ 3634 static int __kmp_reclaim_dead_roots(void) { 3635 int i, r = 0; 3636 3637 for (i = 0; i < __kmp_threads_capacity; ++i) { 3638 if (KMP_UBER_GTID(i) && 3639 !__kmp_still_running((kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[i])) && 3640 !__kmp_root[i] 3641 ->r.r_active) { // AC: reclaim only roots died in non-active state 3642 r += __kmp_unregister_root_other_thread(i); 3643 } 3644 } 3645 return r; 3646 } 3647 #endif 3648 3649 /* This function attempts to create free entries in __kmp_threads and 3650 __kmp_root, and returns the number of free entries generated. 3651 3652 For Windows* OS static library, the first mechanism used is to reclaim array 3653 entries for root threads that are already dead. 3654 3655 On all platforms, expansion is attempted on the arrays __kmp_threads_ and 3656 __kmp_root, with appropriate update to __kmp_threads_capacity. Array 3657 capacity is increased by doubling with clipping to __kmp_tp_capacity, if 3658 threadprivate cache array has been created. Synchronization with 3659 __kmpc_threadprivate_cached is done using __kmp_tp_cached_lock. 
3660 3661 After any dead root reclamation, if the clipping value allows array expansion 3662 to result in the generation of a total of nNeed free slots, the function does 3663 that expansion. If not, nothing is done beyond the possible initial root 3664 thread reclamation. 3665 3666 If any argument is negative, the behavior is undefined. */ 3667 static int __kmp_expand_threads(int nNeed) { 3668 int added = 0; 3669 int minimumRequiredCapacity; 3670 int newCapacity; 3671 kmp_info_t **newThreads; 3672 kmp_root_t **newRoot; 3673 3674 // All calls to __kmp_expand_threads should be under __kmp_forkjoin_lock, so 3675 // resizing __kmp_threads does not need additional protection if foreign 3676 // threads are present 3677 3678 #if KMP_OS_WINDOWS && !KMP_DYNAMIC_LIB 3679 /* only for Windows static library */ 3680 /* reclaim array entries for root threads that are already dead */ 3681 added = __kmp_reclaim_dead_roots(); 3682 3683 if (nNeed) { 3684 nNeed -= added; 3685 if (nNeed < 0) 3686 nNeed = 0; 3687 } 3688 #endif 3689 if (nNeed <= 0) 3690 return added; 3691 3692 // Note that __kmp_threads_capacity is not bounded by __kmp_max_nth. If 3693 // __kmp_max_nth is set to some value less than __kmp_sys_max_nth by the 3694 // user via KMP_DEVICE_THREAD_LIMIT, then __kmp_threads_capacity may become 3695 // > __kmp_max_nth in one of two ways: 3696 // 3697 // 1) The initialization thread (gtid = 0) exits. __kmp_threads[0] 3698 // may not be reused by another thread, so we may need to increase 3699 // __kmp_threads_capacity to __kmp_max_nth + 1. 3700 // 3701 // 2) New foreign root(s) are encountered. We always register new foreign 3702 // roots. This may cause a smaller # of threads to be allocated at 3703 // subsequent parallel regions, but the worker threads hang around (and 3704 // eventually go to sleep) and need slots in the __kmp_threads[] array. 3705 // 3706 // Anyway, that is the reason for moving the check to see if 3707 // __kmp_max_nth was exceeded into __kmp_reserve_threads() 3708 // instead of having it performed here. -BB 3709 3710 KMP_DEBUG_ASSERT(__kmp_sys_max_nth >= __kmp_threads_capacity); 3711 3712 /* compute expansion headroom to check if we can expand */ 3713 if (__kmp_sys_max_nth - __kmp_threads_capacity < nNeed) { 3714 /* possible expansion too small -- give up */ 3715 return added; 3716 } 3717 minimumRequiredCapacity = __kmp_threads_capacity + nNeed; 3718 3719 newCapacity = __kmp_threads_capacity; 3720 do { 3721 newCapacity = newCapacity <= (__kmp_sys_max_nth >> 1) ? (newCapacity << 1) 3722 : __kmp_sys_max_nth; 3723 } while (newCapacity < minimumRequiredCapacity); 3724 newThreads = (kmp_info_t **)__kmp_allocate( 3725 (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * newCapacity + CACHE_LINE); 3726 newRoot = 3727 (kmp_root_t **)((char *)newThreads + sizeof(kmp_info_t *) * newCapacity); 3728 KMP_MEMCPY(newThreads, __kmp_threads, 3729 __kmp_threads_capacity * sizeof(kmp_info_t *)); 3730 KMP_MEMCPY(newRoot, __kmp_root, 3731 __kmp_threads_capacity * sizeof(kmp_root_t *)); 3732 // Put old __kmp_threads array on a list. Any ongoing references to the old 3733 // list will be valid. This list is cleaned up at library shutdown. 
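// Worked example of the growth policy above (illustrative numbers only): with
// __kmp_threads_capacity == 32, nNeed == 40 and __kmp_sys_max_nth == 1024,
// minimumRequiredCapacity is 72 and the doubling loop yields 64, then 128, so
// __kmp_threads and __kmp_root are re-created with room for 128 entries. The
// retired __kmp_threads block is chained onto __kmp_old_threads_list below
// rather than freed, so code that picked up the old pointer can keep
// dereferencing it safely until library shutdown.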
3734 kmp_old_threads_list_t *node = 3735 (kmp_old_threads_list_t *)__kmp_allocate(sizeof(kmp_old_threads_list_t)); 3736 node->threads = __kmp_threads; 3737 node->next = __kmp_old_threads_list; 3738 __kmp_old_threads_list = node; 3739 3740 *(kmp_info_t * *volatile *)&__kmp_threads = newThreads; 3741 *(kmp_root_t * *volatile *)&__kmp_root = newRoot; 3742 added += newCapacity - __kmp_threads_capacity; 3743 *(volatile int *)&__kmp_threads_capacity = newCapacity; 3744 3745 if (newCapacity > __kmp_tp_capacity) { 3746 __kmp_acquire_bootstrap_lock(&__kmp_tp_cached_lock); 3747 if (__kmp_tp_cached && newCapacity > __kmp_tp_capacity) { 3748 __kmp_threadprivate_resize_cache(newCapacity); 3749 } else { // increase __kmp_tp_capacity to correspond with kmp_threads size 3750 *(volatile int *)&__kmp_tp_capacity = newCapacity; 3751 } 3752 __kmp_release_bootstrap_lock(&__kmp_tp_cached_lock); 3753 } 3754 3755 return added; 3756 } 3757 3758 /* Register the current thread as a root thread and obtain our gtid. We must 3759 have the __kmp_initz_lock held at this point. Argument TRUE only if are the 3760 thread that calls from __kmp_do_serial_initialize() */ 3761 int __kmp_register_root(int initial_thread) { 3762 kmp_info_t *root_thread; 3763 kmp_root_t *root; 3764 int gtid; 3765 int capacity; 3766 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock); 3767 KA_TRACE(20, ("__kmp_register_root: entered\n")); 3768 KMP_MB(); 3769 3770 /* 2007-03-02: 3771 If initial thread did not invoke OpenMP RTL yet, and this thread is not an 3772 initial one, "__kmp_all_nth >= __kmp_threads_capacity" condition does not 3773 work as expected -- it may return false (that means there is at least one 3774 empty slot in __kmp_threads array), but it is possible the only free slot 3775 is #0, which is reserved for initial thread and so cannot be used for this 3776 one. Following code workarounds this bug. 3777 3778 However, right solution seems to be not reserving slot #0 for initial 3779 thread because: 3780 (1) there is no magic in slot #0, 3781 (2) we cannot detect initial thread reliably (the first thread which does 3782 serial initialization may be not a real initial thread). 3783 */ 3784 capacity = __kmp_threads_capacity; 3785 if (!initial_thread && TCR_PTR(__kmp_threads[0]) == NULL) { 3786 --capacity; 3787 } 3788 3789 // If it is not for initializing the hidden helper team, we need to take 3790 // __kmp_hidden_helper_threads_num out of the capacity because it is included 3791 // in __kmp_threads_capacity. 3792 if (__kmp_enable_hidden_helper && !TCR_4(__kmp_init_hidden_helper_threads)) { 3793 capacity -= __kmp_hidden_helper_threads_num; 3794 } 3795 3796 /* see if there are too many threads */ 3797 if (__kmp_all_nth >= capacity && !__kmp_expand_threads(1)) { 3798 if (__kmp_tp_cached) { 3799 __kmp_fatal(KMP_MSG(CantRegisterNewThread), 3800 KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity), 3801 KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null); 3802 } else { 3803 __kmp_fatal(KMP_MSG(CantRegisterNewThread), KMP_HNT(SystemLimitOnThreads), 3804 __kmp_msg_null); 3805 } 3806 } 3807 3808 // When hidden helper task is enabled, __kmp_threads is organized as follows: 3809 // 0: initial thread, also a regular OpenMP thread. 3810 // [1, __kmp_hidden_helper_threads_num]: slots for hidden helper threads. 3811 // [__kmp_hidden_helper_threads_num + 1, __kmp_threads_capacity): slots for 3812 // regular OpenMP threads. 3813 if (TCR_4(__kmp_init_hidden_helper_threads)) { 3814 // Find an available thread slot for hidden helper thread. 
Slots for hidden 3815 // helper threads start from 1 to __kmp_hidden_helper_threads_num. 3816 for (gtid = 1; TCR_PTR(__kmp_threads[gtid]) != NULL && 3817 gtid <= __kmp_hidden_helper_threads_num; 3818 gtid++) 3819 ; 3820 KMP_ASSERT(gtid <= __kmp_hidden_helper_threads_num); 3821 KA_TRACE(1, ("__kmp_register_root: found slot in threads array for " 3822 "hidden helper thread: T#%d\n", 3823 gtid)); 3824 } else { 3825 /* find an available thread slot */ 3826 // Don't reassign the zero slot since we need that to only be used by 3827 // initial thread. Slots for hidden helper threads should also be skipped. 3828 if (initial_thread && TCR_PTR(__kmp_threads[0]) == NULL) { 3829 gtid = 0; 3830 } else { 3831 for (gtid = __kmp_hidden_helper_threads_num + 1; 3832 TCR_PTR(__kmp_threads[gtid]) != NULL; gtid++) 3833 ; 3834 } 3835 KA_TRACE( 3836 1, ("__kmp_register_root: found slot in threads array: T#%d\n", gtid)); 3837 KMP_ASSERT(gtid < __kmp_threads_capacity); 3838 } 3839 3840 /* update global accounting */ 3841 __kmp_all_nth++; 3842 TCW_4(__kmp_nth, __kmp_nth + 1); 3843 3844 // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search) for low 3845 // numbers of procs, and method #2 (keyed API call) for higher numbers. 3846 if (__kmp_adjust_gtid_mode) { 3847 if (__kmp_all_nth >= __kmp_tls_gtid_min) { 3848 if (TCR_4(__kmp_gtid_mode) != 2) { 3849 TCW_4(__kmp_gtid_mode, 2); 3850 } 3851 } else { 3852 if (TCR_4(__kmp_gtid_mode) != 1) { 3853 TCW_4(__kmp_gtid_mode, 1); 3854 } 3855 } 3856 } 3857 3858 #ifdef KMP_ADJUST_BLOCKTIME 3859 /* Adjust blocktime to zero if necessary */ 3860 /* Middle initialization might not have occurred yet */ 3861 if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) { 3862 if (__kmp_nth > __kmp_avail_proc) { 3863 __kmp_zero_bt = TRUE; 3864 } 3865 } 3866 #endif /* KMP_ADJUST_BLOCKTIME */ 3867 3868 /* setup this new hierarchy */ 3869 if (!(root = __kmp_root[gtid])) { 3870 root = __kmp_root[gtid] = (kmp_root_t *)__kmp_allocate(sizeof(kmp_root_t)); 3871 KMP_DEBUG_ASSERT(!root->r.r_root_team); 3872 } 3873 3874 #if KMP_STATS_ENABLED 3875 // Initialize stats as soon as possible (right after gtid assignment). 
3876 __kmp_stats_thread_ptr = __kmp_stats_list->push_back(gtid); 3877 __kmp_stats_thread_ptr->startLife(); 3878 KMP_SET_THREAD_STATE(SERIAL_REGION); 3879 KMP_INIT_PARTITIONED_TIMERS(OMP_serial); 3880 #endif 3881 __kmp_initialize_root(root); 3882 3883 /* setup new root thread structure */ 3884 if (root->r.r_uber_thread) { 3885 root_thread = root->r.r_uber_thread; 3886 } else { 3887 root_thread = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t)); 3888 if (__kmp_storage_map) { 3889 __kmp_print_thread_storage_map(root_thread, gtid); 3890 } 3891 root_thread->th.th_info.ds.ds_gtid = gtid; 3892 #if OMPT_SUPPORT 3893 root_thread->th.ompt_thread_info.thread_data = ompt_data_none; 3894 #endif 3895 root_thread->th.th_root = root; 3896 if (__kmp_env_consistency_check) { 3897 root_thread->th.th_cons = __kmp_allocate_cons_stack(gtid); 3898 } 3899 #if USE_FAST_MEMORY 3900 __kmp_initialize_fast_memory(root_thread); 3901 #endif /* USE_FAST_MEMORY */ 3902 3903 #if KMP_USE_BGET 3904 KMP_DEBUG_ASSERT(root_thread->th.th_local.bget_data == NULL); 3905 __kmp_initialize_bget(root_thread); 3906 #endif 3907 __kmp_init_random(root_thread); // Initialize random number generator 3908 } 3909 3910 /* setup the serial team held in reserve by the root thread */ 3911 if (!root_thread->th.th_serial_team) { 3912 kmp_internal_control_t r_icvs = __kmp_get_global_icvs(); 3913 KF_TRACE(10, ("__kmp_register_root: before serial_team\n")); 3914 root_thread->th.th_serial_team = __kmp_allocate_team( 3915 root, 1, 1, 3916 #if OMPT_SUPPORT 3917 ompt_data_none, // root parallel id 3918 #endif 3919 proc_bind_default, &r_icvs, 0 USE_NESTED_HOT_ARG(NULL)); 3920 } 3921 KMP_ASSERT(root_thread->th.th_serial_team); 3922 KF_TRACE(10, ("__kmp_register_root: after serial_team = %p\n", 3923 root_thread->th.th_serial_team)); 3924 3925 /* drop root_thread into place */ 3926 TCW_SYNC_PTR(__kmp_threads[gtid], root_thread); 3927 3928 root->r.r_root_team->t.t_threads[0] = root_thread; 3929 root->r.r_hot_team->t.t_threads[0] = root_thread; 3930 root_thread->th.th_serial_team->t.t_threads[0] = root_thread; 3931 // AC: the team created in reserve, not for execution (it is unused for now). 3932 root_thread->th.th_serial_team->t.t_serialized = 0; 3933 root->r.r_uber_thread = root_thread; 3934 3935 /* initialize the thread, get it ready to go */ 3936 __kmp_initialize_info(root_thread, root->r.r_root_team, 0, gtid); 3937 TCW_4(__kmp_init_gtid, TRUE); 3938 3939 /* prepare the primary thread for get_gtid() */ 3940 __kmp_gtid_set_specific(gtid); 3941 3942 #if USE_ITT_BUILD 3943 __kmp_itt_thread_name(gtid); 3944 #endif /* USE_ITT_BUILD */ 3945 3946 #ifdef KMP_TDATA_GTID 3947 __kmp_gtid = gtid; 3948 #endif 3949 __kmp_create_worker(gtid, root_thread, __kmp_stksize); 3950 KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() == gtid); 3951 3952 KA_TRACE(20, ("__kmp_register_root: T#%d init T#%d(%d:%d) arrived: join=%u, " 3953 "plain=%u\n", 3954 gtid, __kmp_gtid_from_tid(0, root->r.r_hot_team), 3955 root->r.r_hot_team->t.t_id, 0, KMP_INIT_BARRIER_STATE, 3956 KMP_INIT_BARRIER_STATE)); 3957 { // Initialize barrier data. 
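// Every barrier type (plain, fork/join and, when configured, reduction) starts
// with the same b_arrived value here; the trace above logs that same constant
// for the join and plain counters.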
3958 int b; 3959 for (b = 0; b < bs_last_barrier; ++b) { 3960 root_thread->th.th_bar[b].bb.b_arrived = KMP_INIT_BARRIER_STATE; 3961 #if USE_DEBUGGER 3962 root_thread->th.th_bar[b].bb.b_worker_arrived = 0; 3963 #endif 3964 } 3965 } 3966 KMP_DEBUG_ASSERT(root->r.r_hot_team->t.t_bar[bs_forkjoin_barrier].b_arrived == 3967 KMP_INIT_BARRIER_STATE); 3968 3969 #if KMP_AFFINITY_SUPPORTED 3970 root_thread->th.th_current_place = KMP_PLACE_UNDEFINED; 3971 root_thread->th.th_new_place = KMP_PLACE_UNDEFINED; 3972 root_thread->th.th_first_place = KMP_PLACE_UNDEFINED; 3973 root_thread->th.th_last_place = KMP_PLACE_UNDEFINED; 3974 #endif /* KMP_AFFINITY_SUPPORTED */ 3975 root_thread->th.th_def_allocator = __kmp_def_allocator; 3976 root_thread->th.th_prev_level = 0; 3977 root_thread->th.th_prev_num_threads = 1; 3978 3979 kmp_cg_root_t *tmp = (kmp_cg_root_t *)__kmp_allocate(sizeof(kmp_cg_root_t)); 3980 tmp->cg_root = root_thread; 3981 tmp->cg_thread_limit = __kmp_cg_max_nth; 3982 tmp->cg_nthreads = 1; 3983 KA_TRACE(100, ("__kmp_register_root: Thread %p created node %p with" 3984 " cg_nthreads init to 1\n", 3985 root_thread, tmp)); 3986 tmp->up = NULL; 3987 root_thread->th.th_cg_roots = tmp; 3988 3989 __kmp_root_counter++; 3990 3991 #if OMPT_SUPPORT 3992 if (!initial_thread && ompt_enabled.enabled) { 3993 3994 kmp_info_t *root_thread = ompt_get_thread(); 3995 3996 ompt_set_thread_state(root_thread, ompt_state_overhead); 3997 3998 if (ompt_enabled.ompt_callback_thread_begin) { 3999 ompt_callbacks.ompt_callback(ompt_callback_thread_begin)( 4000 ompt_thread_initial, __ompt_get_thread_data_internal()); 4001 } 4002 ompt_data_t *task_data; 4003 ompt_data_t *parallel_data; 4004 __ompt_get_task_info_internal(0, NULL, &task_data, NULL, ¶llel_data, 4005 NULL); 4006 if (ompt_enabled.ompt_callback_implicit_task) { 4007 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 4008 ompt_scope_begin, parallel_data, task_data, 1, 1, ompt_task_initial); 4009 } 4010 4011 ompt_set_thread_state(root_thread, ompt_state_work_serial); 4012 } 4013 #endif 4014 #if OMPD_SUPPORT 4015 if (ompd_state & OMPD_ENABLE_BP) 4016 ompd_bp_thread_begin(); 4017 #endif 4018 4019 KMP_MB(); 4020 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); 4021 4022 return gtid; 4023 } 4024 4025 #if KMP_NESTED_HOT_TEAMS 4026 static int __kmp_free_hot_teams(kmp_root_t *root, kmp_info_t *thr, int level, 4027 const int max_level) { 4028 int i, n, nth; 4029 kmp_hot_team_ptr_t *hot_teams = thr->th.th_hot_teams; 4030 if (!hot_teams || !hot_teams[level].hot_team) { 4031 return 0; 4032 } 4033 KMP_DEBUG_ASSERT(level < max_level); 4034 kmp_team_t *team = hot_teams[level].hot_team; 4035 nth = hot_teams[level].hot_team_nth; 4036 n = nth - 1; // primary thread is not freed 4037 if (level < max_level - 1) { 4038 for (i = 0; i < nth; ++i) { 4039 kmp_info_t *th = team->t.t_threads[i]; 4040 n += __kmp_free_hot_teams(root, th, level + 1, max_level); 4041 if (i > 0 && th->th.th_hot_teams) { 4042 __kmp_free(th->th.th_hot_teams); 4043 th->th.th_hot_teams = NULL; 4044 } 4045 } 4046 } 4047 __kmp_free_team(root, team, NULL); 4048 return n; 4049 } 4050 #endif 4051 4052 // Resets a root thread and clear its root and hot teams. 4053 // Returns the number of __kmp_threads entries directly and indirectly freed. 
4054 static int __kmp_reset_root(int gtid, kmp_root_t *root) { 4055 kmp_team_t *root_team = root->r.r_root_team; 4056 kmp_team_t *hot_team = root->r.r_hot_team; 4057 int n = hot_team->t.t_nproc; 4058 int i; 4059 4060 KMP_DEBUG_ASSERT(!root->r.r_active); 4061 4062 root->r.r_root_team = NULL; 4063 root->r.r_hot_team = NULL; 4064 // __kmp_free_team() does not free hot teams, so we have to clear r_hot_team 4065 // before call to __kmp_free_team(). 4066 __kmp_free_team(root, root_team USE_NESTED_HOT_ARG(NULL)); 4067 #if KMP_NESTED_HOT_TEAMS 4068 if (__kmp_hot_teams_max_level > 4069 0) { // need to free nested hot teams and their threads if any 4070 for (i = 0; i < hot_team->t.t_nproc; ++i) { 4071 kmp_info_t *th = hot_team->t.t_threads[i]; 4072 if (__kmp_hot_teams_max_level > 1) { 4073 n += __kmp_free_hot_teams(root, th, 1, __kmp_hot_teams_max_level); 4074 } 4075 if (th->th.th_hot_teams) { 4076 __kmp_free(th->th.th_hot_teams); 4077 th->th.th_hot_teams = NULL; 4078 } 4079 } 4080 } 4081 #endif 4082 __kmp_free_team(root, hot_team USE_NESTED_HOT_ARG(NULL)); 4083 4084 // Before we can reap the thread, we need to make certain that all other 4085 // threads in the teams that had this root as ancestor have stopped trying to 4086 // steal tasks. 4087 if (__kmp_tasking_mode != tskm_immediate_exec) { 4088 __kmp_wait_to_unref_task_teams(); 4089 } 4090 4091 #if KMP_OS_WINDOWS 4092 /* Close Handle of root duplicated in __kmp_create_worker (tr #62919) */ 4093 KA_TRACE( 4094 10, ("__kmp_reset_root: free handle, th = %p, handle = %" KMP_UINTPTR_SPEC 4095 "\n", 4096 (LPVOID) & (root->r.r_uber_thread->th), 4097 root->r.r_uber_thread->th.th_info.ds.ds_thread)); 4098 __kmp_free_handle(root->r.r_uber_thread->th.th_info.ds.ds_thread); 4099 #endif /* KMP_OS_WINDOWS */ 4100 4101 #if OMPD_SUPPORT 4102 if (ompd_state & OMPD_ENABLE_BP) 4103 ompd_bp_thread_end(); 4104 #endif 4105 4106 #if OMPT_SUPPORT 4107 ompt_data_t *task_data; 4108 ompt_data_t *parallel_data; 4109 __ompt_get_task_info_internal(0, NULL, &task_data, NULL, ¶llel_data, 4110 NULL); 4111 if (ompt_enabled.ompt_callback_implicit_task) { 4112 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 4113 ompt_scope_end, parallel_data, task_data, 0, 1, ompt_task_initial); 4114 } 4115 if (ompt_enabled.ompt_callback_thread_end) { 4116 ompt_callbacks.ompt_callback(ompt_callback_thread_end)( 4117 &(root->r.r_uber_thread->th.ompt_thread_info.thread_data)); 4118 } 4119 #endif 4120 4121 TCW_4(__kmp_nth, 4122 __kmp_nth - 1); // __kmp_reap_thread will decrement __kmp_all_nth. 4123 i = root->r.r_uber_thread->th.th_cg_roots->cg_nthreads--; 4124 KA_TRACE(100, ("__kmp_reset_root: Thread %p decrement cg_nthreads on node %p" 4125 " to %d\n", 4126 root->r.r_uber_thread, root->r.r_uber_thread->th.th_cg_roots, 4127 root->r.r_uber_thread->th.th_cg_roots->cg_nthreads)); 4128 if (i == 1) { 4129 // need to free contention group structure 4130 KMP_DEBUG_ASSERT(root->r.r_uber_thread == 4131 root->r.r_uber_thread->th.th_cg_roots->cg_root); 4132 KMP_DEBUG_ASSERT(root->r.r_uber_thread->th.th_cg_roots->up == NULL); 4133 __kmp_free(root->r.r_uber_thread->th.th_cg_roots); 4134 root->r.r_uber_thread->th.th_cg_roots = NULL; 4135 } 4136 __kmp_reap_thread(root->r.r_uber_thread, 1); 4137 4138 // We canot put root thread to __kmp_thread_pool, so we have to reap it 4139 // instead of freeing. 
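// The contention-group bookkeeping a few lines above is a plain reference
// count; roughly (illustrative sketch only, with cg standing in for
// root->r.r_uber_thread->th.th_cg_roots):
//
//   int prev = cg->cg_nthreads--; // remember the old member count, then drop it
//   if (prev == 1)                // this root was the last member of the group,
//     __kmp_free(cg);             //   so its kmp_cg_root_t can be released
//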
4140 root->r.r_uber_thread = NULL; 4141 /* mark root as no longer in use */ 4142 root->r.r_begin = FALSE; 4143 4144 return n; 4145 } 4146 4147 void __kmp_unregister_root_current_thread(int gtid) { 4148 KA_TRACE(1, ("__kmp_unregister_root_current_thread: enter T#%d\n", gtid)); 4149 /* this lock should be ok, since unregister_root_current_thread is never 4150 called during an abort, only during a normal close. furthermore, if you 4151 have the forkjoin lock, you should never try to get the initz lock */ 4152 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock); 4153 if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) { 4154 KC_TRACE(10, ("__kmp_unregister_root_current_thread: already finished, " 4155 "exiting T#%d\n", 4156 gtid)); 4157 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); 4158 return; 4159 } 4160 kmp_root_t *root = __kmp_root[gtid]; 4161 4162 KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]); 4163 KMP_ASSERT(KMP_UBER_GTID(gtid)); 4164 KMP_ASSERT(root == __kmp_threads[gtid]->th.th_root); 4165 KMP_ASSERT(root->r.r_active == FALSE); 4166 4167 KMP_MB(); 4168 4169 kmp_info_t *thread = __kmp_threads[gtid]; 4170 kmp_team_t *team = thread->th.th_team; 4171 kmp_task_team_t *task_team = thread->th.th_task_team; 4172 4173 // we need to wait for the proxy tasks before finishing the thread 4174 if (task_team != NULL && (task_team->tt.tt_found_proxy_tasks || 4175 task_team->tt.tt_hidden_helper_task_encountered)) { 4176 #if OMPT_SUPPORT 4177 // the runtime is shutting down so we won't report any events 4178 thread->th.ompt_thread_info.state = ompt_state_undefined; 4179 #endif 4180 __kmp_task_team_wait(thread, team USE_ITT_BUILD_ARG(NULL)); 4181 } 4182 4183 __kmp_reset_root(gtid, root); 4184 4185 KMP_MB(); 4186 KC_TRACE(10, 4187 ("__kmp_unregister_root_current_thread: T#%d unregistered\n", gtid)); 4188 4189 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); 4190 } 4191 4192 #if KMP_OS_WINDOWS 4193 /* __kmp_forkjoin_lock must be already held 4194 Unregisters a root thread that is not the current thread. Returns the number 4195 of __kmp_threads entries freed as a result. 
*/ 4196 static int __kmp_unregister_root_other_thread(int gtid) { 4197 kmp_root_t *root = __kmp_root[gtid]; 4198 int r; 4199 4200 KA_TRACE(1, ("__kmp_unregister_root_other_thread: enter T#%d\n", gtid)); 4201 KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]); 4202 KMP_ASSERT(KMP_UBER_GTID(gtid)); 4203 KMP_ASSERT(root == __kmp_threads[gtid]->th.th_root); 4204 KMP_ASSERT(root->r.r_active == FALSE); 4205 4206 r = __kmp_reset_root(gtid, root); 4207 KC_TRACE(10, 4208 ("__kmp_unregister_root_other_thread: T#%d unregistered\n", gtid)); 4209 return r; 4210 } 4211 #endif 4212 4213 #if KMP_DEBUG 4214 void __kmp_task_info() { 4215 4216 kmp_int32 gtid = __kmp_entry_gtid(); 4217 kmp_int32 tid = __kmp_tid_from_gtid(gtid); 4218 kmp_info_t *this_thr = __kmp_threads[gtid]; 4219 kmp_team_t *steam = this_thr->th.th_serial_team; 4220 kmp_team_t *team = this_thr->th.th_team; 4221 4222 __kmp_printf( 4223 "__kmp_task_info: gtid=%d tid=%d t_thread=%p team=%p steam=%p curtask=%p " 4224 "ptask=%p\n", 4225 gtid, tid, this_thr, team, steam, this_thr->th.th_current_task, 4226 team->t.t_implicit_task_taskdata[tid].td_parent); 4227 } 4228 #endif // KMP_DEBUG 4229 4230 /* TODO optimize with one big memclr, take out what isn't needed, split 4231 responsibility to workers as much as possible, and delay initialization of 4232 features as much as possible */ 4233 static void __kmp_initialize_info(kmp_info_t *this_thr, kmp_team_t *team, 4234 int tid, int gtid) { 4235 /* this_thr->th.th_info.ds.ds_gtid is setup in 4236 kmp_allocate_thread/create_worker. 4237 this_thr->th.th_serial_team is setup in __kmp_allocate_thread */ 4238 KMP_DEBUG_ASSERT(this_thr != NULL); 4239 KMP_DEBUG_ASSERT(this_thr->th.th_serial_team); 4240 KMP_DEBUG_ASSERT(team); 4241 KMP_DEBUG_ASSERT(team->t.t_threads); 4242 KMP_DEBUG_ASSERT(team->t.t_dispatch); 4243 kmp_info_t *master = team->t.t_threads[0]; 4244 KMP_DEBUG_ASSERT(master); 4245 KMP_DEBUG_ASSERT(master->th.th_root); 4246 4247 KMP_MB(); 4248 4249 TCW_SYNC_PTR(this_thr->th.th_team, team); 4250 4251 this_thr->th.th_info.ds.ds_tid = tid; 4252 this_thr->th.th_set_nproc = 0; 4253 if (__kmp_tasking_mode != tskm_immediate_exec) 4254 // When tasking is possible, threads are not safe to reap until they are 4255 // done tasking; this will be set when tasking code is exited in wait 4256 this_thr->th.th_reap_state = KMP_NOT_SAFE_TO_REAP; 4257 else // no tasking --> always safe to reap 4258 this_thr->th.th_reap_state = KMP_SAFE_TO_REAP; 4259 this_thr->th.th_set_proc_bind = proc_bind_default; 4260 #if KMP_AFFINITY_SUPPORTED 4261 this_thr->th.th_new_place = this_thr->th.th_current_place; 4262 #endif 4263 this_thr->th.th_root = master->th.th_root; 4264 4265 /* setup the thread's cache of the team structure */ 4266 this_thr->th.th_team_nproc = team->t.t_nproc; 4267 this_thr->th.th_team_master = master; 4268 this_thr->th.th_team_serialized = team->t.t_serialized; 4269 4270 KMP_DEBUG_ASSERT(team->t.t_implicit_task_taskdata); 4271 4272 KF_TRACE(10, ("__kmp_initialize_info1: T#%d:%d this_thread=%p curtask=%p\n", 4273 tid, gtid, this_thr, this_thr->th.th_current_task)); 4274 4275 __kmp_init_implicit_task(this_thr->th.th_team_master->th.th_ident, this_thr, 4276 team, tid, TRUE); 4277 4278 KF_TRACE(10, ("__kmp_initialize_info2: T#%d:%d this_thread=%p curtask=%p\n", 4279 tid, gtid, this_thr, this_thr->th.th_current_task)); 4280 // TODO: Initialize ICVs from parent; GEH - isn't that already done in 4281 // __kmp_initialize_team()? 
4282 4283 /* TODO no worksharing in speculative threads */ 4284 this_thr->th.th_dispatch = &team->t.t_dispatch[tid]; 4285 4286 this_thr->th.th_local.this_construct = 0; 4287 4288 if (!this_thr->th.th_pri_common) { 4289 this_thr->th.th_pri_common = 4290 (struct common_table *)__kmp_allocate(sizeof(struct common_table)); 4291 if (__kmp_storage_map) { 4292 __kmp_print_storage_map_gtid( 4293 gtid, this_thr->th.th_pri_common, this_thr->th.th_pri_common + 1, 4294 sizeof(struct common_table), "th_%d.th_pri_common\n", gtid); 4295 } 4296 this_thr->th.th_pri_head = NULL; 4297 } 4298 4299 if (this_thr != master && // Primary thread's CG root is initialized elsewhere 4300 this_thr->th.th_cg_roots != master->th.th_cg_roots) { // CG root not set 4301 // Make new thread's CG root same as primary thread's 4302 KMP_DEBUG_ASSERT(master->th.th_cg_roots); 4303 kmp_cg_root_t *tmp = this_thr->th.th_cg_roots; 4304 if (tmp) { 4305 // worker changes CG, need to check if old CG should be freed 4306 int i = tmp->cg_nthreads--; 4307 KA_TRACE(100, ("__kmp_initialize_info: Thread %p decrement cg_nthreads" 4308 " on node %p of thread %p to %d\n", 4309 this_thr, tmp, tmp->cg_root, tmp->cg_nthreads)); 4310 if (i == 1) { 4311 __kmp_free(tmp); // last thread left CG --> free it 4312 } 4313 } 4314 this_thr->th.th_cg_roots = master->th.th_cg_roots; 4315 // Increment new thread's CG root's counter to add the new thread 4316 this_thr->th.th_cg_roots->cg_nthreads++; 4317 KA_TRACE(100, ("__kmp_initialize_info: Thread %p increment cg_nthreads on" 4318 " node %p of thread %p to %d\n", 4319 this_thr, this_thr->th.th_cg_roots, 4320 this_thr->th.th_cg_roots->cg_root, 4321 this_thr->th.th_cg_roots->cg_nthreads)); 4322 this_thr->th.th_current_task->td_icvs.thread_limit = 4323 this_thr->th.th_cg_roots->cg_thread_limit; 4324 } 4325 4326 /* Initialize dynamic dispatch */ 4327 { 4328 volatile kmp_disp_t *dispatch = this_thr->th.th_dispatch; 4329 // Use team max_nproc since this will never change for the team. 4330 size_t disp_size = 4331 sizeof(dispatch_private_info_t) * 4332 (team->t.t_max_nproc == 1 ? 1 : __kmp_dispatch_num_buffers); 4333 KD_TRACE(10, ("__kmp_initialize_info: T#%d max_nproc: %d\n", gtid, 4334 team->t.t_max_nproc)); 4335 KMP_ASSERT(dispatch); 4336 KMP_DEBUG_ASSERT(team->t.t_dispatch); 4337 KMP_DEBUG_ASSERT(dispatch == &team->t.t_dispatch[tid]); 4338 4339 dispatch->th_disp_index = 0; 4340 dispatch->th_doacross_buf_idx = 0; 4341 if (!dispatch->th_disp_buffer) { 4342 dispatch->th_disp_buffer = 4343 (dispatch_private_info_t *)__kmp_allocate(disp_size); 4344 4345 if (__kmp_storage_map) { 4346 __kmp_print_storage_map_gtid( 4347 gtid, &dispatch->th_disp_buffer[0], 4348 &dispatch->th_disp_buffer[team->t.t_max_nproc == 1 4349 ? 
1 4350 : __kmp_dispatch_num_buffers], 4351 disp_size, 4352 "th_%d.th_dispatch.th_disp_buffer " 4353 "(team_%d.t_dispatch[%d].th_disp_buffer)", 4354 gtid, team->t.t_id, gtid); 4355 } 4356 } else { 4357 memset(&dispatch->th_disp_buffer[0], '\0', disp_size); 4358 } 4359 4360 dispatch->th_dispatch_pr_current = 0; 4361 dispatch->th_dispatch_sh_current = 0; 4362 4363 dispatch->th_deo_fcn = 0; /* ORDERED */ 4364 dispatch->th_dxo_fcn = 0; /* END ORDERED */ 4365 } 4366 4367 this_thr->th.th_next_pool = NULL; 4368 4369 if (!this_thr->th.th_task_state_memo_stack) { 4370 size_t i; 4371 this_thr->th.th_task_state_memo_stack = 4372 (kmp_uint8 *)__kmp_allocate(4 * sizeof(kmp_uint8)); 4373 this_thr->th.th_task_state_top = 0; 4374 this_thr->th.th_task_state_stack_sz = 4; 4375 for (i = 0; i < this_thr->th.th_task_state_stack_sz; 4376 ++i) // zero init the stack 4377 this_thr->th.th_task_state_memo_stack[i] = 0; 4378 } 4379 4380 KMP_DEBUG_ASSERT(!this_thr->th.th_spin_here); 4381 KMP_DEBUG_ASSERT(this_thr->th.th_next_waiting == 0); 4382 4383 KMP_MB(); 4384 } 4385 4386 /* allocate a new thread for the requesting team. this is only called from 4387 within a forkjoin critical section. we will first try to get an available 4388 thread from the thread pool. if none is available, we will fork a new one 4389 assuming we are able to create a new one. this should be assured, as the 4390 caller should check on this first. */ 4391 kmp_info_t *__kmp_allocate_thread(kmp_root_t *root, kmp_team_t *team, 4392 int new_tid) { 4393 kmp_team_t *serial_team; 4394 kmp_info_t *new_thr; 4395 int new_gtid; 4396 4397 KA_TRACE(20, ("__kmp_allocate_thread: T#%d\n", __kmp_get_gtid())); 4398 KMP_DEBUG_ASSERT(root && team); 4399 #if !KMP_NESTED_HOT_TEAMS 4400 KMP_DEBUG_ASSERT(KMP_MASTER_GTID(__kmp_get_gtid())); 4401 #endif 4402 KMP_MB(); 4403 4404 /* first, try to get one from the thread pool */ 4405 if (__kmp_thread_pool) { 4406 new_thr = CCAST(kmp_info_t *, __kmp_thread_pool); 4407 __kmp_thread_pool = (volatile kmp_info_t *)new_thr->th.th_next_pool; 4408 if (new_thr == __kmp_thread_pool_insert_pt) { 4409 __kmp_thread_pool_insert_pt = NULL; 4410 } 4411 TCW_4(new_thr->th.th_in_pool, FALSE); 4412 __kmp_suspend_initialize_thread(new_thr); 4413 __kmp_lock_suspend_mx(new_thr); 4414 if (new_thr->th.th_active_in_pool == TRUE) { 4415 KMP_DEBUG_ASSERT(new_thr->th.th_active == TRUE); 4416 KMP_ATOMIC_DEC(&__kmp_thread_pool_active_nth); 4417 new_thr->th.th_active_in_pool = FALSE; 4418 } 4419 __kmp_unlock_suspend_mx(new_thr); 4420 4421 KA_TRACE(20, ("__kmp_allocate_thread: T#%d using thread T#%d\n", 4422 __kmp_get_gtid(), new_thr->th.th_info.ds.ds_gtid)); 4423 KMP_ASSERT(!new_thr->th.th_team); 4424 KMP_DEBUG_ASSERT(__kmp_nth < __kmp_threads_capacity); 4425 4426 /* setup the thread structure */ 4427 __kmp_initialize_info(new_thr, team, new_tid, 4428 new_thr->th.th_info.ds.ds_gtid); 4429 KMP_DEBUG_ASSERT(new_thr->th.th_serial_team); 4430 4431 TCW_4(__kmp_nth, __kmp_nth + 1); 4432 4433 new_thr->th.th_task_state = 0; 4434 new_thr->th.th_task_state_top = 0; 4435 new_thr->th.th_task_state_stack_sz = 4; 4436 4437 if (__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) { 4438 // Make sure pool thread has transitioned to waiting on own thread struct 4439 KMP_DEBUG_ASSERT(new_thr->th.th_used_in_team.load() == 0); 4440 // Thread activated in __kmp_allocate_team when increasing team size 4441 } 4442 4443 #ifdef KMP_ADJUST_BLOCKTIME 4444 /* Adjust blocktime back to zero if necessary */ 4445 /* Middle initialization might not have occurred yet */ 4446 
if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) { 4447 if (__kmp_nth > __kmp_avail_proc) { 4448 __kmp_zero_bt = TRUE; 4449 } 4450 } 4451 #endif /* KMP_ADJUST_BLOCKTIME */ 4452 4453 #if KMP_DEBUG 4454 // If thread entered pool via __kmp_free_thread, wait_flag should != 4455 // KMP_BARRIER_PARENT_FLAG. 4456 int b; 4457 kmp_balign_t *balign = new_thr->th.th_bar; 4458 for (b = 0; b < bs_last_barrier; ++b) 4459 KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG); 4460 #endif 4461 4462 KF_TRACE(10, ("__kmp_allocate_thread: T#%d using thread %p T#%d\n", 4463 __kmp_get_gtid(), new_thr, new_thr->th.th_info.ds.ds_gtid)); 4464 4465 KMP_MB(); 4466 return new_thr; 4467 } 4468 4469 /* no, well fork a new one */ 4470 KMP_ASSERT(__kmp_nth == __kmp_all_nth); 4471 KMP_ASSERT(__kmp_all_nth < __kmp_threads_capacity); 4472 4473 #if KMP_USE_MONITOR 4474 // If this is the first worker thread the RTL is creating, then also 4475 // launch the monitor thread. We try to do this as early as possible. 4476 if (!TCR_4(__kmp_init_monitor)) { 4477 __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock); 4478 if (!TCR_4(__kmp_init_monitor)) { 4479 KF_TRACE(10, ("before __kmp_create_monitor\n")); 4480 TCW_4(__kmp_init_monitor, 1); 4481 __kmp_create_monitor(&__kmp_monitor); 4482 KF_TRACE(10, ("after __kmp_create_monitor\n")); 4483 #if KMP_OS_WINDOWS 4484 // AC: wait until monitor has started. This is a fix for CQ232808. 4485 // The reason is that if the library is loaded/unloaded in a loop with 4486 // small (parallel) work in between, then there is high probability that 4487 // monitor thread started after the library shutdown. At shutdown it is 4488 // too late to cope with the problem, because when the primary thread is 4489 // in DllMain (process detach) the monitor has no chances to start (it is 4490 // blocked), and primary thread has no means to inform the monitor that 4491 // the library has gone, because all the memory which the monitor can 4492 // access is going to be released/reset. 4493 while (TCR_4(__kmp_init_monitor) < 2) { 4494 KMP_YIELD(TRUE); 4495 } 4496 KF_TRACE(10, ("after monitor thread has started\n")); 4497 #endif 4498 } 4499 __kmp_release_bootstrap_lock(&__kmp_monitor_lock); 4500 } 4501 #endif 4502 4503 KMP_MB(); 4504 4505 { 4506 int new_start_gtid = TCR_4(__kmp_init_hidden_helper_threads) 4507 ? 1 4508 : __kmp_hidden_helper_threads_num + 1; 4509 4510 for (new_gtid = new_start_gtid; TCR_PTR(__kmp_threads[new_gtid]) != NULL; 4511 ++new_gtid) { 4512 KMP_DEBUG_ASSERT(new_gtid < __kmp_threads_capacity); 4513 } 4514 4515 if (TCR_4(__kmp_init_hidden_helper_threads)) { 4516 KMP_DEBUG_ASSERT(new_gtid <= __kmp_hidden_helper_threads_num); 4517 } 4518 } 4519 4520 /* allocate space for it. 
*/ 4521 new_thr = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t)); 4522 4523 TCW_SYNC_PTR(__kmp_threads[new_gtid], new_thr); 4524 4525 #if USE_ITT_BUILD && USE_ITT_NOTIFY && KMP_DEBUG 4526 // suppress race conditions detection on synchronization flags in debug mode 4527 // this helps to analyze library internals eliminating false positives 4528 __itt_suppress_mark_range( 4529 __itt_suppress_range, __itt_suppress_threading_errors, 4530 &new_thr->th.th_sleep_loc, sizeof(new_thr->th.th_sleep_loc)); 4531 __itt_suppress_mark_range( 4532 __itt_suppress_range, __itt_suppress_threading_errors, 4533 &new_thr->th.th_reap_state, sizeof(new_thr->th.th_reap_state)); 4534 #if KMP_OS_WINDOWS 4535 __itt_suppress_mark_range( 4536 __itt_suppress_range, __itt_suppress_threading_errors, 4537 &new_thr->th.th_suspend_init, sizeof(new_thr->th.th_suspend_init)); 4538 #else 4539 __itt_suppress_mark_range(__itt_suppress_range, 4540 __itt_suppress_threading_errors, 4541 &new_thr->th.th_suspend_init_count, 4542 sizeof(new_thr->th.th_suspend_init_count)); 4543 #endif 4544 // TODO: check if we need to also suppress b_arrived flags 4545 __itt_suppress_mark_range(__itt_suppress_range, 4546 __itt_suppress_threading_errors, 4547 CCAST(kmp_uint64 *, &new_thr->th.th_bar[0].bb.b_go), 4548 sizeof(new_thr->th.th_bar[0].bb.b_go)); 4549 __itt_suppress_mark_range(__itt_suppress_range, 4550 __itt_suppress_threading_errors, 4551 CCAST(kmp_uint64 *, &new_thr->th.th_bar[1].bb.b_go), 4552 sizeof(new_thr->th.th_bar[1].bb.b_go)); 4553 __itt_suppress_mark_range(__itt_suppress_range, 4554 __itt_suppress_threading_errors, 4555 CCAST(kmp_uint64 *, &new_thr->th.th_bar[2].bb.b_go), 4556 sizeof(new_thr->th.th_bar[2].bb.b_go)); 4557 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY && KMP_DEBUG */ 4558 if (__kmp_storage_map) { 4559 __kmp_print_thread_storage_map(new_thr, new_gtid); 4560 } 4561 4562 // add the reserve serialized team, initialized from the team's primary thread 4563 { 4564 kmp_internal_control_t r_icvs = __kmp_get_x_global_icvs(team); 4565 KF_TRACE(10, ("__kmp_allocate_thread: before th_serial/serial_team\n")); 4566 new_thr->th.th_serial_team = serial_team = 4567 (kmp_team_t *)__kmp_allocate_team(root, 1, 1, 4568 #if OMPT_SUPPORT 4569 ompt_data_none, // root parallel id 4570 #endif 4571 proc_bind_default, &r_icvs, 4572 0 USE_NESTED_HOT_ARG(NULL)); 4573 } 4574 KMP_ASSERT(serial_team); 4575 serial_team->t.t_serialized = 0; // AC: the team created in reserve, not for 4576 // execution (it is unused for now). 
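// Each thread keeps this reserve serial team so that a serialized parallel
// region encountered later (for example, one whose if-clause evaluates to
// false) finds a team structure ready instead of allocating one on that path.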
4577 serial_team->t.t_threads[0] = new_thr; 4578 KF_TRACE(10, 4579 ("__kmp_allocate_thread: after th_serial/serial_team : new_thr=%p\n", 4580 new_thr)); 4581 4582 /* setup the thread structures */ 4583 __kmp_initialize_info(new_thr, team, new_tid, new_gtid); 4584 4585 #if USE_FAST_MEMORY 4586 __kmp_initialize_fast_memory(new_thr); 4587 #endif /* USE_FAST_MEMORY */ 4588 4589 #if KMP_USE_BGET 4590 KMP_DEBUG_ASSERT(new_thr->th.th_local.bget_data == NULL); 4591 __kmp_initialize_bget(new_thr); 4592 #endif 4593 4594 __kmp_init_random(new_thr); // Initialize random number generator 4595 4596 /* Initialize these only once when thread is grabbed for a team allocation */ 4597 KA_TRACE(20, 4598 ("__kmp_allocate_thread: T#%d init go fork=%u, plain=%u\n", 4599 __kmp_get_gtid(), KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE)); 4600 4601 int b; 4602 kmp_balign_t *balign = new_thr->th.th_bar; 4603 for (b = 0; b < bs_last_barrier; ++b) { 4604 balign[b].bb.b_go = KMP_INIT_BARRIER_STATE; 4605 balign[b].bb.team = NULL; 4606 balign[b].bb.wait_flag = KMP_BARRIER_NOT_WAITING; 4607 balign[b].bb.use_oncore_barrier = 0; 4608 } 4609 4610 TCW_PTR(new_thr->th.th_sleep_loc, NULL); 4611 new_thr->th.th_sleep_loc_type = flag_unset; 4612 4613 new_thr->th.th_spin_here = FALSE; 4614 new_thr->th.th_next_waiting = 0; 4615 #if KMP_OS_UNIX 4616 new_thr->th.th_blocking = false; 4617 #endif 4618 4619 #if KMP_AFFINITY_SUPPORTED 4620 new_thr->th.th_current_place = KMP_PLACE_UNDEFINED; 4621 new_thr->th.th_new_place = KMP_PLACE_UNDEFINED; 4622 new_thr->th.th_first_place = KMP_PLACE_UNDEFINED; 4623 new_thr->th.th_last_place = KMP_PLACE_UNDEFINED; 4624 #endif 4625 new_thr->th.th_def_allocator = __kmp_def_allocator; 4626 new_thr->th.th_prev_level = 0; 4627 new_thr->th.th_prev_num_threads = 1; 4628 4629 TCW_4(new_thr->th.th_in_pool, FALSE); 4630 new_thr->th.th_active_in_pool = FALSE; 4631 TCW_4(new_thr->th.th_active, TRUE); 4632 4633 /* adjust the global counters */ 4634 __kmp_all_nth++; 4635 __kmp_nth++; 4636 4637 // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search) for low 4638 // numbers of procs, and method #2 (keyed API call) for higher numbers. 4639 if (__kmp_adjust_gtid_mode) { 4640 if (__kmp_all_nth >= __kmp_tls_gtid_min) { 4641 if (TCR_4(__kmp_gtid_mode) != 2) { 4642 TCW_4(__kmp_gtid_mode, 2); 4643 } 4644 } else { 4645 if (TCR_4(__kmp_gtid_mode) != 1) { 4646 TCW_4(__kmp_gtid_mode, 1); 4647 } 4648 } 4649 } 4650 4651 #ifdef KMP_ADJUST_BLOCKTIME 4652 /* Adjust blocktime back to zero if necessary */ 4653 /* Middle initialization might not have occurred yet */ 4654 if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) { 4655 if (__kmp_nth > __kmp_avail_proc) { 4656 __kmp_zero_bt = TRUE; 4657 } 4658 } 4659 #endif /* KMP_ADJUST_BLOCKTIME */ 4660 4661 /* actually fork it and create the new worker thread */ 4662 KF_TRACE( 4663 10, ("__kmp_allocate_thread: before __kmp_create_worker: %p\n", new_thr)); 4664 __kmp_create_worker(new_gtid, new_thr, __kmp_stksize); 4665 KF_TRACE(10, 4666 ("__kmp_allocate_thread: after __kmp_create_worker: %p\n", new_thr)); 4667 4668 KA_TRACE(20, ("__kmp_allocate_thread: T#%d forked T#%d\n", __kmp_get_gtid(), 4669 new_gtid)); 4670 KMP_MB(); 4671 return new_thr; 4672 } 4673 4674 /* Reinitialize team for reuse. 4675 The hot team code calls this case at every fork barrier, so EPCC barrier 4676 test are extremely sensitive to changes in it, esp. writes to the team 4677 struct, which cause a cache invalidation in all threads. 
4678 IF YOU TOUCH THIS ROUTINE, RUN EPCC C SYNCBENCH ON A BIG-IRON MACHINE!!! */ 4679 static void __kmp_reinitialize_team(kmp_team_t *team, 4680 kmp_internal_control_t *new_icvs, 4681 ident_t *loc) { 4682 KF_TRACE(10, ("__kmp_reinitialize_team: enter this_thread=%p team=%p\n", 4683 team->t.t_threads[0], team)); 4684 KMP_DEBUG_ASSERT(team && new_icvs); 4685 KMP_DEBUG_ASSERT((!TCR_4(__kmp_init_parallel)) || new_icvs->nproc); 4686 KMP_CHECK_UPDATE(team->t.t_ident, loc); 4687 4688 KMP_CHECK_UPDATE(team->t.t_id, KMP_GEN_TEAM_ID()); 4689 // Copy ICVs to the primary thread's implicit taskdata 4690 __kmp_init_implicit_task(loc, team->t.t_threads[0], team, 0, FALSE); 4691 copy_icvs(&team->t.t_implicit_task_taskdata[0].td_icvs, new_icvs); 4692 4693 KF_TRACE(10, ("__kmp_reinitialize_team: exit this_thread=%p team=%p\n", 4694 team->t.t_threads[0], team)); 4695 } 4696 4697 /* Initialize the team data structure. 4698 This assumes the t_threads and t_max_nproc are already set. 4699 Also, we don't touch the arguments */ 4700 static void __kmp_initialize_team(kmp_team_t *team, int new_nproc, 4701 kmp_internal_control_t *new_icvs, 4702 ident_t *loc) { 4703 KF_TRACE(10, ("__kmp_initialize_team: enter: team=%p\n", team)); 4704 4705 /* verify */ 4706 KMP_DEBUG_ASSERT(team); 4707 KMP_DEBUG_ASSERT(new_nproc <= team->t.t_max_nproc); 4708 KMP_DEBUG_ASSERT(team->t.t_threads); 4709 KMP_MB(); 4710 4711 team->t.t_master_tid = 0; /* not needed */ 4712 /* team->t.t_master_bar; not needed */ 4713 team->t.t_serialized = new_nproc > 1 ? 0 : 1; 4714 team->t.t_nproc = new_nproc; 4715 4716 /* team->t.t_parent = NULL; TODO not needed & would mess up hot team */ 4717 team->t.t_next_pool = NULL; 4718 /* memset( team->t.t_threads, 0, sizeof(kmp_info_t*)*new_nproc ); would mess 4719 * up hot team */ 4720 4721 TCW_SYNC_PTR(team->t.t_pkfn, NULL); /* not needed */ 4722 team->t.t_invoke = NULL; /* not needed */ 4723 4724 // TODO???: team->t.t_max_active_levels = new_max_active_levels; 4725 team->t.t_sched.sched = new_icvs->sched.sched; 4726 4727 #if KMP_ARCH_X86 || KMP_ARCH_X86_64 4728 team->t.t_fp_control_saved = FALSE; /* not needed */ 4729 team->t.t_x87_fpu_control_word = 0; /* not needed */ 4730 team->t.t_mxcsr = 0; /* not needed */ 4731 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */ 4732 4733 team->t.t_construct = 0; 4734 4735 team->t.t_ordered.dt.t_value = 0; 4736 team->t.t_master_active = FALSE; 4737 4738 #ifdef KMP_DEBUG 4739 team->t.t_copypriv_data = NULL; /* not necessary, but nice for debugging */ 4740 #endif 4741 #if KMP_OS_WINDOWS 4742 team->t.t_copyin_counter = 0; /* for barrier-free copyin implementation */ 4743 #endif 4744 4745 team->t.t_control_stack_top = NULL; 4746 4747 __kmp_reinitialize_team(team, new_icvs, loc); 4748 4749 KMP_MB(); 4750 KF_TRACE(10, ("__kmp_initialize_team: exit: team=%p\n", team)); 4751 } 4752 4753 #if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED 4754 /* Sets full mask for thread and returns old mask, no changes to structures. 
*/ 4755 static void 4756 __kmp_set_thread_affinity_mask_full_tmp(kmp_affin_mask_t *old_mask) { 4757 if (KMP_AFFINITY_CAPABLE()) { 4758 int status; 4759 if (old_mask != NULL) { 4760 status = __kmp_get_system_affinity(old_mask, TRUE); 4761 int error = errno; 4762 if (status != 0) { 4763 __kmp_fatal(KMP_MSG(ChangeThreadAffMaskError), KMP_ERR(error), 4764 __kmp_msg_null); 4765 } 4766 } 4767 __kmp_set_system_affinity(__kmp_affin_fullMask, TRUE); 4768 } 4769 } 4770 #endif 4771 4772 #if KMP_AFFINITY_SUPPORTED 4773 4774 // __kmp_partition_places() is the heart of the OpenMP 4.0 affinity mechanism. 4775 // It calculates the worker + primary thread's partition based upon the parent 4776 // thread's partition, and binds each worker to a thread in their partition. 4777 // The primary thread's partition should already include its current binding. 4778 static void __kmp_partition_places(kmp_team_t *team, int update_master_only) { 4779 // Do not partition places for the hidden helper team 4780 if (KMP_HIDDEN_HELPER_TEAM(team)) 4781 return; 4782 // Copy the primary thread's place partition to the team struct 4783 kmp_info_t *master_th = team->t.t_threads[0]; 4784 KMP_DEBUG_ASSERT(master_th != NULL); 4785 kmp_proc_bind_t proc_bind = team->t.t_proc_bind; 4786 int first_place = master_th->th.th_first_place; 4787 int last_place = master_th->th.th_last_place; 4788 int masters_place = master_th->th.th_current_place; 4789 int num_masks = __kmp_affinity.num_masks; 4790 team->t.t_first_place = first_place; 4791 team->t.t_last_place = last_place; 4792 4793 KA_TRACE(20, ("__kmp_partition_places: enter: proc_bind = %d T#%d(%d:0) " 4794 "bound to place %d partition = [%d,%d]\n", 4795 proc_bind, __kmp_gtid_from_thread(team->t.t_threads[0]), 4796 team->t.t_id, masters_place, first_place, last_place)); 4797 4798 switch (proc_bind) { 4799 4800 case proc_bind_default: 4801 // Serial teams might have the proc_bind policy set to proc_bind_default. 4802 // Not an issue -- we don't rebind primary thread for any proc_bind policy. 
4803 KMP_DEBUG_ASSERT(team->t.t_nproc == 1); 4804 break; 4805 4806 case proc_bind_primary: { 4807 int f; 4808 int n_th = team->t.t_nproc; 4809 for (f = 1; f < n_th; f++) { 4810 kmp_info_t *th = team->t.t_threads[f]; 4811 KMP_DEBUG_ASSERT(th != NULL); 4812 th->th.th_first_place = first_place; 4813 th->th.th_last_place = last_place; 4814 th->th.th_new_place = masters_place; 4815 if (__kmp_display_affinity && masters_place != th->th.th_current_place && 4816 team->t.t_display_affinity != 1) { 4817 team->t.t_display_affinity = 1; 4818 } 4819 4820 KA_TRACE(100, ("__kmp_partition_places: primary: T#%d(%d:%d) place %d " 4821 "partition = [%d,%d]\n", 4822 __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id, 4823 f, masters_place, first_place, last_place)); 4824 } 4825 } break; 4826 4827 case proc_bind_close: { 4828 int f; 4829 int n_th = team->t.t_nproc; 4830 int n_places; 4831 if (first_place <= last_place) { 4832 n_places = last_place - first_place + 1; 4833 } else { 4834 n_places = num_masks - first_place + last_place + 1; 4835 } 4836 if (n_th <= n_places) { 4837 int place = masters_place; 4838 for (f = 1; f < n_th; f++) { 4839 kmp_info_t *th = team->t.t_threads[f]; 4840 KMP_DEBUG_ASSERT(th != NULL); 4841 4842 if (place == last_place) { 4843 place = first_place; 4844 } else if (place == (num_masks - 1)) { 4845 place = 0; 4846 } else { 4847 place++; 4848 } 4849 th->th.th_first_place = first_place; 4850 th->th.th_last_place = last_place; 4851 th->th.th_new_place = place; 4852 if (__kmp_display_affinity && place != th->th.th_current_place && 4853 team->t.t_display_affinity != 1) { 4854 team->t.t_display_affinity = 1; 4855 } 4856 4857 KA_TRACE(100, ("__kmp_partition_places: close: T#%d(%d:%d) place %d " 4858 "partition = [%d,%d]\n", 4859 __kmp_gtid_from_thread(team->t.t_threads[f]), 4860 team->t.t_id, f, place, first_place, last_place)); 4861 } 4862 } else { 4863 int S, rem, gap, s_count; 4864 S = n_th / n_places; 4865 s_count = 0; 4866 rem = n_th - (S * n_places); 4867 gap = rem > 0 ? 
n_places / rem : n_places; 4868 int place = masters_place; 4869 int gap_ct = gap; 4870 for (f = 0; f < n_th; f++) { 4871 kmp_info_t *th = team->t.t_threads[f]; 4872 KMP_DEBUG_ASSERT(th != NULL); 4873 4874 th->th.th_first_place = first_place; 4875 th->th.th_last_place = last_place; 4876 th->th.th_new_place = place; 4877 if (__kmp_display_affinity && place != th->th.th_current_place && 4878 team->t.t_display_affinity != 1) { 4879 team->t.t_display_affinity = 1; 4880 } 4881 s_count++; 4882 4883 if ((s_count == S) && rem && (gap_ct == gap)) { 4884 // do nothing, add an extra thread to place on next iteration 4885 } else if ((s_count == S + 1) && rem && (gap_ct == gap)) { 4886 // we added an extra thread to this place; move to next place 4887 if (place == last_place) { 4888 place = first_place; 4889 } else if (place == (num_masks - 1)) { 4890 place = 0; 4891 } else { 4892 place++; 4893 } 4894 s_count = 0; 4895 gap_ct = 1; 4896 rem--; 4897 } else if (s_count == S) { // place full; don't add extra 4898 if (place == last_place) { 4899 place = first_place; 4900 } else if (place == (num_masks - 1)) { 4901 place = 0; 4902 } else { 4903 place++; 4904 } 4905 gap_ct++; 4906 s_count = 0; 4907 } 4908 4909 KA_TRACE(100, 4910 ("__kmp_partition_places: close: T#%d(%d:%d) place %d " 4911 "partition = [%d,%d]\n", 4912 __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id, f, 4913 th->th.th_new_place, first_place, last_place)); 4914 } 4915 KMP_DEBUG_ASSERT(place == masters_place); 4916 } 4917 } break; 4918 4919 case proc_bind_spread: { 4920 int f; 4921 int n_th = team->t.t_nproc; 4922 int n_places; 4923 int thidx; 4924 if (first_place <= last_place) { 4925 n_places = last_place - first_place + 1; 4926 } else { 4927 n_places = num_masks - first_place + last_place + 1; 4928 } 4929 if (n_th <= n_places) { 4930 int place = -1; 4931 4932 if (n_places != num_masks) { 4933 int S = n_places / n_th; 4934 int s_count, rem, gap, gap_ct; 4935 4936 place = masters_place; 4937 rem = n_places - n_th * S; 4938 gap = rem ? 
n_th / rem : 1; 4939 gap_ct = gap; 4940 thidx = n_th; 4941 if (update_master_only == 1) 4942 thidx = 1; 4943 for (f = 0; f < thidx; f++) { 4944 kmp_info_t *th = team->t.t_threads[f]; 4945 KMP_DEBUG_ASSERT(th != NULL); 4946 4947 th->th.th_first_place = place; 4948 th->th.th_new_place = place; 4949 if (__kmp_display_affinity && place != th->th.th_current_place && 4950 team->t.t_display_affinity != 1) { 4951 team->t.t_display_affinity = 1; 4952 } 4953 s_count = 1; 4954 while (s_count < S) { 4955 if (place == last_place) { 4956 place = first_place; 4957 } else if (place == (num_masks - 1)) { 4958 place = 0; 4959 } else { 4960 place++; 4961 } 4962 s_count++; 4963 } 4964 if (rem && (gap_ct == gap)) { 4965 if (place == last_place) { 4966 place = first_place; 4967 } else if (place == (num_masks - 1)) { 4968 place = 0; 4969 } else { 4970 place++; 4971 } 4972 rem--; 4973 gap_ct = 0; 4974 } 4975 th->th.th_last_place = place; 4976 gap_ct++; 4977 4978 if (place == last_place) { 4979 place = first_place; 4980 } else if (place == (num_masks - 1)) { 4981 place = 0; 4982 } else { 4983 place++; 4984 } 4985 4986 KA_TRACE(100, 4987 ("__kmp_partition_places: spread: T#%d(%d:%d) place %d " 4988 "partition = [%d,%d], num_masks: %u\n", 4989 __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id, 4990 f, th->th.th_new_place, th->th.th_first_place, 4991 th->th.th_last_place, num_masks)); 4992 } 4993 } else { 4994 /* Having uniform space of available computation places I can create 4995 T partitions of round(P/T) size and put threads into the first 4996 place of each partition. */ 4997 double current = static_cast<double>(masters_place); 4998 double spacing = 4999 (static_cast<double>(n_places + 1) / static_cast<double>(n_th)); 5000 int first, last; 5001 kmp_info_t *th; 5002 5003 thidx = n_th + 1; 5004 if (update_master_only == 1) 5005 thidx = 1; 5006 for (f = 0; f < thidx; f++) { 5007 first = static_cast<int>(current); 5008 last = static_cast<int>(current + spacing) - 1; 5009 KMP_DEBUG_ASSERT(last >= first); 5010 if (first >= n_places) { 5011 if (masters_place) { 5012 first -= n_places; 5013 last -= n_places; 5014 if (first == (masters_place + 1)) { 5015 KMP_DEBUG_ASSERT(f == n_th); 5016 first--; 5017 } 5018 if (last == masters_place) { 5019 KMP_DEBUG_ASSERT(f == (n_th - 1)); 5020 last--; 5021 } 5022 } else { 5023 KMP_DEBUG_ASSERT(f == n_th); 5024 first = 0; 5025 last = 0; 5026 } 5027 } 5028 if (last >= n_places) { 5029 last = (n_places - 1); 5030 } 5031 place = first; 5032 current += spacing; 5033 if (f < n_th) { 5034 KMP_DEBUG_ASSERT(0 <= first); 5035 KMP_DEBUG_ASSERT(n_places > first); 5036 KMP_DEBUG_ASSERT(0 <= last); 5037 KMP_DEBUG_ASSERT(n_places > last); 5038 KMP_DEBUG_ASSERT(last_place >= first_place); 5039 th = team->t.t_threads[f]; 5040 KMP_DEBUG_ASSERT(th); 5041 th->th.th_first_place = first; 5042 th->th.th_new_place = place; 5043 th->th.th_last_place = last; 5044 if (__kmp_display_affinity && place != th->th.th_current_place && 5045 team->t.t_display_affinity != 1) { 5046 team->t.t_display_affinity = 1; 5047 } 5048 KA_TRACE(100, 5049 ("__kmp_partition_places: spread: T#%d(%d:%d) place %d " 5050 "partition = [%d,%d], spacing = %.4f\n", 5051 __kmp_gtid_from_thread(team->t.t_threads[f]), 5052 team->t.t_id, f, th->th.th_new_place, 5053 th->th.th_first_place, th->th.th_last_place, spacing)); 5054 } 5055 } 5056 } 5057 KMP_DEBUG_ASSERT(update_master_only || place == masters_place); 5058 } else { 5059 int S, rem, gap, s_count; 5060 S = n_th / n_places; 5061 s_count = 0; 5062 rem = n_th - (S * n_places); 
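// For example, distributing n_th = 10 threads over n_places = 4 places gives
// S = 2, rem = 2 and gap = 2: the loop below then puts 3, 2, 3, 2 threads on
// the four places, i.e. every gap-th place takes one of the rem extra threads.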
5063 gap = rem > 0 ? n_places / rem : n_places; 5064 int place = masters_place; 5065 int gap_ct = gap; 5066 thidx = n_th; 5067 if (update_master_only == 1) 5068 thidx = 1; 5069 for (f = 0; f < thidx; f++) { 5070 kmp_info_t *th = team->t.t_threads[f]; 5071 KMP_DEBUG_ASSERT(th != NULL); 5072 5073 th->th.th_first_place = place; 5074 th->th.th_last_place = place; 5075 th->th.th_new_place = place; 5076 if (__kmp_display_affinity && place != th->th.th_current_place && 5077 team->t.t_display_affinity != 1) { 5078 team->t.t_display_affinity = 1; 5079 } 5080 s_count++; 5081 5082 if ((s_count == S) && rem && (gap_ct == gap)) { 5083 // do nothing, add an extra thread to place on next iteration 5084 } else if ((s_count == S + 1) && rem && (gap_ct == gap)) { 5085 // we added an extra thread to this place; move on to next place 5086 if (place == last_place) { 5087 place = first_place; 5088 } else if (place == (num_masks - 1)) { 5089 place = 0; 5090 } else { 5091 place++; 5092 } 5093 s_count = 0; 5094 gap_ct = 1; 5095 rem--; 5096 } else if (s_count == S) { // place is full; don't add extra thread 5097 if (place == last_place) { 5098 place = first_place; 5099 } else if (place == (num_masks - 1)) { 5100 place = 0; 5101 } else { 5102 place++; 5103 } 5104 gap_ct++; 5105 s_count = 0; 5106 } 5107 5108 KA_TRACE(100, ("__kmp_partition_places: spread: T#%d(%d:%d) place %d " 5109 "partition = [%d,%d]\n", 5110 __kmp_gtid_from_thread(team->t.t_threads[f]), 5111 team->t.t_id, f, th->th.th_new_place, 5112 th->th.th_first_place, th->th.th_last_place)); 5113 } 5114 KMP_DEBUG_ASSERT(update_master_only || place == masters_place); 5115 } 5116 } break; 5117 5118 default: 5119 break; 5120 } 5121 5122 KA_TRACE(20, ("__kmp_partition_places: exit T#%d\n", team->t.t_id)); 5123 } 5124 5125 #endif // KMP_AFFINITY_SUPPORTED 5126 5127 /* allocate a new team data structure to use. take one off of the free pool if 5128 available */ 5129 kmp_team_t * 5130 __kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc, 5131 #if OMPT_SUPPORT 5132 ompt_data_t ompt_parallel_data, 5133 #endif 5134 kmp_proc_bind_t new_proc_bind, 5135 kmp_internal_control_t *new_icvs, 5136 int argc USE_NESTED_HOT_ARG(kmp_info_t *master)) { 5137 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_allocate_team); 5138 int f; 5139 kmp_team_t *team; 5140 int use_hot_team = !root->r.r_active; 5141 int level = 0; 5142 int do_place_partition = 1; 5143 5144 KA_TRACE(20, ("__kmp_allocate_team: called\n")); 5145 KMP_DEBUG_ASSERT(new_nproc >= 1 && argc >= 0); 5146 KMP_DEBUG_ASSERT(max_nproc >= new_nproc); 5147 KMP_MB(); 5148 5149 #if KMP_NESTED_HOT_TEAMS 5150 kmp_hot_team_ptr_t *hot_teams; 5151 if (master) { 5152 team = master->th.th_team; 5153 level = team->t.t_active_level; 5154 if (master->th.th_teams_microtask) { // in teams construct? 
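// 'level' indexes the hot_teams[] array consulted below; the teams-construct
// adjustments that follow keep it consistent with the matching recomputation
// in __kmp_free_team.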
5155 if (master->th.th_teams_size.nteams > 1 && 5156 ( // #teams > 1 5157 team->t.t_pkfn == 5158 (microtask_t)__kmp_teams_master || // inner fork of the teams 5159 master->th.th_teams_level < 5160 team->t.t_level)) { // or nested parallel inside the teams 5161 ++level; // not increment if #teams==1, or for outer fork of the teams; 5162 // increment otherwise 5163 } 5164 // Do not perform the place partition if inner fork of the teams 5165 // Wait until nested parallel region encountered inside teams construct 5166 if ((master->th.th_teams_size.nteams == 1 && 5167 master->th.th_teams_level >= team->t.t_level) || 5168 (team->t.t_pkfn == (microtask_t)__kmp_teams_master)) 5169 do_place_partition = 0; 5170 } 5171 hot_teams = master->th.th_hot_teams; 5172 if (level < __kmp_hot_teams_max_level && hot_teams && 5173 hot_teams[level].hot_team) { 5174 // hot team has already been allocated for given level 5175 use_hot_team = 1; 5176 } else { 5177 use_hot_team = 0; 5178 } 5179 } else { 5180 // check we won't access uninitialized hot_teams, just in case 5181 KMP_DEBUG_ASSERT(new_nproc == 1); 5182 } 5183 #endif 5184 // Optimization to use a "hot" team 5185 if (use_hot_team && new_nproc > 1) { 5186 KMP_DEBUG_ASSERT(new_nproc <= max_nproc); 5187 #if KMP_NESTED_HOT_TEAMS 5188 team = hot_teams[level].hot_team; 5189 #else 5190 team = root->r.r_hot_team; 5191 #endif 5192 #if KMP_DEBUG 5193 if (__kmp_tasking_mode != tskm_immediate_exec) { 5194 KA_TRACE(20, ("__kmp_allocate_team: hot team task_team[0] = %p " 5195 "task_team[1] = %p before reinit\n", 5196 team->t.t_task_team[0], team->t.t_task_team[1])); 5197 } 5198 #endif 5199 5200 if (team->t.t_nproc != new_nproc && 5201 __kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) { 5202 // Distributed barrier may need a resize 5203 int old_nthr = team->t.t_nproc; 5204 __kmp_resize_dist_barrier(team, old_nthr, new_nproc); 5205 } 5206 5207 // If not doing the place partition, then reset the team's proc bind 5208 // to indicate that partitioning of all threads still needs to take place 5209 if (do_place_partition == 0) 5210 team->t.t_proc_bind = proc_bind_default; 5211 // Has the number of threads changed? 5212 /* Let's assume the most common case is that the number of threads is 5213 unchanged, and put that case first. 
*/ 5214 if (team->t.t_nproc == new_nproc) { // Check changes in number of threads 5215 KA_TRACE(20, ("__kmp_allocate_team: reusing hot team\n")); 5216 // This case can mean that omp_set_num_threads() was called and the hot 5217 // team size was already reduced, so we check the special flag 5218 if (team->t.t_size_changed == -1) { 5219 team->t.t_size_changed = 1; 5220 } else { 5221 KMP_CHECK_UPDATE(team->t.t_size_changed, 0); 5222 } 5223 5224 // TODO???: team->t.t_max_active_levels = new_max_active_levels; 5225 kmp_r_sched_t new_sched = new_icvs->sched; 5226 // set primary thread's schedule as new run-time schedule 5227 KMP_CHECK_UPDATE(team->t.t_sched.sched, new_sched.sched); 5228 5229 __kmp_reinitialize_team(team, new_icvs, 5230 root->r.r_uber_thread->th.th_ident); 5231 5232 KF_TRACE(10, ("__kmp_allocate_team2: T#%d, this_thread=%p team=%p\n", 0, 5233 team->t.t_threads[0], team)); 5234 __kmp_push_current_task_to_thread(team->t.t_threads[0], team, 0); 5235 5236 #if KMP_AFFINITY_SUPPORTED 5237 if ((team->t.t_size_changed == 0) && 5238 (team->t.t_proc_bind == new_proc_bind)) { 5239 if (new_proc_bind == proc_bind_spread) { 5240 if (do_place_partition) { 5241 // add flag to update only master for spread 5242 __kmp_partition_places(team, 1); 5243 } 5244 } 5245 KA_TRACE(200, ("__kmp_allocate_team: reusing hot team #%d bindings: " 5246 "proc_bind = %d, partition = [%d,%d]\n", 5247 team->t.t_id, new_proc_bind, team->t.t_first_place, 5248 team->t.t_last_place)); 5249 } else { 5250 if (do_place_partition) { 5251 KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind); 5252 __kmp_partition_places(team); 5253 } 5254 } 5255 #else 5256 KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind); 5257 #endif /* KMP_AFFINITY_SUPPORTED */ 5258 } else if (team->t.t_nproc > new_nproc) { 5259 KA_TRACE(20, 5260 ("__kmp_allocate_team: decreasing hot team thread count to %d\n", 5261 new_nproc)); 5262 5263 team->t.t_size_changed = 1; 5264 if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) { 5265 // Barrier size already reduced earlier in this function 5266 // Activate team threads via th_used_in_team 5267 __kmp_add_threads_to_team(team, new_nproc); 5268 } 5269 #if KMP_NESTED_HOT_TEAMS 5270 if (__kmp_hot_teams_mode == 0) { 5271 // AC: saved number of threads should correspond to team's value in this 5272 // mode, can be bigger in mode 1, when hot team has threads in reserve 5273 KMP_DEBUG_ASSERT(hot_teams[level].hot_team_nth == team->t.t_nproc); 5274 hot_teams[level].hot_team_nth = new_nproc; 5275 #endif // KMP_NESTED_HOT_TEAMS 5276 /* release the extra threads we don't need any more */ 5277 for (f = new_nproc; f < team->t.t_nproc; f++) { 5278 KMP_DEBUG_ASSERT(team->t.t_threads[f]); 5279 if (__kmp_tasking_mode != tskm_immediate_exec) { 5280 // When decreasing team size, threads no longer in the team should 5281 // unref task team. 
5282 team->t.t_threads[f]->th.th_task_team = NULL; 5283 } 5284 __kmp_free_thread(team->t.t_threads[f]); 5285 team->t.t_threads[f] = NULL; 5286 } 5287 #if KMP_NESTED_HOT_TEAMS 5288 } // (__kmp_hot_teams_mode == 0) 5289 else { 5290 // When keeping extra threads in team, switch threads to wait on own 5291 // b_go flag 5292 for (f = new_nproc; f < team->t.t_nproc; ++f) { 5293 KMP_DEBUG_ASSERT(team->t.t_threads[f]); 5294 kmp_balign_t *balign = team->t.t_threads[f]->th.th_bar; 5295 for (int b = 0; b < bs_last_barrier; ++b) { 5296 if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG) { 5297 balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG; 5298 } 5299 KMP_CHECK_UPDATE(balign[b].bb.leaf_kids, 0); 5300 } 5301 } 5302 } 5303 #endif // KMP_NESTED_HOT_TEAMS 5304 team->t.t_nproc = new_nproc; 5305 // TODO???: team->t.t_max_active_levels = new_max_active_levels; 5306 KMP_CHECK_UPDATE(team->t.t_sched.sched, new_icvs->sched.sched); 5307 __kmp_reinitialize_team(team, new_icvs, 5308 root->r.r_uber_thread->th.th_ident); 5309 5310 // Update remaining threads 5311 for (f = 0; f < new_nproc; ++f) { 5312 team->t.t_threads[f]->th.th_team_nproc = new_nproc; 5313 } 5314 5315 // restore the current task state of the primary thread: should be the 5316 // implicit task 5317 KF_TRACE(10, ("__kmp_allocate_team: T#%d, this_thread=%p team=%p\n", 0, 5318 team->t.t_threads[0], team)); 5319 5320 __kmp_push_current_task_to_thread(team->t.t_threads[0], team, 0); 5321 5322 #ifdef KMP_DEBUG 5323 for (f = 0; f < team->t.t_nproc; f++) { 5324 KMP_DEBUG_ASSERT(team->t.t_threads[f] && 5325 team->t.t_threads[f]->th.th_team_nproc == 5326 team->t.t_nproc); 5327 } 5328 #endif 5329 5330 if (do_place_partition) { 5331 KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind); 5332 #if KMP_AFFINITY_SUPPORTED 5333 __kmp_partition_places(team); 5334 #endif 5335 } 5336 } else { // team->t.t_nproc < new_nproc 5337 #if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED 5338 kmp_affin_mask_t *old_mask; 5339 if (KMP_AFFINITY_CAPABLE()) { 5340 KMP_CPU_ALLOC(old_mask); 5341 } 5342 #endif 5343 5344 KA_TRACE(20, 5345 ("__kmp_allocate_team: increasing hot team thread count to %d\n", 5346 new_nproc)); 5347 int old_nproc = team->t.t_nproc; // save old value and use to update only 5348 team->t.t_size_changed = 1; 5349 5350 #if KMP_NESTED_HOT_TEAMS 5351 int avail_threads = hot_teams[level].hot_team_nth; 5352 if (new_nproc < avail_threads) 5353 avail_threads = new_nproc; 5354 kmp_info_t **other_threads = team->t.t_threads; 5355 for (f = team->t.t_nproc; f < avail_threads; ++f) { 5356 // Adjust barrier data of reserved threads (if any) of the team 5357 // Other data will be set in __kmp_initialize_info() below. 5358 int b; 5359 kmp_balign_t *balign = other_threads[f]->th.th_bar; 5360 for (b = 0; b < bs_last_barrier; ++b) { 5361 balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived; 5362 KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG); 5363 #if USE_DEBUGGER 5364 balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived; 5365 #endif 5366 } 5367 } 5368 if (hot_teams[level].hot_team_nth >= new_nproc) { 5369 // we have all needed threads in reserve, no need to allocate any 5370 // this only possible in mode 1, cannot have reserved threads in mode 0 5371 KMP_DEBUG_ASSERT(__kmp_hot_teams_mode == 1); 5372 team->t.t_nproc = new_nproc; // just get reserved threads involved 5373 } else { 5374 // We may have some threads in reserve, but not enough; 5375 // get reserved threads involved if any. 
5376 team->t.t_nproc = hot_teams[level].hot_team_nth;
5377 hot_teams[level].hot_team_nth = new_nproc; // adjust hot team max size
5378 #endif // KMP_NESTED_HOT_TEAMS
5379 if (team->t.t_max_nproc < new_nproc) {
5380 /* reallocate larger arrays */
5381 __kmp_reallocate_team_arrays(team, new_nproc);
5382 __kmp_reinitialize_team(team, new_icvs, NULL);
5383 }
5384
5385 #if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED
5386 /* Temporarily set full mask for primary thread before creation of
5387 workers. The reason is that workers inherit the affinity from the
5388 primary thread, so if a lot of workers are created on a single
5389 core quickly, they don't get a chance to set their own affinity for
5390 a long time. */
5391 __kmp_set_thread_affinity_mask_full_tmp(old_mask);
5392 #endif
5393
5394 /* allocate new threads for the hot team */
5395 for (f = team->t.t_nproc; f < new_nproc; f++) {
5396 kmp_info_t *new_worker = __kmp_allocate_thread(root, team, f);
5397 KMP_DEBUG_ASSERT(new_worker);
5398 team->t.t_threads[f] = new_worker;
5399
5400 KA_TRACE(20,
5401 ("__kmp_allocate_team: team %d init T#%d arrived: "
5402 "join=%llu, plain=%llu\n",
5403 team->t.t_id, __kmp_gtid_from_tid(f, team), team->t.t_id, f,
5404 team->t.t_bar[bs_forkjoin_barrier].b_arrived,
5405 team->t.t_bar[bs_plain_barrier].b_arrived));
5406
5407 { // Initialize barrier data for new threads.
5408 int b;
5409 kmp_balign_t *balign = new_worker->th.th_bar;
5410 for (b = 0; b < bs_last_barrier; ++b) {
5411 balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5412 KMP_DEBUG_ASSERT(balign[b].bb.wait_flag !=
5413 KMP_BARRIER_PARENT_FLAG);
5414 #if USE_DEBUGGER
5415 balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5416 #endif
5417 }
5418 }
5419 }
5420
5421 #if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED
5422 if (KMP_AFFINITY_CAPABLE()) {
5423 /* Restore initial primary thread's affinity mask */
5424 __kmp_set_system_affinity(old_mask, TRUE);
5425 KMP_CPU_FREE(old_mask);
5426 }
5427 #endif
5428 #if KMP_NESTED_HOT_TEAMS
5429 } // end of check of t_nproc vs. new_nproc vs. hot_team_nth
5430 #endif // KMP_NESTED_HOT_TEAMS
5431 if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5432 // Barrier size already increased earlier in this function
5433 // Activate team threads via th_used_in_team
5434 __kmp_add_threads_to_team(team, new_nproc);
5435 }
5436 /* make sure everyone is synchronized */
5437 // new threads are initialized below
5438 __kmp_initialize_team(team, new_nproc, new_icvs,
5439 root->r.r_uber_thread->th.th_ident);
5440
5441 /* reinitialize the threads */
5442 KMP_DEBUG_ASSERT(team->t.t_nproc == new_nproc);
5443 for (f = 0; f < team->t.t_nproc; ++f)
5444 __kmp_initialize_info(team->t.t_threads[f], team, f,
5445 __kmp_gtid_from_tid(f, team));
5446
5447 if (level) { // set th_task_state for new threads in nested hot team
5448 // __kmp_initialize_info() no longer zeroes th_task_state, so we should
5449 // only need to set the th_task_state for the new threads. th_task_state
5450 // for primary thread will not be accurate until after this in
5451 // __kmp_fork_call(), so we look to the primary thread's memo_stack to
5452 // get the correct value.
5453 for (f = old_nproc; f < team->t.t_nproc; ++f) 5454 team->t.t_threads[f]->th.th_task_state = 5455 team->t.t_threads[0]->th.th_task_state_memo_stack[level]; 5456 } else { // set th_task_state for new threads in non-nested hot team 5457 // copy primary thread's state 5458 kmp_uint8 old_state = team->t.t_threads[0]->th.th_task_state; 5459 for (f = old_nproc; f < team->t.t_nproc; ++f) 5460 team->t.t_threads[f]->th.th_task_state = old_state; 5461 } 5462 5463 #ifdef KMP_DEBUG 5464 for (f = 0; f < team->t.t_nproc; ++f) { 5465 KMP_DEBUG_ASSERT(team->t.t_threads[f] && 5466 team->t.t_threads[f]->th.th_team_nproc == 5467 team->t.t_nproc); 5468 } 5469 #endif 5470 5471 if (do_place_partition) { 5472 KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind); 5473 #if KMP_AFFINITY_SUPPORTED 5474 __kmp_partition_places(team); 5475 #endif 5476 } 5477 } // Check changes in number of threads 5478 5479 kmp_info_t *master = team->t.t_threads[0]; 5480 if (master->th.th_teams_microtask) { 5481 for (f = 1; f < new_nproc; ++f) { 5482 // propagate teams construct specific info to workers 5483 kmp_info_t *thr = team->t.t_threads[f]; 5484 thr->th.th_teams_microtask = master->th.th_teams_microtask; 5485 thr->th.th_teams_level = master->th.th_teams_level; 5486 thr->th.th_teams_size = master->th.th_teams_size; 5487 } 5488 } 5489 #if KMP_NESTED_HOT_TEAMS 5490 if (level) { 5491 // Sync barrier state for nested hot teams, not needed for outermost hot 5492 // team. 5493 for (f = 1; f < new_nproc; ++f) { 5494 kmp_info_t *thr = team->t.t_threads[f]; 5495 int b; 5496 kmp_balign_t *balign = thr->th.th_bar; 5497 for (b = 0; b < bs_last_barrier; ++b) { 5498 balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived; 5499 KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG); 5500 #if USE_DEBUGGER 5501 balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived; 5502 #endif 5503 } 5504 } 5505 } 5506 #endif // KMP_NESTED_HOT_TEAMS 5507 5508 /* reallocate space for arguments if necessary */ 5509 __kmp_alloc_argv_entries(argc, team, TRUE); 5510 KMP_CHECK_UPDATE(team->t.t_argc, argc); 5511 // The hot team re-uses the previous task team, 5512 // if untouched during the previous release->gather phase. 
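// (Each team carries two task-team slots, t_task_team[0] and t_task_team[1],
// and the runtime alternates between them across consecutive parallel
// regions; that alternation is what makes this reuse possible.)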
5513 5514 KF_TRACE(10, (" hot_team = %p\n", team)); 5515 5516 #if KMP_DEBUG 5517 if (__kmp_tasking_mode != tskm_immediate_exec) { 5518 KA_TRACE(20, ("__kmp_allocate_team: hot team task_team[0] = %p " 5519 "task_team[1] = %p after reinit\n", 5520 team->t.t_task_team[0], team->t.t_task_team[1])); 5521 } 5522 #endif 5523 5524 #if OMPT_SUPPORT 5525 __ompt_team_assign_id(team, ompt_parallel_data); 5526 #endif 5527 5528 KMP_MB(); 5529 5530 return team; 5531 } 5532 5533 /* next, let's try to take one from the team pool */ 5534 KMP_MB(); 5535 for (team = CCAST(kmp_team_t *, __kmp_team_pool); (team);) { 5536 /* TODO: consider resizing undersized teams instead of reaping them, now 5537 that we have a resizing mechanism */ 5538 if (team->t.t_max_nproc >= max_nproc) { 5539 /* take this team from the team pool */ 5540 __kmp_team_pool = team->t.t_next_pool; 5541 5542 if (max_nproc > 1 && 5543 __kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) { 5544 if (!team->t.b) { // Allocate barrier structure 5545 team->t.b = distributedBarrier::allocate(__kmp_dflt_team_nth_ub); 5546 } 5547 } 5548 5549 /* setup the team for fresh use */ 5550 __kmp_initialize_team(team, new_nproc, new_icvs, NULL); 5551 5552 KA_TRACE(20, ("__kmp_allocate_team: setting task_team[0] %p and " 5553 "task_team[1] %p to NULL\n", 5554 &team->t.t_task_team[0], &team->t.t_task_team[1])); 5555 team->t.t_task_team[0] = NULL; 5556 team->t.t_task_team[1] = NULL; 5557 5558 /* reallocate space for arguments if necessary */ 5559 __kmp_alloc_argv_entries(argc, team, TRUE); 5560 KMP_CHECK_UPDATE(team->t.t_argc, argc); 5561 5562 KA_TRACE( 5563 20, ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n", 5564 team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE)); 5565 { // Initialize barrier data. 5566 int b; 5567 for (b = 0; b < bs_last_barrier; ++b) { 5568 team->t.t_bar[b].b_arrived = KMP_INIT_BARRIER_STATE; 5569 #if USE_DEBUGGER 5570 team->t.t_bar[b].b_master_arrived = 0; 5571 team->t.t_bar[b].b_team_arrived = 0; 5572 #endif 5573 } 5574 } 5575 5576 team->t.t_proc_bind = new_proc_bind; 5577 5578 KA_TRACE(20, ("__kmp_allocate_team: using team from pool %d.\n", 5579 team->t.t_id)); 5580 5581 #if OMPT_SUPPORT 5582 __ompt_team_assign_id(team, ompt_parallel_data); 5583 #endif 5584 5585 KMP_MB(); 5586 5587 return team; 5588 } 5589 5590 /* reap team if it is too small, then loop back and check the next one */ 5591 // not sure if this is wise, but, will be redone during the hot-teams 5592 // rewrite. 5593 /* TODO: Use technique to find the right size hot-team, don't reap them */ 5594 team = __kmp_reap_team(team); 5595 __kmp_team_pool = team; 5596 } 5597 5598 /* nothing available in the pool, no matter, make a new team! 
*/ 5599 KMP_MB(); 5600 team = (kmp_team_t *)__kmp_allocate(sizeof(kmp_team_t)); 5601 5602 /* and set it up */ 5603 team->t.t_max_nproc = max_nproc; 5604 if (max_nproc > 1 && 5605 __kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) { 5606 // Allocate barrier structure 5607 team->t.b = distributedBarrier::allocate(__kmp_dflt_team_nth_ub); 5608 } 5609 5610 /* NOTE well, for some reason allocating one big buffer and dividing it up 5611 seems to really hurt performance a lot on the P4, so, let's not use this */ 5612 __kmp_allocate_team_arrays(team, max_nproc); 5613 5614 KA_TRACE(20, ("__kmp_allocate_team: making a new team\n")); 5615 __kmp_initialize_team(team, new_nproc, new_icvs, NULL); 5616 5617 KA_TRACE(20, ("__kmp_allocate_team: setting task_team[0] %p and task_team[1] " 5618 "%p to NULL\n", 5619 &team->t.t_task_team[0], &team->t.t_task_team[1])); 5620 team->t.t_task_team[0] = NULL; // to be removed, as __kmp_allocate zeroes 5621 // memory, no need to duplicate 5622 team->t.t_task_team[1] = NULL; // to be removed, as __kmp_allocate zeroes 5623 // memory, no need to duplicate 5624 5625 if (__kmp_storage_map) { 5626 __kmp_print_team_storage_map("team", team, team->t.t_id, new_nproc); 5627 } 5628 5629 /* allocate space for arguments */ 5630 __kmp_alloc_argv_entries(argc, team, FALSE); 5631 team->t.t_argc = argc; 5632 5633 KA_TRACE(20, 5634 ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n", 5635 team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE)); 5636 { // Initialize barrier data. 5637 int b; 5638 for (b = 0; b < bs_last_barrier; ++b) { 5639 team->t.t_bar[b].b_arrived = KMP_INIT_BARRIER_STATE; 5640 #if USE_DEBUGGER 5641 team->t.t_bar[b].b_master_arrived = 0; 5642 team->t.t_bar[b].b_team_arrived = 0; 5643 #endif 5644 } 5645 } 5646 5647 team->t.t_proc_bind = new_proc_bind; 5648 5649 #if OMPT_SUPPORT 5650 __ompt_team_assign_id(team, ompt_parallel_data); 5651 team->t.ompt_serialized_team_info = NULL; 5652 #endif 5653 5654 KMP_MB(); 5655 5656 KA_TRACE(20, ("__kmp_allocate_team: done creating a new team %d.\n", 5657 team->t.t_id)); 5658 5659 return team; 5660 } 5661 5662 /* TODO implement hot-teams at all levels */ 5663 /* TODO implement lazy thread release on demand (disband request) */ 5664 5665 /* free the team. return it to the team pool. release all the threads 5666 * associated with it */ 5667 void __kmp_free_team(kmp_root_t *root, 5668 kmp_team_t *team USE_NESTED_HOT_ARG(kmp_info_t *master)) { 5669 int f; 5670 KA_TRACE(20, ("__kmp_free_team: T#%d freeing team %d\n", __kmp_get_gtid(), 5671 team->t.t_id)); 5672 5673 /* verify state */ 5674 KMP_DEBUG_ASSERT(root); 5675 KMP_DEBUG_ASSERT(team); 5676 KMP_DEBUG_ASSERT(team->t.t_nproc <= team->t.t_max_nproc); 5677 KMP_DEBUG_ASSERT(team->t.t_threads); 5678 5679 int use_hot_team = team == root->r.r_hot_team; 5680 #if KMP_NESTED_HOT_TEAMS 5681 int level; 5682 if (master) { 5683 level = team->t.t_active_level - 1; 5684 if (master->th.th_teams_microtask) { // in teams construct? 
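// Recompute 'level' with the same teams-construct adjustments used in
// __kmp_allocate_team, so this team is matched against the correct
// hot_teams[level] entry below.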
5685 if (master->th.th_teams_size.nteams > 1) { 5686 ++level; // level was not increased in teams construct for 5687 // team_of_masters 5688 } 5689 if (team->t.t_pkfn != (microtask_t)__kmp_teams_master && 5690 master->th.th_teams_level == team->t.t_level) { 5691 ++level; // level was not increased in teams construct for 5692 // team_of_workers before the parallel 5693 } // team->t.t_level will be increased inside parallel 5694 } 5695 #if KMP_DEBUG 5696 kmp_hot_team_ptr_t *hot_teams = master->th.th_hot_teams; 5697 #endif 5698 if (level < __kmp_hot_teams_max_level) { 5699 KMP_DEBUG_ASSERT(team == hot_teams[level].hot_team); 5700 use_hot_team = 1; 5701 } 5702 } 5703 #endif // KMP_NESTED_HOT_TEAMS 5704 5705 /* team is done working */ 5706 TCW_SYNC_PTR(team->t.t_pkfn, 5707 NULL); // Important for Debugging Support Library. 5708 #if KMP_OS_WINDOWS 5709 team->t.t_copyin_counter = 0; // init counter for possible reuse 5710 #endif 5711 // Do not reset pointer to parent team to NULL for hot teams. 5712 5713 /* if we are non-hot team, release our threads */ 5714 if (!use_hot_team) { 5715 if (__kmp_tasking_mode != tskm_immediate_exec) { 5716 // Wait for threads to reach reapable state 5717 for (f = 1; f < team->t.t_nproc; ++f) { 5718 KMP_DEBUG_ASSERT(team->t.t_threads[f]); 5719 kmp_info_t *th = team->t.t_threads[f]; 5720 volatile kmp_uint32 *state = &th->th.th_reap_state; 5721 while (*state != KMP_SAFE_TO_REAP) { 5722 #if KMP_OS_WINDOWS 5723 // On Windows a thread can be killed at any time, check this 5724 DWORD ecode; 5725 if (!__kmp_is_thread_alive(th, &ecode)) { 5726 *state = KMP_SAFE_TO_REAP; // reset the flag for dead thread 5727 break; 5728 } 5729 #endif 5730 // first check if thread is sleeping 5731 kmp_flag_64<> fl(&th->th.th_bar[bs_forkjoin_barrier].bb.b_go, th); 5732 if (fl.is_sleeping()) 5733 fl.resume(__kmp_gtid_from_thread(th)); 5734 KMP_CPU_PAUSE(); 5735 } 5736 } 5737 5738 // Delete task teams 5739 int tt_idx; 5740 for (tt_idx = 0; tt_idx < 2; ++tt_idx) { 5741 kmp_task_team_t *task_team = team->t.t_task_team[tt_idx]; 5742 if (task_team != NULL) { 5743 for (f = 0; f < team->t.t_nproc; ++f) { // threads unref task teams 5744 KMP_DEBUG_ASSERT(team->t.t_threads[f]); 5745 team->t.t_threads[f]->th.th_task_team = NULL; 5746 } 5747 KA_TRACE( 5748 20, 5749 ("__kmp_free_team: T#%d deactivating task_team %p on team %d\n", 5750 __kmp_get_gtid(), task_team, team->t.t_id)); 5751 #if KMP_NESTED_HOT_TEAMS 5752 __kmp_free_task_team(master, task_team); 5753 #endif 5754 team->t.t_task_team[tt_idx] = NULL; 5755 } 5756 } 5757 } 5758 5759 // Reset pointer to parent team only for non-hot teams. 
5760 team->t.t_parent = NULL; 5761 team->t.t_level = 0; 5762 team->t.t_active_level = 0; 5763 5764 /* free the worker threads */ 5765 for (f = 1; f < team->t.t_nproc; ++f) { 5766 KMP_DEBUG_ASSERT(team->t.t_threads[f]); 5767 if (__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) { 5768 KMP_COMPARE_AND_STORE_ACQ32(&(team->t.t_threads[f]->th.th_used_in_team), 5769 1, 2); 5770 } 5771 __kmp_free_thread(team->t.t_threads[f]); 5772 } 5773 5774 if (__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) { 5775 if (team->t.b) { 5776 // wake up thread at old location 5777 team->t.b->go_release(); 5778 if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) { 5779 for (f = 1; f < team->t.t_nproc; ++f) { 5780 if (team->t.b->sleep[f].sleep) { 5781 __kmp_atomic_resume_64( 5782 team->t.t_threads[f]->th.th_info.ds.ds_gtid, 5783 (kmp_atomic_flag_64<> *)NULL); 5784 } 5785 } 5786 } 5787 // Wait for threads to be removed from team 5788 for (int f = 1; f < team->t.t_nproc; ++f) { 5789 while (team->t.t_threads[f]->th.th_used_in_team.load() != 0) 5790 KMP_CPU_PAUSE(); 5791 } 5792 } 5793 } 5794 5795 for (f = 1; f < team->t.t_nproc; ++f) { 5796 team->t.t_threads[f] = NULL; 5797 } 5798 5799 if (team->t.t_max_nproc > 1 && 5800 __kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) { 5801 distributedBarrier::deallocate(team->t.b); 5802 team->t.b = NULL; 5803 } 5804 /* put the team back in the team pool */ 5805 /* TODO limit size of team pool, call reap_team if pool too large */ 5806 team->t.t_next_pool = CCAST(kmp_team_t *, __kmp_team_pool); 5807 __kmp_team_pool = (volatile kmp_team_t *)team; 5808 } else { // Check if team was created for primary threads in teams construct 5809 // See if first worker is a CG root 5810 KMP_DEBUG_ASSERT(team->t.t_threads[1] && 5811 team->t.t_threads[1]->th.th_cg_roots); 5812 if (team->t.t_threads[1]->th.th_cg_roots->cg_root == team->t.t_threads[1]) { 5813 // Clean up the CG root nodes on workers so that this team can be re-used 5814 for (f = 1; f < team->t.t_nproc; ++f) { 5815 kmp_info_t *thr = team->t.t_threads[f]; 5816 KMP_DEBUG_ASSERT(thr && thr->th.th_cg_roots && 5817 thr->th.th_cg_roots->cg_root == thr); 5818 // Pop current CG root off list 5819 kmp_cg_root_t *tmp = thr->th.th_cg_roots; 5820 thr->th.th_cg_roots = tmp->up; 5821 KA_TRACE(100, ("__kmp_free_team: Thread %p popping node %p and moving" 5822 " up to node %p. cg_nthreads was %d\n", 5823 thr, tmp, thr->th.th_cg_roots, tmp->cg_nthreads)); 5824 int i = tmp->cg_nthreads--; 5825 if (i == 1) { 5826 __kmp_free(tmp); // free CG if we are the last thread in it 5827 } 5828 // Restore current task's thread_limit from CG root 5829 if (thr->th.th_cg_roots) 5830 thr->th.th_current_task->td_icvs.thread_limit = 5831 thr->th.th_cg_roots->cg_thread_limit; 5832 } 5833 } 5834 } 5835 5836 KMP_MB(); 5837 } 5838 5839 /* reap the team. destroy it, reclaim all its resources and free its memory */ 5840 kmp_team_t *__kmp_reap_team(kmp_team_t *team) { 5841 kmp_team_t *next_pool = team->t.t_next_pool; 5842 5843 KMP_DEBUG_ASSERT(team); 5844 KMP_DEBUG_ASSERT(team->t.t_dispatch); 5845 KMP_DEBUG_ASSERT(team->t.t_disp_buffer); 5846 KMP_DEBUG_ASSERT(team->t.t_threads); 5847 KMP_DEBUG_ASSERT(team->t.t_argv); 5848 5849 /* TODO clean the threads that are a part of this? */ 5850 5851 /* free stuff */ 5852 __kmp_free_team_arrays(team); 5853 if (team->t.t_argv != &team->t.t_inline_argv[0]) 5854 __kmp_free((void *)team->t.t_argv); 5855 __kmp_free(team); 5856 5857 KMP_MB(); 5858 return next_pool; 5859 } 5860 5861 // Free the thread. 
Don't reap it, just place it on the pool of available
5862 // threads.
5863 //
5864 // Changes for Quad issue 527845: We need a predictable OMP tid <-> gtid
5865 // binding for the affinity mechanism to be useful.
5866 //
5867 // Now, we always keep the free list (__kmp_thread_pool) sorted by gtid.
5868 // However, we want to avoid a potential performance problem by always
5869 // scanning through the list to find the correct point at which to insert
5870 // the thread (potential N**2 behavior). To do this we keep track of the
5871 // last place a thread struct was inserted (__kmp_thread_pool_insert_pt).
5872 // With single-level parallelism, threads will always be added to the tail
5873 // of the list, kept track of by __kmp_thread_pool_insert_pt. With nested
5874 // parallelism, all bets are off and we may need to scan through the entire
5875 // free list.
5876 //
5877 // This change also has a potentially large performance benefit, for some
5878 // applications. Previously, as threads were freed from the hot team, they
5879 // would be placed back on the free list in inverse order. If the hot team
5880 // grew back to its original size, then the freed threads would be placed
5881 // back on the hot team in reverse order. This could cause bad cache
5882 // locality problems on programs where the size of the hot team regularly
5883 // grew and shrank.
5884 //
5885 // Now, for single-level parallelism, the OMP tid is always == gtid.
5886 void __kmp_free_thread(kmp_info_t *this_th) {
5887 int gtid;
5888 kmp_info_t **scan;
5889
5890 KA_TRACE(20, ("__kmp_free_thread: T#%d putting T#%d back on free pool.\n",
5891 __kmp_get_gtid(), this_th->th.th_info.ds.ds_gtid));
5892
5893 KMP_DEBUG_ASSERT(this_th);
5894
5895 // When moving a thread to the pool, switch it to wait on its own b_go flag
5896 // and clear its team pointer (NULL team).
5897 int b;
5898 kmp_balign_t *balign = this_th->th.th_bar;
5899 for (b = 0; b < bs_last_barrier; ++b) {
5900 if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG)
5901 balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG;
5902 balign[b].bb.team = NULL;
5903 balign[b].bb.leaf_kids = 0;
5904 }
5905 this_th->th.th_task_state = 0;
5906 this_th->th.th_reap_state = KMP_SAFE_TO_REAP;
5907
5908 /* put thread back on the free pool */
5909 TCW_PTR(this_th->th.th_team, NULL);
5910 TCW_PTR(this_th->th.th_root, NULL);
5911 TCW_PTR(this_th->th.th_dispatch, NULL); /* NOT NEEDED */
5912
5913 while (this_th->th.th_cg_roots) {
5914 this_th->th.th_cg_roots->cg_nthreads--;
5915 KA_TRACE(100, ("__kmp_free_thread: Thread %p decrement cg_nthreads on node"
5916 " %p of thread %p to %d\n",
5917 this_th, this_th->th.th_cg_roots,
5918 this_th->th.th_cg_roots->cg_root,
5919 this_th->th.th_cg_roots->cg_nthreads));
5920 kmp_cg_root_t *tmp = this_th->th.th_cg_roots;
5921 if (tmp->cg_root == this_th) { // Thread is a cg_root
5922 KMP_DEBUG_ASSERT(tmp->cg_nthreads == 0);
5923 KA_TRACE(
5924 5, ("__kmp_free_thread: Thread %p freeing node %p\n", this_th, tmp));
5925 this_th->th.th_cg_roots = tmp->up;
5926 __kmp_free(tmp);
5927 } else { // Worker thread
5928 if (tmp->cg_nthreads == 0) { // last thread leaves contention group
5929 __kmp_free(tmp);
5930 }
5931 this_th->th.th_cg_roots = NULL;
5932 break;
5933 }
5934 }
5935
5936 /* If the implicit task assigned to this thread can be used by other threads
5937 * -> multiple threads can share the data and try to free the task at
5938 * __kmp_reap_thread at exit.
This duplicate use of the task data can happen 5939 * with higher probability when hot team is disabled but can occurs even when 5940 * the hot team is enabled */ 5941 __kmp_free_implicit_task(this_th); 5942 this_th->th.th_current_task = NULL; 5943 5944 // If the __kmp_thread_pool_insert_pt is already past the new insert 5945 // point, then we need to re-scan the entire list. 5946 gtid = this_th->th.th_info.ds.ds_gtid; 5947 if (__kmp_thread_pool_insert_pt != NULL) { 5948 KMP_DEBUG_ASSERT(__kmp_thread_pool != NULL); 5949 if (__kmp_thread_pool_insert_pt->th.th_info.ds.ds_gtid > gtid) { 5950 __kmp_thread_pool_insert_pt = NULL; 5951 } 5952 } 5953 5954 // Scan down the list to find the place to insert the thread. 5955 // scan is the address of a link in the list, possibly the address of 5956 // __kmp_thread_pool itself. 5957 // 5958 // In the absence of nested parallelism, the for loop will have 0 iterations. 5959 if (__kmp_thread_pool_insert_pt != NULL) { 5960 scan = &(__kmp_thread_pool_insert_pt->th.th_next_pool); 5961 } else { 5962 scan = CCAST(kmp_info_t **, &__kmp_thread_pool); 5963 } 5964 for (; (*scan != NULL) && ((*scan)->th.th_info.ds.ds_gtid < gtid); 5965 scan = &((*scan)->th.th_next_pool)) 5966 ; 5967 5968 // Insert the new element on the list, and set __kmp_thread_pool_insert_pt 5969 // to its address. 5970 TCW_PTR(this_th->th.th_next_pool, *scan); 5971 __kmp_thread_pool_insert_pt = *scan = this_th; 5972 KMP_DEBUG_ASSERT((this_th->th.th_next_pool == NULL) || 5973 (this_th->th.th_info.ds.ds_gtid < 5974 this_th->th.th_next_pool->th.th_info.ds.ds_gtid)); 5975 TCW_4(this_th->th.th_in_pool, TRUE); 5976 __kmp_suspend_initialize_thread(this_th); 5977 __kmp_lock_suspend_mx(this_th); 5978 if (this_th->th.th_active == TRUE) { 5979 KMP_ATOMIC_INC(&__kmp_thread_pool_active_nth); 5980 this_th->th.th_active_in_pool = TRUE; 5981 } 5982 #if KMP_DEBUG 5983 else { 5984 KMP_DEBUG_ASSERT(this_th->th.th_active_in_pool == FALSE); 5985 } 5986 #endif 5987 __kmp_unlock_suspend_mx(this_th); 5988 5989 TCW_4(__kmp_nth, __kmp_nth - 1); 5990 5991 #ifdef KMP_ADJUST_BLOCKTIME 5992 /* Adjust blocktime back to user setting or default if necessary */ 5993 /* Middle initialization might never have occurred */ 5994 if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) { 5995 KMP_DEBUG_ASSERT(__kmp_avail_proc > 0); 5996 if (__kmp_nth <= __kmp_avail_proc) { 5997 __kmp_zero_bt = FALSE; 5998 } 5999 } 6000 #endif /* KMP_ADJUST_BLOCKTIME */ 6001 6002 KMP_MB(); 6003 } 6004 6005 /* ------------------------------------------------------------------------ */ 6006 6007 void *__kmp_launch_thread(kmp_info_t *this_thr) { 6008 #if OMP_PROFILING_SUPPORT 6009 ProfileTraceFile = getenv("LIBOMPTARGET_PROFILE"); 6010 // TODO: add a configuration option for time granularity 6011 if (ProfileTraceFile) 6012 llvm::timeTraceProfilerInitialize(500 /* us */, "libomptarget"); 6013 #endif 6014 6015 int gtid = this_thr->th.th_info.ds.ds_gtid; 6016 /* void *stack_data;*/ 6017 kmp_team_t **volatile pteam; 6018 6019 KMP_MB(); 6020 KA_TRACE(10, ("__kmp_launch_thread: T#%d start\n", gtid)); 6021 6022 if (__kmp_env_consistency_check) { 6023 this_thr->th.th_cons = __kmp_allocate_cons_stack(gtid); // ATT: Memory leak? 
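// The consistency-check ("cons") stack records the nesting of constructs
// entered by this thread so the runtime can diagnose improperly nested
// constructs when consistency checking is enabled.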
6024 } 6025 6026 #if OMPD_SUPPORT 6027 if (ompd_state & OMPD_ENABLE_BP) 6028 ompd_bp_thread_begin(); 6029 #endif 6030 6031 #if OMPT_SUPPORT 6032 ompt_data_t *thread_data = nullptr; 6033 if (ompt_enabled.enabled) { 6034 thread_data = &(this_thr->th.ompt_thread_info.thread_data); 6035 *thread_data = ompt_data_none; 6036 6037 this_thr->th.ompt_thread_info.state = ompt_state_overhead; 6038 this_thr->th.ompt_thread_info.wait_id = 0; 6039 this_thr->th.ompt_thread_info.idle_frame = OMPT_GET_FRAME_ADDRESS(0); 6040 this_thr->th.ompt_thread_info.parallel_flags = 0; 6041 if (ompt_enabled.ompt_callback_thread_begin) { 6042 ompt_callbacks.ompt_callback(ompt_callback_thread_begin)( 6043 ompt_thread_worker, thread_data); 6044 } 6045 this_thr->th.ompt_thread_info.state = ompt_state_idle; 6046 } 6047 #endif 6048 6049 /* This is the place where threads wait for work */ 6050 while (!TCR_4(__kmp_global.g.g_done)) { 6051 KMP_DEBUG_ASSERT(this_thr == __kmp_threads[gtid]); 6052 KMP_MB(); 6053 6054 /* wait for work to do */ 6055 KA_TRACE(20, ("__kmp_launch_thread: T#%d waiting for work\n", gtid)); 6056 6057 /* No tid yet since not part of a team */ 6058 __kmp_fork_barrier(gtid, KMP_GTID_DNE); 6059 6060 #if OMPT_SUPPORT 6061 if (ompt_enabled.enabled) { 6062 this_thr->th.ompt_thread_info.state = ompt_state_overhead; 6063 } 6064 #endif 6065 6066 pteam = &this_thr->th.th_team; 6067 6068 /* have we been allocated? */ 6069 if (TCR_SYNC_PTR(*pteam) && !TCR_4(__kmp_global.g.g_done)) { 6070 /* we were just woken up, so run our new task */ 6071 if (TCR_SYNC_PTR((*pteam)->t.t_pkfn) != NULL) { 6072 int rc; 6073 KA_TRACE(20, 6074 ("__kmp_launch_thread: T#%d(%d:%d) invoke microtask = %p\n", 6075 gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid), 6076 (*pteam)->t.t_pkfn)); 6077 6078 updateHWFPControl(*pteam); 6079 6080 #if OMPT_SUPPORT 6081 if (ompt_enabled.enabled) { 6082 this_thr->th.ompt_thread_info.state = ompt_state_work_parallel; 6083 } 6084 #endif 6085 6086 rc = (*pteam)->t.t_invoke(gtid); 6087 KMP_ASSERT(rc); 6088 6089 KMP_MB(); 6090 KA_TRACE(20, ("__kmp_launch_thread: T#%d(%d:%d) done microtask = %p\n", 6091 gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid), 6092 (*pteam)->t.t_pkfn)); 6093 } 6094 #if OMPT_SUPPORT 6095 if (ompt_enabled.enabled) { 6096 /* no frame set while outside task */ 6097 __ompt_get_task_info_object(0)->frame.exit_frame = ompt_data_none; 6098 6099 this_thr->th.ompt_thread_info.state = ompt_state_overhead; 6100 } 6101 #endif 6102 /* join barrier after parallel region */ 6103 __kmp_join_barrier(gtid); 6104 } 6105 } 6106 TCR_SYNC_PTR((intptr_t)__kmp_global.g.g_done); 6107 6108 #if OMPD_SUPPORT 6109 if (ompd_state & OMPD_ENABLE_BP) 6110 ompd_bp_thread_end(); 6111 #endif 6112 6113 #if OMPT_SUPPORT 6114 if (ompt_enabled.ompt_callback_thread_end) { 6115 ompt_callbacks.ompt_callback(ompt_callback_thread_end)(thread_data); 6116 } 6117 #endif 6118 6119 this_thr->th.th_task_team = NULL; 6120 /* run the destructors for the threadprivate data for this thread */ 6121 __kmp_common_destroy_gtid(gtid); 6122 6123 KA_TRACE(10, ("__kmp_launch_thread: T#%d done\n", gtid)); 6124 KMP_MB(); 6125 6126 #if OMP_PROFILING_SUPPORT 6127 llvm::timeTraceProfilerFinishThread(); 6128 #endif 6129 return this_thr; 6130 } 6131 6132 /* ------------------------------------------------------------------------ */ 6133 6134 void __kmp_internal_end_dest(void *specific_gtid) { 6135 // Make sure no significant bits are lost 6136 int gtid; 6137 __kmp_type_convert((kmp_intptr_t)specific_gtid - 1, >id); 6138 6139 KA_TRACE(30, 
("__kmp_internal_end_dest: T#%d\n", gtid)); 6140 /* NOTE: the gtid is stored as gitd+1 in the thread-local-storage 6141 * this is because 0 is reserved for the nothing-stored case */ 6142 6143 __kmp_internal_end_thread(gtid); 6144 } 6145 6146 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB 6147 6148 __attribute__((destructor)) void __kmp_internal_end_dtor(void) { 6149 __kmp_internal_end_atexit(); 6150 } 6151 6152 #endif 6153 6154 /* [Windows] josh: when the atexit handler is called, there may still be more 6155 than one thread alive */ 6156 void __kmp_internal_end_atexit(void) { 6157 KA_TRACE(30, ("__kmp_internal_end_atexit\n")); 6158 /* [Windows] 6159 josh: ideally, we want to completely shutdown the library in this atexit 6160 handler, but stat code that depends on thread specific data for gtid fails 6161 because that data becomes unavailable at some point during the shutdown, so 6162 we call __kmp_internal_end_thread instead. We should eventually remove the 6163 dependency on __kmp_get_specific_gtid in the stat code and use 6164 __kmp_internal_end_library to cleanly shutdown the library. 6165 6166 // TODO: Can some of this comment about GVS be removed? 6167 I suspect that the offending stat code is executed when the calling thread 6168 tries to clean up a dead root thread's data structures, resulting in GVS 6169 code trying to close the GVS structures for that thread, but since the stat 6170 code uses __kmp_get_specific_gtid to get the gtid with the assumption that 6171 the calling thread is cleaning up itself instead of another thread, it get 6172 confused. This happens because allowing a thread to unregister and cleanup 6173 another thread is a recent modification for addressing an issue. 6174 Based on the current design (20050722), a thread may end up 6175 trying to unregister another thread only if thread death does not trigger 6176 the calling of __kmp_internal_end_thread. For Linux* OS, there is the 6177 thread specific data destructor function to detect thread death. For 6178 Windows dynamic, there is DllMain(THREAD_DETACH). For Windows static, there 6179 is nothing. Thus, the workaround is applicable only for Windows static 6180 stat library. */ 6181 __kmp_internal_end_library(-1); 6182 #if KMP_OS_WINDOWS 6183 __kmp_close_console(); 6184 #endif 6185 } 6186 6187 static void __kmp_reap_thread(kmp_info_t *thread, int is_root) { 6188 // It is assumed __kmp_forkjoin_lock is acquired. 6189 6190 int gtid; 6191 6192 KMP_DEBUG_ASSERT(thread != NULL); 6193 6194 gtid = thread->th.th_info.ds.ds_gtid; 6195 6196 if (!is_root) { 6197 if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) { 6198 /* Assume the threads are at the fork barrier here */ 6199 KA_TRACE( 6200 20, ("__kmp_reap_thread: releasing T#%d from fork barrier for reap\n", 6201 gtid)); 6202 if (__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) { 6203 while ( 6204 !KMP_COMPARE_AND_STORE_ACQ32(&(thread->th.th_used_in_team), 0, 3)) 6205 KMP_CPU_PAUSE(); 6206 __kmp_resume_32(gtid, (kmp_flag_32<false, false> *)NULL); 6207 } else { 6208 /* Need release fence here to prevent seg faults for tree forkjoin 6209 barrier (GEH) */ 6210 kmp_flag_64<> flag(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go, 6211 thread); 6212 __kmp_release_64(&flag); 6213 } 6214 } 6215 6216 // Terminate OS thread. 6217 __kmp_reap_worker(thread); 6218 6219 // The thread was killed asynchronously. If it was actively 6220 // spinning in the thread pool, decrement the global count. 
6221 // 6222 // There is a small timing hole here - if the worker thread was just waking 6223 // up after sleeping in the pool, had reset it's th_active_in_pool flag but 6224 // not decremented the global counter __kmp_thread_pool_active_nth yet, then 6225 // the global counter might not get updated. 6226 // 6227 // Currently, this can only happen as the library is unloaded, 6228 // so there are no harmful side effects. 6229 if (thread->th.th_active_in_pool) { 6230 thread->th.th_active_in_pool = FALSE; 6231 KMP_ATOMIC_DEC(&__kmp_thread_pool_active_nth); 6232 KMP_DEBUG_ASSERT(__kmp_thread_pool_active_nth >= 0); 6233 } 6234 } 6235 6236 __kmp_free_implicit_task(thread); 6237 6238 // Free the fast memory for tasking 6239 #if USE_FAST_MEMORY 6240 __kmp_free_fast_memory(thread); 6241 #endif /* USE_FAST_MEMORY */ 6242 6243 __kmp_suspend_uninitialize_thread(thread); 6244 6245 KMP_DEBUG_ASSERT(__kmp_threads[gtid] == thread); 6246 TCW_SYNC_PTR(__kmp_threads[gtid], NULL); 6247 6248 --__kmp_all_nth; 6249 // __kmp_nth was decremented when thread is added to the pool. 6250 6251 #ifdef KMP_ADJUST_BLOCKTIME 6252 /* Adjust blocktime back to user setting or default if necessary */ 6253 /* Middle initialization might never have occurred */ 6254 if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) { 6255 KMP_DEBUG_ASSERT(__kmp_avail_proc > 0); 6256 if (__kmp_nth <= __kmp_avail_proc) { 6257 __kmp_zero_bt = FALSE; 6258 } 6259 } 6260 #endif /* KMP_ADJUST_BLOCKTIME */ 6261 6262 /* free the memory being used */ 6263 if (__kmp_env_consistency_check) { 6264 if (thread->th.th_cons) { 6265 __kmp_free_cons_stack(thread->th.th_cons); 6266 thread->th.th_cons = NULL; 6267 } 6268 } 6269 6270 if (thread->th.th_pri_common != NULL) { 6271 __kmp_free(thread->th.th_pri_common); 6272 thread->th.th_pri_common = NULL; 6273 } 6274 6275 if (thread->th.th_task_state_memo_stack != NULL) { 6276 __kmp_free(thread->th.th_task_state_memo_stack); 6277 thread->th.th_task_state_memo_stack = NULL; 6278 } 6279 6280 #if KMP_USE_BGET 6281 if (thread->th.th_local.bget_data != NULL) { 6282 __kmp_finalize_bget(thread); 6283 } 6284 #endif 6285 6286 #if KMP_AFFINITY_SUPPORTED 6287 if (thread->th.th_affin_mask != NULL) { 6288 KMP_CPU_FREE(thread->th.th_affin_mask); 6289 thread->th.th_affin_mask = NULL; 6290 } 6291 #endif /* KMP_AFFINITY_SUPPORTED */ 6292 6293 #if KMP_USE_HIER_SCHED 6294 if (thread->th.th_hier_bar_data != NULL) { 6295 __kmp_free(thread->th.th_hier_bar_data); 6296 thread->th.th_hier_bar_data = NULL; 6297 } 6298 #endif 6299 6300 __kmp_reap_team(thread->th.th_serial_team); 6301 thread->th.th_serial_team = NULL; 6302 __kmp_free(thread); 6303 6304 KMP_MB(); 6305 6306 } // __kmp_reap_thread 6307 6308 static void __kmp_itthash_clean(kmp_info_t *th) { 6309 #if USE_ITT_NOTIFY 6310 if (__kmp_itt_region_domains.count > 0) { 6311 for (int i = 0; i < KMP_MAX_FRAME_DOMAINS; ++i) { 6312 kmp_itthash_entry_t *bucket = __kmp_itt_region_domains.buckets[i]; 6313 while (bucket) { 6314 kmp_itthash_entry_t *next = bucket->next_in_bucket; 6315 __kmp_thread_free(th, bucket); 6316 bucket = next; 6317 } 6318 } 6319 } 6320 if (__kmp_itt_barrier_domains.count > 0) { 6321 for (int i = 0; i < KMP_MAX_FRAME_DOMAINS; ++i) { 6322 kmp_itthash_entry_t *bucket = __kmp_itt_barrier_domains.buckets[i]; 6323 while (bucket) { 6324 kmp_itthash_entry_t *next = bucket->next_in_bucket; 6325 __kmp_thread_free(th, bucket); 6326 bucket = next; 6327 } 6328 } 6329 } 6330 #endif 6331 } 6332 6333 static void __kmp_internal_end(void) { 6334 int i; 6335 6336 /* First, unregister the library 
*/
6337 __kmp_unregister_library();
6338
6339 #if KMP_OS_WINDOWS
6340 /* In Win static library, we can't tell when a root actually dies, so we
6341 reclaim the data structures for any root threads that have died but not
6342 unregistered themselves, in order to shut down cleanly.
6343 In Win dynamic library we also can't tell when a thread dies. */
6344 __kmp_reclaim_dead_roots(); // AC: moved here to always clean resources of
6345 // dead roots
6346 #endif
6347
6348 for (i = 0; i < __kmp_threads_capacity; i++)
6349 if (__kmp_root[i])
6350 if (__kmp_root[i]->r.r_active)
6351 break;
6352 KMP_MB(); /* Flush all pending memory write invalidates. */
6353 TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6354
6355 if (i < __kmp_threads_capacity) {
6356 #if KMP_USE_MONITOR
6357 // 2009-09-08 (lev): Other alive roots found. Why do we kill the monitor??
6358 KMP_MB(); /* Flush all pending memory write invalidates. */
6359
6360 // Need to check that monitor was initialized before reaping it. If we are
6361 // called from __kmp_atfork_child (which sets __kmp_init_parallel = 0), then
6362 // __kmp_monitor will appear to contain valid data, but it is only valid in
6363 // the parent process, not the child.
6364 // New behavior (201008): instead of keying off of the flag
6365 // __kmp_init_parallel, the monitor thread creation is keyed off
6366 // of the new flag __kmp_init_monitor.
6367 __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
6368 if (TCR_4(__kmp_init_monitor)) {
6369 __kmp_reap_monitor(&__kmp_monitor);
6370 TCW_4(__kmp_init_monitor, 0);
6371 }
6372 __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
6373 KA_TRACE(10, ("__kmp_internal_end: monitor reaped\n"));
6374 #endif // KMP_USE_MONITOR
6375 } else {
6376 /* TODO move this to cleanup code */
6377 #ifdef KMP_DEBUG
6378 /* make sure that everything has properly ended */
6379 for (i = 0; i < __kmp_threads_capacity; i++) {
6380 if (__kmp_root[i]) {
6381 // KMP_ASSERT( ! KMP_UBER_GTID( i ) ); // AC:
6382 // there can be uber threads alive here
6383 KMP_ASSERT(!__kmp_root[i]->r.r_active); // TODO: can they be active?
6384 }
6385 }
6386 #endif
6387
6388 KMP_MB();
6389
6390 // Reap the worker threads.
6391 // This is valid for now, but be careful if threads are reaped sooner.
6392 while (__kmp_thread_pool != NULL) { // Loop thru all the threads in the pool.
6393 // Get the next thread from the pool.
6394 kmp_info_t *thread = CCAST(kmp_info_t *, __kmp_thread_pool);
6395 __kmp_thread_pool = thread->th.th_next_pool;
6396 // Reap it.
6397 KMP_DEBUG_ASSERT(thread->th.th_reap_state == KMP_SAFE_TO_REAP);
6398 thread->th.th_next_pool = NULL;
6399 thread->th.th_in_pool = FALSE;
6400 __kmp_reap_thread(thread, 0);
6401 }
6402 __kmp_thread_pool_insert_pt = NULL;
6403
6404 // Reap teams.
6405 while (__kmp_team_pool != NULL) { // Loop thru all the teams in the pool.
6406 // Get the next team from the pool.
6407 kmp_team_t *team = CCAST(kmp_team_t *, __kmp_team_pool);
6408 __kmp_team_pool = team->t.t_next_pool;
6409 // Reap it.
6410 team->t.t_next_pool = NULL;
6411 __kmp_reap_team(team);
6412 }
6413
6414 __kmp_reap_task_teams();
6415
6416 #if KMP_OS_UNIX
6417 // Threads that are not reaped should not access any resources since they
6418 // are going to be deallocated soon, so the shutdown sequence should wait
6419 // until all threads either exit the final spin-waiting loop or begin
6420 // sleeping after the given blocktime.
6421 for (i = 0; i < __kmp_threads_capacity; i++) { 6422 kmp_info_t *thr = __kmp_threads[i]; 6423 while (thr && KMP_ATOMIC_LD_ACQ(&thr->th.th_blocking)) 6424 KMP_CPU_PAUSE(); 6425 } 6426 #endif 6427 6428 for (i = 0; i < __kmp_threads_capacity; ++i) { 6429 // TBD: Add some checking... 6430 // Something like KMP_DEBUG_ASSERT( __kmp_thread[ i ] == NULL ); 6431 } 6432 6433 /* Make sure all threadprivate destructors get run by joining with all 6434 worker threads before resetting this flag */ 6435 TCW_SYNC_4(__kmp_init_common, FALSE); 6436 6437 KA_TRACE(10, ("__kmp_internal_end: all workers reaped\n")); 6438 KMP_MB(); 6439 6440 #if KMP_USE_MONITOR 6441 // See note above: One of the possible fixes for CQ138434 / CQ140126 6442 // 6443 // FIXME: push both code fragments down and CSE them? 6444 // push them into __kmp_cleanup() ? 6445 __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock); 6446 if (TCR_4(__kmp_init_monitor)) { 6447 __kmp_reap_monitor(&__kmp_monitor); 6448 TCW_4(__kmp_init_monitor, 0); 6449 } 6450 __kmp_release_bootstrap_lock(&__kmp_monitor_lock); 6451 KA_TRACE(10, ("__kmp_internal_end: monitor reaped\n")); 6452 #endif 6453 } /* else !__kmp_global.t_active */ 6454 TCW_4(__kmp_init_gtid, FALSE); 6455 KMP_MB(); /* Flush all pending memory write invalidates. */ 6456 6457 __kmp_cleanup(); 6458 #if OMPT_SUPPORT 6459 ompt_fini(); 6460 #endif 6461 } 6462 6463 void __kmp_internal_end_library(int gtid_req) { 6464 /* if we have already cleaned up, don't try again, it wouldn't be pretty */ 6465 /* this shouldn't be a race condition because __kmp_internal_end() is the 6466 only place to clear __kmp_serial_init */ 6467 /* we'll check this later too, after we get the lock */ 6468 // 2009-09-06: We do not set g_abort without setting g_done. This check looks 6469 // redundant, because the next check will work in any case. 6470 if (__kmp_global.g.g_abort) { 6471 KA_TRACE(11, ("__kmp_internal_end_library: abort, exiting\n")); 6472 /* TODO abort? */ 6473 return; 6474 } 6475 if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) { 6476 KA_TRACE(10, ("__kmp_internal_end_library: already finished\n")); 6477 return; 6478 } 6479 6480 // If hidden helper team has been initialized, we need to deinit it 6481 if (TCR_4(__kmp_init_hidden_helper) && 6482 !TCR_4(__kmp_hidden_helper_team_done)) { 6483 TCW_SYNC_4(__kmp_hidden_helper_team_done, TRUE); 6484 // First release the main thread to let it continue its work 6485 __kmp_hidden_helper_main_thread_release(); 6486 // Wait until the hidden helper team has been destroyed 6487 __kmp_hidden_helper_threads_deinitz_wait(); 6488 } 6489 6490 KMP_MB(); /* Flush all pending memory write invalidates. */ 6491 /* find out who we are and what we should do */ 6492 { 6493 int gtid = (gtid_req >= 0) ? gtid_req : __kmp_gtid_get_specific(); 6494 KA_TRACE( 6495 10, ("__kmp_internal_end_library: enter T#%d (%d)\n", gtid, gtid_req)); 6496 if (gtid == KMP_GTID_SHUTDOWN) { 6497 KA_TRACE(10, ("__kmp_internal_end_library: !__kmp_init_runtime, system " 6498 "already shutdown\n")); 6499 return; 6500 } else if (gtid == KMP_GTID_MONITOR) { 6501 KA_TRACE(10, ("__kmp_internal_end_library: monitor thread, gtid not " 6502 "registered, or system shutdown\n")); 6503 return; 6504 } else if (gtid == KMP_GTID_DNE) { 6505 KA_TRACE(10, ("__kmp_internal_end_library: gtid not registered or system " 6506 "shutdown\n")); 6507 /* we don't know who we are, but we may still shutdown the library */ 6508 } else if (KMP_UBER_GTID(gtid)) { 6509 /* unregister ourselves as an uber thread. 
gtid is no longer valid */ 6510 if (__kmp_root[gtid]->r.r_active) { 6511 __kmp_global.g.g_abort = -1; 6512 TCW_SYNC_4(__kmp_global.g.g_done, TRUE); 6513 __kmp_unregister_library(); 6514 KA_TRACE(10, 6515 ("__kmp_internal_end_library: root still active, abort T#%d\n", 6516 gtid)); 6517 return; 6518 } else { 6519 __kmp_itthash_clean(__kmp_threads[gtid]); 6520 KA_TRACE( 6521 10, 6522 ("__kmp_internal_end_library: unregistering sibling T#%d\n", gtid)); 6523 __kmp_unregister_root_current_thread(gtid); 6524 } 6525 } else { 6526 /* worker threads may call this function through the atexit handler, if they 6527 * call exit() */ 6528 /* For now, skip the usual subsequent processing and just dump the debug buffer. 6529 TODO: do a thorough shutdown instead */ 6530 #ifdef DUMP_DEBUG_ON_EXIT 6531 if (__kmp_debug_buf) 6532 __kmp_dump_debug_buffer(); 6533 #endif 6534 // added unregister library call here when we switch to shm linux 6535 // if we don't, it will leave lots of files in /dev/shm 6536 // cleanup shared memory file before exiting. 6537 __kmp_unregister_library(); 6538 return; 6539 } 6540 } 6541 /* synchronize the termination process */ 6542 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock); 6543 6544 /* have we already finished */ 6545 if (__kmp_global.g.g_abort) { 6546 KA_TRACE(10, ("__kmp_internal_end_library: abort, exiting\n")); 6547 /* TODO abort? */ 6548 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 6549 return; 6550 } 6551 if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) { 6552 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 6553 return; 6554 } 6555 6556 /* We need this lock to enforce mutex between this reading of 6557 __kmp_threads_capacity and the writing by __kmp_register_root. 6558 Alternatively, we can use a counter of roots that is atomically updated by 6559 __kmp_get_global_thread_id_reg, __kmp_do_serial_initialize and 6560 __kmp_internal_end_*. */ 6561 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock); 6562 6563 /* now we can safely conduct the actual termination */ 6564 __kmp_internal_end(); 6565 6566 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); 6567 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 6568 6569 KA_TRACE(10, ("__kmp_internal_end_library: exit\n")); 6570 6571 #ifdef DUMP_DEBUG_ON_EXIT 6572 if (__kmp_debug_buf) 6573 __kmp_dump_debug_buffer(); 6574 #endif 6575 6576 #if KMP_OS_WINDOWS 6577 __kmp_close_console(); 6578 #endif 6579 6580 __kmp_fini_allocator(); 6581 6582 } // __kmp_internal_end_library 6583 6584 void __kmp_internal_end_thread(int gtid_req) { 6585 int i; 6586 6587 /* if we have already cleaned up, don't try again, it wouldn't be pretty */ 6588 /* this shouldn't be a race condition because __kmp_internal_end() is the 6589 * only place to clear __kmp_serial_init */ 6590 /* we'll check this later too, after we get the lock */ 6591 // 2009-09-06: We do not set g_abort without setting g_done. This check looks 6592 // redundant, because the next check will work in any case. 6593 if (__kmp_global.g.g_abort) { 6594 KA_TRACE(11, ("__kmp_internal_end_thread: abort, exiting\n")); 6595 /* TODO abort? 
*/ 6596 return; 6597 } 6598 if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) { 6599 KA_TRACE(10, ("__kmp_internal_end_thread: already finished\n")); 6600 return; 6601 } 6602 6603 // If hidden helper team has been initialized, we need to deinit it 6604 if (TCR_4(__kmp_init_hidden_helper) && 6605 !TCR_4(__kmp_hidden_helper_team_done)) { 6606 TCW_SYNC_4(__kmp_hidden_helper_team_done, TRUE); 6607 // First release the main thread to let it continue its work 6608 __kmp_hidden_helper_main_thread_release(); 6609 // Wait until the hidden helper team has been destroyed 6610 __kmp_hidden_helper_threads_deinitz_wait(); 6611 } 6612 6613 KMP_MB(); /* Flush all pending memory write invalidates. */ 6614 6615 /* find out who we are and what we should do */ 6616 { 6617 int gtid = (gtid_req >= 0) ? gtid_req : __kmp_gtid_get_specific(); 6618 KA_TRACE(10, 6619 ("__kmp_internal_end_thread: enter T#%d (%d)\n", gtid, gtid_req)); 6620 if (gtid == KMP_GTID_SHUTDOWN) { 6621 KA_TRACE(10, ("__kmp_internal_end_thread: !__kmp_init_runtime, system " 6622 "already shutdown\n")); 6623 return; 6624 } else if (gtid == KMP_GTID_MONITOR) { 6625 KA_TRACE(10, ("__kmp_internal_end_thread: monitor thread, gtid not " 6626 "registered, or system shutdown\n")); 6627 return; 6628 } else if (gtid == KMP_GTID_DNE) { 6629 KA_TRACE(10, ("__kmp_internal_end_thread: gtid not registered or system " 6630 "shutdown\n")); 6631 return; 6632 /* we don't know who we are */ 6633 } else if (KMP_UBER_GTID(gtid)) { 6634 /* unregister ourselves as an uber thread. gtid is no longer valid */ 6635 if (__kmp_root[gtid]->r.r_active) { 6636 __kmp_global.g.g_abort = -1; 6637 TCW_SYNC_4(__kmp_global.g.g_done, TRUE); 6638 KA_TRACE(10, 6639 ("__kmp_internal_end_thread: root still active, abort T#%d\n", 6640 gtid)); 6641 return; 6642 } else { 6643 KA_TRACE(10, ("__kmp_internal_end_thread: unregistering sibling T#%d\n", 6644 gtid)); 6645 __kmp_unregister_root_current_thread(gtid); 6646 } 6647 } else { 6648 /* just a worker thread, let's leave */ 6649 KA_TRACE(10, ("__kmp_internal_end_thread: worker thread T#%d\n", gtid)); 6650 6651 if (gtid >= 0) { 6652 __kmp_threads[gtid]->th.th_task_team = NULL; 6653 } 6654 6655 KA_TRACE(10, 6656 ("__kmp_internal_end_thread: worker thread done, exiting T#%d\n", 6657 gtid)); 6658 return; 6659 } 6660 } 6661 #if KMP_DYNAMIC_LIB 6662 if (__kmp_pause_status != kmp_hard_paused) 6663 // AC: lets not shutdown the dynamic library at the exit of uber thread, 6664 // because we will better shutdown later in the library destructor. 6665 { 6666 KA_TRACE(10, ("__kmp_internal_end_thread: exiting T#%d\n", gtid_req)); 6667 return; 6668 } 6669 #endif 6670 /* synchronize the termination process */ 6671 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock); 6672 6673 /* have we already finished */ 6674 if (__kmp_global.g.g_abort) { 6675 KA_TRACE(10, ("__kmp_internal_end_thread: abort, exiting\n")); 6676 /* TODO abort? */ 6677 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 6678 return; 6679 } 6680 if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) { 6681 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 6682 return; 6683 } 6684 6685 /* We need this lock to enforce mutex between this reading of 6686 __kmp_threads_capacity and the writing by __kmp_register_root. 6687 Alternatively, we can use a counter of roots that is atomically updated by 6688 __kmp_get_global_thread_id_reg, __kmp_do_serial_initialize and 6689 __kmp_internal_end_*. */ 6690 6691 /* should we finish the run-time? are all siblings done? 
*/ 6692 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock); 6693 6694 for (i = 0; i < __kmp_threads_capacity; ++i) { 6695 if (KMP_UBER_GTID(i)) { 6696 KA_TRACE( 6697 10, 6698 ("__kmp_internal_end_thread: remaining sibling task: gtid==%d\n", i)); 6699 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); 6700 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 6701 return; 6702 } 6703 } 6704 6705 /* now we can safely conduct the actual termination */ 6706 6707 __kmp_internal_end(); 6708 6709 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); 6710 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 6711 6712 KA_TRACE(10, ("__kmp_internal_end_thread: exit T#%d\n", gtid_req)); 6713 6714 #ifdef DUMP_DEBUG_ON_EXIT 6715 if (__kmp_debug_buf) 6716 __kmp_dump_debug_buffer(); 6717 #endif 6718 } // __kmp_internal_end_thread 6719 6720 // ----------------------------------------------------------------------------- 6721 // Library registration stuff. 6722 6723 static long __kmp_registration_flag = 0; 6724 // Random value used to indicate library initialization. 6725 static char *__kmp_registration_str = NULL; 6726 // Value to be saved in env var __KMP_REGISTERED_LIB_<pid>. 6727 6728 static inline char *__kmp_reg_status_name() { 6729 /* On RHEL 3u5 if linked statically, getpid() returns different values in 6730 each thread. If registration and unregistration go in different threads 6731 (omp_misc_other_root_exit.cpp test case), the name of registered_lib_env 6732 env var can not be found, because the name will contain different pid. */ 6733 // macOS* complains about name being too long with additional getuid() 6734 #if KMP_OS_UNIX && !KMP_OS_DARWIN && KMP_DYNAMIC_LIB 6735 return __kmp_str_format("__KMP_REGISTERED_LIB_%d_%d", (int)getpid(), 6736 (int)getuid()); 6737 #else 6738 return __kmp_str_format("__KMP_REGISTERED_LIB_%d", (int)getpid()); 6739 #endif 6740 } // __kmp_reg_status_get 6741 6742 #if defined(KMP_USE_SHM) 6743 // If /dev/shm is not accessible, we will create a temporary file under /tmp. 6744 char *temp_reg_status_file_name = nullptr; 6745 #endif 6746 6747 void __kmp_register_library_startup(void) { 6748 6749 char *name = __kmp_reg_status_name(); // Name of the environment variable. 6750 int done = 0; 6751 union { 6752 double dtime; 6753 long ltime; 6754 } time; 6755 #if KMP_ARCH_X86 || KMP_ARCH_X86_64 6756 __kmp_initialize_system_tick(); 6757 #endif 6758 __kmp_read_system_time(&time.dtime); 6759 __kmp_registration_flag = 0xCAFE0000L | (time.ltime & 0x0000FFFFL); 6760 __kmp_registration_str = 6761 __kmp_str_format("%p-%lx-%s", &__kmp_registration_flag, 6762 __kmp_registration_flag, KMP_LIBRARY_FILE); 6763 6764 KA_TRACE(50, ("__kmp_register_library_startup: %s=\"%s\"\n", name, 6765 __kmp_registration_str)); 6766 6767 while (!done) { 6768 6769 char *value = NULL; // Actual value of the environment variable. 6770 6771 #if defined(KMP_USE_SHM) 6772 char *shm_name = __kmp_str_format("/%s", name); 6773 int shm_preexist = 0; 6774 char *data1; 6775 int fd1 = shm_open(shm_name, O_CREAT | O_EXCL | O_RDWR, 0666); 6776 if ((fd1 == -1) && (errno == EEXIST)) { 6777 // file didn't open because it already exists. 
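// (shm_open() with O_CREAT | O_EXCL fails with EEXIST when the shared-memory
// object already exists, so the code below reopens the existing segment
// read-write and treats its contents as the registration record.)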
6778 // try opening existing file 6779 fd1 = shm_open(shm_name, O_RDWR, 0666); 6780 if (fd1 == -1) { // file didn't open 6781 // error out here 6782 __kmp_fatal(KMP_MSG(FunctionError, "Can't open SHM"), KMP_ERR(0), 6783 __kmp_msg_null); 6784 } else { 6785 // able to open existing file 6786 shm_preexist = 1; 6787 } 6788 } else if (fd1 == -1) { 6789 // SHM didn't open; it was due to error other than already exists. Try to 6790 // create a temp file under /tmp. 6791 // TODO: /tmp might not always be the temporary directory. For now we will 6792 // not consider TMPDIR. If /tmp is not accessible, we simply error out. 6793 char *temp_file_name = __kmp_str_format("/tmp/%sXXXXXX", name); 6794 fd1 = mkstemp(temp_file_name); 6795 if (fd1 == -1) { 6796 // error out here. 6797 __kmp_fatal(KMP_MSG(FunctionError, "Can't open TEMP"), KMP_ERR(errno), 6798 __kmp_msg_null); 6799 } 6800 temp_reg_status_file_name = temp_file_name; 6801 } 6802 if (shm_preexist == 0) { 6803 // we created SHM now set size 6804 if (ftruncate(fd1, SHM_SIZE) == -1) { 6805 // error occured setting size; 6806 __kmp_fatal(KMP_MSG(FunctionError, "Can't set size of SHM"), 6807 KMP_ERR(errno), __kmp_msg_null); 6808 } 6809 } 6810 data1 = 6811 (char *)mmap(0, SHM_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED, fd1, 0); 6812 if (data1 == MAP_FAILED) { 6813 // failed to map shared memory 6814 __kmp_fatal(KMP_MSG(FunctionError, "Can't map SHM"), KMP_ERR(errno), 6815 __kmp_msg_null); 6816 } 6817 if (shm_preexist == 0) { // set data to SHM, set value 6818 KMP_STRCPY_S(data1, SHM_SIZE, __kmp_registration_str); 6819 } 6820 // Read value from either what we just wrote or existing file. 6821 value = __kmp_str_format("%s", data1); // read value from SHM 6822 munmap(data1, SHM_SIZE); 6823 close(fd1); 6824 #else // Windows and unix with static library 6825 // Set environment variable, but do not overwrite if it is exist. 6826 __kmp_env_set(name, __kmp_registration_str, 0); 6827 // read value to see if it got set 6828 value = __kmp_env_get(name); 6829 #endif 6830 6831 if (value != NULL && strcmp(value, __kmp_registration_str) == 0) { 6832 done = 1; // Ok, environment variable set successfully, exit the loop. 6833 } else { 6834 // Oops. Write failed. Another copy of OpenMP RTL is in memory. 6835 // Check whether it alive or dead. 6836 int neighbor = 0; // 0 -- unknown status, 1 -- alive, 2 -- dead. 6837 char *tail = value; 6838 char *flag_addr_str = NULL; 6839 char *flag_val_str = NULL; 6840 char const *file_name = NULL; 6841 __kmp_str_split(tail, '-', &flag_addr_str, &tail); 6842 __kmp_str_split(tail, '-', &flag_val_str, &tail); 6843 file_name = tail; 6844 if (tail != NULL) { 6845 unsigned long *flag_addr = 0; 6846 unsigned long flag_val = 0; 6847 KMP_SSCANF(flag_addr_str, "%p", RCAST(void **, &flag_addr)); 6848 KMP_SSCANF(flag_val_str, "%lx", &flag_val); 6849 if (flag_addr != 0 && flag_val != 0 && strcmp(file_name, "") != 0) { 6850 // First, check whether environment-encoded address is mapped into 6851 // addr space. 6852 // If so, dereference it to see if it still has the right value. 6853 if (__kmp_is_address_mapped(flag_addr) && *flag_addr == flag_val) { 6854 neighbor = 1; 6855 } else { 6856 // If not, then we know the other copy of the library is no longer 6857 // running. 6858 neighbor = 2; 6859 } 6860 } 6861 } 6862 switch (neighbor) { 6863 case 0: // Cannot parse environment variable -- neighbor status unknown. 6864 // Assume it is the incompatible format of future version of the 6865 // library. Assume the other library is alive. 6866 // WARN( ... 
); // TODO: Issue a warning. 6867 file_name = "unknown library"; 6868 KMP_FALLTHROUGH(); 6869 // Attention! Falling to the next case. That's intentional. 6870 case 1: { // Neighbor is alive. 6871 // Check it is allowed. 6872 char *duplicate_ok = __kmp_env_get("KMP_DUPLICATE_LIB_OK"); 6873 if (!__kmp_str_match_true(duplicate_ok)) { 6874 // That's not allowed. Issue fatal error. 6875 __kmp_fatal(KMP_MSG(DuplicateLibrary, KMP_LIBRARY_FILE, file_name), 6876 KMP_HNT(DuplicateLibrary), __kmp_msg_null); 6877 } 6878 KMP_INTERNAL_FREE(duplicate_ok); 6879 __kmp_duplicate_library_ok = 1; 6880 done = 1; // Exit the loop. 6881 } break; 6882 case 2: { // Neighbor is dead. 6883 6884 #if defined(KMP_USE_SHM) 6885 // close shared memory. 6886 shm_unlink(shm_name); // this removes file in /dev/shm 6887 #else 6888 // Clear the variable and try to register library again. 6889 __kmp_env_unset(name); 6890 #endif 6891 } break; 6892 default: { 6893 KMP_DEBUG_ASSERT(0); 6894 } break; 6895 } 6896 } 6897 KMP_INTERNAL_FREE((void *)value); 6898 #if defined(KMP_USE_SHM) 6899 KMP_INTERNAL_FREE((void *)shm_name); 6900 #endif 6901 } // while 6902 KMP_INTERNAL_FREE((void *)name); 6903 6904 } // func __kmp_register_library_startup 6905 6906 void __kmp_unregister_library(void) { 6907 6908 char *name = __kmp_reg_status_name(); 6909 char *value = NULL; 6910 6911 #if defined(KMP_USE_SHM) 6912 bool use_shm = true; 6913 char *shm_name = __kmp_str_format("/%s", name); 6914 int fd1 = shm_open(shm_name, O_RDONLY, 0666); 6915 if (fd1 == -1) { 6916 // File did not open. Try the temporary file. 6917 use_shm = false; 6918 KMP_DEBUG_ASSERT(temp_reg_status_file_name); 6919 fd1 = open(temp_reg_status_file_name, O_RDONLY); 6920 if (fd1 == -1) { 6921 // give it up now. 6922 return; 6923 } 6924 } 6925 char *data1 = (char *)mmap(0, SHM_SIZE, PROT_READ, MAP_SHARED, fd1, 0); 6926 if (data1 != MAP_FAILED) { 6927 value = __kmp_str_format("%s", data1); // read value from SHM 6928 munmap(data1, SHM_SIZE); 6929 } 6930 close(fd1); 6931 #else 6932 value = __kmp_env_get(name); 6933 #endif 6934 6935 KMP_DEBUG_ASSERT(__kmp_registration_flag != 0); 6936 KMP_DEBUG_ASSERT(__kmp_registration_str != NULL); 6937 if (value != NULL && strcmp(value, __kmp_registration_str) == 0) { 6938 // Ok, this is our variable. Delete it. 6939 #if defined(KMP_USE_SHM) 6940 if (use_shm) { 6941 shm_unlink(shm_name); // this removes file in /dev/shm 6942 } else { 6943 KMP_DEBUG_ASSERT(temp_reg_status_file_name); 6944 unlink(temp_reg_status_file_name); // this removes the temp file 6945 } 6946 #else 6947 __kmp_env_unset(name); 6948 #endif 6949 } 6950 6951 #if defined(KMP_USE_SHM) 6952 KMP_INTERNAL_FREE(shm_name); 6953 if (!use_shm) { 6954 KMP_DEBUG_ASSERT(temp_reg_status_file_name); 6955 KMP_INTERNAL_FREE(temp_reg_status_file_name); 6956 } 6957 #endif 6958 6959 KMP_INTERNAL_FREE(__kmp_registration_str); 6960 KMP_INTERNAL_FREE(value); 6961 KMP_INTERNAL_FREE(name); 6962 6963 __kmp_registration_flag = 0; 6964 __kmp_registration_str = NULL; 6965 6966 } // __kmp_unregister_library 6967 6968 // End of Library registration stuff. 
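// Illustrative sketch (not part of the runtime, kept out of the build): shows
// how a registration value of the form "%p-%lx-%s" written by
// __kmp_register_library_startup() -- flag address, flag value, library file
// name -- could be decoded. The helper name example_decode_registration and
// the fixed 1024-byte scratch buffer are assumptions made for this example
// only; the runtime itself splits the string with __kmp_str_split() above.
// If a live neighbor is detected, setting KMP_DUPLICATE_LIB_OK=TRUE allows
// both copies of the library to coexist (see case 1 above).
#if 0
#include <cstdio>
#include <cstring>

// Returns 1 on success and fills the three output fields, 0 otherwise.
// *file_name points into a static scratch buffer and may itself contain '-';
// only the first two dashes delimit fields.
static int example_decode_registration(const char *value, void **flag_addr,
                                        unsigned long *flag_val,
                                        const char **file_name) {
  static char buf[1024]; // same bound as SHM_SIZE used for the shared segment
  if (value == NULL || strlen(value) >= sizeof(buf))
    return 0;
  strcpy(buf, value);
  char *dash1 = strchr(buf, '-');
  if (dash1 == NULL)
    return 0;
  char *dash2 = strchr(dash1 + 1, '-');
  if (dash2 == NULL)
    return 0;
  *dash1 = '\0';
  *dash2 = '\0';
  if (sscanf(buf, "%p", flag_addr) != 1)
    return 0;
  if (sscanf(dash1 + 1, "%lx", flag_val) != 1)
    return 0;
  *file_name = dash2 + 1;
  return 1;
}
#endif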
6969 // ----------------------------------------------------------------------------- 6970 6971 #if KMP_MIC_SUPPORTED 6972 6973 static void __kmp_check_mic_type() { 6974 kmp_cpuid_t cpuid_state = {0}; 6975 kmp_cpuid_t *cs_p = &cpuid_state; 6976 __kmp_x86_cpuid(1, 0, cs_p); 6977 // We don't support mic1 at the moment 6978 if ((cs_p->eax & 0xff0) == 0xB10) { 6979 __kmp_mic_type = mic2; 6980 } else if ((cs_p->eax & 0xf0ff0) == 0x50670) { 6981 __kmp_mic_type = mic3; 6982 } else { 6983 __kmp_mic_type = non_mic; 6984 } 6985 } 6986 6987 #endif /* KMP_MIC_SUPPORTED */ 6988 6989 #if KMP_HAVE_UMWAIT 6990 static void __kmp_user_level_mwait_init() { 6991 struct kmp_cpuid buf; 6992 __kmp_x86_cpuid(7, 0, &buf); 6993 __kmp_waitpkg_enabled = ((buf.ecx >> 5) & 1); 6994 __kmp_umwait_enabled = __kmp_waitpkg_enabled && __kmp_user_level_mwait; 6995 __kmp_tpause_enabled = __kmp_waitpkg_enabled && (__kmp_tpause_state > 0); 6996 KF_TRACE(30, ("__kmp_user_level_mwait_init: __kmp_umwait_enabled = %d\n", 6997 __kmp_umwait_enabled)); 6998 } 6999 #elif KMP_HAVE_MWAIT 7000 #ifndef AT_INTELPHIUSERMWAIT 7001 // Spurious, non-existent value that should always fail to return anything. 7002 // Will be replaced with the correct value when we know that. 7003 #define AT_INTELPHIUSERMWAIT 10000 7004 #endif 7005 // getauxval() function is available in RHEL7 and SLES12. If a system with an 7006 // earlier OS is used to build the RTL, we'll use the following internal 7007 // function when the entry is not found. 7008 unsigned long getauxval(unsigned long) KMP_WEAK_ATTRIBUTE_EXTERNAL; 7009 unsigned long getauxval(unsigned long) { return 0; } 7010 7011 static void __kmp_user_level_mwait_init() { 7012 // When getauxval() and correct value of AT_INTELPHIUSERMWAIT are available 7013 // use them to find if the user-level mwait is enabled. Otherwise, forcibly 7014 // set __kmp_mwait_enabled=TRUE on Intel MIC if the environment variable 7015 // KMP_USER_LEVEL_MWAIT was set to TRUE. 7016 if (__kmp_mic_type == mic3) { 7017 unsigned long res = getauxval(AT_INTELPHIUSERMWAIT); 7018 if ((res & 0x1) || __kmp_user_level_mwait) { 7019 __kmp_mwait_enabled = TRUE; 7020 if (__kmp_user_level_mwait) { 7021 KMP_INFORM(EnvMwaitWarn); 7022 } 7023 } else { 7024 __kmp_mwait_enabled = FALSE; 7025 } 7026 } 7027 KF_TRACE(30, ("__kmp_user_level_mwait_init: __kmp_mic_type = %d, " 7028 "__kmp_mwait_enabled = %d\n", 7029 __kmp_mic_type, __kmp_mwait_enabled)); 7030 } 7031 #endif /* KMP_HAVE_UMWAIT */ 7032 7033 static void __kmp_do_serial_initialize(void) { 7034 int i, gtid; 7035 size_t size; 7036 7037 KA_TRACE(10, ("__kmp_do_serial_initialize: enter\n")); 7038 7039 KMP_DEBUG_ASSERT(sizeof(kmp_int32) == 4); 7040 KMP_DEBUG_ASSERT(sizeof(kmp_uint32) == 4); 7041 KMP_DEBUG_ASSERT(sizeof(kmp_int64) == 8); 7042 KMP_DEBUG_ASSERT(sizeof(kmp_uint64) == 8); 7043 KMP_DEBUG_ASSERT(sizeof(kmp_intptr_t) == sizeof(void *)); 7044 7045 #if OMPT_SUPPORT 7046 ompt_pre_init(); 7047 #endif 7048 #if OMPD_SUPPORT 7049 __kmp_env_dump(); 7050 ompd_init(); 7051 #endif 7052 7053 __kmp_validate_locks(); 7054 7055 /* Initialize internal memory allocator */ 7056 __kmp_init_allocator(); 7057 7058 /* Register the library startup via an environment variable or via mapped 7059 shared memory file and check to see whether another copy of the library is 7060 already registered. 
Since forked child process is often terminated, we 7061 postpone the registration till middle initialization in the child */ 7062 if (__kmp_need_register_serial) 7063 __kmp_register_library_startup(); 7064 7065 /* TODO reinitialization of library */ 7066 if (TCR_4(__kmp_global.g.g_done)) { 7067 KA_TRACE(10, ("__kmp_do_serial_initialize: reinitialization of library\n")); 7068 } 7069 7070 __kmp_global.g.g_abort = 0; 7071 TCW_SYNC_4(__kmp_global.g.g_done, FALSE); 7072 7073 /* initialize the locks */ 7074 #if KMP_USE_ADAPTIVE_LOCKS 7075 #if KMP_DEBUG_ADAPTIVE_LOCKS 7076 __kmp_init_speculative_stats(); 7077 #endif 7078 #endif 7079 #if KMP_STATS_ENABLED 7080 __kmp_stats_init(); 7081 #endif 7082 __kmp_init_lock(&__kmp_global_lock); 7083 __kmp_init_queuing_lock(&__kmp_dispatch_lock); 7084 __kmp_init_lock(&__kmp_debug_lock); 7085 __kmp_init_atomic_lock(&__kmp_atomic_lock); 7086 __kmp_init_atomic_lock(&__kmp_atomic_lock_1i); 7087 __kmp_init_atomic_lock(&__kmp_atomic_lock_2i); 7088 __kmp_init_atomic_lock(&__kmp_atomic_lock_4i); 7089 __kmp_init_atomic_lock(&__kmp_atomic_lock_4r); 7090 __kmp_init_atomic_lock(&__kmp_atomic_lock_8i); 7091 __kmp_init_atomic_lock(&__kmp_atomic_lock_8r); 7092 __kmp_init_atomic_lock(&__kmp_atomic_lock_8c); 7093 __kmp_init_atomic_lock(&__kmp_atomic_lock_10r); 7094 __kmp_init_atomic_lock(&__kmp_atomic_lock_16r); 7095 __kmp_init_atomic_lock(&__kmp_atomic_lock_16c); 7096 __kmp_init_atomic_lock(&__kmp_atomic_lock_20c); 7097 __kmp_init_atomic_lock(&__kmp_atomic_lock_32c); 7098 __kmp_init_bootstrap_lock(&__kmp_forkjoin_lock); 7099 __kmp_init_bootstrap_lock(&__kmp_exit_lock); 7100 #if KMP_USE_MONITOR 7101 __kmp_init_bootstrap_lock(&__kmp_monitor_lock); 7102 #endif 7103 __kmp_init_bootstrap_lock(&__kmp_tp_cached_lock); 7104 7105 /* conduct initialization and initial setup of configuration */ 7106 7107 __kmp_runtime_initialize(); 7108 7109 #if KMP_MIC_SUPPORTED 7110 __kmp_check_mic_type(); 7111 #endif 7112 7113 // Some global variable initialization moved here from kmp_env_initialize() 7114 #ifdef KMP_DEBUG 7115 kmp_diag = 0; 7116 #endif 7117 __kmp_abort_delay = 0; 7118 7119 // From __kmp_init_dflt_team_nth() 7120 /* assume the entire machine will be used */ 7121 __kmp_dflt_team_nth_ub = __kmp_xproc; 7122 if (__kmp_dflt_team_nth_ub < KMP_MIN_NTH) { 7123 __kmp_dflt_team_nth_ub = KMP_MIN_NTH; 7124 } 7125 if (__kmp_dflt_team_nth_ub > __kmp_sys_max_nth) { 7126 __kmp_dflt_team_nth_ub = __kmp_sys_max_nth; 7127 } 7128 __kmp_max_nth = __kmp_sys_max_nth; 7129 __kmp_cg_max_nth = __kmp_sys_max_nth; 7130 __kmp_teams_max_nth = __kmp_xproc; // set a "reasonable" default 7131 if (__kmp_teams_max_nth > __kmp_sys_max_nth) { 7132 __kmp_teams_max_nth = __kmp_sys_max_nth; 7133 } 7134 7135 // Three vars below moved here from __kmp_env_initialize() "KMP_BLOCKTIME" 7136 // part 7137 __kmp_dflt_blocktime = KMP_DEFAULT_BLOCKTIME; 7138 #if KMP_USE_MONITOR 7139 __kmp_monitor_wakeups = 7140 KMP_WAKEUPS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups); 7141 __kmp_bt_intervals = 7142 KMP_INTERVALS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups); 7143 #endif 7144 // From "KMP_LIBRARY" part of __kmp_env_initialize() 7145 __kmp_library = library_throughput; 7146 // From KMP_SCHEDULE initialization 7147 __kmp_static = kmp_sch_static_balanced; 7148 // AC: do not use analytical here, because it is non-monotonous 7149 //__kmp_guided = kmp_sch_guided_iterative_chunked; 7150 //__kmp_auto = kmp_sch_guided_analytical_chunked; // AC: it is the default, no 7151 // need to repeat assignment 7152 // 
Barrier initialization. Moved here from __kmp_env_initialize() Barrier branch 7153 // bit control and barrier method control parts 7154 #if KMP_FAST_REDUCTION_BARRIER 7155 #define kmp_reduction_barrier_gather_bb ((int)1) 7156 #define kmp_reduction_barrier_release_bb ((int)1) 7157 #define kmp_reduction_barrier_gather_pat __kmp_barrier_gather_pat_dflt 7158 #define kmp_reduction_barrier_release_pat __kmp_barrier_release_pat_dflt 7159 #endif // KMP_FAST_REDUCTION_BARRIER 7160 for (i = bs_plain_barrier; i < bs_last_barrier; i++) { 7161 __kmp_barrier_gather_branch_bits[i] = __kmp_barrier_gather_bb_dflt; 7162 __kmp_barrier_release_branch_bits[i] = __kmp_barrier_release_bb_dflt; 7163 __kmp_barrier_gather_pattern[i] = __kmp_barrier_gather_pat_dflt; 7164 __kmp_barrier_release_pattern[i] = __kmp_barrier_release_pat_dflt; 7165 #if KMP_FAST_REDUCTION_BARRIER 7166 if (i == bs_reduction_barrier) { // tested and confirmed on ALTIX only ( 7167 // lin_64 ): hyper,1 7168 __kmp_barrier_gather_branch_bits[i] = kmp_reduction_barrier_gather_bb; 7169 __kmp_barrier_release_branch_bits[i] = kmp_reduction_barrier_release_bb; 7170 __kmp_barrier_gather_pattern[i] = kmp_reduction_barrier_gather_pat; 7171 __kmp_barrier_release_pattern[i] = kmp_reduction_barrier_release_pat; 7172 } 7173 #endif // KMP_FAST_REDUCTION_BARRIER 7174 } 7175 #if KMP_FAST_REDUCTION_BARRIER 7176 #undef kmp_reduction_barrier_release_pat 7177 #undef kmp_reduction_barrier_gather_pat 7178 #undef kmp_reduction_barrier_release_bb 7179 #undef kmp_reduction_barrier_gather_bb 7180 #endif // KMP_FAST_REDUCTION_BARRIER 7181 #if KMP_MIC_SUPPORTED 7182 if (__kmp_mic_type == mic2) { // KNC 7183 // AC: plane=3,2, forkjoin=2,1 are optimal for 240 threads on KNC 7184 __kmp_barrier_gather_branch_bits[bs_plain_barrier] = 3; // plain gather 7185 __kmp_barrier_release_branch_bits[bs_forkjoin_barrier] = 7186 1; // forkjoin release 7187 __kmp_barrier_gather_pattern[bs_forkjoin_barrier] = bp_hierarchical_bar; 7188 __kmp_barrier_release_pattern[bs_forkjoin_barrier] = bp_hierarchical_bar; 7189 } 7190 #if KMP_FAST_REDUCTION_BARRIER 7191 if (__kmp_mic_type == mic2) { // KNC 7192 __kmp_barrier_gather_pattern[bs_reduction_barrier] = bp_hierarchical_bar; 7193 __kmp_barrier_release_pattern[bs_reduction_barrier] = bp_hierarchical_bar; 7194 } 7195 #endif // KMP_FAST_REDUCTION_BARRIER 7196 #endif // KMP_MIC_SUPPORTED 7197 7198 // From KMP_CHECKS initialization 7199 #ifdef KMP_DEBUG 7200 __kmp_env_checks = TRUE; /* development versions have the extra checks */ 7201 #else 7202 __kmp_env_checks = FALSE; /* port versions do not have the extra checks */ 7203 #endif 7204 7205 // From "KMP_FOREIGN_THREADS_THREADPRIVATE" initialization 7206 __kmp_foreign_tp = TRUE; 7207 7208 __kmp_global.g.g_dynamic = FALSE; 7209 __kmp_global.g.g_dynamic_mode = dynamic_default; 7210 7211 __kmp_init_nesting_mode(); 7212 7213 __kmp_env_initialize(NULL); 7214 7215 #if KMP_HAVE_MWAIT || KMP_HAVE_UMWAIT 7216 __kmp_user_level_mwait_init(); 7217 #endif 7218 // Print all messages in message catalog for testing purposes. 
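// (The dump below is gated on the KMP_DUMP_CATALOG environment variable and
// is compiled into debug builds only; e.g. running with KMP_DUMP_CATALOG=1
// prints every message in the catalog.)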
7219 #ifdef KMP_DEBUG 7220 char const *val = __kmp_env_get("KMP_DUMP_CATALOG"); 7221 if (__kmp_str_match_true(val)) { 7222 kmp_str_buf_t buffer; 7223 __kmp_str_buf_init(&buffer); 7224 __kmp_i18n_dump_catalog(&buffer); 7225 __kmp_printf("%s", buffer.str); 7226 __kmp_str_buf_free(&buffer); 7227 } 7228 __kmp_env_free(&val); 7229 #endif 7230 7231 __kmp_threads_capacity = 7232 __kmp_initial_threads_capacity(__kmp_dflt_team_nth_ub); 7233 // Moved here from __kmp_env_initialize() "KMP_ALL_THREADPRIVATE" part 7234 __kmp_tp_capacity = __kmp_default_tp_capacity( 7235 __kmp_dflt_team_nth_ub, __kmp_max_nth, __kmp_allThreadsSpecified); 7236 7237 // If the library is shut down properly, both pools must be NULL. Just in 7238 // case, set them to NULL -- some memory may leak, but subsequent code will 7239 // work even if pools are not freed. 7240 KMP_DEBUG_ASSERT(__kmp_thread_pool == NULL); 7241 KMP_DEBUG_ASSERT(__kmp_thread_pool_insert_pt == NULL); 7242 KMP_DEBUG_ASSERT(__kmp_team_pool == NULL); 7243 __kmp_thread_pool = NULL; 7244 __kmp_thread_pool_insert_pt = NULL; 7245 __kmp_team_pool = NULL; 7246 7247 /* Allocate all of the variable sized records */ 7248 /* NOTE: __kmp_threads_capacity entries are allocated, but the arrays are 7249 * expandable */ 7250 /* Since allocation is cache-aligned, just add extra padding at the end */ 7251 size = 7252 (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * __kmp_threads_capacity + 7253 CACHE_LINE; 7254 __kmp_threads = (kmp_info_t **)__kmp_allocate(size); 7255 __kmp_root = (kmp_root_t **)((char *)__kmp_threads + 7256 sizeof(kmp_info_t *) * __kmp_threads_capacity); 7257 7258 /* init thread counts */ 7259 KMP_DEBUG_ASSERT(__kmp_all_nth == 7260 0); // Asserts fail if the library is reinitializing and 7261 KMP_DEBUG_ASSERT(__kmp_nth == 0); // something was wrong in termination. 7262 __kmp_all_nth = 0; 7263 __kmp_nth = 0; 7264 7265 /* setup the uber master thread and hierarchy */ 7266 gtid = __kmp_register_root(TRUE); 7267 KA_TRACE(10, ("__kmp_do_serial_initialize T#%d\n", gtid)); 7268 KMP_ASSERT(KMP_UBER_GTID(gtid)); 7269 KMP_ASSERT(KMP_INITIAL_GTID(gtid)); 7270 7271 KMP_MB(); /* Flush all pending memory write invalidates. */ 7272 7273 __kmp_common_initialize(); 7274 7275 #if KMP_OS_UNIX 7276 /* invoke the child fork handler */ 7277 __kmp_register_atfork(); 7278 #endif 7279 7280 #if !KMP_DYNAMIC_LIB || \ 7281 ((KMP_COMPILER_ICC || KMP_COMPILER_ICX) && KMP_OS_DARWIN) 7282 { 7283 /* Invoke the exit handler when the program finishes, only for static 7284 library and macOS* dynamic. For other dynamic libraries, we already 7285 have _fini and DllMain. */ 7286 int rc = atexit(__kmp_internal_end_atexit); 7287 if (rc != 0) { 7288 __kmp_fatal(KMP_MSG(FunctionError, "atexit()"), KMP_ERR(rc), 7289 __kmp_msg_null); 7290 } 7291 } 7292 #endif 7293 7294 #if KMP_HANDLE_SIGNALS 7295 #if KMP_OS_UNIX 7296 /* NOTE: make sure that this is called before the user installs their own 7297 signal handlers so that the user handlers are called first. this way they 7298 can return false, not call our handler, avoid terminating the library, and 7299 continue execution where they left off. 
*/ 7300 __kmp_install_signals(FALSE); 7301 #endif /* KMP_OS_UNIX */ 7302 #if KMP_OS_WINDOWS 7303 __kmp_install_signals(TRUE); 7304 #endif /* KMP_OS_WINDOWS */ 7305 #endif 7306 7307 /* we have finished the serial initialization */ 7308 __kmp_init_counter++; 7309 7310 __kmp_init_serial = TRUE; 7311 7312 if (__kmp_settings) { 7313 __kmp_env_print(); 7314 } 7315 7316 if (__kmp_display_env || __kmp_display_env_verbose) { 7317 __kmp_env_print_2(); 7318 } 7319 7320 #if OMPT_SUPPORT 7321 ompt_post_init(); 7322 #endif 7323 7324 KMP_MB(); 7325 7326 KA_TRACE(10, ("__kmp_do_serial_initialize: exit\n")); 7327 } 7328 7329 void __kmp_serial_initialize(void) { 7330 if (__kmp_init_serial) { 7331 return; 7332 } 7333 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock); 7334 if (__kmp_init_serial) { 7335 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 7336 return; 7337 } 7338 __kmp_do_serial_initialize(); 7339 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 7340 } 7341 7342 static void __kmp_do_middle_initialize(void) { 7343 int i, j; 7344 int prev_dflt_team_nth; 7345 7346 if (!__kmp_init_serial) { 7347 __kmp_do_serial_initialize(); 7348 } 7349 7350 KA_TRACE(10, ("__kmp_middle_initialize: enter\n")); 7351 7352 if (UNLIKELY(!__kmp_need_register_serial)) { 7353 // We are in a forked child process. The registration was skipped during 7354 // serial initialization in __kmp_atfork_child handler. Do it here. 7355 __kmp_register_library_startup(); 7356 } 7357 7358 // Save the previous value for the __kmp_dflt_team_nth so that 7359 // we can avoid some reinitialization if it hasn't changed. 7360 prev_dflt_team_nth = __kmp_dflt_team_nth; 7361 7362 #if KMP_AFFINITY_SUPPORTED 7363 // __kmp_affinity_initialize() will try to set __kmp_ncores to the 7364 // number of cores on the machine. 7365 __kmp_affinity_initialize(__kmp_affinity); 7366 7367 #endif /* KMP_AFFINITY_SUPPORTED */ 7368 7369 KMP_ASSERT(__kmp_xproc > 0); 7370 if (__kmp_avail_proc == 0) { 7371 __kmp_avail_proc = __kmp_xproc; 7372 } 7373 7374 // If there were empty places in num_threads list (OMP_NUM_THREADS=,,2,3), 7375 // correct them now 7376 j = 0; 7377 while ((j < __kmp_nested_nth.used) && !__kmp_nested_nth.nth[j]) { 7378 __kmp_nested_nth.nth[j] = __kmp_dflt_team_nth = __kmp_dflt_team_nth_ub = 7379 __kmp_avail_proc; 7380 j++; 7381 } 7382 7383 if (__kmp_dflt_team_nth == 0) { 7384 #ifdef KMP_DFLT_NTH_CORES 7385 // Default #threads = #cores 7386 __kmp_dflt_team_nth = __kmp_ncores; 7387 KA_TRACE(20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = " 7388 "__kmp_ncores (%d)\n", 7389 __kmp_dflt_team_nth)); 7390 #else 7391 // Default #threads = #available OS procs 7392 __kmp_dflt_team_nth = __kmp_avail_proc; 7393 KA_TRACE(20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = " 7394 "__kmp_avail_proc(%d)\n", 7395 __kmp_dflt_team_nth)); 7396 #endif /* KMP_DFLT_NTH_CORES */ 7397 } 7398 7399 if (__kmp_dflt_team_nth < KMP_MIN_NTH) { 7400 __kmp_dflt_team_nth = KMP_MIN_NTH; 7401 } 7402 if (__kmp_dflt_team_nth > __kmp_sys_max_nth) { 7403 __kmp_dflt_team_nth = __kmp_sys_max_nth; 7404 } 7405 7406 if (__kmp_nesting_mode > 0) 7407 __kmp_set_nesting_mode_threads(); 7408 7409 // There's no harm in continuing if the following check fails, 7410 // but it indicates an error in the previous logic. 
7411 KMP_DEBUG_ASSERT(__kmp_dflt_team_nth <= __kmp_dflt_team_nth_ub); 7412 7413 if (__kmp_dflt_team_nth != prev_dflt_team_nth) { 7414 // Run through the __kmp_threads array and set the num threads icv for each 7415 // root thread that is currently registered with the RTL (which has not 7416 // already explicitly set its nthreads-var with a call to 7417 // omp_set_num_threads()). 7418 for (i = 0; i < __kmp_threads_capacity; i++) { 7419 kmp_info_t *thread = __kmp_threads[i]; 7420 if (thread == NULL) 7421 continue; 7422 if (thread->th.th_current_task->td_icvs.nproc != 0) 7423 continue; 7424 7425 set__nproc(__kmp_threads[i], __kmp_dflt_team_nth); 7426 } 7427 } 7428 KA_TRACE( 7429 20, 7430 ("__kmp_middle_initialize: final value for __kmp_dflt_team_nth = %d\n", 7431 __kmp_dflt_team_nth)); 7432 7433 #ifdef KMP_ADJUST_BLOCKTIME 7434 /* Adjust blocktime to zero if necessary now that __kmp_avail_proc is set */ 7435 if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) { 7436 KMP_DEBUG_ASSERT(__kmp_avail_proc > 0); 7437 if (__kmp_nth > __kmp_avail_proc) { 7438 __kmp_zero_bt = TRUE; 7439 } 7440 } 7441 #endif /* KMP_ADJUST_BLOCKTIME */ 7442 7443 /* we have finished middle initialization */ 7444 TCW_SYNC_4(__kmp_init_middle, TRUE); 7445 7446 KA_TRACE(10, ("__kmp_do_middle_initialize: exit\n")); 7447 } 7448 7449 void __kmp_middle_initialize(void) { 7450 if (__kmp_init_middle) { 7451 return; 7452 } 7453 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock); 7454 if (__kmp_init_middle) { 7455 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 7456 return; 7457 } 7458 __kmp_do_middle_initialize(); 7459 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 7460 } 7461 7462 void __kmp_parallel_initialize(void) { 7463 int gtid = __kmp_entry_gtid(); // this might be a new root 7464 7465 /* synchronize parallel initialization (for sibling) */ 7466 if (TCR_4(__kmp_init_parallel)) 7467 return; 7468 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock); 7469 if (TCR_4(__kmp_init_parallel)) { 7470 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 7471 return; 7472 } 7473 7474 /* TODO reinitialization after we have already shut down */ 7475 if (TCR_4(__kmp_global.g.g_done)) { 7476 KA_TRACE( 7477 10, 7478 ("__kmp_parallel_initialize: attempt to init while shutting down\n")); 7479 __kmp_infinite_loop(); 7480 } 7481 7482 /* jc: The lock __kmp_initz_lock is already held, so calling 7483 __kmp_serial_initialize would cause a deadlock. So we call 7484 __kmp_do_serial_initialize directly. */ 7485 if (!__kmp_init_middle) { 7486 __kmp_do_middle_initialize(); 7487 } 7488 __kmp_assign_root_init_mask(); 7489 __kmp_resume_if_hard_paused(); 7490 7491 /* begin initialization */ 7492 KA_TRACE(10, ("__kmp_parallel_initialize: enter\n")); 7493 KMP_ASSERT(KMP_UBER_GTID(gtid)); 7494 7495 #if KMP_ARCH_X86 || KMP_ARCH_X86_64 7496 // Save the FP control regs. 7497 // Worker threads will set theirs to these values at thread startup. 
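// (Illustrative only, and an assumption about the worker-side code, which is
// not shown here: a worker would apply the values saved below with the
// matching load helpers from kmp.h, along the lines of
//   __kmp_load_x87_fpu_control_word(&__kmp_init_x87_fpu_control_word);
//   __kmp_load_mxcsr(&__kmp_init_mxcsr);
// during its startup path.)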
7498 __kmp_store_x87_fpu_control_word(&__kmp_init_x87_fpu_control_word); 7499 __kmp_store_mxcsr(&__kmp_init_mxcsr); 7500 __kmp_init_mxcsr &= KMP_X86_MXCSR_MASK; 7501 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */ 7502 7503 #if KMP_OS_UNIX 7504 #if KMP_HANDLE_SIGNALS 7505 /* must be after __kmp_serial_initialize */ 7506 __kmp_install_signals(TRUE); 7507 #endif 7508 #endif 7509 7510 __kmp_suspend_initialize(); 7511 7512 #if defined(USE_LOAD_BALANCE) 7513 if (__kmp_global.g.g_dynamic_mode == dynamic_default) { 7514 __kmp_global.g.g_dynamic_mode = dynamic_load_balance; 7515 } 7516 #else 7517 if (__kmp_global.g.g_dynamic_mode == dynamic_default) { 7518 __kmp_global.g.g_dynamic_mode = dynamic_thread_limit; 7519 } 7520 #endif 7521 7522 if (__kmp_version) { 7523 __kmp_print_version_2(); 7524 } 7525 7526 /* we have finished parallel initialization */ 7527 TCW_SYNC_4(__kmp_init_parallel, TRUE); 7528 7529 KMP_MB(); 7530 KA_TRACE(10, ("__kmp_parallel_initialize: exit\n")); 7531 7532 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 7533 } 7534 7535 void __kmp_hidden_helper_initialize() { 7536 if (TCR_4(__kmp_init_hidden_helper)) 7537 return; 7538 7539 // __kmp_parallel_initialize is required before we initialize hidden helper 7540 if (!TCR_4(__kmp_init_parallel)) 7541 __kmp_parallel_initialize(); 7542 7543 // Double check. Note that this double check should not be placed before 7544 // __kmp_parallel_initialize as it will cause dead lock. 7545 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock); 7546 if (TCR_4(__kmp_init_hidden_helper)) { 7547 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 7548 return; 7549 } 7550 7551 #if KMP_AFFINITY_SUPPORTED 7552 // Initialize hidden helper affinity settings. 7553 // The above __kmp_parallel_initialize() will initialize 7554 // regular affinity (and topology) if not already done. 7555 if (!__kmp_hh_affinity.flags.initialized) 7556 __kmp_affinity_initialize(__kmp_hh_affinity); 7557 #endif 7558 7559 // Set the count of hidden helper tasks to be executed to zero 7560 KMP_ATOMIC_ST_REL(&__kmp_unexecuted_hidden_helper_tasks, 0); 7561 7562 // Set the global variable indicating that we're initializing hidden helper 7563 // team/threads 7564 TCW_SYNC_4(__kmp_init_hidden_helper_threads, TRUE); 7565 7566 // Platform independent initialization 7567 __kmp_do_initialize_hidden_helper_threads(); 7568 7569 // Wait here for the finish of initialization of hidden helper teams 7570 __kmp_hidden_helper_threads_initz_wait(); 7571 7572 // We have finished hidden helper initialization 7573 TCW_SYNC_4(__kmp_init_hidden_helper, TRUE); 7574 7575 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 7576 } 7577 7578 /* ------------------------------------------------------------------------ */ 7579 7580 void __kmp_run_before_invoked_task(int gtid, int tid, kmp_info_t *this_thr, 7581 kmp_team_t *team) { 7582 kmp_disp_t *dispatch; 7583 7584 KMP_MB(); 7585 7586 /* none of the threads have encountered any constructs, yet. 
*/ 7587 this_thr->th.th_local.this_construct = 0; 7588 #if KMP_CACHE_MANAGE 7589 KMP_CACHE_PREFETCH(&this_thr->th.th_bar[bs_forkjoin_barrier].bb.b_arrived); 7590 #endif /* KMP_CACHE_MANAGE */ 7591 dispatch = (kmp_disp_t *)TCR_PTR(this_thr->th.th_dispatch); 7592 KMP_DEBUG_ASSERT(dispatch); 7593 KMP_DEBUG_ASSERT(team->t.t_dispatch); 7594 // KMP_DEBUG_ASSERT( this_thr->th.th_dispatch == &team->t.t_dispatch[ 7595 // this_thr->th.th_info.ds.ds_tid ] ); 7596 7597 dispatch->th_disp_index = 0; /* reset the dispatch buffer counter */ 7598 dispatch->th_doacross_buf_idx = 0; // reset doacross dispatch buffer counter 7599 if (__kmp_env_consistency_check) 7600 __kmp_push_parallel(gtid, team->t.t_ident); 7601 7602 KMP_MB(); /* Flush all pending memory write invalidates. */ 7603 } 7604 7605 void __kmp_run_after_invoked_task(int gtid, int tid, kmp_info_t *this_thr, 7606 kmp_team_t *team) { 7607 if (__kmp_env_consistency_check) 7608 __kmp_pop_parallel(gtid, team->t.t_ident); 7609 7610 __kmp_finish_implicit_task(this_thr); 7611 } 7612 7613 int __kmp_invoke_task_func(int gtid) { 7614 int rc; 7615 int tid = __kmp_tid_from_gtid(gtid); 7616 kmp_info_t *this_thr = __kmp_threads[gtid]; 7617 kmp_team_t *team = this_thr->th.th_team; 7618 7619 __kmp_run_before_invoked_task(gtid, tid, this_thr, team); 7620 #if USE_ITT_BUILD 7621 if (__itt_stack_caller_create_ptr) { 7622 // inform ittnotify about entering user's code 7623 if (team->t.t_stack_id != NULL) { 7624 __kmp_itt_stack_callee_enter((__itt_caller)team->t.t_stack_id); 7625 } else { 7626 KMP_DEBUG_ASSERT(team->t.t_parent->t.t_stack_id != NULL); 7627 __kmp_itt_stack_callee_enter( 7628 (__itt_caller)team->t.t_parent->t.t_stack_id); 7629 } 7630 } 7631 #endif /* USE_ITT_BUILD */ 7632 #if INCLUDE_SSC_MARKS 7633 SSC_MARK_INVOKING(); 7634 #endif 7635 7636 #if OMPT_SUPPORT 7637 void *dummy; 7638 void **exit_frame_p; 7639 ompt_data_t *my_task_data; 7640 ompt_data_t *my_parallel_data; 7641 int ompt_team_size; 7642 7643 if (ompt_enabled.enabled) { 7644 exit_frame_p = &(team->t.t_implicit_task_taskdata[tid] 7645 .ompt_task_info.frame.exit_frame.ptr); 7646 } else { 7647 exit_frame_p = &dummy; 7648 } 7649 7650 my_task_data = 7651 &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data); 7652 my_parallel_data = &(team->t.ompt_team_info.parallel_data); 7653 if (ompt_enabled.ompt_callback_implicit_task) { 7654 ompt_team_size = team->t.t_nproc; 7655 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 7656 ompt_scope_begin, my_parallel_data, my_task_data, ompt_team_size, 7657 __kmp_tid_from_gtid(gtid), ompt_task_implicit); 7658 OMPT_CUR_TASK_INFO(this_thr)->thread_num = __kmp_tid_from_gtid(gtid); 7659 } 7660 #endif 7661 7662 #if KMP_STATS_ENABLED 7663 stats_state_e previous_state = KMP_GET_THREAD_STATE(); 7664 if (previous_state == stats_state_e::TEAMS_REGION) { 7665 KMP_PUSH_PARTITIONED_TIMER(OMP_teams); 7666 } else { 7667 KMP_PUSH_PARTITIONED_TIMER(OMP_parallel); 7668 } 7669 KMP_SET_THREAD_STATE(IMPLICIT_TASK); 7670 #endif 7671 7672 rc = __kmp_invoke_microtask((microtask_t)TCR_SYNC_PTR(team->t.t_pkfn), gtid, 7673 tid, (int)team->t.t_argc, (void **)team->t.t_argv 7674 #if OMPT_SUPPORT 7675 , 7676 exit_frame_p 7677 #endif 7678 ); 7679 #if OMPT_SUPPORT 7680 *exit_frame_p = NULL; 7681 this_thr->th.ompt_thread_info.parallel_flags |= ompt_parallel_team; 7682 #endif 7683 7684 #if KMP_STATS_ENABLED 7685 if (previous_state == stats_state_e::TEAMS_REGION) { 7686 KMP_SET_THREAD_STATE(previous_state); 7687 } 7688 KMP_POP_PARTITIONED_TIMER(); 7689 #endif 7690 7691 #if 
USE_ITT_BUILD 7692 if (__itt_stack_caller_create_ptr) { 7693 // inform ittnotify about leaving user's code 7694 if (team->t.t_stack_id != NULL) { 7695 __kmp_itt_stack_callee_leave((__itt_caller)team->t.t_stack_id); 7696 } else { 7697 KMP_DEBUG_ASSERT(team->t.t_parent->t.t_stack_id != NULL); 7698 __kmp_itt_stack_callee_leave( 7699 (__itt_caller)team->t.t_parent->t.t_stack_id); 7700 } 7701 } 7702 #endif /* USE_ITT_BUILD */ 7703 __kmp_run_after_invoked_task(gtid, tid, this_thr, team); 7704 7705 return rc; 7706 } 7707 7708 void __kmp_teams_master(int gtid) { 7709 // This routine is called by all primary threads in teams construct 7710 kmp_info_t *thr = __kmp_threads[gtid]; 7711 kmp_team_t *team = thr->th.th_team; 7712 ident_t *loc = team->t.t_ident; 7713 thr->th.th_set_nproc = thr->th.th_teams_size.nth; 7714 KMP_DEBUG_ASSERT(thr->th.th_teams_microtask); 7715 KMP_DEBUG_ASSERT(thr->th.th_set_nproc); 7716 KA_TRACE(20, ("__kmp_teams_master: T#%d, Tid %d, microtask %p\n", gtid, 7717 __kmp_tid_from_gtid(gtid), thr->th.th_teams_microtask)); 7718 7719 // This thread is a new CG root. Set up the proper variables. 7720 kmp_cg_root_t *tmp = (kmp_cg_root_t *)__kmp_allocate(sizeof(kmp_cg_root_t)); 7721 tmp->cg_root = thr; // Make thr the CG root 7722 // Init to thread limit stored when league primary threads were forked 7723 tmp->cg_thread_limit = thr->th.th_current_task->td_icvs.thread_limit; 7724 tmp->cg_nthreads = 1; // Init counter to one active thread, this one 7725 KA_TRACE(100, ("__kmp_teams_master: Thread %p created node %p and init" 7726 " cg_nthreads to 1\n", 7727 thr, tmp)); 7728 tmp->up = thr->th.th_cg_roots; 7729 thr->th.th_cg_roots = tmp; 7730 7731 // Launch league of teams now, but not let workers execute 7732 // (they hang on fork barrier until next parallel) 7733 #if INCLUDE_SSC_MARKS 7734 SSC_MARK_FORKING(); 7735 #endif 7736 __kmp_fork_call(loc, gtid, fork_context_intel, team->t.t_argc, 7737 (microtask_t)thr->th.th_teams_microtask, // "wrapped" task 7738 VOLATILE_CAST(launch_t) __kmp_invoke_task_func, NULL); 7739 #if INCLUDE_SSC_MARKS 7740 SSC_MARK_JOINING(); 7741 #endif 7742 // If the team size was reduced from the limit, set it to the new size 7743 if (thr->th.th_team_nproc < thr->th.th_teams_size.nth) 7744 thr->th.th_teams_size.nth = thr->th.th_team_nproc; 7745 // AC: last parameter "1" eliminates join barrier which won't work because 7746 // worker threads are in a fork barrier waiting for more parallel regions 7747 __kmp_join_call(loc, gtid 7748 #if OMPT_SUPPORT 7749 , 7750 fork_context_intel 7751 #endif 7752 , 7753 1); 7754 } 7755 7756 int __kmp_invoke_teams_master(int gtid) { 7757 kmp_info_t *this_thr = __kmp_threads[gtid]; 7758 kmp_team_t *team = this_thr->th.th_team; 7759 #if KMP_DEBUG 7760 if (!__kmp_threads[gtid]->th.th_team->t.t_serialized) 7761 KMP_DEBUG_ASSERT((void *)__kmp_threads[gtid]->th.th_team->t.t_pkfn == 7762 (void *)__kmp_teams_master); 7763 #endif 7764 __kmp_run_before_invoked_task(gtid, 0, this_thr, team); 7765 #if OMPT_SUPPORT 7766 int tid = __kmp_tid_from_gtid(gtid); 7767 ompt_data_t *task_data = 7768 &team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data; 7769 ompt_data_t *parallel_data = &team->t.ompt_team_info.parallel_data; 7770 if (ompt_enabled.ompt_callback_implicit_task) { 7771 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 7772 ompt_scope_begin, parallel_data, task_data, team->t.t_nproc, tid, 7773 ompt_task_initial); 7774 OMPT_CUR_TASK_INFO(this_thr)->thread_num = tid; 7775 } 7776 #endif 7777 __kmp_teams_master(gtid); 7778 #if 
OMPT_SUPPORT 7779 this_thr->th.ompt_thread_info.parallel_flags |= ompt_parallel_league; 7780 #endif 7781 __kmp_run_after_invoked_task(gtid, 0, this_thr, team); 7782 return 1; 7783 } 7784 7785 /* this sets the requested number of threads for the next parallel region 7786 encountered by this team. since this should be enclosed in the forkjoin 7787 critical section it should avoid race conditions with asymmetrical nested 7788 parallelism */ 7789 7790 void __kmp_push_num_threads(ident_t *id, int gtid, int num_threads) { 7791 kmp_info_t *thr = __kmp_threads[gtid]; 7792 7793 if (num_threads > 0) 7794 thr->th.th_set_nproc = num_threads; 7795 } 7796 7797 static void __kmp_push_thread_limit(kmp_info_t *thr, int num_teams, 7798 int num_threads) { 7799 KMP_DEBUG_ASSERT(thr); 7800 // Remember the number of threads for inner parallel regions 7801 if (!TCR_4(__kmp_init_middle)) 7802 __kmp_middle_initialize(); // get internal globals calculated 7803 __kmp_assign_root_init_mask(); 7804 KMP_DEBUG_ASSERT(__kmp_avail_proc); 7805 KMP_DEBUG_ASSERT(__kmp_dflt_team_nth); 7806 7807 if (num_threads == 0) { 7808 if (__kmp_teams_thread_limit > 0) { 7809 num_threads = __kmp_teams_thread_limit; 7810 } else { 7811 num_threads = __kmp_avail_proc / num_teams; 7812 } 7813 // adjust num_threads w/o warning as it is not user setting 7814 // num_threads = min(num_threads, nthreads-var, thread-limit-var) 7815 // no thread_limit clause specified - do not change thread-limit-var ICV 7816 if (num_threads > __kmp_dflt_team_nth) { 7817 num_threads = __kmp_dflt_team_nth; // honor nthreads-var ICV 7818 } 7819 if (num_threads > thr->th.th_current_task->td_icvs.thread_limit) { 7820 num_threads = thr->th.th_current_task->td_icvs.thread_limit; 7821 } // prevent team size to exceed thread-limit-var 7822 if (num_teams * num_threads > __kmp_teams_max_nth) { 7823 num_threads = __kmp_teams_max_nth / num_teams; 7824 } 7825 if (num_threads == 0) { 7826 num_threads = 1; 7827 } 7828 } else { 7829 if (num_threads < 0) { 7830 __kmp_msg(kmp_ms_warning, KMP_MSG(CantFormThrTeam, num_threads, 1), 7831 __kmp_msg_null); 7832 num_threads = 1; 7833 } 7834 // This thread will be the primary thread of the league primary threads 7835 // Store new thread limit; old limit is saved in th_cg_roots list 7836 thr->th.th_current_task->td_icvs.thread_limit = num_threads; 7837 // num_threads = min(num_threads, nthreads-var) 7838 if (num_threads > __kmp_dflt_team_nth) { 7839 num_threads = __kmp_dflt_team_nth; // honor nthreads-var ICV 7840 } 7841 if (num_teams * num_threads > __kmp_teams_max_nth) { 7842 int new_threads = __kmp_teams_max_nth / num_teams; 7843 if (new_threads == 0) { 7844 new_threads = 1; 7845 } 7846 if (new_threads != num_threads) { 7847 if (!__kmp_reserve_warn) { // user asked for too many threads 7848 __kmp_reserve_warn = 1; // conflicts with KMP_TEAMS_THREAD_LIMIT 7849 __kmp_msg(kmp_ms_warning, 7850 KMP_MSG(CantFormThrTeam, num_threads, new_threads), 7851 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null); 7852 } 7853 } 7854 num_threads = new_threads; 7855 } 7856 } 7857 thr->th.th_teams_size.nth = num_threads; 7858 } 7859 7860 /* this sets the requested number of teams for the teams region and/or 7861 the number of threads for the next parallel region encountered */ 7862 void __kmp_push_num_teams(ident_t *id, int gtid, int num_teams, 7863 int num_threads) { 7864 kmp_info_t *thr = __kmp_threads[gtid]; 7865 if (num_teams < 0) { 7866 // OpenMP specification requires requested values to be positive, 7867 // but people can send us any value, so we'd better 
check 7868 __kmp_msg(kmp_ms_warning, KMP_MSG(NumTeamsNotPositive, num_teams, 1), 7869 __kmp_msg_null); 7870 num_teams = 1; 7871 } 7872 if (num_teams == 0) { 7873 if (__kmp_nteams > 0) { 7874 num_teams = __kmp_nteams; 7875 } else { 7876 num_teams = 1; // default number of teams is 1. 7877 } 7878 } 7879 if (num_teams > __kmp_teams_max_nth) { // if too many teams requested? 7880 if (!__kmp_reserve_warn) { 7881 __kmp_reserve_warn = 1; 7882 __kmp_msg(kmp_ms_warning, 7883 KMP_MSG(CantFormThrTeam, num_teams, __kmp_teams_max_nth), 7884 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null); 7885 } 7886 num_teams = __kmp_teams_max_nth; 7887 } 7888 // Set number of teams (number of threads in the outer "parallel" of the 7889 // teams) 7890 thr->th.th_set_nproc = thr->th.th_teams_size.nteams = num_teams; 7891 7892 __kmp_push_thread_limit(thr, num_teams, num_threads); 7893 } 7894 7895 /* This sets the requested number of teams for the teams region and/or 7896 the number of threads for the next parallel region encountered */ 7897 void __kmp_push_num_teams_51(ident_t *id, int gtid, int num_teams_lb, 7898 int num_teams_ub, int num_threads) { 7899 kmp_info_t *thr = __kmp_threads[gtid]; 7900 KMP_DEBUG_ASSERT(num_teams_lb >= 0 && num_teams_ub >= 0); 7901 KMP_DEBUG_ASSERT(num_teams_ub >= num_teams_lb); 7902 KMP_DEBUG_ASSERT(num_threads >= 0); 7903 7904 if (num_teams_lb > num_teams_ub) { 7905 __kmp_fatal(KMP_MSG(FailedToCreateTeam, num_teams_lb, num_teams_ub), 7906 KMP_HNT(SetNewBound, __kmp_teams_max_nth), __kmp_msg_null); 7907 } 7908 7909 int num_teams = 1; // defalt number of teams is 1. 7910 7911 if (num_teams_lb == 0 && num_teams_ub > 0) 7912 num_teams_lb = num_teams_ub; 7913 7914 if (num_teams_lb == 0 && num_teams_ub == 0) { // no num_teams clause 7915 num_teams = (__kmp_nteams > 0) ? __kmp_nteams : num_teams; 7916 if (num_teams > __kmp_teams_max_nth) { 7917 if (!__kmp_reserve_warn) { 7918 __kmp_reserve_warn = 1; 7919 __kmp_msg(kmp_ms_warning, 7920 KMP_MSG(CantFormThrTeam, num_teams, __kmp_teams_max_nth), 7921 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null); 7922 } 7923 num_teams = __kmp_teams_max_nth; 7924 } 7925 } else if (num_teams_lb == num_teams_ub) { // requires exact number of teams 7926 num_teams = num_teams_ub; 7927 } else { // num_teams_lb <= num_teams <= num_teams_ub 7928 if (num_threads <= 0) { 7929 if (num_teams_ub > __kmp_teams_max_nth) { 7930 num_teams = num_teams_lb; 7931 } else { 7932 num_teams = num_teams_ub; 7933 } 7934 } else { 7935 num_teams = (num_threads > __kmp_teams_max_nth) 7936 ? num_teams 7937 : __kmp_teams_max_nth / num_threads; 7938 if (num_teams < num_teams_lb) { 7939 num_teams = num_teams_lb; 7940 } else if (num_teams > num_teams_ub) { 7941 num_teams = num_teams_ub; 7942 } 7943 } 7944 } 7945 // Set number of teams (number of threads in the outer "parallel" of the 7946 // teams) 7947 thr->th.th_set_nproc = thr->th.th_teams_size.nteams = num_teams; 7948 7949 __kmp_push_thread_limit(thr, num_teams, num_threads); 7950 } 7951 7952 // Set the proc_bind var to use in the following parallel region. 7953 void __kmp_push_proc_bind(ident_t *id, int gtid, kmp_proc_bind_t proc_bind) { 7954 kmp_info_t *thr = __kmp_threads[gtid]; 7955 thr->th.th_set_proc_bind = proc_bind; 7956 } 7957 7958 /* Launch the worker threads into the microtask. 
*/ 7959 7960 void __kmp_internal_fork(ident_t *id, int gtid, kmp_team_t *team) { 7961 kmp_info_t *this_thr = __kmp_threads[gtid]; 7962 7963 #ifdef KMP_DEBUG 7964 int f; 7965 #endif /* KMP_DEBUG */ 7966 7967 KMP_DEBUG_ASSERT(team); 7968 KMP_DEBUG_ASSERT(this_thr->th.th_team == team); 7969 KMP_ASSERT(KMP_MASTER_GTID(gtid)); 7970 KMP_MB(); /* Flush all pending memory write invalidates. */ 7971 7972 team->t.t_construct = 0; /* no single directives seen yet */ 7973 team->t.t_ordered.dt.t_value = 7974 0; /* thread 0 enters the ordered section first */ 7975 7976 /* Reset the identifiers on the dispatch buffer */ 7977 KMP_DEBUG_ASSERT(team->t.t_disp_buffer); 7978 if (team->t.t_max_nproc > 1) { 7979 int i; 7980 for (i = 0; i < __kmp_dispatch_num_buffers; ++i) { 7981 team->t.t_disp_buffer[i].buffer_index = i; 7982 team->t.t_disp_buffer[i].doacross_buf_idx = i; 7983 } 7984 } else { 7985 team->t.t_disp_buffer[0].buffer_index = 0; 7986 team->t.t_disp_buffer[0].doacross_buf_idx = 0; 7987 } 7988 7989 KMP_MB(); /* Flush all pending memory write invalidates. */ 7990 KMP_ASSERT(this_thr->th.th_team == team); 7991 7992 #ifdef KMP_DEBUG 7993 for (f = 0; f < team->t.t_nproc; f++) { 7994 KMP_DEBUG_ASSERT(team->t.t_threads[f] && 7995 team->t.t_threads[f]->th.th_team_nproc == team->t.t_nproc); 7996 } 7997 #endif /* KMP_DEBUG */ 7998 7999 /* release the worker threads so they may begin working */ 8000 __kmp_fork_barrier(gtid, 0); 8001 } 8002 8003 void __kmp_internal_join(ident_t *id, int gtid, kmp_team_t *team) { 8004 kmp_info_t *this_thr = __kmp_threads[gtid]; 8005 8006 KMP_DEBUG_ASSERT(team); 8007 KMP_DEBUG_ASSERT(this_thr->th.th_team == team); 8008 KMP_ASSERT(KMP_MASTER_GTID(gtid)); 8009 KMP_MB(); /* Flush all pending memory write invalidates. */ 8010 8011 /* Join barrier after fork */ 8012 8013 #ifdef KMP_DEBUG 8014 if (__kmp_threads[gtid] && 8015 __kmp_threads[gtid]->th.th_team_nproc != team->t.t_nproc) { 8016 __kmp_printf("GTID: %d, __kmp_threads[%d]=%p\n", gtid, gtid, 8017 __kmp_threads[gtid]); 8018 __kmp_printf("__kmp_threads[%d]->th.th_team_nproc=%d, TEAM: %p, " 8019 "team->t.t_nproc=%d\n", 8020 gtid, __kmp_threads[gtid]->th.th_team_nproc, team, 8021 team->t.t_nproc); 8022 __kmp_print_structure(); 8023 } 8024 KMP_DEBUG_ASSERT(__kmp_threads[gtid] && 8025 __kmp_threads[gtid]->th.th_team_nproc == team->t.t_nproc); 8026 #endif /* KMP_DEBUG */ 8027 8028 __kmp_join_barrier(gtid); /* wait for everyone */ 8029 #if OMPT_SUPPORT 8030 if (ompt_enabled.enabled && 8031 this_thr->th.ompt_thread_info.state == ompt_state_wait_barrier_implicit) { 8032 int ds_tid = this_thr->th.th_info.ds.ds_tid; 8033 ompt_data_t *task_data = OMPT_CUR_TASK_DATA(this_thr); 8034 this_thr->th.ompt_thread_info.state = ompt_state_overhead; 8035 #if OMPT_OPTIONAL 8036 void *codeptr = NULL; 8037 if (KMP_MASTER_TID(ds_tid) && 8038 (ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) || 8039 ompt_callbacks.ompt_callback(ompt_callback_sync_region))) 8040 codeptr = OMPT_CUR_TEAM_INFO(this_thr)->master_return_address; 8041 8042 if (ompt_enabled.ompt_callback_sync_region_wait) { 8043 ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)( 8044 ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data, 8045 codeptr); 8046 } 8047 if (ompt_enabled.ompt_callback_sync_region) { 8048 ompt_callbacks.ompt_callback(ompt_callback_sync_region)( 8049 ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data, 8050 codeptr); 8051 } 8052 #endif 8053 if (!KMP_MASTER_TID(ds_tid) && ompt_enabled.ompt_callback_implicit_task) { 8054 
ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 8055 ompt_scope_end, NULL, task_data, 0, ds_tid, 8056 ompt_task_implicit); // TODO: Can this be ompt_task_initial? 8057 } 8058 } 8059 #endif 8060 8061 KMP_MB(); /* Flush all pending memory write invalidates. */ 8062 KMP_ASSERT(this_thr->th.th_team == team); 8063 } 8064 8065 /* ------------------------------------------------------------------------ */ 8066 8067 #ifdef USE_LOAD_BALANCE 8068 8069 // Return the worker threads actively spinning in the hot team, if we 8070 // are at the outermost level of parallelism. Otherwise, return 0. 8071 static int __kmp_active_hot_team_nproc(kmp_root_t *root) { 8072 int i; 8073 int retval; 8074 kmp_team_t *hot_team; 8075 8076 if (root->r.r_active) { 8077 return 0; 8078 } 8079 hot_team = root->r.r_hot_team; 8080 if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME) { 8081 return hot_team->t.t_nproc - 1; // Don't count primary thread 8082 } 8083 8084 // Skip the primary thread - it is accounted for elsewhere. 8085 retval = 0; 8086 for (i = 1; i < hot_team->t.t_nproc; i++) { 8087 if (hot_team->t.t_threads[i]->th.th_active) { 8088 retval++; 8089 } 8090 } 8091 return retval; 8092 } 8093 8094 // Perform an automatic adjustment to the number of 8095 // threads used by the next parallel region. 8096 static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc) { 8097 int retval; 8098 int pool_active; 8099 int hot_team_active; 8100 int team_curr_active; 8101 int system_active; 8102 8103 KB_TRACE(20, ("__kmp_load_balance_nproc: called root:%p set_nproc:%d\n", root, 8104 set_nproc)); 8105 KMP_DEBUG_ASSERT(root); 8106 KMP_DEBUG_ASSERT(root->r.r_root_team->t.t_threads[0] 8107 ->th.th_current_task->td_icvs.dynamic == TRUE); 8108 KMP_DEBUG_ASSERT(set_nproc > 1); 8109 8110 if (set_nproc == 1) { 8111 KB_TRACE(20, ("__kmp_load_balance_nproc: serial execution.\n")); 8112 return 1; 8113 } 8114 8115 // Threads that are active in the thread pool, active in the hot team for this 8116 // particular root (if we are at the outer par level), and the currently 8117 // executing thread (to become the primary thread) are available to add to the 8118 // new team, but are currently contributing to the system load, and must be 8119 // accounted for. 8120 pool_active = __kmp_thread_pool_active_nth; 8121 hot_team_active = __kmp_active_hot_team_nproc(root); 8122 team_curr_active = pool_active + hot_team_active + 1; 8123 8124 // Check the system load. 8125 system_active = __kmp_get_load_balance(__kmp_avail_proc + team_curr_active); 8126 KB_TRACE(30, ("__kmp_load_balance_nproc: system active = %d pool active = %d " 8127 "hot team active = %d\n", 8128 system_active, pool_active, hot_team_active)); 8129 8130 if (system_active < 0) { 8131 // There was an error reading the necessary info from /proc, so use the 8132 // thread limit algorithm instead. Once we set __kmp_global.g.g_dynamic_mode 8133 // = dynamic_thread_limit, we shouldn't wind up getting back here. 8134 __kmp_global.g.g_dynamic_mode = dynamic_thread_limit; 8135 KMP_WARNING(CantLoadBalUsing, "KMP_DYNAMIC_MODE=thread limit"); 8136 8137 // Make this call behave like the thread limit algorithm. 8138 retval = __kmp_avail_proc - __kmp_nth + 8139 (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc); 8140 if (retval > set_nproc) { 8141 retval = set_nproc; 8142 } 8143 if (retval < KMP_MIN_NTH) { 8144 retval = KMP_MIN_NTH; 8145 } 8146 8147 KB_TRACE(20, ("__kmp_load_balance_nproc: thread limit exit. 
retval:%d\n", 8148 retval)); 8149 return retval; 8150 } 8151 8152 // There is a slight delay in the load balance algorithm in detecting new 8153 // running procs. The real system load at this instant should be at least as 8154 // large as the #active omp thread that are available to add to the team. 8155 if (system_active < team_curr_active) { 8156 system_active = team_curr_active; 8157 } 8158 retval = __kmp_avail_proc - system_active + team_curr_active; 8159 if (retval > set_nproc) { 8160 retval = set_nproc; 8161 } 8162 if (retval < KMP_MIN_NTH) { 8163 retval = KMP_MIN_NTH; 8164 } 8165 8166 KB_TRACE(20, ("__kmp_load_balance_nproc: exit. retval:%d\n", retval)); 8167 return retval; 8168 } // __kmp_load_balance_nproc() 8169 8170 #endif /* USE_LOAD_BALANCE */ 8171 8172 /* ------------------------------------------------------------------------ */ 8173 8174 /* NOTE: this is called with the __kmp_init_lock held */ 8175 void __kmp_cleanup(void) { 8176 int f; 8177 8178 KA_TRACE(10, ("__kmp_cleanup: enter\n")); 8179 8180 if (TCR_4(__kmp_init_parallel)) { 8181 #if KMP_HANDLE_SIGNALS 8182 __kmp_remove_signals(); 8183 #endif 8184 TCW_4(__kmp_init_parallel, FALSE); 8185 } 8186 8187 if (TCR_4(__kmp_init_middle)) { 8188 #if KMP_AFFINITY_SUPPORTED 8189 __kmp_affinity_uninitialize(); 8190 #endif /* KMP_AFFINITY_SUPPORTED */ 8191 __kmp_cleanup_hierarchy(); 8192 TCW_4(__kmp_init_middle, FALSE); 8193 } 8194 8195 KA_TRACE(10, ("__kmp_cleanup: go serial cleanup\n")); 8196 8197 if (__kmp_init_serial) { 8198 __kmp_runtime_destroy(); 8199 __kmp_init_serial = FALSE; 8200 } 8201 8202 __kmp_cleanup_threadprivate_caches(); 8203 8204 for (f = 0; f < __kmp_threads_capacity; f++) { 8205 if (__kmp_root[f] != NULL) { 8206 __kmp_free(__kmp_root[f]); 8207 __kmp_root[f] = NULL; 8208 } 8209 } 8210 __kmp_free(__kmp_threads); 8211 // __kmp_threads and __kmp_root were allocated at once, as single block, so 8212 // there is no need in freeing __kmp_root. 8213 __kmp_threads = NULL; 8214 __kmp_root = NULL; 8215 __kmp_threads_capacity = 0; 8216 8217 // Free old __kmp_threads arrays if they exist. 
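// (These are presumably the arrays retired when the thread array was grown at
// runtime; they stay chained on __kmp_old_threads_list so that late readers of
// a stale pointer remain valid, and are only reclaimed here at final cleanup.)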
8218 kmp_old_threads_list_t *ptr = __kmp_old_threads_list; 8219 while (ptr) { 8220 kmp_old_threads_list_t *next = ptr->next; 8221 __kmp_free(ptr->threads); 8222 __kmp_free(ptr); 8223 ptr = next; 8224 } 8225 8226 #if KMP_USE_DYNAMIC_LOCK 8227 __kmp_cleanup_indirect_user_locks(); 8228 #else 8229 __kmp_cleanup_user_locks(); 8230 #endif 8231 #if OMPD_SUPPORT 8232 if (ompd_state) { 8233 __kmp_free(ompd_env_block); 8234 ompd_env_block = NULL; 8235 ompd_env_block_size = 0; 8236 } 8237 #endif 8238 8239 #if KMP_AFFINITY_SUPPORTED 8240 KMP_INTERNAL_FREE(CCAST(char *, __kmp_cpuinfo_file)); 8241 __kmp_cpuinfo_file = NULL; 8242 #endif /* KMP_AFFINITY_SUPPORTED */ 8243 8244 #if KMP_USE_ADAPTIVE_LOCKS 8245 #if KMP_DEBUG_ADAPTIVE_LOCKS 8246 __kmp_print_speculative_stats(); 8247 #endif 8248 #endif 8249 KMP_INTERNAL_FREE(__kmp_nested_nth.nth); 8250 __kmp_nested_nth.nth = NULL; 8251 __kmp_nested_nth.size = 0; 8252 __kmp_nested_nth.used = 0; 8253 KMP_INTERNAL_FREE(__kmp_nested_proc_bind.bind_types); 8254 __kmp_nested_proc_bind.bind_types = NULL; 8255 __kmp_nested_proc_bind.size = 0; 8256 __kmp_nested_proc_bind.used = 0; 8257 if (__kmp_affinity_format) { 8258 KMP_INTERNAL_FREE(__kmp_affinity_format); 8259 __kmp_affinity_format = NULL; 8260 } 8261 8262 __kmp_i18n_catclose(); 8263 8264 #if KMP_USE_HIER_SCHED 8265 __kmp_hier_scheds.deallocate(); 8266 #endif 8267 8268 #if KMP_STATS_ENABLED 8269 __kmp_stats_fini(); 8270 #endif 8271 8272 KA_TRACE(10, ("__kmp_cleanup: exit\n")); 8273 } 8274 8275 /* ------------------------------------------------------------------------ */ 8276 8277 int __kmp_ignore_mppbeg(void) { 8278 char *env; 8279 8280 if ((env = getenv("KMP_IGNORE_MPPBEG")) != NULL) { 8281 if (__kmp_str_match_false(env)) 8282 return FALSE; 8283 } 8284 // By default __kmpc_begin() is no-op. 8285 return TRUE; 8286 } 8287 8288 int __kmp_ignore_mppend(void) { 8289 char *env; 8290 8291 if ((env = getenv("KMP_IGNORE_MPPEND")) != NULL) { 8292 if (__kmp_str_match_false(env)) 8293 return FALSE; 8294 } 8295 // By default __kmpc_end() is no-op. 
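// Illustrative usage note: with KMP_IGNORE_MPPEND=false (or 0) in the
// environment, __kmp_str_match_false() succeeds above and FALSE is returned,
// so __kmpc_end() is honored; any other setting (or none) keeps the no-op.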
8296 return TRUE; 8297 } 8298 8299 void __kmp_internal_begin(void) { 8300 int gtid; 8301 kmp_root_t *root; 8302 8303 /* this is a very important step as it will register new sibling threads 8304 and assign these new uber threads a new gtid */ 8305 gtid = __kmp_entry_gtid(); 8306 root = __kmp_threads[gtid]->th.th_root; 8307 KMP_ASSERT(KMP_UBER_GTID(gtid)); 8308 8309 if (root->r.r_begin) 8310 return; 8311 __kmp_acquire_lock(&root->r.r_begin_lock, gtid); 8312 if (root->r.r_begin) { 8313 __kmp_release_lock(&root->r.r_begin_lock, gtid); 8314 return; 8315 } 8316 8317 root->r.r_begin = TRUE; 8318 8319 __kmp_release_lock(&root->r.r_begin_lock, gtid); 8320 } 8321 8322 /* ------------------------------------------------------------------------ */ 8323 8324 void __kmp_user_set_library(enum library_type arg) { 8325 int gtid; 8326 kmp_root_t *root; 8327 kmp_info_t *thread; 8328 8329 /* first, make sure we are initialized so we can get our gtid */ 8330 8331 gtid = __kmp_entry_gtid(); 8332 thread = __kmp_threads[gtid]; 8333 8334 root = thread->th.th_root; 8335 8336 KA_TRACE(20, ("__kmp_user_set_library: enter T#%d, arg: %d, %d\n", gtid, arg, 8337 library_serial)); 8338 if (root->r.r_in_parallel) { /* Must be called in serial section of top-level 8339 thread */ 8340 KMP_WARNING(SetLibraryIncorrectCall); 8341 return; 8342 } 8343 8344 switch (arg) { 8345 case library_serial: 8346 thread->th.th_set_nproc = 0; 8347 set__nproc(thread, 1); 8348 break; 8349 case library_turnaround: 8350 thread->th.th_set_nproc = 0; 8351 set__nproc(thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth 8352 : __kmp_dflt_team_nth_ub); 8353 break; 8354 case library_throughput: 8355 thread->th.th_set_nproc = 0; 8356 set__nproc(thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth 8357 : __kmp_dflt_team_nth_ub); 8358 break; 8359 default: 8360 KMP_FATAL(UnknownLibraryType, arg); 8361 } 8362 8363 __kmp_aux_set_library(arg); 8364 } 8365 8366 void __kmp_aux_set_stacksize(size_t arg) { 8367 if (!__kmp_init_serial) 8368 __kmp_serial_initialize(); 8369 8370 #if KMP_OS_DARWIN 8371 if (arg & (0x1000 - 1)) { 8372 arg &= ~(0x1000 - 1); 8373 if (arg + 0x1000) /* check for overflow if we round up */ 8374 arg += 0x1000; 8375 } 8376 #endif 8377 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock); 8378 8379 /* only change the default stacksize before the first parallel region */ 8380 if (!TCR_4(__kmp_init_parallel)) { 8381 size_t value = arg; /* argument is in bytes */ 8382 8383 if (value < __kmp_sys_min_stksize) 8384 value = __kmp_sys_min_stksize; 8385 else if (value > KMP_MAX_STKSIZE) 8386 value = KMP_MAX_STKSIZE; 8387 8388 __kmp_stksize = value; 8389 8390 __kmp_env_stksize = TRUE; /* was KMP_STACKSIZE specified? */ 8391 } 8392 8393 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 8394 } 8395 8396 /* set the behaviour of the runtime library */ 8397 /* TODO this can cause some odd behaviour with sibling parallelism... 
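(Illustrative mapping, not a definitive reference: these modes are the ones
   normally selected via KMP_LIBRARY=serial|turnaround|throughput or the
   kmp_set_library_*() service entry points; e.g. turnaround mode below makes
   workers yield only when the machine is oversubscribed, while throughput mode
   falls back to the default finite blocktime.)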
*/ 8398 void __kmp_aux_set_library(enum library_type arg) { 8399 __kmp_library = arg; 8400 8401 switch (__kmp_library) { 8402 case library_serial: { 8403 KMP_INFORM(LibraryIsSerial); 8404 } break; 8405 case library_turnaround: 8406 if (__kmp_use_yield == 1 && !__kmp_use_yield_exp_set) 8407 __kmp_use_yield = 2; // only yield when oversubscribed 8408 break; 8409 case library_throughput: 8410 if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME) 8411 __kmp_dflt_blocktime = KMP_DEFAULT_BLOCKTIME; 8412 break; 8413 default: 8414 KMP_FATAL(UnknownLibraryType, arg); 8415 } 8416 } 8417 8418 /* Getting team information common for all team API */ 8419 // Returns NULL if not in teams construct 8420 static kmp_team_t *__kmp_aux_get_team_info(int &teams_serialized) { 8421 kmp_info_t *thr = __kmp_entry_thread(); 8422 teams_serialized = 0; 8423 if (thr->th.th_teams_microtask) { 8424 kmp_team_t *team = thr->th.th_team; 8425 int tlevel = thr->th.th_teams_level; // the level of the teams construct 8426 int ii = team->t.t_level; 8427 teams_serialized = team->t.t_serialized; 8428 int level = tlevel + 1; 8429 KMP_DEBUG_ASSERT(ii >= tlevel); 8430 while (ii > level) { 8431 for (teams_serialized = team->t.t_serialized; 8432 (teams_serialized > 0) && (ii > level); teams_serialized--, ii--) { 8433 } 8434 if (team->t.t_serialized && (!teams_serialized)) { 8435 team = team->t.t_parent; 8436 continue; 8437 } 8438 if (ii > level) { 8439 team = team->t.t_parent; 8440 ii--; 8441 } 8442 } 8443 return team; 8444 } 8445 return NULL; 8446 } 8447 8448 int __kmp_aux_get_team_num() { 8449 int serialized; 8450 kmp_team_t *team = __kmp_aux_get_team_info(serialized); 8451 if (team) { 8452 if (serialized > 1) { 8453 return 0; // teams region is serialized ( 1 team of 1 thread ). 8454 } else { 8455 return team->t.t_master_tid; 8456 } 8457 } 8458 return 0; 8459 } 8460 8461 int __kmp_aux_get_num_teams() { 8462 int serialized; 8463 kmp_team_t *team = __kmp_aux_get_team_info(serialized); 8464 if (team) { 8465 if (serialized > 1) { 8466 return 1; 8467 } else { 8468 return team->t.t_parent->t.t_nproc; 8469 } 8470 } 8471 return 1; 8472 } 8473 8474 /* ------------------------------------------------------------------------ */ 8475 8476 /* 8477 * Affinity Format Parser 8478 * 8479 * Field is in form of: %[[[0].]size]type 8480 * % and type are required (%% means print a literal '%') 8481 * type is either single char or long name surrounded by {}, 8482 * e.g., N or {num_threads} 8483 * 0 => leading zeros 8484 * . => right justified when size is specified 8485 * by default output is left justified 8486 * size is the *minimum* field length 8487 * All other characters are printed as is 8488 * 8489 * Available field types: 8490 * L {thread_level} - omp_get_level() 8491 * n {thread_num} - omp_get_thread_num() 8492 * h {host} - name of host machine 8493 * P {process_id} - process id (integer) 8494 * T {thread_identifier} - native thread identifier (integer) 8495 * N {num_threads} - omp_get_num_threads() 8496 * A {ancestor_tnum} - omp_get_ancestor_thread_num(omp_get_level()-1) 8497 * a {thread_affinity} - comma separated list of integers or integer ranges 8498 * (values of affinity mask) 8499 * 8500 * Implementation-specific field types can be added 8501 * If a type is unknown, print "undefined" 8502 */ 8503 8504 // Structure holding the short name, long name, and corresponding data type 8505 // for snprintf. A table of these will represent the entire valid keyword 8506 // field types. 
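// Illustrative examples of the grammar above (typical usage would be via
// OMP_AFFINITY_FORMAT or omp_set_affinity_format(); both are assumed here and
// not defined in this file):
//   "host %H pid %P tid %i thread %n of %N bound to %A"
//       expands the host name, process id, native thread id, OpenMP thread
//       number, team size, and the affinity mask list
//   "%0.8n"          thread number, right justified, zero padded, width >= 8
//   "%{thread_num}"  long-name spelling of %n
// Unknown field types print the literal string "undefined".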
8507 typedef struct kmp_affinity_format_field_t { 8508 char short_name; // from spec e.g., L -> thread level 8509 const char *long_name; // from spec thread_level -> thread level 8510 char field_format; // data type for snprintf (typically 'd' or 's' 8511 // for integer or string) 8512 } kmp_affinity_format_field_t; 8513 8514 static const kmp_affinity_format_field_t __kmp_affinity_format_table[] = { 8515 #if KMP_AFFINITY_SUPPORTED 8516 {'A', "thread_affinity", 's'}, 8517 #endif 8518 {'t', "team_num", 'd'}, 8519 {'T', "num_teams", 'd'}, 8520 {'L', "nesting_level", 'd'}, 8521 {'n', "thread_num", 'd'}, 8522 {'N', "num_threads", 'd'}, 8523 {'a', "ancestor_tnum", 'd'}, 8524 {'H', "host", 's'}, 8525 {'P', "process_id", 'd'}, 8526 {'i', "native_thread_id", 'd'}}; 8527 8528 // Return the number of characters it takes to hold field 8529 static int __kmp_aux_capture_affinity_field(int gtid, const kmp_info_t *th, 8530 const char **ptr, 8531 kmp_str_buf_t *field_buffer) { 8532 int rc, format_index, field_value; 8533 const char *width_left, *width_right; 8534 bool pad_zeros, right_justify, parse_long_name, found_valid_name; 8535 static const int FORMAT_SIZE = 20; 8536 char format[FORMAT_SIZE] = {0}; 8537 char absolute_short_name = 0; 8538 8539 KMP_DEBUG_ASSERT(gtid >= 0); 8540 KMP_DEBUG_ASSERT(th); 8541 KMP_DEBUG_ASSERT(**ptr == '%'); 8542 KMP_DEBUG_ASSERT(field_buffer); 8543 8544 __kmp_str_buf_clear(field_buffer); 8545 8546 // Skip the initial % 8547 (*ptr)++; 8548 8549 // Check for %% first 8550 if (**ptr == '%') { 8551 __kmp_str_buf_cat(field_buffer, "%", 1); 8552 (*ptr)++; // skip over the second % 8553 return 1; 8554 } 8555 8556 // Parse field modifiers if they are present 8557 pad_zeros = false; 8558 if (**ptr == '0') { 8559 pad_zeros = true; 8560 (*ptr)++; // skip over 0 8561 } 8562 right_justify = false; 8563 if (**ptr == '.') { 8564 right_justify = true; 8565 (*ptr)++; // skip over . 8566 } 8567 // Parse width of field: [width_left, width_right) 8568 width_left = width_right = NULL; 8569 if (**ptr >= '0' && **ptr <= '9') { 8570 width_left = *ptr; 8571 SKIP_DIGITS(*ptr); 8572 width_right = *ptr; 8573 } 8574 8575 // Create the format for KMP_SNPRINTF based on flags parsed above 8576 format_index = 0; 8577 format[format_index++] = '%'; 8578 if (!right_justify) 8579 format[format_index++] = '-'; 8580 if (pad_zeros) 8581 format[format_index++] = '0'; 8582 if (width_left && width_right) { 8583 int i = 0; 8584 // Only allow 8 digit number widths. 
8585 // This also prevents overflowing format variable 8586 while (i < 8 && width_left < width_right) { 8587 format[format_index++] = *width_left; 8588 width_left++; 8589 i++; 8590 } 8591 } 8592 8593 // Parse a name (long or short) 8594 // Canonicalize the name into absolute_short_name 8595 found_valid_name = false; 8596 parse_long_name = (**ptr == '{'); 8597 if (parse_long_name) 8598 (*ptr)++; // skip initial left brace 8599 for (size_t i = 0; i < sizeof(__kmp_affinity_format_table) / 8600 sizeof(__kmp_affinity_format_table[0]); 8601 ++i) { 8602 char short_name = __kmp_affinity_format_table[i].short_name; 8603 const char *long_name = __kmp_affinity_format_table[i].long_name; 8604 char field_format = __kmp_affinity_format_table[i].field_format; 8605 if (parse_long_name) { 8606 size_t length = KMP_STRLEN(long_name); 8607 if (strncmp(*ptr, long_name, length) == 0) { 8608 found_valid_name = true; 8609 (*ptr) += length; // skip the long name 8610 } 8611 } else if (**ptr == short_name) { 8612 found_valid_name = true; 8613 (*ptr)++; // skip the short name 8614 } 8615 if (found_valid_name) { 8616 format[format_index++] = field_format; 8617 format[format_index++] = '\0'; 8618 absolute_short_name = short_name; 8619 break; 8620 } 8621 } 8622 if (parse_long_name) { 8623 if (**ptr != '}') { 8624 absolute_short_name = 0; 8625 } else { 8626 (*ptr)++; // skip over the right brace 8627 } 8628 } 8629 8630 // Attempt to fill the buffer with the requested 8631 // value using snprintf within __kmp_str_buf_print() 8632 switch (absolute_short_name) { 8633 case 't': 8634 rc = __kmp_str_buf_print(field_buffer, format, __kmp_aux_get_team_num()); 8635 break; 8636 case 'T': 8637 rc = __kmp_str_buf_print(field_buffer, format, __kmp_aux_get_num_teams()); 8638 break; 8639 case 'L': 8640 rc = __kmp_str_buf_print(field_buffer, format, th->th.th_team->t.t_level); 8641 break; 8642 case 'n': 8643 rc = __kmp_str_buf_print(field_buffer, format, __kmp_tid_from_gtid(gtid)); 8644 break; 8645 case 'H': { 8646 static const int BUFFER_SIZE = 256; 8647 char buf[BUFFER_SIZE]; 8648 __kmp_expand_host_name(buf, BUFFER_SIZE); 8649 rc = __kmp_str_buf_print(field_buffer, format, buf); 8650 } break; 8651 case 'P': 8652 rc = __kmp_str_buf_print(field_buffer, format, getpid()); 8653 break; 8654 case 'i': 8655 rc = __kmp_str_buf_print(field_buffer, format, __kmp_gettid()); 8656 break; 8657 case 'N': 8658 rc = __kmp_str_buf_print(field_buffer, format, th->th.th_team->t.t_nproc); 8659 break; 8660 case 'a': 8661 field_value = 8662 __kmp_get_ancestor_thread_num(gtid, th->th.th_team->t.t_level - 1); 8663 rc = __kmp_str_buf_print(field_buffer, format, field_value); 8664 break; 8665 #if KMP_AFFINITY_SUPPORTED 8666 case 'A': { 8667 kmp_str_buf_t buf; 8668 __kmp_str_buf_init(&buf); 8669 __kmp_affinity_str_buf_mask(&buf, th->th.th_affin_mask); 8670 rc = __kmp_str_buf_print(field_buffer, format, buf.str); 8671 __kmp_str_buf_free(&buf); 8672 } break; 8673 #endif 8674 default: 8675 // According to spec, If an implementation does not have info for field 8676 // type, then "undefined" is printed 8677 rc = __kmp_str_buf_print(field_buffer, "%s", "undefined"); 8678 // Skip the field 8679 if (parse_long_name) { 8680 SKIP_TOKEN(*ptr); 8681 if (**ptr == '}') 8682 (*ptr)++; 8683 } else { 8684 (*ptr)++; 8685 } 8686 } 8687 8688 KMP_ASSERT(format_index <= FORMAT_SIZE); 8689 return rc; 8690 } 8691 8692 /* 8693 * Return number of characters needed to hold the affinity string 8694 * (not including null byte character) 8695 * The resultant string is printed to buffer, 
which the caller can then 8696 * handle afterwards 8697 */ 8698 size_t __kmp_aux_capture_affinity(int gtid, const char *format, 8699 kmp_str_buf_t *buffer) { 8700 const char *parse_ptr; 8701 size_t retval; 8702 const kmp_info_t *th; 8703 kmp_str_buf_t field; 8704 8705 KMP_DEBUG_ASSERT(buffer); 8706 KMP_DEBUG_ASSERT(gtid >= 0); 8707 8708 __kmp_str_buf_init(&field); 8709 __kmp_str_buf_clear(buffer); 8710 8711 th = __kmp_threads[gtid]; 8712 retval = 0; 8713 8714 // If format is NULL or zero-length string, then we use 8715 // affinity-format-var ICV 8716 parse_ptr = format; 8717 if (parse_ptr == NULL || *parse_ptr == '\0') { 8718 parse_ptr = __kmp_affinity_format; 8719 } 8720 KMP_DEBUG_ASSERT(parse_ptr); 8721 8722 while (*parse_ptr != '\0') { 8723 // Parse a field 8724 if (*parse_ptr == '%') { 8725 // Put field in the buffer 8726 int rc = __kmp_aux_capture_affinity_field(gtid, th, &parse_ptr, &field); 8727 __kmp_str_buf_catbuf(buffer, &field); 8728 retval += rc; 8729 } else { 8730 // Put literal character in buffer 8731 __kmp_str_buf_cat(buffer, parse_ptr, 1); 8732 retval++; 8733 parse_ptr++; 8734 } 8735 } 8736 __kmp_str_buf_free(&field); 8737 return retval; 8738 } 8739 8740 // Displays the affinity string to stdout 8741 void __kmp_aux_display_affinity(int gtid, const char *format) { 8742 kmp_str_buf_t buf; 8743 __kmp_str_buf_init(&buf); 8744 __kmp_aux_capture_affinity(gtid, format, &buf); 8745 __kmp_fprintf(kmp_out, "%s" KMP_END_OF_LINE, buf.str); 8746 __kmp_str_buf_free(&buf); 8747 } 8748 8749 /* ------------------------------------------------------------------------ */ 8750 8751 void __kmp_aux_set_blocktime(int arg, kmp_info_t *thread, int tid) { 8752 int blocktime = arg; /* argument is in milliseconds */ 8753 #if KMP_USE_MONITOR 8754 int bt_intervals; 8755 #endif 8756 kmp_int8 bt_set; 8757 8758 __kmp_save_internal_controls(thread); 8759 8760 /* Normalize and set blocktime for the teams */ 8761 if (blocktime < KMP_MIN_BLOCKTIME) 8762 blocktime = KMP_MIN_BLOCKTIME; 8763 else if (blocktime > KMP_MAX_BLOCKTIME) 8764 blocktime = KMP_MAX_BLOCKTIME; 8765 8766 set__blocktime_team(thread->th.th_team, tid, blocktime); 8767 set__blocktime_team(thread->th.th_serial_team, 0, blocktime); 8768 8769 #if KMP_USE_MONITOR 8770 /* Calculate and set blocktime intervals for the teams */ 8771 bt_intervals = KMP_INTERVALS_FROM_BLOCKTIME(blocktime, __kmp_monitor_wakeups); 8772 8773 set__bt_intervals_team(thread->th.th_team, tid, bt_intervals); 8774 set__bt_intervals_team(thread->th.th_serial_team, 0, bt_intervals); 8775 #endif 8776 8777 /* Set whether blocktime has been set to "TRUE" */ 8778 bt_set = TRUE; 8779 8780 set__bt_set_team(thread->th.th_team, tid, bt_set); 8781 set__bt_set_team(thread->th.th_serial_team, 0, bt_set); 8782 #if KMP_USE_MONITOR 8783 KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d, " 8784 "bt_intervals=%d, monitor_updates=%d\n", 8785 __kmp_gtid_from_tid(tid, thread->th.th_team), 8786 thread->th.th_team->t.t_id, tid, blocktime, bt_intervals, 8787 __kmp_monitor_wakeups)); 8788 #else 8789 KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d\n", 8790 __kmp_gtid_from_tid(tid, thread->th.th_team), 8791 thread->th.th_team->t.t_id, tid, blocktime)); 8792 #endif 8793 } 8794 8795 void __kmp_aux_set_defaults(char const *str, size_t len) { 8796 if (!__kmp_init_serial) { 8797 __kmp_serial_initialize(); 8798 } 8799 __kmp_env_initialize(str); 8800 8801 if (__kmp_settings || __kmp_display_env || __kmp_display_env_verbose) { 8802 __kmp_env_print(); 8803 } 8804 } // __kmp_aux_set_defaults 
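/* Illustrative sketch only (the buffer helpers above are the runtime's own,
   but this exact call sequence is an assumption, not an existing entry point):
   a caller that wants the expanded affinity string rather than stdout output
   could drive the capture routine directly:

     kmp_str_buf_t buf;
     __kmp_str_buf_init(&buf);
     // NULL or empty format falls back to the affinity-format-var ICV
     size_t len = __kmp_aux_capture_affinity(gtid, "thread %n bound to %A", &buf);
     // buf.str now holds the expanded string of length len (no trailing newline)
     __kmp_str_buf_free(&buf);

   __kmp_aux_display_affinity() above is the stdout variant of the same idea. */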
8805 8806 /* ------------------------------------------------------------------------ */ 8807 /* internal fast reduction routines */ 8808 8809 PACKED_REDUCTION_METHOD_T 8810 __kmp_determine_reduction_method( 8811 ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars, size_t reduce_size, 8812 void *reduce_data, void (*reduce_func)(void *lhs_data, void *rhs_data), 8813 kmp_critical_name *lck) { 8814 8815 // Default reduction method: critical construct ( lck != NULL, like in current 8816 // PAROPT ) 8817 // If ( reduce_data!=NULL && reduce_func!=NULL ): the tree-reduction method 8818 // can be selected by RTL 8819 // If loc->flags contains KMP_IDENT_ATOMIC_REDUCE, the atomic reduce method 8820 // can be selected by RTL 8821 // Finally, it's up to OpenMP RTL to make a decision on which method to select 8822 // among generated by PAROPT. 8823 8824 PACKED_REDUCTION_METHOD_T retval; 8825 8826 int team_size; 8827 8828 KMP_DEBUG_ASSERT(loc); // it would be nice to test ( loc != 0 ) 8829 KMP_DEBUG_ASSERT(lck); // it would be nice to test ( lck != 0 ) 8830 8831 #define FAST_REDUCTION_ATOMIC_METHOD_GENERATED \ 8832 (loc && \ 8833 ((loc->flags & (KMP_IDENT_ATOMIC_REDUCE)) == (KMP_IDENT_ATOMIC_REDUCE))) 8834 #define FAST_REDUCTION_TREE_METHOD_GENERATED ((reduce_data) && (reduce_func)) 8835 8836 retval = critical_reduce_block; 8837 8838 // another choice of getting a team size (with 1 dynamic deference) is slower 8839 team_size = __kmp_get_team_num_threads(global_tid); 8840 if (team_size == 1) { 8841 8842 retval = empty_reduce_block; 8843 8844 } else { 8845 8846 int atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED; 8847 8848 #if KMP_ARCH_X86_64 || KMP_ARCH_PPC64 || KMP_ARCH_AARCH64 || \ 8849 KMP_ARCH_MIPS64 || KMP_ARCH_RISCV64 || KMP_ARCH_LOONGARCH64 8850 8851 #if KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD || \ 8852 KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN || KMP_OS_HURD 8853 8854 int teamsize_cutoff = 4; 8855 8856 #if KMP_MIC_SUPPORTED 8857 if (__kmp_mic_type != non_mic) { 8858 teamsize_cutoff = 8; 8859 } 8860 #endif 8861 int tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED; 8862 if (tree_available) { 8863 if (team_size <= teamsize_cutoff) { 8864 if (atomic_available) { 8865 retval = atomic_reduce_block; 8866 } 8867 } else { 8868 retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER; 8869 } 8870 } else if (atomic_available) { 8871 retval = atomic_reduce_block; 8872 } 8873 #else 8874 #error "Unknown or unsupported OS" 8875 #endif // KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD || 8876 // KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN || KMP_OS_HURD 8877 8878 #elif KMP_ARCH_X86 || KMP_ARCH_ARM || KMP_ARCH_AARCH || KMP_ARCH_MIPS 8879 8880 #if KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_WINDOWS || KMP_OS_HURD 8881 8882 // basic tuning 8883 8884 if (atomic_available) { 8885 if (num_vars <= 2) { // && ( team_size <= 8 ) due to false-sharing ??? 
8886 retval = atomic_reduce_block; 8887 } 8888 } // otherwise: use critical section 8889 8890 #elif KMP_OS_DARWIN 8891 8892 int tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED; 8893 if (atomic_available && (num_vars <= 3)) { 8894 retval = atomic_reduce_block; 8895 } else if (tree_available) { 8896 if ((reduce_size > (9 * sizeof(kmp_real64))) && 8897 (reduce_size < (2000 * sizeof(kmp_real64)))) { 8898 retval = TREE_REDUCE_BLOCK_WITH_PLAIN_BARRIER; 8899 } 8900 } // otherwise: use critical section 8901 8902 #else 8903 #error "Unknown or unsupported OS" 8904 #endif 8905 8906 #else 8907 #error "Unknown or unsupported architecture" 8908 #endif 8909 } 8910 8911 // KMP_FORCE_REDUCTION 8912 8913 // If the team is serialized (team_size == 1), ignore the forced reduction 8914 // method and stay with the unsynchronized method (empty_reduce_block) 8915 if (__kmp_force_reduction_method != reduction_method_not_defined && 8916 team_size != 1) { 8917 8918 PACKED_REDUCTION_METHOD_T forced_retval = critical_reduce_block; 8919 8920 int atomic_available, tree_available; 8921 8922 switch ((forced_retval = __kmp_force_reduction_method)) { 8923 case critical_reduce_block: 8924 KMP_ASSERT(lck); // lck should be != 0 8925 break; 8926 8927 case atomic_reduce_block: 8928 atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED; 8929 if (!atomic_available) { 8930 KMP_WARNING(RedMethodNotSupported, "atomic"); 8931 forced_retval = critical_reduce_block; 8932 } 8933 break; 8934 8935 case tree_reduce_block: 8936 tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED; 8937 if (!tree_available) { 8938 KMP_WARNING(RedMethodNotSupported, "tree"); 8939 forced_retval = critical_reduce_block; 8940 } else { 8941 #if KMP_FAST_REDUCTION_BARRIER 8942 forced_retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER; 8943 #endif 8944 } 8945 break; 8946 8947 default: 8948 KMP_ASSERT(0); // "unsupported method specified" 8949 } 8950 8951 retval = forced_retval; 8952 } 8953 8954 KA_TRACE(10, ("reduction method selected=%08x\n", retval)); 8955 8956 #undef FAST_REDUCTION_TREE_METHOD_GENERATED 8957 #undef FAST_REDUCTION_ATOMIC_METHOD_GENERATED 8958 8959 return (retval); 8960 } 8961 // this function is for testing set/get/determine reduce method 8962 kmp_int32 __kmp_get_reduce_method(void) { 8963 return ((__kmp_entry_thread()->th.th_local.packed_reduction_method) >> 8); 8964 } 8965 8966 // Soft pause sets up threads to ignore blocktime and just go to sleep. 8967 // Spin-wait code checks __kmp_pause_status and reacts accordingly. 8968 void __kmp_soft_pause() { __kmp_pause_status = kmp_soft_paused; } 8969 8970 // Hard pause shuts down the runtime completely. Resume happens naturally when 8971 // OpenMP is used subsequently. 8972 void __kmp_hard_pause() { 8973 __kmp_pause_status = kmp_hard_paused; 8974 __kmp_internal_end_thread(-1); 8975 } 8976 8977 // Soft resume sets __kmp_pause_status, and wakes up all threads. 
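// Usage sketch for the pause API above (illustrative only; assumes the OpenMP
// 5.0 omp_pause_resource_all() entry point, which is expected to route into
// __kmp_pause_resource() further below):
//   omp_pause_resource_all(omp_pause_soft); // workers stop spinning and sleep
//   /* quiet, non-OpenMP phase of the application */
//   #pragma omp parallel                    // next region wakes the runtime up
//   { /* ... */ }
// A hard pause additionally shuts the runtime down; it is re-initialized when
// OpenMP is used again. The function below performs the soft-resume wake-up.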
8978 void __kmp_resume_if_soft_paused() { 8979 if (__kmp_pause_status == kmp_soft_paused) { 8980 __kmp_pause_status = kmp_not_paused; 8981 8982 for (int gtid = 1; gtid < __kmp_threads_capacity; ++gtid) { 8983 kmp_info_t *thread = __kmp_threads[gtid]; 8984 if (thread) { // Wake it if sleeping 8985 kmp_flag_64<> fl(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go, 8986 thread); 8987 if (fl.is_sleeping()) 8988 fl.resume(gtid); 8989 else if (__kmp_try_suspend_mx(thread)) { // got suspend lock 8990 __kmp_unlock_suspend_mx(thread); // unlock it; it won't sleep 8991 } else { // thread holds the lock and may sleep soon 8992 do { // until either the thread sleeps, or we can get the lock 8993 if (fl.is_sleeping()) { 8994 fl.resume(gtid); 8995 break; 8996 } else if (__kmp_try_suspend_mx(thread)) { 8997 __kmp_unlock_suspend_mx(thread); 8998 break; 8999 } 9000 } while (1); 9001 } 9002 } 9003 } 9004 } 9005 } 9006 9007 // This function is called via __kmpc_pause_resource. Returns 0 if successful. 9008 // TODO: add warning messages 9009 int __kmp_pause_resource(kmp_pause_status_t level) { 9010 if (level == kmp_not_paused) { // requesting resume 9011 if (__kmp_pause_status == kmp_not_paused) { 9012 // error message about runtime not being paused, so can't resume 9013 return 1; 9014 } else { 9015 KMP_DEBUG_ASSERT(__kmp_pause_status == kmp_soft_paused || 9016 __kmp_pause_status == kmp_hard_paused); 9017 __kmp_pause_status = kmp_not_paused; 9018 return 0; 9019 } 9020 } else if (level == kmp_soft_paused) { // requesting soft pause 9021 if (__kmp_pause_status != kmp_not_paused) { 9022 // error message about already being paused 9023 return 1; 9024 } else { 9025 __kmp_soft_pause(); 9026 return 0; 9027 } 9028 } else if (level == kmp_hard_paused) { // requesting hard pause 9029 if (__kmp_pause_status != kmp_not_paused) { 9030 // error message about already being paused 9031 return 1; 9032 } else { 9033 __kmp_hard_pause(); 9034 return 0; 9035 } 9036 } else { 9037 // error message about invalid level 9038 return 1; 9039 } 9040 } 9041 9042 void __kmp_omp_display_env(int verbose) { 9043 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock); 9044 if (__kmp_init_serial == 0) 9045 __kmp_do_serial_initialize(); 9046 __kmp_display_env_impl(!verbose, verbose); 9047 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 9048 } 9049 9050 // The team size is changing, so distributed barrier must be modified 9051 void __kmp_resize_dist_barrier(kmp_team_t *team, int old_nthreads, 9052 int new_nthreads) { 9053 KMP_DEBUG_ASSERT(__kmp_barrier_release_pattern[bs_forkjoin_barrier] == 9054 bp_dist_bar); 9055 kmp_info_t **other_threads = team->t.t_threads; 9056 9057 // We want all the workers to stop waiting on the barrier while we adjust the 9058 // size of the team. 
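// Reading aid (state values inferred from the code here and in
// __kmp_add_threads_to_team() below):
//   th_used_in_team == 0  thread is not part of the team
//   th_used_in_team == 1  thread is an active member of the team
//   th_used_in_team == 2  primary thread asked it to leave; worker moves 2 -> 0
//   th_used_in_team == 3  primary thread asked it to (re)join; worker moves 3 -> 1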
9059 for (int f = 1; f < old_nthreads; ++f) { 9060 KMP_DEBUG_ASSERT(other_threads[f] != NULL); 9061 // Ignore threads that are already inactive or not present in the team 9062 if (team->t.t_threads[f]->th.th_used_in_team.load() == 0) { 9063 // teams construct causes thread_limit to get passed in, and some of 9064 // those could be inactive; just ignore them 9065 continue; 9066 } 9067 // If thread is transitioning still to in_use state, wait for it 9068 if (team->t.t_threads[f]->th.th_used_in_team.load() == 3) { 9069 while (team->t.t_threads[f]->th.th_used_in_team.load() == 3) 9070 KMP_CPU_PAUSE(); 9071 } 9072 // The thread should be in_use now 9073 KMP_DEBUG_ASSERT(team->t.t_threads[f]->th.th_used_in_team.load() == 1); 9074 // Transition to unused state 9075 team->t.t_threads[f]->th.th_used_in_team.store(2); 9076 KMP_DEBUG_ASSERT(team->t.t_threads[f]->th.th_used_in_team.load() == 2); 9077 } 9078 // Release all the workers 9079 team->t.b->go_release(); 9080 9081 KMP_MFENCE(); 9082 9083 // Workers should see transition status 2 and move to 0; but may need to be 9084 // woken up first 9085 int count = old_nthreads - 1; 9086 while (count > 0) { 9087 count = old_nthreads - 1; 9088 for (int f = 1; f < old_nthreads; ++f) { 9089 if (other_threads[f]->th.th_used_in_team.load() != 0) { 9090 if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) { // Wake up the workers 9091 kmp_atomic_flag_64<> *flag = (kmp_atomic_flag_64<> *)CCAST( 9092 void *, other_threads[f]->th.th_sleep_loc); 9093 __kmp_atomic_resume_64(other_threads[f]->th.th_info.ds.ds_gtid, flag); 9094 } 9095 } else { 9096 KMP_DEBUG_ASSERT(team->t.t_threads[f]->th.th_used_in_team.load() == 0); 9097 count--; 9098 } 9099 } 9100 } 9101 // Now update the barrier size 9102 team->t.b->update_num_threads(new_nthreads); 9103 team->t.b->go_reset(); 9104 } 9105 9106 void __kmp_add_threads_to_team(kmp_team_t *team, int new_nthreads) { 9107 // Add the threads back to the team 9108 KMP_DEBUG_ASSERT(team); 9109 // Threads were paused and pointed at th_used_in_team temporarily during a 9110 // resize of the team. We're going to set th_used_in_team to 3 to indicate to 9111 // the thread that it should transition itself back into the team. Then, if 9112 // blocktime isn't infinite, the thread could be sleeping, so we send a resume 9113 // to wake it up. 9114 for (int f = 1; f < new_nthreads; ++f) { 9115 KMP_DEBUG_ASSERT(team->t.t_threads[f]); 9116 KMP_COMPARE_AND_STORE_ACQ32(&(team->t.t_threads[f]->th.th_used_in_team), 0, 9117 3); 9118 if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) { // Wake up sleeping threads 9119 __kmp_resume_32(team->t.t_threads[f]->th.th_info.ds.ds_gtid, 9120 (kmp_flag_32<false, false> *)NULL); 9121 } 9122 } 9123 // The threads should be transitioning to the team; when they are done, they 9124 // should have set th_used_in_team to 1. This loop forces master to wait until 9125 // all threads have moved into the team and are waiting in the barrier. 
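// (Plain spin, no sleeping: recount until every worker in [1, new_nthreads)
// reports th_used_in_team == 1; the window is presumably short since the
// workers were just signaled above.)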
9126 int count = new_nthreads - 1; 9127 while (count > 0) { 9128 count = new_nthreads - 1; 9129 for (int f = 1; f < new_nthreads; ++f) { 9130 if (team->t.t_threads[f]->th.th_used_in_team.load() == 1) { 9131 count--; 9132 } 9133 } 9134 } 9135 } 9136 9137 // Globals and functions for hidden helper task 9138 kmp_info_t **__kmp_hidden_helper_threads; 9139 kmp_info_t *__kmp_hidden_helper_main_thread; 9140 std::atomic<kmp_int32> __kmp_unexecuted_hidden_helper_tasks; 9141 #if KMP_OS_LINUX 9142 kmp_int32 __kmp_hidden_helper_threads_num = 8; 9143 kmp_int32 __kmp_enable_hidden_helper = TRUE; 9144 #else 9145 kmp_int32 __kmp_hidden_helper_threads_num = 0; 9146 kmp_int32 __kmp_enable_hidden_helper = FALSE; 9147 #endif 9148 9149 namespace { 9150 std::atomic<kmp_int32> __kmp_hit_hidden_helper_threads_num; 9151 9152 void __kmp_hidden_helper_wrapper_fn(int *gtid, int *, ...) { 9153 // This is an explicit synchronization on all hidden helper threads in case 9154 // that when a regular thread pushes a hidden helper task to one hidden 9155 // helper thread, the thread has not been awaken once since they're released 9156 // by the main thread after creating the team. 9157 KMP_ATOMIC_INC(&__kmp_hit_hidden_helper_threads_num); 9158 while (KMP_ATOMIC_LD_ACQ(&__kmp_hit_hidden_helper_threads_num) != 9159 __kmp_hidden_helper_threads_num) 9160 ; 9161 9162 // If main thread, then wait for signal 9163 if (__kmpc_master(nullptr, *gtid)) { 9164 // First, unset the initial state and release the initial thread 9165 TCW_4(__kmp_init_hidden_helper_threads, FALSE); 9166 __kmp_hidden_helper_initz_release(); 9167 __kmp_hidden_helper_main_thread_wait(); 9168 // Now wake up all worker threads 9169 for (int i = 1; i < __kmp_hit_hidden_helper_threads_num; ++i) { 9170 __kmp_hidden_helper_worker_thread_signal(); 9171 } 9172 } 9173 } 9174 } // namespace 9175 9176 void __kmp_hidden_helper_threads_initz_routine() { 9177 // Create a new root for hidden helper team/threads 9178 const int gtid = __kmp_register_root(TRUE); 9179 __kmp_hidden_helper_main_thread = __kmp_threads[gtid]; 9180 __kmp_hidden_helper_threads = &__kmp_threads[gtid]; 9181 __kmp_hidden_helper_main_thread->th.th_set_nproc = 9182 __kmp_hidden_helper_threads_num; 9183 9184 KMP_ATOMIC_ST_REL(&__kmp_hit_hidden_helper_threads_num, 0); 9185 9186 __kmpc_fork_call(nullptr, 0, __kmp_hidden_helper_wrapper_fn); 9187 9188 // Set the initialization flag to FALSE 9189 TCW_SYNC_4(__kmp_init_hidden_helper, FALSE); 9190 9191 __kmp_hidden_helper_threads_deinitz_release(); 9192 } 9193 9194 /* Nesting Mode: 9195 Set via KMP_NESTING_MODE, which takes an integer. 9196 Note: we skip duplicate topology levels, and skip levels with only 9197 one entity. 9198 KMP_NESTING_MODE=0 is the default, and doesn't use nesting mode. 9199 KMP_NESTING_MODE=1 sets as many nesting levels as there are distinct levels 9200 in the topology, and initializes the number of threads at each of those 9201 levels to the number of entities at each level, respectively, below the 9202 entity at the parent level. 9203 KMP_NESTING_MODE=N, where N>1, attempts to create up to N nesting levels, 9204 but starts with nesting OFF -- max-active-levels-var is 1 -- and requires 9205 the user to turn nesting on explicitly. This is an even more experimental 9206 option to this experimental feature, and may change or go away in the 9207 future. 
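Worked example (hypothetical machine, for illustration only): on a topology
   of 2 sockets x 24 cores x 2 hardware threads, KMP_NESTING_MODE=1 would set
   up three nesting levels of 2, 24 and 2 threads respectively (levels with
   only one entity are skipped) and enable nesting to that depth, while
   KMP_NESTING_MODE=3 would record the same per-level thread counts but leave
   max-active-levels-var at 1 until the user enables nesting explicitly.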
9208 */ 9209 9210 // Allocate space to store nesting levels 9211 void __kmp_init_nesting_mode() { 9212 int levels = KMP_HW_LAST; 9213 __kmp_nesting_mode_nlevels = levels; 9214 __kmp_nesting_nth_level = (int *)KMP_INTERNAL_MALLOC(levels * sizeof(int)); 9215 for (int i = 0; i < levels; ++i) 9216 __kmp_nesting_nth_level[i] = 0; 9217 if (__kmp_nested_nth.size < levels) { 9218 __kmp_nested_nth.nth = 9219 (int *)KMP_INTERNAL_REALLOC(__kmp_nested_nth.nth, levels * sizeof(int)); 9220 __kmp_nested_nth.size = levels; 9221 } 9222 } 9223 9224 // Set # threads for top levels of nesting; must be called after topology set 9225 void __kmp_set_nesting_mode_threads() { 9226 kmp_info_t *thread = __kmp_threads[__kmp_entry_gtid()]; 9227 9228 if (__kmp_nesting_mode == 1) 9229 __kmp_nesting_mode_nlevels = KMP_MAX_ACTIVE_LEVELS_LIMIT; 9230 else if (__kmp_nesting_mode > 1) 9231 __kmp_nesting_mode_nlevels = __kmp_nesting_mode; 9232 9233 if (__kmp_topology) { // use topology info 9234 int loc, hw_level; 9235 for (loc = 0, hw_level = 0; hw_level < __kmp_topology->get_depth() && 9236 loc < __kmp_nesting_mode_nlevels; 9237 loc++, hw_level++) { 9238 __kmp_nesting_nth_level[loc] = __kmp_topology->get_ratio(hw_level); 9239 if (__kmp_nesting_nth_level[loc] == 1) 9240 loc--; 9241 } 9242 // Make sure all cores are used 9243 if (__kmp_nesting_mode > 1 && loc > 1) { 9244 int core_level = __kmp_topology->get_level(KMP_HW_CORE); 9245 int num_cores = __kmp_topology->get_count(core_level); 9246 int upper_levels = 1; 9247 for (int level = 0; level < loc - 1; ++level) 9248 upper_levels *= __kmp_nesting_nth_level[level]; 9249 if (upper_levels * __kmp_nesting_nth_level[loc - 1] < num_cores) 9250 __kmp_nesting_nth_level[loc - 1] = 9251 num_cores / __kmp_nesting_nth_level[loc - 2]; 9252 } 9253 __kmp_nesting_mode_nlevels = loc; 9254 __kmp_nested_nth.used = __kmp_nesting_mode_nlevels; 9255 } else { // no topology info available; provide a reasonable guesstimation 9256 if (__kmp_avail_proc >= 4) { 9257 __kmp_nesting_nth_level[0] = __kmp_avail_proc / 2; 9258 __kmp_nesting_nth_level[1] = 2; 9259 __kmp_nesting_mode_nlevels = 2; 9260 } else { 9261 __kmp_nesting_nth_level[0] = __kmp_avail_proc; 9262 __kmp_nesting_mode_nlevels = 1; 9263 } 9264 __kmp_nested_nth.used = __kmp_nesting_mode_nlevels; 9265 } 9266 for (int i = 0; i < __kmp_nesting_mode_nlevels; ++i) { 9267 __kmp_nested_nth.nth[i] = __kmp_nesting_nth_level[i]; 9268 } 9269 set__nproc(thread, __kmp_nesting_nth_level[0]); 9270 if (__kmp_nesting_mode > 1 && __kmp_nesting_mode_nlevels > __kmp_nesting_mode) 9271 __kmp_nesting_mode_nlevels = __kmp_nesting_mode; 9272 if (get__max_active_levels(thread) > 1) { 9273 // if max levels was set, set nesting mode levels to same 9274 __kmp_nesting_mode_nlevels = get__max_active_levels(thread); 9275 } 9276 if (__kmp_nesting_mode == 1) // turn on nesting for this case only 9277 set__max_active_levels(thread, __kmp_nesting_mode_nlevels); 9278 } 9279 9280 // Empty symbols to export (see exports_so.txt) when feature is disabled 9281 extern "C" { 9282 #if !KMP_STATS_ENABLED 9283 void __kmp_reset_stats() {} 9284 #endif 9285 #if !USE_DEBUGGER 9286 int __kmp_omp_debug_struct_info = FALSE; 9287 int __kmp_debugging = FALSE; 9288 #endif 9289 #if !USE_ITT_BUILD || !USE_ITT_NOTIFY 9290 void __kmp_itt_fini_ittlib() {} 9291 void __kmp_itt_init_ittlib() {} 9292 #endif 9293 } 9294 9295 // end of file 9296