1 /* 2 * kmp_runtime.cpp -- KPTS runtime support library 3 */ 4 5 //===----------------------------------------------------------------------===// 6 // 7 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 8 // See https://llvm.org/LICENSE.txt for license information. 9 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 10 // 11 //===----------------------------------------------------------------------===// 12 13 #include "kmp.h" 14 #include "kmp_affinity.h" 15 #include "kmp_atomic.h" 16 #include "kmp_environment.h" 17 #include "kmp_error.h" 18 #include "kmp_i18n.h" 19 #include "kmp_io.h" 20 #include "kmp_itt.h" 21 #include "kmp_settings.h" 22 #include "kmp_stats.h" 23 #include "kmp_str.h" 24 #include "kmp_wait_release.h" 25 #include "kmp_wrapper_getpid.h" 26 #include "kmp_dispatch.h" 27 #if KMP_USE_HIER_SCHED 28 #include "kmp_dispatch_hier.h" 29 #endif 30 31 #if OMPT_SUPPORT 32 #include "ompt-specific.h" 33 #endif 34 #if OMPD_SUPPORT 35 #include "ompd-specific.h" 36 #endif 37 38 #if OMP_PROFILING_SUPPORT 39 #include "llvm/Support/TimeProfiler.h" 40 static char *ProfileTraceFile = nullptr; 41 #endif 42 43 /* these are temporary issues to be dealt with */ 44 #define KMP_USE_PRCTL 0 45 46 #if KMP_OS_WINDOWS 47 #include <process.h> 48 #endif 49 50 #if KMP_OS_WINDOWS 51 // windows does not need include files as it doesn't use shared memory 52 #else 53 #include <sys/mman.h> 54 #include <sys/stat.h> 55 #include <fcntl.h> 56 #define SHM_SIZE 1024 57 #endif 58 59 #if defined(KMP_GOMP_COMPAT) 60 char const __kmp_version_alt_comp[] = 61 KMP_VERSION_PREFIX "alternative compiler support: yes"; 62 #endif /* defined(KMP_GOMP_COMPAT) */ 63 64 char const __kmp_version_omp_api[] = 65 KMP_VERSION_PREFIX "API version: 5.0 (201611)"; 66 67 #ifdef KMP_DEBUG 68 char const __kmp_version_lock[] = 69 KMP_VERSION_PREFIX "lock type: run time selectable"; 70 #endif /* KMP_DEBUG */ 71 72 #define KMP_MIN(x, y) ((x) < (y) ? (x) : (y)) 73 74 /* ------------------------------------------------------------------------ */ 75 76 #if KMP_USE_MONITOR 77 kmp_info_t __kmp_monitor; 78 #endif 79 80 /* Forward declarations */ 81 82 void __kmp_cleanup(void); 83 84 static void __kmp_initialize_info(kmp_info_t *, kmp_team_t *, int tid, 85 int gtid); 86 static void __kmp_initialize_team(kmp_team_t *team, int new_nproc, 87 kmp_internal_control_t *new_icvs, 88 ident_t *loc); 89 #if KMP_AFFINITY_SUPPORTED 90 static void __kmp_partition_places(kmp_team_t *team, 91 int update_master_only = 0); 92 #endif 93 static void __kmp_do_serial_initialize(void); 94 void __kmp_fork_barrier(int gtid, int tid); 95 void __kmp_join_barrier(int gtid); 96 void __kmp_setup_icv_copy(kmp_team_t *team, int new_nproc, 97 kmp_internal_control_t *new_icvs, ident_t *loc); 98 99 #ifdef USE_LOAD_BALANCE 100 static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc); 101 #endif 102 103 static int __kmp_expand_threads(int nNeed); 104 #if KMP_OS_WINDOWS 105 static int __kmp_unregister_root_other_thread(int gtid); 106 #endif 107 static void __kmp_reap_thread(kmp_info_t *thread, int is_root); 108 kmp_info_t *__kmp_thread_pool_insert_pt = NULL; 109 110 void __kmp_resize_dist_barrier(kmp_team_t *team, int old_nthreads, 111 int new_nthreads); 112 void __kmp_add_threads_to_team(kmp_team_t *team, int new_nthreads); 113 114 /* Calculate the identifier of the current thread */ 115 /* fast (and somewhat portable) way to get unique identifier of executing 116 thread. Returns KMP_GTID_DNE if we haven't been assigned a gtid. 
*/ 117 int __kmp_get_global_thread_id() { 118 int i; 119 kmp_info_t **other_threads; 120 size_t stack_data; 121 char *stack_addr; 122 size_t stack_size; 123 char *stack_base; 124 125 KA_TRACE( 126 1000, 127 ("*** __kmp_get_global_thread_id: entering, nproc=%d all_nproc=%d\n", 128 __kmp_nth, __kmp_all_nth)); 129 130 /* JPH - to handle the case where __kmpc_end(0) is called immediately prior to 131 a parallel region, made it return KMP_GTID_DNE to force serial_initialize 132 by caller. Had to handle KMP_GTID_DNE at all call-sites, or else guarantee 133 __kmp_init_gtid for this to work. */ 134 135 if (!TCR_4(__kmp_init_gtid)) 136 return KMP_GTID_DNE; 137 138 #ifdef KMP_TDATA_GTID 139 if (TCR_4(__kmp_gtid_mode) >= 3) { 140 KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using TDATA\n")); 141 return __kmp_gtid; 142 } 143 #endif 144 if (TCR_4(__kmp_gtid_mode) >= 2) { 145 KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using keyed TLS\n")); 146 return __kmp_gtid_get_specific(); 147 } 148 KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using internal alg.\n")); 149 150 stack_addr = (char *)&stack_data; 151 other_threads = __kmp_threads; 152 153 /* ATT: The code below is a source of potential bugs due to unsynchronized 154 access to __kmp_threads array. For example: 155 1. Current thread loads other_threads[i] to thr and checks it, it is 156 non-NULL. 157 2. Current thread is suspended by OS. 158 3. Another thread unregisters and finishes (debug versions of free() 159 may fill memory with something like 0xEF). 160 4. Current thread is resumed. 161 5. Current thread reads junk from *thr. 162 TODO: Fix it. --ln */ 163 164 for (i = 0; i < __kmp_threads_capacity; i++) { 165 166 kmp_info_t *thr = (kmp_info_t *)TCR_SYNC_PTR(other_threads[i]); 167 if (!thr) 168 continue; 169 170 stack_size = (size_t)TCR_PTR(thr->th.th_info.ds.ds_stacksize); 171 stack_base = (char *)TCR_PTR(thr->th.th_info.ds.ds_stackbase); 172 173 /* stack grows down -- search through all of the active threads */ 174 175 if (stack_addr <= stack_base) { 176 size_t stack_diff = stack_base - stack_addr; 177 178 if (stack_diff <= stack_size) { 179 /* The only way we can be closer than the allocated */ 180 /* stack size is if we are running on this thread. */ 181 KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() == i); 182 return i; 183 } 184 } 185 } 186 187 /* get specific to try and determine our gtid */ 188 KA_TRACE(1000, 189 ("*** __kmp_get_global_thread_id: internal alg. 
failed to find " 190 "thread, using TLS\n")); 191 i = __kmp_gtid_get_specific(); 192 193 /*fprintf( stderr, "=== %d\n", i ); */ /* GROO */ 194 195 /* if we havn't been assigned a gtid, then return code */ 196 if (i < 0) 197 return i; 198 199 /* dynamically updated stack window for uber threads to avoid get_specific 200 call */ 201 if (!TCR_4(other_threads[i]->th.th_info.ds.ds_stackgrow)) { 202 KMP_FATAL(StackOverflow, i); 203 } 204 205 stack_base = (char *)other_threads[i]->th.th_info.ds.ds_stackbase; 206 if (stack_addr > stack_base) { 207 TCW_PTR(other_threads[i]->th.th_info.ds.ds_stackbase, stack_addr); 208 TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize, 209 other_threads[i]->th.th_info.ds.ds_stacksize + stack_addr - 210 stack_base); 211 } else { 212 TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize, 213 stack_base - stack_addr); 214 } 215 216 /* Reprint stack bounds for ubermaster since they have been refined */ 217 if (__kmp_storage_map) { 218 char *stack_end = (char *)other_threads[i]->th.th_info.ds.ds_stackbase; 219 char *stack_beg = stack_end - other_threads[i]->th.th_info.ds.ds_stacksize; 220 __kmp_print_storage_map_gtid(i, stack_beg, stack_end, 221 other_threads[i]->th.th_info.ds.ds_stacksize, 222 "th_%d stack (refinement)", i); 223 } 224 return i; 225 } 226 227 int __kmp_get_global_thread_id_reg() { 228 int gtid; 229 230 if (!__kmp_init_serial) { 231 gtid = KMP_GTID_DNE; 232 } else 233 #ifdef KMP_TDATA_GTID 234 if (TCR_4(__kmp_gtid_mode) >= 3) { 235 KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using TDATA\n")); 236 gtid = __kmp_gtid; 237 } else 238 #endif 239 if (TCR_4(__kmp_gtid_mode) >= 2) { 240 KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using keyed TLS\n")); 241 gtid = __kmp_gtid_get_specific(); 242 } else { 243 KA_TRACE(1000, 244 ("*** __kmp_get_global_thread_id_reg: using internal alg.\n")); 245 gtid = __kmp_get_global_thread_id(); 246 } 247 248 /* we must be a new uber master sibling thread */ 249 if (gtid == KMP_GTID_DNE) { 250 KA_TRACE(10, 251 ("__kmp_get_global_thread_id_reg: Encountered new root thread. " 252 "Registering a new gtid.\n")); 253 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock); 254 if (!__kmp_init_serial) { 255 __kmp_do_serial_initialize(); 256 gtid = __kmp_gtid_get_specific(); 257 } else { 258 gtid = __kmp_register_root(FALSE); 259 } 260 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 261 /*__kmp_printf( "+++ %d\n", gtid ); */ /* GROO */ 262 } 263 264 KMP_DEBUG_ASSERT(gtid >= 0); 265 266 return gtid; 267 } 268 269 /* caller must hold forkjoin_lock */ 270 void __kmp_check_stack_overlap(kmp_info_t *th) { 271 int f; 272 char *stack_beg = NULL; 273 char *stack_end = NULL; 274 int gtid; 275 276 KA_TRACE(10, ("__kmp_check_stack_overlap: called\n")); 277 if (__kmp_storage_map) { 278 stack_end = (char *)th->th.th_info.ds.ds_stackbase; 279 stack_beg = stack_end - th->th.th_info.ds.ds_stacksize; 280 281 gtid = __kmp_gtid_from_thread(th); 282 283 if (gtid == KMP_GTID_MONITOR) { 284 __kmp_print_storage_map_gtid( 285 gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize, 286 "th_%s stack (%s)", "mon", 287 (th->th.th_info.ds.ds_stackgrow) ? "initial" : "actual"); 288 } else { 289 __kmp_print_storage_map_gtid( 290 gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize, 291 "th_%d stack (%s)", gtid, 292 (th->th.th_info.ds.ds_stackgrow) ? 
"initial" : "actual"); 293 } 294 } 295 296 /* No point in checking ubermaster threads since they use refinement and 297 * cannot overlap */ 298 gtid = __kmp_gtid_from_thread(th); 299 if (__kmp_env_checks == TRUE && !KMP_UBER_GTID(gtid)) { 300 KA_TRACE(10, 301 ("__kmp_check_stack_overlap: performing extensive checking\n")); 302 if (stack_beg == NULL) { 303 stack_end = (char *)th->th.th_info.ds.ds_stackbase; 304 stack_beg = stack_end - th->th.th_info.ds.ds_stacksize; 305 } 306 307 for (f = 0; f < __kmp_threads_capacity; f++) { 308 kmp_info_t *f_th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[f]); 309 310 if (f_th && f_th != th) { 311 char *other_stack_end = 312 (char *)TCR_PTR(f_th->th.th_info.ds.ds_stackbase); 313 char *other_stack_beg = 314 other_stack_end - (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize); 315 if ((stack_beg > other_stack_beg && stack_beg < other_stack_end) || 316 (stack_end > other_stack_beg && stack_end < other_stack_end)) { 317 318 /* Print the other stack values before the abort */ 319 if (__kmp_storage_map) 320 __kmp_print_storage_map_gtid( 321 -1, other_stack_beg, other_stack_end, 322 (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize), 323 "th_%d stack (overlapped)", __kmp_gtid_from_thread(f_th)); 324 325 __kmp_fatal(KMP_MSG(StackOverlap), KMP_HNT(ChangeStackLimit), 326 __kmp_msg_null); 327 } 328 } 329 } 330 } 331 KA_TRACE(10, ("__kmp_check_stack_overlap: returning\n")); 332 } 333 334 /* ------------------------------------------------------------------------ */ 335 336 void __kmp_infinite_loop(void) { 337 static int done = FALSE; 338 339 while (!done) { 340 KMP_YIELD(TRUE); 341 } 342 } 343 344 #define MAX_MESSAGE 512 345 346 void __kmp_print_storage_map_gtid(int gtid, void *p1, void *p2, size_t size, 347 char const *format, ...) { 348 char buffer[MAX_MESSAGE]; 349 va_list ap; 350 351 va_start(ap, format); 352 KMP_SNPRINTF(buffer, sizeof(buffer), "OMP storage map: %p %p%8lu %s\n", p1, 353 p2, (unsigned long)size, format); 354 __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock); 355 __kmp_vprintf(kmp_err, buffer, ap); 356 #if KMP_PRINT_DATA_PLACEMENT 357 int node; 358 if (gtid >= 0) { 359 if (p1 <= p2 && (char *)p2 - (char *)p1 == size) { 360 if (__kmp_storage_map_verbose) { 361 node = __kmp_get_host_node(p1); 362 if (node < 0) /* doesn't work, so don't try this next time */ 363 __kmp_storage_map_verbose = FALSE; 364 else { 365 char *last; 366 int lastNode; 367 int localProc = __kmp_get_cpu_from_gtid(gtid); 368 369 const int page_size = KMP_GET_PAGE_SIZE(); 370 371 p1 = (void *)((size_t)p1 & ~((size_t)page_size - 1)); 372 p2 = (void *)(((size_t)p2 - 1) & ~((size_t)page_size - 1)); 373 if (localProc >= 0) 374 __kmp_printf_no_lock(" GTID %d localNode %d\n", gtid, 375 localProc >> 1); 376 else 377 __kmp_printf_no_lock(" GTID %d\n", gtid); 378 #if KMP_USE_PRCTL 379 /* The more elaborate format is disabled for now because of the prctl 380 * hanging bug. */ 381 do { 382 last = p1; 383 lastNode = node; 384 /* This loop collates adjacent pages with the same host node. 
*/ 385 do { 386 (char *)p1 += page_size; 387 } while (p1 <= p2 && (node = __kmp_get_host_node(p1)) == lastNode); 388 __kmp_printf_no_lock(" %p-%p memNode %d\n", last, (char *)p1 - 1, 389 lastNode); 390 } while (p1 <= p2); 391 #else 392 __kmp_printf_no_lock(" %p-%p memNode %d\n", p1, 393 (char *)p1 + (page_size - 1), 394 __kmp_get_host_node(p1)); 395 if (p1 < p2) { 396 __kmp_printf_no_lock(" %p-%p memNode %d\n", p2, 397 (char *)p2 + (page_size - 1), 398 __kmp_get_host_node(p2)); 399 } 400 #endif 401 } 402 } 403 } else 404 __kmp_printf_no_lock(" %s\n", KMP_I18N_STR(StorageMapWarning)); 405 } 406 #endif /* KMP_PRINT_DATA_PLACEMENT */ 407 __kmp_release_bootstrap_lock(&__kmp_stdio_lock); 408 } 409 410 void __kmp_warn(char const *format, ...) { 411 char buffer[MAX_MESSAGE]; 412 va_list ap; 413 414 if (__kmp_generate_warnings == kmp_warnings_off) { 415 return; 416 } 417 418 va_start(ap, format); 419 420 KMP_SNPRINTF(buffer, sizeof(buffer), "OMP warning: %s\n", format); 421 __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock); 422 __kmp_vprintf(kmp_err, buffer, ap); 423 __kmp_release_bootstrap_lock(&__kmp_stdio_lock); 424 425 va_end(ap); 426 } 427 428 void __kmp_abort_process() { 429 // Later threads may stall here, but that's ok because abort() will kill them. 430 __kmp_acquire_bootstrap_lock(&__kmp_exit_lock); 431 432 if (__kmp_debug_buf) { 433 __kmp_dump_debug_buffer(); 434 } 435 436 if (KMP_OS_WINDOWS) { 437 // Let other threads know of abnormal termination and prevent deadlock 438 // if abort happened during library initialization or shutdown 439 __kmp_global.g.g_abort = SIGABRT; 440 441 /* On Windows* OS by default abort() causes pop-up error box, which stalls 442 nightly testing. Unfortunately, we cannot reliably suppress pop-up error 443 boxes. _set_abort_behavior() works well, but this function is not 444 available in VS7 (this is not problem for DLL, but it is a problem for 445 static OpenMP RTL). SetErrorMode (and so, timelimit utility) does not 446 help, at least in some versions of MS C RTL. 447 448 It seems following sequence is the only way to simulate abort() and 449 avoid pop-up error box. */ 450 raise(SIGABRT); 451 _exit(3); // Just in case, if signal ignored, exit anyway. 452 } else { 453 __kmp_unregister_library(); 454 abort(); 455 } 456 457 __kmp_infinite_loop(); 458 __kmp_release_bootstrap_lock(&__kmp_exit_lock); 459 460 } // __kmp_abort_process 461 462 void __kmp_abort_thread(void) { 463 // TODO: Eliminate g_abort global variable and this function. 464 // In case of abort just call abort(), it will kill all the threads. 465 __kmp_infinite_loop(); 466 } // __kmp_abort_thread 467 468 /* Print out the storage map for the major kmp_info_t thread data structures 469 that are allocated together. 
*/ 470 471 static void __kmp_print_thread_storage_map(kmp_info_t *thr, int gtid) { 472 __kmp_print_storage_map_gtid(gtid, thr, thr + 1, sizeof(kmp_info_t), "th_%d", 473 gtid); 474 475 __kmp_print_storage_map_gtid(gtid, &thr->th.th_info, &thr->th.th_team, 476 sizeof(kmp_desc_t), "th_%d.th_info", gtid); 477 478 __kmp_print_storage_map_gtid(gtid, &thr->th.th_local, &thr->th.th_pri_head, 479 sizeof(kmp_local_t), "th_%d.th_local", gtid); 480 481 __kmp_print_storage_map_gtid( 482 gtid, &thr->th.th_bar[0], &thr->th.th_bar[bs_last_barrier], 483 sizeof(kmp_balign_t) * bs_last_barrier, "th_%d.th_bar", gtid); 484 485 __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_plain_barrier], 486 &thr->th.th_bar[bs_plain_barrier + 1], 487 sizeof(kmp_balign_t), "th_%d.th_bar[plain]", 488 gtid); 489 490 __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_forkjoin_barrier], 491 &thr->th.th_bar[bs_forkjoin_barrier + 1], 492 sizeof(kmp_balign_t), "th_%d.th_bar[forkjoin]", 493 gtid); 494 495 #if KMP_FAST_REDUCTION_BARRIER 496 __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_reduction_barrier], 497 &thr->th.th_bar[bs_reduction_barrier + 1], 498 sizeof(kmp_balign_t), "th_%d.th_bar[reduction]", 499 gtid); 500 #endif // KMP_FAST_REDUCTION_BARRIER 501 } 502 503 /* Print out the storage map for the major kmp_team_t team data structures 504 that are allocated together. */ 505 506 static void __kmp_print_team_storage_map(const char *header, kmp_team_t *team, 507 int team_id, int num_thr) { 508 int num_disp_buff = team->t.t_max_nproc > 1 ? __kmp_dispatch_num_buffers : 2; 509 __kmp_print_storage_map_gtid(-1, team, team + 1, sizeof(kmp_team_t), "%s_%d", 510 header, team_id); 511 512 __kmp_print_storage_map_gtid(-1, &team->t.t_bar[0], 513 &team->t.t_bar[bs_last_barrier], 514 sizeof(kmp_balign_team_t) * bs_last_barrier, 515 "%s_%d.t_bar", header, team_id); 516 517 __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_plain_barrier], 518 &team->t.t_bar[bs_plain_barrier + 1], 519 sizeof(kmp_balign_team_t), "%s_%d.t_bar[plain]", 520 header, team_id); 521 522 __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_forkjoin_barrier], 523 &team->t.t_bar[bs_forkjoin_barrier + 1], 524 sizeof(kmp_balign_team_t), 525 "%s_%d.t_bar[forkjoin]", header, team_id); 526 527 #if KMP_FAST_REDUCTION_BARRIER 528 __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_reduction_barrier], 529 &team->t.t_bar[bs_reduction_barrier + 1], 530 sizeof(kmp_balign_team_t), 531 "%s_%d.t_bar[reduction]", header, team_id); 532 #endif // KMP_FAST_REDUCTION_BARRIER 533 534 __kmp_print_storage_map_gtid( 535 -1, &team->t.t_dispatch[0], &team->t.t_dispatch[num_thr], 536 sizeof(kmp_disp_t) * num_thr, "%s_%d.t_dispatch", header, team_id); 537 538 __kmp_print_storage_map_gtid( 539 -1, &team->t.t_threads[0], &team->t.t_threads[num_thr], 540 sizeof(kmp_info_t *) * num_thr, "%s_%d.t_threads", header, team_id); 541 542 __kmp_print_storage_map_gtid(-1, &team->t.t_disp_buffer[0], 543 &team->t.t_disp_buffer[num_disp_buff], 544 sizeof(dispatch_shared_info_t) * num_disp_buff, 545 "%s_%d.t_disp_buffer", header, team_id); 546 } 547 548 static void __kmp_init_allocator() { 549 __kmp_init_memkind(); 550 __kmp_init_target_mem(); 551 } 552 static void __kmp_fini_allocator() { __kmp_fini_memkind(); } 553 554 /* ------------------------------------------------------------------------ */ 555 556 #if KMP_DYNAMIC_LIB 557 #if KMP_OS_WINDOWS 558 559 BOOL WINAPI DllMain(HINSTANCE hInstDLL, DWORD fdwReason, LPVOID lpReserved) { 560 //__kmp_acquire_bootstrap_lock( &__kmp_initz_lock ); 561 562 
switch (fdwReason) { 563 564 case DLL_PROCESS_ATTACH: 565 KA_TRACE(10, ("DllMain: PROCESS_ATTACH\n")); 566 567 return TRUE; 568 569 case DLL_PROCESS_DETACH: 570 KA_TRACE(10, ("DllMain: PROCESS_DETACH T#%d\n", __kmp_gtid_get_specific())); 571 572 // According to Windows* documentation for DllMain entry point: 573 // for DLL_PROCESS_DETACH, lpReserved is used for telling the difference: 574 // lpReserved == NULL when FreeLibrary() is called, 575 // lpReserved != NULL when the process is terminated. 576 // When FreeLibrary() is called, worker threads remain alive. So the 577 // runtime's state is consistent and executing proper shutdown is OK. 578 // When the process is terminated, worker threads have exited or been 579 // forcefully terminated by the OS and only the shutdown thread remains. 580 // This can leave the runtime in an inconsistent state. 581 // Hence, only attempt proper cleanup when FreeLibrary() is called. 582 // Otherwise, rely on OS to reclaim resources. 583 if (lpReserved == NULL) 584 __kmp_internal_end_library(__kmp_gtid_get_specific()); 585 586 return TRUE; 587 588 case DLL_THREAD_ATTACH: 589 KA_TRACE(10, ("DllMain: THREAD_ATTACH\n")); 590 591 /* if we want to register new siblings all the time here call 592 * __kmp_get_gtid(); */ 593 return TRUE; 594 595 case DLL_THREAD_DETACH: 596 KA_TRACE(10, ("DllMain: THREAD_DETACH T#%d\n", __kmp_gtid_get_specific())); 597 598 __kmp_internal_end_thread(__kmp_gtid_get_specific()); 599 return TRUE; 600 } 601 602 return TRUE; 603 } 604 605 #endif /* KMP_OS_WINDOWS */ 606 #endif /* KMP_DYNAMIC_LIB */ 607 608 /* __kmp_parallel_deo -- Wait until it's our turn. */ 609 void __kmp_parallel_deo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) { 610 int gtid = *gtid_ref; 611 #ifdef BUILD_PARALLEL_ORDERED 612 kmp_team_t *team = __kmp_team_from_gtid(gtid); 613 #endif /* BUILD_PARALLEL_ORDERED */ 614 615 if (__kmp_env_consistency_check) { 616 if (__kmp_threads[gtid]->th.th_root->r.r_active) 617 #if KMP_USE_DYNAMIC_LOCK 618 __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL, 0); 619 #else 620 __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL); 621 #endif 622 } 623 #ifdef BUILD_PARALLEL_ORDERED 624 if (!team->t.t_serialized) { 625 KMP_MB(); 626 KMP_WAIT(&team->t.t_ordered.dt.t_value, __kmp_tid_from_gtid(gtid), KMP_EQ, 627 NULL); 628 KMP_MB(); 629 } 630 #endif /* BUILD_PARALLEL_ORDERED */ 631 } 632 633 /* __kmp_parallel_dxo -- Signal the next task. */ 634 void __kmp_parallel_dxo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) { 635 int gtid = *gtid_ref; 636 #ifdef BUILD_PARALLEL_ORDERED 637 int tid = __kmp_tid_from_gtid(gtid); 638 kmp_team_t *team = __kmp_team_from_gtid(gtid); 639 #endif /* BUILD_PARALLEL_ORDERED */ 640 641 if (__kmp_env_consistency_check) { 642 if (__kmp_threads[gtid]->th.th_root->r.r_active) 643 __kmp_pop_sync(gtid, ct_ordered_in_parallel, loc_ref); 644 } 645 #ifdef BUILD_PARALLEL_ORDERED 646 if (!team->t.t_serialized) { 647 KMP_MB(); /* Flush all pending memory write invalidates. */ 648 649 /* use the tid of the next thread in this team */ 650 /* TODO replace with general release procedure */ 651 team->t.t_ordered.dt.t_value = ((tid + 1) % team->t.t_nproc); 652 653 KMP_MB(); /* Flush all pending memory write invalidates. 
*/ 654 } 655 #endif /* BUILD_PARALLEL_ORDERED */ 656 } 657 658 /* ------------------------------------------------------------------------ */ 659 /* The BARRIER for a SINGLE process section is always explicit */ 660 661 int __kmp_enter_single(int gtid, ident_t *id_ref, int push_ws) { 662 int status; 663 kmp_info_t *th; 664 kmp_team_t *team; 665 666 if (!TCR_4(__kmp_init_parallel)) 667 __kmp_parallel_initialize(); 668 __kmp_resume_if_soft_paused(); 669 670 th = __kmp_threads[gtid]; 671 team = th->th.th_team; 672 status = 0; 673 674 th->th.th_ident = id_ref; 675 676 if (team->t.t_serialized) { 677 status = 1; 678 } else { 679 kmp_int32 old_this = th->th.th_local.this_construct; 680 681 ++th->th.th_local.this_construct; 682 /* try to set team count to thread count--success means thread got the 683 single block */ 684 /* TODO: Should this be acquire or release? */ 685 if (team->t.t_construct == old_this) { 686 status = __kmp_atomic_compare_store_acq(&team->t.t_construct, old_this, 687 th->th.th_local.this_construct); 688 } 689 #if USE_ITT_BUILD 690 if (__itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 && 691 KMP_MASTER_GTID(gtid) && th->th.th_teams_microtask == NULL && 692 team->t.t_active_level == 1) { 693 // Only report metadata by primary thread of active team at level 1 694 __kmp_itt_metadata_single(id_ref); 695 } 696 #endif /* USE_ITT_BUILD */ 697 } 698 699 if (__kmp_env_consistency_check) { 700 if (status && push_ws) { 701 __kmp_push_workshare(gtid, ct_psingle, id_ref); 702 } else { 703 __kmp_check_workshare(gtid, ct_psingle, id_ref); 704 } 705 } 706 #if USE_ITT_BUILD 707 if (status) { 708 __kmp_itt_single_start(gtid); 709 } 710 #endif /* USE_ITT_BUILD */ 711 return status; 712 } 713 714 void __kmp_exit_single(int gtid) { 715 #if USE_ITT_BUILD 716 __kmp_itt_single_end(gtid); 717 #endif /* USE_ITT_BUILD */ 718 if (__kmp_env_consistency_check) 719 __kmp_pop_workshare(gtid, ct_psingle, NULL); 720 } 721 722 /* determine if we can go parallel or must use a serialized parallel region and 723 * how many threads we can use 724 * set_nproc is the number of threads requested for the team 725 * returns 0 if we should serialize or only use one thread, 726 * otherwise the number of threads to use 727 * The forkjoin lock is held by the caller. */ 728 static int __kmp_reserve_threads(kmp_root_t *root, kmp_team_t *parent_team, 729 int master_tid, int set_nthreads, 730 int enter_teams) { 731 int capacity; 732 int new_nthreads; 733 KMP_DEBUG_ASSERT(__kmp_init_serial); 734 KMP_DEBUG_ASSERT(root && parent_team); 735 kmp_info_t *this_thr = parent_team->t.t_threads[master_tid]; 736 737 // If dyn-var is set, dynamically adjust the number of desired threads, 738 // according to the method specified by dynamic_mode. 739 new_nthreads = set_nthreads; 740 if (!get__dynamic_2(parent_team, master_tid)) { 741 ; 742 } 743 #ifdef USE_LOAD_BALANCE 744 else if (__kmp_global.g.g_dynamic_mode == dynamic_load_balance) { 745 new_nthreads = __kmp_load_balance_nproc(root, set_nthreads); 746 if (new_nthreads == 1) { 747 KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced " 748 "reservation to 1 thread\n", 749 master_tid)); 750 return 1; 751 } 752 if (new_nthreads < set_nthreads) { 753 KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced " 754 "reservation to %d threads\n", 755 master_tid, new_nthreads)); 756 } 757 } 758 #endif /* USE_LOAD_BALANCE */ 759 else if (__kmp_global.g.g_dynamic_mode == dynamic_thread_limit) { 760 new_nthreads = __kmp_avail_proc - __kmp_nth + 761 (root->r.r_active ? 
1 : root->r.r_hot_team->t.t_nproc); 762 if (new_nthreads <= 1) { 763 KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced " 764 "reservation to 1 thread\n", 765 master_tid)); 766 return 1; 767 } 768 if (new_nthreads < set_nthreads) { 769 KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced " 770 "reservation to %d threads\n", 771 master_tid, new_nthreads)); 772 } else { 773 new_nthreads = set_nthreads; 774 } 775 } else if (__kmp_global.g.g_dynamic_mode == dynamic_random) { 776 if (set_nthreads > 2) { 777 new_nthreads = __kmp_get_random(parent_team->t.t_threads[master_tid]); 778 new_nthreads = (new_nthreads % set_nthreads) + 1; 779 if (new_nthreads == 1) { 780 KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced " 781 "reservation to 1 thread\n", 782 master_tid)); 783 return 1; 784 } 785 if (new_nthreads < set_nthreads) { 786 KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced " 787 "reservation to %d threads\n", 788 master_tid, new_nthreads)); 789 } 790 } 791 } else { 792 KMP_ASSERT(0); 793 } 794 795 // Respect KMP_ALL_THREADS/KMP_DEVICE_THREAD_LIMIT. 796 if (__kmp_nth + new_nthreads - 797 (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) > 798 __kmp_max_nth) { 799 int tl_nthreads = __kmp_max_nth - __kmp_nth + 800 (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc); 801 if (tl_nthreads <= 0) { 802 tl_nthreads = 1; 803 } 804 805 // If dyn-var is false, emit a 1-time warning. 806 if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) { 807 __kmp_reserve_warn = 1; 808 __kmp_msg(kmp_ms_warning, 809 KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads), 810 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null); 811 } 812 if (tl_nthreads == 1) { 813 KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT " 814 "reduced reservation to 1 thread\n", 815 master_tid)); 816 return 1; 817 } 818 KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT reduced " 819 "reservation to %d threads\n", 820 master_tid, tl_nthreads)); 821 new_nthreads = tl_nthreads; 822 } 823 824 // Respect OMP_THREAD_LIMIT 825 int cg_nthreads = this_thr->th.th_cg_roots->cg_nthreads; 826 int max_cg_threads = this_thr->th.th_cg_roots->cg_thread_limit; 827 if (cg_nthreads + new_nthreads - 828 (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) > 829 max_cg_threads) { 830 int tl_nthreads = max_cg_threads - cg_nthreads + 831 (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc); 832 if (tl_nthreads <= 0) { 833 tl_nthreads = 1; 834 } 835 836 // If dyn-var is false, emit a 1-time warning. 837 if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) { 838 __kmp_reserve_warn = 1; 839 __kmp_msg(kmp_ms_warning, 840 KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads), 841 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null); 842 } 843 if (tl_nthreads == 1) { 844 KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT " 845 "reduced reservation to 1 thread\n", 846 master_tid)); 847 return 1; 848 } 849 KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT reduced " 850 "reservation to %d threads\n", 851 master_tid, tl_nthreads)); 852 new_nthreads = tl_nthreads; 853 } 854 855 // Check if the threads array is large enough, or needs expanding. 856 // See comment in __kmp_register_root() about the adjustment if 857 // __kmp_threads[0] == NULL. 
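  /* Worked example of the clamping arithmetic above (hypothetical numbers,
     for illustration only): suppose __kmp_nth == 6, the root is not active
     and its hot team has r_hot_team->t.t_nproc == 4 reusable threads, the
     request (new_nthreads) is 8, and __kmp_max_nth == 8. The projected total
     of live threads is 6 + 8 - 4 == 10 > 8, so the reservation is clamped to
     tl_nthreads = 8 - 6 + 4 == 6 threads. The capacity check below applies
     the same formula with `capacity` in place of __kmp_max_nth. */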
858 capacity = __kmp_threads_capacity; 859 if (TCR_PTR(__kmp_threads[0]) == NULL) { 860 --capacity; 861 } 862 // If it is not for initializing the hidden helper team, we need to take 863 // __kmp_hidden_helper_threads_num out of the capacity because it is included 864 // in __kmp_threads_capacity. 865 if (__kmp_enable_hidden_helper && !TCR_4(__kmp_init_hidden_helper_threads)) { 866 capacity -= __kmp_hidden_helper_threads_num; 867 } 868 if (__kmp_nth + new_nthreads - 869 (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) > 870 capacity) { 871 // Expand the threads array. 872 int slotsRequired = __kmp_nth + new_nthreads - 873 (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) - 874 capacity; 875 int slotsAdded = __kmp_expand_threads(slotsRequired); 876 if (slotsAdded < slotsRequired) { 877 // The threads array was not expanded enough. 878 new_nthreads -= (slotsRequired - slotsAdded); 879 KMP_ASSERT(new_nthreads >= 1); 880 881 // If dyn-var is false, emit a 1-time warning. 882 if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) { 883 __kmp_reserve_warn = 1; 884 if (__kmp_tp_cached) { 885 __kmp_msg(kmp_ms_warning, 886 KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads), 887 KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity), 888 KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null); 889 } else { 890 __kmp_msg(kmp_ms_warning, 891 KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads), 892 KMP_HNT(SystemLimitOnThreads), __kmp_msg_null); 893 } 894 } 895 } 896 } 897 898 #ifdef KMP_DEBUG 899 if (new_nthreads == 1) { 900 KC_TRACE(10, 901 ("__kmp_reserve_threads: T#%d serializing team after reclaiming " 902 "dead roots and rechecking; requested %d threads\n", 903 __kmp_get_gtid(), set_nthreads)); 904 } else { 905 KC_TRACE(10, ("__kmp_reserve_threads: T#%d allocating %d threads; requested" 906 " %d threads\n", 907 __kmp_get_gtid(), new_nthreads, set_nthreads)); 908 } 909 #endif // KMP_DEBUG 910 return new_nthreads; 911 } 912 913 /* Allocate threads from the thread pool and assign them to the new team. We are 914 assured that there are enough threads available, because we checked on that 915 earlier within critical section forkjoin */ 916 static void __kmp_fork_team_threads(kmp_root_t *root, kmp_team_t *team, 917 kmp_info_t *master_th, int master_gtid, 918 int fork_teams_workers) { 919 int i; 920 int use_hot_team; 921 922 KA_TRACE(10, ("__kmp_fork_team_threads: new_nprocs = %d\n", team->t.t_nproc)); 923 KMP_DEBUG_ASSERT(master_gtid == __kmp_get_gtid()); 924 KMP_MB(); 925 926 /* first, let's setup the primary thread */ 927 master_th->th.th_info.ds.ds_tid = 0; 928 master_th->th.th_team = team; 929 master_th->th.th_team_nproc = team->t.t_nproc; 930 master_th->th.th_team_master = master_th; 931 master_th->th.th_team_serialized = FALSE; 932 master_th->th.th_dispatch = &team->t.t_dispatch[0]; 933 934 /* make sure we are not the optimized hot team */ 935 #if KMP_NESTED_HOT_TEAMS 936 use_hot_team = 0; 937 kmp_hot_team_ptr_t *hot_teams = master_th->th.th_hot_teams; 938 if (hot_teams) { // hot teams array is not allocated if 939 // KMP_HOT_TEAMS_MAX_LEVEL=0 940 int level = team->t.t_active_level - 1; // index in array of hot teams 941 if (master_th->th.th_teams_microtask) { // are we inside the teams? 
942 if (master_th->th.th_teams_size.nteams > 1) { 943 ++level; // level was not increased in teams construct for 944 // team_of_masters 945 } 946 if (team->t.t_pkfn != (microtask_t)__kmp_teams_master && 947 master_th->th.th_teams_level == team->t.t_level) { 948 ++level; // level was not increased in teams construct for 949 // team_of_workers before the parallel 950 } // team->t.t_level will be increased inside parallel 951 } 952 if (level < __kmp_hot_teams_max_level) { 953 if (hot_teams[level].hot_team) { 954 // hot team has already been allocated for given level 955 KMP_DEBUG_ASSERT(hot_teams[level].hot_team == team); 956 use_hot_team = 1; // the team is ready to use 957 } else { 958 use_hot_team = 0; // AC: threads are not allocated yet 959 hot_teams[level].hot_team = team; // remember new hot team 960 hot_teams[level].hot_team_nth = team->t.t_nproc; 961 } 962 } else { 963 use_hot_team = 0; 964 } 965 } 966 #else 967 use_hot_team = team == root->r.r_hot_team; 968 #endif 969 if (!use_hot_team) { 970 971 /* install the primary thread */ 972 team->t.t_threads[0] = master_th; 973 __kmp_initialize_info(master_th, team, 0, master_gtid); 974 975 /* now, install the worker threads */ 976 for (i = 1; i < team->t.t_nproc; i++) { 977 978 /* fork or reallocate a new thread and install it in team */ 979 kmp_info_t *thr = __kmp_allocate_thread(root, team, i); 980 team->t.t_threads[i] = thr; 981 KMP_DEBUG_ASSERT(thr); 982 KMP_DEBUG_ASSERT(thr->th.th_team == team); 983 /* align team and thread arrived states */ 984 KA_TRACE(20, ("__kmp_fork_team_threads: T#%d(%d:%d) init arrived " 985 "T#%d(%d:%d) join =%llu, plain=%llu\n", 986 __kmp_gtid_from_tid(0, team), team->t.t_id, 0, 987 __kmp_gtid_from_tid(i, team), team->t.t_id, i, 988 team->t.t_bar[bs_forkjoin_barrier].b_arrived, 989 team->t.t_bar[bs_plain_barrier].b_arrived)); 990 thr->th.th_teams_microtask = master_th->th.th_teams_microtask; 991 thr->th.th_teams_level = master_th->th.th_teams_level; 992 thr->th.th_teams_size = master_th->th.th_teams_size; 993 { // Initialize threads' barrier data. 994 int b; 995 kmp_balign_t *balign = team->t.t_threads[i]->th.th_bar; 996 for (b = 0; b < bs_last_barrier; ++b) { 997 balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived; 998 KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG); 999 #if USE_DEBUGGER 1000 balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived; 1001 #endif 1002 } 1003 } 1004 } 1005 1006 #if KMP_AFFINITY_SUPPORTED 1007 // Do not partition the places list for teams construct workers who 1008 // haven't actually been forked to do real work yet. This partitioning 1009 // will take place in the parallel region nested within the teams construct. 1010 if (!fork_teams_workers) { 1011 __kmp_partition_places(team); 1012 } 1013 #endif 1014 } 1015 1016 if (__kmp_display_affinity && team->t.t_display_affinity != 1) { 1017 for (i = 0; i < team->t.t_nproc; i++) { 1018 kmp_info_t *thr = team->t.t_threads[i]; 1019 if (thr->th.th_prev_num_threads != team->t.t_nproc || 1020 thr->th.th_prev_level != team->t.t_level) { 1021 team->t.t_display_affinity = 1; 1022 break; 1023 } 1024 } 1025 } 1026 1027 KMP_MB(); 1028 } 1029 1030 #if KMP_ARCH_X86 || KMP_ARCH_X86_64 1031 // Propagate any changes to the floating point control registers out to the team 1032 // We try to avoid unnecessary writes to the relevant cache line in the team 1033 // structure, so we don't make changes unless they are needed. 
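// A minimal sketch of the check-before-write idiom that propagateFPControl
// relies on (KMP_CHECK_UPDATE is defined in kmp.h; this expansion is shown
// for illustration only and is not the authoritative definition):
//
//   if (team->t.t_mxcsr != mxcsr)
//     team->t.t_mxcsr = mxcsr; // store only when the value actually changes
//
// Skipping redundant stores keeps the team's cache line out of the modified
// state, so the other threads in the team do not have to re-read it.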
1034 inline static void propagateFPControl(kmp_team_t *team) { 1035 if (__kmp_inherit_fp_control) { 1036 kmp_int16 x87_fpu_control_word; 1037 kmp_uint32 mxcsr; 1038 1039 // Get primary thread's values of FPU control flags (both X87 and vector) 1040 __kmp_store_x87_fpu_control_word(&x87_fpu_control_word); 1041 __kmp_store_mxcsr(&mxcsr); 1042 mxcsr &= KMP_X86_MXCSR_MASK; 1043 1044 // There is no point looking at t_fp_control_saved here. 1045 // If it is TRUE, we still have to update the values if they are different 1046 // from those we now have. If it is FALSE we didn't save anything yet, but 1047 // our objective is the same. We have to ensure that the values in the team 1048 // are the same as those we have. 1049 // So, this code achieves what we need whether or not t_fp_control_saved is 1050 // true. By checking whether the value needs updating we avoid unnecessary 1051 // writes that would put the cache-line into a written state, causing all 1052 // threads in the team to have to read it again. 1053 KMP_CHECK_UPDATE(team->t.t_x87_fpu_control_word, x87_fpu_control_word); 1054 KMP_CHECK_UPDATE(team->t.t_mxcsr, mxcsr); 1055 // Although we don't use this value, other code in the runtime wants to know 1056 // whether it should restore them. So we must ensure it is correct. 1057 KMP_CHECK_UPDATE(team->t.t_fp_control_saved, TRUE); 1058 } else { 1059 // Similarly here. Don't write to this cache-line in the team structure 1060 // unless we have to. 1061 KMP_CHECK_UPDATE(team->t.t_fp_control_saved, FALSE); 1062 } 1063 } 1064 1065 // Do the opposite, setting the hardware registers to the updated values from 1066 // the team. 1067 inline static void updateHWFPControl(kmp_team_t *team) { 1068 if (__kmp_inherit_fp_control && team->t.t_fp_control_saved) { 1069 // Only reset the fp control regs if they have been changed in the team. 1070 // the parallel region that we are exiting. 1071 kmp_int16 x87_fpu_control_word; 1072 kmp_uint32 mxcsr; 1073 __kmp_store_x87_fpu_control_word(&x87_fpu_control_word); 1074 __kmp_store_mxcsr(&mxcsr); 1075 mxcsr &= KMP_X86_MXCSR_MASK; 1076 1077 if (team->t.t_x87_fpu_control_word != x87_fpu_control_word) { 1078 __kmp_clear_x87_fpu_status_word(); 1079 __kmp_load_x87_fpu_control_word(&team->t.t_x87_fpu_control_word); 1080 } 1081 1082 if (team->t.t_mxcsr != mxcsr) { 1083 __kmp_load_mxcsr(&team->t.t_mxcsr); 1084 } 1085 } 1086 } 1087 #else 1088 #define propagateFPControl(x) ((void)0) 1089 #define updateHWFPControl(x) ((void)0) 1090 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */ 1091 1092 static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team, 1093 int realloc); // forward declaration 1094 1095 /* Run a parallel region that has been serialized, so runs only in a team of the 1096 single primary thread. 
*/ 1097 void __kmp_serialized_parallel(ident_t *loc, kmp_int32 global_tid) { 1098 kmp_info_t *this_thr; 1099 kmp_team_t *serial_team; 1100 1101 KC_TRACE(10, ("__kmpc_serialized_parallel: called by T#%d\n", global_tid)); 1102 1103 /* Skip all this code for autopar serialized loops since it results in 1104 unacceptable overhead */ 1105 if (loc != NULL && (loc->flags & KMP_IDENT_AUTOPAR)) 1106 return; 1107 1108 if (!TCR_4(__kmp_init_parallel)) 1109 __kmp_parallel_initialize(); 1110 __kmp_resume_if_soft_paused(); 1111 1112 this_thr = __kmp_threads[global_tid]; 1113 serial_team = this_thr->th.th_serial_team; 1114 1115 /* utilize the serialized team held by this thread */ 1116 KMP_DEBUG_ASSERT(serial_team); 1117 KMP_MB(); 1118 1119 if (__kmp_tasking_mode != tskm_immediate_exec) { 1120 KMP_DEBUG_ASSERT( 1121 this_thr->th.th_task_team == 1122 this_thr->th.th_team->t.t_task_team[this_thr->th.th_task_state]); 1123 KMP_DEBUG_ASSERT(serial_team->t.t_task_team[this_thr->th.th_task_state] == 1124 NULL); 1125 KA_TRACE(20, ("__kmpc_serialized_parallel: T#%d pushing task_team %p / " 1126 "team %p, new task_team = NULL\n", 1127 global_tid, this_thr->th.th_task_team, this_thr->th.th_team)); 1128 this_thr->th.th_task_team = NULL; 1129 } 1130 1131 kmp_proc_bind_t proc_bind = this_thr->th.th_set_proc_bind; 1132 if (this_thr->th.th_current_task->td_icvs.proc_bind == proc_bind_false) { 1133 proc_bind = proc_bind_false; 1134 } else if (proc_bind == proc_bind_default) { 1135 // No proc_bind clause was specified, so use the current value 1136 // of proc-bind-var for this parallel region. 1137 proc_bind = this_thr->th.th_current_task->td_icvs.proc_bind; 1138 } 1139 // Reset for next parallel region 1140 this_thr->th.th_set_proc_bind = proc_bind_default; 1141 1142 #if OMPT_SUPPORT 1143 ompt_data_t ompt_parallel_data = ompt_data_none; 1144 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(global_tid); 1145 if (ompt_enabled.enabled && 1146 this_thr->th.ompt_thread_info.state != ompt_state_overhead) { 1147 1148 ompt_task_info_t *parent_task_info; 1149 parent_task_info = OMPT_CUR_TASK_INFO(this_thr); 1150 1151 parent_task_info->frame.enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0); 1152 if (ompt_enabled.ompt_callback_parallel_begin) { 1153 int team_size = 1; 1154 1155 ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)( 1156 &(parent_task_info->task_data), &(parent_task_info->frame), 1157 &ompt_parallel_data, team_size, 1158 ompt_parallel_invoker_program | ompt_parallel_team, codeptr); 1159 } 1160 } 1161 #endif // OMPT_SUPPORT 1162 1163 if (this_thr->th.th_team != serial_team) { 1164 // Nested level will be an index in the nested nthreads array 1165 int level = this_thr->th.th_team->t.t_level; 1166 1167 if (serial_team->t.t_serialized) { 1168 /* this serial team was already used 1169 TODO increase performance by making this locks more specific */ 1170 kmp_team_t *new_team; 1171 1172 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock); 1173 1174 new_team = 1175 __kmp_allocate_team(this_thr->th.th_root, 1, 1, 1176 #if OMPT_SUPPORT 1177 ompt_parallel_data, 1178 #endif 1179 proc_bind, &this_thr->th.th_current_task->td_icvs, 1180 0 USE_NESTED_HOT_ARG(NULL)); 1181 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); 1182 KMP_ASSERT(new_team); 1183 1184 /* setup new serialized team and install it */ 1185 new_team->t.t_threads[0] = this_thr; 1186 new_team->t.t_parent = this_thr->th.th_team; 1187 serial_team = new_team; 1188 this_thr->th.th_serial_team = serial_team; 1189 1190 KF_TRACE( 1191 10, 1192 ("__kmpc_serialized_parallel: T#%d 
allocated new serial team %p\n", 1193 global_tid, serial_team)); 1194 1195 /* TODO the above breaks the requirement that if we run out of resources, 1196 then we can still guarantee that serialized teams are ok, since we may 1197 need to allocate a new one */ 1198 } else { 1199 KF_TRACE( 1200 10, 1201 ("__kmpc_serialized_parallel: T#%d reusing cached serial team %p\n", 1202 global_tid, serial_team)); 1203 } 1204 1205 /* we have to initialize this serial team */ 1206 KMP_DEBUG_ASSERT(serial_team->t.t_threads); 1207 KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr); 1208 KMP_DEBUG_ASSERT(this_thr->th.th_team != serial_team); 1209 serial_team->t.t_ident = loc; 1210 serial_team->t.t_serialized = 1; 1211 serial_team->t.t_nproc = 1; 1212 serial_team->t.t_parent = this_thr->th.th_team; 1213 serial_team->t.t_sched.sched = this_thr->th.th_team->t.t_sched.sched; 1214 this_thr->th.th_team = serial_team; 1215 serial_team->t.t_master_tid = this_thr->th.th_info.ds.ds_tid; 1216 1217 KF_TRACE(10, ("__kmpc_serialized_parallel: T#%d curtask=%p\n", global_tid, 1218 this_thr->th.th_current_task)); 1219 KMP_ASSERT(this_thr->th.th_current_task->td_flags.executing == 1); 1220 this_thr->th.th_current_task->td_flags.executing = 0; 1221 1222 __kmp_push_current_task_to_thread(this_thr, serial_team, 0); 1223 1224 /* TODO: GEH: do ICVs work for nested serialized teams? Don't we need an 1225 implicit task for each serialized task represented by 1226 team->t.t_serialized? */ 1227 copy_icvs(&this_thr->th.th_current_task->td_icvs, 1228 &this_thr->th.th_current_task->td_parent->td_icvs); 1229 1230 // Thread value exists in the nested nthreads array for the next nested 1231 // level 1232 if (__kmp_nested_nth.used && (level + 1 < __kmp_nested_nth.used)) { 1233 this_thr->th.th_current_task->td_icvs.nproc = 1234 __kmp_nested_nth.nth[level + 1]; 1235 } 1236 1237 if (__kmp_nested_proc_bind.used && 1238 (level + 1 < __kmp_nested_proc_bind.used)) { 1239 this_thr->th.th_current_task->td_icvs.proc_bind = 1240 __kmp_nested_proc_bind.bind_types[level + 1]; 1241 } 1242 1243 #if USE_DEBUGGER 1244 serial_team->t.t_pkfn = (microtask_t)(~0); // For the debugger. 
1245 #endif 1246 this_thr->th.th_info.ds.ds_tid = 0; 1247 1248 /* set thread cache values */ 1249 this_thr->th.th_team_nproc = 1; 1250 this_thr->th.th_team_master = this_thr; 1251 this_thr->th.th_team_serialized = 1; 1252 1253 serial_team->t.t_level = serial_team->t.t_parent->t.t_level + 1; 1254 serial_team->t.t_active_level = serial_team->t.t_parent->t.t_active_level; 1255 serial_team->t.t_def_allocator = this_thr->th.th_def_allocator; // save 1256 1257 propagateFPControl(serial_team); 1258 1259 /* check if we need to allocate dispatch buffers stack */ 1260 KMP_DEBUG_ASSERT(serial_team->t.t_dispatch); 1261 if (!serial_team->t.t_dispatch->th_disp_buffer) { 1262 serial_team->t.t_dispatch->th_disp_buffer = 1263 (dispatch_private_info_t *)__kmp_allocate( 1264 sizeof(dispatch_private_info_t)); 1265 } 1266 this_thr->th.th_dispatch = serial_team->t.t_dispatch; 1267 1268 KMP_MB(); 1269 1270 } else { 1271 /* this serialized team is already being used, 1272 * that's fine, just add another nested level */ 1273 KMP_DEBUG_ASSERT(this_thr->th.th_team == serial_team); 1274 KMP_DEBUG_ASSERT(serial_team->t.t_threads); 1275 KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr); 1276 ++serial_team->t.t_serialized; 1277 this_thr->th.th_team_serialized = serial_team->t.t_serialized; 1278 1279 // Nested level will be an index in the nested nthreads array 1280 int level = this_thr->th.th_team->t.t_level; 1281 // Thread value exists in the nested nthreads array for the next nested 1282 // level 1283 if (__kmp_nested_nth.used && (level + 1 < __kmp_nested_nth.used)) { 1284 this_thr->th.th_current_task->td_icvs.nproc = 1285 __kmp_nested_nth.nth[level + 1]; 1286 } 1287 serial_team->t.t_level++; 1288 KF_TRACE(10, ("__kmpc_serialized_parallel: T#%d increasing nesting level " 1289 "of serial team %p to %d\n", 1290 global_tid, serial_team, serial_team->t.t_level)); 1291 1292 /* allocate/push dispatch buffers stack */ 1293 KMP_DEBUG_ASSERT(serial_team->t.t_dispatch); 1294 { 1295 dispatch_private_info_t *disp_buffer = 1296 (dispatch_private_info_t *)__kmp_allocate( 1297 sizeof(dispatch_private_info_t)); 1298 disp_buffer->next = serial_team->t.t_dispatch->th_disp_buffer; 1299 serial_team->t.t_dispatch->th_disp_buffer = disp_buffer; 1300 } 1301 this_thr->th.th_dispatch = serial_team->t.t_dispatch; 1302 1303 KMP_MB(); 1304 } 1305 KMP_CHECK_UPDATE(serial_team->t.t_cancel_request, cancel_noreq); 1306 1307 // Perform the display affinity functionality for 1308 // serialized parallel regions 1309 if (__kmp_display_affinity) { 1310 if (this_thr->th.th_prev_level != serial_team->t.t_level || 1311 this_thr->th.th_prev_num_threads != 1) { 1312 // NULL means use the affinity-format-var ICV 1313 __kmp_aux_display_affinity(global_tid, NULL); 1314 this_thr->th.th_prev_level = serial_team->t.t_level; 1315 this_thr->th.th_prev_num_threads = 1; 1316 } 1317 } 1318 1319 if (__kmp_env_consistency_check) 1320 __kmp_push_parallel(global_tid, NULL); 1321 #if OMPT_SUPPORT 1322 serial_team->t.ompt_team_info.master_return_address = codeptr; 1323 if (ompt_enabled.enabled && 1324 this_thr->th.ompt_thread_info.state != ompt_state_overhead) { 1325 OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame.ptr = 1326 OMPT_GET_FRAME_ADDRESS(0); 1327 1328 ompt_lw_taskteam_t lw_taskteam; 1329 __ompt_lw_taskteam_init(&lw_taskteam, this_thr, global_tid, 1330 &ompt_parallel_data, codeptr); 1331 1332 __ompt_lw_taskteam_link(&lw_taskteam, this_thr, 1); 1333 // don't use lw_taskteam after linking. 
    // content was swapped

    /* OMPT implicit task begin */
    if (ompt_enabled.ompt_callback_implicit_task) {
      ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
          ompt_scope_begin, OMPT_CUR_TEAM_DATA(this_thr),
          OMPT_CUR_TASK_DATA(this_thr), 1, __kmp_tid_from_gtid(global_tid),
          ompt_task_implicit); // TODO: Can this be ompt_task_initial?
      OMPT_CUR_TASK_INFO(this_thr)->thread_num =
          __kmp_tid_from_gtid(global_tid);
    }

    /* OMPT state */
    this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
    OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame.ptr =
        OMPT_GET_FRAME_ADDRESS(0);
  }
#endif
}

/* most of the work for a fork */
/* return true if we really went parallel, false if serialized */
int __kmp_fork_call(ident_t *loc, int gtid,
                    enum fork_context_e call_context, // Intel, GNU, ...
                    kmp_int32 argc, microtask_t microtask, launch_t invoker,
                    kmp_va_list ap) {
  void **argv;
  int i;
  int master_tid;
  int master_this_cons;
  kmp_team_t *team;
  kmp_team_t *parent_team;
  kmp_info_t *master_th;
  kmp_root_t *root;
  int nthreads;
  int master_active;
  int master_set_numthreads;
  int level;
  int active_level;
  int teams_level;
#if KMP_NESTED_HOT_TEAMS
  kmp_hot_team_ptr_t **p_hot_teams;
#endif
  { // KMP_TIME_BLOCK
    KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_fork_call);
    KMP_COUNT_VALUE(OMP_PARALLEL_args, argc);

    KA_TRACE(20, ("__kmp_fork_call: enter T#%d\n", gtid));
    if (__kmp_stkpadding > 0 && __kmp_root[gtid] != NULL) {
      /* Some systems prefer the stack for the root thread(s) to start with */
      /* some gap from the parent stack to prevent false sharing. */
      void *dummy = KMP_ALLOCA(__kmp_stkpadding);
      /* These 2 lines below are so this does not get optimized out */
      if (__kmp_stkpadding > KMP_MAX_STKPADDING)
        __kmp_stkpadding += (short)((kmp_int64)dummy);
    }

    /* initialize if needed */
    KMP_DEBUG_ASSERT(
        __kmp_init_serial); // AC: potentially unsafe, not in sync with shutdown
    if (!TCR_4(__kmp_init_parallel))
      __kmp_parallel_initialize();
    __kmp_resume_if_soft_paused();

    /* setup current data */
    master_th = __kmp_threads[gtid]; // AC: potentially unsafe, not in sync with
    // shutdown
    parent_team = master_th->th.th_team;
    master_tid = master_th->th.th_info.ds.ds_tid;
    master_this_cons = master_th->th.th_local.this_construct;
    root = master_th->th.th_root;
    master_active = root->r.r_active;
    master_set_numthreads = master_th->th.th_set_nproc;

#if OMPT_SUPPORT
    ompt_data_t ompt_parallel_data = ompt_data_none;
    ompt_data_t *parent_task_data;
    ompt_frame_t *ompt_frame;
    ompt_data_t *implicit_task_data;
    void *return_address = NULL;

    if (ompt_enabled.enabled) {
      __ompt_get_task_info_internal(0, NULL, &parent_task_data, &ompt_frame,
                                    NULL, NULL);
      return_address = OMPT_LOAD_RETURN_ADDRESS(gtid);
    }
#endif

    // Assign affinity to root thread if it hasn't happened yet
    __kmp_assign_root_init_mask();

    // Nested level will be an index in the nested nthreads array
    level = parent_team->t.t_level;
    // used to launch non-serial teams even if nested is not allowed
    active_level = parent_team->t.t_active_level;
    // needed to check nesting inside the teams
    teams_level = master_th->th.th_teams_level;
#if
KMP_NESTED_HOT_TEAMS 1431 p_hot_teams = &master_th->th.th_hot_teams; 1432 if (*p_hot_teams == NULL && __kmp_hot_teams_max_level > 0) { 1433 *p_hot_teams = (kmp_hot_team_ptr_t *)__kmp_allocate( 1434 sizeof(kmp_hot_team_ptr_t) * __kmp_hot_teams_max_level); 1435 (*p_hot_teams)[0].hot_team = root->r.r_hot_team; 1436 // it is either actual or not needed (when active_level > 0) 1437 (*p_hot_teams)[0].hot_team_nth = 1; 1438 } 1439 #endif 1440 1441 #if OMPT_SUPPORT 1442 if (ompt_enabled.enabled) { 1443 if (ompt_enabled.ompt_callback_parallel_begin) { 1444 int team_size = master_set_numthreads 1445 ? master_set_numthreads 1446 : get__nproc_2(parent_team, master_tid); 1447 int flags = OMPT_INVOKER(call_context) | 1448 ((microtask == (microtask_t)__kmp_teams_master) 1449 ? ompt_parallel_league 1450 : ompt_parallel_team); 1451 ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)( 1452 parent_task_data, ompt_frame, &ompt_parallel_data, team_size, flags, 1453 return_address); 1454 } 1455 master_th->th.ompt_thread_info.state = ompt_state_overhead; 1456 } 1457 #endif 1458 1459 master_th->th.th_ident = loc; 1460 1461 if (master_th->th.th_teams_microtask && ap && 1462 microtask != (microtask_t)__kmp_teams_master && level == teams_level) { 1463 // AC: This is start of parallel that is nested inside teams construct. 1464 // The team is actual (hot), all workers are ready at the fork barrier. 1465 // No lock needed to initialize the team a bit, then free workers. 1466 parent_team->t.t_ident = loc; 1467 __kmp_alloc_argv_entries(argc, parent_team, TRUE); 1468 parent_team->t.t_argc = argc; 1469 argv = (void **)parent_team->t.t_argv; 1470 for (i = argc - 1; i >= 0; --i) 1471 *argv++ = va_arg(kmp_va_deref(ap), void *); 1472 // Increment our nested depth levels, but not increase the serialization 1473 if (parent_team == master_th->th.th_serial_team) { 1474 // AC: we are in serialized parallel 1475 __kmpc_serialized_parallel(loc, gtid); 1476 KMP_DEBUG_ASSERT(parent_team->t.t_serialized > 1); 1477 1478 if (call_context == fork_context_gnu) { 1479 // AC: need to decrement t_serialized for enquiry functions to work 1480 // correctly, will restore at join time 1481 parent_team->t.t_serialized--; 1482 return TRUE; 1483 } 1484 1485 #if OMPD_SUPPORT 1486 parent_team->t.t_pkfn = microtask; 1487 #endif 1488 1489 #if OMPT_SUPPORT 1490 void *dummy; 1491 void **exit_frame_p; 1492 1493 ompt_lw_taskteam_t lw_taskteam; 1494 1495 if (ompt_enabled.enabled) { 1496 __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid, 1497 &ompt_parallel_data, return_address); 1498 exit_frame_p = &(lw_taskteam.ompt_task_info.frame.exit_frame.ptr); 1499 1500 __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0); 1501 // don't use lw_taskteam after linking. 
          // content was swapped

          /* OMPT implicit task begin */
          implicit_task_data = OMPT_CUR_TASK_DATA(master_th);
          if (ompt_enabled.ompt_callback_implicit_task) {
            OMPT_CUR_TASK_INFO(master_th)->thread_num =
                __kmp_tid_from_gtid(gtid);
            ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
                ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
                implicit_task_data, 1,
                OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
          }

          /* OMPT state */
          master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
        } else {
          exit_frame_p = &dummy;
        }
#endif
        // AC: need to decrement t_serialized for enquiry functions to work
        // correctly, will restore at join time
        parent_team->t.t_serialized--;

        {
          KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
          KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
          __kmp_invoke_microtask(microtask, gtid, 0, argc, parent_team->t.t_argv
#if OMPT_SUPPORT
                                 ,
                                 exit_frame_p
#endif
          );
        }

#if OMPT_SUPPORT
        if (ompt_enabled.enabled) {
          *exit_frame_p = NULL;
          OMPT_CUR_TASK_INFO(master_th)->frame.exit_frame = ompt_data_none;
          if (ompt_enabled.ompt_callback_implicit_task) {
            ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
                ompt_scope_end, NULL, implicit_task_data, 1,
                OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
          }
          ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
          __ompt_lw_taskteam_unlink(master_th);
          if (ompt_enabled.ompt_callback_parallel_end) {
            ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
                &ompt_parallel_data, OMPT_CUR_TASK_DATA(master_th),
                OMPT_INVOKER(call_context) | ompt_parallel_team,
                return_address);
          }
          master_th->th.ompt_thread_info.state = ompt_state_overhead;
        }
#endif
        return TRUE;
      }

      parent_team->t.t_pkfn = microtask;
      parent_team->t.t_invoke = invoker;
      KMP_ATOMIC_INC(&root->r.r_in_parallel);
      parent_team->t.t_active_level++;
      parent_team->t.t_level++;
      parent_team->t.t_def_allocator = master_th->th.th_def_allocator; // save

#if OMPT_SUPPORT
      if (ompt_enabled.enabled) {
        ompt_lw_taskteam_t lw_taskteam;
        __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
                                &ompt_parallel_data, return_address);
        __ompt_lw_taskteam_link(&lw_taskteam, master_th, 1, true);
      }
#endif

      /* Change number of threads in the team if requested */
      if (master_set_numthreads) { // The parallel has num_threads clause
        if (master_set_numthreads <= master_th->th.th_teams_size.nth) {
          // AC: can only reduce the number of threads dynamically, not
          // increase it
          kmp_info_t **other_threads = parent_team->t.t_threads;
          // NOTE: if using distributed barrier, we need to run this code block
          // even when the team size appears not to have changed from the max.
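          // Hypothetical user code that reaches this branch (illustration
          // only, not taken from the source or tests):
          //
          //   #pragma omp teams num_teams(2) thread_limit(8)
          //   #pragma omp parallel num_threads(4)
          //   { /* ... */ }
          //
          // The hot team was set up for th_teams_size.nth workers (roughly,
          // the teams construct's thread_limit), so a smaller num_threads
          // clause only shrinks the bookkeeping here; the surplus threads
          // stay in the team for a possible wider parallel later ("Keep
          // extra threads hot in the team", below).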
1581 int old_proc = master_th->th.th_teams_size.nth; 1582 if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == 1583 bp_dist_bar) { 1584 __kmp_resize_dist_barrier(parent_team, old_proc, 1585 master_set_numthreads); 1586 __kmp_add_threads_to_team(parent_team, master_set_numthreads); 1587 } 1588 parent_team->t.t_nproc = master_set_numthreads; 1589 for (i = 0; i < master_set_numthreads; ++i) { 1590 other_threads[i]->th.th_team_nproc = master_set_numthreads; 1591 } 1592 } 1593 // Keep extra threads hot in the team for possible next parallels 1594 master_th->th.th_set_nproc = 0; 1595 } 1596 1597 #if USE_DEBUGGER 1598 if (__kmp_debugging) { // Let debugger override number of threads. 1599 int nth = __kmp_omp_num_threads(loc); 1600 if (nth > 0) { // 0 means debugger doesn't want to change num threads 1601 master_set_numthreads = nth; 1602 } 1603 } 1604 #endif 1605 1606 // Figure out the proc_bind policy for the nested parallel within teams 1607 kmp_proc_bind_t proc_bind = master_th->th.th_set_proc_bind; 1608 // proc_bind_default means don't update 1609 kmp_proc_bind_t proc_bind_icv = proc_bind_default; 1610 if (master_th->th.th_current_task->td_icvs.proc_bind == proc_bind_false) { 1611 proc_bind = proc_bind_false; 1612 } else { 1613 // No proc_bind clause specified; use current proc-bind-var 1614 if (proc_bind == proc_bind_default) { 1615 proc_bind = master_th->th.th_current_task->td_icvs.proc_bind; 1616 } 1617 /* else: The proc_bind policy was specified explicitly on parallel 1618 clause. 1619 This overrides proc-bind-var for this parallel region, but does not 1620 change proc-bind-var. */ 1621 // Figure the value of proc-bind-var for the child threads. 1622 if ((level + 1 < __kmp_nested_proc_bind.used) && 1623 (__kmp_nested_proc_bind.bind_types[level + 1] != 1624 master_th->th.th_current_task->td_icvs.proc_bind)) { 1625 proc_bind_icv = __kmp_nested_proc_bind.bind_types[level + 1]; 1626 } 1627 } 1628 KMP_CHECK_UPDATE(parent_team->t.t_proc_bind, proc_bind); 1629 // Need to change the bind-var ICV to correct value for each implicit task 1630 if (proc_bind_icv != proc_bind_default && 1631 master_th->th.th_current_task->td_icvs.proc_bind != proc_bind_icv) { 1632 kmp_info_t **other_threads = parent_team->t.t_threads; 1633 for (i = 0; i < master_th->th.th_team_nproc; ++i) { 1634 other_threads[i]->th.th_current_task->td_icvs.proc_bind = 1635 proc_bind_icv; 1636 } 1637 } 1638 // Reset for next parallel region 1639 master_th->th.th_set_proc_bind = proc_bind_default; 1640 1641 #if USE_ITT_BUILD && USE_ITT_NOTIFY 1642 if (((__itt_frame_submit_v3_ptr && __itt_get_timestamp_ptr) || 1643 KMP_ITT_DEBUG) && 1644 __kmp_forkjoin_frames_mode == 3 && 1645 parent_team->t.t_active_level == 1 // only report frames at level 1 1646 && master_th->th.th_teams_size.nteams == 1) { 1647 kmp_uint64 tmp_time = __itt_get_timestamp(); 1648 master_th->th.th_frame_time = tmp_time; 1649 parent_team->t.t_region_time = tmp_time; 1650 } 1651 if (__itt_stack_caller_create_ptr) { 1652 KMP_DEBUG_ASSERT(parent_team->t.t_stack_id == NULL); 1653 // create new stack stitching id before entering fork barrier 1654 parent_team->t.t_stack_id = __kmp_itt_stack_caller_create(); 1655 } 1656 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */ 1657 #if KMP_AFFINITY_SUPPORTED 1658 __kmp_partition_places(parent_team); 1659 #endif 1660 1661 KF_TRACE(10, ("__kmp_fork_call: before internal fork: root=%p, team=%p, " 1662 "master_th=%p, gtid=%d\n", 1663 root, parent_team, master_th, gtid)); 1664 __kmp_internal_fork(loc, gtid, parent_team); 1665 KF_TRACE(10, 
("__kmp_fork_call: after internal fork: root=%p, team=%p, " 1666 "master_th=%p, gtid=%d\n", 1667 root, parent_team, master_th, gtid)); 1668 1669 if (call_context == fork_context_gnu) 1670 return TRUE; 1671 1672 /* Invoke microtask for PRIMARY thread */ 1673 KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n", gtid, 1674 parent_team->t.t_id, parent_team->t.t_pkfn)); 1675 1676 if (!parent_team->t.t_invoke(gtid)) { 1677 KMP_ASSERT2(0, "cannot invoke microtask for PRIMARY thread"); 1678 } 1679 KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n", gtid, 1680 parent_team->t.t_id, parent_team->t.t_pkfn)); 1681 KMP_MB(); /* Flush all pending memory write invalidates. */ 1682 1683 KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid)); 1684 1685 return TRUE; 1686 } // Parallel closely nested in teams construct 1687 1688 #if KMP_DEBUG 1689 if (__kmp_tasking_mode != tskm_immediate_exec) { 1690 KMP_DEBUG_ASSERT(master_th->th.th_task_team == 1691 parent_team->t.t_task_team[master_th->th.th_task_state]); 1692 } 1693 #endif 1694 1695 // Need this to happen before we determine the number of threads, not while 1696 // we are allocating the team 1697 //__kmp_push_current_task_to_thread(master_th, parent_team, 0); 1698 int enter_teams = 0; 1699 if (parent_team->t.t_active_level >= 1700 master_th->th.th_current_task->td_icvs.max_active_levels) { 1701 nthreads = 1; 1702 } else { 1703 enter_teams = ((ap == NULL && active_level == 0) || 1704 (ap && teams_level > 0 && teams_level == level)); 1705 nthreads = master_set_numthreads 1706 ? master_set_numthreads 1707 // TODO: get nproc directly from current task 1708 : get__nproc_2(parent_team, master_tid); 1709 // Check if we need to take forkjoin lock? (no need for serialized 1710 // parallel out of teams construct). This code moved here from 1711 // __kmp_reserve_threads() to speedup nested serialized parallels. 1712 if (nthreads > 1) { 1713 if ((get__max_active_levels(master_th) == 1 && 1714 (root->r.r_in_parallel && !enter_teams)) || 1715 (__kmp_library == library_serial)) { 1716 KC_TRACE(10, ("__kmp_fork_call: T#%d serializing team; requested %d" 1717 " threads\n", 1718 gtid, nthreads)); 1719 nthreads = 1; 1720 } 1721 } 1722 if (nthreads > 1) { 1723 /* determine how many new threads we can use */ 1724 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock); 1725 /* AC: If we execute teams from parallel region (on host), then teams 1726 should be created but each can only have 1 thread if nesting is 1727 disabled. If teams called from serial region, then teams and their 1728 threads should be created regardless of the nesting setting. */ 1729 nthreads = __kmp_reserve_threads(root, parent_team, master_tid, 1730 nthreads, enter_teams); 1731 if (nthreads == 1) { 1732 // Free lock for single thread execution here; for multi-thread 1733 // execution it will be freed later after team of threads created 1734 // and initialized 1735 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); 1736 } 1737 } 1738 } 1739 KMP_DEBUG_ASSERT(nthreads > 0); 1740 1741 // If we temporarily changed the set number of threads then restore it now 1742 master_th->th.th_set_nproc = 0; 1743 1744 /* create a serialized parallel region? */ 1745 if (nthreads == 1) { 1746 /* josh todo: hypothetical question: what do we do for OS X*? 
*/ 1747 #if KMP_OS_LINUX && \ 1748 (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) 1749 void *args[argc]; 1750 #else 1751 void **args = (void **)KMP_ALLOCA(argc * sizeof(void *)); 1752 #endif /* KMP_OS_LINUX && ( KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || \ 1753 KMP_ARCH_AARCH64) */ 1754 1755 KA_TRACE(20, 1756 ("__kmp_fork_call: T#%d serializing parallel region\n", gtid)); 1757 1758 __kmpc_serialized_parallel(loc, gtid); 1759 1760 #if OMPD_SUPPORT 1761 master_th->th.th_serial_team->t.t_pkfn = microtask; 1762 #endif 1763 1764 if (call_context == fork_context_intel) { 1765 /* TODO this sucks, use the compiler itself to pass args! :) */ 1766 master_th->th.th_serial_team->t.t_ident = loc; 1767 if (!ap) { 1768 // revert change made in __kmpc_serialized_parallel() 1769 master_th->th.th_serial_team->t.t_level--; 1770 // Get args from parent team for teams construct 1771 1772 #if OMPT_SUPPORT 1773 void *dummy; 1774 void **exit_frame_p; 1775 ompt_task_info_t *task_info; 1776 1777 ompt_lw_taskteam_t lw_taskteam; 1778 1779 if (ompt_enabled.enabled) { 1780 __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid, 1781 &ompt_parallel_data, return_address); 1782 1783 __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0); 1784 // don't use lw_taskteam after linking. content was swapped 1785 1786 task_info = OMPT_CUR_TASK_INFO(master_th); 1787 exit_frame_p = &(task_info->frame.exit_frame.ptr); 1788 if (ompt_enabled.ompt_callback_implicit_task) { 1789 OMPT_CUR_TASK_INFO(master_th)->thread_num = 1790 __kmp_tid_from_gtid(gtid); 1791 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 1792 ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th), 1793 &(task_info->task_data), 1, 1794 OMPT_CUR_TASK_INFO(master_th)->thread_num, 1795 ompt_task_implicit); 1796 } 1797 1798 /* OMPT state */ 1799 master_th->th.ompt_thread_info.state = ompt_state_work_parallel; 1800 } else { 1801 exit_frame_p = &dummy; 1802 } 1803 #endif 1804 1805 { 1806 KMP_TIME_PARTITIONED_BLOCK(OMP_parallel); 1807 KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK); 1808 __kmp_invoke_microtask(microtask, gtid, 0, argc, 1809 parent_team->t.t_argv 1810 #if OMPT_SUPPORT 1811 , 1812 exit_frame_p 1813 #endif 1814 ); 1815 } 1816 1817 #if OMPT_SUPPORT 1818 if (ompt_enabled.enabled) { 1819 *exit_frame_p = NULL; 1820 if (ompt_enabled.ompt_callback_implicit_task) { 1821 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 1822 ompt_scope_end, NULL, &(task_info->task_data), 1, 1823 OMPT_CUR_TASK_INFO(master_th)->thread_num, 1824 ompt_task_implicit); 1825 } 1826 ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th); 1827 __ompt_lw_taskteam_unlink(master_th); 1828 if (ompt_enabled.ompt_callback_parallel_end) { 1829 ompt_callbacks.ompt_callback(ompt_callback_parallel_end)( 1830 &ompt_parallel_data, parent_task_data, 1831 OMPT_INVOKER(call_context) | ompt_parallel_team, 1832 return_address); 1833 } 1834 master_th->th.ompt_thread_info.state = ompt_state_overhead; 1835 } 1836 #endif 1837 } else if (microtask == (microtask_t)__kmp_teams_master) { 1838 KMP_DEBUG_ASSERT(master_th->th.th_team == 1839 master_th->th.th_serial_team); 1840 team = master_th->th.th_team; 1841 // team->t.t_pkfn = microtask; 1842 team->t.t_invoke = invoker; 1843 __kmp_alloc_argv_entries(argc, team, TRUE); 1844 team->t.t_argc = argc; 1845 argv = (void **)team->t.t_argv; 1846 if (ap) { 1847 for (i = argc - 1; i >= 0; --i) 1848 *argv++ = va_arg(kmp_va_deref(ap), void *); 1849 } else { 1850 for (i = 0; i < argc; ++i) 1851 // Get args from parent team for teams construct 1852
argv[i] = parent_team->t.t_argv[i]; 1853 } 1854 // AC: revert change made in __kmpc_serialized_parallel() 1855 // because initial code in teams should have level=0 1856 team->t.t_level--; 1857 // AC: call special invoker for outer "parallel" of teams construct 1858 invoker(gtid); 1859 #if OMPT_SUPPORT 1860 if (ompt_enabled.enabled) { 1861 ompt_task_info_t *task_info = OMPT_CUR_TASK_INFO(master_th); 1862 if (ompt_enabled.ompt_callback_implicit_task) { 1863 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 1864 ompt_scope_end, NULL, &(task_info->task_data), 0, 1865 OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_initial); 1866 } 1867 if (ompt_enabled.ompt_callback_parallel_end) { 1868 ompt_callbacks.ompt_callback(ompt_callback_parallel_end)( 1869 &ompt_parallel_data, parent_task_data, 1870 OMPT_INVOKER(call_context) | ompt_parallel_league, 1871 return_address); 1872 } 1873 master_th->th.ompt_thread_info.state = ompt_state_overhead; 1874 } 1875 #endif 1876 } else { 1877 argv = args; 1878 for (i = argc - 1; i >= 0; --i) 1879 *argv++ = va_arg(kmp_va_deref(ap), void *); 1880 KMP_MB(); 1881 1882 #if OMPT_SUPPORT 1883 void *dummy; 1884 void **exit_frame_p; 1885 ompt_task_info_t *task_info; 1886 1887 ompt_lw_taskteam_t lw_taskteam; 1888 1889 if (ompt_enabled.enabled) { 1890 __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid, 1891 &ompt_parallel_data, return_address); 1892 __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0); 1893 // don't use lw_taskteam after linking. content was swapped 1894 task_info = OMPT_CUR_TASK_INFO(master_th); 1895 exit_frame_p = &(task_info->frame.exit_frame.ptr); 1896 1897 /* OMPT implicit task begin */ 1898 implicit_task_data = OMPT_CUR_TASK_DATA(master_th); 1899 if (ompt_enabled.ompt_callback_implicit_task) { 1900 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 1901 ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th), 1902 implicit_task_data, 1, __kmp_tid_from_gtid(gtid), 1903 ompt_task_implicit); 1904 OMPT_CUR_TASK_INFO(master_th)->thread_num = 1905 __kmp_tid_from_gtid(gtid); 1906 } 1907 1908 /* OMPT state */ 1909 master_th->th.ompt_thread_info.state = ompt_state_work_parallel; 1910 } else { 1911 exit_frame_p = &dummy; 1912 } 1913 #endif 1914 1915 { 1916 KMP_TIME_PARTITIONED_BLOCK(OMP_parallel); 1917 KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK); 1918 __kmp_invoke_microtask(microtask, gtid, 0, argc, args 1919 #if OMPT_SUPPORT 1920 , 1921 exit_frame_p 1922 #endif 1923 ); 1924 } 1925 1926 #if OMPT_SUPPORT 1927 if (ompt_enabled.enabled) { 1928 *exit_frame_p = NULL; 1929 if (ompt_enabled.ompt_callback_implicit_task) { 1930 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 1931 ompt_scope_end, NULL, &(task_info->task_data), 1, 1932 OMPT_CUR_TASK_INFO(master_th)->thread_num, 1933 ompt_task_implicit); 1934 } 1935 1936 ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th); 1937 __ompt_lw_taskteam_unlink(master_th); 1938 if (ompt_enabled.ompt_callback_parallel_end) { 1939 ompt_callbacks.ompt_callback(ompt_callback_parallel_end)( 1940 &ompt_parallel_data, parent_task_data, 1941 OMPT_INVOKER(call_context) | ompt_parallel_team, 1942 return_address); 1943 } 1944 master_th->th.ompt_thread_info.state = ompt_state_overhead; 1945 } 1946 #endif 1947 } 1948 } else if (call_context == fork_context_gnu) { 1949 #if OMPT_SUPPORT 1950 ompt_lw_taskteam_t lwt; 1951 __ompt_lw_taskteam_init(&lwt, master_th, gtid, &ompt_parallel_data, 1952 return_address); 1953 1954 lwt.ompt_task_info.frame.exit_frame = ompt_data_none; 1955 __ompt_lw_taskteam_link(&lwt, master_th, 1);
1956 // don't use lw_taskteam after linking. content was swapped 1957 #endif 1958 1959 // we were called from GNU native code 1960 KA_TRACE(20, ("__kmp_fork_call: T#%d serial exit\n", gtid)); 1961 return FALSE; 1962 } else { 1963 KMP_ASSERT2(call_context < fork_context_last, 1964 "__kmp_fork_call: unknown fork_context parameter"); 1965 } 1966 1967 KA_TRACE(20, ("__kmp_fork_call: T#%d serial exit\n", gtid)); 1968 KMP_MB(); 1969 return FALSE; 1970 } // if (nthreads == 1) 1971 1972 // GEH: only modify the executing flag in the case when not serialized 1973 // serialized case is handled in kmpc_serialized_parallel 1974 KF_TRACE(10, ("__kmp_fork_call: parent_team_aclevel=%d, master_th=%p, " 1975 "curtask=%p, curtask_max_aclevel=%d\n", 1976 parent_team->t.t_active_level, master_th, 1977 master_th->th.th_current_task, 1978 master_th->th.th_current_task->td_icvs.max_active_levels)); 1979 // TODO: GEH - cannot do this assertion because root thread not set up as 1980 // executing 1981 // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 1 ); 1982 master_th->th.th_current_task->td_flags.executing = 0; 1983 1984 if (!master_th->th.th_teams_microtask || level > teams_level) { 1985 /* Increment our nested depth level */ 1986 KMP_ATOMIC_INC(&root->r.r_in_parallel); 1987 } 1988 1989 // See if we need to make a copy of the ICVs. 1990 int nthreads_icv = master_th->th.th_current_task->td_icvs.nproc; 1991 if ((level + 1 < __kmp_nested_nth.used) && 1992 (__kmp_nested_nth.nth[level + 1] != nthreads_icv)) { 1993 nthreads_icv = __kmp_nested_nth.nth[level + 1]; 1994 } else { 1995 nthreads_icv = 0; // don't update 1996 } 1997 1998 // Figure out the proc_bind_policy for the new team. 1999 kmp_proc_bind_t proc_bind = master_th->th.th_set_proc_bind; 2000 // proc_bind_default means don't update 2001 kmp_proc_bind_t proc_bind_icv = proc_bind_default; 2002 if (master_th->th.th_current_task->td_icvs.proc_bind == proc_bind_false) { 2003 proc_bind = proc_bind_false; 2004 } else { 2005 // No proc_bind clause specified; use current proc-bind-var for this 2006 // parallel region 2007 if (proc_bind == proc_bind_default) { 2008 proc_bind = master_th->th.th_current_task->td_icvs.proc_bind; 2009 } 2010 // Have teams construct take proc_bind value from KMP_TEAMS_PROC_BIND 2011 if (master_th->th.th_teams_microtask && 2012 microtask == (microtask_t)__kmp_teams_master) { 2013 proc_bind = __kmp_teams_proc_bind; 2014 } 2015 /* else: The proc_bind policy was specified explicitly on parallel clause. 2016 This overrides proc-bind-var for this parallel region, but does not 2017 change proc-bind-var. */ 2018 // Figure the value of proc-bind-var for the child threads.
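// Illustrative case (assuming a list-valued OMP_PROC_BIND such as "spread,close"):
// __kmp_nested_proc_bind.bind_types[] then holds one bind kind per nesting level,
// and the check below only propagates a new bind-var to the child level when the
// entry for level + 1 differs from the current task's proc_bind ICV.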
2019 if ((level + 1 < __kmp_nested_proc_bind.used) && 2020 (__kmp_nested_proc_bind.bind_types[level + 1] != 2021 master_th->th.th_current_task->td_icvs.proc_bind)) { 2022 // Do not modify the proc bind icv for the two teams construct forks 2023 // They just let the proc bind icv pass through 2024 if (!master_th->th.th_teams_microtask || 2025 !(microtask == (microtask_t)__kmp_teams_master || ap == NULL)) 2026 proc_bind_icv = __kmp_nested_proc_bind.bind_types[level + 1]; 2027 } 2028 } 2029 2030 // Reset for next parallel region 2031 master_th->th.th_set_proc_bind = proc_bind_default; 2032 2033 if ((nthreads_icv > 0) || (proc_bind_icv != proc_bind_default)) { 2034 kmp_internal_control_t new_icvs; 2035 copy_icvs(&new_icvs, &master_th->th.th_current_task->td_icvs); 2036 new_icvs.next = NULL; 2037 if (nthreads_icv > 0) { 2038 new_icvs.nproc = nthreads_icv; 2039 } 2040 if (proc_bind_icv != proc_bind_default) { 2041 new_icvs.proc_bind = proc_bind_icv; 2042 } 2043 2044 /* allocate a new parallel team */ 2045 KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n")); 2046 team = __kmp_allocate_team(root, nthreads, nthreads, 2047 #if OMPT_SUPPORT 2048 ompt_parallel_data, 2049 #endif 2050 proc_bind, &new_icvs, 2051 argc USE_NESTED_HOT_ARG(master_th)); 2052 if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) 2053 copy_icvs((kmp_internal_control_t *)team->t.b->team_icvs, &new_icvs); 2054 } else { 2055 /* allocate a new parallel team */ 2056 KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n")); 2057 team = __kmp_allocate_team(root, nthreads, nthreads, 2058 #if OMPT_SUPPORT 2059 ompt_parallel_data, 2060 #endif 2061 proc_bind, 2062 &master_th->th.th_current_task->td_icvs, 2063 argc USE_NESTED_HOT_ARG(master_th)); 2064 if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) 2065 copy_icvs((kmp_internal_control_t *)team->t.b->team_icvs, 2066 &master_th->th.th_current_task->td_icvs); 2067 } 2068 KF_TRACE( 2069 10, ("__kmp_fork_call: after __kmp_allocate_team - team = %p\n", team)); 2070 2071 /* setup the new team */ 2072 KMP_CHECK_UPDATE(team->t.t_master_tid, master_tid); 2073 KMP_CHECK_UPDATE(team->t.t_master_this_cons, master_this_cons); 2074 KMP_CHECK_UPDATE(team->t.t_ident, loc); 2075 KMP_CHECK_UPDATE(team->t.t_parent, parent_team); 2076 KMP_CHECK_UPDATE_SYNC(team->t.t_pkfn, microtask); 2077 #if OMPT_SUPPORT 2078 KMP_CHECK_UPDATE_SYNC(team->t.ompt_team_info.master_return_address, 2079 return_address); 2080 #endif 2081 KMP_CHECK_UPDATE(team->t.t_invoke, invoker); // TODO move to root, maybe 2082 // TODO: parent_team->t.t_level == INT_MAX ??? 
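// The KMP_CHECK_UPDATE / KMP_CHECK_UPDATE_SYNC helpers used above store only when
// the value actually changes (roughly `if (dst != src) dst = src;`, with a
// TCW_SYNC store in the _SYNC variant), so reusing a hot team with unchanged
// settings does not dirty its shared cache lines.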
2083 if (!master_th->th.th_teams_microtask || level > teams_level) { 2084 int new_level = parent_team->t.t_level + 1; 2085 KMP_CHECK_UPDATE(team->t.t_level, new_level); 2086 new_level = parent_team->t.t_active_level + 1; 2087 KMP_CHECK_UPDATE(team->t.t_active_level, new_level); 2088 } else { 2089 // AC: Do not increase parallel level at start of the teams construct 2090 int new_level = parent_team->t.t_level; 2091 KMP_CHECK_UPDATE(team->t.t_level, new_level); 2092 new_level = parent_team->t.t_active_level; 2093 KMP_CHECK_UPDATE(team->t.t_active_level, new_level); 2094 } 2095 kmp_r_sched_t new_sched = get__sched_2(parent_team, master_tid); 2096 // set primary thread's schedule as new run-time schedule 2097 KMP_CHECK_UPDATE(team->t.t_sched.sched, new_sched.sched); 2098 2099 KMP_CHECK_UPDATE(team->t.t_cancel_request, cancel_noreq); 2100 KMP_CHECK_UPDATE(team->t.t_def_allocator, master_th->th.th_def_allocator); 2101 2102 // Update the floating point rounding in the team if required. 2103 propagateFPControl(team); 2104 #if OMPD_SUPPORT 2105 if (ompd_state & OMPD_ENABLE_BP) 2106 ompd_bp_parallel_begin(); 2107 #endif 2108 2109 if (__kmp_tasking_mode != tskm_immediate_exec) { 2110 // Set primary thread's task team to team's task team. Unless this is hot 2111 // team, it should be NULL. 2112 KMP_DEBUG_ASSERT(master_th->th.th_task_team == 2113 parent_team->t.t_task_team[master_th->th.th_task_state]); 2114 KA_TRACE(20, ("__kmp_fork_call: Primary T#%d pushing task_team %p / team " 2115 "%p, new task_team %p / team %p\n", 2116 __kmp_gtid_from_thread(master_th), 2117 master_th->th.th_task_team, parent_team, 2118 team->t.t_task_team[master_th->th.th_task_state], team)); 2119 2120 if (active_level || master_th->th.th_task_team) { 2121 // Take a memo of primary thread's task_state 2122 KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack); 2123 if (master_th->th.th_task_state_top >= 2124 master_th->th.th_task_state_stack_sz) { // increase size 2125 kmp_uint32 new_size = 2 * master_th->th.th_task_state_stack_sz; 2126 kmp_uint8 *old_stack, *new_stack; 2127 kmp_uint32 i; 2128 new_stack = (kmp_uint8 *)__kmp_allocate(new_size); 2129 for (i = 0; i < master_th->th.th_task_state_stack_sz; ++i) { 2130 new_stack[i] = master_th->th.th_task_state_memo_stack[i]; 2131 } 2132 for (i = master_th->th.th_task_state_stack_sz; i < new_size; 2133 ++i) { // zero-init rest of stack 2134 new_stack[i] = 0; 2135 } 2136 old_stack = master_th->th.th_task_state_memo_stack; 2137 master_th->th.th_task_state_memo_stack = new_stack; 2138 master_th->th.th_task_state_stack_sz = new_size; 2139 __kmp_free(old_stack); 2140 } 2141 // Store primary thread's task_state on stack 2142 master_th->th 2143 .th_task_state_memo_stack[master_th->th.th_task_state_top] = 2144 master_th->th.th_task_state; 2145 master_th->th.th_task_state_top++; 2146 #if KMP_NESTED_HOT_TEAMS 2147 if (master_th->th.th_hot_teams && 2148 active_level < __kmp_hot_teams_max_level && 2149 team == master_th->th.th_hot_teams[active_level].hot_team) { 2150 // Restore primary thread's nested state if nested hot team 2151 master_th->th.th_task_state = 2152 master_th->th 2153 .th_task_state_memo_stack[master_th->th.th_task_state_top]; 2154 } else { 2155 #endif 2156 master_th->th.th_task_state = 0; 2157 #if KMP_NESTED_HOT_TEAMS 2158 } 2159 #endif 2160 } 2161 #if !KMP_NESTED_HOT_TEAMS 2162 KMP_DEBUG_ASSERT((master_th->th.th_task_team == NULL) || 2163 (team == root->r.r_hot_team)); 2164 #endif 2165 } 2166 2167 KA_TRACE( 2168 20, 2169 ("__kmp_fork_call: T#%d(%d:%d)->(%d:0) created a team 
of %d threads\n", 2170 gtid, parent_team->t.t_id, team->t.t_master_tid, team->t.t_id, 2171 team->t.t_nproc)); 2172 KMP_DEBUG_ASSERT(team != root->r.r_hot_team || 2173 (team->t.t_master_tid == 0 && 2174 (team->t.t_parent == root->r.r_root_team || 2175 team->t.t_parent->t.t_serialized))); 2176 KMP_MB(); 2177 2178 /* now, setup the arguments */ 2179 argv = (void **)team->t.t_argv; 2180 if (ap) { 2181 for (i = argc - 1; i >= 0; --i) { 2182 void *new_argv = va_arg(kmp_va_deref(ap), void *); 2183 KMP_CHECK_UPDATE(*argv, new_argv); 2184 argv++; 2185 } 2186 } else { 2187 for (i = 0; i < argc; ++i) { 2188 // Get args from parent team for teams construct 2189 KMP_CHECK_UPDATE(argv[i], team->t.t_parent->t.t_argv[i]); 2190 } 2191 } 2192 2193 /* now actually fork the threads */ 2194 KMP_CHECK_UPDATE(team->t.t_master_active, master_active); 2195 if (!root->r.r_active) // Only do assignment if it prevents cache ping-pong 2196 root->r.r_active = TRUE; 2197 2198 __kmp_fork_team_threads(root, team, master_th, gtid, !ap); 2199 __kmp_setup_icv_copy(team, nthreads, 2200 &master_th->th.th_current_task->td_icvs, loc); 2201 2202 #if OMPT_SUPPORT 2203 master_th->th.ompt_thread_info.state = ompt_state_work_parallel; 2204 #endif 2205 2206 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); 2207 2208 #if USE_ITT_BUILD 2209 if (team->t.t_active_level == 1 // only report frames at level 1 2210 && !master_th->th.th_teams_microtask) { // not in teams construct 2211 #if USE_ITT_NOTIFY 2212 if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) && 2213 (__kmp_forkjoin_frames_mode == 3 || 2214 __kmp_forkjoin_frames_mode == 1)) { 2215 kmp_uint64 tmp_time = 0; 2216 if (__itt_get_timestamp_ptr) 2217 tmp_time = __itt_get_timestamp(); 2218 // Internal fork - report frame begin 2219 master_th->th.th_frame_time = tmp_time; 2220 if (__kmp_forkjoin_frames_mode == 3) 2221 team->t.t_region_time = tmp_time; 2222 } else 2223 // only one notification scheme (either "submit" or "forking/joined", not both) 2224 #endif /* USE_ITT_NOTIFY */ 2225 if ((__itt_frame_begin_v3_ptr || KMP_ITT_DEBUG) && 2226 __kmp_forkjoin_frames && !__kmp_forkjoin_frames_mode) { 2227 // Mark start of "parallel" region for Intel(R) VTune(TM) analyzer. 
2228 __kmp_itt_region_forking(gtid, team->t.t_nproc, 0); 2229 } 2230 } 2231 #endif /* USE_ITT_BUILD */ 2232 2233 /* now go on and do the work */ 2234 KMP_DEBUG_ASSERT(team == __kmp_threads[gtid]->th.th_team); 2235 KMP_MB(); 2236 KF_TRACE(10, 2237 ("__kmp_internal_fork : root=%p, team=%p, master_th=%p, gtid=%d\n", 2238 root, team, master_th, gtid)); 2239 2240 #if USE_ITT_BUILD 2241 if (__itt_stack_caller_create_ptr) { 2242 // create new stack stitching id before entering fork barrier 2243 if (!enter_teams) { 2244 KMP_DEBUG_ASSERT(team->t.t_stack_id == NULL); 2245 team->t.t_stack_id = __kmp_itt_stack_caller_create(); 2246 } else if (parent_team->t.t_serialized) { 2247 // keep stack stitching id in the serialized parent_team; 2248 // current team will be used for parallel inside the teams; 2249 // if parent_team is active, then it already keeps stack stitching id 2250 // for the league of teams 2251 KMP_DEBUG_ASSERT(parent_team->t.t_stack_id == NULL); 2252 parent_team->t.t_stack_id = __kmp_itt_stack_caller_create(); 2253 } 2254 } 2255 #endif /* USE_ITT_BUILD */ 2256 2257 // AC: skip __kmp_internal_fork at teams construct, let only primary 2258 // threads execute 2259 if (ap) { 2260 __kmp_internal_fork(loc, gtid, team); 2261 KF_TRACE(10, ("__kmp_internal_fork : after : root=%p, team=%p, " 2262 "master_th=%p, gtid=%d\n", 2263 root, team, master_th, gtid)); 2264 } 2265 2266 if (call_context == fork_context_gnu) { 2267 KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid)); 2268 return TRUE; 2269 } 2270 2271 /* Invoke microtask for PRIMARY thread */ 2272 KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n", gtid, 2273 team->t.t_id, team->t.t_pkfn)); 2274 } // END of timer KMP_fork_call block 2275 2276 #if KMP_STATS_ENABLED 2277 // If beginning a teams construct, then change thread state 2278 stats_state_e previous_state = KMP_GET_THREAD_STATE(); 2279 if (!ap) { 2280 KMP_SET_THREAD_STATE(stats_state_e::TEAMS_REGION); 2281 } 2282 #endif 2283 2284 if (!team->t.t_invoke(gtid)) { 2285 KMP_ASSERT2(0, "cannot invoke microtask for PRIMARY thread"); 2286 } 2287 2288 #if KMP_STATS_ENABLED 2289 // If was beginning of a teams construct, then reset thread state 2290 if (!ap) { 2291 KMP_SET_THREAD_STATE(previous_state); 2292 } 2293 #endif 2294 2295 KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n", gtid, 2296 team->t.t_id, team->t.t_pkfn)); 2297 KMP_MB(); /* Flush all pending memory write invalidates. */ 2298 2299 KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid)); 2300 #if OMPT_SUPPORT 2301 if (ompt_enabled.enabled) { 2302 master_th->th.ompt_thread_info.state = ompt_state_overhead; 2303 } 2304 #endif 2305 2306 return TRUE; 2307 } 2308 2309 #if OMPT_SUPPORT 2310 static inline void __kmp_join_restore_state(kmp_info_t *thread, 2311 kmp_team_t *team) { 2312 // restore state outside the region 2313 thread->th.ompt_thread_info.state = 2314 ((team->t.t_serialized) ? 
ompt_state_work_serial 2315 : ompt_state_work_parallel); 2316 } 2317 2318 static inline void __kmp_join_ompt(int gtid, kmp_info_t *thread, 2319 kmp_team_t *team, ompt_data_t *parallel_data, 2320 int flags, void *codeptr) { 2321 ompt_task_info_t *task_info = __ompt_get_task_info_object(0); 2322 if (ompt_enabled.ompt_callback_parallel_end) { 2323 ompt_callbacks.ompt_callback(ompt_callback_parallel_end)( 2324 parallel_data, &(task_info->task_data), flags, codeptr); 2325 } 2326 2327 task_info->frame.enter_frame = ompt_data_none; 2328 __kmp_join_restore_state(thread, team); 2329 } 2330 #endif 2331 2332 void __kmp_join_call(ident_t *loc, int gtid 2333 #if OMPT_SUPPORT 2334 , 2335 enum fork_context_e fork_context 2336 #endif 2337 , 2338 int exit_teams) { 2339 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_join_call); 2340 kmp_team_t *team; 2341 kmp_team_t *parent_team; 2342 kmp_info_t *master_th; 2343 kmp_root_t *root; 2344 int master_active; 2345 2346 KA_TRACE(20, ("__kmp_join_call: enter T#%d\n", gtid)); 2347 2348 /* setup current data */ 2349 master_th = __kmp_threads[gtid]; 2350 root = master_th->th.th_root; 2351 team = master_th->th.th_team; 2352 parent_team = team->t.t_parent; 2353 2354 master_th->th.th_ident = loc; 2355 2356 #if OMPT_SUPPORT 2357 void *team_microtask = (void *)team->t.t_pkfn; 2358 // For GOMP interface with serialized parallel, need the 2359 // __kmpc_end_serialized_parallel to call hooks for OMPT end-implicit-task 2360 // and end-parallel events. 2361 if (ompt_enabled.enabled && 2362 !(team->t.t_serialized && fork_context == fork_context_gnu)) { 2363 master_th->th.ompt_thread_info.state = ompt_state_overhead; 2364 } 2365 #endif 2366 2367 #if KMP_DEBUG 2368 if (__kmp_tasking_mode != tskm_immediate_exec && !exit_teams) { 2369 KA_TRACE(20, ("__kmp_join_call: T#%d, old team = %p old task_team = %p, " 2370 "th_task_team = %p\n", 2371 __kmp_gtid_from_thread(master_th), team, 2372 team->t.t_task_team[master_th->th.th_task_state], 2373 master_th->th.th_task_team)); 2374 KMP_DEBUG_ASSERT(master_th->th.th_task_team == 2375 team->t.t_task_team[master_th->th.th_task_state]); 2376 } 2377 #endif 2378 2379 if (team->t.t_serialized) { 2380 if (master_th->th.th_teams_microtask) { 2381 // We are in teams construct 2382 int level = team->t.t_level; 2383 int tlevel = master_th->th.th_teams_level; 2384 if (level == tlevel) { 2385 // AC: we haven't incremented it earlier at start of teams construct, 2386 // so do it here - at the end of teams construct 2387 team->t.t_level++; 2388 } else if (level == tlevel + 1) { 2389 // AC: we are exiting parallel inside teams, need to increment 2390 // serialization in order to restore it in the next call to 2391 // __kmpc_end_serialized_parallel 2392 team->t.t_serialized++; 2393 } 2394 } 2395 __kmpc_end_serialized_parallel(loc, gtid); 2396 2397 #if OMPT_SUPPORT 2398 if (ompt_enabled.enabled) { 2399 __kmp_join_restore_state(master_th, parent_team); 2400 } 2401 #endif 2402 2403 return; 2404 } 2405 2406 master_active = team->t.t_master_active; 2407 2408 if (!exit_teams) { 2409 // AC: No barrier for internal teams at exit from teams construct. 2410 // But there is barrier for external team (league). 
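// __kmp_internal_join() runs the join barrier for this team, so the primary
// thread waits here until all workers have finished the parallel region before
// the stack stitching id is destroyed below.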
2411 __kmp_internal_join(loc, gtid, team); 2412 #if USE_ITT_BUILD 2413 if (__itt_stack_caller_create_ptr) { 2414 KMP_DEBUG_ASSERT(team->t.t_stack_id != NULL); 2415 // destroy the stack stitching id after join barrier 2416 __kmp_itt_stack_caller_destroy((__itt_caller)team->t.t_stack_id); 2417 team->t.t_stack_id = NULL; 2418 } 2419 #endif 2420 } else { 2421 master_th->th.th_task_state = 2422 0; // AC: no tasking in teams (out of any parallel) 2423 #if USE_ITT_BUILD 2424 if (__itt_stack_caller_create_ptr && parent_team->t.t_serialized) { 2425 KMP_DEBUG_ASSERT(parent_team->t.t_stack_id != NULL); 2426 // destroy the stack stitching id on exit from the teams construct 2427 // if parent_team is active, then the id will be destroyed later on 2428 // by master of the league of teams 2429 __kmp_itt_stack_caller_destroy((__itt_caller)parent_team->t.t_stack_id); 2430 parent_team->t.t_stack_id = NULL; 2431 } 2432 #endif 2433 2434 if (team->t.t_nproc > 1 && 2435 __kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) { 2436 team->t.b->update_num_threads(team->t.t_nproc); 2437 __kmp_add_threads_to_team(team, team->t.t_nproc); 2438 } 2439 } 2440 2441 KMP_MB(); 2442 2443 #if OMPT_SUPPORT 2444 ompt_data_t *parallel_data = &(team->t.ompt_team_info.parallel_data); 2445 void *codeptr = team->t.ompt_team_info.master_return_address; 2446 #endif 2447 2448 #if USE_ITT_BUILD 2449 // Mark end of "parallel" region for Intel(R) VTune(TM) analyzer. 2450 if (team->t.t_active_level == 1 && 2451 (!master_th->th.th_teams_microtask || /* not in teams construct */ 2452 master_th->th.th_teams_size.nteams == 1)) { 2453 master_th->th.th_ident = loc; 2454 // only one notification scheme (either "submit" or "forking/joined", not 2455 // both) 2456 if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) && 2457 __kmp_forkjoin_frames_mode == 3) 2458 __kmp_itt_frame_submit(gtid, team->t.t_region_time, 2459 master_th->th.th_frame_time, 0, loc, 2460 master_th->th.th_team_nproc, 1); 2461 else if ((__itt_frame_end_v3_ptr || KMP_ITT_DEBUG) && 2462 !__kmp_forkjoin_frames_mode && __kmp_forkjoin_frames) 2463 __kmp_itt_region_joined(gtid); 2464 } // active_level == 1 2465 #endif /* USE_ITT_BUILD */ 2466 2467 #if KMP_AFFINITY_SUPPORTED 2468 if (!exit_teams) { 2469 // Restore master thread's partition. 
2470 master_th->th.th_first_place = team->t.t_first_place; 2471 master_th->th.th_last_place = team->t.t_last_place; 2472 } 2473 #endif // KMP_AFFINITY_SUPPORTED 2474 2475 if (master_th->th.th_teams_microtask && !exit_teams && 2476 team->t.t_pkfn != (microtask_t)__kmp_teams_master && 2477 team->t.t_level == master_th->th.th_teams_level + 1) { 2478 // AC: We need to leave the team structure intact at the end of parallel 2479 // inside the teams construct, so that at the next parallel same (hot) team 2480 // works, only adjust nesting levels 2481 #if OMPT_SUPPORT 2482 ompt_data_t ompt_parallel_data = ompt_data_none; 2483 if (ompt_enabled.enabled) { 2484 ompt_task_info_t *task_info = __ompt_get_task_info_object(0); 2485 if (ompt_enabled.ompt_callback_implicit_task) { 2486 int ompt_team_size = team->t.t_nproc; 2487 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 2488 ompt_scope_end, NULL, &(task_info->task_data), ompt_team_size, 2489 OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit); 2490 } 2491 task_info->frame.exit_frame = ompt_data_none; 2492 task_info->task_data = ompt_data_none; 2493 ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th); 2494 __ompt_lw_taskteam_unlink(master_th); 2495 } 2496 #endif 2497 /* Decrement our nested depth level */ 2498 team->t.t_level--; 2499 team->t.t_active_level--; 2500 KMP_ATOMIC_DEC(&root->r.r_in_parallel); 2501 2502 // Restore number of threads in the team if needed. This code relies on 2503 // the proper adjustment of th_teams_size.nth after the fork in 2504 // __kmp_teams_master on each teams primary thread in the case that 2505 // __kmp_reserve_threads reduced it. 2506 if (master_th->th.th_team_nproc < master_th->th.th_teams_size.nth) { 2507 int old_num = master_th->th.th_team_nproc; 2508 int new_num = master_th->th.th_teams_size.nth; 2509 kmp_info_t **other_threads = team->t.t_threads; 2510 team->t.t_nproc = new_num; 2511 for (int i = 0; i < old_num; ++i) { 2512 other_threads[i]->th.th_team_nproc = new_num; 2513 } 2514 // Adjust states of non-used threads of the team 2515 for (int i = old_num; i < new_num; ++i) { 2516 // Re-initialize thread's barrier data. 2517 KMP_DEBUG_ASSERT(other_threads[i]); 2518 kmp_balign_t *balign = other_threads[i]->th.th_bar; 2519 for (int b = 0; b < bs_last_barrier; ++b) { 2520 balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived; 2521 KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG); 2522 #if USE_DEBUGGER 2523 balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived; 2524 #endif 2525 } 2526 if (__kmp_tasking_mode != tskm_immediate_exec) { 2527 // Synchronize thread's task state 2528 other_threads[i]->th.th_task_state = master_th->th.th_task_state; 2529 } 2530 } 2531 } 2532 2533 #if OMPT_SUPPORT 2534 if (ompt_enabled.enabled) { 2535 __kmp_join_ompt(gtid, master_th, parent_team, &ompt_parallel_data, 2536 OMPT_INVOKER(fork_context) | ompt_parallel_team, codeptr); 2537 } 2538 #endif 2539 2540 return; 2541 } 2542 2543 /* do cleanup and restore the parent team */ 2544 master_th->th.th_info.ds.ds_tid = team->t.t_master_tid; 2545 master_th->th.th_local.this_construct = team->t.t_master_this_cons; 2546 2547 master_th->th.th_dispatch = &parent_team->t.t_dispatch[team->t.t_master_tid]; 2548 2549 /* jc: The following lock has instructions with REL and ACQ semantics, 2550 separating the parallel user code called in this parallel region 2551 from the serial user code called after this function returns. 
*/ 2552 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock); 2553 2554 if (!master_th->th.th_teams_microtask || 2555 team->t.t_level > master_th->th.th_teams_level) { 2556 /* Decrement our nested depth level */ 2557 KMP_ATOMIC_DEC(&root->r.r_in_parallel); 2558 } 2559 KMP_DEBUG_ASSERT(root->r.r_in_parallel >= 0); 2560 2561 #if OMPT_SUPPORT 2562 if (ompt_enabled.enabled) { 2563 ompt_task_info_t *task_info = __ompt_get_task_info_object(0); 2564 if (ompt_enabled.ompt_callback_implicit_task) { 2565 int flags = (team_microtask == (void *)__kmp_teams_master) 2566 ? ompt_task_initial 2567 : ompt_task_implicit; 2568 int ompt_team_size = (flags == ompt_task_initial) ? 0 : team->t.t_nproc; 2569 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 2570 ompt_scope_end, NULL, &(task_info->task_data), ompt_team_size, 2571 OMPT_CUR_TASK_INFO(master_th)->thread_num, flags); 2572 } 2573 task_info->frame.exit_frame = ompt_data_none; 2574 task_info->task_data = ompt_data_none; 2575 } 2576 #endif 2577 2578 KF_TRACE(10, ("__kmp_join_call1: T#%d, this_thread=%p team=%p\n", 0, 2579 master_th, team)); 2580 __kmp_pop_current_task_from_thread(master_th); 2581 2582 master_th->th.th_def_allocator = team->t.t_def_allocator; 2583 2584 #if OMPD_SUPPORT 2585 if (ompd_state & OMPD_ENABLE_BP) 2586 ompd_bp_parallel_end(); 2587 #endif 2588 updateHWFPControl(team); 2589 2590 if (root->r.r_active != master_active) 2591 root->r.r_active = master_active; 2592 2593 __kmp_free_team(root, team USE_NESTED_HOT_ARG( 2594 master_th)); // this will free worker threads 2595 2596 /* this race was fun to find. make sure the following is in the critical 2597 region otherwise assertions may fail occasionally since the old team may be 2598 reallocated and the hierarchy appears inconsistent. it is actually safe to 2599 run and won't cause any bugs, but will cause those assertion failures. 
it's 2600 only one deref&assign so might as well put this in the critical region */ 2601 master_th->th.th_team = parent_team; 2602 master_th->th.th_team_nproc = parent_team->t.t_nproc; 2603 master_th->th.th_team_master = parent_team->t.t_threads[0]; 2604 master_th->th.th_team_serialized = parent_team->t.t_serialized; 2605 2606 /* restore serialized team, if need be */ 2607 if (parent_team->t.t_serialized && 2608 parent_team != master_th->th.th_serial_team && 2609 parent_team != root->r.r_root_team) { 2610 __kmp_free_team(root, 2611 master_th->th.th_serial_team USE_NESTED_HOT_ARG(NULL)); 2612 master_th->th.th_serial_team = parent_team; 2613 } 2614 2615 if (__kmp_tasking_mode != tskm_immediate_exec) { 2616 if (master_th->th.th_task_state_top > 2617 0) { // Restore task state from memo stack 2618 KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack); 2619 // Remember primary thread's state if we re-use this nested hot team 2620 master_th->th.th_task_state_memo_stack[master_th->th.th_task_state_top] = 2621 master_th->th.th_task_state; 2622 --master_th->th.th_task_state_top; // pop 2623 // Now restore state at this level 2624 master_th->th.th_task_state = 2625 master_th->th 2626 .th_task_state_memo_stack[master_th->th.th_task_state_top]; 2627 } 2628 // Copy the task team from the parent team to the primary thread 2629 master_th->th.th_task_team = 2630 parent_team->t.t_task_team[master_th->th.th_task_state]; 2631 KA_TRACE(20, 2632 ("__kmp_join_call: Primary T#%d restoring task_team %p, team %p\n", 2633 __kmp_gtid_from_thread(master_th), master_th->th.th_task_team, 2634 parent_team)); 2635 } 2636 2637 // TODO: GEH - cannot do this assertion because root thread not set up as 2638 // executing 2639 // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 0 ); 2640 master_th->th.th_current_task->td_flags.executing = 1; 2641 2642 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); 2643 2644 #if KMP_AFFINITY_SUPPORTED 2645 if (master_th->th.th_team->t.t_level == 0 && __kmp_affin_reset) { 2646 __kmp_reset_root_init_mask(gtid); 2647 } 2648 #endif 2649 #if OMPT_SUPPORT 2650 int flags = 2651 OMPT_INVOKER(fork_context) | 2652 ((team_microtask == (void *)__kmp_teams_master) ? ompt_parallel_league 2653 : ompt_parallel_team); 2654 if (ompt_enabled.enabled) { 2655 __kmp_join_ompt(gtid, master_th, parent_team, parallel_data, flags, 2656 codeptr); 2657 } 2658 #endif 2659 2660 KMP_MB(); 2661 KA_TRACE(20, ("__kmp_join_call: exit T#%d\n", gtid)); 2662 } 2663 2664 /* Check whether we should push an internal control record onto the 2665 serial team stack. If so, do it. 
*/ 2666 void __kmp_save_internal_controls(kmp_info_t *thread) { 2667 2668 if (thread->th.th_team != thread->th.th_serial_team) { 2669 return; 2670 } 2671 if (thread->th.th_team->t.t_serialized > 1) { 2672 int push = 0; 2673 2674 if (thread->th.th_team->t.t_control_stack_top == NULL) { 2675 push = 1; 2676 } else { 2677 if (thread->th.th_team->t.t_control_stack_top->serial_nesting_level != 2678 thread->th.th_team->t.t_serialized) { 2679 push = 1; 2680 } 2681 } 2682 if (push) { /* push a record on the serial team's stack */ 2683 kmp_internal_control_t *control = 2684 (kmp_internal_control_t *)__kmp_allocate( 2685 sizeof(kmp_internal_control_t)); 2686 2687 copy_icvs(control, &thread->th.th_current_task->td_icvs); 2688 2689 control->serial_nesting_level = thread->th.th_team->t.t_serialized; 2690 2691 control->next = thread->th.th_team->t.t_control_stack_top; 2692 thread->th.th_team->t.t_control_stack_top = control; 2693 } 2694 } 2695 } 2696 2697 /* Changes set_nproc */ 2698 void __kmp_set_num_threads(int new_nth, int gtid) { 2699 kmp_info_t *thread; 2700 kmp_root_t *root; 2701 2702 KF_TRACE(10, ("__kmp_set_num_threads: new __kmp_nth = %d\n", new_nth)); 2703 KMP_DEBUG_ASSERT(__kmp_init_serial); 2704 2705 if (new_nth < 1) 2706 new_nth = 1; 2707 else if (new_nth > __kmp_max_nth) 2708 new_nth = __kmp_max_nth; 2709 2710 KMP_COUNT_VALUE(OMP_set_numthreads, new_nth); 2711 thread = __kmp_threads[gtid]; 2712 if (thread->th.th_current_task->td_icvs.nproc == new_nth) 2713 return; // nothing to do 2714 2715 __kmp_save_internal_controls(thread); 2716 2717 set__nproc(thread, new_nth); 2718 2719 // If this omp_set_num_threads() call will cause the hot team size to be 2720 // reduced (in the absence of a num_threads clause), then reduce it now, 2721 // rather than waiting for the next parallel region. 2722 root = thread->th.th_root; 2723 if (__kmp_init_parallel && (!root->r.r_active) && 2724 (root->r.r_hot_team->t.t_nproc > new_nth) 2725 #if KMP_NESTED_HOT_TEAMS 2726 && __kmp_hot_teams_max_level && !__kmp_hot_teams_mode 2727 #endif 2728 ) { 2729 kmp_team_t *hot_team = root->r.r_hot_team; 2730 int f; 2731 2732 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock); 2733 2734 if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) { 2735 __kmp_resize_dist_barrier(hot_team, hot_team->t.t_nproc, new_nth); 2736 } 2737 // Release the extra threads we don't need any more. 2738 for (f = new_nth; f < hot_team->t.t_nproc; f++) { 2739 KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL); 2740 if (__kmp_tasking_mode != tskm_immediate_exec) { 2741 // When decreasing team size, threads no longer in the team should unref 2742 // task team. 2743 hot_team->t.t_threads[f]->th.th_task_team = NULL; 2744 } 2745 __kmp_free_thread(hot_team->t.t_threads[f]); 2746 hot_team->t.t_threads[f] = NULL; 2747 } 2748 hot_team->t.t_nproc = new_nth; 2749 #if KMP_NESTED_HOT_TEAMS 2750 if (thread->th.th_hot_teams) { 2751 KMP_DEBUG_ASSERT(hot_team == thread->th.th_hot_teams[0].hot_team); 2752 thread->th.th_hot_teams[0].hot_team_nth = new_nth; 2753 } 2754 #endif 2755 2756 if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) { 2757 hot_team->t.b->update_num_threads(new_nth); 2758 __kmp_add_threads_to_team(hot_team, new_nth); 2759 } 2760 2761 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); 2762 2763 // Update the t_nproc field in the threads that are still active. 
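// (The threads released in the loop above were handed back to the thread pool by
// __kmp_free_thread(); the threads remaining in the hot team are informed of the
// new size via th_team_nproc below.)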
2764 for (f = 0; f < new_nth; f++) { 2765 KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL); 2766 hot_team->t.t_threads[f]->th.th_team_nproc = new_nth; 2767 } 2768 // Special flag in case omp_set_num_threads() call 2769 hot_team->t.t_size_changed = -1; 2770 } 2771 } 2772 2773 /* Changes max_active_levels */ 2774 void __kmp_set_max_active_levels(int gtid, int max_active_levels) { 2775 kmp_info_t *thread; 2776 2777 KF_TRACE(10, ("__kmp_set_max_active_levels: new max_active_levels for thread " 2778 "%d = (%d)\n", 2779 gtid, max_active_levels)); 2780 KMP_DEBUG_ASSERT(__kmp_init_serial); 2781 2782 // validate max_active_levels 2783 if (max_active_levels < 0) { 2784 KMP_WARNING(ActiveLevelsNegative, max_active_levels); 2785 // We ignore this call if the user has specified a negative value. 2786 // The current setting won't be changed. The last valid setting will be 2787 // used. A warning will be issued (if warnings are allowed as controlled by 2788 // the KMP_WARNINGS env var). 2789 KF_TRACE(10, ("__kmp_set_max_active_levels: the call is ignored: new " 2790 "max_active_levels for thread %d = (%d)\n", 2791 gtid, max_active_levels)); 2792 return; 2793 } 2794 if (max_active_levels <= KMP_MAX_ACTIVE_LEVELS_LIMIT) { 2795 // it's OK, the max_active_levels is within the valid range: [ 0; 2796 // KMP_MAX_ACTIVE_LEVELS_LIMIT ] 2797 // We allow a zero value. (implementation defined behavior) 2798 } else { 2799 KMP_WARNING(ActiveLevelsExceedLimit, max_active_levels, 2800 KMP_MAX_ACTIVE_LEVELS_LIMIT); 2801 max_active_levels = KMP_MAX_ACTIVE_LEVELS_LIMIT; 2802 // Current upper limit is MAX_INT. (implementation defined behavior) 2803 // If the input exceeds the upper limit, we correct the input to be the 2804 // upper limit. (implementation defined behavior) 2805 // Actually, the flow should never get here until we use MAX_INT limit. 
2806 } 2807 KF_TRACE(10, ("__kmp_set_max_active_levels: after validation: new " 2808 "max_active_levels for thread %d = (%d)\n", 2809 gtid, max_active_levels)); 2810 2811 thread = __kmp_threads[gtid]; 2812 2813 __kmp_save_internal_controls(thread); 2814 2815 set__max_active_levels(thread, max_active_levels); 2816 } 2817 2818 /* Gets max_active_levels */ 2819 int __kmp_get_max_active_levels(int gtid) { 2820 kmp_info_t *thread; 2821 2822 KF_TRACE(10, ("__kmp_get_max_active_levels: thread %d\n", gtid)); 2823 KMP_DEBUG_ASSERT(__kmp_init_serial); 2824 2825 thread = __kmp_threads[gtid]; 2826 KMP_DEBUG_ASSERT(thread->th.th_current_task); 2827 KF_TRACE(10, ("__kmp_get_max_active_levels: thread %d, curtask=%p, " 2828 "curtask_maxaclevel=%d\n", 2829 gtid, thread->th.th_current_task, 2830 thread->th.th_current_task->td_icvs.max_active_levels)); 2831 return thread->th.th_current_task->td_icvs.max_active_levels; 2832 } 2833 2834 // nteams-var per-device ICV 2835 void __kmp_set_num_teams(int num_teams) { 2836 if (num_teams > 0) 2837 __kmp_nteams = num_teams; 2838 } 2839 int __kmp_get_max_teams(void) { return __kmp_nteams; } 2840 // teams-thread-limit-var per-device ICV 2841 void __kmp_set_teams_thread_limit(int limit) { 2842 if (limit > 0) 2843 __kmp_teams_thread_limit = limit; 2844 } 2845 int __kmp_get_teams_thread_limit(void) { return __kmp_teams_thread_limit; } 2846 2847 KMP_BUILD_ASSERT(sizeof(kmp_sched_t) == sizeof(int)); 2848 KMP_BUILD_ASSERT(sizeof(enum sched_type) == sizeof(int)); 2849 2850 /* Changes def_sched_var ICV values (run-time schedule kind and chunk) */ 2851 void __kmp_set_schedule(int gtid, kmp_sched_t kind, int chunk) { 2852 kmp_info_t *thread; 2853 kmp_sched_t orig_kind; 2854 // kmp_team_t *team; 2855 2856 KF_TRACE(10, ("__kmp_set_schedule: new schedule for thread %d = (%d, %d)\n", 2857 gtid, (int)kind, chunk)); 2858 KMP_DEBUG_ASSERT(__kmp_init_serial); 2859 2860 // Check if the kind parameter is valid, correct if needed. 2861 // Valid parameters should fit in one of two intervals - standard or extended: 2862 // <lower>, <valid>, <upper_std>, <lower_ext>, <valid>, <upper> 2863 // 2008-01-25: 0, 1 - 4, 5, 100, 101 - 102, 103 2864 orig_kind = kind; 2865 kind = __kmp_sched_without_mods(kind); 2866 2867 if (kind <= kmp_sched_lower || kind >= kmp_sched_upper || 2868 (kind <= kmp_sched_lower_ext && kind >= kmp_sched_upper_std)) { 2869 // TODO: Hint needs attention in case we change the default schedule. 2870 __kmp_msg(kmp_ms_warning, KMP_MSG(ScheduleKindOutOfRange, kind), 2871 KMP_HNT(DefaultScheduleKindUsed, "static, no chunk"), 2872 __kmp_msg_null); 2873 kind = kmp_sched_default; 2874 chunk = 0; // ignore chunk value in case of bad kind 2875 } 2876 2877 thread = __kmp_threads[gtid]; 2878 2879 __kmp_save_internal_controls(thread); 2880 2881 if (kind < kmp_sched_upper_std) { 2882 if (kind == kmp_sched_static && chunk < KMP_DEFAULT_CHUNK) { 2883 // differ static chunked vs. 
unchunked: chunk should be invalid to 2884 // indicate unchunked schedule (which is the default) 2885 thread->th.th_current_task->td_icvs.sched.r_sched_type = kmp_sch_static; 2886 } else { 2887 thread->th.th_current_task->td_icvs.sched.r_sched_type = 2888 __kmp_sch_map[kind - kmp_sched_lower - 1]; 2889 } 2890 } else { 2891 // __kmp_sch_map[ kind - kmp_sched_lower_ext + kmp_sched_upper_std - 2892 // kmp_sched_lower - 2 ]; 2893 thread->th.th_current_task->td_icvs.sched.r_sched_type = 2894 __kmp_sch_map[kind - kmp_sched_lower_ext + kmp_sched_upper_std - 2895 kmp_sched_lower - 2]; 2896 } 2897 __kmp_sched_apply_mods_intkind( 2898 orig_kind, &(thread->th.th_current_task->td_icvs.sched.r_sched_type)); 2899 if (kind == kmp_sched_auto || chunk < 1) { 2900 // ignore parameter chunk for schedule auto 2901 thread->th.th_current_task->td_icvs.sched.chunk = KMP_DEFAULT_CHUNK; 2902 } else { 2903 thread->th.th_current_task->td_icvs.sched.chunk = chunk; 2904 } 2905 } 2906 2907 /* Gets def_sched_var ICV values */ 2908 void __kmp_get_schedule(int gtid, kmp_sched_t *kind, int *chunk) { 2909 kmp_info_t *thread; 2910 enum sched_type th_type; 2911 2912 KF_TRACE(10, ("__kmp_get_schedule: thread %d\n", gtid)); 2913 KMP_DEBUG_ASSERT(__kmp_init_serial); 2914 2915 thread = __kmp_threads[gtid]; 2916 2917 th_type = thread->th.th_current_task->td_icvs.sched.r_sched_type; 2918 switch (SCHEDULE_WITHOUT_MODIFIERS(th_type)) { 2919 case kmp_sch_static: 2920 case kmp_sch_static_greedy: 2921 case kmp_sch_static_balanced: 2922 *kind = kmp_sched_static; 2923 __kmp_sched_apply_mods_stdkind(kind, th_type); 2924 *chunk = 0; // chunk was not set, try to show this fact via zero value 2925 return; 2926 case kmp_sch_static_chunked: 2927 *kind = kmp_sched_static; 2928 break; 2929 case kmp_sch_dynamic_chunked: 2930 *kind = kmp_sched_dynamic; 2931 break; 2932 case kmp_sch_guided_chunked: 2933 case kmp_sch_guided_iterative_chunked: 2934 case kmp_sch_guided_analytical_chunked: 2935 *kind = kmp_sched_guided; 2936 break; 2937 case kmp_sch_auto: 2938 *kind = kmp_sched_auto; 2939 break; 2940 case kmp_sch_trapezoidal: 2941 *kind = kmp_sched_trapezoidal; 2942 break; 2943 #if KMP_STATIC_STEAL_ENABLED 2944 case kmp_sch_static_steal: 2945 *kind = kmp_sched_static_steal; 2946 break; 2947 #endif 2948 default: 2949 KMP_FATAL(UnknownSchedulingType, th_type); 2950 } 2951 2952 __kmp_sched_apply_mods_stdkind(kind, th_type); 2953 *chunk = thread->th.th_current_task->td_icvs.sched.chunk; 2954 } 2955 2956 int __kmp_get_ancestor_thread_num(int gtid, int level) { 2957 2958 int ii, dd; 2959 kmp_team_t *team; 2960 kmp_info_t *thr; 2961 2962 KF_TRACE(10, ("__kmp_get_ancestor_thread_num: thread %d %d\n", gtid, level)); 2963 KMP_DEBUG_ASSERT(__kmp_init_serial); 2964 2965 // validate level 2966 if (level == 0) 2967 return 0; 2968 if (level < 0) 2969 return -1; 2970 thr = __kmp_threads[gtid]; 2971 team = thr->th.th_team; 2972 ii = team->t.t_level; 2973 if (level > ii) 2974 return -1; 2975 2976 if (thr->th.th_teams_microtask) { 2977 // AC: we are in teams region where multiple nested teams have same level 2978 int tlevel = thr->th.th_teams_level; // the level of the teams construct 2979 if (level <= 2980 tlevel) { // otherwise usual algorithm works (will not touch the teams) 2981 KMP_DEBUG_ASSERT(ii >= tlevel); 2982 // AC: As we need to pass by the teams league, we need to artificially 2983 // increase ii 2984 if (ii == tlevel) { 2985 ii += 2; // three teams have same level 2986 } else { 2987 ii++; // two teams have same level 2988 } 2989 } 2990 } 2991 2992 if (ii == 
level) 2993 return __kmp_tid_from_gtid(gtid); 2994 2995 dd = team->t.t_serialized; 2996 level++; 2997 while (ii > level) { 2998 for (dd = team->t.t_serialized; (dd > 0) && (ii > level); dd--, ii--) { 2999 } 3000 if ((team->t.t_serialized) && (!dd)) { 3001 team = team->t.t_parent; 3002 continue; 3003 } 3004 if (ii > level) { 3005 team = team->t.t_parent; 3006 dd = team->t.t_serialized; 3007 ii--; 3008 } 3009 } 3010 3011 return (dd > 1) ? (0) : (team->t.t_master_tid); 3012 } 3013 3014 int __kmp_get_team_size(int gtid, int level) { 3015 3016 int ii, dd; 3017 kmp_team_t *team; 3018 kmp_info_t *thr; 3019 3020 KF_TRACE(10, ("__kmp_get_team_size: thread %d %d\n", gtid, level)); 3021 KMP_DEBUG_ASSERT(__kmp_init_serial); 3022 3023 // validate level 3024 if (level == 0) 3025 return 1; 3026 if (level < 0) 3027 return -1; 3028 thr = __kmp_threads[gtid]; 3029 team = thr->th.th_team; 3030 ii = team->t.t_level; 3031 if (level > ii) 3032 return -1; 3033 3034 if (thr->th.th_teams_microtask) { 3035 // AC: we are in teams region where multiple nested teams have same level 3036 int tlevel = thr->th.th_teams_level; // the level of the teams construct 3037 if (level <= 3038 tlevel) { // otherwise usual algorithm works (will not touch the teams) 3039 KMP_DEBUG_ASSERT(ii >= tlevel); 3040 // AC: As we need to pass by the teams league, we need to artificially 3041 // increase ii 3042 if (ii == tlevel) { 3043 ii += 2; // three teams have same level 3044 } else { 3045 ii++; // two teams have same level 3046 } 3047 } 3048 } 3049 3050 while (ii > level) { 3051 for (dd = team->t.t_serialized; (dd > 0) && (ii > level); dd--, ii--) { 3052 } 3053 if (team->t.t_serialized && (!dd)) { 3054 team = team->t.t_parent; 3055 continue; 3056 } 3057 if (ii > level) { 3058 team = team->t.t_parent; 3059 ii--; 3060 } 3061 } 3062 3063 return team->t.t_nproc; 3064 } 3065 3066 kmp_r_sched_t __kmp_get_schedule_global() { 3067 // This routine created because pairs (__kmp_sched, __kmp_chunk) and 3068 // (__kmp_static, __kmp_guided) may be changed by kmp_set_defaults 3069 // independently. So one can get the updated schedule here. 3070 3071 kmp_r_sched_t r_sched; 3072 3073 // create schedule from 4 globals: __kmp_sched, __kmp_chunk, __kmp_static, 3074 // __kmp_guided. __kmp_sched should keep original value, so that user can set 3075 // KMP_SCHEDULE multiple times, and thus have different run-time schedules in 3076 // different roots (even in OMP 2.5) 3077 enum sched_type s = SCHEDULE_WITHOUT_MODIFIERS(__kmp_sched); 3078 enum sched_type sched_modifiers = SCHEDULE_GET_MODIFIERS(__kmp_sched); 3079 if (s == kmp_sch_static) { 3080 // replace STATIC with more detailed schedule (balanced or greedy) 3081 r_sched.r_sched_type = __kmp_static; 3082 } else if (s == kmp_sch_guided_chunked) { 3083 // replace GUIDED with more detailed schedule (iterative or analytical) 3084 r_sched.r_sched_type = __kmp_guided; 3085 } else { // (STATIC_CHUNKED), or (DYNAMIC_CHUNKED), or other 3086 r_sched.r_sched_type = __kmp_sched; 3087 } 3088 SCHEDULE_SET_MODIFIERS(r_sched.r_sched_type, sched_modifiers); 3089 3090 if (__kmp_chunk < KMP_DEFAULT_CHUNK) { 3091 // __kmp_chunk may be wrong here (if it was not ever set) 3092 r_sched.chunk = KMP_DEFAULT_CHUNK; 3093 } else { 3094 r_sched.chunk = __kmp_chunk; 3095 } 3096 3097 return r_sched; 3098 } 3099 3100 /* Allocate (realloc == FALSE) * or reallocate (realloc == TRUE) 3101 at least argc number of *t_argv entries for the requested team. 
*/ 3102 static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team, int realloc) { 3103 3104 KMP_DEBUG_ASSERT(team); 3105 if (!realloc || argc > team->t.t_max_argc) { 3106 3107 KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: needed entries=%d, " 3108 "current entries=%d\n", 3109 team->t.t_id, argc, (realloc) ? team->t.t_max_argc : 0)); 3110 /* if previously allocated heap space for args, free them */ 3111 if (realloc && team->t.t_argv != &team->t.t_inline_argv[0]) 3112 __kmp_free((void *)team->t.t_argv); 3113 3114 if (argc <= KMP_INLINE_ARGV_ENTRIES) { 3115 /* use unused space in the cache line for arguments */ 3116 team->t.t_max_argc = KMP_INLINE_ARGV_ENTRIES; 3117 KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: inline allocate %d " 3118 "argv entries\n", 3119 team->t.t_id, team->t.t_max_argc)); 3120 team->t.t_argv = &team->t.t_inline_argv[0]; 3121 if (__kmp_storage_map) { 3122 __kmp_print_storage_map_gtid( 3123 -1, &team->t.t_inline_argv[0], 3124 &team->t.t_inline_argv[KMP_INLINE_ARGV_ENTRIES], 3125 (sizeof(void *) * KMP_INLINE_ARGV_ENTRIES), "team_%d.t_inline_argv", 3126 team->t.t_id); 3127 } 3128 } else { 3129 /* allocate space for arguments in the heap */ 3130 team->t.t_max_argc = (argc <= (KMP_MIN_MALLOC_ARGV_ENTRIES >> 1)) 3131 ? KMP_MIN_MALLOC_ARGV_ENTRIES 3132 : 2 * argc; 3133 KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: dynamic allocate %d " 3134 "argv entries\n", 3135 team->t.t_id, team->t.t_max_argc)); 3136 team->t.t_argv = 3137 (void **)__kmp_page_allocate(sizeof(void *) * team->t.t_max_argc); 3138 if (__kmp_storage_map) { 3139 __kmp_print_storage_map_gtid(-1, &team->t.t_argv[0], 3140 &team->t.t_argv[team->t.t_max_argc], 3141 sizeof(void *) * team->t.t_max_argc, 3142 "team_%d.t_argv", team->t.t_id); 3143 } 3144 } 3145 } 3146 } 3147 3148 static void __kmp_allocate_team_arrays(kmp_team_t *team, int max_nth) { 3149 int i; 3150 int num_disp_buff = max_nth > 1 ? 
__kmp_dispatch_num_buffers : 2; 3151 team->t.t_threads = 3152 (kmp_info_t **)__kmp_allocate(sizeof(kmp_info_t *) * max_nth); 3153 team->t.t_disp_buffer = (dispatch_shared_info_t *)__kmp_allocate( 3154 sizeof(dispatch_shared_info_t) * num_disp_buff); 3155 team->t.t_dispatch = 3156 (kmp_disp_t *)__kmp_allocate(sizeof(kmp_disp_t) * max_nth); 3157 team->t.t_implicit_task_taskdata = 3158 (kmp_taskdata_t *)__kmp_allocate(sizeof(kmp_taskdata_t) * max_nth); 3159 team->t.t_max_nproc = max_nth; 3160 3161 /* setup dispatch buffers */ 3162 for (i = 0; i < num_disp_buff; ++i) { 3163 team->t.t_disp_buffer[i].buffer_index = i; 3164 team->t.t_disp_buffer[i].doacross_buf_idx = i; 3165 } 3166 } 3167 3168 static void __kmp_free_team_arrays(kmp_team_t *team) { 3169 /* Note: this does not free the threads in t_threads (__kmp_free_threads) */ 3170 int i; 3171 for (i = 0; i < team->t.t_max_nproc; ++i) { 3172 if (team->t.t_dispatch[i].th_disp_buffer != NULL) { 3173 __kmp_free(team->t.t_dispatch[i].th_disp_buffer); 3174 team->t.t_dispatch[i].th_disp_buffer = NULL; 3175 } 3176 } 3177 #if KMP_USE_HIER_SCHED 3178 __kmp_dispatch_free_hierarchies(team); 3179 #endif 3180 __kmp_free(team->t.t_threads); 3181 __kmp_free(team->t.t_disp_buffer); 3182 __kmp_free(team->t.t_dispatch); 3183 __kmp_free(team->t.t_implicit_task_taskdata); 3184 team->t.t_threads = NULL; 3185 team->t.t_disp_buffer = NULL; 3186 team->t.t_dispatch = NULL; 3187 team->t.t_implicit_task_taskdata = 0; 3188 } 3189 3190 static void __kmp_reallocate_team_arrays(kmp_team_t *team, int max_nth) { 3191 kmp_info_t **oldThreads = team->t.t_threads; 3192 3193 __kmp_free(team->t.t_disp_buffer); 3194 __kmp_free(team->t.t_dispatch); 3195 __kmp_free(team->t.t_implicit_task_taskdata); 3196 __kmp_allocate_team_arrays(team, max_nth); 3197 3198 KMP_MEMCPY(team->t.t_threads, oldThreads, 3199 team->t.t_nproc * sizeof(kmp_info_t *)); 3200 3201 __kmp_free(oldThreads); 3202 } 3203 3204 static kmp_internal_control_t __kmp_get_global_icvs(void) { 3205 3206 kmp_r_sched_t r_sched = 3207 __kmp_get_schedule_global(); // get current state of scheduling globals 3208 3209 KMP_DEBUG_ASSERT(__kmp_nested_proc_bind.used > 0); 3210 3211 kmp_internal_control_t g_icvs = { 3212 0, // int serial_nesting_level; //corresponds to value of th_team_serialized 3213 (kmp_int8)__kmp_global.g.g_dynamic, // internal control for dynamic 3214 // adjustment of threads (per thread) 3215 (kmp_int8)__kmp_env_blocktime, // int bt_set; //internal control for 3216 // whether blocktime is explicitly set 3217 __kmp_dflt_blocktime, // int blocktime; //internal control for blocktime 3218 #if KMP_USE_MONITOR 3219 __kmp_bt_intervals, // int bt_intervals; //internal control for blocktime 3220 // intervals 3221 #endif 3222 __kmp_dflt_team_nth, // int nproc; //internal control for # of threads for 3223 // next parallel region (per thread) 3224 // (use a max ub on value if __kmp_parallel_initialize not called yet) 3225 __kmp_cg_max_nth, // int thread_limit; 3226 __kmp_dflt_max_active_levels, // int max_active_levels; //internal control 3227 // for max_active_levels 3228 r_sched, // kmp_r_sched_t sched; //internal control for runtime schedule 3229 // {sched,chunk} pair 3230 __kmp_nested_proc_bind.bind_types[0], 3231 __kmp_default_device, 3232 NULL // struct kmp_internal_control *next; 3233 }; 3234 3235 return g_icvs; 3236 } 3237 3238 static kmp_internal_control_t __kmp_get_x_global_icvs(const kmp_team_t *team) { 3239 3240 kmp_internal_control_t gx_icvs; 3241 gx_icvs.serial_nesting_level = 3242 0; // probably =team->t.t_serial 
like in save_inter_controls 3243 copy_icvs(&gx_icvs, &team->t.t_threads[0]->th.th_current_task->td_icvs); 3244 gx_icvs.next = NULL; 3245 3246 return gx_icvs; 3247 } 3248 3249 static void __kmp_initialize_root(kmp_root_t *root) { 3250 int f; 3251 kmp_team_t *root_team; 3252 kmp_team_t *hot_team; 3253 int hot_team_max_nth; 3254 kmp_r_sched_t r_sched = 3255 __kmp_get_schedule_global(); // get current state of scheduling globals 3256 kmp_internal_control_t r_icvs = __kmp_get_global_icvs(); 3257 KMP_DEBUG_ASSERT(root); 3258 KMP_ASSERT(!root->r.r_begin); 3259 3260 /* setup the root state structure */ 3261 __kmp_init_lock(&root->r.r_begin_lock); 3262 root->r.r_begin = FALSE; 3263 root->r.r_active = FALSE; 3264 root->r.r_in_parallel = 0; 3265 root->r.r_blocktime = __kmp_dflt_blocktime; 3266 #if KMP_AFFINITY_SUPPORTED 3267 root->r.r_affinity_assigned = FALSE; 3268 #endif 3269 3270 /* setup the root team for this task */ 3271 /* allocate the root team structure */ 3272 KF_TRACE(10, ("__kmp_initialize_root: before root_team\n")); 3273 3274 root_team = 3275 __kmp_allocate_team(root, 3276 1, // new_nproc 3277 1, // max_nproc 3278 #if OMPT_SUPPORT 3279 ompt_data_none, // root parallel id 3280 #endif 3281 __kmp_nested_proc_bind.bind_types[0], &r_icvs, 3282 0 // argc 3283 USE_NESTED_HOT_ARG(NULL) // primary thread is unknown 3284 ); 3285 #if USE_DEBUGGER 3286 // Non-NULL value should be assigned to make the debugger display the root 3287 // team. 3288 TCW_SYNC_PTR(root_team->t.t_pkfn, (microtask_t)(~0)); 3289 #endif 3290 3291 KF_TRACE(10, ("__kmp_initialize_root: after root_team = %p\n", root_team)); 3292 3293 root->r.r_root_team = root_team; 3294 root_team->t.t_control_stack_top = NULL; 3295 3296 /* initialize root team */ 3297 root_team->t.t_threads[0] = NULL; 3298 root_team->t.t_nproc = 1; 3299 root_team->t.t_serialized = 1; 3300 // TODO???: root_team->t.t_max_active_levels = __kmp_dflt_max_active_levels; 3301 root_team->t.t_sched.sched = r_sched.sched; 3302 KA_TRACE( 3303 20, 3304 ("__kmp_initialize_root: init root team %d arrived: join=%u, plain=%u\n", 3305 root_team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE)); 3306 3307 /* setup the hot team for this task */ 3308 /* allocate the hot team structure */ 3309 KF_TRACE(10, ("__kmp_initialize_root: before hot_team\n")); 3310 3311 hot_team = 3312 __kmp_allocate_team(root, 3313 1, // new_nproc 3314 __kmp_dflt_team_nth_ub * 2, // max_nproc 3315 #if OMPT_SUPPORT 3316 ompt_data_none, // root parallel id 3317 #endif 3318 __kmp_nested_proc_bind.bind_types[0], &r_icvs, 3319 0 // argc 3320 USE_NESTED_HOT_ARG(NULL) // primary thread is unknown 3321 ); 3322 KF_TRACE(10, ("__kmp_initialize_root: after hot_team = %p\n", hot_team)); 3323 3324 root->r.r_hot_team = hot_team; 3325 root_team->t.t_control_stack_top = NULL; 3326 3327 /* first-time initialization */ 3328 hot_team->t.t_parent = root_team; 3329 3330 /* initialize hot team */ 3331 hot_team_max_nth = hot_team->t.t_max_nproc; 3332 for (f = 0; f < hot_team_max_nth; ++f) { 3333 hot_team->t.t_threads[f] = NULL; 3334 } 3335 hot_team->t.t_nproc = 1; 3336 // TODO???: hot_team->t.t_max_active_levels = __kmp_dflt_max_active_levels; 3337 hot_team->t.t_sched.sched = r_sched.sched; 3338 hot_team->t.t_size_changed = 0; 3339 } 3340 3341 #ifdef KMP_DEBUG 3342 3343 typedef struct kmp_team_list_item { 3344 kmp_team_p const *entry; 3345 struct kmp_team_list_item *next; 3346 } kmp_team_list_item_t; 3347 typedef kmp_team_list_item_t *kmp_team_list_t; 3348 3349 static void __kmp_print_structure_team_accum( // Add team 
to list of teams. 3350 kmp_team_list_t list, // List of teams. 3351 kmp_team_p const *team // Team to add. 3352 ) { 3353 3354 // List must terminate with item where both entry and next are NULL. 3355 // Team is added to the list only once. 3356 // List is sorted in ascending order by team id. 3357 // Team id is *not* a key. 3358 3359 kmp_team_list_t l; 3360 3361 KMP_DEBUG_ASSERT(list != NULL); 3362 if (team == NULL) { 3363 return; 3364 } 3365 3366 __kmp_print_structure_team_accum(list, team->t.t_parent); 3367 __kmp_print_structure_team_accum(list, team->t.t_next_pool); 3368 3369 // Search list for the team. 3370 l = list; 3371 while (l->next != NULL && l->entry != team) { 3372 l = l->next; 3373 } 3374 if (l->next != NULL) { 3375 return; // Team has been added before, exit. 3376 } 3377 3378 // Team is not found. Search list again for insertion point. 3379 l = list; 3380 while (l->next != NULL && l->entry->t.t_id <= team->t.t_id) { 3381 l = l->next; 3382 } 3383 3384 // Insert team. 3385 { 3386 kmp_team_list_item_t *item = (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC( 3387 sizeof(kmp_team_list_item_t)); 3388 *item = *l; 3389 l->entry = team; 3390 l->next = item; 3391 } 3392 } 3393 3394 static void __kmp_print_structure_team(char const *title, kmp_team_p const *team 3395 3396 ) { 3397 __kmp_printf("%s", title); 3398 if (team != NULL) { 3399 __kmp_printf("%2x %p\n", team->t.t_id, team); 3400 } else { 3401 __kmp_printf(" - (nil)\n"); 3402 } 3403 } 3404 3405 static void __kmp_print_structure_thread(char const *title, 3406 kmp_info_p const *thread) { 3407 __kmp_printf("%s", title); 3408 if (thread != NULL) { 3409 __kmp_printf("%2d %p\n", thread->th.th_info.ds.ds_gtid, thread); 3410 } else { 3411 __kmp_printf(" - (nil)\n"); 3412 } 3413 } 3414 3415 void __kmp_print_structure(void) { 3416 3417 kmp_team_list_t list; 3418 3419 // Initialize list of teams. 3420 list = 3421 (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC(sizeof(kmp_team_list_item_t)); 3422 list->entry = NULL; 3423 list->next = NULL; 3424 3425 __kmp_printf("\n------------------------------\nGlobal Thread " 3426 "Table\n------------------------------\n"); 3427 { 3428 int gtid; 3429 for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) { 3430 __kmp_printf("%2d", gtid); 3431 if (__kmp_threads != NULL) { 3432 __kmp_printf(" %p", __kmp_threads[gtid]); 3433 } 3434 if (__kmp_root != NULL) { 3435 __kmp_printf(" %p", __kmp_root[gtid]); 3436 } 3437 __kmp_printf("\n"); 3438 } 3439 } 3440 3441 // Print out __kmp_threads array. 
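  // While walking the thread table below, each thread's current team and
  // serial team are also accumulated into the local team list, so every team
  // shows up exactly once in the "Teams" section printed later.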
3442 __kmp_printf("\n------------------------------\nThreads\n--------------------" 3443 "----------\n"); 3444 if (__kmp_threads != NULL) { 3445 int gtid; 3446 for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) { 3447 kmp_info_t const *thread = __kmp_threads[gtid]; 3448 if (thread != NULL) { 3449 __kmp_printf("GTID %2d %p:\n", gtid, thread); 3450 __kmp_printf(" Our Root: %p\n", thread->th.th_root); 3451 __kmp_print_structure_team(" Our Team: ", thread->th.th_team); 3452 __kmp_print_structure_team(" Serial Team: ", 3453 thread->th.th_serial_team); 3454 __kmp_printf(" Threads: %2d\n", thread->th.th_team_nproc); 3455 __kmp_print_structure_thread(" Primary: ", 3456 thread->th.th_team_master); 3457 __kmp_printf(" Serialized?: %2d\n", thread->th.th_team_serialized); 3458 __kmp_printf(" Set NProc: %2d\n", thread->th.th_set_nproc); 3459 __kmp_printf(" Set Proc Bind: %2d\n", thread->th.th_set_proc_bind); 3460 __kmp_print_structure_thread(" Next in pool: ", 3461 thread->th.th_next_pool); 3462 __kmp_printf("\n"); 3463 __kmp_print_structure_team_accum(list, thread->th.th_team); 3464 __kmp_print_structure_team_accum(list, thread->th.th_serial_team); 3465 } 3466 } 3467 } else { 3468 __kmp_printf("Threads array is not allocated.\n"); 3469 } 3470 3471 // Print out __kmp_root array. 3472 __kmp_printf("\n------------------------------\nUbers\n----------------------" 3473 "--------\n"); 3474 if (__kmp_root != NULL) { 3475 int gtid; 3476 for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) { 3477 kmp_root_t const *root = __kmp_root[gtid]; 3478 if (root != NULL) { 3479 __kmp_printf("GTID %2d %p:\n", gtid, root); 3480 __kmp_print_structure_team(" Root Team: ", root->r.r_root_team); 3481 __kmp_print_structure_team(" Hot Team: ", root->r.r_hot_team); 3482 __kmp_print_structure_thread(" Uber Thread: ", 3483 root->r.r_uber_thread); 3484 __kmp_printf(" Active?: %2d\n", root->r.r_active); 3485 __kmp_printf(" In Parallel: %2d\n", 3486 KMP_ATOMIC_LD_RLX(&root->r.r_in_parallel)); 3487 __kmp_printf("\n"); 3488 __kmp_print_structure_team_accum(list, root->r.r_root_team); 3489 __kmp_print_structure_team_accum(list, root->r.r_hot_team); 3490 } 3491 } 3492 } else { 3493 __kmp_printf("Ubers array is not allocated.\n"); 3494 } 3495 3496 __kmp_printf("\n------------------------------\nTeams\n----------------------" 3497 "--------\n"); 3498 while (list->next != NULL) { 3499 kmp_team_p const *team = list->entry; 3500 int i; 3501 __kmp_printf("Team %2x %p:\n", team->t.t_id, team); 3502 __kmp_print_structure_team(" Parent Team: ", team->t.t_parent); 3503 __kmp_printf(" Primary TID: %2d\n", team->t.t_master_tid); 3504 __kmp_printf(" Max threads: %2d\n", team->t.t_max_nproc); 3505 __kmp_printf(" Levels of serial: %2d\n", team->t.t_serialized); 3506 __kmp_printf(" Number threads: %2d\n", team->t.t_nproc); 3507 for (i = 0; i < team->t.t_nproc; ++i) { 3508 __kmp_printf(" Thread %2d: ", i); 3509 __kmp_print_structure_thread("", team->t.t_threads[i]); 3510 } 3511 __kmp_print_structure_team(" Next in pool: ", team->t.t_next_pool); 3512 __kmp_printf("\n"); 3513 list = list->next; 3514 } 3515 3516 // Print out __kmp_thread_pool and __kmp_team_pool. 3517 __kmp_printf("\n------------------------------\nPools\n----------------------" 3518 "--------\n"); 3519 __kmp_print_structure_thread("Thread pool: ", 3520 CCAST(kmp_info_t *, __kmp_thread_pool)); 3521 __kmp_print_structure_team("Team pool: ", 3522 CCAST(kmp_team_t *, __kmp_team_pool)); 3523 __kmp_printf("\n"); 3524 3525 // Free team list. 
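  // The accumulated list always ends with the sentinel item allocated at the
  // top of this function (entry == NULL, next == NULL), so the cleanup loop
  // below frees at least that node.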
3526 while (list != NULL) { 3527 kmp_team_list_item_t *item = list; 3528 list = list->next; 3529 KMP_INTERNAL_FREE(item); 3530 } 3531 } 3532 3533 #endif 3534 3535 //--------------------------------------------------------------------------- 3536 // Stuff for per-thread fast random number generator 3537 // Table of primes 3538 static const unsigned __kmp_primes[] = { 3539 0x9e3779b1, 0xffe6cc59, 0x2109f6dd, 0x43977ab5, 0xba5703f5, 0xb495a877, 3540 0xe1626741, 0x79695e6b, 0xbc98c09f, 0xd5bee2b3, 0x287488f9, 0x3af18231, 3541 0x9677cd4d, 0xbe3a6929, 0xadc6a877, 0xdcf0674b, 0xbe4d6fe9, 0x5f15e201, 3542 0x99afc3fd, 0xf3f16801, 0xe222cfff, 0x24ba5fdb, 0x0620452d, 0x79f149e3, 3543 0xc8b93f49, 0x972702cd, 0xb07dd827, 0x6c97d5ed, 0x085a3d61, 0x46eb5ea7, 3544 0x3d9910ed, 0x2e687b5b, 0x29609227, 0x6eb081f1, 0x0954c4e1, 0x9d114db9, 3545 0x542acfa9, 0xb3e6bd7b, 0x0742d917, 0xe9f3ffa7, 0x54581edb, 0xf2480f45, 3546 0x0bb9288f, 0xef1affc7, 0x85fa0ca7, 0x3ccc14db, 0xe6baf34b, 0x343377f7, 3547 0x5ca19031, 0xe6d9293b, 0xf0a9f391, 0x5d2e980b, 0xfc411073, 0xc3749363, 3548 0xb892d829, 0x3549366b, 0x629750ad, 0xb98294e5, 0x892d9483, 0xc235baf3, 3549 0x3d2402a3, 0x6bdef3c9, 0xbec333cd, 0x40c9520f}; 3550 3551 //--------------------------------------------------------------------------- 3552 // __kmp_get_random: Get a random number using a linear congruential method. 3553 unsigned short __kmp_get_random(kmp_info_t *thread) { 3554 unsigned x = thread->th.th_x; 3555 unsigned short r = (unsigned short)(x >> 16); 3556 3557 thread->th.th_x = x * thread->th.th_a + 1; 3558 3559 KA_TRACE(30, ("__kmp_get_random: THREAD: %d, RETURN: %u\n", 3560 thread->th.th_info.ds.ds_tid, r)); 3561 3562 return r; 3563 } 3564 //-------------------------------------------------------- 3565 // __kmp_init_random: Initialize a random number generator 3566 void __kmp_init_random(kmp_info_t *thread) { 3567 unsigned seed = thread->th.th_info.ds.ds_tid; 3568 3569 thread->th.th_a = 3570 __kmp_primes[seed % (sizeof(__kmp_primes) / sizeof(__kmp_primes[0]))]; 3571 thread->th.th_x = (seed + 1) * thread->th.th_a + 1; 3572 KA_TRACE(30, 3573 ("__kmp_init_random: THREAD: %u; A: %u\n", seed, thread->th.th_a)); 3574 } 3575 3576 #if KMP_OS_WINDOWS 3577 /* reclaim array entries for root threads that are already dead, returns number 3578 * reclaimed */ 3579 static int __kmp_reclaim_dead_roots(void) { 3580 int i, r = 0; 3581 3582 for (i = 0; i < __kmp_threads_capacity; ++i) { 3583 if (KMP_UBER_GTID(i) && 3584 !__kmp_still_running((kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[i])) && 3585 !__kmp_root[i] 3586 ->r.r_active) { // AC: reclaim only roots died in non-active state 3587 r += __kmp_unregister_root_other_thread(i); 3588 } 3589 } 3590 return r; 3591 } 3592 #endif 3593 3594 /* This function attempts to create free entries in __kmp_threads and 3595 __kmp_root, and returns the number of free entries generated. 3596 3597 For Windows* OS static library, the first mechanism used is to reclaim array 3598 entries for root threads that are already dead. 3599 3600 On all platforms, expansion is attempted on the arrays __kmp_threads_ and 3601 __kmp_root, with appropriate update to __kmp_threads_capacity. Array 3602 capacity is increased by doubling with clipping to __kmp_tp_capacity, if 3603 threadprivate cache array has been created. Synchronization with 3604 __kmpc_threadprivate_cached is done using __kmp_tp_cached_lock. 
3605 3606 After any dead root reclamation, if the clipping value allows array expansion 3607 to result in the generation of a total of nNeed free slots, the function does 3608 that expansion. If not, nothing is done beyond the possible initial root 3609 thread reclamation. 3610 3611 If any argument is negative, the behavior is undefined. */ 3612 static int __kmp_expand_threads(int nNeed) { 3613 int added = 0; 3614 int minimumRequiredCapacity; 3615 int newCapacity; 3616 kmp_info_t **newThreads; 3617 kmp_root_t **newRoot; 3618 3619 // All calls to __kmp_expand_threads should be under __kmp_forkjoin_lock, so 3620 // resizing __kmp_threads does not need additional protection if foreign 3621 // threads are present 3622 3623 #if KMP_OS_WINDOWS && !KMP_DYNAMIC_LIB 3624 /* only for Windows static library */ 3625 /* reclaim array entries for root threads that are already dead */ 3626 added = __kmp_reclaim_dead_roots(); 3627 3628 if (nNeed) { 3629 nNeed -= added; 3630 if (nNeed < 0) 3631 nNeed = 0; 3632 } 3633 #endif 3634 if (nNeed <= 0) 3635 return added; 3636 3637 // Note that __kmp_threads_capacity is not bounded by __kmp_max_nth. If 3638 // __kmp_max_nth is set to some value less than __kmp_sys_max_nth by the 3639 // user via KMP_DEVICE_THREAD_LIMIT, then __kmp_threads_capacity may become 3640 // > __kmp_max_nth in one of two ways: 3641 // 3642 // 1) The initialization thread (gtid = 0) exits. __kmp_threads[0] 3643 // may not be reused by another thread, so we may need to increase 3644 // __kmp_threads_capacity to __kmp_max_nth + 1. 3645 // 3646 // 2) New foreign root(s) are encountered. We always register new foreign 3647 // roots. This may cause a smaller # of threads to be allocated at 3648 // subsequent parallel regions, but the worker threads hang around (and 3649 // eventually go to sleep) and need slots in the __kmp_threads[] array. 3650 // 3651 // Anyway, that is the reason for moving the check to see if 3652 // __kmp_max_nth was exceeded into __kmp_reserve_threads() 3653 // instead of having it performed here. -BB 3654 3655 KMP_DEBUG_ASSERT(__kmp_sys_max_nth >= __kmp_threads_capacity); 3656 3657 /* compute expansion headroom to check if we can expand */ 3658 if (__kmp_sys_max_nth - __kmp_threads_capacity < nNeed) { 3659 /* possible expansion too small -- give up */ 3660 return added; 3661 } 3662 minimumRequiredCapacity = __kmp_threads_capacity + nNeed; 3663 3664 newCapacity = __kmp_threads_capacity; 3665 do { 3666 newCapacity = newCapacity <= (__kmp_sys_max_nth >> 1) ? (newCapacity << 1) 3667 : __kmp_sys_max_nth; 3668 } while (newCapacity < minimumRequiredCapacity); 3669 newThreads = (kmp_info_t **)__kmp_allocate( 3670 (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * newCapacity + CACHE_LINE); 3671 newRoot = 3672 (kmp_root_t **)((char *)newThreads + sizeof(kmp_info_t *) * newCapacity); 3673 KMP_MEMCPY(newThreads, __kmp_threads, 3674 __kmp_threads_capacity * sizeof(kmp_info_t *)); 3675 KMP_MEMCPY(newRoot, __kmp_root, 3676 __kmp_threads_capacity * sizeof(kmp_root_t *)); 3677 // Put old __kmp_threads array on a list. Any ongoing references to the old 3678 // list will be valid. This list is cleaned up at library shutdown. 
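  // (A sketch of the retirement scheme implemented below: the current
  // __kmp_threads pointer is chained onto the head of __kmp_old_threads_list
  // before the new, larger arrays are published, so a reader that still holds
  // the old pointer keeps dereferencing live memory until shutdown.)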
3679 kmp_old_threads_list_t *node = 3680 (kmp_old_threads_list_t *)__kmp_allocate(sizeof(kmp_old_threads_list_t)); 3681 node->threads = __kmp_threads; 3682 node->next = __kmp_old_threads_list; 3683 __kmp_old_threads_list = node; 3684 3685 *(kmp_info_t * *volatile *)&__kmp_threads = newThreads; 3686 *(kmp_root_t * *volatile *)&__kmp_root = newRoot; 3687 added += newCapacity - __kmp_threads_capacity; 3688 *(volatile int *)&__kmp_threads_capacity = newCapacity; 3689 3690 if (newCapacity > __kmp_tp_capacity) { 3691 __kmp_acquire_bootstrap_lock(&__kmp_tp_cached_lock); 3692 if (__kmp_tp_cached && newCapacity > __kmp_tp_capacity) { 3693 __kmp_threadprivate_resize_cache(newCapacity); 3694 } else { // increase __kmp_tp_capacity to correspond with kmp_threads size 3695 *(volatile int *)&__kmp_tp_capacity = newCapacity; 3696 } 3697 __kmp_release_bootstrap_lock(&__kmp_tp_cached_lock); 3698 } 3699 3700 return added; 3701 } 3702 3703 /* Register the current thread as a root thread and obtain our gtid. We must 3704 have the __kmp_initz_lock held at this point. Argument TRUE only if are the 3705 thread that calls from __kmp_do_serial_initialize() */ 3706 int __kmp_register_root(int initial_thread) { 3707 kmp_info_t *root_thread; 3708 kmp_root_t *root; 3709 int gtid; 3710 int capacity; 3711 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock); 3712 KA_TRACE(20, ("__kmp_register_root: entered\n")); 3713 KMP_MB(); 3714 3715 /* 2007-03-02: 3716 If initial thread did not invoke OpenMP RTL yet, and this thread is not an 3717 initial one, "__kmp_all_nth >= __kmp_threads_capacity" condition does not 3718 work as expected -- it may return false (that means there is at least one 3719 empty slot in __kmp_threads array), but it is possible the only free slot 3720 is #0, which is reserved for initial thread and so cannot be used for this 3721 one. Following code workarounds this bug. 3722 3723 However, right solution seems to be not reserving slot #0 for initial 3724 thread because: 3725 (1) there is no magic in slot #0, 3726 (2) we cannot detect initial thread reliably (the first thread which does 3727 serial initialization may be not a real initial thread). 3728 */ 3729 capacity = __kmp_threads_capacity; 3730 if (!initial_thread && TCR_PTR(__kmp_threads[0]) == NULL) { 3731 --capacity; 3732 } 3733 3734 // If it is not for initializing the hidden helper team, we need to take 3735 // __kmp_hidden_helper_threads_num out of the capacity because it is included 3736 // in __kmp_threads_capacity. 3737 if (__kmp_enable_hidden_helper && !TCR_4(__kmp_init_hidden_helper_threads)) { 3738 capacity -= __kmp_hidden_helper_threads_num; 3739 } 3740 3741 /* see if there are too many threads */ 3742 if (__kmp_all_nth >= capacity && !__kmp_expand_threads(1)) { 3743 if (__kmp_tp_cached) { 3744 __kmp_fatal(KMP_MSG(CantRegisterNewThread), 3745 KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity), 3746 KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null); 3747 } else { 3748 __kmp_fatal(KMP_MSG(CantRegisterNewThread), KMP_HNT(SystemLimitOnThreads), 3749 __kmp_msg_null); 3750 } 3751 } 3752 3753 // When hidden helper task is enabled, __kmp_threads is organized as follows: 3754 // 0: initial thread, also a regular OpenMP thread. 3755 // [1, __kmp_hidden_helper_threads_num]: slots for hidden helper threads. 3756 // [__kmp_hidden_helper_threads_num + 1, __kmp_threads_capacity): slots for 3757 // regular OpenMP threads. 3758 if (TCR_4(__kmp_init_hidden_helper_threads)) { 3759 // Find an available thread slot for hidden helper thread. 
Slots for hidden 3760 // helper threads start from 1 to __kmp_hidden_helper_threads_num. 3761 for (gtid = 1; TCR_PTR(__kmp_threads[gtid]) != NULL && 3762 gtid <= __kmp_hidden_helper_threads_num; 3763 gtid++) 3764 ; 3765 KMP_ASSERT(gtid <= __kmp_hidden_helper_threads_num); 3766 KA_TRACE(1, ("__kmp_register_root: found slot in threads array for " 3767 "hidden helper thread: T#%d\n", 3768 gtid)); 3769 } else { 3770 /* find an available thread slot */ 3771 // Don't reassign the zero slot since we need that to only be used by 3772 // initial thread. Slots for hidden helper threads should also be skipped. 3773 if (initial_thread && TCR_PTR(__kmp_threads[0]) == NULL) { 3774 gtid = 0; 3775 } else { 3776 for (gtid = __kmp_hidden_helper_threads_num + 1; 3777 TCR_PTR(__kmp_threads[gtid]) != NULL; gtid++) 3778 ; 3779 } 3780 KA_TRACE( 3781 1, ("__kmp_register_root: found slot in threads array: T#%d\n", gtid)); 3782 KMP_ASSERT(gtid < __kmp_threads_capacity); 3783 } 3784 3785 /* update global accounting */ 3786 __kmp_all_nth++; 3787 TCW_4(__kmp_nth, __kmp_nth + 1); 3788 3789 // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search) for low 3790 // numbers of procs, and method #2 (keyed API call) for higher numbers. 3791 if (__kmp_adjust_gtid_mode) { 3792 if (__kmp_all_nth >= __kmp_tls_gtid_min) { 3793 if (TCR_4(__kmp_gtid_mode) != 2) { 3794 TCW_4(__kmp_gtid_mode, 2); 3795 } 3796 } else { 3797 if (TCR_4(__kmp_gtid_mode) != 1) { 3798 TCW_4(__kmp_gtid_mode, 1); 3799 } 3800 } 3801 } 3802 3803 #ifdef KMP_ADJUST_BLOCKTIME 3804 /* Adjust blocktime to zero if necessary */ 3805 /* Middle initialization might not have occurred yet */ 3806 if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) { 3807 if (__kmp_nth > __kmp_avail_proc) { 3808 __kmp_zero_bt = TRUE; 3809 } 3810 } 3811 #endif /* KMP_ADJUST_BLOCKTIME */ 3812 3813 /* setup this new hierarchy */ 3814 if (!(root = __kmp_root[gtid])) { 3815 root = __kmp_root[gtid] = (kmp_root_t *)__kmp_allocate(sizeof(kmp_root_t)); 3816 KMP_DEBUG_ASSERT(!root->r.r_root_team); 3817 } 3818 3819 #if KMP_STATS_ENABLED 3820 // Initialize stats as soon as possible (right after gtid assignment). 
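  // (Each root gets its own node in __kmp_stats_list; the thread starts life
  // in the SERIAL_REGION state so time spent before the first parallel region
  // is attributed to the serial partition.)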
3821 __kmp_stats_thread_ptr = __kmp_stats_list->push_back(gtid); 3822 __kmp_stats_thread_ptr->startLife(); 3823 KMP_SET_THREAD_STATE(SERIAL_REGION); 3824 KMP_INIT_PARTITIONED_TIMERS(OMP_serial); 3825 #endif 3826 __kmp_initialize_root(root); 3827 3828 /* setup new root thread structure */ 3829 if (root->r.r_uber_thread) { 3830 root_thread = root->r.r_uber_thread; 3831 } else { 3832 root_thread = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t)); 3833 if (__kmp_storage_map) { 3834 __kmp_print_thread_storage_map(root_thread, gtid); 3835 } 3836 root_thread->th.th_info.ds.ds_gtid = gtid; 3837 #if OMPT_SUPPORT 3838 root_thread->th.ompt_thread_info.thread_data = ompt_data_none; 3839 #endif 3840 root_thread->th.th_root = root; 3841 if (__kmp_env_consistency_check) { 3842 root_thread->th.th_cons = __kmp_allocate_cons_stack(gtid); 3843 } 3844 #if USE_FAST_MEMORY 3845 __kmp_initialize_fast_memory(root_thread); 3846 #endif /* USE_FAST_MEMORY */ 3847 3848 #if KMP_USE_BGET 3849 KMP_DEBUG_ASSERT(root_thread->th.th_local.bget_data == NULL); 3850 __kmp_initialize_bget(root_thread); 3851 #endif 3852 __kmp_init_random(root_thread); // Initialize random number generator 3853 } 3854 3855 /* setup the serial team held in reserve by the root thread */ 3856 if (!root_thread->th.th_serial_team) { 3857 kmp_internal_control_t r_icvs = __kmp_get_global_icvs(); 3858 KF_TRACE(10, ("__kmp_register_root: before serial_team\n")); 3859 root_thread->th.th_serial_team = __kmp_allocate_team( 3860 root, 1, 1, 3861 #if OMPT_SUPPORT 3862 ompt_data_none, // root parallel id 3863 #endif 3864 proc_bind_default, &r_icvs, 0 USE_NESTED_HOT_ARG(NULL)); 3865 } 3866 KMP_ASSERT(root_thread->th.th_serial_team); 3867 KF_TRACE(10, ("__kmp_register_root: after serial_team = %p\n", 3868 root_thread->th.th_serial_team)); 3869 3870 /* drop root_thread into place */ 3871 TCW_SYNC_PTR(__kmp_threads[gtid], root_thread); 3872 3873 root->r.r_root_team->t.t_threads[0] = root_thread; 3874 root->r.r_hot_team->t.t_threads[0] = root_thread; 3875 root_thread->th.th_serial_team->t.t_threads[0] = root_thread; 3876 // AC: the team created in reserve, not for execution (it is unused for now). 3877 root_thread->th.th_serial_team->t.t_serialized = 0; 3878 root->r.r_uber_thread = root_thread; 3879 3880 /* initialize the thread, get it ready to go */ 3881 __kmp_initialize_info(root_thread, root->r.r_root_team, 0, gtid); 3882 TCW_4(__kmp_init_gtid, TRUE); 3883 3884 /* prepare the primary thread for get_gtid() */ 3885 __kmp_gtid_set_specific(gtid); 3886 3887 #if USE_ITT_BUILD 3888 __kmp_itt_thread_name(gtid); 3889 #endif /* USE_ITT_BUILD */ 3890 3891 #ifdef KMP_TDATA_GTID 3892 __kmp_gtid = gtid; 3893 #endif 3894 __kmp_create_worker(gtid, root_thread, __kmp_stksize); 3895 KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() == gtid); 3896 3897 KA_TRACE(20, ("__kmp_register_root: T#%d init T#%d(%d:%d) arrived: join=%u, " 3898 "plain=%u\n", 3899 gtid, __kmp_gtid_from_tid(0, root->r.r_hot_team), 3900 root->r.r_hot_team->t.t_id, 0, KMP_INIT_BARRIER_STATE, 3901 KMP_INIT_BARRIER_STATE)); 3902 { // Initialize barrier data. 
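    // Every barrier type starts from KMP_INIT_BARRIER_STATE, so the root's
    // per-thread counters match the hot team's initial t_bar state; the
    // assert after this block checks exactly that for the fork/join barrier.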
    int b;
    for (b = 0; b < bs_last_barrier; ++b) {
      root_thread->th.th_bar[b].bb.b_arrived = KMP_INIT_BARRIER_STATE;
#if USE_DEBUGGER
      root_thread->th.th_bar[b].bb.b_worker_arrived = 0;
#endif
    }
  }
  KMP_DEBUG_ASSERT(root->r.r_hot_team->t.t_bar[bs_forkjoin_barrier].b_arrived ==
                   KMP_INIT_BARRIER_STATE);

#if KMP_AFFINITY_SUPPORTED
  root_thread->th.th_current_place = KMP_PLACE_UNDEFINED;
  root_thread->th.th_new_place = KMP_PLACE_UNDEFINED;
  root_thread->th.th_first_place = KMP_PLACE_UNDEFINED;
  root_thread->th.th_last_place = KMP_PLACE_UNDEFINED;
#endif /* KMP_AFFINITY_SUPPORTED */
  root_thread->th.th_def_allocator = __kmp_def_allocator;
  root_thread->th.th_prev_level = 0;
  root_thread->th.th_prev_num_threads = 1;

  kmp_cg_root_t *tmp = (kmp_cg_root_t *)__kmp_allocate(sizeof(kmp_cg_root_t));
  tmp->cg_root = root_thread;
  tmp->cg_thread_limit = __kmp_cg_max_nth;
  tmp->cg_nthreads = 1;
  KA_TRACE(100, ("__kmp_register_root: Thread %p created node %p with"
                 " cg_nthreads init to 1\n",
                 root_thread, tmp));
  tmp->up = NULL;
  root_thread->th.th_cg_roots = tmp;

  __kmp_root_counter++;

#if OMPT_SUPPORT
  if (!initial_thread && ompt_enabled.enabled) {

    kmp_info_t *root_thread = ompt_get_thread();

    ompt_set_thread_state(root_thread, ompt_state_overhead);

    if (ompt_enabled.ompt_callback_thread_begin) {
      ompt_callbacks.ompt_callback(ompt_callback_thread_begin)(
          ompt_thread_initial, __ompt_get_thread_data_internal());
    }
    ompt_data_t *task_data;
    ompt_data_t *parallel_data;
    __ompt_get_task_info_internal(0, NULL, &task_data, NULL, &parallel_data,
                                  NULL);
    if (ompt_enabled.ompt_callback_implicit_task) {
      ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
          ompt_scope_begin, parallel_data, task_data, 1, 1, ompt_task_initial);
    }

    ompt_set_thread_state(root_thread, ompt_state_work_serial);
  }
#endif
#if OMPD_SUPPORT
  if (ompd_state & OMPD_ENABLE_BP)
    ompd_bp_thread_begin();
#endif

  KMP_MB();
  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);

  return gtid;
}

#if KMP_NESTED_HOT_TEAMS
static int __kmp_free_hot_teams(kmp_root_t *root, kmp_info_t *thr, int level,
                                const int max_level) {
  int i, n, nth;
  kmp_hot_team_ptr_t *hot_teams = thr->th.th_hot_teams;
  if (!hot_teams || !hot_teams[level].hot_team) {
    return 0;
  }
  KMP_DEBUG_ASSERT(level < max_level);
  kmp_team_t *team = hot_teams[level].hot_team;
  nth = hot_teams[level].hot_team_nth;
  n = nth - 1; // primary thread is not freed
  if (level < max_level - 1) {
    for (i = 0; i < nth; ++i) {
      kmp_info_t *th = team->t.t_threads[i];
      n += __kmp_free_hot_teams(root, th, level + 1, max_level);
      if (i > 0 && th->th.th_hot_teams) {
        __kmp_free(th->th.th_hot_teams);
        th->th.th_hot_teams = NULL;
      }
    }
  }
  __kmp_free_team(root, team, NULL);
  return n;
}
#endif

// Resets a root thread and clears its root and hot teams.
// Returns the number of __kmp_threads entries directly and indirectly freed.
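// Called with __kmp_forkjoin_lock held (see the callers below); the root must
// already be inactive, which the first assert verifies.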
static int __kmp_reset_root(int gtid, kmp_root_t *root) {
  kmp_team_t *root_team = root->r.r_root_team;
  kmp_team_t *hot_team = root->r.r_hot_team;
  int n = hot_team->t.t_nproc;
  int i;

  KMP_DEBUG_ASSERT(!root->r.r_active);

  root->r.r_root_team = NULL;
  root->r.r_hot_team = NULL;
  // __kmp_free_team() does not free hot teams, so we have to clear r_hot_team
  // before call to __kmp_free_team().
  __kmp_free_team(root, root_team USE_NESTED_HOT_ARG(NULL));
#if KMP_NESTED_HOT_TEAMS
  if (__kmp_hot_teams_max_level >
      0) { // need to free nested hot teams and their threads if any
    for (i = 0; i < hot_team->t.t_nproc; ++i) {
      kmp_info_t *th = hot_team->t.t_threads[i];
      if (__kmp_hot_teams_max_level > 1) {
        n += __kmp_free_hot_teams(root, th, 1, __kmp_hot_teams_max_level);
      }
      if (th->th.th_hot_teams) {
        __kmp_free(th->th.th_hot_teams);
        th->th.th_hot_teams = NULL;
      }
    }
  }
#endif
  __kmp_free_team(root, hot_team USE_NESTED_HOT_ARG(NULL));

  // Before we can reap the thread, we need to make certain that all other
  // threads in the teams that had this root as ancestor have stopped trying to
  // steal tasks.
  if (__kmp_tasking_mode != tskm_immediate_exec) {
    __kmp_wait_to_unref_task_teams();
  }

#if KMP_OS_WINDOWS
  /* Close Handle of root duplicated in __kmp_create_worker (tr #62919) */
  KA_TRACE(
      10, ("__kmp_reset_root: free handle, th = %p, handle = %" KMP_UINTPTR_SPEC
           "\n",
           (LPVOID) & (root->r.r_uber_thread->th),
           root->r.r_uber_thread->th.th_info.ds.ds_thread));
  __kmp_free_handle(root->r.r_uber_thread->th.th_info.ds.ds_thread);
#endif /* KMP_OS_WINDOWS */

#if OMPD_SUPPORT
  if (ompd_state & OMPD_ENABLE_BP)
    ompd_bp_thread_end();
#endif

#if OMPT_SUPPORT
  ompt_data_t *task_data;
  ompt_data_t *parallel_data;
  __ompt_get_task_info_internal(0, NULL, &task_data, NULL, &parallel_data,
                                NULL);
  if (ompt_enabled.ompt_callback_implicit_task) {
    ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
        ompt_scope_end, parallel_data, task_data, 0, 1, ompt_task_initial);
  }
  if (ompt_enabled.ompt_callback_thread_end) {
    ompt_callbacks.ompt_callback(ompt_callback_thread_end)(
        &(root->r.r_uber_thread->th.ompt_thread_info.thread_data));
  }
#endif

  TCW_4(__kmp_nth,
        __kmp_nth - 1); // __kmp_reap_thread will decrement __kmp_all_nth.
  i = root->r.r_uber_thread->th.th_cg_roots->cg_nthreads--;
  KA_TRACE(100, ("__kmp_reset_root: Thread %p decrement cg_nthreads on node %p"
                 " to %d\n",
                 root->r.r_uber_thread, root->r.r_uber_thread->th.th_cg_roots,
                 root->r.r_uber_thread->th.th_cg_roots->cg_nthreads));
  if (i == 1) {
    // need to free contention group structure
    KMP_DEBUG_ASSERT(root->r.r_uber_thread ==
                     root->r.r_uber_thread->th.th_cg_roots->cg_root);
    KMP_DEBUG_ASSERT(root->r.r_uber_thread->th.th_cg_roots->up == NULL);
    __kmp_free(root->r.r_uber_thread->th.th_cg_roots);
    root->r.r_uber_thread->th.th_cg_roots = NULL;
  }
  __kmp_reap_thread(root->r.r_uber_thread, 1);

  // We cannot put the root thread into __kmp_thread_pool, so we have to reap
  // it instead of freeing it.
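  // (The uber thread is the user's own OS thread, not a worker spawned by the
  // runtime, so it cannot be parked in __kmp_thread_pool for reuse; reaping
  // releases only the runtime-side bookkeeping.)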
4085 root->r.r_uber_thread = NULL; 4086 /* mark root as no longer in use */ 4087 root->r.r_begin = FALSE; 4088 4089 return n; 4090 } 4091 4092 void __kmp_unregister_root_current_thread(int gtid) { 4093 KA_TRACE(1, ("__kmp_unregister_root_current_thread: enter T#%d\n", gtid)); 4094 /* this lock should be ok, since unregister_root_current_thread is never 4095 called during an abort, only during a normal close. furthermore, if you 4096 have the forkjoin lock, you should never try to get the initz lock */ 4097 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock); 4098 if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) { 4099 KC_TRACE(10, ("__kmp_unregister_root_current_thread: already finished, " 4100 "exiting T#%d\n", 4101 gtid)); 4102 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); 4103 return; 4104 } 4105 kmp_root_t *root = __kmp_root[gtid]; 4106 4107 KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]); 4108 KMP_ASSERT(KMP_UBER_GTID(gtid)); 4109 KMP_ASSERT(root == __kmp_threads[gtid]->th.th_root); 4110 KMP_ASSERT(root->r.r_active == FALSE); 4111 4112 KMP_MB(); 4113 4114 kmp_info_t *thread = __kmp_threads[gtid]; 4115 kmp_team_t *team = thread->th.th_team; 4116 kmp_task_team_t *task_team = thread->th.th_task_team; 4117 4118 // we need to wait for the proxy tasks before finishing the thread 4119 if (task_team != NULL && (task_team->tt.tt_found_proxy_tasks || 4120 task_team->tt.tt_hidden_helper_task_encountered)) { 4121 #if OMPT_SUPPORT 4122 // the runtime is shutting down so we won't report any events 4123 thread->th.ompt_thread_info.state = ompt_state_undefined; 4124 #endif 4125 __kmp_task_team_wait(thread, team USE_ITT_BUILD_ARG(NULL)); 4126 } 4127 4128 __kmp_reset_root(gtid, root); 4129 4130 KMP_MB(); 4131 KC_TRACE(10, 4132 ("__kmp_unregister_root_current_thread: T#%d unregistered\n", gtid)); 4133 4134 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); 4135 } 4136 4137 #if KMP_OS_WINDOWS 4138 /* __kmp_forkjoin_lock must be already held 4139 Unregisters a root thread that is not the current thread. Returns the number 4140 of __kmp_threads entries freed as a result. 
*/ 4141 static int __kmp_unregister_root_other_thread(int gtid) { 4142 kmp_root_t *root = __kmp_root[gtid]; 4143 int r; 4144 4145 KA_TRACE(1, ("__kmp_unregister_root_other_thread: enter T#%d\n", gtid)); 4146 KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]); 4147 KMP_ASSERT(KMP_UBER_GTID(gtid)); 4148 KMP_ASSERT(root == __kmp_threads[gtid]->th.th_root); 4149 KMP_ASSERT(root->r.r_active == FALSE); 4150 4151 r = __kmp_reset_root(gtid, root); 4152 KC_TRACE(10, 4153 ("__kmp_unregister_root_other_thread: T#%d unregistered\n", gtid)); 4154 return r; 4155 } 4156 #endif 4157 4158 #if KMP_DEBUG 4159 void __kmp_task_info() { 4160 4161 kmp_int32 gtid = __kmp_entry_gtid(); 4162 kmp_int32 tid = __kmp_tid_from_gtid(gtid); 4163 kmp_info_t *this_thr = __kmp_threads[gtid]; 4164 kmp_team_t *steam = this_thr->th.th_serial_team; 4165 kmp_team_t *team = this_thr->th.th_team; 4166 4167 __kmp_printf( 4168 "__kmp_task_info: gtid=%d tid=%d t_thread=%p team=%p steam=%p curtask=%p " 4169 "ptask=%p\n", 4170 gtid, tid, this_thr, team, steam, this_thr->th.th_current_task, 4171 team->t.t_implicit_task_taskdata[tid].td_parent); 4172 } 4173 #endif // KMP_DEBUG 4174 4175 /* TODO optimize with one big memclr, take out what isn't needed, split 4176 responsibility to workers as much as possible, and delay initialization of 4177 features as much as possible */ 4178 static void __kmp_initialize_info(kmp_info_t *this_thr, kmp_team_t *team, 4179 int tid, int gtid) { 4180 /* this_thr->th.th_info.ds.ds_gtid is setup in 4181 kmp_allocate_thread/create_worker. 4182 this_thr->th.th_serial_team is setup in __kmp_allocate_thread */ 4183 KMP_DEBUG_ASSERT(this_thr != NULL); 4184 KMP_DEBUG_ASSERT(this_thr->th.th_serial_team); 4185 KMP_DEBUG_ASSERT(team); 4186 KMP_DEBUG_ASSERT(team->t.t_threads); 4187 KMP_DEBUG_ASSERT(team->t.t_dispatch); 4188 kmp_info_t *master = team->t.t_threads[0]; 4189 KMP_DEBUG_ASSERT(master); 4190 KMP_DEBUG_ASSERT(master->th.th_root); 4191 4192 KMP_MB(); 4193 4194 TCW_SYNC_PTR(this_thr->th.th_team, team); 4195 4196 this_thr->th.th_info.ds.ds_tid = tid; 4197 this_thr->th.th_set_nproc = 0; 4198 if (__kmp_tasking_mode != tskm_immediate_exec) 4199 // When tasking is possible, threads are not safe to reap until they are 4200 // done tasking; this will be set when tasking code is exited in wait 4201 this_thr->th.th_reap_state = KMP_NOT_SAFE_TO_REAP; 4202 else // no tasking --> always safe to reap 4203 this_thr->th.th_reap_state = KMP_SAFE_TO_REAP; 4204 this_thr->th.th_set_proc_bind = proc_bind_default; 4205 #if KMP_AFFINITY_SUPPORTED 4206 this_thr->th.th_new_place = this_thr->th.th_current_place; 4207 #endif 4208 this_thr->th.th_root = master->th.th_root; 4209 4210 /* setup the thread's cache of the team structure */ 4211 this_thr->th.th_team_nproc = team->t.t_nproc; 4212 this_thr->th.th_team_master = master; 4213 this_thr->th.th_team_serialized = team->t.t_serialized; 4214 4215 KMP_DEBUG_ASSERT(team->t.t_implicit_task_taskdata); 4216 4217 KF_TRACE(10, ("__kmp_initialize_info1: T#%d:%d this_thread=%p curtask=%p\n", 4218 tid, gtid, this_thr, this_thr->th.th_current_task)); 4219 4220 __kmp_init_implicit_task(this_thr->th.th_team_master->th.th_ident, this_thr, 4221 team, tid, TRUE); 4222 4223 KF_TRACE(10, ("__kmp_initialize_info2: T#%d:%d this_thread=%p curtask=%p\n", 4224 tid, gtid, this_thr, this_thr->th.th_current_task)); 4225 // TODO: Initialize ICVs from parent; GEH - isn't that already done in 4226 // __kmp_initialize_team()? 
4227 4228 /* TODO no worksharing in speculative threads */ 4229 this_thr->th.th_dispatch = &team->t.t_dispatch[tid]; 4230 4231 this_thr->th.th_local.this_construct = 0; 4232 4233 if (!this_thr->th.th_pri_common) { 4234 this_thr->th.th_pri_common = 4235 (struct common_table *)__kmp_allocate(sizeof(struct common_table)); 4236 if (__kmp_storage_map) { 4237 __kmp_print_storage_map_gtid( 4238 gtid, this_thr->th.th_pri_common, this_thr->th.th_pri_common + 1, 4239 sizeof(struct common_table), "th_%d.th_pri_common\n", gtid); 4240 } 4241 this_thr->th.th_pri_head = NULL; 4242 } 4243 4244 if (this_thr != master && // Primary thread's CG root is initialized elsewhere 4245 this_thr->th.th_cg_roots != master->th.th_cg_roots) { // CG root not set 4246 // Make new thread's CG root same as primary thread's 4247 KMP_DEBUG_ASSERT(master->th.th_cg_roots); 4248 kmp_cg_root_t *tmp = this_thr->th.th_cg_roots; 4249 if (tmp) { 4250 // worker changes CG, need to check if old CG should be freed 4251 int i = tmp->cg_nthreads--; 4252 KA_TRACE(100, ("__kmp_initialize_info: Thread %p decrement cg_nthreads" 4253 " on node %p of thread %p to %d\n", 4254 this_thr, tmp, tmp->cg_root, tmp->cg_nthreads)); 4255 if (i == 1) { 4256 __kmp_free(tmp); // last thread left CG --> free it 4257 } 4258 } 4259 this_thr->th.th_cg_roots = master->th.th_cg_roots; 4260 // Increment new thread's CG root's counter to add the new thread 4261 this_thr->th.th_cg_roots->cg_nthreads++; 4262 KA_TRACE(100, ("__kmp_initialize_info: Thread %p increment cg_nthreads on" 4263 " node %p of thread %p to %d\n", 4264 this_thr, this_thr->th.th_cg_roots, 4265 this_thr->th.th_cg_roots->cg_root, 4266 this_thr->th.th_cg_roots->cg_nthreads)); 4267 this_thr->th.th_current_task->td_icvs.thread_limit = 4268 this_thr->th.th_cg_roots->cg_thread_limit; 4269 } 4270 4271 /* Initialize dynamic dispatch */ 4272 { 4273 volatile kmp_disp_t *dispatch = this_thr->th.th_dispatch; 4274 // Use team max_nproc since this will never change for the team. 4275 size_t disp_size = 4276 sizeof(dispatch_private_info_t) * 4277 (team->t.t_max_nproc == 1 ? 1 : __kmp_dispatch_num_buffers); 4278 KD_TRACE(10, ("__kmp_initialize_info: T#%d max_nproc: %d\n", gtid, 4279 team->t.t_max_nproc)); 4280 KMP_ASSERT(dispatch); 4281 KMP_DEBUG_ASSERT(team->t.t_dispatch); 4282 KMP_DEBUG_ASSERT(dispatch == &team->t.t_dispatch[tid]); 4283 4284 dispatch->th_disp_index = 0; 4285 dispatch->th_doacross_buf_idx = 0; 4286 if (!dispatch->th_disp_buffer) { 4287 dispatch->th_disp_buffer = 4288 (dispatch_private_info_t *)__kmp_allocate(disp_size); 4289 4290 if (__kmp_storage_map) { 4291 __kmp_print_storage_map_gtid( 4292 gtid, &dispatch->th_disp_buffer[0], 4293 &dispatch->th_disp_buffer[team->t.t_max_nproc == 1 4294 ? 
1 4295 : __kmp_dispatch_num_buffers], 4296 disp_size, 4297 "th_%d.th_dispatch.th_disp_buffer " 4298 "(team_%d.t_dispatch[%d].th_disp_buffer)", 4299 gtid, team->t.t_id, gtid); 4300 } 4301 } else { 4302 memset(&dispatch->th_disp_buffer[0], '\0', disp_size); 4303 } 4304 4305 dispatch->th_dispatch_pr_current = 0; 4306 dispatch->th_dispatch_sh_current = 0; 4307 4308 dispatch->th_deo_fcn = 0; /* ORDERED */ 4309 dispatch->th_dxo_fcn = 0; /* END ORDERED */ 4310 } 4311 4312 this_thr->th.th_next_pool = NULL; 4313 4314 if (!this_thr->th.th_task_state_memo_stack) { 4315 size_t i; 4316 this_thr->th.th_task_state_memo_stack = 4317 (kmp_uint8 *)__kmp_allocate(4 * sizeof(kmp_uint8)); 4318 this_thr->th.th_task_state_top = 0; 4319 this_thr->th.th_task_state_stack_sz = 4; 4320 for (i = 0; i < this_thr->th.th_task_state_stack_sz; 4321 ++i) // zero init the stack 4322 this_thr->th.th_task_state_memo_stack[i] = 0; 4323 } 4324 4325 KMP_DEBUG_ASSERT(!this_thr->th.th_spin_here); 4326 KMP_DEBUG_ASSERT(this_thr->th.th_next_waiting == 0); 4327 4328 KMP_MB(); 4329 } 4330 4331 /* allocate a new thread for the requesting team. this is only called from 4332 within a forkjoin critical section. we will first try to get an available 4333 thread from the thread pool. if none is available, we will fork a new one 4334 assuming we are able to create a new one. this should be assured, as the 4335 caller should check on this first. */ 4336 kmp_info_t *__kmp_allocate_thread(kmp_root_t *root, kmp_team_t *team, 4337 int new_tid) { 4338 kmp_team_t *serial_team; 4339 kmp_info_t *new_thr; 4340 int new_gtid; 4341 4342 KA_TRACE(20, ("__kmp_allocate_thread: T#%d\n", __kmp_get_gtid())); 4343 KMP_DEBUG_ASSERT(root && team); 4344 #if !KMP_NESTED_HOT_TEAMS 4345 KMP_DEBUG_ASSERT(KMP_MASTER_GTID(__kmp_get_gtid())); 4346 #endif 4347 KMP_MB(); 4348 4349 /* first, try to get one from the thread pool */ 4350 if (__kmp_thread_pool) { 4351 new_thr = CCAST(kmp_info_t *, __kmp_thread_pool); 4352 __kmp_thread_pool = (volatile kmp_info_t *)new_thr->th.th_next_pool; 4353 if (new_thr == __kmp_thread_pool_insert_pt) { 4354 __kmp_thread_pool_insert_pt = NULL; 4355 } 4356 TCW_4(new_thr->th.th_in_pool, FALSE); 4357 __kmp_suspend_initialize_thread(new_thr); 4358 __kmp_lock_suspend_mx(new_thr); 4359 if (new_thr->th.th_active_in_pool == TRUE) { 4360 KMP_DEBUG_ASSERT(new_thr->th.th_active == TRUE); 4361 KMP_ATOMIC_DEC(&__kmp_thread_pool_active_nth); 4362 new_thr->th.th_active_in_pool = FALSE; 4363 } 4364 __kmp_unlock_suspend_mx(new_thr); 4365 4366 KA_TRACE(20, ("__kmp_allocate_thread: T#%d using thread T#%d\n", 4367 __kmp_get_gtid(), new_thr->th.th_info.ds.ds_gtid)); 4368 KMP_ASSERT(!new_thr->th.th_team); 4369 KMP_DEBUG_ASSERT(__kmp_nth < __kmp_threads_capacity); 4370 4371 /* setup the thread structure */ 4372 __kmp_initialize_info(new_thr, team, new_tid, 4373 new_thr->th.th_info.ds.ds_gtid); 4374 KMP_DEBUG_ASSERT(new_thr->th.th_serial_team); 4375 4376 TCW_4(__kmp_nth, __kmp_nth + 1); 4377 4378 new_thr->th.th_task_state = 0; 4379 new_thr->th.th_task_state_top = 0; 4380 new_thr->th.th_task_state_stack_sz = 4; 4381 4382 if (__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) { 4383 // Make sure pool thread has transitioned to waiting on own thread struct 4384 KMP_DEBUG_ASSERT(new_thr->th.th_used_in_team.load() == 0); 4385 // Thread activated in __kmp_allocate_team when increasing team size 4386 } 4387 4388 #ifdef KMP_ADJUST_BLOCKTIME 4389 /* Adjust blocktime back to zero if necessary */ 4390 /* Middle initialization might not have occurred yet */ 4391 
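    // With more runtime threads than available processors, spinning for the
    // full blocktime at barriers would oversubscribe the machine, so the
    // effective blocktime is forced to zero (threads go to sleep immediately)
    // unless the user set the blocktime explicitly.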
if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) { 4392 if (__kmp_nth > __kmp_avail_proc) { 4393 __kmp_zero_bt = TRUE; 4394 } 4395 } 4396 #endif /* KMP_ADJUST_BLOCKTIME */ 4397 4398 #if KMP_DEBUG 4399 // If thread entered pool via __kmp_free_thread, wait_flag should != 4400 // KMP_BARRIER_PARENT_FLAG. 4401 int b; 4402 kmp_balign_t *balign = new_thr->th.th_bar; 4403 for (b = 0; b < bs_last_barrier; ++b) 4404 KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG); 4405 #endif 4406 4407 KF_TRACE(10, ("__kmp_allocate_thread: T#%d using thread %p T#%d\n", 4408 __kmp_get_gtid(), new_thr, new_thr->th.th_info.ds.ds_gtid)); 4409 4410 KMP_MB(); 4411 return new_thr; 4412 } 4413 4414 /* no, well fork a new one */ 4415 KMP_ASSERT(__kmp_nth == __kmp_all_nth); 4416 KMP_ASSERT(__kmp_all_nth < __kmp_threads_capacity); 4417 4418 #if KMP_USE_MONITOR 4419 // If this is the first worker thread the RTL is creating, then also 4420 // launch the monitor thread. We try to do this as early as possible. 4421 if (!TCR_4(__kmp_init_monitor)) { 4422 __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock); 4423 if (!TCR_4(__kmp_init_monitor)) { 4424 KF_TRACE(10, ("before __kmp_create_monitor\n")); 4425 TCW_4(__kmp_init_monitor, 1); 4426 __kmp_create_monitor(&__kmp_monitor); 4427 KF_TRACE(10, ("after __kmp_create_monitor\n")); 4428 #if KMP_OS_WINDOWS 4429 // AC: wait until monitor has started. This is a fix for CQ232808. 4430 // The reason is that if the library is loaded/unloaded in a loop with 4431 // small (parallel) work in between, then there is high probability that 4432 // monitor thread started after the library shutdown. At shutdown it is 4433 // too late to cope with the problem, because when the primary thread is 4434 // in DllMain (process detach) the monitor has no chances to start (it is 4435 // blocked), and primary thread has no means to inform the monitor that 4436 // the library has gone, because all the memory which the monitor can 4437 // access is going to be released/reset. 4438 while (TCR_4(__kmp_init_monitor) < 2) { 4439 KMP_YIELD(TRUE); 4440 } 4441 KF_TRACE(10, ("after monitor thread has started\n")); 4442 #endif 4443 } 4444 __kmp_release_bootstrap_lock(&__kmp_monitor_lock); 4445 } 4446 #endif 4447 4448 KMP_MB(); 4449 4450 { 4451 int new_start_gtid = TCR_4(__kmp_init_hidden_helper_threads) 4452 ? 1 4453 : __kmp_hidden_helper_threads_num + 1; 4454 4455 for (new_gtid = new_start_gtid; TCR_PTR(__kmp_threads[new_gtid]) != NULL; 4456 ++new_gtid) { 4457 KMP_DEBUG_ASSERT(new_gtid < __kmp_threads_capacity); 4458 } 4459 4460 if (TCR_4(__kmp_init_hidden_helper_threads)) { 4461 KMP_DEBUG_ASSERT(new_gtid <= __kmp_hidden_helper_threads_num); 4462 } 4463 } 4464 4465 /* allocate space for it. 
*/ 4466 new_thr = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t)); 4467 4468 TCW_SYNC_PTR(__kmp_threads[new_gtid], new_thr); 4469 4470 #if USE_ITT_BUILD && USE_ITT_NOTIFY && KMP_DEBUG 4471 // suppress race conditions detection on synchronization flags in debug mode 4472 // this helps to analyze library internals eliminating false positives 4473 __itt_suppress_mark_range( 4474 __itt_suppress_range, __itt_suppress_threading_errors, 4475 &new_thr->th.th_sleep_loc, sizeof(new_thr->th.th_sleep_loc)); 4476 __itt_suppress_mark_range( 4477 __itt_suppress_range, __itt_suppress_threading_errors, 4478 &new_thr->th.th_reap_state, sizeof(new_thr->th.th_reap_state)); 4479 #if KMP_OS_WINDOWS 4480 __itt_suppress_mark_range( 4481 __itt_suppress_range, __itt_suppress_threading_errors, 4482 &new_thr->th.th_suspend_init, sizeof(new_thr->th.th_suspend_init)); 4483 #else 4484 __itt_suppress_mark_range(__itt_suppress_range, 4485 __itt_suppress_threading_errors, 4486 &new_thr->th.th_suspend_init_count, 4487 sizeof(new_thr->th.th_suspend_init_count)); 4488 #endif 4489 // TODO: check if we need to also suppress b_arrived flags 4490 __itt_suppress_mark_range(__itt_suppress_range, 4491 __itt_suppress_threading_errors, 4492 CCAST(kmp_uint64 *, &new_thr->th.th_bar[0].bb.b_go), 4493 sizeof(new_thr->th.th_bar[0].bb.b_go)); 4494 __itt_suppress_mark_range(__itt_suppress_range, 4495 __itt_suppress_threading_errors, 4496 CCAST(kmp_uint64 *, &new_thr->th.th_bar[1].bb.b_go), 4497 sizeof(new_thr->th.th_bar[1].bb.b_go)); 4498 __itt_suppress_mark_range(__itt_suppress_range, 4499 __itt_suppress_threading_errors, 4500 CCAST(kmp_uint64 *, &new_thr->th.th_bar[2].bb.b_go), 4501 sizeof(new_thr->th.th_bar[2].bb.b_go)); 4502 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY && KMP_DEBUG */ 4503 if (__kmp_storage_map) { 4504 __kmp_print_thread_storage_map(new_thr, new_gtid); 4505 } 4506 4507 // add the reserve serialized team, initialized from the team's primary thread 4508 { 4509 kmp_internal_control_t r_icvs = __kmp_get_x_global_icvs(team); 4510 KF_TRACE(10, ("__kmp_allocate_thread: before th_serial/serial_team\n")); 4511 new_thr->th.th_serial_team = serial_team = 4512 (kmp_team_t *)__kmp_allocate_team(root, 1, 1, 4513 #if OMPT_SUPPORT 4514 ompt_data_none, // root parallel id 4515 #endif 4516 proc_bind_default, &r_icvs, 4517 0 USE_NESTED_HOT_ARG(NULL)); 4518 } 4519 KMP_ASSERT(serial_team); 4520 serial_team->t.t_serialized = 0; // AC: the team created in reserve, not for 4521 // execution (it is unused for now). 
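  // The new thread is installed as the sole (primary) member of its reserve
  // serial team, mirroring what __kmp_register_root does for root threads.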
4522 serial_team->t.t_threads[0] = new_thr; 4523 KF_TRACE(10, 4524 ("__kmp_allocate_thread: after th_serial/serial_team : new_thr=%p\n", 4525 new_thr)); 4526 4527 /* setup the thread structures */ 4528 __kmp_initialize_info(new_thr, team, new_tid, new_gtid); 4529 4530 #if USE_FAST_MEMORY 4531 __kmp_initialize_fast_memory(new_thr); 4532 #endif /* USE_FAST_MEMORY */ 4533 4534 #if KMP_USE_BGET 4535 KMP_DEBUG_ASSERT(new_thr->th.th_local.bget_data == NULL); 4536 __kmp_initialize_bget(new_thr); 4537 #endif 4538 4539 __kmp_init_random(new_thr); // Initialize random number generator 4540 4541 /* Initialize these only once when thread is grabbed for a team allocation */ 4542 KA_TRACE(20, 4543 ("__kmp_allocate_thread: T#%d init go fork=%u, plain=%u\n", 4544 __kmp_get_gtid(), KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE)); 4545 4546 int b; 4547 kmp_balign_t *balign = new_thr->th.th_bar; 4548 for (b = 0; b < bs_last_barrier; ++b) { 4549 balign[b].bb.b_go = KMP_INIT_BARRIER_STATE; 4550 balign[b].bb.team = NULL; 4551 balign[b].bb.wait_flag = KMP_BARRIER_NOT_WAITING; 4552 balign[b].bb.use_oncore_barrier = 0; 4553 } 4554 4555 TCW_PTR(new_thr->th.th_sleep_loc, NULL); 4556 new_thr->th.th_sleep_loc_type = flag_unset; 4557 4558 new_thr->th.th_spin_here = FALSE; 4559 new_thr->th.th_next_waiting = 0; 4560 #if KMP_OS_UNIX 4561 new_thr->th.th_blocking = false; 4562 #endif 4563 4564 #if KMP_AFFINITY_SUPPORTED 4565 new_thr->th.th_current_place = KMP_PLACE_UNDEFINED; 4566 new_thr->th.th_new_place = KMP_PLACE_UNDEFINED; 4567 new_thr->th.th_first_place = KMP_PLACE_UNDEFINED; 4568 new_thr->th.th_last_place = KMP_PLACE_UNDEFINED; 4569 #endif 4570 new_thr->th.th_def_allocator = __kmp_def_allocator; 4571 new_thr->th.th_prev_level = 0; 4572 new_thr->th.th_prev_num_threads = 1; 4573 4574 TCW_4(new_thr->th.th_in_pool, FALSE); 4575 new_thr->th.th_active_in_pool = FALSE; 4576 TCW_4(new_thr->th.th_active, TRUE); 4577 4578 /* adjust the global counters */ 4579 __kmp_all_nth++; 4580 __kmp_nth++; 4581 4582 // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search) for low 4583 // numbers of procs, and method #2 (keyed API call) for higher numbers. 4584 if (__kmp_adjust_gtid_mode) { 4585 if (__kmp_all_nth >= __kmp_tls_gtid_min) { 4586 if (TCR_4(__kmp_gtid_mode) != 2) { 4587 TCW_4(__kmp_gtid_mode, 2); 4588 } 4589 } else { 4590 if (TCR_4(__kmp_gtid_mode) != 1) { 4591 TCW_4(__kmp_gtid_mode, 1); 4592 } 4593 } 4594 } 4595 4596 #ifdef KMP_ADJUST_BLOCKTIME 4597 /* Adjust blocktime back to zero if necessary */ 4598 /* Middle initialization might not have occurred yet */ 4599 if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) { 4600 if (__kmp_nth > __kmp_avail_proc) { 4601 __kmp_zero_bt = TRUE; 4602 } 4603 } 4604 #endif /* KMP_ADJUST_BLOCKTIME */ 4605 4606 /* actually fork it and create the new worker thread */ 4607 KF_TRACE( 4608 10, ("__kmp_allocate_thread: before __kmp_create_worker: %p\n", new_thr)); 4609 __kmp_create_worker(new_gtid, new_thr, __kmp_stksize); 4610 KF_TRACE(10, 4611 ("__kmp_allocate_thread: after __kmp_create_worker: %p\n", new_thr)); 4612 4613 KA_TRACE(20, ("__kmp_allocate_thread: T#%d forked T#%d\n", __kmp_get_gtid(), 4614 new_gtid)); 4615 KMP_MB(); 4616 return new_thr; 4617 } 4618 4619 /* Reinitialize team for reuse. 4620 The hot team code calls this case at every fork barrier, so EPCC barrier 4621 test are extremely sensitive to changes in it, esp. writes to the team 4622 struct, which cause a cache invalidation in all threads. 
4623 IF YOU TOUCH THIS ROUTINE, RUN EPCC C SYNCBENCH ON A BIG-IRON MACHINE!!! */ 4624 static void __kmp_reinitialize_team(kmp_team_t *team, 4625 kmp_internal_control_t *new_icvs, 4626 ident_t *loc) { 4627 KF_TRACE(10, ("__kmp_reinitialize_team: enter this_thread=%p team=%p\n", 4628 team->t.t_threads[0], team)); 4629 KMP_DEBUG_ASSERT(team && new_icvs); 4630 KMP_DEBUG_ASSERT((!TCR_4(__kmp_init_parallel)) || new_icvs->nproc); 4631 KMP_CHECK_UPDATE(team->t.t_ident, loc); 4632 4633 KMP_CHECK_UPDATE(team->t.t_id, KMP_GEN_TEAM_ID()); 4634 // Copy ICVs to the primary thread's implicit taskdata 4635 __kmp_init_implicit_task(loc, team->t.t_threads[0], team, 0, FALSE); 4636 copy_icvs(&team->t.t_implicit_task_taskdata[0].td_icvs, new_icvs); 4637 4638 KF_TRACE(10, ("__kmp_reinitialize_team: exit this_thread=%p team=%p\n", 4639 team->t.t_threads[0], team)); 4640 } 4641 4642 /* Initialize the team data structure. 4643 This assumes the t_threads and t_max_nproc are already set. 4644 Also, we don't touch the arguments */ 4645 static void __kmp_initialize_team(kmp_team_t *team, int new_nproc, 4646 kmp_internal_control_t *new_icvs, 4647 ident_t *loc) { 4648 KF_TRACE(10, ("__kmp_initialize_team: enter: team=%p\n", team)); 4649 4650 /* verify */ 4651 KMP_DEBUG_ASSERT(team); 4652 KMP_DEBUG_ASSERT(new_nproc <= team->t.t_max_nproc); 4653 KMP_DEBUG_ASSERT(team->t.t_threads); 4654 KMP_MB(); 4655 4656 team->t.t_master_tid = 0; /* not needed */ 4657 /* team->t.t_master_bar; not needed */ 4658 team->t.t_serialized = new_nproc > 1 ? 0 : 1; 4659 team->t.t_nproc = new_nproc; 4660 4661 /* team->t.t_parent = NULL; TODO not needed & would mess up hot team */ 4662 team->t.t_next_pool = NULL; 4663 /* memset( team->t.t_threads, 0, sizeof(kmp_info_t*)*new_nproc ); would mess 4664 * up hot team */ 4665 4666 TCW_SYNC_PTR(team->t.t_pkfn, NULL); /* not needed */ 4667 team->t.t_invoke = NULL; /* not needed */ 4668 4669 // TODO???: team->t.t_max_active_levels = new_max_active_levels; 4670 team->t.t_sched.sched = new_icvs->sched.sched; 4671 4672 #if KMP_ARCH_X86 || KMP_ARCH_X86_64 4673 team->t.t_fp_control_saved = FALSE; /* not needed */ 4674 team->t.t_x87_fpu_control_word = 0; /* not needed */ 4675 team->t.t_mxcsr = 0; /* not needed */ 4676 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */ 4677 4678 team->t.t_construct = 0; 4679 4680 team->t.t_ordered.dt.t_value = 0; 4681 team->t.t_master_active = FALSE; 4682 4683 #ifdef KMP_DEBUG 4684 team->t.t_copypriv_data = NULL; /* not necessary, but nice for debugging */ 4685 #endif 4686 #if KMP_OS_WINDOWS 4687 team->t.t_copyin_counter = 0; /* for barrier-free copyin implementation */ 4688 #endif 4689 4690 team->t.t_control_stack_top = NULL; 4691 4692 __kmp_reinitialize_team(team, new_icvs, loc); 4693 4694 KMP_MB(); 4695 KF_TRACE(10, ("__kmp_initialize_team: exit: team=%p\n", team)); 4696 } 4697 4698 #if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED 4699 /* Sets full mask for thread and returns old mask, no changes to structures. 
*/ 4700 static void 4701 __kmp_set_thread_affinity_mask_full_tmp(kmp_affin_mask_t *old_mask) { 4702 if (KMP_AFFINITY_CAPABLE()) { 4703 int status; 4704 if (old_mask != NULL) { 4705 status = __kmp_get_system_affinity(old_mask, TRUE); 4706 int error = errno; 4707 if (status != 0) { 4708 __kmp_fatal(KMP_MSG(ChangeThreadAffMaskError), KMP_ERR(error), 4709 __kmp_msg_null); 4710 } 4711 } 4712 __kmp_set_system_affinity(__kmp_affin_fullMask, TRUE); 4713 } 4714 } 4715 #endif 4716 4717 #if KMP_AFFINITY_SUPPORTED 4718 4719 // __kmp_partition_places() is the heart of the OpenMP 4.0 affinity mechanism. 4720 // It calculates the worker + primary thread's partition based upon the parent 4721 // thread's partition, and binds each worker to a thread in their partition. 4722 // The primary thread's partition should already include its current binding. 4723 static void __kmp_partition_places(kmp_team_t *team, int update_master_only) { 4724 // Do not partition places for the hidden helper team 4725 if (KMP_HIDDEN_HELPER_TEAM(team)) 4726 return; 4727 // Copy the primary thread's place partition to the team struct 4728 kmp_info_t *master_th = team->t.t_threads[0]; 4729 KMP_DEBUG_ASSERT(master_th != NULL); 4730 kmp_proc_bind_t proc_bind = team->t.t_proc_bind; 4731 int first_place = master_th->th.th_first_place; 4732 int last_place = master_th->th.th_last_place; 4733 int masters_place = master_th->th.th_current_place; 4734 team->t.t_first_place = first_place; 4735 team->t.t_last_place = last_place; 4736 4737 KA_TRACE(20, ("__kmp_partition_places: enter: proc_bind = %d T#%d(%d:0) " 4738 "bound to place %d partition = [%d,%d]\n", 4739 proc_bind, __kmp_gtid_from_thread(team->t.t_threads[0]), 4740 team->t.t_id, masters_place, first_place, last_place)); 4741 4742 switch (proc_bind) { 4743 4744 case proc_bind_default: 4745 // Serial teams might have the proc_bind policy set to proc_bind_default. 4746 // Not an issue -- we don't rebind primary thread for any proc_bind policy. 
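    // (Only a team of size 1 is expected to reach this case, which is what the
    // assert below checks.)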
4747 KMP_DEBUG_ASSERT(team->t.t_nproc == 1); 4748 break; 4749 4750 case proc_bind_primary: { 4751 int f; 4752 int n_th = team->t.t_nproc; 4753 for (f = 1; f < n_th; f++) { 4754 kmp_info_t *th = team->t.t_threads[f]; 4755 KMP_DEBUG_ASSERT(th != NULL); 4756 th->th.th_first_place = first_place; 4757 th->th.th_last_place = last_place; 4758 th->th.th_new_place = masters_place; 4759 if (__kmp_display_affinity && masters_place != th->th.th_current_place && 4760 team->t.t_display_affinity != 1) { 4761 team->t.t_display_affinity = 1; 4762 } 4763 4764 KA_TRACE(100, ("__kmp_partition_places: primary: T#%d(%d:%d) place %d " 4765 "partition = [%d,%d]\n", 4766 __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id, 4767 f, masters_place, first_place, last_place)); 4768 } 4769 } break; 4770 4771 case proc_bind_close: { 4772 int f; 4773 int n_th = team->t.t_nproc; 4774 int n_places; 4775 if (first_place <= last_place) { 4776 n_places = last_place - first_place + 1; 4777 } else { 4778 n_places = __kmp_affinity_num_masks - first_place + last_place + 1; 4779 } 4780 if (n_th <= n_places) { 4781 int place = masters_place; 4782 for (f = 1; f < n_th; f++) { 4783 kmp_info_t *th = team->t.t_threads[f]; 4784 KMP_DEBUG_ASSERT(th != NULL); 4785 4786 if (place == last_place) { 4787 place = first_place; 4788 } else if (place == (int)(__kmp_affinity_num_masks - 1)) { 4789 place = 0; 4790 } else { 4791 place++; 4792 } 4793 th->th.th_first_place = first_place; 4794 th->th.th_last_place = last_place; 4795 th->th.th_new_place = place; 4796 if (__kmp_display_affinity && place != th->th.th_current_place && 4797 team->t.t_display_affinity != 1) { 4798 team->t.t_display_affinity = 1; 4799 } 4800 4801 KA_TRACE(100, ("__kmp_partition_places: close: T#%d(%d:%d) place %d " 4802 "partition = [%d,%d]\n", 4803 __kmp_gtid_from_thread(team->t.t_threads[f]), 4804 team->t.t_id, f, place, first_place, last_place)); 4805 } 4806 } else { 4807 int S, rem, gap, s_count; 4808 S = n_th / n_places; 4809 s_count = 0; 4810 rem = n_th - (S * n_places); 4811 gap = rem > 0 ? 
n_places / rem : n_places; 4812 int place = masters_place; 4813 int gap_ct = gap; 4814 for (f = 0; f < n_th; f++) { 4815 kmp_info_t *th = team->t.t_threads[f]; 4816 KMP_DEBUG_ASSERT(th != NULL); 4817 4818 th->th.th_first_place = first_place; 4819 th->th.th_last_place = last_place; 4820 th->th.th_new_place = place; 4821 if (__kmp_display_affinity && place != th->th.th_current_place && 4822 team->t.t_display_affinity != 1) { 4823 team->t.t_display_affinity = 1; 4824 } 4825 s_count++; 4826 4827 if ((s_count == S) && rem && (gap_ct == gap)) { 4828 // do nothing, add an extra thread to place on next iteration 4829 } else if ((s_count == S + 1) && rem && (gap_ct == gap)) { 4830 // we added an extra thread to this place; move to next place 4831 if (place == last_place) { 4832 place = first_place; 4833 } else if (place == (int)(__kmp_affinity_num_masks - 1)) { 4834 place = 0; 4835 } else { 4836 place++; 4837 } 4838 s_count = 0; 4839 gap_ct = 1; 4840 rem--; 4841 } else if (s_count == S) { // place full; don't add extra 4842 if (place == last_place) { 4843 place = first_place; 4844 } else if (place == (int)(__kmp_affinity_num_masks - 1)) { 4845 place = 0; 4846 } else { 4847 place++; 4848 } 4849 gap_ct++; 4850 s_count = 0; 4851 } 4852 4853 KA_TRACE(100, 4854 ("__kmp_partition_places: close: T#%d(%d:%d) place %d " 4855 "partition = [%d,%d]\n", 4856 __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id, f, 4857 th->th.th_new_place, first_place, last_place)); 4858 } 4859 KMP_DEBUG_ASSERT(place == masters_place); 4860 } 4861 } break; 4862 4863 case proc_bind_spread: { 4864 int f; 4865 int n_th = team->t.t_nproc; 4866 int n_places; 4867 int thidx; 4868 if (first_place <= last_place) { 4869 n_places = last_place - first_place + 1; 4870 } else { 4871 n_places = __kmp_affinity_num_masks - first_place + last_place + 1; 4872 } 4873 if (n_th <= n_places) { 4874 int place = -1; 4875 4876 if (n_places != static_cast<int>(__kmp_affinity_num_masks)) { 4877 int S = n_places / n_th; 4878 int s_count, rem, gap, gap_ct; 4879 4880 place = masters_place; 4881 rem = n_places - n_th * S; 4882 gap = rem ? 
n_th / rem : 1; 4883 gap_ct = gap; 4884 thidx = n_th; 4885 if (update_master_only == 1) 4886 thidx = 1; 4887 for (f = 0; f < thidx; f++) { 4888 kmp_info_t *th = team->t.t_threads[f]; 4889 KMP_DEBUG_ASSERT(th != NULL); 4890 4891 th->th.th_first_place = place; 4892 th->th.th_new_place = place; 4893 if (__kmp_display_affinity && place != th->th.th_current_place && 4894 team->t.t_display_affinity != 1) { 4895 team->t.t_display_affinity = 1; 4896 } 4897 s_count = 1; 4898 while (s_count < S) { 4899 if (place == last_place) { 4900 place = first_place; 4901 } else if (place == (int)(__kmp_affinity_num_masks - 1)) { 4902 place = 0; 4903 } else { 4904 place++; 4905 } 4906 s_count++; 4907 } 4908 if (rem && (gap_ct == gap)) { 4909 if (place == last_place) { 4910 place = first_place; 4911 } else if (place == (int)(__kmp_affinity_num_masks - 1)) { 4912 place = 0; 4913 } else { 4914 place++; 4915 } 4916 rem--; 4917 gap_ct = 0; 4918 } 4919 th->th.th_last_place = place; 4920 gap_ct++; 4921 4922 if (place == last_place) { 4923 place = first_place; 4924 } else if (place == (int)(__kmp_affinity_num_masks - 1)) { 4925 place = 0; 4926 } else { 4927 place++; 4928 } 4929 4930 KA_TRACE(100, 4931 ("__kmp_partition_places: spread: T#%d(%d:%d) place %d " 4932 "partition = [%d,%d], __kmp_affinity_num_masks: %u\n", 4933 __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id, 4934 f, th->th.th_new_place, th->th.th_first_place, 4935 th->th.th_last_place, __kmp_affinity_num_masks)); 4936 } 4937 } else { 4938 /* Having uniform space of available computation places I can create 4939 T partitions of round(P/T) size and put threads into the first 4940 place of each partition. */ 4941 double current = static_cast<double>(masters_place); 4942 double spacing = 4943 (static_cast<double>(n_places + 1) / static_cast<double>(n_th)); 4944 int first, last; 4945 kmp_info_t *th; 4946 4947 thidx = n_th + 1; 4948 if (update_master_only == 1) 4949 thidx = 1; 4950 for (f = 0; f < thidx; f++) { 4951 first = static_cast<int>(current); 4952 last = static_cast<int>(current + spacing) - 1; 4953 KMP_DEBUG_ASSERT(last >= first); 4954 if (first >= n_places) { 4955 if (masters_place) { 4956 first -= n_places; 4957 last -= n_places; 4958 if (first == (masters_place + 1)) { 4959 KMP_DEBUG_ASSERT(f == n_th); 4960 first--; 4961 } 4962 if (last == masters_place) { 4963 KMP_DEBUG_ASSERT(f == (n_th - 1)); 4964 last--; 4965 } 4966 } else { 4967 KMP_DEBUG_ASSERT(f == n_th); 4968 first = 0; 4969 last = 0; 4970 } 4971 } 4972 if (last >= n_places) { 4973 last = (n_places - 1); 4974 } 4975 place = first; 4976 current += spacing; 4977 if (f < n_th) { 4978 KMP_DEBUG_ASSERT(0 <= first); 4979 KMP_DEBUG_ASSERT(n_places > first); 4980 KMP_DEBUG_ASSERT(0 <= last); 4981 KMP_DEBUG_ASSERT(n_places > last); 4982 KMP_DEBUG_ASSERT(last_place >= first_place); 4983 th = team->t.t_threads[f]; 4984 KMP_DEBUG_ASSERT(th); 4985 th->th.th_first_place = first; 4986 th->th.th_new_place = place; 4987 th->th.th_last_place = last; 4988 if (__kmp_display_affinity && place != th->th.th_current_place && 4989 team->t.t_display_affinity != 1) { 4990 team->t.t_display_affinity = 1; 4991 } 4992 KA_TRACE(100, 4993 ("__kmp_partition_places: spread: T#%d(%d:%d) place %d " 4994 "partition = [%d,%d], spacing = %.4f\n", 4995 __kmp_gtid_from_thread(team->t.t_threads[f]), 4996 team->t.t_id, f, th->th.th_new_place, 4997 th->th.th_first_place, th->th.th_last_place, spacing)); 4998 } 4999 } 5000 } 5001 KMP_DEBUG_ASSERT(update_master_only || place == masters_place); 5002 } else { 5003 int S, rem, gap, 
s_count; 5004 S = n_th / n_places; 5005 s_count = 0; 5006 rem = n_th - (S * n_places); 5007 gap = rem > 0 ? n_places / rem : n_places; 5008 int place = masters_place; 5009 int gap_ct = gap; 5010 thidx = n_th; 5011 if (update_master_only == 1) 5012 thidx = 1; 5013 for (f = 0; f < thidx; f++) { 5014 kmp_info_t *th = team->t.t_threads[f]; 5015 KMP_DEBUG_ASSERT(th != NULL); 5016 5017 th->th.th_first_place = place; 5018 th->th.th_last_place = place; 5019 th->th.th_new_place = place; 5020 if (__kmp_display_affinity && place != th->th.th_current_place && 5021 team->t.t_display_affinity != 1) { 5022 team->t.t_display_affinity = 1; 5023 } 5024 s_count++; 5025 5026 if ((s_count == S) && rem && (gap_ct == gap)) { 5027 // do nothing, add an extra thread to place on next iteration 5028 } else if ((s_count == S + 1) && rem && (gap_ct == gap)) { 5029 // we added an extra thread to this place; move on to next place 5030 if (place == last_place) { 5031 place = first_place; 5032 } else if (place == (int)(__kmp_affinity_num_masks - 1)) { 5033 place = 0; 5034 } else { 5035 place++; 5036 } 5037 s_count = 0; 5038 gap_ct = 1; 5039 rem--; 5040 } else if (s_count == S) { // place is full; don't add extra thread 5041 if (place == last_place) { 5042 place = first_place; 5043 } else if (place == (int)(__kmp_affinity_num_masks - 1)) { 5044 place = 0; 5045 } else { 5046 place++; 5047 } 5048 gap_ct++; 5049 s_count = 0; 5050 } 5051 5052 KA_TRACE(100, ("__kmp_partition_places: spread: T#%d(%d:%d) place %d " 5053 "partition = [%d,%d]\n", 5054 __kmp_gtid_from_thread(team->t.t_threads[f]), 5055 team->t.t_id, f, th->th.th_new_place, 5056 th->th.th_first_place, th->th.th_last_place)); 5057 } 5058 KMP_DEBUG_ASSERT(update_master_only || place == masters_place); 5059 } 5060 } break; 5061 5062 default: 5063 break; 5064 } 5065 5066 KA_TRACE(20, ("__kmp_partition_places: exit T#%d\n", team->t.t_id)); 5067 } 5068 5069 #endif // KMP_AFFINITY_SUPPORTED 5070 5071 /* allocate a new team data structure to use. take one off of the free pool if 5072 available */ 5073 kmp_team_t * 5074 __kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc, 5075 #if OMPT_SUPPORT 5076 ompt_data_t ompt_parallel_data, 5077 #endif 5078 kmp_proc_bind_t new_proc_bind, 5079 kmp_internal_control_t *new_icvs, 5080 int argc USE_NESTED_HOT_ARG(kmp_info_t *master)) { 5081 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_allocate_team); 5082 int f; 5083 kmp_team_t *team; 5084 int use_hot_team = !root->r.r_active; 5085 int level = 0; 5086 int do_place_partition = 1; 5087 5088 KA_TRACE(20, ("__kmp_allocate_team: called\n")); 5089 KMP_DEBUG_ASSERT(new_nproc >= 1 && argc >= 0); 5090 KMP_DEBUG_ASSERT(max_nproc >= new_nproc); 5091 KMP_MB(); 5092 5093 #if KMP_NESTED_HOT_TEAMS 5094 kmp_hot_team_ptr_t *hot_teams; 5095 if (master) { 5096 team = master->th.th_team; 5097 level = team->t.t_active_level; 5098 if (master->th.th_teams_microtask) { // in teams construct? 
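// Hedged recap of the teams-construct handling below: the outer fork of a
// teams construct keeps the current level, while the league's inner fork
// (t_pkfn == __kmp_teams_master) or a parallel region nested inside a team
// bumps it when nteams > 1, so each nesting depth can use its own
// hot_teams[level] entry; for the inner fork the place partition is skipped
// and deferred until a nested parallel region is actually encountered.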
5099 if (master->th.th_teams_size.nteams > 1 && 5100 ( // #teams > 1 5101 team->t.t_pkfn == 5102 (microtask_t)__kmp_teams_master || // inner fork of the teams 5103 master->th.th_teams_level < 5104 team->t.t_level)) { // or nested parallel inside the teams 5105 ++level; // not increment if #teams==1, or for outer fork of the teams; 5106 // increment otherwise 5107 } 5108 // Do not perform the place partition if inner fork of the teams 5109 // Wait until nested parallel region encountered inside teams construct 5110 if ((master->th.th_teams_size.nteams == 1 && 5111 master->th.th_teams_level >= team->t.t_level) || 5112 (team->t.t_pkfn == (microtask_t)__kmp_teams_master)) 5113 do_place_partition = 0; 5114 } 5115 hot_teams = master->th.th_hot_teams; 5116 if (level < __kmp_hot_teams_max_level && hot_teams && 5117 hot_teams[level].hot_team) { 5118 // hot team has already been allocated for given level 5119 use_hot_team = 1; 5120 } else { 5121 use_hot_team = 0; 5122 } 5123 } else { 5124 // check we won't access uninitialized hot_teams, just in case 5125 KMP_DEBUG_ASSERT(new_nproc == 1); 5126 } 5127 #endif 5128 // Optimization to use a "hot" team 5129 if (use_hot_team && new_nproc > 1) { 5130 KMP_DEBUG_ASSERT(new_nproc <= max_nproc); 5131 #if KMP_NESTED_HOT_TEAMS 5132 team = hot_teams[level].hot_team; 5133 #else 5134 team = root->r.r_hot_team; 5135 #endif 5136 #if KMP_DEBUG 5137 if (__kmp_tasking_mode != tskm_immediate_exec) { 5138 KA_TRACE(20, ("__kmp_allocate_team: hot team task_team[0] = %p " 5139 "task_team[1] = %p before reinit\n", 5140 team->t.t_task_team[0], team->t.t_task_team[1])); 5141 } 5142 #endif 5143 5144 if (team->t.t_nproc != new_nproc && 5145 __kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) { 5146 // Distributed barrier may need a resize 5147 int old_nthr = team->t.t_nproc; 5148 __kmp_resize_dist_barrier(team, old_nthr, new_nproc); 5149 } 5150 5151 // If not doing the place partition, then reset the team's proc bind 5152 // to indicate that partitioning of all threads still needs to take place 5153 if (do_place_partition == 0) 5154 team->t.t_proc_bind = proc_bind_default; 5155 // Has the number of threads changed? 5156 /* Let's assume the most common case is that the number of threads is 5157 unchanged, and put that case first. 
*/ 5158 if (team->t.t_nproc == new_nproc) { // Check changes in number of threads 5159 KA_TRACE(20, ("__kmp_allocate_team: reusing hot team\n")); 5160 // This case can mean that omp_set_num_threads() was called and the hot 5161 // team size was already reduced, so we check the special flag 5162 if (team->t.t_size_changed == -1) { 5163 team->t.t_size_changed = 1; 5164 } else { 5165 KMP_CHECK_UPDATE(team->t.t_size_changed, 0); 5166 } 5167 5168 // TODO???: team->t.t_max_active_levels = new_max_active_levels; 5169 kmp_r_sched_t new_sched = new_icvs->sched; 5170 // set primary thread's schedule as new run-time schedule 5171 KMP_CHECK_UPDATE(team->t.t_sched.sched, new_sched.sched); 5172 5173 __kmp_reinitialize_team(team, new_icvs, 5174 root->r.r_uber_thread->th.th_ident); 5175 5176 KF_TRACE(10, ("__kmp_allocate_team2: T#%d, this_thread=%p team=%p\n", 0, 5177 team->t.t_threads[0], team)); 5178 __kmp_push_current_task_to_thread(team->t.t_threads[0], team, 0); 5179 5180 #if KMP_AFFINITY_SUPPORTED 5181 if ((team->t.t_size_changed == 0) && 5182 (team->t.t_proc_bind == new_proc_bind)) { 5183 if (new_proc_bind == proc_bind_spread) { 5184 if (do_place_partition) { 5185 // add flag to update only master for spread 5186 __kmp_partition_places(team, 1); 5187 } 5188 } 5189 KA_TRACE(200, ("__kmp_allocate_team: reusing hot team #%d bindings: " 5190 "proc_bind = %d, partition = [%d,%d]\n", 5191 team->t.t_id, new_proc_bind, team->t.t_first_place, 5192 team->t.t_last_place)); 5193 } else { 5194 if (do_place_partition) { 5195 KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind); 5196 __kmp_partition_places(team); 5197 } 5198 } 5199 #else 5200 KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind); 5201 #endif /* KMP_AFFINITY_SUPPORTED */ 5202 } else if (team->t.t_nproc > new_nproc) { 5203 KA_TRACE(20, 5204 ("__kmp_allocate_team: decreasing hot team thread count to %d\n", 5205 new_nproc)); 5206 5207 team->t.t_size_changed = 1; 5208 if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) { 5209 // Barrier size already reduced earlier in this function 5210 // Activate team threads via th_used_in_team 5211 __kmp_add_threads_to_team(team, new_nproc); 5212 } 5213 #if KMP_NESTED_HOT_TEAMS 5214 if (__kmp_hot_teams_mode == 0) { 5215 // AC: saved number of threads should correspond to team's value in this 5216 // mode, can be bigger in mode 1, when hot team has threads in reserve 5217 KMP_DEBUG_ASSERT(hot_teams[level].hot_team_nth == team->t.t_nproc); 5218 hot_teams[level].hot_team_nth = new_nproc; 5219 #endif // KMP_NESTED_HOT_TEAMS 5220 /* release the extra threads we don't need any more */ 5221 for (f = new_nproc; f < team->t.t_nproc; f++) { 5222 KMP_DEBUG_ASSERT(team->t.t_threads[f]); 5223 if (__kmp_tasking_mode != tskm_immediate_exec) { 5224 // When decreasing team size, threads no longer in the team should 5225 // unref task team. 
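// Illustrative shrink with hypothetical numbers: the hot team currently has
// t_nproc == 8 and new_nproc == 4. With __kmp_hot_teams_mode == 0, threads
// 4..7 are detached from the task team and handed back to the thread pool in
// the loop below; with mode 1 they stay attached to the hot team as reserve
// threads and are merely switched to wait on their own b_go flag.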
5226 team->t.t_threads[f]->th.th_task_team = NULL; 5227 } 5228 __kmp_free_thread(team->t.t_threads[f]); 5229 team->t.t_threads[f] = NULL; 5230 } 5231 #if KMP_NESTED_HOT_TEAMS 5232 } // (__kmp_hot_teams_mode == 0) 5233 else { 5234 // When keeping extra threads in team, switch threads to wait on own 5235 // b_go flag 5236 for (f = new_nproc; f < team->t.t_nproc; ++f) { 5237 KMP_DEBUG_ASSERT(team->t.t_threads[f]); 5238 kmp_balign_t *balign = team->t.t_threads[f]->th.th_bar; 5239 for (int b = 0; b < bs_last_barrier; ++b) { 5240 if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG) { 5241 balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG; 5242 } 5243 KMP_CHECK_UPDATE(balign[b].bb.leaf_kids, 0); 5244 } 5245 } 5246 } 5247 #endif // KMP_NESTED_HOT_TEAMS 5248 team->t.t_nproc = new_nproc; 5249 // TODO???: team->t.t_max_active_levels = new_max_active_levels; 5250 KMP_CHECK_UPDATE(team->t.t_sched.sched, new_icvs->sched.sched); 5251 __kmp_reinitialize_team(team, new_icvs, 5252 root->r.r_uber_thread->th.th_ident); 5253 5254 // Update remaining threads 5255 for (f = 0; f < new_nproc; ++f) { 5256 team->t.t_threads[f]->th.th_team_nproc = new_nproc; 5257 } 5258 5259 // restore the current task state of the primary thread: should be the 5260 // implicit task 5261 KF_TRACE(10, ("__kmp_allocate_team: T#%d, this_thread=%p team=%p\n", 0, 5262 team->t.t_threads[0], team)); 5263 5264 __kmp_push_current_task_to_thread(team->t.t_threads[0], team, 0); 5265 5266 #ifdef KMP_DEBUG 5267 for (f = 0; f < team->t.t_nproc; f++) { 5268 KMP_DEBUG_ASSERT(team->t.t_threads[f] && 5269 team->t.t_threads[f]->th.th_team_nproc == 5270 team->t.t_nproc); 5271 } 5272 #endif 5273 5274 if (do_place_partition) { 5275 KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind); 5276 #if KMP_AFFINITY_SUPPORTED 5277 __kmp_partition_places(team); 5278 #endif 5279 } 5280 } else { // team->t.t_nproc < new_nproc 5281 #if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED 5282 kmp_affin_mask_t *old_mask; 5283 if (KMP_AFFINITY_CAPABLE()) { 5284 KMP_CPU_ALLOC(old_mask); 5285 } 5286 #endif 5287 5288 KA_TRACE(20, 5289 ("__kmp_allocate_team: increasing hot team thread count to %d\n", 5290 new_nproc)); 5291 int old_nproc = team->t.t_nproc; // save old value and use to update only 5292 team->t.t_size_changed = 1; 5293 5294 #if KMP_NESTED_HOT_TEAMS 5295 int avail_threads = hot_teams[level].hot_team_nth; 5296 if (new_nproc < avail_threads) 5297 avail_threads = new_nproc; 5298 kmp_info_t **other_threads = team->t.t_threads; 5299 for (f = team->t.t_nproc; f < avail_threads; ++f) { 5300 // Adjust barrier data of reserved threads (if any) of the team 5301 // Other data will be set in __kmp_initialize_info() below. 5302 int b; 5303 kmp_balign_t *balign = other_threads[f]->th.th_bar; 5304 for (b = 0; b < bs_last_barrier; ++b) { 5305 balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived; 5306 KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG); 5307 #if USE_DEBUGGER 5308 balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived; 5309 #endif 5310 } 5311 } 5312 if (hot_teams[level].hot_team_nth >= new_nproc) { 5313 // we have all needed threads in reserve, no need to allocate any 5314 // this only possible in mode 1, cannot have reserved threads in mode 0 5315 KMP_DEBUG_ASSERT(__kmp_hot_teams_mode == 1); 5316 team->t.t_nproc = new_nproc; // just get reserved threads involved 5317 } else { 5318 // We may have some threads in reserve, but not enough; 5319 // get reserved threads involved if any. 
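// Illustrative grow with hypothetical numbers: t_nproc == 4, new_nproc == 8,
// and hot_team_nth == 6 reserve threads kept from an earlier, larger run.
// The barrier data of the two reserve threads was refreshed above; the code
// below first involves those reserves (t_nproc becomes 6), records the new
// maximum (hot_team_nth = 8), and then allocates the remaining two workers.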
5320 team->t.t_nproc = hot_teams[level].hot_team_nth;
5321 hot_teams[level].hot_team_nth = new_nproc; // adjust hot team max size
5322 #endif // KMP_NESTED_HOT_TEAMS
5323 if (team->t.t_max_nproc < new_nproc) {
5324 /* reallocate larger arrays */
5325 __kmp_reallocate_team_arrays(team, new_nproc);
5326 __kmp_reinitialize_team(team, new_icvs, NULL);
5327 }
5328
5329 #if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED
5330 /* Temporarily set full mask for primary thread before creation of
5331 workers. The reason is that workers inherit the affinity from the
5332 primary thread, so if a lot of workers are created on the single
5333 core quickly, they don't get a chance to set their own affinity for
5334 a long time. */
5335 __kmp_set_thread_affinity_mask_full_tmp(old_mask);
5336 #endif
5337
5338 /* allocate new threads for the hot team */
5339 for (f = team->t.t_nproc; f < new_nproc; f++) {
5340 kmp_info_t *new_worker = __kmp_allocate_thread(root, team, f);
5341 KMP_DEBUG_ASSERT(new_worker);
5342 team->t.t_threads[f] = new_worker;
5343
5344 KA_TRACE(20,
5345 ("__kmp_allocate_team: team %d init T#%d arrived: "
5346 "join=%llu, plain=%llu\n",
5347 team->t.t_id, __kmp_gtid_from_tid(f, team), team->t.t_id, f,
5348 team->t.t_bar[bs_forkjoin_barrier].b_arrived,
5349 team->t.t_bar[bs_plain_barrier].b_arrived));
5350
5351 { // Initialize barrier data for new threads.
5352 int b;
5353 kmp_balign_t *balign = new_worker->th.th_bar;
5354 for (b = 0; b < bs_last_barrier; ++b) {
5355 balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5356 KMP_DEBUG_ASSERT(balign[b].bb.wait_flag !=
5357 KMP_BARRIER_PARENT_FLAG);
5358 #if USE_DEBUGGER
5359 balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5360 #endif
5361 }
5362 }
5363 }
5364
5365 #if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED
5366 if (KMP_AFFINITY_CAPABLE()) {
5367 /* Restore initial primary thread's affinity mask */
5368 __kmp_set_system_affinity(old_mask, TRUE);
5369 KMP_CPU_FREE(old_mask);
5370 }
5371 #endif
5372 #if KMP_NESTED_HOT_TEAMS
5373 } // end of check of t_nproc vs. new_nproc vs. hot_team_nth
5374 #endif // KMP_NESTED_HOT_TEAMS
5375 if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5376 // Barrier size already increased earlier in this function
5377 // Activate team threads via th_used_in_team
5378 __kmp_add_threads_to_team(team, new_nproc);
5379 }
5380 /* make sure everyone is synchronized */
5381 // new threads below
5382 __kmp_initialize_team(team, new_nproc, new_icvs,
5383 root->r.r_uber_thread->th.th_ident);
5384
5385 /* reinitialize the threads */
5386 KMP_DEBUG_ASSERT(team->t.t_nproc == new_nproc);
5387 for (f = 0; f < team->t.t_nproc; ++f)
5388 __kmp_initialize_info(team->t.t_threads[f], team, f,
5389 __kmp_gtid_from_tid(f, team));
5390
5391 if (level) { // set th_task_state for new threads in nested hot team
5392 // __kmp_initialize_info() no longer zeroes th_task_state, so we should
5393 // only need to set the th_task_state for the new threads. th_task_state
5394 // for primary thread will not be accurate until after this in
5395 // __kmp_fork_call(), so we look to the primary thread's memo_stack to
5396 // get the correct value.
5397 for (f = old_nproc; f < team->t.t_nproc; ++f) 5398 team->t.t_threads[f]->th.th_task_state = 5399 team->t.t_threads[0]->th.th_task_state_memo_stack[level]; 5400 } else { // set th_task_state for new threads in non-nested hot team 5401 // copy primary thread's state 5402 kmp_uint8 old_state = team->t.t_threads[0]->th.th_task_state; 5403 for (f = old_nproc; f < team->t.t_nproc; ++f) 5404 team->t.t_threads[f]->th.th_task_state = old_state; 5405 } 5406 5407 #ifdef KMP_DEBUG 5408 for (f = 0; f < team->t.t_nproc; ++f) { 5409 KMP_DEBUG_ASSERT(team->t.t_threads[f] && 5410 team->t.t_threads[f]->th.th_team_nproc == 5411 team->t.t_nproc); 5412 } 5413 #endif 5414 5415 if (do_place_partition) { 5416 KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind); 5417 #if KMP_AFFINITY_SUPPORTED 5418 __kmp_partition_places(team); 5419 #endif 5420 } 5421 } // Check changes in number of threads 5422 5423 kmp_info_t *master = team->t.t_threads[0]; 5424 if (master->th.th_teams_microtask) { 5425 for (f = 1; f < new_nproc; ++f) { 5426 // propagate teams construct specific info to workers 5427 kmp_info_t *thr = team->t.t_threads[f]; 5428 thr->th.th_teams_microtask = master->th.th_teams_microtask; 5429 thr->th.th_teams_level = master->th.th_teams_level; 5430 thr->th.th_teams_size = master->th.th_teams_size; 5431 } 5432 } 5433 #if KMP_NESTED_HOT_TEAMS 5434 if (level) { 5435 // Sync barrier state for nested hot teams, not needed for outermost hot 5436 // team. 5437 for (f = 1; f < new_nproc; ++f) { 5438 kmp_info_t *thr = team->t.t_threads[f]; 5439 int b; 5440 kmp_balign_t *balign = thr->th.th_bar; 5441 for (b = 0; b < bs_last_barrier; ++b) { 5442 balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived; 5443 KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG); 5444 #if USE_DEBUGGER 5445 balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived; 5446 #endif 5447 } 5448 } 5449 } 5450 #endif // KMP_NESTED_HOT_TEAMS 5451 5452 /* reallocate space for arguments if necessary */ 5453 __kmp_alloc_argv_entries(argc, team, TRUE); 5454 KMP_CHECK_UPDATE(team->t.t_argc, argc); 5455 // The hot team re-uses the previous task team, 5456 // if untouched during the previous release->gather phase. 
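// At this point the three hot-team paths (size unchanged, shrink, grow) have
// converged: teams-construct info has been propagated to the workers, nested
// hot teams have re-synced their barrier state, and the argv area has been
// resized for argc; the remaining steps only trace, reassign the OMPT
// parallel id, and return the reused team.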
5457 5458 KF_TRACE(10, (" hot_team = %p\n", team)); 5459 5460 #if KMP_DEBUG 5461 if (__kmp_tasking_mode != tskm_immediate_exec) { 5462 KA_TRACE(20, ("__kmp_allocate_team: hot team task_team[0] = %p " 5463 "task_team[1] = %p after reinit\n", 5464 team->t.t_task_team[0], team->t.t_task_team[1])); 5465 } 5466 #endif 5467 5468 #if OMPT_SUPPORT 5469 __ompt_team_assign_id(team, ompt_parallel_data); 5470 #endif 5471 5472 KMP_MB(); 5473 5474 return team; 5475 } 5476 5477 /* next, let's try to take one from the team pool */ 5478 KMP_MB(); 5479 for (team = CCAST(kmp_team_t *, __kmp_team_pool); (team);) { 5480 /* TODO: consider resizing undersized teams instead of reaping them, now 5481 that we have a resizing mechanism */ 5482 if (team->t.t_max_nproc >= max_nproc) { 5483 /* take this team from the team pool */ 5484 __kmp_team_pool = team->t.t_next_pool; 5485 5486 if (max_nproc > 1 && 5487 __kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) { 5488 if (!team->t.b) { // Allocate barrier structure 5489 team->t.b = distributedBarrier::allocate(__kmp_dflt_team_nth_ub); 5490 } 5491 } 5492 5493 /* setup the team for fresh use */ 5494 __kmp_initialize_team(team, new_nproc, new_icvs, NULL); 5495 5496 KA_TRACE(20, ("__kmp_allocate_team: setting task_team[0] %p and " 5497 "task_team[1] %p to NULL\n", 5498 &team->t.t_task_team[0], &team->t.t_task_team[1])); 5499 team->t.t_task_team[0] = NULL; 5500 team->t.t_task_team[1] = NULL; 5501 5502 /* reallocate space for arguments if necessary */ 5503 __kmp_alloc_argv_entries(argc, team, TRUE); 5504 KMP_CHECK_UPDATE(team->t.t_argc, argc); 5505 5506 KA_TRACE( 5507 20, ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n", 5508 team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE)); 5509 { // Initialize barrier data. 5510 int b; 5511 for (b = 0; b < bs_last_barrier; ++b) { 5512 team->t.t_bar[b].b_arrived = KMP_INIT_BARRIER_STATE; 5513 #if USE_DEBUGGER 5514 team->t.t_bar[b].b_master_arrived = 0; 5515 team->t.t_bar[b].b_team_arrived = 0; 5516 #endif 5517 } 5518 } 5519 5520 team->t.t_proc_bind = new_proc_bind; 5521 5522 KA_TRACE(20, ("__kmp_allocate_team: using team from pool %d.\n", 5523 team->t.t_id)); 5524 5525 #if OMPT_SUPPORT 5526 __ompt_team_assign_id(team, ompt_parallel_data); 5527 #endif 5528 5529 KMP_MB(); 5530 5531 return team; 5532 } 5533 5534 /* reap team if it is too small, then loop back and check the next one */ 5535 // not sure if this is wise, but, will be redone during the hot-teams 5536 // rewrite. 5537 /* TODO: Use technique to find the right size hot-team, don't reap them */ 5538 team = __kmp_reap_team(team); 5539 __kmp_team_pool = team; 5540 } 5541 5542 /* nothing available in the pool, no matter, make a new team! 
*/ 5543 KMP_MB(); 5544 team = (kmp_team_t *)__kmp_allocate(sizeof(kmp_team_t)); 5545 5546 /* and set it up */ 5547 team->t.t_max_nproc = max_nproc; 5548 if (max_nproc > 1 && 5549 __kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) { 5550 // Allocate barrier structure 5551 team->t.b = distributedBarrier::allocate(__kmp_dflt_team_nth_ub); 5552 } 5553 5554 /* NOTE well, for some reason allocating one big buffer and dividing it up 5555 seems to really hurt performance a lot on the P4, so, let's not use this */ 5556 __kmp_allocate_team_arrays(team, max_nproc); 5557 5558 KA_TRACE(20, ("__kmp_allocate_team: making a new team\n")); 5559 __kmp_initialize_team(team, new_nproc, new_icvs, NULL); 5560 5561 KA_TRACE(20, ("__kmp_allocate_team: setting task_team[0] %p and task_team[1] " 5562 "%p to NULL\n", 5563 &team->t.t_task_team[0], &team->t.t_task_team[1])); 5564 team->t.t_task_team[0] = NULL; // to be removed, as __kmp_allocate zeroes 5565 // memory, no need to duplicate 5566 team->t.t_task_team[1] = NULL; // to be removed, as __kmp_allocate zeroes 5567 // memory, no need to duplicate 5568 5569 if (__kmp_storage_map) { 5570 __kmp_print_team_storage_map("team", team, team->t.t_id, new_nproc); 5571 } 5572 5573 /* allocate space for arguments */ 5574 __kmp_alloc_argv_entries(argc, team, FALSE); 5575 team->t.t_argc = argc; 5576 5577 KA_TRACE(20, 5578 ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n", 5579 team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE)); 5580 { // Initialize barrier data. 5581 int b; 5582 for (b = 0; b < bs_last_barrier; ++b) { 5583 team->t.t_bar[b].b_arrived = KMP_INIT_BARRIER_STATE; 5584 #if USE_DEBUGGER 5585 team->t.t_bar[b].b_master_arrived = 0; 5586 team->t.t_bar[b].b_team_arrived = 0; 5587 #endif 5588 } 5589 } 5590 5591 team->t.t_proc_bind = new_proc_bind; 5592 5593 #if OMPT_SUPPORT 5594 __ompt_team_assign_id(team, ompt_parallel_data); 5595 team->t.ompt_serialized_team_info = NULL; 5596 #endif 5597 5598 KMP_MB(); 5599 5600 KA_TRACE(20, ("__kmp_allocate_team: done creating a new team %d.\n", 5601 team->t.t_id)); 5602 5603 return team; 5604 } 5605 5606 /* TODO implement hot-teams at all levels */ 5607 /* TODO implement lazy thread release on demand (disband request) */ 5608 5609 /* free the team. return it to the team pool. release all the threads 5610 * associated with it */ 5611 void __kmp_free_team(kmp_root_t *root, 5612 kmp_team_t *team USE_NESTED_HOT_ARG(kmp_info_t *master)) { 5613 int f; 5614 KA_TRACE(20, ("__kmp_free_team: T#%d freeing team %d\n", __kmp_get_gtid(), 5615 team->t.t_id)); 5616 5617 /* verify state */ 5618 KMP_DEBUG_ASSERT(root); 5619 KMP_DEBUG_ASSERT(team); 5620 KMP_DEBUG_ASSERT(team->t.t_nproc <= team->t.t_max_nproc); 5621 KMP_DEBUG_ASSERT(team->t.t_threads); 5622 5623 int use_hot_team = team == root->r.r_hot_team; 5624 #if KMP_NESTED_HOT_TEAMS 5625 int level; 5626 if (master) { 5627 level = team->t.t_active_level - 1; 5628 if (master->th.th_teams_microtask) { // in teams construct? 
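// The level adjustments below mirror the bookkeeping in __kmp_allocate_team()
// so that the team being freed is matched against the same hot_teams[level]
// slot it was taken from (see the KMP_DEBUG_ASSERT on hot_team further down).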
5629 if (master->th.th_teams_size.nteams > 1) { 5630 ++level; // level was not increased in teams construct for 5631 // team_of_masters 5632 } 5633 if (team->t.t_pkfn != (microtask_t)__kmp_teams_master && 5634 master->th.th_teams_level == team->t.t_level) { 5635 ++level; // level was not increased in teams construct for 5636 // team_of_workers before the parallel 5637 } // team->t.t_level will be increased inside parallel 5638 } 5639 #if KMP_DEBUG 5640 kmp_hot_team_ptr_t *hot_teams = master->th.th_hot_teams; 5641 #endif 5642 if (level < __kmp_hot_teams_max_level) { 5643 KMP_DEBUG_ASSERT(team == hot_teams[level].hot_team); 5644 use_hot_team = 1; 5645 } 5646 } 5647 #endif // KMP_NESTED_HOT_TEAMS 5648 5649 /* team is done working */ 5650 TCW_SYNC_PTR(team->t.t_pkfn, 5651 NULL); // Important for Debugging Support Library. 5652 #if KMP_OS_WINDOWS 5653 team->t.t_copyin_counter = 0; // init counter for possible reuse 5654 #endif 5655 // Do not reset pointer to parent team to NULL for hot teams. 5656 5657 /* if we are non-hot team, release our threads */ 5658 if (!use_hot_team) { 5659 if (__kmp_tasking_mode != tskm_immediate_exec) { 5660 // Wait for threads to reach reapable state 5661 for (f = 1; f < team->t.t_nproc; ++f) { 5662 KMP_DEBUG_ASSERT(team->t.t_threads[f]); 5663 kmp_info_t *th = team->t.t_threads[f]; 5664 volatile kmp_uint32 *state = &th->th.th_reap_state; 5665 while (*state != KMP_SAFE_TO_REAP) { 5666 #if KMP_OS_WINDOWS 5667 // On Windows a thread can be killed at any time, check this 5668 DWORD ecode; 5669 if (!__kmp_is_thread_alive(th, &ecode)) { 5670 *state = KMP_SAFE_TO_REAP; // reset the flag for dead thread 5671 break; 5672 } 5673 #endif 5674 // first check if thread is sleeping 5675 kmp_flag_64<> fl(&th->th.th_bar[bs_forkjoin_barrier].bb.b_go, th); 5676 if (fl.is_sleeping()) 5677 fl.resume(__kmp_gtid_from_thread(th)); 5678 KMP_CPU_PAUSE(); 5679 } 5680 } 5681 5682 // Delete task teams 5683 int tt_idx; 5684 for (tt_idx = 0; tt_idx < 2; ++tt_idx) { 5685 kmp_task_team_t *task_team = team->t.t_task_team[tt_idx]; 5686 if (task_team != NULL) { 5687 for (f = 0; f < team->t.t_nproc; ++f) { // threads unref task teams 5688 KMP_DEBUG_ASSERT(team->t.t_threads[f]); 5689 team->t.t_threads[f]->th.th_task_team = NULL; 5690 } 5691 KA_TRACE( 5692 20, 5693 ("__kmp_free_team: T#%d deactivating task_team %p on team %d\n", 5694 __kmp_get_gtid(), task_team, team->t.t_id)); 5695 #if KMP_NESTED_HOT_TEAMS 5696 __kmp_free_task_team(master, task_team); 5697 #endif 5698 team->t.t_task_team[tt_idx] = NULL; 5699 } 5700 } 5701 } 5702 5703 // Reset pointer to parent team only for non-hot teams. 
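// Hedged summary of the non-hot-team teardown that follows: clear the
// parent/level fields, hand each worker back to the thread pool via
// __kmp_free_thread() (with a th_used_in_team handshake and explicit resume
// when the distributed barrier is in use), release the distributed barrier
// storage, and finally push the team itself onto __kmp_team_pool for reuse.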
5704 team->t.t_parent = NULL; 5705 team->t.t_level = 0; 5706 team->t.t_active_level = 0; 5707 5708 /* free the worker threads */ 5709 for (f = 1; f < team->t.t_nproc; ++f) { 5710 KMP_DEBUG_ASSERT(team->t.t_threads[f]); 5711 if (__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) { 5712 KMP_COMPARE_AND_STORE_ACQ32(&(team->t.t_threads[f]->th.th_used_in_team), 5713 1, 2); 5714 } 5715 __kmp_free_thread(team->t.t_threads[f]); 5716 } 5717 5718 if (__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) { 5719 if (team->t.b) { 5720 // wake up thread at old location 5721 team->t.b->go_release(); 5722 if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) { 5723 for (f = 1; f < team->t.t_nproc; ++f) { 5724 if (team->t.b->sleep[f].sleep) { 5725 __kmp_atomic_resume_64( 5726 team->t.t_threads[f]->th.th_info.ds.ds_gtid, 5727 (kmp_atomic_flag_64<> *)NULL); 5728 } 5729 } 5730 } 5731 // Wait for threads to be removed from team 5732 for (int f = 1; f < team->t.t_nproc; ++f) { 5733 while (team->t.t_threads[f]->th.th_used_in_team.load() != 0) 5734 KMP_CPU_PAUSE(); 5735 } 5736 } 5737 } 5738 5739 for (f = 1; f < team->t.t_nproc; ++f) { 5740 team->t.t_threads[f] = NULL; 5741 } 5742 5743 if (team->t.t_max_nproc > 1 && 5744 __kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) { 5745 distributedBarrier::deallocate(team->t.b); 5746 team->t.b = NULL; 5747 } 5748 /* put the team back in the team pool */ 5749 /* TODO limit size of team pool, call reap_team if pool too large */ 5750 team->t.t_next_pool = CCAST(kmp_team_t *, __kmp_team_pool); 5751 __kmp_team_pool = (volatile kmp_team_t *)team; 5752 } else { // Check if team was created for primary threads in teams construct 5753 // See if first worker is a CG root 5754 KMP_DEBUG_ASSERT(team->t.t_threads[1] && 5755 team->t.t_threads[1]->th.th_cg_roots); 5756 if (team->t.t_threads[1]->th.th_cg_roots->cg_root == team->t.t_threads[1]) { 5757 // Clean up the CG root nodes on workers so that this team can be re-used 5758 for (f = 1; f < team->t.t_nproc; ++f) { 5759 kmp_info_t *thr = team->t.t_threads[f]; 5760 KMP_DEBUG_ASSERT(thr && thr->th.th_cg_roots && 5761 thr->th.th_cg_roots->cg_root == thr); 5762 // Pop current CG root off list 5763 kmp_cg_root_t *tmp = thr->th.th_cg_roots; 5764 thr->th.th_cg_roots = tmp->up; 5765 KA_TRACE(100, ("__kmp_free_team: Thread %p popping node %p and moving" 5766 " up to node %p. cg_nthreads was %d\n", 5767 thr, tmp, thr->th.th_cg_roots, tmp->cg_nthreads)); 5768 int i = tmp->cg_nthreads--; 5769 if (i == 1) { 5770 __kmp_free(tmp); // free CG if we are the last thread in it 5771 } 5772 // Restore current task's thread_limit from CG root 5773 if (thr->th.th_cg_roots) 5774 thr->th.th_current_task->td_icvs.thread_limit = 5775 thr->th.th_cg_roots->cg_thread_limit; 5776 } 5777 } 5778 } 5779 5780 KMP_MB(); 5781 } 5782 5783 /* reap the team. destroy it, reclaim all its resources and free its memory */ 5784 kmp_team_t *__kmp_reap_team(kmp_team_t *team) { 5785 kmp_team_t *next_pool = team->t.t_next_pool; 5786 5787 KMP_DEBUG_ASSERT(team); 5788 KMP_DEBUG_ASSERT(team->t.t_dispatch); 5789 KMP_DEBUG_ASSERT(team->t.t_disp_buffer); 5790 KMP_DEBUG_ASSERT(team->t.t_threads); 5791 KMP_DEBUG_ASSERT(team->t.t_argv); 5792 5793 /* TODO clean the threads that are a part of this? */ 5794 5795 /* free stuff */ 5796 __kmp_free_team_arrays(team); 5797 if (team->t.t_argv != &team->t.t_inline_argv[0]) 5798 __kmp_free((void *)team->t.t_argv); 5799 __kmp_free(team); 5800 5801 KMP_MB(); 5802 return next_pool; 5803 } 5804 5805 // Free the thread. 
Don't reap it, just place it on the pool of available 5806 // threads. 5807 // 5808 // Changes for Quad issue 527845: We need a predictable OMP tid <-> gtid 5809 // binding for the affinity mechanism to be useful. 5810 // 5811 // Now, we always keep the free list (__kmp_thread_pool) sorted by gtid. 5812 // However, we want to avoid a potential performance problem by always 5813 // scanning through the list to find the correct point at which to insert 5814 // the thread (potential N**2 behavior). To do this we keep track of the 5815 // last place a thread struct was inserted (__kmp_thread_pool_insert_pt). 5816 // With single-level parallelism, threads will always be added to the tail 5817 // of the list, kept track of by __kmp_thread_pool_insert_pt. With nested 5818 // parallelism, all bets are off and we may need to scan through the entire 5819 // free list. 5820 // 5821 // This change also has a potentially large performance benefit, for some 5822 // applications. Previously, as threads were freed from the hot team, they 5823 // would be placed back on the free list in inverse order. If the hot team 5824 // grew back to it's original size, then the freed thread would be placed 5825 // back on the hot team in reverse order. This could cause bad cache 5826 // locality problems on programs where the size of the hot team regularly 5827 // grew and shrunk. 5828 // 5829 // Now, for single-level parallelism, the OMP tid is always == gtid. 5830 void __kmp_free_thread(kmp_info_t *this_th) { 5831 int gtid; 5832 kmp_info_t **scan; 5833 5834 KA_TRACE(20, ("__kmp_free_thread: T#%d putting T#%d back on free pool.\n", 5835 __kmp_get_gtid(), this_th->th.th_info.ds.ds_gtid)); 5836 5837 KMP_DEBUG_ASSERT(this_th); 5838 5839 // When moving thread to pool, switch thread to wait on own b_go flag, and 5840 // uninitialized (NULL team). 5841 int b; 5842 kmp_balign_t *balign = this_th->th.th_bar; 5843 for (b = 0; b < bs_last_barrier; ++b) { 5844 if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG) 5845 balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG; 5846 balign[b].bb.team = NULL; 5847 balign[b].bb.leaf_kids = 0; 5848 } 5849 this_th->th.th_task_state = 0; 5850 this_th->th.th_reap_state = KMP_SAFE_TO_REAP; 5851 5852 /* put thread back on the free pool */ 5853 TCW_PTR(this_th->th.th_team, NULL); 5854 TCW_PTR(this_th->th.th_root, NULL); 5855 TCW_PTR(this_th->th.th_dispatch, NULL); /* NOT NEEDED */ 5856 5857 while (this_th->th.th_cg_roots) { 5858 this_th->th.th_cg_roots->cg_nthreads--; 5859 KA_TRACE(100, ("__kmp_free_thread: Thread %p decrement cg_nthreads on node" 5860 " %p of thread %p to %d\n", 5861 this_th, this_th->th.th_cg_roots, 5862 this_th->th.th_cg_roots->cg_root, 5863 this_th->th.th_cg_roots->cg_nthreads)); 5864 kmp_cg_root_t *tmp = this_th->th.th_cg_roots; 5865 if (tmp->cg_root == this_th) { // Thread is a cg_root 5866 KMP_DEBUG_ASSERT(tmp->cg_nthreads == 0); 5867 KA_TRACE( 5868 5, ("__kmp_free_thread: Thread %p freeing node %p\n", this_th, tmp)); 5869 this_th->th.th_cg_roots = tmp->up; 5870 __kmp_free(tmp); 5871 } else { // Worker thread 5872 if (tmp->cg_nthreads == 0) { // last thread leaves contention group 5873 __kmp_free(tmp); 5874 } 5875 this_th->th.th_cg_roots = NULL; 5876 break; 5877 } 5878 } 5879 5880 /* If the implicit task assigned to this thread can be used by other threads 5881 * -> multiple threads can share the data and try to free the task at 5882 * __kmp_reap_thread at exit. 
This duplicate use of the task data can happen 5883 * with higher probability when hot team is disabled but can occurs even when 5884 * the hot team is enabled */ 5885 __kmp_free_implicit_task(this_th); 5886 this_th->th.th_current_task = NULL; 5887 5888 // If the __kmp_thread_pool_insert_pt is already past the new insert 5889 // point, then we need to re-scan the entire list. 5890 gtid = this_th->th.th_info.ds.ds_gtid; 5891 if (__kmp_thread_pool_insert_pt != NULL) { 5892 KMP_DEBUG_ASSERT(__kmp_thread_pool != NULL); 5893 if (__kmp_thread_pool_insert_pt->th.th_info.ds.ds_gtid > gtid) { 5894 __kmp_thread_pool_insert_pt = NULL; 5895 } 5896 } 5897 5898 // Scan down the list to find the place to insert the thread. 5899 // scan is the address of a link in the list, possibly the address of 5900 // __kmp_thread_pool itself. 5901 // 5902 // In the absence of nested parallelism, the for loop will have 0 iterations. 5903 if (__kmp_thread_pool_insert_pt != NULL) { 5904 scan = &(__kmp_thread_pool_insert_pt->th.th_next_pool); 5905 } else { 5906 scan = CCAST(kmp_info_t **, &__kmp_thread_pool); 5907 } 5908 for (; (*scan != NULL) && ((*scan)->th.th_info.ds.ds_gtid < gtid); 5909 scan = &((*scan)->th.th_next_pool)) 5910 ; 5911 5912 // Insert the new element on the list, and set __kmp_thread_pool_insert_pt 5913 // to its address. 5914 TCW_PTR(this_th->th.th_next_pool, *scan); 5915 __kmp_thread_pool_insert_pt = *scan = this_th; 5916 KMP_DEBUG_ASSERT((this_th->th.th_next_pool == NULL) || 5917 (this_th->th.th_info.ds.ds_gtid < 5918 this_th->th.th_next_pool->th.th_info.ds.ds_gtid)); 5919 TCW_4(this_th->th.th_in_pool, TRUE); 5920 __kmp_suspend_initialize_thread(this_th); 5921 __kmp_lock_suspend_mx(this_th); 5922 if (this_th->th.th_active == TRUE) { 5923 KMP_ATOMIC_INC(&__kmp_thread_pool_active_nth); 5924 this_th->th.th_active_in_pool = TRUE; 5925 } 5926 #if KMP_DEBUG 5927 else { 5928 KMP_DEBUG_ASSERT(this_th->th.th_active_in_pool == FALSE); 5929 } 5930 #endif 5931 __kmp_unlock_suspend_mx(this_th); 5932 5933 TCW_4(__kmp_nth, __kmp_nth - 1); 5934 5935 #ifdef KMP_ADJUST_BLOCKTIME 5936 /* Adjust blocktime back to user setting or default if necessary */ 5937 /* Middle initialization might never have occurred */ 5938 if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) { 5939 KMP_DEBUG_ASSERT(__kmp_avail_proc > 0); 5940 if (__kmp_nth <= __kmp_avail_proc) { 5941 __kmp_zero_bt = FALSE; 5942 } 5943 } 5944 #endif /* KMP_ADJUST_BLOCKTIME */ 5945 5946 KMP_MB(); 5947 } 5948 5949 /* ------------------------------------------------------------------------ */ 5950 5951 void *__kmp_launch_thread(kmp_info_t *this_thr) { 5952 #if OMP_PROFILING_SUPPORT 5953 ProfileTraceFile = getenv("LIBOMPTARGET_PROFILE"); 5954 // TODO: add a configuration option for time granularity 5955 if (ProfileTraceFile) 5956 llvm::timeTraceProfilerInitialize(500 /* us */, "libomptarget"); 5957 #endif 5958 5959 int gtid = this_thr->th.th_info.ds.ds_gtid; 5960 /* void *stack_data;*/ 5961 kmp_team_t **volatile pteam; 5962 5963 KMP_MB(); 5964 KA_TRACE(10, ("__kmp_launch_thread: T#%d start\n", gtid)); 5965 5966 if (__kmp_env_consistency_check) { 5967 this_thr->th.th_cons = __kmp_allocate_cons_stack(gtid); // ATT: Memory leak? 
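// Hedged sketch of the worker lifecycle implemented below: park in
// __kmp_fork_barrier() (no tid yet, hence KMP_GTID_DNE) until the primary
// thread releases a team, run the microtask through (*pteam)->t.t_invoke(gtid),
// meet the team again in __kmp_join_barrier(), and repeat until
// __kmp_global.g.g_done is set during shutdown.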
5968 } 5969 5970 #if OMPD_SUPPORT 5971 if (ompd_state & OMPD_ENABLE_BP) 5972 ompd_bp_thread_begin(); 5973 #endif 5974 5975 #if OMPT_SUPPORT 5976 ompt_data_t *thread_data = nullptr; 5977 if (ompt_enabled.enabled) { 5978 thread_data = &(this_thr->th.ompt_thread_info.thread_data); 5979 *thread_data = ompt_data_none; 5980 5981 this_thr->th.ompt_thread_info.state = ompt_state_overhead; 5982 this_thr->th.ompt_thread_info.wait_id = 0; 5983 this_thr->th.ompt_thread_info.idle_frame = OMPT_GET_FRAME_ADDRESS(0); 5984 this_thr->th.ompt_thread_info.parallel_flags = 0; 5985 if (ompt_enabled.ompt_callback_thread_begin) { 5986 ompt_callbacks.ompt_callback(ompt_callback_thread_begin)( 5987 ompt_thread_worker, thread_data); 5988 } 5989 this_thr->th.ompt_thread_info.state = ompt_state_idle; 5990 } 5991 #endif 5992 5993 /* This is the place where threads wait for work */ 5994 while (!TCR_4(__kmp_global.g.g_done)) { 5995 KMP_DEBUG_ASSERT(this_thr == __kmp_threads[gtid]); 5996 KMP_MB(); 5997 5998 /* wait for work to do */ 5999 KA_TRACE(20, ("__kmp_launch_thread: T#%d waiting for work\n", gtid)); 6000 6001 /* No tid yet since not part of a team */ 6002 __kmp_fork_barrier(gtid, KMP_GTID_DNE); 6003 6004 #if OMPT_SUPPORT 6005 if (ompt_enabled.enabled) { 6006 this_thr->th.ompt_thread_info.state = ompt_state_overhead; 6007 } 6008 #endif 6009 6010 pteam = &this_thr->th.th_team; 6011 6012 /* have we been allocated? */ 6013 if (TCR_SYNC_PTR(*pteam) && !TCR_4(__kmp_global.g.g_done)) { 6014 /* we were just woken up, so run our new task */ 6015 if (TCR_SYNC_PTR((*pteam)->t.t_pkfn) != NULL) { 6016 int rc; 6017 KA_TRACE(20, 6018 ("__kmp_launch_thread: T#%d(%d:%d) invoke microtask = %p\n", 6019 gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid), 6020 (*pteam)->t.t_pkfn)); 6021 6022 updateHWFPControl(*pteam); 6023 6024 #if OMPT_SUPPORT 6025 if (ompt_enabled.enabled) { 6026 this_thr->th.ompt_thread_info.state = ompt_state_work_parallel; 6027 } 6028 #endif 6029 6030 rc = (*pteam)->t.t_invoke(gtid); 6031 KMP_ASSERT(rc); 6032 6033 KMP_MB(); 6034 KA_TRACE(20, ("__kmp_launch_thread: T#%d(%d:%d) done microtask = %p\n", 6035 gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid), 6036 (*pteam)->t.t_pkfn)); 6037 } 6038 #if OMPT_SUPPORT 6039 if (ompt_enabled.enabled) { 6040 /* no frame set while outside task */ 6041 __ompt_get_task_info_object(0)->frame.exit_frame = ompt_data_none; 6042 6043 this_thr->th.ompt_thread_info.state = ompt_state_overhead; 6044 } 6045 #endif 6046 /* join barrier after parallel region */ 6047 __kmp_join_barrier(gtid); 6048 } 6049 } 6050 TCR_SYNC_PTR((intptr_t)__kmp_global.g.g_done); 6051 6052 #if OMPD_SUPPORT 6053 if (ompd_state & OMPD_ENABLE_BP) 6054 ompd_bp_thread_end(); 6055 #endif 6056 6057 #if OMPT_SUPPORT 6058 if (ompt_enabled.ompt_callback_thread_end) { 6059 ompt_callbacks.ompt_callback(ompt_callback_thread_end)(thread_data); 6060 } 6061 #endif 6062 6063 this_thr->th.th_task_team = NULL; 6064 /* run the destructors for the threadprivate data for this thread */ 6065 __kmp_common_destroy_gtid(gtid); 6066 6067 KA_TRACE(10, ("__kmp_launch_thread: T#%d done\n", gtid)); 6068 KMP_MB(); 6069 6070 #if OMP_PROFILING_SUPPORT 6071 llvm::timeTraceProfilerFinishThread(); 6072 #endif 6073 return this_thr; 6074 } 6075 6076 /* ------------------------------------------------------------------------ */ 6077 6078 void __kmp_internal_end_dest(void *specific_gtid) { 6079 // Make sure no significant bits are lost 6080 int gtid; 6081 __kmp_type_convert((kmp_intptr_t)specific_gtid - 1, >id); 6082 6083 KA_TRACE(30, 
("__kmp_internal_end_dest: T#%d\n", gtid)); 6084 /* NOTE: the gtid is stored as gitd+1 in the thread-local-storage 6085 * this is because 0 is reserved for the nothing-stored case */ 6086 6087 __kmp_internal_end_thread(gtid); 6088 } 6089 6090 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB 6091 6092 __attribute__((destructor)) void __kmp_internal_end_dtor(void) { 6093 __kmp_internal_end_atexit(); 6094 } 6095 6096 #endif 6097 6098 /* [Windows] josh: when the atexit handler is called, there may still be more 6099 than one thread alive */ 6100 void __kmp_internal_end_atexit(void) { 6101 KA_TRACE(30, ("__kmp_internal_end_atexit\n")); 6102 /* [Windows] 6103 josh: ideally, we want to completely shutdown the library in this atexit 6104 handler, but stat code that depends on thread specific data for gtid fails 6105 because that data becomes unavailable at some point during the shutdown, so 6106 we call __kmp_internal_end_thread instead. We should eventually remove the 6107 dependency on __kmp_get_specific_gtid in the stat code and use 6108 __kmp_internal_end_library to cleanly shutdown the library. 6109 6110 // TODO: Can some of this comment about GVS be removed? 6111 I suspect that the offending stat code is executed when the calling thread 6112 tries to clean up a dead root thread's data structures, resulting in GVS 6113 code trying to close the GVS structures for that thread, but since the stat 6114 code uses __kmp_get_specific_gtid to get the gtid with the assumption that 6115 the calling thread is cleaning up itself instead of another thread, it get 6116 confused. This happens because allowing a thread to unregister and cleanup 6117 another thread is a recent modification for addressing an issue. 6118 Based on the current design (20050722), a thread may end up 6119 trying to unregister another thread only if thread death does not trigger 6120 the calling of __kmp_internal_end_thread. For Linux* OS, there is the 6121 thread specific data destructor function to detect thread death. For 6122 Windows dynamic, there is DllMain(THREAD_DETACH). For Windows static, there 6123 is nothing. Thus, the workaround is applicable only for Windows static 6124 stat library. */ 6125 __kmp_internal_end_library(-1); 6126 #if KMP_OS_WINDOWS 6127 __kmp_close_console(); 6128 #endif 6129 } 6130 6131 static void __kmp_reap_thread(kmp_info_t *thread, int is_root) { 6132 // It is assumed __kmp_forkjoin_lock is acquired. 6133 6134 int gtid; 6135 6136 KMP_DEBUG_ASSERT(thread != NULL); 6137 6138 gtid = thread->th.th_info.ds.ds_gtid; 6139 6140 if (!is_root) { 6141 if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) { 6142 /* Assume the threads are at the fork barrier here */ 6143 KA_TRACE( 6144 20, ("__kmp_reap_thread: releasing T#%d from fork barrier for reap\n", 6145 gtid)); 6146 if (__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) { 6147 while ( 6148 !KMP_COMPARE_AND_STORE_ACQ32(&(thread->th.th_used_in_team), 0, 3)) 6149 KMP_CPU_PAUSE(); 6150 __kmp_resume_32(gtid, (kmp_flag_32<false, false> *)NULL); 6151 } else { 6152 /* Need release fence here to prevent seg faults for tree forkjoin 6153 barrier (GEH) */ 6154 kmp_flag_64<> flag(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go, 6155 thread); 6156 __kmp_release_64(&flag); 6157 } 6158 } 6159 6160 // Terminate OS thread. 6161 __kmp_reap_worker(thread); 6162 6163 // The thread was killed asynchronously. If it was actively 6164 // spinning in the thread pool, decrement the global count. 
6165 // 6166 // There is a small timing hole here - if the worker thread was just waking 6167 // up after sleeping in the pool, had reset it's th_active_in_pool flag but 6168 // not decremented the global counter __kmp_thread_pool_active_nth yet, then 6169 // the global counter might not get updated. 6170 // 6171 // Currently, this can only happen as the library is unloaded, 6172 // so there are no harmful side effects. 6173 if (thread->th.th_active_in_pool) { 6174 thread->th.th_active_in_pool = FALSE; 6175 KMP_ATOMIC_DEC(&__kmp_thread_pool_active_nth); 6176 KMP_DEBUG_ASSERT(__kmp_thread_pool_active_nth >= 0); 6177 } 6178 } 6179 6180 __kmp_free_implicit_task(thread); 6181 6182 // Free the fast memory for tasking 6183 #if USE_FAST_MEMORY 6184 __kmp_free_fast_memory(thread); 6185 #endif /* USE_FAST_MEMORY */ 6186 6187 __kmp_suspend_uninitialize_thread(thread); 6188 6189 KMP_DEBUG_ASSERT(__kmp_threads[gtid] == thread); 6190 TCW_SYNC_PTR(__kmp_threads[gtid], NULL); 6191 6192 --__kmp_all_nth; 6193 // __kmp_nth was decremented when thread is added to the pool. 6194 6195 #ifdef KMP_ADJUST_BLOCKTIME 6196 /* Adjust blocktime back to user setting or default if necessary */ 6197 /* Middle initialization might never have occurred */ 6198 if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) { 6199 KMP_DEBUG_ASSERT(__kmp_avail_proc > 0); 6200 if (__kmp_nth <= __kmp_avail_proc) { 6201 __kmp_zero_bt = FALSE; 6202 } 6203 } 6204 #endif /* KMP_ADJUST_BLOCKTIME */ 6205 6206 /* free the memory being used */ 6207 if (__kmp_env_consistency_check) { 6208 if (thread->th.th_cons) { 6209 __kmp_free_cons_stack(thread->th.th_cons); 6210 thread->th.th_cons = NULL; 6211 } 6212 } 6213 6214 if (thread->th.th_pri_common != NULL) { 6215 __kmp_free(thread->th.th_pri_common); 6216 thread->th.th_pri_common = NULL; 6217 } 6218 6219 if (thread->th.th_task_state_memo_stack != NULL) { 6220 __kmp_free(thread->th.th_task_state_memo_stack); 6221 thread->th.th_task_state_memo_stack = NULL; 6222 } 6223 6224 #if KMP_USE_BGET 6225 if (thread->th.th_local.bget_data != NULL) { 6226 __kmp_finalize_bget(thread); 6227 } 6228 #endif 6229 6230 #if KMP_AFFINITY_SUPPORTED 6231 if (thread->th.th_affin_mask != NULL) { 6232 KMP_CPU_FREE(thread->th.th_affin_mask); 6233 thread->th.th_affin_mask = NULL; 6234 } 6235 #endif /* KMP_AFFINITY_SUPPORTED */ 6236 6237 #if KMP_USE_HIER_SCHED 6238 if (thread->th.th_hier_bar_data != NULL) { 6239 __kmp_free(thread->th.th_hier_bar_data); 6240 thread->th.th_hier_bar_data = NULL; 6241 } 6242 #endif 6243 6244 __kmp_reap_team(thread->th.th_serial_team); 6245 thread->th.th_serial_team = NULL; 6246 __kmp_free(thread); 6247 6248 KMP_MB(); 6249 6250 } // __kmp_reap_thread 6251 6252 static void __kmp_itthash_clean(kmp_info_t *th) { 6253 #if USE_ITT_NOTIFY 6254 if (__kmp_itt_region_domains.count > 0) { 6255 for (int i = 0; i < KMP_MAX_FRAME_DOMAINS; ++i) { 6256 kmp_itthash_entry_t *bucket = __kmp_itt_region_domains.buckets[i]; 6257 while (bucket) { 6258 kmp_itthash_entry_t *next = bucket->next_in_bucket; 6259 __kmp_thread_free(th, bucket); 6260 bucket = next; 6261 } 6262 } 6263 } 6264 if (__kmp_itt_barrier_domains.count > 0) { 6265 for (int i = 0; i < KMP_MAX_FRAME_DOMAINS; ++i) { 6266 kmp_itthash_entry_t *bucket = __kmp_itt_barrier_domains.buckets[i]; 6267 while (bucket) { 6268 kmp_itthash_entry_t *next = bucket->next_in_bucket; 6269 __kmp_thread_free(th, bucket); 6270 bucket = next; 6271 } 6272 } 6273 } 6274 #endif 6275 } 6276 6277 static void __kmp_internal_end(void) { 6278 int i; 6279 6280 /* First, unregister the library 
*/
6281 __kmp_unregister_library();
6282
6283 #if KMP_OS_WINDOWS
6284 /* In Win static library, we can't tell when a root actually dies, so we
6285 reclaim the data structures for any root threads that have died but not
6286 unregistered themselves, in order to shut down cleanly.
6287 In Win dynamic library we also can't tell when a thread dies. */
6288 __kmp_reclaim_dead_roots(); // AC: moved here to always clean resources of
6289 // dead roots
6290 #endif
6291
6292 for (i = 0; i < __kmp_threads_capacity; i++)
6293 if (__kmp_root[i])
6294 if (__kmp_root[i]->r.r_active)
6295 break;
6296 KMP_MB(); /* Flush all pending memory write invalidates. */
6297 TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6298
6299 if (i < __kmp_threads_capacity) {
6300 #if KMP_USE_MONITOR
6301 // 2009-09-08 (lev): Other alive roots found. Why do we kill the monitor??
6302 KMP_MB(); /* Flush all pending memory write invalidates. */
6303
6304 // Need to check that monitor was initialized before reaping it. If we are
6305 // called from __kmp_atfork_child (which sets __kmp_init_parallel = 0), then
6306 // __kmp_monitor will appear to contain valid data, but it is only valid in
6307 // the parent process, not the child.
6308 // New behavior (201008): instead of keying off of the flag
6309 // __kmp_init_parallel, the monitor thread creation is keyed off
6310 // of the new flag __kmp_init_monitor.
6311 __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
6312 if (TCR_4(__kmp_init_monitor)) {
6313 __kmp_reap_monitor(&__kmp_monitor);
6314 TCW_4(__kmp_init_monitor, 0);
6315 }
6316 __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
6317 KA_TRACE(10, ("__kmp_internal_end: monitor reaped\n"));
6318 #endif // KMP_USE_MONITOR
6319 } else {
6320 /* TODO move this to cleanup code */
6321 #ifdef KMP_DEBUG
6322 /* make sure that everything has properly ended */
6323 for (i = 0; i < __kmp_threads_capacity; i++) {
6324 if (__kmp_root[i]) {
6325 // KMP_ASSERT( ! KMP_UBER_GTID( i ) ); // AC:
6326 // there can be uber threads alive here
6327 KMP_ASSERT(!__kmp_root[i]->r.r_active); // TODO: can they be active?
6328 }
6329 }
6330 #endif
6331
6332 KMP_MB();
6333
6334 // Reap the worker threads.
6335 // This is valid for now, but be careful if threads are reaped sooner.
6336 while (__kmp_thread_pool != NULL) { // Loop thru all the threads in the pool.
6337 // Get the next thread from the pool.
6338 kmp_info_t *thread = CCAST(kmp_info_t *, __kmp_thread_pool);
6339 __kmp_thread_pool = thread->th.th_next_pool;
6340 // Reap it.
6341 KMP_DEBUG_ASSERT(thread->th.th_reap_state == KMP_SAFE_TO_REAP);
6342 thread->th.th_next_pool = NULL;
6343 thread->th.th_in_pool = FALSE;
6344 __kmp_reap_thread(thread, 0);
6345 }
6346 __kmp_thread_pool_insert_pt = NULL;
6347
6348 // Reap teams.
6349 while (__kmp_team_pool != NULL) { // Loop thru all the teams in the pool.
6350 // Get the next team from the pool.
6351 kmp_team_t *team = CCAST(kmp_team_t *, __kmp_team_pool);
6352 __kmp_team_pool = team->t.t_next_pool;
6353 // Reap it.
6354 team->t.t_next_pool = NULL;
6355 __kmp_reap_team(team);
6356 }
6357
6358 __kmp_reap_task_teams();
6359
6360 #if KMP_OS_UNIX
6361 // Threads that are not reaped should not access any resources since they
6362 // are going to be deallocated soon, so the shutdown sequence should wait
6363 // until all threads either exit the final spin-waiting loop or begin
6364 // sleeping after the given blocktime.
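// A hedged note on the wait loop below: th_blocking is read with
// KMP_ATOMIC_LD_ACQ, which is presumably paired with a release store on the
// worker side when it leaves its final spin-wait, so the shutdown path does
// not tear down data a still-spinning worker might touch; KMP_CPU_PAUSE()
// simply backs off between polls.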
6365 for (i = 0; i < __kmp_threads_capacity; i++) { 6366 kmp_info_t *thr = __kmp_threads[i]; 6367 while (thr && KMP_ATOMIC_LD_ACQ(&thr->th.th_blocking)) 6368 KMP_CPU_PAUSE(); 6369 } 6370 #endif 6371 6372 for (i = 0; i < __kmp_threads_capacity; ++i) { 6373 // TBD: Add some checking... 6374 // Something like KMP_DEBUG_ASSERT( __kmp_thread[ i ] == NULL ); 6375 } 6376 6377 /* Make sure all threadprivate destructors get run by joining with all 6378 worker threads before resetting this flag */ 6379 TCW_SYNC_4(__kmp_init_common, FALSE); 6380 6381 KA_TRACE(10, ("__kmp_internal_end: all workers reaped\n")); 6382 KMP_MB(); 6383 6384 #if KMP_USE_MONITOR 6385 // See note above: One of the possible fixes for CQ138434 / CQ140126 6386 // 6387 // FIXME: push both code fragments down and CSE them? 6388 // push them into __kmp_cleanup() ? 6389 __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock); 6390 if (TCR_4(__kmp_init_monitor)) { 6391 __kmp_reap_monitor(&__kmp_monitor); 6392 TCW_4(__kmp_init_monitor, 0); 6393 } 6394 __kmp_release_bootstrap_lock(&__kmp_monitor_lock); 6395 KA_TRACE(10, ("__kmp_internal_end: monitor reaped\n")); 6396 #endif 6397 } /* else !__kmp_global.t_active */ 6398 TCW_4(__kmp_init_gtid, FALSE); 6399 KMP_MB(); /* Flush all pending memory write invalidates. */ 6400 6401 __kmp_cleanup(); 6402 #if OMPT_SUPPORT 6403 ompt_fini(); 6404 #endif 6405 } 6406 6407 void __kmp_internal_end_library(int gtid_req) { 6408 /* if we have already cleaned up, don't try again, it wouldn't be pretty */ 6409 /* this shouldn't be a race condition because __kmp_internal_end() is the 6410 only place to clear __kmp_serial_init */ 6411 /* we'll check this later too, after we get the lock */ 6412 // 2009-09-06: We do not set g_abort without setting g_done. This check looks 6413 // redundant, because the next check will work in any case. 6414 if (__kmp_global.g.g_abort) { 6415 KA_TRACE(11, ("__kmp_internal_end_library: abort, exiting\n")); 6416 /* TODO abort? */ 6417 return; 6418 } 6419 if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) { 6420 KA_TRACE(10, ("__kmp_internal_end_library: already finished\n")); 6421 return; 6422 } 6423 6424 // If hidden helper team has been initialized, we need to deinit it 6425 if (TCR_4(__kmp_init_hidden_helper) && 6426 !TCR_4(__kmp_hidden_helper_team_done)) { 6427 TCW_SYNC_4(__kmp_hidden_helper_team_done, TRUE); 6428 // First release the main thread to let it continue its work 6429 __kmp_hidden_helper_main_thread_release(); 6430 // Wait until the hidden helper team has been destroyed 6431 __kmp_hidden_helper_threads_deinitz_wait(); 6432 } 6433 6434 KMP_MB(); /* Flush all pending memory write invalidates. */ 6435 /* find out who we are and what we should do */ 6436 { 6437 int gtid = (gtid_req >= 0) ? gtid_req : __kmp_gtid_get_specific(); 6438 KA_TRACE( 6439 10, ("__kmp_internal_end_library: enter T#%d (%d)\n", gtid, gtid_req)); 6440 if (gtid == KMP_GTID_SHUTDOWN) { 6441 KA_TRACE(10, ("__kmp_internal_end_library: !__kmp_init_runtime, system " 6442 "already shutdown\n")); 6443 return; 6444 } else if (gtid == KMP_GTID_MONITOR) { 6445 KA_TRACE(10, ("__kmp_internal_end_library: monitor thread, gtid not " 6446 "registered, or system shutdown\n")); 6447 return; 6448 } else if (gtid == KMP_GTID_DNE) { 6449 KA_TRACE(10, ("__kmp_internal_end_library: gtid not registered or system " 6450 "shutdown\n")); 6451 /* we don't know who we are, but we may still shutdown the library */ 6452 } else if (KMP_UBER_GTID(gtid)) { 6453 /* unregister ourselves as an uber thread. 
gtid is no longer valid */ 6454 if (__kmp_root[gtid]->r.r_active) { 6455 __kmp_global.g.g_abort = -1; 6456 TCW_SYNC_4(__kmp_global.g.g_done, TRUE); 6457 __kmp_unregister_library(); 6458 KA_TRACE(10, 6459 ("__kmp_internal_end_library: root still active, abort T#%d\n", 6460 gtid)); 6461 return; 6462 } else { 6463 __kmp_itthash_clean(__kmp_threads[gtid]); 6464 KA_TRACE( 6465 10, 6466 ("__kmp_internal_end_library: unregistering sibling T#%d\n", gtid)); 6467 __kmp_unregister_root_current_thread(gtid); 6468 } 6469 } else { 6470 /* worker threads may call this function through the atexit handler, if they 6471 * call exit() */ 6472 /* For now, skip the usual subsequent processing and just dump the debug buffer. 6473 TODO: do a thorough shutdown instead */ 6474 #ifdef DUMP_DEBUG_ON_EXIT 6475 if (__kmp_debug_buf) 6476 __kmp_dump_debug_buffer(); 6477 #endif 6478 // added unregister library call here when we switch to shm linux 6479 // if we don't, it will leave lots of files in /dev/shm 6480 // cleanup shared memory file before exiting. 6481 __kmp_unregister_library(); 6482 return; 6483 } 6484 } 6485 /* synchronize the termination process */ 6486 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock); 6487 6488 /* have we already finished */ 6489 if (__kmp_global.g.g_abort) { 6490 KA_TRACE(10, ("__kmp_internal_end_library: abort, exiting\n")); 6491 /* TODO abort? */ 6492 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 6493 return; 6494 } 6495 if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) { 6496 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 6497 return; 6498 } 6499 6500 /* We need this lock to enforce mutex between this reading of 6501 __kmp_threads_capacity and the writing by __kmp_register_root. 6502 Alternatively, we can use a counter of roots that is atomically updated by 6503 __kmp_get_global_thread_id_reg, __kmp_do_serial_initialize and 6504 __kmp_internal_end_*. */ 6505 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock); 6506 6507 /* now we can safely conduct the actual termination */ 6508 __kmp_internal_end(); 6509 6510 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); 6511 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 6512 6513 KA_TRACE(10, ("__kmp_internal_end_library: exit\n")); 6514 6515 #ifdef DUMP_DEBUG_ON_EXIT 6516 if (__kmp_debug_buf) 6517 __kmp_dump_debug_buffer(); 6518 #endif 6519 6520 #if KMP_OS_WINDOWS 6521 __kmp_close_console(); 6522 #endif 6523 6524 __kmp_fini_allocator(); 6525 6526 } // __kmp_internal_end_library 6527 6528 void __kmp_internal_end_thread(int gtid_req) { 6529 int i; 6530 6531 /* if we have already cleaned up, don't try again, it wouldn't be pretty */ 6532 /* this shouldn't be a race condition because __kmp_internal_end() is the 6533 * only place to clear __kmp_serial_init */ 6534 /* we'll check this later too, after we get the lock */ 6535 // 2009-09-06: We do not set g_abort without setting g_done. This check looks 6536 // redundant, because the next check will work in any case. 6537 if (__kmp_global.g.g_abort) { 6538 KA_TRACE(11, ("__kmp_internal_end_thread: abort, exiting\n")); 6539 /* TODO abort? 
*/ 6540 return; 6541 } 6542 if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) { 6543 KA_TRACE(10, ("__kmp_internal_end_thread: already finished\n")); 6544 return; 6545 } 6546 6547 // If hidden helper team has been initialized, we need to deinit it 6548 if (TCR_4(__kmp_init_hidden_helper) && 6549 !TCR_4(__kmp_hidden_helper_team_done)) { 6550 TCW_SYNC_4(__kmp_hidden_helper_team_done, TRUE); 6551 // First release the main thread to let it continue its work 6552 __kmp_hidden_helper_main_thread_release(); 6553 // Wait until the hidden helper team has been destroyed 6554 __kmp_hidden_helper_threads_deinitz_wait(); 6555 } 6556 6557 KMP_MB(); /* Flush all pending memory write invalidates. */ 6558 6559 /* find out who we are and what we should do */ 6560 { 6561 int gtid = (gtid_req >= 0) ? gtid_req : __kmp_gtid_get_specific(); 6562 KA_TRACE(10, 6563 ("__kmp_internal_end_thread: enter T#%d (%d)\n", gtid, gtid_req)); 6564 if (gtid == KMP_GTID_SHUTDOWN) { 6565 KA_TRACE(10, ("__kmp_internal_end_thread: !__kmp_init_runtime, system " 6566 "already shutdown\n")); 6567 return; 6568 } else if (gtid == KMP_GTID_MONITOR) { 6569 KA_TRACE(10, ("__kmp_internal_end_thread: monitor thread, gtid not " 6570 "registered, or system shutdown\n")); 6571 return; 6572 } else if (gtid == KMP_GTID_DNE) { 6573 KA_TRACE(10, ("__kmp_internal_end_thread: gtid not registered or system " 6574 "shutdown\n")); 6575 return; 6576 /* we don't know who we are */ 6577 } else if (KMP_UBER_GTID(gtid)) { 6578 /* unregister ourselves as an uber thread. gtid is no longer valid */ 6579 if (__kmp_root[gtid]->r.r_active) { 6580 __kmp_global.g.g_abort = -1; 6581 TCW_SYNC_4(__kmp_global.g.g_done, TRUE); 6582 KA_TRACE(10, 6583 ("__kmp_internal_end_thread: root still active, abort T#%d\n", 6584 gtid)); 6585 return; 6586 } else { 6587 KA_TRACE(10, ("__kmp_internal_end_thread: unregistering sibling T#%d\n", 6588 gtid)); 6589 __kmp_unregister_root_current_thread(gtid); 6590 } 6591 } else { 6592 /* just a worker thread, let's leave */ 6593 KA_TRACE(10, ("__kmp_internal_end_thread: worker thread T#%d\n", gtid)); 6594 6595 if (gtid >= 0) { 6596 __kmp_threads[gtid]->th.th_task_team = NULL; 6597 } 6598 6599 KA_TRACE(10, 6600 ("__kmp_internal_end_thread: worker thread done, exiting T#%d\n", 6601 gtid)); 6602 return; 6603 } 6604 } 6605 #if KMP_DYNAMIC_LIB 6606 if (__kmp_pause_status != kmp_hard_paused) 6607 // AC: lets not shutdown the dynamic library at the exit of uber thread, 6608 // because we will better shutdown later in the library destructor. 6609 { 6610 KA_TRACE(10, ("__kmp_internal_end_thread: exiting T#%d\n", gtid_req)); 6611 return; 6612 } 6613 #endif 6614 /* synchronize the termination process */ 6615 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock); 6616 6617 /* have we already finished */ 6618 if (__kmp_global.g.g_abort) { 6619 KA_TRACE(10, ("__kmp_internal_end_thread: abort, exiting\n")); 6620 /* TODO abort? */ 6621 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 6622 return; 6623 } 6624 if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) { 6625 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 6626 return; 6627 } 6628 6629 /* We need this lock to enforce mutex between this reading of 6630 __kmp_threads_capacity and the writing by __kmp_register_root. 6631 Alternatively, we can use a counter of roots that is atomically updated by 6632 __kmp_get_global_thread_id_reg, __kmp_do_serial_initialize and 6633 __kmp_internal_end_*. */ 6634 6635 /* should we finish the run-time? are all siblings done? 
*/ 6636 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock); 6637 6638 for (i = 0; i < __kmp_threads_capacity; ++i) { 6639 if (KMP_UBER_GTID(i)) { 6640 KA_TRACE( 6641 10, 6642 ("__kmp_internal_end_thread: remaining sibling task: gtid==%d\n", i)); 6643 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); 6644 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 6645 return; 6646 } 6647 } 6648 6649 /* now we can safely conduct the actual termination */ 6650 6651 __kmp_internal_end(); 6652 6653 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); 6654 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 6655 6656 KA_TRACE(10, ("__kmp_internal_end_thread: exit T#%d\n", gtid_req)); 6657 6658 #ifdef DUMP_DEBUG_ON_EXIT 6659 if (__kmp_debug_buf) 6660 __kmp_dump_debug_buffer(); 6661 #endif 6662 } // __kmp_internal_end_thread 6663 6664 // ----------------------------------------------------------------------------- 6665 // Library registration stuff. 6666 6667 static long __kmp_registration_flag = 0; 6668 // Random value used to indicate library initialization. 6669 static char *__kmp_registration_str = NULL; 6670 // Value to be saved in env var __KMP_REGISTERED_LIB_<pid>. 6671 6672 static inline char *__kmp_reg_status_name() { 6673 /* On RHEL 3u5 if linked statically, getpid() returns different values in 6674 each thread. If registration and unregistration go in different threads 6675 (omp_misc_other_root_exit.cpp test case), the name of registered_lib_env 6676 env var can not be found, because the name will contain different pid. */ 6677 // macOS* complains about name being too long with additional getuid() 6678 #if KMP_OS_UNIX && !KMP_OS_DARWIN && KMP_DYNAMIC_LIB 6679 return __kmp_str_format("__KMP_REGISTERED_LIB_%d_%d", (int)getpid(), 6680 (int)getuid()); 6681 #else 6682 return __kmp_str_format("__KMP_REGISTERED_LIB_%d", (int)getpid()); 6683 #endif 6684 } // __kmp_reg_status_get 6685 6686 void __kmp_register_library_startup(void) { 6687 6688 char *name = __kmp_reg_status_name(); // Name of the environment variable. 6689 int done = 0; 6690 union { 6691 double dtime; 6692 long ltime; 6693 } time; 6694 #if KMP_ARCH_X86 || KMP_ARCH_X86_64 6695 __kmp_initialize_system_tick(); 6696 #endif 6697 __kmp_read_system_time(&time.dtime); 6698 __kmp_registration_flag = 0xCAFE0000L | (time.ltime & 0x0000FFFFL); 6699 __kmp_registration_str = 6700 __kmp_str_format("%p-%lx-%s", &__kmp_registration_flag, 6701 __kmp_registration_flag, KMP_LIBRARY_FILE); 6702 6703 KA_TRACE(50, ("__kmp_register_library_startup: %s=\"%s\"\n", name, 6704 __kmp_registration_str)); 6705 6706 while (!done) { 6707 6708 char *value = NULL; // Actual value of the environment variable. 6709 6710 #if defined(KMP_USE_SHM) 6711 char *shm_name = __kmp_str_format("/%s", name); 6712 int shm_preexist = 0; 6713 char *data1; 6714 int fd1 = shm_open(shm_name, O_CREAT | O_EXCL | O_RDWR, 0666); 6715 if ((fd1 == -1) && (errno == EEXIST)) { 6716 // file didn't open because it already exists. 6717 // try opening existing file 6718 fd1 = shm_open(shm_name, O_RDWR, 0666); 6719 if (fd1 == -1) { // file didn't open 6720 // error out here 6721 __kmp_fatal(KMP_MSG(FunctionError, "Can't open SHM"), KMP_ERR(0), 6722 __kmp_msg_null); 6723 } else { 6724 // able to open existing file 6725 shm_preexist = 1; 6726 } 6727 } else if (fd1 == -1) { // SHM didn't open; it was due to error other than 6728 // already exists. 6729 // error out here. 
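// Illustrative sketch (not part of the runtime): the fatal-error call below
// handles any shm_open() failure other than EEXIST. The create-or-attach
// decision above relies on shm_open(O_CREAT | O_EXCL) succeeding for exactly
// one process, which is what makes the registration race-free. A reduced
// standalone version of that step is sketched here; the function name
// open_registration_segment and the bool out-parameter are made up for the
// example.
#if 0
#include <errno.h>
#include <fcntl.h>
#include <sys/mman.h>

// Returns the fd of the shared segment; *created is true if we made it.
static int open_registration_segment(const char *shm_name, bool *created) {
  int fd = shm_open(shm_name, O_CREAT | O_EXCL | O_RDWR, 0666);
  if (fd >= 0) {
    *created = true; // we won the race; caller must ftruncate() to SHM_SIZE
    return fd;
  }
  if (errno == EEXIST) {
    *created = false; // another runtime registered first; attach to it
    return shm_open(shm_name, O_RDWR, 0666);
  }
  return -1; // any other errno is a hard failure (the runtime calls __kmp_fatal)
}
#endif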
6730 __kmp_fatal(KMP_MSG(FunctionError, "Can't open SHM2"), KMP_ERR(errno), 6731 __kmp_msg_null); 6732 } 6733 if (shm_preexist == 0) { 6734 // we created the SHM; now set its size 6735 if (ftruncate(fd1, SHM_SIZE) == -1) { 6736 // error occurred setting size 6737 __kmp_fatal(KMP_MSG(FunctionError, "Can't set size of SHM"), 6738 KMP_ERR(errno), __kmp_msg_null); 6739 } 6740 } 6741 data1 = 6742 (char *)mmap(0, SHM_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED, fd1, 0); 6743 if (data1 == MAP_FAILED) { 6744 // failed to map shared memory 6745 __kmp_fatal(KMP_MSG(FunctionError, "Can't map SHM"), KMP_ERR(errno), 6746 __kmp_msg_null); 6747 } 6748 if (shm_preexist == 0) { // set data to SHM, set value 6749 KMP_STRCPY_S(data1, SHM_SIZE, __kmp_registration_str); 6750 } 6751 // Read the value from either what we just wrote or the existing file. 6752 value = __kmp_str_format("%s", data1); // read value from SHM 6753 munmap(data1, SHM_SIZE); 6754 close(fd1); 6755 #else // Windows and unix with static library 6756 // Set the environment variable, but do not overwrite it if it already exists. 6757 __kmp_env_set(name, __kmp_registration_str, 0); 6758 // read value to see if it got set 6759 value = __kmp_env_get(name); 6760 #endif 6761 6762 if (value != NULL && strcmp(value, __kmp_registration_str) == 0) { 6763 done = 1; // Ok, environment variable set successfully, exit the loop. 6764 } else { 6765 // Oops. Write failed. Another copy of OpenMP RTL is in memory. 6766 // Check whether it is alive or dead. 6767 int neighbor = 0; // 0 -- unknown status, 1 -- alive, 2 -- dead. 6768 char *tail = value; 6769 char *flag_addr_str = NULL; 6770 char *flag_val_str = NULL; 6771 char const *file_name = NULL; 6772 __kmp_str_split(tail, '-', &flag_addr_str, &tail); 6773 __kmp_str_split(tail, '-', &flag_val_str, &tail); 6774 file_name = tail; 6775 if (tail != NULL) { 6776 unsigned long *flag_addr = 0; 6777 unsigned long flag_val = 0; 6778 KMP_SSCANF(flag_addr_str, "%p", RCAST(void **, &flag_addr)); 6779 KMP_SSCANF(flag_val_str, "%lx", &flag_val); 6780 if (flag_addr != 0 && flag_val != 0 && strcmp(file_name, "") != 0) { 6781 // First, check whether environment-encoded address is mapped into 6782 // addr space. 6783 // If so, dereference it to see if it still has the right value. 6784 if (__kmp_is_address_mapped(flag_addr) && *flag_addr == flag_val) { 6785 neighbor = 1; 6786 } else { 6787 // If not, then we know the other copy of the library is no longer 6788 // running. 6789 neighbor = 2; 6790 } 6791 } 6792 } 6793 switch (neighbor) { 6794 case 0: // Cannot parse environment variable -- neighbor status unknown. 6795 // Assume it is the incompatible format of a future version of the 6796 // library. Assume the other library is alive. 6797 // WARN( ... ); // TODO: Issue a warning. 6798 file_name = "unknown library"; 6799 KMP_FALLTHROUGH(); 6800 // Attention! Falling through to the next case is intentional. 6801 case 1: { // Neighbor is alive. 6802 // Check whether it is allowed. 6803 char *duplicate_ok = __kmp_env_get("KMP_DUPLICATE_LIB_OK"); 6804 if (!__kmp_str_match_true(duplicate_ok)) { 6805 // That's not allowed. Issue fatal error. 6806 __kmp_fatal(KMP_MSG(DuplicateLibrary, KMP_LIBRARY_FILE, file_name), 6807 KMP_HNT(DuplicateLibrary), __kmp_msg_null); 6808 } 6809 KMP_INTERNAL_FREE(duplicate_ok); 6810 __kmp_duplicate_library_ok = 1; 6811 done = 1; // Exit the loop. 6812 } break; 6813 case 2: { // Neighbor is dead. 6814 6815 #if defined(KMP_USE_SHM) 6816 // close shared memory.
6817 shm_unlink(shm_name); // this removes file in /dev/shm 6818 #else 6819 // Clear the variable and try to register library again. 6820 __kmp_env_unset(name); 6821 #endif 6822 } break; 6823 default: { 6824 KMP_DEBUG_ASSERT(0); 6825 } break; 6826 } 6827 } 6828 KMP_INTERNAL_FREE((void *)value); 6829 #if defined(KMP_USE_SHM) 6830 KMP_INTERNAL_FREE((void *)shm_name); 6831 #endif 6832 } // while 6833 KMP_INTERNAL_FREE((void *)name); 6834 6835 } // func __kmp_register_library_startup 6836 6837 void __kmp_unregister_library(void) { 6838 6839 char *name = __kmp_reg_status_name(); 6840 char *value = NULL; 6841 6842 #if defined(KMP_USE_SHM) 6843 char *shm_name = __kmp_str_format("/%s", name); 6844 int fd1 = shm_open(shm_name, O_RDONLY, 0666); 6845 if (fd1 == -1) { 6846 // file did not open. return. 6847 return; 6848 } 6849 char *data1 = (char *)mmap(0, SHM_SIZE, PROT_READ, MAP_SHARED, fd1, 0); 6850 if (data1 != MAP_FAILED) { 6851 value = __kmp_str_format("%s", data1); // read value from SHM 6852 munmap(data1, SHM_SIZE); 6853 } 6854 close(fd1); 6855 #else 6856 value = __kmp_env_get(name); 6857 #endif 6858 6859 KMP_DEBUG_ASSERT(__kmp_registration_flag != 0); 6860 KMP_DEBUG_ASSERT(__kmp_registration_str != NULL); 6861 if (value != NULL && strcmp(value, __kmp_registration_str) == 0) { 6862 // Ok, this is our variable. Delete it. 6863 #if defined(KMP_USE_SHM) 6864 shm_unlink(shm_name); // this removes file in /dev/shm 6865 #else 6866 __kmp_env_unset(name); 6867 #endif 6868 } 6869 6870 #if defined(KMP_USE_SHM) 6871 KMP_INTERNAL_FREE(shm_name); 6872 #endif 6873 6874 KMP_INTERNAL_FREE(__kmp_registration_str); 6875 KMP_INTERNAL_FREE(value); 6876 KMP_INTERNAL_FREE(name); 6877 6878 __kmp_registration_flag = 0; 6879 __kmp_registration_str = NULL; 6880 6881 } // __kmp_unregister_library 6882 6883 // End of Library registration stuff. 6884 // ----------------------------------------------------------------------------- 6885 6886 #if KMP_MIC_SUPPORTED 6887 6888 static void __kmp_check_mic_type() { 6889 kmp_cpuid_t cpuid_state = {0}; 6890 kmp_cpuid_t *cs_p = &cpuid_state; 6891 __kmp_x86_cpuid(1, 0, cs_p); 6892 // We don't support mic1 at the moment 6893 if ((cs_p->eax & 0xff0) == 0xB10) { 6894 __kmp_mic_type = mic2; 6895 } else if ((cs_p->eax & 0xf0ff0) == 0x50670) { 6896 __kmp_mic_type = mic3; 6897 } else { 6898 __kmp_mic_type = non_mic; 6899 } 6900 } 6901 6902 #endif /* KMP_MIC_SUPPORTED */ 6903 6904 #if KMP_HAVE_UMWAIT 6905 static void __kmp_user_level_mwait_init() { 6906 struct kmp_cpuid buf; 6907 __kmp_x86_cpuid(7, 0, &buf); 6908 __kmp_waitpkg_enabled = ((buf.ecx >> 5) & 1); 6909 __kmp_umwait_enabled = __kmp_waitpkg_enabled && __kmp_user_level_mwait; 6910 __kmp_tpause_enabled = __kmp_waitpkg_enabled && (__kmp_tpause_state > 0); 6911 KF_TRACE(30, ("__kmp_user_level_mwait_init: __kmp_umwait_enabled = %d\n", 6912 __kmp_umwait_enabled)); 6913 } 6914 #elif KMP_HAVE_MWAIT 6915 #ifndef AT_INTELPHIUSERMWAIT 6916 // Spurious, non-existent value that should always fail to return anything. 6917 // Will be replaced with the correct value when we know that. 6918 #define AT_INTELPHIUSERMWAIT 10000 6919 #endif 6920 // getauxval() function is available in RHEL7 and SLES12. If a system with an 6921 // earlier OS is used to build the RTL, we'll use the following internal 6922 // function when the entry is not found. 
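// Illustrative sketch (not part of the runtime): the weak getauxval()
// definition that follows is a link-time fallback. With GCC/Clang, a symbol
// marked weak is used only when no strong definition is present, so on newer
// glibc the real getauxval() from libc wins and the stub returning 0 is
// ignored. A minimal standalone illustration of the pattern is sketched here;
// the function name query_feature is made up for the example.
#if 0
#include <cstdio>

// Weak fallback: used only if the final link provides no strong definition.
extern "C" unsigned long query_feature(unsigned long) __attribute__((weak));
extern "C" unsigned long query_feature(unsigned long) { return 0; }

int main() {
  // Prints 0 unless some other object file links in a strong query_feature().
  std::printf("%lu\n", query_feature(42));
}
#endif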
6923 unsigned long getauxval(unsigned long) KMP_WEAK_ATTRIBUTE_EXTERNAL; 6924 unsigned long getauxval(unsigned long) { return 0; } 6925 6926 static void __kmp_user_level_mwait_init() { 6927 // When getauxval() and correct value of AT_INTELPHIUSERMWAIT are available 6928 // use them to find if the user-level mwait is enabled. Otherwise, forcibly 6929 // set __kmp_mwait_enabled=TRUE on Intel MIC if the environment variable 6930 // KMP_USER_LEVEL_MWAIT was set to TRUE. 6931 if (__kmp_mic_type == mic3) { 6932 unsigned long res = getauxval(AT_INTELPHIUSERMWAIT); 6933 if ((res & 0x1) || __kmp_user_level_mwait) { 6934 __kmp_mwait_enabled = TRUE; 6935 if (__kmp_user_level_mwait) { 6936 KMP_INFORM(EnvMwaitWarn); 6937 } 6938 } else { 6939 __kmp_mwait_enabled = FALSE; 6940 } 6941 } 6942 KF_TRACE(30, ("__kmp_user_level_mwait_init: __kmp_mic_type = %d, " 6943 "__kmp_mwait_enabled = %d\n", 6944 __kmp_mic_type, __kmp_mwait_enabled)); 6945 } 6946 #endif /* KMP_HAVE_UMWAIT */ 6947 6948 static void __kmp_do_serial_initialize(void) { 6949 int i, gtid; 6950 size_t size; 6951 6952 KA_TRACE(10, ("__kmp_do_serial_initialize: enter\n")); 6953 6954 KMP_DEBUG_ASSERT(sizeof(kmp_int32) == 4); 6955 KMP_DEBUG_ASSERT(sizeof(kmp_uint32) == 4); 6956 KMP_DEBUG_ASSERT(sizeof(kmp_int64) == 8); 6957 KMP_DEBUG_ASSERT(sizeof(kmp_uint64) == 8); 6958 KMP_DEBUG_ASSERT(sizeof(kmp_intptr_t) == sizeof(void *)); 6959 6960 #if OMPT_SUPPORT 6961 ompt_pre_init(); 6962 #endif 6963 #if OMPD_SUPPORT 6964 __kmp_env_dump(); 6965 ompd_init(); 6966 #endif 6967 6968 __kmp_validate_locks(); 6969 6970 /* Initialize internal memory allocator */ 6971 __kmp_init_allocator(); 6972 6973 /* Register the library startup via an environment variable or via mapped 6974 shared memory file and check to see whether another copy of the library is 6975 already registered. 
Since forked child process is often terminated, we 6976 postpone the registration till middle initialization in the child */ 6977 if (__kmp_need_register_serial) 6978 __kmp_register_library_startup(); 6979 6980 /* TODO reinitialization of library */ 6981 if (TCR_4(__kmp_global.g.g_done)) { 6982 KA_TRACE(10, ("__kmp_do_serial_initialize: reinitialization of library\n")); 6983 } 6984 6985 __kmp_global.g.g_abort = 0; 6986 TCW_SYNC_4(__kmp_global.g.g_done, FALSE); 6987 6988 /* initialize the locks */ 6989 #if KMP_USE_ADAPTIVE_LOCKS 6990 #if KMP_DEBUG_ADAPTIVE_LOCKS 6991 __kmp_init_speculative_stats(); 6992 #endif 6993 #endif 6994 #if KMP_STATS_ENABLED 6995 __kmp_stats_init(); 6996 #endif 6997 __kmp_init_lock(&__kmp_global_lock); 6998 __kmp_init_queuing_lock(&__kmp_dispatch_lock); 6999 __kmp_init_lock(&__kmp_debug_lock); 7000 __kmp_init_atomic_lock(&__kmp_atomic_lock); 7001 __kmp_init_atomic_lock(&__kmp_atomic_lock_1i); 7002 __kmp_init_atomic_lock(&__kmp_atomic_lock_2i); 7003 __kmp_init_atomic_lock(&__kmp_atomic_lock_4i); 7004 __kmp_init_atomic_lock(&__kmp_atomic_lock_4r); 7005 __kmp_init_atomic_lock(&__kmp_atomic_lock_8i); 7006 __kmp_init_atomic_lock(&__kmp_atomic_lock_8r); 7007 __kmp_init_atomic_lock(&__kmp_atomic_lock_8c); 7008 __kmp_init_atomic_lock(&__kmp_atomic_lock_10r); 7009 __kmp_init_atomic_lock(&__kmp_atomic_lock_16r); 7010 __kmp_init_atomic_lock(&__kmp_atomic_lock_16c); 7011 __kmp_init_atomic_lock(&__kmp_atomic_lock_20c); 7012 __kmp_init_atomic_lock(&__kmp_atomic_lock_32c); 7013 __kmp_init_bootstrap_lock(&__kmp_forkjoin_lock); 7014 __kmp_init_bootstrap_lock(&__kmp_exit_lock); 7015 #if KMP_USE_MONITOR 7016 __kmp_init_bootstrap_lock(&__kmp_monitor_lock); 7017 #endif 7018 __kmp_init_bootstrap_lock(&__kmp_tp_cached_lock); 7019 7020 /* conduct initialization and initial setup of configuration */ 7021 7022 __kmp_runtime_initialize(); 7023 7024 #if KMP_MIC_SUPPORTED 7025 __kmp_check_mic_type(); 7026 #endif 7027 7028 // Some global variable initialization moved here from kmp_env_initialize() 7029 #ifdef KMP_DEBUG 7030 kmp_diag = 0; 7031 #endif 7032 __kmp_abort_delay = 0; 7033 7034 // From __kmp_init_dflt_team_nth() 7035 /* assume the entire machine will be used */ 7036 __kmp_dflt_team_nth_ub = __kmp_xproc; 7037 if (__kmp_dflt_team_nth_ub < KMP_MIN_NTH) { 7038 __kmp_dflt_team_nth_ub = KMP_MIN_NTH; 7039 } 7040 if (__kmp_dflt_team_nth_ub > __kmp_sys_max_nth) { 7041 __kmp_dflt_team_nth_ub = __kmp_sys_max_nth; 7042 } 7043 __kmp_max_nth = __kmp_sys_max_nth; 7044 __kmp_cg_max_nth = __kmp_sys_max_nth; 7045 __kmp_teams_max_nth = __kmp_xproc; // set a "reasonable" default 7046 if (__kmp_teams_max_nth > __kmp_sys_max_nth) { 7047 __kmp_teams_max_nth = __kmp_sys_max_nth; 7048 } 7049 7050 // Three vars below moved here from __kmp_env_initialize() "KMP_BLOCKTIME" 7051 // part 7052 __kmp_dflt_blocktime = KMP_DEFAULT_BLOCKTIME; 7053 #if KMP_USE_MONITOR 7054 __kmp_monitor_wakeups = 7055 KMP_WAKEUPS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups); 7056 __kmp_bt_intervals = 7057 KMP_INTERVALS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups); 7058 #endif 7059 // From "KMP_LIBRARY" part of __kmp_env_initialize() 7060 __kmp_library = library_throughput; 7061 // From KMP_SCHEDULE initialization 7062 __kmp_static = kmp_sch_static_balanced; 7063 // AC: do not use analytical here, because it is non-monotonous 7064 //__kmp_guided = kmp_sch_guided_iterative_chunked; 7065 //__kmp_auto = kmp_sch_guided_analytical_chunked; // AC: it is the default, no 7066 // need to repeat assignment 7067 // 
Barrier initialization. Moved here from __kmp_env_initialize() Barrier branch 7068 // bit control and barrier method control parts 7069 #if KMP_FAST_REDUCTION_BARRIER 7070 #define kmp_reduction_barrier_gather_bb ((int)1) 7071 #define kmp_reduction_barrier_release_bb ((int)1) 7072 #define kmp_reduction_barrier_gather_pat __kmp_barrier_gather_pat_dflt 7073 #define kmp_reduction_barrier_release_pat __kmp_barrier_release_pat_dflt 7074 #endif // KMP_FAST_REDUCTION_BARRIER 7075 for (i = bs_plain_barrier; i < bs_last_barrier; i++) { 7076 __kmp_barrier_gather_branch_bits[i] = __kmp_barrier_gather_bb_dflt; 7077 __kmp_barrier_release_branch_bits[i] = __kmp_barrier_release_bb_dflt; 7078 __kmp_barrier_gather_pattern[i] = __kmp_barrier_gather_pat_dflt; 7079 __kmp_barrier_release_pattern[i] = __kmp_barrier_release_pat_dflt; 7080 #if KMP_FAST_REDUCTION_BARRIER 7081 if (i == bs_reduction_barrier) { // tested and confirmed on ALTIX only ( 7082 // lin_64 ): hyper,1 7083 __kmp_barrier_gather_branch_bits[i] = kmp_reduction_barrier_gather_bb; 7084 __kmp_barrier_release_branch_bits[i] = kmp_reduction_barrier_release_bb; 7085 __kmp_barrier_gather_pattern[i] = kmp_reduction_barrier_gather_pat; 7086 __kmp_barrier_release_pattern[i] = kmp_reduction_barrier_release_pat; 7087 } 7088 #endif // KMP_FAST_REDUCTION_BARRIER 7089 } 7090 #if KMP_FAST_REDUCTION_BARRIER 7091 #undef kmp_reduction_barrier_release_pat 7092 #undef kmp_reduction_barrier_gather_pat 7093 #undef kmp_reduction_barrier_release_bb 7094 #undef kmp_reduction_barrier_gather_bb 7095 #endif // KMP_FAST_REDUCTION_BARRIER 7096 #if KMP_MIC_SUPPORTED 7097 if (__kmp_mic_type == mic2) { // KNC 7098 // AC: plane=3,2, forkjoin=2,1 are optimal for 240 threads on KNC 7099 __kmp_barrier_gather_branch_bits[bs_plain_barrier] = 3; // plain gather 7100 __kmp_barrier_release_branch_bits[bs_forkjoin_barrier] = 7101 1; // forkjoin release 7102 __kmp_barrier_gather_pattern[bs_forkjoin_barrier] = bp_hierarchical_bar; 7103 __kmp_barrier_release_pattern[bs_forkjoin_barrier] = bp_hierarchical_bar; 7104 } 7105 #if KMP_FAST_REDUCTION_BARRIER 7106 if (__kmp_mic_type == mic2) { // KNC 7107 __kmp_barrier_gather_pattern[bs_reduction_barrier] = bp_hierarchical_bar; 7108 __kmp_barrier_release_pattern[bs_reduction_barrier] = bp_hierarchical_bar; 7109 } 7110 #endif // KMP_FAST_REDUCTION_BARRIER 7111 #endif // KMP_MIC_SUPPORTED 7112 7113 // From KMP_CHECKS initialization 7114 #ifdef KMP_DEBUG 7115 __kmp_env_checks = TRUE; /* development versions have the extra checks */ 7116 #else 7117 __kmp_env_checks = FALSE; /* port versions do not have the extra checks */ 7118 #endif 7119 7120 // From "KMP_FOREIGN_THREADS_THREADPRIVATE" initialization 7121 __kmp_foreign_tp = TRUE; 7122 7123 __kmp_global.g.g_dynamic = FALSE; 7124 __kmp_global.g.g_dynamic_mode = dynamic_default; 7125 7126 __kmp_init_nesting_mode(); 7127 7128 __kmp_env_initialize(NULL); 7129 7130 #if KMP_HAVE_MWAIT || KMP_HAVE_UMWAIT 7131 __kmp_user_level_mwait_init(); 7132 #endif 7133 // Print all messages in message catalog for testing purposes. 
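// Illustrative sketch (not part of the runtime): the KMP_DEBUG block below
// follows a common pattern -- gate an expensive diagnostic dump on an
// environment variable read once during initialization. A standalone
// equivalent is sketched here using std::getenv; KMP_DUMP_CATALOG is the
// variable the runtime actually checks, while env_is_true and its simplified
// truthy-string test are made up for the example (the runtime uses
// __kmp_str_match_true).
#if 0
#include <cstdio>
#include <cstdlib>
#include <cstring>

static bool env_is_true(const char *name) {
  const char *v = std::getenv(name);
  return v && (!std::strcmp(v, "1") || !std::strcmp(v, "true") ||
               !std::strcmp(v, "TRUE"));
}

int main() {
  if (env_is_true("KMP_DUMP_CATALOG"))
    std::printf("<dump of the i18n message catalog would go here>\n");
}
#endif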
7134 #ifdef KMP_DEBUG 7135 char const *val = __kmp_env_get("KMP_DUMP_CATALOG"); 7136 if (__kmp_str_match_true(val)) { 7137 kmp_str_buf_t buffer; 7138 __kmp_str_buf_init(&buffer); 7139 __kmp_i18n_dump_catalog(&buffer); 7140 __kmp_printf("%s", buffer.str); 7141 __kmp_str_buf_free(&buffer); 7142 } 7143 __kmp_env_free(&val); 7144 #endif 7145 7146 __kmp_threads_capacity = 7147 __kmp_initial_threads_capacity(__kmp_dflt_team_nth_ub); 7148 // Moved here from __kmp_env_initialize() "KMP_ALL_THREADPRIVATE" part 7149 __kmp_tp_capacity = __kmp_default_tp_capacity( 7150 __kmp_dflt_team_nth_ub, __kmp_max_nth, __kmp_allThreadsSpecified); 7151 7152 // If the library is shut down properly, both pools must be NULL. Just in 7153 // case, set them to NULL -- some memory may leak, but subsequent code will 7154 // work even if pools are not freed. 7155 KMP_DEBUG_ASSERT(__kmp_thread_pool == NULL); 7156 KMP_DEBUG_ASSERT(__kmp_thread_pool_insert_pt == NULL); 7157 KMP_DEBUG_ASSERT(__kmp_team_pool == NULL); 7158 __kmp_thread_pool = NULL; 7159 __kmp_thread_pool_insert_pt = NULL; 7160 __kmp_team_pool = NULL; 7161 7162 /* Allocate all of the variable sized records */ 7163 /* NOTE: __kmp_threads_capacity entries are allocated, but the arrays are 7164 * expandable */ 7165 /* Since allocation is cache-aligned, just add extra padding at the end */ 7166 size = 7167 (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * __kmp_threads_capacity + 7168 CACHE_LINE; 7169 __kmp_threads = (kmp_info_t **)__kmp_allocate(size); 7170 __kmp_root = (kmp_root_t **)((char *)__kmp_threads + 7171 sizeof(kmp_info_t *) * __kmp_threads_capacity); 7172 7173 /* init thread counts */ 7174 KMP_DEBUG_ASSERT(__kmp_all_nth == 7175 0); // Asserts fail if the library is reinitializing and 7176 KMP_DEBUG_ASSERT(__kmp_nth == 0); // something was wrong in termination. 7177 __kmp_all_nth = 0; 7178 __kmp_nth = 0; 7179 7180 /* setup the uber master thread and hierarchy */ 7181 gtid = __kmp_register_root(TRUE); 7182 KA_TRACE(10, ("__kmp_do_serial_initialize T#%d\n", gtid)); 7183 KMP_ASSERT(KMP_UBER_GTID(gtid)); 7184 KMP_ASSERT(KMP_INITIAL_GTID(gtid)); 7185 7186 KMP_MB(); /* Flush all pending memory write invalidates. */ 7187 7188 __kmp_common_initialize(); 7189 7190 #if KMP_OS_UNIX 7191 /* invoke the child fork handler */ 7192 __kmp_register_atfork(); 7193 #endif 7194 7195 #if !KMP_DYNAMIC_LIB 7196 { 7197 /* Invoke the exit handler when the program finishes, only for static 7198 library. For dynamic library, we already have _fini and DllMain. */ 7199 int rc = atexit(__kmp_internal_end_atexit); 7200 if (rc != 0) { 7201 __kmp_fatal(KMP_MSG(FunctionError, "atexit()"), KMP_ERR(rc), 7202 __kmp_msg_null); 7203 } 7204 } 7205 #endif 7206 7207 #if KMP_HANDLE_SIGNALS 7208 #if KMP_OS_UNIX 7209 /* NOTE: make sure that this is called before the user installs their own 7210 signal handlers so that the user handlers are called first. this way they 7211 can return false, not call our handler, avoid terminating the library, and 7212 continue execution where they left off. 
*/ 7213 __kmp_install_signals(FALSE); 7214 #endif /* KMP_OS_UNIX */ 7215 #if KMP_OS_WINDOWS 7216 __kmp_install_signals(TRUE); 7217 #endif /* KMP_OS_WINDOWS */ 7218 #endif 7219 7220 /* we have finished the serial initialization */ 7221 __kmp_init_counter++; 7222 7223 __kmp_init_serial = TRUE; 7224 7225 if (__kmp_settings) { 7226 __kmp_env_print(); 7227 } 7228 7229 if (__kmp_display_env || __kmp_display_env_verbose) { 7230 __kmp_env_print_2(); 7231 } 7232 7233 #if OMPT_SUPPORT 7234 ompt_post_init(); 7235 #endif 7236 7237 KMP_MB(); 7238 7239 KA_TRACE(10, ("__kmp_do_serial_initialize: exit\n")); 7240 } 7241 7242 void __kmp_serial_initialize(void) { 7243 if (__kmp_init_serial) { 7244 return; 7245 } 7246 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock); 7247 if (__kmp_init_serial) { 7248 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 7249 return; 7250 } 7251 __kmp_do_serial_initialize(); 7252 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 7253 } 7254 7255 static void __kmp_do_middle_initialize(void) { 7256 int i, j; 7257 int prev_dflt_team_nth; 7258 7259 if (!__kmp_init_serial) { 7260 __kmp_do_serial_initialize(); 7261 } 7262 7263 KA_TRACE(10, ("__kmp_middle_initialize: enter\n")); 7264 7265 if (UNLIKELY(!__kmp_need_register_serial)) { 7266 // We are in a forked child process. The registration was skipped during 7267 // serial initialization in __kmp_atfork_child handler. Do it here. 7268 __kmp_register_library_startup(); 7269 } 7270 7271 // Save the previous value for the __kmp_dflt_team_nth so that 7272 // we can avoid some reinitialization if it hasn't changed. 7273 prev_dflt_team_nth = __kmp_dflt_team_nth; 7274 7275 #if KMP_AFFINITY_SUPPORTED 7276 // __kmp_affinity_initialize() will try to set __kmp_ncores to the 7277 // number of cores on the machine. 7278 __kmp_affinity_initialize(); 7279 7280 #endif /* KMP_AFFINITY_SUPPORTED */ 7281 7282 KMP_ASSERT(__kmp_xproc > 0); 7283 if (__kmp_avail_proc == 0) { 7284 __kmp_avail_proc = __kmp_xproc; 7285 } 7286 7287 // If there were empty places in num_threads list (OMP_NUM_THREADS=,,2,3), 7288 // correct them now 7289 j = 0; 7290 while ((j < __kmp_nested_nth.used) && !__kmp_nested_nth.nth[j]) { 7291 __kmp_nested_nth.nth[j] = __kmp_dflt_team_nth = __kmp_dflt_team_nth_ub = 7292 __kmp_avail_proc; 7293 j++; 7294 } 7295 7296 if (__kmp_dflt_team_nth == 0) { 7297 #ifdef KMP_DFLT_NTH_CORES 7298 // Default #threads = #cores 7299 __kmp_dflt_team_nth = __kmp_ncores; 7300 KA_TRACE(20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = " 7301 "__kmp_ncores (%d)\n", 7302 __kmp_dflt_team_nth)); 7303 #else 7304 // Default #threads = #available OS procs 7305 __kmp_dflt_team_nth = __kmp_avail_proc; 7306 KA_TRACE(20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = " 7307 "__kmp_avail_proc(%d)\n", 7308 __kmp_dflt_team_nth)); 7309 #endif /* KMP_DFLT_NTH_CORES */ 7310 } 7311 7312 if (__kmp_dflt_team_nth < KMP_MIN_NTH) { 7313 __kmp_dflt_team_nth = KMP_MIN_NTH; 7314 } 7315 if (__kmp_dflt_team_nth > __kmp_sys_max_nth) { 7316 __kmp_dflt_team_nth = __kmp_sys_max_nth; 7317 } 7318 7319 if (__kmp_nesting_mode > 0) 7320 __kmp_set_nesting_mode_threads(); 7321 7322 // There's no harm in continuing if the following check fails, 7323 // but it indicates an error in the previous logic. 
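// Illustrative sketch (not part of the runtime): the default team size chosen
// above is the core/processor count clamped into [KMP_MIN_NTH,
// __kmp_sys_max_nth], and the upper bound it is compared against in the
// assertion below was produced by the same kind of clamp. A standalone
// restatement of that clamp is sketched here; kMinNth and kSysMaxNth are
// placeholders, not the runtime's actual values.
#if 0
constexpr int kMinNth = 1;      // stands in for KMP_MIN_NTH
constexpr int kSysMaxNth = 256; // stands in for __kmp_sys_max_nth

static int clamp_team_size(int n) {
  // Same shape as the runtime's logic: raise to the minimum, cap at the
  // system maximum.
  if (n < kMinNth)
    n = kMinNth;
  if (n > kSysMaxNth)
    n = kSysMaxNth;
  return n;
}
// Both the default team size and its upper bound come from this kind of
// clamp, which is why the assertion that follows is expected to hold.
#endif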
7324 KMP_DEBUG_ASSERT(__kmp_dflt_team_nth <= __kmp_dflt_team_nth_ub); 7325 7326 if (__kmp_dflt_team_nth != prev_dflt_team_nth) { 7327 // Run through the __kmp_threads array and set the num threads icv for each 7328 // root thread that is currently registered with the RTL (which has not 7329 // already explicitly set its nthreads-var with a call to 7330 // omp_set_num_threads()). 7331 for (i = 0; i < __kmp_threads_capacity; i++) { 7332 kmp_info_t *thread = __kmp_threads[i]; 7333 if (thread == NULL) 7334 continue; 7335 if (thread->th.th_current_task->td_icvs.nproc != 0) 7336 continue; 7337 7338 set__nproc(__kmp_threads[i], __kmp_dflt_team_nth); 7339 } 7340 } 7341 KA_TRACE( 7342 20, 7343 ("__kmp_middle_initialize: final value for __kmp_dflt_team_nth = %d\n", 7344 __kmp_dflt_team_nth)); 7345 7346 #ifdef KMP_ADJUST_BLOCKTIME 7347 /* Adjust blocktime to zero if necessary now that __kmp_avail_proc is set */ 7348 if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) { 7349 KMP_DEBUG_ASSERT(__kmp_avail_proc > 0); 7350 if (__kmp_nth > __kmp_avail_proc) { 7351 __kmp_zero_bt = TRUE; 7352 } 7353 } 7354 #endif /* KMP_ADJUST_BLOCKTIME */ 7355 7356 /* we have finished middle initialization */ 7357 TCW_SYNC_4(__kmp_init_middle, TRUE); 7358 7359 KA_TRACE(10, ("__kmp_do_middle_initialize: exit\n")); 7360 } 7361 7362 void __kmp_middle_initialize(void) { 7363 if (__kmp_init_middle) { 7364 return; 7365 } 7366 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock); 7367 if (__kmp_init_middle) { 7368 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 7369 return; 7370 } 7371 __kmp_do_middle_initialize(); 7372 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 7373 } 7374 7375 void __kmp_parallel_initialize(void) { 7376 int gtid = __kmp_entry_gtid(); // this might be a new root 7377 7378 /* synchronize parallel initialization (for sibling) */ 7379 if (TCR_4(__kmp_init_parallel)) 7380 return; 7381 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock); 7382 if (TCR_4(__kmp_init_parallel)) { 7383 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 7384 return; 7385 } 7386 7387 /* TODO reinitialization after we have already shut down */ 7388 if (TCR_4(__kmp_global.g.g_done)) { 7389 KA_TRACE( 7390 10, 7391 ("__kmp_parallel_initialize: attempt to init while shutting down\n")); 7392 __kmp_infinite_loop(); 7393 } 7394 7395 /* jc: The lock __kmp_initz_lock is already held, so calling 7396 __kmp_serial_initialize would cause a deadlock. So we call 7397 __kmp_do_serial_initialize directly. */ 7398 if (!__kmp_init_middle) { 7399 __kmp_do_middle_initialize(); 7400 } 7401 __kmp_assign_root_init_mask(); 7402 __kmp_resume_if_hard_paused(); 7403 7404 /* begin initialization */ 7405 KA_TRACE(10, ("__kmp_parallel_initialize: enter\n")); 7406 KMP_ASSERT(KMP_UBER_GTID(gtid)); 7407 7408 #if KMP_ARCH_X86 || KMP_ARCH_X86_64 7409 // Save the FP control regs. 7410 // Worker threads will set theirs to these values at thread startup. 
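// Illustrative sketch (not part of the runtime): the calls below capture the
// primary thread's x87 control word and MXCSR so that workers can adopt the
// same floating-point settings at startup. Outside this runtime, the usual way
// to read and write MXCSR is the _mm_getcsr/_mm_setcsr intrinsics, as sketched
// here; the runtime uses its own __kmp_store_*/__kmp_load_* wrappers, and the
// mask constant below is a placeholder, not KMP_X86_MXCSR_MASK.
#if 0
#include <xmmintrin.h>

static unsigned int saved_mxcsr;

static void capture_fp_state() {
  // Keep the control/rounding bits, drop the sticky status flags (bits 0-5).
  saved_mxcsr = _mm_getcsr() & 0xffffffc0u;
}

static void apply_fp_state_in_worker() {
  _mm_setcsr(saved_mxcsr); // each worker adopts the primary thread's settings
}
#endif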
7411 __kmp_store_x87_fpu_control_word(&__kmp_init_x87_fpu_control_word); 7412 __kmp_store_mxcsr(&__kmp_init_mxcsr); 7413 __kmp_init_mxcsr &= KMP_X86_MXCSR_MASK; 7414 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */ 7415 7416 #if KMP_OS_UNIX 7417 #if KMP_HANDLE_SIGNALS 7418 /* must be after __kmp_serial_initialize */ 7419 __kmp_install_signals(TRUE); 7420 #endif 7421 #endif 7422 7423 __kmp_suspend_initialize(); 7424 7425 #if defined(USE_LOAD_BALANCE) 7426 if (__kmp_global.g.g_dynamic_mode == dynamic_default) { 7427 __kmp_global.g.g_dynamic_mode = dynamic_load_balance; 7428 } 7429 #else 7430 if (__kmp_global.g.g_dynamic_mode == dynamic_default) { 7431 __kmp_global.g.g_dynamic_mode = dynamic_thread_limit; 7432 } 7433 #endif 7434 7435 if (__kmp_version) { 7436 __kmp_print_version_2(); 7437 } 7438 7439 /* we have finished parallel initialization */ 7440 TCW_SYNC_4(__kmp_init_parallel, TRUE); 7441 7442 KMP_MB(); 7443 KA_TRACE(10, ("__kmp_parallel_initialize: exit\n")); 7444 7445 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 7446 } 7447 7448 void __kmp_hidden_helper_initialize() { 7449 if (TCR_4(__kmp_init_hidden_helper)) 7450 return; 7451 7452 // __kmp_parallel_initialize is required before we initialize hidden helper 7453 if (!TCR_4(__kmp_init_parallel)) 7454 __kmp_parallel_initialize(); 7455 7456 // Double check. Note that this double check should not be placed before 7457 // __kmp_parallel_initialize as it will cause dead lock. 7458 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock); 7459 if (TCR_4(__kmp_init_hidden_helper)) { 7460 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 7461 return; 7462 } 7463 7464 // Set the count of hidden helper tasks to be executed to zero 7465 KMP_ATOMIC_ST_REL(&__kmp_unexecuted_hidden_helper_tasks, 0); 7466 7467 // Set the global variable indicating that we're initializing hidden helper 7468 // team/threads 7469 TCW_SYNC_4(__kmp_init_hidden_helper_threads, TRUE); 7470 7471 // Platform independent initialization 7472 __kmp_do_initialize_hidden_helper_threads(); 7473 7474 // Wait here for the finish of initialization of hidden helper teams 7475 __kmp_hidden_helper_threads_initz_wait(); 7476 7477 // We have finished hidden helper initialization 7478 TCW_SYNC_4(__kmp_init_hidden_helper, TRUE); 7479 7480 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 7481 } 7482 7483 /* ------------------------------------------------------------------------ */ 7484 7485 void __kmp_run_before_invoked_task(int gtid, int tid, kmp_info_t *this_thr, 7486 kmp_team_t *team) { 7487 kmp_disp_t *dispatch; 7488 7489 KMP_MB(); 7490 7491 /* none of the threads have encountered any constructs, yet. */ 7492 this_thr->th.th_local.this_construct = 0; 7493 #if KMP_CACHE_MANAGE 7494 KMP_CACHE_PREFETCH(&this_thr->th.th_bar[bs_forkjoin_barrier].bb.b_arrived); 7495 #endif /* KMP_CACHE_MANAGE */ 7496 dispatch = (kmp_disp_t *)TCR_PTR(this_thr->th.th_dispatch); 7497 KMP_DEBUG_ASSERT(dispatch); 7498 KMP_DEBUG_ASSERT(team->t.t_dispatch); 7499 // KMP_DEBUG_ASSERT( this_thr->th.th_dispatch == &team->t.t_dispatch[ 7500 // this_thr->th.th_info.ds.ds_tid ] ); 7501 7502 dispatch->th_disp_index = 0; /* reset the dispatch buffer counter */ 7503 dispatch->th_doacross_buf_idx = 0; // reset doacross dispatch buffer counter 7504 if (__kmp_env_consistency_check) 7505 __kmp_push_parallel(gtid, team->t.t_ident); 7506 7507 KMP_MB(); /* Flush all pending memory write invalidates. 
*/ 7508 } 7509 7510 void __kmp_run_after_invoked_task(int gtid, int tid, kmp_info_t *this_thr, 7511 kmp_team_t *team) { 7512 if (__kmp_env_consistency_check) 7513 __kmp_pop_parallel(gtid, team->t.t_ident); 7514 7515 __kmp_finish_implicit_task(this_thr); 7516 } 7517 7518 int __kmp_invoke_task_func(int gtid) { 7519 int rc; 7520 int tid = __kmp_tid_from_gtid(gtid); 7521 kmp_info_t *this_thr = __kmp_threads[gtid]; 7522 kmp_team_t *team = this_thr->th.th_team; 7523 7524 __kmp_run_before_invoked_task(gtid, tid, this_thr, team); 7525 #if USE_ITT_BUILD 7526 if (__itt_stack_caller_create_ptr) { 7527 // inform ittnotify about entering user's code 7528 if (team->t.t_stack_id != NULL) { 7529 __kmp_itt_stack_callee_enter((__itt_caller)team->t.t_stack_id); 7530 } else { 7531 KMP_DEBUG_ASSERT(team->t.t_parent->t.t_stack_id != NULL); 7532 __kmp_itt_stack_callee_enter( 7533 (__itt_caller)team->t.t_parent->t.t_stack_id); 7534 } 7535 } 7536 #endif /* USE_ITT_BUILD */ 7537 #if INCLUDE_SSC_MARKS 7538 SSC_MARK_INVOKING(); 7539 #endif 7540 7541 #if OMPT_SUPPORT 7542 void *dummy; 7543 void **exit_frame_p; 7544 ompt_data_t *my_task_data; 7545 ompt_data_t *my_parallel_data; 7546 int ompt_team_size; 7547 7548 if (ompt_enabled.enabled) { 7549 exit_frame_p = &(team->t.t_implicit_task_taskdata[tid] 7550 .ompt_task_info.frame.exit_frame.ptr); 7551 } else { 7552 exit_frame_p = &dummy; 7553 } 7554 7555 my_task_data = 7556 &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data); 7557 my_parallel_data = &(team->t.ompt_team_info.parallel_data); 7558 if (ompt_enabled.ompt_callback_implicit_task) { 7559 ompt_team_size = team->t.t_nproc; 7560 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 7561 ompt_scope_begin, my_parallel_data, my_task_data, ompt_team_size, 7562 __kmp_tid_from_gtid(gtid), ompt_task_implicit); 7563 OMPT_CUR_TASK_INFO(this_thr)->thread_num = __kmp_tid_from_gtid(gtid); 7564 } 7565 #endif 7566 7567 #if KMP_STATS_ENABLED 7568 stats_state_e previous_state = KMP_GET_THREAD_STATE(); 7569 if (previous_state == stats_state_e::TEAMS_REGION) { 7570 KMP_PUSH_PARTITIONED_TIMER(OMP_teams); 7571 } else { 7572 KMP_PUSH_PARTITIONED_TIMER(OMP_parallel); 7573 } 7574 KMP_SET_THREAD_STATE(IMPLICIT_TASK); 7575 #endif 7576 7577 rc = __kmp_invoke_microtask((microtask_t)TCR_SYNC_PTR(team->t.t_pkfn), gtid, 7578 tid, (int)team->t.t_argc, (void **)team->t.t_argv 7579 #if OMPT_SUPPORT 7580 , 7581 exit_frame_p 7582 #endif 7583 ); 7584 #if OMPT_SUPPORT 7585 *exit_frame_p = NULL; 7586 this_thr->th.ompt_thread_info.parallel_flags |= ompt_parallel_team; 7587 #endif 7588 7589 #if KMP_STATS_ENABLED 7590 if (previous_state == stats_state_e::TEAMS_REGION) { 7591 KMP_SET_THREAD_STATE(previous_state); 7592 } 7593 KMP_POP_PARTITIONED_TIMER(); 7594 #endif 7595 7596 #if USE_ITT_BUILD 7597 if (__itt_stack_caller_create_ptr) { 7598 // inform ittnotify about leaving user's code 7599 if (team->t.t_stack_id != NULL) { 7600 __kmp_itt_stack_callee_leave((__itt_caller)team->t.t_stack_id); 7601 } else { 7602 KMP_DEBUG_ASSERT(team->t.t_parent->t.t_stack_id != NULL); 7603 __kmp_itt_stack_callee_leave( 7604 (__itt_caller)team->t.t_parent->t.t_stack_id); 7605 } 7606 } 7607 #endif /* USE_ITT_BUILD */ 7608 __kmp_run_after_invoked_task(gtid, tid, this_thr, team); 7609 7610 return rc; 7611 } 7612 7613 void __kmp_teams_master(int gtid) { 7614 // This routine is called by all primary threads in teams construct 7615 kmp_info_t *thr = __kmp_threads[gtid]; 7616 kmp_team_t *team = thr->th.th_team; 7617 ident_t *loc = team->t.t_ident; 7618 
thr->th.th_set_nproc = thr->th.th_teams_size.nth; 7619 KMP_DEBUG_ASSERT(thr->th.th_teams_microtask); 7620 KMP_DEBUG_ASSERT(thr->th.th_set_nproc); 7621 KA_TRACE(20, ("__kmp_teams_master: T#%d, Tid %d, microtask %p\n", gtid, 7622 __kmp_tid_from_gtid(gtid), thr->th.th_teams_microtask)); 7623 7624 // This thread is a new CG root. Set up the proper variables. 7625 kmp_cg_root_t *tmp = (kmp_cg_root_t *)__kmp_allocate(sizeof(kmp_cg_root_t)); 7626 tmp->cg_root = thr; // Make thr the CG root 7627 // Init to thread limit stored when league primary threads were forked 7628 tmp->cg_thread_limit = thr->th.th_current_task->td_icvs.thread_limit; 7629 tmp->cg_nthreads = 1; // Init counter to one active thread, this one 7630 KA_TRACE(100, ("__kmp_teams_master: Thread %p created node %p and init" 7631 " cg_nthreads to 1\n", 7632 thr, tmp)); 7633 tmp->up = thr->th.th_cg_roots; 7634 thr->th.th_cg_roots = tmp; 7635 7636 // Launch league of teams now, but not let workers execute 7637 // (they hang on fork barrier until next parallel) 7638 #if INCLUDE_SSC_MARKS 7639 SSC_MARK_FORKING(); 7640 #endif 7641 __kmp_fork_call(loc, gtid, fork_context_intel, team->t.t_argc, 7642 (microtask_t)thr->th.th_teams_microtask, // "wrapped" task 7643 VOLATILE_CAST(launch_t) __kmp_invoke_task_func, NULL); 7644 #if INCLUDE_SSC_MARKS 7645 SSC_MARK_JOINING(); 7646 #endif 7647 // If the team size was reduced from the limit, set it to the new size 7648 if (thr->th.th_team_nproc < thr->th.th_teams_size.nth) 7649 thr->th.th_teams_size.nth = thr->th.th_team_nproc; 7650 // AC: last parameter "1" eliminates join barrier which won't work because 7651 // worker threads are in a fork barrier waiting for more parallel regions 7652 __kmp_join_call(loc, gtid 7653 #if OMPT_SUPPORT 7654 , 7655 fork_context_intel 7656 #endif 7657 , 7658 1); 7659 } 7660 7661 int __kmp_invoke_teams_master(int gtid) { 7662 kmp_info_t *this_thr = __kmp_threads[gtid]; 7663 kmp_team_t *team = this_thr->th.th_team; 7664 #if KMP_DEBUG 7665 if (!__kmp_threads[gtid]->th.th_team->t.t_serialized) 7666 KMP_DEBUG_ASSERT((void *)__kmp_threads[gtid]->th.th_team->t.t_pkfn == 7667 (void *)__kmp_teams_master); 7668 #endif 7669 __kmp_run_before_invoked_task(gtid, 0, this_thr, team); 7670 #if OMPT_SUPPORT 7671 int tid = __kmp_tid_from_gtid(gtid); 7672 ompt_data_t *task_data = 7673 &team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data; 7674 ompt_data_t *parallel_data = &team->t.ompt_team_info.parallel_data; 7675 if (ompt_enabled.ompt_callback_implicit_task) { 7676 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 7677 ompt_scope_begin, parallel_data, task_data, team->t.t_nproc, tid, 7678 ompt_task_initial); 7679 OMPT_CUR_TASK_INFO(this_thr)->thread_num = tid; 7680 } 7681 #endif 7682 __kmp_teams_master(gtid); 7683 #if OMPT_SUPPORT 7684 this_thr->th.ompt_thread_info.parallel_flags |= ompt_parallel_league; 7685 #endif 7686 __kmp_run_after_invoked_task(gtid, 0, this_thr, team); 7687 return 1; 7688 } 7689 7690 /* this sets the requested number of threads for the next parallel region 7691 encountered by this team. 
since this should be enclosed in the forkjoin 7692 critical section it should avoid race conditions with asymmetrical nested 7693 parallelism */ 7694 7695 void __kmp_push_num_threads(ident_t *id, int gtid, int num_threads) { 7696 kmp_info_t *thr = __kmp_threads[gtid]; 7697 7698 if (num_threads > 0) 7699 thr->th.th_set_nproc = num_threads; 7700 } 7701 7702 static void __kmp_push_thread_limit(kmp_info_t *thr, int num_teams, 7703 int num_threads) { 7704 KMP_DEBUG_ASSERT(thr); 7705 // Remember the number of threads for inner parallel regions 7706 if (!TCR_4(__kmp_init_middle)) 7707 __kmp_middle_initialize(); // get internal globals calculated 7708 __kmp_assign_root_init_mask(); 7709 KMP_DEBUG_ASSERT(__kmp_avail_proc); 7710 KMP_DEBUG_ASSERT(__kmp_dflt_team_nth); 7711 7712 if (num_threads == 0) { 7713 if (__kmp_teams_thread_limit > 0) { 7714 num_threads = __kmp_teams_thread_limit; 7715 } else { 7716 num_threads = __kmp_avail_proc / num_teams; 7717 } 7718 // adjust num_threads w/o warning as it is not user setting 7719 // num_threads = min(num_threads, nthreads-var, thread-limit-var) 7720 // no thread_limit clause specified - do not change thread-limit-var ICV 7721 if (num_threads > __kmp_dflt_team_nth) { 7722 num_threads = __kmp_dflt_team_nth; // honor nthreads-var ICV 7723 } 7724 if (num_threads > thr->th.th_current_task->td_icvs.thread_limit) { 7725 num_threads = thr->th.th_current_task->td_icvs.thread_limit; 7726 } // prevent team size to exceed thread-limit-var 7727 if (num_teams * num_threads > __kmp_teams_max_nth) { 7728 num_threads = __kmp_teams_max_nth / num_teams; 7729 } 7730 if (num_threads == 0) { 7731 num_threads = 1; 7732 } 7733 } else { 7734 if (num_threads < 0) { 7735 __kmp_msg(kmp_ms_warning, KMP_MSG(CantFormThrTeam, num_threads, 1), 7736 __kmp_msg_null); 7737 num_threads = 1; 7738 } 7739 // This thread will be the primary thread of the league primary threads 7740 // Store new thread limit; old limit is saved in th_cg_roots list 7741 thr->th.th_current_task->td_icvs.thread_limit = num_threads; 7742 // num_threads = min(num_threads, nthreads-var) 7743 if (num_threads > __kmp_dflt_team_nth) { 7744 num_threads = __kmp_dflt_team_nth; // honor nthreads-var ICV 7745 } 7746 if (num_teams * num_threads > __kmp_teams_max_nth) { 7747 int new_threads = __kmp_teams_max_nth / num_teams; 7748 if (new_threads == 0) { 7749 new_threads = 1; 7750 } 7751 if (new_threads != num_threads) { 7752 if (!__kmp_reserve_warn) { // user asked for too many threads 7753 __kmp_reserve_warn = 1; // conflicts with KMP_TEAMS_THREAD_LIMIT 7754 __kmp_msg(kmp_ms_warning, 7755 KMP_MSG(CantFormThrTeam, num_threads, new_threads), 7756 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null); 7757 } 7758 } 7759 num_threads = new_threads; 7760 } 7761 } 7762 thr->th.th_teams_size.nth = num_threads; 7763 } 7764 7765 /* this sets the requested number of teams for the teams region and/or 7766 the number of threads for the next parallel region encountered */ 7767 void __kmp_push_num_teams(ident_t *id, int gtid, int num_teams, 7768 int num_threads) { 7769 kmp_info_t *thr = __kmp_threads[gtid]; 7770 if (num_teams < 0) { 7771 // OpenMP specification requires requested values to be positive, 7772 // but people can send us any value, so we'd better check 7773 __kmp_msg(kmp_ms_warning, KMP_MSG(NumTeamsNotPositive, num_teams, 1), 7774 __kmp_msg_null); 7775 num_teams = 1; 7776 } 7777 if (num_teams == 0) { 7778 if (__kmp_nteams > 0) { 7779 num_teams = __kmp_nteams; 7780 } else { 7781 num_teams = 1; // default number of teams is 1. 
7782 } 7783 } 7784 if (num_teams > __kmp_teams_max_nth) { // if too many teams requested? 7785 if (!__kmp_reserve_warn) { 7786 __kmp_reserve_warn = 1; 7787 __kmp_msg(kmp_ms_warning, 7788 KMP_MSG(CantFormThrTeam, num_teams, __kmp_teams_max_nth), 7789 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null); 7790 } 7791 num_teams = __kmp_teams_max_nth; 7792 } 7793 // Set number of teams (number of threads in the outer "parallel" of the 7794 // teams) 7795 thr->th.th_set_nproc = thr->th.th_teams_size.nteams = num_teams; 7796 7797 __kmp_push_thread_limit(thr, num_teams, num_threads); 7798 } 7799 7800 /* This sets the requested number of teams for the teams region and/or 7801 the number of threads for the next parallel region encountered */ 7802 void __kmp_push_num_teams_51(ident_t *id, int gtid, int num_teams_lb, 7803 int num_teams_ub, int num_threads) { 7804 kmp_info_t *thr = __kmp_threads[gtid]; 7805 KMP_DEBUG_ASSERT(num_teams_lb >= 0 && num_teams_ub >= 0); 7806 KMP_DEBUG_ASSERT(num_teams_ub >= num_teams_lb); 7807 KMP_DEBUG_ASSERT(num_threads >= 0); 7808 7809 if (num_teams_lb > num_teams_ub) { 7810 __kmp_fatal(KMP_MSG(FailedToCreateTeam, num_teams_lb, num_teams_ub), 7811 KMP_HNT(SetNewBound, __kmp_teams_max_nth), __kmp_msg_null); 7812 } 7813 7814 int num_teams = 1; // defalt number of teams is 1. 7815 7816 if (num_teams_lb == 0 && num_teams_ub > 0) 7817 num_teams_lb = num_teams_ub; 7818 7819 if (num_teams_lb == 0 && num_teams_ub == 0) { // no num_teams clause 7820 num_teams = (__kmp_nteams > 0) ? __kmp_nteams : num_teams; 7821 if (num_teams > __kmp_teams_max_nth) { 7822 if (!__kmp_reserve_warn) { 7823 __kmp_reserve_warn = 1; 7824 __kmp_msg(kmp_ms_warning, 7825 KMP_MSG(CantFormThrTeam, num_teams, __kmp_teams_max_nth), 7826 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null); 7827 } 7828 num_teams = __kmp_teams_max_nth; 7829 } 7830 } else if (num_teams_lb == num_teams_ub) { // requires exact number of teams 7831 num_teams = num_teams_ub; 7832 } else { // num_teams_lb <= num_teams <= num_teams_ub 7833 if (num_threads <= 0) { 7834 if (num_teams_ub > __kmp_teams_max_nth) { 7835 num_teams = num_teams_lb; 7836 } else { 7837 num_teams = num_teams_ub; 7838 } 7839 } else { 7840 num_teams = (num_threads > __kmp_teams_max_nth) 7841 ? num_teams 7842 : __kmp_teams_max_nth / num_threads; 7843 if (num_teams < num_teams_lb) { 7844 num_teams = num_teams_lb; 7845 } else if (num_teams > num_teams_ub) { 7846 num_teams = num_teams_ub; 7847 } 7848 } 7849 } 7850 // Set number of teams (number of threads in the outer "parallel" of the 7851 // teams) 7852 thr->th.th_set_nproc = thr->th.th_teams_size.nteams = num_teams; 7853 7854 __kmp_push_thread_limit(thr, num_teams, num_threads); 7855 } 7856 7857 // Set the proc_bind var to use in the following parallel region. 7858 void __kmp_push_proc_bind(ident_t *id, int gtid, kmp_proc_bind_t proc_bind) { 7859 kmp_info_t *thr = __kmp_threads[gtid]; 7860 thr->th.th_set_proc_bind = proc_bind; 7861 } 7862 7863 /* Launch the worker threads into the microtask. */ 7864 7865 void __kmp_internal_fork(ident_t *id, int gtid, kmp_team_t *team) { 7866 kmp_info_t *this_thr = __kmp_threads[gtid]; 7867 7868 #ifdef KMP_DEBUG 7869 int f; 7870 #endif /* KMP_DEBUG */ 7871 7872 KMP_DEBUG_ASSERT(team); 7873 KMP_DEBUG_ASSERT(this_thr->th.th_team == team); 7874 KMP_ASSERT(KMP_MASTER_GTID(gtid)); 7875 KMP_MB(); /* Flush all pending memory write invalidates. 
*/ 7876 7877 team->t.t_construct = 0; /* no single directives seen yet */ 7878 team->t.t_ordered.dt.t_value = 7879 0; /* thread 0 enters the ordered section first */ 7880 7881 /* Reset the identifiers on the dispatch buffer */ 7882 KMP_DEBUG_ASSERT(team->t.t_disp_buffer); 7883 if (team->t.t_max_nproc > 1) { 7884 int i; 7885 for (i = 0; i < __kmp_dispatch_num_buffers; ++i) { 7886 team->t.t_disp_buffer[i].buffer_index = i; 7887 team->t.t_disp_buffer[i].doacross_buf_idx = i; 7888 } 7889 } else { 7890 team->t.t_disp_buffer[0].buffer_index = 0; 7891 team->t.t_disp_buffer[0].doacross_buf_idx = 0; 7892 } 7893 7894 KMP_MB(); /* Flush all pending memory write invalidates. */ 7895 KMP_ASSERT(this_thr->th.th_team == team); 7896 7897 #ifdef KMP_DEBUG 7898 for (f = 0; f < team->t.t_nproc; f++) { 7899 KMP_DEBUG_ASSERT(team->t.t_threads[f] && 7900 team->t.t_threads[f]->th.th_team_nproc == team->t.t_nproc); 7901 } 7902 #endif /* KMP_DEBUG */ 7903 7904 /* release the worker threads so they may begin working */ 7905 __kmp_fork_barrier(gtid, 0); 7906 } 7907 7908 void __kmp_internal_join(ident_t *id, int gtid, kmp_team_t *team) { 7909 kmp_info_t *this_thr = __kmp_threads[gtid]; 7910 7911 KMP_DEBUG_ASSERT(team); 7912 KMP_DEBUG_ASSERT(this_thr->th.th_team == team); 7913 KMP_ASSERT(KMP_MASTER_GTID(gtid)); 7914 KMP_MB(); /* Flush all pending memory write invalidates. */ 7915 7916 /* Join barrier after fork */ 7917 7918 #ifdef KMP_DEBUG 7919 if (__kmp_threads[gtid] && 7920 __kmp_threads[gtid]->th.th_team_nproc != team->t.t_nproc) { 7921 __kmp_printf("GTID: %d, __kmp_threads[%d]=%p\n", gtid, gtid, 7922 __kmp_threads[gtid]); 7923 __kmp_printf("__kmp_threads[%d]->th.th_team_nproc=%d, TEAM: %p, " 7924 "team->t.t_nproc=%d\n", 7925 gtid, __kmp_threads[gtid]->th.th_team_nproc, team, 7926 team->t.t_nproc); 7927 __kmp_print_structure(); 7928 } 7929 KMP_DEBUG_ASSERT(__kmp_threads[gtid] && 7930 __kmp_threads[gtid]->th.th_team_nproc == team->t.t_nproc); 7931 #endif /* KMP_DEBUG */ 7932 7933 __kmp_join_barrier(gtid); /* wait for everyone */ 7934 #if OMPT_SUPPORT 7935 if (ompt_enabled.enabled && 7936 this_thr->th.ompt_thread_info.state == ompt_state_wait_barrier_implicit) { 7937 int ds_tid = this_thr->th.th_info.ds.ds_tid; 7938 ompt_data_t *task_data = OMPT_CUR_TASK_DATA(this_thr); 7939 this_thr->th.ompt_thread_info.state = ompt_state_overhead; 7940 #if OMPT_OPTIONAL 7941 void *codeptr = NULL; 7942 if (KMP_MASTER_TID(ds_tid) && 7943 (ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) || 7944 ompt_callbacks.ompt_callback(ompt_callback_sync_region))) 7945 codeptr = OMPT_CUR_TEAM_INFO(this_thr)->master_return_address; 7946 7947 if (ompt_enabled.ompt_callback_sync_region_wait) { 7948 ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)( 7949 ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data, 7950 codeptr); 7951 } 7952 if (ompt_enabled.ompt_callback_sync_region) { 7953 ompt_callbacks.ompt_callback(ompt_callback_sync_region)( 7954 ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data, 7955 codeptr); 7956 } 7957 #endif 7958 if (!KMP_MASTER_TID(ds_tid) && ompt_enabled.ompt_callback_implicit_task) { 7959 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 7960 ompt_scope_end, NULL, task_data, 0, ds_tid, 7961 ompt_task_implicit); // TODO: Can this be ompt_task_initial? 7962 } 7963 } 7964 #endif 7965 7966 KMP_MB(); /* Flush all pending memory write invalidates. 
*/ 7967 KMP_ASSERT(this_thr->th.th_team == team); 7968 } 7969 7970 /* ------------------------------------------------------------------------ */ 7971 7972 #ifdef USE_LOAD_BALANCE 7973 7974 // Return the worker threads actively spinning in the hot team, if we 7975 // are at the outermost level of parallelism. Otherwise, return 0. 7976 static int __kmp_active_hot_team_nproc(kmp_root_t *root) { 7977 int i; 7978 int retval; 7979 kmp_team_t *hot_team; 7980 7981 if (root->r.r_active) { 7982 return 0; 7983 } 7984 hot_team = root->r.r_hot_team; 7985 if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME) { 7986 return hot_team->t.t_nproc - 1; // Don't count primary thread 7987 } 7988 7989 // Skip the primary thread - it is accounted for elsewhere. 7990 retval = 0; 7991 for (i = 1; i < hot_team->t.t_nproc; i++) { 7992 if (hot_team->t.t_threads[i]->th.th_active) { 7993 retval++; 7994 } 7995 } 7996 return retval; 7997 } 7998 7999 // Perform an automatic adjustment to the number of 8000 // threads used by the next parallel region. 8001 static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc) { 8002 int retval; 8003 int pool_active; 8004 int hot_team_active; 8005 int team_curr_active; 8006 int system_active; 8007 8008 KB_TRACE(20, ("__kmp_load_balance_nproc: called root:%p set_nproc:%d\n", root, 8009 set_nproc)); 8010 KMP_DEBUG_ASSERT(root); 8011 KMP_DEBUG_ASSERT(root->r.r_root_team->t.t_threads[0] 8012 ->th.th_current_task->td_icvs.dynamic == TRUE); 8013 KMP_DEBUG_ASSERT(set_nproc > 1); 8014 8015 if (set_nproc == 1) { 8016 KB_TRACE(20, ("__kmp_load_balance_nproc: serial execution.\n")); 8017 return 1; 8018 } 8019 8020 // Threads that are active in the thread pool, active in the hot team for this 8021 // particular root (if we are at the outer par level), and the currently 8022 // executing thread (to become the primary thread) are available to add to the 8023 // new team, but are currently contributing to the system load, and must be 8024 // accounted for. 8025 pool_active = __kmp_thread_pool_active_nth; 8026 hot_team_active = __kmp_active_hot_team_nproc(root); 8027 team_curr_active = pool_active + hot_team_active + 1; 8028 8029 // Check the system load. 8030 system_active = __kmp_get_load_balance(__kmp_avail_proc + team_curr_active); 8031 KB_TRACE(30, ("__kmp_load_balance_nproc: system active = %d pool active = %d " 8032 "hot team active = %d\n", 8033 system_active, pool_active, hot_team_active)); 8034 8035 if (system_active < 0) { 8036 // There was an error reading the necessary info from /proc, so use the 8037 // thread limit algorithm instead. Once we set __kmp_global.g.g_dynamic_mode 8038 // = dynamic_thread_limit, we shouldn't wind up getting back here. 8039 __kmp_global.g.g_dynamic_mode = dynamic_thread_limit; 8040 KMP_WARNING(CantLoadBalUsing, "KMP_DYNAMIC_MODE=thread limit"); 8041 8042 // Make this call behave like the thread limit algorithm. 8043 retval = __kmp_avail_proc - __kmp_nth + 8044 (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc); 8045 if (retval > set_nproc) { 8046 retval = set_nproc; 8047 } 8048 if (retval < KMP_MIN_NTH) { 8049 retval = KMP_MIN_NTH; 8050 } 8051 8052 KB_TRACE(20, ("__kmp_load_balance_nproc: thread limit exit. retval:%d\n", 8053 retval)); 8054 return retval; 8055 } 8056 8057 // There is a slight delay in the load balance algorithm in detecting new 8058 // running procs. The real system load at this instant should be at least as 8059 // large as the #active omp thread that are available to add to the team. 
8060 if (system_active < team_curr_active) { 8061 system_active = team_curr_active; 8062 } 8063 retval = __kmp_avail_proc - system_active + team_curr_active; 8064 if (retval > set_nproc) { 8065 retval = set_nproc; 8066 } 8067 if (retval < KMP_MIN_NTH) { 8068 retval = KMP_MIN_NTH; 8069 } 8070 8071 KB_TRACE(20, ("__kmp_load_balance_nproc: exit. retval:%d\n", retval)); 8072 return retval; 8073 } // __kmp_load_balance_nproc() 8074 8075 #endif /* USE_LOAD_BALANCE */ 8076 8077 /* ------------------------------------------------------------------------ */ 8078 8079 /* NOTE: this is called with the __kmp_init_lock held */ 8080 void __kmp_cleanup(void) { 8081 int f; 8082 8083 KA_TRACE(10, ("__kmp_cleanup: enter\n")); 8084 8085 if (TCR_4(__kmp_init_parallel)) { 8086 #if KMP_HANDLE_SIGNALS 8087 __kmp_remove_signals(); 8088 #endif 8089 TCW_4(__kmp_init_parallel, FALSE); 8090 } 8091 8092 if (TCR_4(__kmp_init_middle)) { 8093 #if KMP_AFFINITY_SUPPORTED 8094 __kmp_affinity_uninitialize(); 8095 #endif /* KMP_AFFINITY_SUPPORTED */ 8096 __kmp_cleanup_hierarchy(); 8097 TCW_4(__kmp_init_middle, FALSE); 8098 } 8099 8100 KA_TRACE(10, ("__kmp_cleanup: go serial cleanup\n")); 8101 8102 if (__kmp_init_serial) { 8103 __kmp_runtime_destroy(); 8104 __kmp_init_serial = FALSE; 8105 } 8106 8107 __kmp_cleanup_threadprivate_caches(); 8108 8109 for (f = 0; f < __kmp_threads_capacity; f++) { 8110 if (__kmp_root[f] != NULL) { 8111 __kmp_free(__kmp_root[f]); 8112 __kmp_root[f] = NULL; 8113 } 8114 } 8115 __kmp_free(__kmp_threads); 8116 // __kmp_threads and __kmp_root were allocated at once, as single block, so 8117 // there is no need in freeing __kmp_root. 8118 __kmp_threads = NULL; 8119 __kmp_root = NULL; 8120 __kmp_threads_capacity = 0; 8121 8122 // Free old __kmp_threads arrays if they exist. 
8123 kmp_old_threads_list_t *ptr = __kmp_old_threads_list; 8124 while (ptr) { 8125 kmp_old_threads_list_t *next = ptr->next; 8126 __kmp_free(ptr->threads); 8127 __kmp_free(ptr); 8128 ptr = next; 8129 } 8130 8131 #if KMP_USE_DYNAMIC_LOCK 8132 __kmp_cleanup_indirect_user_locks(); 8133 #else 8134 __kmp_cleanup_user_locks(); 8135 #endif 8136 #if OMPD_SUPPORT 8137 if (ompd_state) { 8138 __kmp_free(ompd_env_block); 8139 ompd_env_block = NULL; 8140 ompd_env_block_size = 0; 8141 } 8142 #endif 8143 8144 #if KMP_AFFINITY_SUPPORTED 8145 KMP_INTERNAL_FREE(CCAST(char *, __kmp_cpuinfo_file)); 8146 __kmp_cpuinfo_file = NULL; 8147 #endif /* KMP_AFFINITY_SUPPORTED */ 8148 8149 #if KMP_USE_ADAPTIVE_LOCKS 8150 #if KMP_DEBUG_ADAPTIVE_LOCKS 8151 __kmp_print_speculative_stats(); 8152 #endif 8153 #endif 8154 KMP_INTERNAL_FREE(__kmp_nested_nth.nth); 8155 __kmp_nested_nth.nth = NULL; 8156 __kmp_nested_nth.size = 0; 8157 __kmp_nested_nth.used = 0; 8158 KMP_INTERNAL_FREE(__kmp_nested_proc_bind.bind_types); 8159 __kmp_nested_proc_bind.bind_types = NULL; 8160 __kmp_nested_proc_bind.size = 0; 8161 __kmp_nested_proc_bind.used = 0; 8162 if (__kmp_affinity_format) { 8163 KMP_INTERNAL_FREE(__kmp_affinity_format); 8164 __kmp_affinity_format = NULL; 8165 } 8166 8167 __kmp_i18n_catclose(); 8168 8169 #if KMP_USE_HIER_SCHED 8170 __kmp_hier_scheds.deallocate(); 8171 #endif 8172 8173 #if KMP_STATS_ENABLED 8174 __kmp_stats_fini(); 8175 #endif 8176 8177 KA_TRACE(10, ("__kmp_cleanup: exit\n")); 8178 } 8179 8180 /* ------------------------------------------------------------------------ */ 8181 8182 int __kmp_ignore_mppbeg(void) { 8183 char *env; 8184 8185 if ((env = getenv("KMP_IGNORE_MPPBEG")) != NULL) { 8186 if (__kmp_str_match_false(env)) 8187 return FALSE; 8188 } 8189 // By default __kmpc_begin() is no-op. 8190 return TRUE; 8191 } 8192 8193 int __kmp_ignore_mppend(void) { 8194 char *env; 8195 8196 if ((env = getenv("KMP_IGNORE_MPPEND")) != NULL) { 8197 if (__kmp_str_match_false(env)) 8198 return FALSE; 8199 } 8200 // By default __kmpc_end() is no-op. 
8201 return TRUE; 8202 } 8203 8204 void __kmp_internal_begin(void) { 8205 int gtid; 8206 kmp_root_t *root; 8207 8208 /* this is a very important step as it will register new sibling threads 8209 and assign these new uber threads a new gtid */ 8210 gtid = __kmp_entry_gtid(); 8211 root = __kmp_threads[gtid]->th.th_root; 8212 KMP_ASSERT(KMP_UBER_GTID(gtid)); 8213 8214 if (root->r.r_begin) 8215 return; 8216 __kmp_acquire_lock(&root->r.r_begin_lock, gtid); 8217 if (root->r.r_begin) { 8218 __kmp_release_lock(&root->r.r_begin_lock, gtid); 8219 return; 8220 } 8221 8222 root->r.r_begin = TRUE; 8223 8224 __kmp_release_lock(&root->r.r_begin_lock, gtid); 8225 } 8226 8227 /* ------------------------------------------------------------------------ */ 8228 8229 void __kmp_user_set_library(enum library_type arg) { 8230 int gtid; 8231 kmp_root_t *root; 8232 kmp_info_t *thread; 8233 8234 /* first, make sure we are initialized so we can get our gtid */ 8235 8236 gtid = __kmp_entry_gtid(); 8237 thread = __kmp_threads[gtid]; 8238 8239 root = thread->th.th_root; 8240 8241 KA_TRACE(20, ("__kmp_user_set_library: enter T#%d, arg: %d, %d\n", gtid, arg, 8242 library_serial)); 8243 if (root->r.r_in_parallel) { /* Must be called in serial section of top-level 8244 thread */ 8245 KMP_WARNING(SetLibraryIncorrectCall); 8246 return; 8247 } 8248 8249 switch (arg) { 8250 case library_serial: 8251 thread->th.th_set_nproc = 0; 8252 set__nproc(thread, 1); 8253 break; 8254 case library_turnaround: 8255 thread->th.th_set_nproc = 0; 8256 set__nproc(thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth 8257 : __kmp_dflt_team_nth_ub); 8258 break; 8259 case library_throughput: 8260 thread->th.th_set_nproc = 0; 8261 set__nproc(thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth 8262 : __kmp_dflt_team_nth_ub); 8263 break; 8264 default: 8265 KMP_FATAL(UnknownLibraryType, arg); 8266 } 8267 8268 __kmp_aux_set_library(arg); 8269 } 8270 8271 void __kmp_aux_set_stacksize(size_t arg) { 8272 if (!__kmp_init_serial) 8273 __kmp_serial_initialize(); 8274 8275 #if KMP_OS_DARWIN 8276 if (arg & (0x1000 - 1)) { 8277 arg &= ~(0x1000 - 1); 8278 if (arg + 0x1000) /* check for overflow if we round up */ 8279 arg += 0x1000; 8280 } 8281 #endif 8282 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock); 8283 8284 /* only change the default stacksize before the first parallel region */ 8285 if (!TCR_4(__kmp_init_parallel)) { 8286 size_t value = arg; /* argument is in bytes */ 8287 8288 if (value < __kmp_sys_min_stksize) 8289 value = __kmp_sys_min_stksize; 8290 else if (value > KMP_MAX_STKSIZE) 8291 value = KMP_MAX_STKSIZE; 8292 8293 __kmp_stksize = value; 8294 8295 __kmp_env_stksize = TRUE; /* was KMP_STACKSIZE specified? */ 8296 } 8297 8298 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 8299 } 8300 8301 /* set the behaviour of the runtime library */ 8302 /* TODO this can cause some odd behaviour with sibling parallelism... 
*/ 8303 void __kmp_aux_set_library(enum library_type arg) { 8304 __kmp_library = arg; 8305 8306 switch (__kmp_library) { 8307 case library_serial: { 8308 KMP_INFORM(LibraryIsSerial); 8309 } break; 8310 case library_turnaround: 8311 if (__kmp_use_yield == 1 && !__kmp_use_yield_exp_set) 8312 __kmp_use_yield = 2; // only yield when oversubscribed 8313 break; 8314 case library_throughput: 8315 if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME) 8316 __kmp_dflt_blocktime = KMP_DEFAULT_BLOCKTIME; 8317 break; 8318 default: 8319 KMP_FATAL(UnknownLibraryType, arg); 8320 } 8321 } 8322 8323 /* Getting team information common for all team API */ 8324 // Returns NULL if not in teams construct 8325 static kmp_team_t *__kmp_aux_get_team_info(int &teams_serialized) { 8326 kmp_info_t *thr = __kmp_entry_thread(); 8327 teams_serialized = 0; 8328 if (thr->th.th_teams_microtask) { 8329 kmp_team_t *team = thr->th.th_team; 8330 int tlevel = thr->th.th_teams_level; // the level of the teams construct 8331 int ii = team->t.t_level; 8332 teams_serialized = team->t.t_serialized; 8333 int level = tlevel + 1; 8334 KMP_DEBUG_ASSERT(ii >= tlevel); 8335 while (ii > level) { 8336 for (teams_serialized = team->t.t_serialized; 8337 (teams_serialized > 0) && (ii > level); teams_serialized--, ii--) { 8338 } 8339 if (team->t.t_serialized && (!teams_serialized)) { 8340 team = team->t.t_parent; 8341 continue; 8342 } 8343 if (ii > level) { 8344 team = team->t.t_parent; 8345 ii--; 8346 } 8347 } 8348 return team; 8349 } 8350 return NULL; 8351 } 8352 8353 int __kmp_aux_get_team_num() { 8354 int serialized; 8355 kmp_team_t *team = __kmp_aux_get_team_info(serialized); 8356 if (team) { 8357 if (serialized > 1) { 8358 return 0; // teams region is serialized ( 1 team of 1 thread ). 8359 } else { 8360 return team->t.t_master_tid; 8361 } 8362 } 8363 return 0; 8364 } 8365 8366 int __kmp_aux_get_num_teams() { 8367 int serialized; 8368 kmp_team_t *team = __kmp_aux_get_team_info(serialized); 8369 if (team) { 8370 if (serialized > 1) { 8371 return 1; 8372 } else { 8373 return team->t.t_parent->t.t_nproc; 8374 } 8375 } 8376 return 1; 8377 } 8378 8379 /* ------------------------------------------------------------------------ */ 8380 8381 /* 8382 * Affinity Format Parser 8383 * 8384 * Field is in form of: %[[[0].]size]type 8385 * % and type are required (%% means print a literal '%') 8386 * type is either single char or long name surrounded by {}, 8387 * e.g., N or {num_threads} 8388 * 0 => leading zeros 8389 * . => right justified when size is specified 8390 * by default output is left justified 8391 * size is the *minimum* field length 8392 * All other characters are printed as is 8393 * 8394 * Available field types: 8395 * L {thread_level} - omp_get_level() 8396 * n {thread_num} - omp_get_thread_num() 8397 * h {host} - name of host machine 8398 * P {process_id} - process id (integer) 8399 * T {thread_identifier} - native thread identifier (integer) 8400 * N {num_threads} - omp_get_num_threads() 8401 * A {ancestor_tnum} - omp_get_ancestor_thread_num(omp_get_level()-1) 8402 * a {thread_affinity} - comma separated list of integers or integer ranges 8403 * (values of affinity mask) 8404 * 8405 * Implementation-specific field types can be added 8406 * If a type is unknown, print "undefined" 8407 */ 8408 8409 // Structure holding the short name, long name, and corresponding data type 8410 // for snprintf. A table of these will represent the entire valid keyword 8411 // field types. 
8412 typedef struct kmp_affinity_format_field_t { 8413 char short_name; // from spec e.g., L -> thread level 8414 const char *long_name; // from spec thread_level -> thread level 8415 char field_format; // data type for snprintf (typically 'd' or 's' 8416 // for integer or string) 8417 } kmp_affinity_format_field_t; 8418 8419 static const kmp_affinity_format_field_t __kmp_affinity_format_table[] = { 8420 #if KMP_AFFINITY_SUPPORTED 8421 {'A', "thread_affinity", 's'}, 8422 #endif 8423 {'t', "team_num", 'd'}, 8424 {'T', "num_teams", 'd'}, 8425 {'L', "nesting_level", 'd'}, 8426 {'n', "thread_num", 'd'}, 8427 {'N', "num_threads", 'd'}, 8428 {'a', "ancestor_tnum", 'd'}, 8429 {'H', "host", 's'}, 8430 {'P', "process_id", 'd'}, 8431 {'i', "native_thread_id", 'd'}}; 8432 8433 // Return the number of characters it takes to hold field 8434 static int __kmp_aux_capture_affinity_field(int gtid, const kmp_info_t *th, 8435 const char **ptr, 8436 kmp_str_buf_t *field_buffer) { 8437 int rc, format_index, field_value; 8438 const char *width_left, *width_right; 8439 bool pad_zeros, right_justify, parse_long_name, found_valid_name; 8440 static const int FORMAT_SIZE = 20; 8441 char format[FORMAT_SIZE] = {0}; 8442 char absolute_short_name = 0; 8443 8444 KMP_DEBUG_ASSERT(gtid >= 0); 8445 KMP_DEBUG_ASSERT(th); 8446 KMP_DEBUG_ASSERT(**ptr == '%'); 8447 KMP_DEBUG_ASSERT(field_buffer); 8448 8449 __kmp_str_buf_clear(field_buffer); 8450 8451 // Skip the initial % 8452 (*ptr)++; 8453 8454 // Check for %% first 8455 if (**ptr == '%') { 8456 __kmp_str_buf_cat(field_buffer, "%", 1); 8457 (*ptr)++; // skip over the second % 8458 return 1; 8459 } 8460 8461 // Parse field modifiers if they are present 8462 pad_zeros = false; 8463 if (**ptr == '0') { 8464 pad_zeros = true; 8465 (*ptr)++; // skip over 0 8466 } 8467 right_justify = false; 8468 if (**ptr == '.') { 8469 right_justify = true; 8470 (*ptr)++; // skip over . 8471 } 8472 // Parse width of field: [width_left, width_right) 8473 width_left = width_right = NULL; 8474 if (**ptr >= '0' && **ptr <= '9') { 8475 width_left = *ptr; 8476 SKIP_DIGITS(*ptr); 8477 width_right = *ptr; 8478 } 8479 8480 // Create the format for KMP_SNPRINTF based on flags parsed above 8481 format_index = 0; 8482 format[format_index++] = '%'; 8483 if (!right_justify) 8484 format[format_index++] = '-'; 8485 if (pad_zeros) 8486 format[format_index++] = '0'; 8487 if (width_left && width_right) { 8488 int i = 0; 8489 // Only allow 8 digit number widths. 
8490 // This also prevents overflowing format variable 8491 while (i < 8 && width_left < width_right) { 8492 format[format_index++] = *width_left; 8493 width_left++; 8494 i++; 8495 } 8496 } 8497 8498 // Parse a name (long or short) 8499 // Canonicalize the name into absolute_short_name 8500 found_valid_name = false; 8501 parse_long_name = (**ptr == '{'); 8502 if (parse_long_name) 8503 (*ptr)++; // skip initial left brace 8504 for (size_t i = 0; i < sizeof(__kmp_affinity_format_table) / 8505 sizeof(__kmp_affinity_format_table[0]); 8506 ++i) { 8507 char short_name = __kmp_affinity_format_table[i].short_name; 8508 const char *long_name = __kmp_affinity_format_table[i].long_name; 8509 char field_format = __kmp_affinity_format_table[i].field_format; 8510 if (parse_long_name) { 8511 size_t length = KMP_STRLEN(long_name); 8512 if (strncmp(*ptr, long_name, length) == 0) { 8513 found_valid_name = true; 8514 (*ptr) += length; // skip the long name 8515 } 8516 } else if (**ptr == short_name) { 8517 found_valid_name = true; 8518 (*ptr)++; // skip the short name 8519 } 8520 if (found_valid_name) { 8521 format[format_index++] = field_format; 8522 format[format_index++] = '\0'; 8523 absolute_short_name = short_name; 8524 break; 8525 } 8526 } 8527 if (parse_long_name) { 8528 if (**ptr != '}') { 8529 absolute_short_name = 0; 8530 } else { 8531 (*ptr)++; // skip over the right brace 8532 } 8533 } 8534 8535 // Attempt to fill the buffer with the requested 8536 // value using snprintf within __kmp_str_buf_print() 8537 switch (absolute_short_name) { 8538 case 't': 8539 rc = __kmp_str_buf_print(field_buffer, format, __kmp_aux_get_team_num()); 8540 break; 8541 case 'T': 8542 rc = __kmp_str_buf_print(field_buffer, format, __kmp_aux_get_num_teams()); 8543 break; 8544 case 'L': 8545 rc = __kmp_str_buf_print(field_buffer, format, th->th.th_team->t.t_level); 8546 break; 8547 case 'n': 8548 rc = __kmp_str_buf_print(field_buffer, format, __kmp_tid_from_gtid(gtid)); 8549 break; 8550 case 'H': { 8551 static const int BUFFER_SIZE = 256; 8552 char buf[BUFFER_SIZE]; 8553 __kmp_expand_host_name(buf, BUFFER_SIZE); 8554 rc = __kmp_str_buf_print(field_buffer, format, buf); 8555 } break; 8556 case 'P': 8557 rc = __kmp_str_buf_print(field_buffer, format, getpid()); 8558 break; 8559 case 'i': 8560 rc = __kmp_str_buf_print(field_buffer, format, __kmp_gettid()); 8561 break; 8562 case 'N': 8563 rc = __kmp_str_buf_print(field_buffer, format, th->th.th_team->t.t_nproc); 8564 break; 8565 case 'a': 8566 field_value = 8567 __kmp_get_ancestor_thread_num(gtid, th->th.th_team->t.t_level - 1); 8568 rc = __kmp_str_buf_print(field_buffer, format, field_value); 8569 break; 8570 #if KMP_AFFINITY_SUPPORTED 8571 case 'A': { 8572 kmp_str_buf_t buf; 8573 __kmp_str_buf_init(&buf); 8574 __kmp_affinity_str_buf_mask(&buf, th->th.th_affin_mask); 8575 rc = __kmp_str_buf_print(field_buffer, format, buf.str); 8576 __kmp_str_buf_free(&buf); 8577 } break; 8578 #endif 8579 default: 8580 // According to spec, If an implementation does not have info for field 8581 // type, then "undefined" is printed 8582 rc = __kmp_str_buf_print(field_buffer, "%s", "undefined"); 8583 // Skip the field 8584 if (parse_long_name) { 8585 SKIP_TOKEN(*ptr); 8586 if (**ptr == '}') 8587 (*ptr)++; 8588 } else { 8589 (*ptr)++; 8590 } 8591 } 8592 8593 KMP_ASSERT(format_index <= FORMAT_SIZE); 8594 return rc; 8595 } 8596 8597 /* 8598 * Return number of characters needed to hold the affinity string 8599 * (not including null byte character) 8600 * The resultant string is printed to buffer, 
which the caller can then 8601 * handle afterwards 8602 */ 8603 size_t __kmp_aux_capture_affinity(int gtid, const char *format, 8604 kmp_str_buf_t *buffer) { 8605 const char *parse_ptr; 8606 size_t retval; 8607 const kmp_info_t *th; 8608 kmp_str_buf_t field; 8609 8610 KMP_DEBUG_ASSERT(buffer); 8611 KMP_DEBUG_ASSERT(gtid >= 0); 8612 8613 __kmp_str_buf_init(&field); 8614 __kmp_str_buf_clear(buffer); 8615 8616 th = __kmp_threads[gtid]; 8617 retval = 0; 8618 8619 // If format is NULL or zero-length string, then we use 8620 // affinity-format-var ICV 8621 parse_ptr = format; 8622 if (parse_ptr == NULL || *parse_ptr == '\0') { 8623 parse_ptr = __kmp_affinity_format; 8624 } 8625 KMP_DEBUG_ASSERT(parse_ptr); 8626 8627 while (*parse_ptr != '\0') { 8628 // Parse a field 8629 if (*parse_ptr == '%') { 8630 // Put field in the buffer 8631 int rc = __kmp_aux_capture_affinity_field(gtid, th, &parse_ptr, &field); 8632 __kmp_str_buf_catbuf(buffer, &field); 8633 retval += rc; 8634 } else { 8635 // Put literal character in buffer 8636 __kmp_str_buf_cat(buffer, parse_ptr, 1); 8637 retval++; 8638 parse_ptr++; 8639 } 8640 } 8641 __kmp_str_buf_free(&field); 8642 return retval; 8643 } 8644 8645 // Displays the affinity string to stdout 8646 void __kmp_aux_display_affinity(int gtid, const char *format) { 8647 kmp_str_buf_t buf; 8648 __kmp_str_buf_init(&buf); 8649 __kmp_aux_capture_affinity(gtid, format, &buf); 8650 __kmp_fprintf(kmp_out, "%s" KMP_END_OF_LINE, buf.str); 8651 __kmp_str_buf_free(&buf); 8652 } 8653 8654 /* ------------------------------------------------------------------------ */ 8655 8656 void __kmp_aux_set_blocktime(int arg, kmp_info_t *thread, int tid) { 8657 int blocktime = arg; /* argument is in milliseconds */ 8658 #if KMP_USE_MONITOR 8659 int bt_intervals; 8660 #endif 8661 kmp_int8 bt_set; 8662 8663 __kmp_save_internal_controls(thread); 8664 8665 /* Normalize and set blocktime for the teams */ 8666 if (blocktime < KMP_MIN_BLOCKTIME) 8667 blocktime = KMP_MIN_BLOCKTIME; 8668 else if (blocktime > KMP_MAX_BLOCKTIME) 8669 blocktime = KMP_MAX_BLOCKTIME; 8670 8671 set__blocktime_team(thread->th.th_team, tid, blocktime); 8672 set__blocktime_team(thread->th.th_serial_team, 0, blocktime); 8673 8674 #if KMP_USE_MONITOR 8675 /* Calculate and set blocktime intervals for the teams */ 8676 bt_intervals = KMP_INTERVALS_FROM_BLOCKTIME(blocktime, __kmp_monitor_wakeups); 8677 8678 set__bt_intervals_team(thread->th.th_team, tid, bt_intervals); 8679 set__bt_intervals_team(thread->th.th_serial_team, 0, bt_intervals); 8680 #endif 8681 8682 /* Set whether blocktime has been set to "TRUE" */ 8683 bt_set = TRUE; 8684 8685 set__bt_set_team(thread->th.th_team, tid, bt_set); 8686 set__bt_set_team(thread->th.th_serial_team, 0, bt_set); 8687 #if KMP_USE_MONITOR 8688 KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d, " 8689 "bt_intervals=%d, monitor_updates=%d\n", 8690 __kmp_gtid_from_tid(tid, thread->th.th_team), 8691 thread->th.th_team->t.t_id, tid, blocktime, bt_intervals, 8692 __kmp_monitor_wakeups)); 8693 #else 8694 KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d\n", 8695 __kmp_gtid_from_tid(tid, thread->th.th_team), 8696 thread->th.th_team->t.t_id, tid, blocktime)); 8697 #endif 8698 } 8699 8700 void __kmp_aux_set_defaults(char const *str, size_t len) { 8701 if (!__kmp_init_serial) { 8702 __kmp_serial_initialize(); 8703 } 8704 __kmp_env_initialize(str); 8705 8706 if (__kmp_settings || __kmp_display_env || __kmp_display_env_verbose) { 8707 __kmp_env_print(); 8708 } 8709 } // __kmp_aux_set_defaults 
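/* Illustrative usage sketch for the affinity-format machinery above (kept
   under "#if 0" so it is not compiled). The helper name, the format string,
   and the sample output below are hypothetical; the actual expansion depends
   on the host name, the team, and the affinity mask of the calling thread. */
#if 0
static void __kmp_example_capture_affinity(void) {
  int gtid = __kmp_entry_gtid(); // ensure the thread is registered, get gtid
  kmp_str_buf_t buf;
  __kmp_str_buf_init(&buf);
  // "%H"    -> host name
  // "%0.3n" -> thread num, right justified, zero padded to width 3
  // "%N"    -> team size (omp_get_num_threads())
  // "%A"    -> affinity mask list; prints "undefined" if the field type is
  //            not available (e.g. KMP_AFFINITY_SUPPORTED is off)
  size_t len = __kmp_aux_capture_affinity(
      gtid, "host=%H tid=%0.3n/%N mask=%A", &buf);
  // Might produce something like: host=mybox tid=002/8 mask=4,12
  __kmp_fprintf(kmp_out, "%s (%u chars)" KMP_END_OF_LINE, buf.str,
                (unsigned)len);
  __kmp_str_buf_free(&buf);
  // __kmp_aux_display_affinity(gtid, "host=%H tid=%0.3n/%N mask=%A") would
  // print the same string directly instead of capturing it.
}
#endif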
8710 8711 /* ------------------------------------------------------------------------ */ 8712 /* internal fast reduction routines */ 8713 8714 PACKED_REDUCTION_METHOD_T 8715 __kmp_determine_reduction_method( 8716 ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars, size_t reduce_size, 8717 void *reduce_data, void (*reduce_func)(void *lhs_data, void *rhs_data), 8718 kmp_critical_name *lck) { 8719 8720 // Default reduction method: critical construct ( lck != NULL, like in current 8721 // PAROPT ) 8722 // If ( reduce_data!=NULL && reduce_func!=NULL ): the tree-reduction method 8723 // can be selected by RTL 8724 // If loc->flags contains KMP_IDENT_ATOMIC_REDUCE, the atomic reduce method 8725 // can be selected by RTL 8726 // Finally, it's up to OpenMP RTL to make a decision on which method to select 8727 // among those generated by PAROPT. 8728 8729 PACKED_REDUCTION_METHOD_T retval; 8730 8731 int team_size; 8732 8733 KMP_DEBUG_ASSERT(loc); // it would be nice to test ( loc != 0 ) 8734 KMP_DEBUG_ASSERT(lck); // it would be nice to test ( lck != 0 ) 8735 8736 #define FAST_REDUCTION_ATOMIC_METHOD_GENERATED \ 8737 (loc && \ 8738 ((loc->flags & (KMP_IDENT_ATOMIC_REDUCE)) == (KMP_IDENT_ATOMIC_REDUCE))) 8739 #define FAST_REDUCTION_TREE_METHOD_GENERATED ((reduce_data) && (reduce_func)) 8740 8741 retval = critical_reduce_block; 8742 8743 // another choice of getting a team size (with 1 dynamic dereference) is slower 8744 team_size = __kmp_get_team_num_threads(global_tid); 8745 if (team_size == 1) { 8746 8747 retval = empty_reduce_block; 8748 8749 } else { 8750 8751 int atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED; 8752 8753 #if KMP_ARCH_X86_64 || KMP_ARCH_PPC64 || KMP_ARCH_AARCH64 || \ 8754 KMP_ARCH_MIPS64 || KMP_ARCH_RISCV64 8755 8756 #if KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD || \ 8757 KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN || KMP_OS_HURD 8758 8759 int teamsize_cutoff = 4; 8760 8761 #if KMP_MIC_SUPPORTED 8762 if (__kmp_mic_type != non_mic) { 8763 teamsize_cutoff = 8; 8764 } 8765 #endif 8766 int tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED; 8767 if (tree_available) { 8768 if (team_size <= teamsize_cutoff) { 8769 if (atomic_available) { 8770 retval = atomic_reduce_block; 8771 } 8772 } else { 8773 retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER; 8774 } 8775 } else if (atomic_available) { 8776 retval = atomic_reduce_block; 8777 } 8778 #else 8779 #error "Unknown or unsupported OS" 8780 #endif // KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD || 8781 // KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN || KMP_OS_HURD 8782 8783 #elif KMP_ARCH_X86 || KMP_ARCH_ARM || KMP_ARCH_AARCH || KMP_ARCH_MIPS 8784 8785 #if KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_WINDOWS || KMP_OS_HURD 8786 8787 // basic tuning 8788 8789 if (atomic_available) { 8790 if (num_vars <= 2) { // && ( team_size <= 8 ) due to false-sharing ??? 
8791 retval = atomic_reduce_block; 8792 } 8793 } // otherwise: use critical section 8794 8795 #elif KMP_OS_DARWIN 8796 8797 int tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED; 8798 if (atomic_available && (num_vars <= 3)) { 8799 retval = atomic_reduce_block; 8800 } else if (tree_available) { 8801 if ((reduce_size > (9 * sizeof(kmp_real64))) && 8802 (reduce_size < (2000 * sizeof(kmp_real64)))) { 8803 retval = TREE_REDUCE_BLOCK_WITH_PLAIN_BARRIER; 8804 } 8805 } // otherwise: use critical section 8806 8807 #else 8808 #error "Unknown or unsupported OS" 8809 #endif 8810 8811 #else 8812 #error "Unknown or unsupported architecture" 8813 #endif 8814 } 8815 8816 // KMP_FORCE_REDUCTION 8817 8818 // If the team is serialized (team_size == 1), ignore the forced reduction 8819 // method and stay with the unsynchronized method (empty_reduce_block) 8820 if (__kmp_force_reduction_method != reduction_method_not_defined && 8821 team_size != 1) { 8822 8823 PACKED_REDUCTION_METHOD_T forced_retval = critical_reduce_block; 8824 8825 int atomic_available, tree_available; 8826 8827 switch ((forced_retval = __kmp_force_reduction_method)) { 8828 case critical_reduce_block: 8829 KMP_ASSERT(lck); // lck should be != 0 8830 break; 8831 8832 case atomic_reduce_block: 8833 atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED; 8834 if (!atomic_available) { 8835 KMP_WARNING(RedMethodNotSupported, "atomic"); 8836 forced_retval = critical_reduce_block; 8837 } 8838 break; 8839 8840 case tree_reduce_block: 8841 tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED; 8842 if (!tree_available) { 8843 KMP_WARNING(RedMethodNotSupported, "tree"); 8844 forced_retval = critical_reduce_block; 8845 } else { 8846 #if KMP_FAST_REDUCTION_BARRIER 8847 forced_retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER; 8848 #endif 8849 } 8850 break; 8851 8852 default: 8853 KMP_ASSERT(0); // "unsupported method specified" 8854 } 8855 8856 retval = forced_retval; 8857 } 8858 8859 KA_TRACE(10, ("reduction method selected=%08x\n", retval)); 8860 8861 #undef FAST_REDUCTION_TREE_METHOD_GENERATED 8862 #undef FAST_REDUCTION_ATOMIC_METHOD_GENERATED 8863 8864 return (retval); 8865 } 8866 // this function is for testing set/get/determine reduce method 8867 kmp_int32 __kmp_get_reduce_method(void) { 8868 return ((__kmp_entry_thread()->th.th_local.packed_reduction_method) >> 8); 8869 } 8870 8871 // Soft pause sets up threads to ignore blocktime and just go to sleep. 8872 // Spin-wait code checks __kmp_pause_status and reacts accordingly. 8873 void __kmp_soft_pause() { __kmp_pause_status = kmp_soft_paused; } 8874 8875 // Hard pause shuts down the runtime completely. Resume happens naturally when 8876 // OpenMP is used subsequently. 8877 void __kmp_hard_pause() { 8878 __kmp_pause_status = kmp_hard_paused; 8879 __kmp_internal_end_thread(-1); 8880 } 8881 8882 // Soft resume sets __kmp_pause_status, and wakes up all threads. 
8883 void __kmp_resume_if_soft_paused() { 8884 if (__kmp_pause_status == kmp_soft_paused) { 8885 __kmp_pause_status = kmp_not_paused; 8886 8887 for (int gtid = 1; gtid < __kmp_threads_capacity; ++gtid) { 8888 kmp_info_t *thread = __kmp_threads[gtid]; 8889 if (thread) { // Wake it if sleeping 8890 kmp_flag_64<> fl(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go, 8891 thread); 8892 if (fl.is_sleeping()) 8893 fl.resume(gtid); 8894 else if (__kmp_try_suspend_mx(thread)) { // got suspend lock 8895 __kmp_unlock_suspend_mx(thread); // unlock it; it won't sleep 8896 } else { // thread holds the lock and may sleep soon 8897 do { // until either the thread sleeps, or we can get the lock 8898 if (fl.is_sleeping()) { 8899 fl.resume(gtid); 8900 break; 8901 } else if (__kmp_try_suspend_mx(thread)) { 8902 __kmp_unlock_suspend_mx(thread); 8903 break; 8904 } 8905 } while (1); 8906 } 8907 } 8908 } 8909 } 8910 } 8911 8912 // This function is called via __kmpc_pause_resource. Returns 0 if successful. 8913 // TODO: add warning messages 8914 int __kmp_pause_resource(kmp_pause_status_t level) { 8915 if (level == kmp_not_paused) { // requesting resume 8916 if (__kmp_pause_status == kmp_not_paused) { 8917 // error message about runtime not being paused, so can't resume 8918 return 1; 8919 } else { 8920 KMP_DEBUG_ASSERT(__kmp_pause_status == kmp_soft_paused || 8921 __kmp_pause_status == kmp_hard_paused); 8922 __kmp_pause_status = kmp_not_paused; 8923 return 0; 8924 } 8925 } else if (level == kmp_soft_paused) { // requesting soft pause 8926 if (__kmp_pause_status != kmp_not_paused) { 8927 // error message about already being paused 8928 return 1; 8929 } else { 8930 __kmp_soft_pause(); 8931 return 0; 8932 } 8933 } else if (level == kmp_hard_paused) { // requesting hard pause 8934 if (__kmp_pause_status != kmp_not_paused) { 8935 // error message about already being paused 8936 return 1; 8937 } else { 8938 __kmp_hard_pause(); 8939 return 0; 8940 } 8941 } else { 8942 // error message about invalid level 8943 return 1; 8944 } 8945 } 8946 8947 void __kmp_omp_display_env(int verbose) { 8948 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock); 8949 if (__kmp_init_serial == 0) 8950 __kmp_do_serial_initialize(); 8951 __kmp_display_env_impl(!verbose, verbose); 8952 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 8953 } 8954 8955 // The team size is changing, so distributed barrier must be modified 8956 void __kmp_resize_dist_barrier(kmp_team_t *team, int old_nthreads, 8957 int new_nthreads) { 8958 KMP_DEBUG_ASSERT(__kmp_barrier_release_pattern[bs_forkjoin_barrier] == 8959 bp_dist_bar); 8960 kmp_info_t **other_threads = team->t.t_threads; 8961 8962 // We want all the workers to stop waiting on the barrier while we adjust the 8963 // size of the team. 
8964 for (int f = 1; f < old_nthreads; ++f) { 8965 KMP_DEBUG_ASSERT(other_threads[f] != NULL); 8966 // Ignore threads that are already inactive or not present in the team 8967 if (team->t.t_threads[f]->th.th_used_in_team.load() == 0) { 8968 // teams construct causes thread_limit to get passed in, and some of 8969 // those could be inactive; just ignore them 8970 continue; 8971 } 8972 // If thread is transitioning still to in_use state, wait for it 8973 if (team->t.t_threads[f]->th.th_used_in_team.load() == 3) { 8974 while (team->t.t_threads[f]->th.th_used_in_team.load() == 3) 8975 KMP_CPU_PAUSE(); 8976 } 8977 // The thread should be in_use now 8978 KMP_DEBUG_ASSERT(team->t.t_threads[f]->th.th_used_in_team.load() == 1); 8979 // Transition to unused state 8980 team->t.t_threads[f]->th.th_used_in_team.store(2); 8981 KMP_DEBUG_ASSERT(team->t.t_threads[f]->th.th_used_in_team.load() == 2); 8982 } 8983 // Release all the workers 8984 team->t.b->go_release(); 8985 8986 KMP_MFENCE(); 8987 8988 // Workers should see transition status 2 and move to 0; but may need to be 8989 // woken up first 8990 int count = old_nthreads - 1; 8991 while (count > 0) { 8992 count = old_nthreads - 1; 8993 for (int f = 1; f < old_nthreads; ++f) { 8994 if (other_threads[f]->th.th_used_in_team.load() != 0) { 8995 if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) { // Wake up the workers 8996 kmp_atomic_flag_64<> *flag = (kmp_atomic_flag_64<> *)CCAST( 8997 void *, other_threads[f]->th.th_sleep_loc); 8998 __kmp_atomic_resume_64(other_threads[f]->th.th_info.ds.ds_gtid, flag); 8999 } 9000 } else { 9001 KMP_DEBUG_ASSERT(team->t.t_threads[f]->th.th_used_in_team.load() == 0); 9002 count--; 9003 } 9004 } 9005 } 9006 // Now update the barrier size 9007 team->t.b->update_num_threads(new_nthreads); 9008 team->t.b->go_reset(); 9009 } 9010 9011 void __kmp_add_threads_to_team(kmp_team_t *team, int new_nthreads) { 9012 // Add the threads back to the team 9013 KMP_DEBUG_ASSERT(team); 9014 // Threads were paused and pointed at th_used_in_team temporarily during a 9015 // resize of the team. We're going to set th_used_in_team to 3 to indicate to 9016 // the thread that it should transition itself back into the team. Then, if 9017 // blocktime isn't infinite, the thread could be sleeping, so we send a resume 9018 // to wake it up. 9019 for (int f = 1; f < new_nthreads; ++f) { 9020 KMP_DEBUG_ASSERT(team->t.t_threads[f]); 9021 KMP_COMPARE_AND_STORE_ACQ32(&(team->t.t_threads[f]->th.th_used_in_team), 0, 9022 3); 9023 if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) { // Wake up sleeping threads 9024 __kmp_resume_32(team->t.t_threads[f]->th.th_info.ds.ds_gtid, 9025 (kmp_flag_32<false, false> *)NULL); 9026 } 9027 } 9028 // The threads should be transitioning to the team; when they are done, they 9029 // should have set th_used_in_team to 1. This loop forces master to wait until 9030 // all threads have moved into the team and are waiting in the barrier. 
9031 int count = new_nthreads - 1; 9032 while (count > 0) { 9033 count = new_nthreads - 1; 9034 for (int f = 1; f < new_nthreads; ++f) { 9035 if (team->t.t_threads[f]->th.th_used_in_team.load() == 1) { 9036 count--; 9037 } 9038 } 9039 } 9040 } 9041 9042 // Globals and functions for hidden helper task 9043 kmp_info_t **__kmp_hidden_helper_threads; 9044 kmp_info_t *__kmp_hidden_helper_main_thread; 9045 std::atomic<kmp_int32> __kmp_unexecuted_hidden_helper_tasks; 9046 #if KMP_OS_LINUX 9047 kmp_int32 __kmp_hidden_helper_threads_num = 8; 9048 kmp_int32 __kmp_enable_hidden_helper = TRUE; 9049 #else 9050 kmp_int32 __kmp_hidden_helper_threads_num = 0; 9051 kmp_int32 __kmp_enable_hidden_helper = FALSE; 9052 #endif 9053 9054 namespace { 9055 std::atomic<kmp_int32> __kmp_hit_hidden_helper_threads_num; 9056 9057 void __kmp_hidden_helper_wrapper_fn(int *gtid, int *, ...) { 9058 // This is an explicit synchronization across all hidden helper threads, in 9059 // case a regular thread pushes a hidden helper task to a hidden helper 9060 // thread that has not been awakened even once since the helpers were 9061 // released by the main thread after creating the team. 9062 KMP_ATOMIC_INC(&__kmp_hit_hidden_helper_threads_num); 9063 while (KMP_ATOMIC_LD_ACQ(&__kmp_hit_hidden_helper_threads_num) != 9064 __kmp_hidden_helper_threads_num) 9065 ; 9066 9067 // If main thread, then wait for signal 9068 if (__kmpc_master(nullptr, *gtid)) { 9069 // First, unset the initial state and release the initial thread 9070 TCW_4(__kmp_init_hidden_helper_threads, FALSE); 9071 __kmp_hidden_helper_initz_release(); 9072 __kmp_hidden_helper_main_thread_wait(); 9073 // Now wake up all worker threads 9074 for (int i = 1; i < __kmp_hit_hidden_helper_threads_num; ++i) { 9075 __kmp_hidden_helper_worker_thread_signal(); 9076 } 9077 } 9078 } 9079 } // namespace 9080 9081 void __kmp_hidden_helper_threads_initz_routine() { 9082 // Create a new root for hidden helper team/threads 9083 const int gtid = __kmp_register_root(TRUE); 9084 __kmp_hidden_helper_main_thread = __kmp_threads[gtid]; 9085 __kmp_hidden_helper_threads = &__kmp_threads[gtid]; 9086 __kmp_hidden_helper_main_thread->th.th_set_nproc = 9087 __kmp_hidden_helper_threads_num; 9088 9089 KMP_ATOMIC_ST_REL(&__kmp_hit_hidden_helper_threads_num, 0); 9090 9091 __kmpc_fork_call(nullptr, 0, __kmp_hidden_helper_wrapper_fn); 9092 9093 // Set the initialization flag to FALSE 9094 TCW_SYNC_4(__kmp_init_hidden_helper, FALSE); 9095 9096 __kmp_hidden_helper_threads_deinitz_release(); 9097 } 9098 9099 /* Nesting Mode: 9100 Set via KMP_NESTING_MODE, which takes an integer. 9101 Note: we skip duplicate topology levels, and skip levels with only 9102 one entity. 9103 KMP_NESTING_MODE=0 is the default, and doesn't use nesting mode. 9104 KMP_NESTING_MODE=1 sets as many nesting levels as there are distinct levels 9105 in the topology, and initializes the number of threads at each of those 9106 levels to the number of entities at each level, respectively, below the 9107 entity at the parent level. 9108 KMP_NESTING_MODE=N, where N>1, attempts to create up to N nesting levels, 9109 but starts with nesting OFF -- max-active-levels-var is 1 -- and requires 9110 the user to turn nesting on explicitly. This is an even more experimental 9111 variant of this experimental feature, and may change or go away in the 9112 future. 
9113 */ 9114 9115 // Allocate space to store nesting levels 9116 void __kmp_init_nesting_mode() { 9117 int levels = KMP_HW_LAST; 9118 __kmp_nesting_mode_nlevels = levels; 9119 __kmp_nesting_nth_level = (int *)KMP_INTERNAL_MALLOC(levels * sizeof(int)); 9120 for (int i = 0; i < levels; ++i) 9121 __kmp_nesting_nth_level[i] = 0; 9122 if (__kmp_nested_nth.size < levels) { 9123 __kmp_nested_nth.nth = 9124 (int *)KMP_INTERNAL_REALLOC(__kmp_nested_nth.nth, levels * sizeof(int)); 9125 __kmp_nested_nth.size = levels; 9126 } 9127 } 9128 9129 // Set # threads for top levels of nesting; must be called after topology set 9130 void __kmp_set_nesting_mode_threads() { 9131 kmp_info_t *thread = __kmp_threads[__kmp_entry_gtid()]; 9132 9133 if (__kmp_nesting_mode == 1) 9134 __kmp_nesting_mode_nlevels = KMP_MAX_ACTIVE_LEVELS_LIMIT; 9135 else if (__kmp_nesting_mode > 1) 9136 __kmp_nesting_mode_nlevels = __kmp_nesting_mode; 9137 9138 if (__kmp_topology) { // use topology info 9139 int loc, hw_level; 9140 for (loc = 0, hw_level = 0; hw_level < __kmp_topology->get_depth() && 9141 loc < __kmp_nesting_mode_nlevels; 9142 loc++, hw_level++) { 9143 __kmp_nesting_nth_level[loc] = __kmp_topology->get_ratio(hw_level); 9144 if (__kmp_nesting_nth_level[loc] == 1) 9145 loc--; 9146 } 9147 // Make sure all cores are used 9148 if (__kmp_nesting_mode > 1 && loc > 1) { 9149 int core_level = __kmp_topology->get_level(KMP_HW_CORE); 9150 int num_cores = __kmp_topology->get_count(core_level); 9151 int upper_levels = 1; 9152 for (int level = 0; level < loc - 1; ++level) 9153 upper_levels *= __kmp_nesting_nth_level[level]; 9154 if (upper_levels * __kmp_nesting_nth_level[loc - 1] < num_cores) 9155 __kmp_nesting_nth_level[loc - 1] = 9156 num_cores / __kmp_nesting_nth_level[loc - 2]; 9157 } 9158 __kmp_nesting_mode_nlevels = loc; 9159 __kmp_nested_nth.used = __kmp_nesting_mode_nlevels; 9160 } else { // no topology info available; provide a reasonable guesstimation 9161 if (__kmp_avail_proc >= 4) { 9162 __kmp_nesting_nth_level[0] = __kmp_avail_proc / 2; 9163 __kmp_nesting_nth_level[1] = 2; 9164 __kmp_nesting_mode_nlevels = 2; 9165 } else { 9166 __kmp_nesting_nth_level[0] = __kmp_avail_proc; 9167 __kmp_nesting_mode_nlevels = 1; 9168 } 9169 __kmp_nested_nth.used = __kmp_nesting_mode_nlevels; 9170 } 9171 for (int i = 0; i < __kmp_nesting_mode_nlevels; ++i) { 9172 __kmp_nested_nth.nth[i] = __kmp_nesting_nth_level[i]; 9173 } 9174 set__nproc(thread, __kmp_nesting_nth_level[0]); 9175 if (__kmp_nesting_mode > 1 && __kmp_nesting_mode_nlevels > __kmp_nesting_mode) 9176 __kmp_nesting_mode_nlevels = __kmp_nesting_mode; 9177 if (get__max_active_levels(thread) > 1) { 9178 // if max levels was set, set nesting mode levels to same 9179 __kmp_nesting_mode_nlevels = get__max_active_levels(thread); 9180 } 9181 if (__kmp_nesting_mode == 1) // turn on nesting for this case only 9182 set__max_active_levels(thread, __kmp_nesting_mode_nlevels); 9183 } 9184