1 /* 2 * kmp_runtime.cpp -- KPTS runtime support library 3 */ 4 5 //===----------------------------------------------------------------------===// 6 // 7 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 8 // See https://llvm.org/LICENSE.txt for license information. 9 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 10 // 11 //===----------------------------------------------------------------------===// 12 13 #include "kmp.h" 14 #include "kmp_affinity.h" 15 #include "kmp_atomic.h" 16 #include "kmp_environment.h" 17 #include "kmp_error.h" 18 #include "kmp_i18n.h" 19 #include "kmp_io.h" 20 #include "kmp_itt.h" 21 #include "kmp_settings.h" 22 #include "kmp_stats.h" 23 #include "kmp_str.h" 24 #include "kmp_wait_release.h" 25 #include "kmp_wrapper_getpid.h" 26 #include "kmp_dispatch.h" 27 #if KMP_USE_HIER_SCHED 28 #include "kmp_dispatch_hier.h" 29 #endif 30 31 #if OMPT_SUPPORT 32 #include "ompt-specific.h" 33 #endif 34 #if OMPD_SUPPORT 35 #include "ompd-specific.h" 36 #endif 37 38 #if OMP_PROFILING_SUPPORT 39 #include "llvm/Support/TimeProfiler.h" 40 static char *ProfileTraceFile = nullptr; 41 #endif 42 43 /* these are temporary issues to be dealt with */ 44 #define KMP_USE_PRCTL 0 45 46 #if KMP_OS_WINDOWS 47 #include <process.h> 48 #endif 49 50 #if KMP_OS_WINDOWS 51 // windows does not need include files as it doesn't use shared memory 52 #else 53 #include <sys/mman.h> 54 #include <sys/stat.h> 55 #include <fcntl.h> 56 #define SHM_SIZE 1024 57 #endif 58 59 #if defined(KMP_GOMP_COMPAT) 60 char const __kmp_version_alt_comp[] = 61 KMP_VERSION_PREFIX "alternative compiler support: yes"; 62 #endif /* defined(KMP_GOMP_COMPAT) */ 63 64 char const __kmp_version_omp_api[] = 65 KMP_VERSION_PREFIX "API version: 5.0 (201611)"; 66 67 #ifdef KMP_DEBUG 68 char const __kmp_version_lock[] = 69 KMP_VERSION_PREFIX "lock type: run time selectable"; 70 #endif /* KMP_DEBUG */ 71 72 #define KMP_MIN(x, y) ((x) < (y) ? (x) : (y)) 73 74 /* ------------------------------------------------------------------------ */ 75 76 #if KMP_USE_MONITOR 77 kmp_info_t __kmp_monitor; 78 #endif 79 80 /* Forward declarations */ 81 82 void __kmp_cleanup(void); 83 84 static void __kmp_initialize_info(kmp_info_t *, kmp_team_t *, int tid, 85 int gtid); 86 static void __kmp_initialize_team(kmp_team_t *team, int new_nproc, 87 kmp_internal_control_t *new_icvs, 88 ident_t *loc); 89 #if KMP_AFFINITY_SUPPORTED 90 static void __kmp_partition_places(kmp_team_t *team, 91 int update_master_only = 0); 92 #endif 93 static void __kmp_do_serial_initialize(void); 94 void __kmp_fork_barrier(int gtid, int tid); 95 void __kmp_join_barrier(int gtid); 96 void __kmp_setup_icv_copy(kmp_team_t *team, int new_nproc, 97 kmp_internal_control_t *new_icvs, ident_t *loc); 98 99 #ifdef USE_LOAD_BALANCE 100 static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc); 101 #endif 102 103 static int __kmp_expand_threads(int nNeed); 104 #if KMP_OS_WINDOWS 105 static int __kmp_unregister_root_other_thread(int gtid); 106 #endif 107 static void __kmp_reap_thread(kmp_info_t *thread, int is_root); 108 kmp_info_t *__kmp_thread_pool_insert_pt = NULL; 109 110 void __kmp_resize_dist_barrier(kmp_team_t *team, int old_nthreads, 111 int new_nthreads); 112 void __kmp_add_threads_to_team(kmp_team_t *team, int new_nthreads); 113 114 /* Calculate the identifier of the current thread */ 115 /* fast (and somewhat portable) way to get unique identifier of executing 116 thread. Returns KMP_GTID_DNE if we haven't been assigned a gtid. 
*/ 117 int __kmp_get_global_thread_id() { 118 int i; 119 kmp_info_t **other_threads; 120 size_t stack_data; 121 char *stack_addr; 122 size_t stack_size; 123 char *stack_base; 124 125 KA_TRACE( 126 1000, 127 ("*** __kmp_get_global_thread_id: entering, nproc=%d all_nproc=%d\n", 128 __kmp_nth, __kmp_all_nth)); 129 130 /* JPH - to handle the case where __kmpc_end(0) is called immediately prior to 131 a parallel region, made it return KMP_GTID_DNE to force serial_initialize 132 by caller. Had to handle KMP_GTID_DNE at all call-sites, or else guarantee 133 __kmp_init_gtid for this to work. */ 134 135 if (!TCR_4(__kmp_init_gtid)) 136 return KMP_GTID_DNE; 137 138 #ifdef KMP_TDATA_GTID 139 if (TCR_4(__kmp_gtid_mode) >= 3) { 140 KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using TDATA\n")); 141 return __kmp_gtid; 142 } 143 #endif 144 if (TCR_4(__kmp_gtid_mode) >= 2) { 145 KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using keyed TLS\n")); 146 return __kmp_gtid_get_specific(); 147 } 148 KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using internal alg.\n")); 149 150 stack_addr = (char *)&stack_data; 151 other_threads = __kmp_threads; 152 153 /* ATT: The code below is a source of potential bugs due to unsynchronized 154 access to __kmp_threads array. For example: 155 1. Current thread loads other_threads[i] to thr and checks it, it is 156 non-NULL. 157 2. Current thread is suspended by OS. 158 3. Another thread unregisters and finishes (debug versions of free() 159 may fill memory with something like 0xEF). 160 4. Current thread is resumed. 161 5. Current thread reads junk from *thr. 162 TODO: Fix it. --ln */ 163 164 for (i = 0; i < __kmp_threads_capacity; i++) { 165 166 kmp_info_t *thr = (kmp_info_t *)TCR_SYNC_PTR(other_threads[i]); 167 if (!thr) 168 continue; 169 170 stack_size = (size_t)TCR_PTR(thr->th.th_info.ds.ds_stacksize); 171 stack_base = (char *)TCR_PTR(thr->th.th_info.ds.ds_stackbase); 172 173 /* stack grows down -- search through all of the active threads */ 174 175 if (stack_addr <= stack_base) { 176 size_t stack_diff = stack_base - stack_addr; 177 178 if (stack_diff <= stack_size) { 179 /* The only way we can be closer than the allocated */ 180 /* stack size is if we are running on this thread. */ 181 KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() == i); 182 return i; 183 } 184 } 185 } 186 187 /* get specific to try and determine our gtid */ 188 KA_TRACE(1000, 189 ("*** __kmp_get_global_thread_id: internal alg. 
failed to find " 190 "thread, using TLS\n")); 191 i = __kmp_gtid_get_specific(); 192 193 /*fprintf( stderr, "=== %d\n", i ); */ /* GROO */ 194 195 /* if we havn't been assigned a gtid, then return code */ 196 if (i < 0) 197 return i; 198 199 /* dynamically updated stack window for uber threads to avoid get_specific 200 call */ 201 if (!TCR_4(other_threads[i]->th.th_info.ds.ds_stackgrow)) { 202 KMP_FATAL(StackOverflow, i); 203 } 204 205 stack_base = (char *)other_threads[i]->th.th_info.ds.ds_stackbase; 206 if (stack_addr > stack_base) { 207 TCW_PTR(other_threads[i]->th.th_info.ds.ds_stackbase, stack_addr); 208 TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize, 209 other_threads[i]->th.th_info.ds.ds_stacksize + stack_addr - 210 stack_base); 211 } else { 212 TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize, 213 stack_base - stack_addr); 214 } 215 216 /* Reprint stack bounds for ubermaster since they have been refined */ 217 if (__kmp_storage_map) { 218 char *stack_end = (char *)other_threads[i]->th.th_info.ds.ds_stackbase; 219 char *stack_beg = stack_end - other_threads[i]->th.th_info.ds.ds_stacksize; 220 __kmp_print_storage_map_gtid(i, stack_beg, stack_end, 221 other_threads[i]->th.th_info.ds.ds_stacksize, 222 "th_%d stack (refinement)", i); 223 } 224 return i; 225 } 226 227 int __kmp_get_global_thread_id_reg() { 228 int gtid; 229 230 if (!__kmp_init_serial) { 231 gtid = KMP_GTID_DNE; 232 } else 233 #ifdef KMP_TDATA_GTID 234 if (TCR_4(__kmp_gtid_mode) >= 3) { 235 KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using TDATA\n")); 236 gtid = __kmp_gtid; 237 } else 238 #endif 239 if (TCR_4(__kmp_gtid_mode) >= 2) { 240 KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using keyed TLS\n")); 241 gtid = __kmp_gtid_get_specific(); 242 } else { 243 KA_TRACE(1000, 244 ("*** __kmp_get_global_thread_id_reg: using internal alg.\n")); 245 gtid = __kmp_get_global_thread_id(); 246 } 247 248 /* we must be a new uber master sibling thread */ 249 if (gtid == KMP_GTID_DNE) { 250 KA_TRACE(10, 251 ("__kmp_get_global_thread_id_reg: Encountered new root thread. " 252 "Registering a new gtid.\n")); 253 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock); 254 if (!__kmp_init_serial) { 255 __kmp_do_serial_initialize(); 256 gtid = __kmp_gtid_get_specific(); 257 } else { 258 gtid = __kmp_register_root(FALSE); 259 } 260 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 261 /*__kmp_printf( "+++ %d\n", gtid ); */ /* GROO */ 262 } 263 264 KMP_DEBUG_ASSERT(gtid >= 0); 265 266 return gtid; 267 } 268 269 /* caller must hold forkjoin_lock */ 270 void __kmp_check_stack_overlap(kmp_info_t *th) { 271 int f; 272 char *stack_beg = NULL; 273 char *stack_end = NULL; 274 int gtid; 275 276 KA_TRACE(10, ("__kmp_check_stack_overlap: called\n")); 277 if (__kmp_storage_map) { 278 stack_end = (char *)th->th.th_info.ds.ds_stackbase; 279 stack_beg = stack_end - th->th.th_info.ds.ds_stacksize; 280 281 gtid = __kmp_gtid_from_thread(th); 282 283 if (gtid == KMP_GTID_MONITOR) { 284 __kmp_print_storage_map_gtid( 285 gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize, 286 "th_%s stack (%s)", "mon", 287 (th->th.th_info.ds.ds_stackgrow) ? "initial" : "actual"); 288 } else { 289 __kmp_print_storage_map_gtid( 290 gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize, 291 "th_%d stack (%s)", gtid, 292 (th->th.th_info.ds.ds_stackgrow) ? 
"initial" : "actual"); 293 } 294 } 295 296 /* No point in checking ubermaster threads since they use refinement and 297 * cannot overlap */ 298 gtid = __kmp_gtid_from_thread(th); 299 if (__kmp_env_checks == TRUE && !KMP_UBER_GTID(gtid)) { 300 KA_TRACE(10, 301 ("__kmp_check_stack_overlap: performing extensive checking\n")); 302 if (stack_beg == NULL) { 303 stack_end = (char *)th->th.th_info.ds.ds_stackbase; 304 stack_beg = stack_end - th->th.th_info.ds.ds_stacksize; 305 } 306 307 for (f = 0; f < __kmp_threads_capacity; f++) { 308 kmp_info_t *f_th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[f]); 309 310 if (f_th && f_th != th) { 311 char *other_stack_end = 312 (char *)TCR_PTR(f_th->th.th_info.ds.ds_stackbase); 313 char *other_stack_beg = 314 other_stack_end - (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize); 315 if ((stack_beg > other_stack_beg && stack_beg < other_stack_end) || 316 (stack_end > other_stack_beg && stack_end < other_stack_end)) { 317 318 /* Print the other stack values before the abort */ 319 if (__kmp_storage_map) 320 __kmp_print_storage_map_gtid( 321 -1, other_stack_beg, other_stack_end, 322 (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize), 323 "th_%d stack (overlapped)", __kmp_gtid_from_thread(f_th)); 324 325 __kmp_fatal(KMP_MSG(StackOverlap), KMP_HNT(ChangeStackLimit), 326 __kmp_msg_null); 327 } 328 } 329 } 330 } 331 KA_TRACE(10, ("__kmp_check_stack_overlap: returning\n")); 332 } 333 334 /* ------------------------------------------------------------------------ */ 335 336 void __kmp_infinite_loop(void) { 337 static int done = FALSE; 338 339 while (!done) { 340 KMP_YIELD(TRUE); 341 } 342 } 343 344 #define MAX_MESSAGE 512 345 346 void __kmp_print_storage_map_gtid(int gtid, void *p1, void *p2, size_t size, 347 char const *format, ...) { 348 char buffer[MAX_MESSAGE]; 349 va_list ap; 350 351 va_start(ap, format); 352 KMP_SNPRINTF(buffer, sizeof(buffer), "OMP storage map: %p %p%8lu %s\n", p1, 353 p2, (unsigned long)size, format); 354 __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock); 355 __kmp_vprintf(kmp_err, buffer, ap); 356 #if KMP_PRINT_DATA_PLACEMENT 357 int node; 358 if (gtid >= 0) { 359 if (p1 <= p2 && (char *)p2 - (char *)p1 == size) { 360 if (__kmp_storage_map_verbose) { 361 node = __kmp_get_host_node(p1); 362 if (node < 0) /* doesn't work, so don't try this next time */ 363 __kmp_storage_map_verbose = FALSE; 364 else { 365 char *last; 366 int lastNode; 367 int localProc = __kmp_get_cpu_from_gtid(gtid); 368 369 const int page_size = KMP_GET_PAGE_SIZE(); 370 371 p1 = (void *)((size_t)p1 & ~((size_t)page_size - 1)); 372 p2 = (void *)(((size_t)p2 - 1) & ~((size_t)page_size - 1)); 373 if (localProc >= 0) 374 __kmp_printf_no_lock(" GTID %d localNode %d\n", gtid, 375 localProc >> 1); 376 else 377 __kmp_printf_no_lock(" GTID %d\n", gtid); 378 #if KMP_USE_PRCTL 379 /* The more elaborate format is disabled for now because of the prctl 380 * hanging bug. */ 381 do { 382 last = p1; 383 lastNode = node; 384 /* This loop collates adjacent pages with the same host node. 
*/ 385 do { 386 (char *)p1 += page_size; 387 } while (p1 <= p2 && (node = __kmp_get_host_node(p1)) == lastNode); 388 __kmp_printf_no_lock(" %p-%p memNode %d\n", last, (char *)p1 - 1, 389 lastNode); 390 } while (p1 <= p2); 391 #else 392 __kmp_printf_no_lock(" %p-%p memNode %d\n", p1, 393 (char *)p1 + (page_size - 1), 394 __kmp_get_host_node(p1)); 395 if (p1 < p2) { 396 __kmp_printf_no_lock(" %p-%p memNode %d\n", p2, 397 (char *)p2 + (page_size - 1), 398 __kmp_get_host_node(p2)); 399 } 400 #endif 401 } 402 } 403 } else 404 __kmp_printf_no_lock(" %s\n", KMP_I18N_STR(StorageMapWarning)); 405 } 406 #endif /* KMP_PRINT_DATA_PLACEMENT */ 407 __kmp_release_bootstrap_lock(&__kmp_stdio_lock); 408 409 va_end(ap); 410 } 411 412 void __kmp_warn(char const *format, ...) { 413 char buffer[MAX_MESSAGE]; 414 va_list ap; 415 416 if (__kmp_generate_warnings == kmp_warnings_off) { 417 return; 418 } 419 420 va_start(ap, format); 421 422 KMP_SNPRINTF(buffer, sizeof(buffer), "OMP warning: %s\n", format); 423 __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock); 424 __kmp_vprintf(kmp_err, buffer, ap); 425 __kmp_release_bootstrap_lock(&__kmp_stdio_lock); 426 427 va_end(ap); 428 } 429 430 void __kmp_abort_process() { 431 // Later threads may stall here, but that's ok because abort() will kill them. 432 __kmp_acquire_bootstrap_lock(&__kmp_exit_lock); 433 434 if (__kmp_debug_buf) { 435 __kmp_dump_debug_buffer(); 436 } 437 438 if (KMP_OS_WINDOWS) { 439 // Let other threads know of abnormal termination and prevent deadlock 440 // if abort happened during library initialization or shutdown 441 __kmp_global.g.g_abort = SIGABRT; 442 443 /* On Windows* OS by default abort() causes pop-up error box, which stalls 444 nightly testing. Unfortunately, we cannot reliably suppress pop-up error 445 boxes. _set_abort_behavior() works well, but this function is not 446 available in VS7 (this is not problem for DLL, but it is a problem for 447 static OpenMP RTL). SetErrorMode (and so, timelimit utility) does not 448 help, at least in some versions of MS C RTL. 449 450 It seems following sequence is the only way to simulate abort() and 451 avoid pop-up error box. */ 452 raise(SIGABRT); 453 _exit(3); // Just in case, if signal ignored, exit anyway. 454 } else { 455 __kmp_unregister_library(); 456 abort(); 457 } 458 459 __kmp_infinite_loop(); 460 __kmp_release_bootstrap_lock(&__kmp_exit_lock); 461 462 } // __kmp_abort_process 463 464 void __kmp_abort_thread(void) { 465 // TODO: Eliminate g_abort global variable and this function. 466 // In case of abort just call abort(), it will kill all the threads. 467 __kmp_infinite_loop(); 468 } // __kmp_abort_thread 469 470 /* Print out the storage map for the major kmp_info_t thread data structures 471 that are allocated together. 
*/ 472 473 static void __kmp_print_thread_storage_map(kmp_info_t *thr, int gtid) { 474 __kmp_print_storage_map_gtid(gtid, thr, thr + 1, sizeof(kmp_info_t), "th_%d", 475 gtid); 476 477 __kmp_print_storage_map_gtid(gtid, &thr->th.th_info, &thr->th.th_team, 478 sizeof(kmp_desc_t), "th_%d.th_info", gtid); 479 480 __kmp_print_storage_map_gtid(gtid, &thr->th.th_local, &thr->th.th_pri_head, 481 sizeof(kmp_local_t), "th_%d.th_local", gtid); 482 483 __kmp_print_storage_map_gtid( 484 gtid, &thr->th.th_bar[0], &thr->th.th_bar[bs_last_barrier], 485 sizeof(kmp_balign_t) * bs_last_barrier, "th_%d.th_bar", gtid); 486 487 __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_plain_barrier], 488 &thr->th.th_bar[bs_plain_barrier + 1], 489 sizeof(kmp_balign_t), "th_%d.th_bar[plain]", 490 gtid); 491 492 __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_forkjoin_barrier], 493 &thr->th.th_bar[bs_forkjoin_barrier + 1], 494 sizeof(kmp_balign_t), "th_%d.th_bar[forkjoin]", 495 gtid); 496 497 #if KMP_FAST_REDUCTION_BARRIER 498 __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_reduction_barrier], 499 &thr->th.th_bar[bs_reduction_barrier + 1], 500 sizeof(kmp_balign_t), "th_%d.th_bar[reduction]", 501 gtid); 502 #endif // KMP_FAST_REDUCTION_BARRIER 503 } 504 505 /* Print out the storage map for the major kmp_team_t team data structures 506 that are allocated together. */ 507 508 static void __kmp_print_team_storage_map(const char *header, kmp_team_t *team, 509 int team_id, int num_thr) { 510 int num_disp_buff = team->t.t_max_nproc > 1 ? __kmp_dispatch_num_buffers : 2; 511 __kmp_print_storage_map_gtid(-1, team, team + 1, sizeof(kmp_team_t), "%s_%d", 512 header, team_id); 513 514 __kmp_print_storage_map_gtid(-1, &team->t.t_bar[0], 515 &team->t.t_bar[bs_last_barrier], 516 sizeof(kmp_balign_team_t) * bs_last_barrier, 517 "%s_%d.t_bar", header, team_id); 518 519 __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_plain_barrier], 520 &team->t.t_bar[bs_plain_barrier + 1], 521 sizeof(kmp_balign_team_t), "%s_%d.t_bar[plain]", 522 header, team_id); 523 524 __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_forkjoin_barrier], 525 &team->t.t_bar[bs_forkjoin_barrier + 1], 526 sizeof(kmp_balign_team_t), 527 "%s_%d.t_bar[forkjoin]", header, team_id); 528 529 #if KMP_FAST_REDUCTION_BARRIER 530 __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_reduction_barrier], 531 &team->t.t_bar[bs_reduction_barrier + 1], 532 sizeof(kmp_balign_team_t), 533 "%s_%d.t_bar[reduction]", header, team_id); 534 #endif // KMP_FAST_REDUCTION_BARRIER 535 536 __kmp_print_storage_map_gtid( 537 -1, &team->t.t_dispatch[0], &team->t.t_dispatch[num_thr], 538 sizeof(kmp_disp_t) * num_thr, "%s_%d.t_dispatch", header, team_id); 539 540 __kmp_print_storage_map_gtid( 541 -1, &team->t.t_threads[0], &team->t.t_threads[num_thr], 542 sizeof(kmp_info_t *) * num_thr, "%s_%d.t_threads", header, team_id); 543 544 __kmp_print_storage_map_gtid(-1, &team->t.t_disp_buffer[0], 545 &team->t.t_disp_buffer[num_disp_buff], 546 sizeof(dispatch_shared_info_t) * num_disp_buff, 547 "%s_%d.t_disp_buffer", header, team_id); 548 } 549 550 static void __kmp_init_allocator() { 551 __kmp_init_memkind(); 552 __kmp_init_target_mem(); 553 } 554 static void __kmp_fini_allocator() { __kmp_fini_memkind(); } 555 556 /* ------------------------------------------------------------------------ */ 557 558 #if ENABLE_LIBOMPTARGET 559 static void __kmp_init_omptarget() { 560 __kmp_init_target_task(); 561 } 562 #endif 563 564 /* ------------------------------------------------------------------------ */ 
565 566 #if KMP_DYNAMIC_LIB 567 #if KMP_OS_WINDOWS 568 569 BOOL WINAPI DllMain(HINSTANCE hInstDLL, DWORD fdwReason, LPVOID lpReserved) { 570 //__kmp_acquire_bootstrap_lock( &__kmp_initz_lock ); 571 572 switch (fdwReason) { 573 574 case DLL_PROCESS_ATTACH: 575 KA_TRACE(10, ("DllMain: PROCESS_ATTACH\n")); 576 577 return TRUE; 578 579 case DLL_PROCESS_DETACH: 580 KA_TRACE(10, ("DllMain: PROCESS_DETACH T#%d\n", __kmp_gtid_get_specific())); 581 582 // According to Windows* documentation for DllMain entry point: 583 // for DLL_PROCESS_DETACH, lpReserved is used for telling the difference: 584 // lpReserved == NULL when FreeLibrary() is called, 585 // lpReserved != NULL when the process is terminated. 586 // When FreeLibrary() is called, worker threads remain alive. So the 587 // runtime's state is consistent and executing proper shutdown is OK. 588 // When the process is terminated, worker threads have exited or been 589 // forcefully terminated by the OS and only the shutdown thread remains. 590 // This can leave the runtime in an inconsistent state. 591 // Hence, only attempt proper cleanup when FreeLibrary() is called. 592 // Otherwise, rely on OS to reclaim resources. 593 if (lpReserved == NULL) 594 __kmp_internal_end_library(__kmp_gtid_get_specific()); 595 596 return TRUE; 597 598 case DLL_THREAD_ATTACH: 599 KA_TRACE(10, ("DllMain: THREAD_ATTACH\n")); 600 601 /* if we want to register new siblings all the time here call 602 * __kmp_get_gtid(); */ 603 return TRUE; 604 605 case DLL_THREAD_DETACH: 606 KA_TRACE(10, ("DllMain: THREAD_DETACH T#%d\n", __kmp_gtid_get_specific())); 607 608 __kmp_internal_end_thread(__kmp_gtid_get_specific()); 609 return TRUE; 610 } 611 612 return TRUE; 613 } 614 615 #endif /* KMP_OS_WINDOWS */ 616 #endif /* KMP_DYNAMIC_LIB */ 617 618 /* __kmp_parallel_deo -- Wait until it's our turn. */ 619 void __kmp_parallel_deo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) { 620 int gtid = *gtid_ref; 621 #ifdef BUILD_PARALLEL_ORDERED 622 kmp_team_t *team = __kmp_team_from_gtid(gtid); 623 #endif /* BUILD_PARALLEL_ORDERED */ 624 625 if (__kmp_env_consistency_check) { 626 if (__kmp_threads[gtid]->th.th_root->r.r_active) 627 #if KMP_USE_DYNAMIC_LOCK 628 __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL, 0); 629 #else 630 __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL); 631 #endif 632 } 633 #ifdef BUILD_PARALLEL_ORDERED 634 if (!team->t.t_serialized) { 635 KMP_MB(); 636 KMP_WAIT(&team->t.t_ordered.dt.t_value, __kmp_tid_from_gtid(gtid), KMP_EQ, 637 NULL); 638 KMP_MB(); 639 } 640 #endif /* BUILD_PARALLEL_ORDERED */ 641 } 642 643 /* __kmp_parallel_dxo -- Signal the next task. */ 644 void __kmp_parallel_dxo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) { 645 int gtid = *gtid_ref; 646 #ifdef BUILD_PARALLEL_ORDERED 647 int tid = __kmp_tid_from_gtid(gtid); 648 kmp_team_t *team = __kmp_team_from_gtid(gtid); 649 #endif /* BUILD_PARALLEL_ORDERED */ 650 651 if (__kmp_env_consistency_check) { 652 if (__kmp_threads[gtid]->th.th_root->r.r_active) 653 __kmp_pop_sync(gtid, ct_ordered_in_parallel, loc_ref); 654 } 655 #ifdef BUILD_PARALLEL_ORDERED 656 if (!team->t.t_serialized) { 657 KMP_MB(); /* Flush all pending memory write invalidates. */ 658 659 /* use the tid of the next thread in this team */ 660 /* TODO replace with general release procedure */ 661 team->t.t_ordered.dt.t_value = ((tid + 1) % team->t.t_nproc); 662 663 KMP_MB(); /* Flush all pending memory write invalidates. 
*/ 664 } 665 #endif /* BUILD_PARALLEL_ORDERED */ 666 } 667 668 /* ------------------------------------------------------------------------ */ 669 /* The BARRIER for a SINGLE process section is always explicit */ 670 671 int __kmp_enter_single(int gtid, ident_t *id_ref, int push_ws) { 672 int status; 673 kmp_info_t *th; 674 kmp_team_t *team; 675 676 if (!TCR_4(__kmp_init_parallel)) 677 __kmp_parallel_initialize(); 678 __kmp_resume_if_soft_paused(); 679 680 th = __kmp_threads[gtid]; 681 team = th->th.th_team; 682 status = 0; 683 684 th->th.th_ident = id_ref; 685 686 if (team->t.t_serialized) { 687 status = 1; 688 } else { 689 kmp_int32 old_this = th->th.th_local.this_construct; 690 691 ++th->th.th_local.this_construct; 692 /* try to set team count to thread count--success means thread got the 693 single block */ 694 /* TODO: Should this be acquire or release? */ 695 if (team->t.t_construct == old_this) { 696 status = __kmp_atomic_compare_store_acq(&team->t.t_construct, old_this, 697 th->th.th_local.this_construct); 698 } 699 #if USE_ITT_BUILD 700 if (__itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 && 701 KMP_MASTER_GTID(gtid) && th->th.th_teams_microtask == NULL && 702 team->t.t_active_level == 1) { 703 // Only report metadata by primary thread of active team at level 1 704 __kmp_itt_metadata_single(id_ref); 705 } 706 #endif /* USE_ITT_BUILD */ 707 } 708 709 if (__kmp_env_consistency_check) { 710 if (status && push_ws) { 711 __kmp_push_workshare(gtid, ct_psingle, id_ref); 712 } else { 713 __kmp_check_workshare(gtid, ct_psingle, id_ref); 714 } 715 } 716 #if USE_ITT_BUILD 717 if (status) { 718 __kmp_itt_single_start(gtid); 719 } 720 #endif /* USE_ITT_BUILD */ 721 return status; 722 } 723 724 void __kmp_exit_single(int gtid) { 725 #if USE_ITT_BUILD 726 __kmp_itt_single_end(gtid); 727 #endif /* USE_ITT_BUILD */ 728 if (__kmp_env_consistency_check) 729 __kmp_pop_workshare(gtid, ct_psingle, NULL); 730 } 731 732 /* determine if we can go parallel or must use a serialized parallel region and 733 * how many threads we can use 734 * set_nproc is the number of threads requested for the team 735 * returns 0 if we should serialize or only use one thread, 736 * otherwise the number of threads to use 737 * The forkjoin lock is held by the caller. */ 738 static int __kmp_reserve_threads(kmp_root_t *root, kmp_team_t *parent_team, 739 int master_tid, int set_nthreads, 740 int enter_teams) { 741 int capacity; 742 int new_nthreads; 743 KMP_DEBUG_ASSERT(__kmp_init_serial); 744 KMP_DEBUG_ASSERT(root && parent_team); 745 kmp_info_t *this_thr = parent_team->t.t_threads[master_tid]; 746 747 // If dyn-var is set, dynamically adjust the number of desired threads, 748 // according to the method specified by dynamic_mode. 749 new_nthreads = set_nthreads; 750 if (!get__dynamic_2(parent_team, master_tid)) { 751 ; 752 } 753 #ifdef USE_LOAD_BALANCE 754 else if (__kmp_global.g.g_dynamic_mode == dynamic_load_balance) { 755 new_nthreads = __kmp_load_balance_nproc(root, set_nthreads); 756 if (new_nthreads == 1) { 757 KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced " 758 "reservation to 1 thread\n", 759 master_tid)); 760 return 1; 761 } 762 if (new_nthreads < set_nthreads) { 763 KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced " 764 "reservation to %d threads\n", 765 master_tid, new_nthreads)); 766 } 767 } 768 #endif /* USE_LOAD_BALANCE */ 769 else if (__kmp_global.g.g_dynamic_mode == dynamic_thread_limit) { 770 new_nthreads = __kmp_avail_proc - __kmp_nth + 771 (root->r.r_active ? 
1 : root->r.r_hot_team->t.t_nproc); 772 if (new_nthreads <= 1) { 773 KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced " 774 "reservation to 1 thread\n", 775 master_tid)); 776 return 1; 777 } 778 if (new_nthreads < set_nthreads) { 779 KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced " 780 "reservation to %d threads\n", 781 master_tid, new_nthreads)); 782 } else { 783 new_nthreads = set_nthreads; 784 } 785 } else if (__kmp_global.g.g_dynamic_mode == dynamic_random) { 786 if (set_nthreads > 2) { 787 new_nthreads = __kmp_get_random(parent_team->t.t_threads[master_tid]); 788 new_nthreads = (new_nthreads % set_nthreads) + 1; 789 if (new_nthreads == 1) { 790 KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced " 791 "reservation to 1 thread\n", 792 master_tid)); 793 return 1; 794 } 795 if (new_nthreads < set_nthreads) { 796 KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced " 797 "reservation to %d threads\n", 798 master_tid, new_nthreads)); 799 } 800 } 801 } else { 802 KMP_ASSERT(0); 803 } 804 805 // Respect KMP_ALL_THREADS/KMP_DEVICE_THREAD_LIMIT. 806 if (__kmp_nth + new_nthreads - 807 (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) > 808 __kmp_max_nth) { 809 int tl_nthreads = __kmp_max_nth - __kmp_nth + 810 (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc); 811 if (tl_nthreads <= 0) { 812 tl_nthreads = 1; 813 } 814 815 // If dyn-var is false, emit a 1-time warning. 816 if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) { 817 __kmp_reserve_warn = 1; 818 __kmp_msg(kmp_ms_warning, 819 KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads), 820 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null); 821 } 822 if (tl_nthreads == 1) { 823 KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT " 824 "reduced reservation to 1 thread\n", 825 master_tid)); 826 return 1; 827 } 828 KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT reduced " 829 "reservation to %d threads\n", 830 master_tid, tl_nthreads)); 831 new_nthreads = tl_nthreads; 832 } 833 834 // Respect OMP_THREAD_LIMIT 835 int cg_nthreads = this_thr->th.th_cg_roots->cg_nthreads; 836 int max_cg_threads = this_thr->th.th_cg_roots->cg_thread_limit; 837 if (cg_nthreads + new_nthreads - 838 (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) > 839 max_cg_threads) { 840 int tl_nthreads = max_cg_threads - cg_nthreads + 841 (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc); 842 if (tl_nthreads <= 0) { 843 tl_nthreads = 1; 844 } 845 846 // If dyn-var is false, emit a 1-time warning. 847 if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) { 848 __kmp_reserve_warn = 1; 849 __kmp_msg(kmp_ms_warning, 850 KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads), 851 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null); 852 } 853 if (tl_nthreads == 1) { 854 KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT " 855 "reduced reservation to 1 thread\n", 856 master_tid)); 857 return 1; 858 } 859 KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT reduced " 860 "reservation to %d threads\n", 861 master_tid, tl_nthreads)); 862 new_nthreads = tl_nthreads; 863 } 864 865 // Check if the threads array is large enough, or needs expanding. 866 // See comment in __kmp_register_root() about the adjustment if 867 // __kmp_threads[0] == NULL. 
868 capacity = __kmp_threads_capacity; 869 if (TCR_PTR(__kmp_threads[0]) == NULL) { 870 --capacity; 871 } 872 // If it is not for initializing the hidden helper team, we need to take 873 // __kmp_hidden_helper_threads_num out of the capacity because it is included 874 // in __kmp_threads_capacity. 875 if (__kmp_enable_hidden_helper && !TCR_4(__kmp_init_hidden_helper_threads)) { 876 capacity -= __kmp_hidden_helper_threads_num; 877 } 878 if (__kmp_nth + new_nthreads - 879 (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) > 880 capacity) { 881 // Expand the threads array. 882 int slotsRequired = __kmp_nth + new_nthreads - 883 (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) - 884 capacity; 885 int slotsAdded = __kmp_expand_threads(slotsRequired); 886 if (slotsAdded < slotsRequired) { 887 // The threads array was not expanded enough. 888 new_nthreads -= (slotsRequired - slotsAdded); 889 KMP_ASSERT(new_nthreads >= 1); 890 891 // If dyn-var is false, emit a 1-time warning. 892 if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) { 893 __kmp_reserve_warn = 1; 894 if (__kmp_tp_cached) { 895 __kmp_msg(kmp_ms_warning, 896 KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads), 897 KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity), 898 KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null); 899 } else { 900 __kmp_msg(kmp_ms_warning, 901 KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads), 902 KMP_HNT(SystemLimitOnThreads), __kmp_msg_null); 903 } 904 } 905 } 906 } 907 908 #ifdef KMP_DEBUG 909 if (new_nthreads == 1) { 910 KC_TRACE(10, 911 ("__kmp_reserve_threads: T#%d serializing team after reclaiming " 912 "dead roots and rechecking; requested %d threads\n", 913 __kmp_get_gtid(), set_nthreads)); 914 } else { 915 KC_TRACE(10, ("__kmp_reserve_threads: T#%d allocating %d threads; requested" 916 " %d threads\n", 917 __kmp_get_gtid(), new_nthreads, set_nthreads)); 918 } 919 #endif // KMP_DEBUG 920 return new_nthreads; 921 } 922 923 /* Allocate threads from the thread pool and assign them to the new team. We are 924 assured that there are enough threads available, because we checked on that 925 earlier within critical section forkjoin */ 926 static void __kmp_fork_team_threads(kmp_root_t *root, kmp_team_t *team, 927 kmp_info_t *master_th, int master_gtid, 928 int fork_teams_workers) { 929 int i; 930 int use_hot_team; 931 932 KA_TRACE(10, ("__kmp_fork_team_threads: new_nprocs = %d\n", team->t.t_nproc)); 933 KMP_DEBUG_ASSERT(master_gtid == __kmp_get_gtid()); 934 KMP_MB(); 935 936 /* first, let's setup the primary thread */ 937 master_th->th.th_info.ds.ds_tid = 0; 938 master_th->th.th_team = team; 939 master_th->th.th_team_nproc = team->t.t_nproc; 940 master_th->th.th_team_master = master_th; 941 master_th->th.th_team_serialized = FALSE; 942 master_th->th.th_dispatch = &team->t.t_dispatch[0]; 943 944 /* make sure we are not the optimized hot team */ 945 #if KMP_NESTED_HOT_TEAMS 946 use_hot_team = 0; 947 kmp_hot_team_ptr_t *hot_teams = master_th->th.th_hot_teams; 948 if (hot_teams) { // hot teams array is not allocated if 949 // KMP_HOT_TEAMS_MAX_LEVEL=0 950 int level = team->t.t_active_level - 1; // index in array of hot teams 951 if (master_th->th.th_teams_microtask) { // are we inside the teams? 
952 if (master_th->th.th_teams_size.nteams > 1) { 953 ++level; // level was not increased in teams construct for 954 // team_of_masters 955 } 956 if (team->t.t_pkfn != (microtask_t)__kmp_teams_master && 957 master_th->th.th_teams_level == team->t.t_level) { 958 ++level; // level was not increased in teams construct for 959 // team_of_workers before the parallel 960 } // team->t.t_level will be increased inside parallel 961 } 962 if (level < __kmp_hot_teams_max_level) { 963 if (hot_teams[level].hot_team) { 964 // hot team has already been allocated for given level 965 KMP_DEBUG_ASSERT(hot_teams[level].hot_team == team); 966 use_hot_team = 1; // the team is ready to use 967 } else { 968 use_hot_team = 0; // AC: threads are not allocated yet 969 hot_teams[level].hot_team = team; // remember new hot team 970 hot_teams[level].hot_team_nth = team->t.t_nproc; 971 } 972 } else { 973 use_hot_team = 0; 974 } 975 } 976 #else 977 use_hot_team = team == root->r.r_hot_team; 978 #endif 979 if (!use_hot_team) { 980 981 /* install the primary thread */ 982 team->t.t_threads[0] = master_th; 983 __kmp_initialize_info(master_th, team, 0, master_gtid); 984 985 /* now, install the worker threads */ 986 for (i = 1; i < team->t.t_nproc; i++) { 987 988 /* fork or reallocate a new thread and install it in team */ 989 kmp_info_t *thr = __kmp_allocate_thread(root, team, i); 990 team->t.t_threads[i] = thr; 991 KMP_DEBUG_ASSERT(thr); 992 KMP_DEBUG_ASSERT(thr->th.th_team == team); 993 /* align team and thread arrived states */ 994 KA_TRACE(20, ("__kmp_fork_team_threads: T#%d(%d:%d) init arrived " 995 "T#%d(%d:%d) join =%llu, plain=%llu\n", 996 __kmp_gtid_from_tid(0, team), team->t.t_id, 0, 997 __kmp_gtid_from_tid(i, team), team->t.t_id, i, 998 team->t.t_bar[bs_forkjoin_barrier].b_arrived, 999 team->t.t_bar[bs_plain_barrier].b_arrived)); 1000 thr->th.th_teams_microtask = master_th->th.th_teams_microtask; 1001 thr->th.th_teams_level = master_th->th.th_teams_level; 1002 thr->th.th_teams_size = master_th->th.th_teams_size; 1003 { // Initialize threads' barrier data. 1004 int b; 1005 kmp_balign_t *balign = team->t.t_threads[i]->th.th_bar; 1006 for (b = 0; b < bs_last_barrier; ++b) { 1007 balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived; 1008 KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG); 1009 #if USE_DEBUGGER 1010 balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived; 1011 #endif 1012 } 1013 } 1014 } 1015 1016 #if KMP_AFFINITY_SUPPORTED 1017 // Do not partition the places list for teams construct workers who 1018 // haven't actually been forked to do real work yet. This partitioning 1019 // will take place in the parallel region nested within the teams construct. 
1020 if (!fork_teams_workers) { 1021 __kmp_partition_places(team); 1022 } 1023 #endif 1024 1025 if (team->t.t_nproc > 1 && 1026 __kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) { 1027 team->t.b->update_num_threads(team->t.t_nproc); 1028 __kmp_add_threads_to_team(team, team->t.t_nproc); 1029 } 1030 } 1031 1032 if (__kmp_display_affinity && team->t.t_display_affinity != 1) { 1033 for (i = 0; i < team->t.t_nproc; i++) { 1034 kmp_info_t *thr = team->t.t_threads[i]; 1035 if (thr->th.th_prev_num_threads != team->t.t_nproc || 1036 thr->th.th_prev_level != team->t.t_level) { 1037 team->t.t_display_affinity = 1; 1038 break; 1039 } 1040 } 1041 } 1042 1043 KMP_MB(); 1044 } 1045 1046 #if KMP_ARCH_X86 || KMP_ARCH_X86_64 1047 // Propagate any changes to the floating point control registers out to the team 1048 // We try to avoid unnecessary writes to the relevant cache line in the team 1049 // structure, so we don't make changes unless they are needed. 1050 inline static void propagateFPControl(kmp_team_t *team) { 1051 if (__kmp_inherit_fp_control) { 1052 kmp_int16 x87_fpu_control_word; 1053 kmp_uint32 mxcsr; 1054 1055 // Get primary thread's values of FPU control flags (both X87 and vector) 1056 __kmp_store_x87_fpu_control_word(&x87_fpu_control_word); 1057 __kmp_store_mxcsr(&mxcsr); 1058 mxcsr &= KMP_X86_MXCSR_MASK; 1059 1060 // There is no point looking at t_fp_control_saved here. 1061 // If it is TRUE, we still have to update the values if they are different 1062 // from those we now have. If it is FALSE we didn't save anything yet, but 1063 // our objective is the same. We have to ensure that the values in the team 1064 // are the same as those we have. 1065 // So, this code achieves what we need whether or not t_fp_control_saved is 1066 // true. By checking whether the value needs updating we avoid unnecessary 1067 // writes that would put the cache-line into a written state, causing all 1068 // threads in the team to have to read it again. 1069 KMP_CHECK_UPDATE(team->t.t_x87_fpu_control_word, x87_fpu_control_word); 1070 KMP_CHECK_UPDATE(team->t.t_mxcsr, mxcsr); 1071 // Although we don't use this value, other code in the runtime wants to know 1072 // whether it should restore them. So we must ensure it is correct. 1073 KMP_CHECK_UPDATE(team->t.t_fp_control_saved, TRUE); 1074 } else { 1075 // Similarly here. Don't write to this cache-line in the team structure 1076 // unless we have to. 1077 KMP_CHECK_UPDATE(team->t.t_fp_control_saved, FALSE); 1078 } 1079 } 1080 1081 // Do the opposite, setting the hardware registers to the updated values from 1082 // the team. 1083 inline static void updateHWFPControl(kmp_team_t *team) { 1084 if (__kmp_inherit_fp_control && team->t.t_fp_control_saved) { 1085 // Only reset the fp control regs if they have been changed in the team. 1086 // the parallel region that we are exiting. 
1087 kmp_int16 x87_fpu_control_word; 1088 kmp_uint32 mxcsr; 1089 __kmp_store_x87_fpu_control_word(&x87_fpu_control_word); 1090 __kmp_store_mxcsr(&mxcsr); 1091 mxcsr &= KMP_X86_MXCSR_MASK; 1092 1093 if (team->t.t_x87_fpu_control_word != x87_fpu_control_word) { 1094 __kmp_clear_x87_fpu_status_word(); 1095 __kmp_load_x87_fpu_control_word(&team->t.t_x87_fpu_control_word); 1096 } 1097 1098 if (team->t.t_mxcsr != mxcsr) { 1099 __kmp_load_mxcsr(&team->t.t_mxcsr); 1100 } 1101 } 1102 } 1103 #else 1104 #define propagateFPControl(x) ((void)0) 1105 #define updateHWFPControl(x) ((void)0) 1106 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */ 1107 1108 static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team, 1109 int realloc); // forward declaration 1110 1111 /* Run a parallel region that has been serialized, so runs only in a team of the 1112 single primary thread. */ 1113 void __kmp_serialized_parallel(ident_t *loc, kmp_int32 global_tid) { 1114 kmp_info_t *this_thr; 1115 kmp_team_t *serial_team; 1116 1117 KC_TRACE(10, ("__kmpc_serialized_parallel: called by T#%d\n", global_tid)); 1118 1119 /* Skip all this code for autopar serialized loops since it results in 1120 unacceptable overhead */ 1121 if (loc != NULL && (loc->flags & KMP_IDENT_AUTOPAR)) 1122 return; 1123 1124 if (!TCR_4(__kmp_init_parallel)) 1125 __kmp_parallel_initialize(); 1126 __kmp_resume_if_soft_paused(); 1127 1128 this_thr = __kmp_threads[global_tid]; 1129 serial_team = this_thr->th.th_serial_team; 1130 1131 /* utilize the serialized team held by this thread */ 1132 KMP_DEBUG_ASSERT(serial_team); 1133 KMP_MB(); 1134 1135 if (__kmp_tasking_mode != tskm_immediate_exec) { 1136 KMP_DEBUG_ASSERT( 1137 this_thr->th.th_task_team == 1138 this_thr->th.th_team->t.t_task_team[this_thr->th.th_task_state]); 1139 KMP_DEBUG_ASSERT(serial_team->t.t_task_team[this_thr->th.th_task_state] == 1140 NULL); 1141 KA_TRACE(20, ("__kmpc_serialized_parallel: T#%d pushing task_team %p / " 1142 "team %p, new task_team = NULL\n", 1143 global_tid, this_thr->th.th_task_team, this_thr->th.th_team)); 1144 this_thr->th.th_task_team = NULL; 1145 } 1146 1147 kmp_proc_bind_t proc_bind = this_thr->th.th_set_proc_bind; 1148 if (this_thr->th.th_current_task->td_icvs.proc_bind == proc_bind_false) { 1149 proc_bind = proc_bind_false; 1150 } else if (proc_bind == proc_bind_default) { 1151 // No proc_bind clause was specified, so use the current value 1152 // of proc-bind-var for this parallel region. 
1153 proc_bind = this_thr->th.th_current_task->td_icvs.proc_bind; 1154 } 1155 // Reset for next parallel region 1156 this_thr->th.th_set_proc_bind = proc_bind_default; 1157 1158 // Reset num_threads for next parallel region 1159 this_thr->th.th_set_nproc = 0; 1160 1161 #if OMPT_SUPPORT 1162 ompt_data_t ompt_parallel_data = ompt_data_none; 1163 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(global_tid); 1164 if (ompt_enabled.enabled && 1165 this_thr->th.ompt_thread_info.state != ompt_state_overhead) { 1166 1167 ompt_task_info_t *parent_task_info; 1168 parent_task_info = OMPT_CUR_TASK_INFO(this_thr); 1169 1170 parent_task_info->frame.enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0); 1171 if (ompt_enabled.ompt_callback_parallel_begin) { 1172 int team_size = 1; 1173 1174 ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)( 1175 &(parent_task_info->task_data), &(parent_task_info->frame), 1176 &ompt_parallel_data, team_size, 1177 ompt_parallel_invoker_program | ompt_parallel_team, codeptr); 1178 } 1179 } 1180 #endif // OMPT_SUPPORT 1181 1182 if (this_thr->th.th_team != serial_team) { 1183 // Nested level will be an index in the nested nthreads array 1184 int level = this_thr->th.th_team->t.t_level; 1185 1186 if (serial_team->t.t_serialized) { 1187 /* this serial team was already used 1188 TODO increase performance by making this locks more specific */ 1189 kmp_team_t *new_team; 1190 1191 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock); 1192 1193 new_team = 1194 __kmp_allocate_team(this_thr->th.th_root, 1, 1, 1195 #if OMPT_SUPPORT 1196 ompt_parallel_data, 1197 #endif 1198 proc_bind, &this_thr->th.th_current_task->td_icvs, 1199 0 USE_NESTED_HOT_ARG(NULL)); 1200 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); 1201 KMP_ASSERT(new_team); 1202 1203 /* setup new serialized team and install it */ 1204 new_team->t.t_threads[0] = this_thr; 1205 new_team->t.t_parent = this_thr->th.th_team; 1206 serial_team = new_team; 1207 this_thr->th.th_serial_team = serial_team; 1208 1209 KF_TRACE( 1210 10, 1211 ("__kmpc_serialized_parallel: T#%d allocated new serial team %p\n", 1212 global_tid, serial_team)); 1213 1214 /* TODO the above breaks the requirement that if we run out of resources, 1215 then we can still guarantee that serialized teams are ok, since we may 1216 need to allocate a new one */ 1217 } else { 1218 KF_TRACE( 1219 10, 1220 ("__kmpc_serialized_parallel: T#%d reusing cached serial team %p\n", 1221 global_tid, serial_team)); 1222 } 1223 1224 /* we have to initialize this serial team */ 1225 KMP_DEBUG_ASSERT(serial_team->t.t_threads); 1226 KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr); 1227 KMP_DEBUG_ASSERT(this_thr->th.th_team != serial_team); 1228 serial_team->t.t_ident = loc; 1229 serial_team->t.t_serialized = 1; 1230 serial_team->t.t_nproc = 1; 1231 serial_team->t.t_parent = this_thr->th.th_team; 1232 serial_team->t.t_sched.sched = this_thr->th.th_team->t.t_sched.sched; 1233 this_thr->th.th_team = serial_team; 1234 serial_team->t.t_master_tid = this_thr->th.th_info.ds.ds_tid; 1235 1236 KF_TRACE(10, ("__kmpc_serialized_parallel: T#%d curtask=%p\n", global_tid, 1237 this_thr->th.th_current_task)); 1238 KMP_ASSERT(this_thr->th.th_current_task->td_flags.executing == 1); 1239 this_thr->th.th_current_task->td_flags.executing = 0; 1240 1241 __kmp_push_current_task_to_thread(this_thr, serial_team, 0); 1242 1243 /* TODO: GEH: do ICVs work for nested serialized teams? Don't we need an 1244 implicit task for each serialized task represented by 1245 team->t.t_serialized? 
*/ 1246 copy_icvs(&this_thr->th.th_current_task->td_icvs, 1247 &this_thr->th.th_current_task->td_parent->td_icvs); 1248 1249 // Thread value exists in the nested nthreads array for the next nested 1250 // level 1251 if (__kmp_nested_nth.used && (level + 1 < __kmp_nested_nth.used)) { 1252 this_thr->th.th_current_task->td_icvs.nproc = 1253 __kmp_nested_nth.nth[level + 1]; 1254 } 1255 1256 if (__kmp_nested_proc_bind.used && 1257 (level + 1 < __kmp_nested_proc_bind.used)) { 1258 this_thr->th.th_current_task->td_icvs.proc_bind = 1259 __kmp_nested_proc_bind.bind_types[level + 1]; 1260 } 1261 1262 #if USE_DEBUGGER 1263 serial_team->t.t_pkfn = (microtask_t)(~0); // For the debugger. 1264 #endif 1265 this_thr->th.th_info.ds.ds_tid = 0; 1266 1267 /* set thread cache values */ 1268 this_thr->th.th_team_nproc = 1; 1269 this_thr->th.th_team_master = this_thr; 1270 this_thr->th.th_team_serialized = 1; 1271 1272 serial_team->t.t_level = serial_team->t.t_parent->t.t_level + 1; 1273 serial_team->t.t_active_level = serial_team->t.t_parent->t.t_active_level; 1274 serial_team->t.t_def_allocator = this_thr->th.th_def_allocator; // save 1275 1276 propagateFPControl(serial_team); 1277 1278 /* check if we need to allocate dispatch buffers stack */ 1279 KMP_DEBUG_ASSERT(serial_team->t.t_dispatch); 1280 if (!serial_team->t.t_dispatch->th_disp_buffer) { 1281 serial_team->t.t_dispatch->th_disp_buffer = 1282 (dispatch_private_info_t *)__kmp_allocate( 1283 sizeof(dispatch_private_info_t)); 1284 } 1285 this_thr->th.th_dispatch = serial_team->t.t_dispatch; 1286 1287 KMP_MB(); 1288 1289 } else { 1290 /* this serialized team is already being used, 1291 * that's fine, just add another nested level */ 1292 KMP_DEBUG_ASSERT(this_thr->th.th_team == serial_team); 1293 KMP_DEBUG_ASSERT(serial_team->t.t_threads); 1294 KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr); 1295 ++serial_team->t.t_serialized; 1296 this_thr->th.th_team_serialized = serial_team->t.t_serialized; 1297 1298 // Nested level will be an index in the nested nthreads array 1299 int level = this_thr->th.th_team->t.t_level; 1300 // Thread value exists in the nested nthreads array for the next nested 1301 // level 1302 if (__kmp_nested_nth.used && (level + 1 < __kmp_nested_nth.used)) { 1303 this_thr->th.th_current_task->td_icvs.nproc = 1304 __kmp_nested_nth.nth[level + 1]; 1305 } 1306 serial_team->t.t_level++; 1307 KF_TRACE(10, ("__kmpc_serialized_parallel: T#%d increasing nesting level " 1308 "of serial team %p to %d\n", 1309 global_tid, serial_team, serial_team->t.t_level)); 1310 1311 /* allocate/push dispatch buffers stack */ 1312 KMP_DEBUG_ASSERT(serial_team->t.t_dispatch); 1313 { 1314 dispatch_private_info_t *disp_buffer = 1315 (dispatch_private_info_t *)__kmp_allocate( 1316 sizeof(dispatch_private_info_t)); 1317 disp_buffer->next = serial_team->t.t_dispatch->th_disp_buffer; 1318 serial_team->t.t_dispatch->th_disp_buffer = disp_buffer; 1319 } 1320 this_thr->th.th_dispatch = serial_team->t.t_dispatch; 1321 1322 KMP_MB(); 1323 } 1324 KMP_CHECK_UPDATE(serial_team->t.t_cancel_request, cancel_noreq); 1325 1326 // Perform the display affinity functionality for 1327 // serialized parallel regions 1328 if (__kmp_display_affinity) { 1329 if (this_thr->th.th_prev_level != serial_team->t.t_level || 1330 this_thr->th.th_prev_num_threads != 1) { 1331 // NULL means use the affinity-format-var ICV 1332 __kmp_aux_display_affinity(global_tid, NULL); 1333 this_thr->th.th_prev_level = serial_team->t.t_level; 1334 this_thr->th.th_prev_num_threads = 1; 1335 } 1336 } 1337 
1338 if (__kmp_env_consistency_check) 1339 __kmp_push_parallel(global_tid, NULL); 1340 #if OMPT_SUPPORT 1341 serial_team->t.ompt_team_info.master_return_address = codeptr; 1342 if (ompt_enabled.enabled && 1343 this_thr->th.ompt_thread_info.state != ompt_state_overhead) { 1344 OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame.ptr = 1345 OMPT_GET_FRAME_ADDRESS(0); 1346 1347 ompt_lw_taskteam_t lw_taskteam; 1348 __ompt_lw_taskteam_init(&lw_taskteam, this_thr, global_tid, 1349 &ompt_parallel_data, codeptr); 1350 1351 __ompt_lw_taskteam_link(&lw_taskteam, this_thr, 1); 1352 // don't use lw_taskteam after linking. content was swaped 1353 1354 /* OMPT implicit task begin */ 1355 if (ompt_enabled.ompt_callback_implicit_task) { 1356 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 1357 ompt_scope_begin, OMPT_CUR_TEAM_DATA(this_thr), 1358 OMPT_CUR_TASK_DATA(this_thr), 1, __kmp_tid_from_gtid(global_tid), 1359 ompt_task_implicit); // TODO: Can this be ompt_task_initial? 1360 OMPT_CUR_TASK_INFO(this_thr)->thread_num = 1361 __kmp_tid_from_gtid(global_tid); 1362 } 1363 1364 /* OMPT state */ 1365 this_thr->th.ompt_thread_info.state = ompt_state_work_parallel; 1366 OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame.ptr = 1367 OMPT_GET_FRAME_ADDRESS(0); 1368 } 1369 #endif 1370 } 1371 1372 // Test if this fork is for a team closely nested in a teams construct 1373 static inline bool __kmp_is_fork_in_teams(kmp_info_t *master_th, 1374 microtask_t microtask, int level, 1375 int teams_level, kmp_va_list ap) { 1376 return (master_th->th.th_teams_microtask && ap && 1377 microtask != (microtask_t)__kmp_teams_master && level == teams_level); 1378 } 1379 1380 // Test if this fork is for the teams construct, i.e. to form the outer league 1381 // of teams 1382 static inline bool __kmp_is_entering_teams(int active_level, int level, 1383 int teams_level, kmp_va_list ap) { 1384 return ((ap == NULL && active_level == 0) || 1385 (ap && teams_level > 0 && teams_level == level)); 1386 } 1387 1388 // AC: This is start of parallel that is nested inside teams construct. 1389 // The team is actual (hot), all workers are ready at the fork barrier. 1390 // No lock needed to initialize the team a bit, then free workers. 
1391 static inline int 1392 __kmp_fork_in_teams(ident_t *loc, int gtid, kmp_team_t *parent_team, 1393 kmp_int32 argc, kmp_info_t *master_th, kmp_root_t *root, 1394 enum fork_context_e call_context, microtask_t microtask, 1395 launch_t invoker, int master_set_numthreads, int level, 1396 #if OMPT_SUPPORT 1397 ompt_data_t ompt_parallel_data, void *return_address, 1398 #endif 1399 kmp_va_list ap) { 1400 void **argv; 1401 int i; 1402 1403 parent_team->t.t_ident = loc; 1404 __kmp_alloc_argv_entries(argc, parent_team, TRUE); 1405 parent_team->t.t_argc = argc; 1406 argv = (void **)parent_team->t.t_argv; 1407 for (i = argc - 1; i >= 0; --i) { 1408 *argv++ = va_arg(kmp_va_deref(ap), void *); 1409 } 1410 // Increment our nested depth levels, but not increase the serialization 1411 if (parent_team == master_th->th.th_serial_team) { 1412 // AC: we are in serialized parallel 1413 __kmpc_serialized_parallel(loc, gtid); 1414 KMP_DEBUG_ASSERT(parent_team->t.t_serialized > 1); 1415 1416 if (call_context == fork_context_gnu) { 1417 // AC: need to decrement t_serialized for enquiry functions to work 1418 // correctly, will restore at join time 1419 parent_team->t.t_serialized--; 1420 return TRUE; 1421 } 1422 1423 #if OMPD_SUPPORT 1424 parent_team->t.t_pkfn = microtask; 1425 #endif 1426 1427 #if OMPT_SUPPORT 1428 void *dummy; 1429 void **exit_frame_p; 1430 ompt_data_t *implicit_task_data; 1431 ompt_lw_taskteam_t lw_taskteam; 1432 1433 if (ompt_enabled.enabled) { 1434 __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid, 1435 &ompt_parallel_data, return_address); 1436 exit_frame_p = &(lw_taskteam.ompt_task_info.frame.exit_frame.ptr); 1437 1438 __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0); 1439 // Don't use lw_taskteam after linking. Content was swapped. 1440 1441 /* OMPT implicit task begin */ 1442 implicit_task_data = OMPT_CUR_TASK_DATA(master_th); 1443 if (ompt_enabled.ompt_callback_implicit_task) { 1444 OMPT_CUR_TASK_INFO(master_th)->thread_num = __kmp_tid_from_gtid(gtid); 1445 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 1446 ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th), implicit_task_data, 1447 1, OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit); 1448 } 1449 1450 /* OMPT state */ 1451 master_th->th.ompt_thread_info.state = ompt_state_work_parallel; 1452 } else { 1453 exit_frame_p = &dummy; 1454 } 1455 #endif 1456 1457 // AC: need to decrement t_serialized for enquiry functions to work 1458 // correctly, will restore at join time 1459 parent_team->t.t_serialized--; 1460 1461 { 1462 KMP_TIME_PARTITIONED_BLOCK(OMP_parallel); 1463 KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK); 1464 __kmp_invoke_microtask(microtask, gtid, 0, argc, parent_team->t.t_argv 1465 #if OMPT_SUPPORT 1466 , 1467 exit_frame_p 1468 #endif 1469 ); 1470 } 1471 1472 #if OMPT_SUPPORT 1473 if (ompt_enabled.enabled) { 1474 *exit_frame_p = NULL; 1475 OMPT_CUR_TASK_INFO(master_th)->frame.exit_frame = ompt_data_none; 1476 if (ompt_enabled.ompt_callback_implicit_task) { 1477 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 1478 ompt_scope_end, NULL, implicit_task_data, 1, 1479 OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit); 1480 } 1481 ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th); 1482 __ompt_lw_taskteam_unlink(master_th); 1483 if (ompt_enabled.ompt_callback_parallel_end) { 1484 ompt_callbacks.ompt_callback(ompt_callback_parallel_end)( 1485 &ompt_parallel_data, OMPT_CUR_TASK_DATA(master_th), 1486 OMPT_INVOKER(call_context) | ompt_parallel_team, return_address); 1487 } 1488 
master_th->th.ompt_thread_info.state = ompt_state_overhead; 1489 } 1490 #endif 1491 return TRUE; 1492 } 1493 1494 parent_team->t.t_pkfn = microtask; 1495 parent_team->t.t_invoke = invoker; 1496 KMP_ATOMIC_INC(&root->r.r_in_parallel); 1497 parent_team->t.t_active_level++; 1498 parent_team->t.t_level++; 1499 parent_team->t.t_def_allocator = master_th->th.th_def_allocator; // save 1500 1501 // If the threads allocated to the team are less than the thread limit, update 1502 // the thread limit here. th_teams_size.nth is specific to this team nested 1503 // in a teams construct, the team is fully created, and we're about to do 1504 // the actual fork. Best to do this here so that the subsequent uses below 1505 // and in the join have the correct value. 1506 master_th->th.th_teams_size.nth = parent_team->t.t_nproc; 1507 1508 #if OMPT_SUPPORT 1509 if (ompt_enabled.enabled) { 1510 ompt_lw_taskteam_t lw_taskteam; 1511 __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid, &ompt_parallel_data, 1512 return_address); 1513 __ompt_lw_taskteam_link(&lw_taskteam, master_th, 1, true); 1514 } 1515 #endif 1516 1517 /* Change number of threads in the team if requested */ 1518 if (master_set_numthreads) { // The parallel has num_threads clause 1519 if (master_set_numthreads <= master_th->th.th_teams_size.nth) { 1520 // AC: only can reduce number of threads dynamically, can't increase 1521 kmp_info_t **other_threads = parent_team->t.t_threads; 1522 // NOTE: if using distributed barrier, we need to run this code block 1523 // even when the team size appears not to have changed from the max. 1524 int old_proc = master_th->th.th_teams_size.nth; 1525 if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) { 1526 __kmp_resize_dist_barrier(parent_team, old_proc, master_set_numthreads); 1527 __kmp_add_threads_to_team(parent_team, master_set_numthreads); 1528 } 1529 parent_team->t.t_nproc = master_set_numthreads; 1530 for (i = 0; i < master_set_numthreads; ++i) { 1531 other_threads[i]->th.th_team_nproc = master_set_numthreads; 1532 } 1533 } 1534 // Keep extra threads hot in the team for possible next parallels 1535 master_th->th.th_set_nproc = 0; 1536 } 1537 1538 #if USE_DEBUGGER 1539 if (__kmp_debugging) { // Let debugger override number of threads. 1540 int nth = __kmp_omp_num_threads(loc); 1541 if (nth > 0) { // 0 means debugger doesn't want to change num threads 1542 master_set_numthreads = nth; 1543 } 1544 } 1545 #endif 1546 1547 // Figure out the proc_bind policy for the nested parallel within teams 1548 kmp_proc_bind_t proc_bind = master_th->th.th_set_proc_bind; 1549 // proc_bind_default means don't update 1550 kmp_proc_bind_t proc_bind_icv = proc_bind_default; 1551 if (master_th->th.th_current_task->td_icvs.proc_bind == proc_bind_false) { 1552 proc_bind = proc_bind_false; 1553 } else { 1554 // No proc_bind clause specified; use current proc-bind-var 1555 if (proc_bind == proc_bind_default) { 1556 proc_bind = master_th->th.th_current_task->td_icvs.proc_bind; 1557 } 1558 /* else: The proc_bind policy was specified explicitly on parallel clause. 1559 This overrides proc-bind-var for this parallel region, but does not 1560 change proc-bind-var. */ 1561 // Figure the value of proc-bind-var for the child threads. 
1562 if ((level + 1 < __kmp_nested_proc_bind.used) && 1563 (__kmp_nested_proc_bind.bind_types[level + 1] != 1564 master_th->th.th_current_task->td_icvs.proc_bind)) { 1565 proc_bind_icv = __kmp_nested_proc_bind.bind_types[level + 1]; 1566 } 1567 } 1568 KMP_CHECK_UPDATE(parent_team->t.t_proc_bind, proc_bind); 1569 // Need to change the bind-var ICV to correct value for each implicit task 1570 if (proc_bind_icv != proc_bind_default && 1571 master_th->th.th_current_task->td_icvs.proc_bind != proc_bind_icv) { 1572 kmp_info_t **other_threads = parent_team->t.t_threads; 1573 for (i = 0; i < master_th->th.th_team_nproc; ++i) { 1574 other_threads[i]->th.th_current_task->td_icvs.proc_bind = proc_bind_icv; 1575 } 1576 } 1577 // Reset for next parallel region 1578 master_th->th.th_set_proc_bind = proc_bind_default; 1579 1580 #if USE_ITT_BUILD && USE_ITT_NOTIFY 1581 if (((__itt_frame_submit_v3_ptr && __itt_get_timestamp_ptr) || 1582 KMP_ITT_DEBUG) && 1583 __kmp_forkjoin_frames_mode == 3 && 1584 parent_team->t.t_active_level == 1 // only report frames at level 1 1585 && master_th->th.th_teams_size.nteams == 1) { 1586 kmp_uint64 tmp_time = __itt_get_timestamp(); 1587 master_th->th.th_frame_time = tmp_time; 1588 parent_team->t.t_region_time = tmp_time; 1589 } 1590 if (__itt_stack_caller_create_ptr) { 1591 KMP_DEBUG_ASSERT(parent_team->t.t_stack_id == NULL); 1592 // create new stack stitching id before entering fork barrier 1593 parent_team->t.t_stack_id = __kmp_itt_stack_caller_create(); 1594 } 1595 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */ 1596 #if KMP_AFFINITY_SUPPORTED 1597 __kmp_partition_places(parent_team); 1598 #endif 1599 1600 KF_TRACE(10, ("__kmp_fork_in_teams: before internal fork: root=%p, team=%p, " 1601 "master_th=%p, gtid=%d\n", 1602 root, parent_team, master_th, gtid)); 1603 __kmp_internal_fork(loc, gtid, parent_team); 1604 KF_TRACE(10, ("__kmp_fork_in_teams: after internal fork: root=%p, team=%p, " 1605 "master_th=%p, gtid=%d\n", 1606 root, parent_team, master_th, gtid)); 1607 1608 if (call_context == fork_context_gnu) 1609 return TRUE; 1610 1611 /* Invoke microtask for PRIMARY thread */ 1612 KA_TRACE(20, ("__kmp_fork_in_teams: T#%d(%d:0) invoke microtask = %p\n", gtid, 1613 parent_team->t.t_id, parent_team->t.t_pkfn)); 1614 1615 if (!parent_team->t.t_invoke(gtid)) { 1616 KMP_ASSERT2(0, "cannot invoke microtask for PRIMARY thread"); 1617 } 1618 KA_TRACE(20, ("__kmp_fork_in_teams: T#%d(%d:0) done microtask = %p\n", gtid, 1619 parent_team->t.t_id, parent_team->t.t_pkfn)); 1620 KMP_MB(); /* Flush all pending memory write invalidates. */ 1621 1622 KA_TRACE(20, ("__kmp_fork_in_teams: parallel exit T#%d\n", gtid)); 1623 1624 return TRUE; 1625 } 1626 1627 // Create a serialized parallel region 1628 static inline int 1629 __kmp_serial_fork_call(ident_t *loc, int gtid, enum fork_context_e call_context, 1630 kmp_int32 argc, microtask_t microtask, launch_t invoker, 1631 kmp_info_t *master_th, kmp_team_t *parent_team, 1632 #if OMPT_SUPPORT 1633 ompt_data_t *ompt_parallel_data, void **return_address, 1634 ompt_data_t **parent_task_data, 1635 #endif 1636 kmp_va_list ap) { 1637 kmp_team_t *team; 1638 int i; 1639 void **argv; 1640 1641 /* josh todo: hypothetical question: what do we do for OS X*? 
*/ 1642 #if KMP_OS_LINUX && \ 1643 (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) 1644 void *args[argc]; 1645 #else 1646 void **args = (void **)KMP_ALLOCA(argc * sizeof(void *)); 1647 #endif /* KMP_OS_LINUX && ( KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || \ 1648 KMP_ARCH_AARCH64) */ 1649 1650 KA_TRACE( 1651 20, ("__kmp_serial_fork_call: T#%d serializing parallel region\n", gtid)); 1652 1653 __kmpc_serialized_parallel(loc, gtid); 1654 1655 #if OMPD_SUPPORT 1656 master_th->th.th_serial_team->t.t_pkfn = microtask; 1657 #endif 1658 1659 if (call_context == fork_context_intel) { 1660 /* TODO this sucks, use the compiler itself to pass args! :) */ 1661 master_th->th.th_serial_team->t.t_ident = loc; 1662 if (!ap) { 1663 // revert change made in __kmpc_serialized_parallel() 1664 master_th->th.th_serial_team->t.t_level--; 1665 // Get args from parent team for teams construct 1666 1667 #if OMPT_SUPPORT 1668 void *dummy; 1669 void **exit_frame_p; 1670 ompt_task_info_t *task_info; 1671 ompt_lw_taskteam_t lw_taskteam; 1672 1673 if (ompt_enabled.enabled) { 1674 __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid, 1675 ompt_parallel_data, *return_address); 1676 1677 __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0); 1678 // don't use lw_taskteam after linking. content was swaped 1679 task_info = OMPT_CUR_TASK_INFO(master_th); 1680 exit_frame_p = &(task_info->frame.exit_frame.ptr); 1681 if (ompt_enabled.ompt_callback_implicit_task) { 1682 OMPT_CUR_TASK_INFO(master_th)->thread_num = __kmp_tid_from_gtid(gtid); 1683 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 1684 ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th), 1685 &(task_info->task_data), 1, 1686 OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit); 1687 } 1688 1689 /* OMPT state */ 1690 master_th->th.ompt_thread_info.state = ompt_state_work_parallel; 1691 } else { 1692 exit_frame_p = &dummy; 1693 } 1694 #endif 1695 1696 { 1697 KMP_TIME_PARTITIONED_BLOCK(OMP_parallel); 1698 KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK); 1699 __kmp_invoke_microtask(microtask, gtid, 0, argc, parent_team->t.t_argv 1700 #if OMPT_SUPPORT 1701 , 1702 exit_frame_p 1703 #endif 1704 ); 1705 } 1706 1707 #if OMPT_SUPPORT 1708 if (ompt_enabled.enabled) { 1709 *exit_frame_p = NULL; 1710 if (ompt_enabled.ompt_callback_implicit_task) { 1711 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 1712 ompt_scope_end, NULL, &(task_info->task_data), 1, 1713 OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit); 1714 } 1715 *ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th); 1716 __ompt_lw_taskteam_unlink(master_th); 1717 if (ompt_enabled.ompt_callback_parallel_end) { 1718 ompt_callbacks.ompt_callback(ompt_callback_parallel_end)( 1719 ompt_parallel_data, *parent_task_data, 1720 OMPT_INVOKER(call_context) | ompt_parallel_team, *return_address); 1721 } 1722 master_th->th.ompt_thread_info.state = ompt_state_overhead; 1723 } 1724 #endif 1725 } else if (microtask == (microtask_t)__kmp_teams_master) { 1726 KMP_DEBUG_ASSERT(master_th->th.th_team == master_th->th.th_serial_team); 1727 team = master_th->th.th_team; 1728 // team->t.t_pkfn = microtask; 1729 team->t.t_invoke = invoker; 1730 __kmp_alloc_argv_entries(argc, team, TRUE); 1731 team->t.t_argc = argc; 1732 argv = (void **)team->t.t_argv; 1733 if (ap) { 1734 for (i = argc - 1; i >= 0; --i) 1735 *argv++ = va_arg(kmp_va_deref(ap), void *); 1736 } else { 1737 for (i = 0; i < argc; ++i) 1738 // Get args from parent team for teams construct 1739 argv[i] = parent_team->t.t_argv[i]; 
1740 } 1741 // AC: revert change made in __kmpc_serialized_parallel() 1742 // because initial code in teams should have level=0 1743 team->t.t_level--; 1744 // AC: call special invoker for outer "parallel" of teams construct 1745 invoker(gtid); 1746 #if OMPT_SUPPORT 1747 if (ompt_enabled.enabled) { 1748 ompt_task_info_t *task_info = OMPT_CUR_TASK_INFO(master_th); 1749 if (ompt_enabled.ompt_callback_implicit_task) { 1750 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 1751 ompt_scope_end, NULL, &(task_info->task_data), 0, 1752 OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_initial); 1753 } 1754 if (ompt_enabled.ompt_callback_parallel_end) { 1755 ompt_callbacks.ompt_callback(ompt_callback_parallel_end)( 1756 ompt_parallel_data, *parent_task_data, 1757 OMPT_INVOKER(call_context) | ompt_parallel_league, 1758 *return_address); 1759 } 1760 master_th->th.ompt_thread_info.state = ompt_state_overhead; 1761 } 1762 #endif 1763 } else { 1764 argv = args; 1765 for (i = argc - 1; i >= 0; --i) 1766 *argv++ = va_arg(kmp_va_deref(ap), void *); 1767 KMP_MB(); 1768 1769 #if OMPT_SUPPORT 1770 void *dummy; 1771 void **exit_frame_p; 1772 ompt_task_info_t *task_info; 1773 ompt_lw_taskteam_t lw_taskteam; 1774 ompt_data_t *implicit_task_data; 1775 1776 if (ompt_enabled.enabled) { 1777 __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid, 1778 ompt_parallel_data, *return_address); 1779 __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0); 1780 // don't use lw_taskteam after linking. content was swaped 1781 task_info = OMPT_CUR_TASK_INFO(master_th); 1782 exit_frame_p = &(task_info->frame.exit_frame.ptr); 1783 1784 /* OMPT implicit task begin */ 1785 implicit_task_data = OMPT_CUR_TASK_DATA(master_th); 1786 if (ompt_enabled.ompt_callback_implicit_task) { 1787 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 1788 ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th), 1789 implicit_task_data, 1, __kmp_tid_from_gtid(gtid), 1790 ompt_task_implicit); 1791 OMPT_CUR_TASK_INFO(master_th)->thread_num = __kmp_tid_from_gtid(gtid); 1792 } 1793 1794 /* OMPT state */ 1795 master_th->th.ompt_thread_info.state = ompt_state_work_parallel; 1796 } else { 1797 exit_frame_p = &dummy; 1798 } 1799 #endif 1800 1801 { 1802 KMP_TIME_PARTITIONED_BLOCK(OMP_parallel); 1803 KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK); 1804 __kmp_invoke_microtask(microtask, gtid, 0, argc, args 1805 #if OMPT_SUPPORT 1806 , 1807 exit_frame_p 1808 #endif 1809 ); 1810 } 1811 1812 #if OMPT_SUPPORT 1813 if (ompt_enabled.enabled) { 1814 *exit_frame_p = NULL; 1815 if (ompt_enabled.ompt_callback_implicit_task) { 1816 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 1817 ompt_scope_end, NULL, &(task_info->task_data), 1, 1818 OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit); 1819 } 1820 1821 *ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th); 1822 __ompt_lw_taskteam_unlink(master_th); 1823 if (ompt_enabled.ompt_callback_parallel_end) { 1824 ompt_callbacks.ompt_callback(ompt_callback_parallel_end)( 1825 ompt_parallel_data, *parent_task_data, 1826 OMPT_INVOKER(call_context) | ompt_parallel_team, *return_address); 1827 } 1828 master_th->th.ompt_thread_info.state = ompt_state_overhead; 1829 } 1830 #endif 1831 } 1832 } else if (call_context == fork_context_gnu) { 1833 #if OMPT_SUPPORT 1834 if (ompt_enabled.enabled) { 1835 ompt_lw_taskteam_t lwt; 1836 __ompt_lw_taskteam_init(&lwt, master_th, gtid, ompt_parallel_data, 1837 *return_address); 1838 1839 lwt.ompt_task_info.frame.exit_frame = ompt_data_none; 1840 __ompt_lw_taskteam_link(&lwt, 
master_th, 1); 1841 } 1842 // don't use lw_taskteam after linking. content was swaped 1843 #endif 1844 1845 // we were called from GNU native code 1846 KA_TRACE(20, ("__kmp_serial_fork_call: T#%d serial exit\n", gtid)); 1847 return FALSE; 1848 } else { 1849 KMP_ASSERT2(call_context < fork_context_last, 1850 "__kmp_serial_fork_call: unknown fork_context parameter"); 1851 } 1852 1853 KA_TRACE(20, ("__kmp_serial_fork_call: T#%d serial exit\n", gtid)); 1854 KMP_MB(); 1855 return FALSE; 1856 } 1857 1858 /* most of the work for a fork */ 1859 /* return true if we really went parallel, false if serialized */ 1860 int __kmp_fork_call(ident_t *loc, int gtid, 1861 enum fork_context_e call_context, // Intel, GNU, ... 1862 kmp_int32 argc, microtask_t microtask, launch_t invoker, 1863 kmp_va_list ap) { 1864 void **argv; 1865 int i; 1866 int master_tid; 1867 int master_this_cons; 1868 kmp_team_t *team; 1869 kmp_team_t *parent_team; 1870 kmp_info_t *master_th; 1871 kmp_root_t *root; 1872 int nthreads; 1873 int master_active; 1874 int master_set_numthreads; 1875 int level; 1876 int active_level; 1877 int teams_level; 1878 #if KMP_NESTED_HOT_TEAMS 1879 kmp_hot_team_ptr_t **p_hot_teams; 1880 #endif 1881 { // KMP_TIME_BLOCK 1882 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_fork_call); 1883 KMP_COUNT_VALUE(OMP_PARALLEL_args, argc); 1884 1885 KA_TRACE(20, ("__kmp_fork_call: enter T#%d\n", gtid)); 1886 if (__kmp_stkpadding > 0 && __kmp_root[gtid] != NULL) { 1887 /* Some systems prefer the stack for the root thread(s) to start with */ 1888 /* some gap from the parent stack to prevent false sharing. */ 1889 void *dummy = KMP_ALLOCA(__kmp_stkpadding); 1890 /* These 2 lines below are so this does not get optimized out */ 1891 if (__kmp_stkpadding > KMP_MAX_STKPADDING) 1892 __kmp_stkpadding += (short)((kmp_int64)dummy); 1893 } 1894 1895 /* initialize if needed */ 1896 KMP_DEBUG_ASSERT( 1897 __kmp_init_serial); // AC: potentially unsafe, not in sync with shutdown 1898 if (!TCR_4(__kmp_init_parallel)) 1899 __kmp_parallel_initialize(); 1900 __kmp_resume_if_soft_paused(); 1901 1902 /* setup current data */ 1903 // AC: potentially unsafe, not in sync with library shutdown, 1904 // __kmp_threads can be freed 1905 master_th = __kmp_threads[gtid]; 1906 1907 parent_team = master_th->th.th_team; 1908 master_tid = master_th->th.th_info.ds.ds_tid; 1909 master_this_cons = master_th->th.th_local.this_construct; 1910 root = master_th->th.th_root; 1911 master_active = root->r.r_active; 1912 master_set_numthreads = master_th->th.th_set_nproc; 1913 1914 #if OMPT_SUPPORT 1915 ompt_data_t ompt_parallel_data = ompt_data_none; 1916 ompt_data_t *parent_task_data; 1917 ompt_frame_t *ompt_frame; 1918 void *return_address = NULL; 1919 1920 if (ompt_enabled.enabled) { 1921 __ompt_get_task_info_internal(0, NULL, &parent_task_data, &ompt_frame, 1922 NULL, NULL); 1923 return_address = OMPT_LOAD_RETURN_ADDRESS(gtid); 1924 } 1925 #endif 1926 1927 // Assign affinity to root thread if it hasn't happened yet 1928 __kmp_assign_root_init_mask(); 1929 1930 // Nested level will be an index in the nested nthreads array 1931 level = parent_team->t.t_level; 1932 // used to launch non-serial teams even if nested is not allowed 1933 active_level = parent_team->t.t_active_level; 1934 // needed to check nesting inside the teams 1935 teams_level = master_th->th.th_teams_level; 1936 #if KMP_NESTED_HOT_TEAMS 1937 p_hot_teams = &master_th->th.th_hot_teams; 1938 if (*p_hot_teams == NULL && __kmp_hot_teams_max_level > 0) { 1939 *p_hot_teams = (kmp_hot_team_ptr_t 
*)__kmp_allocate( 1940 sizeof(kmp_hot_team_ptr_t) * __kmp_hot_teams_max_level); 1941 (*p_hot_teams)[0].hot_team = root->r.r_hot_team; 1942 // it is either actual or not needed (when active_level > 0) 1943 (*p_hot_teams)[0].hot_team_nth = 1; 1944 } 1945 #endif 1946 1947 #if OMPT_SUPPORT 1948 if (ompt_enabled.enabled) { 1949 if (ompt_enabled.ompt_callback_parallel_begin) { 1950 int team_size = master_set_numthreads 1951 ? master_set_numthreads 1952 : get__nproc_2(parent_team, master_tid); 1953 int flags = OMPT_INVOKER(call_context) | 1954 ((microtask == (microtask_t)__kmp_teams_master) 1955 ? ompt_parallel_league 1956 : ompt_parallel_team); 1957 ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)( 1958 parent_task_data, ompt_frame, &ompt_parallel_data, team_size, flags, 1959 return_address); 1960 } 1961 master_th->th.ompt_thread_info.state = ompt_state_overhead; 1962 } 1963 #endif 1964 1965 master_th->th.th_ident = loc; 1966 1967 // Parallel closely nested in teams construct: 1968 if (__kmp_is_fork_in_teams(master_th, microtask, level, teams_level, ap)) { 1969 return __kmp_fork_in_teams(loc, gtid, parent_team, argc, master_th, root, 1970 call_context, microtask, invoker, 1971 master_set_numthreads, level, 1972 #if OMPT_SUPPORT 1973 ompt_parallel_data, return_address, 1974 #endif 1975 ap); 1976 } // End parallel closely nested in teams construct 1977 1978 #if KMP_DEBUG 1979 if (__kmp_tasking_mode != tskm_immediate_exec) { 1980 KMP_DEBUG_ASSERT(master_th->th.th_task_team == 1981 parent_team->t.t_task_team[master_th->th.th_task_state]); 1982 } 1983 #endif 1984 1985 // Need this to happen before we determine the number of threads, not while 1986 // we are allocating the team 1987 //__kmp_push_current_task_to_thread(master_th, parent_team, 0); 1988 1989 // Determine the number of threads 1990 int enter_teams = 1991 __kmp_is_entering_teams(active_level, level, teams_level, ap); 1992 if ((!enter_teams && 1993 (parent_team->t.t_active_level >= 1994 master_th->th.th_current_task->td_icvs.max_active_levels)) || 1995 (__kmp_library == library_serial)) { 1996 KC_TRACE(10, ("__kmp_fork_call: T#%d serializing team\n", gtid)); 1997 nthreads = 1; 1998 } else { 1999 nthreads = master_set_numthreads 2000 ? master_set_numthreads 2001 // TODO: get nproc directly from current task 2002 : get__nproc_2(parent_team, master_tid); 2003 // Check if we need to take forkjoin lock? (no need for serialized 2004 // parallel out of teams construct). 2005 if (nthreads > 1) { 2006 /* determine how many new threads we can use */ 2007 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock); 2008 /* AC: If we execute teams from parallel region (on host), then teams 2009 should be created but each can only have 1 thread if nesting is 2010 disabled. If teams called from serial region, then teams and their 2011 threads should be created regardless of the nesting setting. 
*/ 2012 nthreads = __kmp_reserve_threads(root, parent_team, master_tid, 2013 nthreads, enter_teams); 2014 if (nthreads == 1) { 2015 // Free lock for single thread execution here; for multi-thread 2016 // execution it will be freed later after team of threads created 2017 // and initialized 2018 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); 2019 } 2020 } 2021 } 2022 KMP_DEBUG_ASSERT(nthreads > 0); 2023 2024 // If we temporarily changed the set number of threads then restore it now 2025 master_th->th.th_set_nproc = 0; 2026 2027 if (nthreads == 1) { 2028 return __kmp_serial_fork_call(loc, gtid, call_context, argc, microtask, 2029 invoker, master_th, parent_team, 2030 #if OMPT_SUPPORT 2031 &ompt_parallel_data, &return_address, 2032 &parent_task_data, 2033 #endif 2034 ap); 2035 } // if (nthreads == 1) 2036 2037 // GEH: only modify the executing flag in the case when not serialized 2038 // serialized case is handled in kmpc_serialized_parallel 2039 KF_TRACE(10, ("__kmp_fork_call: parent_team_aclevel=%d, master_th=%p, " 2040 "curtask=%p, curtask_max_aclevel=%d\n", 2041 parent_team->t.t_active_level, master_th, 2042 master_th->th.th_current_task, 2043 master_th->th.th_current_task->td_icvs.max_active_levels)); 2044 // TODO: GEH - cannot do this assertion because root thread not set up as 2045 // executing 2046 // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 1 ); 2047 master_th->th.th_current_task->td_flags.executing = 0; 2048 2049 if (!master_th->th.th_teams_microtask || level > teams_level) { 2050 /* Increment our nested depth level */ 2051 KMP_ATOMIC_INC(&root->r.r_in_parallel); 2052 } 2053 2054 // See if we need to make a copy of the ICVs. 2055 int nthreads_icv = master_th->th.th_current_task->td_icvs.nproc; 2056 if ((level + 1 < __kmp_nested_nth.used) && 2057 (__kmp_nested_nth.nth[level + 1] != nthreads_icv)) { 2058 nthreads_icv = __kmp_nested_nth.nth[level + 1]; 2059 } else { 2060 nthreads_icv = 0; // don't update 2061 } 2062 2063 // Figure out the proc_bind_policy for the new team. 2064 kmp_proc_bind_t proc_bind = master_th->th.th_set_proc_bind; 2065 // proc_bind_default means don't update 2066 kmp_proc_bind_t proc_bind_icv = proc_bind_default; 2067 if (master_th->th.th_current_task->td_icvs.proc_bind == proc_bind_false) { 2068 proc_bind = proc_bind_false; 2069 } else { 2070 // No proc_bind clause specified; use current proc-bind-var for this 2071 // parallel region 2072 if (proc_bind == proc_bind_default) { 2073 proc_bind = master_th->th.th_current_task->td_icvs.proc_bind; 2074 } 2075 // Have teams construct take proc_bind value from KMP_TEAMS_PROC_BIND 2076 if (master_th->th.th_teams_microtask && 2077 microtask == (microtask_t)__kmp_teams_master) { 2078 proc_bind = __kmp_teams_proc_bind; 2079 } 2080 /* else: The proc_bind policy was specified explicitly on parallel clause. 2081 This overrides proc-bind-var for this parallel region, but does not 2082 change proc-bind-var. */ 2083 // Figure the value of proc-bind-var for the child threads. 
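// Note: at this point proc_bind reflects, in order of precedence, an explicit
// proc_bind clause, KMP_TEAMS_PROC_BIND when forking the teams master
// microtask, or the inherited proc-bind-var. The check below only decides
// whether the child threads' proc-bind-var ICV must be changed as well.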
2084 if ((level + 1 < __kmp_nested_proc_bind.used) && 2085 (__kmp_nested_proc_bind.bind_types[level + 1] != 2086 master_th->th.th_current_task->td_icvs.proc_bind)) { 2087 // Do not modify the proc bind icv for the two teams construct forks 2088 // They just let the proc bind icv pass through 2089 if (!master_th->th.th_teams_microtask || 2090 !(microtask == (microtask_t)__kmp_teams_master || ap == NULL)) 2091 proc_bind_icv = __kmp_nested_proc_bind.bind_types[level + 1]; 2092 } 2093 } 2094 2095 // Reset for next parallel region 2096 master_th->th.th_set_proc_bind = proc_bind_default; 2097 2098 if ((nthreads_icv > 0) || (proc_bind_icv != proc_bind_default)) { 2099 kmp_internal_control_t new_icvs; 2100 copy_icvs(&new_icvs, &master_th->th.th_current_task->td_icvs); 2101 new_icvs.next = NULL; 2102 if (nthreads_icv > 0) { 2103 new_icvs.nproc = nthreads_icv; 2104 } 2105 if (proc_bind_icv != proc_bind_default) { 2106 new_icvs.proc_bind = proc_bind_icv; 2107 } 2108 2109 /* allocate a new parallel team */ 2110 KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n")); 2111 team = __kmp_allocate_team(root, nthreads, nthreads, 2112 #if OMPT_SUPPORT 2113 ompt_parallel_data, 2114 #endif 2115 proc_bind, &new_icvs, 2116 argc USE_NESTED_HOT_ARG(master_th)); 2117 if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) 2118 copy_icvs((kmp_internal_control_t *)team->t.b->team_icvs, &new_icvs); 2119 } else { 2120 /* allocate a new parallel team */ 2121 KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n")); 2122 team = __kmp_allocate_team(root, nthreads, nthreads, 2123 #if OMPT_SUPPORT 2124 ompt_parallel_data, 2125 #endif 2126 proc_bind, 2127 &master_th->th.th_current_task->td_icvs, 2128 argc USE_NESTED_HOT_ARG(master_th)); 2129 if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) 2130 copy_icvs((kmp_internal_control_t *)team->t.b->team_icvs, 2131 &master_th->th.th_current_task->td_icvs); 2132 } 2133 KF_TRACE( 2134 10, ("__kmp_fork_call: after __kmp_allocate_team - team = %p\n", team)); 2135 2136 /* setup the new team */ 2137 KMP_CHECK_UPDATE(team->t.t_master_tid, master_tid); 2138 KMP_CHECK_UPDATE(team->t.t_master_this_cons, master_this_cons); 2139 KMP_CHECK_UPDATE(team->t.t_ident, loc); 2140 KMP_CHECK_UPDATE(team->t.t_parent, parent_team); 2141 KMP_CHECK_UPDATE_SYNC(team->t.t_pkfn, microtask); 2142 #if OMPT_SUPPORT 2143 KMP_CHECK_UPDATE_SYNC(team->t.ompt_team_info.master_return_address, 2144 return_address); 2145 #endif 2146 KMP_CHECK_UPDATE(team->t.t_invoke, invoker); // TODO move to root, maybe 2147 // TODO: parent_team->t.t_level == INT_MAX ??? 
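// A regular parallel region bumps both t_level and t_active_level of the new
// team relative to its parent; the initial fork of a teams construct (the
// else branch below) keeps the parent's values so the league itself is not
// counted as an extra nesting level.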
2148 if (!master_th->th.th_teams_microtask || level > teams_level) { 2149 int new_level = parent_team->t.t_level + 1; 2150 KMP_CHECK_UPDATE(team->t.t_level, new_level); 2151 new_level = parent_team->t.t_active_level + 1; 2152 KMP_CHECK_UPDATE(team->t.t_active_level, new_level); 2153 } else { 2154 // AC: Do not increase parallel level at start of the teams construct 2155 int new_level = parent_team->t.t_level; 2156 KMP_CHECK_UPDATE(team->t.t_level, new_level); 2157 new_level = parent_team->t.t_active_level; 2158 KMP_CHECK_UPDATE(team->t.t_active_level, new_level); 2159 } 2160 kmp_r_sched_t new_sched = get__sched_2(parent_team, master_tid); 2161 // set primary thread's schedule as new run-time schedule 2162 KMP_CHECK_UPDATE(team->t.t_sched.sched, new_sched.sched); 2163 2164 KMP_CHECK_UPDATE(team->t.t_cancel_request, cancel_noreq); 2165 KMP_CHECK_UPDATE(team->t.t_def_allocator, master_th->th.th_def_allocator); 2166 2167 // Update the floating point rounding in the team if required. 2168 propagateFPControl(team); 2169 #if OMPD_SUPPORT 2170 if (ompd_state & OMPD_ENABLE_BP) 2171 ompd_bp_parallel_begin(); 2172 #endif 2173 2174 if (__kmp_tasking_mode != tskm_immediate_exec) { 2175 // Set primary thread's task team to team's task team. Unless this is hot 2176 // team, it should be NULL. 2177 KMP_DEBUG_ASSERT(master_th->th.th_task_team == 2178 parent_team->t.t_task_team[master_th->th.th_task_state]); 2179 KA_TRACE(20, ("__kmp_fork_call: Primary T#%d pushing task_team %p / team " 2180 "%p, new task_team %p / team %p\n", 2181 __kmp_gtid_from_thread(master_th), 2182 master_th->th.th_task_team, parent_team, 2183 team->t.t_task_team[master_th->th.th_task_state], team)); 2184 2185 if (active_level || master_th->th.th_task_team) { 2186 // Take a memo of primary thread's task_state 2187 KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack); 2188 if (master_th->th.th_task_state_top >= 2189 master_th->th.th_task_state_stack_sz) { // increase size 2190 kmp_uint32 new_size = 2 * master_th->th.th_task_state_stack_sz; 2191 kmp_uint8 *old_stack, *new_stack; 2192 kmp_uint32 i; 2193 new_stack = (kmp_uint8 *)__kmp_allocate(new_size); 2194 for (i = 0; i < master_th->th.th_task_state_stack_sz; ++i) { 2195 new_stack[i] = master_th->th.th_task_state_memo_stack[i]; 2196 } 2197 for (i = master_th->th.th_task_state_stack_sz; i < new_size; 2198 ++i) { // zero-init rest of stack 2199 new_stack[i] = 0; 2200 } 2201 old_stack = master_th->th.th_task_state_memo_stack; 2202 master_th->th.th_task_state_memo_stack = new_stack; 2203 master_th->th.th_task_state_stack_sz = new_size; 2204 __kmp_free(old_stack); 2205 } 2206 // Store primary thread's task_state on stack 2207 master_th->th 2208 .th_task_state_memo_stack[master_th->th.th_task_state_top] = 2209 master_th->th.th_task_state; 2210 master_th->th.th_task_state_top++; 2211 #if KMP_NESTED_HOT_TEAMS 2212 if (master_th->th.th_hot_teams && 2213 active_level < __kmp_hot_teams_max_level && 2214 team == master_th->th.th_hot_teams[active_level].hot_team) { 2215 // Restore primary thread's nested state if nested hot team 2216 master_th->th.th_task_state = 2217 master_th->th 2218 .th_task_state_memo_stack[master_th->th.th_task_state_top]; 2219 } else { 2220 #endif 2221 master_th->th.th_task_state = 0; 2222 #if KMP_NESTED_HOT_TEAMS 2223 } 2224 #endif 2225 } 2226 #if !KMP_NESTED_HOT_TEAMS 2227 KMP_DEBUG_ASSERT((master_th->th.th_task_team == NULL) || 2228 (team == root->r.r_hot_team)); 2229 #endif 2230 } 2231 2232 KA_TRACE( 2233 20, 2234 ("__kmp_fork_call: T#%d(%d:%d)->(%d:0) created a team 
of %d threads\n", 2235 gtid, parent_team->t.t_id, team->t.t_master_tid, team->t.t_id, 2236 team->t.t_nproc)); 2237 KMP_DEBUG_ASSERT(team != root->r.r_hot_team || 2238 (team->t.t_master_tid == 0 && 2239 (team->t.t_parent == root->r.r_root_team || 2240 team->t.t_parent->t.t_serialized))); 2241 KMP_MB(); 2242 2243 /* now, setup the arguments */ 2244 argv = (void **)team->t.t_argv; 2245 if (ap) { 2246 for (i = argc - 1; i >= 0; --i) { 2247 void *new_argv = va_arg(kmp_va_deref(ap), void *); 2248 KMP_CHECK_UPDATE(*argv, new_argv); 2249 argv++; 2250 } 2251 } else { 2252 for (i = 0; i < argc; ++i) { 2253 // Get args from parent team for teams construct 2254 KMP_CHECK_UPDATE(argv[i], team->t.t_parent->t.t_argv[i]); 2255 } 2256 } 2257 2258 /* now actually fork the threads */ 2259 KMP_CHECK_UPDATE(team->t.t_master_active, master_active); 2260 if (!root->r.r_active) // Only do assignment if it prevents cache ping-pong 2261 root->r.r_active = TRUE; 2262 2263 __kmp_fork_team_threads(root, team, master_th, gtid, !ap); 2264 __kmp_setup_icv_copy(team, nthreads, 2265 &master_th->th.th_current_task->td_icvs, loc); 2266 2267 #if OMPT_SUPPORT 2268 master_th->th.ompt_thread_info.state = ompt_state_work_parallel; 2269 #endif 2270 2271 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); 2272 2273 #if USE_ITT_BUILD 2274 if (team->t.t_active_level == 1 // only report frames at level 1 2275 && !master_th->th.th_teams_microtask) { // not in teams construct 2276 #if USE_ITT_NOTIFY 2277 if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) && 2278 (__kmp_forkjoin_frames_mode == 3 || 2279 __kmp_forkjoin_frames_mode == 1)) { 2280 kmp_uint64 tmp_time = 0; 2281 if (__itt_get_timestamp_ptr) 2282 tmp_time = __itt_get_timestamp(); 2283 // Internal fork - report frame begin 2284 master_th->th.th_frame_time = tmp_time; 2285 if (__kmp_forkjoin_frames_mode == 3) 2286 team->t.t_region_time = tmp_time; 2287 } else 2288 // only one notification scheme (either "submit" or "forking/joined", not both) 2289 #endif /* USE_ITT_NOTIFY */ 2290 if ((__itt_frame_begin_v3_ptr || KMP_ITT_DEBUG) && 2291 __kmp_forkjoin_frames && !__kmp_forkjoin_frames_mode) { 2292 // Mark start of "parallel" region for Intel(R) VTune(TM) analyzer. 
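// (__kmp_forkjoin_frames_mode == 0 selects this region forking/joined
// notification; modes 1 and 3 take the timestamp/frame-submit path above, so
// only one of the two schemes fires for a given region.)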
2293 __kmp_itt_region_forking(gtid, team->t.t_nproc, 0); 2294 } 2295 } 2296 #endif /* USE_ITT_BUILD */ 2297 2298 /* now go on and do the work */ 2299 KMP_DEBUG_ASSERT(team == __kmp_threads[gtid]->th.th_team); 2300 KMP_MB(); 2301 KF_TRACE(10, 2302 ("__kmp_internal_fork : root=%p, team=%p, master_th=%p, gtid=%d\n", 2303 root, team, master_th, gtid)); 2304 2305 #if USE_ITT_BUILD 2306 if (__itt_stack_caller_create_ptr) { 2307 // create new stack stitching id before entering fork barrier 2308 if (!enter_teams) { 2309 KMP_DEBUG_ASSERT(team->t.t_stack_id == NULL); 2310 team->t.t_stack_id = __kmp_itt_stack_caller_create(); 2311 } else if (parent_team->t.t_serialized) { 2312 // keep stack stitching id in the serialized parent_team; 2313 // current team will be used for parallel inside the teams; 2314 // if parent_team is active, then it already keeps stack stitching id 2315 // for the league of teams 2316 KMP_DEBUG_ASSERT(parent_team->t.t_stack_id == NULL); 2317 parent_team->t.t_stack_id = __kmp_itt_stack_caller_create(); 2318 } 2319 } 2320 #endif /* USE_ITT_BUILD */ 2321 2322 // AC: skip __kmp_internal_fork at teams construct, let only primary 2323 // threads execute 2324 if (ap) { 2325 __kmp_internal_fork(loc, gtid, team); 2326 KF_TRACE(10, ("__kmp_internal_fork : after : root=%p, team=%p, " 2327 "master_th=%p, gtid=%d\n", 2328 root, team, master_th, gtid)); 2329 } 2330 2331 if (call_context == fork_context_gnu) { 2332 KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid)); 2333 return TRUE; 2334 } 2335 2336 /* Invoke microtask for PRIMARY thread */ 2337 KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n", gtid, 2338 team->t.t_id, team->t.t_pkfn)); 2339 } // END of timer KMP_fork_call block 2340 2341 #if KMP_STATS_ENABLED 2342 // If beginning a teams construct, then change thread state 2343 stats_state_e previous_state = KMP_GET_THREAD_STATE(); 2344 if (!ap) { 2345 KMP_SET_THREAD_STATE(stats_state_e::TEAMS_REGION); 2346 } 2347 #endif 2348 2349 if (!team->t.t_invoke(gtid)) { 2350 KMP_ASSERT2(0, "cannot invoke microtask for PRIMARY thread"); 2351 } 2352 2353 #if KMP_STATS_ENABLED 2354 // If was beginning of a teams construct, then reset thread state 2355 if (!ap) { 2356 KMP_SET_THREAD_STATE(previous_state); 2357 } 2358 #endif 2359 2360 KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n", gtid, 2361 team->t.t_id, team->t.t_pkfn)); 2362 KMP_MB(); /* Flush all pending memory write invalidates. */ 2363 2364 KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid)); 2365 #if OMPT_SUPPORT 2366 if (ompt_enabled.enabled) { 2367 master_th->th.ompt_thread_info.state = ompt_state_overhead; 2368 } 2369 #endif 2370 2371 return TRUE; 2372 } 2373 2374 #if OMPT_SUPPORT 2375 static inline void __kmp_join_restore_state(kmp_info_t *thread, 2376 kmp_team_t *team) { 2377 // restore state outside the region 2378 thread->th.ompt_thread_info.state = 2379 ((team->t.t_serialized) ? 
ompt_state_work_serial 2380 : ompt_state_work_parallel); 2381 } 2382 2383 static inline void __kmp_join_ompt(int gtid, kmp_info_t *thread, 2384 kmp_team_t *team, ompt_data_t *parallel_data, 2385 int flags, void *codeptr) { 2386 ompt_task_info_t *task_info = __ompt_get_task_info_object(0); 2387 if (ompt_enabled.ompt_callback_parallel_end) { 2388 ompt_callbacks.ompt_callback(ompt_callback_parallel_end)( 2389 parallel_data, &(task_info->task_data), flags, codeptr); 2390 } 2391 2392 task_info->frame.enter_frame = ompt_data_none; 2393 __kmp_join_restore_state(thread, team); 2394 } 2395 #endif 2396 2397 void __kmp_join_call(ident_t *loc, int gtid 2398 #if OMPT_SUPPORT 2399 , 2400 enum fork_context_e fork_context 2401 #endif 2402 , 2403 int exit_teams) { 2404 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_join_call); 2405 kmp_team_t *team; 2406 kmp_team_t *parent_team; 2407 kmp_info_t *master_th; 2408 kmp_root_t *root; 2409 int master_active; 2410 2411 KA_TRACE(20, ("__kmp_join_call: enter T#%d\n", gtid)); 2412 2413 /* setup current data */ 2414 master_th = __kmp_threads[gtid]; 2415 root = master_th->th.th_root; 2416 team = master_th->th.th_team; 2417 parent_team = team->t.t_parent; 2418 2419 master_th->th.th_ident = loc; 2420 2421 #if OMPT_SUPPORT 2422 void *team_microtask = (void *)team->t.t_pkfn; 2423 // For GOMP interface with serialized parallel, need the 2424 // __kmpc_end_serialized_parallel to call hooks for OMPT end-implicit-task 2425 // and end-parallel events. 2426 if (ompt_enabled.enabled && 2427 !(team->t.t_serialized && fork_context == fork_context_gnu)) { 2428 master_th->th.ompt_thread_info.state = ompt_state_overhead; 2429 } 2430 #endif 2431 2432 #if KMP_DEBUG 2433 if (__kmp_tasking_mode != tskm_immediate_exec && !exit_teams) { 2434 KA_TRACE(20, ("__kmp_join_call: T#%d, old team = %p old task_team = %p, " 2435 "th_task_team = %p\n", 2436 __kmp_gtid_from_thread(master_th), team, 2437 team->t.t_task_team[master_th->th.th_task_state], 2438 master_th->th.th_task_team)); 2439 KMP_DEBUG_ASSERT(master_th->th.th_task_team == 2440 team->t.t_task_team[master_th->th.th_task_state]); 2441 } 2442 #endif 2443 2444 if (team->t.t_serialized) { 2445 if (master_th->th.th_teams_microtask) { 2446 // We are in teams construct 2447 int level = team->t.t_level; 2448 int tlevel = master_th->th.th_teams_level; 2449 if (level == tlevel) { 2450 // AC: we haven't incremented it earlier at start of teams construct, 2451 // so do it here - at the end of teams construct 2452 team->t.t_level++; 2453 } else if (level == tlevel + 1) { 2454 // AC: we are exiting parallel inside teams, need to increment 2455 // serialization in order to restore it in the next call to 2456 // __kmpc_end_serialized_parallel 2457 team->t.t_serialized++; 2458 } 2459 } 2460 __kmpc_end_serialized_parallel(loc, gtid); 2461 2462 #if OMPT_SUPPORT 2463 if (ompt_enabled.enabled) { 2464 if (fork_context == fork_context_gnu) { 2465 __ompt_lw_taskteam_unlink(master_th); 2466 } 2467 __kmp_join_restore_state(master_th, parent_team); 2468 } 2469 #endif 2470 2471 return; 2472 } 2473 2474 master_active = team->t.t_master_active; 2475 2476 if (!exit_teams) { 2477 // AC: No barrier for internal teams at exit from teams construct. 2478 // But there is barrier for external team (league). 
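// i.e. a normal parallel join: wait for the workers at the join barrier
// before the primary thread tears the region down below.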
2479 __kmp_internal_join(loc, gtid, team); 2480 #if USE_ITT_BUILD 2481 if (__itt_stack_caller_create_ptr) { 2482 KMP_DEBUG_ASSERT(team->t.t_stack_id != NULL); 2483 // destroy the stack stitching id after join barrier 2484 __kmp_itt_stack_caller_destroy((__itt_caller)team->t.t_stack_id); 2485 team->t.t_stack_id = NULL; 2486 } 2487 #endif 2488 } else { 2489 master_th->th.th_task_state = 2490 0; // AC: no tasking in teams (out of any parallel) 2491 #if USE_ITT_BUILD 2492 if (__itt_stack_caller_create_ptr && parent_team->t.t_serialized) { 2493 KMP_DEBUG_ASSERT(parent_team->t.t_stack_id != NULL); 2494 // destroy the stack stitching id on exit from the teams construct 2495 // if parent_team is active, then the id will be destroyed later on 2496 // by master of the league of teams 2497 __kmp_itt_stack_caller_destroy((__itt_caller)parent_team->t.t_stack_id); 2498 parent_team->t.t_stack_id = NULL; 2499 } 2500 #endif 2501 } 2502 2503 KMP_MB(); 2504 2505 #if OMPT_SUPPORT 2506 ompt_data_t *parallel_data = &(team->t.ompt_team_info.parallel_data); 2507 void *codeptr = team->t.ompt_team_info.master_return_address; 2508 #endif 2509 2510 #if USE_ITT_BUILD 2511 // Mark end of "parallel" region for Intel(R) VTune(TM) analyzer. 2512 if (team->t.t_active_level == 1 && 2513 (!master_th->th.th_teams_microtask || /* not in teams construct */ 2514 master_th->th.th_teams_size.nteams == 1)) { 2515 master_th->th.th_ident = loc; 2516 // only one notification scheme (either "submit" or "forking/joined", not 2517 // both) 2518 if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) && 2519 __kmp_forkjoin_frames_mode == 3) 2520 __kmp_itt_frame_submit(gtid, team->t.t_region_time, 2521 master_th->th.th_frame_time, 0, loc, 2522 master_th->th.th_team_nproc, 1); 2523 else if ((__itt_frame_end_v3_ptr || KMP_ITT_DEBUG) && 2524 !__kmp_forkjoin_frames_mode && __kmp_forkjoin_frames) 2525 __kmp_itt_region_joined(gtid); 2526 } // active_level == 1 2527 #endif /* USE_ITT_BUILD */ 2528 2529 #if KMP_AFFINITY_SUPPORTED 2530 if (!exit_teams) { 2531 // Restore master thread's partition. 2532 master_th->th.th_first_place = team->t.t_first_place; 2533 master_th->th.th_last_place = team->t.t_last_place; 2534 } 2535 #endif // KMP_AFFINITY_SUPPORTED 2536 2537 if (master_th->th.th_teams_microtask && !exit_teams && 2538 team->t.t_pkfn != (microtask_t)__kmp_teams_master && 2539 team->t.t_level == master_th->th.th_teams_level + 1) { 2540 // AC: We need to leave the team structure intact at the end of parallel 2541 // inside the teams construct, so that at the next parallel same (hot) team 2542 // works, only adjust nesting levels 2543 #if OMPT_SUPPORT 2544 ompt_data_t ompt_parallel_data = ompt_data_none; 2545 if (ompt_enabled.enabled) { 2546 ompt_task_info_t *task_info = __ompt_get_task_info_object(0); 2547 if (ompt_enabled.ompt_callback_implicit_task) { 2548 int ompt_team_size = team->t.t_nproc; 2549 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 2550 ompt_scope_end, NULL, &(task_info->task_data), ompt_team_size, 2551 OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit); 2552 } 2553 task_info->frame.exit_frame = ompt_data_none; 2554 task_info->task_data = ompt_data_none; 2555 ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th); 2556 __ompt_lw_taskteam_unlink(master_th); 2557 } 2558 #endif 2559 /* Decrement our nested depth level */ 2560 team->t.t_level--; 2561 team->t.t_active_level--; 2562 KMP_ATOMIC_DEC(&root->r.r_in_parallel); 2563 2564 // Restore number of threads in the team if needed. 
This code relies on 2565 // the proper adjustment of th_teams_size.nth after the fork in 2566 // __kmp_teams_master on each teams primary thread in the case that 2567 // __kmp_reserve_threads reduced it. 2568 if (master_th->th.th_team_nproc < master_th->th.th_teams_size.nth) { 2569 int old_num = master_th->th.th_team_nproc; 2570 int new_num = master_th->th.th_teams_size.nth; 2571 kmp_info_t **other_threads = team->t.t_threads; 2572 team->t.t_nproc = new_num; 2573 for (int i = 0; i < old_num; ++i) { 2574 other_threads[i]->th.th_team_nproc = new_num; 2575 } 2576 // Adjust states of non-used threads of the team 2577 for (int i = old_num; i < new_num; ++i) { 2578 // Re-initialize thread's barrier data. 2579 KMP_DEBUG_ASSERT(other_threads[i]); 2580 kmp_balign_t *balign = other_threads[i]->th.th_bar; 2581 for (int b = 0; b < bs_last_barrier; ++b) { 2582 balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived; 2583 KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG); 2584 #if USE_DEBUGGER 2585 balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived; 2586 #endif 2587 } 2588 if (__kmp_tasking_mode != tskm_immediate_exec) { 2589 // Synchronize thread's task state 2590 other_threads[i]->th.th_task_state = master_th->th.th_task_state; 2591 } 2592 } 2593 } 2594 2595 #if OMPT_SUPPORT 2596 if (ompt_enabled.enabled) { 2597 __kmp_join_ompt(gtid, master_th, parent_team, &ompt_parallel_data, 2598 OMPT_INVOKER(fork_context) | ompt_parallel_team, codeptr); 2599 } 2600 #endif 2601 2602 return; 2603 } 2604 2605 /* do cleanup and restore the parent team */ 2606 master_th->th.th_info.ds.ds_tid = team->t.t_master_tid; 2607 master_th->th.th_local.this_construct = team->t.t_master_this_cons; 2608 2609 master_th->th.th_dispatch = &parent_team->t.t_dispatch[team->t.t_master_tid]; 2610 2611 /* jc: The following lock has instructions with REL and ACQ semantics, 2612 separating the parallel user code called in this parallel region 2613 from the serial user code called after this function returns. */ 2614 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock); 2615 2616 if (!master_th->th.th_teams_microtask || 2617 team->t.t_level > master_th->th.th_teams_level) { 2618 /* Decrement our nested depth level */ 2619 KMP_ATOMIC_DEC(&root->r.r_in_parallel); 2620 } 2621 KMP_DEBUG_ASSERT(root->r.r_in_parallel >= 0); 2622 2623 #if OMPT_SUPPORT 2624 if (ompt_enabled.enabled) { 2625 ompt_task_info_t *task_info = __ompt_get_task_info_object(0); 2626 if (ompt_enabled.ompt_callback_implicit_task) { 2627 int flags = (team_microtask == (void *)__kmp_teams_master) 2628 ? ompt_task_initial 2629 : ompt_task_implicit; 2630 int ompt_team_size = (flags == ompt_task_initial) ? 
0 : team->t.t_nproc; 2631 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 2632 ompt_scope_end, NULL, &(task_info->task_data), ompt_team_size, 2633 OMPT_CUR_TASK_INFO(master_th)->thread_num, flags); 2634 } 2635 task_info->frame.exit_frame = ompt_data_none; 2636 task_info->task_data = ompt_data_none; 2637 } 2638 #endif 2639 2640 KF_TRACE(10, ("__kmp_join_call1: T#%d, this_thread=%p team=%p\n", 0, 2641 master_th, team)); 2642 __kmp_pop_current_task_from_thread(master_th); 2643 2644 master_th->th.th_def_allocator = team->t.t_def_allocator; 2645 2646 #if OMPD_SUPPORT 2647 if (ompd_state & OMPD_ENABLE_BP) 2648 ompd_bp_parallel_end(); 2649 #endif 2650 updateHWFPControl(team); 2651 2652 if (root->r.r_active != master_active) 2653 root->r.r_active = master_active; 2654 2655 __kmp_free_team(root, team USE_NESTED_HOT_ARG( 2656 master_th)); // this will free worker threads 2657 2658 /* this race was fun to find. make sure the following is in the critical 2659 region otherwise assertions may fail occasionally since the old team may be 2660 reallocated and the hierarchy appears inconsistent. it is actually safe to 2661 run and won't cause any bugs, but will cause those assertion failures. it's 2662 only one deref&assign so might as well put this in the critical region */ 2663 master_th->th.th_team = parent_team; 2664 master_th->th.th_team_nproc = parent_team->t.t_nproc; 2665 master_th->th.th_team_master = parent_team->t.t_threads[0]; 2666 master_th->th.th_team_serialized = parent_team->t.t_serialized; 2667 2668 /* restore serialized team, if need be */ 2669 if (parent_team->t.t_serialized && 2670 parent_team != master_th->th.th_serial_team && 2671 parent_team != root->r.r_root_team) { 2672 __kmp_free_team(root, 2673 master_th->th.th_serial_team USE_NESTED_HOT_ARG(NULL)); 2674 master_th->th.th_serial_team = parent_team; 2675 } 2676 2677 if (__kmp_tasking_mode != tskm_immediate_exec) { 2678 if (master_th->th.th_task_state_top > 2679 0) { // Restore task state from memo stack 2680 KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack); 2681 // Remember primary thread's state if we re-use this nested hot team 2682 master_th->th.th_task_state_memo_stack[master_th->th.th_task_state_top] = 2683 master_th->th.th_task_state; 2684 --master_th->th.th_task_state_top; // pop 2685 // Now restore state at this level 2686 master_th->th.th_task_state = 2687 master_th->th 2688 .th_task_state_memo_stack[master_th->th.th_task_state_top]; 2689 } else if (team != root->r.r_hot_team) { 2690 // Reset the task state of primary thread if we are not hot team because 2691 // in this case all the worker threads will be free, and their task state 2692 // will be reset. If not reset the primary's, the task state will be 2693 // inconsistent. 
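// (The memo stack mirrors nesting: __kmp_fork_call pushes the primary
// thread's task_state when a nested region is forked, and the pop in the
// branch above restores it; this branch is only reached when the stack is
// empty and the team is not the hot team.)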
2694 master_th->th.th_task_state = 0; 2695 } 2696 // Copy the task team from the parent team to the primary thread 2697 master_th->th.th_task_team = 2698 parent_team->t.t_task_team[master_th->th.th_task_state]; 2699 KA_TRACE(20, 2700 ("__kmp_join_call: Primary T#%d restoring task_team %p, team %p\n", 2701 __kmp_gtid_from_thread(master_th), master_th->th.th_task_team, 2702 parent_team)); 2703 } 2704 2705 // TODO: GEH - cannot do this assertion because root thread not set up as 2706 // executing 2707 // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 0 ); 2708 master_th->th.th_current_task->td_flags.executing = 1; 2709 2710 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); 2711 2712 #if KMP_AFFINITY_SUPPORTED 2713 if (master_th->th.th_team->t.t_level == 0 && __kmp_affinity.flags.reset) { 2714 __kmp_reset_root_init_mask(gtid); 2715 } 2716 #endif 2717 #if OMPT_SUPPORT 2718 int flags = 2719 OMPT_INVOKER(fork_context) | 2720 ((team_microtask == (void *)__kmp_teams_master) ? ompt_parallel_league 2721 : ompt_parallel_team); 2722 if (ompt_enabled.enabled) { 2723 __kmp_join_ompt(gtid, master_th, parent_team, parallel_data, flags, 2724 codeptr); 2725 } 2726 #endif 2727 2728 KMP_MB(); 2729 KA_TRACE(20, ("__kmp_join_call: exit T#%d\n", gtid)); 2730 } 2731 2732 /* Check whether we should push an internal control record onto the 2733 serial team stack. If so, do it. */ 2734 void __kmp_save_internal_controls(kmp_info_t *thread) { 2735 2736 if (thread->th.th_team != thread->th.th_serial_team) { 2737 return; 2738 } 2739 if (thread->th.th_team->t.t_serialized > 1) { 2740 int push = 0; 2741 2742 if (thread->th.th_team->t.t_control_stack_top == NULL) { 2743 push = 1; 2744 } else { 2745 if (thread->th.th_team->t.t_control_stack_top->serial_nesting_level != 2746 thread->th.th_team->t.t_serialized) { 2747 push = 1; 2748 } 2749 } 2750 if (push) { /* push a record on the serial team's stack */ 2751 kmp_internal_control_t *control = 2752 (kmp_internal_control_t *)__kmp_allocate( 2753 sizeof(kmp_internal_control_t)); 2754 2755 copy_icvs(control, &thread->th.th_current_task->td_icvs); 2756 2757 control->serial_nesting_level = thread->th.th_team->t.t_serialized; 2758 2759 control->next = thread->th.th_team->t.t_control_stack_top; 2760 thread->th.th_team->t.t_control_stack_top = control; 2761 } 2762 } 2763 } 2764 2765 /* Changes set_nproc */ 2766 void __kmp_set_num_threads(int new_nth, int gtid) { 2767 kmp_info_t *thread; 2768 kmp_root_t *root; 2769 2770 KF_TRACE(10, ("__kmp_set_num_threads: new __kmp_nth = %d\n", new_nth)); 2771 KMP_DEBUG_ASSERT(__kmp_init_serial); 2772 2773 if (new_nth < 1) 2774 new_nth = 1; 2775 else if (new_nth > __kmp_max_nth) 2776 new_nth = __kmp_max_nth; 2777 2778 KMP_COUNT_VALUE(OMP_set_numthreads, new_nth); 2779 thread = __kmp_threads[gtid]; 2780 if (thread->th.th_current_task->td_icvs.nproc == new_nth) 2781 return; // nothing to do 2782 2783 __kmp_save_internal_controls(thread); 2784 2785 set__nproc(thread, new_nth); 2786 2787 // If this omp_set_num_threads() call will cause the hot team size to be 2788 // reduced (in the absence of a num_threads clause), then reduce it now, 2789 // rather than waiting for the next parallel region. 
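// For example (user code, illustrative only):
//   omp_set_num_threads(8);
//   #pragma omp parallel
//   { ... }                    // hot team grows to 8 workers
//   omp_set_num_threads(2);    // surplus hot-team threads are freed here,
//                              // not at the next parallel region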
2790 root = thread->th.th_root; 2791 if (__kmp_init_parallel && (!root->r.r_active) && 2792 (root->r.r_hot_team->t.t_nproc > new_nth) 2793 #if KMP_NESTED_HOT_TEAMS 2794 && __kmp_hot_teams_max_level && !__kmp_hot_teams_mode 2795 #endif 2796 ) { 2797 kmp_team_t *hot_team = root->r.r_hot_team; 2798 int f; 2799 2800 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock); 2801 2802 if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) { 2803 __kmp_resize_dist_barrier(hot_team, hot_team->t.t_nproc, new_nth); 2804 } 2805 // Release the extra threads we don't need any more. 2806 for (f = new_nth; f < hot_team->t.t_nproc; f++) { 2807 KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL); 2808 if (__kmp_tasking_mode != tskm_immediate_exec) { 2809 // When decreasing team size, threads no longer in the team should unref 2810 // task team. 2811 hot_team->t.t_threads[f]->th.th_task_team = NULL; 2812 } 2813 __kmp_free_thread(hot_team->t.t_threads[f]); 2814 hot_team->t.t_threads[f] = NULL; 2815 } 2816 hot_team->t.t_nproc = new_nth; 2817 #if KMP_NESTED_HOT_TEAMS 2818 if (thread->th.th_hot_teams) { 2819 KMP_DEBUG_ASSERT(hot_team == thread->th.th_hot_teams[0].hot_team); 2820 thread->th.th_hot_teams[0].hot_team_nth = new_nth; 2821 } 2822 #endif 2823 2824 if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) { 2825 hot_team->t.b->update_num_threads(new_nth); 2826 __kmp_add_threads_to_team(hot_team, new_nth); 2827 } 2828 2829 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); 2830 2831 // Update the t_nproc field in the threads that are still active. 2832 for (f = 0; f < new_nth; f++) { 2833 KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL); 2834 hot_team->t.t_threads[f]->th.th_team_nproc = new_nth; 2835 } 2836 // Special flag in case omp_set_num_threads() call 2837 hot_team->t.t_size_changed = -1; 2838 } 2839 } 2840 2841 /* Changes max_active_levels */ 2842 void __kmp_set_max_active_levels(int gtid, int max_active_levels) { 2843 kmp_info_t *thread; 2844 2845 KF_TRACE(10, ("__kmp_set_max_active_levels: new max_active_levels for thread " 2846 "%d = (%d)\n", 2847 gtid, max_active_levels)); 2848 KMP_DEBUG_ASSERT(__kmp_init_serial); 2849 2850 // validate max_active_levels 2851 if (max_active_levels < 0) { 2852 KMP_WARNING(ActiveLevelsNegative, max_active_levels); 2853 // We ignore this call if the user has specified a negative value. 2854 // The current setting won't be changed. The last valid setting will be 2855 // used. A warning will be issued (if warnings are allowed as controlled by 2856 // the KMP_WARNINGS env var). 2857 KF_TRACE(10, ("__kmp_set_max_active_levels: the call is ignored: new " 2858 "max_active_levels for thread %d = (%d)\n", 2859 gtid, max_active_levels)); 2860 return; 2861 } 2862 if (max_active_levels <= KMP_MAX_ACTIVE_LEVELS_LIMIT) { 2863 // it's OK, the max_active_levels is within the valid range: [ 0; 2864 // KMP_MAX_ACTIVE_LEVELS_LIMIT ] 2865 // We allow a zero value. (implementation defined behavior) 2866 } else { 2867 KMP_WARNING(ActiveLevelsExceedLimit, max_active_levels, 2868 KMP_MAX_ACTIVE_LEVELS_LIMIT); 2869 max_active_levels = KMP_MAX_ACTIVE_LEVELS_LIMIT; 2870 // Current upper limit is MAX_INT. (implementation defined behavior) 2871 // If the input exceeds the upper limit, we correct the input to be the 2872 // upper limit. (implementation defined behavior) 2873 // Actually, the flow should never get here until we use MAX_INT limit. 
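// (Net effect: a negative request was already ignored above with a warning,
// and a request above KMP_MAX_ACTIVE_LEVELS_LIMIT is clamped to that limit;
// since the limit is currently INT_MAX this clamp is effectively
// unreachable.)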
2874 } 2875 KF_TRACE(10, ("__kmp_set_max_active_levels: after validation: new " 2876 "max_active_levels for thread %d = (%d)\n", 2877 gtid, max_active_levels)); 2878 2879 thread = __kmp_threads[gtid]; 2880 2881 __kmp_save_internal_controls(thread); 2882 2883 set__max_active_levels(thread, max_active_levels); 2884 } 2885 2886 /* Gets max_active_levels */ 2887 int __kmp_get_max_active_levels(int gtid) { 2888 kmp_info_t *thread; 2889 2890 KF_TRACE(10, ("__kmp_get_max_active_levels: thread %d\n", gtid)); 2891 KMP_DEBUG_ASSERT(__kmp_init_serial); 2892 2893 thread = __kmp_threads[gtid]; 2894 KMP_DEBUG_ASSERT(thread->th.th_current_task); 2895 KF_TRACE(10, ("__kmp_get_max_active_levels: thread %d, curtask=%p, " 2896 "curtask_maxaclevel=%d\n", 2897 gtid, thread->th.th_current_task, 2898 thread->th.th_current_task->td_icvs.max_active_levels)); 2899 return thread->th.th_current_task->td_icvs.max_active_levels; 2900 } 2901 2902 // nteams-var per-device ICV 2903 void __kmp_set_num_teams(int num_teams) { 2904 if (num_teams > 0) 2905 __kmp_nteams = num_teams; 2906 } 2907 int __kmp_get_max_teams(void) { return __kmp_nteams; } 2908 // teams-thread-limit-var per-device ICV 2909 void __kmp_set_teams_thread_limit(int limit) { 2910 if (limit > 0) 2911 __kmp_teams_thread_limit = limit; 2912 } 2913 int __kmp_get_teams_thread_limit(void) { return __kmp_teams_thread_limit; } 2914 2915 KMP_BUILD_ASSERT(sizeof(kmp_sched_t) == sizeof(int)); 2916 KMP_BUILD_ASSERT(sizeof(enum sched_type) == sizeof(int)); 2917 2918 /* Changes def_sched_var ICV values (run-time schedule kind and chunk) */ 2919 void __kmp_set_schedule(int gtid, kmp_sched_t kind, int chunk) { 2920 kmp_info_t *thread; 2921 kmp_sched_t orig_kind; 2922 // kmp_team_t *team; 2923 2924 KF_TRACE(10, ("__kmp_set_schedule: new schedule for thread %d = (%d, %d)\n", 2925 gtid, (int)kind, chunk)); 2926 KMP_DEBUG_ASSERT(__kmp_init_serial); 2927 2928 // Check if the kind parameter is valid, correct if needed. 2929 // Valid parameters should fit in one of two intervals - standard or extended: 2930 // <lower>, <valid>, <upper_std>, <lower_ext>, <valid>, <upper> 2931 // 2008-01-25: 0, 1 - 4, 5, 100, 101 - 102, 103 2932 orig_kind = kind; 2933 kind = __kmp_sched_without_mods(kind); 2934 2935 if (kind <= kmp_sched_lower || kind >= kmp_sched_upper || 2936 (kind <= kmp_sched_lower_ext && kind >= kmp_sched_upper_std)) { 2937 // TODO: Hint needs attention in case we change the default schedule. 2938 __kmp_msg(kmp_ms_warning, KMP_MSG(ScheduleKindOutOfRange, kind), 2939 KMP_HNT(DefaultScheduleKindUsed, "static, no chunk"), 2940 __kmp_msg_null); 2941 kind = kmp_sched_default; 2942 chunk = 0; // ignore chunk value in case of bad kind 2943 } 2944 2945 thread = __kmp_threads[gtid]; 2946 2947 __kmp_save_internal_controls(thread); 2948 2949 if (kind < kmp_sched_upper_std) { 2950 if (kind == kmp_sched_static && chunk < KMP_DEFAULT_CHUNK) { 2951 // differ static chunked vs. 
unchunked: chunk should be invalid to 2952 // indicate unchunked schedule (which is the default) 2953 thread->th.th_current_task->td_icvs.sched.r_sched_type = kmp_sch_static; 2954 } else { 2955 thread->th.th_current_task->td_icvs.sched.r_sched_type = 2956 __kmp_sch_map[kind - kmp_sched_lower - 1]; 2957 } 2958 } else { 2959 // __kmp_sch_map[ kind - kmp_sched_lower_ext + kmp_sched_upper_std - 2960 // kmp_sched_lower - 2 ]; 2961 thread->th.th_current_task->td_icvs.sched.r_sched_type = 2962 __kmp_sch_map[kind - kmp_sched_lower_ext + kmp_sched_upper_std - 2963 kmp_sched_lower - 2]; 2964 } 2965 __kmp_sched_apply_mods_intkind( 2966 orig_kind, &(thread->th.th_current_task->td_icvs.sched.r_sched_type)); 2967 if (kind == kmp_sched_auto || chunk < 1) { 2968 // ignore parameter chunk for schedule auto 2969 thread->th.th_current_task->td_icvs.sched.chunk = KMP_DEFAULT_CHUNK; 2970 } else { 2971 thread->th.th_current_task->td_icvs.sched.chunk = chunk; 2972 } 2973 } 2974 2975 /* Gets def_sched_var ICV values */ 2976 void __kmp_get_schedule(int gtid, kmp_sched_t *kind, int *chunk) { 2977 kmp_info_t *thread; 2978 enum sched_type th_type; 2979 2980 KF_TRACE(10, ("__kmp_get_schedule: thread %d\n", gtid)); 2981 KMP_DEBUG_ASSERT(__kmp_init_serial); 2982 2983 thread = __kmp_threads[gtid]; 2984 2985 th_type = thread->th.th_current_task->td_icvs.sched.r_sched_type; 2986 switch (SCHEDULE_WITHOUT_MODIFIERS(th_type)) { 2987 case kmp_sch_static: 2988 case kmp_sch_static_greedy: 2989 case kmp_sch_static_balanced: 2990 *kind = kmp_sched_static; 2991 __kmp_sched_apply_mods_stdkind(kind, th_type); 2992 *chunk = 0; // chunk was not set, try to show this fact via zero value 2993 return; 2994 case kmp_sch_static_chunked: 2995 *kind = kmp_sched_static; 2996 break; 2997 case kmp_sch_dynamic_chunked: 2998 *kind = kmp_sched_dynamic; 2999 break; 3000 case kmp_sch_guided_chunked: 3001 case kmp_sch_guided_iterative_chunked: 3002 case kmp_sch_guided_analytical_chunked: 3003 *kind = kmp_sched_guided; 3004 break; 3005 case kmp_sch_auto: 3006 *kind = kmp_sched_auto; 3007 break; 3008 case kmp_sch_trapezoidal: 3009 *kind = kmp_sched_trapezoidal; 3010 break; 3011 #if KMP_STATIC_STEAL_ENABLED 3012 case kmp_sch_static_steal: 3013 *kind = kmp_sched_static_steal; 3014 break; 3015 #endif 3016 default: 3017 KMP_FATAL(UnknownSchedulingType, th_type); 3018 } 3019 3020 __kmp_sched_apply_mods_stdkind(kind, th_type); 3021 *chunk = thread->th.th_current_task->td_icvs.sched.chunk; 3022 } 3023 3024 int __kmp_get_ancestor_thread_num(int gtid, int level) { 3025 3026 int ii, dd; 3027 kmp_team_t *team; 3028 kmp_info_t *thr; 3029 3030 KF_TRACE(10, ("__kmp_get_ancestor_thread_num: thread %d %d\n", gtid, level)); 3031 KMP_DEBUG_ASSERT(__kmp_init_serial); 3032 3033 // validate level 3034 if (level == 0) 3035 return 0; 3036 if (level < 0) 3037 return -1; 3038 thr = __kmp_threads[gtid]; 3039 team = thr->th.th_team; 3040 ii = team->t.t_level; 3041 if (level > ii) 3042 return -1; 3043 3044 if (thr->th.th_teams_microtask) { 3045 // AC: we are in teams region where multiple nested teams have same level 3046 int tlevel = thr->th.th_teams_level; // the level of the teams construct 3047 if (level <= 3048 tlevel) { // otherwise usual algorithm works (will not touch the teams) 3049 KMP_DEBUG_ASSERT(ii >= tlevel); 3050 // AC: As we need to pass by the teams league, we need to artificially 3051 // increase ii 3052 if (ii == tlevel) { 3053 ii += 2; // three teams have same level 3054 } else { 3055 ii++; // two teams have same level 3056 } 3057 } 3058 } 3059 3060 if (ii == 
level) 3061 return __kmp_tid_from_gtid(gtid); 3062 3063 dd = team->t.t_serialized; 3064 level++; 3065 while (ii > level) { 3066 for (dd = team->t.t_serialized; (dd > 0) && (ii > level); dd--, ii--) { 3067 } 3068 if ((team->t.t_serialized) && (!dd)) { 3069 team = team->t.t_parent; 3070 continue; 3071 } 3072 if (ii > level) { 3073 team = team->t.t_parent; 3074 dd = team->t.t_serialized; 3075 ii--; 3076 } 3077 } 3078 3079 return (dd > 1) ? (0) : (team->t.t_master_tid); 3080 } 3081 3082 int __kmp_get_team_size(int gtid, int level) { 3083 3084 int ii, dd; 3085 kmp_team_t *team; 3086 kmp_info_t *thr; 3087 3088 KF_TRACE(10, ("__kmp_get_team_size: thread %d %d\n", gtid, level)); 3089 KMP_DEBUG_ASSERT(__kmp_init_serial); 3090 3091 // validate level 3092 if (level == 0) 3093 return 1; 3094 if (level < 0) 3095 return -1; 3096 thr = __kmp_threads[gtid]; 3097 team = thr->th.th_team; 3098 ii = team->t.t_level; 3099 if (level > ii) 3100 return -1; 3101 3102 if (thr->th.th_teams_microtask) { 3103 // AC: we are in teams region where multiple nested teams have same level 3104 int tlevel = thr->th.th_teams_level; // the level of the teams construct 3105 if (level <= 3106 tlevel) { // otherwise usual algorithm works (will not touch the teams) 3107 KMP_DEBUG_ASSERT(ii >= tlevel); 3108 // AC: As we need to pass by the teams league, we need to artificially 3109 // increase ii 3110 if (ii == tlevel) { 3111 ii += 2; // three teams have same level 3112 } else { 3113 ii++; // two teams have same level 3114 } 3115 } 3116 } 3117 3118 while (ii > level) { 3119 for (dd = team->t.t_serialized; (dd > 0) && (ii > level); dd--, ii--) { 3120 } 3121 if (team->t.t_serialized && (!dd)) { 3122 team = team->t.t_parent; 3123 continue; 3124 } 3125 if (ii > level) { 3126 team = team->t.t_parent; 3127 ii--; 3128 } 3129 } 3130 3131 return team->t.t_nproc; 3132 } 3133 3134 kmp_r_sched_t __kmp_get_schedule_global() { 3135 // This routine created because pairs (__kmp_sched, __kmp_chunk) and 3136 // (__kmp_static, __kmp_guided) may be changed by kmp_set_defaults 3137 // independently. So one can get the updated schedule here. 3138 3139 kmp_r_sched_t r_sched; 3140 3141 // create schedule from 4 globals: __kmp_sched, __kmp_chunk, __kmp_static, 3142 // __kmp_guided. __kmp_sched should keep original value, so that user can set 3143 // KMP_SCHEDULE multiple times, and thus have different run-time schedules in 3144 // different roots (even in OMP 2.5) 3145 enum sched_type s = SCHEDULE_WITHOUT_MODIFIERS(__kmp_sched); 3146 enum sched_type sched_modifiers = SCHEDULE_GET_MODIFIERS(__kmp_sched); 3147 if (s == kmp_sch_static) { 3148 // replace STATIC with more detailed schedule (balanced or greedy) 3149 r_sched.r_sched_type = __kmp_static; 3150 } else if (s == kmp_sch_guided_chunked) { 3151 // replace GUIDED with more detailed schedule (iterative or analytical) 3152 r_sched.r_sched_type = __kmp_guided; 3153 } else { // (STATIC_CHUNKED), or (DYNAMIC_CHUNKED), or other 3154 r_sched.r_sched_type = __kmp_sched; 3155 } 3156 SCHEDULE_SET_MODIFIERS(r_sched.r_sched_type, sched_modifiers); 3157 3158 if (__kmp_chunk < KMP_DEFAULT_CHUNK) { 3159 // __kmp_chunk may be wrong here (if it was not ever set) 3160 r_sched.chunk = KMP_DEFAULT_CHUNK; 3161 } else { 3162 r_sched.chunk = __kmp_chunk; 3163 } 3164 3165 return r_sched; 3166 } 3167 3168 /* Allocate (realloc == FALSE) * or reallocate (realloc == TRUE) 3169 at least argc number of *t_argv entries for the requested team. 
*/ 3170 static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team, int realloc) { 3171 3172 KMP_DEBUG_ASSERT(team); 3173 if (!realloc || argc > team->t.t_max_argc) { 3174 3175 KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: needed entries=%d, " 3176 "current entries=%d\n", 3177 team->t.t_id, argc, (realloc) ? team->t.t_max_argc : 0)); 3178 /* if previously allocated heap space for args, free them */ 3179 if (realloc && team->t.t_argv != &team->t.t_inline_argv[0]) 3180 __kmp_free((void *)team->t.t_argv); 3181 3182 if (argc <= KMP_INLINE_ARGV_ENTRIES) { 3183 /* use unused space in the cache line for arguments */ 3184 team->t.t_max_argc = KMP_INLINE_ARGV_ENTRIES; 3185 KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: inline allocate %d " 3186 "argv entries\n", 3187 team->t.t_id, team->t.t_max_argc)); 3188 team->t.t_argv = &team->t.t_inline_argv[0]; 3189 if (__kmp_storage_map) { 3190 __kmp_print_storage_map_gtid( 3191 -1, &team->t.t_inline_argv[0], 3192 &team->t.t_inline_argv[KMP_INLINE_ARGV_ENTRIES], 3193 (sizeof(void *) * KMP_INLINE_ARGV_ENTRIES), "team_%d.t_inline_argv", 3194 team->t.t_id); 3195 } 3196 } else { 3197 /* allocate space for arguments in the heap */ 3198 team->t.t_max_argc = (argc <= (KMP_MIN_MALLOC_ARGV_ENTRIES >> 1)) 3199 ? KMP_MIN_MALLOC_ARGV_ENTRIES 3200 : 2 * argc; 3201 KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: dynamic allocate %d " 3202 "argv entries\n", 3203 team->t.t_id, team->t.t_max_argc)); 3204 team->t.t_argv = 3205 (void **)__kmp_page_allocate(sizeof(void *) * team->t.t_max_argc); 3206 if (__kmp_storage_map) { 3207 __kmp_print_storage_map_gtid(-1, &team->t.t_argv[0], 3208 &team->t.t_argv[team->t.t_max_argc], 3209 sizeof(void *) * team->t.t_max_argc, 3210 "team_%d.t_argv", team->t.t_id); 3211 } 3212 } 3213 } 3214 } 3215 3216 static void __kmp_allocate_team_arrays(kmp_team_t *team, int max_nth) { 3217 int i; 3218 int num_disp_buff = max_nth > 1 ? 
__kmp_dispatch_num_buffers : 2; 3219 team->t.t_threads = 3220 (kmp_info_t **)__kmp_allocate(sizeof(kmp_info_t *) * max_nth); 3221 team->t.t_disp_buffer = (dispatch_shared_info_t *)__kmp_allocate( 3222 sizeof(dispatch_shared_info_t) * num_disp_buff); 3223 team->t.t_dispatch = 3224 (kmp_disp_t *)__kmp_allocate(sizeof(kmp_disp_t) * max_nth); 3225 team->t.t_implicit_task_taskdata = 3226 (kmp_taskdata_t *)__kmp_allocate(sizeof(kmp_taskdata_t) * max_nth); 3227 team->t.t_max_nproc = max_nth; 3228 3229 /* setup dispatch buffers */ 3230 for (i = 0; i < num_disp_buff; ++i) { 3231 team->t.t_disp_buffer[i].buffer_index = i; 3232 team->t.t_disp_buffer[i].doacross_buf_idx = i; 3233 } 3234 } 3235 3236 static void __kmp_free_team_arrays(kmp_team_t *team) { 3237 /* Note: this does not free the threads in t_threads (__kmp_free_threads) */ 3238 int i; 3239 for (i = 0; i < team->t.t_max_nproc; ++i) { 3240 if (team->t.t_dispatch[i].th_disp_buffer != NULL) { 3241 __kmp_free(team->t.t_dispatch[i].th_disp_buffer); 3242 team->t.t_dispatch[i].th_disp_buffer = NULL; 3243 } 3244 } 3245 #if KMP_USE_HIER_SCHED 3246 __kmp_dispatch_free_hierarchies(team); 3247 #endif 3248 __kmp_free(team->t.t_threads); 3249 __kmp_free(team->t.t_disp_buffer); 3250 __kmp_free(team->t.t_dispatch); 3251 __kmp_free(team->t.t_implicit_task_taskdata); 3252 team->t.t_threads = NULL; 3253 team->t.t_disp_buffer = NULL; 3254 team->t.t_dispatch = NULL; 3255 team->t.t_implicit_task_taskdata = 0; 3256 } 3257 3258 static void __kmp_reallocate_team_arrays(kmp_team_t *team, int max_nth) { 3259 kmp_info_t **oldThreads = team->t.t_threads; 3260 3261 __kmp_free(team->t.t_disp_buffer); 3262 __kmp_free(team->t.t_dispatch); 3263 __kmp_free(team->t.t_implicit_task_taskdata); 3264 __kmp_allocate_team_arrays(team, max_nth); 3265 3266 KMP_MEMCPY(team->t.t_threads, oldThreads, 3267 team->t.t_nproc * sizeof(kmp_info_t *)); 3268 3269 __kmp_free(oldThreads); 3270 } 3271 3272 static kmp_internal_control_t __kmp_get_global_icvs(void) { 3273 3274 kmp_r_sched_t r_sched = 3275 __kmp_get_schedule_global(); // get current state of scheduling globals 3276 3277 KMP_DEBUG_ASSERT(__kmp_nested_proc_bind.used > 0); 3278 3279 kmp_internal_control_t g_icvs = { 3280 0, // int serial_nesting_level; //corresponds to value of th_team_serialized 3281 (kmp_int8)__kmp_global.g.g_dynamic, // internal control for dynamic 3282 // adjustment of threads (per thread) 3283 (kmp_int8)__kmp_env_blocktime, // int bt_set; //internal control for 3284 // whether blocktime is explicitly set 3285 __kmp_dflt_blocktime, // int blocktime; //internal control for blocktime 3286 #if KMP_USE_MONITOR 3287 __kmp_bt_intervals, // int bt_intervals; //internal control for blocktime 3288 // intervals 3289 #endif 3290 __kmp_dflt_team_nth, // int nproc; //internal control for # of threads for 3291 // next parallel region (per thread) 3292 // (use a max ub on value if __kmp_parallel_initialize not called yet) 3293 __kmp_cg_max_nth, // int thread_limit; 3294 __kmp_dflt_max_active_levels, // int max_active_levels; //internal control 3295 // for max_active_levels 3296 r_sched, // kmp_r_sched_t sched; //internal control for runtime schedule 3297 // {sched,chunk} pair 3298 __kmp_nested_proc_bind.bind_types[0], 3299 __kmp_default_device, 3300 NULL // struct kmp_internal_control *next; 3301 }; 3302 3303 return g_icvs; 3304 } 3305 3306 static kmp_internal_control_t __kmp_get_x_global_icvs(const kmp_team_t *team) { 3307 3308 kmp_internal_control_t gx_icvs; 3309 gx_icvs.serial_nesting_level = 3310 0; // probably =team->t.t_serial 
like in save_inter_controls 3311 copy_icvs(&gx_icvs, &team->t.t_threads[0]->th.th_current_task->td_icvs); 3312 gx_icvs.next = NULL; 3313 3314 return gx_icvs; 3315 } 3316 3317 static void __kmp_initialize_root(kmp_root_t *root) { 3318 int f; 3319 kmp_team_t *root_team; 3320 kmp_team_t *hot_team; 3321 int hot_team_max_nth; 3322 kmp_r_sched_t r_sched = 3323 __kmp_get_schedule_global(); // get current state of scheduling globals 3324 kmp_internal_control_t r_icvs = __kmp_get_global_icvs(); 3325 KMP_DEBUG_ASSERT(root); 3326 KMP_ASSERT(!root->r.r_begin); 3327 3328 /* setup the root state structure */ 3329 __kmp_init_lock(&root->r.r_begin_lock); 3330 root->r.r_begin = FALSE; 3331 root->r.r_active = FALSE; 3332 root->r.r_in_parallel = 0; 3333 root->r.r_blocktime = __kmp_dflt_blocktime; 3334 #if KMP_AFFINITY_SUPPORTED 3335 root->r.r_affinity_assigned = FALSE; 3336 #endif 3337 3338 /* setup the root team for this task */ 3339 /* allocate the root team structure */ 3340 KF_TRACE(10, ("__kmp_initialize_root: before root_team\n")); 3341 3342 root_team = 3343 __kmp_allocate_team(root, 3344 1, // new_nproc 3345 1, // max_nproc 3346 #if OMPT_SUPPORT 3347 ompt_data_none, // root parallel id 3348 #endif 3349 __kmp_nested_proc_bind.bind_types[0], &r_icvs, 3350 0 // argc 3351 USE_NESTED_HOT_ARG(NULL) // primary thread is unknown 3352 ); 3353 #if USE_DEBUGGER 3354 // Non-NULL value should be assigned to make the debugger display the root 3355 // team. 3356 TCW_SYNC_PTR(root_team->t.t_pkfn, (microtask_t)(~0)); 3357 #endif 3358 3359 KF_TRACE(10, ("__kmp_initialize_root: after root_team = %p\n", root_team)); 3360 3361 root->r.r_root_team = root_team; 3362 root_team->t.t_control_stack_top = NULL; 3363 3364 /* initialize root team */ 3365 root_team->t.t_threads[0] = NULL; 3366 root_team->t.t_nproc = 1; 3367 root_team->t.t_serialized = 1; 3368 // TODO???: root_team->t.t_max_active_levels = __kmp_dflt_max_active_levels; 3369 root_team->t.t_sched.sched = r_sched.sched; 3370 KA_TRACE( 3371 20, 3372 ("__kmp_initialize_root: init root team %d arrived: join=%u, plain=%u\n", 3373 root_team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE)); 3374 3375 /* setup the hot team for this task */ 3376 /* allocate the hot team structure */ 3377 KF_TRACE(10, ("__kmp_initialize_root: before hot_team\n")); 3378 3379 hot_team = 3380 __kmp_allocate_team(root, 3381 1, // new_nproc 3382 __kmp_dflt_team_nth_ub * 2, // max_nproc 3383 #if OMPT_SUPPORT 3384 ompt_data_none, // root parallel id 3385 #endif 3386 __kmp_nested_proc_bind.bind_types[0], &r_icvs, 3387 0 // argc 3388 USE_NESTED_HOT_ARG(NULL) // primary thread is unknown 3389 ); 3390 KF_TRACE(10, ("__kmp_initialize_root: after hot_team = %p\n", hot_team)); 3391 3392 root->r.r_hot_team = hot_team; 3393 root_team->t.t_control_stack_top = NULL; 3394 3395 /* first-time initialization */ 3396 hot_team->t.t_parent = root_team; 3397 3398 /* initialize hot team */ 3399 hot_team_max_nth = hot_team->t.t_max_nproc; 3400 for (f = 0; f < hot_team_max_nth; ++f) { 3401 hot_team->t.t_threads[f] = NULL; 3402 } 3403 hot_team->t.t_nproc = 1; 3404 // TODO???: hot_team->t.t_max_active_levels = __kmp_dflt_max_active_levels; 3405 hot_team->t.t_sched.sched = r_sched.sched; 3406 hot_team->t.t_size_changed = 0; 3407 } 3408 3409 #ifdef KMP_DEBUG 3410 3411 typedef struct kmp_team_list_item { 3412 kmp_team_p const *entry; 3413 struct kmp_team_list_item *next; 3414 } kmp_team_list_item_t; 3415 typedef kmp_team_list_item_t *kmp_team_list_t; 3416 3417 static void __kmp_print_structure_team_accum( // Add team 
to list of teams. 3418 kmp_team_list_t list, // List of teams. 3419 kmp_team_p const *team // Team to add. 3420 ) { 3421 3422 // List must terminate with item where both entry and next are NULL. 3423 // Team is added to the list only once. 3424 // List is sorted in ascending order by team id. 3425 // Team id is *not* a key. 3426 3427 kmp_team_list_t l; 3428 3429 KMP_DEBUG_ASSERT(list != NULL); 3430 if (team == NULL) { 3431 return; 3432 } 3433 3434 __kmp_print_structure_team_accum(list, team->t.t_parent); 3435 __kmp_print_structure_team_accum(list, team->t.t_next_pool); 3436 3437 // Search list for the team. 3438 l = list; 3439 while (l->next != NULL && l->entry != team) { 3440 l = l->next; 3441 } 3442 if (l->next != NULL) { 3443 return; // Team has been added before, exit. 3444 } 3445 3446 // Team is not found. Search list again for insertion point. 3447 l = list; 3448 while (l->next != NULL && l->entry->t.t_id <= team->t.t_id) { 3449 l = l->next; 3450 } 3451 3452 // Insert team. 3453 { 3454 kmp_team_list_item_t *item = (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC( 3455 sizeof(kmp_team_list_item_t)); 3456 *item = *l; 3457 l->entry = team; 3458 l->next = item; 3459 } 3460 } 3461 3462 static void __kmp_print_structure_team(char const *title, kmp_team_p const *team 3463 3464 ) { 3465 __kmp_printf("%s", title); 3466 if (team != NULL) { 3467 __kmp_printf("%2x %p\n", team->t.t_id, team); 3468 } else { 3469 __kmp_printf(" - (nil)\n"); 3470 } 3471 } 3472 3473 static void __kmp_print_structure_thread(char const *title, 3474 kmp_info_p const *thread) { 3475 __kmp_printf("%s", title); 3476 if (thread != NULL) { 3477 __kmp_printf("%2d %p\n", thread->th.th_info.ds.ds_gtid, thread); 3478 } else { 3479 __kmp_printf(" - (nil)\n"); 3480 } 3481 } 3482 3483 void __kmp_print_structure(void) { 3484 3485 kmp_team_list_t list; 3486 3487 // Initialize list of teams. 3488 list = 3489 (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC(sizeof(kmp_team_list_item_t)); 3490 list->entry = NULL; 3491 list->next = NULL; 3492 3493 __kmp_printf("\n------------------------------\nGlobal Thread " 3494 "Table\n------------------------------\n"); 3495 { 3496 int gtid; 3497 for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) { 3498 __kmp_printf("%2d", gtid); 3499 if (__kmp_threads != NULL) { 3500 __kmp_printf(" %p", __kmp_threads[gtid]); 3501 } 3502 if (__kmp_root != NULL) { 3503 __kmp_printf(" %p", __kmp_root[gtid]); 3504 } 3505 __kmp_printf("\n"); 3506 } 3507 } 3508 3509 // Print out __kmp_threads array. 
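// Each live entry is printed together with its root, current team, serial
// team, and pool link; the teams seen here are also accumulated into the
// sorted list so the Teams section below can walk each team exactly once.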
3510 __kmp_printf("\n------------------------------\nThreads\n--------------------" 3511 "----------\n"); 3512 if (__kmp_threads != NULL) { 3513 int gtid; 3514 for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) { 3515 kmp_info_t const *thread = __kmp_threads[gtid]; 3516 if (thread != NULL) { 3517 __kmp_printf("GTID %2d %p:\n", gtid, thread); 3518 __kmp_printf(" Our Root: %p\n", thread->th.th_root); 3519 __kmp_print_structure_team(" Our Team: ", thread->th.th_team); 3520 __kmp_print_structure_team(" Serial Team: ", 3521 thread->th.th_serial_team); 3522 __kmp_printf(" Threads: %2d\n", thread->th.th_team_nproc); 3523 __kmp_print_structure_thread(" Primary: ", 3524 thread->th.th_team_master); 3525 __kmp_printf(" Serialized?: %2d\n", thread->th.th_team_serialized); 3526 __kmp_printf(" Set NProc: %2d\n", thread->th.th_set_nproc); 3527 __kmp_printf(" Set Proc Bind: %2d\n", thread->th.th_set_proc_bind); 3528 __kmp_print_structure_thread(" Next in pool: ", 3529 thread->th.th_next_pool); 3530 __kmp_printf("\n"); 3531 __kmp_print_structure_team_accum(list, thread->th.th_team); 3532 __kmp_print_structure_team_accum(list, thread->th.th_serial_team); 3533 } 3534 } 3535 } else { 3536 __kmp_printf("Threads array is not allocated.\n"); 3537 } 3538 3539 // Print out __kmp_root array. 3540 __kmp_printf("\n------------------------------\nUbers\n----------------------" 3541 "--------\n"); 3542 if (__kmp_root != NULL) { 3543 int gtid; 3544 for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) { 3545 kmp_root_t const *root = __kmp_root[gtid]; 3546 if (root != NULL) { 3547 __kmp_printf("GTID %2d %p:\n", gtid, root); 3548 __kmp_print_structure_team(" Root Team: ", root->r.r_root_team); 3549 __kmp_print_structure_team(" Hot Team: ", root->r.r_hot_team); 3550 __kmp_print_structure_thread(" Uber Thread: ", 3551 root->r.r_uber_thread); 3552 __kmp_printf(" Active?: %2d\n", root->r.r_active); 3553 __kmp_printf(" In Parallel: %2d\n", 3554 KMP_ATOMIC_LD_RLX(&root->r.r_in_parallel)); 3555 __kmp_printf("\n"); 3556 __kmp_print_structure_team_accum(list, root->r.r_root_team); 3557 __kmp_print_structure_team_accum(list, root->r.r_hot_team); 3558 } 3559 } 3560 } else { 3561 __kmp_printf("Ubers array is not allocated.\n"); 3562 } 3563 3564 __kmp_printf("\n------------------------------\nTeams\n----------------------" 3565 "--------\n"); 3566 while (list->next != NULL) { 3567 kmp_team_p const *team = list->entry; 3568 int i; 3569 __kmp_printf("Team %2x %p:\n", team->t.t_id, team); 3570 __kmp_print_structure_team(" Parent Team: ", team->t.t_parent); 3571 __kmp_printf(" Primary TID: %2d\n", team->t.t_master_tid); 3572 __kmp_printf(" Max threads: %2d\n", team->t.t_max_nproc); 3573 __kmp_printf(" Levels of serial: %2d\n", team->t.t_serialized); 3574 __kmp_printf(" Number threads: %2d\n", team->t.t_nproc); 3575 for (i = 0; i < team->t.t_nproc; ++i) { 3576 __kmp_printf(" Thread %2d: ", i); 3577 __kmp_print_structure_thread("", team->t.t_threads[i]); 3578 } 3579 __kmp_print_structure_team(" Next in pool: ", team->t.t_next_pool); 3580 __kmp_printf("\n"); 3581 list = list->next; 3582 } 3583 3584 // Print out __kmp_thread_pool and __kmp_team_pool. 3585 __kmp_printf("\n------------------------------\nPools\n----------------------" 3586 "--------\n"); 3587 __kmp_print_structure_thread("Thread pool: ", 3588 CCAST(kmp_info_t *, __kmp_thread_pool)); 3589 __kmp_print_structure_team("Team pool: ", 3590 CCAST(kmp_team_t *, __kmp_team_pool)); 3591 __kmp_printf("\n"); 3592 3593 // Free team list. 
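  // The list is terminated by a sentinel item (entry == NULL, next == NULL),
  // so this loop releases the sentinel along with every accumulated entry.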
3594 while (list != NULL) { 3595 kmp_team_list_item_t *item = list; 3596 list = list->next; 3597 KMP_INTERNAL_FREE(item); 3598 } 3599 } 3600 3601 #endif 3602 3603 //--------------------------------------------------------------------------- 3604 // Stuff for per-thread fast random number generator 3605 // Table of primes 3606 static const unsigned __kmp_primes[] = { 3607 0x9e3779b1, 0xffe6cc59, 0x2109f6dd, 0x43977ab5, 0xba5703f5, 0xb495a877, 3608 0xe1626741, 0x79695e6b, 0xbc98c09f, 0xd5bee2b3, 0x287488f9, 0x3af18231, 3609 0x9677cd4d, 0xbe3a6929, 0xadc6a877, 0xdcf0674b, 0xbe4d6fe9, 0x5f15e201, 3610 0x99afc3fd, 0xf3f16801, 0xe222cfff, 0x24ba5fdb, 0x0620452d, 0x79f149e3, 3611 0xc8b93f49, 0x972702cd, 0xb07dd827, 0x6c97d5ed, 0x085a3d61, 0x46eb5ea7, 3612 0x3d9910ed, 0x2e687b5b, 0x29609227, 0x6eb081f1, 0x0954c4e1, 0x9d114db9, 3613 0x542acfa9, 0xb3e6bd7b, 0x0742d917, 0xe9f3ffa7, 0x54581edb, 0xf2480f45, 3614 0x0bb9288f, 0xef1affc7, 0x85fa0ca7, 0x3ccc14db, 0xe6baf34b, 0x343377f7, 3615 0x5ca19031, 0xe6d9293b, 0xf0a9f391, 0x5d2e980b, 0xfc411073, 0xc3749363, 3616 0xb892d829, 0x3549366b, 0x629750ad, 0xb98294e5, 0x892d9483, 0xc235baf3, 3617 0x3d2402a3, 0x6bdef3c9, 0xbec333cd, 0x40c9520f}; 3618 3619 //--------------------------------------------------------------------------- 3620 // __kmp_get_random: Get a random number using a linear congruential method. 3621 unsigned short __kmp_get_random(kmp_info_t *thread) { 3622 unsigned x = thread->th.th_x; 3623 unsigned short r = (unsigned short)(x >> 16); 3624 3625 thread->th.th_x = x * thread->th.th_a + 1; 3626 3627 KA_TRACE(30, ("__kmp_get_random: THREAD: %d, RETURN: %u\n", 3628 thread->th.th_info.ds.ds_tid, r)); 3629 3630 return r; 3631 } 3632 //-------------------------------------------------------- 3633 // __kmp_init_random: Initialize a random number generator 3634 void __kmp_init_random(kmp_info_t *thread) { 3635 unsigned seed = thread->th.th_info.ds.ds_tid; 3636 3637 thread->th.th_a = 3638 __kmp_primes[seed % (sizeof(__kmp_primes) / sizeof(__kmp_primes[0]))]; 3639 thread->th.th_x = (seed + 1) * thread->th.th_a + 1; 3640 KA_TRACE(30, 3641 ("__kmp_init_random: THREAD: %u; A: %u\n", seed, thread->th.th_a)); 3642 } 3643 3644 #if KMP_OS_WINDOWS 3645 /* reclaim array entries for root threads that are already dead, returns number 3646 * reclaimed */ 3647 static int __kmp_reclaim_dead_roots(void) { 3648 int i, r = 0; 3649 3650 for (i = 0; i < __kmp_threads_capacity; ++i) { 3651 if (KMP_UBER_GTID(i) && 3652 !__kmp_still_running((kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[i])) && 3653 !__kmp_root[i] 3654 ->r.r_active) { // AC: reclaim only roots died in non-active state 3655 r += __kmp_unregister_root_other_thread(i); 3656 } 3657 } 3658 return r; 3659 } 3660 #endif 3661 3662 /* This function attempts to create free entries in __kmp_threads and 3663 __kmp_root, and returns the number of free entries generated. 3664 3665 For Windows* OS static library, the first mechanism used is to reclaim array 3666 entries for root threads that are already dead. 3667 3668 On all platforms, expansion is attempted on the arrays __kmp_threads_ and 3669 __kmp_root, with appropriate update to __kmp_threads_capacity. Array 3670 capacity is increased by doubling with clipping to __kmp_tp_capacity, if 3671 threadprivate cache array has been created. Synchronization with 3672 __kmpc_threadprivate_cached is done using __kmp_tp_cached_lock. 
3673 3674 After any dead root reclamation, if the clipping value allows array expansion 3675 to result in the generation of a total of nNeed free slots, the function does 3676 that expansion. If not, nothing is done beyond the possible initial root 3677 thread reclamation. 3678 3679 If any argument is negative, the behavior is undefined. */ 3680 static int __kmp_expand_threads(int nNeed) { 3681 int added = 0; 3682 int minimumRequiredCapacity; 3683 int newCapacity; 3684 kmp_info_t **newThreads; 3685 kmp_root_t **newRoot; 3686 3687 // All calls to __kmp_expand_threads should be under __kmp_forkjoin_lock, so 3688 // resizing __kmp_threads does not need additional protection if foreign 3689 // threads are present 3690 3691 #if KMP_OS_WINDOWS && !KMP_DYNAMIC_LIB 3692 /* only for Windows static library */ 3693 /* reclaim array entries for root threads that are already dead */ 3694 added = __kmp_reclaim_dead_roots(); 3695 3696 if (nNeed) { 3697 nNeed -= added; 3698 if (nNeed < 0) 3699 nNeed = 0; 3700 } 3701 #endif 3702 if (nNeed <= 0) 3703 return added; 3704 3705 // Note that __kmp_threads_capacity is not bounded by __kmp_max_nth. If 3706 // __kmp_max_nth is set to some value less than __kmp_sys_max_nth by the 3707 // user via KMP_DEVICE_THREAD_LIMIT, then __kmp_threads_capacity may become 3708 // > __kmp_max_nth in one of two ways: 3709 // 3710 // 1) The initialization thread (gtid = 0) exits. __kmp_threads[0] 3711 // may not be reused by another thread, so we may need to increase 3712 // __kmp_threads_capacity to __kmp_max_nth + 1. 3713 // 3714 // 2) New foreign root(s) are encountered. We always register new foreign 3715 // roots. This may cause a smaller # of threads to be allocated at 3716 // subsequent parallel regions, but the worker threads hang around (and 3717 // eventually go to sleep) and need slots in the __kmp_threads[] array. 3718 // 3719 // Anyway, that is the reason for moving the check to see if 3720 // __kmp_max_nth was exceeded into __kmp_reserve_threads() 3721 // instead of having it performed here. -BB 3722 3723 KMP_DEBUG_ASSERT(__kmp_sys_max_nth >= __kmp_threads_capacity); 3724 3725 /* compute expansion headroom to check if we can expand */ 3726 if (__kmp_sys_max_nth - __kmp_threads_capacity < nNeed) { 3727 /* possible expansion too small -- give up */ 3728 return added; 3729 } 3730 minimumRequiredCapacity = __kmp_threads_capacity + nNeed; 3731 3732 newCapacity = __kmp_threads_capacity; 3733 do { 3734 newCapacity = newCapacity <= (__kmp_sys_max_nth >> 1) ? (newCapacity << 1) 3735 : __kmp_sys_max_nth; 3736 } while (newCapacity < minimumRequiredCapacity); 3737 newThreads = (kmp_info_t **)__kmp_allocate( 3738 (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * newCapacity + CACHE_LINE); 3739 newRoot = 3740 (kmp_root_t **)((char *)newThreads + sizeof(kmp_info_t *) * newCapacity); 3741 KMP_MEMCPY(newThreads, __kmp_threads, 3742 __kmp_threads_capacity * sizeof(kmp_info_t *)); 3743 KMP_MEMCPY(newRoot, __kmp_root, 3744 __kmp_threads_capacity * sizeof(kmp_root_t *)); 3745 // Put old __kmp_threads array on a list. Any ongoing references to the old 3746 // list will be valid. This list is cleaned up at library shutdown. 
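  // Threads may read __kmp_threads concurrently without holding
  // __kmp_forkjoin_lock (e.g. during gtid lookup), so the old array must stay
  // addressable rather than being freed here. As a worked example of the
  // growth policy above: with __kmp_threads_capacity == 32 and nNeed == 40,
  // minimumRequiredCapacity is 72 and the doubling loop produces
  // 32 -> 64 -> 128 (clipped to __kmp_sys_max_nth if that bound is smaller).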
3747 kmp_old_threads_list_t *node = 3748 (kmp_old_threads_list_t *)__kmp_allocate(sizeof(kmp_old_threads_list_t)); 3749 node->threads = __kmp_threads; 3750 node->next = __kmp_old_threads_list; 3751 __kmp_old_threads_list = node; 3752 3753 *(kmp_info_t * *volatile *)&__kmp_threads = newThreads; 3754 *(kmp_root_t * *volatile *)&__kmp_root = newRoot; 3755 added += newCapacity - __kmp_threads_capacity; 3756 *(volatile int *)&__kmp_threads_capacity = newCapacity; 3757 3758 if (newCapacity > __kmp_tp_capacity) { 3759 __kmp_acquire_bootstrap_lock(&__kmp_tp_cached_lock); 3760 if (__kmp_tp_cached && newCapacity > __kmp_tp_capacity) { 3761 __kmp_threadprivate_resize_cache(newCapacity); 3762 } else { // increase __kmp_tp_capacity to correspond with kmp_threads size 3763 *(volatile int *)&__kmp_tp_capacity = newCapacity; 3764 } 3765 __kmp_release_bootstrap_lock(&__kmp_tp_cached_lock); 3766 } 3767 3768 return added; 3769 } 3770 3771 /* Register the current thread as a root thread and obtain our gtid. We must 3772 have the __kmp_initz_lock held at this point. Argument TRUE only if are the 3773 thread that calls from __kmp_do_serial_initialize() */ 3774 int __kmp_register_root(int initial_thread) { 3775 kmp_info_t *root_thread; 3776 kmp_root_t *root; 3777 int gtid; 3778 int capacity; 3779 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock); 3780 KA_TRACE(20, ("__kmp_register_root: entered\n")); 3781 KMP_MB(); 3782 3783 /* 2007-03-02: 3784 If initial thread did not invoke OpenMP RTL yet, and this thread is not an 3785 initial one, "__kmp_all_nth >= __kmp_threads_capacity" condition does not 3786 work as expected -- it may return false (that means there is at least one 3787 empty slot in __kmp_threads array), but it is possible the only free slot 3788 is #0, which is reserved for initial thread and so cannot be used for this 3789 one. Following code workarounds this bug. 3790 3791 However, right solution seems to be not reserving slot #0 for initial 3792 thread because: 3793 (1) there is no magic in slot #0, 3794 (2) we cannot detect initial thread reliably (the first thread which does 3795 serial initialization may be not a real initial thread). 3796 */ 3797 capacity = __kmp_threads_capacity; 3798 if (!initial_thread && TCR_PTR(__kmp_threads[0]) == NULL) { 3799 --capacity; 3800 } 3801 3802 // If it is not for initializing the hidden helper team, we need to take 3803 // __kmp_hidden_helper_threads_num out of the capacity because it is included 3804 // in __kmp_threads_capacity. 3805 if (__kmp_enable_hidden_helper && !TCR_4(__kmp_init_hidden_helper_threads)) { 3806 capacity -= __kmp_hidden_helper_threads_num; 3807 } 3808 3809 /* see if there are too many threads */ 3810 if (__kmp_all_nth >= capacity && !__kmp_expand_threads(1)) { 3811 if (__kmp_tp_cached) { 3812 __kmp_fatal(KMP_MSG(CantRegisterNewThread), 3813 KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity), 3814 KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null); 3815 } else { 3816 __kmp_fatal(KMP_MSG(CantRegisterNewThread), KMP_HNT(SystemLimitOnThreads), 3817 __kmp_msg_null); 3818 } 3819 } 3820 3821 // When hidden helper task is enabled, __kmp_threads is organized as follows: 3822 // 0: initial thread, also a regular OpenMP thread. 3823 // [1, __kmp_hidden_helper_threads_num]: slots for hidden helper threads. 3824 // [__kmp_hidden_helper_threads_num + 1, __kmp_threads_capacity): slots for 3825 // regular OpenMP threads. 3826 if (TCR_4(__kmp_init_hidden_helper_threads)) { 3827 // Find an available thread slot for hidden helper thread. 
Slots for hidden 3828 // helper threads start from 1 to __kmp_hidden_helper_threads_num. 3829 for (gtid = 1; TCR_PTR(__kmp_threads[gtid]) != NULL && 3830 gtid <= __kmp_hidden_helper_threads_num; 3831 gtid++) 3832 ; 3833 KMP_ASSERT(gtid <= __kmp_hidden_helper_threads_num); 3834 KA_TRACE(1, ("__kmp_register_root: found slot in threads array for " 3835 "hidden helper thread: T#%d\n", 3836 gtid)); 3837 } else { 3838 /* find an available thread slot */ 3839 // Don't reassign the zero slot since we need that to only be used by 3840 // initial thread. Slots for hidden helper threads should also be skipped. 3841 if (initial_thread && TCR_PTR(__kmp_threads[0]) == NULL) { 3842 gtid = 0; 3843 } else { 3844 for (gtid = __kmp_hidden_helper_threads_num + 1; 3845 TCR_PTR(__kmp_threads[gtid]) != NULL; gtid++) 3846 ; 3847 } 3848 KA_TRACE( 3849 1, ("__kmp_register_root: found slot in threads array: T#%d\n", gtid)); 3850 KMP_ASSERT(gtid < __kmp_threads_capacity); 3851 } 3852 3853 /* update global accounting */ 3854 __kmp_all_nth++; 3855 TCW_4(__kmp_nth, __kmp_nth + 1); 3856 3857 // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search) for low 3858 // numbers of procs, and method #2 (keyed API call) for higher numbers. 3859 if (__kmp_adjust_gtid_mode) { 3860 if (__kmp_all_nth >= __kmp_tls_gtid_min) { 3861 if (TCR_4(__kmp_gtid_mode) != 2) { 3862 TCW_4(__kmp_gtid_mode, 2); 3863 } 3864 } else { 3865 if (TCR_4(__kmp_gtid_mode) != 1) { 3866 TCW_4(__kmp_gtid_mode, 1); 3867 } 3868 } 3869 } 3870 3871 #ifdef KMP_ADJUST_BLOCKTIME 3872 /* Adjust blocktime to zero if necessary */ 3873 /* Middle initialization might not have occurred yet */ 3874 if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) { 3875 if (__kmp_nth > __kmp_avail_proc) { 3876 __kmp_zero_bt = TRUE; 3877 } 3878 } 3879 #endif /* KMP_ADJUST_BLOCKTIME */ 3880 3881 /* setup this new hierarchy */ 3882 if (!(root = __kmp_root[gtid])) { 3883 root = __kmp_root[gtid] = (kmp_root_t *)__kmp_allocate(sizeof(kmp_root_t)); 3884 KMP_DEBUG_ASSERT(!root->r.r_root_team); 3885 } 3886 3887 #if KMP_STATS_ENABLED 3888 // Initialize stats as soon as possible (right after gtid assignment). 
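  // A freshly registered root starts out in a serial region, so its thread
  // state and partitioned timers begin as SERIAL_REGION / OMP_serial.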
3889 __kmp_stats_thread_ptr = __kmp_stats_list->push_back(gtid); 3890 __kmp_stats_thread_ptr->startLife(); 3891 KMP_SET_THREAD_STATE(SERIAL_REGION); 3892 KMP_INIT_PARTITIONED_TIMERS(OMP_serial); 3893 #endif 3894 __kmp_initialize_root(root); 3895 3896 /* setup new root thread structure */ 3897 if (root->r.r_uber_thread) { 3898 root_thread = root->r.r_uber_thread; 3899 } else { 3900 root_thread = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t)); 3901 if (__kmp_storage_map) { 3902 __kmp_print_thread_storage_map(root_thread, gtid); 3903 } 3904 root_thread->th.th_info.ds.ds_gtid = gtid; 3905 #if OMPT_SUPPORT 3906 root_thread->th.ompt_thread_info.thread_data = ompt_data_none; 3907 #endif 3908 root_thread->th.th_root = root; 3909 if (__kmp_env_consistency_check) { 3910 root_thread->th.th_cons = __kmp_allocate_cons_stack(gtid); 3911 } 3912 #if USE_FAST_MEMORY 3913 __kmp_initialize_fast_memory(root_thread); 3914 #endif /* USE_FAST_MEMORY */ 3915 3916 #if KMP_USE_BGET 3917 KMP_DEBUG_ASSERT(root_thread->th.th_local.bget_data == NULL); 3918 __kmp_initialize_bget(root_thread); 3919 #endif 3920 __kmp_init_random(root_thread); // Initialize random number generator 3921 } 3922 3923 /* setup the serial team held in reserve by the root thread */ 3924 if (!root_thread->th.th_serial_team) { 3925 kmp_internal_control_t r_icvs = __kmp_get_global_icvs(); 3926 KF_TRACE(10, ("__kmp_register_root: before serial_team\n")); 3927 root_thread->th.th_serial_team = __kmp_allocate_team( 3928 root, 1, 1, 3929 #if OMPT_SUPPORT 3930 ompt_data_none, // root parallel id 3931 #endif 3932 proc_bind_default, &r_icvs, 0 USE_NESTED_HOT_ARG(NULL)); 3933 } 3934 KMP_ASSERT(root_thread->th.th_serial_team); 3935 KF_TRACE(10, ("__kmp_register_root: after serial_team = %p\n", 3936 root_thread->th.th_serial_team)); 3937 3938 /* drop root_thread into place */ 3939 TCW_SYNC_PTR(__kmp_threads[gtid], root_thread); 3940 3941 root->r.r_root_team->t.t_threads[0] = root_thread; 3942 root->r.r_hot_team->t.t_threads[0] = root_thread; 3943 root_thread->th.th_serial_team->t.t_threads[0] = root_thread; 3944 // AC: the team created in reserve, not for execution (it is unused for now). 3945 root_thread->th.th_serial_team->t.t_serialized = 0; 3946 root->r.r_uber_thread = root_thread; 3947 3948 /* initialize the thread, get it ready to go */ 3949 __kmp_initialize_info(root_thread, root->r.r_root_team, 0, gtid); 3950 TCW_4(__kmp_init_gtid, TRUE); 3951 3952 /* prepare the primary thread for get_gtid() */ 3953 __kmp_gtid_set_specific(gtid); 3954 3955 #if USE_ITT_BUILD 3956 __kmp_itt_thread_name(gtid); 3957 #endif /* USE_ITT_BUILD */ 3958 3959 #ifdef KMP_TDATA_GTID 3960 __kmp_gtid = gtid; 3961 #endif 3962 __kmp_create_worker(gtid, root_thread, __kmp_stksize); 3963 KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() == gtid); 3964 3965 KA_TRACE(20, ("__kmp_register_root: T#%d init T#%d(%d:%d) arrived: join=%u, " 3966 "plain=%u\n", 3967 gtid, __kmp_gtid_from_tid(0, root->r.r_hot_team), 3968 root->r.r_hot_team->t.t_id, 0, KMP_INIT_BARRIER_STATE, 3969 KMP_INIT_BARRIER_STATE)); 3970 { // Initialize barrier data. 
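    // Every barrier type starts from KMP_INIT_BARRIER_STATE so the first
    // fork/join barrier this root participates in sees a consistent
    // b_arrived count (checked by the assertion below).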
3971 int b; 3972 for (b = 0; b < bs_last_barrier; ++b) { 3973 root_thread->th.th_bar[b].bb.b_arrived = KMP_INIT_BARRIER_STATE; 3974 #if USE_DEBUGGER 3975 root_thread->th.th_bar[b].bb.b_worker_arrived = 0; 3976 #endif 3977 } 3978 } 3979 KMP_DEBUG_ASSERT(root->r.r_hot_team->t.t_bar[bs_forkjoin_barrier].b_arrived == 3980 KMP_INIT_BARRIER_STATE); 3981 3982 #if KMP_AFFINITY_SUPPORTED 3983 root_thread->th.th_current_place = KMP_PLACE_UNDEFINED; 3984 root_thread->th.th_new_place = KMP_PLACE_UNDEFINED; 3985 root_thread->th.th_first_place = KMP_PLACE_UNDEFINED; 3986 root_thread->th.th_last_place = KMP_PLACE_UNDEFINED; 3987 #endif /* KMP_AFFINITY_SUPPORTED */ 3988 root_thread->th.th_def_allocator = __kmp_def_allocator; 3989 root_thread->th.th_prev_level = 0; 3990 root_thread->th.th_prev_num_threads = 1; 3991 3992 kmp_cg_root_t *tmp = (kmp_cg_root_t *)__kmp_allocate(sizeof(kmp_cg_root_t)); 3993 tmp->cg_root = root_thread; 3994 tmp->cg_thread_limit = __kmp_cg_max_nth; 3995 tmp->cg_nthreads = 1; 3996 KA_TRACE(100, ("__kmp_register_root: Thread %p created node %p with" 3997 " cg_nthreads init to 1\n", 3998 root_thread, tmp)); 3999 tmp->up = NULL; 4000 root_thread->th.th_cg_roots = tmp; 4001 4002 __kmp_root_counter++; 4003 4004 #if OMPT_SUPPORT 4005 if (!initial_thread && ompt_enabled.enabled) { 4006 4007 kmp_info_t *root_thread = ompt_get_thread(); 4008 4009 ompt_set_thread_state(root_thread, ompt_state_overhead); 4010 4011 if (ompt_enabled.ompt_callback_thread_begin) { 4012 ompt_callbacks.ompt_callback(ompt_callback_thread_begin)( 4013 ompt_thread_initial, __ompt_get_thread_data_internal()); 4014 } 4015 ompt_data_t *task_data; 4016 ompt_data_t *parallel_data; 4017 __ompt_get_task_info_internal(0, NULL, &task_data, NULL, ¶llel_data, 4018 NULL); 4019 if (ompt_enabled.ompt_callback_implicit_task) { 4020 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 4021 ompt_scope_begin, parallel_data, task_data, 1, 1, ompt_task_initial); 4022 } 4023 4024 ompt_set_thread_state(root_thread, ompt_state_work_serial); 4025 } 4026 #endif 4027 #if OMPD_SUPPORT 4028 if (ompd_state & OMPD_ENABLE_BP) 4029 ompd_bp_thread_begin(); 4030 #endif 4031 4032 KMP_MB(); 4033 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); 4034 4035 return gtid; 4036 } 4037 4038 #if KMP_NESTED_HOT_TEAMS 4039 static int __kmp_free_hot_teams(kmp_root_t *root, kmp_info_t *thr, int level, 4040 const int max_level) { 4041 int i, n, nth; 4042 kmp_hot_team_ptr_t *hot_teams = thr->th.th_hot_teams; 4043 if (!hot_teams || !hot_teams[level].hot_team) { 4044 return 0; 4045 } 4046 KMP_DEBUG_ASSERT(level < max_level); 4047 kmp_team_t *team = hot_teams[level].hot_team; 4048 nth = hot_teams[level].hot_team_nth; 4049 n = nth - 1; // primary thread is not freed 4050 if (level < max_level - 1) { 4051 for (i = 0; i < nth; ++i) { 4052 kmp_info_t *th = team->t.t_threads[i]; 4053 n += __kmp_free_hot_teams(root, th, level + 1, max_level); 4054 if (i > 0 && th->th.th_hot_teams) { 4055 __kmp_free(th->th.th_hot_teams); 4056 th->th.th_hot_teams = NULL; 4057 } 4058 } 4059 } 4060 __kmp_free_team(root, team, NULL); 4061 return n; 4062 } 4063 #endif 4064 4065 // Resets a root thread and clear its root and hot teams. 4066 // Returns the number of __kmp_threads entries directly and indirectly freed. 
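// The teardown order matters: the root team and the hot team (plus any nested
// hot teams) are released first, then task teams are waited on so no worker is
// still stealing tasks from a team rooted here, and only then is the uber
// thread itself reaped.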
4067 static int __kmp_reset_root(int gtid, kmp_root_t *root) { 4068 kmp_team_t *root_team = root->r.r_root_team; 4069 kmp_team_t *hot_team = root->r.r_hot_team; 4070 int n = hot_team->t.t_nproc; 4071 int i; 4072 4073 KMP_DEBUG_ASSERT(!root->r.r_active); 4074 4075 root->r.r_root_team = NULL; 4076 root->r.r_hot_team = NULL; 4077 // __kmp_free_team() does not free hot teams, so we have to clear r_hot_team 4078 // before call to __kmp_free_team(). 4079 __kmp_free_team(root, root_team USE_NESTED_HOT_ARG(NULL)); 4080 #if KMP_NESTED_HOT_TEAMS 4081 if (__kmp_hot_teams_max_level > 4082 0) { // need to free nested hot teams and their threads if any 4083 for (i = 0; i < hot_team->t.t_nproc; ++i) { 4084 kmp_info_t *th = hot_team->t.t_threads[i]; 4085 if (__kmp_hot_teams_max_level > 1) { 4086 n += __kmp_free_hot_teams(root, th, 1, __kmp_hot_teams_max_level); 4087 } 4088 if (th->th.th_hot_teams) { 4089 __kmp_free(th->th.th_hot_teams); 4090 th->th.th_hot_teams = NULL; 4091 } 4092 } 4093 } 4094 #endif 4095 __kmp_free_team(root, hot_team USE_NESTED_HOT_ARG(NULL)); 4096 4097 // Before we can reap the thread, we need to make certain that all other 4098 // threads in the teams that had this root as ancestor have stopped trying to 4099 // steal tasks. 4100 if (__kmp_tasking_mode != tskm_immediate_exec) { 4101 __kmp_wait_to_unref_task_teams(); 4102 } 4103 4104 #if KMP_OS_WINDOWS 4105 /* Close Handle of root duplicated in __kmp_create_worker (tr #62919) */ 4106 KA_TRACE( 4107 10, ("__kmp_reset_root: free handle, th = %p, handle = %" KMP_UINTPTR_SPEC 4108 "\n", 4109 (LPVOID) & (root->r.r_uber_thread->th), 4110 root->r.r_uber_thread->th.th_info.ds.ds_thread)); 4111 __kmp_free_handle(root->r.r_uber_thread->th.th_info.ds.ds_thread); 4112 #endif /* KMP_OS_WINDOWS */ 4113 4114 #if OMPD_SUPPORT 4115 if (ompd_state & OMPD_ENABLE_BP) 4116 ompd_bp_thread_end(); 4117 #endif 4118 4119 #if OMPT_SUPPORT 4120 ompt_data_t *task_data; 4121 ompt_data_t *parallel_data; 4122 __ompt_get_task_info_internal(0, NULL, &task_data, NULL, ¶llel_data, 4123 NULL); 4124 if (ompt_enabled.ompt_callback_implicit_task) { 4125 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 4126 ompt_scope_end, parallel_data, task_data, 0, 1, ompt_task_initial); 4127 } 4128 if (ompt_enabled.ompt_callback_thread_end) { 4129 ompt_callbacks.ompt_callback(ompt_callback_thread_end)( 4130 &(root->r.r_uber_thread->th.ompt_thread_info.thread_data)); 4131 } 4132 #endif 4133 4134 TCW_4(__kmp_nth, 4135 __kmp_nth - 1); // __kmp_reap_thread will decrement __kmp_all_nth. 4136 i = root->r.r_uber_thread->th.th_cg_roots->cg_nthreads--; 4137 KA_TRACE(100, ("__kmp_reset_root: Thread %p decrement cg_nthreads on node %p" 4138 " to %d\n", 4139 root->r.r_uber_thread, root->r.r_uber_thread->th.th_cg_roots, 4140 root->r.r_uber_thread->th.th_cg_roots->cg_nthreads)); 4141 if (i == 1) { 4142 // need to free contention group structure 4143 KMP_DEBUG_ASSERT(root->r.r_uber_thread == 4144 root->r.r_uber_thread->th.th_cg_roots->cg_root); 4145 KMP_DEBUG_ASSERT(root->r.r_uber_thread->th.th_cg_roots->up == NULL); 4146 __kmp_free(root->r.r_uber_thread->th.th_cg_roots); 4147 root->r.r_uber_thread->th.th_cg_roots = NULL; 4148 } 4149 __kmp_reap_thread(root->r.r_uber_thread, 1); 4150 4151 // We canot put root thread to __kmp_thread_pool, so we have to reap it 4152 // instead of freeing. 
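  // (The uber thread is a user/foreign thread the runtime did not create, so
  // it cannot be recycled as a worker; only its runtime-side resources are
  // released.)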
4153 root->r.r_uber_thread = NULL; 4154 /* mark root as no longer in use */ 4155 root->r.r_begin = FALSE; 4156 4157 return n; 4158 } 4159 4160 void __kmp_unregister_root_current_thread(int gtid) { 4161 KA_TRACE(1, ("__kmp_unregister_root_current_thread: enter T#%d\n", gtid)); 4162 /* this lock should be ok, since unregister_root_current_thread is never 4163 called during an abort, only during a normal close. furthermore, if you 4164 have the forkjoin lock, you should never try to get the initz lock */ 4165 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock); 4166 if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) { 4167 KC_TRACE(10, ("__kmp_unregister_root_current_thread: already finished, " 4168 "exiting T#%d\n", 4169 gtid)); 4170 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); 4171 return; 4172 } 4173 kmp_root_t *root = __kmp_root[gtid]; 4174 4175 KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]); 4176 KMP_ASSERT(KMP_UBER_GTID(gtid)); 4177 KMP_ASSERT(root == __kmp_threads[gtid]->th.th_root); 4178 KMP_ASSERT(root->r.r_active == FALSE); 4179 4180 KMP_MB(); 4181 4182 kmp_info_t *thread = __kmp_threads[gtid]; 4183 kmp_team_t *team = thread->th.th_team; 4184 kmp_task_team_t *task_team = thread->th.th_task_team; 4185 4186 // we need to wait for the proxy tasks before finishing the thread 4187 if (task_team != NULL && (task_team->tt.tt_found_proxy_tasks || 4188 task_team->tt.tt_hidden_helper_task_encountered)) { 4189 #if OMPT_SUPPORT 4190 // the runtime is shutting down so we won't report any events 4191 thread->th.ompt_thread_info.state = ompt_state_undefined; 4192 #endif 4193 __kmp_task_team_wait(thread, team USE_ITT_BUILD_ARG(NULL)); 4194 } 4195 4196 __kmp_reset_root(gtid, root); 4197 4198 KMP_MB(); 4199 KC_TRACE(10, 4200 ("__kmp_unregister_root_current_thread: T#%d unregistered\n", gtid)); 4201 4202 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); 4203 } 4204 4205 #if KMP_OS_WINDOWS 4206 /* __kmp_forkjoin_lock must be already held 4207 Unregisters a root thread that is not the current thread. Returns the number 4208 of __kmp_threads entries freed as a result. 
*/ 4209 static int __kmp_unregister_root_other_thread(int gtid) { 4210 kmp_root_t *root = __kmp_root[gtid]; 4211 int r; 4212 4213 KA_TRACE(1, ("__kmp_unregister_root_other_thread: enter T#%d\n", gtid)); 4214 KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]); 4215 KMP_ASSERT(KMP_UBER_GTID(gtid)); 4216 KMP_ASSERT(root == __kmp_threads[gtid]->th.th_root); 4217 KMP_ASSERT(root->r.r_active == FALSE); 4218 4219 r = __kmp_reset_root(gtid, root); 4220 KC_TRACE(10, 4221 ("__kmp_unregister_root_other_thread: T#%d unregistered\n", gtid)); 4222 return r; 4223 } 4224 #endif 4225 4226 #if KMP_DEBUG 4227 void __kmp_task_info() { 4228 4229 kmp_int32 gtid = __kmp_entry_gtid(); 4230 kmp_int32 tid = __kmp_tid_from_gtid(gtid); 4231 kmp_info_t *this_thr = __kmp_threads[gtid]; 4232 kmp_team_t *steam = this_thr->th.th_serial_team; 4233 kmp_team_t *team = this_thr->th.th_team; 4234 4235 __kmp_printf( 4236 "__kmp_task_info: gtid=%d tid=%d t_thread=%p team=%p steam=%p curtask=%p " 4237 "ptask=%p\n", 4238 gtid, tid, this_thr, team, steam, this_thr->th.th_current_task, 4239 team->t.t_implicit_task_taskdata[tid].td_parent); 4240 } 4241 #endif // KMP_DEBUG 4242 4243 /* TODO optimize with one big memclr, take out what isn't needed, split 4244 responsibility to workers as much as possible, and delay initialization of 4245 features as much as possible */ 4246 static void __kmp_initialize_info(kmp_info_t *this_thr, kmp_team_t *team, 4247 int tid, int gtid) { 4248 /* this_thr->th.th_info.ds.ds_gtid is setup in 4249 kmp_allocate_thread/create_worker. 4250 this_thr->th.th_serial_team is setup in __kmp_allocate_thread */ 4251 KMP_DEBUG_ASSERT(this_thr != NULL); 4252 KMP_DEBUG_ASSERT(this_thr->th.th_serial_team); 4253 KMP_DEBUG_ASSERT(team); 4254 KMP_DEBUG_ASSERT(team->t.t_threads); 4255 KMP_DEBUG_ASSERT(team->t.t_dispatch); 4256 kmp_info_t *master = team->t.t_threads[0]; 4257 KMP_DEBUG_ASSERT(master); 4258 KMP_DEBUG_ASSERT(master->th.th_root); 4259 4260 KMP_MB(); 4261 4262 TCW_SYNC_PTR(this_thr->th.th_team, team); 4263 4264 this_thr->th.th_info.ds.ds_tid = tid; 4265 this_thr->th.th_set_nproc = 0; 4266 if (__kmp_tasking_mode != tskm_immediate_exec) 4267 // When tasking is possible, threads are not safe to reap until they are 4268 // done tasking; this will be set when tasking code is exited in wait 4269 this_thr->th.th_reap_state = KMP_NOT_SAFE_TO_REAP; 4270 else // no tasking --> always safe to reap 4271 this_thr->th.th_reap_state = KMP_SAFE_TO_REAP; 4272 this_thr->th.th_set_proc_bind = proc_bind_default; 4273 #if KMP_AFFINITY_SUPPORTED 4274 this_thr->th.th_new_place = this_thr->th.th_current_place; 4275 #endif 4276 this_thr->th.th_root = master->th.th_root; 4277 4278 /* setup the thread's cache of the team structure */ 4279 this_thr->th.th_team_nproc = team->t.t_nproc; 4280 this_thr->th.th_team_master = master; 4281 this_thr->th.th_team_serialized = team->t.t_serialized; 4282 4283 KMP_DEBUG_ASSERT(team->t.t_implicit_task_taskdata); 4284 4285 KF_TRACE(10, ("__kmp_initialize_info1: T#%d:%d this_thread=%p curtask=%p\n", 4286 tid, gtid, this_thr, this_thr->th.th_current_task)); 4287 4288 __kmp_init_implicit_task(this_thr->th.th_team_master->th.th_ident, this_thr, 4289 team, tid, TRUE); 4290 4291 KF_TRACE(10, ("__kmp_initialize_info2: T#%d:%d this_thread=%p curtask=%p\n", 4292 tid, gtid, this_thr, this_thr->th.th_current_task)); 4293 // TODO: Initialize ICVs from parent; GEH - isn't that already done in 4294 // __kmp_initialize_team()? 
4295 4296 /* TODO no worksharing in speculative threads */ 4297 this_thr->th.th_dispatch = &team->t.t_dispatch[tid]; 4298 4299 this_thr->th.th_local.this_construct = 0; 4300 4301 if (!this_thr->th.th_pri_common) { 4302 this_thr->th.th_pri_common = 4303 (struct common_table *)__kmp_allocate(sizeof(struct common_table)); 4304 if (__kmp_storage_map) { 4305 __kmp_print_storage_map_gtid( 4306 gtid, this_thr->th.th_pri_common, this_thr->th.th_pri_common + 1, 4307 sizeof(struct common_table), "th_%d.th_pri_common\n", gtid); 4308 } 4309 this_thr->th.th_pri_head = NULL; 4310 } 4311 4312 if (this_thr != master && // Primary thread's CG root is initialized elsewhere 4313 this_thr->th.th_cg_roots != master->th.th_cg_roots) { // CG root not set 4314 // Make new thread's CG root same as primary thread's 4315 KMP_DEBUG_ASSERT(master->th.th_cg_roots); 4316 kmp_cg_root_t *tmp = this_thr->th.th_cg_roots; 4317 if (tmp) { 4318 // worker changes CG, need to check if old CG should be freed 4319 int i = tmp->cg_nthreads--; 4320 KA_TRACE(100, ("__kmp_initialize_info: Thread %p decrement cg_nthreads" 4321 " on node %p of thread %p to %d\n", 4322 this_thr, tmp, tmp->cg_root, tmp->cg_nthreads)); 4323 if (i == 1) { 4324 __kmp_free(tmp); // last thread left CG --> free it 4325 } 4326 } 4327 this_thr->th.th_cg_roots = master->th.th_cg_roots; 4328 // Increment new thread's CG root's counter to add the new thread 4329 this_thr->th.th_cg_roots->cg_nthreads++; 4330 KA_TRACE(100, ("__kmp_initialize_info: Thread %p increment cg_nthreads on" 4331 " node %p of thread %p to %d\n", 4332 this_thr, this_thr->th.th_cg_roots, 4333 this_thr->th.th_cg_roots->cg_root, 4334 this_thr->th.th_cg_roots->cg_nthreads)); 4335 this_thr->th.th_current_task->td_icvs.thread_limit = 4336 this_thr->th.th_cg_roots->cg_thread_limit; 4337 } 4338 4339 /* Initialize dynamic dispatch */ 4340 { 4341 volatile kmp_disp_t *dispatch = this_thr->th.th_dispatch; 4342 // Use team max_nproc since this will never change for the team. 4343 size_t disp_size = 4344 sizeof(dispatch_private_info_t) * 4345 (team->t.t_max_nproc == 1 ? 1 : __kmp_dispatch_num_buffers); 4346 KD_TRACE(10, ("__kmp_initialize_info: T#%d max_nproc: %d\n", gtid, 4347 team->t.t_max_nproc)); 4348 KMP_ASSERT(dispatch); 4349 KMP_DEBUG_ASSERT(team->t.t_dispatch); 4350 KMP_DEBUG_ASSERT(dispatch == &team->t.t_dispatch[tid]); 4351 4352 dispatch->th_disp_index = 0; 4353 dispatch->th_doacross_buf_idx = 0; 4354 if (!dispatch->th_disp_buffer) { 4355 dispatch->th_disp_buffer = 4356 (dispatch_private_info_t *)__kmp_allocate(disp_size); 4357 4358 if (__kmp_storage_map) { 4359 __kmp_print_storage_map_gtid( 4360 gtid, &dispatch->th_disp_buffer[0], 4361 &dispatch->th_disp_buffer[team->t.t_max_nproc == 1 4362 ? 
1 4363 : __kmp_dispatch_num_buffers], 4364 disp_size, 4365 "th_%d.th_dispatch.th_disp_buffer " 4366 "(team_%d.t_dispatch[%d].th_disp_buffer)", 4367 gtid, team->t.t_id, gtid); 4368 } 4369 } else { 4370 memset(&dispatch->th_disp_buffer[0], '\0', disp_size); 4371 } 4372 4373 dispatch->th_dispatch_pr_current = 0; 4374 dispatch->th_dispatch_sh_current = 0; 4375 4376 dispatch->th_deo_fcn = 0; /* ORDERED */ 4377 dispatch->th_dxo_fcn = 0; /* END ORDERED */ 4378 } 4379 4380 this_thr->th.th_next_pool = NULL; 4381 4382 if (!this_thr->th.th_task_state_memo_stack) { 4383 size_t i; 4384 this_thr->th.th_task_state_memo_stack = 4385 (kmp_uint8 *)__kmp_allocate(4 * sizeof(kmp_uint8)); 4386 this_thr->th.th_task_state_top = 0; 4387 this_thr->th.th_task_state_stack_sz = 4; 4388 for (i = 0; i < this_thr->th.th_task_state_stack_sz; 4389 ++i) // zero init the stack 4390 this_thr->th.th_task_state_memo_stack[i] = 0; 4391 } 4392 4393 KMP_DEBUG_ASSERT(!this_thr->th.th_spin_here); 4394 KMP_DEBUG_ASSERT(this_thr->th.th_next_waiting == 0); 4395 4396 KMP_MB(); 4397 } 4398 4399 /* allocate a new thread for the requesting team. this is only called from 4400 within a forkjoin critical section. we will first try to get an available 4401 thread from the thread pool. if none is available, we will fork a new one 4402 assuming we are able to create a new one. this should be assured, as the 4403 caller should check on this first. */ 4404 kmp_info_t *__kmp_allocate_thread(kmp_root_t *root, kmp_team_t *team, 4405 int new_tid) { 4406 kmp_team_t *serial_team; 4407 kmp_info_t *new_thr; 4408 int new_gtid; 4409 4410 KA_TRACE(20, ("__kmp_allocate_thread: T#%d\n", __kmp_get_gtid())); 4411 KMP_DEBUG_ASSERT(root && team); 4412 #if !KMP_NESTED_HOT_TEAMS 4413 KMP_DEBUG_ASSERT(KMP_MASTER_GTID(__kmp_get_gtid())); 4414 #endif 4415 KMP_MB(); 4416 4417 /* first, try to get one from the thread pool */ 4418 if (__kmp_thread_pool) { 4419 new_thr = CCAST(kmp_info_t *, __kmp_thread_pool); 4420 __kmp_thread_pool = (volatile kmp_info_t *)new_thr->th.th_next_pool; 4421 if (new_thr == __kmp_thread_pool_insert_pt) { 4422 __kmp_thread_pool_insert_pt = NULL; 4423 } 4424 TCW_4(new_thr->th.th_in_pool, FALSE); 4425 __kmp_suspend_initialize_thread(new_thr); 4426 __kmp_lock_suspend_mx(new_thr); 4427 if (new_thr->th.th_active_in_pool == TRUE) { 4428 KMP_DEBUG_ASSERT(new_thr->th.th_active == TRUE); 4429 KMP_ATOMIC_DEC(&__kmp_thread_pool_active_nth); 4430 new_thr->th.th_active_in_pool = FALSE; 4431 } 4432 __kmp_unlock_suspend_mx(new_thr); 4433 4434 KA_TRACE(20, ("__kmp_allocate_thread: T#%d using thread T#%d\n", 4435 __kmp_get_gtid(), new_thr->th.th_info.ds.ds_gtid)); 4436 KMP_ASSERT(!new_thr->th.th_team); 4437 KMP_DEBUG_ASSERT(__kmp_nth < __kmp_threads_capacity); 4438 4439 /* setup the thread structure */ 4440 __kmp_initialize_info(new_thr, team, new_tid, 4441 new_thr->th.th_info.ds.ds_gtid); 4442 KMP_DEBUG_ASSERT(new_thr->th.th_serial_team); 4443 4444 TCW_4(__kmp_nth, __kmp_nth + 1); 4445 4446 new_thr->th.th_task_state = 0; 4447 new_thr->th.th_task_state_top = 0; 4448 new_thr->th.th_task_state_stack_sz = 4; 4449 4450 if (__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) { 4451 // Make sure pool thread has transitioned to waiting on own thread struct 4452 KMP_DEBUG_ASSERT(new_thr->th.th_used_in_team.load() == 0); 4453 // Thread activated in __kmp_allocate_team when increasing team size 4454 } 4455 4456 #ifdef KMP_ADJUST_BLOCKTIME 4457 /* Adjust blocktime back to zero if necessary */ 4458 /* Middle initialization might not have occurred yet */ 4459 
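    /* Once there are more OpenMP threads than available processors, spinning
       at barriers only oversubscribes the machine, so the effective blocktime
       is forced to zero unless the user set it explicitly. */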
if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) { 4460 if (__kmp_nth > __kmp_avail_proc) { 4461 __kmp_zero_bt = TRUE; 4462 } 4463 } 4464 #endif /* KMP_ADJUST_BLOCKTIME */ 4465 4466 #if KMP_DEBUG 4467 // If thread entered pool via __kmp_free_thread, wait_flag should != 4468 // KMP_BARRIER_PARENT_FLAG. 4469 int b; 4470 kmp_balign_t *balign = new_thr->th.th_bar; 4471 for (b = 0; b < bs_last_barrier; ++b) 4472 KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG); 4473 #endif 4474 4475 KF_TRACE(10, ("__kmp_allocate_thread: T#%d using thread %p T#%d\n", 4476 __kmp_get_gtid(), new_thr, new_thr->th.th_info.ds.ds_gtid)); 4477 4478 KMP_MB(); 4479 return new_thr; 4480 } 4481 4482 /* no, well fork a new one */ 4483 KMP_ASSERT(__kmp_nth == __kmp_all_nth); 4484 KMP_ASSERT(__kmp_all_nth < __kmp_threads_capacity); 4485 4486 #if KMP_USE_MONITOR 4487 // If this is the first worker thread the RTL is creating, then also 4488 // launch the monitor thread. We try to do this as early as possible. 4489 if (!TCR_4(__kmp_init_monitor)) { 4490 __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock); 4491 if (!TCR_4(__kmp_init_monitor)) { 4492 KF_TRACE(10, ("before __kmp_create_monitor\n")); 4493 TCW_4(__kmp_init_monitor, 1); 4494 __kmp_create_monitor(&__kmp_monitor); 4495 KF_TRACE(10, ("after __kmp_create_monitor\n")); 4496 #if KMP_OS_WINDOWS 4497 // AC: wait until monitor has started. This is a fix for CQ232808. 4498 // The reason is that if the library is loaded/unloaded in a loop with 4499 // small (parallel) work in between, then there is high probability that 4500 // monitor thread started after the library shutdown. At shutdown it is 4501 // too late to cope with the problem, because when the primary thread is 4502 // in DllMain (process detach) the monitor has no chances to start (it is 4503 // blocked), and primary thread has no means to inform the monitor that 4504 // the library has gone, because all the memory which the monitor can 4505 // access is going to be released/reset. 4506 while (TCR_4(__kmp_init_monitor) < 2) { 4507 KMP_YIELD(TRUE); 4508 } 4509 KF_TRACE(10, ("after monitor thread has started\n")); 4510 #endif 4511 } 4512 __kmp_release_bootstrap_lock(&__kmp_monitor_lock); 4513 } 4514 #endif 4515 4516 KMP_MB(); 4517 4518 { 4519 int new_start_gtid = TCR_4(__kmp_init_hidden_helper_threads) 4520 ? 1 4521 : __kmp_hidden_helper_threads_num + 1; 4522 4523 for (new_gtid = new_start_gtid; TCR_PTR(__kmp_threads[new_gtid]) != NULL; 4524 ++new_gtid) { 4525 KMP_DEBUG_ASSERT(new_gtid < __kmp_threads_capacity); 4526 } 4527 4528 if (TCR_4(__kmp_init_hidden_helper_threads)) { 4529 KMP_DEBUG_ASSERT(new_gtid <= __kmp_hidden_helper_threads_num); 4530 } 4531 } 4532 4533 /* allocate space for it. 
*/ 4534 new_thr = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t)); 4535 4536 TCW_SYNC_PTR(__kmp_threads[new_gtid], new_thr); 4537 4538 #if USE_ITT_BUILD && USE_ITT_NOTIFY && KMP_DEBUG 4539 // suppress race conditions detection on synchronization flags in debug mode 4540 // this helps to analyze library internals eliminating false positives 4541 __itt_suppress_mark_range( 4542 __itt_suppress_range, __itt_suppress_threading_errors, 4543 &new_thr->th.th_sleep_loc, sizeof(new_thr->th.th_sleep_loc)); 4544 __itt_suppress_mark_range( 4545 __itt_suppress_range, __itt_suppress_threading_errors, 4546 &new_thr->th.th_reap_state, sizeof(new_thr->th.th_reap_state)); 4547 #if KMP_OS_WINDOWS 4548 __itt_suppress_mark_range( 4549 __itt_suppress_range, __itt_suppress_threading_errors, 4550 &new_thr->th.th_suspend_init, sizeof(new_thr->th.th_suspend_init)); 4551 #else 4552 __itt_suppress_mark_range(__itt_suppress_range, 4553 __itt_suppress_threading_errors, 4554 &new_thr->th.th_suspend_init_count, 4555 sizeof(new_thr->th.th_suspend_init_count)); 4556 #endif 4557 // TODO: check if we need to also suppress b_arrived flags 4558 __itt_suppress_mark_range(__itt_suppress_range, 4559 __itt_suppress_threading_errors, 4560 CCAST(kmp_uint64 *, &new_thr->th.th_bar[0].bb.b_go), 4561 sizeof(new_thr->th.th_bar[0].bb.b_go)); 4562 __itt_suppress_mark_range(__itt_suppress_range, 4563 __itt_suppress_threading_errors, 4564 CCAST(kmp_uint64 *, &new_thr->th.th_bar[1].bb.b_go), 4565 sizeof(new_thr->th.th_bar[1].bb.b_go)); 4566 __itt_suppress_mark_range(__itt_suppress_range, 4567 __itt_suppress_threading_errors, 4568 CCAST(kmp_uint64 *, &new_thr->th.th_bar[2].bb.b_go), 4569 sizeof(new_thr->th.th_bar[2].bb.b_go)); 4570 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY && KMP_DEBUG */ 4571 if (__kmp_storage_map) { 4572 __kmp_print_thread_storage_map(new_thr, new_gtid); 4573 } 4574 4575 // add the reserve serialized team, initialized from the team's primary thread 4576 { 4577 kmp_internal_control_t r_icvs = __kmp_get_x_global_icvs(team); 4578 KF_TRACE(10, ("__kmp_allocate_thread: before th_serial/serial_team\n")); 4579 new_thr->th.th_serial_team = serial_team = 4580 (kmp_team_t *)__kmp_allocate_team(root, 1, 1, 4581 #if OMPT_SUPPORT 4582 ompt_data_none, // root parallel id 4583 #endif 4584 proc_bind_default, &r_icvs, 4585 0 USE_NESTED_HOT_ARG(NULL)); 4586 } 4587 KMP_ASSERT(serial_team); 4588 serial_team->t.t_serialized = 0; // AC: the team created in reserve, not for 4589 // execution (it is unused for now). 
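  // Point the reserve serial team back at its owning thread so a later
  // serialized parallel region can reuse it without allocating a new team.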
4590 serial_team->t.t_threads[0] = new_thr; 4591 KF_TRACE(10, 4592 ("__kmp_allocate_thread: after th_serial/serial_team : new_thr=%p\n", 4593 new_thr)); 4594 4595 /* setup the thread structures */ 4596 __kmp_initialize_info(new_thr, team, new_tid, new_gtid); 4597 4598 #if USE_FAST_MEMORY 4599 __kmp_initialize_fast_memory(new_thr); 4600 #endif /* USE_FAST_MEMORY */ 4601 4602 #if KMP_USE_BGET 4603 KMP_DEBUG_ASSERT(new_thr->th.th_local.bget_data == NULL); 4604 __kmp_initialize_bget(new_thr); 4605 #endif 4606 4607 __kmp_init_random(new_thr); // Initialize random number generator 4608 4609 /* Initialize these only once when thread is grabbed for a team allocation */ 4610 KA_TRACE(20, 4611 ("__kmp_allocate_thread: T#%d init go fork=%u, plain=%u\n", 4612 __kmp_get_gtid(), KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE)); 4613 4614 int b; 4615 kmp_balign_t *balign = new_thr->th.th_bar; 4616 for (b = 0; b < bs_last_barrier; ++b) { 4617 balign[b].bb.b_go = KMP_INIT_BARRIER_STATE; 4618 balign[b].bb.team = NULL; 4619 balign[b].bb.wait_flag = KMP_BARRIER_NOT_WAITING; 4620 balign[b].bb.use_oncore_barrier = 0; 4621 } 4622 4623 TCW_PTR(new_thr->th.th_sleep_loc, NULL); 4624 new_thr->th.th_sleep_loc_type = flag_unset; 4625 4626 new_thr->th.th_spin_here = FALSE; 4627 new_thr->th.th_next_waiting = 0; 4628 #if KMP_OS_UNIX 4629 new_thr->th.th_blocking = false; 4630 #endif 4631 4632 #if KMP_AFFINITY_SUPPORTED 4633 new_thr->th.th_current_place = KMP_PLACE_UNDEFINED; 4634 new_thr->th.th_new_place = KMP_PLACE_UNDEFINED; 4635 new_thr->th.th_first_place = KMP_PLACE_UNDEFINED; 4636 new_thr->th.th_last_place = KMP_PLACE_UNDEFINED; 4637 #endif 4638 new_thr->th.th_def_allocator = __kmp_def_allocator; 4639 new_thr->th.th_prev_level = 0; 4640 new_thr->th.th_prev_num_threads = 1; 4641 4642 TCW_4(new_thr->th.th_in_pool, FALSE); 4643 new_thr->th.th_active_in_pool = FALSE; 4644 TCW_4(new_thr->th.th_active, TRUE); 4645 4646 /* adjust the global counters */ 4647 __kmp_all_nth++; 4648 __kmp_nth++; 4649 4650 // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search) for low 4651 // numbers of procs, and method #2 (keyed API call) for higher numbers. 4652 if (__kmp_adjust_gtid_mode) { 4653 if (__kmp_all_nth >= __kmp_tls_gtid_min) { 4654 if (TCR_4(__kmp_gtid_mode) != 2) { 4655 TCW_4(__kmp_gtid_mode, 2); 4656 } 4657 } else { 4658 if (TCR_4(__kmp_gtid_mode) != 1) { 4659 TCW_4(__kmp_gtid_mode, 1); 4660 } 4661 } 4662 } 4663 4664 #ifdef KMP_ADJUST_BLOCKTIME 4665 /* Adjust blocktime back to zero if necessary */ 4666 /* Middle initialization might not have occurred yet */ 4667 if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) { 4668 if (__kmp_nth > __kmp_avail_proc) { 4669 __kmp_zero_bt = TRUE; 4670 } 4671 } 4672 #endif /* KMP_ADJUST_BLOCKTIME */ 4673 4674 /* actually fork it and create the new worker thread */ 4675 KF_TRACE( 4676 10, ("__kmp_allocate_thread: before __kmp_create_worker: %p\n", new_thr)); 4677 __kmp_create_worker(new_gtid, new_thr, __kmp_stksize); 4678 KF_TRACE(10, 4679 ("__kmp_allocate_thread: after __kmp_create_worker: %p\n", new_thr)); 4680 4681 KA_TRACE(20, ("__kmp_allocate_thread: T#%d forked T#%d\n", __kmp_get_gtid(), 4682 new_gtid)); 4683 KMP_MB(); 4684 return new_thr; 4685 } 4686 4687 /* Reinitialize team for reuse. 4688 The hot team code calls this case at every fork barrier, so EPCC barrier 4689 test are extremely sensitive to changes in it, esp. writes to the team 4690 struct, which cause a cache invalidation in all threads. 
4691 IF YOU TOUCH THIS ROUTINE, RUN EPCC C SYNCBENCH ON A BIG-IRON MACHINE!!! */ 4692 static void __kmp_reinitialize_team(kmp_team_t *team, 4693 kmp_internal_control_t *new_icvs, 4694 ident_t *loc) { 4695 KF_TRACE(10, ("__kmp_reinitialize_team: enter this_thread=%p team=%p\n", 4696 team->t.t_threads[0], team)); 4697 KMP_DEBUG_ASSERT(team && new_icvs); 4698 KMP_DEBUG_ASSERT((!TCR_4(__kmp_init_parallel)) || new_icvs->nproc); 4699 KMP_CHECK_UPDATE(team->t.t_ident, loc); 4700 4701 KMP_CHECK_UPDATE(team->t.t_id, KMP_GEN_TEAM_ID()); 4702 // Copy ICVs to the primary thread's implicit taskdata 4703 __kmp_init_implicit_task(loc, team->t.t_threads[0], team, 0, FALSE); 4704 copy_icvs(&team->t.t_implicit_task_taskdata[0].td_icvs, new_icvs); 4705 4706 KF_TRACE(10, ("__kmp_reinitialize_team: exit this_thread=%p team=%p\n", 4707 team->t.t_threads[0], team)); 4708 } 4709 4710 /* Initialize the team data structure. 4711 This assumes the t_threads and t_max_nproc are already set. 4712 Also, we don't touch the arguments */ 4713 static void __kmp_initialize_team(kmp_team_t *team, int new_nproc, 4714 kmp_internal_control_t *new_icvs, 4715 ident_t *loc) { 4716 KF_TRACE(10, ("__kmp_initialize_team: enter: team=%p\n", team)); 4717 4718 /* verify */ 4719 KMP_DEBUG_ASSERT(team); 4720 KMP_DEBUG_ASSERT(new_nproc <= team->t.t_max_nproc); 4721 KMP_DEBUG_ASSERT(team->t.t_threads); 4722 KMP_MB(); 4723 4724 team->t.t_master_tid = 0; /* not needed */ 4725 /* team->t.t_master_bar; not needed */ 4726 team->t.t_serialized = new_nproc > 1 ? 0 : 1; 4727 team->t.t_nproc = new_nproc; 4728 4729 /* team->t.t_parent = NULL; TODO not needed & would mess up hot team */ 4730 team->t.t_next_pool = NULL; 4731 /* memset( team->t.t_threads, 0, sizeof(kmp_info_t*)*new_nproc ); would mess 4732 * up hot team */ 4733 4734 TCW_SYNC_PTR(team->t.t_pkfn, NULL); /* not needed */ 4735 team->t.t_invoke = NULL; /* not needed */ 4736 4737 // TODO???: team->t.t_max_active_levels = new_max_active_levels; 4738 team->t.t_sched.sched = new_icvs->sched.sched; 4739 4740 #if KMP_ARCH_X86 || KMP_ARCH_X86_64 4741 team->t.t_fp_control_saved = FALSE; /* not needed */ 4742 team->t.t_x87_fpu_control_word = 0; /* not needed */ 4743 team->t.t_mxcsr = 0; /* not needed */ 4744 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */ 4745 4746 team->t.t_construct = 0; 4747 4748 team->t.t_ordered.dt.t_value = 0; 4749 team->t.t_master_active = FALSE; 4750 4751 #ifdef KMP_DEBUG 4752 team->t.t_copypriv_data = NULL; /* not necessary, but nice for debugging */ 4753 #endif 4754 #if KMP_OS_WINDOWS 4755 team->t.t_copyin_counter = 0; /* for barrier-free copyin implementation */ 4756 #endif 4757 4758 team->t.t_control_stack_top = NULL; 4759 4760 __kmp_reinitialize_team(team, new_icvs, loc); 4761 4762 KMP_MB(); 4763 KF_TRACE(10, ("__kmp_initialize_team: exit: team=%p\n", team)); 4764 } 4765 4766 #if KMP_AFFINITY_SUPPORTED 4767 4768 // __kmp_partition_places() is the heart of the OpenMP 4.0 affinity mechanism. 4769 // It calculates the worker + primary thread's partition based upon the parent 4770 // thread's partition, and binds each worker to a thread in their partition. 4771 // The primary thread's partition should already include its current binding. 
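// Illustrative example (hypothetical place numbering): with a primary thread
// whose partition is the full set of 4 places {0,1,2,3} and which is bound to
// place 1, a team of 3 under proc_bind_close assigns the workers the next
// places in order, i.e. places 2 and 3; under proc_bind_primary both workers
// would instead be assigned the primary's place 1.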
4772 static void __kmp_partition_places(kmp_team_t *team, int update_master_only) { 4773 // Do not partition places for the hidden helper team 4774 if (KMP_HIDDEN_HELPER_TEAM(team)) 4775 return; 4776 // Copy the primary thread's place partition to the team struct 4777 kmp_info_t *master_th = team->t.t_threads[0]; 4778 KMP_DEBUG_ASSERT(master_th != NULL); 4779 kmp_proc_bind_t proc_bind = team->t.t_proc_bind; 4780 int first_place = master_th->th.th_first_place; 4781 int last_place = master_th->th.th_last_place; 4782 int masters_place = master_th->th.th_current_place; 4783 int num_masks = __kmp_affinity.num_masks; 4784 team->t.t_first_place = first_place; 4785 team->t.t_last_place = last_place; 4786 4787 KA_TRACE(20, ("__kmp_partition_places: enter: proc_bind = %d T#%d(%d:0) " 4788 "bound to place %d partition = [%d,%d]\n", 4789 proc_bind, __kmp_gtid_from_thread(team->t.t_threads[0]), 4790 team->t.t_id, masters_place, first_place, last_place)); 4791 4792 switch (proc_bind) { 4793 4794 case proc_bind_default: 4795 // Serial teams might have the proc_bind policy set to proc_bind_default. 4796 // Not an issue -- we don't rebind primary thread for any proc_bind policy. 4797 KMP_DEBUG_ASSERT(team->t.t_nproc == 1); 4798 break; 4799 4800 case proc_bind_primary: { 4801 int f; 4802 int n_th = team->t.t_nproc; 4803 for (f = 1; f < n_th; f++) { 4804 kmp_info_t *th = team->t.t_threads[f]; 4805 KMP_DEBUG_ASSERT(th != NULL); 4806 th->th.th_first_place = first_place; 4807 th->th.th_last_place = last_place; 4808 th->th.th_new_place = masters_place; 4809 if (__kmp_display_affinity && masters_place != th->th.th_current_place && 4810 team->t.t_display_affinity != 1) { 4811 team->t.t_display_affinity = 1; 4812 } 4813 4814 KA_TRACE(100, ("__kmp_partition_places: primary: T#%d(%d:%d) place %d " 4815 "partition = [%d,%d]\n", 4816 __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id, 4817 f, masters_place, first_place, last_place)); 4818 } 4819 } break; 4820 4821 case proc_bind_close: { 4822 int f; 4823 int n_th = team->t.t_nproc; 4824 int n_places; 4825 if (first_place <= last_place) { 4826 n_places = last_place - first_place + 1; 4827 } else { 4828 n_places = num_masks - first_place + last_place + 1; 4829 } 4830 if (n_th <= n_places) { 4831 int place = masters_place; 4832 for (f = 1; f < n_th; f++) { 4833 kmp_info_t *th = team->t.t_threads[f]; 4834 KMP_DEBUG_ASSERT(th != NULL); 4835 4836 if (place == last_place) { 4837 place = first_place; 4838 } else if (place == (num_masks - 1)) { 4839 place = 0; 4840 } else { 4841 place++; 4842 } 4843 th->th.th_first_place = first_place; 4844 th->th.th_last_place = last_place; 4845 th->th.th_new_place = place; 4846 if (__kmp_display_affinity && place != th->th.th_current_place && 4847 team->t.t_display_affinity != 1) { 4848 team->t.t_display_affinity = 1; 4849 } 4850 4851 KA_TRACE(100, ("__kmp_partition_places: close: T#%d(%d:%d) place %d " 4852 "partition = [%d,%d]\n", 4853 __kmp_gtid_from_thread(team->t.t_threads[f]), 4854 team->t.t_id, f, place, first_place, last_place)); 4855 } 4856 } else { 4857 int S, rem, gap, s_count; 4858 S = n_th / n_places; 4859 s_count = 0; 4860 rem = n_th - (S * n_places); 4861 gap = rem > 0 ? 
n_places / rem : n_places; 4862 int place = masters_place; 4863 int gap_ct = gap; 4864 for (f = 0; f < n_th; f++) { 4865 kmp_info_t *th = team->t.t_threads[f]; 4866 KMP_DEBUG_ASSERT(th != NULL); 4867 4868 th->th.th_first_place = first_place; 4869 th->th.th_last_place = last_place; 4870 th->th.th_new_place = place; 4871 if (__kmp_display_affinity && place != th->th.th_current_place && 4872 team->t.t_display_affinity != 1) { 4873 team->t.t_display_affinity = 1; 4874 } 4875 s_count++; 4876 4877 if ((s_count == S) && rem && (gap_ct == gap)) { 4878 // do nothing, add an extra thread to place on next iteration 4879 } else if ((s_count == S + 1) && rem && (gap_ct == gap)) { 4880 // we added an extra thread to this place; move to next place 4881 if (place == last_place) { 4882 place = first_place; 4883 } else if (place == (num_masks - 1)) { 4884 place = 0; 4885 } else { 4886 place++; 4887 } 4888 s_count = 0; 4889 gap_ct = 1; 4890 rem--; 4891 } else if (s_count == S) { // place full; don't add extra 4892 if (place == last_place) { 4893 place = first_place; 4894 } else if (place == (num_masks - 1)) { 4895 place = 0; 4896 } else { 4897 place++; 4898 } 4899 gap_ct++; 4900 s_count = 0; 4901 } 4902 4903 KA_TRACE(100, 4904 ("__kmp_partition_places: close: T#%d(%d:%d) place %d " 4905 "partition = [%d,%d]\n", 4906 __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id, f, 4907 th->th.th_new_place, first_place, last_place)); 4908 } 4909 KMP_DEBUG_ASSERT(place == masters_place); 4910 } 4911 } break; 4912 4913 case proc_bind_spread: { 4914 int f; 4915 int n_th = team->t.t_nproc; 4916 int n_places; 4917 int thidx; 4918 if (first_place <= last_place) { 4919 n_places = last_place - first_place + 1; 4920 } else { 4921 n_places = num_masks - first_place + last_place + 1; 4922 } 4923 if (n_th <= n_places) { 4924 int place = -1; 4925 4926 if (n_places != num_masks) { 4927 int S = n_places / n_th; 4928 int s_count, rem, gap, gap_ct; 4929 4930 place = masters_place; 4931 rem = n_places - n_th * S; 4932 gap = rem ? 
n_th / rem : 1; 4933 gap_ct = gap; 4934 thidx = n_th; 4935 if (update_master_only == 1) 4936 thidx = 1; 4937 for (f = 0; f < thidx; f++) { 4938 kmp_info_t *th = team->t.t_threads[f]; 4939 KMP_DEBUG_ASSERT(th != NULL); 4940 4941 th->th.th_first_place = place; 4942 th->th.th_new_place = place; 4943 if (__kmp_display_affinity && place != th->th.th_current_place && 4944 team->t.t_display_affinity != 1) { 4945 team->t.t_display_affinity = 1; 4946 } 4947 s_count = 1; 4948 while (s_count < S) { 4949 if (place == last_place) { 4950 place = first_place; 4951 } else if (place == (num_masks - 1)) { 4952 place = 0; 4953 } else { 4954 place++; 4955 } 4956 s_count++; 4957 } 4958 if (rem && (gap_ct == gap)) { 4959 if (place == last_place) { 4960 place = first_place; 4961 } else if (place == (num_masks - 1)) { 4962 place = 0; 4963 } else { 4964 place++; 4965 } 4966 rem--; 4967 gap_ct = 0; 4968 } 4969 th->th.th_last_place = place; 4970 gap_ct++; 4971 4972 if (place == last_place) { 4973 place = first_place; 4974 } else if (place == (num_masks - 1)) { 4975 place = 0; 4976 } else { 4977 place++; 4978 } 4979 4980 KA_TRACE(100, 4981 ("__kmp_partition_places: spread: T#%d(%d:%d) place %d " 4982 "partition = [%d,%d], num_masks: %u\n", 4983 __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id, 4984 f, th->th.th_new_place, th->th.th_first_place, 4985 th->th.th_last_place, num_masks)); 4986 } 4987 } else { 4988 /* Having uniform space of available computation places I can create 4989 T partitions of round(P/T) size and put threads into the first 4990 place of each partition. */ 4991 double current = static_cast<double>(masters_place); 4992 double spacing = 4993 (static_cast<double>(n_places + 1) / static_cast<double>(n_th)); 4994 int first, last; 4995 kmp_info_t *th; 4996 4997 thidx = n_th + 1; 4998 if (update_master_only == 1) 4999 thidx = 1; 5000 for (f = 0; f < thidx; f++) { 5001 first = static_cast<int>(current); 5002 last = static_cast<int>(current + spacing) - 1; 5003 KMP_DEBUG_ASSERT(last >= first); 5004 if (first >= n_places) { 5005 if (masters_place) { 5006 first -= n_places; 5007 last -= n_places; 5008 if (first == (masters_place + 1)) { 5009 KMP_DEBUG_ASSERT(f == n_th); 5010 first--; 5011 } 5012 if (last == masters_place) { 5013 KMP_DEBUG_ASSERT(f == (n_th - 1)); 5014 last--; 5015 } 5016 } else { 5017 KMP_DEBUG_ASSERT(f == n_th); 5018 first = 0; 5019 last = 0; 5020 } 5021 } 5022 if (last >= n_places) { 5023 last = (n_places - 1); 5024 } 5025 place = first; 5026 current += spacing; 5027 if (f < n_th) { 5028 KMP_DEBUG_ASSERT(0 <= first); 5029 KMP_DEBUG_ASSERT(n_places > first); 5030 KMP_DEBUG_ASSERT(0 <= last); 5031 KMP_DEBUG_ASSERT(n_places > last); 5032 KMP_DEBUG_ASSERT(last_place >= first_place); 5033 th = team->t.t_threads[f]; 5034 KMP_DEBUG_ASSERT(th); 5035 th->th.th_first_place = first; 5036 th->th.th_new_place = place; 5037 th->th.th_last_place = last; 5038 if (__kmp_display_affinity && place != th->th.th_current_place && 5039 team->t.t_display_affinity != 1) { 5040 team->t.t_display_affinity = 1; 5041 } 5042 KA_TRACE(100, 5043 ("__kmp_partition_places: spread: T#%d(%d:%d) place %d " 5044 "partition = [%d,%d], spacing = %.4f\n", 5045 __kmp_gtid_from_thread(team->t.t_threads[f]), 5046 team->t.t_id, f, th->th.th_new_place, 5047 th->th.th_first_place, th->th.th_last_place, spacing)); 5048 } 5049 } 5050 } 5051 KMP_DEBUG_ASSERT(update_master_only || place == masters_place); 5052 } else { 5053 int S, rem, gap, s_count; 5054 S = n_th / n_places; 5055 s_count = 0; 5056 rem = n_th - (S * n_places); 
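// Worked example of the blocking below: with n_th = 10 threads over
// n_places = 4 places, S = 2, rem = 2 and gap = 4 / 2 = 2, so every
// second place takes one extra thread and the places end up holding
// 3, 2, 3, 2 threads respectively.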
5057 gap = rem > 0 ? n_places / rem : n_places; 5058 int place = masters_place; 5059 int gap_ct = gap; 5060 thidx = n_th; 5061 if (update_master_only == 1) 5062 thidx = 1; 5063 for (f = 0; f < thidx; f++) { 5064 kmp_info_t *th = team->t.t_threads[f]; 5065 KMP_DEBUG_ASSERT(th != NULL); 5066 5067 th->th.th_first_place = place; 5068 th->th.th_last_place = place; 5069 th->th.th_new_place = place; 5070 if (__kmp_display_affinity && place != th->th.th_current_place && 5071 team->t.t_display_affinity != 1) { 5072 team->t.t_display_affinity = 1; 5073 } 5074 s_count++; 5075 5076 if ((s_count == S) && rem && (gap_ct == gap)) { 5077 // do nothing, add an extra thread to place on next iteration 5078 } else if ((s_count == S + 1) && rem && (gap_ct == gap)) { 5079 // we added an extra thread to this place; move on to next place 5080 if (place == last_place) { 5081 place = first_place; 5082 } else if (place == (num_masks - 1)) { 5083 place = 0; 5084 } else { 5085 place++; 5086 } 5087 s_count = 0; 5088 gap_ct = 1; 5089 rem--; 5090 } else if (s_count == S) { // place is full; don't add extra thread 5091 if (place == last_place) { 5092 place = first_place; 5093 } else if (place == (num_masks - 1)) { 5094 place = 0; 5095 } else { 5096 place++; 5097 } 5098 gap_ct++; 5099 s_count = 0; 5100 } 5101 5102 KA_TRACE(100, ("__kmp_partition_places: spread: T#%d(%d:%d) place %d " 5103 "partition = [%d,%d]\n", 5104 __kmp_gtid_from_thread(team->t.t_threads[f]), 5105 team->t.t_id, f, th->th.th_new_place, 5106 th->th.th_first_place, th->th.th_last_place)); 5107 } 5108 KMP_DEBUG_ASSERT(update_master_only || place == masters_place); 5109 } 5110 } break; 5111 5112 default: 5113 break; 5114 } 5115 5116 KA_TRACE(20, ("__kmp_partition_places: exit T#%d\n", team->t.t_id)); 5117 } 5118 5119 #endif // KMP_AFFINITY_SUPPORTED 5120 5121 /* allocate a new team data structure to use. take one off of the free pool if 5122 available */ 5123 kmp_team_t * 5124 __kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc, 5125 #if OMPT_SUPPORT 5126 ompt_data_t ompt_parallel_data, 5127 #endif 5128 kmp_proc_bind_t new_proc_bind, 5129 kmp_internal_control_t *new_icvs, 5130 int argc USE_NESTED_HOT_ARG(kmp_info_t *master)) { 5131 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_allocate_team); 5132 int f; 5133 kmp_team_t *team; 5134 int use_hot_team = !root->r.r_active; 5135 int level = 0; 5136 int do_place_partition = 1; 5137 5138 KA_TRACE(20, ("__kmp_allocate_team: called\n")); 5139 KMP_DEBUG_ASSERT(new_nproc >= 1 && argc >= 0); 5140 KMP_DEBUG_ASSERT(max_nproc >= new_nproc); 5141 KMP_MB(); 5142 5143 #if KMP_NESTED_HOT_TEAMS 5144 kmp_hot_team_ptr_t *hot_teams; 5145 if (master) { 5146 team = master->th.th_team; 5147 level = team->t.t_active_level; 5148 if (master->th.th_teams_microtask) { // in teams construct? 
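// Concretely: when the league has more than one team, the fork that creates
// a team's workers (t_pkfn == __kmp_teams_master) and any parallel region
// nested inside the teams construct move to the next hot-team level, while
// the outer fork that creates the league itself stays at the current level.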
5149 if (master->th.th_teams_size.nteams > 1 && 5150 ( // #teams > 1 5151 team->t.t_pkfn == 5152 (microtask_t)__kmp_teams_master || // inner fork of the teams 5153 master->th.th_teams_level < 5154 team->t.t_level)) { // or nested parallel inside the teams 5155 ++level; // not increment if #teams==1, or for outer fork of the teams; 5156 // increment otherwise 5157 } 5158 // Do not perform the place partition if inner fork of the teams 5159 // Wait until nested parallel region encountered inside teams construct 5160 if ((master->th.th_teams_size.nteams == 1 && 5161 master->th.th_teams_level >= team->t.t_level) || 5162 (team->t.t_pkfn == (microtask_t)__kmp_teams_master)) 5163 do_place_partition = 0; 5164 } 5165 hot_teams = master->th.th_hot_teams; 5166 if (level < __kmp_hot_teams_max_level && hot_teams && 5167 hot_teams[level].hot_team) { 5168 // hot team has already been allocated for given level 5169 use_hot_team = 1; 5170 } else { 5171 use_hot_team = 0; 5172 } 5173 } else { 5174 // check we won't access uninitialized hot_teams, just in case 5175 KMP_DEBUG_ASSERT(new_nproc == 1); 5176 } 5177 #endif 5178 // Optimization to use a "hot" team 5179 if (use_hot_team && new_nproc > 1) { 5180 KMP_DEBUG_ASSERT(new_nproc <= max_nproc); 5181 #if KMP_NESTED_HOT_TEAMS 5182 team = hot_teams[level].hot_team; 5183 #else 5184 team = root->r.r_hot_team; 5185 #endif 5186 #if KMP_DEBUG 5187 if (__kmp_tasking_mode != tskm_immediate_exec) { 5188 KA_TRACE(20, ("__kmp_allocate_team: hot team task_team[0] = %p " 5189 "task_team[1] = %p before reinit\n", 5190 team->t.t_task_team[0], team->t.t_task_team[1])); 5191 } 5192 #endif 5193 5194 if (team->t.t_nproc != new_nproc && 5195 __kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) { 5196 // Distributed barrier may need a resize 5197 int old_nthr = team->t.t_nproc; 5198 __kmp_resize_dist_barrier(team, old_nthr, new_nproc); 5199 } 5200 5201 // If not doing the place partition, then reset the team's proc bind 5202 // to indicate that partitioning of all threads still needs to take place 5203 if (do_place_partition == 0) 5204 team->t.t_proc_bind = proc_bind_default; 5205 // Has the number of threads changed? 5206 /* Let's assume the most common case is that the number of threads is 5207 unchanged, and put that case first. 
*/ 5208 if (team->t.t_nproc == new_nproc) { // Check changes in number of threads 5209 KA_TRACE(20, ("__kmp_allocate_team: reusing hot team\n")); 5210 // This case can mean that omp_set_num_threads() was called and the hot 5211 // team size was already reduced, so we check the special flag 5212 if (team->t.t_size_changed == -1) { 5213 team->t.t_size_changed = 1; 5214 } else { 5215 KMP_CHECK_UPDATE(team->t.t_size_changed, 0); 5216 } 5217 5218 // TODO???: team->t.t_max_active_levels = new_max_active_levels; 5219 kmp_r_sched_t new_sched = new_icvs->sched; 5220 // set primary thread's schedule as new run-time schedule 5221 KMP_CHECK_UPDATE(team->t.t_sched.sched, new_sched.sched); 5222 5223 __kmp_reinitialize_team(team, new_icvs, 5224 root->r.r_uber_thread->th.th_ident); 5225 5226 KF_TRACE(10, ("__kmp_allocate_team2: T#%d, this_thread=%p team=%p\n", 0, 5227 team->t.t_threads[0], team)); 5228 __kmp_push_current_task_to_thread(team->t.t_threads[0], team, 0); 5229 5230 #if KMP_AFFINITY_SUPPORTED 5231 if ((team->t.t_size_changed == 0) && 5232 (team->t.t_proc_bind == new_proc_bind)) { 5233 if (new_proc_bind == proc_bind_spread) { 5234 if (do_place_partition) { 5235 // add flag to update only master for spread 5236 __kmp_partition_places(team, 1); 5237 } 5238 } 5239 KA_TRACE(200, ("__kmp_allocate_team: reusing hot team #%d bindings: " 5240 "proc_bind = %d, partition = [%d,%d]\n", 5241 team->t.t_id, new_proc_bind, team->t.t_first_place, 5242 team->t.t_last_place)); 5243 } else { 5244 if (do_place_partition) { 5245 KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind); 5246 __kmp_partition_places(team); 5247 } 5248 } 5249 #else 5250 KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind); 5251 #endif /* KMP_AFFINITY_SUPPORTED */ 5252 } else if (team->t.t_nproc > new_nproc) { 5253 KA_TRACE(20, 5254 ("__kmp_allocate_team: decreasing hot team thread count to %d\n", 5255 new_nproc)); 5256 5257 team->t.t_size_changed = 1; 5258 if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) { 5259 // Barrier size already reduced earlier in this function 5260 // Activate team threads via th_used_in_team 5261 __kmp_add_threads_to_team(team, new_nproc); 5262 } 5263 #if KMP_NESTED_HOT_TEAMS 5264 if (__kmp_hot_teams_mode == 0) { 5265 // AC: saved number of threads should correspond to team's value in this 5266 // mode, can be bigger in mode 1, when hot team has threads in reserve 5267 KMP_DEBUG_ASSERT(hot_teams[level].hot_team_nth == team->t.t_nproc); 5268 hot_teams[level].hot_team_nth = new_nproc; 5269 #endif // KMP_NESTED_HOT_TEAMS 5270 /* release the extra threads we don't need any more */ 5271 for (f = new_nproc; f < team->t.t_nproc; f++) { 5272 KMP_DEBUG_ASSERT(team->t.t_threads[f]); 5273 if (__kmp_tasking_mode != tskm_immediate_exec) { 5274 // When decreasing team size, threads no longer in the team should 5275 // unref task team. 
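// (__kmp_free_thread() below leaves th_task_team untouched, so the pointer
// is cleared here before the thread is returned to the pool.)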
5276 team->t.t_threads[f]->th.th_task_team = NULL; 5277 } 5278 __kmp_free_thread(team->t.t_threads[f]); 5279 team->t.t_threads[f] = NULL; 5280 } 5281 #if KMP_NESTED_HOT_TEAMS 5282 } // (__kmp_hot_teams_mode == 0) 5283 else { 5284 // When keeping extra threads in team, switch threads to wait on own 5285 // b_go flag 5286 for (f = new_nproc; f < team->t.t_nproc; ++f) { 5287 KMP_DEBUG_ASSERT(team->t.t_threads[f]); 5288 kmp_balign_t *balign = team->t.t_threads[f]->th.th_bar; 5289 for (int b = 0; b < bs_last_barrier; ++b) { 5290 if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG) { 5291 balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG; 5292 } 5293 KMP_CHECK_UPDATE(balign[b].bb.leaf_kids, 0); 5294 } 5295 } 5296 } 5297 #endif // KMP_NESTED_HOT_TEAMS 5298 team->t.t_nproc = new_nproc; 5299 // TODO???: team->t.t_max_active_levels = new_max_active_levels; 5300 KMP_CHECK_UPDATE(team->t.t_sched.sched, new_icvs->sched.sched); 5301 __kmp_reinitialize_team(team, new_icvs, 5302 root->r.r_uber_thread->th.th_ident); 5303 5304 // Update remaining threads 5305 for (f = 0; f < new_nproc; ++f) { 5306 team->t.t_threads[f]->th.th_team_nproc = new_nproc; 5307 } 5308 5309 // restore the current task state of the primary thread: should be the 5310 // implicit task 5311 KF_TRACE(10, ("__kmp_allocate_team: T#%d, this_thread=%p team=%p\n", 0, 5312 team->t.t_threads[0], team)); 5313 5314 __kmp_push_current_task_to_thread(team->t.t_threads[0], team, 0); 5315 5316 #ifdef KMP_DEBUG 5317 for (f = 0; f < team->t.t_nproc; f++) { 5318 KMP_DEBUG_ASSERT(team->t.t_threads[f] && 5319 team->t.t_threads[f]->th.th_team_nproc == 5320 team->t.t_nproc); 5321 } 5322 #endif 5323 5324 if (do_place_partition) { 5325 KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind); 5326 #if KMP_AFFINITY_SUPPORTED 5327 __kmp_partition_places(team); 5328 #endif 5329 } 5330 } else { // team->t.t_nproc < new_nproc 5331 5332 KA_TRACE(20, 5333 ("__kmp_allocate_team: increasing hot team thread count to %d\n", 5334 new_nproc)); 5335 int old_nproc = team->t.t_nproc; // save old value and use to update only 5336 team->t.t_size_changed = 1; 5337 5338 #if KMP_NESTED_HOT_TEAMS 5339 int avail_threads = hot_teams[level].hot_team_nth; 5340 if (new_nproc < avail_threads) 5341 avail_threads = new_nproc; 5342 kmp_info_t **other_threads = team->t.t_threads; 5343 for (f = team->t.t_nproc; f < avail_threads; ++f) { 5344 // Adjust barrier data of reserved threads (if any) of the team 5345 // Other data will be set in __kmp_initialize_info() below. 5346 int b; 5347 kmp_balign_t *balign = other_threads[f]->th.th_bar; 5348 for (b = 0; b < bs_last_barrier; ++b) { 5349 balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived; 5350 KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG); 5351 #if USE_DEBUGGER 5352 balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived; 5353 #endif 5354 } 5355 } 5356 if (hot_teams[level].hot_team_nth >= new_nproc) { 5357 // we have all needed threads in reserve, no need to allocate any 5358 // this only possible in mode 1, cannot have reserved threads in mode 0 5359 KMP_DEBUG_ASSERT(__kmp_hot_teams_mode == 1); 5360 team->t.t_nproc = new_nproc; // just get reserved threads involved 5361 } else { 5362 // We may have some threads in reserve, but not enough; 5363 // get reserved threads involved if any. 
5364 team->t.t_nproc = hot_teams[level].hot_team_nth; 5365 hot_teams[level].hot_team_nth = new_nproc; // adjust hot team max size 5366 #endif // KMP_NESTED_HOT_TEAMS 5367 if (team->t.t_max_nproc < new_nproc) { 5368 /* reallocate larger arrays */ 5369 __kmp_reallocate_team_arrays(team, new_nproc); 5370 __kmp_reinitialize_team(team, new_icvs, NULL); 5371 } 5372 5373 #if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED 5374 /* Temporarily set full mask for primary thread before creation of 5375 workers. The reason is that workers inherit the affinity from the 5376 primary thread, so if a lot of workers are created on the single 5377 core quickly, they don't get a chance to set their own affinity for 5378 a long time. */ 5379 kmp_affinity_raii_t new_temp_affinity{__kmp_affin_fullMask}; 5380 #endif 5381 5382 /* allocate new threads for the hot team */ 5383 for (f = team->t.t_nproc; f < new_nproc; f++) { 5384 kmp_info_t *new_worker = __kmp_allocate_thread(root, team, f); 5385 KMP_DEBUG_ASSERT(new_worker); 5386 team->t.t_threads[f] = new_worker; 5387 5388 KA_TRACE(20, 5389 ("__kmp_allocate_team: team %d init T#%d arrived: " 5390 "join=%llu, plain=%llu\n", 5391 team->t.t_id, __kmp_gtid_from_tid(f, team), team->t.t_id, f, 5392 team->t.t_bar[bs_forkjoin_barrier].b_arrived, 5393 team->t.t_bar[bs_plain_barrier].b_arrived)); 5394 5395 { // Initialize barrier data for new threads. 5396 int b; 5397 kmp_balign_t *balign = new_worker->th.th_bar; 5398 for (b = 0; b < bs_last_barrier; ++b) { 5399 balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived; 5400 KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != 5401 KMP_BARRIER_PARENT_FLAG); 5402 #if USE_DEBUGGER 5403 balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived; 5404 #endif 5405 } 5406 } 5407 } 5408 5409 #if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED 5410 /* Restore initial primary thread's affinity mask */ 5411 new_temp_affinity.restore(); 5412 #endif 5413 #if KMP_NESTED_HOT_TEAMS 5414 } // end of check of t_nproc vs. new_nproc vs. 
hot_team_nth 5415 #endif // KMP_NESTED_HOT_TEAMS 5416 if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) { 5417 // Barrier size already increased earlier in this function 5418 // Activate team threads via th_used_in_team 5419 __kmp_add_threads_to_team(team, new_nproc); 5420 } 5421 /* make sure everyone is syncronized */ 5422 // new threads below 5423 __kmp_initialize_team(team, new_nproc, new_icvs, 5424 root->r.r_uber_thread->th.th_ident); 5425 5426 /* reinitialize the threads */ 5427 KMP_DEBUG_ASSERT(team->t.t_nproc == new_nproc); 5428 for (f = 0; f < team->t.t_nproc; ++f) 5429 __kmp_initialize_info(team->t.t_threads[f], team, f, 5430 __kmp_gtid_from_tid(f, team)); 5431 5432 // set th_task_state for new threads in hot team with older thread's state 5433 kmp_uint8 old_state = team->t.t_threads[old_nproc - 1]->th.th_task_state; 5434 for (f = old_nproc; f < team->t.t_nproc; ++f) 5435 team->t.t_threads[f]->th.th_task_state = old_state; 5436 5437 #ifdef KMP_DEBUG 5438 for (f = 0; f < team->t.t_nproc; ++f) { 5439 KMP_DEBUG_ASSERT(team->t.t_threads[f] && 5440 team->t.t_threads[f]->th.th_team_nproc == 5441 team->t.t_nproc); 5442 } 5443 #endif 5444 5445 if (do_place_partition) { 5446 KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind); 5447 #if KMP_AFFINITY_SUPPORTED 5448 __kmp_partition_places(team); 5449 #endif 5450 } 5451 } // Check changes in number of threads 5452 5453 kmp_info_t *master = team->t.t_threads[0]; 5454 if (master->th.th_teams_microtask) { 5455 for (f = 1; f < new_nproc; ++f) { 5456 // propagate teams construct specific info to workers 5457 kmp_info_t *thr = team->t.t_threads[f]; 5458 thr->th.th_teams_microtask = master->th.th_teams_microtask; 5459 thr->th.th_teams_level = master->th.th_teams_level; 5460 thr->th.th_teams_size = master->th.th_teams_size; 5461 } 5462 } 5463 #if KMP_NESTED_HOT_TEAMS 5464 if (level) { 5465 // Sync barrier state for nested hot teams, not needed for outermost hot 5466 // team. 5467 for (f = 1; f < new_nproc; ++f) { 5468 kmp_info_t *thr = team->t.t_threads[f]; 5469 int b; 5470 kmp_balign_t *balign = thr->th.th_bar; 5471 for (b = 0; b < bs_last_barrier; ++b) { 5472 balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived; 5473 KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG); 5474 #if USE_DEBUGGER 5475 balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived; 5476 #endif 5477 } 5478 } 5479 } 5480 #endif // KMP_NESTED_HOT_TEAMS 5481 5482 /* reallocate space for arguments if necessary */ 5483 __kmp_alloc_argv_entries(argc, team, TRUE); 5484 KMP_CHECK_UPDATE(team->t.t_argc, argc); 5485 // The hot team re-uses the previous task team, 5486 // if untouched during the previous release->gather phase. 
5487 5488 KF_TRACE(10, (" hot_team = %p\n", team)); 5489 5490 #if KMP_DEBUG 5491 if (__kmp_tasking_mode != tskm_immediate_exec) { 5492 KA_TRACE(20, ("__kmp_allocate_team: hot team task_team[0] = %p " 5493 "task_team[1] = %p after reinit\n", 5494 team->t.t_task_team[0], team->t.t_task_team[1])); 5495 } 5496 #endif 5497 5498 #if OMPT_SUPPORT 5499 __ompt_team_assign_id(team, ompt_parallel_data); 5500 #endif 5501 5502 KMP_MB(); 5503 5504 return team; 5505 } 5506 5507 /* next, let's try to take one from the team pool */ 5508 KMP_MB(); 5509 for (team = CCAST(kmp_team_t *, __kmp_team_pool); (team);) { 5510 /* TODO: consider resizing undersized teams instead of reaping them, now 5511 that we have a resizing mechanism */ 5512 if (team->t.t_max_nproc >= max_nproc) { 5513 /* take this team from the team pool */ 5514 __kmp_team_pool = team->t.t_next_pool; 5515 5516 if (max_nproc > 1 && 5517 __kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) { 5518 if (!team->t.b) { // Allocate barrier structure 5519 team->t.b = distributedBarrier::allocate(__kmp_dflt_team_nth_ub); 5520 } 5521 } 5522 5523 /* setup the team for fresh use */ 5524 __kmp_initialize_team(team, new_nproc, new_icvs, NULL); 5525 5526 KA_TRACE(20, ("__kmp_allocate_team: setting task_team[0] %p and " 5527 "task_team[1] %p to NULL\n", 5528 &team->t.t_task_team[0], &team->t.t_task_team[1])); 5529 team->t.t_task_team[0] = NULL; 5530 team->t.t_task_team[1] = NULL; 5531 5532 /* reallocate space for arguments if necessary */ 5533 __kmp_alloc_argv_entries(argc, team, TRUE); 5534 KMP_CHECK_UPDATE(team->t.t_argc, argc); 5535 5536 KA_TRACE( 5537 20, ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n", 5538 team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE)); 5539 { // Initialize barrier data. 5540 int b; 5541 for (b = 0; b < bs_last_barrier; ++b) { 5542 team->t.t_bar[b].b_arrived = KMP_INIT_BARRIER_STATE; 5543 #if USE_DEBUGGER 5544 team->t.t_bar[b].b_master_arrived = 0; 5545 team->t.t_bar[b].b_team_arrived = 0; 5546 #endif 5547 } 5548 } 5549 5550 team->t.t_proc_bind = new_proc_bind; 5551 5552 KA_TRACE(20, ("__kmp_allocate_team: using team from pool %d.\n", 5553 team->t.t_id)); 5554 5555 #if OMPT_SUPPORT 5556 __ompt_team_assign_id(team, ompt_parallel_data); 5557 #endif 5558 5559 KMP_MB(); 5560 5561 return team; 5562 } 5563 5564 /* reap team if it is too small, then loop back and check the next one */ 5565 // not sure if this is wise, but, will be redone during the hot-teams 5566 // rewrite. 5567 /* TODO: Use technique to find the right size hot-team, don't reap them */ 5568 team = __kmp_reap_team(team); 5569 __kmp_team_pool = team; 5570 } 5571 5572 /* nothing available in the pool, no matter, make a new team! 
*/ 5573 KMP_MB(); 5574 team = (kmp_team_t *)__kmp_allocate(sizeof(kmp_team_t)); 5575 5576 /* and set it up */ 5577 team->t.t_max_nproc = max_nproc; 5578 if (max_nproc > 1 && 5579 __kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) { 5580 // Allocate barrier structure 5581 team->t.b = distributedBarrier::allocate(__kmp_dflt_team_nth_ub); 5582 } 5583 5584 /* NOTE well, for some reason allocating one big buffer and dividing it up 5585 seems to really hurt performance a lot on the P4, so, let's not use this */ 5586 __kmp_allocate_team_arrays(team, max_nproc); 5587 5588 KA_TRACE(20, ("__kmp_allocate_team: making a new team\n")); 5589 __kmp_initialize_team(team, new_nproc, new_icvs, NULL); 5590 5591 KA_TRACE(20, ("__kmp_allocate_team: setting task_team[0] %p and task_team[1] " 5592 "%p to NULL\n", 5593 &team->t.t_task_team[0], &team->t.t_task_team[1])); 5594 team->t.t_task_team[0] = NULL; // to be removed, as __kmp_allocate zeroes 5595 // memory, no need to duplicate 5596 team->t.t_task_team[1] = NULL; // to be removed, as __kmp_allocate zeroes 5597 // memory, no need to duplicate 5598 5599 if (__kmp_storage_map) { 5600 __kmp_print_team_storage_map("team", team, team->t.t_id, new_nproc); 5601 } 5602 5603 /* allocate space for arguments */ 5604 __kmp_alloc_argv_entries(argc, team, FALSE); 5605 team->t.t_argc = argc; 5606 5607 KA_TRACE(20, 5608 ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n", 5609 team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE)); 5610 { // Initialize barrier data. 5611 int b; 5612 for (b = 0; b < bs_last_barrier; ++b) { 5613 team->t.t_bar[b].b_arrived = KMP_INIT_BARRIER_STATE; 5614 #if USE_DEBUGGER 5615 team->t.t_bar[b].b_master_arrived = 0; 5616 team->t.t_bar[b].b_team_arrived = 0; 5617 #endif 5618 } 5619 } 5620 5621 team->t.t_proc_bind = new_proc_bind; 5622 5623 #if OMPT_SUPPORT 5624 __ompt_team_assign_id(team, ompt_parallel_data); 5625 team->t.ompt_serialized_team_info = NULL; 5626 #endif 5627 5628 KMP_MB(); 5629 5630 KA_TRACE(20, ("__kmp_allocate_team: done creating a new team %d.\n", 5631 team->t.t_id)); 5632 5633 return team; 5634 } 5635 5636 /* TODO implement hot-teams at all levels */ 5637 /* TODO implement lazy thread release on demand (disband request) */ 5638 5639 /* free the team. return it to the team pool. release all the threads 5640 * associated with it */ 5641 void __kmp_free_team(kmp_root_t *root, 5642 kmp_team_t *team USE_NESTED_HOT_ARG(kmp_info_t *master)) { 5643 int f; 5644 KA_TRACE(20, ("__kmp_free_team: T#%d freeing team %d\n", __kmp_get_gtid(), 5645 team->t.t_id)); 5646 5647 /* verify state */ 5648 KMP_DEBUG_ASSERT(root); 5649 KMP_DEBUG_ASSERT(team); 5650 KMP_DEBUG_ASSERT(team->t.t_nproc <= team->t.t_max_nproc); 5651 KMP_DEBUG_ASSERT(team->t.t_threads); 5652 5653 int use_hot_team = team == root->r.r_hot_team; 5654 #if KMP_NESTED_HOT_TEAMS 5655 int level; 5656 if (master) { 5657 level = team->t.t_active_level - 1; 5658 if (master->th.th_teams_microtask) { // in teams construct? 
5659 if (master->th.th_teams_size.nteams > 1) { 5660 ++level; // level was not increased in teams construct for 5661 // team_of_masters 5662 } 5663 if (team->t.t_pkfn != (microtask_t)__kmp_teams_master && 5664 master->th.th_teams_level == team->t.t_level) { 5665 ++level; // level was not increased in teams construct for 5666 // team_of_workers before the parallel 5667 } // team->t.t_level will be increased inside parallel 5668 } 5669 #if KMP_DEBUG 5670 kmp_hot_team_ptr_t *hot_teams = master->th.th_hot_teams; 5671 #endif 5672 if (level < __kmp_hot_teams_max_level) { 5673 KMP_DEBUG_ASSERT(team == hot_teams[level].hot_team); 5674 use_hot_team = 1; 5675 } 5676 } 5677 #endif // KMP_NESTED_HOT_TEAMS 5678 5679 /* team is done working */ 5680 TCW_SYNC_PTR(team->t.t_pkfn, 5681 NULL); // Important for Debugging Support Library. 5682 #if KMP_OS_WINDOWS 5683 team->t.t_copyin_counter = 0; // init counter for possible reuse 5684 #endif 5685 // Do not reset pointer to parent team to NULL for hot teams. 5686 5687 /* if we are non-hot team, release our threads */ 5688 if (!use_hot_team) { 5689 if (__kmp_tasking_mode != tskm_immediate_exec) { 5690 // Wait for threads to reach reapable state 5691 for (f = 1; f < team->t.t_nproc; ++f) { 5692 KMP_DEBUG_ASSERT(team->t.t_threads[f]); 5693 kmp_info_t *th = team->t.t_threads[f]; 5694 volatile kmp_uint32 *state = &th->th.th_reap_state; 5695 while (*state != KMP_SAFE_TO_REAP) { 5696 #if KMP_OS_WINDOWS 5697 // On Windows a thread can be killed at any time, check this 5698 DWORD ecode; 5699 if (!__kmp_is_thread_alive(th, &ecode)) { 5700 *state = KMP_SAFE_TO_REAP; // reset the flag for dead thread 5701 break; 5702 } 5703 #endif 5704 // first check if thread is sleeping 5705 kmp_flag_64<> fl(&th->th.th_bar[bs_forkjoin_barrier].bb.b_go, th); 5706 if (fl.is_sleeping()) 5707 fl.resume(__kmp_gtid_from_thread(th)); 5708 KMP_CPU_PAUSE(); 5709 } 5710 } 5711 5712 // Delete task teams 5713 int tt_idx; 5714 for (tt_idx = 0; tt_idx < 2; ++tt_idx) { 5715 kmp_task_team_t *task_team = team->t.t_task_team[tt_idx]; 5716 if (task_team != NULL) { 5717 for (f = 0; f < team->t.t_nproc; ++f) { // threads unref task teams 5718 KMP_DEBUG_ASSERT(team->t.t_threads[f]); 5719 team->t.t_threads[f]->th.th_task_team = NULL; 5720 } 5721 KA_TRACE( 5722 20, 5723 ("__kmp_free_team: T#%d deactivating task_team %p on team %d\n", 5724 __kmp_get_gtid(), task_team, team->t.t_id)); 5725 #if KMP_NESTED_HOT_TEAMS 5726 __kmp_free_task_team(master, task_team); 5727 #endif 5728 team->t.t_task_team[tt_idx] = NULL; 5729 } 5730 } 5731 } 5732 5733 // Reset pointer to parent team only for non-hot teams. 
5734 team->t.t_parent = NULL; 5735 team->t.t_level = 0; 5736 team->t.t_active_level = 0; 5737 5738 /* free the worker threads */ 5739 for (f = 1; f < team->t.t_nproc; ++f) { 5740 KMP_DEBUG_ASSERT(team->t.t_threads[f]); 5741 if (__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) { 5742 KMP_COMPARE_AND_STORE_ACQ32(&(team->t.t_threads[f]->th.th_used_in_team), 5743 1, 2); 5744 } 5745 __kmp_free_thread(team->t.t_threads[f]); 5746 } 5747 5748 if (__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) { 5749 if (team->t.b) { 5750 // wake up thread at old location 5751 team->t.b->go_release(); 5752 if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) { 5753 for (f = 1; f < team->t.t_nproc; ++f) { 5754 if (team->t.b->sleep[f].sleep) { 5755 __kmp_atomic_resume_64( 5756 team->t.t_threads[f]->th.th_info.ds.ds_gtid, 5757 (kmp_atomic_flag_64<> *)NULL); 5758 } 5759 } 5760 } 5761 // Wait for threads to be removed from team 5762 for (int f = 1; f < team->t.t_nproc; ++f) { 5763 while (team->t.t_threads[f]->th.th_used_in_team.load() != 0) 5764 KMP_CPU_PAUSE(); 5765 } 5766 } 5767 } 5768 5769 for (f = 1; f < team->t.t_nproc; ++f) { 5770 team->t.t_threads[f] = NULL; 5771 } 5772 5773 if (team->t.t_max_nproc > 1 && 5774 __kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) { 5775 distributedBarrier::deallocate(team->t.b); 5776 team->t.b = NULL; 5777 } 5778 /* put the team back in the team pool */ 5779 /* TODO limit size of team pool, call reap_team if pool too large */ 5780 team->t.t_next_pool = CCAST(kmp_team_t *, __kmp_team_pool); 5781 __kmp_team_pool = (volatile kmp_team_t *)team; 5782 } else { // Check if team was created for primary threads in teams construct 5783 // See if first worker is a CG root 5784 KMP_DEBUG_ASSERT(team->t.t_threads[1] && 5785 team->t.t_threads[1]->th.th_cg_roots); 5786 if (team->t.t_threads[1]->th.th_cg_roots->cg_root == team->t.t_threads[1]) { 5787 // Clean up the CG root nodes on workers so that this team can be re-used 5788 for (f = 1; f < team->t.t_nproc; ++f) { 5789 kmp_info_t *thr = team->t.t_threads[f]; 5790 KMP_DEBUG_ASSERT(thr && thr->th.th_cg_roots && 5791 thr->th.th_cg_roots->cg_root == thr); 5792 // Pop current CG root off list 5793 kmp_cg_root_t *tmp = thr->th.th_cg_roots; 5794 thr->th.th_cg_roots = tmp->up; 5795 KA_TRACE(100, ("__kmp_free_team: Thread %p popping node %p and moving" 5796 " up to node %p. cg_nthreads was %d\n", 5797 thr, tmp, thr->th.th_cg_roots, tmp->cg_nthreads)); 5798 int i = tmp->cg_nthreads--; 5799 if (i == 1) { 5800 __kmp_free(tmp); // free CG if we are the last thread in it 5801 } 5802 // Restore current task's thread_limit from CG root 5803 if (thr->th.th_cg_roots) 5804 thr->th.th_current_task->td_icvs.thread_limit = 5805 thr->th.th_cg_roots->cg_thread_limit; 5806 } 5807 } 5808 } 5809 5810 KMP_MB(); 5811 } 5812 5813 /* reap the team. destroy it, reclaim all its resources and free its memory */ 5814 kmp_team_t *__kmp_reap_team(kmp_team_t *team) { 5815 kmp_team_t *next_pool = team->t.t_next_pool; 5816 5817 KMP_DEBUG_ASSERT(team); 5818 KMP_DEBUG_ASSERT(team->t.t_dispatch); 5819 KMP_DEBUG_ASSERT(team->t.t_disp_buffer); 5820 KMP_DEBUG_ASSERT(team->t.t_threads); 5821 KMP_DEBUG_ASSERT(team->t.t_argv); 5822 5823 /* TODO clean the threads that are a part of this? */ 5824 5825 /* free stuff */ 5826 __kmp_free_team_arrays(team); 5827 if (team->t.t_argv != &team->t.t_inline_argv[0]) 5828 __kmp_free((void *)team->t.t_argv); 5829 __kmp_free(team); 5830 5831 KMP_MB(); 5832 return next_pool; 5833 } 5834 5835 // Free the thread. 
Don't reap it, just place it on the pool of available 5836 // threads. 5837 // 5838 // Changes for Quad issue 527845: We need a predictable OMP tid <-> gtid 5839 // binding for the affinity mechanism to be useful. 5840 // 5841 // Now, we always keep the free list (__kmp_thread_pool) sorted by gtid. 5842 // However, we want to avoid a potential performance problem by always 5843 // scanning through the list to find the correct point at which to insert 5844 // the thread (potential N**2 behavior). To do this we keep track of the 5845 // last place a thread struct was inserted (__kmp_thread_pool_insert_pt). 5846 // With single-level parallelism, threads will always be added to the tail 5847 // of the list, kept track of by __kmp_thread_pool_insert_pt. With nested 5848 // parallelism, all bets are off and we may need to scan through the entire 5849 // free list. 5850 // 5851 // This change also has a potentially large performance benefit, for some 5852 // applications. Previously, as threads were freed from the hot team, they 5853 // would be placed back on the free list in inverse order. If the hot team 5854 // grew back to it's original size, then the freed thread would be placed 5855 // back on the hot team in reverse order. This could cause bad cache 5856 // locality problems on programs where the size of the hot team regularly 5857 // grew and shrunk. 5858 // 5859 // Now, for single-level parallelism, the OMP tid is always == gtid. 5860 void __kmp_free_thread(kmp_info_t *this_th) { 5861 int gtid; 5862 kmp_info_t **scan; 5863 5864 KA_TRACE(20, ("__kmp_free_thread: T#%d putting T#%d back on free pool.\n", 5865 __kmp_get_gtid(), this_th->th.th_info.ds.ds_gtid)); 5866 5867 KMP_DEBUG_ASSERT(this_th); 5868 5869 // When moving thread to pool, switch thread to wait on own b_go flag, and 5870 // uninitialized (NULL team). 5871 int b; 5872 kmp_balign_t *balign = this_th->th.th_bar; 5873 for (b = 0; b < bs_last_barrier; ++b) { 5874 if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG) 5875 balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG; 5876 balign[b].bb.team = NULL; 5877 balign[b].bb.leaf_kids = 0; 5878 } 5879 this_th->th.th_task_state = 0; 5880 this_th->th.th_reap_state = KMP_SAFE_TO_REAP; 5881 5882 /* put thread back on the free pool */ 5883 TCW_PTR(this_th->th.th_team, NULL); 5884 TCW_PTR(this_th->th.th_root, NULL); 5885 TCW_PTR(this_th->th.th_dispatch, NULL); /* NOT NEEDED */ 5886 5887 while (this_th->th.th_cg_roots) { 5888 this_th->th.th_cg_roots->cg_nthreads--; 5889 KA_TRACE(100, ("__kmp_free_thread: Thread %p decrement cg_nthreads on node" 5890 " %p of thread %p to %d\n", 5891 this_th, this_th->th.th_cg_roots, 5892 this_th->th.th_cg_roots->cg_root, 5893 this_th->th.th_cg_roots->cg_nthreads)); 5894 kmp_cg_root_t *tmp = this_th->th.th_cg_roots; 5895 if (tmp->cg_root == this_th) { // Thread is a cg_root 5896 KMP_DEBUG_ASSERT(tmp->cg_nthreads == 0); 5897 KA_TRACE( 5898 5, ("__kmp_free_thread: Thread %p freeing node %p\n", this_th, tmp)); 5899 this_th->th.th_cg_roots = tmp->up; 5900 __kmp_free(tmp); 5901 } else { // Worker thread 5902 if (tmp->cg_nthreads == 0) { // last thread leaves contention group 5903 __kmp_free(tmp); 5904 } 5905 this_th->th.th_cg_roots = NULL; 5906 break; 5907 } 5908 } 5909 5910 /* If the implicit task assigned to this thread can be used by other threads 5911 * -> multiple threads can share the data and try to free the task at 5912 * __kmp_reap_thread at exit. 
This duplicate use of the task data can happen 5913 * with higher probability when hot team is disabled but can occurs even when 5914 * the hot team is enabled */ 5915 __kmp_free_implicit_task(this_th); 5916 this_th->th.th_current_task = NULL; 5917 5918 // If the __kmp_thread_pool_insert_pt is already past the new insert 5919 // point, then we need to re-scan the entire list. 5920 gtid = this_th->th.th_info.ds.ds_gtid; 5921 if (__kmp_thread_pool_insert_pt != NULL) { 5922 KMP_DEBUG_ASSERT(__kmp_thread_pool != NULL); 5923 if (__kmp_thread_pool_insert_pt->th.th_info.ds.ds_gtid > gtid) { 5924 __kmp_thread_pool_insert_pt = NULL; 5925 } 5926 } 5927 5928 // Scan down the list to find the place to insert the thread. 5929 // scan is the address of a link in the list, possibly the address of 5930 // __kmp_thread_pool itself. 5931 // 5932 // In the absence of nested parallelism, the for loop will have 0 iterations. 5933 if (__kmp_thread_pool_insert_pt != NULL) { 5934 scan = &(__kmp_thread_pool_insert_pt->th.th_next_pool); 5935 } else { 5936 scan = CCAST(kmp_info_t **, &__kmp_thread_pool); 5937 } 5938 for (; (*scan != NULL) && ((*scan)->th.th_info.ds.ds_gtid < gtid); 5939 scan = &((*scan)->th.th_next_pool)) 5940 ; 5941 5942 // Insert the new element on the list, and set __kmp_thread_pool_insert_pt 5943 // to its address. 5944 TCW_PTR(this_th->th.th_next_pool, *scan); 5945 __kmp_thread_pool_insert_pt = *scan = this_th; 5946 KMP_DEBUG_ASSERT((this_th->th.th_next_pool == NULL) || 5947 (this_th->th.th_info.ds.ds_gtid < 5948 this_th->th.th_next_pool->th.th_info.ds.ds_gtid)); 5949 TCW_4(this_th->th.th_in_pool, TRUE); 5950 __kmp_suspend_initialize_thread(this_th); 5951 __kmp_lock_suspend_mx(this_th); 5952 if (this_th->th.th_active == TRUE) { 5953 KMP_ATOMIC_INC(&__kmp_thread_pool_active_nth); 5954 this_th->th.th_active_in_pool = TRUE; 5955 } 5956 #if KMP_DEBUG 5957 else { 5958 KMP_DEBUG_ASSERT(this_th->th.th_active_in_pool == FALSE); 5959 } 5960 #endif 5961 __kmp_unlock_suspend_mx(this_th); 5962 5963 TCW_4(__kmp_nth, __kmp_nth - 1); 5964 5965 #ifdef KMP_ADJUST_BLOCKTIME 5966 /* Adjust blocktime back to user setting or default if necessary */ 5967 /* Middle initialization might never have occurred */ 5968 if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) { 5969 KMP_DEBUG_ASSERT(__kmp_avail_proc > 0); 5970 if (__kmp_nth <= __kmp_avail_proc) { 5971 __kmp_zero_bt = FALSE; 5972 } 5973 } 5974 #endif /* KMP_ADJUST_BLOCKTIME */ 5975 5976 KMP_MB(); 5977 } 5978 5979 /* ------------------------------------------------------------------------ */ 5980 5981 void *__kmp_launch_thread(kmp_info_t *this_thr) { 5982 #if OMP_PROFILING_SUPPORT 5983 ProfileTraceFile = getenv("LIBOMPTARGET_PROFILE"); 5984 // TODO: add a configuration option for time granularity 5985 if (ProfileTraceFile) 5986 llvm::timeTraceProfilerInitialize(500 /* us */, "libomptarget"); 5987 #endif 5988 5989 int gtid = this_thr->th.th_info.ds.ds_gtid; 5990 /* void *stack_data;*/ 5991 kmp_team_t **volatile pteam; 5992 5993 KMP_MB(); 5994 KA_TRACE(10, ("__kmp_launch_thread: T#%d start\n", gtid)); 5995 5996 if (__kmp_env_consistency_check) { 5997 this_thr->th.th_cons = __kmp_allocate_cons_stack(gtid); // ATT: Memory leak? 
5998 } 5999 6000 #if OMPD_SUPPORT 6001 if (ompd_state & OMPD_ENABLE_BP) 6002 ompd_bp_thread_begin(); 6003 #endif 6004 6005 #if OMPT_SUPPORT 6006 ompt_data_t *thread_data = nullptr; 6007 if (ompt_enabled.enabled) { 6008 thread_data = &(this_thr->th.ompt_thread_info.thread_data); 6009 *thread_data = ompt_data_none; 6010 6011 this_thr->th.ompt_thread_info.state = ompt_state_overhead; 6012 this_thr->th.ompt_thread_info.wait_id = 0; 6013 this_thr->th.ompt_thread_info.idle_frame = OMPT_GET_FRAME_ADDRESS(0); 6014 this_thr->th.ompt_thread_info.parallel_flags = 0; 6015 if (ompt_enabled.ompt_callback_thread_begin) { 6016 ompt_callbacks.ompt_callback(ompt_callback_thread_begin)( 6017 ompt_thread_worker, thread_data); 6018 } 6019 this_thr->th.ompt_thread_info.state = ompt_state_idle; 6020 } 6021 #endif 6022 6023 /* This is the place where threads wait for work */ 6024 while (!TCR_4(__kmp_global.g.g_done)) { 6025 KMP_DEBUG_ASSERT(this_thr == __kmp_threads[gtid]); 6026 KMP_MB(); 6027 6028 /* wait for work to do */ 6029 KA_TRACE(20, ("__kmp_launch_thread: T#%d waiting for work\n", gtid)); 6030 6031 /* No tid yet since not part of a team */ 6032 __kmp_fork_barrier(gtid, KMP_GTID_DNE); 6033 6034 #if OMPT_SUPPORT 6035 if (ompt_enabled.enabled) { 6036 this_thr->th.ompt_thread_info.state = ompt_state_overhead; 6037 } 6038 #endif 6039 6040 pteam = &this_thr->th.th_team; 6041 6042 /* have we been allocated? */ 6043 if (TCR_SYNC_PTR(*pteam) && !TCR_4(__kmp_global.g.g_done)) { 6044 /* we were just woken up, so run our new task */ 6045 if (TCR_SYNC_PTR((*pteam)->t.t_pkfn) != NULL) { 6046 int rc; 6047 KA_TRACE(20, 6048 ("__kmp_launch_thread: T#%d(%d:%d) invoke microtask = %p\n", 6049 gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid), 6050 (*pteam)->t.t_pkfn)); 6051 6052 updateHWFPControl(*pteam); 6053 6054 #if OMPT_SUPPORT 6055 if (ompt_enabled.enabled) { 6056 this_thr->th.ompt_thread_info.state = ompt_state_work_parallel; 6057 } 6058 #endif 6059 6060 rc = (*pteam)->t.t_invoke(gtid); 6061 KMP_ASSERT(rc); 6062 6063 KMP_MB(); 6064 KA_TRACE(20, ("__kmp_launch_thread: T#%d(%d:%d) done microtask = %p\n", 6065 gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid), 6066 (*pteam)->t.t_pkfn)); 6067 } 6068 #if OMPT_SUPPORT 6069 if (ompt_enabled.enabled) { 6070 /* no frame set while outside task */ 6071 __ompt_get_task_info_object(0)->frame.exit_frame = ompt_data_none; 6072 6073 this_thr->th.ompt_thread_info.state = ompt_state_overhead; 6074 } 6075 #endif 6076 /* join barrier after parallel region */ 6077 __kmp_join_barrier(gtid); 6078 } 6079 } 6080 6081 #if OMPD_SUPPORT 6082 if (ompd_state & OMPD_ENABLE_BP) 6083 ompd_bp_thread_end(); 6084 #endif 6085 6086 #if OMPT_SUPPORT 6087 if (ompt_enabled.ompt_callback_thread_end) { 6088 ompt_callbacks.ompt_callback(ompt_callback_thread_end)(thread_data); 6089 } 6090 #endif 6091 6092 this_thr->th.th_task_team = NULL; 6093 /* run the destructors for the threadprivate data for this thread */ 6094 __kmp_common_destroy_gtid(gtid); 6095 6096 KA_TRACE(10, ("__kmp_launch_thread: T#%d done\n", gtid)); 6097 KMP_MB(); 6098 6099 #if OMP_PROFILING_SUPPORT 6100 llvm::timeTraceProfilerFinishThread(); 6101 #endif 6102 return this_thr; 6103 } 6104 6105 /* ------------------------------------------------------------------------ */ 6106 6107 void __kmp_internal_end_dest(void *specific_gtid) { 6108 // Make sure no significant bits are lost 6109 int gtid; 6110 __kmp_type_convert((kmp_intptr_t)specific_gtid - 1, >id); 6111 6112 KA_TRACE(30, ("__kmp_internal_end_dest: T#%d\n", gtid)); 6113 /* NOTE: the gtid is 
stored as gitd+1 in the thread-local-storage 6114 * this is because 0 is reserved for the nothing-stored case */ 6115 6116 __kmp_internal_end_thread(gtid); 6117 } 6118 6119 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB 6120 6121 __attribute__((destructor)) void __kmp_internal_end_dtor(void) { 6122 __kmp_internal_end_atexit(); 6123 } 6124 6125 #endif 6126 6127 /* [Windows] josh: when the atexit handler is called, there may still be more 6128 than one thread alive */ 6129 void __kmp_internal_end_atexit(void) { 6130 KA_TRACE(30, ("__kmp_internal_end_atexit\n")); 6131 /* [Windows] 6132 josh: ideally, we want to completely shutdown the library in this atexit 6133 handler, but stat code that depends on thread specific data for gtid fails 6134 because that data becomes unavailable at some point during the shutdown, so 6135 we call __kmp_internal_end_thread instead. We should eventually remove the 6136 dependency on __kmp_get_specific_gtid in the stat code and use 6137 __kmp_internal_end_library to cleanly shutdown the library. 6138 6139 // TODO: Can some of this comment about GVS be removed? 6140 I suspect that the offending stat code is executed when the calling thread 6141 tries to clean up a dead root thread's data structures, resulting in GVS 6142 code trying to close the GVS structures for that thread, but since the stat 6143 code uses __kmp_get_specific_gtid to get the gtid with the assumption that 6144 the calling thread is cleaning up itself instead of another thread, it get 6145 confused. This happens because allowing a thread to unregister and cleanup 6146 another thread is a recent modification for addressing an issue. 6147 Based on the current design (20050722), a thread may end up 6148 trying to unregister another thread only if thread death does not trigger 6149 the calling of __kmp_internal_end_thread. For Linux* OS, there is the 6150 thread specific data destructor function to detect thread death. For 6151 Windows dynamic, there is DllMain(THREAD_DETACH). For Windows static, there 6152 is nothing. Thus, the workaround is applicable only for Windows static 6153 stat library. */ 6154 __kmp_internal_end_library(-1); 6155 #if KMP_OS_WINDOWS 6156 __kmp_close_console(); 6157 #endif 6158 } 6159 6160 static void __kmp_reap_thread(kmp_info_t *thread, int is_root) { 6161 // It is assumed __kmp_forkjoin_lock is acquired. 6162 6163 int gtid; 6164 6165 KMP_DEBUG_ASSERT(thread != NULL); 6166 6167 gtid = thread->th.th_info.ds.ds_gtid; 6168 6169 if (!is_root) { 6170 if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) { 6171 /* Assume the threads are at the fork barrier here */ 6172 KA_TRACE( 6173 20, ("__kmp_reap_thread: releasing T#%d from fork barrier for reap\n", 6174 gtid)); 6175 if (__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) { 6176 while ( 6177 !KMP_COMPARE_AND_STORE_ACQ32(&(thread->th.th_used_in_team), 0, 3)) 6178 KMP_CPU_PAUSE(); 6179 __kmp_resume_32(gtid, (kmp_flag_32<false, false> *)NULL); 6180 } else { 6181 /* Need release fence here to prevent seg faults for tree forkjoin 6182 barrier (GEH) */ 6183 kmp_flag_64<> flag(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go, 6184 thread); 6185 __kmp_release_64(&flag); 6186 } 6187 } 6188 6189 // Terminate OS thread. 6190 __kmp_reap_worker(thread); 6191 6192 // The thread was killed asynchronously. If it was actively 6193 // spinning in the thread pool, decrement the global count. 
6194 // 6195 // There is a small timing hole here - if the worker thread was just waking 6196 // up after sleeping in the pool, had reset it's th_active_in_pool flag but 6197 // not decremented the global counter __kmp_thread_pool_active_nth yet, then 6198 // the global counter might not get updated. 6199 // 6200 // Currently, this can only happen as the library is unloaded, 6201 // so there are no harmful side effects. 6202 if (thread->th.th_active_in_pool) { 6203 thread->th.th_active_in_pool = FALSE; 6204 KMP_ATOMIC_DEC(&__kmp_thread_pool_active_nth); 6205 KMP_DEBUG_ASSERT(__kmp_thread_pool_active_nth >= 0); 6206 } 6207 } 6208 6209 __kmp_free_implicit_task(thread); 6210 6211 // Free the fast memory for tasking 6212 #if USE_FAST_MEMORY 6213 __kmp_free_fast_memory(thread); 6214 #endif /* USE_FAST_MEMORY */ 6215 6216 __kmp_suspend_uninitialize_thread(thread); 6217 6218 KMP_DEBUG_ASSERT(__kmp_threads[gtid] == thread); 6219 TCW_SYNC_PTR(__kmp_threads[gtid], NULL); 6220 6221 --__kmp_all_nth; 6222 // __kmp_nth was decremented when thread is added to the pool. 6223 6224 #ifdef KMP_ADJUST_BLOCKTIME 6225 /* Adjust blocktime back to user setting or default if necessary */ 6226 /* Middle initialization might never have occurred */ 6227 if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) { 6228 KMP_DEBUG_ASSERT(__kmp_avail_proc > 0); 6229 if (__kmp_nth <= __kmp_avail_proc) { 6230 __kmp_zero_bt = FALSE; 6231 } 6232 } 6233 #endif /* KMP_ADJUST_BLOCKTIME */ 6234 6235 /* free the memory being used */ 6236 if (__kmp_env_consistency_check) { 6237 if (thread->th.th_cons) { 6238 __kmp_free_cons_stack(thread->th.th_cons); 6239 thread->th.th_cons = NULL; 6240 } 6241 } 6242 6243 if (thread->th.th_pri_common != NULL) { 6244 __kmp_free(thread->th.th_pri_common); 6245 thread->th.th_pri_common = NULL; 6246 } 6247 6248 if (thread->th.th_task_state_memo_stack != NULL) { 6249 __kmp_free(thread->th.th_task_state_memo_stack); 6250 thread->th.th_task_state_memo_stack = NULL; 6251 } 6252 6253 #if KMP_USE_BGET 6254 if (thread->th.th_local.bget_data != NULL) { 6255 __kmp_finalize_bget(thread); 6256 } 6257 #endif 6258 6259 #if KMP_AFFINITY_SUPPORTED 6260 if (thread->th.th_affin_mask != NULL) { 6261 KMP_CPU_FREE(thread->th.th_affin_mask); 6262 thread->th.th_affin_mask = NULL; 6263 } 6264 #endif /* KMP_AFFINITY_SUPPORTED */ 6265 6266 #if KMP_USE_HIER_SCHED 6267 if (thread->th.th_hier_bar_data != NULL) { 6268 __kmp_free(thread->th.th_hier_bar_data); 6269 thread->th.th_hier_bar_data = NULL; 6270 } 6271 #endif 6272 6273 __kmp_reap_team(thread->th.th_serial_team); 6274 thread->th.th_serial_team = NULL; 6275 __kmp_free(thread); 6276 6277 KMP_MB(); 6278 6279 } // __kmp_reap_thread 6280 6281 static void __kmp_itthash_clean(kmp_info_t *th) { 6282 #if USE_ITT_NOTIFY 6283 if (__kmp_itt_region_domains.count > 0) { 6284 for (int i = 0; i < KMP_MAX_FRAME_DOMAINS; ++i) { 6285 kmp_itthash_entry_t *bucket = __kmp_itt_region_domains.buckets[i]; 6286 while (bucket) { 6287 kmp_itthash_entry_t *next = bucket->next_in_bucket; 6288 __kmp_thread_free(th, bucket); 6289 bucket = next; 6290 } 6291 } 6292 } 6293 if (__kmp_itt_barrier_domains.count > 0) { 6294 for (int i = 0; i < KMP_MAX_FRAME_DOMAINS; ++i) { 6295 kmp_itthash_entry_t *bucket = __kmp_itt_barrier_domains.buckets[i]; 6296 while (bucket) { 6297 kmp_itthash_entry_t *next = bucket->next_in_bucket; 6298 __kmp_thread_free(th, bucket); 6299 bucket = next; 6300 } 6301 } 6302 } 6303 #endif 6304 } 6305 6306 static void __kmp_internal_end(void) { 6307 int i; 6308 6309 /* First, unregister the library 
*/ 6310 __kmp_unregister_library(); 6311 6312 #if KMP_OS_WINDOWS 6313 /* In Win static library, we can't tell when a root actually dies, so we 6314 reclaim the data structures for any root threads that have died but not 6315 unregistered themselves, in order to shut down cleanly. 6316 In Win dynamic library we also can't tell when a thread dies. */ 6317 __kmp_reclaim_dead_roots(); // AC: moved here to always clean resources of 6318 // dead roots 6319 #endif 6320 6321 for (i = 0; i < __kmp_threads_capacity; i++) 6322 if (__kmp_root[i]) 6323 if (__kmp_root[i]->r.r_active) 6324 break; 6325 KMP_MB(); /* Flush all pending memory write invalidates. */ 6326 TCW_SYNC_4(__kmp_global.g.g_done, TRUE); 6327 6328 if (i < __kmp_threads_capacity) { 6329 #if KMP_USE_MONITOR 6330 // 2009-09-08 (lev): Other alive roots found. Why do we kill the monitor?? 6331 KMP_MB(); /* Flush all pending memory write invalidates. */ 6332 6333 // Need to check that monitor was initialized before reaping it. If we are 6334 // called form __kmp_atfork_child (which sets __kmp_init_parallel = 0), then 6335 // __kmp_monitor will appear to contain valid data, but it is only valid in 6336 // the parent process, not the child. 6337 // New behavior (201008): instead of keying off of the flag 6338 // __kmp_init_parallel, the monitor thread creation is keyed off 6339 // of the new flag __kmp_init_monitor. 6340 __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock); 6341 if (TCR_4(__kmp_init_monitor)) { 6342 __kmp_reap_monitor(&__kmp_monitor); 6343 TCW_4(__kmp_init_monitor, 0); 6344 } 6345 __kmp_release_bootstrap_lock(&__kmp_monitor_lock); 6346 KA_TRACE(10, ("__kmp_internal_end: monitor reaped\n")); 6347 #endif // KMP_USE_MONITOR 6348 } else { 6349 /* TODO move this to cleanup code */ 6350 #ifdef KMP_DEBUG 6351 /* make sure that everything has properly ended */ 6352 for (i = 0; i < __kmp_threads_capacity; i++) { 6353 if (__kmp_root[i]) { 6354 // KMP_ASSERT( ! KMP_UBER_GTID( i ) ); // AC: 6355 // there can be uber threads alive here 6356 KMP_ASSERT(!__kmp_root[i]->r.r_active); // TODO: can they be active? 6357 } 6358 } 6359 #endif 6360 6361 KMP_MB(); 6362 6363 // Reap the worker threads. 6364 // This is valid for now, but be careful if threads are reaped sooner. 6365 while (__kmp_thread_pool != NULL) { // Loop thru all the thread in the pool. 6366 // Get the next thread from the pool. 6367 kmp_info_t *thread = CCAST(kmp_info_t *, __kmp_thread_pool); 6368 __kmp_thread_pool = thread->th.th_next_pool; 6369 // Reap it. 6370 KMP_DEBUG_ASSERT(thread->th.th_reap_state == KMP_SAFE_TO_REAP); 6371 thread->th.th_next_pool = NULL; 6372 thread->th.th_in_pool = FALSE; 6373 __kmp_reap_thread(thread, 0); 6374 } 6375 __kmp_thread_pool_insert_pt = NULL; 6376 6377 // Reap teams. 6378 while (__kmp_team_pool != NULL) { // Loop thru all the teams in the pool. 6379 // Get the next team from the pool. 6380 kmp_team_t *team = CCAST(kmp_team_t *, __kmp_team_pool); 6381 __kmp_team_pool = team->t.t_next_pool; 6382 // Reap it. 6383 team->t.t_next_pool = NULL; 6384 __kmp_reap_team(team); 6385 } 6386 6387 __kmp_reap_task_teams(); 6388 6389 #if KMP_OS_UNIX 6390 // Threads that are not reaped should not access any resources since they 6391 // are going to be deallocated soon, so the shutdown sequence should wait 6392 // until all threads either exit the final spin-waiting loop or begin 6393 // sleeping after the given blocktime. 
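// (th_blocking appears to be maintained only on UNIX-like targets; its
// initialization in __kmp_allocate_thread is likewise guarded by KMP_OS_UNIX,
// matching the guard around this wait.)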
6394 for (i = 0; i < __kmp_threads_capacity; i++) { 6395 kmp_info_t *thr = __kmp_threads[i]; 6396 while (thr && KMP_ATOMIC_LD_ACQ(&thr->th.th_blocking)) 6397 KMP_CPU_PAUSE(); 6398 } 6399 #endif 6400 6401 for (i = 0; i < __kmp_threads_capacity; ++i) { 6402 // TBD: Add some checking... 6403 // Something like KMP_DEBUG_ASSERT( __kmp_thread[ i ] == NULL ); 6404 } 6405 6406 /* Make sure all threadprivate destructors get run by joining with all 6407 worker threads before resetting this flag */ 6408 TCW_SYNC_4(__kmp_init_common, FALSE); 6409 6410 KA_TRACE(10, ("__kmp_internal_end: all workers reaped\n")); 6411 KMP_MB(); 6412 6413 #if KMP_USE_MONITOR 6414 // See note above: One of the possible fixes for CQ138434 / CQ140126 6415 // 6416 // FIXME: push both code fragments down and CSE them? 6417 // push them into __kmp_cleanup() ? 6418 __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock); 6419 if (TCR_4(__kmp_init_monitor)) { 6420 __kmp_reap_monitor(&__kmp_monitor); 6421 TCW_4(__kmp_init_monitor, 0); 6422 } 6423 __kmp_release_bootstrap_lock(&__kmp_monitor_lock); 6424 KA_TRACE(10, ("__kmp_internal_end: monitor reaped\n")); 6425 #endif 6426 } /* else !__kmp_global.t_active */ 6427 TCW_4(__kmp_init_gtid, FALSE); 6428 KMP_MB(); /* Flush all pending memory write invalidates. */ 6429 6430 __kmp_cleanup(); 6431 #if OMPT_SUPPORT 6432 ompt_fini(); 6433 #endif 6434 } 6435 6436 void __kmp_internal_end_library(int gtid_req) { 6437 /* if we have already cleaned up, don't try again, it wouldn't be pretty */ 6438 /* this shouldn't be a race condition because __kmp_internal_end() is the 6439 only place to clear __kmp_serial_init */ 6440 /* we'll check this later too, after we get the lock */ 6441 // 2009-09-06: We do not set g_abort without setting g_done. This check looks 6442 // redundant, because the next check will work in any case. 6443 if (__kmp_global.g.g_abort) { 6444 KA_TRACE(11, ("__kmp_internal_end_library: abort, exiting\n")); 6445 /* TODO abort? */ 6446 return; 6447 } 6448 if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) { 6449 KA_TRACE(10, ("__kmp_internal_end_library: already finished\n")); 6450 return; 6451 } 6452 6453 // If hidden helper team has been initialized, we need to deinit it 6454 if (TCR_4(__kmp_init_hidden_helper) && 6455 !TCR_4(__kmp_hidden_helper_team_done)) { 6456 TCW_SYNC_4(__kmp_hidden_helper_team_done, TRUE); 6457 // First release the main thread to let it continue its work 6458 __kmp_hidden_helper_main_thread_release(); 6459 // Wait until the hidden helper team has been destroyed 6460 __kmp_hidden_helper_threads_deinitz_wait(); 6461 } 6462 6463 KMP_MB(); /* Flush all pending memory write invalidates. */ 6464 /* find out who we are and what we should do */ 6465 { 6466 int gtid = (gtid_req >= 0) ? gtid_req : __kmp_gtid_get_specific(); 6467 KA_TRACE( 6468 10, ("__kmp_internal_end_library: enter T#%d (%d)\n", gtid, gtid_req)); 6469 if (gtid == KMP_GTID_SHUTDOWN) { 6470 KA_TRACE(10, ("__kmp_internal_end_library: !__kmp_init_runtime, system " 6471 "already shutdown\n")); 6472 return; 6473 } else if (gtid == KMP_GTID_MONITOR) { 6474 KA_TRACE(10, ("__kmp_internal_end_library: monitor thread, gtid not " 6475 "registered, or system shutdown\n")); 6476 return; 6477 } else if (gtid == KMP_GTID_DNE) { 6478 KA_TRACE(10, ("__kmp_internal_end_library: gtid not registered or system " 6479 "shutdown\n")); 6480 /* we don't know who we are, but we may still shutdown the library */ 6481 } else if (KMP_UBER_GTID(gtid)) { 6482 /* unregister ourselves as an uber thread. 
gtid is no longer valid */ 6483 if (__kmp_root[gtid]->r.r_active) { 6484 __kmp_global.g.g_abort = -1; 6485 TCW_SYNC_4(__kmp_global.g.g_done, TRUE); 6486 __kmp_unregister_library(); 6487 KA_TRACE(10, 6488 ("__kmp_internal_end_library: root still active, abort T#%d\n", 6489 gtid)); 6490 return; 6491 } else { 6492 __kmp_itthash_clean(__kmp_threads[gtid]); 6493 KA_TRACE( 6494 10, 6495 ("__kmp_internal_end_library: unregistering sibling T#%d\n", gtid)); 6496 __kmp_unregister_root_current_thread(gtid); 6497 } 6498 } else { 6499 /* worker threads may call this function through the atexit handler, if they 6500 * call exit() */ 6501 /* For now, skip the usual subsequent processing and just dump the debug buffer. 6502 TODO: do a thorough shutdown instead */ 6503 #ifdef DUMP_DEBUG_ON_EXIT 6504 if (__kmp_debug_buf) 6505 __kmp_dump_debug_buffer(); 6506 #endif 6507 // added unregister library call here when we switch to shm linux 6508 // if we don't, it will leave lots of files in /dev/shm 6509 // cleanup shared memory file before exiting. 6510 __kmp_unregister_library(); 6511 return; 6512 } 6513 } 6514 /* synchronize the termination process */ 6515 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock); 6516 6517 /* have we already finished */ 6518 if (__kmp_global.g.g_abort) { 6519 KA_TRACE(10, ("__kmp_internal_end_library: abort, exiting\n")); 6520 /* TODO abort? */ 6521 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 6522 return; 6523 } 6524 if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) { 6525 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 6526 return; 6527 } 6528 6529 /* We need this lock to enforce mutex between this reading of 6530 __kmp_threads_capacity and the writing by __kmp_register_root. 6531 Alternatively, we can use a counter of roots that is atomically updated by 6532 __kmp_get_global_thread_id_reg, __kmp_do_serial_initialize and 6533 __kmp_internal_end_*. */ 6534 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock); 6535 6536 /* now we can safely conduct the actual termination */ 6537 __kmp_internal_end(); 6538 6539 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); 6540 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 6541 6542 KA_TRACE(10, ("__kmp_internal_end_library: exit\n")); 6543 6544 #ifdef DUMP_DEBUG_ON_EXIT 6545 if (__kmp_debug_buf) 6546 __kmp_dump_debug_buffer(); 6547 #endif 6548 6549 #if KMP_OS_WINDOWS 6550 __kmp_close_console(); 6551 #endif 6552 6553 __kmp_fini_allocator(); 6554 6555 } // __kmp_internal_end_library 6556 6557 void __kmp_internal_end_thread(int gtid_req) { 6558 int i; 6559 6560 /* if we have already cleaned up, don't try again, it wouldn't be pretty */ 6561 /* this shouldn't be a race condition because __kmp_internal_end() is the 6562 * only place to clear __kmp_serial_init */ 6563 /* we'll check this later too, after we get the lock */ 6564 // 2009-09-06: We do not set g_abort without setting g_done. This check looks 6565 // redundant, because the next check will work in any case. 6566 if (__kmp_global.g.g_abort) { 6567 KA_TRACE(11, ("__kmp_internal_end_thread: abort, exiting\n")); 6568 /* TODO abort? 
*/ 6569 return; 6570 } 6571 if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) { 6572 KA_TRACE(10, ("__kmp_internal_end_thread: already finished\n")); 6573 return; 6574 } 6575 6576 // If hidden helper team has been initialized, we need to deinit it 6577 if (TCR_4(__kmp_init_hidden_helper) && 6578 !TCR_4(__kmp_hidden_helper_team_done)) { 6579 TCW_SYNC_4(__kmp_hidden_helper_team_done, TRUE); 6580 // First release the main thread to let it continue its work 6581 __kmp_hidden_helper_main_thread_release(); 6582 // Wait until the hidden helper team has been destroyed 6583 __kmp_hidden_helper_threads_deinitz_wait(); 6584 } 6585 6586 KMP_MB(); /* Flush all pending memory write invalidates. */ 6587 6588 /* find out who we are and what we should do */ 6589 { 6590 int gtid = (gtid_req >= 0) ? gtid_req : __kmp_gtid_get_specific(); 6591 KA_TRACE(10, 6592 ("__kmp_internal_end_thread: enter T#%d (%d)\n", gtid, gtid_req)); 6593 if (gtid == KMP_GTID_SHUTDOWN) { 6594 KA_TRACE(10, ("__kmp_internal_end_thread: !__kmp_init_runtime, system " 6595 "already shutdown\n")); 6596 return; 6597 } else if (gtid == KMP_GTID_MONITOR) { 6598 KA_TRACE(10, ("__kmp_internal_end_thread: monitor thread, gtid not " 6599 "registered, or system shutdown\n")); 6600 return; 6601 } else if (gtid == KMP_GTID_DNE) { 6602 KA_TRACE(10, ("__kmp_internal_end_thread: gtid not registered or system " 6603 "shutdown\n")); 6604 return; 6605 /* we don't know who we are */ 6606 } else if (KMP_UBER_GTID(gtid)) { 6607 /* unregister ourselves as an uber thread. gtid is no longer valid */ 6608 if (__kmp_root[gtid]->r.r_active) { 6609 __kmp_global.g.g_abort = -1; 6610 TCW_SYNC_4(__kmp_global.g.g_done, TRUE); 6611 KA_TRACE(10, 6612 ("__kmp_internal_end_thread: root still active, abort T#%d\n", 6613 gtid)); 6614 return; 6615 } else { 6616 KA_TRACE(10, ("__kmp_internal_end_thread: unregistering sibling T#%d\n", 6617 gtid)); 6618 __kmp_unregister_root_current_thread(gtid); 6619 } 6620 } else { 6621 /* just a worker thread, let's leave */ 6622 KA_TRACE(10, ("__kmp_internal_end_thread: worker thread T#%d\n", gtid)); 6623 6624 if (gtid >= 0) { 6625 __kmp_threads[gtid]->th.th_task_team = NULL; 6626 } 6627 6628 KA_TRACE(10, 6629 ("__kmp_internal_end_thread: worker thread done, exiting T#%d\n", 6630 gtid)); 6631 return; 6632 } 6633 } 6634 #if KMP_DYNAMIC_LIB 6635 if (__kmp_pause_status != kmp_hard_paused) 6636 // AC: lets not shutdown the dynamic library at the exit of uber thread, 6637 // because we will better shutdown later in the library destructor. 6638 { 6639 KA_TRACE(10, ("__kmp_internal_end_thread: exiting T#%d\n", gtid_req)); 6640 return; 6641 } 6642 #endif 6643 /* synchronize the termination process */ 6644 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock); 6645 6646 /* have we already finished */ 6647 if (__kmp_global.g.g_abort) { 6648 KA_TRACE(10, ("__kmp_internal_end_thread: abort, exiting\n")); 6649 /* TODO abort? */ 6650 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 6651 return; 6652 } 6653 if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) { 6654 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 6655 return; 6656 } 6657 6658 /* We need this lock to enforce mutex between this reading of 6659 __kmp_threads_capacity and the writing by __kmp_register_root. 6660 Alternatively, we can use a counter of roots that is atomically updated by 6661 __kmp_get_global_thread_id_reg, __kmp_do_serial_initialize and 6662 __kmp_internal_end_*. */ 6663 6664 /* should we finish the run-time? are all siblings done? 
*/ 6665 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock); 6666 6667 for (i = 0; i < __kmp_threads_capacity; ++i) { 6668 if (KMP_UBER_GTID(i)) { 6669 KA_TRACE( 6670 10, 6671 ("__kmp_internal_end_thread: remaining sibling task: gtid==%d\n", i)); 6672 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); 6673 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 6674 return; 6675 } 6676 } 6677 6678 /* now we can safely conduct the actual termination */ 6679 6680 __kmp_internal_end(); 6681 6682 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); 6683 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 6684 6685 KA_TRACE(10, ("__kmp_internal_end_thread: exit T#%d\n", gtid_req)); 6686 6687 #ifdef DUMP_DEBUG_ON_EXIT 6688 if (__kmp_debug_buf) 6689 __kmp_dump_debug_buffer(); 6690 #endif 6691 } // __kmp_internal_end_thread 6692 6693 // ----------------------------------------------------------------------------- 6694 // Library registration stuff. 6695 6696 static long __kmp_registration_flag = 0; 6697 // Random value used to indicate library initialization. 6698 static char *__kmp_registration_str = NULL; 6699 // Value to be saved in env var __KMP_REGISTERED_LIB_<pid>. 6700 6701 static inline char *__kmp_reg_status_name() { 6702 /* On RHEL 3u5 if linked statically, getpid() returns different values in 6703 each thread. If registration and unregistration go in different threads 6704 (omp_misc_other_root_exit.cpp test case), the name of the registered_lib_env 6705 env var cannot be found, because the name will contain a different pid. */ 6706 // macOS* complains about the name being too long with the additional getuid() 6707 #if KMP_OS_UNIX && !KMP_OS_DARWIN && KMP_DYNAMIC_LIB 6708 return __kmp_str_format("__KMP_REGISTERED_LIB_%d_%d", (int)getpid(), 6709 (int)getuid()); 6710 #else 6711 return __kmp_str_format("__KMP_REGISTERED_LIB_%d", (int)getpid()); 6712 #endif 6713 } // __kmp_reg_status_name 6714 6715 #if defined(KMP_USE_SHM) 6716 // If /dev/shm is not accessible, we will create a temporary file under /tmp. 6717 char *temp_reg_status_file_name = nullptr; 6718 #endif 6719 6720 void __kmp_register_library_startup(void) { 6721 6722 char *name = __kmp_reg_status_name(); // Name of the environment variable. 6723 int done = 0; 6724 union { 6725 double dtime; 6726 long ltime; 6727 } time; 6728 #if KMP_ARCH_X86 || KMP_ARCH_X86_64 6729 __kmp_initialize_system_tick(); 6730 #endif 6731 __kmp_read_system_time(&time.dtime); 6732 __kmp_registration_flag = 0xCAFE0000L | (time.ltime & 0x0000FFFFL); 6733 __kmp_registration_str = 6734 __kmp_str_format("%p-%lx-%s", &__kmp_registration_flag, 6735 __kmp_registration_flag, KMP_LIBRARY_FILE); 6736 6737 KA_TRACE(50, ("__kmp_register_library_startup: %s=\"%s\"\n", name, 6738 __kmp_registration_str)); 6739 6740 while (!done) { 6741 6742 char *value = NULL; // Actual value of the environment variable. 6743 6744 #if defined(KMP_USE_SHM) 6745 char *shm_name = __kmp_str_format("/%s", name); 6746 int shm_preexist = 0; 6747 char *data1; 6748 int fd1 = shm_open(shm_name, O_CREAT | O_EXCL | O_RDWR, 0666); 6749 if ((fd1 == -1) && (errno == EEXIST)) { 6750 // file didn't open because it already exists.
6751 // try opening existing file 6752 fd1 = shm_open(shm_name, O_RDWR, 0666); 6753 if (fd1 == -1) { // file didn't open 6754 // error out here 6755 __kmp_fatal(KMP_MSG(FunctionError, "Can't open SHM"), KMP_ERR(0), 6756 __kmp_msg_null); 6757 } else { 6758 // able to open existing file 6759 shm_preexist = 1; 6760 } 6761 } else if (fd1 == -1) { 6762 // SHM didn't open; the failure was due to an error other than EEXIST. Try to 6763 // create a temp file under /tmp. 6764 // TODO: /tmp might not always be the temporary directory. For now we will 6765 // not consider TMPDIR. If /tmp is not accessible, we simply error out. 6766 char *temp_file_name = __kmp_str_format("/tmp/%sXXXXXX", name); 6767 fd1 = mkstemp(temp_file_name); 6768 if (fd1 == -1) { 6769 // error out here. 6770 __kmp_fatal(KMP_MSG(FunctionError, "Can't open TEMP"), KMP_ERR(errno), 6771 __kmp_msg_null); 6772 } 6773 temp_reg_status_file_name = temp_file_name; 6774 } 6775 if (shm_preexist == 0) { 6776 // we created the SHM object; now set its size 6777 if (ftruncate(fd1, SHM_SIZE) == -1) { 6778 // error occurred setting size 6779 __kmp_fatal(KMP_MSG(FunctionError, "Can't set size of SHM"), 6780 KMP_ERR(errno), __kmp_msg_null); 6781 } 6782 } 6783 data1 = 6784 (char *)mmap(0, SHM_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED, fd1, 0); 6785 if (data1 == MAP_FAILED) { 6786 // failed to map shared memory 6787 __kmp_fatal(KMP_MSG(FunctionError, "Can't map SHM"), KMP_ERR(errno), 6788 __kmp_msg_null); 6789 } 6790 if (shm_preexist == 0) { // set data to SHM, set value 6791 KMP_STRCPY_S(data1, SHM_SIZE, __kmp_registration_str); 6792 } 6793 // Read value from either what we just wrote or existing file. 6794 value = __kmp_str_format("%s", data1); // read value from SHM 6795 munmap(data1, SHM_SIZE); 6796 close(fd1); 6797 #else // Windows and unix with static library 6798 // Set the environment variable, but do not overwrite it if it already exists. 6799 __kmp_env_set(name, __kmp_registration_str, 0); 6800 // read value to see if it got set 6801 value = __kmp_env_get(name); 6802 #endif 6803 6804 if (value != NULL && strcmp(value, __kmp_registration_str) == 0) { 6805 done = 1; // Ok, environment variable set successfully, exit the loop. 6806 } else { 6807 // Oops. The write failed. Another copy of the OpenMP RTL is in memory. 6808 // Check whether it is alive or dead. 6809 int neighbor = 0; // 0 -- unknown status, 1 -- alive, 2 -- dead. 6810 char *tail = value; 6811 char *flag_addr_str = NULL; 6812 char *flag_val_str = NULL; 6813 char const *file_name = NULL; 6814 __kmp_str_split(tail, '-', &flag_addr_str, &tail); 6815 __kmp_str_split(tail, '-', &flag_val_str, &tail); 6816 file_name = tail; 6817 if (tail != NULL) { 6818 unsigned long *flag_addr = 0; 6819 unsigned long flag_val = 0; 6820 KMP_SSCANF(flag_addr_str, "%p", RCAST(void **, &flag_addr)); 6821 KMP_SSCANF(flag_val_str, "%lx", &flag_val); 6822 if (flag_addr != 0 && flag_val != 0 && strcmp(file_name, "") != 0) { 6823 // First, check whether the environment-encoded address is mapped into 6824 // our address space. 6825 // If so, dereference it to see if it still has the right value. 6826 if (__kmp_is_address_mapped(flag_addr) && *flag_addr == flag_val) { 6827 neighbor = 1; 6828 } else { 6829 // If not, then we know the other copy of the library is no longer 6830 // running. 6831 neighbor = 2; 6832 } 6833 } 6834 } 6835 switch (neighbor) { 6836 case 0: // Cannot parse environment variable -- neighbor status unknown. 6837 // Assume it is the incompatible format of a future version of the 6838 // library. Assume the other library is alive. 6839 // WARN( ...
); // TODO: Issue a warning. 6840 file_name = "unknown library"; 6841 KMP_FALLTHROUGH(); 6842 // Attention! Falling to the next case. That's intentional. 6843 case 1: { // Neighbor is alive. 6844 // Check it is allowed. 6845 char *duplicate_ok = __kmp_env_get("KMP_DUPLICATE_LIB_OK"); 6846 if (!__kmp_str_match_true(duplicate_ok)) { 6847 // That's not allowed. Issue fatal error. 6848 __kmp_fatal(KMP_MSG(DuplicateLibrary, KMP_LIBRARY_FILE, file_name), 6849 KMP_HNT(DuplicateLibrary), __kmp_msg_null); 6850 } 6851 KMP_INTERNAL_FREE(duplicate_ok); 6852 __kmp_duplicate_library_ok = 1; 6853 done = 1; // Exit the loop. 6854 } break; 6855 case 2: { // Neighbor is dead. 6856 6857 #if defined(KMP_USE_SHM) 6858 // close shared memory. 6859 shm_unlink(shm_name); // this removes file in /dev/shm 6860 #else 6861 // Clear the variable and try to register library again. 6862 __kmp_env_unset(name); 6863 #endif 6864 } break; 6865 default: { 6866 KMP_DEBUG_ASSERT(0); 6867 } break; 6868 } 6869 } 6870 KMP_INTERNAL_FREE((void *)value); 6871 #if defined(KMP_USE_SHM) 6872 KMP_INTERNAL_FREE((void *)shm_name); 6873 #endif 6874 } // while 6875 KMP_INTERNAL_FREE((void *)name); 6876 6877 } // func __kmp_register_library_startup 6878 6879 void __kmp_unregister_library(void) { 6880 6881 char *name = __kmp_reg_status_name(); 6882 char *value = NULL; 6883 6884 #if defined(KMP_USE_SHM) 6885 bool use_shm = true; 6886 char *shm_name = __kmp_str_format("/%s", name); 6887 int fd1 = shm_open(shm_name, O_RDONLY, 0666); 6888 if (fd1 == -1) { 6889 // File did not open. Try the temporary file. 6890 use_shm = false; 6891 KMP_DEBUG_ASSERT(temp_reg_status_file_name); 6892 fd1 = open(temp_reg_status_file_name, O_RDONLY); 6893 if (fd1 == -1) { 6894 // give it up now. 6895 return; 6896 } 6897 } 6898 char *data1 = (char *)mmap(0, SHM_SIZE, PROT_READ, MAP_SHARED, fd1, 0); 6899 if (data1 != MAP_FAILED) { 6900 value = __kmp_str_format("%s", data1); // read value from SHM 6901 munmap(data1, SHM_SIZE); 6902 } 6903 close(fd1); 6904 #else 6905 value = __kmp_env_get(name); 6906 #endif 6907 6908 KMP_DEBUG_ASSERT(__kmp_registration_flag != 0); 6909 KMP_DEBUG_ASSERT(__kmp_registration_str != NULL); 6910 if (value != NULL && strcmp(value, __kmp_registration_str) == 0) { 6911 // Ok, this is our variable. Delete it. 6912 #if defined(KMP_USE_SHM) 6913 if (use_shm) { 6914 shm_unlink(shm_name); // this removes file in /dev/shm 6915 } else { 6916 KMP_DEBUG_ASSERT(temp_reg_status_file_name); 6917 unlink(temp_reg_status_file_name); // this removes the temp file 6918 } 6919 #else 6920 __kmp_env_unset(name); 6921 #endif 6922 } 6923 6924 #if defined(KMP_USE_SHM) 6925 KMP_INTERNAL_FREE(shm_name); 6926 if (!use_shm) { 6927 KMP_DEBUG_ASSERT(temp_reg_status_file_name); 6928 KMP_INTERNAL_FREE(temp_reg_status_file_name); 6929 } 6930 #endif 6931 6932 KMP_INTERNAL_FREE(__kmp_registration_str); 6933 KMP_INTERNAL_FREE(value); 6934 KMP_INTERNAL_FREE(name); 6935 6936 __kmp_registration_flag = 0; 6937 __kmp_registration_str = NULL; 6938 6939 } // __kmp_unregister_library 6940 6941 // End of Library registration stuff. 
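// Addendum: a minimal sketch of the liveness probe used by the registration
// scheme above. This is illustrative only and not part of the runtime; the
// helper names (example_is_address_mapped, example_probe_neighbor) are
// hypothetical stand-ins. A registration value has the form
// "<flag address>-<flag value>-<library file>"; a neighbor is considered
// alive only if the advertised address is mapped in this process and still
// holds the advertised value.
#if 0
#include <stdio.h>
#include <string.h>

// Hypothetical stand-in for __kmp_is_address_mapped(): reports whether addr
// is mapped into the current address space.
extern int example_is_address_mapped(void *addr);

// Returns 1 if the registration string describes a live neighbor, 2 if it is
// stale, and 0 if it cannot be parsed (unknown status).
static int example_probe_neighbor(const char *value) {
  void *addr_raw = NULL;
  unsigned long flag_val = 0;
  char file_name[256] = "";
  if (sscanf(value, "%p-%lx-%255s", &addr_raw, &flag_val, file_name) != 3 ||
      addr_raw == NULL || flag_val == 0 || file_name[0] == '\0')
    return 0; // unparsable, e.g. a future format of the library
  unsigned long *flag_addr = (unsigned long *)addr_raw;
  if (example_is_address_mapped(flag_addr) && *flag_addr == flag_val)
    return 1; // the other copy of the runtime is alive in this process
  return 2; // stale value left behind by a process that is gone
}
#endif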
6942 // ----------------------------------------------------------------------------- 6943 6944 #if KMP_MIC_SUPPORTED 6945 6946 static void __kmp_check_mic_type() { 6947 kmp_cpuid_t cpuid_state = {0}; 6948 kmp_cpuid_t *cs_p = &cpuid_state; 6949 __kmp_x86_cpuid(1, 0, cs_p); 6950 // We don't support mic1 at the moment 6951 if ((cs_p->eax & 0xff0) == 0xB10) { 6952 __kmp_mic_type = mic2; 6953 } else if ((cs_p->eax & 0xf0ff0) == 0x50670) { 6954 __kmp_mic_type = mic3; 6955 } else { 6956 __kmp_mic_type = non_mic; 6957 } 6958 } 6959 6960 #endif /* KMP_MIC_SUPPORTED */ 6961 6962 #if KMP_HAVE_UMWAIT 6963 static void __kmp_user_level_mwait_init() { 6964 struct kmp_cpuid buf; 6965 __kmp_x86_cpuid(7, 0, &buf); 6966 __kmp_waitpkg_enabled = ((buf.ecx >> 5) & 1); 6967 __kmp_umwait_enabled = __kmp_waitpkg_enabled && __kmp_user_level_mwait; 6968 __kmp_tpause_enabled = __kmp_waitpkg_enabled && (__kmp_tpause_state > 0); 6969 KF_TRACE(30, ("__kmp_user_level_mwait_init: __kmp_umwait_enabled = %d\n", 6970 __kmp_umwait_enabled)); 6971 } 6972 #elif KMP_HAVE_MWAIT 6973 #ifndef AT_INTELPHIUSERMWAIT 6974 // Spurious, non-existent value that should always fail to return anything. 6975 // Will be replaced with the correct value when we know that. 6976 #define AT_INTELPHIUSERMWAIT 10000 6977 #endif 6978 // getauxval() function is available in RHEL7 and SLES12. If a system with an 6979 // earlier OS is used to build the RTL, we'll use the following internal 6980 // function when the entry is not found. 6981 unsigned long getauxval(unsigned long) KMP_WEAK_ATTRIBUTE_EXTERNAL; 6982 unsigned long getauxval(unsigned long) { return 0; } 6983 6984 static void __kmp_user_level_mwait_init() { 6985 // When getauxval() and correct value of AT_INTELPHIUSERMWAIT are available 6986 // use them to find if the user-level mwait is enabled. Otherwise, forcibly 6987 // set __kmp_mwait_enabled=TRUE on Intel MIC if the environment variable 6988 // KMP_USER_LEVEL_MWAIT was set to TRUE. 6989 if (__kmp_mic_type == mic3) { 6990 unsigned long res = getauxval(AT_INTELPHIUSERMWAIT); 6991 if ((res & 0x1) || __kmp_user_level_mwait) { 6992 __kmp_mwait_enabled = TRUE; 6993 if (__kmp_user_level_mwait) { 6994 KMP_INFORM(EnvMwaitWarn); 6995 } 6996 } else { 6997 __kmp_mwait_enabled = FALSE; 6998 } 6999 } 7000 KF_TRACE(30, ("__kmp_user_level_mwait_init: __kmp_mic_type = %d, " 7001 "__kmp_mwait_enabled = %d\n", 7002 __kmp_mic_type, __kmp_mwait_enabled)); 7003 } 7004 #endif /* KMP_HAVE_UMWAIT */ 7005 7006 static void __kmp_do_serial_initialize(void) { 7007 int i, gtid; 7008 size_t size; 7009 7010 KA_TRACE(10, ("__kmp_do_serial_initialize: enter\n")); 7011 7012 KMP_DEBUG_ASSERT(sizeof(kmp_int32) == 4); 7013 KMP_DEBUG_ASSERT(sizeof(kmp_uint32) == 4); 7014 KMP_DEBUG_ASSERT(sizeof(kmp_int64) == 8); 7015 KMP_DEBUG_ASSERT(sizeof(kmp_uint64) == 8); 7016 KMP_DEBUG_ASSERT(sizeof(kmp_intptr_t) == sizeof(void *)); 7017 7018 #if OMPT_SUPPORT 7019 ompt_pre_init(); 7020 #endif 7021 #if OMPD_SUPPORT 7022 __kmp_env_dump(); 7023 ompd_init(); 7024 #endif 7025 7026 __kmp_validate_locks(); 7027 7028 #if ENABLE_LIBOMPTARGET 7029 /* Initialize functions from libomptarget */ 7030 __kmp_init_omptarget(); 7031 #endif 7032 7033 /* Initialize internal memory allocator */ 7034 __kmp_init_allocator(); 7035 7036 /* Register the library startup via an environment variable or via mapped 7037 shared memory file and check to see whether another copy of the library is 7038 already registered. 
Since forked child process is often terminated, we 7039 postpone the registration till middle initialization in the child */ 7040 if (__kmp_need_register_serial) 7041 __kmp_register_library_startup(); 7042 7043 /* TODO reinitialization of library */ 7044 if (TCR_4(__kmp_global.g.g_done)) { 7045 KA_TRACE(10, ("__kmp_do_serial_initialize: reinitialization of library\n")); 7046 } 7047 7048 __kmp_global.g.g_abort = 0; 7049 TCW_SYNC_4(__kmp_global.g.g_done, FALSE); 7050 7051 /* initialize the locks */ 7052 #if KMP_USE_ADAPTIVE_LOCKS 7053 #if KMP_DEBUG_ADAPTIVE_LOCKS 7054 __kmp_init_speculative_stats(); 7055 #endif 7056 #endif 7057 #if KMP_STATS_ENABLED 7058 __kmp_stats_init(); 7059 #endif 7060 __kmp_init_lock(&__kmp_global_lock); 7061 __kmp_init_queuing_lock(&__kmp_dispatch_lock); 7062 __kmp_init_lock(&__kmp_debug_lock); 7063 __kmp_init_atomic_lock(&__kmp_atomic_lock); 7064 __kmp_init_atomic_lock(&__kmp_atomic_lock_1i); 7065 __kmp_init_atomic_lock(&__kmp_atomic_lock_2i); 7066 __kmp_init_atomic_lock(&__kmp_atomic_lock_4i); 7067 __kmp_init_atomic_lock(&__kmp_atomic_lock_4r); 7068 __kmp_init_atomic_lock(&__kmp_atomic_lock_8i); 7069 __kmp_init_atomic_lock(&__kmp_atomic_lock_8r); 7070 __kmp_init_atomic_lock(&__kmp_atomic_lock_8c); 7071 __kmp_init_atomic_lock(&__kmp_atomic_lock_10r); 7072 __kmp_init_atomic_lock(&__kmp_atomic_lock_16r); 7073 __kmp_init_atomic_lock(&__kmp_atomic_lock_16c); 7074 __kmp_init_atomic_lock(&__kmp_atomic_lock_20c); 7075 __kmp_init_atomic_lock(&__kmp_atomic_lock_32c); 7076 __kmp_init_bootstrap_lock(&__kmp_forkjoin_lock); 7077 __kmp_init_bootstrap_lock(&__kmp_exit_lock); 7078 #if KMP_USE_MONITOR 7079 __kmp_init_bootstrap_lock(&__kmp_monitor_lock); 7080 #endif 7081 __kmp_init_bootstrap_lock(&__kmp_tp_cached_lock); 7082 7083 /* conduct initialization and initial setup of configuration */ 7084 7085 __kmp_runtime_initialize(); 7086 7087 #if KMP_MIC_SUPPORTED 7088 __kmp_check_mic_type(); 7089 #endif 7090 7091 // Some global variable initialization moved here from kmp_env_initialize() 7092 #ifdef KMP_DEBUG 7093 kmp_diag = 0; 7094 #endif 7095 __kmp_abort_delay = 0; 7096 7097 // From __kmp_init_dflt_team_nth() 7098 /* assume the entire machine will be used */ 7099 __kmp_dflt_team_nth_ub = __kmp_xproc; 7100 if (__kmp_dflt_team_nth_ub < KMP_MIN_NTH) { 7101 __kmp_dflt_team_nth_ub = KMP_MIN_NTH; 7102 } 7103 if (__kmp_dflt_team_nth_ub > __kmp_sys_max_nth) { 7104 __kmp_dflt_team_nth_ub = __kmp_sys_max_nth; 7105 } 7106 __kmp_max_nth = __kmp_sys_max_nth; 7107 __kmp_cg_max_nth = __kmp_sys_max_nth; 7108 __kmp_teams_max_nth = __kmp_xproc; // set a "reasonable" default 7109 if (__kmp_teams_max_nth > __kmp_sys_max_nth) { 7110 __kmp_teams_max_nth = __kmp_sys_max_nth; 7111 } 7112 7113 // Three vars below moved here from __kmp_env_initialize() "KMP_BLOCKTIME" 7114 // part 7115 __kmp_dflt_blocktime = KMP_DEFAULT_BLOCKTIME; 7116 #if KMP_USE_MONITOR 7117 __kmp_monitor_wakeups = 7118 KMP_WAKEUPS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups); 7119 __kmp_bt_intervals = 7120 KMP_INTERVALS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups); 7121 #endif 7122 // From "KMP_LIBRARY" part of __kmp_env_initialize() 7123 __kmp_library = library_throughput; 7124 // From KMP_SCHEDULE initialization 7125 __kmp_static = kmp_sch_static_balanced; 7126 // AC: do not use analytical here, because it is non-monotonous 7127 //__kmp_guided = kmp_sch_guided_iterative_chunked; 7128 //__kmp_auto = kmp_sch_guided_analytical_chunked; // AC: it is the default, no 7129 // need to repeat assignment 7130 // 
Barrier initialization. Moved here from __kmp_env_initialize() Barrier branch 7131 // bit control and barrier method control parts 7132 #if KMP_FAST_REDUCTION_BARRIER 7133 #define kmp_reduction_barrier_gather_bb ((int)1) 7134 #define kmp_reduction_barrier_release_bb ((int)1) 7135 #define kmp_reduction_barrier_gather_pat __kmp_barrier_gather_pat_dflt 7136 #define kmp_reduction_barrier_release_pat __kmp_barrier_release_pat_dflt 7137 #endif // KMP_FAST_REDUCTION_BARRIER 7138 for (i = bs_plain_barrier; i < bs_last_barrier; i++) { 7139 __kmp_barrier_gather_branch_bits[i] = __kmp_barrier_gather_bb_dflt; 7140 __kmp_barrier_release_branch_bits[i] = __kmp_barrier_release_bb_dflt; 7141 __kmp_barrier_gather_pattern[i] = __kmp_barrier_gather_pat_dflt; 7142 __kmp_barrier_release_pattern[i] = __kmp_barrier_release_pat_dflt; 7143 #if KMP_FAST_REDUCTION_BARRIER 7144 if (i == bs_reduction_barrier) { // tested and confirmed on ALTIX only ( 7145 // lin_64 ): hyper,1 7146 __kmp_barrier_gather_branch_bits[i] = kmp_reduction_barrier_gather_bb; 7147 __kmp_barrier_release_branch_bits[i] = kmp_reduction_barrier_release_bb; 7148 __kmp_barrier_gather_pattern[i] = kmp_reduction_barrier_gather_pat; 7149 __kmp_barrier_release_pattern[i] = kmp_reduction_barrier_release_pat; 7150 } 7151 #endif // KMP_FAST_REDUCTION_BARRIER 7152 } 7153 #if KMP_FAST_REDUCTION_BARRIER 7154 #undef kmp_reduction_barrier_release_pat 7155 #undef kmp_reduction_barrier_gather_pat 7156 #undef kmp_reduction_barrier_release_bb 7157 #undef kmp_reduction_barrier_gather_bb 7158 #endif // KMP_FAST_REDUCTION_BARRIER 7159 #if KMP_MIC_SUPPORTED 7160 if (__kmp_mic_type == mic2) { // KNC 7161 // AC: plane=3,2, forkjoin=2,1 are optimal for 240 threads on KNC 7162 __kmp_barrier_gather_branch_bits[bs_plain_barrier] = 3; // plain gather 7163 __kmp_barrier_release_branch_bits[bs_forkjoin_barrier] = 7164 1; // forkjoin release 7165 __kmp_barrier_gather_pattern[bs_forkjoin_barrier] = bp_hierarchical_bar; 7166 __kmp_barrier_release_pattern[bs_forkjoin_barrier] = bp_hierarchical_bar; 7167 } 7168 #if KMP_FAST_REDUCTION_BARRIER 7169 if (__kmp_mic_type == mic2) { // KNC 7170 __kmp_barrier_gather_pattern[bs_reduction_barrier] = bp_hierarchical_bar; 7171 __kmp_barrier_release_pattern[bs_reduction_barrier] = bp_hierarchical_bar; 7172 } 7173 #endif // KMP_FAST_REDUCTION_BARRIER 7174 #endif // KMP_MIC_SUPPORTED 7175 7176 // From KMP_CHECKS initialization 7177 #ifdef KMP_DEBUG 7178 __kmp_env_checks = TRUE; /* development versions have the extra checks */ 7179 #else 7180 __kmp_env_checks = FALSE; /* port versions do not have the extra checks */ 7181 #endif 7182 7183 // From "KMP_FOREIGN_THREADS_THREADPRIVATE" initialization 7184 __kmp_foreign_tp = TRUE; 7185 7186 __kmp_global.g.g_dynamic = FALSE; 7187 __kmp_global.g.g_dynamic_mode = dynamic_default; 7188 7189 __kmp_init_nesting_mode(); 7190 7191 __kmp_env_initialize(NULL); 7192 7193 #if KMP_HAVE_MWAIT || KMP_HAVE_UMWAIT 7194 __kmp_user_level_mwait_init(); 7195 #endif 7196 // Print all messages in message catalog for testing purposes. 
7197 #ifdef KMP_DEBUG 7198 char const *val = __kmp_env_get("KMP_DUMP_CATALOG"); 7199 if (__kmp_str_match_true(val)) { 7200 kmp_str_buf_t buffer; 7201 __kmp_str_buf_init(&buffer); 7202 __kmp_i18n_dump_catalog(&buffer); 7203 __kmp_printf("%s", buffer.str); 7204 __kmp_str_buf_free(&buffer); 7205 } 7206 __kmp_env_free(&val); 7207 #endif 7208 7209 __kmp_threads_capacity = 7210 __kmp_initial_threads_capacity(__kmp_dflt_team_nth_ub); 7211 // Moved here from __kmp_env_initialize() "KMP_ALL_THREADPRIVATE" part 7212 __kmp_tp_capacity = __kmp_default_tp_capacity( 7213 __kmp_dflt_team_nth_ub, __kmp_max_nth, __kmp_allThreadsSpecified); 7214 7215 // If the library is shut down properly, both pools must be NULL. Just in 7216 // case, set them to NULL -- some memory may leak, but subsequent code will 7217 // work even if pools are not freed. 7218 KMP_DEBUG_ASSERT(__kmp_thread_pool == NULL); 7219 KMP_DEBUG_ASSERT(__kmp_thread_pool_insert_pt == NULL); 7220 KMP_DEBUG_ASSERT(__kmp_team_pool == NULL); 7221 __kmp_thread_pool = NULL; 7222 __kmp_thread_pool_insert_pt = NULL; 7223 __kmp_team_pool = NULL; 7224 7225 /* Allocate all of the variable sized records */ 7226 /* NOTE: __kmp_threads_capacity entries are allocated, but the arrays are 7227 * expandable */ 7228 /* Since allocation is cache-aligned, just add extra padding at the end */ 7229 size = 7230 (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * __kmp_threads_capacity + 7231 CACHE_LINE; 7232 __kmp_threads = (kmp_info_t **)__kmp_allocate(size); 7233 __kmp_root = (kmp_root_t **)((char *)__kmp_threads + 7234 sizeof(kmp_info_t *) * __kmp_threads_capacity); 7235 7236 /* init thread counts */ 7237 KMP_DEBUG_ASSERT(__kmp_all_nth == 7238 0); // Asserts fail if the library is reinitializing and 7239 KMP_DEBUG_ASSERT(__kmp_nth == 0); // something was wrong in termination. 7240 __kmp_all_nth = 0; 7241 __kmp_nth = 0; 7242 7243 /* setup the uber master thread and hierarchy */ 7244 gtid = __kmp_register_root(TRUE); 7245 KA_TRACE(10, ("__kmp_do_serial_initialize T#%d\n", gtid)); 7246 KMP_ASSERT(KMP_UBER_GTID(gtid)); 7247 KMP_ASSERT(KMP_INITIAL_GTID(gtid)); 7248 7249 KMP_MB(); /* Flush all pending memory write invalidates. */ 7250 7251 __kmp_common_initialize(); 7252 7253 #if KMP_OS_UNIX 7254 /* invoke the child fork handler */ 7255 __kmp_register_atfork(); 7256 #endif 7257 7258 #if !KMP_DYNAMIC_LIB || \ 7259 ((KMP_COMPILER_ICC || KMP_COMPILER_ICX) && KMP_OS_DARWIN) 7260 { 7261 /* Invoke the exit handler when the program finishes, only for static 7262 library and macOS* dynamic. For other dynamic libraries, we already 7263 have _fini and DllMain. */ 7264 int rc = atexit(__kmp_internal_end_atexit); 7265 if (rc != 0) { 7266 __kmp_fatal(KMP_MSG(FunctionError, "atexit()"), KMP_ERR(rc), 7267 __kmp_msg_null); 7268 } 7269 } 7270 #endif 7271 7272 #if KMP_HANDLE_SIGNALS 7273 #if KMP_OS_UNIX 7274 /* NOTE: make sure that this is called before the user installs their own 7275 signal handlers so that the user handlers are called first. this way they 7276 can return false, not call our handler, avoid terminating the library, and 7277 continue execution where they left off. 
*/ 7278 __kmp_install_signals(FALSE); 7279 #endif /* KMP_OS_UNIX */ 7280 #if KMP_OS_WINDOWS 7281 __kmp_install_signals(TRUE); 7282 #endif /* KMP_OS_WINDOWS */ 7283 #endif 7284 7285 /* we have finished the serial initialization */ 7286 __kmp_init_counter++; 7287 7288 __kmp_init_serial = TRUE; 7289 7290 if (__kmp_version) { 7291 __kmp_print_version_1(); 7292 } 7293 7294 if (__kmp_settings) { 7295 __kmp_env_print(); 7296 } 7297 7298 if (__kmp_display_env || __kmp_display_env_verbose) { 7299 __kmp_env_print_2(); 7300 } 7301 7302 #if OMPT_SUPPORT 7303 ompt_post_init(); 7304 #endif 7305 7306 KMP_MB(); 7307 7308 KA_TRACE(10, ("__kmp_do_serial_initialize: exit\n")); 7309 } 7310 7311 void __kmp_serial_initialize(void) { 7312 if (__kmp_init_serial) { 7313 return; 7314 } 7315 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock); 7316 if (__kmp_init_serial) { 7317 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 7318 return; 7319 } 7320 __kmp_do_serial_initialize(); 7321 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 7322 } 7323 7324 static void __kmp_do_middle_initialize(void) { 7325 int i, j; 7326 int prev_dflt_team_nth; 7327 7328 if (!__kmp_init_serial) { 7329 __kmp_do_serial_initialize(); 7330 } 7331 7332 KA_TRACE(10, ("__kmp_middle_initialize: enter\n")); 7333 7334 if (UNLIKELY(!__kmp_need_register_serial)) { 7335 // We are in a forked child process. The registration was skipped during 7336 // serial initialization in __kmp_atfork_child handler. Do it here. 7337 __kmp_register_library_startup(); 7338 } 7339 7340 // Save the previous value for the __kmp_dflt_team_nth so that 7341 // we can avoid some reinitialization if it hasn't changed. 7342 prev_dflt_team_nth = __kmp_dflt_team_nth; 7343 7344 #if KMP_AFFINITY_SUPPORTED 7345 // __kmp_affinity_initialize() will try to set __kmp_ncores to the 7346 // number of cores on the machine. 7347 __kmp_affinity_initialize(__kmp_affinity); 7348 7349 #endif /* KMP_AFFINITY_SUPPORTED */ 7350 7351 KMP_ASSERT(__kmp_xproc > 0); 7352 if (__kmp_avail_proc == 0) { 7353 __kmp_avail_proc = __kmp_xproc; 7354 } 7355 7356 // If there were empty places in num_threads list (OMP_NUM_THREADS=,,2,3), 7357 // correct them now 7358 j = 0; 7359 while ((j < __kmp_nested_nth.used) && !__kmp_nested_nth.nth[j]) { 7360 __kmp_nested_nth.nth[j] = __kmp_dflt_team_nth = __kmp_dflt_team_nth_ub = 7361 __kmp_avail_proc; 7362 j++; 7363 } 7364 7365 if (__kmp_dflt_team_nth == 0) { 7366 #ifdef KMP_DFLT_NTH_CORES 7367 // Default #threads = #cores 7368 __kmp_dflt_team_nth = __kmp_ncores; 7369 KA_TRACE(20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = " 7370 "__kmp_ncores (%d)\n", 7371 __kmp_dflt_team_nth)); 7372 #else 7373 // Default #threads = #available OS procs 7374 __kmp_dflt_team_nth = __kmp_avail_proc; 7375 KA_TRACE(20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = " 7376 "__kmp_avail_proc(%d)\n", 7377 __kmp_dflt_team_nth)); 7378 #endif /* KMP_DFLT_NTH_CORES */ 7379 } 7380 7381 if (__kmp_dflt_team_nth < KMP_MIN_NTH) { 7382 __kmp_dflt_team_nth = KMP_MIN_NTH; 7383 } 7384 if (__kmp_dflt_team_nth > __kmp_sys_max_nth) { 7385 __kmp_dflt_team_nth = __kmp_sys_max_nth; 7386 } 7387 7388 if (__kmp_nesting_mode > 0) 7389 __kmp_set_nesting_mode_threads(); 7390 7391 // There's no harm in continuing if the following check fails, 7392 // but it indicates an error in the previous logic. 
7393 KMP_DEBUG_ASSERT(__kmp_dflt_team_nth <= __kmp_dflt_team_nth_ub); 7394 7395 if (__kmp_dflt_team_nth != prev_dflt_team_nth) { 7396 // Run through the __kmp_threads array and set the num threads icv for each 7397 // root thread that is currently registered with the RTL (which has not 7398 // already explicitly set its nthreads-var with a call to 7399 // omp_set_num_threads()). 7400 for (i = 0; i < __kmp_threads_capacity; i++) { 7401 kmp_info_t *thread = __kmp_threads[i]; 7402 if (thread == NULL) 7403 continue; 7404 if (thread->th.th_current_task->td_icvs.nproc != 0) 7405 continue; 7406 7407 set__nproc(__kmp_threads[i], __kmp_dflt_team_nth); 7408 } 7409 } 7410 KA_TRACE( 7411 20, 7412 ("__kmp_middle_initialize: final value for __kmp_dflt_team_nth = %d\n", 7413 __kmp_dflt_team_nth)); 7414 7415 #ifdef KMP_ADJUST_BLOCKTIME 7416 /* Adjust blocktime to zero if necessary now that __kmp_avail_proc is set */ 7417 if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) { 7418 KMP_DEBUG_ASSERT(__kmp_avail_proc > 0); 7419 if (__kmp_nth > __kmp_avail_proc) { 7420 __kmp_zero_bt = TRUE; 7421 } 7422 } 7423 #endif /* KMP_ADJUST_BLOCKTIME */ 7424 7425 /* we have finished middle initialization */ 7426 TCW_SYNC_4(__kmp_init_middle, TRUE); 7427 7428 KA_TRACE(10, ("__kmp_do_middle_initialize: exit\n")); 7429 } 7430 7431 void __kmp_middle_initialize(void) { 7432 if (__kmp_init_middle) { 7433 return; 7434 } 7435 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock); 7436 if (__kmp_init_middle) { 7437 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 7438 return; 7439 } 7440 __kmp_do_middle_initialize(); 7441 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 7442 } 7443 7444 void __kmp_parallel_initialize(void) { 7445 int gtid = __kmp_entry_gtid(); // this might be a new root 7446 7447 /* synchronize parallel initialization (for sibling) */ 7448 if (TCR_4(__kmp_init_parallel)) 7449 return; 7450 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock); 7451 if (TCR_4(__kmp_init_parallel)) { 7452 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 7453 return; 7454 } 7455 7456 /* TODO reinitialization after we have already shut down */ 7457 if (TCR_4(__kmp_global.g.g_done)) { 7458 KA_TRACE( 7459 10, 7460 ("__kmp_parallel_initialize: attempt to init while shutting down\n")); 7461 __kmp_infinite_loop(); 7462 } 7463 7464 /* jc: The lock __kmp_initz_lock is already held, so calling 7465 __kmp_serial_initialize would cause a deadlock. So we call 7466 __kmp_do_serial_initialize directly. */ 7467 if (!__kmp_init_middle) { 7468 __kmp_do_middle_initialize(); 7469 } 7470 __kmp_assign_root_init_mask(); 7471 __kmp_resume_if_hard_paused(); 7472 7473 /* begin initialization */ 7474 KA_TRACE(10, ("__kmp_parallel_initialize: enter\n")); 7475 KMP_ASSERT(KMP_UBER_GTID(gtid)); 7476 7477 #if KMP_ARCH_X86 || KMP_ARCH_X86_64 7478 // Save the FP control regs. 7479 // Worker threads will set theirs to these values at thread startup. 
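// (Descriptive note: __kmp_init_mxcsr is masked with KMP_X86_MXCSR_MASK below
// so that only the control portion of MXCSR, i.e. exception masks, rounding
// mode, FTZ/DAZ, is propagated to workers, not the sticky status flags.)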
7480 __kmp_store_x87_fpu_control_word(&__kmp_init_x87_fpu_control_word); 7481 __kmp_store_mxcsr(&__kmp_init_mxcsr); 7482 __kmp_init_mxcsr &= KMP_X86_MXCSR_MASK; 7483 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */ 7484 7485 #if KMP_OS_UNIX 7486 #if KMP_HANDLE_SIGNALS 7487 /* must be after __kmp_serial_initialize */ 7488 __kmp_install_signals(TRUE); 7489 #endif 7490 #endif 7491 7492 __kmp_suspend_initialize(); 7493 7494 #if defined(USE_LOAD_BALANCE) 7495 if (__kmp_global.g.g_dynamic_mode == dynamic_default) { 7496 __kmp_global.g.g_dynamic_mode = dynamic_load_balance; 7497 } 7498 #else 7499 if (__kmp_global.g.g_dynamic_mode == dynamic_default) { 7500 __kmp_global.g.g_dynamic_mode = dynamic_thread_limit; 7501 } 7502 #endif 7503 7504 if (__kmp_version) { 7505 __kmp_print_version_2(); 7506 } 7507 7508 /* we have finished parallel initialization */ 7509 TCW_SYNC_4(__kmp_init_parallel, TRUE); 7510 7511 KMP_MB(); 7512 KA_TRACE(10, ("__kmp_parallel_initialize: exit\n")); 7513 7514 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 7515 } 7516 7517 void __kmp_hidden_helper_initialize() { 7518 if (TCR_4(__kmp_init_hidden_helper)) 7519 return; 7520 7521 // __kmp_parallel_initialize is required before we initialize hidden helper 7522 if (!TCR_4(__kmp_init_parallel)) 7523 __kmp_parallel_initialize(); 7524 7525 // Double check. Note that this double check should not be placed before 7526 // __kmp_parallel_initialize as it will cause dead lock. 7527 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock); 7528 if (TCR_4(__kmp_init_hidden_helper)) { 7529 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 7530 return; 7531 } 7532 7533 #if KMP_AFFINITY_SUPPORTED 7534 // Initialize hidden helper affinity settings. 7535 // The above __kmp_parallel_initialize() will initialize 7536 // regular affinity (and topology) if not already done. 7537 if (!__kmp_hh_affinity.flags.initialized) 7538 __kmp_affinity_initialize(__kmp_hh_affinity); 7539 #endif 7540 7541 // Set the count of hidden helper tasks to be executed to zero 7542 KMP_ATOMIC_ST_REL(&__kmp_unexecuted_hidden_helper_tasks, 0); 7543 7544 // Set the global variable indicating that we're initializing hidden helper 7545 // team/threads 7546 TCW_SYNC_4(__kmp_init_hidden_helper_threads, TRUE); 7547 7548 // Platform independent initialization 7549 __kmp_do_initialize_hidden_helper_threads(); 7550 7551 // Wait here for the finish of initialization of hidden helper teams 7552 __kmp_hidden_helper_threads_initz_wait(); 7553 7554 // We have finished hidden helper initialization 7555 TCW_SYNC_4(__kmp_init_hidden_helper, TRUE); 7556 7557 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 7558 } 7559 7560 /* ------------------------------------------------------------------------ */ 7561 7562 void __kmp_run_before_invoked_task(int gtid, int tid, kmp_info_t *this_thr, 7563 kmp_team_t *team) { 7564 kmp_disp_t *dispatch; 7565 7566 KMP_MB(); 7567 7568 /* none of the threads have encountered any constructs, yet. 
*/ 7569 this_thr->th.th_local.this_construct = 0; 7570 #if KMP_CACHE_MANAGE 7571 KMP_CACHE_PREFETCH(&this_thr->th.th_bar[bs_forkjoin_barrier].bb.b_arrived); 7572 #endif /* KMP_CACHE_MANAGE */ 7573 dispatch = (kmp_disp_t *)TCR_PTR(this_thr->th.th_dispatch); 7574 KMP_DEBUG_ASSERT(dispatch); 7575 KMP_DEBUG_ASSERT(team->t.t_dispatch); 7576 // KMP_DEBUG_ASSERT( this_thr->th.th_dispatch == &team->t.t_dispatch[ 7577 // this_thr->th.th_info.ds.ds_tid ] ); 7578 7579 dispatch->th_disp_index = 0; /* reset the dispatch buffer counter */ 7580 dispatch->th_doacross_buf_idx = 0; // reset doacross dispatch buffer counter 7581 if (__kmp_env_consistency_check) 7582 __kmp_push_parallel(gtid, team->t.t_ident); 7583 7584 KMP_MB(); /* Flush all pending memory write invalidates. */ 7585 } 7586 7587 void __kmp_run_after_invoked_task(int gtid, int tid, kmp_info_t *this_thr, 7588 kmp_team_t *team) { 7589 if (__kmp_env_consistency_check) 7590 __kmp_pop_parallel(gtid, team->t.t_ident); 7591 7592 __kmp_finish_implicit_task(this_thr); 7593 } 7594 7595 int __kmp_invoke_task_func(int gtid) { 7596 int rc; 7597 int tid = __kmp_tid_from_gtid(gtid); 7598 kmp_info_t *this_thr = __kmp_threads[gtid]; 7599 kmp_team_t *team = this_thr->th.th_team; 7600 7601 __kmp_run_before_invoked_task(gtid, tid, this_thr, team); 7602 #if USE_ITT_BUILD 7603 if (__itt_stack_caller_create_ptr) { 7604 // inform ittnotify about entering user's code 7605 if (team->t.t_stack_id != NULL) { 7606 __kmp_itt_stack_callee_enter((__itt_caller)team->t.t_stack_id); 7607 } else { 7608 KMP_DEBUG_ASSERT(team->t.t_parent->t.t_stack_id != NULL); 7609 __kmp_itt_stack_callee_enter( 7610 (__itt_caller)team->t.t_parent->t.t_stack_id); 7611 } 7612 } 7613 #endif /* USE_ITT_BUILD */ 7614 #if INCLUDE_SSC_MARKS 7615 SSC_MARK_INVOKING(); 7616 #endif 7617 7618 #if OMPT_SUPPORT 7619 void *dummy; 7620 void **exit_frame_p; 7621 ompt_data_t *my_task_data; 7622 ompt_data_t *my_parallel_data; 7623 int ompt_team_size; 7624 7625 if (ompt_enabled.enabled) { 7626 exit_frame_p = &(team->t.t_implicit_task_taskdata[tid] 7627 .ompt_task_info.frame.exit_frame.ptr); 7628 } else { 7629 exit_frame_p = &dummy; 7630 } 7631 7632 my_task_data = 7633 &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data); 7634 my_parallel_data = &(team->t.ompt_team_info.parallel_data); 7635 if (ompt_enabled.ompt_callback_implicit_task) { 7636 ompt_team_size = team->t.t_nproc; 7637 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 7638 ompt_scope_begin, my_parallel_data, my_task_data, ompt_team_size, 7639 __kmp_tid_from_gtid(gtid), ompt_task_implicit); 7640 OMPT_CUR_TASK_INFO(this_thr)->thread_num = __kmp_tid_from_gtid(gtid); 7641 } 7642 #endif 7643 7644 #if KMP_STATS_ENABLED 7645 stats_state_e previous_state = KMP_GET_THREAD_STATE(); 7646 if (previous_state == stats_state_e::TEAMS_REGION) { 7647 KMP_PUSH_PARTITIONED_TIMER(OMP_teams); 7648 } else { 7649 KMP_PUSH_PARTITIONED_TIMER(OMP_parallel); 7650 } 7651 KMP_SET_THREAD_STATE(IMPLICIT_TASK); 7652 #endif 7653 7654 rc = __kmp_invoke_microtask((microtask_t)TCR_SYNC_PTR(team->t.t_pkfn), gtid, 7655 tid, (int)team->t.t_argc, (void **)team->t.t_argv 7656 #if OMPT_SUPPORT 7657 , 7658 exit_frame_p 7659 #endif 7660 ); 7661 #if OMPT_SUPPORT 7662 *exit_frame_p = NULL; 7663 this_thr->th.ompt_thread_info.parallel_flags |= ompt_parallel_team; 7664 #endif 7665 7666 #if KMP_STATS_ENABLED 7667 if (previous_state == stats_state_e::TEAMS_REGION) { 7668 KMP_SET_THREAD_STATE(previous_state); 7669 } 7670 KMP_POP_PARTITIONED_TIMER(); 7671 #endif 7672 7673 #if 
USE_ITT_BUILD 7674 if (__itt_stack_caller_create_ptr) { 7675 // inform ittnotify about leaving user's code 7676 if (team->t.t_stack_id != NULL) { 7677 __kmp_itt_stack_callee_leave((__itt_caller)team->t.t_stack_id); 7678 } else { 7679 KMP_DEBUG_ASSERT(team->t.t_parent->t.t_stack_id != NULL); 7680 __kmp_itt_stack_callee_leave( 7681 (__itt_caller)team->t.t_parent->t.t_stack_id); 7682 } 7683 } 7684 #endif /* USE_ITT_BUILD */ 7685 __kmp_run_after_invoked_task(gtid, tid, this_thr, team); 7686 7687 return rc; 7688 } 7689 7690 void __kmp_teams_master(int gtid) { 7691 // This routine is called by all primary threads in teams construct 7692 kmp_info_t *thr = __kmp_threads[gtid]; 7693 kmp_team_t *team = thr->th.th_team; 7694 ident_t *loc = team->t.t_ident; 7695 thr->th.th_set_nproc = thr->th.th_teams_size.nth; 7696 KMP_DEBUG_ASSERT(thr->th.th_teams_microtask); 7697 KMP_DEBUG_ASSERT(thr->th.th_set_nproc); 7698 KA_TRACE(20, ("__kmp_teams_master: T#%d, Tid %d, microtask %p\n", gtid, 7699 __kmp_tid_from_gtid(gtid), thr->th.th_teams_microtask)); 7700 7701 // This thread is a new CG root. Set up the proper variables. 7702 kmp_cg_root_t *tmp = (kmp_cg_root_t *)__kmp_allocate(sizeof(kmp_cg_root_t)); 7703 tmp->cg_root = thr; // Make thr the CG root 7704 // Init to thread limit stored when league primary threads were forked 7705 tmp->cg_thread_limit = thr->th.th_current_task->td_icvs.thread_limit; 7706 tmp->cg_nthreads = 1; // Init counter to one active thread, this one 7707 KA_TRACE(100, ("__kmp_teams_master: Thread %p created node %p and init" 7708 " cg_nthreads to 1\n", 7709 thr, tmp)); 7710 tmp->up = thr->th.th_cg_roots; 7711 thr->th.th_cg_roots = tmp; 7712 7713 // Launch league of teams now, but not let workers execute 7714 // (they hang on fork barrier until next parallel) 7715 #if INCLUDE_SSC_MARKS 7716 SSC_MARK_FORKING(); 7717 #endif 7718 __kmp_fork_call(loc, gtid, fork_context_intel, team->t.t_argc, 7719 (microtask_t)thr->th.th_teams_microtask, // "wrapped" task 7720 VOLATILE_CAST(launch_t) __kmp_invoke_task_func, NULL); 7721 #if INCLUDE_SSC_MARKS 7722 SSC_MARK_JOINING(); 7723 #endif 7724 // If the team size was reduced from the limit, set it to the new size 7725 if (thr->th.th_team_nproc < thr->th.th_teams_size.nth) 7726 thr->th.th_teams_size.nth = thr->th.th_team_nproc; 7727 // AC: last parameter "1" eliminates join barrier which won't work because 7728 // worker threads are in a fork barrier waiting for more parallel regions 7729 __kmp_join_call(loc, gtid 7730 #if OMPT_SUPPORT 7731 , 7732 fork_context_intel 7733 #endif 7734 , 7735 1); 7736 } 7737 7738 int __kmp_invoke_teams_master(int gtid) { 7739 kmp_info_t *this_thr = __kmp_threads[gtid]; 7740 kmp_team_t *team = this_thr->th.th_team; 7741 #if KMP_DEBUG 7742 if (!__kmp_threads[gtid]->th.th_team->t.t_serialized) 7743 KMP_DEBUG_ASSERT((void *)__kmp_threads[gtid]->th.th_team->t.t_pkfn == 7744 (void *)__kmp_teams_master); 7745 #endif 7746 __kmp_run_before_invoked_task(gtid, 0, this_thr, team); 7747 #if OMPT_SUPPORT 7748 int tid = __kmp_tid_from_gtid(gtid); 7749 ompt_data_t *task_data = 7750 &team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data; 7751 ompt_data_t *parallel_data = &team->t.ompt_team_info.parallel_data; 7752 if (ompt_enabled.ompt_callback_implicit_task) { 7753 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 7754 ompt_scope_begin, parallel_data, task_data, team->t.t_nproc, tid, 7755 ompt_task_initial); 7756 OMPT_CUR_TASK_INFO(this_thr)->thread_num = tid; 7757 } 7758 #endif 7759 __kmp_teams_master(gtid); 7760 #if 
OMPT_SUPPORT 7761 this_thr->th.ompt_thread_info.parallel_flags |= ompt_parallel_league; 7762 #endif 7763 __kmp_run_after_invoked_task(gtid, 0, this_thr, team); 7764 return 1; 7765 } 7766 7767 /* this sets the requested number of threads for the next parallel region 7768 encountered by this team. since this should be enclosed in the forkjoin 7769 critical section it should avoid race conditions with asymmetrical nested 7770 parallelism */ 7771 7772 void __kmp_push_num_threads(ident_t *id, int gtid, int num_threads) { 7773 kmp_info_t *thr = __kmp_threads[gtid]; 7774 7775 if (num_threads > 0) 7776 thr->th.th_set_nproc = num_threads; 7777 } 7778 7779 static void __kmp_push_thread_limit(kmp_info_t *thr, int num_teams, 7780 int num_threads) { 7781 KMP_DEBUG_ASSERT(thr); 7782 // Remember the number of threads for inner parallel regions 7783 if (!TCR_4(__kmp_init_middle)) 7784 __kmp_middle_initialize(); // get internal globals calculated 7785 __kmp_assign_root_init_mask(); 7786 KMP_DEBUG_ASSERT(__kmp_avail_proc); 7787 KMP_DEBUG_ASSERT(__kmp_dflt_team_nth); 7788 7789 if (num_threads == 0) { 7790 if (__kmp_teams_thread_limit > 0) { 7791 num_threads = __kmp_teams_thread_limit; 7792 } else { 7793 num_threads = __kmp_avail_proc / num_teams; 7794 } 7795 // adjust num_threads w/o warning as it is not user setting 7796 // num_threads = min(num_threads, nthreads-var, thread-limit-var) 7797 // no thread_limit clause specified - do not change thread-limit-var ICV 7798 if (num_threads > __kmp_dflt_team_nth) { 7799 num_threads = __kmp_dflt_team_nth; // honor nthreads-var ICV 7800 } 7801 if (num_threads > thr->th.th_current_task->td_icvs.thread_limit) { 7802 num_threads = thr->th.th_current_task->td_icvs.thread_limit; 7803 } // prevent team size to exceed thread-limit-var 7804 if (num_teams * num_threads > __kmp_teams_max_nth) { 7805 num_threads = __kmp_teams_max_nth / num_teams; 7806 } 7807 if (num_threads == 0) { 7808 num_threads = 1; 7809 } 7810 } else { 7811 if (num_threads < 0) { 7812 __kmp_msg(kmp_ms_warning, KMP_MSG(CantFormThrTeam, num_threads, 1), 7813 __kmp_msg_null); 7814 num_threads = 1; 7815 } 7816 // This thread will be the primary thread of the league primary threads 7817 // Store new thread limit; old limit is saved in th_cg_roots list 7818 thr->th.th_current_task->td_icvs.thread_limit = num_threads; 7819 // num_threads = min(num_threads, nthreads-var) 7820 if (num_threads > __kmp_dflt_team_nth) { 7821 num_threads = __kmp_dflt_team_nth; // honor nthreads-var ICV 7822 } 7823 if (num_teams * num_threads > __kmp_teams_max_nth) { 7824 int new_threads = __kmp_teams_max_nth / num_teams; 7825 if (new_threads == 0) { 7826 new_threads = 1; 7827 } 7828 if (new_threads != num_threads) { 7829 if (!__kmp_reserve_warn) { // user asked for too many threads 7830 __kmp_reserve_warn = 1; // conflicts with KMP_TEAMS_THREAD_LIMIT 7831 __kmp_msg(kmp_ms_warning, 7832 KMP_MSG(CantFormThrTeam, num_threads, new_threads), 7833 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null); 7834 } 7835 } 7836 num_threads = new_threads; 7837 } 7838 } 7839 thr->th.th_teams_size.nth = num_threads; 7840 } 7841 7842 /* this sets the requested number of teams for the teams region and/or 7843 the number of threads for the next parallel region encountered */ 7844 void __kmp_push_num_teams(ident_t *id, int gtid, int num_teams, 7845 int num_threads) { 7846 kmp_info_t *thr = __kmp_threads[gtid]; 7847 if (num_teams < 0) { 7848 // OpenMP specification requires requested values to be positive, 7849 // but people can send us any value, so we'd better 
check 7850 __kmp_msg(kmp_ms_warning, KMP_MSG(NumTeamsNotPositive, num_teams, 1), 7851 __kmp_msg_null); 7852 num_teams = 1; 7853 } 7854 if (num_teams == 0) { 7855 if (__kmp_nteams > 0) { 7856 num_teams = __kmp_nteams; 7857 } else { 7858 num_teams = 1; // default number of teams is 1. 7859 } 7860 } 7861 if (num_teams > __kmp_teams_max_nth) { // too many teams requested? 7862 if (!__kmp_reserve_warn) { 7863 __kmp_reserve_warn = 1; 7864 __kmp_msg(kmp_ms_warning, 7865 KMP_MSG(CantFormThrTeam, num_teams, __kmp_teams_max_nth), 7866 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null); 7867 } 7868 num_teams = __kmp_teams_max_nth; 7869 } 7870 // Set number of teams (number of threads in the outer "parallel" of the 7871 // teams) 7872 thr->th.th_set_nproc = thr->th.th_teams_size.nteams = num_teams; 7873 7874 __kmp_push_thread_limit(thr, num_teams, num_threads); 7875 } 7876 7877 /* This sets the requested number of teams for the teams region and/or 7878 the number of threads for the next parallel region encountered */ 7879 void __kmp_push_num_teams_51(ident_t *id, int gtid, int num_teams_lb, 7880 int num_teams_ub, int num_threads) { 7881 kmp_info_t *thr = __kmp_threads[gtid]; 7882 KMP_DEBUG_ASSERT(num_teams_lb >= 0 && num_teams_ub >= 0); 7883 KMP_DEBUG_ASSERT(num_teams_ub >= num_teams_lb); 7884 KMP_DEBUG_ASSERT(num_threads >= 0); 7885 7886 if (num_teams_lb > num_teams_ub) { 7887 __kmp_fatal(KMP_MSG(FailedToCreateTeam, num_teams_lb, num_teams_ub), 7888 KMP_HNT(SetNewBound, __kmp_teams_max_nth), __kmp_msg_null); 7889 } 7890 7891 int num_teams = 1; // default number of teams is 1. 7892 7893 if (num_teams_lb == 0 && num_teams_ub > 0) 7894 num_teams_lb = num_teams_ub; 7895 7896 if (num_teams_lb == 0 && num_teams_ub == 0) { // no num_teams clause 7897 num_teams = (__kmp_nteams > 0) ? __kmp_nteams : num_teams; 7898 if (num_teams > __kmp_teams_max_nth) { 7899 if (!__kmp_reserve_warn) { 7900 __kmp_reserve_warn = 1; 7901 __kmp_msg(kmp_ms_warning, 7902 KMP_MSG(CantFormThrTeam, num_teams, __kmp_teams_max_nth), 7903 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null); 7904 } 7905 num_teams = __kmp_teams_max_nth; 7906 } 7907 } else if (num_teams_lb == num_teams_ub) { // requires exact number of teams 7908 num_teams = num_teams_ub; 7909 } else { // num_teams_lb <= num_teams <= num_teams_ub 7910 if (num_threads <= 0) { 7911 if (num_teams_ub > __kmp_teams_max_nth) { 7912 num_teams = num_teams_lb; 7913 } else { 7914 num_teams = num_teams_ub; 7915 } 7916 } else { 7917 num_teams = (num_threads > __kmp_teams_max_nth) 7918 ? num_teams 7919 : __kmp_teams_max_nth / num_threads; 7920 if (num_teams < num_teams_lb) { 7921 num_teams = num_teams_lb; 7922 } else if (num_teams > num_teams_ub) { 7923 num_teams = num_teams_ub; 7924 } 7925 } 7926 } 7927 // Set number of teams (number of threads in the outer "parallel" of the 7928 // teams) 7929 thr->th.th_set_nproc = thr->th.th_teams_size.nteams = num_teams; 7930 7931 __kmp_push_thread_limit(thr, num_teams, num_threads); 7932 } 7933 7934 // Set the proc_bind var to use in the following parallel region. 7935 void __kmp_push_proc_bind(ident_t *id, int gtid, kmp_proc_bind_t proc_bind) { 7936 kmp_info_t *thr = __kmp_threads[gtid]; 7937 thr->th.th_set_proc_bind = proc_bind; 7938 } 7939 7940 /* Launch the worker threads into the microtask.
*/ 7941 7942 void __kmp_internal_fork(ident_t *id, int gtid, kmp_team_t *team) { 7943 kmp_info_t *this_thr = __kmp_threads[gtid]; 7944 7945 #ifdef KMP_DEBUG 7946 int f; 7947 #endif /* KMP_DEBUG */ 7948 7949 KMP_DEBUG_ASSERT(team); 7950 KMP_DEBUG_ASSERT(this_thr->th.th_team == team); 7951 KMP_ASSERT(KMP_MASTER_GTID(gtid)); 7952 KMP_MB(); /* Flush all pending memory write invalidates. */ 7953 7954 team->t.t_construct = 0; /* no single directives seen yet */ 7955 team->t.t_ordered.dt.t_value = 7956 0; /* thread 0 enters the ordered section first */ 7957 7958 /* Reset the identifiers on the dispatch buffer */ 7959 KMP_DEBUG_ASSERT(team->t.t_disp_buffer); 7960 if (team->t.t_max_nproc > 1) { 7961 int i; 7962 for (i = 0; i < __kmp_dispatch_num_buffers; ++i) { 7963 team->t.t_disp_buffer[i].buffer_index = i; 7964 team->t.t_disp_buffer[i].doacross_buf_idx = i; 7965 } 7966 } else { 7967 team->t.t_disp_buffer[0].buffer_index = 0; 7968 team->t.t_disp_buffer[0].doacross_buf_idx = 0; 7969 } 7970 7971 KMP_MB(); /* Flush all pending memory write invalidates. */ 7972 KMP_ASSERT(this_thr->th.th_team == team); 7973 7974 #ifdef KMP_DEBUG 7975 for (f = 0; f < team->t.t_nproc; f++) { 7976 KMP_DEBUG_ASSERT(team->t.t_threads[f] && 7977 team->t.t_threads[f]->th.th_team_nproc == team->t.t_nproc); 7978 } 7979 #endif /* KMP_DEBUG */ 7980 7981 /* release the worker threads so they may begin working */ 7982 __kmp_fork_barrier(gtid, 0); 7983 } 7984 7985 void __kmp_internal_join(ident_t *id, int gtid, kmp_team_t *team) { 7986 kmp_info_t *this_thr = __kmp_threads[gtid]; 7987 7988 KMP_DEBUG_ASSERT(team); 7989 KMP_DEBUG_ASSERT(this_thr->th.th_team == team); 7990 KMP_ASSERT(KMP_MASTER_GTID(gtid)); 7991 KMP_MB(); /* Flush all pending memory write invalidates. */ 7992 7993 /* Join barrier after fork */ 7994 7995 #ifdef KMP_DEBUG 7996 if (__kmp_threads[gtid] && 7997 __kmp_threads[gtid]->th.th_team_nproc != team->t.t_nproc) { 7998 __kmp_printf("GTID: %d, __kmp_threads[%d]=%p\n", gtid, gtid, 7999 __kmp_threads[gtid]); 8000 __kmp_printf("__kmp_threads[%d]->th.th_team_nproc=%d, TEAM: %p, " 8001 "team->t.t_nproc=%d\n", 8002 gtid, __kmp_threads[gtid]->th.th_team_nproc, team, 8003 team->t.t_nproc); 8004 __kmp_print_structure(); 8005 } 8006 KMP_DEBUG_ASSERT(__kmp_threads[gtid] && 8007 __kmp_threads[gtid]->th.th_team_nproc == team->t.t_nproc); 8008 #endif /* KMP_DEBUG */ 8009 8010 __kmp_join_barrier(gtid); /* wait for everyone */ 8011 #if OMPT_SUPPORT 8012 if (ompt_enabled.enabled && 8013 this_thr->th.ompt_thread_info.state == ompt_state_wait_barrier_implicit) { 8014 int ds_tid = this_thr->th.th_info.ds.ds_tid; 8015 ompt_data_t *task_data = OMPT_CUR_TASK_DATA(this_thr); 8016 this_thr->th.ompt_thread_info.state = ompt_state_overhead; 8017 #if OMPT_OPTIONAL 8018 void *codeptr = NULL; 8019 if (KMP_MASTER_TID(ds_tid) && 8020 (ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) || 8021 ompt_callbacks.ompt_callback(ompt_callback_sync_region))) 8022 codeptr = OMPT_CUR_TEAM_INFO(this_thr)->master_return_address; 8023 8024 if (ompt_enabled.ompt_callback_sync_region_wait) { 8025 ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)( 8026 ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data, 8027 codeptr); 8028 } 8029 if (ompt_enabled.ompt_callback_sync_region) { 8030 ompt_callbacks.ompt_callback(ompt_callback_sync_region)( 8031 ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data, 8032 codeptr); 8033 } 8034 #endif 8035 if (!KMP_MASTER_TID(ds_tid) && ompt_enabled.ompt_callback_implicit_task) { 8036 
ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 8037 ompt_scope_end, NULL, task_data, 0, ds_tid, 8038 ompt_task_implicit); // TODO: Can this be ompt_task_initial? 8039 } 8040 } 8041 #endif 8042 8043 KMP_MB(); /* Flush all pending memory write invalidates. */ 8044 KMP_ASSERT(this_thr->th.th_team == team); 8045 } 8046 8047 /* ------------------------------------------------------------------------ */ 8048 8049 #ifdef USE_LOAD_BALANCE 8050 8051 // Return the worker threads actively spinning in the hot team, if we 8052 // are at the outermost level of parallelism. Otherwise, return 0. 8053 static int __kmp_active_hot_team_nproc(kmp_root_t *root) { 8054 int i; 8055 int retval; 8056 kmp_team_t *hot_team; 8057 8058 if (root->r.r_active) { 8059 return 0; 8060 } 8061 hot_team = root->r.r_hot_team; 8062 if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME) { 8063 return hot_team->t.t_nproc - 1; // Don't count primary thread 8064 } 8065 8066 // Skip the primary thread - it is accounted for elsewhere. 8067 retval = 0; 8068 for (i = 1; i < hot_team->t.t_nproc; i++) { 8069 if (hot_team->t.t_threads[i]->th.th_active) { 8070 retval++; 8071 } 8072 } 8073 return retval; 8074 } 8075 8076 // Perform an automatic adjustment to the number of 8077 // threads used by the next parallel region. 8078 static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc) { 8079 int retval; 8080 int pool_active; 8081 int hot_team_active; 8082 int team_curr_active; 8083 int system_active; 8084 8085 KB_TRACE(20, ("__kmp_load_balance_nproc: called root:%p set_nproc:%d\n", root, 8086 set_nproc)); 8087 KMP_DEBUG_ASSERT(root); 8088 KMP_DEBUG_ASSERT(root->r.r_root_team->t.t_threads[0] 8089 ->th.th_current_task->td_icvs.dynamic == TRUE); 8090 KMP_DEBUG_ASSERT(set_nproc > 1); 8091 8092 if (set_nproc == 1) { 8093 KB_TRACE(20, ("__kmp_load_balance_nproc: serial execution.\n")); 8094 return 1; 8095 } 8096 8097 // Threads that are active in the thread pool, active in the hot team for this 8098 // particular root (if we are at the outer par level), and the currently 8099 // executing thread (to become the primary thread) are available to add to the 8100 // new team, but are currently contributing to the system load, and must be 8101 // accounted for. 8102 pool_active = __kmp_thread_pool_active_nth; 8103 hot_team_active = __kmp_active_hot_team_nproc(root); 8104 team_curr_active = pool_active + hot_team_active + 1; 8105 8106 // Check the system load. 8107 system_active = __kmp_get_load_balance(__kmp_avail_proc + team_curr_active); 8108 KB_TRACE(30, ("__kmp_load_balance_nproc: system active = %d pool active = %d " 8109 "hot team active = %d\n", 8110 system_active, pool_active, hot_team_active)); 8111 8112 if (system_active < 0) { 8113 // There was an error reading the necessary info from /proc, so use the 8114 // thread limit algorithm instead. Once we set __kmp_global.g.g_dynamic_mode 8115 // = dynamic_thread_limit, we shouldn't wind up getting back here. 8116 __kmp_global.g.g_dynamic_mode = dynamic_thread_limit; 8117 KMP_WARNING(CantLoadBalUsing, "KMP_DYNAMIC_MODE=thread limit"); 8118 8119 // Make this call behave like the thread limit algorithm. 8120 retval = __kmp_avail_proc - __kmp_nth + 8121 (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc); 8122 if (retval > set_nproc) { 8123 retval = set_nproc; 8124 } 8125 if (retval < KMP_MIN_NTH) { 8126 retval = KMP_MIN_NTH; 8127 } 8128 8129 KB_TRACE(20, ("__kmp_load_balance_nproc: thread limit exit. 
retval:%d\n", 8130 retval)); 8131 return retval; 8132 } 8133 8134 // There is a slight delay in the load balance algorithm in detecting new 8135 // running procs. The real system load at this instant should be at least as 8136 // large as the #active omp thread that are available to add to the team. 8137 if (system_active < team_curr_active) { 8138 system_active = team_curr_active; 8139 } 8140 retval = __kmp_avail_proc - system_active + team_curr_active; 8141 if (retval > set_nproc) { 8142 retval = set_nproc; 8143 } 8144 if (retval < KMP_MIN_NTH) { 8145 retval = KMP_MIN_NTH; 8146 } 8147 8148 KB_TRACE(20, ("__kmp_load_balance_nproc: exit. retval:%d\n", retval)); 8149 return retval; 8150 } // __kmp_load_balance_nproc() 8151 8152 #endif /* USE_LOAD_BALANCE */ 8153 8154 /* ------------------------------------------------------------------------ */ 8155 8156 /* NOTE: this is called with the __kmp_init_lock held */ 8157 void __kmp_cleanup(void) { 8158 int f; 8159 8160 KA_TRACE(10, ("__kmp_cleanup: enter\n")); 8161 8162 if (TCR_4(__kmp_init_parallel)) { 8163 #if KMP_HANDLE_SIGNALS 8164 __kmp_remove_signals(); 8165 #endif 8166 TCW_4(__kmp_init_parallel, FALSE); 8167 } 8168 8169 if (TCR_4(__kmp_init_middle)) { 8170 #if KMP_AFFINITY_SUPPORTED 8171 __kmp_affinity_uninitialize(); 8172 #endif /* KMP_AFFINITY_SUPPORTED */ 8173 __kmp_cleanup_hierarchy(); 8174 TCW_4(__kmp_init_middle, FALSE); 8175 } 8176 8177 KA_TRACE(10, ("__kmp_cleanup: go serial cleanup\n")); 8178 8179 if (__kmp_init_serial) { 8180 __kmp_runtime_destroy(); 8181 __kmp_init_serial = FALSE; 8182 } 8183 8184 __kmp_cleanup_threadprivate_caches(); 8185 8186 for (f = 0; f < __kmp_threads_capacity; f++) { 8187 if (__kmp_root[f] != NULL) { 8188 __kmp_free(__kmp_root[f]); 8189 __kmp_root[f] = NULL; 8190 } 8191 } 8192 __kmp_free(__kmp_threads); 8193 // __kmp_threads and __kmp_root were allocated at once, as single block, so 8194 // there is no need in freeing __kmp_root. 8195 __kmp_threads = NULL; 8196 __kmp_root = NULL; 8197 __kmp_threads_capacity = 0; 8198 8199 // Free old __kmp_threads arrays if they exist. 
8200 kmp_old_threads_list_t *ptr = __kmp_old_threads_list; 8201 while (ptr) { 8202 kmp_old_threads_list_t *next = ptr->next; 8203 __kmp_free(ptr->threads); 8204 __kmp_free(ptr); 8205 ptr = next; 8206 } 8207 8208 #if KMP_USE_DYNAMIC_LOCK 8209 __kmp_cleanup_indirect_user_locks(); 8210 #else 8211 __kmp_cleanup_user_locks(); 8212 #endif 8213 #if OMPD_SUPPORT 8214 if (ompd_state) { 8215 __kmp_free(ompd_env_block); 8216 ompd_env_block = NULL; 8217 ompd_env_block_size = 0; 8218 } 8219 #endif 8220 8221 #if KMP_AFFINITY_SUPPORTED 8222 KMP_INTERNAL_FREE(CCAST(char *, __kmp_cpuinfo_file)); 8223 __kmp_cpuinfo_file = NULL; 8224 #endif /* KMP_AFFINITY_SUPPORTED */ 8225 8226 #if KMP_USE_ADAPTIVE_LOCKS 8227 #if KMP_DEBUG_ADAPTIVE_LOCKS 8228 __kmp_print_speculative_stats(); 8229 #endif 8230 #endif 8231 KMP_INTERNAL_FREE(__kmp_nested_nth.nth); 8232 __kmp_nested_nth.nth = NULL; 8233 __kmp_nested_nth.size = 0; 8234 __kmp_nested_nth.used = 0; 8235 KMP_INTERNAL_FREE(__kmp_nested_proc_bind.bind_types); 8236 __kmp_nested_proc_bind.bind_types = NULL; 8237 __kmp_nested_proc_bind.size = 0; 8238 __kmp_nested_proc_bind.used = 0; 8239 if (__kmp_affinity_format) { 8240 KMP_INTERNAL_FREE(__kmp_affinity_format); 8241 __kmp_affinity_format = NULL; 8242 } 8243 8244 __kmp_i18n_catclose(); 8245 8246 #if KMP_USE_HIER_SCHED 8247 __kmp_hier_scheds.deallocate(); 8248 #endif 8249 8250 #if KMP_STATS_ENABLED 8251 __kmp_stats_fini(); 8252 #endif 8253 8254 KA_TRACE(10, ("__kmp_cleanup: exit\n")); 8255 } 8256 8257 /* ------------------------------------------------------------------------ */ 8258 8259 int __kmp_ignore_mppbeg(void) { 8260 char *env; 8261 8262 if ((env = getenv("KMP_IGNORE_MPPBEG")) != NULL) { 8263 if (__kmp_str_match_false(env)) 8264 return FALSE; 8265 } 8266 // By default __kmpc_begin() is no-op. 8267 return TRUE; 8268 } 8269 8270 int __kmp_ignore_mppend(void) { 8271 char *env; 8272 8273 if ((env = getenv("KMP_IGNORE_MPPEND")) != NULL) { 8274 if (__kmp_str_match_false(env)) 8275 return FALSE; 8276 } 8277 // By default __kmpc_end() is no-op. 
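// (For example, running with KMP_IGNORE_MPPEND=0 makes this return FALSE, so
// __kmpc_end() actually performs library shutdown; see the caller in
// kmp_csupport.cpp.)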
8278 return TRUE; 8279 } 8280 8281 void __kmp_internal_begin(void) { 8282 int gtid; 8283 kmp_root_t *root; 8284 8285 /* this is a very important step as it will register new sibling threads 8286 and assign these new uber threads a new gtid */ 8287 gtid = __kmp_entry_gtid(); 8288 root = __kmp_threads[gtid]->th.th_root; 8289 KMP_ASSERT(KMP_UBER_GTID(gtid)); 8290 8291 if (root->r.r_begin) 8292 return; 8293 __kmp_acquire_lock(&root->r.r_begin_lock, gtid); 8294 if (root->r.r_begin) { 8295 __kmp_release_lock(&root->r.r_begin_lock, gtid); 8296 return; 8297 } 8298 8299 root->r.r_begin = TRUE; 8300 8301 __kmp_release_lock(&root->r.r_begin_lock, gtid); 8302 } 8303 8304 /* ------------------------------------------------------------------------ */ 8305 8306 void __kmp_user_set_library(enum library_type arg) { 8307 int gtid; 8308 kmp_root_t *root; 8309 kmp_info_t *thread; 8310 8311 /* first, make sure we are initialized so we can get our gtid */ 8312 8313 gtid = __kmp_entry_gtid(); 8314 thread = __kmp_threads[gtid]; 8315 8316 root = thread->th.th_root; 8317 8318 KA_TRACE(20, ("__kmp_user_set_library: enter T#%d, arg: %d, %d\n", gtid, arg, 8319 library_serial)); 8320 if (root->r.r_in_parallel) { /* Must be called in serial section of top-level 8321 thread */ 8322 KMP_WARNING(SetLibraryIncorrectCall); 8323 return; 8324 } 8325 8326 switch (arg) { 8327 case library_serial: 8328 thread->th.th_set_nproc = 0; 8329 set__nproc(thread, 1); 8330 break; 8331 case library_turnaround: 8332 thread->th.th_set_nproc = 0; 8333 set__nproc(thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth 8334 : __kmp_dflt_team_nth_ub); 8335 break; 8336 case library_throughput: 8337 thread->th.th_set_nproc = 0; 8338 set__nproc(thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth 8339 : __kmp_dflt_team_nth_ub); 8340 break; 8341 default: 8342 KMP_FATAL(UnknownLibraryType, arg); 8343 } 8344 8345 __kmp_aux_set_library(arg); 8346 } 8347 8348 void __kmp_aux_set_stacksize(size_t arg) { 8349 if (!__kmp_init_serial) 8350 __kmp_serial_initialize(); 8351 8352 #if KMP_OS_DARWIN 8353 if (arg & (0x1000 - 1)) { 8354 arg &= ~(0x1000 - 1); 8355 if (arg + 0x1000) /* check for overflow if we round up */ 8356 arg += 0x1000; 8357 } 8358 #endif 8359 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock); 8360 8361 /* only change the default stacksize before the first parallel region */ 8362 if (!TCR_4(__kmp_init_parallel)) { 8363 size_t value = arg; /* argument is in bytes */ 8364 8365 if (value < __kmp_sys_min_stksize) 8366 value = __kmp_sys_min_stksize; 8367 else if (value > KMP_MAX_STKSIZE) 8368 value = KMP_MAX_STKSIZE; 8369 8370 __kmp_stksize = value; 8371 8372 __kmp_env_stksize = TRUE; /* was KMP_STACKSIZE specified? */ 8373 } 8374 8375 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 8376 } 8377 8378 /* set the behaviour of the runtime library */ 8379 /* TODO this can cause some odd behaviour with sibling parallelism... 
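   (For reference: these modes correspond to the user-visible
   KMP_LIBRARY=serial|turnaround|throughput setting and the kmp_set_library()
   family of entry points; this is only a summary, the authoritative parsing
   lives in kmp_settings.cpp.)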
*/ 8380 void __kmp_aux_set_library(enum library_type arg) { 8381 __kmp_library = arg; 8382 8383 switch (__kmp_library) { 8384 case library_serial: { 8385 KMP_INFORM(LibraryIsSerial); 8386 } break; 8387 case library_turnaround: 8388 if (__kmp_use_yield == 1 && !__kmp_use_yield_exp_set) 8389 __kmp_use_yield = 2; // only yield when oversubscribed 8390 break; 8391 case library_throughput: 8392 if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME) 8393 __kmp_dflt_blocktime = KMP_DEFAULT_BLOCKTIME; 8394 break; 8395 default: 8396 KMP_FATAL(UnknownLibraryType, arg); 8397 } 8398 } 8399 8400 /* Getting team information common for all team API */ 8401 // Returns NULL if not in teams construct 8402 static kmp_team_t *__kmp_aux_get_team_info(int &teams_serialized) { 8403 kmp_info_t *thr = __kmp_entry_thread(); 8404 teams_serialized = 0; 8405 if (thr->th.th_teams_microtask) { 8406 kmp_team_t *team = thr->th.th_team; 8407 int tlevel = thr->th.th_teams_level; // the level of the teams construct 8408 int ii = team->t.t_level; 8409 teams_serialized = team->t.t_serialized; 8410 int level = tlevel + 1; 8411 KMP_DEBUG_ASSERT(ii >= tlevel); 8412 while (ii > level) { 8413 for (teams_serialized = team->t.t_serialized; 8414 (teams_serialized > 0) && (ii > level); teams_serialized--, ii--) { 8415 } 8416 if (team->t.t_serialized && (!teams_serialized)) { 8417 team = team->t.t_parent; 8418 continue; 8419 } 8420 if (ii > level) { 8421 team = team->t.t_parent; 8422 ii--; 8423 } 8424 } 8425 return team; 8426 } 8427 return NULL; 8428 } 8429 8430 int __kmp_aux_get_team_num() { 8431 int serialized; 8432 kmp_team_t *team = __kmp_aux_get_team_info(serialized); 8433 if (team) { 8434 if (serialized > 1) { 8435 return 0; // teams region is serialized ( 1 team of 1 thread ). 8436 } else { 8437 return team->t.t_master_tid; 8438 } 8439 } 8440 return 0; 8441 } 8442 8443 int __kmp_aux_get_num_teams() { 8444 int serialized; 8445 kmp_team_t *team = __kmp_aux_get_team_info(serialized); 8446 if (team) { 8447 if (serialized > 1) { 8448 return 1; 8449 } else { 8450 return team->t.t_parent->t.t_nproc; 8451 } 8452 } 8453 return 1; 8454 } 8455 8456 /* ------------------------------------------------------------------------ */ 8457 8458 /* 8459 * Affinity Format Parser 8460 * 8461 * Field is in form of: %[[[0].]size]type 8462 * % and type are required (%% means print a literal '%') 8463 * type is either single char or long name surrounded by {}, 8464 * e.g., N or {num_threads} 8465 * 0 => leading zeros 8466 * . => right justified when size is specified 8467 * by default output is left justified 8468 * size is the *minimum* field length 8469 * All other characters are printed as is 8470 * 8471 * Available field types: 8472 * L {thread_level} - omp_get_level() 8473 * n {thread_num} - omp_get_thread_num() 8474 * h {host} - name of host machine 8475 * P {process_id} - process id (integer) 8476 * T {thread_identifier} - native thread identifier (integer) 8477 * N {num_threads} - omp_get_num_threads() 8478 * A {ancestor_tnum} - omp_get_ancestor_thread_num(omp_get_level()-1) 8479 * a {thread_affinity} - comma separated list of integers or integer ranges 8480 * (values of affinity mask) 8481 * 8482 * Implementation-specific field types can be added 8483 * If a type is unknown, print "undefined" 8484 */ 8485 8486 // Structure holding the short name, long name, and corresponding data type 8487 // for snprintf. A table of these will represent the entire valid keyword 8488 // field types. 
8489 typedef struct kmp_affinity_format_field_t { 8490 char short_name; // from spec e.g., L -> thread level 8491 const char *long_name; // from spec thread_level -> thread level 8492 char field_format; // data type for snprintf (typically 'd' or 's' 8493 // for integer or string) 8494 } kmp_affinity_format_field_t; 8495 8496 static const kmp_affinity_format_field_t __kmp_affinity_format_table[] = { 8497 #if KMP_AFFINITY_SUPPORTED 8498 {'A', "thread_affinity", 's'}, 8499 #endif 8500 {'t', "team_num", 'd'}, 8501 {'T', "num_teams", 'd'}, 8502 {'L', "nesting_level", 'd'}, 8503 {'n', "thread_num", 'd'}, 8504 {'N', "num_threads", 'd'}, 8505 {'a', "ancestor_tnum", 'd'}, 8506 {'H', "host", 's'}, 8507 {'P', "process_id", 'd'}, 8508 {'i', "native_thread_id", 'd'}}; 8509 8510 // Return the number of characters it takes to hold field 8511 static int __kmp_aux_capture_affinity_field(int gtid, const kmp_info_t *th, 8512 const char **ptr, 8513 kmp_str_buf_t *field_buffer) { 8514 int rc, format_index, field_value; 8515 const char *width_left, *width_right; 8516 bool pad_zeros, right_justify, parse_long_name, found_valid_name; 8517 static const int FORMAT_SIZE = 20; 8518 char format[FORMAT_SIZE] = {0}; 8519 char absolute_short_name = 0; 8520 8521 KMP_DEBUG_ASSERT(gtid >= 0); 8522 KMP_DEBUG_ASSERT(th); 8523 KMP_DEBUG_ASSERT(**ptr == '%'); 8524 KMP_DEBUG_ASSERT(field_buffer); 8525 8526 __kmp_str_buf_clear(field_buffer); 8527 8528 // Skip the initial % 8529 (*ptr)++; 8530 8531 // Check for %% first 8532 if (**ptr == '%') { 8533 __kmp_str_buf_cat(field_buffer, "%", 1); 8534 (*ptr)++; // skip over the second % 8535 return 1; 8536 } 8537 8538 // Parse field modifiers if they are present 8539 pad_zeros = false; 8540 if (**ptr == '0') { 8541 pad_zeros = true; 8542 (*ptr)++; // skip over 0 8543 } 8544 right_justify = false; 8545 if (**ptr == '.') { 8546 right_justify = true; 8547 (*ptr)++; // skip over . 8548 } 8549 // Parse width of field: [width_left, width_right) 8550 width_left = width_right = NULL; 8551 if (**ptr >= '0' && **ptr <= '9') { 8552 width_left = *ptr; 8553 SKIP_DIGITS(*ptr); 8554 width_right = *ptr; 8555 } 8556 8557 // Create the format for KMP_SNPRINTF based on flags parsed above 8558 format_index = 0; 8559 format[format_index++] = '%'; 8560 if (!right_justify) 8561 format[format_index++] = '-'; 8562 if (pad_zeros) 8563 format[format_index++] = '0'; 8564 if (width_left && width_right) { 8565 int i = 0; 8566 // Only allow 8 digit number widths. 
8567 // This also prevents overflowing format variable 8568 while (i < 8 && width_left < width_right) { 8569 format[format_index++] = *width_left; 8570 width_left++; 8571 i++; 8572 } 8573 } 8574 8575 // Parse a name (long or short) 8576 // Canonicalize the name into absolute_short_name 8577 found_valid_name = false; 8578 parse_long_name = (**ptr == '{'); 8579 if (parse_long_name) 8580 (*ptr)++; // skip initial left brace 8581 for (size_t i = 0; i < sizeof(__kmp_affinity_format_table) / 8582 sizeof(__kmp_affinity_format_table[0]); 8583 ++i) { 8584 char short_name = __kmp_affinity_format_table[i].short_name; 8585 const char *long_name = __kmp_affinity_format_table[i].long_name; 8586 char field_format = __kmp_affinity_format_table[i].field_format; 8587 if (parse_long_name) { 8588 size_t length = KMP_STRLEN(long_name); 8589 if (strncmp(*ptr, long_name, length) == 0) { 8590 found_valid_name = true; 8591 (*ptr) += length; // skip the long name 8592 } 8593 } else if (**ptr == short_name) { 8594 found_valid_name = true; 8595 (*ptr)++; // skip the short name 8596 } 8597 if (found_valid_name) { 8598 format[format_index++] = field_format; 8599 format[format_index++] = '\0'; 8600 absolute_short_name = short_name; 8601 break; 8602 } 8603 } 8604 if (parse_long_name) { 8605 if (**ptr != '}') { 8606 absolute_short_name = 0; 8607 } else { 8608 (*ptr)++; // skip over the right brace 8609 } 8610 } 8611 8612 // Attempt to fill the buffer with the requested 8613 // value using snprintf within __kmp_str_buf_print() 8614 switch (absolute_short_name) { 8615 case 't': 8616 rc = __kmp_str_buf_print(field_buffer, format, __kmp_aux_get_team_num()); 8617 break; 8618 case 'T': 8619 rc = __kmp_str_buf_print(field_buffer, format, __kmp_aux_get_num_teams()); 8620 break; 8621 case 'L': 8622 rc = __kmp_str_buf_print(field_buffer, format, th->th.th_team->t.t_level); 8623 break; 8624 case 'n': 8625 rc = __kmp_str_buf_print(field_buffer, format, __kmp_tid_from_gtid(gtid)); 8626 break; 8627 case 'H': { 8628 static const int BUFFER_SIZE = 256; 8629 char buf[BUFFER_SIZE]; 8630 __kmp_expand_host_name(buf, BUFFER_SIZE); 8631 rc = __kmp_str_buf_print(field_buffer, format, buf); 8632 } break; 8633 case 'P': 8634 rc = __kmp_str_buf_print(field_buffer, format, getpid()); 8635 break; 8636 case 'i': 8637 rc = __kmp_str_buf_print(field_buffer, format, __kmp_gettid()); 8638 break; 8639 case 'N': 8640 rc = __kmp_str_buf_print(field_buffer, format, th->th.th_team->t.t_nproc); 8641 break; 8642 case 'a': 8643 field_value = 8644 __kmp_get_ancestor_thread_num(gtid, th->th.th_team->t.t_level - 1); 8645 rc = __kmp_str_buf_print(field_buffer, format, field_value); 8646 break; 8647 #if KMP_AFFINITY_SUPPORTED 8648 case 'A': { 8649 kmp_str_buf_t buf; 8650 __kmp_str_buf_init(&buf); 8651 __kmp_affinity_str_buf_mask(&buf, th->th.th_affin_mask); 8652 rc = __kmp_str_buf_print(field_buffer, format, buf.str); 8653 __kmp_str_buf_free(&buf); 8654 } break; 8655 #endif 8656 default: 8657 // According to spec, If an implementation does not have info for field 8658 // type, then "undefined" is printed 8659 rc = __kmp_str_buf_print(field_buffer, "%s", "undefined"); 8660 // Skip the field 8661 if (parse_long_name) { 8662 SKIP_TOKEN(*ptr); 8663 if (**ptr == '}') 8664 (*ptr)++; 8665 } else { 8666 (*ptr)++; 8667 } 8668 } 8669 8670 KMP_ASSERT(format_index <= FORMAT_SIZE); 8671 return rc; 8672 } 8673 8674 /* 8675 * Return number of characters needed to hold the affinity string 8676 * (not including null byte character) 8677 * The resultant string is printed to buffer, 
which the caller can then 8678 * handle afterwards 8679 */ 8680 size_t __kmp_aux_capture_affinity(int gtid, const char *format, 8681 kmp_str_buf_t *buffer) { 8682 const char *parse_ptr; 8683 size_t retval; 8684 const kmp_info_t *th; 8685 kmp_str_buf_t field; 8686 8687 KMP_DEBUG_ASSERT(buffer); 8688 KMP_DEBUG_ASSERT(gtid >= 0); 8689 8690 __kmp_str_buf_init(&field); 8691 __kmp_str_buf_clear(buffer); 8692 8693 th = __kmp_threads[gtid]; 8694 retval = 0; 8695 8696 // If format is NULL or zero-length string, then we use 8697 // affinity-format-var ICV 8698 parse_ptr = format; 8699 if (parse_ptr == NULL || *parse_ptr == '\0') { 8700 parse_ptr = __kmp_affinity_format; 8701 } 8702 KMP_DEBUG_ASSERT(parse_ptr); 8703 8704 while (*parse_ptr != '\0') { 8705 // Parse a field 8706 if (*parse_ptr == '%') { 8707 // Put field in the buffer 8708 int rc = __kmp_aux_capture_affinity_field(gtid, th, &parse_ptr, &field); 8709 __kmp_str_buf_catbuf(buffer, &field); 8710 retval += rc; 8711 } else { 8712 // Put literal character in buffer 8713 __kmp_str_buf_cat(buffer, parse_ptr, 1); 8714 retval++; 8715 parse_ptr++; 8716 } 8717 } 8718 __kmp_str_buf_free(&field); 8719 return retval; 8720 } 8721 8722 // Displays the affinity string to stdout 8723 void __kmp_aux_display_affinity(int gtid, const char *format) { 8724 kmp_str_buf_t buf; 8725 __kmp_str_buf_init(&buf); 8726 __kmp_aux_capture_affinity(gtid, format, &buf); 8727 __kmp_fprintf(kmp_out, "%s" KMP_END_OF_LINE, buf.str); 8728 __kmp_str_buf_free(&buf); 8729 } 8730 8731 /* ------------------------------------------------------------------------ */ 8732 8733 void __kmp_aux_set_blocktime(int arg, kmp_info_t *thread, int tid) { 8734 int blocktime = arg; /* argument is in milliseconds */ 8735 #if KMP_USE_MONITOR 8736 int bt_intervals; 8737 #endif 8738 kmp_int8 bt_set; 8739 8740 __kmp_save_internal_controls(thread); 8741 8742 /* Normalize and set blocktime for the teams */ 8743 if (blocktime < KMP_MIN_BLOCKTIME) 8744 blocktime = KMP_MIN_BLOCKTIME; 8745 else if (blocktime > KMP_MAX_BLOCKTIME) 8746 blocktime = KMP_MAX_BLOCKTIME; 8747 8748 set__blocktime_team(thread->th.th_team, tid, blocktime); 8749 set__blocktime_team(thread->th.th_serial_team, 0, blocktime); 8750 8751 #if KMP_USE_MONITOR 8752 /* Calculate and set blocktime intervals for the teams */ 8753 bt_intervals = KMP_INTERVALS_FROM_BLOCKTIME(blocktime, __kmp_monitor_wakeups); 8754 8755 set__bt_intervals_team(thread->th.th_team, tid, bt_intervals); 8756 set__bt_intervals_team(thread->th.th_serial_team, 0, bt_intervals); 8757 #endif 8758 8759 /* Set whether blocktime has been set to "TRUE" */ 8760 bt_set = TRUE; 8761 8762 set__bt_set_team(thread->th.th_team, tid, bt_set); 8763 set__bt_set_team(thread->th.th_serial_team, 0, bt_set); 8764 #if KMP_USE_MONITOR 8765 KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d, " 8766 "bt_intervals=%d, monitor_updates=%d\n", 8767 __kmp_gtid_from_tid(tid, thread->th.th_team), 8768 thread->th.th_team->t.t_id, tid, blocktime, bt_intervals, 8769 __kmp_monitor_wakeups)); 8770 #else 8771 KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d\n", 8772 __kmp_gtid_from_tid(tid, thread->th.th_team), 8773 thread->th.th_team->t.t_id, tid, blocktime)); 8774 #endif 8775 } 8776 8777 void __kmp_aux_set_defaults(char const *str, size_t len) { 8778 if (!__kmp_init_serial) { 8779 __kmp_serial_initialize(); 8780 } 8781 __kmp_env_initialize(str); 8782 8783 if (__kmp_settings || __kmp_display_env || __kmp_display_env_verbose) { 8784 __kmp_env_print(); 8785 } 8786 } // __kmp_aux_set_defaults 
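// Illustrative user-side sketch (not part of the runtime and not compiled
// here): the helpers above are reached through the OpenMP 5.0 affinity-format
// API and the KMP extensions roughly as follows. The format string and its
// output are examples only; field names are the long names from
// __kmp_affinity_format_table.
#if 0
#include <omp.h>
#include <stdio.h>

int main(void) {
  // "0" pads with zeros and "." right-justifies within the minimum width.
  omp_set_affinity_format(
      "OMP: pid %{process_id} thread %0.3{thread_num} of %{num_threads} "
      "on %{host}, bound to %{thread_affinity}");

  // KMP extension; routed (roughly) through __kmp_aux_set_blocktime().
  kmp_set_blocktime(0);

#pragma omp parallel
  {
    char buf[256];
    // Fills buf via __kmp_aux_capture_affinity() and returns the number of
    // characters the full string needs (excluding the terminating NUL).
    size_t n = omp_capture_affinity(buf, sizeof(buf), NULL);
    if (n < sizeof(buf))
      printf("%s\n", buf);
    omp_display_affinity(NULL); // same string, printed by the runtime
  }
  return 0;
}
#endif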
8787
8788 /* -------------------------------------------------------------------------- */
8789 /* internal fast reduction routines */
8790
8791 PACKED_REDUCTION_METHOD_T
8792 __kmp_determine_reduction_method(
8793 ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars, size_t reduce_size,
8794 void *reduce_data, void (*reduce_func)(void *lhs_data, void *rhs_data),
8795 kmp_critical_name *lck) {
8796
8797 // Default reduction method: critical construct ( lck != NULL, like in current
8798 // PAROPT )
8799 // If ( reduce_data!=NULL && reduce_func!=NULL ): the tree-reduction method
8800 // can be selected by RTL
8801 // If loc->flags contains KMP_IDENT_ATOMIC_REDUCE, the atomic reduce method
8802 // can be selected by RTL
8803 // Finally, it's up to OpenMP RTL to make a decision on which method to select
8804 // among generated by PAROPT.
8805
8806 PACKED_REDUCTION_METHOD_T retval;
8807
8808 int team_size;
8809
8810 KMP_DEBUG_ASSERT(lck); // it would be nice to test ( lck != 0 )
8811
8812 #define FAST_REDUCTION_ATOMIC_METHOD_GENERATED \
8813 (loc && \
8814 ((loc->flags & (KMP_IDENT_ATOMIC_REDUCE)) == (KMP_IDENT_ATOMIC_REDUCE)))
8815 #define FAST_REDUCTION_TREE_METHOD_GENERATED ((reduce_data) && (reduce_func))
8816
8817 retval = critical_reduce_block;
8818
8819 // another choice of getting a team size (with 1 dynamic dereference) is slower
8820 team_size = __kmp_get_team_num_threads(global_tid);
8821 if (team_size == 1) {
8822
8823 retval = empty_reduce_block;
8824
8825 } else {
8826
8827 int atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;
8828
8829 #if KMP_ARCH_X86_64 || KMP_ARCH_PPC64 || KMP_ARCH_AARCH64 || \
8830 KMP_ARCH_MIPS64 || KMP_ARCH_RISCV64 || KMP_ARCH_LOONGARCH64
8831
8832 #if KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD || \
8833 KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN || KMP_OS_HURD
8834
8835 int teamsize_cutoff = 4;
8836
8837 #if KMP_MIC_SUPPORTED
8838 if (__kmp_mic_type != non_mic) {
8839 teamsize_cutoff = 8;
8840 }
8841 #endif
8842 int tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
8843 if (tree_available) {
8844 if (team_size <= teamsize_cutoff) {
8845 if (atomic_available) {
8846 retval = atomic_reduce_block;
8847 }
8848 } else {
8849 retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER;
8850 }
8851 } else if (atomic_available) {
8852 retval = atomic_reduce_block;
8853 }
8854 #else
8855 #error "Unknown or unsupported OS"
8856 #endif // KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD ||
8857 // KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN || KMP_OS_HURD
8858
8859 #elif KMP_ARCH_X86 || KMP_ARCH_ARM || KMP_ARCH_AARCH || KMP_ARCH_MIPS
8860
8861 #if KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_WINDOWS || KMP_OS_HURD
8862
8863 // basic tuning
8864
8865 if (atomic_available) {
8866 if (num_vars <= 2) { // && ( team_size <= 8 ) due to false-sharing ???
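// (Illustrative reading of this branch: for the 32-bit architectures handled
// here, a reduction of one or two variables takes the atomic path below, and
// anything larger keeps the critical-section default chosen above.)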
8867 retval = atomic_reduce_block; 8868 } 8869 } // otherwise: use critical section 8870 8871 #elif KMP_OS_DARWIN 8872 8873 int tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED; 8874 if (atomic_available && (num_vars <= 3)) { 8875 retval = atomic_reduce_block; 8876 } else if (tree_available) { 8877 if ((reduce_size > (9 * sizeof(kmp_real64))) && 8878 (reduce_size < (2000 * sizeof(kmp_real64)))) { 8879 retval = TREE_REDUCE_BLOCK_WITH_PLAIN_BARRIER; 8880 } 8881 } // otherwise: use critical section 8882 8883 #else 8884 #error "Unknown or unsupported OS" 8885 #endif 8886 8887 #else 8888 #error "Unknown or unsupported architecture" 8889 #endif 8890 } 8891 8892 // KMP_FORCE_REDUCTION 8893 8894 // If the team is serialized (team_size == 1), ignore the forced reduction 8895 // method and stay with the unsynchronized method (empty_reduce_block) 8896 if (__kmp_force_reduction_method != reduction_method_not_defined && 8897 team_size != 1) { 8898 8899 PACKED_REDUCTION_METHOD_T forced_retval = critical_reduce_block; 8900 8901 int atomic_available, tree_available; 8902 8903 switch ((forced_retval = __kmp_force_reduction_method)) { 8904 case critical_reduce_block: 8905 KMP_ASSERT(lck); // lck should be != 0 8906 break; 8907 8908 case atomic_reduce_block: 8909 atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED; 8910 if (!atomic_available) { 8911 KMP_WARNING(RedMethodNotSupported, "atomic"); 8912 forced_retval = critical_reduce_block; 8913 } 8914 break; 8915 8916 case tree_reduce_block: 8917 tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED; 8918 if (!tree_available) { 8919 KMP_WARNING(RedMethodNotSupported, "tree"); 8920 forced_retval = critical_reduce_block; 8921 } else { 8922 #if KMP_FAST_REDUCTION_BARRIER 8923 forced_retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER; 8924 #endif 8925 } 8926 break; 8927 8928 default: 8929 KMP_ASSERT(0); // "unsupported method specified" 8930 } 8931 8932 retval = forced_retval; 8933 } 8934 8935 KA_TRACE(10, ("reduction method selected=%08x\n", retval)); 8936 8937 #undef FAST_REDUCTION_TREE_METHOD_GENERATED 8938 #undef FAST_REDUCTION_ATOMIC_METHOD_GENERATED 8939 8940 return (retval); 8941 } 8942 // this function is for testing set/get/determine reduce method 8943 kmp_int32 __kmp_get_reduce_method(void) { 8944 return ((__kmp_entry_thread()->th.th_local.packed_reduction_method) >> 8); 8945 } 8946 8947 // Soft pause sets up threads to ignore blocktime and just go to sleep. 8948 // Spin-wait code checks __kmp_pause_status and reacts accordingly. 8949 void __kmp_soft_pause() { __kmp_pause_status = kmp_soft_paused; } 8950 8951 // Hard pause shuts down the runtime completely. Resume happens naturally when 8952 // OpenMP is used subsequently. 8953 void __kmp_hard_pause() { 8954 __kmp_pause_status = kmp_hard_paused; 8955 __kmp_internal_end_thread(-1); 8956 } 8957 8958 // Soft resume sets __kmp_pause_status, and wakes up all threads. 
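// (A soft pause is typically requested from user code through the OpenMP 5.0
// pause API, e.g. omp_pause_resource_all(omp_pause_soft), which reaches
// __kmp_soft_pause() via __kmpc_pause_resource(); the function below then
// resumes the threads lazily on the next fork. Illustrative note only.)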
8959 void __kmp_resume_if_soft_paused() { 8960 if (__kmp_pause_status == kmp_soft_paused) { 8961 __kmp_pause_status = kmp_not_paused; 8962 8963 for (int gtid = 1; gtid < __kmp_threads_capacity; ++gtid) { 8964 kmp_info_t *thread = __kmp_threads[gtid]; 8965 if (thread) { // Wake it if sleeping 8966 kmp_flag_64<> fl(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go, 8967 thread); 8968 if (fl.is_sleeping()) 8969 fl.resume(gtid); 8970 else if (__kmp_try_suspend_mx(thread)) { // got suspend lock 8971 __kmp_unlock_suspend_mx(thread); // unlock it; it won't sleep 8972 } else { // thread holds the lock and may sleep soon 8973 do { // until either the thread sleeps, or we can get the lock 8974 if (fl.is_sleeping()) { 8975 fl.resume(gtid); 8976 break; 8977 } else if (__kmp_try_suspend_mx(thread)) { 8978 __kmp_unlock_suspend_mx(thread); 8979 break; 8980 } 8981 } while (1); 8982 } 8983 } 8984 } 8985 } 8986 } 8987 8988 // This function is called via __kmpc_pause_resource. Returns 0 if successful. 8989 // TODO: add warning messages 8990 int __kmp_pause_resource(kmp_pause_status_t level) { 8991 if (level == kmp_not_paused) { // requesting resume 8992 if (__kmp_pause_status == kmp_not_paused) { 8993 // error message about runtime not being paused, so can't resume 8994 return 1; 8995 } else { 8996 KMP_DEBUG_ASSERT(__kmp_pause_status == kmp_soft_paused || 8997 __kmp_pause_status == kmp_hard_paused); 8998 __kmp_pause_status = kmp_not_paused; 8999 return 0; 9000 } 9001 } else if (level == kmp_soft_paused) { // requesting soft pause 9002 if (__kmp_pause_status != kmp_not_paused) { 9003 // error message about already being paused 9004 return 1; 9005 } else { 9006 __kmp_soft_pause(); 9007 return 0; 9008 } 9009 } else if (level == kmp_hard_paused) { // requesting hard pause 9010 if (__kmp_pause_status != kmp_not_paused) { 9011 // error message about already being paused 9012 return 1; 9013 } else { 9014 __kmp_hard_pause(); 9015 return 0; 9016 } 9017 } else { 9018 // error message about invalid level 9019 return 1; 9020 } 9021 } 9022 9023 void __kmp_omp_display_env(int verbose) { 9024 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock); 9025 if (__kmp_init_serial == 0) 9026 __kmp_do_serial_initialize(); 9027 __kmp_display_env_impl(!verbose, verbose); 9028 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 9029 } 9030 9031 // The team size is changing, so distributed barrier must be modified 9032 void __kmp_resize_dist_barrier(kmp_team_t *team, int old_nthreads, 9033 int new_nthreads) { 9034 KMP_DEBUG_ASSERT(__kmp_barrier_release_pattern[bs_forkjoin_barrier] == 9035 bp_dist_bar); 9036 kmp_info_t **other_threads = team->t.t_threads; 9037 9038 // We want all the workers to stop waiting on the barrier while we adjust the 9039 // size of the team. 
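// th_used_in_team acts as a small state machine here: 0 = not part of the
// team, 1 = in the team, 2 = asked to leave (the worker resets it to 0), and
// 3 = asked to (re)join (the worker sets it to 1 once it is back in).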
9040 for (int f = 1; f < old_nthreads; ++f) { 9041 KMP_DEBUG_ASSERT(other_threads[f] != NULL); 9042 // Ignore threads that are already inactive or not present in the team 9043 if (team->t.t_threads[f]->th.th_used_in_team.load() == 0) { 9044 // teams construct causes thread_limit to get passed in, and some of 9045 // those could be inactive; just ignore them 9046 continue; 9047 } 9048 // If thread is transitioning still to in_use state, wait for it 9049 if (team->t.t_threads[f]->th.th_used_in_team.load() == 3) { 9050 while (team->t.t_threads[f]->th.th_used_in_team.load() == 3) 9051 KMP_CPU_PAUSE(); 9052 } 9053 // The thread should be in_use now 9054 KMP_DEBUG_ASSERT(team->t.t_threads[f]->th.th_used_in_team.load() == 1); 9055 // Transition to unused state 9056 team->t.t_threads[f]->th.th_used_in_team.store(2); 9057 KMP_DEBUG_ASSERT(team->t.t_threads[f]->th.th_used_in_team.load() == 2); 9058 } 9059 // Release all the workers 9060 team->t.b->go_release(); 9061 9062 KMP_MFENCE(); 9063 9064 // Workers should see transition status 2 and move to 0; but may need to be 9065 // woken up first 9066 int count = old_nthreads - 1; 9067 while (count > 0) { 9068 count = old_nthreads - 1; 9069 for (int f = 1; f < old_nthreads; ++f) { 9070 if (other_threads[f]->th.th_used_in_team.load() != 0) { 9071 if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) { // Wake up the workers 9072 kmp_atomic_flag_64<> *flag = (kmp_atomic_flag_64<> *)CCAST( 9073 void *, other_threads[f]->th.th_sleep_loc); 9074 __kmp_atomic_resume_64(other_threads[f]->th.th_info.ds.ds_gtid, flag); 9075 } 9076 } else { 9077 KMP_DEBUG_ASSERT(team->t.t_threads[f]->th.th_used_in_team.load() == 0); 9078 count--; 9079 } 9080 } 9081 } 9082 // Now update the barrier size 9083 team->t.b->update_num_threads(new_nthreads); 9084 team->t.b->go_reset(); 9085 } 9086 9087 void __kmp_add_threads_to_team(kmp_team_t *team, int new_nthreads) { 9088 // Add the threads back to the team 9089 KMP_DEBUG_ASSERT(team); 9090 // Threads were paused and pointed at th_used_in_team temporarily during a 9091 // resize of the team. We're going to set th_used_in_team to 3 to indicate to 9092 // the thread that it should transition itself back into the team. Then, if 9093 // blocktime isn't infinite, the thread could be sleeping, so we send a resume 9094 // to wake it up. 9095 for (int f = 1; f < new_nthreads; ++f) { 9096 KMP_DEBUG_ASSERT(team->t.t_threads[f]); 9097 KMP_COMPARE_AND_STORE_ACQ32(&(team->t.t_threads[f]->th.th_used_in_team), 0, 9098 3); 9099 if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) { // Wake up sleeping threads 9100 __kmp_resume_32(team->t.t_threads[f]->th.th_info.ds.ds_gtid, 9101 (kmp_flag_32<false, false> *)NULL); 9102 } 9103 } 9104 // The threads should be transitioning to the team; when they are done, they 9105 // should have set th_used_in_team to 1. This loop forces master to wait until 9106 // all threads have moved into the team and are waiting in the barrier. 
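// (The wait below is a plain spin: count is recomputed on every pass and only
// reaches zero once every worker has published th_used_in_team == 1.)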
9107 int count = new_nthreads - 1; 9108 while (count > 0) { 9109 count = new_nthreads - 1; 9110 for (int f = 1; f < new_nthreads; ++f) { 9111 if (team->t.t_threads[f]->th.th_used_in_team.load() == 1) { 9112 count--; 9113 } 9114 } 9115 } 9116 } 9117 9118 // Globals and functions for hidden helper task 9119 kmp_info_t **__kmp_hidden_helper_threads; 9120 kmp_info_t *__kmp_hidden_helper_main_thread; 9121 std::atomic<kmp_int32> __kmp_unexecuted_hidden_helper_tasks; 9122 #if KMP_OS_LINUX 9123 kmp_int32 __kmp_hidden_helper_threads_num = 8; 9124 kmp_int32 __kmp_enable_hidden_helper = TRUE; 9125 #else 9126 kmp_int32 __kmp_hidden_helper_threads_num = 0; 9127 kmp_int32 __kmp_enable_hidden_helper = FALSE; 9128 #endif 9129 9130 namespace { 9131 std::atomic<kmp_int32> __kmp_hit_hidden_helper_threads_num; 9132 9133 void __kmp_hidden_helper_wrapper_fn(int *gtid, int *, ...) { 9134 // This is an explicit synchronization on all hidden helper threads in case 9135 // that when a regular thread pushes a hidden helper task to one hidden 9136 // helper thread, the thread has not been awaken once since they're released 9137 // by the main thread after creating the team. 9138 KMP_ATOMIC_INC(&__kmp_hit_hidden_helper_threads_num); 9139 while (KMP_ATOMIC_LD_ACQ(&__kmp_hit_hidden_helper_threads_num) != 9140 __kmp_hidden_helper_threads_num) 9141 ; 9142 9143 // If main thread, then wait for signal 9144 if (__kmpc_master(nullptr, *gtid)) { 9145 // First, unset the initial state and release the initial thread 9146 TCW_4(__kmp_init_hidden_helper_threads, FALSE); 9147 __kmp_hidden_helper_initz_release(); 9148 __kmp_hidden_helper_main_thread_wait(); 9149 // Now wake up all worker threads 9150 for (int i = 1; i < __kmp_hit_hidden_helper_threads_num; ++i) { 9151 __kmp_hidden_helper_worker_thread_signal(); 9152 } 9153 } 9154 } 9155 } // namespace 9156 9157 void __kmp_hidden_helper_threads_initz_routine() { 9158 // Create a new root for hidden helper team/threads 9159 const int gtid = __kmp_register_root(TRUE); 9160 __kmp_hidden_helper_main_thread = __kmp_threads[gtid]; 9161 __kmp_hidden_helper_threads = &__kmp_threads[gtid]; 9162 __kmp_hidden_helper_main_thread->th.th_set_nproc = 9163 __kmp_hidden_helper_threads_num; 9164 9165 KMP_ATOMIC_ST_REL(&__kmp_hit_hidden_helper_threads_num, 0); 9166 9167 __kmpc_fork_call(nullptr, 0, __kmp_hidden_helper_wrapper_fn); 9168 9169 // Set the initialization flag to FALSE 9170 TCW_SYNC_4(__kmp_init_hidden_helper, FALSE); 9171 9172 __kmp_hidden_helper_threads_deinitz_release(); 9173 } 9174 9175 /* Nesting Mode: 9176 Set via KMP_NESTING_MODE, which takes an integer. 9177 Note: we skip duplicate topology levels, and skip levels with only 9178 one entity. 9179 KMP_NESTING_MODE=0 is the default, and doesn't use nesting mode. 9180 KMP_NESTING_MODE=1 sets as many nesting levels as there are distinct levels 9181 in the topology, and initializes the number of threads at each of those 9182 levels to the number of entities at each level, respectively, below the 9183 entity at the parent level. 9184 KMP_NESTING_MODE=N, where N>1, attempts to create up to N nesting levels, 9185 but starts with nesting OFF -- max-active-levels-var is 1 -- and requires 9186 the user to turn nesting on explicitly. This is an even more experimental 9187 option to this experimental feature, and may change or go away in the 9188 future. 
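   For example (illustrative only): on a machine whose topology is detected as
   2 sockets x 8 cores x 2 hardware threads, KMP_NESTING_MODE=1 initializes the
   per-level thread counts to 2, 8 and 2, so three nested parallel regions end
   up with one thread per hardware thread; a level with only one entity (for
   instance a single socket) is simply skipped.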
9189 */ 9190 9191 // Allocate space to store nesting levels 9192 void __kmp_init_nesting_mode() { 9193 int levels = KMP_HW_LAST; 9194 __kmp_nesting_mode_nlevels = levels; 9195 __kmp_nesting_nth_level = (int *)KMP_INTERNAL_MALLOC(levels * sizeof(int)); 9196 for (int i = 0; i < levels; ++i) 9197 __kmp_nesting_nth_level[i] = 0; 9198 if (__kmp_nested_nth.size < levels) { 9199 __kmp_nested_nth.nth = 9200 (int *)KMP_INTERNAL_REALLOC(__kmp_nested_nth.nth, levels * sizeof(int)); 9201 __kmp_nested_nth.size = levels; 9202 } 9203 } 9204 9205 // Set # threads for top levels of nesting; must be called after topology set 9206 void __kmp_set_nesting_mode_threads() { 9207 kmp_info_t *thread = __kmp_threads[__kmp_entry_gtid()]; 9208 9209 if (__kmp_nesting_mode == 1) 9210 __kmp_nesting_mode_nlevels = KMP_MAX_ACTIVE_LEVELS_LIMIT; 9211 else if (__kmp_nesting_mode > 1) 9212 __kmp_nesting_mode_nlevels = __kmp_nesting_mode; 9213 9214 if (__kmp_topology) { // use topology info 9215 int loc, hw_level; 9216 for (loc = 0, hw_level = 0; hw_level < __kmp_topology->get_depth() && 9217 loc < __kmp_nesting_mode_nlevels; 9218 loc++, hw_level++) { 9219 __kmp_nesting_nth_level[loc] = __kmp_topology->get_ratio(hw_level); 9220 if (__kmp_nesting_nth_level[loc] == 1) 9221 loc--; 9222 } 9223 // Make sure all cores are used 9224 if (__kmp_nesting_mode > 1 && loc > 1) { 9225 int core_level = __kmp_topology->get_level(KMP_HW_CORE); 9226 int num_cores = __kmp_topology->get_count(core_level); 9227 int upper_levels = 1; 9228 for (int level = 0; level < loc - 1; ++level) 9229 upper_levels *= __kmp_nesting_nth_level[level]; 9230 if (upper_levels * __kmp_nesting_nth_level[loc - 1] < num_cores) 9231 __kmp_nesting_nth_level[loc - 1] = 9232 num_cores / __kmp_nesting_nth_level[loc - 2]; 9233 } 9234 __kmp_nesting_mode_nlevels = loc; 9235 __kmp_nested_nth.used = __kmp_nesting_mode_nlevels; 9236 } else { // no topology info available; provide a reasonable guesstimation 9237 if (__kmp_avail_proc >= 4) { 9238 __kmp_nesting_nth_level[0] = __kmp_avail_proc / 2; 9239 __kmp_nesting_nth_level[1] = 2; 9240 __kmp_nesting_mode_nlevels = 2; 9241 } else { 9242 __kmp_nesting_nth_level[0] = __kmp_avail_proc; 9243 __kmp_nesting_mode_nlevels = 1; 9244 } 9245 __kmp_nested_nth.used = __kmp_nesting_mode_nlevels; 9246 } 9247 for (int i = 0; i < __kmp_nesting_mode_nlevels; ++i) { 9248 __kmp_nested_nth.nth[i] = __kmp_nesting_nth_level[i]; 9249 } 9250 set__nproc(thread, __kmp_nesting_nth_level[0]); 9251 if (__kmp_nesting_mode > 1 && __kmp_nesting_mode_nlevels > __kmp_nesting_mode) 9252 __kmp_nesting_mode_nlevels = __kmp_nesting_mode; 9253 if (get__max_active_levels(thread) > 1) { 9254 // if max levels was set, set nesting mode levels to same 9255 __kmp_nesting_mode_nlevels = get__max_active_levels(thread); 9256 } 9257 if (__kmp_nesting_mode == 1) // turn on nesting for this case only 9258 set__max_active_levels(thread, __kmp_nesting_mode_nlevels); 9259 } 9260 9261 // Empty symbols to export (see exports_so.txt) when feature is disabled 9262 extern "C" { 9263 #if !KMP_STATS_ENABLED 9264 void __kmp_reset_stats() {} 9265 #endif 9266 #if !USE_DEBUGGER 9267 int __kmp_omp_debug_struct_info = FALSE; 9268 int __kmp_debugging = FALSE; 9269 #endif 9270 #if !USE_ITT_BUILD || !USE_ITT_NOTIFY 9271 void __kmp_itt_fini_ittlib() {} 9272 void __kmp_itt_init_ittlib() {} 9273 #endif 9274 } 9275 9276 // end of file 9277