1 /* 2 * kmp_runtime.cpp -- KPTS runtime support library 3 */ 4 5 //===----------------------------------------------------------------------===// 6 // 7 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 8 // See https://llvm.org/LICENSE.txt for license information. 9 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 10 // 11 //===----------------------------------------------------------------------===// 12 13 #include "kmp.h" 14 #include "kmp_affinity.h" 15 #include "kmp_atomic.h" 16 #include "kmp_environment.h" 17 #include "kmp_error.h" 18 #include "kmp_i18n.h" 19 #include "kmp_io.h" 20 #include "kmp_itt.h" 21 #include "kmp_settings.h" 22 #include "kmp_stats.h" 23 #include "kmp_str.h" 24 #include "kmp_wait_release.h" 25 #include "kmp_wrapper_getpid.h" 26 #include "kmp_dispatch.h" 27 #if KMP_USE_HIER_SCHED 28 #include "kmp_dispatch_hier.h" 29 #endif 30 31 #if OMPT_SUPPORT 32 #include "ompt-specific.h" 33 #endif 34 #if OMPD_SUPPORT 35 #include "ompd-specific.h" 36 #endif 37 38 #if OMP_PROFILING_SUPPORT 39 #include "llvm/Support/TimeProfiler.h" 40 static char *ProfileTraceFile = nullptr; 41 #endif 42 43 /* these are temporary issues to be dealt with */ 44 #define KMP_USE_PRCTL 0 45 46 #if KMP_OS_WINDOWS 47 #include <process.h> 48 #endif 49 50 #if KMP_OS_WINDOWS 51 // windows does not need include files as it doesn't use shared memory 52 #else 53 #include <sys/mman.h> 54 #include <sys/stat.h> 55 #include <fcntl.h> 56 #define SHM_SIZE 1024 57 #endif 58 59 #if defined(KMP_GOMP_COMPAT) 60 char const __kmp_version_alt_comp[] = 61 KMP_VERSION_PREFIX "alternative compiler support: yes"; 62 #endif /* defined(KMP_GOMP_COMPAT) */ 63 64 char const __kmp_version_omp_api[] = 65 KMP_VERSION_PREFIX "API version: 5.0 (201611)"; 66 67 #ifdef KMP_DEBUG 68 char const __kmp_version_lock[] = 69 KMP_VERSION_PREFIX "lock type: run time selectable"; 70 #endif /* KMP_DEBUG */ 71 72 #define KMP_MIN(x, y) ((x) < (y) ? (x) : (y)) 73 74 /* ------------------------------------------------------------------------ */ 75 76 #if KMP_USE_MONITOR 77 kmp_info_t __kmp_monitor; 78 #endif 79 80 /* Forward declarations */ 81 82 void __kmp_cleanup(void); 83 84 static void __kmp_initialize_info(kmp_info_t *, kmp_team_t *, int tid, 85 int gtid); 86 static void __kmp_initialize_team(kmp_team_t *team, int new_nproc, 87 kmp_internal_control_t *new_icvs, 88 ident_t *loc); 89 #if KMP_AFFINITY_SUPPORTED 90 static void __kmp_partition_places(kmp_team_t *team, 91 int update_master_only = 0); 92 #endif 93 static void __kmp_do_serial_initialize(void); 94 void __kmp_fork_barrier(int gtid, int tid); 95 void __kmp_join_barrier(int gtid); 96 void __kmp_setup_icv_copy(kmp_team_t *team, int new_nproc, 97 kmp_internal_control_t *new_icvs, ident_t *loc); 98 99 #ifdef USE_LOAD_BALANCE 100 static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc); 101 #endif 102 103 static int __kmp_expand_threads(int nNeed); 104 #if KMP_OS_WINDOWS 105 static int __kmp_unregister_root_other_thread(int gtid); 106 #endif 107 static void __kmp_reap_thread(kmp_info_t *thread, int is_root); 108 kmp_info_t *__kmp_thread_pool_insert_pt = NULL; 109 110 /* Calculate the identifier of the current thread */ 111 /* fast (and somewhat portable) way to get unique identifier of executing 112 thread. Returns KMP_GTID_DNE if we haven't been assigned a gtid. 
*/ 113 int __kmp_get_global_thread_id() { 114 int i; 115 kmp_info_t **other_threads; 116 size_t stack_data; 117 char *stack_addr; 118 size_t stack_size; 119 char *stack_base; 120 121 KA_TRACE( 122 1000, 123 ("*** __kmp_get_global_thread_id: entering, nproc=%d all_nproc=%d\n", 124 __kmp_nth, __kmp_all_nth)); 125 126 /* JPH - to handle the case where __kmpc_end(0) is called immediately prior to 127 a parallel region, made it return KMP_GTID_DNE to force serial_initialize 128 by caller. Had to handle KMP_GTID_DNE at all call-sites, or else guarantee 129 __kmp_init_gtid for this to work. */ 130 131 if (!TCR_4(__kmp_init_gtid)) 132 return KMP_GTID_DNE; 133 134 #ifdef KMP_TDATA_GTID 135 if (TCR_4(__kmp_gtid_mode) >= 3) { 136 KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using TDATA\n")); 137 return __kmp_gtid; 138 } 139 #endif 140 if (TCR_4(__kmp_gtid_mode) >= 2) { 141 KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using keyed TLS\n")); 142 return __kmp_gtid_get_specific(); 143 } 144 KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using internal alg.\n")); 145 146 stack_addr = (char *)&stack_data; 147 other_threads = __kmp_threads; 148 149 /* ATT: The code below is a source of potential bugs due to unsynchronized 150 access to __kmp_threads array. For example: 151 1. Current thread loads other_threads[i] to thr and checks it, it is 152 non-NULL. 153 2. Current thread is suspended by OS. 154 3. Another thread unregisters and finishes (debug versions of free() 155 may fill memory with something like 0xEF). 156 4. Current thread is resumed. 157 5. Current thread reads junk from *thr. 158 TODO: Fix it. --ln */ 159 160 for (i = 0; i < __kmp_threads_capacity; i++) { 161 162 kmp_info_t *thr = (kmp_info_t *)TCR_SYNC_PTR(other_threads[i]); 163 if (!thr) 164 continue; 165 166 stack_size = (size_t)TCR_PTR(thr->th.th_info.ds.ds_stacksize); 167 stack_base = (char *)TCR_PTR(thr->th.th_info.ds.ds_stackbase); 168 169 /* stack grows down -- search through all of the active threads */ 170 171 if (stack_addr <= stack_base) { 172 size_t stack_diff = stack_base - stack_addr; 173 174 if (stack_diff <= stack_size) { 175 /* The only way we can be closer than the allocated */ 176 /* stack size is if we are running on this thread. */ 177 KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() == i); 178 return i; 179 } 180 } 181 } 182 183 /* get specific to try and determine our gtid */ 184 KA_TRACE(1000, 185 ("*** __kmp_get_global_thread_id: internal alg. 
failed to find " 186 "thread, using TLS\n")); 187 i = __kmp_gtid_get_specific(); 188 189 /*fprintf( stderr, "=== %d\n", i ); */ /* GROO */ 190 191 /* if we havn't been assigned a gtid, then return code */ 192 if (i < 0) 193 return i; 194 195 /* dynamically updated stack window for uber threads to avoid get_specific 196 call */ 197 if (!TCR_4(other_threads[i]->th.th_info.ds.ds_stackgrow)) { 198 KMP_FATAL(StackOverflow, i); 199 } 200 201 stack_base = (char *)other_threads[i]->th.th_info.ds.ds_stackbase; 202 if (stack_addr > stack_base) { 203 TCW_PTR(other_threads[i]->th.th_info.ds.ds_stackbase, stack_addr); 204 TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize, 205 other_threads[i]->th.th_info.ds.ds_stacksize + stack_addr - 206 stack_base); 207 } else { 208 TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize, 209 stack_base - stack_addr); 210 } 211 212 /* Reprint stack bounds for ubermaster since they have been refined */ 213 if (__kmp_storage_map) { 214 char *stack_end = (char *)other_threads[i]->th.th_info.ds.ds_stackbase; 215 char *stack_beg = stack_end - other_threads[i]->th.th_info.ds.ds_stacksize; 216 __kmp_print_storage_map_gtid(i, stack_beg, stack_end, 217 other_threads[i]->th.th_info.ds.ds_stacksize, 218 "th_%d stack (refinement)", i); 219 } 220 return i; 221 } 222 223 int __kmp_get_global_thread_id_reg() { 224 int gtid; 225 226 if (!__kmp_init_serial) { 227 gtid = KMP_GTID_DNE; 228 } else 229 #ifdef KMP_TDATA_GTID 230 if (TCR_4(__kmp_gtid_mode) >= 3) { 231 KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using TDATA\n")); 232 gtid = __kmp_gtid; 233 } else 234 #endif 235 if (TCR_4(__kmp_gtid_mode) >= 2) { 236 KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using keyed TLS\n")); 237 gtid = __kmp_gtid_get_specific(); 238 } else { 239 KA_TRACE(1000, 240 ("*** __kmp_get_global_thread_id_reg: using internal alg.\n")); 241 gtid = __kmp_get_global_thread_id(); 242 } 243 244 /* we must be a new uber master sibling thread */ 245 if (gtid == KMP_GTID_DNE) { 246 KA_TRACE(10, 247 ("__kmp_get_global_thread_id_reg: Encountered new root thread. " 248 "Registering a new gtid.\n")); 249 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock); 250 if (!__kmp_init_serial) { 251 __kmp_do_serial_initialize(); 252 gtid = __kmp_gtid_get_specific(); 253 } else { 254 gtid = __kmp_register_root(FALSE); 255 } 256 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 257 /*__kmp_printf( "+++ %d\n", gtid ); */ /* GROO */ 258 } 259 260 KMP_DEBUG_ASSERT(gtid >= 0); 261 262 return gtid; 263 } 264 265 /* caller must hold forkjoin_lock */ 266 void __kmp_check_stack_overlap(kmp_info_t *th) { 267 int f; 268 char *stack_beg = NULL; 269 char *stack_end = NULL; 270 int gtid; 271 272 KA_TRACE(10, ("__kmp_check_stack_overlap: called\n")); 273 if (__kmp_storage_map) { 274 stack_end = (char *)th->th.th_info.ds.ds_stackbase; 275 stack_beg = stack_end - th->th.th_info.ds.ds_stacksize; 276 277 gtid = __kmp_gtid_from_thread(th); 278 279 if (gtid == KMP_GTID_MONITOR) { 280 __kmp_print_storage_map_gtid( 281 gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize, 282 "th_%s stack (%s)", "mon", 283 (th->th.th_info.ds.ds_stackgrow) ? "initial" : "actual"); 284 } else { 285 __kmp_print_storage_map_gtid( 286 gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize, 287 "th_%d stack (%s)", gtid, 288 (th->th.th_info.ds.ds_stackgrow) ? 
"initial" : "actual"); 289 } 290 } 291 292 /* No point in checking ubermaster threads since they use refinement and 293 * cannot overlap */ 294 gtid = __kmp_gtid_from_thread(th); 295 if (__kmp_env_checks == TRUE && !KMP_UBER_GTID(gtid)) { 296 KA_TRACE(10, 297 ("__kmp_check_stack_overlap: performing extensive checking\n")); 298 if (stack_beg == NULL) { 299 stack_end = (char *)th->th.th_info.ds.ds_stackbase; 300 stack_beg = stack_end - th->th.th_info.ds.ds_stacksize; 301 } 302 303 for (f = 0; f < __kmp_threads_capacity; f++) { 304 kmp_info_t *f_th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[f]); 305 306 if (f_th && f_th != th) { 307 char *other_stack_end = 308 (char *)TCR_PTR(f_th->th.th_info.ds.ds_stackbase); 309 char *other_stack_beg = 310 other_stack_end - (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize); 311 if ((stack_beg > other_stack_beg && stack_beg < other_stack_end) || 312 (stack_end > other_stack_beg && stack_end < other_stack_end)) { 313 314 /* Print the other stack values before the abort */ 315 if (__kmp_storage_map) 316 __kmp_print_storage_map_gtid( 317 -1, other_stack_beg, other_stack_end, 318 (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize), 319 "th_%d stack (overlapped)", __kmp_gtid_from_thread(f_th)); 320 321 __kmp_fatal(KMP_MSG(StackOverlap), KMP_HNT(ChangeStackLimit), 322 __kmp_msg_null); 323 } 324 } 325 } 326 } 327 KA_TRACE(10, ("__kmp_check_stack_overlap: returning\n")); 328 } 329 330 /* ------------------------------------------------------------------------ */ 331 332 void __kmp_infinite_loop(void) { 333 static int done = FALSE; 334 335 while (!done) { 336 KMP_YIELD(TRUE); 337 } 338 } 339 340 #define MAX_MESSAGE 512 341 342 void __kmp_print_storage_map_gtid(int gtid, void *p1, void *p2, size_t size, 343 char const *format, ...) { 344 char buffer[MAX_MESSAGE]; 345 va_list ap; 346 347 va_start(ap, format); 348 KMP_SNPRINTF(buffer, sizeof(buffer), "OMP storage map: %p %p%8lu %s\n", p1, 349 p2, (unsigned long)size, format); 350 __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock); 351 __kmp_vprintf(kmp_err, buffer, ap); 352 #if KMP_PRINT_DATA_PLACEMENT 353 int node; 354 if (gtid >= 0) { 355 if (p1 <= p2 && (char *)p2 - (char *)p1 == size) { 356 if (__kmp_storage_map_verbose) { 357 node = __kmp_get_host_node(p1); 358 if (node < 0) /* doesn't work, so don't try this next time */ 359 __kmp_storage_map_verbose = FALSE; 360 else { 361 char *last; 362 int lastNode; 363 int localProc = __kmp_get_cpu_from_gtid(gtid); 364 365 const int page_size = KMP_GET_PAGE_SIZE(); 366 367 p1 = (void *)((size_t)p1 & ~((size_t)page_size - 1)); 368 p2 = (void *)(((size_t)p2 - 1) & ~((size_t)page_size - 1)); 369 if (localProc >= 0) 370 __kmp_printf_no_lock(" GTID %d localNode %d\n", gtid, 371 localProc >> 1); 372 else 373 __kmp_printf_no_lock(" GTID %d\n", gtid); 374 #if KMP_USE_PRCTL 375 /* The more elaborate format is disabled for now because of the prctl 376 * hanging bug. */ 377 do { 378 last = p1; 379 lastNode = node; 380 /* This loop collates adjacent pages with the same host node. 
*/ 381 do { 382 (char *)p1 += page_size; 383 } while (p1 <= p2 && (node = __kmp_get_host_node(p1)) == lastNode); 384 __kmp_printf_no_lock(" %p-%p memNode %d\n", last, (char *)p1 - 1, 385 lastNode); 386 } while (p1 <= p2); 387 #else 388 __kmp_printf_no_lock(" %p-%p memNode %d\n", p1, 389 (char *)p1 + (page_size - 1), 390 __kmp_get_host_node(p1)); 391 if (p1 < p2) { 392 __kmp_printf_no_lock(" %p-%p memNode %d\n", p2, 393 (char *)p2 + (page_size - 1), 394 __kmp_get_host_node(p2)); 395 } 396 #endif 397 } 398 } 399 } else 400 __kmp_printf_no_lock(" %s\n", KMP_I18N_STR(StorageMapWarning)); 401 } 402 #endif /* KMP_PRINT_DATA_PLACEMENT */ 403 __kmp_release_bootstrap_lock(&__kmp_stdio_lock); 404 } 405 406 void __kmp_warn(char const *format, ...) { 407 char buffer[MAX_MESSAGE]; 408 va_list ap; 409 410 if (__kmp_generate_warnings == kmp_warnings_off) { 411 return; 412 } 413 414 va_start(ap, format); 415 416 KMP_SNPRINTF(buffer, sizeof(buffer), "OMP warning: %s\n", format); 417 __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock); 418 __kmp_vprintf(kmp_err, buffer, ap); 419 __kmp_release_bootstrap_lock(&__kmp_stdio_lock); 420 421 va_end(ap); 422 } 423 424 void __kmp_abort_process() { 425 // Later threads may stall here, but that's ok because abort() will kill them. 426 __kmp_acquire_bootstrap_lock(&__kmp_exit_lock); 427 428 if (__kmp_debug_buf) { 429 __kmp_dump_debug_buffer(); 430 } 431 432 if (KMP_OS_WINDOWS) { 433 // Let other threads know of abnormal termination and prevent deadlock 434 // if abort happened during library initialization or shutdown 435 __kmp_global.g.g_abort = SIGABRT; 436 437 /* On Windows* OS by default abort() causes pop-up error box, which stalls 438 nightly testing. Unfortunately, we cannot reliably suppress pop-up error 439 boxes. _set_abort_behavior() works well, but this function is not 440 available in VS7 (this is not problem for DLL, but it is a problem for 441 static OpenMP RTL). SetErrorMode (and so, timelimit utility) does not 442 help, at least in some versions of MS C RTL. 443 444 It seems following sequence is the only way to simulate abort() and 445 avoid pop-up error box. */ 446 raise(SIGABRT); 447 _exit(3); // Just in case, if signal ignored, exit anyway. 448 } else { 449 __kmp_unregister_library(); 450 abort(); 451 } 452 453 __kmp_infinite_loop(); 454 __kmp_release_bootstrap_lock(&__kmp_exit_lock); 455 456 } // __kmp_abort_process 457 458 void __kmp_abort_thread(void) { 459 // TODO: Eliminate g_abort global variable and this function. 460 // In case of abort just call abort(), it will kill all the threads. 461 __kmp_infinite_loop(); 462 } // __kmp_abort_thread 463 464 /* Print out the storage map for the major kmp_info_t thread data structures 465 that are allocated together. 
*/ 466 467 static void __kmp_print_thread_storage_map(kmp_info_t *thr, int gtid) { 468 __kmp_print_storage_map_gtid(gtid, thr, thr + 1, sizeof(kmp_info_t), "th_%d", 469 gtid); 470 471 __kmp_print_storage_map_gtid(gtid, &thr->th.th_info, &thr->th.th_team, 472 sizeof(kmp_desc_t), "th_%d.th_info", gtid); 473 474 __kmp_print_storage_map_gtid(gtid, &thr->th.th_local, &thr->th.th_pri_head, 475 sizeof(kmp_local_t), "th_%d.th_local", gtid); 476 477 __kmp_print_storage_map_gtid( 478 gtid, &thr->th.th_bar[0], &thr->th.th_bar[bs_last_barrier], 479 sizeof(kmp_balign_t) * bs_last_barrier, "th_%d.th_bar", gtid); 480 481 __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_plain_barrier], 482 &thr->th.th_bar[bs_plain_barrier + 1], 483 sizeof(kmp_balign_t), "th_%d.th_bar[plain]", 484 gtid); 485 486 __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_forkjoin_barrier], 487 &thr->th.th_bar[bs_forkjoin_barrier + 1], 488 sizeof(kmp_balign_t), "th_%d.th_bar[forkjoin]", 489 gtid); 490 491 #if KMP_FAST_REDUCTION_BARRIER 492 __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_reduction_barrier], 493 &thr->th.th_bar[bs_reduction_barrier + 1], 494 sizeof(kmp_balign_t), "th_%d.th_bar[reduction]", 495 gtid); 496 #endif // KMP_FAST_REDUCTION_BARRIER 497 } 498 499 /* Print out the storage map for the major kmp_team_t team data structures 500 that are allocated together. */ 501 502 static void __kmp_print_team_storage_map(const char *header, kmp_team_t *team, 503 int team_id, int num_thr) { 504 int num_disp_buff = team->t.t_max_nproc > 1 ? __kmp_dispatch_num_buffers : 2; 505 __kmp_print_storage_map_gtid(-1, team, team + 1, sizeof(kmp_team_t), "%s_%d", 506 header, team_id); 507 508 __kmp_print_storage_map_gtid(-1, &team->t.t_bar[0], 509 &team->t.t_bar[bs_last_barrier], 510 sizeof(kmp_balign_team_t) * bs_last_barrier, 511 "%s_%d.t_bar", header, team_id); 512 513 __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_plain_barrier], 514 &team->t.t_bar[bs_plain_barrier + 1], 515 sizeof(kmp_balign_team_t), "%s_%d.t_bar[plain]", 516 header, team_id); 517 518 __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_forkjoin_barrier], 519 &team->t.t_bar[bs_forkjoin_barrier + 1], 520 sizeof(kmp_balign_team_t), 521 "%s_%d.t_bar[forkjoin]", header, team_id); 522 523 #if KMP_FAST_REDUCTION_BARRIER 524 __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_reduction_barrier], 525 &team->t.t_bar[bs_reduction_barrier + 1], 526 sizeof(kmp_balign_team_t), 527 "%s_%d.t_bar[reduction]", header, team_id); 528 #endif // KMP_FAST_REDUCTION_BARRIER 529 530 __kmp_print_storage_map_gtid( 531 -1, &team->t.t_dispatch[0], &team->t.t_dispatch[num_thr], 532 sizeof(kmp_disp_t) * num_thr, "%s_%d.t_dispatch", header, team_id); 533 534 __kmp_print_storage_map_gtid( 535 -1, &team->t.t_threads[0], &team->t.t_threads[num_thr], 536 sizeof(kmp_info_t *) * num_thr, "%s_%d.t_threads", header, team_id); 537 538 __kmp_print_storage_map_gtid(-1, &team->t.t_disp_buffer[0], 539 &team->t.t_disp_buffer[num_disp_buff], 540 sizeof(dispatch_shared_info_t) * num_disp_buff, 541 "%s_%d.t_disp_buffer", header, team_id); 542 } 543 544 static void __kmp_init_allocator() { 545 __kmp_init_memkind(); 546 __kmp_init_target_mem(); 547 } 548 static void __kmp_fini_allocator() { __kmp_fini_memkind(); } 549 550 /* ------------------------------------------------------------------------ */ 551 552 #if KMP_DYNAMIC_LIB 553 #if KMP_OS_WINDOWS 554 555 BOOL WINAPI DllMain(HINSTANCE hInstDLL, DWORD fdwReason, LPVOID lpReserved) { 556 //__kmp_acquire_bootstrap_lock( &__kmp_initz_lock ); 557 558 
switch (fdwReason) { 559 560 case DLL_PROCESS_ATTACH: 561 KA_TRACE(10, ("DllMain: PROCESS_ATTACH\n")); 562 563 return TRUE; 564 565 case DLL_PROCESS_DETACH: 566 KA_TRACE(10, ("DllMain: PROCESS_DETACH T#%d\n", __kmp_gtid_get_specific())); 567 568 // According to Windows* documentation for DllMain entry point: 569 // for DLL_PROCESS_DETACH, lpReserved is used for telling the difference: 570 // lpReserved == NULL when FreeLibrary() is called, 571 // lpReserved != NULL when the process is terminated. 572 // When FreeLibrary() is called, worker threads remain alive. So the 573 // runtime's state is consistent and executing proper shutdown is OK. 574 // When the process is terminated, worker threads have exited or been 575 // forcefully terminated by the OS and only the shutdown thread remains. 576 // This can leave the runtime in an inconsistent state. 577 // Hence, only attempt proper cleanup when FreeLibrary() is called. 578 // Otherwise, rely on OS to reclaim resources. 579 if (lpReserved == NULL) 580 __kmp_internal_end_library(__kmp_gtid_get_specific()); 581 582 return TRUE; 583 584 case DLL_THREAD_ATTACH: 585 KA_TRACE(10, ("DllMain: THREAD_ATTACH\n")); 586 587 /* if we want to register new siblings all the time here call 588 * __kmp_get_gtid(); */ 589 return TRUE; 590 591 case DLL_THREAD_DETACH: 592 KA_TRACE(10, ("DllMain: THREAD_DETACH T#%d\n", __kmp_gtid_get_specific())); 593 594 __kmp_internal_end_thread(__kmp_gtid_get_specific()); 595 return TRUE; 596 } 597 598 return TRUE; 599 } 600 601 #endif /* KMP_OS_WINDOWS */ 602 #endif /* KMP_DYNAMIC_LIB */ 603 604 /* __kmp_parallel_deo -- Wait until it's our turn. */ 605 void __kmp_parallel_deo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) { 606 int gtid = *gtid_ref; 607 #ifdef BUILD_PARALLEL_ORDERED 608 kmp_team_t *team = __kmp_team_from_gtid(gtid); 609 #endif /* BUILD_PARALLEL_ORDERED */ 610 611 if (__kmp_env_consistency_check) { 612 if (__kmp_threads[gtid]->th.th_root->r.r_active) 613 #if KMP_USE_DYNAMIC_LOCK 614 __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL, 0); 615 #else 616 __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL); 617 #endif 618 } 619 #ifdef BUILD_PARALLEL_ORDERED 620 if (!team->t.t_serialized) { 621 KMP_MB(); 622 KMP_WAIT(&team->t.t_ordered.dt.t_value, __kmp_tid_from_gtid(gtid), KMP_EQ, 623 NULL); 624 KMP_MB(); 625 } 626 #endif /* BUILD_PARALLEL_ORDERED */ 627 } 628 629 /* __kmp_parallel_dxo -- Signal the next task. */ 630 void __kmp_parallel_dxo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) { 631 int gtid = *gtid_ref; 632 #ifdef BUILD_PARALLEL_ORDERED 633 int tid = __kmp_tid_from_gtid(gtid); 634 kmp_team_t *team = __kmp_team_from_gtid(gtid); 635 #endif /* BUILD_PARALLEL_ORDERED */ 636 637 if (__kmp_env_consistency_check) { 638 if (__kmp_threads[gtid]->th.th_root->r.r_active) 639 __kmp_pop_sync(gtid, ct_ordered_in_parallel, loc_ref); 640 } 641 #ifdef BUILD_PARALLEL_ORDERED 642 if (!team->t.t_serialized) { 643 KMP_MB(); /* Flush all pending memory write invalidates. */ 644 645 /* use the tid of the next thread in this team */ 646 /* TODO replace with general release procedure */ 647 team->t.t_ordered.dt.t_value = ((tid + 1) % team->t.t_nproc); 648 649 KMP_MB(); /* Flush all pending memory write invalidates. 
*/ 650 } 651 #endif /* BUILD_PARALLEL_ORDERED */ 652 } 653 654 /* ------------------------------------------------------------------------ */ 655 /* The BARRIER for a SINGLE process section is always explicit */ 656 657 int __kmp_enter_single(int gtid, ident_t *id_ref, int push_ws) { 658 int status; 659 kmp_info_t *th; 660 kmp_team_t *team; 661 662 if (!TCR_4(__kmp_init_parallel)) 663 __kmp_parallel_initialize(); 664 __kmp_resume_if_soft_paused(); 665 666 th = __kmp_threads[gtid]; 667 team = th->th.th_team; 668 status = 0; 669 670 th->th.th_ident = id_ref; 671 672 if (team->t.t_serialized) { 673 status = 1; 674 } else { 675 kmp_int32 old_this = th->th.th_local.this_construct; 676 677 ++th->th.th_local.this_construct; 678 /* try to set team count to thread count--success means thread got the 679 single block */ 680 /* TODO: Should this be acquire or release? */ 681 if (team->t.t_construct == old_this) { 682 status = __kmp_atomic_compare_store_acq(&team->t.t_construct, old_this, 683 th->th.th_local.this_construct); 684 } 685 #if USE_ITT_BUILD 686 if (__itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 && 687 KMP_MASTER_GTID(gtid) && th->th.th_teams_microtask == NULL && 688 team->t.t_active_level == 1) { 689 // Only report metadata by primary thread of active team at level 1 690 __kmp_itt_metadata_single(id_ref); 691 } 692 #endif /* USE_ITT_BUILD */ 693 } 694 695 if (__kmp_env_consistency_check) { 696 if (status && push_ws) { 697 __kmp_push_workshare(gtid, ct_psingle, id_ref); 698 } else { 699 __kmp_check_workshare(gtid, ct_psingle, id_ref); 700 } 701 } 702 #if USE_ITT_BUILD 703 if (status) { 704 __kmp_itt_single_start(gtid); 705 } 706 #endif /* USE_ITT_BUILD */ 707 return status; 708 } 709 710 void __kmp_exit_single(int gtid) { 711 #if USE_ITT_BUILD 712 __kmp_itt_single_end(gtid); 713 #endif /* USE_ITT_BUILD */ 714 if (__kmp_env_consistency_check) 715 __kmp_pop_workshare(gtid, ct_psingle, NULL); 716 } 717 718 /* determine if we can go parallel or must use a serialized parallel region and 719 * how many threads we can use 720 * set_nproc is the number of threads requested for the team 721 * returns 0 if we should serialize or only use one thread, 722 * otherwise the number of threads to use 723 * The forkjoin lock is held by the caller. */ 724 static int __kmp_reserve_threads(kmp_root_t *root, kmp_team_t *parent_team, 725 int master_tid, int set_nthreads, 726 int enter_teams) { 727 int capacity; 728 int new_nthreads; 729 KMP_DEBUG_ASSERT(__kmp_init_serial); 730 KMP_DEBUG_ASSERT(root && parent_team); 731 kmp_info_t *this_thr = parent_team->t.t_threads[master_tid]; 732 733 // If dyn-var is set, dynamically adjust the number of desired threads, 734 // according to the method specified by dynamic_mode. 735 new_nthreads = set_nthreads; 736 if (!get__dynamic_2(parent_team, master_tid)) { 737 ; 738 } 739 #ifdef USE_LOAD_BALANCE 740 else if (__kmp_global.g.g_dynamic_mode == dynamic_load_balance) { 741 new_nthreads = __kmp_load_balance_nproc(root, set_nthreads); 742 if (new_nthreads == 1) { 743 KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced " 744 "reservation to 1 thread\n", 745 master_tid)); 746 return 1; 747 } 748 if (new_nthreads < set_nthreads) { 749 KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced " 750 "reservation to %d threads\n", 751 master_tid, new_nthreads)); 752 } 753 } 754 #endif /* USE_LOAD_BALANCE */ 755 else if (__kmp_global.g.g_dynamic_mode == dynamic_thread_limit) { 756 new_nthreads = __kmp_avail_proc - __kmp_nth + 757 (root->r.r_active ? 
1 : root->r.r_hot_team->t.t_nproc); 758 if (new_nthreads <= 1) { 759 KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced " 760 "reservation to 1 thread\n", 761 master_tid)); 762 return 1; 763 } 764 if (new_nthreads < set_nthreads) { 765 KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced " 766 "reservation to %d threads\n", 767 master_tid, new_nthreads)); 768 } else { 769 new_nthreads = set_nthreads; 770 } 771 } else if (__kmp_global.g.g_dynamic_mode == dynamic_random) { 772 if (set_nthreads > 2) { 773 new_nthreads = __kmp_get_random(parent_team->t.t_threads[master_tid]); 774 new_nthreads = (new_nthreads % set_nthreads) + 1; 775 if (new_nthreads == 1) { 776 KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced " 777 "reservation to 1 thread\n", 778 master_tid)); 779 return 1; 780 } 781 if (new_nthreads < set_nthreads) { 782 KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced " 783 "reservation to %d threads\n", 784 master_tid, new_nthreads)); 785 } 786 } 787 } else { 788 KMP_ASSERT(0); 789 } 790 791 // Respect KMP_ALL_THREADS/KMP_DEVICE_THREAD_LIMIT. 792 if (__kmp_nth + new_nthreads - 793 (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) > 794 __kmp_max_nth) { 795 int tl_nthreads = __kmp_max_nth - __kmp_nth + 796 (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc); 797 if (tl_nthreads <= 0) { 798 tl_nthreads = 1; 799 } 800 801 // If dyn-var is false, emit a 1-time warning. 802 if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) { 803 __kmp_reserve_warn = 1; 804 __kmp_msg(kmp_ms_warning, 805 KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads), 806 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null); 807 } 808 if (tl_nthreads == 1) { 809 KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT " 810 "reduced reservation to 1 thread\n", 811 master_tid)); 812 return 1; 813 } 814 KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT reduced " 815 "reservation to %d threads\n", 816 master_tid, tl_nthreads)); 817 new_nthreads = tl_nthreads; 818 } 819 820 // Respect OMP_THREAD_LIMIT 821 int cg_nthreads = this_thr->th.th_cg_roots->cg_nthreads; 822 int max_cg_threads = this_thr->th.th_cg_roots->cg_thread_limit; 823 if (cg_nthreads + new_nthreads - 824 (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) > 825 max_cg_threads) { 826 int tl_nthreads = max_cg_threads - cg_nthreads + 827 (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc); 828 if (tl_nthreads <= 0) { 829 tl_nthreads = 1; 830 } 831 832 // If dyn-var is false, emit a 1-time warning. 833 if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) { 834 __kmp_reserve_warn = 1; 835 __kmp_msg(kmp_ms_warning, 836 KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads), 837 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null); 838 } 839 if (tl_nthreads == 1) { 840 KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT " 841 "reduced reservation to 1 thread\n", 842 master_tid)); 843 return 1; 844 } 845 KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT reduced " 846 "reservation to %d threads\n", 847 master_tid, tl_nthreads)); 848 new_nthreads = tl_nthreads; 849 } 850 851 // Check if the threads array is large enough, or needs expanding. 852 // See comment in __kmp_register_root() about the adjustment if 853 // __kmp_threads[0] == NULL. 
854 capacity = __kmp_threads_capacity; 855 if (TCR_PTR(__kmp_threads[0]) == NULL) { 856 --capacity; 857 } 858 // If it is not for initializing the hidden helper team, we need to take 859 // __kmp_hidden_helper_threads_num out of the capacity because it is included 860 // in __kmp_threads_capacity. 861 if (__kmp_enable_hidden_helper && !TCR_4(__kmp_init_hidden_helper_threads)) { 862 capacity -= __kmp_hidden_helper_threads_num; 863 } 864 if (__kmp_nth + new_nthreads - 865 (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) > 866 capacity) { 867 // Expand the threads array. 868 int slotsRequired = __kmp_nth + new_nthreads - 869 (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) - 870 capacity; 871 int slotsAdded = __kmp_expand_threads(slotsRequired); 872 if (slotsAdded < slotsRequired) { 873 // The threads array was not expanded enough. 874 new_nthreads -= (slotsRequired - slotsAdded); 875 KMP_ASSERT(new_nthreads >= 1); 876 877 // If dyn-var is false, emit a 1-time warning. 878 if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) { 879 __kmp_reserve_warn = 1; 880 if (__kmp_tp_cached) { 881 __kmp_msg(kmp_ms_warning, 882 KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads), 883 KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity), 884 KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null); 885 } else { 886 __kmp_msg(kmp_ms_warning, 887 KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads), 888 KMP_HNT(SystemLimitOnThreads), __kmp_msg_null); 889 } 890 } 891 } 892 } 893 894 #ifdef KMP_DEBUG 895 if (new_nthreads == 1) { 896 KC_TRACE(10, 897 ("__kmp_reserve_threads: T#%d serializing team after reclaiming " 898 "dead roots and rechecking; requested %d threads\n", 899 __kmp_get_gtid(), set_nthreads)); 900 } else { 901 KC_TRACE(10, ("__kmp_reserve_threads: T#%d allocating %d threads; requested" 902 " %d threads\n", 903 __kmp_get_gtid(), new_nthreads, set_nthreads)); 904 } 905 #endif // KMP_DEBUG 906 return new_nthreads; 907 } 908 909 /* Allocate threads from the thread pool and assign them to the new team. We are 910 assured that there are enough threads available, because we checked on that 911 earlier within critical section forkjoin */ 912 static void __kmp_fork_team_threads(kmp_root_t *root, kmp_team_t *team, 913 kmp_info_t *master_th, int master_gtid) { 914 int i; 915 int use_hot_team; 916 917 KA_TRACE(10, ("__kmp_fork_team_threads: new_nprocs = %d\n", team->t.t_nproc)); 918 KMP_DEBUG_ASSERT(master_gtid == __kmp_get_gtid()); 919 KMP_MB(); 920 921 /* first, let's setup the primary thread */ 922 master_th->th.th_info.ds.ds_tid = 0; 923 master_th->th.th_team = team; 924 master_th->th.th_team_nproc = team->t.t_nproc; 925 master_th->th.th_team_master = master_th; 926 master_th->th.th_team_serialized = FALSE; 927 master_th->th.th_dispatch = &team->t.t_dispatch[0]; 928 929 /* make sure we are not the optimized hot team */ 930 #if KMP_NESTED_HOT_TEAMS 931 use_hot_team = 0; 932 kmp_hot_team_ptr_t *hot_teams = master_th->th.th_hot_teams; 933 if (hot_teams) { // hot teams array is not allocated if 934 // KMP_HOT_TEAMS_MAX_LEVEL=0 935 int level = team->t.t_active_level - 1; // index in array of hot teams 936 if (master_th->th.th_teams_microtask) { // are we inside the teams? 
937 if (master_th->th.th_teams_size.nteams > 1) { 938 ++level; // level was not increased in teams construct for 939 // team_of_masters 940 } 941 if (team->t.t_pkfn != (microtask_t)__kmp_teams_master && 942 master_th->th.th_teams_level == team->t.t_level) { 943 ++level; // level was not increased in teams construct for 944 // team_of_workers before the parallel 945 } // team->t.t_level will be increased inside parallel 946 } 947 if (level < __kmp_hot_teams_max_level) { 948 if (hot_teams[level].hot_team) { 949 // hot team has already been allocated for given level 950 KMP_DEBUG_ASSERT(hot_teams[level].hot_team == team); 951 use_hot_team = 1; // the team is ready to use 952 } else { 953 use_hot_team = 0; // AC: threads are not allocated yet 954 hot_teams[level].hot_team = team; // remember new hot team 955 hot_teams[level].hot_team_nth = team->t.t_nproc; 956 } 957 } else { 958 use_hot_team = 0; 959 } 960 } 961 #else 962 use_hot_team = team == root->r.r_hot_team; 963 #endif 964 if (!use_hot_team) { 965 966 /* install the primary thread */ 967 team->t.t_threads[0] = master_th; 968 __kmp_initialize_info(master_th, team, 0, master_gtid); 969 970 /* now, install the worker threads */ 971 for (i = 1; i < team->t.t_nproc; i++) { 972 973 /* fork or reallocate a new thread and install it in team */ 974 kmp_info_t *thr = __kmp_allocate_thread(root, team, i); 975 team->t.t_threads[i] = thr; 976 KMP_DEBUG_ASSERT(thr); 977 KMP_DEBUG_ASSERT(thr->th.th_team == team); 978 /* align team and thread arrived states */ 979 KA_TRACE(20, ("__kmp_fork_team_threads: T#%d(%d:%d) init arrived " 980 "T#%d(%d:%d) join =%llu, plain=%llu\n", 981 __kmp_gtid_from_tid(0, team), team->t.t_id, 0, 982 __kmp_gtid_from_tid(i, team), team->t.t_id, i, 983 team->t.t_bar[bs_forkjoin_barrier].b_arrived, 984 team->t.t_bar[bs_plain_barrier].b_arrived)); 985 thr->th.th_teams_microtask = master_th->th.th_teams_microtask; 986 thr->th.th_teams_level = master_th->th.th_teams_level; 987 thr->th.th_teams_size = master_th->th.th_teams_size; 988 { // Initialize threads' barrier data. 989 int b; 990 kmp_balign_t *balign = team->t.t_threads[i]->th.th_bar; 991 for (b = 0; b < bs_last_barrier; ++b) { 992 balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived; 993 KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG); 994 #if USE_DEBUGGER 995 balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived; 996 #endif 997 } 998 } 999 } 1000 1001 #if KMP_AFFINITY_SUPPORTED 1002 __kmp_partition_places(team); 1003 #endif 1004 } 1005 1006 if (__kmp_display_affinity && team->t.t_display_affinity != 1) { 1007 for (i = 0; i < team->t.t_nproc; i++) { 1008 kmp_info_t *thr = team->t.t_threads[i]; 1009 if (thr->th.th_prev_num_threads != team->t.t_nproc || 1010 thr->th.th_prev_level != team->t.t_level) { 1011 team->t.t_display_affinity = 1; 1012 break; 1013 } 1014 } 1015 } 1016 1017 KMP_MB(); 1018 } 1019 1020 #if KMP_ARCH_X86 || KMP_ARCH_X86_64 1021 // Propagate any changes to the floating point control registers out to the team 1022 // We try to avoid unnecessary writes to the relevant cache line in the team 1023 // structure, so we don't make changes unless they are needed. 
1024 inline static void propagateFPControl(kmp_team_t *team) { 1025 if (__kmp_inherit_fp_control) { 1026 kmp_int16 x87_fpu_control_word; 1027 kmp_uint32 mxcsr; 1028 1029 // Get primary thread's values of FPU control flags (both X87 and vector) 1030 __kmp_store_x87_fpu_control_word(&x87_fpu_control_word); 1031 __kmp_store_mxcsr(&mxcsr); 1032 mxcsr &= KMP_X86_MXCSR_MASK; 1033 1034 // There is no point looking at t_fp_control_saved here. 1035 // If it is TRUE, we still have to update the values if they are different 1036 // from those we now have. If it is FALSE we didn't save anything yet, but 1037 // our objective is the same. We have to ensure that the values in the team 1038 // are the same as those we have. 1039 // So, this code achieves what we need whether or not t_fp_control_saved is 1040 // true. By checking whether the value needs updating we avoid unnecessary 1041 // writes that would put the cache-line into a written state, causing all 1042 // threads in the team to have to read it again. 1043 KMP_CHECK_UPDATE(team->t.t_x87_fpu_control_word, x87_fpu_control_word); 1044 KMP_CHECK_UPDATE(team->t.t_mxcsr, mxcsr); 1045 // Although we don't use this value, other code in the runtime wants to know 1046 // whether it should restore them. So we must ensure it is correct. 1047 KMP_CHECK_UPDATE(team->t.t_fp_control_saved, TRUE); 1048 } else { 1049 // Similarly here. Don't write to this cache-line in the team structure 1050 // unless we have to. 1051 KMP_CHECK_UPDATE(team->t.t_fp_control_saved, FALSE); 1052 } 1053 } 1054 1055 // Do the opposite, setting the hardware registers to the updated values from 1056 // the team. 1057 inline static void updateHWFPControl(kmp_team_t *team) { 1058 if (__kmp_inherit_fp_control && team->t.t_fp_control_saved) { 1059 // Only reset the fp control regs if they have been changed in the team. 1060 // the parallel region that we are exiting. 1061 kmp_int16 x87_fpu_control_word; 1062 kmp_uint32 mxcsr; 1063 __kmp_store_x87_fpu_control_word(&x87_fpu_control_word); 1064 __kmp_store_mxcsr(&mxcsr); 1065 mxcsr &= KMP_X86_MXCSR_MASK; 1066 1067 if (team->t.t_x87_fpu_control_word != x87_fpu_control_word) { 1068 __kmp_clear_x87_fpu_status_word(); 1069 __kmp_load_x87_fpu_control_word(&team->t.t_x87_fpu_control_word); 1070 } 1071 1072 if (team->t.t_mxcsr != mxcsr) { 1073 __kmp_load_mxcsr(&team->t.t_mxcsr); 1074 } 1075 } 1076 } 1077 #else 1078 #define propagateFPControl(x) ((void)0) 1079 #define updateHWFPControl(x) ((void)0) 1080 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */ 1081 1082 static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team, 1083 int realloc); // forward declaration 1084 1085 /* Run a parallel region that has been serialized, so runs only in a team of the 1086 single primary thread. 
*/ 1087 void __kmp_serialized_parallel(ident_t *loc, kmp_int32 global_tid) { 1088 kmp_info_t *this_thr; 1089 kmp_team_t *serial_team; 1090 1091 KC_TRACE(10, ("__kmpc_serialized_parallel: called by T#%d\n", global_tid)); 1092 1093 /* Skip all this code for autopar serialized loops since it results in 1094 unacceptable overhead */ 1095 if (loc != NULL && (loc->flags & KMP_IDENT_AUTOPAR)) 1096 return; 1097 1098 if (!TCR_4(__kmp_init_parallel)) 1099 __kmp_parallel_initialize(); 1100 __kmp_resume_if_soft_paused(); 1101 1102 this_thr = __kmp_threads[global_tid]; 1103 serial_team = this_thr->th.th_serial_team; 1104 1105 /* utilize the serialized team held by this thread */ 1106 KMP_DEBUG_ASSERT(serial_team); 1107 KMP_MB(); 1108 1109 if (__kmp_tasking_mode != tskm_immediate_exec) { 1110 KMP_DEBUG_ASSERT( 1111 this_thr->th.th_task_team == 1112 this_thr->th.th_team->t.t_task_team[this_thr->th.th_task_state]); 1113 KMP_DEBUG_ASSERT(serial_team->t.t_task_team[this_thr->th.th_task_state] == 1114 NULL); 1115 KA_TRACE(20, ("__kmpc_serialized_parallel: T#%d pushing task_team %p / " 1116 "team %p, new task_team = NULL\n", 1117 global_tid, this_thr->th.th_task_team, this_thr->th.th_team)); 1118 this_thr->th.th_task_team = NULL; 1119 } 1120 1121 kmp_proc_bind_t proc_bind = this_thr->th.th_set_proc_bind; 1122 if (this_thr->th.th_current_task->td_icvs.proc_bind == proc_bind_false) { 1123 proc_bind = proc_bind_false; 1124 } else if (proc_bind == proc_bind_default) { 1125 // No proc_bind clause was specified, so use the current value 1126 // of proc-bind-var for this parallel region. 1127 proc_bind = this_thr->th.th_current_task->td_icvs.proc_bind; 1128 } 1129 // Reset for next parallel region 1130 this_thr->th.th_set_proc_bind = proc_bind_default; 1131 1132 #if OMPT_SUPPORT 1133 ompt_data_t ompt_parallel_data = ompt_data_none; 1134 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(global_tid); 1135 if (ompt_enabled.enabled && 1136 this_thr->th.ompt_thread_info.state != ompt_state_overhead) { 1137 1138 ompt_task_info_t *parent_task_info; 1139 parent_task_info = OMPT_CUR_TASK_INFO(this_thr); 1140 1141 parent_task_info->frame.enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0); 1142 if (ompt_enabled.ompt_callback_parallel_begin) { 1143 int team_size = 1; 1144 1145 ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)( 1146 &(parent_task_info->task_data), &(parent_task_info->frame), 1147 &ompt_parallel_data, team_size, 1148 ompt_parallel_invoker_program | ompt_parallel_team, codeptr); 1149 } 1150 } 1151 #endif // OMPT_SUPPORT 1152 1153 if (this_thr->th.th_team != serial_team) { 1154 // Nested level will be an index in the nested nthreads array 1155 int level = this_thr->th.th_team->t.t_level; 1156 1157 if (serial_team->t.t_serialized) { 1158 /* this serial team was already used 1159 TODO increase performance by making this locks more specific */ 1160 kmp_team_t *new_team; 1161 1162 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock); 1163 1164 new_team = 1165 __kmp_allocate_team(this_thr->th.th_root, 1, 1, 1166 #if OMPT_SUPPORT 1167 ompt_parallel_data, 1168 #endif 1169 proc_bind, &this_thr->th.th_current_task->td_icvs, 1170 0 USE_NESTED_HOT_ARG(NULL)); 1171 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); 1172 KMP_ASSERT(new_team); 1173 1174 /* setup new serialized team and install it */ 1175 new_team->t.t_threads[0] = this_thr; 1176 new_team->t.t_parent = this_thr->th.th_team; 1177 serial_team = new_team; 1178 this_thr->th.th_serial_team = serial_team; 1179 1180 KF_TRACE( 1181 10, 1182 ("__kmpc_serialized_parallel: T#%d 
allocated new serial team %p\n", 1183 global_tid, serial_team)); 1184 1185 /* TODO the above breaks the requirement that if we run out of resources, 1186 then we can still guarantee that serialized teams are ok, since we may 1187 need to allocate a new one */ 1188 } else { 1189 KF_TRACE( 1190 10, 1191 ("__kmpc_serialized_parallel: T#%d reusing cached serial team %p\n", 1192 global_tid, serial_team)); 1193 } 1194 1195 /* we have to initialize this serial team */ 1196 KMP_DEBUG_ASSERT(serial_team->t.t_threads); 1197 KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr); 1198 KMP_DEBUG_ASSERT(this_thr->th.th_team != serial_team); 1199 serial_team->t.t_ident = loc; 1200 serial_team->t.t_serialized = 1; 1201 serial_team->t.t_nproc = 1; 1202 serial_team->t.t_parent = this_thr->th.th_team; 1203 serial_team->t.t_sched.sched = this_thr->th.th_team->t.t_sched.sched; 1204 this_thr->th.th_team = serial_team; 1205 serial_team->t.t_master_tid = this_thr->th.th_info.ds.ds_tid; 1206 1207 KF_TRACE(10, ("__kmpc_serialized_parallel: T#d curtask=%p\n", global_tid, 1208 this_thr->th.th_current_task)); 1209 KMP_ASSERT(this_thr->th.th_current_task->td_flags.executing == 1); 1210 this_thr->th.th_current_task->td_flags.executing = 0; 1211 1212 __kmp_push_current_task_to_thread(this_thr, serial_team, 0); 1213 1214 /* TODO: GEH: do ICVs work for nested serialized teams? Don't we need an 1215 implicit task for each serialized task represented by 1216 team->t.t_serialized? */ 1217 copy_icvs(&this_thr->th.th_current_task->td_icvs, 1218 &this_thr->th.th_current_task->td_parent->td_icvs); 1219 1220 // Thread value exists in the nested nthreads array for the next nested 1221 // level 1222 if (__kmp_nested_nth.used && (level + 1 < __kmp_nested_nth.used)) { 1223 this_thr->th.th_current_task->td_icvs.nproc = 1224 __kmp_nested_nth.nth[level + 1]; 1225 } 1226 1227 if (__kmp_nested_proc_bind.used && 1228 (level + 1 < __kmp_nested_proc_bind.used)) { 1229 this_thr->th.th_current_task->td_icvs.proc_bind = 1230 __kmp_nested_proc_bind.bind_types[level + 1]; 1231 } 1232 1233 #if USE_DEBUGGER 1234 serial_team->t.t_pkfn = (microtask_t)(~0); // For the debugger. 
1235 #endif 1236 this_thr->th.th_info.ds.ds_tid = 0; 1237 1238 /* set thread cache values */ 1239 this_thr->th.th_team_nproc = 1; 1240 this_thr->th.th_team_master = this_thr; 1241 this_thr->th.th_team_serialized = 1; 1242 1243 serial_team->t.t_level = serial_team->t.t_parent->t.t_level + 1; 1244 serial_team->t.t_active_level = serial_team->t.t_parent->t.t_active_level; 1245 serial_team->t.t_def_allocator = this_thr->th.th_def_allocator; // save 1246 1247 propagateFPControl(serial_team); 1248 1249 /* check if we need to allocate dispatch buffers stack */ 1250 KMP_DEBUG_ASSERT(serial_team->t.t_dispatch); 1251 if (!serial_team->t.t_dispatch->th_disp_buffer) { 1252 serial_team->t.t_dispatch->th_disp_buffer = 1253 (dispatch_private_info_t *)__kmp_allocate( 1254 sizeof(dispatch_private_info_t)); 1255 } 1256 this_thr->th.th_dispatch = serial_team->t.t_dispatch; 1257 1258 KMP_MB(); 1259 1260 } else { 1261 /* this serialized team is already being used, 1262 * that's fine, just add another nested level */ 1263 KMP_DEBUG_ASSERT(this_thr->th.th_team == serial_team); 1264 KMP_DEBUG_ASSERT(serial_team->t.t_threads); 1265 KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr); 1266 ++serial_team->t.t_serialized; 1267 this_thr->th.th_team_serialized = serial_team->t.t_serialized; 1268 1269 // Nested level will be an index in the nested nthreads array 1270 int level = this_thr->th.th_team->t.t_level; 1271 // Thread value exists in the nested nthreads array for the next nested 1272 // level 1273 if (__kmp_nested_nth.used && (level + 1 < __kmp_nested_nth.used)) { 1274 this_thr->th.th_current_task->td_icvs.nproc = 1275 __kmp_nested_nth.nth[level + 1]; 1276 } 1277 serial_team->t.t_level++; 1278 KF_TRACE(10, ("__kmpc_serialized_parallel: T#%d increasing nesting level " 1279 "of serial team %p to %d\n", 1280 global_tid, serial_team, serial_team->t.t_level)); 1281 1282 /* allocate/push dispatch buffers stack */ 1283 KMP_DEBUG_ASSERT(serial_team->t.t_dispatch); 1284 { 1285 dispatch_private_info_t *disp_buffer = 1286 (dispatch_private_info_t *)__kmp_allocate( 1287 sizeof(dispatch_private_info_t)); 1288 disp_buffer->next = serial_team->t.t_dispatch->th_disp_buffer; 1289 serial_team->t.t_dispatch->th_disp_buffer = disp_buffer; 1290 } 1291 this_thr->th.th_dispatch = serial_team->t.t_dispatch; 1292 1293 KMP_MB(); 1294 } 1295 KMP_CHECK_UPDATE(serial_team->t.t_cancel_request, cancel_noreq); 1296 1297 // Perform the display affinity functionality for 1298 // serialized parallel regions 1299 if (__kmp_display_affinity) { 1300 if (this_thr->th.th_prev_level != serial_team->t.t_level || 1301 this_thr->th.th_prev_num_threads != 1) { 1302 // NULL means use the affinity-format-var ICV 1303 __kmp_aux_display_affinity(global_tid, NULL); 1304 this_thr->th.th_prev_level = serial_team->t.t_level; 1305 this_thr->th.th_prev_num_threads = 1; 1306 } 1307 } 1308 1309 if (__kmp_env_consistency_check) 1310 __kmp_push_parallel(global_tid, NULL); 1311 #if OMPT_SUPPORT 1312 serial_team->t.ompt_team_info.master_return_address = codeptr; 1313 if (ompt_enabled.enabled && 1314 this_thr->th.ompt_thread_info.state != ompt_state_overhead) { 1315 OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame.ptr = 1316 OMPT_GET_FRAME_ADDRESS(0); 1317 1318 ompt_lw_taskteam_t lw_taskteam; 1319 __ompt_lw_taskteam_init(&lw_taskteam, this_thr, global_tid, 1320 &ompt_parallel_data, codeptr); 1321 1322 __ompt_lw_taskteam_link(&lw_taskteam, this_thr, 1); 1323 // don't use lw_taskteam after linking. 
content was swaped 1324 1325 /* OMPT implicit task begin */ 1326 if (ompt_enabled.ompt_callback_implicit_task) { 1327 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 1328 ompt_scope_begin, OMPT_CUR_TEAM_DATA(this_thr), 1329 OMPT_CUR_TASK_DATA(this_thr), 1, __kmp_tid_from_gtid(global_tid), 1330 ompt_task_implicit); // TODO: Can this be ompt_task_initial? 1331 OMPT_CUR_TASK_INFO(this_thr)->thread_num = 1332 __kmp_tid_from_gtid(global_tid); 1333 } 1334 1335 /* OMPT state */ 1336 this_thr->th.ompt_thread_info.state = ompt_state_work_parallel; 1337 OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame.ptr = 1338 OMPT_GET_FRAME_ADDRESS(0); 1339 } 1340 #endif 1341 } 1342 1343 /* most of the work for a fork */ 1344 /* return true if we really went parallel, false if serialized */ 1345 int __kmp_fork_call(ident_t *loc, int gtid, 1346 enum fork_context_e call_context, // Intel, GNU, ... 1347 kmp_int32 argc, microtask_t microtask, launch_t invoker, 1348 kmp_va_list ap) { 1349 void **argv; 1350 int i; 1351 int master_tid; 1352 int master_this_cons; 1353 kmp_team_t *team; 1354 kmp_team_t *parent_team; 1355 kmp_info_t *master_th; 1356 kmp_root_t *root; 1357 int nthreads; 1358 int master_active; 1359 int master_set_numthreads; 1360 int level; 1361 int active_level; 1362 int teams_level; 1363 #if KMP_NESTED_HOT_TEAMS 1364 kmp_hot_team_ptr_t **p_hot_teams; 1365 #endif 1366 { // KMP_TIME_BLOCK 1367 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_fork_call); 1368 KMP_COUNT_VALUE(OMP_PARALLEL_args, argc); 1369 1370 KA_TRACE(20, ("__kmp_fork_call: enter T#%d\n", gtid)); 1371 if (__kmp_stkpadding > 0 && __kmp_root[gtid] != NULL) { 1372 /* Some systems prefer the stack for the root thread(s) to start with */ 1373 /* some gap from the parent stack to prevent false sharing. */ 1374 void *dummy = KMP_ALLOCA(__kmp_stkpadding); 1375 /* These 2 lines below are so this does not get optimized out */ 1376 if (__kmp_stkpadding > KMP_MAX_STKPADDING) 1377 __kmp_stkpadding += (short)((kmp_int64)dummy); 1378 } 1379 1380 /* initialize if needed */ 1381 KMP_DEBUG_ASSERT( 1382 __kmp_init_serial); // AC: potentially unsafe, not in sync with shutdown 1383 if (!TCR_4(__kmp_init_parallel)) 1384 __kmp_parallel_initialize(); 1385 __kmp_resume_if_soft_paused(); 1386 1387 /* setup current data */ 1388 master_th = __kmp_threads[gtid]; // AC: potentially unsafe, not in sync with 1389 // shutdown 1390 parent_team = master_th->th.th_team; 1391 master_tid = master_th->th.th_info.ds.ds_tid; 1392 master_this_cons = master_th->th.th_local.this_construct; 1393 root = master_th->th.th_root; 1394 master_active = root->r.r_active; 1395 master_set_numthreads = master_th->th.th_set_nproc; 1396 1397 #if OMPT_SUPPORT 1398 ompt_data_t ompt_parallel_data = ompt_data_none; 1399 ompt_data_t *parent_task_data; 1400 ompt_frame_t *ompt_frame; 1401 ompt_data_t *implicit_task_data; 1402 void *return_address = NULL; 1403 1404 if (ompt_enabled.enabled) { 1405 __ompt_get_task_info_internal(0, NULL, &parent_task_data, &ompt_frame, 1406 NULL, NULL); 1407 return_address = OMPT_LOAD_RETURN_ADDRESS(gtid); 1408 } 1409 #endif 1410 1411 // Assign affinity to root thread if it hasn't happened yet 1412 __kmp_assign_root_init_mask(); 1413 1414 // Nested level will be an index in the nested nthreads array 1415 level = parent_team->t.t_level; 1416 // used to launch non-serial teams even if nested is not allowed 1417 active_level = parent_team->t.t_active_level; 1418 // needed to check nesting inside the teams 1419 teams_level = master_th->th.th_teams_level; 1420 #if 
KMP_NESTED_HOT_TEAMS 1421 p_hot_teams = &master_th->th.th_hot_teams; 1422 if (*p_hot_teams == NULL && __kmp_hot_teams_max_level > 0) { 1423 *p_hot_teams = (kmp_hot_team_ptr_t *)__kmp_allocate( 1424 sizeof(kmp_hot_team_ptr_t) * __kmp_hot_teams_max_level); 1425 (*p_hot_teams)[0].hot_team = root->r.r_hot_team; 1426 // it is either actual or not needed (when active_level > 0) 1427 (*p_hot_teams)[0].hot_team_nth = 1; 1428 } 1429 #endif 1430 1431 #if OMPT_SUPPORT 1432 if (ompt_enabled.enabled) { 1433 if (ompt_enabled.ompt_callback_parallel_begin) { 1434 int team_size = master_set_numthreads 1435 ? master_set_numthreads 1436 : get__nproc_2(parent_team, master_tid); 1437 int flags = OMPT_INVOKER(call_context) | 1438 ((microtask == (microtask_t)__kmp_teams_master) 1439 ? ompt_parallel_league 1440 : ompt_parallel_team); 1441 ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)( 1442 parent_task_data, ompt_frame, &ompt_parallel_data, team_size, flags, 1443 return_address); 1444 } 1445 master_th->th.ompt_thread_info.state = ompt_state_overhead; 1446 } 1447 #endif 1448 1449 master_th->th.th_ident = loc; 1450 1451 if (master_th->th.th_teams_microtask && ap && 1452 microtask != (microtask_t)__kmp_teams_master && level == teams_level) { 1453 // AC: This is start of parallel that is nested inside teams construct. 1454 // The team is actual (hot), all workers are ready at the fork barrier. 1455 // No lock needed to initialize the team a bit, then free workers. 1456 parent_team->t.t_ident = loc; 1457 __kmp_alloc_argv_entries(argc, parent_team, TRUE); 1458 parent_team->t.t_argc = argc; 1459 argv = (void **)parent_team->t.t_argv; 1460 for (i = argc - 1; i >= 0; --i) 1461 *argv++ = va_arg(kmp_va_deref(ap), void *); 1462 // Increment our nested depth levels, but not increase the serialization 1463 if (parent_team == master_th->th.th_serial_team) { 1464 // AC: we are in serialized parallel 1465 __kmpc_serialized_parallel(loc, gtid); 1466 KMP_DEBUG_ASSERT(parent_team->t.t_serialized > 1); 1467 1468 if (call_context == fork_context_gnu) { 1469 // AC: need to decrement t_serialized for enquiry functions to work 1470 // correctly, will restore at join time 1471 parent_team->t.t_serialized--; 1472 return TRUE; 1473 } 1474 1475 #if OMPD_SUPPORT 1476 parent_team->t.t_pkfn = microtask; 1477 #endif 1478 1479 #if OMPT_SUPPORT 1480 void *dummy; 1481 void **exit_frame_p; 1482 1483 ompt_lw_taskteam_t lw_taskteam; 1484 1485 if (ompt_enabled.enabled) { 1486 __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid, 1487 &ompt_parallel_data, return_address); 1488 exit_frame_p = &(lw_taskteam.ompt_task_info.frame.exit_frame.ptr); 1489 1490 __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0); 1491 // don't use lw_taskteam after linking. 
content was swaped 1492 1493 /* OMPT implicit task begin */ 1494 implicit_task_data = OMPT_CUR_TASK_DATA(master_th); 1495 if (ompt_enabled.ompt_callback_implicit_task) { 1496 OMPT_CUR_TASK_INFO(master_th)->thread_num = 1497 __kmp_tid_from_gtid(gtid); 1498 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 1499 ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th), 1500 implicit_task_data, 1, 1501 OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit); 1502 } 1503 1504 /* OMPT state */ 1505 master_th->th.ompt_thread_info.state = ompt_state_work_parallel; 1506 } else { 1507 exit_frame_p = &dummy; 1508 } 1509 #endif 1510 // AC: need to decrement t_serialized for enquiry functions to work 1511 // correctly, will restore at join time 1512 parent_team->t.t_serialized--; 1513 1514 { 1515 KMP_TIME_PARTITIONED_BLOCK(OMP_parallel); 1516 KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK); 1517 __kmp_invoke_microtask(microtask, gtid, 0, argc, parent_team->t.t_argv 1518 #if OMPT_SUPPORT 1519 , 1520 exit_frame_p 1521 #endif 1522 ); 1523 } 1524 1525 #if OMPT_SUPPORT 1526 if (ompt_enabled.enabled) { 1527 *exit_frame_p = NULL; 1528 OMPT_CUR_TASK_INFO(master_th)->frame.exit_frame = ompt_data_none; 1529 if (ompt_enabled.ompt_callback_implicit_task) { 1530 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 1531 ompt_scope_end, NULL, implicit_task_data, 1, 1532 OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit); 1533 } 1534 ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th); 1535 __ompt_lw_taskteam_unlink(master_th); 1536 if (ompt_enabled.ompt_callback_parallel_end) { 1537 ompt_callbacks.ompt_callback(ompt_callback_parallel_end)( 1538 &ompt_parallel_data, OMPT_CUR_TASK_DATA(master_th), 1539 OMPT_INVOKER(call_context) | ompt_parallel_team, 1540 return_address); 1541 } 1542 master_th->th.ompt_thread_info.state = ompt_state_overhead; 1543 } 1544 #endif 1545 return TRUE; 1546 } 1547 1548 parent_team->t.t_pkfn = microtask; 1549 parent_team->t.t_invoke = invoker; 1550 KMP_ATOMIC_INC(&root->r.r_in_parallel); 1551 parent_team->t.t_active_level++; 1552 parent_team->t.t_level++; 1553 parent_team->t.t_def_allocator = master_th->th.th_def_allocator; // save 1554 1555 #if OMPT_SUPPORT 1556 if (ompt_enabled.enabled) { 1557 ompt_lw_taskteam_t lw_taskteam; 1558 __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid, 1559 &ompt_parallel_data, return_address); 1560 __ompt_lw_taskteam_link(&lw_taskteam, master_th, 1, true); 1561 } 1562 #endif 1563 1564 /* Change number of threads in the team if requested */ 1565 if (master_set_numthreads) { // The parallel has num_threads clause 1566 if (master_set_numthreads < master_th->th.th_teams_size.nth) { 1567 // AC: only can reduce number of threads dynamically, can't increase 1568 kmp_info_t **other_threads = parent_team->t.t_threads; 1569 parent_team->t.t_nproc = master_set_numthreads; 1570 for (i = 0; i < master_set_numthreads; ++i) { 1571 other_threads[i]->th.th_team_nproc = master_set_numthreads; 1572 } 1573 // Keep extra threads hot in the team for possible next parallels 1574 } 1575 master_th->th.th_set_nproc = 0; 1576 } 1577 1578 #if USE_DEBUGGER 1579 if (__kmp_debugging) { // Let debugger override number of threads. 
1580 int nth = __kmp_omp_num_threads(loc); 1581 if (nth > 0) { // 0 means debugger doesn't want to change num threads 1582 master_set_numthreads = nth; 1583 } 1584 } 1585 #endif 1586 1587 #if USE_ITT_BUILD && USE_ITT_NOTIFY 1588 if (((__itt_frame_submit_v3_ptr && __itt_get_timestamp_ptr) || 1589 KMP_ITT_DEBUG) && 1590 __kmp_forkjoin_frames_mode == 3 && 1591 parent_team->t.t_active_level == 1 // only report frames at level 1 1592 && master_th->th.th_teams_size.nteams == 1) { 1593 kmp_uint64 tmp_time = __itt_get_timestamp(); 1594 master_th->th.th_frame_time = tmp_time; 1595 parent_team->t.t_region_time = tmp_time; 1596 } 1597 if (__itt_stack_caller_create_ptr) { 1598 KMP_DEBUG_ASSERT(parent_team->t.t_stack_id == NULL); 1599 // create new stack stitching id before entering fork barrier 1600 parent_team->t.t_stack_id = __kmp_itt_stack_caller_create(); 1601 } 1602 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */ 1603 1604 KF_TRACE(10, ("__kmp_fork_call: before internal fork: root=%p, team=%p, " 1605 "master_th=%p, gtid=%d\n", 1606 root, parent_team, master_th, gtid)); 1607 __kmp_internal_fork(loc, gtid, parent_team); 1608 KF_TRACE(10, ("__kmp_fork_call: after internal fork: root=%p, team=%p, " 1609 "master_th=%p, gtid=%d\n", 1610 root, parent_team, master_th, gtid)); 1611 1612 if (call_context == fork_context_gnu) 1613 return TRUE; 1614 1615 /* Invoke microtask for PRIMARY thread */ 1616 KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n", gtid, 1617 parent_team->t.t_id, parent_team->t.t_pkfn)); 1618 1619 if (!parent_team->t.t_invoke(gtid)) { 1620 KMP_ASSERT2(0, "cannot invoke microtask for PRIMARY thread"); 1621 } 1622 KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n", gtid, 1623 parent_team->t.t_id, parent_team->t.t_pkfn)); 1624 KMP_MB(); /* Flush all pending memory write invalidates. */ 1625 1626 KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid)); 1627 1628 return TRUE; 1629 } // Parallel closely nested in teams construct 1630 1631 #if KMP_DEBUG 1632 if (__kmp_tasking_mode != tskm_immediate_exec) { 1633 KMP_DEBUG_ASSERT(master_th->th.th_task_team == 1634 parent_team->t.t_task_team[master_th->th.th_task_state]); 1635 } 1636 #endif 1637 1638 int enter_teams = 0; 1639 if (parent_team->t.t_active_level >= 1640 master_th->th.th_current_task->td_icvs.max_active_levels) { 1641 nthreads = 1; 1642 } else { 1643 enter_teams = ((ap == NULL && active_level == 0) || 1644 (ap && teams_level > 0 && teams_level == level)); 1645 nthreads = 1646 master_set_numthreads 1647 ? master_set_numthreads 1648 : get__nproc_2( 1649 parent_team, 1650 master_tid); // TODO: get nproc directly from current task 1651 1652 // Check if we need to take forkjoin lock? (no need for serialized 1653 // parallel out of teams construct). This code moved here from 1654 // __kmp_reserve_threads() to speedup nested serialized parallels. 1655 if (nthreads > 1) { 1656 if ((get__max_active_levels(master_th) == 1 && 1657 (root->r.r_in_parallel && !enter_teams)) || 1658 (__kmp_library == library_serial)) { 1659 KC_TRACE(10, ("__kmp_fork_call: T#%d serializing team; requested %d" 1660 " threads\n", 1661 gtid, nthreads)); 1662 nthreads = 1; 1663 } 1664 } 1665 if (nthreads > 1) { 1666 /* determine how many new threads we can use */ 1667 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock); 1668 /* AC: If we execute teams from parallel region (on host), then teams 1669 should be created but each can only have 1 thread if nesting is 1670 disabled. 
If teams called from serial region, then teams and their 1671 threads should be created regardless of the nesting setting. */ 1672 nthreads = __kmp_reserve_threads(root, parent_team, master_tid, 1673 nthreads, enter_teams); 1674 if (nthreads == 1) { 1675 // Free lock for single thread execution here; for multi-thread 1676 // execution it will be freed later after team of threads created 1677 // and initialized 1678 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); 1679 } 1680 } 1681 } 1682 KMP_DEBUG_ASSERT(nthreads > 0); 1683 1684 // If we temporarily changed the set number of threads then restore it now 1685 master_th->th.th_set_nproc = 0; 1686 1687 /* create a serialized parallel region? */ 1688 if (nthreads == 1) { 1689 /* josh todo: hypothetical question: what do we do for OS X*? */ 1690 #if KMP_OS_LINUX && \ 1691 (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) 1692 void *args[argc]; 1693 #else 1694 void **args = (void **)KMP_ALLOCA(argc * sizeof(void *)); 1695 #endif /* KMP_OS_LINUX && ( KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || \ 1696 KMP_ARCH_AARCH64) */ 1697 1698 KA_TRACE(20, 1699 ("__kmp_fork_call: T#%d serializing parallel region\n", gtid)); 1700 1701 __kmpc_serialized_parallel(loc, gtid); 1702 1703 #if OMPD_SUPPORT 1704 master_th->th.th_serial_team->t.t_pkfn = microtask; 1705 #endif 1706 1707 if (call_context == fork_context_intel) { 1708 /* TODO this sucks, use the compiler itself to pass args! :) */ 1709 master_th->th.th_serial_team->t.t_ident = loc; 1710 if (!ap) { 1711 // revert change made in __kmpc_serialized_parallel() 1712 master_th->th.th_serial_team->t.t_level--; 1713 // Get args from parent team for teams construct 1714 1715 #if OMPT_SUPPORT 1716 void *dummy; 1717 void **exit_frame_p; 1718 ompt_task_info_t *task_info; 1719 1720 ompt_lw_taskteam_t lw_taskteam; 1721 1722 if (ompt_enabled.enabled) { 1723 __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid, 1724 &ompt_parallel_data, return_address); 1725 1726 __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0); 1727 // don't use lw_taskteam after linking. 
content was swaped 1728 1729 task_info = OMPT_CUR_TASK_INFO(master_th); 1730 exit_frame_p = &(task_info->frame.exit_frame.ptr); 1731 if (ompt_enabled.ompt_callback_implicit_task) { 1732 OMPT_CUR_TASK_INFO(master_th)->thread_num = 1733 __kmp_tid_from_gtid(gtid); 1734 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 1735 ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th), 1736 &(task_info->task_data), 1, 1737 OMPT_CUR_TASK_INFO(master_th)->thread_num, 1738 ompt_task_implicit); 1739 } 1740 1741 /* OMPT state */ 1742 master_th->th.ompt_thread_info.state = ompt_state_work_parallel; 1743 } else { 1744 exit_frame_p = &dummy; 1745 } 1746 #endif 1747 1748 { 1749 KMP_TIME_PARTITIONED_BLOCK(OMP_parallel); 1750 KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK); 1751 __kmp_invoke_microtask(microtask, gtid, 0, argc, 1752 parent_team->t.t_argv 1753 #if OMPT_SUPPORT 1754 , 1755 exit_frame_p 1756 #endif 1757 ); 1758 } 1759 1760 #if OMPT_SUPPORT 1761 if (ompt_enabled.enabled) { 1762 *exit_frame_p = NULL; 1763 if (ompt_enabled.ompt_callback_implicit_task) { 1764 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 1765 ompt_scope_end, NULL, &(task_info->task_data), 1, 1766 OMPT_CUR_TASK_INFO(master_th)->thread_num, 1767 ompt_task_implicit); 1768 } 1769 ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th); 1770 __ompt_lw_taskteam_unlink(master_th); 1771 if (ompt_enabled.ompt_callback_parallel_end) { 1772 ompt_callbacks.ompt_callback(ompt_callback_parallel_end)( 1773 &ompt_parallel_data, parent_task_data, 1774 OMPT_INVOKER(call_context) | ompt_parallel_team, 1775 return_address); 1776 } 1777 master_th->th.ompt_thread_info.state = ompt_state_overhead; 1778 } 1779 #endif 1780 } else if (microtask == (microtask_t)__kmp_teams_master) { 1781 KMP_DEBUG_ASSERT(master_th->th.th_team == 1782 master_th->th.th_serial_team); 1783 team = master_th->th.th_team; 1784 // team->t.t_pkfn = microtask; 1785 team->t.t_invoke = invoker; 1786 __kmp_alloc_argv_entries(argc, team, TRUE); 1787 team->t.t_argc = argc; 1788 argv = (void **)team->t.t_argv; 1789 if (ap) { 1790 for (i = argc - 1; i >= 0; --i) 1791 *argv++ = va_arg(kmp_va_deref(ap), void *); 1792 } else { 1793 for (i = 0; i < argc; ++i) 1794 // Get args from parent team for teams construct 1795 argv[i] = parent_team->t.t_argv[i]; 1796 } 1797 // AC: revert change made in __kmpc_serialized_parallel() 1798 // because initial code in teams should have level=0 1799 team->t.t_level--; 1800 // AC: call special invoker for outer "parallel" of teams construct 1801 invoker(gtid); 1802 #if OMPT_SUPPORT 1803 if (ompt_enabled.enabled) { 1804 ompt_task_info_t *task_info = OMPT_CUR_TASK_INFO(master_th); 1805 if (ompt_enabled.ompt_callback_implicit_task) { 1806 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 1807 ompt_scope_end, NULL, &(task_info->task_data), 0, 1808 OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_initial); 1809 } 1810 if (ompt_enabled.ompt_callback_parallel_end) { 1811 ompt_callbacks.ompt_callback(ompt_callback_parallel_end)( 1812 &ompt_parallel_data, parent_task_data, 1813 OMPT_INVOKER(call_context) | ompt_parallel_league, 1814 return_address); 1815 } 1816 master_th->th.ompt_thread_info.state = ompt_state_overhead; 1817 } 1818 #endif 1819 } else { 1820 argv = args; 1821 for (i = argc - 1; i >= 0; --i) 1822 *argv++ = va_arg(kmp_va_deref(ap), void *); 1823 KMP_MB(); 1824 1825 #if OMPT_SUPPORT 1826 void *dummy; 1827 void **exit_frame_p; 1828 ompt_task_info_t *task_info; 1829 1830 ompt_lw_taskteam_t lw_taskteam; 1831 1832 if (ompt_enabled.enabled) { 1833 
__ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid, 1834 &ompt_parallel_data, return_address); 1835 __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0); 1836 // don't use lw_taskteam after linking. content was swaped 1837 task_info = OMPT_CUR_TASK_INFO(master_th); 1838 exit_frame_p = &(task_info->frame.exit_frame.ptr); 1839 1840 /* OMPT implicit task begin */ 1841 implicit_task_data = OMPT_CUR_TASK_DATA(master_th); 1842 if (ompt_enabled.ompt_callback_implicit_task) { 1843 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 1844 ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th), 1845 implicit_task_data, 1, __kmp_tid_from_gtid(gtid), 1846 ompt_task_implicit); 1847 OMPT_CUR_TASK_INFO(master_th)->thread_num = 1848 __kmp_tid_from_gtid(gtid); 1849 } 1850 1851 /* OMPT state */ 1852 master_th->th.ompt_thread_info.state = ompt_state_work_parallel; 1853 } else { 1854 exit_frame_p = &dummy; 1855 } 1856 #endif 1857 1858 { 1859 KMP_TIME_PARTITIONED_BLOCK(OMP_parallel); 1860 KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK); 1861 __kmp_invoke_microtask(microtask, gtid, 0, argc, args 1862 #if OMPT_SUPPORT 1863 , 1864 exit_frame_p 1865 #endif 1866 ); 1867 } 1868 1869 #if OMPT_SUPPORT 1870 if (ompt_enabled.enabled) { 1871 *exit_frame_p = NULL; 1872 if (ompt_enabled.ompt_callback_implicit_task) { 1873 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 1874 ompt_scope_end, NULL, &(task_info->task_data), 1, 1875 OMPT_CUR_TASK_INFO(master_th)->thread_num, 1876 ompt_task_implicit); 1877 } 1878 1879 ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th); 1880 __ompt_lw_taskteam_unlink(master_th); 1881 if (ompt_enabled.ompt_callback_parallel_end) { 1882 ompt_callbacks.ompt_callback(ompt_callback_parallel_end)( 1883 &ompt_parallel_data, parent_task_data, 1884 OMPT_INVOKER(call_context) | ompt_parallel_team, 1885 return_address); 1886 } 1887 master_th->th.ompt_thread_info.state = ompt_state_overhead; 1888 } 1889 #endif 1890 } 1891 } else if (call_context == fork_context_gnu) { 1892 #if OMPT_SUPPORT 1893 ompt_lw_taskteam_t lwt; 1894 __ompt_lw_taskteam_init(&lwt, master_th, gtid, &ompt_parallel_data, 1895 return_address); 1896 1897 lwt.ompt_task_info.frame.exit_frame = ompt_data_none; 1898 __ompt_lw_taskteam_link(&lwt, master_th, 1); 1899 // don't use lw_taskteam after linking. 
content was swapped 1900 #endif 1901 1902 // we were called from GNU native code 1903 KA_TRACE(20, ("__kmp_fork_call: T#%d serial exit\n", gtid)); 1904 return FALSE; 1905 } else { 1906 KMP_ASSERT2(call_context < fork_context_last, 1907 "__kmp_fork_call: unknown fork_context parameter"); 1908 } 1909 1910 KA_TRACE(20, ("__kmp_fork_call: T#%d serial exit\n", gtid)); 1911 KMP_MB(); 1912 return FALSE; 1913 } // if (nthreads == 1) 1914 1915 // GEH: only modify the executing flag in the case when not serialized 1916 // serialized case is handled in kmpc_serialized_parallel 1917 KF_TRACE(10, ("__kmp_fork_call: parent_team_aclevel=%d, master_th=%p, " 1918 "curtask=%p, curtask_max_aclevel=%d\n", 1919 parent_team->t.t_active_level, master_th, 1920 master_th->th.th_current_task, 1921 master_th->th.th_current_task->td_icvs.max_active_levels)); 1922 // TODO: GEH - cannot do this assertion because root thread not set up as 1923 // executing 1924 // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 1 ); 1925 master_th->th.th_current_task->td_flags.executing = 0; 1926 1927 if (!master_th->th.th_teams_microtask || level > teams_level) { 1928 /* Increment our nested depth level */ 1929 KMP_ATOMIC_INC(&root->r.r_in_parallel); 1930 } 1931 1932 // See if we need to make a copy of the ICVs. 1933 int nthreads_icv = master_th->th.th_current_task->td_icvs.nproc; 1934 if ((level + 1 < __kmp_nested_nth.used) && 1935 (__kmp_nested_nth.nth[level + 1] != nthreads_icv)) { 1936 nthreads_icv = __kmp_nested_nth.nth[level + 1]; 1937 } else { 1938 nthreads_icv = 0; // don't update 1939 } 1940 1941 // Figure out the proc_bind_policy for the new team. 1942 kmp_proc_bind_t proc_bind = master_th->th.th_set_proc_bind; 1943 kmp_proc_bind_t proc_bind_icv = 1944 proc_bind_default; // proc_bind_default means don't update 1945 if (master_th->th.th_current_task->td_icvs.proc_bind == proc_bind_false) { 1946 proc_bind = proc_bind_false; 1947 } else { 1948 if (proc_bind == proc_bind_default) { 1949 // No proc_bind clause specified; use current proc-bind-var for this 1950 // parallel region 1951 proc_bind = master_th->th.th_current_task->td_icvs.proc_bind; 1952 } 1953 /* else: The proc_bind policy was specified explicitly on parallel clause. 1954 This overrides proc-bind-var for this parallel region, but does not 1955 change proc-bind-var. */ 1956 // Figure out the value of proc-bind-var for the child threads.
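// Illustrative note (user-level, not runtime code) on how proc-bind-var and
// the clause interact here: with the environment setting
//
//   OMP_PROC_BIND=spread,close
//
// an unannotated parallel region binds "spread" and its nested regions
// "close" (the per-level list is what __kmp_nested_proc_bind holds), while
//
//   #pragma omp parallel proc_bind(close)
//
// overrides proc-bind-var for that one region only, exactly as the comment
// above states.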
1957 if ((level + 1 < __kmp_nested_proc_bind.used) && 1958 (__kmp_nested_proc_bind.bind_types[level + 1] != 1959 master_th->th.th_current_task->td_icvs.proc_bind)) { 1960 proc_bind_icv = __kmp_nested_proc_bind.bind_types[level + 1]; 1961 } 1962 } 1963 1964 // Reset for next parallel region 1965 master_th->th.th_set_proc_bind = proc_bind_default; 1966 1967 if ((nthreads_icv > 0) || (proc_bind_icv != proc_bind_default)) { 1968 kmp_internal_control_t new_icvs; 1969 copy_icvs(&new_icvs, &master_th->th.th_current_task->td_icvs); 1970 new_icvs.next = NULL; 1971 if (nthreads_icv > 0) { 1972 new_icvs.nproc = nthreads_icv; 1973 } 1974 if (proc_bind_icv != proc_bind_default) { 1975 new_icvs.proc_bind = proc_bind_icv; 1976 } 1977 1978 /* allocate a new parallel team */ 1979 KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n")); 1980 team = __kmp_allocate_team(root, nthreads, nthreads, 1981 #if OMPT_SUPPORT 1982 ompt_parallel_data, 1983 #endif 1984 proc_bind, &new_icvs, 1985 argc USE_NESTED_HOT_ARG(master_th)); 1986 } else { 1987 /* allocate a new parallel team */ 1988 KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n")); 1989 team = __kmp_allocate_team(root, nthreads, nthreads, 1990 #if OMPT_SUPPORT 1991 ompt_parallel_data, 1992 #endif 1993 proc_bind, 1994 &master_th->th.th_current_task->td_icvs, 1995 argc USE_NESTED_HOT_ARG(master_th)); 1996 } 1997 KF_TRACE( 1998 10, ("__kmp_fork_call: after __kmp_allocate_team - team = %p\n", team)); 1999 2000 /* setup the new team */ 2001 KMP_CHECK_UPDATE(team->t.t_master_tid, master_tid); 2002 KMP_CHECK_UPDATE(team->t.t_master_this_cons, master_this_cons); 2003 KMP_CHECK_UPDATE(team->t.t_ident, loc); 2004 KMP_CHECK_UPDATE(team->t.t_parent, parent_team); 2005 KMP_CHECK_UPDATE_SYNC(team->t.t_pkfn, microtask); 2006 #if OMPT_SUPPORT 2007 KMP_CHECK_UPDATE_SYNC(team->t.ompt_team_info.master_return_address, 2008 return_address); 2009 #endif 2010 KMP_CHECK_UPDATE(team->t.t_invoke, invoker); // TODO move to root, maybe 2011 // TODO: parent_team->t.t_level == INT_MAX ??? 2012 if (!master_th->th.th_teams_microtask || level > teams_level) { 2013 int new_level = parent_team->t.t_level + 1; 2014 KMP_CHECK_UPDATE(team->t.t_level, new_level); 2015 new_level = parent_team->t.t_active_level + 1; 2016 KMP_CHECK_UPDATE(team->t.t_active_level, new_level); 2017 } else { 2018 // AC: Do not increase parallel level at start of the teams construct 2019 int new_level = parent_team->t.t_level; 2020 KMP_CHECK_UPDATE(team->t.t_level, new_level); 2021 new_level = parent_team->t.t_active_level; 2022 KMP_CHECK_UPDATE(team->t.t_active_level, new_level); 2023 } 2024 kmp_r_sched_t new_sched = get__sched_2(parent_team, master_tid); 2025 // set primary thread's schedule as new run-time schedule 2026 KMP_CHECK_UPDATE(team->t.t_sched.sched, new_sched.sched); 2027 2028 KMP_CHECK_UPDATE(team->t.t_cancel_request, cancel_noreq); 2029 KMP_CHECK_UPDATE(team->t.t_def_allocator, master_th->th.th_def_allocator); 2030 2031 // Update the floating point rounding in the team if required. 2032 propagateFPControl(team); 2033 #if OMPD_SUPPORT 2034 if (ompd_state & OMPD_ENABLE_BP) 2035 ompd_bp_parallel_begin(); 2036 #endif 2037 2038 if (__kmp_tasking_mode != tskm_immediate_exec) { 2039 // Set primary thread's task team to team's task team. Unless this is hot 2040 // team, it should be NULL. 
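// Sketch of the bookkeeping performed below (descriptive only; the real code
// follows): each nested fork pushes the primary thread's task_state so the
// matching join can restore it, roughly
//
//   memo_stack[top++] = th_task_state;  // push at fork
//   th_task_state = 0;                  // or the memoized nested-hot value
//   ...
//   th_task_state = memo_stack[--top];  // pop at join
//
// with the stack doubled in size whenever top reaches th_task_state_stack_sz.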
2041 KMP_DEBUG_ASSERT(master_th->th.th_task_team == 2042 parent_team->t.t_task_team[master_th->th.th_task_state]); 2043 KA_TRACE(20, ("__kmp_fork_call: Primary T#%d pushing task_team %p / team " 2044 "%p, new task_team %p / team %p\n", 2045 __kmp_gtid_from_thread(master_th), 2046 master_th->th.th_task_team, parent_team, 2047 team->t.t_task_team[master_th->th.th_task_state], team)); 2048 2049 if (active_level || master_th->th.th_task_team) { 2050 // Take a memo of primary thread's task_state 2051 KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack); 2052 if (master_th->th.th_task_state_top >= 2053 master_th->th.th_task_state_stack_sz) { // increase size 2054 kmp_uint32 new_size = 2 * master_th->th.th_task_state_stack_sz; 2055 kmp_uint8 *old_stack, *new_stack; 2056 kmp_uint32 i; 2057 new_stack = (kmp_uint8 *)__kmp_allocate(new_size); 2058 for (i = 0; i < master_th->th.th_task_state_stack_sz; ++i) { 2059 new_stack[i] = master_th->th.th_task_state_memo_stack[i]; 2060 } 2061 for (i = master_th->th.th_task_state_stack_sz; i < new_size; 2062 ++i) { // zero-init rest of stack 2063 new_stack[i] = 0; 2064 } 2065 old_stack = master_th->th.th_task_state_memo_stack; 2066 master_th->th.th_task_state_memo_stack = new_stack; 2067 master_th->th.th_task_state_stack_sz = new_size; 2068 __kmp_free(old_stack); 2069 } 2070 // Store primary thread's task_state on stack 2071 master_th->th 2072 .th_task_state_memo_stack[master_th->th.th_task_state_top] = 2073 master_th->th.th_task_state; 2074 master_th->th.th_task_state_top++; 2075 #if KMP_NESTED_HOT_TEAMS 2076 if (master_th->th.th_hot_teams && 2077 active_level < __kmp_hot_teams_max_level && 2078 team == master_th->th.th_hot_teams[active_level].hot_team) { 2079 // Restore primary thread's nested state if nested hot team 2080 master_th->th.th_task_state = 2081 master_th->th 2082 .th_task_state_memo_stack[master_th->th.th_task_state_top]; 2083 } else { 2084 #endif 2085 master_th->th.th_task_state = 0; 2086 #if KMP_NESTED_HOT_TEAMS 2087 } 2088 #endif 2089 } 2090 #if !KMP_NESTED_HOT_TEAMS 2091 KMP_DEBUG_ASSERT((master_th->th.th_task_team == NULL) || 2092 (team == root->r.r_hot_team)); 2093 #endif 2094 } 2095 2096 KA_TRACE( 2097 20, 2098 ("__kmp_fork_call: T#%d(%d:%d)->(%d:0) created a team of %d threads\n", 2099 gtid, parent_team->t.t_id, team->t.t_master_tid, team->t.t_id, 2100 team->t.t_nproc)); 2101 KMP_DEBUG_ASSERT(team != root->r.r_hot_team || 2102 (team->t.t_master_tid == 0 && 2103 (team->t.t_parent == root->r.r_root_team || 2104 team->t.t_parent->t.t_serialized))); 2105 KMP_MB(); 2106 2107 /* now, setup the arguments */ 2108 argv = (void **)team->t.t_argv; 2109 if (ap) { 2110 for (i = argc - 1; i >= 0; --i) { 2111 void *new_argv = va_arg(kmp_va_deref(ap), void *); 2112 KMP_CHECK_UPDATE(*argv, new_argv); 2113 argv++; 2114 } 2115 } else { 2116 for (i = 0; i < argc; ++i) { 2117 // Get args from parent team for teams construct 2118 KMP_CHECK_UPDATE(argv[i], team->t.t_parent->t.t_argv[i]); 2119 } 2120 } 2121 2122 /* now actually fork the threads */ 2123 KMP_CHECK_UPDATE(team->t.t_master_active, master_active); 2124 if (!root->r.r_active) // Only do assignment if it prevents cache ping-pong 2125 root->r.r_active = TRUE; 2126 2127 __kmp_fork_team_threads(root, team, master_th, gtid); 2128 __kmp_setup_icv_copy(team, nthreads, 2129 &master_th->th.th_current_task->td_icvs, loc); 2130 2131 #if OMPT_SUPPORT 2132 master_th->th.ompt_thread_info.state = ompt_state_work_parallel; 2133 #endif 2134 2135 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); 2136 2137 #if 
USE_ITT_BUILD 2138 if (team->t.t_active_level == 1 // only report frames at level 1 2139 && !master_th->th.th_teams_microtask) { // not in teams construct 2140 #if USE_ITT_NOTIFY 2141 if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) && 2142 (__kmp_forkjoin_frames_mode == 3 || 2143 __kmp_forkjoin_frames_mode == 1)) { 2144 kmp_uint64 tmp_time = 0; 2145 if (__itt_get_timestamp_ptr) 2146 tmp_time = __itt_get_timestamp(); 2147 // Internal fork - report frame begin 2148 master_th->th.th_frame_time = tmp_time; 2149 if (__kmp_forkjoin_frames_mode == 3) 2150 team->t.t_region_time = tmp_time; 2151 } else 2152 // only one notification scheme (either "submit" or "forking/joined", not both) 2153 #endif /* USE_ITT_NOTIFY */ 2154 if ((__itt_frame_begin_v3_ptr || KMP_ITT_DEBUG) && 2155 __kmp_forkjoin_frames && !__kmp_forkjoin_frames_mode) { 2156 // Mark start of "parallel" region for Intel(R) VTune(TM) analyzer. 2157 __kmp_itt_region_forking(gtid, team->t.t_nproc, 0); 2158 } 2159 } 2160 #endif /* USE_ITT_BUILD */ 2161 2162 /* now go on and do the work */ 2163 KMP_DEBUG_ASSERT(team == __kmp_threads[gtid]->th.th_team); 2164 KMP_MB(); 2165 KF_TRACE(10, 2166 ("__kmp_internal_fork : root=%p, team=%p, master_th=%p, gtid=%d\n", 2167 root, team, master_th, gtid)); 2168 2169 #if USE_ITT_BUILD 2170 if (__itt_stack_caller_create_ptr) { 2171 // create new stack stitching id before entering fork barrier 2172 if (!enter_teams) { 2173 KMP_DEBUG_ASSERT(team->t.t_stack_id == NULL); 2174 team->t.t_stack_id = __kmp_itt_stack_caller_create(); 2175 } else if (parent_team->t.t_serialized) { 2176 // keep stack stitching id in the serialized parent_team; 2177 // current team will be used for parallel inside the teams; 2178 // if parent_team is active, then it already keeps stack stitching id 2179 // for the league of teams 2180 KMP_DEBUG_ASSERT(parent_team->t.t_stack_id == NULL); 2181 parent_team->t.t_stack_id = __kmp_itt_stack_caller_create(); 2182 } 2183 } 2184 #endif /* USE_ITT_BUILD */ 2185 2186 // AC: skip __kmp_internal_fork at teams construct, let only primary 2187 // threads execute 2188 if (ap) { 2189 __kmp_internal_fork(loc, gtid, team); 2190 KF_TRACE(10, ("__kmp_internal_fork : after : root=%p, team=%p, " 2191 "master_th=%p, gtid=%d\n", 2192 root, team, master_th, gtid)); 2193 } 2194 2195 if (call_context == fork_context_gnu) { 2196 KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid)); 2197 return TRUE; 2198 } 2199 2200 /* Invoke microtask for PRIMARY thread */ 2201 KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n", gtid, 2202 team->t.t_id, team->t.t_pkfn)); 2203 } // END of timer KMP_fork_call block 2204 2205 #if KMP_STATS_ENABLED 2206 // If beginning a teams construct, then change thread state 2207 stats_state_e previous_state = KMP_GET_THREAD_STATE(); 2208 if (!ap) { 2209 KMP_SET_THREAD_STATE(stats_state_e::TEAMS_REGION); 2210 } 2211 #endif 2212 2213 if (!team->t.t_invoke(gtid)) { 2214 KMP_ASSERT2(0, "cannot invoke microtask for PRIMARY thread"); 2215 } 2216 2217 #if KMP_STATS_ENABLED 2218 // If was beginning of a teams construct, then reset thread state 2219 if (!ap) { 2220 KMP_SET_THREAD_STATE(previous_state); 2221 } 2222 #endif 2223 2224 KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n", gtid, 2225 team->t.t_id, team->t.t_pkfn)); 2226 KMP_MB(); /* Flush all pending memory write invalidates. 
*/ 2227 2228 KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid)); 2229 #if OMPT_SUPPORT 2230 if (ompt_enabled.enabled) { 2231 master_th->th.ompt_thread_info.state = ompt_state_overhead; 2232 } 2233 #endif 2234 2235 return TRUE; 2236 } 2237 2238 #if OMPT_SUPPORT 2239 static inline void __kmp_join_restore_state(kmp_info_t *thread, 2240 kmp_team_t *team) { 2241 // restore state outside the region 2242 thread->th.ompt_thread_info.state = 2243 ((team->t.t_serialized) ? ompt_state_work_serial 2244 : ompt_state_work_parallel); 2245 } 2246 2247 static inline void __kmp_join_ompt(int gtid, kmp_info_t *thread, 2248 kmp_team_t *team, ompt_data_t *parallel_data, 2249 int flags, void *codeptr) { 2250 ompt_task_info_t *task_info = __ompt_get_task_info_object(0); 2251 if (ompt_enabled.ompt_callback_parallel_end) { 2252 ompt_callbacks.ompt_callback(ompt_callback_parallel_end)( 2253 parallel_data, &(task_info->task_data), flags, codeptr); 2254 } 2255 2256 task_info->frame.enter_frame = ompt_data_none; 2257 __kmp_join_restore_state(thread, team); 2258 } 2259 #endif 2260 2261 void __kmp_join_call(ident_t *loc, int gtid 2262 #if OMPT_SUPPORT 2263 , 2264 enum fork_context_e fork_context 2265 #endif 2266 , 2267 int exit_teams) { 2268 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_join_call); 2269 kmp_team_t *team; 2270 kmp_team_t *parent_team; 2271 kmp_info_t *master_th; 2272 kmp_root_t *root; 2273 int master_active; 2274 2275 KA_TRACE(20, ("__kmp_join_call: enter T#%d\n", gtid)); 2276 2277 /* setup current data */ 2278 master_th = __kmp_threads[gtid]; 2279 root = master_th->th.th_root; 2280 team = master_th->th.th_team; 2281 parent_team = team->t.t_parent; 2282 2283 master_th->th.th_ident = loc; 2284 2285 #if OMPT_SUPPORT 2286 void *team_microtask = (void *)team->t.t_pkfn; 2287 // For GOMP interface with serialized parallel, need the 2288 // __kmpc_end_serialized_parallel to call hooks for OMPT end-implicit-task 2289 // and end-parallel events. 
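// Illustrative sketch of the consumer side (assumes the standard OMPT tool
// interface from <omp-tools.h>; not code in this runtime): a tool that wants
// the end-parallel event registers something like
//
//   static void on_parallel_end(ompt_data_t *parallel_data,
//                               ompt_data_t *encountering_task_data,
//                               int flags, const void *codeptr_ra) { ... }
//   ...
//   ompt_set_callback(ompt_callback_parallel_end,
//                     (ompt_callback_t)on_parallel_end);
//
// and relies on this join path (or on __kmpc_end_serialized_parallel in the
// serialized GNU case noted above) to invoke it.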
2290 if (ompt_enabled.enabled && 2291 !(team->t.t_serialized && fork_context == fork_context_gnu)) { 2292 master_th->th.ompt_thread_info.state = ompt_state_overhead; 2293 } 2294 #endif 2295 2296 #if KMP_DEBUG 2297 if (__kmp_tasking_mode != tskm_immediate_exec && !exit_teams) { 2298 KA_TRACE(20, ("__kmp_join_call: T#%d, old team = %p old task_team = %p, " 2299 "th_task_team = %p\n", 2300 __kmp_gtid_from_thread(master_th), team, 2301 team->t.t_task_team[master_th->th.th_task_state], 2302 master_th->th.th_task_team)); 2303 KMP_DEBUG_ASSERT(master_th->th.th_task_team == 2304 team->t.t_task_team[master_th->th.th_task_state]); 2305 } 2306 #endif 2307 2308 if (team->t.t_serialized) { 2309 if (master_th->th.th_teams_microtask) { 2310 // We are in teams construct 2311 int level = team->t.t_level; 2312 int tlevel = master_th->th.th_teams_level; 2313 if (level == tlevel) { 2314 // AC: we haven't incremented it earlier at start of teams construct, 2315 // so do it here - at the end of teams construct 2316 team->t.t_level++; 2317 } else if (level == tlevel + 1) { 2318 // AC: we are exiting parallel inside teams, need to increment 2319 // serialization in order to restore it in the next call to 2320 // __kmpc_end_serialized_parallel 2321 team->t.t_serialized++; 2322 } 2323 } 2324 __kmpc_end_serialized_parallel(loc, gtid); 2325 2326 #if OMPT_SUPPORT 2327 if (ompt_enabled.enabled) { 2328 __kmp_join_restore_state(master_th, parent_team); 2329 } 2330 #endif 2331 2332 return; 2333 } 2334 2335 master_active = team->t.t_master_active; 2336 2337 if (!exit_teams) { 2338 // AC: No barrier for internal teams at exit from teams construct. 2339 // But there is barrier for external team (league). 2340 __kmp_internal_join(loc, gtid, team); 2341 #if USE_ITT_BUILD 2342 if (__itt_stack_caller_create_ptr) { 2343 KMP_DEBUG_ASSERT(team->t.t_stack_id != NULL); 2344 // destroy the stack stitching id after join barrier 2345 __kmp_itt_stack_caller_destroy((__itt_caller)team->t.t_stack_id); 2346 team->t.t_stack_id = NULL; 2347 } 2348 #endif 2349 } else { 2350 master_th->th.th_task_state = 2351 0; // AC: no tasking in teams (out of any parallel) 2352 #if USE_ITT_BUILD 2353 if (__itt_stack_caller_create_ptr && parent_team->t.t_serialized) { 2354 KMP_DEBUG_ASSERT(parent_team->t.t_stack_id != NULL); 2355 // destroy the stack stitching id on exit from the teams construct 2356 // if parent_team is active, then the id will be destroyed later on 2357 // by master of the league of teams 2358 __kmp_itt_stack_caller_destroy((__itt_caller)parent_team->t.t_stack_id); 2359 parent_team->t.t_stack_id = NULL; 2360 } 2361 #endif 2362 } 2363 2364 KMP_MB(); 2365 2366 #if OMPT_SUPPORT 2367 ompt_data_t *parallel_data = &(team->t.ompt_team_info.parallel_data); 2368 void *codeptr = team->t.ompt_team_info.master_return_address; 2369 #endif 2370 2371 #if USE_ITT_BUILD 2372 // Mark end of "parallel" region for Intel(R) VTune(TM) analyzer. 
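// Usage note (assumption: these knobs are parsed in kmp_settings.cpp as in
// Intel/LLVM runtime builds): frame reporting for VTune is normally switched
// on from the environment, e.g.
//
//   KMP_FORKJOIN_FRAMES=1 KMP_FORKJOIN_FRAMES_MODE=3 ./app
//
// Mode 3 submits whole-region frames via __kmp_itt_frame_submit() below,
// while mode 0 with frames enabled falls back to the region forking/joined
// notifications.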
2373 if (team->t.t_active_level == 1 && 2374 (!master_th->th.th_teams_microtask || /* not in teams construct */ 2375 master_th->th.th_teams_size.nteams == 1)) { 2376 master_th->th.th_ident = loc; 2377 // only one notification scheme (either "submit" or "forking/joined", not 2378 // both) 2379 if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) && 2380 __kmp_forkjoin_frames_mode == 3) 2381 __kmp_itt_frame_submit(gtid, team->t.t_region_time, 2382 master_th->th.th_frame_time, 0, loc, 2383 master_th->th.th_team_nproc, 1); 2384 else if ((__itt_frame_end_v3_ptr || KMP_ITT_DEBUG) && 2385 !__kmp_forkjoin_frames_mode && __kmp_forkjoin_frames) 2386 __kmp_itt_region_joined(gtid); 2387 } // active_level == 1 2388 #endif /* USE_ITT_BUILD */ 2389 2390 if (master_th->th.th_teams_microtask && !exit_teams && 2391 team->t.t_pkfn != (microtask_t)__kmp_teams_master && 2392 team->t.t_level == master_th->th.th_teams_level + 1) { 2393 // AC: We need to leave the team structure intact at the end of parallel 2394 // inside the teams construct, so that at the next parallel same (hot) team 2395 // works, only adjust nesting levels 2396 #if OMPT_SUPPORT 2397 ompt_data_t ompt_parallel_data = ompt_data_none; 2398 if (ompt_enabled.enabled) { 2399 ompt_task_info_t *task_info = __ompt_get_task_info_object(0); 2400 if (ompt_enabled.ompt_callback_implicit_task) { 2401 int ompt_team_size = team->t.t_nproc; 2402 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 2403 ompt_scope_end, NULL, &(task_info->task_data), ompt_team_size, 2404 OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit); 2405 } 2406 task_info->frame.exit_frame = ompt_data_none; 2407 task_info->task_data = ompt_data_none; 2408 ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th); 2409 __ompt_lw_taskteam_unlink(master_th); 2410 } 2411 #endif 2412 /* Decrement our nested depth level */ 2413 team->t.t_level--; 2414 team->t.t_active_level--; 2415 KMP_ATOMIC_DEC(&root->r.r_in_parallel); 2416 2417 // Restore number of threads in the team if needed. This code relies on 2418 // the proper adjustment of th_teams_size.nth after the fork in 2419 // __kmp_teams_master on each teams primary thread in the case that 2420 // __kmp_reserve_threads reduced it. 2421 if (master_th->th.th_team_nproc < master_th->th.th_teams_size.nth) { 2422 int old_num = master_th->th.th_team_nproc; 2423 int new_num = master_th->th.th_teams_size.nth; 2424 kmp_info_t **other_threads = team->t.t_threads; 2425 team->t.t_nproc = new_num; 2426 for (int i = 0; i < old_num; ++i) { 2427 other_threads[i]->th.th_team_nproc = new_num; 2428 } 2429 // Adjust states of non-used threads of the team 2430 for (int i = old_num; i < new_num; ++i) { 2431 // Re-initialize thread's barrier data. 
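// Illustrative scenario for the re-expansion below (user code, not part of
// the runtime): the league keeps th_teams_size.nth workers reserved, so after
//
//   #pragma omp teams num_teams(1) thread_limit(8)
//   {
//     #pragma omp parallel num_threads(2)  // shrinks the hot team to 2
//     { ... }
//     #pragma omp parallel                 // may use all 8 again
//     { ... }
//   }
//
// the join of the first region restores t_nproc and resyncs the barrier and
// task state of the workers that sat idle.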
2432 KMP_DEBUG_ASSERT(other_threads[i]); 2433 kmp_balign_t *balign = other_threads[i]->th.th_bar; 2434 for (int b = 0; b < bs_last_barrier; ++b) { 2435 balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived; 2436 KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG); 2437 #if USE_DEBUGGER 2438 balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived; 2439 #endif 2440 } 2441 if (__kmp_tasking_mode != tskm_immediate_exec) { 2442 // Synchronize thread's task state 2443 other_threads[i]->th.th_task_state = master_th->th.th_task_state; 2444 } 2445 } 2446 } 2447 2448 #if OMPT_SUPPORT 2449 if (ompt_enabled.enabled) { 2450 __kmp_join_ompt(gtid, master_th, parent_team, &ompt_parallel_data, 2451 OMPT_INVOKER(fork_context) | ompt_parallel_team, codeptr); 2452 } 2453 #endif 2454 2455 return; 2456 } 2457 2458 /* do cleanup and restore the parent team */ 2459 master_th->th.th_info.ds.ds_tid = team->t.t_master_tid; 2460 master_th->th.th_local.this_construct = team->t.t_master_this_cons; 2461 2462 master_th->th.th_dispatch = &parent_team->t.t_dispatch[team->t.t_master_tid]; 2463 2464 /* jc: The following lock has instructions with REL and ACQ semantics, 2465 separating the parallel user code called in this parallel region 2466 from the serial user code called after this function returns. */ 2467 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock); 2468 2469 if (!master_th->th.th_teams_microtask || 2470 team->t.t_level > master_th->th.th_teams_level) { 2471 /* Decrement our nested depth level */ 2472 KMP_ATOMIC_DEC(&root->r.r_in_parallel); 2473 } 2474 KMP_DEBUG_ASSERT(root->r.r_in_parallel >= 0); 2475 2476 #if OMPT_SUPPORT 2477 if (ompt_enabled.enabled) { 2478 ompt_task_info_t *task_info = __ompt_get_task_info_object(0); 2479 if (ompt_enabled.ompt_callback_implicit_task) { 2480 int flags = (team_microtask == (void *)__kmp_teams_master) 2481 ? ompt_task_initial 2482 : ompt_task_implicit; 2483 int ompt_team_size = (flags == ompt_task_initial) ? 0 : team->t.t_nproc; 2484 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 2485 ompt_scope_end, NULL, &(task_info->task_data), ompt_team_size, 2486 OMPT_CUR_TASK_INFO(master_th)->thread_num, flags); 2487 } 2488 task_info->frame.exit_frame = ompt_data_none; 2489 task_info->task_data = ompt_data_none; 2490 } 2491 #endif 2492 2493 KF_TRACE(10, ("__kmp_join_call1: T#%d, this_thread=%p team=%p\n", 0, 2494 master_th, team)); 2495 __kmp_pop_current_task_from_thread(master_th); 2496 2497 #if KMP_AFFINITY_SUPPORTED 2498 // Restore master thread's partition. 2499 master_th->th.th_first_place = team->t.t_first_place; 2500 master_th->th.th_last_place = team->t.t_last_place; 2501 #endif // KMP_AFFINITY_SUPPORTED 2502 master_th->th.th_def_allocator = team->t.t_def_allocator; 2503 2504 #if OMPD_SUPPORT 2505 if (ompd_state & OMPD_ENABLE_BP) 2506 ompd_bp_parallel_end(); 2507 #endif 2508 updateHWFPControl(team); 2509 2510 if (root->r.r_active != master_active) 2511 root->r.r_active = master_active; 2512 2513 __kmp_free_team(root, team USE_NESTED_HOT_ARG( 2514 master_th)); // this will free worker threads 2515 2516 /* this race was fun to find. make sure the following is in the critical 2517 region otherwise assertions may fail occasionally since the old team may be 2518 reallocated and the hierarchy appears inconsistent. it is actually safe to 2519 run and won't cause any bugs, but will cause those assertion failures. 
it's 2520 only one deref&assign so might as well put this in the critical region */ 2521 master_th->th.th_team = parent_team; 2522 master_th->th.th_team_nproc = parent_team->t.t_nproc; 2523 master_th->th.th_team_master = parent_team->t.t_threads[0]; 2524 master_th->th.th_team_serialized = parent_team->t.t_serialized; 2525 2526 /* restore serialized team, if need be */ 2527 if (parent_team->t.t_serialized && 2528 parent_team != master_th->th.th_serial_team && 2529 parent_team != root->r.r_root_team) { 2530 __kmp_free_team(root, 2531 master_th->th.th_serial_team USE_NESTED_HOT_ARG(NULL)); 2532 master_th->th.th_serial_team = parent_team; 2533 } 2534 2535 if (__kmp_tasking_mode != tskm_immediate_exec) { 2536 if (master_th->th.th_task_state_top > 2537 0) { // Restore task state from memo stack 2538 KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack); 2539 // Remember primary thread's state if we re-use this nested hot team 2540 master_th->th.th_task_state_memo_stack[master_th->th.th_task_state_top] = 2541 master_th->th.th_task_state; 2542 --master_th->th.th_task_state_top; // pop 2543 // Now restore state at this level 2544 master_th->th.th_task_state = 2545 master_th->th 2546 .th_task_state_memo_stack[master_th->th.th_task_state_top]; 2547 } 2548 // Copy the task team from the parent team to the primary thread 2549 master_th->th.th_task_team = 2550 parent_team->t.t_task_team[master_th->th.th_task_state]; 2551 KA_TRACE(20, 2552 ("__kmp_join_call: Primary T#%d restoring task_team %p, team %p\n", 2553 __kmp_gtid_from_thread(master_th), master_th->th.th_task_team, 2554 parent_team)); 2555 } 2556 2557 // TODO: GEH - cannot do this assertion because root thread not set up as 2558 // executing 2559 // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 0 ); 2560 master_th->th.th_current_task->td_flags.executing = 1; 2561 2562 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); 2563 2564 #if OMPT_SUPPORT 2565 int flags = 2566 OMPT_INVOKER(fork_context) | 2567 ((team_microtask == (void *)__kmp_teams_master) ? ompt_parallel_league 2568 : ompt_parallel_team); 2569 if (ompt_enabled.enabled) { 2570 __kmp_join_ompt(gtid, master_th, parent_team, parallel_data, flags, 2571 codeptr); 2572 } 2573 #endif 2574 2575 KMP_MB(); 2576 KA_TRACE(20, ("__kmp_join_call: exit T#%d\n", gtid)); 2577 } 2578 2579 /* Check whether we should push an internal control record onto the 2580 serial team stack. If so, do it. 
*/ 2581 void __kmp_save_internal_controls(kmp_info_t *thread) { 2582 2583 if (thread->th.th_team != thread->th.th_serial_team) { 2584 return; 2585 } 2586 if (thread->th.th_team->t.t_serialized > 1) { 2587 int push = 0; 2588 2589 if (thread->th.th_team->t.t_control_stack_top == NULL) { 2590 push = 1; 2591 } else { 2592 if (thread->th.th_team->t.t_control_stack_top->serial_nesting_level != 2593 thread->th.th_team->t.t_serialized) { 2594 push = 1; 2595 } 2596 } 2597 if (push) { /* push a record on the serial team's stack */ 2598 kmp_internal_control_t *control = 2599 (kmp_internal_control_t *)__kmp_allocate( 2600 sizeof(kmp_internal_control_t)); 2601 2602 copy_icvs(control, &thread->th.th_current_task->td_icvs); 2603 2604 control->serial_nesting_level = thread->th.th_team->t.t_serialized; 2605 2606 control->next = thread->th.th_team->t.t_control_stack_top; 2607 thread->th.th_team->t.t_control_stack_top = control; 2608 } 2609 } 2610 } 2611 2612 /* Changes set_nproc */ 2613 void __kmp_set_num_threads(int new_nth, int gtid) { 2614 kmp_info_t *thread; 2615 kmp_root_t *root; 2616 2617 KF_TRACE(10, ("__kmp_set_num_threads: new __kmp_nth = %d\n", new_nth)); 2618 KMP_DEBUG_ASSERT(__kmp_init_serial); 2619 2620 if (new_nth < 1) 2621 new_nth = 1; 2622 else if (new_nth > __kmp_max_nth) 2623 new_nth = __kmp_max_nth; 2624 2625 KMP_COUNT_VALUE(OMP_set_numthreads, new_nth); 2626 thread = __kmp_threads[gtid]; 2627 if (thread->th.th_current_task->td_icvs.nproc == new_nth) 2628 return; // nothing to do 2629 2630 __kmp_save_internal_controls(thread); 2631 2632 set__nproc(thread, new_nth); 2633 2634 // If this omp_set_num_threads() call will cause the hot team size to be 2635 // reduced (in the absence of a num_threads clause), then reduce it now, 2636 // rather than waiting for the next parallel region. 2637 root = thread->th.th_root; 2638 if (__kmp_init_parallel && (!root->r.r_active) && 2639 (root->r.r_hot_team->t.t_nproc > new_nth) 2640 #if KMP_NESTED_HOT_TEAMS 2641 && __kmp_hot_teams_max_level && !__kmp_hot_teams_mode 2642 #endif 2643 ) { 2644 kmp_team_t *hot_team = root->r.r_hot_team; 2645 int f; 2646 2647 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock); 2648 2649 // Release the extra threads we don't need any more. 2650 for (f = new_nth; f < hot_team->t.t_nproc; f++) { 2651 KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL); 2652 if (__kmp_tasking_mode != tskm_immediate_exec) { 2653 // When decreasing team size, threads no longer in the team should unref 2654 // task team. 2655 hot_team->t.t_threads[f]->th.th_task_team = NULL; 2656 } 2657 __kmp_free_thread(hot_team->t.t_threads[f]); 2658 hot_team->t.t_threads[f] = NULL; 2659 } 2660 hot_team->t.t_nproc = new_nth; 2661 #if KMP_NESTED_HOT_TEAMS 2662 if (thread->th.th_hot_teams) { 2663 KMP_DEBUG_ASSERT(hot_team == thread->th.th_hot_teams[0].hot_team); 2664 thread->th.th_hot_teams[0].hot_team_nth = new_nth; 2665 } 2666 #endif 2667 2668 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); 2669 2670 // Update the t_nproc field in the threads that are still active. 
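// Illustrative trigger for this path (user code, not part of the runtime):
//
//   omp_set_num_threads(8);
//   #pragma omp parallel
//   { ... }                  // hot team now holds 8 threads
//   omp_set_num_threads(2);  // the 6 extra workers are released right here,
//   #pragma omp parallel     // not at the next fork
//   { ... }
//
// which is why the loop below only needs to touch the first new_nth entries.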
2671 for (f = 0; f < new_nth; f++) { 2672 KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL); 2673 hot_team->t.t_threads[f]->th.th_team_nproc = new_nth; 2674 } 2675 // Special flag in case omp_set_num_threads() call 2676 hot_team->t.t_size_changed = -1; 2677 } 2678 } 2679 2680 /* Changes max_active_levels */ 2681 void __kmp_set_max_active_levels(int gtid, int max_active_levels) { 2682 kmp_info_t *thread; 2683 2684 KF_TRACE(10, ("__kmp_set_max_active_levels: new max_active_levels for thread " 2685 "%d = (%d)\n", 2686 gtid, max_active_levels)); 2687 KMP_DEBUG_ASSERT(__kmp_init_serial); 2688 2689 // validate max_active_levels 2690 if (max_active_levels < 0) { 2691 KMP_WARNING(ActiveLevelsNegative, max_active_levels); 2692 // We ignore this call if the user has specified a negative value. 2693 // The current setting won't be changed. The last valid setting will be 2694 // used. A warning will be issued (if warnings are allowed as controlled by 2695 // the KMP_WARNINGS env var). 2696 KF_TRACE(10, ("__kmp_set_max_active_levels: the call is ignored: new " 2697 "max_active_levels for thread %d = (%d)\n", 2698 gtid, max_active_levels)); 2699 return; 2700 } 2701 if (max_active_levels <= KMP_MAX_ACTIVE_LEVELS_LIMIT) { 2702 // it's OK, the max_active_levels is within the valid range: [ 0; 2703 // KMP_MAX_ACTIVE_LEVELS_LIMIT ] 2704 // We allow a zero value. (implementation defined behavior) 2705 } else { 2706 KMP_WARNING(ActiveLevelsExceedLimit, max_active_levels, 2707 KMP_MAX_ACTIVE_LEVELS_LIMIT); 2708 max_active_levels = KMP_MAX_ACTIVE_LEVELS_LIMIT; 2709 // Current upper limit is MAX_INT. (implementation defined behavior) 2710 // If the input exceeds the upper limit, we correct the input to be the 2711 // upper limit. (implementation defined behavior) 2712 // Actually, the flow should never get here until we use MAX_INT limit. 
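// Illustrative usage (user code, typically reached via
// omp_set_max_active_levels(); not part of the runtime):
//
//   omp_set_max_active_levels(2);   // allow two nested active levels
//   omp_set_max_active_levels(-1);  // ignored with a warning; 2 stays in effect
//
// Requests above KMP_MAX_ACTIVE_LEVELS_LIMIT are clamped to the limit, as
// handled just above.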
2713 } 2714 KF_TRACE(10, ("__kmp_set_max_active_levels: after validation: new " 2715 "max_active_levels for thread %d = (%d)\n", 2716 gtid, max_active_levels)); 2717 2718 thread = __kmp_threads[gtid]; 2719 2720 __kmp_save_internal_controls(thread); 2721 2722 set__max_active_levels(thread, max_active_levels); 2723 } 2724 2725 /* Gets max_active_levels */ 2726 int __kmp_get_max_active_levels(int gtid) { 2727 kmp_info_t *thread; 2728 2729 KF_TRACE(10, ("__kmp_get_max_active_levels: thread %d\n", gtid)); 2730 KMP_DEBUG_ASSERT(__kmp_init_serial); 2731 2732 thread = __kmp_threads[gtid]; 2733 KMP_DEBUG_ASSERT(thread->th.th_current_task); 2734 KF_TRACE(10, ("__kmp_get_max_active_levels: thread %d, curtask=%p, " 2735 "curtask_maxaclevel=%d\n", 2736 gtid, thread->th.th_current_task, 2737 thread->th.th_current_task->td_icvs.max_active_levels)); 2738 return thread->th.th_current_task->td_icvs.max_active_levels; 2739 } 2740 2741 // nteams-var per-device ICV 2742 void __kmp_set_num_teams(int num_teams) { 2743 if (num_teams > 0) 2744 __kmp_nteams = num_teams; 2745 } 2746 int __kmp_get_max_teams(void) { return __kmp_nteams; } 2747 // teams-thread-limit-var per-device ICV 2748 void __kmp_set_teams_thread_limit(int limit) { 2749 if (limit > 0) 2750 __kmp_teams_thread_limit = limit; 2751 } 2752 int __kmp_get_teams_thread_limit(void) { return __kmp_teams_thread_limit; } 2753 2754 KMP_BUILD_ASSERT(sizeof(kmp_sched_t) == sizeof(int)); 2755 KMP_BUILD_ASSERT(sizeof(enum sched_type) == sizeof(int)); 2756 2757 /* Changes def_sched_var ICV values (run-time schedule kind and chunk) */ 2758 void __kmp_set_schedule(int gtid, kmp_sched_t kind, int chunk) { 2759 kmp_info_t *thread; 2760 kmp_sched_t orig_kind; 2761 // kmp_team_t *team; 2762 2763 KF_TRACE(10, ("__kmp_set_schedule: new schedule for thread %d = (%d, %d)\n", 2764 gtid, (int)kind, chunk)); 2765 KMP_DEBUG_ASSERT(__kmp_init_serial); 2766 2767 // Check if the kind parameter is valid, correct if needed. 2768 // Valid parameters should fit in one of two intervals - standard or extended: 2769 // <lower>, <valid>, <upper_std>, <lower_ext>, <valid>, <upper> 2770 // 2008-01-25: 0, 1 - 4, 5, 100, 101 - 102, 103 2771 orig_kind = kind; 2772 kind = __kmp_sched_without_mods(kind); 2773 2774 if (kind <= kmp_sched_lower || kind >= kmp_sched_upper || 2775 (kind <= kmp_sched_lower_ext && kind >= kmp_sched_upper_std)) { 2776 // TODO: Hint needs attention in case we change the default schedule. 2777 __kmp_msg(kmp_ms_warning, KMP_MSG(ScheduleKindOutOfRange, kind), 2778 KMP_HNT(DefaultScheduleKindUsed, "static, no chunk"), 2779 __kmp_msg_null); 2780 kind = kmp_sched_default; 2781 chunk = 0; // ignore chunk value in case of bad kind 2782 } 2783 2784 thread = __kmp_threads[gtid]; 2785 2786 __kmp_save_internal_controls(thread); 2787 2788 if (kind < kmp_sched_upper_std) { 2789 if (kind == kmp_sched_static && chunk < KMP_DEFAULT_CHUNK) { 2790 // differ static chunked vs. 
unchunked: chunk should be invalid to 2791 // indicate unchunked schedule (which is the default) 2792 thread->th.th_current_task->td_icvs.sched.r_sched_type = kmp_sch_static; 2793 } else { 2794 thread->th.th_current_task->td_icvs.sched.r_sched_type = 2795 __kmp_sch_map[kind - kmp_sched_lower - 1]; 2796 } 2797 } else { 2798 // __kmp_sch_map[ kind - kmp_sched_lower_ext + kmp_sched_upper_std - 2799 // kmp_sched_lower - 2 ]; 2800 thread->th.th_current_task->td_icvs.sched.r_sched_type = 2801 __kmp_sch_map[kind - kmp_sched_lower_ext + kmp_sched_upper_std - 2802 kmp_sched_lower - 2]; 2803 } 2804 __kmp_sched_apply_mods_intkind( 2805 orig_kind, &(thread->th.th_current_task->td_icvs.sched.r_sched_type)); 2806 if (kind == kmp_sched_auto || chunk < 1) { 2807 // ignore parameter chunk for schedule auto 2808 thread->th.th_current_task->td_icvs.sched.chunk = KMP_DEFAULT_CHUNK; 2809 } else { 2810 thread->th.th_current_task->td_icvs.sched.chunk = chunk; 2811 } 2812 } 2813 2814 /* Gets def_sched_var ICV values */ 2815 void __kmp_get_schedule(int gtid, kmp_sched_t *kind, int *chunk) { 2816 kmp_info_t *thread; 2817 enum sched_type th_type; 2818 2819 KF_TRACE(10, ("__kmp_get_schedule: thread %d\n", gtid)); 2820 KMP_DEBUG_ASSERT(__kmp_init_serial); 2821 2822 thread = __kmp_threads[gtid]; 2823 2824 th_type = thread->th.th_current_task->td_icvs.sched.r_sched_type; 2825 switch (SCHEDULE_WITHOUT_MODIFIERS(th_type)) { 2826 case kmp_sch_static: 2827 case kmp_sch_static_greedy: 2828 case kmp_sch_static_balanced: 2829 *kind = kmp_sched_static; 2830 __kmp_sched_apply_mods_stdkind(kind, th_type); 2831 *chunk = 0; // chunk was not set, try to show this fact via zero value 2832 return; 2833 case kmp_sch_static_chunked: 2834 *kind = kmp_sched_static; 2835 break; 2836 case kmp_sch_dynamic_chunked: 2837 *kind = kmp_sched_dynamic; 2838 break; 2839 case kmp_sch_guided_chunked: 2840 case kmp_sch_guided_iterative_chunked: 2841 case kmp_sch_guided_analytical_chunked: 2842 *kind = kmp_sched_guided; 2843 break; 2844 case kmp_sch_auto: 2845 *kind = kmp_sched_auto; 2846 break; 2847 case kmp_sch_trapezoidal: 2848 *kind = kmp_sched_trapezoidal; 2849 break; 2850 #if KMP_STATIC_STEAL_ENABLED 2851 case kmp_sch_static_steal: 2852 *kind = kmp_sched_static_steal; 2853 break; 2854 #endif 2855 default: 2856 KMP_FATAL(UnknownSchedulingType, th_type); 2857 } 2858 2859 __kmp_sched_apply_mods_stdkind(kind, th_type); 2860 *chunk = thread->th.th_current_task->td_icvs.sched.chunk; 2861 } 2862 2863 int __kmp_get_ancestor_thread_num(int gtid, int level) { 2864 2865 int ii, dd; 2866 kmp_team_t *team; 2867 kmp_info_t *thr; 2868 2869 KF_TRACE(10, ("__kmp_get_ancestor_thread_num: thread %d %d\n", gtid, level)); 2870 KMP_DEBUG_ASSERT(__kmp_init_serial); 2871 2872 // validate level 2873 if (level == 0) 2874 return 0; 2875 if (level < 0) 2876 return -1; 2877 thr = __kmp_threads[gtid]; 2878 team = thr->th.th_team; 2879 ii = team->t.t_level; 2880 if (level > ii) 2881 return -1; 2882 2883 if (thr->th.th_teams_microtask) { 2884 // AC: we are in teams region where multiple nested teams have same level 2885 int tlevel = thr->th.th_teams_level; // the level of the teams construct 2886 if (level <= 2887 tlevel) { // otherwise usual algorithm works (will not touch the teams) 2888 KMP_DEBUG_ASSERT(ii >= tlevel); 2889 // AC: As we need to pass by the teams league, we need to artificially 2890 // increase ii 2891 if (ii == tlevel) { 2892 ii += 2; // three teams have same level 2893 } else { 2894 ii++; // two teams have same level 2895 } 2896 } 2897 } 2898 2899 if (ii == 
level) 2900 return __kmp_tid_from_gtid(gtid); 2901 2902 dd = team->t.t_serialized; 2903 level++; 2904 while (ii > level) { 2905 for (dd = team->t.t_serialized; (dd > 0) && (ii > level); dd--, ii--) { 2906 } 2907 if ((team->t.t_serialized) && (!dd)) { 2908 team = team->t.t_parent; 2909 continue; 2910 } 2911 if (ii > level) { 2912 team = team->t.t_parent; 2913 dd = team->t.t_serialized; 2914 ii--; 2915 } 2916 } 2917 2918 return (dd > 1) ? (0) : (team->t.t_master_tid); 2919 } 2920 2921 int __kmp_get_team_size(int gtid, int level) { 2922 2923 int ii, dd; 2924 kmp_team_t *team; 2925 kmp_info_t *thr; 2926 2927 KF_TRACE(10, ("__kmp_get_team_size: thread %d %d\n", gtid, level)); 2928 KMP_DEBUG_ASSERT(__kmp_init_serial); 2929 2930 // validate level 2931 if (level == 0) 2932 return 1; 2933 if (level < 0) 2934 return -1; 2935 thr = __kmp_threads[gtid]; 2936 team = thr->th.th_team; 2937 ii = team->t.t_level; 2938 if (level > ii) 2939 return -1; 2940 2941 if (thr->th.th_teams_microtask) { 2942 // AC: we are in teams region where multiple nested teams have same level 2943 int tlevel = thr->th.th_teams_level; // the level of the teams construct 2944 if (level <= 2945 tlevel) { // otherwise usual algorithm works (will not touch the teams) 2946 KMP_DEBUG_ASSERT(ii >= tlevel); 2947 // AC: As we need to pass by the teams league, we need to artificially 2948 // increase ii 2949 if (ii == tlevel) { 2950 ii += 2; // three teams have same level 2951 } else { 2952 ii++; // two teams have same level 2953 } 2954 } 2955 } 2956 2957 while (ii > level) { 2958 for (dd = team->t.t_serialized; (dd > 0) && (ii > level); dd--, ii--) { 2959 } 2960 if (team->t.t_serialized && (!dd)) { 2961 team = team->t.t_parent; 2962 continue; 2963 } 2964 if (ii > level) { 2965 team = team->t.t_parent; 2966 ii--; 2967 } 2968 } 2969 2970 return team->t.t_nproc; 2971 } 2972 2973 kmp_r_sched_t __kmp_get_schedule_global() { 2974 // This routine created because pairs (__kmp_sched, __kmp_chunk) and 2975 // (__kmp_static, __kmp_guided) may be changed by kmp_set_defaults 2976 // independently. So one can get the updated schedule here. 2977 2978 kmp_r_sched_t r_sched; 2979 2980 // create schedule from 4 globals: __kmp_sched, __kmp_chunk, __kmp_static, 2981 // __kmp_guided. __kmp_sched should keep original value, so that user can set 2982 // KMP_SCHEDULE multiple times, and thus have different run-time schedules in 2983 // different roots (even in OMP 2.5) 2984 enum sched_type s = SCHEDULE_WITHOUT_MODIFIERS(__kmp_sched); 2985 enum sched_type sched_modifiers = SCHEDULE_GET_MODIFIERS(__kmp_sched); 2986 if (s == kmp_sch_static) { 2987 // replace STATIC with more detailed schedule (balanced or greedy) 2988 r_sched.r_sched_type = __kmp_static; 2989 } else if (s == kmp_sch_guided_chunked) { 2990 // replace GUIDED with more detailed schedule (iterative or analytical) 2991 r_sched.r_sched_type = __kmp_guided; 2992 } else { // (STATIC_CHUNKED), or (DYNAMIC_CHUNKED), or other 2993 r_sched.r_sched_type = __kmp_sched; 2994 } 2995 SCHEDULE_SET_MODIFIERS(r_sched.r_sched_type, sched_modifiers); 2996 2997 if (__kmp_chunk < KMP_DEFAULT_CHUNK) { 2998 // __kmp_chunk may be wrong here (if it was not ever set) 2999 r_sched.chunk = KMP_DEFAULT_CHUNK; 3000 } else { 3001 r_sched.chunk = __kmp_chunk; 3002 } 3003 3004 return r_sched; 3005 } 3006 3007 /* Allocate (realloc == FALSE) * or reallocate (realloc == TRUE) 3008 at least argc number of *t_argv entries for the requested team. 
*/ 3009 static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team, int realloc) { 3010 3011 KMP_DEBUG_ASSERT(team); 3012 if (!realloc || argc > team->t.t_max_argc) { 3013 3014 KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: needed entries=%d, " 3015 "current entries=%d\n", 3016 team->t.t_id, argc, (realloc) ? team->t.t_max_argc : 0)); 3017 /* if previously allocated heap space for args, free them */ 3018 if (realloc && team->t.t_argv != &team->t.t_inline_argv[0]) 3019 __kmp_free((void *)team->t.t_argv); 3020 3021 if (argc <= KMP_INLINE_ARGV_ENTRIES) { 3022 /* use unused space in the cache line for arguments */ 3023 team->t.t_max_argc = KMP_INLINE_ARGV_ENTRIES; 3024 KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: inline allocate %d " 3025 "argv entries\n", 3026 team->t.t_id, team->t.t_max_argc)); 3027 team->t.t_argv = &team->t.t_inline_argv[0]; 3028 if (__kmp_storage_map) { 3029 __kmp_print_storage_map_gtid( 3030 -1, &team->t.t_inline_argv[0], 3031 &team->t.t_inline_argv[KMP_INLINE_ARGV_ENTRIES], 3032 (sizeof(void *) * KMP_INLINE_ARGV_ENTRIES), "team_%d.t_inline_argv", 3033 team->t.t_id); 3034 } 3035 } else { 3036 /* allocate space for arguments in the heap */ 3037 team->t.t_max_argc = (argc <= (KMP_MIN_MALLOC_ARGV_ENTRIES >> 1)) 3038 ? KMP_MIN_MALLOC_ARGV_ENTRIES 3039 : 2 * argc; 3040 KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: dynamic allocate %d " 3041 "argv entries\n", 3042 team->t.t_id, team->t.t_max_argc)); 3043 team->t.t_argv = 3044 (void **)__kmp_page_allocate(sizeof(void *) * team->t.t_max_argc); 3045 if (__kmp_storage_map) { 3046 __kmp_print_storage_map_gtid(-1, &team->t.t_argv[0], 3047 &team->t.t_argv[team->t.t_max_argc], 3048 sizeof(void *) * team->t.t_max_argc, 3049 "team_%d.t_argv", team->t.t_id); 3050 } 3051 } 3052 } 3053 } 3054 3055 static void __kmp_allocate_team_arrays(kmp_team_t *team, int max_nth) { 3056 int i; 3057 int num_disp_buff = max_nth > 1 ? 
__kmp_dispatch_num_buffers : 2; 3058 team->t.t_threads = 3059 (kmp_info_t **)__kmp_allocate(sizeof(kmp_info_t *) * max_nth); 3060 team->t.t_disp_buffer = (dispatch_shared_info_t *)__kmp_allocate( 3061 sizeof(dispatch_shared_info_t) * num_disp_buff); 3062 team->t.t_dispatch = 3063 (kmp_disp_t *)__kmp_allocate(sizeof(kmp_disp_t) * max_nth); 3064 team->t.t_implicit_task_taskdata = 3065 (kmp_taskdata_t *)__kmp_allocate(sizeof(kmp_taskdata_t) * max_nth); 3066 team->t.t_max_nproc = max_nth; 3067 3068 /* setup dispatch buffers */ 3069 for (i = 0; i < num_disp_buff; ++i) { 3070 team->t.t_disp_buffer[i].buffer_index = i; 3071 team->t.t_disp_buffer[i].doacross_buf_idx = i; 3072 } 3073 } 3074 3075 static void __kmp_free_team_arrays(kmp_team_t *team) { 3076 /* Note: this does not free the threads in t_threads (__kmp_free_threads) */ 3077 int i; 3078 for (i = 0; i < team->t.t_max_nproc; ++i) { 3079 if (team->t.t_dispatch[i].th_disp_buffer != NULL) { 3080 __kmp_free(team->t.t_dispatch[i].th_disp_buffer); 3081 team->t.t_dispatch[i].th_disp_buffer = NULL; 3082 } 3083 } 3084 #if KMP_USE_HIER_SCHED 3085 __kmp_dispatch_free_hierarchies(team); 3086 #endif 3087 __kmp_free(team->t.t_threads); 3088 __kmp_free(team->t.t_disp_buffer); 3089 __kmp_free(team->t.t_dispatch); 3090 __kmp_free(team->t.t_implicit_task_taskdata); 3091 team->t.t_threads = NULL; 3092 team->t.t_disp_buffer = NULL; 3093 team->t.t_dispatch = NULL; 3094 team->t.t_implicit_task_taskdata = 0; 3095 } 3096 3097 static void __kmp_reallocate_team_arrays(kmp_team_t *team, int max_nth) { 3098 kmp_info_t **oldThreads = team->t.t_threads; 3099 3100 __kmp_free(team->t.t_disp_buffer); 3101 __kmp_free(team->t.t_dispatch); 3102 __kmp_free(team->t.t_implicit_task_taskdata); 3103 __kmp_allocate_team_arrays(team, max_nth); 3104 3105 KMP_MEMCPY(team->t.t_threads, oldThreads, 3106 team->t.t_nproc * sizeof(kmp_info_t *)); 3107 3108 __kmp_free(oldThreads); 3109 } 3110 3111 static kmp_internal_control_t __kmp_get_global_icvs(void) { 3112 3113 kmp_r_sched_t r_sched = 3114 __kmp_get_schedule_global(); // get current state of scheduling globals 3115 3116 KMP_DEBUG_ASSERT(__kmp_nested_proc_bind.used > 0); 3117 3118 kmp_internal_control_t g_icvs = { 3119 0, // int serial_nesting_level; //corresponds to value of th_team_serialized 3120 (kmp_int8)__kmp_global.g.g_dynamic, // internal control for dynamic 3121 // adjustment of threads (per thread) 3122 (kmp_int8)__kmp_env_blocktime, // int bt_set; //internal control for 3123 // whether blocktime is explicitly set 3124 __kmp_dflt_blocktime, // int blocktime; //internal control for blocktime 3125 #if KMP_USE_MONITOR 3126 __kmp_bt_intervals, // int bt_intervals; //internal control for blocktime 3127 // intervals 3128 #endif 3129 __kmp_dflt_team_nth, // int nproc; //internal control for # of threads for 3130 // next parallel region (per thread) 3131 // (use a max ub on value if __kmp_parallel_initialize not called yet) 3132 __kmp_cg_max_nth, // int thread_limit; 3133 __kmp_dflt_max_active_levels, // int max_active_levels; //internal control 3134 // for max_active_levels 3135 r_sched, // kmp_r_sched_t sched; //internal control for runtime schedule 3136 // {sched,chunk} pair 3137 __kmp_nested_proc_bind.bind_types[0], 3138 __kmp_default_device, 3139 NULL // struct kmp_internal_control *next; 3140 }; 3141 3142 return g_icvs; 3143 } 3144 3145 static kmp_internal_control_t __kmp_get_x_global_icvs(const kmp_team_t *team) { 3146 3147 kmp_internal_control_t gx_icvs; 3148 gx_icvs.serial_nesting_level = 3149 0; // probably =team->t.t_serial 
like in save_inter_controls 3150 copy_icvs(&gx_icvs, &team->t.t_threads[0]->th.th_current_task->td_icvs); 3151 gx_icvs.next = NULL; 3152 3153 return gx_icvs; 3154 } 3155 3156 static void __kmp_initialize_root(kmp_root_t *root) { 3157 int f; 3158 kmp_team_t *root_team; 3159 kmp_team_t *hot_team; 3160 int hot_team_max_nth; 3161 kmp_r_sched_t r_sched = 3162 __kmp_get_schedule_global(); // get current state of scheduling globals 3163 kmp_internal_control_t r_icvs = __kmp_get_global_icvs(); 3164 KMP_DEBUG_ASSERT(root); 3165 KMP_ASSERT(!root->r.r_begin); 3166 3167 /* setup the root state structure */ 3168 __kmp_init_lock(&root->r.r_begin_lock); 3169 root->r.r_begin = FALSE; 3170 root->r.r_active = FALSE; 3171 root->r.r_in_parallel = 0; 3172 root->r.r_blocktime = __kmp_dflt_blocktime; 3173 #if KMP_AFFINITY_SUPPORTED 3174 root->r.r_affinity_assigned = FALSE; 3175 #endif 3176 3177 /* setup the root team for this task */ 3178 /* allocate the root team structure */ 3179 KF_TRACE(10, ("__kmp_initialize_root: before root_team\n")); 3180 3181 root_team = 3182 __kmp_allocate_team(root, 3183 1, // new_nproc 3184 1, // max_nproc 3185 #if OMPT_SUPPORT 3186 ompt_data_none, // root parallel id 3187 #endif 3188 __kmp_nested_proc_bind.bind_types[0], &r_icvs, 3189 0 // argc 3190 USE_NESTED_HOT_ARG(NULL) // primary thread is unknown 3191 ); 3192 #if USE_DEBUGGER 3193 // Non-NULL value should be assigned to make the debugger display the root 3194 // team. 3195 TCW_SYNC_PTR(root_team->t.t_pkfn, (microtask_t)(~0)); 3196 #endif 3197 3198 KF_TRACE(10, ("__kmp_initialize_root: after root_team = %p\n", root_team)); 3199 3200 root->r.r_root_team = root_team; 3201 root_team->t.t_control_stack_top = NULL; 3202 3203 /* initialize root team */ 3204 root_team->t.t_threads[0] = NULL; 3205 root_team->t.t_nproc = 1; 3206 root_team->t.t_serialized = 1; 3207 // TODO???: root_team->t.t_max_active_levels = __kmp_dflt_max_active_levels; 3208 root_team->t.t_sched.sched = r_sched.sched; 3209 KA_TRACE( 3210 20, 3211 ("__kmp_initialize_root: init root team %d arrived: join=%u, plain=%u\n", 3212 root_team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE)); 3213 3214 /* setup the hot team for this task */ 3215 /* allocate the hot team structure */ 3216 KF_TRACE(10, ("__kmp_initialize_root: before hot_team\n")); 3217 3218 hot_team = 3219 __kmp_allocate_team(root, 3220 1, // new_nproc 3221 __kmp_dflt_team_nth_ub * 2, // max_nproc 3222 #if OMPT_SUPPORT 3223 ompt_data_none, // root parallel id 3224 #endif 3225 __kmp_nested_proc_bind.bind_types[0], &r_icvs, 3226 0 // argc 3227 USE_NESTED_HOT_ARG(NULL) // primary thread is unknown 3228 ); 3229 KF_TRACE(10, ("__kmp_initialize_root: after hot_team = %p\n", hot_team)); 3230 3231 root->r.r_hot_team = hot_team; 3232 root_team->t.t_control_stack_top = NULL; 3233 3234 /* first-time initialization */ 3235 hot_team->t.t_parent = root_team; 3236 3237 /* initialize hot team */ 3238 hot_team_max_nth = hot_team->t.t_max_nproc; 3239 for (f = 0; f < hot_team_max_nth; ++f) { 3240 hot_team->t.t_threads[f] = NULL; 3241 } 3242 hot_team->t.t_nproc = 1; 3243 // TODO???: hot_team->t.t_max_active_levels = __kmp_dflt_max_active_levels; 3244 hot_team->t.t_sched.sched = r_sched.sched; 3245 hot_team->t.t_size_changed = 0; 3246 } 3247 3248 #ifdef KMP_DEBUG 3249 3250 typedef struct kmp_team_list_item { 3251 kmp_team_p const *entry; 3252 struct kmp_team_list_item *next; 3253 } kmp_team_list_item_t; 3254 typedef kmp_team_list_item_t *kmp_team_list_t; 3255 3256 static void __kmp_print_structure_team_accum( // Add team 
to list of teams. 3257 kmp_team_list_t list, // List of teams. 3258 kmp_team_p const *team // Team to add. 3259 ) { 3260 3261 // List must terminate with item where both entry and next are NULL. 3262 // Team is added to the list only once. 3263 // List is sorted in ascending order by team id. 3264 // Team id is *not* a key. 3265 3266 kmp_team_list_t l; 3267 3268 KMP_DEBUG_ASSERT(list != NULL); 3269 if (team == NULL) { 3270 return; 3271 } 3272 3273 __kmp_print_structure_team_accum(list, team->t.t_parent); 3274 __kmp_print_structure_team_accum(list, team->t.t_next_pool); 3275 3276 // Search list for the team. 3277 l = list; 3278 while (l->next != NULL && l->entry != team) { 3279 l = l->next; 3280 } 3281 if (l->next != NULL) { 3282 return; // Team has been added before, exit. 3283 } 3284 3285 // Team is not found. Search list again for insertion point. 3286 l = list; 3287 while (l->next != NULL && l->entry->t.t_id <= team->t.t_id) { 3288 l = l->next; 3289 } 3290 3291 // Insert team. 3292 { 3293 kmp_team_list_item_t *item = (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC( 3294 sizeof(kmp_team_list_item_t)); 3295 *item = *l; 3296 l->entry = team; 3297 l->next = item; 3298 } 3299 } 3300 3301 static void __kmp_print_structure_team(char const *title, kmp_team_p const *team 3302 3303 ) { 3304 __kmp_printf("%s", title); 3305 if (team != NULL) { 3306 __kmp_printf("%2x %p\n", team->t.t_id, team); 3307 } else { 3308 __kmp_printf(" - (nil)\n"); 3309 } 3310 } 3311 3312 static void __kmp_print_structure_thread(char const *title, 3313 kmp_info_p const *thread) { 3314 __kmp_printf("%s", title); 3315 if (thread != NULL) { 3316 __kmp_printf("%2d %p\n", thread->th.th_info.ds.ds_gtid, thread); 3317 } else { 3318 __kmp_printf(" - (nil)\n"); 3319 } 3320 } 3321 3322 void __kmp_print_structure(void) { 3323 3324 kmp_team_list_t list; 3325 3326 // Initialize list of teams. 3327 list = 3328 (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC(sizeof(kmp_team_list_item_t)); 3329 list->entry = NULL; 3330 list->next = NULL; 3331 3332 __kmp_printf("\n------------------------------\nGlobal Thread " 3333 "Table\n------------------------------\n"); 3334 { 3335 int gtid; 3336 for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) { 3337 __kmp_printf("%2d", gtid); 3338 if (__kmp_threads != NULL) { 3339 __kmp_printf(" %p", __kmp_threads[gtid]); 3340 } 3341 if (__kmp_root != NULL) { 3342 __kmp_printf(" %p", __kmp_root[gtid]); 3343 } 3344 __kmp_printf("\n"); 3345 } 3346 } 3347 3348 // Print out __kmp_threads array. 
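// For each allocated gtid slot, the dump below shows the thread's root, its
// current team, its serial team, and its pool link, and it also accumulates
// the thread's teams into the team list that is printed further down.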
3349 __kmp_printf("\n------------------------------\nThreads\n--------------------" 3350 "----------\n"); 3351 if (__kmp_threads != NULL) { 3352 int gtid; 3353 for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) { 3354 kmp_info_t const *thread = __kmp_threads[gtid]; 3355 if (thread != NULL) { 3356 __kmp_printf("GTID %2d %p:\n", gtid, thread); 3357 __kmp_printf(" Our Root: %p\n", thread->th.th_root); 3358 __kmp_print_structure_team(" Our Team: ", thread->th.th_team); 3359 __kmp_print_structure_team(" Serial Team: ", 3360 thread->th.th_serial_team); 3361 __kmp_printf(" Threads: %2d\n", thread->th.th_team_nproc); 3362 __kmp_print_structure_thread(" Primary: ", 3363 thread->th.th_team_master); 3364 __kmp_printf(" Serialized?: %2d\n", thread->th.th_team_serialized); 3365 __kmp_printf(" Set NProc: %2d\n", thread->th.th_set_nproc); 3366 __kmp_printf(" Set Proc Bind: %2d\n", thread->th.th_set_proc_bind); 3367 __kmp_print_structure_thread(" Next in pool: ", 3368 thread->th.th_next_pool); 3369 __kmp_printf("\n"); 3370 __kmp_print_structure_team_accum(list, thread->th.th_team); 3371 __kmp_print_structure_team_accum(list, thread->th.th_serial_team); 3372 } 3373 } 3374 } else { 3375 __kmp_printf("Threads array is not allocated.\n"); 3376 } 3377 3378 // Print out __kmp_root array. 3379 __kmp_printf("\n------------------------------\nUbers\n----------------------" 3380 "--------\n"); 3381 if (__kmp_root != NULL) { 3382 int gtid; 3383 for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) { 3384 kmp_root_t const *root = __kmp_root[gtid]; 3385 if (root != NULL) { 3386 __kmp_printf("GTID %2d %p:\n", gtid, root); 3387 __kmp_print_structure_team(" Root Team: ", root->r.r_root_team); 3388 __kmp_print_structure_team(" Hot Team: ", root->r.r_hot_team); 3389 __kmp_print_structure_thread(" Uber Thread: ", 3390 root->r.r_uber_thread); 3391 __kmp_printf(" Active?: %2d\n", root->r.r_active); 3392 __kmp_printf(" In Parallel: %2d\n", 3393 KMP_ATOMIC_LD_RLX(&root->r.r_in_parallel)); 3394 __kmp_printf("\n"); 3395 __kmp_print_structure_team_accum(list, root->r.r_root_team); 3396 __kmp_print_structure_team_accum(list, root->r.r_hot_team); 3397 } 3398 } 3399 } else { 3400 __kmp_printf("Ubers array is not allocated.\n"); 3401 } 3402 3403 __kmp_printf("\n------------------------------\nTeams\n----------------------" 3404 "--------\n"); 3405 while (list->next != NULL) { 3406 kmp_team_p const *team = list->entry; 3407 int i; 3408 __kmp_printf("Team %2x %p:\n", team->t.t_id, team); 3409 __kmp_print_structure_team(" Parent Team: ", team->t.t_parent); 3410 __kmp_printf(" Primary TID: %2d\n", team->t.t_master_tid); 3411 __kmp_printf(" Max threads: %2d\n", team->t.t_max_nproc); 3412 __kmp_printf(" Levels of serial: %2d\n", team->t.t_serialized); 3413 __kmp_printf(" Number threads: %2d\n", team->t.t_nproc); 3414 for (i = 0; i < team->t.t_nproc; ++i) { 3415 __kmp_printf(" Thread %2d: ", i); 3416 __kmp_print_structure_thread("", team->t.t_threads[i]); 3417 } 3418 __kmp_print_structure_team(" Next in pool: ", team->t.t_next_pool); 3419 __kmp_printf("\n"); 3420 list = list->next; 3421 } 3422 3423 // Print out __kmp_thread_pool and __kmp_team_pool. 3424 __kmp_printf("\n------------------------------\nPools\n----------------------" 3425 "--------\n"); 3426 __kmp_print_structure_thread("Thread pool: ", 3427 CCAST(kmp_info_t *, __kmp_thread_pool)); 3428 __kmp_print_structure_team("Team pool: ", 3429 CCAST(kmp_team_t *, __kmp_team_pool)); 3430 __kmp_printf("\n"); 3431 3432 // Free team list. 
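// Every list item, including the terminating entry allocated at the top of
// this function, came from KMP_INTERNAL_MALLOC, so each one is released here
// with KMP_INTERNAL_FREE.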
3433 while (list != NULL) { 3434 kmp_team_list_item_t *item = list; 3435 list = list->next; 3436 KMP_INTERNAL_FREE(item); 3437 } 3438 } 3439 3440 #endif 3441 3442 //--------------------------------------------------------------------------- 3443 // Stuff for per-thread fast random number generator 3444 // Table of primes 3445 static const unsigned __kmp_primes[] = { 3446 0x9e3779b1, 0xffe6cc59, 0x2109f6dd, 0x43977ab5, 0xba5703f5, 0xb495a877, 3447 0xe1626741, 0x79695e6b, 0xbc98c09f, 0xd5bee2b3, 0x287488f9, 0x3af18231, 3448 0x9677cd4d, 0xbe3a6929, 0xadc6a877, 0xdcf0674b, 0xbe4d6fe9, 0x5f15e201, 3449 0x99afc3fd, 0xf3f16801, 0xe222cfff, 0x24ba5fdb, 0x0620452d, 0x79f149e3, 3450 0xc8b93f49, 0x972702cd, 0xb07dd827, 0x6c97d5ed, 0x085a3d61, 0x46eb5ea7, 3451 0x3d9910ed, 0x2e687b5b, 0x29609227, 0x6eb081f1, 0x0954c4e1, 0x9d114db9, 3452 0x542acfa9, 0xb3e6bd7b, 0x0742d917, 0xe9f3ffa7, 0x54581edb, 0xf2480f45, 3453 0x0bb9288f, 0xef1affc7, 0x85fa0ca7, 0x3ccc14db, 0xe6baf34b, 0x343377f7, 3454 0x5ca19031, 0xe6d9293b, 0xf0a9f391, 0x5d2e980b, 0xfc411073, 0xc3749363, 3455 0xb892d829, 0x3549366b, 0x629750ad, 0xb98294e5, 0x892d9483, 0xc235baf3, 3456 0x3d2402a3, 0x6bdef3c9, 0xbec333cd, 0x40c9520f}; 3457 3458 //--------------------------------------------------------------------------- 3459 // __kmp_get_random: Get a random number using a linear congruential method. 3460 unsigned short __kmp_get_random(kmp_info_t *thread) { 3461 unsigned x = thread->th.th_x; 3462 unsigned short r = (unsigned short)(x >> 16); 3463 3464 thread->th.th_x = x * thread->th.th_a + 1; 3465 3466 KA_TRACE(30, ("__kmp_get_random: THREAD: %d, RETURN: %u\n", 3467 thread->th.th_info.ds.ds_tid, r)); 3468 3469 return r; 3470 } 3471 //-------------------------------------------------------- 3472 // __kmp_init_random: Initialize a random number generator 3473 void __kmp_init_random(kmp_info_t *thread) { 3474 unsigned seed = thread->th.th_info.ds.ds_tid; 3475 3476 thread->th.th_a = 3477 __kmp_primes[seed % (sizeof(__kmp_primes) / sizeof(__kmp_primes[0]))]; 3478 thread->th.th_x = (seed + 1) * thread->th.th_a + 1; 3479 KA_TRACE(30, 3480 ("__kmp_init_random: THREAD: %u; A: %u\n", seed, thread->th.th_a)); 3481 } 3482 3483 #if KMP_OS_WINDOWS 3484 /* reclaim array entries for root threads that are already dead, returns number 3485 * reclaimed */ 3486 static int __kmp_reclaim_dead_roots(void) { 3487 int i, r = 0; 3488 3489 for (i = 0; i < __kmp_threads_capacity; ++i) { 3490 if (KMP_UBER_GTID(i) && 3491 !__kmp_still_running((kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[i])) && 3492 !__kmp_root[i] 3493 ->r.r_active) { // AC: reclaim only roots died in non-active state 3494 r += __kmp_unregister_root_other_thread(i); 3495 } 3496 } 3497 return r; 3498 } 3499 #endif 3500 3501 /* This function attempts to create free entries in __kmp_threads and 3502 __kmp_root, and returns the number of free entries generated. 3503 3504 For Windows* OS static library, the first mechanism used is to reclaim array 3505 entries for root threads that are already dead. 3506 3507 On all platforms, expansion is attempted on the arrays __kmp_threads_ and 3508 __kmp_root, with appropriate update to __kmp_threads_capacity. Array 3509 capacity is increased by doubling with clipping to __kmp_tp_capacity, if 3510 threadprivate cache array has been created. Synchronization with 3511 __kmpc_threadprivate_cached is done using __kmp_tp_cached_lock. 
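
   Illustrative example: with __kmp_threads_capacity == 64 and nNeed == 100,
   the minimum required capacity is 164; assuming the limits above allow it,
   the doubling loop grows the capacity 64 -> 128 -> 256 and the arrays are
   reallocated with room for 256 entries.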
3512 3513 After any dead root reclamation, if the clipping value allows array expansion 3514 to result in the generation of a total of nNeed free slots, the function does 3515 that expansion. If not, nothing is done beyond the possible initial root 3516 thread reclamation. 3517 3518 If any argument is negative, the behavior is undefined. */ 3519 static int __kmp_expand_threads(int nNeed) { 3520 int added = 0; 3521 int minimumRequiredCapacity; 3522 int newCapacity; 3523 kmp_info_t **newThreads; 3524 kmp_root_t **newRoot; 3525 3526 // All calls to __kmp_expand_threads should be under __kmp_forkjoin_lock, so 3527 // resizing __kmp_threads does not need additional protection if foreign 3528 // threads are present 3529 3530 #if KMP_OS_WINDOWS && !KMP_DYNAMIC_LIB 3531 /* only for Windows static library */ 3532 /* reclaim array entries for root threads that are already dead */ 3533 added = __kmp_reclaim_dead_roots(); 3534 3535 if (nNeed) { 3536 nNeed -= added; 3537 if (nNeed < 0) 3538 nNeed = 0; 3539 } 3540 #endif 3541 if (nNeed <= 0) 3542 return added; 3543 3544 // Note that __kmp_threads_capacity is not bounded by __kmp_max_nth. If 3545 // __kmp_max_nth is set to some value less than __kmp_sys_max_nth by the 3546 // user via KMP_DEVICE_THREAD_LIMIT, then __kmp_threads_capacity may become 3547 // > __kmp_max_nth in one of two ways: 3548 // 3549 // 1) The initialization thread (gtid = 0) exits. __kmp_threads[0] 3550 // may not be reused by another thread, so we may need to increase 3551 // __kmp_threads_capacity to __kmp_max_nth + 1. 3552 // 3553 // 2) New foreign root(s) are encountered. We always register new foreign 3554 // roots. This may cause a smaller # of threads to be allocated at 3555 // subsequent parallel regions, but the worker threads hang around (and 3556 // eventually go to sleep) and need slots in the __kmp_threads[] array. 3557 // 3558 // Anyway, that is the reason for moving the check to see if 3559 // __kmp_max_nth was exceeded into __kmp_reserve_threads() 3560 // instead of having it performed here. -BB 3561 3562 KMP_DEBUG_ASSERT(__kmp_sys_max_nth >= __kmp_threads_capacity); 3563 3564 /* compute expansion headroom to check if we can expand */ 3565 if (__kmp_sys_max_nth - __kmp_threads_capacity < nNeed) { 3566 /* possible expansion too small -- give up */ 3567 return added; 3568 } 3569 minimumRequiredCapacity = __kmp_threads_capacity + nNeed; 3570 3571 newCapacity = __kmp_threads_capacity; 3572 do { 3573 newCapacity = newCapacity <= (__kmp_sys_max_nth >> 1) ? 
(newCapacity << 1)
3574 : __kmp_sys_max_nth;
3575 } while (newCapacity < minimumRequiredCapacity);
3576 newThreads = (kmp_info_t **)__kmp_allocate(
3577 (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * newCapacity + CACHE_LINE);
3578 newRoot =
3579 (kmp_root_t **)((char *)newThreads + sizeof(kmp_info_t *) * newCapacity);
3580 KMP_MEMCPY(newThreads, __kmp_threads,
3581 __kmp_threads_capacity * sizeof(kmp_info_t *));
3582 KMP_MEMCPY(newRoot, __kmp_root,
3583 __kmp_threads_capacity * sizeof(kmp_root_t *));
3584
3585 kmp_info_t **temp_threads = __kmp_threads;
3586 *(kmp_info_t * *volatile *)&__kmp_threads = newThreads;
3587 *(kmp_root_t * *volatile *)&__kmp_root = newRoot;
3588 __kmp_free(temp_threads);
3589 added += newCapacity - __kmp_threads_capacity;
3590 *(volatile int *)&__kmp_threads_capacity = newCapacity;
3591
3592 if (newCapacity > __kmp_tp_capacity) {
3593 __kmp_acquire_bootstrap_lock(&__kmp_tp_cached_lock);
3594 if (__kmp_tp_cached && newCapacity > __kmp_tp_capacity) {
3595 __kmp_threadprivate_resize_cache(newCapacity);
3596 } else { // increase __kmp_tp_capacity to correspond with kmp_threads size
3597 *(volatile int *)&__kmp_tp_capacity = newCapacity;
3598 }
3599 __kmp_release_bootstrap_lock(&__kmp_tp_cached_lock);
3600 }
3601
3602 return added;
3603 }
3604
3605 /* Register the current thread as a root thread and obtain our gtid. We must
3606 have the __kmp_initz_lock held at this point. Argument TRUE only if we are
3607 the thread that calls from __kmp_do_serial_initialize() */
3608 int __kmp_register_root(int initial_thread) {
3609 kmp_info_t *root_thread;
3610 kmp_root_t *root;
3611 int gtid;
3612 int capacity;
3613 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
3614 KA_TRACE(20, ("__kmp_register_root: entered\n"));
3615 KMP_MB();
3616
3617 /* 2007-03-02:
3618 If the initial thread has not invoked the OpenMP RTL yet, and this thread
3619 is not an initial one, the "__kmp_all_nth >= __kmp_threads_capacity" condition
3620 does not work as expected -- it may return false (meaning there is at least
3621 one empty slot in the __kmp_threads array), but it is possible that the only
3622 free slot is #0, which is reserved for the initial thread and so cannot be
3623 used for this one. The following code works around this bug.
3624
3625 However, the right solution seems to be to not reserve slot #0 for the
3626 initial thread, because:
3627 (1) there is no magic in slot #0,
3628 (2) we cannot detect the initial thread reliably (the first thread that does
3629 serial initialization may not be a real initial thread).
3630 */
3631 capacity = __kmp_threads_capacity;
3632 if (!initial_thread && TCR_PTR(__kmp_threads[0]) == NULL) {
3633 --capacity;
3634 }
3635
3636 // If it is not for initializing the hidden helper team, we need to take
3637 // __kmp_hidden_helper_threads_num out of the capacity because it is included
3638 // in __kmp_threads_capacity.
3639 if (__kmp_enable_hidden_helper && !TCR_4(__kmp_init_hidden_helper_threads)) { 3640 capacity -= __kmp_hidden_helper_threads_num; 3641 } 3642 3643 /* see if there are too many threads */ 3644 if (__kmp_all_nth >= capacity && !__kmp_expand_threads(1)) { 3645 if (__kmp_tp_cached) { 3646 __kmp_fatal(KMP_MSG(CantRegisterNewThread), 3647 KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity), 3648 KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null); 3649 } else { 3650 __kmp_fatal(KMP_MSG(CantRegisterNewThread), KMP_HNT(SystemLimitOnThreads), 3651 __kmp_msg_null); 3652 } 3653 } 3654 3655 // When hidden helper task is enabled, __kmp_threads is organized as follows: 3656 // 0: initial thread, also a regular OpenMP thread. 3657 // [1, __kmp_hidden_helper_threads_num]: slots for hidden helper threads. 3658 // [__kmp_hidden_helper_threads_num + 1, __kmp_threads_capacity): slots for 3659 // regular OpenMP threads. 3660 if (TCR_4(__kmp_init_hidden_helper_threads)) { 3661 // Find an available thread slot for hidden helper thread. Slots for hidden 3662 // helper threads start from 1 to __kmp_hidden_helper_threads_num. 3663 for (gtid = 1; TCR_PTR(__kmp_threads[gtid]) != NULL && 3664 gtid <= __kmp_hidden_helper_threads_num; 3665 gtid++) 3666 ; 3667 KMP_ASSERT(gtid <= __kmp_hidden_helper_threads_num); 3668 KA_TRACE(1, ("__kmp_register_root: found slot in threads array for " 3669 "hidden helper thread: T#%d\n", 3670 gtid)); 3671 } else { 3672 /* find an available thread slot */ 3673 // Don't reassign the zero slot since we need that to only be used by 3674 // initial thread. Slots for hidden helper threads should also be skipped. 3675 if (initial_thread && TCR_PTR(__kmp_threads[0]) == NULL) { 3676 gtid = 0; 3677 } else { 3678 for (gtid = __kmp_hidden_helper_threads_num + 1; 3679 TCR_PTR(__kmp_threads[gtid]) != NULL; gtid++) 3680 ; 3681 } 3682 KA_TRACE( 3683 1, ("__kmp_register_root: found slot in threads array: T#%d\n", gtid)); 3684 KMP_ASSERT(gtid < __kmp_threads_capacity); 3685 } 3686 3687 /* update global accounting */ 3688 __kmp_all_nth++; 3689 TCW_4(__kmp_nth, __kmp_nth + 1); 3690 3691 // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search) for low 3692 // numbers of procs, and method #2 (keyed API call) for higher numbers. 3693 if (__kmp_adjust_gtid_mode) { 3694 if (__kmp_all_nth >= __kmp_tls_gtid_min) { 3695 if (TCR_4(__kmp_gtid_mode) != 2) { 3696 TCW_4(__kmp_gtid_mode, 2); 3697 } 3698 } else { 3699 if (TCR_4(__kmp_gtid_mode) != 1) { 3700 TCW_4(__kmp_gtid_mode, 1); 3701 } 3702 } 3703 } 3704 3705 #ifdef KMP_ADJUST_BLOCKTIME 3706 /* Adjust blocktime to zero if necessary */ 3707 /* Middle initialization might not have occurred yet */ 3708 if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) { 3709 if (__kmp_nth > __kmp_avail_proc) { 3710 __kmp_zero_bt = TRUE; 3711 } 3712 } 3713 #endif /* KMP_ADJUST_BLOCKTIME */ 3714 3715 /* setup this new hierarchy */ 3716 if (!(root = __kmp_root[gtid])) { 3717 root = __kmp_root[gtid] = (kmp_root_t *)__kmp_allocate(sizeof(kmp_root_t)); 3718 KMP_DEBUG_ASSERT(!root->r.r_root_team); 3719 } 3720 3721 #if KMP_STATS_ENABLED 3722 // Initialize stats as soon as possible (right after gtid assignment). 
3723 __kmp_stats_thread_ptr = __kmp_stats_list->push_back(gtid); 3724 __kmp_stats_thread_ptr->startLife(); 3725 KMP_SET_THREAD_STATE(SERIAL_REGION); 3726 KMP_INIT_PARTITIONED_TIMERS(OMP_serial); 3727 #endif 3728 __kmp_initialize_root(root); 3729 3730 /* setup new root thread structure */ 3731 if (root->r.r_uber_thread) { 3732 root_thread = root->r.r_uber_thread; 3733 } else { 3734 root_thread = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t)); 3735 if (__kmp_storage_map) { 3736 __kmp_print_thread_storage_map(root_thread, gtid); 3737 } 3738 root_thread->th.th_info.ds.ds_gtid = gtid; 3739 #if OMPT_SUPPORT 3740 root_thread->th.ompt_thread_info.thread_data = ompt_data_none; 3741 #endif 3742 root_thread->th.th_root = root; 3743 if (__kmp_env_consistency_check) { 3744 root_thread->th.th_cons = __kmp_allocate_cons_stack(gtid); 3745 } 3746 #if USE_FAST_MEMORY 3747 __kmp_initialize_fast_memory(root_thread); 3748 #endif /* USE_FAST_MEMORY */ 3749 3750 #if KMP_USE_BGET 3751 KMP_DEBUG_ASSERT(root_thread->th.th_local.bget_data == NULL); 3752 __kmp_initialize_bget(root_thread); 3753 #endif 3754 __kmp_init_random(root_thread); // Initialize random number generator 3755 } 3756 3757 /* setup the serial team held in reserve by the root thread */ 3758 if (!root_thread->th.th_serial_team) { 3759 kmp_internal_control_t r_icvs = __kmp_get_global_icvs(); 3760 KF_TRACE(10, ("__kmp_register_root: before serial_team\n")); 3761 root_thread->th.th_serial_team = __kmp_allocate_team( 3762 root, 1, 1, 3763 #if OMPT_SUPPORT 3764 ompt_data_none, // root parallel id 3765 #endif 3766 proc_bind_default, &r_icvs, 0 USE_NESTED_HOT_ARG(NULL)); 3767 } 3768 KMP_ASSERT(root_thread->th.th_serial_team); 3769 KF_TRACE(10, ("__kmp_register_root: after serial_team = %p\n", 3770 root_thread->th.th_serial_team)); 3771 3772 /* drop root_thread into place */ 3773 TCW_SYNC_PTR(__kmp_threads[gtid], root_thread); 3774 3775 root->r.r_root_team->t.t_threads[0] = root_thread; 3776 root->r.r_hot_team->t.t_threads[0] = root_thread; 3777 root_thread->th.th_serial_team->t.t_threads[0] = root_thread; 3778 // AC: the team created in reserve, not for execution (it is unused for now). 3779 root_thread->th.th_serial_team->t.t_serialized = 0; 3780 root->r.r_uber_thread = root_thread; 3781 3782 /* initialize the thread, get it ready to go */ 3783 __kmp_initialize_info(root_thread, root->r.r_root_team, 0, gtid); 3784 TCW_4(__kmp_init_gtid, TRUE); 3785 3786 /* prepare the primary thread for get_gtid() */ 3787 __kmp_gtid_set_specific(gtid); 3788 3789 #if USE_ITT_BUILD 3790 __kmp_itt_thread_name(gtid); 3791 #endif /* USE_ITT_BUILD */ 3792 3793 #ifdef KMP_TDATA_GTID 3794 __kmp_gtid = gtid; 3795 #endif 3796 __kmp_create_worker(gtid, root_thread, __kmp_stksize); 3797 KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() == gtid); 3798 3799 KA_TRACE(20, ("__kmp_register_root: T#%d init T#%d(%d:%d) arrived: join=%u, " 3800 "plain=%u\n", 3801 gtid, __kmp_gtid_from_tid(0, root->r.r_hot_team), 3802 root->r.r_hot_team->t.t_id, 0, KMP_INIT_BARRIER_STATE, 3803 KMP_INIT_BARRIER_STATE)); 3804 { // Initialize barrier data. 
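// b runs over the bs_* barrier types (bs_last_barrier is their count); the
// root thread's arrived counter for each type starts at the initial state.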
3805 int b;
3806 for (b = 0; b < bs_last_barrier; ++b) {
3807 root_thread->th.th_bar[b].bb.b_arrived = KMP_INIT_BARRIER_STATE;
3808 #if USE_DEBUGGER
3809 root_thread->th.th_bar[b].bb.b_worker_arrived = 0;
3810 #endif
3811 }
3812 }
3813 KMP_DEBUG_ASSERT(root->r.r_hot_team->t.t_bar[bs_forkjoin_barrier].b_arrived ==
3814 KMP_INIT_BARRIER_STATE);
3815
3816 #if KMP_AFFINITY_SUPPORTED
3817 root_thread->th.th_current_place = KMP_PLACE_UNDEFINED;
3818 root_thread->th.th_new_place = KMP_PLACE_UNDEFINED;
3819 root_thread->th.th_first_place = KMP_PLACE_UNDEFINED;
3820 root_thread->th.th_last_place = KMP_PLACE_UNDEFINED;
3821 #endif /* KMP_AFFINITY_SUPPORTED */
3822 root_thread->th.th_def_allocator = __kmp_def_allocator;
3823 root_thread->th.th_prev_level = 0;
3824 root_thread->th.th_prev_num_threads = 1;
3825
3826 kmp_cg_root_t *tmp = (kmp_cg_root_t *)__kmp_allocate(sizeof(kmp_cg_root_t));
3827 tmp->cg_root = root_thread;
3828 tmp->cg_thread_limit = __kmp_cg_max_nth;
3829 tmp->cg_nthreads = 1;
3830 KA_TRACE(100, ("__kmp_register_root: Thread %p created node %p with"
3831 " cg_nthreads init to 1\n",
3832 root_thread, tmp));
3833 tmp->up = NULL;
3834 root_thread->th.th_cg_roots = tmp;
3835
3836 __kmp_root_counter++;
3837
3838 #if OMPT_SUPPORT
3839 if (!initial_thread && ompt_enabled.enabled) {
3840
3841 kmp_info_t *root_thread = ompt_get_thread();
3842
3843 ompt_set_thread_state(root_thread, ompt_state_overhead);
3844
3845 if (ompt_enabled.ompt_callback_thread_begin) {
3846 ompt_callbacks.ompt_callback(ompt_callback_thread_begin)(
3847 ompt_thread_initial, __ompt_get_thread_data_internal());
3848 }
3849 ompt_data_t *task_data;
3850 ompt_data_t *parallel_data;
3851 __ompt_get_task_info_internal(0, NULL, &task_data, NULL, &parallel_data,
3852 NULL);
3853 if (ompt_enabled.ompt_callback_implicit_task) {
3854 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
3855 ompt_scope_begin, parallel_data, task_data, 1, 1, ompt_task_initial);
3856 }
3857
3858 ompt_set_thread_state(root_thread, ompt_state_work_serial);
3859 }
3860 #endif
3861 #if OMPD_SUPPORT
3862 if (ompd_state & OMPD_ENABLE_BP)
3863 ompd_bp_thread_begin();
3864 #endif
3865
3866 KMP_MB();
3867 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
3868
3869 return gtid;
3870 }
3871
3872 #if KMP_NESTED_HOT_TEAMS
3873 static int __kmp_free_hot_teams(kmp_root_t *root, kmp_info_t *thr, int level,
3874 const int max_level) {
3875 int i, n, nth;
3876 kmp_hot_team_ptr_t *hot_teams = thr->th.th_hot_teams;
3877 if (!hot_teams || !hot_teams[level].hot_team) {
3878 return 0;
3879 }
3880 KMP_DEBUG_ASSERT(level < max_level);
3881 kmp_team_t *team = hot_teams[level].hot_team;
3882 nth = hot_teams[level].hot_team_nth;
3883 n = nth - 1; // primary thread is not freed
3884 if (level < max_level - 1) {
3885 for (i = 0; i < nth; ++i) {
3886 kmp_info_t *th = team->t.t_threads[i];
3887 n += __kmp_free_hot_teams(root, th, level + 1, max_level);
3888 if (i > 0 && th->th.th_hot_teams) {
3889 __kmp_free(th->th.th_hot_teams);
3890 th->th.th_hot_teams = NULL;
3891 }
3892 }
3893 }
3894 __kmp_free_team(root, team, NULL);
3895 return n;
3896 }
3897 #endif
3898
3899 // Resets a root thread and clears its root and hot teams.
3900 // Returns the number of __kmp_threads entries directly and indirectly freed.
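// It releases the root team and the hot team (including nested hot teams when
// KMP_NESTED_HOT_TEAMS is enabled), waits for task teams to be unreferenced if
// tasking is active, and finally reaps the uber thread. Callers hold
// __kmp_forkjoin_lock (see __kmp_unregister_root_current_thread).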
3901 static int __kmp_reset_root(int gtid, kmp_root_t *root) {
3902 kmp_team_t *root_team = root->r.r_root_team;
3903 kmp_team_t *hot_team = root->r.r_hot_team;
3904 int n = hot_team->t.t_nproc;
3905 int i;
3906
3907 KMP_DEBUG_ASSERT(!root->r.r_active);
3908
3909 root->r.r_root_team = NULL;
3910 root->r.r_hot_team = NULL;
3911 // __kmp_free_team() does not free hot teams, so we have to clear r_hot_team
3912 // before the call to __kmp_free_team().
3913 __kmp_free_team(root, root_team USE_NESTED_HOT_ARG(NULL));
3914 #if KMP_NESTED_HOT_TEAMS
3915 if (__kmp_hot_teams_max_level >
3916 0) { // need to free nested hot teams and their threads if any
3917 for (i = 0; i < hot_team->t.t_nproc; ++i) {
3918 kmp_info_t *th = hot_team->t.t_threads[i];
3919 if (__kmp_hot_teams_max_level > 1) {
3920 n += __kmp_free_hot_teams(root, th, 1, __kmp_hot_teams_max_level);
3921 }
3922 if (th->th.th_hot_teams) {
3923 __kmp_free(th->th.th_hot_teams);
3924 th->th.th_hot_teams = NULL;
3925 }
3926 }
3927 }
3928 #endif
3929 __kmp_free_team(root, hot_team USE_NESTED_HOT_ARG(NULL));
3930
3931 // Before we can reap the thread, we need to make certain that all other
3932 // threads in the teams that had this root as ancestor have stopped trying to
3933 // steal tasks.
3934 if (__kmp_tasking_mode != tskm_immediate_exec) {
3935 __kmp_wait_to_unref_task_teams();
3936 }
3937
3938 #if KMP_OS_WINDOWS
3939 /* Close Handle of root duplicated in __kmp_create_worker (tr #62919) */
3940 KA_TRACE(
3941 10, ("__kmp_reset_root: free handle, th = %p, handle = %" KMP_UINTPTR_SPEC
3942 "\n",
3943 (LPVOID) & (root->r.r_uber_thread->th),
3944 root->r.r_uber_thread->th.th_info.ds.ds_thread));
3945 __kmp_free_handle(root->r.r_uber_thread->th.th_info.ds.ds_thread);
3946 #endif /* KMP_OS_WINDOWS */
3947
3948 #if OMPD_SUPPORT
3949 if (ompd_state & OMPD_ENABLE_BP)
3950 ompd_bp_thread_end();
3951 #endif
3952
3953 #if OMPT_SUPPORT
3954 ompt_data_t *task_data;
3955 ompt_data_t *parallel_data;
3956 __ompt_get_task_info_internal(0, NULL, &task_data, NULL, &parallel_data,
3957 NULL);
3958 if (ompt_enabled.ompt_callback_implicit_task) {
3959 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
3960 ompt_scope_end, parallel_data, task_data, 0, 1, ompt_task_initial);
3961 }
3962 if (ompt_enabled.ompt_callback_thread_end) {
3963 ompt_callbacks.ompt_callback(ompt_callback_thread_end)(
3964 &(root->r.r_uber_thread->th.ompt_thread_info.thread_data));
3965 }
3966 #endif
3967
3968 TCW_4(__kmp_nth,
3969 __kmp_nth - 1); // __kmp_reap_thread will decrement __kmp_all_nth.
3970 i = root->r.r_uber_thread->th.th_cg_roots->cg_nthreads--;
3971 KA_TRACE(100, ("__kmp_reset_root: Thread %p decrement cg_nthreads on node %p"
3972 " to %d\n",
3973 root->r.r_uber_thread, root->r.r_uber_thread->th.th_cg_roots,
3974 root->r.r_uber_thread->th.th_cg_roots->cg_nthreads));
3975 if (i == 1) {
3976 // need to free contention group structure
3977 KMP_DEBUG_ASSERT(root->r.r_uber_thread ==
3978 root->r.r_uber_thread->th.th_cg_roots->cg_root);
3979 KMP_DEBUG_ASSERT(root->r.r_uber_thread->th.th_cg_roots->up == NULL);
3980 __kmp_free(root->r.r_uber_thread->th.th_cg_roots);
3981 root->r.r_uber_thread->th.th_cg_roots = NULL;
3982 }
3983 __kmp_reap_thread(root->r.r_uber_thread, 1);
3984
3985 // We cannot put the root thread into __kmp_thread_pool, so we have to reap
3986 // it instead of freeing.
3987 root->r.r_uber_thread = NULL; 3988 /* mark root as no longer in use */ 3989 root->r.r_begin = FALSE; 3990 3991 return n; 3992 } 3993 3994 void __kmp_unregister_root_current_thread(int gtid) { 3995 KA_TRACE(1, ("__kmp_unregister_root_current_thread: enter T#%d\n", gtid)); 3996 /* this lock should be ok, since unregister_root_current_thread is never 3997 called during an abort, only during a normal close. furthermore, if you 3998 have the forkjoin lock, you should never try to get the initz lock */ 3999 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock); 4000 if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) { 4001 KC_TRACE(10, ("__kmp_unregister_root_current_thread: already finished, " 4002 "exiting T#%d\n", 4003 gtid)); 4004 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); 4005 return; 4006 } 4007 kmp_root_t *root = __kmp_root[gtid]; 4008 4009 KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]); 4010 KMP_ASSERT(KMP_UBER_GTID(gtid)); 4011 KMP_ASSERT(root == __kmp_threads[gtid]->th.th_root); 4012 KMP_ASSERT(root->r.r_active == FALSE); 4013 4014 KMP_MB(); 4015 4016 kmp_info_t *thread = __kmp_threads[gtid]; 4017 kmp_team_t *team = thread->th.th_team; 4018 kmp_task_team_t *task_team = thread->th.th_task_team; 4019 4020 // we need to wait for the proxy tasks before finishing the thread 4021 if (task_team != NULL && task_team->tt.tt_found_proxy_tasks) { 4022 #if OMPT_SUPPORT 4023 // the runtime is shutting down so we won't report any events 4024 thread->th.ompt_thread_info.state = ompt_state_undefined; 4025 #endif 4026 __kmp_task_team_wait(thread, team USE_ITT_BUILD_ARG(NULL)); 4027 } 4028 4029 __kmp_reset_root(gtid, root); 4030 4031 KMP_MB(); 4032 KC_TRACE(10, 4033 ("__kmp_unregister_root_current_thread: T#%d unregistered\n", gtid)); 4034 4035 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); 4036 } 4037 4038 #if KMP_OS_WINDOWS 4039 /* __kmp_forkjoin_lock must be already held 4040 Unregisters a root thread that is not the current thread. Returns the number 4041 of __kmp_threads entries freed as a result. 
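   Used by __kmp_reclaim_dead_roots() to reclaim entries of roots that have
   already died in the Windows* static library build.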
*/ 4042 static int __kmp_unregister_root_other_thread(int gtid) { 4043 kmp_root_t *root = __kmp_root[gtid]; 4044 int r; 4045 4046 KA_TRACE(1, ("__kmp_unregister_root_other_thread: enter T#%d\n", gtid)); 4047 KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]); 4048 KMP_ASSERT(KMP_UBER_GTID(gtid)); 4049 KMP_ASSERT(root == __kmp_threads[gtid]->th.th_root); 4050 KMP_ASSERT(root->r.r_active == FALSE); 4051 4052 r = __kmp_reset_root(gtid, root); 4053 KC_TRACE(10, 4054 ("__kmp_unregister_root_other_thread: T#%d unregistered\n", gtid)); 4055 return r; 4056 } 4057 #endif 4058 4059 #if KMP_DEBUG 4060 void __kmp_task_info() { 4061 4062 kmp_int32 gtid = __kmp_entry_gtid(); 4063 kmp_int32 tid = __kmp_tid_from_gtid(gtid); 4064 kmp_info_t *this_thr = __kmp_threads[gtid]; 4065 kmp_team_t *steam = this_thr->th.th_serial_team; 4066 kmp_team_t *team = this_thr->th.th_team; 4067 4068 __kmp_printf( 4069 "__kmp_task_info: gtid=%d tid=%d t_thread=%p team=%p steam=%p curtask=%p " 4070 "ptask=%p\n", 4071 gtid, tid, this_thr, team, steam, this_thr->th.th_current_task, 4072 team->t.t_implicit_task_taskdata[tid].td_parent); 4073 } 4074 #endif // KMP_DEBUG 4075 4076 /* TODO optimize with one big memclr, take out what isn't needed, split 4077 responsibility to workers as much as possible, and delay initialization of 4078 features as much as possible */ 4079 static void __kmp_initialize_info(kmp_info_t *this_thr, kmp_team_t *team, 4080 int tid, int gtid) { 4081 /* this_thr->th.th_info.ds.ds_gtid is setup in 4082 kmp_allocate_thread/create_worker. 4083 this_thr->th.th_serial_team is setup in __kmp_allocate_thread */ 4084 KMP_DEBUG_ASSERT(this_thr != NULL); 4085 KMP_DEBUG_ASSERT(this_thr->th.th_serial_team); 4086 KMP_DEBUG_ASSERT(team); 4087 KMP_DEBUG_ASSERT(team->t.t_threads); 4088 KMP_DEBUG_ASSERT(team->t.t_dispatch); 4089 kmp_info_t *master = team->t.t_threads[0]; 4090 KMP_DEBUG_ASSERT(master); 4091 KMP_DEBUG_ASSERT(master->th.th_root); 4092 4093 KMP_MB(); 4094 4095 TCW_SYNC_PTR(this_thr->th.th_team, team); 4096 4097 this_thr->th.th_info.ds.ds_tid = tid; 4098 this_thr->th.th_set_nproc = 0; 4099 if (__kmp_tasking_mode != tskm_immediate_exec) 4100 // When tasking is possible, threads are not safe to reap until they are 4101 // done tasking; this will be set when tasking code is exited in wait 4102 this_thr->th.th_reap_state = KMP_NOT_SAFE_TO_REAP; 4103 else // no tasking --> always safe to reap 4104 this_thr->th.th_reap_state = KMP_SAFE_TO_REAP; 4105 this_thr->th.th_set_proc_bind = proc_bind_default; 4106 #if KMP_AFFINITY_SUPPORTED 4107 this_thr->th.th_new_place = this_thr->th.th_current_place; 4108 #endif 4109 this_thr->th.th_root = master->th.th_root; 4110 4111 /* setup the thread's cache of the team structure */ 4112 this_thr->th.th_team_nproc = team->t.t_nproc; 4113 this_thr->th.th_team_master = master; 4114 this_thr->th.th_team_serialized = team->t.t_serialized; 4115 TCW_PTR(this_thr->th.th_sleep_loc, NULL); 4116 4117 KMP_DEBUG_ASSERT(team->t.t_implicit_task_taskdata); 4118 4119 KF_TRACE(10, ("__kmp_initialize_info1: T#%d:%d this_thread=%p curtask=%p\n", 4120 tid, gtid, this_thr, this_thr->th.th_current_task)); 4121 4122 __kmp_init_implicit_task(this_thr->th.th_team_master->th.th_ident, this_thr, 4123 team, tid, TRUE); 4124 4125 KF_TRACE(10, ("__kmp_initialize_info2: T#%d:%d this_thread=%p curtask=%p\n", 4126 tid, gtid, this_thr, this_thr->th.th_current_task)); 4127 // TODO: Initialize ICVs from parent; GEH - isn't that already done in 4128 // __kmp_initialize_team()? 
4129 4130 /* TODO no worksharing in speculative threads */ 4131 this_thr->th.th_dispatch = &team->t.t_dispatch[tid]; 4132 4133 this_thr->th.th_local.this_construct = 0; 4134 4135 if (!this_thr->th.th_pri_common) { 4136 this_thr->th.th_pri_common = 4137 (struct common_table *)__kmp_allocate(sizeof(struct common_table)); 4138 if (__kmp_storage_map) { 4139 __kmp_print_storage_map_gtid( 4140 gtid, this_thr->th.th_pri_common, this_thr->th.th_pri_common + 1, 4141 sizeof(struct common_table), "th_%d.th_pri_common\n", gtid); 4142 } 4143 this_thr->th.th_pri_head = NULL; 4144 } 4145 4146 if (this_thr != master && // Primary thread's CG root is initialized elsewhere 4147 this_thr->th.th_cg_roots != master->th.th_cg_roots) { // CG root not set 4148 // Make new thread's CG root same as primary thread's 4149 KMP_DEBUG_ASSERT(master->th.th_cg_roots); 4150 kmp_cg_root_t *tmp = this_thr->th.th_cg_roots; 4151 if (tmp) { 4152 // worker changes CG, need to check if old CG should be freed 4153 int i = tmp->cg_nthreads--; 4154 KA_TRACE(100, ("__kmp_initialize_info: Thread %p decrement cg_nthreads" 4155 " on node %p of thread %p to %d\n", 4156 this_thr, tmp, tmp->cg_root, tmp->cg_nthreads)); 4157 if (i == 1) { 4158 __kmp_free(tmp); // last thread left CG --> free it 4159 } 4160 } 4161 this_thr->th.th_cg_roots = master->th.th_cg_roots; 4162 // Increment new thread's CG root's counter to add the new thread 4163 this_thr->th.th_cg_roots->cg_nthreads++; 4164 KA_TRACE(100, ("__kmp_initialize_info: Thread %p increment cg_nthreads on" 4165 " node %p of thread %p to %d\n", 4166 this_thr, this_thr->th.th_cg_roots, 4167 this_thr->th.th_cg_roots->cg_root, 4168 this_thr->th.th_cg_roots->cg_nthreads)); 4169 this_thr->th.th_current_task->td_icvs.thread_limit = 4170 this_thr->th.th_cg_roots->cg_thread_limit; 4171 } 4172 4173 /* Initialize dynamic dispatch */ 4174 { 4175 volatile kmp_disp_t *dispatch = this_thr->th.th_dispatch; 4176 // Use team max_nproc since this will never change for the team. 4177 size_t disp_size = 4178 sizeof(dispatch_private_info_t) * 4179 (team->t.t_max_nproc == 1 ? 1 : __kmp_dispatch_num_buffers); 4180 KD_TRACE(10, ("__kmp_initialize_info: T#%d max_nproc: %d\n", gtid, 4181 team->t.t_max_nproc)); 4182 KMP_ASSERT(dispatch); 4183 KMP_DEBUG_ASSERT(team->t.t_dispatch); 4184 KMP_DEBUG_ASSERT(dispatch == &team->t.t_dispatch[tid]); 4185 4186 dispatch->th_disp_index = 0; 4187 dispatch->th_doacross_buf_idx = 0; 4188 if (!dispatch->th_disp_buffer) { 4189 dispatch->th_disp_buffer = 4190 (dispatch_private_info_t *)__kmp_allocate(disp_size); 4191 4192 if (__kmp_storage_map) { 4193 __kmp_print_storage_map_gtid( 4194 gtid, &dispatch->th_disp_buffer[0], 4195 &dispatch->th_disp_buffer[team->t.t_max_nproc == 1 4196 ? 
1 4197 : __kmp_dispatch_num_buffers], 4198 disp_size, 4199 "th_%d.th_dispatch.th_disp_buffer " 4200 "(team_%d.t_dispatch[%d].th_disp_buffer)", 4201 gtid, team->t.t_id, gtid); 4202 } 4203 } else { 4204 memset(&dispatch->th_disp_buffer[0], '\0', disp_size); 4205 } 4206 4207 dispatch->th_dispatch_pr_current = 0; 4208 dispatch->th_dispatch_sh_current = 0; 4209 4210 dispatch->th_deo_fcn = 0; /* ORDERED */ 4211 dispatch->th_dxo_fcn = 0; /* END ORDERED */ 4212 } 4213 4214 this_thr->th.th_next_pool = NULL; 4215 4216 if (!this_thr->th.th_task_state_memo_stack) { 4217 size_t i; 4218 this_thr->th.th_task_state_memo_stack = 4219 (kmp_uint8 *)__kmp_allocate(4 * sizeof(kmp_uint8)); 4220 this_thr->th.th_task_state_top = 0; 4221 this_thr->th.th_task_state_stack_sz = 4; 4222 for (i = 0; i < this_thr->th.th_task_state_stack_sz; 4223 ++i) // zero init the stack 4224 this_thr->th.th_task_state_memo_stack[i] = 0; 4225 } 4226 4227 KMP_DEBUG_ASSERT(!this_thr->th.th_spin_here); 4228 KMP_DEBUG_ASSERT(this_thr->th.th_next_waiting == 0); 4229 4230 KMP_MB(); 4231 } 4232 4233 /* allocate a new thread for the requesting team. this is only called from 4234 within a forkjoin critical section. we will first try to get an available 4235 thread from the thread pool. if none is available, we will fork a new one 4236 assuming we are able to create a new one. this should be assured, as the 4237 caller should check on this first. */ 4238 kmp_info_t *__kmp_allocate_thread(kmp_root_t *root, kmp_team_t *team, 4239 int new_tid) { 4240 kmp_team_t *serial_team; 4241 kmp_info_t *new_thr; 4242 int new_gtid; 4243 4244 KA_TRACE(20, ("__kmp_allocate_thread: T#%d\n", __kmp_get_gtid())); 4245 KMP_DEBUG_ASSERT(root && team); 4246 #if !KMP_NESTED_HOT_TEAMS 4247 KMP_DEBUG_ASSERT(KMP_MASTER_GTID(__kmp_get_gtid())); 4248 #endif 4249 KMP_MB(); 4250 4251 /* first, try to get one from the thread pool */ 4252 if (__kmp_thread_pool) { 4253 new_thr = CCAST(kmp_info_t *, __kmp_thread_pool); 4254 __kmp_thread_pool = (volatile kmp_info_t *)new_thr->th.th_next_pool; 4255 if (new_thr == __kmp_thread_pool_insert_pt) { 4256 __kmp_thread_pool_insert_pt = NULL; 4257 } 4258 TCW_4(new_thr->th.th_in_pool, FALSE); 4259 __kmp_suspend_initialize_thread(new_thr); 4260 __kmp_lock_suspend_mx(new_thr); 4261 if (new_thr->th.th_active_in_pool == TRUE) { 4262 KMP_DEBUG_ASSERT(new_thr->th.th_active == TRUE); 4263 KMP_ATOMIC_DEC(&__kmp_thread_pool_active_nth); 4264 new_thr->th.th_active_in_pool = FALSE; 4265 } 4266 __kmp_unlock_suspend_mx(new_thr); 4267 4268 KA_TRACE(20, ("__kmp_allocate_thread: T#%d using thread T#%d\n", 4269 __kmp_get_gtid(), new_thr->th.th_info.ds.ds_gtid)); 4270 KMP_ASSERT(!new_thr->th.th_team); 4271 KMP_DEBUG_ASSERT(__kmp_nth < __kmp_threads_capacity); 4272 4273 /* setup the thread structure */ 4274 __kmp_initialize_info(new_thr, team, new_tid, 4275 new_thr->th.th_info.ds.ds_gtid); 4276 KMP_DEBUG_ASSERT(new_thr->th.th_serial_team); 4277 4278 TCW_4(__kmp_nth, __kmp_nth + 1); 4279 4280 new_thr->th.th_task_state = 0; 4281 new_thr->th.th_task_state_top = 0; 4282 new_thr->th.th_task_state_stack_sz = 4; 4283 4284 #ifdef KMP_ADJUST_BLOCKTIME 4285 /* Adjust blocktime back to zero if necessary */ 4286 /* Middle initialization might not have occurred yet */ 4287 if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) { 4288 if (__kmp_nth > __kmp_avail_proc) { 4289 __kmp_zero_bt = TRUE; 4290 } 4291 } 4292 #endif /* KMP_ADJUST_BLOCKTIME */ 4293 4294 #if KMP_DEBUG 4295 // If thread entered pool via __kmp_free_thread, wait_flag should != 4296 // KMP_BARRIER_PARENT_FLAG. 
4297 int b; 4298 kmp_balign_t *balign = new_thr->th.th_bar; 4299 for (b = 0; b < bs_last_barrier; ++b) 4300 KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG); 4301 #endif 4302 4303 KF_TRACE(10, ("__kmp_allocate_thread: T#%d using thread %p T#%d\n", 4304 __kmp_get_gtid(), new_thr, new_thr->th.th_info.ds.ds_gtid)); 4305 4306 KMP_MB(); 4307 return new_thr; 4308 } 4309 4310 /* no, well fork a new one */ 4311 KMP_ASSERT(__kmp_nth == __kmp_all_nth); 4312 KMP_ASSERT(__kmp_all_nth < __kmp_threads_capacity); 4313 4314 #if KMP_USE_MONITOR 4315 // If this is the first worker thread the RTL is creating, then also 4316 // launch the monitor thread. We try to do this as early as possible. 4317 if (!TCR_4(__kmp_init_monitor)) { 4318 __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock); 4319 if (!TCR_4(__kmp_init_monitor)) { 4320 KF_TRACE(10, ("before __kmp_create_monitor\n")); 4321 TCW_4(__kmp_init_monitor, 1); 4322 __kmp_create_monitor(&__kmp_monitor); 4323 KF_TRACE(10, ("after __kmp_create_monitor\n")); 4324 #if KMP_OS_WINDOWS 4325 // AC: wait until monitor has started. This is a fix for CQ232808. 4326 // The reason is that if the library is loaded/unloaded in a loop with 4327 // small (parallel) work in between, then there is high probability that 4328 // monitor thread started after the library shutdown. At shutdown it is 4329 // too late to cope with the problem, because when the primary thread is 4330 // in DllMain (process detach) the monitor has no chances to start (it is 4331 // blocked), and primary thread has no means to inform the monitor that 4332 // the library has gone, because all the memory which the monitor can 4333 // access is going to be released/reset. 4334 while (TCR_4(__kmp_init_monitor) < 2) { 4335 KMP_YIELD(TRUE); 4336 } 4337 KF_TRACE(10, ("after monitor thread has started\n")); 4338 #endif 4339 } 4340 __kmp_release_bootstrap_lock(&__kmp_monitor_lock); 4341 } 4342 #endif 4343 4344 KMP_MB(); 4345 4346 { 4347 int new_start_gtid = TCR_4(__kmp_init_hidden_helper_threads) 4348 ? 1 4349 : __kmp_hidden_helper_threads_num + 1; 4350 4351 for (new_gtid = new_start_gtid; TCR_PTR(__kmp_threads[new_gtid]) != NULL; 4352 ++new_gtid) { 4353 KMP_DEBUG_ASSERT(new_gtid < __kmp_threads_capacity); 4354 } 4355 4356 if (TCR_4(__kmp_init_hidden_helper_threads)) { 4357 KMP_DEBUG_ASSERT(new_gtid <= __kmp_hidden_helper_threads_num); 4358 } 4359 } 4360 4361 /* allocate space for it. 
*/ 4362 new_thr = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t)); 4363 4364 TCW_SYNC_PTR(__kmp_threads[new_gtid], new_thr); 4365 4366 #if USE_ITT_BUILD && USE_ITT_NOTIFY && KMP_DEBUG 4367 // suppress race conditions detection on synchronization flags in debug mode 4368 // this helps to analyze library internals eliminating false positives 4369 __itt_suppress_mark_range( 4370 __itt_suppress_range, __itt_suppress_threading_errors, 4371 &new_thr->th.th_sleep_loc, sizeof(new_thr->th.th_sleep_loc)); 4372 __itt_suppress_mark_range( 4373 __itt_suppress_range, __itt_suppress_threading_errors, 4374 &new_thr->th.th_reap_state, sizeof(new_thr->th.th_reap_state)); 4375 #if KMP_OS_WINDOWS 4376 __itt_suppress_mark_range( 4377 __itt_suppress_range, __itt_suppress_threading_errors, 4378 &new_thr->th.th_suspend_init, sizeof(new_thr->th.th_suspend_init)); 4379 #else 4380 __itt_suppress_mark_range(__itt_suppress_range, 4381 __itt_suppress_threading_errors, 4382 &new_thr->th.th_suspend_init_count, 4383 sizeof(new_thr->th.th_suspend_init_count)); 4384 #endif 4385 // TODO: check if we need to also suppress b_arrived flags 4386 __itt_suppress_mark_range(__itt_suppress_range, 4387 __itt_suppress_threading_errors, 4388 CCAST(kmp_uint64 *, &new_thr->th.th_bar[0].bb.b_go), 4389 sizeof(new_thr->th.th_bar[0].bb.b_go)); 4390 __itt_suppress_mark_range(__itt_suppress_range, 4391 __itt_suppress_threading_errors, 4392 CCAST(kmp_uint64 *, &new_thr->th.th_bar[1].bb.b_go), 4393 sizeof(new_thr->th.th_bar[1].bb.b_go)); 4394 __itt_suppress_mark_range(__itt_suppress_range, 4395 __itt_suppress_threading_errors, 4396 CCAST(kmp_uint64 *, &new_thr->th.th_bar[2].bb.b_go), 4397 sizeof(new_thr->th.th_bar[2].bb.b_go)); 4398 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY && KMP_DEBUG */ 4399 if (__kmp_storage_map) { 4400 __kmp_print_thread_storage_map(new_thr, new_gtid); 4401 } 4402 4403 // add the reserve serialized team, initialized from the team's primary thread 4404 { 4405 kmp_internal_control_t r_icvs = __kmp_get_x_global_icvs(team); 4406 KF_TRACE(10, ("__kmp_allocate_thread: before th_serial/serial_team\n")); 4407 new_thr->th.th_serial_team = serial_team = 4408 (kmp_team_t *)__kmp_allocate_team(root, 1, 1, 4409 #if OMPT_SUPPORT 4410 ompt_data_none, // root parallel id 4411 #endif 4412 proc_bind_default, &r_icvs, 4413 0 USE_NESTED_HOT_ARG(NULL)); 4414 } 4415 KMP_ASSERT(serial_team); 4416 serial_team->t.t_serialized = 0; // AC: the team created in reserve, not for 4417 // execution (it is unused for now). 
4418 serial_team->t.t_threads[0] = new_thr; 4419 KF_TRACE(10, 4420 ("__kmp_allocate_thread: after th_serial/serial_team : new_thr=%p\n", 4421 new_thr)); 4422 4423 /* setup the thread structures */ 4424 __kmp_initialize_info(new_thr, team, new_tid, new_gtid); 4425 4426 #if USE_FAST_MEMORY 4427 __kmp_initialize_fast_memory(new_thr); 4428 #endif /* USE_FAST_MEMORY */ 4429 4430 #if KMP_USE_BGET 4431 KMP_DEBUG_ASSERT(new_thr->th.th_local.bget_data == NULL); 4432 __kmp_initialize_bget(new_thr); 4433 #endif 4434 4435 __kmp_init_random(new_thr); // Initialize random number generator 4436 4437 /* Initialize these only once when thread is grabbed for a team allocation */ 4438 KA_TRACE(20, 4439 ("__kmp_allocate_thread: T#%d init go fork=%u, plain=%u\n", 4440 __kmp_get_gtid(), KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE)); 4441 4442 int b; 4443 kmp_balign_t *balign = new_thr->th.th_bar; 4444 for (b = 0; b < bs_last_barrier; ++b) { 4445 balign[b].bb.b_go = KMP_INIT_BARRIER_STATE; 4446 balign[b].bb.team = NULL; 4447 balign[b].bb.wait_flag = KMP_BARRIER_NOT_WAITING; 4448 balign[b].bb.use_oncore_barrier = 0; 4449 } 4450 4451 new_thr->th.th_spin_here = FALSE; 4452 new_thr->th.th_next_waiting = 0; 4453 #if KMP_OS_UNIX 4454 new_thr->th.th_blocking = false; 4455 #endif 4456 4457 #if KMP_AFFINITY_SUPPORTED 4458 new_thr->th.th_current_place = KMP_PLACE_UNDEFINED; 4459 new_thr->th.th_new_place = KMP_PLACE_UNDEFINED; 4460 new_thr->th.th_first_place = KMP_PLACE_UNDEFINED; 4461 new_thr->th.th_last_place = KMP_PLACE_UNDEFINED; 4462 #endif 4463 new_thr->th.th_def_allocator = __kmp_def_allocator; 4464 new_thr->th.th_prev_level = 0; 4465 new_thr->th.th_prev_num_threads = 1; 4466 4467 TCW_4(new_thr->th.th_in_pool, FALSE); 4468 new_thr->th.th_active_in_pool = FALSE; 4469 TCW_4(new_thr->th.th_active, TRUE); 4470 4471 /* adjust the global counters */ 4472 __kmp_all_nth++; 4473 __kmp_nth++; 4474 4475 // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search) for low 4476 // numbers of procs, and method #2 (keyed API call) for higher numbers. 4477 if (__kmp_adjust_gtid_mode) { 4478 if (__kmp_all_nth >= __kmp_tls_gtid_min) { 4479 if (TCR_4(__kmp_gtid_mode) != 2) { 4480 TCW_4(__kmp_gtid_mode, 2); 4481 } 4482 } else { 4483 if (TCR_4(__kmp_gtid_mode) != 1) { 4484 TCW_4(__kmp_gtid_mode, 1); 4485 } 4486 } 4487 } 4488 4489 #ifdef KMP_ADJUST_BLOCKTIME 4490 /* Adjust blocktime back to zero if necessary */ 4491 /* Middle initialization might not have occurred yet */ 4492 if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) { 4493 if (__kmp_nth > __kmp_avail_proc) { 4494 __kmp_zero_bt = TRUE; 4495 } 4496 } 4497 #endif /* KMP_ADJUST_BLOCKTIME */ 4498 4499 /* actually fork it and create the new worker thread */ 4500 KF_TRACE( 4501 10, ("__kmp_allocate_thread: before __kmp_create_worker: %p\n", new_thr)); 4502 __kmp_create_worker(new_gtid, new_thr, __kmp_stksize); 4503 KF_TRACE(10, 4504 ("__kmp_allocate_thread: after __kmp_create_worker: %p\n", new_thr)); 4505 4506 KA_TRACE(20, ("__kmp_allocate_thread: T#%d forked T#%d\n", __kmp_get_gtid(), 4507 new_gtid)); 4508 KMP_MB(); 4509 return new_thr; 4510 } 4511 4512 /* Reinitialize team for reuse. 4513 The hot team code calls this case at every fork barrier, so EPCC barrier 4514 test are extremely sensitive to changes in it, esp. writes to the team 4515 struct, which cause a cache invalidation in all threads. 4516 IF YOU TOUCH THIS ROUTINE, RUN EPCC C SYNCBENCH ON A BIG-IRON MACHINE!!! 
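   This is also why the code below goes through KMP_CHECK_UPDATE, which is
   intended to perform the store only when the new value actually differs from
   what is already in the team structure.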
*/ 4517 static void __kmp_reinitialize_team(kmp_team_t *team, 4518 kmp_internal_control_t *new_icvs, 4519 ident_t *loc) { 4520 KF_TRACE(10, ("__kmp_reinitialize_team: enter this_thread=%p team=%p\n", 4521 team->t.t_threads[0], team)); 4522 KMP_DEBUG_ASSERT(team && new_icvs); 4523 KMP_DEBUG_ASSERT((!TCR_4(__kmp_init_parallel)) || new_icvs->nproc); 4524 KMP_CHECK_UPDATE(team->t.t_ident, loc); 4525 4526 KMP_CHECK_UPDATE(team->t.t_id, KMP_GEN_TEAM_ID()); 4527 // Copy ICVs to the primary thread's implicit taskdata 4528 __kmp_init_implicit_task(loc, team->t.t_threads[0], team, 0, FALSE); 4529 copy_icvs(&team->t.t_implicit_task_taskdata[0].td_icvs, new_icvs); 4530 4531 KF_TRACE(10, ("__kmp_reinitialize_team: exit this_thread=%p team=%p\n", 4532 team->t.t_threads[0], team)); 4533 } 4534 4535 /* Initialize the team data structure. 4536 This assumes the t_threads and t_max_nproc are already set. 4537 Also, we don't touch the arguments */ 4538 static void __kmp_initialize_team(kmp_team_t *team, int new_nproc, 4539 kmp_internal_control_t *new_icvs, 4540 ident_t *loc) { 4541 KF_TRACE(10, ("__kmp_initialize_team: enter: team=%p\n", team)); 4542 4543 /* verify */ 4544 KMP_DEBUG_ASSERT(team); 4545 KMP_DEBUG_ASSERT(new_nproc <= team->t.t_max_nproc); 4546 KMP_DEBUG_ASSERT(team->t.t_threads); 4547 KMP_MB(); 4548 4549 team->t.t_master_tid = 0; /* not needed */ 4550 /* team->t.t_master_bar; not needed */ 4551 team->t.t_serialized = new_nproc > 1 ? 0 : 1; 4552 team->t.t_nproc = new_nproc; 4553 4554 /* team->t.t_parent = NULL; TODO not needed & would mess up hot team */ 4555 team->t.t_next_pool = NULL; 4556 /* memset( team->t.t_threads, 0, sizeof(kmp_info_t*)*new_nproc ); would mess 4557 * up hot team */ 4558 4559 TCW_SYNC_PTR(team->t.t_pkfn, NULL); /* not needed */ 4560 team->t.t_invoke = NULL; /* not needed */ 4561 4562 // TODO???: team->t.t_max_active_levels = new_max_active_levels; 4563 team->t.t_sched.sched = new_icvs->sched.sched; 4564 4565 #if KMP_ARCH_X86 || KMP_ARCH_X86_64 4566 team->t.t_fp_control_saved = FALSE; /* not needed */ 4567 team->t.t_x87_fpu_control_word = 0; /* not needed */ 4568 team->t.t_mxcsr = 0; /* not needed */ 4569 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */ 4570 4571 team->t.t_construct = 0; 4572 4573 team->t.t_ordered.dt.t_value = 0; 4574 team->t.t_master_active = FALSE; 4575 4576 #ifdef KMP_DEBUG 4577 team->t.t_copypriv_data = NULL; /* not necessary, but nice for debugging */ 4578 #endif 4579 #if KMP_OS_WINDOWS 4580 team->t.t_copyin_counter = 0; /* for barrier-free copyin implementation */ 4581 #endif 4582 4583 team->t.t_control_stack_top = NULL; 4584 4585 __kmp_reinitialize_team(team, new_icvs, loc); 4586 4587 KMP_MB(); 4588 KF_TRACE(10, ("__kmp_initialize_team: exit: team=%p\n", team)); 4589 } 4590 4591 #if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED 4592 /* Sets full mask for thread and returns old mask, no changes to structures. */ 4593 static void 4594 __kmp_set_thread_affinity_mask_full_tmp(kmp_affin_mask_t *old_mask) { 4595 if (KMP_AFFINITY_CAPABLE()) { 4596 int status; 4597 if (old_mask != NULL) { 4598 status = __kmp_get_system_affinity(old_mask, TRUE); 4599 int error = errno; 4600 if (status != 0) { 4601 __kmp_fatal(KMP_MSG(ChangeThreadAffMaskError), KMP_ERR(error), 4602 __kmp_msg_null); 4603 } 4604 } 4605 __kmp_set_system_affinity(__kmp_affin_fullMask, TRUE); 4606 } 4607 } 4608 #endif 4609 4610 #if KMP_AFFINITY_SUPPORTED 4611 4612 // __kmp_partition_places() is the heart of the OpenMP 4.0 affinity mechanism. 
4613 // It calculates the worker + primary thread's partition based upon the parent 4614 // thread's partition, and binds each worker to a thread in their partition. 4615 // The primary thread's partition should already include its current binding. 4616 static void __kmp_partition_places(kmp_team_t *team, int update_master_only) { 4617 // Do not partition places for the hidden helper team 4618 if (KMP_HIDDEN_HELPER_TEAM(team)) 4619 return; 4620 // Copy the primary thread's place partition to the team struct 4621 kmp_info_t *master_th = team->t.t_threads[0]; 4622 KMP_DEBUG_ASSERT(master_th != NULL); 4623 kmp_proc_bind_t proc_bind = team->t.t_proc_bind; 4624 int first_place = master_th->th.th_first_place; 4625 int last_place = master_th->th.th_last_place; 4626 int masters_place = master_th->th.th_current_place; 4627 team->t.t_first_place = first_place; 4628 team->t.t_last_place = last_place; 4629 4630 KA_TRACE(20, ("__kmp_partition_places: enter: proc_bind = %d T#%d(%d:0) " 4631 "bound to place %d partition = [%d,%d]\n", 4632 proc_bind, __kmp_gtid_from_thread(team->t.t_threads[0]), 4633 team->t.t_id, masters_place, first_place, last_place)); 4634 4635 switch (proc_bind) { 4636 4637 case proc_bind_default: 4638 // Serial teams might have the proc_bind policy set to proc_bind_default. 4639 // Not an issue -- we don't rebind primary thread for any proc_bind policy. 4640 KMP_DEBUG_ASSERT(team->t.t_nproc == 1); 4641 break; 4642 4643 case proc_bind_primary: { 4644 int f; 4645 int n_th = team->t.t_nproc; 4646 for (f = 1; f < n_th; f++) { 4647 kmp_info_t *th = team->t.t_threads[f]; 4648 KMP_DEBUG_ASSERT(th != NULL); 4649 th->th.th_first_place = first_place; 4650 th->th.th_last_place = last_place; 4651 th->th.th_new_place = masters_place; 4652 if (__kmp_display_affinity && masters_place != th->th.th_current_place && 4653 team->t.t_display_affinity != 1) { 4654 team->t.t_display_affinity = 1; 4655 } 4656 4657 KA_TRACE(100, ("__kmp_partition_places: primary: T#%d(%d:%d) place %d " 4658 "partition = [%d,%d]\n", 4659 __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id, 4660 f, masters_place, first_place, last_place)); 4661 } 4662 } break; 4663 4664 case proc_bind_close: { 4665 int f; 4666 int n_th = team->t.t_nproc; 4667 int n_places; 4668 if (first_place <= last_place) { 4669 n_places = last_place - first_place + 1; 4670 } else { 4671 n_places = __kmp_affinity_num_masks - first_place + last_place + 1; 4672 } 4673 if (n_th <= n_places) { 4674 int place = masters_place; 4675 for (f = 1; f < n_th; f++) { 4676 kmp_info_t *th = team->t.t_threads[f]; 4677 KMP_DEBUG_ASSERT(th != NULL); 4678 4679 if (place == last_place) { 4680 place = first_place; 4681 } else if (place == (int)(__kmp_affinity_num_masks - 1)) { 4682 place = 0; 4683 } else { 4684 place++; 4685 } 4686 th->th.th_first_place = first_place; 4687 th->th.th_last_place = last_place; 4688 th->th.th_new_place = place; 4689 if (__kmp_display_affinity && place != th->th.th_current_place && 4690 team->t.t_display_affinity != 1) { 4691 team->t.t_display_affinity = 1; 4692 } 4693 4694 KA_TRACE(100, ("__kmp_partition_places: close: T#%d(%d:%d) place %d " 4695 "partition = [%d,%d]\n", 4696 __kmp_gtid_from_thread(team->t.t_threads[f]), 4697 team->t.t_id, f, place, first_place, last_place)); 4698 } 4699 } else { 4700 int S, rem, gap, s_count; 4701 S = n_th / n_places; 4702 s_count = 0; 4703 rem = n_th - (S * n_places); 4704 gap = rem > 0 ? 
n_places / rem : n_places; 4705 int place = masters_place; 4706 int gap_ct = gap; 4707 for (f = 0; f < n_th; f++) { 4708 kmp_info_t *th = team->t.t_threads[f]; 4709 KMP_DEBUG_ASSERT(th != NULL); 4710 4711 th->th.th_first_place = first_place; 4712 th->th.th_last_place = last_place; 4713 th->th.th_new_place = place; 4714 if (__kmp_display_affinity && place != th->th.th_current_place && 4715 team->t.t_display_affinity != 1) { 4716 team->t.t_display_affinity = 1; 4717 } 4718 s_count++; 4719 4720 if ((s_count == S) && rem && (gap_ct == gap)) { 4721 // do nothing, add an extra thread to place on next iteration 4722 } else if ((s_count == S + 1) && rem && (gap_ct == gap)) { 4723 // we added an extra thread to this place; move to next place 4724 if (place == last_place) { 4725 place = first_place; 4726 } else if (place == (int)(__kmp_affinity_num_masks - 1)) { 4727 place = 0; 4728 } else { 4729 place++; 4730 } 4731 s_count = 0; 4732 gap_ct = 1; 4733 rem--; 4734 } else if (s_count == S) { // place full; don't add extra 4735 if (place == last_place) { 4736 place = first_place; 4737 } else if (place == (int)(__kmp_affinity_num_masks - 1)) { 4738 place = 0; 4739 } else { 4740 place++; 4741 } 4742 gap_ct++; 4743 s_count = 0; 4744 } 4745 4746 KA_TRACE(100, 4747 ("__kmp_partition_places: close: T#%d(%d:%d) place %d " 4748 "partition = [%d,%d]\n", 4749 __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id, f, 4750 th->th.th_new_place, first_place, last_place)); 4751 } 4752 KMP_DEBUG_ASSERT(place == masters_place); 4753 } 4754 } break; 4755 4756 case proc_bind_spread: { 4757 int f; 4758 int n_th = team->t.t_nproc; 4759 int n_places; 4760 int thidx; 4761 if (first_place <= last_place) { 4762 n_places = last_place - first_place + 1; 4763 } else { 4764 n_places = __kmp_affinity_num_masks - first_place + last_place + 1; 4765 } 4766 if (n_th <= n_places) { 4767 int place = -1; 4768 4769 if (n_places != static_cast<int>(__kmp_affinity_num_masks)) { 4770 int S = n_places / n_th; 4771 int s_count, rem, gap, gap_ct; 4772 4773 place = masters_place; 4774 rem = n_places - n_th * S; 4775 gap = rem ? 
n_th / rem : 1; 4776 gap_ct = gap; 4777 thidx = n_th; 4778 if (update_master_only == 1) 4779 thidx = 1; 4780 for (f = 0; f < thidx; f++) { 4781 kmp_info_t *th = team->t.t_threads[f]; 4782 KMP_DEBUG_ASSERT(th != NULL); 4783 4784 th->th.th_first_place = place; 4785 th->th.th_new_place = place; 4786 if (__kmp_display_affinity && place != th->th.th_current_place && 4787 team->t.t_display_affinity != 1) { 4788 team->t.t_display_affinity = 1; 4789 } 4790 s_count = 1; 4791 while (s_count < S) { 4792 if (place == last_place) { 4793 place = first_place; 4794 } else if (place == (int)(__kmp_affinity_num_masks - 1)) { 4795 place = 0; 4796 } else { 4797 place++; 4798 } 4799 s_count++; 4800 } 4801 if (rem && (gap_ct == gap)) { 4802 if (place == last_place) { 4803 place = first_place; 4804 } else if (place == (int)(__kmp_affinity_num_masks - 1)) { 4805 place = 0; 4806 } else { 4807 place++; 4808 } 4809 rem--; 4810 gap_ct = 0; 4811 } 4812 th->th.th_last_place = place; 4813 gap_ct++; 4814 4815 if (place == last_place) { 4816 place = first_place; 4817 } else if (place == (int)(__kmp_affinity_num_masks - 1)) { 4818 place = 0; 4819 } else { 4820 place++; 4821 } 4822 4823 KA_TRACE(100, 4824 ("__kmp_partition_places: spread: T#%d(%d:%d) place %d " 4825 "partition = [%d,%d], __kmp_affinity_num_masks: %u\n", 4826 __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id, 4827 f, th->th.th_new_place, th->th.th_first_place, 4828 th->th.th_last_place, __kmp_affinity_num_masks)); 4829 } 4830 } else { 4831 /* Having uniform space of available computation places I can create 4832 T partitions of round(P/T) size and put threads into the first 4833 place of each partition. */ 4834 double current = static_cast<double>(masters_place); 4835 double spacing = 4836 (static_cast<double>(n_places + 1) / static_cast<double>(n_th)); 4837 int first, last; 4838 kmp_info_t *th; 4839 4840 thidx = n_th + 1; 4841 if (update_master_only == 1) 4842 thidx = 1; 4843 for (f = 0; f < thidx; f++) { 4844 first = static_cast<int>(current); 4845 last = static_cast<int>(current + spacing) - 1; 4846 KMP_DEBUG_ASSERT(last >= first); 4847 if (first >= n_places) { 4848 if (masters_place) { 4849 first -= n_places; 4850 last -= n_places; 4851 if (first == (masters_place + 1)) { 4852 KMP_DEBUG_ASSERT(f == n_th); 4853 first--; 4854 } 4855 if (last == masters_place) { 4856 KMP_DEBUG_ASSERT(f == (n_th - 1)); 4857 last--; 4858 } 4859 } else { 4860 KMP_DEBUG_ASSERT(f == n_th); 4861 first = 0; 4862 last = 0; 4863 } 4864 } 4865 if (last >= n_places) { 4866 last = (n_places - 1); 4867 } 4868 place = first; 4869 current += spacing; 4870 if (f < n_th) { 4871 KMP_DEBUG_ASSERT(0 <= first); 4872 KMP_DEBUG_ASSERT(n_places > first); 4873 KMP_DEBUG_ASSERT(0 <= last); 4874 KMP_DEBUG_ASSERT(n_places > last); 4875 KMP_DEBUG_ASSERT(last_place >= first_place); 4876 th = team->t.t_threads[f]; 4877 KMP_DEBUG_ASSERT(th); 4878 th->th.th_first_place = first; 4879 th->th.th_new_place = place; 4880 th->th.th_last_place = last; 4881 if (__kmp_display_affinity && place != th->th.th_current_place && 4882 team->t.t_display_affinity != 1) { 4883 team->t.t_display_affinity = 1; 4884 } 4885 KA_TRACE(100, 4886 ("__kmp_partition_places: spread: T#%d(%d:%d) place %d " 4887 "partition = [%d,%d], spacing = %.4f\n", 4888 __kmp_gtid_from_thread(team->t.t_threads[f]), 4889 team->t.t_id, f, th->th.th_new_place, 4890 th->th.th_first_place, th->th.th_last_place, spacing)); 4891 } 4892 } 4893 } 4894 KMP_DEBUG_ASSERT(update_master_only || place == masters_place); 4895 } else { 4896 int S, rem, gap, 
s_count; 4897 S = n_th / n_places; 4898 s_count = 0; 4899 rem = n_th - (S * n_places); 4900 gap = rem > 0 ? n_places / rem : n_places; 4901 int place = masters_place; 4902 int gap_ct = gap; 4903 thidx = n_th; 4904 if (update_master_only == 1) 4905 thidx = 1; 4906 for (f = 0; f < thidx; f++) { 4907 kmp_info_t *th = team->t.t_threads[f]; 4908 KMP_DEBUG_ASSERT(th != NULL); 4909 4910 th->th.th_first_place = place; 4911 th->th.th_last_place = place; 4912 th->th.th_new_place = place; 4913 if (__kmp_display_affinity && place != th->th.th_current_place && 4914 team->t.t_display_affinity != 1) { 4915 team->t.t_display_affinity = 1; 4916 } 4917 s_count++; 4918 4919 if ((s_count == S) && rem && (gap_ct == gap)) { 4920 // do nothing, add an extra thread to place on next iteration 4921 } else if ((s_count == S + 1) && rem && (gap_ct == gap)) { 4922 // we added an extra thread to this place; move on to next place 4923 if (place == last_place) { 4924 place = first_place; 4925 } else if (place == (int)(__kmp_affinity_num_masks - 1)) { 4926 place = 0; 4927 } else { 4928 place++; 4929 } 4930 s_count = 0; 4931 gap_ct = 1; 4932 rem--; 4933 } else if (s_count == S) { // place is full; don't add extra thread 4934 if (place == last_place) { 4935 place = first_place; 4936 } else if (place == (int)(__kmp_affinity_num_masks - 1)) { 4937 place = 0; 4938 } else { 4939 place++; 4940 } 4941 gap_ct++; 4942 s_count = 0; 4943 } 4944 4945 KA_TRACE(100, ("__kmp_partition_places: spread: T#%d(%d:%d) place %d " 4946 "partition = [%d,%d]\n", 4947 __kmp_gtid_from_thread(team->t.t_threads[f]), 4948 team->t.t_id, f, th->th.th_new_place, 4949 th->th.th_first_place, th->th.th_last_place)); 4950 } 4951 KMP_DEBUG_ASSERT(update_master_only || place == masters_place); 4952 } 4953 } break; 4954 4955 default: 4956 break; 4957 } 4958 4959 KA_TRACE(20, ("__kmp_partition_places: exit T#%d\n", team->t.t_id)); 4960 } 4961 4962 #endif // KMP_AFFINITY_SUPPORTED 4963 4964 /* allocate a new team data structure to use. take one off of the free pool if 4965 available */ 4966 kmp_team_t * 4967 __kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc, 4968 #if OMPT_SUPPORT 4969 ompt_data_t ompt_parallel_data, 4970 #endif 4971 kmp_proc_bind_t new_proc_bind, 4972 kmp_internal_control_t *new_icvs, 4973 int argc USE_NESTED_HOT_ARG(kmp_info_t *master)) { 4974 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_allocate_team); 4975 int f; 4976 kmp_team_t *team; 4977 int use_hot_team = !root->r.r_active; 4978 int level = 0; 4979 4980 KA_TRACE(20, ("__kmp_allocate_team: called\n")); 4981 KMP_DEBUG_ASSERT(new_nproc >= 1 && argc >= 0); 4982 KMP_DEBUG_ASSERT(max_nproc >= new_nproc); 4983 KMP_MB(); 4984 4985 #if KMP_NESTED_HOT_TEAMS 4986 kmp_hot_team_ptr_t *hot_teams; 4987 if (master) { 4988 team = master->th.th_team; 4989 level = team->t.t_active_level; 4990 if (master->th.th_teams_microtask) { // in teams construct? 
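// A hedged worked example of the 'level' bump decided just below (directive
// names taken from the OpenMP spec, not from this file): with
// "#pragma omp teams num_teams(2)" enclosing a "#pragma omp parallel", the
// fork of that inner parallel, or the inner fork made while running
// __kmp_teams_master, increments 'level' so that a distinct hot_teams[] slot
// is selected; with num_teams(1), or for the outer fork that creates the
// league of teams, 'level' is left unchanged.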
4991 if (master->th.th_teams_size.nteams > 1 && 4992 ( // #teams > 1 4993 team->t.t_pkfn == 4994 (microtask_t)__kmp_teams_master || // inner fork of the teams 4995 master->th.th_teams_level < 4996 team->t.t_level)) { // or nested parallel inside the teams 4997 ++level; // not increment if #teams==1, or for outer fork of the teams; 4998 // increment otherwise 4999 } 5000 } 5001 hot_teams = master->th.th_hot_teams; 5002 if (level < __kmp_hot_teams_max_level && hot_teams && 5003 hot_teams[level].hot_team) { 5004 // hot team has already been allocated for given level 5005 use_hot_team = 1; 5006 } else { 5007 use_hot_team = 0; 5008 } 5009 } else { 5010 // check we won't access uninitialized hot_teams, just in case 5011 KMP_DEBUG_ASSERT(new_nproc == 1); 5012 } 5013 #endif 5014 // Optimization to use a "hot" team 5015 if (use_hot_team && new_nproc > 1) { 5016 KMP_DEBUG_ASSERT(new_nproc <= max_nproc); 5017 #if KMP_NESTED_HOT_TEAMS 5018 team = hot_teams[level].hot_team; 5019 #else 5020 team = root->r.r_hot_team; 5021 #endif 5022 #if KMP_DEBUG 5023 if (__kmp_tasking_mode != tskm_immediate_exec) { 5024 KA_TRACE(20, ("__kmp_allocate_team: hot team task_team[0] = %p " 5025 "task_team[1] = %p before reinit\n", 5026 team->t.t_task_team[0], team->t.t_task_team[1])); 5027 } 5028 #endif 5029 5030 // Has the number of threads changed? 5031 /* Let's assume the most common case is that the number of threads is 5032 unchanged, and put that case first. */ 5033 if (team->t.t_nproc == new_nproc) { // Check changes in number of threads 5034 KA_TRACE(20, ("__kmp_allocate_team: reusing hot team\n")); 5035 // This case can mean that omp_set_num_threads() was called and the hot 5036 // team size was already reduced, so we check the special flag 5037 if (team->t.t_size_changed == -1) { 5038 team->t.t_size_changed = 1; 5039 } else { 5040 KMP_CHECK_UPDATE(team->t.t_size_changed, 0); 5041 } 5042 5043 // TODO???: team->t.t_max_active_levels = new_max_active_levels; 5044 kmp_r_sched_t new_sched = new_icvs->sched; 5045 // set primary thread's schedule as new run-time schedule 5046 KMP_CHECK_UPDATE(team->t.t_sched.sched, new_sched.sched); 5047 5048 __kmp_reinitialize_team(team, new_icvs, 5049 root->r.r_uber_thread->th.th_ident); 5050 5051 KF_TRACE(10, ("__kmp_allocate_team2: T#%d, this_thread=%p team=%p\n", 0, 5052 team->t.t_threads[0], team)); 5053 __kmp_push_current_task_to_thread(team->t.t_threads[0], team, 0); 5054 5055 #if KMP_AFFINITY_SUPPORTED 5056 if ((team->t.t_size_changed == 0) && 5057 (team->t.t_proc_bind == new_proc_bind)) { 5058 if (new_proc_bind == proc_bind_spread) { 5059 __kmp_partition_places( 5060 team, 1); // add flag to update only master for spread 5061 } 5062 KA_TRACE(200, ("__kmp_allocate_team: reusing hot team #%d bindings: " 5063 "proc_bind = %d, partition = [%d,%d]\n", 5064 team->t.t_id, new_proc_bind, team->t.t_first_place, 5065 team->t.t_last_place)); 5066 } else { 5067 KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind); 5068 __kmp_partition_places(team); 5069 } 5070 #else 5071 KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind); 5072 #endif /* KMP_AFFINITY_SUPPORTED */ 5073 } else if (team->t.t_nproc > new_nproc) { 5074 KA_TRACE(20, 5075 ("__kmp_allocate_team: decreasing hot team thread count to %d\n", 5076 new_nproc)); 5077 5078 team->t.t_size_changed = 1; 5079 #if KMP_NESTED_HOT_TEAMS 5080 if (__kmp_hot_teams_mode == 0) { 5081 // AC: saved number of threads should correspond to team's value in this 5082 // mode, can be bigger in mode 1, when hot team has threads in reserve 5083 
KMP_DEBUG_ASSERT(hot_teams[level].hot_team_nth == team->t.t_nproc); 5084 hot_teams[level].hot_team_nth = new_nproc; 5085 #endif // KMP_NESTED_HOT_TEAMS 5086 /* release the extra threads we don't need any more */ 5087 for (f = new_nproc; f < team->t.t_nproc; f++) { 5088 KMP_DEBUG_ASSERT(team->t.t_threads[f]); 5089 if (__kmp_tasking_mode != tskm_immediate_exec) { 5090 // When decreasing team size, threads no longer in the team should 5091 // unref task team. 5092 team->t.t_threads[f]->th.th_task_team = NULL; 5093 } 5094 __kmp_free_thread(team->t.t_threads[f]); 5095 team->t.t_threads[f] = NULL; 5096 } 5097 #if KMP_NESTED_HOT_TEAMS 5098 } // (__kmp_hot_teams_mode == 0) 5099 else { 5100 // When keeping extra threads in team, switch threads to wait on own 5101 // b_go flag 5102 for (f = new_nproc; f < team->t.t_nproc; ++f) { 5103 KMP_DEBUG_ASSERT(team->t.t_threads[f]); 5104 kmp_balign_t *balign = team->t.t_threads[f]->th.th_bar; 5105 for (int b = 0; b < bs_last_barrier; ++b) { 5106 if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG) { 5107 balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG; 5108 } 5109 KMP_CHECK_UPDATE(balign[b].bb.leaf_kids, 0); 5110 } 5111 } 5112 } 5113 #endif // KMP_NESTED_HOT_TEAMS 5114 team->t.t_nproc = new_nproc; 5115 // TODO???: team->t.t_max_active_levels = new_max_active_levels; 5116 KMP_CHECK_UPDATE(team->t.t_sched.sched, new_icvs->sched.sched); 5117 __kmp_reinitialize_team(team, new_icvs, 5118 root->r.r_uber_thread->th.th_ident); 5119 5120 // Update remaining threads 5121 for (f = 0; f < new_nproc; ++f) { 5122 team->t.t_threads[f]->th.th_team_nproc = new_nproc; 5123 } 5124 5125 // restore the current task state of the primary thread: should be the 5126 // implicit task 5127 KF_TRACE(10, ("__kmp_allocate_team: T#%d, this_thread=%p team=%p\n", 0, 5128 team->t.t_threads[0], team)); 5129 5130 __kmp_push_current_task_to_thread(team->t.t_threads[0], team, 0); 5131 5132 #ifdef KMP_DEBUG 5133 for (f = 0; f < team->t.t_nproc; f++) { 5134 KMP_DEBUG_ASSERT(team->t.t_threads[f] && 5135 team->t.t_threads[f]->th.th_team_nproc == 5136 team->t.t_nproc); 5137 } 5138 #endif 5139 5140 KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind); 5141 #if KMP_AFFINITY_SUPPORTED 5142 __kmp_partition_places(team); 5143 #endif 5144 } else { // team->t.t_nproc < new_nproc 5145 #if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED 5146 kmp_affin_mask_t *old_mask; 5147 if (KMP_AFFINITY_CAPABLE()) { 5148 KMP_CPU_ALLOC(old_mask); 5149 } 5150 #endif 5151 5152 KA_TRACE(20, 5153 ("__kmp_allocate_team: increasing hot team thread count to %d\n", 5154 new_nproc)); 5155 5156 team->t.t_size_changed = 1; 5157 5158 #if KMP_NESTED_HOT_TEAMS 5159 int avail_threads = hot_teams[level].hot_team_nth; 5160 if (new_nproc < avail_threads) 5161 avail_threads = new_nproc; 5162 kmp_info_t **other_threads = team->t.t_threads; 5163 for (f = team->t.t_nproc; f < avail_threads; ++f) { 5164 // Adjust barrier data of reserved threads (if any) of the team 5165 // Other data will be set in __kmp_initialize_info() below. 
5166 int b; 5167 kmp_balign_t *balign = other_threads[f]->th.th_bar; 5168 for (b = 0; b < bs_last_barrier; ++b) { 5169 balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived; 5170 KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG); 5171 #if USE_DEBUGGER 5172 balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived; 5173 #endif 5174 } 5175 } 5176 if (hot_teams[level].hot_team_nth >= new_nproc) { 5177 // we have all needed threads in reserve, no need to allocate any 5178 // this only possible in mode 1, cannot have reserved threads in mode 0 5179 KMP_DEBUG_ASSERT(__kmp_hot_teams_mode == 1); 5180 team->t.t_nproc = new_nproc; // just get reserved threads involved 5181 } else { 5182 // we may have some threads in reserve, but not enough 5183 team->t.t_nproc = 5184 hot_teams[level] 5185 .hot_team_nth; // get reserved threads involved if any 5186 hot_teams[level].hot_team_nth = new_nproc; // adjust hot team max size 5187 #endif // KMP_NESTED_HOT_TEAMS 5188 if (team->t.t_max_nproc < new_nproc) { 5189 /* reallocate larger arrays */ 5190 __kmp_reallocate_team_arrays(team, new_nproc); 5191 __kmp_reinitialize_team(team, new_icvs, NULL); 5192 } 5193 5194 #if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED 5195 /* Temporarily set full mask for primary thread before creation of 5196 workers. The reason is that workers inherit the affinity from the 5197 primary thread, so if a lot of workers are created on the single 5198 core quickly, they don't get a chance to set their own affinity for 5199 a long time. */ 5200 __kmp_set_thread_affinity_mask_full_tmp(old_mask); 5201 #endif 5202 5203 /* allocate new threads for the hot team */ 5204 for (f = team->t.t_nproc; f < new_nproc; f++) { 5205 kmp_info_t *new_worker = __kmp_allocate_thread(root, team, f); 5206 KMP_DEBUG_ASSERT(new_worker); 5207 team->t.t_threads[f] = new_worker; 5208 5209 KA_TRACE(20, 5210 ("__kmp_allocate_team: team %d init T#%d arrived: " 5211 "join=%llu, plain=%llu\n", 5212 team->t.t_id, __kmp_gtid_from_tid(f, team), team->t.t_id, f, 5213 team->t.t_bar[bs_forkjoin_barrier].b_arrived, 5214 team->t.t_bar[bs_plain_barrier].b_arrived)); 5215 5216 { // Initialize barrier data for new threads. 5217 int b; 5218 kmp_balign_t *balign = new_worker->th.th_bar; 5219 for (b = 0; b < bs_last_barrier; ++b) { 5220 balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived; 5221 KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != 5222 KMP_BARRIER_PARENT_FLAG); 5223 #if USE_DEBUGGER 5224 balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived; 5225 #endif 5226 } 5227 } 5228 } 5229 5230 #if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED 5231 if (KMP_AFFINITY_CAPABLE()) { 5232 /* Restore initial primary thread's affinity mask */ 5233 __kmp_set_system_affinity(old_mask, TRUE); 5234 KMP_CPU_FREE(old_mask); 5235 } 5236 #endif 5237 #if KMP_NESTED_HOT_TEAMS 5238 } // end of check of t_nproc vs. new_nproc vs. 
hot_team_nth 5239 #endif // KMP_NESTED_HOT_TEAMS 5240 /* make sure everyone is synchronized */ 5241 int old_nproc = team->t.t_nproc; // save old value and use to update only 5242 // new threads below 5243 __kmp_initialize_team(team, new_nproc, new_icvs, 5244 root->r.r_uber_thread->th.th_ident); 5245 5246 /* reinitialize the threads */ 5247 KMP_DEBUG_ASSERT(team->t.t_nproc == new_nproc); 5248 for (f = 0; f < team->t.t_nproc; ++f) 5249 __kmp_initialize_info(team->t.t_threads[f], team, f, 5250 __kmp_gtid_from_tid(f, team)); 5251 5252 if (level) { // set th_task_state for new threads in nested hot team 5253 // __kmp_initialize_info() no longer zeroes th_task_state, so we should 5254 // only need to set the th_task_state for the new threads. th_task_state 5255 // for primary thread will not be accurate until after this in 5256 // __kmp_fork_call(), so we look to the primary thread's memo_stack to 5257 // get the correct value. 5258 for (f = old_nproc; f < team->t.t_nproc; ++f) 5259 team->t.t_threads[f]->th.th_task_state = 5260 team->t.t_threads[0]->th.th_task_state_memo_stack[level]; 5261 } else { // set th_task_state for new threads in non-nested hot team 5262 // copy primary thread's state 5263 kmp_uint8 old_state = team->t.t_threads[0]->th.th_task_state; 5264 for (f = old_nproc; f < team->t.t_nproc; ++f) 5265 team->t.t_threads[f]->th.th_task_state = old_state; 5266 } 5267 5268 #ifdef KMP_DEBUG 5269 for (f = 0; f < team->t.t_nproc; ++f) { 5270 KMP_DEBUG_ASSERT(team->t.t_threads[f] && 5271 team->t.t_threads[f]->th.th_team_nproc == 5272 team->t.t_nproc); 5273 } 5274 #endif 5275 5276 KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind); 5277 #if KMP_AFFINITY_SUPPORTED 5278 __kmp_partition_places(team); 5279 #endif 5280 } // Check changes in number of threads 5281 5282 kmp_info_t *master = team->t.t_threads[0]; 5283 if (master->th.th_teams_microtask) { 5284 for (f = 1; f < new_nproc; ++f) { 5285 // propagate teams construct specific info to workers 5286 kmp_info_t *thr = team->t.t_threads[f]; 5287 thr->th.th_teams_microtask = master->th.th_teams_microtask; 5288 thr->th.th_teams_level = master->th.th_teams_level; 5289 thr->th.th_teams_size = master->th.th_teams_size; 5290 } 5291 } 5292 #if KMP_NESTED_HOT_TEAMS 5293 if (level) { 5294 // Sync barrier state for nested hot teams, not needed for outermost hot 5295 // team. 5296 for (f = 1; f < new_nproc; ++f) { 5297 kmp_info_t *thr = team->t.t_threads[f]; 5298 int b; 5299 kmp_balign_t *balign = thr->th.th_bar; 5300 for (b = 0; b < bs_last_barrier; ++b) { 5301 balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived; 5302 KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG); 5303 #if USE_DEBUGGER 5304 balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived; 5305 #endif 5306 } 5307 } 5308 } 5309 #endif // KMP_NESTED_HOT_TEAMS 5310 5311 /* reallocate space for arguments if necessary */ 5312 __kmp_alloc_argv_entries(argc, team, TRUE); 5313 KMP_CHECK_UPDATE(team->t.t_argc, argc); 5314 // The hot team re-uses the previous task team, 5315 // if untouched during the previous release->gather phase.
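// Summarizing the three branches above (a hedged reading, not an
// authoritative restatement): the hot team is reused along one of three
// paths keyed on the old vs. requested thread count --
//   t_nproc == new_nproc : refresh ICVs/schedule and reuse the team as-is;
//   t_nproc  > new_nproc : release the surplus workers, or, with nested hot
//                          teams in mode 1, park them on their own b_go flag;
//   t_nproc  < new_nproc : grow the team arrays if needed, allocate the
//                          missing workers, then reinitialize every thread.
// Each path then updates t_proc_bind and, when affinity is supported,
// re-partitions places as required.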
5316 5317 KF_TRACE(10, (" hot_team = %p\n", team)); 5318 5319 #if KMP_DEBUG 5320 if (__kmp_tasking_mode != tskm_immediate_exec) { 5321 KA_TRACE(20, ("__kmp_allocate_team: hot team task_team[0] = %p " 5322 "task_team[1] = %p after reinit\n", 5323 team->t.t_task_team[0], team->t.t_task_team[1])); 5324 } 5325 #endif 5326 5327 #if OMPT_SUPPORT 5328 __ompt_team_assign_id(team, ompt_parallel_data); 5329 #endif 5330 5331 KMP_MB(); 5332 5333 return team; 5334 } 5335 5336 /* next, let's try to take one from the team pool */ 5337 KMP_MB(); 5338 for (team = CCAST(kmp_team_t *, __kmp_team_pool); (team);) { 5339 /* TODO: consider resizing undersized teams instead of reaping them, now 5340 that we have a resizing mechanism */ 5341 if (team->t.t_max_nproc >= max_nproc) { 5342 /* take this team from the team pool */ 5343 __kmp_team_pool = team->t.t_next_pool; 5344 5345 /* setup the team for fresh use */ 5346 __kmp_initialize_team(team, new_nproc, new_icvs, NULL); 5347 5348 KA_TRACE(20, ("__kmp_allocate_team: setting task_team[0] %p and " 5349 "task_team[1] %p to NULL\n", 5350 &team->t.t_task_team[0], &team->t.t_task_team[1])); 5351 team->t.t_task_team[0] = NULL; 5352 team->t.t_task_team[1] = NULL; 5353 5354 /* reallocate space for arguments if necessary */ 5355 __kmp_alloc_argv_entries(argc, team, TRUE); 5356 KMP_CHECK_UPDATE(team->t.t_argc, argc); 5357 5358 KA_TRACE( 5359 20, ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n", 5360 team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE)); 5361 { // Initialize barrier data. 5362 int b; 5363 for (b = 0; b < bs_last_barrier; ++b) { 5364 team->t.t_bar[b].b_arrived = KMP_INIT_BARRIER_STATE; 5365 #if USE_DEBUGGER 5366 team->t.t_bar[b].b_master_arrived = 0; 5367 team->t.t_bar[b].b_team_arrived = 0; 5368 #endif 5369 } 5370 } 5371 5372 team->t.t_proc_bind = new_proc_bind; 5373 5374 KA_TRACE(20, ("__kmp_allocate_team: using team from pool %d.\n", 5375 team->t.t_id)); 5376 5377 #if OMPT_SUPPORT 5378 __ompt_team_assign_id(team, ompt_parallel_data); 5379 #endif 5380 5381 KMP_MB(); 5382 5383 return team; 5384 } 5385 5386 /* reap team if it is too small, then loop back and check the next one */ 5387 // not sure if this is wise, but, will be redone during the hot-teams 5388 // rewrite. 5389 /* TODO: Use technique to find the right size hot-team, don't reap them */ 5390 team = __kmp_reap_team(team); 5391 __kmp_team_pool = team; 5392 } 5393 5394 /* nothing available in the pool, no matter, make a new team! 
*/ 5395 KMP_MB(); 5396 team = (kmp_team_t *)__kmp_allocate(sizeof(kmp_team_t)); 5397 5398 /* and set it up */ 5399 team->t.t_max_nproc = max_nproc; 5400 /* NOTE well, for some reason allocating one big buffer and dividing it up 5401 seems to really hurt performance a lot on the P4, so, let's not use this */ 5402 __kmp_allocate_team_arrays(team, max_nproc); 5403 5404 KA_TRACE(20, ("__kmp_allocate_team: making a new team\n")); 5405 __kmp_initialize_team(team, new_nproc, new_icvs, NULL); 5406 5407 KA_TRACE(20, ("__kmp_allocate_team: setting task_team[0] %p and task_team[1] " 5408 "%p to NULL\n", 5409 &team->t.t_task_team[0], &team->t.t_task_team[1])); 5410 team->t.t_task_team[0] = NULL; // to be removed, as __kmp_allocate zeroes 5411 // memory, no need to duplicate 5412 team->t.t_task_team[1] = NULL; // to be removed, as __kmp_allocate zeroes 5413 // memory, no need to duplicate 5414 5415 if (__kmp_storage_map) { 5416 __kmp_print_team_storage_map("team", team, team->t.t_id, new_nproc); 5417 } 5418 5419 /* allocate space for arguments */ 5420 __kmp_alloc_argv_entries(argc, team, FALSE); 5421 team->t.t_argc = argc; 5422 5423 KA_TRACE(20, 5424 ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n", 5425 team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE)); 5426 { // Initialize barrier data. 5427 int b; 5428 for (b = 0; b < bs_last_barrier; ++b) { 5429 team->t.t_bar[b].b_arrived = KMP_INIT_BARRIER_STATE; 5430 #if USE_DEBUGGER 5431 team->t.t_bar[b].b_master_arrived = 0; 5432 team->t.t_bar[b].b_team_arrived = 0; 5433 #endif 5434 } 5435 } 5436 5437 team->t.t_proc_bind = new_proc_bind; 5438 5439 #if OMPT_SUPPORT 5440 __ompt_team_assign_id(team, ompt_parallel_data); 5441 team->t.ompt_serialized_team_info = NULL; 5442 #endif 5443 5444 KMP_MB(); 5445 5446 KA_TRACE(20, ("__kmp_allocate_team: done creating a new team %d.\n", 5447 team->t.t_id)); 5448 5449 return team; 5450 } 5451 5452 /* TODO implement hot-teams at all levels */ 5453 /* TODO implement lazy thread release on demand (disband request) */ 5454 5455 /* free the team. return it to the team pool. release all the threads 5456 * associated with it */ 5457 void __kmp_free_team(kmp_root_t *root, 5458 kmp_team_t *team USE_NESTED_HOT_ARG(kmp_info_t *master)) { 5459 int f; 5460 KA_TRACE(20, ("__kmp_free_team: T#%d freeing team %d\n", __kmp_get_gtid(), 5461 team->t.t_id)); 5462 5463 /* verify state */ 5464 KMP_DEBUG_ASSERT(root); 5465 KMP_DEBUG_ASSERT(team); 5466 KMP_DEBUG_ASSERT(team->t.t_nproc <= team->t.t_max_nproc); 5467 KMP_DEBUG_ASSERT(team->t.t_threads); 5468 5469 int use_hot_team = team == root->r.r_hot_team; 5470 #if KMP_NESTED_HOT_TEAMS 5471 int level; 5472 kmp_hot_team_ptr_t *hot_teams; 5473 if (master) { 5474 level = team->t.t_active_level - 1; 5475 if (master->th.th_teams_microtask) { // in teams construct? 
5476 if (master->th.th_teams_size.nteams > 1) { 5477 ++level; // level was not increased in teams construct for 5478 // team_of_masters 5479 } 5480 if (team->t.t_pkfn != (microtask_t)__kmp_teams_master && 5481 master->th.th_teams_level == team->t.t_level) { 5482 ++level; // level was not increased in teams construct for 5483 // team_of_workers before the parallel 5484 } // team->t.t_level will be increased inside parallel 5485 } 5486 hot_teams = master->th.th_hot_teams; 5487 if (level < __kmp_hot_teams_max_level) { 5488 KMP_DEBUG_ASSERT(team == hot_teams[level].hot_team); 5489 use_hot_team = 1; 5490 } 5491 } 5492 #endif // KMP_NESTED_HOT_TEAMS 5493 5494 /* team is done working */ 5495 TCW_SYNC_PTR(team->t.t_pkfn, 5496 NULL); // Important for Debugging Support Library. 5497 #if KMP_OS_WINDOWS 5498 team->t.t_copyin_counter = 0; // init counter for possible reuse 5499 #endif 5500 // Do not reset pointer to parent team to NULL for hot teams. 5501 5502 /* if we are non-hot team, release our threads */ 5503 if (!use_hot_team) { 5504 if (__kmp_tasking_mode != tskm_immediate_exec) { 5505 // Wait for threads to reach reapable state 5506 for (f = 1; f < team->t.t_nproc; ++f) { 5507 KMP_DEBUG_ASSERT(team->t.t_threads[f]); 5508 kmp_info_t *th = team->t.t_threads[f]; 5509 volatile kmp_uint32 *state = &th->th.th_reap_state; 5510 while (*state != KMP_SAFE_TO_REAP) { 5511 #if KMP_OS_WINDOWS 5512 // On Windows a thread can be killed at any time, check this 5513 DWORD ecode; 5514 if (!__kmp_is_thread_alive(th, &ecode)) { 5515 *state = KMP_SAFE_TO_REAP; // reset the flag for dead thread 5516 break; 5517 } 5518 #endif 5519 // first check if thread is sleeping 5520 kmp_flag_64<> fl(&th->th.th_bar[bs_forkjoin_barrier].bb.b_go, th); 5521 if (fl.is_sleeping()) 5522 fl.resume(__kmp_gtid_from_thread(th)); 5523 KMP_CPU_PAUSE(); 5524 } 5525 } 5526 5527 // Delete task teams 5528 int tt_idx; 5529 for (tt_idx = 0; tt_idx < 2; ++tt_idx) { 5530 kmp_task_team_t *task_team = team->t.t_task_team[tt_idx]; 5531 if (task_team != NULL) { 5532 for (f = 0; f < team->t.t_nproc; ++f) { // threads unref task teams 5533 KMP_DEBUG_ASSERT(team->t.t_threads[f]); 5534 team->t.t_threads[f]->th.th_task_team = NULL; 5535 } 5536 KA_TRACE( 5537 20, 5538 ("__kmp_free_team: T#%d deactivating task_team %p on team %d\n", 5539 __kmp_get_gtid(), task_team, team->t.t_id)); 5540 #if KMP_NESTED_HOT_TEAMS 5541 __kmp_free_task_team(master, task_team); 5542 #endif 5543 team->t.t_task_team[tt_idx] = NULL; 5544 } 5545 } 5546 } 5547 5548 // Reset pointer to parent team only for non-hot teams. 
5549 team->t.t_parent = NULL; 5550 team->t.t_level = 0; 5551 team->t.t_active_level = 0; 5552 5553 /* free the worker threads */ 5554 for (f = 1; f < team->t.t_nproc; ++f) { 5555 KMP_DEBUG_ASSERT(team->t.t_threads[f]); 5556 __kmp_free_thread(team->t.t_threads[f]); 5557 team->t.t_threads[f] = NULL; 5558 } 5559 5560 /* put the team back in the team pool */ 5561 /* TODO limit size of team pool, call reap_team if pool too large */ 5562 team->t.t_next_pool = CCAST(kmp_team_t *, __kmp_team_pool); 5563 __kmp_team_pool = (volatile kmp_team_t *)team; 5564 } else { // Check if team was created for primary threads in teams construct 5565 // See if first worker is a CG root 5566 KMP_DEBUG_ASSERT(team->t.t_threads[1] && 5567 team->t.t_threads[1]->th.th_cg_roots); 5568 if (team->t.t_threads[1]->th.th_cg_roots->cg_root == team->t.t_threads[1]) { 5569 // Clean up the CG root nodes on workers so that this team can be re-used 5570 for (f = 1; f < team->t.t_nproc; ++f) { 5571 kmp_info_t *thr = team->t.t_threads[f]; 5572 KMP_DEBUG_ASSERT(thr && thr->th.th_cg_roots && 5573 thr->th.th_cg_roots->cg_root == thr); 5574 // Pop current CG root off list 5575 kmp_cg_root_t *tmp = thr->th.th_cg_roots; 5576 thr->th.th_cg_roots = tmp->up; 5577 KA_TRACE(100, ("__kmp_free_team: Thread %p popping node %p and moving" 5578 " up to node %p. cg_nthreads was %d\n", 5579 thr, tmp, thr->th.th_cg_roots, tmp->cg_nthreads)); 5580 int i = tmp->cg_nthreads--; 5581 if (i == 1) { 5582 __kmp_free(tmp); // free CG if we are the last thread in it 5583 } 5584 // Restore current task's thread_limit from CG root 5585 if (thr->th.th_cg_roots) 5586 thr->th.th_current_task->td_icvs.thread_limit = 5587 thr->th.th_cg_roots->cg_thread_limit; 5588 } 5589 } 5590 } 5591 5592 KMP_MB(); 5593 } 5594 5595 /* reap the team. destroy it, reclaim all its resources and free its memory */ 5596 kmp_team_t *__kmp_reap_team(kmp_team_t *team) { 5597 kmp_team_t *next_pool = team->t.t_next_pool; 5598 5599 KMP_DEBUG_ASSERT(team); 5600 KMP_DEBUG_ASSERT(team->t.t_dispatch); 5601 KMP_DEBUG_ASSERT(team->t.t_disp_buffer); 5602 KMP_DEBUG_ASSERT(team->t.t_threads); 5603 KMP_DEBUG_ASSERT(team->t.t_argv); 5604 5605 /* TODO clean the threads that are a part of this? */ 5606 5607 /* free stuff */ 5608 __kmp_free_team_arrays(team); 5609 if (team->t.t_argv != &team->t.t_inline_argv[0]) 5610 __kmp_free((void *)team->t.t_argv); 5611 __kmp_free(team); 5612 5613 KMP_MB(); 5614 return next_pool; 5615 } 5616 5617 // Free the thread. Don't reap it, just place it on the pool of available 5618 // threads. 5619 // 5620 // Changes for Quad issue 527845: We need a predictable OMP tid <-> gtid 5621 // binding for the affinity mechanism to be useful. 5622 // 5623 // Now, we always keep the free list (__kmp_thread_pool) sorted by gtid. 5624 // However, we want to avoid a potential performance problem by always 5625 // scanning through the list to find the correct point at which to insert 5626 // the thread (potential N**2 behavior). To do this we keep track of the 5627 // last place a thread struct was inserted (__kmp_thread_pool_insert_pt). 5628 // With single-level parallelism, threads will always be added to the tail 5629 // of the list, kept track of by __kmp_thread_pool_insert_pt. With nested 5630 // parallelism, all bets are off and we may need to scan through the entire 5631 // free list. 5632 // 5633 // This change also has a potentially large performance benefit, for some 5634 // applications. 
Previously, as threads were freed from the hot team, they 5635 // would be placed back on the free list in inverse order. If the hot team 5636 // grew back to it's original size, then the freed thread would be placed 5637 // back on the hot team in reverse order. This could cause bad cache 5638 // locality problems on programs where the size of the hot team regularly 5639 // grew and shrunk. 5640 // 5641 // Now, for single-level parallelism, the OMP tid is always == gtid. 5642 void __kmp_free_thread(kmp_info_t *this_th) { 5643 int gtid; 5644 kmp_info_t **scan; 5645 5646 KA_TRACE(20, ("__kmp_free_thread: T#%d putting T#%d back on free pool.\n", 5647 __kmp_get_gtid(), this_th->th.th_info.ds.ds_gtid)); 5648 5649 KMP_DEBUG_ASSERT(this_th); 5650 5651 // When moving thread to pool, switch thread to wait on own b_go flag, and 5652 // uninitialized (NULL team). 5653 int b; 5654 kmp_balign_t *balign = this_th->th.th_bar; 5655 for (b = 0; b < bs_last_barrier; ++b) { 5656 if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG) 5657 balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG; 5658 balign[b].bb.team = NULL; 5659 balign[b].bb.leaf_kids = 0; 5660 } 5661 this_th->th.th_task_state = 0; 5662 this_th->th.th_reap_state = KMP_SAFE_TO_REAP; 5663 5664 /* put thread back on the free pool */ 5665 TCW_PTR(this_th->th.th_team, NULL); 5666 TCW_PTR(this_th->th.th_root, NULL); 5667 TCW_PTR(this_th->th.th_dispatch, NULL); /* NOT NEEDED */ 5668 5669 while (this_th->th.th_cg_roots) { 5670 this_th->th.th_cg_roots->cg_nthreads--; 5671 KA_TRACE(100, ("__kmp_free_thread: Thread %p decrement cg_nthreads on node" 5672 " %p of thread %p to %d\n", 5673 this_th, this_th->th.th_cg_roots, 5674 this_th->th.th_cg_roots->cg_root, 5675 this_th->th.th_cg_roots->cg_nthreads)); 5676 kmp_cg_root_t *tmp = this_th->th.th_cg_roots; 5677 if (tmp->cg_root == this_th) { // Thread is a cg_root 5678 KMP_DEBUG_ASSERT(tmp->cg_nthreads == 0); 5679 KA_TRACE( 5680 5, ("__kmp_free_thread: Thread %p freeing node %p\n", this_th, tmp)); 5681 this_th->th.th_cg_roots = tmp->up; 5682 __kmp_free(tmp); 5683 } else { // Worker thread 5684 if (tmp->cg_nthreads == 0) { // last thread leaves contention group 5685 __kmp_free(tmp); 5686 } 5687 this_th->th.th_cg_roots = NULL; 5688 break; 5689 } 5690 } 5691 5692 /* If the implicit task assigned to this thread can be used by other threads 5693 * -> multiple threads can share the data and try to free the task at 5694 * __kmp_reap_thread at exit. This duplicate use of the task data can happen 5695 * with higher probability when hot team is disabled but can occurs even when 5696 * the hot team is enabled */ 5697 __kmp_free_implicit_task(this_th); 5698 this_th->th.th_current_task = NULL; 5699 5700 // If the __kmp_thread_pool_insert_pt is already past the new insert 5701 // point, then we need to re-scan the entire list. 5702 gtid = this_th->th.th_info.ds.ds_gtid; 5703 if (__kmp_thread_pool_insert_pt != NULL) { 5704 KMP_DEBUG_ASSERT(__kmp_thread_pool != NULL); 5705 if (__kmp_thread_pool_insert_pt->th.th_info.ds.ds_gtid > gtid) { 5706 __kmp_thread_pool_insert_pt = NULL; 5707 } 5708 } 5709 5710 // Scan down the list to find the place to insert the thread. 5711 // scan is the address of a link in the list, possibly the address of 5712 // __kmp_thread_pool itself. 5713 // 5714 // In the absence of nested parallelism, the for loop will have 0 iterations. 
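// The insertion below is the usual "sorted singly-linked list with a cached
// insertion hint" idiom. A minimal sketch of the same technique on a
// hypothetical node type (illustrative only, not part of the runtime):
//
//   node_t **scan = hint ? &hint->next : &pool_head; // start at cached hint
//   while (*scan && (*scan)->key < new_key)          // find first key >= new_key
//     scan = &(*scan)->next;
//   new_node->next = *scan;                          // splice; list stays sorted
//   *scan = new_node;
//   hint = new_node;                                 // remember for next insert
//
// As the note above says, with single-level parallelism threads land at the
// tail, so the cached hint keeps each insert O(1); with nested parallelism
// the hint may have been dropped above and a full rescan of the list is
// needed.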
5715 if (__kmp_thread_pool_insert_pt != NULL) { 5716 scan = &(__kmp_thread_pool_insert_pt->th.th_next_pool); 5717 } else { 5718 scan = CCAST(kmp_info_t **, &__kmp_thread_pool); 5719 } 5720 for (; (*scan != NULL) && ((*scan)->th.th_info.ds.ds_gtid < gtid); 5721 scan = &((*scan)->th.th_next_pool)) 5722 ; 5723 5724 // Insert the new element on the list, and set __kmp_thread_pool_insert_pt 5725 // to its address. 5726 TCW_PTR(this_th->th.th_next_pool, *scan); 5727 __kmp_thread_pool_insert_pt = *scan = this_th; 5728 KMP_DEBUG_ASSERT((this_th->th.th_next_pool == NULL) || 5729 (this_th->th.th_info.ds.ds_gtid < 5730 this_th->th.th_next_pool->th.th_info.ds.ds_gtid)); 5731 TCW_4(this_th->th.th_in_pool, TRUE); 5732 __kmp_suspend_initialize_thread(this_th); 5733 __kmp_lock_suspend_mx(this_th); 5734 if (this_th->th.th_active == TRUE) { 5735 KMP_ATOMIC_INC(&__kmp_thread_pool_active_nth); 5736 this_th->th.th_active_in_pool = TRUE; 5737 } 5738 #if KMP_DEBUG 5739 else { 5740 KMP_DEBUG_ASSERT(this_th->th.th_active_in_pool == FALSE); 5741 } 5742 #endif 5743 __kmp_unlock_suspend_mx(this_th); 5744 5745 TCW_4(__kmp_nth, __kmp_nth - 1); 5746 5747 #ifdef KMP_ADJUST_BLOCKTIME 5748 /* Adjust blocktime back to user setting or default if necessary */ 5749 /* Middle initialization might never have occurred */ 5750 if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) { 5751 KMP_DEBUG_ASSERT(__kmp_avail_proc > 0); 5752 if (__kmp_nth <= __kmp_avail_proc) { 5753 __kmp_zero_bt = FALSE; 5754 } 5755 } 5756 #endif /* KMP_ADJUST_BLOCKTIME */ 5757 5758 KMP_MB(); 5759 } 5760 5761 /* ------------------------------------------------------------------------ */ 5762 5763 void *__kmp_launch_thread(kmp_info_t *this_thr) { 5764 #if OMP_PROFILING_SUPPORT 5765 ProfileTraceFile = getenv("LIBOMPTARGET_PROFILE"); 5766 // TODO: add a configuration option for time granularity 5767 if (ProfileTraceFile) 5768 llvm::timeTraceProfilerInitialize(500 /* us */, "libomptarget"); 5769 #endif 5770 5771 int gtid = this_thr->th.th_info.ds.ds_gtid; 5772 /* void *stack_data;*/ 5773 kmp_team_t **volatile pteam; 5774 5775 KMP_MB(); 5776 KA_TRACE(10, ("__kmp_launch_thread: T#%d start\n", gtid)); 5777 5778 if (__kmp_env_consistency_check) { 5779 this_thr->th.th_cons = __kmp_allocate_cons_stack(gtid); // ATT: Memory leak? 
5780 } 5781 5782 #if OMPD_SUPPORT 5783 if (ompd_state & OMPD_ENABLE_BP) 5784 ompd_bp_thread_begin(); 5785 #endif 5786 5787 #if OMPT_SUPPORT 5788 ompt_data_t *thread_data = nullptr; 5789 if (ompt_enabled.enabled) { 5790 thread_data = &(this_thr->th.ompt_thread_info.thread_data); 5791 *thread_data = ompt_data_none; 5792 5793 this_thr->th.ompt_thread_info.state = ompt_state_overhead; 5794 this_thr->th.ompt_thread_info.wait_id = 0; 5795 this_thr->th.ompt_thread_info.idle_frame = OMPT_GET_FRAME_ADDRESS(0); 5796 this_thr->th.ompt_thread_info.parallel_flags = 0; 5797 if (ompt_enabled.ompt_callback_thread_begin) { 5798 ompt_callbacks.ompt_callback(ompt_callback_thread_begin)( 5799 ompt_thread_worker, thread_data); 5800 } 5801 this_thr->th.ompt_thread_info.state = ompt_state_idle; 5802 } 5803 #endif 5804 5805 /* This is the place where threads wait for work */ 5806 while (!TCR_4(__kmp_global.g.g_done)) { 5807 KMP_DEBUG_ASSERT(this_thr == __kmp_threads[gtid]); 5808 KMP_MB(); 5809 5810 /* wait for work to do */ 5811 KA_TRACE(20, ("__kmp_launch_thread: T#%d waiting for work\n", gtid)); 5812 5813 /* No tid yet since not part of a team */ 5814 __kmp_fork_barrier(gtid, KMP_GTID_DNE); 5815 5816 #if OMPT_SUPPORT 5817 if (ompt_enabled.enabled) { 5818 this_thr->th.ompt_thread_info.state = ompt_state_overhead; 5819 } 5820 #endif 5821 5822 pteam = &this_thr->th.th_team; 5823 5824 /* have we been allocated? */ 5825 if (TCR_SYNC_PTR(*pteam) && !TCR_4(__kmp_global.g.g_done)) { 5826 /* we were just woken up, so run our new task */ 5827 if (TCR_SYNC_PTR((*pteam)->t.t_pkfn) != NULL) { 5828 int rc; 5829 KA_TRACE(20, 5830 ("__kmp_launch_thread: T#%d(%d:%d) invoke microtask = %p\n", 5831 gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid), 5832 (*pteam)->t.t_pkfn)); 5833 5834 updateHWFPControl(*pteam); 5835 5836 #if OMPT_SUPPORT 5837 if (ompt_enabled.enabled) { 5838 this_thr->th.ompt_thread_info.state = ompt_state_work_parallel; 5839 } 5840 #endif 5841 5842 rc = (*pteam)->t.t_invoke(gtid); 5843 KMP_ASSERT(rc); 5844 5845 KMP_MB(); 5846 KA_TRACE(20, ("__kmp_launch_thread: T#%d(%d:%d) done microtask = %p\n", 5847 gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid), 5848 (*pteam)->t.t_pkfn)); 5849 } 5850 #if OMPT_SUPPORT 5851 if (ompt_enabled.enabled) { 5852 /* no frame set while outside task */ 5853 __ompt_get_task_info_object(0)->frame.exit_frame = ompt_data_none; 5854 5855 this_thr->th.ompt_thread_info.state = ompt_state_overhead; 5856 } 5857 #endif 5858 /* join barrier after parallel region */ 5859 __kmp_join_barrier(gtid); 5860 } 5861 } 5862 TCR_SYNC_PTR((intptr_t)__kmp_global.g.g_done); 5863 5864 #if OMPD_SUPPORT 5865 if (ompd_state & OMPD_ENABLE_BP) 5866 ompd_bp_thread_end(); 5867 #endif 5868 5869 #if OMPT_SUPPORT 5870 if (ompt_enabled.ompt_callback_thread_end) { 5871 ompt_callbacks.ompt_callback(ompt_callback_thread_end)(thread_data); 5872 } 5873 #endif 5874 5875 this_thr->th.th_task_team = NULL; 5876 /* run the destructors for the threadprivate data for this thread */ 5877 __kmp_common_destroy_gtid(gtid); 5878 5879 KA_TRACE(10, ("__kmp_launch_thread: T#%d done\n", gtid)); 5880 KMP_MB(); 5881 5882 #if OMP_PROFILING_SUPPORT 5883 llvm::timeTraceProfilerFinishThread(); 5884 #endif 5885 return this_thr; 5886 } 5887 5888 /* ------------------------------------------------------------------------ */ 5889 5890 void __kmp_internal_end_dest(void *specific_gtid) { 5891 // Make sure no significant bits are lost 5892 int gtid; 5893 __kmp_type_convert((kmp_intptr_t)specific_gtid - 1, >id); 5894 5895 KA_TRACE(30, 
("__kmp_internal_end_dest: T#%d\n", gtid)); 5896 /* NOTE: the gtid is stored as gitd+1 in the thread-local-storage 5897 * this is because 0 is reserved for the nothing-stored case */ 5898 5899 __kmp_internal_end_thread(gtid); 5900 } 5901 5902 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB 5903 5904 __attribute__((destructor)) void __kmp_internal_end_dtor(void) { 5905 __kmp_internal_end_atexit(); 5906 } 5907 5908 #endif 5909 5910 /* [Windows] josh: when the atexit handler is called, there may still be more 5911 than one thread alive */ 5912 void __kmp_internal_end_atexit(void) { 5913 KA_TRACE(30, ("__kmp_internal_end_atexit\n")); 5914 /* [Windows] 5915 josh: ideally, we want to completely shutdown the library in this atexit 5916 handler, but stat code that depends on thread specific data for gtid fails 5917 because that data becomes unavailable at some point during the shutdown, so 5918 we call __kmp_internal_end_thread instead. We should eventually remove the 5919 dependency on __kmp_get_specific_gtid in the stat code and use 5920 __kmp_internal_end_library to cleanly shutdown the library. 5921 5922 // TODO: Can some of this comment about GVS be removed? 5923 I suspect that the offending stat code is executed when the calling thread 5924 tries to clean up a dead root thread's data structures, resulting in GVS 5925 code trying to close the GVS structures for that thread, but since the stat 5926 code uses __kmp_get_specific_gtid to get the gtid with the assumption that 5927 the calling thread is cleaning up itself instead of another thread, it get 5928 confused. This happens because allowing a thread to unregister and cleanup 5929 another thread is a recent modification for addressing an issue. 5930 Based on the current design (20050722), a thread may end up 5931 trying to unregister another thread only if thread death does not trigger 5932 the calling of __kmp_internal_end_thread. For Linux* OS, there is the 5933 thread specific data destructor function to detect thread death. For 5934 Windows dynamic, there is DllMain(THREAD_DETACH). For Windows static, there 5935 is nothing. Thus, the workaround is applicable only for Windows static 5936 stat library. */ 5937 __kmp_internal_end_library(-1); 5938 #if KMP_OS_WINDOWS 5939 __kmp_close_console(); 5940 #endif 5941 } 5942 5943 static void __kmp_reap_thread(kmp_info_t *thread, int is_root) { 5944 // It is assumed __kmp_forkjoin_lock is acquired. 5945 5946 int gtid; 5947 5948 KMP_DEBUG_ASSERT(thread != NULL); 5949 5950 gtid = thread->th.th_info.ds.ds_gtid; 5951 5952 if (!is_root) { 5953 if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) { 5954 /* Assume the threads are at the fork barrier here */ 5955 KA_TRACE( 5956 20, ("__kmp_reap_thread: releasing T#%d from fork barrier for reap\n", 5957 gtid)); 5958 /* Need release fence here to prevent seg faults for tree forkjoin barrier 5959 * (GEH) */ 5960 kmp_flag_64<> flag(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go, 5961 thread); 5962 __kmp_release_64(&flag); 5963 } 5964 5965 // Terminate OS thread. 5966 __kmp_reap_worker(thread); 5967 5968 // The thread was killed asynchronously. If it was actively 5969 // spinning in the thread pool, decrement the global count. 5970 // 5971 // There is a small timing hole here - if the worker thread was just waking 5972 // up after sleeping in the pool, had reset it's th_active_in_pool flag but 5973 // not decremented the global counter __kmp_thread_pool_active_nth yet, then 5974 // the global counter might not get updated. 
5975 // 5976 // Currently, this can only happen as the library is unloaded, 5977 // so there are no harmful side effects. 5978 if (thread->th.th_active_in_pool) { 5979 thread->th.th_active_in_pool = FALSE; 5980 KMP_ATOMIC_DEC(&__kmp_thread_pool_active_nth); 5981 KMP_DEBUG_ASSERT(__kmp_thread_pool_active_nth >= 0); 5982 } 5983 } 5984 5985 __kmp_free_implicit_task(thread); 5986 5987 // Free the fast memory for tasking 5988 #if USE_FAST_MEMORY 5989 __kmp_free_fast_memory(thread); 5990 #endif /* USE_FAST_MEMORY */ 5991 5992 __kmp_suspend_uninitialize_thread(thread); 5993 5994 KMP_DEBUG_ASSERT(__kmp_threads[gtid] == thread); 5995 TCW_SYNC_PTR(__kmp_threads[gtid], NULL); 5996 5997 --__kmp_all_nth; 5998 // __kmp_nth was decremented when thread is added to the pool. 5999 6000 #ifdef KMP_ADJUST_BLOCKTIME 6001 /* Adjust blocktime back to user setting or default if necessary */ 6002 /* Middle initialization might never have occurred */ 6003 if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) { 6004 KMP_DEBUG_ASSERT(__kmp_avail_proc > 0); 6005 if (__kmp_nth <= __kmp_avail_proc) { 6006 __kmp_zero_bt = FALSE; 6007 } 6008 } 6009 #endif /* KMP_ADJUST_BLOCKTIME */ 6010 6011 /* free the memory being used */ 6012 if (__kmp_env_consistency_check) { 6013 if (thread->th.th_cons) { 6014 __kmp_free_cons_stack(thread->th.th_cons); 6015 thread->th.th_cons = NULL; 6016 } 6017 } 6018 6019 if (thread->th.th_pri_common != NULL) { 6020 __kmp_free(thread->th.th_pri_common); 6021 thread->th.th_pri_common = NULL; 6022 } 6023 6024 if (thread->th.th_task_state_memo_stack != NULL) { 6025 __kmp_free(thread->th.th_task_state_memo_stack); 6026 thread->th.th_task_state_memo_stack = NULL; 6027 } 6028 6029 #if KMP_USE_BGET 6030 if (thread->th.th_local.bget_data != NULL) { 6031 __kmp_finalize_bget(thread); 6032 } 6033 #endif 6034 6035 #if KMP_AFFINITY_SUPPORTED 6036 if (thread->th.th_affin_mask != NULL) { 6037 KMP_CPU_FREE(thread->th.th_affin_mask); 6038 thread->th.th_affin_mask = NULL; 6039 } 6040 #endif /* KMP_AFFINITY_SUPPORTED */ 6041 6042 #if KMP_USE_HIER_SCHED 6043 if (thread->th.th_hier_bar_data != NULL) { 6044 __kmp_free(thread->th.th_hier_bar_data); 6045 thread->th.th_hier_bar_data = NULL; 6046 } 6047 #endif 6048 6049 __kmp_reap_team(thread->th.th_serial_team); 6050 thread->th.th_serial_team = NULL; 6051 __kmp_free(thread); 6052 6053 KMP_MB(); 6054 6055 } // __kmp_reap_thread 6056 6057 static void __kmp_internal_end(void) { 6058 int i; 6059 6060 /* First, unregister the library */ 6061 __kmp_unregister_library(); 6062 6063 #if KMP_OS_WINDOWS 6064 /* In Win static library, we can't tell when a root actually dies, so we 6065 reclaim the data structures for any root threads that have died but not 6066 unregistered themselves, in order to shut down cleanly. 6067 In Win dynamic library we also can't tell when a thread dies. */ 6068 __kmp_reclaim_dead_roots(); // AC: moved here to always clean resources of 6069 // dead roots 6070 #endif 6071 6072 for (i = 0; i < __kmp_threads_capacity; i++) 6073 if (__kmp_root[i]) 6074 if (__kmp_root[i]->r.r_active) 6075 break; 6076 KMP_MB(); /* Flush all pending memory write invalidates. */ 6077 TCW_SYNC_4(__kmp_global.g.g_done, TRUE); 6078 6079 if (i < __kmp_threads_capacity) { 6080 #if KMP_USE_MONITOR 6081 // 2009-09-08 (lev): Other alive roots found. Why do we kill the monitor?? 6082 KMP_MB(); /* Flush all pending memory write invalidates. */ 6083 6084 // Need to check that monitor was initialized before reaping it. 
If we are 6085 // called from __kmp_atfork_child (which sets __kmp_init_parallel = 0), then 6086 // __kmp_monitor will appear to contain valid data, but it is only valid in 6087 // the parent process, not the child. 6088 // New behavior (201008): instead of keying off of the flag 6089 // __kmp_init_parallel, the monitor thread creation is keyed off 6090 // of the new flag __kmp_init_monitor. 6091 __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock); 6092 if (TCR_4(__kmp_init_monitor)) { 6093 __kmp_reap_monitor(&__kmp_monitor); 6094 TCW_4(__kmp_init_monitor, 0); 6095 } 6096 __kmp_release_bootstrap_lock(&__kmp_monitor_lock); 6097 KA_TRACE(10, ("__kmp_internal_end: monitor reaped\n")); 6098 #endif // KMP_USE_MONITOR 6099 } else { 6100 /* TODO move this to cleanup code */ 6101 #ifdef KMP_DEBUG 6102 /* make sure that everything has properly ended */ 6103 for (i = 0; i < __kmp_threads_capacity; i++) { 6104 if (__kmp_root[i]) { 6105 // KMP_ASSERT( ! KMP_UBER_GTID( i ) ); // AC: 6106 // there can be uber threads alive here 6107 KMP_ASSERT(!__kmp_root[i]->r.r_active); // TODO: can they be active? 6108 } 6109 } 6110 #endif 6111 6112 KMP_MB(); 6113 6114 // Reap the worker threads. 6115 // This is valid for now, but be careful if threads are reaped sooner. 6116 while (__kmp_thread_pool != NULL) { // Loop through all the threads in the pool. 6117 // Get the next thread from the pool. 6118 kmp_info_t *thread = CCAST(kmp_info_t *, __kmp_thread_pool); 6119 __kmp_thread_pool = thread->th.th_next_pool; 6120 // Reap it. 6121 KMP_DEBUG_ASSERT(thread->th.th_reap_state == KMP_SAFE_TO_REAP); 6122 thread->th.th_next_pool = NULL; 6123 thread->th.th_in_pool = FALSE; 6124 __kmp_reap_thread(thread, 0); 6125 } 6126 __kmp_thread_pool_insert_pt = NULL; 6127 6128 // Reap teams. 6129 while (__kmp_team_pool != NULL) { // Loop through all the teams in the pool. 6130 // Get the next team from the pool. 6131 kmp_team_t *team = CCAST(kmp_team_t *, __kmp_team_pool); 6132 __kmp_team_pool = team->t.t_next_pool; 6133 // Reap it. 6134 team->t.t_next_pool = NULL; 6135 __kmp_reap_team(team); 6136 } 6137 6138 __kmp_reap_task_teams(); 6139 6140 #if KMP_OS_UNIX 6141 // Threads that are not reaped should not access any resources since they 6142 // are going to be deallocated soon, so the shutdown sequence should wait 6143 // until all threads either exit the final spin-waiting loop or begin 6144 // sleeping after the given blocktime. 6145 for (i = 0; i < __kmp_threads_capacity; i++) { 6146 kmp_info_t *thr = __kmp_threads[i]; 6147 while (thr && KMP_ATOMIC_LD_ACQ(&thr->th.th_blocking)) 6148 KMP_CPU_PAUSE(); 6149 } 6150 #endif 6151 6152 for (i = 0; i < __kmp_threads_capacity; ++i) { 6153 // TBD: Add some checking... 6154 // Something like KMP_DEBUG_ASSERT( __kmp_thread[ i ] == NULL ); 6155 } 6156 6157 /* Make sure all threadprivate destructors get run by joining with all 6158 worker threads before resetting this flag */ 6159 TCW_SYNC_4(__kmp_init_common, FALSE); 6160 6161 KA_TRACE(10, ("__kmp_internal_end: all workers reaped\n")); 6162 KMP_MB(); 6163 6164 #if KMP_USE_MONITOR 6165 // See note above: One of the possible fixes for CQ138434 / CQ140126 6166 // 6167 // FIXME: push both code fragments down and CSE them? 6168 // push them into __kmp_cleanup() ?
6169 __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock); 6170 if (TCR_4(__kmp_init_monitor)) { 6171 __kmp_reap_monitor(&__kmp_monitor); 6172 TCW_4(__kmp_init_monitor, 0); 6173 } 6174 __kmp_release_bootstrap_lock(&__kmp_monitor_lock); 6175 KA_TRACE(10, ("__kmp_internal_end: monitor reaped\n")); 6176 #endif 6177 } /* else !__kmp_global.t_active */ 6178 TCW_4(__kmp_init_gtid, FALSE); 6179 KMP_MB(); /* Flush all pending memory write invalidates. */ 6180 6181 __kmp_cleanup(); 6182 #if OMPT_SUPPORT 6183 ompt_fini(); 6184 #endif 6185 } 6186 6187 void __kmp_internal_end_library(int gtid_req) { 6188 /* if we have already cleaned up, don't try again, it wouldn't be pretty */ 6189 /* this shouldn't be a race condition because __kmp_internal_end() is the 6190 only place to clear __kmp_serial_init */ 6191 /* we'll check this later too, after we get the lock */ 6192 // 2009-09-06: We do not set g_abort without setting g_done. This check looks 6193 // redundant, because the next check will work in any case. 6194 if (__kmp_global.g.g_abort) { 6195 KA_TRACE(11, ("__kmp_internal_end_library: abort, exiting\n")); 6196 /* TODO abort? */ 6197 return; 6198 } 6199 if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) { 6200 KA_TRACE(10, ("__kmp_internal_end_library: already finished\n")); 6201 return; 6202 } 6203 6204 // If hidden helper team has been initialized, we need to deinit it 6205 if (TCR_4(__kmp_init_hidden_helper) && 6206 !TCR_4(__kmp_hidden_helper_team_done)) { 6207 TCW_SYNC_4(__kmp_hidden_helper_team_done, TRUE); 6208 // First release the main thread to let it continue its work 6209 __kmp_hidden_helper_main_thread_release(); 6210 // Wait until the hidden helper team has been destroyed 6211 __kmp_hidden_helper_threads_deinitz_wait(); 6212 } 6213 6214 KMP_MB(); /* Flush all pending memory write invalidates. */ 6215 /* find out who we are and what we should do */ 6216 { 6217 int gtid = (gtid_req >= 0) ? gtid_req : __kmp_gtid_get_specific(); 6218 KA_TRACE( 6219 10, ("__kmp_internal_end_library: enter T#%d (%d)\n", gtid, gtid_req)); 6220 if (gtid == KMP_GTID_SHUTDOWN) { 6221 KA_TRACE(10, ("__kmp_internal_end_library: !__kmp_init_runtime, system " 6222 "already shutdown\n")); 6223 return; 6224 } else if (gtid == KMP_GTID_MONITOR) { 6225 KA_TRACE(10, ("__kmp_internal_end_library: monitor thread, gtid not " 6226 "registered, or system shutdown\n")); 6227 return; 6228 } else if (gtid == KMP_GTID_DNE) { 6229 KA_TRACE(10, ("__kmp_internal_end_library: gtid not registered or system " 6230 "shutdown\n")); 6231 /* we don't know who we are, but we may still shutdown the library */ 6232 } else if (KMP_UBER_GTID(gtid)) { 6233 /* unregister ourselves as an uber thread. gtid is no longer valid */ 6234 if (__kmp_root[gtid]->r.r_active) { 6235 __kmp_global.g.g_abort = -1; 6236 TCW_SYNC_4(__kmp_global.g.g_done, TRUE); 6237 __kmp_unregister_library(); 6238 KA_TRACE(10, 6239 ("__kmp_internal_end_library: root still active, abort T#%d\n", 6240 gtid)); 6241 return; 6242 } else { 6243 KA_TRACE( 6244 10, 6245 ("__kmp_internal_end_library: unregistering sibling T#%d\n", gtid)); 6246 __kmp_unregister_root_current_thread(gtid); 6247 } 6248 } else { 6249 /* worker threads may call this function through the atexit handler, if they 6250 * call exit() */ 6251 /* For now, skip the usual subsequent processing and just dump the debug buffer. 
6252 TODO: do a thorough shutdown instead */ 6253 #ifdef DUMP_DEBUG_ON_EXIT 6254 if (__kmp_debug_buf) 6255 __kmp_dump_debug_buffer(); 6256 #endif 6257 // added unregister library call here when we switch to shm linux 6258 // if we don't, it will leave lots of files in /dev/shm 6259 // cleanup shared memory file before exiting. 6260 __kmp_unregister_library(); 6261 return; 6262 } 6263 } 6264 /* synchronize the termination process */ 6265 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock); 6266 6267 /* have we already finished */ 6268 if (__kmp_global.g.g_abort) { 6269 KA_TRACE(10, ("__kmp_internal_end_library: abort, exiting\n")); 6270 /* TODO abort? */ 6271 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 6272 return; 6273 } 6274 if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) { 6275 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 6276 return; 6277 } 6278 6279 /* We need this lock to enforce mutex between this reading of 6280 __kmp_threads_capacity and the writing by __kmp_register_root. 6281 Alternatively, we can use a counter of roots that is atomically updated by 6282 __kmp_get_global_thread_id_reg, __kmp_do_serial_initialize and 6283 __kmp_internal_end_*. */ 6284 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock); 6285 6286 /* now we can safely conduct the actual termination */ 6287 __kmp_internal_end(); 6288 6289 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); 6290 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 6291 6292 KA_TRACE(10, ("__kmp_internal_end_library: exit\n")); 6293 6294 #ifdef DUMP_DEBUG_ON_EXIT 6295 if (__kmp_debug_buf) 6296 __kmp_dump_debug_buffer(); 6297 #endif 6298 6299 #if KMP_OS_WINDOWS 6300 __kmp_close_console(); 6301 #endif 6302 6303 __kmp_fini_allocator(); 6304 6305 } // __kmp_internal_end_library 6306 6307 void __kmp_internal_end_thread(int gtid_req) { 6308 int i; 6309 6310 /* if we have already cleaned up, don't try again, it wouldn't be pretty */ 6311 /* this shouldn't be a race condition because __kmp_internal_end() is the 6312 * only place to clear __kmp_serial_init */ 6313 /* we'll check this later too, after we get the lock */ 6314 // 2009-09-06: We do not set g_abort without setting g_done. This check looks 6315 // redundant, because the next check will work in any case. 6316 if (__kmp_global.g.g_abort) { 6317 KA_TRACE(11, ("__kmp_internal_end_thread: abort, exiting\n")); 6318 /* TODO abort? */ 6319 return; 6320 } 6321 if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) { 6322 KA_TRACE(10, ("__kmp_internal_end_thread: already finished\n")); 6323 return; 6324 } 6325 6326 // If hidden helper team has been initialized, we need to deinit it 6327 if (TCR_4(__kmp_init_hidden_helper) && 6328 !TCR_4(__kmp_hidden_helper_team_done)) { 6329 TCW_SYNC_4(__kmp_hidden_helper_team_done, TRUE); 6330 // First release the main thread to let it continue its work 6331 __kmp_hidden_helper_main_thread_release(); 6332 // Wait until the hidden helper team has been destroyed 6333 __kmp_hidden_helper_threads_deinitz_wait(); 6334 } 6335 6336 KMP_MB(); /* Flush all pending memory write invalidates. */ 6337 6338 /* find out who we are and what we should do */ 6339 { 6340 int gtid = (gtid_req >= 0) ? 
gtid_req : __kmp_gtid_get_specific(); 6341 KA_TRACE(10, 6342 ("__kmp_internal_end_thread: enter T#%d (%d)\n", gtid, gtid_req)); 6343 if (gtid == KMP_GTID_SHUTDOWN) { 6344 KA_TRACE(10, ("__kmp_internal_end_thread: !__kmp_init_runtime, system " 6345 "already shutdown\n")); 6346 return; 6347 } else if (gtid == KMP_GTID_MONITOR) { 6348 KA_TRACE(10, ("__kmp_internal_end_thread: monitor thread, gtid not " 6349 "registered, or system shutdown\n")); 6350 return; 6351 } else if (gtid == KMP_GTID_DNE) { 6352 KA_TRACE(10, ("__kmp_internal_end_thread: gtid not registered or system " 6353 "shutdown\n")); 6354 return; 6355 /* we don't know who we are */ 6356 } else if (KMP_UBER_GTID(gtid)) { 6357 /* unregister ourselves as an uber thread. gtid is no longer valid */ 6358 if (__kmp_root[gtid]->r.r_active) { 6359 __kmp_global.g.g_abort = -1; 6360 TCW_SYNC_4(__kmp_global.g.g_done, TRUE); 6361 KA_TRACE(10, 6362 ("__kmp_internal_end_thread: root still active, abort T#%d\n", 6363 gtid)); 6364 return; 6365 } else { 6366 KA_TRACE(10, ("__kmp_internal_end_thread: unregistering sibling T#%d\n", 6367 gtid)); 6368 __kmp_unregister_root_current_thread(gtid); 6369 } 6370 } else { 6371 /* just a worker thread, let's leave */ 6372 KA_TRACE(10, ("__kmp_internal_end_thread: worker thread T#%d\n", gtid)); 6373 6374 if (gtid >= 0) { 6375 __kmp_threads[gtid]->th.th_task_team = NULL; 6376 } 6377 6378 KA_TRACE(10, 6379 ("__kmp_internal_end_thread: worker thread done, exiting T#%d\n", 6380 gtid)); 6381 return; 6382 } 6383 } 6384 #if KMP_DYNAMIC_LIB 6385 if (__kmp_pause_status != kmp_hard_paused) 6386 // AC: lets not shutdown the dynamic library at the exit of uber thread, 6387 // because we will better shutdown later in the library destructor. 6388 { 6389 KA_TRACE(10, ("__kmp_internal_end_thread: exiting T#%d\n", gtid_req)); 6390 return; 6391 } 6392 #endif 6393 /* synchronize the termination process */ 6394 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock); 6395 6396 /* have we already finished */ 6397 if (__kmp_global.g.g_abort) { 6398 KA_TRACE(10, ("__kmp_internal_end_thread: abort, exiting\n")); 6399 /* TODO abort? */ 6400 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 6401 return; 6402 } 6403 if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) { 6404 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 6405 return; 6406 } 6407 6408 /* We need this lock to enforce mutex between this reading of 6409 __kmp_threads_capacity and the writing by __kmp_register_root. 6410 Alternatively, we can use a counter of roots that is atomically updated by 6411 __kmp_get_global_thread_id_reg, __kmp_do_serial_initialize and 6412 __kmp_internal_end_*. */ 6413 6414 /* should we finish the run-time? are all siblings done? 
*/ 6415 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock); 6416 6417 for (i = 0; i < __kmp_threads_capacity; ++i) { 6418 if (KMP_UBER_GTID(i)) { 6419 KA_TRACE( 6420 10, 6421 ("__kmp_internal_end_thread: remaining sibling task: gtid==%d\n", i)); 6422 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); 6423 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 6424 return; 6425 } 6426 } 6427 6428 /* now we can safely conduct the actual termination */ 6429 6430 __kmp_internal_end(); 6431 6432 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); 6433 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 6434 6435 KA_TRACE(10, ("__kmp_internal_end_thread: exit T#%d\n", gtid_req)); 6436 6437 #ifdef DUMP_DEBUG_ON_EXIT 6438 if (__kmp_debug_buf) 6439 __kmp_dump_debug_buffer(); 6440 #endif 6441 } // __kmp_internal_end_thread 6442 6443 // ----------------------------------------------------------------------------- 6444 // Library registration stuff. 6445 6446 static long __kmp_registration_flag = 0; 6447 // Random value used to indicate library initialization. 6448 static char *__kmp_registration_str = NULL; 6449 // Value to be saved in env var __KMP_REGISTERED_LIB_<pid>. 6450 6451 static inline char *__kmp_reg_status_name() { 6452 /* On RHEL 3u5 if linked statically, getpid() returns different values in 6453 each thread. If registration and unregistration go in different threads 6454 (omp_misc_other_root_exit.cpp test case), the name of registered_lib_env 6455 env var can not be found, because the name will contain different pid. */ 6456 // macOS* complains about name being too long with additional getuid() 6457 #if KMP_OS_UNIX && !KMP_OS_DARWIN && KMP_DYNAMIC_LIB 6458 return __kmp_str_format("__KMP_REGISTERED_LIB_%d_%d", (int)getpid(), 6459 (int)getuid()); 6460 #else 6461 return __kmp_str_format("__KMP_REGISTERED_LIB_%d", (int)getpid()); 6462 #endif 6463 } // __kmp_reg_status_get 6464 6465 void __kmp_register_library_startup(void) { 6466 6467 char *name = __kmp_reg_status_name(); // Name of the environment variable. 6468 int done = 0; 6469 union { 6470 double dtime; 6471 long ltime; 6472 } time; 6473 #if KMP_ARCH_X86 || KMP_ARCH_X86_64 6474 __kmp_initialize_system_tick(); 6475 #endif 6476 __kmp_read_system_time(&time.dtime); 6477 __kmp_registration_flag = 0xCAFE0000L | (time.ltime & 0x0000FFFFL); 6478 __kmp_registration_str = 6479 __kmp_str_format("%p-%lx-%s", &__kmp_registration_flag, 6480 __kmp_registration_flag, KMP_LIBRARY_FILE); 6481 6482 KA_TRACE(50, ("__kmp_register_library_startup: %s=\"%s\"\n", name, 6483 __kmp_registration_str)); 6484 6485 while (!done) { 6486 6487 char *value = NULL; // Actual value of the environment variable. 6488 6489 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB // shared memory is with dynamic library 6490 char *shm_name = __kmp_str_format("/%s", name); 6491 int shm_preexist = 0; 6492 char *data1; 6493 int fd1 = shm_open(shm_name, O_CREAT | O_EXCL | O_RDWR, 0666); 6494 if ((fd1 == -1) && (errno == EEXIST)) { 6495 // file didn't open because it already exists. 6496 // try opening existing file 6497 fd1 = shm_open(shm_name, O_RDWR, 0666); 6498 if (fd1 == -1) { // file didn't open 6499 // error out here 6500 __kmp_fatal(KMP_MSG(FunctionError, "Can't open SHM"), KMP_ERR(0), 6501 __kmp_msg_null); 6502 } else { 6503 // able to open existing file 6504 shm_preexist = 1; 6505 } 6506 } else if (fd1 == -1) { // SHM didn't open; it was due to error other than 6507 // already exists. 6508 // error out here. 
6509 __kmp_fatal(KMP_MSG(FunctionError, "Can't open SHM2"), KMP_ERR(errno), 6510 __kmp_msg_null); 6511 } 6512 if (shm_preexist == 0) { 6513 // we created SHM now set size 6514 if (ftruncate(fd1, SHM_SIZE) == -1) { 6515 // error occurred setting size; 6516 __kmp_fatal(KMP_MSG(FunctionError, "Can't set size of SHM"), 6517 KMP_ERR(errno), __kmp_msg_null); 6518 } 6519 } 6520 data1 = 6521 (char *)mmap(0, SHM_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED, fd1, 0); 6522 if (data1 == MAP_FAILED) { 6523 // failed to map shared memory 6524 __kmp_fatal(KMP_MSG(FunctionError, "Can't map SHM"), KMP_ERR(errno), 6525 __kmp_msg_null); 6526 } 6527 if (shm_preexist == 0) { // set data to SHM, set value 6528 KMP_STRCPY_S(data1, SHM_SIZE, __kmp_registration_str); 6529 } 6530 // Read value from either what we just wrote or existing file. 6531 value = __kmp_str_format("%s", data1); // read value from SHM 6532 munmap(data1, SHM_SIZE); 6533 close(fd1); 6534 #else // Windows and unix with static library 6535 // Set environment variable, but do not overwrite if it already exists. 6536 __kmp_env_set(name, __kmp_registration_str, 0); 6537 // read value to see if it got set 6538 value = __kmp_env_get(name); 6539 #endif 6540 6541 if (value != NULL && strcmp(value, __kmp_registration_str) == 0) { 6542 done = 1; // Ok, environment variable set successfully, exit the loop. 6543 } else { 6544 // Oops. Write failed. Another copy of OpenMP RTL is in memory. 6545 // Check whether it is alive or dead. 6546 int neighbor = 0; // 0 -- unknown status, 1 -- alive, 2 -- dead. 6547 char *tail = value; 6548 char *flag_addr_str = NULL; 6549 char *flag_val_str = NULL; 6550 char const *file_name = NULL; 6551 __kmp_str_split(tail, '-', &flag_addr_str, &tail); 6552 __kmp_str_split(tail, '-', &flag_val_str, &tail); 6553 file_name = tail; 6554 if (tail != NULL) { 6555 unsigned long *flag_addr = 0; 6556 unsigned long flag_val = 0; 6557 KMP_SSCANF(flag_addr_str, "%p", RCAST(void **, &flag_addr)); 6558 KMP_SSCANF(flag_val_str, "%lx", &flag_val); 6559 if (flag_addr != 0 && flag_val != 0 && strcmp(file_name, "") != 0) { 6560 // First, check whether environment-encoded address is mapped into 6561 // addr space. 6562 // If so, dereference it to see if it still has the right value. 6563 if (__kmp_is_address_mapped(flag_addr) && *flag_addr == flag_val) { 6564 neighbor = 1; 6565 } else { 6566 // If not, then we know the other copy of the library is no longer 6567 // running. 6568 neighbor = 2; 6569 } 6570 } 6571 } 6572 switch (neighbor) { 6573 case 0: // Cannot parse environment variable -- neighbor status unknown. 6574 // Assume it is the incompatible format of a future version of the 6575 // library. Assume the other library is alive. 6576 // WARN( ... ); // TODO: Issue a warning. 6577 file_name = "unknown library"; 6578 KMP_FALLTHROUGH(); 6579 // Attention! Falling through to the next case. That's intentional. 6580 case 1: { // Neighbor is alive. 6581 // Check whether this is allowed. 6582 char *duplicate_ok = __kmp_env_get("KMP_DUPLICATE_LIB_OK"); 6583 if (!__kmp_str_match_true(duplicate_ok)) { 6584 // That's not allowed. Issue fatal error. 6585 __kmp_fatal(KMP_MSG(DuplicateLibrary, KMP_LIBRARY_FILE, file_name), 6586 KMP_HNT(DuplicateLibrary), __kmp_msg_null); 6587 } 6588 KMP_INTERNAL_FREE(duplicate_ok); 6589 __kmp_duplicate_library_ok = 1; 6590 done = 1; // Exit the loop. 6591 } break; 6592 case 2: { // Neighbor is dead. 6593 6594 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB // shared memory is with dynamic library 6595 // close shared memory.
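// The value parsed above has the "%p-%lx-%s" layout written by
// __kmp_register_library_startup(), e.g. (illustrative only, not a real value)
// "0x7f0a2c0010a0-cafe1234-libomp.so". A dead neighbor leaves such a value
// behind with an unmapped flag address, so unlinking the stale /dev/shm entry
// below lets the enclosing while-loop retry the registration from scratch.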
6596 shm_unlink(shm_name); // this removes file in /dev/shm 6597 #else 6598 // Clear the variable and try to register library again. 6599 __kmp_env_unset(name); 6600 #endif 6601 } break; 6602 default: { 6603 KMP_DEBUG_ASSERT(0); 6604 } break; 6605 } 6606 } 6607 KMP_INTERNAL_FREE((void *)value); 6608 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB // shared memory is with dynamic library 6609 KMP_INTERNAL_FREE((void *)shm_name); 6610 #endif 6611 } // while 6612 KMP_INTERNAL_FREE((void *)name); 6613 6614 } // func __kmp_register_library_startup 6615 6616 void __kmp_unregister_library(void) { 6617 6618 char *name = __kmp_reg_status_name(); 6619 char *value = NULL; 6620 6621 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB // shared memory is with dynamic library 6622 char *shm_name = __kmp_str_format("/%s", name); 6623 int fd1 = shm_open(shm_name, O_RDONLY, 0666); 6624 if (fd1 == -1) { 6625 // file did not open. return. 6626 return; 6627 } 6628 char *data1 = (char *)mmap(0, SHM_SIZE, PROT_READ, MAP_SHARED, fd1, 0); 6629 if (data1 != MAP_FAILED) { 6630 value = __kmp_str_format("%s", data1); // read value from SHM 6631 munmap(data1, SHM_SIZE); 6632 } 6633 close(fd1); 6634 #else 6635 value = __kmp_env_get(name); 6636 #endif 6637 6638 KMP_DEBUG_ASSERT(__kmp_registration_flag != 0); 6639 KMP_DEBUG_ASSERT(__kmp_registration_str != NULL); 6640 if (value != NULL && strcmp(value, __kmp_registration_str) == 0) { 6641 // Ok, this is our variable. Delete it. 6642 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB // shared memory is with dynamic library 6643 shm_unlink(shm_name); // this removes file in /dev/shm 6644 #else 6645 __kmp_env_unset(name); 6646 #endif 6647 } 6648 6649 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB // shared memory is with dynamic library 6650 KMP_INTERNAL_FREE(shm_name); 6651 #endif 6652 6653 KMP_INTERNAL_FREE(__kmp_registration_str); 6654 KMP_INTERNAL_FREE(value); 6655 KMP_INTERNAL_FREE(name); 6656 6657 __kmp_registration_flag = 0; 6658 __kmp_registration_str = NULL; 6659 6660 } // __kmp_unregister_library 6661 6662 // End of Library registration stuff. 6663 // ----------------------------------------------------------------------------- 6664 6665 #if KMP_MIC_SUPPORTED 6666 6667 static void __kmp_check_mic_type() { 6668 kmp_cpuid_t cpuid_state = {0}; 6669 kmp_cpuid_t *cs_p = &cpuid_state; 6670 __kmp_x86_cpuid(1, 0, cs_p); 6671 // We don't support mic1 at the moment 6672 if ((cs_p->eax & 0xff0) == 0xB10) { 6673 __kmp_mic_type = mic2; 6674 } else if ((cs_p->eax & 0xf0ff0) == 0x50670) { 6675 __kmp_mic_type = mic3; 6676 } else { 6677 __kmp_mic_type = non_mic; 6678 } 6679 } 6680 6681 #endif /* KMP_MIC_SUPPORTED */ 6682 6683 #if KMP_HAVE_UMWAIT 6684 static void __kmp_user_level_mwait_init() { 6685 struct kmp_cpuid buf; 6686 __kmp_x86_cpuid(7, 0, &buf); 6687 __kmp_umwait_enabled = ((buf.ecx >> 5) & 1) && __kmp_user_level_mwait; 6688 KF_TRACE(30, ("__kmp_user_level_mwait_init: __kmp_umwait_enabled = %d\n", 6689 __kmp_umwait_enabled)); 6690 } 6691 #elif KMP_HAVE_MWAIT 6692 #ifndef AT_INTELPHIUSERMWAIT 6693 // Spurious, non-existent value that should always fail to return anything. 6694 // Will be replaced with the correct value when we know that. 6695 #define AT_INTELPHIUSERMWAIT 10000 6696 #endif 6697 // getauxval() function is available in RHEL7 and SLES12. If a system with an 6698 // earlier OS is used to build the RTL, we'll use the following internal 6699 // function when the entry is not found. 
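// Note: because this weak stub always returns 0, builds that fall back to it
// (i.e., systems whose libc lacks getauxval()) see res == 0 in
// __kmp_user_level_mwait_init() below, so user-level mwait is only enabled
// when KMP_USER_LEVEL_MWAIT explicitly requests it.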
6700 unsigned long getauxval(unsigned long) KMP_WEAK_ATTRIBUTE_EXTERNAL; 6701 unsigned long getauxval(unsigned long) { return 0; } 6702 6703 static void __kmp_user_level_mwait_init() { 6704 // When getauxval() and correct value of AT_INTELPHIUSERMWAIT are available 6705 // use them to find if the user-level mwait is enabled. Otherwise, forcibly 6706 // set __kmp_mwait_enabled=TRUE on Intel MIC if the environment variable 6707 // KMP_USER_LEVEL_MWAIT was set to TRUE. 6708 if (__kmp_mic_type == mic3) { 6709 unsigned long res = getauxval(AT_INTELPHIUSERMWAIT); 6710 if ((res & 0x1) || __kmp_user_level_mwait) { 6711 __kmp_mwait_enabled = TRUE; 6712 if (__kmp_user_level_mwait) { 6713 KMP_INFORM(EnvMwaitWarn); 6714 } 6715 } else { 6716 __kmp_mwait_enabled = FALSE; 6717 } 6718 } 6719 KF_TRACE(30, ("__kmp_user_level_mwait_init: __kmp_mic_type = %d, " 6720 "__kmp_mwait_enabled = %d\n", 6721 __kmp_mic_type, __kmp_mwait_enabled)); 6722 } 6723 #endif /* KMP_HAVE_UMWAIT */ 6724 6725 static void __kmp_do_serial_initialize(void) { 6726 int i, gtid; 6727 size_t size; 6728 6729 KA_TRACE(10, ("__kmp_do_serial_initialize: enter\n")); 6730 6731 KMP_DEBUG_ASSERT(sizeof(kmp_int32) == 4); 6732 KMP_DEBUG_ASSERT(sizeof(kmp_uint32) == 4); 6733 KMP_DEBUG_ASSERT(sizeof(kmp_int64) == 8); 6734 KMP_DEBUG_ASSERT(sizeof(kmp_uint64) == 8); 6735 KMP_DEBUG_ASSERT(sizeof(kmp_intptr_t) == sizeof(void *)); 6736 6737 #if OMPT_SUPPORT 6738 ompt_pre_init(); 6739 #endif 6740 #if OMPD_SUPPORT 6741 __kmp_env_dump(); 6742 ompd_init(); 6743 #endif 6744 6745 __kmp_validate_locks(); 6746 6747 /* Initialize internal memory allocator */ 6748 __kmp_init_allocator(); 6749 6750 /* Register the library startup via an environment variable and check to see 6751 whether another copy of the library is already registered. 
*/ 6752 6753 __kmp_register_library_startup(); 6754 6755 /* TODO reinitialization of library */ 6756 if (TCR_4(__kmp_global.g.g_done)) { 6757 KA_TRACE(10, ("__kmp_do_serial_initialize: reinitialization of library\n")); 6758 } 6759 6760 __kmp_global.g.g_abort = 0; 6761 TCW_SYNC_4(__kmp_global.g.g_done, FALSE); 6762 6763 /* initialize the locks */ 6764 #if KMP_USE_ADAPTIVE_LOCKS 6765 #if KMP_DEBUG_ADAPTIVE_LOCKS 6766 __kmp_init_speculative_stats(); 6767 #endif 6768 #endif 6769 #if KMP_STATS_ENABLED 6770 __kmp_stats_init(); 6771 #endif 6772 __kmp_init_lock(&__kmp_global_lock); 6773 __kmp_init_queuing_lock(&__kmp_dispatch_lock); 6774 __kmp_init_lock(&__kmp_debug_lock); 6775 __kmp_init_atomic_lock(&__kmp_atomic_lock); 6776 __kmp_init_atomic_lock(&__kmp_atomic_lock_1i); 6777 __kmp_init_atomic_lock(&__kmp_atomic_lock_2i); 6778 __kmp_init_atomic_lock(&__kmp_atomic_lock_4i); 6779 __kmp_init_atomic_lock(&__kmp_atomic_lock_4r); 6780 __kmp_init_atomic_lock(&__kmp_atomic_lock_8i); 6781 __kmp_init_atomic_lock(&__kmp_atomic_lock_8r); 6782 __kmp_init_atomic_lock(&__kmp_atomic_lock_8c); 6783 __kmp_init_atomic_lock(&__kmp_atomic_lock_10r); 6784 __kmp_init_atomic_lock(&__kmp_atomic_lock_16r); 6785 __kmp_init_atomic_lock(&__kmp_atomic_lock_16c); 6786 __kmp_init_atomic_lock(&__kmp_atomic_lock_20c); 6787 __kmp_init_atomic_lock(&__kmp_atomic_lock_32c); 6788 __kmp_init_bootstrap_lock(&__kmp_forkjoin_lock); 6789 __kmp_init_bootstrap_lock(&__kmp_exit_lock); 6790 #if KMP_USE_MONITOR 6791 __kmp_init_bootstrap_lock(&__kmp_monitor_lock); 6792 #endif 6793 __kmp_init_bootstrap_lock(&__kmp_tp_cached_lock); 6794 6795 /* conduct initialization and initial setup of configuration */ 6796 6797 __kmp_runtime_initialize(); 6798 6799 #if KMP_MIC_SUPPORTED 6800 __kmp_check_mic_type(); 6801 #endif 6802 6803 // Some global variable initialization moved here from kmp_env_initialize() 6804 #ifdef KMP_DEBUG 6805 kmp_diag = 0; 6806 #endif 6807 __kmp_abort_delay = 0; 6808 6809 // From __kmp_init_dflt_team_nth() 6810 /* assume the entire machine will be used */ 6811 __kmp_dflt_team_nth_ub = __kmp_xproc; 6812 if (__kmp_dflt_team_nth_ub < KMP_MIN_NTH) { 6813 __kmp_dflt_team_nth_ub = KMP_MIN_NTH; 6814 } 6815 if (__kmp_dflt_team_nth_ub > __kmp_sys_max_nth) { 6816 __kmp_dflt_team_nth_ub = __kmp_sys_max_nth; 6817 } 6818 __kmp_max_nth = __kmp_sys_max_nth; 6819 __kmp_cg_max_nth = __kmp_sys_max_nth; 6820 __kmp_teams_max_nth = __kmp_xproc; // set a "reasonable" default 6821 if (__kmp_teams_max_nth > __kmp_sys_max_nth) { 6822 __kmp_teams_max_nth = __kmp_sys_max_nth; 6823 } 6824 6825 // Three vars below moved here from __kmp_env_initialize() "KMP_BLOCKTIME" 6826 // part 6827 __kmp_dflt_blocktime = KMP_DEFAULT_BLOCKTIME; 6828 #if KMP_USE_MONITOR 6829 __kmp_monitor_wakeups = 6830 KMP_WAKEUPS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups); 6831 __kmp_bt_intervals = 6832 KMP_INTERVALS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups); 6833 #endif 6834 // From "KMP_LIBRARY" part of __kmp_env_initialize() 6835 __kmp_library = library_throughput; 6836 // From KMP_SCHEDULE initialization 6837 __kmp_static = kmp_sch_static_balanced; 6838 // AC: do not use analytical here, because it is non-monotonous 6839 //__kmp_guided = kmp_sch_guided_iterative_chunked; 6840 //__kmp_auto = kmp_sch_guided_analytical_chunked; // AC: it is the default, no 6841 // need to repeat assignment 6842 // Barrier initialization. 
Moved here from __kmp_env_initialize() Barrier branch 6843 // bit control and barrier method control parts 6844 #if KMP_FAST_REDUCTION_BARRIER 6845 #define kmp_reduction_barrier_gather_bb ((int)1) 6846 #define kmp_reduction_barrier_release_bb ((int)1) 6847 #define kmp_reduction_barrier_gather_pat bp_hyper_bar 6848 #define kmp_reduction_barrier_release_pat bp_hyper_bar 6849 #endif // KMP_FAST_REDUCTION_BARRIER 6850 for (i = bs_plain_barrier; i < bs_last_barrier; i++) { 6851 __kmp_barrier_gather_branch_bits[i] = __kmp_barrier_gather_bb_dflt; 6852 __kmp_barrier_release_branch_bits[i] = __kmp_barrier_release_bb_dflt; 6853 __kmp_barrier_gather_pattern[i] = __kmp_barrier_gather_pat_dflt; 6854 __kmp_barrier_release_pattern[i] = __kmp_barrier_release_pat_dflt; 6855 #if KMP_FAST_REDUCTION_BARRIER 6856 if (i == bs_reduction_barrier) { // tested and confirmed on ALTIX only ( 6857 // lin_64 ): hyper,1 6858 __kmp_barrier_gather_branch_bits[i] = kmp_reduction_barrier_gather_bb; 6859 __kmp_barrier_release_branch_bits[i] = kmp_reduction_barrier_release_bb; 6860 __kmp_barrier_gather_pattern[i] = kmp_reduction_barrier_gather_pat; 6861 __kmp_barrier_release_pattern[i] = kmp_reduction_barrier_release_pat; 6862 } 6863 #endif // KMP_FAST_REDUCTION_BARRIER 6864 } 6865 #if KMP_FAST_REDUCTION_BARRIER 6866 #undef kmp_reduction_barrier_release_pat 6867 #undef kmp_reduction_barrier_gather_pat 6868 #undef kmp_reduction_barrier_release_bb 6869 #undef kmp_reduction_barrier_gather_bb 6870 #endif // KMP_FAST_REDUCTION_BARRIER 6871 #if KMP_MIC_SUPPORTED 6872 if (__kmp_mic_type == mic2) { // KNC 6873 // AC: plane=3,2, forkjoin=2,1 are optimal for 240 threads on KNC 6874 __kmp_barrier_gather_branch_bits[bs_plain_barrier] = 3; // plain gather 6875 __kmp_barrier_release_branch_bits[bs_forkjoin_barrier] = 6876 1; // forkjoin release 6877 __kmp_barrier_gather_pattern[bs_forkjoin_barrier] = bp_hierarchical_bar; 6878 __kmp_barrier_release_pattern[bs_forkjoin_barrier] = bp_hierarchical_bar; 6879 } 6880 #if KMP_FAST_REDUCTION_BARRIER 6881 if (__kmp_mic_type == mic2) { // KNC 6882 __kmp_barrier_gather_pattern[bs_reduction_barrier] = bp_hierarchical_bar; 6883 __kmp_barrier_release_pattern[bs_reduction_barrier] = bp_hierarchical_bar; 6884 } 6885 #endif // KMP_FAST_REDUCTION_BARRIER 6886 #endif // KMP_MIC_SUPPORTED 6887 6888 // From KMP_CHECKS initialization 6889 #ifdef KMP_DEBUG 6890 __kmp_env_checks = TRUE; /* development versions have the extra checks */ 6891 #else 6892 __kmp_env_checks = FALSE; /* port versions do not have the extra checks */ 6893 #endif 6894 6895 // From "KMP_FOREIGN_THREADS_THREADPRIVATE" initialization 6896 __kmp_foreign_tp = TRUE; 6897 6898 __kmp_global.g.g_dynamic = FALSE; 6899 __kmp_global.g.g_dynamic_mode = dynamic_default; 6900 6901 __kmp_init_nesting_mode(); 6902 6903 __kmp_env_initialize(NULL); 6904 6905 #if KMP_HAVE_MWAIT || KMP_HAVE_UMWAIT 6906 __kmp_user_level_mwait_init(); 6907 #endif 6908 // Print all messages in message catalog for testing purposes. 
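// Illustrative usage (debug builds only; assumes __kmp_str_match_true accepts
// the usual true spellings such as "1" or "true"):
//   KMP_DUMP_CATALOG=true ./app
// prints the whole i18n message catalog, which is handy for checking catalog
// consistency.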
6909 #ifdef KMP_DEBUG 6910 char const *val = __kmp_env_get("KMP_DUMP_CATALOG"); 6911 if (__kmp_str_match_true(val)) { 6912 kmp_str_buf_t buffer; 6913 __kmp_str_buf_init(&buffer); 6914 __kmp_i18n_dump_catalog(&buffer); 6915 __kmp_printf("%s", buffer.str); 6916 __kmp_str_buf_free(&buffer); 6917 } 6918 __kmp_env_free(&val); 6919 #endif 6920 6921 __kmp_threads_capacity = 6922 __kmp_initial_threads_capacity(__kmp_dflt_team_nth_ub); 6923 // Moved here from __kmp_env_initialize() "KMP_ALL_THREADPRIVATE" part 6924 __kmp_tp_capacity = __kmp_default_tp_capacity( 6925 __kmp_dflt_team_nth_ub, __kmp_max_nth, __kmp_allThreadsSpecified); 6926 6927 // If the library is shut down properly, both pools must be NULL. Just in 6928 // case, set them to NULL -- some memory may leak, but subsequent code will 6929 // work even if pools are not freed. 6930 KMP_DEBUG_ASSERT(__kmp_thread_pool == NULL); 6931 KMP_DEBUG_ASSERT(__kmp_thread_pool_insert_pt == NULL); 6932 KMP_DEBUG_ASSERT(__kmp_team_pool == NULL); 6933 __kmp_thread_pool = NULL; 6934 __kmp_thread_pool_insert_pt = NULL; 6935 __kmp_team_pool = NULL; 6936 6937 /* Allocate all of the variable sized records */ 6938 /* NOTE: __kmp_threads_capacity entries are allocated, but the arrays are 6939 * expandable */ 6940 /* Since allocation is cache-aligned, just add extra padding at the end */ 6941 size = 6942 (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * __kmp_threads_capacity + 6943 CACHE_LINE; 6944 __kmp_threads = (kmp_info_t **)__kmp_allocate(size); 6945 __kmp_root = (kmp_root_t **)((char *)__kmp_threads + 6946 sizeof(kmp_info_t *) * __kmp_threads_capacity); 6947 6948 /* init thread counts */ 6949 KMP_DEBUG_ASSERT(__kmp_all_nth == 6950 0); // Asserts fail if the library is reinitializing and 6951 KMP_DEBUG_ASSERT(__kmp_nth == 0); // something was wrong in termination. 6952 __kmp_all_nth = 0; 6953 __kmp_nth = 0; 6954 6955 /* setup the uber master thread and hierarchy */ 6956 gtid = __kmp_register_root(TRUE); 6957 KA_TRACE(10, ("__kmp_do_serial_initialize T#%d\n", gtid)); 6958 KMP_ASSERT(KMP_UBER_GTID(gtid)); 6959 KMP_ASSERT(KMP_INITIAL_GTID(gtid)); 6960 6961 KMP_MB(); /* Flush all pending memory write invalidates. */ 6962 6963 __kmp_common_initialize(); 6964 6965 #if KMP_OS_UNIX 6966 /* invoke the child fork handler */ 6967 __kmp_register_atfork(); 6968 #endif 6969 6970 #if !KMP_DYNAMIC_LIB 6971 { 6972 /* Invoke the exit handler when the program finishes, only for static 6973 library. For dynamic library, we already have _fini and DllMain. */ 6974 int rc = atexit(__kmp_internal_end_atexit); 6975 if (rc != 0) { 6976 __kmp_fatal(KMP_MSG(FunctionError, "atexit()"), KMP_ERR(rc), 6977 __kmp_msg_null); 6978 } 6979 } 6980 #endif 6981 6982 #if KMP_HANDLE_SIGNALS 6983 #if KMP_OS_UNIX 6984 /* NOTE: make sure that this is called before the user installs their own 6985 signal handlers so that the user handlers are called first. this way they 6986 can return false, not call our handler, avoid terminating the library, and 6987 continue execution where they left off. 
*/ 6988 __kmp_install_signals(FALSE); 6989 #endif /* KMP_OS_UNIX */ 6990 #if KMP_OS_WINDOWS 6991 __kmp_install_signals(TRUE); 6992 #endif /* KMP_OS_WINDOWS */ 6993 #endif 6994 6995 /* we have finished the serial initialization */ 6996 __kmp_init_counter++; 6997 6998 __kmp_init_serial = TRUE; 6999 7000 if (__kmp_settings) { 7001 __kmp_env_print(); 7002 } 7003 7004 if (__kmp_display_env || __kmp_display_env_verbose) { 7005 __kmp_env_print_2(); 7006 } 7007 7008 #if OMPT_SUPPORT 7009 ompt_post_init(); 7010 #endif 7011 7012 KMP_MB(); 7013 7014 KA_TRACE(10, ("__kmp_do_serial_initialize: exit\n")); 7015 } 7016 7017 void __kmp_serial_initialize(void) { 7018 if (__kmp_init_serial) { 7019 return; 7020 } 7021 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock); 7022 if (__kmp_init_serial) { 7023 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 7024 return; 7025 } 7026 __kmp_do_serial_initialize(); 7027 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 7028 } 7029 7030 static void __kmp_do_middle_initialize(void) { 7031 int i, j; 7032 int prev_dflt_team_nth; 7033 7034 if (!__kmp_init_serial) { 7035 __kmp_do_serial_initialize(); 7036 } 7037 7038 KA_TRACE(10, ("__kmp_middle_initialize: enter\n")); 7039 7040 // Save the previous value for the __kmp_dflt_team_nth so that 7041 // we can avoid some reinitialization if it hasn't changed. 7042 prev_dflt_team_nth = __kmp_dflt_team_nth; 7043 7044 #if KMP_AFFINITY_SUPPORTED 7045 // __kmp_affinity_initialize() will try to set __kmp_ncores to the 7046 // number of cores on the machine. 7047 __kmp_affinity_initialize(); 7048 7049 #endif /* KMP_AFFINITY_SUPPORTED */ 7050 7051 KMP_ASSERT(__kmp_xproc > 0); 7052 if (__kmp_avail_proc == 0) { 7053 __kmp_avail_proc = __kmp_xproc; 7054 } 7055 7056 // If there were empty places in num_threads list (OMP_NUM_THREADS=,,2,3), 7057 // correct them now 7058 j = 0; 7059 while ((j < __kmp_nested_nth.used) && !__kmp_nested_nth.nth[j]) { 7060 __kmp_nested_nth.nth[j] = __kmp_dflt_team_nth = __kmp_dflt_team_nth_ub = 7061 __kmp_avail_proc; 7062 j++; 7063 } 7064 7065 if (__kmp_dflt_team_nth == 0) { 7066 #ifdef KMP_DFLT_NTH_CORES 7067 // Default #threads = #cores 7068 __kmp_dflt_team_nth = __kmp_ncores; 7069 KA_TRACE(20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = " 7070 "__kmp_ncores (%d)\n", 7071 __kmp_dflt_team_nth)); 7072 #else 7073 // Default #threads = #available OS procs 7074 __kmp_dflt_team_nth = __kmp_avail_proc; 7075 KA_TRACE(20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = " 7076 "__kmp_avail_proc(%d)\n", 7077 __kmp_dflt_team_nth)); 7078 #endif /* KMP_DFLT_NTH_CORES */ 7079 } 7080 7081 if (__kmp_dflt_team_nth < KMP_MIN_NTH) { 7082 __kmp_dflt_team_nth = KMP_MIN_NTH; 7083 } 7084 if (__kmp_dflt_team_nth > __kmp_sys_max_nth) { 7085 __kmp_dflt_team_nth = __kmp_sys_max_nth; 7086 } 7087 7088 if (__kmp_nesting_mode > 0) 7089 __kmp_set_nesting_mode_threads(); 7090 7091 // There's no harm in continuing if the following check fails, 7092 // but it indicates an error in the previous logic. 7093 KMP_DEBUG_ASSERT(__kmp_dflt_team_nth <= __kmp_dflt_team_nth_ub); 7094 7095 if (__kmp_dflt_team_nth != prev_dflt_team_nth) { 7096 // Run through the __kmp_threads array and set the num threads icv for each 7097 // root thread that is currently registered with the RTL (which has not 7098 // already explicitly set its nthreads-var with a call to 7099 // omp_set_num_threads()). 
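// Only roots whose nthreads-var is still unset (td_icvs.nproc == 0) are
// updated below; a root that already called omp_set_num_threads() keeps its
// explicitly chosen value.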
7100 for (i = 0; i < __kmp_threads_capacity; i++) { 7101 kmp_info_t *thread = __kmp_threads[i]; 7102 if (thread == NULL) 7103 continue; 7104 if (thread->th.th_current_task->td_icvs.nproc != 0) 7105 continue; 7106 7107 set__nproc(__kmp_threads[i], __kmp_dflt_team_nth); 7108 } 7109 } 7110 KA_TRACE( 7111 20, 7112 ("__kmp_middle_initialize: final value for __kmp_dflt_team_nth = %d\n", 7113 __kmp_dflt_team_nth)); 7114 7115 #ifdef KMP_ADJUST_BLOCKTIME 7116 /* Adjust blocktime to zero if necessary now that __kmp_avail_proc is set */ 7117 if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) { 7118 KMP_DEBUG_ASSERT(__kmp_avail_proc > 0); 7119 if (__kmp_nth > __kmp_avail_proc) { 7120 __kmp_zero_bt = TRUE; 7121 } 7122 } 7123 #endif /* KMP_ADJUST_BLOCKTIME */ 7124 7125 /* we have finished middle initialization */ 7126 TCW_SYNC_4(__kmp_init_middle, TRUE); 7127 7128 KA_TRACE(10, ("__kmp_do_middle_initialize: exit\n")); 7129 } 7130 7131 void __kmp_middle_initialize(void) { 7132 if (__kmp_init_middle) { 7133 return; 7134 } 7135 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock); 7136 if (__kmp_init_middle) { 7137 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 7138 return; 7139 } 7140 __kmp_do_middle_initialize(); 7141 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 7142 } 7143 7144 void __kmp_parallel_initialize(void) { 7145 int gtid = __kmp_entry_gtid(); // this might be a new root 7146 7147 /* synchronize parallel initialization (for sibling) */ 7148 if (TCR_4(__kmp_init_parallel)) 7149 return; 7150 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock); 7151 if (TCR_4(__kmp_init_parallel)) { 7152 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 7153 return; 7154 } 7155 7156 /* TODO reinitialization after we have already shut down */ 7157 if (TCR_4(__kmp_global.g.g_done)) { 7158 KA_TRACE( 7159 10, 7160 ("__kmp_parallel_initialize: attempt to init while shutting down\n")); 7161 __kmp_infinite_loop(); 7162 } 7163 7164 /* jc: The lock __kmp_initz_lock is already held, so calling 7165 __kmp_serial_initialize would cause a deadlock. So we call 7166 __kmp_do_serial_initialize directly. */ 7167 if (!__kmp_init_middle) { 7168 __kmp_do_middle_initialize(); 7169 } 7170 __kmp_assign_root_init_mask(); 7171 __kmp_resume_if_hard_paused(); 7172 7173 /* begin initialization */ 7174 KA_TRACE(10, ("__kmp_parallel_initialize: enter\n")); 7175 KMP_ASSERT(KMP_UBER_GTID(gtid)); 7176 7177 #if KMP_ARCH_X86 || KMP_ARCH_X86_64 7178 // Save the FP control regs. 7179 // Worker threads will set theirs to these values at thread startup. 
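// Capturing the x87 control word and MXCSR of the initial thread here means
// every worker created later starts from the same floating-point environment;
// __kmp_init_mxcsr is masked with KMP_X86_MXCSR_MASK so only supported bits
// are retained.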
7180 __kmp_store_x87_fpu_control_word(&__kmp_init_x87_fpu_control_word); 7181 __kmp_store_mxcsr(&__kmp_init_mxcsr); 7182 __kmp_init_mxcsr &= KMP_X86_MXCSR_MASK; 7183 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */ 7184 7185 #if KMP_OS_UNIX 7186 #if KMP_HANDLE_SIGNALS 7187 /* must be after __kmp_serial_initialize */ 7188 __kmp_install_signals(TRUE); 7189 #endif 7190 #endif 7191 7192 __kmp_suspend_initialize(); 7193 7194 #if defined(USE_LOAD_BALANCE) 7195 if (__kmp_global.g.g_dynamic_mode == dynamic_default) { 7196 __kmp_global.g.g_dynamic_mode = dynamic_load_balance; 7197 } 7198 #else 7199 if (__kmp_global.g.g_dynamic_mode == dynamic_default) { 7200 __kmp_global.g.g_dynamic_mode = dynamic_thread_limit; 7201 } 7202 #endif 7203 7204 if (__kmp_version) { 7205 __kmp_print_version_2(); 7206 } 7207 7208 /* we have finished parallel initialization */ 7209 TCW_SYNC_4(__kmp_init_parallel, TRUE); 7210 7211 KMP_MB(); 7212 KA_TRACE(10, ("__kmp_parallel_initialize: exit\n")); 7213 7214 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 7215 } 7216 7217 void __kmp_hidden_helper_initialize() { 7218 if (TCR_4(__kmp_init_hidden_helper)) 7219 return; 7220 7221 // __kmp_parallel_initialize is required before we initialize hidden helper 7222 if (!TCR_4(__kmp_init_parallel)) 7223 __kmp_parallel_initialize(); 7224 7225 // Double check. Note that this double check should not be placed before 7226 // __kmp_parallel_initialize as it will cause dead lock. 7227 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock); 7228 if (TCR_4(__kmp_init_hidden_helper)) { 7229 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 7230 return; 7231 } 7232 7233 // Set the count of hidden helper tasks to be executed to zero 7234 KMP_ATOMIC_ST_REL(&__kmp_unexecuted_hidden_helper_tasks, 0); 7235 7236 // Set the global variable indicating that we're initializing hidden helper 7237 // team/threads 7238 TCW_SYNC_4(__kmp_init_hidden_helper_threads, TRUE); 7239 7240 // Platform independent initialization 7241 __kmp_do_initialize_hidden_helper_threads(); 7242 7243 // Wait here for the finish of initialization of hidden helper teams 7244 __kmp_hidden_helper_threads_initz_wait(); 7245 7246 // We have finished hidden helper initialization 7247 TCW_SYNC_4(__kmp_init_hidden_helper, TRUE); 7248 7249 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 7250 } 7251 7252 /* ------------------------------------------------------------------------ */ 7253 7254 void __kmp_run_before_invoked_task(int gtid, int tid, kmp_info_t *this_thr, 7255 kmp_team_t *team) { 7256 kmp_disp_t *dispatch; 7257 7258 KMP_MB(); 7259 7260 /* none of the threads have encountered any constructs, yet. */ 7261 this_thr->th.th_local.this_construct = 0; 7262 #if KMP_CACHE_MANAGE 7263 KMP_CACHE_PREFETCH(&this_thr->th.th_bar[bs_forkjoin_barrier].bb.b_arrived); 7264 #endif /* KMP_CACHE_MANAGE */ 7265 dispatch = (kmp_disp_t *)TCR_PTR(this_thr->th.th_dispatch); 7266 KMP_DEBUG_ASSERT(dispatch); 7267 KMP_DEBUG_ASSERT(team->t.t_dispatch); 7268 // KMP_DEBUG_ASSERT( this_thr->th.th_dispatch == &team->t.t_dispatch[ 7269 // this_thr->th.th_info.ds.ds_tid ] ); 7270 7271 dispatch->th_disp_index = 0; /* reset the dispatch buffer counter */ 7272 dispatch->th_doacross_buf_idx = 0; // reset doacross dispatch buffer counter 7273 if (__kmp_env_consistency_check) 7274 __kmp_push_parallel(gtid, team->t.t_ident); 7275 7276 KMP_MB(); /* Flush all pending memory write invalidates. 
*/ 7277 } 7278 7279 void __kmp_run_after_invoked_task(int gtid, int tid, kmp_info_t *this_thr, 7280 kmp_team_t *team) { 7281 if (__kmp_env_consistency_check) 7282 __kmp_pop_parallel(gtid, team->t.t_ident); 7283 7284 __kmp_finish_implicit_task(this_thr); 7285 } 7286 7287 int __kmp_invoke_task_func(int gtid) { 7288 int rc; 7289 int tid = __kmp_tid_from_gtid(gtid); 7290 kmp_info_t *this_thr = __kmp_threads[gtid]; 7291 kmp_team_t *team = this_thr->th.th_team; 7292 7293 __kmp_run_before_invoked_task(gtid, tid, this_thr, team); 7294 #if USE_ITT_BUILD 7295 if (__itt_stack_caller_create_ptr) { 7296 // inform ittnotify about entering user's code 7297 if (team->t.t_stack_id != NULL) { 7298 __kmp_itt_stack_callee_enter((__itt_caller)team->t.t_stack_id); 7299 } else { 7300 KMP_DEBUG_ASSERT(team->t.t_parent->t.t_stack_id != NULL); 7301 __kmp_itt_stack_callee_enter( 7302 (__itt_caller)team->t.t_parent->t.t_stack_id); 7303 } 7304 } 7305 #endif /* USE_ITT_BUILD */ 7306 #if INCLUDE_SSC_MARKS 7307 SSC_MARK_INVOKING(); 7308 #endif 7309 7310 #if OMPT_SUPPORT 7311 void *dummy; 7312 void **exit_frame_p; 7313 ompt_data_t *my_task_data; 7314 ompt_data_t *my_parallel_data; 7315 int ompt_team_size; 7316 7317 if (ompt_enabled.enabled) { 7318 exit_frame_p = &(team->t.t_implicit_task_taskdata[tid] 7319 .ompt_task_info.frame.exit_frame.ptr); 7320 } else { 7321 exit_frame_p = &dummy; 7322 } 7323 7324 my_task_data = 7325 &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data); 7326 my_parallel_data = &(team->t.ompt_team_info.parallel_data); 7327 if (ompt_enabled.ompt_callback_implicit_task) { 7328 ompt_team_size = team->t.t_nproc; 7329 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 7330 ompt_scope_begin, my_parallel_data, my_task_data, ompt_team_size, 7331 __kmp_tid_from_gtid(gtid), ompt_task_implicit); 7332 OMPT_CUR_TASK_INFO(this_thr)->thread_num = __kmp_tid_from_gtid(gtid); 7333 } 7334 #endif 7335 7336 #if KMP_STATS_ENABLED 7337 stats_state_e previous_state = KMP_GET_THREAD_STATE(); 7338 if (previous_state == stats_state_e::TEAMS_REGION) { 7339 KMP_PUSH_PARTITIONED_TIMER(OMP_teams); 7340 } else { 7341 KMP_PUSH_PARTITIONED_TIMER(OMP_parallel); 7342 } 7343 KMP_SET_THREAD_STATE(IMPLICIT_TASK); 7344 #endif 7345 7346 rc = __kmp_invoke_microtask((microtask_t)TCR_SYNC_PTR(team->t.t_pkfn), gtid, 7347 tid, (int)team->t.t_argc, (void **)team->t.t_argv 7348 #if OMPT_SUPPORT 7349 , 7350 exit_frame_p 7351 #endif 7352 ); 7353 #if OMPT_SUPPORT 7354 *exit_frame_p = NULL; 7355 this_thr->th.ompt_thread_info.parallel_flags |= ompt_parallel_team; 7356 #endif 7357 7358 #if KMP_STATS_ENABLED 7359 if (previous_state == stats_state_e::TEAMS_REGION) { 7360 KMP_SET_THREAD_STATE(previous_state); 7361 } 7362 KMP_POP_PARTITIONED_TIMER(); 7363 #endif 7364 7365 #if USE_ITT_BUILD 7366 if (__itt_stack_caller_create_ptr) { 7367 // inform ittnotify about leaving user's code 7368 if (team->t.t_stack_id != NULL) { 7369 __kmp_itt_stack_callee_leave((__itt_caller)team->t.t_stack_id); 7370 } else { 7371 KMP_DEBUG_ASSERT(team->t.t_parent->t.t_stack_id != NULL); 7372 __kmp_itt_stack_callee_leave( 7373 (__itt_caller)team->t.t_parent->t.t_stack_id); 7374 } 7375 } 7376 #endif /* USE_ITT_BUILD */ 7377 __kmp_run_after_invoked_task(gtid, tid, this_thr, team); 7378 7379 return rc; 7380 } 7381 7382 void __kmp_teams_master(int gtid) { 7383 // This routine is called by all primary threads in teams construct 7384 kmp_info_t *thr = __kmp_threads[gtid]; 7385 kmp_team_t *team = thr->th.th_team; 7386 ident_t *loc = team->t.t_ident; 7387 
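// th_teams_size was filled in earlier by __kmp_push_num_teams() /
// __kmp_push_thread_limit(); copying .nth into th_set_nproc makes it the
// requested size of the parallel region this team's primary thread is about
// to launch via __kmp_fork_call() below.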
thr->th.th_set_nproc = thr->th.th_teams_size.nth; 7388 KMP_DEBUG_ASSERT(thr->th.th_teams_microtask); 7389 KMP_DEBUG_ASSERT(thr->th.th_set_nproc); 7390 KA_TRACE(20, ("__kmp_teams_master: T#%d, Tid %d, microtask %p\n", gtid, 7391 __kmp_tid_from_gtid(gtid), thr->th.th_teams_microtask)); 7392 7393 // This thread is a new CG root. Set up the proper variables. 7394 kmp_cg_root_t *tmp = (kmp_cg_root_t *)__kmp_allocate(sizeof(kmp_cg_root_t)); 7395 tmp->cg_root = thr; // Make thr the CG root 7396 // Init to thread limit stored when league primary threads were forked 7397 tmp->cg_thread_limit = thr->th.th_current_task->td_icvs.thread_limit; 7398 tmp->cg_nthreads = 1; // Init counter to one active thread, this one 7399 KA_TRACE(100, ("__kmp_teams_master: Thread %p created node %p and init" 7400 " cg_nthreads to 1\n", 7401 thr, tmp)); 7402 tmp->up = thr->th.th_cg_roots; 7403 thr->th.th_cg_roots = tmp; 7404 7405 // Launch league of teams now, but not let workers execute 7406 // (they hang on fork barrier until next parallel) 7407 #if INCLUDE_SSC_MARKS 7408 SSC_MARK_FORKING(); 7409 #endif 7410 __kmp_fork_call(loc, gtid, fork_context_intel, team->t.t_argc, 7411 (microtask_t)thr->th.th_teams_microtask, // "wrapped" task 7412 VOLATILE_CAST(launch_t) __kmp_invoke_task_func, NULL); 7413 #if INCLUDE_SSC_MARKS 7414 SSC_MARK_JOINING(); 7415 #endif 7416 // If the team size was reduced from the limit, set it to the new size 7417 if (thr->th.th_team_nproc < thr->th.th_teams_size.nth) 7418 thr->th.th_teams_size.nth = thr->th.th_team_nproc; 7419 // AC: last parameter "1" eliminates join barrier which won't work because 7420 // worker threads are in a fork barrier waiting for more parallel regions 7421 __kmp_join_call(loc, gtid 7422 #if OMPT_SUPPORT 7423 , 7424 fork_context_intel 7425 #endif 7426 , 7427 1); 7428 } 7429 7430 int __kmp_invoke_teams_master(int gtid) { 7431 kmp_info_t *this_thr = __kmp_threads[gtid]; 7432 kmp_team_t *team = this_thr->th.th_team; 7433 #if KMP_DEBUG 7434 if (!__kmp_threads[gtid]->th.th_team->t.t_serialized) 7435 KMP_DEBUG_ASSERT((void *)__kmp_threads[gtid]->th.th_team->t.t_pkfn == 7436 (void *)__kmp_teams_master); 7437 #endif 7438 __kmp_run_before_invoked_task(gtid, 0, this_thr, team); 7439 #if OMPT_SUPPORT 7440 int tid = __kmp_tid_from_gtid(gtid); 7441 ompt_data_t *task_data = 7442 &team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data; 7443 ompt_data_t *parallel_data = &team->t.ompt_team_info.parallel_data; 7444 if (ompt_enabled.ompt_callback_implicit_task) { 7445 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 7446 ompt_scope_begin, parallel_data, task_data, team->t.t_nproc, tid, 7447 ompt_task_initial); 7448 OMPT_CUR_TASK_INFO(this_thr)->thread_num = tid; 7449 } 7450 #endif 7451 __kmp_teams_master(gtid); 7452 #if OMPT_SUPPORT 7453 this_thr->th.ompt_thread_info.parallel_flags |= ompt_parallel_league; 7454 #endif 7455 __kmp_run_after_invoked_task(gtid, 0, this_thr, team); 7456 return 1; 7457 } 7458 7459 /* this sets the requested number of threads for the next parallel region 7460 encountered by this team. 
since this should be enclosed in the forkjoin 7461 critical section it should avoid race conditions with asymmetrical nested 7462 parallelism */ 7463 7464 void __kmp_push_num_threads(ident_t *id, int gtid, int num_threads) { 7465 kmp_info_t *thr = __kmp_threads[gtid]; 7466 7467 if (num_threads > 0) 7468 thr->th.th_set_nproc = num_threads; 7469 } 7470 7471 static void __kmp_push_thread_limit(kmp_info_t *thr, int num_teams, 7472 int num_threads) { 7473 KMP_DEBUG_ASSERT(thr); 7474 // Remember the number of threads for inner parallel regions 7475 if (!TCR_4(__kmp_init_middle)) 7476 __kmp_middle_initialize(); // get internal globals calculated 7477 __kmp_assign_root_init_mask(); 7478 KMP_DEBUG_ASSERT(__kmp_avail_proc); 7479 KMP_DEBUG_ASSERT(__kmp_dflt_team_nth); 7480 7481 if (num_threads == 0) { 7482 if (__kmp_teams_thread_limit > 0) { 7483 num_threads = __kmp_teams_thread_limit; 7484 } else { 7485 num_threads = __kmp_avail_proc / num_teams; 7486 } 7487 // adjust num_threads w/o warning as it is not user setting 7488 // num_threads = min(num_threads, nthreads-var, thread-limit-var) 7489 // no thread_limit clause specified - do not change thread-limit-var ICV 7490 if (num_threads > __kmp_dflt_team_nth) { 7491 num_threads = __kmp_dflt_team_nth; // honor nthreads-var ICV 7492 } 7493 if (num_threads > thr->th.th_current_task->td_icvs.thread_limit) { 7494 num_threads = thr->th.th_current_task->td_icvs.thread_limit; 7495 } // prevent team size to exceed thread-limit-var 7496 if (num_teams * num_threads > __kmp_teams_max_nth) { 7497 num_threads = __kmp_teams_max_nth / num_teams; 7498 } 7499 if (num_threads == 0) { 7500 num_threads = 1; 7501 } 7502 } else { 7503 // This thread will be the primary thread of the league primary threads 7504 // Store new thread limit; old limit is saved in th_cg_roots list 7505 thr->th.th_current_task->td_icvs.thread_limit = num_threads; 7506 // num_threads = min(num_threads, nthreads-var) 7507 if (num_threads > __kmp_dflt_team_nth) { 7508 num_threads = __kmp_dflt_team_nth; // honor nthreads-var ICV 7509 } 7510 if (num_teams * num_threads > __kmp_teams_max_nth) { 7511 int new_threads = __kmp_teams_max_nth / num_teams; 7512 if (new_threads == 0) { 7513 new_threads = 1; 7514 } 7515 if (new_threads != num_threads) { 7516 if (!__kmp_reserve_warn) { // user asked for too many threads 7517 __kmp_reserve_warn = 1; // conflicts with KMP_TEAMS_THREAD_LIMIT 7518 __kmp_msg(kmp_ms_warning, 7519 KMP_MSG(CantFormThrTeam, num_threads, new_threads), 7520 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null); 7521 } 7522 } 7523 num_threads = new_threads; 7524 } 7525 } 7526 thr->th.th_teams_size.nth = num_threads; 7527 } 7528 7529 /* this sets the requested number of teams for the teams region and/or 7530 the number of threads for the next parallel region encountered */ 7531 void __kmp_push_num_teams(ident_t *id, int gtid, int num_teams, 7532 int num_threads) { 7533 kmp_info_t *thr = __kmp_threads[gtid]; 7534 KMP_DEBUG_ASSERT(num_teams >= 0); 7535 KMP_DEBUG_ASSERT(num_threads >= 0); 7536 7537 if (num_teams == 0) { 7538 if (__kmp_nteams > 0) { 7539 num_teams = __kmp_nteams; 7540 } else { 7541 num_teams = 1; // default number of teams is 1. 7542 } 7543 } 7544 if (num_teams > __kmp_teams_max_nth) { // if too many teams requested? 
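// Too many teams requested: warn once (guarded by __kmp_reserve_warn) and
// clamp the request to __kmp_teams_max_nth, mirroring the clamping done in
// __kmp_push_thread_limit() above.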
7545 if (!__kmp_reserve_warn) { 7546 __kmp_reserve_warn = 1; 7547 __kmp_msg(kmp_ms_warning, 7548 KMP_MSG(CantFormThrTeam, num_teams, __kmp_teams_max_nth), 7549 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null); 7550 } 7551 num_teams = __kmp_teams_max_nth; 7552 } 7553 // Set number of teams (number of threads in the outer "parallel" of the 7554 // teams) 7555 thr->th.th_set_nproc = thr->th.th_teams_size.nteams = num_teams; 7556 7557 __kmp_push_thread_limit(thr, num_teams, num_threads); 7558 } 7559 7560 /* This sets the requested number of teams for the teams region and/or 7561 the number of threads for the next parallel region encountered */ 7562 void __kmp_push_num_teams_51(ident_t *id, int gtid, int num_teams_lb, 7563 int num_teams_ub, int num_threads) { 7564 kmp_info_t *thr = __kmp_threads[gtid]; 7565 KMP_DEBUG_ASSERT(num_teams_lb >= 0 && num_teams_ub >= 0); 7566 KMP_DEBUG_ASSERT(num_teams_ub >= num_teams_lb); 7567 KMP_DEBUG_ASSERT(num_threads >= 0); 7568 7569 if (num_teams_lb > num_teams_ub) { 7570 __kmp_fatal(KMP_MSG(FailedToCreateTeam, num_teams_lb, num_teams_ub), 7571 KMP_HNT(SetNewBound, __kmp_teams_max_nth), __kmp_msg_null); 7572 } 7573 7574 int num_teams = 1; // defalt number of teams is 1. 7575 7576 if (num_teams_lb == 0 && num_teams_ub > 0) 7577 num_teams_lb = num_teams_ub; 7578 7579 if (num_teams_lb == 0 && num_teams_ub == 0) { // no num_teams clause 7580 num_teams = (__kmp_nteams > 0) ? __kmp_nteams : num_teams; 7581 if (num_teams > __kmp_teams_max_nth) { 7582 if (!__kmp_reserve_warn) { 7583 __kmp_reserve_warn = 1; 7584 __kmp_msg(kmp_ms_warning, 7585 KMP_MSG(CantFormThrTeam, num_teams, __kmp_teams_max_nth), 7586 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null); 7587 } 7588 num_teams = __kmp_teams_max_nth; 7589 } 7590 } else if (num_teams_lb == num_teams_ub) { // requires exact number of teams 7591 num_teams = num_teams_ub; 7592 } else { // num_teams_lb <= num_teams <= num_teams_ub 7593 if (num_threads == 0) { 7594 if (num_teams_ub > __kmp_teams_max_nth) { 7595 num_teams = num_teams_lb; 7596 } else { 7597 num_teams = num_teams_ub; 7598 } 7599 } else { 7600 num_teams = (num_threads > __kmp_teams_max_nth) 7601 ? num_teams 7602 : __kmp_teams_max_nth / num_threads; 7603 if (num_teams < num_teams_lb) { 7604 num_teams = num_teams_lb; 7605 } else if (num_teams > num_teams_ub) { 7606 num_teams = num_teams_ub; 7607 } 7608 } 7609 } 7610 // Set number of teams (number of threads in the outer "parallel" of the 7611 // teams) 7612 thr->th.th_set_nproc = thr->th.th_teams_size.nteams = num_teams; 7613 7614 __kmp_push_thread_limit(thr, num_teams, num_threads); 7615 } 7616 7617 // Set the proc_bind var to use in the following parallel region. 7618 void __kmp_push_proc_bind(ident_t *id, int gtid, kmp_proc_bind_t proc_bind) { 7619 kmp_info_t *thr = __kmp_threads[gtid]; 7620 thr->th.th_set_proc_bind = proc_bind; 7621 } 7622 7623 /* Launch the worker threads into the microtask. */ 7624 7625 void __kmp_internal_fork(ident_t *id, int gtid, kmp_team_t *team) { 7626 kmp_info_t *this_thr = __kmp_threads[gtid]; 7627 7628 #ifdef KMP_DEBUG 7629 int f; 7630 #endif /* KMP_DEBUG */ 7631 7632 KMP_DEBUG_ASSERT(team); 7633 KMP_DEBUG_ASSERT(this_thr->th.th_team == team); 7634 KMP_ASSERT(KMP_MASTER_GTID(gtid)); 7635 KMP_MB(); /* Flush all pending memory write invalidates. 
*/ 7636 7637 team->t.t_construct = 0; /* no single directives seen yet */ 7638 team->t.t_ordered.dt.t_value = 7639 0; /* thread 0 enters the ordered section first */ 7640 7641 /* Reset the identifiers on the dispatch buffer */ 7642 KMP_DEBUG_ASSERT(team->t.t_disp_buffer); 7643 if (team->t.t_max_nproc > 1) { 7644 int i; 7645 for (i = 0; i < __kmp_dispatch_num_buffers; ++i) { 7646 team->t.t_disp_buffer[i].buffer_index = i; 7647 team->t.t_disp_buffer[i].doacross_buf_idx = i; 7648 } 7649 } else { 7650 team->t.t_disp_buffer[0].buffer_index = 0; 7651 team->t.t_disp_buffer[0].doacross_buf_idx = 0; 7652 } 7653 7654 KMP_MB(); /* Flush all pending memory write invalidates. */ 7655 KMP_ASSERT(this_thr->th.th_team == team); 7656 7657 #ifdef KMP_DEBUG 7658 for (f = 0; f < team->t.t_nproc; f++) { 7659 KMP_DEBUG_ASSERT(team->t.t_threads[f] && 7660 team->t.t_threads[f]->th.th_team_nproc == team->t.t_nproc); 7661 } 7662 #endif /* KMP_DEBUG */ 7663 7664 /* release the worker threads so they may begin working */ 7665 __kmp_fork_barrier(gtid, 0); 7666 } 7667 7668 void __kmp_internal_join(ident_t *id, int gtid, kmp_team_t *team) { 7669 kmp_info_t *this_thr = __kmp_threads[gtid]; 7670 7671 KMP_DEBUG_ASSERT(team); 7672 KMP_DEBUG_ASSERT(this_thr->th.th_team == team); 7673 KMP_ASSERT(KMP_MASTER_GTID(gtid)); 7674 KMP_MB(); /* Flush all pending memory write invalidates. */ 7675 7676 /* Join barrier after fork */ 7677 7678 #ifdef KMP_DEBUG 7679 if (__kmp_threads[gtid] && 7680 __kmp_threads[gtid]->th.th_team_nproc != team->t.t_nproc) { 7681 __kmp_printf("GTID: %d, __kmp_threads[%d]=%p\n", gtid, gtid, 7682 __kmp_threads[gtid]); 7683 __kmp_printf("__kmp_threads[%d]->th.th_team_nproc=%d, TEAM: %p, " 7684 "team->t.t_nproc=%d\n", 7685 gtid, __kmp_threads[gtid]->th.th_team_nproc, team, 7686 team->t.t_nproc); 7687 __kmp_print_structure(); 7688 } 7689 KMP_DEBUG_ASSERT(__kmp_threads[gtid] && 7690 __kmp_threads[gtid]->th.th_team_nproc == team->t.t_nproc); 7691 #endif /* KMP_DEBUG */ 7692 7693 __kmp_join_barrier(gtid); /* wait for everyone */ 7694 #if OMPT_SUPPORT 7695 if (ompt_enabled.enabled && 7696 this_thr->th.ompt_thread_info.state == ompt_state_wait_barrier_implicit) { 7697 int ds_tid = this_thr->th.th_info.ds.ds_tid; 7698 ompt_data_t *task_data = OMPT_CUR_TASK_DATA(this_thr); 7699 this_thr->th.ompt_thread_info.state = ompt_state_overhead; 7700 #if OMPT_OPTIONAL 7701 void *codeptr = NULL; 7702 if (KMP_MASTER_TID(ds_tid) && 7703 (ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) || 7704 ompt_callbacks.ompt_callback(ompt_callback_sync_region))) 7705 codeptr = OMPT_CUR_TEAM_INFO(this_thr)->master_return_address; 7706 7707 if (ompt_enabled.ompt_callback_sync_region_wait) { 7708 ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)( 7709 ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data, 7710 codeptr); 7711 } 7712 if (ompt_enabled.ompt_callback_sync_region) { 7713 ompt_callbacks.ompt_callback(ompt_callback_sync_region)( 7714 ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data, 7715 codeptr); 7716 } 7717 #endif 7718 if (!KMP_MASTER_TID(ds_tid) && ompt_enabled.ompt_callback_implicit_task) { 7719 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 7720 ompt_scope_end, NULL, task_data, 0, ds_tid, 7721 ompt_task_implicit); // TODO: Can this be ompt_task_initial? 7722 } 7723 } 7724 #endif 7725 7726 KMP_MB(); /* Flush all pending memory write invalidates. 
*/ 7727 KMP_ASSERT(this_thr->th.th_team == team); 7728 } 7729 7730 /* ------------------------------------------------------------------------ */ 7731 7732 #ifdef USE_LOAD_BALANCE 7733 7734 // Return the worker threads actively spinning in the hot team, if we 7735 // are at the outermost level of parallelism. Otherwise, return 0. 7736 static int __kmp_active_hot_team_nproc(kmp_root_t *root) { 7737 int i; 7738 int retval; 7739 kmp_team_t *hot_team; 7740 7741 if (root->r.r_active) { 7742 return 0; 7743 } 7744 hot_team = root->r.r_hot_team; 7745 if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME) { 7746 return hot_team->t.t_nproc - 1; // Don't count primary thread 7747 } 7748 7749 // Skip the primary thread - it is accounted for elsewhere. 7750 retval = 0; 7751 for (i = 1; i < hot_team->t.t_nproc; i++) { 7752 if (hot_team->t.t_threads[i]->th.th_active) { 7753 retval++; 7754 } 7755 } 7756 return retval; 7757 } 7758 7759 // Perform an automatic adjustment to the number of 7760 // threads used by the next parallel region. 7761 static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc) { 7762 int retval; 7763 int pool_active; 7764 int hot_team_active; 7765 int team_curr_active; 7766 int system_active; 7767 7768 KB_TRACE(20, ("__kmp_load_balance_nproc: called root:%p set_nproc:%d\n", root, 7769 set_nproc)); 7770 KMP_DEBUG_ASSERT(root); 7771 KMP_DEBUG_ASSERT(root->r.r_root_team->t.t_threads[0] 7772 ->th.th_current_task->td_icvs.dynamic == TRUE); 7773 KMP_DEBUG_ASSERT(set_nproc > 1); 7774 7775 if (set_nproc == 1) { 7776 KB_TRACE(20, ("__kmp_load_balance_nproc: serial execution.\n")); 7777 return 1; 7778 } 7779 7780 // Threads that are active in the thread pool, active in the hot team for this 7781 // particular root (if we are at the outer par level), and the currently 7782 // executing thread (to become the primary thread) are available to add to the 7783 // new team, but are currently contributing to the system load, and must be 7784 // accounted for. 7785 pool_active = __kmp_thread_pool_active_nth; 7786 hot_team_active = __kmp_active_hot_team_nproc(root); 7787 team_curr_active = pool_active + hot_team_active + 1; 7788 7789 // Check the system load. 7790 system_active = __kmp_get_load_balance(__kmp_avail_proc + team_curr_active); 7791 KB_TRACE(30, ("__kmp_load_balance_nproc: system active = %d pool active = %d " 7792 "hot team active = %d\n", 7793 system_active, pool_active, hot_team_active)); 7794 7795 if (system_active < 0) { 7796 // There was an error reading the necessary info from /proc, so use the 7797 // thread limit algorithm instead. Once we set __kmp_global.g.g_dynamic_mode 7798 // = dynamic_thread_limit, we shouldn't wind up getting back here. 7799 __kmp_global.g.g_dynamic_mode = dynamic_thread_limit; 7800 KMP_WARNING(CantLoadBalUsing, "KMP_DYNAMIC_MODE=thread limit"); 7801 7802 // Make this call behave like the thread limit algorithm. 7803 retval = __kmp_avail_proc - __kmp_nth + 7804 (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc); 7805 if (retval > set_nproc) { 7806 retval = set_nproc; 7807 } 7808 if (retval < KMP_MIN_NTH) { 7809 retval = KMP_MIN_NTH; 7810 } 7811 7812 KB_TRACE(20, ("__kmp_load_balance_nproc: thread limit exit. retval:%d\n", 7813 retval)); 7814 return retval; 7815 } 7816 7817 // There is a slight delay in the load balance algorithm in detecting new 7818 // running procs. The real system load at this instant should be at least as 7819 // large as the #active omp thread that are available to add to the team. 
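// Illustrative arithmetic (made-up numbers): with __kmp_avail_proc == 8,
// system_active == 5 and team_curr_active == 3, the code below yields
// retval = 8 - 5 + 3 = 6, which is then clamped into [KMP_MIN_NTH, set_nproc].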
7820 if (system_active < team_curr_active) { 7821 system_active = team_curr_active; 7822 } 7823 retval = __kmp_avail_proc - system_active + team_curr_active; 7824 if (retval > set_nproc) { 7825 retval = set_nproc; 7826 } 7827 if (retval < KMP_MIN_NTH) { 7828 retval = KMP_MIN_NTH; 7829 } 7830 7831 KB_TRACE(20, ("__kmp_load_balance_nproc: exit. retval:%d\n", retval)); 7832 return retval; 7833 } // __kmp_load_balance_nproc() 7834 7835 #endif /* USE_LOAD_BALANCE */ 7836 7837 /* ------------------------------------------------------------------------ */ 7838 7839 /* NOTE: this is called with the __kmp_init_lock held */ 7840 void __kmp_cleanup(void) { 7841 int f; 7842 7843 KA_TRACE(10, ("__kmp_cleanup: enter\n")); 7844 7845 if (TCR_4(__kmp_init_parallel)) { 7846 #if KMP_HANDLE_SIGNALS 7847 __kmp_remove_signals(); 7848 #endif 7849 TCW_4(__kmp_init_parallel, FALSE); 7850 } 7851 7852 if (TCR_4(__kmp_init_middle)) { 7853 #if KMP_AFFINITY_SUPPORTED 7854 __kmp_affinity_uninitialize(); 7855 #endif /* KMP_AFFINITY_SUPPORTED */ 7856 __kmp_cleanup_hierarchy(); 7857 TCW_4(__kmp_init_middle, FALSE); 7858 } 7859 7860 KA_TRACE(10, ("__kmp_cleanup: go serial cleanup\n")); 7861 7862 if (__kmp_init_serial) { 7863 __kmp_runtime_destroy(); 7864 __kmp_init_serial = FALSE; 7865 } 7866 7867 __kmp_cleanup_threadprivate_caches(); 7868 7869 for (f = 0; f < __kmp_threads_capacity; f++) { 7870 if (__kmp_root[f] != NULL) { 7871 __kmp_free(__kmp_root[f]); 7872 __kmp_root[f] = NULL; 7873 } 7874 } 7875 __kmp_free(__kmp_threads); 7876 // __kmp_threads and __kmp_root were allocated at once, as single block, so 7877 // there is no need in freeing __kmp_root. 7878 __kmp_threads = NULL; 7879 __kmp_root = NULL; 7880 __kmp_threads_capacity = 0; 7881 7882 #if KMP_USE_DYNAMIC_LOCK 7883 __kmp_cleanup_indirect_user_locks(); 7884 #else 7885 __kmp_cleanup_user_locks(); 7886 #endif 7887 #if OMPD_SUPPORT 7888 if (ompd_state) { 7889 __kmp_free(ompd_env_block); 7890 ompd_env_block = NULL; 7891 ompd_env_block_size = 0; 7892 } 7893 #endif 7894 7895 #if KMP_AFFINITY_SUPPORTED 7896 KMP_INTERNAL_FREE(CCAST(char *, __kmp_cpuinfo_file)); 7897 __kmp_cpuinfo_file = NULL; 7898 #endif /* KMP_AFFINITY_SUPPORTED */ 7899 7900 #if KMP_USE_ADAPTIVE_LOCKS 7901 #if KMP_DEBUG_ADAPTIVE_LOCKS 7902 __kmp_print_speculative_stats(); 7903 #endif 7904 #endif 7905 KMP_INTERNAL_FREE(__kmp_nested_nth.nth); 7906 __kmp_nested_nth.nth = NULL; 7907 __kmp_nested_nth.size = 0; 7908 __kmp_nested_nth.used = 0; 7909 KMP_INTERNAL_FREE(__kmp_nested_proc_bind.bind_types); 7910 __kmp_nested_proc_bind.bind_types = NULL; 7911 __kmp_nested_proc_bind.size = 0; 7912 __kmp_nested_proc_bind.used = 0; 7913 if (__kmp_affinity_format) { 7914 KMP_INTERNAL_FREE(__kmp_affinity_format); 7915 __kmp_affinity_format = NULL; 7916 } 7917 7918 __kmp_i18n_catclose(); 7919 7920 #if KMP_USE_HIER_SCHED 7921 __kmp_hier_scheds.deallocate(); 7922 #endif 7923 7924 #if KMP_STATS_ENABLED 7925 __kmp_stats_fini(); 7926 #endif 7927 7928 KA_TRACE(10, ("__kmp_cleanup: exit\n")); 7929 } 7930 7931 /* ------------------------------------------------------------------------ */ 7932 7933 int __kmp_ignore_mppbeg(void) { 7934 char *env; 7935 7936 if ((env = getenv("KMP_IGNORE_MPPBEG")) != NULL) { 7937 if (__kmp_str_match_false(env)) 7938 return FALSE; 7939 } 7940 // By default __kmpc_begin() is no-op. 
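// Illustrative override (assumes __kmp_str_match_false accepts the usual
// false spellings such as "0" or "false"): running with KMP_IGNORE_MPPBEG=false
// makes this routine return FALSE, so __kmpc_begin() does its work instead of
// being ignored.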
7941 return TRUE; 7942 } 7943 7944 int __kmp_ignore_mppend(void) { 7945 char *env; 7946 7947 if ((env = getenv("KMP_IGNORE_MPPEND")) != NULL) { 7948 if (__kmp_str_match_false(env)) 7949 return FALSE; 7950 } 7951 // By default __kmpc_end() is no-op. 7952 return TRUE; 7953 } 7954 7955 void __kmp_internal_begin(void) { 7956 int gtid; 7957 kmp_root_t *root; 7958 7959 /* this is a very important step as it will register new sibling threads 7960 and assign these new uber threads a new gtid */ 7961 gtid = __kmp_entry_gtid(); 7962 root = __kmp_threads[gtid]->th.th_root; 7963 KMP_ASSERT(KMP_UBER_GTID(gtid)); 7964 7965 if (root->r.r_begin) 7966 return; 7967 __kmp_acquire_lock(&root->r.r_begin_lock, gtid); 7968 if (root->r.r_begin) { 7969 __kmp_release_lock(&root->r.r_begin_lock, gtid); 7970 return; 7971 } 7972 7973 root->r.r_begin = TRUE; 7974 7975 __kmp_release_lock(&root->r.r_begin_lock, gtid); 7976 } 7977 7978 /* ------------------------------------------------------------------------ */ 7979 7980 void __kmp_user_set_library(enum library_type arg) { 7981 int gtid; 7982 kmp_root_t *root; 7983 kmp_info_t *thread; 7984 7985 /* first, make sure we are initialized so we can get our gtid */ 7986 7987 gtid = __kmp_entry_gtid(); 7988 thread = __kmp_threads[gtid]; 7989 7990 root = thread->th.th_root; 7991 7992 KA_TRACE(20, ("__kmp_user_set_library: enter T#%d, arg: %d, %d\n", gtid, arg, 7993 library_serial)); 7994 if (root->r.r_in_parallel) { /* Must be called in serial section of top-level 7995 thread */ 7996 KMP_WARNING(SetLibraryIncorrectCall); 7997 return; 7998 } 7999 8000 switch (arg) { 8001 case library_serial: 8002 thread->th.th_set_nproc = 0; 8003 set__nproc(thread, 1); 8004 break; 8005 case library_turnaround: 8006 thread->th.th_set_nproc = 0; 8007 set__nproc(thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth 8008 : __kmp_dflt_team_nth_ub); 8009 break; 8010 case library_throughput: 8011 thread->th.th_set_nproc = 0; 8012 set__nproc(thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth 8013 : __kmp_dflt_team_nth_ub); 8014 break; 8015 default: 8016 KMP_FATAL(UnknownLibraryType, arg); 8017 } 8018 8019 __kmp_aux_set_library(arg); 8020 } 8021 8022 void __kmp_aux_set_stacksize(size_t arg) { 8023 if (!__kmp_init_serial) 8024 __kmp_serial_initialize(); 8025 8026 #if KMP_OS_DARWIN 8027 if (arg & (0x1000 - 1)) { 8028 arg &= ~(0x1000 - 1); 8029 if (arg + 0x1000) /* check for overflow if we round up */ 8030 arg += 0x1000; 8031 } 8032 #endif 8033 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock); 8034 8035 /* only change the default stacksize before the first parallel region */ 8036 if (!TCR_4(__kmp_init_parallel)) { 8037 size_t value = arg; /* argument is in bytes */ 8038 8039 if (value < __kmp_sys_min_stksize) 8040 value = __kmp_sys_min_stksize; 8041 else if (value > KMP_MAX_STKSIZE) 8042 value = KMP_MAX_STKSIZE; 8043 8044 __kmp_stksize = value; 8045 8046 __kmp_env_stksize = TRUE; /* was KMP_STACKSIZE specified? */ 8047 } 8048 8049 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 8050 } 8051 8052 /* set the behaviour of the runtime library */ 8053 /* TODO this can cause some odd behaviour with sibling parallelism... 
*/ 8054 void __kmp_aux_set_library(enum library_type arg) { 8055 __kmp_library = arg; 8056 8057 switch (__kmp_library) { 8058 case library_serial: { 8059 KMP_INFORM(LibraryIsSerial); 8060 } break; 8061 case library_turnaround: 8062 if (__kmp_use_yield == 1 && !__kmp_use_yield_exp_set) 8063 __kmp_use_yield = 2; // only yield when oversubscribed 8064 break; 8065 case library_throughput: 8066 if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME) 8067 __kmp_dflt_blocktime = 200; 8068 break; 8069 default: 8070 KMP_FATAL(UnknownLibraryType, arg); 8071 } 8072 } 8073 8074 /* Getting team information common for all team API */ 8075 // Returns NULL if not in teams construct 8076 static kmp_team_t *__kmp_aux_get_team_info(int &teams_serialized) { 8077 kmp_info_t *thr = __kmp_entry_thread(); 8078 teams_serialized = 0; 8079 if (thr->th.th_teams_microtask) { 8080 kmp_team_t *team = thr->th.th_team; 8081 int tlevel = thr->th.th_teams_level; // the level of the teams construct 8082 int ii = team->t.t_level; 8083 teams_serialized = team->t.t_serialized; 8084 int level = tlevel + 1; 8085 KMP_DEBUG_ASSERT(ii >= tlevel); 8086 while (ii > level) { 8087 for (teams_serialized = team->t.t_serialized; 8088 (teams_serialized > 0) && (ii > level); teams_serialized--, ii--) { 8089 } 8090 if (team->t.t_serialized && (!teams_serialized)) { 8091 team = team->t.t_parent; 8092 continue; 8093 } 8094 if (ii > level) { 8095 team = team->t.t_parent; 8096 ii--; 8097 } 8098 } 8099 return team; 8100 } 8101 return NULL; 8102 } 8103 8104 int __kmp_aux_get_team_num() { 8105 int serialized; 8106 kmp_team_t *team = __kmp_aux_get_team_info(serialized); 8107 if (team) { 8108 if (serialized > 1) { 8109 return 0; // teams region is serialized ( 1 team of 1 thread ). 8110 } else { 8111 return team->t.t_master_tid; 8112 } 8113 } 8114 return 0; 8115 } 8116 8117 int __kmp_aux_get_num_teams() { 8118 int serialized; 8119 kmp_team_t *team = __kmp_aux_get_team_info(serialized); 8120 if (team) { 8121 if (serialized > 1) { 8122 return 1; 8123 } else { 8124 return team->t.t_parent->t.t_nproc; 8125 } 8126 } 8127 return 1; 8128 } 8129 8130 /* ------------------------------------------------------------------------ */ 8131 8132 /* 8133 * Affinity Format Parser 8134 * 8135 * Field is in form of: %[[[0].]size]type 8136 * % and type are required (%% means print a literal '%') 8137 * type is either single char or long name surrounded by {}, 8138 * e.g., N or {num_threads} 8139 * 0 => leading zeros 8140 * . => right justified when size is specified 8141 * by default output is left justified 8142 * size is the *minimum* field length 8143 * All other characters are printed as is 8144 * 8145 * Available field types: 8146 * L {thread_level} - omp_get_level() 8147 * n {thread_num} - omp_get_thread_num() 8148 * h {host} - name of host machine 8149 * P {process_id} - process id (integer) 8150 * T {thread_identifier} - native thread identifier (integer) 8151 * N {num_threads} - omp_get_num_threads() 8152 * A {ancestor_tnum} - omp_get_ancestor_thread_num(omp_get_level()-1) 8153 * a {thread_affinity} - comma separated list of integers or integer ranges 8154 * (values of affinity mask) 8155 * 8156 * Implementation-specific field types can be added 8157 * If a type is unknown, print "undefined" 8158 */ 8159 8160 // Structure holding the short name, long name, and corresponding data type 8161 // for snprintf. A table of these will represent the entire valid keyword 8162 // field types. 
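// For example (derived from __kmp_aux_capture_affinity_field() below;
// illustrative only): the field "%0.4L" canonicalizes through this table to
// short name 'L' with snprintf format "%04d" applied to the nesting level,
// while "%{thread_affinity}" (when KMP_AFFINITY_SUPPORTED) matches the long
// name for 'A' and becomes "%-s" applied to the printed affinity mask.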
8163 typedef struct kmp_affinity_format_field_t { 8164 char short_name; // from spec e.g., L -> thread level 8165 const char *long_name; // from spec thread_level -> thread level 8166 char field_format; // data type for snprintf (typically 'd' or 's' 8167 // for integer or string) 8168 } kmp_affinity_format_field_t; 8169 8170 static const kmp_affinity_format_field_t __kmp_affinity_format_table[] = { 8171 #if KMP_AFFINITY_SUPPORTED 8172 {'A', "thread_affinity", 's'}, 8173 #endif 8174 {'t', "team_num", 'd'}, 8175 {'T', "num_teams", 'd'}, 8176 {'L', "nesting_level", 'd'}, 8177 {'n', "thread_num", 'd'}, 8178 {'N', "num_threads", 'd'}, 8179 {'a', "ancestor_tnum", 'd'}, 8180 {'H', "host", 's'}, 8181 {'P', "process_id", 'd'}, 8182 {'i', "native_thread_id", 'd'}}; 8183 8184 // Return the number of characters it takes to hold field 8185 static int __kmp_aux_capture_affinity_field(int gtid, const kmp_info_t *th, 8186 const char **ptr, 8187 kmp_str_buf_t *field_buffer) { 8188 int rc, format_index, field_value; 8189 const char *width_left, *width_right; 8190 bool pad_zeros, right_justify, parse_long_name, found_valid_name; 8191 static const int FORMAT_SIZE = 20; 8192 char format[FORMAT_SIZE] = {0}; 8193 char absolute_short_name = 0; 8194 8195 KMP_DEBUG_ASSERT(gtid >= 0); 8196 KMP_DEBUG_ASSERT(th); 8197 KMP_DEBUG_ASSERT(**ptr == '%'); 8198 KMP_DEBUG_ASSERT(field_buffer); 8199 8200 __kmp_str_buf_clear(field_buffer); 8201 8202 // Skip the initial % 8203 (*ptr)++; 8204 8205 // Check for %% first 8206 if (**ptr == '%') { 8207 __kmp_str_buf_cat(field_buffer, "%", 1); 8208 (*ptr)++; // skip over the second % 8209 return 1; 8210 } 8211 8212 // Parse field modifiers if they are present 8213 pad_zeros = false; 8214 if (**ptr == '0') { 8215 pad_zeros = true; 8216 (*ptr)++; // skip over 0 8217 } 8218 right_justify = false; 8219 if (**ptr == '.') { 8220 right_justify = true; 8221 (*ptr)++; // skip over . 8222 } 8223 // Parse width of field: [width_left, width_right) 8224 width_left = width_right = NULL; 8225 if (**ptr >= '0' && **ptr <= '9') { 8226 width_left = *ptr; 8227 SKIP_DIGITS(*ptr); 8228 width_right = *ptr; 8229 } 8230 8231 // Create the format for KMP_SNPRINTF based on flags parsed above 8232 format_index = 0; 8233 format[format_index++] = '%'; 8234 if (!right_justify) 8235 format[format_index++] = '-'; 8236 if (pad_zeros) 8237 format[format_index++] = '0'; 8238 if (width_left && width_right) { 8239 int i = 0; 8240 // Only allow 8 digit number widths. 
8241 // This also prevents overflowing format variable 8242 while (i < 8 && width_left < width_right) { 8243 format[format_index++] = *width_left; 8244 width_left++; 8245 i++; 8246 } 8247 } 8248 8249 // Parse a name (long or short) 8250 // Canonicalize the name into absolute_short_name 8251 found_valid_name = false; 8252 parse_long_name = (**ptr == '{'); 8253 if (parse_long_name) 8254 (*ptr)++; // skip initial left brace 8255 for (size_t i = 0; i < sizeof(__kmp_affinity_format_table) / 8256 sizeof(__kmp_affinity_format_table[0]); 8257 ++i) { 8258 char short_name = __kmp_affinity_format_table[i].short_name; 8259 const char *long_name = __kmp_affinity_format_table[i].long_name; 8260 char field_format = __kmp_affinity_format_table[i].field_format; 8261 if (parse_long_name) { 8262 size_t length = KMP_STRLEN(long_name); 8263 if (strncmp(*ptr, long_name, length) == 0) { 8264 found_valid_name = true; 8265 (*ptr) += length; // skip the long name 8266 } 8267 } else if (**ptr == short_name) { 8268 found_valid_name = true; 8269 (*ptr)++; // skip the short name 8270 } 8271 if (found_valid_name) { 8272 format[format_index++] = field_format; 8273 format[format_index++] = '\0'; 8274 absolute_short_name = short_name; 8275 break; 8276 } 8277 } 8278 if (parse_long_name) { 8279 if (**ptr != '}') { 8280 absolute_short_name = 0; 8281 } else { 8282 (*ptr)++; // skip over the right brace 8283 } 8284 } 8285 8286 // Attempt to fill the buffer with the requested 8287 // value using snprintf within __kmp_str_buf_print() 8288 switch (absolute_short_name) { 8289 case 't': 8290 rc = __kmp_str_buf_print(field_buffer, format, __kmp_aux_get_team_num()); 8291 break; 8292 case 'T': 8293 rc = __kmp_str_buf_print(field_buffer, format, __kmp_aux_get_num_teams()); 8294 break; 8295 case 'L': 8296 rc = __kmp_str_buf_print(field_buffer, format, th->th.th_team->t.t_level); 8297 break; 8298 case 'n': 8299 rc = __kmp_str_buf_print(field_buffer, format, __kmp_tid_from_gtid(gtid)); 8300 break; 8301 case 'H': { 8302 static const int BUFFER_SIZE = 256; 8303 char buf[BUFFER_SIZE]; 8304 __kmp_expand_host_name(buf, BUFFER_SIZE); 8305 rc = __kmp_str_buf_print(field_buffer, format, buf); 8306 } break; 8307 case 'P': 8308 rc = __kmp_str_buf_print(field_buffer, format, getpid()); 8309 break; 8310 case 'i': 8311 rc = __kmp_str_buf_print(field_buffer, format, __kmp_gettid()); 8312 break; 8313 case 'N': 8314 rc = __kmp_str_buf_print(field_buffer, format, th->th.th_team->t.t_nproc); 8315 break; 8316 case 'a': 8317 field_value = 8318 __kmp_get_ancestor_thread_num(gtid, th->th.th_team->t.t_level - 1); 8319 rc = __kmp_str_buf_print(field_buffer, format, field_value); 8320 break; 8321 #if KMP_AFFINITY_SUPPORTED 8322 case 'A': { 8323 kmp_str_buf_t buf; 8324 __kmp_str_buf_init(&buf); 8325 __kmp_affinity_str_buf_mask(&buf, th->th.th_affin_mask); 8326 rc = __kmp_str_buf_print(field_buffer, format, buf.str); 8327 __kmp_str_buf_free(&buf); 8328 } break; 8329 #endif 8330 default: 8331 // According to spec, If an implementation does not have info for field 8332 // type, then "undefined" is printed 8333 rc = __kmp_str_buf_print(field_buffer, "%s", "undefined"); 8334 // Skip the field 8335 if (parse_long_name) { 8336 SKIP_TOKEN(*ptr); 8337 if (**ptr == '}') 8338 (*ptr)++; 8339 } else { 8340 (*ptr)++; 8341 } 8342 } 8343 8344 KMP_ASSERT(format_index <= FORMAT_SIZE); 8345 return rc; 8346 } 8347 8348 /* 8349 * Return number of characters needed to hold the affinity string 8350 * (not including null byte character) 8351 * The resultant string is printed to buffer, 
which the caller can then 8352 * handle afterwards 8353 */ 8354 size_t __kmp_aux_capture_affinity(int gtid, const char *format, 8355 kmp_str_buf_t *buffer) { 8356 const char *parse_ptr; 8357 size_t retval; 8358 const kmp_info_t *th; 8359 kmp_str_buf_t field; 8360 8361 KMP_DEBUG_ASSERT(buffer); 8362 KMP_DEBUG_ASSERT(gtid >= 0); 8363 8364 __kmp_str_buf_init(&field); 8365 __kmp_str_buf_clear(buffer); 8366 8367 th = __kmp_threads[gtid]; 8368 retval = 0; 8369 8370 // If format is NULL or zero-length string, then we use 8371 // affinity-format-var ICV 8372 parse_ptr = format; 8373 if (parse_ptr == NULL || *parse_ptr == '\0') { 8374 parse_ptr = __kmp_affinity_format; 8375 } 8376 KMP_DEBUG_ASSERT(parse_ptr); 8377 8378 while (*parse_ptr != '\0') { 8379 // Parse a field 8380 if (*parse_ptr == '%') { 8381 // Put field in the buffer 8382 int rc = __kmp_aux_capture_affinity_field(gtid, th, &parse_ptr, &field); 8383 __kmp_str_buf_catbuf(buffer, &field); 8384 retval += rc; 8385 } else { 8386 // Put literal character in buffer 8387 __kmp_str_buf_cat(buffer, parse_ptr, 1); 8388 retval++; 8389 parse_ptr++; 8390 } 8391 } 8392 __kmp_str_buf_free(&field); 8393 return retval; 8394 } 8395 8396 // Displays the affinity string to stdout 8397 void __kmp_aux_display_affinity(int gtid, const char *format) { 8398 kmp_str_buf_t buf; 8399 __kmp_str_buf_init(&buf); 8400 __kmp_aux_capture_affinity(gtid, format, &buf); 8401 __kmp_fprintf(kmp_out, "%s" KMP_END_OF_LINE, buf.str); 8402 __kmp_str_buf_free(&buf); 8403 } 8404 8405 /* ------------------------------------------------------------------------ */ 8406 8407 void __kmp_aux_set_blocktime(int arg, kmp_info_t *thread, int tid) { 8408 int blocktime = arg; /* argument is in milliseconds */ 8409 #if KMP_USE_MONITOR 8410 int bt_intervals; 8411 #endif 8412 kmp_int8 bt_set; 8413 8414 __kmp_save_internal_controls(thread); 8415 8416 /* Normalize and set blocktime for the teams */ 8417 if (blocktime < KMP_MIN_BLOCKTIME) 8418 blocktime = KMP_MIN_BLOCKTIME; 8419 else if (blocktime > KMP_MAX_BLOCKTIME) 8420 blocktime = KMP_MAX_BLOCKTIME; 8421 8422 set__blocktime_team(thread->th.th_team, tid, blocktime); 8423 set__blocktime_team(thread->th.th_serial_team, 0, blocktime); 8424 8425 #if KMP_USE_MONITOR 8426 /* Calculate and set blocktime intervals for the teams */ 8427 bt_intervals = KMP_INTERVALS_FROM_BLOCKTIME(blocktime, __kmp_monitor_wakeups); 8428 8429 set__bt_intervals_team(thread->th.th_team, tid, bt_intervals); 8430 set__bt_intervals_team(thread->th.th_serial_team, 0, bt_intervals); 8431 #endif 8432 8433 /* Set whether blocktime has been set to "TRUE" */ 8434 bt_set = TRUE; 8435 8436 set__bt_set_team(thread->th.th_team, tid, bt_set); 8437 set__bt_set_team(thread->th.th_serial_team, 0, bt_set); 8438 #if KMP_USE_MONITOR 8439 KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d, " 8440 "bt_intervals=%d, monitor_updates=%d\n", 8441 __kmp_gtid_from_tid(tid, thread->th.th_team), 8442 thread->th.th_team->t.t_id, tid, blocktime, bt_intervals, 8443 __kmp_monitor_wakeups)); 8444 #else 8445 KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d\n", 8446 __kmp_gtid_from_tid(tid, thread->th.th_team), 8447 thread->th.th_team->t.t_id, tid, blocktime)); 8448 #endif 8449 } 8450 8451 void __kmp_aux_set_defaults(char const *str, size_t len) { 8452 if (!__kmp_init_serial) { 8453 __kmp_serial_initialize(); 8454 } 8455 __kmp_env_initialize(str); 8456 8457 if (__kmp_settings || __kmp_display_env || __kmp_display_env_verbose) { 8458 __kmp_env_print(); 8459 } 8460 } // __kmp_aux_set_defaults 
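/* Usage sketch (illustrative, not part of the runtime): the capture/display
   helpers above are the internal workers behind the OpenMP 5.0 affinity-format
   API (omp_set_affinity_format, omp_capture_affinity, omp_display_affinity).
   A user program might drive them like this; the exact output depends on the
   affinity-format-var ICV and on the platform:

     #include <omp.h>
     #include <cstdio>

     int main() {
       // Field syntax matches the parser above: %[[[0].]size]type
       omp_set_affinity_format("pid %P tid %i: thread %0.3n of %N, mask %A");
       #pragma omp parallel
       {
         omp_display_affinity(nullptr); // NULL/empty => use affinity-format-var

         char buf[512];
         size_t needed = omp_capture_affinity(buf, sizeof(buf), "%{thread_num}");
         if (needed < sizeof(buf))
           std::printf("captured: %s\n", buf);
       }
       return 0;
     }
*/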
8461 8462 /* ------------------------------------------------------------------------ */ 8463 /* internal fast reduction routines */ 8464 8465 PACKED_REDUCTION_METHOD_T 8466 __kmp_determine_reduction_method( 8467 ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars, size_t reduce_size, 8468 void *reduce_data, void (*reduce_func)(void *lhs_data, void *rhs_data), 8469 kmp_critical_name *lck) { 8470 8471 // Default reduction method: critical construct ( lck != NULL, like in current 8472 // PAROPT ) 8473 // If ( reduce_data!=NULL && reduce_func!=NULL ): the tree-reduction method 8474 // can be selected by RTL 8475 // If loc->flags contains KMP_IDENT_ATOMIC_REDUCE, the atomic reduce method 8476 // can be selected by RTL 8477 // Finally, it's up to OpenMP RTL to make a decision on which method to select 8478 // among generated by PAROPT. 8479 8480 PACKED_REDUCTION_METHOD_T retval; 8481 8482 int team_size; 8483 8484 KMP_DEBUG_ASSERT(loc); // it would be nice to test ( loc != 0 ) 8485 KMP_DEBUG_ASSERT(lck); // it would be nice to test ( lck != 0 ) 8486 8487 #define FAST_REDUCTION_ATOMIC_METHOD_GENERATED \ 8488 ((loc->flags & (KMP_IDENT_ATOMIC_REDUCE)) == (KMP_IDENT_ATOMIC_REDUCE)) 8489 #define FAST_REDUCTION_TREE_METHOD_GENERATED ((reduce_data) && (reduce_func)) 8490 8491 retval = critical_reduce_block; 8492 8493 // another choice of getting a team size (with 1 dynamic deference) is slower 8494 team_size = __kmp_get_team_num_threads(global_tid); 8495 if (team_size == 1) { 8496 8497 retval = empty_reduce_block; 8498 8499 } else { 8500 8501 int atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED; 8502 8503 #if KMP_ARCH_X86_64 || KMP_ARCH_PPC64 || KMP_ARCH_AARCH64 || \ 8504 KMP_ARCH_MIPS64 || KMP_ARCH_RISCV64 8505 8506 #if KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD || \ 8507 KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN || KMP_OS_HURD 8508 8509 int teamsize_cutoff = 4; 8510 8511 #if KMP_MIC_SUPPORTED 8512 if (__kmp_mic_type != non_mic) { 8513 teamsize_cutoff = 8; 8514 } 8515 #endif 8516 int tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED; 8517 if (tree_available) { 8518 if (team_size <= teamsize_cutoff) { 8519 if (atomic_available) { 8520 retval = atomic_reduce_block; 8521 } 8522 } else { 8523 retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER; 8524 } 8525 } else if (atomic_available) { 8526 retval = atomic_reduce_block; 8527 } 8528 #else 8529 #error "Unknown or unsupported OS" 8530 #endif // KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD || 8531 // KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN || KMP_OS_HURD 8532 8533 #elif KMP_ARCH_X86 || KMP_ARCH_ARM || KMP_ARCH_AARCH || KMP_ARCH_MIPS 8534 8535 #if KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_WINDOWS || KMP_OS_HURD 8536 8537 // basic tuning 8538 8539 if (atomic_available) { 8540 if (num_vars <= 2) { // && ( team_size <= 8 ) due to false-sharing ??? 
8541 retval = atomic_reduce_block; 8542 } 8543 } // otherwise: use critical section 8544 8545 #elif KMP_OS_DARWIN 8546 8547 int tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED; 8548 if (atomic_available && (num_vars <= 3)) { 8549 retval = atomic_reduce_block; 8550 } else if (tree_available) { 8551 if ((reduce_size > (9 * sizeof(kmp_real64))) && 8552 (reduce_size < (2000 * sizeof(kmp_real64)))) { 8553 retval = TREE_REDUCE_BLOCK_WITH_PLAIN_BARRIER; 8554 } 8555 } // otherwise: use critical section 8556 8557 #else 8558 #error "Unknown or unsupported OS" 8559 #endif 8560 8561 #else 8562 #error "Unknown or unsupported architecture" 8563 #endif 8564 } 8565 8566 // KMP_FORCE_REDUCTION 8567 8568 // If the team is serialized (team_size == 1), ignore the forced reduction 8569 // method and stay with the unsynchronized method (empty_reduce_block) 8570 if (__kmp_force_reduction_method != reduction_method_not_defined && 8571 team_size != 1) { 8572 8573 PACKED_REDUCTION_METHOD_T forced_retval = critical_reduce_block; 8574 8575 int atomic_available, tree_available; 8576 8577 switch ((forced_retval = __kmp_force_reduction_method)) { 8578 case critical_reduce_block: 8579 KMP_ASSERT(lck); // lck should be != 0 8580 break; 8581 8582 case atomic_reduce_block: 8583 atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED; 8584 if (!atomic_available) { 8585 KMP_WARNING(RedMethodNotSupported, "atomic"); 8586 forced_retval = critical_reduce_block; 8587 } 8588 break; 8589 8590 case tree_reduce_block: 8591 tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED; 8592 if (!tree_available) { 8593 KMP_WARNING(RedMethodNotSupported, "tree"); 8594 forced_retval = critical_reduce_block; 8595 } else { 8596 #if KMP_FAST_REDUCTION_BARRIER 8597 forced_retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER; 8598 #endif 8599 } 8600 break; 8601 8602 default: 8603 KMP_ASSERT(0); // "unsupported method specified" 8604 } 8605 8606 retval = forced_retval; 8607 } 8608 8609 KA_TRACE(10, ("reduction method selected=%08x\n", retval)); 8610 8611 #undef FAST_REDUCTION_TREE_METHOD_GENERATED 8612 #undef FAST_REDUCTION_ATOMIC_METHOD_GENERATED 8613 8614 return (retval); 8615 } 8616 // this function is for testing set/get/determine reduce method 8617 kmp_int32 __kmp_get_reduce_method(void) { 8618 return ((__kmp_entry_thread()->th.th_local.packed_reduction_method) >> 8); 8619 } 8620 8621 // Soft pause sets up threads to ignore blocktime and just go to sleep. 8622 // Spin-wait code checks __kmp_pause_status and reacts accordingly. 8623 void __kmp_soft_pause() { __kmp_pause_status = kmp_soft_paused; } 8624 8625 // Hard pause shuts down the runtime completely. Resume happens naturally when 8626 // OpenMP is used subsequently. 8627 void __kmp_hard_pause() { 8628 __kmp_pause_status = kmp_hard_paused; 8629 __kmp_internal_end_thread(-1); 8630 } 8631 8632 // Soft resume sets __kmp_pause_status, and wakes up all threads. 
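/* Usage sketch (illustrative, not part of the runtime): __kmp_soft_pause() /
   __kmp_hard_pause() above and __kmp_pause_resource() below sit behind the
   OpenMP 5.0 pause API (omp_pause_resource / omp_pause_resource_all).  A host
   program might release runtime resources between phases roughly like this:

     #include <omp.h>
     #include <cstdio>

     static void phase() {
       #pragma omp parallel
       std::printf("hello from thread %d\n", omp_get_thread_num());
     }

     int main() {
       phase();
       // omp_pause_soft keeps enough state to resume cheaply; omp_pause_hard
       // shuts the runtime down completely (it re-initializes on next use).
       // Returns 0 on success.
       if (omp_pause_resource_all(omp_pause_soft) != 0)
         std::printf("pause request was not honored\n");
       phase(); // touching OpenMP again resumes the paused runtime
       return 0;
     }

   __kmp_resume_if_soft_paused() below is the internal hook used when the
   runtime is next needed after a soft pause. */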
8633 void __kmp_resume_if_soft_paused() { 8634 if (__kmp_pause_status == kmp_soft_paused) { 8635 __kmp_pause_status = kmp_not_paused; 8636 8637 for (int gtid = 1; gtid < __kmp_threads_capacity; ++gtid) { 8638 kmp_info_t *thread = __kmp_threads[gtid]; 8639 if (thread) { // Wake it if sleeping 8640 kmp_flag_64<> fl(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go, 8641 thread); 8642 if (fl.is_sleeping()) 8643 fl.resume(gtid); 8644 else if (__kmp_try_suspend_mx(thread)) { // got suspend lock 8645 __kmp_unlock_suspend_mx(thread); // unlock it; it won't sleep 8646 } else { // thread holds the lock and may sleep soon 8647 do { // until either the thread sleeps, or we can get the lock 8648 if (fl.is_sleeping()) { 8649 fl.resume(gtid); 8650 break; 8651 } else if (__kmp_try_suspend_mx(thread)) { 8652 __kmp_unlock_suspend_mx(thread); 8653 break; 8654 } 8655 } while (1); 8656 } 8657 } 8658 } 8659 } 8660 } 8661 8662 // This function is called via __kmpc_pause_resource. Returns 0 if successful. 8663 // TODO: add warning messages 8664 int __kmp_pause_resource(kmp_pause_status_t level) { 8665 if (level == kmp_not_paused) { // requesting resume 8666 if (__kmp_pause_status == kmp_not_paused) { 8667 // error message about runtime not being paused, so can't resume 8668 return 1; 8669 } else { 8670 KMP_DEBUG_ASSERT(__kmp_pause_status == kmp_soft_paused || 8671 __kmp_pause_status == kmp_hard_paused); 8672 __kmp_pause_status = kmp_not_paused; 8673 return 0; 8674 } 8675 } else if (level == kmp_soft_paused) { // requesting soft pause 8676 if (__kmp_pause_status != kmp_not_paused) { 8677 // error message about already being paused 8678 return 1; 8679 } else { 8680 __kmp_soft_pause(); 8681 return 0; 8682 } 8683 } else if (level == kmp_hard_paused) { // requesting hard pause 8684 if (__kmp_pause_status != kmp_not_paused) { 8685 // error message about already being paused 8686 return 1; 8687 } else { 8688 __kmp_hard_pause(); 8689 return 0; 8690 } 8691 } else { 8692 // error message about invalid level 8693 return 1; 8694 } 8695 } 8696 8697 void __kmp_omp_display_env(int verbose) { 8698 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock); 8699 if (__kmp_init_serial == 0) 8700 __kmp_do_serial_initialize(); 8701 __kmp_display_env_impl(!verbose, verbose); 8702 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 8703 } 8704 8705 // Globals and functions for hidden helper task 8706 kmp_info_t **__kmp_hidden_helper_threads; 8707 kmp_info_t *__kmp_hidden_helper_main_thread; 8708 std::atomic<kmp_int32> __kmp_unexecuted_hidden_helper_tasks; 8709 #if KMP_OS_LINUX 8710 kmp_int32 __kmp_hidden_helper_threads_num = 8; 8711 kmp_int32 __kmp_enable_hidden_helper = TRUE; 8712 #else 8713 kmp_int32 __kmp_hidden_helper_threads_num = 0; 8714 kmp_int32 __kmp_enable_hidden_helper = FALSE; 8715 #endif 8716 8717 namespace { 8718 std::atomic<kmp_int32> __kmp_hit_hidden_helper_threads_num; 8719 8720 void __kmp_hidden_helper_wrapper_fn(int *gtid, int *, ...) { 8721 // This is an explicit synchronization on all hidden helper threads in case 8722 // that when a regular thread pushes a hidden helper task to one hidden 8723 // helper thread, the thread has not been awaken once since they're released 8724 // by the main thread after creating the team. 
8725 KMP_ATOMIC_INC(&__kmp_hit_hidden_helper_threads_num); 8726 while (KMP_ATOMIC_LD_ACQ(&__kmp_hit_hidden_helper_threads_num) != 8727 __kmp_hidden_helper_threads_num) 8728 ; 8729 8730 // If main thread, then wait for signal 8731 if (__kmpc_master(nullptr, *gtid)) { 8732 // First, unset the initial state and release the initial thread 8733 TCW_4(__kmp_init_hidden_helper_threads, FALSE); 8734 __kmp_hidden_helper_initz_release(); 8735 __kmp_hidden_helper_main_thread_wait(); 8736 // Now wake up all worker threads 8737 for (int i = 1; i < __kmp_hit_hidden_helper_threads_num; ++i) { 8738 __kmp_hidden_helper_worker_thread_signal(); 8739 } 8740 } 8741 } 8742 } // namespace 8743 8744 void __kmp_hidden_helper_threads_initz_routine() { 8745 // Create a new root for hidden helper team/threads 8746 const int gtid = __kmp_register_root(TRUE); 8747 __kmp_hidden_helper_main_thread = __kmp_threads[gtid]; 8748 __kmp_hidden_helper_threads = &__kmp_threads[gtid]; 8749 __kmp_hidden_helper_main_thread->th.th_set_nproc = 8750 __kmp_hidden_helper_threads_num; 8751 8752 KMP_ATOMIC_ST_REL(&__kmp_hit_hidden_helper_threads_num, 0); 8753 8754 __kmpc_fork_call(nullptr, 0, __kmp_hidden_helper_wrapper_fn); 8755 8756 // Set the initialization flag to FALSE 8757 TCW_SYNC_4(__kmp_init_hidden_helper, FALSE); 8758 8759 __kmp_hidden_helper_threads_deinitz_release(); 8760 } 8761 8762 /* Nesting Mode: 8763 Set via KMP_NESTING_MODE, which takes an integer. 8764 Note: we skip duplicate topology levels, and skip levels with only 8765 one entity. 8766 KMP_NESTING_MODE=0 is the default, and doesn't use nesting mode. 8767 KMP_NESTING_MODE=1 sets as many nesting levels as there are distinct levels 8768 in the topology, and initializes the number of threads at each of those 8769 levels to the number of entities at each level, respectively, below the 8770 entity at the parent level. 8771 KMP_NESTING_MODE=N, where N>1, attempts to create up to N nesting levels, 8772 but starts with nesting OFF -- max-active-levels-var is 1 -- and requires 8773 the user to turn nesting on explicitly. This is an even more experimental 8774 option to this experimental feature, and may change or go away in the 8775 future. 
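   For example (illustrative): if topology detection reports 2 sockets x 8
   cores x 2 hardware threads and no levels are skipped, KMP_NESTING_MODE=1
   behaves roughly as if OMP_NUM_THREADS=2,8,2 and OMP_MAX_ACTIVE_LEVELS=3 had
   been set, so three nested parallel levels get 2, then 8, then 2 threads.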
8776 */ 8777 8778 // Allocate space to store nesting levels 8779 void __kmp_init_nesting_mode() { 8780 int levels = KMP_HW_LAST; 8781 __kmp_nesting_mode_nlevels = levels; 8782 __kmp_nesting_nth_level = (int *)KMP_INTERNAL_MALLOC(levels * sizeof(int)); 8783 for (int i = 0; i < levels; ++i) 8784 __kmp_nesting_nth_level[i] = 0; 8785 if (__kmp_nested_nth.size < levels) { 8786 __kmp_nested_nth.nth = 8787 (int *)KMP_INTERNAL_REALLOC(__kmp_nested_nth.nth, levels * sizeof(int)); 8788 __kmp_nested_nth.size = levels; 8789 } 8790 } 8791 8792 // Set # threads for top levels of nesting; must be called after topology set 8793 void __kmp_set_nesting_mode_threads() { 8794 kmp_info_t *thread = __kmp_threads[__kmp_entry_gtid()]; 8795 8796 if (__kmp_nesting_mode == 1) 8797 __kmp_nesting_mode_nlevels = KMP_MAX_ACTIVE_LEVELS_LIMIT; 8798 else if (__kmp_nesting_mode > 1) 8799 __kmp_nesting_mode_nlevels = __kmp_nesting_mode; 8800 8801 if (__kmp_topology) { // use topology info 8802 int loc, hw_level; 8803 for (loc = 0, hw_level = 0; hw_level < __kmp_topology->get_depth() && 8804 loc < __kmp_nesting_mode_nlevels; 8805 loc++, hw_level++) { 8806 __kmp_nesting_nth_level[loc] = __kmp_topology->get_ratio(hw_level); 8807 if (__kmp_nesting_nth_level[loc] == 1) 8808 loc--; 8809 } 8810 // Make sure all cores are used 8811 if (__kmp_nesting_mode > 1 && loc > 1) { 8812 int core_level = __kmp_topology->get_level(KMP_HW_CORE); 8813 int num_cores = __kmp_topology->get_count(core_level); 8814 int upper_levels = 1; 8815 for (int level = 0; level < loc - 1; ++level) 8816 upper_levels *= __kmp_nesting_nth_level[level]; 8817 if (upper_levels * __kmp_nesting_nth_level[loc - 1] < num_cores) 8818 __kmp_nesting_nth_level[loc - 1] = 8819 num_cores / __kmp_nesting_nth_level[loc - 2]; 8820 } 8821 __kmp_nesting_mode_nlevels = loc; 8822 __kmp_nested_nth.used = __kmp_nesting_mode_nlevels; 8823 } else { // no topology info available; provide a reasonable guesstimation 8824 if (__kmp_avail_proc >= 4) { 8825 __kmp_nesting_nth_level[0] = __kmp_avail_proc / 2; 8826 __kmp_nesting_nth_level[1] = 2; 8827 __kmp_nesting_mode_nlevels = 2; 8828 } else { 8829 __kmp_nesting_nth_level[0] = __kmp_avail_proc; 8830 __kmp_nesting_mode_nlevels = 1; 8831 } 8832 __kmp_nested_nth.used = __kmp_nesting_mode_nlevels; 8833 } 8834 for (int i = 0; i < __kmp_nesting_mode_nlevels; ++i) { 8835 __kmp_nested_nth.nth[i] = __kmp_nesting_nth_level[i]; 8836 } 8837 set__nproc(thread, __kmp_nesting_nth_level[0]); 8838 if (__kmp_nesting_mode > 1 && __kmp_nesting_mode_nlevels > __kmp_nesting_mode) 8839 __kmp_nesting_mode_nlevels = __kmp_nesting_mode; 8840 if (get__max_active_levels(thread) > 1) { 8841 // if max levels was set, set nesting mode levels to same 8842 __kmp_nesting_mode_nlevels = get__max_active_levels(thread); 8843 } 8844 if (__kmp_nesting_mode == 1) // turn on nesting for this case only 8845 set__max_active_levels(thread, __kmp_nesting_mode_nlevels); 8846 } 8847
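/* Usage sketch (illustrative, not part of the runtime): with KMP_NESTING_MODE=1
   and the per-level thread counts initialized above, an otherwise untuned
   program can follow the machine topology with plain nested parallel regions.
   Assuming the hypothetical 2 x 8 x 2 topology from the comment above:

     #include <omp.h>
     #include <cstdio>

     int main() {
       #pragma omp parallel // level 1: one thread per socket (2)
       {
         #pragma omp parallel // level 2: one thread per core (8)
         {
           #pragma omp parallel // level 3: one thread per HW thread (2)
           {
             #pragma omp critical
             std::printf("level %d: %d threads, ancestor %d\n",
                         omp_get_level(), omp_get_num_threads(),
                         omp_get_ancestor_thread_num(omp_get_level() - 1));
           }
         }
       }
       return 0;
     }
*/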