1 /* 2 * kmp_runtime.cpp -- KPTS runtime support library 3 */ 4 5 //===----------------------------------------------------------------------===// 6 // 7 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 8 // See https://llvm.org/LICENSE.txt for license information. 9 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 10 // 11 //===----------------------------------------------------------------------===// 12 13 #include "kmp.h" 14 #include "kmp_affinity.h" 15 #include "kmp_atomic.h" 16 #include "kmp_environment.h" 17 #include "kmp_error.h" 18 #include "kmp_i18n.h" 19 #include "kmp_io.h" 20 #include "kmp_itt.h" 21 #include "kmp_settings.h" 22 #include "kmp_stats.h" 23 #include "kmp_str.h" 24 #include "kmp_wait_release.h" 25 #include "kmp_wrapper_getpid.h" 26 #include "kmp_dispatch.h" 27 #include "kmp_utils.h" 28 #if KMP_USE_HIER_SCHED 29 #include "kmp_dispatch_hier.h" 30 #endif 31 32 #if OMPT_SUPPORT 33 #include "ompt-specific.h" 34 #endif 35 #if OMPD_SUPPORT 36 #include "ompd-specific.h" 37 #endif 38 39 #if OMP_PROFILING_SUPPORT 40 #include "llvm/Support/TimeProfiler.h" 41 static char *ProfileTraceFile = nullptr; 42 #endif 43 44 /* these are temporary issues to be dealt with */ 45 #define KMP_USE_PRCTL 0 46 47 #if KMP_OS_WINDOWS 48 #include <process.h> 49 #endif 50 51 #ifndef KMP_USE_SHM 52 // Windows and WASI do not need these include files as they don't use shared 53 // memory. 54 #else 55 #include <sys/mman.h> 56 #include <sys/stat.h> 57 #include <fcntl.h> 58 #define SHM_SIZE 1024 59 #endif 60 61 #if defined(KMP_GOMP_COMPAT) 62 char const __kmp_version_alt_comp[] = 63 KMP_VERSION_PREFIX "alternative compiler support: yes"; 64 #endif /* defined(KMP_GOMP_COMPAT) */ 65 66 char const __kmp_version_omp_api[] = 67 KMP_VERSION_PREFIX "API version: 5.0 (201611)"; 68 69 #ifdef KMP_DEBUG 70 char const __kmp_version_lock[] = 71 KMP_VERSION_PREFIX "lock type: run time selectable"; 72 #endif /* KMP_DEBUG */ 73 74 #define KMP_MIN(x, y) ((x) < (y) ? (x) : (y)) 75 76 /* ------------------------------------------------------------------------ */ 77 78 #if KMP_USE_MONITOR 79 kmp_info_t __kmp_monitor; 80 #endif 81 82 /* Forward declarations */ 83 84 void __kmp_cleanup(void); 85 86 static void __kmp_initialize_info(kmp_info_t *, kmp_team_t *, int tid, 87 int gtid); 88 static void __kmp_initialize_team(kmp_team_t *team, int new_nproc, 89 kmp_internal_control_t *new_icvs, 90 ident_t *loc); 91 #if KMP_AFFINITY_SUPPORTED 92 static void __kmp_partition_places(kmp_team_t *team, 93 int update_master_only = 0); 94 #endif 95 static void __kmp_do_serial_initialize(void); 96 void __kmp_fork_barrier(int gtid, int tid); 97 void __kmp_join_barrier(int gtid); 98 void __kmp_setup_icv_copy(kmp_team_t *team, int new_nproc, 99 kmp_internal_control_t *new_icvs, ident_t *loc); 100 101 #ifdef USE_LOAD_BALANCE 102 static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc); 103 #endif 104 105 static int __kmp_expand_threads(int nNeed); 106 #if KMP_OS_WINDOWS 107 static int __kmp_unregister_root_other_thread(int gtid); 108 #endif 109 static void __kmp_reap_thread(kmp_info_t *thread, int is_root); 110 kmp_info_t *__kmp_thread_pool_insert_pt = NULL; 111 112 void __kmp_resize_dist_barrier(kmp_team_t *team, int old_nthreads, 113 int new_nthreads); 114 void __kmp_add_threads_to_team(kmp_team_t *team, int new_nthreads); 115 116 static kmp_nested_nthreads_t *__kmp_override_nested_nth(kmp_info_t *thr, 117 int level) { 118 kmp_nested_nthreads_t *new_nested_nth = 119 (kmp_nested_nthreads_t *)KMP_INTERNAL_MALLOC( 120 sizeof(kmp_nested_nthreads_t)); 121 int new_size = level + thr->th.th_set_nested_nth_sz; 122 new_nested_nth->nth = (int *)KMP_INTERNAL_MALLOC(new_size * sizeof(int)); 123 for (int i = 0; i < level + 1; ++i) 124 new_nested_nth->nth[i] = 0; 125 for (int i = level + 1, j = 1; i < new_size; ++i, ++j) 126 new_nested_nth->nth[i] = thr->th.th_set_nested_nth[j]; 127 new_nested_nth->size = new_nested_nth->used = new_size; 128 return new_nested_nth; 129 } 130 131 /* Calculate the identifier of the current thread */ 132 /* fast (and somewhat portable) way to get unique identifier of executing 133 thread. Returns KMP_GTID_DNE if we haven't been assigned a gtid. */ 134 int __kmp_get_global_thread_id() { 135 int i; 136 kmp_info_t **other_threads; 137 size_t stack_data; 138 char *stack_addr; 139 size_t stack_size; 140 char *stack_base; 141 142 KA_TRACE( 143 1000, 144 ("*** __kmp_get_global_thread_id: entering, nproc=%d all_nproc=%d\n", 145 __kmp_nth, __kmp_all_nth)); 146 147 /* JPH - to handle the case where __kmpc_end(0) is called immediately prior to 148 a parallel region, made it return KMP_GTID_DNE to force serial_initialize 149 by caller. Had to handle KMP_GTID_DNE at all call-sites, or else guarantee 150 __kmp_init_gtid for this to work. */ 151 152 if (!TCR_4(__kmp_init_gtid)) 153 return KMP_GTID_DNE; 154 155 #ifdef KMP_TDATA_GTID 156 if (TCR_4(__kmp_gtid_mode) >= 3) { 157 KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using TDATA\n")); 158 return __kmp_gtid; 159 } 160 #endif 161 if (TCR_4(__kmp_gtid_mode) >= 2) { 162 KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using keyed TLS\n")); 163 return __kmp_gtid_get_specific(); 164 } 165 KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using internal alg.\n")); 166 167 stack_addr = (char *)&stack_data; 168 other_threads = __kmp_threads; 169 170 /* ATT: The code below is a source of potential bugs due to unsynchronized 171 access to __kmp_threads array. For example: 172 1. Current thread loads other_threads[i] to thr and checks it, it is 173 non-NULL. 174 2. Current thread is suspended by OS. 175 3. Another thread unregisters and finishes (debug versions of free() 176 may fill memory with something like 0xEF). 177 4. Current thread is resumed. 178 5. Current thread reads junk from *thr. 179 TODO: Fix it. --ln */ 180 181 for (i = 0; i < __kmp_threads_capacity; i++) { 182 183 kmp_info_t *thr = (kmp_info_t *)TCR_SYNC_PTR(other_threads[i]); 184 if (!thr) 185 continue; 186 187 stack_size = (size_t)TCR_PTR(thr->th.th_info.ds.ds_stacksize); 188 stack_base = (char *)TCR_PTR(thr->th.th_info.ds.ds_stackbase); 189 190 /* stack grows down -- search through all of the active threads */ 191 192 if (stack_addr <= stack_base) { 193 size_t stack_diff = stack_base - stack_addr; 194 195 if (stack_diff <= stack_size) { 196 /* The only way we can be closer than the allocated */ 197 /* stack size is if we are running on this thread. */ 198 // __kmp_gtid_get_specific can return negative value because this 199 // function can be called by thread destructor. However, before the 200 // thread destructor is called, the value of the corresponding 201 // thread-specific data will be reset to NULL. 202 KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() < 0 || 203 __kmp_gtid_get_specific() == i); 204 return i; 205 } 206 } 207 } 208 209 /* get specific to try and determine our gtid */ 210 KA_TRACE(1000, 211 ("*** __kmp_get_global_thread_id: internal alg. failed to find " 212 "thread, using TLS\n")); 213 i = __kmp_gtid_get_specific(); 214 215 /*fprintf( stderr, "=== %d\n", i ); */ /* GROO */ 216 217 /* if we havn't been assigned a gtid, then return code */ 218 if (i < 0) 219 return i; 220 221 // other_threads[i] can be nullptr at this point because the corresponding 222 // thread could have already been destructed. It can happen when this function 223 // is called in end library routine. 224 if (!TCR_SYNC_PTR(other_threads[i])) 225 return i; 226 227 /* dynamically updated stack window for uber threads to avoid get_specific 228 call */ 229 if (!TCR_4(other_threads[i]->th.th_info.ds.ds_stackgrow)) { 230 KMP_FATAL(StackOverflow, i); 231 } 232 233 stack_base = (char *)other_threads[i]->th.th_info.ds.ds_stackbase; 234 if (stack_addr > stack_base) { 235 TCW_PTR(other_threads[i]->th.th_info.ds.ds_stackbase, stack_addr); 236 TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize, 237 other_threads[i]->th.th_info.ds.ds_stacksize + stack_addr - 238 stack_base); 239 } else { 240 TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize, 241 stack_base - stack_addr); 242 } 243 244 /* Reprint stack bounds for ubermaster since they have been refined */ 245 if (__kmp_storage_map) { 246 char *stack_end = (char *)other_threads[i]->th.th_info.ds.ds_stackbase; 247 char *stack_beg = stack_end - other_threads[i]->th.th_info.ds.ds_stacksize; 248 __kmp_print_storage_map_gtid(i, stack_beg, stack_end, 249 other_threads[i]->th.th_info.ds.ds_stacksize, 250 "th_%d stack (refinement)", i); 251 } 252 return i; 253 } 254 255 int __kmp_get_global_thread_id_reg() { 256 int gtid; 257 258 if (!__kmp_init_serial) { 259 gtid = KMP_GTID_DNE; 260 } else 261 #ifdef KMP_TDATA_GTID 262 if (TCR_4(__kmp_gtid_mode) >= 3) { 263 KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using TDATA\n")); 264 gtid = __kmp_gtid; 265 } else 266 #endif 267 if (TCR_4(__kmp_gtid_mode) >= 2) { 268 KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using keyed TLS\n")); 269 gtid = __kmp_gtid_get_specific(); 270 } else { 271 KA_TRACE(1000, 272 ("*** __kmp_get_global_thread_id_reg: using internal alg.\n")); 273 gtid = __kmp_get_global_thread_id(); 274 } 275 276 /* we must be a new uber master sibling thread */ 277 if (gtid == KMP_GTID_DNE) { 278 KA_TRACE(10, 279 ("__kmp_get_global_thread_id_reg: Encountered new root thread. " 280 "Registering a new gtid.\n")); 281 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock); 282 if (!__kmp_init_serial) { 283 __kmp_do_serial_initialize(); 284 gtid = __kmp_gtid_get_specific(); 285 } else { 286 gtid = __kmp_register_root(FALSE); 287 } 288 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 289 /*__kmp_printf( "+++ %d\n", gtid ); */ /* GROO */ 290 } 291 292 KMP_DEBUG_ASSERT(gtid >= 0); 293 294 return gtid; 295 } 296 297 /* caller must hold forkjoin_lock */ 298 void __kmp_check_stack_overlap(kmp_info_t *th) { 299 int f; 300 char *stack_beg = NULL; 301 char *stack_end = NULL; 302 int gtid; 303 304 KA_TRACE(10, ("__kmp_check_stack_overlap: called\n")); 305 if (__kmp_storage_map) { 306 stack_end = (char *)th->th.th_info.ds.ds_stackbase; 307 stack_beg = stack_end - th->th.th_info.ds.ds_stacksize; 308 309 gtid = __kmp_gtid_from_thread(th); 310 311 if (gtid == KMP_GTID_MONITOR) { 312 __kmp_print_storage_map_gtid( 313 gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize, 314 "th_%s stack (%s)", "mon", 315 (th->th.th_info.ds.ds_stackgrow) ? "initial" : "actual"); 316 } else { 317 __kmp_print_storage_map_gtid( 318 gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize, 319 "th_%d stack (%s)", gtid, 320 (th->th.th_info.ds.ds_stackgrow) ? "initial" : "actual"); 321 } 322 } 323 324 /* No point in checking ubermaster threads since they use refinement and 325 * cannot overlap */ 326 gtid = __kmp_gtid_from_thread(th); 327 if (__kmp_env_checks == TRUE && !KMP_UBER_GTID(gtid)) { 328 KA_TRACE(10, 329 ("__kmp_check_stack_overlap: performing extensive checking\n")); 330 if (stack_beg == NULL) { 331 stack_end = (char *)th->th.th_info.ds.ds_stackbase; 332 stack_beg = stack_end - th->th.th_info.ds.ds_stacksize; 333 } 334 335 for (f = 0; f < __kmp_threads_capacity; f++) { 336 kmp_info_t *f_th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[f]); 337 338 if (f_th && f_th != th) { 339 char *other_stack_end = 340 (char *)TCR_PTR(f_th->th.th_info.ds.ds_stackbase); 341 char *other_stack_beg = 342 other_stack_end - (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize); 343 if ((stack_beg > other_stack_beg && stack_beg < other_stack_end) || 344 (stack_end > other_stack_beg && stack_end < other_stack_end)) { 345 346 /* Print the other stack values before the abort */ 347 if (__kmp_storage_map) 348 __kmp_print_storage_map_gtid( 349 -1, other_stack_beg, other_stack_end, 350 (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize), 351 "th_%d stack (overlapped)", __kmp_gtid_from_thread(f_th)); 352 353 __kmp_fatal(KMP_MSG(StackOverlap), KMP_HNT(ChangeStackLimit), 354 __kmp_msg_null); 355 } 356 } 357 } 358 } 359 KA_TRACE(10, ("__kmp_check_stack_overlap: returning\n")); 360 } 361 362 /* ------------------------------------------------------------------------ */ 363 364 void __kmp_infinite_loop(void) { 365 static int done = FALSE; 366 367 while (!done) { 368 KMP_YIELD(TRUE); 369 } 370 } 371 372 #define MAX_MESSAGE 512 373 374 void __kmp_print_storage_map_gtid(int gtid, void *p1, void *p2, size_t size, 375 char const *format, ...) { 376 char buffer[MAX_MESSAGE]; 377 va_list ap; 378 379 va_start(ap, format); 380 KMP_SNPRINTF(buffer, sizeof(buffer), "OMP storage map: %p %p%8lu %s\n", p1, 381 p2, (unsigned long)size, format); 382 __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock); 383 __kmp_vprintf(kmp_err, buffer, ap); 384 #if KMP_PRINT_DATA_PLACEMENT 385 int node; 386 if (gtid >= 0) { 387 if (p1 <= p2 && (char *)p2 - (char *)p1 == size) { 388 if (__kmp_storage_map_verbose) { 389 node = __kmp_get_host_node(p1); 390 if (node < 0) /* doesn't work, so don't try this next time */ 391 __kmp_storage_map_verbose = FALSE; 392 else { 393 char *last; 394 int lastNode; 395 int localProc = __kmp_get_cpu_from_gtid(gtid); 396 397 const int page_size = KMP_GET_PAGE_SIZE(); 398 399 p1 = (void *)((size_t)p1 & ~((size_t)page_size - 1)); 400 p2 = (void *)(((size_t)p2 - 1) & ~((size_t)page_size - 1)); 401 if (localProc >= 0) 402 __kmp_printf_no_lock(" GTID %d localNode %d\n", gtid, 403 localProc >> 1); 404 else 405 __kmp_printf_no_lock(" GTID %d\n", gtid); 406 #if KMP_USE_PRCTL 407 /* The more elaborate format is disabled for now because of the prctl 408 * hanging bug. */ 409 do { 410 last = p1; 411 lastNode = node; 412 /* This loop collates adjacent pages with the same host node. */ 413 do { 414 (char *)p1 += page_size; 415 } while (p1 <= p2 && (node = __kmp_get_host_node(p1)) == lastNode); 416 __kmp_printf_no_lock(" %p-%p memNode %d\n", last, (char *)p1 - 1, 417 lastNode); 418 } while (p1 <= p2); 419 #else 420 __kmp_printf_no_lock(" %p-%p memNode %d\n", p1, 421 (char *)p1 + (page_size - 1), 422 __kmp_get_host_node(p1)); 423 if (p1 < p2) { 424 __kmp_printf_no_lock(" %p-%p memNode %d\n", p2, 425 (char *)p2 + (page_size - 1), 426 __kmp_get_host_node(p2)); 427 } 428 #endif 429 } 430 } 431 } else 432 __kmp_printf_no_lock(" %s\n", KMP_I18N_STR(StorageMapWarning)); 433 } 434 #endif /* KMP_PRINT_DATA_PLACEMENT */ 435 __kmp_release_bootstrap_lock(&__kmp_stdio_lock); 436 437 va_end(ap); 438 } 439 440 void __kmp_warn(char const *format, ...) { 441 char buffer[MAX_MESSAGE]; 442 va_list ap; 443 444 if (__kmp_generate_warnings == kmp_warnings_off) { 445 return; 446 } 447 448 va_start(ap, format); 449 450 KMP_SNPRINTF(buffer, sizeof(buffer), "OMP warning: %s\n", format); 451 __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock); 452 __kmp_vprintf(kmp_err, buffer, ap); 453 __kmp_release_bootstrap_lock(&__kmp_stdio_lock); 454 455 va_end(ap); 456 } 457 458 void __kmp_abort_process() { 459 // Later threads may stall here, but that's ok because abort() will kill them. 460 __kmp_acquire_bootstrap_lock(&__kmp_exit_lock); 461 462 if (__kmp_debug_buf) { 463 __kmp_dump_debug_buffer(); 464 } 465 466 #if KMP_OS_WINDOWS 467 // Let other threads know of abnormal termination and prevent deadlock 468 // if abort happened during library initialization or shutdown 469 __kmp_global.g.g_abort = SIGABRT; 470 471 /* On Windows* OS by default abort() causes pop-up error box, which stalls 472 nightly testing. Unfortunately, we cannot reliably suppress pop-up error 473 boxes. _set_abort_behavior() works well, but this function is not 474 available in VS7 (this is not problem for DLL, but it is a problem for 475 static OpenMP RTL). SetErrorMode (and so, timelimit utility) does not 476 help, at least in some versions of MS C RTL. 477 478 It seems following sequence is the only way to simulate abort() and 479 avoid pop-up error box. */ 480 raise(SIGABRT); 481 _exit(3); // Just in case, if signal ignored, exit anyway. 482 #else 483 __kmp_unregister_library(); 484 abort(); 485 #endif 486 487 __kmp_infinite_loop(); 488 __kmp_release_bootstrap_lock(&__kmp_exit_lock); 489 490 } // __kmp_abort_process 491 492 void __kmp_abort_thread(void) { 493 // TODO: Eliminate g_abort global variable and this function. 494 // In case of abort just call abort(), it will kill all the threads. 495 __kmp_infinite_loop(); 496 } // __kmp_abort_thread 497 498 /* Print out the storage map for the major kmp_info_t thread data structures 499 that are allocated together. */ 500 501 static void __kmp_print_thread_storage_map(kmp_info_t *thr, int gtid) { 502 __kmp_print_storage_map_gtid(gtid, thr, thr + 1, sizeof(kmp_info_t), "th_%d", 503 gtid); 504 505 __kmp_print_storage_map_gtid(gtid, &thr->th.th_info, &thr->th.th_team, 506 sizeof(kmp_desc_t), "th_%d.th_info", gtid); 507 508 __kmp_print_storage_map_gtid(gtid, &thr->th.th_local, &thr->th.th_pri_head, 509 sizeof(kmp_local_t), "th_%d.th_local", gtid); 510 511 __kmp_print_storage_map_gtid( 512 gtid, &thr->th.th_bar[0], &thr->th.th_bar[bs_last_barrier], 513 sizeof(kmp_balign_t) * bs_last_barrier, "th_%d.th_bar", gtid); 514 515 __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_plain_barrier], 516 &thr->th.th_bar[bs_plain_barrier + 1], 517 sizeof(kmp_balign_t), "th_%d.th_bar[plain]", 518 gtid); 519 520 __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_forkjoin_barrier], 521 &thr->th.th_bar[bs_forkjoin_barrier + 1], 522 sizeof(kmp_balign_t), "th_%d.th_bar[forkjoin]", 523 gtid); 524 525 #if KMP_FAST_REDUCTION_BARRIER 526 __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_reduction_barrier], 527 &thr->th.th_bar[bs_reduction_barrier + 1], 528 sizeof(kmp_balign_t), "th_%d.th_bar[reduction]", 529 gtid); 530 #endif // KMP_FAST_REDUCTION_BARRIER 531 } 532 533 /* Print out the storage map for the major kmp_team_t team data structures 534 that are allocated together. */ 535 536 static void __kmp_print_team_storage_map(const char *header, kmp_team_t *team, 537 int team_id, int num_thr) { 538 int num_disp_buff = team->t.t_max_nproc > 1 ? __kmp_dispatch_num_buffers : 2; 539 __kmp_print_storage_map_gtid(-1, team, team + 1, sizeof(kmp_team_t), "%s_%d", 540 header, team_id); 541 542 __kmp_print_storage_map_gtid(-1, &team->t.t_bar[0], 543 &team->t.t_bar[bs_last_barrier], 544 sizeof(kmp_balign_team_t) * bs_last_barrier, 545 "%s_%d.t_bar", header, team_id); 546 547 __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_plain_barrier], 548 &team->t.t_bar[bs_plain_barrier + 1], 549 sizeof(kmp_balign_team_t), "%s_%d.t_bar[plain]", 550 header, team_id); 551 552 __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_forkjoin_barrier], 553 &team->t.t_bar[bs_forkjoin_barrier + 1], 554 sizeof(kmp_balign_team_t), 555 "%s_%d.t_bar[forkjoin]", header, team_id); 556 557 #if KMP_FAST_REDUCTION_BARRIER 558 __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_reduction_barrier], 559 &team->t.t_bar[bs_reduction_barrier + 1], 560 sizeof(kmp_balign_team_t), 561 "%s_%d.t_bar[reduction]", header, team_id); 562 #endif // KMP_FAST_REDUCTION_BARRIER 563 564 __kmp_print_storage_map_gtid( 565 -1, &team->t.t_dispatch[0], &team->t.t_dispatch[num_thr], 566 sizeof(kmp_disp_t) * num_thr, "%s_%d.t_dispatch", header, team_id); 567 568 __kmp_print_storage_map_gtid( 569 -1, &team->t.t_threads[0], &team->t.t_threads[num_thr], 570 sizeof(kmp_info_t *) * num_thr, "%s_%d.t_threads", header, team_id); 571 572 __kmp_print_storage_map_gtid(-1, &team->t.t_disp_buffer[0], 573 &team->t.t_disp_buffer[num_disp_buff], 574 sizeof(dispatch_shared_info_t) * num_disp_buff, 575 "%s_%d.t_disp_buffer", header, team_id); 576 } 577 578 static void __kmp_init_allocator() { 579 __kmp_init_memkind(); 580 __kmp_init_target_mem(); 581 } 582 static void __kmp_fini_allocator() { 583 __kmp_fini_target_mem(); 584 __kmp_fini_memkind(); 585 } 586 587 /* ------------------------------------------------------------------------ */ 588 589 #if ENABLE_LIBOMPTARGET 590 static void __kmp_init_omptarget() { 591 __kmp_init_target_task(); 592 } 593 #endif 594 595 /* ------------------------------------------------------------------------ */ 596 597 #if KMP_DYNAMIC_LIB 598 #if KMP_OS_WINDOWS 599 600 BOOL WINAPI DllMain(HINSTANCE hInstDLL, DWORD fdwReason, LPVOID lpReserved) { 601 //__kmp_acquire_bootstrap_lock( &__kmp_initz_lock ); 602 603 switch (fdwReason) { 604 605 case DLL_PROCESS_ATTACH: 606 KA_TRACE(10, ("DllMain: PROCESS_ATTACH\n")); 607 608 return TRUE; 609 610 case DLL_PROCESS_DETACH: 611 KA_TRACE(10, ("DllMain: PROCESS_DETACH T#%d\n", __kmp_gtid_get_specific())); 612 613 // According to Windows* documentation for DllMain entry point: 614 // for DLL_PROCESS_DETACH, lpReserved is used for telling the difference: 615 // lpReserved == NULL when FreeLibrary() is called, 616 // lpReserved != NULL when the process is terminated. 617 // When FreeLibrary() is called, worker threads remain alive. So the 618 // runtime's state is consistent and executing proper shutdown is OK. 619 // When the process is terminated, worker threads have exited or been 620 // forcefully terminated by the OS and only the shutdown thread remains. 621 // This can leave the runtime in an inconsistent state. 622 // Hence, only attempt proper cleanup when FreeLibrary() is called. 623 // Otherwise, rely on OS to reclaim resources. 624 if (lpReserved == NULL) 625 __kmp_internal_end_library(__kmp_gtid_get_specific()); 626 627 return TRUE; 628 629 case DLL_THREAD_ATTACH: 630 KA_TRACE(10, ("DllMain: THREAD_ATTACH\n")); 631 632 /* if we want to register new siblings all the time here call 633 * __kmp_get_gtid(); */ 634 return TRUE; 635 636 case DLL_THREAD_DETACH: 637 KA_TRACE(10, ("DllMain: THREAD_DETACH T#%d\n", __kmp_gtid_get_specific())); 638 639 __kmp_internal_end_thread(__kmp_gtid_get_specific()); 640 return TRUE; 641 } 642 643 return TRUE; 644 } 645 646 #endif /* KMP_OS_WINDOWS */ 647 #endif /* KMP_DYNAMIC_LIB */ 648 649 /* __kmp_parallel_deo -- Wait until it's our turn. */ 650 void __kmp_parallel_deo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) { 651 int gtid = *gtid_ref; 652 #ifdef BUILD_PARALLEL_ORDERED 653 kmp_team_t *team = __kmp_team_from_gtid(gtid); 654 #endif /* BUILD_PARALLEL_ORDERED */ 655 656 if (__kmp_env_consistency_check) { 657 if (__kmp_threads[gtid]->th.th_root->r.r_active) 658 #if KMP_USE_DYNAMIC_LOCK 659 __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL, 0); 660 #else 661 __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL); 662 #endif 663 } 664 #ifdef BUILD_PARALLEL_ORDERED 665 if (!team->t.t_serialized) { 666 KMP_MB(); 667 KMP_WAIT(&team->t.t_ordered.dt.t_value, __kmp_tid_from_gtid(gtid), KMP_EQ, 668 NULL); 669 KMP_MB(); 670 } 671 #endif /* BUILD_PARALLEL_ORDERED */ 672 } 673 674 /* __kmp_parallel_dxo -- Signal the next task. */ 675 void __kmp_parallel_dxo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) { 676 int gtid = *gtid_ref; 677 #ifdef BUILD_PARALLEL_ORDERED 678 int tid = __kmp_tid_from_gtid(gtid); 679 kmp_team_t *team = __kmp_team_from_gtid(gtid); 680 #endif /* BUILD_PARALLEL_ORDERED */ 681 682 if (__kmp_env_consistency_check) { 683 if (__kmp_threads[gtid]->th.th_root->r.r_active) 684 __kmp_pop_sync(gtid, ct_ordered_in_parallel, loc_ref); 685 } 686 #ifdef BUILD_PARALLEL_ORDERED 687 if (!team->t.t_serialized) { 688 KMP_MB(); /* Flush all pending memory write invalidates. */ 689 690 /* use the tid of the next thread in this team */ 691 /* TODO replace with general release procedure */ 692 team->t.t_ordered.dt.t_value = ((tid + 1) % team->t.t_nproc); 693 694 KMP_MB(); /* Flush all pending memory write invalidates. */ 695 } 696 #endif /* BUILD_PARALLEL_ORDERED */ 697 } 698 699 /* ------------------------------------------------------------------------ */ 700 /* The BARRIER for a SINGLE process section is always explicit */ 701 702 int __kmp_enter_single(int gtid, ident_t *id_ref, int push_ws) { 703 int status; 704 kmp_info_t *th; 705 kmp_team_t *team; 706 707 if (!TCR_4(__kmp_init_parallel)) 708 __kmp_parallel_initialize(); 709 __kmp_resume_if_soft_paused(); 710 711 th = __kmp_threads[gtid]; 712 team = th->th.th_team; 713 status = 0; 714 715 th->th.th_ident = id_ref; 716 717 if (team->t.t_serialized) { 718 status = 1; 719 } else { 720 kmp_int32 old_this = th->th.th_local.this_construct; 721 722 ++th->th.th_local.this_construct; 723 /* try to set team count to thread count--success means thread got the 724 single block */ 725 /* TODO: Should this be acquire or release? */ 726 if (team->t.t_construct == old_this) { 727 status = __kmp_atomic_compare_store_acq(&team->t.t_construct, old_this, 728 th->th.th_local.this_construct); 729 } 730 #if USE_ITT_BUILD 731 if (__itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 && 732 KMP_MASTER_GTID(gtid) && th->th.th_teams_microtask == NULL && 733 team->t.t_active_level == 1) { 734 // Only report metadata by primary thread of active team at level 1 735 __kmp_itt_metadata_single(id_ref); 736 } 737 #endif /* USE_ITT_BUILD */ 738 } 739 740 if (__kmp_env_consistency_check) { 741 if (status && push_ws) { 742 __kmp_push_workshare(gtid, ct_psingle, id_ref); 743 } else { 744 __kmp_check_workshare(gtid, ct_psingle, id_ref); 745 } 746 } 747 #if USE_ITT_BUILD 748 if (status) { 749 __kmp_itt_single_start(gtid); 750 } 751 #endif /* USE_ITT_BUILD */ 752 return status; 753 } 754 755 void __kmp_exit_single(int gtid) { 756 #if USE_ITT_BUILD 757 __kmp_itt_single_end(gtid); 758 #endif /* USE_ITT_BUILD */ 759 if (__kmp_env_consistency_check) 760 __kmp_pop_workshare(gtid, ct_psingle, NULL); 761 } 762 763 /* determine if we can go parallel or must use a serialized parallel region and 764 * how many threads we can use 765 * set_nproc is the number of threads requested for the team 766 * returns 0 if we should serialize or only use one thread, 767 * otherwise the number of threads to use 768 * The forkjoin lock is held by the caller. */ 769 static int __kmp_reserve_threads(kmp_root_t *root, kmp_team_t *parent_team, 770 int master_tid, int set_nthreads, 771 int enter_teams) { 772 int capacity; 773 int new_nthreads; 774 KMP_DEBUG_ASSERT(__kmp_init_serial); 775 KMP_DEBUG_ASSERT(root && parent_team); 776 kmp_info_t *this_thr = parent_team->t.t_threads[master_tid]; 777 778 // If dyn-var is set, dynamically adjust the number of desired threads, 779 // according to the method specified by dynamic_mode. 780 new_nthreads = set_nthreads; 781 if (!get__dynamic_2(parent_team, master_tid)) { 782 ; 783 } 784 #ifdef USE_LOAD_BALANCE 785 else if (__kmp_global.g.g_dynamic_mode == dynamic_load_balance) { 786 new_nthreads = __kmp_load_balance_nproc(root, set_nthreads); 787 if (new_nthreads == 1) { 788 KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced " 789 "reservation to 1 thread\n", 790 master_tid)); 791 return 1; 792 } 793 if (new_nthreads < set_nthreads) { 794 KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced " 795 "reservation to %d threads\n", 796 master_tid, new_nthreads)); 797 } 798 } 799 #endif /* USE_LOAD_BALANCE */ 800 else if (__kmp_global.g.g_dynamic_mode == dynamic_thread_limit) { 801 new_nthreads = __kmp_avail_proc - __kmp_nth + 802 (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc); 803 if (new_nthreads <= 1) { 804 KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced " 805 "reservation to 1 thread\n", 806 master_tid)); 807 return 1; 808 } 809 if (new_nthreads < set_nthreads) { 810 KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced " 811 "reservation to %d threads\n", 812 master_tid, new_nthreads)); 813 } else { 814 new_nthreads = set_nthreads; 815 } 816 } else if (__kmp_global.g.g_dynamic_mode == dynamic_random) { 817 if (set_nthreads > 2) { 818 new_nthreads = __kmp_get_random(parent_team->t.t_threads[master_tid]); 819 new_nthreads = (new_nthreads % set_nthreads) + 1; 820 if (new_nthreads == 1) { 821 KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced " 822 "reservation to 1 thread\n", 823 master_tid)); 824 return 1; 825 } 826 if (new_nthreads < set_nthreads) { 827 KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced " 828 "reservation to %d threads\n", 829 master_tid, new_nthreads)); 830 } 831 } 832 } else { 833 KMP_ASSERT(0); 834 } 835 836 // Respect KMP_ALL_THREADS/KMP_DEVICE_THREAD_LIMIT. 837 if (__kmp_nth + new_nthreads - 838 (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) > 839 __kmp_max_nth) { 840 int tl_nthreads = __kmp_max_nth - __kmp_nth + 841 (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc); 842 if (tl_nthreads <= 0) { 843 tl_nthreads = 1; 844 } 845 846 // If dyn-var is false, emit a 1-time warning. 847 if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) { 848 __kmp_reserve_warn = 1; 849 __kmp_msg(kmp_ms_warning, 850 KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads), 851 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null); 852 } 853 if (tl_nthreads == 1) { 854 KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT " 855 "reduced reservation to 1 thread\n", 856 master_tid)); 857 return 1; 858 } 859 KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT reduced " 860 "reservation to %d threads\n", 861 master_tid, tl_nthreads)); 862 new_nthreads = tl_nthreads; 863 } 864 865 // Respect OMP_THREAD_LIMIT 866 int cg_nthreads = this_thr->th.th_cg_roots->cg_nthreads; 867 int max_cg_threads = this_thr->th.th_cg_roots->cg_thread_limit; 868 if (cg_nthreads + new_nthreads - 869 (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) > 870 max_cg_threads) { 871 int tl_nthreads = max_cg_threads - cg_nthreads + 872 (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc); 873 if (tl_nthreads <= 0) { 874 tl_nthreads = 1; 875 } 876 877 // If dyn-var is false, emit a 1-time warning. 878 if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) { 879 __kmp_reserve_warn = 1; 880 __kmp_msg(kmp_ms_warning, 881 KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads), 882 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null); 883 } 884 if (tl_nthreads == 1) { 885 KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT " 886 "reduced reservation to 1 thread\n", 887 master_tid)); 888 return 1; 889 } 890 KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT reduced " 891 "reservation to %d threads\n", 892 master_tid, tl_nthreads)); 893 new_nthreads = tl_nthreads; 894 } 895 896 // Check if the threads array is large enough, or needs expanding. 897 // See comment in __kmp_register_root() about the adjustment if 898 // __kmp_threads[0] == NULL. 899 capacity = __kmp_threads_capacity; 900 if (TCR_PTR(__kmp_threads[0]) == NULL) { 901 --capacity; 902 } 903 // If it is not for initializing the hidden helper team, we need to take 904 // __kmp_hidden_helper_threads_num out of the capacity because it is included 905 // in __kmp_threads_capacity. 906 if (__kmp_enable_hidden_helper && !TCR_4(__kmp_init_hidden_helper_threads)) { 907 capacity -= __kmp_hidden_helper_threads_num; 908 } 909 if (__kmp_nth + new_nthreads - 910 (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) > 911 capacity) { 912 // Expand the threads array. 913 int slotsRequired = __kmp_nth + new_nthreads - 914 (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) - 915 capacity; 916 int slotsAdded = __kmp_expand_threads(slotsRequired); 917 if (slotsAdded < slotsRequired) { 918 // The threads array was not expanded enough. 919 new_nthreads -= (slotsRequired - slotsAdded); 920 KMP_ASSERT(new_nthreads >= 1); 921 922 // If dyn-var is false, emit a 1-time warning. 923 if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) { 924 __kmp_reserve_warn = 1; 925 if (__kmp_tp_cached) { 926 __kmp_msg(kmp_ms_warning, 927 KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads), 928 KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity), 929 KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null); 930 } else { 931 __kmp_msg(kmp_ms_warning, 932 KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads), 933 KMP_HNT(SystemLimitOnThreads), __kmp_msg_null); 934 } 935 } 936 } 937 } 938 939 #ifdef KMP_DEBUG 940 if (new_nthreads == 1) { 941 KC_TRACE(10, 942 ("__kmp_reserve_threads: T#%d serializing team after reclaiming " 943 "dead roots and rechecking; requested %d threads\n", 944 __kmp_get_gtid(), set_nthreads)); 945 } else { 946 KC_TRACE(10, ("__kmp_reserve_threads: T#%d allocating %d threads; requested" 947 " %d threads\n", 948 __kmp_get_gtid(), new_nthreads, set_nthreads)); 949 } 950 #endif // KMP_DEBUG 951 952 if (this_thr->th.th_nt_strict && new_nthreads < set_nthreads) { 953 __kmpc_error(this_thr->th.th_nt_loc, this_thr->th.th_nt_sev, 954 this_thr->th.th_nt_msg); 955 } 956 return new_nthreads; 957 } 958 959 /* Allocate threads from the thread pool and assign them to the new team. We are 960 assured that there are enough threads available, because we checked on that 961 earlier within critical section forkjoin */ 962 static void __kmp_fork_team_threads(kmp_root_t *root, kmp_team_t *team, 963 kmp_info_t *master_th, int master_gtid, 964 int fork_teams_workers) { 965 int i; 966 int use_hot_team; 967 968 KA_TRACE(10, ("__kmp_fork_team_threads: new_nprocs = %d\n", team->t.t_nproc)); 969 KMP_DEBUG_ASSERT(master_gtid == __kmp_get_gtid()); 970 KMP_MB(); 971 972 /* first, let's setup the primary thread */ 973 master_th->th.th_info.ds.ds_tid = 0; 974 master_th->th.th_team = team; 975 master_th->th.th_team_nproc = team->t.t_nproc; 976 master_th->th.th_team_master = master_th; 977 master_th->th.th_team_serialized = FALSE; 978 master_th->th.th_dispatch = &team->t.t_dispatch[0]; 979 980 /* make sure we are not the optimized hot team */ 981 #if KMP_NESTED_HOT_TEAMS 982 use_hot_team = 0; 983 kmp_hot_team_ptr_t *hot_teams = master_th->th.th_hot_teams; 984 if (hot_teams) { // hot teams array is not allocated if 985 // KMP_HOT_TEAMS_MAX_LEVEL=0 986 int level = team->t.t_active_level - 1; // index in array of hot teams 987 if (master_th->th.th_teams_microtask) { // are we inside the teams? 988 if (master_th->th.th_teams_size.nteams > 1) { 989 ++level; // level was not increased in teams construct for 990 // team_of_masters 991 } 992 if (team->t.t_pkfn != (microtask_t)__kmp_teams_master && 993 master_th->th.th_teams_level == team->t.t_level) { 994 ++level; // level was not increased in teams construct for 995 // team_of_workers before the parallel 996 } // team->t.t_level will be increased inside parallel 997 } 998 if (level < __kmp_hot_teams_max_level) { 999 if (hot_teams[level].hot_team) { 1000 // hot team has already been allocated for given level 1001 KMP_DEBUG_ASSERT(hot_teams[level].hot_team == team); 1002 use_hot_team = 1; // the team is ready to use 1003 } else { 1004 use_hot_team = 0; // AC: threads are not allocated yet 1005 hot_teams[level].hot_team = team; // remember new hot team 1006 hot_teams[level].hot_team_nth = team->t.t_nproc; 1007 } 1008 } else { 1009 use_hot_team = 0; 1010 } 1011 } 1012 #else 1013 use_hot_team = team == root->r.r_hot_team; 1014 #endif 1015 if (!use_hot_team) { 1016 1017 /* install the primary thread */ 1018 team->t.t_threads[0] = master_th; 1019 __kmp_initialize_info(master_th, team, 0, master_gtid); 1020 1021 /* now, install the worker threads */ 1022 for (i = 1; i < team->t.t_nproc; i++) { 1023 1024 /* fork or reallocate a new thread and install it in team */ 1025 kmp_info_t *thr = __kmp_allocate_thread(root, team, i); 1026 team->t.t_threads[i] = thr; 1027 KMP_DEBUG_ASSERT(thr); 1028 KMP_DEBUG_ASSERT(thr->th.th_team == team); 1029 /* align team and thread arrived states */ 1030 KA_TRACE(20, ("__kmp_fork_team_threads: T#%d(%d:%d) init arrived " 1031 "T#%d(%d:%d) join =%llu, plain=%llu\n", 1032 __kmp_gtid_from_tid(0, team), team->t.t_id, 0, 1033 __kmp_gtid_from_tid(i, team), team->t.t_id, i, 1034 team->t.t_bar[bs_forkjoin_barrier].b_arrived, 1035 team->t.t_bar[bs_plain_barrier].b_arrived)); 1036 thr->th.th_teams_microtask = master_th->th.th_teams_microtask; 1037 thr->th.th_teams_level = master_th->th.th_teams_level; 1038 thr->th.th_teams_size = master_th->th.th_teams_size; 1039 { // Initialize threads' barrier data. 1040 int b; 1041 kmp_balign_t *balign = team->t.t_threads[i]->th.th_bar; 1042 for (b = 0; b < bs_last_barrier; ++b) { 1043 balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived; 1044 KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG); 1045 #if USE_DEBUGGER 1046 balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived; 1047 #endif 1048 } 1049 } 1050 } 1051 1052 #if KMP_AFFINITY_SUPPORTED 1053 // Do not partition the places list for teams construct workers who 1054 // haven't actually been forked to do real work yet. This partitioning 1055 // will take place in the parallel region nested within the teams construct. 1056 if (!fork_teams_workers) { 1057 __kmp_partition_places(team); 1058 } 1059 #endif 1060 1061 if (team->t.t_nproc > 1 && 1062 __kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) { 1063 team->t.b->update_num_threads(team->t.t_nproc); 1064 __kmp_add_threads_to_team(team, team->t.t_nproc); 1065 } 1066 } 1067 1068 // Take care of primary thread's task state 1069 if (__kmp_tasking_mode != tskm_immediate_exec) { 1070 if (use_hot_team) { 1071 KMP_DEBUG_ASSERT_TASKTEAM_INVARIANT(team->t.t_parent, master_th); 1072 KA_TRACE( 1073 20, 1074 ("__kmp_fork_team_threads: Primary T#%d pushing task_team %p / team " 1075 "%p, new task_team %p / team %p\n", 1076 __kmp_gtid_from_thread(master_th), master_th->th.th_task_team, 1077 team->t.t_parent, team->t.t_task_team[master_th->th.th_task_state], 1078 team)); 1079 1080 // Store primary thread's current task state on new team 1081 KMP_CHECK_UPDATE(team->t.t_primary_task_state, 1082 master_th->th.th_task_state); 1083 1084 // Restore primary thread's task state to hot team's state 1085 // by using thread 1's task state 1086 if (team->t.t_nproc > 1) { 1087 KMP_DEBUG_ASSERT(team->t.t_threads[1]->th.th_task_state == 0 || 1088 team->t.t_threads[1]->th.th_task_state == 1); 1089 KMP_CHECK_UPDATE(master_th->th.th_task_state, 1090 team->t.t_threads[1]->th.th_task_state); 1091 } else { 1092 master_th->th.th_task_state = 0; 1093 } 1094 } else { 1095 // Store primary thread's current task_state on new team 1096 KMP_CHECK_UPDATE(team->t.t_primary_task_state, 1097 master_th->th.th_task_state); 1098 // Are not using hot team, so set task state to 0. 1099 master_th->th.th_task_state = 0; 1100 } 1101 } 1102 1103 if (__kmp_display_affinity && team->t.t_display_affinity != 1) { 1104 for (i = 0; i < team->t.t_nproc; i++) { 1105 kmp_info_t *thr = team->t.t_threads[i]; 1106 if (thr->th.th_prev_num_threads != team->t.t_nproc || 1107 thr->th.th_prev_level != team->t.t_level) { 1108 team->t.t_display_affinity = 1; 1109 break; 1110 } 1111 } 1112 } 1113 1114 KMP_MB(); 1115 } 1116 1117 #if KMP_ARCH_X86 || KMP_ARCH_X86_64 1118 // Propagate any changes to the floating point control registers out to the team 1119 // We try to avoid unnecessary writes to the relevant cache line in the team 1120 // structure, so we don't make changes unless they are needed. 1121 inline static void propagateFPControl(kmp_team_t *team) { 1122 if (__kmp_inherit_fp_control) { 1123 kmp_int16 x87_fpu_control_word; 1124 kmp_uint32 mxcsr; 1125 1126 // Get primary thread's values of FPU control flags (both X87 and vector) 1127 __kmp_store_x87_fpu_control_word(&x87_fpu_control_word); 1128 __kmp_store_mxcsr(&mxcsr); 1129 mxcsr &= KMP_X86_MXCSR_MASK; 1130 1131 // There is no point looking at t_fp_control_saved here. 1132 // If it is TRUE, we still have to update the values if they are different 1133 // from those we now have. If it is FALSE we didn't save anything yet, but 1134 // our objective is the same. We have to ensure that the values in the team 1135 // are the same as those we have. 1136 // So, this code achieves what we need whether or not t_fp_control_saved is 1137 // true. By checking whether the value needs updating we avoid unnecessary 1138 // writes that would put the cache-line into a written state, causing all 1139 // threads in the team to have to read it again. 1140 KMP_CHECK_UPDATE(team->t.t_x87_fpu_control_word, x87_fpu_control_word); 1141 KMP_CHECK_UPDATE(team->t.t_mxcsr, mxcsr); 1142 // Although we don't use this value, other code in the runtime wants to know 1143 // whether it should restore them. So we must ensure it is correct. 1144 KMP_CHECK_UPDATE(team->t.t_fp_control_saved, TRUE); 1145 } else { 1146 // Similarly here. Don't write to this cache-line in the team structure 1147 // unless we have to. 1148 KMP_CHECK_UPDATE(team->t.t_fp_control_saved, FALSE); 1149 } 1150 } 1151 1152 // Do the opposite, setting the hardware registers to the updated values from 1153 // the team. 1154 inline static void updateHWFPControl(kmp_team_t *team) { 1155 if (__kmp_inherit_fp_control && team->t.t_fp_control_saved) { 1156 // Only reset the fp control regs if they have been changed in the team. 1157 // the parallel region that we are exiting. 1158 kmp_int16 x87_fpu_control_word; 1159 kmp_uint32 mxcsr; 1160 __kmp_store_x87_fpu_control_word(&x87_fpu_control_word); 1161 __kmp_store_mxcsr(&mxcsr); 1162 mxcsr &= KMP_X86_MXCSR_MASK; 1163 1164 if (team->t.t_x87_fpu_control_word != x87_fpu_control_word) { 1165 __kmp_clear_x87_fpu_status_word(); 1166 __kmp_load_x87_fpu_control_word(&team->t.t_x87_fpu_control_word); 1167 } 1168 1169 if (team->t.t_mxcsr != mxcsr) { 1170 __kmp_load_mxcsr(&team->t.t_mxcsr); 1171 } 1172 } 1173 } 1174 #else 1175 #define propagateFPControl(x) ((void)0) 1176 #define updateHWFPControl(x) ((void)0) 1177 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */ 1178 1179 static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team, 1180 int realloc); // forward declaration 1181 1182 /* Run a parallel region that has been serialized, so runs only in a team of the 1183 single primary thread. */ 1184 void __kmp_serialized_parallel(ident_t *loc, kmp_int32 global_tid) { 1185 kmp_info_t *this_thr; 1186 kmp_team_t *serial_team; 1187 1188 KC_TRACE(10, ("__kmpc_serialized_parallel: called by T#%d\n", global_tid)); 1189 1190 /* Skip all this code for autopar serialized loops since it results in 1191 unacceptable overhead */ 1192 if (loc != NULL && (loc->flags & KMP_IDENT_AUTOPAR)) 1193 return; 1194 1195 if (!TCR_4(__kmp_init_parallel)) 1196 __kmp_parallel_initialize(); 1197 __kmp_resume_if_soft_paused(); 1198 1199 this_thr = __kmp_threads[global_tid]; 1200 serial_team = this_thr->th.th_serial_team; 1201 1202 /* utilize the serialized team held by this thread */ 1203 KMP_DEBUG_ASSERT(serial_team); 1204 KMP_MB(); 1205 1206 kmp_proc_bind_t proc_bind = this_thr->th.th_set_proc_bind; 1207 if (this_thr->th.th_current_task->td_icvs.proc_bind == proc_bind_false) { 1208 proc_bind = proc_bind_false; 1209 } else if (proc_bind == proc_bind_default) { 1210 // No proc_bind clause was specified, so use the current value 1211 // of proc-bind-var for this parallel region. 1212 proc_bind = this_thr->th.th_current_task->td_icvs.proc_bind; 1213 } 1214 // Reset for next parallel region 1215 this_thr->th.th_set_proc_bind = proc_bind_default; 1216 1217 // OpenMP 6.0 12.1.2 requires the num_threads 'strict' modifier to also have 1218 // effect when parallel execution is disabled by a corresponding if clause 1219 // attached to the parallel directive. 1220 if (this_thr->th.th_nt_strict && this_thr->th.th_set_nproc > 1) 1221 __kmpc_error(this_thr->th.th_nt_loc, this_thr->th.th_nt_sev, 1222 this_thr->th.th_nt_msg); 1223 // Reset num_threads for next parallel region 1224 this_thr->th.th_set_nproc = 0; 1225 1226 #if OMPT_SUPPORT 1227 ompt_data_t ompt_parallel_data = ompt_data_none; 1228 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(global_tid); 1229 if (ompt_enabled.enabled && 1230 this_thr->th.ompt_thread_info.state != ompt_state_overhead) { 1231 1232 ompt_task_info_t *parent_task_info; 1233 parent_task_info = OMPT_CUR_TASK_INFO(this_thr); 1234 1235 parent_task_info->frame.enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0); 1236 if (ompt_enabled.ompt_callback_parallel_begin) { 1237 int team_size = 1; 1238 1239 ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)( 1240 &(parent_task_info->task_data), &(parent_task_info->frame), 1241 &ompt_parallel_data, team_size, 1242 ompt_parallel_invoker_program | ompt_parallel_team, codeptr); 1243 } 1244 } 1245 #endif // OMPT_SUPPORT 1246 1247 if (this_thr->th.th_team != serial_team) { 1248 // Nested level will be an index in the nested nthreads array 1249 int level = this_thr->th.th_team->t.t_level; 1250 1251 if (serial_team->t.t_serialized) { 1252 /* this serial team was already used 1253 TODO increase performance by making this locks more specific */ 1254 kmp_team_t *new_team; 1255 1256 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock); 1257 1258 new_team = 1259 __kmp_allocate_team(this_thr->th.th_root, 1, 1, 1260 #if OMPT_SUPPORT 1261 ompt_parallel_data, 1262 #endif 1263 proc_bind, &this_thr->th.th_current_task->td_icvs, 1264 0 USE_NESTED_HOT_ARG(NULL)); 1265 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); 1266 KMP_ASSERT(new_team); 1267 1268 /* setup new serialized team and install it */ 1269 new_team->t.t_threads[0] = this_thr; 1270 new_team->t.t_parent = this_thr->th.th_team; 1271 serial_team = new_team; 1272 this_thr->th.th_serial_team = serial_team; 1273 1274 KF_TRACE( 1275 10, 1276 ("__kmpc_serialized_parallel: T#%d allocated new serial team %p\n", 1277 global_tid, serial_team)); 1278 1279 /* TODO the above breaks the requirement that if we run out of resources, 1280 then we can still guarantee that serialized teams are ok, since we may 1281 need to allocate a new one */ 1282 } else { 1283 KF_TRACE( 1284 10, 1285 ("__kmpc_serialized_parallel: T#%d reusing cached serial team %p\n", 1286 global_tid, serial_team)); 1287 } 1288 1289 /* we have to initialize this serial team */ 1290 KMP_DEBUG_ASSERT(serial_team->t.t_threads); 1291 KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr); 1292 KMP_DEBUG_ASSERT(this_thr->th.th_team != serial_team); 1293 serial_team->t.t_ident = loc; 1294 serial_team->t.t_serialized = 1; 1295 serial_team->t.t_nproc = 1; 1296 serial_team->t.t_parent = this_thr->th.th_team; 1297 if (this_thr->th.th_team->t.t_nested_nth) 1298 serial_team->t.t_nested_nth = this_thr->th.th_team->t.t_nested_nth; 1299 else 1300 serial_team->t.t_nested_nth = &__kmp_nested_nth; 1301 // Save previous team's task state on serial team structure 1302 serial_team->t.t_primary_task_state = this_thr->th.th_task_state; 1303 serial_team->t.t_sched.sched = this_thr->th.th_team->t.t_sched.sched; 1304 this_thr->th.th_team = serial_team; 1305 serial_team->t.t_master_tid = this_thr->th.th_info.ds.ds_tid; 1306 1307 KF_TRACE(10, ("__kmpc_serialized_parallel: T#%d curtask=%p\n", global_tid, 1308 this_thr->th.th_current_task)); 1309 KMP_ASSERT(this_thr->th.th_current_task->td_flags.executing == 1); 1310 this_thr->th.th_current_task->td_flags.executing = 0; 1311 1312 __kmp_push_current_task_to_thread(this_thr, serial_team, 0); 1313 1314 /* TODO: GEH: do ICVs work for nested serialized teams? Don't we need an 1315 implicit task for each serialized task represented by 1316 team->t.t_serialized? */ 1317 copy_icvs(&this_thr->th.th_current_task->td_icvs, 1318 &this_thr->th.th_current_task->td_parent->td_icvs); 1319 1320 // Thread value exists in the nested nthreads array for the next nested 1321 // level 1322 kmp_nested_nthreads_t *nested_nth = &__kmp_nested_nth; 1323 if (this_thr->th.th_team->t.t_nested_nth) 1324 nested_nth = this_thr->th.th_team->t.t_nested_nth; 1325 if (nested_nth->used && (level + 1 < nested_nth->used)) { 1326 this_thr->th.th_current_task->td_icvs.nproc = nested_nth->nth[level + 1]; 1327 } 1328 1329 if (__kmp_nested_proc_bind.used && 1330 (level + 1 < __kmp_nested_proc_bind.used)) { 1331 this_thr->th.th_current_task->td_icvs.proc_bind = 1332 __kmp_nested_proc_bind.bind_types[level + 1]; 1333 } 1334 1335 #if USE_DEBUGGER 1336 serial_team->t.t_pkfn = (microtask_t)(~0); // For the debugger. 1337 #endif 1338 this_thr->th.th_info.ds.ds_tid = 0; 1339 1340 /* set thread cache values */ 1341 this_thr->th.th_team_nproc = 1; 1342 this_thr->th.th_team_master = this_thr; 1343 this_thr->th.th_team_serialized = 1; 1344 this_thr->th.th_task_team = NULL; 1345 this_thr->th.th_task_state = 0; 1346 1347 serial_team->t.t_level = serial_team->t.t_parent->t.t_level + 1; 1348 serial_team->t.t_active_level = serial_team->t.t_parent->t.t_active_level; 1349 serial_team->t.t_def_allocator = this_thr->th.th_def_allocator; // save 1350 1351 propagateFPControl(serial_team); 1352 1353 /* check if we need to allocate dispatch buffers stack */ 1354 KMP_DEBUG_ASSERT(serial_team->t.t_dispatch); 1355 if (!serial_team->t.t_dispatch->th_disp_buffer) { 1356 serial_team->t.t_dispatch->th_disp_buffer = 1357 (dispatch_private_info_t *)__kmp_allocate( 1358 sizeof(dispatch_private_info_t)); 1359 } 1360 this_thr->th.th_dispatch = serial_team->t.t_dispatch; 1361 1362 KMP_MB(); 1363 1364 } else { 1365 /* this serialized team is already being used, 1366 * that's fine, just add another nested level */ 1367 KMP_DEBUG_ASSERT(this_thr->th.th_team == serial_team); 1368 KMP_DEBUG_ASSERT(serial_team->t.t_threads); 1369 KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr); 1370 ++serial_team->t.t_serialized; 1371 this_thr->th.th_team_serialized = serial_team->t.t_serialized; 1372 1373 // Nested level will be an index in the nested nthreads array 1374 int level = this_thr->th.th_team->t.t_level; 1375 // Thread value exists in the nested nthreads array for the next nested 1376 // level 1377 1378 kmp_nested_nthreads_t *nested_nth = &__kmp_nested_nth; 1379 if (serial_team->t.t_nested_nth) 1380 nested_nth = serial_team->t.t_nested_nth; 1381 if (nested_nth->used && (level + 1 < nested_nth->used)) { 1382 this_thr->th.th_current_task->td_icvs.nproc = nested_nth->nth[level + 1]; 1383 } 1384 1385 serial_team->t.t_level++; 1386 KF_TRACE(10, ("__kmpc_serialized_parallel: T#%d increasing nesting level " 1387 "of serial team %p to %d\n", 1388 global_tid, serial_team, serial_team->t.t_level)); 1389 1390 /* allocate/push dispatch buffers stack */ 1391 KMP_DEBUG_ASSERT(serial_team->t.t_dispatch); 1392 { 1393 dispatch_private_info_t *disp_buffer = 1394 (dispatch_private_info_t *)__kmp_allocate( 1395 sizeof(dispatch_private_info_t)); 1396 disp_buffer->next = serial_team->t.t_dispatch->th_disp_buffer; 1397 serial_team->t.t_dispatch->th_disp_buffer = disp_buffer; 1398 } 1399 this_thr->th.th_dispatch = serial_team->t.t_dispatch; 1400 1401 /* allocate/push task team stack */ 1402 __kmp_push_task_team_node(this_thr, serial_team); 1403 1404 KMP_MB(); 1405 } 1406 KMP_CHECK_UPDATE(serial_team->t.t_cancel_request, cancel_noreq); 1407 1408 // Perform the display affinity functionality for 1409 // serialized parallel regions 1410 if (__kmp_display_affinity) { 1411 if (this_thr->th.th_prev_level != serial_team->t.t_level || 1412 this_thr->th.th_prev_num_threads != 1) { 1413 // NULL means use the affinity-format-var ICV 1414 __kmp_aux_display_affinity(global_tid, NULL); 1415 this_thr->th.th_prev_level = serial_team->t.t_level; 1416 this_thr->th.th_prev_num_threads = 1; 1417 } 1418 } 1419 1420 if (__kmp_env_consistency_check) 1421 __kmp_push_parallel(global_tid, NULL); 1422 #if OMPT_SUPPORT 1423 serial_team->t.ompt_team_info.master_return_address = codeptr; 1424 if (ompt_enabled.enabled && 1425 this_thr->th.ompt_thread_info.state != ompt_state_overhead) { 1426 OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame.ptr = 1427 OMPT_GET_FRAME_ADDRESS(0); 1428 1429 ompt_lw_taskteam_t lw_taskteam; 1430 __ompt_lw_taskteam_init(&lw_taskteam, this_thr, global_tid, 1431 &ompt_parallel_data, codeptr); 1432 1433 __ompt_lw_taskteam_link(&lw_taskteam, this_thr, 1); 1434 // don't use lw_taskteam after linking. content was swaped 1435 1436 /* OMPT implicit task begin */ 1437 if (ompt_enabled.ompt_callback_implicit_task) { 1438 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 1439 ompt_scope_begin, OMPT_CUR_TEAM_DATA(this_thr), 1440 OMPT_CUR_TASK_DATA(this_thr), 1, __kmp_tid_from_gtid(global_tid), 1441 ompt_task_implicit); // TODO: Can this be ompt_task_initial? 1442 OMPT_CUR_TASK_INFO(this_thr)->thread_num = 1443 __kmp_tid_from_gtid(global_tid); 1444 } 1445 1446 /* OMPT state */ 1447 this_thr->th.ompt_thread_info.state = ompt_state_work_parallel; 1448 OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame.ptr = 1449 OMPT_GET_FRAME_ADDRESS(0); 1450 } 1451 #endif 1452 } 1453 1454 // Test if this fork is for a team closely nested in a teams construct 1455 static inline bool __kmp_is_fork_in_teams(kmp_info_t *master_th, 1456 microtask_t microtask, int level, 1457 int teams_level, kmp_va_list ap) { 1458 return (master_th->th.th_teams_microtask && ap && 1459 microtask != (microtask_t)__kmp_teams_master && level == teams_level); 1460 } 1461 1462 // Test if this fork is for the teams construct, i.e. to form the outer league 1463 // of teams 1464 static inline bool __kmp_is_entering_teams(int active_level, int level, 1465 int teams_level, kmp_va_list ap) { 1466 return ((ap == NULL && active_level == 0) || 1467 (ap && teams_level > 0 && teams_level == level)); 1468 } 1469 1470 // AC: This is start of parallel that is nested inside teams construct. 1471 // The team is actual (hot), all workers are ready at the fork barrier. 1472 // No lock needed to initialize the team a bit, then free workers. 1473 static inline int 1474 __kmp_fork_in_teams(ident_t *loc, int gtid, kmp_team_t *parent_team, 1475 kmp_int32 argc, kmp_info_t *master_th, kmp_root_t *root, 1476 enum fork_context_e call_context, microtask_t microtask, 1477 launch_t invoker, int master_set_numthreads, int level, 1478 #if OMPT_SUPPORT 1479 ompt_data_t ompt_parallel_data, void *return_address, 1480 #endif 1481 kmp_va_list ap) { 1482 void **argv; 1483 int i; 1484 1485 parent_team->t.t_ident = loc; 1486 __kmp_alloc_argv_entries(argc, parent_team, TRUE); 1487 parent_team->t.t_argc = argc; 1488 argv = (void **)parent_team->t.t_argv; 1489 for (i = argc - 1; i >= 0; --i) { 1490 *argv++ = va_arg(kmp_va_deref(ap), void *); 1491 } 1492 // Increment our nested depth levels, but not increase the serialization 1493 if (parent_team == master_th->th.th_serial_team) { 1494 // AC: we are in serialized parallel 1495 __kmpc_serialized_parallel(loc, gtid); 1496 KMP_DEBUG_ASSERT(parent_team->t.t_serialized > 1); 1497 1498 if (call_context == fork_context_gnu) { 1499 // AC: need to decrement t_serialized for enquiry functions to work 1500 // correctly, will restore at join time 1501 parent_team->t.t_serialized--; 1502 return TRUE; 1503 } 1504 1505 #if OMPD_SUPPORT 1506 parent_team->t.t_pkfn = microtask; 1507 #endif 1508 1509 #if OMPT_SUPPORT 1510 void *dummy; 1511 void **exit_frame_p; 1512 ompt_data_t *implicit_task_data; 1513 ompt_lw_taskteam_t lw_taskteam; 1514 1515 if (ompt_enabled.enabled) { 1516 __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid, 1517 &ompt_parallel_data, return_address); 1518 exit_frame_p = &(lw_taskteam.ompt_task_info.frame.exit_frame.ptr); 1519 1520 __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0); 1521 // Don't use lw_taskteam after linking. Content was swapped. 1522 1523 /* OMPT implicit task begin */ 1524 implicit_task_data = OMPT_CUR_TASK_DATA(master_th); 1525 if (ompt_enabled.ompt_callback_implicit_task) { 1526 OMPT_CUR_TASK_INFO(master_th)->thread_num = __kmp_tid_from_gtid(gtid); 1527 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 1528 ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th), implicit_task_data, 1529 1, OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit); 1530 } 1531 1532 /* OMPT state */ 1533 master_th->th.ompt_thread_info.state = ompt_state_work_parallel; 1534 } else { 1535 exit_frame_p = &dummy; 1536 } 1537 #endif 1538 1539 // AC: need to decrement t_serialized for enquiry functions to work 1540 // correctly, will restore at join time 1541 parent_team->t.t_serialized--; 1542 1543 { 1544 KMP_TIME_PARTITIONED_BLOCK(OMP_parallel); 1545 KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK); 1546 __kmp_invoke_microtask(microtask, gtid, 0, argc, parent_team->t.t_argv 1547 #if OMPT_SUPPORT 1548 , 1549 exit_frame_p 1550 #endif 1551 ); 1552 } 1553 1554 #if OMPT_SUPPORT 1555 if (ompt_enabled.enabled) { 1556 *exit_frame_p = NULL; 1557 OMPT_CUR_TASK_INFO(master_th)->frame.exit_frame = ompt_data_none; 1558 if (ompt_enabled.ompt_callback_implicit_task) { 1559 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 1560 ompt_scope_end, NULL, implicit_task_data, 1, 1561 OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit); 1562 } 1563 ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th); 1564 __ompt_lw_taskteam_unlink(master_th); 1565 if (ompt_enabled.ompt_callback_parallel_end) { 1566 ompt_callbacks.ompt_callback(ompt_callback_parallel_end)( 1567 &ompt_parallel_data, OMPT_CUR_TASK_DATA(master_th), 1568 OMPT_INVOKER(call_context) | ompt_parallel_team, return_address); 1569 } 1570 master_th->th.ompt_thread_info.state = ompt_state_overhead; 1571 } 1572 #endif 1573 return TRUE; 1574 } 1575 1576 parent_team->t.t_pkfn = microtask; 1577 parent_team->t.t_invoke = invoker; 1578 KMP_ATOMIC_INC(&root->r.r_in_parallel); 1579 parent_team->t.t_active_level++; 1580 parent_team->t.t_level++; 1581 parent_team->t.t_def_allocator = master_th->th.th_def_allocator; // save 1582 1583 // If the threads allocated to the team are less than the thread limit, update 1584 // the thread limit here. th_teams_size.nth is specific to this team nested 1585 // in a teams construct, the team is fully created, and we're about to do 1586 // the actual fork. Best to do this here so that the subsequent uses below 1587 // and in the join have the correct value. 1588 master_th->th.th_teams_size.nth = parent_team->t.t_nproc; 1589 1590 #if OMPT_SUPPORT 1591 if (ompt_enabled.enabled) { 1592 ompt_lw_taskteam_t lw_taskteam; 1593 __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid, &ompt_parallel_data, 1594 return_address); 1595 __ompt_lw_taskteam_link(&lw_taskteam, master_th, 1, true); 1596 } 1597 #endif 1598 1599 /* Change number of threads in the team if requested */ 1600 if (master_set_numthreads) { // The parallel has num_threads clause 1601 if (master_set_numthreads <= master_th->th.th_teams_size.nth) { 1602 // AC: only can reduce number of threads dynamically, can't increase 1603 kmp_info_t **other_threads = parent_team->t.t_threads; 1604 // NOTE: if using distributed barrier, we need to run this code block 1605 // even when the team size appears not to have changed from the max. 1606 int old_proc = master_th->th.th_teams_size.nth; 1607 if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) { 1608 __kmp_resize_dist_barrier(parent_team, old_proc, master_set_numthreads); 1609 __kmp_add_threads_to_team(parent_team, master_set_numthreads); 1610 } 1611 parent_team->t.t_nproc = master_set_numthreads; 1612 for (i = 0; i < master_set_numthreads; ++i) { 1613 other_threads[i]->th.th_team_nproc = master_set_numthreads; 1614 } 1615 } 1616 // Keep extra threads hot in the team for possible next parallels 1617 master_th->th.th_set_nproc = 0; 1618 } 1619 1620 #if USE_DEBUGGER 1621 if (__kmp_debugging) { // Let debugger override number of threads. 1622 int nth = __kmp_omp_num_threads(loc); 1623 if (nth > 0) { // 0 means debugger doesn't want to change num threads 1624 master_set_numthreads = nth; 1625 } 1626 } 1627 #endif 1628 1629 // Figure out the proc_bind policy for the nested parallel within teams 1630 kmp_proc_bind_t proc_bind = master_th->th.th_set_proc_bind; 1631 // proc_bind_default means don't update 1632 kmp_proc_bind_t proc_bind_icv = proc_bind_default; 1633 if (master_th->th.th_current_task->td_icvs.proc_bind == proc_bind_false) { 1634 proc_bind = proc_bind_false; 1635 } else { 1636 // No proc_bind clause specified; use current proc-bind-var 1637 if (proc_bind == proc_bind_default) { 1638 proc_bind = master_th->th.th_current_task->td_icvs.proc_bind; 1639 } 1640 /* else: The proc_bind policy was specified explicitly on parallel clause. 1641 This overrides proc-bind-var for this parallel region, but does not 1642 change proc-bind-var. */ 1643 // Figure the value of proc-bind-var for the child threads. 1644 if ((level + 1 < __kmp_nested_proc_bind.used) && 1645 (__kmp_nested_proc_bind.bind_types[level + 1] != 1646 master_th->th.th_current_task->td_icvs.proc_bind)) { 1647 proc_bind_icv = __kmp_nested_proc_bind.bind_types[level + 1]; 1648 } 1649 } 1650 KMP_CHECK_UPDATE(parent_team->t.t_proc_bind, proc_bind); 1651 // Need to change the bind-var ICV to correct value for each implicit task 1652 if (proc_bind_icv != proc_bind_default && 1653 master_th->th.th_current_task->td_icvs.proc_bind != proc_bind_icv) { 1654 kmp_info_t **other_threads = parent_team->t.t_threads; 1655 for (i = 0; i < master_th->th.th_team_nproc; ++i) { 1656 other_threads[i]->th.th_current_task->td_icvs.proc_bind = proc_bind_icv; 1657 } 1658 } 1659 // Reset for next parallel region 1660 master_th->th.th_set_proc_bind = proc_bind_default; 1661 1662 #if USE_ITT_BUILD && USE_ITT_NOTIFY 1663 if (((__itt_frame_submit_v3_ptr && __itt_get_timestamp_ptr) || 1664 KMP_ITT_DEBUG) && 1665 __kmp_forkjoin_frames_mode == 3 && 1666 parent_team->t.t_active_level == 1 // only report frames at level 1 1667 && master_th->th.th_teams_size.nteams == 1) { 1668 kmp_uint64 tmp_time = __itt_get_timestamp(); 1669 master_th->th.th_frame_time = tmp_time; 1670 parent_team->t.t_region_time = tmp_time; 1671 } 1672 if (__itt_stack_caller_create_ptr) { 1673 KMP_DEBUG_ASSERT(parent_team->t.t_stack_id == NULL); 1674 // create new stack stitching id before entering fork barrier 1675 parent_team->t.t_stack_id = __kmp_itt_stack_caller_create(); 1676 } 1677 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */ 1678 #if KMP_AFFINITY_SUPPORTED 1679 __kmp_partition_places(parent_team); 1680 #endif 1681 1682 KF_TRACE(10, ("__kmp_fork_in_teams: before internal fork: root=%p, team=%p, " 1683 "master_th=%p, gtid=%d\n", 1684 root, parent_team, master_th, gtid)); 1685 __kmp_internal_fork(loc, gtid, parent_team); 1686 KF_TRACE(10, ("__kmp_fork_in_teams: after internal fork: root=%p, team=%p, " 1687 "master_th=%p, gtid=%d\n", 1688 root, parent_team, master_th, gtid)); 1689 1690 if (call_context == fork_context_gnu) 1691 return TRUE; 1692 1693 /* Invoke microtask for PRIMARY thread */ 1694 KA_TRACE(20, ("__kmp_fork_in_teams: T#%d(%d:0) invoke microtask = %p\n", gtid, 1695 parent_team->t.t_id, parent_team->t.t_pkfn)); 1696 1697 if (!parent_team->t.t_invoke(gtid)) { 1698 KMP_ASSERT2(0, "cannot invoke microtask for PRIMARY thread"); 1699 } 1700 KA_TRACE(20, ("__kmp_fork_in_teams: T#%d(%d:0) done microtask = %p\n", gtid, 1701 parent_team->t.t_id, parent_team->t.t_pkfn)); 1702 KMP_MB(); /* Flush all pending memory write invalidates. */ 1703 1704 KA_TRACE(20, ("__kmp_fork_in_teams: parallel exit T#%d\n", gtid)); 1705 1706 return TRUE; 1707 } 1708 1709 // Create a serialized parallel region 1710 static inline int 1711 __kmp_serial_fork_call(ident_t *loc, int gtid, enum fork_context_e call_context, 1712 kmp_int32 argc, microtask_t microtask, launch_t invoker, 1713 kmp_info_t *master_th, kmp_team_t *parent_team, 1714 #if OMPT_SUPPORT 1715 ompt_data_t *ompt_parallel_data, void **return_address, 1716 ompt_data_t **parent_task_data, 1717 #endif 1718 kmp_va_list ap) { 1719 kmp_team_t *team; 1720 int i; 1721 void **argv; 1722 1723 /* josh todo: hypothetical question: what do we do for OS X*? */ 1724 #if KMP_OS_LINUX && \ 1725 (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) 1726 SimpleVLA<void *> args(argc); 1727 #else 1728 void **args = (void **)KMP_ALLOCA(argc * sizeof(void *)); 1729 #endif /* KMP_OS_LINUX && ( KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || \ 1730 KMP_ARCH_AARCH64) */ 1731 1732 KA_TRACE( 1733 20, ("__kmp_serial_fork_call: T#%d serializing parallel region\n", gtid)); 1734 1735 __kmpc_serialized_parallel(loc, gtid); 1736 1737 #if OMPD_SUPPORT 1738 master_th->th.th_serial_team->t.t_pkfn = microtask; 1739 #endif 1740 1741 if (call_context == fork_context_intel) { 1742 /* TODO this sucks, use the compiler itself to pass args! :) */ 1743 master_th->th.th_serial_team->t.t_ident = loc; 1744 if (!ap) { 1745 // revert change made in __kmpc_serialized_parallel() 1746 master_th->th.th_serial_team->t.t_level--; 1747 // Get args from parent team for teams construct 1748 1749 #if OMPT_SUPPORT 1750 void *dummy; 1751 void **exit_frame_p; 1752 ompt_task_info_t *task_info; 1753 ompt_lw_taskteam_t lw_taskteam; 1754 1755 if (ompt_enabled.enabled) { 1756 __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid, 1757 ompt_parallel_data, *return_address); 1758 1759 __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0); 1760 // don't use lw_taskteam after linking. content was swaped 1761 task_info = OMPT_CUR_TASK_INFO(master_th); 1762 exit_frame_p = &(task_info->frame.exit_frame.ptr); 1763 if (ompt_enabled.ompt_callback_implicit_task) { 1764 OMPT_CUR_TASK_INFO(master_th)->thread_num = __kmp_tid_from_gtid(gtid); 1765 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 1766 ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th), 1767 &(task_info->task_data), 1, 1768 OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit); 1769 } 1770 1771 /* OMPT state */ 1772 master_th->th.ompt_thread_info.state = ompt_state_work_parallel; 1773 } else { 1774 exit_frame_p = &dummy; 1775 } 1776 #endif 1777 1778 { 1779 KMP_TIME_PARTITIONED_BLOCK(OMP_parallel); 1780 KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK); 1781 __kmp_invoke_microtask(microtask, gtid, 0, argc, parent_team->t.t_argv 1782 #if OMPT_SUPPORT 1783 , 1784 exit_frame_p 1785 #endif 1786 ); 1787 } 1788 1789 #if OMPT_SUPPORT 1790 if (ompt_enabled.enabled) { 1791 *exit_frame_p = NULL; 1792 if (ompt_enabled.ompt_callback_implicit_task) { 1793 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 1794 ompt_scope_end, NULL, &(task_info->task_data), 1, 1795 OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit); 1796 } 1797 *ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th); 1798 __ompt_lw_taskteam_unlink(master_th); 1799 if (ompt_enabled.ompt_callback_parallel_end) { 1800 ompt_callbacks.ompt_callback(ompt_callback_parallel_end)( 1801 ompt_parallel_data, *parent_task_data, 1802 OMPT_INVOKER(call_context) | ompt_parallel_team, *return_address); 1803 } 1804 master_th->th.ompt_thread_info.state = ompt_state_overhead; 1805 } 1806 #endif 1807 } else if (microtask == (microtask_t)__kmp_teams_master) { 1808 KMP_DEBUG_ASSERT(master_th->th.th_team == master_th->th.th_serial_team); 1809 team = master_th->th.th_team; 1810 // team->t.t_pkfn = microtask; 1811 team->t.t_invoke = invoker; 1812 __kmp_alloc_argv_entries(argc, team, TRUE); 1813 team->t.t_argc = argc; 1814 argv = (void **)team->t.t_argv; 1815 for (i = argc - 1; i >= 0; --i) 1816 *argv++ = va_arg(kmp_va_deref(ap), void *); 1817 // AC: revert change made in __kmpc_serialized_parallel() 1818 // because initial code in teams should have level=0 1819 team->t.t_level--; 1820 // AC: call special invoker for outer "parallel" of teams construct 1821 invoker(gtid); 1822 #if OMPT_SUPPORT 1823 if (ompt_enabled.enabled) { 1824 ompt_task_info_t *task_info = OMPT_CUR_TASK_INFO(master_th); 1825 if (ompt_enabled.ompt_callback_implicit_task) { 1826 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 1827 ompt_scope_end, NULL, &(task_info->task_data), 0, 1828 OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_initial); 1829 } 1830 if (ompt_enabled.ompt_callback_parallel_end) { 1831 ompt_callbacks.ompt_callback(ompt_callback_parallel_end)( 1832 ompt_parallel_data, *parent_task_data, 1833 OMPT_INVOKER(call_context) | ompt_parallel_league, 1834 *return_address); 1835 } 1836 master_th->th.ompt_thread_info.state = ompt_state_overhead; 1837 } 1838 #endif 1839 } else { 1840 argv = args; 1841 for (i = argc - 1; i >= 0; --i) 1842 *argv++ = va_arg(kmp_va_deref(ap), void *); 1843 KMP_MB(); 1844 1845 #if OMPT_SUPPORT 1846 void *dummy; 1847 void **exit_frame_p; 1848 ompt_task_info_t *task_info; 1849 ompt_lw_taskteam_t lw_taskteam; 1850 ompt_data_t *implicit_task_data; 1851 1852 if (ompt_enabled.enabled) { 1853 __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid, 1854 ompt_parallel_data, *return_address); 1855 __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0); 1856 // don't use lw_taskteam after linking. content was swaped 1857 task_info = OMPT_CUR_TASK_INFO(master_th); 1858 exit_frame_p = &(task_info->frame.exit_frame.ptr); 1859 1860 /* OMPT implicit task begin */ 1861 implicit_task_data = OMPT_CUR_TASK_DATA(master_th); 1862 if (ompt_enabled.ompt_callback_implicit_task) { 1863 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 1864 ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th), 1865 implicit_task_data, 1, __kmp_tid_from_gtid(gtid), 1866 ompt_task_implicit); 1867 OMPT_CUR_TASK_INFO(master_th)->thread_num = __kmp_tid_from_gtid(gtid); 1868 } 1869 1870 /* OMPT state */ 1871 master_th->th.ompt_thread_info.state = ompt_state_work_parallel; 1872 } else { 1873 exit_frame_p = &dummy; 1874 } 1875 #endif 1876 1877 { 1878 KMP_TIME_PARTITIONED_BLOCK(OMP_parallel); 1879 KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK); 1880 __kmp_invoke_microtask(microtask, gtid, 0, argc, args 1881 #if OMPT_SUPPORT 1882 , 1883 exit_frame_p 1884 #endif 1885 ); 1886 } 1887 1888 #if OMPT_SUPPORT 1889 if (ompt_enabled.enabled) { 1890 *exit_frame_p = NULL; 1891 if (ompt_enabled.ompt_callback_implicit_task) { 1892 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 1893 ompt_scope_end, NULL, &(task_info->task_data), 1, 1894 OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit); 1895 } 1896 1897 *ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th); 1898 __ompt_lw_taskteam_unlink(master_th); 1899 if (ompt_enabled.ompt_callback_parallel_end) { 1900 ompt_callbacks.ompt_callback(ompt_callback_parallel_end)( 1901 ompt_parallel_data, *parent_task_data, 1902 OMPT_INVOKER(call_context) | ompt_parallel_team, *return_address); 1903 } 1904 master_th->th.ompt_thread_info.state = ompt_state_overhead; 1905 } 1906 #endif 1907 } 1908 } else if (call_context == fork_context_gnu) { 1909 #if OMPT_SUPPORT 1910 if (ompt_enabled.enabled) { 1911 ompt_lw_taskteam_t lwt; 1912 __ompt_lw_taskteam_init(&lwt, master_th, gtid, ompt_parallel_data, 1913 *return_address); 1914 1915 lwt.ompt_task_info.frame.exit_frame = ompt_data_none; 1916 __ompt_lw_taskteam_link(&lwt, master_th, 1); 1917 } 1918 // don't use lw_taskteam after linking. content was swaped 1919 #endif 1920 1921 // we were called from GNU native code 1922 KA_TRACE(20, ("__kmp_serial_fork_call: T#%d serial exit\n", gtid)); 1923 return FALSE; 1924 } else { 1925 KMP_ASSERT2(call_context < fork_context_last, 1926 "__kmp_serial_fork_call: unknown fork_context parameter"); 1927 } 1928 1929 KA_TRACE(20, ("__kmp_serial_fork_call: T#%d serial exit\n", gtid)); 1930 KMP_MB(); 1931 return FALSE; 1932 } 1933 1934 /* most of the work for a fork */ 1935 /* return true if we really went parallel, false if serialized */ 1936 int __kmp_fork_call(ident_t *loc, int gtid, 1937 enum fork_context_e call_context, // Intel, GNU, ... 1938 kmp_int32 argc, microtask_t microtask, launch_t invoker, 1939 kmp_va_list ap) { 1940 void **argv; 1941 int i; 1942 int master_tid; 1943 int master_this_cons; 1944 kmp_team_t *team; 1945 kmp_team_t *parent_team; 1946 kmp_info_t *master_th; 1947 kmp_root_t *root; 1948 int nthreads; 1949 int master_active; 1950 int master_set_numthreads; 1951 int task_thread_limit = 0; 1952 int level; 1953 int active_level; 1954 int teams_level; 1955 #if KMP_NESTED_HOT_TEAMS 1956 kmp_hot_team_ptr_t **p_hot_teams; 1957 #endif 1958 { // KMP_TIME_BLOCK 1959 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_fork_call); 1960 KMP_COUNT_VALUE(OMP_PARALLEL_args, argc); 1961 1962 KA_TRACE(20, ("__kmp_fork_call: enter T#%d\n", gtid)); 1963 if (__kmp_stkpadding > 0 && __kmp_root[gtid] != NULL) { 1964 /* Some systems prefer the stack for the root thread(s) to start with */ 1965 /* some gap from the parent stack to prevent false sharing. */ 1966 void *dummy = KMP_ALLOCA(__kmp_stkpadding); 1967 /* These 2 lines below are so this does not get optimized out */ 1968 if (__kmp_stkpadding > KMP_MAX_STKPADDING) 1969 __kmp_stkpadding += (short)((kmp_int64)dummy); 1970 } 1971 1972 /* initialize if needed */ 1973 KMP_DEBUG_ASSERT( 1974 __kmp_init_serial); // AC: potentially unsafe, not in sync with shutdown 1975 if (!TCR_4(__kmp_init_parallel)) 1976 __kmp_parallel_initialize(); 1977 __kmp_resume_if_soft_paused(); 1978 1979 /* setup current data */ 1980 // AC: potentially unsafe, not in sync with library shutdown, 1981 // __kmp_threads can be freed 1982 master_th = __kmp_threads[gtid]; 1983 1984 parent_team = master_th->th.th_team; 1985 master_tid = master_th->th.th_info.ds.ds_tid; 1986 master_this_cons = master_th->th.th_local.this_construct; 1987 root = master_th->th.th_root; 1988 master_active = root->r.r_active; 1989 master_set_numthreads = master_th->th.th_set_nproc; 1990 task_thread_limit = 1991 master_th->th.th_current_task->td_icvs.task_thread_limit; 1992 1993 #if OMPT_SUPPORT 1994 ompt_data_t ompt_parallel_data = ompt_data_none; 1995 ompt_data_t *parent_task_data = NULL; 1996 ompt_frame_t *ompt_frame = NULL; 1997 void *return_address = NULL; 1998 1999 if (ompt_enabled.enabled) { 2000 __ompt_get_task_info_internal(0, NULL, &parent_task_data, &ompt_frame, 2001 NULL, NULL); 2002 return_address = OMPT_LOAD_RETURN_ADDRESS(gtid); 2003 } 2004 #endif 2005 2006 // Assign affinity to root thread if it hasn't happened yet 2007 __kmp_assign_root_init_mask(); 2008 2009 // Nested level will be an index in the nested nthreads array 2010 level = parent_team->t.t_level; 2011 // used to launch non-serial teams even if nested is not allowed 2012 active_level = parent_team->t.t_active_level; 2013 // needed to check nesting inside the teams 2014 teams_level = master_th->th.th_teams_level; 2015 #if KMP_NESTED_HOT_TEAMS 2016 p_hot_teams = &master_th->th.th_hot_teams; 2017 if (*p_hot_teams == NULL && __kmp_hot_teams_max_level > 0) { 2018 *p_hot_teams = (kmp_hot_team_ptr_t *)__kmp_allocate( 2019 sizeof(kmp_hot_team_ptr_t) * __kmp_hot_teams_max_level); 2020 (*p_hot_teams)[0].hot_team = root->r.r_hot_team; 2021 // it is either actual or not needed (when active_level > 0) 2022 (*p_hot_teams)[0].hot_team_nth = 1; 2023 } 2024 #endif 2025 2026 #if OMPT_SUPPORT 2027 if (ompt_enabled.enabled) { 2028 if (ompt_enabled.ompt_callback_parallel_begin) { 2029 int team_size = master_set_numthreads 2030 ? master_set_numthreads 2031 : get__nproc_2(parent_team, master_tid); 2032 int flags = OMPT_INVOKER(call_context) | 2033 ((microtask == (microtask_t)__kmp_teams_master) 2034 ? ompt_parallel_league 2035 : ompt_parallel_team); 2036 ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)( 2037 parent_task_data, ompt_frame, &ompt_parallel_data, team_size, flags, 2038 return_address); 2039 } 2040 master_th->th.ompt_thread_info.state = ompt_state_overhead; 2041 } 2042 #endif 2043 2044 master_th->th.th_ident = loc; 2045 2046 // Parallel closely nested in teams construct: 2047 if (__kmp_is_fork_in_teams(master_th, microtask, level, teams_level, ap)) { 2048 return __kmp_fork_in_teams(loc, gtid, parent_team, argc, master_th, root, 2049 call_context, microtask, invoker, 2050 master_set_numthreads, level, 2051 #if OMPT_SUPPORT 2052 ompt_parallel_data, return_address, 2053 #endif 2054 ap); 2055 } // End parallel closely nested in teams construct 2056 2057 // Need this to happen before we determine the number of threads, not while 2058 // we are allocating the team 2059 //__kmp_push_current_task_to_thread(master_th, parent_team, 0); 2060 2061 KMP_DEBUG_ASSERT_TASKTEAM_INVARIANT(parent_team, master_th); 2062 2063 // Determine the number of threads 2064 int enter_teams = 2065 __kmp_is_entering_teams(active_level, level, teams_level, ap); 2066 if ((!enter_teams && 2067 (parent_team->t.t_active_level >= 2068 master_th->th.th_current_task->td_icvs.max_active_levels)) || 2069 (__kmp_library == library_serial)) { 2070 KC_TRACE(10, ("__kmp_fork_call: T#%d serializing team\n", gtid)); 2071 nthreads = 1; 2072 } else { 2073 nthreads = master_set_numthreads 2074 ? master_set_numthreads 2075 // TODO: get nproc directly from current task 2076 : get__nproc_2(parent_team, master_tid); 2077 // Use the thread_limit set for the current target task if exists, else go 2078 // with the deduced nthreads 2079 nthreads = task_thread_limit > 0 && task_thread_limit < nthreads 2080 ? task_thread_limit 2081 : nthreads; 2082 // Check if we need to take forkjoin lock? (no need for serialized 2083 // parallel out of teams construct). 2084 if (nthreads > 1) { 2085 /* determine how many new threads we can use */ 2086 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock); 2087 /* AC: If we execute teams from parallel region (on host), then teams 2088 should be created but each can only have 1 thread if nesting is 2089 disabled. If teams called from serial region, then teams and their 2090 threads should be created regardless of the nesting setting. */ 2091 nthreads = __kmp_reserve_threads(root, parent_team, master_tid, 2092 nthreads, enter_teams); 2093 if (nthreads == 1) { 2094 // Free lock for single thread execution here; for multi-thread 2095 // execution it will be freed later after team of threads created 2096 // and initialized 2097 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); 2098 } 2099 } 2100 } 2101 KMP_DEBUG_ASSERT(nthreads > 0); 2102 2103 // If we temporarily changed the set number of threads then restore it now 2104 master_th->th.th_set_nproc = 0; 2105 2106 if (nthreads == 1) { 2107 return __kmp_serial_fork_call(loc, gtid, call_context, argc, microtask, 2108 invoker, master_th, parent_team, 2109 #if OMPT_SUPPORT 2110 &ompt_parallel_data, &return_address, 2111 &parent_task_data, 2112 #endif 2113 ap); 2114 } // if (nthreads == 1) 2115 2116 // GEH: only modify the executing flag in the case when not serialized 2117 // serialized case is handled in kmpc_serialized_parallel 2118 KF_TRACE(10, ("__kmp_fork_call: parent_team_aclevel=%d, master_th=%p, " 2119 "curtask=%p, curtask_max_aclevel=%d\n", 2120 parent_team->t.t_active_level, master_th, 2121 master_th->th.th_current_task, 2122 master_th->th.th_current_task->td_icvs.max_active_levels)); 2123 // TODO: GEH - cannot do this assertion because root thread not set up as 2124 // executing 2125 // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 1 ); 2126 master_th->th.th_current_task->td_flags.executing = 0; 2127 2128 if (!master_th->th.th_teams_microtask || level > teams_level) { 2129 /* Increment our nested depth level */ 2130 KMP_ATOMIC_INC(&root->r.r_in_parallel); 2131 } 2132 2133 // See if we need to make a copy of the ICVs. 2134 int nthreads_icv = master_th->th.th_current_task->td_icvs.nproc; 2135 kmp_nested_nthreads_t *nested_nth = NULL; 2136 if (!master_th->th.th_set_nested_nth && 2137 (level + 1 < parent_team->t.t_nested_nth->used) && 2138 (parent_team->t.t_nested_nth->nth[level + 1] != nthreads_icv)) { 2139 nthreads_icv = parent_team->t.t_nested_nth->nth[level + 1]; 2140 } else if (master_th->th.th_set_nested_nth) { 2141 nested_nth = __kmp_override_nested_nth(master_th, level); 2142 if ((level + 1 < nested_nth->used) && 2143 (nested_nth->nth[level + 1] != nthreads_icv)) 2144 nthreads_icv = nested_nth->nth[level + 1]; 2145 else 2146 nthreads_icv = 0; // don't update 2147 } else { 2148 nthreads_icv = 0; // don't update 2149 } 2150 2151 // Figure out the proc_bind_policy for the new team. 2152 kmp_proc_bind_t proc_bind = master_th->th.th_set_proc_bind; 2153 // proc_bind_default means don't update 2154 kmp_proc_bind_t proc_bind_icv = proc_bind_default; 2155 if (master_th->th.th_current_task->td_icvs.proc_bind == proc_bind_false) { 2156 proc_bind = proc_bind_false; 2157 } else { 2158 // No proc_bind clause specified; use current proc-bind-var for this 2159 // parallel region 2160 if (proc_bind == proc_bind_default) { 2161 proc_bind = master_th->th.th_current_task->td_icvs.proc_bind; 2162 } 2163 // Have teams construct take proc_bind value from KMP_TEAMS_PROC_BIND 2164 if (master_th->th.th_teams_microtask && 2165 microtask == (microtask_t)__kmp_teams_master) { 2166 proc_bind = __kmp_teams_proc_bind; 2167 } 2168 /* else: The proc_bind policy was specified explicitly on parallel clause. 2169 This overrides proc-bind-var for this parallel region, but does not 2170 change proc-bind-var. */ 2171 // Figure the value of proc-bind-var for the child threads. 2172 if ((level + 1 < __kmp_nested_proc_bind.used) && 2173 (__kmp_nested_proc_bind.bind_types[level + 1] != 2174 master_th->th.th_current_task->td_icvs.proc_bind)) { 2175 // Do not modify the proc bind icv for the two teams construct forks 2176 // They just let the proc bind icv pass through 2177 if (!master_th->th.th_teams_microtask || 2178 !(microtask == (microtask_t)__kmp_teams_master || ap == NULL)) 2179 proc_bind_icv = __kmp_nested_proc_bind.bind_types[level + 1]; 2180 } 2181 } 2182 2183 // Reset for next parallel region 2184 master_th->th.th_set_proc_bind = proc_bind_default; 2185 2186 if ((nthreads_icv > 0) || (proc_bind_icv != proc_bind_default)) { 2187 kmp_internal_control_t new_icvs; 2188 copy_icvs(&new_icvs, &master_th->th.th_current_task->td_icvs); 2189 new_icvs.next = NULL; 2190 if (nthreads_icv > 0) { 2191 new_icvs.nproc = nthreads_icv; 2192 } 2193 if (proc_bind_icv != proc_bind_default) { 2194 new_icvs.proc_bind = proc_bind_icv; 2195 } 2196 2197 /* allocate a new parallel team */ 2198 KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n")); 2199 team = __kmp_allocate_team(root, nthreads, nthreads, 2200 #if OMPT_SUPPORT 2201 ompt_parallel_data, 2202 #endif 2203 proc_bind, &new_icvs, 2204 argc USE_NESTED_HOT_ARG(master_th)); 2205 if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) 2206 copy_icvs((kmp_internal_control_t *)team->t.b->team_icvs, &new_icvs); 2207 } else { 2208 /* allocate a new parallel team */ 2209 KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n")); 2210 team = __kmp_allocate_team(root, nthreads, nthreads, 2211 #if OMPT_SUPPORT 2212 ompt_parallel_data, 2213 #endif 2214 proc_bind, 2215 &master_th->th.th_current_task->td_icvs, 2216 argc USE_NESTED_HOT_ARG(master_th)); 2217 if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) 2218 copy_icvs((kmp_internal_control_t *)team->t.b->team_icvs, 2219 &master_th->th.th_current_task->td_icvs); 2220 } 2221 KF_TRACE( 2222 10, ("__kmp_fork_call: after __kmp_allocate_team - team = %p\n", team)); 2223 2224 /* setup the new team */ 2225 KMP_CHECK_UPDATE(team->t.t_master_tid, master_tid); 2226 KMP_CHECK_UPDATE(team->t.t_master_this_cons, master_this_cons); 2227 KMP_CHECK_UPDATE(team->t.t_ident, loc); 2228 KMP_CHECK_UPDATE(team->t.t_parent, parent_team); 2229 KMP_CHECK_UPDATE_SYNC(team->t.t_pkfn, microtask); 2230 #if OMPT_SUPPORT 2231 KMP_CHECK_UPDATE_SYNC(team->t.ompt_team_info.master_return_address, 2232 return_address); 2233 #endif 2234 KMP_CHECK_UPDATE(team->t.t_invoke, invoker); // TODO move to root, maybe 2235 // TODO: parent_team->t.t_level == INT_MAX ??? 2236 if (!master_th->th.th_teams_microtask || level > teams_level) { 2237 int new_level = parent_team->t.t_level + 1; 2238 KMP_CHECK_UPDATE(team->t.t_level, new_level); 2239 new_level = parent_team->t.t_active_level + 1; 2240 KMP_CHECK_UPDATE(team->t.t_active_level, new_level); 2241 } else { 2242 // AC: Do not increase parallel level at start of the teams construct 2243 int new_level = parent_team->t.t_level; 2244 KMP_CHECK_UPDATE(team->t.t_level, new_level); 2245 new_level = parent_team->t.t_active_level; 2246 KMP_CHECK_UPDATE(team->t.t_active_level, new_level); 2247 } 2248 kmp_r_sched_t new_sched = get__sched_2(parent_team, master_tid); 2249 // set primary thread's schedule as new run-time schedule 2250 KMP_CHECK_UPDATE(team->t.t_sched.sched, new_sched.sched); 2251 2252 KMP_CHECK_UPDATE(team->t.t_cancel_request, cancel_noreq); 2253 KMP_CHECK_UPDATE(team->t.t_def_allocator, master_th->th.th_def_allocator); 2254 2255 // Check if hot team has potentially outdated list, and if so, free it 2256 if (team->t.t_nested_nth && 2257 team->t.t_nested_nth != parent_team->t.t_nested_nth) { 2258 KMP_INTERNAL_FREE(team->t.t_nested_nth->nth); 2259 KMP_INTERNAL_FREE(team->t.t_nested_nth); 2260 team->t.t_nested_nth = NULL; 2261 } 2262 team->t.t_nested_nth = parent_team->t.t_nested_nth; 2263 if (master_th->th.th_set_nested_nth) { 2264 if (!nested_nth) 2265 nested_nth = __kmp_override_nested_nth(master_th, level); 2266 team->t.t_nested_nth = nested_nth; 2267 KMP_INTERNAL_FREE(master_th->th.th_set_nested_nth); 2268 master_th->th.th_set_nested_nth = NULL; 2269 master_th->th.th_set_nested_nth_sz = 0; 2270 master_th->th.th_nt_strict = false; 2271 } 2272 2273 // Update the floating point rounding in the team if required. 2274 propagateFPControl(team); 2275 #if OMPD_SUPPORT 2276 if (ompd_state & OMPD_ENABLE_BP) 2277 ompd_bp_parallel_begin(); 2278 #endif 2279 2280 KA_TRACE( 2281 20, 2282 ("__kmp_fork_call: T#%d(%d:%d)->(%d:0) created a team of %d threads\n", 2283 gtid, parent_team->t.t_id, team->t.t_master_tid, team->t.t_id, 2284 team->t.t_nproc)); 2285 KMP_DEBUG_ASSERT(team != root->r.r_hot_team || 2286 (team->t.t_master_tid == 0 && 2287 (team->t.t_parent == root->r.r_root_team || 2288 team->t.t_parent->t.t_serialized))); 2289 KMP_MB(); 2290 2291 /* now, setup the arguments */ 2292 argv = (void **)team->t.t_argv; 2293 if (ap) { 2294 for (i = argc - 1; i >= 0; --i) { 2295 void *new_argv = va_arg(kmp_va_deref(ap), void *); 2296 KMP_CHECK_UPDATE(*argv, new_argv); 2297 argv++; 2298 } 2299 } else { 2300 for (i = 0; i < argc; ++i) { 2301 // Get args from parent team for teams construct 2302 KMP_CHECK_UPDATE(argv[i], team->t.t_parent->t.t_argv[i]); 2303 } 2304 } 2305 2306 /* now actually fork the threads */ 2307 KMP_CHECK_UPDATE(team->t.t_master_active, master_active); 2308 if (!root->r.r_active) // Only do assignment if it prevents cache ping-pong 2309 root->r.r_active = TRUE; 2310 2311 __kmp_fork_team_threads(root, team, master_th, gtid, !ap); 2312 __kmp_setup_icv_copy(team, nthreads, 2313 &master_th->th.th_current_task->td_icvs, loc); 2314 2315 #if OMPT_SUPPORT 2316 master_th->th.ompt_thread_info.state = ompt_state_work_parallel; 2317 #endif 2318 2319 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); 2320 2321 #if USE_ITT_BUILD 2322 if (team->t.t_active_level == 1 // only report frames at level 1 2323 && !master_th->th.th_teams_microtask) { // not in teams construct 2324 #if USE_ITT_NOTIFY 2325 if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) && 2326 (__kmp_forkjoin_frames_mode == 3 || 2327 __kmp_forkjoin_frames_mode == 1)) { 2328 kmp_uint64 tmp_time = 0; 2329 if (__itt_get_timestamp_ptr) 2330 tmp_time = __itt_get_timestamp(); 2331 // Internal fork - report frame begin 2332 master_th->th.th_frame_time = tmp_time; 2333 if (__kmp_forkjoin_frames_mode == 3) 2334 team->t.t_region_time = tmp_time; 2335 } else 2336 // only one notification scheme (either "submit" or "forking/joined", not both) 2337 #endif /* USE_ITT_NOTIFY */ 2338 if ((__itt_frame_begin_v3_ptr || KMP_ITT_DEBUG) && 2339 __kmp_forkjoin_frames && !__kmp_forkjoin_frames_mode) { 2340 // Mark start of "parallel" region for Intel(R) VTune(TM) analyzer. 2341 __kmp_itt_region_forking(gtid, team->t.t_nproc, 0); 2342 } 2343 } 2344 #endif /* USE_ITT_BUILD */ 2345 2346 /* now go on and do the work */ 2347 KMP_DEBUG_ASSERT(team == __kmp_threads[gtid]->th.th_team); 2348 KMP_MB(); 2349 KF_TRACE(10, 2350 ("__kmp_internal_fork : root=%p, team=%p, master_th=%p, gtid=%d\n", 2351 root, team, master_th, gtid)); 2352 2353 #if USE_ITT_BUILD 2354 if (__itt_stack_caller_create_ptr) { 2355 // create new stack stitching id before entering fork barrier 2356 if (!enter_teams) { 2357 KMP_DEBUG_ASSERT(team->t.t_stack_id == NULL); 2358 team->t.t_stack_id = __kmp_itt_stack_caller_create(); 2359 } else if (parent_team->t.t_serialized) { 2360 // keep stack stitching id in the serialized parent_team; 2361 // current team will be used for parallel inside the teams; 2362 // if parent_team is active, then it already keeps stack stitching id 2363 // for the league of teams 2364 KMP_DEBUG_ASSERT(parent_team->t.t_stack_id == NULL); 2365 parent_team->t.t_stack_id = __kmp_itt_stack_caller_create(); 2366 } 2367 } 2368 #endif /* USE_ITT_BUILD */ 2369 2370 // AC: skip __kmp_internal_fork at teams construct, let only primary 2371 // threads execute 2372 if (ap) { 2373 __kmp_internal_fork(loc, gtid, team); 2374 KF_TRACE(10, ("__kmp_internal_fork : after : root=%p, team=%p, " 2375 "master_th=%p, gtid=%d\n", 2376 root, team, master_th, gtid)); 2377 } 2378 2379 if (call_context == fork_context_gnu) { 2380 KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid)); 2381 return TRUE; 2382 } 2383 2384 /* Invoke microtask for PRIMARY thread */ 2385 KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n", gtid, 2386 team->t.t_id, team->t.t_pkfn)); 2387 } // END of timer KMP_fork_call block 2388 2389 #if KMP_STATS_ENABLED 2390 // If beginning a teams construct, then change thread state 2391 stats_state_e previous_state = KMP_GET_THREAD_STATE(); 2392 if (!ap) { 2393 KMP_SET_THREAD_STATE(stats_state_e::TEAMS_REGION); 2394 } 2395 #endif 2396 2397 if (!team->t.t_invoke(gtid)) { 2398 KMP_ASSERT2(0, "cannot invoke microtask for PRIMARY thread"); 2399 } 2400 2401 #if KMP_STATS_ENABLED 2402 // If was beginning of a teams construct, then reset thread state 2403 if (!ap) { 2404 KMP_SET_THREAD_STATE(previous_state); 2405 } 2406 #endif 2407 2408 KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n", gtid, 2409 team->t.t_id, team->t.t_pkfn)); 2410 KMP_MB(); /* Flush all pending memory write invalidates. */ 2411 2412 KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid)); 2413 #if OMPT_SUPPORT 2414 if (ompt_enabled.enabled) { 2415 master_th->th.ompt_thread_info.state = ompt_state_overhead; 2416 } 2417 #endif 2418 2419 return TRUE; 2420 } 2421 2422 #if OMPT_SUPPORT 2423 static inline void __kmp_join_restore_state(kmp_info_t *thread, 2424 kmp_team_t *team) { 2425 // restore state outside the region 2426 thread->th.ompt_thread_info.state = 2427 ((team->t.t_serialized) ? ompt_state_work_serial 2428 : ompt_state_work_parallel); 2429 } 2430 2431 static inline void __kmp_join_ompt(int gtid, kmp_info_t *thread, 2432 kmp_team_t *team, ompt_data_t *parallel_data, 2433 int flags, void *codeptr) { 2434 ompt_task_info_t *task_info = __ompt_get_task_info_object(0); 2435 if (ompt_enabled.ompt_callback_parallel_end) { 2436 ompt_callbacks.ompt_callback(ompt_callback_parallel_end)( 2437 parallel_data, &(task_info->task_data), flags, codeptr); 2438 } 2439 2440 task_info->frame.enter_frame = ompt_data_none; 2441 __kmp_join_restore_state(thread, team); 2442 } 2443 #endif 2444 2445 void __kmp_join_call(ident_t *loc, int gtid 2446 #if OMPT_SUPPORT 2447 , 2448 enum fork_context_e fork_context 2449 #endif 2450 , 2451 int exit_teams) { 2452 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_join_call); 2453 kmp_team_t *team; 2454 kmp_team_t *parent_team; 2455 kmp_info_t *master_th; 2456 kmp_root_t *root; 2457 int master_active; 2458 2459 KA_TRACE(20, ("__kmp_join_call: enter T#%d\n", gtid)); 2460 2461 /* setup current data */ 2462 master_th = __kmp_threads[gtid]; 2463 root = master_th->th.th_root; 2464 team = master_th->th.th_team; 2465 parent_team = team->t.t_parent; 2466 2467 master_th->th.th_ident = loc; 2468 2469 #if OMPT_SUPPORT 2470 void *team_microtask = (void *)team->t.t_pkfn; 2471 // For GOMP interface with serialized parallel, need the 2472 // __kmpc_end_serialized_parallel to call hooks for OMPT end-implicit-task 2473 // and end-parallel events. 2474 if (ompt_enabled.enabled && 2475 !(team->t.t_serialized && fork_context == fork_context_gnu)) { 2476 master_th->th.ompt_thread_info.state = ompt_state_overhead; 2477 } 2478 #endif 2479 2480 #if KMP_DEBUG 2481 if (__kmp_tasking_mode != tskm_immediate_exec && !exit_teams) { 2482 KA_TRACE(20, ("__kmp_join_call: T#%d, old team = %p old task_team = %p, " 2483 "th_task_team = %p\n", 2484 __kmp_gtid_from_thread(master_th), team, 2485 team->t.t_task_team[master_th->th.th_task_state], 2486 master_th->th.th_task_team)); 2487 KMP_DEBUG_ASSERT_TASKTEAM_INVARIANT(team, master_th); 2488 } 2489 #endif 2490 2491 if (team->t.t_serialized) { 2492 if (master_th->th.th_teams_microtask) { 2493 // We are in teams construct 2494 int level = team->t.t_level; 2495 int tlevel = master_th->th.th_teams_level; 2496 if (level == tlevel) { 2497 // AC: we haven't incremented it earlier at start of teams construct, 2498 // so do it here - at the end of teams construct 2499 team->t.t_level++; 2500 } else if (level == tlevel + 1) { 2501 // AC: we are exiting parallel inside teams, need to increment 2502 // serialization in order to restore it in the next call to 2503 // __kmpc_end_serialized_parallel 2504 team->t.t_serialized++; 2505 } 2506 } 2507 __kmpc_end_serialized_parallel(loc, gtid); 2508 2509 #if OMPT_SUPPORT 2510 if (ompt_enabled.enabled) { 2511 if (fork_context == fork_context_gnu) { 2512 __ompt_lw_taskteam_unlink(master_th); 2513 } 2514 __kmp_join_restore_state(master_th, parent_team); 2515 } 2516 #endif 2517 2518 return; 2519 } 2520 2521 master_active = team->t.t_master_active; 2522 2523 if (!exit_teams) { 2524 // AC: No barrier for internal teams at exit from teams construct. 2525 // But there is barrier for external team (league). 2526 __kmp_internal_join(loc, gtid, team); 2527 #if USE_ITT_BUILD 2528 if (__itt_stack_caller_create_ptr) { 2529 KMP_DEBUG_ASSERT(team->t.t_stack_id != NULL); 2530 // destroy the stack stitching id after join barrier 2531 __kmp_itt_stack_caller_destroy((__itt_caller)team->t.t_stack_id); 2532 team->t.t_stack_id = NULL; 2533 } 2534 #endif 2535 } else { 2536 master_th->th.th_task_state = 2537 0; // AC: no tasking in teams (out of any parallel) 2538 #if USE_ITT_BUILD 2539 if (__itt_stack_caller_create_ptr && parent_team->t.t_serialized) { 2540 KMP_DEBUG_ASSERT(parent_team->t.t_stack_id != NULL); 2541 // destroy the stack stitching id on exit from the teams construct 2542 // if parent_team is active, then the id will be destroyed later on 2543 // by master of the league of teams 2544 __kmp_itt_stack_caller_destroy((__itt_caller)parent_team->t.t_stack_id); 2545 parent_team->t.t_stack_id = NULL; 2546 } 2547 #endif 2548 } 2549 2550 KMP_MB(); 2551 2552 #if OMPT_SUPPORT 2553 ompt_data_t *parallel_data = &(team->t.ompt_team_info.parallel_data); 2554 void *codeptr = team->t.ompt_team_info.master_return_address; 2555 #endif 2556 2557 #if USE_ITT_BUILD 2558 // Mark end of "parallel" region for Intel(R) VTune(TM) analyzer. 2559 if (team->t.t_active_level == 1 && 2560 (!master_th->th.th_teams_microtask || /* not in teams construct */ 2561 master_th->th.th_teams_size.nteams == 1)) { 2562 master_th->th.th_ident = loc; 2563 // only one notification scheme (either "submit" or "forking/joined", not 2564 // both) 2565 if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) && 2566 __kmp_forkjoin_frames_mode == 3) 2567 __kmp_itt_frame_submit(gtid, team->t.t_region_time, 2568 master_th->th.th_frame_time, 0, loc, 2569 master_th->th.th_team_nproc, 1); 2570 else if ((__itt_frame_end_v3_ptr || KMP_ITT_DEBUG) && 2571 !__kmp_forkjoin_frames_mode && __kmp_forkjoin_frames) 2572 __kmp_itt_region_joined(gtid); 2573 } // active_level == 1 2574 #endif /* USE_ITT_BUILD */ 2575 2576 #if KMP_AFFINITY_SUPPORTED 2577 if (!exit_teams) { 2578 // Restore master thread's partition. 2579 master_th->th.th_first_place = team->t.t_first_place; 2580 master_th->th.th_last_place = team->t.t_last_place; 2581 } 2582 #endif // KMP_AFFINITY_SUPPORTED 2583 2584 if (master_th->th.th_teams_microtask && !exit_teams && 2585 team->t.t_pkfn != (microtask_t)__kmp_teams_master && 2586 team->t.t_level == master_th->th.th_teams_level + 1) { 2587 // AC: We need to leave the team structure intact at the end of parallel 2588 // inside the teams construct, so that at the next parallel same (hot) team 2589 // works, only adjust nesting levels 2590 #if OMPT_SUPPORT 2591 ompt_data_t ompt_parallel_data = ompt_data_none; 2592 if (ompt_enabled.enabled) { 2593 ompt_task_info_t *task_info = __ompt_get_task_info_object(0); 2594 if (ompt_enabled.ompt_callback_implicit_task) { 2595 int ompt_team_size = team->t.t_nproc; 2596 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 2597 ompt_scope_end, NULL, &(task_info->task_data), ompt_team_size, 2598 OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit); 2599 } 2600 task_info->frame.exit_frame = ompt_data_none; 2601 task_info->task_data = ompt_data_none; 2602 ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th); 2603 __ompt_lw_taskteam_unlink(master_th); 2604 } 2605 #endif 2606 /* Decrement our nested depth level */ 2607 team->t.t_level--; 2608 team->t.t_active_level--; 2609 KMP_ATOMIC_DEC(&root->r.r_in_parallel); 2610 2611 // Restore number of threads in the team if needed. This code relies on 2612 // the proper adjustment of th_teams_size.nth after the fork in 2613 // __kmp_teams_master on each teams primary thread in the case that 2614 // __kmp_reserve_threads reduced it. 2615 if (master_th->th.th_team_nproc < master_th->th.th_teams_size.nth) { 2616 int old_num = master_th->th.th_team_nproc; 2617 int new_num = master_th->th.th_teams_size.nth; 2618 kmp_info_t **other_threads = team->t.t_threads; 2619 team->t.t_nproc = new_num; 2620 for (int i = 0; i < old_num; ++i) { 2621 other_threads[i]->th.th_team_nproc = new_num; 2622 } 2623 // Adjust states of non-used threads of the team 2624 for (int i = old_num; i < new_num; ++i) { 2625 // Re-initialize thread's barrier data. 2626 KMP_DEBUG_ASSERT(other_threads[i]); 2627 kmp_balign_t *balign = other_threads[i]->th.th_bar; 2628 for (int b = 0; b < bs_last_barrier; ++b) { 2629 balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived; 2630 KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG); 2631 #if USE_DEBUGGER 2632 balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived; 2633 #endif 2634 } 2635 if (__kmp_tasking_mode != tskm_immediate_exec) { 2636 // Synchronize thread's task state 2637 other_threads[i]->th.th_task_state = master_th->th.th_task_state; 2638 } 2639 } 2640 } 2641 2642 #if OMPT_SUPPORT 2643 if (ompt_enabled.enabled) { 2644 __kmp_join_ompt(gtid, master_th, parent_team, &ompt_parallel_data, 2645 OMPT_INVOKER(fork_context) | ompt_parallel_team, codeptr); 2646 } 2647 #endif 2648 2649 return; 2650 } 2651 2652 /* do cleanup and restore the parent team */ 2653 master_th->th.th_info.ds.ds_tid = team->t.t_master_tid; 2654 master_th->th.th_local.this_construct = team->t.t_master_this_cons; 2655 2656 master_th->th.th_dispatch = &parent_team->t.t_dispatch[team->t.t_master_tid]; 2657 2658 /* jc: The following lock has instructions with REL and ACQ semantics, 2659 separating the parallel user code called in this parallel region 2660 from the serial user code called after this function returns. */ 2661 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock); 2662 2663 if (!master_th->th.th_teams_microtask || 2664 team->t.t_level > master_th->th.th_teams_level) { 2665 /* Decrement our nested depth level */ 2666 KMP_ATOMIC_DEC(&root->r.r_in_parallel); 2667 } 2668 KMP_DEBUG_ASSERT(root->r.r_in_parallel >= 0); 2669 2670 #if OMPT_SUPPORT 2671 if (ompt_enabled.enabled) { 2672 ompt_task_info_t *task_info = __ompt_get_task_info_object(0); 2673 if (ompt_enabled.ompt_callback_implicit_task) { 2674 int flags = (team_microtask == (void *)__kmp_teams_master) 2675 ? ompt_task_initial 2676 : ompt_task_implicit; 2677 int ompt_team_size = (flags == ompt_task_initial) ? 0 : team->t.t_nproc; 2678 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 2679 ompt_scope_end, NULL, &(task_info->task_data), ompt_team_size, 2680 OMPT_CUR_TASK_INFO(master_th)->thread_num, flags); 2681 } 2682 task_info->frame.exit_frame = ompt_data_none; 2683 task_info->task_data = ompt_data_none; 2684 } 2685 #endif 2686 2687 KF_TRACE(10, ("__kmp_join_call1: T#%d, this_thread=%p team=%p\n", 0, 2688 master_th, team)); 2689 __kmp_pop_current_task_from_thread(master_th); 2690 2691 master_th->th.th_def_allocator = team->t.t_def_allocator; 2692 2693 #if OMPD_SUPPORT 2694 if (ompd_state & OMPD_ENABLE_BP) 2695 ompd_bp_parallel_end(); 2696 #endif 2697 updateHWFPControl(team); 2698 2699 if (root->r.r_active != master_active) 2700 root->r.r_active = master_active; 2701 2702 __kmp_free_team(root, team USE_NESTED_HOT_ARG( 2703 master_th)); // this will free worker threads 2704 2705 /* this race was fun to find. make sure the following is in the critical 2706 region otherwise assertions may fail occasionally since the old team may be 2707 reallocated and the hierarchy appears inconsistent. it is actually safe to 2708 run and won't cause any bugs, but will cause those assertion failures. it's 2709 only one deref&assign so might as well put this in the critical region */ 2710 master_th->th.th_team = parent_team; 2711 master_th->th.th_team_nproc = parent_team->t.t_nproc; 2712 master_th->th.th_team_master = parent_team->t.t_threads[0]; 2713 master_th->th.th_team_serialized = parent_team->t.t_serialized; 2714 2715 /* restore serialized team, if need be */ 2716 if (parent_team->t.t_serialized && 2717 parent_team != master_th->th.th_serial_team && 2718 parent_team != root->r.r_root_team) { 2719 __kmp_free_team(root, 2720 master_th->th.th_serial_team USE_NESTED_HOT_ARG(NULL)); 2721 master_th->th.th_serial_team = parent_team; 2722 } 2723 2724 if (__kmp_tasking_mode != tskm_immediate_exec) { 2725 // Restore primary thread's task state from team structure 2726 KMP_DEBUG_ASSERT(team->t.t_primary_task_state == 0 || 2727 team->t.t_primary_task_state == 1); 2728 master_th->th.th_task_state = (kmp_uint8)team->t.t_primary_task_state; 2729 2730 // Copy the task team from the parent team to the primary thread 2731 master_th->th.th_task_team = 2732 parent_team->t.t_task_team[master_th->th.th_task_state]; 2733 KA_TRACE(20, 2734 ("__kmp_join_call: Primary T#%d restoring task_team %p, team %p\n", 2735 __kmp_gtid_from_thread(master_th), master_th->th.th_task_team, 2736 parent_team)); 2737 } 2738 2739 // TODO: GEH - cannot do this assertion because root thread not set up as 2740 // executing 2741 // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 0 ); 2742 master_th->th.th_current_task->td_flags.executing = 1; 2743 2744 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); 2745 2746 #if KMP_AFFINITY_SUPPORTED 2747 if (master_th->th.th_team->t.t_level == 0 && __kmp_affinity.flags.reset) { 2748 __kmp_reset_root_init_mask(gtid); 2749 } 2750 #endif 2751 #if OMPT_SUPPORT 2752 int flags = 2753 OMPT_INVOKER(fork_context) | 2754 ((team_microtask == (void *)__kmp_teams_master) ? ompt_parallel_league 2755 : ompt_parallel_team); 2756 if (ompt_enabled.enabled) { 2757 __kmp_join_ompt(gtid, master_th, parent_team, parallel_data, flags, 2758 codeptr); 2759 } 2760 #endif 2761 2762 KMP_MB(); 2763 KA_TRACE(20, ("__kmp_join_call: exit T#%d\n", gtid)); 2764 } 2765 2766 /* Check whether we should push an internal control record onto the 2767 serial team stack. If so, do it. */ 2768 void __kmp_save_internal_controls(kmp_info_t *thread) { 2769 2770 if (thread->th.th_team != thread->th.th_serial_team) { 2771 return; 2772 } 2773 if (thread->th.th_team->t.t_serialized > 1) { 2774 int push = 0; 2775 2776 if (thread->th.th_team->t.t_control_stack_top == NULL) { 2777 push = 1; 2778 } else { 2779 if (thread->th.th_team->t.t_control_stack_top->serial_nesting_level != 2780 thread->th.th_team->t.t_serialized) { 2781 push = 1; 2782 } 2783 } 2784 if (push) { /* push a record on the serial team's stack */ 2785 kmp_internal_control_t *control = 2786 (kmp_internal_control_t *)__kmp_allocate( 2787 sizeof(kmp_internal_control_t)); 2788 2789 copy_icvs(control, &thread->th.th_current_task->td_icvs); 2790 2791 control->serial_nesting_level = thread->th.th_team->t.t_serialized; 2792 2793 control->next = thread->th.th_team->t.t_control_stack_top; 2794 thread->th.th_team->t.t_control_stack_top = control; 2795 } 2796 } 2797 } 2798 2799 /* Changes set_nproc */ 2800 void __kmp_set_num_threads(int new_nth, int gtid) { 2801 kmp_info_t *thread; 2802 kmp_root_t *root; 2803 2804 KF_TRACE(10, ("__kmp_set_num_threads: new __kmp_nth = %d\n", new_nth)); 2805 KMP_DEBUG_ASSERT(__kmp_init_serial); 2806 2807 if (new_nth < 1) 2808 new_nth = 1; 2809 else if (new_nth > __kmp_max_nth) 2810 new_nth = __kmp_max_nth; 2811 2812 KMP_COUNT_VALUE(OMP_set_numthreads, new_nth); 2813 thread = __kmp_threads[gtid]; 2814 if (thread->th.th_current_task->td_icvs.nproc == new_nth) 2815 return; // nothing to do 2816 2817 __kmp_save_internal_controls(thread); 2818 2819 set__nproc(thread, new_nth); 2820 2821 // If this omp_set_num_threads() call will cause the hot team size to be 2822 // reduced (in the absence of a num_threads clause), then reduce it now, 2823 // rather than waiting for the next parallel region. 2824 root = thread->th.th_root; 2825 if (__kmp_init_parallel && (!root->r.r_active) && 2826 (root->r.r_hot_team->t.t_nproc > new_nth) 2827 #if KMP_NESTED_HOT_TEAMS 2828 && __kmp_hot_teams_max_level && !__kmp_hot_teams_mode 2829 #endif 2830 ) { 2831 kmp_team_t *hot_team = root->r.r_hot_team; 2832 int f; 2833 2834 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock); 2835 2836 if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) { 2837 __kmp_resize_dist_barrier(hot_team, hot_team->t.t_nproc, new_nth); 2838 } 2839 // Release the extra threads we don't need any more. 2840 for (f = new_nth; f < hot_team->t.t_nproc; f++) { 2841 KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL); 2842 if (__kmp_tasking_mode != tskm_immediate_exec) { 2843 // When decreasing team size, threads no longer in the team should unref 2844 // task team. 2845 hot_team->t.t_threads[f]->th.th_task_team = NULL; 2846 } 2847 __kmp_free_thread(hot_team->t.t_threads[f]); 2848 hot_team->t.t_threads[f] = NULL; 2849 } 2850 hot_team->t.t_nproc = new_nth; 2851 #if KMP_NESTED_HOT_TEAMS 2852 if (thread->th.th_hot_teams) { 2853 KMP_DEBUG_ASSERT(hot_team == thread->th.th_hot_teams[0].hot_team); 2854 thread->th.th_hot_teams[0].hot_team_nth = new_nth; 2855 } 2856 #endif 2857 2858 if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) { 2859 hot_team->t.b->update_num_threads(new_nth); 2860 __kmp_add_threads_to_team(hot_team, new_nth); 2861 } 2862 2863 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); 2864 2865 // Update the t_nproc field in the threads that are still active. 2866 for (f = 0; f < new_nth; f++) { 2867 KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL); 2868 hot_team->t.t_threads[f]->th.th_team_nproc = new_nth; 2869 } 2870 // Special flag in case omp_set_num_threads() call 2871 hot_team->t.t_size_changed = -1; 2872 } 2873 } 2874 2875 /* Changes max_active_levels */ 2876 void __kmp_set_max_active_levels(int gtid, int max_active_levels) { 2877 kmp_info_t *thread; 2878 2879 KF_TRACE(10, ("__kmp_set_max_active_levels: new max_active_levels for thread " 2880 "%d = (%d)\n", 2881 gtid, max_active_levels)); 2882 KMP_DEBUG_ASSERT(__kmp_init_serial); 2883 2884 // validate max_active_levels 2885 if (max_active_levels < 0) { 2886 KMP_WARNING(ActiveLevelsNegative, max_active_levels); 2887 // We ignore this call if the user has specified a negative value. 2888 // The current setting won't be changed. The last valid setting will be 2889 // used. A warning will be issued (if warnings are allowed as controlled by 2890 // the KMP_WARNINGS env var). 2891 KF_TRACE(10, ("__kmp_set_max_active_levels: the call is ignored: new " 2892 "max_active_levels for thread %d = (%d)\n", 2893 gtid, max_active_levels)); 2894 return; 2895 } 2896 if (max_active_levels <= KMP_MAX_ACTIVE_LEVELS_LIMIT) { 2897 // it's OK, the max_active_levels is within the valid range: [ 0; 2898 // KMP_MAX_ACTIVE_LEVELS_LIMIT ] 2899 // We allow a zero value. (implementation defined behavior) 2900 } else { 2901 KMP_WARNING(ActiveLevelsExceedLimit, max_active_levels, 2902 KMP_MAX_ACTIVE_LEVELS_LIMIT); 2903 max_active_levels = KMP_MAX_ACTIVE_LEVELS_LIMIT; 2904 // Current upper limit is MAX_INT. (implementation defined behavior) 2905 // If the input exceeds the upper limit, we correct the input to be the 2906 // upper limit. (implementation defined behavior) 2907 // Actually, the flow should never get here until we use MAX_INT limit. 2908 } 2909 KF_TRACE(10, ("__kmp_set_max_active_levels: after validation: new " 2910 "max_active_levels for thread %d = (%d)\n", 2911 gtid, max_active_levels)); 2912 2913 thread = __kmp_threads[gtid]; 2914 2915 __kmp_save_internal_controls(thread); 2916 2917 set__max_active_levels(thread, max_active_levels); 2918 } 2919 2920 /* Gets max_active_levels */ 2921 int __kmp_get_max_active_levels(int gtid) { 2922 kmp_info_t *thread; 2923 2924 KF_TRACE(10, ("__kmp_get_max_active_levels: thread %d\n", gtid)); 2925 KMP_DEBUG_ASSERT(__kmp_init_serial); 2926 2927 thread = __kmp_threads[gtid]; 2928 KMP_DEBUG_ASSERT(thread->th.th_current_task); 2929 KF_TRACE(10, ("__kmp_get_max_active_levels: thread %d, curtask=%p, " 2930 "curtask_maxaclevel=%d\n", 2931 gtid, thread->th.th_current_task, 2932 thread->th.th_current_task->td_icvs.max_active_levels)); 2933 return thread->th.th_current_task->td_icvs.max_active_levels; 2934 } 2935 2936 // nteams-var per-device ICV 2937 void __kmp_set_num_teams(int num_teams) { 2938 if (num_teams > 0) 2939 __kmp_nteams = num_teams; 2940 } 2941 int __kmp_get_max_teams(void) { return __kmp_nteams; } 2942 // teams-thread-limit-var per-device ICV 2943 void __kmp_set_teams_thread_limit(int limit) { 2944 if (limit > 0) 2945 __kmp_teams_thread_limit = limit; 2946 } 2947 int __kmp_get_teams_thread_limit(void) { return __kmp_teams_thread_limit; } 2948 2949 KMP_BUILD_ASSERT(sizeof(kmp_sched_t) == sizeof(int)); 2950 KMP_BUILD_ASSERT(sizeof(enum sched_type) == sizeof(int)); 2951 2952 /* Changes def_sched_var ICV values (run-time schedule kind and chunk) */ 2953 void __kmp_set_schedule(int gtid, kmp_sched_t kind, int chunk) { 2954 kmp_info_t *thread; 2955 kmp_sched_t orig_kind; 2956 // kmp_team_t *team; 2957 2958 KF_TRACE(10, ("__kmp_set_schedule: new schedule for thread %d = (%d, %d)\n", 2959 gtid, (int)kind, chunk)); 2960 KMP_DEBUG_ASSERT(__kmp_init_serial); 2961 2962 // Check if the kind parameter is valid, correct if needed. 2963 // Valid parameters should fit in one of two intervals - standard or extended: 2964 // <lower>, <valid>, <upper_std>, <lower_ext>, <valid>, <upper> 2965 // 2008-01-25: 0, 1 - 4, 5, 100, 101 - 102, 103 2966 orig_kind = kind; 2967 kind = __kmp_sched_without_mods(kind); 2968 2969 if (kind <= kmp_sched_lower || kind >= kmp_sched_upper || 2970 (kind <= kmp_sched_lower_ext && kind >= kmp_sched_upper_std)) { 2971 // TODO: Hint needs attention in case we change the default schedule. 2972 __kmp_msg(kmp_ms_warning, KMP_MSG(ScheduleKindOutOfRange, kind), 2973 KMP_HNT(DefaultScheduleKindUsed, "static, no chunk"), 2974 __kmp_msg_null); 2975 kind = kmp_sched_default; 2976 chunk = 0; // ignore chunk value in case of bad kind 2977 } 2978 2979 thread = __kmp_threads[gtid]; 2980 2981 __kmp_save_internal_controls(thread); 2982 2983 if (kind < kmp_sched_upper_std) { 2984 if (kind == kmp_sched_static && chunk < KMP_DEFAULT_CHUNK) { 2985 // differ static chunked vs. unchunked: chunk should be invalid to 2986 // indicate unchunked schedule (which is the default) 2987 thread->th.th_current_task->td_icvs.sched.r_sched_type = kmp_sch_static; 2988 } else { 2989 thread->th.th_current_task->td_icvs.sched.r_sched_type = 2990 __kmp_sch_map[kind - kmp_sched_lower - 1]; 2991 } 2992 } else { 2993 // __kmp_sch_map[ kind - kmp_sched_lower_ext + kmp_sched_upper_std - 2994 // kmp_sched_lower - 2 ]; 2995 thread->th.th_current_task->td_icvs.sched.r_sched_type = 2996 __kmp_sch_map[kind - kmp_sched_lower_ext + kmp_sched_upper_std - 2997 kmp_sched_lower - 2]; 2998 } 2999 __kmp_sched_apply_mods_intkind( 3000 orig_kind, &(thread->th.th_current_task->td_icvs.sched.r_sched_type)); 3001 if (kind == kmp_sched_auto || chunk < 1) { 3002 // ignore parameter chunk for schedule auto 3003 thread->th.th_current_task->td_icvs.sched.chunk = KMP_DEFAULT_CHUNK; 3004 } else { 3005 thread->th.th_current_task->td_icvs.sched.chunk = chunk; 3006 } 3007 } 3008 3009 /* Gets def_sched_var ICV values */ 3010 void __kmp_get_schedule(int gtid, kmp_sched_t *kind, int *chunk) { 3011 kmp_info_t *thread; 3012 enum sched_type th_type; 3013 3014 KF_TRACE(10, ("__kmp_get_schedule: thread %d\n", gtid)); 3015 KMP_DEBUG_ASSERT(__kmp_init_serial); 3016 3017 thread = __kmp_threads[gtid]; 3018 3019 th_type = thread->th.th_current_task->td_icvs.sched.r_sched_type; 3020 switch (SCHEDULE_WITHOUT_MODIFIERS(th_type)) { 3021 case kmp_sch_static: 3022 case kmp_sch_static_greedy: 3023 case kmp_sch_static_balanced: 3024 *kind = kmp_sched_static; 3025 __kmp_sched_apply_mods_stdkind(kind, th_type); 3026 *chunk = 0; // chunk was not set, try to show this fact via zero value 3027 return; 3028 case kmp_sch_static_chunked: 3029 *kind = kmp_sched_static; 3030 break; 3031 case kmp_sch_dynamic_chunked: 3032 *kind = kmp_sched_dynamic; 3033 break; 3034 case kmp_sch_guided_chunked: 3035 case kmp_sch_guided_iterative_chunked: 3036 case kmp_sch_guided_analytical_chunked: 3037 *kind = kmp_sched_guided; 3038 break; 3039 case kmp_sch_auto: 3040 *kind = kmp_sched_auto; 3041 break; 3042 case kmp_sch_trapezoidal: 3043 *kind = kmp_sched_trapezoidal; 3044 break; 3045 #if KMP_STATIC_STEAL_ENABLED 3046 case kmp_sch_static_steal: 3047 *kind = kmp_sched_static_steal; 3048 break; 3049 #endif 3050 default: 3051 KMP_FATAL(UnknownSchedulingType, th_type); 3052 } 3053 3054 __kmp_sched_apply_mods_stdkind(kind, th_type); 3055 *chunk = thread->th.th_current_task->td_icvs.sched.chunk; 3056 } 3057 3058 int __kmp_get_ancestor_thread_num(int gtid, int level) { 3059 3060 int ii, dd; 3061 kmp_team_t *team; 3062 kmp_info_t *thr; 3063 3064 KF_TRACE(10, ("__kmp_get_ancestor_thread_num: thread %d %d\n", gtid, level)); 3065 KMP_DEBUG_ASSERT(__kmp_init_serial); 3066 3067 // validate level 3068 if (level == 0) 3069 return 0; 3070 if (level < 0) 3071 return -1; 3072 thr = __kmp_threads[gtid]; 3073 team = thr->th.th_team; 3074 ii = team->t.t_level; 3075 if (level > ii) 3076 return -1; 3077 3078 if (thr->th.th_teams_microtask) { 3079 // AC: we are in teams region where multiple nested teams have same level 3080 int tlevel = thr->th.th_teams_level; // the level of the teams construct 3081 if (level <= 3082 tlevel) { // otherwise usual algorithm works (will not touch the teams) 3083 KMP_DEBUG_ASSERT(ii >= tlevel); 3084 // AC: As we need to pass by the teams league, we need to artificially 3085 // increase ii 3086 if (ii == tlevel) { 3087 ii += 2; // three teams have same level 3088 } else { 3089 ii++; // two teams have same level 3090 } 3091 } 3092 } 3093 3094 if (ii == level) 3095 return __kmp_tid_from_gtid(gtid); 3096 3097 dd = team->t.t_serialized; 3098 level++; 3099 while (ii > level) { 3100 for (dd = team->t.t_serialized; (dd > 0) && (ii > level); dd--, ii--) { 3101 } 3102 if ((team->t.t_serialized) && (!dd)) { 3103 team = team->t.t_parent; 3104 continue; 3105 } 3106 if (ii > level) { 3107 team = team->t.t_parent; 3108 dd = team->t.t_serialized; 3109 ii--; 3110 } 3111 } 3112 3113 return (dd > 1) ? (0) : (team->t.t_master_tid); 3114 } 3115 3116 int __kmp_get_team_size(int gtid, int level) { 3117 3118 int ii, dd; 3119 kmp_team_t *team; 3120 kmp_info_t *thr; 3121 3122 KF_TRACE(10, ("__kmp_get_team_size: thread %d %d\n", gtid, level)); 3123 KMP_DEBUG_ASSERT(__kmp_init_serial); 3124 3125 // validate level 3126 if (level == 0) 3127 return 1; 3128 if (level < 0) 3129 return -1; 3130 thr = __kmp_threads[gtid]; 3131 team = thr->th.th_team; 3132 ii = team->t.t_level; 3133 if (level > ii) 3134 return -1; 3135 3136 if (thr->th.th_teams_microtask) { 3137 // AC: we are in teams region where multiple nested teams have same level 3138 int tlevel = thr->th.th_teams_level; // the level of the teams construct 3139 if (level <= 3140 tlevel) { // otherwise usual algorithm works (will not touch the teams) 3141 KMP_DEBUG_ASSERT(ii >= tlevel); 3142 // AC: As we need to pass by the teams league, we need to artificially 3143 // increase ii 3144 if (ii == tlevel) { 3145 ii += 2; // three teams have same level 3146 } else { 3147 ii++; // two teams have same level 3148 } 3149 } 3150 } 3151 3152 while (ii > level) { 3153 for (dd = team->t.t_serialized; (dd > 0) && (ii > level); dd--, ii--) { 3154 } 3155 if (team->t.t_serialized && (!dd)) { 3156 team = team->t.t_parent; 3157 continue; 3158 } 3159 if (ii > level) { 3160 team = team->t.t_parent; 3161 ii--; 3162 } 3163 } 3164 3165 return team->t.t_nproc; 3166 } 3167 3168 kmp_r_sched_t __kmp_get_schedule_global() { 3169 // This routine created because pairs (__kmp_sched, __kmp_chunk) and 3170 // (__kmp_static, __kmp_guided) may be changed by kmp_set_defaults 3171 // independently. So one can get the updated schedule here. 3172 3173 kmp_r_sched_t r_sched; 3174 3175 // create schedule from 4 globals: __kmp_sched, __kmp_chunk, __kmp_static, 3176 // __kmp_guided. __kmp_sched should keep original value, so that user can set 3177 // KMP_SCHEDULE multiple times, and thus have different run-time schedules in 3178 // different roots (even in OMP 2.5) 3179 enum sched_type s = SCHEDULE_WITHOUT_MODIFIERS(__kmp_sched); 3180 enum sched_type sched_modifiers = SCHEDULE_GET_MODIFIERS(__kmp_sched); 3181 if (s == kmp_sch_static) { 3182 // replace STATIC with more detailed schedule (balanced or greedy) 3183 r_sched.r_sched_type = __kmp_static; 3184 } else if (s == kmp_sch_guided_chunked) { 3185 // replace GUIDED with more detailed schedule (iterative or analytical) 3186 r_sched.r_sched_type = __kmp_guided; 3187 } else { // (STATIC_CHUNKED), or (DYNAMIC_CHUNKED), or other 3188 r_sched.r_sched_type = __kmp_sched; 3189 } 3190 SCHEDULE_SET_MODIFIERS(r_sched.r_sched_type, sched_modifiers); 3191 3192 if (__kmp_chunk < KMP_DEFAULT_CHUNK) { 3193 // __kmp_chunk may be wrong here (if it was not ever set) 3194 r_sched.chunk = KMP_DEFAULT_CHUNK; 3195 } else { 3196 r_sched.chunk = __kmp_chunk; 3197 } 3198 3199 return r_sched; 3200 } 3201 3202 /* Allocate (realloc == FALSE) * or reallocate (realloc == TRUE) 3203 at least argc number of *t_argv entries for the requested team. */ 3204 static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team, int realloc) { 3205 3206 KMP_DEBUG_ASSERT(team); 3207 if (!realloc || argc > team->t.t_max_argc) { 3208 3209 KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: needed entries=%d, " 3210 "current entries=%d\n", 3211 team->t.t_id, argc, (realloc) ? team->t.t_max_argc : 0)); 3212 /* if previously allocated heap space for args, free them */ 3213 if (realloc && team->t.t_argv != &team->t.t_inline_argv[0]) 3214 __kmp_free((void *)team->t.t_argv); 3215 3216 if (argc <= KMP_INLINE_ARGV_ENTRIES) { 3217 /* use unused space in the cache line for arguments */ 3218 team->t.t_max_argc = KMP_INLINE_ARGV_ENTRIES; 3219 KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: inline allocate %d " 3220 "argv entries\n", 3221 team->t.t_id, team->t.t_max_argc)); 3222 team->t.t_argv = &team->t.t_inline_argv[0]; 3223 if (__kmp_storage_map) { 3224 __kmp_print_storage_map_gtid( 3225 -1, &team->t.t_inline_argv[0], 3226 &team->t.t_inline_argv[KMP_INLINE_ARGV_ENTRIES], 3227 (sizeof(void *) * KMP_INLINE_ARGV_ENTRIES), "team_%d.t_inline_argv", 3228 team->t.t_id); 3229 } 3230 } else { 3231 /* allocate space for arguments in the heap */ 3232 team->t.t_max_argc = (argc <= (KMP_MIN_MALLOC_ARGV_ENTRIES >> 1)) 3233 ? KMP_MIN_MALLOC_ARGV_ENTRIES 3234 : 2 * argc; 3235 KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: dynamic allocate %d " 3236 "argv entries\n", 3237 team->t.t_id, team->t.t_max_argc)); 3238 team->t.t_argv = 3239 (void **)__kmp_page_allocate(sizeof(void *) * team->t.t_max_argc); 3240 if (__kmp_storage_map) { 3241 __kmp_print_storage_map_gtid(-1, &team->t.t_argv[0], 3242 &team->t.t_argv[team->t.t_max_argc], 3243 sizeof(void *) * team->t.t_max_argc, 3244 "team_%d.t_argv", team->t.t_id); 3245 } 3246 } 3247 } 3248 } 3249 3250 static void __kmp_allocate_team_arrays(kmp_team_t *team, int max_nth) { 3251 int i; 3252 int num_disp_buff = max_nth > 1 ? __kmp_dispatch_num_buffers : 2; 3253 team->t.t_threads = 3254 (kmp_info_t **)__kmp_allocate(sizeof(kmp_info_t *) * max_nth); 3255 team->t.t_disp_buffer = (dispatch_shared_info_t *)__kmp_allocate( 3256 sizeof(dispatch_shared_info_t) * num_disp_buff); 3257 team->t.t_dispatch = 3258 (kmp_disp_t *)__kmp_allocate(sizeof(kmp_disp_t) * max_nth); 3259 team->t.t_implicit_task_taskdata = 3260 (kmp_taskdata_t *)__kmp_allocate(sizeof(kmp_taskdata_t) * max_nth); 3261 team->t.t_max_nproc = max_nth; 3262 3263 /* setup dispatch buffers */ 3264 for (i = 0; i < num_disp_buff; ++i) { 3265 team->t.t_disp_buffer[i].buffer_index = i; 3266 team->t.t_disp_buffer[i].doacross_buf_idx = i; 3267 } 3268 } 3269 3270 static void __kmp_free_team_arrays(kmp_team_t *team) { 3271 /* Note: this does not free the threads in t_threads (__kmp_free_threads) */ 3272 int i; 3273 for (i = 0; i < team->t.t_max_nproc; ++i) { 3274 if (team->t.t_dispatch[i].th_disp_buffer != NULL) { 3275 __kmp_free(team->t.t_dispatch[i].th_disp_buffer); 3276 team->t.t_dispatch[i].th_disp_buffer = NULL; 3277 } 3278 } 3279 #if KMP_USE_HIER_SCHED 3280 __kmp_dispatch_free_hierarchies(team); 3281 #endif 3282 __kmp_free(team->t.t_threads); 3283 __kmp_free(team->t.t_disp_buffer); 3284 __kmp_free(team->t.t_dispatch); 3285 __kmp_free(team->t.t_implicit_task_taskdata); 3286 team->t.t_threads = NULL; 3287 team->t.t_disp_buffer = NULL; 3288 team->t.t_dispatch = NULL; 3289 team->t.t_implicit_task_taskdata = 0; 3290 } 3291 3292 static void __kmp_reallocate_team_arrays(kmp_team_t *team, int max_nth) { 3293 kmp_info_t **oldThreads = team->t.t_threads; 3294 3295 __kmp_free(team->t.t_disp_buffer); 3296 __kmp_free(team->t.t_dispatch); 3297 __kmp_free(team->t.t_implicit_task_taskdata); 3298 __kmp_allocate_team_arrays(team, max_nth); 3299 3300 KMP_MEMCPY(team->t.t_threads, oldThreads, 3301 team->t.t_nproc * sizeof(kmp_info_t *)); 3302 3303 __kmp_free(oldThreads); 3304 } 3305 3306 static kmp_internal_control_t __kmp_get_global_icvs(void) { 3307 3308 kmp_r_sched_t r_sched = 3309 __kmp_get_schedule_global(); // get current state of scheduling globals 3310 3311 KMP_DEBUG_ASSERT(__kmp_nested_proc_bind.used > 0); 3312 3313 kmp_internal_control_t g_icvs = { 3314 0, // int serial_nesting_level; //corresponds to value of th_team_serialized 3315 (kmp_int8)__kmp_global.g.g_dynamic, // internal control for dynamic 3316 // adjustment of threads (per thread) 3317 (kmp_int8)__kmp_env_blocktime, // int bt_set; //internal control for 3318 // whether blocktime is explicitly set 3319 __kmp_dflt_blocktime, // int blocktime; //internal control for blocktime 3320 #if KMP_USE_MONITOR 3321 __kmp_bt_intervals, // int bt_intervals; //internal control for blocktime 3322 // intervals 3323 #endif 3324 __kmp_dflt_team_nth, // int nproc; //internal control for # of threads for 3325 // next parallel region (per thread) 3326 // (use a max ub on value if __kmp_parallel_initialize not called yet) 3327 __kmp_cg_max_nth, // int thread_limit; 3328 __kmp_task_max_nth, // int task_thread_limit; // to set the thread_limit 3329 // on task. This is used in the case of target thread_limit 3330 __kmp_dflt_max_active_levels, // int max_active_levels; //internal control 3331 // for max_active_levels 3332 r_sched, // kmp_r_sched_t sched; //internal control for runtime schedule 3333 // {sched,chunk} pair 3334 __kmp_nested_proc_bind.bind_types[0], 3335 __kmp_default_device, 3336 NULL // struct kmp_internal_control *next; 3337 }; 3338 3339 return g_icvs; 3340 } 3341 3342 static kmp_internal_control_t __kmp_get_x_global_icvs(const kmp_team_t *team) { 3343 3344 kmp_internal_control_t gx_icvs; 3345 gx_icvs.serial_nesting_level = 3346 0; // probably =team->t.t_serial like in save_inter_controls 3347 copy_icvs(&gx_icvs, &team->t.t_threads[0]->th.th_current_task->td_icvs); 3348 gx_icvs.next = NULL; 3349 3350 return gx_icvs; 3351 } 3352 3353 static void __kmp_initialize_root(kmp_root_t *root) { 3354 int f; 3355 kmp_team_t *root_team; 3356 kmp_team_t *hot_team; 3357 int hot_team_max_nth; 3358 kmp_r_sched_t r_sched = 3359 __kmp_get_schedule_global(); // get current state of scheduling globals 3360 kmp_internal_control_t r_icvs = __kmp_get_global_icvs(); 3361 KMP_DEBUG_ASSERT(root); 3362 KMP_ASSERT(!root->r.r_begin); 3363 3364 /* setup the root state structure */ 3365 __kmp_init_lock(&root->r.r_begin_lock); 3366 root->r.r_begin = FALSE; 3367 root->r.r_active = FALSE; 3368 root->r.r_in_parallel = 0; 3369 root->r.r_blocktime = __kmp_dflt_blocktime; 3370 #if KMP_AFFINITY_SUPPORTED 3371 root->r.r_affinity_assigned = FALSE; 3372 #endif 3373 3374 /* setup the root team for this task */ 3375 /* allocate the root team structure */ 3376 KF_TRACE(10, ("__kmp_initialize_root: before root_team\n")); 3377 3378 root_team = 3379 __kmp_allocate_team(root, 3380 1, // new_nproc 3381 1, // max_nproc 3382 #if OMPT_SUPPORT 3383 ompt_data_none, // root parallel id 3384 #endif 3385 __kmp_nested_proc_bind.bind_types[0], &r_icvs, 3386 0 // argc 3387 USE_NESTED_HOT_ARG(NULL) // primary thread is unknown 3388 ); 3389 #if USE_DEBUGGER 3390 // Non-NULL value should be assigned to make the debugger display the root 3391 // team. 3392 TCW_SYNC_PTR(root_team->t.t_pkfn, (microtask_t)(~0)); 3393 #endif 3394 3395 KF_TRACE(10, ("__kmp_initialize_root: after root_team = %p\n", root_team)); 3396 3397 root->r.r_root_team = root_team; 3398 root_team->t.t_control_stack_top = NULL; 3399 3400 /* initialize root team */ 3401 root_team->t.t_threads[0] = NULL; 3402 root_team->t.t_nproc = 1; 3403 root_team->t.t_serialized = 1; 3404 // TODO???: root_team->t.t_max_active_levels = __kmp_dflt_max_active_levels; 3405 root_team->t.t_sched.sched = r_sched.sched; 3406 root_team->t.t_nested_nth = &__kmp_nested_nth; 3407 KA_TRACE( 3408 20, 3409 ("__kmp_initialize_root: init root team %d arrived: join=%u, plain=%u\n", 3410 root_team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE)); 3411 3412 /* setup the hot team for this task */ 3413 /* allocate the hot team structure */ 3414 KF_TRACE(10, ("__kmp_initialize_root: before hot_team\n")); 3415 3416 hot_team = 3417 __kmp_allocate_team(root, 3418 1, // new_nproc 3419 __kmp_dflt_team_nth_ub * 2, // max_nproc 3420 #if OMPT_SUPPORT 3421 ompt_data_none, // root parallel id 3422 #endif 3423 __kmp_nested_proc_bind.bind_types[0], &r_icvs, 3424 0 // argc 3425 USE_NESTED_HOT_ARG(NULL) // primary thread is unknown 3426 ); 3427 KF_TRACE(10, ("__kmp_initialize_root: after hot_team = %p\n", hot_team)); 3428 3429 root->r.r_hot_team = hot_team; 3430 root_team->t.t_control_stack_top = NULL; 3431 3432 /* first-time initialization */ 3433 hot_team->t.t_parent = root_team; 3434 3435 /* initialize hot team */ 3436 hot_team_max_nth = hot_team->t.t_max_nproc; 3437 for (f = 0; f < hot_team_max_nth; ++f) { 3438 hot_team->t.t_threads[f] = NULL; 3439 } 3440 hot_team->t.t_nproc = 1; 3441 // TODO???: hot_team->t.t_max_active_levels = __kmp_dflt_max_active_levels; 3442 hot_team->t.t_sched.sched = r_sched.sched; 3443 hot_team->t.t_size_changed = 0; 3444 hot_team->t.t_nested_nth = &__kmp_nested_nth; 3445 } 3446 3447 #ifdef KMP_DEBUG 3448 3449 typedef struct kmp_team_list_item { 3450 kmp_team_p const *entry; 3451 struct kmp_team_list_item *next; 3452 } kmp_team_list_item_t; 3453 typedef kmp_team_list_item_t *kmp_team_list_t; 3454 3455 static void __kmp_print_structure_team_accum( // Add team to list of teams. 3456 kmp_team_list_t list, // List of teams. 3457 kmp_team_p const *team // Team to add. 3458 ) { 3459 3460 // List must terminate with item where both entry and next are NULL. 3461 // Team is added to the list only once. 3462 // List is sorted in ascending order by team id. 3463 // Team id is *not* a key. 3464 3465 kmp_team_list_t l; 3466 3467 KMP_DEBUG_ASSERT(list != NULL); 3468 if (team == NULL) { 3469 return; 3470 } 3471 3472 __kmp_print_structure_team_accum(list, team->t.t_parent); 3473 __kmp_print_structure_team_accum(list, team->t.t_next_pool); 3474 3475 // Search list for the team. 3476 l = list; 3477 while (l->next != NULL && l->entry != team) { 3478 l = l->next; 3479 } 3480 if (l->next != NULL) { 3481 return; // Team has been added before, exit. 3482 } 3483 3484 // Team is not found. Search list again for insertion point. 3485 l = list; 3486 while (l->next != NULL && l->entry->t.t_id <= team->t.t_id) { 3487 l = l->next; 3488 } 3489 3490 // Insert team. 3491 { 3492 kmp_team_list_item_t *item = (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC( 3493 sizeof(kmp_team_list_item_t)); 3494 *item = *l; 3495 l->entry = team; 3496 l->next = item; 3497 } 3498 } 3499 3500 static void __kmp_print_structure_team(char const *title, kmp_team_p const *team 3501 3502 ) { 3503 __kmp_printf("%s", title); 3504 if (team != NULL) { 3505 __kmp_printf("%2x %p\n", team->t.t_id, team); 3506 } else { 3507 __kmp_printf(" - (nil)\n"); 3508 } 3509 } 3510 3511 static void __kmp_print_structure_thread(char const *title, 3512 kmp_info_p const *thread) { 3513 __kmp_printf("%s", title); 3514 if (thread != NULL) { 3515 __kmp_printf("%2d %p\n", thread->th.th_info.ds.ds_gtid, thread); 3516 } else { 3517 __kmp_printf(" - (nil)\n"); 3518 } 3519 } 3520 3521 void __kmp_print_structure(void) { 3522 3523 kmp_team_list_t list; 3524 3525 // Initialize list of teams. 3526 list = 3527 (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC(sizeof(kmp_team_list_item_t)); 3528 list->entry = NULL; 3529 list->next = NULL; 3530 3531 __kmp_printf("\n------------------------------\nGlobal Thread " 3532 "Table\n------------------------------\n"); 3533 { 3534 int gtid; 3535 for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) { 3536 __kmp_printf("%2d", gtid); 3537 if (__kmp_threads != NULL) { 3538 __kmp_printf(" %p", __kmp_threads[gtid]); 3539 } 3540 if (__kmp_root != NULL) { 3541 __kmp_printf(" %p", __kmp_root[gtid]); 3542 } 3543 __kmp_printf("\n"); 3544 } 3545 } 3546 3547 // Print out __kmp_threads array. 3548 __kmp_printf("\n------------------------------\nThreads\n--------------------" 3549 "----------\n"); 3550 if (__kmp_threads != NULL) { 3551 int gtid; 3552 for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) { 3553 kmp_info_t const *thread = __kmp_threads[gtid]; 3554 if (thread != NULL) { 3555 __kmp_printf("GTID %2d %p:\n", gtid, thread); 3556 __kmp_printf(" Our Root: %p\n", thread->th.th_root); 3557 __kmp_print_structure_team(" Our Team: ", thread->th.th_team); 3558 __kmp_print_structure_team(" Serial Team: ", 3559 thread->th.th_serial_team); 3560 __kmp_printf(" Threads: %2d\n", thread->th.th_team_nproc); 3561 __kmp_print_structure_thread(" Primary: ", 3562 thread->th.th_team_master); 3563 __kmp_printf(" Serialized?: %2d\n", thread->th.th_team_serialized); 3564 __kmp_printf(" Set NProc: %2d\n", thread->th.th_set_nproc); 3565 __kmp_printf(" Set Proc Bind: %2d\n", thread->th.th_set_proc_bind); 3566 __kmp_print_structure_thread(" Next in pool: ", 3567 thread->th.th_next_pool); 3568 __kmp_printf("\n"); 3569 __kmp_print_structure_team_accum(list, thread->th.th_team); 3570 __kmp_print_structure_team_accum(list, thread->th.th_serial_team); 3571 } 3572 } 3573 } else { 3574 __kmp_printf("Threads array is not allocated.\n"); 3575 } 3576 3577 // Print out __kmp_root array. 3578 __kmp_printf("\n------------------------------\nUbers\n----------------------" 3579 "--------\n"); 3580 if (__kmp_root != NULL) { 3581 int gtid; 3582 for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) { 3583 kmp_root_t const *root = __kmp_root[gtid]; 3584 if (root != NULL) { 3585 __kmp_printf("GTID %2d %p:\n", gtid, root); 3586 __kmp_print_structure_team(" Root Team: ", root->r.r_root_team); 3587 __kmp_print_structure_team(" Hot Team: ", root->r.r_hot_team); 3588 __kmp_print_structure_thread(" Uber Thread: ", 3589 root->r.r_uber_thread); 3590 __kmp_printf(" Active?: %2d\n", root->r.r_active); 3591 __kmp_printf(" In Parallel: %2d\n", 3592 KMP_ATOMIC_LD_RLX(&root->r.r_in_parallel)); 3593 __kmp_printf("\n"); 3594 __kmp_print_structure_team_accum(list, root->r.r_root_team); 3595 __kmp_print_structure_team_accum(list, root->r.r_hot_team); 3596 } 3597 } 3598 } else { 3599 __kmp_printf("Ubers array is not allocated.\n"); 3600 } 3601 3602 __kmp_printf("\n------------------------------\nTeams\n----------------------" 3603 "--------\n"); 3604 while (list->next != NULL) { 3605 kmp_team_p const *team = list->entry; 3606 int i; 3607 __kmp_printf("Team %2x %p:\n", team->t.t_id, team); 3608 __kmp_print_structure_team(" Parent Team: ", team->t.t_parent); 3609 __kmp_printf(" Primary TID: %2d\n", team->t.t_master_tid); 3610 __kmp_printf(" Max threads: %2d\n", team->t.t_max_nproc); 3611 __kmp_printf(" Levels of serial: %2d\n", team->t.t_serialized); 3612 __kmp_printf(" Number threads: %2d\n", team->t.t_nproc); 3613 for (i = 0; i < team->t.t_nproc; ++i) { 3614 __kmp_printf(" Thread %2d: ", i); 3615 __kmp_print_structure_thread("", team->t.t_threads[i]); 3616 } 3617 __kmp_print_structure_team(" Next in pool: ", team->t.t_next_pool); 3618 __kmp_printf("\n"); 3619 list = list->next; 3620 } 3621 3622 // Print out __kmp_thread_pool and __kmp_team_pool. 3623 __kmp_printf("\n------------------------------\nPools\n----------------------" 3624 "--------\n"); 3625 __kmp_print_structure_thread("Thread pool: ", 3626 CCAST(kmp_info_t *, __kmp_thread_pool)); 3627 __kmp_print_structure_team("Team pool: ", 3628 CCAST(kmp_team_t *, __kmp_team_pool)); 3629 __kmp_printf("\n"); 3630 3631 // Free team list. 3632 while (list != NULL) { 3633 kmp_team_list_item_t *item = list; 3634 list = list->next; 3635 KMP_INTERNAL_FREE(item); 3636 } 3637 } 3638 3639 #endif 3640 3641 //--------------------------------------------------------------------------- 3642 // Stuff for per-thread fast random number generator 3643 // Table of primes 3644 static const unsigned __kmp_primes[] = { 3645 0x9e3779b1, 0xffe6cc59, 0x2109f6dd, 0x43977ab5, 0xba5703f5, 0xb495a877, 3646 0xe1626741, 0x79695e6b, 0xbc98c09f, 0xd5bee2b3, 0x287488f9, 0x3af18231, 3647 0x9677cd4d, 0xbe3a6929, 0xadc6a877, 0xdcf0674b, 0xbe4d6fe9, 0x5f15e201, 3648 0x99afc3fd, 0xf3f16801, 0xe222cfff, 0x24ba5fdb, 0x0620452d, 0x79f149e3, 3649 0xc8b93f49, 0x972702cd, 0xb07dd827, 0x6c97d5ed, 0x085a3d61, 0x46eb5ea7, 3650 0x3d9910ed, 0x2e687b5b, 0x29609227, 0x6eb081f1, 0x0954c4e1, 0x9d114db9, 3651 0x542acfa9, 0xb3e6bd7b, 0x0742d917, 0xe9f3ffa7, 0x54581edb, 0xf2480f45, 3652 0x0bb9288f, 0xef1affc7, 0x85fa0ca7, 0x3ccc14db, 0xe6baf34b, 0x343377f7, 3653 0x5ca19031, 0xe6d9293b, 0xf0a9f391, 0x5d2e980b, 0xfc411073, 0xc3749363, 3654 0xb892d829, 0x3549366b, 0x629750ad, 0xb98294e5, 0x892d9483, 0xc235baf3, 3655 0x3d2402a3, 0x6bdef3c9, 0xbec333cd, 0x40c9520f}; 3656 3657 //--------------------------------------------------------------------------- 3658 // __kmp_get_random: Get a random number using a linear congruential method. 3659 unsigned short __kmp_get_random(kmp_info_t *thread) { 3660 unsigned x = thread->th.th_x; 3661 unsigned short r = (unsigned short)(x >> 16); 3662 3663 thread->th.th_x = x * thread->th.th_a + 1; 3664 3665 KA_TRACE(30, ("__kmp_get_random: THREAD: %d, RETURN: %u\n", 3666 thread->th.th_info.ds.ds_tid, r)); 3667 3668 return r; 3669 } 3670 //-------------------------------------------------------- 3671 // __kmp_init_random: Initialize a random number generator 3672 void __kmp_init_random(kmp_info_t *thread) { 3673 unsigned seed = thread->th.th_info.ds.ds_tid; 3674 3675 thread->th.th_a = 3676 __kmp_primes[seed % (sizeof(__kmp_primes) / sizeof(__kmp_primes[0]))]; 3677 thread->th.th_x = (seed + 1) * thread->th.th_a + 1; 3678 KA_TRACE(30, 3679 ("__kmp_init_random: THREAD: %u; A: %u\n", seed, thread->th.th_a)); 3680 } 3681 3682 #if KMP_OS_WINDOWS 3683 /* reclaim array entries for root threads that are already dead, returns number 3684 * reclaimed */ 3685 static int __kmp_reclaim_dead_roots(void) { 3686 int i, r = 0; 3687 3688 for (i = 0; i < __kmp_threads_capacity; ++i) { 3689 if (KMP_UBER_GTID(i) && 3690 !__kmp_still_running((kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[i])) && 3691 !__kmp_root[i] 3692 ->r.r_active) { // AC: reclaim only roots died in non-active state 3693 r += __kmp_unregister_root_other_thread(i); 3694 } 3695 } 3696 return r; 3697 } 3698 #endif 3699 3700 /* This function attempts to create free entries in __kmp_threads and 3701 __kmp_root, and returns the number of free entries generated. 3702 3703 For Windows* OS static library, the first mechanism used is to reclaim array 3704 entries for root threads that are already dead. 3705 3706 On all platforms, expansion is attempted on the arrays __kmp_threads_ and 3707 __kmp_root, with appropriate update to __kmp_threads_capacity. Array 3708 capacity is increased by doubling with clipping to __kmp_tp_capacity, if 3709 threadprivate cache array has been created. Synchronization with 3710 __kmpc_threadprivate_cached is done using __kmp_tp_cached_lock. 3711 3712 After any dead root reclamation, if the clipping value allows array expansion 3713 to result in the generation of a total of nNeed free slots, the function does 3714 that expansion. If not, nothing is done beyond the possible initial root 3715 thread reclamation. 3716 3717 If any argument is negative, the behavior is undefined. */ 3718 static int __kmp_expand_threads(int nNeed) { 3719 int added = 0; 3720 int minimumRequiredCapacity; 3721 int newCapacity; 3722 kmp_info_t **newThreads; 3723 kmp_root_t **newRoot; 3724 3725 // All calls to __kmp_expand_threads should be under __kmp_forkjoin_lock, so 3726 // resizing __kmp_threads does not need additional protection if foreign 3727 // threads are present 3728 3729 #if KMP_OS_WINDOWS && !KMP_DYNAMIC_LIB 3730 /* only for Windows static library */ 3731 /* reclaim array entries for root threads that are already dead */ 3732 added = __kmp_reclaim_dead_roots(); 3733 3734 if (nNeed) { 3735 nNeed -= added; 3736 if (nNeed < 0) 3737 nNeed = 0; 3738 } 3739 #endif 3740 if (nNeed <= 0) 3741 return added; 3742 3743 // Note that __kmp_threads_capacity is not bounded by __kmp_max_nth. If 3744 // __kmp_max_nth is set to some value less than __kmp_sys_max_nth by the 3745 // user via KMP_DEVICE_THREAD_LIMIT, then __kmp_threads_capacity may become 3746 // > __kmp_max_nth in one of two ways: 3747 // 3748 // 1) The initialization thread (gtid = 0) exits. __kmp_threads[0] 3749 // may not be reused by another thread, so we may need to increase 3750 // __kmp_threads_capacity to __kmp_max_nth + 1. 3751 // 3752 // 2) New foreign root(s) are encountered. We always register new foreign 3753 // roots. This may cause a smaller # of threads to be allocated at 3754 // subsequent parallel regions, but the worker threads hang around (and 3755 // eventually go to sleep) and need slots in the __kmp_threads[] array. 3756 // 3757 // Anyway, that is the reason for moving the check to see if 3758 // __kmp_max_nth was exceeded into __kmp_reserve_threads() 3759 // instead of having it performed here. -BB 3760 3761 KMP_DEBUG_ASSERT(__kmp_sys_max_nth >= __kmp_threads_capacity); 3762 3763 /* compute expansion headroom to check if we can expand */ 3764 if (__kmp_sys_max_nth - __kmp_threads_capacity < nNeed) { 3765 /* possible expansion too small -- give up */ 3766 return added; 3767 } 3768 minimumRequiredCapacity = __kmp_threads_capacity + nNeed; 3769 3770 newCapacity = __kmp_threads_capacity; 3771 do { 3772 newCapacity = newCapacity <= (__kmp_sys_max_nth >> 1) ? (newCapacity << 1) 3773 : __kmp_sys_max_nth; 3774 } while (newCapacity < minimumRequiredCapacity); 3775 newThreads = (kmp_info_t **)__kmp_allocate( 3776 (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * newCapacity + CACHE_LINE); 3777 newRoot = 3778 (kmp_root_t **)((char *)newThreads + sizeof(kmp_info_t *) * newCapacity); 3779 KMP_MEMCPY(newThreads, __kmp_threads, 3780 __kmp_threads_capacity * sizeof(kmp_info_t *)); 3781 KMP_MEMCPY(newRoot, __kmp_root, 3782 __kmp_threads_capacity * sizeof(kmp_root_t *)); 3783 // Put old __kmp_threads array on a list. Any ongoing references to the old 3784 // list will be valid. This list is cleaned up at library shutdown. 3785 kmp_old_threads_list_t *node = 3786 (kmp_old_threads_list_t *)__kmp_allocate(sizeof(kmp_old_threads_list_t)); 3787 node->threads = __kmp_threads; 3788 node->next = __kmp_old_threads_list; 3789 __kmp_old_threads_list = node; 3790 3791 *(kmp_info_t * *volatile *)&__kmp_threads = newThreads; 3792 *(kmp_root_t * *volatile *)&__kmp_root = newRoot; 3793 added += newCapacity - __kmp_threads_capacity; 3794 *(volatile int *)&__kmp_threads_capacity = newCapacity; 3795 3796 if (newCapacity > __kmp_tp_capacity) { 3797 __kmp_acquire_bootstrap_lock(&__kmp_tp_cached_lock); 3798 if (__kmp_tp_cached && newCapacity > __kmp_tp_capacity) { 3799 __kmp_threadprivate_resize_cache(newCapacity); 3800 } else { // increase __kmp_tp_capacity to correspond with kmp_threads size 3801 *(volatile int *)&__kmp_tp_capacity = newCapacity; 3802 } 3803 __kmp_release_bootstrap_lock(&__kmp_tp_cached_lock); 3804 } 3805 3806 return added; 3807 } 3808 3809 /* Register the current thread as a root thread and obtain our gtid. We must 3810 have the __kmp_initz_lock held at this point. Argument TRUE only if are the 3811 thread that calls from __kmp_do_serial_initialize() */ 3812 int __kmp_register_root(int initial_thread) { 3813 kmp_info_t *root_thread; 3814 kmp_root_t *root; 3815 int gtid; 3816 int capacity; 3817 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock); 3818 KA_TRACE(20, ("__kmp_register_root: entered\n")); 3819 KMP_MB(); 3820 3821 /* 2007-03-02: 3822 If initial thread did not invoke OpenMP RTL yet, and this thread is not an 3823 initial one, "__kmp_all_nth >= __kmp_threads_capacity" condition does not 3824 work as expected -- it may return false (that means there is at least one 3825 empty slot in __kmp_threads array), but it is possible the only free slot 3826 is #0, which is reserved for initial thread and so cannot be used for this 3827 one. Following code workarounds this bug. 3828 3829 However, right solution seems to be not reserving slot #0 for initial 3830 thread because: 3831 (1) there is no magic in slot #0, 3832 (2) we cannot detect initial thread reliably (the first thread which does 3833 serial initialization may be not a real initial thread). 3834 */ 3835 capacity = __kmp_threads_capacity; 3836 if (!initial_thread && TCR_PTR(__kmp_threads[0]) == NULL) { 3837 --capacity; 3838 } 3839 3840 // If it is not for initializing the hidden helper team, we need to take 3841 // __kmp_hidden_helper_threads_num out of the capacity because it is included 3842 // in __kmp_threads_capacity. 3843 if (__kmp_enable_hidden_helper && !TCR_4(__kmp_init_hidden_helper_threads)) { 3844 capacity -= __kmp_hidden_helper_threads_num; 3845 } 3846 3847 /* see if there are too many threads */ 3848 if (__kmp_all_nth >= capacity && !__kmp_expand_threads(1)) { 3849 if (__kmp_tp_cached) { 3850 __kmp_fatal(KMP_MSG(CantRegisterNewThread), 3851 KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity), 3852 KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null); 3853 } else { 3854 __kmp_fatal(KMP_MSG(CantRegisterNewThread), KMP_HNT(SystemLimitOnThreads), 3855 __kmp_msg_null); 3856 } 3857 } 3858 3859 // When hidden helper task is enabled, __kmp_threads is organized as follows: 3860 // 0: initial thread, also a regular OpenMP thread. 3861 // [1, __kmp_hidden_helper_threads_num]: slots for hidden helper threads. 3862 // [__kmp_hidden_helper_threads_num + 1, __kmp_threads_capacity): slots for 3863 // regular OpenMP threads. 3864 if (TCR_4(__kmp_init_hidden_helper_threads)) { 3865 // Find an available thread slot for hidden helper thread. Slots for hidden 3866 // helper threads start from 1 to __kmp_hidden_helper_threads_num. 3867 for (gtid = 1; TCR_PTR(__kmp_threads[gtid]) != NULL && 3868 gtid <= __kmp_hidden_helper_threads_num; 3869 gtid++) 3870 ; 3871 KMP_ASSERT(gtid <= __kmp_hidden_helper_threads_num); 3872 KA_TRACE(1, ("__kmp_register_root: found slot in threads array for " 3873 "hidden helper thread: T#%d\n", 3874 gtid)); 3875 } else { 3876 /* find an available thread slot */ 3877 // Don't reassign the zero slot since we need that to only be used by 3878 // initial thread. Slots for hidden helper threads should also be skipped. 3879 if (initial_thread && TCR_PTR(__kmp_threads[0]) == NULL) { 3880 gtid = 0; 3881 } else { 3882 for (gtid = __kmp_hidden_helper_threads_num + 1; 3883 TCR_PTR(__kmp_threads[gtid]) != NULL; gtid++) 3884 ; 3885 } 3886 KA_TRACE( 3887 1, ("__kmp_register_root: found slot in threads array: T#%d\n", gtid)); 3888 KMP_ASSERT(gtid < __kmp_threads_capacity); 3889 } 3890 3891 /* update global accounting */ 3892 __kmp_all_nth++; 3893 TCW_4(__kmp_nth, __kmp_nth + 1); 3894 3895 // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search) for low 3896 // numbers of procs, and method #2 (keyed API call) for higher numbers. 3897 if (__kmp_adjust_gtid_mode) { 3898 if (__kmp_all_nth >= __kmp_tls_gtid_min) { 3899 if (TCR_4(__kmp_gtid_mode) != 2) { 3900 TCW_4(__kmp_gtid_mode, 2); 3901 } 3902 } else { 3903 if (TCR_4(__kmp_gtid_mode) != 1) { 3904 TCW_4(__kmp_gtid_mode, 1); 3905 } 3906 } 3907 } 3908 3909 #ifdef KMP_ADJUST_BLOCKTIME 3910 /* Adjust blocktime to zero if necessary */ 3911 /* Middle initialization might not have occurred yet */ 3912 if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) { 3913 if (__kmp_nth > __kmp_avail_proc) { 3914 __kmp_zero_bt = TRUE; 3915 } 3916 } 3917 #endif /* KMP_ADJUST_BLOCKTIME */ 3918 3919 /* setup this new hierarchy */ 3920 if (!(root = __kmp_root[gtid])) { 3921 root = __kmp_root[gtid] = (kmp_root_t *)__kmp_allocate(sizeof(kmp_root_t)); 3922 KMP_DEBUG_ASSERT(!root->r.r_root_team); 3923 } 3924 3925 #if KMP_STATS_ENABLED 3926 // Initialize stats as soon as possible (right after gtid assignment). 3927 __kmp_stats_thread_ptr = __kmp_stats_list->push_back(gtid); 3928 __kmp_stats_thread_ptr->startLife(); 3929 KMP_SET_THREAD_STATE(SERIAL_REGION); 3930 KMP_INIT_PARTITIONED_TIMERS(OMP_serial); 3931 #endif 3932 __kmp_initialize_root(root); 3933 3934 /* setup new root thread structure */ 3935 if (root->r.r_uber_thread) { 3936 root_thread = root->r.r_uber_thread; 3937 } else { 3938 root_thread = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t)); 3939 if (__kmp_storage_map) { 3940 __kmp_print_thread_storage_map(root_thread, gtid); 3941 } 3942 root_thread->th.th_info.ds.ds_gtid = gtid; 3943 #if OMPT_SUPPORT 3944 root_thread->th.ompt_thread_info.thread_data = ompt_data_none; 3945 #endif 3946 root_thread->th.th_root = root; 3947 if (__kmp_env_consistency_check) { 3948 root_thread->th.th_cons = __kmp_allocate_cons_stack(gtid); 3949 } 3950 #if USE_FAST_MEMORY 3951 __kmp_initialize_fast_memory(root_thread); 3952 #endif /* USE_FAST_MEMORY */ 3953 3954 #if KMP_USE_BGET 3955 KMP_DEBUG_ASSERT(root_thread->th.th_local.bget_data == NULL); 3956 __kmp_initialize_bget(root_thread); 3957 #endif 3958 __kmp_init_random(root_thread); // Initialize random number generator 3959 } 3960 3961 /* setup the serial team held in reserve by the root thread */ 3962 if (!root_thread->th.th_serial_team) { 3963 kmp_internal_control_t r_icvs = __kmp_get_global_icvs(); 3964 KF_TRACE(10, ("__kmp_register_root: before serial_team\n")); 3965 root_thread->th.th_serial_team = __kmp_allocate_team( 3966 root, 1, 1, 3967 #if OMPT_SUPPORT 3968 ompt_data_none, // root parallel id 3969 #endif 3970 proc_bind_default, &r_icvs, 0 USE_NESTED_HOT_ARG(NULL)); 3971 } 3972 KMP_ASSERT(root_thread->th.th_serial_team); 3973 KF_TRACE(10, ("__kmp_register_root: after serial_team = %p\n", 3974 root_thread->th.th_serial_team)); 3975 3976 /* drop root_thread into place */ 3977 TCW_SYNC_PTR(__kmp_threads[gtid], root_thread); 3978 3979 root->r.r_root_team->t.t_threads[0] = root_thread; 3980 root->r.r_hot_team->t.t_threads[0] = root_thread; 3981 root_thread->th.th_serial_team->t.t_threads[0] = root_thread; 3982 // AC: the team created in reserve, not for execution (it is unused for now). 3983 root_thread->th.th_serial_team->t.t_serialized = 0; 3984 root->r.r_uber_thread = root_thread; 3985 3986 /* initialize the thread, get it ready to go */ 3987 __kmp_initialize_info(root_thread, root->r.r_root_team, 0, gtid); 3988 TCW_4(__kmp_init_gtid, TRUE); 3989 3990 /* prepare the primary thread for get_gtid() */ 3991 __kmp_gtid_set_specific(gtid); 3992 3993 #if USE_ITT_BUILD 3994 __kmp_itt_thread_name(gtid); 3995 #endif /* USE_ITT_BUILD */ 3996 3997 #ifdef KMP_TDATA_GTID 3998 __kmp_gtid = gtid; 3999 #endif 4000 __kmp_create_worker(gtid, root_thread, __kmp_stksize); 4001 KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() == gtid); 4002 4003 KA_TRACE(20, ("__kmp_register_root: T#%d init T#%d(%d:%d) arrived: join=%u, " 4004 "plain=%u\n", 4005 gtid, __kmp_gtid_from_tid(0, root->r.r_hot_team), 4006 root->r.r_hot_team->t.t_id, 0, KMP_INIT_BARRIER_STATE, 4007 KMP_INIT_BARRIER_STATE)); 4008 { // Initialize barrier data. 4009 int b; 4010 for (b = 0; b < bs_last_barrier; ++b) { 4011 root_thread->th.th_bar[b].bb.b_arrived = KMP_INIT_BARRIER_STATE; 4012 #if USE_DEBUGGER 4013 root_thread->th.th_bar[b].bb.b_worker_arrived = 0; 4014 #endif 4015 } 4016 } 4017 KMP_DEBUG_ASSERT(root->r.r_hot_team->t.t_bar[bs_forkjoin_barrier].b_arrived == 4018 KMP_INIT_BARRIER_STATE); 4019 4020 #if KMP_AFFINITY_SUPPORTED 4021 root_thread->th.th_current_place = KMP_PLACE_UNDEFINED; 4022 root_thread->th.th_new_place = KMP_PLACE_UNDEFINED; 4023 root_thread->th.th_first_place = KMP_PLACE_UNDEFINED; 4024 root_thread->th.th_last_place = KMP_PLACE_UNDEFINED; 4025 #endif /* KMP_AFFINITY_SUPPORTED */ 4026 root_thread->th.th_def_allocator = __kmp_def_allocator; 4027 root_thread->th.th_prev_level = 0; 4028 root_thread->th.th_prev_num_threads = 1; 4029 4030 kmp_cg_root_t *tmp = (kmp_cg_root_t *)__kmp_allocate(sizeof(kmp_cg_root_t)); 4031 tmp->cg_root = root_thread; 4032 tmp->cg_thread_limit = __kmp_cg_max_nth; 4033 tmp->cg_nthreads = 1; 4034 KA_TRACE(100, ("__kmp_register_root: Thread %p created node %p with" 4035 " cg_nthreads init to 1\n", 4036 root_thread, tmp)); 4037 tmp->up = NULL; 4038 root_thread->th.th_cg_roots = tmp; 4039 4040 __kmp_root_counter++; 4041 4042 #if OMPT_SUPPORT 4043 if (ompt_enabled.enabled) { 4044 4045 kmp_info_t *root_thread = ompt_get_thread(); 4046 4047 ompt_set_thread_state(root_thread, ompt_state_overhead); 4048 4049 if (ompt_enabled.ompt_callback_thread_begin) { 4050 ompt_callbacks.ompt_callback(ompt_callback_thread_begin)( 4051 ompt_thread_initial, __ompt_get_thread_data_internal()); 4052 } 4053 ompt_data_t *task_data; 4054 ompt_data_t *parallel_data; 4055 __ompt_get_task_info_internal(0, NULL, &task_data, NULL, ¶llel_data, 4056 NULL); 4057 if (ompt_enabled.ompt_callback_implicit_task) { 4058 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 4059 ompt_scope_begin, parallel_data, task_data, 1, 1, ompt_task_initial); 4060 } 4061 4062 ompt_set_thread_state(root_thread, ompt_state_work_serial); 4063 } 4064 #endif 4065 #if OMPD_SUPPORT 4066 if (ompd_state & OMPD_ENABLE_BP) 4067 ompd_bp_thread_begin(); 4068 #endif 4069 4070 KMP_MB(); 4071 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); 4072 4073 return gtid; 4074 } 4075 4076 #if KMP_NESTED_HOT_TEAMS 4077 static int __kmp_free_hot_teams(kmp_root_t *root, kmp_info_t *thr, int level, 4078 const int max_level) { 4079 int i, n, nth; 4080 kmp_hot_team_ptr_t *hot_teams = thr->th.th_hot_teams; 4081 if (!hot_teams || !hot_teams[level].hot_team) { 4082 return 0; 4083 } 4084 KMP_DEBUG_ASSERT(level < max_level); 4085 kmp_team_t *team = hot_teams[level].hot_team; 4086 nth = hot_teams[level].hot_team_nth; 4087 n = nth - 1; // primary thread is not freed 4088 if (level < max_level - 1) { 4089 for (i = 0; i < nth; ++i) { 4090 kmp_info_t *th = team->t.t_threads[i]; 4091 n += __kmp_free_hot_teams(root, th, level + 1, max_level); 4092 if (i > 0 && th->th.th_hot_teams) { 4093 __kmp_free(th->th.th_hot_teams); 4094 th->th.th_hot_teams = NULL; 4095 } 4096 } 4097 } 4098 __kmp_free_team(root, team, NULL); 4099 return n; 4100 } 4101 #endif 4102 4103 // Resets a root thread and clear its root and hot teams. 4104 // Returns the number of __kmp_threads entries directly and indirectly freed. 4105 static int __kmp_reset_root(int gtid, kmp_root_t *root) { 4106 kmp_team_t *root_team = root->r.r_root_team; 4107 kmp_team_t *hot_team = root->r.r_hot_team; 4108 int n = hot_team->t.t_nproc; 4109 int i; 4110 4111 KMP_DEBUG_ASSERT(!root->r.r_active); 4112 4113 root->r.r_root_team = NULL; 4114 root->r.r_hot_team = NULL; 4115 // __kmp_free_team() does not free hot teams, so we have to clear r_hot_team 4116 // before call to __kmp_free_team(). 4117 __kmp_free_team(root, root_team USE_NESTED_HOT_ARG(NULL)); 4118 #if KMP_NESTED_HOT_TEAMS 4119 if (__kmp_hot_teams_max_level > 4120 0) { // need to free nested hot teams and their threads if any 4121 for (i = 0; i < hot_team->t.t_nproc; ++i) { 4122 kmp_info_t *th = hot_team->t.t_threads[i]; 4123 if (__kmp_hot_teams_max_level > 1) { 4124 n += __kmp_free_hot_teams(root, th, 1, __kmp_hot_teams_max_level); 4125 } 4126 if (th->th.th_hot_teams) { 4127 __kmp_free(th->th.th_hot_teams); 4128 th->th.th_hot_teams = NULL; 4129 } 4130 } 4131 } 4132 #endif 4133 __kmp_free_team(root, hot_team USE_NESTED_HOT_ARG(NULL)); 4134 4135 // Before we can reap the thread, we need to make certain that all other 4136 // threads in the teams that had this root as ancestor have stopped trying to 4137 // steal tasks. 4138 if (__kmp_tasking_mode != tskm_immediate_exec) { 4139 __kmp_wait_to_unref_task_teams(); 4140 } 4141 4142 #if KMP_OS_WINDOWS 4143 /* Close Handle of root duplicated in __kmp_create_worker (tr #62919) */ 4144 KA_TRACE( 4145 10, ("__kmp_reset_root: free handle, th = %p, handle = %" KMP_UINTPTR_SPEC 4146 "\n", 4147 (LPVOID) & (root->r.r_uber_thread->th), 4148 root->r.r_uber_thread->th.th_info.ds.ds_thread)); 4149 __kmp_free_handle(root->r.r_uber_thread->th.th_info.ds.ds_thread); 4150 #endif /* KMP_OS_WINDOWS */ 4151 4152 #if OMPD_SUPPORT 4153 if (ompd_state & OMPD_ENABLE_BP) 4154 ompd_bp_thread_end(); 4155 #endif 4156 4157 #if OMPT_SUPPORT 4158 ompt_data_t *task_data; 4159 ompt_data_t *parallel_data; 4160 __ompt_get_task_info_internal(0, NULL, &task_data, NULL, ¶llel_data, 4161 NULL); 4162 if (ompt_enabled.ompt_callback_implicit_task) { 4163 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 4164 ompt_scope_end, parallel_data, task_data, 0, 1, ompt_task_initial); 4165 } 4166 if (ompt_enabled.ompt_callback_thread_end) { 4167 ompt_callbacks.ompt_callback(ompt_callback_thread_end)( 4168 &(root->r.r_uber_thread->th.ompt_thread_info.thread_data)); 4169 } 4170 #endif 4171 4172 TCW_4(__kmp_nth, 4173 __kmp_nth - 1); // __kmp_reap_thread will decrement __kmp_all_nth. 4174 i = root->r.r_uber_thread->th.th_cg_roots->cg_nthreads--; 4175 KA_TRACE(100, ("__kmp_reset_root: Thread %p decrement cg_nthreads on node %p" 4176 " to %d\n", 4177 root->r.r_uber_thread, root->r.r_uber_thread->th.th_cg_roots, 4178 root->r.r_uber_thread->th.th_cg_roots->cg_nthreads)); 4179 if (i == 1) { 4180 // need to free contention group structure 4181 KMP_DEBUG_ASSERT(root->r.r_uber_thread == 4182 root->r.r_uber_thread->th.th_cg_roots->cg_root); 4183 KMP_DEBUG_ASSERT(root->r.r_uber_thread->th.th_cg_roots->up == NULL); 4184 __kmp_free(root->r.r_uber_thread->th.th_cg_roots); 4185 root->r.r_uber_thread->th.th_cg_roots = NULL; 4186 } 4187 __kmp_reap_thread(root->r.r_uber_thread, 1); 4188 4189 // We canot put root thread to __kmp_thread_pool, so we have to reap it 4190 // instead of freeing. 4191 root->r.r_uber_thread = NULL; 4192 /* mark root as no longer in use */ 4193 root->r.r_begin = FALSE; 4194 4195 return n; 4196 } 4197 4198 void __kmp_unregister_root_current_thread(int gtid) { 4199 KA_TRACE(1, ("__kmp_unregister_root_current_thread: enter T#%d\n", gtid)); 4200 /* this lock should be ok, since unregister_root_current_thread is never 4201 called during an abort, only during a normal close. furthermore, if you 4202 have the forkjoin lock, you should never try to get the initz lock */ 4203 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock); 4204 if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) { 4205 KC_TRACE(10, ("__kmp_unregister_root_current_thread: already finished, " 4206 "exiting T#%d\n", 4207 gtid)); 4208 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); 4209 return; 4210 } 4211 kmp_root_t *root = __kmp_root[gtid]; 4212 4213 KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]); 4214 KMP_ASSERT(KMP_UBER_GTID(gtid)); 4215 KMP_ASSERT(root == __kmp_threads[gtid]->th.th_root); 4216 KMP_ASSERT(root->r.r_active == FALSE); 4217 4218 KMP_MB(); 4219 4220 kmp_info_t *thread = __kmp_threads[gtid]; 4221 kmp_team_t *team = thread->th.th_team; 4222 kmp_task_team_t *task_team = thread->th.th_task_team; 4223 4224 // we need to wait for the proxy tasks before finishing the thread 4225 if (task_team != NULL && (task_team->tt.tt_found_proxy_tasks || 4226 task_team->tt.tt_hidden_helper_task_encountered)) { 4227 #if OMPT_SUPPORT 4228 // the runtime is shutting down so we won't report any events 4229 thread->th.ompt_thread_info.state = ompt_state_undefined; 4230 #endif 4231 __kmp_task_team_wait(thread, team USE_ITT_BUILD_ARG(NULL)); 4232 } 4233 4234 __kmp_reset_root(gtid, root); 4235 4236 KMP_MB(); 4237 KC_TRACE(10, 4238 ("__kmp_unregister_root_current_thread: T#%d unregistered\n", gtid)); 4239 4240 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); 4241 } 4242 4243 #if KMP_OS_WINDOWS 4244 /* __kmp_forkjoin_lock must be already held 4245 Unregisters a root thread that is not the current thread. Returns the number 4246 of __kmp_threads entries freed as a result. */ 4247 static int __kmp_unregister_root_other_thread(int gtid) { 4248 kmp_root_t *root = __kmp_root[gtid]; 4249 int r; 4250 4251 KA_TRACE(1, ("__kmp_unregister_root_other_thread: enter T#%d\n", gtid)); 4252 KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]); 4253 KMP_ASSERT(KMP_UBER_GTID(gtid)); 4254 KMP_ASSERT(root == __kmp_threads[gtid]->th.th_root); 4255 KMP_ASSERT(root->r.r_active == FALSE); 4256 4257 r = __kmp_reset_root(gtid, root); 4258 KC_TRACE(10, 4259 ("__kmp_unregister_root_other_thread: T#%d unregistered\n", gtid)); 4260 return r; 4261 } 4262 #endif 4263 4264 #if KMP_DEBUG 4265 void __kmp_task_info() { 4266 4267 kmp_int32 gtid = __kmp_entry_gtid(); 4268 kmp_int32 tid = __kmp_tid_from_gtid(gtid); 4269 kmp_info_t *this_thr = __kmp_threads[gtid]; 4270 kmp_team_t *steam = this_thr->th.th_serial_team; 4271 kmp_team_t *team = this_thr->th.th_team; 4272 4273 __kmp_printf( 4274 "__kmp_task_info: gtid=%d tid=%d t_thread=%p team=%p steam=%p curtask=%p " 4275 "ptask=%p\n", 4276 gtid, tid, this_thr, team, steam, this_thr->th.th_current_task, 4277 team->t.t_implicit_task_taskdata[tid].td_parent); 4278 } 4279 #endif // KMP_DEBUG 4280 4281 /* TODO optimize with one big memclr, take out what isn't needed, split 4282 responsibility to workers as much as possible, and delay initialization of 4283 features as much as possible */ 4284 static void __kmp_initialize_info(kmp_info_t *this_thr, kmp_team_t *team, 4285 int tid, int gtid) { 4286 /* this_thr->th.th_info.ds.ds_gtid is setup in 4287 kmp_allocate_thread/create_worker. 4288 this_thr->th.th_serial_team is setup in __kmp_allocate_thread */ 4289 KMP_DEBUG_ASSERT(this_thr != NULL); 4290 KMP_DEBUG_ASSERT(this_thr->th.th_serial_team); 4291 KMP_DEBUG_ASSERT(team); 4292 KMP_DEBUG_ASSERT(team->t.t_threads); 4293 KMP_DEBUG_ASSERT(team->t.t_dispatch); 4294 kmp_info_t *master = team->t.t_threads[0]; 4295 KMP_DEBUG_ASSERT(master); 4296 KMP_DEBUG_ASSERT(master->th.th_root); 4297 4298 KMP_MB(); 4299 4300 TCW_SYNC_PTR(this_thr->th.th_team, team); 4301 4302 this_thr->th.th_info.ds.ds_tid = tid; 4303 this_thr->th.th_set_nproc = 0; 4304 if (__kmp_tasking_mode != tskm_immediate_exec) 4305 // When tasking is possible, threads are not safe to reap until they are 4306 // done tasking; this will be set when tasking code is exited in wait 4307 this_thr->th.th_reap_state = KMP_NOT_SAFE_TO_REAP; 4308 else // no tasking --> always safe to reap 4309 this_thr->th.th_reap_state = KMP_SAFE_TO_REAP; 4310 this_thr->th.th_set_proc_bind = proc_bind_default; 4311 4312 #if KMP_AFFINITY_SUPPORTED 4313 this_thr->th.th_new_place = this_thr->th.th_current_place; 4314 #endif 4315 this_thr->th.th_root = master->th.th_root; 4316 4317 /* setup the thread's cache of the team structure */ 4318 this_thr->th.th_team_nproc = team->t.t_nproc; 4319 this_thr->th.th_team_master = master; 4320 this_thr->th.th_team_serialized = team->t.t_serialized; 4321 4322 KMP_DEBUG_ASSERT(team->t.t_implicit_task_taskdata); 4323 4324 KF_TRACE(10, ("__kmp_initialize_info1: T#%d:%d this_thread=%p curtask=%p\n", 4325 tid, gtid, this_thr, this_thr->th.th_current_task)); 4326 4327 __kmp_init_implicit_task(this_thr->th.th_team_master->th.th_ident, this_thr, 4328 team, tid, TRUE); 4329 4330 KF_TRACE(10, ("__kmp_initialize_info2: T#%d:%d this_thread=%p curtask=%p\n", 4331 tid, gtid, this_thr, this_thr->th.th_current_task)); 4332 // TODO: Initialize ICVs from parent; GEH - isn't that already done in 4333 // __kmp_initialize_team()? 4334 4335 /* TODO no worksharing in speculative threads */ 4336 this_thr->th.th_dispatch = &team->t.t_dispatch[tid]; 4337 4338 this_thr->th.th_local.this_construct = 0; 4339 4340 if (!this_thr->th.th_pri_common) { 4341 this_thr->th.th_pri_common = 4342 (struct common_table *)__kmp_allocate(sizeof(struct common_table)); 4343 if (__kmp_storage_map) { 4344 __kmp_print_storage_map_gtid( 4345 gtid, this_thr->th.th_pri_common, this_thr->th.th_pri_common + 1, 4346 sizeof(struct common_table), "th_%d.th_pri_common\n", gtid); 4347 } 4348 this_thr->th.th_pri_head = NULL; 4349 } 4350 4351 if (this_thr != master && // Primary thread's CG root is initialized elsewhere 4352 this_thr->th.th_cg_roots != master->th.th_cg_roots) { // CG root not set 4353 // Make new thread's CG root same as primary thread's 4354 KMP_DEBUG_ASSERT(master->th.th_cg_roots); 4355 kmp_cg_root_t *tmp = this_thr->th.th_cg_roots; 4356 if (tmp) { 4357 // worker changes CG, need to check if old CG should be freed 4358 int i = tmp->cg_nthreads--; 4359 KA_TRACE(100, ("__kmp_initialize_info: Thread %p decrement cg_nthreads" 4360 " on node %p of thread %p to %d\n", 4361 this_thr, tmp, tmp->cg_root, tmp->cg_nthreads)); 4362 if (i == 1) { 4363 __kmp_free(tmp); // last thread left CG --> free it 4364 } 4365 } 4366 this_thr->th.th_cg_roots = master->th.th_cg_roots; 4367 // Increment new thread's CG root's counter to add the new thread 4368 this_thr->th.th_cg_roots->cg_nthreads++; 4369 KA_TRACE(100, ("__kmp_initialize_info: Thread %p increment cg_nthreads on" 4370 " node %p of thread %p to %d\n", 4371 this_thr, this_thr->th.th_cg_roots, 4372 this_thr->th.th_cg_roots->cg_root, 4373 this_thr->th.th_cg_roots->cg_nthreads)); 4374 this_thr->th.th_current_task->td_icvs.thread_limit = 4375 this_thr->th.th_cg_roots->cg_thread_limit; 4376 } 4377 4378 /* Initialize dynamic dispatch */ 4379 { 4380 volatile kmp_disp_t *dispatch = this_thr->th.th_dispatch; 4381 // Use team max_nproc since this will never change for the team. 4382 size_t disp_size = 4383 sizeof(dispatch_private_info_t) * 4384 (team->t.t_max_nproc == 1 ? 1 : __kmp_dispatch_num_buffers); 4385 KD_TRACE(10, ("__kmp_initialize_info: T#%d max_nproc: %d\n", gtid, 4386 team->t.t_max_nproc)); 4387 KMP_ASSERT(dispatch); 4388 KMP_DEBUG_ASSERT(team->t.t_dispatch); 4389 KMP_DEBUG_ASSERT(dispatch == &team->t.t_dispatch[tid]); 4390 4391 dispatch->th_disp_index = 0; 4392 dispatch->th_doacross_buf_idx = 0; 4393 if (!dispatch->th_disp_buffer) { 4394 dispatch->th_disp_buffer = 4395 (dispatch_private_info_t *)__kmp_allocate(disp_size); 4396 4397 if (__kmp_storage_map) { 4398 __kmp_print_storage_map_gtid( 4399 gtid, &dispatch->th_disp_buffer[0], 4400 &dispatch->th_disp_buffer[team->t.t_max_nproc == 1 4401 ? 1 4402 : __kmp_dispatch_num_buffers], 4403 disp_size, 4404 "th_%d.th_dispatch.th_disp_buffer " 4405 "(team_%d.t_dispatch[%d].th_disp_buffer)", 4406 gtid, team->t.t_id, gtid); 4407 } 4408 } else { 4409 memset(&dispatch->th_disp_buffer[0], '\0', disp_size); 4410 } 4411 4412 dispatch->th_dispatch_pr_current = 0; 4413 dispatch->th_dispatch_sh_current = 0; 4414 4415 dispatch->th_deo_fcn = 0; /* ORDERED */ 4416 dispatch->th_dxo_fcn = 0; /* END ORDERED */ 4417 } 4418 4419 this_thr->th.th_next_pool = NULL; 4420 4421 KMP_DEBUG_ASSERT(!this_thr->th.th_spin_here); 4422 KMP_DEBUG_ASSERT(this_thr->th.th_next_waiting == 0); 4423 4424 KMP_MB(); 4425 } 4426 4427 /* allocate a new thread for the requesting team. this is only called from 4428 within a forkjoin critical section. we will first try to get an available 4429 thread from the thread pool. if none is available, we will fork a new one 4430 assuming we are able to create a new one. this should be assured, as the 4431 caller should check on this first. */ 4432 kmp_info_t *__kmp_allocate_thread(kmp_root_t *root, kmp_team_t *team, 4433 int new_tid) { 4434 kmp_team_t *serial_team; 4435 kmp_info_t *new_thr; 4436 int new_gtid; 4437 4438 KA_TRACE(20, ("__kmp_allocate_thread: T#%d\n", __kmp_get_gtid())); 4439 KMP_DEBUG_ASSERT(root && team); 4440 #if !KMP_NESTED_HOT_TEAMS 4441 KMP_DEBUG_ASSERT(KMP_MASTER_GTID(__kmp_get_gtid())); 4442 #endif 4443 KMP_MB(); 4444 4445 /* first, try to get one from the thread pool unless allocating thread is 4446 * the main hidden helper thread. The hidden helper team should always 4447 * allocate new OS threads. */ 4448 if (__kmp_thread_pool && !KMP_HIDDEN_HELPER_TEAM(team)) { 4449 new_thr = CCAST(kmp_info_t *, __kmp_thread_pool); 4450 __kmp_thread_pool = (volatile kmp_info_t *)new_thr->th.th_next_pool; 4451 if (new_thr == __kmp_thread_pool_insert_pt) { 4452 __kmp_thread_pool_insert_pt = NULL; 4453 } 4454 TCW_4(new_thr->th.th_in_pool, FALSE); 4455 __kmp_suspend_initialize_thread(new_thr); 4456 __kmp_lock_suspend_mx(new_thr); 4457 if (new_thr->th.th_active_in_pool == TRUE) { 4458 KMP_DEBUG_ASSERT(new_thr->th.th_active == TRUE); 4459 KMP_ATOMIC_DEC(&__kmp_thread_pool_active_nth); 4460 new_thr->th.th_active_in_pool = FALSE; 4461 } 4462 __kmp_unlock_suspend_mx(new_thr); 4463 4464 KA_TRACE(20, ("__kmp_allocate_thread: T#%d using thread T#%d\n", 4465 __kmp_get_gtid(), new_thr->th.th_info.ds.ds_gtid)); 4466 KMP_ASSERT(!new_thr->th.th_team); 4467 KMP_DEBUG_ASSERT(__kmp_nth < __kmp_threads_capacity); 4468 4469 /* setup the thread structure */ 4470 __kmp_initialize_info(new_thr, team, new_tid, 4471 new_thr->th.th_info.ds.ds_gtid); 4472 KMP_DEBUG_ASSERT(new_thr->th.th_serial_team); 4473 4474 TCW_4(__kmp_nth, __kmp_nth + 1); 4475 4476 new_thr->th.th_task_state = 0; 4477 4478 if (__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) { 4479 // Make sure pool thread has transitioned to waiting on own thread struct 4480 KMP_DEBUG_ASSERT(new_thr->th.th_used_in_team.load() == 0); 4481 // Thread activated in __kmp_allocate_team when increasing team size 4482 } 4483 4484 #ifdef KMP_ADJUST_BLOCKTIME 4485 /* Adjust blocktime back to zero if necessary */ 4486 /* Middle initialization might not have occurred yet */ 4487 if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) { 4488 if (__kmp_nth > __kmp_avail_proc) { 4489 __kmp_zero_bt = TRUE; 4490 } 4491 } 4492 #endif /* KMP_ADJUST_BLOCKTIME */ 4493 4494 #if KMP_DEBUG 4495 // If thread entered pool via __kmp_free_thread, wait_flag should != 4496 // KMP_BARRIER_PARENT_FLAG. 4497 int b; 4498 kmp_balign_t *balign = new_thr->th.th_bar; 4499 for (b = 0; b < bs_last_barrier; ++b) 4500 KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG); 4501 #endif 4502 4503 KF_TRACE(10, ("__kmp_allocate_thread: T#%d using thread %p T#%d\n", 4504 __kmp_get_gtid(), new_thr, new_thr->th.th_info.ds.ds_gtid)); 4505 4506 KMP_MB(); 4507 return new_thr; 4508 } 4509 4510 /* no, well fork a new one */ 4511 KMP_ASSERT(KMP_HIDDEN_HELPER_TEAM(team) || __kmp_nth == __kmp_all_nth); 4512 KMP_ASSERT(__kmp_all_nth < __kmp_threads_capacity); 4513 4514 #if KMP_USE_MONITOR 4515 // If this is the first worker thread the RTL is creating, then also 4516 // launch the monitor thread. We try to do this as early as possible. 4517 if (!TCR_4(__kmp_init_monitor)) { 4518 __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock); 4519 if (!TCR_4(__kmp_init_monitor)) { 4520 KF_TRACE(10, ("before __kmp_create_monitor\n")); 4521 TCW_4(__kmp_init_monitor, 1); 4522 __kmp_create_monitor(&__kmp_monitor); 4523 KF_TRACE(10, ("after __kmp_create_monitor\n")); 4524 #if KMP_OS_WINDOWS 4525 // AC: wait until monitor has started. This is a fix for CQ232808. 4526 // The reason is that if the library is loaded/unloaded in a loop with 4527 // small (parallel) work in between, then there is high probability that 4528 // monitor thread started after the library shutdown. At shutdown it is 4529 // too late to cope with the problem, because when the primary thread is 4530 // in DllMain (process detach) the monitor has no chances to start (it is 4531 // blocked), and primary thread has no means to inform the monitor that 4532 // the library has gone, because all the memory which the monitor can 4533 // access is going to be released/reset. 4534 while (TCR_4(__kmp_init_monitor) < 2) { 4535 KMP_YIELD(TRUE); 4536 } 4537 KF_TRACE(10, ("after monitor thread has started\n")); 4538 #endif 4539 } 4540 __kmp_release_bootstrap_lock(&__kmp_monitor_lock); 4541 } 4542 #endif 4543 4544 KMP_MB(); 4545 4546 { 4547 int new_start_gtid = TCR_4(__kmp_init_hidden_helper_threads) 4548 ? 1 4549 : __kmp_hidden_helper_threads_num + 1; 4550 4551 for (new_gtid = new_start_gtid; TCR_PTR(__kmp_threads[new_gtid]) != NULL; 4552 ++new_gtid) { 4553 KMP_DEBUG_ASSERT(new_gtid < __kmp_threads_capacity); 4554 } 4555 4556 if (TCR_4(__kmp_init_hidden_helper_threads)) { 4557 KMP_DEBUG_ASSERT(new_gtid <= __kmp_hidden_helper_threads_num); 4558 } 4559 } 4560 4561 /* allocate space for it. */ 4562 new_thr = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t)); 4563 4564 new_thr->th.th_nt_strict = false; 4565 new_thr->th.th_nt_loc = NULL; 4566 new_thr->th.th_nt_sev = severity_fatal; 4567 new_thr->th.th_nt_msg = NULL; 4568 4569 TCW_SYNC_PTR(__kmp_threads[new_gtid], new_thr); 4570 4571 #if USE_ITT_BUILD && USE_ITT_NOTIFY && KMP_DEBUG 4572 // suppress race conditions detection on synchronization flags in debug mode 4573 // this helps to analyze library internals eliminating false positives 4574 __itt_suppress_mark_range( 4575 __itt_suppress_range, __itt_suppress_threading_errors, 4576 &new_thr->th.th_sleep_loc, sizeof(new_thr->th.th_sleep_loc)); 4577 __itt_suppress_mark_range( 4578 __itt_suppress_range, __itt_suppress_threading_errors, 4579 &new_thr->th.th_reap_state, sizeof(new_thr->th.th_reap_state)); 4580 #if KMP_OS_WINDOWS 4581 __itt_suppress_mark_range( 4582 __itt_suppress_range, __itt_suppress_threading_errors, 4583 &new_thr->th.th_suspend_init, sizeof(new_thr->th.th_suspend_init)); 4584 #else 4585 __itt_suppress_mark_range(__itt_suppress_range, 4586 __itt_suppress_threading_errors, 4587 &new_thr->th.th_suspend_init_count, 4588 sizeof(new_thr->th.th_suspend_init_count)); 4589 #endif 4590 // TODO: check if we need to also suppress b_arrived flags 4591 __itt_suppress_mark_range(__itt_suppress_range, 4592 __itt_suppress_threading_errors, 4593 CCAST(kmp_uint64 *, &new_thr->th.th_bar[0].bb.b_go), 4594 sizeof(new_thr->th.th_bar[0].bb.b_go)); 4595 __itt_suppress_mark_range(__itt_suppress_range, 4596 __itt_suppress_threading_errors, 4597 CCAST(kmp_uint64 *, &new_thr->th.th_bar[1].bb.b_go), 4598 sizeof(new_thr->th.th_bar[1].bb.b_go)); 4599 __itt_suppress_mark_range(__itt_suppress_range, 4600 __itt_suppress_threading_errors, 4601 CCAST(kmp_uint64 *, &new_thr->th.th_bar[2].bb.b_go), 4602 sizeof(new_thr->th.th_bar[2].bb.b_go)); 4603 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY && KMP_DEBUG */ 4604 if (__kmp_storage_map) { 4605 __kmp_print_thread_storage_map(new_thr, new_gtid); 4606 } 4607 4608 // add the reserve serialized team, initialized from the team's primary thread 4609 { 4610 kmp_internal_control_t r_icvs = __kmp_get_x_global_icvs(team); 4611 KF_TRACE(10, ("__kmp_allocate_thread: before th_serial/serial_team\n")); 4612 new_thr->th.th_serial_team = serial_team = 4613 (kmp_team_t *)__kmp_allocate_team(root, 1, 1, 4614 #if OMPT_SUPPORT 4615 ompt_data_none, // root parallel id 4616 #endif 4617 proc_bind_default, &r_icvs, 4618 0 USE_NESTED_HOT_ARG(NULL)); 4619 } 4620 KMP_ASSERT(serial_team); 4621 serial_team->t.t_serialized = 0; // AC: the team created in reserve, not for 4622 // execution (it is unused for now). 4623 serial_team->t.t_threads[0] = new_thr; 4624 KF_TRACE(10, 4625 ("__kmp_allocate_thread: after th_serial/serial_team : new_thr=%p\n", 4626 new_thr)); 4627 4628 /* setup the thread structures */ 4629 __kmp_initialize_info(new_thr, team, new_tid, new_gtid); 4630 4631 #if USE_FAST_MEMORY 4632 __kmp_initialize_fast_memory(new_thr); 4633 #endif /* USE_FAST_MEMORY */ 4634 4635 #if KMP_USE_BGET 4636 KMP_DEBUG_ASSERT(new_thr->th.th_local.bget_data == NULL); 4637 __kmp_initialize_bget(new_thr); 4638 #endif 4639 4640 __kmp_init_random(new_thr); // Initialize random number generator 4641 4642 /* Initialize these only once when thread is grabbed for a team allocation */ 4643 KA_TRACE(20, 4644 ("__kmp_allocate_thread: T#%d init go fork=%u, plain=%u\n", 4645 __kmp_get_gtid(), KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE)); 4646 4647 int b; 4648 kmp_balign_t *balign = new_thr->th.th_bar; 4649 for (b = 0; b < bs_last_barrier; ++b) { 4650 balign[b].bb.b_go = KMP_INIT_BARRIER_STATE; 4651 balign[b].bb.team = NULL; 4652 balign[b].bb.wait_flag = KMP_BARRIER_NOT_WAITING; 4653 balign[b].bb.use_oncore_barrier = 0; 4654 } 4655 4656 TCW_PTR(new_thr->th.th_sleep_loc, NULL); 4657 new_thr->th.th_sleep_loc_type = flag_unset; 4658 4659 new_thr->th.th_spin_here = FALSE; 4660 new_thr->th.th_next_waiting = 0; 4661 #if KMP_OS_UNIX 4662 new_thr->th.th_blocking = false; 4663 #endif 4664 4665 #if KMP_AFFINITY_SUPPORTED 4666 new_thr->th.th_current_place = KMP_PLACE_UNDEFINED; 4667 new_thr->th.th_new_place = KMP_PLACE_UNDEFINED; 4668 new_thr->th.th_first_place = KMP_PLACE_UNDEFINED; 4669 new_thr->th.th_last_place = KMP_PLACE_UNDEFINED; 4670 #endif 4671 new_thr->th.th_def_allocator = __kmp_def_allocator; 4672 new_thr->th.th_prev_level = 0; 4673 new_thr->th.th_prev_num_threads = 1; 4674 4675 TCW_4(new_thr->th.th_in_pool, FALSE); 4676 new_thr->th.th_active_in_pool = FALSE; 4677 TCW_4(new_thr->th.th_active, TRUE); 4678 4679 new_thr->th.th_set_nested_nth = NULL; 4680 new_thr->th.th_set_nested_nth_sz = 0; 4681 4682 /* adjust the global counters */ 4683 __kmp_all_nth++; 4684 __kmp_nth++; 4685 4686 // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search) for low 4687 // numbers of procs, and method #2 (keyed API call) for higher numbers. 4688 if (__kmp_adjust_gtid_mode) { 4689 if (__kmp_all_nth >= __kmp_tls_gtid_min) { 4690 if (TCR_4(__kmp_gtid_mode) != 2) { 4691 TCW_4(__kmp_gtid_mode, 2); 4692 } 4693 } else { 4694 if (TCR_4(__kmp_gtid_mode) != 1) { 4695 TCW_4(__kmp_gtid_mode, 1); 4696 } 4697 } 4698 } 4699 4700 #ifdef KMP_ADJUST_BLOCKTIME 4701 /* Adjust blocktime back to zero if necessary */ 4702 /* Middle initialization might not have occurred yet */ 4703 if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) { 4704 if (__kmp_nth > __kmp_avail_proc) { 4705 __kmp_zero_bt = TRUE; 4706 } 4707 } 4708 #endif /* KMP_ADJUST_BLOCKTIME */ 4709 4710 #if KMP_AFFINITY_SUPPORTED 4711 // Set the affinity and topology information for new thread 4712 __kmp_affinity_set_init_mask(new_gtid, /*isa_root=*/FALSE); 4713 #endif 4714 4715 /* actually fork it and create the new worker thread */ 4716 KF_TRACE( 4717 10, ("__kmp_allocate_thread: before __kmp_create_worker: %p\n", new_thr)); 4718 __kmp_create_worker(new_gtid, new_thr, __kmp_stksize); 4719 KF_TRACE(10, 4720 ("__kmp_allocate_thread: after __kmp_create_worker: %p\n", new_thr)); 4721 4722 KA_TRACE(20, ("__kmp_allocate_thread: T#%d forked T#%d\n", __kmp_get_gtid(), 4723 new_gtid)); 4724 KMP_MB(); 4725 return new_thr; 4726 } 4727 4728 /* Reinitialize team for reuse. 4729 The hot team code calls this case at every fork barrier, so EPCC barrier 4730 test are extremely sensitive to changes in it, esp. writes to the team 4731 struct, which cause a cache invalidation in all threads. 4732 IF YOU TOUCH THIS ROUTINE, RUN EPCC C SYNCBENCH ON A BIG-IRON MACHINE!!! */ 4733 static void __kmp_reinitialize_team(kmp_team_t *team, 4734 kmp_internal_control_t *new_icvs, 4735 ident_t *loc) { 4736 KF_TRACE(10, ("__kmp_reinitialize_team: enter this_thread=%p team=%p\n", 4737 team->t.t_threads[0], team)); 4738 KMP_DEBUG_ASSERT(team && new_icvs); 4739 KMP_DEBUG_ASSERT((!TCR_4(__kmp_init_parallel)) || new_icvs->nproc); 4740 KMP_CHECK_UPDATE(team->t.t_ident, loc); 4741 4742 KMP_CHECK_UPDATE(team->t.t_id, KMP_GEN_TEAM_ID()); 4743 // Copy ICVs to the primary thread's implicit taskdata 4744 __kmp_init_implicit_task(loc, team->t.t_threads[0], team, 0, FALSE); 4745 copy_icvs(&team->t.t_implicit_task_taskdata[0].td_icvs, new_icvs); 4746 4747 KF_TRACE(10, ("__kmp_reinitialize_team: exit this_thread=%p team=%p\n", 4748 team->t.t_threads[0], team)); 4749 } 4750 4751 /* Initialize the team data structure. 4752 This assumes the t_threads and t_max_nproc are already set. 4753 Also, we don't touch the arguments */ 4754 static void __kmp_initialize_team(kmp_team_t *team, int new_nproc, 4755 kmp_internal_control_t *new_icvs, 4756 ident_t *loc) { 4757 KF_TRACE(10, ("__kmp_initialize_team: enter: team=%p\n", team)); 4758 4759 /* verify */ 4760 KMP_DEBUG_ASSERT(team); 4761 KMP_DEBUG_ASSERT(new_nproc <= team->t.t_max_nproc); 4762 KMP_DEBUG_ASSERT(team->t.t_threads); 4763 KMP_MB(); 4764 4765 team->t.t_master_tid = 0; /* not needed */ 4766 /* team->t.t_master_bar; not needed */ 4767 team->t.t_serialized = new_nproc > 1 ? 0 : 1; 4768 team->t.t_nproc = new_nproc; 4769 4770 /* team->t.t_parent = NULL; TODO not needed & would mess up hot team */ 4771 team->t.t_next_pool = NULL; 4772 /* memset( team->t.t_threads, 0, sizeof(kmp_info_t*)*new_nproc ); would mess 4773 * up hot team */ 4774 4775 TCW_SYNC_PTR(team->t.t_pkfn, NULL); /* not needed */ 4776 team->t.t_invoke = NULL; /* not needed */ 4777 4778 // TODO???: team->t.t_max_active_levels = new_max_active_levels; 4779 team->t.t_sched.sched = new_icvs->sched.sched; 4780 4781 #if KMP_ARCH_X86 || KMP_ARCH_X86_64 4782 team->t.t_fp_control_saved = FALSE; /* not needed */ 4783 team->t.t_x87_fpu_control_word = 0; /* not needed */ 4784 team->t.t_mxcsr = 0; /* not needed */ 4785 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */ 4786 4787 team->t.t_construct = 0; 4788 4789 team->t.t_ordered.dt.t_value = 0; 4790 team->t.t_master_active = FALSE; 4791 4792 #ifdef KMP_DEBUG 4793 team->t.t_copypriv_data = NULL; /* not necessary, but nice for debugging */ 4794 #endif 4795 #if KMP_OS_WINDOWS 4796 team->t.t_copyin_counter = 0; /* for barrier-free copyin implementation */ 4797 #endif 4798 4799 team->t.t_control_stack_top = NULL; 4800 4801 __kmp_reinitialize_team(team, new_icvs, loc); 4802 4803 KMP_MB(); 4804 KF_TRACE(10, ("__kmp_initialize_team: exit: team=%p\n", team)); 4805 } 4806 4807 #if KMP_AFFINITY_SUPPORTED 4808 static inline void __kmp_set_thread_place(kmp_team_t *team, kmp_info_t *th, 4809 int first, int last, int newp) { 4810 th->th.th_first_place = first; 4811 th->th.th_last_place = last; 4812 th->th.th_new_place = newp; 4813 if (newp != th->th.th_current_place) { 4814 if (__kmp_display_affinity && team->t.t_display_affinity != 1) 4815 team->t.t_display_affinity = 1; 4816 // Copy topology information associated with the new place 4817 th->th.th_topology_ids = __kmp_affinity.ids[th->th.th_new_place]; 4818 th->th.th_topology_attrs = __kmp_affinity.attrs[th->th.th_new_place]; 4819 } 4820 } 4821 4822 // __kmp_partition_places() is the heart of the OpenMP 4.0 affinity mechanism. 4823 // It calculates the worker + primary thread's partition based upon the parent 4824 // thread's partition, and binds each worker to a thread in their partition. 4825 // The primary thread's partition should already include its current binding. 4826 static void __kmp_partition_places(kmp_team_t *team, int update_master_only) { 4827 // Do not partition places for the hidden helper team 4828 if (KMP_HIDDEN_HELPER_TEAM(team)) 4829 return; 4830 // Copy the primary thread's place partition to the team struct 4831 kmp_info_t *master_th = team->t.t_threads[0]; 4832 KMP_DEBUG_ASSERT(master_th != NULL); 4833 kmp_proc_bind_t proc_bind = team->t.t_proc_bind; 4834 int first_place = master_th->th.th_first_place; 4835 int last_place = master_th->th.th_last_place; 4836 int masters_place = master_th->th.th_current_place; 4837 int num_masks = __kmp_affinity.num_masks; 4838 team->t.t_first_place = first_place; 4839 team->t.t_last_place = last_place; 4840 4841 KA_TRACE(20, ("__kmp_partition_places: enter: proc_bind = %d T#%d(%d:0) " 4842 "bound to place %d partition = [%d,%d]\n", 4843 proc_bind, __kmp_gtid_from_thread(team->t.t_threads[0]), 4844 team->t.t_id, masters_place, first_place, last_place)); 4845 4846 switch (proc_bind) { 4847 4848 case proc_bind_default: 4849 // Serial teams might have the proc_bind policy set to proc_bind_default. 4850 // Not an issue -- we don't rebind primary thread for any proc_bind policy. 4851 KMP_DEBUG_ASSERT(team->t.t_nproc == 1); 4852 break; 4853 4854 case proc_bind_primary: { 4855 int f; 4856 int n_th = team->t.t_nproc; 4857 for (f = 1; f < n_th; f++) { 4858 kmp_info_t *th = team->t.t_threads[f]; 4859 KMP_DEBUG_ASSERT(th != NULL); 4860 __kmp_set_thread_place(team, th, first_place, last_place, masters_place); 4861 4862 KA_TRACE(100, ("__kmp_partition_places: primary: T#%d(%d:%d) place %d " 4863 "partition = [%d,%d]\n", 4864 __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id, 4865 f, masters_place, first_place, last_place)); 4866 } 4867 } break; 4868 4869 case proc_bind_close: { 4870 int f; 4871 int n_th = team->t.t_nproc; 4872 int n_places; 4873 if (first_place <= last_place) { 4874 n_places = last_place - first_place + 1; 4875 } else { 4876 n_places = num_masks - first_place + last_place + 1; 4877 } 4878 if (n_th <= n_places) { 4879 int place = masters_place; 4880 for (f = 1; f < n_th; f++) { 4881 kmp_info_t *th = team->t.t_threads[f]; 4882 KMP_DEBUG_ASSERT(th != NULL); 4883 4884 if (place == last_place) { 4885 place = first_place; 4886 } else if (place == (num_masks - 1)) { 4887 place = 0; 4888 } else { 4889 place++; 4890 } 4891 __kmp_set_thread_place(team, th, first_place, last_place, place); 4892 4893 KA_TRACE(100, ("__kmp_partition_places: close: T#%d(%d:%d) place %d " 4894 "partition = [%d,%d]\n", 4895 __kmp_gtid_from_thread(team->t.t_threads[f]), 4896 team->t.t_id, f, place, first_place, last_place)); 4897 } 4898 } else { 4899 int S, rem, gap, s_count; 4900 S = n_th / n_places; 4901 s_count = 0; 4902 rem = n_th - (S * n_places); 4903 gap = rem > 0 ? n_places / rem : n_places; 4904 int place = masters_place; 4905 int gap_ct = gap; 4906 for (f = 0; f < n_th; f++) { 4907 kmp_info_t *th = team->t.t_threads[f]; 4908 KMP_DEBUG_ASSERT(th != NULL); 4909 4910 __kmp_set_thread_place(team, th, first_place, last_place, place); 4911 s_count++; 4912 4913 if ((s_count == S) && rem && (gap_ct == gap)) { 4914 // do nothing, add an extra thread to place on next iteration 4915 } else if ((s_count == S + 1) && rem && (gap_ct == gap)) { 4916 // we added an extra thread to this place; move to next place 4917 if (place == last_place) { 4918 place = first_place; 4919 } else if (place == (num_masks - 1)) { 4920 place = 0; 4921 } else { 4922 place++; 4923 } 4924 s_count = 0; 4925 gap_ct = 1; 4926 rem--; 4927 } else if (s_count == S) { // place full; don't add extra 4928 if (place == last_place) { 4929 place = first_place; 4930 } else if (place == (num_masks - 1)) { 4931 place = 0; 4932 } else { 4933 place++; 4934 } 4935 gap_ct++; 4936 s_count = 0; 4937 } 4938 4939 KA_TRACE(100, 4940 ("__kmp_partition_places: close: T#%d(%d:%d) place %d " 4941 "partition = [%d,%d]\n", 4942 __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id, f, 4943 th->th.th_new_place, first_place, last_place)); 4944 } 4945 KMP_DEBUG_ASSERT(place == masters_place); 4946 } 4947 } break; 4948 4949 case proc_bind_spread: { 4950 int f; 4951 int n_th = team->t.t_nproc; 4952 int n_places; 4953 int thidx; 4954 if (first_place <= last_place) { 4955 n_places = last_place - first_place + 1; 4956 } else { 4957 n_places = num_masks - first_place + last_place + 1; 4958 } 4959 if (n_th <= n_places) { 4960 int place = -1; 4961 4962 if (n_places != num_masks) { 4963 int S = n_places / n_th; 4964 int s_count, rem, gap, gap_ct; 4965 4966 place = masters_place; 4967 rem = n_places - n_th * S; 4968 gap = rem ? n_th / rem : 1; 4969 gap_ct = gap; 4970 thidx = n_th; 4971 if (update_master_only == 1) 4972 thidx = 1; 4973 for (f = 0; f < thidx; f++) { 4974 kmp_info_t *th = team->t.t_threads[f]; 4975 KMP_DEBUG_ASSERT(th != NULL); 4976 4977 int fplace = place, nplace = place; 4978 s_count = 1; 4979 while (s_count < S) { 4980 if (place == last_place) { 4981 place = first_place; 4982 } else if (place == (num_masks - 1)) { 4983 place = 0; 4984 } else { 4985 place++; 4986 } 4987 s_count++; 4988 } 4989 if (rem && (gap_ct == gap)) { 4990 if (place == last_place) { 4991 place = first_place; 4992 } else if (place == (num_masks - 1)) { 4993 place = 0; 4994 } else { 4995 place++; 4996 } 4997 rem--; 4998 gap_ct = 0; 4999 } 5000 __kmp_set_thread_place(team, th, fplace, place, nplace); 5001 gap_ct++; 5002 5003 if (place == last_place) { 5004 place = first_place; 5005 } else if (place == (num_masks - 1)) { 5006 place = 0; 5007 } else { 5008 place++; 5009 } 5010 5011 KA_TRACE(100, 5012 ("__kmp_partition_places: spread: T#%d(%d:%d) place %d " 5013 "partition = [%d,%d], num_masks: %u\n", 5014 __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id, 5015 f, th->th.th_new_place, th->th.th_first_place, 5016 th->th.th_last_place, num_masks)); 5017 } 5018 } else { 5019 /* Having uniform space of available computation places I can create 5020 T partitions of round(P/T) size and put threads into the first 5021 place of each partition. */ 5022 double current = static_cast<double>(masters_place); 5023 double spacing = 5024 (static_cast<double>(n_places + 1) / static_cast<double>(n_th)); 5025 int first, last; 5026 kmp_info_t *th; 5027 5028 thidx = n_th + 1; 5029 if (update_master_only == 1) 5030 thidx = 1; 5031 for (f = 0; f < thidx; f++) { 5032 first = static_cast<int>(current); 5033 last = static_cast<int>(current + spacing) - 1; 5034 KMP_DEBUG_ASSERT(last >= first); 5035 if (first >= n_places) { 5036 if (masters_place) { 5037 first -= n_places; 5038 last -= n_places; 5039 if (first == (masters_place + 1)) { 5040 KMP_DEBUG_ASSERT(f == n_th); 5041 first--; 5042 } 5043 if (last == masters_place) { 5044 KMP_DEBUG_ASSERT(f == (n_th - 1)); 5045 last--; 5046 } 5047 } else { 5048 KMP_DEBUG_ASSERT(f == n_th); 5049 first = 0; 5050 last = 0; 5051 } 5052 } 5053 if (last >= n_places) { 5054 last = (n_places - 1); 5055 } 5056 place = first; 5057 current += spacing; 5058 if (f < n_th) { 5059 KMP_DEBUG_ASSERT(0 <= first); 5060 KMP_DEBUG_ASSERT(n_places > first); 5061 KMP_DEBUG_ASSERT(0 <= last); 5062 KMP_DEBUG_ASSERT(n_places > last); 5063 KMP_DEBUG_ASSERT(last_place >= first_place); 5064 th = team->t.t_threads[f]; 5065 KMP_DEBUG_ASSERT(th); 5066 __kmp_set_thread_place(team, th, first, last, place); 5067 KA_TRACE(100, 5068 ("__kmp_partition_places: spread: T#%d(%d:%d) place %d " 5069 "partition = [%d,%d], spacing = %.4f\n", 5070 __kmp_gtid_from_thread(team->t.t_threads[f]), 5071 team->t.t_id, f, th->th.th_new_place, 5072 th->th.th_first_place, th->th.th_last_place, spacing)); 5073 } 5074 } 5075 } 5076 KMP_DEBUG_ASSERT(update_master_only || place == masters_place); 5077 } else { 5078 int S, rem, gap, s_count; 5079 S = n_th / n_places; 5080 s_count = 0; 5081 rem = n_th - (S * n_places); 5082 gap = rem > 0 ? n_places / rem : n_places; 5083 int place = masters_place; 5084 int gap_ct = gap; 5085 thidx = n_th; 5086 if (update_master_only == 1) 5087 thidx = 1; 5088 for (f = 0; f < thidx; f++) { 5089 kmp_info_t *th = team->t.t_threads[f]; 5090 KMP_DEBUG_ASSERT(th != NULL); 5091 5092 __kmp_set_thread_place(team, th, place, place, place); 5093 s_count++; 5094 5095 if ((s_count == S) && rem && (gap_ct == gap)) { 5096 // do nothing, add an extra thread to place on next iteration 5097 } else if ((s_count == S + 1) && rem && (gap_ct == gap)) { 5098 // we added an extra thread to this place; move on to next place 5099 if (place == last_place) { 5100 place = first_place; 5101 } else if (place == (num_masks - 1)) { 5102 place = 0; 5103 } else { 5104 place++; 5105 } 5106 s_count = 0; 5107 gap_ct = 1; 5108 rem--; 5109 } else if (s_count == S) { // place is full; don't add extra thread 5110 if (place == last_place) { 5111 place = first_place; 5112 } else if (place == (num_masks - 1)) { 5113 place = 0; 5114 } else { 5115 place++; 5116 } 5117 gap_ct++; 5118 s_count = 0; 5119 } 5120 5121 KA_TRACE(100, ("__kmp_partition_places: spread: T#%d(%d:%d) place %d " 5122 "partition = [%d,%d]\n", 5123 __kmp_gtid_from_thread(team->t.t_threads[f]), 5124 team->t.t_id, f, th->th.th_new_place, 5125 th->th.th_first_place, th->th.th_last_place)); 5126 } 5127 KMP_DEBUG_ASSERT(update_master_only || place == masters_place); 5128 } 5129 } break; 5130 5131 default: 5132 break; 5133 } 5134 5135 KA_TRACE(20, ("__kmp_partition_places: exit T#%d\n", team->t.t_id)); 5136 } 5137 5138 #endif // KMP_AFFINITY_SUPPORTED 5139 5140 /* allocate a new team data structure to use. take one off of the free pool if 5141 available */ 5142 kmp_team_t * 5143 __kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc, 5144 #if OMPT_SUPPORT 5145 ompt_data_t ompt_parallel_data, 5146 #endif 5147 kmp_proc_bind_t new_proc_bind, 5148 kmp_internal_control_t *new_icvs, 5149 int argc USE_NESTED_HOT_ARG(kmp_info_t *master)) { 5150 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_allocate_team); 5151 int f; 5152 kmp_team_t *team; 5153 int use_hot_team = !root->r.r_active; 5154 int level = 0; 5155 int do_place_partition = 1; 5156 5157 KA_TRACE(20, ("__kmp_allocate_team: called\n")); 5158 KMP_DEBUG_ASSERT(new_nproc >= 1 && argc >= 0); 5159 KMP_DEBUG_ASSERT(max_nproc >= new_nproc); 5160 KMP_MB(); 5161 5162 #if KMP_NESTED_HOT_TEAMS 5163 kmp_hot_team_ptr_t *hot_teams; 5164 if (master) { 5165 team = master->th.th_team; 5166 level = team->t.t_active_level; 5167 if (master->th.th_teams_microtask) { // in teams construct? 5168 if (master->th.th_teams_size.nteams > 1 && 5169 ( // #teams > 1 5170 team->t.t_pkfn == 5171 (microtask_t)__kmp_teams_master || // inner fork of the teams 5172 master->th.th_teams_level < 5173 team->t.t_level)) { // or nested parallel inside the teams 5174 ++level; // not increment if #teams==1, or for outer fork of the teams; 5175 // increment otherwise 5176 } 5177 // Do not perform the place partition if inner fork of the teams 5178 // Wait until nested parallel region encountered inside teams construct 5179 if ((master->th.th_teams_size.nteams == 1 && 5180 master->th.th_teams_level >= team->t.t_level) || 5181 (team->t.t_pkfn == (microtask_t)__kmp_teams_master)) 5182 do_place_partition = 0; 5183 } 5184 hot_teams = master->th.th_hot_teams; 5185 if (level < __kmp_hot_teams_max_level && hot_teams && 5186 hot_teams[level].hot_team) { 5187 // hot team has already been allocated for given level 5188 use_hot_team = 1; 5189 } else { 5190 use_hot_team = 0; 5191 } 5192 } else { 5193 // check we won't access uninitialized hot_teams, just in case 5194 KMP_DEBUG_ASSERT(new_nproc == 1); 5195 } 5196 #endif 5197 // Optimization to use a "hot" team 5198 if (use_hot_team && new_nproc > 1) { 5199 KMP_DEBUG_ASSERT(new_nproc <= max_nproc); 5200 #if KMP_NESTED_HOT_TEAMS 5201 team = hot_teams[level].hot_team; 5202 #else 5203 team = root->r.r_hot_team; 5204 #endif 5205 #if KMP_DEBUG 5206 if (__kmp_tasking_mode != tskm_immediate_exec) { 5207 KA_TRACE(20, ("__kmp_allocate_team: hot team task_team[0] = %p " 5208 "task_team[1] = %p before reinit\n", 5209 team->t.t_task_team[0], team->t.t_task_team[1])); 5210 } 5211 #endif 5212 5213 if (team->t.t_nproc != new_nproc && 5214 __kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) { 5215 // Distributed barrier may need a resize 5216 int old_nthr = team->t.t_nproc; 5217 __kmp_resize_dist_barrier(team, old_nthr, new_nproc); 5218 } 5219 5220 // If not doing the place partition, then reset the team's proc bind 5221 // to indicate that partitioning of all threads still needs to take place 5222 if (do_place_partition == 0) 5223 team->t.t_proc_bind = proc_bind_default; 5224 // Has the number of threads changed? 5225 /* Let's assume the most common case is that the number of threads is 5226 unchanged, and put that case first. */ 5227 if (team->t.t_nproc == new_nproc) { // Check changes in number of threads 5228 KA_TRACE(20, ("__kmp_allocate_team: reusing hot team\n")); 5229 // This case can mean that omp_set_num_threads() was called and the hot 5230 // team size was already reduced, so we check the special flag 5231 if (team->t.t_size_changed == -1) { 5232 team->t.t_size_changed = 1; 5233 } else { 5234 KMP_CHECK_UPDATE(team->t.t_size_changed, 0); 5235 } 5236 5237 // TODO???: team->t.t_max_active_levels = new_max_active_levels; 5238 kmp_r_sched_t new_sched = new_icvs->sched; 5239 // set primary thread's schedule as new run-time schedule 5240 KMP_CHECK_UPDATE(team->t.t_sched.sched, new_sched.sched); 5241 5242 __kmp_reinitialize_team(team, new_icvs, 5243 root->r.r_uber_thread->th.th_ident); 5244 5245 KF_TRACE(10, ("__kmp_allocate_team2: T#%d, this_thread=%p team=%p\n", 0, 5246 team->t.t_threads[0], team)); 5247 __kmp_push_current_task_to_thread(team->t.t_threads[0], team, 0); 5248 5249 #if KMP_AFFINITY_SUPPORTED 5250 if ((team->t.t_size_changed == 0) && 5251 (team->t.t_proc_bind == new_proc_bind)) { 5252 if (new_proc_bind == proc_bind_spread) { 5253 if (do_place_partition) { 5254 // add flag to update only master for spread 5255 __kmp_partition_places(team, 1); 5256 } 5257 } 5258 KA_TRACE(200, ("__kmp_allocate_team: reusing hot team #%d bindings: " 5259 "proc_bind = %d, partition = [%d,%d]\n", 5260 team->t.t_id, new_proc_bind, team->t.t_first_place, 5261 team->t.t_last_place)); 5262 } else { 5263 if (do_place_partition) { 5264 KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind); 5265 __kmp_partition_places(team); 5266 } 5267 } 5268 #else 5269 KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind); 5270 #endif /* KMP_AFFINITY_SUPPORTED */ 5271 } else if (team->t.t_nproc > new_nproc) { 5272 KA_TRACE(20, 5273 ("__kmp_allocate_team: decreasing hot team thread count to %d\n", 5274 new_nproc)); 5275 5276 team->t.t_size_changed = 1; 5277 if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) { 5278 // Barrier size already reduced earlier in this function 5279 // Activate team threads via th_used_in_team 5280 __kmp_add_threads_to_team(team, new_nproc); 5281 } 5282 // When decreasing team size, threads no longer in the team should 5283 // unref task team. 5284 if (__kmp_tasking_mode != tskm_immediate_exec) { 5285 for (f = new_nproc; f < team->t.t_nproc; f++) { 5286 kmp_info_t *th = team->t.t_threads[f]; 5287 KMP_DEBUG_ASSERT(th); 5288 th->th.th_task_team = NULL; 5289 } 5290 } 5291 #if KMP_NESTED_HOT_TEAMS 5292 if (__kmp_hot_teams_mode == 0) { 5293 // AC: saved number of threads should correspond to team's value in this 5294 // mode, can be bigger in mode 1, when hot team has threads in reserve 5295 KMP_DEBUG_ASSERT(hot_teams[level].hot_team_nth == team->t.t_nproc); 5296 hot_teams[level].hot_team_nth = new_nproc; 5297 #endif // KMP_NESTED_HOT_TEAMS 5298 /* release the extra threads we don't need any more */ 5299 for (f = new_nproc; f < team->t.t_nproc; f++) { 5300 KMP_DEBUG_ASSERT(team->t.t_threads[f]); 5301 __kmp_free_thread(team->t.t_threads[f]); 5302 team->t.t_threads[f] = NULL; 5303 } 5304 #if KMP_NESTED_HOT_TEAMS 5305 } // (__kmp_hot_teams_mode == 0) 5306 else { 5307 // When keeping extra threads in team, switch threads to wait on own 5308 // b_go flag 5309 for (f = new_nproc; f < team->t.t_nproc; ++f) { 5310 KMP_DEBUG_ASSERT(team->t.t_threads[f]); 5311 kmp_balign_t *balign = team->t.t_threads[f]->th.th_bar; 5312 for (int b = 0; b < bs_last_barrier; ++b) { 5313 if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG) { 5314 balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG; 5315 } 5316 KMP_CHECK_UPDATE(balign[b].bb.leaf_kids, 0); 5317 } 5318 } 5319 } 5320 #endif // KMP_NESTED_HOT_TEAMS 5321 team->t.t_nproc = new_nproc; 5322 // TODO???: team->t.t_max_active_levels = new_max_active_levels; 5323 KMP_CHECK_UPDATE(team->t.t_sched.sched, new_icvs->sched.sched); 5324 __kmp_reinitialize_team(team, new_icvs, 5325 root->r.r_uber_thread->th.th_ident); 5326 5327 // Update remaining threads 5328 for (f = 0; f < new_nproc; ++f) { 5329 team->t.t_threads[f]->th.th_team_nproc = new_nproc; 5330 } 5331 5332 // restore the current task state of the primary thread: should be the 5333 // implicit task 5334 KF_TRACE(10, ("__kmp_allocate_team: T#%d, this_thread=%p team=%p\n", 0, 5335 team->t.t_threads[0], team)); 5336 5337 __kmp_push_current_task_to_thread(team->t.t_threads[0], team, 0); 5338 5339 #ifdef KMP_DEBUG 5340 for (f = 0; f < team->t.t_nproc; f++) { 5341 KMP_DEBUG_ASSERT(team->t.t_threads[f] && 5342 team->t.t_threads[f]->th.th_team_nproc == 5343 team->t.t_nproc); 5344 } 5345 #endif 5346 5347 if (do_place_partition) { 5348 KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind); 5349 #if KMP_AFFINITY_SUPPORTED 5350 __kmp_partition_places(team); 5351 #endif 5352 } 5353 } else { // team->t.t_nproc < new_nproc 5354 5355 KA_TRACE(20, 5356 ("__kmp_allocate_team: increasing hot team thread count to %d\n", 5357 new_nproc)); 5358 int old_nproc = team->t.t_nproc; // save old value and use to update only 5359 team->t.t_size_changed = 1; 5360 5361 #if KMP_NESTED_HOT_TEAMS 5362 int avail_threads = hot_teams[level].hot_team_nth; 5363 if (new_nproc < avail_threads) 5364 avail_threads = new_nproc; 5365 kmp_info_t **other_threads = team->t.t_threads; 5366 for (f = team->t.t_nproc; f < avail_threads; ++f) { 5367 // Adjust barrier data of reserved threads (if any) of the team 5368 // Other data will be set in __kmp_initialize_info() below. 5369 int b; 5370 kmp_balign_t *balign = other_threads[f]->th.th_bar; 5371 for (b = 0; b < bs_last_barrier; ++b) { 5372 balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived; 5373 KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG); 5374 #if USE_DEBUGGER 5375 balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived; 5376 #endif 5377 } 5378 } 5379 if (hot_teams[level].hot_team_nth >= new_nproc) { 5380 // we have all needed threads in reserve, no need to allocate any 5381 // this only possible in mode 1, cannot have reserved threads in mode 0 5382 KMP_DEBUG_ASSERT(__kmp_hot_teams_mode == 1); 5383 team->t.t_nproc = new_nproc; // just get reserved threads involved 5384 } else { 5385 // We may have some threads in reserve, but not enough; 5386 // get reserved threads involved if any. 5387 team->t.t_nproc = hot_teams[level].hot_team_nth; 5388 hot_teams[level].hot_team_nth = new_nproc; // adjust hot team max size 5389 #endif // KMP_NESTED_HOT_TEAMS 5390 if (team->t.t_max_nproc < new_nproc) { 5391 /* reallocate larger arrays */ 5392 __kmp_reallocate_team_arrays(team, new_nproc); 5393 __kmp_reinitialize_team(team, new_icvs, NULL); 5394 } 5395 5396 #if (KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_NETBSD || KMP_OS_DRAGONFLY) && \ 5397 KMP_AFFINITY_SUPPORTED 5398 /* Temporarily set full mask for primary thread before creation of 5399 workers. The reason is that workers inherit the affinity from the 5400 primary thread, so if a lot of workers are created on the single 5401 core quickly, they don't get a chance to set their own affinity for 5402 a long time. */ 5403 kmp_affinity_raii_t new_temp_affinity{__kmp_affin_fullMask}; 5404 #endif 5405 5406 /* allocate new threads for the hot team */ 5407 for (f = team->t.t_nproc; f < new_nproc; f++) { 5408 kmp_info_t *new_worker = __kmp_allocate_thread(root, team, f); 5409 KMP_DEBUG_ASSERT(new_worker); 5410 team->t.t_threads[f] = new_worker; 5411 5412 KA_TRACE(20, 5413 ("__kmp_allocate_team: team %d init T#%d arrived: " 5414 "join=%llu, plain=%llu\n", 5415 team->t.t_id, __kmp_gtid_from_tid(f, team), team->t.t_id, f, 5416 team->t.t_bar[bs_forkjoin_barrier].b_arrived, 5417 team->t.t_bar[bs_plain_barrier].b_arrived)); 5418 5419 { // Initialize barrier data for new threads. 5420 int b; 5421 kmp_balign_t *balign = new_worker->th.th_bar; 5422 for (b = 0; b < bs_last_barrier; ++b) { 5423 balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived; 5424 KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != 5425 KMP_BARRIER_PARENT_FLAG); 5426 #if USE_DEBUGGER 5427 balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived; 5428 #endif 5429 } 5430 } 5431 } 5432 5433 #if (KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_NETBSD || KMP_OS_DRAGONFLY) && \ 5434 KMP_AFFINITY_SUPPORTED 5435 /* Restore initial primary thread's affinity mask */ 5436 new_temp_affinity.restore(); 5437 #endif 5438 #if KMP_NESTED_HOT_TEAMS 5439 } // end of check of t_nproc vs. new_nproc vs. hot_team_nth 5440 #endif // KMP_NESTED_HOT_TEAMS 5441 if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) { 5442 // Barrier size already increased earlier in this function 5443 // Activate team threads via th_used_in_team 5444 __kmp_add_threads_to_team(team, new_nproc); 5445 } 5446 /* make sure everyone is syncronized */ 5447 // new threads below 5448 __kmp_initialize_team(team, new_nproc, new_icvs, 5449 root->r.r_uber_thread->th.th_ident); 5450 5451 /* reinitialize the threads */ 5452 KMP_DEBUG_ASSERT(team->t.t_nproc == new_nproc); 5453 for (f = 0; f < team->t.t_nproc; ++f) 5454 __kmp_initialize_info(team->t.t_threads[f], team, f, 5455 __kmp_gtid_from_tid(f, team)); 5456 5457 // set th_task_state for new threads in hot team with older thread's state 5458 kmp_uint8 old_state = team->t.t_threads[old_nproc - 1]->th.th_task_state; 5459 for (f = old_nproc; f < team->t.t_nproc; ++f) 5460 team->t.t_threads[f]->th.th_task_state = old_state; 5461 5462 #ifdef KMP_DEBUG 5463 for (f = 0; f < team->t.t_nproc; ++f) { 5464 KMP_DEBUG_ASSERT(team->t.t_threads[f] && 5465 team->t.t_threads[f]->th.th_team_nproc == 5466 team->t.t_nproc); 5467 } 5468 #endif 5469 5470 if (do_place_partition) { 5471 KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind); 5472 #if KMP_AFFINITY_SUPPORTED 5473 __kmp_partition_places(team); 5474 #endif 5475 } 5476 } // Check changes in number of threads 5477 5478 if (master->th.th_teams_microtask) { 5479 for (f = 1; f < new_nproc; ++f) { 5480 // propagate teams construct specific info to workers 5481 kmp_info_t *thr = team->t.t_threads[f]; 5482 thr->th.th_teams_microtask = master->th.th_teams_microtask; 5483 thr->th.th_teams_level = master->th.th_teams_level; 5484 thr->th.th_teams_size = master->th.th_teams_size; 5485 } 5486 } 5487 #if KMP_NESTED_HOT_TEAMS 5488 if (level) { 5489 // Sync barrier state for nested hot teams, not needed for outermost hot 5490 // team. 5491 for (f = 1; f < new_nproc; ++f) { 5492 kmp_info_t *thr = team->t.t_threads[f]; 5493 int b; 5494 kmp_balign_t *balign = thr->th.th_bar; 5495 for (b = 0; b < bs_last_barrier; ++b) { 5496 balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived; 5497 KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG); 5498 #if USE_DEBUGGER 5499 balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived; 5500 #endif 5501 } 5502 } 5503 } 5504 #endif // KMP_NESTED_HOT_TEAMS 5505 5506 /* reallocate space for arguments if necessary */ 5507 __kmp_alloc_argv_entries(argc, team, TRUE); 5508 KMP_CHECK_UPDATE(team->t.t_argc, argc); 5509 // The hot team re-uses the previous task team, 5510 // if untouched during the previous release->gather phase. 5511 5512 KF_TRACE(10, (" hot_team = %p\n", team)); 5513 5514 #if KMP_DEBUG 5515 if (__kmp_tasking_mode != tskm_immediate_exec) { 5516 KA_TRACE(20, ("__kmp_allocate_team: hot team task_team[0] = %p " 5517 "task_team[1] = %p after reinit\n", 5518 team->t.t_task_team[0], team->t.t_task_team[1])); 5519 } 5520 #endif 5521 5522 #if OMPT_SUPPORT 5523 __ompt_team_assign_id(team, ompt_parallel_data); 5524 #endif 5525 5526 KMP_MB(); 5527 5528 return team; 5529 } 5530 5531 /* next, let's try to take one from the team pool */ 5532 KMP_MB(); 5533 for (team = CCAST(kmp_team_t *, __kmp_team_pool); (team);) { 5534 /* TODO: consider resizing undersized teams instead of reaping them, now 5535 that we have a resizing mechanism */ 5536 if (team->t.t_max_nproc >= max_nproc) { 5537 /* take this team from the team pool */ 5538 __kmp_team_pool = team->t.t_next_pool; 5539 5540 if (max_nproc > 1 && 5541 __kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) { 5542 if (!team->t.b) { // Allocate barrier structure 5543 team->t.b = distributedBarrier::allocate(__kmp_dflt_team_nth_ub); 5544 } 5545 } 5546 5547 /* setup the team for fresh use */ 5548 __kmp_initialize_team(team, new_nproc, new_icvs, NULL); 5549 5550 KA_TRACE(20, ("__kmp_allocate_team: setting task_team[0] %p and " 5551 "task_team[1] %p to NULL\n", 5552 &team->t.t_task_team[0], &team->t.t_task_team[1])); 5553 team->t.t_task_team[0] = NULL; 5554 team->t.t_task_team[1] = NULL; 5555 5556 /* reallocate space for arguments if necessary */ 5557 __kmp_alloc_argv_entries(argc, team, TRUE); 5558 KMP_CHECK_UPDATE(team->t.t_argc, argc); 5559 5560 KA_TRACE( 5561 20, ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n", 5562 team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE)); 5563 { // Initialize barrier data. 5564 int b; 5565 for (b = 0; b < bs_last_barrier; ++b) { 5566 team->t.t_bar[b].b_arrived = KMP_INIT_BARRIER_STATE; 5567 #if USE_DEBUGGER 5568 team->t.t_bar[b].b_master_arrived = 0; 5569 team->t.t_bar[b].b_team_arrived = 0; 5570 #endif 5571 } 5572 } 5573 5574 team->t.t_proc_bind = new_proc_bind; 5575 5576 KA_TRACE(20, ("__kmp_allocate_team: using team from pool %d.\n", 5577 team->t.t_id)); 5578 5579 #if OMPT_SUPPORT 5580 __ompt_team_assign_id(team, ompt_parallel_data); 5581 #endif 5582 5583 team->t.t_nested_nth = NULL; 5584 5585 KMP_MB(); 5586 5587 return team; 5588 } 5589 5590 /* reap team if it is too small, then loop back and check the next one */ 5591 // not sure if this is wise, but, will be redone during the hot-teams 5592 // rewrite. 5593 /* TODO: Use technique to find the right size hot-team, don't reap them */ 5594 team = __kmp_reap_team(team); 5595 __kmp_team_pool = team; 5596 } 5597 5598 /* nothing available in the pool, no matter, make a new team! */ 5599 KMP_MB(); 5600 team = (kmp_team_t *)__kmp_allocate(sizeof(kmp_team_t)); 5601 5602 /* and set it up */ 5603 team->t.t_max_nproc = max_nproc; 5604 if (max_nproc > 1 && 5605 __kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) { 5606 // Allocate barrier structure 5607 team->t.b = distributedBarrier::allocate(__kmp_dflt_team_nth_ub); 5608 } 5609 5610 /* NOTE well, for some reason allocating one big buffer and dividing it up 5611 seems to really hurt performance a lot on the P4, so, let's not use this */ 5612 __kmp_allocate_team_arrays(team, max_nproc); 5613 5614 KA_TRACE(20, ("__kmp_allocate_team: making a new team\n")); 5615 __kmp_initialize_team(team, new_nproc, new_icvs, NULL); 5616 5617 KA_TRACE(20, ("__kmp_allocate_team: setting task_team[0] %p and task_team[1] " 5618 "%p to NULL\n", 5619 &team->t.t_task_team[0], &team->t.t_task_team[1])); 5620 team->t.t_task_team[0] = NULL; // to be removed, as __kmp_allocate zeroes 5621 // memory, no need to duplicate 5622 team->t.t_task_team[1] = NULL; // to be removed, as __kmp_allocate zeroes 5623 // memory, no need to duplicate 5624 5625 if (__kmp_storage_map) { 5626 __kmp_print_team_storage_map("team", team, team->t.t_id, new_nproc); 5627 } 5628 5629 /* allocate space for arguments */ 5630 __kmp_alloc_argv_entries(argc, team, FALSE); 5631 team->t.t_argc = argc; 5632 5633 KA_TRACE(20, 5634 ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n", 5635 team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE)); 5636 { // Initialize barrier data. 5637 int b; 5638 for (b = 0; b < bs_last_barrier; ++b) { 5639 team->t.t_bar[b].b_arrived = KMP_INIT_BARRIER_STATE; 5640 #if USE_DEBUGGER 5641 team->t.t_bar[b].b_master_arrived = 0; 5642 team->t.t_bar[b].b_team_arrived = 0; 5643 #endif 5644 } 5645 } 5646 5647 team->t.t_proc_bind = new_proc_bind; 5648 5649 #if OMPT_SUPPORT 5650 __ompt_team_assign_id(team, ompt_parallel_data); 5651 team->t.ompt_serialized_team_info = NULL; 5652 #endif 5653 5654 KMP_MB(); 5655 5656 team->t.t_nested_nth = NULL; 5657 5658 KA_TRACE(20, ("__kmp_allocate_team: done creating a new team %d.\n", 5659 team->t.t_id)); 5660 5661 return team; 5662 } 5663 5664 /* TODO implement hot-teams at all levels */ 5665 /* TODO implement lazy thread release on demand (disband request) */ 5666 5667 /* free the team. return it to the team pool. release all the threads 5668 * associated with it */ 5669 void __kmp_free_team(kmp_root_t *root, 5670 kmp_team_t *team USE_NESTED_HOT_ARG(kmp_info_t *master)) { 5671 int f; 5672 KA_TRACE(20, ("__kmp_free_team: T#%d freeing team %d\n", __kmp_get_gtid(), 5673 team->t.t_id)); 5674 5675 /* verify state */ 5676 KMP_DEBUG_ASSERT(root); 5677 KMP_DEBUG_ASSERT(team); 5678 KMP_DEBUG_ASSERT(team->t.t_nproc <= team->t.t_max_nproc); 5679 KMP_DEBUG_ASSERT(team->t.t_threads); 5680 5681 int use_hot_team = team == root->r.r_hot_team; 5682 #if KMP_NESTED_HOT_TEAMS 5683 int level; 5684 if (master) { 5685 level = team->t.t_active_level - 1; 5686 if (master->th.th_teams_microtask) { // in teams construct? 5687 if (master->th.th_teams_size.nteams > 1) { 5688 ++level; // level was not increased in teams construct for 5689 // team_of_masters 5690 } 5691 if (team->t.t_pkfn != (microtask_t)__kmp_teams_master && 5692 master->th.th_teams_level == team->t.t_level) { 5693 ++level; // level was not increased in teams construct for 5694 // team_of_workers before the parallel 5695 } // team->t.t_level will be increased inside parallel 5696 } 5697 #if KMP_DEBUG 5698 kmp_hot_team_ptr_t *hot_teams = master->th.th_hot_teams; 5699 #endif 5700 if (level < __kmp_hot_teams_max_level) { 5701 KMP_DEBUG_ASSERT(team == hot_teams[level].hot_team); 5702 use_hot_team = 1; 5703 } 5704 } 5705 #endif // KMP_NESTED_HOT_TEAMS 5706 5707 /* team is done working */ 5708 TCW_SYNC_PTR(team->t.t_pkfn, 5709 NULL); // Important for Debugging Support Library. 5710 #if KMP_OS_WINDOWS 5711 team->t.t_copyin_counter = 0; // init counter for possible reuse 5712 #endif 5713 // Do not reset pointer to parent team to NULL for hot teams. 5714 5715 /* if we are non-hot team, release our threads */ 5716 if (!use_hot_team) { 5717 if (__kmp_tasking_mode != tskm_immediate_exec) { 5718 // Wait for threads to reach reapable state 5719 for (f = 1; f < team->t.t_nproc; ++f) { 5720 KMP_DEBUG_ASSERT(team->t.t_threads[f]); 5721 kmp_info_t *th = team->t.t_threads[f]; 5722 volatile kmp_uint32 *state = &th->th.th_reap_state; 5723 while (*state != KMP_SAFE_TO_REAP) { 5724 #if KMP_OS_WINDOWS 5725 // On Windows a thread can be killed at any time, check this 5726 DWORD ecode; 5727 if (!__kmp_is_thread_alive(th, &ecode)) { 5728 *state = KMP_SAFE_TO_REAP; // reset the flag for dead thread 5729 break; 5730 } 5731 #endif 5732 // first check if thread is sleeping 5733 if (th->th.th_sleep_loc) 5734 __kmp_null_resume_wrapper(th); 5735 KMP_CPU_PAUSE(); 5736 } 5737 } 5738 5739 // Delete task teams 5740 int tt_idx; 5741 for (tt_idx = 0; tt_idx < 2; ++tt_idx) { 5742 kmp_task_team_t *task_team = team->t.t_task_team[tt_idx]; 5743 if (task_team != NULL) { 5744 for (f = 0; f < team->t.t_nproc; ++f) { // threads unref task teams 5745 KMP_DEBUG_ASSERT(team->t.t_threads[f]); 5746 team->t.t_threads[f]->th.th_task_team = NULL; 5747 } 5748 KA_TRACE( 5749 20, 5750 ("__kmp_free_team: T#%d deactivating task_team %p on team %d\n", 5751 __kmp_get_gtid(), task_team, team->t.t_id)); 5752 #if KMP_NESTED_HOT_TEAMS 5753 __kmp_free_task_team(master, task_team); 5754 #endif 5755 team->t.t_task_team[tt_idx] = NULL; 5756 } 5757 } 5758 } 5759 5760 // Before clearing parent pointer, check if nested_nth list should be freed 5761 if (team->t.t_nested_nth && team->t.t_nested_nth != &__kmp_nested_nth && 5762 team->t.t_nested_nth != team->t.t_parent->t.t_nested_nth) { 5763 KMP_INTERNAL_FREE(team->t.t_nested_nth->nth); 5764 KMP_INTERNAL_FREE(team->t.t_nested_nth); 5765 } 5766 team->t.t_nested_nth = NULL; 5767 5768 // Reset pointer to parent team only for non-hot teams. 5769 team->t.t_parent = NULL; 5770 team->t.t_level = 0; 5771 team->t.t_active_level = 0; 5772 5773 /* free the worker threads */ 5774 for (f = 1; f < team->t.t_nproc; ++f) { 5775 KMP_DEBUG_ASSERT(team->t.t_threads[f]); 5776 if (__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) { 5777 (void)KMP_COMPARE_AND_STORE_ACQ32( 5778 &(team->t.t_threads[f]->th.th_used_in_team), 1, 2); 5779 } 5780 __kmp_free_thread(team->t.t_threads[f]); 5781 } 5782 5783 if (__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) { 5784 if (team->t.b) { 5785 // wake up thread at old location 5786 team->t.b->go_release(); 5787 if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) { 5788 for (f = 1; f < team->t.t_nproc; ++f) { 5789 if (team->t.b->sleep[f].sleep) { 5790 __kmp_atomic_resume_64( 5791 team->t.t_threads[f]->th.th_info.ds.ds_gtid, 5792 (kmp_atomic_flag_64<> *)NULL); 5793 } 5794 } 5795 } 5796 // Wait for threads to be removed from team 5797 for (int f = 1; f < team->t.t_nproc; ++f) { 5798 while (team->t.t_threads[f]->th.th_used_in_team.load() != 0) 5799 KMP_CPU_PAUSE(); 5800 } 5801 } 5802 } 5803 5804 for (f = 1; f < team->t.t_nproc; ++f) { 5805 team->t.t_threads[f] = NULL; 5806 } 5807 5808 if (team->t.t_max_nproc > 1 && 5809 __kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) { 5810 distributedBarrier::deallocate(team->t.b); 5811 team->t.b = NULL; 5812 } 5813 /* put the team back in the team pool */ 5814 /* TODO limit size of team pool, call reap_team if pool too large */ 5815 team->t.t_next_pool = CCAST(kmp_team_t *, __kmp_team_pool); 5816 __kmp_team_pool = (volatile kmp_team_t *)team; 5817 } else { // Check if team was created for primary threads in teams construct 5818 // See if first worker is a CG root 5819 KMP_DEBUG_ASSERT(team->t.t_threads[1] && 5820 team->t.t_threads[1]->th.th_cg_roots); 5821 if (team->t.t_threads[1]->th.th_cg_roots->cg_root == team->t.t_threads[1]) { 5822 // Clean up the CG root nodes on workers so that this team can be re-used 5823 for (f = 1; f < team->t.t_nproc; ++f) { 5824 kmp_info_t *thr = team->t.t_threads[f]; 5825 KMP_DEBUG_ASSERT(thr && thr->th.th_cg_roots && 5826 thr->th.th_cg_roots->cg_root == thr); 5827 // Pop current CG root off list 5828 kmp_cg_root_t *tmp = thr->th.th_cg_roots; 5829 thr->th.th_cg_roots = tmp->up; 5830 KA_TRACE(100, ("__kmp_free_team: Thread %p popping node %p and moving" 5831 " up to node %p. cg_nthreads was %d\n", 5832 thr, tmp, thr->th.th_cg_roots, tmp->cg_nthreads)); 5833 int i = tmp->cg_nthreads--; 5834 if (i == 1) { 5835 __kmp_free(tmp); // free CG if we are the last thread in it 5836 } 5837 // Restore current task's thread_limit from CG root 5838 if (thr->th.th_cg_roots) 5839 thr->th.th_current_task->td_icvs.thread_limit = 5840 thr->th.th_cg_roots->cg_thread_limit; 5841 } 5842 } 5843 } 5844 5845 KMP_MB(); 5846 } 5847 5848 /* reap the team. destroy it, reclaim all its resources and free its memory */ 5849 kmp_team_t *__kmp_reap_team(kmp_team_t *team) { 5850 kmp_team_t *next_pool = team->t.t_next_pool; 5851 5852 KMP_DEBUG_ASSERT(team); 5853 KMP_DEBUG_ASSERT(team->t.t_dispatch); 5854 KMP_DEBUG_ASSERT(team->t.t_disp_buffer); 5855 KMP_DEBUG_ASSERT(team->t.t_threads); 5856 KMP_DEBUG_ASSERT(team->t.t_argv); 5857 5858 /* TODO clean the threads that are a part of this? */ 5859 5860 /* free stuff */ 5861 __kmp_free_team_arrays(team); 5862 if (team->t.t_argv != &team->t.t_inline_argv[0]) 5863 __kmp_free((void *)team->t.t_argv); 5864 __kmp_free(team); 5865 5866 KMP_MB(); 5867 return next_pool; 5868 } 5869 5870 // Free the thread. Don't reap it, just place it on the pool of available 5871 // threads. 5872 // 5873 // Changes for Quad issue 527845: We need a predictable OMP tid <-> gtid 5874 // binding for the affinity mechanism to be useful. 5875 // 5876 // Now, we always keep the free list (__kmp_thread_pool) sorted by gtid. 5877 // However, we want to avoid a potential performance problem by always 5878 // scanning through the list to find the correct point at which to insert 5879 // the thread (potential N**2 behavior). To do this we keep track of the 5880 // last place a thread struct was inserted (__kmp_thread_pool_insert_pt). 5881 // With single-level parallelism, threads will always be added to the tail 5882 // of the list, kept track of by __kmp_thread_pool_insert_pt. With nested 5883 // parallelism, all bets are off and we may need to scan through the entire 5884 // free list. 5885 // 5886 // This change also has a potentially large performance benefit, for some 5887 // applications. Previously, as threads were freed from the hot team, they 5888 // would be placed back on the free list in inverse order. If the hot team 5889 // grew back to it's original size, then the freed thread would be placed 5890 // back on the hot team in reverse order. This could cause bad cache 5891 // locality problems on programs where the size of the hot team regularly 5892 // grew and shrunk. 5893 // 5894 // Now, for single-level parallelism, the OMP tid is always == gtid. 5895 void __kmp_free_thread(kmp_info_t *this_th) { 5896 int gtid; 5897 kmp_info_t **scan; 5898 5899 KA_TRACE(20, ("__kmp_free_thread: T#%d putting T#%d back on free pool.\n", 5900 __kmp_get_gtid(), this_th->th.th_info.ds.ds_gtid)); 5901 5902 KMP_DEBUG_ASSERT(this_th); 5903 5904 // When moving thread to pool, switch thread to wait on own b_go flag, and 5905 // uninitialized (NULL team). 5906 int b; 5907 kmp_balign_t *balign = this_th->th.th_bar; 5908 for (b = 0; b < bs_last_barrier; ++b) { 5909 if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG) 5910 balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG; 5911 balign[b].bb.team = NULL; 5912 balign[b].bb.leaf_kids = 0; 5913 } 5914 this_th->th.th_task_state = 0; 5915 this_th->th.th_reap_state = KMP_SAFE_TO_REAP; 5916 5917 /* put thread back on the free pool */ 5918 TCW_PTR(this_th->th.th_team, NULL); 5919 TCW_PTR(this_th->th.th_root, NULL); 5920 TCW_PTR(this_th->th.th_dispatch, NULL); /* NOT NEEDED */ 5921 5922 while (this_th->th.th_cg_roots) { 5923 this_th->th.th_cg_roots->cg_nthreads--; 5924 KA_TRACE(100, ("__kmp_free_thread: Thread %p decrement cg_nthreads on node" 5925 " %p of thread %p to %d\n", 5926 this_th, this_th->th.th_cg_roots, 5927 this_th->th.th_cg_roots->cg_root, 5928 this_th->th.th_cg_roots->cg_nthreads)); 5929 kmp_cg_root_t *tmp = this_th->th.th_cg_roots; 5930 if (tmp->cg_root == this_th) { // Thread is a cg_root 5931 KMP_DEBUG_ASSERT(tmp->cg_nthreads == 0); 5932 KA_TRACE( 5933 5, ("__kmp_free_thread: Thread %p freeing node %p\n", this_th, tmp)); 5934 this_th->th.th_cg_roots = tmp->up; 5935 __kmp_free(tmp); 5936 } else { // Worker thread 5937 if (tmp->cg_nthreads == 0) { // last thread leaves contention group 5938 __kmp_free(tmp); 5939 } 5940 this_th->th.th_cg_roots = NULL; 5941 break; 5942 } 5943 } 5944 5945 /* If the implicit task assigned to this thread can be used by other threads 5946 * -> multiple threads can share the data and try to free the task at 5947 * __kmp_reap_thread at exit. This duplicate use of the task data can happen 5948 * with higher probability when hot team is disabled but can occurs even when 5949 * the hot team is enabled */ 5950 __kmp_free_implicit_task(this_th); 5951 this_th->th.th_current_task = NULL; 5952 5953 // If the __kmp_thread_pool_insert_pt is already past the new insert 5954 // point, then we need to re-scan the entire list. 5955 gtid = this_th->th.th_info.ds.ds_gtid; 5956 if (__kmp_thread_pool_insert_pt != NULL) { 5957 KMP_DEBUG_ASSERT(__kmp_thread_pool != NULL); 5958 if (__kmp_thread_pool_insert_pt->th.th_info.ds.ds_gtid > gtid) { 5959 __kmp_thread_pool_insert_pt = NULL; 5960 } 5961 } 5962 5963 // Scan down the list to find the place to insert the thread. 5964 // scan is the address of a link in the list, possibly the address of 5965 // __kmp_thread_pool itself. 5966 // 5967 // In the absence of nested parallelism, the for loop will have 0 iterations. 5968 if (__kmp_thread_pool_insert_pt != NULL) { 5969 scan = &(__kmp_thread_pool_insert_pt->th.th_next_pool); 5970 } else { 5971 scan = CCAST(kmp_info_t **, &__kmp_thread_pool); 5972 } 5973 for (; (*scan != NULL) && ((*scan)->th.th_info.ds.ds_gtid < gtid); 5974 scan = &((*scan)->th.th_next_pool)) 5975 ; 5976 5977 // Insert the new element on the list, and set __kmp_thread_pool_insert_pt 5978 // to its address. 5979 TCW_PTR(this_th->th.th_next_pool, *scan); 5980 __kmp_thread_pool_insert_pt = *scan = this_th; 5981 KMP_DEBUG_ASSERT((this_th->th.th_next_pool == NULL) || 5982 (this_th->th.th_info.ds.ds_gtid < 5983 this_th->th.th_next_pool->th.th_info.ds.ds_gtid)); 5984 TCW_4(this_th->th.th_in_pool, TRUE); 5985 __kmp_suspend_initialize_thread(this_th); 5986 __kmp_lock_suspend_mx(this_th); 5987 if (this_th->th.th_active == TRUE) { 5988 KMP_ATOMIC_INC(&__kmp_thread_pool_active_nth); 5989 this_th->th.th_active_in_pool = TRUE; 5990 } 5991 #if KMP_DEBUG 5992 else { 5993 KMP_DEBUG_ASSERT(this_th->th.th_active_in_pool == FALSE); 5994 } 5995 #endif 5996 __kmp_unlock_suspend_mx(this_th); 5997 5998 TCW_4(__kmp_nth, __kmp_nth - 1); 5999 6000 #ifdef KMP_ADJUST_BLOCKTIME 6001 /* Adjust blocktime back to user setting or default if necessary */ 6002 /* Middle initialization might never have occurred */ 6003 if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) { 6004 KMP_DEBUG_ASSERT(__kmp_avail_proc > 0); 6005 if (__kmp_nth <= __kmp_avail_proc) { 6006 __kmp_zero_bt = FALSE; 6007 } 6008 } 6009 #endif /* KMP_ADJUST_BLOCKTIME */ 6010 6011 KMP_MB(); 6012 } 6013 6014 /* ------------------------------------------------------------------------ */ 6015 6016 void *__kmp_launch_thread(kmp_info_t *this_thr) { 6017 #if OMP_PROFILING_SUPPORT 6018 ProfileTraceFile = getenv("LIBOMPTARGET_PROFILE"); 6019 // TODO: add a configuration option for time granularity 6020 if (ProfileTraceFile) 6021 llvm::timeTraceProfilerInitialize(500 /* us */, "libomptarget"); 6022 #endif 6023 6024 int gtid = this_thr->th.th_info.ds.ds_gtid; 6025 /* void *stack_data;*/ 6026 kmp_team_t **volatile pteam; 6027 6028 KMP_MB(); 6029 KA_TRACE(10, ("__kmp_launch_thread: T#%d start\n", gtid)); 6030 6031 if (__kmp_env_consistency_check) { 6032 this_thr->th.th_cons = __kmp_allocate_cons_stack(gtid); // ATT: Memory leak? 6033 } 6034 6035 #if OMPD_SUPPORT 6036 if (ompd_state & OMPD_ENABLE_BP) 6037 ompd_bp_thread_begin(); 6038 #endif 6039 6040 #if OMPT_SUPPORT 6041 ompt_data_t *thread_data = nullptr; 6042 if (ompt_enabled.enabled) { 6043 thread_data = &(this_thr->th.ompt_thread_info.thread_data); 6044 *thread_data = ompt_data_none; 6045 6046 this_thr->th.ompt_thread_info.state = ompt_state_overhead; 6047 this_thr->th.ompt_thread_info.wait_id = 0; 6048 this_thr->th.ompt_thread_info.idle_frame = OMPT_GET_FRAME_ADDRESS(0); 6049 this_thr->th.ompt_thread_info.parallel_flags = 0; 6050 if (ompt_enabled.ompt_callback_thread_begin) { 6051 ompt_callbacks.ompt_callback(ompt_callback_thread_begin)( 6052 ompt_thread_worker, thread_data); 6053 } 6054 this_thr->th.ompt_thread_info.state = ompt_state_idle; 6055 } 6056 #endif 6057 6058 /* This is the place where threads wait for work */ 6059 while (!TCR_4(__kmp_global.g.g_done)) { 6060 KMP_DEBUG_ASSERT(this_thr == __kmp_threads[gtid]); 6061 KMP_MB(); 6062 6063 /* wait for work to do */ 6064 KA_TRACE(20, ("__kmp_launch_thread: T#%d waiting for work\n", gtid)); 6065 6066 /* No tid yet since not part of a team */ 6067 __kmp_fork_barrier(gtid, KMP_GTID_DNE); 6068 6069 #if OMPT_SUPPORT 6070 if (ompt_enabled.enabled) { 6071 this_thr->th.ompt_thread_info.state = ompt_state_overhead; 6072 } 6073 #endif 6074 6075 pteam = &this_thr->th.th_team; 6076 6077 /* have we been allocated? */ 6078 if (TCR_SYNC_PTR(*pteam) && !TCR_4(__kmp_global.g.g_done)) { 6079 /* we were just woken up, so run our new task */ 6080 if (TCR_SYNC_PTR((*pteam)->t.t_pkfn) != NULL) { 6081 int rc; 6082 KA_TRACE(20, 6083 ("__kmp_launch_thread: T#%d(%d:%d) invoke microtask = %p\n", 6084 gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid), 6085 (*pteam)->t.t_pkfn)); 6086 6087 updateHWFPControl(*pteam); 6088 6089 #if OMPT_SUPPORT 6090 if (ompt_enabled.enabled) { 6091 this_thr->th.ompt_thread_info.state = ompt_state_work_parallel; 6092 } 6093 #endif 6094 6095 rc = (*pteam)->t.t_invoke(gtid); 6096 KMP_ASSERT(rc); 6097 6098 KMP_MB(); 6099 KA_TRACE(20, ("__kmp_launch_thread: T#%d(%d:%d) done microtask = %p\n", 6100 gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid), 6101 (*pteam)->t.t_pkfn)); 6102 } 6103 #if OMPT_SUPPORT 6104 if (ompt_enabled.enabled) { 6105 /* no frame set while outside task */ 6106 __ompt_get_task_info_object(0)->frame.exit_frame = ompt_data_none; 6107 6108 this_thr->th.ompt_thread_info.state = ompt_state_overhead; 6109 } 6110 #endif 6111 /* join barrier after parallel region */ 6112 __kmp_join_barrier(gtid); 6113 } 6114 } 6115 6116 #if OMPD_SUPPORT 6117 if (ompd_state & OMPD_ENABLE_BP) 6118 ompd_bp_thread_end(); 6119 #endif 6120 6121 #if OMPT_SUPPORT 6122 if (ompt_enabled.ompt_callback_thread_end) { 6123 ompt_callbacks.ompt_callback(ompt_callback_thread_end)(thread_data); 6124 } 6125 #endif 6126 6127 this_thr->th.th_task_team = NULL; 6128 /* run the destructors for the threadprivate data for this thread */ 6129 __kmp_common_destroy_gtid(gtid); 6130 6131 KA_TRACE(10, ("__kmp_launch_thread: T#%d done\n", gtid)); 6132 KMP_MB(); 6133 6134 #if OMP_PROFILING_SUPPORT 6135 llvm::timeTraceProfilerFinishThread(); 6136 #endif 6137 return this_thr; 6138 } 6139 6140 /* ------------------------------------------------------------------------ */ 6141 6142 void __kmp_internal_end_dest(void *specific_gtid) { 6143 // Make sure no significant bits are lost 6144 int gtid; 6145 __kmp_type_convert((kmp_intptr_t)specific_gtid - 1, >id); 6146 6147 KA_TRACE(30, ("__kmp_internal_end_dest: T#%d\n", gtid)); 6148 /* NOTE: the gtid is stored as gitd+1 in the thread-local-storage 6149 * this is because 0 is reserved for the nothing-stored case */ 6150 6151 __kmp_internal_end_thread(gtid); 6152 } 6153 6154 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB 6155 6156 __attribute__((destructor)) void __kmp_internal_end_dtor(void) { 6157 __kmp_internal_end_atexit(); 6158 } 6159 6160 #endif 6161 6162 /* [Windows] josh: when the atexit handler is called, there may still be more 6163 than one thread alive */ 6164 void __kmp_internal_end_atexit(void) { 6165 KA_TRACE(30, ("__kmp_internal_end_atexit\n")); 6166 /* [Windows] 6167 josh: ideally, we want to completely shutdown the library in this atexit 6168 handler, but stat code that depends on thread specific data for gtid fails 6169 because that data becomes unavailable at some point during the shutdown, so 6170 we call __kmp_internal_end_thread instead. We should eventually remove the 6171 dependency on __kmp_get_specific_gtid in the stat code and use 6172 __kmp_internal_end_library to cleanly shutdown the library. 6173 6174 // TODO: Can some of this comment about GVS be removed? 6175 I suspect that the offending stat code is executed when the calling thread 6176 tries to clean up a dead root thread's data structures, resulting in GVS 6177 code trying to close the GVS structures for that thread, but since the stat 6178 code uses __kmp_get_specific_gtid to get the gtid with the assumption that 6179 the calling thread is cleaning up itself instead of another thread, it get 6180 confused. This happens because allowing a thread to unregister and cleanup 6181 another thread is a recent modification for addressing an issue. 6182 Based on the current design (20050722), a thread may end up 6183 trying to unregister another thread only if thread death does not trigger 6184 the calling of __kmp_internal_end_thread. For Linux* OS, there is the 6185 thread specific data destructor function to detect thread death. For 6186 Windows dynamic, there is DllMain(THREAD_DETACH). For Windows static, there 6187 is nothing. Thus, the workaround is applicable only for Windows static 6188 stat library. */ 6189 __kmp_internal_end_library(-1); 6190 #if KMP_OS_WINDOWS 6191 __kmp_close_console(); 6192 #endif 6193 } 6194 6195 static void __kmp_reap_thread(kmp_info_t *thread, int is_root) { 6196 // It is assumed __kmp_forkjoin_lock is acquired. 6197 6198 int gtid; 6199 6200 KMP_DEBUG_ASSERT(thread != NULL); 6201 6202 gtid = thread->th.th_info.ds.ds_gtid; 6203 6204 if (!is_root) { 6205 if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) { 6206 /* Assume the threads are at the fork barrier here */ 6207 KA_TRACE( 6208 20, ("__kmp_reap_thread: releasing T#%d from fork barrier for reap\n", 6209 gtid)); 6210 if (__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) { 6211 while ( 6212 !KMP_COMPARE_AND_STORE_ACQ32(&(thread->th.th_used_in_team), 0, 3)) 6213 KMP_CPU_PAUSE(); 6214 __kmp_resume_32(gtid, (kmp_flag_32<false, false> *)NULL); 6215 } else { 6216 /* Need release fence here to prevent seg faults for tree forkjoin 6217 barrier (GEH) */ 6218 kmp_flag_64<> flag(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go, 6219 thread); 6220 __kmp_release_64(&flag); 6221 } 6222 } 6223 6224 // Terminate OS thread. 6225 __kmp_reap_worker(thread); 6226 6227 // The thread was killed asynchronously. If it was actively 6228 // spinning in the thread pool, decrement the global count. 6229 // 6230 // There is a small timing hole here - if the worker thread was just waking 6231 // up after sleeping in the pool, had reset it's th_active_in_pool flag but 6232 // not decremented the global counter __kmp_thread_pool_active_nth yet, then 6233 // the global counter might not get updated. 6234 // 6235 // Currently, this can only happen as the library is unloaded, 6236 // so there are no harmful side effects. 6237 if (thread->th.th_active_in_pool) { 6238 thread->th.th_active_in_pool = FALSE; 6239 KMP_ATOMIC_DEC(&__kmp_thread_pool_active_nth); 6240 KMP_DEBUG_ASSERT(__kmp_thread_pool_active_nth >= 0); 6241 } 6242 } 6243 6244 __kmp_free_implicit_task(thread); 6245 6246 // Free the fast memory for tasking 6247 #if USE_FAST_MEMORY 6248 __kmp_free_fast_memory(thread); 6249 #endif /* USE_FAST_MEMORY */ 6250 6251 __kmp_suspend_uninitialize_thread(thread); 6252 6253 KMP_DEBUG_ASSERT(__kmp_threads[gtid] == thread); 6254 TCW_SYNC_PTR(__kmp_threads[gtid], NULL); 6255 6256 --__kmp_all_nth; 6257 // __kmp_nth was decremented when thread is added to the pool. 6258 6259 #ifdef KMP_ADJUST_BLOCKTIME 6260 /* Adjust blocktime back to user setting or default if necessary */ 6261 /* Middle initialization might never have occurred */ 6262 if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) { 6263 KMP_DEBUG_ASSERT(__kmp_avail_proc > 0); 6264 if (__kmp_nth <= __kmp_avail_proc) { 6265 __kmp_zero_bt = FALSE; 6266 } 6267 } 6268 #endif /* KMP_ADJUST_BLOCKTIME */ 6269 6270 /* free the memory being used */ 6271 if (__kmp_env_consistency_check) { 6272 if (thread->th.th_cons) { 6273 __kmp_free_cons_stack(thread->th.th_cons); 6274 thread->th.th_cons = NULL; 6275 } 6276 } 6277 6278 if (thread->th.th_pri_common != NULL) { 6279 __kmp_free(thread->th.th_pri_common); 6280 thread->th.th_pri_common = NULL; 6281 } 6282 6283 #if KMP_USE_BGET 6284 if (thread->th.th_local.bget_data != NULL) { 6285 __kmp_finalize_bget(thread); 6286 } 6287 #endif 6288 6289 #if KMP_AFFINITY_SUPPORTED 6290 if (thread->th.th_affin_mask != NULL) { 6291 KMP_CPU_FREE(thread->th.th_affin_mask); 6292 thread->th.th_affin_mask = NULL; 6293 } 6294 #endif /* KMP_AFFINITY_SUPPORTED */ 6295 6296 #if KMP_USE_HIER_SCHED 6297 if (thread->th.th_hier_bar_data != NULL) { 6298 __kmp_free(thread->th.th_hier_bar_data); 6299 thread->th.th_hier_bar_data = NULL; 6300 } 6301 #endif 6302 6303 __kmp_reap_team(thread->th.th_serial_team); 6304 thread->th.th_serial_team = NULL; 6305 __kmp_free(thread); 6306 6307 KMP_MB(); 6308 6309 } // __kmp_reap_thread 6310 6311 static void __kmp_itthash_clean(kmp_info_t *th) { 6312 #if USE_ITT_NOTIFY 6313 if (__kmp_itt_region_domains.count > 0) { 6314 for (int i = 0; i < KMP_MAX_FRAME_DOMAINS; ++i) { 6315 kmp_itthash_entry_t *bucket = __kmp_itt_region_domains.buckets[i]; 6316 while (bucket) { 6317 kmp_itthash_entry_t *next = bucket->next_in_bucket; 6318 __kmp_thread_free(th, bucket); 6319 bucket = next; 6320 } 6321 } 6322 } 6323 if (__kmp_itt_barrier_domains.count > 0) { 6324 for (int i = 0; i < KMP_MAX_FRAME_DOMAINS; ++i) { 6325 kmp_itthash_entry_t *bucket = __kmp_itt_barrier_domains.buckets[i]; 6326 while (bucket) { 6327 kmp_itthash_entry_t *next = bucket->next_in_bucket; 6328 __kmp_thread_free(th, bucket); 6329 bucket = next; 6330 } 6331 } 6332 } 6333 #endif 6334 } 6335 6336 static void __kmp_internal_end(void) { 6337 int i; 6338 6339 /* First, unregister the library */ 6340 __kmp_unregister_library(); 6341 6342 #if KMP_OS_WINDOWS 6343 /* In Win static library, we can't tell when a root actually dies, so we 6344 reclaim the data structures for any root threads that have died but not 6345 unregistered themselves, in order to shut down cleanly. 6346 In Win dynamic library we also can't tell when a thread dies. */ 6347 __kmp_reclaim_dead_roots(); // AC: moved here to always clean resources of 6348 // dead roots 6349 #endif 6350 6351 for (i = 0; i < __kmp_threads_capacity; i++) 6352 if (__kmp_root[i]) 6353 if (__kmp_root[i]->r.r_active) 6354 break; 6355 KMP_MB(); /* Flush all pending memory write invalidates. */ 6356 TCW_SYNC_4(__kmp_global.g.g_done, TRUE); 6357 6358 if (i < __kmp_threads_capacity) { 6359 #if KMP_USE_MONITOR 6360 // 2009-09-08 (lev): Other alive roots found. Why do we kill the monitor?? 6361 KMP_MB(); /* Flush all pending memory write invalidates. */ 6362 6363 // Need to check that monitor was initialized before reaping it. If we are 6364 // called form __kmp_atfork_child (which sets __kmp_init_parallel = 0), then 6365 // __kmp_monitor will appear to contain valid data, but it is only valid in 6366 // the parent process, not the child. 6367 // New behavior (201008): instead of keying off of the flag 6368 // __kmp_init_parallel, the monitor thread creation is keyed off 6369 // of the new flag __kmp_init_monitor. 6370 __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock); 6371 if (TCR_4(__kmp_init_monitor)) { 6372 __kmp_reap_monitor(&__kmp_monitor); 6373 TCW_4(__kmp_init_monitor, 0); 6374 } 6375 __kmp_release_bootstrap_lock(&__kmp_monitor_lock); 6376 KA_TRACE(10, ("__kmp_internal_end: monitor reaped\n")); 6377 #endif // KMP_USE_MONITOR 6378 } else { 6379 /* TODO move this to cleanup code */ 6380 #ifdef KMP_DEBUG 6381 /* make sure that everything has properly ended */ 6382 for (i = 0; i < __kmp_threads_capacity; i++) { 6383 if (__kmp_root[i]) { 6384 // KMP_ASSERT( ! KMP_UBER_GTID( i ) ); // AC: 6385 // there can be uber threads alive here 6386 KMP_ASSERT(!__kmp_root[i]->r.r_active); // TODO: can they be active? 6387 } 6388 } 6389 #endif 6390 6391 KMP_MB(); 6392 6393 // Reap the worker threads. 6394 // This is valid for now, but be careful if threads are reaped sooner. 6395 while (__kmp_thread_pool != NULL) { // Loop thru all the thread in the pool. 6396 // Get the next thread from the pool. 6397 kmp_info_t *thread = CCAST(kmp_info_t *, __kmp_thread_pool); 6398 __kmp_thread_pool = thread->th.th_next_pool; 6399 // Reap it. 6400 KMP_DEBUG_ASSERT(thread->th.th_reap_state == KMP_SAFE_TO_REAP); 6401 thread->th.th_next_pool = NULL; 6402 thread->th.th_in_pool = FALSE; 6403 __kmp_reap_thread(thread, 0); 6404 } 6405 __kmp_thread_pool_insert_pt = NULL; 6406 6407 // Reap teams. 6408 while (__kmp_team_pool != NULL) { // Loop thru all the teams in the pool. 6409 // Get the next team from the pool. 6410 kmp_team_t *team = CCAST(kmp_team_t *, __kmp_team_pool); 6411 __kmp_team_pool = team->t.t_next_pool; 6412 // Reap it. 6413 team->t.t_next_pool = NULL; 6414 __kmp_reap_team(team); 6415 } 6416 6417 __kmp_reap_task_teams(); 6418 6419 #if KMP_OS_UNIX 6420 // Threads that are not reaped should not access any resources since they 6421 // are going to be deallocated soon, so the shutdown sequence should wait 6422 // until all threads either exit the final spin-waiting loop or begin 6423 // sleeping after the given blocktime. 6424 for (i = 0; i < __kmp_threads_capacity; i++) { 6425 kmp_info_t *thr = __kmp_threads[i]; 6426 while (thr && KMP_ATOMIC_LD_ACQ(&thr->th.th_blocking)) 6427 KMP_CPU_PAUSE(); 6428 } 6429 #endif 6430 6431 for (i = 0; i < __kmp_threads_capacity; ++i) { 6432 // TBD: Add some checking... 6433 // Something like KMP_DEBUG_ASSERT( __kmp_thread[ i ] == NULL ); 6434 } 6435 6436 /* Make sure all threadprivate destructors get run by joining with all 6437 worker threads before resetting this flag */ 6438 TCW_SYNC_4(__kmp_init_common, FALSE); 6439 6440 KA_TRACE(10, ("__kmp_internal_end: all workers reaped\n")); 6441 KMP_MB(); 6442 6443 #if KMP_USE_MONITOR 6444 // See note above: One of the possible fixes for CQ138434 / CQ140126 6445 // 6446 // FIXME: push both code fragments down and CSE them? 6447 // push them into __kmp_cleanup() ? 6448 __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock); 6449 if (TCR_4(__kmp_init_monitor)) { 6450 __kmp_reap_monitor(&__kmp_monitor); 6451 TCW_4(__kmp_init_monitor, 0); 6452 } 6453 __kmp_release_bootstrap_lock(&__kmp_monitor_lock); 6454 KA_TRACE(10, ("__kmp_internal_end: monitor reaped\n")); 6455 #endif 6456 } /* else !__kmp_global.t_active */ 6457 TCW_4(__kmp_init_gtid, FALSE); 6458 KMP_MB(); /* Flush all pending memory write invalidates. */ 6459 6460 __kmp_cleanup(); 6461 #if OMPT_SUPPORT 6462 ompt_fini(); 6463 #endif 6464 } 6465 6466 void __kmp_internal_end_library(int gtid_req) { 6467 /* if we have already cleaned up, don't try again, it wouldn't be pretty */ 6468 /* this shouldn't be a race condition because __kmp_internal_end() is the 6469 only place to clear __kmp_serial_init */ 6470 /* we'll check this later too, after we get the lock */ 6471 // 2009-09-06: We do not set g_abort without setting g_done. This check looks 6472 // redundant, because the next check will work in any case. 6473 if (__kmp_global.g.g_abort) { 6474 KA_TRACE(11, ("__kmp_internal_end_library: abort, exiting\n")); 6475 /* TODO abort? */ 6476 return; 6477 } 6478 if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) { 6479 KA_TRACE(10, ("__kmp_internal_end_library: already finished\n")); 6480 return; 6481 } 6482 6483 // If hidden helper team has been initialized, we need to deinit it 6484 if (TCR_4(__kmp_init_hidden_helper) && 6485 !TCR_4(__kmp_hidden_helper_team_done)) { 6486 TCW_SYNC_4(__kmp_hidden_helper_team_done, TRUE); 6487 // First release the main thread to let it continue its work 6488 __kmp_hidden_helper_main_thread_release(); 6489 // Wait until the hidden helper team has been destroyed 6490 __kmp_hidden_helper_threads_deinitz_wait(); 6491 } 6492 6493 KMP_MB(); /* Flush all pending memory write invalidates. */ 6494 /* find out who we are and what we should do */ 6495 { 6496 int gtid = (gtid_req >= 0) ? gtid_req : __kmp_gtid_get_specific(); 6497 KA_TRACE( 6498 10, ("__kmp_internal_end_library: enter T#%d (%d)\n", gtid, gtid_req)); 6499 if (gtid == KMP_GTID_SHUTDOWN) { 6500 KA_TRACE(10, ("__kmp_internal_end_library: !__kmp_init_runtime, system " 6501 "already shutdown\n")); 6502 return; 6503 } else if (gtid == KMP_GTID_MONITOR) { 6504 KA_TRACE(10, ("__kmp_internal_end_library: monitor thread, gtid not " 6505 "registered, or system shutdown\n")); 6506 return; 6507 } else if (gtid == KMP_GTID_DNE) { 6508 KA_TRACE(10, ("__kmp_internal_end_library: gtid not registered or system " 6509 "shutdown\n")); 6510 /* we don't know who we are, but we may still shutdown the library */ 6511 } else if (KMP_UBER_GTID(gtid)) { 6512 /* unregister ourselves as an uber thread. gtid is no longer valid */ 6513 if (__kmp_root[gtid]->r.r_active) { 6514 __kmp_global.g.g_abort = -1; 6515 TCW_SYNC_4(__kmp_global.g.g_done, TRUE); 6516 __kmp_unregister_library(); 6517 KA_TRACE(10, 6518 ("__kmp_internal_end_library: root still active, abort T#%d\n", 6519 gtid)); 6520 return; 6521 } else { 6522 __kmp_itthash_clean(__kmp_threads[gtid]); 6523 KA_TRACE( 6524 10, 6525 ("__kmp_internal_end_library: unregistering sibling T#%d\n", gtid)); 6526 __kmp_unregister_root_current_thread(gtid); 6527 } 6528 } else { 6529 /* worker threads may call this function through the atexit handler, if they 6530 * call exit() */ 6531 /* For now, skip the usual subsequent processing and just dump the debug buffer. 6532 TODO: do a thorough shutdown instead */ 6533 #ifdef DUMP_DEBUG_ON_EXIT 6534 if (__kmp_debug_buf) 6535 __kmp_dump_debug_buffer(); 6536 #endif 6537 // added unregister library call here when we switch to shm linux 6538 // if we don't, it will leave lots of files in /dev/shm 6539 // cleanup shared memory file before exiting. 6540 __kmp_unregister_library(); 6541 return; 6542 } 6543 } 6544 /* synchronize the termination process */ 6545 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock); 6546 6547 /* have we already finished */ 6548 if (__kmp_global.g.g_abort) { 6549 KA_TRACE(10, ("__kmp_internal_end_library: abort, exiting\n")); 6550 /* TODO abort? */ 6551 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 6552 return; 6553 } 6554 if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) { 6555 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 6556 return; 6557 } 6558 6559 /* We need this lock to enforce mutex between this reading of 6560 __kmp_threads_capacity and the writing by __kmp_register_root. 6561 Alternatively, we can use a counter of roots that is atomically updated by 6562 __kmp_get_global_thread_id_reg, __kmp_do_serial_initialize and 6563 __kmp_internal_end_*. */ 6564 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock); 6565 6566 /* now we can safely conduct the actual termination */ 6567 __kmp_internal_end(); 6568 6569 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); 6570 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 6571 6572 KA_TRACE(10, ("__kmp_internal_end_library: exit\n")); 6573 6574 #ifdef DUMP_DEBUG_ON_EXIT 6575 if (__kmp_debug_buf) 6576 __kmp_dump_debug_buffer(); 6577 #endif 6578 6579 #if KMP_OS_WINDOWS 6580 __kmp_close_console(); 6581 #endif 6582 6583 __kmp_fini_allocator(); 6584 6585 } // __kmp_internal_end_library 6586 6587 void __kmp_internal_end_thread(int gtid_req) { 6588 int i; 6589 6590 /* if we have already cleaned up, don't try again, it wouldn't be pretty */ 6591 /* this shouldn't be a race condition because __kmp_internal_end() is the 6592 * only place to clear __kmp_serial_init */ 6593 /* we'll check this later too, after we get the lock */ 6594 // 2009-09-06: We do not set g_abort without setting g_done. This check looks 6595 // redundant, because the next check will work in any case. 6596 if (__kmp_global.g.g_abort) { 6597 KA_TRACE(11, ("__kmp_internal_end_thread: abort, exiting\n")); 6598 /* TODO abort? */ 6599 return; 6600 } 6601 if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) { 6602 KA_TRACE(10, ("__kmp_internal_end_thread: already finished\n")); 6603 return; 6604 } 6605 6606 // If hidden helper team has been initialized, we need to deinit it 6607 if (TCR_4(__kmp_init_hidden_helper) && 6608 !TCR_4(__kmp_hidden_helper_team_done)) { 6609 TCW_SYNC_4(__kmp_hidden_helper_team_done, TRUE); 6610 // First release the main thread to let it continue its work 6611 __kmp_hidden_helper_main_thread_release(); 6612 // Wait until the hidden helper team has been destroyed 6613 __kmp_hidden_helper_threads_deinitz_wait(); 6614 } 6615 6616 KMP_MB(); /* Flush all pending memory write invalidates. */ 6617 6618 /* find out who we are and what we should do */ 6619 { 6620 int gtid = (gtid_req >= 0) ? gtid_req : __kmp_gtid_get_specific(); 6621 KA_TRACE(10, 6622 ("__kmp_internal_end_thread: enter T#%d (%d)\n", gtid, gtid_req)); 6623 if (gtid == KMP_GTID_SHUTDOWN) { 6624 KA_TRACE(10, ("__kmp_internal_end_thread: !__kmp_init_runtime, system " 6625 "already shutdown\n")); 6626 return; 6627 } else if (gtid == KMP_GTID_MONITOR) { 6628 KA_TRACE(10, ("__kmp_internal_end_thread: monitor thread, gtid not " 6629 "registered, or system shutdown\n")); 6630 return; 6631 } else if (gtid == KMP_GTID_DNE) { 6632 KA_TRACE(10, ("__kmp_internal_end_thread: gtid not registered or system " 6633 "shutdown\n")); 6634 return; 6635 /* we don't know who we are */ 6636 } else if (KMP_UBER_GTID(gtid)) { 6637 /* unregister ourselves as an uber thread. gtid is no longer valid */ 6638 if (__kmp_root[gtid]->r.r_active) { 6639 __kmp_global.g.g_abort = -1; 6640 TCW_SYNC_4(__kmp_global.g.g_done, TRUE); 6641 KA_TRACE(10, 6642 ("__kmp_internal_end_thread: root still active, abort T#%d\n", 6643 gtid)); 6644 return; 6645 } else { 6646 KA_TRACE(10, ("__kmp_internal_end_thread: unregistering sibling T#%d\n", 6647 gtid)); 6648 __kmp_unregister_root_current_thread(gtid); 6649 } 6650 } else { 6651 /* just a worker thread, let's leave */ 6652 KA_TRACE(10, ("__kmp_internal_end_thread: worker thread T#%d\n", gtid)); 6653 6654 if (gtid >= 0) { 6655 __kmp_threads[gtid]->th.th_task_team = NULL; 6656 } 6657 6658 KA_TRACE(10, 6659 ("__kmp_internal_end_thread: worker thread done, exiting T#%d\n", 6660 gtid)); 6661 return; 6662 } 6663 } 6664 #if KMP_DYNAMIC_LIB 6665 if (__kmp_pause_status != kmp_hard_paused) 6666 // AC: lets not shutdown the dynamic library at the exit of uber thread, 6667 // because we will better shutdown later in the library destructor. 6668 { 6669 KA_TRACE(10, ("__kmp_internal_end_thread: exiting T#%d\n", gtid_req)); 6670 return; 6671 } 6672 #endif 6673 /* synchronize the termination process */ 6674 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock); 6675 6676 /* have we already finished */ 6677 if (__kmp_global.g.g_abort) { 6678 KA_TRACE(10, ("__kmp_internal_end_thread: abort, exiting\n")); 6679 /* TODO abort? */ 6680 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 6681 return; 6682 } 6683 if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) { 6684 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 6685 return; 6686 } 6687 6688 /* We need this lock to enforce mutex between this reading of 6689 __kmp_threads_capacity and the writing by __kmp_register_root. 6690 Alternatively, we can use a counter of roots that is atomically updated by 6691 __kmp_get_global_thread_id_reg, __kmp_do_serial_initialize and 6692 __kmp_internal_end_*. */ 6693 6694 /* should we finish the run-time? are all siblings done? */ 6695 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock); 6696 6697 for (i = 0; i < __kmp_threads_capacity; ++i) { 6698 if (KMP_UBER_GTID(i)) { 6699 KA_TRACE( 6700 10, 6701 ("__kmp_internal_end_thread: remaining sibling task: gtid==%d\n", i)); 6702 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); 6703 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 6704 return; 6705 } 6706 } 6707 6708 /* now we can safely conduct the actual termination */ 6709 6710 __kmp_internal_end(); 6711 6712 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); 6713 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 6714 6715 KA_TRACE(10, ("__kmp_internal_end_thread: exit T#%d\n", gtid_req)); 6716 6717 #ifdef DUMP_DEBUG_ON_EXIT 6718 if (__kmp_debug_buf) 6719 __kmp_dump_debug_buffer(); 6720 #endif 6721 } // __kmp_internal_end_thread 6722 6723 // ----------------------------------------------------------------------------- 6724 // Library registration stuff. 6725 6726 static long __kmp_registration_flag = 0; 6727 // Random value used to indicate library initialization. 6728 static char *__kmp_registration_str = NULL; 6729 // Value to be saved in env var __KMP_REGISTERED_LIB_<pid>. 6730 6731 static inline char *__kmp_reg_status_name() { 6732 /* On RHEL 3u5 if linked statically, getpid() returns different values in 6733 each thread. If registration and unregistration go in different threads 6734 (omp_misc_other_root_exit.cpp test case), the name of registered_lib_env 6735 env var can not be found, because the name will contain different pid. */ 6736 // macOS* complains about name being too long with additional getuid() 6737 #if KMP_OS_UNIX && !KMP_OS_DARWIN && KMP_DYNAMIC_LIB 6738 return __kmp_str_format("__KMP_REGISTERED_LIB_%d_%d", (int)getpid(), 6739 (int)getuid()); 6740 #else 6741 return __kmp_str_format("__KMP_REGISTERED_LIB_%d", (int)getpid()); 6742 #endif 6743 } // __kmp_reg_status_get 6744 6745 #if defined(KMP_USE_SHM) 6746 bool __kmp_shm_available = false; 6747 bool __kmp_tmp_available = false; 6748 // If /dev/shm is not accessible, we will create a temporary file under /tmp. 6749 char *temp_reg_status_file_name = nullptr; 6750 #endif 6751 6752 void __kmp_register_library_startup(void) { 6753 6754 char *name = __kmp_reg_status_name(); // Name of the environment variable. 6755 int done = 0; 6756 union { 6757 double dtime; 6758 long ltime; 6759 } time; 6760 #if KMP_ARCH_X86 || KMP_ARCH_X86_64 6761 __kmp_initialize_system_tick(); 6762 #endif 6763 __kmp_read_system_time(&time.dtime); 6764 __kmp_registration_flag = 0xCAFE0000L | (time.ltime & 0x0000FFFFL); 6765 __kmp_registration_str = 6766 __kmp_str_format("%p-%lx-%s", &__kmp_registration_flag, 6767 __kmp_registration_flag, KMP_LIBRARY_FILE); 6768 6769 KA_TRACE(50, ("__kmp_register_library_startup: %s=\"%s\"\n", name, 6770 __kmp_registration_str)); 6771 6772 while (!done) { 6773 6774 char *value = NULL; // Actual value of the environment variable. 6775 6776 #if defined(KMP_USE_SHM) 6777 char *shm_name = nullptr; 6778 char *data1 = nullptr; 6779 __kmp_shm_available = __kmp_detect_shm(); 6780 if (__kmp_shm_available) { 6781 int fd1 = -1; 6782 shm_name = __kmp_str_format("/%s", name); 6783 int shm_preexist = 0; 6784 fd1 = shm_open(shm_name, O_CREAT | O_EXCL | O_RDWR, 0600); 6785 if ((fd1 == -1) && (errno == EEXIST)) { 6786 // file didn't open because it already exists. 6787 // try opening existing file 6788 fd1 = shm_open(shm_name, O_RDWR, 0600); 6789 if (fd1 == -1) { // file didn't open 6790 KMP_WARNING(FunctionError, "Can't open SHM"); 6791 __kmp_shm_available = false; 6792 } else { // able to open existing file 6793 shm_preexist = 1; 6794 } 6795 } 6796 if (__kmp_shm_available && shm_preexist == 0) { // SHM created, set size 6797 if (ftruncate(fd1, SHM_SIZE) == -1) { // error occured setting size; 6798 KMP_WARNING(FunctionError, "Can't set size of SHM"); 6799 __kmp_shm_available = false; 6800 } 6801 } 6802 if (__kmp_shm_available) { // SHM exists, now map it 6803 data1 = (char *)mmap(0, SHM_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED, 6804 fd1, 0); 6805 if (data1 == MAP_FAILED) { // failed to map shared memory 6806 KMP_WARNING(FunctionError, "Can't map SHM"); 6807 __kmp_shm_available = false; 6808 } 6809 } 6810 if (__kmp_shm_available) { // SHM mapped 6811 if (shm_preexist == 0) { // set data to SHM, set value 6812 KMP_STRCPY_S(data1, SHM_SIZE, __kmp_registration_str); 6813 } 6814 // Read value from either what we just wrote or existing file. 6815 value = __kmp_str_format("%s", data1); // read value from SHM 6816 munmap(data1, SHM_SIZE); 6817 } 6818 if (fd1 != -1) 6819 close(fd1); 6820 } 6821 if (!__kmp_shm_available) 6822 __kmp_tmp_available = __kmp_detect_tmp(); 6823 if (!__kmp_shm_available && __kmp_tmp_available) { 6824 // SHM failed to work due to an error other than that the file already 6825 // exists. Try to create a temp file under /tmp. 6826 // If /tmp isn't accessible, fall back to using environment variable. 6827 // TODO: /tmp might not always be the temporary directory. For now we will 6828 // not consider TMPDIR. 6829 int fd1 = -1; 6830 temp_reg_status_file_name = __kmp_str_format("/tmp/%s", name); 6831 int tmp_preexist = 0; 6832 fd1 = open(temp_reg_status_file_name, O_CREAT | O_EXCL | O_RDWR, 0600); 6833 if ((fd1 == -1) && (errno == EEXIST)) { 6834 // file didn't open because it already exists. 6835 // try opening existing file 6836 fd1 = open(temp_reg_status_file_name, O_RDWR, 0600); 6837 if (fd1 == -1) { // file didn't open if (fd1 == -1) { 6838 KMP_WARNING(FunctionError, "Can't open TEMP"); 6839 __kmp_tmp_available = false; 6840 } else { 6841 tmp_preexist = 1; 6842 } 6843 } 6844 if (__kmp_tmp_available && tmp_preexist == 0) { 6845 // we created /tmp file now set size 6846 if (ftruncate(fd1, SHM_SIZE) == -1) { // error occured setting size; 6847 KMP_WARNING(FunctionError, "Can't set size of /tmp file"); 6848 __kmp_tmp_available = false; 6849 } 6850 } 6851 if (__kmp_tmp_available) { 6852 data1 = (char *)mmap(0, SHM_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED, 6853 fd1, 0); 6854 if (data1 == MAP_FAILED) { // failed to map /tmp 6855 KMP_WARNING(FunctionError, "Can't map /tmp"); 6856 __kmp_tmp_available = false; 6857 } 6858 } 6859 if (__kmp_tmp_available) { 6860 if (tmp_preexist == 0) { // set data to TMP, set value 6861 KMP_STRCPY_S(data1, SHM_SIZE, __kmp_registration_str); 6862 } 6863 // Read value from either what we just wrote or existing file. 6864 value = __kmp_str_format("%s", data1); // read value from SHM 6865 munmap(data1, SHM_SIZE); 6866 } 6867 if (fd1 != -1) 6868 close(fd1); 6869 } 6870 if (!__kmp_shm_available && !__kmp_tmp_available) { 6871 // no /dev/shm and no /tmp -- fall back to environment variable 6872 // Set environment variable, but do not overwrite if it exists. 6873 __kmp_env_set(name, __kmp_registration_str, 0); 6874 // read value to see if it got set 6875 value = __kmp_env_get(name); 6876 } 6877 #else // Windows and unix with static library 6878 // Set environment variable, but do not overwrite if it exists. 6879 __kmp_env_set(name, __kmp_registration_str, 0); 6880 // read value to see if it got set 6881 value = __kmp_env_get(name); 6882 #endif 6883 6884 if (value != NULL && strcmp(value, __kmp_registration_str) == 0) { 6885 done = 1; // Ok, environment variable set successfully, exit the loop. 6886 } else { 6887 // Oops. Write failed. Another copy of OpenMP RTL is in memory. 6888 // Check whether it alive or dead. 6889 int neighbor = 0; // 0 -- unknown status, 1 -- alive, 2 -- dead. 6890 char *tail = value; 6891 char *flag_addr_str = NULL; 6892 char *flag_val_str = NULL; 6893 char const *file_name = NULL; 6894 __kmp_str_split(tail, '-', &flag_addr_str, &tail); 6895 __kmp_str_split(tail, '-', &flag_val_str, &tail); 6896 file_name = tail; 6897 if (tail != NULL) { 6898 unsigned long *flag_addr = 0; 6899 unsigned long flag_val = 0; 6900 KMP_SSCANF(flag_addr_str, "%p", RCAST(void **, &flag_addr)); 6901 KMP_SSCANF(flag_val_str, "%lx", &flag_val); 6902 if (flag_addr != 0 && flag_val != 0 && strcmp(file_name, "") != 0) { 6903 // First, check whether environment-encoded address is mapped into 6904 // addr space. 6905 // If so, dereference it to see if it still has the right value. 6906 if (__kmp_is_address_mapped(flag_addr) && *flag_addr == flag_val) { 6907 neighbor = 1; 6908 } else { 6909 // If not, then we know the other copy of the library is no longer 6910 // running. 6911 neighbor = 2; 6912 } 6913 } 6914 } 6915 switch (neighbor) { 6916 case 0: // Cannot parse environment variable -- neighbor status unknown. 6917 // Assume it is the incompatible format of future version of the 6918 // library. Assume the other library is alive. 6919 // WARN( ... ); // TODO: Issue a warning. 6920 file_name = "unknown library"; 6921 KMP_FALLTHROUGH(); 6922 // Attention! Falling to the next case. That's intentional. 6923 case 1: { // Neighbor is alive. 6924 // Check it is allowed. 6925 char *duplicate_ok = __kmp_env_get("KMP_DUPLICATE_LIB_OK"); 6926 if (!__kmp_str_match_true(duplicate_ok)) { 6927 // That's not allowed. Issue fatal error. 6928 __kmp_fatal(KMP_MSG(DuplicateLibrary, KMP_LIBRARY_FILE, file_name), 6929 KMP_HNT(DuplicateLibrary), __kmp_msg_null); 6930 } 6931 KMP_INTERNAL_FREE(duplicate_ok); 6932 __kmp_duplicate_library_ok = 1; 6933 done = 1; // Exit the loop. 6934 } break; 6935 case 2: { // Neighbor is dead. 6936 6937 #if defined(KMP_USE_SHM) 6938 if (__kmp_shm_available) { // close shared memory. 6939 shm_unlink(shm_name); // this removes file in /dev/shm 6940 } else if (__kmp_tmp_available) { 6941 unlink(temp_reg_status_file_name); // this removes the temp file 6942 } else { 6943 // Clear the variable and try to register library again. 6944 __kmp_env_unset(name); 6945 } 6946 #else 6947 // Clear the variable and try to register library again. 6948 __kmp_env_unset(name); 6949 #endif 6950 } break; 6951 default: { 6952 KMP_DEBUG_ASSERT(0); 6953 } break; 6954 } 6955 } 6956 KMP_INTERNAL_FREE((void *)value); 6957 #if defined(KMP_USE_SHM) 6958 if (shm_name) 6959 KMP_INTERNAL_FREE((void *)shm_name); 6960 #endif 6961 } // while 6962 KMP_INTERNAL_FREE((void *)name); 6963 6964 } // func __kmp_register_library_startup 6965 6966 void __kmp_unregister_library(void) { 6967 6968 char *name = __kmp_reg_status_name(); 6969 char *value = NULL; 6970 6971 #if defined(KMP_USE_SHM) 6972 char *shm_name = nullptr; 6973 int fd1; 6974 if (__kmp_shm_available) { 6975 shm_name = __kmp_str_format("/%s", name); 6976 fd1 = shm_open(shm_name, O_RDONLY, 0600); 6977 if (fd1 != -1) { // File opened successfully 6978 char *data1 = (char *)mmap(0, SHM_SIZE, PROT_READ, MAP_SHARED, fd1, 0); 6979 if (data1 != MAP_FAILED) { 6980 value = __kmp_str_format("%s", data1); // read value from SHM 6981 munmap(data1, SHM_SIZE); 6982 } 6983 close(fd1); 6984 } 6985 } else if (__kmp_tmp_available) { // try /tmp 6986 fd1 = open(temp_reg_status_file_name, O_RDONLY); 6987 if (fd1 != -1) { // File opened successfully 6988 char *data1 = (char *)mmap(0, SHM_SIZE, PROT_READ, MAP_SHARED, fd1, 0); 6989 if (data1 != MAP_FAILED) { 6990 value = __kmp_str_format("%s", data1); // read value from /tmp 6991 munmap(data1, SHM_SIZE); 6992 } 6993 close(fd1); 6994 } 6995 } else { // fall back to envirable 6996 value = __kmp_env_get(name); 6997 } 6998 #else 6999 value = __kmp_env_get(name); 7000 #endif 7001 7002 KMP_DEBUG_ASSERT(__kmp_registration_flag != 0); 7003 KMP_DEBUG_ASSERT(__kmp_registration_str != NULL); 7004 if (value != NULL && strcmp(value, __kmp_registration_str) == 0) { 7005 // Ok, this is our variable. Delete it. 7006 #if defined(KMP_USE_SHM) 7007 if (__kmp_shm_available) { 7008 shm_unlink(shm_name); // this removes file in /dev/shm 7009 } else if (__kmp_tmp_available) { 7010 unlink(temp_reg_status_file_name); // this removes the temp file 7011 } else { 7012 __kmp_env_unset(name); 7013 } 7014 #else 7015 __kmp_env_unset(name); 7016 #endif 7017 } 7018 7019 #if defined(KMP_USE_SHM) 7020 if (shm_name) 7021 KMP_INTERNAL_FREE(shm_name); 7022 if (temp_reg_status_file_name) 7023 KMP_INTERNAL_FREE(temp_reg_status_file_name); 7024 #endif 7025 7026 KMP_INTERNAL_FREE(__kmp_registration_str); 7027 KMP_INTERNAL_FREE(value); 7028 KMP_INTERNAL_FREE(name); 7029 7030 __kmp_registration_flag = 0; 7031 __kmp_registration_str = NULL; 7032 7033 } // __kmp_unregister_library 7034 7035 // End of Library registration stuff. 7036 // ----------------------------------------------------------------------------- 7037 7038 #if KMP_MIC_SUPPORTED 7039 7040 static void __kmp_check_mic_type() { 7041 kmp_cpuid_t cpuid_state = {0}; 7042 kmp_cpuid_t *cs_p = &cpuid_state; 7043 __kmp_x86_cpuid(1, 0, cs_p); 7044 // We don't support mic1 at the moment 7045 if ((cs_p->eax & 0xff0) == 0xB10) { 7046 __kmp_mic_type = mic2; 7047 } else if ((cs_p->eax & 0xf0ff0) == 0x50670) { 7048 __kmp_mic_type = mic3; 7049 } else { 7050 __kmp_mic_type = non_mic; 7051 } 7052 } 7053 7054 #endif /* KMP_MIC_SUPPORTED */ 7055 7056 #if KMP_HAVE_UMWAIT 7057 static void __kmp_user_level_mwait_init() { 7058 struct kmp_cpuid buf; 7059 __kmp_x86_cpuid(7, 0, &buf); 7060 __kmp_waitpkg_enabled = ((buf.ecx >> 5) & 1); 7061 __kmp_umwait_enabled = __kmp_waitpkg_enabled && __kmp_user_level_mwait; 7062 __kmp_tpause_enabled = __kmp_waitpkg_enabled && (__kmp_tpause_state > 0); 7063 KF_TRACE(30, ("__kmp_user_level_mwait_init: __kmp_umwait_enabled = %d\n", 7064 __kmp_umwait_enabled)); 7065 } 7066 #elif KMP_HAVE_MWAIT 7067 #ifndef AT_INTELPHIUSERMWAIT 7068 // Spurious, non-existent value that should always fail to return anything. 7069 // Will be replaced with the correct value when we know that. 7070 #define AT_INTELPHIUSERMWAIT 10000 7071 #endif 7072 // getauxval() function is available in RHEL7 and SLES12. If a system with an 7073 // earlier OS is used to build the RTL, we'll use the following internal 7074 // function when the entry is not found. 7075 unsigned long getauxval(unsigned long) KMP_WEAK_ATTRIBUTE_EXTERNAL; 7076 unsigned long getauxval(unsigned long) { return 0; } 7077 7078 static void __kmp_user_level_mwait_init() { 7079 // When getauxval() and correct value of AT_INTELPHIUSERMWAIT are available 7080 // use them to find if the user-level mwait is enabled. Otherwise, forcibly 7081 // set __kmp_mwait_enabled=TRUE on Intel MIC if the environment variable 7082 // KMP_USER_LEVEL_MWAIT was set to TRUE. 7083 if (__kmp_mic_type == mic3) { 7084 unsigned long res = getauxval(AT_INTELPHIUSERMWAIT); 7085 if ((res & 0x1) || __kmp_user_level_mwait) { 7086 __kmp_mwait_enabled = TRUE; 7087 if (__kmp_user_level_mwait) { 7088 KMP_INFORM(EnvMwaitWarn); 7089 } 7090 } else { 7091 __kmp_mwait_enabled = FALSE; 7092 } 7093 } 7094 KF_TRACE(30, ("__kmp_user_level_mwait_init: __kmp_mic_type = %d, " 7095 "__kmp_mwait_enabled = %d\n", 7096 __kmp_mic_type, __kmp_mwait_enabled)); 7097 } 7098 #endif /* KMP_HAVE_UMWAIT */ 7099 7100 static void __kmp_do_serial_initialize(void) { 7101 int i, gtid; 7102 size_t size; 7103 7104 KA_TRACE(10, ("__kmp_do_serial_initialize: enter\n")); 7105 7106 KMP_DEBUG_ASSERT(sizeof(kmp_int32) == 4); 7107 KMP_DEBUG_ASSERT(sizeof(kmp_uint32) == 4); 7108 KMP_DEBUG_ASSERT(sizeof(kmp_int64) == 8); 7109 KMP_DEBUG_ASSERT(sizeof(kmp_uint64) == 8); 7110 KMP_DEBUG_ASSERT(sizeof(kmp_intptr_t) == sizeof(void *)); 7111 7112 #if OMPT_SUPPORT 7113 ompt_pre_init(); 7114 #endif 7115 #if OMPD_SUPPORT 7116 __kmp_env_dump(); 7117 ompd_init(); 7118 #endif 7119 7120 __kmp_validate_locks(); 7121 7122 #if ENABLE_LIBOMPTARGET 7123 /* Initialize functions from libomptarget */ 7124 __kmp_init_omptarget(); 7125 #endif 7126 7127 /* Initialize internal memory allocator */ 7128 __kmp_init_allocator(); 7129 7130 /* Register the library startup via an environment variable or via mapped 7131 shared memory file and check to see whether another copy of the library is 7132 already registered. Since forked child process is often terminated, we 7133 postpone the registration till middle initialization in the child */ 7134 if (__kmp_need_register_serial) 7135 __kmp_register_library_startup(); 7136 7137 /* TODO reinitialization of library */ 7138 if (TCR_4(__kmp_global.g.g_done)) { 7139 KA_TRACE(10, ("__kmp_do_serial_initialize: reinitialization of library\n")); 7140 } 7141 7142 __kmp_global.g.g_abort = 0; 7143 TCW_SYNC_4(__kmp_global.g.g_done, FALSE); 7144 7145 /* initialize the locks */ 7146 #if KMP_USE_ADAPTIVE_LOCKS 7147 #if KMP_DEBUG_ADAPTIVE_LOCKS 7148 __kmp_init_speculative_stats(); 7149 #endif 7150 #endif 7151 #if KMP_STATS_ENABLED 7152 __kmp_stats_init(); 7153 #endif 7154 __kmp_init_lock(&__kmp_global_lock); 7155 __kmp_init_atomic_lock(&__kmp_atomic_lock); 7156 __kmp_init_atomic_lock(&__kmp_atomic_lock_1i); 7157 __kmp_init_atomic_lock(&__kmp_atomic_lock_2i); 7158 __kmp_init_atomic_lock(&__kmp_atomic_lock_4i); 7159 __kmp_init_atomic_lock(&__kmp_atomic_lock_4r); 7160 __kmp_init_atomic_lock(&__kmp_atomic_lock_8i); 7161 __kmp_init_atomic_lock(&__kmp_atomic_lock_8r); 7162 __kmp_init_atomic_lock(&__kmp_atomic_lock_8c); 7163 __kmp_init_atomic_lock(&__kmp_atomic_lock_10r); 7164 __kmp_init_atomic_lock(&__kmp_atomic_lock_16r); 7165 __kmp_init_atomic_lock(&__kmp_atomic_lock_16c); 7166 __kmp_init_atomic_lock(&__kmp_atomic_lock_20c); 7167 __kmp_init_atomic_lock(&__kmp_atomic_lock_32c); 7168 __kmp_init_bootstrap_lock(&__kmp_forkjoin_lock); 7169 __kmp_init_bootstrap_lock(&__kmp_exit_lock); 7170 #if KMP_USE_MONITOR 7171 __kmp_init_bootstrap_lock(&__kmp_monitor_lock); 7172 #endif 7173 __kmp_init_bootstrap_lock(&__kmp_tp_cached_lock); 7174 7175 /* conduct initialization and initial setup of configuration */ 7176 7177 __kmp_runtime_initialize(); 7178 7179 #if KMP_MIC_SUPPORTED 7180 __kmp_check_mic_type(); 7181 #endif 7182 7183 // Some global variable initialization moved here from kmp_env_initialize() 7184 #ifdef KMP_DEBUG 7185 kmp_diag = 0; 7186 #endif 7187 __kmp_abort_delay = 0; 7188 7189 // From __kmp_init_dflt_team_nth() 7190 /* assume the entire machine will be used */ 7191 __kmp_dflt_team_nth_ub = __kmp_xproc; 7192 if (__kmp_dflt_team_nth_ub < KMP_MIN_NTH) { 7193 __kmp_dflt_team_nth_ub = KMP_MIN_NTH; 7194 } 7195 if (__kmp_dflt_team_nth_ub > __kmp_sys_max_nth) { 7196 __kmp_dflt_team_nth_ub = __kmp_sys_max_nth; 7197 } 7198 __kmp_max_nth = __kmp_sys_max_nth; 7199 __kmp_cg_max_nth = __kmp_sys_max_nth; 7200 __kmp_teams_max_nth = __kmp_xproc; // set a "reasonable" default 7201 if (__kmp_teams_max_nth > __kmp_sys_max_nth) { 7202 __kmp_teams_max_nth = __kmp_sys_max_nth; 7203 } 7204 7205 // Three vars below moved here from __kmp_env_initialize() "KMP_BLOCKTIME" 7206 // part 7207 __kmp_dflt_blocktime = KMP_DEFAULT_BLOCKTIME; 7208 #if KMP_USE_MONITOR 7209 __kmp_monitor_wakeups = 7210 KMP_WAKEUPS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups); 7211 __kmp_bt_intervals = 7212 KMP_INTERVALS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups); 7213 #endif 7214 // From "KMP_LIBRARY" part of __kmp_env_initialize() 7215 __kmp_library = library_throughput; 7216 // From KMP_SCHEDULE initialization 7217 __kmp_static = kmp_sch_static_balanced; 7218 // AC: do not use analytical here, because it is non-monotonous 7219 //__kmp_guided = kmp_sch_guided_iterative_chunked; 7220 //__kmp_auto = kmp_sch_guided_analytical_chunked; // AC: it is the default, no 7221 // need to repeat assignment 7222 // Barrier initialization. Moved here from __kmp_env_initialize() Barrier branch 7223 // bit control and barrier method control parts 7224 #if KMP_FAST_REDUCTION_BARRIER 7225 #define kmp_reduction_barrier_gather_bb ((int)1) 7226 #define kmp_reduction_barrier_release_bb ((int)1) 7227 #define kmp_reduction_barrier_gather_pat __kmp_barrier_gather_pat_dflt 7228 #define kmp_reduction_barrier_release_pat __kmp_barrier_release_pat_dflt 7229 #endif // KMP_FAST_REDUCTION_BARRIER 7230 for (i = bs_plain_barrier; i < bs_last_barrier; i++) { 7231 __kmp_barrier_gather_branch_bits[i] = __kmp_barrier_gather_bb_dflt; 7232 __kmp_barrier_release_branch_bits[i] = __kmp_barrier_release_bb_dflt; 7233 __kmp_barrier_gather_pattern[i] = __kmp_barrier_gather_pat_dflt; 7234 __kmp_barrier_release_pattern[i] = __kmp_barrier_release_pat_dflt; 7235 #if KMP_FAST_REDUCTION_BARRIER 7236 if (i == bs_reduction_barrier) { // tested and confirmed on ALTIX only ( 7237 // lin_64 ): hyper,1 7238 __kmp_barrier_gather_branch_bits[i] = kmp_reduction_barrier_gather_bb; 7239 __kmp_barrier_release_branch_bits[i] = kmp_reduction_barrier_release_bb; 7240 __kmp_barrier_gather_pattern[i] = kmp_reduction_barrier_gather_pat; 7241 __kmp_barrier_release_pattern[i] = kmp_reduction_barrier_release_pat; 7242 } 7243 #endif // KMP_FAST_REDUCTION_BARRIER 7244 } 7245 #if KMP_FAST_REDUCTION_BARRIER 7246 #undef kmp_reduction_barrier_release_pat 7247 #undef kmp_reduction_barrier_gather_pat 7248 #undef kmp_reduction_barrier_release_bb 7249 #undef kmp_reduction_barrier_gather_bb 7250 #endif // KMP_FAST_REDUCTION_BARRIER 7251 #if KMP_MIC_SUPPORTED 7252 if (__kmp_mic_type == mic2) { // KNC 7253 // AC: plane=3,2, forkjoin=2,1 are optimal for 240 threads on KNC 7254 __kmp_barrier_gather_branch_bits[bs_plain_barrier] = 3; // plain gather 7255 __kmp_barrier_release_branch_bits[bs_forkjoin_barrier] = 7256 1; // forkjoin release 7257 __kmp_barrier_gather_pattern[bs_forkjoin_barrier] = bp_hierarchical_bar; 7258 __kmp_barrier_release_pattern[bs_forkjoin_barrier] = bp_hierarchical_bar; 7259 } 7260 #if KMP_FAST_REDUCTION_BARRIER 7261 if (__kmp_mic_type == mic2) { // KNC 7262 __kmp_barrier_gather_pattern[bs_reduction_barrier] = bp_hierarchical_bar; 7263 __kmp_barrier_release_pattern[bs_reduction_barrier] = bp_hierarchical_bar; 7264 } 7265 #endif // KMP_FAST_REDUCTION_BARRIER 7266 #endif // KMP_MIC_SUPPORTED 7267 7268 // From KMP_CHECKS initialization 7269 #ifdef KMP_DEBUG 7270 __kmp_env_checks = TRUE; /* development versions have the extra checks */ 7271 #else 7272 __kmp_env_checks = FALSE; /* port versions do not have the extra checks */ 7273 #endif 7274 7275 // From "KMP_FOREIGN_THREADS_THREADPRIVATE" initialization 7276 __kmp_foreign_tp = TRUE; 7277 7278 __kmp_global.g.g_dynamic = FALSE; 7279 __kmp_global.g.g_dynamic_mode = dynamic_default; 7280 7281 __kmp_init_nesting_mode(); 7282 7283 __kmp_env_initialize(NULL); 7284 7285 #if KMP_HAVE_MWAIT || KMP_HAVE_UMWAIT 7286 __kmp_user_level_mwait_init(); 7287 #endif 7288 // Print all messages in message catalog for testing purposes. 7289 #ifdef KMP_DEBUG 7290 char const *val = __kmp_env_get("KMP_DUMP_CATALOG"); 7291 if (__kmp_str_match_true(val)) { 7292 kmp_str_buf_t buffer; 7293 __kmp_str_buf_init(&buffer); 7294 __kmp_i18n_dump_catalog(&buffer); 7295 __kmp_printf("%s", buffer.str); 7296 __kmp_str_buf_free(&buffer); 7297 } 7298 __kmp_env_free(&val); 7299 #endif 7300 7301 __kmp_threads_capacity = 7302 __kmp_initial_threads_capacity(__kmp_dflt_team_nth_ub); 7303 // Moved here from __kmp_env_initialize() "KMP_ALL_THREADPRIVATE" part 7304 __kmp_tp_capacity = __kmp_default_tp_capacity( 7305 __kmp_dflt_team_nth_ub, __kmp_max_nth, __kmp_allThreadsSpecified); 7306 7307 // If the library is shut down properly, both pools must be NULL. Just in 7308 // case, set them to NULL -- some memory may leak, but subsequent code will 7309 // work even if pools are not freed. 7310 KMP_DEBUG_ASSERT(__kmp_thread_pool == NULL); 7311 KMP_DEBUG_ASSERT(__kmp_thread_pool_insert_pt == NULL); 7312 KMP_DEBUG_ASSERT(__kmp_team_pool == NULL); 7313 __kmp_thread_pool = NULL; 7314 __kmp_thread_pool_insert_pt = NULL; 7315 __kmp_team_pool = NULL; 7316 7317 /* Allocate all of the variable sized records */ 7318 /* NOTE: __kmp_threads_capacity entries are allocated, but the arrays are 7319 * expandable */ 7320 /* Since allocation is cache-aligned, just add extra padding at the end */ 7321 size = 7322 (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * __kmp_threads_capacity + 7323 CACHE_LINE; 7324 __kmp_threads = (kmp_info_t **)__kmp_allocate(size); 7325 __kmp_root = (kmp_root_t **)((char *)__kmp_threads + 7326 sizeof(kmp_info_t *) * __kmp_threads_capacity); 7327 7328 /* init thread counts */ 7329 KMP_DEBUG_ASSERT(__kmp_all_nth == 7330 0); // Asserts fail if the library is reinitializing and 7331 KMP_DEBUG_ASSERT(__kmp_nth == 0); // something was wrong in termination. 7332 __kmp_all_nth = 0; 7333 __kmp_nth = 0; 7334 7335 /* setup the uber master thread and hierarchy */ 7336 gtid = __kmp_register_root(TRUE); 7337 KA_TRACE(10, ("__kmp_do_serial_initialize T#%d\n", gtid)); 7338 KMP_ASSERT(KMP_UBER_GTID(gtid)); 7339 KMP_ASSERT(KMP_INITIAL_GTID(gtid)); 7340 7341 KMP_MB(); /* Flush all pending memory write invalidates. */ 7342 7343 __kmp_common_initialize(); 7344 7345 #if KMP_OS_UNIX 7346 /* invoke the child fork handler */ 7347 __kmp_register_atfork(); 7348 #endif 7349 7350 #if !KMP_DYNAMIC_LIB || \ 7351 ((KMP_COMPILER_ICC || KMP_COMPILER_ICX) && KMP_OS_DARWIN) 7352 { 7353 /* Invoke the exit handler when the program finishes, only for static 7354 library and macOS* dynamic. For other dynamic libraries, we already 7355 have _fini and DllMain. */ 7356 int rc = atexit(__kmp_internal_end_atexit); 7357 if (rc != 0) { 7358 __kmp_fatal(KMP_MSG(FunctionError, "atexit()"), KMP_ERR(rc), 7359 __kmp_msg_null); 7360 } 7361 } 7362 #endif 7363 7364 #if KMP_HANDLE_SIGNALS 7365 #if KMP_OS_UNIX 7366 /* NOTE: make sure that this is called before the user installs their own 7367 signal handlers so that the user handlers are called first. this way they 7368 can return false, not call our handler, avoid terminating the library, and 7369 continue execution where they left off. */ 7370 __kmp_install_signals(FALSE); 7371 #endif /* KMP_OS_UNIX */ 7372 #if KMP_OS_WINDOWS 7373 __kmp_install_signals(TRUE); 7374 #endif /* KMP_OS_WINDOWS */ 7375 #endif 7376 7377 /* we have finished the serial initialization */ 7378 __kmp_init_counter++; 7379 7380 __kmp_init_serial = TRUE; 7381 7382 if (__kmp_version) { 7383 __kmp_print_version_1(); 7384 } 7385 7386 if (__kmp_settings) { 7387 __kmp_env_print(); 7388 } 7389 7390 if (__kmp_display_env || __kmp_display_env_verbose) { 7391 __kmp_env_print_2(); 7392 } 7393 7394 #if OMPT_SUPPORT 7395 ompt_post_init(); 7396 #endif 7397 7398 KMP_MB(); 7399 7400 KA_TRACE(10, ("__kmp_do_serial_initialize: exit\n")); 7401 } 7402 7403 void __kmp_serial_initialize(void) { 7404 if (__kmp_init_serial) { 7405 return; 7406 } 7407 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock); 7408 if (__kmp_init_serial) { 7409 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 7410 return; 7411 } 7412 __kmp_do_serial_initialize(); 7413 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 7414 } 7415 7416 static void __kmp_do_middle_initialize(void) { 7417 int i, j; 7418 int prev_dflt_team_nth; 7419 7420 if (!__kmp_init_serial) { 7421 __kmp_do_serial_initialize(); 7422 } 7423 7424 KA_TRACE(10, ("__kmp_middle_initialize: enter\n")); 7425 7426 if (UNLIKELY(!__kmp_need_register_serial)) { 7427 // We are in a forked child process. The registration was skipped during 7428 // serial initialization in __kmp_atfork_child handler. Do it here. 7429 __kmp_register_library_startup(); 7430 } 7431 7432 // Save the previous value for the __kmp_dflt_team_nth so that 7433 // we can avoid some reinitialization if it hasn't changed. 7434 prev_dflt_team_nth = __kmp_dflt_team_nth; 7435 7436 #if KMP_AFFINITY_SUPPORTED 7437 // __kmp_affinity_initialize() will try to set __kmp_ncores to the 7438 // number of cores on the machine. 7439 __kmp_affinity_initialize(__kmp_affinity); 7440 7441 #endif /* KMP_AFFINITY_SUPPORTED */ 7442 7443 KMP_ASSERT(__kmp_xproc > 0); 7444 if (__kmp_avail_proc == 0) { 7445 __kmp_avail_proc = __kmp_xproc; 7446 } 7447 7448 // If there were empty places in num_threads list (OMP_NUM_THREADS=,,2,3), 7449 // correct them now 7450 j = 0; 7451 while ((j < __kmp_nested_nth.used) && !__kmp_nested_nth.nth[j]) { 7452 __kmp_nested_nth.nth[j] = __kmp_dflt_team_nth = __kmp_dflt_team_nth_ub = 7453 __kmp_avail_proc; 7454 j++; 7455 } 7456 7457 if (__kmp_dflt_team_nth == 0) { 7458 #ifdef KMP_DFLT_NTH_CORES 7459 // Default #threads = #cores 7460 __kmp_dflt_team_nth = __kmp_ncores; 7461 KA_TRACE(20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = " 7462 "__kmp_ncores (%d)\n", 7463 __kmp_dflt_team_nth)); 7464 #else 7465 // Default #threads = #available OS procs 7466 __kmp_dflt_team_nth = __kmp_avail_proc; 7467 KA_TRACE(20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = " 7468 "__kmp_avail_proc(%d)\n", 7469 __kmp_dflt_team_nth)); 7470 #endif /* KMP_DFLT_NTH_CORES */ 7471 } 7472 7473 if (__kmp_dflt_team_nth < KMP_MIN_NTH) { 7474 __kmp_dflt_team_nth = KMP_MIN_NTH; 7475 } 7476 if (__kmp_dflt_team_nth > __kmp_sys_max_nth) { 7477 __kmp_dflt_team_nth = __kmp_sys_max_nth; 7478 } 7479 7480 if (__kmp_nesting_mode > 0) 7481 __kmp_set_nesting_mode_threads(); 7482 7483 // There's no harm in continuing if the following check fails, 7484 // but it indicates an error in the previous logic. 7485 KMP_DEBUG_ASSERT(__kmp_dflt_team_nth <= __kmp_dflt_team_nth_ub); 7486 7487 if (__kmp_dflt_team_nth != prev_dflt_team_nth) { 7488 // Run through the __kmp_threads array and set the num threads icv for each 7489 // root thread that is currently registered with the RTL (which has not 7490 // already explicitly set its nthreads-var with a call to 7491 // omp_set_num_threads()). 7492 for (i = 0; i < __kmp_threads_capacity; i++) { 7493 kmp_info_t *thread = __kmp_threads[i]; 7494 if (thread == NULL) 7495 continue; 7496 if (thread->th.th_current_task->td_icvs.nproc != 0) 7497 continue; 7498 7499 set__nproc(__kmp_threads[i], __kmp_dflt_team_nth); 7500 } 7501 } 7502 KA_TRACE( 7503 20, 7504 ("__kmp_middle_initialize: final value for __kmp_dflt_team_nth = %d\n", 7505 __kmp_dflt_team_nth)); 7506 7507 #ifdef KMP_ADJUST_BLOCKTIME 7508 /* Adjust blocktime to zero if necessary now that __kmp_avail_proc is set */ 7509 if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) { 7510 KMP_DEBUG_ASSERT(__kmp_avail_proc > 0); 7511 if (__kmp_nth > __kmp_avail_proc) { 7512 __kmp_zero_bt = TRUE; 7513 } 7514 } 7515 #endif /* KMP_ADJUST_BLOCKTIME */ 7516 7517 /* we have finished middle initialization */ 7518 TCW_SYNC_4(__kmp_init_middle, TRUE); 7519 7520 KA_TRACE(10, ("__kmp_do_middle_initialize: exit\n")); 7521 } 7522 7523 void __kmp_middle_initialize(void) { 7524 if (__kmp_init_middle) { 7525 return; 7526 } 7527 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock); 7528 if (__kmp_init_middle) { 7529 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 7530 return; 7531 } 7532 __kmp_do_middle_initialize(); 7533 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 7534 } 7535 7536 void __kmp_parallel_initialize(void) { 7537 int gtid = __kmp_entry_gtid(); // this might be a new root 7538 7539 /* synchronize parallel initialization (for sibling) */ 7540 if (TCR_4(__kmp_init_parallel)) 7541 return; 7542 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock); 7543 if (TCR_4(__kmp_init_parallel)) { 7544 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 7545 return; 7546 } 7547 7548 /* TODO reinitialization after we have already shut down */ 7549 if (TCR_4(__kmp_global.g.g_done)) { 7550 KA_TRACE( 7551 10, 7552 ("__kmp_parallel_initialize: attempt to init while shutting down\n")); 7553 __kmp_infinite_loop(); 7554 } 7555 7556 /* jc: The lock __kmp_initz_lock is already held, so calling 7557 __kmp_serial_initialize would cause a deadlock. So we call 7558 __kmp_do_serial_initialize directly. */ 7559 if (!__kmp_init_middle) { 7560 __kmp_do_middle_initialize(); 7561 } 7562 __kmp_assign_root_init_mask(); 7563 __kmp_resume_if_hard_paused(); 7564 7565 /* begin initialization */ 7566 KA_TRACE(10, ("__kmp_parallel_initialize: enter\n")); 7567 KMP_ASSERT(KMP_UBER_GTID(gtid)); 7568 7569 #if KMP_ARCH_X86 || KMP_ARCH_X86_64 7570 // Save the FP control regs. 7571 // Worker threads will set theirs to these values at thread startup. 7572 __kmp_store_x87_fpu_control_word(&__kmp_init_x87_fpu_control_word); 7573 __kmp_store_mxcsr(&__kmp_init_mxcsr); 7574 __kmp_init_mxcsr &= KMP_X86_MXCSR_MASK; 7575 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */ 7576 7577 #if KMP_OS_UNIX 7578 #if KMP_HANDLE_SIGNALS 7579 /* must be after __kmp_serial_initialize */ 7580 __kmp_install_signals(TRUE); 7581 #endif 7582 #endif 7583 7584 __kmp_suspend_initialize(); 7585 7586 #if defined(USE_LOAD_BALANCE) 7587 if (__kmp_global.g.g_dynamic_mode == dynamic_default) { 7588 __kmp_global.g.g_dynamic_mode = dynamic_load_balance; 7589 } 7590 #else 7591 if (__kmp_global.g.g_dynamic_mode == dynamic_default) { 7592 __kmp_global.g.g_dynamic_mode = dynamic_thread_limit; 7593 } 7594 #endif 7595 7596 if (__kmp_version) { 7597 __kmp_print_version_2(); 7598 } 7599 7600 /* we have finished parallel initialization */ 7601 TCW_SYNC_4(__kmp_init_parallel, TRUE); 7602 7603 KMP_MB(); 7604 KA_TRACE(10, ("__kmp_parallel_initialize: exit\n")); 7605 7606 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 7607 } 7608 7609 void __kmp_hidden_helper_initialize() { 7610 if (TCR_4(__kmp_init_hidden_helper)) 7611 return; 7612 7613 // __kmp_parallel_initialize is required before we initialize hidden helper 7614 if (!TCR_4(__kmp_init_parallel)) 7615 __kmp_parallel_initialize(); 7616 7617 // Double check. Note that this double check should not be placed before 7618 // __kmp_parallel_initialize as it will cause dead lock. 7619 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock); 7620 if (TCR_4(__kmp_init_hidden_helper)) { 7621 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 7622 return; 7623 } 7624 7625 #if KMP_AFFINITY_SUPPORTED 7626 // Initialize hidden helper affinity settings. 7627 // The above __kmp_parallel_initialize() will initialize 7628 // regular affinity (and topology) if not already done. 7629 if (!__kmp_hh_affinity.flags.initialized) 7630 __kmp_affinity_initialize(__kmp_hh_affinity); 7631 #endif 7632 7633 // Set the count of hidden helper tasks to be executed to zero 7634 KMP_ATOMIC_ST_REL(&__kmp_unexecuted_hidden_helper_tasks, 0); 7635 7636 // Set the global variable indicating that we're initializing hidden helper 7637 // team/threads 7638 TCW_SYNC_4(__kmp_init_hidden_helper_threads, TRUE); 7639 7640 // Platform independent initialization 7641 __kmp_do_initialize_hidden_helper_threads(); 7642 7643 // Wait here for the finish of initialization of hidden helper teams 7644 __kmp_hidden_helper_threads_initz_wait(); 7645 7646 // We have finished hidden helper initialization 7647 TCW_SYNC_4(__kmp_init_hidden_helper, TRUE); 7648 7649 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 7650 } 7651 7652 /* ------------------------------------------------------------------------ */ 7653 7654 void __kmp_run_before_invoked_task(int gtid, int tid, kmp_info_t *this_thr, 7655 kmp_team_t *team) { 7656 kmp_disp_t *dispatch; 7657 7658 KMP_MB(); 7659 7660 /* none of the threads have encountered any constructs, yet. */ 7661 this_thr->th.th_local.this_construct = 0; 7662 #if KMP_CACHE_MANAGE 7663 KMP_CACHE_PREFETCH(&this_thr->th.th_bar[bs_forkjoin_barrier].bb.b_arrived); 7664 #endif /* KMP_CACHE_MANAGE */ 7665 dispatch = (kmp_disp_t *)TCR_PTR(this_thr->th.th_dispatch); 7666 KMP_DEBUG_ASSERT(dispatch); 7667 KMP_DEBUG_ASSERT(team->t.t_dispatch); 7668 // KMP_DEBUG_ASSERT( this_thr->th.th_dispatch == &team->t.t_dispatch[ 7669 // this_thr->th.th_info.ds.ds_tid ] ); 7670 7671 dispatch->th_disp_index = 0; /* reset the dispatch buffer counter */ 7672 dispatch->th_doacross_buf_idx = 0; // reset doacross dispatch buffer counter 7673 if (__kmp_env_consistency_check) 7674 __kmp_push_parallel(gtid, team->t.t_ident); 7675 7676 KMP_MB(); /* Flush all pending memory write invalidates. */ 7677 } 7678 7679 void __kmp_run_after_invoked_task(int gtid, int tid, kmp_info_t *this_thr, 7680 kmp_team_t *team) { 7681 if (__kmp_env_consistency_check) 7682 __kmp_pop_parallel(gtid, team->t.t_ident); 7683 7684 __kmp_finish_implicit_task(this_thr); 7685 } 7686 7687 int __kmp_invoke_task_func(int gtid) { 7688 int rc; 7689 int tid = __kmp_tid_from_gtid(gtid); 7690 kmp_info_t *this_thr = __kmp_threads[gtid]; 7691 kmp_team_t *team = this_thr->th.th_team; 7692 7693 __kmp_run_before_invoked_task(gtid, tid, this_thr, team); 7694 #if USE_ITT_BUILD 7695 if (__itt_stack_caller_create_ptr) { 7696 // inform ittnotify about entering user's code 7697 if (team->t.t_stack_id != NULL) { 7698 __kmp_itt_stack_callee_enter((__itt_caller)team->t.t_stack_id); 7699 } else { 7700 KMP_DEBUG_ASSERT(team->t.t_parent->t.t_stack_id != NULL); 7701 __kmp_itt_stack_callee_enter( 7702 (__itt_caller)team->t.t_parent->t.t_stack_id); 7703 } 7704 } 7705 #endif /* USE_ITT_BUILD */ 7706 #if INCLUDE_SSC_MARKS 7707 SSC_MARK_INVOKING(); 7708 #endif 7709 7710 #if OMPT_SUPPORT 7711 void *dummy; 7712 void **exit_frame_p; 7713 ompt_data_t *my_task_data; 7714 ompt_data_t *my_parallel_data; 7715 int ompt_team_size; 7716 7717 if (ompt_enabled.enabled) { 7718 exit_frame_p = &(team->t.t_implicit_task_taskdata[tid] 7719 .ompt_task_info.frame.exit_frame.ptr); 7720 } else { 7721 exit_frame_p = &dummy; 7722 } 7723 7724 my_task_data = 7725 &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data); 7726 my_parallel_data = &(team->t.ompt_team_info.parallel_data); 7727 if (ompt_enabled.ompt_callback_implicit_task) { 7728 ompt_team_size = team->t.t_nproc; 7729 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 7730 ompt_scope_begin, my_parallel_data, my_task_data, ompt_team_size, 7731 __kmp_tid_from_gtid(gtid), ompt_task_implicit); 7732 OMPT_CUR_TASK_INFO(this_thr)->thread_num = __kmp_tid_from_gtid(gtid); 7733 } 7734 #endif 7735 7736 #if KMP_STATS_ENABLED 7737 stats_state_e previous_state = KMP_GET_THREAD_STATE(); 7738 if (previous_state == stats_state_e::TEAMS_REGION) { 7739 KMP_PUSH_PARTITIONED_TIMER(OMP_teams); 7740 } else { 7741 KMP_PUSH_PARTITIONED_TIMER(OMP_parallel); 7742 } 7743 KMP_SET_THREAD_STATE(IMPLICIT_TASK); 7744 #endif 7745 7746 rc = __kmp_invoke_microtask((microtask_t)TCR_SYNC_PTR(team->t.t_pkfn), gtid, 7747 tid, (int)team->t.t_argc, (void **)team->t.t_argv 7748 #if OMPT_SUPPORT 7749 , 7750 exit_frame_p 7751 #endif 7752 ); 7753 #if OMPT_SUPPORT 7754 *exit_frame_p = NULL; 7755 this_thr->th.ompt_thread_info.parallel_flags = ompt_parallel_team; 7756 #endif 7757 7758 #if KMP_STATS_ENABLED 7759 if (previous_state == stats_state_e::TEAMS_REGION) { 7760 KMP_SET_THREAD_STATE(previous_state); 7761 } 7762 KMP_POP_PARTITIONED_TIMER(); 7763 #endif 7764 7765 #if USE_ITT_BUILD 7766 if (__itt_stack_caller_create_ptr) { 7767 // inform ittnotify about leaving user's code 7768 if (team->t.t_stack_id != NULL) { 7769 __kmp_itt_stack_callee_leave((__itt_caller)team->t.t_stack_id); 7770 } else { 7771 KMP_DEBUG_ASSERT(team->t.t_parent->t.t_stack_id != NULL); 7772 __kmp_itt_stack_callee_leave( 7773 (__itt_caller)team->t.t_parent->t.t_stack_id); 7774 } 7775 } 7776 #endif /* USE_ITT_BUILD */ 7777 __kmp_run_after_invoked_task(gtid, tid, this_thr, team); 7778 7779 return rc; 7780 } 7781 7782 void __kmp_teams_master(int gtid) { 7783 // This routine is called by all primary threads in teams construct 7784 kmp_info_t *thr = __kmp_threads[gtid]; 7785 kmp_team_t *team = thr->th.th_team; 7786 ident_t *loc = team->t.t_ident; 7787 thr->th.th_set_nproc = thr->th.th_teams_size.nth; 7788 KMP_DEBUG_ASSERT(thr->th.th_teams_microtask); 7789 KMP_DEBUG_ASSERT(thr->th.th_set_nproc); 7790 KA_TRACE(20, ("__kmp_teams_master: T#%d, Tid %d, microtask %p\n", gtid, 7791 __kmp_tid_from_gtid(gtid), thr->th.th_teams_microtask)); 7792 7793 // This thread is a new CG root. Set up the proper variables. 7794 kmp_cg_root_t *tmp = (kmp_cg_root_t *)__kmp_allocate(sizeof(kmp_cg_root_t)); 7795 tmp->cg_root = thr; // Make thr the CG root 7796 // Init to thread limit stored when league primary threads were forked 7797 tmp->cg_thread_limit = thr->th.th_current_task->td_icvs.thread_limit; 7798 tmp->cg_nthreads = 1; // Init counter to one active thread, this one 7799 KA_TRACE(100, ("__kmp_teams_master: Thread %p created node %p and init" 7800 " cg_nthreads to 1\n", 7801 thr, tmp)); 7802 tmp->up = thr->th.th_cg_roots; 7803 thr->th.th_cg_roots = tmp; 7804 7805 // Launch league of teams now, but not let workers execute 7806 // (they hang on fork barrier until next parallel) 7807 #if INCLUDE_SSC_MARKS 7808 SSC_MARK_FORKING(); 7809 #endif 7810 __kmp_fork_call(loc, gtid, fork_context_intel, team->t.t_argc, 7811 (microtask_t)thr->th.th_teams_microtask, // "wrapped" task 7812 VOLATILE_CAST(launch_t) __kmp_invoke_task_func, NULL); 7813 #if INCLUDE_SSC_MARKS 7814 SSC_MARK_JOINING(); 7815 #endif 7816 // If the team size was reduced from the limit, set it to the new size 7817 if (thr->th.th_team_nproc < thr->th.th_teams_size.nth) 7818 thr->th.th_teams_size.nth = thr->th.th_team_nproc; 7819 // AC: last parameter "1" eliminates join barrier which won't work because 7820 // worker threads are in a fork barrier waiting for more parallel regions 7821 __kmp_join_call(loc, gtid 7822 #if OMPT_SUPPORT 7823 , 7824 fork_context_intel 7825 #endif 7826 , 7827 1); 7828 } 7829 7830 int __kmp_invoke_teams_master(int gtid) { 7831 kmp_info_t *this_thr = __kmp_threads[gtid]; 7832 kmp_team_t *team = this_thr->th.th_team; 7833 #if KMP_DEBUG 7834 if (!__kmp_threads[gtid]->th.th_team->t.t_serialized) 7835 KMP_DEBUG_ASSERT((void *)__kmp_threads[gtid]->th.th_team->t.t_pkfn == 7836 (void *)__kmp_teams_master); 7837 #endif 7838 __kmp_run_before_invoked_task(gtid, 0, this_thr, team); 7839 #if OMPT_SUPPORT 7840 int tid = __kmp_tid_from_gtid(gtid); 7841 ompt_data_t *task_data = 7842 &team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data; 7843 ompt_data_t *parallel_data = &team->t.ompt_team_info.parallel_data; 7844 if (ompt_enabled.ompt_callback_implicit_task) { 7845 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 7846 ompt_scope_begin, parallel_data, task_data, team->t.t_nproc, tid, 7847 ompt_task_initial); 7848 OMPT_CUR_TASK_INFO(this_thr)->thread_num = tid; 7849 } 7850 #endif 7851 __kmp_teams_master(gtid); 7852 #if OMPT_SUPPORT 7853 this_thr->th.ompt_thread_info.parallel_flags = ompt_parallel_league; 7854 #endif 7855 __kmp_run_after_invoked_task(gtid, 0, this_thr, team); 7856 return 1; 7857 } 7858 7859 /* this sets the requested number of threads for the next parallel region 7860 encountered by this team. since this should be enclosed in the forkjoin 7861 critical section it should avoid race conditions with asymmetrical nested 7862 parallelism */ 7863 void __kmp_push_num_threads(ident_t *id, int gtid, int num_threads) { 7864 kmp_info_t *thr = __kmp_threads[gtid]; 7865 7866 if (num_threads > 0) 7867 thr->th.th_set_nproc = num_threads; 7868 } 7869 7870 void __kmp_push_num_threads_list(ident_t *id, int gtid, kmp_uint32 list_length, 7871 int *num_threads_list) { 7872 kmp_info_t *thr = __kmp_threads[gtid]; 7873 7874 KMP_DEBUG_ASSERT(list_length > 1); 7875 7876 if (num_threads_list[0] > 0) 7877 thr->th.th_set_nproc = num_threads_list[0]; 7878 thr->th.th_set_nested_nth = 7879 (int *)KMP_INTERNAL_MALLOC(list_length * sizeof(int)); 7880 for (kmp_uint32 i = 0; i < list_length; ++i) 7881 thr->th.th_set_nested_nth[i] = num_threads_list[i]; 7882 thr->th.th_set_nested_nth_sz = list_length; 7883 } 7884 7885 void __kmp_set_strict_num_threads(ident_t *loc, int gtid, int sev, 7886 const char *msg) { 7887 kmp_info_t *thr = __kmp_threads[gtid]; 7888 thr->th.th_nt_strict = true; 7889 thr->th.th_nt_loc = loc; 7890 // if sev is unset make fatal 7891 if (sev == severity_warning) 7892 thr->th.th_nt_sev = sev; 7893 else 7894 thr->th.th_nt_sev = severity_fatal; 7895 // if msg is unset, use an appropriate message 7896 if (msg) 7897 thr->th.th_nt_msg = msg; 7898 else 7899 thr->th.th_nt_msg = "Cannot form team with number of threads specified by " 7900 "strict num_threads clause."; 7901 } 7902 7903 static void __kmp_push_thread_limit(kmp_info_t *thr, int num_teams, 7904 int num_threads) { 7905 KMP_DEBUG_ASSERT(thr); 7906 // Remember the number of threads for inner parallel regions 7907 if (!TCR_4(__kmp_init_middle)) 7908 __kmp_middle_initialize(); // get internal globals calculated 7909 __kmp_assign_root_init_mask(); 7910 KMP_DEBUG_ASSERT(__kmp_avail_proc); 7911 KMP_DEBUG_ASSERT(__kmp_dflt_team_nth); 7912 7913 if (num_threads == 0) { 7914 if (__kmp_teams_thread_limit > 0) { 7915 num_threads = __kmp_teams_thread_limit; 7916 } else { 7917 num_threads = __kmp_avail_proc / num_teams; 7918 } 7919 // adjust num_threads w/o warning as it is not user setting 7920 // num_threads = min(num_threads, nthreads-var, thread-limit-var) 7921 // no thread_limit clause specified - do not change thread-limit-var ICV 7922 if (num_threads > __kmp_dflt_team_nth) { 7923 num_threads = __kmp_dflt_team_nth; // honor nthreads-var ICV 7924 } 7925 if (num_threads > thr->th.th_current_task->td_icvs.thread_limit) { 7926 num_threads = thr->th.th_current_task->td_icvs.thread_limit; 7927 } // prevent team size to exceed thread-limit-var 7928 if (num_teams * num_threads > __kmp_teams_max_nth) { 7929 num_threads = __kmp_teams_max_nth / num_teams; 7930 } 7931 if (num_threads == 0) { 7932 num_threads = 1; 7933 } 7934 } else { 7935 if (num_threads < 0) { 7936 __kmp_msg(kmp_ms_warning, KMP_MSG(CantFormThrTeam, num_threads, 1), 7937 __kmp_msg_null); 7938 num_threads = 1; 7939 } 7940 // This thread will be the primary thread of the league primary threads 7941 // Store new thread limit; old limit is saved in th_cg_roots list 7942 thr->th.th_current_task->td_icvs.thread_limit = num_threads; 7943 // num_threads = min(num_threads, nthreads-var) 7944 if (num_threads > __kmp_dflt_team_nth) { 7945 num_threads = __kmp_dflt_team_nth; // honor nthreads-var ICV 7946 } 7947 if (num_teams * num_threads > __kmp_teams_max_nth) { 7948 int new_threads = __kmp_teams_max_nth / num_teams; 7949 if (new_threads == 0) { 7950 new_threads = 1; 7951 } 7952 if (new_threads != num_threads) { 7953 if (!__kmp_reserve_warn) { // user asked for too many threads 7954 __kmp_reserve_warn = 1; // conflicts with KMP_TEAMS_THREAD_LIMIT 7955 __kmp_msg(kmp_ms_warning, 7956 KMP_MSG(CantFormThrTeam, num_threads, new_threads), 7957 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null); 7958 } 7959 } 7960 num_threads = new_threads; 7961 } 7962 } 7963 thr->th.th_teams_size.nth = num_threads; 7964 } 7965 7966 /* this sets the requested number of teams for the teams region and/or 7967 the number of threads for the next parallel region encountered */ 7968 void __kmp_push_num_teams(ident_t *id, int gtid, int num_teams, 7969 int num_threads) { 7970 kmp_info_t *thr = __kmp_threads[gtid]; 7971 if (num_teams < 0) { 7972 // OpenMP specification requires requested values to be positive, 7973 // but people can send us any value, so we'd better check 7974 __kmp_msg(kmp_ms_warning, KMP_MSG(NumTeamsNotPositive, num_teams, 1), 7975 __kmp_msg_null); 7976 num_teams = 1; 7977 } 7978 if (num_teams == 0) { 7979 if (__kmp_nteams > 0) { 7980 num_teams = __kmp_nteams; 7981 } else { 7982 num_teams = 1; // default number of teams is 1. 7983 } 7984 } 7985 if (num_teams > __kmp_teams_max_nth) { // if too many teams requested? 7986 if (!__kmp_reserve_warn) { 7987 __kmp_reserve_warn = 1; 7988 __kmp_msg(kmp_ms_warning, 7989 KMP_MSG(CantFormThrTeam, num_teams, __kmp_teams_max_nth), 7990 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null); 7991 } 7992 num_teams = __kmp_teams_max_nth; 7993 } 7994 // Set number of teams (number of threads in the outer "parallel" of the 7995 // teams) 7996 thr->th.th_set_nproc = thr->th.th_teams_size.nteams = num_teams; 7997 7998 __kmp_push_thread_limit(thr, num_teams, num_threads); 7999 } 8000 8001 /* This sets the requested number of teams for the teams region and/or 8002 the number of threads for the next parallel region encountered */ 8003 void __kmp_push_num_teams_51(ident_t *id, int gtid, int num_teams_lb, 8004 int num_teams_ub, int num_threads) { 8005 kmp_info_t *thr = __kmp_threads[gtid]; 8006 KMP_DEBUG_ASSERT(num_teams_lb >= 0 && num_teams_ub >= 0); 8007 KMP_DEBUG_ASSERT(num_teams_ub >= num_teams_lb); 8008 KMP_DEBUG_ASSERT(num_threads >= 0); 8009 8010 if (num_teams_lb > num_teams_ub) { 8011 __kmp_fatal(KMP_MSG(FailedToCreateTeam, num_teams_lb, num_teams_ub), 8012 KMP_HNT(SetNewBound, __kmp_teams_max_nth), __kmp_msg_null); 8013 } 8014 8015 int num_teams = 1; // defalt number of teams is 1. 8016 8017 if (num_teams_lb == 0 && num_teams_ub > 0) 8018 num_teams_lb = num_teams_ub; 8019 8020 if (num_teams_lb == 0 && num_teams_ub == 0) { // no num_teams clause 8021 num_teams = (__kmp_nteams > 0) ? __kmp_nteams : num_teams; 8022 if (num_teams > __kmp_teams_max_nth) { 8023 if (!__kmp_reserve_warn) { 8024 __kmp_reserve_warn = 1; 8025 __kmp_msg(kmp_ms_warning, 8026 KMP_MSG(CantFormThrTeam, num_teams, __kmp_teams_max_nth), 8027 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null); 8028 } 8029 num_teams = __kmp_teams_max_nth; 8030 } 8031 } else if (num_teams_lb == num_teams_ub) { // requires exact number of teams 8032 num_teams = num_teams_ub; 8033 } else { // num_teams_lb <= num_teams <= num_teams_ub 8034 if (num_threads <= 0) { 8035 if (num_teams_ub > __kmp_teams_max_nth) { 8036 num_teams = num_teams_lb; 8037 } else { 8038 num_teams = num_teams_ub; 8039 } 8040 } else { 8041 num_teams = (num_threads > __kmp_teams_max_nth) 8042 ? num_teams 8043 : __kmp_teams_max_nth / num_threads; 8044 if (num_teams < num_teams_lb) { 8045 num_teams = num_teams_lb; 8046 } else if (num_teams > num_teams_ub) { 8047 num_teams = num_teams_ub; 8048 } 8049 } 8050 } 8051 // Set number of teams (number of threads in the outer "parallel" of the 8052 // teams) 8053 thr->th.th_set_nproc = thr->th.th_teams_size.nteams = num_teams; 8054 8055 __kmp_push_thread_limit(thr, num_teams, num_threads); 8056 } 8057 8058 // Set the proc_bind var to use in the following parallel region. 8059 void __kmp_push_proc_bind(ident_t *id, int gtid, kmp_proc_bind_t proc_bind) { 8060 kmp_info_t *thr = __kmp_threads[gtid]; 8061 thr->th.th_set_proc_bind = proc_bind; 8062 } 8063 8064 /* Launch the worker threads into the microtask. */ 8065 8066 void __kmp_internal_fork(ident_t *id, int gtid, kmp_team_t *team) { 8067 kmp_info_t *this_thr = __kmp_threads[gtid]; 8068 8069 #ifdef KMP_DEBUG 8070 int f; 8071 #endif /* KMP_DEBUG */ 8072 8073 KMP_DEBUG_ASSERT(team); 8074 KMP_DEBUG_ASSERT(this_thr->th.th_team == team); 8075 KMP_ASSERT(KMP_MASTER_GTID(gtid)); 8076 KMP_MB(); /* Flush all pending memory write invalidates. */ 8077 8078 team->t.t_construct = 0; /* no single directives seen yet */ 8079 team->t.t_ordered.dt.t_value = 8080 0; /* thread 0 enters the ordered section first */ 8081 8082 /* Reset the identifiers on the dispatch buffer */ 8083 KMP_DEBUG_ASSERT(team->t.t_disp_buffer); 8084 if (team->t.t_max_nproc > 1) { 8085 int i; 8086 for (i = 0; i < __kmp_dispatch_num_buffers; ++i) { 8087 team->t.t_disp_buffer[i].buffer_index = i; 8088 team->t.t_disp_buffer[i].doacross_buf_idx = i; 8089 } 8090 } else { 8091 team->t.t_disp_buffer[0].buffer_index = 0; 8092 team->t.t_disp_buffer[0].doacross_buf_idx = 0; 8093 } 8094 8095 KMP_MB(); /* Flush all pending memory write invalidates. */ 8096 KMP_ASSERT(this_thr->th.th_team == team); 8097 8098 #ifdef KMP_DEBUG 8099 for (f = 0; f < team->t.t_nproc; f++) { 8100 KMP_DEBUG_ASSERT(team->t.t_threads[f] && 8101 team->t.t_threads[f]->th.th_team_nproc == team->t.t_nproc); 8102 } 8103 #endif /* KMP_DEBUG */ 8104 8105 /* release the worker threads so they may begin working */ 8106 __kmp_fork_barrier(gtid, 0); 8107 } 8108 8109 void __kmp_internal_join(ident_t *id, int gtid, kmp_team_t *team) { 8110 kmp_info_t *this_thr = __kmp_threads[gtid]; 8111 8112 KMP_DEBUG_ASSERT(team); 8113 KMP_DEBUG_ASSERT(this_thr->th.th_team == team); 8114 KMP_ASSERT(KMP_MASTER_GTID(gtid)); 8115 KMP_MB(); /* Flush all pending memory write invalidates. */ 8116 8117 /* Join barrier after fork */ 8118 8119 #ifdef KMP_DEBUG 8120 if (__kmp_threads[gtid] && 8121 __kmp_threads[gtid]->th.th_team_nproc != team->t.t_nproc) { 8122 __kmp_printf("GTID: %d, __kmp_threads[%d]=%p\n", gtid, gtid, 8123 __kmp_threads[gtid]); 8124 __kmp_printf("__kmp_threads[%d]->th.th_team_nproc=%d, TEAM: %p, " 8125 "team->t.t_nproc=%d\n", 8126 gtid, __kmp_threads[gtid]->th.th_team_nproc, team, 8127 team->t.t_nproc); 8128 __kmp_print_structure(); 8129 } 8130 KMP_DEBUG_ASSERT(__kmp_threads[gtid] && 8131 __kmp_threads[gtid]->th.th_team_nproc == team->t.t_nproc); 8132 #endif /* KMP_DEBUG */ 8133 8134 __kmp_join_barrier(gtid); /* wait for everyone */ 8135 #if OMPT_SUPPORT 8136 ompt_state_t ompt_state = this_thr->th.ompt_thread_info.state; 8137 if (ompt_enabled.enabled && 8138 (ompt_state == ompt_state_wait_barrier_teams || 8139 ompt_state == ompt_state_wait_barrier_implicit_parallel)) { 8140 int ds_tid = this_thr->th.th_info.ds.ds_tid; 8141 ompt_data_t *task_data = OMPT_CUR_TASK_DATA(this_thr); 8142 this_thr->th.ompt_thread_info.state = ompt_state_overhead; 8143 #if OMPT_OPTIONAL 8144 void *codeptr = NULL; 8145 if (KMP_MASTER_TID(ds_tid) && 8146 (ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) || 8147 ompt_callbacks.ompt_callback(ompt_callback_sync_region))) 8148 codeptr = OMPT_CUR_TEAM_INFO(this_thr)->master_return_address; 8149 8150 ompt_sync_region_t sync_kind = ompt_sync_region_barrier_implicit_parallel; 8151 if (this_thr->th.ompt_thread_info.parallel_flags & ompt_parallel_league) 8152 sync_kind = ompt_sync_region_barrier_teams; 8153 if (ompt_enabled.ompt_callback_sync_region_wait) { 8154 ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)( 8155 sync_kind, ompt_scope_end, NULL, task_data, codeptr); 8156 } 8157 if (ompt_enabled.ompt_callback_sync_region) { 8158 ompt_callbacks.ompt_callback(ompt_callback_sync_region)( 8159 sync_kind, ompt_scope_end, NULL, task_data, codeptr); 8160 } 8161 #endif 8162 if (!KMP_MASTER_TID(ds_tid) && ompt_enabled.ompt_callback_implicit_task) { 8163 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 8164 ompt_scope_end, NULL, task_data, 0, ds_tid, 8165 ompt_task_implicit); // TODO: Can this be ompt_task_initial? 8166 } 8167 } 8168 #endif 8169 8170 KMP_MB(); /* Flush all pending memory write invalidates. */ 8171 KMP_ASSERT(this_thr->th.th_team == team); 8172 } 8173 8174 /* ------------------------------------------------------------------------ */ 8175 8176 #ifdef USE_LOAD_BALANCE 8177 8178 // Return the worker threads actively spinning in the hot team, if we 8179 // are at the outermost level of parallelism. Otherwise, return 0. 8180 static int __kmp_active_hot_team_nproc(kmp_root_t *root) { 8181 int i; 8182 int retval; 8183 kmp_team_t *hot_team; 8184 8185 if (root->r.r_active) { 8186 return 0; 8187 } 8188 hot_team = root->r.r_hot_team; 8189 if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME) { 8190 return hot_team->t.t_nproc - 1; // Don't count primary thread 8191 } 8192 8193 // Skip the primary thread - it is accounted for elsewhere. 8194 retval = 0; 8195 for (i = 1; i < hot_team->t.t_nproc; i++) { 8196 if (hot_team->t.t_threads[i]->th.th_active) { 8197 retval++; 8198 } 8199 } 8200 return retval; 8201 } 8202 8203 // Perform an automatic adjustment to the number of 8204 // threads used by the next parallel region. 8205 static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc) { 8206 int retval; 8207 int pool_active; 8208 int hot_team_active; 8209 int team_curr_active; 8210 int system_active; 8211 8212 KB_TRACE(20, ("__kmp_load_balance_nproc: called root:%p set_nproc:%d\n", root, 8213 set_nproc)); 8214 KMP_DEBUG_ASSERT(root); 8215 KMP_DEBUG_ASSERT(root->r.r_root_team->t.t_threads[0] 8216 ->th.th_current_task->td_icvs.dynamic == TRUE); 8217 KMP_DEBUG_ASSERT(set_nproc > 1); 8218 8219 if (set_nproc == 1) { 8220 KB_TRACE(20, ("__kmp_load_balance_nproc: serial execution.\n")); 8221 return 1; 8222 } 8223 8224 // Threads that are active in the thread pool, active in the hot team for this 8225 // particular root (if we are at the outer par level), and the currently 8226 // executing thread (to become the primary thread) are available to add to the 8227 // new team, but are currently contributing to the system load, and must be 8228 // accounted for. 8229 pool_active = __kmp_thread_pool_active_nth; 8230 hot_team_active = __kmp_active_hot_team_nproc(root); 8231 team_curr_active = pool_active + hot_team_active + 1; 8232 8233 // Check the system load. 8234 system_active = __kmp_get_load_balance(__kmp_avail_proc + team_curr_active); 8235 KB_TRACE(30, ("__kmp_load_balance_nproc: system active = %d pool active = %d " 8236 "hot team active = %d\n", 8237 system_active, pool_active, hot_team_active)); 8238 8239 if (system_active < 0) { 8240 // There was an error reading the necessary info from /proc, so use the 8241 // thread limit algorithm instead. Once we set __kmp_global.g.g_dynamic_mode 8242 // = dynamic_thread_limit, we shouldn't wind up getting back here. 8243 __kmp_global.g.g_dynamic_mode = dynamic_thread_limit; 8244 KMP_WARNING(CantLoadBalUsing, "KMP_DYNAMIC_MODE=thread limit"); 8245 8246 // Make this call behave like the thread limit algorithm. 8247 retval = __kmp_avail_proc - __kmp_nth + 8248 (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc); 8249 if (retval > set_nproc) { 8250 retval = set_nproc; 8251 } 8252 if (retval < KMP_MIN_NTH) { 8253 retval = KMP_MIN_NTH; 8254 } 8255 8256 KB_TRACE(20, ("__kmp_load_balance_nproc: thread limit exit. retval:%d\n", 8257 retval)); 8258 return retval; 8259 } 8260 8261 // There is a slight delay in the load balance algorithm in detecting new 8262 // running procs. The real system load at this instant should be at least as 8263 // large as the #active omp thread that are available to add to the team. 8264 if (system_active < team_curr_active) { 8265 system_active = team_curr_active; 8266 } 8267 retval = __kmp_avail_proc - system_active + team_curr_active; 8268 if (retval > set_nproc) { 8269 retval = set_nproc; 8270 } 8271 if (retval < KMP_MIN_NTH) { 8272 retval = KMP_MIN_NTH; 8273 } 8274 8275 KB_TRACE(20, ("__kmp_load_balance_nproc: exit. retval:%d\n", retval)); 8276 return retval; 8277 } // __kmp_load_balance_nproc() 8278 8279 #endif /* USE_LOAD_BALANCE */ 8280 8281 /* ------------------------------------------------------------------------ */ 8282 8283 /* NOTE: this is called with the __kmp_init_lock held */ 8284 void __kmp_cleanup(void) { 8285 int f; 8286 8287 KA_TRACE(10, ("__kmp_cleanup: enter\n")); 8288 8289 if (TCR_4(__kmp_init_parallel)) { 8290 #if KMP_HANDLE_SIGNALS 8291 __kmp_remove_signals(); 8292 #endif 8293 TCW_4(__kmp_init_parallel, FALSE); 8294 } 8295 8296 if (TCR_4(__kmp_init_middle)) { 8297 #if KMP_AFFINITY_SUPPORTED 8298 __kmp_affinity_uninitialize(); 8299 #endif /* KMP_AFFINITY_SUPPORTED */ 8300 __kmp_cleanup_hierarchy(); 8301 TCW_4(__kmp_init_middle, FALSE); 8302 } 8303 8304 KA_TRACE(10, ("__kmp_cleanup: go serial cleanup\n")); 8305 8306 if (__kmp_init_serial) { 8307 __kmp_runtime_destroy(); 8308 __kmp_init_serial = FALSE; 8309 } 8310 8311 __kmp_cleanup_threadprivate_caches(); 8312 8313 for (f = 0; f < __kmp_threads_capacity; f++) { 8314 if (__kmp_root[f] != NULL) { 8315 __kmp_free(__kmp_root[f]); 8316 __kmp_root[f] = NULL; 8317 } 8318 } 8319 __kmp_free(__kmp_threads); 8320 // __kmp_threads and __kmp_root were allocated at once, as single block, so 8321 // there is no need in freeing __kmp_root. 8322 __kmp_threads = NULL; 8323 __kmp_root = NULL; 8324 __kmp_threads_capacity = 0; 8325 8326 // Free old __kmp_threads arrays if they exist. 8327 kmp_old_threads_list_t *ptr = __kmp_old_threads_list; 8328 while (ptr) { 8329 kmp_old_threads_list_t *next = ptr->next; 8330 __kmp_free(ptr->threads); 8331 __kmp_free(ptr); 8332 ptr = next; 8333 } 8334 8335 #if KMP_USE_DYNAMIC_LOCK 8336 __kmp_cleanup_indirect_user_locks(); 8337 #else 8338 __kmp_cleanup_user_locks(); 8339 #endif 8340 #if OMPD_SUPPORT 8341 if (ompd_state) { 8342 __kmp_free(ompd_env_block); 8343 ompd_env_block = NULL; 8344 ompd_env_block_size = 0; 8345 } 8346 #endif 8347 8348 #if KMP_AFFINITY_SUPPORTED 8349 KMP_INTERNAL_FREE(CCAST(char *, __kmp_cpuinfo_file)); 8350 __kmp_cpuinfo_file = NULL; 8351 #endif /* KMP_AFFINITY_SUPPORTED */ 8352 8353 #if KMP_USE_ADAPTIVE_LOCKS 8354 #if KMP_DEBUG_ADAPTIVE_LOCKS 8355 __kmp_print_speculative_stats(); 8356 #endif 8357 #endif 8358 KMP_INTERNAL_FREE(__kmp_nested_nth.nth); 8359 __kmp_nested_nth.nth = NULL; 8360 __kmp_nested_nth.size = 0; 8361 __kmp_nested_nth.used = 0; 8362 8363 KMP_INTERNAL_FREE(__kmp_nested_proc_bind.bind_types); 8364 __kmp_nested_proc_bind.bind_types = NULL; 8365 __kmp_nested_proc_bind.size = 0; 8366 __kmp_nested_proc_bind.used = 0; 8367 if (__kmp_affinity_format) { 8368 KMP_INTERNAL_FREE(__kmp_affinity_format); 8369 __kmp_affinity_format = NULL; 8370 } 8371 8372 __kmp_i18n_catclose(); 8373 8374 #if KMP_USE_HIER_SCHED 8375 __kmp_hier_scheds.deallocate(); 8376 #endif 8377 8378 #if KMP_STATS_ENABLED 8379 __kmp_stats_fini(); 8380 #endif 8381 8382 KA_TRACE(10, ("__kmp_cleanup: exit\n")); 8383 } 8384 8385 /* ------------------------------------------------------------------------ */ 8386 8387 int __kmp_ignore_mppbeg(void) { 8388 char *env; 8389 8390 if ((env = getenv("KMP_IGNORE_MPPBEG")) != NULL) { 8391 if (__kmp_str_match_false(env)) 8392 return FALSE; 8393 } 8394 // By default __kmpc_begin() is no-op. 8395 return TRUE; 8396 } 8397 8398 int __kmp_ignore_mppend(void) { 8399 char *env; 8400 8401 if ((env = getenv("KMP_IGNORE_MPPEND")) != NULL) { 8402 if (__kmp_str_match_false(env)) 8403 return FALSE; 8404 } 8405 // By default __kmpc_end() is no-op. 8406 return TRUE; 8407 } 8408 8409 void __kmp_internal_begin(void) { 8410 int gtid; 8411 kmp_root_t *root; 8412 8413 /* this is a very important step as it will register new sibling threads 8414 and assign these new uber threads a new gtid */ 8415 gtid = __kmp_entry_gtid(); 8416 root = __kmp_threads[gtid]->th.th_root; 8417 KMP_ASSERT(KMP_UBER_GTID(gtid)); 8418 8419 if (root->r.r_begin) 8420 return; 8421 __kmp_acquire_lock(&root->r.r_begin_lock, gtid); 8422 if (root->r.r_begin) { 8423 __kmp_release_lock(&root->r.r_begin_lock, gtid); 8424 return; 8425 } 8426 8427 root->r.r_begin = TRUE; 8428 8429 __kmp_release_lock(&root->r.r_begin_lock, gtid); 8430 } 8431 8432 /* ------------------------------------------------------------------------ */ 8433 8434 void __kmp_user_set_library(enum library_type arg) { 8435 int gtid; 8436 kmp_root_t *root; 8437 kmp_info_t *thread; 8438 8439 /* first, make sure we are initialized so we can get our gtid */ 8440 8441 gtid = __kmp_entry_gtid(); 8442 thread = __kmp_threads[gtid]; 8443 8444 root = thread->th.th_root; 8445 8446 KA_TRACE(20, ("__kmp_user_set_library: enter T#%d, arg: %d, %d\n", gtid, arg, 8447 library_serial)); 8448 if (root->r.r_in_parallel) { /* Must be called in serial section of top-level 8449 thread */ 8450 KMP_WARNING(SetLibraryIncorrectCall); 8451 return; 8452 } 8453 8454 switch (arg) { 8455 case library_serial: 8456 thread->th.th_set_nproc = 0; 8457 set__nproc(thread, 1); 8458 break; 8459 case library_turnaround: 8460 thread->th.th_set_nproc = 0; 8461 set__nproc(thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth 8462 : __kmp_dflt_team_nth_ub); 8463 break; 8464 case library_throughput: 8465 thread->th.th_set_nproc = 0; 8466 set__nproc(thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth 8467 : __kmp_dflt_team_nth_ub); 8468 break; 8469 default: 8470 KMP_FATAL(UnknownLibraryType, arg); 8471 } 8472 8473 __kmp_aux_set_library(arg); 8474 } 8475 8476 void __kmp_aux_set_stacksize(size_t arg) { 8477 if (!__kmp_init_serial) 8478 __kmp_serial_initialize(); 8479 8480 #if KMP_OS_DARWIN 8481 if (arg & (0x1000 - 1)) { 8482 arg &= ~(0x1000 - 1); 8483 if (arg + 0x1000) /* check for overflow if we round up */ 8484 arg += 0x1000; 8485 } 8486 #endif 8487 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock); 8488 8489 /* only change the default stacksize before the first parallel region */ 8490 if (!TCR_4(__kmp_init_parallel)) { 8491 size_t value = arg; /* argument is in bytes */ 8492 8493 if (value < __kmp_sys_min_stksize) 8494 value = __kmp_sys_min_stksize; 8495 else if (value > KMP_MAX_STKSIZE) 8496 value = KMP_MAX_STKSIZE; 8497 8498 __kmp_stksize = value; 8499 8500 __kmp_env_stksize = TRUE; /* was KMP_STACKSIZE specified? */ 8501 } 8502 8503 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 8504 } 8505 8506 /* set the behaviour of the runtime library */ 8507 /* TODO this can cause some odd behaviour with sibling parallelism... */ 8508 void __kmp_aux_set_library(enum library_type arg) { 8509 __kmp_library = arg; 8510 8511 switch (__kmp_library) { 8512 case library_serial: { 8513 KMP_INFORM(LibraryIsSerial); 8514 } break; 8515 case library_turnaround: 8516 if (__kmp_use_yield == 1 && !__kmp_use_yield_exp_set) 8517 __kmp_use_yield = 2; // only yield when oversubscribed 8518 break; 8519 case library_throughput: 8520 if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME) 8521 __kmp_dflt_blocktime = KMP_DEFAULT_BLOCKTIME; 8522 break; 8523 default: 8524 KMP_FATAL(UnknownLibraryType, arg); 8525 } 8526 } 8527 8528 /* Getting team information common for all team API */ 8529 // Returns NULL if not in teams construct 8530 static kmp_team_t *__kmp_aux_get_team_info(int &teams_serialized) { 8531 kmp_info_t *thr = __kmp_entry_thread(); 8532 teams_serialized = 0; 8533 if (thr->th.th_teams_microtask) { 8534 kmp_team_t *team = thr->th.th_team; 8535 int tlevel = thr->th.th_teams_level; // the level of the teams construct 8536 int ii = team->t.t_level; 8537 teams_serialized = team->t.t_serialized; 8538 int level = tlevel + 1; 8539 KMP_DEBUG_ASSERT(ii >= tlevel); 8540 while (ii > level) { 8541 for (teams_serialized = team->t.t_serialized; 8542 (teams_serialized > 0) && (ii > level); teams_serialized--, ii--) { 8543 } 8544 if (team->t.t_serialized && (!teams_serialized)) { 8545 team = team->t.t_parent; 8546 continue; 8547 } 8548 if (ii > level) { 8549 team = team->t.t_parent; 8550 ii--; 8551 } 8552 } 8553 return team; 8554 } 8555 return NULL; 8556 } 8557 8558 int __kmp_aux_get_team_num() { 8559 int serialized; 8560 kmp_team_t *team = __kmp_aux_get_team_info(serialized); 8561 if (team) { 8562 if (serialized > 1) { 8563 return 0; // teams region is serialized ( 1 team of 1 thread ). 8564 } else { 8565 return team->t.t_master_tid; 8566 } 8567 } 8568 return 0; 8569 } 8570 8571 int __kmp_aux_get_num_teams() { 8572 int serialized; 8573 kmp_team_t *team = __kmp_aux_get_team_info(serialized); 8574 if (team) { 8575 if (serialized > 1) { 8576 return 1; 8577 } else { 8578 return team->t.t_parent->t.t_nproc; 8579 } 8580 } 8581 return 1; 8582 } 8583 8584 /* ------------------------------------------------------------------------ */ 8585 8586 /* 8587 * Affinity Format Parser 8588 * 8589 * Field is in form of: %[[[0].]size]type 8590 * % and type are required (%% means print a literal '%') 8591 * type is either single char or long name surrounded by {}, 8592 * e.g., N or {num_threads} 8593 * 0 => leading zeros 8594 * . => right justified when size is specified 8595 * by default output is left justified 8596 * size is the *minimum* field length 8597 * All other characters are printed as is 8598 * 8599 * Available field types: 8600 * L {thread_level} - omp_get_level() 8601 * n {thread_num} - omp_get_thread_num() 8602 * h {host} - name of host machine 8603 * P {process_id} - process id (integer) 8604 * T {thread_identifier} - native thread identifier (integer) 8605 * N {num_threads} - omp_get_num_threads() 8606 * A {ancestor_tnum} - omp_get_ancestor_thread_num(omp_get_level()-1) 8607 * a {thread_affinity} - comma separated list of integers or integer ranges 8608 * (values of affinity mask) 8609 * 8610 * Implementation-specific field types can be added 8611 * If a type is unknown, print "undefined" 8612 */ 8613 8614 // Structure holding the short name, long name, and corresponding data type 8615 // for snprintf. A table of these will represent the entire valid keyword 8616 // field types. 8617 typedef struct kmp_affinity_format_field_t { 8618 char short_name; // from spec e.g., L -> thread level 8619 const char *long_name; // from spec thread_level -> thread level 8620 char field_format; // data type for snprintf (typically 'd' or 's' 8621 // for integer or string) 8622 } kmp_affinity_format_field_t; 8623 8624 static const kmp_affinity_format_field_t __kmp_affinity_format_table[] = { 8625 #if KMP_AFFINITY_SUPPORTED 8626 {'A', "thread_affinity", 's'}, 8627 #endif 8628 {'t', "team_num", 'd'}, 8629 {'T', "num_teams", 'd'}, 8630 {'L', "nesting_level", 'd'}, 8631 {'n', "thread_num", 'd'}, 8632 {'N', "num_threads", 'd'}, 8633 {'a', "ancestor_tnum", 'd'}, 8634 {'H', "host", 's'}, 8635 {'P', "process_id", 'd'}, 8636 {'i', "native_thread_id", 'd'}}; 8637 8638 // Return the number of characters it takes to hold field 8639 static int __kmp_aux_capture_affinity_field(int gtid, const kmp_info_t *th, 8640 const char **ptr, 8641 kmp_str_buf_t *field_buffer) { 8642 int rc, format_index, field_value; 8643 const char *width_left, *width_right; 8644 bool pad_zeros, right_justify, parse_long_name, found_valid_name; 8645 static const int FORMAT_SIZE = 20; 8646 char format[FORMAT_SIZE] = {0}; 8647 char absolute_short_name = 0; 8648 8649 KMP_DEBUG_ASSERT(gtid >= 0); 8650 KMP_DEBUG_ASSERT(th); 8651 KMP_DEBUG_ASSERT(**ptr == '%'); 8652 KMP_DEBUG_ASSERT(field_buffer); 8653 8654 __kmp_str_buf_clear(field_buffer); 8655 8656 // Skip the initial % 8657 (*ptr)++; 8658 8659 // Check for %% first 8660 if (**ptr == '%') { 8661 __kmp_str_buf_cat(field_buffer, "%", 1); 8662 (*ptr)++; // skip over the second % 8663 return 1; 8664 } 8665 8666 // Parse field modifiers if they are present 8667 pad_zeros = false; 8668 if (**ptr == '0') { 8669 pad_zeros = true; 8670 (*ptr)++; // skip over 0 8671 } 8672 right_justify = false; 8673 if (**ptr == '.') { 8674 right_justify = true; 8675 (*ptr)++; // skip over . 8676 } 8677 // Parse width of field: [width_left, width_right) 8678 width_left = width_right = NULL; 8679 if (**ptr >= '0' && **ptr <= '9') { 8680 width_left = *ptr; 8681 SKIP_DIGITS(*ptr); 8682 width_right = *ptr; 8683 } 8684 8685 // Create the format for KMP_SNPRINTF based on flags parsed above 8686 format_index = 0; 8687 format[format_index++] = '%'; 8688 if (!right_justify) 8689 format[format_index++] = '-'; 8690 if (pad_zeros) 8691 format[format_index++] = '0'; 8692 if (width_left && width_right) { 8693 int i = 0; 8694 // Only allow 8 digit number widths. 8695 // This also prevents overflowing format variable 8696 while (i < 8 && width_left < width_right) { 8697 format[format_index++] = *width_left; 8698 width_left++; 8699 i++; 8700 } 8701 } 8702 8703 // Parse a name (long or short) 8704 // Canonicalize the name into absolute_short_name 8705 found_valid_name = false; 8706 parse_long_name = (**ptr == '{'); 8707 if (parse_long_name) 8708 (*ptr)++; // skip initial left brace 8709 for (size_t i = 0; i < sizeof(__kmp_affinity_format_table) / 8710 sizeof(__kmp_affinity_format_table[0]); 8711 ++i) { 8712 char short_name = __kmp_affinity_format_table[i].short_name; 8713 const char *long_name = __kmp_affinity_format_table[i].long_name; 8714 char field_format = __kmp_affinity_format_table[i].field_format; 8715 if (parse_long_name) { 8716 size_t length = KMP_STRLEN(long_name); 8717 if (strncmp(*ptr, long_name, length) == 0) { 8718 found_valid_name = true; 8719 (*ptr) += length; // skip the long name 8720 } 8721 } else if (**ptr == short_name) { 8722 found_valid_name = true; 8723 (*ptr)++; // skip the short name 8724 } 8725 if (found_valid_name) { 8726 format[format_index++] = field_format; 8727 format[format_index++] = '\0'; 8728 absolute_short_name = short_name; 8729 break; 8730 } 8731 } 8732 if (parse_long_name) { 8733 if (**ptr != '}') { 8734 absolute_short_name = 0; 8735 } else { 8736 (*ptr)++; // skip over the right brace 8737 } 8738 } 8739 8740 // Attempt to fill the buffer with the requested 8741 // value using snprintf within __kmp_str_buf_print() 8742 switch (absolute_short_name) { 8743 case 't': 8744 rc = __kmp_str_buf_print(field_buffer, format, __kmp_aux_get_team_num()); 8745 break; 8746 case 'T': 8747 rc = __kmp_str_buf_print(field_buffer, format, __kmp_aux_get_num_teams()); 8748 break; 8749 case 'L': 8750 rc = __kmp_str_buf_print(field_buffer, format, th->th.th_team->t.t_level); 8751 break; 8752 case 'n': 8753 rc = __kmp_str_buf_print(field_buffer, format, __kmp_tid_from_gtid(gtid)); 8754 break; 8755 case 'H': { 8756 static const int BUFFER_SIZE = 256; 8757 char buf[BUFFER_SIZE]; 8758 __kmp_expand_host_name(buf, BUFFER_SIZE); 8759 rc = __kmp_str_buf_print(field_buffer, format, buf); 8760 } break; 8761 case 'P': 8762 rc = __kmp_str_buf_print(field_buffer, format, getpid()); 8763 break; 8764 case 'i': 8765 rc = __kmp_str_buf_print(field_buffer, format, __kmp_gettid()); 8766 break; 8767 case 'N': 8768 rc = __kmp_str_buf_print(field_buffer, format, th->th.th_team->t.t_nproc); 8769 break; 8770 case 'a': 8771 field_value = 8772 __kmp_get_ancestor_thread_num(gtid, th->th.th_team->t.t_level - 1); 8773 rc = __kmp_str_buf_print(field_buffer, format, field_value); 8774 break; 8775 #if KMP_AFFINITY_SUPPORTED 8776 case 'A': { 8777 kmp_str_buf_t buf; 8778 __kmp_str_buf_init(&buf); 8779 __kmp_affinity_str_buf_mask(&buf, th->th.th_affin_mask); 8780 rc = __kmp_str_buf_print(field_buffer, format, buf.str); 8781 __kmp_str_buf_free(&buf); 8782 } break; 8783 #endif 8784 default: 8785 // According to spec, If an implementation does not have info for field 8786 // type, then "undefined" is printed 8787 rc = __kmp_str_buf_print(field_buffer, "%s", "undefined"); 8788 // Skip the field 8789 if (parse_long_name) { 8790 SKIP_TOKEN(*ptr); 8791 if (**ptr == '}') 8792 (*ptr)++; 8793 } else { 8794 (*ptr)++; 8795 } 8796 } 8797 8798 KMP_ASSERT(format_index <= FORMAT_SIZE); 8799 return rc; 8800 } 8801 8802 /* 8803 * Return number of characters needed to hold the affinity string 8804 * (not including null byte character) 8805 * The resultant string is printed to buffer, which the caller can then 8806 * handle afterwards 8807 */ 8808 size_t __kmp_aux_capture_affinity(int gtid, const char *format, 8809 kmp_str_buf_t *buffer) { 8810 const char *parse_ptr; 8811 size_t retval; 8812 const kmp_info_t *th; 8813 kmp_str_buf_t field; 8814 8815 KMP_DEBUG_ASSERT(buffer); 8816 KMP_DEBUG_ASSERT(gtid >= 0); 8817 8818 __kmp_str_buf_init(&field); 8819 __kmp_str_buf_clear(buffer); 8820 8821 th = __kmp_threads[gtid]; 8822 retval = 0; 8823 8824 // If format is NULL or zero-length string, then we use 8825 // affinity-format-var ICV 8826 parse_ptr = format; 8827 if (parse_ptr == NULL || *parse_ptr == '\0') { 8828 parse_ptr = __kmp_affinity_format; 8829 } 8830 KMP_DEBUG_ASSERT(parse_ptr); 8831 8832 while (*parse_ptr != '\0') { 8833 // Parse a field 8834 if (*parse_ptr == '%') { 8835 // Put field in the buffer 8836 int rc = __kmp_aux_capture_affinity_field(gtid, th, &parse_ptr, &field); 8837 __kmp_str_buf_catbuf(buffer, &field); 8838 retval += rc; 8839 } else { 8840 // Put literal character in buffer 8841 __kmp_str_buf_cat(buffer, parse_ptr, 1); 8842 retval++; 8843 parse_ptr++; 8844 } 8845 } 8846 __kmp_str_buf_free(&field); 8847 return retval; 8848 } 8849 8850 // Displays the affinity string to stdout 8851 void __kmp_aux_display_affinity(int gtid, const char *format) { 8852 kmp_str_buf_t buf; 8853 __kmp_str_buf_init(&buf); 8854 __kmp_aux_capture_affinity(gtid, format, &buf); 8855 __kmp_fprintf(kmp_out, "%s" KMP_END_OF_LINE, buf.str); 8856 __kmp_str_buf_free(&buf); 8857 } 8858 8859 /* ------------------------------------------------------------------------ */ 8860 void __kmp_aux_set_blocktime(int arg, kmp_info_t *thread, int tid) { 8861 int blocktime = arg; /* argument is in microseconds */ 8862 #if KMP_USE_MONITOR 8863 int bt_intervals; 8864 #endif 8865 kmp_int8 bt_set; 8866 8867 __kmp_save_internal_controls(thread); 8868 8869 /* Normalize and set blocktime for the teams */ 8870 if (blocktime < KMP_MIN_BLOCKTIME) 8871 blocktime = KMP_MIN_BLOCKTIME; 8872 else if (blocktime > KMP_MAX_BLOCKTIME) 8873 blocktime = KMP_MAX_BLOCKTIME; 8874 8875 set__blocktime_team(thread->th.th_team, tid, blocktime); 8876 set__blocktime_team(thread->th.th_serial_team, 0, blocktime); 8877 8878 #if KMP_USE_MONITOR 8879 /* Calculate and set blocktime intervals for the teams */ 8880 bt_intervals = KMP_INTERVALS_FROM_BLOCKTIME(blocktime, __kmp_monitor_wakeups); 8881 8882 set__bt_intervals_team(thread->th.th_team, tid, bt_intervals); 8883 set__bt_intervals_team(thread->th.th_serial_team, 0, bt_intervals); 8884 #endif 8885 8886 /* Set whether blocktime has been set to "TRUE" */ 8887 bt_set = TRUE; 8888 8889 set__bt_set_team(thread->th.th_team, tid, bt_set); 8890 set__bt_set_team(thread->th.th_serial_team, 0, bt_set); 8891 #if KMP_USE_MONITOR 8892 KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d, " 8893 "bt_intervals=%d, monitor_updates=%d\n", 8894 __kmp_gtid_from_tid(tid, thread->th.th_team), 8895 thread->th.th_team->t.t_id, tid, blocktime, bt_intervals, 8896 __kmp_monitor_wakeups)); 8897 #else 8898 KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d\n", 8899 __kmp_gtid_from_tid(tid, thread->th.th_team), 8900 thread->th.th_team->t.t_id, tid, blocktime)); 8901 #endif 8902 } 8903 8904 void __kmp_aux_set_defaults(char const *str, size_t len) { 8905 if (!__kmp_init_serial) { 8906 __kmp_serial_initialize(); 8907 } 8908 __kmp_env_initialize(str); 8909 8910 if (__kmp_settings || __kmp_display_env || __kmp_display_env_verbose) { 8911 __kmp_env_print(); 8912 } 8913 } // __kmp_aux_set_defaults 8914 8915 /* ------------------------------------------------------------------------ */ 8916 /* internal fast reduction routines */ 8917 8918 PACKED_REDUCTION_METHOD_T 8919 __kmp_determine_reduction_method( 8920 ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars, size_t reduce_size, 8921 void *reduce_data, void (*reduce_func)(void *lhs_data, void *rhs_data), 8922 kmp_critical_name *lck) { 8923 8924 // Default reduction method: critical construct ( lck != NULL, like in current 8925 // PAROPT ) 8926 // If ( reduce_data!=NULL && reduce_func!=NULL ): the tree-reduction method 8927 // can be selected by RTL 8928 // If loc->flags contains KMP_IDENT_ATOMIC_REDUCE, the atomic reduce method 8929 // can be selected by RTL 8930 // Finally, it's up to OpenMP RTL to make a decision on which method to select 8931 // among generated by PAROPT. 8932 8933 PACKED_REDUCTION_METHOD_T retval; 8934 8935 int team_size; 8936 8937 KMP_DEBUG_ASSERT(lck); // it would be nice to test ( lck != 0 ) 8938 8939 #define FAST_REDUCTION_ATOMIC_METHOD_GENERATED \ 8940 (loc && \ 8941 ((loc->flags & (KMP_IDENT_ATOMIC_REDUCE)) == (KMP_IDENT_ATOMIC_REDUCE))) 8942 #define FAST_REDUCTION_TREE_METHOD_GENERATED ((reduce_data) && (reduce_func)) 8943 8944 retval = critical_reduce_block; 8945 8946 // another choice of getting a team size (with 1 dynamic deference) is slower 8947 team_size = __kmp_get_team_num_threads(global_tid); 8948 if (team_size == 1) { 8949 8950 retval = empty_reduce_block; 8951 8952 } else { 8953 8954 int atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED; 8955 8956 #if KMP_ARCH_X86_64 || KMP_ARCH_PPC64 || KMP_ARCH_AARCH64 || \ 8957 KMP_ARCH_MIPS64 || KMP_ARCH_RISCV64 || KMP_ARCH_LOONGARCH64 || \ 8958 KMP_ARCH_VE || KMP_ARCH_S390X || KMP_ARCH_WASM 8959 8960 #if KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD || \ 8961 KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN || KMP_OS_HAIKU || \ 8962 KMP_OS_HURD || KMP_OS_SOLARIS || KMP_OS_WASI || KMP_OS_AIX 8963 8964 int teamsize_cutoff = 4; 8965 8966 #if KMP_MIC_SUPPORTED 8967 if (__kmp_mic_type != non_mic) { 8968 teamsize_cutoff = 8; 8969 } 8970 #endif 8971 int tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED; 8972 if (tree_available) { 8973 if (team_size <= teamsize_cutoff) { 8974 if (atomic_available) { 8975 retval = atomic_reduce_block; 8976 } 8977 } else { 8978 retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER; 8979 } 8980 } else if (atomic_available) { 8981 retval = atomic_reduce_block; 8982 } 8983 #else 8984 #error "Unknown or unsupported OS" 8985 #endif // KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD || 8986 // KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN || KMP_OS_HAIKU || 8987 // KMP_OS_HURD || KMP_OS_SOLARIS || KMP_OS_WASI || KMP_OS_AIX 8988 8989 #elif KMP_ARCH_X86 || KMP_ARCH_ARM || KMP_ARCH_AARCH || KMP_ARCH_MIPS || \ 8990 KMP_ARCH_WASM || KMP_ARCH_PPC || KMP_ARCH_AARCH64_32 || KMP_ARCH_SPARC 8991 8992 #if KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD || \ 8993 KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_HAIKU || KMP_OS_HURD || \ 8994 KMP_OS_SOLARIS || KMP_OS_WASI || KMP_OS_AIX 8995 8996 // basic tuning 8997 8998 if (atomic_available) { 8999 if (num_vars <= 2) { // && ( team_size <= 8 ) due to false-sharing ??? 9000 retval = atomic_reduce_block; 9001 } 9002 } // otherwise: use critical section 9003 9004 #elif KMP_OS_DARWIN 9005 9006 int tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED; 9007 if (atomic_available && (num_vars <= 3)) { 9008 retval = atomic_reduce_block; 9009 } else if (tree_available) { 9010 if ((reduce_size > (9 * sizeof(kmp_real64))) && 9011 (reduce_size < (2000 * sizeof(kmp_real64)))) { 9012 retval = TREE_REDUCE_BLOCK_WITH_PLAIN_BARRIER; 9013 } 9014 } // otherwise: use critical section 9015 9016 #else 9017 #error "Unknown or unsupported OS" 9018 #endif 9019 9020 #else 9021 #error "Unknown or unsupported architecture" 9022 #endif 9023 } 9024 9025 // KMP_FORCE_REDUCTION 9026 9027 // If the team is serialized (team_size == 1), ignore the forced reduction 9028 // method and stay with the unsynchronized method (empty_reduce_block) 9029 if (__kmp_force_reduction_method != reduction_method_not_defined && 9030 team_size != 1) { 9031 9032 PACKED_REDUCTION_METHOD_T forced_retval = critical_reduce_block; 9033 9034 int atomic_available, tree_available; 9035 9036 switch ((forced_retval = __kmp_force_reduction_method)) { 9037 case critical_reduce_block: 9038 KMP_ASSERT(lck); // lck should be != 0 9039 break; 9040 9041 case atomic_reduce_block: 9042 atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED; 9043 if (!atomic_available) { 9044 KMP_WARNING(RedMethodNotSupported, "atomic"); 9045 forced_retval = critical_reduce_block; 9046 } 9047 break; 9048 9049 case tree_reduce_block: 9050 tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED; 9051 if (!tree_available) { 9052 KMP_WARNING(RedMethodNotSupported, "tree"); 9053 forced_retval = critical_reduce_block; 9054 } else { 9055 #if KMP_FAST_REDUCTION_BARRIER 9056 forced_retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER; 9057 #endif 9058 } 9059 break; 9060 9061 default: 9062 KMP_ASSERT(0); // "unsupported method specified" 9063 } 9064 9065 retval = forced_retval; 9066 } 9067 9068 KA_TRACE(10, ("reduction method selected=%08x\n", retval)); 9069 9070 #undef FAST_REDUCTION_TREE_METHOD_GENERATED 9071 #undef FAST_REDUCTION_ATOMIC_METHOD_GENERATED 9072 9073 return (retval); 9074 } 9075 // this function is for testing set/get/determine reduce method 9076 kmp_int32 __kmp_get_reduce_method(void) { 9077 return ((__kmp_entry_thread()->th.th_local.packed_reduction_method) >> 8); 9078 } 9079 9080 // Soft pause sets up threads to ignore blocktime and just go to sleep. 9081 // Spin-wait code checks __kmp_pause_status and reacts accordingly. 9082 void __kmp_soft_pause() { __kmp_pause_status = kmp_soft_paused; } 9083 9084 // Hard pause shuts down the runtime completely. Resume happens naturally when 9085 // OpenMP is used subsequently. 9086 void __kmp_hard_pause() { 9087 __kmp_pause_status = kmp_hard_paused; 9088 __kmp_internal_end_thread(-1); 9089 } 9090 9091 // Soft resume sets __kmp_pause_status, and wakes up all threads. 9092 void __kmp_resume_if_soft_paused() { 9093 if (__kmp_pause_status == kmp_soft_paused) { 9094 __kmp_pause_status = kmp_not_paused; 9095 9096 for (int gtid = 1; gtid < __kmp_threads_capacity; ++gtid) { 9097 kmp_info_t *thread = __kmp_threads[gtid]; 9098 if (thread) { // Wake it if sleeping 9099 kmp_flag_64<> fl(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go, 9100 thread); 9101 if (fl.is_sleeping()) 9102 fl.resume(gtid); 9103 else if (__kmp_try_suspend_mx(thread)) { // got suspend lock 9104 __kmp_unlock_suspend_mx(thread); // unlock it; it won't sleep 9105 } else { // thread holds the lock and may sleep soon 9106 do { // until either the thread sleeps, or we can get the lock 9107 if (fl.is_sleeping()) { 9108 fl.resume(gtid); 9109 break; 9110 } else if (__kmp_try_suspend_mx(thread)) { 9111 __kmp_unlock_suspend_mx(thread); 9112 break; 9113 } 9114 } while (1); 9115 } 9116 } 9117 } 9118 } 9119 } 9120 9121 // This function is called via __kmpc_pause_resource. Returns 0 if successful. 9122 // TODO: add warning messages 9123 int __kmp_pause_resource(kmp_pause_status_t level) { 9124 if (level == kmp_not_paused) { // requesting resume 9125 if (__kmp_pause_status == kmp_not_paused) { 9126 // error message about runtime not being paused, so can't resume 9127 return 1; 9128 } else { 9129 KMP_DEBUG_ASSERT(__kmp_pause_status == kmp_soft_paused || 9130 __kmp_pause_status == kmp_hard_paused); 9131 __kmp_pause_status = kmp_not_paused; 9132 return 0; 9133 } 9134 } else if (level == kmp_soft_paused) { // requesting soft pause 9135 if (__kmp_pause_status != kmp_not_paused) { 9136 // error message about already being paused 9137 return 1; 9138 } else { 9139 __kmp_soft_pause(); 9140 return 0; 9141 } 9142 } else if (level == kmp_hard_paused || level == kmp_stop_tool_paused) { 9143 // requesting hard pause or stop_tool pause 9144 if (__kmp_pause_status != kmp_not_paused) { 9145 // error message about already being paused 9146 return 1; 9147 } else { 9148 __kmp_hard_pause(); 9149 return 0; 9150 } 9151 } else { 9152 // error message about invalid level 9153 return 1; 9154 } 9155 } 9156 9157 void __kmp_omp_display_env(int verbose) { 9158 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock); 9159 if (__kmp_init_serial == 0) 9160 __kmp_do_serial_initialize(); 9161 __kmp_display_env_impl(!verbose, verbose); 9162 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 9163 } 9164 9165 // The team size is changing, so distributed barrier must be modified 9166 void __kmp_resize_dist_barrier(kmp_team_t *team, int old_nthreads, 9167 int new_nthreads) { 9168 KMP_DEBUG_ASSERT(__kmp_barrier_release_pattern[bs_forkjoin_barrier] == 9169 bp_dist_bar); 9170 kmp_info_t **other_threads = team->t.t_threads; 9171 9172 // We want all the workers to stop waiting on the barrier while we adjust the 9173 // size of the team. 9174 for (int f = 1; f < old_nthreads; ++f) { 9175 KMP_DEBUG_ASSERT(other_threads[f] != NULL); 9176 // Ignore threads that are already inactive or not present in the team 9177 if (team->t.t_threads[f]->th.th_used_in_team.load() == 0) { 9178 // teams construct causes thread_limit to get passed in, and some of 9179 // those could be inactive; just ignore them 9180 continue; 9181 } 9182 // If thread is transitioning still to in_use state, wait for it 9183 if (team->t.t_threads[f]->th.th_used_in_team.load() == 3) { 9184 while (team->t.t_threads[f]->th.th_used_in_team.load() == 3) 9185 KMP_CPU_PAUSE(); 9186 } 9187 // The thread should be in_use now 9188 KMP_DEBUG_ASSERT(team->t.t_threads[f]->th.th_used_in_team.load() == 1); 9189 // Transition to unused state 9190 team->t.t_threads[f]->th.th_used_in_team.store(2); 9191 KMP_DEBUG_ASSERT(team->t.t_threads[f]->th.th_used_in_team.load() == 2); 9192 } 9193 // Release all the workers 9194 team->t.b->go_release(); 9195 9196 KMP_MFENCE(); 9197 9198 // Workers should see transition status 2 and move to 0; but may need to be 9199 // woken up first 9200 int count = old_nthreads - 1; 9201 while (count > 0) { 9202 count = old_nthreads - 1; 9203 for (int f = 1; f < old_nthreads; ++f) { 9204 if (other_threads[f]->th.th_used_in_team.load() != 0) { 9205 if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) { // Wake up the workers 9206 kmp_atomic_flag_64<> *flag = (kmp_atomic_flag_64<> *)CCAST( 9207 void *, other_threads[f]->th.th_sleep_loc); 9208 __kmp_atomic_resume_64(other_threads[f]->th.th_info.ds.ds_gtid, flag); 9209 } 9210 } else { 9211 KMP_DEBUG_ASSERT(team->t.t_threads[f]->th.th_used_in_team.load() == 0); 9212 count--; 9213 } 9214 } 9215 } 9216 // Now update the barrier size 9217 team->t.b->update_num_threads(new_nthreads); 9218 team->t.b->go_reset(); 9219 } 9220 9221 void __kmp_add_threads_to_team(kmp_team_t *team, int new_nthreads) { 9222 // Add the threads back to the team 9223 KMP_DEBUG_ASSERT(team); 9224 // Threads were paused and pointed at th_used_in_team temporarily during a 9225 // resize of the team. We're going to set th_used_in_team to 3 to indicate to 9226 // the thread that it should transition itself back into the team. Then, if 9227 // blocktime isn't infinite, the thread could be sleeping, so we send a resume 9228 // to wake it up. 9229 for (int f = 1; f < new_nthreads; ++f) { 9230 KMP_DEBUG_ASSERT(team->t.t_threads[f]); 9231 (void)KMP_COMPARE_AND_STORE_ACQ32( 9232 &(team->t.t_threads[f]->th.th_used_in_team), 0, 3); 9233 if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) { // Wake up sleeping threads 9234 __kmp_resume_32(team->t.t_threads[f]->th.th_info.ds.ds_gtid, 9235 (kmp_flag_32<false, false> *)NULL); 9236 } 9237 } 9238 // The threads should be transitioning to the team; when they are done, they 9239 // should have set th_used_in_team to 1. This loop forces master to wait until 9240 // all threads have moved into the team and are waiting in the barrier. 9241 int count = new_nthreads - 1; 9242 while (count > 0) { 9243 count = new_nthreads - 1; 9244 for (int f = 1; f < new_nthreads; ++f) { 9245 if (team->t.t_threads[f]->th.th_used_in_team.load() == 1) { 9246 count--; 9247 } 9248 } 9249 } 9250 } 9251 9252 // Globals and functions for hidden helper task 9253 kmp_info_t **__kmp_hidden_helper_threads; 9254 kmp_info_t *__kmp_hidden_helper_main_thread; 9255 std::atomic<kmp_int32> __kmp_unexecuted_hidden_helper_tasks; 9256 #if KMP_OS_LINUX 9257 kmp_int32 __kmp_hidden_helper_threads_num = 8; 9258 kmp_int32 __kmp_enable_hidden_helper = TRUE; 9259 #else 9260 kmp_int32 __kmp_hidden_helper_threads_num = 0; 9261 kmp_int32 __kmp_enable_hidden_helper = FALSE; 9262 #endif 9263 9264 namespace { 9265 std::atomic<kmp_int32> __kmp_hit_hidden_helper_threads_num; 9266 9267 void __kmp_hidden_helper_wrapper_fn(int *gtid, int *, ...) { 9268 // This is an explicit synchronization on all hidden helper threads in case 9269 // that when a regular thread pushes a hidden helper task to one hidden 9270 // helper thread, the thread has not been awaken once since they're released 9271 // by the main thread after creating the team. 9272 KMP_ATOMIC_INC(&__kmp_hit_hidden_helper_threads_num); 9273 while (KMP_ATOMIC_LD_ACQ(&__kmp_hit_hidden_helper_threads_num) != 9274 __kmp_hidden_helper_threads_num) 9275 ; 9276 9277 // If main thread, then wait for signal 9278 if (__kmpc_master(nullptr, *gtid)) { 9279 // First, unset the initial state and release the initial thread 9280 TCW_4(__kmp_init_hidden_helper_threads, FALSE); 9281 __kmp_hidden_helper_initz_release(); 9282 __kmp_hidden_helper_main_thread_wait(); 9283 // Now wake up all worker threads 9284 for (int i = 1; i < __kmp_hit_hidden_helper_threads_num; ++i) { 9285 __kmp_hidden_helper_worker_thread_signal(); 9286 } 9287 } 9288 } 9289 } // namespace 9290 9291 void __kmp_hidden_helper_threads_initz_routine() { 9292 // Create a new root for hidden helper team/threads 9293 const int gtid = __kmp_register_root(TRUE); 9294 __kmp_hidden_helper_main_thread = __kmp_threads[gtid]; 9295 __kmp_hidden_helper_threads = &__kmp_threads[gtid]; 9296 __kmp_hidden_helper_main_thread->th.th_set_nproc = 9297 __kmp_hidden_helper_threads_num; 9298 9299 KMP_ATOMIC_ST_REL(&__kmp_hit_hidden_helper_threads_num, 0); 9300 9301 __kmpc_fork_call(nullptr, 0, __kmp_hidden_helper_wrapper_fn); 9302 9303 // Set the initialization flag to FALSE 9304 TCW_SYNC_4(__kmp_init_hidden_helper, FALSE); 9305 9306 __kmp_hidden_helper_threads_deinitz_release(); 9307 } 9308 9309 /* Nesting Mode: 9310 Set via KMP_NESTING_MODE, which takes an integer. 9311 Note: we skip duplicate topology levels, and skip levels with only 9312 one entity. 9313 KMP_NESTING_MODE=0 is the default, and doesn't use nesting mode. 9314 KMP_NESTING_MODE=1 sets as many nesting levels as there are distinct levels 9315 in the topology, and initializes the number of threads at each of those 9316 levels to the number of entities at each level, respectively, below the 9317 entity at the parent level. 9318 KMP_NESTING_MODE=N, where N>1, attempts to create up to N nesting levels, 9319 but starts with nesting OFF -- max-active-levels-var is 1 -- and requires 9320 the user to turn nesting on explicitly. This is an even more experimental 9321 option to this experimental feature, and may change or go away in the 9322 future. 9323 */ 9324 9325 // Allocate space to store nesting levels 9326 void __kmp_init_nesting_mode() { 9327 int levels = KMP_HW_LAST; 9328 __kmp_nesting_mode_nlevels = levels; 9329 __kmp_nesting_nth_level = (int *)KMP_INTERNAL_MALLOC(levels * sizeof(int)); 9330 for (int i = 0; i < levels; ++i) 9331 __kmp_nesting_nth_level[i] = 0; 9332 if (__kmp_nested_nth.size < levels) { 9333 __kmp_nested_nth.nth = 9334 (int *)KMP_INTERNAL_REALLOC(__kmp_nested_nth.nth, levels * sizeof(int)); 9335 __kmp_nested_nth.size = levels; 9336 } 9337 } 9338 9339 // Set # threads for top levels of nesting; must be called after topology set 9340 void __kmp_set_nesting_mode_threads() { 9341 kmp_info_t *thread = __kmp_threads[__kmp_entry_gtid()]; 9342 9343 if (__kmp_nesting_mode == 1) 9344 __kmp_nesting_mode_nlevels = KMP_MAX_ACTIVE_LEVELS_LIMIT; 9345 else if (__kmp_nesting_mode > 1) 9346 __kmp_nesting_mode_nlevels = __kmp_nesting_mode; 9347 9348 if (__kmp_topology) { // use topology info 9349 int loc, hw_level; 9350 for (loc = 0, hw_level = 0; hw_level < __kmp_topology->get_depth() && 9351 loc < __kmp_nesting_mode_nlevels; 9352 loc++, hw_level++) { 9353 __kmp_nesting_nth_level[loc] = __kmp_topology->get_ratio(hw_level); 9354 if (__kmp_nesting_nth_level[loc] == 1) 9355 loc--; 9356 } 9357 // Make sure all cores are used 9358 if (__kmp_nesting_mode > 1 && loc > 1) { 9359 int core_level = __kmp_topology->get_level(KMP_HW_CORE); 9360 int num_cores = __kmp_topology->get_count(core_level); 9361 int upper_levels = 1; 9362 for (int level = 0; level < loc - 1; ++level) 9363 upper_levels *= __kmp_nesting_nth_level[level]; 9364 if (upper_levels * __kmp_nesting_nth_level[loc - 1] < num_cores) 9365 __kmp_nesting_nth_level[loc - 1] = 9366 num_cores / __kmp_nesting_nth_level[loc - 2]; 9367 } 9368 __kmp_nesting_mode_nlevels = loc; 9369 __kmp_nested_nth.used = __kmp_nesting_mode_nlevels; 9370 } else { // no topology info available; provide a reasonable guesstimation 9371 if (__kmp_avail_proc >= 4) { 9372 __kmp_nesting_nth_level[0] = __kmp_avail_proc / 2; 9373 __kmp_nesting_nth_level[1] = 2; 9374 __kmp_nesting_mode_nlevels = 2; 9375 } else { 9376 __kmp_nesting_nth_level[0] = __kmp_avail_proc; 9377 __kmp_nesting_mode_nlevels = 1; 9378 } 9379 __kmp_nested_nth.used = __kmp_nesting_mode_nlevels; 9380 } 9381 for (int i = 0; i < __kmp_nesting_mode_nlevels; ++i) { 9382 __kmp_nested_nth.nth[i] = __kmp_nesting_nth_level[i]; 9383 } 9384 set__nproc(thread, __kmp_nesting_nth_level[0]); 9385 if (__kmp_nesting_mode > 1 && __kmp_nesting_mode_nlevels > __kmp_nesting_mode) 9386 __kmp_nesting_mode_nlevels = __kmp_nesting_mode; 9387 if (get__max_active_levels(thread) > 1) { 9388 // if max levels was set, set nesting mode levels to same 9389 __kmp_nesting_mode_nlevels = get__max_active_levels(thread); 9390 } 9391 if (__kmp_nesting_mode == 1) // turn on nesting for this case only 9392 set__max_active_levels(thread, __kmp_nesting_mode_nlevels); 9393 } 9394 9395 // Empty symbols to export (see exports_so.txt) when feature is disabled 9396 extern "C" { 9397 #if !KMP_STATS_ENABLED 9398 void __kmp_reset_stats() {} 9399 #endif 9400 #if !USE_DEBUGGER 9401 int __kmp_omp_debug_struct_info = FALSE; 9402 int __kmp_debugging = FALSE; 9403 #endif 9404 #if !USE_ITT_BUILD || !USE_ITT_NOTIFY 9405 void __kmp_itt_fini_ittlib() {} 9406 void __kmp_itt_init_ittlib() {} 9407 #endif 9408 } 9409 9410 // end of file 9411